Image Classification
Transformers
Safetensors
English
clip
zero-shot-image-classification
multi-task-classification
fairface
vision
autoeval-has-no-ethical-license
Eval Results (legacy)
Instructions to use syntheticbot/clip-face-attribute-classifier with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use syntheticbot/clip-face-attribute-classifier with Transformers:
# Use a pipeline as a high-level helper from transformers import pipeline pipe = pipeline("image-classification", model="syntheticbot/clip-face-attribute-classifier") pipe("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/hub/parrots.png")# Load model directly from transformers import AutoProcessor, AutoModelForZeroShotImageClassification processor = AutoProcessor.from_pretrained("syntheticbot/clip-face-attribute-classifier") model = AutoModelForZeroShotImageClassification.from_pretrained("syntheticbot/clip-face-attribute-classifier") - Notebooks
- Google Colab
- Kaggle
| import pandas as pd | |
| import torch | |
| import torch.nn as nn | |
| from PIL import Image | |
| from sklearn.metrics import accuracy_score | |
| from transformers import ( | |
| Trainer, | |
| TrainingArguments, | |
| CLIPVisionModel, | |
| CLIPImageProcessor, | |
| ) | |
| from torch.utils.data import Dataset | |
| import os | |
| os.environ["WANDB_DISABLED"] = "true" | |
| # --- 1. Configuration --- | |
| # Define paths and model name | |
| BASE_PATH = './' # Assumes the script is run from the 'fairface' directory | |
| TRAIN_CSV = os.path.join(BASE_PATH, 'fairface_label_train.csv') | |
| VAL_CSV = os.path.join(BASE_PATH, 'fairface_label_val.csv') | |
| MODEL_NAME = "openai/clip-vit-large-patch14" | |
| OUTPUT_DIR = "./clip-fairface-finetuned" | |
| # --- 2. Load and Prepare Label Mappings --- | |
| # Load training data to create consistent label-to-ID mappings | |
| train_df = pd.read_csv(TRAIN_CSV) | |
| # Create sorted unique label lists to ensure consistent mapping | |
| age_labels = sorted(train_df['age'].unique()) | |
| gender_labels = sorted(train_df['gender'].unique()) | |
| race_labels = sorted(train_df['race'].unique()) | |
| # Create label-to-ID mappings for each task | |
| label_mappings = { | |
| 'age': {label: i for i, label in enumerate(age_labels)}, | |
| 'gender': {label: i for i, label in enumerate(gender_labels)}, | |
| 'race': {label: i for i, label in enumerate(race_labels)}, | |
| } | |
| NUM_LABELS = { | |
| 'age': len(age_labels), | |
| 'gender': len(gender_labels), | |
| 'race': len(race_labels), | |
| } | |
| print(f"Number of labels: Age={NUM_LABELS['age']}, Gender={NUM_LABELS['gender']}, Race={NUM_LABELS['race']}") | |
| # --- 3. Custom Dataset --- | |
| class FairFaceDataset(Dataset): | |
| def __init__(self, csv_file, image_processor, label_maps, base_path): | |
| self.df = pd.read_csv(csv_file) | |
| self.image_processor = image_processor | |
| self.label_maps = label_maps | |
| self.base_path = base_path | |
| def __len__(self): | |
| return len(self.df) | |
| def __getitem__(self, idx): | |
| row = self.df.iloc[idx] | |
| # Construct the full path to the image | |
| image_path = os.path.join(self.base_path, row['file']) | |
| image = Image.open(image_path).convert("RGB") | |
| # Process the image | |
| inputs = {} | |
| inputs['pixel_values'] = self.image_processor(images=image, return_tensors="pt").pixel_values.squeeze(0) | |
| # Process labels into a dictionary of tensors | |
| inputs['labels'] = { | |
| 'age': torch.tensor(self.label_maps['age'][row['age']], dtype=torch.long), | |
| 'gender': torch.tensor(self.label_maps['gender'][row['gender']], dtype=torch.long), | |
| 'race': torch.tensor(self.label_maps['race'][row['race']], dtype=torch.long), | |
| } | |
| return inputs | |
| # --- 4. Custom Model Definition --- | |
| # --- 4. Custom Model Definition (Corrected for Gradient Checkpointing) --- | |
| class MultiTaskClipVisionModel(nn.Module): | |
| # Add this class attribute to signal to the Trainer that we support this | |
| supports_gradient_checkpointing = True | |
| def __init__(self, num_labels): | |
| super(MultiTaskClipVisionModel, self).__init__() | |
| self.vision_model = CLIPVisionModel.from_pretrained(MODEL_NAME) | |
| # Freeze all parameters of the vision model first | |
| for param in self.vision_model.parameters(): | |
| param.requires_grad = False | |
| # Unfreeze the last few layers for fine-tuning. | |
| for layer in self.vision_model.vision_model.encoder.layers[-3:]: # Unfreeze last 3 transformer layers | |
| for param in layer.parameters(): | |
| param.requires_grad = True | |
| # Define classification heads for each task | |
| hidden_size = self.vision_model.config.hidden_size | |
| self.age_head = nn.Linear(hidden_size, num_labels['age']) | |
| self.gender_head = nn.Linear(hidden_size, num_labels['gender']) | |
| self.race_head = nn.Linear(hidden_size, num_labels['race']) | |
| # ADD THIS METHOD: This will be called by the Trainer | |
| def gradient_checkpointing_enable(self, gradient_checkpointing_kwargs=None): | |
| """Activates gradient checkpointing for the underlying vision model.""" | |
| self.vision_model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=gradient_checkpointing_kwargs) | |
| def forward(self, pixel_values, labels=None): | |
| # The forward pass now works seamlessly with gradient checkpointing enabled | |
| outputs = self.vision_model(pixel_values=pixel_values) | |
| pooled_output = outputs.pooler_output | |
| age_logits = self.age_head(pooled_output) | |
| gender_logits = self.gender_head(pooled_output) | |
| race_logits = self.race_head(pooled_output) | |
| loss = None | |
| # If labels are provided, calculate the combined loss | |
| if labels is not None: | |
| loss_fct = nn.CrossEntropyLoss() | |
| age_loss = loss_fct(age_logits, labels['age']) | |
| gender_loss = loss_fct(gender_logits, labels['gender']) | |
| race_loss = loss_fct(race_logits, labels['race']) | |
| # Total loss is the sum of individual task losses | |
| loss = age_loss + gender_loss + race_loss | |
| return { | |
| 'loss': loss, | |
| 'logits': { | |
| 'age': age_logits, | |
| 'gender': gender_logits, | |
| 'race': race_logits, | |
| }, | |
| } | |
| # --- 5. Data Collator and Metrics --- | |
| def collate_fn(batch): | |
| # Stacks pixel values and organizes labels into a dictionary of tensors | |
| pixel_values = torch.stack([item['pixel_values'] for item in batch]) | |
| labels = { | |
| 'age': torch.tensor([item['labels']['age'] for item in batch], dtype=torch.long), | |
| 'gender': torch.tensor([item['labels']['gender'] for item in batch], dtype=torch.long), | |
| 'race': torch.tensor([item['labels']['race'] for item in batch], dtype=torch.long), | |
| } | |
| return {'pixel_values': pixel_values, 'labels': labels} | |
| def compute_metrics(p): | |
| # p is an EvalPrediction object containing predictions and label_ids | |
| logits = p.predictions | |
| labels = p.label_ids | |
| # Extract predictions and labels for each task | |
| age_preds = logits['age'].argmax(-1) | |
| gender_preds = logits['gender'].argmax(-1) | |
| race_preds = logits['race'].argmax(-1) | |
| age_labels = labels['age'] | |
| gender_labels = labels['gender'] | |
| race_labels = labels['race'] | |
| # Calculate accuracy for each task | |
| return { | |
| 'age_accuracy': accuracy_score(age_labels, age_preds), | |
| 'gender_accuracy': accuracy_score(gender_labels, gender_preds), | |
| 'race_accuracy': accuracy_score(race_labels, race_preds), | |
| } | |
| # --- 6. Trainer Setup and Execution --- | |
| def main(): | |
| # Initialize the image processor and our custom model | |
| image_processor = CLIPImageProcessor.from_pretrained(MODEL_NAME) | |
| model = MultiTaskClipVisionModel(num_labels=NUM_LABELS) | |
| # Initialize the training and validation datasets | |
| train_dataset = FairFaceDataset( | |
| csv_file=TRAIN_CSV, image_processor=image_processor, label_maps=label_mappings, base_path=BASE_PATH | |
| ) | |
| val_dataset = FairFaceDataset( | |
| csv_file=VAL_CSV, image_processor=image_processor, label_maps=label_mappings, base_path=BASE_PATH | |
| ) | |
| # Define the training arguments | |
| # In your main() function, replace the old TrainingArguments with this one | |
| # Define the training arguments | |
| training_args = TrainingArguments( | |
| output_dir=OUTPUT_DIR, | |
| num_train_epochs=5, | |
| # Set a batch size that fits in memory | |
| per_device_train_batch_size=24, | |
| per_device_eval_batch_size=32, # Evaluation does not need accumulation and can use a larger batch size | |
| # Set accumulation steps to reach the desired effective batch size (24 * 22 = 528) | |
| gradient_accumulation_steps=22, | |
| # Enable gradient checkpointing to save more memory | |
| gradient_checkpointing=True, | |
| warmup_steps=500, | |
| weight_decay=0.01, | |
| logging_dir='./logs', | |
| logging_steps=10, # Log more frequently to see progress within a large effective batch | |
| evaluation_strategy="steps", | |
| eval_steps=250, # You might want to evaluate less frequently with larger batches | |
| save_strategy="steps", | |
| save_steps=250, | |
| load_best_model_at_end=True, | |
| metric_for_best_model='gender_accuracy', | |
| save_total_limit=3, | |
| fp16=True, # Mixed-precision training is essential for large models | |
| remove_unused_columns=False, | |
| report_to="none", # Disables wandb logging | |
| ) | |
| # Initialize the Trainer | |
| trainer = Trainer( | |
| model=model, | |
| args=training_args, | |
| train_dataset=train_dataset, | |
| eval_dataset=val_dataset, | |
| data_collator=collate_fn, | |
| compute_metrics=compute_metrics, | |
| ) | |
| # Start training | |
| print("Starting model training...") | |
| trainer.train() | |
| # Save the final model and processor | |
| print("Saving the best model...") | |
| trainer.save_model(os.path.join(OUTPUT_DIR, "best_model")) | |
| image_processor.save_pretrained(os.path.join(OUTPUT_DIR, "best_model")) | |
| print("Training complete!") | |
| if __name__ == "__main__": | |
| main() |