import io
import urllib.request

import gradio as gr
import requests
import torch
import torch.nn as nn
from PIL import Image
from torchvision import models, transforms
from transformers import AutoModel, AutoTokenizer
# --- Define the Model ---
class FineGrainedClassifier(nn.Module):
    def __init__(self, num_classes=434):  # 434 fine-grained product categories
        super().__init__()
        # Image branch: ResNet-50 backbone with its classification head removed,
        # leaving a 2048-dim feature vector. weights= replaces the deprecated
        # pretrained=True flag and loads the same ImageNet weights.
        self.image_encoder = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
        self.image_encoder.fc = nn.Identity()
        # Text branch: Jina embedding model (768-dim hidden size);
        # trust_remote_code=True is required because it ships custom modeling code.
        self.text_encoder = AutoModel.from_pretrained(
            'jinaai/jina-embeddings-v2-base-en', trust_remote_code=True
        )
        # Fusion head: concatenated image + text features -> class logits.
        self.classifier = nn.Sequential(
            nn.Linear(2048 + 768, 1024),
            nn.BatchNorm1d(1024),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(1024, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, num_classes),
        )

    def forward(self, image, input_ids, attention_mask):
        image_features = self.image_encoder(image)
        text_output = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask)
        # Use the [CLS] token embedding as the title representation.
        text_features = text_output.last_hidden_state[:, 0, :]
        combined_features = torch.cat((image_features, text_features), dim=1)
        return self.classifier(combined_features)
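
# Shape walk-through for one forward pass (illustrative, batch size B):
#   image           (B, 3, 224, 224) --ResNet-50-->    image_features (B, 2048)
#   input_ids       (B, 200)         --Jina encoder--> text_features  (B, 768)
#   combined        (B, 2816)        --classifier-->   logits         (B, num_classes)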
# --- Image Preprocessing ---
# Inference-time preprocessing only: the random augmentations used during
# training are omitted here so that predictions are deterministic.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    # Standard ImageNet normalization, matching the ResNet-50 backbone.
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
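
# For reference, the training-time pipeline additionally used random
# augmentations; a sketch of that setup (not applied at inference):
# train_transform = transforms.Compose([
#     transforms.Resize((224, 224)),
#     transforms.RandomHorizontalFlip(),
#     transforms.RandomRotation(15),
#     transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.2),
#     transforms.ToTensor(),
#     transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
# ])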
# Load the label-to-class mapping from the Hugging Face repository
label_map_url = "https://huggingface.co/Maverick98/EcommerceClassifier/resolve/main/label_to_class.json"
label_to_class = requests.get(label_map_url).json()

# Load the custom model checkpoint from Hugging Face
model = FineGrainedClassifier(num_classes=len(label_to_class))
checkpoint_url = "https://huggingface.co/Maverick98/EcommerceClassifier/resolve/main/model_checkpoint.pth"
checkpoint = torch.hub.load_state_dict_from_url(checkpoint_url, map_location=torch.device('cpu'))
# Clean up the state dict: strip any "module." prefix left over from
# DataParallel training and keep only keys the model actually defines.
state_dict = checkpoint.get('model_state_dict', checkpoint)
new_state_dict = {}
for k, v in state_dict.items():
    new_key = k[len("module."):] if k.startswith("module.") else k
    if new_key in model.state_dict():
        new_state_dict[new_key] = v
model.load_state_dict(new_state_dict)
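
# Optional sanity check (illustrative): the strict load above already fails on
# any missing key, so this only reports how many parameters were restored.
num_params = sum(p.numel() for p in model.parameters())
print(f"Model ready with {num_params:,} parameters")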
# Load the tokenizer from Jina
tokenizer = AutoTokenizer.from_pretrained("jinaai/jina-embeddings-v2-base-en")
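
# Tokenizer output sketch (illustrative title; shapes follow from max_length=200):
# enc = tokenizer("Wireless Bluetooth Headphones", padding='max_length',
#                 max_length=200, truncation=True, return_tensors='pt')
# enc['input_ids'].shape      -> torch.Size([1, 200])
# enc['attention_mask'].shape -> torch.Size([1, 200])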
def load_image(image_path_or_url):
    """Load an image from a local path or an http(s) URL and preprocess it."""
    if isinstance(image_path_or_url, str) and image_path_or_url.startswith("http"):
        # Read the response into memory first: PIL needs a seekable stream.
        with urllib.request.urlopen(image_path_or_url) as response:
            image = Image.open(io.BytesIO(response.read())).convert('RGB')
    else:
        image = Image.open(image_path_or_url).convert('RGB')
    image = transform(image)
    return image.unsqueeze(0)  # Add batch dimension
def predict(image_path_or_file, title, threshold=0.4):
    # Preprocess the image
    image = load_image(image_path_or_file)

    # Tokenize the product title
    title_encoding = tokenizer(title, padding='max_length', max_length=200, truncation=True, return_tensors='pt')
    input_ids = title_encoding['input_ids']
    attention_mask = title_encoding['attention_mask']

    # Predict
    model.eval()
    with torch.no_grad():
        output = model(image, input_ids=input_ids, attention_mask=attention_mask)
        probabilities = torch.nn.functional.softmax(output, dim=1)
        top3_probabilities, top3_indices = torch.topk(probabilities, 3, dim=1)

    # Map the top 3 indices to class names
    top3_classes = [label_to_class[str(idx.item())] for idx in top3_indices[0]]

    # If the model is not confident enough, prepend an "Others" bucket whose
    # score is the probability mass not captured by the top prediction.
    if top3_probabilities[0][0].item() < threshold:
        top3_classes.insert(0, "Others")
        top3_probabilities = torch.cat(
            (torch.tensor([[1.0 - top3_probabilities[0][0].item()]]), top3_probabilities), dim=1
        )

    # Return a {class_name: probability} dictionary
    return {top3_classes[i]: top3_probabilities[0][i].item() for i in range(len(top3_classes))}
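
# Example call (hypothetical file name, title, and class scores; for
# illustration only):
# predict("sample_product.jpg", "Men's Leather Wallet")
# -> {"Wallets": 0.88, "Belts": 0.05, "Handbags": 0.03}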
# Define the Gradio interface
title_input = gr.Textbox(label="Product Title", placeholder="Enter the product title here...")
image_input = gr.Image(type="filepath", label="Upload Image or Provide URL")
output = gr.JSON(label="Top 3 Predictions with Probabilities")

gr.Interface(
    fn=predict,
    inputs=[image_input, title_input],
    outputs=output,
    title="Ecommerce Classifier",
    description="This model classifies ecommerce products into one of 434 categories. If the model is unsure, it outputs 'Others'.",
).launch(share=True)