import gradio as gr
import numpy as np
import cv2
import torch
from transformers import AutoImageProcessor, SiglipForImageClassification
from collections import Counter

# Load model
model_name = "prithivMLmods/Alphabet-Sign-Language-Detection"
processor = AutoImageProcessor.from_pretrained(model_name)
model = SiglipForImageClassification.from_pretrained(model_name)

def predict_from_video(video_path):
    cap = cv2.VideoCapture(video_path)
    predictions = []

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Convert frame from OpenCV's BGR to RGB
        img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Classify the frame (no gradients needed for inference)
        inputs = processor(images=img, return_tensors="pt")
        with torch.no_grad():
            outputs = model(**inputs)
        probs = outputs.logits.softmax(dim=-1)[0]
        idx = int(probs.argmax())
        label = model.config.id2label[idx]
        predictions.append(label)

    cap.release()

    # Majority vote across all processed frames
    if predictions:
        most_common = Counter(predictions).most_common(1)[0]
        return f"Predicted Letter: {most_common[0]} (appeared {most_common[1]} times)"
    else:
        return "No frames processed."

iface = gr.Interface(
    fn=predict_from_video,
    inputs=gr.Video(),  # Gradio 5.x: gr.Video() takes no 'type' argument
    outputs="text",
    title="ASL Alphabet Recognition from Video",
    description="Upload a short video of your ASL sign (A–Z). The system will analyze frames and predict the most likely letter."
)

iface.launch()