import gradio as gr
import numpy as np
import cv2
import torch
from transformers import AutoImageProcessor, SiglipForImageClassification
from collections import Counter

# Load model
model_name = "prithivMLmods/Alphabet-Sign-Language-Detection"
processor = AutoImageProcessor.from_pretrained(model_name)
model = SiglipForImageClassification.from_pretrained(model_name)

def predict_from_video(video_path):
    cap = cv2.VideoCapture(video_path)
    predictions = []

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Convert frame from OpenCV's BGR to RGB
        img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Classify the frame (no gradients needed for inference)
        inputs = processor(images=img, return_tensors="pt")
        with torch.no_grad():
            outputs = model(**inputs)
        probs = outputs.logits.softmax(dim=-1)[0]
        idx = int(probs.argmax())
        label = model.config.id2label[idx]
        predictions.append(label)

    cap.release()

    # Majority vote across all processed frames
    if predictions:
        most_common = Counter(predictions).most_common(1)[0]
        return f"Predicted Letter: {most_common[0]} (appeared {most_common[1]} times)"
    else:
        return "No frames processed."

iface = gr.Interface(
    fn=predict_from_video,
    inputs=gr.Video(),  # Gradio 5.x: gr.Video() takes no 'type' argument
    outputs="text",
    title="ASL Alphabet Recognition from Video",
    description="Upload a short video of your ASL sign (A–Z). The system will analyze frames and predict the most likely letter."
)

iface.launch()