MonilM committed
Commit 6932dcb · Parent(s): 3d16522

Add application file

Files changed (3)
  1. app.py +212 -0
  2. requirements.txt +10 -0
  3. yolov12l.pt +3 -0
app.py ADDED
@@ -0,0 +1,212 @@
+ import os
+ import io
+ import base64
+ import json
+ import gradio as gr
+ import numpy as np
+ from PIL import Image
+ import whisper
+ from ultralytics import YOLO
+ import requests
+ from fastapi import FastAPI
+ from fastapi.middleware.cors import CORSMiddleware
+
+ # Initialize models
+ print("Loading Whisper model...")
+ whisper_model = whisper.load_model("small")  # Options: tiny, base, small, medium, large
+
+ print("Loading YOLO model...")
+ yolo_model = YOLO('yolov8n.pt')  # Using nano version for speed
+
+ # Create FastAPI app
+ app = FastAPI()
+
+ # Add CORS middleware
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],  # For demo, allow all
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ # Define request/response models
+ from pydantic import BaseModel
+ from typing import Optional, List, Dict, Any
+
+ class AudioRequest(BaseModel):
+     audio: str  # base64 encoded audio
+     format: str = "wav"
+     language: Optional[str] = None
+
+ class TextTranslationRequest(BaseModel):
+     text: str
+     from_lang: str
+     to_lang: str
+
+ class DetectionRequest(BaseModel):
+     image: str  # base64 encoded image
+     confidence: float = 0.25
+
+ class DetectionResponse(BaseModel):
+     objects: List[Dict[str, Any]]
+     count: int
+
+ # API Endpoints
+ @app.post("/api/transcribe")
+ async def transcribe_audio(request: AudioRequest):
+     try:
+         # Decode base64 audio data
+         audio_bytes = base64.b64decode(request.audio)
+
+         # Save to a temporary file
+         temp_path = "temp_audio.wav"
+         with open(temp_path, "wb") as f:
+             f.write(audio_bytes)
+
+         # Process with Whisper
+         result = whisper_model.transcribe(
+             temp_path,
+             language=request.language if request.language else None
+         )
+
+         # Clean up
+         os.remove(temp_path)
+
+         return {
+             "status": "success",
+             "text": result["text"],
+             "language": result["language"],
+             "segments": result["segments"]
+         }
+     except Exception as e:
+         return {
+             "status": "error",
+             "message": str(e)
+         }
+
+ @app.post("/api/detect_objects")
+ async def detect_objects(request: DetectionRequest):
+     try:
+         # Decode base64 image
+         image_bytes = base64.b64decode(request.image)
+         image = Image.open(io.BytesIO(image_bytes))
+
+         # Run YOLO detection
+         results = yolo_model(image, conf=request.confidence)
+
+         # Process results
+         detections = []
+         for result in results:
+             for i, (box, score, cls) in enumerate(zip(result.boxes.xyxy, result.boxes.conf, result.boxes.cls)):
+                 x1, y1, x2, y2 = [float(x) for x in box]
+                 detections.append({
+                     "class": int(cls),
+                     "class_name": result.names[int(cls)],
+                     "confidence": float(score),
+                     "box": {
+                         "x1": x1, "y1": y1, "x2": x2, "y2": y2,
+                         "width": x2 - x1,
+                         "height": y2 - y1
+                     }
+                 })
+
+         return {
+             "status": "success",
+             "objects": detections,
+             "count": len(detections)
+         }
+     except Exception as e:
+         return {
+             "status": "error",
+             "message": str(e)
+         }
+
+ # Gradio UI Functions
+ def transcribe_audio_ui(audio, language=None):
+     if audio is None:
+         return "Please upload an audio file."
+
+     try:
+         # Process with Whisper
+         result = whisper_model.transcribe(audio, language=language if language else None)
+         return result["text"]
+     except Exception as e:
+         return f"Error: {str(e)}"
+
+ def detect_objects_ui(image, confidence=0.25):
+     if image is None:
+         # Two outputs are wired up in the UI, so return a (image, text) pair
+         return None, "Please upload an image."
+
+     try:
+         # Run detection
+         results = yolo_model(image, conf=confidence)
+
+         # Create annotated image; plot() returns a BGR array, so flip channels to RGB for PIL
+         annotated_img = np.ascontiguousarray(results[0].plot()[..., ::-1])
+
+         # Get detections for display
+         detections = []
+         for result in results:
+             for i, (box, score, cls) in enumerate(zip(result.boxes.xyxy, result.boxes.conf, result.boxes.cls)):
+                 label = f"{result.names[int(cls)]}: {float(score):.2f}"
+                 detections.append(label)
+
+         return Image.fromarray(annotated_img), "\n".join(detections)
+     except Exception as e:
+         return None, f"Error: {str(e)}"
+
+ # Create Gradio Interface
+ with gr.Blocks(title="IPD-Lingual API") as demo:
+     gr.Markdown("# IPD-Lingual Speech & Object Detection API")
+
+     with gr.Tab("Speech Recognition"):
+         gr.Markdown("## Transcribe Audio")
+         with gr.Row():
+             with gr.Column():
+                 audio_input = gr.Audio(type="filepath", label="Upload Audio")
+                 language = gr.Dropdown(
+                     choices=["en", "hi", "es", "fr", "de", "ja", "ko", None],
+                     value=None,
+                     label="Language (optional)"
+                 )
+                 transcribe_btn = gr.Button("Transcribe")
+             with gr.Column():
+                 text_output = gr.Textbox(label="Transcription")
+
+         transcribe_btn.click(
+             fn=transcribe_audio_ui,
+             inputs=[audio_input, language],
+             outputs=text_output
+         )
+
+     with gr.Tab("Object Detection"):
+         gr.Markdown("## Detect Objects in Image")
+         with gr.Row():
+             with gr.Column():
+                 image_input = gr.Image(label="Upload Image")
+                 confidence_slider = gr.Slider(
+                     minimum=0.1, maximum=1.0, value=0.25,
+                     label="Confidence Threshold"
+                 )
+                 detect_btn = gr.Button("Detect Objects")
+             with gr.Column():
+                 image_output = gr.Image(label="Detection Result")
+                 labels_output = gr.Textbox(label="Detected Objects")
+
+         detect_btn.click(
+             fn=detect_objects_ui,
+             inputs=[image_input, confidence_slider],
+             outputs=[image_output, labels_output]
+         )
+
+     gr.Markdown("### API Endpoints")
+     gr.Markdown("- POST `/api/transcribe` - Transcribe audio")
+     gr.Markdown("- POST `/api/detect_objects` - Detect objects in images")
+
+ # Mount both FastAPI and Gradio
+ app = gr.mount_gradio_app(app, demo, path="/")
+
+ if __name__ == "__main__":
+     import uvicorn
+     uvicorn.run(app, host="0.0.0.0", port=7860)
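
Both endpoints above accept JSON bodies with base64-encoded payloads, matching the AudioRequest and DetectionRequest models. A minimal client sketch, assuming the app is reachable at http://localhost:7860 (the port used in the __main__ block) and using hypothetical sample.wav and sample.jpg input files:

import base64
import requests

BASE_URL = "http://localhost:7860"  # assumed local address; replace with the deployed Space URL

# POST /api/transcribe with a base64-encoded audio file (hypothetical sample.wav)
with open("sample.wav", "rb") as f:
    audio_b64 = base64.b64encode(f.read()).decode("utf-8")
resp = requests.post(
    f"{BASE_URL}/api/transcribe",
    json={"audio": audio_b64, "format": "wav", "language": "en"},
)
print(resp.json().get("text"))

# POST /api/detect_objects with a base64-encoded image (hypothetical sample.jpg)
with open("sample.jpg", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode("utf-8")
resp = requests.post(
    f"{BASE_URL}/api/detect_objects",
    json={"image": image_b64, "confidence": 0.25},
)
for obj in resp.json().get("objects", []):
    print(obj["class_name"], round(obj["confidence"], 2))

Note that on failure both endpoints return a JSON body with "status": "error" rather than an HTTP error code, so clients should check the status field before reading results.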
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ fastapi>=0.98.0
+ uvicorn>=0.22.0
+ gradio>=3.40.1
+ Pillow>=9.5.0
+ numpy>=1.24.0
+ openai-whisper>=20230314
+ ultralytics>=8.0.0
+ torch>=2.0.0
+ pydantic>=1.10.8
+ python-multipart>=0.0.6
yolov12l.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0babd8dc8f775bb64bb052debdff3d8b9e9b57efa9d7bfa11c84bb82c3fec336
+ size 53699086