MonilM committed
Commit 6932dcb · Parent(s): 3d16522

Add application file

Files changed (3)
  1. app.py +212 -0
  2. requirements.txt +10 -0
  3. yolov12l.pt +3 -0
app.py ADDED
@@ -0,0 +1,212 @@
+ import os
+ import io
+ import base64
+ import json
+ import gradio as gr
+ import numpy as np
+ from PIL import Image
+ import whisper
+ from ultralytics import YOLO
+ import requests
+ from fastapi import FastAPI
+ from fastapi.middleware.cors import CORSMiddleware
+
+ # Initialize models
+ print("Loading Whisper model...")
+ whisper_model = whisper.load_model("small")  # Options: tiny, base, small, medium, large
+
+ print("Loading YOLO model...")
+ yolo_model = YOLO('yolov8n.pt')  # Using nano version for speed
+
+ # Create FastAPI app
+ app = FastAPI()
+
+ # Add CORS middleware
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],  # For demo, allow all
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ # Define request/response models
+ from pydantic import BaseModel
+ from typing import Optional, List, Dict, Any
+
+ class AudioRequest(BaseModel):
+     audio: str  # base64 encoded audio
+     format: str = "wav"
+     language: Optional[str] = None
+
+ class TextTranslationRequest(BaseModel):
+     text: str
+     from_lang: str
+     to_lang: str
+
+ class DetectionRequest(BaseModel):
+     image: str  # base64 encoded image
+     confidence: float = 0.25
+
+ class DetectionResponse(BaseModel):
+     objects: List[Dict[str, Any]]
+     count: int
+
+ # API Endpoints
+ @app.post("/api/transcribe")
+ async def transcribe_audio(request: AudioRequest):
+     try:
+         # Decode base64 audio data
+         audio_bytes = base64.b64decode(request.audio)
+
+         # Save to a temporary file
+         temp_path = "temp_audio.wav"
+         with open(temp_path, "wb") as f:
+             f.write(audio_bytes)
+
+         # Process with Whisper
+         result = whisper_model.transcribe(
+             temp_path,
+             language=request.language if request.language else None
+         )
+
+         # Clean up
+         os.remove(temp_path)
+
+         return {
+             "status": "success",
+             "text": result["text"],
+             "language": result["language"],
+             "segments": result["segments"]
+         }
+     except Exception as e:
+         return {
+             "status": "error",
+             "message": str(e)
+         }
+
+ @app.post("/api/detect_objects")
+ async def detect_objects(request: DetectionRequest):
+     try:
+         # Decode base64 image
+         image_bytes = base64.b64decode(request.image)
+         image = Image.open(io.BytesIO(image_bytes))
+
+         # Run YOLO detection
+         results = yolo_model(image, conf=request.confidence)
+
+         # Process results
+         detections = []
+         for result in results:
+             for i, (box, score, cls) in enumerate(zip(result.boxes.xyxy, result.boxes.conf, result.boxes.cls)):
+                 x1, y1, x2, y2 = [float(x) for x in box]
+                 detections.append({
+                     "class": int(cls),
+                     "class_name": result.names[int(cls)],
+                     "confidence": float(score),
+                     "box": {
+                         "x1": x1, "y1": y1, "x2": x2, "y2": y2,
+                         "width": x2 - x1,
+                         "height": y2 - y1
+                     }
+                 })
+
+         return {
+             "status": "success",
+             "objects": detections,
+             "count": len(detections)
+         }
+     except Exception as e:
+         return {
+             "status": "error",
+             "message": str(e)
+         }
+
+ # Gradio UI Functions
+ def transcribe_audio_ui(audio, language=None):
+     if audio is None:
+         return "Please upload an audio file."
+
+     try:
+         # Process with Whisper
+         result = whisper_model.transcribe(audio, language=language if language else None)
+         return result["text"]
+     except Exception as e:
+         return f"Error: {str(e)}"
+
+ def detect_objects_ui(image, confidence=0.25):
+     if image is None:
+         # Two outputs are wired up in the UI, so return a (image, text) pair
+         return None, "Please upload an image."
+
+     try:
+         # Run detection
+         results = yolo_model(image, conf=confidence)
+
+         # Create annotated image; plot() returns a BGR array, so flip channels to RGB for PIL
+         annotated_img = np.ascontiguousarray(results[0].plot()[..., ::-1])
+
+         # Get detections for display
+         detections = []
+         for result in results:
+             for i, (box, score, cls) in enumerate(zip(result.boxes.xyxy, result.boxes.conf, result.boxes.cls)):
+                 label = f"{result.names[int(cls)]}: {float(score):.2f}"
+                 detections.append(label)
+
+         return Image.fromarray(annotated_img), "\n".join(detections)
+     except Exception as e:
+         return None, f"Error: {str(e)}"
+
+ # Create Gradio Interface
+ with gr.Blocks(title="IPD-Lingual API") as demo:
+     gr.Markdown("# IPD-Lingual Speech & Object Detection API")
+
+     with gr.Tab("Speech Recognition"):
+         gr.Markdown("## Transcribe Audio")
+         with gr.Row():
+             with gr.Column():
+                 audio_input = gr.Audio(type="filepath", label="Upload Audio")
+                 language = gr.Dropdown(
+                     choices=["en", "hi", "es", "fr", "de", "ja", "ko", None],
+                     value=None,
+                     label="Language (optional)"
+                 )
+                 transcribe_btn = gr.Button("Transcribe")
+             with gr.Column():
+                 text_output = gr.Textbox(label="Transcription")
+
+         transcribe_btn.click(
+             fn=transcribe_audio_ui,
+             inputs=[audio_input, language],
+             outputs=text_output
+         )
+
+     with gr.Tab("Object Detection"):
+         gr.Markdown("## Detect Objects in Image")
+         with gr.Row():
+             with gr.Column():
+                 image_input = gr.Image(label="Upload Image")
+                 confidence_slider = gr.Slider(
+                     minimum=0.1, maximum=1.0, value=0.25,
+                     label="Confidence Threshold"
+                 )
+                 detect_btn = gr.Button("Detect Objects")
+             with gr.Column():
+                 image_output = gr.Image(label="Detection Result")
+                 labels_output = gr.Textbox(label="Detected Objects")
+
+         detect_btn.click(
+             fn=detect_objects_ui,
+             inputs=[image_input, confidence_slider],
+             outputs=[image_output, labels_output]
+         )
+
+     gr.Markdown("### API Endpoints")
+     gr.Markdown("- POST `/api/transcribe` - Transcribe audio")
+     gr.Markdown("- POST `/api/detect_objects` - Detect objects in images")
+
+ # Mount both FastAPI and Gradio
+ app = gr.mount_gradio_app(app, demo, path="/")
+
+ if __name__ == "__main__":
+     import uvicorn
+     uvicorn.run(app, host="0.0.0.0", port=7860)
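
Both endpoints above accept JSON bodies with base64-encoded payloads, matching the AudioRequest and DetectionRequest models. A minimal client sketch, assuming the app is reachable at http://localhost:7860 (the port used in the __main__ block) and using hypothetical sample.wav and sample.jpg input files:

import base64
import requests

BASE_URL = "http://localhost:7860"  # assumed local address; replace with the deployed Space URL

# POST /api/transcribe with a base64-encoded audio file (hypothetical sample.wav)
with open("sample.wav", "rb") as f:
    audio_b64 = base64.b64encode(f.read()).decode("utf-8")
resp = requests.post(
    f"{BASE_URL}/api/transcribe",
    json={"audio": audio_b64, "format": "wav", "language": "en"},
)
print(resp.json().get("text"))

# POST /api/detect_objects with a base64-encoded image (hypothetical sample.jpg)
with open("sample.jpg", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode("utf-8")
resp = requests.post(
    f"{BASE_URL}/api/detect_objects",
    json={"image": image_b64, "confidence": 0.25},
)
for obj in resp.json().get("objects", []):
    print(obj["class_name"], round(obj["confidence"], 2))

Note that on failure both endpoints return a JSON body with "status": "error" rather than an HTTP error code, so clients should check the status field before reading results.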
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ fastapi>=0.98.0
+ uvicorn>=0.22.0
+ gradio>=3.40.1
+ Pillow>=9.5.0
+ numpy>=1.24.0
+ openai-whisper>=20230314
+ ultralytics>=8.0.0
+ torch>=2.0.0
+ pydantic>=1.10.8
+ python-multipart>=0.0.6
yolov12l.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0babd8dc8f775bb64bb052debdff3d8b9e9b57efa9d7bfa11c84bb82c3fec336
+ size 53699086