phani50101 committed on
Commit
bbde278
·
1 Parent(s): 66b44b8
Files changed (2)
  1. app.py +638 -0
  2. requirements.txt +15 -0
app.py ADDED
@@ -0,0 +1,638 @@
+ from huggingface_hub import snapshot_download
+ import gradio as gr
+ import openvino_genai
+ import librosa
+ import numpy as np
+ from threading import Lock, Event
+ from scipy.ndimage import uniform_filter1d
+ from queue import Queue, Empty
+ from googleapiclient.discovery import build
+ from concurrent.futures import ThreadPoolExecutor
+ import time
+ import cpuinfo
+ import gc
+ import os
+ import requests
+ from PIL import Image
+ from io import BytesIO
+ import openvino as ov
+
+ # Set CPU affinity for optimization
+ os.environ["GOMP_CPU_AFFINITY"] = "0-7"  # Use first 8 CPU cores
+ os.environ["OMP_NUM_THREADS"] = "8"
+
+ # Configuration constants
+ # Search credentials are read from the environment rather than hardcoded in source
+ GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")
+ GOOGLE_CSE_ID = os.environ.get("GOOGLE_CSE_ID", "")
+ DEFAULT_MAX_TOKENS = 100
+ DEFAULT_NUM_IMAGES = 1
+ MAX_HISTORY_TURNS = 2
+ MAX_TOKENS_LIMIT = 1000
+
+ # Download models
+ start_time = time.time()
+ snapshot_download(repo_id="OpenVINO/mistral-7b-instruct-v0.1-int8-ov", local_dir="mistral-ov")
+ snapshot_download(repo_id="OpenVINO/whisper-tiny-fp16-ov", local_dir="whisper-ov-model")
+ snapshot_download(repo_id="OpenVINO/InternVL2-1B-int8-ov", local_dir="internvl-ov")  # Added for image analysis
+ print(f"Model download time: {time.time() - start_time:.2f} seconds")
+
+ # CPU-specific configuration
+ cpu_features = cpuinfo.get_cpu_info()['flags']
+ config_options = {}
+ # py-cpuinfo reports AVX-512 as individual flags (avx512f, avx512dq, ...), so match by prefix
+ if any(flag.startswith('avx512') for flag in cpu_features):
+     config_options["ENFORCE_BF16"] = "YES"
+     print("Using AVX512 optimizations")
+ elif 'avx2' in cpu_features:
+     config_options["INFERENCE_PRECISION_HINT"] = "f32"
+     print("Using AVX2 optimizations")
+
+ # Initialize models with performance flags
+ start_time = time.time()
+ mistral_pipe = openvino_genai.LLMPipeline(
+     "mistral-ov",
+     device="CPU",
+     config={
+         "PERFORMANCE_HINT": "THROUGHPUT",
+         **config_options
+     }
+ )
+
+ whisper_pipe = openvino_genai.WhisperPipeline(
+     "whisper-ov-model",
+     device="CPU"
+ )
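+ # The shared mistral_pipe is guarded by a lock so concurrent requests don't interleave generate() calls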
+ pipe_lock = Lock()
+ print(f"Model initialization time: {time.time() - start_time:.2f} seconds")
+
+ # Initialize InternVL pipeline for image analysis (lazy loading)
+ internvl_pipe = None
+ internvl_lock = Lock()
+
+ def get_internvl_pipeline():
+     global internvl_pipe
+     with internvl_lock:
+         if internvl_pipe is None:
+             print("Initializing InternVL pipeline...")
+             start_time = time.time()
+             internvl_pipe = openvino_genai.VLMPipeline("internvl-ov", device="CPU")
+             print(f"InternVL pipeline initialization time: {time.time() - start_time:.2f} seconds")
+     return internvl_pipe
+
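+ # A short dummy generation forces graph compilation up front, so the first real request avoids that latency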
+ # Warm up models
+ print("Warming up models...")
+ start_time = time.time()
+ with pipe_lock:
+     mistral_pipe.generate("Warmup", openvino_genai.GenerationConfig(max_new_tokens=10))
+ whisper_pipe.generate(np.zeros(16000, dtype=np.float32))
+ print(f"Model warmup time: {time.time() - start_time:.2f} seconds")
+
+ # Thread pools
+ generation_executor = ThreadPoolExecutor(max_workers=4)  # Increased workers
+ image_executor = ThreadPoolExecutor(max_workers=8)  # defined for parallel image work; unused in the current code path
+
+ def fetch_images(query: str, num: int = DEFAULT_NUM_IMAGES) -> list:
+     """Fetch unique images by requesting different result pages"""
+     start_time = time.time()
+
+     if num <= 0:
+         return []
+
+     try:
+         service = build("customsearch", "v1", developerKey=GOOGLE_API_KEY)
+         image_links = []
+         seen_urls = set()  # To track unique URLs
+
+         # Start from different positions to get unique images
+         for start_index in range(1, num * 2, 2):  # Step by 2 to get different pages
+             if len(image_links) >= num:
+                 break
+
+             res = service.cse().list(
+                 q=query,
+                 cx=GOOGLE_CSE_ID,
+                 searchType="image",
+                 num=1,  # Get one result per request
+                 start=start_index  # Start at different positions
+             ).execute()
+
+             if "items" in res and res["items"]:
+                 item = res["items"][0]
+                 # Skip duplicates
+                 if item["link"] not in seen_urls:
+                     image_links.append(item["link"])
+                     seen_urls.add(item["link"])
+
+         print(f"Unique image fetch time: {time.time() - start_time:.2f} seconds")
+         return image_links[:num]  # Return only the requested number
+     except Exception as e:
+         print(f"Error in image fetching: {e}")
+         return []
+
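+ # process_audio is a simple energy-based voice activity detector: it keeps the span of frames
+ # whose smoothed RMS exceeds a fixed threshold, padded by 0.1 s of audio on each side.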
+ def process_audio(data, sr):
+     start_time = time.time()
+     data = librosa.to_mono(data.T) if data.ndim > 1 else data
+     data = data.astype(np.float32)
+     peak = np.max(np.abs(data))
+     if peak > 0:  # guard against all-zero (silent) input
+         data /= peak
+     rms = librosa.feature.rms(y=data, frame_length=2048, hop_length=512)[0]
+     smoothed_rms = uniform_filter1d(rms, size=5)
+     speech_frames = np.where(smoothed_rms > 0.025)[0]
+     if not speech_frames.size:
+         print(f"Audio processing time: {time.time() - start_time:.2f} seconds")
+         return None
+     start = max(0, int(speech_frames[0] * 512 - 0.1 * sr))
+     end = min(len(data), int((speech_frames[-1] + 1) * 512 + 0.1 * sr))
+     print(f"Audio processing time: {time.time() - start_time:.2f} seconds")
+     return data[start:end]
+
+ def transcribe(audio):
+     start_time = time.time()
+     if audio is None:
+         print(f"Transcription time: {time.time() - start_time:.2f} seconds")
+         return ""
+     sr, data = audio
+     processed = process_audio(data, sr)
+     if processed is None or len(processed) < 1600:
+         print(f"Transcription time: {time.time() - start_time:.2f} seconds")
+         return ""
+     if sr != 16000:
+         processed = librosa.resample(processed, orig_sr=sr, target_sr=16000)
+     result = whisper_pipe.generate(processed)
+     print(f"Transcription time: {time.time() - start_time:.2f} seconds")
+     return str(result)  # coerce the decoded-results object to plain text for the Textbox
+
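+ # Streaming pattern: openvino_genai invokes the streamer callback with each decoded chunk on the
+ # generation thread; chunks flow through a Queue to this generator, which re-yields the growing text.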
+ def stream_answer(message: str, max_tokens: int, include_images: bool):
+     start_time = time.time()
+     response_queue = Queue()
+     completion_event = Event()
+     error = [None]
+
+     optimized_config = openvino_genai.GenerationConfig(
+         max_new_tokens=max_tokens,
+         num_beams=1,
+         do_sample=False,  # greedy decoding; temperature/top_p/top_k only take effect when sampling is enabled
+         temperature=1.0,
+         top_p=0.9,
+         top_k=30
+     )
+
+     def callback(subword):  # streamer callback: receives each newly decoded text chunk
+         response_queue.put(subword)
+         return openvino_genai.StreamingStatus.RUNNING
+
+     def generate():
+         try:
+             with pipe_lock:
+                 # Passing the callback as the streamer is what enables streaming output
+                 mistral_pipe.generate(message, optimized_config, callback)
+         except Exception as e:
+             error[0] = str(e)
+         finally:
+             completion_event.set()
+
+     generation_executor.submit(generate)
+
+     accumulated = []
+     char_count = 0
+     last_gc = time.time()
+
+     while not completion_event.is_set() or not response_queue.empty():
+         if error[0]:
+             yield f"Error: {error[0]}"
+             print(f"Stream answer time: {time.time() - start_time:.2f} seconds")
+             return
+
+         try:
+             chunk = response_queue.get(timeout=0.05)  # short timeout avoids a hot busy-wait loop
+             accumulated.append(chunk)
+             char_count += len(chunk)
+
+             # Periodic garbage collection
+             if time.time() - last_gc > 2.0:  # Every 2 seconds
+                 gc.collect()
+                 last_gc = time.time()
+
+             yield "".join(accumulated)
+         except Empty:
+             continue
+
+     print(f"Generated {char_count} characters in {time.time() - start_time:.2f} seconds "
+           f"({char_count / (time.time() - start_time):.2f} chars/sec)")
+     yield "".join(accumulated)
+
+ def run_chat(message: str, history: list, include_images: bool, max_tokens: int, num_images: int):
+     start_time = time.time()
+     final_text = ""
+
+     # Create a placeholder for the streaming response
+     history.append((message, "", []))
+     rendered_history = render_history(history)
+     yield rendered_history, gr.update(value="", interactive=False)
+
+     # Stream tokens and update chatbot in real-time
+     for output in stream_answer(message, max_tokens, include_images):
+         final_text = output
+         # Update only the last response in history
+         updated_history = history[:-1] + [(message, final_text, [])]
+         rendered_history = render_history(updated_history)
+         yield rendered_history, gr.update(value="", interactive=False)
+
+     images = []
+     if include_images:
+         images = fetch_images(message, num_images)
+
+     # Update history with final response and images
+     history[-1] = (message, final_text, images)
+     if len(history) > MAX_HISTORY_TURNS:
+         # Trim in place so the change is visible through the shared gr.State reference
+         del history[:-MAX_HISTORY_TURNS]
+
+     rendered_history = render_history(history)
+     print(f"Total chat time: {time.time() - start_time:.2f} seconds")
+     yield rendered_history, gr.update(value="", interactive=True)
+
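+ # render_history flattens (user, bot, images) triples into the (user_msg, html) pairs gr.Chatbot expects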
+ def render_history(history):
+     rendered = []
+     for user_msg, bot_msg, image_links in history:
+         text = bot_msg
+         if image_links:
+             images_html = "".join(
+                 f"<img src='{url}' class='chat-image' onclick='showImage(\"{url}\")' />"
+                 for url in image_links
+             )
+             text += f"<br><br><b>📸 Related Visuals:</b><br><div style='display: flex; flex-wrap: wrap;'>{images_html}</div>"
+         rendered.append((user_msg, text))
+
+     return rendered
+
+ # ===== IMAGE ANALYSIS FUNCTIONS =====
+ def load_image(image_source):
+     """Load image from various sources: file path, URL, or PIL Image"""
+     if isinstance(image_source, str):
+         if image_source.startswith(("http://", "https://")):
+             # Load from URL
+             response = requests.get(image_source, timeout=15)
+             response.raise_for_status()
+             image = Image.open(BytesIO(response.content)).convert("RGB")
+         else:
+             # Load from file path
+             image = Image.open(image_source).convert("RGB")
+     elif isinstance(image_source, Image.Image):
+         # Already a PIL image; normalize to 3 channels so the reshape below holds
+         image = image_source.convert("RGB")
+     else:
+         raise ValueError("Unsupported image input type")
+
+     # Convert to an NHWC uint8 OpenVINO tensor (signed np.byte would overflow pixel values above 127)
+     image_data = np.array(image.getdata()).reshape(1, image.size[1], image.size[0], 3).astype(np.uint8)
+     return ov.Tensor(image_data)
+
+ def analyze_image(image, url, prompt):
+     try:
+         # Determine image source (priority: uploaded image > URL)
+         image_source = image if image is not None else url
+
+         if not image_source:
+             return "⚠️ Please upload an image or enter an image URL"
+
+         # Convert to OpenVINO tensor
+         image_tensor = load_image(image_source)
+
+         # Get pipeline (lazy initialization)
+         pipe = get_internvl_pipeline()
+
+         # Generate response with thread safety
+         with internvl_lock:
+             pipe.start_chat()
+             output = pipe.generate(prompt, image=image_tensor, max_new_tokens=100)
+             pipe.finish_chat()
+
+         return output
+
+     except Exception as e:
+         return f"❌ Error: {str(e)}"
+
+ # ===== GRADIO INTERFACE =====
+ css = """
+ .processing {
+     animation: pulse 1.5s infinite;
+     color: #4a5568;
+     padding: 10px;
+     border-radius: 5px;
+     text-align: center;
+     margin: 10px 0;
+ }
+ @keyframes pulse {
+     0%, 100% { opacity: 1; }
+     50% { opacity: 0.5; }
+ }
+ .chat-image {
+     cursor: pointer;
+     transition: transform 0.2s;
+     max-height: 100px;
+     margin: 4px;
+     border-radius: 8px;
+     box-shadow: 0 2px 4px rgba(0,0,0,0.1);
+ }
+ .chat-image:hover {
+     transform: scale(1.05);
+     box-shadow: 0 4px 8px rgba(0,0,0,0.2);
+ }
+ .modal {
+     position: fixed;
+     top: 0;
+     left: 0;
+     width: 100%;
+     height: 100%;
+     background: rgba(0,0,0,0.8);
+     display: none;
+     z-index: 1000;
+     cursor: zoom-out;
+ }
+ .modal-content {
+     position: absolute;
+     top: 50%;
+     left: 50%;
+     transform: translate(-50%, -50%);
+     max-width: 90%;
+     max-height: 90%;
+     background: white;
+     padding: 10px;
+     border-radius: 12px;
+ }
+ .modal-img {
+     width: auto;
+     height: auto;
+     max-width: 100%;
+     max-height: 100%;
+     border-radius: 8px;
+ }
+ .chat-container {
+     border: 1px solid #e5e7eb;
+     border-radius: 12px;
+     padding: 20px;
+     margin-bottom: 20px;
+ }
+ .slider-container {
+     margin-top: 20px;
+     padding: 15px;
+     border-radius: 10px;
+     background-color: #f8f9fa;
+ }
+ .slider-label {
+     font-weight: bold;
+     margin-bottom: 5px;
+ }
+ .system-info {
+     background-color: #7B9BDB;
+     padding: 15px;
+     border-radius: 8px;
+     margin: 15px 0;
+     border-left: 4px solid #1890ff;
+ }
+ .typing-indicator {
+     display: inline-block;
+     position: relative;
+     width: 40px;
+     height: 20px;
+ }
+ .typing-dot {
+     display: inline-block;
+     width: 6px;
+     height: 6px;
+     border-radius: 50%;
+     background-color: #4a5568;
+     position: absolute;
+     animation: typing 1.4s infinite ease-in-out;
+ }
+ .typing-dot:nth-child(1) {
+     left: 0;
+     animation-delay: 0s;
+ }
+ .typing-dot:nth-child(2) {
+     left: 12px;
+     animation-delay: 0.2s;
+ }
+ .typing-dot:nth-child(3) {
+     left: 24px;
+     animation-delay: 0.4s;
+ }
+ @keyframes typing {
+     0%, 60%, 100% { transform: translateY(0); }
+     30% { transform: translateY(-5px); }
+ }
+ .tab-container {
+     border-radius: 12px;
+     padding: 20px;
+     background: #3fc9f8;
+     box-shadow: 0 4px 6px rgba(0,0,0,0.05);
+     margin-bottom: 20px;
+ }
+ .tab-header {
+     font-size: 24px;
+     margin-bottom: 20px;
+     padding-bottom: 10px;
+     border-bottom: 2px solid #e5e7eb;
+ }
+ """
+
+ with gr.Blocks(css=css, title="EDU Chat by Phanindra Reddy K") as demo:
+     gr.Markdown("# 🤖 EDU CHAT BY PHANINDRA REDDY K")
+
+     # System info banner
+     gr.HTML("""
+     <div class="system-info">
+         <strong>Multi-Modal AI Assistant</strong>
+         <ul>
+             <li>Text & Voice Chat with Mistral-7B</li>
+             <li>Image Understanding with InternVL</li>
+             <li>Optimized for High-RAM Systems</li>
+         </ul>
+     </div>
+     """)
+
+     modal_html = """
+     <div class="modal" id="imageModal" onclick="this.style.display='none'">
+         <div class="modal-content">
+             <img class="modal-img" id="expandedImg">
+         </div>
+     </div>
+     <script>
+         function showImage(url) {
+             document.getElementById('expandedImg').src = url;
+             document.getElementById('imageModal').style.display = 'block';
+         }
+     </script>
+     """
+     gr.HTML(modal_html)
+
+     # Create tabs for different functionalities
+     with gr.Tabs():
+         # ===== MAIN CHAT TAB =====
+         with gr.Tab("💬 Chat Assistant", id="chat_tab"):
+             state = gr.State([])
+
+             with gr.Column(scale=2, elem_classes="chat-container"):
+                 chatbot = gr.Chatbot(label="Conversation", height=500, bubble_full_width=False)
+
+             with gr.Column(scale=1):
+                 gr.Markdown("### 💬 Ask Your Question")
+
+                 with gr.Row():
+                     user_input = gr.Textbox(
+                         placeholder="Type your question here...",
+                         label="",
+                         container=False,
+                         elem_id="question-input"
+                     )
+                     include_images = gr.Checkbox(
+                         label="Include Visuals",
+                         value=True,
+                         container=False,
+                         elem_id="image-checkbox"
+                     )
+
+                 # Add the sliders container
+                 with gr.Column(elem_classes="slider-container"):
+                     gr.Markdown("### ⚙️ Generation Settings")
+
+                     with gr.Row():
+                         max_tokens = gr.Slider(
+                             minimum=10,
+                             maximum=MAX_TOKENS_LIMIT,  # Increased to 1000
+                             value=DEFAULT_MAX_TOKENS,
+                             step=10,
+                             label="Response Length (Tokens)",
+                             info=f"Max: {MAX_TOKENS_LIMIT} tokens (for detailed explanations)",
+                             elem_classes="slider-label"
+                         )
+
+                     # Conditionally visible image slider row
+                     with gr.Row(visible=True) as image_slider_row:
+                         num_images = gr.Slider(
+                             minimum=0,
+                             maximum=5,
+                             value=DEFAULT_NUM_IMAGES,
+                             step=1,
+                             label="Number of Images",
+                             info="Set to 0 to disable images",
+                             elem_classes="slider-label"
+                         )
+
+                 with gr.Row():
+                     submit_btn = gr.Button("Send Text", variant="primary")
+                     mic_btn = gr.Button("Transcribe Voice", variant="secondary")
+                     mic = gr.Audio(
+                         sources=["microphone"],
+                         type="numpy",
+                         label="Voice Input",
+                         show_label=False,
+                         elem_id="voice-input"
+                     )
+
+             processing = gr.HTML("""
+             <div id="processing" style="display: none;">
+                 <div class="processing">🔮 Processing your request...</div>
+             </div>
+             """)
+
+             # Toggle image slider visibility based on checkbox
+             def toggle_image_slider(include_visuals):
+                 return gr.update(visible=include_visuals)
+
+             include_images.change(
+                 fn=toggle_image_slider,
+                 inputs=include_images,
+                 outputs=image_slider_row
+             )
+
+             def toggle_processing():
+                 return gr.update(visible=True), gr.update(interactive=False)
+
+             def hide_processing():
+                 return gr.update(visible=False), gr.update(interactive=True)
+
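+             # The .then() chain runs sequentially: show the spinner, stream the response, then restore the UI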
+             # Update the submit_btn click handler to include streaming
+             submit_btn.click(
+                 fn=toggle_processing,
+                 outputs=[processing, submit_btn]
+             ).then(
+                 fn=run_chat,
+                 inputs=[user_input, state, include_images, max_tokens, num_images],
+                 outputs=[chatbot, user_input]
+             ).then(
+                 fn=hide_processing,
+                 outputs=[processing, submit_btn]
+             )
+
+             # Voice transcription
+             mic_btn.click(
+                 fn=toggle_processing,
+                 outputs=[processing, mic_btn]
+             ).then(
+                 fn=transcribe,
+                 inputs=mic,
+                 outputs=user_input
+             ).then(
+                 fn=hide_processing,
+                 outputs=[processing, mic_btn]
+             )
+
+         # ===== IMAGE ANALYSIS TAB =====
+         with gr.Tab("🖼️ Image Analysis", id="image_tab"):
+             with gr.Column(elem_classes="tab-container"):
+                 gr.Markdown("## 🖼️ Image Understanding with InternVL")
+                 gr.Markdown("Upload an image or enter a URL, then ask questions about it")
+
+                 with gr.Row():
+                     with gr.Column():
+                         # Image upload
+                         image_upload = gr.Image(type="pil", label="Upload Image")
+
+                         # URL input
+                         url_input = gr.Textbox(
+                             label="OR Enter Image URL",
+                             placeholder="https://example.com/image.jpg",
+                             info="Enter a direct image URL"
+                         )
+
+                         # Preview image
+                         preview = gr.Image(label="Preview", interactive=False)
+
+                         # Update preview when inputs change
+                         def update_preview(img, url):
+                             if img is not None:
+                                 return img
+                             elif url and url.startswith(("http://", "https://")):
+                                 return url
+                             return None
+
+                         image_upload.change(update_preview, [image_upload, url_input], preview)
+                         url_input.change(update_preview, [image_upload, url_input], preview)
+
+                     with gr.Column():
+                         # Question input
+                         prompt = gr.Textbox(
+                             label="Question",
+                             placeholder="What is unusual in this image?",
+                             info="Ask anything about the image"
+                         )
+
+                         # Submit button
+                         img_submit_btn = gr.Button("Ask Question", variant="primary")
+
+                         # Output
+                         img_output = gr.Textbox(label="Model Response", interactive=False)
+
+                         # Submit action
+                         img_submit_btn.click(
+                             fn=analyze_image,
+                             inputs=[image_upload, url_input, prompt],
+                             outputs=img_output
+                         )
+
+ if __name__ == "__main__":
+     demo.launch(share=True, debug=True)
requirements.txt ADDED
@@ -0,0 +1,15 @@
+ gradio==4.26.0
+ openvino-genai>=1.0.0
+ librosa>=0.10.0
+ numpy>=1.24.0
+ scipy>=1.10.0
+ huggingface_hub>=0.21.4
+ google-api-python-client>=2.132.0
+ py-cpuinfo>=8.0.0
+ requests>=2.32.3
+ Pillow>=10.3.0
+ soundfile>=0.12.1
+ openvino>=2024.1.0
+ tqdm>=4.66.2
+ protobuf>=4.25.3
+ tokenizers>=0.19.1