TVRRaviteja committed on
Commit 890b142 · verified · 1 Parent(s): 837a5d8

Create app.py

Files changed (1)
  1. app.py +208 -0
app.py ADDED
@@ -0,0 +1,208 @@
+ # -*- coding: utf-8 -*-
+ """Multimodal-AI-Assistant-Llava7B.ipynb
+
+ Automatically generated by Colaboratory.
+
+ Original file is located at
+     https://colab.research.google.com/drive/1MMQJ3bBnKOejL32I5wHSrpl8h-GX5FgG
+ """
+
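+ # Note: the `!pip install` lines below are Colab/IPython cell magics carried over from the
+ # notebook; in a standalone app.py they are not valid Python, so the same version pins would
+ # normally live in requirements.txt instead.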
+ !pip install -q -U transformers==4.37.2
+ !pip install -q bitsandbytes==0.41.3 accelerate==0.25.0
+ !pip install -q git+https://github.com/openai/whisper.git  # install Whisper from OpenAI
+ !pip install -q gradio
+ !pip install -q gTTS
+
+ import torch
+ from transformers import BitsAndBytesConfig, pipeline
+
+ quantization_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_compute_dtype=torch.float16
+ )
+
+ model_id = "llava-hf/llava-1.5-7b-hf"
+
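+ # Note: with load_in_4bit the model is quantized through bitsandbytes, which requires a CUDA
+ # GPU; on CPU-only hardware the quantization_config would have to be dropped.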
+ pipe = pipeline("image-to-text",
+                 model=model_id,
+                 model_kwargs={"quantization_config": quantization_config})
+
+ import whisper
+ import gradio as gr
+ import time
+ import warnings
+ import os
+ from gtts import gTTS
+
+ from PIL import Image
+
+ # Quick test carried over from the notebook: open a local sample image (img.jpg must exist
+ # alongside this file).
+ image_path = "img.jpg"
+ image = Image.open(image_path)
+ image  # displays the image in a notebook cell; no effect in a plain script
+
+ import nltk
+ nltk.download('punkt')
+ from nltk import sent_tokenize
+
+ max_new_tokens = 200
+
+ prompt_instructions = """
+ Describe the image using as much detail as possible,
+ is it a painting, a photograph, what colors are predominant, what's happening in the image,
+ what is the image about?
+ """
+
+ prompt = "USER: <image>\n" + prompt_instructions + "\nASSISTANT:"
+
+ outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})
+
+ outputs
+
+ for sent in sent_tokenize(outputs[0]["generated_text"]):
+     print(sent)
+
+ warnings.filterwarnings("ignore")
+
+ import warnings
+ from gtts import gTTS
+ import numpy as np
+
+ torch.cuda.is_available()
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+ print(f"Using torch {torch.__version__} ({DEVICE})")
+
+ import whisper
+ model = whisper.load_model("medium", device=DEVICE)  # other sizes: tiny, base, small, medium, large
+ print(
+     f"Model is {'multilingual' if model.is_multilingual else 'English-only'} "
+     f"and has {sum(np.prod(p.shape) for p in model.parameters()):,} parameters."
+ )
+
+ import re
+ import datetime
+ import os
+
+ ## Logger file
+ tstamp = datetime.datetime.now()
+ tstamp = str(tstamp).replace(' ', '_')
+ logfile = f'{tstamp}_log.txt'
+
+ def writehistory(text):
+     with open(logfile, 'a', encoding='utf-8') as f:
+         f.write(text)
+         f.write('\n')
+
+ def img2txt(input_text, input_image):
+
+     # load the image
+     image = Image.open(input_image)
+
+     writehistory(f"Input text: {input_text} - Type: {type(input_text)} - Dir: {dir(input_text)}")
+     if isinstance(input_text, tuple):
+         prompt_instructions = """
+         Describe the image using as much detail as possible, is it a painting, a photograph, what colors are predominant, what's happening in the image, what is the image about?
+         """
+     else:
+         prompt_instructions = """
+         Act as an expert in imagery descriptive analysis, using as much detail as possible from the image, respond to the following prompt:
+         """ + input_text
+
+     writehistory(f"prompt_instructions: {prompt_instructions}")
+     prompt = "USER: <image>\n" + prompt_instructions + "\nASSISTANT:"
+
+     outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})
+
+     # Properly extract the response text (re.DOTALL keeps multi-line replies intact)
+     if outputs is not None and len(outputs[0]["generated_text"]) > 0:
+         match = re.search(r'ASSISTANT:\s*(.*)', outputs[0]["generated_text"], re.DOTALL)
+         if match:
+             # Extract the text after "ASSISTANT:"
+             reply = match.group(1)
+         else:
+             reply = "No response found."
+     else:
+         reply = "No response generated."
+
+     return reply
+
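+ # Note: whisper.pad_or_trim() pads or clips audio to 30 seconds, so transcribe() below only
+ # decodes the first 30 s of a recording.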
+ def transcribe(audio):
+
+     # Check if the audio input is None or empty
+     if audio is None or audio == '':
+         return ('', '', None)  # Return empty strings and None audio file
+
+     # language = 'en'
+
+     audio = whisper.load_audio(audio)
+     audio = whisper.pad_or_trim(audio)
+
+     mel = whisper.log_mel_spectrogram(audio).to(model.device)
+
+     _, probs = model.detect_language(mel)
+
+     options = whisper.DecodingOptions()
+     result = whisper.decode(model, mel, options)
+     result_text = result.text
+
+     return result_text
+
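+ # Note: gTTS synthesizes speech through Google's online text-to-speech endpoint, so this step
+ # needs outbound internet access.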
+ def text_to_speech(text, file_path):
+     language = 'en'
+
+     audioobj = gTTS(text=text,
+                     lang=language,
+                     slow=False)
+
+     audioobj.save(file_path)
+
+     return file_path
+
+ import locale
+ print(locale.getlocale())  # Before running the pipeline
+ # Run the pipeline
+ print(locale.getlocale())  # After running the pipeline
+
+ locale.getpreferredencoding = lambda: "UTF-8"  # required so shell commands in Colab use UTF-8
+
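+ # Generate 10 seconds of silence as a placeholder Temp.mp3 for the Gradio audio output
+ # (another Colab shell magic).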
+ !ffmpeg -f lavfi -i anullsrc=r=44100:cl=mono -t 10 -q:a 9 -acodec libmp3lame Temp.mp3
+
+ import gradio as gr
+ import base64
+ import os
+
+ # A function to handle audio and image inputs
+ def process_inputs(audio_path, image_path):
+     # Transcribe the recorded audio with Whisper
+     speech_to_text_output = transcribe(audio_path)
+
+     # Handle the image input
+     if image_path:
+         chatgpt_output = img2txt(speech_to_text_output, image_path)
+     else:
+         chatgpt_output = "No image provided."
+
+     # Convert the assistant's reply to speech with gTTS (any writable path works)
+     processed_audio_path = text_to_speech(chatgpt_output, "Temp3.mp3")
+
+     return speech_to_text_output, chatgpt_output, processed_audio_path
+
+ # Create the interface
+ iface = gr.Interface(
+     fn=process_inputs,
+     inputs=[
+         gr.Audio(sources=["microphone"], type="filepath"),
+         gr.Image(type="filepath")
+     ],
+     outputs=[
+         gr.Textbox(label="Speech to Text"),
+         gr.Textbox(label="AI Output"),
+         gr.Audio("Temp.mp3")
+     ],
+     title="Multi Modal AI Assistant Using Whisper and Llava",
+     description="Upload an image and interact via voice input and audio response."
+ )
+
+ # Launch the interface
+ iface.launch(debug=True, inline=False)
+