Spaces:
Runtime error
Commit: update
app.py CHANGED
```diff
@@ -29,21 +29,18 @@ def query(payload):
 # Language common in both the multilingual models - English, Chinese, Spanish, and French etc
 # Model 1: Whisper: Speech-to-text
 model = whisper.load_model("base")
-#model_med = whisper.load_model("medium")
 
 
 #Model 2: Text-to-Speech
 LANGUAGES = list(CoquiTTS.langs.keys())
 coquiTTS = CoquiTTS()
-print(f"Languages for Coqui are: {LANGUAGES}")
 #Languages for Coqui are: ['en', 'es', 'fr', 'de', 'pl', 'uk', 'ro', 'hu', 'el', 'bg', 'nl', 'fi', 'sl', 'lv', 'ga']
 
 
 # Driver function
 def driver_fun(audio) :
-    #if audio is None:
 
-  translation, lang = whisper_stt(audio)
+    translation, lang = whisper_stt(audio)
 
     random_val = random.randrange(0,231657)
     if random_val < 226657:
```
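Hunk 1 drops dead weight: the commented-out `medium` model load, a debug `print` of the language list, and the `#if audio is None:` placeholder. Removing that last comment leaves the click handler with no guard against an empty recording; a minimal sketch of what such a guard could look like (the message text and empty-return shape are assumptions based on the three Gradio outputs wired up later):

```python
def driver_fun(audio):
    # Gradio's microphone input passes None when nothing was recorded;
    # returning early keeps whisper_stt from crashing on a missing file.
    if audio is None:
        return "No audio received - please record again.", "", None
    translation, lang = whisper_stt(audio)  # helper defined later in app.py
    ...  # joke selection and TTS continue as in the hunks below
```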
```diff
@@ -65,8 +62,8 @@ def driver_fun(audio) :
         joke = dataset_subset[indx_score]
         print(f"Joke is : {joke}")
 
-  speech = tts(joke, 'en')
-  return translation, joke, speech
+    speech = tts(joke, 'en')
+    return translation, joke, speech
 
 
 # Whisper - speech-to-text
```
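Hunk 2 touches only the tail of `driver_fun`. Pieced together from both hunks, the driver now reads roughly as below; `dataset_subset`, the else branch of the random gate, and the exact bounds logic live outside this diff, so treat this as a sketch rather than the file's literal contents:

```python
import random

def driver_fun(audio):
    # Speech -> English text via the Whisper helper.
    translation, lang = whisper_stt(audio)

    # The bounds suggest a ~231,657-entry joke dataset whose last ~5,000
    # indices get special handling; the else branch is not shown in the diff.
    random_val = random.randrange(0, 231657)
    if random_val < 226657:
        indx_score = random_val
    joke = dataset_subset[indx_score]

    # Joke text -> spoken audio, always synthesized in English.
    speech = tts(joke, 'en')
    return translation, joke, speech
```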
```diff
@@ -85,24 +82,18 @@ def whisper_stt(audio):
     print(f"Detected language: {max(probs, key=probs.get)}")
 
     # decode the audio
-    #options_transc = whisper.DecodingOptions(fp16 = False, language=lang, task='transcribe') #lang
     options_transl = whisper.DecodingOptions(fp16 = False, language='en', task='translate') #lang
-    #result_transc = whisper.decode(model_med, mel, options_transc)
     result_transl = whisper.decode(model, mel, options_transl) #model_med
 
-  # print the
-  #print(f"transcript is : {result_transc.text}")
+    # print the transcribed text
     print(f"translation is : {result_transl.text}")
 
-  return result_transl.text, lang
+    return result_transl.text, lang
 
 
 # Coqui - Text-to-Speech
 def tts(text, language):
     print(f"Inside tts - language is : {language}")
-    #coqui_langs = ['en' ,'es' ,'fr' ,'de' ,'pl' ,'uk' ,'ro' ,'hu' ,'bg' ,'nl' ,'fi' ,'sl' ,'lv' ,'ga']
-    #if language not in coqui_langs:
-    #    language = 'en'
     print(f"Text is : {text}")
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
         coquiTTS.get_tts(text, fp, speaker = {"language" : language})
```
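Hunk 3 strips the commented-out transcription path and the redundant language fallback in `tts`. The surviving `whisper_stt` path is the standard openai-whisper decoding flow; a self-contained sketch, where the lines above the hunk (audio loading and mel-spectrogram setup) are reconstructed from the library's usual usage rather than taken from this diff:

```python
import whisper

model = whisper.load_model("base")

def whisper_stt(audio_path):
    # Load the recording, pad/trim it to 30 seconds, and build the log-Mel
    # spectrogram on the model's device.
    audio = whisper.pad_or_trim(whisper.load_audio(audio_path))
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # Detect the spoken language from the spectrogram.
    _, probs = model.detect_language(mel)
    lang = max(probs, key=probs.get)
    print(f"Detected language: {lang}")

    # task='translate' makes Whisper emit English text whatever the source
    # language; fp16=False keeps decoding in CPU-friendly float32.
    options_transl = whisper.DecodingOptions(fp16=False, language='en', task='translate')
    result_transl = whisper.decode(model, mel, options_transl)
    print(f"translation is : {result_transl.text}")
    return result_transl.text, lang
```

The `tts` helper that closes the hunk writes Coqui's output to a temporary file. Assuming `CoquiTTS` comes from the `neon-tts-plugin-coqui` package (the import is not visible in this diff) and that the function returns the file path for Gradio to play:

```python
import tempfile
from neon_tts_plugin_coqui import CoquiTTS  # assumed import; not shown in the diff

coquiTTS = CoquiTTS()

def tts(text, language):
    # delete=False keeps the .wav on disk after the handle closes, so the
    # Gradio Audio component can still read it once the handler returns.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        coquiTTS.get_tts(text, fp, speaker={"language": language})
        return fp.name  # returning the path is an assumption; the hunk ends here
```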
```diff
@@ -119,12 +110,11 @@ with demo:
             in_audio = gr.Audio(source="microphone", type="filepath", label='Record your voice command here in English -') #type='filepath'
             b1 = gr.Button("AI Response")
             out_transcript = gr.Textbox(label= 'Transcript of your Audio using OpenAI Whisper')
-
+
         with gr.Column():
             out_audio = gr.Audio(label='Audio response form CoquiTTS')
             out_generated_joke = gr.Textbox(label= 'Joke returned! ')
-
-
+
     b1.click(driver_fun,inputs=[in_audio], outputs=[out_transcript, out_generated_joke, out_audio]) #out_translation_en, out_generated_text,out_generated_text_en,
     with gr.Row():
         gr.Markdown(
```
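Hunk 4 only tightens blank lines in the UI block. Reconstructed from the visible lines, the surrounding Blocks layout looks roughly like this (gradio 3.x API, where `gr.Audio` still accepts `source=`; the `Row`/`Column` nesting outside the hunk and the `demo.launch()` call are assumptions):

```python
import gradio as gr

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            in_audio = gr.Audio(source="microphone", type="filepath",
                                label='Record your voice command here in English -')
            b1 = gr.Button("AI Response")
            out_transcript = gr.Textbox(label='Transcript of your Audio using OpenAI Whisper')
        with gr.Column():
            out_audio = gr.Audio(label='Audio response from CoquiTTS')
            out_generated_joke = gr.Textbox(label='Joke returned!')

    # One click drives the whole pipeline; the outputs order must match
    # driver_fun's return order: (translation, joke, speech).
    b1.click(driver_fun, inputs=[in_audio],
             outputs=[out_transcript, out_generated_joke, out_audio])

demo.launch()
```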
|