File size: 7,413 Bytes
2682f2f
0059280
9d10166
2682f2f
0059280
2682f2f
 
 
1557704
511d264
 
 
 
 
 
 
 
 
 
 
 
 
9917453
c68ba3a
2682f2f
511d264
2682f2f
a4fd732
2682f2f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cd49d70
2682f2f
 
 
 
 
 
 
 
 
511d264
2682f2f
 
a4fd732
2682f2f
 
511d264
 
 
 
 
 
 
 
 
 
 
 
 
a18abb2
2682f2f
 
 
 
a4fd732
 
 
511d264
 
2682f2f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a4fd732
2682f2f
a4fd732
cd49d70
2682f2f
 
a4fd732
2682f2f
 
a4fd732
2682f2f
 
 
 
 
 
 
 
 
 
 
 
 
 
71843eb
2682f2f
71843eb
2682f2f
 
 
a4fd732
 
71843eb
a4fd732
2682f2f
71843eb
511d264
a4fd732
2682f2f
511d264
2682f2f
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import os
# Install runtime dependencies at startup. These calls must run before the
# `import whisper` and `from neon_tts_plugin_coqui import CoquiTTS` lines
# below, which is why they sit in the middle of the import section.
# NOTE(review): the exit status of os.system is ignored here, so a failed
# install is not detected at this point.
os.system("pip install git+https://github.com/openai/whisper.git")
os.system("pip install neon-tts-plugin-coqui==0.6.0")
import gradio as gr
import whisper
import requests 
import tempfile
from neon_tts_plugin_coqui import CoquiTTS
from datasets import load_dataset
import random

# Joke corpus (train split); driver_fun reads its 'Joke' column.
dataset = load_dataset("ysharma/short_jokes", split="train")

# Model 2: Sentence Transformer, reached through the URL below by query().
API_URL = "https://api-inference.huggingface.co/models/sentence-transformers/msmarco-distilbert-base-tas-b"
# Read at import time: a missing HF_TOKEN environment variable raises KeyError here.
HF_TOKEN = os.environ["HF_TOKEN"]
# Bearer-token header sent with every request made by query().
headers = {"Authorization": f"Bearer {HF_TOKEN}"}

def query(payload, timeout=120):
    """POST ``payload`` as JSON to ``API_URL`` and return the decoded JSON reply.

    Args:
        payload: JSON-serialisable request body.
        timeout: Seconds to wait for the server (connect and read) before
            giving up. Without a timeout, ``requests`` waits indefinitely and
            a stalled endpoint would hang the worker.

    Returns:
        The decoded JSON body of the response, whatever its shape; callers
        are responsible for checking it.

    Raises:
        requests.exceptions.Timeout: if the server does not answer in time.
    """
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    return response.json()



# Languages common to both multilingual models (Whisper and Coqui) include
# English, Spanish and French - compare the two language lists below.
# Model 1: Whisper: Speech-to-text
# The "base" checkpoint is loaded once at import time; whisper_stt uses it.
model = whisper.load_model("base")
# Alternative "medium" checkpoint, currently disabled:
#model_med = whisper.load_model("medium")
# Languages covered in Whisper - (exhaustive list) : 
#"en": "english", "zh": "chinese", "de": "german", "es": "spanish", "ru": "russian", 
#"ko": "korean", "fr": "french", "ja": "japanese", "pt": "portuguese", "tr": "turkish", 
#"pl": "polish", "ca": "catalan", "nl": "dutch", "ar": "arabic", "sv": "swedish", 
#"it": "italian", "id": "indonesian", "hi": "hindi", "fi": "finnish", "vi": "vietnamese", 
#"iw": "hebrew", "uk": "ukrainian", "el": "greek", "ms": "malay", "cs": "czech", 
#"ro": "romanian", "da": "danish", "hu": "hungarian", "ta": "tamil", "no": "norwegian", 
#"th": "thai", "ur": "urdu", "hr": "croatian", "bg": "bulgarian", "lt": "lithuanian", 
#"la": "latin", "mi": "maori", "ml": "malayalam", "cy": "welsh", "sk": "slovak", 
#"te": "telugu", "fa": "persian", "lv": "latvian", "bn": "bengali", "sr": "serbian", 
#"az": "azerbaijani", "sl": "slovenian", "kn": "kannada", "et": "estonian", 
#"mk": "macedonian", "br": "breton", "eu": "basque", "is": "icelandic", "hy": "armenian", 
#"ne": "nepali", "mn": "mongolian", "bs": "bosnian", "kk": "kazakh", "sq": "albanian", 
#"sw": "swahili", "gl": "galician", "mr": "marathi", "pa": "punjabi", "si": "sinhala", 
#"km": "khmer", "sn": "shona", "yo": "yoruba", "so": "somali", "af": "afrikaans", 
#"oc": "occitan", "ka": "georgian", "be": "belarusian", "tg": "tajik", "sd": "sindhi", 
#"gu": "gujarati", "am": "amharic", "yi": "yiddish", "lo": "lao", "uz": "uzbek", 
#"fo": "faroese", "ht": "haitian creole", "ps": "pashto", "tk": "turkmen", "nn": "nynorsk", 
#"mt": "maltese", "sa": "sanskrit", "lb": "luxembourgish", "my": "myanmar", "bo": "tibetan", 
#"tl": "tagalog", "mg": "malagasy", "as": "assamese", "tt": "tatar", "haw": "hawaiian", 
#"ln": "lingala", "ha": "hausa", "ba": "bashkir", "jw": "javanese", "su": "sundanese",


# Model 3: Text-to-Speech (Coqui)
# Language codes advertised by the Coqui plugin.
LANGUAGES = list(CoquiTTS.langs.keys())
# Single shared TTS engine, created once at import time and used by tts().
coquiTTS = CoquiTTS()
print(f"Languages for Coqui are: {LANGUAGES}")
# Output recorded by the original author:
# Languages for Coqui are: ['en', 'es', 'fr', 'de', 'pl', 'uk', 'ro', 'hu', 'el', 'bg', 'nl', 'fi', 'sl', 'lv', 'ga']
# en - English, es - Spanish, fr - French, de - German, pl - Polish
# uk - Ukrainian, ro - Romanian, hu - Hungarian, el - Greek, bg - Bulgarian,
# nl - Dutch, fi - Finnish, sl - Slovenian, lv - Latvian, ga - Irish


  
# Driver function
def driver_fun(audio):
  """Turn a recorded voice clip into a joke plus its spoken rendition.

  Steps: Whisper speech-to-text, pick a random contiguous window of jokes
  from the dataset, score that window with the sentence-similarity endpoint,
  then synthesise the best-scoring joke with Coqui TTS.

  Args:
    audio: Path of the recorded audio file, or None when nothing was recorded.

  Returns:
    Tuple ``(translation, joke, speech)``: the text produced by whisper_stt,
    the selected joke, and the path of the WAV file produced by tts.

  Raises:
    ValueError: if ``audio`` is None.
    RuntimeError: if the dataset is empty, or the similarity endpoint does
      not return exactly one score per candidate joke (e.g. an error payload).
  """
  if audio is None:
    raise ValueError("No audio was recorded; please record a clip first.")

  # The detected language is currently unused: the reply is always English.
  translation, _lang = whisper_stt(audio)

  # Choose a random window of candidate jokes. The bounds come from the
  # actual dataset length, not a hard-coded row count.
  total_jokes = len(dataset)
  if total_jokes == 0:
    raise RuntimeError("Joke dataset is empty.")
  window_size = min(5000, total_jokes)
  lower_limit = random.randrange(total_jokes - window_size + 1)
  upper_limit = lower_limit + window_size
  print(f"lower_limit : upper_limit = {lower_limit} : {upper_limit}")
  # Slice rows first so only the window is materialised, not the whole column.
  dataset_subset = dataset[lower_limit:upper_limit]['Joke']

  # NOTE(review): the source sentence is a fixed placeholder, so the spoken
  # input does not influence which joke is picked - confirm whether
  # `translation` was meant to be used here.
  data = query({"inputs": {"source_sentence": "That is a happy person", "sentences": dataset_subset}})
  # The endpoint is expected to return one similarity score per sentence;
  # anything else (such as an error dict) cannot be ranked.
  if not isinstance(data, list) or len(data) != len(dataset_subset):
    raise RuntimeError(f"Unexpected response from the similarity API: {data}")

  # The highest score marks the joke to return; the score itself is a number,
  # so the joke has to be looked up in the candidate list.
  best_index = max(range(len(data)), key=data.__getitem__)
  joke = dataset_subset[best_index]

  speech = tts(joke, 'en')
  return translation, joke, speech


# Whisper - speech-to-text
def whisper_stt(audio):
  """Run Whisper on an audio file and return its text and detected language.

  Args:
    audio: Path of the audio file to process.

  Returns:
    Tuple ``(text, lang)``: the decoded text from the 'translate' task and
    the language code with the highest detection probability.
  """
  print("Inside Whisper STT")
  # load audio and pad/trim it to fit 30 seconds
  waveform = whisper.pad_or_trim(whisper.load_audio(audio))

  # make log-Mel spectrogram and move to the same device as the model
  mel = whisper.log_mel_spectrogram(waveform).to(model.device)

  # detect the spoken language: keep the code with the highest probability
  _, probs = model.detect_language(mel)
  lang = max(probs, key=probs.get)
  print(f"Detected language: {lang}")

  # decode the audio; the decoder language is pinned to 'en' regardless of
  # the detected language (the UI asks for English input)
  options_transl = whisper.DecodingOptions(fp16=False, language='en', task='translate')
  result_transl = whisper.decode(model, mel, options_transl)

  # print the recognized text
  print(f"translation is : {result_transl.text}")

  return result_transl.text, lang


# Coqui - Text-to-Speech
def tts(text, language):
  """Synthesise ``text`` into a temporary WAV file and return that file's path.

  Any ``language`` outside the locally supported codes falls back to 'en'.
  The file is created with ``delete=False``, so it stays on disk after this
  function returns.
  """
  print(f"Inside tts - language is : {language}")
  supported_codes = ('en', 'es', 'fr', 'de', 'pl', 'uk', 'ro', 'hu', 'bg', 'nl', 'fi', 'sl', 'lv', 'ga')
  speaker_language = language if language in supported_codes else 'en'

  wav_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
  with wav_file:
    # The open file handle is given to the engine; leaving the block closes it.
    coquiTTS.get_tts(text, wav_file, speaker={"language": speaker_language})
  return wav_file.name

# Gradio front end: record audio on the left, show the joke and its spoken
# rendition on the right.
demo = gr.Blocks()
with demo:
  gr.Markdown("<h1><center>AI Assistant - Voice to Joke</center></h1>")
  gr.Markdown(
        """Model pipeline consisting of - <br>- [**Whisper**](https://github.com/openai/whisper) for Speech-to-text, <br>- [**CoquiTTS**](https://huggingface.co/coqui)  for Text-To-Speech. <br>- Front end is built using [**Gradio Block API**](https://gradio.app/docs/#blocks).<br><br>Both CoquiTTS and Whisper are Multilingual, there are several overlapping languages between them. Hence it would be suggested to test this ML-App using these two languages to get the best results.<br>If you want to reuse the App, simply click on the small cross button in the top right corner of your voice record panel, and then press record again!
        """)
  with gr.Row():
    with gr.Column():
      # Input side: microphone recording handed to driver_fun as a file path.
      in_audio = gr.Audio(source="microphone",  type="filepath", label='Record your voice command here in English -')
      b1 = gr.Button("AI Response")
      out_transcript = gr.Textbox(label= 'Transcript of your Audio using OpenAI Whisper')
    with gr.Column():
      # Output side: synthesised speech and the text of the chosen joke.
      out_audio = gr.Audio(label='Audio response from CoquiTTS')
      out_generated_joke = gr.Textbox(label= 'Joke returned! ')

  # Output order must match driver_fun's return order: (translation, joke, speech).
  b1.click(driver_fun, inputs=[in_audio], outputs=[out_transcript, out_generated_joke, out_audio])

demo.launch(enable_queue=True, debug=True)