Spaces:
Runtime error
Runtime error
File size: 7,413 Bytes
2682f2f 0059280 9d10166 2682f2f 0059280 2682f2f 1557704 511d264 9917453 c68ba3a 2682f2f 511d264 2682f2f a4fd732 2682f2f cd49d70 2682f2f 511d264 2682f2f a4fd732 2682f2f 511d264 a18abb2 2682f2f a4fd732 511d264 2682f2f a4fd732 2682f2f a4fd732 cd49d70 2682f2f a4fd732 2682f2f a4fd732 2682f2f 71843eb 2682f2f 71843eb 2682f2f a4fd732 71843eb a4fd732 2682f2f 71843eb 511d264 a4fd732 2682f2f 511d264 2682f2f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 |
# Runtime bootstrap: Whisper and the Coqui TTS plugin are pip-installed here,
# before the imports below, so the install commands run every time this
# module is loaded.
import os
os.system("pip install git+https://github.com/openai/whisper.git")
os.system("pip install neon-tts-plugin-coqui==0.6.0")
import gradio as gr
import whisper
import requests
import tempfile
from neon_tts_plugin_coqui import CoquiTTS
from datasets import load_dataset
import random
# Joke corpus, loaded once at import time; driver_fun() reads its 'Joke' column.
dataset = load_dataset("ysharma/short_jokes", split="train")
# Model 2: Sentence Transformer
# Hosted Hugging Face Inference API endpoint that query() posts to.
API_URL = "https://api-inference.huggingface.co/models/sentence-transformers/msmarco-distilbert-base-tas-b"
# The API token comes from the environment; a missing HF_TOKEN raises KeyError at import time.
HF_TOKEN = os.environ["HF_TOKEN"]
# Bearer-token header sent with every query() request.
headers = {"Authorization": f"Bearer {HF_TOKEN}"}
def query(payload):
    """Send ``payload`` to the sentence-transformer endpoint and return its reply.

    The payload is posted as a JSON body to ``API_URL`` with the module-level
    authorization ``headers``; the decoded JSON response is returned as-is.
    """
    reply = requests.post(API_URL, json=payload, headers=headers)
    return reply.json()
# Languages shared by both multilingual models (Whisper and Coqui) include English, Spanish, French and German
# Model 1: Whisper: Speech-to-text
# The "base" checkpoint is loaded once at import time; whisper_stt() uses this global.
model = whisper.load_model("base")
#model_med = whisper.load_model("medium")
# Languages covered in Whisper - (exhaustive list) :
#"en": "english", "zh": "chinese", "de": "german", "es": "spanish", "ru": "russian",
#"ko": "korean", "fr": "french", "ja": "japanese", "pt": "portuguese", "tr": "turkish",
#"pl": "polish", "ca": "catalan", "nl": "dutch", "ar": "arabic", "sv": "swedish",
#"it": "italian", "id": "indonesian", "hi": "hindi", "fi": "finnish", "vi": "vietnamese",
#"iw": "hebrew", "uk": "ukrainian", "el": "greek", "ms": "malay", "cs": "czech",
#"ro": "romanian", "da": "danish", "hu": "hungarian", "ta": "tamil", "no": "norwegian",
#"th": "thai", "ur": "urdu", "hr": "croatian", "bg": "bulgarian", "lt": "lithuanian",
#"la": "latin", "mi": "maori", "ml": "malayalam", "cy": "welsh", "sk": "slovak",
#"te": "telugu", "fa": "persian", "lv": "latvian", "bn": "bengali", "sr": "serbian",
#"az": "azerbaijani", "sl": "slovenian", "kn": "kannada", "et": "estonian",
#"mk": "macedonian", "br": "breton", "eu": "basque", "is": "icelandic", "hy": "armenian",
#"ne": "nepali", "mn": "mongolian", "bs": "bosnian", "kk": "kazakh", "sq": "albanian",
#"sw": "swahili", "gl": "galician", "mr": "marathi", "pa": "punjabi", "si": "sinhala",
#"km": "khmer", "sn": "shona", "yo": "yoruba", "so": "somali", "af": "afrikaans",
#"oc": "occitan", "ka": "georgian", "be": "belarusian", "tg": "tajik", "sd": "sindhi",
#"gu": "gujarati", "am": "amharic", "yi": "yiddish", "lo": "lao", "uz": "uzbek",
#"fo": "faroese", "ht": "haitian creole", "ps": "pashto", "tk": "turkmen", "nn": "nynorsk",
#"mt": "maltese", "sa": "sanskrit", "lb": "luxembourgish", "my": "myanmar", "bo": "tibetan",
#"tl": "tagalog", "mg": "malagasy", "as": "assamese", "tt": "tatar", "haw": "hawaiian",
#"ln": "lingala", "ha": "hausa", "ba": "bashkir", "jw": "javanese", "su": "sundanese",
#Model 3: Text-to-Speech (Coqui TTS)
# Language codes advertised by the Coqui plugin; in this file they are only logged below.
LANGUAGES = list(CoquiTTS.langs.keys())
# Single shared synthesiser instance used by tts().
coquiTTS = CoquiTTS()
print(f"Languages for Coqui are: {LANGUAGES}")
#Languages for Coqui are: ['en', 'es', 'fr', 'de', 'pl', 'uk', 'ro', 'hu', 'el', 'bg', 'nl', 'fi', 'sl', 'lv', 'ga']
# en - English, es - Spanish, fr - French, de - German, pl - Polish
# uk - Ukrainian, ro - Romanian, hu - Hungarian, el - Greek, bg - Bulgarian,
# nl - Dutch, fi - Finnish, sl - Slovenian, lv - Latvian, ga - Irish
# Driver function
def driver_fun(audio):
    """Turn a recorded voice clip into a transcript, a joke and spoken audio.

    Args:
        audio: Path of the recorded audio file, passed straight to
            ``whisper_stt``.

    Returns:
        A ``(translation, joke, speech)`` tuple: the English text returned by
        ``whisper_stt``, the selected joke, and the path of the WAV file
        returned by ``tts``.

    Raises:
        RuntimeError: If the similarity API does not return exactly one score
            per candidate joke (for example an error payload while the hosted
            model is unavailable).
    """
    translation, _lang = whisper_stt(audio)

    # Score only a random contiguous window of the corpus so that the request
    # sent to the hosted similarity model stays small.
    window = 5000
    total = len(dataset)
    lower_limit = random.randrange(max(1, total - window + 1))
    upper_limit = lower_limit + window
    print(f"lower_limit : upper_limit = {lower_limit} : {upper_limit}")

    # Slice the rows first and then pick the column, so only the window is
    # materialised instead of the whole 'Joke' column.
    dataset_subset = dataset[lower_limit:upper_limit]['Joke']

    # NOTE(review): the query sentence is fixed, so the transcript does not
    # influence which joke is picked - confirm that this is intended.
    data = query({"inputs": {"source_sentence": "That is a happy person",
                             "sentences": dataset_subset}})

    # A successful reply is one similarity score per candidate sentence;
    # anything else cannot be mapped back onto the jokes.
    if not isinstance(data, list) or not data or len(data) != len(dataset_subset):
        raise RuntimeError(
            f"Unexpected response from the sentence-similarity API: {data!r}"
        )

    # The best joke is the candidate at the position of the highest score.
    # The score itself is a number and must not be indexed.
    indx_score = max(range(len(data)), key=data.__getitem__)
    joke = dataset_subset[indx_score]

    speech = tts(joke, 'en')
    return translation, joke, speech
# Whisper - speech-to-text
def whisper_stt(audio):
    """Run Whisper on an audio file and return ``(english_text, detected_language)``.

    The clip is padded or trimmed to a 30-second window, the spoken language
    is detected from its log-Mel spectrogram, and the audio is decoded with
    the ``translate`` task using the module-level ``model``.
    """
    print("Inside Whisper TTS")

    # Load the waveform and fit it to the 30-second window in one step.
    waveform = whisper.pad_or_trim(whisper.load_audio(audio))

    # The spectrogram has to live on the same device as the model.
    mel = whisper.log_mel_spectrogram(waveform).to(model.device)

    # Language detection: keep the code with the highest probability.
    _, probs = model.detect_language(mel)
    lang = max(probs, key=probs.get)
    print(f"Detected language: {lang}")

    # Decode with the translate task.
    options_transl = whisper.DecodingOptions(task='translate', language='en', fp16=False)
    result_transl = whisper.decode(model, mel, options_transl)

    print(f"translation is : {result_transl.text}")
    return result_transl.text, lang
# Coqui - Text-to-Speech
def tts(text, language):
    """Synthesise ``text`` with Coqui into a temporary WAV file and return its path.

    ``language`` falls back to ``'en'`` when it is not one of the codes
    listed below.
    """
    print(f"Inside tts - language is : {language}")

    supported = ('en', 'es', 'fr', 'de', 'pl', 'uk', 'ro', 'hu', 'bg', 'nl', 'fi', 'sl', 'lv', 'ga')
    speaker_lang = language if language in supported else 'en'

    # delete=False keeps the file on disk after the handle is closed, so the
    # caller can use it by path.
    wav_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    with wav_file:
        coquiTTS.get_tts(text, wav_file, speaker={"language": speaker_lang})
    return wav_file.name
# Gradio front end: record audio on the left, show the joke and its spoken
# version on the right.
demo = gr.Blocks()
with demo:
    gr.Markdown("<h1><center>AI Assistant - Voice to Joke</center></h1>")
    gr.Markdown(
        """Model pipeline consisting of - <br>- [**Whisper**](https://github.com/openai/whisper) for Speech-to-text, <br>- [**CoquiTTS**](https://huggingface.co/coqui) for Text-To-Speech. <br>- Front end is built using [**Gradio Block API**](https://gradio.app/docs/#blocks).<br><br>Both CoquiTTS and Whisper are Multilingual, there are several overlapping languages between them. Hence it would be suggested to test this ML-App using these two languages to get the best results.<br>If you want to reuse the App, simply click on the small cross button in the top right corner of your voice record panel, and then press record again!
""")
    with gr.Row():
        # Left column: microphone input, trigger button and the transcript.
        with gr.Column():
            in_audio = gr.Audio(source="microphone", type="filepath", label='Record your voice command here in English -')
            b1 = gr.Button("AI Response")
            out_transcript = gr.Textbox(label= 'Transcript of your Audio using OpenAI Whisper')
        # Right column: synthesised audio and the joke text.
        with gr.Column():
            out_audio = gr.Audio(label='Audio response from CoquiTTS')
            out_generated_joke = gr.Textbox(label= 'Joke returned! ')
    # The output order must match driver_fun's return tuple:
    # (translation, joke, speech).
    b1.click(driver_fun, inputs=[in_audio], outputs=[out_transcript, out_generated_joke, out_audio])
demo.launch(enable_queue=True, debug=True)