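# Gradio Space "Running on Zero": a speech-to-speech tutor demo.
# Pipeline: Whisper ASR -> Vicuna-7B chat response -> ESPnet VITS text-to-speech.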
import spaces
import os
# Set via os.environ (rather than os.putenv) so Python-level readers such as
# huggingface_hub and safetensors see these values, not just child processes.
os.environ['PYTORCH_NVML_BASED_CUDA_CHECK'] = '1'
os.environ['TORCH_LINALG_PREFER_CUSOLVER'] = '1'
alloc_conf_parts = [
    'expandable_segments:True',
    'pinned_use_background_threads:True',  # Specific to pinned memory.
]
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = ','.join(alloc_conf_parts)
os.environ["SAFETENSORS_FAST_GPU"] = "1"
os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '1'
import torch
import gradio as gr
from transformers import pipeline
from transformers import LlamaTokenizer, LlamaForCausalLM
#from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import numpy as np
#import yaml
#import os
import requests
import nltk
import scipy.io.wavfile
import subprocess
# subprocess.run(['bash','esp.sh'])
from espnet2.bin.tts_inference import Text2Speech
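# Make sure the NLTK resources used by the English G2P frontend for TTS are available.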
try:
    nltk.data.find('taggers/averaged_perceptron_tagger_eng')
except LookupError:
    nltk.download('averaged_perceptron_tagger_eng')
try:
    nltk.data.find('corpora/cmudict')  # Check for cmudict
except LookupError:
    nltk.download('cmudict')
ASR_MODEL_NAME = "openai/whisper-medium.en"
asr_pipe = pipeline(
    task="automatic-speech-recognition",
    model=ASR_MODEL_NAME,
    chunk_length_s=30,
    device='cuda' if torch.cuda.is_available() else 'cpu',  # Use GPU if available
)
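# Token ids taken from Whisper's special-token list, used below to force the decoding task at inference time.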
all_special_ids = asr_pipe.tokenizer.all_special_ids
transcribe_token_id = all_special_ids[-5]
translate_token_id = all_special_ids[-6]
def _preload_and_load_models():
    global vicuna_tokenizer, vicuna_model
    #VICUNA_MODEL_NAME = "EleutherAI/gpt-neo-2.7B"  # Or another model
    #VICUNA_MODEL_NAME = "lmsys/vicuna-13b-v1.5"    # Or another model
    VICUNA_MODEL_NAME = "lmsys/vicuna-7b-v1.5"
    #VICUNA_MODEL_NAME = "Jiayi-Pan/Tiny-Vicuna-1B" # Or another model
    vicuna_tokenizer = LlamaTokenizer.from_pretrained(VICUNA_MODEL_NAME)
    vicuna_model = LlamaForCausalLM.from_pretrained(
        VICUNA_MODEL_NAME,
        #torch_dtype=torch.float16,
        device_map="cuda",
    ).to(torch.device('cuda'), torch.bfloat16)  # Explicitly move to CUDA in bfloat16 after loading
_preload_and_load_models()
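# ESPnet2 VITS text-to-speech model trained on LJSpeech, loaded directly onto the GPU.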
tts = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_vits",device='cuda')
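# On ZeroGPU Spaces, the @spaces.GPU decorator attaches a GPU to the worker for the duration of each call.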
@spaces.GPU(required=True)
def process_audio(microphone, audio_upload, state, answer_mode):  # Added audio_upload
    audio_source = None
    if microphone:
        audio_source = microphone
        asr_pipe.model.config.forced_decoder_ids = [[2, transcribe_token_id]]
        text = asr_pipe(audio_source)["text"]
    elif audio_upload:
        audio_source = audio_upload
        rate, data = scipy.io.wavfile.read(audio_source)
        # Convert to float32 in [-1, 1] and pass the sample rate so the pipeline can resample for Whisper.
        if data.dtype == np.int16:
            data = data.astype(np.float32) / np.iinfo(np.int16).max
        if data.ndim > 1:
            data = data.mean(axis=1)  # Downmix stereo to mono.
        asr_pipe.model.config.forced_decoder_ids = [[2, transcribe_token_id]]
        text = asr_pipe({"raw": data, "sampling_rate": rate})["text"]
    else:
        return state, state, None  # No audio input
    system_prompt = """You are a friendly and enthusiastic tutor for young children (ages 6-9).
You answer questions clearly and simply, using age-appropriate language.
You are also a little bit silly and like to make jokes."""
    prompt = f"{system_prompt}\nUser: {text}"
    #with torch.no_grad():
    #with torch.inference_mode():
    vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to('cuda')
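    # The three answer modes trade numerical precision and reply length for speed:
    # 'slow' disables TF32/reduced-precision matmuls and generates the longest reply,
    # 'medium' allows TF32, and 'fast' also allows reduced-precision reductions with a short reply.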
    if answer_mode == 'slow':
        torch.backends.cuda.matmul.allow_tf32 = False
        torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
        torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
        torch.backends.cudnn.allow_tf32 = False
        torch.backends.cudnn.deterministic = False
        torch.backends.cudnn.benchmark = True
        torch.set_float32_matmul_precision("highest")
        with torch.no_grad():
            vicuna_output = vicuna_model.generate(
                **vicuna_input,
                max_length=512,
                min_new_tokens=256,
                do_sample=True,
                low_memory=False
            )
    elif answer_mode == 'medium':
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
        torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
        torch.backends.cudnn.allow_tf32 = True
        torch.backends.cudnn.deterministic = False
        torch.backends.cudnn.benchmark = False
        torch.set_float32_matmul_precision("high")
        with torch.no_grad():
            vicuna_output = vicuna_model.generate(
                **vicuna_input,
                max_length=192,
                min_new_tokens=64,
                do_sample=True,
                low_memory=False
            )
    elif answer_mode == 'fast':
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = True
        torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
        torch.backends.cudnn.allow_tf32 = True
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        # torch.backends.cuda.preferred_blas_library = "cublas"
        # torch.backends.cuda.preferred_linalg_library = "cusolver"
        torch.set_float32_matmul_precision("medium")
        #with torch.no_grad():
        with torch.inference_mode():
            vicuna_output = vicuna_model.generate(
                **vicuna_input,
                max_new_tokens=96,
                min_new_tokens=42,
                do_sample=True,
                low_memory=True
            )
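    # Strip the prompt from the decoded output so only the tutor's reply is shown and synthesized.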
    vicuna_response = vicuna_tokenizer.decode(vicuna_output[0], skip_special_tokens=True)
    vicuna_response = vicuna_response.replace(prompt, "").strip()
    updated_state = state + "\nUser: " + text + "\n" + "Tutor: " + vicuna_response
    try:
        with torch.inference_mode():
            output = tts(vicuna_response)
            wav = output["wav"]
            sr = tts.fs
        audio_arr = wav.cpu().numpy()
        SAMPLE_RATE = sr
        audio_arr = audio_arr / np.abs(audio_arr).max()
        audio_output = (SAMPLE_RATE, audio_arr)
        #sf.write('generated_audio.wav', audio_arr, SAMPLE_RATE)  # Removed writing to file
    except requests.exceptions.RequestException as e:
        print(f"Error in Hugging Face API request: {e}")
        audio_output = None
    except Exception as e:
        print(f"Error in speech synthesis: {e}")
        audio_output = None
    return updated_state, updated_state, audio_output
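# Gradio UI: speak or upload audio, see the transcription plus the tutor's reply, and hear the synthesized answer.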
with gr.Blocks(title="Whisper, Vicuna, & TTS Demo") as demo:  # Updated title
    gr.Markdown("# Speech-to-Text-to-Speech Demo with Vicuna and Hugging Face TTS")
    gr.Markdown("Speak into your microphone, get a transcription, Vicuna will process it, and then you'll hear the result!")
    with gr.Tab("Transcribe & Synthesize"):
        with gr.Row():  # Added a row for better layout
            mic_input = gr.Audio(sources="microphone", type="filepath", label="Speak Here", elem_id="mic_audio")
            audio_upload = gr.Audio(sources="upload", type="filepath", label="Or Upload Audio File")  # Added upload component
        transcription_output = gr.Textbox(lines=5, label="Transcription and Vicuna Response")
        audio_output = gr.Audio(label="Synthesized Speech", type="numpy", autoplay=True)
        answer_mode = gr.Radio(["fast", "medium", "slow"], value='medium', label="Answer Mode")
        transcription_state = gr.State(value="")
        mic_input.change(
            fn=process_audio,
            inputs=[mic_input, audio_upload, transcription_state, answer_mode],  # Include audio_upload
            outputs=[transcription_output, transcription_state, audio_output]
        )
        audio_upload.change(  # Added change event for upload
            fn=process_audio,
            inputs=[mic_input, audio_upload, transcription_state, answer_mode],  # Include audio_upload
            outputs=[transcription_output, transcription_state, audio_output],
            api_name='/api/predict'
        )
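# Minimal client-side usage sketch (hypothetical file name and Space URL; assumes gradio_client is installed):
#   from gradio_client import Client
#   client = Client("<your-space-url>")
#   text, state, audio = client.predict("question.wav", None, "", "fast", api_name="/api/predict")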
if __name__ == '__main__':
    demo.launch(share=False)