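# app.py: Gradio Space that chains Whisper ASR -> Vicuna -> ESPnet VITS TTS.
# Spoken questions are transcribed, answered in a child-friendly tutor persona,
# and read back as synthesized speech.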
import spaces
import os
# Configure allocator and CUDA behaviour via os.environ (os.putenv would not update
# os.environ, so Python-side readers such as huggingface_hub would miss the values).
os.environ['PYTORCH_NVML_BASED_CUDA_CHECK'] = '1'
os.environ['TORCH_LINALG_PREFER_CUSOLVER'] = '1'
alloc_conf_parts = [
    'expandable_segments:True',
    'pinned_use_background_threads:True',  # Specific to pinned memory.
]
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = ','.join(alloc_conf_parts)
os.environ["SAFETENSORS_FAST_GPU"] = "1"
os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '1'
import torch
import gradio as gr
from transformers import pipeline
from transformers import LlamaTokenizer, LlamaForCausalLM
#from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import numpy as np
#import yaml
#import os
import requests
import nltk
import scipy.io.wavfile
import subprocess
# subprocess.run(['bash','esp.sh'])
from espnet2.bin.tts_inference import Text2Speech
try:
    nltk.data.find('taggers/averaged_perceptron_tagger_eng')
except LookupError:
    nltk.download('averaged_perceptron_tagger_eng')
try:
    nltk.data.find('corpora/cmudict')  # Check for cmudict
except LookupError:
    nltk.download('cmudict')
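# These NLTK resources back the grapheme-to-phoneme (g2p_en) frontend used by the
# ESPnet English TTS model loaded below.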
ASR_MODEL_NAME = "openai/whisper-medium.en"
asr_pipe = pipeline(
    task="automatic-speech-recognition",
    model=ASR_MODEL_NAME,
    chunk_length_s=30,
    device='cuda' if torch.cuda.is_available() else 'cpu',  # Use GPU if available
)
all_special_ids = asr_pipe.tokenizer.all_special_ids
transcribe_token_id = all_special_ids[-5]
translate_token_id = all_special_ids[-6]
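# Note: indexing all_special_ids from the end is a convention carried over from the
# multilingual Whisper demos; English-only checkpoints such as whisper-medium.en do
# not strictly need a forced task token, so this is kept mainly for compatibility.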
def _preload_and_load_models():
    global vicuna_tokenizer, vicuna_model
    #VICUNA_MODEL_NAME = "EleutherAI/gpt-neo-2.7B"  # Or another model
    #VICUNA_MODEL_NAME = "lmsys/vicuna-13b-v1.5"  # Or another model
    VICUNA_MODEL_NAME = "lmsys/vicuna-7b-v1.5"  # Or another model
    #VICUNA_MODEL_NAME = "Jiayi-Pan/Tiny-Vicuna-1B"  # Or another model
    vicuna_tokenizer = LlamaTokenizer.from_pretrained(VICUNA_MODEL_NAME)
    vicuna_model = LlamaForCausalLM.from_pretrained(
        VICUNA_MODEL_NAME,
        #torch_dtype=torch.float16,
        device_map="cuda",  # or .to('cuda')
    ).to(torch.device('cuda'), torch.bfloat16)  # Explicitly move to CUDA in bfloat16 after loading
_preload_and_load_models()
tts = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_vits",device='cuda')
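# The LJSpeech VITS model synthesizes 22.05 kHz audio; the exact rate is read back
# from tts.fs when the output tuple is built in process_audio.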
@spaces.GPU(required=True)
def process_audio(microphone, audio_upload, state, answer_mode):  # Added audio_upload
    audio_source = None
    if microphone:
        audio_source = microphone
        asr_pipe.model.config.forced_decoder_ids = [[2, transcribe_token_id]]
        text = asr_pipe(audio_source)["text"]
    elif audio_upload:
        audio_source = audio_upload
        asr_pipe.model.config.forced_decoder_ids = [[2, transcribe_token_id]]
        # Pass the file path directly, as in the microphone branch; the pipeline handles
        # decoding and resampling. Feeding the raw int16 array from scipy.io.wavfile.read
        # without its sampling rate would be misinterpreted as 16 kHz float samples.
        text = asr_pipe(audio_source)["text"]
    else:
        return state, state, None  # No audio input
system_prompt = """You are a friendly and enthusiastic tutor for young children (ages 6-9).
You answer questions clearly and simply, using age-appropriate language.
You are also a little bit silly and like to make jokes."""
prompt = f"{system_prompt}\nUser: {text}"
#with torch.no_grad():
#with torch.inference_mode():
vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to('cuda')
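    # The three answer modes trade numeric precision and reply length for speed:
    # 'slow' disables TF32/reduced-precision matmul and requests the longest reply,
    # 'medium' allows TF32, and 'fast' additionally permits reduced-precision
    # reductions and runs under torch.inference_mode with the smallest token budget.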
    if answer_mode == 'slow':
        torch.backends.cuda.matmul.allow_tf32 = False
        torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
        torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
        torch.backends.cudnn.allow_tf32 = False
        torch.backends.cudnn.deterministic = False
        torch.backends.cudnn.benchmark = True
        torch.set_float32_matmul_precision("highest")
        with torch.no_grad():
            vicuna_output = vicuna_model.generate(
                **vicuna_input,
                max_length=512,
                min_new_tokens=256,
                do_sample=True,
                low_memory=False
            )
    if answer_mode == 'medium':
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
        torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
        torch.backends.cudnn.allow_tf32 = True
        torch.backends.cudnn.deterministic = False
        torch.backends.cudnn.benchmark = False
        torch.set_float32_matmul_precision("high")
        with torch.no_grad():
            vicuna_output = vicuna_model.generate(
                **vicuna_input,
                max_length=192,
                min_new_tokens=64,
                do_sample=True,
                low_memory=False
            )
    if answer_mode == 'fast':
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = True
        torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
        torch.backends.cudnn.allow_tf32 = True
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        # torch.backends.cuda.preferred_blas_library="cublas"
        # torch.backends.cuda.preferred_linalg_library="cusolver"
        torch.set_float32_matmul_precision("medium")
        #with torch.no_grad():
        with torch.inference_mode():
            vicuna_output = vicuna_model.generate(
                **vicuna_input,
                max_new_tokens=96,
                min_new_tokens=42,
                do_sample=True,
                low_memory=True
            )
    vicuna_response = vicuna_tokenizer.decode(vicuna_output[0], skip_special_tokens=True)
    vicuna_response = vicuna_response.replace(prompt, "").strip()
    updated_state = state + "\nUser: " + text + "\n" + "Tutor: " + vicuna_response
    try:
        with torch.inference_mode():
            output = tts(vicuna_response)
        wav = output["wav"]
        sr = tts.fs
        audio_arr = wav.cpu().numpy()
        SAMPLE_RATE = sr
        # Peak-normalize, guarding against an all-zero waveform to avoid division by zero.
        peak = np.abs(audio_arr).max()
        if peak > 0:
            audio_arr = audio_arr / peak
        audio_output = (SAMPLE_RATE, audio_arr)
        #sf.write('generated_audio.wav', audio_arr, SAMPLE_RATE)  # Removed writing to file
    except requests.exceptions.RequestException as e:
        print(f"Error in Hugging Face API request: {e}")
        audio_output = None
    except Exception as e:
        print(f"Error in speech synthesis: {e}")
        audio_output = None
    return updated_state, updated_state, audio_output
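# Gradio UI: gr.Audio(type="numpy") consumes the (sample_rate, waveform) tuple returned
# by process_audio; both the microphone and upload inputs route through the same handler.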
with gr.Blocks(title="Whisper, Vicuna, & TTS Demo") as demo:  # Updated title
    gr.Markdown("# Speech-to-Text-to-Speech Demo with Vicuna and Hugging Face TTS")
    gr.Markdown("Speak into your microphone, get a transcription, Vicuna will process it, and then you'll hear the result!")
    with gr.Tab("Transcribe & Synthesize"):
        with gr.Row():  # Added a row for better layout
            mic_input = gr.Audio(sources="microphone", type="filepath", label="Speak Here", elem_id="mic_audio")
            audio_upload = gr.Audio(sources="upload", type="filepath", label="Or Upload Audio File")  # Added upload component
        transcription_output = gr.Textbox(lines=5, label="Transcription and Vicuna Response")
        audio_output = gr.Audio(label="Synthesized Speech", type="numpy", autoplay=True)
        answer_mode = gr.Radio(["fast", "medium", "slow"], value='medium')
        transcription_state = gr.State(value="")
        mic_input.change(
            fn=process_audio,
            inputs=[mic_input, audio_upload, transcription_state, answer_mode],  # Include audio_upload
            outputs=[transcription_output, transcription_state, audio_output]
        )
        audio_upload.change(  # Added change event for upload
            fn=process_audio,
            inputs=[mic_input, audio_upload, transcription_state, answer_mode],  # Include audio_upload
            outputs=[transcription_output, transcription_state, audio_output],
            api_name='/api/predict'
        )
if __name__ == '__main__':
    demo.launch(share=False)