import spaces
import os

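# CUDA allocator and runtime tuning; set these before importing torch so they
# are picked up when the CUDA allocator initializes.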
os.environ['PYTORCH_NVML_BASED_CUDA_CHECK'] = '1'
os.environ['TORCH_LINALG_PREFER_CUSOLVER'] = '1'
alloc_conf_parts = [
    'expandable_segments:True',
    'pinned_use_background_threads:True'  # Specific to pinned memory.
]
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = ','.join(alloc_conf_parts)
os.environ["SAFETENSORS_FAST_GPU"] = "1"
os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '1'

import torch
import gradio as gr
from transformers import pipeline
from transformers import LlamaTokenizer, LlamaForCausalLM
#from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import numpy as np
#import yaml
#import os
import requests
import nltk
import scipy.io.wavfile
import subprocess
# subprocess.run(['bash','esp.sh'])

from espnet2.bin.tts_inference import Text2Speech

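# The g2p_en phonemizer used by the ESPnet text frontend needs these NLTK
# resources at runtime; download them once if they are missing.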
try:
    nltk.data.find('taggers/averaged_perceptron_tagger_eng')
except LookupError:
    nltk.download('averaged_perceptron_tagger_eng')
try:
    nltk.data.find('corpora/cmudict')  # Check for cmudict
except LookupError:
    nltk.download('cmudict')

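# Whisper ASR pipeline used to transcribe microphone or uploaded audio.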
ASR_MODEL_NAME = "openai/whisper-medium.en"
asr_pipe = pipeline(
    task="automatic-speech-recognition",
    model=ASR_MODEL_NAME,
    chunk_length_s=30,
    device='cuda' if torch.cuda.is_available() else 'cpu', # Use GPU if available
)

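# Whisper exposes its task tokens (transcribe/translate) among the tokenizer's
# special IDs; index from the end of the list to pick them out.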
all_special_ids = asr_pipe.tokenizer.all_special_ids
transcribe_token_id = all_special_ids[-5]
translate_token_id = all_special_ids[-6]

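# Load the Vicuna chat model once at startup so individual requests don't pay the load cost.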
def _preload_and_load_models():
    global vicuna_tokenizer, vicuna_model
    #VICUNA_MODEL_NAME = "EleutherAI/gpt-neo-2.7B"  # Or another model
    #VICUNA_MODEL_NAME = "lmsys/vicuna-13b-v1.5"  # Or another model
    VICUNA_MODEL_NAME = "lmsys/vicuna-7b-v1.5"  # Or another model
    #VICUNA_MODEL_NAME = "Jiayi-Pan/Tiny-Vicuna-1B"  # Or another model
    vicuna_tokenizer = LlamaTokenizer.from_pretrained(VICUNA_MODEL_NAME)
    vicuna_model = LlamaForCausalLM.from_pretrained(
        VICUNA_MODEL_NAME,
        #torch_dtype=torch.float16,
        device_map="cuda",
    ).to(torch.device('cuda'), torch.bfloat16)  # Move to CUDA and cast to bfloat16 after loading
    
_preload_and_load_models()

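# ESPnet VITS text-to-speech model (LJSpeech single-speaker voice), kept on the GPU.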
tts = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_vits", device='cuda')

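# On ZeroGPU Spaces, @spaces.GPU requests a GPU for the duration of each call.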
@spaces.GPU(required=True)
def process_audio(microphone, audio_upload, state, answer_mode):  # Added audio_upload
    audio_source = None
    if microphone:
        audio_source = microphone
        asr_pipe.model.config.forced_decoder_ids = [[2, transcribe_token_id]]
        text = asr_pipe(audio_source)["text"]
    elif audio_upload:
        audio_source = audio_upload
        rate, data = scipy.io.wavfile.read(audio_source)
        if data.ndim > 1:
            data = data.mean(axis=1)  # Down-mix multi-channel audio to mono
        data = data.astype(np.float32)
        data = data / max(np.abs(data).max(), 1e-8)  # Peak-normalize to [-1, 1]
        asr_pipe.model.config.forced_decoder_ids = [[2, transcribe_token_id]]
        text = asr_pipe({"raw": data, "sampling_rate": rate})["text"]
    else:
        return state, state, None  # No audio input
    system_prompt = """You are a friendly and enthusiastic tutor for young children (ages 6-9).
        You answer questions clearly and simply, using age-appropriate language.
        You are also a little bit silly and like to make jokes."""
    prompt = f"{system_prompt}\nUser: {text}"
    #with torch.no_grad():
    #with torch.inference_mode():
    vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to('cuda')
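    # The three answer modes trade response length and matmul/cuDNN precision
    # for latency: 'slow' favors quality, 'fast' favors speed.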
    if answer_mode == 'slow':
        torch.backends.cuda.matmul.allow_tf32 = False
        torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
        torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
        torch.backends.cudnn.allow_tf32 = False
        torch.backends.cudnn.deterministic = False
        torch.backends.cudnn.benchmark = True
        torch.set_float32_matmul_precision("highest")
        with torch.no_grad():
            vicuna_output = vicuna_model.generate(
                **vicuna_input,
                max_length=512,
                min_new_tokens=256,
                do_sample=True,
                low_memory=False
            )
    elif answer_mode == 'medium':
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
        torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
        torch.backends.cudnn.allow_tf32 = True
        torch.backends.cudnn.deterministic = False
        torch.backends.cudnn.benchmark = False
        torch.set_float32_matmul_precision("high")
        with torch.no_grad():
            vicuna_output = vicuna_model.generate(
                **vicuna_input,
                max_length=192,
                min_new_tokens=64,
                do_sample=True,
                low_memory=False
            )
    elif answer_mode == 'fast':
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = True
        torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
        torch.backends.cudnn.allow_tf32 = True
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        # torch.backends.cuda.preferred_blas_library="cublas"
        # torch.backends.cuda.preferred_linalg_library="cusolver"
        torch.set_float32_matmul_precision("medium")
        #with torch.no_grad():
        with torch.inference_mode():
            vicuna_output = vicuna_model.generate(
                **vicuna_input,
                max_new_tokens=96,
                min_new_tokens=42,
                do_sample=True,
                low_memory=True
            )
    # Decode only the newly generated tokens so the prompt is not echoed back.
    new_tokens = vicuna_output[0][vicuna_input['input_ids'].shape[1]:]
    vicuna_response = vicuna_tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
    updated_state = state + "\nUser: " + text + "\n" + "Tutor: " + vicuna_response
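    # Synthesize the tutor's reply with the local ESPnet model; fall back to
    # text-only output if synthesis fails.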
    try:
        with torch.inference_mode():
            output = tts(vicuna_response)
        wav = output["wav"]
        sr = tts.fs
        audio_arr = wav.cpu().numpy()
        audio_arr = audio_arr / max(np.abs(audio_arr).max(), 1e-8)  # Peak-normalize to [-1, 1]
        audio_output = (sr, audio_arr)
        #sf.write('generated_audio.wav', audio_arr, SAMPLE_RATE) # Removed writing to file
    except requests.exceptions.RequestException as e:
        print(f"Error in Hugging Face API request: {e}")
        audio_output = None
    except Exception as e:
        print(f"Error in speech synthesis: {e}")
        audio_output = None
    return updated_state, updated_state, audio_output

with gr.Blocks(title="Whisper, Vicuna, & TTS Demo") as demo:  # Updated title
    gr.Markdown("# Speech-to-Text-to-Speech Demo with Vicuna and ESPnet TTS")
    gr.Markdown("Speak into your microphone, get a transcription, Vicuna will process it, and then you'll hear the result!")
    with gr.Tab("Transcribe & Synthesize"):
        with gr.Row(): # Added a row for better layout
            mic_input = gr.Audio(sources="microphone", type="filepath", label="Speak Here", elem_id="mic_audio")
            audio_upload = gr.Audio(sources="upload", type="filepath", label="Or Upload Audio File") # Added upload component
        transcription_output = gr.Textbox(lines=5, label="Transcription and Vicuna Response")
        audio_output = gr.Audio(label="Synthesized Speech", type="numpy", autoplay=True)
        answer_mode = gr.Radio(["fast", "medium", "slow"], value='medium', label="Answer Mode")
        transcription_state = gr.State(value="")
        mic_input.change(
            fn=process_audio,
            inputs=[mic_input, audio_upload, transcription_state, answer_mode], # Include audio_upload
            outputs=[transcription_output, transcription_state, audio_output]
        )
        audio_upload.change( # Added change event for upload
            fn=process_audio,
            inputs=[mic_input, audio_upload, transcription_state, answer_mode], # Include audio_upload
            outputs=[transcription_output, transcription_state, audio_output],
            api_name='predict'
        )
        
if __name__ == '__main__':
    demo.launch(share=False)