Spaces:

ankita-01
/

streaming-st_model

Runtime error

App Files Files Community

ankita-01 committed on Aug 23, 2024

Commit

32e6cc8

1 Parent(s): bb5c392

add app.py

Browse files

Files changed (1) hide show

app.py +69 -0

app.py ADDED Viewed

	@@ -0,0 +1,69 @@

+# import torch
+# import torchaudio
+import numpy as np
+from espnet2.bin.st_inference_streaming import Speech2TextStreaming
+import gradio as gr
+import soundfile as sf
+import librosa
+# Build the Speech2TextStreaming model once, at module import time.
+# NOTE(review): both paths below are absolute filesystem paths; they must exist
+# on whatever host runs this app — confirm for the deployment target.
+model = Speech2TextStreaming(
+    st_model_file="/data1/ankita/st1/exp/st_train_st_raw_en_de_bpe_de2000_sp/valid.acc.ave_10best.pth",  # path to your model weights
+    st_train_config="/data1/ankita/st1/exp/st_train_st_raw_en_de_bpe_de2000_sp/config.yaml",  # path to your config file
+    device="cuda",  # presumably switch to "cpu" when no GPU is available — verify
+    minlenratio=0.1,
+    maxlenratio=0.7,
+    beam_size=1  # presumably greedy decoding (beam width 1) — confirm
+)
+# Default threshold for is_silence: a chunk whose mean absolute amplitude is
+# below this value counts as silent. Adjust based on your audio levels.
+silence_threshold = 0.01
+# Seconds of accumulated silence after which transcribe issues a final decode.
+silence_duration = 1.0
+def is_silence(audio_chunk, sr, threshold=silence_threshold):
+    return np.mean(np.abs(audio_chunk)) < threshold
+def transcribe(state, new_chunk):
+    stream, silence_time = state
+    if new_chunk is None:
+        return (None, None), ""
+    sr, y = new_chunk
+    y = y.astype(np.float32)
+    if sr != 16000:
+        y = librosa.resample(y=y, orig_sr=sr, target_sr=16000)
+    y /= np.max(np.abs(y))
+    if stream is not None:
+        stream = np.concatenate([stream, y])
+    else:
+        stream = y
+        model(np.zeros(stream.shape), is_final=True)
+    if is_silence(y, sr):
+        silence_time += len(y) / sr
+    else:
+        silence_time = 0
+    if silence_time >= silence_duration:
+        output = model(stream, is_final=True)
+        return (None, 0), output[0][0] if output else ""
+    else:
+        output = model(stream)
+        return (stream, silence_time), output[0][0] if output else ""
+def clear_transcription():
+    return (None, 0), ""
+# Build the UI: streaming microphone input, an output text box and a Clear button.
+with gr.Blocks() as demo:
+    # State value passed to and returned by transcribe; starts as (None, 0),
+    # i.e. no accumulated audio and zero seconds of silence.
+    state = gr.State((None, 0))
+    audio = gr.Audio(sources=["microphone"], type="numpy", streaming=True)
+    text = gr.Textbox()
+    clear_button = gr.Button("Clear")
+    # Each streamed chunk calls transcribe, whose two return values update the
+    # state and the text box.
+    audio.stream(transcribe, inputs=[state, audio], outputs=[state, text])
+    # Clear takes no inputs and overwrites both the state and the text box.
+    clear_button.click(clear_transcription, inputs=[], outputs=[state, text])
+demo.launch()