Add automatic speech recognition (ASR) sample
Files changed:
- app.py +12 -1
- automatic_speech_recognition.py +14 -0
- requirements.txt +7 -4
- text_to_speech.py +3 -1
- utils.py +21 -2
app.py
CHANGED

@@ -2,6 +2,7 @@ from dotenv import load_dotenv
 from functools import partial
 import gradio as gr
 from huggingface_hub import InferenceClient
+from automatic_speech_recognition import automatic_speech_recognition
 from image_classification import image_classification
 from image_to_text import image_to_text
 from text_to_image import text_to_image

@@ -64,7 +65,7 @@ class App:
                     outputs=image_classification_output
                 )
             with gr.Tab("Text-to-speech (TTS)"):
-                gr.Markdown("Generate speech from
+                gr.Markdown("Generate speech from text.")
                 text_to_speech_text = gr.Textbox(label="Text")
                 text_to_speech_generate_button = gr.Button("Generate")
                 text_to_speech_output = gr.Audio(label="Speech")

@@ -73,6 +74,16 @@ class App:
                     inputs=text_to_speech_text,
                     outputs=text_to_speech_output
                 )
+            with gr.Tab("Audio Transcription or Automatic Speech Recognition (ASR)"):
+                gr.Markdown("Transcribe audio to text.")
+                audio_transcription_audio_input = gr.Audio(label="Audio")
+                audio_transcription_generate_button = gr.Button("Transcribe")
+                audio_transcription_output = gr.Textbox(label="Text")
+                audio_transcription_generate_button.click(
+                    fn=automatic_speech_recognition,
+                    inputs=audio_transcription_audio_input,
+                    outputs=audio_transcription_output
+                )

         demo.launch()
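For reference, the new tab's wiring reduces to the pattern below. This is a minimal standalone sketch, assuming the default gr.Audio behavior (the callback receives a (sample_rate, numpy array) tuple); it is not a copy of the Space's App class.

import gradio as gr

from automatic_speech_recognition import automatic_speech_recognition

with gr.Blocks() as demo:
    # gr.Audio with default settings hands the callback a (sample_rate, np.ndarray) tuple.
    audio_in = gr.Audio(label="Audio")
    transcribe = gr.Button("Transcribe")
    text_out = gr.Textbox(label="Text")
    transcribe.click(fn=automatic_speech_recognition, inputs=audio_in, outputs=text_out)

demo.launch()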
automatic_speech_recognition.py
ADDED

@@ -0,0 +1,14 @@
+import gc
+from os import getenv
+from transformers import pipeline
+from utils import spaces_gpu, resample_audio
+
+
+@spaces_gpu
+def automatic_speech_recognition(audio: tuple[int, bytes]) -> str:
+    asr = pipeline(task="automatic-speech-recognition", model=getenv("AUDIO_TRANSCRIPTION_MODEL"))
+    audio_array = resample_audio(asr.feature_extractor.sampling_rate, audio)
+    result = asr(audio_array)
+    del asr
+    gc.collect()
+    return result["text"]
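One way to exercise the new function outside Gradio is to feed it a decoded file. The snippet below is a local smoke test, not part of the commit; the model id and file name are placeholders, since the Space's real AUDIO_TRANSCRIPTION_MODEL value is not shown in this diff.

# Not part of the commit: a local smoke test, assuming a Whisper-style checkpoint.
import os
import soundfile as sf  # already in requirements.txt

os.environ.setdefault("AUDIO_TRANSCRIPTION_MODEL", "openai/whisper-small")  # placeholder model id

from automatic_speech_recognition import automatic_speech_recognition

# "sample.wav" is a placeholder path; a mono clip is assumed.
data, sample_rate = sf.read("sample.wav", dtype="float32")
print(automatic_speech_recognition((sample_rate, data)))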
requirements.txt
CHANGED

@@ -1,10 +1,13 @@
 gradio>=5.49.1
 huggingface-hub>=0.34.0,<1.0
-
+inflect>=7.0.0
+librosa>=0.10.0
+numpy>=1.24.0
 pandas>=2.0.0
+phonemizer>=3.0.0
 pillow>=10.0.0
+python-dotenv>=1.0.0
 requests>=2.31.0
-
+soundfile>=0.12.0
 timm>=1.0.0
-
-phonemizer>=3.0.0
+transformers>=4.40.0
text_to_speech.py
CHANGED

@@ -1,10 +1,12 @@
 import gc
+from os import getenv
 from transformers import pipeline
 from utils import spaces_gpu
 
+
 @spaces_gpu
 def text_to_speech(text: str) -> tuple[int, bytes]:
-    narrator = pipeline("text-to-speech", "
+    narrator = pipeline("text-to-speech", getenv("TEXT_TO_SPEECH_MODEL"))
     del narrator
     gc.collect()
     result = narrator(text)
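Both pipelines now resolve their checkpoints from environment variables, which app.py loads via python-dotenv (added to requirements.txt above). A hypothetical .env for local runs could look like the following; the model ids are placeholders, since the Space's actual values are not part of this diff.

# .env (illustrative only; the Space's real model ids are not shown in this commit)
TEXT_TO_SPEECH_MODEL=suno/bark-small
AUDIO_TRANSCRIPTION_MODEL=openai/whisper-small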
utils.py
CHANGED

@@ -5,12 +5,14 @@ from os import getenv
 import requests
 from tempfile import NamedTemporaryFile
 import torch
+import numpy as np
+import soundfile as sf
+import librosa
 
 
 # Try to import spaces decorator (for Hugging Face Spaces), otherwise use no-op decorator.
 try:
-    import spaces
-    spaces_gpu = spaces.GPU
+    from spaces import GPU as spaces_gpu
 except ImportError:
     # For local development, use a no-op decorator because spaces is not available.
     def spaces_gpu(func):

@@ -42,3 +44,20 @@ def save_image_to_temp_file(image: Image) -> str:
     temp_file.close()
     image.save(temp_path, format=image_format)
     return temp_path
+
+def resample_audio(target_sample_rate: int, audio: tuple[int, bytes | np.ndarray]) -> np.ndarray:
+    sample_rate, audio_data = audio
+
+    # Convert audio data to a numpy array if it's bytes.
+    if isinstance(audio_data, bytes):
+        audio_array = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0
+    elif isinstance(audio_data, np.ndarray):
+        audio_array = audio_data.astype(np.float32)
+    else:
+        raise ValueError(f"Unsupported audio_data type: {type(audio_data)}")
+
+    # Resample if sample rates don't match.
+    if sample_rate != target_sample_rate:
+        audio_array = librosa.resample(audio_array, orig_sr=sample_rate, target_sr=target_sample_rate)
+
+    return audio_array
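As a sanity check of the helper's behavior, here is a small sketch (not from the repo), assuming the usual gr.Audio payload of int16 samples at 44.1 kHz and a 16 kHz target rate. Note that only the bytes path rescales int16 values into [-1, 1]; ndarray input is merely cast to float32.

import numpy as np

from utils import resample_audio

# One second of silence shaped like Gradio's default microphone output: (sample_rate, int16 array).
clip = np.zeros(44100, dtype=np.int16)
resampled = resample_audio(16000, (44100, clip))
print(resampled.shape)  # roughly (16000,) float32 samples, ready for the ASR feature extractor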