LiKenun committed
Commit 02c9b64 · 1 Parent(s): caf2559

Add automatic speech recognition (ASR) sample
Files changed (5):
  1. app.py +12 -1
  2. automatic_speech_recognition.py +14 -0
  3. requirements.txt +7 -4
  4. text_to_speech.py +3 -1
  5. utils.py +21 -2
app.py CHANGED
@@ -2,6 +2,7 @@ from dotenv import load_dotenv
 from functools import partial
 import gradio as gr
 from huggingface_hub import InferenceClient
+from automatic_speech_recognition import automatic_speech_recognition
 from image_classification import image_classification
 from image_to_text import image_to_text
 from text_to_image import text_to_image
@@ -64,7 +65,7 @@ class App:
                     outputs=image_classification_output
                 )
             with gr.Tab("Text-to-speech (TTS)"):
-                gr.Markdown("Generate speech from a text.")
+                gr.Markdown("Generate speech from text.")
                 text_to_speech_text = gr.Textbox(label="Text")
                 text_to_speech_generate_button = gr.Button("Generate")
                 text_to_speech_output = gr.Audio(label="Speech")
@@ -73,6 +74,16 @@ class App:
                     inputs=text_to_speech_text,
                     outputs=text_to_speech_output
                 )
+            with gr.Tab("Audio Transcription or Automatic Speech Recognition (ASR)"):
+                gr.Markdown("Transcribe audio to text.")
+                audio_transcription_audio_input = gr.Audio(label="Audio")
+                audio_transcription_generate_button = gr.Button("Transcribe")
+                audio_transcription_output = gr.Textbox(label="Text")
+                audio_transcription_generate_button.click(
+                    fn=automatic_speech_recognition,
+                    inputs=audio_transcription_audio_input,
+                    outputs=audio_transcription_output
+                )
 
         demo.launch()
 
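For reference, Gradio's gr.Audio component in its default type="numpy" mode passes the click handler a (sample_rate, numpy_array) tuple, which is the shape the new ASR function unpacks. A minimal standalone sketch of that contract (illustrative only, not part of this commit):

import gradio as gr
import numpy as np

def inspect_audio(audio: tuple[int, np.ndarray]) -> str:
    # Gradio's numpy mode delivers (sample rate in Hz, samples as an ndarray).
    sample_rate, data = audio
    return f"{sample_rate} Hz, {data.shape[0]} samples"

with gr.Blocks() as demo:
    audio_in = gr.Audio(label="Audio")
    inspect_button = gr.Button("Inspect")
    info_out = gr.Textbox(label="Info")
    inspect_button.click(fn=inspect_audio, inputs=audio_in, outputs=info_out)

demo.launch()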
automatic_speech_recognition.py ADDED
@@ -0,0 +1,14 @@
+import gc
+from os import getenv
+from transformers import pipeline
+from utils import spaces_gpu, resample_audio
+
+
+@spaces_gpu
+def automatic_speech_recognition(audio: tuple[int, bytes]) -> str:
+    asr = pipeline(task="automatic-speech-recognition", model=getenv("AUDIO_TRANSCRIPTION_MODEL"))
+    audio_array = resample_audio(asr.feature_extractor.sampling_rate, audio)
+    result = asr(audio_array)
+    del asr
+    gc.collect()
+    return result["text"]
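A quick way to exercise the new function outside the UI. The AUDIO_TRANSCRIPTION_MODEL variable comes from this commit, but the file name sample.wav and the choice of a short mono clip are assumptions; soundfile is already pinned in requirements.txt:

from dotenv import load_dotenv
import soundfile as sf
from automatic_speech_recognition import automatic_speech_recognition

load_dotenv()  # must provide AUDIO_TRANSCRIPTION_MODEL
# Read a short mono clip as float32 samples plus its sample rate.
data, sample_rate = sf.read("sample.wav", dtype="float32")
print(automatic_speech_recognition((sample_rate, data)))

Although the annotation says tuple[int, bytes], the new resample_audio helper also accepts an ndarray, which is what both soundfile and Gradio's numpy mode actually produce.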
requirements.txt CHANGED
@@ -1,10 +1,13 @@
 gradio>=5.49.1
 huggingface-hub>=0.34.0,<1.0
-python-dotenv>=1.0.0
+inflect>=7.0.0
+librosa>=0.10.0
+numpy>=1.24.0
 pandas>=2.0.0
+phonemizer>=3.0.0
 pillow>=10.0.0
+python-dotenv>=1.0.0
 requests>=2.31.0
-transformers>=4.40.0
+soundfile>=0.12.0
 timm>=1.0.0
-inflect>=7.0.0
-phonemizer>=3.0.0
+transformers>=4.40.0
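With the audio dependencies added (librosa for resampling, soundfile and numpy for audio I/O) and the list alphabetized, a fresh environment installs everything with the usual command:

pip install -r requirements.txt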
text_to_speech.py CHANGED
@@ -1,10 +1,12 @@
 import gc
+from os import getenv
 from transformers import pipeline
 from utils import spaces_gpu
 
+
 @spaces_gpu
 def text_to_speech(text: str) -> tuple[int, bytes]:
-    narrator = pipeline("text-to-speech", "kakao-enterprise/vits-ljs")
+    narrator = pipeline("text-to-speech", getenv("TEXT_TO_SPEECH_MODEL"))
     del narrator
     gc.collect()
     result = narrator(text)
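Both pipelines now resolve their model IDs from environment variables instead of hardcoded strings. A sketch of a matching .env file for local runs with python-dotenv follows; the TTS value is the model this commit removed from the source, while the ASR value is purely illustrative:

TEXT_TO_SPEECH_MODEL=kakao-enterprise/vits-ljs
AUDIO_TRANSCRIPTION_MODEL=openai/whisper-tiny.en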
utils.py CHANGED
@@ -5,12 +5,14 @@ from os import getenv
 import requests
 from tempfile import NamedTemporaryFile
 import torch
+import numpy as np
+import soundfile as sf
+import librosa
 
 
 # Try to import spaces decorator (for Hugging Face Spaces), otherwise use no-op decorator.
 try:
-    import spaces
-    spaces_gpu = spaces.GPU
+    from spaces import GPU as spaces_gpu
 except ImportError:
     # For local development, use a no-op decorator because spaces is not available.
     def spaces_gpu(func):
@@ -42,3 +44,20 @@ def save_image_to_temp_file(image: Image) -> str:
     temp_file.close()
     image.save(temp_path, format=image_format)
     return temp_path
+
+def resample_audio(target_sample_rate: int, audio: tuple[int, bytes | np.ndarray]) -> np.ndarray:
+    sample_rate, audio_data = audio
+
+    # Convert audio data to a numpy array if it's bytes.
+    if isinstance(audio_data, bytes):
+        audio_array = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0
+    elif isinstance(audio_data, np.ndarray):
+        audio_array = audio_data.astype(np.float32)
+    else:
+        raise ValueError(f"Unsupported audio_data type: {type(audio_data)}")
+
+    # Resample if sample rates don't match.
+    if sample_rate != target_sample_rate:
+        audio_array = librosa.resample(audio_array, orig_sr=sample_rate, target_sr=target_sample_rate)
+
+    return audio_array
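A small sanity check for the new resample_audio helper (not part of the commit): one second of a 440 Hz tone at 48 kHz should come back as roughly 16,000 samples when resampled to 16 kHz.

import numpy as np
from utils import resample_audio

sample_rate = 48_000
t = np.linspace(0, 1, sample_rate, endpoint=False)
tone = np.sin(2 * np.pi * 440 * t).astype(np.float32)

resampled = resample_audio(16_000, (sample_rate, tone))
print(resampled.shape)  # expected: (16000,)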