Move environment variable querying code out of the inference functions
Files changed:
- automatic_speech_recognition.py  +6 -7
- chatbot.py  +14 -11
- image_classification.py  +5 -5
- image_to_text.py  +10 -9
- text_to_image.py  +5 -6
- text_to_speech.py  +6 -4
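Every file follows the same refactoring: the create_*_tab function now reads the model ID from its environment variable once, and the inference function receives that ID as an explicit parameter bound with functools.partial before being registered as a Gradio click handler. A minimal sketch of the pattern, with illustrative names (fake_inference, create_tab, EXAMPLE_MODEL are placeholders, not part of this repository):

from functools import partial
from os import getenv

import gradio as gr


def fake_inference(model: str, text: str) -> str:
    # The inference function no longer calls getenv(); the model ID arrives as an argument.
    return f"[{model}] would process: {text}"


def create_tab():
    # The environment variable is read once, at UI construction time.
    model_id = getenv("EXAMPLE_MODEL", "some-org/some-model")
    box = gr.Textbox(label="Input")
    out = gr.Textbox(label="Output")
    btn = gr.Button("Run")
    # partial() binds the model ID so the click handler still takes only UI inputs.
    btn.click(fn=partial(fake_inference, model_id), inputs=box, outputs=out)


with gr.Blocks() as demo:
    create_tab()

Binding the model ID with partial keeps each handler's signature limited to UI inputs while the environment is queried only when the tab is built.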
automatic_speech_recognition.py
CHANGED
@@ -4,7 +4,7 @@ from os import getenv, path, unlink
 import gradio as gr
 from utils import save_audio_to_temp_file, get_model_sample_rate, request_audio
 
-def automatic_speech_recognition(client: InferenceClient, audio: tuple[int, bytes]) -> str:
+def automatic_speech_recognition(client: InferenceClient, model: str, audio: tuple[int, bytes]) -> str:
     """Transcribe audio to text using Hugging Face Inference API.
 
     This function converts speech audio into text transcription. The audio is
@@ -13,6 +13,7 @@ def automatic_speech_recognition(client: InferenceClient, audio: tuple[int, bytes]) -> str:
 
     Args:
         client: Hugging Face InferenceClient instance for API calls.
+        model: Hugging Face model ID to use for automatic speech recognition.
         audio: Tuple containing:
             - int: Sample rate of the input audio (e.g., 44100 Hz)
             - bytes: Raw audio data as bytes
@@ -21,18 +22,15 @@ def automatic_speech_recognition(client: InferenceClient, audio: tuple[int, bytes]) -> str:
         String containing the transcribed text from the audio.
 
     Note:
-        - The model ID is determined by the AUDIO_TRANSCRIPTION_MODEL environment variable.
         - Audio is automatically resampled to match the model's expected sample rate.
         - Audio is saved as a WAV file for InferenceClient compatibility.
         - Automatically cleans up temporary files after transcription.
-        - Uses openai/whisper-large-v3 or similar ASR models.
     """
     temp_file_path = None
     try:
-        model_id = getenv("AUDIO_TRANSCRIPTION_MODEL")
-        sample_rate = get_model_sample_rate(model_id)
+        sample_rate = get_model_sample_rate(model)
         temp_file_path = save_audio_to_temp_file(sample_rate, audio)
-        result = client.automatic_speech_recognition(temp_file_path, model=model_id)
+        result = client.automatic_speech_recognition(temp_file_path, model=model)
         return result["text"]
     finally:
         if temp_file_path and path.exists(temp_file_path):  # Clean up temporary file.
@@ -54,6 +52,7 @@ def create_asr_tab(client: InferenceClient):
     Args:
         client: Hugging Face InferenceClient instance to pass to the automatic_speech_recognition function.
     """
+    model_id = getenv("AUDIO_TRANSCRIPTION_MODEL")
     gr.Markdown("Transcribe audio to text.")
     audio_transcription_url_input = gr.Textbox(label="Audio URL")
     audio_transcription_audio_request_button = gr.Button("Get Audio")
@@ -66,7 +65,7 @@ def create_asr_tab(client: InferenceClient):
     audio_transcription_generate_button = gr.Button("Transcribe")
     audio_transcription_output = gr.Textbox(label="Text")
     audio_transcription_generate_button.click(
-        fn=partial(automatic_speech_recognition, client),
+        fn=partial(automatic_speech_recognition, client, model_id),
         inputs=audio_transcription_audio_input,
         outputs=audio_transcription_output
     )
chatbot.py
CHANGED
@@ -8,7 +8,7 @@ _chatbot = None
 _tokenizer = None
 _is_seq2seq = None
 
-def get_chatbot():
+def get_chatbot(model: str):
     """Get or create the chatbot model instance.
 
     This function implements a singleton pattern to load and cache the chatbot
@@ -16,6 +16,9 @@ def get_chatbot():
     models) and sequence-to-sequence models (like BlenderBot). The model type
     is automatically detected from the model configuration.
 
+    Args:
+        model: Hugging Face model ID to use for the chatbot.
+
     Returns:
         Tuple containing:
             - Model: The loaded transformer model (AutoModelForCausalLM or AutoModelForSeq2SeqLM)
@@ -23,7 +26,6 @@ def get_chatbot():
             - bool: Whether the model is a seq2seq model (True) or causal LM (False)
 
     Note:
-        - The model ID is determined by the CHAT_MODEL environment variable.
         - Models are loaded with safetensors for secure loading.
         - Automatically selects the best available device (CUDA/XPU/MPS/CPU).
         - Sets pad_token to eos_token if pad_token is not configured.
@@ -31,15 +33,14 @@ def get_chatbot():
     """
     global _chatbot, _tokenizer, _is_seq2seq
     if _chatbot is None:
-        model_id = getenv("CHAT_MODEL")
         device = get_pytorch_device()
-        _tokenizer = AutoTokenizer.from_pretrained(model_id)
+        _tokenizer = AutoTokenizer.from_pretrained(model)
 
         # Try to determine model type and load accordingly
        # Check tokenizer config or model config to see if it's seq2seq
        try:
            from transformers import AutoConfig
-            config = AutoConfig.from_pretrained(model_id)
+            config = AutoConfig.from_pretrained(model)
            # Seq2seq models have encoder/decoder, causal LMs don't
            _is_seq2seq = hasattr(config, 'is_encoder_decoder') and config.is_encoder_decoder
        except Exception:
@@ -48,12 +49,12 @@ def get_chatbot():
 
         if _is_seq2seq:
             _chatbot = AutoModelForSeq2SeqLM.from_pretrained(
-                model_id,
+                model,
                 use_safetensors=True
             ).to(device)
         else:
             _chatbot = AutoModelForCausalLM.from_pretrained(
-                model_id,
+                model,
                 use_safetensors=True
             ).to(device)
 
@@ -64,7 +65,7 @@ def get_chatbot():
     return _chatbot, _tokenizer, _is_seq2seq
 
 @spaces_gpu
-def chat(message: str, conversation_history: list[dict] | None) -> tuple[str, list[dict]]:
+def chat(model: str, message: str, conversation_history: list[dict] | None) -> tuple[str, list[dict]]:
     """Generate a chatbot response given a user message and conversation history.
 
     This function handles conversation with AI chatbots, supporting both modern
@@ -73,6 +74,7 @@ def chat(message: str, conversation_history: list[dict] | None) -> tuple[str, list[dict]]:
     formats inputs appropriately based on the model type.
 
     Args:
+        model: Hugging Face model ID to use for the chatbot.
         message: The user's current message as a string.
         conversation_history: Optional list of previous conversation messages.
             Each message is a dict with "role" ("user" or "assistant") and "content".
@@ -92,7 +94,7 @@ def chat(message: str, conversation_history: list[dict] | None) -> tuple[str, list[dict]]:
         - Automatically manages conversation context and history
         - Extracts only newly generated text for causal LMs with chat templates
     """
-    model, tokenizer, is_seq2seq = get_chatbot()
+    model_instance, tokenizer, is_seq2seq = get_chatbot(model)
 
     # Initialize conversation history if this is the first message
     if conversation_history is None:
@@ -141,7 +143,7 @@ def chat(message: str, conversation_history: list[dict] | None) -> tuple[str, list[dict]]:
     inputs = tokenizer(dialogue_text, return_tensors="pt", truncation=True, max_length=1024).to(device)
 
     # Generate response
-    outputs = model.generate(
+    outputs = model_instance.generate(
         **inputs,
         max_new_tokens=256,
         do_sample=True,
@@ -188,6 +190,7 @@ def create_chatbot_tab():
     and manages the conversion between Gradio's chat format and the internal
     conversation history format.
     """
+    model_id = getenv("CHAT_MODEL")
     gr.Markdown("Have a conversation with an AI chatbot.")
     chatbot_history = gr.State(value=None)  # Store the conversation history.
     chatbot_output = gr.Chatbot(label="Conversation")
@@ -214,7 +217,7 @@ def create_chatbot_tab():
         """
         if not message.strip():
             return history, conversation_state, ""
-        response, updated_conversation = chat(message, conversation_state)  # Get response from chatbot.
+        response, updated_conversation = chat(model_id, message, conversation_state)  # Get response from chatbot.
        if history is None:  # Update Gradio chat history format: list of [user_message, bot_message] pairs.
            history = []
        history.append([message, response])
image_classification.py
CHANGED
@@ -8,7 +8,7 @@ from pandas import DataFrame
 from utils import save_image_to_temp_file, request_image
 
 
-def image_classification(client: InferenceClient, image: Image) -> DataFrame:
+def image_classification(client: InferenceClient, model: str, image: Image) -> DataFrame:
     """Classify an image using Hugging Face Inference API.
 
     This function classifies a recyclable item image into categories:
@@ -18,6 +18,7 @@ def image_classification(client: InferenceClient, image: Image) -> DataFrame:
 
     Args:
         client: Hugging Face InferenceClient instance for API calls.
+        model: Hugging Face model ID to use for image classification.
         image: PIL Image object to classify.
 
     Returns:
@@ -26,14 +27,12 @@ def image_classification(client: InferenceClient, image: Image) -> DataFrame:
         - Probability: The confidence score as a percentage string (e.g., "95.23%")
 
     Note:
-        - The model ID is determined by the IMAGE_CLASSIFICATION_MODEL environment variable.
-        - Uses Trash-Net model for recyclable item classification.
         - Automatically cleans up temporary files after classification.
        - Temporary file is created with format preservation if possible.
     """
     try:
         temp_file_path = save_image_to_temp_file(image)  # Needed because InferenceClient does not accept PIL Images directly.
-        classifications = client.image_classification(temp_file_path, model=getenv("IMAGE_CLASSIFICATION_MODEL"))
+        classifications = client.image_classification(temp_file_path, model=model)
         return pd.DataFrame({
             "Label": classification.label,
             "Probability": f"{classification.score:.2%}"
@@ -60,6 +59,7 @@ def create_image_classification_tab(client: InferenceClient):
     Args:
         client: Hugging Face InferenceClient instance to pass to the image_classification function.
     """
+    model_id = getenv("IMAGE_CLASSIFICATION_MODEL")
     gr.Markdown("Classify a recyclable item as one of: cardboard, glass, metal, paper, plastic, or other using [Trash-Net](https://huggingface.co/prithivMLmods/Trash-Net).")
     image_classification_url_input = gr.Textbox(label="Image URL")
     image_classification_image_request_button = gr.Button("Get Image")
@@ -72,7 +72,7 @@ def create_image_classification_tab(client: InferenceClient):
     image_classification_button = gr.Button("Classify")
     image_classification_output = gr.Dataframe(label="Classification", headers=["Label", "Probability"], interactive=False)
     image_classification_button.click(
-        fn=partial(image_classification, client),
+        fn=partial(image_classification, client, model_id),
         inputs=image_classification_image_input,
         outputs=image_classification_output
     )
image_to_text.py
CHANGED
@@ -1,4 +1,5 @@
 import gc
+from functools import partial
 from os import getenv
 import gradio as gr
 from PIL.Image import Image
@@ -7,7 +8,7 @@ from utils import get_pytorch_device, spaces_gpu, request_image
 
 
 @spaces_gpu
-def image_to_text(image: Image) -> list[str]:
+def image_to_text(model: str, image: Image) -> list[str]:
     """Generate text captions for an image using BLIP model.
 
     This function uses a BLIP (Bootstrapping Language-Image Pre-training) model
@@ -15,29 +16,28 @@ def image_to_text(image: Image) -> list[str]:
     loaded, inference is performed, and then cleaned up to free GPU memory.
 
     Args:
+        model: Hugging Face model ID to use for image captioning.
         image: PIL Image object to generate captions for.
 
     Returns:
         List of string captions describing the image.
 
     Note:
-        - The model ID is determined by the IMAGE_TO_TEXT_MODEL environment variable.
         - Uses safetensors for secure model loading.
         - Automatically selects the best available device (CUDA/XPU/MPS/CPU).
         - Cleans up model and GPU memory after inference.
        - Uses beam search with 3 beams, max length 20, min length 5.
     """
-    image_to_text_model_id = getenv("IMAGE_TO_TEXT_MODEL")
     pytorch_device = get_pytorch_device()
-    processor = AutoProcessor.from_pretrained(image_to_text_model_id)
-    model = BlipForConditionalGeneration.from_pretrained(
-        image_to_text_model_id,
+    processor = AutoProcessor.from_pretrained(model)
+    model_instance = BlipForConditionalGeneration.from_pretrained(
+        model,
         use_safetensors=True  # Use safetensors to avoid torch.load restriction.
     ).to(pytorch_device)
     inputs = processor(images=image, return_tensors="pt").to(pytorch_device)
-    generated_ids = model.generate(pixel_values=inputs.pixel_values, num_beams=3, max_length=20, min_length=5)
+    generated_ids = model_instance.generate(pixel_values=inputs.pixel_values, num_beams=3, max_length=20, min_length=5)
     results = processor.batch_decode(generated_ids, skip_special_tokens=True)
-    del model, inputs
+    del model_instance, inputs
     gc.collect()
     return results
 
@@ -51,6 +51,7 @@ def create_image_to_text_tab():
         - Image preview component
         - Caption button and output list
     """
+    model_id = getenv("IMAGE_TO_TEXT_MODEL")
     gr.Markdown("Generate a text description of an image.")
     image_to_text_url_input = gr.Textbox(label="Image URL")
     image_to_text_image_request_button = gr.Button("Get Image")
@@ -63,7 +64,7 @@ def create_image_to_text_tab():
     image_to_text_button = gr.Button("Caption")
     image_to_text_output = gr.List(label="Captions", headers=["Caption"])
     image_to_text_button.click(
-        fn=image_to_text,
+        fn=partial(image_to_text, model_id),
         inputs=image_to_text_image_input,
         outputs=image_to_text_output
     )
text_to_image.py
CHANGED
@@ -5,20 +5,18 @@ from PIL.Image import Image
 from huggingface_hub import InferenceClient
 
 
-def text_to_image(client: InferenceClient, prompt: str) -> Image:
+def text_to_image(client: InferenceClient, model: str, prompt: str) -> Image:
     """Generate an image from a text prompt using Hugging Face Inference API.
 
     Args:
         client: Hugging Face InferenceClient instance for API calls.
+        model: Hugging Face model ID to use for text-to-image generation.
         prompt: Text description of the desired image.
 
     Returns:
         PIL Image object representing the generated image.
-
-    Note:
-        The model to use is determined by the TEXT_TO_IMAGE_MODEL environment variable.
     """
-    return client.text_to_image(prompt, model=getenv("TEXT_TO_IMAGE_MODEL"))
+    return client.text_to_image(prompt, model=model)
 
 
 def create_text_to_image_tab(client: InferenceClient):
@@ -30,12 +28,13 @@ def create_text_to_image_tab(client: InferenceClient):
     Args:
         client: Hugging Face InferenceClient instance to pass to the text_to_image function.
     """
+    model_id = getenv("TEXT_TO_IMAGE_MODEL")
     gr.Markdown("Generate an image from a text prompt.")
     text_to_image_prompt = gr.Textbox(label="Prompt")
     text_to_image_generate_button = gr.Button("Generate")
     text_to_image_output = gr.Image(label="Image", type="pil")
     text_to_image_generate_button.click(
-        fn=partial(text_to_image, client),
+        fn=partial(text_to_image, client, model_id),
         inputs=text_to_image_prompt,
         outputs=text_to_image_output
     )
text_to_speech.py
CHANGED
@@ -1,4 +1,5 @@
 import gc
+from functools import partial
 from os import getenv
 import gradio as gr
 from transformers import pipeline
@@ -6,7 +7,7 @@ from utils import spaces_gpu
 
 
 @spaces_gpu
-def text_to_speech(text: str) -> tuple[int, bytes]:
+def text_to_speech(model: str, text: str) -> tuple[int, bytes]:
     """Convert text to speech audio using a TTS (Text-to-Speech) model.
 
     This function uses a transformer pipeline to generate speech audio from
@@ -14,6 +15,7 @@ def text_to_speech(text: str) -> tuple[int, bytes]:
     up to free GPU memory.
 
     Args:
+        model: Hugging Face model ID to use for text-to-speech.
         text: Input text string to convert to speech.
 
     Returns:
@@ -22,7 +24,6 @@ def text_to_speech(text: str) -> tuple[int, bytes]:
         - bytes: Raw audio data as bytes
 
     Note:
-        - The model ID is determined by the TEXT_TO_SPEECH_MODEL environment variable.
         - Uses safetensors for secure model loading.
         - Automatically selects the best available device (CUDA/XPU/MPS/CPU).
        - Cleans up model and GPU memory after inference.
@@ -30,7 +31,7 @@ def text_to_speech(text: str) -> tuple[int, bytes]:
     """
     narrator = pipeline(
         "text-to-speech",
-        getenv("TEXT_TO_SPEECH_MODEL"),
+        model,
         model_kwargs={"use_safetensors": True}  # Use safetensors to avoid torch.load restriction.
     )
     result = narrator(text)
@@ -45,12 +46,13 @@ def create_text_to_speech_tab():
     This function sets up all UI components for text-to-speech generation,
     including input textbox, generate button, and output audio player.
     """
+    model_id = getenv("TEXT_TO_SPEECH_MODEL")
     gr.Markdown("Generate speech from text.")
     text_to_speech_text = gr.Textbox(label="Text")
     text_to_speech_generate_button = gr.Button("Generate")
     text_to_speech_output = gr.Audio(label="Speech")
     text_to_speech_generate_button.click(
-        fn=text_to_speech,
+        fn=partial(text_to_speech, model_id),
         inputs=text_to_speech_text,
         outputs=text_to_speech_output
     )