LiKenun committed
Commit 5c395b2 · 1 Parent(s): 24f37c6

Switch to use GPU instead of inference client

README.md CHANGED
@@ -96,8 +96,11 @@ Create a `.env` file in the project root directory with the following environmen
 ### Required Environment Variables
 
 ```env
-# Hugging Face API Token (required for Inference API access)
+# Hugging Face API Token (required for gated models and Inference API access)
 # Get your token from: https://huggingface.co/settings/tokens
+# Required fine-grained permissions:
+# 1. "Make calls to Inference Providers"
+# 2. "Read access to contents of all public gated repos you can access"
 HF_TOKEN=your_huggingface_token_here
 
 # Model IDs for each building block
@@ -114,6 +117,11 @@ CHAT_MODEL=model_id_for_chatbot
 ```env
 # Request timeout in seconds (default: 45)
 REQUEST_TIMEOUT=45
+
+# Enable reduced memory usage by using lower precision (float16) for all models (default: False).
+# Set to "True" to reduce GPU memory usage at the cost of slightly lower precision.
+# Sometimes this is still not enough—in which case you must choose another model that will fit in memory.
+REDUCED_MEMORY=False
 ```
 
 ### Example `.env` File
@@ -206,7 +214,23 @@ If you encounter a `RuntimeError: espeak not installed on your system` error:
 
 If you encounter errors loading models:
 
-1. Verify your `HF_TOKEN` is valid and has access to the models. Some models are gated.
-2. Check that model IDs in your `.env` file are correct.
-3. Ensure you have sufficient disk space for model downloads.
-4. For local models, ensure you have sufficient RAM or VRAM.
+1. Verify your `HF_TOKEN` is valid and has the required permissions:
+   - "Make calls to Inference Providers"
+   - "Read access to contents of all public gated repos you can access"
+   Some models (like `black-forest-labs/FLUX.1-dev`) are gated and require these permissions.
+2. Ensure you have accepted the terms of use for gated models on their Hugging Face model pages.
+3. Check that model IDs in your `.env` file are correct.
+4. Ensure you have sufficient disk space for model downloads.
+5. For local models, ensure you have sufficient RAM or VRAM.
+
+### CUDA Out of Memory Errors
+
+If you encounter `torch.OutOfMemoryError: CUDA out of memory` errors:
+
+1. **Enable reduced memory mode**: Set `REDUCED_MEMORY=True` in your `.env` file to use lower precision (float16) for all models, which can reduce memory usage by approximately 50% at the cost of slightly lower precision.
+2. **Reduce model size**: Use smaller models or quantized versions when available.
+3. **Clear GPU cache**: The application automatically clears GPU memory after each inference, but you can manually clear it by restarting the application.
+4. **Set environment variable**: To reduce memory fragmentation, you can set `PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True`.
+   Add this to your shell profile (e.g., `~/.bashrc` or `~/.zshrc`) or set it before running the application.
+5. **Use CPU fallback**: If GPU memory is insufficient, the application will automatically fall back to CPU (though this will be slower).
+6. **Close other GPU applications**: Ensure no other applications are using the GPU simultaneously.
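
For orientation, the two memory settings documented above interact roughly as follows. This is a minimal sketch (not part of the commit): the dtype logic mirrors the `get_torch_dtype` helper added to `utils.py` below, and the allocator option only takes effect if it is in the environment before CUDA is initialized.

```python
from os import environ, getenv

# Set before importing torch so the CUDA allocator picks it up.
environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

import torch

# Mirrors utils.get_torch_dtype(): REDUCED_MEMORY=True selects float16,
# anything else leaves models in their default precision (None).
dtype = torch.float16 if getenv("REDUCED_MEMORY", "False").lower() == "true" else None
print(dtype)
```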
app.py CHANGED
@@ -1,7 +1,6 @@
 from dotenv import load_dotenv
 from os import getenv
 import gradio as gr
-from huggingface_hub import InferenceClient
 from automatic_speech_recognition import create_asr_tab
 from chatbot import create_chatbot_tab
 from image_classification import create_image_classification_tab
@@ -20,7 +19,6 @@ class App:
 
     def __init__(
         self,
-        client: InferenceClient,
         text_to_image_model: str,
         image_to_text_model: str,
         image_classification_model: str,
@@ -29,11 +27,9 @@ class App:
         chat_model: str,
         fallback_translation_model: str
     ):
-        """Initialize the App with an InferenceClient instance and model IDs.
+        """Initialize the App with model IDs.
 
         Args:
-            client: Hugging Face InferenceClient instance for making API calls
-                to Hugging Face's inference endpoints.
             text_to_image_model: Model ID for text-to-image generation.
             image_to_text_model: Model ID for image captioning.
             image_classification_model: Model ID for image classification.
@@ -43,7 +39,6 @@ class App:
             fallback_translation_model: Fallback translation model ID for languages
                 without specific translation models.
         """
-        self.client = client
         self.text_to_image_model = text_to_image_model
         self.image_to_text_model = image_to_text_model
         self.image_classification_model = image_classification_model
@@ -64,22 +59,19 @@ class App:
             gr.Markdown("A gallery of building blocks for building AI applications")
             with gr.Tabs():
                 with gr.Tab("Text-to-image Generation"):
-                    create_text_to_image_tab(self.client, self.text_to_image_model)
+                    create_text_to_image_tab(self.text_to_image_model)
                 with gr.Tab("Image-to-text or Image Captioning"):
                     create_image_to_text_tab(self.image_to_text_model)
                 with gr.Tab("Image Classification"):
-                    create_image_classification_tab(self.client, self.image_classification_model)
+                    create_image_classification_tab(self.image_classification_model)
                 with gr.Tab("Text-to-speech (TTS)"):
                     create_text_to_speech_tab(self.text_to_speech_model)
                 with gr.Tab("Automatic Speech Recognition (ASR)"):
-                    create_asr_tab(self.client, self.audio_transcription_model)
+                    create_asr_tab(self.audio_transcription_model)
                 with gr.Tab("Chat"):
                     create_chatbot_tab(self.chat_model)
                 with gr.Tab("Translation to English"):
-                    create_translation_tab(
-                        self.client,
-                        self.fallback_translation_model
-                    )
+                    create_translation_tab(self.fallback_translation_model)
 
         demo.launch()
 
@@ -87,7 +79,6 @@ class App:
 if __name__ == "__main__":
     load_dotenv()
     app = App(
-        client=InferenceClient(),
         text_to_image_model=getenv("TEXT_TO_IMAGE_MODEL"),
         image_to_text_model=getenv("IMAGE_TO_TEXT_MODEL"),
         image_classification_model=getenv("IMAGE_CLASSIFICATION_MODEL"),
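
The tab factories above now receive only a model ID and bind it with `functools.partial`, so Gradio supplies just the user input at click time. A minimal sketch of that wiring pattern, using a hypothetical `greet` handler and a placeholder model ID rather than anything from the repo:

```python
from functools import partial
import gradio as gr

def greet(model: str, text: str) -> str:
    # Stand-in for the per-tab inference functions; `model` is bound ahead of time.
    return f"[{model}] {text}"

with gr.Blocks() as demo:
    box = gr.Textbox(label="Input")
    out = gr.Textbox(label="Output")
    gr.Button("Run").click(fn=partial(greet, "some/model-id"), inputs=box, outputs=out)

demo.launch()
```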
automatic_speech_recognition.py CHANGED
@@ -1,46 +1,84 @@
+import gc
 from functools import partial
-from huggingface_hub import InferenceClient
-from os import path, unlink
 import gradio as gr
-from utils import save_audio_to_temp_file, get_model_sample_rate, request_audio
+import numpy as np
+import torch
+from transformers import pipeline
+from utils import get_pytorch_device, spaces_gpu, resample_audio, get_model_sample_rate, request_audio, get_torch_dtype
 
-def automatic_speech_recognition(client: InferenceClient, model: str, audio: tuple[int, bytes]) -> str:
-    """Transcribe audio to text using Hugging Face Inference API.
+@spaces_gpu
+def automatic_speech_recognition(model: str, audio: tuple[int, bytes | np.ndarray]) -> str:
+    """Transcribe audio to text using a Whisper or similar ASR model.
 
     This function converts speech audio into text transcription. The audio is
-    resampled to match the model's expected sample rate, saved to a temporary
-    file, and then sent to the Inference API for transcription.
+    resampled to match the model's expected sample rate, then processed locally.
 
     Args:
-        client: Hugging Face InferenceClient instance for API calls.
         model: Hugging Face model ID to use for automatic speech recognition.
         audio: Tuple containing:
            - int: Sample rate of the input audio (e.g., 44100 Hz)
-           - bytes: Raw audio data as bytes
+           - bytes | np.ndarray: Raw audio data as bytes or numpy array
 
     Returns:
        String containing the transcribed text from the audio.
 
     Note:
        - Audio is automatically resampled to match the model's expected sample rate.
-       - Audio is saved as a WAV file for InferenceClient compatibility.
-       - Automatically cleans up temporary files after transcription.
+       - Uses safetensors for secure model loading.
+       - Automatically selects the best available device (CUDA/XPU/MPS/CPU).
+       - Cleans up model and GPU memory after inference.
     """
-    temp_file_path = None
-    try:
-        sample_rate = get_model_sample_rate(model)
-        temp_file_path = save_audio_to_temp_file(sample_rate, audio)
-        result = client.automatic_speech_recognition(temp_file_path, model=model)
+    pytorch_device = get_pytorch_device()
+    target_sample_rate = get_model_sample_rate(model)
+
+    # Resample audio to target sample rate
+    audio_array = resample_audio(target_sample_rate, audio)
+
+    # During inference or evaluation, gradient calculations are unnecessary. Using torch.no_grad()
+    # reduces memory consumption by not storing gradients. This can significantly reduce the
+    # amount of memory used during the inference phase.
+    dtype = get_torch_dtype()
+    model_kwargs = {"use_safetensors": True}
+    if dtype is not None:
+        model_kwargs["dtype"] = dtype
+
+    # Load and run ASR pipeline
+    asr_pipeline = pipeline(
+        "automatic-speech-recognition",
+        model=model,
+        device=0 if pytorch_device == "cuda" else -1,
+        model_kwargs=model_kwargs
+    )
+    # Use return_timestamps="word" for long audio (>30 seconds) to avoid errors
+    # Using "word" ensures WhisperTimeStampLogitsProcessor is properly used during generation
+    # Set task='transcribe' and language='en' to avoid deprecation warnings and language detection
+    # Note: sampling_rate is not passed here since audio is already resampled to the model's expected rate
+    with torch.no_grad():
+        result = asr_pipeline(
+            audio_array,
+            return_timestamps="word",
+            task="transcribe",
+            language="en"
+        )
+
+    # Clean up GPU memory
+    del asr_pipeline
+    if pytorch_device == "cuda":
+        torch.cuda.empty_cache()
+    gc.collect()
+    # Extract text from result (works for both short and long audio)
+    if isinstance(result, dict) and "text" in result:
         return result["text"]
-    finally:
-        if temp_file_path and path.exists(temp_file_path):  # Clean up temporary file.
-            try:
-                unlink(temp_file_path)
-            except Exception:
-                pass  # Ignore clean-up errors.
+    elif isinstance(result, str):
+        return result
+    else:
+        # Fallback: try to extract text from chunks if present
+        if isinstance(result, dict) and "chunks" in result:
+            return " ".join(chunk.get("text", "") for chunk in result["chunks"] if isinstance(chunk, dict))
+        return str(result)
 
 
-def create_asr_tab(client: InferenceClient, model: str):
+def create_asr_tab(model: str):
     """Create the automatic speech recognition tab in the Gradio interface.
 
     This function sets up all UI components for automatic speech recognition, including:
@@ -50,7 +88,6 @@ def create_asr_tab(client: InferenceClient, model: str):
     - Transcribe button and output textbox
 
     Args:
-        client: Hugging Face InferenceClient instance to pass to the automatic_speech_recognition function.
        model: Hugging Face model ID to use for automatic speech recognition.
     """
     gr.Markdown("Transcribe audio to text.")
@@ -65,7 +102,7 @@ def create_asr_tab(client: InferenceClient, model: str):
     audio_transcription_generate_button = gr.Button("Transcribe")
     audio_transcription_output = gr.Textbox(label="Text")
     audio_transcription_generate_button.click(
-        fn=partial(automatic_speech_recognition, client, model),
+        fn=partial(automatic_speech_recognition, model),
        inputs=audio_transcription_audio_input,
        outputs=audio_transcription_output
     )
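
Outside the Gradio UI, the rewritten function can be smoke-tested directly. A rough sketch, assuming the `utils` helpers (`spaces_gpu`, `resample_audio`, `get_model_sample_rate`) behave as their names suggest, and using `openai/whisper-tiny` purely as an example model ID:

```python
import numpy as np
from automatic_speech_recognition import automatic_speech_recognition

# One second of silence at 44.1 kHz stands in for recorded audio.
sample_rate = 44100
waveform = np.zeros(sample_rate, dtype=np.float32)

text = automatic_speech_recognition("openai/whisper-tiny", (sample_rate, waveform))
print(text)
```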
image_classification.py CHANGED
@@ -1,23 +1,23 @@
+import gc
 from functools import partial
-from huggingface_hub import InferenceClient
-from os import path, unlink
 import gradio as gr
+import torch
 from PIL.Image import Image
 import pandas as pd
 from pandas import DataFrame
-from utils import save_image_to_temp_file, request_image
+from transformers import pipeline
+from utils import get_pytorch_device, spaces_gpu, request_image, get_torch_dtype
 
 
-def image_classification(client: InferenceClient, model: str, image: Image) -> DataFrame:
-    """Classify an image using Hugging Face Inference API.
+@spaces_gpu
+def image_classification(model: str, image: Image) -> DataFrame:
+    """Classify an image using a vision transformer model.
 
     This function classifies a recyclable item image into categories:
-    cardboard, glass, metal, paper, plastic, or other. The image is saved
-    to a temporary file since InferenceClient requires a file path rather than
-    a PIL Image object directly.
+    cardboard, glass, metal, paper, plastic, or other. The model is loaded,
+    inference is performed, and then cleaned up to free GPU memory.
 
     Args:
-        client: Hugging Face InferenceClient instance for API calls.
        model: Hugging Face model ID to use for image classification.
        image: PIL Image object to classify.
 
@@ -27,27 +27,41 @@ def image_classification(client: InferenceClient, model: str, image: Image) -> D
        - Probability: The confidence score as a percentage string (e.g., "95.23%")
 
     Note:
-        - Automatically cleans up temporary files after classification.
-        - Temporary file is created with format preservation if possible.
+        - Uses safetensors for secure model loading.
+        - Automatically selects the best available device (CUDA/XPU/MPS/CPU).
+        - Cleans up model and GPU memory after inference.
     """
-    try:
-        temp_file_path = save_image_to_temp_file(image)  # Needed because InferenceClient does not accept PIL Images directly.
-        classifications = client.image_classification(temp_file_path, model=model)
-        return pd.DataFrame({
-                "Label": classification.label,
-                "Probability": f"{classification.score:.2%}"
-            }
-            for classification
-            in classifications)
-    finally:
-        if temp_file_path and path.exists(temp_file_path):  # Clean up temporary file.
-            try:
-                unlink(temp_file_path)
-            except Exception:
-                pass  # Ignore clean-up errors.
+    pytorch_device = get_pytorch_device()
+    dtype = get_torch_dtype()
+
+    # During inference or evaluation, gradient calculations are unnecessary. Using torch.no_grad()
+    # reduces memory consumption by not storing gradients. This can significantly reduce the
+    # amount of memory used during the inference phase.
+    model_kwargs = {"use_safetensors": True}
+    if dtype is not None:
+        model_kwargs["dtype"] = dtype
+
+    classifier = pipeline(
+        "image-classification",
+        model=model,
+        device=0 if pytorch_device == "cuda" else -1,
+        model_kwargs=model_kwargs
+    )
+    with torch.no_grad():
+        results = classifier(image)
+
+    # Clean up GPU memory
+    del classifier
+    if pytorch_device == "cuda":
+        torch.cuda.empty_cache()
+    gc.collect()
+    return pd.DataFrame({
+        "Label": [result["label"] for result in results],
+        "Probability": [f"{result['score']:.2%}" for result in results]
+    })
 
 
-def create_image_classification_tab(client: InferenceClient, model: str):
+def create_image_classification_tab(model: str):
     """Create the image classification tab in the Gradio interface.
 
     This function sets up all UI components for image classification, including:
@@ -57,7 +71,6 @@ def create_image_classification_tab(client: InferenceClient, model: str):
     - Classify button and output dataframe showing labels and probabilities
 
     Args:
-        client: Hugging Face InferenceClient instance to pass to the image_classification function.
        model: Hugging Face model ID to use for image classification.
     """
     gr.Markdown("Classify a recyclable item as one of: cardboard, glass, metal, paper, plastic, or other using [Trash-Net](https://huggingface.co/prithivMLmods/Trash-Net).")
@@ -72,7 +85,7 @@ def create_image_classification_tab(client: InferenceClient, model: str):
     image_classification_button = gr.Button("Classify")
     image_classification_output = gr.Dataframe(label="Classification", headers=["Label", "Probability"], interactive=False)
     image_classification_button.click(
-        fn=partial(image_classification, client, model),
+        fn=partial(image_classification, model),
        inputs=image_classification_image_input,
        outputs=image_classification_output
     )
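
A quick local check of the new pipeline-based classifier; the Trash-Net model ID comes from the tab description above, while the blank test image is just a placeholder:

```python
from PIL import Image
from image_classification import image_classification

# Any RGB image works; a blank 224x224 canvas stands in for a photo of an item.
image = Image.new("RGB", (224, 224), color="white")

df = image_classification("prithivMLmods/Trash-Net", image)
print(df)  # DataFrame with "Label" and "Probability" columns
```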
image_to_text.py CHANGED
@@ -1,9 +1,10 @@
 import gc
 from functools import partial
 import gradio as gr
+import torch
 from PIL.Image import Image
 from transformers import AutoProcessor, BlipForConditionalGeneration
-from utils import get_pytorch_device, spaces_gpu, request_image
+from utils import get_pytorch_device, spaces_gpu, request_image, get_torch_dtype
 
 
 @spaces_gpu
@@ -28,15 +29,26 @@ def image_to_text(model: str, image: Image) -> list[str]:
     - Uses beam search with 3 beams, max length 20, min length 5.
     """
     pytorch_device = get_pytorch_device()
+    dtype = get_torch_dtype()
+
+    # During inference or evaluation, gradient calculations are unnecessary. Using torch.no_grad()
+    # reduces memory consumption by not storing gradients. This can significantly reduce the
+    # amount of memory used during the inference phase.
     processor = AutoProcessor.from_pretrained(model)
     model_instance = BlipForConditionalGeneration.from_pretrained(
        model,
-        use_safetensors=True  # Use safetensors to avoid torch.load restriction.
+        use_safetensors=True,  # Use safetensors to avoid torch.load restriction.
+        dtype=dtype
     ).to(pytorch_device)
     inputs = processor(images=image, return_tensors="pt").to(pytorch_device)
-    generated_ids = model_instance.generate(pixel_values=inputs.pixel_values, num_beams=3, max_length=20, min_length=5)
+    with torch.no_grad():
+        generated_ids = model_instance.generate(pixel_values=inputs.pixel_values, num_beams=3, max_length=20, min_length=5)
     results = processor.batch_decode(generated_ids, skip_special_tokens=True)
-    del model_instance, inputs
+
+    # Clean up GPU memory
+    del model_instance, inputs, generated_ids
+    if pytorch_device == "cuda":
+        torch.cuda.empty_cache()
     gc.collect()
     return results
 
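
The captioning function can be exercised the same way; a sketch with a placeholder URL and an assumed BLIP checkpoint (any model ID compatible with `BlipForConditionalGeneration` should work):

```python
from image_to_text import image_to_text
from utils import request_image

# Placeholder URL and model ID, not taken from the repo's configuration.
image = request_image("https://example.com/photo.jpg")
captions = image_to_text("Salesforce/blip-image-captioning-base", image)
print(captions)  # list[str] of generated captions
```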
requirements.txt CHANGED
@@ -1,3 +1,4 @@
+diffusers>=0.30.0
 gradio>=5.49.1
 huggingface-hub>=0.34.0,<1.0
 inflect>=7.0.0
@@ -7,8 +8,11 @@ numpy>=1.24.0
 pandas>=2.0.0
 phonemizer>=3.0.0
 pillow>=10.0.0
+protobuf>=4.25.0
 python-dotenv>=1.0.0
 requests>=2.31.0
+sacremoses>=0.0.53
+sentencepiece>=0.1.99
 soundfile>=0.12.0
 timm>=1.0.0
 transformers>=4.40.0
text_to_image.py CHANGED
@@ -1,31 +1,63 @@
+import gc
 from functools import partial
 import gradio as gr
+import torch
+from os import getenv
 from PIL.Image import Image
-from huggingface_hub import InferenceClient
+from diffusers import DiffusionPipeline
+from utils import get_pytorch_device, spaces_gpu, get_torch_dtype
 
 
-def text_to_image(client: InferenceClient, model: str, prompt: str) -> Image:
-    """Generate an image from a text prompt using Hugging Face Inference API.
+@spaces_gpu
+def text_to_image(model: str, prompt: str) -> Image:
+    """Generate an image from a text prompt using a diffusion model.
+
+    This function uses a diffusion pipeline (e.g., Stable Diffusion, FLUX) to generate
+    images from text prompts. The model is loaded, inference is performed, and then
+    cleaned up to free GPU memory.
 
     Args:
-        client: Hugging Face InferenceClient instance for API calls.
        model: Hugging Face model ID to use for text-to-image generation.
        prompt: Text description of the desired image.
 
     Returns:
        PIL Image object representing the generated image.
+
+    Note:
+        - Uses safetensors for secure model loading.
+        - Automatically selects the best available device (CUDA/XPU/MPS/CPU).
+        - Cleans up model and GPU memory after inference.
     """
-    return client.text_to_image(prompt, model=model)
+    pytorch_device = get_pytorch_device()
+    dtype = get_torch_dtype()
+
+    # During inference or evaluation, gradient calculations are unnecessary. Using torch.no_grad()
+    # reduces memory consumption by not storing gradients. This can significantly reduce the
+    # amount of memory used during the inference phase.
+    pipe = DiffusionPipeline.from_pretrained(
+        model,
+        use_safetensors=True,
+        dtype=dtype
+    )
+    pipe = pipe.to(pytorch_device)
+    with torch.no_grad():
+        result = pipe(prompt).images[0]
+
+    # Clean up GPU memory
+    del pipe
+    if pytorch_device == "cuda":
+        torch.cuda.empty_cache()
+    gc.collect()
+    return result
 
 
-def create_text_to_image_tab(client: InferenceClient, model: str):
+def create_text_to_image_tab(model: str):
     """Create the text-to-image generation tab in the Gradio interface.
 
     This function sets up all UI components for text-to-image generation,
     including input textbox, generate button, and output image display.
 
     Args:
-        client: Hugging Face InferenceClient instance to pass to the text_to_image function.
        model: Hugging Face model ID to use for text-to-image generation.
     """
     gr.Markdown("Generate an image from a text prompt.")
@@ -33,7 +65,7 @@ def create_text_to_image_tab(client: InferenceClient, model: str):
     text_to_image_generate_button = gr.Button("Generate")
     text_to_image_output = gr.Image(label="Image", type="pil")
     text_to_image_generate_button.click(
-        fn=partial(text_to_image, client, model),
+        fn=partial(text_to_image, model),
        inputs=text_to_image_prompt,
        outputs=text_to_image_output
     )
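
Because `black-forest-labs/FLUX.1-dev` is gated (see the README changes above), the diffusion pipeline can only download it when a token with the listed permissions is available. A hedged usage sketch; the prompt and the explicit `login` call are illustrative, not required if `HF_TOKEN` is already picked up from the environment:

```python
from os import getenv
from huggingface_hub import login
from text_to_image import text_to_image

# Gated checkpoints need an HF_TOKEN with the permissions listed in the README;
# an ungated text-to-image model also works for a quick test.
login(token=getenv("HF_TOKEN"))

image = text_to_image("black-forest-labs/FLUX.1-dev", "a watercolor fox in a pine forest")
image.save("fox.png")
```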
text_to_speech.py CHANGED
@@ -1,8 +1,9 @@
 import gc
 from functools import partial
 import gradio as gr
+import torch
 from transformers import pipeline
-from utils import spaces_gpu
+from utils import get_pytorch_device, spaces_gpu, get_torch_dtype
 
 
 @spaces_gpu
@@ -28,13 +29,29 @@ def text_to_speech(model: str, text: str) -> tuple[int, bytes]:
     - Cleans up model and GPU memory after inference.
     - Returns audio in format compatible with Gradio Audio component.
     """
+    pytorch_device = get_pytorch_device()
+    dtype = get_torch_dtype()
+
+    # During inference or evaluation, gradient calculations are unnecessary. Using torch.no_grad()
+    # reduces memory consumption by not storing gradients. This can significantly reduce the
+    # amount of memory used during the inference phase.
+    model_kwargs = {"use_safetensors": True}  # Use safetensors to avoid torch.load restriction.
+    if dtype is not None:
+        model_kwargs["dtype"] = dtype
+
     narrator = pipeline(
        "text-to-speech",
        model,
-        model_kwargs={"use_safetensors": True}  # Use safetensors to avoid torch.load restriction.
+        device=0 if pytorch_device == "cuda" else -1,
+        model_kwargs=model_kwargs
     )
-    result = narrator(text)
+    with torch.no_grad():
+        result = narrator(text)
+
+    # Clean up GPU memory
     del narrator
+    if pytorch_device == "cuda":
+        torch.cuda.empty_cache()
     gc.collect()
     return (result["sampling_rate"], result["audio"][0])
 
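The returned `(sampling_rate, audio)` tuple is shaped for Gradio's Audio component, but it can also be written straight to disk with `soundfile` (already in requirements.txt). The model ID here is a placeholder, not the one configured in `.env`:

```python
import soundfile as sf
from text_to_speech import text_to_speech

sampling_rate, audio = text_to_speech("suno/bark-small", "Hello from the text-to-speech tab.")
sf.write("hello.wav", audio, sampling_rate)  # audio is an array of samples
```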
translation.py CHANGED
@@ -1,7 +1,10 @@
+import gc
 from functools import partial
 import gradio as gr
-from huggingface_hub import InferenceClient
+import torch
 from langdetect import detect, LangDetectException
+from transformers import MarianMTModel, MarianTokenizer
+from utils import get_pytorch_device, spaces_gpu, get_torch_dtype
 
 
 # Language code mapping to Helsinki-NLP translation models
@@ -70,25 +73,27 @@ def get_translation_model(language_code: str, fallback_model: str) -> str:
     return LANGUAGE_TO_MODEL_MAP.get(language_code, fallback_model)
 
 
-def translate_to_english(
-    client: InferenceClient,
-    fallback_translation_model: str,
-    text: str
-) -> str:
+@spaces_gpu
+def translate_to_english(fallback_translation_model: str, text: str) -> str:
     """Translate text to English using automatic language detection.
 
     First detects the source language using the langdetect library, then selects
-    the appropriate translation model and translates the text to English.
+    the appropriate translation model and translates the text to English using
+    a local MarianMT model.
 
     Args:
-        client: Hugging Face InferenceClient instance for API calls.
        fallback_translation_model: Fallback translation model to use if no
            language-specific model is available.
        text: Input text to translate to English.
 
     Returns:
-        String containing the translated text in English, or a message if the
-        text is already in English.
+        String containing the translated text in English, or the original text
+        if it is already in English.
+
+    Note:
+        - Uses safetensors for secure model loading.
+        - Automatically selects the best available device (CUDA/XPU/MPS/CPU).
+        - Cleans up model and GPU memory after inference.
     """
     # Detect the language using langdetect library
     detected_lang = detect_language(text)
@@ -100,22 +105,42 @@ def translate_to_english(
     # Get the appropriate translation model
     translation_model = get_translation_model(detected_lang, fallback_translation_model)
 
-    # Translate using the selected model
-    result = client.translation(text, model=translation_model)
-    return result.translation_text
+    # Load model and tokenizer
+    pytorch_device = get_pytorch_device()
+    dtype = get_torch_dtype()
+
+    # During inference or evaluation, gradient calculations are unnecessary. Using torch.no_grad()
+    # reduces memory consumption by not storing gradients. This can significantly reduce the
+    # amount of memory used during the inference phase.
+    tokenizer = MarianTokenizer.from_pretrained(translation_model)
+    model = MarianMTModel.from_pretrained(
+        translation_model,
+        use_safetensors=True,
+        dtype=dtype
+    ).to(pytorch_device)
+
+    # Tokenize and translate
+    inputs = tokenizer([text], return_tensors="pt", padding=True, truncation=True).to(pytorch_device)
+    with torch.no_grad():
+        translated = model.generate(**inputs)
+    translation = tokenizer.batch_decode(translated, skip_special_tokens=True)[0]
+
+    # Clean up GPU memory
+    del model, tokenizer, inputs, translated
+    if pytorch_device == "cuda":
+        torch.cuda.empty_cache()
+    gc.collect()
+
+    return translation
 
 
-def create_translation_tab(
-    client: InferenceClient,
-    fallback_translation_model: str
-):
+def create_translation_tab(fallback_translation_model: str):
     """Create the translation to English tab in the Gradio interface.
 
     This function sets up all UI components for translation with automatic
     language detection, including input textbox, translate button, and output textbox.
 
     Args:
-        client: Hugging Face InferenceClient instance for API calls.
        fallback_translation_model: Fallback translation model to use if no
            language-specific model is available.
     """
@@ -124,7 +149,7 @@ def create_translation_tab(
     translation_button = gr.Button("Translate")
     translation_output = gr.Textbox(label="Translated Text", lines=5, interactive=False)
     translation_button.click(
-        fn=partial(translate_to_english, client, fallback_translation_model),
+        fn=partial(translate_to_english, fallback_translation_model),
        inputs=translation_input,
        outputs=translation_output
     )
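
A quick check of the new local MarianMT path; `Helsinki-NLP/opus-mt-mul-en` is assumed here as a plausible multilingual fallback, not something this commit prescribes:

```python
from translation import translate_to_english

translated = translate_to_english("Helsinki-NLP/opus-mt-mul-en", "Guten Morgen, wie geht es dir?")
print(translated)
```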
utils.py CHANGED
@@ -37,6 +37,14 @@ def get_pytorch_device() -> str:
            else "mps" if torch.mps.is_available()  # Apple Silicon
            else "cpu")  # gl bro 🫠
 
+def get_torch_dtype():
+    """Get the appropriate torch dtype based on reduced memory setting.
+
+    Returns:
+        torch.float16 if reduced memory is enabled, None otherwise (uses default precision).
+    """
+    return torch.float16 if getenv("REDUCED_MEMORY", "False").lower() == "true" else None
+
 def request_image(url: str) -> Image:
     """Fetch an image from a URL and return it as a PIL Image.
 
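
`get_torch_dtype` deliberately returns `None` when `REDUCED_MEMORY` is off so callers can omit the dtype kwarg entirely, which is why the tab modules guard it with `if dtype is not None`. A small sketch of that pattern:

```python
from os import environ
import torch
from utils import get_torch_dtype

environ["REDUCED_MEMORY"] = "True"
assert get_torch_dtype() is torch.float16

environ["REDUCED_MEMORY"] = "False"
model_kwargs = {"use_safetensors": True}
dtype = get_torch_dtype()
if dtype is not None:  # None means: leave the model in its default precision.
    model_kwargs["dtype"] = dtype
print(model_kwargs)  # {'use_safetensors': True}
```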