Spaces:
Running
on
Zero
Running
on
Zero
Switch to use GPU instead of inference client
Browse files- README.md +29 -5
- app.py +5 -14
- automatic_speech_recognition.py +62 -25
- image_classification.py +42 -29
- image_to_text.py +16 -4
- requirements.txt +4 -0
- text_to_image.py +40 -8
- text_to_speech.py +20 -3
- translation.py +44 -19
- utils.py +8 -0
README.md
CHANGED
|
@@ -96,8 +96,11 @@ Create a `.env` file in the project root directory with the following environmen
|
|
| 96 |
### Required Environment Variables
|
| 97 |
|
| 98 |
```env
|
| 99 |
-
# Hugging Face API Token (required for Inference API access)
|
| 100 |
# Get your token from: https://huggingface.co/settings/tokens
|
|
|
|
|
|
|
|
|
|
| 101 |
HF_TOKEN=your_huggingface_token_here
|
| 102 |
|
| 103 |
# Model IDs for each building block
|
|
@@ -114,6 +117,11 @@ CHAT_MODEL=model_id_for_chatbot
|
|
| 114 |
```env
|
| 115 |
# Request timeout in seconds (default: 45)
|
| 116 |
REQUEST_TIMEOUT=45
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
```
|
| 118 |
|
| 119 |
### Example `.env` File
|
|
@@ -206,7 +214,23 @@ If you encounter a `RuntimeError: espeak not installed on your system` error:
|
|
| 206 |
|
| 207 |
If you encounter errors loading models:
|
| 208 |
|
| 209 |
-
1. Verify your `HF_TOKEN` is valid and has
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
### Required Environment Variables
|
| 97 |
|
| 98 |
```env
|
| 99 |
+
# Hugging Face API Token (required for gated models and Inference API access)
|
| 100 |
# Get your token from: https://huggingface.co/settings/tokens
|
| 101 |
+
# Required fine-grained permissions:
|
| 102 |
+
# 1. "Make calls to Inference Providers"
|
| 103 |
+
# 2. "Read access to contents of all public gated repos you can access"
|
| 104 |
HF_TOKEN=your_huggingface_token_here
|
| 105 |
|
| 106 |
# Model IDs for each building block
|
|
|
|
| 117 |
```env
|
| 118 |
# Request timeout in seconds (default: 45)
|
| 119 |
REQUEST_TIMEOUT=45
|
| 120 |
+
|
| 121 |
+
# Enable reduced memory usage by using lower precision (float16) for all models (default: False).
|
| 122 |
+
# Set to "True" to reduce GPU memory usage at the cost of slightly lower precision.
|
| 123 |
+
# Sometimes this is still not enough—in which case you must choose another model that will fit in memory.
|
| 124 |
+
REDUCED_MEMORY=False
|
| 125 |
```
|
| 126 |
|
| 127 |
### Example `.env` File
|
|
|
|
| 214 |
|
| 215 |
If you encounter errors loading models:
|
| 216 |
|
| 217 |
+
1. Verify your `HF_TOKEN` is valid and has the required permissions:
|
| 218 |
+
- "Make calls to Inference Providers"
|
| 219 |
+
- "Read access to contents of all public gated repos you can access"
|
| 220 |
+
Some models (like `black-forest-labs/FLUX.1-dev`) are gated and require these permissions.
|
| 221 |
+
2. Ensure you have accepted the terms of use for gated models on their Hugging Face model pages.
|
| 222 |
+
3. Check that model IDs in your `.env` file are correct.
|
| 223 |
+
4. Ensure you have sufficient disk space for model downloads.
|
| 224 |
+
5. For local models, ensure you have sufficient RAM or VRAM.
|
| 225 |
+
|
| 226 |
+
### CUDA Out of Memory Errors
|
| 227 |
+
|
| 228 |
+
If you encounter `torch.OutOfMemoryError: CUDA out of memory` errors:
|
| 229 |
+
|
| 230 |
+
1. **Enable reduced memory mode**: Set `REDUCED_MEMORY=True` in your `.env` file to use lower precision (float16) for all models, which can reduce memory usage by approximately 50% at the cost of slightly lower precision.
|
| 231 |
+
2. **Reduce model size**: Use smaller models or quantized versions when available.
|
| 232 |
+
3. **Clear GPU cache**: The application automatically clears GPU memory after each inference, but you can manually clear it by restarting the application.
|
| 233 |
+
4. **Set environment variable**: To reduce memory fragmentation, you can set `PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True`.
|
| 234 |
+
Add this to your shell profile (e.g., `~/.bashrc` or `~/.zshrc`) or set it before running the application.
|
| 235 |
+
5. **Use CPU fallback**: If GPU memory is insufficient, the application will automatically fall back to CPU (though this will be slower).
|
| 236 |
+
6. **Close other GPU applications**: Ensure no other applications are using the GPU simultaneously.
|
app.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
from dotenv import load_dotenv
|
| 2 |
from os import getenv
|
| 3 |
import gradio as gr
|
| 4 |
-
from huggingface_hub import InferenceClient
|
| 5 |
from automatic_speech_recognition import create_asr_tab
|
| 6 |
from chatbot import create_chatbot_tab
|
| 7 |
from image_classification import create_image_classification_tab
|
|
@@ -20,7 +19,6 @@ class App:
|
|
| 20 |
|
| 21 |
def __init__(
|
| 22 |
self,
|
| 23 |
-
client: InferenceClient,
|
| 24 |
text_to_image_model: str,
|
| 25 |
image_to_text_model: str,
|
| 26 |
image_classification_model: str,
|
|
@@ -29,11 +27,9 @@ class App:
|
|
| 29 |
chat_model: str,
|
| 30 |
fallback_translation_model: str
|
| 31 |
):
|
| 32 |
-
"""Initialize the App with
|
| 33 |
|
| 34 |
Args:
|
| 35 |
-
client: Hugging Face InferenceClient instance for making API calls
|
| 36 |
-
to Hugging Face's inference endpoints.
|
| 37 |
text_to_image_model: Model ID for text-to-image generation.
|
| 38 |
image_to_text_model: Model ID for image captioning.
|
| 39 |
image_classification_model: Model ID for image classification.
|
|
@@ -43,7 +39,6 @@ class App:
|
|
| 43 |
fallback_translation_model: Fallback translation model ID for languages
|
| 44 |
without specific translation models.
|
| 45 |
"""
|
| 46 |
-
self.client = client
|
| 47 |
self.text_to_image_model = text_to_image_model
|
| 48 |
self.image_to_text_model = image_to_text_model
|
| 49 |
self.image_classification_model = image_classification_model
|
|
@@ -64,22 +59,19 @@ class App:
|
|
| 64 |
gr.Markdown("A gallery of building blocks for building AI applications")
|
| 65 |
with gr.Tabs():
|
| 66 |
with gr.Tab("Text-to-image Generation"):
|
| 67 |
-
create_text_to_image_tab(self.
|
| 68 |
with gr.Tab("Image-to-text or Image Captioning"):
|
| 69 |
create_image_to_text_tab(self.image_to_text_model)
|
| 70 |
with gr.Tab("Image Classification"):
|
| 71 |
-
create_image_classification_tab(self.
|
| 72 |
with gr.Tab("Text-to-speech (TTS)"):
|
| 73 |
create_text_to_speech_tab(self.text_to_speech_model)
|
| 74 |
with gr.Tab("Automatic Speech Recognition (ASR)"):
|
| 75 |
-
create_asr_tab(self.
|
| 76 |
with gr.Tab("Chat"):
|
| 77 |
create_chatbot_tab(self.chat_model)
|
| 78 |
with gr.Tab("Translation to English"):
|
| 79 |
-
create_translation_tab(
|
| 80 |
-
self.client,
|
| 81 |
-
self.fallback_translation_model
|
| 82 |
-
)
|
| 83 |
|
| 84 |
demo.launch()
|
| 85 |
|
|
@@ -87,7 +79,6 @@ class App:
|
|
| 87 |
if __name__ == "__main__":
|
| 88 |
load_dotenv()
|
| 89 |
app = App(
|
| 90 |
-
client=InferenceClient(),
|
| 91 |
text_to_image_model=getenv("TEXT_TO_IMAGE_MODEL"),
|
| 92 |
image_to_text_model=getenv("IMAGE_TO_TEXT_MODEL"),
|
| 93 |
image_classification_model=getenv("IMAGE_CLASSIFICATION_MODEL"),
|
|
|
|
| 1 |
from dotenv import load_dotenv
|
| 2 |
from os import getenv
|
| 3 |
import gradio as gr
|
|
|
|
| 4 |
from automatic_speech_recognition import create_asr_tab
|
| 5 |
from chatbot import create_chatbot_tab
|
| 6 |
from image_classification import create_image_classification_tab
|
|
|
|
| 19 |
|
| 20 |
def __init__(
|
| 21 |
self,
|
|
|
|
| 22 |
text_to_image_model: str,
|
| 23 |
image_to_text_model: str,
|
| 24 |
image_classification_model: str,
|
|
|
|
| 27 |
chat_model: str,
|
| 28 |
fallback_translation_model: str
|
| 29 |
):
|
| 30 |
+
"""Initialize the App with model IDs.
|
| 31 |
|
| 32 |
Args:
|
|
|
|
|
|
|
| 33 |
text_to_image_model: Model ID for text-to-image generation.
|
| 34 |
image_to_text_model: Model ID for image captioning.
|
| 35 |
image_classification_model: Model ID for image classification.
|
|
|
|
| 39 |
fallback_translation_model: Fallback translation model ID for languages
|
| 40 |
without specific translation models.
|
| 41 |
"""
|
|
|
|
| 42 |
self.text_to_image_model = text_to_image_model
|
| 43 |
self.image_to_text_model = image_to_text_model
|
| 44 |
self.image_classification_model = image_classification_model
|
|
|
|
| 59 |
gr.Markdown("A gallery of building blocks for building AI applications")
|
| 60 |
with gr.Tabs():
|
| 61 |
with gr.Tab("Text-to-image Generation"):
|
| 62 |
+
create_text_to_image_tab(self.text_to_image_model)
|
| 63 |
with gr.Tab("Image-to-text or Image Captioning"):
|
| 64 |
create_image_to_text_tab(self.image_to_text_model)
|
| 65 |
with gr.Tab("Image Classification"):
|
| 66 |
+
create_image_classification_tab(self.image_classification_model)
|
| 67 |
with gr.Tab("Text-to-speech (TTS)"):
|
| 68 |
create_text_to_speech_tab(self.text_to_speech_model)
|
| 69 |
with gr.Tab("Automatic Speech Recognition (ASR)"):
|
| 70 |
+
create_asr_tab(self.audio_transcription_model)
|
| 71 |
with gr.Tab("Chat"):
|
| 72 |
create_chatbot_tab(self.chat_model)
|
| 73 |
with gr.Tab("Translation to English"):
|
| 74 |
+
create_translation_tab(self.fallback_translation_model)
|
|
|
|
|
|
|
|
|
|
| 75 |
|
| 76 |
demo.launch()
|
| 77 |
|
|
|
|
| 79 |
if __name__ == "__main__":
|
| 80 |
load_dotenv()
|
| 81 |
app = App(
|
|
|
|
| 82 |
text_to_image_model=getenv("TEXT_TO_IMAGE_MODEL"),
|
| 83 |
image_to_text_model=getenv("IMAGE_TO_TEXT_MODEL"),
|
| 84 |
image_classification_model=getenv("IMAGE_CLASSIFICATION_MODEL"),
|
automatic_speech_recognition.py
CHANGED
|
@@ -1,46 +1,84 @@
|
|
|
|
|
| 1 |
from functools import partial
|
| 2 |
-
from huggingface_hub import InferenceClient
|
| 3 |
-
from os import path, unlink
|
| 4 |
import gradio as gr
|
| 5 |
-
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
-
|
| 8 |
-
|
|
|
|
| 9 |
|
| 10 |
This function converts speech audio into text transcription. The audio is
|
| 11 |
-
resampled to match the model's expected sample rate,
|
| 12 |
-
file, and then sent to the Inference API for transcription.
|
| 13 |
|
| 14 |
Args:
|
| 15 |
-
client: Hugging Face InferenceClient instance for API calls.
|
| 16 |
model: Hugging Face model ID to use for automatic speech recognition.
|
| 17 |
audio: Tuple containing:
|
| 18 |
- int: Sample rate of the input audio (e.g., 44100 Hz)
|
| 19 |
-
- bytes: Raw audio data as bytes
|
| 20 |
|
| 21 |
Returns:
|
| 22 |
String containing the transcribed text from the audio.
|
| 23 |
|
| 24 |
Note:
|
| 25 |
- Audio is automatically resampled to match the model's expected sample rate.
|
| 26 |
-
-
|
| 27 |
-
- Automatically
|
|
|
|
| 28 |
"""
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
return result["text"]
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
|
|
|
| 41 |
|
| 42 |
|
| 43 |
-
def create_asr_tab(
|
| 44 |
"""Create the automatic speech recognition tab in the Gradio interface.
|
| 45 |
|
| 46 |
This function sets up all UI components for automatic speech recognition, including:
|
|
@@ -50,7 +88,6 @@ def create_asr_tab(client: InferenceClient, model: str):
|
|
| 50 |
- Transcribe button and output textbox
|
| 51 |
|
| 52 |
Args:
|
| 53 |
-
client: Hugging Face InferenceClient instance to pass to the automatic_speech_recognition function.
|
| 54 |
model: Hugging Face model ID to use for automatic speech recognition.
|
| 55 |
"""
|
| 56 |
gr.Markdown("Transcribe audio to text.")
|
|
@@ -65,7 +102,7 @@ def create_asr_tab(client: InferenceClient, model: str):
|
|
| 65 |
audio_transcription_generate_button = gr.Button("Transcribe")
|
| 66 |
audio_transcription_output = gr.Textbox(label="Text")
|
| 67 |
audio_transcription_generate_button.click(
|
| 68 |
-
fn=partial(automatic_speech_recognition,
|
| 69 |
inputs=audio_transcription_audio_input,
|
| 70 |
outputs=audio_transcription_output
|
| 71 |
)
|
|
|
|
| 1 |
+
import gc
|
| 2 |
from functools import partial
|
|
|
|
|
|
|
| 3 |
import gradio as gr
|
| 4 |
+
import numpy as np
|
| 5 |
+
import torch
|
| 6 |
+
from transformers import pipeline
|
| 7 |
+
from utils import get_pytorch_device, spaces_gpu, resample_audio, get_model_sample_rate, request_audio, get_torch_dtype
|
| 8 |
|
| 9 |
+
@spaces_gpu
|
| 10 |
+
def automatic_speech_recognition(model: str, audio: tuple[int, bytes | np.ndarray]) -> str:
|
| 11 |
+
"""Transcribe audio to text using a Whisper or similar ASR model.
|
| 12 |
|
| 13 |
This function converts speech audio into text transcription. The audio is
|
| 14 |
+
resampled to match the model's expected sample rate, then processed locally.
|
|
|
|
| 15 |
|
| 16 |
Args:
|
|
|
|
| 17 |
model: Hugging Face model ID to use for automatic speech recognition.
|
| 18 |
audio: Tuple containing:
|
| 19 |
- int: Sample rate of the input audio (e.g., 44100 Hz)
|
| 20 |
+
- bytes | np.ndarray: Raw audio data as bytes or numpy array
|
| 21 |
|
| 22 |
Returns:
|
| 23 |
String containing the transcribed text from the audio.
|
| 24 |
|
| 25 |
Note:
|
| 26 |
- Audio is automatically resampled to match the model's expected sample rate.
|
| 27 |
+
- Uses safetensors for secure model loading.
|
| 28 |
+
- Automatically selects the best available device (CUDA/XPU/MPS/CPU).
|
| 29 |
+
- Cleans up model and GPU memory after inference.
|
| 30 |
"""
|
| 31 |
+
pytorch_device = get_pytorch_device()
|
| 32 |
+
target_sample_rate = get_model_sample_rate(model)
|
| 33 |
+
|
| 34 |
+
# Resample audio to target sample rate
|
| 35 |
+
audio_array = resample_audio(target_sample_rate, audio)
|
| 36 |
+
|
| 37 |
+
# During inference or evaluation, gradient calculations are unnecessary. Using torch.no_grad()
|
| 38 |
+
# reduces memory consumption by not storing gradients. This can significantly reduce the
|
| 39 |
+
# amount of memory used during the inference phase.
|
| 40 |
+
dtype = get_torch_dtype()
|
| 41 |
+
model_kwargs = {"use_safetensors": True}
|
| 42 |
+
if dtype is not None:
|
| 43 |
+
model_kwargs["dtype"] = dtype
|
| 44 |
+
|
| 45 |
+
# Load and run ASR pipeline
|
| 46 |
+
asr_pipeline = pipeline(
|
| 47 |
+
"automatic-speech-recognition",
|
| 48 |
+
model=model,
|
| 49 |
+
device=0 if pytorch_device == "cuda" else -1,
|
| 50 |
+
model_kwargs=model_kwargs
|
| 51 |
+
)
|
| 52 |
+
# Use return_timestamps="word" for long audio (>30 seconds) to avoid errors
|
| 53 |
+
# Using "word" ensures WhisperTimeStampLogitsProcessor is properly used during generation
|
| 54 |
+
# Set task='transcribe' and language='en' to avoid deprecation warnings and language detection
|
| 55 |
+
# Note: sampling_rate is not passed here since audio is already resampled to the model's expected rate
|
| 56 |
+
with torch.no_grad():
|
| 57 |
+
result = asr_pipeline(
|
| 58 |
+
audio_array,
|
| 59 |
+
return_timestamps="word",
|
| 60 |
+
task="transcribe",
|
| 61 |
+
language="en"
|
| 62 |
+
)
|
| 63 |
+
|
| 64 |
+
# Clean up GPU memory
|
| 65 |
+
del asr_pipeline
|
| 66 |
+
if pytorch_device == "cuda":
|
| 67 |
+
torch.cuda.empty_cache()
|
| 68 |
+
gc.collect()
|
| 69 |
+
# Extract text from result (works for both short and long audio)
|
| 70 |
+
if isinstance(result, dict) and "text" in result:
|
| 71 |
return result["text"]
|
| 72 |
+
elif isinstance(result, str):
|
| 73 |
+
return result
|
| 74 |
+
else:
|
| 75 |
+
# Fallback: try to extract text from chunks if present
|
| 76 |
+
if isinstance(result, dict) and "chunks" in result:
|
| 77 |
+
return " ".join(chunk.get("text", "") for chunk in result["chunks"] if isinstance(chunk, dict))
|
| 78 |
+
return str(result)
|
| 79 |
|
| 80 |
|
| 81 |
+
def create_asr_tab(model: str):
|
| 82 |
"""Create the automatic speech recognition tab in the Gradio interface.
|
| 83 |
|
| 84 |
This function sets up all UI components for automatic speech recognition, including:
|
|
|
|
| 88 |
- Transcribe button and output textbox
|
| 89 |
|
| 90 |
Args:
|
|
|
|
| 91 |
model: Hugging Face model ID to use for automatic speech recognition.
|
| 92 |
"""
|
| 93 |
gr.Markdown("Transcribe audio to text.")
|
|
|
|
| 102 |
audio_transcription_generate_button = gr.Button("Transcribe")
|
| 103 |
audio_transcription_output = gr.Textbox(label="Text")
|
| 104 |
audio_transcription_generate_button.click(
|
| 105 |
+
fn=partial(automatic_speech_recognition, model),
|
| 106 |
inputs=audio_transcription_audio_input,
|
| 107 |
outputs=audio_transcription_output
|
| 108 |
)
|
image_classification.py
CHANGED
|
@@ -1,23 +1,23 @@
|
|
|
|
|
| 1 |
from functools import partial
|
| 2 |
-
from huggingface_hub import InferenceClient
|
| 3 |
-
from os import path, unlink
|
| 4 |
import gradio as gr
|
|
|
|
| 5 |
from PIL.Image import Image
|
| 6 |
import pandas as pd
|
| 7 |
from pandas import DataFrame
|
| 8 |
-
from
|
|
|
|
| 9 |
|
| 10 |
|
| 11 |
-
|
| 12 |
-
|
|
|
|
| 13 |
|
| 14 |
This function classifies a recyclable item image into categories:
|
| 15 |
-
cardboard, glass, metal, paper, plastic, or other. The
|
| 16 |
-
|
| 17 |
-
a PIL Image object directly.
|
| 18 |
|
| 19 |
Args:
|
| 20 |
-
client: Hugging Face InferenceClient instance for API calls.
|
| 21 |
model: Hugging Face model ID to use for image classification.
|
| 22 |
image: PIL Image object to classify.
|
| 23 |
|
|
@@ -27,27 +27,41 @@ def image_classification(client: InferenceClient, model: str, image: Image) -> D
|
|
| 27 |
- Probability: The confidence score as a percentage string (e.g., "95.23%")
|
| 28 |
|
| 29 |
Note:
|
| 30 |
-
-
|
| 31 |
-
-
|
|
|
|
| 32 |
"""
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
|
| 50 |
-
def create_image_classification_tab(
|
| 51 |
"""Create the image classification tab in the Gradio interface.
|
| 52 |
|
| 53 |
This function sets up all UI components for image classification, including:
|
|
@@ -57,7 +71,6 @@ def create_image_classification_tab(client: InferenceClient, model: str):
|
|
| 57 |
- Classify button and output dataframe showing labels and probabilities
|
| 58 |
|
| 59 |
Args:
|
| 60 |
-
client: Hugging Face InferenceClient instance to pass to the image_classification function.
|
| 61 |
model: Hugging Face model ID to use for image classification.
|
| 62 |
"""
|
| 63 |
gr.Markdown("Classify a recyclable item as one of: cardboard, glass, metal, paper, plastic, or other using [Trash-Net](https://huggingface.co/prithivMLmods/Trash-Net).")
|
|
@@ -72,7 +85,7 @@ def create_image_classification_tab(client: InferenceClient, model: str):
|
|
| 72 |
image_classification_button = gr.Button("Classify")
|
| 73 |
image_classification_output = gr.Dataframe(label="Classification", headers=["Label", "Probability"], interactive=False)
|
| 74 |
image_classification_button.click(
|
| 75 |
-
fn=partial(image_classification,
|
| 76 |
inputs=image_classification_image_input,
|
| 77 |
outputs=image_classification_output
|
| 78 |
)
|
|
|
|
| 1 |
+
import gc
|
| 2 |
from functools import partial
|
|
|
|
|
|
|
| 3 |
import gradio as gr
|
| 4 |
+
import torch
|
| 5 |
from PIL.Image import Image
|
| 6 |
import pandas as pd
|
| 7 |
from pandas import DataFrame
|
| 8 |
+
from transformers import pipeline
|
| 9 |
+
from utils import get_pytorch_device, spaces_gpu, request_image, get_torch_dtype
|
| 10 |
|
| 11 |
|
| 12 |
+
@spaces_gpu
|
| 13 |
+
def image_classification(model: str, image: Image) -> DataFrame:
|
| 14 |
+
"""Classify an image using a vision transformer model.
|
| 15 |
|
| 16 |
This function classifies a recyclable item image into categories:
|
| 17 |
+
cardboard, glass, metal, paper, plastic, or other. The model is loaded,
|
| 18 |
+
inference is performed, and then cleaned up to free GPU memory.
|
|
|
|
| 19 |
|
| 20 |
Args:
|
|
|
|
| 21 |
model: Hugging Face model ID to use for image classification.
|
| 22 |
image: PIL Image object to classify.
|
| 23 |
|
|
|
|
| 27 |
- Probability: The confidence score as a percentage string (e.g., "95.23%")
|
| 28 |
|
| 29 |
Note:
|
| 30 |
+
- Uses safetensors for secure model loading.
|
| 31 |
+
- Automatically selects the best available device (CUDA/XPU/MPS/CPU).
|
| 32 |
+
- Cleans up model and GPU memory after inference.
|
| 33 |
"""
|
| 34 |
+
pytorch_device = get_pytorch_device()
|
| 35 |
+
dtype = get_torch_dtype()
|
| 36 |
+
|
| 37 |
+
# During inference or evaluation, gradient calculations are unnecessary. Using torch.no_grad()
|
| 38 |
+
# reduces memory consumption by not storing gradients. This can significantly reduce the
|
| 39 |
+
# amount of memory used during the inference phase.
|
| 40 |
+
model_kwargs = {"use_safetensors": True}
|
| 41 |
+
if dtype is not None:
|
| 42 |
+
model_kwargs["dtype"] = dtype
|
| 43 |
+
|
| 44 |
+
classifier = pipeline(
|
| 45 |
+
"image-classification",
|
| 46 |
+
model=model,
|
| 47 |
+
device=0 if pytorch_device == "cuda" else -1,
|
| 48 |
+
model_kwargs=model_kwargs
|
| 49 |
+
)
|
| 50 |
+
with torch.no_grad():
|
| 51 |
+
results = classifier(image)
|
| 52 |
+
|
| 53 |
+
# Clean up GPU memory
|
| 54 |
+
del classifier
|
| 55 |
+
if pytorch_device == "cuda":
|
| 56 |
+
torch.cuda.empty_cache()
|
| 57 |
+
gc.collect()
|
| 58 |
+
return pd.DataFrame({
|
| 59 |
+
"Label": [result["label"] for result in results],
|
| 60 |
+
"Probability": [f"{result['score']:.2%}" for result in results]
|
| 61 |
+
})
|
| 62 |
|
| 63 |
|
| 64 |
+
def create_image_classification_tab(model: str):
|
| 65 |
"""Create the image classification tab in the Gradio interface.
|
| 66 |
|
| 67 |
This function sets up all UI components for image classification, including:
|
|
|
|
| 71 |
- Classify button and output dataframe showing labels and probabilities
|
| 72 |
|
| 73 |
Args:
|
|
|
|
| 74 |
model: Hugging Face model ID to use for image classification.
|
| 75 |
"""
|
| 76 |
gr.Markdown("Classify a recyclable item as one of: cardboard, glass, metal, paper, plastic, or other using [Trash-Net](https://huggingface.co/prithivMLmods/Trash-Net).")
|
|
|
|
| 85 |
image_classification_button = gr.Button("Classify")
|
| 86 |
image_classification_output = gr.Dataframe(label="Classification", headers=["Label", "Probability"], interactive=False)
|
| 87 |
image_classification_button.click(
|
| 88 |
+
fn=partial(image_classification, model),
|
| 89 |
inputs=image_classification_image_input,
|
| 90 |
outputs=image_classification_output
|
| 91 |
)
|
image_to_text.py
CHANGED
|
@@ -1,9 +1,10 @@
|
|
| 1 |
import gc
|
| 2 |
from functools import partial
|
| 3 |
import gradio as gr
|
|
|
|
| 4 |
from PIL.Image import Image
|
| 5 |
from transformers import AutoProcessor, BlipForConditionalGeneration
|
| 6 |
-
from utils import get_pytorch_device, spaces_gpu, request_image
|
| 7 |
|
| 8 |
|
| 9 |
@spaces_gpu
|
|
@@ -28,15 +29,26 @@ def image_to_text(model: str, image: Image) -> list[str]:
|
|
| 28 |
- Uses beam search with 3 beams, max length 20, min length 5.
|
| 29 |
"""
|
| 30 |
pytorch_device = get_pytorch_device()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
processor = AutoProcessor.from_pretrained(model)
|
| 32 |
model_instance = BlipForConditionalGeneration.from_pretrained(
|
| 33 |
model,
|
| 34 |
-
use_safetensors=True # Use safetensors to avoid torch.load restriction.
|
|
|
|
| 35 |
).to(pytorch_device)
|
| 36 |
inputs = processor(images=image, return_tensors="pt").to(pytorch_device)
|
| 37 |
-
|
|
|
|
| 38 |
results = processor.batch_decode(generated_ids, skip_special_tokens=True)
|
| 39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
gc.collect()
|
| 41 |
return results
|
| 42 |
|
|
|
|
| 1 |
import gc
|
| 2 |
from functools import partial
|
| 3 |
import gradio as gr
|
| 4 |
+
import torch
|
| 5 |
from PIL.Image import Image
|
| 6 |
from transformers import AutoProcessor, BlipForConditionalGeneration
|
| 7 |
+
from utils import get_pytorch_device, spaces_gpu, request_image, get_torch_dtype
|
| 8 |
|
| 9 |
|
| 10 |
@spaces_gpu
|
|
|
|
| 29 |
- Uses beam search with 3 beams, max length 20, min length 5.
|
| 30 |
"""
|
| 31 |
pytorch_device = get_pytorch_device()
|
| 32 |
+
dtype = get_torch_dtype()
|
| 33 |
+
|
| 34 |
+
# During inference or evaluation, gradient calculations are unnecessary. Using torch.no_grad()
|
| 35 |
+
# reduces memory consumption by not storing gradients. This can significantly reduce the
|
| 36 |
+
# amount of memory used during the inference phase.
|
| 37 |
processor = AutoProcessor.from_pretrained(model)
|
| 38 |
model_instance = BlipForConditionalGeneration.from_pretrained(
|
| 39 |
model,
|
| 40 |
+
use_safetensors=True, # Use safetensors to avoid torch.load restriction.
|
| 41 |
+
dtype=dtype
|
| 42 |
).to(pytorch_device)
|
| 43 |
inputs = processor(images=image, return_tensors="pt").to(pytorch_device)
|
| 44 |
+
with torch.no_grad():
|
| 45 |
+
generated_ids = model_instance.generate(pixel_values=inputs.pixel_values, num_beams=3, max_length=20, min_length=5)
|
| 46 |
results = processor.batch_decode(generated_ids, skip_special_tokens=True)
|
| 47 |
+
|
| 48 |
+
# Clean up GPU memory
|
| 49 |
+
del model_instance, inputs, generated_ids
|
| 50 |
+
if pytorch_device == "cuda":
|
| 51 |
+
torch.cuda.empty_cache()
|
| 52 |
gc.collect()
|
| 53 |
return results
|
| 54 |
|
requirements.txt
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
gradio>=5.49.1
|
| 2 |
huggingface-hub>=0.34.0,<1.0
|
| 3 |
inflect>=7.0.0
|
|
@@ -7,8 +8,11 @@ numpy>=1.24.0
|
|
| 7 |
pandas>=2.0.0
|
| 8 |
phonemizer>=3.0.0
|
| 9 |
pillow>=10.0.0
|
|
|
|
| 10 |
python-dotenv>=1.0.0
|
| 11 |
requests>=2.31.0
|
|
|
|
|
|
|
| 12 |
soundfile>=0.12.0
|
| 13 |
timm>=1.0.0
|
| 14 |
transformers>=4.40.0
|
|
|
|
| 1 |
+
diffusers>=0.30.0
|
| 2 |
gradio>=5.49.1
|
| 3 |
huggingface-hub>=0.34.0,<1.0
|
| 4 |
inflect>=7.0.0
|
|
|
|
| 8 |
pandas>=2.0.0
|
| 9 |
phonemizer>=3.0.0
|
| 10 |
pillow>=10.0.0
|
| 11 |
+
protobuf>=4.25.0
|
| 12 |
python-dotenv>=1.0.0
|
| 13 |
requests>=2.31.0
|
| 14 |
+
sacremoses>=0.0.53
|
| 15 |
+
sentencepiece>=0.1.99
|
| 16 |
soundfile>=0.12.0
|
| 17 |
timm>=1.0.0
|
| 18 |
transformers>=4.40.0
|
text_to_image.py
CHANGED
|
@@ -1,31 +1,63 @@
|
|
|
|
|
| 1 |
from functools import partial
|
| 2 |
import gradio as gr
|
|
|
|
|
|
|
| 3 |
from PIL.Image import Image
|
| 4 |
-
from
|
|
|
|
| 5 |
|
| 6 |
|
| 7 |
-
|
| 8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
Args:
|
| 11 |
-
client: Hugging Face InferenceClient instance for API calls.
|
| 12 |
model: Hugging Face model ID to use for text-to-image generation.
|
| 13 |
prompt: Text description of the desired image.
|
| 14 |
|
| 15 |
Returns:
|
| 16 |
PIL Image object representing the generated image.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
"""
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
|
| 21 |
-
def create_text_to_image_tab(
|
| 22 |
"""Create the text-to-image generation tab in the Gradio interface.
|
| 23 |
|
| 24 |
This function sets up all UI components for text-to-image generation,
|
| 25 |
including input textbox, generate button, and output image display.
|
| 26 |
|
| 27 |
Args:
|
| 28 |
-
client: Hugging Face InferenceClient instance to pass to the text_to_image function.
|
| 29 |
model: Hugging Face model ID to use for text-to-image generation.
|
| 30 |
"""
|
| 31 |
gr.Markdown("Generate an image from a text prompt.")
|
|
@@ -33,7 +65,7 @@ def create_text_to_image_tab(client: InferenceClient, model: str):
|
|
| 33 |
text_to_image_generate_button = gr.Button("Generate")
|
| 34 |
text_to_image_output = gr.Image(label="Image", type="pil")
|
| 35 |
text_to_image_generate_button.click(
|
| 36 |
-
fn=partial(text_to_image,
|
| 37 |
inputs=text_to_image_prompt,
|
| 38 |
outputs=text_to_image_output
|
| 39 |
)
|
|
|
|
| 1 |
+
import gc
|
| 2 |
from functools import partial
|
| 3 |
import gradio as gr
|
| 4 |
+
import torch
|
| 5 |
+
from os import getenv
|
| 6 |
from PIL.Image import Image
|
| 7 |
+
from diffusers import DiffusionPipeline
|
| 8 |
+
from utils import get_pytorch_device, spaces_gpu, get_torch_dtype
|
| 9 |
|
| 10 |
|
| 11 |
+
@spaces_gpu
|
| 12 |
+
def text_to_image(model: str, prompt: str) -> Image:
|
| 13 |
+
"""Generate an image from a text prompt using a diffusion model.
|
| 14 |
+
|
| 15 |
+
This function uses a diffusion pipeline (e.g., Stable Diffusion, FLUX) to generate
|
| 16 |
+
images from text prompts. The model is loaded, inference is performed, and then
|
| 17 |
+
cleaned up to free GPU memory.
|
| 18 |
|
| 19 |
Args:
|
|
|
|
| 20 |
model: Hugging Face model ID to use for text-to-image generation.
|
| 21 |
prompt: Text description of the desired image.
|
| 22 |
|
| 23 |
Returns:
|
| 24 |
PIL Image object representing the generated image.
|
| 25 |
+
|
| 26 |
+
Note:
|
| 27 |
+
- Uses safetensors for secure model loading.
|
| 28 |
+
- Automatically selects the best available device (CUDA/XPU/MPS/CPU).
|
| 29 |
+
- Cleans up model and GPU memory after inference.
|
| 30 |
"""
|
| 31 |
+
pytorch_device = get_pytorch_device()
|
| 32 |
+
dtype = get_torch_dtype()
|
| 33 |
+
|
| 34 |
+
# During inference or evaluation, gradient calculations are unnecessary. Using torch.no_grad()
|
| 35 |
+
# reduces memory consumption by not storing gradients. This can significantly reduce the
|
| 36 |
+
# amount of memory used during the inference phase.
|
| 37 |
+
pipe = DiffusionPipeline.from_pretrained(
|
| 38 |
+
model,
|
| 39 |
+
use_safetensors=True,
|
| 40 |
+
dtype=dtype
|
| 41 |
+
)
|
| 42 |
+
pipe = pipe.to(pytorch_device)
|
| 43 |
+
with torch.no_grad():
|
| 44 |
+
result = pipe(prompt).images[0]
|
| 45 |
+
|
| 46 |
+
# Clean up GPU memory
|
| 47 |
+
del pipe
|
| 48 |
+
if pytorch_device == "cuda":
|
| 49 |
+
torch.cuda.empty_cache()
|
| 50 |
+
gc.collect()
|
| 51 |
+
return result
|
| 52 |
|
| 53 |
|
| 54 |
+
def create_text_to_image_tab(model: str):
|
| 55 |
"""Create the text-to-image generation tab in the Gradio interface.
|
| 56 |
|
| 57 |
This function sets up all UI components for text-to-image generation,
|
| 58 |
including input textbox, generate button, and output image display.
|
| 59 |
|
| 60 |
Args:
|
|
|
|
| 61 |
model: Hugging Face model ID to use for text-to-image generation.
|
| 62 |
"""
|
| 63 |
gr.Markdown("Generate an image from a text prompt.")
|
|
|
|
| 65 |
text_to_image_generate_button = gr.Button("Generate")
|
| 66 |
text_to_image_output = gr.Image(label="Image", type="pil")
|
| 67 |
text_to_image_generate_button.click(
|
| 68 |
+
fn=partial(text_to_image, model),
|
| 69 |
inputs=text_to_image_prompt,
|
| 70 |
outputs=text_to_image_output
|
| 71 |
)
|
text_to_speech.py
CHANGED
|
@@ -1,8 +1,9 @@
|
|
| 1 |
import gc
|
| 2 |
from functools import partial
|
| 3 |
import gradio as gr
|
|
|
|
| 4 |
from transformers import pipeline
|
| 5 |
-
from utils import spaces_gpu
|
| 6 |
|
| 7 |
|
| 8 |
@spaces_gpu
|
|
@@ -28,13 +29,29 @@ def text_to_speech(model: str, text: str) -> tuple[int, bytes]:
|
|
| 28 |
- Cleans up model and GPU memory after inference.
|
| 29 |
- Returns audio in format compatible with Gradio Audio component.
|
| 30 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
narrator = pipeline(
|
| 32 |
"text-to-speech",
|
| 33 |
model,
|
| 34 |
-
|
|
|
|
| 35 |
)
|
| 36 |
-
|
|
|
|
|
|
|
|
|
|
| 37 |
del narrator
|
|
|
|
|
|
|
| 38 |
gc.collect()
|
| 39 |
return (result["sampling_rate"], result["audio"][0])
|
| 40 |
|
|
|
|
| 1 |
import gc
|
| 2 |
from functools import partial
|
| 3 |
import gradio as gr
|
| 4 |
+
import torch
|
| 5 |
from transformers import pipeline
|
| 6 |
+
from utils import get_pytorch_device, spaces_gpu, get_torch_dtype
|
| 7 |
|
| 8 |
|
| 9 |
@spaces_gpu
|
|
|
|
| 29 |
- Cleans up model and GPU memory after inference.
|
| 30 |
- Returns audio in format compatible with Gradio Audio component.
|
| 31 |
"""
|
| 32 |
+
pytorch_device = get_pytorch_device()
|
| 33 |
+
dtype = get_torch_dtype()
|
| 34 |
+
|
| 35 |
+
# During inference or evaluation, gradient calculations are unnecessary. Using torch.no_grad()
|
| 36 |
+
# reduces memory consumption by not storing gradients. This can significantly reduce the
|
| 37 |
+
# amount of memory used during the inference phase.
|
| 38 |
+
model_kwargs = {"use_safetensors": True} # Use safetensors to avoid torch.load restriction.
|
| 39 |
+
if dtype is not None:
|
| 40 |
+
model_kwargs["dtype"] = dtype
|
| 41 |
+
|
| 42 |
narrator = pipeline(
|
| 43 |
"text-to-speech",
|
| 44 |
model,
|
| 45 |
+
device=0 if pytorch_device == "cuda" else -1,
|
| 46 |
+
model_kwargs=model_kwargs
|
| 47 |
)
|
| 48 |
+
with torch.no_grad():
|
| 49 |
+
result = narrator(text)
|
| 50 |
+
|
| 51 |
+
# Clean up GPU memory
|
| 52 |
del narrator
|
| 53 |
+
if pytorch_device == "cuda":
|
| 54 |
+
torch.cuda.empty_cache()
|
| 55 |
gc.collect()
|
| 56 |
return (result["sampling_rate"], result["audio"][0])
|
| 57 |
|
translation.py
CHANGED
|
@@ -1,7 +1,10 @@
|
|
|
|
|
| 1 |
from functools import partial
|
| 2 |
import gradio as gr
|
| 3 |
-
|
| 4 |
from langdetect import detect, LangDetectException
|
|
|
|
|
|
|
| 5 |
|
| 6 |
|
| 7 |
# Language code mapping to Helsinki-NLP translation models
|
|
@@ -70,25 +73,27 @@ def get_translation_model(language_code: str, fallback_model: str) -> str:
|
|
| 70 |
return LANGUAGE_TO_MODEL_MAP.get(language_code, fallback_model)
|
| 71 |
|
| 72 |
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
fallback_translation_model: str,
|
| 76 |
-
text: str
|
| 77 |
-
) -> str:
|
| 78 |
"""Translate text to English using automatic language detection.
|
| 79 |
|
| 80 |
First detects the source language using the langdetect library, then selects
|
| 81 |
-
the appropriate translation model and translates the text to English
|
|
|
|
| 82 |
|
| 83 |
Args:
|
| 84 |
-
client: Hugging Face InferenceClient instance for API calls.
|
| 85 |
fallback_translation_model: Fallback translation model to use if no
|
| 86 |
language-specific model is available.
|
| 87 |
text: Input text to translate to English.
|
| 88 |
|
| 89 |
Returns:
|
| 90 |
-
String containing the translated text in English, or
|
| 91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
"""
|
| 93 |
# Detect the language using langdetect library
|
| 94 |
detected_lang = detect_language(text)
|
|
@@ -100,22 +105,42 @@ def translate_to_english(
|
|
| 100 |
# Get the appropriate translation model
|
| 101 |
translation_model = get_translation_model(detected_lang, fallback_translation_model)
|
| 102 |
|
| 103 |
-
#
|
| 104 |
-
|
| 105 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
|
| 107 |
|
| 108 |
-
def create_translation_tab(
|
| 109 |
-
client: InferenceClient,
|
| 110 |
-
fallback_translation_model: str
|
| 111 |
-
):
|
| 112 |
"""Create the translation to English tab in the Gradio interface.
|
| 113 |
|
| 114 |
This function sets up all UI components for translation with automatic
|
| 115 |
language detection, including input textbox, translate button, and output textbox.
|
| 116 |
|
| 117 |
Args:
|
| 118 |
-
client: Hugging Face InferenceClient instance for API calls.
|
| 119 |
fallback_translation_model: Fallback translation model to use if no
|
| 120 |
language-specific model is available.
|
| 121 |
"""
|
|
@@ -124,7 +149,7 @@ def create_translation_tab(
|
|
| 124 |
translation_button = gr.Button("Translate")
|
| 125 |
translation_output = gr.Textbox(label="Translated Text", lines=5, interactive=False)
|
| 126 |
translation_button.click(
|
| 127 |
-
fn=partial(translate_to_english,
|
| 128 |
inputs=translation_input,
|
| 129 |
outputs=translation_output
|
| 130 |
)
|
|
|
|
| 1 |
+
import gc
|
| 2 |
from functools import partial
|
| 3 |
import gradio as gr
|
| 4 |
+
import torch
|
| 5 |
from langdetect import detect, LangDetectException
|
| 6 |
+
from transformers import MarianMTModel, MarianTokenizer
|
| 7 |
+
from utils import get_pytorch_device, spaces_gpu, get_torch_dtype
|
| 8 |
|
| 9 |
|
| 10 |
# Language code mapping to Helsinki-NLP translation models
|
|
|
|
| 73 |
return LANGUAGE_TO_MODEL_MAP.get(language_code, fallback_model)
|
| 74 |
|
| 75 |
|
| 76 |
+
@spaces_gpu
|
| 77 |
+
def translate_to_english(fallback_translation_model: str, text: str) -> str:
|
|
|
|
|
|
|
|
|
|
| 78 |
"""Translate text to English using automatic language detection.
|
| 79 |
|
| 80 |
First detects the source language using the langdetect library, then selects
|
| 81 |
+
the appropriate translation model and translates the text to English using
|
| 82 |
+
a local MarianMT model.
|
| 83 |
|
| 84 |
Args:
|
|
|
|
| 85 |
fallback_translation_model: Fallback translation model to use if no
|
| 86 |
language-specific model is available.
|
| 87 |
text: Input text to translate to English.
|
| 88 |
|
| 89 |
Returns:
|
| 90 |
+
String containing the translated text in English, or the original text
|
| 91 |
+
if it is already in English.
|
| 92 |
+
|
| 93 |
+
Note:
|
| 94 |
+
- Uses safetensors for secure model loading.
|
| 95 |
+
- Automatically selects the best available device (CUDA/XPU/MPS/CPU).
|
| 96 |
+
- Cleans up model and GPU memory after inference.
|
| 97 |
"""
|
| 98 |
# Detect the language using langdetect library
|
| 99 |
detected_lang = detect_language(text)
|
|
|
|
| 105 |
# Get the appropriate translation model
|
| 106 |
translation_model = get_translation_model(detected_lang, fallback_translation_model)
|
| 107 |
|
| 108 |
+
# Load model and tokenizer
|
| 109 |
+
pytorch_device = get_pytorch_device()
|
| 110 |
+
dtype = get_torch_dtype()
|
| 111 |
+
|
| 112 |
+
# During inference or evaluation, gradient calculations are unnecessary. Using torch.no_grad()
|
| 113 |
+
# reduces memory consumption by not storing gradients. This can significantly reduce the
|
| 114 |
+
# amount of memory used during the inference phase.
|
| 115 |
+
tokenizer = MarianTokenizer.from_pretrained(translation_model)
|
| 116 |
+
model = MarianMTModel.from_pretrained(
|
| 117 |
+
translation_model,
|
| 118 |
+
use_safetensors=True,
|
| 119 |
+
dtype=dtype
|
| 120 |
+
).to(pytorch_device)
|
| 121 |
+
|
| 122 |
+
# Tokenize and translate
|
| 123 |
+
inputs = tokenizer([text], return_tensors="pt", padding=True, truncation=True).to(pytorch_device)
|
| 124 |
+
with torch.no_grad():
|
| 125 |
+
translated = model.generate(**inputs)
|
| 126 |
+
translation = tokenizer.batch_decode(translated, skip_special_tokens=True)[0]
|
| 127 |
+
|
| 128 |
+
# Clean up GPU memory
|
| 129 |
+
del model, tokenizer, inputs, translated
|
| 130 |
+
if pytorch_device == "cuda":
|
| 131 |
+
torch.cuda.empty_cache()
|
| 132 |
+
gc.collect()
|
| 133 |
+
|
| 134 |
+
return translation
|
| 135 |
|
| 136 |
|
| 137 |
+
def create_translation_tab(fallback_translation_model: str):
|
|
|
|
|
|
|
|
|
|
| 138 |
"""Create the translation to English tab in the Gradio interface.
|
| 139 |
|
| 140 |
This function sets up all UI components for translation with automatic
|
| 141 |
language detection, including input textbox, translate button, and output textbox.
|
| 142 |
|
| 143 |
Args:
|
|
|
|
| 144 |
fallback_translation_model: Fallback translation model to use if no
|
| 145 |
language-specific model is available.
|
| 146 |
"""
|
|
|
|
| 149 |
translation_button = gr.Button("Translate")
|
| 150 |
translation_output = gr.Textbox(label="Translated Text", lines=5, interactive=False)
|
| 151 |
translation_button.click(
|
| 152 |
+
fn=partial(translate_to_english, fallback_translation_model),
|
| 153 |
inputs=translation_input,
|
| 154 |
outputs=translation_output
|
| 155 |
)
|
utils.py
CHANGED
|
@@ -37,6 +37,14 @@ def get_pytorch_device() -> str:
|
|
| 37 |
else "mps" if torch.mps.is_available() # Apple Silicon
|
| 38 |
else "cpu") # gl bro 🫠
|
| 39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
def request_image(url: str) -> Image:
|
| 41 |
"""Fetch an image from a URL and return it as a PIL Image.
|
| 42 |
|
|
|
|
| 37 |
else "mps" if torch.mps.is_available() # Apple Silicon
|
| 38 |
else "cpu") # gl bro 🫠
|
| 39 |
|
| 40 |
+
def get_torch_dtype():
|
| 41 |
+
"""Get the appropriate torch dtype based on reduced memory setting.
|
| 42 |
+
|
| 43 |
+
Returns:
|
| 44 |
+
torch.float16 if reduced memory is enabled, None otherwise (uses default precision).
|
| 45 |
+
"""
|
| 46 |
+
return torch.float16 if getenv("REDUCED_MEMORY", "False").lower() == "true" else None
|
| 47 |
+
|
| 48 |
def request_image(url: str) -> Image:
|
| 49 |
"""Fetch an image from a URL and return it as a PIL Image.
|
| 50 |
|