LiKenun committed
Commit 5c395b2 · 1 Parent(s): 24f37c6

Switch to use GPU instead of inference client

README.md CHANGED
@@ -96,8 +96,11 @@ Create a `.env` file in the project root directory with the following environmen
 ### Required Environment Variables
 
 ```env
-# Hugging Face API Token (required for Inference API access)
+# Hugging Face API Token (required for gated models and Inference API access)
 # Get your token from: https://huggingface.co/settings/tokens
+# Required fine-grained permissions:
+# 1. "Make calls to Inference Providers"
+# 2. "Read access to contents of all public gated repos you can access"
 HF_TOKEN=your_huggingface_token_here
 
 # Model IDs for each building block
@@ -114,6 +117,11 @@ CHAT_MODEL=model_id_for_chatbot
 ```env
 # Request timeout in seconds (default: 45)
 REQUEST_TIMEOUT=45
+
+# Enable reduced memory usage by using lower precision (float16) for all models (default: False).
+# Set to "True" to reduce GPU memory usage at the cost of slightly lower precision.
+# Sometimes this is still not enough—in which case you must choose another model that will fit in memory.
+REDUCED_MEMORY=False
 ```
 
 ### Example `.env` File
@@ -206,7 +214,23 @@ If you encounter a `RuntimeError: espeak not installed on your system` error:
 
 If you encounter errors loading models:
 
-1. Verify your `HF_TOKEN` is valid and has access to the models. Some models are gated.
-2. Check that model IDs in your `.env` file are correct.
-3. Ensure you have sufficient disk space for model downloads.
-4. For local models, ensure you have sufficient RAM or VRAM.
+1. Verify your `HF_TOKEN` is valid and has the required permissions:
+   - "Make calls to Inference Providers"
+   - "Read access to contents of all public gated repos you can access"
+   Some models (like `black-forest-labs/FLUX.1-dev`) are gated and require these permissions.
+2. Ensure you have accepted the terms of use for gated models on their Hugging Face model pages.
+3. Check that model IDs in your `.env` file are correct.
+4. Ensure you have sufficient disk space for model downloads.
+5. For local models, ensure you have sufficient RAM or VRAM.
+
+### CUDA Out of Memory Errors
+
+If you encounter `torch.OutOfMemoryError: CUDA out of memory` errors:
+
+1. **Enable reduced memory mode**: Set `REDUCED_MEMORY=True` in your `.env` file to use lower precision (float16) for all models, which can reduce memory usage by approximately 50% at the cost of slightly lower precision.
+2. **Reduce model size**: Use smaller models or quantized versions when available.
+3. **Clear GPU cache**: The application automatically clears GPU memory after each inference, but you can manually clear it by restarting the application.
+4. **Set environment variable**: To reduce memory fragmentation, you can set `PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True`.
+   Add this to your shell profile (e.g., `~/.bashrc` or `~/.zshrc`) or set it before running the application.
+5. **Use CPU fallback**: If GPU memory is insufficient, the application will automatically fall back to CPU (though this will be slower).
+6. **Close other GPU applications**: Ensure no other applications are using the GPU simultaneously.
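
For orientation, the two memory settings documented above interact roughly as follows. This is a minimal sketch (not part of the commit): the dtype logic mirrors the `get_torch_dtype` helper added to `utils.py` below, and the allocator option only takes effect if it is in the environment before CUDA is initialized.

```python
from os import environ, getenv

# Set before importing torch so the CUDA allocator picks it up.
environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

import torch

# Mirrors utils.get_torch_dtype(): REDUCED_MEMORY=True selects float16,
# anything else leaves models in their default precision (None).
dtype = torch.float16 if getenv("REDUCED_MEMORY", "False").lower() == "true" else None
print(dtype)
```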
app.py CHANGED
@@ -1,7 +1,6 @@
 from dotenv import load_dotenv
 from os import getenv
 import gradio as gr
-from huggingface_hub import InferenceClient
 from automatic_speech_recognition import create_asr_tab
 from chatbot import create_chatbot_tab
 from image_classification import create_image_classification_tab
@@ -20,7 +19,6 @@ class App:
 
     def __init__(
         self,
-        client: InferenceClient,
         text_to_image_model: str,
         image_to_text_model: str,
         image_classification_model: str,
@@ -29,11 +27,9 @@ class App:
         chat_model: str,
         fallback_translation_model: str
     ):
-        """Initialize the App with an InferenceClient instance and model IDs.
+        """Initialize the App with model IDs.
 
         Args:
-            client: Hugging Face InferenceClient instance for making API calls
-                to Hugging Face's inference endpoints.
             text_to_image_model: Model ID for text-to-image generation.
             image_to_text_model: Model ID for image captioning.
             image_classification_model: Model ID for image classification.
@@ -43,7 +39,6 @@ class App:
             fallback_translation_model: Fallback translation model ID for languages
                 without specific translation models.
         """
-        self.client = client
         self.text_to_image_model = text_to_image_model
         self.image_to_text_model = image_to_text_model
         self.image_classification_model = image_classification_model
@@ -64,22 +59,19 @@ class App:
             gr.Markdown("A gallery of building blocks for building AI applications")
             with gr.Tabs():
                 with gr.Tab("Text-to-image Generation"):
-                    create_text_to_image_tab(self.client, self.text_to_image_model)
+                    create_text_to_image_tab(self.text_to_image_model)
                 with gr.Tab("Image-to-text or Image Captioning"):
                     create_image_to_text_tab(self.image_to_text_model)
                 with gr.Tab("Image Classification"):
-                    create_image_classification_tab(self.client, self.image_classification_model)
+                    create_image_classification_tab(self.image_classification_model)
                 with gr.Tab("Text-to-speech (TTS)"):
                     create_text_to_speech_tab(self.text_to_speech_model)
                 with gr.Tab("Automatic Speech Recognition (ASR)"):
-                    create_asr_tab(self.client, self.audio_transcription_model)
+                    create_asr_tab(self.audio_transcription_model)
                 with gr.Tab("Chat"):
                     create_chatbot_tab(self.chat_model)
                 with gr.Tab("Translation to English"):
-                    create_translation_tab(
-                        self.client,
-                        self.fallback_translation_model
-                    )
+                    create_translation_tab(self.fallback_translation_model)
 
         demo.launch()
 
@@ -87,7 +79,6 @@ class App:
 if __name__ == "__main__":
     load_dotenv()
     app = App(
-        client=InferenceClient(),
         text_to_image_model=getenv("TEXT_TO_IMAGE_MODEL"),
         image_to_text_model=getenv("IMAGE_TO_TEXT_MODEL"),
         image_classification_model=getenv("IMAGE_CLASSIFICATION_MODEL"),
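
The tab factories above now receive only a model ID and bind it with `functools.partial`, so Gradio supplies just the user input at click time. A minimal sketch of that wiring pattern, using a hypothetical `greet` handler and a placeholder model ID rather than anything from the repo:

```python
from functools import partial
import gradio as gr

def greet(model: str, text: str) -> str:
    # Stand-in for the per-tab inference functions; `model` is bound ahead of time.
    return f"[{model}] {text}"

with gr.Blocks() as demo:
    box = gr.Textbox(label="Input")
    out = gr.Textbox(label="Output")
    gr.Button("Run").click(fn=partial(greet, "some/model-id"), inputs=box, outputs=out)

demo.launch()
```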
automatic_speech_recognition.py CHANGED
@@ -1,46 +1,84 @@
+import gc
 from functools import partial
-from huggingface_hub import InferenceClient
-from os import path, unlink
 import gradio as gr
-from utils import save_audio_to_temp_file, get_model_sample_rate, request_audio
+import numpy as np
+import torch
+from transformers import pipeline
+from utils import get_pytorch_device, spaces_gpu, resample_audio, get_model_sample_rate, request_audio, get_torch_dtype
 
-def automatic_speech_recognition(client: InferenceClient, model: str, audio: tuple[int, bytes]) -> str:
-    """Transcribe audio to text using Hugging Face Inference API.
+@spaces_gpu
+def automatic_speech_recognition(model: str, audio: tuple[int, bytes | np.ndarray]) -> str:
+    """Transcribe audio to text using a Whisper or similar ASR model.
 
     This function converts speech audio into text transcription. The audio is
-    resampled to match the model's expected sample rate, saved to a temporary
-    file, and then sent to the Inference API for transcription.
+    resampled to match the model's expected sample rate, then processed locally.
 
     Args:
-        client: Hugging Face InferenceClient instance for API calls.
         model: Hugging Face model ID to use for automatic speech recognition.
         audio: Tuple containing:
            - int: Sample rate of the input audio (e.g., 44100 Hz)
-           - bytes: Raw audio data as bytes
+           - bytes | np.ndarray: Raw audio data as bytes or numpy array
 
     Returns:
        String containing the transcribed text from the audio.
 
     Note:
        - Audio is automatically resampled to match the model's expected sample rate.
-       - Audio is saved as a WAV file for InferenceClient compatibility.
-       - Automatically cleans up temporary files after transcription.
+       - Uses safetensors for secure model loading.
+       - Automatically selects the best available device (CUDA/XPU/MPS/CPU).
+       - Cleans up model and GPU memory after inference.
     """
-    temp_file_path = None
-    try:
-        sample_rate = get_model_sample_rate(model)
-        temp_file_path = save_audio_to_temp_file(sample_rate, audio)
-        result = client.automatic_speech_recognition(temp_file_path, model=model)
+    pytorch_device = get_pytorch_device()
+    target_sample_rate = get_model_sample_rate(model)
+
+    # Resample audio to target sample rate
+    audio_array = resample_audio(target_sample_rate, audio)
+
+    # During inference or evaluation, gradient calculations are unnecessary. Using torch.no_grad()
+    # reduces memory consumption by not storing gradients. This can significantly reduce the
+    # amount of memory used during the inference phase.
+    dtype = get_torch_dtype()
+    model_kwargs = {"use_safetensors": True}
+    if dtype is not None:
+        model_kwargs["dtype"] = dtype
+
+    # Load and run ASR pipeline
+    asr_pipeline = pipeline(
+        "automatic-speech-recognition",
+        model=model,
+        device=0 if pytorch_device == "cuda" else -1,
+        model_kwargs=model_kwargs
+    )
+    # Use return_timestamps="word" for long audio (>30 seconds) to avoid errors
+    # Using "word" ensures WhisperTimeStampLogitsProcessor is properly used during generation
+    # Set task='transcribe' and language='en' to avoid deprecation warnings and language detection
+    # Note: sampling_rate is not passed here since audio is already resampled to the model's expected rate
+    with torch.no_grad():
+        result = asr_pipeline(
+            audio_array,
+            return_timestamps="word",
+            task="transcribe",
+            language="en"
+        )
+
+    # Clean up GPU memory
+    del asr_pipeline
+    if pytorch_device == "cuda":
+        torch.cuda.empty_cache()
+    gc.collect()
+    # Extract text from result (works for both short and long audio)
+    if isinstance(result, dict) and "text" in result:
         return result["text"]
-    finally:
-        if temp_file_path and path.exists(temp_file_path):  # Clean up temporary file.
-            try:
-                unlink(temp_file_path)
-            except Exception:
-                pass  # Ignore clean-up errors.
+    elif isinstance(result, str):
+        return result
+    else:
+        # Fallback: try to extract text from chunks if present
+        if isinstance(result, dict) and "chunks" in result:
+            return " ".join(chunk.get("text", "") for chunk in result["chunks"] if isinstance(chunk, dict))
+        return str(result)
 
 
-def create_asr_tab(client: InferenceClient, model: str):
+def create_asr_tab(model: str):
     """Create the automatic speech recognition tab in the Gradio interface.
 
     This function sets up all UI components for automatic speech recognition, including:
@@ -50,7 +88,6 @@ def create_asr_tab(client: InferenceClient, model: str):
     - Transcribe button and output textbox
 
     Args:
-        client: Hugging Face InferenceClient instance to pass to the automatic_speech_recognition function.
        model: Hugging Face model ID to use for automatic speech recognition.
     """
     gr.Markdown("Transcribe audio to text.")
@@ -65,7 +102,7 @@ def create_asr_tab(client: InferenceClient, model: str):
     audio_transcription_generate_button = gr.Button("Transcribe")
     audio_transcription_output = gr.Textbox(label="Text")
     audio_transcription_generate_button.click(
-        fn=partial(automatic_speech_recognition, client, model),
+        fn=partial(automatic_speech_recognition, model),
        inputs=audio_transcription_audio_input,
        outputs=audio_transcription_output
     )
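
Outside the Gradio UI, the rewritten function can be smoke-tested directly. A rough sketch, assuming the `utils` helpers (`spaces_gpu`, `resample_audio`, `get_model_sample_rate`) behave as their names suggest, and using `openai/whisper-tiny` purely as an example model ID:

```python
import numpy as np
from automatic_speech_recognition import automatic_speech_recognition

# One second of silence at 44.1 kHz stands in for recorded audio.
sample_rate = 44100
waveform = np.zeros(sample_rate, dtype=np.float32)

text = automatic_speech_recognition("openai/whisper-tiny", (sample_rate, waveform))
print(text)
```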
image_classification.py CHANGED
@@ -1,23 +1,23 @@
+import gc
 from functools import partial
-from huggingface_hub import InferenceClient
-from os import path, unlink
 import gradio as gr
+import torch
 from PIL.Image import Image
 import pandas as pd
 from pandas import DataFrame
-from utils import save_image_to_temp_file, request_image
+from transformers import pipeline
+from utils import get_pytorch_device, spaces_gpu, request_image, get_torch_dtype
 
 
-def image_classification(client: InferenceClient, model: str, image: Image) -> DataFrame:
-    """Classify an image using Hugging Face Inference API.
+@spaces_gpu
+def image_classification(model: str, image: Image) -> DataFrame:
+    """Classify an image using a vision transformer model.
 
     This function classifies a recyclable item image into categories:
-    cardboard, glass, metal, paper, plastic, or other. The image is saved
-    to a temporary file since InferenceClient requires a file path rather than
-    a PIL Image object directly.
+    cardboard, glass, metal, paper, plastic, or other. The model is loaded,
+    inference is performed, and then cleaned up to free GPU memory.
 
     Args:
-        client: Hugging Face InferenceClient instance for API calls.
        model: Hugging Face model ID to use for image classification.
        image: PIL Image object to classify.
 
@@ -27,27 +27,41 @@ def image_classification(client: InferenceClient, model: str, image: Image) -> D
        - Probability: The confidence score as a percentage string (e.g., "95.23%")
 
     Note:
-        - Automatically cleans up temporary files after classification.
-        - Temporary file is created with format preservation if possible.
+        - Uses safetensors for secure model loading.
+        - Automatically selects the best available device (CUDA/XPU/MPS/CPU).
+        - Cleans up model and GPU memory after inference.
     """
-    try:
-        temp_file_path = save_image_to_temp_file(image)  # Needed because InferenceClient does not accept PIL Images directly.
-        classifications = client.image_classification(temp_file_path, model=model)
-        return pd.DataFrame({
-                "Label": classification.label,
-                "Probability": f"{classification.score:.2%}"
-            }
-            for classification
-            in classifications)
-    finally:
-        if temp_file_path and path.exists(temp_file_path):  # Clean up temporary file.
-            try:
-                unlink(temp_file_path)
-            except Exception:
-                pass  # Ignore clean-up errors.
+    pytorch_device = get_pytorch_device()
+    dtype = get_torch_dtype()
+
+    # During inference or evaluation, gradient calculations are unnecessary. Using torch.no_grad()
+    # reduces memory consumption by not storing gradients. This can significantly reduce the
+    # amount of memory used during the inference phase.
+    model_kwargs = {"use_safetensors": True}
+    if dtype is not None:
+        model_kwargs["dtype"] = dtype
+
+    classifier = pipeline(
+        "image-classification",
+        model=model,
+        device=0 if pytorch_device == "cuda" else -1,
+        model_kwargs=model_kwargs
+    )
+    with torch.no_grad():
+        results = classifier(image)
+
+    # Clean up GPU memory
+    del classifier
+    if pytorch_device == "cuda":
+        torch.cuda.empty_cache()
+    gc.collect()
+    return pd.DataFrame({
+        "Label": [result["label"] for result in results],
+        "Probability": [f"{result['score']:.2%}" for result in results]
+    })
 
 
-def create_image_classification_tab(client: InferenceClient, model: str):
+def create_image_classification_tab(model: str):
     """Create the image classification tab in the Gradio interface.
 
     This function sets up all UI components for image classification, including:
@@ -57,7 +71,6 @@ def create_image_classification_tab(client: InferenceClient, model: str):
     - Classify button and output dataframe showing labels and probabilities
 
     Args:
-        client: Hugging Face InferenceClient instance to pass to the image_classification function.
        model: Hugging Face model ID to use for image classification.
     """
     gr.Markdown("Classify a recyclable item as one of: cardboard, glass, metal, paper, plastic, or other using [Trash-Net](https://huggingface.co/prithivMLmods/Trash-Net).")
@@ -72,7 +85,7 @@ def create_image_classification_tab(client: InferenceClient, model: str):
     image_classification_button = gr.Button("Classify")
     image_classification_output = gr.Dataframe(label="Classification", headers=["Label", "Probability"], interactive=False)
     image_classification_button.click(
-        fn=partial(image_classification, client, model),
+        fn=partial(image_classification, model),
        inputs=image_classification_image_input,
        outputs=image_classification_output
     )
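
A quick local check of the new pipeline-based classifier; the Trash-Net model ID comes from the tab description above, while the blank test image is just a placeholder:

```python
from PIL import Image
from image_classification import image_classification

# Any RGB image works; a blank 224x224 canvas stands in for a photo of an item.
image = Image.new("RGB", (224, 224), color="white")

df = image_classification("prithivMLmods/Trash-Net", image)
print(df)  # DataFrame with "Label" and "Probability" columns
```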
image_to_text.py CHANGED
@@ -1,9 +1,10 @@
 import gc
 from functools import partial
 import gradio as gr
+import torch
 from PIL.Image import Image
 from transformers import AutoProcessor, BlipForConditionalGeneration
-from utils import get_pytorch_device, spaces_gpu, request_image
+from utils import get_pytorch_device, spaces_gpu, request_image, get_torch_dtype
 
 
 @spaces_gpu
@@ -28,15 +29,26 @@ def image_to_text(model: str, image: Image) -> list[str]:
     - Uses beam search with 3 beams, max length 20, min length 5.
     """
     pytorch_device = get_pytorch_device()
+    dtype = get_torch_dtype()
+
+    # During inference or evaluation, gradient calculations are unnecessary. Using torch.no_grad()
+    # reduces memory consumption by not storing gradients. This can significantly reduce the
+    # amount of memory used during the inference phase.
     processor = AutoProcessor.from_pretrained(model)
     model_instance = BlipForConditionalGeneration.from_pretrained(
        model,
-        use_safetensors=True  # Use safetensors to avoid torch.load restriction.
+        use_safetensors=True,  # Use safetensors to avoid torch.load restriction.
+        dtype=dtype
     ).to(pytorch_device)
     inputs = processor(images=image, return_tensors="pt").to(pytorch_device)
-    generated_ids = model_instance.generate(pixel_values=inputs.pixel_values, num_beams=3, max_length=20, min_length=5)
+    with torch.no_grad():
+        generated_ids = model_instance.generate(pixel_values=inputs.pixel_values, num_beams=3, max_length=20, min_length=5)
     results = processor.batch_decode(generated_ids, skip_special_tokens=True)
-    del model_instance, inputs
+
+    # Clean up GPU memory
+    del model_instance, inputs, generated_ids
+    if pytorch_device == "cuda":
+        torch.cuda.empty_cache()
     gc.collect()
     return results
 
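
The captioning function can be exercised the same way; a sketch with a placeholder URL and an assumed BLIP checkpoint (any model ID compatible with `BlipForConditionalGeneration` should work):

```python
from image_to_text import image_to_text
from utils import request_image

# Placeholder URL and model ID, not taken from the repo's configuration.
image = request_image("https://example.com/photo.jpg")
captions = image_to_text("Salesforce/blip-image-captioning-base", image)
print(captions)  # list[str] of generated captions
```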
requirements.txt CHANGED
@@ -1,3 +1,4 @@
+diffusers>=0.30.0
 gradio>=5.49.1
 huggingface-hub>=0.34.0,<1.0
 inflect>=7.0.0
@@ -7,8 +8,11 @@ numpy>=1.24.0
 pandas>=2.0.0
 phonemizer>=3.0.0
 pillow>=10.0.0
+protobuf>=4.25.0
 python-dotenv>=1.0.0
 requests>=2.31.0
+sacremoses>=0.0.53
+sentencepiece>=0.1.99
 soundfile>=0.12.0
 timm>=1.0.0
 transformers>=4.40.0
text_to_image.py CHANGED
@@ -1,31 +1,63 @@
+import gc
 from functools import partial
 import gradio as gr
+import torch
+from os import getenv
 from PIL.Image import Image
-from huggingface_hub import InferenceClient
+from diffusers import DiffusionPipeline
+from utils import get_pytorch_device, spaces_gpu, get_torch_dtype
 
 
-def text_to_image(client: InferenceClient, model: str, prompt: str) -> Image:
-    """Generate an image from a text prompt using Hugging Face Inference API.
+@spaces_gpu
+def text_to_image(model: str, prompt: str) -> Image:
+    """Generate an image from a text prompt using a diffusion model.
+
+    This function uses a diffusion pipeline (e.g., Stable Diffusion, FLUX) to generate
+    images from text prompts. The model is loaded, inference is performed, and then
+    cleaned up to free GPU memory.
 
     Args:
-        client: Hugging Face InferenceClient instance for API calls.
        model: Hugging Face model ID to use for text-to-image generation.
        prompt: Text description of the desired image.
 
     Returns:
        PIL Image object representing the generated image.
+
+    Note:
+        - Uses safetensors for secure model loading.
+        - Automatically selects the best available device (CUDA/XPU/MPS/CPU).
+        - Cleans up model and GPU memory after inference.
     """
-    return client.text_to_image(prompt, model=model)
+    pytorch_device = get_pytorch_device()
+    dtype = get_torch_dtype()
+
+    # During inference or evaluation, gradient calculations are unnecessary. Using torch.no_grad()
+    # reduces memory consumption by not storing gradients. This can significantly reduce the
+    # amount of memory used during the inference phase.
+    pipe = DiffusionPipeline.from_pretrained(
+        model,
+        use_safetensors=True,
+        dtype=dtype
+    )
+    pipe = pipe.to(pytorch_device)
+    with torch.no_grad():
+        result = pipe(prompt).images[0]
+
+    # Clean up GPU memory
+    del pipe
+    if pytorch_device == "cuda":
+        torch.cuda.empty_cache()
+    gc.collect()
+    return result
 
 
-def create_text_to_image_tab(client: InferenceClient, model: str):
+def create_text_to_image_tab(model: str):
     """Create the text-to-image generation tab in the Gradio interface.
 
     This function sets up all UI components for text-to-image generation,
     including input textbox, generate button, and output image display.
 
     Args:
-        client: Hugging Face InferenceClient instance to pass to the text_to_image function.
        model: Hugging Face model ID to use for text-to-image generation.
     """
     gr.Markdown("Generate an image from a text prompt.")
@@ -33,7 +65,7 @@ def create_text_to_image_tab(client: InferenceClient, model: str):
     text_to_image_generate_button = gr.Button("Generate")
     text_to_image_output = gr.Image(label="Image", type="pil")
     text_to_image_generate_button.click(
-        fn=partial(text_to_image, client, model),
+        fn=partial(text_to_image, model),
        inputs=text_to_image_prompt,
        outputs=text_to_image_output
     )
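
Because `black-forest-labs/FLUX.1-dev` is gated (see the README changes above), the diffusion pipeline can only download it when a token with the listed permissions is available. A hedged usage sketch; the prompt and the explicit `login` call are illustrative, not required if `HF_TOKEN` is already picked up from the environment:

```python
from os import getenv
from huggingface_hub import login
from text_to_image import text_to_image

# Gated checkpoints need an HF_TOKEN with the permissions listed in the README;
# an ungated text-to-image model also works for a quick test.
login(token=getenv("HF_TOKEN"))

image = text_to_image("black-forest-labs/FLUX.1-dev", "a watercolor fox in a pine forest")
image.save("fox.png")
```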
text_to_speech.py CHANGED
@@ -1,8 +1,9 @@
 import gc
 from functools import partial
 import gradio as gr
+import torch
 from transformers import pipeline
-from utils import spaces_gpu
+from utils import get_pytorch_device, spaces_gpu, get_torch_dtype
 
 
 @spaces_gpu
@@ -28,13 +29,29 @@ def text_to_speech(model: str, text: str) -> tuple[int, bytes]:
     - Cleans up model and GPU memory after inference.
     - Returns audio in format compatible with Gradio Audio component.
     """
+    pytorch_device = get_pytorch_device()
+    dtype = get_torch_dtype()
+
+    # During inference or evaluation, gradient calculations are unnecessary. Using torch.no_grad()
+    # reduces memory consumption by not storing gradients. This can significantly reduce the
+    # amount of memory used during the inference phase.
+    model_kwargs = {"use_safetensors": True}  # Use safetensors to avoid torch.load restriction.
+    if dtype is not None:
+        model_kwargs["dtype"] = dtype
+
     narrator = pipeline(
        "text-to-speech",
        model,
-        model_kwargs={"use_safetensors": True}  # Use safetensors to avoid torch.load restriction.
+        device=0 if pytorch_device == "cuda" else -1,
+        model_kwargs=model_kwargs
     )
-    result = narrator(text)
+    with torch.no_grad():
+        result = narrator(text)
+
+    # Clean up GPU memory
     del narrator
+    if pytorch_device == "cuda":
+        torch.cuda.empty_cache()
     gc.collect()
     return (result["sampling_rate"], result["audio"][0])
 
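The returned `(sampling_rate, audio)` tuple is shaped for Gradio's Audio component, but it can also be written straight to disk with `soundfile` (already in requirements.txt). The model ID here is a placeholder, not the one configured in `.env`:

```python
import soundfile as sf
from text_to_speech import text_to_speech

sampling_rate, audio = text_to_speech("suno/bark-small", "Hello from the text-to-speech tab.")
sf.write("hello.wav", audio, sampling_rate)  # audio is an array of samples
```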
translation.py CHANGED
@@ -1,7 +1,10 @@
+import gc
 from functools import partial
 import gradio as gr
-from huggingface_hub import InferenceClient
+import torch
 from langdetect import detect, LangDetectException
+from transformers import MarianMTModel, MarianTokenizer
+from utils import get_pytorch_device, spaces_gpu, get_torch_dtype
 
 
 # Language code mapping to Helsinki-NLP translation models
@@ -70,25 +73,27 @@ def get_translation_model(language_code: str, fallback_model: str) -> str:
     return LANGUAGE_TO_MODEL_MAP.get(language_code, fallback_model)
 
 
-def translate_to_english(
-    client: InferenceClient,
-    fallback_translation_model: str,
-    text: str
-) -> str:
+@spaces_gpu
+def translate_to_english(fallback_translation_model: str, text: str) -> str:
     """Translate text to English using automatic language detection.
 
     First detects the source language using the langdetect library, then selects
-    the appropriate translation model and translates the text to English.
+    the appropriate translation model and translates the text to English using
+    a local MarianMT model.
 
     Args:
-        client: Hugging Face InferenceClient instance for API calls.
        fallback_translation_model: Fallback translation model to use if no
            language-specific model is available.
        text: Input text to translate to English.
 
     Returns:
-        String containing the translated text in English, or a message if the
-        text is already in English.
+        String containing the translated text in English, or the original text
+        if it is already in English.
+
+    Note:
+        - Uses safetensors for secure model loading.
+        - Automatically selects the best available device (CUDA/XPU/MPS/CPU).
+        - Cleans up model and GPU memory after inference.
     """
     # Detect the language using langdetect library
     detected_lang = detect_language(text)
@@ -100,22 +105,42 @@ def translate_to_english(
     # Get the appropriate translation model
     translation_model = get_translation_model(detected_lang, fallback_translation_model)
 
-    # Translate using the selected model
-    result = client.translation(text, model=translation_model)
-    return result.translation_text
+    # Load model and tokenizer
+    pytorch_device = get_pytorch_device()
+    dtype = get_torch_dtype()
+
+    # During inference or evaluation, gradient calculations are unnecessary. Using torch.no_grad()
+    # reduces memory consumption by not storing gradients. This can significantly reduce the
+    # amount of memory used during the inference phase.
+    tokenizer = MarianTokenizer.from_pretrained(translation_model)
+    model = MarianMTModel.from_pretrained(
+        translation_model,
+        use_safetensors=True,
+        dtype=dtype
+    ).to(pytorch_device)
+
+    # Tokenize and translate
+    inputs = tokenizer([text], return_tensors="pt", padding=True, truncation=True).to(pytorch_device)
+    with torch.no_grad():
+        translated = model.generate(**inputs)
+    translation = tokenizer.batch_decode(translated, skip_special_tokens=True)[0]
+
+    # Clean up GPU memory
+    del model, tokenizer, inputs, translated
+    if pytorch_device == "cuda":
+        torch.cuda.empty_cache()
+    gc.collect()
+
+    return translation
 
 
-def create_translation_tab(
-    client: InferenceClient,
-    fallback_translation_model: str
-):
+def create_translation_tab(fallback_translation_model: str):
     """Create the translation to English tab in the Gradio interface.
 
     This function sets up all UI components for translation with automatic
     language detection, including input textbox, translate button, and output textbox.
 
     Args:
-        client: Hugging Face InferenceClient instance for API calls.
        fallback_translation_model: Fallback translation model to use if no
            language-specific model is available.
     """
@@ -124,7 +149,7 @@ def create_translation_tab(
     translation_button = gr.Button("Translate")
     translation_output = gr.Textbox(label="Translated Text", lines=5, interactive=False)
     translation_button.click(
-        fn=partial(translate_to_english, client, fallback_translation_model),
+        fn=partial(translate_to_english, fallback_translation_model),
        inputs=translation_input,
        outputs=translation_output
     )
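
A quick check of the new local MarianMT path; `Helsinki-NLP/opus-mt-mul-en` is assumed here as a plausible multilingual fallback, not something this commit prescribes:

```python
from translation import translate_to_english

translated = translate_to_english("Helsinki-NLP/opus-mt-mul-en", "Guten Morgen, wie geht es dir?")
print(translated)
```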
utils.py CHANGED
@@ -37,6 +37,14 @@ def get_pytorch_device() -> str:
            else "mps" if torch.mps.is_available()  # Apple Silicon
            else "cpu")  # gl bro 🫠
 
+def get_torch_dtype():
+    """Get the appropriate torch dtype based on reduced memory setting.
+
+    Returns:
+        torch.float16 if reduced memory is enabled, None otherwise (uses default precision).
+    """
+    return torch.float16 if getenv("REDUCED_MEMORY", "False").lower() == "true" else None
+
 def request_image(url: str) -> Image:
     """Fetch an image from a URL and return it as a PIL Image.
 
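
`get_torch_dtype` deliberately returns `None` when `REDUCED_MEMORY` is off so callers can omit the dtype kwarg entirely, which is why the tab modules guard it with `if dtype is not None`. A small sketch of that pattern:

```python
from os import environ
import torch
from utils import get_torch_dtype

environ["REDUCED_MEMORY"] = "True"
assert get_torch_dtype() is torch.float16

environ["REDUCED_MEMORY"] = "False"
model_kwargs = {"use_safetensors": True}
dtype = get_torch_dtype()
if dtype is not None:  # None means: leave the model in its default precision.
    model_kwargs["dtype"] = dtype
print(model_kwargs)  # {'use_safetensors': True}
```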