Spaces
phucbienvan committed · Commit 633ab26 · 1 Parent(s): e02c9de

fix zero gpu
Files changed:
- .gitattributes +0 -0
- .gitignore +0 -0
- README.md +0 -0
- app.py +58 -50
- generator.py +1 -0
- hf_requirements.txt +0 -0
- models.py +0 -0
- requirements.txt +1 -1
- test_model.py +0 -0
- watermarking.py +0 -0
.gitattributes
CHANGED
File without changes

.gitignore
CHANGED
File without changes

README.md
CHANGED
File without changes
app.py
CHANGED
@@ -11,6 +11,7 @@ from dataclasses import dataclass
 from generator import Segment, load_csm_1b
 from huggingface_hub import login
 
+
 # Disable torch compile feature to avoid triton error
 torch._dynamo.config.suppress_errors = True
 
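Note: `torch._dynamo.config.suppress_errors = True` makes TorchDynamo fall back to eager execution instead of raising when compilation fails (for example, when the Triton backend is unavailable), which is what the in-code comment above refers to.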
@@ -36,7 +37,7 @@ generator = None
 model_loaded = False
 
 # Function to load model in ZeroGPU
-@spaces.GPU(duration=30)
+# @spaces.GPU(duration=30)
 def initialize_model():
     global generator, model_loaded
     if not model_loaded:
@@ -47,7 +48,7 @@ def initialize_model():
     return generator
 
 # Function to get the loaded model
-@spaces.GPU(duration=30)
+# @spaces.GPU(duration=30)
 def get_model():
     global generator, model_loaded
     if not model_loaded:
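These two hunks comment out the `@spaces.GPU` decorators on the loader functions, so they no longer request a GPU slot when called. For background, this is the pattern those decorators implement on ZeroGPU hardware; a minimal sketch, assuming the `spaces` package available on Hugging Face Spaces (the function and argument names below are illustrative):

    import spaces
    import torch

    # On ZeroGPU, CUDA is attached only while a @spaces.GPU-decorated
    # function runs; `duration` requests the GPU slot for that many seconds.
    @spaces.GPU(duration=30)
    def run_on_gpu(model: torch.nn.Module, batch: torch.Tensor) -> torch.Tensor:
        model = model.to("cuda")  # safe here: the GPU is attached for this call
        with torch.no_grad():
            return model(batch.to("cuda")).cpu()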
@@ -80,13 +81,13 @@ def audio_to_tensor(audio_path: str) -> Tuple[torch.Tensor, int]:
 
 # Function to save audio tensor to file
 def save_audio(audio_tensor: torch.Tensor, sample_rate: int) -> str:
-
-    output_path =
+    # Save the file to the current directory or the default files directory that Gradio uses
+    output_path = f"csm1b_output_{int(time.time())}.wav"
     torchaudio.save(output_path, audio_tensor.unsqueeze(0), sample_rate)
     return output_path
 
 # Function to generate speech from text using ZeroGPU
-@spaces.GPU(duration=30)
+# @spaces.GPU(duration=30)
 def generate_speech(
     text: str,
     speaker_id: int,
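The rewritten `save_audio` names its output with a Unix timestamp and saves it in the current working directory. A standalone sketch of the same `torchaudio.save` call; the one-second silent waveform and the 24 kHz rate are placeholders:

    import time

    import torch
    import torchaudio

    # torchaudio.save expects a (channels, frames) tensor, so the mono
    # 1-D waveform gains a channel dimension via unsqueeze(0).
    waveform = torch.zeros(24_000)  # placeholder: 1 s of silence at 24 kHz
    output_path = f"csm1b_output_{int(time.time())}.wav"
    torchaudio.save(output_path, waveform.unsqueeze(0), 24_000)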
@@ -132,13 +133,14 @@ def generate_speech(
             speaker=speaker_id,
             context=context,
             max_audio_length_ms=max_duration_ms,
-            temperature=temperature,
-            topk=top_k
+            # temperature=temperature,
+            # topk=top_k
         )
 
         progress(0.8, "Saving audio...")
         # Save audio to file
-        output_path = save_audio(audio, generator.sample_rate)
+        # output_path = save_audio(audio, generator.sample_rate)
+        output_path = f"csm1b_output_{int(time.time())}.wav"
 
         progress(1.0, "Completed!")
         return output_path
@@ -156,7 +158,7 @@ def generate_speech(
         return f"Error generating speech: {str(e)}"
 
 # Function to generate simple speech without context
-@spaces.GPU(duration=30)
+# @spaces.GPU(duration=30)
 def generate_speech_simple(
     text: str,
     speaker_id: int,
@@ -176,17 +178,23 @@ def generate_speech_simple(
             speaker=speaker_id,
             context=[], # No context
             max_audio_length_ms=max_duration_ms,
-            temperature=temperature,
-            topk=top_k
+            # temperature=temperature,
+            # topk=top_k
         )
 
         progress(0.8, "Saving audio...")
         # Save audio to file
-        output_path = save_audio(audio, generator.sample_rate)
+        # output_path = save_audio(audio, generator.sample_rate)
+        output_path = f"csm1b_output_{int(time.time())}.wav"
+        torchaudio.save(output_path, audio.unsqueeze(0).cpu(), generator.sample_rate)
+
+
+
+        print(f"Audio saved to {output_path}")
 
         progress(1.0, "Completed!")
         return output_path
-    except
+    except Exception as e:
         # Handle ZeroGPU quota exceeded error
         error_message = str(e)
         if "GPU quota exceeded" in error_message:
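The corrected `except Exception as e:` branch identifies quota exhaustion by message text, since ZeroGPU surfaces it as an ordinary exception. A minimal sketch of that handler shape, with the substring taken from the diff and the return strings illustrative:

    def describe_generation_error(e: Exception) -> str:
        # ZeroGPU quota exhaustion arrives as a normal exception whose
        # message mentions the quota, so match on the substring.
        error_message = str(e)
        if "GPU quota exceeded" in error_message:
            return "GPU quota exceeded. Please wait a moment and try again."
        return f"Error generating speech: {error_message}"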
@@ -229,25 +237,25 @@ def create_demo():
                     value=30000,
                     step=1000
                 )
-                temperature = gr.Slider(
-                    label="Temperature",
-                    minimum=0.1,
-                    maximum=1.5,
-                    value=0.9,
-                    step=0.1
-                )
-                top_k = gr.Slider(
-                    label="Top-K",
-                    minimum=1,
-                    maximum=100,
-                    value=50,
-                    step=1
-                )
+                # temperature = gr.Slider(
+                #     label="Temperature",
+                #     minimum=0.1,
+                #     maximum=1.5,
+                #     value=0.9,
+                #     step=0.1
+                # )
+                # top_k = gr.Slider(
+                #     label="Top-K",
+                #     minimum=1,
+                #     maximum=100,
+                #     value=50,
+                #     step=1
+                # )
 
                 generate_btn = gr.Button("Generate Audio")
 
             with gr.Column():
-                output_audio = gr.Audio(label="Output Audio", type="filepath")
+                output_audio = gr.Audio(label="Output Audio", type="filepath", autoplay=True)
 
     with gr.Tab("Audio Generation with Context"):
         gr.Markdown("This feature allows you to provide audio clips and text as context to help the model generate more appropriate speech.")
@@ -281,25 +289,25 @@ def create_demo():
                     value=30000,
                     step=1000
                 )
-                temperature_context = gr.Slider(
-                    label="Temperature",
-                    minimum=0.1,
-                    maximum=1.5,
-                    value=0.9,
-                    step=0.1
-                )
-                top_k_context = gr.Slider(
-                    label="Top-K",
-                    minimum=1,
-                    maximum=100,
-                    value=50,
-                    step=1
-                )
+                # temperature_context = gr.Slider(
+                #     label="Temperature",
+                #     minimum=0.1,
+                #     maximum=1.5,
+                #     value=0.9,
+                #     step=0.1
+                # )
+                # top_k_context = gr.Slider(
+                #     label="Top-K",
+                #     minimum=1,
+                #     maximum=100,
+                #     value=50,
+                #     step=1
+                # )
 
                 generate_context_btn = gr.Button("Generate Audio with Context")
 
             with gr.Column():
-                output_audio_context = gr.Audio(label="Output Audio", type="filepath")
+                output_audio_context = gr.Audio(label="Output Audio", type="filepath", autoplay=True)
 
     # Add Hugging Face configuration tab
     with gr.Tab("Configuration"):
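In both tabs the output component gains `autoplay=True`, so generated audio starts playing as soon as a file path is returned to it. A minimal standalone sketch of the component as configured here:

    import gradio as gr

    # type="filepath" means callbacks return a path string to the component;
    # autoplay=True starts playback as soon as that path arrives.
    with gr.Blocks() as demo:
        output_audio = gr.Audio(label="Output Audio", type="filepath", autoplay=True)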
@@ -357,7 +365,7 @@ def create_demo():
         If you encounter a "GPU quota exceeded" error, please wait for the specified time and try again.
         """)
 
-        @spaces.GPU(duration=10)
+        # @spaces.GPU(duration=10)
         def check_gpu():
             if torch.cuda.is_available():
                 gpu_name = torch.cuda.get_device_name(0)
@@ -375,7 +383,7 @@ def create_demo():
         load_model_btn = gr.Button("Load Model")
         model_status = gr.Textbox(label="Model Status", interactive=False)
 
-        @spaces.GPU(duration=10)
+        # @spaces.GPU(duration=10)
         def load_model_and_report():
             global model_loaded
             if model_loaded:
@@ -393,8 +401,8 @@ def create_demo():
             text_input,
             speaker_id,
             max_duration,
-            temperature,
-            top_k
+            # temperature,
+            # top_k
         ],
         outputs=output_audio
     )
@@ -411,8 +419,8 @@ def create_demo():
             context_text2,
             context_speaker2,
             max_duration_context,
-            temperature_context,
-            top_k_context
+            # temperature_context,
+            # top_k_context
         ],
         outputs=output_audio_context
     )
@@ -422,4 +430,4 @@ def create_demo():
 # Launch the application
 if __name__ == "__main__":
     demo = create_demo()
-    demo.queue().launch()
+    demo.queue().launch(share=True)
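For reference, `demo.queue()` turns on Gradio's request queue, which serializes long-running GPU jobs, while `share=True` asks Gradio for a public tunnel link when the app runs locally; on Spaces itself the flag is effectively redundant, since the app is already served publicly.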
generator.py
CHANGED
@@ -178,6 +178,7 @@ def load_csm_1b(device: str = "cuda") -> Generator:
     try:
         # In ZeroGPU, CUDA should not be initialized in the main process
         # Only move the model to GPU when called in a function with the @spaces.GPU decorator
+        print(f"Loading model on {device}")
         if 'cuda' in device and not torch.cuda.is_initialized():
            # Use CPU for the main process
            model = Model.from_pretrained("sesame/csm-1b")
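The added `print` surfaces the device decision in the Space logs; the surrounding branch enforces the rule stated in the comments by keeping the main process off CUDA. A minimal sketch of that pattern with a stand-in module (the real code loads `sesame/csm-1b` through the repo's `Model` class):

    import torch
    import torch.nn as nn

    def load_on_safe_device(device: str = "cuda") -> nn.Module:
        # In a ZeroGPU main process CUDA must stay uninitialized, so fall
        # back to CPU here and move to "cuda" later, inside a function
        # decorated with @spaces.GPU.
        print(f"Loading model on {device}")
        if "cuda" in device and not torch.cuda.is_initialized():
            device = "cpu"  # keep the main process CUDA-free
        model = nn.Linear(4, 4)  # stand-in for Model.from_pretrained("sesame/csm-1b")
        return model.to(device)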
hf_requirements.txt
CHANGED
File without changes

models.py
CHANGED
File without changes
requirements.txt
CHANGED
@@ -3,7 +3,7 @@ torchaudio==2.4.0
 tokenizers==0.21.0
 transformers==4.49.0
 huggingface_hub==0.28.1
-moshi==0.2.2
+# moshi==0.2.2
 torchtune==0.4.0
 torchao==0.9.0
 silentcipher @ git+https://github.com/SesameAILabs/silentcipher@master
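Prefixing the `moshi` pin with `#` removes it from installation: pip treats `#`-prefixed lines in a requirements file as comments, so the dependency is dropped without deleting the line.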
test_model.py
CHANGED
File without changes

watermarking.py
CHANGED
File without changes