buttercrab committed on
Commit 26268b4 · unverified · 1 Parent(s): b9c8c2a
Files changed (1)
  1. app.py +30 -3
app.py CHANGED
@@ -26,6 +26,7 @@ except Exception as e:
 def run_inference(
     text_input: str,
     audio_prompt_input: Optional[Tuple[int, np.ndarray]],
+    transcription_input: Optional[str],
     max_new_tokens: int,
     cfg_scale: float,
     temperature: float,
@@ -50,6 +51,10 @@ def run_inference(
     prompt_path_for_generate = None
     if audio_prompt_input is not None:
         sr, audio_data = audio_prompt_input
+        # Enforce maximum duration of 10 seconds for the audio prompt
+        duration_sec = len(audio_data) / float(sr) if sr else 0
+        if duration_sec > 10.0:
+            raise gr.Error("Audio prompt must be 10 seconds or shorter.")
         # Check if audio_data is valid
         if (
             audio_data is None or audio_data.size == 0 or audio_data.max() == 0
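
Read on its own, the new guard rejects prompts longer than ten seconds by dividing the sample count by the sample rate that gr.Audio (type="numpy") hands the callback as an (sr, np.ndarray) tuple. A minimal sketch of the same check follows, with ValueError standing in for gr.Error so it runs without Gradio, and an illustrative 48 kHz clip; note the guard runs before the None/empty check that comes after it in the app.

import numpy as np

def check_prompt_duration(sr: int, audio_data: np.ndarray) -> float:
    # Mirrors the guard added above: sample count / sample rate.
    # Assumes audio_data is a non-None array; the app's None/empty
    # check only executes after this guard.
    duration_sec = len(audio_data) / float(sr) if sr else 0
    if duration_sec > 10.0:
        raise ValueError("Audio prompt must be 10 seconds or shorter.")
    return duration_sec

# 12 seconds of silence at an illustrative 48 kHz rate is rejected:
clip = np.zeros(48_000 * 12, dtype=np.float32)
try:
    check_prompt_duration(48_000, clip)
except ValueError as e:
    print(e)  # Audio prompt must be 10 seconds or shorter.
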
@@ -117,8 +122,15 @@ def run_inference(
 
     # Use torch.inference_mode() context manager for the generation call
     with torch.inference_mode():
+        # Concatenate transcription (if provided) to the main text
+        combined_text = (
+            text_input.strip() + "\n" + transcription_input.strip()
+            if transcription_input and not transcription_input.isspace()
+            else text_input
+        )
+
         output_audio_np = model.generate(
-            text_input,
+            combined_text,
             max_tokens=max_new_tokens,
             cfg_scale=cfg_scale,
             temperature=temperature,
@@ -242,11 +254,16 @@ with gr.Blocks(css=css) as demo:
                 lines=5,  # Increased lines
             )
             audio_prompt_input = gr.Audio(
-                label="Audio Prompt (Optional)",
+                label="Audio Prompt (≤ 10 s, Optional)",
                 show_label=True,
                 sources=["upload", "microphone"],
                 type="numpy",
             )
+            transcription_input = gr.Textbox(
+                label="Audio Prompt Transcription (Optional)",
+                placeholder="Enter transcription of your audio prompt here...",
+                lines=3,
+            )
             with gr.Accordion("Generation Parameters", open=False):
                 max_new_tokens = gr.Slider(
                     label="Max New Tokens (Audio Length)",
@@ -312,6 +329,7 @@ with gr.Blocks(css=css) as demo:
         inputs=[
             text_input,
             audio_prompt_input,
+            transcription_input,
             max_new_tokens,
             cfg_scale,
             temperature,
@@ -350,10 +368,19 @@ with gr.Blocks(css=css) as demo:
 
     if examples_list:
         gr.Examples(
-            examples=examples_list,
+            examples=[
+                [
+                    ex[0],  # text
+                    ex[1],  # audio prompt path
+                    "",  # transcription placeholder
+                    *ex[2:],
+                ]
+                for ex in examples_list
+            ],
             inputs=[
                 text_input,
                 audio_prompt_input,
+                transcription_input,
                 max_new_tokens,
                 cfg_scale,
                 temperature,
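
Because gr.Examples rows must line up positionally with the inputs list, every pre-existing example row gains an empty transcription slot at index 2. A sketch of the same remapping against a hypothetical examples_list (the row values are made up, not from the app):

examples_list = [
    ["Hello world.", "prompts/voice_a.wav", 1024, 3.0, 1.3],
]

# Same list comprehension as the diff: splice "" in after the
# audio-prompt path so each row matches the new inputs order.
padded = [
    [ex[0], ex[1], "", *ex[2:]]
    for ex in examples_list
]
print(padded[0])
# ['Hello world.', 'prompts/voice_a.wav', '', 1024, 3.0, 1.3]
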
 