Spaces:

szili2011
/

FNaF-Audio-Generation

Runtime error

App Files Community

szili2011 committed on Jul 5

Commit

773ac74

verified ·

1 Parent(s): 90e00da

Update app.py

Browse files

Files changed (1) hide show

app.py +27 -11

app.py CHANGED Viewed

@@ -5,15 +5,17 @@ import nltk
 from nltk.corpus import cmudict
 from scipy.io.wavfile import write
-# --- FIX 1: Define sample_rate as a global constant ---
 SAMPLE_RATE = 22050
 # Download required NLTK data
-nltk.download('averaged_perceptron_tagger')
-nltk.download('cmudict')
 # Load your model from the root directory
-model = tf.keras.models.load_model('audio_model.h5')
 # Preprocess input text
 def preprocess_text(text):
@@ -32,6 +34,9 @@ def preprocess_text(text):
     # Create dummy 13-feature vectors for each phoneme (implement your own feature extraction)
     num_features = 13
     sequence_length = len(flattened_phonemes)
     input_data = np.random.rand(sequence_length, num_features)
     # Add batch dimension
@@ -41,19 +46,30 @@ def preprocess_text(text):
 # Convert model output to an audio file
 def convert_to_audio(model_output, filename="output.wav"):
-    # Now uses the global SAMPLE_RATE constant
     normalized_output = np.interp(model_output, (model_output.min(), model_output.max()), (-1, 1))
     write(filename, SAMPLE_RATE, normalized_output.astype(np.float32))
     return filename
 # Define function to generate sound effect
-def generate_sfx(text, duration): # duration no longer needs a default here
     input_data = preprocess_text(text)
     prediction = model.predict(input_data)
     # Generate longer output by repeating or padding
-    # This line now works because SAMPLE_RATE is defined globally
-    audio_data = np.tile(prediction.flatten(), (duration * SAMPLE_RATE // len(prediction.flatten()) + 1))[:duration * SAMPLE_RATE]
     audio_file = convert_to_audio(audio_data, filename="output.wav")
@@ -64,8 +80,7 @@ interface = gr.Interface(
     fn=generate_sfx,
     inputs=[
         gr.Textbox(label="Enter a Word", placeholder="Write a Word To Convert it into SFX Sound"),
-        # --- FIX 2: Corrected the default slider value to be within the min/max range ---
-        gr.Slider(minimum=2, maximum=20, value=5, label="Duration (seconds)")
     ],
     outputs=gr.Audio(label="Generated SFX", type="filepath"),
     title="SFX Generator from Text",
@@ -75,4 +90,5 @@ interface = gr.Interface(
 # Run the interface
 if __name__ == "__main__":
     tf.config.set_visible_devices([], 'GPU')  # Disable GPU
-    interface.launch()

 from nltk.corpus import cmudict
 from scipy.io.wavfile import write
+# Define sample_rate as a global constant
 SAMPLE_RATE = 22050
 # Download required NLTK data
+nltk.download('averaged_perceptron_tagger', quiet=True)
+nltk.download('cmudict', quiet=True)
 # Load your model from the root directory
+# Add compile=False as it's often needed for inference-only models
+# and can resolve some loading warnings.
+model = tf.keras.models.load_model('audio_model.h5', compile=False)
 # Preprocess input text
 def preprocess_text(text):
     # Create dummy 13-feature vectors for each phoneme (implement your own feature extraction)
     num_features = 13
     sequence_length = len(flattened_phonemes)
+    if sequence_length == 0: # Handle empty input
+        return np.zeros((1, 1, num_features))
     input_data = np.random.rand(sequence_length, num_features)
     # Add batch dimension
 # Convert model output to an audio file
 def convert_to_audio(model_output, filename="output.wav"):
+    if model_output.size == 0: # Handle empty output
+        return None
+    # Normalize audio to be between -1 and 1
     normalized_output = np.interp(model_output, (model_output.min(), model_output.max()), (-1, 1))
     write(filename, SAMPLE_RATE, normalized_output.astype(np.float32))
     return filename
 # Define function to generate sound effect
+def generate_sfx(text, duration):
     input_data = preprocess_text(text)
+    # Check for empty input after preprocessing
+    if input_data.shape[1] == 0:
+        return None # Return None to clear the audio component
     prediction = model.predict(input_data)
+    flat_prediction = prediction.flatten()
+    if len(flat_prediction) == 0:
+        return None
     # Generate longer output by repeating or padding
+    num_repeats = (duration * SAMPLE_RATE // len(flat_prediction)) + 1
+    audio_data = np.tile(flat_prediction, num_repeats)[:duration * SAMPLE_RATE]
     audio_file = convert_to_audio(audio_data, filename="output.wav")
     fn=generate_sfx,
     inputs=[
         gr.Textbox(label="Enter a Word", placeholder="Write a Word To Convert it into SFX Sound"),
+        gr.Slider(minimum=1, maximum=20, value=3, step=1, label="Duration (seconds)")
     ],
     outputs=gr.Audio(label="Generated SFX", type="filepath"),
     title="SFX Generator from Text",
 # Run the interface
 if __name__ == "__main__":
     tf.config.set_visible_devices([], 'GPU')  # Disable GPU
+    # --- THIS IS THE KEY FIX FOR THE ValueError ---
+    interface.launch(share=True)