szili2011 committed on
Commit
773ac74
·
verified ·
1 Parent(s): 90e00da

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -11
app.py CHANGED
@@ -5,15 +5,17 @@ import nltk
5
  from nltk.corpus import cmudict
6
  from scipy.io.wavfile import write
7
 
8
- # --- FIX 1: Define sample_rate as a global constant ---
9
  SAMPLE_RATE = 22050
10
 
11
  # Download required NLTK data
12
- nltk.download('averaged_perceptron_tagger')
13
- nltk.download('cmudict')
14
 
15
  # Load your model from the root directory
16
- model = tf.keras.models.load_model('audio_model.h5')
 
 
17
 
18
  # Preprocess input text
19
  def preprocess_text(text):
@@ -32,6 +34,9 @@ def preprocess_text(text):
32
  # Create dummy 13-feature vectors for each phoneme (implement your own feature extraction)
33
  num_features = 13
34
  sequence_length = len(flattened_phonemes)
 
 
 
35
  input_data = np.random.rand(sequence_length, num_features)
36
 
37
  # Add batch dimension
@@ -41,19 +46,30 @@ def preprocess_text(text):
41
 
42
  # Convert model output to an audio file
43
  def convert_to_audio(model_output, filename="output.wav"):
44
- # Now uses the global SAMPLE_RATE constant
 
 
45
  normalized_output = np.interp(model_output, (model_output.min(), model_output.max()), (-1, 1))
46
  write(filename, SAMPLE_RATE, normalized_output.astype(np.float32))
47
  return filename
48
 
49
  # Define function to generate sound effect
50
- def generate_sfx(text, duration): # duration no longer needs a default here
51
  input_data = preprocess_text(text)
 
 
 
 
 
52
  prediction = model.predict(input_data)
 
 
 
 
53
 
54
  # Generate longer output by repeating or padding
55
- # This line now works because SAMPLE_RATE is defined globally
56
- audio_data = np.tile(prediction.flatten(), (duration * SAMPLE_RATE // len(prediction.flatten()) + 1))[:duration * SAMPLE_RATE]
57
 
58
  audio_file = convert_to_audio(audio_data, filename="output.wav")
59
 
@@ -64,8 +80,7 @@ interface = gr.Interface(
64
  fn=generate_sfx,
65
  inputs=[
66
  gr.Textbox(label="Enter a Word", placeholder="Write a Word To Convert it into SFX Sound"),
67
- # --- FIX 2: Corrected the default slider value to be within the min/max range ---
68
- gr.Slider(minimum=2, maximum=20, value=5, label="Duration (seconds)")
69
  ],
70
  outputs=gr.Audio(label="Generated SFX", type="filepath"),
71
  title="SFX Generator from Text",
@@ -75,4 +90,5 @@ interface = gr.Interface(
75
  # Run the interface
76
  if __name__ == "__main__":
77
  tf.config.set_visible_devices([], 'GPU') # Disable GPU
78
- interface.launch()
 
 
5
  from nltk.corpus import cmudict
6
  from scipy.io.wavfile import write
7
 
8
+ # Define sample_rate as a global constant
9
  SAMPLE_RATE = 22050
10
 
11
  # Download required NLTK data
12
+ nltk.download('averaged_perceptron_tagger', quiet=True)
13
+ nltk.download('cmudict', quiet=True)
14
 
15
  # Load your model from the root directory
16
+ # Add compile=False as it's often needed for inference-only models
17
+ # and can resolve some loading warnings.
18
+ model = tf.keras.models.load_model('audio_model.h5', compile=False)
19
 
20
  # Preprocess input text
21
  def preprocess_text(text):
 
34
  # Create dummy 13-feature vectors for each phoneme (implement your own feature extraction)
35
  num_features = 13
36
  sequence_length = len(flattened_phonemes)
37
+ if sequence_length == 0: # Handle empty input
38
+ return np.zeros((1, 1, num_features))
39
+
40
  input_data = np.random.rand(sequence_length, num_features)
41
 
42
  # Add batch dimension
 
46
 
47
  # Convert model output to an audio file
48
  def convert_to_audio(model_output, filename="output.wav"):
49
+ if model_output.size == 0: # Handle empty output
50
+ return None
51
+ # Normalize audio to be between -1 and 1
52
  normalized_output = np.interp(model_output, (model_output.min(), model_output.max()), (-1, 1))
53
  write(filename, SAMPLE_RATE, normalized_output.astype(np.float32))
54
  return filename
55
 
56
  # Define function to generate sound effect
57
+ def generate_sfx(text, duration):
58
  input_data = preprocess_text(text)
59
+
60
+ # Check for empty input after preprocessing
61
+ if input_data.shape[1] == 0:
62
+ return None # Return None to clear the audio component
63
+
64
  prediction = model.predict(input_data)
65
+
66
+ flat_prediction = prediction.flatten()
67
+ if len(flat_prediction) == 0:
68
+ return None
69
 
70
  # Generate longer output by repeating or padding
71
+ num_repeats = (duration * SAMPLE_RATE // len(flat_prediction)) + 1
72
+ audio_data = np.tile(flat_prediction, num_repeats)[:duration * SAMPLE_RATE]
73
 
74
  audio_file = convert_to_audio(audio_data, filename="output.wav")
75
 
 
80
  fn=generate_sfx,
81
  inputs=[
82
  gr.Textbox(label="Enter a Word", placeholder="Write a Word To Convert it into SFX Sound"),
83
+ gr.Slider(minimum=1, maximum=20, value=3, step=1, label="Duration (seconds)")
 
84
  ],
85
  outputs=gr.Audio(label="Generated SFX", type="filepath"),
86
  title="SFX Generator from Text",
 
90
  # Run the interface
91
  if __name__ == "__main__":
92
  tf.config.set_visible_devices([], 'GPU') # Disable GPU
93
+ # --- THIS IS THE KEY FIX FOR THE ValueError ---
94
+ interface.launch(share=True)