fantos committed on
Commit
0473607
·
verified ·
1 Parent(s): c1e33d1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -146
app.py CHANGED
@@ -20,13 +20,10 @@ def generate_tts(text, temperature, repetition_penalty, speaker_selection, refer
20
  interface = outetts.InterfaceHF(model_version="0.3", cfg=model_config)
21
 
22
  try:
23
- # Validate inputs for custom speaker
24
  if reference_audio:
25
  speaker = interface.create_speaker(reference_audio)
26
- # Use selected default speaker
27
  elif speaker_selection and speaker_selection != "None":
28
  speaker = interface.load_default_speaker(speaker_selection)
29
- # No speaker - random characteristics
30
  else:
31
  speaker = None
32
 
@@ -39,175 +36,58 @@ def generate_tts(text, temperature, repetition_penalty, speaker_selection, refer
39
  )
40
  output = interface.generate(config=gen_cfg)
41
 
42
- # Verify output
43
  if output.audio is None:
44
  raise ValueError("Model failed to generate audio. This may be due to input length constraints or early EOS token.")
45
 
46
- # Save and return output
47
  output_path = "output.wav"
48
  output.save(output_path)
49
  return output_path, None
50
  except Exception as e:
51
  return None, str(e)
52
 
53
- # Custom CSS for 3D styling
54
- custom_css = """
55
- .container {
56
- background: linear-gradient(145deg, #f3f4f6, #ffffff);
57
- border-radius: 20px;
58
- box-shadow: 10px 10px 20px #d1d1d1, -10px -10px 20px #ffffff;
59
- padding: 2rem;
60
- margin: 1rem;
61
- transition: all 0.3s ease;
62
- min-height: 800px;
63
- width: 100%;
64
- max-width: 1400px;
65
- margin: 0 auto;
66
- }
67
-
68
- .title {
69
- font-size: 2.5rem;
70
- font-weight: bold;
71
- color: #1a1a1a;
72
- text-align: center;
73
- margin-bottom: 2rem;
74
- text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.1);
75
- }
76
-
77
- .input-group {
78
- background: #ffffff;
79
- border-radius: 15px;
80
- padding: 1.5rem;
81
- margin: 1rem 0;
82
- box-shadow: inset 5px 5px 10px #e0e0e0, inset -5px -5px 10px #ffffff;
83
- }
84
-
85
- .button-3d {
86
- background: linear-gradient(145deg, #3b82f6, #2563eb);
87
- color: white;
88
- border: none;
89
- padding: 0.8rem 1.5rem;
90
- border-radius: 10px;
91
- font-weight: bold;
92
- cursor: pointer;
93
- transition: all 0.3s ease;
94
- box-shadow: 5px 5px 10px #d1d1d1, -5px -5px 10px #ffffff;
95
- }
96
-
97
- .button-3d:hover {
98
- transform: translateY(-2px);
99
- box-shadow: 7px 7px 15px #d1d1d1, -7px -7px 15px #ffffff;
100
- }
101
-
102
- .slider-3d {
103
- height: 12px;
104
- border-radius: 6px;
105
- background: linear-gradient(145deg, #e6e7eb, #ffffff);
106
- box-shadow: inset 3px 3px 6px #d1d1d1, inset -3px -3px 6px #ffffff;
107
- }
108
-
109
- .error-box {
110
- background: #fee2e2;
111
- border-left: 4px solid #ef4444;
112
- padding: 1rem;
113
- border-radius: 8px;
114
- margin: 1rem 0;
115
- }
116
-
117
- .right-column {
118
- display: flex;
119
- flex-direction: column;
120
- gap: 1rem;
121
- }
122
-
123
- .options-panel {
124
- margin-top: 2rem;
125
- background: linear-gradient(145deg, #f3f4f6, #ffffff);
126
- border-radius: 15px;
127
- padding: 2.5rem;
128
- box-shadow: 5px 5px 10px #d1d1d1, -5px -5px 10px #ffffff;
129
- display: flex;
130
- flex-direction: column;
131
- gap: 2rem;
132
- }
133
-
134
- .input-box {
135
- padding: 1.5rem;
136
- background: #ffffff;
137
- border-radius: 12px;
138
- margin-bottom: 1.5rem;
139
- box-shadow: inset 3px 3px 7px #e0e0e0, inset -3px -3px 7px #ffffff;
140
- }
141
-
142
- .slider-container {
143
- margin: 2rem 0;
144
- }
145
-
146
- .textbox-container {
147
- min-height: 150px;
148
- }
149
- """
150
-
151
- # Create the Gradio interface with 3D styling
152
- with gr.Blocks(css=custom_css) as demo:
153
- gr.Markdown('<div class="title">Voice Clone Multilingual TTS</div>')
154
 
155
- error_box = gr.Textbox(label="Error Messages", visible=False, elem_classes="error-box")
156
 
157
- with gr.Row(elem_classes="container"):
158
- # Left column for text input
159
  with gr.Column(scale=1):
160
- with gr.Group(elem_classes="textbox-container"):
161
- text_input = gr.Textbox(
162
- label="Text to Synthesize",
163
- placeholder="Enter text here...",
164
- elem_classes="input-group",
165
- lines=8
166
- )
167
-
168
- submit_button = gr.Button(
169
- "Generate Speech",
170
- elem_classes="button-3d"
171
  )
 
 
172
 
173
- # Right column for output and options
174
- with gr.Column(scale=1, elem_classes="right-column"):
175
- # Audio output at the top
176
  audio_output = gr.Audio(
177
  label="Generated Audio",
178
- type="filepath",
179
- elem_classes="input-group"
180
  )
181
 
182
- # Options panel below the output
183
- with gr.Group(elem_classes="options-panel"):
184
  speaker_dropdown = gr.Dropdown(
185
  choices=get_available_speakers(),
186
  value="en_male_1",
187
- label="Speaker Selection",
188
- elem_classes="input-group"
189
  )
190
 
191
- with gr.Group(elem_classes="slider-container"):
192
- temperature = gr.Slider(
193
- 0.1, 1.0,
194
- value=0.1,
195
- label="Temperature (lower = more stable tone, higher = more expressive)",
196
- elem_classes="slider-3d"
197
- )
198
 
199
- with gr.Group(elem_classes="slider-container"):
200
- repetition_penalty = gr.Slider(
201
- 0.5, 2.0,
202
- value=1.1,
203
- label="Repetition Penalty",
204
- elem_classes="slider-3d"
205
- )
206
 
207
  reference_audio = gr.Audio(
208
  label="Reference Audio (for voice cloning)",
209
- type="filepath",
210
- elem_classes="input-group"
211
  )
212
 
213
  gr.Markdown("""
@@ -216,7 +96,7 @@ with gr.Blocks(css=custom_css) as demo:
216
  - For transcription interface will use Whisper turbo to transcribe the audio file
217
  - Longer audio clips will reduce maximum output length
218
  - Custom speaker overrides speaker selection
219
- """, elem_classes="input-group")
220
 
221
  submit_button.click(
222
  fn=generate_tts,
 
20
  interface = outetts.InterfaceHF(model_version="0.3", cfg=model_config)
21
 
22
  try:
 
23
  if reference_audio:
24
  speaker = interface.create_speaker(reference_audio)
 
25
  elif speaker_selection and speaker_selection != "None":
26
  speaker = interface.load_default_speaker(speaker_selection)
 
27
  else:
28
  speaker = None
29
 
 
36
  )
37
  output = interface.generate(config=gen_cfg)
38
 
 
39
  if output.audio is None:
40
  raise ValueError("Model failed to generate audio. This may be due to input length constraints or early EOS token.")
41
 
 
42
  output_path = "output.wav"
43
  output.save(output_path)
44
  return output_path, None
45
  except Exception as e:
46
  return None, str(e)
47
 
48
+ with gr.Blocks(theme="Yntec/HaleyCH_Theme_Orange") as demo:
49
+ gr.Markdown("# Voice Clone Multilingual TTS")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
+ error_box = gr.Textbox(label="Error Messages", visible=False)
52
 
53
+ with gr.Row():
 
54
  with gr.Column(scale=1):
55
+ text_input = gr.Textbox(
56
+ label="Text to Synthesize",
57
+ placeholder="Enter text here...",
58
+ lines=8
 
 
 
 
 
 
 
59
  )
60
+
61
+ submit_button = gr.Button("Generate Speech")
62
 
63
+ with gr.Column(scale=1):
 
 
64
  audio_output = gr.Audio(
65
  label="Generated Audio",
66
+ type="filepath"
 
67
  )
68
 
69
+ with gr.Group():
 
70
  speaker_dropdown = gr.Dropdown(
71
  choices=get_available_speakers(),
72
  value="en_male_1",
73
+ label="Speaker Selection"
 
74
  )
75
 
76
+ temperature = gr.Slider(
77
+ 0.1, 1.0,
78
+ value=0.1,
79
+ label="Temperature (lower = more stable tone, higher = more expressive)"
80
+ )
 
 
81
 
82
+ repetition_penalty = gr.Slider(
83
+ 0.5, 2.0,
84
+ value=1.1,
85
+ label="Repetition Penalty"
86
+ )
 
 
87
 
88
  reference_audio = gr.Audio(
89
  label="Reference Audio (for voice cloning)",
90
+ type="filepath"
 
91
  )
92
 
93
  gr.Markdown("""
 
96
  - For transcription interface will use Whisper turbo to transcribe the audio file
97
  - Longer audio clips will reduce maximum output length
98
  - Custom speaker overrides speaker selection
99
+ """)
100
 
101
  submit_button.click(
102
  fn=generate_tts,