PrinzPesia committed · verified
Commit 4b65101 · 1 Parent(s): 2d46271

Update app.py

Files changed (1)
  1. app.py +128 -313
app.py CHANGED
@@ -14,229 +14,57 @@ from typing import List, Dict
14
 
15
  # Constants
16
  MAX_FILE_SIZE_MB = 20
17
- MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024 # Convert MB to bytes
 
18
 
19
- class PodcastGenerator:
20
  def __init__(self):
21
  pass
22
 
23
- async def generate_script(self, prompt: str, language: str, api_key: str, file_obj=None, progress=None) -> Dict:
24
  example = """
25
  {
26
  "topic": "AGI",
27
- "podcast": [
28
- {
29
- "speaker": 2,
30
- "line": "So, AGI, huh? Seems like everyone's talking about it these days."
31
- },
32
- {
33
- "speaker": 1,
34
- "line": "Yeah, it's definitely having a moment, isn't it?"
35
- },
36
- {
37
- "speaker": 2,
38
- "line": "It is and for good reason, right? I mean, you've been digging into this stuff, listening to the podcasts and everything. What really stood out to you? What got you hooked?"
39
- },
40
- {
41
- "speaker": 1,
42
- "line": "Honestly, it's the sheer scale of what AGI could do. We're talking about potentially reshaping well everything."
43
- },
44
- {
45
- "speaker": 2,
46
- "line": "No kidding, but let's be real. Sometimes it feels like every other headline is either hyping AGI up as this technological utopia or painting it as our inevitable robot overlords."
47
- },
48
- {
49
- "speaker": 1,
50
- "line": "It's easy to get lost in the noise, for sure."
51
- },
52
- {
53
- "speaker": 2,
54
- "line": "Exactly. So how about we try to cut through some of that, shall we?"
55
- },
56
- {
57
- "speaker": 1,
58
- "line": "Sounds like a plan."
59
- },
60
- {
61
- "speaker": 2,
62
- "line": "Okay, so first things first, AGI, what is it really? And I don't just mean some dictionary definition, we're talking about something way bigger than just a super smart computer, right?"
63
- },
64
- {
65
- "speaker": 1,
66
- "line": "Right, it's not just about more processing power or better algorithms, it's about a fundamental shift in how we think about intelligence itself."
67
- },
68
- {
69
- "speaker": 2,
70
- "line": "So like, instead of programming a machine for a specific task, we're talking about creating something that can learn and adapt like we do."
71
- },
72
- {
73
- "speaker": 1,
74
- "line": "Exactly, think of it this way: Right now, we've got AI that can beat a grandmaster at chess but ask that same AI to, say, write a poem or compose a symphony. No chance."
75
- },
76
- {
77
- "speaker": 2,
78
- "line": "Okay, I see. So, AGI is about bridging that gap, creating something that can move between those different realms of knowledge seamlessly."
79
- },
80
- {
81
- "speaker": 1,
82
- "line": "Precisely. It's about replicating that uniquely human ability to learn something new and apply that knowledge in completely different contexts and that's a tall order, let me tell you."
83
- },
84
- {
85
- "speaker": 2,
86
- "line": "I bet. I mean, think about how much we still don't even understand about our own brains."
87
- },
88
- {
89
- "speaker": 1,
90
- "line": "That's exactly it. We're essentially trying to reverse-engineer something we don't fully comprehend."
91
- },
92
- {
93
- "speaker": 2,
94
- "line": "And how are researchers even approaching that? What are some of the big ideas out there?"
95
- },
96
- {
97
- "speaker": 1,
98
- "line": "Well, there are a few different schools of thought. One is this idea of neuromorphic computing where they're literally trying to build computer chips that mimic the structure and function of the human brain."
99
- },
100
- {
101
- "speaker": 2,
102
- "line": "Wow, so like actually replicating the physical architecture of the brain. That's wild."
103
- },
104
- {
105
- "speaker": 1,
106
- "line": "It's pretty mind-blowing stuff and then you've got folks working on something called whole brain emulation."
107
- },
108
- {
109
- "speaker": 2,
110
- "line": "Okay, and what's that all about?"
111
- },
112
- {
113
- "speaker": 1,
114
- "line": "The basic idea there is to create a complete digital copy of a human brain down to the last neuron and synapse and run it on a sufficiently powerful computer simulation."
115
- },
116
- {
117
- "speaker": 2,
118
- "line": "Hold on, a digital copy of an entire brain, that sounds like something straight out of science fiction."
119
- },
120
- {
121
- "speaker": 1,
122
- "line": "It does, doesn't it? But it gives you an idea of the kind of ambition we're talking about here and the truth is we're still a long way off from truly achieving AGI, no matter which approach you look at."
123
- },
124
- {
125
- "speaker": 2,
126
- "line": "That makes sense but it's still exciting to think about the possibilities, even if they're a ways off."
127
- },
128
- {
129
- "speaker": 1,
130
- "line": "Absolutely and those possibilities are what really get people fired up about AGI, right? Yeah."
131
- },
132
- {
133
- "speaker": 2,
134
- "line": "For sure. In fact, I remember you mentioning something in that podcast about AGI's potential to revolutionize scientific research. Something about supercharging breakthroughs."
135
- },
136
- {
137
- "speaker": 1,
138
- "line": "Oh, absolutely. Imagine an AI that doesn't just crunch numbers but actually understands scientific data the way a human researcher does. We're talking about potential breakthroughs in everything from medicine and healthcare to material science and climate change."
139
- },
140
- {
141
- "speaker": 2,
142
- "line": "It's like giving scientists this incredibly powerful new tool to tackle some of the biggest challenges we face."
143
- },
144
- {
145
- "speaker": 1,
146
- "line": "Exactly, it could be a total game changer."
147
- },
148
- {
149
- "speaker": 2,
150
- "line": "Okay, but let's be real, every coin has two sides. What about the potential downsides of AGI? Because it can't all be sunshine and roses, right?"
151
- },
152
- {
153
- "speaker": 1,
154
- "line": "Right, there are definitely valid concerns. Probably the biggest one is the impact on the job market. As AGI gets more sophisticated, there's a real chance it could automate a lot of jobs that are currently done by humans."
155
- },
156
- {
157
- "speaker": 2,
158
- "line": "So we're not just talking about robots taking over factories but potentially things like, what, legal work, analysis, even creative fields?"
159
- },
160
- {
161
- "speaker": 1,
162
- "line": "Potentially, yes. And that raises a whole host of questions about what happens to those workers, how we retrain them, how we ensure that the benefits of AGI are shared equitably."
163
- },
164
- {
165
- "speaker": 2,
166
- "line": "Right, because it's not just about the technology itself, but how we choose to integrate it into society."
167
- },
168
- {
169
- "speaker": 1,
170
- "line": "Absolutely. We need to be having these conversations now about ethics, about regulation, about how to make sure AGI is developed and deployed responsibly."
171
- },
172
- {
173
- "speaker": 2,
174
- "line": "So it's less about preventing some kind of sci-fi robot apocalypse and more about making sure we're steering this technology in the right direction from the get-go."
175
- },
176
- {
177
- "speaker": 1,
178
- "line": "Exactly, AGI has the potential to be incredibly beneficial, but it's not going to magically solve all our problems. It's on us to make sure we're using it for good."
179
- },
180
- {
181
- "speaker": 2,
182
- "line": "It's like you said earlier, it's about shaping the future of intelligence."
183
- },
184
- {
185
- "speaker": 1,
186
- "line": "I like that. It really is."
187
- },
188
- {
189
- "speaker": 2,
190
- "line": "And honestly, that's a responsibility that extends beyond just the researchers and the policymakers."
191
- },
192
- {
193
- "speaker": 1,
194
- "line": "100%"
195
- },
196
- {
197
- "speaker": 2,
198
- "line": "So to everyone listening out there I'll leave you with this. As AGI continues to develop, what role do you want to play in shaping its future?"
199
- },
200
- {
201
- "speaker": 1,
202
- "line": "That's a question worth pondering."
203
- },
204
- {
205
- "speaker": 2,
206
- "line": "It certainly is and on that note, we'll wrap up this deep dive. Thanks for listening, everyone."
207
- },
208
- {
209
- "speaker": 1,
210
- "line": "Peace."
211
- }
212
  ]
213
  }
214
  """
215
 
216
  if language == "Auto Detect":
217
- language_instruction = "- The podcast MUST be in the same language as the user input."
218
  else:
219
- language_instruction = f"- The podcast MUST be in {language} language"
220
-
221
  system_prompt = f"""
222
- You are a professional podcast generator. Your task is to generate a professional podcast script based on the user input.
223
  {language_instruction}
224
- - The podcast should have 2 speakers.
225
- - The podcast should be long.
226
- - Do not use names for the speakers.
227
- - The podcast should be interesting, lively, and engaging, and hook the listener from the start.
228
- - The input text might be disorganized or unformatted, originating from sources like PDFs or text files. Ignore any formatting inconsistencies or irrelevant details; your task is to distill the essential points, identify key definitions, and highlight intriguing facts that would be suitable for discussion in a podcast.
229
- - The script must be in JSON format.
230
- Follow this example structure:
231
  {example}
232
  """
233
  user_prompt = ""
234
  if prompt and file_obj:
235
- user_prompt = f"Please generate a podcast script based on the uploaded file following user input:\n{prompt}"
236
  elif prompt:
237
- user_prompt = f"Please generate a podcast script based on the following user input:\n{prompt}"
238
  else:
239
- user_prompt = "Please generate a podcast script based on the uploaded file."
240
 
241
  messages = []
242
 
@@ -290,7 +118,7 @@ Follow this example structure:
290
 
291
  try:
292
  if progress:
293
- progress(0.3, "Generating podcast script...")
294
 
295
  # Add timeout to the API call
296
  response = await asyncio.wait_for(
@@ -309,35 +137,32 @@ Follow this example structure:
309
  system_instruction=system_prompt
310
  )
311
  ),
312
- timeout=60 # 60 seconds timeout
313
  )
314
  except asyncio.TimeoutError:
315
- raise Exception("The script generation request timed out. Please try again later.")
316
  except Exception as e:
317
  if "API key not valid" in str(e):
318
- raise Exception("Invalid API key. Please provide a valid Gemini API key.")
319
  elif "rate limit" in str(e).lower():
320
- raise Exception("Rate limit exceeded for the API key. Please try again later or provide your own Gemini API key.")
321
  else:
322
- raise Exception(f"Failed to generate podcast script: {e}")
323
 
324
- print(f"Generated podcast script:\n{response.text}")
325
-
326
  if progress:
327
- progress(0.4, "Script generated successfully!")
328
 
329
  return json.loads(response.text)
330
 
331
  async def _read_file_bytes(self, file_obj) -> bytes:
332
  """Read file bytes from a file object"""
333
- # Check file size before reading
334
  if hasattr(file_obj, 'size'):
335
  file_size = file_obj.size
336
  else:
337
  file_size = os.path.getsize(file_obj.name)
338
 
339
  if file_size > MAX_FILE_SIZE_BYTES:
340
- raise Exception(f"File size exceeds the {MAX_FILE_SIZE_MB}MB limit. Please upload a smaller file.")
341
 
342
  if hasattr(file_obj, 'read'):
343
  return file_obj.read()
@@ -353,23 +178,28 @@ Follow this example structure:
353
  elif ext == '.txt':
354
  return "text/plain"
355
  else:
356
- # Fallback to the default mime type detector
357
  mime_type, _ = mimetypes.guess_type(filename)
358
  return mime_type or "application/octet-stream"
359
 
360
- async def tts_generate(self, text: str, speaker: int, speaker1: str, speaker2: str) -> str:
361
- voice = speaker1 if speaker == 1 else speaker2
362
  speech = edge_tts.Communicate(text, voice)
363
 
364
  temp_filename = f"temp_{uuid.uuid4()}.wav"
365
  try:
366
- # Add timeout to TTS generation
367
- await asyncio.wait_for(speech.save(temp_filename), timeout=30) # 30 seconds timeout
368
  return temp_filename
369
  except asyncio.TimeoutError:
370
  if os.path.exists(temp_filename):
371
  os.remove(temp_filename)
372
- raise Exception("Text-to-speech generation timed out. Please try with a shorter text.")
373
  except Exception as e:
374
  if os.path.exists(temp_filename):
375
  os.remove(temp_filename)
@@ -377,154 +207,154 @@ Follow this example structure:
377
 
378
  async def combine_audio_files(self, audio_files: List[str], progress=None) -> str:
379
  if progress:
380
- progress(0.9, "Combining audio files...")
381
 
382
  combined_audio = AudioSegment.empty()
383
  for audio_file in audio_files:
384
  combined_audio += AudioSegment.from_file(audio_file)
385
- os.remove(audio_file) # Clean up temporary files
386
 
387
  output_filename = f"output_{uuid.uuid4()}.wav"
388
  combined_audio.export(output_filename, format="wav")
389
 
390
  if progress:
391
- progress(1.0, "Podcast generated successfully!")
392
 
393
  return output_filename
394
 
395
- async def generate_podcast(self, input_text: str, language: str, speaker1: str, speaker2: str, api_key: str, file_obj=None, progress=None) -> str:
396
  try:
397
  if progress:
398
- progress(0.1, "Starting podcast generation...")
399
 
400
- # Set overall timeout for the entire process
401
  return await asyncio.wait_for(
402
- self._generate_podcast_internal(input_text, language, speaker1, speaker2, api_key, file_obj, progress),
403
- timeout=600 # 10 minutes total timeout
404
  )
405
  except asyncio.TimeoutError:
406
- raise Exception("The podcast generation process timed out. Please try with shorter text or try again later.")
407
  except Exception as e:
408
- raise Exception(f"Error generating podcast: {str(e)}")
409
 
410
- async def _generate_podcast_internal(self, input_text: str, language: str, speaker1: str, speaker2: str, api_key: str, file_obj=None, progress=None) -> str:
411
  if progress:
412
- progress(0.2, "Generating podcast script...")
413
 
414
- podcast_json = await self.generate_script(input_text, language, api_key, file_obj, progress)
415
 
416
  if progress:
417
- progress(0.5, "Converting text to speech...")
418
 
419
- # Process TTS in batches for concurrent processing
420
  audio_files = []
421
- total_lines = len(podcast_json['podcast'])
 
422
 
423
- # Define batch size to control concurrency
424
- batch_size = 10 # Adjust based on system resources
425
-
426
- # Process in batches
427
  for batch_start in range(0, total_lines, batch_size):
428
  batch_end = min(batch_start + batch_size, total_lines)
429
- batch = podcast_json['podcast'][batch_start:batch_end]
430
 
431
- # Create tasks for concurrent processing
432
  tts_tasks = []
433
  for item in batch:
434
- tts_task = self.tts_generate(item['line'], item['speaker'], speaker1, speaker2)
435
  tts_tasks.append(tts_task)
436
 
437
  try:
438
- # Process batch concurrently
439
  batch_results = await asyncio.gather(*tts_tasks, return_exceptions=True)
440
 
441
- # Check for exceptions and handle results
442
  for i, result in enumerate(batch_results):
443
  if isinstance(result, Exception):
444
- # Clean up any files already created
445
  for file in audio_files:
446
  if os.path.exists(file):
447
  os.remove(file)
448
- raise Exception(f"Error generating speech: {str(result)}")
449
  else:
450
  audio_files.append(result)
451
 
452
- # Update progress
453
  if progress:
454
  current_progress = 0.5 + (0.4 * (batch_end / total_lines))
455
- progress(current_progress, f"Processed {batch_end}/{total_lines} speech segments...")
456
 
457
  except Exception as e:
458
- # Clean up any files already created
459
  for file in audio_files:
460
  if os.path.exists(file):
461
  os.remove(file)
462
- raise Exception(f"Error in batch TTS generation: {str(e)}")
463
 
464
  combined_audio = await self.combine_audio_files(audio_files, progress)
465
  return combined_audio
466
 
467
- async def process_input(input_text: str, input_file, language: str, speaker1: str, speaker2: str, api_key: str = "", progress=None) -> str:
 
 
468
  start_time = time.time()
469
 
470
  voice_names = {
471
- "Andrew - English (United States)": "en-US-AndrewMultilingualNeural",
472
- "Ava - English (United States)": "en-US-AvaMultilingualNeural",
473
- "Brian - English (United States)": "en-US-BrianMultilingualNeural",
474
- "Emma - English (United States)": "en-US-EmmaMultilingualNeural",
475
- "Florian - German (Germany)": "de-DE-FlorianMultilingualNeural",
476
- "Seraphina - German (Germany)": "de-DE-SeraphinaMultilingualNeural",
477
- "Remy - French (France)": "fr-FR-RemyMultilingualNeural",
478
- "Vivienne - French (France)": "fr-FR-VivienneMultilingualNeural"
479
  }
480
 
481
  speaker1 = voice_names[speaker1]
482
  speaker2 = voice_names[speaker2]
483
 
484
  try:
485
  if progress:
486
- progress(0.05, "Processing input...")
487
 
488
  if not api_key:
489
  api_key = os.getenv("GENAI_API_KEY")
490
  if not api_key:
491
- raise Exception("No API key provided. Please provide a Gemini API key.")
492
 
493
- podcast_generator = PodcastGenerator()
494
- podcast = await podcast_generator.generate_podcast(input_text, language, speaker1, speaker2, api_key, input_file, progress)
495
 
496
  end_time = time.time()
497
- print(f"Total podcast generation time: {end_time - start_time:.2f} seconds")
498
- return podcast
499
 
500
  except Exception as e:
501
- # Ensure we show a user-friendly error
502
  error_msg = str(e)
503
  if "rate limit" in error_msg.lower():
504
- raise Exception("Rate limit exceeded. Please try again later or use your own API key.")
505
  elif "timeout" in error_msg.lower():
506
- raise Exception("The request timed out. This could be due to server load or the length of your input. Please try again with shorter text.")
507
  else:
508
- raise Exception(f"Error: {error_msg}")
509
 
510
  # Gradio UI
511
- def generate_podcast_gradio(input_text, input_file, language, speaker1, speaker2, api_key, progress=gr.Progress()):
512
- # Handle the file if uploaded
 
513
  file_obj = None
514
  if input_file is not None:
515
  file_obj = input_file
516
 
517
- # Use the progress function from Gradio
518
  def progress_callback(value, text):
519
  progress(value, text)
520
 
521
- # Run the async function in the event loop
522
  result = asyncio.run(process_input(
523
  input_text,
524
  file_obj,
525
  language,
526
  speaker1,
527
  speaker2,
528
  api_key,
529
  progress_callback
530
  ))
@@ -532,65 +362,50 @@ def generate_podcast_gradio(input_text, input_file, language, speaker1, speaker2
532
  return result
533
 
534
  def main():
535
- # Define language options
536
  language_options = [
537
  "Auto Detect",
538
- "Afrikaans", "Albanian", "Amharic", "Arabic", "Armenian", "Azerbaijani",
539
- "Bahasa Indonesian", "Bangla", "Basque", "Bengali", "Bosnian", "Bulgarian",
540
- "Burmese", "Catalan", "Chinese Cantonese", "Chinese Mandarin",
541
- "Chinese Taiwanese", "Croatian", "Czech", "Danish", "Dutch", "English",
542
- "Estonian", "Filipino", "Finnish", "French", "Galician", "Georgian",
543
- "German", "Greek", "Hebrew", "Hindi", "Hungarian", "Icelandic", "Irish",
544
- "Italian", "Japanese", "Javanese", "Kannada", "Kazakh", "Khmer", "Korean",
545
- "Lao", "Latvian", "Lithuanian", "Macedonian", "Malay", "Malayalam",
546
- "Maltese", "Mongolian", "Nepali", "Norwegian Bokmål", "Pashto", "Persian",
547
- "Polish", "Portuguese", "Romanian", "Russian", "Serbian", "Sinhala",
548
- "Slovak", "Slovene", "Somali", "Spanish", "Sundanese", "Swahili",
549
- "Swedish", "Tamil", "Telugu", "Thai", "Turkish", "Ukrainian", "Urdu",
550
- "Uzbek", "Vietnamese", "Welsh", "Zulu"
551
  ]
552
 
553
- # Define voice options
554
  voice_options = [
555
- "Andrew - English (United States)",
556
- "Ava - English (United States)",
557
- "Brian - English (United States)",
558
- "Emma - English (United States)",
559
- "Florian - German (Germany)",
560
- "Seraphina - German (Germany)",
561
- "Remy - French (France)",
562
- "Vivienne - French (France)"
563
  ]
564
 
565
- # Create Gradio interface
566
- with gr.Blocks(title="PodcastGen 🎙️") as demo:
567
- gr.Markdown("# PodcastGen 🎙️")
568
- gr.Markdown("Generate a 2-speaker podcast from text input or documents!")
569
 
570
  with gr.Row():
571
  with gr.Column(scale=2):
572
- input_text = gr.Textbox(label="Input Text", lines=10, placeholder="Enter text for podcast generation...")
573
-
574
  with gr.Column(scale=1):
575
- input_file = gr.File(label="Or Upload a PDF or TXT file", file_types=[".pdf", ".txt"])
576
 
577
  with gr.Row():
578
  with gr.Column():
579
- api_key = gr.Textbox(label="Your Gemini API Key (Optional)", placeholder="Enter API key here if you're getting rate limited", type="password")
580
- language = gr.Dropdown(label="Language", choices=language_options, value="Auto Detect")
 
581
 
582
  with gr.Column():
583
- speaker1 = gr.Dropdown(label="Speaker 1 Voice", choices=voice_options, value="Andrew - English (United States)")
584
- speaker2 = gr.Dropdown(label="Speaker 2 Voice", choices=voice_options, value="Ava - English (United States)")
585
 
586
- generate_btn = gr.Button("Generate Podcast", variant="primary")
587
-
588
- with gr.Row():
589
- output_audio = gr.Audio(label="Generated Podcast", type="filepath", format="wav")
590
 
591
  generate_btn.click(
592
- fn=generate_podcast_gradio,
593
- inputs=[input_text, input_file, language, speaker1, speaker2, api_key],
594
  outputs=[output_audio]
595
  )
596
 
 
14
 
15
  # Constants
16
  MAX_FILE_SIZE_MB = 20
17
+ MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024
18
+ WORDS_PER_MINUTE = 150 # Average speaking rate (words per minute)
19
 
20
+ class MeetingGenerator:
21
  def __init__(self):
22
  pass
23
 
24
+ async def generate_script(self, prompt: str, language: str, api_key: str, duration_minutes: int, file_obj=None, progress=None) -> Dict:
25
  example = """
26
  {
27
  "topic": "AGI",
28
+ "meeting": [
29
+ {"speaker": 1, "line": "So, AGI is our main topic today..."},
30
+ {"speaker": 2, "line": "Yes, I think we should start with definitions"},
31
+ {"speaker": 3, "line": "I agree, let's clarify terminology first"},
32
+ {"speaker": 4, "line": "From my perspective, AGI means..."},
33
+ {"speaker": 1, "line": "That's a good starting point..."},
34
+ {"speaker": 2, "line": "But we should consider practical applications..."},
35
+ {"speaker": 3, "line": "What about ethical implications?"},
36
+ {"speaker": 4, "line": "I have some thoughts on that..."}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  ]
38
  }
39
  """
40
 
41
  if language == "Auto Detect":
42
+ language_instruction = "- Die Besprechung MUSS in derselben Sprache wie die Benutzereingabe sein."
43
  else:
44
+ language_instruction = f"- Die Besprechung MUSS in {language} sein"
45
+
46
+ # Calculate the required word count based on the desired duration
47
+ target_word_count = int(duration_minutes * WORDS_PER_MINUTE)
48
+
49
  system_prompt = f"""
50
+ Sie sind ein professioneller Besprechungsgenerator. Ihre Aufgabe ist es, ein realistisches Besprechungsskript mit 4 Teilnehmern zu erstellen.
51
  {language_instruction}
52
+ - Teilnehmer sollten als Sprecher 1, 2, 3, 4 identifiziert werden
53
+ - Natürliche Dialoge mit Unterbrechungen, Zustimmungen und Diskussionen generieren
54
+ - Besprechungsspezifische Phrasen einfügen ("Kommen wir darauf zurück", "Was denkt ihr alle?")
55
+ - Das Skript MUSS im JSON-Format sein
56
+ - Die Gesamtzahl der Wörter sollte etwa {target_word_count} betragen, um eine Dauer von {duration_minutes} Minuten zu erreichen
57
+ - Erstellen Sie eine Diskussion zu: {prompt}
58
+ Folgen Sie dieser Struktur:
59
  {example}
60
  """
61
  user_prompt = ""
62
  if prompt and file_obj:
63
+ user_prompt = f"Bitte generieren Sie ein Besprechungsskript basierend auf der hochgeladenen Datei und Benutzereingabe:\n{prompt}"
64
  elif prompt:
65
+ user_prompt = f"Bitte generieren Sie ein Besprechungsskript basierend auf der Benutzereingabe:\n{prompt}"
66
  else:
67
+ user_prompt = "Bitte generieren Sie ein Besprechungsskript basierend auf der hochgeladenen Datei."
68
 
69
  messages = []
70
 
 
118
 
119
  try:
120
  if progress:
121
+ progress(0.3, "Generiere Besprechungsskript...")
122
 
123
  # Add timeout to the API call
124
  response = await asyncio.wait_for(
 
137
  system_instruction=system_prompt
138
  )
139
  ),
140
+ timeout=60
141
  )
142
  except asyncio.TimeoutError:
143
+ raise Exception("Die Anfrage zur Skripterstellung hat das Zeitlimit überschritten.")
144
  except Exception as e:
145
  if "API key not valid" in str(e):
146
+ raise Exception("Ungültiger API-Schlüssel.")
147
  elif "rate limit" in str(e).lower():
148
+ raise Exception("API-Ratenlimit überschritten.")
149
  else:
150
+ raise Exception(f"Fehler beim Generieren des Skripts: {e}")
151

152
  if progress:
153
+ progress(0.4, "Skript erfolgreich generiert!")
154
 
155
  return json.loads(response.text)
156
 
157
  async def _read_file_bytes(self, file_obj) -> bytes:
158
  """Read file bytes from a file object"""
 
159
  if hasattr(file_obj, 'size'):
160
  file_size = file_obj.size
161
  else:
162
  file_size = os.path.getsize(file_obj.name)
163
 
164
  if file_size > MAX_FILE_SIZE_BYTES:
165
+ raise Exception(f"Dateigröße überschreitet {MAX_FILE_SIZE_MB}MB Limit.")
166
 
167
  if hasattr(file_obj, 'read'):
168
  return file_obj.read()
 
178
  elif ext == '.txt':
179
  return "text/plain"
180
  else:
 
181
  mime_type, _ = mimetypes.guess_type(filename)
182
  return mime_type or "application/octet-stream"
183
 
184
+ async def tts_generate(self, text: str, speaker: int,
185
+ speaker1: str, speaker2: str, speaker3: str, speaker4: str) -> str:
186
+ voice_mapping = {
187
+ 1: speaker1,
188
+ 2: speaker2,
189
+ 3: speaker3,
190
+ 4: speaker4
191
+ }
192
+ voice = voice_mapping.get(speaker, speaker1)
193
  speech = edge_tts.Communicate(text, voice)
194
 
195
  temp_filename = f"temp_{uuid.uuid4()}.wav"
196
  try:
197
+ await asyncio.wait_for(speech.save(temp_filename), timeout=30)
 
198
  return temp_filename
199
  except asyncio.TimeoutError:
200
  if os.path.exists(temp_filename):
201
  os.remove(temp_filename)
202
+ raise Exception("Text-to-Speech Generierung hat zu lange gedauert.")
203
  except Exception as e:
204
  if os.path.exists(temp_filename):
205
  os.remove(temp_filename)
 
207
 
208
  async def combine_audio_files(self, audio_files: List[str], progress=None) -> str:
209
  if progress:
210
+ progress(0.9, "Kombiniere Audio-Dateien...")
211
 
212
  combined_audio = AudioSegment.empty()
213
  for audio_file in audio_files:
214
  combined_audio += AudioSegment.from_file(audio_file)
215
+ os.remove(audio_file)
216
 
217
  output_filename = f"output_{uuid.uuid4()}.wav"
218
  combined_audio.export(output_filename, format="wav")
219
 
220
  if progress:
221
+ progress(1.0, "Besprechung erfolgreich generiert!")
222
 
223
  return output_filename
224
 
225
+ async def generate_meeting(self, input_text: str, language: str,
226
+ speaker1: str, speaker2: str, speaker3: str, speaker4: str,
227
+ duration_minutes: int, api_key: str, file_obj=None, progress=None) -> str:
228
  try:
229
  if progress:
230
+ progress(0.1, "Starte Generierung...")
231
 
 
232
  return await asyncio.wait_for(
233
+ self._generate_meeting_internal(input_text, language, speaker1, speaker2, speaker3, speaker4, duration_minutes, api_key, file_obj, progress),
234
+ timeout=600
235
  )
236
  except asyncio.TimeoutError:
237
+ raise Exception("Generierung hat das Zeitlimit überschritten.")
238
  except Exception as e:
239
+ raise Exception(f"Fehler: {str(e)}")
240
 
241
+ async def _generate_meeting_internal(self, input_text: str, language: str,
242
+ speaker1: str, speaker2: str, speaker3: str, speaker4: str,
243
+ duration_minutes: int, api_key: str, file_obj=None, progress=None) -> str:
244
  if progress:
245
+ progress(0.2, "Generiere Besprechungsskript...")
246
 
247
+ meeting_json = await self.generate_script(input_text, language, api_key, duration_minutes, file_obj, progress)
248
 
249
  if progress:
250
+ progress(0.5, "Konvertiere Text zu Sprache...")
251
 
 
252
  audio_files = []
253
+ total_lines = len(meeting_json['meeting'])
254
+ batch_size = 10
255

256
  for batch_start in range(0, total_lines, batch_size):
257
  batch_end = min(batch_start + batch_size, total_lines)
258
+ batch = meeting_json['meeting'][batch_start:batch_end]
259
 
 
260
  tts_tasks = []
261
  for item in batch:
262
+ tts_task = self.tts_generate(item['line'], item['speaker'], speaker1, speaker2, speaker3, speaker4)
263
  tts_tasks.append(tts_task)
264
 
265
  try:
 
266
  batch_results = await asyncio.gather(*tts_tasks, return_exceptions=True)
267
 
 
268
  for i, result in enumerate(batch_results):
269
  if isinstance(result, Exception):
 
270
  for file in audio_files:
271
  if os.path.exists(file):
272
  os.remove(file)
273
+ raise Exception(f"Fehler bei Sprachgenerierung: {str(result)}")
274
  else:
275
  audio_files.append(result)
276
 
 
277
  if progress:
278
  current_progress = 0.5 + (0.4 * (batch_end / total_lines))
279
+ progress(current_progress, f"Verarbeitet {batch_end}/{total_lines} Segmente...")
280
 
281
  except Exception as e:
 
282
  for file in audio_files:
283
  if os.path.exists(file):
284
  os.remove(file)
285
+ raise Exception(f"Fehler in Batch-Verarbeitung: {str(e)}")
286
 
287
  combined_audio = await self.combine_audio_files(audio_files, progress)
288
  return combined_audio
289
 
290
+ async def process_input(input_text: str, input_file, language: str,
291
+ speaker1: str, speaker2: str, speaker3: str, speaker4: str,
292
+ duration_minutes: int, api_key: str = "", progress=None) -> str:
293
  start_time = time.time()
294
 
295
  voice_names = {
296
+ "Andrew - Englisch (USA)": "en-US-AndrewMultilingualNeural",
297
+ "Ava - Englisch (USA)": "en-US-AvaMultilingualNeural",
298
+ "Brian - Englisch (USA)": "en-US-BrianMultilingualNeural",
299
+ "Emma - Englisch (USA)": "en-US-EmmaMultilingualNeural",
300
+ "Florian - Deutsch (Deutschland)": "de-DE-FlorianMultilingualNeural",
301
+ "Seraphina - Deutsch (Deutschland)": "de-DE-SeraphinaMultilingualNeural",
302
+ "Remy - Französisch (Frankreich)": "fr-FR-RemyMultilingualNeural",
303
+ "Vivienne - Französisch (Frankreich)": "fr-FR-VivienneMultilingualNeural"
304
  }
305
 
306
  speaker1 = voice_names[speaker1]
307
  speaker2 = voice_names[speaker2]
308
+ speaker3 = voice_names[speaker3]
309
+ speaker4 = voice_names[speaker4]
310
 
311
  try:
312
  if progress:
313
+ progress(0.05, "Verarbeite Eingabe...")
314
 
315
  if not api_key:
316
  api_key = os.getenv("GENAI_API_KEY")
317
  if not api_key:
318
+ raise Exception("Kein API-Schlüssel angegeben.")
319
 
320
+ meeting_generator = MeetingGenerator()
321
+ meeting = await meeting_generator.generate_meeting(
322
+ input_text, language, speaker1, speaker2, speaker3, speaker4, duration_minutes, api_key, input_file, progress
323
+ )
324
 
325
  end_time = time.time()
326
+ print(f"Generierungsdauer: {end_time - start_time:.2f} Sekunden")
327
+ return meeting
328
 
329
  except Exception as e:
 
330
  error_msg = str(e)
331
  if "rate limit" in error_msg.lower():
332
+ raise Exception("Ratenlimit überschritten.")
333
  elif "timeout" in error_msg.lower():
334
+ raise Exception("Zeitüberschreitung bei der Anfrage.")
335
  else:
336
+ raise Exception(f"Fehler: {error_msg}")
337
 
338
  # Gradio UI
339
+ def generate_meeting_gradio(input_text, input_file, language,
340
+ speaker1, speaker2, speaker3, speaker4,
341
+ duration_minutes, api_key, progress=gr.Progress()):
342
  file_obj = None
343
  if input_file is not None:
344
  file_obj = input_file
345
 
 
346
  def progress_callback(value, text):
347
  progress(value, text)
348
 
 
349
  result = asyncio.run(process_input(
350
  input_text,
351
  file_obj,
352
  language,
353
  speaker1,
354
  speaker2,
355
+ speaker3,
356
+ speaker4,
357
+ duration_minutes,
358
  api_key,
359
  progress_callback
360
  ))
 
362
  return result
363
 
364
  def main():
 
365
  language_options = [
366
  "Auto Detect",
367
+ "Deutsch", "Englisch", "Französisch", "Spanisch", "Italienisch", "Japanisch", "Chinesisch"
 
 
 
 
 
 
 
 
 
 
 
 
368
  ]
369
 
 
370
  voice_options = [
371
+ "Andrew - Englisch (USA)",
372
+ "Ava - Englisch (USA)",
373
+ "Brian - Englisch (USA)",
374
+ "Emma - Englisch (USA)",
375
+ "Florian - Deutsch (Deutschland)",
376
+ "Seraphina - Deutsch (Deutschland)",
377
+ "Remy - Französisch (Frankreich)",
378
+ "Vivienne - Französisch (Frankreich)"
379
  ]
380
 
381
+ with gr.Blocks(title="Meeting Generator 🤝") as demo:
382
+ gr.Markdown("# Meeting Generator 🤝")
383
+ gr.Markdown("Generieren Sie realistische 4-Personen-Besprechungen!")
 
384
 
385
  with gr.Row():
386
  with gr.Column(scale=2):
387
+ input_text = gr.Textbox(label="Diskussionsthema", lines=5, placeholder="Thema eingeben...")
 
388
  with gr.Column(scale=1):
389
+ input_file = gr.File(label="Unterstützendes Dokument (PDF/TXT)", file_types=[".pdf", ".txt"])
390
 
391
  with gr.Row():
392
  with gr.Column():
393
+ api_key = gr.Textbox(label="Gemini API Key", type="password")
394
+ language = gr.Dropdown(label="Sprache", choices=language_options, value="Auto Detect")
395
+ duration_minutes = gr.Slider(label="Gewünschte Dauer (Minuten)", minimum=1, maximum=30, value=5, step=1)
396
 
397
  with gr.Column():
398
+ speaker1 = gr.Dropdown(label="Teilnehmer 1", choices=voice_options, value="Florian - Deutsch (Deutschland)")
399
+ speaker2 = gr.Dropdown(label="Teilnehmer 2", choices=voice_options, value="Seraphina - Deutsch (Deutschland)")
400
+ speaker3 = gr.Dropdown(label="Teilnehmer 3", choices=voice_options, value="Andrew - Englisch (USA)")
401
+ speaker4 = gr.Dropdown(label="Teilnehmer 4", choices=voice_options, value="Ava - Englisch (USA)")
402
 
403
+ generate_btn = gr.Button("Besprechung generieren", variant="primary")
404
+ output_audio = gr.Audio(label="Besprechungsaufnahme", type="filepath", format="wav")
405
 
406
  generate_btn.click(
407
+ fn=generate_meeting_gradio,
408
+ inputs=[input_text, input_file, language, speaker1, speaker2, speaker3, speaker4, duration_minutes, api_key],
409
  outputs=[output_audio]
410
  )
411