import tempfile
import os
import uuid
import time
import subprocess

import openai
import whisper
from ffmpy import FFmpeg
import gradio as gr
from elevenlabs import clone, generate, get_api_key, set_api_key

css = """
#col-container{
    margin: 0 auto;
    max-width: 840px;
    text-align: left;
}
"""

# System prompt for GPT-4 (in Chinese): "You are a professional video-subtitle
# translator. Translate the text below into {{target_lang}}; keep numbers and
# line breaks, do not invent content, and output nothing except the translation."
default_prompt = '你是一个专业的视频字幕翻译。请翻译下面的文本到{{target_lang}},注意保留数字和换行符,请勿自行创建内容,除了翻译,不要输出任何其他文本。'

openai.api_type = 'azure'
openai.api_base = 'https://tencent-openai01.openai.azure.com'
openai.api_key = '49eb7c2c3acd41f4ac81fef59ceacbba'
openai.api_version = "2023-05-15"
openai.log = "debug"

# *************************#
# 1. Resize the video      #
# 2. Extract the audio     #
# 3. Transcribe the audio  #
# 4. Translate the text    #
# 5. Voice synthesis       #
# 6. Wav2Lip               #
# *************************#

start = time.perf_counter()
model = whisper.load_model("base", download_root='./checkpoints')
end = time.perf_counter()
print('whisper load model time: ', end - start)

set_api_key('05a491535c6526e1fc9fc8e195f2fe25')
print('elevenlabs api key', get_api_key())

# Target-language names in Chinese, as expected by the Chinese system prompt above.
language_mapping = {
    'English': '英语',
    'Spanish': '西班牙语',
    'French': '法语',
    'German': '德语',
    'Italian': '意大利语',
    'Portuguese': '葡萄牙语',
    'Polish': '波兰语',
    'Turkish': '土耳其语',
    'Russian': '俄语',
    'Dutch': '荷兰语',
    'Czech': '捷克语',
    'Arabic': '阿拉伯语',
    'Chinese': '中文普通话'
}


def resize_video(video_source):
    # Placeholder for step 1 (resize); see the hedged ffmpy sketch after infer() below.
    return video_source


def extract_audio(video_source, output_dir='./'):
    output_audio = os.path.join(output_dir, 'output_original_audio.wav')
    ff = FFmpeg(
        inputs={video_source: None},
        outputs={output_audio: '-acodec pcm_s24le -ar 48000 -q:a 0 -map a -y'}
    )
    print('ffmpeg command: ', ff.cmd)
    ff.run()
    return output_audio


def clone_audio(audio_file, audio_text):
    voice = clone(
        name=uuid.uuid4().hex,
        description="",  # Optional
        files=[audio_file])
    print('voice: ', voice)
    audio = generate(text=audio_text, voice=voice,
                     model='eleven_multilingual_v2')
    return audio


# todo
def translate_text(text, target_language):
    target_language_name = language_mapping[target_language]
    chat_completion = openai.ChatCompletion.create(
        engine="gpt-4",
        temperature=0.1,
        max_tokens=2048,
        messages=[
            {"role": "system", "content": default_prompt.replace(
                '{{target_lang}}', target_language_name)},
            {"role": "user", "content": text}])
    # print the completion
    print(chat_completion.choices[0].message.content)
    translated_text = chat_completion.choices[0].message.content
    return translated_text


def infer(video_source, target_language):
    print('video_source: ', video_source)
    # check the video format
    # Create a temporary directory to store the output file
    output_dir = tempfile.mkdtemp()
    output_video_file = os.path.join(output_dir, 'output_video.mp4')
    print("Output file: ", output_video_file)

    output_audio = extract_audio(video_source, output_dir=output_dir)
    result = model.transcribe(output_audio)
    whisper_text = result["text"]
    whisper_language = result['language']
    print("Whisper text: ", whisper_text, whisper_language)

    target_language_code = language_mapping[target_language]
    print("Target language code: ", target_language_code)
    translated_text = translate_text(whisper_text, target_language)
    print("Translated text: ", translated_text)

    # Clone the voice and synthesize the translated speech
    audio = clone_audio(output_audio, translated_text)
    audio_file = os.path.join(output_dir, 'output_clone_audio.wav')
    with open(audio_file, 'wb') as f:
        f.write(audio)

    # Lip-sync the synthesized audio onto the video with Wav2Lip
    wav2lip = (f"python inference.py --checkpoint_path 'checkpoints/wav2lip_gan.pth' "
               f"--face '{video_source}' --audio '{audio_file}' "
               f"--resize_factor 1 --nosmooth --outfile '{output_video_file}'")
    subprocess.run(wav2lip, shell=True, stdout=subprocess.PIPE)
    print("Video conversion successful.")
    return output_video_file
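

# The resize step (step 1 above) is currently a pass-through. Below is a minimal
# sketch of what it could look like with ffmpy, mirroring extract_audio(); the
# function name, the 1280-pixel target width, and the output filename are
# assumptions for illustration, and nothing here is wired into infer().
def resize_video_sketch(video_source, output_dir='./', width=1280):
    resized_video = os.path.join(output_dir, 'output_resized_video.mp4')
    ff = FFmpeg(
        inputs={video_source: None},
        # Scale to the given width, keep the aspect ratio with an even height,
        # and copy the audio stream unchanged.
        outputs={resized_video: f'-vf scale={width}:-2 -c:a copy -y'}
    )
    print('ffmpeg command: ', ff.cmd)
    ff.run()
    return resized_video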


# Gradio UI
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.Markdown("""
This is a demo for AI video translation: the source audio is extracted and transcribed with Whisper, translated with GPT-4, re-voiced with an ElevenLabs voice clone, and lip-synced back onto the video with Wav2Lip.
        """)
        with gr.Row():
            with gr.Column():
                video_source = gr.Video(
                    label="Source Video", show_label=True, interactive=True)
                target_language = gr.Dropdown(
                    choices=["English", "Spanish", "French", "German",
                             "Italian", "Portuguese", "Polish", "Turkish",
                             "Russian", "Dutch", "Czech", "Arabic", "Chinese"],
                    label="Target language",
                    info="Target language!",
                    value="English")
                submit_btn = gr.Button(value="Submit")
            with gr.Column():
                result = gr.Video(label="Result")
        with gr.Row():
            gr.Examples(
                label="Video Examples",
                examples=['dictator.mp4'],
                inputs=[video_source]
            )
        submit_btn.click(
            infer, inputs=[video_source, target_language], outputs=result)

demo.queue(5).launch()