import os

try:
    from moviepy.editor import VideoFileClip, AudioFileClip, concatenate_audioclips
    print("Moviepy importado com sucesso!")
except ModuleNotFoundError:
    print("Erro: 'moviepy' não está instalado. Adicione 'moviepy' ao requirements.txt ou instale com 'pip install moviepy'.")
    raise SystemExit

from transformers import WhisperProcessor, WhisperForConditionalGeneration, M2M100Tokenizer, M2M100ForConditionalGeneration
from transformers import AutoProcessor, AutoModelForTextToSpeech
import librosa
import soundfile as sf
import numpy as np
import streamlit as st

# --- Model loading -----------------------------------------------------------
# Runs once at import time; the checkpoints are downloaded/cached by
# `from_pretrained`, so the first start is slow.
whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-small")
whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
m2m_tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
m2m_model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
bark_processor = AutoProcessor.from_pretrained("suno/bark-small")
bark_model = AutoModelForTextToSpeech.from_pretrained("suno/bark-small")

# Bark synthesizes audio at 24 kHz; used for duration math and WAV writing.
BARK_SAMPLE_RATE = 24000


def adjust_speed(audio, original_duration, target_duration, sample_rate=24000):
    """Time-stretch *audio* so its length changes from *original_duration*
    towards *target_duration* seconds (pitch is preserved by librosa).

    Returns *audio* unchanged when either duration is non-positive, which
    would otherwise make the stretch rate undefined (division by zero).
    """
    if target_duration <= 0 or original_duration <= 0:
        return audio
    rate = original_duration / target_duration
    return librosa.effects.time_stretch(audio, rate=rate)


def process_video(video_path):
    """Replace the audio of *video_path* with a Portuguese-dubbed track.

    Pipeline: extract audio -> Whisper transcription with word offsets ->
    M2M100 translation (en -> pt) per segment -> Bark TTS per segment ->
    time-stretch each clip to its original span -> mux back into the video.

    Returns the output video path on success, or an error message string
    (starting with "Erro") on failure — callers distinguish the two with
    ``os.path.exists(result)``.
    """
    try:
        video = VideoFileClip(video_path)
        original_duration = video.duration

        # Extract the original track to a temp WAV, resampled to 16 kHz
        # (the rate Whisper expects).
        video.audio.write_audiofile("temp_audio.wav")
        audio_data, sample_rate = librosa.load("temp_audio.wav", sr=16000)

        input_features = whisper_processor(
            audio_data, sampling_rate=sample_rate, return_tensors="pt"
        ).input_features
        predicted_ids = whisper_model.generate(
            input_features, return_timestamps=True, language="en"
        )
        # NOTE(review): decode() normally returns a plain string; whether
        # `output_word_offsets=True` makes it return a dict with
        # "word_offsets" depends on the installed transformers version —
        # confirm against the release in requirements.txt.
        transcription = whisper_processor.decode(
            predicted_ids[0], skip_special_tokens=True, output_word_offsets=True
        )
        segments = transcription.get("word_offsets", [])
        detected_lang = "en"  # source language is fixed to English here

        # Translate each timed segment to Portuguese.
        translated_segments = []
        for segment in segments:
            text = segment.get("word", "")
            # Offsets/lengths assumed to be in milliseconds — TODO confirm.
            start = segment.get("offset", 0) / 1000
            end = start + segment.get("length", 0) / 1000
            if not text.strip():
                continue
            m2m_tokenizer.src_lang = detected_lang
            inputs = m2m_tokenizer(text, return_tensors="pt")
            translated_ids = m2m_model.generate(
                **inputs, forced_bos_token_id=m2m_tokenizer.get_lang_id("pt")
            )
            translated_text = m2m_tokenizer.decode(translated_ids[0], skip_special_tokens=True)
            translated_segments.append({"text": translated_text, "start": start, "end": end})

        # Synthesize Portuguese speech and fit each clip into the time span
        # the original words occupied.
        speech_segments = []
        for segment in translated_segments:
            inputs = bark_processor(segment["text"], return_tensors="pt")
            # Bark returns a (1, n_samples) tensor; squeeze to 1-D so the
            # sample count below is real. (The original took len() of the
            # 2-D array, which is always 1, breaking the duration math.)
            speech = bark_model.generate(**inputs).cpu().numpy().squeeze()
            generated_duration = len(speech) / BARK_SAMPLE_RATE
            target_duration = segment["end"] - segment["start"]
            speech_segments.append(adjust_speed(speech, generated_duration, target_duration))

        # np.concatenate raises on an empty list; fail with a clear message.
        if not speech_segments:
            return "Erro ao processar o vídeo: nenhuma fala detectada no áudio."

        full_speech = np.concatenate(speech_segments)
        sf.write("new_audio.wav", full_speech, BARK_SAMPLE_RATE)

        new_audio = AudioFileClip("new_audio.wav")
        if new_audio.duration < original_duration:
            # Pad the dubbed track with silence so it matches the video.
            # Regenerate silence.wav if missing — create_silence_file() only
            # runs at startup.
            if not os.path.exists("silence.wav"):
                create_silence_file()
            silence = AudioFileClip("silence.wav").set_duration(
                original_duration - new_audio.duration
            )
            final_audio = concatenate_audioclips([new_audio, silence])
        else:
            final_audio = new_audio.set_duration(original_duration)

        final_video = video.set_audio(final_audio)
        output_path = "output_video.mp4"
        final_video.write_videofile(output_path)
        return output_path
    except Exception as e:
        # The Streamlit UI shows this string via st.error().
        return f"Erro ao processar o vídeo: {str(e)}"
    finally:
        # Clean up intermediates even when processing fails. silence.wav is
        # deliberately kept: the original deleted it here, but it is created
        # only once at startup, so every run after the first crashed when
        # padding was needed.
        for temp_file in ("temp_audio.wav", "new_audio.wav"):
            if os.path.exists(temp_file):
                os.remove(temp_file)


def create_silence_file():
    """Write a one-second 24 kHz mono silence clip used as padding."""
    sf.write("silence.wav", np.zeros(BARK_SAMPLE_RATE), BARK_SAMPLE_RATE)


def main():
    """Streamlit UI: upload a video, dub it, preview and offer the download."""
    st.title("Conversor de Áudio de Vídeo para Português")
    st.write("Carregue um vídeo e obtenha uma versão com o áudio traduzido para português usando vozes realistas.")
    uploaded_file = st.file_uploader("Escolha um vídeo", type=["mp4", "avi", "mov"])
    if uploaded_file is not None:
        with open("input_video.mp4", "wb") as f:
            f.write(uploaded_file.read())
        st.write("Processando...")
        result = process_video("input_video.mp4")
        # process_video returns a file path on success, an error string otherwise.
        if os.path.exists(result):
            st.video(result)
            with open(result, "rb") as file:
                st.download_button("Baixar vídeo traduzido", file, file_name="output_video.mp4")
        else:
            st.error(result)


if __name__ == "__main__":
    create_silence_file()
    main()