Update app.py
app.py
CHANGED
@@ -4,135 +4,244 @@ import tempfile
 import torch
 import numpy as np
 import datetime
 import whisper
-from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
 from pyannote.audio import Audio
 from pyannote.core import Segment
 from sklearn.cluster import AgglomerativeClustering
 import gradio as gr
 import warnings

 warnings.filterwarnings("ignore", category=UserWarning)

-# --- Configuración de Modelos
 DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

-# Cargar Whisper Small (multilingüe, más ligero)
-print("Cargando modelo Whisper small...")
-whisper_model = whisper.load_model("small", device=DEVICE)
-print("Modelo Whisper cargado.")
-
-# Cargar modelo de embeddings de hablante
-print("Cargando modelo de embeddings...")
-embedding_model = PretrainedSpeakerEmbedding(
-    "speechbrain/spkrec-ecapa-voxceleb",
-    device=DEVICE
-)
 audio_processor = Audio()
-print("Modelos cargados.")

 def time(secs):
     return datetime.timedelta(seconds=round(secs))

 def convert_to_wav(input_path):
-    """Convierte cualquier audio a WAV usando ffmpeg."""
     if input_path.lower().endswith('.wav'):
-[…]

-[…]
     return output_path

 def get_duration(path):
     import soundfile as sf
-[…]

 def segment_embedding(path, segment, duration):
     start = segment["start"]
     end = min(duration, segment["end"])
     clip = Segment(start, end)
-[…]

 def transcribe_and_diarize(audio_file, num_speakers):
     """Función principal de transcripción y diarización."""
-[…]
-    yield status_update, ""
-
-    # Transcribir en español
-    result = whisper_model.transcribe(wav_path, language='es', task='transcribe', verbose=False)
-    segments = result["segments"]

-[…]

-[…]
-    num_speakers = min(num_speakers, len(segments))
-
-    if len(segments) <= 1:
-        segments[0]['speaker'] = 'HABLANTE 1'
-    else:
-        status_update += " -> Extrayendo embeddings de audio...\n"
     yield status_update, ""

-[…]

-[…]
     yield status_update, ""

-[…]
-    status_update += "5. Generando transcripción final...\n"
-    yield status_update, ""
-
-    output_text = ""
-    for (i, segment) in enumerate(segments):
-        if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
-            if i != 0:
-                output_text += '\n\n'
-            output_text += f"{segment['speaker']} [{time(segment['start'])}]\n\n"
-        output_text += segment["text"].strip() + ' '

-[…]

 # --- Interfaz Gradio ---
 with gr.Blocks(title="Diarización de Audio en Español") as demo:
     gr.Markdown("# 🎤 Diarización de Audio en Español")
     gr.Markdown("Sube un archivo de audio (hasta 30 minutos) y obtén una transcripción separada por hablantes. Optimizado para español.")

     with gr.Row():
         with gr.Column():
@@ -140,22 +249,21 @@ with gr.Blocks(title="Diarización de Audio en Español") as demo:
             num_speakers = gr.Slider(2, 6, value=3, step=1, label="Número aproximado de hablantes")
             run_button = gr.Button("🚀 Iniciar Diarización")
         with gr.Column():
-            status_output = gr.Textbox(label="Estado", interactive=False, lines=10)
             text_output = gr.Textbox(label="Transcripción con Hablantes", interactive=False, lines=20)

     run_button.click(
         fn=transcribe_and_diarize,
         inputs=[audio_input, num_speakers],
         outputs=[status_output, text_output],
-        queue=True
     )

     gr.Markdown("---")
-    gr.Markdown("**
-[…]
-# if __name__ == "__main__":
-#     demo.launch()

 # Para Hugging Face Spaces
-demo.launch()
 import torch
 import numpy as np
 import datetime
+import gc
 import whisper
 from pyannote.audio import Audio
 from pyannote.core import Segment
 from sklearn.cluster import AgglomerativeClustering
 import gradio as gr
 import warnings
+from huggingface_hub import hf_hub_download

 warnings.filterwarnings("ignore", category=UserWarning)
+warnings.filterwarnings("ignore", category=FutureWarning)

+# --- Configuración de Modelos ---
 DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+print(f"Usando dispositivo: {DEVICE}")
+
+# --- Cargar Whisper (intentar con una versión más reciente si es viable) ---
+WHISPER_MODEL_NAME = "small"  # Empezar con 'small' para Spaces. Probar 'medium' o 'large-v3' si hay recursos.
+try:
+    print(f"Cargando modelo Whisper '{WHISPER_MODEL_NAME}'...")
+    whisper_model = whisper.load_model(WHISPER_MODEL_NAME, device=DEVICE)
+    print(f"Modelo Whisper '{WHISPER_MODEL_NAME}' cargado exitosamente.")
+except Exception as e:
+    print(f"Error cargando Whisper '{WHISPER_MODEL_NAME}': {e}")
+    print("Intentando cargar 'base' como fallback...")
+    WHISPER_MODEL_NAME = "base"
+    whisper_model = whisper.load_model(WHISPER_MODEL_NAME, device=DEVICE)
+    print(f"Modelo Whisper '{WHISPER_MODEL_NAME}' cargado.")
+
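+# Nota: 'small' tiene ~244 M de parámetros; 'medium' (~769 M) y 'large-v3' mejoran la
+# precisión a costa de mucha más memoria y tiempo de inferencia, algo a tener en
+# cuenta si el Space corre solo en CPU.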
+# --- Cargar modelo de embeddings de Pyannote v3.x ---
+# Usar el nuevo modelo de embedding recomendado para pyannote.audio 3.x
+EMBEDDING_MODEL_NAME = "pyannote/embedding"
+EMBEDDING_REVISION = "main"  # O especificar un commit si es necesario
+
+try:
+    print(f"Cargando modelo de embeddings '{EMBEDDING_MODEL_NAME}'...")
+    # Importar el modelo de embedding de pyannote v3
+    from pyannote.audio import Model
+    embedding_model = Model.from_pretrained(
+        EMBEDDING_MODEL_NAME,
+        use_auth_token=False,  # No se necesita token para modelos públicos
+        revision=EMBEDDING_REVISION
+    )
+    embedding_model.to(DEVICE)
+    print(f"Modelo de embeddings '{EMBEDDING_MODEL_NAME}' cargado.")
+except Exception as e:
+    print(f"Error cargando el modelo de embeddings '{EMBEDDING_MODEL_NAME}': {e}")
+    print("Intentando con speechbrain como fallback...")
+    # Fallback al modelo SpeechBrain si el de Pyannote falla
+    try:
+        from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
+        embedding_model = PretrainedSpeakerEmbedding(
+            "speechbrain/spkrec-ecapa-voxceleb",
+            device=DEVICE
+        )
+        print("Modelo de embeddings 'speechbrain/spkrec-ecapa-voxceleb' cargado como fallback.")
+    except Exception as e_fallback:
+        print(f"Error crítico cargando modelo de embeddings: {e_fallback}")
+        raise RuntimeError("No se pudo cargar ningún modelo de embeddings.")

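+# Nota: si "pyannote/embedding" está restringido (gated) en el Hub, la carga anterior
+# fallará sin token y se usará el fallback de SpeechBrain. En ese caso puede pasarse
+# un token, por ejemplo desde un secreto del Space (nombre de variable hipotético):
+#   embedding_model = Model.from_pretrained(EMBEDDING_MODEL_NAME,
+#                                           use_auth_token=os.environ.get("HF_TOKEN"))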
 audio_processor = Audio()

 def time(secs):
     return datetime.timedelta(seconds=round(secs))

 def convert_to_wav(input_path):
+    """Convierte cualquier audio a WAV mono 16kHz usando ffmpeg."""
     if input_path.lower().endswith('.wav'):
+        # Verificar si ya es mono y 16kHz podría ser útil, pero para simplificar, convertimos siempre
+        pass

+    # Usar un nombre temporal seguro
+    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmpfile:
+        output_path = tmpfile.name
+
+    # Comando ffmpeg para convertir a WAV mono 16kHz
+    cmd = f"ffmpeg -y -i '{input_path}' -ac 1 -ar 16000 -acodec pcm_s16le '{output_path}'"
+    print(f"Ejecutando conversión: {cmd}")
+    os.system(cmd)
+
+    if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
+        raise RuntimeError("La conversión a WAV falló o produjo un archivo vacío.")
+
     return output_path

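+# Nota: os.system depende del quoting del shell; una alternativa equivalente
+# (asumiendo, como arriba, que ffmpeg está disponible en el PATH) evita problemas
+# con rutas que contengan comillas o espacios:
+#   import subprocess
+#   subprocess.run(["ffmpeg", "-y", "-i", input_path, "-ac", "1", "-ar", "16000",
+#                   "-acodec", "pcm_s16le", output_path], check=True)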
 def get_duration(path):
     import soundfile as sf
+    try:
+        info = sf.info(path)
+        return info.duration
+    except Exception as e:
+        print(f"Error obteniendo duración con soundfile: {e}")
+        # Fallback a wave (menos robusto)
+        import wave
+        import contextlib
+        with contextlib.closing(wave.open(path, 'r')) as f:
+            frames = f.getnframes()
+            rate = f.getframerate()
+            return frames / float(rate)

 def segment_embedding(path, segment, duration):
     start = segment["start"]
     end = min(duration, segment["end"])
     clip = Segment(start, end)
+    try:
+        waveform, sample_rate = audio_processor.crop(path, clip)
+        with torch.no_grad():
+            # Para modelos Pyannote v3
+            if hasattr(embedding_model, 'encode'):
+                # Modelos nuevos de pyannote devuelven diccionarios
+                output = embedding_model.encode(waveform[None].to(DEVICE))
+                if isinstance(output, dict) and 'embedding' in output:
+                    embedding = output['embedding']
+                else:
+                    embedding = output
+            else:
+                # Fallback para modelos compatibles con la API antigua o SpeechBrain
+                embedding = embedding_model(waveform[None].to(DEVICE))
+
+        # Asegurar que el embedding sea un tensor y luego numpy
+        if isinstance(embedding, torch.Tensor):
+            return embedding.squeeze().cpu().numpy()
+        else:
+            # Para embeddings que ya son numpy (ej. SpeechBrain wrapper)
+            return np.squeeze(embedding)
+    except Exception as e:
+        print(f"Error extrayendo embedding para segmento {start}-{end}: {e}")
+        # Devolver un embedding de ceros en caso de error
+        return np.zeros(512)  # Ajustar tamaño si se sabe el dim del embedding
+

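+# Nota: el vector de ceros de respaldo debería coincidir con la dimensión real del
+# embedding (p. ej., 192 en el caso de ECAPA-TDNN de SpeechBrain); si difiere de
+# embeddings.shape[1], la asignación embeddings[i] = segment_embedding(...) fallará.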
 def transcribe_and_diarize(audio_file, num_speakers):
     """Función principal de transcripción y diarización."""
+    temp_files = []
+    try:
+        status_update = ""
+
+        # --- 1. Conversión ---
+        status_update += "1. Convirtiendo audio a formato WAV (16kHz, mono)...\n"
+        yield status_update, ""
+
+        wav_path = convert_to_wav(audio_file)
+        temp_files.append(wav_path)  # Para limpieza posterior
+
+        # --- 2. Duración ---
+        status_update += "2. Obteniendo duración del audio...\n"
+        yield status_update, ""
+
+        duration = get_duration(wav_path)
+        if duration > 30 * 60:  # Limitar a 30 minutos
+            yield status_update + "Error: El audio es demasiado largo (máximo 30 minutos).\n", ""
+            return

+        # --- 3. Transcripción ---
+        status_update += f"3. Transcribiendo audio con Whisper (modelo '{WHISPER_MODEL_NAME}')...\n"
+        yield status_update, ""
+
+        # Transcribir en español
+        result = whisper_model.transcribe(wav_path, language='es', task='transcribe', verbose=False)
+        segments = result["segments"]

+        if not segments:
+            yield status_update + "Error: No se detectó habla en el audio.\n", ""
+            return
+
+        # --- 4. Diarización ---
+        status_update += "4. Preparando para diarización...\n"
         yield status_update, ""

+        # Limitar número de hablantes
+        num_speakers = max(2, min(6, int(num_speakers)))
+        num_speakers = min(num_speakers, len(segments))
+
+        if len(segments) <= 1:
+            segments[0]['speaker'] = 'HABLANTE 1'
+            status_update += " -> Solo se detectó 1 segmento de habla. Asignando un hablante.\n"
+        else:
+            status_update += " -> Extrayendo embeddings de audio...\n"
+            yield status_update, ""
+
+            # Determinar la dimensión del embedding con una muestra
+            sample_embedding = segment_embedding(wav_path, segments[0], duration)
+            embedding_dim = sample_embedding.shape[-1] if hasattr(sample_embedding, 'shape') else 512
+            print(f"Dimensión del embedding detectada: {embedding_dim}")

+            embeddings = np.zeros(shape=(len(segments), embedding_dim))
+            for i, segment in enumerate(segments):
+                embeddings[i] = segment_embedding(wav_path, segment, duration)
+            embeddings = np.nan_to_num(embeddings)
+
+            status_update += " -> Agrupando hablantes...\n"
+            yield status_update, ""
+
+            # Clustering
+            clustering = AgglomerativeClustering(n_clusters=num_speakers).fit(embeddings)
+            labels = clustering.labels_
+            for i in range(len(segments)):
+                segments[i]["speaker"] = f'HABLANTE {labels[i] + 1}'
+
+        # --- 5. Formateo de salida ---
+        status_update += "5. Generando transcripción final...\n"
         yield status_update, ""

+        output_text = ""
+        for (i, segment) in enumerate(segments):
+            if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
+                if i != 0:
+                    output_text += '\n\n'
+                output_text += f"{segment['speaker']} [{time(segment['start'])}]\n\n"
+            output_text += segment["text"].strip() + ' '

+        yield status_update + "¡Proceso completado!\n", output_text
+
+    except Exception as e:
+        error_msg = f"Error durante el proceso: {str(e)}"
+        print(error_msg)
+        yield f"Error: {error_msg}\n", ""
+    finally:
+        # Limpiar archivos temporales
+        for f in temp_files:
+            try:
+                os.remove(f)
+                print(f"Archivo temporal eliminado: {f}")
+            except OSError:
+                pass
+        # Liberar memoria GPU/CPU; los modelos globales se mantienen cargados para
+        # poder atender nuevas solicitudes (borrarlos con `del` dentro de la función
+        # provocaría además un UnboundLocalError al usarlos más arriba)
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()

 # --- Interfaz Gradio ---
 with gr.Blocks(title="Diarización de Audio en Español") as demo:
     gr.Markdown("# 🎤 Diarización de Audio en Español")
     gr.Markdown("Sube un archivo de audio (hasta 30 minutos) y obtén una transcripción separada por hablantes. Optimizado para español.")
+    gr.Markdown("**Nota:** Este demo usa modelos ligeros. Para audio con mucho ruido o más de 10 minutos, los resultados pueden ser menos precisos.")

     with gr.Row():
         with gr.Column():
 […]
             num_speakers = gr.Slider(2, 6, value=3, step=1, label="Número aproximado de hablantes")
             run_button = gr.Button("🚀 Iniciar Diarización")
         with gr.Column():
+            status_output = gr.Textbox(label="Estado", interactive=False, lines=10, max_lines=10)
             text_output = gr.Textbox(label="Transcripción con Hablantes", interactive=False, lines=20)

     run_button.click(
         fn=transcribe_and_diarize,
         inputs=[audio_input, num_speakers],
         outputs=[status_output, text_output],
+        queue=True,
+        concurrency_limit=1  # Limitar a 1 ejecución simultánea para evitar sobrecarga
     )

     gr.Markdown("---")
+    gr.Markdown("**Modelos Usados:**\n"
+                "* **Transcripción:** Whisper (`small`)\n"
+                "* **Diarización:** Pyannote.Audio (`pyannote/embedding` o `speechbrain/spkrec-ecapa-voxceleb`)\n")

 # Para Hugging Face Spaces
+demo.launch()
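Since transcribe_and_diarize is a generator, Gradio streams each yielded (status, transcript) pair to the two output Textboxes as the run progresses, and concurrency_limit=1 keeps executions sequential. A minimal sketch of driving the same generator outside the UI (the audio path is only a placeholder):

    for status, transcript in transcribe_and_diarize("ejemplo.wav", num_speakers=2):
        print(status)       # estado parcial acumulado
    print(transcript)       # transcripción final del último yield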