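"""Gradio demo: transcribe an uploaded audio file with either a fine-tuned
Yoruba ASR model or OpenAI Whisper, then run pyannote speaker diarization."""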
import os

import gradio as gr
import whisper
from transformers import pipeline
from pyannote.audio import Pipeline

# Access token for the gated pyannote/speaker-diarization-3.1 model.
# The original code referenced HF_TOKEN without defining it; here it is
# assumed to be provided via the environment.
HF_TOKEN = os.getenv("HF_TOKEN")


def load_models(model_size):
    """Return (transcriber, whisper_model); exactly one of the two is None."""
    if model_size == "transcriber":
        # Fine-tuned Yoruba ASR model served through a transformers pipeline.
        transcriber = pipeline(
            "automatic-speech-recognition",
            model="clinifyemr/yoruba-model-finetuned",
        )
        return transcriber, None
    # Otherwise interpret model_size as a Whisper checkpoint name.
    model = whisper.load_model(model_size)
    return None, model

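# For example, load_models("base") returns (None, <Whisper model>), while
# load_models("transcriber") returns (<transformers pipeline>, None).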
def process_audio(audio_file, num_speakers, model_size):
    """Transcribe an audio file and diarize its speakers.

    Returns (transcription_text, segments), or (None, None) on failure.
    """
    if audio_file is None:
        return None, None

    transcriber, whisper_model = load_models(model_size)

    try:
        # gr.Audio(type="filepath") passes a path on disk, so both backends
        # can read the file directly; no temporary copy is needed.
        if transcriber:
            result = transcriber(audio_file)
        else:
            result = whisper_model.transcribe(audio_file)
        transcription_text = result["text"]

        # Speaker diarization. The dropdown supplies an exact speaker count,
        # so it is passed as num_speakers rather than min/max bounds.
        diarization_pipeline = Pipeline.from_pretrained(
            "pyannote/speaker-diarization-3.1", use_auth_token=HF_TOKEN
        )
        diarization = diarization_pipeline(audio_file, num_speakers=num_speakers)

        # Convert the pyannote Annotation into a JSON-serializable list of
        # speaker turns (the Annotation object itself cannot be serialized).
        segments = [
            {"start": turn.start, "end": turn.end, "speaker": speaker}
            for turn, _, speaker in diarization.itertracks(yield_label=True)
        ]
        return transcription_text, segments

    except Exception as e:
        print(f"Error processing audio file: {e}")
        return None, None
def gradio_interface(audio_file, num_speakers, model_size):
    transcription, diarization = process_audio(audio_file, num_speakers, model_size)
    if transcription is None or diarization is None:
        # The second output feeds a gr.JSON component, so return a dict.
        return "Error in processing audio file", {"error": "No diarization result"}
    return transcription, diarization
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Audio(type="filepath", label="Upload Audio"),
        gr.Dropdown(choices=[1, 2, 3, 4, 5], label="Number of Speakers"),
        gr.Dropdown(
            choices=["base", "small", "medium", "large", "transcriber"],
            label="Model Selection",
        ),
    ],
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.JSON(label="Diarization Output"),
    ],
    title="Audio Transcription and Speaker Diarization",
    description="Upload your audio file to transcribe and analyze speaker diarization.",
)
if __name__ == "__main__":
    iface.launch()
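# Note: iface.launch(share=True) would also expose a temporary public URL,
# which can be handy when testing from another machine.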