import gradio as gr
import whisper
from pyannote.audio import Pipeline
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import torch

# Load Whisper model for transcription ("large-v2" gives the best accuracy; use a smaller model such as "medium" if memory is limited)
whisper_model = whisper.load_model("large-v2")

# Load translation pipeline
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ar-en")

# Load summarization pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Load LLaMA model and tokenizer for chat-based interaction
llama_model_name = "meta-llama/Llama-2-7b-chat-hf"  # the "-hf" repo is the Transformers-compatible checkpoint (gated; requires Hub access)
tokenizer = AutoTokenizer.from_pretrained(llama_model_name)
model = AutoModelForCausalLM.from_pretrained(llama_model_name)

# Global variables to store processed data
aligned_transcription = []
translated_text = ""
topics = []
summary = ""

def perform_speaker_diarization(audio_path, hf_token="YOUR_HUGGINGFACE_TOKEN"):
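    """Run pyannote speaker diarization on the audio file and return a list of
    {"start", "end", "speaker"} segments."""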
    # Load the speaker diarization pipeline (renamed to avoid shadowing transformers.pipeline)
    # Adjust the version tag to match your installed pyannote.audio release
    diarization_pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization@2.1", use_auth_token=hf_token
    )
    
    # Apply diarization to the audio file
    diarization = diarization_pipeline(audio_path)
    
    # Extract speaker segments
    speaker_segments = []
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        speaker_segments.append({
            "start": turn.start,
            "end": turn.end,
            "speaker": speaker
        })
    return speaker_segments

def transcribe_with_speaker_diarization(audio_path, hf_token="YOUR_HUGGINGFACE_TOKEN"):
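    """Transcribe the audio with Whisper and label each transcription segment
    with the speaker identified by pyannote diarization."""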
    # Step 1: Perform speaker diarization
    speaker_segments = perform_speaker_diarization(audio_path, hf_token)
    
    # Step 2: Transcribe audio
    transcription = whisper_model.transcribe(audio_path)
    
    # Step 3: Align transcription with speaker segments
    aligned_transcription = []
    for segment in transcription["segments"]:
        start_time = segment["start"]
        end_time = segment["end"]
        text = segment["text"]
        
        # Find the corresponding speaker
        speaker = "Unknown"
        for spk_segment in speaker_segments:
            if spk_segment["start"] <= start_time <= spk_segment["end"]:
                speaker = spk_segment["speaker"]
                break
        
        aligned_transcription.append({
            "speaker": speaker,
            "start": start_time,
            "end": end_time,
            "text": text
        })
    
    return aligned_transcription

def translate_text(text, src_lang="ar", tgt_lang="en"):
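    """Translate text to English. The loaded Helsinki-NLP model is fixed to
    Arabic -> English, so src_lang/tgt_lang are informational only; max_length
    caps the length of the generated translation."""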
    translated = translator(text, max_length=400)
    return translated[0]["translation_text"]

def perform_topic_modeling(texts):
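    """Fit BERTopic on a list of documents and return the topic-info DataFrame
    plus an interactive topic visualization."""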
    vectorizer = CountVectorizer(stop_words="english")
    topic_model = BERTopic(vectorizer_model=vectorizer, calculate_probabilities=True)
    topics, probs = topic_model.fit_transform(texts)
    return topic_model.get_topic_info(), topic_model.visualize_topics()

def summarize_text(text, max_length=150, min_length=30):
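    """Summarize text with the BART summarization pipeline, truncating inputs
    that exceed the model's maximum input length."""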
    summary = summarizer(text, max_length=max_length, min_length=min_length, do_sample=False, truncation=True)
    return summary[0]["summary_text"]

def generate_response(prompt, max_tokens=150):
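    """Generate a completion for the prompt with the LLaMA-2 chat model."""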
    inputs = tokenizer(prompt, return_tensors="pt")
    # max_new_tokens caps only the generated continuation (max_length would also count the prompt tokens)
    outputs = model.generate(**inputs, max_new_tokens=max_tokens)
    # Decode only the newly generated tokens, skipping the echoed prompt
    response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
    return response

def process_audio(audio_path, language="auto", hf_token="YOUR_HUGGINGFACE_TOKEN"):
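    """Full pipeline: diarize and transcribe, translate to English if needed,
    extract topics, and summarize. Results are stored in module-level globals."""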
    global aligned_transcription, translated_text, topics, summary
    
    # Step 1: Transcribe audio with speaker diarization
    aligned_transcription = transcribe_with_speaker_diarization(audio_path, hf_token)
    
    # Step 2: Translate text if needed
    full_text = " ".join([seg["text"] for seg in aligned_transcription])
    if language != "en":
        translated_text = translate_text(full_text, src_lang="ar", tgt_lang="en")
    else:
        translated_text = full_text
    
    # Step 3: Perform topic modeling (BERTopic clusters documents, so split the text into sentences rather than passing one string)
    sentences = [s.strip() for s in translated_text.split(".") if s.strip()]
    topics, _ = perform_topic_modeling(sentences)
    
    # Step 4: Summarize text
    summary = summarize_text(translated_text)
    
    return "Audio processed successfully!"

def answer_question(query):
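    """Answer a user question with the LLaMA model, using the stored
    transcription, topics, and summary as context."""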
    global aligned_transcription, translated_text, topics, summary
    
    # Refuse to answer before an audio file has been processed (topics/summary are still empty)
    if not aligned_transcription:
        return "Please process an audio file first."
    
    # Combine context for the LLM
    context = f"""
    Transcription: {translated_text}
    Topics: {topics.to_string(index=False)}
    Summary: {summary}
    """
    
    # Generate response using LLM
    response = generate_response(f"{context}\nQuestion: {query}")
    return response

# Gradio Interface
with gr.Blocks() as demo:
    gr.Markdown("# Advanced Audio Analysis App with Speaker Diarization")
    audio_input = gr.Audio(type="filepath", label="Upload Audio File")  # filepath so Whisper and pyannote receive a path on disk
    language_input = gr.Dropdown(choices=["auto", "en", "ar"], label="Language", value="auto")
    hf_token_input = gr.Textbox(label="Hugging Face Token (for pyannote.audio)", type="password")
    process_button = gr.Button("Process Audio")
    status_output = gr.Textbox(label="Status")
    
    question_input = gr.Textbox(label="Ask a Question")
    answer_output = gr.Textbox(label="Answer")
    
    process_button.click(
        process_audio,
        inputs=[audio_input, language_input, hf_token_input],
        outputs=status_output
    )
    question_input.submit(answer_question, inputs=question_input, outputs=answer_output)

demo.launch()