import gradio as gr
import pandas as pd
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer
from keybert import KeyBERT
import os
import tempfile
from crewai import Agent, Task, Crew
from langchain_huggingface import HuggingFaceEndpoint



# === CONFIGURATION ===
HUGGINGFACEHUB_API_TOKEN = os.getenv("HF_API_TOKEN")  # Set this in environment
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"  # Publicly available!

# Setup LLM via the Hugging Face Inference API. Only HuggingFaceEndpoint is
# used here: the langchain_community HuggingFaceHub wrapper is deprecated, and
# the original duplicate definition would have silently overridden this one.
llm = HuggingFaceEndpoint(
    repo_id=MODEL_NAME,
    max_new_tokens=128,
    temperature=0.4,
    huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN
)

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# Local KeyBERT model, loaded for optional keyword extraction; the pipeline
# below currently relies on the LLM-based keyword agent instead.
keyword_extractor = KeyBERT(model="distilbert-base-nli-mean-tokens")
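
# A minimal sketch of a local fallback that could replace the LLM keyword
# agent when the Inference API is unavailable. `keybert_keywords` is a
# hypothetical helper added for illustration; nothing below calls it.
def keybert_keywords(texts, top_n=5):
    doc = " ".join(texts)
    pairs = keyword_extractor.extract_keywords(doc, top_n=top_n)
    return ", ".join(kw for kw, _ in pairs)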

session = {
    "original_df": None,
    "current_df": None,
    "context": "",
    "topic_labels": {},
    "keywords": {},
    "clusters_verified": False
}
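
# Note: `session` is module-level state shared by every concurrent Gradio
# user; per-user isolation would need gr.State or similar.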

# === AGENTS ===
keyword_agent = Agent(
    role='Keyword Analyst',
    goal='Extract top 5 keywords from a group of similar texts',
    backstory="""You are a skilled keyword analyst who identifies patterns in text data.
    You focus on extracting concise, meaningful keywords that represent the core themes.""",
    llm=llm,
    verbose=False
)

labeling_agent = Agent(
    role='Topic Labeler',
    goal='Generate a short label for a group of similar texts based on context',
    backstory="""You are a professional theme summarizer. Given example texts and a user context,
    you generate clear and actionable topic labels.""",
    llm=llm,
    verbose=False
)

validation_agent = Agent(
    role='QA Analyst',
    goal='Evaluate whether the clustered topics and keywords form coherent themes',
    backstory="""You are a quality assurance expert evaluating if generated topics make sense.
    You return 'Approved' or 'Needs Refinement' based on coherence.""",
    llm=llm,
    verbose=False
)

finalizer_agent = Agent(
    role='Data Engineer',
    goal='Prepare final labeled dataset for download',
    backstory="""You finalize the structured output file after approval and ensure it's ready for export.""",
    llm=llm,
    verbose=False
)

# === TASKS ===
def create_tasks(text_samples, context_input):
    extract_keywords_task = Task(
        description=f"Extract 5 most relevant keywords from the following sample texts:\n\n{text_samples}",
        agent=keyword_agent,
        expected_output="Comma-separated list of keywords"
    )

    label_topic_task = Task(
        description=f"Based on the following examples and instruction: '{context_input}', generate a concise topic label.\n\n{text_samples}",
        agent=labeling_agent,
        expected_output="A single line topic label"
    )

    # Template task: run_initial_analysis builds a concrete validation task
    # per cluster with the real label and keywords substituted in, so the
    # {label}/{keywords} placeholders here are illustrative only.
    validate_cluster_task = Task(
        description="Evaluate whether the topic label and keywords make sense together.\n\nLABEL: {label}\nKEYWORDS: {keywords}",
        agent=validation_agent,
        expected_output="'Approved' or 'Needs Refinement'"
    )

    finalize_data_task = Task(
        description="Take the approved labeled DataFrame and format it for download.",
        agent=finalizer_agent,
        expected_output="Final CSV content as string"
    )

    return extract_keywords_task, label_topic_task, validate_cluster_task, finalize_data_task

# === CLUSTERING ===
def cluster_texts(texts, n_clusters=10):
    embeddings = embedding_model.encode(texts, show_progress_bar=False)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    return kmeans.fit_predict(embeddings)
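
# Optional sketch: pick the cluster count from the data rather than the UI
# slider. `suggest_n_clusters` and `candidate_ks` are assumptions added for
# illustration; the app below still uses the slider value.
def suggest_n_clusters(texts, candidate_ks=range(2, 11)):
    from sklearn.metrics import silhouette_score
    embeddings = embedding_model.encode(texts, show_progress_bar=False)
    scores = {
        k: silhouette_score(
            embeddings,
            KMeans(n_clusters=k, random_state=42).fit_predict(embeddings)
        )
        for k in candidate_ks
    }
    return max(scores, key=scores.get)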

# === FULL PIPELINE FUNCTION ===
def run_initial_analysis(csv_file, context_input, n_clusters=10):
    try:
        # gr.File may pass a path string or a tempfile wrapper depending on
        # the Gradio version
        path = csv_file.name if hasattr(csv_file, "name") else csv_file
        df = pd.read_csv(path)
    except Exception as e:
        return f"Error reading CSV: {e}", None, ""

    session['original_df'] = df.copy()
    session['context'] = context_input

    if 'text' not in df.columns:
        return "CSV must contain a column named 'text'", None, ""

    texts = df['text'].tolist()
    clusters = cluster_texts(texts, n_clusters)
    df['cluster'] = clusters

    topic_labels = {}
    keywords_map = {}

    for i in range(n_clusters):
        cluster_texts_i = [texts[j] for j in range(len(clusters)) if clusters[j] == i]
        if not cluster_texts_i:
            continue

        samples = "\n".join(cluster_texts_i[:3])

        # Create CrewAI tasks for this cluster; the validation and
        # finalization tasks are handled separately below
        ext_task, lbl_task, _, _ = create_tasks(samples, context_input)

        # Run keyword extraction
        crew_keyword = Crew(agents=[keyword_agent], tasks=[ext_task])
        keyword_result = crew_keyword.kickoff()
        keywords_map[i] = keyword_result.raw.strip()

        # Run labeling
        crew_label = Crew(agents=[labeling_agent], tasks=[lbl_task])
        label_result = crew_label.kickoff()
        topic_labels[i] = label_result.raw.strip()

    # Assign labels and keywords back to DataFrame
    df['label'] = df['cluster'].map(topic_labels)
    df['keywords'] = df['cluster'].map(keywords_map)

    session['current_df'] = df

    # Validate clusters: a single "Needs Refinement" verdict marks the whole
    # run as unverified
    for cid in topic_labels:
        val_task = Task(
            description=f"Evaluate whether the topic label and keywords make sense together.\n\nLABEL: {topic_labels[cid]}\nKEYWORDS: {keywords_map.get(cid, '')}",
            agent=validation_agent,
            expected_output="'Approved' or 'Needs Refinement'"
        )
        crew_validate = Crew(agents=[validation_agent], tasks=[val_task])
        res = crew_validate.kickoff()
        if "Needs" in res.raw:
            session["clusters_verified"] = False
            break
    else:
        session["clusters_verified"] = True

    # gr.File expects a file path rather than raw CSV text, so write the
    # labeled data to a temporary file for download
    out_path = os.path.join(tempfile.gettempdir(), "labeled_topics.csv")
    df.to_csv(out_path, index=False)

    return "Initial analysis complete!", out_path, df.head(10).to_markdown(index=False)

# === REFINEMENT FUNCTION ===
def refine_labels(feedback_input):
    if session['current_df'] is None:
        return "No data found. Please run initial analysis first.", None, ""

    df = session['current_df']
    current_sample = df[['text', 'label']].head(10).to_markdown(index=False)

    prompt = f"""
You are helping refine topic labels based on user feedback.

Current Labels:
{current_sample}

User Feedback:
{feedback_input}

Task:
Reassign labels accordingly. Keep output format consistent: one label per line.

Instructions:
Return only the revised labels, one per line.
"""

    # Refine labels with a direct LLM call; invoke() replaces the deprecated
    # __call__ interface
    response = llm.invoke(prompt)
    new_labels = [l.strip() for l in response.strip().split('\n') if l.strip()][:len(df)]
    if len(new_labels) < len(df):
        # Pad with existing labels if the LLM returned fewer lines than rows
        new_labels += df['label'].tolist()[len(new_labels):]
    df['label'] = new_labels
    session['current_df'] = df

    out_path = os.path.join(tempfile.gettempdir(), "refined_topics.csv")
    df.to_csv(out_path, index=False)

    return "Labels refined!", out_path, df.head(10).to_markdown(index=False)

# === GRADIO UI ===
with gr.Blocks(title="🧠 CrewAI + Open LLM Topic Modeling") as demo:
    gr.Markdown("# 🎯 CrewAI-Powered Topic Modeling with Open LLMs")
    gr.Markdown("Upload verbatims, get topics via multi-agent system using LLaMA / Mistral / Zephyr.")

    with gr.Row():
        with gr.Column():
            upload = gr.File(label="Upload CSV ('text' column)", file_types=[".csv"])
            context = gr.Textbox(label="Context/Instruction", lines=5, value="Group these into common themes.")
            cluster_slider = gr.Slider(2, 20, value=10, step=1, label="Number of Topics")
            run_btn = gr.Button("Run Initial Analysis")

        with gr.Column():
            feedback = gr.Textbox(label="Feedback / Instructions for Refinement", lines=5)
            refine_btn = gr.Button("Refine Labels")

    status = gr.Textbox(label="Status")
    preview = gr.Textbox(label="First 10 Rows (Editable View)", lines=10)
    download = gr.File(label="Download Final Labeled CSV")

    run_btn.click(fn=run_initial_analysis, inputs=[upload, context, cluster_slider], outputs=[status, download, preview])
    refine_btn.click(fn=refine_labels, inputs=[feedback], outputs=[status, download, preview])
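
# To launch locally, set the Hugging Face token first (the filename app.py is
# an assumption):
#   HF_API_TOKEN=hf_xxx python app.py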

if __name__ == "__main__":
    demo.launch()