import gradio as gr
import pandas as pd
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer
from keybert import KeyBERT
import os
import tempfile
from crewai import Agent, Task, Crew
from langchain_huggingface import HuggingFaceEndpoint



# === CONFIGURATION ===
HUGGINGFACEHUB_API_TOKEN = os.getenv("HF_API_TOKEN")  # Set this in environment
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"  # Publicly available!

# Setup LLM via the Hugging Face Inference API. Only HuggingFaceEndpoint is
# used here: the langchain_community HuggingFaceHub wrapper is deprecated, and
# the original duplicate definition would have silently overridden this one.
llm = HuggingFaceEndpoint(
    repo_id=MODEL_NAME,
    max_new_tokens=128,
    temperature=0.4,
    huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN
)

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# Local KeyBERT model, loaded for optional keyword extraction; the pipeline
# below currently relies on the LLM-based keyword agent instead.
keyword_extractor = KeyBERT(model="distilbert-base-nli-mean-tokens")
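
# A minimal sketch of a local fallback that could replace the LLM keyword
# agent when the Inference API is unavailable. `keybert_keywords` is a
# hypothetical helper added for illustration; nothing below calls it.
def keybert_keywords(texts, top_n=5):
    doc = " ".join(texts)
    pairs = keyword_extractor.extract_keywords(doc, top_n=top_n)
    return ", ".join(kw for kw, _ in pairs)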

session = {
    "original_df": None,
    "current_df": None,
    "context": "",
    "topic_labels": {},
    "keywords": {},
    "clusters_verified": False
}
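
# Note: `session` is module-level state shared by every concurrent Gradio
# user; per-user isolation would need gr.State or similar.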

# === AGENTS ===
keyword_agent = Agent(
    role='Keyword Analyst',
    goal='Extract top 5 keywords from a group of similar texts',
    backstory="""You are a skilled keyword analyst who identifies patterns in text data.
    You focus on extracting concise, meaningful keywords that represent the core themes.""",
    llm=llm,
    verbose=False
)

labeling_agent = Agent(
    role='Topic Labeler',
    goal='Generate a short label for a group of similar texts based on context',
    backstory="""You are a professional theme summarizer. Given example texts and a user context,
    you generate clear and actionable topic labels.""",
    llm=llm,
    verbose=False
)

validation_agent = Agent(
    role='QA Analyst',
    goal='Evaluate whether the clustered topics and keywords form coherent themes',
    backstory="""You are a quality assurance expert evaluating if generated topics make sense.
    You return 'Approved' or 'Needs Refinement' based on coherence.""",
    llm=llm,
    verbose=False
)

finalizer_agent = Agent(
    role='Data Engineer',
    goal='Prepare final labeled dataset for download',
    backstory="""You finalize the structured output file after approval and ensure it's ready for export.""",
    llm=llm,
    verbose=False
)

# === TASKS ===
def create_tasks(text_samples, context_input):
    extract_keywords_task = Task(
        description=f"Extract 5 most relevant keywords from the following sample texts:\n\n{text_samples}",
        agent=keyword_agent,
        expected_output="Comma-separated list of keywords"
    )

    label_topic_task = Task(
        description=f"Based on the following examples and instruction: '{context_input}', generate a concise topic label.\n\n{text_samples}",
        agent=labeling_agent,
        expected_output="A single line topic label"
    )

    # Template task: run_initial_analysis builds a concrete validation task
    # per cluster with the real label and keywords substituted in, so the
    # {label}/{keywords} placeholders here are illustrative only.
    validate_cluster_task = Task(
        description="Evaluate whether the topic label and keywords make sense together.\n\nLABEL: {label}\nKEYWORDS: {keywords}",
        agent=validation_agent,
        expected_output="'Approved' or 'Needs Refinement'"
    )

    finalize_data_task = Task(
        description="Take the approved labeled DataFrame and format it for download.",
        agent=finalizer_agent,
        expected_output="Final CSV content as string"
    )

    return extract_keywords_task, label_topic_task, validate_cluster_task, finalize_data_task

# === CLUSTERING ===
def cluster_texts(texts, n_clusters=10):
    embeddings = embedding_model.encode(texts, show_progress_bar=False)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    return kmeans.fit_predict(embeddings)
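
# Optional sketch: pick the cluster count from the data rather than the UI
# slider. `suggest_n_clusters` and `candidate_ks` are assumptions added for
# illustration; the app below still uses the slider value.
def suggest_n_clusters(texts, candidate_ks=range(2, 11)):
    from sklearn.metrics import silhouette_score
    embeddings = embedding_model.encode(texts, show_progress_bar=False)
    scores = {
        k: silhouette_score(
            embeddings,
            KMeans(n_clusters=k, random_state=42).fit_predict(embeddings)
        )
        for k in candidate_ks
    }
    return max(scores, key=scores.get)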

# === FULL PIPELINE FUNCTION ===
def run_initial_analysis(csv_file, context_input, n_clusters=10):
    try:
        # gr.File may pass a path string or a tempfile wrapper depending on
        # the Gradio version
        path = csv_file.name if hasattr(csv_file, "name") else csv_file
        df = pd.read_csv(path)
    except Exception as e:
        return f"Error reading CSV: {e}", None, ""

    session['original_df'] = df.copy()
    session['context'] = context_input

    if 'text' not in df.columns:
        return "CSV must contain a column named 'text'", None, ""

    texts = df['text'].tolist()
    clusters = cluster_texts(texts, n_clusters)
    df['cluster'] = clusters

    topic_labels = {}
    keywords_map = {}

    for i in range(n_clusters):
        cluster_texts_i = [texts[j] for j in range(len(clusters)) if clusters[j] == i]
        if not cluster_texts_i:
            continue

        samples = "\n".join(cluster_texts_i[:3])

        # Create CrewAI tasks for this cluster; the validation and
        # finalization tasks are handled separately below
        ext_task, lbl_task, _, _ = create_tasks(samples, context_input)

        # Run keyword extraction
        crew_keyword = Crew(agents=[keyword_agent], tasks=[ext_task])
        keyword_result = crew_keyword.kickoff()
        keywords_map[i] = keyword_result.raw.strip()

        # Run labeling
        crew_label = Crew(agents=[labeling_agent], tasks=[lbl_task])
        label_result = crew_label.kickoff()
        topic_labels[i] = label_result.raw.strip()

    # Assign labels and keywords back to DataFrame
    df['label'] = df['cluster'].map(topic_labels)
    df['keywords'] = df['cluster'].map(keywords_map)

    session['current_df'] = df

    # Validate clusters: a single "Needs Refinement" verdict marks the whole
    # run as unverified
    for cid in topic_labels:
        val_task = Task(
            description=f"Evaluate whether the topic label and keywords make sense together.\n\nLABEL: {topic_labels[cid]}\nKEYWORDS: {keywords_map.get(cid, '')}",
            agent=validation_agent,
            expected_output="'Approved' or 'Needs Refinement'"
        )
        crew_validate = Crew(agents=[validation_agent], tasks=[val_task])
        res = crew_validate.kickoff()
        if "Needs" in res.raw:
            session["clusters_verified"] = False
            break
    else:
        session["clusters_verified"] = True

    # gr.File expects a file path rather than raw CSV text, so write the
    # labeled data to a temporary file for download
    out_path = os.path.join(tempfile.gettempdir(), "labeled_topics.csv")
    df.to_csv(out_path, index=False)

    return "Initial analysis complete!", out_path, df.head(10).to_markdown(index=False)

# === REFINEMENT FUNCTION ===
def refine_labels(feedback_input):
    if session['current_df'] is None:
        return "No data found. Please run initial analysis first.", None, ""

    df = session['current_df']
    current_sample = df[['text', 'label']].head(10).to_markdown(index=False)

    prompt = f"""
You are helping refine topic labels based on user feedback.

Current Labels:
{current_sample}

User Feedback:
{feedback_input}

Task:
Reassign labels accordingly. Keep output format consistent: one label per line.

Instructions:
Return only the revised labels, one per line.
"""

    # Refine labels with a direct LLM call; invoke() replaces the deprecated
    # __call__ interface
    response = llm.invoke(prompt)
    new_labels = [l.strip() for l in response.strip().split('\n') if l.strip()][:len(df)]
    if len(new_labels) < len(df):
        # Pad with existing labels if the LLM returned fewer lines than rows
        new_labels += df['label'].tolist()[len(new_labels):]
    df['label'] = new_labels
    session['current_df'] = df

    out_path = os.path.join(tempfile.gettempdir(), "refined_topics.csv")
    df.to_csv(out_path, index=False)

    return "Labels refined!", out_path, df.head(10).to_markdown(index=False)

# === GRADIO UI ===
with gr.Blocks(title="🧠 CrewAI + Open LLM Topic Modeling") as demo:
    gr.Markdown("# 🎯 CrewAI-Powered Topic Modeling with Open LLMs")
    gr.Markdown("Upload verbatims, get topics via multi-agent system using LLaMA / Mistral / Zephyr.")

    with gr.Row():
        with gr.Column():
            upload = gr.File(label="Upload CSV ('text' column)", file_types=[".csv"])
            context = gr.Textbox(label="Context/Instruction", lines=5, value="Group these into common themes.")
            cluster_slider = gr.Slider(2, 20, value=10, step=1, label="Number of Topics")
            run_btn = gr.Button("Run Initial Analysis")

        with gr.Column():
            feedback = gr.Textbox(label="Feedback / Instructions for Refinement", lines=5)
            refine_btn = gr.Button("Refine Labels")

    status = gr.Textbox(label="Status")
    preview = gr.Textbox(label="First 10 Rows (Editable View)", lines=10)
    download = gr.File(label="Download Final Labeled CSV")

    run_btn.click(fn=run_initial_analysis, inputs=[upload, context, cluster_slider], outputs=[status, download, preview])
    refine_btn.click(fn=refine_labels, inputs=[feedback], outputs=[status, download, preview])
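
# To launch locally, set the Hugging Face token first (the filename app.py is
# an assumption):
#   HF_API_TOKEN=hf_xxx python app.py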

if __name__ == "__main__":
    demo.launch()