import gradio as gr import pandas as pd from sklearn.cluster import KMeans from sentence_transformers import SentenceTransformer from keybert import KeyBERT import numpy as np import os import io from crewai import Agent, Task, Crew from langchain_community.llms import HuggingFaceHub from langchain_huggingface import HuggingFaceEndpoint # === CONFIGURATION === HUGGINGFACEHUB_API_TOKEN = os.getenv("HF_API_TOKEN") # Set this in environment MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2" # Publicly available! # Setup LLM via HuggingFace Hub llm = HuggingFaceEndpoint( repo_id=MODEL_NAME, max_length=128, temperature=0.4, token=HUGGINGFACEHUB_API_TOKEN ) # Load embedding model and session state as before... # Setup LLM via HuggingFace Hub llm = HuggingFaceHub( repo_id=MODEL_NAME, huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN, model_kwargs={"temperature": 0.4, "max_new_tokens": 64} ) embedding_model = SentenceTransformer("all-MiniLM-L6-v2") keyword_extractor = KeyBERT(model="distilbert-base-nli-mean-tokens") session = { "original_df": None, "current_df": None, "context": "", "topic_labels": {}, "keywords": {}, "clusters_verified": False } # === AGENTS === keyword_agent = Agent( role='Keyword Analyst', goal='Extract top 5 keywords from a group of similar texts', backstory="""You are a skilled keyword analyst who identifies patterns in text data. You focus on extracting concise, meaningful keywords that represent the core themes.""", llm=llm, verbose=False ) labeling_agent = Agent( role='Topic Labeler', goal='Generate a short label for a group of similar texts based on context', backstory="""You are a professional theme summarizer. Given example texts and a user context, you generate clear and actionable topic labels.""", llm=llm, verbose=False ) validation_agent = Agent( role='QA Analyst', goal='Evaluate whether the clustered topics and keywords form coherent themes', backstory="""You are a quality assurance expert evaluating if generated topics make sense. You return 'Approved' or 'Needs Refinement' based on coherence.""", llm=llm, verbose=False ) finalizer_agent = Agent( role='Data Engineer', goal='Prepare final labeled dataset for download', backstory="""You finalize the structured output file after approval and ensure it's ready for export.""", llm=llm, verbose=False ) # === TASKS === def create_tasks(text_samples, context_input): extract_keywords_task = Task( description=f"Extract 5 most relevant keywords from the following sample texts:\n\n{text_samples}", agent=keyword_agent, expected_output="Comma-separated list of keywords" ) label_topic_task = Task( description=f"Based on the following examples and instruction: '{context_input}', generate a concise topic label.\n\n{text_samples}", agent=labeling_agent, expected_output="A single line topic label" ) validate_cluster_task = Task( description=f"Evaluate whether the topic label and keywords make sense together.\n\nLABEL: {{label}}\nKEYWORDS: {{keywords}}", agent=validation_agent, expected_output="'Approved' or 'Needs Refinement'" ) finalize_data_task = Task( description="Take the approved labeled DataFrame and format it for download.", agent=finalizer_agent, expected_output="Final CSV content as string" ) return extract_keywords_task, label_topic_task, validate_cluster_task, finalize_data_task # === CLUSTERING === def cluster_texts(texts, n_clusters=10): embeddings = embedding_model.encode(texts, show_progress_bar=False) kmeans = KMeans(n_clusters=n_clusters, random_state=42) return kmeans.fit_predict(embeddings) # === FULL PIPELINE FUNCTION === def run_initial_analysis(csv_file, context_input, n_clusters=10): try: df = pd.read_csv(csv_file.name) except Exception as e: return f"Error reading CSV: {str(e)}", "", "" session['original_df'] = df.copy() session['context'] = context_input if 'text' not in df.columns: return "CSV must contain a column named 'text'", "", "" texts = df['text'].tolist() clusters = cluster_texts(texts, n_clusters) df['cluster'] = clusters topic_labels = {} keywords_map = {} for i in range(n_clusters): cluster_texts_i = [texts[j] for j in range(len(clusters)) if clusters[j] == i] if not cluster_texts_i: continue samples = "\n".join(cluster_texts_i[:3]) # Create CrewAI Tasks for this cluster ext_task, lbl_task, val_task, _ = create_tasks(samples, context_input) # Run keyword extraction crew_keyword = Crew(agents=[keyword_agent], tasks=[ext_task]) keyword_result = crew_keyword.kickoff() keywords_map[i] = keyword_result.raw.strip() # Run labeling crew_label = Crew(agents=[labeling_agent], tasks=[lbl_task]) label_result = crew_label.kickoff() topic_labels[i] = label_result.raw.strip() # Assign labels and keywords back to DataFrame df['label'] = df['cluster'].map(topic_labels) df['keywords'] = df['cluster'].map(keywords_map) session['current_df'] = df # Validate Clusters validation_prompts = [] for cid in topic_labels: val_task = Task( description=f"Evaluate whether the topic label and keywords make sense together.\n\nLABEL: {topic_labels[cid]}\nKEYWORDS: {keywords_map.get(cid, '')}", agent=validation_agent, expected_output="'Approved' or 'Needs Refinement'" ) crew_validate = Crew(agents=[validation_agent], tasks=[val_task]) res = crew_validate.kickoff() if "Needs" in res.raw: session["clusters_verified"] = False break else: session["clusters_verified"] = True output = io.StringIO() df.to_csv(output, index=False) csv_str = output.getvalue() return "Initial analysis complete!", csv_str, df.head(10).to_markdown(index=False) # === REFINEMENT FUNCTION === def refine_labels(feedback_input): if session['current_df'] is None: return "No data found. Please run initial analysis first.", "", "" df = session['current_df'] current_sample = df[['text', 'label']].head(10).to_markdown(index=False) prompt = f""" You are helping refine topic labels based on user feedback. Current Labels: {current_sample} User Feedback: {feedback_input} Task: Reassign labels accordingly. Keep output format consistent: one label per line. Instructions: Return only the revised labels, one per line. """ # Simulating refinement using the same LLM response = llm(prompt) new_labels = response.strip().split('\n')[:len(df)] df['label'] = new_labels[:len(df)] session['current_df'] = df output = io.StringIO() df.to_csv(output, index=False) csv_str = output.getvalue() return "Labels refined!", csv_str, df.head(10).to_markdown(index=False) # === GRADIO UI === with gr.Blocks(title="🧠 CrewAI + Open LLM Topic Modeling") as demo: gr.Markdown("# 🎯 CrewAI-Powered Topic Modeling with Open LLMs") gr.Markdown("Upload verbatims, get topics via multi-agent system using LLaMA / Mistral / Zephyr.") with gr.Row(): with gr.Column(): upload = gr.File(label="Upload CSV ('text' column)", file_types=[".csv"]) context = gr.Textbox(label="Context/Instruction", lines=5, value="Group these into common themes.") cluster_slider = gr.Slider(2, 20, value=10, step=1, label="Number of Topics") run_btn = gr.Button("Run Initial Analysis") with gr.Column(): feedback = gr.Textbox(label="Feedback / Instructions for Refinement", lines=5) refine_btn = gr.Button("Refine Labels") status = gr.Textbox(label="Status") preview = gr.Textbox(label="First 10 Rows (Editable View)", lines=10) download = gr.File(label="Download Final Labeled CSV") run_btn.click(fn=run_initial_analysis, inputs=[upload, context, cluster_slider], outputs=[status, download, preview]) refine_btn.click(fn=refine_labels, inputs=[feedback], outputs=[status, download, preview]) if __name__ == "__main__": demo.launch()