import io
import os
import tempfile

import gradio as gr
import numpy as np
import pandas as pd
from crewai import Agent, Task, Crew
from keybert import KeyBERT
from langchain_community.llms import HuggingFaceHub
from langchain_huggingface import HuggingFaceEndpoint
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans


# === CONFIGURATION ===
# Hugging Face access token, read from the HF_API_TOKEN environment variable
# (os.getenv yields None when the variable is not set).
HUGGINGFACEHUB_API_TOKEN = os.getenv("HF_API_TOKEN")  # Set this in environment
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"  # Publicly available!

# Setup LLM via HuggingFace Hub.
# Exactly one client is built here; it is the `llm` handed to every agent and
# called directly by refine_labels(). Building a second client under the same
# name would only discard the first one, so there is a single construction.
# NOTE(review): HuggingFaceHub is presumably deprecated in favour of
# langchain_huggingface.HuggingFaceEndpoint (imported at the top of the file).
# If migrating, confirm the endpoint's parameter names against its docs
# (likely `max_new_tokens` and `huggingfacehub_api_token`, not `max_length`
# / `token`).
llm = HuggingFaceHub(
    repo_id=MODEL_NAME,
    huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
    model_kwargs={"temperature": 0.4, "max_new_tokens": 64},
)

# Sentence-embedding model; cluster_texts() encodes the input texts with it
# before running KMeans.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# KeyBERT extractor. Not referenced by the code visible in this file.
keyword_extractor = KeyBERT(model="distilbert-base-nli-mean-tokens")

# Module-level state shared by the Gradio callbacks.
session = dict(
    original_df=None,         # copy of the uploaded DataFrame
    current_df=None,          # working DataFrame with cluster/label/keywords
    context="",               # instruction text from the latest analysis run
    topic_labels={},          # presumably cluster id -> label; verify writers
    keywords={},              # presumably cluster id -> keywords; verify writers
    clusters_verified=False,  # result of the validation pass
)

# === AGENTS ===
def _make_agent(role, goal, backstory):
    """Build a non-verbose CrewAI agent bound to the module-level ``llm``."""
    return Agent(
        role=role,
        goal=goal,
        backstory=backstory,
        llm=llm,
        verbose=False,
    )


# Extracts keywords from a sample of texts belonging to one cluster.
keyword_agent = _make_agent(
    role="Keyword Analyst",
    goal="Extract top 5 keywords from a group of similar texts",
    backstory="""You are a skilled keyword analyst who identifies patterns in text data.
    You focus on extracting concise, meaningful keywords that represent the core themes.""",
)

# Produces the short topic label for one cluster.
labeling_agent = _make_agent(
    role="Topic Labeler",
    goal="Generate a short label for a group of similar texts based on context",
    backstory="""You are a professional theme summarizer. Given example texts and a user context,
    you generate clear and actionable topic labels.""",
)

# Judges whether a label and its keywords are coherent.
validation_agent = _make_agent(
    role="QA Analyst",
    goal="Evaluate whether the clustered topics and keywords form coherent themes",
    backstory="""You are a quality assurance expert evaluating if generated topics make sense.
    You return 'Approved' or 'Needs Refinement' based on coherence.""",
)

# Assigned to the "finalize" task built by create_tasks().
finalizer_agent = _make_agent(
    role="Data Engineer",
    goal="Prepare final labeled dataset for download",
    backstory="""You finalize the structured output file after approval and ensure it's ready for export.""",
)

# === TASKS ===
def create_tasks(text_samples, context_input):
    """Build the four CrewAI tasks used for a single cluster.

    Args:
        text_samples: Sample texts from the cluster, already joined into one
            string; embedded verbatim in the keyword and labeling prompts.
        context_input: User instruction embedded in the labeling prompt.

    Returns:
        A 4-tuple of ``Task`` objects in the order
        (extract keywords, label topic, validate cluster, finalize data).
    """
    keyword_prompt = (
        "Extract 5 most relevant keywords from the following sample texts:"
        f"\n\n{text_samples}"
    )
    label_prompt = (
        f"Based on the following examples and instruction: '{context_input}', "
        f"generate a concise topic label.\n\n{text_samples}"
    )
    # `{label}` and `{keywords}` are literal placeholders in this description;
    # nothing in this function substitutes them.
    validation_prompt = (
        "Evaluate whether the topic label and keywords make sense together."
        "\n\nLABEL: {label}\nKEYWORDS: {keywords}"
    )
    finalize_prompt = "Take the approved labeled DataFrame and format it for download."

    return (
        Task(
            description=keyword_prompt,
            agent=keyword_agent,
            expected_output="Comma-separated list of keywords",
        ),
        Task(
            description=label_prompt,
            agent=labeling_agent,
            expected_output="A single line topic label",
        ),
        Task(
            description=validation_prompt,
            agent=validation_agent,
            expected_output="'Approved' or 'Needs Refinement'",
        ),
        Task(
            description=finalize_prompt,
            agent=finalizer_agent,
            expected_output="Final CSV content as string",
        ),
    )

# === CLUSTERING ===
def cluster_texts(texts, n_clusters=10):
    """Embed ``texts`` and assign each one to a KMeans cluster.

    Args:
        texts: Iterable of strings to cluster.
        n_clusters: Requested number of clusters. It is coerced to ``int`` and
            clamped to the range ``1..len(texts)``, because KMeans rejects a
            cluster count larger than the number of samples.

    Returns:
        A NumPy array holding one integer cluster id per input text, in input
        order. Empty input yields an empty integer array.
    """
    texts = list(texts)
    if not texts:
        # Nothing to embed or fit; avoid calling the model / KMeans at all.
        return np.array([], dtype=int)

    effective_k = max(1, min(int(n_clusters), len(texts)))
    embeddings = embedding_model.encode(texts, show_progress_bar=False)
    # Fixed random_state keeps cluster assignments reproducible across runs.
    kmeans = KMeans(n_clusters=effective_k, random_state=42)
    return kmeans.fit_predict(embeddings)

# === FULL PIPELINE FUNCTION ===
def _crew_text(result):
    """Return a crew result as stripped text.

    Uses the result's ``.raw`` attribute when it has one and falls back to
    ``str(result)`` otherwise.
    """
    return str(getattr(result, "raw", result)).strip()


def run_initial_analysis(csv_file, context_input, n_clusters=10):
    """Cluster the uploaded texts, label every cluster, and export the result.

    Args:
        csv_file: Upload from the file input. Accepted as a plain path, as an
            object exposing the path through ``.name``, or as anything else
            ``pandas.read_csv`` can read.
        context_input: User instruction passed to the labeling prompt.
        n_clusters: Requested number of clusters; coerced to ``int`` and
            clamped to the number of rows.

    Returns:
        A ``(status, csv_path, preview)`` tuple. ``csv_path`` is the path of a
        temporary CSV file holding the labeled data (the download output is a
        file component, so it needs a path rather than CSV text) and
        ``preview`` is the first ten rows rendered as Markdown. On a
        validation or read error the tuple is ``(message, None, "")`` and the
        session state is left untouched.
    """
    if csv_file is None:
        return "Please upload a CSV file first.", None, ""

    # The upload may be a plain path or a wrapper exposing the path as `.name`
    # (which form Gradio passes depends on its version / File `type` setting).
    csv_source = getattr(csv_file, "name", csv_file)
    try:
        df = pd.read_csv(csv_source)
    except Exception as e:
        return f"Error reading CSV: {str(e)}", None, ""

    if 'text' not in df.columns:
        return "CSV must contain a column named 'text'", None, ""
    if df.empty:
        return "CSV contains no rows to analyse", None, ""

    # Only record the upload in the session once it is known to be usable.
    session['original_df'] = df.copy()
    session['context'] = context_input

    # Missing cells load as NaN (a float); coerce so embedding and the
    # "\n".join below always receive strings.
    texts = df['text'].fillna("").astype(str).tolist()
    # KMeans needs an integer cluster count no larger than the sample count.
    n_clusters = max(1, min(int(n_clusters), len(texts)))

    clusters = cluster_texts(texts, n_clusters)
    df['cluster'] = clusters

    # Group texts by cluster in one pass instead of rescanning per cluster.
    members = {}
    for text, cluster_id in zip(texts, clusters):
        members.setdefault(int(cluster_id), []).append(text)

    topic_labels = {}
    keywords_map = {}
    for cluster_id in sorted(members):
        # The first three texts of the cluster serve as the prompt sample.
        samples = "\n".join(members[cluster_id][:3])
        ext_task, lbl_task, _, _ = create_tasks(samples, context_input)

        keyword_crew = Crew(agents=[keyword_agent], tasks=[ext_task])
        keywords_map[cluster_id] = _crew_text(keyword_crew.kickoff())

        label_crew = Crew(agents=[labeling_agent], tasks=[lbl_task])
        topic_labels[cluster_id] = _crew_text(label_crew.kickoff())

    # Assign labels and keywords back to DataFrame
    df['label'] = df['cluster'].map(topic_labels)
    df['keywords'] = df['cluster'].map(keywords_map)

    session['current_df'] = df
    session['topic_labels'] = topic_labels
    session['keywords'] = keywords_map

    # Validate clusters; stop at the first one flagged for refinement.
    verified = True
    for cluster_id, label in topic_labels.items():
        val_task = Task(
            description=f"Evaluate whether the topic label and keywords make sense together.\n\nLABEL: {label}\nKEYWORDS: {keywords_map.get(cluster_id, '')}",
            agent=validation_agent,
            expected_output="'Approved' or 'Needs Refinement'"
        )
        verdict = _crew_text(Crew(agents=[validation_agent], tasks=[val_task]).kickoff())
        if "Needs" in verdict:
            verified = False
            break
    session["clusters_verified"] = verified

    # delete=False: the file must outlive this call so it can be downloaded.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".csv", delete=False, newline="", encoding="utf-8"
    ) as handle:
        df.to_csv(handle, index=False)

    return "Initial analysis complete!", handle.name, df.head(10).to_markdown(index=False)

# === REFINEMENT FUNCTION ===
def refine_labels(feedback_input):
    """Ask the LLM to revise the current labels according to user feedback.

    The prompt shows the first ten ``text``/``label`` rows together with the
    feedback. The non-blank lines of the reply are applied positionally to the
    leading rows of the label column; rows beyond the number of returned
    labels keep their existing label, so a short reply no longer raises a
    length-mismatch error.

    Args:
        feedback_input: Free-text feedback / instructions from the user.

    Returns:
        A ``(status, csv_path, preview)`` tuple. ``csv_path`` is the path of a
        temporary CSV file with the updated data and ``preview`` is the first
        ten rows rendered as Markdown. When no analysis has been run yet the
        tuple is ``(message, None, "")``.
    """
    if session['current_df'] is None:
        return "No data found. Please run initial analysis first.", None, ""

    df = session['current_df']
    current_sample = df[['text', 'label']].head(10).to_markdown(index=False)

    prompt = f"""
You are helping refine topic labels based on user feedback.

Current Labels:
{current_sample}

User Feedback:
{feedback_input}

Task:
Reassign labels accordingly. Keep output format consistent: one label per line.

Instructions:
Return only the revised labels, one per line.
"""

    # Simulating refinement using the same LLM
    response = llm.invoke(prompt)

    # One label per non-blank line; never more labels than there are rows.
    revised = [line.strip() for line in str(response).splitlines() if line.strip()]
    revised = revised[:len(df)]

    if revised:
        labels = df['label'].tolist()
        labels[:len(revised)] = revised
        df['label'] = labels
        status = "Labels refined!"
    else:
        status = "The model returned no labels; existing labels were kept."
    session['current_df'] = df

    # delete=False: the file must outlive this call so it can be downloaded.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".csv", delete=False, newline="", encoding="utf-8"
    ) as handle:
        df.to_csv(handle, index=False)

    return status, handle.name, df.head(10).to_markdown(index=False)

# === GRADIO UI ===
# UI definition: one row with two columns (analysis inputs, refinement
# feedback), followed by the status / preview / download outputs that both
# buttons write to.
with gr.Blocks(title="🧠 CrewAI + Open LLM Topic Modeling") as demo:
    gr.Markdown("# 🎯 CrewAI-Powered Topic Modeling with Open LLMs")
    gr.Markdown("Upload verbatims, get topics via multi-agent system using LLaMA / Mistral / Zephyr.")

    with gr.Row():
        # First column: inputs for the initial analysis run.
        with gr.Column():
            upload = gr.File(file_types=[".csv"], label="Upload CSV ('text' column)")
            context = gr.Textbox(
                value="Group these into common themes.",
                lines=5,
                label="Context/Instruction",
            )
            cluster_slider = gr.Slider(
                minimum=2,
                maximum=20,
                step=1,
                value=10,
                label="Number of Topics",
            )
            run_btn = gr.Button("Run Initial Analysis")

        # Second column: feedback used to refine existing labels.
        with gr.Column():
            feedback = gr.Textbox(lines=5, label="Feedback / Instructions for Refinement")
            refine_btn = gr.Button("Refine Labels")

    status = gr.Textbox(label="Status")
    preview = gr.Textbox(lines=10, label="First 10 Rows (Editable View)")
    download = gr.File(label="Download Final Labeled CSV")

    # Both callbacks return (status, download, preview) in this order.
    _shared_outputs = [status, download, preview]
    run_btn.click(run_initial_analysis, [upload, context, cluster_slider], _shared_outputs)
    refine_btn.click(refine_labels, [feedback], _shared_outputs)

if __name__ == "__main__":
    demo.launch()