Karthik1610 committed on
Commit 03ecb7b · verified · 1 Parent(s): ec47f09

Update app.py

Files changed (1)
app.py +259 -355
app.py CHANGED
@@ -1,388 +1,292 @@
- import os, time, json, io
- from typing import Dict, List, Any, Optional, Tuple
-
  import gradio as gr
  import pandas as pd
-
  from datasets import load_dataset
  import evaluate

- from huggingface_hub import InferenceClient, ModelCard
-
- # Keep evaluate for ROUGE and SacreBLEU only (no sklearn required)
- ROUGE = evaluate.load("rouge")
- SACREBLEU = evaluate.load("sacrebleu")
-
- # ---------- Small helpers: accuracy & F1 (macro) without scikit-learn ----------
- def _accuracy_score(y_pred: List[str], y_true: List[str]) -> float:
-     paired = [(p, t) for p, t in zip(y_pred, y_true) if p is not None]
-     if not paired:
-         return 0.0
-     correct = sum(1 for p, t in paired if str(p) == str(t))
-     return correct / len(paired)
-
- def _f1_macro_score(y_pred: List[str], y_true: List[str]) -> float:
-     paired = [(p, t) for p, t in zip(y_pred, y_true) if p is not None]
-     if not paired:
-         return 0.0
-     yp, yt = zip(*paired)
-     labels = sorted(set(yt))
-     def _f1_for(label: str) -> float:
-         tp = sum(1 for p, t in zip(yp, yt) if p == label and t == label)
-         fp = sum(1 for p, t in zip(yp, yt) if p == label and t != label)
-         fn = sum(1 for p, t in zip(yp, yt) if p != label and t == label)
-         if tp == 0 and (fp == 0 or fn == 0):
-             return 0.0
-         prec = tp / (tp + fp) if (tp + fp) > 0 else 0.0
-         rec = tp / (tp + fn) if (tp + fn) > 0 else 0.0
-         return (2 * prec * rec / (prec + rec)) if (prec + rec) > 0 else 0.0
-     scores = [ _f1_for(lbl) for lbl in labels ]
-     return sum(scores) / len(scores) if scores else 0.0
- # -----------------------------------------------------------------------------
-
- TASKS: Dict[str, Dict[str, str]] = {
-     "sentiment": {
-         "distilbert-base-uncased-finetuned-sst-2-english": "DistilBERT SST-2",
-         "cardiffnlp/twitter-roberta-base-sentiment-latest": "RoBERTa Twitter Sentiment"
-     },
-     "zero-shot-classification": {
-         "facebook/bart-large-mnli": "BART MNLI",
-         "joeddav/xlm-roberta-large-xnli": "XLM-R XNLI"
-     },
-     "summarization": {
-         "facebook/bart-large-cnn": "BART CNN",
-         "google/pegasus-xsum": "Pegasus XSum"
-     },
-     "translation_en_fr": {
-         "Helsinki-NLP/opus-mt-en-fr": "Opus-MT EN to FR",
-         "facebook/m2m100_418M": "M2M100 418M"
-     }
- }

- METRIC_POLICY = {
-     "sentiment": {"requires": ["label"], "metrics": ["accuracy", "f1_macro"]},
-     "zero-shot-classification": {"requires": [], "metrics": ["accuracy_if_labels", "f1_macro_if_labels"]},
-     "summarization": {"requires": ["reference"], "metrics": ["rougeL", "rouge1_opt", "rouge2_opt"]},
-     "translation_en_fr": {"requires": ["reference"], "metrics": ["sacrebleu", "chrf_opt"]},
  }

- def validate_token(hf_token: str) -> Tuple[bool, str]:
-     if not hf_token or not hf_token.strip().startswith("hf_"):
-         return False, "Paste a valid Hugging Face token starting with hf_"
-     return True, "Token format OK. We'll use it only for this session."
-
- def load_hub_dataset(ds_id: str, config: Optional[str], split: Optional[str], sample_size: int) -> Tuple[pd.DataFrame, Dict[str, Any]]:
-     kwargs = {}
-     if config:
-         kwargs["name"] = config
-     if split:
-         kwargs["split"] = split
-     ds = load_dataset(ds_id, **kwargs)
-     if not split:
-         for sp in ["test", "validation", "train"]:
-             if sp in ds:
-                 split = sp
-                 break
-     d = ds[split].to_pandas()
-     if sample_size and sample_size < len(d):
-         d = d.sample(n=sample_size, random_state=42)
-     meta = {"dataset_id": ds_id, "config": config, "split": split}
-     return d, meta

- def decide_metrics(task: str, mapped_cols: List[str]) -> List[str]:
-     policy = METRIC_POLICY.get(task, {})
-     reqs = set(policy.get("requires", []))
-     if not reqs.issubset(set(mapped_cols)):
-         return ["latency_only"]
-     out = []
-     for m in policy.get("metrics", []):
-         if m.endswith("_opt"):
-             continue
-         if m.endswith("_if_labels") and "label" not in mapped_cols:
-             continue
-         out.append(m)
-     return out
-
- def normalize_cls_label(pred_label: str, label_names: Optional[List[str]]):
-     if label_names is None:
-         return pred_label
-     low = str(pred_label).lower()
-     for name in label_names:
-         if low == str(name).lower():
-             return name
-     if low.startswith("pos"):
-         for name in label_names:
-             if "pos" in str(name).lower():
-                 return name
-     if low.startswith("neg"):
-         for name in label_names:
-             if "neg" in str(name).lower():
-                 return name
-     return pred_label

- def run_remote_inference(task: str, model_id: str, token: str, texts: List[str], zs_labels: Optional[List[str]] = None,
-                          gen_params: Optional[Dict[str, Any]] = None, timeout_s: int = 20) -> Tuple[List[Any], float]:
      client = InferenceClient(model=model_id, token=token)
-     gen_params = gen_params or {}
-     outputs = []
-     t0 = time.perf_counter()
-     for t in texts:
          try:
-             if task == "summarization":
-                 out = client.summarization(t, **gen_params)
-                 outputs.append(out)
-             elif task == "translation_en_fr":
-                 out = client.translation(t, src_lang="en", tgt_lang="fr", **gen_params)
-                 outputs.append(out)
-             elif task == "sentiment":
-                 out = client.text_classification(t)
-                 outputs.append(out)
-             elif task == "zero-shot-classification":
-                 if not zs_labels:
-                     outputs.append({"label": None, "score": None})
-                 else:
-                     out = client.zero_shot_classification(t, labels=zs_labels)
-                     outputs.append(out)
              else:
-                 outputs.append(None)
          except Exception as e:
-             outputs.append({"error": str(e)})
-     latency = (time.perf_counter() - t0) / max(1, len(texts))
-     return outputs, latency

- def compute_metrics(task: str, preds: List[Any], refs: Optional[List[Any]], label_names: Optional[List[str]] = None) -> Dict[str, float]:
      metrics = {}
-     if task in ["sentiment", "zero-shot-classification"] and refs is not None:
-         if label_names and isinstance(refs[0], (int, float)):
-             refs = [label_names[int(x)] for x in refs]
-         y_pred = []
-         for p in preds:
-             if isinstance(p, list) and len(p) and isinstance(p[0], dict) and "label" in p[0]:
-                 y_pred.append(normalize_cls_label(p[0]["label"], label_names))
-             elif isinstance(p, dict) and "label" in p:
-                 y_pred.append(normalize_cls_label(p.get("label"), label_names))
              else:
-                 y_pred.append(None)
-         y_true = [str(x) for x in refs]
-         metrics["accuracy"] = _accuracy_score(y_pred, y_true)
-         metrics["f1_macro"] = _f1_macro_score(y_pred, y_true)
-     elif task == "summarization" and refs is not None:
-         preds_text = []
-         for p in preds:
-             if isinstance(p, dict) and "summary_text" in p:
-                 preds_text.append(p["summary_text"])
-             elif isinstance(p, list) and len(p) and isinstance(p[0], dict) and "summary_text" in p[0]:
-                 preds_text.append(p[0]["summary_text"])
-             elif isinstance(p, str):
-                 preds_text.append(p)
              else:
-                 preds_text.append("")
-         metrics.update(ROUGE.compute(predictions=preds_text, references=refs))
-     elif task == "translation_en_fr" and refs is not None:
-         preds_text = []
-         for p in preds:
-             if isinstance(p, dict) and "translation_text" in p:
-                 preds_text.append(p["translation_text"])
-             elif isinstance(p, list) and len(p) and isinstance(p[0], dict) and "translation_text" in p[0]:
-                 preds_text.append(p[0]["translation_text"])
-             elif isinstance(p, str):
-                 preds_text.append(p)
              else:
-                 preds_text.append("")
-         metrics.update(SACREBLEU.compute(predictions=preds_text, references=[[r] for r in refs]))
      return metrics

- def lint_model(model_id: str, token: Optional[str]) -> Dict[str, Any]:
-     out = {"model": model_id, "readiness": 0, "checks": []}
      try:
-         card = ModelCard.load(model_id, token=token)
-         data = card.data.to_dict() if hasattr(card, 'data') else {}
-         pipeline_tag = data.get("pipeline_tag")
-         license_ = data.get("license")
-         has_tags = bool(data.get("tags"))
-         score = 0
-         score += 25 if pipeline_tag else 0
-         score += 25 if license_ else 0
-         score += 25 if has_tags else 0
-         score += 25
-         out["readiness"] = score
-         out["checks"].append({"pipeline_tag": pipeline_tag, "license": license_, "has_tags": has_tags})
-     except Exception as e:
-         out["checks"].append({"error": str(e)})
-     return out
-
- def run_benchmark(hf_token: str, compute_mode: str, task: str, curated_models: List[str], custom_model: str,
-                   ds_source: str, ds_id: str, ds_config: str, ds_split: str, csv_file, text_col: str,
-                   label_col: str, ref_col: str, sample_size: int, zs_labels_csv: str,
-                   max_new_tokens: int, temperature: float, batch_size: int, timeout_s: int):
-     models = []
-     if curated_models:
-         models.extend(curated_models)
-     if custom_model and custom_model.strip():
-         models.append(custom_model.strip())
-     models = list(dict.fromkeys(models))
-     if not models:
-         return {"error": "Pick at least one model"}
-
-     if ds_source == "hub":
-         df, meta = load_hub_dataset(ds_id, ds_config or None, ds_split or None, sample_size)
-     else:
-         if csv_file is None:
-             return {"error": "Upload a CSV"}
-         df = pd.read_csv(csv_file.name)
-         if sample_size and sample_size < len(df):
-             df = df.sample(n=sample_size, random_state=42)
-         meta = {"dataset_id": "uploaded_csv", "config": None, "split": None}
-
-     if text_col not in df.columns:
-         text_col = text_col or df.columns[0]
-     labels = df[label_col].tolist() if label_col and label_col in df.columns else None
-     refs = df[ref_col].tolist() if ref_col and ref_col in df.columns else None
-
-     zs_labels = [x.strip() for x in zs_labels_csv.split(',')] if (task == "zero-shot-classification" and zs_labels_csv) else None
-
-     all_preds = {}
-     metrics_table = []
-
-     for mid in models:
-         preds, avg_lat = run_remote_inference(
-             task=task,
-             model_id=mid,
-             token=hf_token,
-             texts=df[text_col].astype(str).tolist(),
-             zs_labels=zs_labels,
-             gen_params={"max_new_tokens": int(max_new_tokens), "temperature": float(temperature)},
-             timeout_s=int(timeout_s)
-         )
-         all_preds[mid] = preds
-         m = compute_metrics(task, preds, refs if task in ["summarization", "translation_en_fr"] else labels, label_names=None)
-         m["avg_latency_s"] = avg_lat
-         metrics_table.append({"model": mid, **m})
-
-     preview = pd.DataFrame({"text": df[text_col].astype(str).tolist()})
-     if labels is not None:
-         preview["label"] = labels
-     if refs is not None:
-         preview["reference"] = refs
-     for mid, preds in all_preds.items():
-         col = []
-         for p in preds:
-             if isinstance(p, dict):
-                 col.append(p.get("summary_text") or p.get("translation_text") or p.get("label") or str(p))
-             elif isinstance(p, list) and len(p) and isinstance(p[0], dict):
-                 col.append(p[0].get("summary_text") or p[0].get("translation_text") or p[0].get("label") or str(p[0]))
-             else:
-                 col.append(str(p))
-         preview[mid] = col
-
-     csv_buf = io.StringIO()
-     preview.to_csv(csv_buf, index=False)
-     csv_bytes = io.BytesIO(csv_buf.getvalue().encode("utf-8"))
-
-     lints = [lint_model(m, hf_token) for m in models]
-
-     return {
-         "metrics": pd.DataFrame(metrics_table),
-         "preview": preview.head(20),
-         "download": ("predictions.csv", csv_bytes),
-         "lint": lints,
-         "session": {"task": task, "models": models, "dataset": meta, "sample_size": sample_size}
-     }
-
- def build_ui():
-     # Use Interface instead of Blocks to avoid the JSON schema parsing bug
-     def benchmark_interface(hf_token, task, curated_models_text, custom_model, ds_id, ds_config, ds_split,
-                             text_col, label_col, ref_col, sample_size, zs_labels_csv, max_new_tokens,
-                             temperature, timeout_s, csv_file=None):

-         # Parse curated models from text input
-         curated_models = [m.strip() for m in curated_models_text.split('\n') if m.strip()] if curated_models_text else []

-         # Validate token
-         if not hf_token or not hf_token.strip().startswith("hf_"):
-             return "Error: Please provide a valid HF token", "", "", ""

-         try:
-             out = run_benchmark(
-                 hf_token=hf_token,
-                 compute_mode="Remote (Inference API)",
-                 task=task,
-                 curated_models=curated_models,
-                 custom_model=custom_model,
-                 ds_source="hub" if csv_file is None else "csv",
-                 ds_id=ds_id,
-                 ds_config=ds_config,
-                 ds_split=ds_split,
-                 csv_file=csv_file,
-                 text_col=text_col,
-                 label_col=label_col,
-                 ref_col=ref_col,
-                 sample_size=int(sample_size),
-                 zs_labels_csv=zs_labels_csv,
-                 max_new_tokens=int(max_new_tokens),
-                 temperature=float(temperature),
-                 batch_size=8,
-                 timeout_s=int(timeout_s)
-             )

-             if isinstance(out, dict) and "error" in out:
-                 return f"Error: {out['error']}", "", "", ""

-             # Format outputs as strings
-             metrics_str = out["metrics"].to_string() if not out["metrics"].empty else "No metrics computed"
-             preview_str = out["preview"].to_string() if not out["preview"].empty else "No preview available"
-             lint_str = json.dumps(out["lint"], indent=2)
-             session_str = json.dumps(out["session"], indent=2)

-             return metrics_str, preview_str, lint_str, session_str

-         except Exception as e:
-             return f"Error: {str(e)}", "", "", ""
-
-     # Create Interface instead of Blocks
-     demo = gr.Interface(
-         fn=benchmark_interface,
-         inputs=[
-             gr.Textbox(label="Hugging Face Token", type="password", placeholder="hf_..."),
-             gr.Dropdown(choices=list(TASKS.keys()), label="Task", value="sentiment"),
-             gr.Textbox(label="Curated Models (one per line)", lines=3,
-                        placeholder="distilbert-base-uncased-finetuned-sst-2-english\ncardiffnlp/twitter-roberta-base-sentiment-latest"),
-             gr.Textbox(label="Custom Model ID (optional)", placeholder="username/my-finetune"),
-             gr.Textbox(label="Dataset ID", value="imdb"),
-             gr.Textbox(label="Config (optional)"),
-             gr.Textbox(label="Split (optional)"),
-             gr.Textbox(label="Text Column", value="text"),
-             gr.Textbox(label="Label Column (optional)"),
-             gr.Textbox(label="Reference Column (optional)"),
-             gr.Slider(20, 500, value=100, step=10, label="Sample Size"),
-             gr.Textbox(label="Zero-shot Labels (comma-separated)"),
-             gr.Number(value=128, label="Max New Tokens"),
-             gr.Number(value=0.7, label="Temperature"),
-             gr.Number(value=20, label="Timeout (seconds)"),
-             gr.File(file_types=[".csv"], label="CSV File (optional)")
-         ],
-         outputs=[
-             gr.Textbox(label="Metrics", lines=10),
-             gr.Textbox(label="Preview (first 20 rows)", lines=15),
-             gr.Textbox(label="Model Lint Results", lines=8),
-             gr.Textbox(label="Session Info", lines=5)
-         ],
-         title="AI Model Benchmark Hub",
-         description="Compare AI models on various tasks using the Hugging Face Inference API"
-     )

      return demo

- app = build_ui()
-
- # Simple launch configuration for HF Spaces
  if __name__ == "__main__":
-     # Check if we're on HF Spaces
-     if "SPACE_ID" in os.environ:
-         app.launch()
-     else:
-         # Local development
-         try:
-             app.launch(share=True)
-         except:
-             app.launch(server_name="127.0.0.1", server_port=7860)

+ import os
+ import time
+ import json
  import gradio as gr
  import pandas as pd
+ from typing import List, Optional, Dict, Any
  from datasets import load_dataset
+ from huggingface_hub import InferenceClient
  import evaluate

+ # Load evaluation metrics
+ rouge = evaluate.load("rouge")
+ sacrebleu = evaluate.load("sacrebleu")

+ # Model configurations
+ MODELS = {
+     "sentiment": [
+         "distilbert-base-uncased-finetuned-sst-2-english",
+         "cardiffnlp/twitter-roberta-base-sentiment-latest"
+     ],
+     "summarization": [
+         "facebook/bart-large-cnn",
+         "google/pegasus-xsum"
+     ],
+     "translation": [
+         "Helsinki-NLP/opus-mt-en-fr",
+         "facebook/m2m100_418M"
+     ]
  }

+ def validate_token(token: str) -> bool:
+     """Validate HF token format"""
+     return token and token.strip().startswith("hf_")

+ def accuracy_score(predictions: List[str], labels: List[str]) -> float:
+     """Calculate accuracy without sklearn"""
+     if len(predictions) != len(labels):
+         return 0.0
+     correct = sum(1 for p, l in zip(predictions, labels) if str(p).lower() == str(l).lower())
+     return correct / len(labels) if labels else 0.0

+ def run_inference(model_id: str, texts: List[str], task: str, token: str) -> List[Dict]:
+     """Run inference using HF Inference API"""
      client = InferenceClient(model=model_id, token=token)
+     results = []
+
+     for text in texts:
          try:
+             if task == "sentiment":
+                 result = client.text_classification(text)
+                 results.append(result[0] if isinstance(result, list) else result)
+             elif task == "summarization":
+                 result = client.summarization(text, max_length=150)
+                 results.append(result)
+             elif task == "translation":
+                 result = client.translation(text, src_lang="en", tgt_lang="fr")
+                 results.append(result)
              else:
+                 results.append({"error": "Unsupported task"})
          except Exception as e:
+             results.append({"error": str(e)})
+
+     return results

+ def compute_metrics(task: str, predictions: List[Dict], references: Optional[List[str]] = None) -> Dict[str, float]:
+     """Compute task-specific metrics"""
      metrics = {}
+
+     if task == "sentiment" and references:
+         pred_labels = []
+         for pred in predictions:
+             if isinstance(pred, dict) and "label" in pred:
+                 pred_labels.append(pred["label"])
              else:
+                 pred_labels.append("UNKNOWN")
+
+         metrics["accuracy"] = accuracy_score(pred_labels, references)
+
+     elif task == "summarization" and references:
+         pred_texts = []
+         for pred in predictions:
+             if isinstance(pred, dict) and "summary_text" in pred:
+                 pred_texts.append(pred["summary_text"])
              else:
+                 pred_texts.append("")
+
+         rouge_scores = rouge.compute(predictions=pred_texts, references=references)
+         metrics.update(rouge_scores)
+
+     elif task == "translation" and references:
+         pred_texts = []
+         for pred in predictions:
+             if isinstance(pred, dict) and "translation_text" in pred:
+                 pred_texts.append(pred["translation_text"])
              else:
+                 pred_texts.append("")
+
+         bleu_scores = sacrebleu.compute(predictions=pred_texts, references=[[ref] for ref in references])
+         metrics.update(bleu_scores)
+
      return metrics

+ def benchmark_models(
+     hf_token: str,
+     task: str,
+     selected_models: List[str],
+     dataset_name: str,
+     sample_size: int,
+     text_column: str,
+     label_column: str
+ ):
+     """Main benchmarking function"""
+
+     # Validate token
+     if not validate_token(hf_token):
+         return "❌ Invalid HuggingFace token. Please provide a token starting with 'hf_'", "", ""
+
+     if not selected_models:
+         return "❌ Please select at least one model", "", ""
+
      try:
+         # Load dataset
+         dataset = load_dataset(dataset_name, split="test")
+         if sample_size > 0:
+             dataset = dataset.select(range(min(sample_size, len(dataset))))

+         df = dataset.to_pandas()

+         if text_column not in df.columns:
+             return f"❌ Text column '{text_column}' not found in dataset", "", ""

+         texts = df[text_column].astype(str).tolist()
+         references = df[label_column].tolist() if label_column in df.columns else None
+
+         # Results storage
+         all_results = []
+         detailed_results = {"text": texts}
+
+         # Run benchmarks
+         for model_id in selected_models:
+             print(f"Running inference with {model_id}...")

+             start_time = time.time()
+             predictions = run_inference(model_id, texts, task, hf_token)
+             inference_time = time.time() - start_time

+             # Compute metrics
+             metrics = compute_metrics(task, predictions, references)
+             metrics["model"] = model_id
+             metrics["inference_time"] = round(inference_time, 2)
+             metrics["samples"] = len(texts)

+             all_results.append(metrics)

+             # Store predictions for detailed view
+             pred_texts = []
+             for pred in predictions:
+                 if isinstance(pred, dict):
+                     if "label" in pred:
+                         pred_texts.append(pred["label"])
+                     elif "summary_text" in pred:
+                         pred_texts.append(pred["summary_text"])
+                     elif "translation_text" in pred:
+                         pred_texts.append(pred["translation_text"])
+                     else:
+                         pred_texts.append(str(pred))
+                 else:
+                     pred_texts.append(str(pred))
+
+             detailed_results[model_id] = pred_texts
+
+         # Create results DataFrames
+         results_df = pd.DataFrame(all_results)
+         detailed_df = pd.DataFrame(detailed_results)
+
+         # Format results for display
+         results_str = "📊 **Benchmark Results:**\n\n"
+         results_str += results_df.to_string(index=False)
+
+         detailed_str = "🔍 **Detailed Predictions (first 10 samples):**\n\n"
+         detailed_str += detailed_df.head(10).to_string(index=False)
+
+         # Create summary
+         summary = f"✅ **Benchmark Complete!**\n\n"
+         summary += f"**Task:** {task}\n"
+         summary += f"**Dataset:** {dataset_name}\n"
+         summary += f"**Models tested:** {len(selected_models)}\n"
+         summary += f"**Samples processed:** {len(texts)}\n"
+         summary += f"**Total time:** {sum(r['inference_time'] for r in all_results):.2f}s\n"
+
+         return summary, results_str, detailed_str
+
+     except Exception as e:
+         return f"❌ Error: {str(e)}", "", ""
+
+ # Create Gradio interface
197
+ def create_interface():
198
+ with gr.Blocks(title="AI Model Benchmark Hub") as demo:
199
+ gr.Markdown("# πŸ§ͺ AI Model Benchmark Hub")
200
+ gr.Markdown("Compare AI models on various tasks using HuggingFace Inference API")
201
+
202
+ with gr.Row():
203
+ with gr.Column():
204
+ gr.Markdown("### πŸ”‘ Authentication")
205
+ hf_token = gr.Textbox(
206
+ label="HuggingFace Token",
207
+ type="password",
208
+ placeholder="hf_...",
209
+ info="Get your token from https://huggingface.co/settings/tokens"
210
+ )
211
+
212
+ gr.Markdown("### πŸ“‹ Task Selection")
213
+ task = gr.Dropdown(
214
+ choices=["sentiment", "summarization", "translation"],
215
+ label="Task",
216
+ value="sentiment"
217
+ )
218
+
219
+ model_choices = gr.CheckboxGroup(
220
+ choices=MODELS["sentiment"],
221
+ label="Select Models",
222
+ value=[MODELS["sentiment"][0]]
223
+ )
224
+
225
+ def update_models(selected_task):
226
+ return gr.update(choices=MODELS[selected_task], value=[MODELS[selected_task][0]])
227
+
228
+ task.change(update_models, inputs=[task], outputs=[model_choices])
229
+
230
+ with gr.Column():
231
+ gr.Markdown("### πŸ“Š Dataset Configuration")
232
+ dataset_name = gr.Textbox(
233
+ label="Dataset Name",
234
+ value="imdb",
235
+ placeholder="e.g., imdb, amazon_reviews_multi"
236
+ )
237
+
238
+ sample_size = gr.Slider(
239
+ minimum=10,
240
+ maximum=1000,
241
+ value=50,
242
+ step=10,
243
+ label="Sample Size"
244
+ )
245
+
246
+ text_column = gr.Textbox(
247
+ label="Text Column Name",
248
+ value="text",
249
+ placeholder="e.g., text, review, sentence"
250
+ )
251
+
252
+ label_column = gr.Textbox(
253
+ label="Label Column Name (optional)",
254
+ value="label",
255
+ placeholder="e.g., label, sentiment, rating"
256
+ )
257
+
258
+ run_btn = gr.Button("πŸš€ Run Benchmark", variant="primary", size="lg")
259
+
260
+ gr.Markdown("---")
261
+
262
+ with gr.Row():
263
+ with gr.Column():
264
+ summary_output = gr.Markdown(label="Summary")
265
+
266
+ with gr.Row():
267
+ with gr.Column():
268
+ results_output = gr.Markdown(label="Results")
269
+ with gr.Column():
270
+ detailed_output = gr.Markdown(label="Detailed Output")
271
+
272
+ # Connect the interface
273
+ run_btn.click(
274
+ benchmark_models,
275
+ inputs=[
276
+ hf_token,
277
+ task,
278
+ model_choices,
279
+ dataset_name,
280
+ sample_size,
281
+ text_column,
282
+ label_column
283
+ ],
284
+ outputs=[summary_output, results_output, detailed_output]
285
+ )
286
 
287
  return demo
288
 
289
+ # Launch the app
 
 
290
  if __name__ == "__main__":
291
+ app = create_interface()
292
+ app.launch()
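
For reviewers who want to sanity-check the refactored helpers without a token or any call to the Inference API, a minimal local sketch might look like the following. It is not part of this commit; it assumes the new app.py is importable from the working directory and that the evaluate metrics it loads at import time are available locally.

# Hypothetical smoke test (not part of the commit): exercises the new helpers
# in app.py with hand-made predictions instead of real Inference API output.
from app import accuracy_score, compute_metrics

fake_preds = [{"label": "POSITIVE", "score": 0.99},
              {"label": "NEGATIVE", "score": 0.97}]
refs = ["positive", "negative"]

# accuracy_score compares labels case-insensitively, so this prints 1.0
print(accuracy_score([p["label"] for p in fake_preds], refs))

# compute_metrics routes the "sentiment" task through accuracy_score
print(compute_metrics("sentiment", fake_preds, refs))  # {'accuracy': 1.0}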