Spaces:

Karthik1610
/

EvalKit

Runtime error

App Files Files Community

Karthik1610 committed on Aug 24

Commit

ffa226d

verified ·

1 Parent(s): 74ff2c6

Update app.py

Browse files

Files changed (1) hide show

app.py +299 -237

app.py CHANGED Viewed

@@ -1,292 +1,354 @@
 import os
-import time
 import json
-import gradio as gr
-import pandas as pd
-from typing import List, Optional, Dict, Any
 from datasets import load_dataset
-from huggingface_hub import InferenceClient
 import evaluate
 # Load evaluation metrics
 rouge = evaluate.load("rouge")
 sacrebleu = evaluate.load("sacrebleu")
-# Model configurations
-MODELS = {
-    "sentiment": [
-        "distilbert-base-uncased-finetuned-sst-2-english",
-        "cardiffnlp/twitter-roberta-base-sentiment-latest"
-    ],
-    "summarization": [
-        "facebook/bart-large-cnn",
-        "google/pegasus-xsum"
-    ],
-    "translation": [
-        "Helsinki-NLP/opus-mt-en-fr",
-        "facebook/m2m100_418M"
-    ]
 }
-def validate_token(token: str) -> bool:
-    """Validate HF token format"""
-    return token and token.strip().startswith("hf_")
-def accuracy_score(predictions: List[str], labels: List[str]) -> float:
-    """Calculate accuracy without sklearn"""
-    if len(predictions) != len(labels):
-        return 0.0
-    correct = sum(1 for p, l in zip(predictions, labels) if str(p).lower() == str(l).lower())
-    return correct / len(labels) if labels else 0.0
-def run_inference(model_id: str, texts: List[str], task: str, token: str) -> List[Dict]:
     """Run inference using HF Inference API"""
     client = InferenceClient(model=model_id, token=token)
     results = []
     for text in texts:
         try:
-            if task == "sentiment":
                 result = client.text_classification(text)
                 results.append(result[0] if isinstance(result, list) else result)
             elif task == "summarization":
-                result = client.summarization(text, max_length=150)
                 results.append(result)
             elif task == "translation":
-                result = client.translation(text, src_lang="en", tgt_lang="fr")
                 results.append(result)
             else:
                 results.append({"error": "Unsupported task"})
         except Exception as e:
             results.append({"error": str(e)})
-    return results
-def compute_metrics(task: str, predictions: List[Dict], references: Optional[List[str]] = None) -> Dict[str, float]:
     """Compute task-specific metrics"""
-    metrics = {}
-    if task == "sentiment" and references:
-        pred_labels = []
-        for pred in predictions:
-            if isinstance(pred, dict) and "label" in pred:
-                pred_labels.append(pred["label"])
-            else:
-                pred_labels.append("UNKNOWN")
-        metrics["accuracy"] = accuracy_score(pred_labels, references)
-    elif task == "summarization" and references:
-        pred_texts = []
-        for pred in predictions:
-            if isinstance(pred, dict) and "summary_text" in pred:
-                pred_texts.append(pred["summary_text"])
-            else:
-                pred_texts.append("")
         rouge_scores = rouge.compute(predictions=pred_texts, references=references)
-        metrics.update(rouge_scores)
-    elif task == "translation" and references:
-        pred_texts = []
-        for pred in predictions:
-            if isinstance(pred, dict) and "translation_text" in pred:
-                pred_texts.append(pred["translation_text"])
-            else:
-                pred_texts.append("")
-        bleu_scores = sacrebleu.compute(predictions=pred_texts, references=[[ref] for ref in references])
-        metrics.update(bleu_scores)
-    return metrics
-def benchmark_models(
-    hf_token: str,
-    task: str,
-    selected_models: List[str],
-    dataset_name: str,
-    sample_size: int,
-    text_column: str,
-    label_column: str
-):
-    """Main benchmarking function"""
-    # Validate token
-    if not validate_token(hf_token):
-        return "❌ Invalid HuggingFace token. Please provide a token starting with 'hf_'", "", ""
-    if not selected_models:
-        return "❌ Please select at least one model", "", ""
     try:
-        # Load dataset
-        dataset = load_dataset(dataset_name, split="test")
-        if sample_size > 0:
-            dataset = dataset.select(range(min(sample_size, len(dataset))))
-        df = dataset.to_pandas()
-        if text_column not in df.columns:
-            return f"❌ Text column '{text_column}' not found in dataset", "", ""
-        texts = df[text_column].astype(str).tolist()
-        references = df[label_column].tolist() if label_column in df.columns else None
-        # Results storage
-        all_results = []
-        detailed_results = {"text": texts}
-        # Run benchmarks
-        for model_id in selected_models:
-            print(f"Running inference with {model_id}...")
-            start_time = time.time()
-            predictions = run_inference(model_id, texts, task, hf_token)
-            inference_time = time.time() - start_time
-            # Compute metrics
-            metrics = compute_metrics(task, predictions, references)
-            metrics["model"] = model_id
-            metrics["inference_time"] = round(inference_time, 2)
-            metrics["samples"] = len(texts)
-            all_results.append(metrics)
-            # Store predictions for detailed view
-            pred_texts = []
-            for pred in predictions:
-                if isinstance(pred, dict):
-                    if "label" in pred:
-                        pred_texts.append(pred["label"])
-                    elif "summary_text" in pred:
-                        pred_texts.append(pred["summary_text"])
-                    elif "translation_text" in pred:
-                        pred_texts.append(pred["translation_text"])
-                    else:
-                        pred_texts.append(str(pred))
-                else:
-                    pred_texts.append(str(pred))
-            detailed_results[model_id] = pred_texts
-        # Create results DataFrames
-        results_df = pd.DataFrame(all_results)
-        detailed_df = pd.DataFrame(detailed_results)
-        # Format results for display
-        results_str = "📊 **Benchmark Results:**\n\n"
-        results_str += results_df.to_string(index=False)
-        detailed_str = "🔍 **Detailed Predictions (first 10 samples):**\n\n"
-        detailed_str += detailed_df.head(10).to_string(index=False)
-        # Create summary
-        summary = f"✅ **Benchmark Complete!**\n\n"
-        summary += f"**Task:** {task}\n"
-        summary += f"**Dataset:** {dataset_name}\n"
-        summary += f"**Models tested:** {len(selected_models)}\n"
-        summary += f"**Samples processed:** {len(texts)}\n"
-        summary += f"**Total time:** {sum(r['inference_time'] for r in all_results):.2f}s\n"
-        return summary, results_str, detailed_str
-    except Exception as e:
-        return f"❌ Error: {str(e)}", "", ""
-# Create Gradio interface
-def create_interface():
-    with gr.Blocks(title="AI Model Benchmark Hub") as demo:
-        gr.Markdown("# 🧪 AI Model Benchmark Hub")
-        gr.Markdown("Compare AI models on various tasks using HuggingFace Inference API")
-        with gr.Row():
-            with gr.Column():
-                gr.Markdown("### 🔑 Authentication")
-                hf_token = gr.Textbox(
-                    label="HuggingFace Token",
-                    type="password",
-                    placeholder="hf_...",
-                    info="Get your token from https://huggingface.co/settings/tokens"
-                )
-                gr.Markdown("### 📋 Task Selection")
-                task = gr.Dropdown(
-                    choices=["sentiment", "summarization", "translation"],
-                    label="Task",
-                    value="sentiment"
-                )
-                model_choices = gr.CheckboxGroup(
-                    choices=MODELS["sentiment"],
-                    label="Select Models",
-                    value=[MODELS["sentiment"][0]]
-                )
-                def update_models(selected_task):
-                    return gr.update(choices=MODELS[selected_task], value=[MODELS[selected_task][0]])
-                task.change(update_models, inputs=[task], outputs=[model_choices])
-            with gr.Column():
-                gr.Markdown("### 📊 Dataset Configuration")
-                dataset_name = gr.Textbox(
-                    label="Dataset Name",
-                    value="imdb",
-                    placeholder="e.g., imdb, amazon_reviews_multi"
                 )
-                sample_size = gr.Slider(
-                    minimum=10,
-                    maximum=1000,
-                    value=50,
-                    step=10,
-                    label="Sample Size"
-                )
-                text_column = gr.Textbox(
-                    label="Text Column Name",
-                    value="text",
-                    placeholder="e.g., text, review, sentence"
-                )
-                label_column = gr.Textbox(
-                    label="Label Column Name (optional)",
-                    value="label",
-                    placeholder="e.g., label, sentiment, rating"
-                )
-        run_btn = gr.Button("🚀 Run Benchmark", variant="primary", size="lg")
-        gr.Markdown("---")
-        with gr.Row():
-            with gr.Column():
-                summary_output = gr.Markdown(label="Summary")
-        with gr.Row():
-            with gr.Column():
-                results_output = gr.Markdown(label="Results")
-            with gr.Column():
-                detailed_output = gr.Markdown(label="Detailed Output")
-        # Connect the interface
-        run_btn.click(
-            benchmark_models,
-            inputs=[
-                hf_token,
-                task,
-                model_choices,
-                dataset_name,
-                sample_size,
-                text_column,
-                label_column
-            ],
-            outputs=[summary_output, results_output, detailed_output]
-        )
-    return demo
-# Launch the app
-if __name__ == "__main__":
-    app = create_interface()
-    app.launch()

 import os
 import json
+import time
+import yaml
+from datetime import datetime
+from flask import Flask, render_template, request, jsonify, send_file
+from huggingface_hub import InferenceClient, ModelCard, model_info
 from datasets import load_dataset
+import pandas as pd
 import evaluate
+from typing import Dict, List, Any, Optional, Tuple
+import io
+import zipfile
+app = Flask(__name__)
 # Load evaluation metrics
 rouge = evaluate.load("rouge")
 sacrebleu = evaluate.load("sacrebleu")
+# Benchmark packs (manifests)
+# Each pack is keyed by a short name and bundles the datasets to sample
+# (Hub id, optional config, split, text/label columns, sample size), a list
+# of metric names, and the parameters that api_run_benchmark() forwards to
+# run_inference().
+# NOTE(review): the "metrics" lists are not read by any Python code visible
+# in this file; compute_metrics() decides what to compute from the model's
+# task and never emits "f1_macro" or "chrf" — confirm whether the template
+# relies on them.
+BENCHMARK_PACKS = {
+    "sentiment": {
+        "name": "Sentiment Analysis Pack",
+        "datasets": [
+            {"id": "imdb", "split": "test", "text_col": "text", "label_col": "label", "sample_size": 100},
+            {"id": "emotion", "split": "test", "text_col": "text", "label_col": "label", "sample_size": 100}
+        ],
+        "metrics": ["accuracy", "f1_macro"],
+        "params": {"max_new_tokens": 32, "temperature": 0.1}
+    },
+    "summarization": {
+        "name": "Text Summarization Pack",
+        "datasets": [
+            {"id": "cnn_dailymail", "config": "3.0.0", "split": "test", "text_col": "article", "label_col": "highlights", "sample_size": 50},
+            {"id": "xsum", "split": "test", "text_col": "document", "label_col": "summary", "sample_size": 50}
+        ],
+        "metrics": ["rouge1", "rouge2", "rougeL"],
+        "params": {"max_new_tokens": 150, "temperature": 0.3}
+    },
+    "translation": {
+        "name": "EN→FR Translation Pack",
+        "datasets": [
+            # Dotted column names address fields nested under "translation".
+            {"id": "wmt14", "config": "fr-en", "split": "test", "text_col": "translation.en", "label_col": "translation.fr", "sample_size": 50}
+        ],
+        "metrics": ["sacrebleu", "chrf"],
+        "params": {"max_new_tokens": 200, "temperature": 0.1}
+    }
 }
+def lint_model(model_id: str, token: str = None) -> Dict[str, Any]:
+    """Import and lint a HuggingFace model.
+    Fetches the Hub metadata and, when it can be loaded, the model card for
+    ``model_id`` and runs a fixed set of presence checks over them.
+    Args:
+        model_id: Hub repository id of the model.
+        token: Optional access token passed to the Hub calls.
+    Returns:
+        A dict with the model id, its pipeline tag (``task``), a 0-100
+        ``readiness_score`` (share of checks that passed), the per-check
+        booleans, recommendations and basic Hub stats. If anything other
+        than loading the model card fails, ``{"error": <message>}`` is
+        returned instead of raising.
+    """
+    try:
+        # Get model info
+        info = model_info(model_id, token=token)
+        # Get model card. A missing or unreadable card is not fatal: it only
+        # makes the card-based checks below evaluate to False.
+        card = None
+        card_data = {}
+        try:
+            card = ModelCard.load(model_id, token=token)
+            card_data = card.data.to_dict() if hasattr(card, 'data') else {}
+        except Exception:
+            # Deliberately best-effort, but no longer a bare ``except`` so
+            # KeyboardInterrupt/SystemExit are not swallowed.
+            card_data = {}
+        card_text = (card.content or "") if card is not None else ""
+        card_text_lower = card_text.lower()
+        # Lint checks
+        checks = {
+            "pipeline_tag": bool(info.pipeline_tag),
+            "license": bool(card_data.get("license")),
+            "model_card": bool(card_text),
+            "tags": bool(info.tags),
+            "language": bool(card_data.get("language")),
+            "datasets": bool(card_data.get("datasets")),
+            "metrics": bool(card_data.get("metrics")),
+            "intended_use": "intended use" in card_text_lower,
+            "limitations": "limitation" in card_text_lower,
+            "bias_risks": any(word in card_text_lower for word in ["bias", "fairness", "risk"])
+        }
+        # Calculate readiness score
+        score = sum(checks.values()) / len(checks) * 100
+        # Generate recommendations (only for a subset of the checks)
+        advice = [
+            ("license", "Add license information"),
+            ("model_card", "Add detailed model card"),
+            ("intended_use", "Specify intended use cases"),
+            ("limitations", "Document known limitations"),
+            ("bias_risks", "Address bias and safety considerations"),
+        ]
+        recommendations = [message for check, message in advice if not checks[check]]
+        return {
+            "model_id": model_id,
+            "task": info.pipeline_tag,
+            "readiness_score": round(score),
+            "checks": checks,
+            "recommendations": recommendations,
+            "downloads": info.downloads or 0,
+            "likes": info.likes or 0,
+            "created_at": info.created_at.isoformat() if info.created_at else None,
+            "library_name": info.library_name
+        }
+    except Exception as e:
+        return {"error": str(e)}
+def run_inference(model_id: str, texts: List[str], task: str, token: str, params: Dict = None) -> Tuple[List[Dict], float]:
     """Run inference using HF Inference API"""
+    # Returns (results, avg_latency): one entry per input text, where a failed
+    # call is recorded as {"error": ...} instead of being raised, plus the
+    # mean wall-clock seconds per text (0.0 for an empty input list).
     client = InferenceClient(model=model_id, token=token)
     results = []
+    params = params or {}
+    # perf_counter is monotonic, so the latency cannot go negative if the
+    # system clock is adjusted mid-run.
+    start_time = time.perf_counter()
     for text in texts:
         try:
+            if task == "text-classification":
                 result = client.text_classification(text)
                 results.append(result[0] if isinstance(result, list) else result)
             elif task == "summarization":
+                # NOTE(review): params are forwarded as keyword arguments;
+                # confirm the installed huggingface_hub accepts names such as
+                # max_new_tokens/temperature here, otherwise every call lands
+                # in the except branch below.
+                result = client.summarization(text, **params)
                 results.append(result)
             elif task == "translation":
+                # NOTE(review): same keyword-forwarding caveat as above.
+                result = client.translation(text, **params)
                 results.append(result)
             else:
                 results.append({"error": "Unsupported task"})
         except Exception as e:
             results.append({"error": str(e)})
+    total_time = time.perf_counter() - start_time
+    avg_latency = total_time / len(texts) if texts else 0.0
+    return results, avg_latency
+def _round_metric(value: Any, ndigits: int = 4) -> Any:
+    """Round a metric value, descending into lists and leaving non-numbers as-is."""
+    if isinstance(value, (list, tuple)):
+        return [_round_metric(item, ndigits) for item in value]
+    if isinstance(value, int):
+        return value
+    try:
+        return round(value, ndigits)
+    except TypeError:
+        return value
+def compute_metrics(task: str, predictions: List[Dict], references: List[str]) -> Dict[str, Any]:
     """Compute task-specific metrics"""
+    # Returns {"accuracy": ...} for text-classification, the rounded output of
+    # the rouge / sacrebleu metric objects for summarization / translation,
+    # and {} for any other task.
+    if task == "text-classification":
+        if not references:
+            # Nothing to score; avoids dividing by zero.
+            return {"accuracy": 0.0}
+        pred_labels = [p.get("label", "UNKNOWN") if isinstance(p, dict) else "UNKNOWN" for p in predictions]
+        # NOTE(review): labels are compared by case-insensitive string form, so
+        # integer dataset labels only match models that emit the same strings
+        # — confirm a label mapping is not needed.
+        correct = sum(1 for p, r in zip(pred_labels, references) if str(p).lower() == str(r).lower())
+        return {"accuracy": round(correct / len(references), 4)}
+    elif task == "summarization":
+        pred_texts = [p.get("summary_text", "") if isinstance(p, dict) else "" for p in predictions]
         rouge_scores = rouge.compute(predictions=pred_texts, references=references)
+        return {k: _round_metric(v) for k, v in rouge_scores.items()}
+    elif task == "translation":
+        pred_texts = [p.get("translation_text", "") if isinstance(p, dict) else "" for p in predictions]
+        bleu_scores = sacrebleu.compute(predictions=pred_texts, references=[[r] for r in references])
+        # The sacrebleu result mixes scalars with list-valued fields, which a
+        # plain round() cannot handle.
+        return {k: _round_metric(v) for k, v in bleu_scores.items()}
+    return {}
+def generate_readme_section(results: Dict) -> str:
+    """Generate README section for model.
+    Renders a Markdown "Benchmark Results" block from a benchmark result dict.
+    Args:
+        results: Dict with ``task``, ``readiness_score``, ``model_id`` and an
+            optional ``benchmark_results`` list. Entries carrying an
+            ``error`` key (datasets that failed) are rendered as an error
+            line instead of metrics. Missing top-level keys fall back to
+            placeholders rather than raising.
+    Returns:
+        The Markdown text.
+    """
+    task = results.get('task')
+    # The Quick Start snippet must load the pipeline for the model's own task.
+    pipeline_task = task or 'text-classification'
+    parts = [f"""## Benchmark Results
+**Evaluated on:** {datetime.now().strftime('%Y-%m-%d')}
+**Task:** {task or 'unknown'}
+**Readiness Score:** {results.get('readiness_score', 0)}/100
+### Performance Metrics
+"""]
+    for dataset_result in results.get('benchmark_results') or []:
+        parts.append(f"\n**Dataset:** {dataset_result.get('dataset', 'unknown')}\n")
+        if 'error' in dataset_result:
+            # Failed datasets have no metrics/latency to report.
+            parts.append(f"- Error: {dataset_result['error']}\n")
+            continue
+        for metric, value in (dataset_result.get('metrics') or {}).items():
+            parts.append(f"- {metric}: {value}\n")
+        latency = dataset_result.get('avg_latency')
+        if latency is not None:
+            parts.append(f"- Average Latency: {latency:.3f}s\n")
+    parts.append(f"""
+### Quick Start
+```python
+from transformers import pipeline
+pipe = pipeline("{pipeline_task}", model="{results.get('model_id', '')}")
+result = pipe("Your text here")
+```
+*Benchmarked with [Clarifai Community Bench](https://huggingface.co/spaces/your-space)*
+""")
+    return "".join(parts)
+@app.route('/')
+def index():
+    """Render index.html, exposing BENCHMARK_PACKS to the template as ``benchmark_packs``."""
+    return render_template('index.html', benchmark_packs=BENCHMARK_PACKS)
+@app.route('/api/lint-model', methods=['POST'])
+def api_lint_model():
+    """Lint a single model.
+    Expects a JSON object with ``model_id`` (required) and ``token``
+    (optional). Responds 400 when ``model_id`` is missing, otherwise with the
+    dict produced by lint_model().
+    """
+    # silent=True returns None for a missing or malformed JSON body instead of
+    # aborting, so every bad payload reaches the same 400 response below.
+    data = request.get_json(silent=True)
+    if not isinstance(data, dict):
+        data = {}
+    model_id = data.get('model_id')
+    token = data.get('token')
+    if not model_id:
+        return jsonify({"error": "Model ID is required"}), 400
+    result = lint_model(model_id, token)
+    return jsonify(result)
+def _resolve_column(item: Dict[str, Any], column: str) -> Any:
+    """Read ``column`` from a dataset row, following dots into nested dicts.
+    A literal key match wins, so flat columns behave as a plain lookup;
+    otherwise "translation.en" is read as item["translation"]["en"].
+    Lookup errors propagate to the caller.
+    """
+    if column in item:
+        return item[column]
+    value = item
+    for part in column.split('.'):
+        value = value[part]
+    return value
+@app.route('/api/run-benchmark', methods=['POST'])
+def api_run_benchmark():
+    """Lint a model, then benchmark it on every dataset of the chosen pack.
+    Expects a JSON object with ``model_id``, ``pack`` and ``token``. Responds
+    400 for missing parameters, an unknown pack or a failed lint, 500 for
+    unexpected errors. A dataset that fails is reported as an
+    ``{"dataset", "error"}`` entry without aborting the remaining datasets.
+    """
+    # silent=True returns None for a missing or malformed JSON body instead of
+    # aborting, so bad payloads reach the "Missing required parameters" reply.
+    data = request.get_json(silent=True)
+    if not isinstance(data, dict):
+        data = {}
+    model_id = data.get('model_id')
+    pack_name = data.get('pack')
+    token = data.get('token')
+    if not all([model_id, pack_name, token]):
+        return jsonify({"error": "Missing required parameters"}), 400
+    if pack_name not in BENCHMARK_PACKS:
+        return jsonify({"error": "Invalid benchmark pack"}), 400
     try:
+        # First lint the model
+        lint_result = lint_model(model_id, token)
+        if "error" in lint_result:
+            return jsonify(lint_result), 400
+        pack = BENCHMARK_PACKS[pack_name]
+        benchmark_results = []
+        # Run benchmark on each dataset in the pack
+        for dataset_config in pack['datasets']:
+            try:
+                # Load dataset
+                ds_params = {"path": dataset_config['id']}
+                if dataset_config.get('config'):
+                    ds_params['name'] = dataset_config['config']
+                dataset = load_dataset(**ds_params, split=dataset_config['split'])
+                sample_size = min(dataset_config['sample_size'], len(dataset))
+                dataset = dataset.select(range(sample_size))
+                # Extract text and references; column names may be dotted
+                # paths into nested fields (see the translation pack).
+                text_col = dataset_config['text_col']
+                label_col = dataset_config.get('label_col')
+                texts = [_resolve_column(item, text_col) for item in dataset]
+                references = [_resolve_column(item, label_col) for item in dataset] if label_col else None
+                # Run inference
+                predictions, avg_latency = run_inference(
+                    model_id, texts, lint_result['task'], token, pack['params']
                 )
+                # Compute metrics
+                metrics = compute_metrics(lint_result['task'], predictions, references) if references else {}
+                benchmark_results.append({
+                    "dataset": dataset_config['id'],
+                    "samples": len(texts),
+                    "metrics": metrics,
+                    "avg_latency": round(avg_latency, 3),
+                    "predictions": predictions[:5]  # First 5 for preview
+                })
+            except Exception as e:
+                benchmark_results.append({
+                    "dataset": dataset_config['id'],
+                    "error": str(e)
+                })
+        # Combine results
+        result = {
+            **lint_result,
+            "benchmark_results": benchmark_results,
+            "pack_name": pack['name'],
+            "timestamp": datetime.now().isoformat()
+        }
+        return jsonify(result)
+    except Exception as e:
+        return jsonify({"error": str(e)}), 500
+@app.route('/api/generate-readme', methods=['POST'])
+def api_generate_readme():
+    """Return the README section rendered from the posted benchmark results.
+    Expects the JSON object produced by the run-benchmark endpoint; responds
+    400 when the body is not a JSON object.
+    """
+    # silent=True returns None for a missing or malformed JSON body.
+    data = request.get_json(silent=True)
+    if not isinstance(data, dict):
+        return jsonify({"error": "Expected a JSON object with benchmark results"}), 400
+    readme_content = generate_readme_section(data)
+    return jsonify({"readme": readme_content})
+@app.route('/api/export-artifacts', methods=['POST'])
+def api_export_artifacts():
+    """Bundle posted benchmark results into a downloadable ZIP archive.
+    The archive contains the raw results (JSON), a YAML manifest, the
+    generated README section and a small standalone utility script.
+    Responds 400 when the body is not a JSON object.
+    """
+    # silent=True returns None for a missing or malformed JSON body.
+    data = request.get_json(silent=True)
+    if not isinstance(data, dict):
+        return jsonify({"error": "Expected a JSON object with benchmark results"}), 400
+    # Create ZIP file in memory
+    zip_buffer = io.BytesIO()
+    with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
+        # Add benchmark results as JSON
+        zip_file.writestr('benchmark_results.json', json.dumps(data, indent=2))
+        # Add YAML manifest; readiness_score is included because the
+        # generated utility script reads it back from the manifest.
+        manifest = {
+            'model_id': data.get('model_id'),
+            'task': data.get('task'),
+            'readiness_score': data.get('readiness_score'),
+            'benchmark_pack': data.get('pack_name'),
+            'results': data.get('benchmark_results'),
+            'timestamp': data.get('timestamp')
+        }
+        zip_file.writestr('manifest.yaml', yaml.dump(manifest, default_flow_style=False))
+        # Add README section
+        readme_content = generate_readme_section(data)
+        zip_file.writestr('README_section.md', readme_content)
+        # Add Python utility script. This is a plain string on purpose: it
+        # has nothing to interpolate, and as an f-string its literal "{}"
+        # is an empty replacement field, i.e. a SyntaxError for this module.
+        python_script = '''
+"""
+Model Registration Utility
+Generated by Clarifai Community Bench
+"""
+import json
+from datetime import datetime
+class ModelArtifact:
+    def __init__(self, manifest_path="manifest.yaml"):
+        with open(manifest_path, 'r') as f:
+            import yaml
+            self.manifest = yaml.safe_load(f)
+    def get_model_info(self):
+        return {
+            "id": self.manifest["model_id"],
+            "task": self.manifest["task"],
+            "readiness_score": self.manifest.get("readiness_score") or 0,
+            "avg_latency": self._calculate_avg_latency(),
+            "best_dataset": self._get_best_performing_dataset()
+        }
+    def _calculate_avg_latency(self):
+        results = self.manifest.get("results") or []
+        if not results:
+            return None
+        latencies = [r.get("avg_latency", 0) for r in results if "avg_latency" in r]
+        return sum(latencies) / len(latencies) if latencies else None
+    def _get_best_performing_dataset(self):
+        # Implementation depends on task-specific metrics
+        results = self.manifest.get("results") or [{}]
+        return results[0].get("dataset")
+# Usage example:
+# artifact = ModelArtifact()
+# print(artifact.get_model_info())
+'''
+        zip_file.writestr('model_utility.py', python_script)
+    # Rewind so send_file streams the archive from the start.
+    zip_buffer.seek(0)
+    # model_id may be absent or null in the payload; fall back to "model".
+    safe_name = str(data.get("model_id") or "model").replace("/", "_")
+    return send_file(
+        zip_buffer,
+        mimetype='application/zip',
+        as_attachment=True,
+        download_name=f'{safe_name}_artifacts.zip'
+    )
+if __name__ == '__main__':
+    # Listen on all interfaces; the port is read from $PORT, defaulting to 7860.
+    app.run(host='0.0.0.0', port=int(os.environ.get('PORT', 7860)), debug=False)