import os
import json
import time
import yaml
from datetime import datetime
from flask import Flask, render_template, request, jsonify, send_file
from huggingface_hub import InferenceClient, ModelCard, model_info
from datasets import load_dataset
import pandas as pd
import evaluate
from typing import Dict, List, Any, Optional, Tuple
import io
import zipfile

app = Flask(__name__)

# Load evaluation metrics
rouge = evaluate.load("rouge")
sacrebleu = evaluate.load("sacrebleu")

# Benchmark packs (manifests)
BENCHMARK_PACKS = {
    "sentiment": {
        "name": "Sentiment Analysis Pack",
        "datasets": [
            {"id": "imdb", "split": "test", "text_col": "text", "label_col": "label", "sample_size": 100},
            {"id": "emotion", "split": "test", "text_col": "text", "label_col": "label", "sample_size": 100}
        ],
        "metrics": ["accuracy", "f1_macro"],
        "params": {"max_new_tokens": 32, "temperature": 0.1}
    },
    "summarization": {
        "name": "Text Summarization Pack",
        "datasets": [
            {"id": "cnn_dailymail", "config": "3.0.0", "split": "test", "text_col": "article", "label_col": "highlights", "sample_size": 50},
            {"id": "xsum", "split": "test", "text_col": "document", "label_col": "summary", "sample_size": 50}
        ],
        "metrics": ["rouge1", "rouge2", "rougeL"],
        "params": {"max_new_tokens": 150, "temperature": 0.3}
    },
    "translation": {
        "name": "EN→FR Translation Pack",
        "datasets": [
            {"id": "wmt14", "config": "fr-en", "split": "test", "text_col": "translation.en", "label_col": "translation.fr", "sample_size": 50}
        ],
        "metrics": ["sacrebleu", "chrf"],
        "params": {"max_new_tokens": 200, "temperature": 0.1}
    }
}
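
# Each pack manifest describes what to run and how:
#   id / config / split   -> arguments for datasets.load_dataset
#   text_col / label_col  -> input and reference columns; dotted names such as
#                            "translation.en" refer to nested fields
#   sample_size           -> number of rows drawn from the split
#   metrics / params      -> metrics to report and generation parameters
#                            forwarded to inference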

def lint_model(model_id: str, token: str = None) -> Dict[str, Any]:
    """Import and lint a Hugging Face model."""
    try:
        # Get model info
        info = model_info(model_id, token=token)

        # Get the model card, if one exists
        card = None
        card_data = {}
        try:
            card = ModelCard.load(model_id, token=token)
            card_data = card.data.to_dict() if hasattr(card, 'data') else {}
        except Exception:
            pass
        card_text = (card.content or "").lower() if card is not None else ""

        # Lint checks
        checks = {
            "pipeline_tag": bool(info.pipeline_tag),
            "license": bool(card_data.get("license")),
            "model_card": bool(card_text),
            "tags": bool(info.tags),
            "language": bool(card_data.get("language")),
            "datasets": bool(card_data.get("datasets")),
            "metrics": bool(card_data.get("metrics")),
            "intended_use": "intended use" in card_text,
            "limitations": "limitation" in card_text,
            "bias_risks": any(word in card_text for word in ["bias", "fairness", "risk"])
        }

        # Calculate readiness score
        score = sum(checks.values()) / len(checks) * 100

        # Generate recommendations
        recommendations = []
        if not checks["license"]: recommendations.append("Add license information")
        if not checks["model_card"]: recommendations.append("Add detailed model card")
        if not checks["intended_use"]: recommendations.append("Specify intended use cases")
        if not checks["limitations"]: recommendations.append("Document known limitations")
        if not checks["bias_risks"]: recommendations.append("Address bias and safety considerations")

        return {
            "model_id": model_id,
            "task": info.pipeline_tag,
            "readiness_score": round(score),
            "checks": checks,
            "recommendations": recommendations,
            "downloads": info.downloads or 0,
            "likes": info.likes or 0,
            "created_at": info.created_at.isoformat() if info.created_at else None,
            "library_name": info.library_name
        }
    except Exception as e:
        return {"error": str(e)}

def run_inference(model_id: str, texts: List[str], task: str, token: str, params: Dict = None) -> Tuple[List[Dict], float]:
    """Run inference via the HF Inference API; returns (predictions, average latency in seconds)."""
    client = InferenceClient(model=model_id, token=token)
    results = []
    params = params or {}

    start_time = time.time()
    for text in texts:
        try:
            if task == "text-classification":
                result = client.text_classification(text)
                results.append(result[0] if isinstance(result, list) else result)
            elif task == "summarization":
                # Generation parameters from the pack are forwarded as keyword arguments
                result = client.summarization(text, **params)
                results.append(result)
            elif task == "translation":
                result = client.translation(text, **params)
                results.append(result)
            else:
                results.append({"error": "Unsupported task"})
        except Exception as e:
            results.append({"error": str(e)})

    total_time = time.time() - start_time
    avg_latency = total_time / len(texts) if texts else 0
    return results, avg_latency

def compute_metrics(task: str, predictions: List[Dict], references: List[str]) -> Dict[str, float]:
    """Compute task-specific metrics."""
    if task == "text-classification":
        pred_labels = [p.get("label", "UNKNOWN") if isinstance(p, dict) else "UNKNOWN" for p in predictions]
        accuracy = sum(1 for p, r in zip(pred_labels, references) if str(p).lower() == str(r).lower()) / len(references)
        return {"accuracy": round(accuracy, 4)}
    elif task == "summarization":
        pred_texts = [p.get("summary_text", "") if isinstance(p, dict) else "" for p in predictions]
        rouge_scores = rouge.compute(predictions=pred_texts, references=references)
        return {k: round(v, 4) for k, v in rouge_scores.items()}
    elif task == "translation":
        pred_texts = [p.get("translation_text", "") if isinstance(p, dict) else "" for p in predictions]
        bleu_scores = sacrebleu.compute(predictions=pred_texts, references=[[r] for r in references])
        # sacrebleu also returns list-valued fields (e.g. precisions); keep only scalar scores
        return {k: round(v, 4) for k, v in bleu_scores.items() if isinstance(v, (int, float))}
    return {}
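
# Helper (added here) for packs that reference nested dataset columns with
# dotted names, e.g. "translation.en" in the WMT14 pack.
def _get_field(item: Dict, col: str) -> Any:
    """Return item[col], following dots into nested dictionaries."""
    value = item
    for key in col.split('.'):
        value = value[key]
    return value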

def generate_readme_section(results: Dict) -> str:
    """Generate a README section for the model."""
    readme = f"""## Benchmark Results

**Evaluated on:** {datetime.now().strftime('%Y-%m-%d')}
**Task:** {results['task']}
**Readiness Score:** {results['readiness_score']}/100

### Performance Metrics
"""
    for dataset_result in results.get('benchmark_results', []):
        readme += f"\n**Dataset:** {dataset_result['dataset']}\n"
        for metric, value in dataset_result.get('metrics', {}).items():
            readme += f"- {metric}: {value}\n"
        if 'avg_latency' in dataset_result:
            readme += f"- Average Latency: {dataset_result['avg_latency']:.3f}s\n"

    readme += f"""
### Quick Start

```python
from transformers import pipeline

classifier = pipeline("text-classification", model="{results['model_id']}")
result = classifier("Your text here")
```

*Benchmarked with [Clarifai Community Bench](https://huggingface.co/spaces/your-space)*
"""
    return readme

# NOTE: the API route paths below are assumed from the handler names.
@app.route('/')
def index():
    return render_template('index.html', benchmark_packs=BENCHMARK_PACKS)


@app.route('/api/lint', methods=['POST'])
def api_lint_model():
    data = request.json
    model_id = data.get('model_id')
    token = data.get('token')

    if not model_id:
        return jsonify({"error": "Model ID is required"}), 400

    result = lint_model(model_id, token)
    return jsonify(result)

@app.route('/api/benchmark', methods=['POST'])
def api_run_benchmark():
    data = request.json
    model_id = data.get('model_id')
    pack_name = data.get('pack')
    token = data.get('token')

    if not all([model_id, pack_name, token]):
        return jsonify({"error": "Missing required parameters"}), 400
    if pack_name not in BENCHMARK_PACKS:
        return jsonify({"error": "Invalid benchmark pack"}), 400

    try:
        # First lint the model
        lint_result = lint_model(model_id, token)
        if "error" in lint_result:
            return jsonify(lint_result), 400

        pack = BENCHMARK_PACKS[pack_name]
        benchmark_results = []

        # Run the benchmark on each dataset in the pack
        for dataset_config in pack['datasets']:
            try:
                # Load dataset
                ds_params = {"path": dataset_config['id']}
                if dataset_config.get('config'):
                    ds_params['name'] = dataset_config['config']
                dataset = load_dataset(**ds_params, split=dataset_config['split'])

                sample_size = min(dataset_config['sample_size'], len(dataset))
                dataset = dataset.select(range(sample_size))

                # Extract texts and references (supports dotted column names)
                texts = [_get_field(item, dataset_config['text_col']) for item in dataset]
                references = [_get_field(item, dataset_config['label_col']) for item in dataset] if dataset_config.get('label_col') else None

                # Run inference
                predictions, avg_latency = run_inference(
                    model_id, texts, lint_result['task'], token, pack['params']
                )

                # Compute metrics
                metrics = compute_metrics(lint_result['task'], predictions, references) if references else {}

                benchmark_results.append({
                    "dataset": dataset_config['id'],
                    "samples": len(texts),
                    "metrics": metrics,
                    "avg_latency": round(avg_latency, 3),
                    "predictions": predictions[:5]  # First 5 for preview
                })
            except Exception as e:
                benchmark_results.append({
                    "dataset": dataset_config['id'],
                    "error": str(e)
                })

        # Combine results
        result = {
            **lint_result,
            "benchmark_results": benchmark_results,
            "pack_name": pack['name'],
            "timestamp": datetime.now().isoformat()
        }
        return jsonify(result)
    except Exception as e:
        return jsonify({"error": str(e)}), 500

@app.route('/api/readme', methods=['POST'])
def api_generate_readme():
    data = request.json
    readme_content = generate_readme_section(data)
    return jsonify({"readme": readme_content})

@app.route('/api/export', methods=['POST'])
def api_export_artifacts():
    data = request.json

    # Create the ZIP file in memory
    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
        # Add benchmark results as JSON
        zip_file.writestr('benchmark_results.json', json.dumps(data, indent=2))

        # Add YAML manifest
        manifest = {
            'model_id': data.get('model_id'),
            'task': data.get('task'),
            'benchmark_pack': data.get('pack_name'),
            'results': data.get('benchmark_results'),
            'timestamp': data.get('timestamp')
        }
        zip_file.writestr('manifest.yaml', yaml.dump(manifest, default_flow_style=False))

        # Add README section
        readme_content = generate_readme_section(data)
        zip_file.writestr('README_section.md', readme_content)

        # Add Python utility script
        python_script = '''
"""
Model Registration Utility
Generated by Clarifai Community Bench
"""
import json
import yaml
from datetime import datetime


class ModelArtifact:
    def __init__(self, manifest_path="manifest.yaml"):
        with open(manifest_path, 'r') as f:
            self.manifest = yaml.safe_load(f)

    def get_model_info(self):
        return {
            "id": self.manifest["model_id"],
            "task": self.manifest["task"],
            "readiness_score": self.manifest.get("readiness_score", 0),
            "avg_latency": self._calculate_avg_latency(),
            "best_dataset": self._get_best_performing_dataset()
        }

    def _calculate_avg_latency(self):
        results = self.manifest.get("results", [])
        if not results:
            return None
        latencies = [r.get("avg_latency", 0) for r in results if "avg_latency" in r]
        return sum(latencies) / len(latencies) if latencies else None

    def _get_best_performing_dataset(self):
        # Implementation depends on task-specific metrics
        return self.manifest.get("results", [{}])[0].get("dataset")


# Usage example:
# artifact = ModelArtifact()
# print(artifact.get_model_info())
'''
        zip_file.writestr('model_utility.py', python_script)

    # Read the completed archive only after the ZipFile has been closed
    zip_buffer.seek(0)
    return send_file(
        io.BytesIO(zip_buffer.read()),
        mimetype='application/zip',
        as_attachment=True,
        download_name=f'{data.get("model_id", "model").replace("/", "_")}_artifacts.zip'
    )

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=int(os.environ.get('PORT', 7860)), debug=False)
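
# Example request against the running app (illustrative only; uses the assumed
# /api/lint route above, a public model id, and a placeholder token):
#
#   import requests
#   resp = requests.post(
#       "http://localhost:7860/api/lint",
#       json={"model_id": "distilbert-base-uncased-finetuned-sst-2-english", "token": "hf_..."},
#   )
#   print(resp.json()["readiness_score"])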