import os
import json
import time
import yaml
from datetime import datetime
from flask import Flask, render_template, request, jsonify, send_file
from huggingface_hub import InferenceClient, ModelCard, model_info
from datasets import load_dataset
import pandas as pd
import evaluate
from typing import Dict, List, Any, Optional, Tuple
import io
import zipfile
app = Flask(__name__)
# Load evaluation metrics
rouge = evaluate.load("rouge")
sacrebleu = evaluate.load("sacrebleu")
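# Metrics are loaded once at import time so individual requests don't pay the
# download/initialization cost of evaluate.load().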
# Benchmark packs (manifests)
BENCHMARK_PACKS = {
"sentiment": {
"name": "Sentiment Analysis Pack",
"datasets": [
{"id": "imdb", "split": "test", "text_col": "text", "label_col": "label", "sample_size": 100},
{"id": "emotion", "split": "test", "text_col": "text", "label_col": "label", "sample_size": 100}
],
"metrics": ["accuracy", "f1_macro"],
"params": {"max_new_tokens": 32, "temperature": 0.1}
},
"summarization": {
"name": "Text Summarization Pack",
"datasets": [
{"id": "cnn_dailymail", "config": "3.0.0", "split": "test", "text_col": "article", "label_col": "highlights", "sample_size": 50},
{"id": "xsum", "split": "test", "text_col": "document", "label_col": "summary", "sample_size": 50}
],
"metrics": ["rouge1", "rouge2", "rougeL"],
"params": {"max_new_tokens": 150, "temperature": 0.3}
},
"translation": {
"name": "EN→FR Translation Pack",
"datasets": [
{"id": "wmt14", "config": "fr-en", "split": "test", "text_col": "translation.en", "label_col": "translation.fr", "sample_size": 50}
],
"metrics": ["sacrebleu", "chrf"],
"params": {"max_new_tokens": 200, "temperature": 0.1}
}
}
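# Each pack lists the datasets to sample, the metric names it targets, and the generation
# parameters forwarded to the Inference API. The metrics actually computed by
# compute_metrics() may differ slightly from this list (e.g. f1_macro and chrf are not computed).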
def lint_model(model_id: str, token: Optional[str] = None) -> Dict[str, Any]:
    """Fetch a HuggingFace model's metadata and lint its model card for release readiness."""
try:
# Get model info
info = model_info(model_id, token=token)
# Get model card
try:
card = ModelCard.load(model_id, token=token)
card_data = card.data.to_dict() if hasattr(card, 'data') else {}
        except Exception:
            card = None
            card_data = {}
        # Lint checks
        card_text = (card.content or "").lower() if card else ""
        checks = {
            "pipeline_tag": bool(info.pipeline_tag),
            "license": bool(card_data.get("license")),
            "model_card": bool(card_text),
            "tags": bool(info.tags),
            "language": bool(card_data.get("language")),
            "datasets": bool(card_data.get("datasets")),
            "metrics": bool(card_data.get("metrics")),
            "intended_use": "intended use" in card_text,
            "limitations": "limitation" in card_text,
            "bias_risks": any(word in card_text for word in ["bias", "fairness", "risk"])
        }
# Calculate readiness score
score = sum(checks.values()) / len(checks) * 100
# Generate recommendations
recommendations = []
if not checks["license"]: recommendations.append("Add license information")
if not checks["model_card"]: recommendations.append("Add detailed model card")
if not checks["intended_use"]: recommendations.append("Specify intended use cases")
if not checks["limitations"]: recommendations.append("Document known limitations")
if not checks["bias_risks"]: recommendations.append("Address bias and safety considerations")
return {
"model_id": model_id,
"task": info.pipeline_tag,
"readiness_score": round(score),
"checks": checks,
"recommendations": recommendations,
"downloads": info.downloads or 0,
"likes": info.likes or 0,
"created_at": info.created_at.isoformat() if info.created_at else None,
"library_name": info.library_name
}
except Exception as e:
return {"error": str(e)}
def run_inference(model_id: str, texts: List[str], task: str, token: str, params: Optional[Dict] = None) -> Tuple[List[Dict], float]:
    """Run inference via the HF Inference API; returns (predictions, average latency in seconds)."""
client = InferenceClient(model=model_id, token=token)
results = []
params = params or {}
start_time = time.time()
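    # Generation params are forwarded as keyword arguments to the client calls below;
    # depending on the installed huggingface_hub version they may instead need to be
    # wrapped in a `parameters` dict. Latency is averaged over the whole batch,
    # including requests that error out.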
for text in texts:
try:
if task == "text-classification":
result = client.text_classification(text)
results.append(result[0] if isinstance(result, list) else result)
elif task == "summarization":
result = client.summarization(text, **params)
results.append(result)
elif task == "translation":
result = client.translation(text, **params)
results.append(result)
else:
results.append({"error": "Unsupported task"})
except Exception as e:
results.append({"error": str(e)})
total_time = time.time() - start_time
avg_latency = total_time / len(texts) if texts else 0
return results, avg_latency
def compute_metrics(task: str, predictions: List[Dict], references: List[str]) -> Dict[str, float]:
"""Compute task-specific metrics"""
if task == "text-classification":
pred_labels = [p.get("label", "UNKNOWN") if isinstance(p, dict) else "UNKNOWN" for p in predictions]
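        # NOTE: this exact-string comparison assumes the model's label names match the
        # dataset's references; integer-coded labels (e.g. imdb's 0/1) need mapping first.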
accuracy = sum(1 for p, r in zip(pred_labels, references) if str(p).lower() == str(r).lower()) / len(references)
return {"accuracy": round(accuracy, 4)}
elif task == "summarization":
pred_texts = [p.get("summary_text", "") if isinstance(p, dict) else "" for p in predictions]
rouge_scores = rouge.compute(predictions=pred_texts, references=references)
return {k: round(v, 4) for k, v in rouge_scores.items()}
elif task == "translation":
pred_texts = [p.get("translation_text", "") if isinstance(p, dict) else "" for p in predictions]
        bleu_scores = sacrebleu.compute(predictions=pred_texts, references=[[r] for r in references])
        # sacrebleu also returns list-valued fields (counts, precisions, etc.); keep only scalar scores
        return {k: round(v, 4) for k, v in bleu_scores.items() if isinstance(v, (int, float))}
return {}
def generate_readme_section(results: Dict) -> str:
"""Generate README section for model"""
readme = f"""## Benchmark Results
**Evaluated on:** {datetime.now().strftime('%Y-%m-%d')}
**Task:** {results.get('task', 'unknown')}
**Readiness Score:** {results.get('readiness_score', 0)}/100
### Performance Metrics
"""
    for dataset_result in results.get('benchmark_results', []):
        readme += f"\n**Dataset:** {dataset_result['dataset']}\n"
        if 'error' in dataset_result:
            readme += f"- Error: {dataset_result['error']}\n"
            continue
        for metric, value in dataset_result.get('metrics', {}).items():
            readme += f"- {metric}: {value}\n"
        readme += f"- Average Latency: {dataset_result.get('avg_latency', 0):.3f}s\n"
readme += f"""
### Quick Start
```python
from transformers import pipeline
pipe = pipeline("{results.get('task', 'text-classification')}", model="{results.get('model_id', '')}")
result = pipe("Your text here")
```
*Benchmarked with [Clarifai Community Bench](https://huggingface.co/spaces/your-space)*
"""
return readme
@app.route('/')
def index():
return render_template('index.html', benchmark_packs=BENCHMARK_PACKS)
@app.route('/api/lint-model', methods=['POST'])
def api_lint_model():
    data = request.get_json(silent=True) or {}
model_id = data.get('model_id')
token = data.get('token')
if not model_id:
return jsonify({"error": "Model ID is required"}), 400
result = lint_model(model_id, token)
return jsonify(result)
@app.route('/api/run-benchmark', methods=['POST'])
def api_run_benchmark():
    data = request.get_json(silent=True) or {}
model_id = data.get('model_id')
pack_name = data.get('pack')
token = data.get('token')
if not all([model_id, pack_name, token]):
return jsonify({"error": "Missing required parameters"}), 400
if pack_name not in BENCHMARK_PACKS:
return jsonify({"error": "Invalid benchmark pack"}), 400
try:
# First lint the model
lint_result = lint_model(model_id, token)
if "error" in lint_result:
return jsonify(lint_result), 400
pack = BENCHMARK_PACKS[pack_name]
benchmark_results = []
# Run benchmark on each dataset in the pack
for dataset_config in pack['datasets']:
try:
# Load dataset
ds_params = {"path": dataset_config['id']}
if dataset_config.get('config'):
ds_params['name'] = dataset_config['config']
dataset = load_dataset(**ds_params, split=dataset_config['split'])
sample_size = min(dataset_config['sample_size'], len(dataset))
dataset = dataset.select(range(sample_size))
                # Extract text and references (supports dotted paths like "translation.en")
                def get_field(example, col):
                    value = example
                    for key in col.split('.'):
                        value = value[key]
                    return value
                texts = [get_field(item, dataset_config['text_col']) for item in dataset]
                references = [get_field(item, dataset_config['label_col']) for item in dataset] if dataset_config.get('label_col') else None
# Run inference
predictions, avg_latency = run_inference(
model_id, texts, lint_result['task'], token, pack['params']
)
# Compute metrics
metrics = compute_metrics(lint_result['task'], predictions, references) if references else {}
benchmark_results.append({
"dataset": dataset_config['id'],
"samples": len(texts),
"metrics": metrics,
"avg_latency": round(avg_latency, 3),
"predictions": predictions[:5] # First 5 for preview
})
except Exception as e:
benchmark_results.append({
"dataset": dataset_config['id'],
"error": str(e)
})
# Combine results
result = {
**lint_result,
"benchmark_results": benchmark_results,
"pack_name": pack['name'],
"timestamp": datetime.now().isoformat()
}
return jsonify(result)
except Exception as e:
return jsonify({"error": str(e)}), 500
@app.route('/api/generate-readme', methods=['POST'])
def api_generate_readme():
    data = request.get_json(silent=True) or {}
readme_content = generate_readme_section(data)
return jsonify({"readme": readme_content})
@app.route('/api/export-artifacts', methods=['POST'])
def api_export_artifacts():
    data = request.get_json(silent=True) or {}
# Create ZIP file in memory
zip_buffer = io.BytesIO()
with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
# Add benchmark results as JSON
zip_file.writestr('benchmark_results.json', json.dumps(data, indent=2))
# Add YAML manifest
manifest = {
'model_id': data.get('model_id'),
'task': data.get('task'),
'benchmark_pack': data.get('pack_name'),
'results': data.get('benchmark_results'),
'timestamp': data.get('timestamp')
}
zip_file.writestr('manifest.yaml', yaml.dump(manifest, default_flow_style=False))
# Add README section
readme_content = generate_readme_section(data)
zip_file.writestr('README_section.md', readme_content)
# Add Python utility script
python_script = '''
"""
Model Registration Utility
Generated by Clarifai Community Bench
"""
import yaml


class ModelArtifact:
    def __init__(self, manifest_path="manifest.yaml"):
        with open(manifest_path, 'r') as f:
            self.manifest = yaml.safe_load(f)
def get_model_info(self):
return {
"id": self.manifest["model_id"],
"task": self.manifest["task"],
"readiness_score": self.manifest.get("readiness_score", 0),
"avg_latency": self._calculate_avg_latency(),
"best_dataset": self._get_best_performing_dataset()
}
def _calculate_avg_latency(self):
results = self.manifest.get("results", [])
if not results:
return None
latencies = [r.get("avg_latency", 0) for r in results if "avg_latency" in r]
return sum(latencies) / len(latencies) if latencies else None
    def _get_best_performing_dataset(self):
        # Placeholder: ranking by task-specific metrics is left to the caller
        results = self.manifest.get("results") or [{}]
        return results[0].get("dataset")
# Usage example:
# artifact = ModelArtifact()
# print(artifact.get_model_info())
'''
zip_file.writestr('model_utility.py', python_script)
    zip_buffer.seek(0)
    return send_file(
        zip_buffer,
        mimetype='application/zip',
        as_attachment=True,
        download_name=f'{data.get("model_id", "model").replace("/", "_")}_artifacts.zip'
    )
if __name__ == '__main__':
app.run(host='0.0.0.0', port=int(os.environ.get('PORT', 7860)), debug=False)
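
# Example requests once the server is running (assumes the default port 7860):
#   curl -X POST http://localhost:7860/api/lint-model \
#        -H "Content-Type: application/json" \
#        -d '{"model_id": "<org>/<model>"}'
#   curl -X POST http://localhost:7860/api/run-benchmark \
#        -H "Content-Type: application/json" \
#        -d '{"model_id": "<org>/<model>", "pack": "sentiment", "token": "<hf_token>"}'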