import os
import json
import time
import yaml
from datetime import datetime
from flask import Flask, render_template, request, jsonify, send_file
from huggingface_hub import InferenceClient, ModelCard, model_info
from datasets import load_dataset
import pandas as pd
import evaluate
from typing import Dict, List, Any, Optional, Tuple
import io
import zipfile
app = Flask(__name__)
# Load evaluation metrics
rouge = evaluate.load("rouge")
sacrebleu = evaluate.load("sacrebleu")
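# Metrics are loaded once at import time so individual requests don't pay the
# download/initialization cost of evaluate.load().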
# Benchmark packs (manifests)
BENCHMARK_PACKS = {
"sentiment": {
"name": "Sentiment Analysis Pack",
"datasets": [
{"id": "imdb", "split": "test", "text_col": "text", "label_col": "label", "sample_size": 100},
{"id": "emotion", "split": "test", "text_col": "text", "label_col": "label", "sample_size": 100}
],
"metrics": ["accuracy", "f1_macro"],
"params": {"max_new_tokens": 32, "temperature": 0.1}
},
"summarization": {
"name": "Text Summarization Pack",
"datasets": [
{"id": "cnn_dailymail", "config": "3.0.0", "split": "test", "text_col": "article", "label_col": "highlights", "sample_size": 50},
{"id": "xsum", "split": "test", "text_col": "document", "label_col": "summary", "sample_size": 50}
],
"metrics": ["rouge1", "rouge2", "rougeL"],
"params": {"max_new_tokens": 150, "temperature": 0.3}
},
"translation": {
"name": "EN→FR Translation Pack",
"datasets": [
{"id": "wmt14", "config": "fr-en", "split": "test", "text_col": "translation.en", "label_col": "translation.fr", "sample_size": 50}
],
"metrics": ["sacrebleu", "chrf"],
"params": {"max_new_tokens": 200, "temperature": 0.1}
}
}
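# Each pack lists the datasets to sample, the metric names it targets, and the generation
# parameters forwarded to the Inference API. The metrics actually computed by
# compute_metrics() may differ slightly from this list (e.g. f1_macro and chrf are not computed).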
def lint_model(model_id: str, token: Optional[str] = None) -> Dict[str, Any]:
    """Fetch a HuggingFace model's metadata and lint its model card for release readiness."""
try:
# Get model info
info = model_info(model_id, token=token)
# Get model card
try:
card = ModelCard.load(model_id, token=token)
card_data = card.data.to_dict() if hasattr(card, 'data') else {}
        except Exception:
            card = None
            card_data = {}
        # Lint checks
        card_text = (card.content or "").lower() if card else ""
        checks = {
            "pipeline_tag": bool(info.pipeline_tag),
            "license": bool(card_data.get("license")),
            "model_card": bool(card_text),
            "tags": bool(info.tags),
            "language": bool(card_data.get("language")),
            "datasets": bool(card_data.get("datasets")),
            "metrics": bool(card_data.get("metrics")),
            "intended_use": "intended use" in card_text,
            "limitations": "limitation" in card_text,
            "bias_risks": any(word in card_text for word in ["bias", "fairness", "risk"])
        }
# Calculate readiness score
score = sum(checks.values()) / len(checks) * 100
# Generate recommendations
recommendations = []
if not checks["license"]: recommendations.append("Add license information")
if not checks["model_card"]: recommendations.append("Add detailed model card")
if not checks["intended_use"]: recommendations.append("Specify intended use cases")
if not checks["limitations"]: recommendations.append("Document known limitations")
if not checks["bias_risks"]: recommendations.append("Address bias and safety considerations")
return {
"model_id": model_id,
"task": info.pipeline_tag,
"readiness_score": round(score),
"checks": checks,
"recommendations": recommendations,
"downloads": info.downloads or 0,
"likes": info.likes or 0,
"created_at": info.created_at.isoformat() if info.created_at else None,
"library_name": info.library_name
}
except Exception as e:
return {"error": str(e)}
def run_inference(model_id: str, texts: List[str], task: str, token: str, params: Optional[Dict] = None) -> Tuple[List[Dict], float]:
    """Run inference via the HF Inference API; returns (predictions, average latency in seconds)."""
client = InferenceClient(model=model_id, token=token)
results = []
params = params or {}
start_time = time.time()
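    # Generation params are forwarded as keyword arguments to the client calls below;
    # depending on the installed huggingface_hub version they may instead need to be
    # wrapped in a `parameters` dict. Latency is averaged over the whole batch,
    # including requests that error out.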
for text in texts:
try:
if task == "text-classification":
result = client.text_classification(text)
results.append(result[0] if isinstance(result, list) else result)
elif task == "summarization":
result = client.summarization(text, **params)
results.append(result)
elif task == "translation":
result = client.translation(text, **params)
results.append(result)
else:
results.append({"error": "Unsupported task"})
except Exception as e:
results.append({"error": str(e)})
total_time = time.time() - start_time
avg_latency = total_time / len(texts) if texts else 0
return results, avg_latency
def compute_metrics(task: str, predictions: List[Dict], references: List[str]) -> Dict[str, float]:
"""Compute task-specific metrics"""
if task == "text-classification":
pred_labels = [p.get("label", "UNKNOWN") if isinstance(p, dict) else "UNKNOWN" for p in predictions]
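        # NOTE: this exact-string comparison assumes the model's label names match the
        # dataset's references; integer-coded labels (e.g. imdb's 0/1) need mapping first.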
accuracy = sum(1 for p, r in zip(pred_labels, references) if str(p).lower() == str(r).lower()) / len(references)
return {"accuracy": round(accuracy, 4)}
elif task == "summarization":
pred_texts = [p.get("summary_text", "") if isinstance(p, dict) else "" for p in predictions]
rouge_scores = rouge.compute(predictions=pred_texts, references=references)
return {k: round(v, 4) for k, v in rouge_scores.items()}
elif task == "translation":
pred_texts = [p.get("translation_text", "") if isinstance(p, dict) else "" for p in predictions]
        bleu_scores = sacrebleu.compute(predictions=pred_texts, references=[[r] for r in references])
        # sacrebleu also returns list-valued fields (counts, precisions, etc.); keep only scalar scores
        return {k: round(v, 4) for k, v in bleu_scores.items() if isinstance(v, (int, float))}
return {}
def generate_readme_section(results: Dict) -> str:
"""Generate README section for model"""
readme = f"""## Benchmark Results
**Evaluated on:** {datetime.now().strftime('%Y-%m-%d')}
**Task:** {results.get('task', 'unknown')}
**Readiness Score:** {results.get('readiness_score', 0)}/100
### Performance Metrics
"""
    for dataset_result in results.get('benchmark_results', []):
        readme += f"\n**Dataset:** {dataset_result['dataset']}\n"
        if 'error' in dataset_result:
            readme += f"- Error: {dataset_result['error']}\n"
            continue
        for metric, value in dataset_result.get('metrics', {}).items():
            readme += f"- {metric}: {value}\n"
        readme += f"- Average Latency: {dataset_result.get('avg_latency', 0):.3f}s\n"
readme += f"""
### Quick Start
```python
from transformers import pipeline
pipe = pipeline("{results.get('task', 'text-classification')}", model="{results.get('model_id', '')}")
result = pipe("Your text here")
```
*Benchmarked with [Clarifai Community Bench](https://huggingface.co/spaces/your-space)*
"""
return readme
@app.route('/')
def index():
return render_template('index.html', benchmark_packs=BENCHMARK_PACKS)
@app.route('/api/lint-model', methods=['POST'])
def api_lint_model():
    data = request.get_json(silent=True) or {}
model_id = data.get('model_id')
token = data.get('token')
if not model_id:
return jsonify({"error": "Model ID is required"}), 400
result = lint_model(model_id, token)
return jsonify(result)
@app.route('/api/run-benchmark', methods=['POST'])
def api_run_benchmark():
    data = request.get_json(silent=True) or {}
model_id = data.get('model_id')
pack_name = data.get('pack')
token = data.get('token')
if not all([model_id, pack_name, token]):
return jsonify({"error": "Missing required parameters"}), 400
if pack_name not in BENCHMARK_PACKS:
return jsonify({"error": "Invalid benchmark pack"}), 400
try:
# First lint the model
lint_result = lint_model(model_id, token)
if "error" in lint_result:
return jsonify(lint_result), 400
pack = BENCHMARK_PACKS[pack_name]
benchmark_results = []
# Run benchmark on each dataset in the pack
for dataset_config in pack['datasets']:
try:
# Load dataset
ds_params = {"path": dataset_config['id']}
if dataset_config.get('config'):
ds_params['name'] = dataset_config['config']
dataset = load_dataset(**ds_params, split=dataset_config['split'])
sample_size = min(dataset_config['sample_size'], len(dataset))
dataset = dataset.select(range(sample_size))
                # Extract text and references (supports dotted paths like "translation.en")
                def get_field(example, col):
                    value = example
                    for key in col.split('.'):
                        value = value[key]
                    return value
                texts = [get_field(item, dataset_config['text_col']) for item in dataset]
                references = [get_field(item, dataset_config['label_col']) for item in dataset] if dataset_config.get('label_col') else None
# Run inference
predictions, avg_latency = run_inference(
model_id, texts, lint_result['task'], token, pack['params']
)
# Compute metrics
metrics = compute_metrics(lint_result['task'], predictions, references) if references else {}
benchmark_results.append({
"dataset": dataset_config['id'],
"samples": len(texts),
"metrics": metrics,
"avg_latency": round(avg_latency, 3),
"predictions": predictions[:5] # First 5 for preview
})
except Exception as e:
benchmark_results.append({
"dataset": dataset_config['id'],
"error": str(e)
})
# Combine results
result = {
**lint_result,
"benchmark_results": benchmark_results,
"pack_name": pack['name'],
"timestamp": datetime.now().isoformat()
}
return jsonify(result)
except Exception as e:
return jsonify({"error": str(e)}), 500
@app.route('/api/generate-readme', methods=['POST'])
def api_generate_readme():
    data = request.get_json(silent=True) or {}
readme_content = generate_readme_section(data)
return jsonify({"readme": readme_content})
@app.route('/api/export-artifacts', methods=['POST'])
def api_export_artifacts():
    data = request.get_json(silent=True) or {}
# Create ZIP file in memory
zip_buffer = io.BytesIO()
with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
# Add benchmark results as JSON
zip_file.writestr('benchmark_results.json', json.dumps(data, indent=2))
# Add YAML manifest
manifest = {
'model_id': data.get('model_id'),
'task': data.get('task'),
'benchmark_pack': data.get('pack_name'),
'results': data.get('benchmark_results'),
'timestamp': data.get('timestamp')
}
zip_file.writestr('manifest.yaml', yaml.dump(manifest, default_flow_style=False))
# Add README section
readme_content = generate_readme_section(data)
zip_file.writestr('README_section.md', readme_content)
# Add Python utility script
python_script = '''
"""
Model Registration Utility
Generated by Clarifai Community Bench
"""
import yaml


class ModelArtifact:
    def __init__(self, manifest_path="manifest.yaml"):
        with open(manifest_path, 'r') as f:
            self.manifest = yaml.safe_load(f)
def get_model_info(self):
return {
"id": self.manifest["model_id"],
"task": self.manifest["task"],
"readiness_score": self.manifest.get("readiness_score", 0),
"avg_latency": self._calculate_avg_latency(),
"best_dataset": self._get_best_performing_dataset()
}
def _calculate_avg_latency(self):
results = self.manifest.get("results", [])
if not results:
return None
latencies = [r.get("avg_latency", 0) for r in results if "avg_latency" in r]
return sum(latencies) / len(latencies) if latencies else None
    def _get_best_performing_dataset(self):
        # Placeholder: ranking by task-specific metrics is left to the caller
        results = self.manifest.get("results") or [{}]
        return results[0].get("dataset")
# Usage example:
# artifact = ModelArtifact()
# print(artifact.get_model_info())
'''
zip_file.writestr('model_utility.py', python_script)
    zip_buffer.seek(0)
    return send_file(
        zip_buffer,
        mimetype='application/zip',
        as_attachment=True,
        download_name=f'{data.get("model_id", "model").replace("/", "_")}_artifacts.zip'
    )
if __name__ == '__main__':
app.run(host='0.0.0.0', port=int(os.environ.get('PORT', 7860)), debug=False)
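
# Example requests once the server is running (assumes the default port 7860):
#   curl -X POST http://localhost:7860/api/lint-model \
#        -H "Content-Type: application/json" \
#        -d '{"model_id": "<org>/<model>"}'
#   curl -X POST http://localhost:7860/api/run-benchmark \
#        -H "Content-Type: application/json" \
#        -d '{"model_id": "<org>/<model>", "pack": "sentiment", "token": "<hf_token>"}'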