import os
import json
import time
import yaml
from datetime import datetime
from flask import Flask, render_template, request, jsonify, send_file
from huggingface_hub import InferenceClient, ModelCard, model_info
from datasets import load_dataset
import pandas as pd
import evaluate
from typing import Dict, List, Any, Optional, Tuple
import io
import zipfile

app = Flask(__name__)

# Load evaluation metrics
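# (loaded once at startup and reused across all benchmark requests)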
rouge = evaluate.load("rouge")
sacrebleu = evaluate.load("sacrebleu")

# Benchmark packs (manifests)
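# Each pack lists the HF datasets to sample, the text/label column mapping,
# the metrics to report, and default generation parameters for inference.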
BENCHMARK_PACKS = {
    "sentiment": {
        "name": "Sentiment Analysis Pack",
        "datasets": [
            {"id": "imdb", "split": "test", "text_col": "text", "label_col": "label", "sample_size": 100},
            {"id": "emotion", "split": "test", "text_col": "text", "label_col": "label", "sample_size": 100}
        ],
        "metrics": ["accuracy", "f1_macro"],
        "params": {"max_new_tokens": 32, "temperature": 0.1}
    },
    "summarization": {
        "name": "Text Summarization Pack",
        "datasets": [
            {"id": "cnn_dailymail", "config": "3.0.0", "split": "test", "text_col": "article", "label_col": "highlights", "sample_size": 50},
            {"id": "xsum", "split": "test", "text_col": "document", "label_col": "summary", "sample_size": 50}
        ],
        "metrics": ["rouge1", "rouge2", "rougeL"],
        "params": {"max_new_tokens": 150, "temperature": 0.3}
    },
    "translation": {
        "name": "EN→FR Translation Pack",
        "datasets": [
            {"id": "wmt14", "config": "fr-en", "split": "test", "text_col": "translation.en", "label_col": "translation.fr", "sample_size": 50}
        ],
        "metrics": ["sacrebleu", "chrf"],
        "params": {"max_new_tokens": 200, "temperature": 0.1}
    }
}

def lint_model(model_id: str, token: Optional[str] = None) -> Dict[str, Any]:
    """Fetch hub metadata and the model card for a model and score its documentation readiness."""
    try:
        # Get model info
        info = model_info(model_id, token=token)
        
        # Get model card (tolerate missing or malformed cards)
        try:
            card = ModelCard.load(model_id, token=token)
            card_data = card.data.to_dict() if hasattr(card, 'data') else {}
        except Exception:
            card = None
            card_data = {}
        
        card_text = (card.content or "").lower() if card is not None else ""
        
        # Lint checks
        checks = {
            "pipeline_tag": bool(info.pipeline_tag),
            "license": bool(card_data.get("license")),
            "model_card": bool(card is not None and card.content),
            "tags": bool(info.tags),
            "language": bool(card_data.get("language")),
            "datasets": bool(card_data.get("datasets")),
            "metrics": bool(card_data.get("metrics")),
            "intended_use": "intended use" in card_text,
            "limitations": "limitation" in card_text,
            "bias_risks": any(word in card_text for word in ["bias", "fairness", "risk"])
        }
        
        # Calculate readiness score
        score = sum(checks.values()) / len(checks) * 100
        
        # Generate recommendations
        recommendations = []
        if not checks["license"]: recommendations.append("Add license information")
        if not checks["model_card"]: recommendations.append("Add detailed model card")
        if not checks["intended_use"]: recommendations.append("Specify intended use cases")
        if not checks["limitations"]: recommendations.append("Document known limitations")
        if not checks["bias_risks"]: recommendations.append("Address bias and safety considerations")
        
        return {
            "model_id": model_id,
            "task": info.pipeline_tag,
            "readiness_score": round(score),
            "checks": checks,
            "recommendations": recommendations,
            "downloads": info.downloads or 0,
            "likes": info.likes or 0,
            "created_at": info.created_at.isoformat() if info.created_at else None,
            "library_name": info.library_name
        }
        
    except Exception as e:
        return {"error": str(e)}

def run_inference(model_id: str, texts: List[str], task: str, token: str, params: Optional[Dict] = None) -> Tuple[List[Dict], float]:
    """Run inference via the HF Inference API; returns (predictions, average latency per example)."""
    client = InferenceClient(model=model_id, token=token)
    results = []
    params = params or {}
    
    start_time = time.time()
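    # Examples are sent sequentially, so avg latency is total wall-clock time divided by example count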
    
    for text in texts:
        try:
            if task == "text-classification":
                result = client.text_classification(text)
                results.append(result[0] if isinstance(result, list) else result)
            elif task == "summarization":
                result = client.summarization(text, **params)
                results.append(result)
            elif task == "translation":
                result = client.translation(text, **params)
                results.append(result)
            else:
                results.append({"error": "Unsupported task"})
        except Exception as e:
            results.append({"error": str(e)})
    
    total_time = time.time() - start_time
    avg_latency = total_time / len(texts) if texts else 0
    
    return results, avg_latency

def compute_metrics(task: str, predictions: List[Dict], references: List[str]) -> Dict[str, float]:
    """Compute task-specific metrics"""
    if task == "text-classification":
        pred_labels = [p.get("label", "UNKNOWN") if isinstance(p, dict) else "UNKNOWN" for p in predictions]
        accuracy = sum(1 for p, r in zip(pred_labels, references) if str(p).lower() == str(r).lower()) / len(references)
        return {"accuracy": round(accuracy, 4)}
    
    elif task == "summarization":
        pred_texts = [p.get("summary_text", "") if isinstance(p, dict) else "" for p in predictions]
        rouge_scores = rouge.compute(predictions=pred_texts, references=references)
        return {k: round(v, 4) for k, v in rouge_scores.items()}
    
    elif task == "translation":
        pred_texts = [p.get("translation_text", "") if isinstance(p, dict) else "" for p in predictions]
        bleu_scores = sacrebleu.compute(predictions=pred_texts, references=[[r] for r in references])
        # sacrebleu also returns list-valued fields (e.g. per-n-gram precisions); keep scalar scores only
        return {k: round(v, 4) for k, v in bleu_scores.items() if isinstance(v, (int, float))}
    
    return {}

def generate_readme_section(results: Dict) -> str:
    """Generate README section for model"""
    readme = f"""## Benchmark Results

**Evaluated on:** {datetime.now().strftime('%Y-%m-%d')}
**Task:** {results['task']}
**Readiness Score:** {results['readiness_score']}/100

### Performance Metrics
"""
    
    for dataset_result in results.get('benchmark_results', []):
        readme += f"\n**Dataset:** {dataset_result['dataset']}\n"
        for metric, value in dataset_result['metrics'].items():
            readme += f"- {metric}: {value}\n"
        readme += f"- Average Latency: {dataset_result['avg_latency']:.3f}s\n"
    
    readme += f"""
### Quick Start
```python
from transformers import pipeline
pipe = pipeline("{results['task']}", model="{results['model_id']}")
result = pipe("Your text here")
```

*Benchmarked with [Clarifai Community Bench](https://huggingface.co/spaces/your-space)*
"""
    return readme

@app.route('/')
def index():
    return render_template('index.html', benchmark_packs=BENCHMARK_PACKS)

@app.route('/api/lint-model', methods=['POST'])
def api_lint_model():
    data = request.json
    model_id = data.get('model_id')
    token = data.get('token')
    
    if not model_id:
        return jsonify({"error": "Model ID is required"}), 400
    
    result = lint_model(model_id, token)
    return jsonify(result)

@app.route('/api/run-benchmark', methods=['POST'])
def api_run_benchmark():
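    # Lint first (to discover the model's pipeline task), then evaluate on every dataset in the selected pack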
    data = request.json
    model_id = data.get('model_id')
    pack_name = data.get('pack')
    token = data.get('token')
    
    if not all([model_id, pack_name, token]):
        return jsonify({"error": "Missing required parameters"}), 400
    
    if pack_name not in BENCHMARK_PACKS:
        return jsonify({"error": "Invalid benchmark pack"}), 400
    
    try:
        # First lint the model
        lint_result = lint_model(model_id, token)
        if "error" in lint_result:
            return jsonify(lint_result), 400
        
        pack = BENCHMARK_PACKS[pack_name]
        benchmark_results = []
        
        # Run benchmark on each dataset in the pack
        for dataset_config in pack['datasets']:
            try:
                # Load dataset
                ds_params = {"path": dataset_config['id']}
                if dataset_config.get('config'):
                    ds_params['name'] = dataset_config['config']
                
                dataset = load_dataset(**ds_params, split=dataset_config['split'])
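                # Cap the number of examples to keep Inference API usage bounded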
                sample_size = min(dataset_config['sample_size'], len(dataset))
                dataset = dataset.select(range(sample_size))
                
                # Extract text and references; resolve dotted paths like "translation.en" (nested columns)
                def get_col(example, path):
                    for key in path.split('.'):
                        example = example[key]
                    return example
                texts = [get_col(item, dataset_config['text_col']) for item in dataset]
                references = [get_col(item, dataset_config['label_col']) for item in dataset] if dataset_config.get('label_col') else None
                
                # Run inference
                predictions, avg_latency = run_inference(
                    model_id, texts, lint_result['task'], token, pack['params']
                )
                
                # Compute metrics
                metrics = compute_metrics(lint_result['task'], predictions, references) if references else {}
                
                benchmark_results.append({
                    "dataset": dataset_config['id'],
                    "samples": len(texts),
                    "metrics": metrics,
                    "avg_latency": round(avg_latency, 3),
                    "predictions": predictions[:5]  # First 5 for preview
                })
                
            except Exception as e:
                benchmark_results.append({
                    "dataset": dataset_config['id'],
                    "error": str(e)
                })
        
        # Combine results
        result = {
            **lint_result,
            "benchmark_results": benchmark_results,
            "pack_name": pack['name'],
            "timestamp": datetime.now().isoformat()
        }
        
        return jsonify(result)
        
    except Exception as e:
        return jsonify({"error": str(e)}), 500

@app.route('/api/generate-readme', methods=['POST'])
def api_generate_readme():
    data = request.json
    readme_content = generate_readme_section(data)
    return jsonify({"readme": readme_content})

@app.route('/api/export-artifacts', methods=['POST'])
def api_export_artifacts():
    data = request.json
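    # The exported bundle contains raw results (JSON), a manifest (YAML), a README snippet, and a loader script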
    
    # Create ZIP file in memory
    zip_buffer = io.BytesIO()
    
    with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
        # Add benchmark results as JSON
        zip_file.writestr('benchmark_results.json', json.dumps(data, indent=2))
        
        # Add YAML manifest
        manifest = {
            'model_id': data.get('model_id'),
            'task': data.get('task'),
            'benchmark_pack': data.get('pack_name'),
            'results': data.get('benchmark_results'),
            'timestamp': data.get('timestamp')
        }
        zip_file.writestr('manifest.yaml', yaml.dump(manifest, default_flow_style=False))
        
        # Add README section
        readme_content = generate_readme_section(data)
        zip_file.writestr('README_section.md', readme_content)
        
        # Add Python utility script
        python_script = f'''
"""
Model Registration Utility
Generated by Clarifai Community Bench
"""

import json
from datetime import datetime

class ModelArtifact:
    def __init__(self, manifest_path="manifest.yaml"):
        with open(manifest_path, 'r') as f:
            import yaml
            self.manifest = yaml.safe_load(f)
    
    def get_model_info(self):
        return {{
            "id": self.manifest["model_id"],
            "task": self.manifest["task"],
            "readiness_score": self.manifest.get("readiness_score", 0),
            "avg_latency": self._calculate_avg_latency(),
            "best_dataset": self._get_best_performing_dataset()
        }}
    
    def _calculate_avg_latency(self):
        results = self.manifest.get("results", [])
        if not results:
            return None
        latencies = [r.get("avg_latency", 0) for r in results if "avg_latency" in r]
        return sum(latencies) / len(latencies) if latencies else None
    
    def _get_best_performing_dataset(self):
        # Implementation depends on task-specific metrics
        return self.manifest.get("results", [{}])[0].get("dataset")

# Usage example:
# artifact = ModelArtifact()
# print(artifact.get_model_info())
'''
        zip_file.writestr('model_utility.py', python_script)
    
    zip_buffer.seek(0)
    
    return send_file(
        zip_buffer,  # already an in-memory BytesIO, seeked to the start
        mimetype='application/zip',
        as_attachment=True,
        download_name=f'{data.get("model_id", "model").replace("/", "_")}_artifacts.zip'
    )

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=int(os.environ.get('PORT', 7860)), debug=False)