import os import json import time import re import requests from concurrent.futures import ThreadPoolExecutor, as_completed from datetime import datetime # Get HF token HF_TOKEN = os.environ.get("HF_TOKEN", "") # Load models with open("models.json", "r") as f: models_data = json.load(f) # Extract model IDs model_ids = [model["id"] for model in models_data["data"]] # Limit to first 20 models model_ids = model_ids[:20] def extract_svg(text): """Extract SVG content from model response""" # First, check for code blocks with different markers code_block_patterns = [ r"```svg\s*(.*?)\s*```", r"```xml\s*(.*?)\s*```", r"```html\s*(.*?)\s*```", r"```\s*(.*?)\s*```", ] for pattern in code_block_patterns: match = re.search(pattern, text, re.DOTALL | re.IGNORECASE) if match: content = match.group(1) # Extract SVG from the code block content if "]*>.*?", content, re.DOTALL | re.IGNORECASE ) if svg_match: return svg_match.group(0) # If no code blocks, look for SVG directly in the text # Handle cases where SVG might be in thinking tags or other wrappers svg_pattern = r"]*>.*?" svg_match = re.search(svg_pattern, text, re.DOTALL | re.IGNORECASE) if svg_match: return svg_match.group(0) return None def test_model_with_temperature(model_id, temperature): """Test a single model with a specific temperature""" print(f"Testing {model_id} with temperature {temperature}...") result = { "model_id": model_id, "temperature": temperature, "timestamp": datetime.now().isoformat(), "success": False, "response_time": None, "svg_content": None, "error": None, "raw_response": None, } prompt = """Create a pelican riding a bicycle using SVG. Return only the SVG code without any explanation or markdown formatting. The SVG should be a complete, valid SVG document starting with and ending with .""" headers = { "Authorization": f"Bearer {HF_TOKEN}", "Content-Type": "application/json", } data = { "model": model_id, "messages": [{"role": "user", "content": prompt}], "max_tokens": 2000, "temperature": temperature, } try: start_time = time.time() response = requests.post( "https://router.huggingface.co/v1/chat/completions", headers=headers, json=data, timeout=60, ) response_time = time.time() - start_time result["response_time"] = response_time if response.status_code == 200: response_data = response.json() if response_data.get("choices") and response_data["choices"][0].get( "message" ): response_text = response_data["choices"][0]["message"]["content"] result["raw_response"] = response_text # Extract SVG svg_content = extract_svg(response_text) if svg_content: result["svg_content"] = svg_content result["success"] = True else: result["error"] = "No valid SVG found in response" else: result["error"] = "Empty response from model" else: result["error"] = f"HTTP {response.status_code}: {response.text}" except Exception as e: result["error"] = str(e) print(f"Error testing {model_id} with temperature {temperature}: {e}") return result def main(): temperatures = [0, 0.5, 1.0] print(f"Testing {len(model_ids)} models with {len(temperatures)} temperature settings...") results = [] # Create test tasks for each model and temperature combination test_tasks = [] for model_id in model_ids: for temp in temperatures: test_tasks.append((model_id, temp)) # Use ThreadPoolExecutor for concurrent requests with ThreadPoolExecutor(max_workers=10) as executor: future_to_task = { executor.submit(test_model_with_temperature, task[0], task[1]): task for task in test_tasks } for future in as_completed(future_to_task): task = future_to_task[future] model_id, temp = task try: result = future.result() results.append(result) print( f"Completed {model_id} (temp={temp}): {'Success' if result['success'] else 'Failed'}" ) except Exception as e: print(f"Exception for {model_id} (temp={temp}): {e}") results.append({ "model_id": model_id, "temperature": temp, "success": False, "error": str(e) }) # Save results with open("benchmark_results.json", "w") as f: json.dump(results, f, indent=2) # Generate statistics total_tests = len(results) successful_tests = sum(1 for r in results if r.get("success", False)) # Group by model to count unique models with at least one success models_with_success = {} for r in results: if r.get("success", False): models_with_success[r["model_id"]] = True stats = { "total_models": len(model_ids), "temperatures_tested": temperatures, "total_tests": total_tests, "successful_tests": successful_tests, "failed_tests": total_tests - successful_tests, "models_with_at_least_one_success": len(models_with_success), "average_response_time": ( sum(r.get("response_time", 0) for r in results if r.get("response_time")) / len([r for r in results if r.get("response_time")]) if any(r.get("response_time") for r in results) else 0 ), "successful_model_ids": list(models_with_success.keys()), } with open("benchmark_stats.json", "w") as f: json.dump(stats, f, indent=2) print("\nBenchmark complete!") print(f"Total models tested: {stats['total_models']}") print(f"Temperature settings: {stats['temperatures_tested']}") print(f"Total tests: {stats['total_tests']}") print(f"Successful tests: {stats['successful_tests']}") print(f"Failed tests: {stats['failed_tests']}") print(f"Models with at least one success: {stats['models_with_at_least_one_success']}") print(f"Average response time: {stats['average_response_time']:.2f}s") if __name__ == "__main__": main()