import os
import asyncio
import subprocess
from pathlib import Path

from loguru import logger

from yourbench_space.leaderboard_space.env import INIT_MODELS


ON_SPACES = os.environ.get("system") == "spaces"
OUTPUT_DIR = "/data" if ON_SPACES else "."  # TODO: fix the space folder


def create_eval_file(eval_ds_name: str):
    """Generate a custom lighteval task file for the given evaluation dataset."""
    task_name = eval_ds_name.replace("/", "_")
    template_path = Path("/home/user/app/yourbench_space/lighteval_task/yourbench_task.py")
    subprocess.run(["lighteval", "tasks", "create", str(template_path), task_name, eval_ds_name])


async def run_process(args: list, custom_env=None) -> dict:
    """Run a command asynchronously, capture its output, and return pid/stdout/stderr."""
    process = await asyncio.create_subprocess_exec(
        *args,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
        env=custom_env,
    )
    try:
        await asyncio.wait_for(process.wait(), timeout=350)
    except asyncio.TimeoutError:
        logger.error("Lighteval process timed out")
        # Kill the process so reading stdout/stderr below does not block forever
        process.kill()
        await process.wait()

    stdout = await process.stdout.read()
    stderr = await process.stderr.read()
    return {"pid": process.pid, "stdout": stdout.decode(), "stderr": stderr.decode()}


async def run_evaluations(eval_ds_name: str, org: str, custom_env=None) -> str:
    """Run lighteval on every configured model concurrently and push results to the Hub."""
    task_name = eval_ds_name.replace("/", "_")
    tasks = []
    for model_name, provider in INIT_MODELS:
        args = [
            "lighteval",
            "endpoint",
            "inference-providers",
            f"model={model_name},provider={provider}",
            f"custom|{task_name}|0|0",
            "--custom-tasks",
            f"custom_{task_name}_task.py",
            "--max-samples",
            "30",
            "--output-dir",
            f"{OUTPUT_DIR}",
            "--save-details",
            "--results-org",
            org,
            "--push-to-hub",
        ]
        tasks.append(run_process(args, custom_env))

    # return_exceptions=True captures failed tasks instead of cancelling the whole batch
    processes = await asyncio.gather(*tasks, return_exceptions=True)
    for process in processes:
        if isinstance(process, Exception):
            logger.error(f"Process failed: {process}")
            continue
        logger.info("Logs for process:")
        logger.info(process["stdout"])
        logger.info(process["stderr"])

    if all(not isinstance(result, Exception) for result in processes):
        return "✅"
    return "At least one model failed"