# yourbench_space/evaluation.py
import os
import asyncio
import subprocess
from pathlib import Path

from loguru import logger

from yourbench_space.leaderboard_space.env import INIT_MODELS


ON_SPACES = os.environ.get("system") == "spaces"
OUTPUT_DIR = "/data" if ON_SPACES else "."  # TODO: fix the space folder


def create_eval_file(eval_ds_name: str) -> None:
    """Generate a custom lighteval task module for the given evaluation dataset."""
    task_name = eval_ds_name.replace("/", "_")
    template_path = Path("/home/user/app/yourbench_space/lighteval_task/yourbench_task.py")
    subprocess.run(
        ["lighteval", "tasks", "create", str(template_path), task_name, eval_ds_name],
        check=True,  # surface task-generation failures instead of continuing silently
    )


async def run_process(args: list, custom_env=None) -> dict:
    """Run a command asynchronously and return its pid and captured output."""
    process = await asyncio.create_subprocess_exec(
        *args, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, env=custom_env
    )
    try:
        # communicate() drains the pipes while waiting, avoiding the deadlock
        # that wait() can hit when the child fills its stdout/stderr buffers.
        stdout, stderr = await asyncio.wait_for(process.communicate(), timeout=350)
    except asyncio.TimeoutError:
        # wait_for raises asyncio.TimeoutError (an alias of the builtin only on
        # Python >= 3.11); kill the child so it does not linger after timeout.
        logger.error("Lighteval process timed out")
        process.kill()
        stdout, stderr = await process.communicate()
    return {"pid": process.pid, "stdout": stdout.decode(), "stderr": stderr.decode()}


async def run_evaluations(eval_ds_name: str, org: str, custom_env=None) -> str:
    """Run lighteval on every configured model in parallel; return a status string."""
    task_name = eval_ds_name.replace("/", "_")
    tasks = []
    for model_name, provider in INIT_MODELS:
        args = [
            "lighteval",
            "endpoint",
            "inference-providers",
            f"model={model_name},provider={provider}",
            f"custom|{task_name}|0|0",
            "--custom-tasks",
            f"custom_{task_name}_task.py",
            "--max-samples",
            "30",
            "--output-dir",
            OUTPUT_DIR,
            "--save-details",
            "--results-org",
            org,
            "--push-to-hub",
        ]
        tasks.append(run_process(args, custom_env))
    # return_exceptions=True keeps one failed run from cancelling the others;
    # failures come back as Exception instances instead of result dicts.
    processes = await asyncio.gather(*tasks, return_exceptions=True)
    for process in processes:
        if isinstance(process, Exception):
            logger.error(f"Evaluation task raised an exception: {process}")
            continue
        logger.info("Logs for process:")
        logger.info(process["stdout"])
        logger.info(process["stderr"])

    if all(not isinstance(result, Exception) for result in processes):
        return "✅"
    return "At least one model failed"