import os
import asyncio
import subprocess
from pathlib import Path

from loguru import logger

from yourbench_space.leaderboard_space.env import INIT_MODELS

ON_SPACES = os.environ.get("system") == "spaces"
OUTPUT_DIR = "/data" if ON_SPACES else "."  # TODO: fix the space folder


def create_eval_file(eval_ds_name: str):
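    """Generate the lighteval custom-task file for `eval_ds_name` ("/" is not a valid task-name character, so it is replaced with "_")."""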
    task_name = eval_ds_name.replace("/", "_")
    template_path = Path("/home/user/app/yourbench_space/lighteval_task/yourbench_task.py")
    subprocess.run(["lighteval", "tasks", "create", str(template_path), task_name, eval_ds_name])


async def run_process(args: list, custom_env=None) -> dict:
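    """Run a command as an async subprocess with a hard timeout, capturing its output."""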
    process = await asyncio.create_subprocess_exec(
        *args, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, env=custom_env
    )
    try:
        await asyncio.wait_for(process.wait(), timeout=350)
    except asyncio.TimeoutError:
        # Catch asyncio.TimeoutError for pre-3.11 compatibility (from 3.11 on it
        # is an alias of the builtin TimeoutError), and kill the child so the
        # reads below cannot block on a process that never exited.
        logger.error("Lighteval process timed out")
        process.kill()
    stdout = await process.stdout.read()
    stderr = await process.stderr.read()
    return {"pid": process.pid, "stdout": stdout.decode(), "stderr": stderr.decode()}


async def run_evaluations(eval_ds_name: str, org: str, custom_env=None) -> str:
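    """Evaluate every model in INIT_MODELS on the generated task and push results to `org`."""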
    task_name = eval_ds_name.replace("/", "_")
    tasks = []
    for model_name, provider in INIT_MODELS:
        args = [
            "lighteval",
            "endpoint",
            "inference-providers",
            f"model={model_name},provider={provider}",
            f"custom|{task_name}|0|0",
            "--custom-tasks",
            f"custom_{task_name}_task.py",
            "--max-samples",
            "30",
            "--output-dir",
            f"{OUTPUT_DIR}",
            "--save-details",
            "--results-org",
            org,
            "--push-to-hub",
        ]
        tasks.append(run_process(args, custom_env))
    # return_exceptions=True lets gather() collect failures instead of
    # cancelling the remaining evaluation runs.
    processes = await asyncio.gather(*tasks, return_exceptions=True)
    for process in processes:
        if isinstance(process, Exception):
            # Failed runs come back as Exception instances, not dicts.
            logger.error(f"Evaluation run failed: {process}")
            continue
        logger.info("Logs for process:")
        logger.info(process["stdout"])
        logger.info(process["stderr"])
    if all(not isinstance(result, Exception) for result in processes):
        return "✅"
    return "At least one model failed"