Spaces:
Sleeping
Sleeping
import os | |
import json | |
import gzip | |
from typing import Dict, Iterable | |
def stream_jsonl(filename: str) -> Iterable[Dict]:
    """Lazily parse a JSON Lines file, yielding one decoded object per line.

    Files whose name ends in ``.gz`` are transparently decompressed. Lines
    that are empty or contain only whitespace are skipped.

    Args:
        filename: Path to a ``.jsonl`` or ``.jsonl.gz`` file. Path-like
            objects (e.g. ``pathlib.Path``) are accepted as well as ``str``.

    Yields:
        The value decoded from each non-blank line by ``json.loads``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Normalise path-like objects so the suffix check below works for them.
    path = os.fspath(filename)
    opener = gzip.open if path.endswith(".gz") else open
    # JSON text is UTF-8; decode explicitly rather than relying on the
    # platform's locale-dependent default encoding.
    with opener(path, "rt", encoding="utf-8") as fp:
        for line in fp:
            if line.strip():
                yield json.loads(line)
def load_solutions(sample_path: os.PathLike) -> Iterable[Dict]:
    """Yield validated solution samples from a JSON Lines file.

    Only a single ``sample.jsonl`` style file is supported; any path that is
    not an existing regular file is rejected. Every non-blank line must decode
    to an object containing the keys ``task_id``, ``res_id``, ``test`` and
    ``solution``, where ``solution`` is a string. If a task has several
    solutions, repeat the ``task_id`` on separate lines.

    Each yielded sample is the decoded object with an extra ``_identifier``
    entry of the form ``"<task_id> (line <n> in <sample_path>)"``, where ``n``
    counts non-blank lines starting at 1.

    Args:
        sample_path: Path to the samples file.

    Yields:
        Each validated sample dictionary, in file order.

    Raises:
        NotImplementedError: If ``sample_path`` is not an existing file.
        AssertionError: If a sample lacks a required key or its ``solution``
            is not a string.
    """
    if not os.path.isfile(sample_path):
        raise NotImplementedError("Only jsonl solution output file is supported for now.")

    # stream_jsonl inspects the file name's suffix, so hand it a plain path
    # rather than an arbitrary path-like object.
    samples = stream_jsonl(os.fspath(sample_path))
    for line_no, sample in enumerate(samples, start=1):
        # Raise explicitly instead of using `assert` so that validation is
        # still performed when Python runs with -O.
        for key in ("task_id", "res_id", "test", "solution"):
            if key not in sample:
                raise AssertionError(f"No {key} found in sample!")
        if not isinstance(sample["solution"], str):
            raise AssertionError(
                "Solution must be a string! If you have multiple solutions, please repeat the task_id."
            )
        task_id = sample["task_id"]
        # Formatting (rather than `+`) also tolerates non-string task ids.
        sample["_identifier"] = f"{task_id} (line {line_no} in {sample_path})"
        yield sample