Spaces:

jjyang7
/

bcb_evaluator_testing

Sleeping

bcb_evaluator_testing / api /bigcodebench_data.py

jjyang77

app first pass

0f87dc1 5 months ago

1.92 kB

	import os
	import json
	import gzip
	from typing import Dict, Iterable

	def stream_jsonl(filename: str) -> Iterable[Dict]:
	    """Lazily parse a JSON Lines file, yielding one decoded record per line.

	    Args:
	        filename: Path of the file to read. A name ending in ``.gz`` is
	            read as gzip-compressed text; anything else is read as plain
	            text. ``os.PathLike`` objects are accepted as well as strings.

	    Yields:
	        The value decoded from each non-blank line, in file order. Lines
	        that are empty or contain only whitespace are skipped.

	    Raises:
	        OSError: If the file cannot be opened or read.
	        json.JSONDecodeError: If a non-blank line is not valid JSON.
	    """
	    # Normalise first: pathlib.Path has no ``endswith``, so a PathLike
	    # argument would otherwise fail on the suffix test below.
	    path = os.fspath(filename)

	    # gzip.open accepts a path directly, so both formats share one read
	    # loop and a single file handle.
	    opener = gzip.open if path.endswith(".gz") else open

	    # JSON text is UTF-8 by specification; do not depend on the locale.
	    with opener(path, "rt", encoding="utf-8") as fp:
	        for line in fp:
	            if line.strip():
	                yield json.loads(line)


	def load_solutions(sample_path: os.PathLike) -> Iterable[Dict]:
	    """Yield validated solution records from a JSON Lines sample file.

	    Only a regular file is handled; it is read with ``stream_jsonl``.
	    A folder-of-samples layout is not implemented and is rejected.

	    Every record must contain the keys ``task_id``, ``res_id``, ``test``
	    and ``solution``, and ``solution`` must be a string. To supply several
	    solutions for one task, repeat the ``task_id`` on separate lines.

	    Each yielded record is the parsed sample with an added ``_identifier``
	    key of the form ``"<task_id> (line <n> in <sample_path>)"``, where
	    ``n`` counts non-blank records starting at 1.

	    Args:
	        sample_path: Path to the sample file.

	    Yields:
	        The validated sample dictionaries, in file order.

	    Raises:
	        NotImplementedError: If ``sample_path`` is not an existing file.
	        AssertionError: If a record lacks a required key or its
	            ``solution`` is not a string.
	    """
	    if not os.path.isfile(sample_path):
	        raise NotImplementedError("Only jsonl solution output file is supported for now.")

	    # stream_jsonl inspects the file name as a string, so hand it a plain
	    # path even when the caller passed a PathLike object.
	    path = os.fspath(sample_path)

	    for line_no, sample in enumerate(stream_jsonl(path), start=1):
	        # Raised explicitly rather than via ``assert`` so the checks still
	        # run under ``python -O``; type and messages are unchanged.
	        for key in ("task_id", "res_id", "test", "solution"):
	            if key not in sample:
	                raise AssertionError(f"No {key} found in sample!")
	        if not isinstance(sample["solution"], str):
	            raise AssertionError(
	                "Solution must be a string! If you have multiple solutions, "
	                "please repeat the task_id."
	            )

	        sample["_identifier"] = (
	            sample["task_id"] + f" (line {line_no} in {sample_path})"
	        )
	        yield sample