evals / txt2json.py
hevok's picture
Update txt2json.py
46ad2d1 verified
"""
A function to parse lm_eval text output into JSON format
"""
import ast
import json
import os
def _parse_literal(text):
    """Return *text* as a Python literal when it is one, else unchanged.

    ``ast.literal_eval`` is used instead of ``eval`` because the text comes
    straight from the input file: ``eval`` would execute arbitrary
    expressions and resolve bare words such as ``max`` to live objects.
    """
    try:
        return ast.literal_eval(text.strip())
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return text


def _to_number(text):
    """Return *text* as a float, or the raw text when it is not numeric."""
    try:
        return float(text)
    except ValueError:
        return text


def _output_stem(name):
    """Upper-case the first and last character of *name*."""
    if len(name) < 2:
        # Slicing a one-character name would emit that character twice.
        return name.upper()
    return name[0].upper() + name[1:-1] + name[-1].upper()


def _parse_header(header_line):
    """Split a run header into ``(config, model_name)``.

    *header_line* is the text following ``'hf ('`` on the first line of a
    run.  Entries are comma separated and written as ``key=value`` or
    ``key: value``; everything after ``'batch_size: '`` is stored verbatim as
    the batch size.  ``model_name`` is the last path component of the
    ``pretrained`` entry, or ``None`` when the header has no such entry.

    Raises:
        ValueError: if the line carries no ``'batch_size: '`` field.
    """
    header, separator, batch_size = header_line.rpartition('batch_size: ')
    if not separator:
        raise ValueError(f"no 'batch_size: ' field in header: {header_line!r}")

    config = {'batch_size': batch_size}
    model_name = None
    for entry in header.split(','):
        entry = entry.strip()
        if not entry:
            continue
        if '=' in entry:
            key, _, value = entry.partition('=')
        elif ': ' in entry:
            key, _, value = entry.partition(': ')
        else:
            # Neither separator present: nothing to record for this entry.
            continue

        # Drop the parentheses that wrap argument groups, e.g. "(None)".
        if value.endswith(')'):
            value = value[:-1]
        if value.startswith('('):
            value = value[1:]

        if key == 'pretrained':
            # Keep only the model name, without organisation or directories.
            value = value.rstrip('/').split('/')[-1]
            model_name = value
        elif key != 'dtype':
            # dtype and pretrained always stay strings; other values become
            # Python literals (numbers, booleans, None) when possible.
            value = _parse_literal(value)
        config[key] = value
    return config, model_name


def _parse_results_table(lines):
    """Collect the metrics of every ``|``-delimited table row in *lines*.

    Rows with an empty first cell belong to the most recent named task.
    Each row stores ``'<metric>,<filter>'`` and ``'<metric>_stderr,<filter>'``;
    the plain ``'stderr'`` key is kept for backward compatibility and holds
    the value of the last row seen for that task.

    Raises:
        ValueError: if a table row does not split into 11 fields.
    """
    expected_fields = 11
    results = {}
    task = ''
    alias = ''
    for line in lines:
        if not line.startswith('|'):
            continue
        columns = [cell.strip() for cell in line.split('|')]
        if len(columns) != expected_fields:
            raise ValueError(
                f'expected {expected_fields} fields in table row, '
                f'got {len(columns)}: {line!r}'
            )
        (_, name, version, filter_name, nshot, metric,
         _, value, _, stderr, _) = columns

        # Skip the column titles and the dashed separator row.
        if name in ('Tasks', 'Groups') or '--' in name:
            continue

        # Sub-tasks are listed as "- name"; keep the decorated form as alias.
        short_name = name.split('- ')[-1]
        if short_name:
            task = short_name
            alias = name
            results[task] = {}
        entry = results.setdefault(task, {})

        if version:
            try:
                entry['version'] = float(version)
            except ValueError as err:
                print(err)
        if nshot:
            try:
                entry['nshot'] = int(nshot)
            except ValueError as err:
                print(err)

        entry['alias'] = alias
        entry[f'{metric},{filter_name}'] = _to_number(value)
        # Non-numeric cells such as "N/A" are kept as text instead of crashing.
        error_value = _to_number(stderr)
        entry[f'{metric}_stderr,{filter_name}'] = error_value
        entry['stderr'] = error_value
    return results


def txt2json(file):
    """Convert lm_eval text file to json format.

    The file is split on ``'hf ('``; each resulting section is treated as one
    evaluation run whose first line is the header and whose ``|``-prefixed
    lines form the results table.  For every run a file named
    ``<Model>_<lang>.json`` is written to the current working directory, where
    ``<Model>`` is the ``pretrained`` model name with its first and last
    character upper-cased and ``<lang>`` is the part of the input file name
    between the last ``_`` and ``.txt``.  Runs that resolve to the same output
    name overwrite each other.

    Args:
        file: path of the lm_eval text output to convert.

    Raises:
        ValueError: if a header has no ``batch_size`` field or no usable
            ``pretrained`` entry, or a table row has an unexpected number of
            columns.
    """
    # Use the base name only, so underscores in directory names cannot leak
    # path separators into the output file name.
    lang = os.path.basename(file).split('_')[-1].split('.txt')[0]

    # The tables contain non-ASCII symbols; do not rely on the locale default.
    with open(file, encoding='utf-8') as fh:
        evaluations = fh.read().split('hf (')[1:]
    print(len(evaluations))

    for evaluation in evaluations:
        lines = evaluation.split('\n')
        metadata, pretrained = _parse_header(lines[0])
        print(metadata)
        results = _parse_results_table(lines[1:])
        output = {'config': metadata, 'results': results}
        print(output)

        if not pretrained:
            # Never fall back to the model name of a previous run.
            raise ValueError(f"no 'pretrained' model name in header: {lines[0]!r}")
        target = f'{_output_stem(pretrained)}_{lang}.json'
        with open(target, 'w', encoding='utf-8') as f:
            json.dump(output, f, ensure_ascii=False, indent=2)