"""Parse lm_eval text output into JSON format."""
import ast
import json
import os

# A results-table row split on '|' yields nine cells plus the empty strings
# found before the first pipe and after the last one.
_ROW_PIECES = 11


def _parse_header(header_line):
    """Turn the summary line of one evaluation into a config dict.

    ``header_line`` is the text following ``hf (`` on the summary line, e.g.
    ``pretrained=org/model,dtype=float), limit: None, batch_size: 8``.
    """
    config = {}
    settings, found, batch_size = header_line.partition('batch_size: ')
    if found:
        # Stored verbatim (as a string), before the other settings.
        config['batch_size'] = batch_size

    for entry in settings.split(','):
        entry = entry.strip()
        if not entry:
            continue
        # Split on the first separator only so values that themselves
        # contain '=' or ': ' do not break the unpacking.
        if '=' in entry:
            key, value = entry.split('=', 1)
        elif ': ' in entry:
            key, value = entry.split(': ', 1)
        else:
            # No recognisable "key=value" / "key: value" shape; ignore it.
            continue

        # Drop the parentheses that wrap the model args / gen_kwargs groups.
        if value.endswith(')'):
            value = value[:-1]
        if value.startswith('('):
            value = value[1:]

        if key == 'pretrained':
            # Keep only the model name, without the organisation prefix.
            value = value.split('/')[-1]
        elif key != 'dtype':
            # SECURITY: this text comes from the input file, so it is parsed
            # with ast.literal_eval (literals only) instead of eval(), which
            # would execute arbitrary expressions found in the file.
            try:
                value = ast.literal_eval(value)
            except (ValueError, TypeError, SyntaxError):
                # Not a Python literal: keep the raw string.
                pass
        config[key] = value
    return config


def _parse_table(table_lines):
    """Collect per-task metrics from the pipe-delimited result tables."""
    results = {}
    task = ''
    alias = ''
    for line in table_lines:
        if not line.startswith('|'):
            continue
        columns = [cell.strip() for cell in line.split('|')]
        if len(columns) != _ROW_PIECES:
            raise ValueError(
                f'expected {_ROW_PIECES - 2} table cells, '
                f'got {len(columns) - 2}: {line!r}'
            )
        (_, name, version, filter_name, nshot, metric,
         _, value, _, stderr, _) = columns

        # Skip the column-title rows and the '----' separator row.
        if name in ('Tasks', 'Groups') or '--' in name:
            continue

        # Sub-tasks are listed as "- name"; rows with an empty name carry
        # additional metrics for the most recently named task.
        short_name = name.split('- ')[-1]
        if short_name:
            task = short_name
            alias = name
            results[task] = {}
        entry = results.setdefault(task, {})

        if version:
            try:
                entry['version'] = float(version)
            except ValueError as err:
                # Non-numeric versions are reported and left out.
                print(err)
        if nshot:
            entry['nshot'] = int(nshot)
        entry['alias'] = alias
        entry[f'{metric},{filter_name}'] = float(value)

        # The Stderr cell may be blank or "N/A"; such rows have no stderr.
        if stderr and stderr != 'N/A':
            stderr_value = float(stderr)
            # Per-metric key so several metrics of one task do not overwrite
            # each other's stderr.
            entry[f'{metric}_stderr,{filter_name}'] = stderr_value
            # Legacy key: stderr of the last metric row that reported one.
            entry['stderr'] = stderr_value
    return results


def _output_stem(model_name):
    """Upper-case the first and last character of ``model_name``."""
    if len(model_name) < 2:
        # A single character is both first and last; do not duplicate it.
        return model_name.upper()
    return model_name[0].upper() + model_name[1:-1] + model_name[-1].upper()


def txt2json(file):
    """Convert lm_eval text file to json format.

    The file is split into evaluations at every ``hf (`` marker. For each
    evaluation the summary line becomes the ``config`` dict and the result
    tables become the ``results`` dict. Each evaluation is written to
    ``<Model>_<lang>.json`` in the current working directory, where
    ``<Model>`` is the model name with its first and last characters
    upper-cased and ``<lang>`` is the part of the input file name after its
    last underscore, without the ``.txt`` suffix. Evaluations of the same
    model therefore overwrite one another.

    Args:
        file: Path of the lm_eval text output to convert.

    Raises:
        ValueError: If an evaluation has no ``pretrained`` setting, a table
            row has an unexpected number of columns, or a numeric cell
            cannot be parsed.
    """
    # Use the file name only, so underscores in directory names are ignored.
    lang = os.path.basename(file).split('_')[-1].split('.txt')[0]
    with open(file, encoding='utf-8') as fh:
        data = fh.read().split('hf (')[1:]
    print(len(data))

    for evaluation in data:
        lines = evaluation.split('\n')
        metadata = _parse_header(lines[0])
        print(metadata)
        results = _parse_table(lines[1:])
        output = {'config': metadata, "results": results}
        print(output)

        pretrained = metadata.get('pretrained')
        if not pretrained:
            raise ValueError(
                f'no pretrained model name in header: {lines[0]!r}'
            )
        out_name = f'{_output_stem(pretrained)}_{lang}.json'
        # ensure_ascii=False emits raw non-ASCII text, so fix the encoding.
        with open(out_name, 'w', encoding='utf-8') as f:
            json.dump(output, f, ensure_ascii=False, indent=2)