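"""
match.py

Pairwise (comparative) evaluation of two model outputs (A vs. B) with an LLM
judge, plus a debug evaluator that returns random scores without any API calls.
See the __main__ block at the bottom for a usage sketch.
"""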
import asyncio
from functools import partial
from random import random
from types import SimpleNamespace
from typing import Any, Dict, List, Literal, Optional, Tuple, Union
from .eval_utils import async_eval_w_prompt
class Match:
def __init__(
self,
        A: Optional[SimpleNamespace] = None,
eval_model: str = "gpt-3.5-turbo-0125",
B: Optional[SimpleNamespace] = None,
):
"""
No need to carry outputs nor generation configs
- if the outputs, and configurations are identifiable by modelA|Bname in the main system
"""
self.A = A
self.B = B
self.eval_model: str = eval_model
# results
        self.winner: Optional[Literal["A", "B"]] = None
        self.score: Optional[float] = None  # for abs_eval
self.match_result = []
self.match_metainfo_log = []
# promptname, evalmodel, cost, tokens
self.aggregated_result = None
async def async_comp_eval(
self,
position_swap: bool = False, # DEPRECATED (tested but not used anymore) --> refactor later
comp_prompt: Literal["llmbar_brief", "llmbar"] = "llmbar",
) -> Tuple[str, List[Dict[str, float]]]:
"""
returns:
winner = "A"
result= [{"A": 0.6, "B": 0.4}, {optional swap result: "A" will refer to the same model as the first one (prefer_2nd)}]
"""
# sanity check:
if (
self.A.instruction != self.B.instruction
or self.A.source != self.B.source
or self.A.task != self.B.task
):
raise ValueError(
f"A and B required to have the same inst/src: \n\t{A=}, \n\t{B=}"
)
jobs = []
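        # first job: A's output in the 1st position, B's in the 2nd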
evalf = partial(
async_eval_w_prompt,
prompt_obj=comp_prompt,
evalmodel=self.eval_model,
position_1st=self.A,
position_2nd=self.B,
)
jobs.append(evalf())
if position_swap:
evalf_swap = partial(
async_eval_w_prompt,
prompt_obj=comp_prompt,
evalmodel=self.eval_model,
position_1st=self.B, # B data into A position (swapped inputs)
position_2nd=self.A,
)
jobs.append(evalf_swap())
evaluation_results = await asyncio.gather(*jobs)
# log match_result in granular
match_result_, _tracking_obj, resp = evaluation_results[0]
match_result = match_result_.copy()
match_result["A"] = match_result.pop("prefer_1st", 0.0)
match_result["B"] = match_result.pop("prefer_2nd", 0.0)
match_result["output_text"] = (
resp.choices[0].message.content if resp is not None else "",
)
self.match_result.append(match_result)
if position_swap:
match_result_swap_, _tracking_obj_swap, resp_swap = evaluation_results[-1]
match_result_swap = match_result_swap_.copy()
match_result_swap["A"] = match_result_swap.pop(
"prefer_2nd", 0.0
) # changing the name
match_result_swap["B"] = match_result_swap.pop("prefer_1st", 0.0)
match_result_swap["output_text"] = (
resp_swap.choices[0].message.content if resp_swap is not None else "",
)
self.match_result.append(match_result_swap)
        # # token / cost logging
        # cost: float = _tracking_obj.cost_in_usd(model=self.eval_model, silent=True)
        # tokens: Dict[str, int] = _tracking_obj.print_summary(silent=True)
        # self.log_metainfo(cost=cost, tokens=tokens, prompt_name=comp_prompt)
        # if position_swap:
        #     cost_swap = _tracking_obj_swap.cost_in_usd(
        #         model=self.eval_model, silent=True
        #     )
        #     tokens_swap = _tracking_obj_swap.print_summary(silent=True)
        #     self.log_metainfo(
        #         cost=cost_swap,
        #         tokens=tokens_swap,
        #         prompt_name=comp_prompt,
        #     )
# aggregate and return
self.aggregated_result = self.aggregate_match_result()
self.winner = self.judge_winner()
return self.winner, self.match_result
async def async_dbg_eval(self, position_swap: bool = False):
# default
prefer_1st = random()
prefer_2nd = 1 - prefer_1st
        await asyncio.sleep(random() * 2.0)  # sleep for 0-2 seconds
default_result = {
"A": prefer_1st,
"B": prefer_2nd,
"model": "dbg",
"prompt_name": "noprompt",
"error": False,
"exception_str": "",
}
self.match_result.append(default_result)
# self.log_metainfo(cost=0.0, tokens=dict(), prompt_name="dbg")
if position_swap:
p = random()
swap_result = {
"A": p,
"B": 1 - p,
"model": "dbg",
"prompt_name": "noprompt",
"error": False,
"exception_str": "",
}
self.match_result.append(swap_result)
# self.log_metainfo(
# cost=0.0, tokens=dict(), prompt_name="dbg",
# )
self.aggregated_result = self.aggregate_match_result()
self.winner = self.judge_winner()
return self.winner, self.match_result
def aggregate_match_result(self) -> Dict[str, float]:
"""
input:
[
{A:0.4, B:0.5, (otherkeys)...}, # one or two results
]
output:
{
A: float
B: float
}
"""
if not self.match_result:
raise ValueError("Match.comp_eval() need to be executed first!")
        # average A/B scores over all runs (one, or two when position_swap was used)
        aggregate = {"A": 0.0, "B": 0.0}
        for res in self.match_result:
            aggregate["A"] += res["A"]
            aggregate["B"] += res["B"]
        aggregate = {k: v / len(self.match_result) for k, v in aggregate.items()}
error_exists = False
for res in self.match_result:
if res["error"]:
error_exists = res["error"]
break
aggregate["error"] = error_exists
return aggregate
    def judge_winner(self) -> Optional[Literal["A", "B"]]:
"""
based on self.aggregated_result, judge final winner
input:
{
A:float
B:float
error:bool
}
        output:
            "A" or "B" (None if an error occurred; ties are broken at random)
"""
if self.aggregated_result is None:
raise ValueError("Match.aggregate_math_result() need to be executed first!")
if self.aggregated_result["error"]:
winner = None
elif self.aggregated_result["A"] == self.aggregated_result["B"]:
winner = "A" if random() > 0.5 else "B"
else:
winner = (
"A"
if self.aggregated_result["A"] > self.aggregated_result["B"]
else "B"
)
return winner
# def log_metainfo(
# self,
# cost: float = 0.0,
# tokens: dict = None,
# prompt_name: str = None,
# position_swap: bool = False,
# ):
# metainfo_d = {
# "cost": cost,
# "tokens": tokens,
# "eval_prompt_name": prompt_name,
# "model": self.eval_model,
# "position_swap": position_swap,
# }
# self.match_metainfo_log.append(metainfo_d)
if __name__ == "__main__":
from pprint import pprint
# unit test for match.py (class Match)
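    # NOTE: this module uses a relative import (from .eval_utils ...), so run it
    # as a module inside its package (python -m <package>.match), not as a script.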
row1 = {
"model_id": "240413_dora",
"task": "๋Š˜๋ ค์“ฐ๊ธฐ",
"instruction": "์ž…๋ ฅ์œผ๋กœ ์ฃผ์–ด์ง„ ๊ธ€์˜ ๋‚ด์šฉ์„ ์œ ์ง€ํ•œ ์ฑ„, ์–ดํœ˜๋ฅผ ๋” ๊ธธ๊ฒŒ ๋ฐ”๊พธ๊ฑฐ๋‚˜ ์‚ฌ์†Œํ•œ ๋‚ด์šฉ์„ ์ถ”๊ฐ€ํ•˜์—ฌ ๋‹ค์‹œ ์จ์ฃผ์„ธ์š”.",
"source": "์šฐ๋ฆฌ ๊ฒŒ์ž„์ด ๋ˆ์„ ์ž˜ ๋ฒˆ๋‹ค๋ฉด",
"generated": "๋งŒ์•ฝ ์šฐ๋ฆฌ ๊ฒŒ์ž„์ด ์œ ์ €๋“ค์—๊ฒŒ ๋†’์€ ํ˜ธ์‘์„ ์–ป๊ณ  ๋งŽ์€ ์‚ฌ๋žŒ๋“ค์ด ์ฆ๊ฑฐ์šด ์‹œ๊ฐ„์„ ๋ณด๋‚ผ ์ˆ˜ ์žˆ๋„๋ก ์„œ๋น„์Šคํ•˜๊ณ  ์žˆ๋‹ค๋ฉด?",
}
row2 = {
"model_id": "manual",
"task": "๋Š˜๋ ค์“ฐ๊ธฐ",
"instruction": "์ž…๋ ฅ์œผ๋กœ ์ฃผ์–ด์ง„ ๊ธ€์˜ ๋‚ด์šฉ์„ ์œ ์ง€ํ•œ ์ฑ„, ์–ดํœ˜๋ฅผ ๋” ๊ธธ๊ฒŒ ๋ฐ”๊พธ๊ฑฐ๋‚˜ ์‚ฌ์†Œํ•œ ๋‚ด์šฉ์„ ์ถ”๊ฐ€ํ•˜์—ฌ ๋‹ค์‹œ ์จ์ฃผ์„ธ์š”.",
"source": "์šฐ๋ฆฌ ๊ฒŒ์ž„์ด ๋ˆ์„ ์ž˜ ๋ฒˆ๋‹ค๋ฉด",
"generated": "๋งŒ์•ฝ ์šฐ๋ฆฌ ๊ฒŒ์ž„์ด ๋” ๋งŽ์€ ๋งค์ถœ์„ ๋‚ด๊ณ  ๋” ๋งŽ์€ ํ”Œ๋ ˆ์ด์–ด๋“ค์ด ์œ ์ž…๋˜๋Š” ์ƒํ™ฉ์ด๋ผ๋ฉด?",
}
A = SimpleNamespace(**row1)
B = SimpleNamespace(**row2)
m = Match(A=A, B=B, eval_model="gpt-3.5-turbo-1106")
    # coroutines must be awaited; asyncio.run drives them from synchronous code
    winner, result = asyncio.run(m.async_comp_eval())
    print(winner)
    print(result)
    # absolute scoring: async_abs_eval is not defined in this class, so it is skipped here
    # score, result_ = asyncio.run(m.async_abs_eval())
    # print(score)
    # print(result_)
pprint(m.match_metainfo_log)
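    # Optional sketch (no API calls assumed): exercise the debug evaluator and the
    # position-swap aggregation path with random scores instead of LLM judgments.
    # dbg_winner, dbg_result = asyncio.run(m.async_dbg_eval(position_swap=True))
    # print(dbg_winner, dbg_result)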