import asyncio
from functools import partial
from random import random
from types import SimpleNamespace
from typing import Any, Dict, List, Literal, Optional, Tuple, Union

from .eval_utils import async_eval_w_prompt


class Match:
    def __init__(
        self,
        A: Optional[SimpleNamespace] = None,
        eval_model: str = "gpt-3.5-turbo-0125",
        B: Optional[SimpleNamespace] = None,
    ):
        """
        Outputs and generation configs are not carried here:
        they are identifiable by the A/B model names in the main system.
        """
        self.A = A
        self.B = B
        self.eval_model: str = eval_model
        # results
        self.winner: Optional[Literal["A", "B"]] = None
        self.score: Optional[float] = None  # for abs_eval
        self.match_result = []
        self.match_metainfo_log = []
        # each entry: prompt name, eval model, cost, tokens
        self.aggregated_result = None

    async def async_comp_eval(
        self,
        position_swap: bool = False,  # DEPRECATED (tested but no longer used) --> refactor later
        comp_prompt: Literal["llmbar_brief", "llmbar"] = "llmbar",
    ) -> Tuple[str, List[Dict[str, float]]]:
        """
        returns:
            winner = "A"
            result = [
                {"A": 0.6, "B": 0.4},
                # optional swap result: its "A" refers to the same model as in
                # the first entry (taken from prefer_2nd of the swapped run)
            ]
        """
        # sanity check: A and B must share the same instruction/source/task
        if (
            self.A.instruction != self.B.instruction
            or self.A.source != self.B.source
            or self.A.task != self.B.task
        ):
            raise ValueError(
                f"A and B are required to have the same inst/src: \n\t{self.A=}, \n\t{self.B=}"
            )
        jobs = []
        evalf = partial(
            async_eval_w_prompt,
            prompt_obj=comp_prompt,
            evalmodel=self.eval_model,
            position_1st=self.A,
            position_2nd=self.B,
        )
        jobs.append(evalf())
        if position_swap:
            evalf_swap = partial(
                async_eval_w_prompt,
                prompt_obj=comp_prompt,
                evalmodel=self.eval_model,
                position_1st=self.B,  # B data into A position (swapped inputs)
                position_2nd=self.A,
            )
            jobs.append(evalf_swap())
        evaluation_results = await asyncio.gather(*jobs)
        # log match_result at a granular level
        match_result_, _tracking_obj, resp = evaluation_results[0]
        match_result = match_result_.copy()
        match_result["A"] = match_result.pop("prefer_1st", 0.0)
        match_result["B"] = match_result.pop("prefer_2nd", 0.0)
        match_result["output_text"] = (
            resp.choices[0].message.content if resp is not None else ""
        )
        self.match_result.append(match_result)
        if position_swap:
            match_result_swap_, _tracking_obj_swap, resp_swap = evaluation_results[-1]
            match_result_swap = match_result_swap_.copy()
            # in the swapped run, prefer_2nd scores model A and prefer_1st scores model B
            match_result_swap["A"] = match_result_swap.pop("prefer_2nd", 0.0)
            match_result_swap["B"] = match_result_swap.pop("prefer_1st", 0.0)
            match_result_swap["output_text"] = (
                resp_swap.choices[0].message.content if resp_swap is not None else ""
            )
            self.match_result.append(match_result_swap)
        # # token / cost logging
        # cost: float = _tracking_obj.cost_in_usd(model=self.eval_model, silent=True)
        # tokens: Dict[str, int] = _tracking_obj.print_summary(silent=True)
        # self.log_metainfo(cost=cost, tokens=tokens, prompt_name=comp_prompt)
        # if position_swap:
        #     cost_swap = _tracking_obj_swap.cost_in_usd(
        #         model=self.eval_model, silent=True
        #     )
        #     tokens_swap = _tracking_obj_swap.print_summary(silent=True)
        #     self.log_metainfo(
        #         cost=cost_swap,
        #         tokens=tokens_swap,
        #         prompt_name=comp_prompt,
        #     )

        # aggregate and return
        self.aggregated_result = self.aggregate_match_result()
        self.winner = self.judge_winner()
        return self.winner, self.match_result

    async def async_dbg_eval(self, position_swap: bool = False):
        # default
        prefer_1st = random()
        prefer_2nd = 1 - prefer_1st
        await asyncio.sleep(random() * 2.0)  # sleep 0-2 seconds
        default_result = {
            "A": prefer_1st,
            "B": prefer_2nd,
            "model": "dbg",
            "prompt_name": "noprompt",
            "error": False,
            "exception_str": "",
        }
        self.match_result.append(default_result)
        # self.log_metainfo(cost=0.0, tokens=dict(), prompt_name="dbg")
        if position_swap:
            p = random()
            swap_result = {
                "A": p,
                "B": 1 - p,
                "model": "dbg",
                "prompt_name": "noprompt",
                "error": False,
                "exception_str": "",
            }
            self.match_result.append(swap_result)
            # self.log_metainfo(
            #     cost=0.0, tokens=dict(), prompt_name="dbg",
            # )
        self.aggregated_result = self.aggregate_match_result()
        self.winner = self.judge_winner()
        return self.winner, self.match_result

    def aggregate_match_result(self) -> Dict[str, float]:
        """
        input:
            [
                {"A": 0.4, "B": 0.6, (other keys)...},  # one or two results
            ]
        output:
            {
                "A": float,
                "B": float,
                "error": bool,
            }
        """
        if not self.match_result:
            raise ValueError("Match.async_comp_eval() needs to be executed first!")
        aggregate = self.match_result[0].copy()
        if len(self.match_result) > 1:
            aggregate["A"] += self.match_result[1]["A"]
            aggregate["B"] += self.match_result[1]["B"]
            aggregate = {k: v / 2 for k, v in aggregate.items() if k in ("A", "B")}
        error_exists = False
        for res in self.match_result:
            if res["error"]:
                error_exists = res["error"]
                break
        aggregate["error"] = error_exists
        return aggregate

    def judge_winner(self) -> Optional[Literal["A", "B"]]:
        """
        Judge the final winner based on self.aggregated_result.
        input:
            {
                "A": float,
                "B": float,
                "error": bool,
            }
        output:
            Literal["A", "B"], or None if an error occurred
        """
        if self.aggregated_result is None:
            raise ValueError(
                "Match.aggregate_match_result() needs to be executed first!"
            )
        if self.aggregated_result["error"]:
            winner = None
        elif self.aggregated_result["A"] == self.aggregated_result["B"]:
            winner = "A" if random() > 0.5 else "B"  # break ties at random
        else:
            winner = (
                "A"
                if self.aggregated_result["A"] > self.aggregated_result["B"]
                else "B"
            )
        return winner

    # def log_metainfo(
    #     self,
    #     cost: float = 0.0,
    #     tokens: dict = None,
    #     prompt_name: str = None,
    #     position_swap: bool = False,
    # ):
    #     metainfo_d = {
    #         "cost": cost,
    #         "tokens": tokens,
    #         "eval_prompt_name": prompt_name,
    #         "model": self.eval_model,
    #         "position_swap": position_swap,
    #     }
    #     self.match_metainfo_log.append(metainfo_d)


if __name__ == "__main__":
    from pprint import pprint

    # unit test for match.py (class Match)
    row1 = {
        "model_id": "240413_dora",
        "task": "expanded rewriting",
        "instruction": "Keep the content of the given text, but rewrite it with longer wording or by adding minor details.",
        "source": "If our game becomes a big hit",
        "generated": "What if our game were winning greater favor with users and serving them so that many people could have an enjoyable time?",
    }
    row2 = {
        "model_id": "manual",
        "task": "expanded rewriting",
        "instruction": "Keep the content of the given text, but rewrite it with longer wording or by adding minor details.",
        "source": "If our game becomes a big hit",
        "generated": "What if our game were earning more revenue and drawing in more players?",
    }
    A = SimpleNamespace(**row1)
    B = SimpleNamespace(**row2)
    m = Match(A=A, B=B, eval_model="gpt-3.5-turbo-1106")
    winner, result = asyncio.run(m.async_comp_eval())
    # absolute (single-output) eval is not defined in this module:
    # score, result_ = asyncio.run(m.async_abs_eval())
    print(winner)
    print(result)
    # print(score)
    # print(result_)
    pprint(m.match_metainfo_log)
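
    # A minimal sketch (not part of the original test) of running several Match
    # evaluations concurrently. It relies only on the debug evaluator
    # (async_dbg_eval), which makes no API calls; the helper name `_run_many`
    # and the loop below are illustrative assumptions, not the system's actual
    # driver code.
    async def _run_many(matches):
        # schedule every match's debug evaluation on the same event loop
        return await asyncio.gather(
            *(mt.async_dbg_eval(position_swap=True) for mt in matches)
        )

    dbg_matches = [Match(A=A, B=B, eval_model="dbg") for _ in range(3)]
    for dbg_winner, dbg_result in asyncio.run(_run_many(dbg_matches)):
        print(dbg_winner, dbg_result)  # randomly scored debug winners/results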