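"""
match.py

Pairwise (comparative) evaluation of two model outputs (A vs. B) with an LLM
judge, plus a debug evaluator that returns random scores without any API calls.
See the __main__ block at the bottom for a usage sketch.
"""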
import asyncio
from functools import partial
from random import random
from types import SimpleNamespace
from typing import Any, Dict, List, Literal, Optional, Tuple, Union
from .eval_utils import async_eval_w_prompt
class Match:
def __init__(
self,
        A: Optional[SimpleNamespace] = None,
eval_model: str = "gpt-3.5-turbo-0125",
B: Optional[SimpleNamespace] = None,
):
"""
No need to carry outputs nor generation configs
- if the outputs, and configurations are identifiable by modelA|Bname in the main system
"""
self.A = A
self.B = B
self.eval_model: str = eval_model
# results
        self.winner: Optional[Literal["A", "B"]] = None
        self.score: Optional[float] = None  # for abs_eval
self.match_result = []
self.match_metainfo_log = []
# promptname, evalmodel, cost, tokens
self.aggregated_result = None
async def async_comp_eval(
self,
position_swap: bool = False, # DEPRECATED (tested but not used anymore) --> refactor later
comp_prompt: Literal["llmbar_brief", "llmbar"] = "llmbar",
) -> Tuple[str, List[Dict[str, float]]]:
"""
returns:
winner = "A"
result= [{"A": 0.6, "B": 0.4}, {optional swap result: "A" will refer to the same model as the first one (prefer_2nd)}]
"""
# sanity check:
if (
self.A.instruction != self.B.instruction
or self.A.source != self.B.source
or self.A.task != self.B.task
):
raise ValueError(
f"A and B required to have the same inst/src: \n\t{A=}, \n\t{B=}"
)
jobs = []
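        # first job: A's output in the 1st position, B's in the 2nd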
evalf = partial(
async_eval_w_prompt,
prompt_obj=comp_prompt,
evalmodel=self.eval_model,
position_1st=self.A,
position_2nd=self.B,
)
jobs.append(evalf())
if position_swap:
evalf_swap = partial(
async_eval_w_prompt,
prompt_obj=comp_prompt,
evalmodel=self.eval_model,
position_1st=self.B, # B data into A position (swapped inputs)
position_2nd=self.A,
)
jobs.append(evalf_swap())
evaluation_results = await asyncio.gather(*jobs)
# log match_result in granular
match_result_, _tracking_obj, resp = evaluation_results[0]
match_result = match_result_.copy()
match_result["A"] = match_result.pop("prefer_1st", 0.0)
match_result["B"] = match_result.pop("prefer_2nd", 0.0)
match_result["output_text"] = (
resp.choices[0].message.content if resp is not None else "",
)
self.match_result.append(match_result)
if position_swap:
match_result_swap_, _tracking_obj_swap, resp_swap = evaluation_results[-1]
match_result_swap = match_result_swap_.copy()
match_result_swap["A"] = match_result_swap.pop(
"prefer_2nd", 0.0
) # changing the name
match_result_swap["B"] = match_result_swap.pop("prefer_1st", 0.0)
match_result_swap["output_text"] = (
resp_swap.choices[0].message.content if resp_swap is not None else "",
)
self.match_result.append(match_result_swap)
        # # token / cost logging
        # cost: float = _tracking_obj.cost_in_usd(model=self.eval_model, silent=True)
        # tokens: Dict[str, int] = _tracking_obj.print_summary(silent=True)
        # self.log_metainfo(cost=cost, tokens=tokens, prompt_name=comp_prompt)
        # if position_swap:
        #     cost_swap = _tracking_obj_swap.cost_in_usd(
        #         model=self.eval_model, silent=True
        #     )
        #     tokens_swap = _tracking_obj_swap.print_summary(silent=True)
        #     self.log_metainfo(
        #         cost=cost_swap,
        #         tokens=tokens_swap,
        #         prompt_name=comp_prompt,
        #     )
# aggregate and return
self.aggregated_result = self.aggregate_match_result()
self.winner = self.judge_winner()
return self.winner, self.match_result
async def async_dbg_eval(self, position_swap: bool = False):
# default
prefer_1st = random()
prefer_2nd = 1 - prefer_1st
        await asyncio.sleep(random() * 2.0)  # sleep for 0-2 seconds
default_result = {
"A": prefer_1st,
"B": prefer_2nd,
"model": "dbg",
"prompt_name": "noprompt",
"error": False,
"exception_str": "",
}
self.match_result.append(default_result)
# self.log_metainfo(cost=0.0, tokens=dict(), prompt_name="dbg")
if position_swap:
p = random()
swap_result = {
"A": p,
"B": 1 - p,
"model": "dbg",
"prompt_name": "noprompt",
"error": False,
"exception_str": "",
}
self.match_result.append(swap_result)
# self.log_metainfo(
# cost=0.0, tokens=dict(), prompt_name="dbg",
# )
self.aggregated_result = self.aggregate_match_result()
self.winner = self.judge_winner()
return self.winner, self.match_result
def aggregate_match_result(self) -> Dict[str, float]:
"""
input:
[
{A:0.4, B:0.5, (otherkeys)...}, # one or two results
]
output:
{
A: float
B: float
}
"""
if not self.match_result:
raise ValueError("Match.comp_eval() need to be executed first!")
        # average A/B scores over all runs (one, or two when position_swap was used)
        aggregate = {"A": 0.0, "B": 0.0}
        for res in self.match_result:
            aggregate["A"] += res["A"]
            aggregate["B"] += res["B"]
        aggregate = {k: v / len(self.match_result) for k, v in aggregate.items()}
error_exists = False
for res in self.match_result:
if res["error"]:
error_exists = res["error"]
break
aggregate["error"] = error_exists
return aggregate
    def judge_winner(self) -> Optional[Literal["A", "B"]]:
"""
based on self.aggregated_result, judge final winner
input:
{
A:float
B:float
error:bool
}
        output:
            "A" or "B" (None if an error occurred; ties are broken at random)
"""
if self.aggregated_result is None:
raise ValueError("Match.aggregate_math_result() need to be executed first!")
if self.aggregated_result["error"]:
winner = None
elif self.aggregated_result["A"] == self.aggregated_result["B"]:
winner = "A" if random() > 0.5 else "B"
else:
winner = (
"A"
if self.aggregated_result["A"] > self.aggregated_result["B"]
else "B"
)
return winner
# def log_metainfo(
# self,
# cost: float = 0.0,
# tokens: dict = None,
# prompt_name: str = None,
# position_swap: bool = False,
# ):
# metainfo_d = {
# "cost": cost,
# "tokens": tokens,
# "eval_prompt_name": prompt_name,
# "model": self.eval_model,
# "position_swap": position_swap,
# }
# self.match_metainfo_log.append(metainfo_d)
if __name__ == "__main__":
from pprint import pprint
# unit test for match.py (class Match)
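    # NOTE: this module uses a relative import (from .eval_utils ...), so run it
    # as a module inside its package (python -m <package>.match), not as a script.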
row1 = {
"model_id": "240413_dora",
"task": "๋Š˜๋ ค์“ฐ๊ธฐ",
"instruction": "์ž…๋ ฅ์œผ๋กœ ์ฃผ์–ด์ง„ ๊ธ€์˜ ๋‚ด์šฉ์„ ์œ ์ง€ํ•œ ์ฑ„, ์–ดํœ˜๋ฅผ ๋” ๊ธธ๊ฒŒ ๋ฐ”๊พธ๊ฑฐ๋‚˜ ์‚ฌ์†Œํ•œ ๋‚ด์šฉ์„ ์ถ”๊ฐ€ํ•˜์—ฌ ๋‹ค์‹œ ์จ์ฃผ์„ธ์š”.",
"source": "์šฐ๋ฆฌ ๊ฒŒ์ž„์ด ๋ˆ์„ ์ž˜ ๋ฒˆ๋‹ค๋ฉด",
"generated": "๋งŒ์•ฝ ์šฐ๋ฆฌ ๊ฒŒ์ž„์ด ์œ ์ €๋“ค์—๊ฒŒ ๋†’์€ ํ˜ธ์‘์„ ์–ป๊ณ  ๋งŽ์€ ์‚ฌ๋žŒ๋“ค์ด ์ฆ๊ฑฐ์šด ์‹œ๊ฐ„์„ ๋ณด๋‚ผ ์ˆ˜ ์žˆ๋„๋ก ์„œ๋น„์Šคํ•˜๊ณ  ์žˆ๋‹ค๋ฉด?",
}
row2 = {
"model_id": "manual",
"task": "๋Š˜๋ ค์“ฐ๊ธฐ",
"instruction": "์ž…๋ ฅ์œผ๋กœ ์ฃผ์–ด์ง„ ๊ธ€์˜ ๋‚ด์šฉ์„ ์œ ์ง€ํ•œ ์ฑ„, ์–ดํœ˜๋ฅผ ๋” ๊ธธ๊ฒŒ ๋ฐ”๊พธ๊ฑฐ๋‚˜ ์‚ฌ์†Œํ•œ ๋‚ด์šฉ์„ ์ถ”๊ฐ€ํ•˜์—ฌ ๋‹ค์‹œ ์จ์ฃผ์„ธ์š”.",
"source": "์šฐ๋ฆฌ ๊ฒŒ์ž„์ด ๋ˆ์„ ์ž˜ ๋ฒˆ๋‹ค๋ฉด",
"generated": "๋งŒ์•ฝ ์šฐ๋ฆฌ ๊ฒŒ์ž„์ด ๋” ๋งŽ์€ ๋งค์ถœ์„ ๋‚ด๊ณ  ๋” ๋งŽ์€ ํ”Œ๋ ˆ์ด์–ด๋“ค์ด ์œ ์ž…๋˜๋Š” ์ƒํ™ฉ์ด๋ผ๋ฉด?",
}
A = SimpleNamespace(**row1)
B = SimpleNamespace(**row2)
m = Match(A=A, B=B, eval_model="gpt-3.5-turbo-1106")
    # coroutines must be awaited; asyncio.run drives them from synchronous code
    winner, result = asyncio.run(m.async_comp_eval())
    print(winner)
    print(result)
    # absolute scoring: async_abs_eval is not defined in this class, so it is skipped here
    # score, result_ = asyncio.run(m.async_abs_eval())
    # print(score)
    # print(result_)
pprint(m.match_metainfo_log)
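    # Optional sketch (no API calls assumed): exercise the debug evaluator and the
    # position-swap aggregation path with random scores instead of LLM judgments.
    # dbg_winner, dbg_result = asyncio.run(m.async_dbg_eval(position_swap=True))
    # print(dbg_winner, dbg_result)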