Spaces:

NCSOFT
/

ArenaLite

Sleeping

File size: 8,968 Bytes

5b51c97

import asyncio
from functools import partial
from random import random
from types import SimpleNamespace
from typing import Any, Dict, List, Literal, Optional, Tuple, Union

from .eval_utils import async_eval_w_prompt

class Match:
    """One pairwise comparison between two candidates, ``A`` and ``B``.

    ``A`` and ``B`` are attribute containers; ``async_comp_eval`` reads their
    ``instruction``, ``source`` and ``task`` attributes for its sanity check
    and hands both objects to ``async_eval_w_prompt``.  Per-evaluation results
    accumulate in ``match_result``; ``aggregated_result`` and ``winner`` are
    derived from them.
    """

    def __init__(
        self,
        A: Optional[SimpleNamespace] = None,
        eval_model: str = "gpt-3.5-turbo-0125",
        B: Optional[SimpleNamespace] = None,
    ):
        """
        No need to carry outputs nor generation configs
            - if the outputs, and configurations are identifiable by modelA|Bname in the main system

        Args:
            A: first candidate.
            eval_model: evaluator model name forwarded to ``async_eval_w_prompt``.
            B: second candidate (required by ``async_comp_eval``).
        """

        self.A = A
        self.B = B

        self.eval_model: str = eval_model

        # results
        # "A" / "B" once judged; stays None before judging or when an error was recorded
        self.winner: Optional[Literal["A", "B"]] = None
        self.score: Optional[float] = None  # for abs_eval
        # one dict per evaluation run (a second one is appended when position_swap is used)
        self.match_result: List[Dict[str, Any]] = []
        # promptname, evalmodel, cost, tokens
        # (nothing in this class currently appends to it; the logging code is disabled)
        self.match_metainfo_log: List[Dict[str, Any]] = []
        self.aggregated_result: Optional[Dict[str, Any]] = None

    @staticmethod
    def _to_ab_result(
        raw_result: Dict[str, Any], resp: Any, swapped: bool = False
    ) -> Dict[str, Any]:
        """Rename positional preference keys to the fixed "A"/"B" keys.

        ``raw_result`` is copied, never mutated.  When ``swapped`` is True the
        evaluator saw B in the first position, so ``prefer_2nd`` belongs to A.
        A missing preference key falls back to 0.0.
        """
        result = raw_result.copy()
        if swapped:
            key_for_a, key_for_b = "prefer_2nd", "prefer_1st"
        else:
            key_for_a, key_for_b = "prefer_1st", "prefer_2nd"
        result["A"] = result.pop(key_for_a, 0.0)
        result["B"] = result.pop(key_for_b, 0.0)
        # NOTE(review): the trailing comma makes "output_text" a 1-tuple, not a
        # str.  This looks accidental, but it is kept as-is because consumers of
        # match_result may already rely on the shape -- confirm before changing.
        result["output_text"] = (
            resp.choices[0].message.content if resp is not None else "",
        )
        return result

    async def async_comp_eval(
        self,
        position_swap: bool = False,  # DEPRECATED (tested but not used anymore) --> refactor later
        comp_prompt: Literal["llmbar_brief", "llmbar"] = "llmbar",
    ) -> Tuple[Optional[str], List[Dict[str, Any]]]:
        """
        Run the comparative evaluation of A vs. B and record the outcome.

        Args:
            position_swap: also evaluate with B in the first position and A in
                the second; both evaluations are awaited together.
            comp_prompt: prompt identifier forwarded to ``async_eval_w_prompt``.

        returns:
            winner = "A"   (None when an evaluation reported an error)
            result= [{"A": 0.6, "B": 0.4}, {optional swap result: "A" will refer to the same model as the first one (prefer_2nd)}]

        Raises:
            ValueError: if A or B is missing, or if they differ in
                instruction, source or task.
        """

        # sanity check:
        if self.A is None or self.B is None:
            raise ValueError(
                "Both A and B are required for comparative evaluation: "
                f"\n\tA={self.A!r}, \n\tB={self.B!r}"
            )
        if (
            self.A.instruction != self.B.instruction
            or self.A.source != self.B.source
            or self.A.task != self.B.task
        ):
            # The message must reference self.A / self.B: bare A / B are not
            # defined in this scope and would raise NameError instead.
            raise ValueError(
                f"A and B required to have the same inst/src: \n\tA={self.A!r}, \n\tB={self.B!r}"
            )

        evalf = partial(
            async_eval_w_prompt,
            prompt_obj=comp_prompt,
            evalmodel=self.eval_model,
        )
        jobs = [evalf(position_1st=self.A, position_2nd=self.B)]
        if position_swap:
            # B data into A position (swapped inputs)
            jobs.append(evalf(position_1st=self.B, position_2nd=self.A))

        evaluation_results = await asyncio.gather(*jobs)

        # log match_result in granular
        # each evaluation yields (result dict, tracking object, response)
        raw_result, _tracking_obj, resp = evaluation_results[0]
        self.match_result.append(self._to_ab_result(raw_result, resp))

        if position_swap:
            raw_swap, _tracking_obj_swap, resp_swap = evaluation_results[-1]
            self.match_result.append(
                self._to_ab_result(raw_swap, resp_swap, swapped=True)
            )

        # Token / cost logging from the tracking objects is currently disabled.

        # aggregate and return
        self.aggregated_result = self.aggregate_match_result()
        self.winner = self.judge_winner()

        return self.winner, self.match_result

    async def async_dbg_eval(self, position_swap: bool = False):
        """
        Debug stand-in for ``async_comp_eval``: records random preferences
        after a random pause, without calling any evaluator.

        Returns the same ``(winner, match_result)`` pair as ``async_comp_eval``.
        """
        # default
        prefer_1st = random()
        prefer_2nd = 1 - prefer_1st
        await asyncio.sleep(random() * 2.0)  # sleep for 0 ~ 2 seconds
        default_result = {
            "A": prefer_1st,
            "B": prefer_2nd,
            "model": "dbg",
            "prompt_name": "noprompt",
            "error": False,
            "exception_str": "",
        }
        self.match_result.append(default_result)

        if position_swap:
            p = random()
            swap_result = {
                "A": p,
                "B": 1 - p,
                "model": "dbg",
                "prompt_name": "noprompt",
                "error": False,
                "exception_str": "",
            }
            self.match_result.append(swap_result)

        self.aggregated_result = self.aggregate_match_result()
        self.winner = self.judge_winner()

        return self.winner, self.match_result

    def aggregate_match_result(self) -> Dict[str, Any]:
        """
        Combine the recorded evaluation results into one summary dict.

        input:
            [
                {A:0.4, B:0.5, (otherkeys)...}, # one or two results
            ]
        output:
            {
                A: float
                B: float
                error: first truthy "error" value found, else False
            }
            With a single recorded result the other keys of that result are
            carried over as well; with two, only the averaged A and B are kept.

        Raises:
            ValueError: if no result has been recorded yet.
        """
        if not self.match_result:
            raise ValueError("Match.comp_eval() need to be executed first!")

        aggregate = self.match_result[0].copy()
        if len(self.match_result) > 1:
            second = self.match_result[1]
            # exact key membership (a substring test against "AB" would also
            # accept keys such as "" or "AB")
            aggregate = {
                key: (aggregate[key] + second[key]) / 2 for key in ("A", "B")
            }

        # propagate the first truthy error marker, if any
        aggregate["error"] = next(
            (res["error"] for res in self.match_result if res["error"]), False
        )

        return aggregate

    def judge_winner(self) -> Optional[Literal["A", "B"]]:
        """
        based on self.aggregated_result, judge final winner

        input:
            {
                A:float
                B:float
                error:bool
            }
        output:
            Literal[A,B], or None when the aggregated result carries an error.
            An exact tie is broken at random.

        Raises:
            ValueError: if self.aggregated_result has not been computed yet.
        """
        if self.aggregated_result is None:
            raise ValueError(
                "Match.aggregate_match_result() need to be executed first!"
            )

        if self.aggregated_result["error"]:
            return None

        score_a = self.aggregated_result["A"]
        score_b = self.aggregated_result["B"]
        if score_a == score_b:
            return "A" if random() > 0.5 else "B"
        return "A" if score_a > score_b else "B"

    # def log_metainfo(
    #     self,
    #     cost: float = 0.0,
    #     tokens: dict = None,
    #     prompt_name: str = None,
    #     position_swap: bool = False,
    # ):
    #     metainfo_d = {
    #         "cost": cost,
    #         "tokens": tokens,
    #         "eval_prompt_name": prompt_name,
    #         "model": self.eval_model,
    #         "position_swap": position_swap,
    #     }
    #     self.match_metainfo_log.append(metainfo_d)


if __name__ == "__main__":
    from pprint import pprint

    # Manual smoke test for match.py (class Match).
    # It runs one comparative evaluation through async_eval_w_prompt using the
    # "gpt-3.5-turbo-1106" evaluator name.
    # NOTE(review): this module uses a relative import, so it presumably has
    # to be launched as a module inside its package (python -m ...) -- confirm.
    row1 = {
        "model_id": "240413_dora",
        "task": "늘려쓰기",
        "instruction": "입력으로 주어진 글의 내용을 유지한 채, 어휘를 더 길게 바꾸거나 사소한 내용을 추가하여 다시 써주세요.",
        "source": "우리 게임이 돈을 잘 번다면",
        "generated": "만약 우리 게임이 유저들에게 높은 호응을 얻고 많은 사람들이 즐거운 시간을 보낼 수 있도록 서비스하고 있다면?",
    }
    row2 = {
        "model_id": "manual",
        "task": "늘려쓰기",
        "instruction": "입력으로 주어진 글의 내용을 유지한 채, 어휘를 더 길게 바꾸거나 사소한 내용을 추가하여 다시 써주세요.",
        "source": "우리 게임이 돈을 잘 번다면",
        "generated": "만약 우리 게임이 더 많은 매출을 내고 더 많은 플레이어들이 유입되는 상황이라면?",
    }
    A = SimpleNamespace(**row1)
    B = SimpleNamespace(**row2)
    m = Match(A=A, B=B, eval_model="gpt-3.5-turbo-1106")

    # async_comp_eval is a coroutine function: it must be driven by an event
    # loop, otherwise unpacking the un-awaited coroutine raises TypeError.
    winner, result = asyncio.run(m.async_comp_eval())
    # The earlier call to m.async_abs_eval() was dropped: Match defines no such
    # method, so it could only raise AttributeError.
    print(winner)
    print(result)
    pprint(m.match_metainfo_log)