import math
from functools import partial

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression


def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
    """Fit Bradley-Terry (MLE) Elo ratings via logistic regression."""
    if isinstance(df, list):
        df = pd.DataFrame(df)
    df = df.dropna(subset=["winner", "model_a", "model_b"])  # drop battles against None

    # map each model name to a column index in the design matrix
    models = pd.concat([df["model_a"], df["model_b"]]).unique()
    models = pd.Series(np.arange(len(models)), index=models)

    # duplicate battles
    df = pd.concat([df, df], ignore_index=True)
    p = len(models.index)
    n = df.shape[0]

    X = np.zeros([n, p])
    X[np.arange(n), models[df["model_a"]]] = +math.log(BASE)
    X[np.arange(n), models[df["model_b"]]] = -math.log(BASE)

    # one A win => two A wins (battles were duplicated above)
    Y = np.zeros(n)
    Y[df["winner"] == "A"] = 1.0

    WARNING = (
        "elo.py:L{L} compute_mle_elo() // Warning: seeing this message means the "
        "regression result for Elo is unreliable. You are probably test-running "
        "Arena-Lite, or something odd (perfect one-sided wins) is happening.\n\n"
        "To avoid a logistic-regression error on a single-class target, "
        "manually injecting one sample of the other class."
    )
    # LogisticRegression requires both classes to be present in Y
    if (Y == 0).all():
        print(WARNING.format(L=32))
        Y[-1] = 1.0
    elif (Y == 1.0).all():
        print(WARNING.format(L=35))
        Y[-1] = 0.0

    lr = LogisticRegression(fit_intercept=False)
    lr.fit(X, Y)

    elo_scores = SCALE * lr.coef_[0] + INIT_RATING
    elo_scores = pd.Series(elo_scores, index=models.index).sort_values(ascending=False)

    df = (
        pd.DataFrame(
            [[model, round(elo_scores[model], 2)] for model in elo_scores.keys()],
            columns=["Model", "Elo rating"],
        )
        .sort_values("Elo rating", ascending=False)
        .reset_index(drop=True)
    )
    df.index = df.index + 1  # rank from 1
    return df


# Utilities
def compute_relative_winrate_to_1st(elo_df):
    """
    Post-processing utility for saving the Elo table to an Excel file.
    May also serve as a rough absolute measure of quality.

    elo_df:
        columns: Model, Elo rating
    adds:
        column: winrate_vs_1st
    """
    rating1st = elo_df["Elo rating"].max()
    win_rate_to_1st = partial(elo_to_winrate, rating_b=rating1st)
    elo_df["winrate_vs_1st"] = elo_df["Elo rating"].apply(win_rate_to_1st).round(3)
    print(elo_df)
    return elo_df


def elo_to_winrate(rating_a: float, rating_b: float) -> float:
    # compute P(A beats B) from the Elo rating difference
    rate_diff = rating_a - rating_b
    win_rate = 1 / (1 + 10 ** (-rate_diff / 400))
    return win_rate


def compute_elo_with_ci(df, n_bootstrap=1000):
    """
    Compute Elo ratings with 95% confidence intervals using bootstrapping.
    """
    if isinstance(df, list):
        df = pd.DataFrame(df)

    bootstrap_elo_scores = []
    for _ in range(n_bootstrap):
        # resample battles with replacement
        sample_df = df.sample(n=len(df), replace=True)
        elo_scores = compute_mle_elo(sample_df)
        elo_scores = elo_scores.set_index("Model")["Elo rating"]
        bootstrap_elo_scores.append(elo_scores)
    bootstrap_df = pd.DataFrame(bootstrap_elo_scores)

    # confidence interval from bootstrap quantiles
    ci_lower = bootstrap_df.quantile(0.025).round(1)
    ci_upper = bootstrap_df.quantile(0.975).round(1)

    # point estimates from the original (non-resampled) data
    main_elo_df = compute_mle_elo(df)
    main_elo_df = main_elo_df.set_index("Model")

    # merge point estimates with the CI bounds
    result_df = main_elo_df.copy()
    result_df["95% CI_lower"] = ci_lower
    result_df["95% CI_upper"] = ci_upper
    result_df = result_df.sort_values("Elo rating", ascending=False)
    result_df["Elo rating"] = result_df["Elo rating"].round(1)
    result_df = result_df.reset_index()
    result_df.index = result_df.index + 1
    return result_df
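

# --- Usage sketch -------------------------------------------------------------
# A minimal, hypothetical demonstration of how this module might be driven; it is
# not part of the original API. The battle records and model names below are made
# up for illustration, but they match the columns compute_mle_elo() expects:
# "winner" is "A" when model_a won and "B" when model_b won.
if __name__ == "__main__":
    battles = [
        {"model_a": "model-x", "model_b": "model-y", "winner": "A"},
        {"model_a": "model-y", "model_b": "model-z", "winner": "B"},
        {"model_a": "model-x", "model_b": "model-z", "winner": "A"},
        {"model_a": "model-z", "model_b": "model-x", "winner": "B"},
        {"model_a": "model-y", "model_b": "model-x", "winner": "B"},
    ]
    leaderboard = compute_mle_elo(battles)
    leaderboard = compute_relative_winrate_to_1st(leaderboard)
    # a small n_bootstrap keeps the demo fast; real runs should use the default
    print(compute_elo_with_ci(battles, n_bootstrap=100))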