import math
from functools import partial

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
    """Maximum-likelihood Elo: fit a logistic regression (Bradley-Terry style)
    on battle outcomes and map the coefficients onto the Elo scale."""
    if isinstance(df, list):
        df = pd.DataFrame(df)
    # Drop battles where the winner or either model name is missing.
    df = df.dropna(subset=["winner", "model_a", "model_b"])
    # Map each model name to a column index of the design matrix.
    models = pd.concat([df["model_a"], df["model_b"]]).unique()
    models = pd.Series(np.arange(len(models)), index=models)
    # Duplicate every battle so each outcome is counted twice
    # ("one A win => two A wins").
    df = pd.concat([df, df], ignore_index=True)
    p = len(models.index)
    n = df.shape[0]
    # Design matrix: +log(BASE) in model_a's column, -log(BASE) in model_b's,
    # so fitted coefficients live on a log-odds scale in base BASE.
    X = np.zeros([n, p])
    X[np.arange(n), models[df["model_a"]]] = +math.log(BASE)
    X[np.arange(n), models[df["model_b"]]] = -math.log(BASE)
    # Binary outcome: 1.0 iff model_a won.
    Y = np.zeros(n)
    Y[df["winner"] == "A"] = 1.0
WARNING = "elo.py:L{L} compute_mle_elo() // Warning: Seeing this message indicates the regression result for elo is unreliable. You should be test-running the Arena-Lite or something odd (perfect one-sided wins) is happening\n\nto avoid logistic regressor error, manually putting other class"
if (Y == 0).all():
print(WARNING.format(L=32))
Y[-1] = 1.0
elif (Y == 1.0).all():
print(WARNING.format(L=35))
Y[-1] = 0.0
lr = LogisticRegression(fit_intercept=False)
lr.fit(X, Y)
elo_scores = SCALE * lr.coef_[0] + INIT_RATING
elo_scores = pd.Series(elo_scores, index=models.index).sort_values(ascending=False)
    # Build a 1-indexed leaderboard table sorted by rating.
    leaderboard = (
        pd.DataFrame(
            [[model, round(elo_scores[model], 2)] for model in elo_scores.keys()],
            columns=["Model", "Elo rating"],
        )
        .sort_values("Elo rating", ascending=False)
        .reset_index(drop=True)
    )
    leaderboard.index = leaderboard.index + 1
    return leaderboard
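
# Example usage (a minimal sketch with hypothetical model names): each battle
# records the two contestants and which side won ("A" or "B").
#
#   battles = [
#       {"model_a": "model-x", "model_b": "model-y", "winner": "A"},
#       {"model_a": "model-y", "model_b": "model-z", "winner": "B"},
#   ]
#   leaderboard = compute_mle_elo(battles)  # -> 1-indexed Model / Elo table
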
# Utilities
def compute_relative_winrate_to_1st(elo_df):
"""
Post-processing utility for saving elo table to an excel file. Possibly work as a absolute measure for quality.
elo_df:
columns: Model, Elo rating
add:
column: relative_winrate_to_1st
"""
rating1st = elo_df["Elo rating"].max()
win_rate_to_1st = partial(elo_to_winrate, rating_b=rating1st)
elo_df["winrate_vs_1st"] = elo_df["Elo rating"].apply(win_rate_to_1st).round(3)
print(elo_df)
return elo_df
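
# Example (hypothetical): compute_relative_winrate_to_1st(compute_mle_elo(battles))
# appends a winrate_vs_1st column: 0.5 for the leader itself and ~0.36 for a
# model rated 100 points below it.
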
def elo_to_winrate(rating_a: float, rating_b: float) -> float:
    # P(A beats B) under the Elo model (base 10, scale 400).
rate_diff = rating_a - rating_b
win_rate = 1 / (1 + 10 ** (-rate_diff / 400))
return win_rate
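
# Worked example: a 100-point Elo gap gives
#   elo_to_winrate(1100, 1000) = 1 / (1 + 10 ** (-100 / 400)) ≈ 0.64,
# i.e. the higher-rated model is expected to win about 64% of the time.
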
def compute_elo_with_ci(df, n_bootstrap=1000):
"""
    Compute Elo ratings with 95% confidence intervals using bootstrapping.
"""
if isinstance(df, list):
df = pd.DataFrame(df)
bootstrap_elo_scores = []
    for _ in range(n_bootstrap):
        # Resample battles with replacement and refit the ratings.
        sample_df = df.sample(n=len(df), replace=True)
elo_scores = compute_mle_elo(sample_df)
elo_scores = elo_scores.set_index("Model")["Elo rating"]
bootstrap_elo_scores.append(elo_scores)
bootstrap_df = pd.DataFrame(bootstrap_elo_scores)
    # 95% CI from the 2.5th and 97.5th bootstrap percentiles.
ci_lower = bootstrap_df.quantile(0.025).round(1)
ci_upper = bootstrap_df.quantile(0.975).round(1)
    # Elo point estimates fitted on the original (non-resampled) data.
main_elo_df = compute_mle_elo(df)
main_elo_df = main_elo_df.set_index("Model")
    # Attach the CI bounds to the point estimates.
result_df = main_elo_df.copy()
result_df["95% CI_lower"] = ci_lower
result_df["95% CI_upper"] = ci_upper
result_df = result_df.sort_values("Elo rating", ascending=False)
result_df["Elo rating"] = result_df["Elo rating"].round(1)
result_df = result_df.reset_index()
result_df.index = result_df.index + 1
return result_df
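
# Minimal smoke test on synthetic battles (hypothetical data, not part of the
# original pipeline). Bootstrapping refits the regression n_bootstrap times,
# so large battle tables can take a while.
if __name__ == "__main__":
    import random

    random.seed(0)
    demo_models = ["model-x", "model-y", "model-z"]
    demo_battles = []
    for _ in range(200):
        a, b = random.sample(demo_models, 2)
        demo_battles.append(
            {"model_a": a, "model_b": b, "winner": random.choice(["A", "B"])}
        )
    print(compute_elo_with_ci(demo_battles, n_bootstrap=100))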