Spaces:
Sleeping
Sleeping
import math | |
import numpy as np | |
import pandas as pd | |
from sklearn.linear_model import LogisticRegression | |
def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
    """Fit Bradley-Terry / Elo-style ratings by maximum likelihood.

    A logistic regression without intercept is fit on a signed design
    matrix (+log(BASE) for model_a, -log(BASE) for model_b); its
    coefficients are mapped to the Elo scale.

    Parameters
    ----------
    df : pandas.DataFrame or list of dict
        Battle records with columns "model_a", "model_b", "winner".
        "winner" == "A" counts as a win for model_a; any other value is
        treated as a win for model_b.
    SCALE, BASE, INIT_RATING : Elo display parameters; the final rating
        is SCALE * coefficient + INIT_RATING.

    Returns
    -------
    pandas.DataFrame
        Columns ["Model", "Elo rating"], sorted by rating descending,
        with a 1-based index (rank).
    """
    if isinstance(df, list):
        df = pd.DataFrame(df)
    # dropping None vs sth: a battle with a missing side or outcome is unusable
    df = df.dropna(subset=["winner", "model_a", "model_b"])

    # map each model name to a column index of the design matrix
    models = pd.concat([df["model_a"], df["model_b"]]).unique()
    models = pd.Series(np.arange(len(models)), index=models)

    # duplicate battles: one A win => two A wins (stabilizes the fit)
    df = pd.concat([df, df], ignore_index=True)

    p = len(models.index)
    n = df.shape[0]

    X = np.zeros([n, p])
    X[np.arange(n), models[df["model_a"]]] = +math.log(BASE)
    X[np.arange(n), models[df["model_b"]]] = -math.log(BASE)

    Y = np.zeros(n)
    Y[df["winner"] == "A"] = 1.0

    # NOTE(review): the L=32 / L=35 values passed to .format below are stale
    # hard-coded line numbers, kept as-is to preserve the emitted message.
    WARNING = "elo.py:L{L} compute_mle_elo() // Warning: Seeing this message indicates the regression result for elo is unreliable. You should be test-running the Arena-Lite or something odd (perfect one-sided wins) is happening\n\nto avoid logistic regressor error, manually putting other class"
    if (Y == 0).all():
        # perfect one-sided sweep: inject one opposite label so the
        # logistic regression has two classes to fit
        print(WARNING.format(L=32))
        Y[-1] = 1.0
    elif (Y == 1.0).all():
        print(WARNING.format(L=35))
        Y[-1] = 0.0

    lr = LogisticRegression(fit_intercept=False)
    lr.fit(X, Y)

    elo_scores = SCALE * lr.coef_[0] + INIT_RATING
    elo_scores = pd.Series(elo_scores, index=models.index).sort_values(ascending=False)

    # BUG FIX: the row-building comprehension previously reused `n` (the
    # battle count) as its loop variable, shadowing it; use a distinct name.
    # Also removed a no-op self-assignment of the "Elo rating" column.
    df = (
        pd.DataFrame(
            [[model, round(elo_scores[model], 2)] for model in elo_scores.keys()],
            columns=["Model", "Elo rating"],
        )
        .sort_values("Elo rating", ascending=False)
        .reset_index(drop=True)
    )
    df.index = df.index + 1  # rank display starts at 1
    return df
# Utilities
def compute_relative_winrate_to_1st(elo_df):
    """
    Post-processing utility for saving the elo table to an excel file.
    Possibly works as an absolute measure of quality.

    elo_df:
        columns: Model, Elo rating
    adds:
        column: winrate_vs_1st -- expected P(win) vs. the top-rated model
        under the Elo model (0.5 for the leader itself), rounded to 3
        decimals.  (Docstring previously named this column
        "relative_winrate_to_1st", which did not match the code.)
    """
    rating1st = elo_df["Elo rating"].max()
    # Vectorized Elo win probability against the leader:
    # P(A beats B) = 1 / (1 + 10^(-(r_a - r_b) / 400))
    rate_diff = elo_df["Elo rating"] - rating1st
    elo_df["winrate_vs_1st"] = (1 / (1 + 10 ** (-rate_diff / 400))).round(3)
    print(elo_df)
    return elo_df
def elo_to_winrate(rating_a: float = None, rating_b: float = None,
                   scale: float = 400, base: float = 10) -> float:
    """Return P(A beats B) from two Elo ratings.

    Generalized: `scale` and `base` were previously hard-coded to 400/10;
    they now default to those values, matching compute_mle_elo's
    SCALE/BASE, so existing callers are unaffected.
    """
    rate_diff = rating_a - rating_b
    win_rate = 1 / (1 + base ** (-rate_diff / scale))
    return win_rate
def compute_elo_with_ci(df, n_bootstrap=1000):
    """
    Compute ELO ratings with 95% confidence intervals using bootstrapping.

    Parameters
    ----------
    df : pandas.DataFrame or list of dict
        Battle records accepted by compute_mle_elo.
    n_bootstrap : int
        Number of bootstrap resamples (sampling rows with replacement).

    Returns
    -------
    pandas.DataFrame
        Columns Model, Elo rating, 95% CI_lower, 95% CI_upper, sorted by
        rating descending with a 1-based index.
        NOTE(review): a model absent from some resamples contributes NaN
        to its quantiles -- its CI bounds may be NaN; confirm acceptable.
    """
    if isinstance(df, list):
        df = pd.DataFrame(df)

    bootstrap_elo_scores = []
    for _ in range(n_bootstrap):  # loop index was unused; renamed to `_`
        # resample battles with replacement
        sample_df = df.sample(n=len(df), replace=True)
        elo_scores = compute_mle_elo(sample_df)
        elo_scores = elo_scores.set_index("Model")["Elo rating"]
        bootstrap_elo_scores.append(elo_scores)
    bootstrap_df = pd.DataFrame(bootstrap_elo_scores)

    # 95% confidence interval bounds per model
    ci_lower = bootstrap_df.quantile(0.025).round(1)
    ci_upper = bootstrap_df.quantile(0.975).round(1)

    # point estimate from the full (non-resampled) data
    main_elo_df = compute_mle_elo(df)
    main_elo_df = main_elo_df.set_index("Model")

    # merge point estimate with CI bounds (aligned on Model index)
    result_df = main_elo_df.copy()
    result_df["95% CI_lower"] = ci_lower
    result_df["95% CI_upper"] = ci_upper
    result_df = result_df.sort_values("Elo rating", ascending=False)
    result_df["Elo rating"] = result_df["Elo rating"].round(1)
    result_df = result_df.reset_index()
    result_df.index = result_df.index + 1  # 1-based rank
    return result_df