Spaces:
Running
Running
Update leaderboard.py
Browse files- leaderboard.py +58 -253
leaderboard.py
CHANGED
@@ -6,7 +6,6 @@ from datetime import datetime
|
|
6 |
import threading
|
7 |
import config
|
8 |
import math
|
9 |
-
import plotly.graph_objects as go
|
10 |
|
11 |
# Initialize Nextcloud client
|
12 |
nc = Nextcloud(nextcloud_url=config.NEXTCLOUD_URL, nc_auth_user=config.NEXTCLOUD_USERNAME, nc_auth_pass=config.NEXTCLOUD_PASSWORD)
|
@@ -112,101 +111,39 @@ def get_human_readable_name(model_name: str) -> str:
|
|
112 |
def get_leaderboard():
|
113 |
leaderboard = load_leaderboard()
|
114 |
|
115 |
-
#
|
116 |
-
|
117 |
-
|
118 |
-
if total_battles > 0:
|
119 |
-
win_rate = results["wins"] / total_battles
|
120 |
-
results["score"] = win_rate * (1 - 1 / (total_battles + 1))
|
121 |
-
else:
|
122 |
-
results["score"] = 0
|
123 |
-
|
124 |
-
# Sort results by score, then by total battles
|
125 |
-
sorted_results = sorted(
|
126 |
-
leaderboard.items(),
|
127 |
-
key=lambda x: (x[1]["score"], x[1]["wins"] + x[1]["losses"]),
|
128 |
-
reverse=True
|
129 |
-
)
|
130 |
-
# Explanation of the main leaderboard
|
131 |
-
explanation = """
|
132 |
-
<p style="font-size: 16px; margin-bottom: 20px;">
|
133 |
-
This leaderboard uses a scoring system that balances win rate and total battles. The score is calculated using the formula:
|
134 |
-
<br>
|
135 |
-
<strong>Score = Win Rate * (1 - 1 / (Total Battles + 1))</strong>
|
136 |
-
<br>
|
137 |
-
This formula rewards models with higher win rates and more battles. As the number of battles increases, the score approaches the win rate.
|
138 |
-
</p>
|
139 |
-
"""
|
140 |
-
|
141 |
-
leaderboard_html = f"""
|
142 |
-
{explanation}
|
143 |
-
<style>
|
144 |
-
.leaderboard-table {{
|
145 |
-
width: 100%;
|
146 |
-
border-collapse: collapse;
|
147 |
-
font-family: Arial, sans-serif;
|
148 |
-
}}
|
149 |
-
.leaderboard-table th, .leaderboard-table td {{
|
150 |
-
border: 1px solid #ddd;
|
151 |
-
padding: 8px;
|
152 |
-
text-align: left;
|
153 |
-
}}
|
154 |
-
.leaderboard-table th {{
|
155 |
-
background-color: rgba(255, 255, 255, 0.1);
|
156 |
-
font-weight: bold;
|
157 |
-
}}
|
158 |
-
.rank-column {{
|
159 |
-
width: 60px;
|
160 |
-
text-align: center;
|
161 |
-
}}
|
162 |
-
.opponent-details {{
|
163 |
-
font-size: 0.9em;
|
164 |
-
color: #888;
|
165 |
-
}}
|
166 |
-
</style>
|
167 |
-
<table class='leaderboard-table'>
|
168 |
-
<tr>
|
169 |
-
<th class='rank-column'>Rank</th>
|
170 |
-
<th>Model</th>
|
171 |
-
<th>Score</th>
|
172 |
-
<th>Wins</th>
|
173 |
-
<th>Losses</th>
|
174 |
-
<th>Win Rate</th>
|
175 |
-
<th>Total Battles</th>
|
176 |
-
<th>Top Rival</th>
|
177 |
-
<th>Toughest Opponent</th>
|
178 |
-
</tr>
|
179 |
-
"""
|
180 |
|
181 |
-
for
|
182 |
-
|
183 |
-
|
|
|
184 |
|
185 |
-
|
|
|
186 |
|
187 |
-
|
188 |
-
|
189 |
-
top_rival_wins = top_rival[1]["wins"]
|
190 |
|
191 |
-
|
192 |
-
|
193 |
-
toughest_opponent_losses = toughest_opponent[1]["losses"]
|
194 |
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
return
|
210 |
|
211 |
def calculate_elo_impact(model):
|
212 |
positive_impact = 0
|
@@ -232,101 +169,42 @@ def calculate_elo_impact(model):
|
|
232 |
|
233 |
def get_elo_leaderboard():
|
234 |
ensure_elo_ratings_initialized()
|
235 |
-
leaderboard = load_leaderboard()
|
236 |
|
237 |
-
#
|
|
|
|
|
|
|
|
|
238 |
all_models = set(dict(config.get_approved_models()).keys()) | set(leaderboard.keys())
|
239 |
|
240 |
-
elo_data = []
|
241 |
for model in all_models:
|
242 |
-
|
243 |
-
|
244 |
|
245 |
-
#
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
positive_impact, negative_impact, _ = calculate_elo_impact(model)
|
251 |
-
else:
|
252 |
-
wins = losses = total_battles = positive_impact = negative_impact = 0
|
253 |
|
254 |
-
|
255 |
-
|
256 |
-
'current_rating': current_rating,
|
257 |
-
'initial_rating': initial_rating,
|
258 |
-
'total_battles': total_battles,
|
259 |
-
'positive_impact': positive_impact,
|
260 |
-
'negative_impact': negative_impact
|
261 |
-
})
|
262 |
-
|
263 |
-
# Sort the data by current rating
|
264 |
-
sorted_elo_data = sorted(elo_data, key=lambda x: x['current_rating'], reverse=True)
|
265 |
-
|
266 |
-
min_initial_rating = min(data['initial_rating'] for data in elo_data)
|
267 |
-
max_initial_rating = max(data['initial_rating'] for data in elo_data)
|
268 |
-
|
269 |
-
explanation_elo = f"""
|
270 |
-
<p style="font-size: 16px; margin-bottom: 20px;">
|
271 |
-
This leaderboard uses a modified ELO rating system that takes into account both the performance and size of the models.
|
272 |
-
Initial ratings range from {round(min_initial_rating)} to {round(max_initial_rating)} points, based on model size, with larger models starting at higher ratings.
|
273 |
-
The "Positive Impact" score reflects the significance of wins, with higher scores for defeating larger models.
|
274 |
-
The "Negative Impact" score indicates the significance of losses, with higher scores for losing against smaller models.
|
275 |
-
The current ELO rating is calculated based on these impacts and the model's performance history.
|
276 |
-
</p>
|
277 |
-
"""
|
278 |
-
|
279 |
-
leaderboard_html = f"""
|
280 |
-
{explanation_elo}
|
281 |
-
<style>
|
282 |
-
.elo-leaderboard-table {{
|
283 |
-
width: 100%;
|
284 |
-
border-collapse: collapse;
|
285 |
-
font-family: Arial, sans-serif;
|
286 |
-
}}
|
287 |
-
.elo-leaderboard-table th, .elo-leaderboard-table td {{
|
288 |
-
border: 1px solid #ddd;
|
289 |
-
padding: 8px;
|
290 |
-
text-align: left;
|
291 |
-
}}
|
292 |
-
.elo-leaderboard-table th {{
|
293 |
-
background-color: rgba(255, 255, 255, 0.1);
|
294 |
-
font-weight: bold;
|
295 |
-
}}
|
296 |
-
.rank-column {{
|
297 |
-
width: 60px;
|
298 |
-
text-align: center;
|
299 |
-
}}
|
300 |
-
</style>
|
301 |
-
<table class='elo-leaderboard-table'>
|
302 |
-
<tr>
|
303 |
-
<th class='rank-column'>Rank</th>
|
304 |
-
<th>Model</th>
|
305 |
-
<th>Current ELO Rating</th>
|
306 |
-
<th>Positive Impact</th>
|
307 |
-
<th>Negative Impact</th>
|
308 |
-
<th>Total Battles</th>
|
309 |
-
<th>Initial Rating</th>
|
310 |
-
</tr>
|
311 |
-
"""
|
312 |
-
|
313 |
-
for index, data in enumerate(sorted_elo_data, start=1):
|
314 |
-
rank_display = {1: "🥇", 2: "🥈", 3: "🥉"}.get(index, f"{index}")
|
315 |
|
316 |
-
|
317 |
-
|
318 |
-
|
319 |
-
|
320 |
-
|
321 |
-
|
322 |
-
|
323 |
-
|
324 |
-
|
325 |
-
|
326 |
-
|
327 |
-
|
328 |
-
|
329 |
-
|
|
|
330 |
|
331 |
def create_backup():
|
332 |
while True:
|
@@ -344,77 +222,4 @@ def create_backup():
|
|
344 |
|
345 |
def start_backup_thread():
|
346 |
backup_thread = threading.Thread(target=create_backup, daemon=True)
|
347 |
-
backup_thread.start()
|
348 |
-
|
349 |
-
def get_leaderboard_chart():
|
350 |
-
battle_results = get_current_leaderboard()
|
351 |
-
|
352 |
-
# Calculate scores and sort results
|
353 |
-
for model, results in battle_results.items():
|
354 |
-
total_battles = results["wins"] + results["losses"]
|
355 |
-
if total_battles > 0:
|
356 |
-
win_rate = results["wins"] / total_battles
|
357 |
-
results["score"] = win_rate * (1 - 1 / (total_battles + 1))
|
358 |
-
else:
|
359 |
-
results["score"] = 0
|
360 |
-
|
361 |
-
sorted_results = sorted(
|
362 |
-
battle_results.items(),
|
363 |
-
key=lambda x: (x[1]["score"], x[1]["wins"] + x[1]["losses"]),
|
364 |
-
reverse=True
|
365 |
-
)
|
366 |
-
|
367 |
-
models = [get_human_readable_name(model) for model, _ in sorted_results]
|
368 |
-
wins = [results["wins"] for _, results in sorted_results]
|
369 |
-
losses = [results["losses"] for _, results in sorted_results]
|
370 |
-
scores = [results["score"] for _, results in sorted_results]
|
371 |
-
|
372 |
-
fig = go.Figure()
|
373 |
-
|
374 |
-
# Stacked Bar chart for Wins and Losses
|
375 |
-
fig.add_trace(go.Bar(
|
376 |
-
x=models,
|
377 |
-
y=wins,
|
378 |
-
name='Wins',
|
379 |
-
marker_color='#22577a'
|
380 |
-
))
|
381 |
-
fig.add_trace(go.Bar(
|
382 |
-
x=models,
|
383 |
-
y=losses,
|
384 |
-
name='Losses',
|
385 |
-
marker_color='#38a3a5'
|
386 |
-
))
|
387 |
-
|
388 |
-
# Line chart for Scores
|
389 |
-
fig.add_trace(go.Scatter(
|
390 |
-
x=models,
|
391 |
-
y=scores,
|
392 |
-
name='Score',
|
393 |
-
yaxis='y2',
|
394 |
-
line=dict(color='#ff7f0e', width=2)
|
395 |
-
))
|
396 |
-
|
397 |
-
# Update layout for full-width, increased height, and secondary y-axis
|
398 |
-
fig.update_layout(
|
399 |
-
title='Model Performance',
|
400 |
-
xaxis_title='Models',
|
401 |
-
yaxis_title='Number of Battles',
|
402 |
-
yaxis2=dict(
|
403 |
-
title='Score',
|
404 |
-
overlaying='y',
|
405 |
-
side='right'
|
406 |
-
),
|
407 |
-
barmode='stack',
|
408 |
-
height=800,
|
409 |
-
width=1450,
|
410 |
-
autosize=True,
|
411 |
-
legend=dict(
|
412 |
-
orientation='h',
|
413 |
-
yanchor='bottom',
|
414 |
-
y=1.02,
|
415 |
-
xanchor='right',
|
416 |
-
x=1
|
417 |
-
)
|
418 |
-
)
|
419 |
-
|
420 |
-
return fig
|
|
|
6 |
import threading
|
7 |
import config
|
8 |
import math
|
|
|
9 |
|
10 |
# Initialize Nextcloud client
|
11 |
nc = Nextcloud(nextcloud_url=config.NEXTCLOUD_URL, nc_auth_user=config.NEXTCLOUD_USERNAME, nc_auth_pass=config.NEXTCLOUD_PASSWORD)
|
|
|
111 |
def get_leaderboard():
    """Build the win-rate leaderboard as rows for a table display.

    Every row has the columns, in order: Model, Score, Wins, Losses,
    Total Battles, Win Rate. All cells are strings; the score is rendered
    with three decimals and the win rate as a percentage with one decimal.

    The score is ``win_rate * (1 - 1 / (total_battles + 1))``, which
    approaches the plain win rate as the number of battles grows.

    Returns:
        list[list[str]]: rows ordered by descending score. Equal scores are
        ordered by descending total battles.
    """
    leaderboard = load_leaderboard()

    ranked = []
    for model, results in leaderboard.items():
        wins = results.get('wins', 0)
        losses = results.get('losses', 0)
        total_battles = wins + losses

        win_rate = wins / total_battles if total_battles > 0 else 0

        # With zero battles win_rate is already 0, so the product is 0 and
        # no second guard is needed (the denominator is never zero).
        score = win_rate * (1 - 1 / (total_battles + 1))

        row = [
            get_human_readable_name(model),
            f"{score:.3f}",
            str(wins),
            str(losses),
            str(total_battles),
            f"{win_rate:.1%}",
        ]
        # Keep the unrounded score next to the row: ranking on the formatted
        # string would collapse scores that differ past the third decimal.
        ranked.append((score, total_battles, row))

    ranked.sort(key=lambda entry: (entry[0], entry[1]), reverse=True)

    return [row for _, _, row in ranked]
|
147 |
|
148 |
def calculate_elo_impact(model):
|
149 |
positive_impact = 0
|
|
|
169 |
|
170 |
def get_elo_leaderboard():
    """Build the ELO leaderboard as rows for a table display.

    Covers every approved model plus every model present in the stored
    leaderboard. Every row has the columns, in order: Model, ELO Rating,
    Wins, Losses, Total Battles, Win Rate. All cells are strings; the rating
    is rendered with one decimal and the win rate as a percentage with one
    decimal.

    A model without a stored rating is shown with
    ``1000 + get_model_size(model) * 100``.

    Returns:
        list[list[str]]: rows ordered by descending rating. Equal ratings
        are ordered by descending total battles, then by model name, so the
        result does not depend on set iteration order.
    """
    ensure_elo_ratings_initialized()

    leaderboard = load_leaderboard()
    all_models = set(dict(config.get_approved_models()).keys()) | set(leaderboard.keys())

    ranked = []
    for model in all_models:
        # Only compute the size-based starting rating when no rating is
        # stored, instead of evaluating it eagerly as a .get() default.
        if model in elo_ratings:
            rating = elo_ratings[model]
        else:
            rating = 1000 + (get_model_size(model) * 100)

        # Approved models that have not battled yet have no leaderboard entry.
        results = leaderboard.get(model, {})
        wins = results.get('wins', 0)
        losses = results.get('losses', 0)
        total_battles = wins + losses
        win_rate = wins / total_battles if total_battles > 0 else 0

        human_readable = get_human_readable_name(model)

        row = [
            human_readable,
            f"{rating:.1f}",
            str(wins),
            str(losses),
            str(total_battles),
            f"{win_rate:.1%}",
        ]
        # Rank on the numeric rating, not on the one-decimal display string.
        ranked.append((rating, total_battles, human_readable, row))

    ranked.sort(key=lambda entry: (-entry[0], -entry[1], entry[2]))

    return [row for _, _, _, row in ranked]
|
208 |
|
209 |
def create_backup():
|
210 |
while True:
|
|
|
222 |
|
223 |
def start_backup_thread():
    """Start ``create_backup`` on a background daemon thread.

    The thread is created with ``daemon=True``, so it does not keep the
    interpreter alive on exit. Nothing is returned.
    """
    worker = threading.Thread(daemon=True, target=create_backup)
    worker.start()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|