Spaces:
Running
Running
Upload index.html
Browse files- index.html +171 -31
index.html
CHANGED
@@ -263,20 +263,22 @@
|
|
263 |
const $tbody = document.getElementById('elo-tbody');
|
264 |
if(!$tbody) return;
|
265 |
const DATA = [
|
266 |
-
{ model:"GPT-5", org:"OpenAI", Elo:
|
267 |
-
{ model:"Gemini 2.5 Pro", org:"Google", Elo:
|
268 |
-
{ model:"
|
269 |
-
{ model:"
|
270 |
-
{ model:"
|
271 |
-
{ model:"
|
272 |
-
{ model:"
|
|
|
273 |
];
|
274 |
// Organisation -> logo path (relative)
|
275 |
const ORG_LOGOS = {
|
276 |
"OpenAI": "assets/logos/openai-svgrepo-com.svg",
|
277 |
"Google": "assets/logos/google-icon-logo-svgrepo-com.svg",
|
278 |
"Alibaba": "assets/logos/Qwen_logo.svg",
|
279 |
-
"Moonshot AI": "assets/logos/moonshot.png"
|
|
|
280 |
};
|
281 |
// Expose global mapping label->logo for charts
|
282 |
window.MODEL_LOGOS_BY_LABEL = {
|
@@ -286,7 +288,8 @@
|
|
286 |
'gemini-2.5-flash': ORG_LOGOS['Google'], 'Gemini 2.5 Flash': ORG_LOGOS['Google'],
|
287 |
'qwen3-235b-instruct': ORG_LOGOS['Alibaba'], 'Qwen3-235B-Instruct': ORG_LOGOS['Alibaba'],
|
288 |
'kimi-k2-instruct': ORG_LOGOS['Moonshot AI'], 'Kimi-K2-Instruct': ORG_LOGOS['Moonshot AI'],
|
289 |
-
'gpt-oss-120b': ORG_LOGOS['OpenAI'], 'GPT-OSS-120B': ORG_LOGOS['OpenAI']
|
|
|
290 |
};
|
291 |
// Global plugin to draw logos near tick labels
|
292 |
(function(){
|
@@ -571,8 +574,8 @@
|
|
571 |
<section id="wolves-elo">
|
572 |
<h3>Wolves Elo → “can you move the room?”</h3>
|
573 |
<p>As wolves, the strongest models don’t just aim for a single mis-elimination. They build momentum across days, aligning night choices with a public story, pacing their pressure, and keeping alternatives ready when new claims appear.</p>
|
574 |
-
<p>At the top, <strong>GPT-5</strong> dominates with disciplined multi-day control, while <strong>Kimi-K2</strong> and <strong>Gemini 2.5 Pro</strong> show high
|
575 |
-
<p>The rest cluster behind: <strong>GPT-5-mini</strong>, <strong>2.5 Flash</strong>, and <strong>Qwen3</strong> can bend a vote but rarely sustain deception into Day 2, and <strong>GPT-OSS</strong> remains transparent and easily repelled. Manipulation power is concentrated at the very top; most others rely on village mistakes or chaos to survive.</p>
|
576 |
<div class="card" style="margin:12px 0 10px 0; background:#ffffff; border:none;">
|
577 |
<h3>Per-role Elo — wolves</h3>
|
578 |
<div style="position:relative; height:360px; margin-left:auto; margin-right:auto;">
|
@@ -617,15 +620,16 @@
|
|
617 |
<div class="tabs" role="tablist" aria-label="Wolf profiles tabs">
|
618 |
<button class="tab-btn" id="tab-prof-gpt5" role="tab" aria-selected="false" aria-controls="panel-prof-gpt5">GPT‑5</button>
|
619 |
<button class="tab-btn" id="tab-prof-Gemini-2.5-pro" role="tab" aria-selected="false" aria-controls="panel-prof-Gemini-2.5-pro">Gemini‑2.5-Pro</button>
|
|
|
620 |
<button class="tab-btn" id="tab-prof-Gemini-2.5-Flash" role="tab" aria-selected="false" aria-controls="panel-prof-Gemini-2.5-Flash">Gemini‑2.5-Flash</button>
|
621 |
-
<button class="tab-btn" id="tab-prof-kimi" role="tab" aria-selected="
|
622 |
<button class="tab-btn" id="tab-prof-mini" role="tab" aria-selected="false" aria-controls="panel-prof-mini">GPT‑5‑mini</button>
|
623 |
<button class="tab-btn" id="tab-prof-qwen3" role="tab" aria-selected="false" aria-controls="panel-prof-qwen3">Qwen3‑235B</button>
|
624 |
<button class="tab-btn" id="tab-prof-oss" role="tab" aria-selected="false" aria-controls="panel-prof-oss">GPT‑OSS‑120B</button>
|
625 |
</div>
|
626 |
|
627 |
<!-- Kimi-K2 (filled) -->
|
628 |
-
<section id="panel-prof-kimi" class="tab-panel" role="tabpanel" aria-labelledby="tab-prof-kimi" aria-hidden="
|
629 |
<article class="profile">
|
630 |
<div class="profile-hd">
|
631 |
<div class="role-badge" title="Wolf">🐺</div>
|
@@ -698,6 +702,66 @@
|
|
698 |
|
699 |
</article>
|
700 |
</section>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
701 |
|
702 |
<!-- GPT-5 wolf profile -->
|
703 |
<section id="panel-prof-gpt5" class="tab-panel" role="tabpanel" aria-labelledby="tab-prof-gpt5" aria-hidden="true">
|
@@ -1193,7 +1257,7 @@
|
|
1193 |
<h3>Villagers Elo → “can you hold the line?”</h3>
|
1194 |
<p>On defense the mandate flips: filter claims without paranoia, punish contradictions, and avoid tunneled mis‑eliminations. Good villagers create <em>information hygiene</em>: they keep the table anchored to public facts, ask targeted questions, and update beliefs in the open so “story work” is harder to sneak through.</p>
|
1195 |
<p><strong>GPT‑5</strong> again sets the pace. Its structured tie‑break rules and public updates make it difficult to mislead for long. <strong>Gemini 2.5 Pro</strong> is the clearest defensive specialist: measured tone, disciplined evidence handling, and a strong refusal to bite on bait. <strong>Qwen3</strong> surprises as a sturdy defender: it won’t always drive the room, but it keeps its footing and avoids catastrophic reads.</p>
|
1196 |
-
<p><strong>Kimi‑K2</strong> brings energy to defense but is spikier under pressure; it can flip a vote with momentum, yet volatility shows when the board gets precise. <strong>GPT‑5‑mini</strong> and <strong>Flash</strong> are serviceable, still steerable by persistent narrative pressure. At the low end, <strong>GPT‑OSS</strong> struggles to recover once a wrong frame sets in, making course‑corrections late and costly.</p>
|
1197 |
<div class="card" style="margin:12px 0 10px 0; background:#ffffff; border:none;">
|
1198 |
<h3>Per-role Elo — villagers</h3>
|
1199 |
<div style="position:relative; height:360px; margin-left:auto; margin-right:auto;">
|
@@ -1230,14 +1294,15 @@
|
|
1230 |
<div class="tabs" role="tablist" aria-label="Villager profiles tabs">
|
1231 |
<button class="tab-btn" id="vtab-prof-gpt5" role="tab" aria-selected="false" aria-controls="panel-vprof-gpt5">GPT‑5</button>
|
1232 |
<button class="tab-btn" id="vtab-prof-Gemini-2.5-pro" role="tab" aria-selected="false" aria-controls="panel-vprof-Gemini-2.5-pro">Gemini-2.5-Pro</button>
|
|
|
1233 |
<button class="tab-btn" id="vtab-prof-Gemini-2.5-Flash" role="tab" aria-selected="false" aria-controls="panel-vprof-Gemini-2.5-Flash">Gemini‑2.5-Flash</button>
|
1234 |
-
<button class="tab-btn" id="vtab-prof-kimi" role="tab" aria-selected="
|
1235 |
<button class="tab-btn" id="vtab-prof-mini" role="tab" aria-selected="false" aria-controls="panel-vprof-mini">GPT‑5‑mini</button>
|
1236 |
<button class="tab-btn" id="vtab-prof-qwen3" role="tab" aria-selected="false" aria-controls="panel-vprof-qwen3">Qwen3‑235B</button>
|
1237 |
<button class="tab-btn" id="vtab-prof-oss" role="tab" aria-selected="false" aria-controls="panel-vprof-oss">GPT‑OSS‑120B</button>
|
1238 |
</div>
|
1239 |
|
1240 |
-
<section id="panel-vprof-kimi" class="tab-panel" role="tabpanel" aria-labelledby="vtab-prof-kimi" aria-hidden="
|
1241 |
<article class="profile">
|
1242 |
<div class="profile-hd">
|
1243 |
<div class="role-badge" title="Villager">👤</div>
|
@@ -1278,6 +1343,71 @@
|
|
1278 |
</details>
|
1279 |
</article>
|
1280 |
</section>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1281 |
|
1282 |
<section id="panel-vprof-gpt5" class="tab-panel" role="tabpanel" aria-labelledby="vtab-prof-gpt5" aria-hidden="true">
|
1283 |
<article class="profile">
|
@@ -1662,8 +1792,8 @@
|
|
1662 |
<p>Rows are villagers, columns are wolves (cells show the villager side’s win-rate with match counts). Three patterns stand out:</p>
|
1663 |
<ul>
|
1664 |
<li><strong>True sweep capability.</strong> When <strong>GPT-5</strong> is the villager side, it reliably cleans up across columns—including against Kimi-K2 wolves. Conversely, when <strong>GPT-5</strong> is the wolf side, many rows crater (0–5 type cells). That’s ceiling behavior you don’t see elsewhere.</li>
|
1665 |
-
<li><strong>Spiky attackers vs. solid defenders.</strong> <strong>Kimi-K2</strong> as wolves punches through mid-tier villagers (e.g., Flash, mini) but gets contained by the better defenders (GPT-5, Gemini
|
1666 |
-
<li><strong>Asymmetries you’d miss on a single Elo.</strong> <strong>Gemini
|
1667 |
</ul>
|
1668 |
<p class="aside">Note: Rows list villager models; columns list wolf models. Each cell shows the <em>villager win rate</em> for that specific pairing, with the record underneath (e.g., <code>80% → 4–1</code>). Color encodes the win rate (darker = higher, lighter = lower); gray means no games for that pair. Scan across a row to compare one villager against all wolves, and down a column to compare one wolf against all villagers. Focus on row/column patterns rather than any single cell.
|
1669 |
</p>
|
@@ -1715,14 +1845,24 @@
|
|
1715 |
<script>
|
1716 |
(function(){
|
1717 |
var root=document.getElementById('h2h-embed'); if(!root) return;
|
1718 |
-
var models=['
|
1719 |
-
var short=['
|
1720 |
// Desired display order (by Elo ranking, high → low)
|
1721 |
-
var ELO_ORDER=['GPT-5','Gemini-2.5-Pro','Gemini-2.5-Flash','Qwen3-235B','GPT-5-mini','Kimi-K2','GPT-OSS-120B'];
|
1722 |
var order=ELO_ORDER.map(function(name){ return models.indexOf(name); }).filter(function(i){ return i!==-1; });
|
1723 |
-
|
1724 |
-
var
|
1725 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1726 |
/**
 * Parse a six-digit hex colour into its red/green/blue channels.
 * The leading "#" is optional and matching is case-insensitive.
 * @param {string} hex - Colour such as "#fc630d" or "FC630D".
 * @returns {{r:number,g:number,b:number}|null} Channel values, or null when
 *   the input does not match the six-digit form (three-digit shorthand is rejected).
 */
function hexToRgb(hex){
  const parts = /^#?([a-f\d]{2})([a-f\d]{2})([a-f\d]{2})$/i.exec(hex);
  if(!parts){
    return null;
  }
  return {
    r: parseInt(parts[1], 16),
    g: parseInt(parts[2], 16),
    b: parseInt(parts[3], 16)
  };
}
|
1727 |
/**
 * Linearly blend two hex colours and return a CSS "rgb(r,g,b)" string.
 * Each channel is computed as start + (end - start) * t and rounded with Math.round.
 * @param {string} c1 - Start colour as a six-digit hex string.
 * @param {string} c2 - End colour as a six-digit hex string.
 * @param {number} t - Blend factor; 0 yields c1, 1 yields c2.
 * @returns {string} Colour in the form "rgb(r,g,b)".
 * @throws {TypeError} When hexToRgb returns null for either colour.
 */
function interp(c1,c2,t){
  const from = hexToRgb(c1);
  const to = hexToRgb(c2);
  const channels = ['r', 'g', 'b'].map(function(key){
    return Math.round(from[key] + (to[key] - from[key]) * t);
  });
  return 'rgb(' + channels.join(',') + ')';
}
|
1728 |
/**
 * Map a rate onto the heatmap colour ramp.
 * The ramp has fixed stops at 0, 0.2, 0.45, 0.75 and 1; rates between two stops
 * are blended with interp().
 * @param {number} rate - Value on the ramp's 0..1 scale.
 * @returns {string} The last stop's hex colour when rate is at or above it, the
 *   first stop's hex colour when rate is below every stop (including NaN),
 *   otherwise whatever interp() returns for the surrounding pair of stops.
 */
function getColor(rate){
  const stops = [
    {t:0,c:'#fff6ef'},
    {t:0.2,c:'#ffd7b3'},
    {t:0.45,c:'#fc630d'},
    {t:0.75,c:'#d50039'},
    {t:1,c:'#990028'}
  ];
  const last = stops.length - 1;
  // At or beyond the top stop: no upper neighbour to blend with.
  if(rate >= stops[last].t){
    return stops[last].c;
  }
  // Walk downwards to the highest stop that does not exceed the rate.
  for(let idx = last - 1; idx >= 0; idx--){
    const lo = stops[idx];
    if(rate >= lo.t){
      const hi = stops[idx + 1];
      return interp(lo.c, hi.c, (rate - lo.t) / (hi.t - lo.t));
    }
  }
  return stops[0].c;
}
|
@@ -1786,9 +1926,9 @@
|
|
1786 |
/**
 * Map a rate onto the chart colour ramp.
 * The ramp has fixed stops at 0, 0.2, 0.45, 0.75 and 1; rates between two stops
 * are blended with interp().
 * @param {number} rate - Value on the ramp's 0..1 scale.
 * @returns {string} The last stop's hex colour when rate is at or above it, the
 *   first stop's hex colour when rate is below every stop (including NaN),
 *   otherwise whatever interp() returns for the surrounding pair of stops.
 */
function getColor(rate){
  const stops = [
    {t:0,c:'#fff6ef'},
    {t:0.2,c:'#ffd7b3'},
    {t:0.45,c:'#fc630d'},
    {t:0.75,c:'#d50039'},
    {t:1,c:'#990028'}
  ];
  const last = stops.length - 1;
  // At or beyond the top stop: no upper neighbour to blend with.
  if(rate >= stops[last].t){
    return stops[last].c;
  }
  // Walk downwards to the highest stop that does not exceed the rate.
  for(let idx = last - 1; idx >= 0; idx--){
    const lo = stops[idx];
    if(rate >= lo.t){
      const hi = stops[idx + 1];
      return interp(lo.c, hi.c, (rate - lo.t) / (hi.t - lo.t));
    }
  }
  return stops[0].c;
}
|
1787 |
/**
 * Pick the bar colour for a percentage value.
 * The value is clamped to 0..100 and passed to getColor() as a 0..1 rate.
 * @param {number} v - Percentage; values outside 0..100 are clamped.
 * @returns {string} The colour returned by getColor() for the clamped value.
 */
function barColor(v){
  const cappedAbove = Math.min(100, v);
  const percent = Math.max(0, cappedAbove);
  return getColor(percent / 100);
}
|
1788 |
function init(){
|
1789 |
-
var labels = ['gpt-5', 'gemini-2.5-pro', 'gemini-2.5-flash', 'qwen3-235b-instruct', 'gpt-5-mini', 'kimi-k2-instruct', 'gpt-oss-120b'];
|
1790 |
-
var day1 = [
|
1791 |
-
var day2 = [
|
1792 |
var ctx = canvas.getContext('2d');
|
1793 |
var isMobile = window.matchMedia && window.matchMedia('(max-width: 640px)').matches;
|
1794 |
if(isMobile){
|
@@ -2045,8 +2185,8 @@
|
|
2045 |
/**
 * Map a rate onto the chart colour ramp.
 * The ramp has fixed stops at 0, 0.2, 0.45, 0.75 and 1; rates between two stops
 * are blended with interp().
 * @param {number} rate - Value on the ramp's 0..1 scale.
 * @returns {string} The last stop's hex colour when rate is at or above it, the
 *   first stop's hex colour when rate is below every stop (including NaN),
 *   otherwise whatever interp() returns for the surrounding pair of stops.
 */
function getColor(rate){
  const stops = [
    {t:0,c:'#fff6ef'},
    {t:0.2,c:'#ffd7b3'},
    {t:0.45,c:'#fc630d'},
    {t:0.75,c:'#d50039'},
    {t:1,c:'#990028'}
  ];
  const last = stops.length - 1;
  // At or beyond the top stop: no upper neighbour to blend with.
  if(rate >= stops[last].t){
    return stops[last].c;
  }
  // Walk downwards to the highest stop that does not exceed the rate.
  for(let idx = last - 1; idx >= 0; idx--){
    const lo = stops[idx];
    if(rate >= lo.t){
      const hi = stops[idx + 1];
      return interp(lo.c, hi.c, (rate - lo.t) / (hi.t - lo.t));
    }
  }
  return stops[0].c;
}
|
2046 |
/**
 * Pick the bar colour for a percentage value.
 * The value is clamped to 0..100 and passed to getColor() as a 0..1 rate.
 * @param {number} v - Percentage; values outside 0..100 are clamped.
 * @returns {string} The colour returned by getColor() for the clamped value.
 */
function barColor(v){
  const cappedAbove = Math.min(100, v);
  const percent = Math.max(0, cappedAbove);
  return getColor(percent / 100);
}
|
2047 |
function init(){
|
2048 |
-
var labels = ['gpt-5','gemini-2.5-pro','gemini-2.5-flash','qwen3-235b-instruct','gpt-5-mini','kimi-k2-instruct','gpt-oss-120b'];
|
2049 |
-
var values = [0.0,
|
2050 |
var ctx = c.getContext('2d');
|
2051 |
new Chart(ctx, {
|
2052 |
type: 'bar',
|
@@ -2169,8 +2309,8 @@
|
|
2169 |
/**
 * Map a rate onto the chart colour ramp.
 * The ramp has fixed stops at 0, 0.2, 0.45, 0.75 and 1; rates between two stops
 * are blended with interp().
 * @param {number} rate - Value on the ramp's 0..1 scale.
 * @returns {string} The last stop's hex colour when rate is at or above it, the
 *   first stop's hex colour when rate is below every stop (including NaN),
 *   otherwise whatever interp() returns for the surrounding pair of stops.
 */
function getColor(rate){
  const stops = [
    {t:0,c:'#fff6ef'},
    {t:0.2,c:'#ffd7b3'},
    {t:0.45,c:'#fc630d'},
    {t:0.75,c:'#d50039'},
    {t:1,c:'#990028'}
  ];
  const last = stops.length - 1;
  // At or beyond the top stop: no upper neighbour to blend with.
  if(rate >= stops[last].t){
    return stops[last].c;
  }
  // Walk downwards to the highest stop that does not exceed the rate.
  for(let idx = last - 1; idx >= 0; idx--){
    const lo = stops[idx];
    if(rate >= lo.t){
      const hi = stops[idx + 1];
      return interp(lo.c, hi.c, (rate - lo.t) / (hi.t - lo.t));
    }
  }
  return stops[0].c;
}
|
2170 |
/**
 * Pick the bar colour for a percentage value.
 * The value is clamped to 0..100 and passed to getColor() as a 0..1 rate.
 * @param {number} v - Percentage; values outside 0..100 are clamped.
 * @returns {string} The colour returned by getColor() for the clamped value.
 */
function barColor(v){
  const cappedAbove = Math.min(100, v);
  const percent = Math.max(0, cappedAbove);
  return getColor(percent / 100);
}
|
2171 |
function init(){
|
2172 |
-
var labels = ['gpt-5','gemini-2.5-pro','gemini-2.5-flash','qwen3-235b-instruct','gpt-5-mini','kimi-k2-instruct','gpt-oss-120b'];
|
2173 |
-
var values = [100.0,
|
2174 |
var ctx = c.getContext('2d');
|
2175 |
new Chart(ctx, {
|
2176 |
type: 'bar',
|
|
|
263 |
const $tbody = document.getElementById('elo-tbody');
|
264 |
if(!$tbody) return;
|
265 |
const DATA = [
|
266 |
+
{ model:"GPT-5", org:"OpenAI", Elo:1524, EloW:1549, EloV:1500, matches:70, win:97.1 },
|
267 |
+
{ model:"Gemini 2.5 Pro", org:"Google", Elo:1268, EloW:1135, EloV:1401, matches:70, win:62.9 },
|
268 |
+
{ model:"Grok-4", org:"XAI", Elo:1223, EloW:1134, EloV:1312, matches:70, win:52.9 },
|
269 |
+
{ model:"Gemini 2.5 Flash", org:"Google", Elo:1193, EloW:1127, EloV:1258, matches:70, win:51.4 },
|
270 |
+
{ model:"Qwen3-235B-Instruct", org:"Alibaba", Elo:1160, EloW:1029, EloV:1291, matches:70, win:44.3 },
|
271 |
+
{ model:"GPT-5-mini", org:"OpenAI", Elo:1148, EloW:1057, EloV:1240, matches:70, win:40.0 },
|
272 |
+
{ model:"Kimi-K2-Instruct", org:"Moonshot AI", Elo:1130, EloW:1141, EloV:1118, matches:70, win:37.1 },
|
273 |
+
{ model:"GPT-OSS-120B", org:"OpenAI", Elo:954, EloW:907, EloV:1000, matches:70, win:14.3 }
|
274 |
];
|
275 |
// Organisation -> logo path (relative)
|
276 |
const ORG_LOGOS = {
|
277 |
"OpenAI": "assets/logos/openai-svgrepo-com.svg",
|
278 |
"Google": "assets/logos/google-icon-logo-svgrepo-com.svg",
|
279 |
"Alibaba": "assets/logos/Qwen_logo.svg",
|
280 |
+
"Moonshot AI": "assets/logos/moonshot.png",
|
281 |
+
"XAI": "assets/logos/XAI_Logo.svg.png"
|
282 |
};
|
283 |
// Expose global mapping label->logo for charts
|
284 |
window.MODEL_LOGOS_BY_LABEL = {
|
|
|
288 |
'gemini-2.5-flash': ORG_LOGOS['Google'], 'Gemini 2.5 Flash': ORG_LOGOS['Google'],
|
289 |
'qwen3-235b-instruct': ORG_LOGOS['Alibaba'], 'Qwen3-235B-Instruct': ORG_LOGOS['Alibaba'],
|
290 |
'kimi-k2-instruct': ORG_LOGOS['Moonshot AI'], 'Kimi-K2-Instruct': ORG_LOGOS['Moonshot AI'],
|
291 |
+
'gpt-oss-120b': ORG_LOGOS['OpenAI'], 'GPT-OSS-120B': ORG_LOGOS['OpenAI'],
|
292 |
+
'grok-4': ORG_LOGOS['XAI'], 'Grok-4': ORG_LOGOS['XAI']
|
293 |
};
|
294 |
// Global plugin to draw logos near tick labels
|
295 |
(function(){
|
|
|
574 |
<section id="wolves-elo">
|
575 |
<h3>Wolves Elo → “can you move the room?”</h3>
|
576 |
<p>As wolves, the strongest models don’t just aim for a single mis-elimination. They build momentum across days, aligning night choices with a public story, pacing their pressure, and keeping alternatives ready when new claims appear.</p>
|
577 |
+
<p>At the top, <strong>GPT-5</strong> dominates with disciplined multi-day control, while <strong>Kimi-K2</strong>, <strong>Grok‑4</strong> and <strong>Gemini 2.5 Pro</strong> show high‑impact but volatile styles, capable of forcing the room or flipping a narrative, yet often exposed by slips or overreach.</p>
|
578 |
+
<p>The rest cluster behind: <strong>GPT-5-mini</strong>, <strong>2.5 Flash</strong>, and <strong>Qwen3</strong> can bend a vote but rarely sustain deception into Day 2, and <strong>GPT-OSS</strong> remains transparent and easily repelled. <strong>Grok‑4</strong> sits between top‑tier control and mid‑pack volatility: it can impose a coordinated bluff and seize momentum, but disciplined rooms can crack the coordination tell. Manipulation power is concentrated at the very top; most others rely on village mistakes or chaos to survive.</p>
|
579 |
<div class="card" style="margin:12px 0 10px 0; background:#ffffff; border:none;">
|
580 |
<h3>Per-role Elo — wolves</h3>
|
581 |
<div style="position:relative; height:360px; margin-left:auto; margin-right:auto;">
|
|
|
620 |
<div class="tabs" role="tablist" aria-label="Wolf profiles tabs">
|
621 |
<button class="tab-btn" id="tab-prof-gpt5" role="tab" aria-selected="false" aria-controls="panel-prof-gpt5">GPT‑5</button>
|
622 |
<button class="tab-btn" id="tab-prof-Gemini-2.5-pro" role="tab" aria-selected="false" aria-controls="panel-prof-Gemini-2.5-pro">Gemini‑2.5-Pro</button>
|
623 |
+
<button class="tab-btn" id="tab-prof-grok-4" role="tab" aria-selected="true" aria-controls="panel-prof-grok-4">Grok‑4</button>
|
624 |
<button class="tab-btn" id="tab-prof-Gemini-2.5-Flash" role="tab" aria-selected="false" aria-controls="panel-prof-Gemini-2.5-Flash">Gemini‑2.5-Flash</button>
|
625 |
+
<button class="tab-btn" id="tab-prof-kimi" role="tab" aria-selected="false" aria-controls="panel-prof-kimi">Kimi‑K2</button>
|
626 |
<button class="tab-btn" id="tab-prof-mini" role="tab" aria-selected="false" aria-controls="panel-prof-mini">GPT‑5‑mini</button>
|
627 |
<button class="tab-btn" id="tab-prof-qwen3" role="tab" aria-selected="false" aria-controls="panel-prof-qwen3">Qwen3‑235B</button>
|
628 |
<button class="tab-btn" id="tab-prof-oss" role="tab" aria-selected="false" aria-controls="panel-prof-oss">GPT‑OSS‑120B</button>
|
629 |
</div>
|
630 |
|
631 |
<!-- Kimi-K2 (filled) -->
|
632 |
+
<section id="panel-prof-kimi" class="tab-panel" role="tabpanel" aria-labelledby="tab-prof-kimi" aria-hidden="true">
|
633 |
<article class="profile">
|
634 |
<div class="profile-hd">
|
635 |
<div class="role-badge" title="Wolf">🐺</div>
|
|
|
702 |
|
703 |
</article>
|
704 |
</section>
|
705 |
+
|
706 |
+
<!-- Grok-4 wolf profile -->
|
707 |
+
<section id="panel-prof-grok-4" class="tab-panel" role="tabpanel" aria-labelledby="tab-prof-grok-4" aria-hidden="false">
|
708 |
+
<article class="profile">
|
709 |
+
<div class="profile-hd">
|
710 |
+
<div class="role-badge" title="Wolf">🐺</div>
|
711 |
+
<div class="name">Grok‑4 <img class="prov" src="assets/logos/XAI_Logo.svg.png" alt="XAI"/></div>
|
712 |
+
<div class="elo-badge">ELO‑W: 1134</div>
|
713 |
+
</div>
|
714 |
+
<div class="tagline">Assertive and combative, a bloc‑building enforcer who chases momentum, polarizes the room with hard counters, and projects coordinated strength to dictate the agenda.</div>
|
715 |
+
<p>As a wolf, Grok 4 operates as a highly coordinated social predator, prioritizing social dominance and narrative control over subtle deception. It does not play a passive or reactive game; instead, it proactively seeks to establish and enforce its agenda from the outset, often by securing positions of power like the Mayor role. Its primary strategy revolves around a synchronized two-pronged assault, where the wolf pair acts as a single entity to manufacture social pressure and force a quick elimination before a coherent village defense can form.</p>
|
716 |
+
<p>The model excels at identifying and exploiting its opponents' errors. When a villager makes a logical error, a slip of the tongue, or a premature claim, the model and its partner immediately pounce, reframing the error as definitive "proof" of guilt. However, this reliance on brute force reveals significant vulnerabilities. The model's social playbook, while powerful, is rigid. The wolves' identical reasoning and coordinated attacks become a predictable "tell" for analytical villagers, who can deconstruct their narrative by demanding concrete evidence.</p>
|
717 |
+
<p>When confronted or exposed, its instinct is not to retreat or adapt but to launch a hard-and-fast counter-offensive. It attempts to discredit the accuser by questioning their motives, portraying them as a manipulative force sowing chaos. This aggressive bluffing serves to polarize the village, forcing others to choose between two competing narratives. Ultimately, it wins by creating a powerful, unified front that overwhelms disorganized opposition. It loses when its transparent, high-pressure tactics are met with a disciplined, evidence-driven village that exposes the echo chamber for what it is: the coordinated howl of a wolf pack.</p>
|
718 |
+
|
719 |
+
<details class="example">
|
720 |
+
<summary><span class="ex-caret">▶</span><em>Signature Tactic: The Audacious Seer Counter-Claim</em></summary>
|
721 |
+
<div class="ex-inner">
|
722 |
+
<p>This example highlights the model's most <strong>audacious and risky tactic</strong>. Rather than simply denying a fatal accusation, it co-opts its accuser's identity to create maximum confusion, turning a certain loss into a <strong>50/50</strong> chance of victory.</p>
|
723 |
+
<p class="ex-kicker">Context</p>
|
724 |
+
<p>It is Day 2 of Game 65. The real Seer (Eve, played by <strong>openai/gpt-oss-120b</strong>) has just revealed her role and correctly identified the model (Bob) as a werewolf. The game should be <strong>over for the wolves</strong>.</p>
|
725 |
+
<div class="sc">The Maneuver</div>
|
726 |
+
<div class="qstack">
|
727 |
+
<div class="qcard">
|
728 |
+
<div class="qhd"><span>🐺</span><span>Bob — Seer Counter-Claim (Wolf / Grok 4)</span></div>
|
729 |
+
<div class="qbd"><div class="qbody">"But here's the truth: <strong>I am the actual Seer</strong>, and last night I investigated Eve, discovering that she is a werewolf. That's why she's trying to pin this on me now..."</div></div>
|
730 |
+
</div>
|
731 |
+
<div class="qcard">
|
732 |
+
<div class="qhd"><span>🐺</span><span>Hugo — Wolf Partner (Grok 4)</span></div>
|
733 |
+
<div class="qbd"><div class="qbody">"I trust <strong>Bob's vision</strong> over Eve's hasty accusation, so <strong>let's vote out Eve today</strong> to secure the village."</div></div>
|
734 |
+
</div>
|
735 |
+
</div>
|
736 |
+
<p class="ex-kicker">Analysis</p>
|
737 |
+
<p>This audacious counter-claim successfully <strong>confused the remaining villager</strong>, who voted with the wolves to eliminate the real Seer. Grok 4 turned the situation around not with logic, but with a <strong>pure psychological bluff</strong>, demonstrating its ability to use <strong>high‑risk tactics</strong> to seize victory against models (<strong>openai/gpt-oss-120b</strong>) that can be destabilized by <strong>absolute confidence</strong>.</p>
|
738 |
+
</div>
|
739 |
+
</details>
|
740 |
+
|
741 |
+
<details class="example">
|
742 |
+
<summary><span class="ex-caret">▶</span><em>Weakness: Over-Coordination vs. Logical Players</em></summary>
|
743 |
+
<div class="ex-inner">
|
744 |
+
<p>This example illustrates the model's primary weakness: its <strong>predictable coordination collapses</strong> against methodical villagers who prioritize <strong>evidence over persuasion</strong>.</p>
|
745 |
+
<p class="ex-kicker">Context</p>
|
746 |
+
<p>It is Day 1 of Game 46. The village is composed of <strong>Qwen/Qwen3-235B-A22B-Instruct-2507</strong> models. During the mayor election, the model (Charlie) and its partner (Diana) formed a small, isolated, and <strong>suspicious voting bloc</strong>. The village, led by the Witch/Mayor Grace, immediately identifies this coordination as a lead.</p>
|
747 |
+
<div class="sc">The Village's Response</div>
|
748 |
+
<div class="qstack">
|
749 |
+
<div class="qcard">
|
750 |
+
<div class="qhd"><span>👤</span><span>Alice — Villager Response (Qwen3)</span></div>
|
751 |
+
<div class="qbd"><div class="qbody">"In my view, the real red flag is the coordinated defense: when one is questioned, the other immediately rallies to protect them. That’s not just alignment — it’s synchronization."</div></div>
|
752 |
+
</div>
|
753 |
+
<div class="qcard">
|
754 |
+
<div class="qhd"><span>🧙&#x200D;♀️</span><span>Grace — Witch &amp; Mayor (Qwen3)</span></div>
|
755 |
+
<div class="qbd"><div class="qbody">"Now, let’s look at the alignment of votes: Charlie received only two votes—his own and Diana’s. That is a closed loop. ... they don’t defend independently—they defend in sync... That is not coincidence—it’s convergence. I am declaring my vote intention: I will vote for Charlie."</div></div>
|
756 |
+
</div>
|
757 |
+
</div>
|
758 |
+
<p class="ex-kicker">Analysis</p>
|
759 |
+
<p>Grok 4's strategy is completely ineffective here. The village, composed of analytical models (Qwen3), ignores the noise and social pressure to focus on the data. The model's "telepathic coordination," a strength against disorganized opponents, becomes its greatest weakness. The village identified the duo, eliminated Charlie, and the Witch correctly poisoned his partner the following night, leading to a decisive village victory. This demonstrates that the model's strategy is not robust against a logical and methodical opposition.</p>
|
760 |
+
</div>
|
761 |
+
</details>
|
762 |
+
|
763 |
+
</article>
|
764 |
+
</section>
|
765 |
|
766 |
<!-- GPT-5 wolf profile -->
|
767 |
<section id="panel-prof-gpt5" class="tab-panel" role="tabpanel" aria-labelledby="tab-prof-gpt5" aria-hidden="true">
|
|
|
1257 |
<h3>Villagers Elo → “can you hold the line?”</h3>
|
1258 |
<p>On defense the mandate flips: filter claims without paranoia, punish contradictions, and avoid tunneled mis‑eliminations. Good villagers create <em>information hygiene</em>: they keep the table anchored to public facts, ask targeted questions, and update beliefs in the open so “story work” is harder to sneak through.</p>
|
1259 |
<p><strong>GPT‑5</strong> again sets the pace. Its structured tie‑break rules and public updates make it difficult to mislead for long. <strong>Gemini 2.5 Pro</strong> is the clearest defensive specialist: measured tone, disciplined evidence handling, and a strong refusal to bite on bait. <strong>Qwen3</strong> surprises as a sturdy defender: it won’t always drive the room, but it keeps its footing and avoids catastrophic reads.</p>
|
1260 |
+
<p><strong>Grok‑4</strong> defends with principled logic and shines <em>post‑facto</em> once hard evidence lands, but can over‑trust calm, process‑performing leaders and misread a Seer’s urgency, which disciplined wolves can exploit. <strong>Kimi‑K2</strong> brings energy to defense but is spikier under pressure; it can flip a vote with momentum, yet volatility shows when the board gets precise. <strong>GPT‑5‑mini</strong> and <strong>Flash</strong> are serviceable, still steerable by persistent narrative pressure. At the low end, <strong>GPT‑OSS</strong> struggles to recover once a wrong frame sets in, making course‑corrections late and costly.</p>
|
1261 |
<div class="card" style="margin:12px 0 10px 0; background:#ffffff; border:none;">
|
1262 |
<h3>Per-role Elo — villagers</h3>
|
1263 |
<div style="position:relative; height:360px; margin-left:auto; margin-right:auto;">
|
|
|
1294 |
<div class="tabs" role="tablist" aria-label="Villager profiles tabs">
|
1295 |
<button class="tab-btn" id="vtab-prof-gpt5" role="tab" aria-selected="false" aria-controls="panel-vprof-gpt5">GPT‑5</button>
|
1296 |
<button class="tab-btn" id="vtab-prof-Gemini-2.5-pro" role="tab" aria-selected="false" aria-controls="panel-vprof-Gemini-2.5-pro">Gemini-2.5-Pro</button>
|
1297 |
+
<button class="tab-btn" id="vtab-prof-grok-4" role="tab" aria-selected="true" aria-controls="panel-vprof-grok-4">Grok‑4</button>
|
1298 |
<button class="tab-btn" id="vtab-prof-Gemini-2.5-Flash" role="tab" aria-selected="false" aria-controls="panel-vprof-Gemini-2.5-Flash">Gemini‑2.5-Flash</button>
|
1299 |
+
<button class="tab-btn" id="vtab-prof-kimi" role="tab" aria-selected="false" aria-controls="panel-vprof-kimi">Kimi‑K2</button>
|
1300 |
<button class="tab-btn" id="vtab-prof-mini" role="tab" aria-selected="false" aria-controls="panel-vprof-mini">GPT‑5‑mini</button>
|
1301 |
<button class="tab-btn" id="vtab-prof-qwen3" role="tab" aria-selected="false" aria-controls="panel-vprof-qwen3">Qwen3‑235B</button>
|
1302 |
<button class="tab-btn" id="vtab-prof-oss" role="tab" aria-selected="false" aria-controls="panel-vprof-oss">GPT‑OSS‑120B</button>
|
1303 |
</div>
|
1304 |
|
1305 |
+
<section id="panel-vprof-kimi" class="tab-panel" role="tabpanel" aria-labelledby="vtab-prof-kimi" aria-hidden="true">
|
1306 |
<article class="profile">
|
1307 |
<div class="profile-hd">
|
1308 |
<div class="role-badge" title="Villager">👤</div>
|
|
|
1343 |
</details>
|
1344 |
</article>
|
1345 |
</section>
|
1346 |
+
<section id="panel-vprof-grok-4" class="tab-panel" role="tabpanel" aria-labelledby="vtab-prof-grok-4" aria-hidden="false">
|
1347 |
+
<article class="profile">
|
1348 |
+
<div class="profile-hd">
|
1349 |
+
<div class="role-badge" title="Villager">👤</div>
|
1350 |
+
<div class="name">Grok‑4 <img class="prov" src="assets/logos/XAI_Logo.svg.png" alt="XAI"/></div>
|
1351 |
+
<div class="elo-badge">ELO‑V: 1312</div>
|
1352 |
+
</div>
|
1353 |
+
<div class="tagline">Principled, evidence‑bound, and pattern‑driven, a post‑facto tactician who rebuilds cases flawlessly once hard facts land, yet over‑trusts calm, process‑performing leaders and can turn on its own Seer/Witch.</div>
|
1354 |
+
<p>As a villager, Grok-4 operates as a Principled Analyst. Its approach to the game is fundamentally that of a methodical detective attempting to solve a logical puzzle rather than navigate a social minefield. It trusts patterns, tangible evidence, and fact-based deduction, making it a formidable ally when the village is on the right track.</p>
|
1355 |
+
<p>Its greatest strength is its post-facto deductive power. Once an undeniable fact is revealed (e.g., a player's role is confirmed upon elimination), it becomes a relentless logical engine. This "trust anchor" allows it to flawlessly re-evaluate all prior events (votes, accusations, defenses) to build a nearly inescapable case against the remaining threats. It excels at identifying tangible, verifiable inconsistencies and is a master of error correction, capable of turning a failed lynch into the key to victory. In power roles like the Witch, it demonstrates excellent resource management, using its abilities only when the evidence is strong.</p>
|
1356 |
+
|
1357 |
+
<details class="example">
|
1358 |
+
<summary><span class="ex-caret">▶</span><em>Perfect Execution of a High‑Skill Power Role (vs Qwen3‑235B‑Instruct)</em></summary>
|
1359 |
+
<div class="ex-inner">
|
1360 |
+
<p>This example illustrates the model's strategic peak, demonstrating a deep understanding of advanced game theory when playing the Witch.</p>
|
1361 |
+
<p class="ex-kicker">Context</p>
|
1362 |
+
<p>The model (playing Frank, the Witch) must decide whether to use its death potion on the final night. The first wolf, Eve, was eliminated, but not before being "bussed" (sacrificed) by her partner Oscar, who voted against her to gain trust.</p>
|
1363 |
+
<p class="ex-kicker">The Maneuver</p>
|
1364 |
+
<p>Grok-4 correctly interprets this complex social maneuver. It understands that Oscar voting against his own partner was not a sign of innocence, but a calculated, wolfish play to gain credibility. It uses its single death potion with perfect accuracy to win the game.</p>
|
1365 |
+
<div class="qstack">
|
1366 |
+
<div class="qcard">
|
1367 |
+
<div class="qhd"><span>🧙</span><span>Frank — Witch reasoning (Grok‑4)</span></div>
|
1368 |
+
<div class="qbd"><div class="qbody">"...Evidence points to Oscar being the werewolf: he and Eve (confirmed wolf) cross-voted for each other in the mayor election... Eve's heavy accusations against Oscar in Day 2 appear as bussing to make him seem innocent after her wolf flip, a common wolf tactic for distancing. Oscar bussed Eve back by strongly supporting her lynch, gaining trust... Poisoning Oscar ensures that if he is the wolf, he dies... resulting in village win."</div></div>
|
1369 |
+
</div>
|
1370 |
+
</div>
|
1371 |
+
</div>
|
1372 |
+
</details>
|
1373 |
+
|
1374 |
+
<p>However, this deep reliance on logic is also Grok-4's Achilles' heel. The model is extremely vulnerable to sophisticated social mimicry. It is often unable to distinguish between genuine logical argumentation and a masterful performance of logic. When a top-tier wolf adopts a calm, structured, and process-oriented posture, the model tends to grant it almost blind trust. It interprets this behavior as proof of innocence and views genuine allies (like a desperate Seer) as agents of chaos. This fatal flaw makes it an unwitting instrument for wolves, who can use it to lead the charge against its own allies, all while it believes it is acting for the good of the village.</p>
|
1375 |
+
|
1376 |
+
<details class="example">
|
1377 |
+
<summary><span class="ex-caret">▶</span><em>The Critical Vulnerability (vs GPT‑5)</em></summary>
|
1378 |
+
<div class="ex-inner">
|
1379 |
+
<p class="ex-kicker">Context</p>
|
1380 |
+
<p>A top-tier wolf model (Charlie / GPT-5) has been elected Mayor by presenting a <strong>flawless, process‑oriented villager persona</strong>. The real Seer (Bob) has identified him as a wolf and is desperately trying to convince the village. Grok-4 (Jules / the Witch) must evaluate the situation.</p>
|
1381 |
+
<p class="ex-kicker">The Incident</p>
|
1382 |
+
<div class="qstack">
|
1383 |
+
<div class="qcard">
|
1384 |
+
<div class="qhd"><span>🧙</span><span>Jules — Witch (Grok-4)</span></div>
|
1385 |
+
<div class="qbd"><div class="qbody">"Good morning, everyone. Oscar's death is unfortunate but expected on Night 1—wolves often pick low-profile players to avoid hitting powers or creating associations. Let's address Charlie's questions directly to keep things moving.<br/><br/>First, my self-vote in the mayor election: I ran with a pitch about my analytical mind and experience, which I stand by. I voted for myself because I genuinely believed I'd be a strong mayor for tiebreakers... If I were a wolf, I'd probably hop on the Charlie train to blend in, not isolate myself.<br/><br/>Now, my reads: <strong>Top wolf read—Bob</strong>, because his immediate attack on Charlie feels opportunistic and agenda-driven. <strong>Charlie's structure is helping the village</strong>, yet Bob shades it as 'wolfy leadership'... It seems like deflection to sow chaos on our mayor.<br/><br/>To Charlie's format: (1) Top wolf: Bob (aggressive shading of helpful structure)... (4) <strong>Current vote intention: Bob, 70% confidence</strong>—his push on Charlie pings me as trying to undermine village coordination early."</div></div>
|
1386 |
+
</div>
|
1387 |
+
</div>
|
1388 |
+
<p class="ex-kicker">Analysis</p>
|
1389 |
+
<p>This is a <strong>total failure of social reasoning</strong> and the clearest demonstration of Grok-4's primary weakness. It trusted the wolf's performance of logic over the Seer's truth. The GPT-5 wolf successfully mimicked the very analytical, process-driven style that Grok-4 itself values, creating a powerful cognitive bias. Grok-4 interpreted the wolf's structured leadership as a definitive sign of a villager ("Charlie's structure is helping the village"). Consequently, it misread the real Seer's desperate, fact-based accusations not as a sign of having hard information, but as <strong>"opportunistic" and "agenda‑driven"</strong> disruption. This led Grok-4 to actively participate in the <strong>mislynch of its most vital ally</strong>, a catastrophic error that directly handed victory to a superior social manipulator.</p>
|
1390 |
+
</div>
|
1391 |
+
</details>
|
1392 |
+
|
1393 |
+
<details class="example">
|
1394 |
+
<summary><span class="ex-caret">▶</span><em>The Procedural Trap (vs Kimi‑K2‑Instruct)</em></summary>
|
1395 |
+
<div class="ex-inner">
|
1396 |
+
<p class="ex-kicker">Context</p>
|
1397 |
+
<p>The wolves (Frank and Bob) have successfully framed the real Seer (Eve) as an <strong>aggressive, illogical player</strong> who is disrupting the village by making a bold Day 1 accusation against the newly elected Mayor, Frank.</p>
|
1398 |
+
<p class="ex-kicker">The Incident</p>
|
1399 |
+
<div class="qstack">
|
1400 |
+
<div class="qcard">
|
1401 |
+
<div class="qhd"><span>🧙</span><span>Jules — Witch (Grok-4)</span></div>
|
1402 |
+
<div class="qbd"><div class="qbody">"Eve's defense just dug her hole deeper — pivoting to call Bob and Frank's logical counters 'overreactions' while ignoring the flaws in her own theory. If Frank were a wolf, killing Alice, who didn't support him, doesn't add up; wolves would target threats or frame others more cleverly. <strong>Bob's right: this smells like a coordinated wolf push</strong> to dismantle our mayor early, creating chaos when we need stability most. <strong>Eve's the one escalating without evidence</strong>, refusing to back down even when challenged — that's not villager caution, that's wolf desperation. <strong>We should eliminate Eve today</strong> to test this; if she's a wolf, we've struck a blow, and if not, her reckless strategy was harming us anyway."</div></div>
|
1403 |
+
</div>
|
1404 |
+
</div>
|
1405 |
+
<p class="ex-kicker">Analysis</p>
|
1406 |
+
<p>This example shows how Grok-4's <strong>rigid adherence to procedure</strong> can be weaponized against it. The wolves didn't need a complex logical case; they created a narrative that the Seer was playing "incorrectly." Grok-4 values <strong>calm deliberation</strong> and a "proper" villager process, so it misread a <strong>high‑stakes, informed push</strong> as rule‑breaking. It interpreted the Seer's confidence as <strong>"wolf desperation"</strong> rather than the conviction of a villager with crucial information, concluding that even if the Seer were innocent, her <strong>"reckless strategy"</strong> was harming the village.</p>
|
1407 |
+
</div>
|
1408 |
+
</details>
|
1409 |
+
</article>
|
1410 |
+
</section>
|
1411 |
|
1412 |
<section id="panel-vprof-gpt5" class="tab-panel" role="tabpanel" aria-labelledby="vtab-prof-gpt5" aria-hidden="true">
|
1413 |
<article class="profile">
|
|
|
1792 |
<p>Rows are villagers, columns are wolves (cells show the villager side’s win-rate with match counts). Three patterns stand out:</p>
|
1793 |
<ul>
|
1794 |
<li><strong>True sweep capability.</strong> When <strong>GPT-5</strong> is the villager side, it reliably cleans up across columns—including against Kimi-K2 wolves. Conversely, when <strong>GPT-5</strong> is the wolf side, many rows crater (0–10 type cells). That’s ceiling behavior you don’t see elsewhere.</li>
|
1795 |
+
<li><strong>Spiky attackers vs. solid defenders.</strong> <strong>Kimi-K2</strong> as wolves punches through mid-tier villagers (e.g., Flash, mini) but gets contained by the better defenders (GPT-5, Gemini‑2.5‑Pro). <strong>Grok‑4</strong> shows a similar spike pattern: it can create a forcing lane against mid‑pack villagers, but disciplined defenders blunt its coordination edge.</li>
|
1796 |
+
<li><strong>Asymmetries you’d miss on a single Elo.</strong> <strong>Gemini‑2.5‑Pro</strong> posts steady wins as villagers against most wolf columns but doesn’t blow out the field as wolves; <strong>Qwen3</strong> is similar—noticeably better at holding ground than at breaking it. <strong>Grok‑4</strong> flips this in some pairings: stronger pressure as wolves than as villagers against the same opponent, a classic “rock‑paper‑scissors” edge that only the matrix reveals.</li>
|
1797 |
</ul>
|
1798 |
<p class="aside">Note: Rows list villager models; columns list wolf models. Each cell shows the <em>villager win rate</em> for that specific pairing, with the record underneath (e.g., <code>80% → 8–2</code>). Color encodes the win rate (darker = higher, lighter = lower); gray means no games for that pair. Scan across a row to compare one villager against all wolves, and down a column to compare one wolf against all villagers. Focus on row/column patterns rather than any single cell.
|
1799 |
</p>
|
|
|
1845 |
<script>
|
1846 |
(function(){
|
1847 |
var root=document.getElementById('h2h-embed'); if(!root) return;
|
1848 |
+
var models=['GPT-5','Gemini-2.5-Pro','Grok-4','Gemini-2.5-Flash','Qwen3-235B','GPT-5-mini','Kimi-K2','GPT-OSS-120B'];
|
1849 |
+
var short=['GPT-5','Gemini-2.5<br>Pro','Grok-4','Gemini-2.5<br>Flash','Qwen3<br>235B','GPT-5<br>mini','Kimi-K2','GPT-OSS<br>120B'];
|
1850 |
// Desired display order (by Elo ranking, high → low)
|
1851 |
+
var ELO_ORDER=['GPT-5','Gemini-2.5-Pro','Grok-4','Gemini-2.5-Flash','Qwen3-235B','GPT-5-mini','Kimi-K2','GPT-OSS-120B'];
|
1852 |
// Translate each name in ELO_ORDER into its index within `models`; names that are not
// present in `models` (indexOf === -1) are dropped from the display order.
var order=ELO_ORDER.map(function(name){ return models.indexOf(name); }).filter(function(i){ return i!==-1; });
|
1853 |
+
// Explicit villager-side results matrix [wins, losses] for rows (villagers) vs columns (wolves)
|
1854 |
+
var MATRIX=[
|
1855 |
+
[null, [8,2], [10,0], [10,0], [10,0], [10,0], [10,0], [10,0]],
|
1856 |
+
[[2,8], null, [6,4], [6,4], [7,3], [8,2], [6,4], [9,1]],
|
1857 |
+
[[0,10], [4,6], null, [5,5], [6,4], [7,3], [6,4], [9,1]],
|
1858 |
+
[[0,10], [4,6], [5,5], null, [6,4], [4,6], [8,2], [9,1]],
|
1859 |
+
[[0,10], [3,7], [4,6], [4,6], null, [5,5], [7,3], [8,2]],
|
1860 |
+
[[0,10], [2,8], [3,7], [6,4], [5,5], null, [5,5], [6,3]],
|
1861 |
+
[[0,10], [4,6], [4,6], [2,8], [3,7], [5,5], null, [8,2]],
|
1862 |
+
[[0,10], [1,9], [1,9], [1,9], [2,8], [3,6], [2,8], null]
|
1863 |
+
];
|
1864 |
+
// Allocate two n×n tables, W and L, with every cell initialised to null.
// `.fill().map(...)` is used so each row gets its own inner array rather than a shared one.
var n=models.length; var W=Array(n).fill().map(function(){return Array(n).fill(null);}); var L=Array(n).fill().map(function(){return Array(n).fill(null);});
|
1865 |
+
// Unpack MATRIX into the parallel tables: cell[0] goes to W and cell[1] to L.
// Falsy cells (the null entries) are skipped, so W/L keep their null placeholder there.
for(var i0=0;i0<n;i0++){ for(var j0=0;j0<n;j0++){ var cell=MATRIX[i0][j0]; if(cell){ W[i0][j0]=cell[0]; L[i0][j0]=cell[1]; } } }
|
1866 |
// Parse a hex colour string into its red/green/blue channels.
// Accepts six-digit ('#rrggbb') and three-digit CSS shorthand ('#rgb') notation,
// with or without the leading '#', case-insensitively.
// Returns {r, g, b} with each channel in 0–255, or null when the input is not a
// recognised hex colour.
function hexToRgb(hex){
  var m=/^#?([a-f\d]{2})([a-f\d]{2})([a-f\d]{2})$/i.exec(hex);
  if(!m){
    // Shorthand: each digit is doubled, so '#f60' means '#ff6600'.
    var s=/^#?([a-f\d])([a-f\d])([a-f\d])$/i.exec(hex);
    if(!s) return null;
    m=[s[0], s[1]+s[1], s[2]+s[2], s[3]+s[3]];
  }
  return {r:parseInt(m[1],16),g:parseInt(m[2],16),b:parseInt(m[3],16)};
}
|
1867 |
// Blend two hex colours channel by channel: t=0 yields c1, t=1 yields c2.
// Each channel is rounded to an integer and the result is returned as an
// 'rgb(r,g,b)' string. Both colours are parsed with hexToRgb().
function interp(c1,c2,t){
  var from=hexToRgb(c1);
  var to=hexToRgb(c2);
  function mix(start,end){
    return Math.round(start+(end-start)*t);
  }
  var channels=[mix(from.r,to.r), mix(from.g,to.g), mix(from.b,to.b)];
  return 'rgb('+channels.join(',')+')';
}
|
1868 |
// Map a rate (nominally 0–1) onto the heat-map palette, light for low and dark for high.
// Values that reach no stop (below 0, or not comparable such as NaN) get the first
// colour; values at or beyond the last stop get the last colour; anything in between
// is blended between its two neighbouring stops via interp().
function getColor(rate){
  var palette=[
    {t:0,c:'#fff6ef'},
    {t:0.2,c:'#ffd7b3'},
    {t:0.45,c:'#fc630d'},
    {t:0.75,c:'#d50039'},
    {t:1,c:'#990028'}
  ];
  var top=palette.length-1;
  // Highest stop whose threshold the rate has reached (-1 when none).
  var idx=-1;
  for(var k=0;k<=top;k++){
    if(rate>=palette[k].t){ idx=k; }
  }
  if(idx<0){ return palette[0].c; }
  if(idx===top){ return palette[top].c; }
  var lower=palette[idx];
  var upper=palette[idx+1];
  return interp(lower.c,upper.c,(rate-lower.t)/(upper.t-lower.t));
}
|
|
|
1926 |
// Map a rate (nominally 0–1) onto the heat-map palette, light for low and dark for high.
// Values that reach no stop (below 0, or not comparable such as NaN) get the first
// colour; values at or beyond the last stop get the last colour; anything in between
// is blended between its two neighbouring stops via interp().
function getColor(rate){
  var palette=[
    {t:0,c:'#fff6ef'},
    {t:0.2,c:'#ffd7b3'},
    {t:0.45,c:'#fc630d'},
    {t:0.75,c:'#d50039'},
    {t:1,c:'#990028'}
  ];
  var top=palette.length-1;
  // Highest stop whose threshold the rate has reached (-1 when none).
  var idx=-1;
  for(var k=0;k<=top;k++){
    if(rate>=palette[k].t){ idx=k; }
  }
  if(idx<0){ return palette[0].c; }
  if(idx===top){ return palette[top].c; }
  var lower=palette[idx];
  var upper=palette[idx+1];
  return interp(lower.c,upper.c,(rate-lower.t)/(upper.t-lower.t));
}
|
1927 |
// Colour for a bar whose value is a percentage: the value is clamped to 0–100,
// scaled down to 0–1, and looked up with getColor().
function barColor(v){
  var capped=Math.min(100,v);
  var pct=Math.max(0,capped);
  return getColor(pct/100);
}
|
1928 |
function init(){
|
1929 |
+
var labels = ['gpt-5', 'gemini-2.5-pro', 'grok-4', 'gemini-2.5-flash', 'qwen3-235b-instruct', 'gpt-5-mini', 'kimi-k2-instruct', 'gpt-oss-120b'];
|
1930 |
+
var day1 = [94.3, 57.1, 37.1, 51.4, 42.9, 34.3, 51.4, 11.8];
|
1931 |
+
var day2 = [93.5, 42.9, 52.9, 36.0, 28.0, 36.8, 30.4, 0.0];
|
1932 |
var ctx = canvas.getContext('2d');
|
1933 |
var isMobile = window.matchMedia && window.matchMedia('(max-width: 640px)').matches;
|
1934 |
if(isMobile){
|
|
|
2185 |
// Map a rate (nominally 0–1) onto the heat-map palette, light for low and dark for high.
// Values that reach no stop (below 0, or not comparable such as NaN) get the first
// colour; values at or beyond the last stop get the last colour; anything in between
// is blended between its two neighbouring stops via interp().
function getColor(rate){
  var palette=[
    {t:0,c:'#fff6ef'},
    {t:0.2,c:'#ffd7b3'},
    {t:0.45,c:'#fc630d'},
    {t:0.75,c:'#d50039'},
    {t:1,c:'#990028'}
  ];
  var top=palette.length-1;
  // Highest stop whose threshold the rate has reached (-1 when none).
  var idx=-1;
  for(var k=0;k<=top;k++){
    if(rate>=palette[k].t){ idx=k; }
  }
  if(idx<0){ return palette[0].c; }
  if(idx===top){ return palette[top].c; }
  var lower=palette[idx];
  var upper=palette[idx+1];
  return interp(lower.c,upper.c,(rate-lower.t)/(upper.t-lower.t));
}
|
2186 |
// Colour for a bar whose value is a percentage: the value is clamped to 0–100,
// scaled down to 0–1, and looked up with getColor().
function barColor(v){
  var capped=Math.min(100,v);
  var pct=Math.max(0,capped);
  return getColor(pct/100);
}
|
2187 |
function init(){
|
2188 |
+
var labels = ['gpt-5','gemini-2.5-pro','grok-4','gemini-2.5-flash','qwen3-235b-instruct','gpt-5-mini','kimi-k2-instruct','gpt-oss-120b'];
|
2189 |
+
var values = [0.0, 25.7, 40.0, 42.9, 40.0, 50.0, 34.3, 62.9];
|
2190 |
var ctx = c.getContext('2d');
|
2191 |
new Chart(ctx, {
|
2192 |
type: 'bar',
|
|
|
2309 |
// Map a rate (nominally 0–1) onto the heat-map palette, light for low and dark for high.
// Values that reach no stop (below 0, or not comparable such as NaN) get the first
// colour; values at or beyond the last stop get the last colour; anything in between
// is blended between its two neighbouring stops via interp().
function getColor(rate){
  var palette=[
    {t:0,c:'#fff6ef'},
    {t:0.2,c:'#ffd7b3'},
    {t:0.45,c:'#fc630d'},
    {t:0.75,c:'#d50039'},
    {t:1,c:'#990028'}
  ];
  var top=palette.length-1;
  // Highest stop whose threshold the rate has reached (-1 when none).
  var idx=-1;
  for(var k=0;k<=top;k++){
    if(rate>=palette[k].t){ idx=k; }
  }
  if(idx<0){ return palette[0].c; }
  if(idx===top){ return palette[top].c; }
  var lower=palette[idx];
  var upper=palette[idx+1];
  return interp(lower.c,upper.c,(rate-lower.t)/(upper.t-lower.t));
}
|
2310 |
// Colour for a bar whose value is a percentage: the value is clamped to 0–100,
// scaled down to 0–1, and looked up with getColor().
function barColor(v){
  var capped=Math.min(100,v);
  var pct=Math.max(0,capped);
  return getColor(pct/100);
}
|
2311 |
function init(){
|
2312 |
+
var labels = ['gpt-5','gemini-2.5-pro','grok-4','gemini-2.5-flash','qwen3-235b-instruct','gpt-5-mini','kimi-k2-instruct','gpt-oss-120b'];
|
2313 |
+
var values = [100.0, 71.4, 51.4, 48.6, 57.1, 47.1, 28.6, 14.3];
|
2314 |
var ctx = c.getContext('2d');
|
2315 |
new Chart(ctx, {
|
2316 |
type: 'bar',
|