davidpomerenke committed on
Commit
3fbff09
·
verified ·
1 Parent(s): fd102e9

Upload from GitHub Actions: Exclude TruthfulQA from proficiency score

Browse files
Files changed (2) hide show
  1. evals/backend.py +1 -1
  2. frontend/src/App.js +219 -90
evals/backend.py CHANGED
@@ -26,7 +26,7 @@ task_metrics = [
26
  "classification_accuracy",
27
  "mmlu_accuracy",
28
  "arc_accuracy",
29
- "truthfulqa_accuracy",
30
  "mgsm_accuracy",
31
  ]
32
 
 
26
  "classification_accuracy",
27
  "mmlu_accuracy",
28
  "arc_accuracy",
29
+ # "truthfulqa_accuracy",
30
  "mgsm_accuracy",
31
  ]
32
 
frontend/src/App.js CHANGED
@@ -57,20 +57,47 @@ function App () {
57
 
58
  return (
59
  <PrimeReactProvider>
60
- <div style={{ minHeight: '100vh', display: 'flex', flexDirection: 'column', width: '100vw' }}>
61
- <div style={{backgroundColor: '#fff3cd', color: '#856404', padding: '0.75rem 1.25rem', marginBottom: '1rem', border: '1px solid #ffeeba', borderRadius: '0.25rem', textAlign: 'center'}}>
62
- <strong>Work in Progress:</strong> This dashboard is currently under active development. Evaluation results are not yet final.
63
- <a href="https://github.com/datenlabor-bmz/ai-language-monitor" target="_blank" rel="noopener noreferrer" style={{
64
- textDecoration: 'none',
65
- color: '#856404',
66
- float: 'right',
67
- fontSize: '1.2rem',
68
- fontWeight: 'bold',
69
- padding: '0 0.5rem',
70
- borderRadius: '3px',
71
- backgroundColor: 'rgba(255,255,255,0.3)'
72
- }}>
73
- <i className="pi pi-github" title="View on GitHub" style={{ marginRight: '0.3rem' }} />
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  GitHub
75
  </a>
76
  </div>
@@ -95,30 +122,42 @@ function App () {
95
  🌍
96
  </span>
97
  </div>
98
- <h1 style={{
99
- fontSize: '2.5rem',
100
- fontWeight: '600',
101
- margin: '1rem 0 0.5rem 0',
102
- color: '#333',
103
- letterSpacing: '-0.01em'
104
- }}>
105
- AI Language Proficiency Monitor
 
 
106
  </h1>
107
- <p style={{
108
- fontSize: '1.1rem',
109
- color: '#666',
110
- margin: '0 0 2.5rem 0',
111
- fontWeight: '400',
112
- maxWidth: '700px',
113
- lineHeight: '1.5'
114
- }}>
 
 
115
  Comprehensive multilingual evaluation results for AI language models
116
  </p>
117
-
118
- <div style={{ display: 'flex', gap: '1rem', marginBottom: '1.5rem', flexWrap: 'wrap', justifyContent: 'center' }}>
119
- <Button
120
- label="📚 About this tool"
121
- className="p-button-text"
 
 
 
 
 
 
 
 
122
  onClick={() => setAboutVisible(true)}
123
  style={{
124
  color: '#666',
@@ -128,12 +167,12 @@ function App () {
128
  fontSize: '0.9rem'
129
  }}
130
  />
131
-
132
- <Button
133
- label="🚀 Add your model (soon)"
134
- className="p-button-text"
135
  onClick={() => setContributeVisible(true)}
136
- tooltip="This feature is on our roadmap and will be available soon."
137
  tooltipOptions={{ position: 'bottom' }}
138
  style={{
139
  color: '#666',
@@ -144,7 +183,7 @@ function App () {
144
  }}
145
  />
146
  </div>
147
-
148
  {data && (
149
  <AutoComplete
150
  languages={data?.language_table}
@@ -164,30 +203,31 @@ function App () {
164
  >
165
  {loading && (
166
  <div style={{ width: '100%', textAlign: 'center' }}>
167
- <i className='pi pi-spinner pi-spin' style={{ fontSize: '4rem' }} />
 
 
 
 
 
 
 
 
168
  </div>
169
  )}
170
- {error && <div style={{ width: '100%', textAlign: 'center' }}><p>Error: {error}</p></div>}
171
  {data && (
172
  <>
173
- <div style={{ width: '100%' }}>
174
- <ModelTable
175
- data={data.model_table}
176
- selectedLanguages={selectedLanguages}
177
- allLanguages={data.language_table || []}
178
- />
179
- </div>
180
- <div style={{ width: '100%' }}>
181
- <LanguageTable
182
- data={data.language_table}
183
- selectedLanguages={selectedLanguages}
184
- setSelectedLanguages={setSelectedLanguages}
185
- totalModels={data.model_table?.length || 0}
186
- />
187
- </div>
188
- <div style={{ width: '100%' }}>
189
- <DatasetTable data={data} />
190
- </div>
191
  <div
192
  id='figure'
193
  style={{
@@ -196,10 +236,10 @@ function App () {
196
  }}
197
  >
198
  <Button
199
- icon="pi pi-external-link"
200
- className="p-button-text p-button-plain"
201
  onClick={() => setDialogVisible(true)}
202
- tooltip="Open in larger view"
203
  style={{
204
  position: 'absolute',
205
  top: '10px',
@@ -214,7 +254,7 @@ function App () {
214
  <LanguagePlot data={data} />,
215
  <SpeakerPlot data={data} />,
216
  <HistoryPlot data={data} />,
217
- <CostPlot data={data} />,
218
  ]}
219
  numScroll={1}
220
  numVisible={1}
@@ -233,35 +273,85 @@ function App () {
233
  onHide={() => setAboutVisible(false)}
234
  style={{ width: '600px' }}
235
  modal
236
- header="About this tool"
237
  >
238
  <div>
239
- <p>The <i>AI Language Proficiency Monitor</i> presents comprehensive multilingual evaluation results of AI language models.</p>
 
 
 
240
  <h4>Who is this for?</h4>
241
  <ul>
242
- <li><b>Practitioners</b> can pick the best model for a given language.</li>
243
- <li><b>Policymakers and funders</b> can identify and prioritize neglected languages.</li>
244
- <li><b>Model developers</b> can compete on our <i>AI Language Proficiency</i> metric.</li>
 
 
 
 
 
 
 
 
 
245
  </ul>
246
  <h4>⚡ Live Updates</h4>
247
- <p>Benchmark results automatically refresh every night and include the most popular models from <a href="https://openrouter.ai" target="_blank" rel="noopener noreferrer">OpenRouter</a>, plus community-submitted models.</p>
 
 
 
 
 
 
 
 
 
 
 
248
  <h4>Authors</h4>
249
- <p>The AI Language Proficiency Monitor is a collaboration between BMZ's <a href="https://www.bmz-digital.global/en/overview-of-initiatives/the-bmz-data-lab/" target="_blank" rel="noopener noreferrer">Data Lab</a>, the BMZ-Initiative <a href="https://www.bmz-digital.global/en/overview-of-initiatives/fair-forward/" target="_blank" rel="noopener noreferrer">Fair Forward</a> (implemented by GIZ), and the <a href="https://www.dfki.de/en/web/research/research-departments/multilinguality-and-language-technology/ee-team" target="_blank" rel="noopener noreferrer">E&E group</a> of DFKI's Multilinguality and Language Technology Lab.</p>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
  <h4>🔗 Links</h4>
251
  <p>
252
- <a
253
- href="https://github.com/datenlabor-bmz/ai-language-monitor"
254
- target="_blank"
255
- rel="noopener noreferrer"
256
- style={{
257
- color: '#666',
258
  textDecoration: 'none',
259
  display: 'inline-flex',
260
  alignItems: 'center',
261
  gap: '0.5rem'
262
  }}
263
  >
264
- <i className="pi pi-github" style={{ fontSize: '1.2rem' }} />
265
  View source code on GitHub
266
  </a>
267
  </p>
@@ -274,16 +364,39 @@ function App () {
274
  onHide={() => setContributeVisible(false)}
275
  style={{ width: '600px' }}
276
  modal
277
- header="Add your model & Contribute"
278
  >
279
  <div>
280
  <h4>🚀 Submit Your Model</h4>
281
- <p>Have a custom fine-tuned model you'd like to see on the leaderboard?</p>
282
- <p><a href="https://forms.gle/ckvY9pS7XLcHYnaV8" target="_blank" rel="noopener noreferrer" style={{ color: '#28a745', fontWeight: 'bold' }}>→ Submit your model here</a></p>
283
-
 
 
 
 
 
 
 
 
 
 
 
 
284
  <h4>🔧 Contribute to Development</h4>
285
- <p>Help us expand language coverage and add new evaluation tasks:</p>
286
- <p><a href="https://github.com/datenlabor-bmz/ai-language-monitor/blob/main/CONTRIBUTING.md" target="_blank" rel="noopener noreferrer" style={{ color: '#007bff', fontWeight: 'bold' }}>→ Contribution guidelines</a></p>
 
 
 
 
 
 
 
 
 
 
 
287
  </div>
288
  </Dialog>
289
 
@@ -300,11 +413,27 @@ function App () {
300
  <div style={{ width: '100%', height: '100%' }}>
301
  <Carousel
302
  value={[
303
- <WorldMap data={data.countries} width={windowWidth * 0.7} height={windowHeight * 0.6} />,
304
- <LanguagePlot data={data} width={windowWidth * 0.7} height={windowHeight * 0.6} />,
305
- <SpeakerPlot data={data} width={windowWidth * 0.7} height={windowHeight * 0.6} />,
306
- <HistoryPlot data={data} width={windowWidth * 0.7} height={windowHeight * 0.6} />,
307
- <CostPlot data={data} />,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
308
  ]}
309
  numScroll={1}
310
  numVisible={1}
 
57
 
58
  return (
59
  <PrimeReactProvider>
60
+ <div
61
+ style={{
62
+ minHeight: '100vh',
63
+ display: 'flex',
64
+ flexDirection: 'column',
65
+ width: '100vw'
66
+ }}
67
+ >
68
+ <div
69
+ style={{
70
+ backgroundColor: '#fff3cd',
71
+ color: '#856404',
72
+ padding: '0.75rem 1.25rem',
73
+ marginBottom: '1rem',
74
+ border: '1px solid #ffeeba',
75
+ borderRadius: '0.25rem',
76
+ textAlign: 'center'
77
+ }}
78
+ >
79
+ <strong>Work in Progress:</strong> This dashboard is currently under
80
+ active development. Evaluation results are not yet final.
81
+ <a
82
+ href='https://github.com/datenlabor-bmz/ai-language-monitor'
83
+ target='_blank'
84
+ rel='noopener noreferrer'
85
+ style={{
86
+ textDecoration: 'none',
87
+ color: '#856404',
88
+ float: 'right',
89
+ fontSize: '1.2rem',
90
+ fontWeight: 'bold',
91
+ padding: '0 0.5rem',
92
+ borderRadius: '3px',
93
+ backgroundColor: 'rgba(255,255,255,0.3)'
94
+ }}
95
+ >
96
+ <i
97
+ className='pi pi-github'
98
+ title='View on GitHub'
99
+ style={{ marginRight: '0.3rem' }}
100
+ />
101
  GitHub
102
  </a>
103
  </div>
 
122
  🌍
123
  </span>
124
  </div>
125
+ <h1
126
+ style={{
127
+ fontSize: '2.5rem',
128
+ fontWeight: '600',
129
+ margin: '1rem 0 0.5rem 0',
130
+ color: '#333',
131
+ letterSpacing: '-0.01em'
132
+ }}
133
+ >
134
+ AI Language Proficiency Monitor
135
  </h1>
136
+ <p
137
+ style={{
138
+ fontSize: '1.1rem',
139
+ color: '#666',
140
+ margin: '0 0 2.5rem 0',
141
+ fontWeight: '400',
142
+ maxWidth: '700px',
143
+ lineHeight: '1.5'
144
+ }}
145
+ >
146
  Comprehensive multilingual evaluation results for AI language models
147
  </p>
148
+
149
+ <div
150
+ style={{
151
+ display: 'flex',
152
+ gap: '1rem',
153
+ marginBottom: '1.5rem',
154
+ flexWrap: 'wrap',
155
+ justifyContent: 'center'
156
+ }}
157
+ >
158
+ <Button
159
+ label='πŸ“š About this tool'
160
+ className='p-button-text'
161
  onClick={() => setAboutVisible(true)}
162
  style={{
163
  color: '#666',
 
167
  fontSize: '0.9rem'
168
  }}
169
  />
170
+
171
+ <Button
172
+ label='🚀 Add your model (soon)'
173
+ className='p-button-text'
174
  onClick={() => setContributeVisible(true)}
175
+ tooltip='This feature is on our roadmap and will be available soon.'
176
  tooltipOptions={{ position: 'bottom' }}
177
  style={{
178
  color: '#666',
 
183
  }}
184
  />
185
  </div>
186
+
187
  {data && (
188
  <AutoComplete
189
  languages={data?.language_table}
 
203
  >
204
  {loading && (
205
  <div style={{ width: '100%', textAlign: 'center' }}>
206
+ <i
207
+ className='pi pi-spinner pi-spin'
208
+ style={{ fontSize: '4rem' }}
209
+ />
210
+ </div>
211
+ )}
212
+ {error && (
213
+ <div style={{ width: '100%', textAlign: 'center' }}>
214
+ <p>Error: {error}</p>
215
  </div>
216
  )}
 
217
  {data && (
218
  <>
219
+ <ModelTable
220
+ data={data.model_table}
221
+ selectedLanguages={selectedLanguages}
222
+ allLanguages={data.language_table || []}
223
+ />
224
+ <LanguageTable
225
+ data={data.language_table}
226
+ selectedLanguages={selectedLanguages}
227
+ setSelectedLanguages={setSelectedLanguages}
228
+ totalModels={data.model_table?.length || 0}
229
+ />
230
+ <DatasetTable data={data} />
 
 
 
 
 
 
231
  <div
232
  id='figure'
233
  style={{
 
236
  }}
237
  >
238
  <Button
239
+ icon='pi pi-external-link'
240
+ className='p-button-text p-button-plain'
241
  onClick={() => setDialogVisible(true)}
242
+ tooltip='Open in larger view'
243
  style={{
244
  position: 'absolute',
245
  top: '10px',
 
254
  <LanguagePlot data={data} />,
255
  <SpeakerPlot data={data} />,
256
  <HistoryPlot data={data} />,
257
+ <CostPlot data={data} />
258
  ]}
259
  numScroll={1}
260
  numVisible={1}
 
273
  onHide={() => setAboutVisible(false)}
274
  style={{ width: '600px' }}
275
  modal
276
+ header='About this tool'
277
  >
278
  <div>
279
+ <p>
280
+ The <i>AI Language Proficiency Monitor</i> presents comprehensive
281
+ multilingual evaluation results of AI language models.
282
+ </p>
283
  <h4>Who is this for?</h4>
284
  <ul>
285
+ <li>
286
+ <b>Practitioners</b> can pick the best model for a given
287
+ language.
288
+ </li>
289
+ <li>
290
+ <b>Policymakers and funders</b> can identify and prioritize
291
+ neglected languages.
292
+ </li>
293
+ <li>
294
+ <b>Model developers</b> can compete on our{' '}
295
+ <i>AI Language Proficiency</i> metric.
296
+ </li>
297
  </ul>
298
  <h4>⚡ Live Updates</h4>
299
+ <p>
300
+ Benchmark results automatically refresh every night and include
301
+ the most popular models from{' '}
302
+ <a
303
+ href='https://openrouter.ai'
304
+ target='_blank'
305
+ rel='noopener noreferrer'
306
+ >
307
+ OpenRouter
308
+ </a>
309
+ , plus community-submitted models.
310
+ </p>
311
  <h4>Authors</h4>
312
+ <p>
313
+ The AI Language Proficiency Monitor is a collaboration between
314
+ BMZ's{' '}
315
+ <a
316
+ href='https://www.bmz-digital.global/en/overview-of-initiatives/the-bmz-data-lab/'
317
+ target='_blank'
318
+ rel='noopener noreferrer'
319
+ >
320
+ Data Lab
321
+ </a>
322
+ , the BMZ-Initiative{' '}
323
+ <a
324
+ href='https://www.bmz-digital.global/en/overview-of-initiatives/fair-forward/'
325
+ target='_blank'
326
+ rel='noopener noreferrer'
327
+ >
328
+ Fair Forward
329
+ </a>{' '}
330
+ (implemented by GIZ), and the{' '}
331
+ <a
332
+ href='https://www.dfki.de/en/web/research/research-departments/multilinguality-and-language-technology/ee-team'
333
+ target='_blank'
334
+ rel='noopener noreferrer'
335
+ >
336
+ E&E group
337
+ </a>{' '}
338
+ of DFKI's Multilinguality and Language Technology Lab.
339
+ </p>
340
  <h4>🔗 Links</h4>
341
  <p>
342
+ <a
343
+ href='https://github.com/datenlabor-bmz/ai-language-monitor'
344
+ target='_blank'
345
+ rel='noopener noreferrer'
346
+ style={{
347
+ color: '#666',
348
  textDecoration: 'none',
349
  display: 'inline-flex',
350
  alignItems: 'center',
351
  gap: '0.5rem'
352
  }}
353
  >
354
+ <i className='pi pi-github' style={{ fontSize: '1.2rem' }} />
355
  View source code on GitHub
356
  </a>
357
  </p>
 
364
  onHide={() => setContributeVisible(false)}
365
  style={{ width: '600px' }}
366
  modal
367
+ header='Add your model & Contribute'
368
  >
369
  <div>
370
  <h4>🚀 Submit Your Model</h4>
371
+ <p>
372
+ Have a custom fine-tuned model you'd like to see on the
373
+ leaderboard?
374
+ </p>
375
+ <p>
376
+ <a
377
+ href='https://forms.gle/ckvY9pS7XLcHYnaV8'
378
+ target='_blank'
379
+ rel='noopener noreferrer'
380
+ style={{ color: '#28a745', fontWeight: 'bold' }}
381
+ >
382
+ → Submit your model here
383
+ </a>
384
+ </p>
385
+
386
  <h4>🔧 Contribute to Development</h4>
387
+ <p>
388
+ Help us expand language coverage and add new evaluation tasks:
389
+ </p>
390
+ <p>
391
+ <a
392
+ href='https://github.com/datenlabor-bmz/ai-language-monitor/blob/main/CONTRIBUTING.md'
393
+ target='_blank'
394
+ rel='noopener noreferrer'
395
+ style={{ color: '#007bff', fontWeight: 'bold' }}
396
+ >
397
+ → Contribution guidelines
398
+ </a>
399
+ </p>
400
  </div>
401
  </Dialog>
402
 
 
413
  <div style={{ width: '100%', height: '100%' }}>
414
  <Carousel
415
  value={[
416
+ <WorldMap
417
+ data={data.countries}
418
+ width={windowWidth * 0.7}
419
+ height={windowHeight * 0.6}
420
+ />,
421
+ <LanguagePlot
422
+ data={data}
423
+ width={windowWidth * 0.7}
424
+ height={windowHeight * 0.6}
425
+ />,
426
+ <SpeakerPlot
427
+ data={data}
428
+ width={windowWidth * 0.7}
429
+ height={windowHeight * 0.6}
430
+ />,
431
+ <HistoryPlot
432
+ data={data}
433
+ width={windowWidth * 0.7}
434
+ height={windowHeight * 0.6}
435
+ />,
436
+ <CostPlot data={data} />
437
  ]}
438
  numScroll={1}
439
  numVisible={1}