akera committed
Commit 7c90731 · verified · 1 Parent(s): 3dcbb9d

Update app.py

Files changed (1):
  1. app.py +264 -964
app.py CHANGED
@@ -3,11 +3,13 @@ import subprocess
3
  import sys
4
  import os
5
  from pathlib import Path
6
 
7
  def setup_salt():
8
  """Clone and setup SALT library like in Colab."""
9
  try:
10
- # Check if salt is already available
11
  import salt.dataset
12
  print("βœ… SALT library already available")
13
  return True
@@ -17,7 +19,6 @@ def setup_salt():
17
  print("πŸ“₯ Setting up SALT library...")
18
 
19
  try:
20
- # Clone SALT repo if not exists
21
  salt_dir = Path("salt")
22
  if not salt_dir.exists():
23
  print("πŸ”„ Cloning SALT repository...")
@@ -27,7 +28,6 @@ def setup_salt():
27
  else:
28
  print("πŸ“ SALT repository already exists")
29
 
30
- # Install SALT requirements
31
  salt_requirements = salt_dir / "requirements.txt"
32
  if salt_requirements.exists():
33
  print("πŸ“¦ Installing SALT requirements...")
@@ -35,13 +35,11 @@ def setup_salt():
35
  sys.executable, "-m", "pip", "install", "-q", "-r", str(salt_requirements)
36
  ])
37
 
38
- # Add SALT directory to Python path
39
  salt_path = str(salt_dir.absolute())
40
  if salt_path not in sys.path:
41
  sys.path.insert(0, salt_path)
42
  print(f"πŸ”— Added {salt_path} to Python path")
43
 
44
- # Test import
45
  import salt.dataset
46
  print("βœ… SALT library setup completed successfully")
47
  return True
@@ -51,186 +49,119 @@ def setup_salt():
51
  return False
52
 
53
  # Setup SALT on startup
54
- print("πŸš€ Starting SALT Translation Leaderboard - Scientific Edition...")
55
  if not setup_salt():
56
  print("❌ Cannot continue without SALT library")
57
- print("πŸ’‘ Please check that git is available and GitHub is accessible")
58
  sys.exit(1)
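The startup hunks above are hard to follow through the diff gutter, so here is a consolidated sketch of the same clone–install–import pattern. The repository URL is elided in this diff, so the constant below is a placeholder, and this is an illustration rather than the file's exact code.

```python
import subprocess
import sys
from pathlib import Path

SALT_REPO_URL = "https://github.com/..."  # placeholder; the real URL is not shown in this diff


def setup_salt_sketch() -> bool:
    """Clone SALT once, install its requirements, and make it importable."""
    salt_dir = Path("salt")
    if not salt_dir.exists():
        subprocess.run(["git", "clone", SALT_REPO_URL, str(salt_dir)], check=True)

    requirements = salt_dir / "requirements.txt"
    if requirements.exists():
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", "-q", "-r", str(requirements)]
        )

    # Make `import salt.dataset` resolve against the fresh checkout.
    salt_path = str(salt_dir.absolute())
    if salt_path not in sys.path:
        sys.path.insert(0, salt_path)

    try:
        import salt.dataset  # noqa: F401
        return True
    except ImportError:
        return False
```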
59
 
60
  import gradio as gr
61
  import pandas as pd
62
  import json
63
- import traceback
64
- from datetime import datetime
65
- from typing import Optional, Dict, Tuple, List
66
 
67
- # Import our enhanced modules
68
  from src.test_set import (
69
- get_public_test_set_scientific,
70
- get_complete_test_set_scientific,
71
- create_test_set_download_scientific,
72
- validate_test_set_integrity_scientific,
73
- get_track_test_set
74
- )
75
- from src.validation import validate_submission_scientific
76
- from src.evaluation import (
77
- evaluate_predictions_scientific,
78
- generate_scientific_report,
79
- compare_models_statistically
80
  )
 
 
81
  from src.leaderboard import (
82
- load_scientific_leaderboard,
83
- add_model_to_scientific_leaderboard,
84
- get_scientific_leaderboard_stats,
85
  get_track_leaderboard,
86
- prepare_track_leaderboard_display,
87
- perform_fair_comparison,
88
- export_scientific_leaderboard
89
  )
90
  from src.plotting import (
91
- create_scientific_leaderboard_plot,
92
- create_language_pair_heatmap_scientific,
93
- create_statistical_comparison_plot,
94
- create_category_comparison_plot,
95
- create_adequacy_analysis_plot,
96
- create_cross_track_analysis_plot,
97
- create_scientific_model_detail_plot
98
- )
99
- from src.utils import (
100
- sanitize_model_name,
101
- get_all_language_pairs,
102
- get_google_comparable_pairs,
103
- get_track_language_pairs,
104
- format_metric_value
105
  )
 
106
  from config import *
107
 
108
  # Global variables for caching
109
  current_leaderboard = None
110
  public_test_set = None
111
  complete_test_set = None
112
- test_set_stats = None
113
 
114
- def initialize_scientific_data():
115
- """Initialize scientific test sets and leaderboard data."""
116
- global public_test_set, complete_test_set, current_leaderboard, test_set_stats
117
 
118
  try:
119
- print("πŸ”¬ Initializing SALT Translation Leaderboard - Scientific Edition...")
 
 
120
 
121
- # Load scientific test sets
122
- print("πŸ“₯ Loading scientific test sets...")
123
- public_test_set = get_public_test_set_scientific()
124
- complete_test_set = get_complete_test_set_scientific()
125
 
126
- # Load scientific leaderboard
127
- print("πŸ† Loading scientific leaderboard...")
128
- current_leaderboard = load_scientific_leaderboard()
129
-
130
- # Validate test set integrity
131
- print("πŸ” Validating test set integrity...")
132
- test_set_stats = validate_test_set_integrity_scientific()
133
-
134
- print(f"βœ… Scientific initialization complete!")
135
  print(f" - Test set: {len(public_test_set):,} samples")
136
- print(f" - Integrity score: {test_set_stats.get('integrity_score', 0):.2f}")
137
- print(f" - Scientific adequacy: {test_set_stats.get('scientific_adequacy', {}).get('overall_adequacy', 'unknown')}")
138
  print(f" - Current models: {len(current_leaderboard)}")
139
 
140
  return True
141
 
142
  except Exception as e:
143
- print(f"❌ Scientific initialization failed: {e}")
144
  traceback.print_exc()
145
  return False
146
 
147
- def download_scientific_test_set() -> Tuple[str, str]:
148
- """Create downloadable scientific test set and return file path and info."""
149
-
150
  try:
151
  global public_test_set
152
  if public_test_set is None:
153
- public_test_set = get_public_test_set_scientific()
154
-
155
- # Create download file
156
- download_path, stats = create_test_set_download_scientific()
157
 
158
- # Create comprehensive info message
159
- adequacy = stats.get('adequacy_assessment', 'unknown')
160
- adequacy_emoji = {
161
- 'excellent': '🟒',
162
- 'good': '🟑',
163
- 'fair': '🟠',
164
- 'insufficient': 'πŸ”΄',
165
- 'unknown': 'βšͺ'
166
- }.get(adequacy, 'βšͺ')
167
 
168
  info_msg = f"""
169
- ## πŸ“₯ SALT Scientific Test Set Downloaded Successfully!
170
-
171
- ### πŸ”¬ Scientific Edition Features:
172
- - **Stratified Sampling**: Ensures representative coverage across domains
173
- - **Statistical Weighting**: Samples weighted by track importance
174
- - **Track Balancing**: Optimized for fair cross-track comparison
175
- - **Adequacy Validation**: {adequacy_emoji} Overall adequacy: **{adequacy.title()}**
176
 
177
  ### πŸ“Š Dataset Statistics:
178
  - **Total Samples**: {stats['total_samples']:,}
179
  - **Languages**: {len(stats.get('languages', []))} ({', '.join(stats.get('languages', []))})
180
- - **Google Comparable**: {stats.get('google_comparable_samples', 0):,} samples ({stats.get('google_comparable_rate', 0):.1%})
181
- - **Domains**: {', '.join(stats.get('domains', ['general']))}
182
 
183
  ### 🏁 Track Breakdown:
184
  """
185
 
186
  track_breakdown = stats.get('track_breakdown', {})
187
  for track_name, track_info in track_breakdown.items():
188
- status_emoji = 'βœ…' if track_info.get('statistical_adequacy', False) else '⚠️'
189
  info_msg += f"""
190
- **{status_emoji} {track_info.get('name', track_name)}**:
191
  - Samples: {track_info.get('total_samples', 0):,}
192
  - Language Pairs: {track_info.get('language_pairs', 0)}
193
- - Min Required/Pair: {track_info.get('min_samples_per_pair', 0)}
194
- - Statistical Adequacy: {'Yes' if track_info.get('statistical_adequacy', False) else 'No'}
195
  """
196
 
197
  info_msg += f"""
198
 
199
- ### πŸ“‹ Enhanced File Format:
200
  - `sample_id`: Unique identifier for each sample
201
  - `source_text`: Text to be translated
202
  - `source_language`: Source language code
203
  - `target_language`: Target language code
204
  - `domain`: Content domain (if available)
205
  - `google_comparable`: Whether this pair can be compared with Google Translate
206
- - `tracks_included`: Comma-separated list of tracks that include this sample
207
- - `statistical_weight`: Statistical importance weight (1.0-5.0)
208
 
209
- ### πŸ”¬ Next Steps for Scientific Evaluation:
210
  1. **Run your model** on the source texts to generate translations
211
  2. **Create a predictions file** with columns: `sample_id`, `prediction`
212
- 3. **Optional**: Add `category` column to help with model classification
213
- 4. **Submit** your predictions using the appropriate track tab
214
- 5. **Analyze** results with statistical confidence intervals
215
-
216
- ### πŸ’‘ Tips for Best Results:
217
- - Ensure coverage of all language pairs for chosen track
218
- - Include confidence scores if available
219
- - Provide detailed model description for proper categorization
220
- - Consider submitting to multiple tracks for comprehensive evaluation
221
  """
222
 
223
  return download_path, info_msg
224
 
225
  except Exception as e:
226
- error_msg = f"❌ Error creating scientific test set download: {str(e)}"
227
  return None, error_msg
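The "Enhanced File Format" list in the download message above documents the columns shipped in the test-set CSV. As a rough illustration (not part of this commit), loading that file and carving out per-track subsets might look like the following; the filename is simply whatever the download tab produced.

```python
import pandas as pd

# "salt_test_set.csv" is a stand-in name; use whatever file the download tab gave you.
test_set = pd.read_csv("salt_test_set.csv")

# Columns documented above: sample_id, source_text, source_language, target_language,
# domain, google_comparable, tracks_included, statistical_weight
is_google = test_set["google_comparable"].astype(str).str.lower() == "true"
google_subset = test_set[is_google]

# tracks_included is a comma-separated list such as "google_comparable,ug40_complete"
ug40_subset = test_set[test_set["tracks_included"].str.contains("ug40_complete", na=False)]

print(f"{len(google_subset):,} Google-comparable rows, {len(ug40_subset):,} UG40 rows")
```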
228
 
229
- def validate_scientific_submission(
230
- file, model_name: str, author: str, description: str
231
- ) -> Tuple[str, Optional[pd.DataFrame], str]:
232
- """Validate uploaded prediction file with scientific rigor."""
233
-
234
  try:
235
  if file is None:
236
  return "❌ Please upload a predictions file", None, "community"
@@ -252,70 +183,50 @@ def validate_scientific_submission(
252
  else:
253
  return "❌ Could not read uploaded file", None, "community"
254
 
255
- # Determine filename
256
- filename = (
257
- getattr(file, "name", None)
258
- or getattr(file, "filename", None)
259
- or "predictions.csv"
260
- )
261
 
262
- # Load test set if needed
263
  global complete_test_set
264
  if complete_test_set is None:
265
- complete_test_set = get_complete_test_set_scientific()
266
 
267
- # Run enhanced scientific validation
268
- validation_result = validate_submission_scientific(
269
  file_content, filename, complete_test_set, model_name, author, description
270
  )
271
 
272
  detected_category = validation_result.get("category", "community")
273
 
274
- # Return predictions if evaluation is possible (even with limitations)
275
  if validation_result.get("can_evaluate", False):
276
  return validation_result["report"], validation_result["predictions"], detected_category
277
  else:
278
  return validation_result["report"], None, detected_category
279
 
280
  except Exception as e:
281
- return (
282
- f"❌ Validation error: {e}\n\nTraceback:\n{traceback.format_exc()}",
283
- None,
284
- "community"
285
- )
286
 
287
- def evaluate_scientific_submission(
288
  predictions_df: pd.DataFrame,
289
  model_name: str,
290
  author: str,
291
  description: str,
292
  detected_category: str,
293
- validation_info: Dict,
294
  ) -> Tuple[str, pd.DataFrame, object, object]:
295
- """Evaluate validated predictions using scientific methodology."""
296
-
297
  try:
298
  if predictions_df is None:
299
  return "❌ No valid predictions to evaluate", None, None, None
300
 
301
- # Get complete test set with targets
302
  global complete_test_set, current_leaderboard
303
  if complete_test_set is None:
304
- complete_test_set = get_complete_test_set_scientific()
305
 
306
- # Run scientific evaluation across all tracks
307
- print(f"πŸ”¬ Starting scientific evaluation for {model_name}...")
308
- evaluation_results = evaluate_predictions_scientific(
309
- predictions_df, complete_test_set, detected_category
310
- )
311
 
312
- if any(track_data.get('error') for track_data in evaluation_results.get('tracks', {}).values()):
313
- errors = [track_data['error'] for track_data in evaluation_results['tracks'].values() if track_data.get('error')]
314
- return f"❌ Evaluation errors: {'; '.join(errors)}", None, None, None
315
 
316
- # Add to scientific leaderboard
317
- print("πŸ† Adding to scientific leaderboard...")
318
- updated_leaderboard = add_model_to_scientific_leaderboard(
319
  model_name=sanitize_model_name(model_name),
320
  author=author or "Anonymous",
321
  evaluation_results=evaluation_results,
@@ -323,526 +234,221 @@ def evaluate_scientific_submission(
323
  description=description or ""
324
  )
325
 
326
- # Update global leaderboard
327
  current_leaderboard = updated_leaderboard
328
 
329
- # Generate scientific report
330
- report = generate_scientific_report(evaluation_results, model_name)
331
 
332
  # Create visualizations
333
- summary_plot = create_adequacy_analysis_plot(updated_leaderboard)
334
- cross_track_plot = create_cross_track_analysis_plot(updated_leaderboard)
335
-
336
- # Prepare display leaderboard (Google-comparable track by default)
337
  google_leaderboard = get_track_leaderboard(updated_leaderboard, "google_comparable")
338
- display_leaderboard = prepare_track_leaderboard_display(google_leaderboard, "google_comparable")
339
 
340
- # Format success message with track-specific results
341
  success_msg = f"""
342
- ## πŸŽ‰ Scientific Evaluation Complete!
343
 
344
  ### πŸ“Š Model Information:
345
  - **Model**: {model_name}
346
  - **Category**: {MODEL_CATEGORIES.get(detected_category, {}).get('name', detected_category)}
347
  - **Author**: {author or 'Anonymous'}
348
 
349
- ### πŸ† Track Performance Summary:
350
- """
351
-
352
- tracks = evaluation_results.get('tracks', {})
353
- for track_name, track_data in tracks.items():
354
- if not track_data.get('error'):
355
- track_config = EVALUATION_TRACKS[track_name]
356
- track_averages = track_data.get('track_averages', {})
357
- summary = track_data.get('summary', {})
358
-
359
- # Get rank in this track
360
- track_leaderboard = get_track_leaderboard(updated_leaderboard, track_name)
361
- if not track_leaderboard.empty:
362
- model_row = track_leaderboard[track_leaderboard['model_name'] == sanitize_model_name(model_name)]
363
- rank = model_row.index[0] + 1 if not model_row.empty else "N/A"
364
- total_models = len(track_leaderboard)
365
- else:
366
- rank = "N/A"
367
- total_models = 0
368
-
369
- quality_score = track_averages.get('quality_score', 0)
370
- bleu_score = track_averages.get('bleu', 0)
371
- samples = summary.get('total_samples', 0)
372
-
373
- success_msg += f"""
374
- **🏁 {track_config['name']}**:
375
- - Rank: #{rank} out of {total_models} models
376
- - Quality Score: {quality_score:.4f}
377
- - BLEU: {bleu_score:.2f}
378
- - Samples: {samples:,}
379
- """
380
-
381
- success_msg += f"""
382
-
383
- ### πŸ”¬ Scientific Adequacy:
384
- - **Cross-Track Consistency**: Available in detailed analysis
385
- - **Statistical Confidence**: 95% confidence intervals computed
386
- - **Sample Adequacy**: {validation_info.get('adequacy', {}).get('overall_adequate', 'Unknown')}
387
-
388
  {report}
389
  """
390
 
391
- return success_msg, display_leaderboard, summary_plot, cross_track_plot
392
 
393
  except Exception as e:
394
- error_msg = f"❌ Scientific evaluation failed: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
395
  return error_msg, None, None, None
396
 
397
- def refresh_track_leaderboard(
398
- track: str,
399
- search_query: str = "",
400
- category_filter: str = "all",
401
- min_adequacy: float = 0.0,
402
- show_ci: bool = True
403
- ) -> Tuple[pd.DataFrame, object, object, str]:
404
  """Refresh leaderboard for a specific track with filters."""
405
-
406
  try:
407
  global current_leaderboard
408
  if current_leaderboard is None:
409
- current_leaderboard = load_scientific_leaderboard()
410
 
411
- # Get track-specific leaderboard with better error handling
412
- try:
413
- track_leaderboard = get_track_leaderboard(
414
- current_leaderboard, track, category_filter=category_filter, min_adequacy=min_adequacy
415
- )
416
- except Exception as e:
417
- print(f"Error getting track leaderboard for {track}: {e}")
418
- track_leaderboard = pd.DataFrame()
419
 
420
  # Apply search filter
421
  if search_query and not track_leaderboard.empty:
422
- try:
423
- query_lower = search_query.lower()
424
- mask = (
425
- track_leaderboard['model_name'].str.lower().str.contains(query_lower, na=False) |
426
- track_leaderboard['author'].str.lower().str.contains(query_lower, na=False)
427
- )
428
- track_leaderboard = track_leaderboard[mask]
429
- except Exception as e:
430
- print(f"Error applying search filter: {e}")
431
-
432
- # Prepare for display
433
- try:
434
- display_df = prepare_track_leaderboard_display(track_leaderboard, track)
435
- except Exception as e:
436
- print(f"Error preparing display: {e}")
437
- display_df = pd.DataFrame()
438
-
439
- # Create plots with error handling
440
- try:
441
- ranking_plot = create_scientific_leaderboard_plot(track_leaderboard, track)
442
- except Exception as e:
443
- print(f"Error creating ranking plot: {e}")
444
- ranking_plot = None
445
 
446
- try:
447
- comparison_plot = create_statistical_comparison_plot(track_leaderboard, track)
448
- except Exception as e:
449
- print(f"Error creating comparison plot: {e}")
450
- comparison_plot = None
451
 
452
- # Get track statistics
453
- try:
454
- track_stats = get_scientific_leaderboard_stats(track_leaderboard, track)
455
- track_config = EVALUATION_TRACKS[track]
456
-
457
- stats_text = f"""
458
  ### πŸ“Š {track_config['name']} Statistics
459
 
460
- - **Total Models**: {track_stats.get('total_models', 0)}
461
- - **Models by Category**: {', '.join([f"{k}: {v}" for k, v in track_stats.get('models_by_category', {}).items()])}
462
- - **Average Quality Score**: {track_stats.get('track_statistics', {}).get(track, {}).get('avg_quality', 0.0):.4f}
463
 
464
- **Best Model**: {track_stats.get('best_models_by_track', {}).get(track, {}).get('name', 'None')}
465
- **Best Score**: {track_stats.get('best_models_by_track', {}).get(track, {}).get('quality', 0.0):.4f}
466
-
467
- ### πŸ”¬ Scientific Notes:
468
- - All metrics include 95% confidence intervals
469
- - Statistical adequacy verified for reliable comparisons
470
- - {track_config['description']}
471
- """
472
- except Exception as e:
473
- print(f"Error generating stats: {e}")
474
- stats_text = f"Error loading {track} statistics: {str(e)}"
475
 
476
  return display_df, ranking_plot, comparison_plot, stats_text
477
 
478
  except Exception as e:
479
  error_msg = f"Error loading {track} leaderboard: {str(e)}"
480
  print(error_msg)
481
- empty_df = pd.DataFrame()
482
- return empty_df, None, None, error_msg
483
 
484
- def get_scientific_model_details(model_name: str, track: str) -> Tuple[str, object, object]:
485
- """Get detailed scientific analysis for a specific model."""
486
-
487
  try:
488
  global current_leaderboard
489
  if current_leaderboard is None:
490
- return "Leaderboard not loaded", None, None
491
-
492
- # Find model
493
- model_row = current_leaderboard[current_leaderboard['model_name'] == model_name]
494
 
495
- if model_row.empty:
496
- return f"Model '{model_name}' not found in leaderboard", None, None
497
 
498
- model_info = model_row.iloc[0]
499
-
500
- # Parse detailed metrics for the requested track
501
- detailed_results = {}
502
- detailed_col = f'detailed_{track}'
503
-
504
- if detailed_col in model_info and pd.notna(model_info[detailed_col]):
505
- try:
506
- detailed_results = json.loads(model_info[detailed_col])
507
- print(f"Successfully loaded detailed results for {model_name} in {track}")
508
- except json.JSONDecodeError as e:
509
- print(f"Error parsing detailed metrics for {model_name}: {e}")
510
- detailed_results = {}
511
- else:
512
- print(f"No detailed metrics found for {model_name} in column {detailed_col}")
513
- # Create a fallback structure
514
- detailed_results = {
515
- 'tracks': {
516
- track: {
517
- 'pair_metrics': {},
518
- 'track_averages': {
519
- 'quality_score': model_info.get(f'{track}_quality', 0),
520
- 'bleu': model_info.get(f'{track}_bleu', 0),
521
- 'chrf': model_info.get(f'{track}_chrf', 0)
522
- }
523
- }
524
- }
525
- }
526
 
527
- # Create detailed plots
528
- try:
529
- detail_plot = create_scientific_model_detail_plot(detailed_results, model_name, track)
530
- except Exception as e:
531
- print(f"Error creating detail plot: {e}")
532
- detail_plot = None
533
 
534
- try:
535
- heatmap_plot = create_language_pair_heatmap_scientific(detailed_results, track)
536
- except Exception as e:
537
- print(f"Error creating heatmap plot: {e}")
538
- heatmap_plot = None
539
-
540
- # Format model details with scientific information
541
- track_config = EVALUATION_TRACKS[track]
542
- category_info = MODEL_CATEGORIES.get(model_info['model_category'], {})
543
-
544
- # Extract track-specific metrics
545
- quality_col = f"{track}_quality"
546
- bleu_col = f"{track}_bleu"
547
- chrf_col = f"{track}_chrf"
548
- ci_lower_col = f"{track}_ci_lower"
549
- ci_upper_col = f"{track}_ci_upper"
550
- samples_col = f"{track}_samples"
551
- pairs_col = f"{track}_pairs"
552
- adequate_col = f"{track}_adequate"
553
-
554
- details_text = f"""
555
- ## πŸ”¬ Scientific Model Analysis: {model_name}
556
-
557
- ### πŸ“‹ Basic Information:
558
- - **Author**: {model_info['author']}
559
- - **Category**: {category_info.get('name', 'Unknown')} - {category_info.get('description', '')}
560
- - **Submission Date**: {model_info['submission_date'][:10]}
561
- - **Description**: {model_info['description'] or 'No description provided'}
562
-
563
- ### 🏁 {track_config['name']} Performance:
564
- - **Quality Score**: {model_info.get(quality_col, 0):.4f}
565
- - **BLEU**: {model_info.get(bleu_col, 0):.2f}
566
- - **ChrF**: {model_info.get(chrf_col, 0):.4f}
567
- - **95% CI**: [{model_info.get(ci_lower_col, 0):.4f}, {model_info.get(ci_upper_col, 0):.4f}]
568
-
569
- ### πŸ“Š Coverage Information:
570
- - **Total Samples**: {model_info.get(samples_col, 0):,}
571
- - **Language Pairs Covered**: {model_info.get(pairs_col, 0)}
572
- - **Statistical Adequacy**: {'βœ… Yes' if model_info.get(adequate_col, False) else '❌ No'}
573
-
574
- ### πŸ”¬ Statistical Metadata:
575
- - **Confidence Level**: {STATISTICAL_CONFIG['confidence_level']:.0%}
576
- - **Bootstrap Samples**: {STATISTICAL_CONFIG['bootstrap_samples']:,}
577
- - **Scientific Adequacy Score**: {model_info.get('scientific_adequacy_score', 0.0):.3f}
578
-
579
- ### πŸ“ˆ Cross-Track Performance:
580
- """
581
-
582
- # Add other track performances for comparison
583
- for other_track in EVALUATION_TRACKS.keys():
584
- if other_track != track:
585
- other_quality_col = f"{other_track}_quality"
586
- other_adequate_col = f"{other_track}_adequate"
587
 
588
- if model_info.get(other_adequate_col, False):
589
- other_quality = model_info.get(other_quality_col, 0)
590
- details_text += f"- **{EVALUATION_TRACKS[other_track]['name']}**: {other_quality:.4f}\n"
591
- else:
592
- details_text += f"- **{EVALUATION_TRACKS[other_track]['name']}**: Not evaluated\n"
593
-
594
- # Add language pair performance if available
595
- if detailed_results and 'tracks' in detailed_results and track in detailed_results['tracks']:
596
- track_data = detailed_results['tracks'][track]
597
- pair_metrics = track_data.get('pair_metrics', {})
598
-
599
- if pair_metrics:
600
- details_text += f"""
601
-
602
- ### πŸ—ΊοΈ Language Pair Performance:
603
- Top performing pairs:
604
- """
605
- # Sort pairs by quality score
606
- pairs_sorted = []
607
- for pair_key, metrics in pair_metrics.items():
608
- if 'quality_score' in metrics and 'mean' in metrics['quality_score']:
609
- pairs_sorted.append((pair_key, metrics['quality_score']['mean']))
610
-
611
- pairs_sorted.sort(key=lambda x: x[1], reverse=True)
612
 
613
- for pair_key, score in pairs_sorted[:5]: # Top 5
614
- src, tgt = pair_key.split('_to_')
615
- src_name = LANGUAGE_NAMES.get(src, src)
616
- tgt_name = LANGUAGE_NAMES.get(tgt, tgt)
617
- details_text += f"- **{src_name} β†’ {tgt_name}**: {score:.4f}\n"
618
-
619
- details_text += f"""
620
-
621
- ### πŸ’‘ Scientific Interpretation:
622
- - Performance metrics include 95% confidence intervals for reliability
623
- - Statistical adequacy ensures meaningful comparisons with other models
624
- - Cross-track analysis reveals model strengths across different language sets
625
- - Category classification helps contextualize performance expectations
626
- """
627
-
628
- return details_text, detail_plot, heatmap_plot
629
 
630
  except Exception as e:
631
- error_msg = f"Error getting model details: {str(e)}\n{traceback.format_exc()}"
632
- print(error_msg)
633
- return error_msg, None, None
634
-
635
- def perform_model_comparison(
636
- model_names: List[str], track: str, comparison_type: str = "statistical"
637
- ) -> Tuple[str, object]:
638
- """Perform scientific comparison between selected models."""
639
-
640
- try:
641
- global current_leaderboard
642
- if current_leaderboard is None:
643
- return "Leaderboard not loaded", None
644
-
645
- if len(model_names) < 2:
646
- return "Please select at least 2 models for comparison", None
647
-
648
- # Get models
649
- models = current_leaderboard[current_leaderboard['model_name'].isin(model_names)]
650
-
651
- if len(models) < 2:
652
- return "Selected models not found in leaderboard", None
653
-
654
- # Perform fair comparison
655
- comparison_result = perform_fair_comparison(current_leaderboard, model_names)
656
-
657
- if comparison_result.get('error'):
658
- return f"Comparison error: {comparison_result['error']}", None
659
-
660
- # Create comparison visualization
661
- if comparison_type == "statistical":
662
- comparison_plot = create_statistical_comparison_plot(models, track)
663
- else:
664
- comparison_plot = create_category_comparison_plot(models, track)
665
-
666
- # Format comparison report
667
- track_config = EVALUATION_TRACKS[track]
668
- comparison_text = f"""
669
- ## πŸ”¬ Scientific Model Comparison - {track_config['name']}
670
-
671
- ### πŸ“Š Models Compared:
672
- """
673
-
674
- quality_col = f"{track}_quality"
675
- ci_lower_col = f"{track}_ci_lower"
676
- ci_upper_col = f"{track}_ci_upper"
677
-
678
- # Sort models by performance
679
- models_sorted = models.sort_values(quality_col, ascending=False)
680
-
681
- for i, (_, model) in enumerate(models_sorted.iterrows(), 1):
682
- category_info = MODEL_CATEGORIES.get(model['model_category'], {})
683
-
684
- comparison_text += f"""
685
- **#{i}. {model['model_name']}**
686
- - Category: {category_info.get('name', 'Unknown')}
687
- - Quality Score: {format_metric_value(model[quality_col], 'quality_score', True, model[ci_lower_col], model[ci_upper_col])}
688
- - Author: {model['author']}
689
- """
690
-
691
- # Add statistical analysis
692
- track_comparison = comparison_result.get('track_comparisons', {}).get(track, {})
693
- if track_comparison:
694
- comparison_text += f"""
695
-
696
- ### πŸ”¬ Statistical Analysis:
697
- - **Models with adequate data**: {track_comparison.get('participating_models', 0)}
698
- - **Confidence intervals available**: Yes (95% level)
699
- - **Fair comparison possible**: {'βœ… Yes' if comparison_result.get('fair_comparison_possible', False) else '⚠️ Limited'}
700
- """
701
-
702
- # Check for statistical significance (simplified)
703
- quality_scores = list(track_comparison.get('quality_scores', {}).values())
704
- if len(quality_scores) >= 2:
705
- score_range = max(quality_scores) - min(quality_scores)
706
- if score_range > 0.05: # 5% difference threshold
707
- comparison_text += "- **Performance differences**: Potentially significant\n"
708
- else:
709
- comparison_text += "- **Performance differences**: Minimal\n"
710
-
711
- # Add recommendations
712
- recommendations = comparison_result.get('recommendations', [])
713
- if recommendations:
714
- comparison_text += "\n### πŸ’‘ Recommendations:\n"
715
- for rec in recommendations:
716
- comparison_text += f"- {rec}\n"
717
-
718
- return comparison_text, comparison_plot
719
-
720
- except Exception as e:
721
- error_msg = f"Error performing comparison: {str(e)}"
722
- return error_msg, None
723
 
724
  # Initialize data on startup
725
- print("πŸš€ Starting SALT Translation Leaderboard - Scientific Edition...")
726
- initialization_success = initialize_scientific_data()
727
 
728
- # Create Gradio interface with scientific design
729
  with gr.Blocks(
730
- title=UI_CONFIG["title"],
731
  theme=gr.themes.Soft(),
732
  css="""
733
  .gradio-container {
734
  max-width: 1600px !important;
735
  margin: 0 auto;
736
  }
737
- .scientific-header {
738
- text-align: center;
739
- margin-bottom: 2rem;
740
- padding: 2rem;
741
- background: linear-gradient(135deg, #1e3a8a 0%, #3730a3 50%, #1e40af 100%);
742
- color: white !important;
743
- border-radius: 10px;
744
- box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
745
- }
746
- .scientific-header h1, .scientific-header p {
747
- color: white !important;
748
- }
749
 
750
- /* Simple fix for text visibility - force dark text on light background */
751
- .markdown, .gr-markdown {
752
- background: #ffffff !important;
753
- color: #1f2937 !important;
754
- padding: 1rem;
755
- border-radius: 8px;
756
- margin: 0.5rem 0;
757
  }
758
- .markdown h1, .markdown h2, .markdown h3, .markdown h4, .markdown h5, .markdown h6 {
759
- color: #1f2937 !important;
 
 
760
  }
761
- .markdown p, .markdown li, .markdown strong, .markdown em {
762
- color: #1f2937 !important;
 
 
763
  }
764
- .markdown code {
765
- background: #f3f4f6 !important;
766
- color: #1f2937 !important;
767
- padding: 0.2em 0.4em;
768
- border-radius: 4px;
769
  }
770
- .markdown pre {
771
- background: #f3f4f6 !important;
772
- color: #1f2937 !important;
773
- padding: 1rem;
774
- border-radius: 8px;
775
  }
776
 
777
- /* Track tab styling */
778
- .track-tab {
779
- background: #ffffff !important;
780
- color: #1f2937 !important;
781
- border-radius: 8px;
782
- margin: 0.5rem;
783
- padding: 1rem;
784
- border: 2px solid #e5e7eb;
785
  }
786
  """
787
  ) as demo:
788
 
789
- # Scientific Header
790
- gr.HTML(f"""
791
- <div class="scientific-header">
792
- <h1>πŸ† SALT Translation Leaderboard - Scientific Edition</h1>
793
- <p><strong>Rigorous Evaluation with Statistical Significance Testing</strong></p>
794
- <p>Three-tier evaluation tracks β€’ 95% Confidence intervals β€’ Research-grade analysis</p>
795
- <p><strong>Supported Languages</strong>: {len(ALL_UG40_LANGUAGES)} Ugandan languages | <strong>Google Comparable</strong>: {len(GOOGLE_SUPPORTED_LANGUAGES)} languages</p>
796
  </div>
797
  """)
798
 
799
  # Status indicator
800
  if initialization_success:
801
- status_msg = "βœ… Scientific system initialized successfully"
802
- adequacy_info = test_set_stats.get('scientific_adequacy', {}).get('overall_adequacy', 'unknown')
803
- status_msg += f" | Test set adequacy: {adequacy_info.title()}"
804
  else:
805
  status_msg = "❌ System initialization failed - some features may not work"
806
 
807
  gr.Markdown(f"**System Status**: {status_msg}")
808
 
809
- # Add scientific overview
810
- gr.Markdown("""
811
- ## πŸ”¬ Scientific Evaluation Framework
812
-
813
- This leaderboard implements rigorous scientific methodology for translation model evaluation:
814
-
815
- - **Three Evaluation Tracks**: Fair comparison across different model capabilities
816
- - **Statistical Significance**: 95% confidence intervals and effect size analysis
817
- - **Category-Based Analysis**: Commercial, Research, Baseline, and Community models
818
- - **Cross-Track Consistency**: Validate model performance across language sets
819
- """)
820
-
821
  with gr.Tabs():
822
 
823
  # Tab 1: Download Test Set
824
  with gr.Tab("πŸ“₯ Download Test Set", id="download"):
825
  gr.Markdown("""
826
- ## πŸ“‹ Get the SALT Scientific Test Set
827
 
828
- Download our scientifically designed test set with stratified sampling and statistical weighting.
829
  """)
830
 
831
- with gr.Row():
832
- download_btn = gr.Button("πŸ“₯ Download Scientific Test Set", variant="primary", size="lg")
833
 
834
  with gr.Row():
835
  with gr.Column():
836
  download_file = gr.File(label="πŸ“‚ Test Set File", interactive=False)
837
  with gr.Column():
838
- download_info = gr.Markdown(label="ℹ️ Test Set Information")
839
 
840
  # Tab 2: Submit Predictions
841
  with gr.Tab("πŸš€ Submit Predictions", id="submit"):
842
  gr.Markdown("""
843
- ## 🎯 Submit Your Model's Predictions for Scientific Evaluation
844
 
845
- Upload predictions for comprehensive evaluation across all three tracks with statistical analysis.
846
  """)
847
 
848
  with gr.Row():
@@ -864,51 +470,38 @@ with gr.Blocks(
864
  description_input = gr.Textbox(
865
  label="πŸ“„ Model Description",
866
  placeholder="Architecture, training data, special features...",
867
- lines=4,
868
- info="Detailed description helps with proper categorization"
869
  )
870
 
871
- gr.Markdown("### πŸ“€ Upload Predictions")
872
  predictions_file = gr.File(
873
  label="πŸ“‚ Predictions File",
874
  file_types=[".csv", ".tsv", ".json"]
875
  )
876
 
877
  validate_btn = gr.Button("βœ… Validate Submission", variant="secondary")
878
- submit_btn = gr.Button("πŸš€ Submit for Scientific Evaluation", variant="primary", interactive=False)
879
 
880
  with gr.Column(scale=1):
881
- gr.Markdown("### πŸ“Š Validation Results")
882
  validation_output = gr.Markdown()
883
 
884
- # Results section
885
- gr.Markdown("### πŸ† Scientific Evaluation Results")
886
-
887
- with gr.Row():
888
- evaluation_output = gr.Markdown()
889
 
890
  with gr.Row():
891
  with gr.Column():
892
- submission_plot = gr.Plot(label="πŸ“ˆ Submission Analysis")
893
  with gr.Column():
894
- cross_track_plot = gr.Plot(label="πŸ”„ Cross-Track Analysis")
895
-
896
- with gr.Row():
897
- results_table = gr.Dataframe(label="πŸ“Š Updated Leaderboard (Google-Comparable Track)", interactive=False)
898
 
899
  # Tab 3: Google-Comparable Track
900
- with gr.Tab("πŸ€– Google-Comparable Track", id="google_track", elem_classes=["track-tab", "google-comparable"]):
901
  gr.Markdown(f"""
902
- ## {UI_CONFIG['tracks']['google_comparable']['tab_name']}
903
 
904
- **Fair comparison with commercial translation systems**
905
 
906
- This track evaluates models on the {len(get_google_comparable_pairs())} language pairs supported by Google Translate,
907
  enabling direct comparison with commercial baselines.
908
-
909
- - **Languages**: {', '.join([LANGUAGE_NAMES[lang] for lang in GOOGLE_SUPPORTED_LANGUAGES])}
910
- - **Purpose**: Commercial system comparison and baseline establishment
911
- - **Statistical Power**: High (optimized sample sizes)
912
  """)
913
 
914
  with gr.Row():
@@ -920,39 +513,28 @@ with gr.Blocks(
920
  choices=["all"] + list(MODEL_CATEGORIES.keys()),
921
  value="all"
922
  )
923
- with gr.Column(scale=1):
924
- google_adequacy = gr.Slider(
925
- label="πŸ“Š Min Adequacy",
926
- minimum=0.0, maximum=1.0, value=0.0, step=0.1
927
- )
928
  with gr.Column(scale=1):
929
  google_refresh = gr.Button("πŸ”„ Refresh", variant="secondary")
930
 
931
- with gr.Row():
932
- google_stats = gr.Markdown()
933
 
934
  with gr.Row():
935
  with gr.Column():
936
- google_ranking_plot = gr.Plot(label="πŸ† Google-Comparable Rankings")
937
  with gr.Column():
938
- google_comparison_plot = gr.Plot(label="πŸ“Š Statistical Comparison")
939
 
940
- with gr.Row():
941
- google_leaderboard = gr.Dataframe(label="πŸ“ˆ Google-Comparable Leaderboard", interactive=False)
942
 
943
  # Tab 4: UG40-Complete Track
944
- with gr.Tab("🌍 UG40-Complete Track", id="ug40_track", elem_classes=["track-tab", "ug40-complete"]):
945
  gr.Markdown(f"""
946
- ## {UI_CONFIG['tracks']['ug40_complete']['tab_name']}
947
 
948
- **Comprehensive evaluation across all Ugandan languages**
949
 
950
- This track evaluates models on all {len(get_all_language_pairs())} UG40 language pairs,
951
- providing the most comprehensive assessment of Ugandan language translation capabilities.
952
-
953
- - **Languages**: All {len(ALL_UG40_LANGUAGES)} UG40 languages
954
- - **Purpose**: Comprehensive Ugandan language capability assessment
955
- - **Coverage**: Complete linguistic landscape of Uganda
956
  """)
957
 
958
  with gr.Row():
@@ -964,479 +546,197 @@ with gr.Blocks(
964
  choices=["all"] + list(MODEL_CATEGORIES.keys()),
965
  value="all"
966
  )
967
- with gr.Column(scale=1):
968
- ug40_adequacy = gr.Slider(
969
- label="πŸ“Š Min Adequacy",
970
- minimum=0.0, maximum=1.0, value=0.0, step=0.1
971
- )
972
  with gr.Column(scale=1):
973
  ug40_refresh = gr.Button("πŸ”„ Refresh", variant="secondary")
974
 
975
- with gr.Row():
976
- ug40_stats = gr.Markdown()
977
 
978
  with gr.Row():
979
  with gr.Column():
980
- ug40_ranking_plot = gr.Plot(label="πŸ† UG40-Complete Rankings")
981
  with gr.Column():
982
- ug40_comparison_plot = gr.Plot(label="πŸ“Š Statistical Comparison")
983
 
984
- with gr.Row():
985
- ug40_leaderboard = gr.Dataframe(label="πŸ“ˆ UG40-Complete Leaderboard", interactive=False)
986
 
987
- # Tab 5: Language-Pair Matrix
988
- with gr.Tab("πŸ“Š Language-Pair Matrix", id="matrix_track", elem_classes=["track-tab", "language-pair-matrix"]):
989
- gr.Markdown(f"""
990
- ## {UI_CONFIG['tracks']['language_pair_matrix']['tab_name']}
991
-
992
- **Detailed language pair analysis with statistical significance**
993
-
994
- This view provides granular analysis of model performance across individual language pairs
995
- with statistical significance testing and effect size analysis.
996
-
997
- - **Resolution**: Individual language pair performance
998
- - **Purpose**: Detailed linguistic analysis and model diagnostics
999
- - **Statistics**: Pairwise significance testing available
1000
- """)
1001
-
1002
- with gr.Row():
1003
- with gr.Column(scale=2):
1004
- matrix_search = gr.Textbox(label="πŸ” Search Models", placeholder="Search by model name, author...")
1005
- with gr.Column(scale=1):
1006
- matrix_category = gr.Dropdown(
1007
- label="🏷️ Category Filter",
1008
- choices=["all"] + list(MODEL_CATEGORIES.keys()),
1009
- value="all"
1010
- )
1011
- with gr.Column(scale=1):
1012
- matrix_adequacy = gr.Slider(
1013
- label="πŸ“Š Min Adequacy",
1014
- minimum=0.0, maximum=1.0, value=0.0, step=0.1
1015
- )
1016
- with gr.Column(scale=1):
1017
- matrix_refresh = gr.Button("πŸ”„ Refresh", variant="secondary")
1018
-
1019
- with gr.Row():
1020
- matrix_stats = gr.Markdown()
1021
-
1022
- with gr.Row():
1023
- with gr.Column():
1024
- matrix_ranking_plot = gr.Plot(label="πŸ† Language-Pair Matrix Rankings")
1025
- with gr.Column():
1026
- matrix_comparison_plot = gr.Plot(label="πŸ“Š Statistical Comparison")
1027
-
1028
- with gr.Row():
1029
- matrix_leaderboard = gr.Dataframe(label="πŸ“ˆ Language-Pair Matrix Leaderboard", interactive=False)
1030
-
1031
- # Tab 6: Model Analysis
1032
- with gr.Tab("πŸ” Scientific Model Analysis", id="analysis"):
1033
- gr.Markdown("""
1034
- ## πŸ”¬ Detailed Scientific Model Analysis
1035
-
1036
- Comprehensive analysis of individual models with statistical confidence intervals,
1037
- cross-track performance, and detailed language pair breakdowns.
1038
- """)
1039
-
1040
- with gr.Row():
1041
- with gr.Column(scale=2):
1042
- model_select = gr.Dropdown(
1043
- label="πŸ€– Select Model",
1044
- choices=[],
1045
- value=None,
1046
- info="Choose a model for detailed scientific analysis"
1047
- )
1048
- with gr.Column(scale=1):
1049
- track_select = gr.Dropdown(
1050
- label="🏁 Analysis Track",
1051
- choices=list(EVALUATION_TRACKS.keys()),
1052
- value="google_comparable",
1053
- info="Track for detailed analysis"
1054
- )
1055
- with gr.Column(scale=1):
1056
- analyze_btn = gr.Button("πŸ” Analyze", variant="primary")
1057
-
1058
- with gr.Row():
1059
- model_details = gr.Markdown()
1060
-
1061
- with gr.Row():
1062
- with gr.Column():
1063
- model_analysis_plot = gr.Plot(label="πŸ“Š Detailed Performance Analysis")
1064
- with gr.Column():
1065
- model_heatmap_plot = gr.Plot(label="πŸ—ΊοΈ Language Pair Heatmap")
1066
-
1067
- # Tab 7: Model Comparison
1068
- with gr.Tab("βš–οΈ Scientific Model Comparison", id="comparison"):
1069
  gr.Markdown("""
1070
- ## πŸ”¬ Scientific Model Comparison
1071
 
1072
- Compare multiple models with statistical significance testing and fair comparison analysis.
1073
- Only models evaluated on the same language pairs are compared for scientific validity.
1074
  """)
1075
 
1076
  with gr.Row():
1077
- with gr.Column(scale=2):
1078
- comparison_models = gr.CheckboxGroup(
1079
- label="πŸ€– Select Models to Compare",
1080
- choices=[],
1081
- value=[],
1082
- info="Select 2-6 models for comparison"
1083
- )
1084
  with gr.Column(scale=1):
1085
- comparison_track = gr.Dropdown(
1086
- label="🏁 Comparison Track",
1087
  choices=list(EVALUATION_TRACKS.keys()),
1088
  value="google_comparable"
1089
  )
1090
- comparison_type = gr.Radio(
1091
- label="πŸ“Š Comparison Type",
1092
- choices=["statistical", "category"],
1093
- value="statistical"
1094
- )
1095
- compare_btn = gr.Button("βš–οΈ Compare Models", variant="primary")
1096
-
1097
- with gr.Row():
1098
- comparison_output = gr.Markdown()
1099
 
1100
- with gr.Row():
1101
- comparison_plot = gr.Plot(label="πŸ“Š Model Comparison Analysis")
1102
 
1103
- # Tab 8: Documentation
1104
- with gr.Tab("πŸ“š Scientific Documentation", id="docs"):
1105
  gr.Markdown(f"""
1106
- # πŸ“– SALT Translation Leaderboard - Scientific Edition Documentation
1107
 
1108
  ## 🎯 Overview
1109
 
1110
- The SALT Translation Leaderboard Scientific Edition implements rigorous evaluation methodology
1111
- for translation models on Ugandan languages, designed for research publication and scientific analysis.
1112
 
1113
- ## πŸ”¬ Scientific Methodology
1114
-
1115
- ### Three-Tier Evaluation System
1116
 
1117
  **1. πŸ€– Google-Comparable Track**
1118
  - **Languages**: {', '.join([LANGUAGE_NAMES[lang] for lang in GOOGLE_SUPPORTED_LANGUAGES])}
1119
- - **Pairs**: {len(get_google_comparable_pairs())} language pairs
1120
  - **Purpose**: Fair comparison with commercial translation systems
1121
- - **Statistical Power**: High (β‰₯200 samples per pair recommended)
1122
 
1123
  **2. 🌍 UG40-Complete Track**
1124
  - **Languages**: All {len(ALL_UG40_LANGUAGES)} UG40 languages
1125
- - **Pairs**: {len(get_all_language_pairs())} language pairs
1126
  - **Purpose**: Comprehensive Ugandan language capability assessment
1127
- - **Statistical Power**: Moderate (β‰₯100 samples per pair recommended)
1128
-
1129
- **3. πŸ“Š Language-Pair Matrix**
1130
- - **Resolution**: Individual language pair analysis
1131
- - **Purpose**: Detailed linguistic analysis and model diagnostics
1132
- - **Statistics**: Pairwise significance testing with multiple comparison correction
1133
-
1134
- ### Statistical Rigor
1135
-
1136
- - **Confidence Intervals**: 95% confidence intervals using bootstrap sampling ({STATISTICAL_CONFIG['bootstrap_samples']:,} resamples)
1137
- - **Significance Testing**: Two-tailed t-tests with {STATISTICAL_CONFIG['multiple_testing_correction'].title()} correction
1138
- - **Effect Size**: Cohen's d with interpretation (small: {STATISTICAL_CONFIG['effect_size_thresholds']['small']}, medium: {STATISTICAL_CONFIG['effect_size_thresholds']['medium']}, large: {STATISTICAL_CONFIG['effect_size_thresholds']['large']})
1139
- - **Statistical Power**: Estimated based on sample sizes and effect sizes
1140
-
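The statistical-rigor bullets above describe percentile bootstrap confidence intervals over per-sample scores. The actual implementation in `src/evaluation.py` is not shown in this diff, so the following is a minimal sketch of that procedure with assumed defaults.

```python
import numpy as np


def bootstrap_ci(scores, n_resamples=1000, confidence=0.95, seed=0):
    """Percentile bootstrap CI for the mean of per-sample scores."""
    rng = np.random.default_rng(seed)
    scores = np.asarray(scores, dtype=float)
    means = np.array([
        rng.choice(scores, size=len(scores), replace=True).mean()
        for _ in range(n_resamples)
    ])
    alpha = (1.0 - confidence) / 2.0
    lower, upper = np.quantile(means, [alpha, 1.0 - alpha])
    return scores.mean(), lower, upper


# Dummy per-sentence quality scores for one language pair.
mean, lo, hi = bootstrap_ci([0.41, 0.38, 0.44, 0.40, 0.39])
print(f"quality = {mean:.3f}, 95% CI [{lo:.3f}, {hi:.3f}]")
```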
1141
- ### Model Categories
1142
-
1143
- Models are automatically categorized for fair comparison:
1144
-
1145
- - **🏒 Commercial**: Production translation systems (Google Translate, Azure, etc.)
1146
- - **πŸ”¬ Research**: Academic and research institution models (NLLB, M2M-100, etc.)
1147
- - **πŸ“Š Baseline**: Simple baseline and reference models
1148
- - **πŸ‘₯ Community**: User-submitted models and fine-tuned variants
1149
 
1150
  ## πŸ“Š Evaluation Metrics
1151
 
1152
  ### Primary Metrics
1153
- - **Quality Score**: Composite metric (0-1) combining BLEU, ChrF, error rates, and ROUGE
1154
  - **BLEU**: Bilingual Evaluation Understudy (0-100)
1155
  - **ChrF**: Character-level F-score (0-1)
1156
 
1157
- ### Secondary Metrics
1158
- - **ROUGE-1/ROUGE-L**: Recall-oriented metrics for content overlap
1159
- - **CER/WER**: Character/Word Error Rate (lower is better)
1160
- - **Length Ratio**: Prediction/reference length ratio
1161
 
1162
- All metrics include 95% confidence intervals.
1163
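The metric names listed above map onto standard scoring libraries. The diff does not show which backend `src/evaluation.py` actually uses, so treat the following as an illustration with `sacrebleu` and `jiwer` rather than the leaderboard's own scoring code; the strings are dummies purely to make the snippet runnable.

```python
import sacrebleu
import jiwer

predictions = ["this is a test", "another short sentence"]
references = ["this is a test", "another brief sentence"]

bleu = sacrebleu.corpus_bleu(predictions, [references]).score   # 0-100
chrf = sacrebleu.corpus_chrf(predictions, [references]).score   # sacrebleu reports ChrF on a 0-100 scale
cer = jiwer.cer(references, predictions)                        # lower is better
wer = jiwer.wer(references, predictions)                        # lower is better
length_ratio = sum(len(p.split()) for p in predictions) / sum(len(r.split()) for r in references)

print(f"BLEU {bleu:.2f} | ChrF {chrf:.2f} | CER {cer:.3f} | WER {wer:.3f} | len ratio {length_ratio:.2f}")
```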
 
1164
  ## πŸ”„ Submission Process
1165
 
1166
- ### Step 1: Download Scientific Test Set
1167
- 1. Click "Download Scientific Test Set" in the first tab
1168
- 2. Review test set adequacy and track breakdown
1169
- 3. Save the enhanced test set with statistical weights
1170
 
1171
  ### Step 2: Generate Predictions
1172
  1. Load the test set in your evaluation pipeline
1173
  2. For each row, translate `source_text` from `source_language` to `target_language`
1174
  3. Save results as CSV with columns: `sample_id`, `prediction`
1175
- 4. Optional: Add `category` column for automatic classification
1176
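Step 2 in code form, as a rough sketch: `translate()` below is a placeholder for whatever model you are evaluating and is not part of this repository.

```python
import pandas as pd


def translate(text: str, src: str, tgt: str) -> str:
    """Placeholder: echo the source; replace with your model call."""
    return text


test_set = pd.read_csv("salt_test_set.csv")  # file from the download tab

rows = []
for row in test_set.itertuples(index=False):
    rows.append({
        "sample_id": row.sample_id,
        "prediction": translate(row.source_text, row.source_language, row.target_language),
    })

pd.DataFrame(rows).to_csv("predictions.csv", index=False)
```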
 
1177
  ### Step 3: Submit & Evaluate
1178
- 1. Fill in detailed model information (improves categorization)
1179
  2. Upload your predictions file
1180
- 3. Review validation report with track-specific adequacy assessment
1181
- 4. Submit for scientific evaluation across all tracks
1182
 
1183
- ## πŸ“‹ Enhanced File Formats
1184
 
1185
- ### Scientific Test Set Format
1186
  ```csv
1187
- sample_id,source_text,source_language,target_language,domain,google_comparable,tracks_included,statistical_weight
1188
- salt_000001,"Hello world",eng,lug,general,true,"google_comparable,ug40_complete",2.5
1189
- salt_000002,"How are you?",eng,ach,conversation,true,"google_comparable,ug40_complete",2.5
1190
- salt_000003,"Good morning",lgg,teo,greetings,false,"ug40_complete,language_pair_matrix",1.0
1191
  ```
1192
 
1193
  ### Predictions Format
1194
  ```csv
1195
- sample_id,prediction,category
1196
- salt_000001,"Amakuru ensi","community"
1197
- salt_000002,"Ibino nining?","community"
1198
- salt_000003,"Ejok nanu","community"
1199
  ```
1200
 
1201
- ## πŸ† Scientific Leaderboard Features
1202
-
1203
- ### Fair Comparison
1204
- - Models only compared within the same category and track
1205
- - Statistical significance testing prevents misleading rankings
1206
- - Confidence intervals show measurement uncertainty
1207
-
1208
- ### Cross-Track Analysis
1209
- - Consistency analysis across evaluation tracks
1210
- - Identification of model strengths and weaknesses
1211
- - Language-specific performance patterns
1212
-
1213
- ### Publication Quality
1214
- - All visualizations include error bars and statistical annotations
1215
- - Comprehensive methodology documentation
1216
- - Reproducible evaluation pipeline
1217
-
1218
- ## πŸ”¬ Statistical Interpretation Guide
1219
-
1220
- ### Confidence Intervals
1221
- - **Non-overlapping CIs**: Likely significant difference
1222
- - **Overlapping CIs**: May or may not be significant (requires formal testing)
1223
- - **Wide CIs**: High uncertainty (need more data)
1224
-
1225
- ### Effect Sizes
1226
- - **Negligible (< {STATISTICAL_CONFIG['effect_size_thresholds']['small']})**: Practical equivalence
1227
- - **Small ({STATISTICAL_CONFIG['effect_size_thresholds']['small']}-{STATISTICAL_CONFIG['effect_size_thresholds']['medium']})**: Noticeable difference
1228
- - **Medium ({STATISTICAL_CONFIG['effect_size_thresholds']['medium']}-{STATISTICAL_CONFIG['effect_size_thresholds']['large']})**: Substantial difference
1229
- - **Large (> {STATISTICAL_CONFIG['effect_size_thresholds']['large']})**: Very large difference
1230
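The effect-size thresholds above refer to Cohen's d; the exact computation in this codebase is not shown in the diff, so here is a minimal sketch using the standard pooled-standard-deviation form, with dummy scores.

```python
import numpy as np


def cohens_d(scores_a, scores_b):
    """Cohen's d with a pooled standard deviation."""
    a, b = np.asarray(scores_a, float), np.asarray(scores_b, float)
    pooled_sd = np.sqrt(
        ((len(a) - 1) * a.var(ddof=1) + (len(b) - 1) * b.var(ddof=1)) / (len(a) + len(b) - 2)
    )
    return (a.mean() - b.mean()) / pooled_sd


# Per-sentence quality scores of two models on the same samples (dummy values).
d = cohens_d([0.42, 0.39, 0.45, 0.41], [0.36, 0.38, 0.40, 0.37])
print(f"Cohen's d = {d:.2f}")  # compare against the thresholds listed above
```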
-
1231
- ### Statistical Adequacy
1232
- - **Excellent**: High statistical power (>0.8) for all comparisons
1233
- - **Good**: Adequate power for most comparisons
1234
- - **Fair**: Limited power, interpret with caution
1235
- - **Insufficient**: Results not reliable for scientific conclusions
1236
-
1237
- ## 🀝 Contributing to Science
1238
 
1239
  This leaderboard is designed for the research community. When using results:
1240
 
1241
- 1. **Always report confidence intervals** along with point estimates
1242
- 2. **Acknowledge statistical adequacy** when interpreting results
1243
- 3. **Use appropriate track** for your comparison (don't compare Google-track vs UG40-track results)
1244
- 4. **Consider effect sizes** not just statistical significance
1245
-
1246
- ## πŸ“„ Citation
1247
-
1248
- If you use this leaderboard in your research, please cite:
1249
-
1250
- ```bibtex
1251
- @misc{{salt_leaderboard_scientific_2024,
1252
- title={{SALT Translation Leaderboard: Scientific Edition - Rigorous Evaluation of Translation Models on Ugandan Languages}},
1253
- author={{Sunbird AI}},
1254
- year={{2024}},
1255
- url={{https://huggingface.co/spaces/Sunbird/salt-translation-leaderboard-scientific}},
1256
- note={{Three-tier evaluation system with statistical significance testing}}
1257
- }}
1258
- ```
1259
-
1260
- ## πŸ”— Related Resources
1261
-
1262
- - **SALT Dataset**: [sunbird/salt](https://huggingface.co/datasets/sunbird/salt)
1263
- - **Sunbird AI Research**: [sunbird.ai/research](https://sunbird.ai/research)
1264
- - **Statistical Methodology**: See our technical paper on rigorous MT evaluation
1265
- - **Open Source Code**: Available on GitHub for reproducibility
1266
 
1267
  ---
1268
 
1269
- *For questions about scientific methodology or statistical interpretation, contact our research team at [email protected]*
1270
  """)
1271
 
1272
- # Event handlers with enhanced scientific functionality
1273
  predictions_validated = gr.State(value=None)
1274
- validation_info_state = gr.State(value=None)
1275
  detected_category_state = gr.State(value="community")
1276
 
1277
  # Download test set
1278
  download_btn.click(
1279
- fn=download_scientific_test_set,
1280
  outputs=[download_file, download_info]
1281
  )
1282
 
1283
  # Validate predictions
1284
- def handle_scientific_validation(file, model_name, author, description):
1285
- report, predictions, category = validate_scientific_submission(file, model_name, author, description)
1286
-
1287
- # Enable button if predictions are available (allows evaluation with limitations)
1288
  can_evaluate = predictions is not None
1289
 
1290
- # Add user-friendly button status message to report
1291
  if can_evaluate:
1292
- if "πŸŽ‰ **Final Verdict**: Ready for scientific evaluation!" in report:
1293
- button_status = "\n\nβœ… **Button Status**: Ready to submit for evaluation!"
1294
- elif "⚠️ **Final Verdict**: Can be evaluated with limitations" in report:
1295
- button_status = "\n\n⚠️ **Button Status**: Can submit for evaluation (results will include limitations note)"
1296
- else:
1297
- button_status = "\n\nβœ… **Button Status**: Evaluation possible"
1298
  else:
1299
- button_status = "\n\n❌ **Button Status**: Please fix issues above before evaluation"
1300
 
1301
  enhanced_report = report + button_status
1302
 
1303
  return (
1304
  enhanced_report,
1305
  predictions,
1306
- {"category": category, "validation_passed": can_evaluate},
1307
  category,
1308
  gr.update(interactive=can_evaluate)
1309
  )
1310
 
1311
  validate_btn.click(
1312
- fn=handle_scientific_validation,
1313
  inputs=[predictions_file, model_name_input, author_input, description_input],
1314
- outputs=[validation_output, predictions_validated, validation_info_state, detected_category_state, submit_btn]
1315
  )
1316
 
1317
  # Submit for evaluation
1318
- def handle_scientific_submission(predictions, model_name, author, description, category, validation_info):
1319
- if predictions is None:
1320
- return "❌ Please validate your submission first", None, None, None
1321
-
1322
- result = evaluate_scientific_submission(
1323
- predictions, model_name, author, description, category, validation_info
1324
- )
1325
-
1326
- # After successful evaluation, update dropdown choices
1327
- global current_leaderboard
1328
- if current_leaderboard is not None and not current_leaderboard.empty:
1329
- model_choices = current_leaderboard['model_name'].unique().tolist()
1330
- else:
1331
- model_choices = []
1332
-
1333
- # Return the evaluation results plus updated dropdown choices
1334
- return result + (
1335
- gr.Dropdown(choices=model_choices, value=model_choices[0] if model_choices else None),
1336
- gr.CheckboxGroup(choices=model_choices, value=[])
1337
- )
1338
-
1339
  submit_btn.click(
1340
- fn=handle_scientific_submission,
1341
- inputs=[predictions_validated, model_name_input, author_input, description_input, detected_category_state, validation_info_state],
1342
- outputs=[evaluation_output, results_table, submission_plot, cross_track_plot, model_select, comparison_models]
1343
  )
1344
 
1345
  # Track leaderboard refresh functions
1346
- def refresh_google_track(*args):
1347
- result = refresh_track_leaderboard("google_comparable", *args)
1348
- # Update dropdowns too
1349
- if current_leaderboard is not None and not current_leaderboard.empty:
1350
- model_choices = current_leaderboard['model_name'].unique().tolist()
1351
- else:
1352
- model_choices = []
1353
- return result + (
1354
- gr.Dropdown(choices=model_choices, value=model_choices[0] if model_choices else None),
1355
- gr.CheckboxGroup(choices=model_choices, value=[])
1356
- )
1357
-
1358
- def refresh_ug40_track(*args):
1359
- return refresh_track_leaderboard("ug40_complete", *args)
1360
-
1361
- def refresh_matrix_track(*args):
1362
- return refresh_track_leaderboard("language_pair_matrix", *args)
1363
-
1364
- # Google-Comparable Track
1365
  google_refresh.click(
1366
- fn=refresh_google_track,
1367
- inputs=[google_search, google_category, google_adequacy],
1368
- outputs=[google_leaderboard, google_ranking_plot, google_comparison_plot, google_stats, model_select, comparison_models]
1369
  )
1370
 
1371
- # UG40-Complete Track
1372
  ug40_refresh.click(
1373
- fn=refresh_ug40_track,
1374
- inputs=[ug40_search, ug40_category, ug40_adequacy],
1375
  outputs=[ug40_leaderboard, ug40_ranking_plot, ug40_comparison_plot, ug40_stats]
1376
  )
1377
 
1378
- # Language-Pair Matrix Track
1379
- matrix_refresh.click(
1380
- fn=refresh_matrix_track,
1381
- inputs=[matrix_search, matrix_category, matrix_adequacy],
1382
- outputs=[matrix_leaderboard, matrix_ranking_plot, matrix_comparison_plot, matrix_stats]
1383
  )
1384
 
1385
- # Model analysis
1386
- def handle_model_analysis(model_name, track):
1387
- if not model_name:
1388
- return "Please select a model for analysis", None, None
1389
-
1390
- print(f"Analyzing model: {model_name} for track: {track}")
1391
-
1392
- global current_leaderboard
1393
- if current_leaderboard is not None:
1394
- print(f"Available models: {current_leaderboard['model_name'].tolist()}")
1395
-
1396
- return get_scientific_model_details(model_name, track)
1397
-
1398
- analyze_btn.click(
1399
- fn=handle_model_analysis,
1400
- inputs=[model_select, track_select],
1401
- outputs=[model_details, model_analysis_plot, model_heatmap_plot]
1402
- )
1403
-
1404
- # Model comparison
1405
- compare_btn.click(
1406
- fn=perform_model_comparison,
1407
- inputs=[comparison_models, comparison_track, comparison_type],
1408
- outputs=[comparison_output, comparison_plot]
1409
- )
1410
-
1411
- # Load initial data and update dropdowns
1412
  def load_initial_data():
1413
- # Load initial Google track data
1414
- google_data = refresh_google_track("", "all", 0.0)
1415
-
1416
- # Update dropdown choices
1417
- if current_leaderboard is not None and not current_leaderboard.empty:
1418
- model_choices = current_leaderboard['model_name'].unique().tolist()
1419
- else:
1420
- model_choices = []
1421
-
1422
- return (
1423
- google_data[0], # google_leaderboard
1424
- google_data[1], # google_ranking_plot
1425
- google_data[2], # google_comparison_plot
1426
- google_data[3], # google_stats
1427
- gr.Dropdown(choices=model_choices, value=model_choices[0] if model_choices else None), # model_select
1428
- gr.CheckboxGroup(choices=model_choices, value=[]) # comparison_models
1429
- )
1430
 
1431
  demo.load(
1432
  fn=load_initial_data,
1433
- outputs=[
1434
- google_leaderboard, google_ranking_plot, google_comparison_plot, google_stats,
1435
- model_select, comparison_models
1436
- ]
1437
  )
1438
 
1439
- # Launch the scientific application
1440
  if __name__ == "__main__":
1441
  demo.launch(
1442
  server_name="0.0.0.0",
 
3
  import sys
4
  import os
5
  from pathlib import Path
6
+ import traceback
7
+ from datetime import datetime
8
+ from typing import Optional, Dict, Tuple, List
9
 
10
  def setup_salt():
11
  """Clone and setup SALT library like in Colab."""
12
  try:
 
13
  import salt.dataset
14
  print("βœ… SALT library already available")
15
  return True
 
19
  print("πŸ“₯ Setting up SALT library...")
20
 
21
  try:
 
22
  salt_dir = Path("salt")
23
  if not salt_dir.exists():
24
  print("πŸ”„ Cloning SALT repository...")
 
28
  else:
29
  print("πŸ“ SALT repository already exists")
30
 
 
31
  salt_requirements = salt_dir / "requirements.txt"
32
  if salt_requirements.exists():
33
  print("πŸ“¦ Installing SALT requirements...")
 
35
  sys.executable, "-m", "pip", "install", "-q", "-r", str(salt_requirements)
36
  ])
37
 
 
38
  salt_path = str(salt_dir.absolute())
39
  if salt_path not in sys.path:
40
  sys.path.insert(0, salt_path)
41
  print(f"πŸ”— Added {salt_path} to Python path")
42
 
 
43
  import salt.dataset
44
  print("βœ… SALT library setup completed successfully")
45
  return True
 
49
  return False
50
 
51
  # Setup SALT on startup
52
+ print("πŸš€ Starting SALT Translation Leaderboard...")
53
  if not setup_salt():
54
  print("❌ Cannot continue without SALT library")
 
55
  sys.exit(1)
56
 
57
  import gradio as gr
58
  import pandas as pd
59
  import json
 
 
 
60
 
61
+ # Import our modules
62
  from src.test_set import (
63
+ get_public_test_set,
64
+ get_complete_test_set,
65
+ create_test_set_download
 
 
 
 
 
 
 
 
66
  )
67
+ from src.validation import validate_submission
68
+ from src.evaluation import evaluate_predictions, generate_evaluation_report
69
  from src.leaderboard import (
70
+ load_leaderboard,
71
+ add_model_to_leaderboard,
 
72
  get_track_leaderboard,
73
+ prepare_leaderboard_display
 
 
74
  )
75
  from src.plotting import (
76
+ create_leaderboard_plot,
77
+ create_language_pair_heatmap,
78
+ create_performance_comparison_plot,
79
+ create_language_pair_comparison_plot
 
 
 
 
 
 
 
 
 
 
80
  )
81
+ from src.utils import sanitize_model_name, get_all_language_pairs
82
  from config import *
83
 
84
  # Global variables for caching
85
  current_leaderboard = None
86
  public_test_set = None
87
  complete_test_set = None
 
88
 
89
+ def initialize_data():
90
+ """Initialize test sets and leaderboard data."""
91
+ global public_test_set, complete_test_set, current_leaderboard
92
 
93
  try:
94
+ print("πŸ“₯ Loading test sets...")
95
+ public_test_set = get_public_test_set()
96
+ complete_test_set = get_complete_test_set()
97
 
98
+ print("πŸ† Loading leaderboard...")
99
+ current_leaderboard = load_leaderboard()
 
 
100
 
101
+ print(f"βœ… Initialization complete!")
 
 
102
  print(f" - Test set: {len(public_test_set):,} samples")
 
 
103
  print(f" - Current models: {len(current_leaderboard)}")
104
 
105
  return True
106
 
107
  except Exception as e:
108
+ print(f"❌ Initialization failed: {e}")
109
  traceback.print_exc()
110
  return False
111
 
112
+ def download_test_set() -> Tuple[str, str]:
113
+ """Create downloadable test set and return file path and info."""
 
114
  try:
115
  global public_test_set
116
  if public_test_set is None:
117
+ public_test_set = get_public_test_set()
 
 
 
118
 
119
+ download_path, stats = create_test_set_download()
 
 
120
 
121
  info_msg = f"""
122
+ ## πŸ“₯ SALT Test Set Downloaded Successfully!
 
 
123
 
124
  ### πŸ“Š Dataset Statistics:
125
  - **Total Samples**: {stats['total_samples']:,}
126
  - **Languages**: {len(stats.get('languages', []))} ({', '.join(stats.get('languages', []))})
127
+ - **Google Comparable**: {stats.get('google_comparable_samples', 0):,} samples
128
+ - **Language Pairs**: {stats.get('language_pairs', 0)}
129
 
130
  ### 🏁 Track Breakdown:
131
  """
132
 
133
  track_breakdown = stats.get('track_breakdown', {})
134
  for track_name, track_info in track_breakdown.items():
 
135
  info_msg += f"""
136
+ **{EVALUATION_TRACKS[track_name]['name']}**:
137
  - Samples: {track_info.get('total_samples', 0):,}
138
  - Language Pairs: {track_info.get('language_pairs', 0)}
 
 
139
  """
140
 
141
  info_msg += f"""
142
 
143
+ ### πŸ“‹ File Format:
144
  - `sample_id`: Unique identifier for each sample
145
  - `source_text`: Text to be translated
146
  - `source_language`: Source language code
147
  - `target_language`: Target language code
148
  - `domain`: Content domain (if available)
149
  - `google_comparable`: Whether this pair can be compared with Google Translate
 
 
150
 
151
+ ### πŸ”¬ Next Steps:
152
  1. **Run your model** on the source texts to generate translations
153
  2. **Create a predictions file** with columns: `sample_id`, `prediction`
154
+ 3. **Submit** your predictions using the submission tab
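 
 For example, assuming the download was saved as `salt_test_set.csv` (adjust the filename to whatever you actually saved), the file can be loaded and inspected with pandas:
 
 ```python
 import pandas as pd
 
 test_set = pd.read_csv("salt_test_set.csv")   # placeholder filename for your download
 print(test_set.head())
 print(test_set.columns.tolist())
 ```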
 
 
155
  """
156
 
157
  return download_path, info_msg
158
 
159
  except Exception as e:
160
+ error_msg = f"❌ Error creating test set download: {str(e)}"
161
  return None, error_msg
162
 
163
+ def validate_submission_file(file, model_name: str, author: str, description: str) -> Tuple[str, Optional[pd.DataFrame], str]:
164
+ """Validate uploaded prediction file."""
 
 
 
165
  try:
166
  if file is None:
167
  return "❌ Please upload a predictions file", None, "community"
 
183
  else:
184
  return "❌ Could not read uploaded file", None, "community"
185
 
186
+ filename = getattr(file, "name", None) or getattr(file, "filename", None) or "predictions.csv"
 
 
 
 
 
187
 
 
188
  global complete_test_set
189
  if complete_test_set is None:
190
+ complete_test_set = get_complete_test_set()
191
 
192
+ validation_result = validate_submission(
 
193
  file_content, filename, complete_test_set, model_name, author, description
194
  )
195
 
196
  detected_category = validation_result.get("category", "community")
197
 
 
198
  if validation_result.get("can_evaluate", False):
199
  return validation_result["report"], validation_result["predictions"], detected_category
200
  else:
201
  return validation_result["report"], None, detected_category
202
 
203
  except Exception as e:
204
+ return f"❌ Validation error: {e}\n\nTraceback:\n{traceback.format_exc()}", None, "community"
 
 
 
 
205
 
206
+ def evaluate_submission(
207
  predictions_df: pd.DataFrame,
208
  model_name: str,
209
  author: str,
210
  description: str,
211
  detected_category: str,
 
212
  ) -> Tuple[str, pd.DataFrame, object, object]:
213
+ """Evaluate validated predictions."""
 
214
  try:
215
  if predictions_df is None:
216
  return "❌ No valid predictions to evaluate", None, None, None
217
 
 
218
  global complete_test_set, current_leaderboard
219
  if complete_test_set is None:
220
+ complete_test_set = get_complete_test_set()
221
 
222
+ print(f"πŸ”¬ Starting evaluation for {model_name}...")
223
+ evaluation_results = evaluate_predictions(predictions_df, complete_test_set, detected_category)
 
 
 
224
 
225
+ if evaluation_results.get('error'):
226
+ return f"❌ Evaluation error: {evaluation_results['error']}", None, None, None
 
227
 
228
+ print("πŸ† Adding to leaderboard...")
229
+ updated_leaderboard = add_model_to_leaderboard(
 
230
  model_name=sanitize_model_name(model_name),
231
  author=author or "Anonymous",
232
  evaluation_results=evaluation_results,
 
234
  description=description or ""
235
  )
236
 
 
237
  current_leaderboard = updated_leaderboard
238
 
239
+ report = generate_evaluation_report(evaluation_results, model_name)
 
240
 
241
  # Create visualizations
242
+ summary_plot = create_performance_comparison_plot(updated_leaderboard, "google_comparable")
 
 
 
243
  google_leaderboard = get_track_leaderboard(updated_leaderboard, "google_comparable")
244
+ display_leaderboard = prepare_leaderboard_display(google_leaderboard, "google_comparable")
245
 
 
246
  success_msg = f"""
247
+ ## πŸŽ‰ Evaluation Complete!
248
 
249
  ### πŸ“Š Model Information:
250
  - **Model**: {model_name}
251
  - **Category**: {MODEL_CATEGORIES.get(detected_category, {}).get('name', detected_category)}
252
  - **Author**: {author or 'Anonymous'}
253
 
 
 
 
 
254
  {report}
255
  """
256
 
257
+ return success_msg, display_leaderboard, summary_plot, None
258
 
259
  except Exception as e:
260
+ error_msg = f"❌ Evaluation failed: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
261
  return error_msg, None, None, None
262
 
263
+ def refresh_track_leaderboard(track: str, search_query: str = "", category_filter: str = "all") -> Tuple[pd.DataFrame, object, object, str]:
 
 
264
  """Refresh leaderboard for a specific track with filters."""
 
265
  try:
266
  global current_leaderboard
267
  if current_leaderboard is None:
268
+ current_leaderboard = load_leaderboard()
269
 
270
+ track_leaderboard = get_track_leaderboard(current_leaderboard, track, category_filter=category_filter)
 
 
271
 
272
  # Apply search filter
273
  if search_query and not track_leaderboard.empty:
274
+ query_lower = search_query.lower()
275
+ mask = (
276
+ track_leaderboard['model_name'].str.lower().str.contains(query_lower, na=False) |
277
+ track_leaderboard['author'].str.lower().str.contains(query_lower, na=False)
278
+ )
279
+ track_leaderboard = track_leaderboard[mask]
 
 
280
 
281
+ display_df = prepare_leaderboard_display(track_leaderboard, track)
282
+ ranking_plot = create_leaderboard_plot(track_leaderboard, track)
283
+ comparison_plot = create_performance_comparison_plot(track_leaderboard, track)
 
 
284
 
285
+ track_config = EVALUATION_TRACKS[track]
286
+ stats_text = f"""
 
 
 
 
287
  ### πŸ“Š {track_config['name']} Statistics
288
 
289
+ - **Total Models**: {len(track_leaderboard)}
290
+ - **Best Model**: {track_leaderboard.iloc[0]['model_name'] if not track_leaderboard.empty else 'None'}
291
+ - **Best Score**: {(track_leaderboard.iloc[0][f'{track}_quality'] if not track_leaderboard.empty else 0.0):.4f}
292
 
293
+ ### πŸ”¬ Track Information:
294
+ {track_config['description']}
295
+ """
 
 
296
 
297
  return display_df, ranking_plot, comparison_plot, stats_text
298
 
299
  except Exception as e:
300
  error_msg = f"Error loading {track} leaderboard: {str(e)}"
301
  print(error_msg)
302
+ return pd.DataFrame(), None, None, error_msg
 
303
 
304
+ def get_language_pair_comparison(track: str) -> Tuple[pd.DataFrame, object]:
305
+ """Get language pair comparison data and visualization."""
 
306
  try:
307
  global current_leaderboard
308
  if current_leaderboard is None:
309
+ return pd.DataFrame(), None
 
 
 
310
 
311
+ track_leaderboard = get_track_leaderboard(current_leaderboard, track)
 
312
 
313
+ if track_leaderboard.empty:
314
+ return pd.DataFrame(), None
 
 
 
315
 
316
+ # Create language pair comparison table
317
+ pairs_data = []
318
+ track_languages = EVALUATION_TRACKS[track]["languages"]
 
 
 
319
 
320
+ for src in track_languages:
321
+ for tgt in track_languages:
322
+ if src == tgt:
323
+ continue
 
 
 
324
 
325
+ pair_key = f"{src}_to_{tgt}"
326
+ pair_display = f"{LANGUAGE_NAMES.get(src, src)} β†’ {LANGUAGE_NAMES.get(tgt, tgt)}"
 
 
 
327
 
328
+ for _, model in track_leaderboard.iterrows():
329
+ # Extract detailed results if available
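        # Per-model pair metrics are assumed to be stored as a JSON string in the 'detailed_<track>'
        # column, keyed as pair_metrics -> '<src>_to_<tgt>' -> metric -> 'mean'.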
330
+ detailed_col = f'detailed_{track}'
331
+ if detailed_col in model and pd.notna(model[detailed_col]):
332
+ try:
333
+ detailed_results = json.loads(model[detailed_col])
334
+ pair_metrics = detailed_results.get('pair_metrics', {})
335
+
336
+ if pair_key in pair_metrics:
337
+ metrics = pair_metrics[pair_key]
338
+ pairs_data.append({
339
+ 'Language Pair': pair_display,
340
+ 'Model': model['model_name'],
341
+ 'Category': model['model_category'],
342
+ 'Quality Score': metrics.get('quality_score', {}).get('mean', 0),
343
+ 'BLEU': metrics.get('bleu', {}).get('mean', 0),
344
+ 'ChrF': metrics.get('chrf', {}).get('mean', 0),
345
+ 'Samples': metrics.get('sample_count', 0)
346
+ })
347
+ except (json.JSONDecodeError, KeyError):
348
+ continue
349
+
350
+ pairs_df = pd.DataFrame(pairs_data)
351
+
352
+ if pairs_df.empty:
353
+ return pd.DataFrame(), None
354
+
355
+ # Create visualization
356
+ comparison_plot = create_language_pair_comparison_plot(pairs_df, track)
357
+
358
+ return pairs_df, comparison_plot
359
 
360
  except Exception as e:
361
+ print(f"Error in language pair comparison: {e}")
362
+ return pd.DataFrame(), None
 
 
 
363
 
364
  # Initialize data on startup
365
+ initialization_success = initialize_data()
 
366
 
367
+ # Create Gradio interface
368
  with gr.Blocks(
369
+ title="πŸ† SALT Translation Leaderboard",
370
  theme=gr.themes.Soft(),
371
  css="""
372
  .gradio-container {
373
  max-width: 1600px !important;
374
  margin: 0 auto;
375
  }
 
 
376
 
377
+ /* Force readable text in all themes */
378
+ .markdown, .gr-markdown, .gr-html {
379
+ color: var(--body-text-color) !important;
380
+ background: var(--background-fill-primary) !important;
 
 
 
381
  }
382
+
383
+ .markdown h1, .markdown h2, .markdown h3,
384
+ .gr-markdown h1, .gr-markdown h2, .gr-markdown h3 {
385
+ color: var(--body-text-color) !important;
386
  }
387
+
388
+ .markdown p, .markdown li, .markdown strong,
389
+ .gr-markdown p, .gr-markdown li, .gr-markdown strong {
390
+ color: var(--body-text-color) !important;
391
  }
392
+
393
+ /* Table styling */
394
+ .dataframe, .gr-dataframe {
395
+ color: var(--body-text-color) !important;
396
+ background: var(--background-fill-primary) !important;
397
  }
398
+
399
+ /* Button and input styling */
400
+ .gr-button, .gr-textbox, .gr-dropdown {
401
+ color: var(--body-text-color) !important;
 
402
  }
403
 
404
+ /* Ensure plot backgrounds work in both themes */
405
+ .plot-container {
406
+ background: var(--background-fill-primary) !important;
 
 
 
 
 
407
  }
408
  """
409
  ) as demo:
410
 
411
+ # Header
412
+ gr.HTML("""
413
+ <div style="text-align: center; margin-bottom: 2rem; padding: 2rem; background: linear-gradient(135deg, #1e3a8a 0%, #3730a3 50%, #1e40af 100%); color: white !important; border-radius: 10px;">
414
+ <h1 style="color: white !important;">πŸ† SALT Translation Leaderboard</h1>
415
+ <p style="color: white !important;"><strong>Rigorous Evaluation of Translation Models on Ugandan Languages</strong></p>
416
+ <p style="color: white !important;">Three-tier evaluation β€’ Statistical confidence intervals β€’ Research-grade analysis</p>
 
417
  </div>
418
  """)
419
 
420
  # Status indicator
421
  if initialization_success:
422
+ status_msg = "βœ… System initialized successfully"
 
 
423
  else:
424
  status_msg = "❌ System initialization failed - some features may not work"
425
 
426
  gr.Markdown(f"**System Status**: {status_msg}")
427
 
 
428
  with gr.Tabs():
429
 
430
  # Tab 1: Download Test Set
431
  with gr.Tab("πŸ“₯ Download Test Set", id="download"):
432
  gr.Markdown("""
433
+ ## πŸ“‹ Get the SALT Test Set
434
 
435
+ Download our test set for translation model evaluation.
436
  """)
437
 
438
+ download_btn = gr.Button("πŸ“₯ Download Test Set", variant="primary", size="lg")
 
439
 
440
  with gr.Row():
441
  with gr.Column():
442
  download_file = gr.File(label="πŸ“‚ Test Set File", interactive=False)
443
  with gr.Column():
444
+ download_info = gr.Markdown()
445
 
446
  # Tab 2: Submit Predictions
447
  with gr.Tab("πŸš€ Submit Predictions", id="submit"):
448
  gr.Markdown("""
449
+ ## 🎯 Submit Your Model's Predictions
450
 
451
+ Upload predictions for evaluation across all tracks.
452
  """)
453
 
454
  with gr.Row():
 
470
  description_input = gr.Textbox(
471
  label="πŸ“„ Model Description",
472
  placeholder="Architecture, training data, special features...",
473
+ lines=4
 
474
  )
475
 
 
476
  predictions_file = gr.File(
477
  label="πŸ“‚ Predictions File",
478
  file_types=[".csv", ".tsv", ".json"]
479
  )
480
 
481
  validate_btn = gr.Button("βœ… Validate Submission", variant="secondary")
482
+ submit_btn = gr.Button("πŸš€ Submit for Evaluation", variant="primary", interactive=False)
483
 
484
  with gr.Column(scale=1):
 
485
  validation_output = gr.Markdown()
486
 
487
+ gr.Markdown("### πŸ† Evaluation Results")
488
+ evaluation_output = gr.Markdown()
 
 
 
489
 
490
  with gr.Row():
491
  with gr.Column():
492
+ submission_plot = gr.Plot(label="πŸ“ˆ Performance Analysis")
493
  with gr.Column():
494
+ results_table = gr.Dataframe(label="πŸ“Š Updated Leaderboard", interactive=False)
 
 
 
495
 
496
  # Tab 3: Google-Comparable Track
497
+ with gr.Tab("πŸ€– Google-Comparable Track", id="google_track"):
498
  gr.Markdown(f"""
499
+ ## {EVALUATION_TRACKS['google_comparable']['name']}
500
 
501
+ **{EVALUATION_TRACKS['google_comparable']['description']}**
502
 
503
+ This track evaluates models on language pairs supported by Google Translate,
504
  enabling direct comparison with commercial baselines.
 
 
 
 
505
  """)
506
 
507
  with gr.Row():
 
513
  choices=["all"] + list(MODEL_CATEGORIES.keys()),
514
  value="all"
515
  )
 
 
 
 
 
516
  with gr.Column(scale=1):
517
  google_refresh = gr.Button("πŸ”„ Refresh", variant="secondary")
518
 
519
+ google_stats = gr.Markdown()
 
520
 
521
  with gr.Row():
522
  with gr.Column():
523
+ google_ranking_plot = gr.Plot(label="πŸ† Rankings")
524
  with gr.Column():
525
+ google_comparison_plot = gr.Plot(label="πŸ“Š Performance Comparison")
526
 
527
+ google_leaderboard = gr.Dataframe(label="πŸ“ˆ Google-Comparable Leaderboard", interactive=False)
 
528
 
529
  # Tab 4: UG40-Complete Track
530
+ with gr.Tab("🌍 UG40-Complete Track", id="ug40_track"):
531
  gr.Markdown(f"""
532
+ ## {EVALUATION_TRACKS['ug40_complete']['name']}
533
 
534
+ **{EVALUATION_TRACKS['ug40_complete']['description']}**
535
 
536
+ This track evaluates models on all UG40 language pairs,
537
+ providing comprehensive assessment of Ugandan language translation capabilities.
 
 
 
 
538
  """)
539
 
540
  with gr.Row():
 
546
  choices=["all"] + list(MODEL_CATEGORIES.keys()),
547
  value="all"
548
  )
 
 
 
 
 
549
  with gr.Column(scale=1):
550
  ug40_refresh = gr.Button("πŸ”„ Refresh", variant="secondary")
551
 
552
+ ug40_stats = gr.Markdown()
 
553
 
554
  with gr.Row():
555
  with gr.Column():
556
+ ug40_ranking_plot = gr.Plot(label="πŸ† Rankings")
557
  with gr.Column():
558
+ ug40_comparison_plot = gr.Plot(label="πŸ“Š Performance Comparison")
559
 
560
+ ug40_leaderboard = gr.Dataframe(label="πŸ“ˆ UG40-Complete Leaderboard", interactive=False)
 
561
 
562
+ # Tab 5: Language Pair Analysis
563
+ with gr.Tab("πŸ“Š Language Pair Analysis", id="pairs_analysis"):
 
 
 
564
  gr.Markdown("""
565
+ ## πŸ“Š Language Pair Performance Analysis
566
 
567
+ Compare model performance across individual language pairs with detailed breakdowns.
 
568
  """)
569
 
570
  with gr.Row():
 
 
 
 
 
 
 
571
  with gr.Column(scale=1):
572
+ pairs_track_select = gr.Dropdown(
573
+ label="🏁 Select Track",
574
  choices=list(EVALUATION_TRACKS.keys()),
575
  value="google_comparable"
576
  )
577
+ with gr.Column(scale=1):
578
+ pairs_refresh = gr.Button("πŸ”„ Analyze Language Pairs", variant="primary")
 
 
 
 
 
 
 
579
 
580
+ pairs_comparison_plot = gr.Plot(label="πŸ“Š Language Pair Comparison")
581
+ pairs_table = gr.Dataframe(label="πŸ“ˆ Language Pair Performance", interactive=False)
582
 
583
+ # Tab 6: Documentation
584
+ with gr.Tab("πŸ“š Documentation", id="docs"):
585
  gr.Markdown(f"""
586
+ # πŸ“– SALT Translation Leaderboard Documentation
587
 
588
  ## 🎯 Overview
589
 
590
+ The SALT Translation Leaderboard provides rigorous evaluation of translation models
591
+ on Ugandan languages using separate evaluation tracks for fair comparison.
592
 
593
+ ## 🏁 Evaluation Tracks
 
 
594
 
595
  **1. πŸ€– Google-Comparable Track**
596
  - **Languages**: {', '.join([LANGUAGE_NAMES[lang] for lang in GOOGLE_SUPPORTED_LANGUAGES])}
 
597
  - **Purpose**: Fair comparison with commercial translation systems
598
+ - **Language Pairs**: {len([1 for src in GOOGLE_SUPPORTED_LANGUAGES for tgt in GOOGLE_SUPPORTED_LANGUAGES if src != tgt])}
599
 
600
  **2. 🌍 UG40-Complete Track**
601
  - **Languages**: All {len(ALL_UG40_LANGUAGES)} UG40 languages
 
602
  - **Purpose**: Comprehensive Ugandan language capability assessment
603
+ - **Language Pairs**: {len([1 for src in ALL_UG40_LANGUAGES for tgt in ALL_UG40_LANGUAGES if src != tgt])}
 
 
604
 
605
  ## πŸ“Š Evaluation Metrics
606
 
607
  ### Primary Metrics
608
+ - **Quality Score**: Composite metric (0-1) combining BLEU, ChrF, and error rates (an illustrative combination is sketched below)
609
  - **BLEU**: Bilingual Evaluation Understudy (0-100)
610
  - **ChrF**: Character-level F-score (0-1)
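 
 For intuition, a composite of this kind can be built by rescaling BLEU to the 0-1 range and blending it with ChrF and an error-rate penalty. The weights below are purely illustrative and not necessarily the exact formula used by this leaderboard's evaluator:
 
 ```python
 # Illustrative only - the evaluator may weight or normalise these differently.
 quality_score = 0.4 * (bleu / 100.0) + 0.4 * chrf + 0.2 * (1.0 - error_rate)
 ```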
611
 
612
+ ### Model Categories
 
 
 
613
 
614
+ Models are automatically categorized for fair comparison:
615
+
616
+ - **🏒 Commercial**: Production translation systems
617
+ - **πŸ”¬ Research**: Academic and research institution models
618
+ - **πŸ“Š Baseline**: Simple baseline and reference models
619
+ - **πŸ‘₯ Community**: User-submitted models
620
 
621
  ## πŸ”„ Submission Process
622
 
623
+ ### Step 1: Download Test Set
624
+ 1. Click "Download Test Set" in the first tab
625
+ 2. Save the test set file
 
626
 
627
  ### Step 2: Generate Predictions
628
  1. Load the test set in your evaluation pipeline
629
  2. For each row, translate `source_text` from `source_language` to `target_language`
630
  3. Save results as CSV with columns: `sample_id`, `prediction`
 
631
 
632
  ### Step 3: Submit & Evaluate
633
+ 1. Fill in model information
634
  2. Upload your predictions file
635
+ 3. Review validation report
636
+ 4. Submit for evaluation
637
 
638
+ ## πŸ“‹ File Formats
639
 
640
+ ### Test Set Format
641
  ```csv
642
+ sample_id,source_text,source_language,target_language,domain,google_comparable
643
+ salt_000001,"Hello world",eng,lug,general,true
644
+ salt_000002,"How are you?",eng,ach,conversation,true
 
645
  ```
646
 
647
  ### Predictions Format
648
  ```csv
649
+ sample_id,prediction
650
+ salt_000001,"Amakuru ensi"
651
+ salt_000002,"Ibino nining?"
 
652
  ```
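 
 A minimal script for producing a predictions file in this format might look like the following; the test set filename and the `translate` call are placeholders for your own files and model:
 
 ```python
 import pandas as pd
 
 test_set = pd.read_csv("salt_test_set.csv")   # placeholder filename for the downloaded test set
 rows = []
 for row in test_set.itertuples():
     # translate() stands in for your model's inference call
     rows.append((row.sample_id, translate(row.source_text, row.source_language, row.target_language)))
 
 pd.DataFrame(rows, columns=["sample_id", "prediction"]).to_csv("predictions.csv", index=False)
 ```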
653
 
654
+ ## 🀝 Contributing
 
 
 
655
 
656
  This leaderboard is designed for the research community. When using results:
657
 
658
+ 1. Consider the appropriate track for your comparison
659
+ 2. Report confidence intervals when available
660
+ 3. Acknowledge the model category in comparisons
 
 
661
 
662
  ---
663
 
664
+ *For questions, contact the team at [email protected]*
665
  """)
666
 
667
+ # Event handlers
668
  predictions_validated = gr.State(value=None)
 
669
  detected_category_state = gr.State(value="community")
670
 
671
  # Download test set
672
  download_btn.click(
673
+ fn=download_test_set,
674
  outputs=[download_file, download_info]
675
  )
676
 
677
  # Validate predictions
678
+ def handle_validation(file, model_name, author, description):
679
+ report, predictions, category = validate_submission_file(file, model_name, author, description)
 
 
680
  can_evaluate = predictions is not None
681
 
 
682
  if can_evaluate:
683
+ button_status = "\n\nβœ… **Ready to submit for evaluation!**"
 
 
 
 
 
684
  else:
685
+ button_status = "\n\n❌ **Please fix issues above before evaluation**"
686
 
687
  enhanced_report = report + button_status
688
 
689
  return (
690
  enhanced_report,
691
  predictions,
 
692
  category,
693
  gr.update(interactive=can_evaluate)
694
  )
695
 
696
  validate_btn.click(
697
+ fn=handle_validation,
698
  inputs=[predictions_file, model_name_input, author_input, description_input],
699
+ outputs=[validation_output, predictions_validated, detected_category_state, submit_btn]
700
  )
701
 
702
  # Submit for evaluation
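    # The trailing hidden gr.Plot simply absorbs the unused fourth value returned by evaluate_submission.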
 
 
703
  submit_btn.click(
704
+ fn=evaluate_submission,
705
+ inputs=[predictions_validated, model_name_input, author_input, description_input, detected_category_state],
706
+ outputs=[evaluation_output, results_table, submission_plot, gr.Plot(visible=False)]
707
  )
708
 
709
  # Track leaderboard refresh functions
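    # The lambda pins the track argument so the same refresh_track_leaderboard function serves both track tabs.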
 
 
710
  google_refresh.click(
711
+ fn=lambda *args: refresh_track_leaderboard("google_comparable", *args),
712
+ inputs=[google_search, google_category],
713
+ outputs=[google_leaderboard, google_ranking_plot, google_comparison_plot, google_stats]
714
  )
715
 
 
716
  ug40_refresh.click(
717
+ fn=lambda *args: refresh_track_leaderboard("ug40_complete", *args),
718
+ inputs=[ug40_search, ug40_category],
719
  outputs=[ug40_leaderboard, ug40_ranking_plot, ug40_comparison_plot, ug40_stats]
720
  )
721
 
722
+ # Language pair analysis
723
+ pairs_refresh.click(
724
+ fn=get_language_pair_comparison,
725
+ inputs=[pairs_track_select],
726
+ outputs=[pairs_table, pairs_comparison_plot]
727
  )
728
 
729
+ # Load initial data
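    # demo.load runs once per page load and pre-populates the Google-comparable tab so the landing view is not empty.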
 
 
730
  def load_initial_data():
731
+ google_data = refresh_track_leaderboard("google_comparable", "", "all")
732
+ return google_data
 
 
733
 
734
  demo.load(
735
  fn=load_initial_data,
736
+ outputs=[google_leaderboard, google_ranking_plot, google_comparison_plot, google_stats]
 
 
 
737
  )
738
 
739
+ # Launch the application
740
  if __name__ == "__main__":
741
  demo.launch(
742
  server_name="0.0.0.0",