BounharAbdelaziz committed on
Commit f6452ab · 1 Parent(s): 1c7ff5f

implemented full tests for upload via csv + cleaned the code a bit

Files changed (1)
  1. utils.py +225 -250
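The commit message above mentions testing "upload via csv"; judging from process_results_file and compute_classification_metrics in the diff below, a submission file presumably only needs a 'dialect' column (ground-truth labels) and a 'preds' column (model predictions). A minimal, hypothetical way to produce such a file (values are illustrative, not from the repo):

import pandas as pd

# Hypothetical submission: 'dialect' holds ground-truth labels, 'preds' the model's predictions.
submission = pd.DataFrame({
    'dialect': ['Morocco', 'MSA', 'Egypt'],
    'preds': ['Morocco', 'MSA', 'Morocco'],
})
submission.to_csv('submission.csv', index=False)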
utils.py CHANGED
@@ -13,104 +13,29 @@ from sklearn.metrics import (
    matthews_corrcoef
)
import numpy as np
- from datasets import load_dataset
-
- # Constants
- MODEL_REPO = "atlasia/Sfaya-Moroccan-Darija-vs-All"
- BIN_FILENAME = "model_multi_v3_2fpr.bin"
- BINARY_LEADERBOARD_FILE = "darija_leaderboard_binary.json"
- MULTILINGUAL_LEADERBOARD_FILE = "darija_leaderboard_multilingual.json"
- DATA_PATH = "atlasia/Arabic-LID-Leaderboard"
-
- target_label = "Morocco"
- is_binary = False
-
- # Load test dataset
- test_dataset = load_dataset(DATA_PATH, split='test')
-
- # Supported dialects
- all_target_languages = list(test_dataset.unique("dialect"))
- supported_dialects = all_target_languages + ['All']
- languages_to_display_one_vs_all = all_target_languages # everything except All
-
- print(f'all_target_languages: {all_target_languages}')
-
- metrics = [
-     'f1_score',
-     'precision',
-     'recall',
-     'specificity',
-     'false_positive_rate',
-     'false_negative_rate',
-     'negative_predictive_value',
-     'n_test_samples',
- ]
-
- default_metrics = [
-     'f1_score',
-     'precision',
-     'recall',
-     'false_positive_rate',
-     'false_negative_rate'
- ]
-
- # default language to display in one-vs-all leaderboard
- default_languages = [
-     #'Morocco',
-     'MSA',
-     #'Egypt',
-     #'Algeria',
-     #'Tunisia',
-     #'Levantine',
- ]
-
- language_mapping_dict = {
-     'ace_Arab': 'Acehnese',
-     'acm_Arab': 'Mesopotamia', # 'Gilit Mesopotamian'
-     'aeb_Arab': 'Tunisia',
-     'ajp_Arab': 'Levantine', # 'South Levantine'
-     'apc_Arab': 'Levantine',
-     'arb_Arab': 'MSA',
-     'arq_Arab': 'Algeria',
-     'ars_Arab': 'Saudi', # Najdi is primarily Saudi Arabian
-     'ary_Arab': 'Morocco',
-     'arz_Arab': 'Egypt',
-     'ayp_Arab': 'Mesopotamia', # 'North Mesopotamian'
-     'azb_Arab': 'Azerbaijan', # South Azerbaijani pertains to this region
-     'bcc_Arab': 'Balochistan', # Southern Balochi is from Balochistan
-     'bjn_Arab': 'Indonesia', # Banjar is spoken in Indonesia
-     'brh_Arab': 'Pakistan', # Brahui is spoken in Pakistan
-     'ckb_Arab': 'Kurdistan', # Central Kurdish is mainly in Iraq
-     'fuv_Arab': 'Nigeria', # Hausa States Fulfulde
-     'glk_Arab': 'Iran', # Gilaki is spoken in Iran
-     'hac_Arab': 'Iran', # Gurani is also primarily spoken in Iran
-     'kas_Arab': 'Kashmir',
-     'knc_Arab': 'Nigeria', # Central Kanuri is in Nigeria
-     'lki_Arab': 'Iran', # Laki is from Iran
-     'lrc_Arab': 'Iran', # Northern Luri is from Iran
-     'min_Arab': 'Indonesia', # Minangkabau is spoken in Indonesia
-     'mzn_Arab': 'Iran', # Mazanderani is spoken in Iran
-     'ota_Arab': 'Turkey', # Ottoman Turkish
-     'pbt_Arab': 'Afghanistan', # Southern Pashto
-     'pnb_Arab': 'Pakistan', # Western Panjabi
-     'sdh_Arab': 'Iraq', # Southern Kurdish
-     'shu_Arab': 'Chad', # Chadian Arabic
-     'skr_Arab': 'Pakistan', # Saraiki
-     'snd_Arab': 'Pakistan', # Sindhi
-     'sus_Arab': 'Guinea', # Susu
-     'tuk_Arab': 'Turkmenistan', # Turkmen
-     'uig_Arab': 'Uighur (China)', # Uighur
-     'urd_Arab': 'Pakistan', # Urdu
-     'uzs_Arab': 'Uzbekistan', # Southern Uzbek
-     'zsm_Arab': 'Malaysia' # Standard Malay
- }

def predict_label(text, model, language_mapping_dict, use_mapping=False):
    # Remove any newline characters and strip whitespace
    text = str(text).strip().replace('\n', ' ')

    if text == '':
-         return 'Other'

    try:
        # Get top prediction
@@ -124,6 +49,7 @@ def predict_label(text, model, language_mapping_dict, use_mapping=False):

        # map label to language using language_mapping_dict
        if use_mapping:
            label = language_mapping_dict.get(label, 'Other')
        return label

@@ -132,7 +58,7 @@ def predict_label(text, model, language_mapping_dict, use_mapping=False):
        print(f"Exception: {e}")
        return {'prediction_label': 'Error', 'prediction_confidence': 0.0}

- def compute_classification_metrics(test_dataset):
    """
    Compute comprehensive classification metrics for each class.

@@ -142,8 +68,10 @@ def compute_classification_metrics(test_dataset):
    Returns:
        pd.DataFrame: DataFrame with detailed metrics for each class.
    """
-     # transform the dataset into a DataFrame
-     data = pd.DataFrame(test_dataset)
    # Extract true labels and predictions
    true_labels = list(data['dialect'])
    predicted_labels = list(data['preds'])
@@ -161,24 +89,29 @@ def compute_classification_metrics(test_dataset):
    precision_scores = precision_score(true_indices, pred_indices, average=None, labels=range(len(labels)))
    recall_scores = recall_score(true_indices, pred_indices, average=None, labels=range(len(labels)))

    # Compute confusion matrix
    conf_mat = confusion_matrix(true_indices, pred_indices, labels=range(len(labels)))

    # Calculate various metrics per class
-     FP = conf_mat.sum(axis=0) - np.diag(conf_mat) # False Positives
-     FN = conf_mat.sum(axis=1) - np.diag(conf_mat) # False Negatives
-     TP = np.diag(conf_mat) # True Positives
-     TN = conf_mat.sum() - (FP + FN + TP) # True Negatives

    # Calculate sample counts per class
    samples_per_class = np.bincount(true_indices, minlength=len(labels))

    # Calculate additional metrics
    with np.errstate(divide='ignore', invalid='ignore'):
-         fp_rate = FP / (FP + TN) # False Positive Rate
-         fn_rate = FN / (FN + TP) # False Negative Rate
-         specificity = TN / (TN + FP) # True Negative Rate
-         npv = TN / (TN + FN) # Negative Predictive Value

    # Replace NaN/inf with 0
    metrics = [fp_rate, fn_rate, specificity, npv]
@@ -194,6 +127,9 @@ def compute_classification_metrics(test_dataset):
        'country': labels,
        'samples': samples_per_class,
        'f1_score': f1_scores,
        'precision': precision_scores,
        'recall': recall_scores,
        'specificity': specificity,
@@ -203,28 +139,21 @@ def compute_classification_metrics(test_dataset):
        'false_positives': FP,
        'true_negatives': TN,
        'false_negatives': FN,
-         'negative_predictive_value': npv
    })

    # Sort by number of samples (descending)
    result_df = result_df.sort_values('samples', ascending=False)

-     # Calculate and add summary metrics
-     summary_metrics = {
-         'macro_f1': f1_score(true_indices, pred_indices, average='macro'),
-         'weighted_f1': f1_score(true_indices, pred_indices, average='weighted'),
-         'micro_f1': f1_score(true_indices, pred_indices, average='micro'),
-         'balanced_accuracy': balanced_acc,
-         'matthews_correlation': mcc
-     }
-
    # Format all numeric columns to 4 decimal places
    numeric_cols = result_df.select_dtypes(include=[np.number]).columns
    result_df[numeric_cols] = result_df[numeric_cols].round(4)

-     print(f'result_df: {result_df}')

-     return result_df, summary_metrics

def make_binary(dialect, target):
    if dialect != target:
@@ -250,47 +179,80 @@ def run_eval_one_vs_all(data_test, TARGET_LANG='Morocco'):
    out = out.reset_index()
    out = out[out['preds']==TARGET_LANG].drop(columns=['preds', 'size'])

-     print(f'out for TARGET_LANG={TARGET_LANG} \n: {out}')

    return out

- def update_darija_one_vs_all_leaderboard(result_df, model_name, target_lang, BINARY_LEADERBOARD_FILE="darija_leaderboard_binary.json"):
    try:
-         with open(BINARY_LEADERBOARD_FILE, "r") as f:
-             data = json.load(f)
-     except FileNotFoundError:
-         data = []
-
-     # Process the results for each dialect/country
-     for _, row in result_df.iterrows():
-         dialect = row['dialect']
-         # Skip 'Other' class, it is considered as the null space
-         if dialect == 'Other':
-             continue

-         # Find existing target_lang entry or create a new one
-         target_entry = next((item for item in data if target_lang in item), None)
-         if target_entry is None:
-             target_entry = {target_lang: {}}
-             data.append(target_entry)
-
-         # Get the country-specific data for this target language
-         country_data = target_entry[target_lang]

-         # Initialize the dialect/country entry if it doesn't exist
-         if dialect not in country_data:
-             country_data[dialect] = {}
-
-         # Update the model metrics under the model name for the given dialect
-         country_data[dialect][model_name] = float(row['false_positive_rate'])

-         # # Add the number of test samples, if not already present
-         # if "n_test_samples" not in country_data[dialect]:
-         #     country_data[dialect]["n_test_samples"] = int(row['size'])
-
-     # Save updated leaderboard data
-     with open(BINARY_LEADERBOARD_FILE, "w") as f:
-         json.dump(data, f, indent=4)

def handle_evaluation(model_path, model_path_bin, use_mapping=False):

@@ -301,10 +263,6 @@ def handle_evaluation(model_path, model_path_bin, use_mapping=False):
    print(f"[INFO] Loading model from Path: {model_path_hub}, using version {model_path_bin}...")
    model = fasttext.load_model(model_path_hub)

-     # Load the evaluation dataset
-     print(f"[INFO] Loading evaluation dataset from Path: {DATA_PATH}...")
-     eval_dataset = load_dataset(DATA_PATH, split='test')
-
    # Transform to pandas DataFrame
    print(f"[INFO] Converting evaluation dataset to Pandas DataFrame...")
    df_eval = pd.DataFrame(eval_dataset)
@@ -314,20 +272,21 @@ def handle_evaluation(model_path, model_path_bin, use_mapping=False):
    df_eval['preds'] = df_eval['text'].apply(lambda text: predict_label(text, model, language_mapping_dict, use_mapping=use_mapping))

    # run the evaluation
-     result_df, _ = run_eval(df_eval)
    # set the model name
    model_name = model_path + '/' + model_path_bin

    # update the multilingual leaderboard
-     update_darija_multilingual_leaderboard(result_df, model_name, MULTILINGUAL_LEADERBOARD_FILE)

    for target_lang in all_target_languages:
        result_df_one_vs_all = run_eval_one_vs_all(df_eval, TARGET_LANG=target_lang)
-         update_darija_one_vs_all_leaderboard(result_df_one_vs_all, model_name, target_lang, BINARY_LEADERBOARD_FILE)

    # load the updated leaderboard tables
-     df_multilingual = load_leaderboard_multilingual()
-     df_one_vs_all = load_leaderboard_one_vs_all()

    status_message = "**Evaluation now ended! 🤗**"

@@ -347,7 +306,7 @@ def run_eval(df_eval):
        pd.DataFrame: A DataFrame containing evaluation metrics.
    """

-     # map to binary
    df_eval_multilingual = df_eval.copy()

    # now drop the columns that are not needed, i.e. 'text'
@@ -355,11 +314,11 @@ def run_eval(df_eval):

    # Compute evaluation metrics
    print(f"[INFO] Computing metrics...")
-     result_df, _ = compute_classification_metrics(df_eval_multilingual)

-     # update_darija_multilingual_leaderboard(result_df, model_path, MULTILINGUAL_LEADERBOARD_FILE)

-     return result_df, df_eval_multilingual

def process_results_file(file, uploaded_model_name, base_path_save="./atlasia/submissions/", default_language='Morocco'):
    try:
@@ -368,7 +327,7 @@ def process_results_file(file, uploaded_model_name, base_path_save="./atlasia/su

        # Clean the model name to be safe for file paths
        uploaded_model_name = uploaded_model_name.strip().replace(" ", "_")
-         print(f"[INFO] uploaded_model_name: {uploaded_model_name}")

        # Create the directory for saving submissions
        path_saving = os.path.join(base_path_save, uploaded_model_name)
@@ -378,7 +337,7 @@ def process_results_file(file, uploaded_model_name, base_path_save="./atlasia/su
        saved_file_path = os.path.join(path_saving, 'submission.csv')

        # Read the uploaded file as DataFrame
-         print(f"[INFO] Loading results...")
        df_eval = pd.read_csv(file.name)

        # Save the DataFrame
@@ -390,72 +349,116 @@ def process_results_file(file, uploaded_model_name, base_path_save="./atlasia/su

        # Compute evaluation metrics
        print(f"[INFO] Computing metrics...")
-         result_df, _ = compute_classification_metrics(df_eval)

        # Update the leaderboards
-         update_darija_multilingual_leaderboard(result_df, uploaded_model_name, MULTILINGUAL_LEADERBOARD_FILE)

        # TODO: implement this one_vs_all differently for people only submitting csv file. They need to submit two files, one for multi-lang and the other for one-vs-all
        # result_df_one_vs_all = run_eval_one_vs_all(...)
        # update_darija_one_vs_all_leaderboard(...)

-         # update the leaderboard table
-         df = load_leaderboard_multilingual()

-         return create_leaderboard_display_multilingual(df, default_language, default_metrics)

- def update_darija_multilingual_leaderboard(result_df, model_name, MULTILINGUAL_LEADERBOARD_FILE="darija_leaderboard_multilingual.json"):

-     # Load leaderboard data
-     current_dir = os.path.dirname(os.path.abspath(__file__))
-     MULTILINGUAL_LEADERBOARD_FILE = os.path.join(current_dir, MULTILINGUAL_LEADERBOARD_FILE)

    try:
-         with open(MULTILINGUAL_LEADERBOARD_FILE, "r") as f:
-             data = json.load(f)
-     except FileNotFoundError:
-         data = []

-     # Process the results for each dialect/country
-     for _, row in result_df.iterrows():
-         country = row['country']
-         # skip 'Other' class, it is considered as the null space
-         if country == 'Other':
-             continue

-         # Create metrics dictionary directly
-         metrics = {
-             'f1_score': float(row['f1_score']),
-             'precision': float(row['precision']),
-             'recall': float(row['recall']),
-             'specificity': float(row['specificity']),
-             'false_positive_rate': float(row['false_positive_rate']),
-             'false_negative_rate': float(row['false_negative_rate']),
-             'negative_predictive_value': float(row['negative_predictive_value']),
-             'n_test_samples': int(row['samples'])
-         }

-         # Find existing country entry or create new one
-         country_entry = next((item for item in data if country in item), None)
-         if country_entry is None:
-             country_entry = {country: {}}
-             data.append(country_entry)

-         # Update the model metrics directly under the model name
-         if country not in country_entry:
-             country_entry[country] = {}
-         country_entry[country][model_name] = metrics
-
-     # Save updated leaderboard data
-     with open(MULTILINGUAL_LEADERBOARD_FILE, "w") as f:
-         json.dump(data, f, indent=4)


- def load_leaderboard_one_vs_all(BINARY_LEADERBOARD_FILE="darija_leaderboard_binary.json"):
    current_dir = os.path.dirname(os.path.abspath(__file__))
-     BINARY_LEADERBOARD_FILE = os.path.join(current_dir, BINARY_LEADERBOARD_FILE)

-     with open(BINARY_LEADERBOARD_FILE, "r") as f:
        data = json.load(f)

    # Initialize lists to store the flattened data
@@ -482,16 +485,14 @@ def load_leaderboard_one_vs_all(BINARY_LEADERBOARD_FILE="darija_leaderboard_bina

    # Pivot the DataFrame to create the desired structure: all languages in columns and models in rows, and each (model, target_language, language) = false_positive_rate
    df_pivot = df.pivot(index=['model', 'target_language'], columns='language', values='false_positive_rate').reset_index()
-
-     # print(f'df_pivot \n: {df_pivot}')
-
    return df_pivot

- def load_leaderboard_multilingual(MULTILINGUAL_LEADERBOARD_FILE="darija_leaderboard_multilingual.json"):
    current_dir = os.path.dirname(os.path.abspath(__file__))
-     MULTILINGUAL_LEADERBOARD_FILE = os.path.join(current_dir, MULTILINGUAL_LEADERBOARD_FILE)

-     with open(MULTILINGUAL_LEADERBOARD_FILE, "r") as f:
        data = json.load(f)

    # Initialize lists to store the flattened data
@@ -568,25 +569,28 @@ def create_leaderboard_display_multilingual(df, selected_country, selected_metri
def update_leaderboard_multilingual(country, selected_metrics):
    if not selected_metrics: # If no metrics selected, show all
        selected_metrics = metrics
-     df = load_leaderboard_multilingual()
    display_df = create_leaderboard_display_multilingual(df, country, selected_metrics)
    return display_df

def update_leaderboard_one_vs_all(target_language, selected_languages):
    if not selected_languages: # If no language selected, show all defaults
        selected_languages = default_languages
-     df = load_leaderboard_one_vs_all()
    display_df, selected_languages = create_leaderboard_display_one_vs_all(df, target_language, selected_languages)
    # to improve visibility in case the user chooses multiple language leading to many columns, the `model` column must remain fixed
-     # display_df = render_fixed_columns(display_df)
    return display_df, selected_languages

def encode_image_to_base64(image_path):
    with open(image_path, "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read()).decode()
    return encoded_string

def create_html_image(image_path):
    # Get base64 string of image
    img_base64 = encode_image_to_base64(image_path)

@@ -602,35 +606,6 @@ def create_html_image(image_path):
    """
    return html_string

- # Function to render HTML table with fixed 'model' column
def render_fixed_columns(df):
-     style = """
-     <style>
-         .table-container {
-             overflow-x: auto;
-             position: relative;
-             white-space: nowrap;
-         }
-         table {
-             border-collapse: collapse;
-             width: 100%;
-         }
-         th, td {
-             border: 1px solid black;
-             padding: 8px;
-             text-align: left;
-         }
-         th.fixed, td.fixed {
-             position: sticky;
-             left: 0;
-             background-color: white;
-             z-index: 2;
-         }
-     </style>
-     """
-     table_html = df.to_html(index=False).replace(
-         "<th>model</th>", '<th class="fixed">model</th>'
-     ).replace(
-         '<td>', '<td class="fixed">', 1
-     )
-     return f"{style}<div class='table-container'>{table_html}</div>"
    matthews_corrcoef
)
import numpy as np
+ from huggingface_hub import HfApi
+ from pathlib import Path
+ from constants import *

def predict_label(text, model, language_mapping_dict, use_mapping=False):
+     """
+     Runs predictions for a fasttext model.
+
+     Args:
+         text (str): The input text to classify.
+         model (fasttext.FastText._FastText): The fasttext model to use for prediction.
+         language_mapping_dict (dict): A dictionary mapping fasttext labels to human-readable language names.
+         use_mapping (bool): Whether to use the language mapping dictionary.
+
+     Returns:
+         str: The predicted label for the input text.
+     """
    # Remove any newline characters and strip whitespace
    text = str(text).strip().replace('\n', ' ')

    if text == '':
+         # if empty text, return EMPTY
+         return 'EMPTY'

    try:
        # Get top prediction

        # map label to language using language_mapping_dict
        if use_mapping:
+             # if label not found in mapping dict, set it to other as we are not taking them into account
            label = language_mapping_dict.get(label, 'Other')
        return label

        print(f"Exception: {e}")
        return {'prediction_label': 'Error', 'prediction_confidence': 0.0}

+ def compute_classification_metrics(eval_dataset):
    """
    Compute comprehensive classification metrics for each class.

    Returns:
        pd.DataFrame: DataFrame with detailed metrics for each class.
    """
+
+     # transform the dataset object into a pandas DataFrame object
+     data = pd.DataFrame(eval_dataset)
+
    # Extract true labels and predictions
    true_labels = list(data['dialect'])
    predicted_labels = list(data['preds'])

    precision_scores = precision_score(true_indices, pred_indices, average=None, labels=range(len(labels)))
    recall_scores = recall_score(true_indices, pred_indices, average=None, labels=range(len(labels)))

+     # Compute macro, weighted and micro f1 score
+     macro_f1_score = f1_score(true_indices, pred_indices, average='macro')
+     weighted_f1_score = f1_score(true_indices, pred_indices, average='weighted')
+     micro_f1_score = f1_score(true_indices, pred_indices, average='micro')
+
    # Compute confusion matrix
    conf_mat = confusion_matrix(true_indices, pred_indices, labels=range(len(labels)))

    # Calculate various metrics per class
+     FP = conf_mat.sum(axis=0) - np.diag(conf_mat) # False Positives
+     FN = conf_mat.sum(axis=1) - np.diag(conf_mat) # False Negatives
+     TP = np.diag(conf_mat) # True Positives
+     TN = conf_mat.sum() - (FP + FN + TP) # True Negatives

    # Calculate sample counts per class
    samples_per_class = np.bincount(true_indices, minlength=len(labels))

    # Calculate additional metrics
    with np.errstate(divide='ignore', invalid='ignore'):
+         fp_rate = FP / (FP + TN) # False Positive Rate
+         fn_rate = FN / (FN + TP) # False Negative Rate
+         specificity = TN / (TN + FP) # True Negative Rate
+         npv = TN / (TN + FN) # Negative Predictive Value

    # Replace NaN/inf with 0
    metrics = [fp_rate, fn_rate, specificity, npv]

        'country': labels,
        'samples': samples_per_class,
        'f1_score': f1_scores,
+         'macro_f1_score': macro_f1_score,
+         'weighted_f1_score': weighted_f1_score,
+         'micro_f1_score': micro_f1_score,
        'precision': precision_scores,
        'recall': recall_scores,
        'specificity': specificity,

        'false_positives': FP,
        'true_negatives': TN,
        'false_negatives': FN,
+         'negative_predictive_value': npv,
+         'balanced_accuracy': balanced_acc,
+         'matthews_correlation': mcc,
    })

    # Sort by number of samples (descending)
    result_df = result_df.sort_values('samples', ascending=False)

    # Format all numeric columns to 4 decimal places
    numeric_cols = result_df.select_dtypes(include=[np.number]).columns
    result_df[numeric_cols] = result_df[numeric_cols].round(4)

+     print(f'[INFO] result_df \n: {result_df}')

+     return result_df

def make_binary(dialect, target):
    if dialect != target:

    out = out.reset_index()
    out = out[out['preds']==TARGET_LANG].drop(columns=['preds', 'size'])

+     print(f'[INFO] out for TARGET_LANG={TARGET_LANG} \n: {out}')

    return out

+ def update_darija_one_vs_all_leaderboard(result_df, model_name, target_lang, DIALECT_CONFUSION_LEADERBOARD_FILE):
+
+     # Initialize Hugging Face API
+     api = HfApi()
+
+     # Get the repository ID from environment variables
+     repo_id = os.environ.get("SPACE_ID")
+     if not repo_id:
+         raise ValueError("This code must be run in a Hugging Face Space")
+
+     # Create a temporary directory for file operations
+     temp_dir = Path("/tmp")
+     temp_file = temp_dir / DIALECT_CONFUSION_LEADERBOARD_FILE
+
    try:
+         # Try to download existing file from the Space
+         try:
+             api.hf_hub_download(
+                 repo_id=repo_id,
+                 filename=DIALECT_CONFUSION_LEADERBOARD_FILE,
+                 repo_type="space",
+                 local_dir=temp_dir
+             )
+         except Exception:
+             # If file doesn't exist, start with empty data
+             data = []
+         else:
+             # If file exists, read it
+             with open(temp_file, "r") as f:
+                 data = json.load(f)
+
+         # Process the results for each dialect/country
+         for _, row in result_df.iterrows():
+             dialect = row['dialect']
+             # Skip 'Other' class
+             if dialect == 'Other':
+                 continue
+
+             # Find existing target_lang entry or create a new one
+             target_entry = next((item for item in data if target_lang in item), None)
+             if target_entry is None:
+                 target_entry = {target_lang: {}}
+                 data.append(target_entry)
+
+             # Get the country-specific data for this target language
+             country_data = target_entry[target_lang]
+
+             # Initialize the dialect/country entry if it doesn't exist
+             if dialect not in country_data:
+                 country_data[dialect] = {}
+
+             # Update the model metrics under the model name for the given dialect
+             country_data[dialect][model_name] = float(row['false_positive_rate'])

+         # Save updated data to temporary file
+         with open(temp_file, "w") as f:
+             json.dump(data, f, indent=4)

+         # Upload the file back to the Space
+         api.upload_file(
+             path_or_fileobj=str(temp_file),
+             path_in_repo=DIALECT_CONFUSION_LEADERBOARD_FILE,
+             repo_id=repo_id,
+             repo_type="space"
+         )

+     finally:
+         # Clean up temporary file
+         if temp_file.exists():
+             temp_file.unlink()

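As an aside (not part of the commit, values and model name invented): the dialect-confusion JSON written above is a list of single-key dicts mapping a target language to, per dialect, each model's false positive rate, roughly like this:

example_one_vs_all = [
    {
        "Morocco": {
            # dialect -> {model name -> false_positive_rate}
            "MSA": {"some_org/some_model/model.bin": 0.01},
            "Algeria": {"some_org/some_model/model.bin": 0.04},
        }
    },
]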
def handle_evaluation(model_path, model_path_bin, use_mapping=False):

    print(f"[INFO] Loading model from Path: {model_path_hub}, using version {model_path_bin}...")
    model = fasttext.load_model(model_path_hub)

    # Transform to pandas DataFrame
    print(f"[INFO] Converting evaluation dataset to Pandas DataFrame...")
    df_eval = pd.DataFrame(eval_dataset)

    df_eval['preds'] = df_eval['text'].apply(lambda text: predict_label(text, model, language_mapping_dict, use_mapping=use_mapping))

    # run the evaluation
+     result_df = run_eval(df_eval)
+
    # set the model name
    model_name = model_path + '/' + model_path_bin

    # update the multilingual leaderboard
+     update_darija_multilingual_leaderboard(result_df, model_name, MULTI_DIALECTS_LEADERBOARD_FILE)

    for target_lang in all_target_languages:
        result_df_one_vs_all = run_eval_one_vs_all(df_eval, TARGET_LANG=target_lang)
+         update_darija_one_vs_all_leaderboard(result_df_one_vs_all, model_name, target_lang, DIALECT_CONFUSION_LEADERBOARD_FILE)

    # load the updated leaderboard tables
+     df_multilingual = load_leaderboard_multilingual(MULTI_DIALECTS_LEADERBOARD_FILE)
+     df_one_vs_all = load_leaderboard_one_vs_all(DIALECT_CONFUSION_LEADERBOARD_FILE)

    status_message = "**Evaluation now ended! 🤗**"

        pd.DataFrame: A DataFrame containing evaluation metrics.
    """

+     # make a copy as the original one is used later
    df_eval_multilingual = df_eval.copy()

    # now drop the columns that are not needed, i.e. 'text'

    # Compute evaluation metrics
    print(f"[INFO] Computing metrics...")
+     result_df = compute_classification_metrics(df_eval_multilingual)

+     # update_darija_multilingual_leaderboard(result_df, model_path, MULTI_DIALECTS_LEADERBOARD_FILE)

+     return result_df

def process_results_file(file, uploaded_model_name, base_path_save="./atlasia/submissions/", default_language='Morocco'):
    try:

        # Clean the model name to be safe for file paths
        uploaded_model_name = uploaded_model_name.strip().replace(" ", "_")
+         print(f"[INFO] Uploaded model name: {uploaded_model_name}")

        # Create the directory for saving submissions
        path_saving = os.path.join(base_path_save, uploaded_model_name)

        saved_file_path = os.path.join(path_saving, 'submission.csv')

        # Read the uploaded file as DataFrame
+         print(f"[INFO] Loading csv results file...")
        df_eval = pd.read_csv(file.name)

        # Save the DataFrame

        # Compute evaluation metrics
        print(f"[INFO] Computing metrics...")
+         result_df = compute_classification_metrics(df_eval)

        # Update the leaderboards
+         update_darija_multilingual_leaderboard(result_df, uploaded_model_name, MULTI_DIALECTS_LEADERBOARD_FILE)

        # TODO: implement this one_vs_all differently for people only submitting csv file. They need to submit two files, one for multi-lang and the other for one-vs-all
        # result_df_one_vs_all = run_eval_one_vs_all(...)
        # update_darija_one_vs_all_leaderboard(...)

+         for target_lang in all_target_languages:
+             result_df_one_vs_all = run_eval_one_vs_all(df_eval, TARGET_LANG=target_lang)
+             update_darija_one_vs_all_leaderboard(result_df_one_vs_all, uploaded_model_name, target_lang, DIALECT_CONFUSION_LEADERBOARD_FILE)

+         # load the updated leaderboard tables
+         df_multilingual = load_leaderboard_multilingual(MULTI_DIALECTS_LEADERBOARD_FILE)
+         df_one_vs_all = load_leaderboard_one_vs_all(DIALECT_CONFUSION_LEADERBOARD_FILE)

+         status_message = "**Evaluation now ended! 🤗**"

+         return create_leaderboard_display_multilingual(df_multilingual, target_label, default_metrics), status_message
+
+ def update_darija_multilingual_leaderboard(result_df, model_name, MULTI_DIALECTS_LEADERBOARD_FILE="darija_leaderboard_multilingual.json"):
+     # Initialize Hugging Face API
+     api = HfApi()
+
+     # Get the repository ID from environment variables
+     # HF Spaces sets this automatically
+     repo_id = os.environ.get("SPACE_ID")
+     if not repo_id:
+         raise ValueError("This code must be run in a Hugging Face Space")
+
+     # Create a temporary directory for file operations
+     temp_dir = Path("/tmp")
+     temp_file = temp_dir / MULTI_DIALECTS_LEADERBOARD_FILE

    try:
+         # Try to download existing file from the Space
+         try:
+             api.hf_hub_download(
+                 repo_id=repo_id,
+                 filename=MULTI_DIALECTS_LEADERBOARD_FILE,
+                 repo_type="space",
+                 local_dir=temp_dir
+             )
+         except Exception:
+             # If file doesn't exist, start with empty data
+             data = []
+         else:
+             # If file exists, read it
+             with open(temp_file, "r") as f:
+                 data = json.load(f)

+         # Process the results for each dialect/country
+         for _, row in result_df.iterrows():
+             country = row['country']
+             # skip 'Other' class
+             if country == 'Other':
+                 continue
+
+             # Create metrics dictionary
+             metrics = {
+                 'f1_score': float(row['f1_score']),
+                 'precision': float(row['precision']),
+                 'recall': float(row['recall']),
+                 'macro_f1_score': float(row['macro_f1_score']),
+                 'micro_f1_score': float(row['micro_f1_score']),
+                 'weighted_f1_score': float(row['weighted_f1_score']),
+                 'specificity': float(row['specificity']),
+                 'false_positive_rate': float(row['false_positive_rate']),
+                 'false_negative_rate': float(row['false_negative_rate']),
+                 'negative_predictive_value': float(row['negative_predictive_value']),
+                 'balanced_accuracy': float(row['balanced_accuracy']),
+                 'matthews_correlation': float(row['matthews_correlation']),
+                 'n_test_samples': int(row['samples'])
+             }

+             # Find existing country entry or create new one
+             country_entry = next((item for item in data if country in item), None)
+             if country_entry is None:
+                 country_entry = {country: {}}
+                 data.append(country_entry)
+
+             # Update the model metrics
+             if country not in country_entry:
+                 country_entry[country] = {}
+             country_entry[country][model_name] = metrics
+
+         # Save updated data to temporary file
+         with open(temp_file, "w") as f:
+             json.dump(data, f, indent=4)

+         # Upload the file back to the Space
+         api.upload_file(
+             path_or_fileobj=str(temp_file),
+             path_in_repo=MULTI_DIALECTS_LEADERBOARD_FILE,
+             repo_id=repo_id,
+             repo_type="space"
+         )

+     finally:
+         # Clean up temporary file
+         if temp_file.exists():
+             temp_file.unlink()

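Likewise illustrative only (model name and numbers invented): each entry of darija_leaderboard_multilingual.json as built by the loop above maps a country to per-model metric dicts, with keys mirroring the metrics dictionary in the code:

example_multilingual = [
    {
        "Morocco": {
            "some_org/some_model/model.bin": {
                "f1_score": 0.91,
                "precision": 0.90,
                "recall": 0.92,
                "macro_f1_score": 0.85,
                "micro_f1_score": 0.88,
                "weighted_f1_score": 0.89,
                "specificity": 0.97,
                "false_positive_rate": 0.03,
                "false_negative_rate": 0.08,
                "negative_predictive_value": 0.96,
                "balanced_accuracy": 0.90,
                "matthews_correlation": 0.82,
                "n_test_samples": 1000
            }
        }
    },
]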
+ def load_leaderboard_one_vs_all(DIALECT_CONFUSION_LEADERBOARD_FILE):
    current_dir = os.path.dirname(os.path.abspath(__file__))
+     DIALECT_CONFUSION_LEADERBOARD_FILE = os.path.join(current_dir, DIALECT_CONFUSION_LEADERBOARD_FILE)

+     with open(DIALECT_CONFUSION_LEADERBOARD_FILE, "r") as f:
        data = json.load(f)

    # Initialize lists to store the flattened data

    # Pivot the DataFrame to create the desired structure: all languages in columns and models in rows, and each (model, target_language, language) = false_positive_rate
    df_pivot = df.pivot(index=['model', 'target_language'], columns='language', values='false_positive_rate').reset_index()
+
    return df_pivot

+ def load_leaderboard_multilingual(MULTI_DIALECTS_LEADERBOARD_FILE):
    current_dir = os.path.dirname(os.path.abspath(__file__))
+     MULTI_DIALECTS_LEADERBOARD_FILE = os.path.join(current_dir, MULTI_DIALECTS_LEADERBOARD_FILE)

+     with open(MULTI_DIALECTS_LEADERBOARD_FILE, "r") as f:
        data = json.load(f)

    # Initialize lists to store the flattened data

def update_leaderboard_multilingual(country, selected_metrics):
    if not selected_metrics: # If no metrics selected, show all
        selected_metrics = metrics
+     df = load_leaderboard_multilingual(MULTI_DIALECTS_LEADERBOARD_FILE)
    display_df = create_leaderboard_display_multilingual(df, country, selected_metrics)
    return display_df

def update_leaderboard_one_vs_all(target_language, selected_languages):
    if not selected_languages: # If no language selected, show all defaults
        selected_languages = default_languages
+     df = load_leaderboard_one_vs_all(DIALECT_CONFUSION_LEADERBOARD_FILE)
    display_df, selected_languages = create_leaderboard_display_one_vs_all(df, target_language, selected_languages)
+
    # to improve visibility in case the user chooses multiple language leading to many columns, the `model` column must remain fixed
+     # display_df = render_fixed_columns(display_df) # needs to be implemented
    return display_df, selected_languages

def encode_image_to_base64(image_path):
+     """ encodes the image to base64"""
    with open(image_path, "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read()).decode()
    return encoded_string

def create_html_image(image_path):
+     """ Creates the html of the logo image from the image path input """
    # Get base64 string of image
    img_base64 = encode_image_to_base64(image_path)

    """
    return html_string

def render_fixed_columns(df):
+     """ A function to render HTML table with fixed 'model' column for better visibility """
+     return NotImplementedError
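render_fixed_columns is left as a stub in this commit. One possible sketch of a later implementation, loosely adapted from the version removed earlier in this diff (a sticky first column via CSS around pandas.DataFrame.to_html); the helper name below is hypothetical and not part of the commit:

def render_fixed_columns_sketch(df):
    """Possible re-implementation: HTML table whose 'model' column stays fixed while scrolling."""
    style = """
    <style>
    .table-container { overflow-x: auto; white-space: nowrap; }
    th.fixed, td.fixed { position: sticky; left: 0; background-color: white; z-index: 2; }
    </style>
    """
    # Mark the 'model' header cell so the CSS above pins it in place.
    table_html = df.to_html(index=False).replace(
        "<th>model</th>", '<th class="fixed">model</th>'
    )
    return f"{style}<div class='table-container'>{table_html}</div>"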