BounharAbdelaziz committed on
Commit f6452ab · 1 Parent(s): 1c7ff5f

implemented full tests for upload via csv + cleaned the code a bit

Files changed (1)
  1. utils.py +225 -250
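The commit message above mentions testing "upload via csv"; judging from process_results_file and compute_classification_metrics in the diff below, a submission file presumably only needs a 'dialect' column (ground-truth labels) and a 'preds' column (model predictions). A minimal, hypothetical way to produce such a file (values are illustrative, not from the repo):

import pandas as pd

# Hypothetical submission: 'dialect' holds ground-truth labels, 'preds' the model's predictions.
submission = pd.DataFrame({
    'dialect': ['Morocco', 'MSA', 'Egypt'],
    'preds': ['Morocco', 'MSA', 'Morocco'],
})
submission.to_csv('submission.csv', index=False)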
utils.py CHANGED
@@ -13,104 +13,29 @@ from sklearn.metrics import (
    matthews_corrcoef
)
import numpy as np
- from datasets import load_dataset
-
- # Constants
- MODEL_REPO = "atlasia/Sfaya-Moroccan-Darija-vs-All"
- BIN_FILENAME = "model_multi_v3_2fpr.bin"
- BINARY_LEADERBOARD_FILE = "darija_leaderboard_binary.json"
- MULTILINGUAL_LEADERBOARD_FILE = "darija_leaderboard_multilingual.json"
- DATA_PATH = "atlasia/Arabic-LID-Leaderboard"
-
- target_label = "Morocco"
- is_binary = False
-
- # Load test dataset
- test_dataset = load_dataset(DATA_PATH, split='test')
-
- # Supported dialects
- all_target_languages = list(test_dataset.unique("dialect"))
- supported_dialects = all_target_languages + ['All']
- languages_to_display_one_vs_all = all_target_languages # everything except All
-
- print(f'all_target_languages: {all_target_languages}')
-
- metrics = [
-     'f1_score',
-     'precision',
-     'recall',
-     'specificity',
-     'false_positive_rate',
-     'false_negative_rate',
-     'negative_predictive_value',
-     'n_test_samples',
- ]
-
- default_metrics = [
-     'f1_score',
-     'precision',
-     'recall',
-     'false_positive_rate',
-     'false_negative_rate'
- ]
-
- # default language to display in one-vs-all leaderboard
- default_languages = [
-     #'Morocco',
-     'MSA',
-     #'Egypt',
-     #'Algeria',
-     #'Tunisia',
-     #'Levantine',
- ]
-
- language_mapping_dict = {
-     'ace_Arab': 'Acehnese',
-     'acm_Arab': 'Mesopotamia', # 'Gilit Mesopotamian'
-     'aeb_Arab': 'Tunisia',
-     'ajp_Arab': 'Levantine', # 'South Levantine'
-     'apc_Arab': 'Levantine',
-     'arb_Arab': 'MSA',
-     'arq_Arab': 'Algeria',
-     'ars_Arab': 'Saudi', # Najdi is primarily Saudi Arabian
-     'ary_Arab': 'Morocco',
-     'arz_Arab': 'Egypt',
-     'ayp_Arab': 'Mesopotamia', # 'North Mesopotamian'
-     'azb_Arab': 'Azerbaijan', # South Azerbaijani pertains to this region
-     'bcc_Arab': 'Balochistan', # Southern Balochi is from Balochistan
-     'bjn_Arab': 'Indonesia', # Banjar is spoken in Indonesia
-     'brh_Arab': 'Pakistan', # Brahui is spoken in Pakistan
-     'ckb_Arab': 'Kurdistan', # Central Kurdish is mainly in Iraq
-     'fuv_Arab': 'Nigeria', # Hausa States Fulfulde
-     'glk_Arab': 'Iran', # Gilaki is spoken in Iran
-     'hac_Arab': 'Iran', # Gurani is also primarily spoken in Iran
-     'kas_Arab': 'Kashmir',
-     'knc_Arab': 'Nigeria', # Central Kanuri is in Nigeria
-     'lki_Arab': 'Iran', # Laki is from Iran
-     'lrc_Arab': 'Iran', # Northern Luri is from Iran
-     'min_Arab': 'Indonesia', # Minangkabau is spoken in Indonesia
-     'mzn_Arab': 'Iran', # Mazanderani is spoken in Iran
-     'ota_Arab': 'Turkey', # Ottoman Turkish
-     'pbt_Arab': 'Afghanistan', # Southern Pashto
-     'pnb_Arab': 'Pakistan', # Western Panjabi
-     'sdh_Arab': 'Iraq', # Southern Kurdish
-     'shu_Arab': 'Chad', # Chadian Arabic
-     'skr_Arab': 'Pakistan', # Saraiki
-     'snd_Arab': 'Pakistan', # Sindhi
-     'sus_Arab': 'Guinea', # Susu
-     'tuk_Arab': 'Turkmenistan', # Turkmen
-     'uig_Arab': 'Uighur (China)', # Uighur
-     'urd_Arab': 'Pakistan', # Urdu
-     'uzs_Arab': 'Uzbekistan', # Southern Uzbek
-     'zsm_Arab': 'Malaysia' # Standard Malay
- }

def predict_label(text, model, language_mapping_dict, use_mapping=False):
    # Remove any newline characters and strip whitespace
    text = str(text).strip().replace('\n', ' ')

    if text == '':
-         return 'Other'

    try:
        # Get top prediction
@@ -124,6 +49,7 @@ def predict_label(text, model, language_mapping_dict, use_mapping=False):

        # map label to language using language_mapping_dict
        if use_mapping:
            label = language_mapping_dict.get(label, 'Other')
        return label

@@ -132,7 +58,7 @@ def predict_label(text, model, language_mapping_dict, use_mapping=False):
        print(f"Exception: {e}")
        return {'prediction_label': 'Error', 'prediction_confidence': 0.0}

- def compute_classification_metrics(test_dataset):
    """
    Compute comprehensive classification metrics for each class.

@@ -142,8 +68,10 @@ def compute_classification_metrics(test_dataset):
    Returns:
        pd.DataFrame: DataFrame with detailed metrics for each class.
    """
-     # transform the dataset into a DataFrame
-     data = pd.DataFrame(test_dataset)
    # Extract true labels and predictions
    true_labels = list(data['dialect'])
    predicted_labels = list(data['preds'])
@@ -161,24 +89,29 @@ def compute_classification_metrics(test_dataset):
    precision_scores = precision_score(true_indices, pred_indices, average=None, labels=range(len(labels)))
    recall_scores = recall_score(true_indices, pred_indices, average=None, labels=range(len(labels)))

    # Compute confusion matrix
    conf_mat = confusion_matrix(true_indices, pred_indices, labels=range(len(labels)))

    # Calculate various metrics per class
-     FP = conf_mat.sum(axis=0) - np.diag(conf_mat) # False Positives
-     FN = conf_mat.sum(axis=1) - np.diag(conf_mat) # False Negatives
-     TP = np.diag(conf_mat) # True Positives
-     TN = conf_mat.sum() - (FP + FN + TP) # True Negatives

    # Calculate sample counts per class
    samples_per_class = np.bincount(true_indices, minlength=len(labels))

    # Calculate additional metrics
    with np.errstate(divide='ignore', invalid='ignore'):
-         fp_rate = FP / (FP + TN) # False Positive Rate
-         fn_rate = FN / (FN + TP) # False Negative Rate
-         specificity = TN / (TN + FP) # True Negative Rate
-         npv = TN / (TN + FN) # Negative Predictive Value

    # Replace NaN/inf with 0
    metrics = [fp_rate, fn_rate, specificity, npv]
@@ -194,6 +127,9 @@ def compute_classification_metrics(test_dataset):
        'country': labels,
        'samples': samples_per_class,
        'f1_score': f1_scores,
        'precision': precision_scores,
        'recall': recall_scores,
        'specificity': specificity,
@@ -203,28 +139,21 @@ def compute_classification_metrics(test_dataset):
        'false_positives': FP,
        'true_negatives': TN,
        'false_negatives': FN,
-         'negative_predictive_value': npv
    })

    # Sort by number of samples (descending)
    result_df = result_df.sort_values('samples', ascending=False)

-     # Calculate and add summary metrics
-     summary_metrics = {
-         'macro_f1': f1_score(true_indices, pred_indices, average='macro'),
-         'weighted_f1': f1_score(true_indices, pred_indices, average='weighted'),
-         'micro_f1': f1_score(true_indices, pred_indices, average='micro'),
-         'balanced_accuracy': balanced_acc,
-         'matthews_correlation': mcc
-     }
-
    # Format all numeric columns to 4 decimal places
    numeric_cols = result_df.select_dtypes(include=[np.number]).columns
    result_df[numeric_cols] = result_df[numeric_cols].round(4)

-     print(f'result_df: {result_df}')

-     return result_df, summary_metrics

def make_binary(dialect, target):
    if dialect != target:
@@ -250,47 +179,80 @@ def run_eval_one_vs_all(data_test, TARGET_LANG='Morocco'):
    out = out.reset_index()
    out = out[out['preds']==TARGET_LANG].drop(columns=['preds', 'size'])

-     print(f'out for TARGET_LANG={TARGET_LANG} \n: {out}')

    return out

- def update_darija_one_vs_all_leaderboard(result_df, model_name, target_lang, BINARY_LEADERBOARD_FILE="darija_leaderboard_binary.json"):
    try:
-         with open(BINARY_LEADERBOARD_FILE, "r") as f:
-             data = json.load(f)
-     except FileNotFoundError:
-         data = []
-
-     # Process the results for each dialect/country
-     for _, row in result_df.iterrows():
-         dialect = row['dialect']
-         # Skip 'Other' class, it is considered as the null space
-         if dialect == 'Other':
-             continue

-         # Find existing target_lang entry or create a new one
-         target_entry = next((item for item in data if target_lang in item), None)
-         if target_entry is None:
-             target_entry = {target_lang: {}}
-             data.append(target_entry)
-
-         # Get the country-specific data for this target language
-         country_data = target_entry[target_lang]

-         # Initialize the dialect/country entry if it doesn't exist
-         if dialect not in country_data:
-             country_data[dialect] = {}
-
-         # Update the model metrics under the model name for the given dialect
-         country_data[dialect][model_name] = float(row['false_positive_rate'])

-         # # Add the number of test samples, if not already present
-         # if "n_test_samples" not in country_data[dialect]:
-         #     country_data[dialect]["n_test_samples"] = int(row['size'])
-
-     # Save updated leaderboard data
-     with open(BINARY_LEADERBOARD_FILE, "w") as f:
-         json.dump(data, f, indent=4)

def handle_evaluation(model_path, model_path_bin, use_mapping=False):

@@ -301,10 +263,6 @@ def handle_evaluation(model_path, model_path_bin, use_mapping=False):
    print(f"[INFO] Loading model from Path: {model_path_hub}, using version {model_path_bin}...")
    model = fasttext.load_model(model_path_hub)

-     # Load the evaluation dataset
-     print(f"[INFO] Loading evaluation dataset from Path: {DATA_PATH}...")
-     eval_dataset = load_dataset(DATA_PATH, split='test')
-
    # Transform to pandas DataFrame
    print(f"[INFO] Converting evaluation dataset to Pandas DataFrame...")
    df_eval = pd.DataFrame(eval_dataset)
@@ -314,20 +272,21 @@ def handle_evaluation(model_path, model_path_bin, use_mapping=False):
    df_eval['preds'] = df_eval['text'].apply(lambda text: predict_label(text, model, language_mapping_dict, use_mapping=use_mapping))

    # run the evaluation
-     result_df, _ = run_eval(df_eval)
    # set the model name
    model_name = model_path + '/' + model_path_bin

    # update the multilingual leaderboard
-     update_darija_multilingual_leaderboard(result_df, model_name, MULTILINGUAL_LEADERBOARD_FILE)

    for target_lang in all_target_languages:
        result_df_one_vs_all = run_eval_one_vs_all(df_eval, TARGET_LANG=target_lang)
-         update_darija_one_vs_all_leaderboard(result_df_one_vs_all, model_name, target_lang, BINARY_LEADERBOARD_FILE)

    # load the updated leaderboard tables
-     df_multilingual = load_leaderboard_multilingual()
-     df_one_vs_all = load_leaderboard_one_vs_all()

    status_message = "**Evaluation now ended! 🤗**"

@@ -347,7 +306,7 @@ def run_eval(df_eval):
        pd.DataFrame: A DataFrame containing evaluation metrics.
    """

-     # map to binary
    df_eval_multilingual = df_eval.copy()

    # now drop the columns that are not needed, i.e. 'text'
@@ -355,11 +314,11 @@ def run_eval(df_eval):

    # Compute evaluation metrics
    print(f"[INFO] Computing metrics...")
-     result_df, _ = compute_classification_metrics(df_eval_multilingual)

-     # update_darija_multilingual_leaderboard(result_df, model_path, MULTILINGUAL_LEADERBOARD_FILE)

-     return result_df, df_eval_multilingual

def process_results_file(file, uploaded_model_name, base_path_save="./atlasia/submissions/", default_language='Morocco'):
    try:
@@ -368,7 +327,7 @@ def process_results_file(file, uploaded_model_name, base_path_save="./atlasia/su

        # Clean the model name to be safe for file paths
        uploaded_model_name = uploaded_model_name.strip().replace(" ", "_")
-         print(f"[INFO] uploaded_model_name: {uploaded_model_name}")

        # Create the directory for saving submissions
        path_saving = os.path.join(base_path_save, uploaded_model_name)
@@ -378,7 +337,7 @@ def process_results_file(file, uploaded_model_name, base_path_save="./atlasia/su
        saved_file_path = os.path.join(path_saving, 'submission.csv')

        # Read the uploaded file as DataFrame
-         print(f"[INFO] Loading results...")
        df_eval = pd.read_csv(file.name)

        # Save the DataFrame
@@ -390,72 +349,116 @@ def process_results_file(file, uploaded_model_name, base_path_save="./atlasia/su

        # Compute evaluation metrics
        print(f"[INFO] Computing metrics...")
-         result_df, _ = compute_classification_metrics(df_eval)

        # Update the leaderboards
-         update_darija_multilingual_leaderboard(result_df, uploaded_model_name, MULTILINGUAL_LEADERBOARD_FILE)

        # TODO: implement this one_vs_all differently for people only submitting csv file. They need to submit two files, one for multi-lang and the other for one-vs-all
        # result_df_one_vs_all = run_eval_one_vs_all(...)
        # update_darija_one_vs_all_leaderboard(...)

-         # update the leaderboard table
-         df = load_leaderboard_multilingual()

-         return create_leaderboard_display_multilingual(df, default_language, default_metrics)

- def update_darija_multilingual_leaderboard(result_df, model_name, MULTILINGUAL_LEADERBOARD_FILE="darija_leaderboard_multilingual.json"):

-     # Load leaderboard data
-     current_dir = os.path.dirname(os.path.abspath(__file__))
-     MULTILINGUAL_LEADERBOARD_FILE = os.path.join(current_dir, MULTILINGUAL_LEADERBOARD_FILE)

    try:
-         with open(MULTILINGUAL_LEADERBOARD_FILE, "r") as f:
-             data = json.load(f)
-     except FileNotFoundError:
-         data = []

-     # Process the results for each dialect/country
-     for _, row in result_df.iterrows():
-         country = row['country']
-         # skip 'Other' class, it is considered as the null space
-         if country == 'Other':
-             continue

-         # Create metrics dictionary directly
-         metrics = {
-             'f1_score': float(row['f1_score']),
-             'precision': float(row['precision']),
-             'recall': float(row['recall']),
-             'specificity': float(row['specificity']),
-             'false_positive_rate': float(row['false_positive_rate']),
-             'false_negative_rate': float(row['false_negative_rate']),
-             'negative_predictive_value': float(row['negative_predictive_value']),
-             'n_test_samples': int(row['samples'])
-         }

-         # Find existing country entry or create new one
-         country_entry = next((item for item in data if country in item), None)
-         if country_entry is None:
-             country_entry = {country: {}}
-             data.append(country_entry)

-         # Update the model metrics directly under the model name
-         if country not in country_entry:
-             country_entry[country] = {}
-         country_entry[country][model_name] = metrics
-
-     # Save updated leaderboard data
-     with open(MULTILINGUAL_LEADERBOARD_FILE, "w") as f:
-         json.dump(data, f, indent=4)


- def load_leaderboard_one_vs_all(BINARY_LEADERBOARD_FILE="darija_leaderboard_binary.json"):
    current_dir = os.path.dirname(os.path.abspath(__file__))
-     BINARY_LEADERBOARD_FILE = os.path.join(current_dir, BINARY_LEADERBOARD_FILE)

-     with open(BINARY_LEADERBOARD_FILE, "r") as f:
        data = json.load(f)

    # Initialize lists to store the flattened data
@@ -482,16 +485,14 @@ def load_leaderboard_one_vs_all(BINARY_LEADERBOARD_FILE="darija_leaderboard_bina

    # Pivot the DataFrame to create the desired structure: all languages in columns and models in rows, and each (model, target_language, language) = false_positive_rate
    df_pivot = df.pivot(index=['model', 'target_language'], columns='language', values='false_positive_rate').reset_index()
-
-     # print(f'df_pivot \n: {df_pivot}')
-
    return df_pivot

- def load_leaderboard_multilingual(MULTILINGUAL_LEADERBOARD_FILE="darija_leaderboard_multilingual.json"):
    current_dir = os.path.dirname(os.path.abspath(__file__))
-     MULTILINGUAL_LEADERBOARD_FILE = os.path.join(current_dir, MULTILINGUAL_LEADERBOARD_FILE)

-     with open(MULTILINGUAL_LEADERBOARD_FILE, "r") as f:
        data = json.load(f)

    # Initialize lists to store the flattened data
@@ -568,25 +569,28 @@ def create_leaderboard_display_multilingual(df, selected_country, selected_metri
def update_leaderboard_multilingual(country, selected_metrics):
    if not selected_metrics: # If no metrics selected, show all
        selected_metrics = metrics
-     df = load_leaderboard_multilingual()
    display_df = create_leaderboard_display_multilingual(df, country, selected_metrics)
    return display_df

def update_leaderboard_one_vs_all(target_language, selected_languages):
    if not selected_languages: # If no language selected, show all defaults
        selected_languages = default_languages
-     df = load_leaderboard_one_vs_all()
    display_df, selected_languages = create_leaderboard_display_one_vs_all(df, target_language, selected_languages)
    # to improve visibility in case the user chooses multiple language leading to many columns, the `model` column must remain fixed
-     # display_df = render_fixed_columns(display_df)
    return display_df, selected_languages

def encode_image_to_base64(image_path):
    with open(image_path, "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read()).decode()
    return encoded_string

def create_html_image(image_path):
    # Get base64 string of image
    img_base64 = encode_image_to_base64(image_path)

@@ -602,35 +606,6 @@ def create_html_image(image_path):
    """
    return html_string

- # Function to render HTML table with fixed 'model' column
def render_fixed_columns(df):
-     style = """
-     <style>
-         .table-container {
-             overflow-x: auto;
-             position: relative;
-             white-space: nowrap;
-         }
-         table {
-             border-collapse: collapse;
-             width: 100%;
-         }
-         th, td {
-             border: 1px solid black;
-             padding: 8px;
-             text-align: left;
-         }
-         th.fixed, td.fixed {
-             position: sticky;
-             left: 0;
-             background-color: white;
-             z-index: 2;
-         }
-     </style>
-     """
-     table_html = df.to_html(index=False).replace(
-         "<th>model</th>", '<th class="fixed">model</th>'
-     ).replace(
-         '<td>', '<td class="fixed">', 1
-     )
-     return f"{style}<div class='table-container'>{table_html}</div>"
    matthews_corrcoef
)
import numpy as np
+ from huggingface_hub import HfApi
+ from pathlib import Path
+ from constants import *

def predict_label(text, model, language_mapping_dict, use_mapping=False):
+     """
+     Runs predictions for a fasttext model.
+
+     Args:
+         text (str): The input text to classify.
+         model (fasttext.FastText._FastText): The fasttext model to use for prediction.
+         language_mapping_dict (dict): A dictionary mapping fasttext labels to human-readable language names.
+         use_mapping (bool): Whether to use the language mapping dictionary.
+
+     Returns:
+         str: The predicted label for the input text.
+     """
    # Remove any newline characters and strip whitespace
    text = str(text).strip().replace('\n', ' ')

    if text == '':
+         # if empty text, return EMPTY
+         return 'EMPTY'

    try:
        # Get top prediction

        # map label to language using language_mapping_dict
        if use_mapping:
+             # if label not found in mapping dict, set it to other as we are not taking them into account
            label = language_mapping_dict.get(label, 'Other')
        return label

        print(f"Exception: {e}")
        return {'prediction_label': 'Error', 'prediction_confidence': 0.0}

+ def compute_classification_metrics(eval_dataset):
    """
    Compute comprehensive classification metrics for each class.

    Returns:
        pd.DataFrame: DataFrame with detailed metrics for each class.
    """
+
+     # transform the dataset object into a pandas DataFrame object
+     data = pd.DataFrame(eval_dataset)
+
    # Extract true labels and predictions
    true_labels = list(data['dialect'])
    predicted_labels = list(data['preds'])

    precision_scores = precision_score(true_indices, pred_indices, average=None, labels=range(len(labels)))
    recall_scores = recall_score(true_indices, pred_indices, average=None, labels=range(len(labels)))

+     # Compute macro, weighted and micro f1 score
+     macro_f1_score = f1_score(true_indices, pred_indices, average='macro')
+     weighted_f1_score = f1_score(true_indices, pred_indices, average='weighted')
+     micro_f1_score = f1_score(true_indices, pred_indices, average='micro')
+
    # Compute confusion matrix
    conf_mat = confusion_matrix(true_indices, pred_indices, labels=range(len(labels)))

    # Calculate various metrics per class
+     FP = conf_mat.sum(axis=0) - np.diag(conf_mat) # False Positives
+     FN = conf_mat.sum(axis=1) - np.diag(conf_mat) # False Negatives
+     TP = np.diag(conf_mat) # True Positives
+     TN = conf_mat.sum() - (FP + FN + TP) # True Negatives

    # Calculate sample counts per class
    samples_per_class = np.bincount(true_indices, minlength=len(labels))

    # Calculate additional metrics
    with np.errstate(divide='ignore', invalid='ignore'):
+         fp_rate = FP / (FP + TN) # False Positive Rate
+         fn_rate = FN / (FN + TP) # False Negative Rate
+         specificity = TN / (TN + FP) # True Negative Rate
+         npv = TN / (TN + FN) # Negative Predictive Value

    # Replace NaN/inf with 0
    metrics = [fp_rate, fn_rate, specificity, npv]

        'country': labels,
        'samples': samples_per_class,
        'f1_score': f1_scores,
+         'macro_f1_score': macro_f1_score,
+         'weighted_f1_score': weighted_f1_score,
+         'micro_f1_score': micro_f1_score,
        'precision': precision_scores,
        'recall': recall_scores,
        'specificity': specificity,

        'false_positives': FP,
        'true_negatives': TN,
        'false_negatives': FN,
+         'negative_predictive_value': npv,
+         'balanced_accuracy': balanced_acc,
+         'matthews_correlation': mcc,
    })

    # Sort by number of samples (descending)
    result_df = result_df.sort_values('samples', ascending=False)

    # Format all numeric columns to 4 decimal places
    numeric_cols = result_df.select_dtypes(include=[np.number]).columns
    result_df[numeric_cols] = result_df[numeric_cols].round(4)

+     print(f'[INFO] result_df \n: {result_df}')

+     return result_df

def make_binary(dialect, target):
    if dialect != target:

    out = out.reset_index()
    out = out[out['preds']==TARGET_LANG].drop(columns=['preds', 'size'])

+     print(f'[INFO] out for TARGET_LANG={TARGET_LANG} \n: {out}')

    return out

+ def update_darija_one_vs_all_leaderboard(result_df, model_name, target_lang, DIALECT_CONFUSION_LEADERBOARD_FILE):
+
+     # Initialize Hugging Face API
+     api = HfApi()
+
+     # Get the repository ID from environment variables
+     repo_id = os.environ.get("SPACE_ID")
+     if not repo_id:
+         raise ValueError("This code must be run in a Hugging Face Space")
+
+     # Create a temporary directory for file operations
+     temp_dir = Path("/tmp")
+     temp_file = temp_dir / DIALECT_CONFUSION_LEADERBOARD_FILE
+
    try:
+         # Try to download existing file from the Space
+         try:
+             api.hf_hub_download(
+                 repo_id=repo_id,
+                 filename=DIALECT_CONFUSION_LEADERBOARD_FILE,
+                 repo_type="space",
+                 local_dir=temp_dir
+             )
+         except Exception:
+             # If file doesn't exist, start with empty data
+             data = []
+         else:
+             # If file exists, read it
+             with open(temp_file, "r") as f:
+                 data = json.load(f)
+
+         # Process the results for each dialect/country
+         for _, row in result_df.iterrows():
+             dialect = row['dialect']
+             # Skip 'Other' class
+             if dialect == 'Other':
+                 continue
+
+             # Find existing target_lang entry or create a new one
+             target_entry = next((item for item in data if target_lang in item), None)
+             if target_entry is None:
+                 target_entry = {target_lang: {}}
+                 data.append(target_entry)
+
+             # Get the country-specific data for this target language
+             country_data = target_entry[target_lang]
+
+             # Initialize the dialect/country entry if it doesn't exist
+             if dialect not in country_data:
+                 country_data[dialect] = {}
+
+             # Update the model metrics under the model name for the given dialect
+             country_data[dialect][model_name] = float(row['false_positive_rate'])

+         # Save updated data to temporary file
+         with open(temp_file, "w") as f:
+             json.dump(data, f, indent=4)

+         # Upload the file back to the Space
+         api.upload_file(
+             path_or_fileobj=str(temp_file),
+             path_in_repo=DIALECT_CONFUSION_LEADERBOARD_FILE,
+             repo_id=repo_id,
+             repo_type="space"
+         )

+     finally:
+         # Clean up temporary file
+         if temp_file.exists():
+             temp_file.unlink()

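As an aside (not part of the commit, values and model name invented): the dialect-confusion JSON written above is a list of single-key dicts mapping a target language to, per dialect, each model's false positive rate, roughly like this:

example_one_vs_all = [
    {
        "Morocco": {
            # dialect -> {model name -> false_positive_rate}
            "MSA": {"some_org/some_model/model.bin": 0.01},
            "Algeria": {"some_org/some_model/model.bin": 0.04},
        }
    },
]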
def handle_evaluation(model_path, model_path_bin, use_mapping=False):

    print(f"[INFO] Loading model from Path: {model_path_hub}, using version {model_path_bin}...")
    model = fasttext.load_model(model_path_hub)

    # Transform to pandas DataFrame
    print(f"[INFO] Converting evaluation dataset to Pandas DataFrame...")
    df_eval = pd.DataFrame(eval_dataset)

    df_eval['preds'] = df_eval['text'].apply(lambda text: predict_label(text, model, language_mapping_dict, use_mapping=use_mapping))

    # run the evaluation
+     result_df = run_eval(df_eval)
+
    # set the model name
    model_name = model_path + '/' + model_path_bin

    # update the multilingual leaderboard
+     update_darija_multilingual_leaderboard(result_df, model_name, MULTI_DIALECTS_LEADERBOARD_FILE)

    for target_lang in all_target_languages:
        result_df_one_vs_all = run_eval_one_vs_all(df_eval, TARGET_LANG=target_lang)
+         update_darija_one_vs_all_leaderboard(result_df_one_vs_all, model_name, target_lang, DIALECT_CONFUSION_LEADERBOARD_FILE)

    # load the updated leaderboard tables
+     df_multilingual = load_leaderboard_multilingual(MULTI_DIALECTS_LEADERBOARD_FILE)
+     df_one_vs_all = load_leaderboard_one_vs_all(DIALECT_CONFUSION_LEADERBOARD_FILE)

    status_message = "**Evaluation now ended! 🤗**"

        pd.DataFrame: A DataFrame containing evaluation metrics.
    """

+     # make a copy as the original one is used later
    df_eval_multilingual = df_eval.copy()

    # now drop the columns that are not needed, i.e. 'text'

    # Compute evaluation metrics
    print(f"[INFO] Computing metrics...")
+     result_df = compute_classification_metrics(df_eval_multilingual)

+     # update_darija_multilingual_leaderboard(result_df, model_path, MULTI_DIALECTS_LEADERBOARD_FILE)

+     return result_df

def process_results_file(file, uploaded_model_name, base_path_save="./atlasia/submissions/", default_language='Morocco'):
    try:

        # Clean the model name to be safe for file paths
        uploaded_model_name = uploaded_model_name.strip().replace(" ", "_")
+         print(f"[INFO] Uploaded model name: {uploaded_model_name}")

        # Create the directory for saving submissions
        path_saving = os.path.join(base_path_save, uploaded_model_name)

        saved_file_path = os.path.join(path_saving, 'submission.csv')

        # Read the uploaded file as DataFrame
+         print(f"[INFO] Loading csv results file...")
        df_eval = pd.read_csv(file.name)

        # Save the DataFrame

        # Compute evaluation metrics
        print(f"[INFO] Computing metrics...")
+         result_df = compute_classification_metrics(df_eval)

        # Update the leaderboards
+         update_darija_multilingual_leaderboard(result_df, uploaded_model_name, MULTI_DIALECTS_LEADERBOARD_FILE)

        # TODO: implement this one_vs_all differently for people only submitting csv file. They need to submit two files, one for multi-lang and the other for one-vs-all
        # result_df_one_vs_all = run_eval_one_vs_all(...)
        # update_darija_one_vs_all_leaderboard(...)

+         for target_lang in all_target_languages:
+             result_df_one_vs_all = run_eval_one_vs_all(df_eval, TARGET_LANG=target_lang)
+             update_darija_one_vs_all_leaderboard(result_df_one_vs_all, uploaded_model_name, target_lang, DIALECT_CONFUSION_LEADERBOARD_FILE)

+         # load the updated leaderboard tables
+         df_multilingual = load_leaderboard_multilingual(MULTI_DIALECTS_LEADERBOARD_FILE)
+         df_one_vs_all = load_leaderboard_one_vs_all(DIALECT_CONFUSION_LEADERBOARD_FILE)

+         status_message = "**Evaluation now ended! 🤗**"

+         return create_leaderboard_display_multilingual(df_multilingual, target_label, default_metrics), status_message
+
+ def update_darija_multilingual_leaderboard(result_df, model_name, MULTI_DIALECTS_LEADERBOARD_FILE="darija_leaderboard_multilingual.json"):
+     # Initialize Hugging Face API
+     api = HfApi()
+
+     # Get the repository ID from environment variables
+     # HF Spaces sets this automatically
+     repo_id = os.environ.get("SPACE_ID")
+     if not repo_id:
+         raise ValueError("This code must be run in a Hugging Face Space")
+
+     # Create a temporary directory for file operations
+     temp_dir = Path("/tmp")
+     temp_file = temp_dir / MULTI_DIALECTS_LEADERBOARD_FILE

    try:
+         # Try to download existing file from the Space
+         try:
+             api.hf_hub_download(
+                 repo_id=repo_id,
+                 filename=MULTI_DIALECTS_LEADERBOARD_FILE,
+                 repo_type="space",
+                 local_dir=temp_dir
+             )
+         except Exception:
+             # If file doesn't exist, start with empty data
+             data = []
+         else:
+             # If file exists, read it
+             with open(temp_file, "r") as f:
+                 data = json.load(f)

+         # Process the results for each dialect/country
+         for _, row in result_df.iterrows():
+             country = row['country']
+             # skip 'Other' class
+             if country == 'Other':
+                 continue
+
+             # Create metrics dictionary
+             metrics = {
+                 'f1_score': float(row['f1_score']),
+                 'precision': float(row['precision']),
+                 'recall': float(row['recall']),
+                 'macro_f1_score': float(row['macro_f1_score']),
+                 'micro_f1_score': float(row['micro_f1_score']),
+                 'weighted_f1_score': float(row['weighted_f1_score']),
+                 'specificity': float(row['specificity']),
+                 'false_positive_rate': float(row['false_positive_rate']),
+                 'false_negative_rate': float(row['false_negative_rate']),
+                 'negative_predictive_value': float(row['negative_predictive_value']),
+                 'balanced_accuracy': float(row['balanced_accuracy']),
+                 'matthews_correlation': float(row['matthews_correlation']),
+                 'n_test_samples': int(row['samples'])
+             }

+             # Find existing country entry or create new one
+             country_entry = next((item for item in data if country in item), None)
+             if country_entry is None:
+                 country_entry = {country: {}}
+                 data.append(country_entry)
+
+             # Update the model metrics
+             if country not in country_entry:
+                 country_entry[country] = {}
+             country_entry[country][model_name] = metrics
+
+         # Save updated data to temporary file
+         with open(temp_file, "w") as f:
+             json.dump(data, f, indent=4)

+         # Upload the file back to the Space
+         api.upload_file(
+             path_or_fileobj=str(temp_file),
+             path_in_repo=MULTI_DIALECTS_LEADERBOARD_FILE,
+             repo_id=repo_id,
+             repo_type="space"
+         )

+     finally:
+         # Clean up temporary file
+         if temp_file.exists():
+             temp_file.unlink()

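Likewise illustrative only (model name and numbers invented): each entry of darija_leaderboard_multilingual.json as built by the loop above maps a country to per-model metric dicts, with keys mirroring the metrics dictionary in the code:

example_multilingual = [
    {
        "Morocco": {
            "some_org/some_model/model.bin": {
                "f1_score": 0.91,
                "precision": 0.90,
                "recall": 0.92,
                "macro_f1_score": 0.85,
                "micro_f1_score": 0.88,
                "weighted_f1_score": 0.89,
                "specificity": 0.97,
                "false_positive_rate": 0.03,
                "false_negative_rate": 0.08,
                "negative_predictive_value": 0.96,
                "balanced_accuracy": 0.90,
                "matthews_correlation": 0.82,
                "n_test_samples": 1000
            }
        }
    },
]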
+ def load_leaderboard_one_vs_all(DIALECT_CONFUSION_LEADERBOARD_FILE):
    current_dir = os.path.dirname(os.path.abspath(__file__))
+     DIALECT_CONFUSION_LEADERBOARD_FILE = os.path.join(current_dir, DIALECT_CONFUSION_LEADERBOARD_FILE)

+     with open(DIALECT_CONFUSION_LEADERBOARD_FILE, "r") as f:
        data = json.load(f)

    # Initialize lists to store the flattened data

    # Pivot the DataFrame to create the desired structure: all languages in columns and models in rows, and each (model, target_language, language) = false_positive_rate
    df_pivot = df.pivot(index=['model', 'target_language'], columns='language', values='false_positive_rate').reset_index()
+
    return df_pivot

+ def load_leaderboard_multilingual(MULTI_DIALECTS_LEADERBOARD_FILE):
    current_dir = os.path.dirname(os.path.abspath(__file__))
+     MULTI_DIALECTS_LEADERBOARD_FILE = os.path.join(current_dir, MULTI_DIALECTS_LEADERBOARD_FILE)

+     with open(MULTI_DIALECTS_LEADERBOARD_FILE, "r") as f:
        data = json.load(f)

    # Initialize lists to store the flattened data

def update_leaderboard_multilingual(country, selected_metrics):
    if not selected_metrics: # If no metrics selected, show all
        selected_metrics = metrics
+     df = load_leaderboard_multilingual(MULTI_DIALECTS_LEADERBOARD_FILE)
    display_df = create_leaderboard_display_multilingual(df, country, selected_metrics)
    return display_df

def update_leaderboard_one_vs_all(target_language, selected_languages):
    if not selected_languages: # If no language selected, show all defaults
        selected_languages = default_languages
+     df = load_leaderboard_one_vs_all(DIALECT_CONFUSION_LEADERBOARD_FILE)
    display_df, selected_languages = create_leaderboard_display_one_vs_all(df, target_language, selected_languages)
+
    # to improve visibility in case the user chooses multiple language leading to many columns, the `model` column must remain fixed
+     # display_df = render_fixed_columns(display_df) # needs to be implemented
    return display_df, selected_languages

def encode_image_to_base64(image_path):
+     """ encodes the image to base64"""
    with open(image_path, "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read()).decode()
    return encoded_string

def create_html_image(image_path):
+     """ Creates the html of the logo image from the image path input """
    # Get base64 string of image
    img_base64 = encode_image_to_base64(image_path)

    """
    return html_string

def render_fixed_columns(df):
+     """ A function to render HTML table with fixed 'model' column for better visibility """
+     return NotImplementedError
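render_fixed_columns is left as a stub in this commit. One possible sketch of a later implementation, loosely adapted from the version removed earlier in this diff (a sticky first column via CSS around pandas.DataFrame.to_html); the helper name below is hypothetical and not part of the commit:

def render_fixed_columns_sketch(df):
    """Possible re-implementation: HTML table whose 'model' column stays fixed while scrolling."""
    style = """
    <style>
    .table-container { overflow-x: auto; white-space: nowrap; }
    th.fixed, td.fixed { position: sticky; left: 0; background-color: white; z-index: 2; }
    </style>
    """
    # Mark the 'model' header cell so the CSS above pins it in place.
    table_html = df.to_html(index=False).replace(
        "<th>model</th>", '<th class="fixed">model</th>'
    )
    return f"{style}<div class='table-container'>{table_html}</div>"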