Update utils.py
utils.py CHANGED
@@ -1,636 +1,636 @@
import base64
from huggingface_hub import hf_hub_download
import fasttext
import os
import json
import pandas as pd
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    balanced_accuracy_score,
    matthews_corrcoef
)
import numpy as np
from datasets import load_dataset

# Constants
MODEL_REPO = "atlasia/Sfaya-Moroccan-Darija-vs-All"
BIN_FILENAME = "model_multi_v3_2fpr.bin"
BINARY_LEADERBOARD_FILE = "darija_leaderboard_binary.json"
MULTILINGUAL_LEADERBOARD_FILE = "darija_leaderboard_multilingual.json"
DATA_PATH = "atlasia/Arabic-LID-Leaderboard"

target_label = "Morocco"
is_binary = False

# Load test dataset
test_dataset = load_dataset(DATA_PATH, split='test')

# Supported dialects
all_target_languages = list(test_dataset.unique("dialect"))
supported_dialects = all_target_languages + ['All']
languages_to_display_one_vs_all = all_target_languages  # everything except All

print(f'all_target_languages: {all_target_languages}')

metrics = [
    'f1_score',
    'precision',
    'recall',
    'specificity',
    'false_positive_rate',
    'false_negative_rate',
    'negative_predictive_value',
    'n_test_samples',
]

default_metrics = [
    'f1_score',
    'precision',
    'recall',
    'false_positive_rate',
    'false_negative_rate'
]

# default language to display in one-vs-all leaderboard
default_languages = [
-    'Morocco',
+    #'Morocco',
    'MSA',
-    'Egypt',
+    #'Egypt',
-    'Algeria',
+    #'Algeria',
-    'Tunisia',
+    #'Tunisia',
-    'Levantine',
+    #'Levantine',
]

language_mapping_dict = {
    'ace_Arab': 'Acehnese',
    'acm_Arab': 'Mesopotamia',  # 'Gilit Mesopotamian'
    'aeb_Arab': 'Tunisia',
    'ajp_Arab': 'Levantine',  # 'South Levantine'
    'apc_Arab': 'Levantine',
    'arb_Arab': 'MSA',
    'arq_Arab': 'Algeria',
    'ars_Arab': 'Saudi',  # Najdi is primarily Saudi Arabian
    'ary_Arab': 'Morocco',
    'arz_Arab': 'Egypt',
    'ayp_Arab': 'Mesopotamia',  # 'North Mesopotamian'
    'azb_Arab': 'Azerbaijan',  # South Azerbaijani pertains to this region
    'bcc_Arab': 'Balochistan',  # Southern Balochi is from Balochistan
    'bjn_Arab': 'Indonesia',  # Banjar is spoken in Indonesia
    'brh_Arab': 'Pakistan',  # Brahui is spoken in Pakistan
    'ckb_Arab': 'Kurdistan',  # Central Kurdish is mainly in Iraq
    'fuv_Arab': 'Nigeria',  # Hausa States Fulfulde
    'glk_Arab': 'Iran',  # Gilaki is spoken in Iran
    'hac_Arab': 'Iran',  # Gurani is also primarily spoken in Iran
    'kas_Arab': 'Kashmir',
    'knc_Arab': 'Nigeria',  # Central Kanuri is in Nigeria
    'lki_Arab': 'Iran',  # Laki is from Iran
    'lrc_Arab': 'Iran',  # Northern Luri is from Iran
    'min_Arab': 'Indonesia',  # Minangkabau is spoken in Indonesia
    'mzn_Arab': 'Iran',  # Mazanderani is spoken in Iran
    'ota_Arab': 'Turkey',  # Ottoman Turkish
    'pbt_Arab': 'Afghanistan',  # Southern Pashto
    'pnb_Arab': 'Pakistan',  # Western Panjabi
    'sdh_Arab': 'Iraq',  # Southern Kurdish
    'shu_Arab': 'Chad',  # Chadian Arabic
    'skr_Arab': 'Pakistan',  # Saraiki
    'snd_Arab': 'Pakistan',  # Sindhi
    'sus_Arab': 'Guinea',  # Susu
    'tuk_Arab': 'Turkmenistan',  # Turkmen
    'uig_Arab': 'Uighur (China)',  # Uighur
    'urd_Arab': 'Pakistan',  # Urdu
    'uzs_Arab': 'Uzbekistan',  # Southern Uzbek
    'zsm_Arab': 'Malaysia'  # Standard Malay
}

def predict_label(text, model, language_mapping_dict, use_mapping=False):
    # Remove any newline characters and strip whitespace
    text = str(text).strip().replace('\n', ' ')

    if text == '':
        return 'Other'

    try:
        # Get top prediction
        prediction = model.predict(text, 1)

        # Extract label and remove __label__ prefix
        label = prediction[0][0].replace('__label__', '')

        # Extract confidence score
        confidence = prediction[1][0]

        # map label to language using language_mapping_dict
        if use_mapping:
            label = language_mapping_dict.get(label, 'Other')
        return label

    except Exception as e:
        print(f"Error processing text: {text}")
        print(f"Exception: {e}")
        return {'prediction_label': 'Error', 'prediction_confidence': 0.0}

def compute_classification_metrics(test_dataset):
    """
    Compute comprehensive classification metrics for each class.

    Args:
        data (pd.DataFrame): DataFrame containing 'dialect' as true labels and 'preds' as predicted labels.

    Returns:
        pd.DataFrame: DataFrame with detailed metrics for each class.
    """
    # transform the dataset into a DataFrame
    data = pd.DataFrame(test_dataset)
    # Extract true labels and predictions
    true_labels = list(data['dialect'])
    predicted_labels = list(data['preds'])

    # Handle all unique labels
    labels = sorted(list(set(true_labels + predicted_labels)))
    label_to_index = {label: index for index, label in enumerate(labels)}

    # Convert labels to indices
    true_indices = [label_to_index[label] for label in true_labels]
    pred_indices = [label_to_index[label] for label in predicted_labels]

    # Compute basic metrics
    f1_scores = f1_score(true_indices, pred_indices, average=None, labels=range(len(labels)))
    precision_scores = precision_score(true_indices, pred_indices, average=None, labels=range(len(labels)))
    recall_scores = recall_score(true_indices, pred_indices, average=None, labels=range(len(labels)))

    # Compute confusion matrix
    conf_mat = confusion_matrix(true_indices, pred_indices, labels=range(len(labels)))

    # Calculate various metrics per class
    FP = conf_mat.sum(axis=0) - np.diag(conf_mat)  # False Positives
    FN = conf_mat.sum(axis=1) - np.diag(conf_mat)  # False Negatives
    TP = np.diag(conf_mat)  # True Positives
    TN = conf_mat.sum() - (FP + FN + TP)  # True Negatives

    # Calculate sample counts per class
    samples_per_class = np.bincount(true_indices, minlength=len(labels))

    # Calculate additional metrics
    with np.errstate(divide='ignore', invalid='ignore'):
        fp_rate = FP / (FP + TN)  # False Positive Rate
        fn_rate = FN / (FN + TP)  # False Negative Rate
        specificity = TN / (TN + FP)  # True Negative Rate
        npv = TN / (TN + FN)  # Negative Predictive Value

    # Replace NaN/inf with 0
    metrics = [fp_rate, fn_rate, specificity, npv]
    metrics = [np.nan_to_num(m, nan=0.0, posinf=0.0, neginf=0.0) for m in metrics]
    fp_rate, fn_rate, specificity, npv = metrics

    # Calculate overall metrics
    balanced_acc = balanced_accuracy_score(true_indices, pred_indices)
    mcc = matthews_corrcoef(true_indices, pred_indices)

    # Compile results into a DataFrame
    result_df = pd.DataFrame({
        'country': labels,
        'samples': samples_per_class,
        'f1_score': f1_scores,
        'precision': precision_scores,
        'recall': recall_scores,
        'specificity': specificity,
        'false_positive_rate': fp_rate,
        'false_negative_rate': fn_rate,
        'true_positives': TP,
        'false_positives': FP,
        'true_negatives': TN,
        'false_negatives': FN,
        'negative_predictive_value': npv
    })

    # Sort by number of samples (descending)
    result_df = result_df.sort_values('samples', ascending=False)

    # Calculate and add summary metrics
    summary_metrics = {
        'macro_f1': f1_score(true_indices, pred_indices, average='macro'),
        'weighted_f1': f1_score(true_indices, pred_indices, average='weighted'),
        'micro_f1': f1_score(true_indices, pred_indices, average='micro'),
        'balanced_accuracy': balanced_acc,
        'matthews_correlation': mcc
    }

    # Format all numeric columns to 4 decimal places
    numeric_cols = result_df.select_dtypes(include=[np.number]).columns
    result_df[numeric_cols] = result_df[numeric_cols].round(4)

    print(f'result_df: {result_df}')

    return result_df, summary_metrics

def make_binary(dialect, target):
    if dialect != target:
        return 'Other'
    return target

def run_eval_one_vs_all(data_test, TARGET_LANG='Morocco'):

    # map to binary
    df_test_preds = data_test.copy()
    df_test_preds.loc[df_test_preds['dialect'] == TARGET_LANG, 'dialect'] = TARGET_LANG
    df_test_preds.loc[df_test_preds['dialect'] != TARGET_LANG, 'dialect'] = 'Other'

    # compute the fpr per dialect
    dialect_counts = data_test.groupby('dialect')['dialect'].count().reset_index(name='size')
    result_df = pd.merge(dialect_counts, data_test, on='dialect')
    result_df = result_df.groupby(['dialect', 'size', 'preds'])['preds'].count()/result_df.groupby(['dialect', 'size'])['preds'].count()
    result_df.sort_index(ascending=False, level='size', inplace=True)

    # group by dialect and get the false positive rate
    out = result_df.copy()
    out.name = 'false_positive_rate'
    out = out.reset_index()
    out = out[out['preds']==TARGET_LANG].drop(columns=['preds', 'size'])

    print(f'out for TARGET_LANG={TARGET_LANG} \n: {out}')

    return out

def update_darija_one_vs_all_leaderboard(result_df, model_name, target_lang, BINARY_LEADERBOARD_FILE="darija_leaderboard_binary.json"):
    try:
        with open(BINARY_LEADERBOARD_FILE, "r") as f:
            data = json.load(f)
    except FileNotFoundError:
        data = []

    # Process the results for each dialect/country
    for _, row in result_df.iterrows():
        dialect = row['dialect']
        # Skip 'Other' class, it is considered as the null space
        if dialect == 'Other':
            continue

        # Find existing target_lang entry or create a new one
        target_entry = next((item for item in data if target_lang in item), None)
        if target_entry is None:
            target_entry = {target_lang: {}}
            data.append(target_entry)

        # Get the country-specific data for this target language
        country_data = target_entry[target_lang]

        # Initialize the dialect/country entry if it doesn't exist
        if dialect not in country_data:
            country_data[dialect] = {}

        # Update the model metrics under the model name for the given dialect
        country_data[dialect][model_name] = float(row['false_positive_rate'])

        # # Add the number of test samples, if not already present
        # if "n_test_samples" not in country_data[dialect]:
        #     country_data[dialect]["n_test_samples"] = int(row['size'])

    # Save updated leaderboard data
    with open(BINARY_LEADERBOARD_FILE, "w") as f:
        json.dump(data, f, indent=4)

def handle_evaluation(model_path, model_path_bin, use_mapping=False):

    # download model and get the model path
    model_path_hub = hf_hub_download(repo_id=model_path, filename=model_path_bin, cache_dir=None)

    # Load the trained model
    print(f"[INFO] Loading model from Path: {model_path_hub}, using version {model_path_bin}...")
    model = fasttext.load_model(model_path_hub)

    # Load the evaluation dataset
    print(f"[INFO] Loading evaluation dataset from Path: {DATA_PATH}...")
    eval_dataset = load_dataset(DATA_PATH, split='test')

    # Transform to pandas DataFrame
    print(f"[INFO] Converting evaluation dataset to Pandas DataFrame...")
    df_eval = pd.DataFrame(eval_dataset)

    # Predict labels using the model
    print(f"[INFO] Running predictions...")
    df_eval['preds'] = df_eval['text'].apply(lambda text: predict_label(text, model, language_mapping_dict, use_mapping=use_mapping))

    # run the evaluation
    result_df, _ = run_eval(df_eval)
    # set the model name
    model_name = model_path + '/' + model_path_bin

    # update the multilingual leaderboard
    update_darija_multilingual_leaderboard(result_df, model_name, MULTILINGUAL_LEADERBOARD_FILE)

    for target_lang in all_target_languages:
        result_df_one_vs_all = run_eval_one_vs_all(df_eval, TARGET_LANG=target_lang)
        update_darija_one_vs_all_leaderboard(result_df_one_vs_all, model_name, target_lang, BINARY_LEADERBOARD_FILE)

    # load the updated leaderboard tables
    df_multilingual = load_leaderboard_multilingual()
    df_one_vs_all = load_leaderboard_one_vs_all()

    status_message = "**Evaluation now ended! 🤗**"

    return create_leaderboard_display_multilingual(df_multilingual, target_label, default_metrics), status_message

def run_eval(df_eval):
    """Run evaluation on a dataset and compute metrics.

    Args:
        model: The model to evaluate.
        DATA_PATH (str): Path to the dataset.
        is_binary (bool): If True, evaluate as binary classification.
            If False, evaluate as multi-class classification.
        target_label (str): The target class label in binary mode.

    Returns:
        pd.DataFrame: A DataFrame containing evaluation metrics.
    """

    # map to binary
    df_eval_multilingual = df_eval.copy()

    # now drop the columns that are not needed, i.e. 'text'
    df_eval_multilingual = df_eval_multilingual.drop(columns=['text', 'metadata', 'dataset_source'])

    # Compute evaluation metrics
    print(f"[INFO] Computing metrics...")
    result_df, _ = compute_classification_metrics(df_eval_multilingual)

    # update_darija_multilingual_leaderboard(result_df, model_path, MULTILINGUAL_LEADERBOARD_FILE)

    return result_df, df_eval_multilingual

def process_results_file(file, uploaded_model_name, base_path_save="./atlasia/submissions/", default_language='Morocco'):
    try:
        if file is None:
            return "Please upload a file."

        # Clean the model name to be safe for file paths
        uploaded_model_name = uploaded_model_name.strip().replace(" ", "_")
        print(f"[INFO] uploaded_model_name: {uploaded_model_name}")

        # Create the directory for saving submissions
        path_saving = os.path.join(base_path_save, uploaded_model_name)
        os.makedirs(path_saving, exist_ok=True)

        # Define the full path to save the file
        saved_file_path = os.path.join(path_saving, 'submission.csv')

        # Read the uploaded file as DataFrame
        print(f"[INFO] Loading results...")
        df_eval = pd.read_csv(file.name)

        # Save the DataFrame
        print(f"[INFO] Saving the file locally in: {saved_file_path}")
        df_eval.to_csv(saved_file_path, index=False)

    except Exception as e:
        return f"Error processing file: {str(e)}"

    # Compute evaluation metrics
    print(f"[INFO] Computing metrics...")
    result_df, _ = compute_classification_metrics(df_eval)

    # Update the leaderboards
    update_darija_multilingual_leaderboard(result_df, uploaded_model_name, MULTILINGUAL_LEADERBOARD_FILE)

    # TODO: implement this ove_vs_all differently for people only submitting csv file. They need to submit two files, one for multi-lang and the other for one-vs-all
    # result_df_one_vs_all = run_eval_one_vs_all(...)
    # update_darija_one_vs_all_leaderboard(...)

    # update the leaderboard table
    df = load_leaderboard_multilingual()

    return create_leaderboard_display_multilingual(df, default_language, default_metrics)

def update_darija_multilingual_leaderboard(result_df, model_name, MULTILINGUAL_LEADERBOARD_FILE="darija_leaderboard_multilingual.json"):

    # Load leaderboard data
    current_dir = os.path.dirname(os.path.abspath(__file__))
    MULTILINGUAL_LEADERBOARD_FILE = os.path.join(current_dir, MULTILINGUAL_LEADERBOARD_FILE)

    try:
        with open(MULTILINGUAL_LEADERBOARD_FILE, "r") as f:
            data = json.load(f)
    except FileNotFoundError:
        data = []

    # Process the results for each dialect/country
    for _, row in result_df.iterrows():
        country = row['country']
        # skip 'Other' class, it is considered as the null space
        if country == 'Other':
            continue

        # Create metrics dictionary directly
        metrics = {
            'f1_score': float(row['f1_score']),
            'precision': float(row['precision']),
            'recall': float(row['recall']),
            'specificity': float(row['specificity']),
            'false_positive_rate': float(row['false_positive_rate']),
            'false_negative_rate': float(row['false_negative_rate']),
            'negative_predictive_value': float(row['negative_predictive_value']),
            'n_test_samples': int(row['samples'])
        }

        # Find existing country entry or create new one
        country_entry = next((item for item in data if country in item), None)
        if country_entry is None:
            country_entry = {country: {}}
            data.append(country_entry)

        # Update the model metrics directly under the model name
        if country not in country_entry:
            country_entry[country] = {}
        country_entry[country][model_name] = metrics

    # Save updated leaderboard data
    with open(MULTILINGUAL_LEADERBOARD_FILE, "w") as f:
        json.dump(data, f, indent=4)


def load_leaderboard_one_vs_all(BINARY_LEADERBOARD_FILE="darija_leaderboard_binary.json"):
    current_dir = os.path.dirname(os.path.abspath(__file__))
    BINARY_LEADERBOARD_FILE = os.path.join(current_dir, BINARY_LEADERBOARD_FILE)

    with open(BINARY_LEADERBOARD_FILE, "r") as f:
        data = json.load(f)

    # Initialize lists to store the flattened data
    rows = []

    # Process each target language's data
    for leaderboard_data in data:
        for target_language, results in leaderboard_data.items():
            for language, models in results.items():

                for model_name, false_positive_rate in models.items():

                    row = {
                        'target_language': target_language,
                        'language': language,
                        'model': model_name,
                        'false_positive_rate': false_positive_rate,
                    }
                    # Add all metrics to the row
                    rows.append(row)

    # Convert to DataFrame
    df = pd.DataFrame(rows)

    # Pivot the DataFrame to create the desired structure: all languages in columns and models in rows, and each (model, target_language, language) = false_positive_rate
    df_pivot = df.pivot(index=['model', 'target_language'], columns='language', values='false_positive_rate').reset_index()

    # print(f'df_pivot \n: {df_pivot}')

    return df_pivot

def load_leaderboard_multilingual(MULTILINGUAL_LEADERBOARD_FILE="darija_leaderboard_multilingual.json"):
    current_dir = os.path.dirname(os.path.abspath(__file__))
    MULTILINGUAL_LEADERBOARD_FILE = os.path.join(current_dir, MULTILINGUAL_LEADERBOARD_FILE)

    with open(MULTILINGUAL_LEADERBOARD_FILE, "r") as f:
        data = json.load(f)

    # Initialize lists to store the flattened data
    rows = []

    # Process each country's data
    for country_data in data:
        for country, models in country_data.items():
            for model_name, metrics in models.items():
                row = {
                    'country': country,
                    'model': model_name,
                }
                # Add all metrics to the row
                row.update(metrics)
                rows.append(row)

    # Convert to DataFrame
    df = pd.DataFrame(rows)
    return df

def create_leaderboard_display_one_vs_all(df, target_language, selected_languages):

    # Filter by target_language if specified
    if target_language:
        df = df[df['target_language'] == target_language]

        # Remove the target_language from selected_languages
        if target_language in selected_languages:
            selected_languages = [lang for lang in selected_languages if lang != target_language]

    # Select only the chosen languages (plus 'model' column)
    columns_to_show = ['model'] + [language for language in selected_languages if language in df.columns]

    # Sort by first selected metric by default
    if selected_languages:
        df = df.sort_values(by=selected_languages[0], ascending=False)

    df = df[columns_to_show]

    # Format numeric columns to 4 decimal places
    numeric_cols = df.select_dtypes(include=['float64']).columns
    df[numeric_cols] = df[numeric_cols].round(4)

    return df, selected_languages


def create_leaderboard_display_multilingual(df, selected_country, selected_metrics):
    # Filter by country if specified
    if selected_country and selected_country.upper() != 'ALL':
        # print(f"Filtering leaderboard by country: {selected_country}")
        df = df[df['country'] == selected_country]
        df = df.drop(columns=['country'])

        # Select only the chosen metrics (plus 'model' column)
        columns_to_show = ['model'] + [metric for metric in selected_metrics if metric in df.columns]

    else:
        # Select all metrics (plus 'country' and 'model' columns), if no country is selected or 'All' is selected for ease of comparison
        columns_to_show = ['model', 'country'] + selected_metrics

    # Sort by first selected metric by default
    if selected_metrics:
        df = df.sort_values(by=selected_metrics[0], ascending=False)

    df = df[columns_to_show]

    # Format numeric columns to 4 decimal places
    numeric_cols = df.select_dtypes(include=['float64']).columns
    df[numeric_cols] = df[numeric_cols].round(4)

    return df

def update_leaderboard_multilingual(country, selected_metrics):
    if not selected_metrics:  # If no metrics selected, show all
        selected_metrics = metrics
    df = load_leaderboard_multilingual()
    display_df = create_leaderboard_display_multilingual(df, country, selected_metrics)
    return display_df

def update_leaderboard_one_vs_all(target_language, selected_languages):
    if not selected_languages:  # If no language selected, show all defaults
        selected_languages = default_languages
    df = load_leaderboard_one_vs_all()
    display_df, selected_languages = create_leaderboard_display_one_vs_all(df, target_language, selected_languages)
    # to improve visibility in case the user chooses multiple language leading to many columns, the `model` column must remain fixed
    # display_df = render_fixed_columns(display_df)
    return display_df, selected_languages

def encode_image_to_base64(image_path):
    with open(image_path, "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read()).decode()
    return encoded_string

def create_html_image(image_path):
    # Get base64 string of image
    img_base64 = encode_image_to_base64(image_path)

    # Create HTML string with embedded image and centering styles
    html_string = f"""
    <div style="display: flex; justify-content: center; align-items: center; width: 100%; text-align: center;">
        <div style="max-width: 800px; margin: auto;">
            <img src="data:image/jpeg;base64,{img_base64}"
                 style="max-width: 75%; height: auto; display: block; margin: 0 auto; margin-top: 50px;"
                 alt="Displayed Image">
        </div>
    </div>
    """
    return html_string

# Function to render HTML table with fixed 'model' column
def render_fixed_columns(df):
    style = """
    <style>
        .table-container {
            overflow-x: auto;
            position: relative;
            white-space: nowrap;
        }
        table {
            border-collapse: collapse;
            width: 100%;
        }
        th, td {
            border: 1px solid black;
            padding: 8px;
            text-align: left;
        }
        th.fixed, td.fixed {
            position: sticky;
            left: 0;
            background-color: white;
            z-index: 2;
        }
    </style>
    """
    table_html = df.to_html(index=False).replace(
        "<th>model</th>", '<th class="fixed">model</th>'
    ).replace(
        '<td>', '<td class="fixed">', 1
    )
    return f"{style}<div class='table-container'>{table_html}</div>"
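
The practical effect of this commit is on the one-vs-all leaderboard's default columns: with no languages selected, update_leaderboard_one_vs_all falls back to default_languages, which now contains only 'MSA'. The minimal sketch below illustrates that behaviour; it is not part of the commit, and it assumes utils.py is importable from the same directory (importing it downloads the atlasia/Arabic-LID-Leaderboard test split) and that darija_leaderboard_binary.json exists next to it with a 'Morocco' target entry that includes an 'MSA' column.

# Hypothetical check of the new default, not part of the commit.
# Assumes utils.py and darija_leaderboard_binary.json are present and populated.
from utils import update_leaderboard_one_vs_all

# An empty selection falls back to default_languages, which is now just ['MSA'].
display_df, shown_languages = update_leaderboard_one_vs_all(
    target_language='Morocco',
    selected_languages=[],
)
print(shown_languages)    # expected: ['MSA']
print(display_df.head())  # columns: 'model' and 'MSA' (false positive rates)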