import base64
import json
import os

import fasttext
import numpy as np
import pandas as pd
from datasets import load_dataset
from huggingface_hub import hf_hub_download
from sklearn.metrics import (
    balanced_accuracy_score,
    confusion_matrix,
    f1_score,
    matthews_corrcoef,
    precision_score,
    recall_score,
)

MODEL_REPO = "atlasia/Sfaya-Moroccan-Darija-vs-All"
BIN_FILENAME = "model_multi_v3_2fpr.bin"
BINARY_LEADERBOARD_FILE = "darija_leaderboard_binary.json"
MULTILINGUAL_LEADERBOARD_FILE = "darija_leaderboard_multilingual.json"
DATA_PATH = "atlasia/No-Arabic-Dialect-Left-Behind-Filtered-Balanced"

target_label = "Morocco"
is_binary = False

metrics = [
    'f1_score',
    'precision',
    'recall',
    'specificity',
    'false_positive_rate',
    'false_negative_rate',
    'negative_predictive_value',
    'n_test_samples',
]

default_metrics = [
    'f1_score',
    'precision',
    'recall',
    'false_positive_rate',
    'false_negative_rate',
]

# Maps Arabic-script language codes to country/region display names.
language_mapping_dict = {
    'ace_Arab': 'Acehnese',
    'acm_Arab': 'Mesopotamia',
    'aeb_Arab': 'Tunisia',
    'ajp_Arab': 'Levantine',
    'apc_Arab': 'Levantine',
    'arb_Arab': 'MSA',
    'arq_Arab': 'Algeria',
    'ars_Arab': 'Saudi',
    'ary_Arab': 'Morocco',
    'arz_Arab': 'Egypt',
    'ayp_Arab': 'Mesopotamia',
    'azb_Arab': 'Azerbaijan',
    'bcc_Arab': 'Balochistan',
    'bjn_Arab': 'Indonesia',
    'brh_Arab': 'Pakistan',
    'ckb_Arab': 'Kurdistan',
    'fuv_Arab': 'Nigeria',
    'glk_Arab': 'Iran',
    'hac_Arab': 'Iran',
    'kas_Arab': 'Kashmir',
    'knc_Arab': 'Nigeria',
    'lki_Arab': 'Iran',
    'lrc_Arab': 'Iran',
    'min_Arab': 'Indonesia',
    'mzn_Arab': 'Iran',
    'ota_Arab': 'Turkey',
    'pbt_Arab': 'Afghanistan',
    'pnb_Arab': 'Pakistan',
    'sdh_Arab': 'Iraq',
    'shu_Arab': 'Chad',
    'skr_Arab': 'Pakistan',
    'snd_Arab': 'Pakistan',
    'sus_Arab': 'Guinea',
    'tuk_Arab': 'Turkmenistan',
    'uig_Arab': 'Uighur (China)',
    'urd_Arab': 'Pakistan',
    'uzs_Arab': 'Uzbekistan',
    'zsm_Arab': 'Malaysia',
}

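# Example: language_mapping_dict.get('ary_Arab', 'Other') -> 'Morocco';
# codes missing from the mapping fall back to 'Other' via dict.get.
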
def predict_label(text, model, language_mapping_dict, use_mapping=False):
    """Predict the dialect label of a single text with a fastText model."""
    # fastText expects a single line of text.
    text = str(text).strip().replace('\n', ' ')

    if text == '':
        return 'Other'

    try:
        # Top-1 prediction: returns ([label], [confidence]).
        prediction = model.predict(text, 1)
        label = prediction[0][0].replace('__label__', '')
        confidence = prediction[1][0]  # currently unused, kept for debugging

        # Optionally map raw language codes to display names.
        if use_mapping:
            label = language_mapping_dict.get(label, 'Other')
        return label
    except Exception as e:
        print(f"Error processing text: {text}")
        print(f"Exception: {e}")
        # Return a plain string so callers always receive a label.
        return 'Error'

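# Example (a minimal sketch, assuming the default model above; the output
# label is illustrative):
#
#   path = hf_hub_download(repo_id=MODEL_REPO, filename=BIN_FILENAME)
#   model = fasttext.load_model(path)
#   predict_label("شنو كتدير؟", model, language_mapping_dict, use_mapping=True)
#   # -> 'Morocco'
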
def compute_classification_metrics(test_dataset):
    """
    Compute comprehensive classification metrics for each class.

    Args:
        test_dataset: Dataset (or DataFrame-convertible object) containing
            'dialect' as true labels and 'preds' as predicted labels.

    Returns:
        tuple: (pd.DataFrame with per-class metrics, dict of summary metrics).
    """
    data = pd.DataFrame(test_dataset)

    true_labels = list(data['dialect'])
    predicted_labels = list(data['preds'])

    # Build a stable label index over every label seen in either column.
    labels = sorted(list(set(true_labels + predicted_labels)))
    label_to_index = {label: index for index, label in enumerate(labels)}

    true_indices = [label_to_index[label] for label in true_labels]
    pred_indices = [label_to_index[label] for label in predicted_labels]

    f1_scores = f1_score(true_indices, pred_indices, average=None, labels=range(len(labels)))
    precision_scores = precision_score(true_indices, pred_indices, average=None, labels=range(len(labels)))
    recall_scores = recall_score(true_indices, pred_indices, average=None, labels=range(len(labels)))

    conf_mat = confusion_matrix(true_indices, pred_indices, labels=range(len(labels)))

    # Per-class counts derived from the confusion matrix.
    FP = conf_mat.sum(axis=0) - np.diag(conf_mat)
    FN = conf_mat.sum(axis=1) - np.diag(conf_mat)
    TP = np.diag(conf_mat)
    TN = conf_mat.sum() - (FP + FN + TP)

    samples_per_class = np.bincount(true_indices, minlength=len(labels))

    # Guard against division by zero for classes absent from one side.
    with np.errstate(divide='ignore', invalid='ignore'):
        fp_rate = FP / (FP + TN)
        fn_rate = FN / (FN + TP)
        specificity = TN / (TN + FP)
        npv = TN / (TN + FN)

    # 'rates' rather than 'metrics' to avoid shadowing the module-level list.
    rates = [fp_rate, fn_rate, specificity, npv]
    rates = [np.nan_to_num(m, nan=0.0, posinf=0.0, neginf=0.0) for m in rates]
    fp_rate, fn_rate, specificity, npv = rates

    balanced_acc = balanced_accuracy_score(true_indices, pred_indices)
    mcc = matthews_corrcoef(true_indices, pred_indices)

    result_df = pd.DataFrame({
        'country': labels,
        'samples': samples_per_class,
        'f1_score': f1_scores,
        'precision': precision_scores,
        'recall': recall_scores,
        'specificity': specificity,
        'false_positive_rate': fp_rate,
        'false_negative_rate': fn_rate,
        'true_positives': TP,
        'false_positives': FP,
        'true_negatives': TN,
        'false_negatives': FN,
        'negative_predictive_value': npv,
    })

    result_df = result_df.sort_values('samples', ascending=False)

    summary_metrics = {
        'macro_f1': f1_score(true_indices, pred_indices, average='macro'),
        'weighted_f1': f1_score(true_indices, pred_indices, average='weighted'),
        'micro_f1': f1_score(true_indices, pred_indices, average='micro'),
        'balanced_accuracy': balanced_acc,
        'matthews_correlation': mcc,
    }

    numeric_cols = result_df.select_dtypes(include=[np.number]).columns
    result_df[numeric_cols] = result_df[numeric_cols].round(4)

    print(f'result_df: {result_df}')

    return result_df, summary_metrics

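# Example (a minimal sketch with toy labels, not real evaluation data):
#
#   demo = pd.DataFrame({
#       'dialect': ['Morocco', 'Morocco', 'Egypt', 'MSA'],
#       'preds':   ['Morocco', 'Egypt', 'Egypt', 'MSA'],
#   })
#   per_class, summary = compute_classification_metrics(demo)
#   summary['macro_f1']  # macro-averaged F1 over the three classes
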
def make_binary(dialect, target):
    """Collapse every non-target dialect into a single 'Other' class."""
    if dialect != target:
        return 'Other'
    return target

def run_eval_one_vs_all(model, data_test, TARGET_LANG='Morocco', language_mapping_dict=None, use_mapping=False):
    """One-vs-all evaluation: rate at which each dialect is predicted as TARGET_LANG."""
    print("[INFO] Running predictions...")
    data_test['preds'] = data_test['text'].apply(
        lambda text: predict_label(text, model, language_mapping_dict, use_mapping=use_mapping)
    )

    # Binarised view of the gold labels (target vs 'Other'), kept for inspection.
    df_test_preds = data_test.copy()
    df_test_preds['dialect'] = df_test_preds['dialect'].apply(lambda d: make_binary(d, TARGET_LANG))

    # Share of each dialect's samples assigned to each predicted label,
    # normalised by the dialect's test-set size.
    dialect_counts = data_test.groupby('dialect')['dialect'].count().reset_index(name='size')
    result_df = pd.merge(dialect_counts, data_test, on='dialect')
    result_df = result_df.groupby(['dialect', 'size', 'preds'])['preds'].count() / result_df.groupby(['dialect', 'size'])['preds'].count()
    result_df.sort_index(ascending=False, level='size', inplace=True)

    # Keep only rows where the prediction equals the target: for non-target
    # dialects this is the false positive rate. 'size' is retained so
    # update_darija_binary_leaderboard can record n_test_samples.
    out = result_df.copy()
    out.name = 'false_positive_rate'
    out = out.reset_index()
    out = out[out['preds'] == TARGET_LANG].drop(columns=['preds'])

    return out

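# Example (a minimal sketch, assuming a loaded fastText `model` and a test
# DataFrame with 'text' and 'dialect' columns; the model name is hypothetical):
#
#   fpr_df = run_eval_one_vs_all(model, df_test, TARGET_LANG='Morocco',
#                                language_mapping_dict=language_mapping_dict,
#                                use_mapping=True)
#   update_darija_binary_leaderboard(fpr_df, 'my-org/my-model')
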
def update_darija_binary_leaderboard(result_df, model_name, BINARY_LEADERBOARD_FILE="darija_leaderboard_binary.json"):
    try:
        with open(BINARY_LEADERBOARD_FILE, "r") as f:
            data = json.load(f)
    except FileNotFoundError:
        data = []

    for _, row in result_df.iterrows():
        country = row['dialect']

        # Skip the aggregated 'Other' bucket.
        if country == 'Other':
            continue

        # Find the existing country entry or create a new one.
        country_entry = next((item for item in data if country in item), None)
        if country_entry is None:
            country_entry = {country: {}}
            data.append(country_entry)

        if country not in country_entry:
            country_entry[country] = {}
        country_entry[country][model_name] = float(row['false_positive_rate'])

        # Record the test-set size once per country.
        if country_entry[country].get("n_test_samples") is None:
            country_entry[country]["n_test_samples"] = int(row['size'])

    # Write back to the binary leaderboard file (not the multilingual one).
    with open(BINARY_LEADERBOARD_FILE, "w") as f:
        json.dump(data, f, indent=4)

def handle_evaluation(model_path, model_path_bin, use_mapping=False):
    result_df, _ = run_eval(model_path, model_path_bin, language_mapping_dict, use_mapping=use_mapping)

    model_name = model_path + '/' + model_path_bin

    update_darija_multilingual_leaderboard(result_df, model_name, MULTILINGUAL_LEADERBOARD_FILE)

    df = load_leaderboard_multilingual()

    return create_leaderboard_display_multilingual(df, 'Morocco', default_metrics)

def run_eval(model_path, model_path_bin, language_mapping_dict=None, use_mapping=False):
    """Run evaluation on the test split and compute per-class metrics.

    Args:
        model_path (str): Hugging Face repo id of the model to evaluate.
        model_path_bin (str): Filename of the fastText .bin file in that repo.
        language_mapping_dict (dict): Optional mapping from raw labels to display names.
        use_mapping (bool): If True, map predicted labels through language_mapping_dict.

    Returns:
        tuple: (pd.DataFrame of per-class metrics, pd.DataFrame of predictions).
    """
    model_path = hf_hub_download(repo_id=model_path, filename=model_path_bin, cache_dir=None)

    print(f"[INFO] Loading model from path: {model_path}, using version {model_path_bin}...")
    model = fasttext.load_model(model_path)

    print(f"[INFO] Loading evaluation dataset from: {DATA_PATH}...")
    eval_dataset = load_dataset(DATA_PATH, split='test')

    print("[INFO] Converting evaluation dataset to Pandas DataFrame...")
    df_eval = pd.DataFrame(eval_dataset)

    print("[INFO] Running predictions...")
    df_eval['preds'] = df_eval['text'].apply(
        lambda text: predict_label(text, model, language_mapping_dict, use_mapping=use_mapping)
    )

    # Keep only the label columns needed for metric computation.
    df_eval = df_eval.drop(columns=['text', 'metadata', 'dataset_source'])

    print("[INFO] Computing metrics...")
    result_df, _ = compute_classification_metrics(df_eval)

    return result_df, df_eval

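# Example (a minimal sketch using the module defaults; downloads the model and
# dataset from the Hub, so it needs network access):
#
#   result_df, preds_df = run_eval(MODEL_REPO, BIN_FILENAME,
#                                  language_mapping_dict, use_mapping=True)
#   print(result_df.head())
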
def process_results_file(file, uploaded_model_name, base_path_save="./atlasia/submissions/"):
    try:
        if file is None:
            return "Please upload a file."

        uploaded_model_name = uploaded_model_name.strip().replace(" ", "_")
        print(f"[INFO] uploaded_model_name: {uploaded_model_name}")

        path_saving = os.path.join(base_path_save, uploaded_model_name)
        os.makedirs(path_saving, exist_ok=True)

        saved_file_path = os.path.join(path_saving, 'submission.csv')

        # The submitted CSV must contain 'dialect' (gold) and 'preds'
        # (predicted) columns for compute_classification_metrics below.
        print("[INFO] Loading results...")
        df_eval = pd.read_csv(file.name)

        print(f"[INFO] Saving the file locally in: {saved_file_path}")
        df_eval.to_csv(saved_file_path, index=False)

    except Exception as e:
        return f"Error processing file: {str(e)}"

    print("[INFO] Computing metrics...")
    result_df, _ = compute_classification_metrics(df_eval)

    update_darija_multilingual_leaderboard(result_df, uploaded_model_name, MULTILINGUAL_LEADERBOARD_FILE)

    df = load_leaderboard_multilingual()

    return create_leaderboard_display_multilingual(df, 'Morocco', default_metrics)

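# Example (a minimal sketch; the 'file' argument only needs a .name attribute
# pointing at a CSV with 'dialect' and 'preds' columns, so a simple namespace
# with a hypothetical path works for local testing):
#
#   import types
#   fake_upload = types.SimpleNamespace(name="predictions.csv")
#   process_results_file(fake_upload, "my-org/my-model")
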
def update_darija_multilingual_leaderboard(result_df, model_name, MULTILINGUAL_LEADERBOARD_FILE="darija_leaderboard_multilingual.json"):
    # Resolve the leaderboard file relative to this script.
    current_dir = os.path.dirname(os.path.abspath(__file__))
    MULTILINGUAL_LEADERBOARD_FILE = os.path.join(current_dir, MULTILINGUAL_LEADERBOARD_FILE)

    try:
        with open(MULTILINGUAL_LEADERBOARD_FILE, "r") as f:
            data = json.load(f)
    except FileNotFoundError:
        data = []

    for _, row in result_df.iterrows():
        country = row['country']

        if country == 'Other':
            continue

        # 'row_metrics' rather than 'metrics' to avoid shadowing the module-level list.
        row_metrics = {
            'f1_score': float(row['f1_score']),
            'precision': float(row['precision']),
            'recall': float(row['recall']),
            'specificity': float(row['specificity']),
            'false_positive_rate': float(row['false_positive_rate']),
            'false_negative_rate': float(row['false_negative_rate']),
            'negative_predictive_value': float(row['negative_predictive_value']),
            'n_test_samples': int(row['samples']),
        }

        # Find the existing country entry or create a new one.
        country_entry = next((item for item in data if country in item), None)
        if country_entry is None:
            country_entry = {country: {}}
            data.append(country_entry)

        if country not in country_entry:
            country_entry[country] = {}
        country_entry[country][model_name] = row_metrics

    with open(MULTILINGUAL_LEADERBOARD_FILE, "w") as f:
        json.dump(data, f, indent=4)

def load_leaderboard_multilingual(MULTILINGUAL_LEADERBOARD_FILE="darija_leaderboard_multilingual.json"):
    current_dir = os.path.dirname(os.path.abspath(__file__))
    MULTILINGUAL_LEADERBOARD_FILE = os.path.join(current_dir, MULTILINGUAL_LEADERBOARD_FILE)

    with open(MULTILINGUAL_LEADERBOARD_FILE, "r") as f:
        data = json.load(f)

    # Flatten the nested {country: {model: metrics}} structure into rows.
    rows = []
    for country_data in data:
        for country, models in country_data.items():
            for model_name, model_metrics in models.items():
                row = {
                    'country': country,
                    'model': model_name,
                }
                row.update(model_metrics)
                rows.append(row)

    df = pd.DataFrame(rows)
    return df

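# Illustrative shape of the leaderboard JSON flattened above (hypothetical
# model name and values):
#
#   [
#       {"Morocco": {"my-org/my-model": {"f1_score": 0.91, "precision": 0.9,
#                                        ..., "n_test_samples": 1000}}},
#       {"Egypt": {"my-org/my-model": {...}}}
#   ]
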
def create_leaderboard_display_multilingual(df, selected_country, selected_metrics):
    # Filter by a single country unless 'All' is selected.
    if selected_country and selected_country.upper() != 'ALL':
        print(f"Filtering leaderboard by country: {selected_country}")
        df = df[df['country'] == selected_country].copy()
        df = df.drop(columns=['country'])

        columns_to_show = ['model'] + [metric for metric in selected_metrics if metric in df.columns]
    else:
        columns_to_show = ['model', 'country'] + [metric for metric in selected_metrics if metric in df.columns]

    # Sort by the first selected metric, best first.
    if selected_metrics and selected_metrics[0] in df.columns:
        df = df.sort_values(by=selected_metrics[0], ascending=False)

    df = df[columns_to_show]

    numeric_cols = df.select_dtypes(include=['float64']).columns
    df[numeric_cols] = df[numeric_cols].round(4)

    return df

def update_leaderboard_multilingual(country, selected_metrics):
    if not selected_metrics:
        selected_metrics = metrics
    df = load_leaderboard_multilingual()
    display_df = create_leaderboard_display_multilingual(df, country, selected_metrics)
    return display_df

def encode_image_to_base64(image_path):
    with open(image_path, "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read()).decode()
    return encoded_string

def create_html_image(image_path):
    # Embed the image as base64 so it renders without a static file route.
    img_base64 = encode_image_to_base64(image_path)

    html_string = f"""
    <div style="display: flex; justify-content: center; align-items: center; width: 100%; text-align: center;">
        <div style="max-width: 800px; margin: auto;">
            <img src="data:image/jpeg;base64,{img_base64}"
                 style="max-width: 75%; height: auto; display: block; margin: 0 auto; margin-top: 50px;"
                 alt="Displayed Image">
        </div>
    </div>
    """
    return html_string
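
# Example (a minimal sketch; "logo.jpg" is a hypothetical image file next to
# this script):
#
#   html = create_html_image("logo.jpg")  # <div> with the image embedded inline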