Commit
·
04e4741
1
Parent(s):
e2bd164
all constants in a single file
Browse files- constants.py +93 -0
constants.py
ADDED
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from datasets import load_dataset
|
2 |
+
|
3 |
+
|
4 |
+
# Constant values: Hugging Face dataset repository id and the JSON file names
# used for the two leaderboards.
DATA_PATH = "atlasia/Arabic-LID-Leaderboard"
DIALECT_CONFUSION_LEADERBOARD_FILE = "darija_leaderboard_dialect_confusion.json"
MULTI_DIALECTS_LEADERBOARD_FILE = "darija_leaderboard_multi_dialects.json"
|
8 |
+
|
9 |
+
# Names of the classification metrics known to the leaderboard.
# Order is preserved from the original definition; note that the last entry,
# 'n_test_samples', is a sample count rather than a score.
metrics = [
    'f1_score',
    'precision',
    'recall',
    'false_positive_rate',
    'false_negative_rate',
    'weighted_f1_score',
    'macro_f1_score',
    'micro_f1_score',
    'balanced_accuracy',
    'matthews_correlation',
    'specificity',
    'negative_predictive_value',
    'n_test_samples',
]
|
25 |
+
|
26 |
+
# Mapping from a language code with an `_Arab` script suffix to the display
# label used for it. The labels are not all country names: some are regions
# or language varieties (e.g. 'MSA', 'Levantine', 'Acehnese'), and several
# codes deliberately share one label (many-to-one mapping).
language_mapping_dict = {
    'ace_Arab': 'Acehnese',
    'acm_Arab': 'Mesopotamia',  # 'Gilit Mesopotamian'
    'aeb_Arab': 'Tunisia',
    'ajp_Arab': 'Levantine',  # 'South Levantine'
    'apc_Arab': 'Levantine',
    'arb_Arab': 'MSA',
    'arq_Arab': 'Algeria',
    'ars_Arab': 'Saudi',  # Najdi is primarily Saudi Arabian
    'ary_Arab': 'Morocco',
    'arz_Arab': 'Egypt',
    'ayp_Arab': 'Mesopotamia',  # 'North Mesopotamian'
    'azb_Arab': 'Azerbaijan',  # South Azerbaijani pertains to this region
    'bcc_Arab': 'Balochistan',  # Southern Balochi is from Balochistan
    'bjn_Arab': 'Indonesia',  # Banjar is spoken in Indonesia
    'brh_Arab': 'Pakistan',  # Brahui is spoken in Pakistan
    'ckb_Arab': 'Kurdistan',  # Central Kurdish is mainly in Iraq
    'fuv_Arab': 'Nigeria',  # Hausa States Fulfulde
    'glk_Arab': 'Iran',  # Gilaki is spoken in Iran
    'hac_Arab': 'Iran',  # Gurani is also primarily spoken in Iran
    'kas_Arab': 'Kashmir',
    'knc_Arab': 'Nigeria',  # Central Kanuri is in Nigeria
    'lki_Arab': 'Iran',  # Laki is from Iran
    'lrc_Arab': 'Iran',  # Northern Luri is from Iran
    'min_Arab': 'Indonesia',  # Minangkabau is spoken in Indonesia
    'mzn_Arab': 'Iran',  # Mazanderani is spoken in Iran
    'ota_Arab': 'Turkey',  # Ottoman Turkish
    'pbt_Arab': 'Afghanistan',  # Southern Pashto
    'pnb_Arab': 'Pakistan',  # Western Panjabi
    'sdh_Arab': 'Iraq',  # Southern Kurdish
    'shu_Arab': 'Chad',  # Chadian Arabic
    'skr_Arab': 'Pakistan',  # Saraiki
    'snd_Arab': 'Pakistan',  # Sindhi
    'sus_Arab': 'Guinea',  # Susu
    'tuk_Arab': 'Turkmenistan',  # Turkmen
    'uig_Arab': 'Uighur (China)',  # Uighur
    'urd_Arab': 'Pakistan',  # Urdu
    'uzs_Arab': 'Uzbekistan',  # Southern Uzbek
    'zsm_Arab': 'Malaysia',  # Standard Malay
}
|
67 |
+
|
68 |
+
# Default values: the label selected as the target by default, and the
# default state of the binary flag.
target_label = "Morocco"
is_binary = False
|
71 |
+
|
72 |
+
# Default metrics to display in the multilingual leaderboard.
default_metrics = [
    'f1_score',
    'false_positive_rate',
]
|
77 |
+
|
78 |
+
# Default languages to display in the one-vs-all leaderboard.
# Only 'MSA' is enabled; the commented-out labels below are currently
# disabled candidates.
default_languages = [
    'MSA',
    # 'Egypt',
    # 'Algeria',
    # 'Tunisia',
    # 'Levantine',
]
|
86 |
+
|
87 |
+
# Load the evaluation dataset (its 'test' split).
# NOTE: this runs when the module is imported, so importing this module
# triggers the dataset load.
eval_dataset = load_dataset(DATA_PATH, split='test')

# Supported dialects: the distinct values of the dataset's "dialect" column,
# plus the extra 'All' option.
all_target_languages = list(eval_dataset.unique("dialect"))
supported_dialects = all_target_languages + ['All']
# Everything except 'All'. This is the same list object as
# all_target_languages (an alias, not a copy).
languages_to_display_one_vs_all = all_target_languages
|