BounharAbdelaziz committed on
Commit
04e4741
·
1 Parent(s): e2bd164

all constants in a single file

Browse files
Files changed (1) hide show
  1. constants.py +93 -0
constants.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from datasets import load_dataset

# Constant values.
# DATA_PATH is the dataset identifier handed to `load_dataset` further down
# in this module; the two *_FILE constants are JSON file names for the
# dialect-confusion and multi-dialect leaderboards.
DATA_PATH = "atlasia/Arabic-LID-Leaderboard"
DIALECT_CONFUSION_LEADERBOARD_FILE = "darija_leaderboard_dialect_confusion.json"
MULTI_DIALECTS_LEADERBOARD_FILE = "darija_leaderboard_multi_dialects.json"
# Classification metrics: the metric names known to this module, in the
# order they are listed here. 'n_test_samples' is a sample count rather than
# a score, but is kept in the same list.
metrics = [
    'f1_score',
    'precision',
    'recall',
    'false_positive_rate',
    'false_negative_rate',
    'weighted_f1_score',
    'macro_f1_score',
    'micro_f1_score',
    'balanced_accuracy',
    'matthews_correlation',
    'specificity',
    'negative_predictive_value',
    'n_test_samples',
]
# Mapping from a language code (three-letter code + `_Arab` script suffix)
# to the label shown for it. The label is usually a country name, but some
# entries map to a region ('Levantine', 'Mesopotamia', 'Kurdistan') or to a
# language name ('Acehnese', 'MSA'). Several codes intentionally share the
# same label, so the mapping is many-to-one.
language_mapping_dict = {
    'ace_Arab': 'Acehnese',
    'acm_Arab': 'Mesopotamia',  # 'Gilit Mesopotamian'
    'aeb_Arab': 'Tunisia',
    'ajp_Arab': 'Levantine',  # 'South Levantine'
    'apc_Arab': 'Levantine',
    'arb_Arab': 'MSA',
    'arq_Arab': 'Algeria',
    'ars_Arab': 'Saudi',  # Najdi is primarily Saudi Arabian
    'ary_Arab': 'Morocco',
    'arz_Arab': 'Egypt',
    'ayp_Arab': 'Mesopotamia',  # 'North Mesopotamian'
    'azb_Arab': 'Azerbaijan',  # South Azerbaijani pertains to this region
    'bcc_Arab': 'Balochistan',  # Southern Balochi is from Balochistan
    'bjn_Arab': 'Indonesia',  # Banjar is spoken in Indonesia
    'brh_Arab': 'Pakistan',  # Brahui is spoken in Pakistan
    'ckb_Arab': 'Kurdistan',  # Central Kurdish is mainly in Iraq
    'fuv_Arab': 'Nigeria',  # Hausa States Fulfulde
    'glk_Arab': 'Iran',  # Gilaki is spoken in Iran
    'hac_Arab': 'Iran',  # Gurani is also primarily spoken in Iran
    'kas_Arab': 'Kashmir',
    'knc_Arab': 'Nigeria',  # Central Kanuri is in Nigeria
    'lki_Arab': 'Iran',  # Laki is from Iran
    'lrc_Arab': 'Iran',  # Northern Luri is from Iran
    'min_Arab': 'Indonesia',  # Minangkabau is spoken in Indonesia
    'mzn_Arab': 'Iran',  # Mazanderani is spoken in Iran
    'ota_Arab': 'Turkey',  # Ottoman Turkish
    'pbt_Arab': 'Afghanistan',  # Southern Pashto
    'pnb_Arab': 'Pakistan',  # Western Panjabi
    'sdh_Arab': 'Iraq',  # Southern Kurdish
    'shu_Arab': 'Chad',  # Chadian Arabic
    'skr_Arab': 'Pakistan',  # Saraiki
    'snd_Arab': 'Pakistan',  # Sindhi
    'sus_Arab': 'Guinea',  # Susu
    'tuk_Arab': 'Turkmenistan',  # Turkmen
    'uig_Arab': 'Uighur (China)',  # Uighur
    'urd_Arab': 'Pakistan',  # Urdu
    'uzs_Arab': 'Uzbekistan',  # Southern Uzbek
    'zsm_Arab': 'Malaysia',  # Standard Malay
}
# Default values.
# `target_label` uses the same label vocabulary as the values of
# `language_mapping_dict` ("Morocco" is the label of 'ary_Arab').
target_label = "Morocco"
# Binary mode is off by default.
is_binary = False
# Default metrics to display in the multilingual leaderboard.
# Both names also appear in the full `metrics` list of this module.
default_metrics = [
    'f1_score',
    'false_positive_rate',
]
# Default languages to display in the one-vs-all leaderboard.
# Only 'MSA' is enabled; the commented-out labels are kept as ready-made
# alternatives that can be switched on by uncommenting them.
default_languages = [
    'MSA',
    # 'Egypt',
    # 'Algeria',
    # 'Tunisia',
    # 'Levantine',
]
# Load the evaluation dataset. This runs at import time: importing this
# module calls `load_dataset` on DATA_PATH and keeps the 'test' split.
eval_dataset = load_dataset(DATA_PATH, split='test')

# Supported dialects: the distinct values found in the dataset's "dialect"
# column, plus an extra 'All' entry.
all_target_languages = list(eval_dataset.unique("dialect"))
supported_dialects = all_target_languages + ['All']
# NOTE: this name is bound to the very same list object as
# `all_target_languages` (no copy is made), so an in-place change to one is
# visible through the other.
languages_to_display_one_vs_all = all_target_languages  # everything except All