Easonwangzk committed
Commit 432ed18 · verified · 1 parent: d23beb3

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +3 -0
  2. README.md +206 -0
  3. adapter_config.json +41 -0
  4. adapter_model.safetensors +3 -0
  5. checkpoint-4375/README.md +206 -0
  6. checkpoint-4375/adapter_config.json +41 -0
  7. checkpoint-4375/adapter_model.safetensors +3 -0
  8. checkpoint-4375/optimizer.pt +3 -0
  9. checkpoint-4375/rng_state.pth +3 -0
  10. checkpoint-4375/scheduler.pt +3 -0
  11. checkpoint-4375/trainer_state.json +653 -0
  12. checkpoint-4375/training_args.bin +3 -0
  13. cls/README.md +206 -0
  14. cls/adapter_config.json +38 -0
  15. cls/adapter_model.safetensors +3 -0
  16. cls/checkpoint-4375/model.safetensors +3 -0
  17. cls/checkpoint-4375/optimizer.pt +3 -0
  18. cls/checkpoint-4375/rng_state.pth +3 -0
  19. cls/checkpoint-4375/scheduler.pt +3 -0
  20. cls/checkpoint-4375/trainer_state.json +653 -0
  21. cls/checkpoint-4375/training_args.bin +3 -0
  22. cls/classifier_head.pt +3 -0
  23. cls/id2label.json +22 -0
  24. cls/label2id.json +22 -0
  25. cls/sentencepiece.bpe.model +3 -0
  26. cls/special_tokens_map.json +15 -0
  27. cls/tokenizer.json +3 -0
  28. cls/tokenizer_config.json +55 -0
  29. id2label.json +22 -0
  30. label2id.json +22 -0
  31. mean/README.md +206 -0
  32. mean/adapter_config.json +38 -0
  33. mean/adapter_model.safetensors +3 -0
  34. mean/checkpoint-4375/model.safetensors +3 -0
  35. mean/checkpoint-4375/optimizer.pt +3 -0
  36. mean/checkpoint-4375/rng_state.pth +3 -0
  37. mean/checkpoint-4375/scheduler.pt +3 -0
  38. mean/checkpoint-4375/trainer_state.json +653 -0
  39. mean/checkpoint-4375/training_args.bin +3 -0
  40. mean/classifier_head.pt +3 -0
  41. mean/id2label.json +22 -0
  42. mean/label2id.json +22 -0
  43. mean/sentencepiece.bpe.model +3 -0
  44. mean/special_tokens_map.json +15 -0
  45. mean/tokenizer.json +3 -0
  46. mean/tokenizer_config.json +55 -0
  47. sentencepiece.bpe.model +3 -0
  48. special_tokens_map.json +15 -0
  49. tokenizer.json +3 -0
  50. tokenizer_config.json +55 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ cls/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ mean/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
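These three additions mark each copy of tokenizer.json as Git LFS-tracked, which huggingface_hub does automatically for large files when pushing a folder. A minimal sketch of the upload call that produces a commit like this one (the repo id and local path are placeholders, not taken from this repo):

```python
# Sketch: push a local training-output folder to the Hub. upload_folder
# creates a single commit ("Upload folder using huggingface_hub") and
# registers large files such as tokenizer.json in .gitattributes for LFS.
from huggingface_hub import upload_folder

upload_folder(
    repo_id="your-username/lora-xlmr-langid",  # placeholder repo id
    folder_path="lora-xlmr-langid",            # placeholder local path
    commit_message="Upload folder using huggingface_hub",
)
```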
README.md ADDED
@@ -0,0 +1,206 @@
+ ---
+ base_model: xlm-roberta-base
+ library_name: peft
+ tags:
+ - base_model:adapter:xlm-roberta-base
+ - lora
+ - transformers
+ ---
+
+ # Model Card for Model ID
+
+ <!-- Provide a quick summary of what the model is/does. -->
+
+
+
+ ## Model Details
+
+ ### Model Description
+
+ <!-- Provide a longer summary of what this model is. -->
+
+
+
+ - **Developed by:** [More Information Needed]
+ - **Funded by [optional]:** [More Information Needed]
+ - **Shared by [optional]:** [More Information Needed]
+ - **Model type:** [More Information Needed]
+ - **Language(s) (NLP):** [More Information Needed]
+ - **License:** [More Information Needed]
+ - **Finetuned from model [optional]:** [More Information Needed]
+
+ ### Model Sources [optional]
+
+ <!-- Provide the basic links for the model. -->
+
+ - **Repository:** [More Information Needed]
+ - **Paper [optional]:** [More Information Needed]
+ - **Demo [optional]:** [More Information Needed]
+
+ ## Uses
+
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+ ### Direct Use
+
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+ [More Information Needed]
+
+ ### Downstream Use [optional]
+
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+ [More Information Needed]
+
+ ### Out-of-Scope Use
+
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+ [More Information Needed]
+
+ ## Bias, Risks, and Limitations
+
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+ [More Information Needed]
+
+ ### Recommendations
+
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+ ## How to Get Started with the Model
+
+ Use the code below to get started with the model.
+
+ [More Information Needed]
+
+ ## Training Details
+
+ ### Training Data
+
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+ [More Information Needed]
+
+ ### Training Procedure
+
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+ #### Preprocessing [optional]
+
+ [More Information Needed]
+
+
+ #### Training Hyperparameters
+
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+ #### Speeds, Sizes, Times [optional]
+
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+ [More Information Needed]
+
+ ## Evaluation
+
+ <!-- This section describes the evaluation protocols and provides the results. -->
+
+ ### Testing Data, Factors & Metrics
+
+ #### Testing Data
+
+ <!-- This should link to a Dataset Card if possible. -->
+
+ [More Information Needed]
+
+ #### Factors
+
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+ [More Information Needed]
+
+ #### Metrics
+
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+ [More Information Needed]
+
+ ### Results
+
+ [More Information Needed]
+
+ #### Summary
+
+
+
+ ## Model Examination [optional]
+
+ <!-- Relevant interpretability work for the model goes here -->
+
+ [More Information Needed]
+
+ ## Environmental Impact
+
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+ - **Hardware Type:** [More Information Needed]
+ - **Hours used:** [More Information Needed]
+ - **Cloud Provider:** [More Information Needed]
+ - **Compute Region:** [More Information Needed]
+ - **Carbon Emitted:** [More Information Needed]
+
+ ## Technical Specifications [optional]
+
+ ### Model Architecture and Objective
+
+ [More Information Needed]
+
+ ### Compute Infrastructure
+
+ [More Information Needed]
+
+ #### Hardware
+
+ [More Information Needed]
+
+ #### Software
+
+ [More Information Needed]
+
+ ## Citation [optional]
+
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+ **BibTeX:**
+
+ [More Information Needed]
+
+ **APA:**
+
+ [More Information Needed]
+
+ ## Glossary [optional]
+
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+ [More Information Needed]
+
+ ## More Information [optional]
+
+ [More Information Needed]
+
+ ## Model Card Authors [optional]
+
+ [More Information Needed]
+
+ ## Model Card Contact
+
+ [More Information Needed]
+ ### Framework versions
+
+ - PEFT 0.17.1
adapter_config.json ADDED
@@ -0,0 +1,41 @@
+ {
+   "alpha_pattern": {},
+   "auto_mapping": null,
+   "base_model_name_or_path": "xlm-roberta-base",
+   "bias": "none",
+   "corda_config": null,
+   "eva_config": null,
+   "exclude_modules": null,
+   "fan_in_fan_out": false,
+   "inference_mode": true,
+   "init_lora_weights": true,
+   "layer_replication": null,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 32,
+   "lora_bias": false,
+   "lora_dropout": 0.05,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": [
+     "classifier",
+     "score"
+   ],
+   "peft_type": "LORA",
+   "qalora_group_size": 16,
+   "r": 16,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "key",
+     "query",
+     "value"
+   ],
+   "target_parameters": null,
+   "task_type": "SEQ_CLS",
+   "trainable_token_indices": null,
+   "use_dora": false,
+   "use_qalora": false,
+   "use_rslora": false
+ }
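The config above describes a rank-16 LoRA (alpha 32, dropout 0.05) on the query/key/value projections of xlm-roberta-base, trained for sequence classification with the classifier head kept trainable via modules_to_save. A minimal loading sketch, assuming the adapter path (shown as a placeholder) points at this upload and the label map comes from the id2label.json shipped alongside it:

```python
# Sketch: attach the uploaded LoRA adapter to xlm-roberta-base for
# sequence classification. ADAPTER is a placeholder for this repo's id
# or a local path to the uploaded files.
import json

import torch
from peft import PeftModel
from transformers import AutoModelForSequenceClassification, AutoTokenizer

ADAPTER = "path/or/repo-id"  # placeholder

with open("id2label.json") as f:  # shipped in this commit
    id2label = {int(k): v for k, v in json.load(f).items()}

base = AutoModelForSequenceClassification.from_pretrained(
    "xlm-roberta-base", num_labels=len(id2label), id2label=id2label
)
model = PeftModel.from_pretrained(base, ADAPTER)  # loads adapter_model.safetensors
model.eval()

tok = AutoTokenizer.from_pretrained("xlm-roberta-base")
batch = tok("Bonjour tout le monde", return_tensors="pt")
with torch.no_grad():
    pred = model(**batch).logits.argmax(-1).item()
print(id2label[pred])
```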
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:415a2fc81835d4a32a0f4a0130dc4d7b3e54538057b8c505bbb81b7633f7cc4a
+ size 4761648
checkpoint-4375/README.md ADDED
@@ -0,0 +1,206 @@
Same 206-line model card template as README.md above.
checkpoint-4375/adapter_config.json ADDED
@@ -0,0 +1,41 @@
+ {
+   "alpha_pattern": {},
+   "auto_mapping": null,
+   "base_model_name_or_path": "xlm-roberta-base",
+   "bias": "none",
+   "corda_config": null,
+   "eva_config": null,
+   "exclude_modules": null,
+   "fan_in_fan_out": false,
+   "inference_mode": true,
+   "init_lora_weights": true,
+   "layer_replication": null,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 32,
+   "lora_bias": false,
+   "lora_dropout": 0.05,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": [
+     "classifier",
+     "score"
+   ],
+   "peft_type": "LORA",
+   "qalora_group_size": 16,
+   "r": 16,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "key",
+     "query",
+     "value"
+   ],
+   "target_parameters": null,
+   "task_type": "SEQ_CLS",
+   "trainable_token_indices": null,
+   "use_dora": false,
+   "use_qalora": false,
+   "use_rslora": false
+ }
checkpoint-4375/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:415a2fc81835d4a32a0f4a0130dc4d7b3e54538057b8c505bbb81b7633f7cc4a
+ size 4761648
checkpoint-4375/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:66d6726175fb892fccada29b5502d3264e53b6b2374fd8341dd077ebc3285a3f
+ size 9567371
checkpoint-4375/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ad5ca0f8bcf195c3eecf0d29502529a9b5292d77ddbc0d61c129ec324025167b
+ size 14645
checkpoint-4375/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a993a449c7a40aa61a80703106703284ce9b6fb450a8bd64d07585e0fc640e07
+ size 1465
checkpoint-4375/trainer_state.json ADDED
@@ -0,0 +1,653 @@
+ {
+   "best_global_step": 4375,
+   "best_metric": 0.9967,
+   "best_model_checkpoint": "lora-xlmr-langid/checkpoint-4375",
+   "epoch": 1.0,
+   "eval_steps": 500,
+   "global_step": 4375,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {"epoch": 0.011428571428571429, "grad_norm": 2.9476499557495117, "learning_rate": 3.7262357414448674e-05, "loss": 3.0186, "step": 50},
+     {"epoch": 0.022857142857142857, "grad_norm": 3.132594108581543, "learning_rate": 7.52851711026616e-05, "loss": 3.0152, "step": 100},
+     {"epoch": 0.03428571428571429, "grad_norm": 5.743668079376221, "learning_rate": 0.00011330798479087452, "loss": 2.8481, "step": 150},
+     {"epoch": 0.045714285714285714, "grad_norm": 6.232862949371338, "learning_rate": 0.00015133079847908746, "loss": 1.9112, "step": 200},
+     {"epoch": 0.05714285714285714, "grad_norm": 4.115293025970459, "learning_rate": 0.0001893536121673004, "loss": 0.7416, "step": 250},
+     {"epoch": 0.06857142857142857, "grad_norm": 2.165242910385132, "learning_rate": 0.00019996217828993133, "loss": 0.3754, "step": 300},
+     {"epoch": 0.08, "grad_norm": 1.7203718423843384, "learning_rate": 0.0001997842234752531, "loss": 0.1814, "step": 350},
+     {"epoch": 0.09142857142857143, "grad_norm": 0.9917824864387512, "learning_rate": 0.00019946067495120418, "loss": 0.1232, "step": 400},
+     {"epoch": 0.10285714285714286, "grad_norm": 4.663485050201416, "learning_rate": 0.0001989920048027309, "loss": 0.068, "step": 450},
+     {"epoch": 0.11428571428571428, "grad_norm": 0.9876736402511597, "learning_rate": 0.00019837889685963127, "loss": 0.0654, "step": 500},
+     {"epoch": 0.12571428571428572, "grad_norm": 0.21394599974155426, "learning_rate": 0.00019762224569878898, "loss": 0.069, "step": 550},
+     {"epoch": 0.13714285714285715, "grad_norm": 0.7479887008666992, "learning_rate": 0.00019672315533890932, "loss": 0.0534, "step": 600},
+     {"epoch": 0.14857142857142858, "grad_norm": 0.12780117988586426, "learning_rate": 0.00019568293762966147, "loss": 0.0264, "step": 650},
+     {"epoch": 0.16, "grad_norm": 0.13217857480049133, "learning_rate": 0.0001945031103375777, "loss": 0.0596, "step": 700},
+     {"epoch": 0.17142857142857143, "grad_norm": 3.1933023929595947, "learning_rate": 0.00019318539493150242, "loss": 0.0447, "step": 750},
+     {"epoch": 0.18285714285714286, "grad_norm": 0.5493296980857849, "learning_rate": 0.0001917317140708218, "loss": 0.0474, "step": 800},
+     {"epoch": 0.19428571428571428, "grad_norm": 0.10272103548049927, "learning_rate": 0.00019014418880013975, "loss": 0.0539, "step": 850},
+     {"epoch": 0.2057142857142857, "grad_norm": 0.21979080140590668, "learning_rate": 0.00018842513545449223, "loss": 0.0203, "step": 900},
+     {"epoch": 0.21714285714285714, "grad_norm": 0.19337095320224762, "learning_rate": 0.00018657706227961677, "loss": 0.0343, "step": 950},
+     {"epoch": 0.22857142857142856, "grad_norm": 1.1383503675460815, "learning_rate": 0.00018460266577220732, "loss": 0.0229, "step": 1000},
+     {"epoch": 0.24, "grad_norm": 0.08038519322872162, "learning_rate": 0.00018250482674549547, "loss": 0.0196, "step": 1050},
+     {"epoch": 0.25142857142857145, "grad_norm": 1.3838239908218384, "learning_rate": 0.00018028660612589743, "loss": 0.0228, "step": 1100},
+     {"epoch": 0.26285714285714284, "grad_norm": 10.174747467041016, "learning_rate": 0.00017795124048686103, "loss": 0.0424, "step": 1150},
+     {"epoch": 0.2742857142857143, "grad_norm": 0.14027394354343414, "learning_rate": 0.00017550213732642837, "loss": 0.0374, "step": 1200},
+     {"epoch": 0.2857142857142857, "grad_norm": 0.03382161259651184, "learning_rate": 0.00017294287009540494, "loss": 0.0166, "step": 1250},
+     {"epoch": 0.29714285714285715, "grad_norm": 0.07205367088317871, "learning_rate": 0.00017027717298338977, "loss": 0.0316, "step": 1300},
+     {"epoch": 0.30857142857142855, "grad_norm": 0.4876558482646942, "learning_rate": 0.0001675089354702732, "loss": 0.0474, "step": 1350},
+     {"epoch": 0.32, "grad_norm": 0.0657462626695633, "learning_rate": 0.0001646421966511539, "loss": 0.0137, "step": 1400},
+     {"epoch": 0.3314285714285714, "grad_norm": 0.043785881251096725, "learning_rate": 0.00016168113934295362, "loss": 0.0438, "step": 1450},
+     {"epoch": 0.34285714285714286, "grad_norm": 0.1223607212305069, "learning_rate": 0.0001586300839813298, "loss": 0.0285, "step": 1500},
+     {"epoch": 0.35428571428571426, "grad_norm": 0.036955248564481735, "learning_rate": 0.00015549348231679093, "loss": 0.0193, "step": 1550},
+     {"epoch": 0.3657142857142857, "grad_norm": 0.10787559300661087, "learning_rate": 0.00015227591091921205, "loss": 0.0258, "step": 1600},
+     {"epoch": 0.37714285714285717, "grad_norm": 0.05653952807188034, "learning_rate": 0.00014898206450022813, "loss": 0.0287, "step": 1650},
+     {"epoch": 0.38857142857142857, "grad_norm": 0.4728432297706604, "learning_rate": 0.00014561674906324873, "loss": 0.0357, "step": 1700},
+     {"epoch": 0.4, "grad_norm": 0.03136986121535301, "learning_rate": 0.00014218487489108813, "loss": 0.0173, "step": 1750},
+     {"epoch": 0.4114285714285714, "grad_norm": 0.019133005291223526, "learning_rate": 0.00013869144938144325, "loss": 0.014, "step": 1800},
+     {"epoch": 0.4228571428571429, "grad_norm": 0.025615772232413292, "learning_rate": 0.00013514156974067242, "loss": 0.0248, "step": 1850},
+     {"epoch": 0.4342857142857143, "grad_norm": 0.12288248538970947, "learning_rate": 0.00013154041554653577, "loss": 0.0421, "step": 1900},
+     {"epoch": 0.44571428571428573, "grad_norm": 0.22975103557109833, "learning_rate": 0.00012789324119074852, "loss": 0.0489, "step": 1950},
+     {"epoch": 0.45714285714285713, "grad_norm": 0.04648689553141594, "learning_rate": 0.00012420536821237444, "loss": 0.0178, "step": 2000},
+     {"epoch": 0.4685714285714286, "grad_norm": 0.034475695341825485, "learning_rate": 0.00012048217753324587, "loss": 0.045, "step": 2050},
+     {"epoch": 0.48, "grad_norm": 0.05342303588986397, "learning_rate": 0.00011672910160673858, "loss": 0.01, "step": 2100},
+     {"epoch": 0.49142857142857144, "grad_norm": 0.07373230904340744, "learning_rate": 0.00011295161649135815, "loss": 0.0243, "step": 2150},
+     {"epoch": 0.5028571428571429, "grad_norm": 0.11881459504365921, "learning_rate": 0.00010915523386070277, "loss": 0.0287, "step": 2200},
+     {"epoch": 0.5142857142857142, "grad_norm": 0.16949842870235443, "learning_rate": 0.0001053454929614603, "loss": 0.024, "step": 2250},
+     {"epoch": 0.5257142857142857, "grad_norm": 0.19728004932403564, "learning_rate": 0.00010152795253117406, "loss": 0.0097, "step": 2300},
+     {"epoch": 0.5371428571428571, "grad_norm": 3.5783822536468506, "learning_rate": 9.770818268756971e-05, "loss": 0.0379, "step": 2350},
+     {"epoch": 0.5485714285714286, "grad_norm": 0.03188510239124298, "learning_rate": 9.389175680127735e-05, "loss": 0.0368, "step": 2400},
+     {"epoch": 0.56, "grad_norm": 0.06983346492052078, "learning_rate": 9.008424336380778e-05, "loss": 0.0271, "step": 2450},
+     {"epoch": 0.5714285714285714, "grad_norm": 4.264764308929443, "learning_rate": 8.62911978626472e-05, "loss": 0.0226, "step": 2500},
+     {"epoch": 0.5828571428571429, "grad_norm": 0.02377461828291416, "learning_rate": 8.251815467532628e-05, "loss": 0.0111, "step": 2550},
+     {"epoch": 0.5942857142857143, "grad_norm": 0.05363013595342636, "learning_rate": 7.877061899429066e-05, "loss": 0.0223, "step": 2600},
+     {"epoch": 0.6057142857142858, "grad_norm": 0.5701267719268799, "learning_rate": 7.505405879435429e-05, "loss": 0.0219, "step": 2650},
+     {"epoch": 0.6171428571428571, "grad_norm": 0.030285466462373734, "learning_rate": 7.137389685445726e-05, "loss": 0.0307, "step": 2700},
+     {"epoch": 0.6285714285714286, "grad_norm": 0.10280855000019073, "learning_rate": 6.773550284536764e-05, "loss": 0.0201, "step": 2750},
+     {"epoch": 0.64, "grad_norm": 0.017326869070529938, "learning_rate": 6.414418549487308e-05, "loss": 0.0367, "step": 2800},
+     {"epoch": 0.6514285714285715, "grad_norm": 0.039858993142843246, "learning_rate": 6.060518484189344e-05, "loss": 0.0302, "step": 2850},
+     {"epoch": 0.6628571428571428, "grad_norm": 0.19618330895900726, "learning_rate": 5.712366459081577e-05, "loss": 0.0035, "step": 2900},
+     {"epoch": 0.6742857142857143, "grad_norm": 0.10122616589069366, "learning_rate": 5.37047045772089e-05, "loss": 0.0301, "step": 2950},
+     {"epoch": 0.6857142857142857, "grad_norm": 0.28158578276634216, "learning_rate": 5.035329335590868e-05, "loss": 0.0183, "step": 3000},
+     {"epoch": 0.6971428571428572, "grad_norm": 1.7570079565048218, "learning_rate": 4.707432092229059e-05, "loss": 0.0124, "step": 3050},
+     {"epoch": 0.7085714285714285, "grad_norm": 0.3031100630760193, "learning_rate": 4.387257157734841e-05, "loss": 0.0145, "step": 3100},
+     {"epoch": 0.72, "grad_norm": 0.014142443425953388, "learning_rate": 4.0752716946990246e-05, "loss": 0.0147, "step": 3150},
+     {"epoch": 0.7314285714285714, "grad_norm": 0.02221490442752838, "learning_rate": 3.7719309165737013e-05, "loss": 0.0085, "step": 3200},
+     {"epoch": 0.7428571428571429, "grad_norm": 0.03386814147233963, "learning_rate": 3.477677423476935e-05, "loss": 0.0178, "step": 3250},
+     {"epoch": 0.7542857142857143, "grad_norm": 0.020248012617230415, "learning_rate": 3.19294055640135e-05, "loss": 0.0325, "step": 3300},
+     {"epoch": 0.7657142857142857, "grad_norm": 0.03207193687558174, "learning_rate": 2.9181357707689438e-05, "loss": 0.0148, "step": 3350},
+     {"epoch": 0.7771428571428571, "grad_norm": 0.024402625858783722, "learning_rate": 2.6536640302461034e-05, "loss": 0.047, "step": 3400},
+     {"epoch": 0.7885714285714286, "grad_norm": 0.10441145300865173, "learning_rate": 2.399911221703377e-05, "loss": 0.0134, "step": 3450},
+     {"epoch": 0.8, "grad_norm": 0.03832864388823509, "learning_rate": 2.1572475921735357e-05, "loss": 0.0107, "step": 3500},
+     {"epoch": 0.8114285714285714, "grad_norm": 0.01320917159318924, "learning_rate": 1.9260272086295082e-05, "loss": 0.008, "step": 3550},
+     {"epoch": 0.8228571428571428, "grad_norm": 0.14558428525924683, "learning_rate": 1.706587441370433e-05, "loss": 0.03, "step": 3600},
+     {"epoch": 0.8342857142857143, "grad_norm": 0.08396098017692566, "learning_rate": 1.499248471769531e-05, "loss": 0.0131, "step": 3650},
+     {"epoch": 0.8457142857142858, "grad_norm": 0.027459578588604927, "learning_rate": 1.304312825102142e-05, "loss": 0.0118, "step": 3700},
+     {"epoch": 0.8571428571428571, "grad_norm": 0.22308436036109924, "learning_rate": 1.1220649291354902e-05, "loss": 0.0033, "step": 3750},
+     {"epoch": 0.8685714285714285, "grad_norm": 0.01619495451450348, "learning_rate": 9.527706991242502e-06, "loss": 0.0185, "step": 3800},
+     {"epoch": 0.88, "grad_norm": 0.0167935099452734, "learning_rate": 7.966771498174963e-06, "loss": 0.023, "step": 3850},
+     {"epoch": 0.8914285714285715, "grad_norm": 0.018898479640483856, "learning_rate": 6.540120350430423e-06, "loss": 0.0261, "step": 3900},
+     {"epoch": 0.9028571428571428, "grad_norm": 0.04563748463988304, "learning_rate": 5.24983515395161e-06, "loss": 0.0103, "step": 3950},
+     {"epoch": 0.9142857142857143, "grad_norm": 0.050685442984104156, "learning_rate": 4.097798545104914e-06, "loss": 0.005, "step": 4000},
+     {"epoch": 0.9257142857142857, "grad_norm": 0.13636836409568787, "learning_rate": 3.0856914437528805e-06, "loss": 0.0326, "step": 4050},
+     {"epoch": 0.9371428571428572, "grad_norm": 0.035096440464258194, "learning_rate": 2.2149906006486364e-06, "loss": 0.0196, "step": 4100},
+     {"epoch": 0.9485714285714286, "grad_norm": 6.301705837249756, "learning_rate": 1.4869664427303088e-06, "loss": 0.0169, "step": 4150},
+     {"epoch": 0.96, "grad_norm": 0.012595863081514835, "learning_rate": 9.026812194594448e-07, "loss": 0.0119, "step": 4200},
+     {"epoch": 0.9714285714285714, "grad_norm": 0.05827270448207855, "learning_rate": 4.629874529084477e-07, "loss": 0.011, "step": 4250},
+     {"epoch": 0.9828571428571429, "grad_norm": 0.012354250065982342, "learning_rate": 1.6852669385787334e-07, "loss": 0.0267, "step": 4300},
+     {"epoch": 0.9942857142857143, "grad_norm": 0.017665507271885872, "learning_rate": 1.9728585719092086e-08, "loss": 0.0364, "step": 4350},
+     {"epoch": 1.0, "eval_accuracy": 0.9967, "eval_f1_macro": 0.9967018731563405, "eval_loss": 0.014994239434599876, "eval_runtime": 7.0745, "eval_samples_per_second": 1413.524, "eval_steps_per_second": 44.243, "step": 4375}
+   ],
+   "logging_steps": 50,
+   "max_steps": 4375,
+   "num_input_tokens_seen": 0,
+   "num_train_epochs": 1,
+   "save_steps": 500,
+   "stateful_callbacks": {
+     "TrainerControl": {
+       "args": {"should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true},
+       "attributes": {}
+     }
+   },
+   "total_flos": 2293744193283840.0,
+   "train_batch_size": 16,
+   "trial_name": null,
+   "trial_params": null
+ }
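The log above records a single-epoch run: 4,375 steps at train batch size 16, training loss falling from about 3.02 to about 0.01, and a final eval accuracy of 0.9967. A small sketch, using only the standard library, of pulling the loss curve back out of a trainer_state.json like this one:

```python
# Minimal sketch: extract (step, loss) pairs from a Hugging Face Trainer
# checkpoint's trainer_state.json. Uses only the standard library.
import json

with open("checkpoint-4375/trainer_state.json") as f:
    state = json.load(f)

# Training entries carry "loss"; the final evaluation entry carries
# "eval_*" keys instead, so filter on the key we want.
curve = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
evals = [e for e in state["log_history"] if "eval_loss" in e]

print(f"{len(curve)} logged steps, final train loss {curve[-1][1]}")
for e in evals:
    print(f"step {e['step']}: eval_loss={e['eval_loss']}, acc={e.get('eval_accuracy')}")
```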
checkpoint-4375/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:061be453b71cc5f0316ab060db00b5b0ed085a09e132f09349edfeaf3c76dfc0
+ size 5841
cls/README.md ADDED
@@ -0,0 +1,206 @@
Same 206-line model card template as README.md above.
cls/adapter_config.json ADDED
@@ -0,0 +1,38 @@
+ {
+   "alpha_pattern": {},
+   "auto_mapping": null,
+   "base_model_name_or_path": "xlm-roberta-base",
+   "bias": "none",
+   "corda_config": null,
+   "eva_config": null,
+   "exclude_modules": null,
+   "fan_in_fan_out": false,
+   "inference_mode": true,
+   "init_lora_weights": true,
+   "layer_replication": null,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 32,
+   "lora_bias": false,
+   "lora_dropout": 0.05,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "qalora_group_size": 16,
+   "r": 16,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "key",
+     "query",
+     "value"
+   ],
+   "target_parameters": null,
+   "task_type": "FEATURE_EXTRACTION",
+   "trainable_token_indices": null,
+   "use_dora": false,
+   "use_qalora": false,
+   "use_rslora": false
+ }
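This variant differs from the root adapter: task_type is FEATURE_EXTRACTION and modules_to_save is null, and the cls/ folder ships a separate classifier_head.pt, which suggests the adapter produces [CLS] features for an external head. A hedged sketch of wiring the two together; that the head is a single Linear over the 768-dim [CLS] vector and that classifier_head.pt holds its state_dict are assumptions to verify against the file:

```python
# Hedged sketch: cls/ adapter as a feature extractor plus the separate
# classifier head. Assumes classifier_head.pt is a state_dict for one
# torch.nn.Linear mapping the 768-dim [CLS] vector to the label logits.
import json

import torch
from peft import PeftModel
from transformers import AutoModel, AutoTokenizer

base = AutoModel.from_pretrained("xlm-roberta-base")
model = PeftModel.from_pretrained(base, "cls")  # local cls/ folder from this upload
model.eval()

with open("cls/id2label.json") as f:
    id2label = {int(k): v for k, v in json.load(f).items()}

head = torch.nn.Linear(768, len(id2label))  # shape is an assumption
head.load_state_dict(torch.load("cls/classifier_head.pt", map_location="cpu"))
head.eval()

tok = AutoTokenizer.from_pretrained("cls")  # tokenizer files sit next to the adapter
batch = tok("Hola mundo", return_tensors="pt")
with torch.no_grad():
    cls_vec = model(**batch).last_hidden_state[:, 0]  # [CLS] token feature
    pred = head(cls_vec).argmax(-1).item()
print(id2label[pred])
```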
cls/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b0069df277693580cb1e9f2174db688694eef6282d376cb3e75724474e5e92a6
+ size 3548696
cls/checkpoint-4375/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9637f0a60a0eb82366143b9573cfd1624e0e9b65dd7a386e736f69ae0891e2db
+ size 559727136
cls/checkpoint-4375/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:79695aaa1294db864d2ae12d0c3c57e187239591e32d4257830a45e2b5c77384
+ size 7264779
cls/checkpoint-4375/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fd939a99316e4f9a37f179832cf4a3356001e3909c993b3e827da5cd4a4617ba
+ size 14645
cls/checkpoint-4375/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a993a449c7a40aa61a80703106703284ce9b6fb450a8bd64d07585e0fc640e07
+ size 1465
cls/checkpoint-4375/trainer_state.json ADDED
@@ -0,0 +1,653 @@
+ {
+   "best_global_step": 4375,
+   "best_metric": 0.9965,
+   "best_model_checkpoint": "lora-xlmr-langid/cls/checkpoint-4375",
+   "epoch": 1.0,
+   "eval_steps": 500,
+   "global_step": 4375,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {"epoch": 0.011428571428571429, "grad_norm": 3.6967201232910156, "learning_rate": 3.7262357414448674e-05, "loss": 3.0631, "step": 50},
+     {"epoch": 0.022857142857142857, "grad_norm": 4.938446521759033, "learning_rate": 7.52851711026616e-05, "loss": 3.0133, "step": 100},
+     {"epoch": 0.03428571428571429, "grad_norm": 6.178889274597168, "learning_rate": 0.00011330798479087452, "loss": 2.8852, "step": 150},
+     {"epoch": 0.045714285714285714, "grad_norm": 8.431418418884277, "learning_rate": 0.00015133079847908746, "loss": 2.3558, "step": 200},
+     {"epoch": 0.05714285714285714, "grad_norm": 8.70128345489502, "learning_rate": 0.0001893536121673004, "loss": 1.5706, "step": 250},
+     {"epoch": 0.06857142857142857, "grad_norm": 5.243009567260742, "learning_rate": 0.00019996217828993133, "loss": 0.8102, "step": 300},
+     {"epoch": 0.08, "grad_norm": 5.443891525268555, "learning_rate": 0.0001997842234752531, "loss": 0.4393, "step": 350},
+     {"epoch": 0.09142857142857143, "grad_norm": 11.013250350952148, "learning_rate": 0.00019946067495120418, "loss": 0.2872, "step": 400},
+     {"epoch": 0.10285714285714286, "grad_norm": 7.354420185089111, "learning_rate": 0.0001989920048027309, "loss": 0.165, "step": 450},
+     {"epoch": 0.11428571428571428, "grad_norm": 0.8610571622848511, "learning_rate": 0.00019837889685963127, "loss": 0.1022, "step": 500},
+     {"epoch": 0.12571428571428572, "grad_norm": 1.2767540216445923, "learning_rate": 0.00019762224569878898, "loss": 0.0783, "step": 550},
+     {"epoch": 0.13714285714285715, "grad_norm": 0.9847678542137146, "learning_rate": 0.00019672315533890932, "loss": 0.062, "step": 600},
+     {"epoch": 0.14857142857142858, "grad_norm": 0.5554406046867371, "learning_rate": 0.00019568293762966147, "loss": 0.031, "step": 650},
+     {"epoch": 0.16, "grad_norm": 0.37308812141418457, "learning_rate": 0.0001945031103375777, "loss": 0.0307, "step": 700},
+     {"epoch": 0.17142857142857143, "grad_norm": 0.682551920413971, "learning_rate": 0.00019318539493150242, "loss": 0.0328, "step": 750},
+     {"epoch": 0.18285714285714286, "grad_norm": 3.072620153427124, "learning_rate": 0.0001917317140708218, "loss": 0.0472, "step": 800},
+     {"epoch": 0.19428571428571428, "grad_norm": 0.26799216866493225, "learning_rate": 0.00019014418880013975, "loss": 0.0483, "step": 850},
+     {"epoch": 0.2057142857142857, "grad_norm": 0.7617077827453613, "learning_rate": 0.00018842513545449223, "loss": 0.0282, "step": 900},
+     {"epoch": 0.21714285714285714, "grad_norm": 0.3497171401977539, "learning_rate": 0.00018657706227961677, "loss": 0.0368, "step": 950},
+     {"epoch": 0.22857142857142856, "grad_norm": 2.047266960144043, "learning_rate": 0.00018460266577220732, "loss": 0.0235, "step": 1000},
+     {"epoch": 0.24, "grad_norm": 0.046612028032541275, "learning_rate": 0.00018250482674549547, "loss": 0.0113, "step": 1050},
+     {"epoch": 0.25142857142857145, "grad_norm": 3.3522257804870605, "learning_rate": 0.00018028660612589743, "loss": 0.0271, "step": 1100},
+     {"epoch": 0.26285714285714284, "grad_norm": 9.730717658996582, "learning_rate": 0.00017795124048686103, "loss": 0.0381, "step": 1150},
+     {"epoch": 0.2742857142857143, "grad_norm": 0.21655498445034027, "learning_rate": 0.00017550213732642837, "loss": 0.0265, "step": 1200},
+     {"epoch": 0.2857142857142857, "grad_norm": 0.018837904557585716, "learning_rate": 0.00017294287009540494, "loss": 0.0079, "step": 1250},
+     {"epoch": 0.29714285714285715, "grad_norm": 0.02092825062572956, "learning_rate": 0.00017027717298338977, "loss": 0.0199, "step": 1300},
+     {"epoch": 0.30857142857142855, "grad_norm": 0.24861940741539001, "learning_rate": 0.0001675089354702732, "loss": 0.0503, "step": 1350},
+     {"epoch": 0.32, "grad_norm": 0.02130724862217903, "learning_rate": 0.0001646421966511539, "loss": 0.0187, "step": 1400},
+     {"epoch": 0.3314285714285714, "grad_norm": 0.07263777405023575, "learning_rate": 0.00016168113934295362, "loss": 0.033, "step": 1450},
+     {"epoch": 0.34285714285714286, "grad_norm": 0.025235984474420547, "learning_rate": 0.0001586300839813298, "loss": 0.0064, "step": 1500},
+     {"epoch": 0.35428571428571426, "grad_norm": 0.022818434983491898, "learning_rate": 0.00015549348231679093, "loss": 0.0111, "step": 1550},
+     {"epoch": 0.3657142857142857, "grad_norm": 0.6987139582633972, "learning_rate": 0.00015227591091921205, "loss": 0.0215, "step": 1600},
+     {"epoch": 0.37714285714285717, "grad_norm": 2.869483470916748, "learning_rate": 0.00014898206450022813, "loss": 0.0416, "step": 1650},
+     {"epoch": 0.38857142857142857, "grad_norm": 0.12740445137023926, "learning_rate": 0.00014561674906324873, "loss": 0.014, "step": 1700},
+     {"epoch": 0.4, "grad_norm": 0.015665501356124878, "learning_rate": 0.00014218487489108813, "loss": 0.0084, "step": 1750},
+     {"epoch": 0.4114285714285714, "grad_norm": 0.019675016403198242, "learning_rate": 0.00013869144938144325, "loss": 0.0038, "step": 1800},
+     {"epoch": 0.4228571428571429, "grad_norm": 0.3949635922908783, "learning_rate": 0.00013514156974067242, "loss": 0.0212, "step": 1850},
+     {"epoch": 0.4342857142857143, "grad_norm": 0.04113984480500221, "learning_rate": 0.00013154041554653577, "loss": 0.0182, "step": 1900},
+     {"epoch": 0.44571428571428573, "grad_norm": 0.055694226175546646, "learning_rate": 0.00012789324119074852, "loss": 0.0408, "step": 1950},
+     {"epoch": 0.45714285714285713, "grad_norm": 0.01574717089533806, "learning_rate": 0.00012420536821237444, "loss": 0.016, "step": 2000},
+     {"epoch": 0.4685714285714286, "grad_norm": 0.03843434900045395, "learning_rate": 0.00012048217753324587, "loss": 0.0369, "step": 2050},
+     {"epoch": 0.48, "grad_norm": 0.01596643030643463, "learning_rate": 0.00011672910160673858, "loss": 0.0147, "step": 2100},
+     {"epoch": 0.49142857142857144, "grad_norm": 0.025914940983057022, "learning_rate": 0.00011295161649135815, "loss": 0.0257, "step": 2150},
+     {"epoch": 0.5028571428571429, "grad_norm": 0.0455513596534729, "learning_rate": 0.00010915523386070277, "loss": 0.0213, "step": 2200},
+     {"epoch": 0.5142857142857142, "grad_norm": 1.0212668180465698, "learning_rate": 0.0001053454929614603, "loss": 0.0121, "step": 2250},
+     {"epoch": 0.5257142857142857, "grad_norm": 0.07384993880987167, "learning_rate": 0.00010152795253117406, "loss": 0.0013, "step": 2300},
+     {"epoch": 0.5371428571428571, "grad_norm": 0.04635027050971985, "learning_rate": 9.770818268756971e-05, "loss": 0.0427, "step": 2350},
+     {"epoch": 0.5485714285714286, "grad_norm": 6.902871131896973, "learning_rate": 9.389175680127735e-05, "loss": 0.0351, "step": 2400},
+     {"epoch": 0.56, "grad_norm": 0.1742580235004425, "learning_rate": 9.008424336380778e-05, "loss": 0.0199, "step": 2450},
+     {"epoch": 0.5714285714285714, "grad_norm": 4.179978370666504, "learning_rate": 8.62911978626472e-05, "loss": 0.0125, "step": 2500},
+     {"epoch": 0.5828571428571429, "grad_norm": 0.01033821888267994, "learning_rate": 8.251815467532628e-05, "loss": 0.0038, "step": 2550},
+     {"epoch": 0.5942857142857143, "grad_norm": 0.017704356461763382, "learning_rate": 7.877061899429066e-05, "loss": 0.0388, "step": 2600},
+     {"epoch": 0.6057142857142858, "grad_norm": 0.06891408562660217, "learning_rate": 7.505405879435429e-05, "loss": 0.0179, "step": 2650},
+     {"epoch": 0.6171428571428571, "grad_norm": 0.06603990495204926, "learning_rate": 7.137389685445726e-05, "loss": 0.0229, "step": 2700},
+     {"epoch": 0.6285714285714286, "grad_norm": 0.029912831261754036, "learning_rate": 6.773550284536764e-05, "loss": 0.0073, "step": 2750},
+     {"epoch": 0.64, "grad_norm": 0.014299588277935982, "learning_rate": 6.414418549487308e-05, "loss": 0.0354, "step": 2800},
+     {"epoch": 0.6514285714285715, "grad_norm": 0.03639346361160278, "learning_rate": 6.060518484189344e-05, "loss": 0.0292, "step": 2850},
+     {"epoch": 0.6628571428571428, "grad_norm": 0.3018437325954437, "learning_rate": 5.712366459081577e-05, "loss": 0.0051, "step": 2900},
+     {"epoch": 0.6742857142857143, "grad_norm": 0.030778272077441216, "learning_rate": 5.37047045772089e-05, "loss": 0.0173, "step": 2950},
+     {"epoch": 0.6857142857142857, "grad_norm": 0.017592955380678177, "learning_rate": 5.035329335590868e-05, "loss": 0.0204, "step": 3000},
+     {"epoch": 0.6971428571428572, "grad_norm": 0.017659608274698257, "learning_rate": 4.707432092229059e-05, "loss": 0.0391, "step": 3050},
+     {"epoch": 0.7085714285714285, "grad_norm": 0.0674019530415535, "learning_rate": 4.387257157734841e-05, "loss": 0.0111, "step": 3100},
+     {"epoch": 0.72, "grad_norm": 0.007148749195039272, "learning_rate": 4.0752716946990246e-05, "loss": 0.0083, "step": 3150},
+     {"epoch": 0.7314285714285714, "grad_norm": 0.00936940684914589, "learning_rate": 3.7719309165737013e-05, "loss": 0.0078, "step": 3200},
+     {"epoch": 0.7428571428571429, "grad_norm": 0.023398227989673615, "learning_rate": 3.477677423476935e-05, "loss": 0.0073, "step": 3250},
+     {"epoch": 0.7542857142857143, "grad_norm": 0.036872465163469315, "learning_rate": 3.19294055640135e-05, "loss": 0.03, "step": 3300},
+     {"epoch": 0.7657142857142857, "grad_norm": 0.00688199233263731, "learning_rate": 2.9181357707689438e-05, "loss": 0.0076, "step": 3350},
+     {"epoch": 0.7771428571428571, "grad_norm": 0.05069967731833458, "learning_rate": 2.6536640302461034e-05, "loss": 0.0366, "step": 3400},
+     {"epoch": 0.7885714285714286, "grad_norm": 0.002097500255331397, "learning_rate": 2.399911221703377e-05, "loss": 0.0152, "step": 3450},
+     {"epoch": 0.8, "grad_norm": 0.008384926244616508, "learning_rate": 2.1572475921735357e-05, "loss": 0.0045, "step": 3500},
+     {"epoch": 0.8114285714285714, "grad_norm": 0.005785002373158932, "learning_rate": 1.9260272086295082e-05, "loss": 0.0133, "step": 3550},
+     {"epoch": 0.8228571428571428, "grad_norm": 0.031658366322517395,
512
+ "learning_rate": 1.706587441370433e-05,
513
+ "loss": 0.0339,
514
+ "step": 3600
515
+ },
516
+ {
517
+ "epoch": 0.8342857142857143,
518
+ "grad_norm": 0.03882748261094093,
519
+ "learning_rate": 1.499248471769531e-05,
520
+ "loss": 0.0086,
521
+ "step": 3650
522
+ },
523
+ {
524
+ "epoch": 0.8457142857142858,
525
+ "grad_norm": 0.007800533901900053,
526
+ "learning_rate": 1.304312825102142e-05,
527
+ "loss": 0.0147,
528
+ "step": 3700
529
+ },
530
+ {
531
+ "epoch": 0.8571428571428571,
532
+ "grad_norm": 0.059970512986183167,
533
+ "learning_rate": 1.1220649291354902e-05,
534
+ "loss": 0.0093,
535
+ "step": 3750
536
+ },
537
+ {
538
+ "epoch": 0.8685714285714285,
539
+ "grad_norm": 0.014666451141238213,
540
+ "learning_rate": 9.527706991242502e-06,
541
+ "loss": 0.0295,
542
+ "step": 3800
543
+ },
544
+ {
545
+ "epoch": 0.88,
546
+ "grad_norm": 0.04973109811544418,
547
+ "learning_rate": 7.966771498174963e-06,
548
+ "loss": 0.0182,
549
+ "step": 3850
550
+ },
551
+ {
552
+ "epoch": 0.8914285714285715,
553
+ "grad_norm": 0.00469050882384181,
554
+ "learning_rate": 6.540120350430423e-06,
555
+ "loss": 0.0121,
556
+ "step": 3900
557
+ },
558
+ {
559
+ "epoch": 0.9028571428571428,
560
+ "grad_norm": 0.004425337538123131,
561
+ "learning_rate": 5.24983515395161e-06,
562
+ "loss": 0.0178,
563
+ "step": 3950
564
+ },
565
+ {
566
+ "epoch": 0.9142857142857143,
567
+ "grad_norm": 0.006870228797197342,
568
+ "learning_rate": 4.097798545104914e-06,
569
+ "loss": 0.0192,
570
+ "step": 4000
571
+ },
572
+ {
573
+ "epoch": 0.9257142857142857,
574
+ "grad_norm": 0.006712966598570347,
575
+ "learning_rate": 3.0856914437528805e-06,
576
+ "loss": 0.0219,
577
+ "step": 4050
578
+ },
579
+ {
580
+ "epoch": 0.9371428571428572,
581
+ "grad_norm": 0.08353295922279358,
582
+ "learning_rate": 2.2149906006486364e-06,
583
+ "loss": 0.0176,
584
+ "step": 4100
585
+ },
586
+ {
587
+ "epoch": 0.9485714285714286,
588
+ "grad_norm": 5.768489360809326,
589
+ "learning_rate": 1.4869664427303088e-06,
590
+ "loss": 0.0146,
591
+ "step": 4150
592
+ },
593
+ {
594
+ "epoch": 0.96,
595
+ "grad_norm": 0.008484387770295143,
596
+ "learning_rate": 9.026812194594448e-07,
597
+ "loss": 0.0144,
598
+ "step": 4200
599
+ },
600
+ {
601
+ "epoch": 0.9714285714285714,
602
+ "grad_norm": 0.0066833593882620335,
603
+ "learning_rate": 4.629874529084477e-07,
604
+ "loss": 0.0126,
605
+ "step": 4250
606
+ },
607
+ {
608
+ "epoch": 0.9828571428571429,
609
+ "grad_norm": 0.0016708762850612402,
610
+ "learning_rate": 1.6852669385787334e-07,
611
+ "loss": 0.0312,
612
+ "step": 4300
613
+ },
614
+ {
615
+ "epoch": 0.9942857142857143,
616
+ "grad_norm": 0.05290694534778595,
617
+ "learning_rate": 1.9728585719092086e-08,
618
+ "loss": 0.0151,
619
+ "step": 4350
620
+ },
621
+ {
622
+ "epoch": 1.0,
623
+ "eval_accuracy": 0.9965,
624
+ "eval_f1_macro": 0.9965041499596807,
625
+ "eval_loss": 0.014111927710473537,
626
+ "eval_runtime": 7.3568,
627
+ "eval_samples_per_second": 1359.295,
628
+ "eval_steps_per_second": 42.546,
629
+ "step": 4375
630
+ }
631
+ ],
632
+ "logging_steps": 50,
633
+ "max_steps": 4375,
634
+ "num_input_tokens_seen": 0,
635
+ "num_train_epochs": 1,
636
+ "save_steps": 500,
637
+ "stateful_callbacks": {
638
+ "TrainerControl": {
639
+ "args": {
640
+ "should_epoch_stop": false,
641
+ "should_evaluate": false,
642
+ "should_log": false,
643
+ "should_save": true,
644
+ "should_training_stop": true
645
+ },
646
+ "attributes": {}
647
+ }
648
+ },
649
+ "total_flos": 0.0,
650
+ "train_batch_size": 16,
651
+ "trial_name": null,
652
+ "trial_params": null
653
+ }
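The `cls` variant finishes its single epoch at step 4375 with `eval_accuracy` 0.9965 and `eval_f1_macro` ≈ 0.9965. The training script is not part of this upload, so the exact metric hook is unknown; a minimal `compute_metrics` sketch that would emit these two fields, assuming the usual `Trainer` callback signature and scikit-learn, looks like this:

```python
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    # Hypothetical hook: Trainer prefixes the returned keys with "eval_",
    # which would yield the eval_accuracy / eval_f1_macro fields logged above.
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_macro": f1_score(labels, preds, average="macro"),
    }
```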
cls/checkpoint-4375/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4075c4dd5f76985a66b5b8244a5a3a67d98141d7a13e095e3d3741f036a674f4
+ size 5841
cls/classifier_head.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e01577e3d7fb321b18e6436893b9b36458a7a55d7a382a9fc1ad659a21d50d68
+ size 63589
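The classification head is stored separately from the LoRA adapter. Its 63,589-byte size is consistent with a single fp32 `nn.Linear(768, 20)` (20 × 768 weights plus 20 biases ≈ 61.5 KB, plus serialization overhead), but the exact structure is not documented in this upload; a loading sketch under that assumption:

```python
import torch

# Assumption: the file holds the state_dict of one Linear layer mapping
# XLM-R's 768-dim pooled representation to the 20 language labels.
head = torch.nn.Linear(768, 20)
head.load_state_dict(torch.load("cls/classifier_head.pt", map_location="cpu"))
head.eval()
```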
cls/id2label.json ADDED
@@ -0,0 +1,22 @@
+ {
+ "0": "ar",
+ "1": "bg",
+ "2": "de",
+ "3": "el",
+ "4": "en",
+ "5": "es",
+ "6": "fr",
+ "7": "hi",
+ "8": "it",
+ "9": "ja",
+ "10": "nl",
+ "11": "pl",
+ "12": "pt",
+ "13": "ru",
+ "14": "sw",
+ "15": "th",
+ "16": "tr",
+ "17": "ur",
+ "18": "vi",
+ "19": "zh"
+ }
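The 20 labels are ISO 639-1 language codes (the checkpoint path `lora-xlmr-langid/...` in the trainer state below confirms this is a language identifier). Note that JSON object keys arrive as strings, so a prediction index has to be cast before lookup:

```python
import json

with open("cls/id2label.json") as f:
    id2label = {int(k): v for k, v in json.load(f).items()}  # JSON keys are strings

pred_id = 4                 # e.g. the argmax over the 20 head logits
print(id2label[pred_id])    # -> "en"
```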
cls/label2id.json ADDED
@@ -0,0 +1,22 @@
+ {
+ "ar": 0,
+ "bg": 1,
+ "de": 2,
+ "el": 3,
+ "en": 4,
+ "es": 5,
+ "fr": 6,
+ "hi": 7,
+ "it": 8,
+ "ja": 9,
+ "nl": 10,
+ "pl": 11,
+ "pt": 12,
+ "ru": 13,
+ "sw": 14,
+ "th": 15,
+ "tr": 16,
+ "ur": 17,
+ "vi": 18,
+ "zh": 19
+ }
cls/sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
+ size 5069051
cls/special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
+ {
+ "bos_token": "<s>",
+ "cls_token": "<s>",
+ "eos_token": "</s>",
+ "mask_token": {
+ "content": "<mask>",
+ "lstrip": true,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "<pad>",
+ "sep_token": "</s>",
+ "unk_token": "<unk>"
+ }
cls/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ea09a711f7adcb7e3bc41b614e59b829fc98e7b50b94d273d029315524364069
+ size 17082831
cls/tokenizer_config.json ADDED
@@ -0,0 +1,55 @@
+ {
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<pad>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "3": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "250001": {
+ "content": "<mask>",
+ "lstrip": true,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "clean_up_tokenization_spaces": false,
+ "cls_token": "<s>",
+ "eos_token": "</s>",
+ "extra_special_tokens": {},
+ "mask_token": "<mask>",
+ "model_max_length": 512,
+ "pad_token": "<pad>",
+ "sep_token": "</s>",
+ "tokenizer_class": "XLMRobertaTokenizer",
+ "unk_token": "<unk>"
+ }
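The tokenizer artifacts are byte-identical across `cls/`, `mean/`, and the repo root (the LFS pointers carry the same sha256 digests), which matches the stock `xlm-roberta-base` tokenizer with `model_max_length` 512. A quick sanity check, assuming these local paths:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("cls")        # resolves the tokenizer files above
print(type(tok).__name__, tok.model_max_length)   # XLMRobertaTokenizerFast 512
```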
id2label.json ADDED
@@ -0,0 +1,22 @@
+ {
+ "0": "ar",
+ "1": "bg",
+ "2": "de",
+ "3": "el",
+ "4": "en",
+ "5": "es",
+ "6": "fr",
+ "7": "hi",
+ "8": "it",
+ "9": "ja",
+ "10": "nl",
+ "11": "pl",
+ "12": "pt",
+ "13": "ru",
+ "14": "sw",
+ "15": "th",
+ "16": "tr",
+ "17": "ur",
+ "18": "vi",
+ "19": "zh"
+ }
label2id.json ADDED
@@ -0,0 +1,22 @@
+ {
+ "ar": 0,
+ "bg": 1,
+ "de": 2,
+ "el": 3,
+ "en": 4,
+ "es": 5,
+ "fr": 6,
+ "hi": 7,
+ "it": 8,
+ "ja": 9,
+ "nl": 10,
+ "pl": 11,
+ "pt": 12,
+ "ru": 13,
+ "sw": 14,
+ "th": 15,
+ "tr": 16,
+ "ur": 17,
+ "vi": 18,
+ "zh": 19
+ }
mean/README.md ADDED
@@ -0,0 +1,206 @@
+ ---
+ base_model: xlm-roberta-base
+ library_name: peft
+ tags:
+ - base_model:adapter:xlm-roberta-base
+ - lora
+ - transformers
+ ---
+
+ # Model Card for Model ID
+
+ <!-- Provide a quick summary of what the model is/does. -->
+
+
+
+ ## Model Details
+
+ ### Model Description
+
+ <!-- Provide a longer summary of what this model is. -->
+
+
+
+ - **Developed by:** [More Information Needed]
+ - **Funded by [optional]:** [More Information Needed]
+ - **Shared by [optional]:** [More Information Needed]
+ - **Model type:** [More Information Needed]
+ - **Language(s) (NLP):** [More Information Needed]
+ - **License:** [More Information Needed]
+ - **Finetuned from model [optional]:** [More Information Needed]
+
+ ### Model Sources [optional]
+
+ <!-- Provide the basic links for the model. -->
+
+ - **Repository:** [More Information Needed]
+ - **Paper [optional]:** [More Information Needed]
+ - **Demo [optional]:** [More Information Needed]
+
+ ## Uses
+
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+ ### Direct Use
+
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+ [More Information Needed]
+
+ ### Downstream Use [optional]
+
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+ [More Information Needed]
+
+ ### Out-of-Scope Use
+
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+ [More Information Needed]
+
+ ## Bias, Risks, and Limitations
+
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+ [More Information Needed]
+
+ ### Recommendations
+
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+ ## How to Get Started with the Model
+
+ Use the code below to get started with the model.
+
+ [More Information Needed]
+
+ ## Training Details
+
+ ### Training Data
+
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+ [More Information Needed]
+
+ ### Training Procedure
+
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+ #### Preprocessing [optional]
+
+ [More Information Needed]
+
+
+ #### Training Hyperparameters
+
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+ #### Speeds, Sizes, Times [optional]
+
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+ [More Information Needed]
+
+ ## Evaluation
+
+ <!-- This section describes the evaluation protocols and provides the results. -->
+
+ ### Testing Data, Factors & Metrics
+
+ #### Testing Data
+
+ <!-- This should link to a Dataset Card if possible. -->
+
+ [More Information Needed]
+
+ #### Factors
+
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+ [More Information Needed]
+
+ #### Metrics
+
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+ [More Information Needed]
+
+ ### Results
+
+ [More Information Needed]
+
+ #### Summary
+
+
+
+ ## Model Examination [optional]
+
+ <!-- Relevant interpretability work for the model goes here -->
+
+ [More Information Needed]
+
+ ## Environmental Impact
+
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+ - **Hardware Type:** [More Information Needed]
+ - **Hours used:** [More Information Needed]
+ - **Cloud Provider:** [More Information Needed]
+ - **Compute Region:** [More Information Needed]
+ - **Carbon Emitted:** [More Information Needed]
+
+ ## Technical Specifications [optional]
+
+ ### Model Architecture and Objective
+
+ [More Information Needed]
+
+ ### Compute Infrastructure
+
+ [More Information Needed]
+
+ #### Hardware
+
+ [More Information Needed]
+
+ #### Software
+
+ [More Information Needed]
+
+ ## Citation [optional]
+
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+ **BibTeX:**
+
+ [More Information Needed]
+
+ **APA:**
+
+ [More Information Needed]
+
+ ## Glossary [optional]
+
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+ [More Information Needed]
+
+ ## More Information [optional]
+
+ [More Information Needed]
+
+ ## Model Card Authors [optional]
+
+ [More Information Needed]
+
+ ## Model Card Contact
+
+ [More Information Needed]
+ ### Framework versions
+
+ - PEFT 0.17.1
mean/adapter_config.json ADDED
@@ -0,0 +1,38 @@
+ {
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "xlm-roberta-base",
+ "bias": "none",
+ "corda_config": null,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "key",
+ "query",
+ "value"
+ ],
+ "target_parameters": null,
+ "task_type": "FEATURE_EXTRACTION",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+ }
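This config applies LoRA (r=16, lora_alpha=32, dropout 0.05) to the `query`/`key`/`value` projections of `xlm-roberta-base`, with `task_type` set to `FEATURE_EXTRACTION`, presumably because pooling and the classifier head live outside the PEFT model. A loading sketch (local folder paths are assumptions based on this commit's layout):

```python
from peft import PeftModel
from transformers import AutoModel, AutoTokenizer

base = AutoModel.from_pretrained("xlm-roberta-base")
model = PeftModel.from_pretrained(base, "mean")      # reads mean/adapter_config.json + adapter weights
tokenizer = AutoTokenizer.from_pretrained("mean")    # tokenizer files are duplicated per variant
model.eval()
```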
mean/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3c3259b74e83045754bee25a18ab00c16e5ea082f1c73200058fadf143a2bb99
+ size 3548696
mean/checkpoint-4375/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:05574c04d0a1632ae27e5c247a92868cfd8a59adde5ba200a1c58b6efc5cbcf6
+ size 559727136
mean/checkpoint-4375/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b469bc554749b2d52c882e1c0f43a5512c17d0f39f6179643840dccd149a14f1
+ size 7264779
mean/checkpoint-4375/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fd939a99316e4f9a37f179832cf4a3356001e3909c993b3e827da5cd4a4617ba
+ size 14645
mean/checkpoint-4375/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a993a449c7a40aa61a80703106703284ce9b6fb450a8bd64d07585e0fc640e07
+ size 1465
mean/checkpoint-4375/trainer_state.json ADDED
@@ -0,0 +1,653 @@
+ {
+ "best_global_step": 4375,
+ "best_metric": 0.997,
+ "best_model_checkpoint": "lora-xlmr-langid/mean/checkpoint-4375",
+ "epoch": 1.0,
+ "eval_steps": 500,
+ "global_step": 4375,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.011428571428571429,
+ "grad_norm": 3.5266411304473877,
+ "learning_rate": 3.7262357414448674e-05,
+ "loss": 3.0645,
+ "step": 50
+ },
+ {
+ "epoch": 0.022857142857142857,
+ "grad_norm": 2.9135210514068604,
+ "learning_rate": 7.52851711026616e-05,
+ "loss": 3.0412,
+ "step": 100
+ },
+ {
+ "epoch": 0.03428571428571429,
+ "grad_norm": 5.023406505584717,
+ "learning_rate": 0.00011330798479087452,
+ "loss": 2.6251,
+ "step": 150
+ },
+ {
+ "epoch": 0.045714285714285714,
+ "grad_norm": 4.2361741065979,
+ "learning_rate": 0.00015133079847908746,
+ "loss": 1.3511,
+ "step": 200
+ },
+ {
+ "epoch": 0.05714285714285714,
+ "grad_norm": 8.629075050354004,
+ "learning_rate": 0.0001893536121673004,
+ "loss": 0.4282,
+ "step": 250
+ },
+ {
+ "epoch": 0.06857142857142857,
+ "grad_norm": 0.5085486769676208,
+ "learning_rate": 0.00019996217828993133,
+ "loss": 0.1367,
+ "step": 300
+ },
+ {
+ "epoch": 0.08,
+ "grad_norm": 0.5619511604309082,
+ "learning_rate": 0.0001997842234752531,
+ "loss": 0.0675,
+ "step": 350
+ },
+ {
+ "epoch": 0.09142857142857143,
+ "grad_norm": 0.11627336591482162,
+ "learning_rate": 0.00019946067495120418,
+ "loss": 0.0584,
+ "step": 400
+ },
+ {
+ "epoch": 0.10285714285714286,
+ "grad_norm": 0.20873290300369263,
+ "learning_rate": 0.0001989920048027309,
+ "loss": 0.042,
+ "step": 450
+ },
+ {
+ "epoch": 0.11428571428571428,
+ "grad_norm": 0.1875362992286682,
+ "learning_rate": 0.00019837889685963127,
+ "loss": 0.0376,
+ "step": 500
+ },
+ {
+ "epoch": 0.12571428571428572,
+ "grad_norm": 0.10094111412763596,
+ "learning_rate": 0.00019762224569878898,
+ "loss": 0.0462,
+ "step": 550
+ },
+ {
+ "epoch": 0.13714285714285715,
+ "grad_norm": 0.0968400090932846,
+ "learning_rate": 0.00019672315533890932,
+ "loss": 0.032,
+ "step": 600
+ },
+ {
+ "epoch": 0.14857142857142858,
+ "grad_norm": 0.020933035761117935,
+ "learning_rate": 0.00019568293762966147,
+ "loss": 0.0169,
+ "step": 650
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 0.06590937077999115,
+ "learning_rate": 0.0001945031103375777,
+ "loss": 0.0207,
+ "step": 700
+ },
+ {
+ "epoch": 0.17142857142857143,
+ "grad_norm": 0.18071456253528595,
+ "learning_rate": 0.00019318539493150242,
+ "loss": 0.0266,
+ "step": 750
+ },
+ {
+ "epoch": 0.18285714285714286,
+ "grad_norm": 1.208040475845337,
+ "learning_rate": 0.0001917317140708218,
+ "loss": 0.0453,
+ "step": 800
+ },
+ {
+ "epoch": 0.19428571428571428,
+ "grad_norm": 0.03780468925833702,
+ "learning_rate": 0.00019014418880013975,
+ "loss": 0.0402,
+ "step": 850
+ },
+ {
+ "epoch": 0.2057142857142857,
+ "grad_norm": 0.15351633727550507,
+ "learning_rate": 0.00018842513545449223,
+ "loss": 0.0179,
+ "step": 900
+ },
+ {
+ "epoch": 0.21714285714285714,
+ "grad_norm": 0.1763002723455429,
+ "learning_rate": 0.00018657706227961677,
+ "loss": 0.0256,
+ "step": 950
+ },
+ {
+ "epoch": 0.22857142857142856,
+ "grad_norm": 0.4840329587459564,
+ "learning_rate": 0.00018460266577220732,
+ "loss": 0.0177,
+ "step": 1000
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 0.03627489507198334,
+ "learning_rate": 0.00018250482674549547,
+ "loss": 0.0048,
+ "step": 1050
+ },
+ {
+ "epoch": 0.25142857142857145,
+ "grad_norm": 0.39385655522346497,
+ "learning_rate": 0.00018028660612589743,
+ "loss": 0.037,
+ "step": 1100
+ },
+ {
+ "epoch": 0.26285714285714284,
+ "grad_norm": 6.667974948883057,
+ "learning_rate": 0.00017795124048686103,
+ "loss": 0.033,
+ "step": 1150
+ },
+ {
+ "epoch": 0.2742857142857143,
+ "grad_norm": 0.4796125888824463,
+ "learning_rate": 0.00017550213732642837,
+ "loss": 0.0197,
+ "step": 1200
+ },
+ {
+ "epoch": 0.2857142857142857,
+ "grad_norm": 0.022151026874780655,
+ "learning_rate": 0.00017294287009540494,
+ "loss": 0.0117,
+ "step": 1250
+ },
+ {
+ "epoch": 0.29714285714285715,
+ "grad_norm": 0.024426177144050598,
+ "learning_rate": 0.00017027717298338977,
+ "loss": 0.0258,
+ "step": 1300
+ },
+ {
+ "epoch": 0.30857142857142855,
+ "grad_norm": 0.008404894731938839,
+ "learning_rate": 0.0001675089354702732,
+ "loss": 0.0315,
+ "step": 1350
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 0.019231267273426056,
+ "learning_rate": 0.0001646421966511539,
+ "loss": 0.0067,
+ "step": 1400
+ },
+ {
+ "epoch": 0.3314285714285714,
+ "grad_norm": 0.02111838385462761,
+ "learning_rate": 0.00016168113934295362,
+ "loss": 0.0346,
+ "step": 1450
+ },
+ {
+ "epoch": 0.34285714285714286,
+ "grad_norm": 0.017707131803035736,
+ "learning_rate": 0.0001586300839813298,
+ "loss": 0.0066,
+ "step": 1500
+ },
+ {
+ "epoch": 0.35428571428571426,
+ "grad_norm": 0.01487037818878889,
+ "learning_rate": 0.00015549348231679093,
+ "loss": 0.0078,
+ "step": 1550
+ },
+ {
+ "epoch": 0.3657142857142857,
+ "grad_norm": 0.013854872435331345,
+ "learning_rate": 0.00015227591091921205,
+ "loss": 0.019,
+ "step": 1600
+ },
+ {
+ "epoch": 0.37714285714285717,
+ "grad_norm": 2.5561580657958984,
+ "learning_rate": 0.00014898206450022813,
+ "loss": 0.0295,
+ "step": 1650
+ },
+ {
+ "epoch": 0.38857142857142857,
+ "grad_norm": 0.023739265277981758,
+ "learning_rate": 0.00014561674906324873,
+ "loss": 0.0254,
+ "step": 1700
+ },
+ {
+ "epoch": 0.4,
+ "grad_norm": 0.008893780410289764,
+ "learning_rate": 0.00014218487489108813,
+ "loss": 0.0093,
+ "step": 1750
+ },
+ {
+ "epoch": 0.4114285714285714,
+ "grad_norm": 0.025123456493020058,
+ "learning_rate": 0.00013869144938144325,
+ "loss": 0.0182,
+ "step": 1800
+ },
+ {
+ "epoch": 0.4228571428571429,
+ "grad_norm": 0.016919748857617378,
+ "learning_rate": 0.00013514156974067242,
+ "loss": 0.0232,
+ "step": 1850
+ },
+ {
+ "epoch": 0.4342857142857143,
+ "grad_norm": 0.21599197387695312,
+ "learning_rate": 0.00013154041554653577,
+ "loss": 0.0307,
+ "step": 1900
+ },
+ {
+ "epoch": 0.44571428571428573,
+ "grad_norm": 3.532423973083496,
+ "learning_rate": 0.00012789324119074852,
+ "loss": 0.0244,
+ "step": 1950
+ },
+ {
+ "epoch": 0.45714285714285713,
+ "grad_norm": 0.007677409332245588,
+ "learning_rate": 0.00012420536821237444,
+ "loss": 0.0233,
+ "step": 2000
+ },
+ {
+ "epoch": 0.4685714285714286,
+ "grad_norm": 0.020959220826625824,
+ "learning_rate": 0.00012048217753324587,
+ "loss": 0.0234,
+ "step": 2050
+ },
+ {
+ "epoch": 0.48,
+ "grad_norm": 0.006289786193519831,
+ "learning_rate": 0.00011672910160673858,
+ "loss": 0.0073,
+ "step": 2100
+ },
+ {
+ "epoch": 0.49142857142857144,
+ "grad_norm": 0.012328894808888435,
+ "learning_rate": 0.00011295161649135815,
+ "loss": 0.0308,
+ "step": 2150
+ },
+ {
+ "epoch": 0.5028571428571429,
+ "grad_norm": 0.0096198795363307,
+ "learning_rate": 0.00010915523386070277,
+ "loss": 0.0251,
+ "step": 2200
+ },
+ {
+ "epoch": 0.5142857142857142,
+ "grad_norm": 0.08846427500247955,
+ "learning_rate": 0.0001053454929614603,
+ "loss": 0.0325,
+ "step": 2250
+ },
+ {
+ "epoch": 0.5257142857142857,
+ "grad_norm": 0.024032501503825188,
+ "learning_rate": 0.00010152795253117406,
+ "loss": 0.0011,
+ "step": 2300
+ },
+ {
+ "epoch": 0.5371428571428571,
+ "grad_norm": 0.017903102561831474,
+ "learning_rate": 9.770818268756971e-05,
+ "loss": 0.026,
+ "step": 2350
+ },
+ {
+ "epoch": 0.5485714285714286,
+ "grad_norm": 11.608915328979492,
+ "learning_rate": 9.389175680127735e-05,
+ "loss": 0.0203,
+ "step": 2400
+ },
+ {
+ "epoch": 0.56,
+ "grad_norm": 0.017623024061322212,
+ "learning_rate": 9.008424336380778e-05,
+ "loss": 0.0248,
+ "step": 2450
+ },
+ {
+ "epoch": 0.5714285714285714,
+ "grad_norm": 4.2325592041015625,
+ "learning_rate": 8.62911978626472e-05,
+ "loss": 0.0193,
+ "step": 2500
+ },
+ {
+ "epoch": 0.5828571428571429,
+ "grad_norm": 0.008484977297484875,
+ "learning_rate": 8.251815467532628e-05,
+ "loss": 0.0057,
+ "step": 2550
+ },
+ {
+ "epoch": 0.5942857142857143,
+ "grad_norm": 0.10779959708452225,
+ "learning_rate": 7.877061899429066e-05,
+ "loss": 0.0316,
+ "step": 2600
+ },
+ {
+ "epoch": 0.6057142857142858,
+ "grad_norm": 0.01012630295008421,
+ "learning_rate": 7.505405879435429e-05,
+ "loss": 0.0216,
+ "step": 2650
+ },
+ {
+ "epoch": 0.6171428571428571,
+ "grad_norm": 0.00607542647048831,
+ "learning_rate": 7.137389685445726e-05,
+ "loss": 0.0169,
+ "step": 2700
+ },
+ {
+ "epoch": 0.6285714285714286,
+ "grad_norm": 0.022452019155025482,
+ "learning_rate": 6.773550284536764e-05,
+ "loss": 0.0103,
+ "step": 2750
+ },
+ {
+ "epoch": 0.64,
+ "grad_norm": 0.029570262879133224,
+ "learning_rate": 6.414418549487308e-05,
+ "loss": 0.0344,
+ "step": 2800
+ },
+ {
+ "epoch": 0.6514285714285715,
+ "grad_norm": 0.009840002283453941,
+ "learning_rate": 6.060518484189344e-05,
+ "loss": 0.0237,
+ "step": 2850
+ },
+ {
+ "epoch": 0.6628571428571428,
+ "grad_norm": 0.5386189818382263,
+ "learning_rate": 5.712366459081577e-05,
+ "loss": 0.0017,
+ "step": 2900
+ },
+ {
+ "epoch": 0.6742857142857143,
+ "grad_norm": 0.004062721040099859,
+ "learning_rate": 5.37047045772089e-05,
+ "loss": 0.0133,
+ "step": 2950
+ },
+ {
+ "epoch": 0.6857142857142857,
+ "grad_norm": 0.005474657751619816,
+ "learning_rate": 5.035329335590868e-05,
+ "loss": 0.0176,
+ "step": 3000
+ },
+ {
+ "epoch": 0.6971428571428572,
+ "grad_norm": 0.04338672012090683,
+ "learning_rate": 4.707432092229059e-05,
+ "loss": 0.0202,
+ "step": 3050
+ },
+ {
+ "epoch": 0.7085714285714285,
+ "grad_norm": 0.11501504480838776,
+ "learning_rate": 4.387257157734841e-05,
+ "loss": 0.003,
+ "step": 3100
+ },
+ {
+ "epoch": 0.72,
+ "grad_norm": 0.004171635489910841,
+ "learning_rate": 4.0752716946990246e-05,
+ "loss": 0.0076,
+ "step": 3150
+ },
+ {
+ "epoch": 0.7314285714285714,
+ "grad_norm": 0.015187480486929417,
+ "learning_rate": 3.7719309165737013e-05,
+ "loss": 0.0091,
+ "step": 3200
+ },
+ {
+ "epoch": 0.7428571428571429,
+ "grad_norm": 0.029909875243902206,
+ "learning_rate": 3.477677423476935e-05,
+ "loss": 0.008,
+ "step": 3250
+ },
+ {
+ "epoch": 0.7542857142857143,
+ "grad_norm": 0.0026129253674298525,
+ "learning_rate": 3.19294055640135e-05,
+ "loss": 0.0216,
+ "step": 3300
+ },
+ {
+ "epoch": 0.7657142857142857,
+ "grad_norm": 0.0037045152857899666,
+ "learning_rate": 2.9181357707689438e-05,
+ "loss": 0.0051,
+ "step": 3350
+ },
+ {
+ "epoch": 0.7771428571428571,
+ "grad_norm": 0.02281978540122509,
+ "learning_rate": 2.6536640302461034e-05,
+ "loss": 0.0312,
+ "step": 3400
+ },
+ {
+ "epoch": 0.7885714285714286,
+ "grad_norm": 0.006208827719092369,
+ "learning_rate": 2.399911221703377e-05,
+ "loss": 0.0128,
+ "step": 3450
+ },
+ {
+ "epoch": 0.8,
+ "grad_norm": 0.0057523371651768684,
+ "learning_rate": 2.1572475921735357e-05,
+ "loss": 0.0052,
+ "step": 3500
+ },
+ {
+ "epoch": 0.8114285714285714,
+ "grad_norm": 0.0012299221707507968,
+ "learning_rate": 1.9260272086295082e-05,
+ "loss": 0.0077,
+ "step": 3550
+ },
+ {
+ "epoch": 0.8228571428571428,
+ "grad_norm": 0.03079100325703621,
+ "learning_rate": 1.706587441370433e-05,
+ "loss": 0.0216,
+ "step": 3600
+ },
+ {
+ "epoch": 0.8342857142857143,
+ "grad_norm": 0.0038934126496315002,
+ "learning_rate": 1.499248471769531e-05,
+ "loss": 0.0092,
+ "step": 3650
+ },
+ {
+ "epoch": 0.8457142857142858,
+ "grad_norm": 0.0019553981255739927,
+ "learning_rate": 1.304312825102142e-05,
+ "loss": 0.0076,
+ "step": 3700
+ },
+ {
+ "epoch": 0.8571428571428571,
+ "grad_norm": 0.7177829742431641,
+ "learning_rate": 1.1220649291354902e-05,
+ "loss": 0.001,
+ "step": 3750
+ },
+ {
+ "epoch": 0.8685714285714285,
+ "grad_norm": 0.0023476951755583286,
+ "learning_rate": 9.527706991242502e-06,
+ "loss": 0.0267,
+ "step": 3800
+ },
+ {
+ "epoch": 0.88,
+ "grad_norm": 0.006838622502982616,
+ "learning_rate": 7.966771498174963e-06,
+ "loss": 0.0186,
+ "step": 3850
+ },
+ {
+ "epoch": 0.8914285714285715,
+ "grad_norm": 0.0074235862120985985,
+ "learning_rate": 6.540120350430423e-06,
+ "loss": 0.013,
+ "step": 3900
+ },
+ {
+ "epoch": 0.9028571428571428,
+ "grad_norm": 0.0050347852520644665,
+ "learning_rate": 5.24983515395161e-06,
+ "loss": 0.006,
+ "step": 3950
+ },
+ {
+ "epoch": 0.9142857142857143,
+ "grad_norm": 0.0023033509496599436,
+ "learning_rate": 4.097798545104914e-06,
+ "loss": 0.0084,
+ "step": 4000
+ },
+ {
+ "epoch": 0.9257142857142857,
+ "grad_norm": 0.0027543501928448677,
+ "learning_rate": 3.0856914437528805e-06,
+ "loss": 0.0185,
+ "step": 4050
+ },
+ {
+ "epoch": 0.9371428571428572,
+ "grad_norm": 0.010095668025314808,
+ "learning_rate": 2.2149906006486364e-06,
+ "loss": 0.0133,
+ "step": 4100
+ },
+ {
+ "epoch": 0.9485714285714286,
+ "grad_norm": 7.557308673858643,
+ "learning_rate": 1.4869664427303088e-06,
+ "loss": 0.0162,
+ "step": 4150
+ },
+ {
+ "epoch": 0.96,
+ "grad_norm": 0.0023619842249900103,
+ "learning_rate": 9.026812194594448e-07,
+ "loss": 0.006,
+ "step": 4200
+ },
+ {
+ "epoch": 0.9714285714285714,
+ "grad_norm": 0.0030775663908571005,
+ "learning_rate": 4.629874529084477e-07,
+ "loss": 0.005,
+ "step": 4250
+ },
+ {
+ "epoch": 0.9828571428571429,
+ "grad_norm": 0.001358982059173286,
+ "learning_rate": 1.6852669385787334e-07,
+ "loss": 0.0352,
+ "step": 4300
+ },
+ {
+ "epoch": 0.9942857142857143,
+ "grad_norm": 0.0023301932960748672,
+ "learning_rate": 1.9728585719092086e-08,
+ "loss": 0.0115,
+ "step": 4350
+ },
+ {
+ "epoch": 1.0,
+ "eval_accuracy": 0.997,
+ "eval_f1_macro": 0.996997148310402,
+ "eval_loss": 0.01342015340924263,
+ "eval_runtime": 7.328,
+ "eval_samples_per_second": 1364.629,
+ "eval_steps_per_second": 42.713,
+ "step": 4375
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 4375,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 1,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": true
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 0.0,
+ "train_batch_size": 16,
+ "trial_name": null,
+ "trial_params": null
+ }
mean/checkpoint-4375/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1bb4e594c01a307f96c5e4776130ac5e50f436bb186d6f394d8c81895e4289e9
+ size 5841
mean/classifier_head.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:58cf068c2c5b769df377d36f2ae24cc4c0b005be68e5f00ebcaff2e21f655814
+ size 63589
mean/id2label.json ADDED
@@ -0,0 +1,22 @@
+ {
+ "0": "ar",
+ "1": "bg",
+ "2": "de",
+ "3": "el",
+ "4": "en",
+ "5": "es",
+ "6": "fr",
+ "7": "hi",
+ "8": "it",
+ "9": "ja",
+ "10": "nl",
+ "11": "pl",
+ "12": "pt",
+ "13": "ru",
+ "14": "sw",
+ "15": "th",
+ "16": "tr",
+ "17": "ur",
+ "18": "vi",
+ "19": "zh"
+ }
mean/label2id.json ADDED
@@ -0,0 +1,22 @@
+ {
+ "ar": 0,
+ "bg": 1,
+ "de": 2,
+ "el": 3,
+ "en": 4,
+ "es": 5,
+ "fr": 6,
+ "hi": 7,
+ "it": 8,
+ "ja": 9,
+ "nl": 10,
+ "pl": 11,
+ "pt": 12,
+ "ru": 13,
+ "sw": 14,
+ "th": 15,
+ "tr": 16,
+ "ur": 17,
+ "vi": 18,
+ "zh": 19
+ }
mean/sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
+ size 5069051
mean/special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
+ {
+ "bos_token": "<s>",
+ "cls_token": "<s>",
+ "eos_token": "</s>",
+ "mask_token": {
+ "content": "<mask>",
+ "lstrip": true,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "<pad>",
+ "sep_token": "</s>",
+ "unk_token": "<unk>"
+ }
mean/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ea09a711f7adcb7e3bc41b614e59b829fc98e7b50b94d273d029315524364069
+ size 17082831
mean/tokenizer_config.json ADDED
@@ -0,0 +1,55 @@
+ {
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<pad>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "3": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "250001": {
+ "content": "<mask>",
+ "lstrip": true,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "clean_up_tokenization_spaces": false,
+ "cls_token": "<s>",
+ "eos_token": "</s>",
+ "extra_special_tokens": {},
+ "mask_token": "<mask>",
+ "model_max_length": 512,
+ "pad_token": "<pad>",
+ "sep_token": "</s>",
+ "tokenizer_class": "XLMRobertaTokenizer",
+ "unk_token": "<unk>"
+ }
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
+ size 5069051
special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
+ {
+ "bos_token": "<s>",
+ "cls_token": "<s>",
+ "eos_token": "</s>",
+ "mask_token": {
+ "content": "<mask>",
+ "lstrip": true,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "<pad>",
+ "sep_token": "</s>",
+ "unk_token": "<unk>"
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ea09a711f7adcb7e3bc41b614e59b829fc98e7b50b94d273d029315524364069
+ size 17082831
tokenizer_config.json ADDED
@@ -0,0 +1,55 @@
+ {
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<pad>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "3": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "250001": {
+ "content": "<mask>",
+ "lstrip": true,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "clean_up_tokenization_spaces": false,
+ "cls_token": "<s>",
+ "eos_token": "</s>",
+ "extra_special_tokens": {},
+ "mask_token": "<mask>",
+ "model_max_length": 512,
+ "pad_token": "<pad>",
+ "sep_token": "</s>",
+ "tokenizer_class": "XLMRobertaTokenizer",
+ "unk_token": "<unk>"
+ }
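Putting the pieces together, a hypothetical end-to-end inference sketch for the `mean` variant. The folder paths, the mean-pooling strategy implied by the variant name, and the `Linear(768, 20)` head shape are all inferred from this commit's layout and file sizes, not from a shipped inference script:

```python
import json
import torch
from peft import PeftModel
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("mean")
model = PeftModel.from_pretrained(AutoModel.from_pretrained("xlm-roberta-base"), "mean").eval()

head = torch.nn.Linear(768, 20)                      # assumed head shape; see the classifier_head.pt note above
head.load_state_dict(torch.load("mean/classifier_head.pt", map_location="cpu"))
head.eval()

with open("mean/id2label.json") as f:
    id2label = {int(k): v for k, v in json.load(f).items()}

enc = tokenizer("Bonjour tout le monde", return_tensors="pt")
with torch.no_grad():
    hidden = model(**enc).last_hidden_state          # (1, seq_len, 768)
    mask = enc["attention_mask"].unsqueeze(-1)       # (1, seq_len, 1)
    pooled = (hidden * mask).sum(1) / mask.sum(1)    # mean over non-padding tokens
    pred = head(pooled).argmax(-1).item()

print(id2label[pred])                                # expected: "fr"
```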