alkiskoudounas commited on
Commit
f92a635
·
verified ·
1 Parent(s): 3547a96

Upload folder using huggingface_hub

Browse files
Files changed (7) hide show
  1. config.json +269 -0
  2. model.safetensors +3 -0
  3. optimizer.pt +3 -0
  4. rng_state.pth +3 -0
  5. scheduler.pt +3 -0
  6. trainer_state.json +2883 -0
  7. training_args.bin +3 -0
config.json ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "facebook/hubert-large-ll60k",
3
+ "activation_dropout": 0.0,
4
+ "apply_spec_augment": true,
5
+ "architectures": [
6
+ "HubertForSequenceClassification"
7
+ ],
8
+ "attention_dropout": 0.1,
9
+ "bos_token_id": 1,
10
+ "classifier_proj_size": 256,
11
+ "conv_bias": true,
12
+ "conv_dim": [
13
+ 512,
14
+ 512,
15
+ 512,
16
+ 512,
17
+ 512,
18
+ 512,
19
+ 512
20
+ ],
21
+ "conv_kernel": [
22
+ 10,
23
+ 3,
24
+ 3,
25
+ 3,
26
+ 3,
27
+ 2,
28
+ 2
29
+ ],
30
+ "conv_stride": [
31
+ 5,
32
+ 2,
33
+ 2,
34
+ 2,
35
+ 2,
36
+ 2,
37
+ 2
38
+ ],
39
+ "ctc_loss_reduction": "sum",
40
+ "ctc_zero_infinity": false,
41
+ "do_stable_layer_norm": true,
42
+ "eos_token_id": 2,
43
+ "feat_extract_activation": "gelu",
44
+ "feat_extract_dropout": 0.0,
45
+ "feat_extract_norm": "layer",
46
+ "feat_proj_dropout": 0.1,
47
+ "feat_proj_layer_norm": true,
48
+ "final_dropout": 0.0,
49
+ "gradient_checkpointing": false,
50
+ "hidden_act": "gelu",
51
+ "hidden_dropout": 0.1,
52
+ "hidden_size": 1024,
53
+ "id2label": {
54
+ "0": "calendar_set",
55
+ "1": "wavs/audio_volume_up",
56
+ "10": "qa_stock",
57
+ "11": "play_music",
58
+ "12": "recommendation_events",
59
+ "13": "qa_definition",
60
+ "14": "alarm_remove",
61
+ "15": "play_podcasts",
62
+ "16": "social_query",
63
+ "17": "email_addcontact",
64
+ "18": "news_query",
65
+ "19": "calendar_query",
66
+ "2": "iot_hue_lightup",
67
+ "20": "music_likeness",
68
+ "21": "general_quirky",
69
+ "22": "qa_factoid",
70
+ "23": "takeaway_order",
71
+ "24": "play_wavs/audiobook",
72
+ "25": "iot_cleaning",
73
+ "26": "general_greet",
74
+ "27": "transport_query",
75
+ "28": "transport_taxi",
76
+ "29": "email_sendemail",
77
+ "3": "weather_query",
78
+ "30": "general_joke",
79
+ "31": "qa_maths",
80
+ "32": "social_post",
81
+ "33": "transport_ticket",
82
+ "34": "cooking_recipe",
83
+ "35": "music_settings",
84
+ "36": "calendar_remove",
85
+ "37": "iot_wemo_on",
86
+ "38": "iot_hue_lightchange",
87
+ "39": "play_radio",
88
+ "4": "iot_coffee",
89
+ "40": "email_querycontact",
90
+ "41": "transport_traffic",
91
+ "42": "qa_currency",
92
+ "43": "datetime_query",
93
+ "44": "iot_hue_lightoff",
94
+ "45": "takeaway_query",
95
+ "46": "lists_createoradd",
96
+ "47": "music_query",
97
+ "48": "recommendation_locations",
98
+ "49": "lists_query",
99
+ "5": "wavs/audio_volume_mute",
100
+ "50": "recommendation_movies",
101
+ "51": "iot_wemo_off",
102
+ "52": "iot_hue_lighton",
103
+ "53": "sendemail",
104
+ "54": "wavs/audio_volume_down",
105
+ "55": "play_game",
106
+ "56": "music",
107
+ "57": "datetime_convert",
108
+ "58": "iot_hue_lightdim",
109
+ "59": "query",
110
+ "6": "lists_remove",
111
+ "60": "createoradd",
112
+ "61": "music_dislikeness",
113
+ "62": "podcasts",
114
+ "63": "joke",
115
+ "64": "set",
116
+ "65": "hue_lightup",
117
+ "66": "factoid",
118
+ "67": "wavs/audio_volume_other",
119
+ "68": "hue_lightoff",
120
+ "69": "quirky",
121
+ "7": "email_query",
122
+ "70": "querycontact",
123
+ "71": "radio",
124
+ "72": "addcontact",
125
+ "73": "greet",
126
+ "74": "ticket",
127
+ "75": "traffic",
128
+ "76": "cooking_query",
129
+ "77": "remove",
130
+ "78": "currency",
131
+ "79": "coffee",
132
+ "8": "alarm_set",
133
+ "80": "game",
134
+ "81": "wemo_on",
135
+ "82": "definition",
136
+ "83": "events",
137
+ "84": "post",
138
+ "85": "hue_lightdim",
139
+ "86": "convert",
140
+ "87": "wemo_off",
141
+ "88": "cleaning",
142
+ "89": "settings",
143
+ "9": "alarm_query",
144
+ "90": "volume_other"
145
+ },
146
+ "initializer_range": 0.02,
147
+ "intermediate_size": 4096,
148
+ "label2id": {
149
+ "addcontact": "72",
150
+ "alarm_query": "9",
151
+ "alarm_remove": "14",
152
+ "alarm_set": "8",
153
+ "calendar_query": "19",
154
+ "calendar_remove": "36",
155
+ "calendar_set": "0",
156
+ "cleaning": "88",
157
+ "coffee": "79",
158
+ "convert": "86",
159
+ "cooking_query": "76",
160
+ "cooking_recipe": "34",
161
+ "createoradd": "60",
162
+ "currency": "78",
163
+ "datetime_convert": "57",
164
+ "datetime_query": "43",
165
+ "definition": "82",
166
+ "email_addcontact": "17",
167
+ "email_query": "7",
168
+ "email_querycontact": "40",
169
+ "email_sendemail": "29",
170
+ "events": "83",
171
+ "factoid": "66",
172
+ "game": "80",
173
+ "general_greet": "26",
174
+ "general_joke": "30",
175
+ "general_quirky": "21",
176
+ "greet": "73",
177
+ "hue_lightdim": "85",
178
+ "hue_lightoff": "68",
179
+ "hue_lightup": "65",
180
+ "iot_cleaning": "25",
181
+ "iot_coffee": "4",
182
+ "iot_hue_lightchange": "38",
183
+ "iot_hue_lightdim": "58",
184
+ "iot_hue_lightoff": "44",
185
+ "iot_hue_lighton": "52",
186
+ "iot_hue_lightup": "2",
187
+ "iot_wemo_off": "51",
188
+ "iot_wemo_on": "37",
189
+ "joke": "63",
190
+ "lists_createoradd": "46",
191
+ "lists_query": "49",
192
+ "lists_remove": "6",
193
+ "music": "56",
194
+ "music_dislikeness": "61",
195
+ "music_likeness": "20",
196
+ "music_query": "47",
197
+ "music_settings": "35",
198
+ "news_query": "18",
199
+ "play_game": "55",
200
+ "play_music": "11",
201
+ "play_podcasts": "15",
202
+ "play_radio": "39",
203
+ "play_wavs/audiobook": "24",
204
+ "podcasts": "62",
205
+ "post": "84",
206
+ "qa_currency": "42",
207
+ "qa_definition": "13",
208
+ "qa_factoid": "22",
209
+ "qa_maths": "31",
210
+ "qa_stock": "10",
211
+ "query": "59",
212
+ "querycontact": "70",
213
+ "quirky": "69",
214
+ "radio": "71",
215
+ "recommendation_events": "12",
216
+ "recommendation_locations": "48",
217
+ "recommendation_movies": "50",
218
+ "remove": "77",
219
+ "sendemail": "53",
220
+ "set": "64",
221
+ "settings": "89",
222
+ "social_post": "32",
223
+ "social_query": "16",
224
+ "takeaway_order": "23",
225
+ "takeaway_query": "45",
226
+ "ticket": "74",
227
+ "traffic": "75",
228
+ "transport_query": "27",
229
+ "transport_taxi": "28",
230
+ "transport_ticket": "33",
231
+ "transport_traffic": "41",
232
+ "volume_other": "90",
233
+ "wavs/audio_volume_down": "54",
234
+ "wavs/audio_volume_mute": "5",
235
+ "wavs/audio_volume_other": "67",
236
+ "wavs/audio_volume_up": "1",
237
+ "weather_query": "3",
238
+ "wemo_off": "87",
239
+ "wemo_on": "81"
240
+ },
241
+ "layer_norm_eps": 1e-05,
242
+ "layerdrop": 0.1,
243
+ "mask_channel_length": 10,
244
+ "mask_channel_min_space": 1,
245
+ "mask_channel_other": 0.0,
246
+ "mask_channel_prob": 0.0,
247
+ "mask_channel_selection": "static",
248
+ "mask_feature_length": 10,
249
+ "mask_feature_min_masks": 0,
250
+ "mask_feature_prob": 0.0,
251
+ "mask_time_length": 10,
252
+ "mask_time_min_masks": 2,
253
+ "mask_time_min_space": 1,
254
+ "mask_time_other": 0.0,
255
+ "mask_time_prob": 0.075,
256
+ "mask_time_selection": "static",
257
+ "model_type": "hubert",
258
+ "num_attention_heads": 16,
259
+ "num_conv_pos_embedding_groups": 16,
260
+ "num_conv_pos_embeddings": 128,
261
+ "num_feat_extract_layers": 7,
262
+ "num_hidden_layers": 24,
263
+ "pad_token_id": 0,
264
+ "tokenizer_class": "Wav2Vec2CTCTokenizer",
265
+ "torch_dtype": "float32",
266
+ "transformers_version": "4.45.2",
267
+ "use_weighted_layer_sum": false,
268
+ "vocab_size": 32
269
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94c5bf935da29df4273858bb64655d3e483c1565367faa48e0454fc328d456cb
3
+ size 1262949964
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29ed984b50ee2d8d070a53dc90f36912258aaadefb6ebf1b36c4e527a1f5e574
3
+ size 2526152656
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d724277b411545cf366b67774cfe033bc7ad010ad2913a328760fe568fdb6821
3
+ size 14308
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:417c32b40ce266eae8ab814d0af12409a7770b12ed2c5afd1b1ffbf9fa27b42f
3
+ size 1064
trainer_state.json ADDED
@@ -0,0 +1,2883 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.7934407364787112,
3
+ "best_model_checkpoint": "results/facebook/hubert-large-ll60k/42/_retain/checkpoint-30000",
4
+ "epoch": 75.80543272267846,
5
+ "eval_steps": 400,
6
+ "global_step": 30000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.2526847757422615,
13
+ "grad_norm": 7.531186580657959,
14
+ "learning_rate": 1.6666666666666667e-05,
15
+ "loss": 4.4765,
16
+ "step": 100
17
+ },
18
+ {
19
+ "epoch": 0.505369551484523,
20
+ "grad_norm": 1.3038771152496338,
21
+ "learning_rate": 3.3333333333333335e-05,
22
+ "loss": 4.1328,
23
+ "step": 200
24
+ },
25
+ {
26
+ "epoch": 0.7580543272267846,
27
+ "grad_norm": 1.8434199094772339,
28
+ "learning_rate": 5e-05,
29
+ "loss": 3.8858,
30
+ "step": 300
31
+ },
32
+ {
33
+ "epoch": 1.010739102969046,
34
+ "grad_norm": 4.255322456359863,
35
+ "learning_rate": 6.666666666666667e-05,
36
+ "loss": 3.8141,
37
+ "step": 400
38
+ },
39
+ {
40
+ "epoch": 1.010739102969046,
41
+ "eval_accuracy": 0.08400460299194476,
42
+ "eval_f1_macro": 0.008669601975968978,
43
+ "eval_loss": 3.7715396881103516,
44
+ "eval_runtime": 108.6457,
45
+ "eval_samples_per_second": 79.985,
46
+ "eval_steps_per_second": 2.504,
47
+ "step": 400
48
+ },
49
+ {
50
+ "epoch": 1.2634238787113077,
51
+ "grad_norm": 5.987393379211426,
52
+ "learning_rate": 8.333333333333333e-05,
53
+ "loss": 3.7488,
54
+ "step": 500
55
+ },
56
+ {
57
+ "epoch": 1.5161086544535691,
58
+ "grad_norm": 4.6245269775390625,
59
+ "learning_rate": 0.0001,
60
+ "loss": 3.6051,
61
+ "step": 600
62
+ },
63
+ {
64
+ "epoch": 1.7687934301958306,
65
+ "grad_norm": 7.047222137451172,
66
+ "learning_rate": 0.00011666666666666667,
67
+ "loss": 3.2059,
68
+ "step": 700
69
+ },
70
+ {
71
+ "epoch": 2.021478205938092,
72
+ "grad_norm": 11.504085540771484,
73
+ "learning_rate": 0.00013333333333333334,
74
+ "loss": 2.7291,
75
+ "step": 800
76
+ },
77
+ {
78
+ "epoch": 2.021478205938092,
79
+ "eval_accuracy": 0.42681242807825087,
80
+ "eval_f1_macro": 0.17578343384188652,
81
+ "eval_loss": 2.323087215423584,
82
+ "eval_runtime": 108.6246,
83
+ "eval_samples_per_second": 80.0,
84
+ "eval_steps_per_second": 2.504,
85
+ "step": 800
86
+ },
87
+ {
88
+ "epoch": 2.274162981680354,
89
+ "grad_norm": 9.302848815917969,
90
+ "learning_rate": 0.00015,
91
+ "loss": 2.2862,
92
+ "step": 900
93
+ },
94
+ {
95
+ "epoch": 2.5268477574226154,
96
+ "grad_norm": 16.832901000976562,
97
+ "learning_rate": 0.00016666666666666666,
98
+ "loss": 1.9558,
99
+ "step": 1000
100
+ },
101
+ {
102
+ "epoch": 2.779532533164877,
103
+ "grad_norm": 18.594715118408203,
104
+ "learning_rate": 0.00018333333333333334,
105
+ "loss": 1.6921,
106
+ "step": 1100
107
+ },
108
+ {
109
+ "epoch": 3.0322173089071383,
110
+ "grad_norm": 21.0137882232666,
111
+ "learning_rate": 0.0002,
112
+ "loss": 1.4799,
113
+ "step": 1200
114
+ },
115
+ {
116
+ "epoch": 3.0322173089071383,
117
+ "eval_accuracy": 0.68204833141542,
118
+ "eval_f1_macro": 0.4656587521593648,
119
+ "eval_loss": 1.2915173768997192,
120
+ "eval_runtime": 108.6127,
121
+ "eval_samples_per_second": 80.009,
122
+ "eval_steps_per_second": 2.504,
123
+ "step": 1200
124
+ },
125
+ {
126
+ "epoch": 3.2849020846493997,
127
+ "grad_norm": 10.087828636169434,
128
+ "learning_rate": 0.00021666666666666668,
129
+ "loss": 1.264,
130
+ "step": 1300
131
+ },
132
+ {
133
+ "epoch": 3.537586860391661,
134
+ "grad_norm": 8.730626106262207,
135
+ "learning_rate": 0.00023333333333333333,
136
+ "loss": 1.1697,
137
+ "step": 1400
138
+ },
139
+ {
140
+ "epoch": 3.790271636133923,
141
+ "grad_norm": 19.08763885498047,
142
+ "learning_rate": 0.00025,
143
+ "loss": 1.0774,
144
+ "step": 1500
145
+ },
146
+ {
147
+ "epoch": 4.042956411876184,
148
+ "grad_norm": 12.934535026550293,
149
+ "learning_rate": 0.0002666666666666667,
150
+ "loss": 1.0037,
151
+ "step": 1600
152
+ },
153
+ {
154
+ "epoch": 4.042956411876184,
155
+ "eval_accuracy": 0.7395857307249712,
156
+ "eval_f1_macro": 0.5527706813137244,
157
+ "eval_loss": 1.1226997375488281,
158
+ "eval_runtime": 108.4906,
159
+ "eval_samples_per_second": 80.099,
160
+ "eval_steps_per_second": 2.507,
161
+ "step": 1600
162
+ },
163
+ {
164
+ "epoch": 4.295641187618446,
165
+ "grad_norm": 11.546894073486328,
166
+ "learning_rate": 0.00028333333333333335,
167
+ "loss": 0.8737,
168
+ "step": 1700
169
+ },
170
+ {
171
+ "epoch": 4.548325963360708,
172
+ "grad_norm": 10.242780685424805,
173
+ "learning_rate": 0.0003,
174
+ "loss": 0.8296,
175
+ "step": 1800
176
+ },
177
+ {
178
+ "epoch": 4.801010739102969,
179
+ "grad_norm": 9.696259498596191,
180
+ "learning_rate": 0.00031666666666666665,
181
+ "loss": 0.8139,
182
+ "step": 1900
183
+ },
184
+ {
185
+ "epoch": 5.053695514845231,
186
+ "grad_norm": 6.417578220367432,
187
+ "learning_rate": 0.0003333333333333333,
188
+ "loss": 0.7768,
189
+ "step": 2000
190
+ },
191
+ {
192
+ "epoch": 5.053695514845231,
193
+ "eval_accuracy": 0.7643268124280782,
194
+ "eval_f1_macro": 0.5816116111187802,
195
+ "eval_loss": 1.0890038013458252,
196
+ "eval_runtime": 108.7404,
197
+ "eval_samples_per_second": 79.915,
198
+ "eval_steps_per_second": 2.501,
199
+ "step": 2000
200
+ },
201
+ {
202
+ "epoch": 5.306380290587492,
203
+ "grad_norm": 9.01593017578125,
204
+ "learning_rate": 0.00035,
205
+ "loss": 0.7057,
206
+ "step": 2100
207
+ },
208
+ {
209
+ "epoch": 5.559065066329754,
210
+ "grad_norm": 6.170841217041016,
211
+ "learning_rate": 0.00036666666666666667,
212
+ "loss": 0.6655,
213
+ "step": 2200
214
+ },
215
+ {
216
+ "epoch": 5.8117498420720155,
217
+ "grad_norm": 9.565030097961426,
218
+ "learning_rate": 0.00038333333333333334,
219
+ "loss": 0.6767,
220
+ "step": 2300
221
+ },
222
+ {
223
+ "epoch": 6.0644346178142765,
224
+ "grad_norm": 11.446185111999512,
225
+ "learning_rate": 0.0004,
226
+ "loss": 0.6207,
227
+ "step": 2400
228
+ },
229
+ {
230
+ "epoch": 6.0644346178142765,
231
+ "eval_accuracy": 0.756156501726122,
232
+ "eval_f1_macro": 0.5962013864268475,
233
+ "eval_loss": 1.1590509414672852,
234
+ "eval_runtime": 108.3732,
235
+ "eval_samples_per_second": 80.186,
236
+ "eval_steps_per_second": 2.51,
237
+ "step": 2400
238
+ },
239
+ {
240
+ "epoch": 6.317119393556538,
241
+ "grad_norm": 6.382347583770752,
242
+ "learning_rate": 0.0004166666666666667,
243
+ "loss": 0.5489,
244
+ "step": 2500
245
+ },
246
+ {
247
+ "epoch": 6.569804169298799,
248
+ "grad_norm": 7.796228408813477,
249
+ "learning_rate": 0.00043333333333333337,
250
+ "loss": 0.5615,
251
+ "step": 2600
252
+ },
253
+ {
254
+ "epoch": 6.822488945041061,
255
+ "grad_norm": 10.157086372375488,
256
+ "learning_rate": 0.00045000000000000004,
257
+ "loss": 0.61,
258
+ "step": 2700
259
+ },
260
+ {
261
+ "epoch": 7.075173720783323,
262
+ "grad_norm": 9.361109733581543,
263
+ "learning_rate": 0.00046666666666666666,
264
+ "loss": 0.5457,
265
+ "step": 2800
266
+ },
267
+ {
268
+ "epoch": 7.075173720783323,
269
+ "eval_accuracy": 0.75385500575374,
270
+ "eval_f1_macro": 0.594201593201007,
271
+ "eval_loss": 1.2054548263549805,
272
+ "eval_runtime": 108.935,
273
+ "eval_samples_per_second": 79.772,
274
+ "eval_steps_per_second": 2.497,
275
+ "step": 2800
276
+ },
277
+ {
278
+ "epoch": 7.327858496525584,
279
+ "grad_norm": 4.478394031524658,
280
+ "learning_rate": 0.00048333333333333334,
281
+ "loss": 0.5006,
282
+ "step": 2900
283
+ },
284
+ {
285
+ "epoch": 7.580543272267846,
286
+ "grad_norm": 7.6330366134643555,
287
+ "learning_rate": 0.0005,
288
+ "loss": 0.5456,
289
+ "step": 3000
290
+ },
291
+ {
292
+ "epoch": 7.833228048010107,
293
+ "grad_norm": 6.51686429977417,
294
+ "learning_rate": 0.0004981481481481482,
295
+ "loss": 0.5245,
296
+ "step": 3100
297
+ },
298
+ {
299
+ "epoch": 8.085912823752368,
300
+ "grad_norm": 6.197965621948242,
301
+ "learning_rate": 0.0004962962962962963,
302
+ "loss": 0.4905,
303
+ "step": 3200
304
+ },
305
+ {
306
+ "epoch": 8.085912823752368,
307
+ "eval_accuracy": 0.7558112773302647,
308
+ "eval_f1_macro": 0.591751644264711,
309
+ "eval_loss": 1.168681263923645,
310
+ "eval_runtime": 108.6923,
311
+ "eval_samples_per_second": 79.95,
312
+ "eval_steps_per_second": 2.502,
313
+ "step": 3200
314
+ },
315
+ {
316
+ "epoch": 8.33859759949463,
317
+ "grad_norm": 5.975191116333008,
318
+ "learning_rate": 0.0004944444444444445,
319
+ "loss": 0.4324,
320
+ "step": 3300
321
+ },
322
+ {
323
+ "epoch": 8.591282375236892,
324
+ "grad_norm": 6.450153827667236,
325
+ "learning_rate": 0.0004925925925925925,
326
+ "loss": 0.4385,
327
+ "step": 3400
328
+ },
329
+ {
330
+ "epoch": 8.843967150979154,
331
+ "grad_norm": 3.6089916229248047,
332
+ "learning_rate": 0.0004907407407407408,
333
+ "loss": 0.4312,
334
+ "step": 3500
335
+ },
336
+ {
337
+ "epoch": 9.096651926721416,
338
+ "grad_norm": 5.325081825256348,
339
+ "learning_rate": 0.0004888888888888889,
340
+ "loss": 0.3977,
341
+ "step": 3600
342
+ },
343
+ {
344
+ "epoch": 9.096651926721416,
345
+ "eval_accuracy": 0.7486766398158803,
346
+ "eval_f1_macro": 0.5505823611427755,
347
+ "eval_loss": 1.3269578218460083,
348
+ "eval_runtime": 108.9968,
349
+ "eval_samples_per_second": 79.727,
350
+ "eval_steps_per_second": 2.495,
351
+ "step": 3600
352
+ },
353
+ {
354
+ "epoch": 9.349336702463676,
355
+ "grad_norm": 6.121856212615967,
356
+ "learning_rate": 0.00048703703703703707,
357
+ "loss": 0.3766,
358
+ "step": 3700
359
+ },
360
+ {
361
+ "epoch": 9.602021478205938,
362
+ "grad_norm": 3.0020909309387207,
363
+ "learning_rate": 0.0004851851851851852,
364
+ "loss": 0.3605,
365
+ "step": 3800
366
+ },
367
+ {
368
+ "epoch": 9.8547062539482,
369
+ "grad_norm": 3.841557502746582,
370
+ "learning_rate": 0.00048333333333333334,
371
+ "loss": 0.3802,
372
+ "step": 3900
373
+ },
374
+ {
375
+ "epoch": 10.107391029690461,
376
+ "grad_norm": 7.770158767700195,
377
+ "learning_rate": 0.00048148148148148144,
378
+ "loss": 0.3404,
379
+ "step": 4000
380
+ },
381
+ {
382
+ "epoch": 10.107391029690461,
383
+ "eval_accuracy": 0.7620253164556962,
384
+ "eval_f1_macro": 0.5823323463909736,
385
+ "eval_loss": 1.2837411165237427,
386
+ "eval_runtime": 108.7289,
387
+ "eval_samples_per_second": 79.924,
388
+ "eval_steps_per_second": 2.502,
389
+ "step": 4000
390
+ },
391
+ {
392
+ "epoch": 10.360075805432723,
393
+ "grad_norm": 4.366064071655273,
394
+ "learning_rate": 0.00047962962962962965,
395
+ "loss": 0.314,
396
+ "step": 4100
397
+ },
398
+ {
399
+ "epoch": 10.612760581174983,
400
+ "grad_norm": 9.094232559204102,
401
+ "learning_rate": 0.0004777777777777778,
402
+ "loss": 0.3174,
403
+ "step": 4200
404
+ },
405
+ {
406
+ "epoch": 10.865445356917245,
407
+ "grad_norm": 4.532145023345947,
408
+ "learning_rate": 0.0004759259259259259,
409
+ "loss": 0.3083,
410
+ "step": 4300
411
+ },
412
+ {
413
+ "epoch": 11.118130132659507,
414
+ "grad_norm": 2.8124845027923584,
415
+ "learning_rate": 0.0004740740740740741,
416
+ "loss": 0.28,
417
+ "step": 4400
418
+ },
419
+ {
420
+ "epoch": 11.118130132659507,
421
+ "eval_accuracy": 0.7566168009205984,
422
+ "eval_f1_macro": 0.5792237564159676,
423
+ "eval_loss": 1.4293328523635864,
424
+ "eval_runtime": 108.8172,
425
+ "eval_samples_per_second": 79.859,
426
+ "eval_steps_per_second": 2.5,
427
+ "step": 4400
428
+ },
429
+ {
430
+ "epoch": 11.37081490840177,
431
+ "grad_norm": 3.8208773136138916,
432
+ "learning_rate": 0.00047222222222222224,
433
+ "loss": 0.2614,
434
+ "step": 4500
435
+ },
436
+ {
437
+ "epoch": 11.623499684144031,
438
+ "grad_norm": 4.572866916656494,
439
+ "learning_rate": 0.0004703703703703704,
440
+ "loss": 0.2696,
441
+ "step": 4600
442
+ },
443
+ {
444
+ "epoch": 11.876184459886291,
445
+ "grad_norm": 3.239572763442993,
446
+ "learning_rate": 0.0004685185185185185,
447
+ "loss": 0.2691,
448
+ "step": 4700
449
+ },
450
+ {
451
+ "epoch": 12.128869235628553,
452
+ "grad_norm": 4.106109619140625,
453
+ "learning_rate": 0.00046666666666666666,
454
+ "loss": 0.2564,
455
+ "step": 4800
456
+ },
457
+ {
458
+ "epoch": 12.128869235628553,
459
+ "eval_accuracy": 0.7440736478711162,
460
+ "eval_f1_macro": 0.5907392176160607,
461
+ "eval_loss": 1.45040762424469,
462
+ "eval_runtime": 108.4161,
463
+ "eval_samples_per_second": 80.154,
464
+ "eval_steps_per_second": 2.509,
465
+ "step": 4800
466
+ },
467
+ {
468
+ "epoch": 12.381554011370815,
469
+ "grad_norm": 2.1584527492523193,
470
+ "learning_rate": 0.0004648148148148148,
471
+ "loss": 0.2412,
472
+ "step": 4900
473
+ },
474
+ {
475
+ "epoch": 12.634238787113077,
476
+ "grad_norm": 5.035179138183594,
477
+ "learning_rate": 0.000462962962962963,
478
+ "loss": 0.2345,
479
+ "step": 5000
480
+ },
481
+ {
482
+ "epoch": 12.886923562855339,
483
+ "grad_norm": 4.4065752029418945,
484
+ "learning_rate": 0.00046111111111111114,
485
+ "loss": 0.2373,
486
+ "step": 5100
487
+ },
488
+ {
489
+ "epoch": 13.139608338597599,
490
+ "grad_norm": 3.795588254928589,
491
+ "learning_rate": 0.00045925925925925925,
492
+ "loss": 0.2166,
493
+ "step": 5200
494
+ },
495
+ {
496
+ "epoch": 13.139608338597599,
497
+ "eval_accuracy": 0.7540851553509781,
498
+ "eval_f1_macro": 0.5753080086949651,
499
+ "eval_loss": 1.4374622106552124,
500
+ "eval_runtime": 108.3257,
501
+ "eval_samples_per_second": 80.221,
502
+ "eval_steps_per_second": 2.511,
503
+ "step": 5200
504
+ },
505
+ {
506
+ "epoch": 13.39229311433986,
507
+ "grad_norm": 3.1705026626586914,
508
+ "learning_rate": 0.00045740740740740746,
509
+ "loss": 0.2114,
510
+ "step": 5300
511
+ },
512
+ {
513
+ "epoch": 13.644977890082123,
514
+ "grad_norm": 5.109796047210693,
515
+ "learning_rate": 0.00045555555555555556,
516
+ "loss": 0.2207,
517
+ "step": 5400
518
+ },
519
+ {
520
+ "epoch": 13.897662665824384,
521
+ "grad_norm": 5.884572982788086,
522
+ "learning_rate": 0.0004537037037037037,
523
+ "loss": 0.2135,
524
+ "step": 5500
525
+ },
526
+ {
527
+ "epoch": 14.150347441566646,
528
+ "grad_norm": 4.0508246421813965,
529
+ "learning_rate": 0.00045185185185185183,
530
+ "loss": 0.2029,
531
+ "step": 5600
532
+ },
533
+ {
534
+ "epoch": 14.150347441566646,
535
+ "eval_accuracy": 0.7579976985040277,
536
+ "eval_f1_macro": 0.5636077065338464,
537
+ "eval_loss": 1.3845158815383911,
538
+ "eval_runtime": 108.3008,
539
+ "eval_samples_per_second": 80.239,
540
+ "eval_steps_per_second": 2.512,
541
+ "step": 5600
542
+ },
543
+ {
544
+ "epoch": 14.403032217308906,
545
+ "grad_norm": 3.048769235610962,
546
+ "learning_rate": 0.00045000000000000004,
547
+ "loss": 0.1842,
548
+ "step": 5700
549
+ },
550
+ {
551
+ "epoch": 14.655716993051168,
552
+ "grad_norm": 5.531657695770264,
553
+ "learning_rate": 0.00044814814814814815,
554
+ "loss": 0.1969,
555
+ "step": 5800
556
+ },
557
+ {
558
+ "epoch": 14.90840176879343,
559
+ "grad_norm": 4.874760150909424,
560
+ "learning_rate": 0.0004462962962962963,
561
+ "loss": 0.187,
562
+ "step": 5900
563
+ },
564
+ {
565
+ "epoch": 15.161086544535692,
566
+ "grad_norm": 4.56043815612793,
567
+ "learning_rate": 0.0004444444444444444,
568
+ "loss": 0.1781,
569
+ "step": 6000
570
+ },
571
+ {
572
+ "epoch": 15.161086544535692,
573
+ "eval_accuracy": 0.7592635212888378,
574
+ "eval_f1_macro": 0.6140090678141539,
575
+ "eval_loss": 1.4697922468185425,
576
+ "eval_runtime": 108.1742,
577
+ "eval_samples_per_second": 80.333,
578
+ "eval_steps_per_second": 2.514,
579
+ "step": 6000
580
+ },
581
+ {
582
+ "epoch": 15.413771320277954,
583
+ "grad_norm": 2.3174920082092285,
584
+ "learning_rate": 0.0004425925925925926,
585
+ "loss": 0.1646,
586
+ "step": 6100
587
+ },
588
+ {
589
+ "epoch": 15.666456096020214,
590
+ "grad_norm": 6.532171726226807,
591
+ "learning_rate": 0.0004407407407407408,
592
+ "loss": 0.176,
593
+ "step": 6200
594
+ },
595
+ {
596
+ "epoch": 15.919140871762476,
597
+ "grad_norm": 8.349685668945312,
598
+ "learning_rate": 0.0004388888888888889,
599
+ "loss": 0.1819,
600
+ "step": 6300
601
+ },
602
+ {
603
+ "epoch": 16.171825647504736,
604
+ "grad_norm": 2.4073634147644043,
605
+ "learning_rate": 0.00043703703703703705,
606
+ "loss": 0.1559,
607
+ "step": 6400
608
+ },
609
+ {
610
+ "epoch": 16.171825647504736,
611
+ "eval_accuracy": 0.7581127733026467,
612
+ "eval_f1_macro": 0.6005969862926049,
613
+ "eval_loss": 1.6600337028503418,
614
+ "eval_runtime": 108.333,
615
+ "eval_samples_per_second": 80.216,
616
+ "eval_steps_per_second": 2.511,
617
+ "step": 6400
618
+ },
619
+ {
620
+ "epoch": 16.424510423247,
621
+ "grad_norm": 5.727371692657471,
622
+ "learning_rate": 0.0004351851851851852,
623
+ "loss": 0.1574,
624
+ "step": 6500
625
+ },
626
+ {
627
+ "epoch": 16.67719519898926,
628
+ "grad_norm": 5.15314245223999,
629
+ "learning_rate": 0.00043333333333333337,
630
+ "loss": 0.162,
631
+ "step": 6600
632
+ },
633
+ {
634
+ "epoch": 16.929879974731524,
635
+ "grad_norm": 4.066269397735596,
636
+ "learning_rate": 0.00043148148148148147,
637
+ "loss": 0.1541,
638
+ "step": 6700
639
+ },
640
+ {
641
+ "epoch": 17.182564750473784,
642
+ "grad_norm": 2.442532777786255,
643
+ "learning_rate": 0.00042962962962962963,
644
+ "loss": 0.1449,
645
+ "step": 6800
646
+ },
647
+ {
648
+ "epoch": 17.182564750473784,
649
+ "eval_accuracy": 0.7632911392405063,
650
+ "eval_f1_macro": 0.5902398414509258,
651
+ "eval_loss": 1.5675625801086426,
652
+ "eval_runtime": 108.2685,
653
+ "eval_samples_per_second": 80.263,
654
+ "eval_steps_per_second": 2.512,
655
+ "step": 6800
656
+ },
657
+ {
658
+ "epoch": 17.435249526216044,
659
+ "grad_norm": 2.888688564300537,
660
+ "learning_rate": 0.0004277777777777778,
661
+ "loss": 0.1602,
662
+ "step": 6900
663
+ },
664
+ {
665
+ "epoch": 17.687934301958308,
666
+ "grad_norm": 1.7030471563339233,
667
+ "learning_rate": 0.00042592592592592595,
668
+ "loss": 0.141,
669
+ "step": 7000
670
+ },
671
+ {
672
+ "epoch": 17.940619077700568,
673
+ "grad_norm": 3.838587760925293,
674
+ "learning_rate": 0.00042407407407407406,
675
+ "loss": 0.14,
676
+ "step": 7100
677
+ },
678
+ {
679
+ "epoch": 18.19330385344283,
680
+ "grad_norm": 2.5156667232513428,
681
+ "learning_rate": 0.0004222222222222222,
682
+ "loss": 0.144,
683
+ "step": 7200
684
+ },
685
+ {
686
+ "epoch": 18.19330385344283,
687
+ "eval_accuracy": 0.7498273878020714,
688
+ "eval_f1_macro": 0.5654132069159379,
689
+ "eval_loss": 1.4789820909500122,
690
+ "eval_runtime": 108.261,
691
+ "eval_samples_per_second": 80.269,
692
+ "eval_steps_per_second": 2.512,
693
+ "step": 7200
694
+ },
695
+ {
696
+ "epoch": 18.44598862918509,
697
+ "grad_norm": 4.657927513122559,
698
+ "learning_rate": 0.00042037037037037043,
699
+ "loss": 0.1385,
700
+ "step": 7300
701
+ },
702
+ {
703
+ "epoch": 18.69867340492735,
704
+ "grad_norm": 5.224093914031982,
705
+ "learning_rate": 0.00041851851851851853,
706
+ "loss": 0.1333,
707
+ "step": 7400
708
+ },
709
+ {
710
+ "epoch": 18.951358180669615,
711
+ "grad_norm": 2.436795949935913,
712
+ "learning_rate": 0.0004166666666666667,
713
+ "loss": 0.1333,
714
+ "step": 7500
715
+ },
716
+ {
717
+ "epoch": 19.204042956411875,
718
+ "grad_norm": 5.1690354347229,
719
+ "learning_rate": 0.0004148148148148148,
720
+ "loss": 0.1136,
721
+ "step": 7600
722
+ },
723
+ {
724
+ "epoch": 19.204042956411875,
725
+ "eval_accuracy": 0.7498273878020714,
726
+ "eval_f1_macro": 0.5531386036990584,
727
+ "eval_loss": 1.6063206195831299,
728
+ "eval_runtime": 108.3182,
729
+ "eval_samples_per_second": 80.227,
730
+ "eval_steps_per_second": 2.511,
731
+ "step": 7600
732
+ },
733
+ {
734
+ "epoch": 19.45672773215414,
735
+ "grad_norm": 1.983140230178833,
736
+ "learning_rate": 0.00041296296296296296,
737
+ "loss": 0.1265,
738
+ "step": 7700
739
+ },
740
+ {
741
+ "epoch": 19.7094125078964,
742
+ "grad_norm": 2.0450029373168945,
743
+ "learning_rate": 0.0004111111111111111,
744
+ "loss": 0.1221,
745
+ "step": 7800
746
+ },
747
+ {
748
+ "epoch": 19.96209728363866,
749
+ "grad_norm": 6.7355804443359375,
750
+ "learning_rate": 0.0004092592592592593,
751
+ "loss": 0.1369,
752
+ "step": 7900
753
+ },
754
+ {
755
+ "epoch": 20.214782059380923,
756
+ "grad_norm": 3.7778196334838867,
757
+ "learning_rate": 0.0004074074074074074,
758
+ "loss": 0.1082,
759
+ "step": 8000
760
+ },
761
+ {
762
+ "epoch": 20.214782059380923,
763
+ "eval_accuracy": 0.7598388952819333,
764
+ "eval_f1_macro": 0.5650987069075204,
765
+ "eval_loss": 1.6282719373703003,
766
+ "eval_runtime": 108.2582,
767
+ "eval_samples_per_second": 80.271,
768
+ "eval_steps_per_second": 2.513,
769
+ "step": 8000
770
+ },
771
+ {
772
+ "epoch": 20.467466835123183,
773
+ "grad_norm": 3.0025575160980225,
774
+ "learning_rate": 0.00040555555555555554,
775
+ "loss": 0.1108,
776
+ "step": 8100
777
+ },
778
+ {
779
+ "epoch": 20.720151610865447,
780
+ "grad_norm": 3.365360736846924,
781
+ "learning_rate": 0.00040370370370370375,
782
+ "loss": 0.1188,
783
+ "step": 8200
784
+ },
785
+ {
786
+ "epoch": 20.972836386607707,
787
+ "grad_norm": 1.7243456840515137,
788
+ "learning_rate": 0.00040185185185185186,
789
+ "loss": 0.1152,
790
+ "step": 8300
791
+ },
792
+ {
793
+ "epoch": 21.225521162349967,
794
+ "grad_norm": 2.3120861053466797,
795
+ "learning_rate": 0.0004,
796
+ "loss": 0.109,
797
+ "step": 8400
798
+ },
799
+ {
800
+ "epoch": 21.225521162349967,
801
+ "eval_accuracy": 0.7635212888377445,
802
+ "eval_f1_macro": 0.569418631056331,
803
+ "eval_loss": 1.752591848373413,
804
+ "eval_runtime": 108.3807,
805
+ "eval_samples_per_second": 80.18,
806
+ "eval_steps_per_second": 2.51,
807
+ "step": 8400
808
+ },
809
+ {
810
+ "epoch": 21.47820593809223,
811
+ "grad_norm": 3.04634165763855,
812
+ "learning_rate": 0.0003981481481481481,
813
+ "loss": 0.1219,
814
+ "step": 8500
815
+ },
816
+ {
817
+ "epoch": 21.73089071383449,
818
+ "grad_norm": 5.413991928100586,
819
+ "learning_rate": 0.00039629629629629634,
820
+ "loss": 0.1122,
821
+ "step": 8600
822
+ },
823
+ {
824
+ "epoch": 21.983575489576754,
825
+ "grad_norm": 3.94024395942688,
826
+ "learning_rate": 0.00039444444444444444,
827
+ "loss": 0.1023,
828
+ "step": 8700
829
+ },
830
+ {
831
+ "epoch": 22.236260265319014,
832
+ "grad_norm": 4.625360488891602,
833
+ "learning_rate": 0.0003925925925925926,
834
+ "loss": 0.1173,
835
+ "step": 8800
836
+ },
837
+ {
838
+ "epoch": 22.236260265319014,
839
+ "eval_accuracy": 0.7597238204833141,
840
+ "eval_f1_macro": 0.5837145389837423,
841
+ "eval_loss": 1.5915985107421875,
842
+ "eval_runtime": 108.3288,
843
+ "eval_samples_per_second": 80.219,
844
+ "eval_steps_per_second": 2.511,
845
+ "step": 8800
846
+ },
847
+ {
848
+ "epoch": 22.488945041061275,
849
+ "grad_norm": 4.926342010498047,
850
+ "learning_rate": 0.0003907407407407407,
851
+ "loss": 0.1059,
852
+ "step": 8900
853
+ },
854
+ {
855
+ "epoch": 22.74162981680354,
856
+ "grad_norm": 3.0098073482513428,
857
+ "learning_rate": 0.0003888888888888889,
858
+ "loss": 0.1021,
859
+ "step": 9000
860
+ },
861
+ {
862
+ "epoch": 22.9943145925458,
863
+ "grad_norm": 1.6922105550765991,
864
+ "learning_rate": 0.000387037037037037,
865
+ "loss": 0.1046,
866
+ "step": 9100
867
+ },
868
+ {
869
+ "epoch": 23.246999368288062,
870
+ "grad_norm": 4.048877239227295,
871
+ "learning_rate": 0.0003851851851851852,
872
+ "loss": 0.094,
873
+ "step": 9200
874
+ },
875
+ {
876
+ "epoch": 23.246999368288062,
877
+ "eval_accuracy": 0.7614499424626007,
878
+ "eval_f1_macro": 0.575035765001433,
879
+ "eval_loss": 1.631859302520752,
880
+ "eval_runtime": 108.7827,
881
+ "eval_samples_per_second": 79.884,
882
+ "eval_steps_per_second": 2.5,
883
+ "step": 9200
884
+ },
885
+ {
886
+ "epoch": 23.499684144030322,
887
+ "grad_norm": 3.0817179679870605,
888
+ "learning_rate": 0.00038333333333333334,
889
+ "loss": 0.1134,
890
+ "step": 9300
891
+ },
892
+ {
893
+ "epoch": 23.752368919772582,
894
+ "grad_norm": 0.7254201769828796,
895
+ "learning_rate": 0.0003814814814814815,
896
+ "loss": 0.0916,
897
+ "step": 9400
898
+ },
899
+ {
900
+ "epoch": 24.005053695514846,
901
+ "grad_norm": 1.1015946865081787,
902
+ "learning_rate": 0.00037962962962962966,
903
+ "loss": 0.1014,
904
+ "step": 9500
905
+ },
906
+ {
907
+ "epoch": 24.257738471257106,
908
+ "grad_norm": 1.6841648817062378,
909
+ "learning_rate": 0.00037777777777777777,
910
+ "loss": 0.0819,
911
+ "step": 9600
912
+ },
913
+ {
914
+ "epoch": 24.257738471257106,
915
+ "eval_accuracy": 0.7601841196777905,
916
+ "eval_f1_macro": 0.5741548033552556,
917
+ "eval_loss": 1.7696462869644165,
918
+ "eval_runtime": 108.7945,
919
+ "eval_samples_per_second": 79.875,
920
+ "eval_steps_per_second": 2.5,
921
+ "step": 9600
922
+ },
923
+ {
924
+ "epoch": 24.51042324699937,
925
+ "grad_norm": 1.9033466577529907,
926
+ "learning_rate": 0.00037592592592592593,
927
+ "loss": 0.1013,
928
+ "step": 9700
929
+ },
930
+ {
931
+ "epoch": 24.76310802274163,
932
+ "grad_norm": 3.788855791091919,
933
+ "learning_rate": 0.0003740740740740741,
934
+ "loss": 0.1057,
935
+ "step": 9800
936
+ },
937
+ {
938
+ "epoch": 25.01579279848389,
939
+ "grad_norm": 1.6506637334823608,
940
+ "learning_rate": 0.00037222222222222225,
941
+ "loss": 0.0979,
942
+ "step": 9900
943
+ },
944
+ {
945
+ "epoch": 25.268477574226154,
946
+ "grad_norm": 4.115843296051025,
947
+ "learning_rate": 0.00037037037037037035,
948
+ "loss": 0.0978,
949
+ "step": 10000
950
+ },
951
+ {
952
+ "epoch": 25.268477574226154,
953
+ "eval_accuracy": 0.7659378596087457,
954
+ "eval_f1_macro": 0.5667775496828193,
955
+ "eval_loss": 1.633931040763855,
956
+ "eval_runtime": 108.6433,
957
+ "eval_samples_per_second": 79.987,
958
+ "eval_steps_per_second": 2.504,
959
+ "step": 10000
960
+ },
961
+ {
962
+ "epoch": 25.521162349968414,
963
+ "grad_norm": 3.1308891773223877,
964
+ "learning_rate": 0.0003685185185185185,
965
+ "loss": 0.0804,
966
+ "step": 10100
967
+ },
968
+ {
969
+ "epoch": 25.773847125710677,
970
+ "grad_norm": 3.1427507400512695,
971
+ "learning_rate": 0.00036666666666666667,
972
+ "loss": 0.0861,
973
+ "step": 10200
974
+ },
975
+ {
976
+ "epoch": 26.026531901452937,
977
+ "grad_norm": 2.7073075771331787,
978
+ "learning_rate": 0.00036481481481481483,
979
+ "loss": 0.0846,
980
+ "step": 10300
981
+ },
982
+ {
983
+ "epoch": 26.279216677195198,
984
+ "grad_norm": 0.8588293194770813,
985
+ "learning_rate": 0.000362962962962963,
986
+ "loss": 0.0854,
987
+ "step": 10400
988
+ },
989
+ {
990
+ "epoch": 26.279216677195198,
991
+ "eval_accuracy": 0.7668584579976985,
992
+ "eval_f1_macro": 0.566287765137334,
993
+ "eval_loss": 1.6085623502731323,
994
+ "eval_runtime": 108.9617,
995
+ "eval_samples_per_second": 79.753,
996
+ "eval_steps_per_second": 2.496,
997
+ "step": 10400
998
+ },
999
+ {
1000
+ "epoch": 26.53190145293746,
1001
+ "grad_norm": 2.541748285293579,
1002
+ "learning_rate": 0.0003611111111111111,
1003
+ "loss": 0.0828,
1004
+ "step": 10500
1005
+ },
1006
+ {
1007
+ "epoch": 26.78458622867972,
1008
+ "grad_norm": 4.645153999328613,
1009
+ "learning_rate": 0.0003592592592592593,
1010
+ "loss": 0.0943,
1011
+ "step": 10600
1012
+ },
1013
+ {
1014
+ "epoch": 27.037271004421985,
1015
+ "grad_norm": 2.1111841201782227,
1016
+ "learning_rate": 0.0003574074074074074,
1017
+ "loss": 0.0757,
1018
+ "step": 10700
1019
+ },
1020
+ {
1021
+ "epoch": 27.289955780164245,
1022
+ "grad_norm": 1.5534864664077759,
1023
+ "learning_rate": 0.00035555555555555557,
1024
+ "loss": 0.0818,
1025
+ "step": 10800
1026
+ },
1027
+ {
1028
+ "epoch": 27.289955780164245,
1029
+ "eval_accuracy": 0.768584579976985,
1030
+ "eval_f1_macro": 0.5967195650070578,
1031
+ "eval_loss": 1.6560544967651367,
1032
+ "eval_runtime": 108.8685,
1033
+ "eval_samples_per_second": 79.821,
1034
+ "eval_steps_per_second": 2.498,
1035
+ "step": 10800
1036
+ },
1037
+ {
1038
+ "epoch": 27.542640555906505,
1039
+ "grad_norm": 1.9038456678390503,
1040
+ "learning_rate": 0.0003537037037037037,
1041
+ "loss": 0.081,
1042
+ "step": 10900
1043
+ },
1044
+ {
1045
+ "epoch": 27.79532533164877,
1046
+ "grad_norm": 1.55673348903656,
1047
+ "learning_rate": 0.0003518518518518519,
1048
+ "loss": 0.0828,
1049
+ "step": 11000
1050
+ },
1051
+ {
1052
+ "epoch": 28.04801010739103,
1053
+ "grad_norm": 5.855372905731201,
1054
+ "learning_rate": 0.00035,
1055
+ "loss": 0.0862,
1056
+ "step": 11100
1057
+ },
1058
+ {
1059
+ "epoch": 28.300694883133293,
1060
+ "grad_norm": 3.111088991165161,
1061
+ "learning_rate": 0.00034814814814814816,
1062
+ "loss": 0.0725,
1063
+ "step": 11200
1064
+ },
1065
+ {
1066
+ "epoch": 28.300694883133293,
1067
+ "eval_accuracy": 0.7542002301495973,
1068
+ "eval_f1_macro": 0.5740863312977772,
1069
+ "eval_loss": 1.7682907581329346,
1070
+ "eval_runtime": 144.2676,
1071
+ "eval_samples_per_second": 60.235,
1072
+ "eval_steps_per_second": 1.885,
1073
+ "step": 11200
1074
+ },
1075
+ {
1076
+ "epoch": 28.553379658875553,
1077
+ "grad_norm": 1.5844389200210571,
1078
+ "learning_rate": 0.00034629629629629626,
1079
+ "loss": 0.0771,
1080
+ "step": 11300
1081
+ },
1082
+ {
1083
+ "epoch": 28.806064434617813,
1084
+ "grad_norm": 3.93422269821167,
1085
+ "learning_rate": 0.0003444444444444445,
1086
+ "loss": 0.0863,
1087
+ "step": 11400
1088
+ },
1089
+ {
1090
+ "epoch": 29.058749210360077,
1091
+ "grad_norm": 1.9465683698654175,
1092
+ "learning_rate": 0.00034259259259259263,
1093
+ "loss": 0.0777,
1094
+ "step": 11500
1095
+ },
1096
+ {
1097
+ "epoch": 29.311433986102337,
1098
+ "grad_norm": 1.951295018196106,
1099
+ "learning_rate": 0.00034074074074074074,
1100
+ "loss": 0.0672,
1101
+ "step": 11600
1102
+ },
1103
+ {
1104
+ "epoch": 29.311433986102337,
1105
+ "eval_accuracy": 0.7616800920598389,
1106
+ "eval_f1_macro": 0.5831099181645485,
1107
+ "eval_loss": 1.8726670742034912,
1108
+ "eval_runtime": 143.7286,
1109
+ "eval_samples_per_second": 60.461,
1110
+ "eval_steps_per_second": 1.892,
1111
+ "step": 11600
1112
+ },
1113
+ {
1114
+ "epoch": 29.5641187618446,
1115
+ "grad_norm": 2.5593600273132324,
1116
+ "learning_rate": 0.0003388888888888889,
1117
+ "loss": 0.0707,
1118
+ "step": 11700
1119
+ },
1120
+ {
1121
+ "epoch": 29.81680353758686,
1122
+ "grad_norm": 1.8622692823410034,
1123
+ "learning_rate": 0.00033703703703703706,
1124
+ "loss": 0.0709,
1125
+ "step": 11800
1126
+ },
1127
+ {
1128
+ "epoch": 30.06948831332912,
1129
+ "grad_norm": 2.1602845191955566,
1130
+ "learning_rate": 0.0003351851851851852,
1131
+ "loss": 0.0653,
1132
+ "step": 11900
1133
+ },
1134
+ {
1135
+ "epoch": 30.322173089071384,
1136
+ "grad_norm": 3.438899278640747,
1137
+ "learning_rate": 0.0003333333333333333,
1138
+ "loss": 0.076,
1139
+ "step": 12000
1140
+ },
1141
+ {
1142
+ "epoch": 30.322173089071384,
1143
+ "eval_accuracy": 0.7635212888377445,
1144
+ "eval_f1_macro": 0.608710633839459,
1145
+ "eval_loss": 1.6385372877120972,
1146
+ "eval_runtime": 143.5758,
1147
+ "eval_samples_per_second": 60.526,
1148
+ "eval_steps_per_second": 1.894,
1149
+ "step": 12000
1150
+ },
1151
+ {
1152
+ "epoch": 30.574857864813644,
1153
+ "grad_norm": 4.342224597930908,
1154
+ "learning_rate": 0.0003314814814814815,
1155
+ "loss": 0.0682,
1156
+ "step": 12100
1157
+ },
1158
+ {
1159
+ "epoch": 30.827542640555908,
1160
+ "grad_norm": 3.186768054962158,
1161
+ "learning_rate": 0.0003296296296296296,
1162
+ "loss": 0.0705,
1163
+ "step": 12200
1164
+ },
1165
+ {
1166
+ "epoch": 31.080227416298168,
1167
+ "grad_norm": 1.2158719301223755,
1168
+ "learning_rate": 0.0003277777777777778,
1169
+ "loss": 0.0665,
1170
+ "step": 12300
1171
+ },
1172
+ {
1173
+ "epoch": 31.33291219204043,
1174
+ "grad_norm": 1.0089023113250732,
1175
+ "learning_rate": 0.00032592592592592596,
1176
+ "loss": 0.0702,
1177
+ "step": 12400
1178
+ },
1179
+ {
1180
+ "epoch": 31.33291219204043,
1181
+ "eval_accuracy": 0.7721518987341772,
1182
+ "eval_f1_macro": 0.5988474647551651,
1183
+ "eval_loss": 1.7596935033798218,
1184
+ "eval_runtime": 145.7714,
1185
+ "eval_samples_per_second": 59.614,
1186
+ "eval_steps_per_second": 1.866,
1187
+ "step": 12400
1188
+ },
1189
+ {
1190
+ "epoch": 31.585596967782692,
1191
+ "grad_norm": 1.9418379068374634,
1192
+ "learning_rate": 0.00032407407407407406,
1193
+ "loss": 0.0674,
1194
+ "step": 12500
1195
+ },
1196
+ {
1197
+ "epoch": 31.838281743524952,
1198
+ "grad_norm": 2.2405946254730225,
1199
+ "learning_rate": 0.0003222222222222222,
1200
+ "loss": 0.0645,
1201
+ "step": 12600
1202
+ },
1203
+ {
1204
+ "epoch": 32.09096651926721,
1205
+ "grad_norm": 1.296802043914795,
1206
+ "learning_rate": 0.0003203703703703704,
1207
+ "loss": 0.0654,
1208
+ "step": 12700
1209
+ },
1210
+ {
1211
+ "epoch": 32.34365129500947,
1212
+ "grad_norm": 5.712021827697754,
1213
+ "learning_rate": 0.00031851851851851854,
1214
+ "loss": 0.0644,
1215
+ "step": 12800
1216
+ },
1217
+ {
1218
+ "epoch": 32.34365129500947,
1219
+ "eval_accuracy": 0.7686996547756041,
1220
+ "eval_f1_macro": 0.5840964451710614,
1221
+ "eval_loss": 1.795427918434143,
1222
+ "eval_runtime": 137.5518,
1223
+ "eval_samples_per_second": 63.176,
1224
+ "eval_steps_per_second": 1.977,
1225
+ "step": 12800
1226
+ },
1227
+ {
1228
+ "epoch": 32.59633607075174,
1229
+ "grad_norm": 1.1909292936325073,
1230
+ "learning_rate": 0.00031666666666666665,
1231
+ "loss": 0.0638,
1232
+ "step": 12900
1233
+ },
1234
+ {
1235
+ "epoch": 32.849020846494,
1236
+ "grad_norm": 4.957464694976807,
1237
+ "learning_rate": 0.0003148148148148148,
1238
+ "loss": 0.0566,
1239
+ "step": 13000
1240
+ },
1241
+ {
1242
+ "epoch": 33.10170562223626,
1243
+ "grad_norm": 0.9219366908073425,
1244
+ "learning_rate": 0.00031296296296296297,
1245
+ "loss": 0.0622,
1246
+ "step": 13100
1247
+ },
1248
+ {
1249
+ "epoch": 33.35439039797852,
1250
+ "grad_norm": 0.8782810568809509,
1251
+ "learning_rate": 0.0003111111111111111,
1252
+ "loss": 0.0521,
1253
+ "step": 13200
1254
+ },
1255
+ {
1256
+ "epoch": 33.35439039797852,
1257
+ "eval_accuracy": 0.7619102416570771,
1258
+ "eval_f1_macro": 0.6192131512606447,
1259
+ "eval_loss": 1.901371955871582,
1260
+ "eval_runtime": 134.6686,
1261
+ "eval_samples_per_second": 64.529,
1262
+ "eval_steps_per_second": 2.02,
1263
+ "step": 13200
1264
+ },
1265
+ {
1266
+ "epoch": 33.60707517372078,
1267
+ "grad_norm": 2.130397319793701,
1268
+ "learning_rate": 0.00030925925925925923,
1269
+ "loss": 0.0686,
1270
+ "step": 13300
1271
+ },
1272
+ {
1273
+ "epoch": 33.85975994946305,
1274
+ "grad_norm": 1.2521294355392456,
1275
+ "learning_rate": 0.0003074074074074074,
1276
+ "loss": 0.058,
1277
+ "step": 13400
1278
+ },
1279
+ {
1280
+ "epoch": 34.11244472520531,
1281
+ "grad_norm": 2.1233527660369873,
1282
+ "learning_rate": 0.0003055555555555556,
1283
+ "loss": 0.0528,
1284
+ "step": 13500
1285
+ },
1286
+ {
1287
+ "epoch": 34.36512950094757,
1288
+ "grad_norm": 2.557140827178955,
1289
+ "learning_rate": 0.0003037037037037037,
1290
+ "loss": 0.0489,
1291
+ "step": 13600
1292
+ },
1293
+ {
1294
+ "epoch": 34.36512950094757,
1295
+ "eval_accuracy": 0.768584579976985,
1296
+ "eval_f1_macro": 0.6008906251293721,
1297
+ "eval_loss": 1.8431710004806519,
1298
+ "eval_runtime": 109.9242,
1299
+ "eval_samples_per_second": 79.054,
1300
+ "eval_steps_per_second": 2.474,
1301
+ "step": 13600
1302
+ },
1303
+ {
1304
+ "epoch": 34.61781427668983,
1305
+ "grad_norm": 1.0921064615249634,
1306
+ "learning_rate": 0.00030185185185185187,
1307
+ "loss": 0.0561,
1308
+ "step": 13700
1309
+ },
1310
+ {
1311
+ "epoch": 34.87049905243209,
1312
+ "grad_norm": 2.349817991256714,
1313
+ "learning_rate": 0.0003,
1314
+ "loss": 0.0565,
1315
+ "step": 13800
1316
+ },
1317
+ {
1318
+ "epoch": 35.123183828174355,
1319
+ "grad_norm": 1.187012791633606,
1320
+ "learning_rate": 0.0002981481481481482,
1321
+ "loss": 0.049,
1322
+ "step": 13900
1323
+ },
1324
+ {
1325
+ "epoch": 35.375868603916615,
1326
+ "grad_norm": 0.04951922222971916,
1327
+ "learning_rate": 0.0002962962962962963,
1328
+ "loss": 0.0447,
1329
+ "step": 14000
1330
+ },
1331
+ {
1332
+ "epoch": 35.375868603916615,
1333
+ "eval_accuracy": 0.769620253164557,
1334
+ "eval_f1_macro": 0.5941725933407251,
1335
+ "eval_loss": 1.8310672044754028,
1336
+ "eval_runtime": 109.8974,
1337
+ "eval_samples_per_second": 79.074,
1338
+ "eval_steps_per_second": 2.475,
1339
+ "step": 14000
1340
+ },
1341
+ {
1342
+ "epoch": 35.628553379658875,
1343
+ "grad_norm": 1.9226619005203247,
1344
+ "learning_rate": 0.00029444444444444445,
1345
+ "loss": 0.0575,
1346
+ "step": 14100
1347
+ },
1348
+ {
1349
+ "epoch": 35.881238155401135,
1350
+ "grad_norm": 0.9691677093505859,
1351
+ "learning_rate": 0.00029259259259259256,
1352
+ "loss": 0.0569,
1353
+ "step": 14200
1354
+ },
1355
+ {
1356
+ "epoch": 36.133922931143395,
1357
+ "grad_norm": 3.869662046432495,
1358
+ "learning_rate": 0.00029074074074074077,
1359
+ "loss": 0.0572,
1360
+ "step": 14300
1361
+ },
1362
+ {
1363
+ "epoch": 36.38660770688566,
1364
+ "grad_norm": 1.026167869567871,
1365
+ "learning_rate": 0.0002888888888888889,
1366
+ "loss": 0.0506,
1367
+ "step": 14400
1368
+ },
1369
+ {
1370
+ "epoch": 36.38660770688566,
1371
+ "eval_accuracy": 0.7692750287686997,
1372
+ "eval_f1_macro": 0.5692835911324542,
1373
+ "eval_loss": 1.7118504047393799,
1374
+ "eval_runtime": 110.0615,
1375
+ "eval_samples_per_second": 78.956,
1376
+ "eval_steps_per_second": 2.471,
1377
+ "step": 14400
1378
+ },
1379
+ {
1380
+ "epoch": 36.63929248262792,
1381
+ "grad_norm": 0.6115164756774902,
1382
+ "learning_rate": 0.00028703703703703703,
1383
+ "loss": 0.0472,
1384
+ "step": 14500
1385
+ },
1386
+ {
1387
+ "epoch": 36.89197725837018,
1388
+ "grad_norm": 2.26706862449646,
1389
+ "learning_rate": 0.0002851851851851852,
1390
+ "loss": 0.0507,
1391
+ "step": 14600
1392
+ },
1393
+ {
1394
+ "epoch": 37.14466203411244,
1395
+ "grad_norm": 1.0484291315078735,
1396
+ "learning_rate": 0.00028333333333333335,
1397
+ "loss": 0.0533,
1398
+ "step": 14700
1399
+ },
1400
+ {
1401
+ "epoch": 37.3973468098547,
1402
+ "grad_norm": 1.587339162826538,
1403
+ "learning_rate": 0.0002814814814814815,
1404
+ "loss": 0.0489,
1405
+ "step": 14800
1406
+ },
1407
+ {
1408
+ "epoch": 37.3973468098547,
1409
+ "eval_accuracy": 0.7665132336018412,
1410
+ "eval_f1_macro": 0.5648640266723668,
1411
+ "eval_loss": 1.8756378889083862,
1412
+ "eval_runtime": 109.745,
1413
+ "eval_samples_per_second": 79.184,
1414
+ "eval_steps_per_second": 2.478,
1415
+ "step": 14800
1416
+ },
1417
+ {
1418
+ "epoch": 37.65003158559697,
1419
+ "grad_norm": 0.5741543173789978,
1420
+ "learning_rate": 0.0002796296296296296,
1421
+ "loss": 0.0539,
1422
+ "step": 14900
1423
+ },
1424
+ {
1425
+ "epoch": 37.90271636133923,
1426
+ "grad_norm": 2.3613216876983643,
1427
+ "learning_rate": 0.0002777777777777778,
1428
+ "loss": 0.0601,
1429
+ "step": 15000
1430
+ },
1431
+ {
1432
+ "epoch": 38.15540113708149,
1433
+ "grad_norm": 2.9896764755249023,
1434
+ "learning_rate": 0.00027592592592592594,
1435
+ "loss": 0.0441,
1436
+ "step": 15100
1437
+ },
1438
+ {
1439
+ "epoch": 38.40808591282375,
1440
+ "grad_norm": 2.065026044845581,
1441
+ "learning_rate": 0.0002740740740740741,
1442
+ "loss": 0.0523,
1443
+ "step": 15200
1444
+ },
1445
+ {
1446
+ "epoch": 38.40808591282375,
1447
+ "eval_accuracy": 0.777445339470656,
1448
+ "eval_f1_macro": 0.5952390760705957,
1449
+ "eval_loss": 1.6472355127334595,
1450
+ "eval_runtime": 109.7433,
1451
+ "eval_samples_per_second": 79.185,
1452
+ "eval_steps_per_second": 2.479,
1453
+ "step": 15200
1454
+ },
1455
+ {
1456
+ "epoch": 38.66077068856601,
1457
+ "grad_norm": 0.533388078212738,
1458
+ "learning_rate": 0.0002722222222222222,
1459
+ "loss": 0.0465,
1460
+ "step": 15300
1461
+ },
1462
+ {
1463
+ "epoch": 38.91345546430828,
1464
+ "grad_norm": 2.1074326038360596,
1465
+ "learning_rate": 0.00027037037037037036,
1466
+ "loss": 0.0398,
1467
+ "step": 15400
1468
+ },
1469
+ {
1470
+ "epoch": 39.16614024005054,
1471
+ "grad_norm": 1.6379950046539307,
1472
+ "learning_rate": 0.0002685185185185186,
1473
+ "loss": 0.04,
1474
+ "step": 15500
1475
+ },
1476
+ {
1477
+ "epoch": 39.4188250157928,
1478
+ "grad_norm": 1.9422637224197388,
1479
+ "learning_rate": 0.0002666666666666667,
1480
+ "loss": 0.0384,
1481
+ "step": 15600
1482
+ },
1483
+ {
1484
+ "epoch": 39.4188250157928,
1485
+ "eval_accuracy": 0.7736478711162256,
1486
+ "eval_f1_macro": 0.6025598876629095,
1487
+ "eval_loss": 1.8879679441452026,
1488
+ "eval_runtime": 109.8175,
1489
+ "eval_samples_per_second": 79.131,
1490
+ "eval_steps_per_second": 2.477,
1491
+ "step": 15600
1492
+ },
1493
+ {
1494
+ "epoch": 39.67150979153506,
1495
+ "grad_norm": 1.7022106647491455,
1496
+ "learning_rate": 0.00026481481481481484,
1497
+ "loss": 0.036,
1498
+ "step": 15700
1499
+ },
1500
+ {
1501
+ "epoch": 39.92419456727732,
1502
+ "grad_norm": 1.5560113191604614,
1503
+ "learning_rate": 0.00026296296296296294,
1504
+ "loss": 0.0419,
1505
+ "step": 15800
1506
+ },
1507
+ {
1508
+ "epoch": 40.176879343019586,
1509
+ "grad_norm": 3.473234176635742,
1510
+ "learning_rate": 0.00026111111111111116,
1511
+ "loss": 0.0375,
1512
+ "step": 15900
1513
+ },
1514
+ {
1515
+ "epoch": 40.429564118761846,
1516
+ "grad_norm": 1.8860403299331665,
1517
+ "learning_rate": 0.00025925925925925926,
1518
+ "loss": 0.0423,
1519
+ "step": 16000
1520
+ },
1521
+ {
1522
+ "epoch": 40.429564118761846,
1523
+ "eval_accuracy": 0.775258918296893,
1524
+ "eval_f1_macro": 0.5970358418964591,
1525
+ "eval_loss": 1.8289481401443481,
1526
+ "eval_runtime": 109.6274,
1527
+ "eval_samples_per_second": 79.268,
1528
+ "eval_steps_per_second": 2.481,
1529
+ "step": 16000
1530
+ },
1531
+ {
1532
+ "epoch": 40.682248894504106,
1533
+ "grad_norm": 3.3260364532470703,
1534
+ "learning_rate": 0.0002574074074074074,
1535
+ "loss": 0.0384,
1536
+ "step": 16100
1537
+ },
1538
+ {
1539
+ "epoch": 40.934933670246366,
1540
+ "grad_norm": 0.306608110666275,
1541
+ "learning_rate": 0.00025555555555555553,
1542
+ "loss": 0.0485,
1543
+ "step": 16200
1544
+ },
1545
+ {
1546
+ "epoch": 41.187618445988626,
1547
+ "grad_norm": 1.0209686756134033,
1548
+ "learning_rate": 0.00025370370370370374,
1549
+ "loss": 0.0388,
1550
+ "step": 16300
1551
+ },
1552
+ {
1553
+ "epoch": 41.44030322173089,
1554
+ "grad_norm": 1.1874313354492188,
1555
+ "learning_rate": 0.00025185185185185185,
1556
+ "loss": 0.0421,
1557
+ "step": 16400
1558
+ },
1559
+ {
1560
+ "epoch": 41.44030322173089,
1561
+ "eval_accuracy": 0.7693901035673187,
1562
+ "eval_f1_macro": 0.6018061729163716,
1563
+ "eval_loss": 1.6877504587173462,
1564
+ "eval_runtime": 110.175,
1565
+ "eval_samples_per_second": 78.874,
1566
+ "eval_steps_per_second": 2.469,
1567
+ "step": 16400
1568
+ },
1569
+ {
1570
+ "epoch": 41.69298799747315,
1571
+ "grad_norm": 1.121667742729187,
1572
+ "learning_rate": 0.00025,
1573
+ "loss": 0.0382,
1574
+ "step": 16500
1575
+ },
1576
+ {
1577
+ "epoch": 41.94567277321541,
1578
+ "grad_norm": 1.1757502555847168,
1579
+ "learning_rate": 0.00024814814814814816,
1580
+ "loss": 0.0397,
1581
+ "step": 16600
1582
+ },
1583
+ {
1584
+ "epoch": 42.198357548957674,
1585
+ "grad_norm": 9.486854553222656,
1586
+ "learning_rate": 0.00024629629629629627,
1587
+ "loss": 0.0409,
1588
+ "step": 16700
1589
+ },
1590
+ {
1591
+ "epoch": 42.451042324699934,
1592
+ "grad_norm": 0.4830320179462433,
1593
+ "learning_rate": 0.00024444444444444443,
1594
+ "loss": 0.0404,
1595
+ "step": 16800
1596
+ },
1597
+ {
1598
+ "epoch": 42.451042324699934,
1599
+ "eval_accuracy": 0.7681242807825086,
1600
+ "eval_f1_macro": 0.5853419424167555,
1601
+ "eval_loss": 1.8393748998641968,
1602
+ "eval_runtime": 109.7881,
1603
+ "eval_samples_per_second": 79.152,
1604
+ "eval_steps_per_second": 2.478,
1605
+ "step": 16800
1606
+ },
1607
+ {
1608
+ "epoch": 42.7037271004422,
1609
+ "grad_norm": 1.6823731660842896,
1610
+ "learning_rate": 0.0002425925925925926,
1611
+ "loss": 0.0398,
1612
+ "step": 16900
1613
+ },
1614
+ {
1615
+ "epoch": 42.95641187618446,
1616
+ "grad_norm": 6.2725749015808105,
1617
+ "learning_rate": 0.00024074074074074072,
1618
+ "loss": 0.041,
1619
+ "step": 17000
1620
+ },
1621
+ {
1622
+ "epoch": 43.20909665192672,
1623
+ "grad_norm": 1.0358606576919556,
1624
+ "learning_rate": 0.0002388888888888889,
1625
+ "loss": 0.0359,
1626
+ "step": 17100
1627
+ },
1628
+ {
1629
+ "epoch": 43.46178142766898,
1630
+ "grad_norm": 0.5175994634628296,
1631
+ "learning_rate": 0.00023703703703703704,
1632
+ "loss": 0.0311,
1633
+ "step": 17200
1634
+ },
1635
+ {
1636
+ "epoch": 43.46178142766898,
1637
+ "eval_accuracy": 0.7772151898734178,
1638
+ "eval_f1_macro": 0.5963074285064315,
1639
+ "eval_loss": 1.7879575490951538,
1640
+ "eval_runtime": 109.7858,
1641
+ "eval_samples_per_second": 79.154,
1642
+ "eval_steps_per_second": 2.478,
1643
+ "step": 17200
1644
+ },
1645
+ {
1646
+ "epoch": 43.71446620341124,
1647
+ "grad_norm": 3.763942003250122,
1648
+ "learning_rate": 0.0002351851851851852,
1649
+ "loss": 0.0283,
1650
+ "step": 17300
1651
+ },
1652
+ {
1653
+ "epoch": 43.96715097915351,
1654
+ "grad_norm": 0.7010844349861145,
1655
+ "learning_rate": 0.00023333333333333333,
1656
+ "loss": 0.0392,
1657
+ "step": 17400
1658
+ },
1659
+ {
1660
+ "epoch": 44.21983575489577,
1661
+ "grad_norm": 1.6681573390960693,
1662
+ "learning_rate": 0.0002314814814814815,
1663
+ "loss": 0.0306,
1664
+ "step": 17500
1665
+ },
1666
+ {
1667
+ "epoch": 44.47252053063803,
1668
+ "grad_norm": 3.1810269355773926,
1669
+ "learning_rate": 0.00022962962962962962,
1670
+ "loss": 0.0421,
1671
+ "step": 17600
1672
+ },
1673
+ {
1674
+ "epoch": 44.47252053063803,
1675
+ "eval_accuracy": 0.7758342922899885,
1676
+ "eval_f1_macro": 0.601629063986742,
1677
+ "eval_loss": 1.70499587059021,
1678
+ "eval_runtime": 109.8026,
1679
+ "eval_samples_per_second": 79.142,
1680
+ "eval_steps_per_second": 2.477,
1681
+ "step": 17600
1682
+ },
1683
+ {
1684
+ "epoch": 44.72520530638029,
1685
+ "grad_norm": 1.0086809396743774,
1686
+ "learning_rate": 0.00022777777777777778,
1687
+ "loss": 0.0337,
1688
+ "step": 17700
1689
+ },
1690
+ {
1691
+ "epoch": 44.97789008212255,
1692
+ "grad_norm": 0.7673502564430237,
1693
+ "learning_rate": 0.00022592592592592591,
1694
+ "loss": 0.0381,
1695
+ "step": 17800
1696
+ },
1697
+ {
1698
+ "epoch": 45.230574857864816,
1699
+ "grad_norm": 0.9791069626808167,
1700
+ "learning_rate": 0.00022407407407407407,
1701
+ "loss": 0.0263,
1702
+ "step": 17900
1703
+ },
1704
+ {
1705
+ "epoch": 45.48325963360708,
1706
+ "grad_norm": 2.0957231521606445,
1707
+ "learning_rate": 0.0002222222222222222,
1708
+ "loss": 0.039,
1709
+ "step": 18000
1710
+ },
1711
+ {
1712
+ "epoch": 45.48325963360708,
1713
+ "eval_accuracy": 0.7800920598388953,
1714
+ "eval_f1_macro": 0.6025897002203272,
1715
+ "eval_loss": 1.7869104146957397,
1716
+ "eval_runtime": 109.5691,
1717
+ "eval_samples_per_second": 79.311,
1718
+ "eval_steps_per_second": 2.482,
1719
+ "step": 18000
1720
+ },
1721
+ {
1722
+ "epoch": 45.73594440934934,
1723
+ "grad_norm": 2.0811707973480225,
1724
+ "learning_rate": 0.0002203703703703704,
1725
+ "loss": 0.0284,
1726
+ "step": 18100
1727
+ },
1728
+ {
1729
+ "epoch": 45.9886291850916,
1730
+ "grad_norm": 4.134197235107422,
1731
+ "learning_rate": 0.00021851851851851852,
1732
+ "loss": 0.0347,
1733
+ "step": 18200
1734
+ },
1735
+ {
1736
+ "epoch": 46.24131396083386,
1737
+ "grad_norm": 0.5542005300521851,
1738
+ "learning_rate": 0.00021666666666666668,
1739
+ "loss": 0.0293,
1740
+ "step": 18300
1741
+ },
1742
+ {
1743
+ "epoch": 46.493998736576124,
1744
+ "grad_norm": 2.4530539512634277,
1745
+ "learning_rate": 0.00021481481481481482,
1746
+ "loss": 0.0268,
1747
+ "step": 18400
1748
+ },
1749
+ {
1750
+ "epoch": 46.493998736576124,
1751
+ "eval_accuracy": 0.7783659378596087,
1752
+ "eval_f1_macro": 0.5935536715539459,
1753
+ "eval_loss": 1.9122555255889893,
1754
+ "eval_runtime": 109.4826,
1755
+ "eval_samples_per_second": 79.373,
1756
+ "eval_steps_per_second": 2.484,
1757
+ "step": 18400
1758
+ },
1759
+ {
1760
+ "epoch": 46.746683512318384,
1761
+ "grad_norm": 0.5376617312431335,
1762
+ "learning_rate": 0.00021296296296296298,
1763
+ "loss": 0.0348,
1764
+ "step": 18500
1765
+ },
1766
+ {
1767
+ "epoch": 46.999368288060644,
1768
+ "grad_norm": 1.4315904378890991,
1769
+ "learning_rate": 0.0002111111111111111,
1770
+ "loss": 0.036,
1771
+ "step": 18600
1772
+ },
1773
+ {
1774
+ "epoch": 47.252053063802904,
1775
+ "grad_norm": 0.866862952709198,
1776
+ "learning_rate": 0.00020925925925925927,
1777
+ "loss": 0.0273,
1778
+ "step": 18700
1779
+ },
1780
+ {
1781
+ "epoch": 47.504737839545164,
1782
+ "grad_norm": 2.0620691776275635,
1783
+ "learning_rate": 0.0002074074074074074,
1784
+ "loss": 0.0305,
1785
+ "step": 18800
1786
+ },
1787
+ {
1788
+ "epoch": 47.504737839545164,
1789
+ "eval_accuracy": 0.7820483314154201,
1790
+ "eval_f1_macro": 0.5937346523730859,
1791
+ "eval_loss": 1.7703049182891846,
1792
+ "eval_runtime": 109.6053,
1793
+ "eval_samples_per_second": 79.284,
1794
+ "eval_steps_per_second": 2.482,
1795
+ "step": 18800
1796
+ },
1797
+ {
1798
+ "epoch": 47.75742261528743,
1799
+ "grad_norm": 1.7660882472991943,
1800
+ "learning_rate": 0.00020555555555555556,
1801
+ "loss": 0.0248,
1802
+ "step": 18900
1803
+ },
1804
+ {
1805
+ "epoch": 48.01010739102969,
1806
+ "grad_norm": 0.0962400734424591,
1807
+ "learning_rate": 0.0002037037037037037,
1808
+ "loss": 0.0316,
1809
+ "step": 19000
1810
+ },
1811
+ {
1812
+ "epoch": 48.26279216677195,
1813
+ "grad_norm": 1.133728265762329,
1814
+ "learning_rate": 0.00020185185185185188,
1815
+ "loss": 0.0308,
1816
+ "step": 19100
1817
+ },
1818
+ {
1819
+ "epoch": 48.51547694251421,
1820
+ "grad_norm": 0.4573793411254883,
1821
+ "learning_rate": 0.0002,
1822
+ "loss": 0.0279,
1823
+ "step": 19200
1824
+ },
1825
+ {
1826
+ "epoch": 48.51547694251421,
1827
+ "eval_accuracy": 0.7795166858457998,
1828
+ "eval_f1_macro": 0.5972420776230559,
1829
+ "eval_loss": 1.6281923055648804,
1830
+ "eval_runtime": 109.5117,
1831
+ "eval_samples_per_second": 79.352,
1832
+ "eval_steps_per_second": 2.484,
1833
+ "step": 19200
1834
+ },
1835
+ {
1836
+ "epoch": 48.76816171825647,
1837
+ "grad_norm": 1.396851658821106,
1838
+ "learning_rate": 0.00019814814814814817,
1839
+ "loss": 0.0216,
1840
+ "step": 19300
1841
+ },
1842
+ {
1843
+ "epoch": 49.02084649399874,
1844
+ "grad_norm": 1.0762335062026978,
1845
+ "learning_rate": 0.0001962962962962963,
1846
+ "loss": 0.0219,
1847
+ "step": 19400
1848
+ },
1849
+ {
1850
+ "epoch": 49.273531269741,
1851
+ "grad_norm": 5.425212860107422,
1852
+ "learning_rate": 0.00019444444444444446,
1853
+ "loss": 0.0276,
1854
+ "step": 19500
1855
+ },
1856
+ {
1857
+ "epoch": 49.52621604548326,
1858
+ "grad_norm": 0.31777116656303406,
1859
+ "learning_rate": 0.0001925925925925926,
1860
+ "loss": 0.0238,
1861
+ "step": 19600
1862
+ },
1863
+ {
1864
+ "epoch": 49.52621604548326,
1865
+ "eval_accuracy": 0.768584579976985,
1866
+ "eval_f1_macro": 0.5921492404881852,
1867
+ "eval_loss": 1.8642910718917847,
1868
+ "eval_runtime": 109.7124,
1869
+ "eval_samples_per_second": 79.207,
1870
+ "eval_steps_per_second": 2.479,
1871
+ "step": 19600
1872
+ },
1873
+ {
1874
+ "epoch": 49.77890082122552,
1875
+ "grad_norm": 2.751400947570801,
1876
+ "learning_rate": 0.00019074074074074075,
1877
+ "loss": 0.0271,
1878
+ "step": 19700
1879
+ },
1880
+ {
1881
+ "epoch": 50.03158559696778,
1882
+ "grad_norm": 2.2741167545318604,
1883
+ "learning_rate": 0.00018888888888888888,
1884
+ "loss": 0.0238,
1885
+ "step": 19800
1886
+ },
1887
+ {
1888
+ "epoch": 50.28427037271005,
1889
+ "grad_norm": 0.2512575387954712,
1890
+ "learning_rate": 0.00018703703703703704,
1891
+ "loss": 0.0288,
1892
+ "step": 19900
1893
+ },
1894
+ {
1895
+ "epoch": 50.53695514845231,
1896
+ "grad_norm": 1.9498554468154907,
1897
+ "learning_rate": 0.00018518518518518518,
1898
+ "loss": 0.0243,
1899
+ "step": 20000
1900
+ },
1901
+ {
1902
+ "epoch": 50.53695514845231,
1903
+ "eval_accuracy": 0.770771001150748,
1904
+ "eval_f1_macro": 0.598576194898749,
1905
+ "eval_loss": 1.976339340209961,
1906
+ "eval_runtime": 109.5523,
1907
+ "eval_samples_per_second": 79.323,
1908
+ "eval_steps_per_second": 2.483,
1909
+ "step": 20000
1910
+ },
1911
+ {
1912
+ "epoch": 50.78963992419457,
1913
+ "grad_norm": 0.7483634948730469,
1914
+ "learning_rate": 0.00018333333333333334,
1915
+ "loss": 0.0253,
1916
+ "step": 20100
1917
+ },
1918
+ {
1919
+ "epoch": 51.04232469993683,
1920
+ "grad_norm": 1.7610820531845093,
1921
+ "learning_rate": 0.0001814814814814815,
1922
+ "loss": 0.025,
1923
+ "step": 20200
1924
+ },
1925
+ {
1926
+ "epoch": 51.29500947567909,
1927
+ "grad_norm": 2.2282676696777344,
1928
+ "learning_rate": 0.00017962962962962965,
1929
+ "loss": 0.0205,
1930
+ "step": 20300
1931
+ },
1932
+ {
1933
+ "epoch": 51.547694251421355,
1934
+ "grad_norm": 1.0908398628234863,
1935
+ "learning_rate": 0.00017777777777777779,
1936
+ "loss": 0.0264,
1937
+ "step": 20400
1938
+ },
1939
+ {
1940
+ "epoch": 51.547694251421355,
1941
+ "eval_accuracy": 0.7785960874568469,
1942
+ "eval_f1_macro": 0.6159147237572236,
1943
+ "eval_loss": 1.9196436405181885,
1944
+ "eval_runtime": 109.4823,
1945
+ "eval_samples_per_second": 79.374,
1946
+ "eval_steps_per_second": 2.484,
1947
+ "step": 20400
1948
+ },
1949
+ {
1950
+ "epoch": 51.800379027163615,
1951
+ "grad_norm": 3.9686079025268555,
1952
+ "learning_rate": 0.00017592592592592595,
1953
+ "loss": 0.0215,
1954
+ "step": 20500
1955
+ },
1956
+ {
1957
+ "epoch": 52.053063802905875,
1958
+ "grad_norm": 1.9715676307678223,
1959
+ "learning_rate": 0.00017407407407407408,
1960
+ "loss": 0.0247,
1961
+ "step": 20600
1962
+ },
1963
+ {
1964
+ "epoch": 52.305748578648135,
1965
+ "grad_norm": 0.6495607495307922,
1966
+ "learning_rate": 0.00017222222222222224,
1967
+ "loss": 0.0249,
1968
+ "step": 20700
1969
+ },
1970
+ {
1971
+ "epoch": 52.558433354390395,
1972
+ "grad_norm": 1.0535669326782227,
1973
+ "learning_rate": 0.00017037037037037037,
1974
+ "loss": 0.0236,
1975
+ "step": 20800
1976
+ },
1977
+ {
1978
+ "epoch": 52.558433354390395,
1979
+ "eval_accuracy": 0.7753739930955121,
1980
+ "eval_f1_macro": 0.5985595705467979,
1981
+ "eval_loss": 1.9067500829696655,
1982
+ "eval_runtime": 109.3981,
1983
+ "eval_samples_per_second": 79.435,
1984
+ "eval_steps_per_second": 2.486,
1985
+ "step": 20800
1986
+ },
1987
+ {
1988
+ "epoch": 52.81111813013266,
1989
+ "grad_norm": 1.502255916595459,
1990
+ "learning_rate": 0.00016851851851851853,
1991
+ "loss": 0.018,
1992
+ "step": 20900
1993
+ },
1994
+ {
1995
+ "epoch": 53.06380290587492,
1996
+ "grad_norm": 0.7225372791290283,
1997
+ "learning_rate": 0.00016666666666666666,
1998
+ "loss": 0.0238,
1999
+ "step": 21000
2000
+ },
2001
+ {
2002
+ "epoch": 53.31648768161718,
2003
+ "grad_norm": 1.2900406122207642,
2004
+ "learning_rate": 0.0001648148148148148,
2005
+ "loss": 0.0203,
2006
+ "step": 21100
2007
+ },
2008
+ {
2009
+ "epoch": 53.56917245735944,
2010
+ "grad_norm": 1.0755679607391357,
2011
+ "learning_rate": 0.00016296296296296298,
2012
+ "loss": 0.0243,
2013
+ "step": 21200
2014
+ },
2015
+ {
2016
+ "epoch": 53.56917245735944,
2017
+ "eval_accuracy": 0.7788262370540852,
2018
+ "eval_f1_macro": 0.6106899497765403,
2019
+ "eval_loss": 1.8558076620101929,
2020
+ "eval_runtime": 109.5516,
2021
+ "eval_samples_per_second": 79.323,
2022
+ "eval_steps_per_second": 2.483,
2023
+ "step": 21200
2024
+ },
2025
+ {
2026
+ "epoch": 53.8218572331017,
2027
+ "grad_norm": 0.06415720283985138,
2028
+ "learning_rate": 0.0001611111111111111,
2029
+ "loss": 0.0205,
2030
+ "step": 21300
2031
+ },
2032
+ {
2033
+ "epoch": 54.07454200884397,
2034
+ "grad_norm": 0.032455410808324814,
2035
+ "learning_rate": 0.00015925925925925927,
2036
+ "loss": 0.0221,
2037
+ "step": 21400
2038
+ },
2039
+ {
2040
+ "epoch": 54.32722678458623,
2041
+ "grad_norm": 0.04551871493458748,
2042
+ "learning_rate": 0.0001574074074074074,
2043
+ "loss": 0.0173,
2044
+ "step": 21500
2045
+ },
2046
+ {
2047
+ "epoch": 54.57991156032849,
2048
+ "grad_norm": 1.3965240716934204,
2049
+ "learning_rate": 0.00015555555555555556,
2050
+ "loss": 0.0168,
2051
+ "step": 21600
2052
+ },
2053
+ {
2054
+ "epoch": 54.57991156032849,
2055
+ "eval_accuracy": 0.7788262370540852,
2056
+ "eval_f1_macro": 0.5821705462809209,
2057
+ "eval_loss": 1.8485394716262817,
2058
+ "eval_runtime": 110.0893,
2059
+ "eval_samples_per_second": 78.936,
2060
+ "eval_steps_per_second": 2.471,
2061
+ "step": 21600
2062
+ },
2063
+ {
2064
+ "epoch": 54.83259633607075,
2065
+ "grad_norm": 0.8048573136329651,
2066
+ "learning_rate": 0.0001537037037037037,
2067
+ "loss": 0.0212,
2068
+ "step": 21700
2069
+ },
2070
+ {
2071
+ "epoch": 55.08528111181301,
2072
+ "grad_norm": 3.6472041606903076,
2073
+ "learning_rate": 0.00015185185185185185,
2074
+ "loss": 0.0229,
2075
+ "step": 21800
2076
+ },
2077
+ {
2078
+ "epoch": 55.33796588755528,
2079
+ "grad_norm": 1.6580862998962402,
2080
+ "learning_rate": 0.00015,
2081
+ "loss": 0.0185,
2082
+ "step": 21900
2083
+ },
2084
+ {
2085
+ "epoch": 55.59065066329754,
2086
+ "grad_norm": 0.0511002242565155,
2087
+ "learning_rate": 0.00014814814814814815,
2088
+ "loss": 0.019,
2089
+ "step": 22000
2090
+ },
2091
+ {
2092
+ "epoch": 55.59065066329754,
2093
+ "eval_accuracy": 0.7772151898734178,
2094
+ "eval_f1_macro": 0.6042884739793327,
2095
+ "eval_loss": 1.8060083389282227,
2096
+ "eval_runtime": 109.6887,
2097
+ "eval_samples_per_second": 79.224,
2098
+ "eval_steps_per_second": 2.48,
2099
+ "step": 22000
2100
+ },
2101
+ {
2102
+ "epoch": 55.8433354390398,
2103
+ "grad_norm": 0.7249517440795898,
2104
+ "learning_rate": 0.00014629629629629628,
2105
+ "loss": 0.022,
2106
+ "step": 22100
2107
+ },
2108
+ {
2109
+ "epoch": 56.09602021478206,
2110
+ "grad_norm": 0.0043795122765004635,
2111
+ "learning_rate": 0.00014444444444444444,
2112
+ "loss": 0.0156,
2113
+ "step": 22200
2114
+ },
2115
+ {
2116
+ "epoch": 56.34870499052432,
2117
+ "grad_norm": 1.0969189405441284,
2118
+ "learning_rate": 0.0001425925925925926,
2119
+ "loss": 0.0186,
2120
+ "step": 22300
2121
+ },
2122
+ {
2123
+ "epoch": 56.601389766266585,
2124
+ "grad_norm": 0.018035294488072395,
2125
+ "learning_rate": 0.00014074074074074076,
2126
+ "loss": 0.0193,
2127
+ "step": 22400
2128
+ },
2129
+ {
2130
+ "epoch": 56.601389766266585,
2131
+ "eval_accuracy": 0.7785960874568469,
2132
+ "eval_f1_macro": 0.6042077712818668,
2133
+ "eval_loss": 1.7848457098007202,
2134
+ "eval_runtime": 109.4669,
2135
+ "eval_samples_per_second": 79.385,
2136
+ "eval_steps_per_second": 2.485,
2137
+ "step": 22400
2138
+ },
2139
+ {
2140
+ "epoch": 56.854074542008846,
2141
+ "grad_norm": 1.3541306257247925,
2142
+ "learning_rate": 0.0001388888888888889,
2143
+ "loss": 0.014,
2144
+ "step": 22500
2145
+ },
2146
+ {
2147
+ "epoch": 57.106759317751106,
2148
+ "grad_norm": 0.32108399271965027,
2149
+ "learning_rate": 0.00013703703703703705,
2150
+ "loss": 0.0147,
2151
+ "step": 22600
2152
+ },
2153
+ {
2154
+ "epoch": 57.359444093493366,
2155
+ "grad_norm": 1.5764517784118652,
2156
+ "learning_rate": 0.00013518518518518518,
2157
+ "loss": 0.0121,
2158
+ "step": 22700
2159
+ },
2160
+ {
2161
+ "epoch": 57.612128869235626,
2162
+ "grad_norm": 0.08151420950889587,
2163
+ "learning_rate": 0.00013333333333333334,
2164
+ "loss": 0.0167,
2165
+ "step": 22800
2166
+ },
2167
+ {
2168
+ "epoch": 57.612128869235626,
2169
+ "eval_accuracy": 0.7812428078250863,
2170
+ "eval_f1_macro": 0.5890612998790621,
2171
+ "eval_loss": 1.9067610502243042,
2172
+ "eval_runtime": 109.8252,
2173
+ "eval_samples_per_second": 79.126,
2174
+ "eval_steps_per_second": 2.477,
2175
+ "step": 22800
2176
+ },
2177
+ {
2178
+ "epoch": 57.86481364497789,
2179
+ "grad_norm": 0.2571534514427185,
2180
+ "learning_rate": 0.00013148148148148147,
2181
+ "loss": 0.0175,
2182
+ "step": 22900
2183
+ },
2184
+ {
2185
+ "epoch": 58.11749842072015,
2186
+ "grad_norm": 1.1256097555160522,
2187
+ "learning_rate": 0.00012962962962962963,
2188
+ "loss": 0.0185,
2189
+ "step": 23000
2190
+ },
2191
+ {
2192
+ "epoch": 58.37018319646241,
2193
+ "grad_norm": 0.036026712507009506,
2194
+ "learning_rate": 0.00012777777777777776,
2195
+ "loss": 0.0149,
2196
+ "step": 23100
2197
+ },
2198
+ {
2199
+ "epoch": 58.62286797220467,
2200
+ "grad_norm": 0.8259644508361816,
2201
+ "learning_rate": 0.00012592592592592592,
2202
+ "loss": 0.0103,
2203
+ "step": 23200
2204
+ },
2205
+ {
2206
+ "epoch": 58.62286797220467,
2207
+ "eval_accuracy": 0.7819332566168009,
2208
+ "eval_f1_macro": 0.5807141259596321,
2209
+ "eval_loss": 1.945181965827942,
2210
+ "eval_runtime": 109.5353,
2211
+ "eval_samples_per_second": 79.335,
2212
+ "eval_steps_per_second": 2.483,
2213
+ "step": 23200
2214
+ },
2215
+ {
2216
+ "epoch": 58.875552747946934,
2217
+ "grad_norm": 0.3167104423046112,
2218
+ "learning_rate": 0.00012407407407407408,
2219
+ "loss": 0.0111,
2220
+ "step": 23300
2221
+ },
2222
+ {
2223
+ "epoch": 59.1282375236892,
2224
+ "grad_norm": 0.19305191934108734,
2225
+ "learning_rate": 0.00012222222222222221,
2226
+ "loss": 0.0198,
2227
+ "step": 23400
2228
+ },
2229
+ {
2230
+ "epoch": 59.38092229943146,
2231
+ "grad_norm": 0.9000421762466431,
2232
+ "learning_rate": 0.00012037037037037036,
2233
+ "loss": 0.0175,
2234
+ "step": 23500
2235
+ },
2236
+ {
2237
+ "epoch": 59.63360707517372,
2238
+ "grad_norm": 0.16270488500595093,
2239
+ "learning_rate": 0.00011851851851851852,
2240
+ "loss": 0.0134,
2241
+ "step": 23600
2242
+ },
2243
+ {
2244
+ "epoch": 59.63360707517372,
2245
+ "eval_accuracy": 0.7856156501726121,
2246
+ "eval_f1_macro": 0.6008830980384848,
2247
+ "eval_loss": 1.8185406923294067,
2248
+ "eval_runtime": 110.5509,
2249
+ "eval_samples_per_second": 78.606,
2250
+ "eval_steps_per_second": 2.46,
2251
+ "step": 23600
2252
+ },
2253
+ {
2254
+ "epoch": 59.88629185091598,
2255
+ "grad_norm": 0.15302228927612305,
2256
+ "learning_rate": 0.00011666666666666667,
2257
+ "loss": 0.0114,
2258
+ "step": 23700
2259
+ },
2260
+ {
2261
+ "epoch": 60.13897662665824,
2262
+ "grad_norm": 0.07428427040576935,
2263
+ "learning_rate": 0.00011481481481481481,
2264
+ "loss": 0.0115,
2265
+ "step": 23800
2266
+ },
2267
+ {
2268
+ "epoch": 60.39166140240051,
2269
+ "grad_norm": 0.025211408734321594,
2270
+ "learning_rate": 0.00011296296296296296,
2271
+ "loss": 0.0129,
2272
+ "step": 23900
2273
+ },
2274
+ {
2275
+ "epoch": 60.64434617814277,
2276
+ "grad_norm": 0.12629875540733337,
2277
+ "learning_rate": 0.0001111111111111111,
2278
+ "loss": 0.0105,
2279
+ "step": 24000
2280
+ },
2281
+ {
2282
+ "epoch": 60.64434617814277,
2283
+ "eval_accuracy": 0.7771001150747986,
2284
+ "eval_f1_macro": 0.5844044138241861,
2285
+ "eval_loss": 1.9464218616485596,
2286
+ "eval_runtime": 109.5176,
2287
+ "eval_samples_per_second": 79.348,
2288
+ "eval_steps_per_second": 2.484,
2289
+ "step": 24000
2290
+ },
2291
+ {
2292
+ "epoch": 60.89703095388503,
2293
+ "grad_norm": 0.08720462769269943,
2294
+ "learning_rate": 0.00010925925925925926,
2295
+ "loss": 0.0104,
2296
+ "step": 24100
2297
+ },
2298
+ {
2299
+ "epoch": 61.14971572962729,
2300
+ "grad_norm": 2.3958306312561035,
2301
+ "learning_rate": 0.00010740740740740741,
2302
+ "loss": 0.011,
2303
+ "step": 24200
2304
+ },
2305
+ {
2306
+ "epoch": 61.40240050536955,
2307
+ "grad_norm": 0.0953073650598526,
2308
+ "learning_rate": 0.00010555555555555555,
2309
+ "loss": 0.0115,
2310
+ "step": 24300
2311
+ },
2312
+ {
2313
+ "epoch": 61.655085281111816,
2314
+ "grad_norm": 0.013684218749403954,
2315
+ "learning_rate": 0.0001037037037037037,
2316
+ "loss": 0.0154,
2317
+ "step": 24400
2318
+ },
2319
+ {
2320
+ "epoch": 61.655085281111816,
2321
+ "eval_accuracy": 0.7811277330264672,
2322
+ "eval_f1_macro": 0.5880759609293783,
2323
+ "eval_loss": 1.9170596599578857,
2324
+ "eval_runtime": 109.5265,
2325
+ "eval_samples_per_second": 79.342,
2326
+ "eval_steps_per_second": 2.483,
2327
+ "step": 24400
2328
+ },
2329
+ {
2330
+ "epoch": 61.907770056854076,
2331
+ "grad_norm": 0.02879270352423191,
2332
+ "learning_rate": 0.00010185185185185185,
2333
+ "loss": 0.0133,
2334
+ "step": 24500
2335
+ },
2336
+ {
2337
+ "epoch": 62.160454832596336,
2338
+ "grad_norm": 2.672358512878418,
2339
+ "learning_rate": 0.0001,
2340
+ "loss": 0.014,
2341
+ "step": 24600
2342
+ },
2343
+ {
2344
+ "epoch": 62.4131396083386,
2345
+ "grad_norm": 0.22067667543888092,
2346
+ "learning_rate": 9.814814814814815e-05,
2347
+ "loss": 0.0152,
2348
+ "step": 24700
2349
+ },
2350
+ {
2351
+ "epoch": 62.66582438408086,
2352
+ "grad_norm": 1.4633599519729614,
2353
+ "learning_rate": 9.62962962962963e-05,
2354
+ "loss": 0.0132,
2355
+ "step": 24800
2356
+ },
2357
+ {
2358
+ "epoch": 62.66582438408086,
2359
+ "eval_accuracy": 0.7836593785960875,
2360
+ "eval_f1_macro": 0.5992199483923372,
2361
+ "eval_loss": 1.8896235227584839,
2362
+ "eval_runtime": 109.6805,
2363
+ "eval_samples_per_second": 79.23,
2364
+ "eval_steps_per_second": 2.48,
2365
+ "step": 24800
2366
+ },
2367
+ {
2368
+ "epoch": 62.918509159823124,
2369
+ "grad_norm": 1.5682035684585571,
2370
+ "learning_rate": 9.444444444444444e-05,
2371
+ "loss": 0.0103,
2372
+ "step": 24900
2373
+ },
2374
+ {
2375
+ "epoch": 63.171193935565384,
2376
+ "grad_norm": 0.010575806722044945,
2377
+ "learning_rate": 9.259259259259259e-05,
2378
+ "loss": 0.0099,
2379
+ "step": 25000
2380
+ },
2381
+ {
2382
+ "epoch": 63.423878711307644,
2383
+ "grad_norm": 1.3311010599136353,
2384
+ "learning_rate": 9.074074074074075e-05,
2385
+ "loss": 0.0082,
2386
+ "step": 25100
2387
+ },
2388
+ {
2389
+ "epoch": 63.676563487049904,
2390
+ "grad_norm": 0.13537296652793884,
2391
+ "learning_rate": 8.888888888888889e-05,
2392
+ "loss": 0.0059,
2393
+ "step": 25200
2394
+ },
2395
+ {
2396
+ "epoch": 63.676563487049904,
2397
+ "eval_accuracy": 0.785385500575374,
2398
+ "eval_f1_macro": 0.5910071350304901,
2399
+ "eval_loss": 1.93367600440979,
2400
+ "eval_runtime": 109.8909,
2401
+ "eval_samples_per_second": 79.078,
2402
+ "eval_steps_per_second": 2.475,
2403
+ "step": 25200
2404
+ },
2405
+ {
2406
+ "epoch": 63.929248262792164,
2407
+ "grad_norm": 0.13171835243701935,
2408
+ "learning_rate": 8.703703703703704e-05,
2409
+ "loss": 0.0099,
2410
+ "step": 25300
2411
+ },
2412
+ {
2413
+ "epoch": 64.18193303853442,
2414
+ "grad_norm": 0.03450850397348404,
2415
+ "learning_rate": 8.518518518518518e-05,
2416
+ "loss": 0.0125,
2417
+ "step": 25400
2418
+ },
2419
+ {
2420
+ "epoch": 64.43461781427669,
2421
+ "grad_norm": 2.559318780899048,
2422
+ "learning_rate": 8.333333333333333e-05,
2423
+ "loss": 0.0134,
2424
+ "step": 25500
2425
+ },
2426
+ {
2427
+ "epoch": 64.68730259001894,
2428
+ "grad_norm": 0.4207484722137451,
2429
+ "learning_rate": 8.148148148148149e-05,
2430
+ "loss": 0.0082,
2431
+ "step": 25600
2432
+ },
2433
+ {
2434
+ "epoch": 64.68730259001894,
2435
+ "eval_accuracy": 0.7844649021864212,
2436
+ "eval_f1_macro": 0.5981216790852985,
2437
+ "eval_loss": 1.9173579216003418,
2438
+ "eval_runtime": 109.5317,
2439
+ "eval_samples_per_second": 79.338,
2440
+ "eval_steps_per_second": 2.483,
2441
+ "step": 25600
2442
+ },
2443
+ {
2444
+ "epoch": 64.93998736576121,
2445
+ "grad_norm": 0.02727402001619339,
2446
+ "learning_rate": 7.962962962962964e-05,
2447
+ "loss": 0.0079,
2448
+ "step": 25700
2449
+ },
2450
+ {
2451
+ "epoch": 65.19267214150348,
2452
+ "grad_norm": 0.055505283176898956,
2453
+ "learning_rate": 7.777777777777778e-05,
2454
+ "loss": 0.0088,
2455
+ "step": 25800
2456
+ },
2457
+ {
2458
+ "epoch": 65.44535691724573,
2459
+ "grad_norm": 0.0938887745141983,
2460
+ "learning_rate": 7.592592592592593e-05,
2461
+ "loss": 0.0081,
2462
+ "step": 25900
2463
+ },
2464
+ {
2465
+ "epoch": 65.698041692988,
2466
+ "grad_norm": 0.06274864077568054,
2467
+ "learning_rate": 7.407407407407407e-05,
2468
+ "loss": 0.01,
2469
+ "step": 26000
2470
+ },
2471
+ {
2472
+ "epoch": 65.698041692988,
2473
+ "eval_accuracy": 0.7873417721518987,
2474
+ "eval_f1_macro": 0.5952116194605911,
2475
+ "eval_loss": 1.9076104164123535,
2476
+ "eval_runtime": 109.6326,
2477
+ "eval_samples_per_second": 79.265,
2478
+ "eval_steps_per_second": 2.481,
2479
+ "step": 26000
2480
+ },
2481
+ {
2482
+ "epoch": 65.95072646873025,
2483
+ "grad_norm": 0.395120769739151,
2484
+ "learning_rate": 7.222222222222222e-05,
2485
+ "loss": 0.013,
2486
+ "step": 26100
2487
+ },
2488
+ {
2489
+ "epoch": 66.20341124447252,
2490
+ "grad_norm": 0.0025830611120909452,
2491
+ "learning_rate": 7.037037037037038e-05,
2492
+ "loss": 0.0072,
2493
+ "step": 26200
2494
+ },
2495
+ {
2496
+ "epoch": 66.45609602021479,
2497
+ "grad_norm": 0.08498067408800125,
2498
+ "learning_rate": 6.851851851851852e-05,
2499
+ "loss": 0.0092,
2500
+ "step": 26300
2501
+ },
2502
+ {
2503
+ "epoch": 66.70878079595704,
2504
+ "grad_norm": 0.03530678525567055,
2505
+ "learning_rate": 6.666666666666667e-05,
2506
+ "loss": 0.0119,
2507
+ "step": 26400
2508
+ },
2509
+ {
2510
+ "epoch": 66.70878079595704,
2511
+ "eval_accuracy": 0.7857307249712313,
2512
+ "eval_f1_macro": 0.6071046848722826,
2513
+ "eval_loss": 1.8940664529800415,
2514
+ "eval_runtime": 109.9317,
2515
+ "eval_samples_per_second": 79.049,
2516
+ "eval_steps_per_second": 2.474,
2517
+ "step": 26400
2518
+ },
2519
+ {
2520
+ "epoch": 66.9614655716993,
2521
+ "grad_norm": 0.031329553574323654,
2522
+ "learning_rate": 6.481481481481482e-05,
2523
+ "loss": 0.0096,
2524
+ "step": 26500
2525
+ },
2526
+ {
2527
+ "epoch": 67.21415034744156,
2528
+ "grad_norm": 0.30799156427383423,
2529
+ "learning_rate": 6.296296296296296e-05,
2530
+ "loss": 0.0095,
2531
+ "step": 26600
2532
+ },
2533
+ {
2534
+ "epoch": 67.46683512318383,
2535
+ "grad_norm": 1.5470929145812988,
2536
+ "learning_rate": 6.111111111111111e-05,
2537
+ "loss": 0.0089,
2538
+ "step": 26700
2539
+ },
2540
+ {
2541
+ "epoch": 67.7195198989261,
2542
+ "grad_norm": 2.5448250770568848,
2543
+ "learning_rate": 5.925925925925926e-05,
2544
+ "loss": 0.0067,
2545
+ "step": 26800
2546
+ },
2547
+ {
2548
+ "epoch": 67.7195198989261,
2549
+ "eval_accuracy": 0.7878020713463751,
2550
+ "eval_f1_macro": 0.5999522395147027,
2551
+ "eval_loss": 1.930634617805481,
2552
+ "eval_runtime": 109.663,
2553
+ "eval_samples_per_second": 79.243,
2554
+ "eval_steps_per_second": 2.48,
2555
+ "step": 26800
2556
+ },
2557
+ {
2558
+ "epoch": 67.97220467466835,
2559
+ "grad_norm": 0.7930640578269958,
2560
+ "learning_rate": 5.7407407407407406e-05,
2561
+ "loss": 0.0069,
2562
+ "step": 26900
2563
+ },
2564
+ {
2565
+ "epoch": 68.22488945041061,
2566
+ "grad_norm": 0.26925787329673767,
2567
+ "learning_rate": 5.555555555555555e-05,
2568
+ "loss": 0.007,
2569
+ "step": 27000
2570
+ },
2571
+ {
2572
+ "epoch": 68.47757422615287,
2573
+ "grad_norm": 0.1341189444065094,
2574
+ "learning_rate": 5.3703703703703704e-05,
2575
+ "loss": 0.0087,
2576
+ "step": 27100
2577
+ },
2578
+ {
2579
+ "epoch": 68.73025900189513,
2580
+ "grad_norm": 0.02547445334494114,
2581
+ "learning_rate": 5.185185185185185e-05,
2582
+ "loss": 0.0081,
2583
+ "step": 27200
2584
+ },
2585
+ {
2586
+ "epoch": 68.73025900189513,
2587
+ "eval_accuracy": 0.792174913693901,
2588
+ "eval_f1_macro": 0.6008077464320003,
2589
+ "eval_loss": 1.9338207244873047,
2590
+ "eval_runtime": 109.813,
2591
+ "eval_samples_per_second": 79.135,
2592
+ "eval_steps_per_second": 2.477,
2593
+ "step": 27200
2594
+ },
2595
+ {
2596
+ "epoch": 68.9829437776374,
2597
+ "grad_norm": 0.03730254992842674,
2598
+ "learning_rate": 5e-05,
2599
+ "loss": 0.0065,
2600
+ "step": 27300
2601
+ },
2602
+ {
2603
+ "epoch": 69.23562855337966,
2604
+ "grad_norm": 0.11829439550638199,
2605
+ "learning_rate": 4.814814814814815e-05,
2606
+ "loss": 0.0058,
2607
+ "step": 27400
2608
+ },
2609
+ {
2610
+ "epoch": 69.48831332912192,
2611
+ "grad_norm": 1.238792896270752,
2612
+ "learning_rate": 4.6296296296296294e-05,
2613
+ "loss": 0.0047,
2614
+ "step": 27500
2615
+ },
2616
+ {
2617
+ "epoch": 69.74099810486418,
2618
+ "grad_norm": 0.08776523917913437,
2619
+ "learning_rate": 4.4444444444444447e-05,
2620
+ "loss": 0.0069,
2621
+ "step": 27600
2622
+ },
2623
+ {
2624
+ "epoch": 69.74099810486418,
2625
+ "eval_accuracy": 0.7914844649021864,
2626
+ "eval_f1_macro": 0.5974683158925355,
2627
+ "eval_loss": 1.9561039209365845,
2628
+ "eval_runtime": 109.6013,
2629
+ "eval_samples_per_second": 79.287,
2630
+ "eval_steps_per_second": 2.482,
2631
+ "step": 27600
2632
+ },
2633
+ {
2634
+ "epoch": 69.99368288060644,
2635
+ "grad_norm": 0.02279621548950672,
2636
+ "learning_rate": 4.259259259259259e-05,
2637
+ "loss": 0.0077,
2638
+ "step": 27700
2639
+ },
2640
+ {
2641
+ "epoch": 70.24636765634871,
2642
+ "grad_norm": 1.407116413116455,
2643
+ "learning_rate": 4.0740740740740745e-05,
2644
+ "loss": 0.0083,
2645
+ "step": 27800
2646
+ },
2647
+ {
2648
+ "epoch": 70.49905243209096,
2649
+ "grad_norm": 0.8993849754333496,
2650
+ "learning_rate": 3.888888888888889e-05,
2651
+ "loss": 0.0037,
2652
+ "step": 27900
2653
+ },
2654
+ {
2655
+ "epoch": 70.75173720783323,
2656
+ "grad_norm": 0.01359980646520853,
2657
+ "learning_rate": 3.7037037037037037e-05,
2658
+ "loss": 0.0037,
2659
+ "step": 28000
2660
+ },
2661
+ {
2662
+ "epoch": 70.75173720783323,
2663
+ "eval_accuracy": 0.7919447640966628,
2664
+ "eval_f1_macro": 0.5933817664756982,
2665
+ "eval_loss": 1.9411331415176392,
2666
+ "eval_runtime": 109.976,
2667
+ "eval_samples_per_second": 79.017,
2668
+ "eval_steps_per_second": 2.473,
2669
+ "step": 28000
2670
+ },
2671
+ {
2672
+ "epoch": 71.00442198357548,
2673
+ "grad_norm": 0.00356766814365983,
2674
+ "learning_rate": 3.518518518518519e-05,
2675
+ "loss": 0.0053,
2676
+ "step": 28100
2677
+ },
2678
+ {
2679
+ "epoch": 71.25710675931775,
2680
+ "grad_norm": 0.0005093297804705799,
2681
+ "learning_rate": 3.3333333333333335e-05,
2682
+ "loss": 0.0046,
2683
+ "step": 28200
2684
+ },
2685
+ {
2686
+ "epoch": 71.50979153506002,
2687
+ "grad_norm": 0.007542046718299389,
2688
+ "learning_rate": 3.148148148148148e-05,
2689
+ "loss": 0.0063,
2690
+ "step": 28300
2691
+ },
2692
+ {
2693
+ "epoch": 71.76247631080227,
2694
+ "grad_norm": 0.008996374905109406,
2695
+ "learning_rate": 2.962962962962963e-05,
2696
+ "loss": 0.0055,
2697
+ "step": 28400
2698
+ },
2699
+ {
2700
+ "epoch": 71.76247631080227,
2701
+ "eval_accuracy": 0.7947065592635213,
2702
+ "eval_f1_macro": 0.606388230425645,
2703
+ "eval_loss": 1.9448614120483398,
2704
+ "eval_runtime": 109.7807,
2705
+ "eval_samples_per_second": 79.158,
2706
+ "eval_steps_per_second": 2.478,
2707
+ "step": 28400
2708
+ },
2709
+ {
2710
+ "epoch": 72.01516108654454,
2711
+ "grad_norm": 0.27878421545028687,
2712
+ "learning_rate": 2.7777777777777776e-05,
2713
+ "loss": 0.0046,
2714
+ "step": 28500
2715
+ },
2716
+ {
2717
+ "epoch": 72.26784586228679,
2718
+ "grad_norm": 0.011219154112040997,
2719
+ "learning_rate": 2.5925925925925925e-05,
2720
+ "loss": 0.0067,
2721
+ "step": 28600
2722
+ },
2723
+ {
2724
+ "epoch": 72.52053063802906,
2725
+ "grad_norm": 0.0006956435390748084,
2726
+ "learning_rate": 2.4074074074074074e-05,
2727
+ "loss": 0.0063,
2728
+ "step": 28700
2729
+ },
2730
+ {
2731
+ "epoch": 72.77321541377133,
2732
+ "grad_norm": 0.07210740447044373,
2733
+ "learning_rate": 2.2222222222222223e-05,
2734
+ "loss": 0.0046,
2735
+ "step": 28800
2736
+ },
2737
+ {
2738
+ "epoch": 72.77321541377133,
2739
+ "eval_accuracy": 0.7941311852704258,
2740
+ "eval_f1_macro": 0.6069777778254928,
2741
+ "eval_loss": 1.9499226808547974,
2742
+ "eval_runtime": 109.6721,
2743
+ "eval_samples_per_second": 79.236,
2744
+ "eval_steps_per_second": 2.48,
2745
+ "step": 28800
2746
+ },
2747
+ {
2748
+ "epoch": 73.02590018951358,
2749
+ "grad_norm": 0.04884390905499458,
2750
+ "learning_rate": 2.0370370370370372e-05,
2751
+ "loss": 0.006,
2752
+ "step": 28900
2753
+ },
2754
+ {
2755
+ "epoch": 73.27858496525585,
2756
+ "grad_norm": 0.06649865210056305,
2757
+ "learning_rate": 1.8518518518518518e-05,
2758
+ "loss": 0.0028,
2759
+ "step": 29000
2760
+ },
2761
+ {
2762
+ "epoch": 73.5312697409981,
2763
+ "grad_norm": 0.0004434076545294374,
2764
+ "learning_rate": 1.6666666666666667e-05,
2765
+ "loss": 0.009,
2766
+ "step": 29100
2767
+ },
2768
+ {
2769
+ "epoch": 73.78395451674037,
2770
+ "grad_norm": 0.15433602035045624,
2771
+ "learning_rate": 1.4814814814814815e-05,
2772
+ "loss": 0.0063,
2773
+ "step": 29200
2774
+ },
2775
+ {
2776
+ "epoch": 73.78395451674037,
2777
+ "eval_accuracy": 0.7918296892980438,
2778
+ "eval_f1_macro": 0.5927699398802871,
2779
+ "eval_loss": 1.9630827903747559,
2780
+ "eval_runtime": 109.0221,
2781
+ "eval_samples_per_second": 79.709,
2782
+ "eval_steps_per_second": 2.495,
2783
+ "step": 29200
2784
+ },
2785
+ {
2786
+ "epoch": 74.03663929248263,
2787
+ "grad_norm": 0.0017094516661018133,
2788
+ "learning_rate": 1.2962962962962962e-05,
2789
+ "loss": 0.0046,
2790
+ "step": 29300
2791
+ },
2792
+ {
2793
+ "epoch": 74.28932406822489,
2794
+ "grad_norm": 1.173220157623291,
2795
+ "learning_rate": 1.1111111111111112e-05,
2796
+ "loss": 0.0037,
2797
+ "step": 29400
2798
+ },
2799
+ {
2800
+ "epoch": 74.54200884396715,
2801
+ "grad_norm": 0.006894304417073727,
2802
+ "learning_rate": 9.259259259259259e-06,
2803
+ "loss": 0.0042,
2804
+ "step": 29500
2805
+ },
2806
+ {
2807
+ "epoch": 74.7946936197094,
2808
+ "grad_norm": 0.005111688282340765,
2809
+ "learning_rate": 7.4074074074074075e-06,
2810
+ "loss": 0.0052,
2811
+ "step": 29600
2812
+ },
2813
+ {
2814
+ "epoch": 74.7946936197094,
2815
+ "eval_accuracy": 0.7937859608745684,
2816
+ "eval_f1_macro": 0.5940671707873302,
2817
+ "eval_loss": 1.951813817024231,
2818
+ "eval_runtime": 108.972,
2819
+ "eval_samples_per_second": 79.745,
2820
+ "eval_steps_per_second": 2.496,
2821
+ "step": 29600
2822
+ },
2823
+ {
2824
+ "epoch": 75.04737839545167,
2825
+ "grad_norm": 0.0013843182241544127,
2826
+ "learning_rate": 5.555555555555556e-06,
2827
+ "loss": 0.0057,
2828
+ "step": 29700
2829
+ },
2830
+ {
2831
+ "epoch": 75.30006317119394,
2832
+ "grad_norm": 9.947911894414574e-05,
2833
+ "learning_rate": 3.7037037037037037e-06,
2834
+ "loss": 0.0052,
2835
+ "step": 29800
2836
+ },
2837
+ {
2838
+ "epoch": 75.5527479469362,
2839
+ "grad_norm": 0.3020813763141632,
2840
+ "learning_rate": 1.8518518518518519e-06,
2841
+ "loss": 0.006,
2842
+ "step": 29900
2843
+ },
2844
+ {
2845
+ "epoch": 75.80543272267846,
2846
+ "grad_norm": 1.3610678911209106,
2847
+ "learning_rate": 0.0,
2848
+ "loss": 0.0058,
2849
+ "step": 30000
2850
+ },
2851
+ {
2852
+ "epoch": 75.80543272267846,
2853
+ "eval_accuracy": 0.7934407364787112,
2854
+ "eval_f1_macro": 0.6028809095714917,
2855
+ "eval_loss": 1.9492535591125488,
2856
+ "eval_runtime": 108.7371,
2857
+ "eval_samples_per_second": 79.918,
2858
+ "eval_steps_per_second": 2.501,
2859
+ "step": 30000
2860
+ }
2861
+ ],
2862
+ "logging_steps": 100,
2863
+ "max_steps": 30000,
2864
+ "num_input_tokens_seen": 0,
2865
+ "num_train_epochs": 76,
2866
+ "save_steps": 30000,
2867
+ "stateful_callbacks": {
2868
+ "TrainerControl": {
2869
+ "args": {
2870
+ "should_epoch_stop": false,
2871
+ "should_evaluate": false,
2872
+ "should_log": false,
2873
+ "should_save": true,
2874
+ "should_training_stop": true
2875
+ },
2876
+ "attributes": {}
2877
+ }
2878
+ },
2879
+ "total_flos": 5.8164789316384843e+20,
2880
+ "train_batch_size": 32,
2881
+ "trial_name": null,
2882
+ "trial_params": null
2883
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b14532a009c641bdd0c8eb184939690a7f953fdf842b11ea7a30fd47ea759a9
3
+ size 5240