marcus-castalk committed on
Commit
5aeb525
·
verified ·
1 Parent(s): fdc400c

Upload folder using huggingface_hub

Browse files
config.json ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen2ForCausalLM"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "bos_token_id": 151643,
7
+ "eos_token_id": 151645,
8
+ "hidden_act": "silu",
9
+ "hidden_size": 2048,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 11008,
12
+ "layer_types": [
13
+ "full_attention",
14
+ "full_attention",
15
+ "full_attention",
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention",
41
+ "full_attention",
42
+ "full_attention",
43
+ "full_attention",
44
+ "full_attention",
45
+ "full_attention",
46
+ "full_attention",
47
+ "full_attention",
48
+ "full_attention"
49
+ ],
50
+ "max_position_embeddings": 32768,
51
+ "max_window_layers": 70,
52
+ "model_type": "qwen2",
53
+ "num_attention_heads": 16,
54
+ "num_hidden_layers": 36,
55
+ "num_key_value_heads": 2,
56
+ "rms_norm_eps": 1e-06,
57
+ "rope_scaling": null,
58
+ "rope_theta": 1000000.0,
59
+ "sliding_window": null,
60
+ "tie_word_embeddings": true,
61
+ "torch_dtype": "bfloat16",
62
+ "transformers_version": "4.53.1",
63
+ "use_cache": true,
64
+ "use_sliding_window": false,
65
+ "vocab_size": 151936
66
+ }
generation_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "repetition_penalty": 1.05,
10
+ "temperature": 0.7,
11
+ "top_k": 20,
12
+ "top_p": 0.8,
13
+ "transformers_version": "4.53.1"
14
+ }
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3a933fc7e35615df5ab9e2966cf53475fccb8773812503163021846073d3270
3
+ size 4957560304
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f0d4b2e356c35872a3760bf971ad1a0ee89f3ec27d9fc515187e872c7af2754c
3
+ size 1214366696
model.safetensors.index.json ADDED
@@ -0,0 +1,442 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 3085938688,
4
+ "total_size": 6171877376
5
+ },
6
+ "weight_map": {
7
+ "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
8
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
9
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
10
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
11
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
12
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
13
+ "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
14
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
15
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
16
+ "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
17
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
18
+ "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
19
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
20
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
21
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
22
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
23
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
24
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
25
+ "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
26
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
27
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
28
+ "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
29
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
30
+ "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
31
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
32
+ "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
33
+ "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
34
+ "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
35
+ "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
36
+ "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
37
+ "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
38
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
39
+ "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
40
+ "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
41
+ "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
42
+ "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
43
+ "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
44
+ "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
45
+ "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
46
+ "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
47
+ "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
48
+ "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
49
+ "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
50
+ "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
51
+ "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
52
+ "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
53
+ "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
54
+ "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
55
+ "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
56
+ "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
57
+ "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
58
+ "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
59
+ "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
60
+ "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
61
+ "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
62
+ "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
63
+ "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
64
+ "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
65
+ "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
66
+ "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
67
+ "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
68
+ "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
69
+ "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
70
+ "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
71
+ "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
72
+ "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
73
+ "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
74
+ "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
75
+ "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
76
+ "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
77
+ "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
78
+ "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
79
+ "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
80
+ "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
81
+ "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
82
+ "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
83
+ "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
84
+ "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
85
+ "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
86
+ "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
87
+ "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
88
+ "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
89
+ "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
90
+ "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
91
+ "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
92
+ "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
93
+ "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
94
+ "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
95
+ "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
96
+ "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
97
+ "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
98
+ "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
99
+ "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
100
+ "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
101
+ "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
102
+ "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
103
+ "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
104
+ "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
105
+ "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
106
+ "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
107
+ "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
108
+ "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
109
+ "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
110
+ "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
111
+ "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
112
+ "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
113
+ "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
114
+ "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
115
+ "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
116
+ "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
117
+ "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
118
+ "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
119
+ "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
120
+ "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
121
+ "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
122
+ "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
123
+ "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
124
+ "model.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
125
+ "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
126
+ "model.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
127
+ "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
128
+ "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
129
+ "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
130
+ "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
131
+ "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
132
+ "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
133
+ "model.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
134
+ "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
135
+ "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
136
+ "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
137
+ "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
138
+ "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
139
+ "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
140
+ "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
141
+ "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
142
+ "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
143
+ "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
144
+ "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
145
+ "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
146
+ "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
147
+ "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
148
+ "model.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
149
+ "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
150
+ "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
151
+ "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
152
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
153
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
154
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
155
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
156
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
157
+ "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
158
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
159
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
160
+ "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
161
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
162
+ "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
163
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
164
+ "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors",
165
+ "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
166
+ "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
167
+ "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
168
+ "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
169
+ "model.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
170
+ "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
171
+ "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
172
+ "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
173
+ "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
174
+ "model.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
175
+ "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
176
+ "model.layers.21.input_layernorm.weight": "model-00001-of-00002.safetensors",
177
+ "model.layers.21.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
178
+ "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
179
+ "model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
180
+ "model.layers.21.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
181
+ "model.layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
182
+ "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
183
+ "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
184
+ "model.layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
185
+ "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
186
+ "model.layers.21.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
187
+ "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
188
+ "model.layers.22.input_layernorm.weight": "model-00001-of-00002.safetensors",
189
+ "model.layers.22.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
190
+ "model.layers.22.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
191
+ "model.layers.22.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
192
+ "model.layers.22.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
193
+ "model.layers.22.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
194
+ "model.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
195
+ "model.layers.22.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
196
+ "model.layers.22.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
197
+ "model.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
198
+ "model.layers.22.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
199
+ "model.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
200
+ "model.layers.23.input_layernorm.weight": "model-00001-of-00002.safetensors",
201
+ "model.layers.23.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
202
+ "model.layers.23.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
203
+ "model.layers.23.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
204
+ "model.layers.23.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
205
+ "model.layers.23.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
206
+ "model.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
207
+ "model.layers.23.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
208
+ "model.layers.23.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
209
+ "model.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
210
+ "model.layers.23.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
211
+ "model.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
212
+ "model.layers.24.input_layernorm.weight": "model-00001-of-00002.safetensors",
213
+ "model.layers.24.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
214
+ "model.layers.24.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
215
+ "model.layers.24.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
216
+ "model.layers.24.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
217
+ "model.layers.24.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
218
+ "model.layers.24.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
219
+ "model.layers.24.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
220
+ "model.layers.24.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
221
+ "model.layers.24.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
222
+ "model.layers.24.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
223
+ "model.layers.24.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
224
+ "model.layers.25.input_layernorm.weight": "model-00001-of-00002.safetensors",
225
+ "model.layers.25.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
226
+ "model.layers.25.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
227
+ "model.layers.25.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
228
+ "model.layers.25.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
229
+ "model.layers.25.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
230
+ "model.layers.25.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
231
+ "model.layers.25.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
232
+ "model.layers.25.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
233
+ "model.layers.25.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
234
+ "model.layers.25.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
235
+ "model.layers.25.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
236
+ "model.layers.26.input_layernorm.weight": "model-00001-of-00002.safetensors",
237
+ "model.layers.26.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
238
+ "model.layers.26.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
239
+ "model.layers.26.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
240
+ "model.layers.26.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
241
+ "model.layers.26.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
242
+ "model.layers.26.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
243
+ "model.layers.26.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
244
+ "model.layers.26.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
245
+ "model.layers.26.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
246
+ "model.layers.26.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
247
+ "model.layers.26.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
248
+ "model.layers.27.input_layernorm.weight": "model-00001-of-00002.safetensors",
249
+ "model.layers.27.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
250
+ "model.layers.27.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
251
+ "model.layers.27.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
252
+ "model.layers.27.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
253
+ "model.layers.27.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
254
+ "model.layers.27.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
255
+ "model.layers.27.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
256
+ "model.layers.27.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
257
+ "model.layers.27.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
258
+ "model.layers.27.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
259
+ "model.layers.27.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
260
+ "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors",
261
+ "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
262
+ "model.layers.28.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
263
+ "model.layers.28.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
264
+ "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
265
+ "model.layers.28.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
266
+ "model.layers.28.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
267
+ "model.layers.28.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
268
+ "model.layers.28.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
269
+ "model.layers.28.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
270
+ "model.layers.28.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
271
+ "model.layers.28.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
272
+ "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors",
273
+ "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
274
+ "model.layers.29.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
275
+ "model.layers.29.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
276
+ "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
277
+ "model.layers.29.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
278
+ "model.layers.29.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
279
+ "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
280
+ "model.layers.29.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
281
+ "model.layers.29.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
282
+ "model.layers.29.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
283
+ "model.layers.29.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
284
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
285
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
286
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
287
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
288
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
289
+ "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
290
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
291
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
292
+ "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
293
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
294
+ "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
295
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
296
+ "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors",
297
+ "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
298
+ "model.layers.30.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
299
+ "model.layers.30.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
300
+ "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
301
+ "model.layers.30.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
302
+ "model.layers.30.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
303
+ "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
304
+ "model.layers.30.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
305
+ "model.layers.30.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
306
+ "model.layers.30.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
307
+ "model.layers.30.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
308
+ "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors",
309
+ "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
310
+ "model.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
311
+ "model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
312
+ "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
313
+ "model.layers.31.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
314
+ "model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
315
+ "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
316
+ "model.layers.31.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
317
+ "model.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
318
+ "model.layers.31.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
319
+ "model.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
320
+ "model.layers.32.input_layernorm.weight": "model-00002-of-00002.safetensors",
321
+ "model.layers.32.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
322
+ "model.layers.32.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
323
+ "model.layers.32.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
324
+ "model.layers.32.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
325
+ "model.layers.32.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
326
+ "model.layers.32.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
327
+ "model.layers.32.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
328
+ "model.layers.32.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
329
+ "model.layers.32.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
330
+ "model.layers.32.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
331
+ "model.layers.32.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
332
+ "model.layers.33.input_layernorm.weight": "model-00002-of-00002.safetensors",
333
+ "model.layers.33.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
334
+ "model.layers.33.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
335
+ "model.layers.33.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
336
+ "model.layers.33.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
337
+ "model.layers.33.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
338
+ "model.layers.33.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
339
+ "model.layers.33.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
340
+ "model.layers.33.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
341
+ "model.layers.33.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
342
+ "model.layers.33.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
343
+ "model.layers.33.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
344
+ "model.layers.34.input_layernorm.weight": "model-00002-of-00002.safetensors",
345
+ "model.layers.34.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
346
+ "model.layers.34.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
347
+ "model.layers.34.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
348
+ "model.layers.34.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
349
+ "model.layers.34.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
350
+ "model.layers.34.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
351
+ "model.layers.34.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
352
+ "model.layers.34.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
353
+ "model.layers.34.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
354
+ "model.layers.34.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
355
+ "model.layers.34.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
356
+ "model.layers.35.input_layernorm.weight": "model-00002-of-00002.safetensors",
357
+ "model.layers.35.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
358
+ "model.layers.35.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
359
+ "model.layers.35.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
360
+ "model.layers.35.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
361
+ "model.layers.35.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
362
+ "model.layers.35.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
363
+ "model.layers.35.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
364
+ "model.layers.35.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
365
+ "model.layers.35.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
366
+ "model.layers.35.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
367
+ "model.layers.35.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
368
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
369
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
370
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
371
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
372
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
373
+ "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
374
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
375
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
376
+ "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
377
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
378
+ "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
379
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
380
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
381
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
382
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
383
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
384
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
385
+ "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
386
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
387
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
388
+ "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
389
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
390
+ "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
391
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
392
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
393
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
394
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
395
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
396
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
397
+ "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
398
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
399
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
400
+ "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
401
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
402
+ "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
403
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
404
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
405
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
406
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
407
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
408
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
409
+ "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
410
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
411
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
412
+ "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
413
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
414
+ "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
415
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
416
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
417
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
418
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
419
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
420
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
421
+ "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
422
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
423
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
424
+ "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
425
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
426
+ "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
427
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
428
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
429
+ "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
430
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
431
+ "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
432
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
433
+ "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
434
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
435
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
436
+ "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
437
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
438
+ "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
439
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
440
+ "model.norm.weight": "model-00002-of-00002.safetensors"
441
+ }
442
+ }
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4e63fe992c6c928518dc9c9f832a8aa53118c3c86c06344552df3040e7f9d72
3
+ size 12344133221
rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8bfe1981024ef92f2da08a90c72c7c793d1cc9de1547abd2556c968be70232eb
3
+ size 16389
rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a35b845d476d830805793c3dcf8ac2daad87fec289bff3f7eda9e72fc374eda1
3
+ size 16389
rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:03e9880996b01262a807d1ec3ebd91eee540e08130a14a45a4648731fd0d48a9
3
+ size 16389
rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee25c237d6fe62ec76adcf7daf899d7ed32eab5d1a5b447b911f4451c9a1b258
3
+ size 16389
rng_state_4.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a6b31133f29a8fc0cb538aa807d6a403bd51939336bfd425cd3d122d8c5595c
3
+ size 16389
rng_state_5.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a26c55b5c7fa0522b1d27b2c00a7ea77ad010f19a1321991165c5c972b8fa97a
3
+ size 16389
rng_state_6.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a1a3cf85626196804f25a8293e22dc561bba068a70fb123e04afe4896c33972
3
+ size 16389
rng_state_7.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28f87c1ee5f5db346c7b913137cbccd196eaf8ec5a4cf9f192418a3069269b49
3
+ size 16389
trainer_state.json ADDED
@@ -0,0 +1,2834 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.986294567484358,
6
+ "eval_steps": 500,
7
+ "global_step": 20000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.004965736418710895,
14
+ "grad_norm": 1.140625,
15
+ "learning_rate": 4.9918892971827726e-05,
16
+ "loss": 1.2895,
17
+ "step": 50
18
+ },
19
+ {
20
+ "epoch": 0.00993147283742179,
21
+ "grad_norm": 1.0859375,
22
+ "learning_rate": 4.9836130698182544e-05,
23
+ "loss": 1.0572,
24
+ "step": 100
25
+ },
26
+ {
27
+ "epoch": 0.014897209256132684,
28
+ "grad_norm": 1.0078125,
29
+ "learning_rate": 4.975336842453736e-05,
30
+ "loss": 1.0198,
31
+ "step": 150
32
+ },
33
+ {
34
+ "epoch": 0.01986294567484358,
35
+ "grad_norm": 1.0078125,
36
+ "learning_rate": 4.967060615089218e-05,
37
+ "loss": 1.0004,
38
+ "step": 200
39
+ },
40
+ {
41
+ "epoch": 0.024828682093554474,
42
+ "grad_norm": 1.0546875,
43
+ "learning_rate": 4.9587843877246996e-05,
44
+ "loss": 0.9881,
45
+ "step": 250
46
+ },
47
+ {
48
+ "epoch": 0.02979441851226537,
49
+ "grad_norm": 0.93359375,
50
+ "learning_rate": 4.9505081603601814e-05,
51
+ "loss": 0.977,
52
+ "step": 300
53
+ },
54
+ {
55
+ "epoch": 0.03476015493097626,
56
+ "grad_norm": 0.875,
57
+ "learning_rate": 4.942231932995664e-05,
58
+ "loss": 0.9686,
59
+ "step": 350
60
+ },
61
+ {
62
+ "epoch": 0.03972589134968716,
63
+ "grad_norm": 0.85546875,
64
+ "learning_rate": 4.9339557056311455e-05,
65
+ "loss": 0.9622,
66
+ "step": 400
67
+ },
68
+ {
69
+ "epoch": 0.04469162776839805,
70
+ "grad_norm": 0.890625,
71
+ "learning_rate": 4.925679478266627e-05,
72
+ "loss": 0.9588,
73
+ "step": 450
74
+ },
75
+ {
76
+ "epoch": 0.04965736418710895,
77
+ "grad_norm": 0.89453125,
78
+ "learning_rate": 4.917403250902109e-05,
79
+ "loss": 0.9526,
80
+ "step": 500
81
+ },
82
+ {
83
+ "epoch": 0.054623100605819846,
84
+ "grad_norm": 0.86328125,
85
+ "learning_rate": 4.909127023537591e-05,
86
+ "loss": 0.9471,
87
+ "step": 550
88
+ },
89
+ {
90
+ "epoch": 0.05958883702453074,
91
+ "grad_norm": 0.88671875,
92
+ "learning_rate": 4.9008507961730725e-05,
93
+ "loss": 0.9441,
94
+ "step": 600
95
+ },
96
+ {
97
+ "epoch": 0.06455457344324163,
98
+ "grad_norm": 0.87109375,
99
+ "learning_rate": 4.892574568808554e-05,
100
+ "loss": 0.9408,
101
+ "step": 650
102
+ },
103
+ {
104
+ "epoch": 0.06952030986195253,
105
+ "grad_norm": 0.8125,
106
+ "learning_rate": 4.8842983414440366e-05,
107
+ "loss": 0.9375,
108
+ "step": 700
109
+ },
110
+ {
111
+ "epoch": 0.07448604628066342,
112
+ "grad_norm": 0.796875,
113
+ "learning_rate": 4.8760221140795184e-05,
114
+ "loss": 0.9368,
115
+ "step": 750
116
+ },
117
+ {
118
+ "epoch": 0.07945178269937432,
119
+ "grad_norm": 1.015625,
120
+ "learning_rate": 4.867745886715e-05,
121
+ "loss": 0.9335,
122
+ "step": 800
123
+ },
124
+ {
125
+ "epoch": 0.08441751911808522,
126
+ "grad_norm": 0.8046875,
127
+ "learning_rate": 4.859469659350482e-05,
128
+ "loss": 0.935,
129
+ "step": 850
130
+ },
131
+ {
132
+ "epoch": 0.0893832555367961,
133
+ "grad_norm": 0.93359375,
134
+ "learning_rate": 4.8511934319859636e-05,
135
+ "loss": 0.9273,
136
+ "step": 900
137
+ },
138
+ {
139
+ "epoch": 0.094348991955507,
140
+ "grad_norm": 0.83984375,
141
+ "learning_rate": 4.8429172046214454e-05,
142
+ "loss": 0.9275,
143
+ "step": 950
144
+ },
145
+ {
146
+ "epoch": 0.0993147283742179,
147
+ "grad_norm": 0.859375,
148
+ "learning_rate": 4.834640977256927e-05,
149
+ "loss": 0.9268,
150
+ "step": 1000
151
+ },
152
+ {
153
+ "epoch": 0.1042804647929288,
154
+ "grad_norm": 0.8125,
155
+ "learning_rate": 4.826364749892409e-05,
156
+ "loss": 0.9264,
157
+ "step": 1050
158
+ },
159
+ {
160
+ "epoch": 0.10924620121163969,
161
+ "grad_norm": 0.80859375,
162
+ "learning_rate": 4.818088522527891e-05,
163
+ "loss": 0.9232,
164
+ "step": 1100
165
+ },
166
+ {
167
+ "epoch": 0.11421193763035058,
168
+ "grad_norm": 0.8828125,
169
+ "learning_rate": 4.809812295163373e-05,
170
+ "loss": 0.9188,
171
+ "step": 1150
172
+ },
173
+ {
174
+ "epoch": 0.11917767404906147,
175
+ "grad_norm": 0.859375,
176
+ "learning_rate": 4.801536067798855e-05,
177
+ "loss": 0.9197,
178
+ "step": 1200
179
+ },
180
+ {
181
+ "epoch": 0.12414341046777237,
182
+ "grad_norm": 0.91015625,
183
+ "learning_rate": 4.7932598404343365e-05,
184
+ "loss": 0.919,
185
+ "step": 1250
186
+ },
187
+ {
188
+ "epoch": 0.12910914688648326,
189
+ "grad_norm": 0.85546875,
190
+ "learning_rate": 4.784983613069818e-05,
191
+ "loss": 0.9211,
192
+ "step": 1300
193
+ },
194
+ {
195
+ "epoch": 0.13407488330519415,
196
+ "grad_norm": 0.77734375,
197
+ "learning_rate": 4.7767073857053e-05,
198
+ "loss": 0.9153,
199
+ "step": 1350
200
+ },
201
+ {
202
+ "epoch": 0.13904061972390505,
203
+ "grad_norm": 0.828125,
204
+ "learning_rate": 4.768431158340782e-05,
205
+ "loss": 0.911,
206
+ "step": 1400
207
+ },
208
+ {
209
+ "epoch": 0.14400635614261595,
210
+ "grad_norm": 0.80078125,
211
+ "learning_rate": 4.760154930976264e-05,
212
+ "loss": 0.9112,
213
+ "step": 1450
214
+ },
215
+ {
216
+ "epoch": 0.14897209256132685,
217
+ "grad_norm": 0.7890625,
218
+ "learning_rate": 4.751878703611746e-05,
219
+ "loss": 0.9069,
220
+ "step": 1500
221
+ },
222
+ {
223
+ "epoch": 0.15393782898003774,
224
+ "grad_norm": 0.7734375,
225
+ "learning_rate": 4.7436024762472276e-05,
226
+ "loss": 0.9061,
227
+ "step": 1550
228
+ },
229
+ {
230
+ "epoch": 0.15890356539874864,
231
+ "grad_norm": 0.8046875,
232
+ "learning_rate": 4.7353262488827094e-05,
233
+ "loss": 0.909,
234
+ "step": 1600
235
+ },
236
+ {
237
+ "epoch": 0.16386930181745954,
238
+ "grad_norm": 0.8125,
239
+ "learning_rate": 4.727050021518191e-05,
240
+ "loss": 0.9065,
241
+ "step": 1650
242
+ },
243
+ {
244
+ "epoch": 0.16883503823617044,
245
+ "grad_norm": 0.7890625,
246
+ "learning_rate": 4.718773794153673e-05,
247
+ "loss": 0.9103,
248
+ "step": 1700
249
+ },
250
+ {
251
+ "epoch": 0.1738007746548813,
252
+ "grad_norm": 0.8125,
253
+ "learning_rate": 4.7104975667891546e-05,
254
+ "loss": 0.9018,
255
+ "step": 1750
256
+ },
257
+ {
258
+ "epoch": 0.1787665110735922,
259
+ "grad_norm": 0.76953125,
260
+ "learning_rate": 4.702221339424637e-05,
261
+ "loss": 0.901,
262
+ "step": 1800
263
+ },
264
+ {
265
+ "epoch": 0.1837322474923031,
266
+ "grad_norm": 0.81640625,
267
+ "learning_rate": 4.693945112060119e-05,
268
+ "loss": 0.9015,
269
+ "step": 1850
270
+ },
271
+ {
272
+ "epoch": 0.188697983911014,
273
+ "grad_norm": 0.8046875,
274
+ "learning_rate": 4.6856688846956005e-05,
275
+ "loss": 0.903,
276
+ "step": 1900
277
+ },
278
+ {
279
+ "epoch": 0.1936637203297249,
280
+ "grad_norm": 0.8359375,
281
+ "learning_rate": 4.677392657331082e-05,
282
+ "loss": 0.9023,
283
+ "step": 1950
284
+ },
285
+ {
286
+ "epoch": 0.1986294567484358,
287
+ "grad_norm": 0.9453125,
288
+ "learning_rate": 4.669116429966564e-05,
289
+ "loss": 0.8992,
290
+ "step": 2000
291
+ },
292
+ {
293
+ "epoch": 0.2035951931671467,
294
+ "grad_norm": 0.80859375,
295
+ "learning_rate": 4.660840202602046e-05,
296
+ "loss": 0.9002,
297
+ "step": 2050
298
+ },
299
+ {
300
+ "epoch": 0.2085609295858576,
301
+ "grad_norm": 0.83203125,
302
+ "learning_rate": 4.6525639752375274e-05,
303
+ "loss": 0.8992,
304
+ "step": 2100
305
+ },
306
+ {
307
+ "epoch": 0.2135266660045685,
308
+ "grad_norm": 0.77734375,
309
+ "learning_rate": 4.64428774787301e-05,
310
+ "loss": 0.8982,
311
+ "step": 2150
312
+ },
313
+ {
314
+ "epoch": 0.21849240242327939,
315
+ "grad_norm": 0.80859375,
316
+ "learning_rate": 4.6360115205084916e-05,
317
+ "loss": 0.9018,
318
+ "step": 2200
319
+ },
320
+ {
321
+ "epoch": 0.22345813884199026,
322
+ "grad_norm": 0.890625,
323
+ "learning_rate": 4.6277352931439734e-05,
324
+ "loss": 0.8944,
325
+ "step": 2250
326
+ },
327
+ {
328
+ "epoch": 0.22842387526070115,
329
+ "grad_norm": 0.78515625,
330
+ "learning_rate": 4.619459065779455e-05,
331
+ "loss": 0.898,
332
+ "step": 2300
333
+ },
334
+ {
335
+ "epoch": 0.23338961167941205,
336
+ "grad_norm": 0.8671875,
337
+ "learning_rate": 4.611182838414937e-05,
338
+ "loss": 0.8938,
339
+ "step": 2350
340
+ },
341
+ {
342
+ "epoch": 0.23835534809812295,
343
+ "grad_norm": 0.76171875,
344
+ "learning_rate": 4.6029066110504186e-05,
345
+ "loss": 0.896,
346
+ "step": 2400
347
+ },
348
+ {
349
+ "epoch": 0.24332108451683385,
350
+ "grad_norm": 0.78125,
351
+ "learning_rate": 4.5946303836859e-05,
352
+ "loss": 0.8924,
353
+ "step": 2450
354
+ },
355
+ {
356
+ "epoch": 0.24828682093554474,
357
+ "grad_norm": 0.765625,
358
+ "learning_rate": 4.586354156321383e-05,
359
+ "loss": 0.8906,
360
+ "step": 2500
361
+ },
362
+ {
363
+ "epoch": 0.2532525573542556,
364
+ "grad_norm": 0.81640625,
365
+ "learning_rate": 4.5780779289568645e-05,
366
+ "loss": 0.8932,
367
+ "step": 2550
368
+ },
369
+ {
370
+ "epoch": 0.2582182937729665,
371
+ "grad_norm": 0.84765625,
372
+ "learning_rate": 4.569801701592346e-05,
373
+ "loss": 0.8941,
374
+ "step": 2600
375
+ },
376
+ {
377
+ "epoch": 0.2631840301916774,
378
+ "grad_norm": 0.7890625,
379
+ "learning_rate": 4.561525474227828e-05,
380
+ "loss": 0.8934,
381
+ "step": 2650
382
+ },
383
+ {
384
+ "epoch": 0.2681497666103883,
385
+ "grad_norm": 0.80078125,
386
+ "learning_rate": 4.55324924686331e-05,
387
+ "loss": 0.8909,
388
+ "step": 2700
389
+ },
390
+ {
391
+ "epoch": 0.2731155030290992,
392
+ "grad_norm": 0.796875,
393
+ "learning_rate": 4.5449730194987914e-05,
394
+ "loss": 0.8871,
395
+ "step": 2750
396
+ },
397
+ {
398
+ "epoch": 0.2780812394478101,
399
+ "grad_norm": 0.78125,
400
+ "learning_rate": 4.536696792134273e-05,
401
+ "loss": 0.8916,
402
+ "step": 2800
403
+ },
404
+ {
405
+ "epoch": 0.283046975866521,
406
+ "grad_norm": 0.796875,
407
+ "learning_rate": 4.5284205647697556e-05,
408
+ "loss": 0.8892,
409
+ "step": 2850
410
+ },
411
+ {
412
+ "epoch": 0.2880127122852319,
413
+ "grad_norm": 0.75,
414
+ "learning_rate": 4.5201443374052373e-05,
415
+ "loss": 0.8894,
416
+ "step": 2900
417
+ },
418
+ {
419
+ "epoch": 0.2929784487039428,
420
+ "grad_norm": 0.83984375,
421
+ "learning_rate": 4.51186811004072e-05,
422
+ "loss": 0.8884,
423
+ "step": 2950
424
+ },
425
+ {
426
+ "epoch": 0.2979441851226537,
427
+ "grad_norm": 0.77734375,
428
+ "learning_rate": 4.5035918826762015e-05,
429
+ "loss": 0.8855,
430
+ "step": 3000
431
+ },
432
+ {
433
+ "epoch": 0.3029099215413646,
434
+ "grad_norm": 0.78515625,
435
+ "learning_rate": 4.495315655311683e-05,
436
+ "loss": 0.8874,
437
+ "step": 3050
438
+ },
439
+ {
440
+ "epoch": 0.3078756579600755,
441
+ "grad_norm": 0.79296875,
442
+ "learning_rate": 4.487039427947165e-05,
443
+ "loss": 0.8881,
444
+ "step": 3100
445
+ },
446
+ {
447
+ "epoch": 0.3128413943787864,
448
+ "grad_norm": 0.80078125,
449
+ "learning_rate": 4.478763200582647e-05,
450
+ "loss": 0.8832,
451
+ "step": 3150
452
+ },
453
+ {
454
+ "epoch": 0.3178071307974973,
455
+ "grad_norm": 0.765625,
456
+ "learning_rate": 4.4704869732181285e-05,
457
+ "loss": 0.8865,
458
+ "step": 3200
459
+ },
460
+ {
461
+ "epoch": 0.3227728672162082,
462
+ "grad_norm": 0.75390625,
463
+ "learning_rate": 4.46221074585361e-05,
464
+ "loss": 0.8879,
465
+ "step": 3250
466
+ },
467
+ {
468
+ "epoch": 0.3277386036349191,
469
+ "grad_norm": 0.76953125,
470
+ "learning_rate": 4.4539345184890926e-05,
471
+ "loss": 0.8857,
472
+ "step": 3300
473
+ },
474
+ {
475
+ "epoch": 0.33270434005363,
476
+ "grad_norm": 0.79296875,
477
+ "learning_rate": 4.4456582911245744e-05,
478
+ "loss": 0.8849,
479
+ "step": 3350
480
+ },
481
+ {
482
+ "epoch": 0.3376700764723409,
483
+ "grad_norm": 0.765625,
484
+ "learning_rate": 4.437382063760056e-05,
485
+ "loss": 0.8824,
486
+ "step": 3400
487
+ },
488
+ {
489
+ "epoch": 0.3426358128910517,
490
+ "grad_norm": 0.78125,
491
+ "learning_rate": 4.429105836395538e-05,
492
+ "loss": 0.8813,
493
+ "step": 3450
494
+ },
495
+ {
496
+ "epoch": 0.3476015493097626,
497
+ "grad_norm": 0.8046875,
498
+ "learning_rate": 4.4208296090310196e-05,
499
+ "loss": 0.885,
500
+ "step": 3500
501
+ },
502
+ {
503
+ "epoch": 0.3525672857284735,
504
+ "grad_norm": 0.765625,
505
+ "learning_rate": 4.4125533816665013e-05,
506
+ "loss": 0.8812,
507
+ "step": 3550
508
+ },
509
+ {
510
+ "epoch": 0.3575330221471844,
511
+ "grad_norm": 0.81640625,
512
+ "learning_rate": 4.404277154301983e-05,
513
+ "loss": 0.8826,
514
+ "step": 3600
515
+ },
516
+ {
517
+ "epoch": 0.3624987585658953,
518
+ "grad_norm": 0.76953125,
519
+ "learning_rate": 4.3960009269374655e-05,
520
+ "loss": 0.8788,
521
+ "step": 3650
522
+ },
523
+ {
524
+ "epoch": 0.3674644949846062,
525
+ "grad_norm": 0.7578125,
526
+ "learning_rate": 4.387724699572947e-05,
527
+ "loss": 0.883,
528
+ "step": 3700
529
+ },
530
+ {
531
+ "epoch": 0.3724302314033171,
532
+ "grad_norm": 0.73046875,
533
+ "learning_rate": 4.379448472208429e-05,
534
+ "loss": 0.8804,
535
+ "step": 3750
536
+ },
537
+ {
538
+ "epoch": 0.377395967822028,
539
+ "grad_norm": 0.74609375,
540
+ "learning_rate": 4.371172244843911e-05,
541
+ "loss": 0.8826,
542
+ "step": 3800
543
+ },
544
+ {
545
+ "epoch": 0.3823617042407389,
546
+ "grad_norm": 0.7734375,
547
+ "learning_rate": 4.3628960174793925e-05,
548
+ "loss": 0.8795,
549
+ "step": 3850
550
+ },
551
+ {
552
+ "epoch": 0.3873274406594498,
553
+ "grad_norm": 0.75,
554
+ "learning_rate": 4.354619790114874e-05,
555
+ "loss": 0.8791,
556
+ "step": 3900
557
+ },
558
+ {
559
+ "epoch": 0.3922931770781607,
560
+ "grad_norm": 0.82421875,
561
+ "learning_rate": 4.346343562750356e-05,
562
+ "loss": 0.8775,
563
+ "step": 3950
564
+ },
565
+ {
566
+ "epoch": 0.3972589134968716,
567
+ "grad_norm": 0.7578125,
568
+ "learning_rate": 4.3380673353858384e-05,
569
+ "loss": 0.8797,
570
+ "step": 4000
571
+ },
572
+ {
573
+ "epoch": 0.4022246499155825,
574
+ "grad_norm": 0.78515625,
575
+ "learning_rate": 4.32979110802132e-05,
576
+ "loss": 0.8796,
577
+ "step": 4050
578
+ },
579
+ {
580
+ "epoch": 0.4071903863342934,
581
+ "grad_norm": 0.76953125,
582
+ "learning_rate": 4.321514880656802e-05,
583
+ "loss": 0.8779,
584
+ "step": 4100
585
+ },
586
+ {
587
+ "epoch": 0.4121561227530043,
588
+ "grad_norm": 0.7421875,
589
+ "learning_rate": 4.3132386532922836e-05,
590
+ "loss": 0.8787,
591
+ "step": 4150
592
+ },
593
+ {
594
+ "epoch": 0.4171218591717152,
595
+ "grad_norm": 0.765625,
596
+ "learning_rate": 4.3049624259277653e-05,
597
+ "loss": 0.8785,
598
+ "step": 4200
599
+ },
600
+ {
601
+ "epoch": 0.4220875955904261,
602
+ "grad_norm": 0.76171875,
603
+ "learning_rate": 4.296686198563247e-05,
604
+ "loss": 0.8784,
605
+ "step": 4250
606
+ },
607
+ {
608
+ "epoch": 0.427053332009137,
609
+ "grad_norm": 0.765625,
610
+ "learning_rate": 4.288409971198729e-05,
611
+ "loss": 0.8763,
612
+ "step": 4300
613
+ },
614
+ {
615
+ "epoch": 0.4320190684278479,
616
+ "grad_norm": 0.79296875,
617
+ "learning_rate": 4.280133743834211e-05,
618
+ "loss": 0.8749,
619
+ "step": 4350
620
+ },
621
+ {
622
+ "epoch": 0.43698480484655877,
623
+ "grad_norm": 0.78125,
624
+ "learning_rate": 4.271857516469693e-05,
625
+ "loss": 0.8772,
626
+ "step": 4400
627
+ },
628
+ {
629
+ "epoch": 0.4419505412652696,
630
+ "grad_norm": 0.765625,
631
+ "learning_rate": 4.263581289105175e-05,
632
+ "loss": 0.8757,
633
+ "step": 4450
634
+ },
635
+ {
636
+ "epoch": 0.4469162776839805,
637
+ "grad_norm": 0.765625,
638
+ "learning_rate": 4.2553050617406565e-05,
639
+ "loss": 0.8764,
640
+ "step": 4500
641
+ },
642
+ {
643
+ "epoch": 0.4518820141026914,
644
+ "grad_norm": 0.76953125,
645
+ "learning_rate": 4.247028834376138e-05,
646
+ "loss": 0.8744,
647
+ "step": 4550
648
+ },
649
+ {
650
+ "epoch": 0.4568477505214023,
651
+ "grad_norm": 0.765625,
652
+ "learning_rate": 4.23875260701162e-05,
653
+ "loss": 0.8753,
654
+ "step": 4600
655
+ },
656
+ {
657
+ "epoch": 0.4618134869401132,
658
+ "grad_norm": 0.78515625,
659
+ "learning_rate": 4.230476379647102e-05,
660
+ "loss": 0.8743,
661
+ "step": 4650
662
+ },
663
+ {
664
+ "epoch": 0.4667792233588241,
665
+ "grad_norm": 0.73046875,
666
+ "learning_rate": 4.222200152282584e-05,
667
+ "loss": 0.8713,
668
+ "step": 4700
669
+ },
670
+ {
671
+ "epoch": 0.471744959777535,
672
+ "grad_norm": 0.8046875,
673
+ "learning_rate": 4.213923924918066e-05,
674
+ "loss": 0.8719,
675
+ "step": 4750
676
+ },
677
+ {
678
+ "epoch": 0.4767106961962459,
679
+ "grad_norm": 0.78515625,
680
+ "learning_rate": 4.2056476975535476e-05,
681
+ "loss": 0.8714,
682
+ "step": 4800
683
+ },
684
+ {
685
+ "epoch": 0.4816764326149568,
686
+ "grad_norm": 0.7578125,
687
+ "learning_rate": 4.1973714701890293e-05,
688
+ "loss": 0.8718,
689
+ "step": 4850
690
+ },
691
+ {
692
+ "epoch": 0.4866421690336677,
693
+ "grad_norm": 0.79296875,
694
+ "learning_rate": 4.189095242824511e-05,
695
+ "loss": 0.8708,
696
+ "step": 4900
697
+ },
698
+ {
699
+ "epoch": 0.4916079054523786,
700
+ "grad_norm": 0.78125,
701
+ "learning_rate": 4.180819015459993e-05,
702
+ "loss": 0.8741,
703
+ "step": 4950
704
+ },
705
+ {
706
+ "epoch": 0.4965736418710895,
707
+ "grad_norm": 0.7578125,
708
+ "learning_rate": 4.1725427880954746e-05,
709
+ "loss": 0.873,
710
+ "step": 5000
711
+ },
712
+ {
713
+ "epoch": 0.5015393782898003,
714
+ "grad_norm": 0.765625,
715
+ "learning_rate": 4.164266560730957e-05,
716
+ "loss": 0.8732,
717
+ "step": 5050
718
+ },
719
+ {
720
+ "epoch": 0.5065051147085112,
721
+ "grad_norm": 0.7734375,
722
+ "learning_rate": 4.155990333366439e-05,
723
+ "loss": 0.872,
724
+ "step": 5100
725
+ },
726
+ {
727
+ "epoch": 0.5114708511272221,
728
+ "grad_norm": 0.74609375,
729
+ "learning_rate": 4.1477141060019205e-05,
730
+ "loss": 0.8711,
731
+ "step": 5150
732
+ },
733
+ {
734
+ "epoch": 0.516436587545933,
735
+ "grad_norm": 0.77734375,
736
+ "learning_rate": 4.139437878637402e-05,
737
+ "loss": 0.8707,
738
+ "step": 5200
739
+ },
740
+ {
741
+ "epoch": 0.5214023239646439,
742
+ "grad_norm": 0.78125,
743
+ "learning_rate": 4.131161651272884e-05,
744
+ "loss": 0.8691,
745
+ "step": 5250
746
+ },
747
+ {
748
+ "epoch": 0.5263680603833548,
749
+ "grad_norm": 0.73828125,
750
+ "learning_rate": 4.122885423908366e-05,
751
+ "loss": 0.8679,
752
+ "step": 5300
753
+ },
754
+ {
755
+ "epoch": 0.5313337968020657,
756
+ "grad_norm": 0.78515625,
757
+ "learning_rate": 4.1146091965438474e-05,
758
+ "loss": 0.8686,
759
+ "step": 5350
760
+ },
761
+ {
762
+ "epoch": 0.5362995332207766,
763
+ "grad_norm": 0.7578125,
764
+ "learning_rate": 4.10633296917933e-05,
765
+ "loss": 0.8683,
766
+ "step": 5400
767
+ },
768
+ {
769
+ "epoch": 0.5412652696394875,
770
+ "grad_norm": 0.74609375,
771
+ "learning_rate": 4.0980567418148116e-05,
772
+ "loss": 0.8693,
773
+ "step": 5450
774
+ },
775
+ {
776
+ "epoch": 0.5462310060581984,
777
+ "grad_norm": 0.73828125,
778
+ "learning_rate": 4.0897805144502933e-05,
779
+ "loss": 0.8699,
780
+ "step": 5500
781
+ },
782
+ {
783
+ "epoch": 0.5511967424769093,
784
+ "grad_norm": 0.7734375,
785
+ "learning_rate": 4.081504287085775e-05,
786
+ "loss": 0.8671,
787
+ "step": 5550
788
+ },
789
+ {
790
+ "epoch": 0.5561624788956202,
791
+ "grad_norm": 0.734375,
792
+ "learning_rate": 4.073228059721257e-05,
793
+ "loss": 0.8681,
794
+ "step": 5600
795
+ },
796
+ {
797
+ "epoch": 0.5611282153143311,
798
+ "grad_norm": 0.76171875,
799
+ "learning_rate": 4.0649518323567386e-05,
800
+ "loss": 0.8703,
801
+ "step": 5650
802
+ },
803
+ {
804
+ "epoch": 0.566093951733042,
805
+ "grad_norm": 0.7421875,
806
+ "learning_rate": 4.05667560499222e-05,
807
+ "loss": 0.8675,
808
+ "step": 5700
809
+ },
810
+ {
811
+ "epoch": 0.5710596881517529,
812
+ "grad_norm": 0.78125,
813
+ "learning_rate": 4.048399377627702e-05,
814
+ "loss": 0.8697,
815
+ "step": 5750
816
+ },
817
+ {
818
+ "epoch": 0.5760254245704638,
819
+ "grad_norm": 0.75,
820
+ "learning_rate": 4.0401231502631845e-05,
821
+ "loss": 0.8685,
822
+ "step": 5800
823
+ },
824
+ {
825
+ "epoch": 0.5809911609891747,
826
+ "grad_norm": 0.76953125,
827
+ "learning_rate": 4.031846922898666e-05,
828
+ "loss": 0.8683,
829
+ "step": 5850
830
+ },
831
+ {
832
+ "epoch": 0.5859568974078856,
833
+ "grad_norm": 0.75,
834
+ "learning_rate": 4.023570695534148e-05,
835
+ "loss": 0.8649,
836
+ "step": 5900
837
+ },
838
+ {
839
+ "epoch": 0.5909226338265965,
840
+ "grad_norm": 0.76953125,
841
+ "learning_rate": 4.01529446816963e-05,
842
+ "loss": 0.8662,
843
+ "step": 5950
844
+ },
845
+ {
846
+ "epoch": 0.5958883702453074,
847
+ "grad_norm": 0.77734375,
848
+ "learning_rate": 4.0070182408051114e-05,
849
+ "loss": 0.8667,
850
+ "step": 6000
851
+ },
852
+ {
853
+ "epoch": 0.6008541066640183,
854
+ "grad_norm": 0.80859375,
855
+ "learning_rate": 3.998742013440593e-05,
856
+ "loss": 0.8682,
857
+ "step": 6050
858
+ },
859
+ {
860
+ "epoch": 0.6058198430827292,
861
+ "grad_norm": 0.734375,
862
+ "learning_rate": 3.990465786076075e-05,
863
+ "loss": 0.8682,
864
+ "step": 6100
865
+ },
866
+ {
867
+ "epoch": 0.6107855795014401,
868
+ "grad_norm": 0.73828125,
869
+ "learning_rate": 3.9821895587115573e-05,
870
+ "loss": 0.8684,
871
+ "step": 6150
872
+ },
873
+ {
874
+ "epoch": 0.615751315920151,
875
+ "grad_norm": 0.73828125,
876
+ "learning_rate": 3.973913331347039e-05,
877
+ "loss": 0.8662,
878
+ "step": 6200
879
+ },
880
+ {
881
+ "epoch": 0.6207170523388619,
882
+ "grad_norm": 0.78515625,
883
+ "learning_rate": 3.965637103982521e-05,
884
+ "loss": 0.8673,
885
+ "step": 6250
886
+ },
887
+ {
888
+ "epoch": 0.6256827887575728,
889
+ "grad_norm": 0.76953125,
890
+ "learning_rate": 3.9573608766180026e-05,
891
+ "loss": 0.8665,
892
+ "step": 6300
893
+ },
894
+ {
895
+ "epoch": 0.6306485251762837,
896
+ "grad_norm": 0.7578125,
897
+ "learning_rate": 3.949084649253484e-05,
898
+ "loss": 0.864,
899
+ "step": 6350
900
+ },
901
+ {
902
+ "epoch": 0.6356142615949946,
903
+ "grad_norm": 0.7890625,
904
+ "learning_rate": 3.940808421888966e-05,
905
+ "loss": 0.8645,
906
+ "step": 6400
907
+ },
908
+ {
909
+ "epoch": 0.6405799980137055,
910
+ "grad_norm": 0.8046875,
911
+ "learning_rate": 3.932532194524448e-05,
912
+ "loss": 0.8633,
913
+ "step": 6450
914
+ },
915
+ {
916
+ "epoch": 0.6455457344324164,
917
+ "grad_norm": 0.7890625,
918
+ "learning_rate": 3.92425596715993e-05,
919
+ "loss": 0.8676,
920
+ "step": 6500
921
+ },
922
+ {
923
+ "epoch": 0.6505114708511273,
924
+ "grad_norm": 0.77734375,
925
+ "learning_rate": 3.915979739795412e-05,
926
+ "loss": 0.866,
927
+ "step": 6550
928
+ },
929
+ {
930
+ "epoch": 0.6554772072698382,
931
+ "grad_norm": 0.78515625,
932
+ "learning_rate": 3.907703512430894e-05,
933
+ "loss": 0.8642,
934
+ "step": 6600
935
+ },
936
+ {
937
+ "epoch": 0.6604429436885491,
938
+ "grad_norm": 0.78125,
939
+ "learning_rate": 3.8994272850663754e-05,
940
+ "loss": 0.864,
941
+ "step": 6650
942
+ },
943
+ {
944
+ "epoch": 0.66540868010726,
945
+ "grad_norm": 0.78125,
946
+ "learning_rate": 3.891151057701857e-05,
947
+ "loss": 0.864,
948
+ "step": 6700
949
+ },
950
+ {
951
+ "epoch": 0.6703744165259709,
952
+ "grad_norm": 0.77734375,
953
+ "learning_rate": 3.882874830337339e-05,
954
+ "loss": 0.8617,
955
+ "step": 6750
956
+ },
957
+ {
958
+ "epoch": 0.6753401529446817,
959
+ "grad_norm": 0.80859375,
960
+ "learning_rate": 3.874598602972821e-05,
961
+ "loss": 0.8662,
962
+ "step": 6800
963
+ },
964
+ {
965
+ "epoch": 0.6803058893633926,
966
+ "grad_norm": 0.765625,
967
+ "learning_rate": 3.866322375608303e-05,
968
+ "loss": 0.8586,
969
+ "step": 6850
970
+ },
971
+ {
972
+ "epoch": 0.6852716257821034,
973
+ "grad_norm": 0.796875,
974
+ "learning_rate": 3.858046148243785e-05,
975
+ "loss": 0.8642,
976
+ "step": 6900
977
+ },
978
+ {
979
+ "epoch": 0.6902373622008143,
980
+ "grad_norm": 0.76953125,
981
+ "learning_rate": 3.8497699208792666e-05,
982
+ "loss": 0.8621,
983
+ "step": 6950
984
+ },
985
+ {
986
+ "epoch": 0.6952030986195252,
987
+ "grad_norm": 0.72265625,
988
+ "learning_rate": 3.841493693514748e-05,
989
+ "loss": 0.8614,
990
+ "step": 7000
991
+ },
992
+ {
993
+ "epoch": 0.7001688350382361,
994
+ "grad_norm": 0.7421875,
995
+ "learning_rate": 3.83321746615023e-05,
996
+ "loss": 0.8615,
997
+ "step": 7050
998
+ },
999
+ {
1000
+ "epoch": 0.705134571456947,
1001
+ "grad_norm": 0.74609375,
1002
+ "learning_rate": 3.824941238785712e-05,
1003
+ "loss": 0.8623,
1004
+ "step": 7100
1005
+ },
1006
+ {
1007
+ "epoch": 0.7101003078756579,
1008
+ "grad_norm": 0.73828125,
1009
+ "learning_rate": 3.8166650114211935e-05,
1010
+ "loss": 0.8617,
1011
+ "step": 7150
1012
+ },
1013
+ {
1014
+ "epoch": 0.7150660442943688,
1015
+ "grad_norm": 0.7421875,
1016
+ "learning_rate": 3.808388784056676e-05,
1017
+ "loss": 0.8616,
1018
+ "step": 7200
1019
+ },
1020
+ {
1021
+ "epoch": 0.7200317807130797,
1022
+ "grad_norm": 0.75,
1023
+ "learning_rate": 3.800112556692158e-05,
1024
+ "loss": 0.8625,
1025
+ "step": 7250
1026
+ },
1027
+ {
1028
+ "epoch": 0.7249975171317906,
1029
+ "grad_norm": 0.76171875,
1030
+ "learning_rate": 3.7918363293276394e-05,
1031
+ "loss": 0.8638,
1032
+ "step": 7300
1033
+ },
1034
+ {
1035
+ "epoch": 0.7299632535505015,
1036
+ "grad_norm": 1.0234375,
1037
+ "learning_rate": 3.783560101963121e-05,
1038
+ "loss": 0.8616,
1039
+ "step": 7350
1040
+ },
1041
+ {
1042
+ "epoch": 0.7349289899692124,
1043
+ "grad_norm": 0.78515625,
1044
+ "learning_rate": 3.775283874598603e-05,
1045
+ "loss": 0.8607,
1046
+ "step": 7400
1047
+ },
1048
+ {
1049
+ "epoch": 0.7398947263879233,
1050
+ "grad_norm": 0.7578125,
1051
+ "learning_rate": 3.767007647234085e-05,
1052
+ "loss": 0.8628,
1053
+ "step": 7450
1054
+ },
1055
+ {
1056
+ "epoch": 0.7448604628066342,
1057
+ "grad_norm": 0.77734375,
1058
+ "learning_rate": 3.7587314198695664e-05,
1059
+ "loss": 0.8593,
1060
+ "step": 7500
1061
+ },
1062
+ {
1063
+ "epoch": 0.7498261992253451,
1064
+ "grad_norm": 0.79296875,
1065
+ "learning_rate": 3.750455192505049e-05,
1066
+ "loss": 0.8632,
1067
+ "step": 7550
1068
+ },
1069
+ {
1070
+ "epoch": 0.754791935644056,
1071
+ "grad_norm": 0.78515625,
1072
+ "learning_rate": 3.7421789651405306e-05,
1073
+ "loss": 0.8605,
1074
+ "step": 7600
1075
+ },
1076
+ {
1077
+ "epoch": 0.7597576720627669,
1078
+ "grad_norm": 0.77734375,
1079
+ "learning_rate": 3.733902737776012e-05,
1080
+ "loss": 0.8599,
1081
+ "step": 7650
1082
+ },
1083
+ {
1084
+ "epoch": 0.7647234084814778,
1085
+ "grad_norm": 0.77734375,
1086
+ "learning_rate": 3.725626510411494e-05,
1087
+ "loss": 0.8586,
1088
+ "step": 7700
1089
+ },
1090
+ {
1091
+ "epoch": 0.7696891449001887,
1092
+ "grad_norm": 0.75390625,
1093
+ "learning_rate": 3.717350283046976e-05,
1094
+ "loss": 0.8595,
1095
+ "step": 7750
1096
+ },
1097
+ {
1098
+ "epoch": 0.7746548813188996,
1099
+ "grad_norm": 0.734375,
1100
+ "learning_rate": 3.7090740556824575e-05,
1101
+ "loss": 0.8588,
1102
+ "step": 7800
1103
+ },
1104
+ {
1105
+ "epoch": 0.7796206177376105,
1106
+ "grad_norm": 0.76171875,
1107
+ "learning_rate": 3.700797828317939e-05,
1108
+ "loss": 0.8605,
1109
+ "step": 7850
1110
+ },
1111
+ {
1112
+ "epoch": 0.7845863541563214,
1113
+ "grad_norm": 0.796875,
1114
+ "learning_rate": 3.692521600953422e-05,
1115
+ "loss": 0.8607,
1116
+ "step": 7900
1117
+ },
1118
+ {
1119
+ "epoch": 0.7895520905750323,
1120
+ "grad_norm": 0.78515625,
1121
+ "learning_rate": 3.6842453735889034e-05,
1122
+ "loss": 0.8589,
1123
+ "step": 7950
1124
+ },
1125
+ {
1126
+ "epoch": 0.7945178269937432,
1127
+ "grad_norm": 0.76953125,
1128
+ "learning_rate": 3.675969146224385e-05,
1129
+ "loss": 0.8588,
1130
+ "step": 8000
1131
+ },
1132
+ {
1133
+ "epoch": 0.7994835634124541,
1134
+ "grad_norm": 0.765625,
1135
+ "learning_rate": 3.667692918859867e-05,
1136
+ "loss": 0.8583,
1137
+ "step": 8050
1138
+ },
1139
+ {
1140
+ "epoch": 0.804449299831165,
1141
+ "grad_norm": 0.74609375,
1142
+ "learning_rate": 3.659416691495349e-05,
1143
+ "loss": 0.8592,
1144
+ "step": 8100
1145
+ },
1146
+ {
1147
+ "epoch": 0.8094150362498759,
1148
+ "grad_norm": 0.86328125,
1149
+ "learning_rate": 3.6511404641308304e-05,
1150
+ "loss": 0.8588,
1151
+ "step": 8150
1152
+ },
1153
+ {
1154
+ "epoch": 0.8143807726685868,
1155
+ "grad_norm": 0.80078125,
1156
+ "learning_rate": 3.642864236766312e-05,
1157
+ "loss": 0.8602,
1158
+ "step": 8200
1159
+ },
1160
+ {
1161
+ "epoch": 0.8193465090872977,
1162
+ "grad_norm": 0.76171875,
1163
+ "learning_rate": 3.634588009401794e-05,
1164
+ "loss": 0.8615,
1165
+ "step": 8250
1166
+ },
1167
+ {
1168
+ "epoch": 0.8243122455060086,
1169
+ "grad_norm": 0.75,
1170
+ "learning_rate": 3.626311782037276e-05,
1171
+ "loss": 0.8577,
1172
+ "step": 8300
1173
+ },
1174
+ {
1175
+ "epoch": 0.8292779819247195,
1176
+ "grad_norm": 0.765625,
1177
+ "learning_rate": 3.618035554672758e-05,
1178
+ "loss": 0.8591,
1179
+ "step": 8350
1180
+ },
1181
+ {
1182
+ "epoch": 0.8342437183434304,
1183
+ "grad_norm": 0.7734375,
1184
+ "learning_rate": 3.60975932730824e-05,
1185
+ "loss": 0.8605,
1186
+ "step": 8400
1187
+ },
1188
+ {
1189
+ "epoch": 0.8392094547621413,
1190
+ "grad_norm": 0.765625,
1191
+ "learning_rate": 3.6014830999437215e-05,
1192
+ "loss": 0.8578,
1193
+ "step": 8450
1194
+ },
1195
+ {
1196
+ "epoch": 0.8441751911808522,
1197
+ "grad_norm": 0.79296875,
1198
+ "learning_rate": 3.593206872579203e-05,
1199
+ "loss": 0.8585,
1200
+ "step": 8500
1201
+ },
1202
+ {
1203
+ "epoch": 0.8491409275995631,
1204
+ "grad_norm": 0.7734375,
1205
+ "learning_rate": 3.584930645214685e-05,
1206
+ "loss": 0.8571,
1207
+ "step": 8550
1208
+ },
1209
+ {
1210
+ "epoch": 0.854106664018274,
1211
+ "grad_norm": 0.765625,
1212
+ "learning_rate": 3.576654417850167e-05,
1213
+ "loss": 0.8581,
1214
+ "step": 8600
1215
+ },
1216
+ {
1217
+ "epoch": 0.8590724004369849,
1218
+ "grad_norm": 0.80078125,
1219
+ "learning_rate": 3.568378190485649e-05,
1220
+ "loss": 0.8569,
1221
+ "step": 8650
1222
+ },
1223
+ {
1224
+ "epoch": 0.8640381368556957,
1225
+ "grad_norm": 0.75390625,
1226
+ "learning_rate": 3.5601019631211316e-05,
1227
+ "loss": 0.8559,
1228
+ "step": 8700
1229
+ },
1230
+ {
1231
+ "epoch": 0.8690038732744066,
1232
+ "grad_norm": 0.73046875,
1233
+ "learning_rate": 3.5518257357566133e-05,
1234
+ "loss": 0.8578,
1235
+ "step": 8750
1236
+ },
1237
+ {
1238
+ "epoch": 0.8739696096931175,
1239
+ "grad_norm": 0.76171875,
1240
+ "learning_rate": 3.543549508392095e-05,
1241
+ "loss": 0.8589,
1242
+ "step": 8800
1243
+ },
1244
+ {
1245
+ "epoch": 0.8789353461118283,
1246
+ "grad_norm": 0.79296875,
1247
+ "learning_rate": 3.535273281027577e-05,
1248
+ "loss": 0.8571,
1249
+ "step": 8850
1250
+ },
1251
+ {
1252
+ "epoch": 0.8839010825305392,
1253
+ "grad_norm": 0.74609375,
1254
+ "learning_rate": 3.5269970536630586e-05,
1255
+ "loss": 0.8581,
1256
+ "step": 8900
1257
+ },
1258
+ {
1259
+ "epoch": 0.8888668189492501,
1260
+ "grad_norm": 0.76171875,
1261
+ "learning_rate": 3.51872082629854e-05,
1262
+ "loss": 0.8566,
1263
+ "step": 8950
1264
+ },
1265
+ {
1266
+ "epoch": 0.893832555367961,
1267
+ "grad_norm": 0.7578125,
1268
+ "learning_rate": 3.510444598934022e-05,
1269
+ "loss": 0.8558,
1270
+ "step": 9000
1271
+ },
1272
+ {
1273
+ "epoch": 0.8987982917866719,
1274
+ "grad_norm": 0.734375,
1275
+ "learning_rate": 3.5021683715695045e-05,
1276
+ "loss": 0.8536,
1277
+ "step": 9050
1278
+ },
1279
+ {
1280
+ "epoch": 0.9037640282053828,
1281
+ "grad_norm": 0.73046875,
1282
+ "learning_rate": 3.493892144204986e-05,
1283
+ "loss": 0.8566,
1284
+ "step": 9100
1285
+ },
1286
+ {
1287
+ "epoch": 0.9087297646240937,
1288
+ "grad_norm": 0.796875,
1289
+ "learning_rate": 3.485615916840468e-05,
1290
+ "loss": 0.8575,
1291
+ "step": 9150
1292
+ },
1293
+ {
1294
+ "epoch": 0.9136955010428046,
1295
+ "grad_norm": 0.74609375,
1296
+ "learning_rate": 3.47733968947595e-05,
1297
+ "loss": 0.8567,
1298
+ "step": 9200
1299
+ },
1300
+ {
1301
+ "epoch": 0.9186612374615155,
1302
+ "grad_norm": 0.7734375,
1303
+ "learning_rate": 3.4690634621114314e-05,
1304
+ "loss": 0.8535,
1305
+ "step": 9250
1306
+ },
1307
+ {
1308
+ "epoch": 0.9236269738802264,
1309
+ "grad_norm": 0.765625,
1310
+ "learning_rate": 3.460787234746913e-05,
1311
+ "loss": 0.8578,
1312
+ "step": 9300
1313
+ },
1314
+ {
1315
+ "epoch": 0.9285927102989373,
1316
+ "grad_norm": 0.7734375,
1317
+ "learning_rate": 3.452511007382395e-05,
1318
+ "loss": 0.8565,
1319
+ "step": 9350
1320
+ },
1321
+ {
1322
+ "epoch": 0.9335584467176482,
1323
+ "grad_norm": 0.76953125,
1324
+ "learning_rate": 3.4442347800178773e-05,
1325
+ "loss": 0.8551,
1326
+ "step": 9400
1327
+ },
1328
+ {
1329
+ "epoch": 0.9385241831363591,
1330
+ "grad_norm": 0.7734375,
1331
+ "learning_rate": 3.435958552653359e-05,
1332
+ "loss": 0.8552,
1333
+ "step": 9450
1334
+ },
1335
+ {
1336
+ "epoch": 0.94348991955507,
1337
+ "grad_norm": 0.7421875,
1338
+ "learning_rate": 3.427682325288841e-05,
1339
+ "loss": 0.8536,
1340
+ "step": 9500
1341
+ },
1342
+ {
1343
+ "epoch": 0.9484556559737809,
1344
+ "grad_norm": 0.78125,
1345
+ "learning_rate": 3.4194060979243226e-05,
1346
+ "loss": 0.8558,
1347
+ "step": 9550
1348
+ },
1349
+ {
1350
+ "epoch": 0.9534213923924918,
1351
+ "grad_norm": 0.7734375,
1352
+ "learning_rate": 3.411129870559804e-05,
1353
+ "loss": 0.8562,
1354
+ "step": 9600
1355
+ },
1356
+ {
1357
+ "epoch": 0.9583871288112027,
1358
+ "grad_norm": 0.7421875,
1359
+ "learning_rate": 3.402853643195286e-05,
1360
+ "loss": 0.8558,
1361
+ "step": 9650
1362
+ },
1363
+ {
1364
+ "epoch": 0.9633528652299136,
1365
+ "grad_norm": 0.74609375,
1366
+ "learning_rate": 3.394577415830768e-05,
1367
+ "loss": 0.8556,
1368
+ "step": 9700
1369
+ },
1370
+ {
1371
+ "epoch": 0.9683186016486245,
1372
+ "grad_norm": 0.75,
1373
+ "learning_rate": 3.38630118846625e-05,
1374
+ "loss": 0.8559,
1375
+ "step": 9750
1376
+ },
1377
+ {
1378
+ "epoch": 0.9732843380673354,
1379
+ "grad_norm": 0.83203125,
1380
+ "learning_rate": 3.378024961101732e-05,
1381
+ "loss": 0.8535,
1382
+ "step": 9800
1383
+ },
1384
+ {
1385
+ "epoch": 0.9782500744860463,
1386
+ "grad_norm": 0.7734375,
1387
+ "learning_rate": 3.369748733737214e-05,
1388
+ "loss": 0.8551,
1389
+ "step": 9850
1390
+ },
1391
+ {
1392
+ "epoch": 0.9832158109047572,
1393
+ "grad_norm": 0.79296875,
1394
+ "learning_rate": 3.3614725063726954e-05,
1395
+ "loss": 0.8606,
1396
+ "step": 9900
1397
+ },
1398
+ {
1399
+ "epoch": 0.9881815473234681,
1400
+ "grad_norm": 0.75,
1401
+ "learning_rate": 3.353196279008177e-05,
1402
+ "loss": 0.8544,
1403
+ "step": 9950
1404
+ },
1405
+ {
1406
+ "epoch": 0.993147283742179,
1407
+ "grad_norm": 0.828125,
1408
+ "learning_rate": 3.344920051643659e-05,
1409
+ "loss": 0.8527,
1410
+ "step": 10000
1411
+ },
1412
+ {
1413
+ "epoch": 0.9981130201608899,
1414
+ "grad_norm": 0.765625,
1415
+ "learning_rate": 3.3366438242791407e-05,
1416
+ "loss": 0.8521,
1417
+ "step": 10050
1418
+ },
1419
+ {
1420
+ "epoch": 1.0030787565796007,
1421
+ "grad_norm": 0.77734375,
1422
+ "learning_rate": 3.328367596914623e-05,
1423
+ "loss": 0.8398,
1424
+ "step": 10100
1425
+ },
1426
+ {
1427
+ "epoch": 1.0080444929983117,
1428
+ "grad_norm": 0.80078125,
1429
+ "learning_rate": 3.320091369550105e-05,
1430
+ "loss": 0.8317,
1431
+ "step": 10150
1432
+ },
1433
+ {
1434
+ "epoch": 1.0130102294170225,
1435
+ "grad_norm": 0.8125,
1436
+ "learning_rate": 3.3118151421855866e-05,
1437
+ "loss": 0.8283,
1438
+ "step": 10200
1439
+ },
1440
+ {
1441
+ "epoch": 1.0179759658357335,
1442
+ "grad_norm": 0.796875,
1443
+ "learning_rate": 3.303538914821068e-05,
1444
+ "loss": 0.8334,
1445
+ "step": 10250
1446
+ },
1447
+ {
1448
+ "epoch": 1.0229417022544443,
1449
+ "grad_norm": 0.76953125,
1450
+ "learning_rate": 3.29526268745655e-05,
1451
+ "loss": 0.83,
1452
+ "step": 10300
1453
+ },
1454
+ {
1455
+ "epoch": 1.0279074386731553,
1456
+ "grad_norm": 0.78125,
1457
+ "learning_rate": 3.286986460092032e-05,
1458
+ "loss": 0.8272,
1459
+ "step": 10350
1460
+ },
1461
+ {
1462
+ "epoch": 1.032873175091866,
1463
+ "grad_norm": 0.76171875,
1464
+ "learning_rate": 3.2787102327275135e-05,
1465
+ "loss": 0.8327,
1466
+ "step": 10400
1467
+ },
1468
+ {
1469
+ "epoch": 1.037838911510577,
1470
+ "grad_norm": 0.76953125,
1471
+ "learning_rate": 3.270434005362995e-05,
1472
+ "loss": 0.8302,
1473
+ "step": 10450
1474
+ },
1475
+ {
1476
+ "epoch": 1.0428046479292878,
1477
+ "grad_norm": 0.78515625,
1478
+ "learning_rate": 3.262157777998478e-05,
1479
+ "loss": 0.833,
1480
+ "step": 10500
1481
+ },
1482
+ {
1483
+ "epoch": 1.0477703843479989,
1484
+ "grad_norm": 0.80859375,
1485
+ "learning_rate": 3.2538815506339594e-05,
1486
+ "loss": 0.832,
1487
+ "step": 10550
1488
+ },
1489
+ {
1490
+ "epoch": 1.0527361207667096,
1491
+ "grad_norm": 0.80078125,
1492
+ "learning_rate": 3.245605323269441e-05,
1493
+ "loss": 0.8311,
1494
+ "step": 10600
1495
+ },
1496
+ {
1497
+ "epoch": 1.0577018571854206,
1498
+ "grad_norm": 0.80078125,
1499
+ "learning_rate": 3.237329095904923e-05,
1500
+ "loss": 0.8295,
1501
+ "step": 10650
1502
+ },
1503
+ {
1504
+ "epoch": 1.0626675936041314,
1505
+ "grad_norm": 0.80859375,
1506
+ "learning_rate": 3.2290528685404047e-05,
1507
+ "loss": 0.8309,
1508
+ "step": 10700
1509
+ },
1510
+ {
1511
+ "epoch": 1.0676333300228424,
1512
+ "grad_norm": 0.76953125,
1513
+ "learning_rate": 3.2207766411758864e-05,
1514
+ "loss": 0.8322,
1515
+ "step": 10750
1516
+ },
1517
+ {
1518
+ "epoch": 1.0725990664415532,
1519
+ "grad_norm": 0.765625,
1520
+ "learning_rate": 3.212500413811368e-05,
1521
+ "loss": 0.8318,
1522
+ "step": 10800
1523
+ },
1524
+ {
1525
+ "epoch": 1.0775648028602642,
1526
+ "grad_norm": 0.77734375,
1527
+ "learning_rate": 3.2042241864468506e-05,
1528
+ "loss": 0.8316,
1529
+ "step": 10850
1530
+ },
1531
+ {
1532
+ "epoch": 1.082530539278975,
1533
+ "grad_norm": 0.78125,
1534
+ "learning_rate": 3.195947959082332e-05,
1535
+ "loss": 0.8294,
1536
+ "step": 10900
1537
+ },
1538
+ {
1539
+ "epoch": 1.087496275697686,
1540
+ "grad_norm": 0.76171875,
1541
+ "learning_rate": 3.187671731717814e-05,
1542
+ "loss": 0.8282,
1543
+ "step": 10950
1544
+ },
1545
+ {
1546
+ "epoch": 1.0924620121163968,
1547
+ "grad_norm": 0.7578125,
1548
+ "learning_rate": 3.179395504353296e-05,
1549
+ "loss": 0.8287,
1550
+ "step": 11000
1551
+ },
1552
+ {
1553
+ "epoch": 1.0974277485351078,
1554
+ "grad_norm": 0.81640625,
1555
+ "learning_rate": 3.1711192769887775e-05,
1556
+ "loss": 0.8303,
1557
+ "step": 11050
1558
+ },
1559
+ {
1560
+ "epoch": 1.1023934849538186,
1561
+ "grad_norm": 0.7890625,
1562
+ "learning_rate": 3.162843049624259e-05,
1563
+ "loss": 0.8299,
1564
+ "step": 11100
1565
+ },
1566
+ {
1567
+ "epoch": 1.1073592213725296,
1568
+ "grad_norm": 0.80078125,
1569
+ "learning_rate": 3.154566822259741e-05,
1570
+ "loss": 0.8301,
1571
+ "step": 11150
1572
+ },
1573
+ {
1574
+ "epoch": 1.1123249577912404,
1575
+ "grad_norm": 0.796875,
1576
+ "learning_rate": 3.1462905948952234e-05,
1577
+ "loss": 0.8283,
1578
+ "step": 11200
1579
+ },
1580
+ {
1581
+ "epoch": 1.1172906942099514,
1582
+ "grad_norm": 0.80078125,
1583
+ "learning_rate": 3.138014367530705e-05,
1584
+ "loss": 0.8315,
1585
+ "step": 11250
1586
+ },
1587
+ {
1588
+ "epoch": 1.1222564306286622,
1589
+ "grad_norm": 0.80078125,
1590
+ "learning_rate": 3.129738140166187e-05,
1591
+ "loss": 0.8307,
1592
+ "step": 11300
1593
+ },
1594
+ {
1595
+ "epoch": 1.1272221670473732,
1596
+ "grad_norm": 0.78125,
1597
+ "learning_rate": 3.1214619128016687e-05,
1598
+ "loss": 0.8327,
1599
+ "step": 11350
1600
+ },
1601
+ {
1602
+ "epoch": 1.132187903466084,
1603
+ "grad_norm": 0.81640625,
1604
+ "learning_rate": 3.1131856854371504e-05,
1605
+ "loss": 0.8296,
1606
+ "step": 11400
1607
+ },
1608
+ {
1609
+ "epoch": 1.137153639884795,
1610
+ "grad_norm": 0.7890625,
1611
+ "learning_rate": 3.104909458072632e-05,
1612
+ "loss": 0.8294,
1613
+ "step": 11450
1614
+ },
1615
+ {
1616
+ "epoch": 1.1421193763035058,
1617
+ "grad_norm": 0.7734375,
1618
+ "learning_rate": 3.096633230708114e-05,
1619
+ "loss": 0.8296,
1620
+ "step": 11500
1621
+ },
1622
+ {
1623
+ "epoch": 1.1470851127222166,
1624
+ "grad_norm": 0.765625,
1625
+ "learning_rate": 3.088357003343596e-05,
1626
+ "loss": 0.8295,
1627
+ "step": 11550
1628
+ },
1629
+ {
1630
+ "epoch": 1.1520508491409276,
1631
+ "grad_norm": 0.80078125,
1632
+ "learning_rate": 3.080080775979078e-05,
1633
+ "loss": 0.8307,
1634
+ "step": 11600
1635
+ },
1636
+ {
1637
+ "epoch": 1.1570165855596386,
1638
+ "grad_norm": 0.80078125,
1639
+ "learning_rate": 3.07180454861456e-05,
1640
+ "loss": 0.8302,
1641
+ "step": 11650
1642
+ },
1643
+ {
1644
+ "epoch": 1.1619823219783494,
1645
+ "grad_norm": 0.80078125,
1646
+ "learning_rate": 3.0635283212500415e-05,
1647
+ "loss": 0.8286,
1648
+ "step": 11700
1649
+ },
1650
+ {
1651
+ "epoch": 1.1669480583970602,
1652
+ "grad_norm": 0.80859375,
1653
+ "learning_rate": 3.055252093885523e-05,
1654
+ "loss": 0.8302,
1655
+ "step": 11750
1656
+ },
1657
+ {
1658
+ "epoch": 1.1719137948157712,
1659
+ "grad_norm": 0.8046875,
1660
+ "learning_rate": 3.0469758665210054e-05,
1661
+ "loss": 0.8332,
1662
+ "step": 11800
1663
+ },
1664
+ {
1665
+ "epoch": 1.1768795312344822,
1666
+ "grad_norm": 0.79296875,
1667
+ "learning_rate": 3.038699639156487e-05,
1668
+ "loss": 0.8308,
1669
+ "step": 11850
1670
+ },
1671
+ {
1672
+ "epoch": 1.181845267653193,
1673
+ "grad_norm": 0.8046875,
1674
+ "learning_rate": 3.030423411791969e-05,
1675
+ "loss": 0.83,
1676
+ "step": 11900
1677
+ },
1678
+ {
1679
+ "epoch": 1.1868110040719038,
1680
+ "grad_norm": 0.77734375,
1681
+ "learning_rate": 3.0221471844274506e-05,
1682
+ "loss": 0.8326,
1683
+ "step": 11950
1684
+ },
1685
+ {
1686
+ "epoch": 1.1917767404906148,
1687
+ "grad_norm": 0.79296875,
1688
+ "learning_rate": 3.0138709570629327e-05,
1689
+ "loss": 0.8298,
1690
+ "step": 12000
1691
+ },
1692
+ {
1693
+ "epoch": 1.1967424769093256,
1694
+ "grad_norm": 0.796875,
1695
+ "learning_rate": 3.0055947296984144e-05,
1696
+ "loss": 0.8312,
1697
+ "step": 12050
1698
+ },
1699
+ {
1700
+ "epoch": 1.2017082133280366,
1701
+ "grad_norm": 0.79296875,
1702
+ "learning_rate": 2.997318502333896e-05,
1703
+ "loss": 0.8306,
1704
+ "step": 12100
1705
+ },
1706
+ {
1707
+ "epoch": 1.2066739497467474,
1708
+ "grad_norm": 0.796875,
1709
+ "learning_rate": 2.9890422749693782e-05,
1710
+ "loss": 0.8289,
1711
+ "step": 12150
1712
+ },
1713
+ {
1714
+ "epoch": 1.2116396861654584,
1715
+ "grad_norm": 0.796875,
1716
+ "learning_rate": 2.98076604760486e-05,
1717
+ "loss": 0.8305,
1718
+ "step": 12200
1719
+ },
1720
+ {
1721
+ "epoch": 1.2166054225841691,
1722
+ "grad_norm": 0.79296875,
1723
+ "learning_rate": 2.9724898202403417e-05,
1724
+ "loss": 0.8274,
1725
+ "step": 12250
1726
+ },
1727
+ {
1728
+ "epoch": 1.2215711590028802,
1729
+ "grad_norm": 0.77734375,
1730
+ "learning_rate": 2.9642135928758234e-05,
1731
+ "loss": 0.8294,
1732
+ "step": 12300
1733
+ },
1734
+ {
1735
+ "epoch": 1.226536895421591,
1736
+ "grad_norm": 1.046875,
1737
+ "learning_rate": 2.9559373655113055e-05,
1738
+ "loss": 0.8316,
1739
+ "step": 12350
1740
+ },
1741
+ {
1742
+ "epoch": 1.231502631840302,
1743
+ "grad_norm": 0.78515625,
1744
+ "learning_rate": 2.9476611381467873e-05,
1745
+ "loss": 0.831,
1746
+ "step": 12400
1747
+ },
1748
+ {
1749
+ "epoch": 1.2364683682590127,
1750
+ "grad_norm": 0.8359375,
1751
+ "learning_rate": 2.939384910782269e-05,
1752
+ "loss": 0.8281,
1753
+ "step": 12450
1754
+ },
1755
+ {
1756
+ "epoch": 1.2414341046777237,
1757
+ "grad_norm": 0.77734375,
1758
+ "learning_rate": 2.9311086834177508e-05,
1759
+ "loss": 0.8306,
1760
+ "step": 12500
1761
+ },
1762
+ {
1763
+ "epoch": 1.2463998410964345,
1764
+ "grad_norm": 0.7890625,
1765
+ "learning_rate": 2.922832456053233e-05,
1766
+ "loss": 0.8293,
1767
+ "step": 12550
1768
+ },
1769
+ {
1770
+ "epoch": 1.2513655775151455,
1771
+ "grad_norm": 0.81640625,
1772
+ "learning_rate": 2.9145562286887146e-05,
1773
+ "loss": 0.8281,
1774
+ "step": 12600
1775
+ },
1776
+ {
1777
+ "epoch": 1.2563313139338563,
1778
+ "grad_norm": 0.8125,
1779
+ "learning_rate": 2.9062800013241963e-05,
1780
+ "loss": 0.8296,
1781
+ "step": 12650
1782
+ },
1783
+ {
1784
+ "epoch": 1.2612970503525673,
1785
+ "grad_norm": 0.77734375,
1786
+ "learning_rate": 2.8980037739596784e-05,
1787
+ "loss": 0.8288,
1788
+ "step": 12700
1789
+ },
1790
+ {
1791
+ "epoch": 1.2662627867712781,
1792
+ "grad_norm": 0.77734375,
1793
+ "learning_rate": 2.88972754659516e-05,
1794
+ "loss": 0.8297,
1795
+ "step": 12750
1796
+ },
1797
+ {
1798
+ "epoch": 1.2712285231899891,
1799
+ "grad_norm": 0.79296875,
1800
+ "learning_rate": 2.881451319230642e-05,
1801
+ "loss": 0.831,
1802
+ "step": 12800
1803
+ },
1804
+ {
1805
+ "epoch": 1.2761942596087,
1806
+ "grad_norm": 0.80078125,
1807
+ "learning_rate": 2.8731750918661236e-05,
1808
+ "loss": 0.8286,
1809
+ "step": 12850
1810
+ },
1811
+ {
1812
+ "epoch": 1.281159996027411,
1813
+ "grad_norm": 0.796875,
1814
+ "learning_rate": 2.8648988645016057e-05,
1815
+ "loss": 0.8283,
1816
+ "step": 12900
1817
+ },
1818
+ {
1819
+ "epoch": 1.2861257324461217,
1820
+ "grad_norm": 0.78515625,
1821
+ "learning_rate": 2.8566226371370874e-05,
1822
+ "loss": 0.8295,
1823
+ "step": 12950
1824
+ },
1825
+ {
1826
+ "epoch": 1.2910914688648327,
1827
+ "grad_norm": 0.78125,
1828
+ "learning_rate": 2.8483464097725692e-05,
1829
+ "loss": 0.8284,
1830
+ "step": 13000
1831
+ },
1832
+ {
1833
+ "epoch": 1.2960572052835435,
1834
+ "grad_norm": 0.8046875,
1835
+ "learning_rate": 2.8400701824080513e-05,
1836
+ "loss": 0.829,
1837
+ "step": 13050
1838
+ },
1839
+ {
1840
+ "epoch": 1.3010229417022545,
1841
+ "grad_norm": 0.796875,
1842
+ "learning_rate": 2.831793955043533e-05,
1843
+ "loss": 0.8301,
1844
+ "step": 13100
1845
+ },
1846
+ {
1847
+ "epoch": 1.3059886781209653,
1848
+ "grad_norm": 0.81640625,
1849
+ "learning_rate": 2.8235177276790148e-05,
1850
+ "loss": 0.8287,
1851
+ "step": 13150
1852
+ },
1853
+ {
1854
+ "epoch": 1.3109544145396763,
1855
+ "grad_norm": 0.79296875,
1856
+ "learning_rate": 2.8152415003144965e-05,
1857
+ "loss": 0.8307,
1858
+ "step": 13200
1859
+ },
1860
+ {
1861
+ "epoch": 1.315920150958387,
1862
+ "grad_norm": 0.81640625,
1863
+ "learning_rate": 2.8069652729499786e-05,
1864
+ "loss": 0.8298,
1865
+ "step": 13250
1866
+ },
1867
+ {
1868
+ "epoch": 1.3208858873770981,
1869
+ "grad_norm": 0.79296875,
1870
+ "learning_rate": 2.7986890455854603e-05,
1871
+ "loss": 0.8289,
1872
+ "step": 13300
1873
+ },
1874
+ {
1875
+ "epoch": 1.325851623795809,
1876
+ "grad_norm": 0.80859375,
1877
+ "learning_rate": 2.790412818220942e-05,
1878
+ "loss": 0.8297,
1879
+ "step": 13350
1880
+ },
1881
+ {
1882
+ "epoch": 1.3308173602145197,
1883
+ "grad_norm": 0.78125,
1884
+ "learning_rate": 2.782136590856424e-05,
1885
+ "loss": 0.8308,
1886
+ "step": 13400
1887
+ },
1888
+ {
1889
+ "epoch": 1.3357830966332307,
1890
+ "grad_norm": 0.79296875,
1891
+ "learning_rate": 2.773860363491906e-05,
1892
+ "loss": 0.8291,
1893
+ "step": 13450
1894
+ },
1895
+ {
1896
+ "epoch": 1.3407488330519417,
1897
+ "grad_norm": 0.8046875,
1898
+ "learning_rate": 2.7655841361273876e-05,
1899
+ "loss": 0.8262,
1900
+ "step": 13500
1901
+ },
1902
+ {
1903
+ "epoch": 1.3457145694706525,
1904
+ "grad_norm": 0.8203125,
1905
+ "learning_rate": 2.7573079087628694e-05,
1906
+ "loss": 0.8272,
1907
+ "step": 13550
1908
+ },
1909
+ {
1910
+ "epoch": 1.3506803058893633,
1911
+ "grad_norm": 0.78125,
1912
+ "learning_rate": 2.7490316813983514e-05,
1913
+ "loss": 0.8305,
1914
+ "step": 13600
1915
+ },
1916
+ {
1917
+ "epoch": 1.3556460423080743,
1918
+ "grad_norm": 0.8125,
1919
+ "learning_rate": 2.7407554540338332e-05,
1920
+ "loss": 0.8295,
1921
+ "step": 13650
1922
+ },
1923
+ {
1924
+ "epoch": 1.3606117787267853,
1925
+ "grad_norm": 0.7890625,
1926
+ "learning_rate": 2.732479226669315e-05,
1927
+ "loss": 0.8263,
1928
+ "step": 13700
1929
+ },
1930
+ {
1931
+ "epoch": 1.365577515145496,
1932
+ "grad_norm": 0.80078125,
1933
+ "learning_rate": 2.7242029993047967e-05,
1934
+ "loss": 0.8303,
1935
+ "step": 13750
1936
+ },
1937
+ {
1938
+ "epoch": 1.3705432515642069,
1939
+ "grad_norm": 0.78125,
1940
+ "learning_rate": 2.7159267719402788e-05,
1941
+ "loss": 0.8275,
1942
+ "step": 13800
1943
+ },
1944
+ {
1945
+ "epoch": 1.3755089879829179,
1946
+ "grad_norm": 0.80078125,
1947
+ "learning_rate": 2.7076505445757605e-05,
1948
+ "loss": 0.8288,
1949
+ "step": 13850
1950
+ },
1951
+ {
1952
+ "epoch": 1.3804747244016289,
1953
+ "grad_norm": 0.8046875,
1954
+ "learning_rate": 2.6993743172112422e-05,
1955
+ "loss": 0.8272,
1956
+ "step": 13900
1957
+ },
1958
+ {
1959
+ "epoch": 1.3854404608203397,
1960
+ "grad_norm": 0.796875,
1961
+ "learning_rate": 2.6910980898467243e-05,
1962
+ "loss": 0.8287,
1963
+ "step": 13950
1964
+ },
1965
+ {
1966
+ "epoch": 1.3904061972390505,
1967
+ "grad_norm": 0.77734375,
1968
+ "learning_rate": 2.682821862482206e-05,
1969
+ "loss": 0.8289,
1970
+ "step": 14000
1971
+ },
1972
+ {
1973
+ "epoch": 1.3953719336577615,
1974
+ "grad_norm": 0.8125,
1975
+ "learning_rate": 2.6745456351176878e-05,
1976
+ "loss": 0.8307,
1977
+ "step": 14050
1978
+ },
1979
+ {
1980
+ "epoch": 1.4003376700764725,
1981
+ "grad_norm": 0.8203125,
1982
+ "learning_rate": 2.6662694077531695e-05,
1983
+ "loss": 0.8259,
1984
+ "step": 14100
1985
+ },
1986
+ {
1987
+ "epoch": 1.4053034064951833,
1988
+ "grad_norm": 0.79296875,
1989
+ "learning_rate": 2.6579931803886516e-05,
1990
+ "loss": 0.8265,
1991
+ "step": 14150
1992
+ },
1993
+ {
1994
+ "epoch": 1.410269142913894,
1995
+ "grad_norm": 0.80859375,
1996
+ "learning_rate": 2.6497169530241334e-05,
1997
+ "loss": 0.8292,
1998
+ "step": 14200
1999
+ },
2000
+ {
2001
+ "epoch": 1.415234879332605,
2002
+ "grad_norm": 0.78125,
2003
+ "learning_rate": 2.641440725659615e-05,
2004
+ "loss": 0.8332,
2005
+ "step": 14250
2006
+ },
2007
+ {
2008
+ "epoch": 1.4202006157513158,
2009
+ "grad_norm": 0.7890625,
2010
+ "learning_rate": 2.6331644982950972e-05,
2011
+ "loss": 0.829,
2012
+ "step": 14300
2013
+ },
2014
+ {
2015
+ "epoch": 1.4251663521700269,
2016
+ "grad_norm": 0.7890625,
2017
+ "learning_rate": 2.624888270930579e-05,
2018
+ "loss": 0.8282,
2019
+ "step": 14350
2020
+ },
2021
+ {
2022
+ "epoch": 1.4301320885887376,
2023
+ "grad_norm": 0.80859375,
2024
+ "learning_rate": 2.6166120435660607e-05,
2025
+ "loss": 0.8274,
2026
+ "step": 14400
2027
+ },
2028
+ {
2029
+ "epoch": 1.4350978250074486,
2030
+ "grad_norm": 0.78125,
2031
+ "learning_rate": 2.6083358162015424e-05,
2032
+ "loss": 0.8296,
2033
+ "step": 14450
2034
+ },
2035
+ {
2036
+ "epoch": 1.4400635614261594,
2037
+ "grad_norm": 0.7734375,
2038
+ "learning_rate": 2.6000595888370245e-05,
2039
+ "loss": 0.8304,
2040
+ "step": 14500
2041
+ },
2042
+ {
2043
+ "epoch": 1.4450292978448704,
2044
+ "grad_norm": 0.81640625,
2045
+ "learning_rate": 2.5917833614725062e-05,
2046
+ "loss": 0.8298,
2047
+ "step": 14550
2048
+ },
2049
+ {
2050
+ "epoch": 1.4499950342635812,
2051
+ "grad_norm": 0.76953125,
2052
+ "learning_rate": 2.5835071341079887e-05,
2053
+ "loss": 0.8271,
2054
+ "step": 14600
2055
+ },
2056
+ {
2057
+ "epoch": 1.4549607706822922,
2058
+ "grad_norm": 0.7890625,
2059
+ "learning_rate": 2.5752309067434704e-05,
2060
+ "loss": 0.8255,
2061
+ "step": 14650
2062
+ },
2063
+ {
2064
+ "epoch": 1.459926507101003,
2065
+ "grad_norm": 0.78125,
2066
+ "learning_rate": 2.566954679378952e-05,
2067
+ "loss": 0.8259,
2068
+ "step": 14700
2069
+ },
2070
+ {
2071
+ "epoch": 1.464892243519714,
2072
+ "grad_norm": 0.76171875,
2073
+ "learning_rate": 2.5586784520144342e-05,
2074
+ "loss": 0.8287,
2075
+ "step": 14750
2076
+ },
2077
+ {
2078
+ "epoch": 1.4698579799384248,
2079
+ "grad_norm": 0.796875,
2080
+ "learning_rate": 2.550402224649916e-05,
2081
+ "loss": 0.8268,
2082
+ "step": 14800
2083
+ },
2084
+ {
2085
+ "epoch": 1.4748237163571358,
2086
+ "grad_norm": 0.7890625,
2087
+ "learning_rate": 2.5421259972853977e-05,
2088
+ "loss": 0.8304,
2089
+ "step": 14850
2090
+ },
2091
+ {
2092
+ "epoch": 1.4797894527758466,
2093
+ "grad_norm": 0.80859375,
2094
+ "learning_rate": 2.5338497699208798e-05,
2095
+ "loss": 0.8265,
2096
+ "step": 14900
2097
+ },
2098
+ {
2099
+ "epoch": 1.4847551891945576,
2100
+ "grad_norm": 0.80078125,
2101
+ "learning_rate": 2.5255735425563615e-05,
2102
+ "loss": 0.8272,
2103
+ "step": 14950
2104
+ },
2105
+ {
2106
+ "epoch": 1.4897209256132684,
2107
+ "grad_norm": 0.796875,
2108
+ "learning_rate": 2.5172973151918433e-05,
2109
+ "loss": 0.8264,
2110
+ "step": 15000
2111
+ },
2112
+ {
2113
+ "epoch": 1.4946866620319794,
2114
+ "grad_norm": 0.81640625,
2115
+ "learning_rate": 2.509021087827325e-05,
2116
+ "loss": 0.8296,
2117
+ "step": 15050
2118
+ },
2119
+ {
2120
+ "epoch": 1.4996523984506902,
2121
+ "grad_norm": 0.8046875,
2122
+ "learning_rate": 2.500744860462807e-05,
2123
+ "loss": 0.8302,
2124
+ "step": 15100
2125
+ },
2126
+ {
2127
+ "epoch": 1.5046181348694012,
2128
+ "grad_norm": 0.78125,
2129
+ "learning_rate": 2.4924686330982885e-05,
2130
+ "loss": 0.8237,
2131
+ "step": 15150
2132
+ },
2133
+ {
2134
+ "epoch": 1.509583871288112,
2135
+ "grad_norm": 0.80859375,
2136
+ "learning_rate": 2.4841924057337702e-05,
2137
+ "loss": 0.825,
2138
+ "step": 15200
2139
+ },
2140
+ {
2141
+ "epoch": 1.5145496077068228,
2142
+ "grad_norm": 0.8125,
2143
+ "learning_rate": 2.4759161783692523e-05,
2144
+ "loss": 0.8244,
2145
+ "step": 15250
2146
+ },
2147
+ {
2148
+ "epoch": 1.5195153441255338,
2149
+ "grad_norm": 0.8203125,
2150
+ "learning_rate": 2.467639951004734e-05,
2151
+ "loss": 0.8267,
2152
+ "step": 15300
2153
+ },
2154
+ {
2155
+ "epoch": 1.5244810805442448,
2156
+ "grad_norm": 0.81640625,
2157
+ "learning_rate": 2.4593637236402158e-05,
2158
+ "loss": 0.8273,
2159
+ "step": 15350
2160
+ },
2161
+ {
2162
+ "epoch": 1.5294468169629556,
2163
+ "grad_norm": 0.80078125,
2164
+ "learning_rate": 2.451087496275698e-05,
2165
+ "loss": 0.83,
2166
+ "step": 15400
2167
+ },
2168
+ {
2169
+ "epoch": 1.5344125533816664,
2170
+ "grad_norm": 0.8125,
2171
+ "learning_rate": 2.4428112689111796e-05,
2172
+ "loss": 0.8268,
2173
+ "step": 15450
2174
+ },
2175
+ {
2176
+ "epoch": 1.5393782898003774,
2177
+ "grad_norm": 0.80859375,
2178
+ "learning_rate": 2.4345350415466614e-05,
2179
+ "loss": 0.8254,
2180
+ "step": 15500
2181
+ },
2182
+ {
2183
+ "epoch": 1.5443440262190884,
2184
+ "grad_norm": 0.80859375,
2185
+ "learning_rate": 2.426258814182143e-05,
2186
+ "loss": 0.8261,
2187
+ "step": 15550
2188
+ },
2189
+ {
2190
+ "epoch": 1.5493097626377992,
2191
+ "grad_norm": 0.796875,
2192
+ "learning_rate": 2.4179825868176252e-05,
2193
+ "loss": 0.8261,
2194
+ "step": 15600
2195
+ },
2196
+ {
2197
+ "epoch": 1.55427549905651,
2198
+ "grad_norm": 0.82421875,
2199
+ "learning_rate": 2.409706359453107e-05,
2200
+ "loss": 0.8293,
2201
+ "step": 15650
2202
+ },
2203
+ {
2204
+ "epoch": 1.559241235475221,
2205
+ "grad_norm": 0.765625,
2206
+ "learning_rate": 2.4014301320885887e-05,
2207
+ "loss": 0.8301,
2208
+ "step": 15700
2209
+ },
2210
+ {
2211
+ "epoch": 1.564206971893932,
2212
+ "grad_norm": 0.8203125,
2213
+ "learning_rate": 2.3931539047240707e-05,
2214
+ "loss": 0.8262,
2215
+ "step": 15750
2216
+ },
2217
+ {
2218
+ "epoch": 1.5691727083126428,
2219
+ "grad_norm": 0.79296875,
2220
+ "learning_rate": 2.3848776773595525e-05,
2221
+ "loss": 0.8281,
2222
+ "step": 15800
2223
+ },
2224
+ {
2225
+ "epoch": 1.5741384447313536,
2226
+ "grad_norm": 0.8046875,
2227
+ "learning_rate": 2.3766014499950342e-05,
2228
+ "loss": 0.8277,
2229
+ "step": 15850
2230
+ },
2231
+ {
2232
+ "epoch": 1.5791041811500646,
2233
+ "grad_norm": 0.7734375,
2234
+ "learning_rate": 2.368325222630516e-05,
2235
+ "loss": 0.8257,
2236
+ "step": 15900
2237
+ },
2238
+ {
2239
+ "epoch": 1.5840699175687756,
2240
+ "grad_norm": 0.8125,
2241
+ "learning_rate": 2.360048995265998e-05,
2242
+ "loss": 0.8291,
2243
+ "step": 15950
2244
+ },
2245
+ {
2246
+ "epoch": 1.5890356539874864,
2247
+ "grad_norm": 0.796875,
2248
+ "learning_rate": 2.3517727679014798e-05,
2249
+ "loss": 0.8289,
2250
+ "step": 16000
2251
+ },
2252
+ {
2253
+ "epoch": 1.5940013904061971,
2254
+ "grad_norm": 0.80078125,
2255
+ "learning_rate": 2.343496540536962e-05,
2256
+ "loss": 0.8269,
2257
+ "step": 16050
2258
+ },
2259
+ {
2260
+ "epoch": 1.5989671268249082,
2261
+ "grad_norm": 0.80078125,
2262
+ "learning_rate": 2.3352203131724436e-05,
2263
+ "loss": 0.8258,
2264
+ "step": 16100
2265
+ },
2266
+ {
2267
+ "epoch": 1.6039328632436192,
2268
+ "grad_norm": 0.80078125,
2269
+ "learning_rate": 2.3269440858079257e-05,
2270
+ "loss": 0.8272,
2271
+ "step": 16150
2272
+ },
2273
+ {
2274
+ "epoch": 1.60889859966233,
2275
+ "grad_norm": 0.82421875,
2276
+ "learning_rate": 2.3186678584434074e-05,
2277
+ "loss": 0.8262,
2278
+ "step": 16200
2279
+ },
2280
+ {
2281
+ "epoch": 1.6138643360810407,
2282
+ "grad_norm": 0.8046875,
2283
+ "learning_rate": 2.3103916310788892e-05,
2284
+ "loss": 0.8271,
2285
+ "step": 16250
2286
+ },
2287
+ {
2288
+ "epoch": 1.6188300724997517,
2289
+ "grad_norm": 0.8046875,
2290
+ "learning_rate": 2.302115403714371e-05,
2291
+ "loss": 0.8258,
2292
+ "step": 16300
2293
+ },
2294
+ {
2295
+ "epoch": 1.6237958089184628,
2296
+ "grad_norm": 0.7890625,
2297
+ "learning_rate": 2.293839176349853e-05,
2298
+ "loss": 0.8266,
2299
+ "step": 16350
2300
+ },
2301
+ {
2302
+ "epoch": 1.6287615453371735,
2303
+ "grad_norm": 0.8046875,
2304
+ "learning_rate": 2.2855629489853347e-05,
2305
+ "loss": 0.8252,
2306
+ "step": 16400
2307
+ },
2308
+ {
2309
+ "epoch": 1.6337272817558843,
2310
+ "grad_norm": 0.78515625,
2311
+ "learning_rate": 2.2772867216208165e-05,
2312
+ "loss": 0.8269,
2313
+ "step": 16450
2314
+ },
2315
+ {
2316
+ "epoch": 1.6386930181745953,
2317
+ "grad_norm": 0.79296875,
2318
+ "learning_rate": 2.2690104942562986e-05,
2319
+ "loss": 0.8265,
2320
+ "step": 16500
2321
+ },
2322
+ {
2323
+ "epoch": 1.6436587545933063,
2324
+ "grad_norm": 0.81640625,
2325
+ "learning_rate": 2.2607342668917803e-05,
2326
+ "loss": 0.8253,
2327
+ "step": 16550
2328
+ },
2329
+ {
2330
+ "epoch": 1.6486244910120171,
2331
+ "grad_norm": 0.80078125,
2332
+ "learning_rate": 2.252458039527262e-05,
2333
+ "loss": 0.8302,
2334
+ "step": 16600
2335
+ },
2336
+ {
2337
+ "epoch": 1.653590227430728,
2338
+ "grad_norm": 0.78515625,
2339
+ "learning_rate": 2.2441818121627438e-05,
2340
+ "loss": 0.8286,
2341
+ "step": 16650
2342
+ },
2343
+ {
2344
+ "epoch": 1.6585559638494387,
2345
+ "grad_norm": 0.80859375,
2346
+ "learning_rate": 2.235905584798226e-05,
2347
+ "loss": 0.8285,
2348
+ "step": 16700
2349
+ },
2350
+ {
2351
+ "epoch": 1.6635217002681497,
2352
+ "grad_norm": 0.80078125,
2353
+ "learning_rate": 2.2276293574337076e-05,
2354
+ "loss": 0.8253,
2355
+ "step": 16750
2356
+ },
2357
+ {
2358
+ "epoch": 1.6684874366868607,
2359
+ "grad_norm": 0.828125,
2360
+ "learning_rate": 2.2193531300691894e-05,
2361
+ "loss": 0.8283,
2362
+ "step": 16800
2363
+ },
2364
+ {
2365
+ "epoch": 1.6734531731055715,
2366
+ "grad_norm": 0.8125,
2367
+ "learning_rate": 2.2110769027046714e-05,
2368
+ "loss": 0.8291,
2369
+ "step": 16850
2370
+ },
2371
+ {
2372
+ "epoch": 1.6784189095242823,
2373
+ "grad_norm": 0.7890625,
2374
+ "learning_rate": 2.2028006753401532e-05,
2375
+ "loss": 0.8257,
2376
+ "step": 16900
2377
+ },
2378
+ {
2379
+ "epoch": 1.6833846459429933,
2380
+ "grad_norm": 0.7890625,
2381
+ "learning_rate": 2.194524447975635e-05,
2382
+ "loss": 0.8256,
2383
+ "step": 16950
2384
+ },
2385
+ {
2386
+ "epoch": 1.6883503823617043,
2387
+ "grad_norm": 0.80078125,
2388
+ "learning_rate": 2.1862482206111167e-05,
2389
+ "loss": 0.826,
2390
+ "step": 17000
2391
+ },
2392
+ {
2393
+ "epoch": 1.693316118780415,
2394
+ "grad_norm": 0.80859375,
2395
+ "learning_rate": 2.1779719932465987e-05,
2396
+ "loss": 0.8237,
2397
+ "step": 17050
2398
+ },
2399
+ {
2400
+ "epoch": 1.698281855199126,
2401
+ "grad_norm": 0.78125,
2402
+ "learning_rate": 2.1696957658820805e-05,
2403
+ "loss": 0.826,
2404
+ "step": 17100
2405
+ },
2406
+ {
2407
+ "epoch": 1.703247591617837,
2408
+ "grad_norm": 0.82421875,
2409
+ "learning_rate": 2.1614195385175622e-05,
2410
+ "loss": 0.8275,
2411
+ "step": 17150
2412
+ },
2413
+ {
2414
+ "epoch": 1.708213328036548,
2415
+ "grad_norm": 0.8359375,
2416
+ "learning_rate": 2.153143311153044e-05,
2417
+ "loss": 0.8238,
2418
+ "step": 17200
2419
+ },
2420
+ {
2421
+ "epoch": 1.7131790644552587,
2422
+ "grad_norm": 0.8359375,
2423
+ "learning_rate": 2.144867083788526e-05,
2424
+ "loss": 0.826,
2425
+ "step": 17250
2426
+ },
2427
+ {
2428
+ "epoch": 1.7181448008739695,
2429
+ "grad_norm": 0.8203125,
2430
+ "learning_rate": 2.1365908564240078e-05,
2431
+ "loss": 0.8261,
2432
+ "step": 17300
2433
+ },
2434
+ {
2435
+ "epoch": 1.7231105372926805,
2436
+ "grad_norm": 0.796875,
2437
+ "learning_rate": 2.1283146290594895e-05,
2438
+ "loss": 0.8245,
2439
+ "step": 17350
2440
+ },
2441
+ {
2442
+ "epoch": 1.7280762737113915,
2443
+ "grad_norm": 0.82421875,
2444
+ "learning_rate": 2.1200384016949716e-05,
2445
+ "loss": 0.8273,
2446
+ "step": 17400
2447
+ },
2448
+ {
2449
+ "epoch": 1.7330420101301023,
2450
+ "grad_norm": 0.7890625,
2451
+ "learning_rate": 2.1117621743304534e-05,
2452
+ "loss": 0.8249,
2453
+ "step": 17450
2454
+ },
2455
+ {
2456
+ "epoch": 1.738007746548813,
2457
+ "grad_norm": 0.80078125,
2458
+ "learning_rate": 2.103485946965935e-05,
2459
+ "loss": 0.826,
2460
+ "step": 17500
2461
+ },
2462
+ {
2463
+ "epoch": 1.742973482967524,
2464
+ "grad_norm": 0.78515625,
2465
+ "learning_rate": 2.095209719601417e-05,
2466
+ "loss": 0.8259,
2467
+ "step": 17550
2468
+ },
2469
+ {
2470
+ "epoch": 1.747939219386235,
2471
+ "grad_norm": 0.78515625,
2472
+ "learning_rate": 2.086933492236899e-05,
2473
+ "loss": 0.827,
2474
+ "step": 17600
2475
+ },
2476
+ {
2477
+ "epoch": 1.7529049558049459,
2478
+ "grad_norm": 0.8046875,
2479
+ "learning_rate": 2.0786572648723807e-05,
2480
+ "loss": 0.8239,
2481
+ "step": 17650
2482
+ },
2483
+ {
2484
+ "epoch": 1.7578706922236567,
2485
+ "grad_norm": 0.78515625,
2486
+ "learning_rate": 2.0703810375078624e-05,
2487
+ "loss": 0.8262,
2488
+ "step": 17700
2489
+ },
2490
+ {
2491
+ "epoch": 1.7628364286423677,
2492
+ "grad_norm": 0.82421875,
2493
+ "learning_rate": 2.0621048101433445e-05,
2494
+ "loss": 0.8259,
2495
+ "step": 17750
2496
+ },
2497
+ {
2498
+ "epoch": 1.7678021650610787,
2499
+ "grad_norm": 0.78515625,
2500
+ "learning_rate": 2.0538285827788262e-05,
2501
+ "loss": 0.8258,
2502
+ "step": 17800
2503
+ },
2504
+ {
2505
+ "epoch": 1.7727679014797895,
2506
+ "grad_norm": 0.81640625,
2507
+ "learning_rate": 2.045552355414308e-05,
2508
+ "loss": 0.8264,
2509
+ "step": 17850
2510
+ },
2511
+ {
2512
+ "epoch": 1.7777336378985003,
2513
+ "grad_norm": 0.83984375,
2514
+ "learning_rate": 2.0372761280497897e-05,
2515
+ "loss": 0.8247,
2516
+ "step": 17900
2517
+ },
2518
+ {
2519
+ "epoch": 1.7826993743172113,
2520
+ "grad_norm": 0.8046875,
2521
+ "learning_rate": 2.0289999006852718e-05,
2522
+ "loss": 0.8259,
2523
+ "step": 17950
2524
+ },
2525
+ {
2526
+ "epoch": 1.7876651107359223,
2527
+ "grad_norm": 0.8203125,
2528
+ "learning_rate": 2.0207236733207535e-05,
2529
+ "loss": 0.8243,
2530
+ "step": 18000
2531
+ },
2532
+ {
2533
+ "epoch": 1.792630847154633,
2534
+ "grad_norm": 0.8125,
2535
+ "learning_rate": 2.0124474459562353e-05,
2536
+ "loss": 0.8273,
2537
+ "step": 18050
2538
+ },
2539
+ {
2540
+ "epoch": 1.7975965835733438,
2541
+ "grad_norm": 0.81640625,
2542
+ "learning_rate": 2.0041712185917174e-05,
2543
+ "loss": 0.8233,
2544
+ "step": 18100
2545
+ },
2546
+ {
2547
+ "epoch": 1.8025623199920549,
2548
+ "grad_norm": 0.8203125,
2549
+ "learning_rate": 1.995894991227199e-05,
2550
+ "loss": 0.8281,
2551
+ "step": 18150
2552
+ },
2553
+ {
2554
+ "epoch": 1.8075280564107659,
2555
+ "grad_norm": 0.828125,
2556
+ "learning_rate": 1.987618763862681e-05,
2557
+ "loss": 0.8263,
2558
+ "step": 18200
2559
+ },
2560
+ {
2561
+ "epoch": 1.8124937928294766,
2562
+ "grad_norm": 0.82421875,
2563
+ "learning_rate": 1.9793425364981626e-05,
2564
+ "loss": 0.8282,
2565
+ "step": 18250
2566
+ },
2567
+ {
2568
+ "epoch": 1.8174595292481874,
2569
+ "grad_norm": 0.8046875,
2570
+ "learning_rate": 1.9710663091336447e-05,
2571
+ "loss": 0.8268,
2572
+ "step": 18300
2573
+ },
2574
+ {
2575
+ "epoch": 1.8224252656668984,
2576
+ "grad_norm": 0.7890625,
2577
+ "learning_rate": 1.9627900817691264e-05,
2578
+ "loss": 0.8239,
2579
+ "step": 18350
2580
+ },
2581
+ {
2582
+ "epoch": 1.8273910020856095,
2583
+ "grad_norm": 0.80078125,
2584
+ "learning_rate": 1.954513854404608e-05,
2585
+ "loss": 0.827,
2586
+ "step": 18400
2587
+ },
2588
+ {
2589
+ "epoch": 1.8323567385043202,
2590
+ "grad_norm": 0.78515625,
2591
+ "learning_rate": 1.94623762704009e-05,
2592
+ "loss": 0.8244,
2593
+ "step": 18450
2594
+ },
2595
+ {
2596
+ "epoch": 1.837322474923031,
2597
+ "grad_norm": 0.81640625,
2598
+ "learning_rate": 1.937961399675572e-05,
2599
+ "loss": 0.8271,
2600
+ "step": 18500
2601
+ },
2602
+ {
2603
+ "epoch": 1.842288211341742,
2604
+ "grad_norm": 0.8203125,
2605
+ "learning_rate": 1.9296851723110537e-05,
2606
+ "loss": 0.8258,
2607
+ "step": 18550
2608
+ },
2609
+ {
2610
+ "epoch": 1.847253947760453,
2611
+ "grad_norm": 0.8125,
2612
+ "learning_rate": 1.9214089449465355e-05,
2613
+ "loss": 0.8235,
2614
+ "step": 18600
2615
+ },
2616
+ {
2617
+ "epoch": 1.8522196841791638,
2618
+ "grad_norm": 0.81640625,
2619
+ "learning_rate": 1.9131327175820175e-05,
2620
+ "loss": 0.8298,
2621
+ "step": 18650
2622
+ },
2623
+ {
2624
+ "epoch": 1.8571854205978746,
2625
+ "grad_norm": 0.8125,
2626
+ "learning_rate": 1.9048564902174993e-05,
2627
+ "loss": 0.8227,
2628
+ "step": 18700
2629
+ },
2630
+ {
2631
+ "epoch": 1.8621511570165854,
2632
+ "grad_norm": 0.81640625,
2633
+ "learning_rate": 1.896580262852981e-05,
2634
+ "loss": 0.823,
2635
+ "step": 18750
2636
+ },
2637
+ {
2638
+ "epoch": 1.8671168934352964,
2639
+ "grad_norm": 0.77734375,
2640
+ "learning_rate": 1.8883040354884628e-05,
2641
+ "loss": 0.8258,
2642
+ "step": 18800
2643
+ },
2644
+ {
2645
+ "epoch": 1.8720826298540074,
2646
+ "grad_norm": 0.84375,
2647
+ "learning_rate": 1.880027808123945e-05,
2648
+ "loss": 0.8245,
2649
+ "step": 18850
2650
+ },
2651
+ {
2652
+ "epoch": 1.8770483662727182,
2653
+ "grad_norm": 0.8125,
2654
+ "learning_rate": 1.8717515807594266e-05,
2655
+ "loss": 0.8258,
2656
+ "step": 18900
2657
+ },
2658
+ {
2659
+ "epoch": 1.882014102691429,
2660
+ "grad_norm": 0.7890625,
2661
+ "learning_rate": 1.8634753533949083e-05,
2662
+ "loss": 0.8272,
2663
+ "step": 18950
2664
+ },
2665
+ {
2666
+ "epoch": 1.88697983911014,
2667
+ "grad_norm": 0.828125,
2668
+ "learning_rate": 1.8551991260303904e-05,
2669
+ "loss": 0.8259,
2670
+ "step": 19000
2671
+ },
2672
+ {
2673
+ "epoch": 1.891945575528851,
2674
+ "grad_norm": 0.8125,
2675
+ "learning_rate": 1.8469228986658725e-05,
2676
+ "loss": 0.8273,
2677
+ "step": 19050
2678
+ },
2679
+ {
2680
+ "epoch": 1.8969113119475618,
2681
+ "grad_norm": 0.80859375,
2682
+ "learning_rate": 1.8386466713013542e-05,
2683
+ "loss": 0.8268,
2684
+ "step": 19100
2685
+ },
2686
+ {
2687
+ "epoch": 1.9018770483662726,
2688
+ "grad_norm": 0.8515625,
2689
+ "learning_rate": 1.830370443936836e-05,
2690
+ "loss": 0.8238,
2691
+ "step": 19150
2692
+ },
2693
+ {
2694
+ "epoch": 1.9068427847849836,
2695
+ "grad_norm": 0.8515625,
2696
+ "learning_rate": 1.822094216572318e-05,
2697
+ "loss": 0.8208,
2698
+ "step": 19200
2699
+ },
2700
+ {
2701
+ "epoch": 1.9118085212036946,
2702
+ "grad_norm": 0.79296875,
2703
+ "learning_rate": 1.8138179892077998e-05,
2704
+ "loss": 0.827,
2705
+ "step": 19250
2706
+ },
2707
+ {
2708
+ "epoch": 1.9167742576224054,
2709
+ "grad_norm": 0.8125,
2710
+ "learning_rate": 1.8055417618432815e-05,
2711
+ "loss": 0.8229,
2712
+ "step": 19300
2713
+ },
2714
+ {
2715
+ "epoch": 1.9217399940411162,
2716
+ "grad_norm": 0.78515625,
2717
+ "learning_rate": 1.7972655344787633e-05,
2718
+ "loss": 0.8251,
2719
+ "step": 19350
2720
+ },
2721
+ {
2722
+ "epoch": 1.9267057304598272,
2723
+ "grad_norm": 0.80078125,
2724
+ "learning_rate": 1.7889893071142454e-05,
2725
+ "loss": 0.8266,
2726
+ "step": 19400
2727
+ },
2728
+ {
2729
+ "epoch": 1.9316714668785382,
2730
+ "grad_norm": 0.80078125,
2731
+ "learning_rate": 1.780713079749727e-05,
2732
+ "loss": 0.8248,
2733
+ "step": 19450
2734
+ },
2735
+ {
2736
+ "epoch": 1.936637203297249,
2737
+ "grad_norm": 0.78515625,
2738
+ "learning_rate": 1.772436852385209e-05,
2739
+ "loss": 0.8225,
2740
+ "step": 19500
2741
+ },
2742
+ {
2743
+ "epoch": 1.9416029397159598,
2744
+ "grad_norm": 0.796875,
2745
+ "learning_rate": 1.7641606250206906e-05,
2746
+ "loss": 0.8242,
2747
+ "step": 19550
2748
+ },
2749
+ {
2750
+ "epoch": 1.9465686761346708,
2751
+ "grad_norm": 0.8046875,
2752
+ "learning_rate": 1.7558843976561727e-05,
2753
+ "loss": 0.8247,
2754
+ "step": 19600
2755
+ },
2756
+ {
2757
+ "epoch": 1.9515344125533818,
2758
+ "grad_norm": 0.80078125,
2759
+ "learning_rate": 1.7476081702916544e-05,
2760
+ "loss": 0.824,
2761
+ "step": 19650
2762
+ },
2763
+ {
2764
+ "epoch": 1.9565001489720926,
2765
+ "grad_norm": 0.8046875,
2766
+ "learning_rate": 1.739331942927136e-05,
2767
+ "loss": 0.8249,
2768
+ "step": 19700
2769
+ },
2770
+ {
2771
+ "epoch": 1.9614658853908034,
2772
+ "grad_norm": 0.80078125,
2773
+ "learning_rate": 1.7310557155626182e-05,
2774
+ "loss": 0.8258,
2775
+ "step": 19750
2776
+ },
2777
+ {
2778
+ "epoch": 1.9664316218095144,
2779
+ "grad_norm": 0.7734375,
2780
+ "learning_rate": 1.7227794881981e-05,
2781
+ "loss": 0.8267,
2782
+ "step": 19800
2783
+ },
2784
+ {
2785
+ "epoch": 1.9713973582282254,
2786
+ "grad_norm": 0.79296875,
2787
+ "learning_rate": 1.7145032608335817e-05,
2788
+ "loss": 0.8257,
2789
+ "step": 19850
2790
+ },
2791
+ {
2792
+ "epoch": 1.9763630946469362,
2793
+ "grad_norm": 0.8203125,
2794
+ "learning_rate": 1.7062270334690635e-05,
2795
+ "loss": 0.8259,
2796
+ "step": 19900
2797
+ },
2798
+ {
2799
+ "epoch": 1.981328831065647,
2800
+ "grad_norm": 0.796875,
2801
+ "learning_rate": 1.6979508061045455e-05,
2802
+ "loss": 0.823,
2803
+ "step": 19950
2804
+ },
2805
+ {
2806
+ "epoch": 1.986294567484358,
2807
+ "grad_norm": 0.828125,
2808
+ "learning_rate": 1.6896745787400273e-05,
2809
+ "loss": 0.8236,
2810
+ "step": 20000
2811
+ }
2812
+ ],
2813
+ "logging_steps": 50,
2814
+ "max_steps": 30207,
2815
+ "num_input_tokens_seen": 0,
2816
+ "num_train_epochs": 3,
2817
+ "save_steps": 10000,
2818
+ "stateful_callbacks": {
2819
+ "TrainerControl": {
2820
+ "args": {
2821
+ "should_epoch_stop": false,
2822
+ "should_evaluate": false,
2823
+ "should_log": false,
2824
+ "should_save": true,
2825
+ "should_training_stop": false
2826
+ },
2827
+ "attributes": {}
2828
+ }
2829
+ },
2830
+ "total_flos": 1.0812269791868355e+19,
2831
+ "train_batch_size": 1,
2832
+ "trial_name": null,
2833
+ "trial_params": null
2834
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c8ba693cfa8766cf24b84dc4ce14db5b64a400d1d6d60284338106708582878
3
+ size 5777