Dbmaxwell committed on
Commit
3a09181
·
verified ·
1 Parent(s): 35bde9b

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +4 -0
  2. added_tokens.json +3 -0
  3. chat_template.jinja +47 -0
  4. checkpoint-2000/added_tokens.json +3 -0
  5. checkpoint-2000/chat_template.jinja +47 -0
  6. checkpoint-2000/config.json +56 -0
  7. checkpoint-2000/generation_config.json +11 -0
  8. checkpoint-2000/model.safetensors +3 -0
  9. checkpoint-2000/optimizer.pt +3 -0
  10. checkpoint-2000/rng_state.pth +3 -0
  11. checkpoint-2000/scheduler.pt +3 -0
  12. checkpoint-2000/special_tokens_map.json +33 -0
  13. checkpoint-2000/tokenizer.json +3 -0
  14. checkpoint-2000/tokenizer.model +3 -0
  15. checkpoint-2000/tokenizer_config.json +0 -0
  16. checkpoint-2000/trainer_state.json +474 -0
  17. checkpoint-2000/training_args.bin +3 -0
  18. checkpoint-2200/added_tokens.json +3 -0
  19. checkpoint-2200/chat_template.jinja +47 -0
  20. checkpoint-2200/config.json +56 -0
  21. checkpoint-2200/generation_config.json +11 -0
  22. checkpoint-2200/model.safetensors +3 -0
  23. checkpoint-2200/optimizer.pt +3 -0
  24. checkpoint-2200/rng_state.pth +3 -0
  25. checkpoint-2200/scheduler.pt +3 -0
  26. checkpoint-2200/special_tokens_map.json +33 -0
  27. checkpoint-2200/tokenizer.json +3 -0
  28. checkpoint-2200/tokenizer.model +3 -0
  29. checkpoint-2200/tokenizer_config.json +0 -0
  30. checkpoint-2200/trainer_state.json +518 -0
  31. checkpoint-2200/training_args.bin +3 -0
  32. checkpoint-2250/added_tokens.json +3 -0
  33. checkpoint-2250/chat_template.jinja +47 -0
  34. checkpoint-2250/config.json +56 -0
  35. checkpoint-2250/generation_config.json +11 -0
  36. checkpoint-2250/model.safetensors +3 -0
  37. checkpoint-2250/optimizer.pt +3 -0
  38. checkpoint-2250/rng_state.pth +3 -0
  39. checkpoint-2250/scheduler.pt +3 -0
  40. checkpoint-2250/special_tokens_map.json +33 -0
  41. checkpoint-2250/tokenizer.json +3 -0
  42. checkpoint-2250/tokenizer.model +3 -0
  43. checkpoint-2250/tokenizer_config.json +0 -0
  44. checkpoint-2250/trainer_state.json +525 -0
  45. checkpoint-2250/training_args.bin +3 -0
  46. config.json +56 -0
  47. generation_config.json +11 -0
  48. model.safetensors +3 -0
  49. special_tokens_map.json +33 -0
  50. tokenizer.json +3 -0
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ checkpoint-2000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ checkpoint-2200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
+ checkpoint-2250/tokenizer.json filter=lfs diff=lfs merge=lfs -text
39
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "<image_soft_token>": 262144
3
+ }
chat_template.jinja ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{ bos_token }}
2
+ {%- if messages[0]['role'] == 'system' -%}
3
+ {%- if messages[0]['content'] is string -%}
4
+ {%- set first_user_prefix = messages[0]['content'] + '
5
+
6
+ ' -%}
7
+ {%- else -%}
8
+ {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
9
+
10
+ ' -%}
11
+ {%- endif -%}
12
+ {%- set loop_messages = messages[1:] -%}
13
+ {%- else -%}
14
+ {%- set first_user_prefix = "" -%}
15
+ {%- set loop_messages = messages -%}
16
+ {%- endif -%}
17
+ {%- for message in loop_messages -%}
18
+ {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
19
+ {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
20
+ {%- endif -%}
21
+ {%- if (message['role'] == 'assistant') -%}
22
+ {%- set role = "model" -%}
23
+ {%- else -%}
24
+ {%- set role = message['role'] -%}
25
+ {%- endif -%}
26
+ {{ '<start_of_turn>' + role + '
27
+ ' + (first_user_prefix if loop.first else "") }}
28
+ {%- if message['content'] is string -%}
29
+ {{ message['content'] | trim }}
30
+ {%- elif message['content'] is iterable -%}
31
+ {%- for item in message['content'] -%}
32
+ {%- if item['type'] == 'image' -%}
33
+ {{ '<start_of_image>' }}
34
+ {%- elif item['type'] == 'text' -%}
35
+ {{ item['text'] | trim }}
36
+ {%- endif -%}
37
+ {%- endfor -%}
38
+ {%- else -%}
39
+ {{ raise_exception("Invalid content type") }}
40
+ {%- endif -%}
41
+ {{ '<end_of_turn>
42
+ ' }}
43
+ {%- endfor -%}
44
+ {%- if add_generation_prompt -%}
45
+ {{'<start_of_turn>model
46
+ '}}
47
+ {%- endif -%}
checkpoint-2000/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "<image_soft_token>": 262144
3
+ }
checkpoint-2000/chat_template.jinja ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{ bos_token }}
2
+ {%- if messages[0]['role'] == 'system' -%}
3
+ {%- if messages[0]['content'] is string -%}
4
+ {%- set first_user_prefix = messages[0]['content'] + '
5
+
6
+ ' -%}
7
+ {%- else -%}
8
+ {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
9
+
10
+ ' -%}
11
+ {%- endif -%}
12
+ {%- set loop_messages = messages[1:] -%}
13
+ {%- else -%}
14
+ {%- set first_user_prefix = "" -%}
15
+ {%- set loop_messages = messages -%}
16
+ {%- endif -%}
17
+ {%- for message in loop_messages -%}
18
+ {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
19
+ {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
20
+ {%- endif -%}
21
+ {%- if (message['role'] == 'assistant') -%}
22
+ {%- set role = "model" -%}
23
+ {%- else -%}
24
+ {%- set role = message['role'] -%}
25
+ {%- endif -%}
26
+ {{ '<start_of_turn>' + role + '
27
+ ' + (first_user_prefix if loop.first else "") }}
28
+ {%- if message['content'] is string -%}
29
+ {{ message['content'] | trim }}
30
+ {%- elif message['content'] is iterable -%}
31
+ {%- for item in message['content'] -%}
32
+ {%- if item['type'] == 'image' -%}
33
+ {{ '<start_of_image>' }}
34
+ {%- elif item['type'] == 'text' -%}
35
+ {{ item['text'] | trim }}
36
+ {%- endif -%}
37
+ {%- endfor -%}
38
+ {%- else -%}
39
+ {{ raise_exception("Invalid content type") }}
40
+ {%- endif -%}
41
+ {{ '<end_of_turn>
42
+ ' }}
43
+ {%- endfor -%}
44
+ {%- if add_generation_prompt -%}
45
+ {{'<start_of_turn>model
46
+ '}}
47
+ {%- endif -%}
checkpoint-2000/config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_sliding_window_pattern": 6,
3
+ "architectures": [
4
+ "Gemma3ForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "attn_logit_softcapping": null,
9
+ "bos_token_id": 2,
10
+ "cache_implementation": "hybrid",
11
+ "eos_token_id": 1,
12
+ "final_logit_softcapping": null,
13
+ "head_dim": 256,
14
+ "hidden_activation": "gelu_pytorch_tanh",
15
+ "hidden_size": 640,
16
+ "initializer_range": 0.02,
17
+ "intermediate_size": 2048,
18
+ "layer_types": [
19
+ "sliding_attention",
20
+ "sliding_attention",
21
+ "sliding_attention",
22
+ "sliding_attention",
23
+ "sliding_attention",
24
+ "full_attention",
25
+ "sliding_attention",
26
+ "sliding_attention",
27
+ "sliding_attention",
28
+ "sliding_attention",
29
+ "sliding_attention",
30
+ "full_attention",
31
+ "sliding_attention",
32
+ "sliding_attention",
33
+ "sliding_attention",
34
+ "sliding_attention",
35
+ "sliding_attention",
36
+ "full_attention"
37
+ ],
38
+ "max_position_embeddings": 32768,
39
+ "model_type": "gemma3_text",
40
+ "num_attention_heads": 4,
41
+ "num_hidden_layers": 18,
42
+ "num_key_value_heads": 1,
43
+ "pad_token_id": 0,
44
+ "query_pre_attn_scalar": 256,
45
+ "rms_norm_eps": 1e-06,
46
+ "rope_local_base_freq": 10000.0,
47
+ "rope_scaling": null,
48
+ "rope_theta": 1000000.0,
49
+ "sliding_window": 512,
50
+ "sliding_window_pattern": 6,
51
+ "torch_dtype": "bfloat16",
52
+ "transformers_version": "4.52.4",
53
+ "use_bidirectional_attention": false,
54
+ "use_cache": true,
55
+ "vocab_size": 262144
56
+ }
checkpoint-2000/generation_config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cache_implementation": "hybrid",
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 1,
6
+ 106
7
+ ],
8
+ "top_k": 64,
9
+ "top_p": 0.95,
10
+ "transformers_version": "4.52.4"
11
+ }
checkpoint-2000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7204dfb6f3a944a032ca5ed20d71c923a4fed2c3b11dfabc2c1ff9f2fabe8af8
3
+ size 536223056
checkpoint-2000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ca1a55e9b06ab065a794a5a279c828b57c60635010bf66dec7ffd15ff01ee57
3
+ size 1072590714
checkpoint-2000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ec28ea0c416565eeac14a0e9c944f185ac250f4ed4bd15c84ff77ed78ba9301
3
+ size 14244
checkpoint-2000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3acb1a08fa90ba7cf2a95ded566b5ae5aa74d9f58bbee28804cc9682a3227ce
3
+ size 1064
checkpoint-2000/special_tokens_map.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "boi_token": "<start_of_image>",
3
+ "bos_token": {
4
+ "content": "<bos>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ "eoi_token": "<end_of_image>",
11
+ "eos_token": {
12
+ "content": "<eos>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false
17
+ },
18
+ "image_token": "<image_soft_token>",
19
+ "pad_token": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false
25
+ },
26
+ "unk_token": {
27
+ "content": "<unk>",
28
+ "lstrip": false,
29
+ "normalized": false,
30
+ "rstrip": false,
31
+ "single_word": false
32
+ }
33
+ }
checkpoint-2000/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e570d6288ff0afcea981a80492eddfa3e2239a79de89e5074cbb74b548fa5e2b
3
+ size 33384833
checkpoint-2000/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
3
+ size 4689074
checkpoint-2000/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-2000/trainer_state.json ADDED
@@ -0,0 +1,474 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 2000,
3
+ "best_metric": 1.8876391649246216,
4
+ "best_model_checkpoint": "./gemma3-270m-turkish_instructions-finetuned/checkpoint-2000",
5
+ "epoch": 1.7777777777777777,
6
+ "eval_steps": 100,
7
+ "global_step": 2000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.044444444444444446,
14
+ "grad_norm": 19.25,
15
+ "learning_rate": 1.088888888888889e-05,
16
+ "loss": 3.5828,
17
+ "step": 50
18
+ },
19
+ {
20
+ "epoch": 0.08888888888888889,
21
+ "grad_norm": 17.875,
22
+ "learning_rate": 2.2000000000000003e-05,
23
+ "loss": 2.4877,
24
+ "step": 100
25
+ },
26
+ {
27
+ "epoch": 0.08888888888888889,
28
+ "eval_loss": 2.275900363922119,
29
+ "eval_runtime": 48.3609,
30
+ "eval_samples_per_second": 20.678,
31
+ "eval_steps_per_second": 10.339,
32
+ "step": 100
33
+ },
34
+ {
35
+ "epoch": 0.13333333333333333,
36
+ "grad_norm": 18.25,
37
+ "learning_rate": 3.311111111111112e-05,
38
+ "loss": 2.255,
39
+ "step": 150
40
+ },
41
+ {
42
+ "epoch": 0.17777777777777778,
43
+ "grad_norm": 13.4375,
44
+ "learning_rate": 4.422222222222222e-05,
45
+ "loss": 2.2559,
46
+ "step": 200
47
+ },
48
+ {
49
+ "epoch": 0.17777777777777778,
50
+ "eval_loss": 2.1721303462982178,
51
+ "eval_runtime": 48.5933,
52
+ "eval_samples_per_second": 20.579,
53
+ "eval_steps_per_second": 10.289,
54
+ "step": 200
55
+ },
56
+ {
57
+ "epoch": 0.2222222222222222,
58
+ "grad_norm": 18.875,
59
+ "learning_rate": 4.99826726554013e-05,
60
+ "loss": 2.2218,
61
+ "step": 250
62
+ },
63
+ {
64
+ "epoch": 0.26666666666666666,
65
+ "grad_norm": 16.625,
66
+ "learning_rate": 4.983543173414964e-05,
67
+ "loss": 2.1805,
68
+ "step": 300
69
+ },
70
+ {
71
+ "epoch": 0.26666666666666666,
72
+ "eval_loss": 2.1423935890197754,
73
+ "eval_runtime": 48.7813,
74
+ "eval_samples_per_second": 20.5,
75
+ "eval_steps_per_second": 10.25,
76
+ "step": 300
77
+ },
78
+ {
79
+ "epoch": 0.3111111111111111,
80
+ "grad_norm": 15.0625,
81
+ "learning_rate": 4.953882760420223e-05,
82
+ "loss": 2.2345,
83
+ "step": 350
84
+ },
85
+ {
86
+ "epoch": 0.35555555555555557,
87
+ "grad_norm": 15.75,
88
+ "learning_rate": 4.909464407769633e-05,
89
+ "loss": 2.1545,
90
+ "step": 400
91
+ },
92
+ {
93
+ "epoch": 0.35555555555555557,
94
+ "eval_loss": 2.081547260284424,
95
+ "eval_runtime": 48.8759,
96
+ "eval_samples_per_second": 20.46,
97
+ "eval_steps_per_second": 10.23,
98
+ "step": 400
99
+ },
100
+ {
101
+ "epoch": 0.4,
102
+ "grad_norm": 17.875,
103
+ "learning_rate": 4.850555252662495e-05,
104
+ "loss": 2.0966,
105
+ "step": 450
106
+ },
107
+ {
108
+ "epoch": 0.4444444444444444,
109
+ "grad_norm": 13.1875,
110
+ "learning_rate": 4.7775095816891336e-05,
111
+ "loss": 2.0782,
112
+ "step": 500
113
+ },
114
+ {
115
+ "epoch": 0.4444444444444444,
116
+ "eval_loss": 2.0508856773376465,
117
+ "eval_runtime": 48.9079,
118
+ "eval_samples_per_second": 20.447,
119
+ "eval_steps_per_second": 10.223,
120
+ "step": 500
121
+ },
122
+ {
123
+ "epoch": 0.4888888888888889,
124
+ "grad_norm": 15.9375,
125
+ "learning_rate": 4.690766700109659e-05,
126
+ "loss": 2.115,
127
+ "step": 550
128
+ },
129
+ {
130
+ "epoch": 0.5333333333333333,
131
+ "grad_norm": 13.9375,
132
+ "learning_rate": 4.590848289820442e-05,
133
+ "loss": 2.0314,
134
+ "step": 600
135
+ },
136
+ {
137
+ "epoch": 0.5333333333333333,
138
+ "eval_loss": 2.0262327194213867,
139
+ "eval_runtime": 48.768,
140
+ "eval_samples_per_second": 20.505,
141
+ "eval_steps_per_second": 10.253,
142
+ "step": 600
143
+ },
144
+ {
145
+ "epoch": 0.5777777777777777,
146
+ "grad_norm": 14.5,
147
+ "learning_rate": 4.4783552718978e-05,
148
+ "loss": 2.0892,
149
+ "step": 650
150
+ },
151
+ {
152
+ "epoch": 0.6222222222222222,
153
+ "grad_norm": 14.0,
154
+ "learning_rate": 4.3539641925879495e-05,
155
+ "loss": 2.0352,
156
+ "step": 700
157
+ },
158
+ {
159
+ "epoch": 0.6222222222222222,
160
+ "eval_loss": 1.9995155334472656,
161
+ "eval_runtime": 48.8752,
162
+ "eval_samples_per_second": 20.46,
163
+ "eval_steps_per_second": 10.23,
164
+ "step": 700
165
+ },
166
+ {
167
+ "epoch": 0.6666666666666666,
168
+ "grad_norm": 13.25,
169
+ "learning_rate": 4.2184231544782596e-05,
170
+ "loss": 2.0117,
171
+ "step": 750
172
+ },
173
+ {
174
+ "epoch": 0.7111111111111111,
175
+ "grad_norm": 13.25,
176
+ "learning_rate": 4.072547317320281e-05,
177
+ "loss": 1.9848,
178
+ "step": 800
179
+ },
180
+ {
181
+ "epoch": 0.7111111111111111,
182
+ "eval_loss": 1.9770301580429077,
183
+ "eval_runtime": 48.7445,
184
+ "eval_samples_per_second": 20.515,
185
+ "eval_steps_per_second": 10.258,
186
+ "step": 800
187
+ },
188
+ {
189
+ "epoch": 0.7555555555555555,
190
+ "grad_norm": 13.375,
191
+ "learning_rate": 3.9172139955630774e-05,
192
+ "loss": 2.0014,
193
+ "step": 850
194
+ },
195
+ {
196
+ "epoch": 0.8,
197
+ "grad_norm": 12.75,
198
+ "learning_rate": 3.7533573820809006e-05,
199
+ "loss": 1.9938,
200
+ "step": 900
201
+ },
202
+ {
203
+ "epoch": 0.8,
204
+ "eval_loss": 1.9461404085159302,
205
+ "eval_runtime": 48.8094,
206
+ "eval_samples_per_second": 20.488,
207
+ "eval_steps_per_second": 10.244,
208
+ "step": 900
209
+ },
210
+ {
211
+ "epoch": 0.8444444444444444,
212
+ "grad_norm": 13.8125,
213
+ "learning_rate": 3.5819629298273245e-05,
214
+ "loss": 1.9914,
215
+ "step": 950
216
+ },
217
+ {
218
+ "epoch": 0.8888888888888888,
219
+ "grad_norm": 13.6875,
220
+ "learning_rate": 3.4040614252052305e-05,
221
+ "loss": 1.9724,
222
+ "step": 1000
223
+ },
224
+ {
225
+ "epoch": 0.8888888888888888,
226
+ "eval_loss": 1.9275351762771606,
227
+ "eval_runtime": 48.8023,
228
+ "eval_samples_per_second": 20.491,
229
+ "eval_steps_per_second": 10.245,
230
+ "step": 1000
231
+ },
232
+ {
233
+ "epoch": 0.9333333333333333,
234
+ "grad_norm": 13.25,
235
+ "learning_rate": 3.2207227887960935e-05,
236
+ "loss": 1.9607,
237
+ "step": 1050
238
+ },
239
+ {
240
+ "epoch": 0.9777777777777777,
241
+ "grad_norm": 15.375,
242
+ "learning_rate": 3.033049640731711e-05,
243
+ "loss": 1.9418,
244
+ "step": 1100
245
+ },
246
+ {
247
+ "epoch": 0.9777777777777777,
248
+ "eval_loss": 1.9146583080291748,
249
+ "eval_runtime": 48.7562,
250
+ "eval_samples_per_second": 20.51,
251
+ "eval_steps_per_second": 10.255,
252
+ "step": 1100
253
+ },
254
+ {
255
+ "epoch": 1.0222222222222221,
256
+ "grad_norm": 11.875,
257
+ "learning_rate": 2.8421706694069926e-05,
258
+ "loss": 1.8229,
259
+ "step": 1150
260
+ },
261
+ {
262
+ "epoch": 1.0666666666666667,
263
+ "grad_norm": 13.625,
264
+ "learning_rate": 2.649233843415149e-05,
265
+ "loss": 1.6831,
266
+ "step": 1200
267
+ },
268
+ {
269
+ "epoch": 1.0666666666666667,
270
+ "eval_loss": 1.9151355028152466,
271
+ "eval_runtime": 48.9802,
272
+ "eval_samples_per_second": 20.416,
273
+ "eval_steps_per_second": 10.208,
274
+ "step": 1200
275
+ },
276
+ {
277
+ "epoch": 1.1111111111111112,
278
+ "grad_norm": 12.625,
279
+ "learning_rate": 2.4553995075294933e-05,
280
+ "loss": 1.6801,
281
+ "step": 1250
282
+ },
283
+ {
284
+ "epoch": 1.1555555555555554,
285
+ "grad_norm": 14.5,
286
+ "learning_rate": 2.2618334042534464e-05,
287
+ "loss": 1.7079,
288
+ "step": 1300
289
+ },
290
+ {
291
+ "epoch": 1.1555555555555554,
292
+ "eval_loss": 1.9095146656036377,
293
+ "eval_runtime": 48.6949,
294
+ "eval_samples_per_second": 20.536,
295
+ "eval_steps_per_second": 10.268,
296
+ "step": 1300
297
+ },
298
+ {
299
+ "epoch": 1.2,
300
+ "grad_norm": 11.75,
301
+ "learning_rate": 2.0696996629079526e-05,
302
+ "loss": 1.6667,
303
+ "step": 1350
304
+ },
305
+ {
306
+ "epoch": 1.2444444444444445,
307
+ "grad_norm": 13.375,
308
+ "learning_rate": 1.880153798420768e-05,
309
+ "loss": 1.6615,
310
+ "step": 1400
311
+ },
312
+ {
313
+ "epoch": 1.2444444444444445,
314
+ "eval_loss": 1.9029484987258911,
315
+ "eval_runtime": 48.6447,
316
+ "eval_samples_per_second": 20.557,
317
+ "eval_steps_per_second": 10.279,
318
+ "step": 1400
319
+ },
320
+ {
321
+ "epoch": 1.2888888888888888,
322
+ "grad_norm": 15.3125,
323
+ "learning_rate": 1.6943357619237226e-05,
324
+ "loss": 1.6417,
325
+ "step": 1450
326
+ },
327
+ {
328
+ "epoch": 1.3333333333333333,
329
+ "grad_norm": 12.75,
330
+ "learning_rate": 1.5133630849524793e-05,
331
+ "loss": 1.6729,
332
+ "step": 1500
333
+ },
334
+ {
335
+ "epoch": 1.3333333333333333,
336
+ "eval_loss": 1.8957328796386719,
337
+ "eval_runtime": 48.7052,
338
+ "eval_samples_per_second": 20.532,
339
+ "eval_steps_per_second": 10.266,
340
+ "step": 1500
341
+ },
342
+ {
343
+ "epoch": 1.3777777777777778,
344
+ "grad_norm": 12.8125,
345
+ "learning_rate": 1.3383241584803884e-05,
346
+ "loss": 1.6703,
347
+ "step": 1550
348
+ },
349
+ {
350
+ "epoch": 1.4222222222222223,
351
+ "grad_norm": 12.3125,
352
+ "learning_rate": 1.170271687207106e-05,
353
+ "loss": 1.6666,
354
+ "step": 1600
355
+ },
356
+ {
357
+ "epoch": 1.4222222222222223,
358
+ "eval_loss": 1.89302659034729,
359
+ "eval_runtime": 48.6027,
360
+ "eval_samples_per_second": 20.575,
361
+ "eval_steps_per_second": 10.287,
362
+ "step": 1600
363
+ },
364
+ {
365
+ "epoch": 1.4666666666666668,
366
+ "grad_norm": 15.5,
367
+ "learning_rate": 1.010216358468665e-05,
368
+ "loss": 1.6447,
369
+ "step": 1650
370
+ },
371
+ {
372
+ "epoch": 1.511111111111111,
373
+ "grad_norm": 12.625,
374
+ "learning_rate": 8.591207638449154e-06,
375
+ "loss": 1.6755,
376
+ "step": 1700
377
+ },
378
+ {
379
+ "epoch": 1.511111111111111,
380
+ "eval_loss": 1.891126275062561,
381
+ "eval_runtime": 48.5903,
382
+ "eval_samples_per_second": 20.58,
383
+ "eval_steps_per_second": 10.29,
384
+ "step": 1700
385
+ },
386
+ {
387
+ "epoch": 1.5555555555555556,
388
+ "grad_norm": 14.0625,
389
+ "learning_rate": 7.178936100204994e-06,
390
+ "loss": 1.6474,
391
+ "step": 1750
392
+ },
393
+ {
394
+ "epoch": 1.6,
395
+ "grad_norm": 12.6875,
396
+ "learning_rate": 5.873842537159274e-06,
397
+ "loss": 1.66,
398
+ "step": 1800
399
+ },
400
+ {
401
+ "epoch": 1.6,
402
+ "eval_loss": 1.8891392946243286,
403
+ "eval_runtime": 48.6956,
404
+ "eval_samples_per_second": 20.536,
405
+ "eval_steps_per_second": 10.268,
406
+ "step": 1800
407
+ },
408
+ {
409
+ "epoch": 1.6444444444444444,
410
+ "grad_norm": 14.0625,
411
+ "learning_rate": 4.683775935563222e-06,
412
+ "loss": 1.682,
413
+ "step": 1850
414
+ },
415
+ {
416
+ "epoch": 1.6888888888888889,
417
+ "grad_norm": 14.6875,
418
+ "learning_rate": 3.6158934959873353e-06,
419
+ "loss": 1.6299,
420
+ "step": 1900
421
+ },
422
+ {
423
+ "epoch": 1.6888888888888889,
424
+ "eval_loss": 1.8877633810043335,
425
+ "eval_runtime": 48.685,
426
+ "eval_samples_per_second": 20.54,
427
+ "eval_steps_per_second": 10.27,
428
+ "step": 1900
429
+ },
430
+ {
431
+ "epoch": 1.7333333333333334,
432
+ "grad_norm": 11.875,
433
+ "learning_rate": 2.6766175890749786e-06,
434
+ "loss": 1.6601,
435
+ "step": 1950
436
+ },
437
+ {
438
+ "epoch": 1.7777777777777777,
439
+ "grad_norm": 13.625,
440
+ "learning_rate": 1.8715971306496745e-06,
441
+ "loss": 1.6286,
442
+ "step": 2000
443
+ },
444
+ {
445
+ "epoch": 1.7777777777777777,
446
+ "eval_loss": 1.8876391649246216,
447
+ "eval_runtime": 48.4452,
448
+ "eval_samples_per_second": 20.642,
449
+ "eval_steps_per_second": 10.321,
450
+ "step": 2000
451
+ }
452
+ ],
453
+ "logging_steps": 50,
454
+ "max_steps": 2250,
455
+ "num_input_tokens_seen": 0,
456
+ "num_train_epochs": 2,
457
+ "save_steps": 200,
458
+ "stateful_callbacks": {
459
+ "TrainerControl": {
460
+ "args": {
461
+ "should_epoch_stop": false,
462
+ "should_evaluate": false,
463
+ "should_log": false,
464
+ "should_save": true,
465
+ "should_training_stop": false
466
+ },
467
+ "attributes": {}
468
+ }
469
+ },
470
+ "total_flos": 2465612169216000.0,
471
+ "train_batch_size": 2,
472
+ "trial_name": null,
473
+ "trial_params": null
474
+ }
checkpoint-2000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51ff9f61bcdaae5ac53531ca1cc31aab96bc0e3a0c0ccd88418c58ca3f44c82d
3
+ size 5304
checkpoint-2200/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "<image_soft_token>": 262144
3
+ }
checkpoint-2200/chat_template.jinja ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{ bos_token }}
2
+ {%- if messages[0]['role'] == 'system' -%}
3
+ {%- if messages[0]['content'] is string -%}
4
+ {%- set first_user_prefix = messages[0]['content'] + '
5
+
6
+ ' -%}
7
+ {%- else -%}
8
+ {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
9
+
10
+ ' -%}
11
+ {%- endif -%}
12
+ {%- set loop_messages = messages[1:] -%}
13
+ {%- else -%}
14
+ {%- set first_user_prefix = "" -%}
15
+ {%- set loop_messages = messages -%}
16
+ {%- endif -%}
17
+ {%- for message in loop_messages -%}
18
+ {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
19
+ {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
20
+ {%- endif -%}
21
+ {%- if (message['role'] == 'assistant') -%}
22
+ {%- set role = "model" -%}
23
+ {%- else -%}
24
+ {%- set role = message['role'] -%}
25
+ {%- endif -%}
26
+ {{ '<start_of_turn>' + role + '
27
+ ' + (first_user_prefix if loop.first else "") }}
28
+ {%- if message['content'] is string -%}
29
+ {{ message['content'] | trim }}
30
+ {%- elif message['content'] is iterable -%}
31
+ {%- for item in message['content'] -%}
32
+ {%- if item['type'] == 'image' -%}
33
+ {{ '<start_of_image>' }}
34
+ {%- elif item['type'] == 'text' -%}
35
+ {{ item['text'] | trim }}
36
+ {%- endif -%}
37
+ {%- endfor -%}
38
+ {%- else -%}
39
+ {{ raise_exception("Invalid content type") }}
40
+ {%- endif -%}
41
+ {{ '<end_of_turn>
42
+ ' }}
43
+ {%- endfor -%}
44
+ {%- if add_generation_prompt -%}
45
+ {{'<start_of_turn>model
46
+ '}}
47
+ {%- endif -%}
checkpoint-2200/config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_sliding_window_pattern": 6,
3
+ "architectures": [
4
+ "Gemma3ForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "attn_logit_softcapping": null,
9
+ "bos_token_id": 2,
10
+ "cache_implementation": "hybrid",
11
+ "eos_token_id": 1,
12
+ "final_logit_softcapping": null,
13
+ "head_dim": 256,
14
+ "hidden_activation": "gelu_pytorch_tanh",
15
+ "hidden_size": 640,
16
+ "initializer_range": 0.02,
17
+ "intermediate_size": 2048,
18
+ "layer_types": [
19
+ "sliding_attention",
20
+ "sliding_attention",
21
+ "sliding_attention",
22
+ "sliding_attention",
23
+ "sliding_attention",
24
+ "full_attention",
25
+ "sliding_attention",
26
+ "sliding_attention",
27
+ "sliding_attention",
28
+ "sliding_attention",
29
+ "sliding_attention",
30
+ "full_attention",
31
+ "sliding_attention",
32
+ "sliding_attention",
33
+ "sliding_attention",
34
+ "sliding_attention",
35
+ "sliding_attention",
36
+ "full_attention"
37
+ ],
38
+ "max_position_embeddings": 32768,
39
+ "model_type": "gemma3_text",
40
+ "num_attention_heads": 4,
41
+ "num_hidden_layers": 18,
42
+ "num_key_value_heads": 1,
43
+ "pad_token_id": 0,
44
+ "query_pre_attn_scalar": 256,
45
+ "rms_norm_eps": 1e-06,
46
+ "rope_local_base_freq": 10000.0,
47
+ "rope_scaling": null,
48
+ "rope_theta": 1000000.0,
49
+ "sliding_window": 512,
50
+ "sliding_window_pattern": 6,
51
+ "torch_dtype": "bfloat16",
52
+ "transformers_version": "4.52.4",
53
+ "use_bidirectional_attention": false,
54
+ "use_cache": true,
55
+ "vocab_size": 262144
56
+ }
checkpoint-2200/generation_config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cache_implementation": "hybrid",
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 1,
6
+ 106
7
+ ],
8
+ "top_k": 64,
9
+ "top_p": 0.95,
10
+ "transformers_version": "4.52.4"
11
+ }
checkpoint-2200/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:efe872bec3b6dd893add9a14c1f41c7221af35506062d825c0c88f42f9065b80
3
+ size 536223056
checkpoint-2200/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33f9ef975b278600b0268b01644e66932af546a098a8007c4900562970239155
3
+ size 1072590714
checkpoint-2200/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5f38f6446ba43bb9ae0be4911c150b41c5adfbe2712844ea3cb854b62ee2432
3
+ size 14244
checkpoint-2200/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78b6b554e381cf0f46cecfb4e224f04cd835ca288ef75eb3df2c929e21a0cb8a
3
+ size 1064
checkpoint-2200/special_tokens_map.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "boi_token": "<start_of_image>",
3
+ "bos_token": {
4
+ "content": "<bos>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ "eoi_token": "<end_of_image>",
11
+ "eos_token": {
12
+ "content": "<eos>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false
17
+ },
18
+ "image_token": "<image_soft_token>",
19
+ "pad_token": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false
25
+ },
26
+ "unk_token": {
27
+ "content": "<unk>",
28
+ "lstrip": false,
29
+ "normalized": false,
30
+ "rstrip": false,
31
+ "single_word": false
32
+ }
33
+ }
checkpoint-2200/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e570d6288ff0afcea981a80492eddfa3e2239a79de89e5074cbb74b548fa5e2b
3
+ size 33384833
checkpoint-2200/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
3
+ size 4689074
checkpoint-2200/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-2200/trainer_state.json ADDED
@@ -0,0 +1,518 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 2000,
3
+ "best_metric": 1.8876391649246216,
4
+ "best_model_checkpoint": "./gemma3-270m-turkish_instructions-finetuned/checkpoint-2000",
5
+ "epoch": 1.9555555555555557,
6
+ "eval_steps": 100,
7
+ "global_step": 2200,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.044444444444444446,
14
+ "grad_norm": 19.25,
15
+ "learning_rate": 1.088888888888889e-05,
16
+ "loss": 3.5828,
17
+ "step": 50
18
+ },
19
+ {
20
+ "epoch": 0.08888888888888889,
21
+ "grad_norm": 17.875,
22
+ "learning_rate": 2.2000000000000003e-05,
23
+ "loss": 2.4877,
24
+ "step": 100
25
+ },
26
+ {
27
+ "epoch": 0.08888888888888889,
28
+ "eval_loss": 2.275900363922119,
29
+ "eval_runtime": 48.3609,
30
+ "eval_samples_per_second": 20.678,
31
+ "eval_steps_per_second": 10.339,
32
+ "step": 100
33
+ },
34
+ {
35
+ "epoch": 0.13333333333333333,
36
+ "grad_norm": 18.25,
37
+ "learning_rate": 3.311111111111112e-05,
38
+ "loss": 2.255,
39
+ "step": 150
40
+ },
41
+ {
42
+ "epoch": 0.17777777777777778,
43
+ "grad_norm": 13.4375,
44
+ "learning_rate": 4.422222222222222e-05,
45
+ "loss": 2.2559,
46
+ "step": 200
47
+ },
48
+ {
49
+ "epoch": 0.17777777777777778,
50
+ "eval_loss": 2.1721303462982178,
51
+ "eval_runtime": 48.5933,
52
+ "eval_samples_per_second": 20.579,
53
+ "eval_steps_per_second": 10.289,
54
+ "step": 200
55
+ },
56
+ {
57
+ "epoch": 0.2222222222222222,
58
+ "grad_norm": 18.875,
59
+ "learning_rate": 4.99826726554013e-05,
60
+ "loss": 2.2218,
61
+ "step": 250
62
+ },
63
+ {
64
+ "epoch": 0.26666666666666666,
65
+ "grad_norm": 16.625,
66
+ "learning_rate": 4.983543173414964e-05,
67
+ "loss": 2.1805,
68
+ "step": 300
69
+ },
70
+ {
71
+ "epoch": 0.26666666666666666,
72
+ "eval_loss": 2.1423935890197754,
73
+ "eval_runtime": 48.7813,
74
+ "eval_samples_per_second": 20.5,
75
+ "eval_steps_per_second": 10.25,
76
+ "step": 300
77
+ },
78
+ {
79
+ "epoch": 0.3111111111111111,
80
+ "grad_norm": 15.0625,
81
+ "learning_rate": 4.953882760420223e-05,
82
+ "loss": 2.2345,
83
+ "step": 350
84
+ },
85
+ {
86
+ "epoch": 0.35555555555555557,
87
+ "grad_norm": 15.75,
88
+ "learning_rate": 4.909464407769633e-05,
89
+ "loss": 2.1545,
90
+ "step": 400
91
+ },
92
+ {
93
+ "epoch": 0.35555555555555557,
94
+ "eval_loss": 2.081547260284424,
95
+ "eval_runtime": 48.8759,
96
+ "eval_samples_per_second": 20.46,
97
+ "eval_steps_per_second": 10.23,
98
+ "step": 400
99
+ },
100
+ {
101
+ "epoch": 0.4,
102
+ "grad_norm": 17.875,
103
+ "learning_rate": 4.850555252662495e-05,
104
+ "loss": 2.0966,
105
+ "step": 450
106
+ },
107
+ {
108
+ "epoch": 0.4444444444444444,
109
+ "grad_norm": 13.1875,
110
+ "learning_rate": 4.7775095816891336e-05,
111
+ "loss": 2.0782,
112
+ "step": 500
113
+ },
114
+ {
115
+ "epoch": 0.4444444444444444,
116
+ "eval_loss": 2.0508856773376465,
117
+ "eval_runtime": 48.9079,
118
+ "eval_samples_per_second": 20.447,
119
+ "eval_steps_per_second": 10.223,
120
+ "step": 500
121
+ },
122
+ {
123
+ "epoch": 0.4888888888888889,
124
+ "grad_norm": 15.9375,
125
+ "learning_rate": 4.690766700109659e-05,
126
+ "loss": 2.115,
127
+ "step": 550
128
+ },
129
+ {
130
+ "epoch": 0.5333333333333333,
131
+ "grad_norm": 13.9375,
132
+ "learning_rate": 4.590848289820442e-05,
133
+ "loss": 2.0314,
134
+ "step": 600
135
+ },
136
+ {
137
+ "epoch": 0.5333333333333333,
138
+ "eval_loss": 2.0262327194213867,
139
+ "eval_runtime": 48.768,
140
+ "eval_samples_per_second": 20.505,
141
+ "eval_steps_per_second": 10.253,
142
+ "step": 600
143
+ },
144
+ {
145
+ "epoch": 0.5777777777777777,
146
+ "grad_norm": 14.5,
147
+ "learning_rate": 4.4783552718978e-05,
148
+ "loss": 2.0892,
149
+ "step": 650
150
+ },
151
+ {
152
+ "epoch": 0.6222222222222222,
153
+ "grad_norm": 14.0,
154
+ "learning_rate": 4.3539641925879495e-05,
155
+ "loss": 2.0352,
156
+ "step": 700
157
+ },
158
+ {
159
+ "epoch": 0.6222222222222222,
160
+ "eval_loss": 1.9995155334472656,
161
+ "eval_runtime": 48.8752,
162
+ "eval_samples_per_second": 20.46,
163
+ "eval_steps_per_second": 10.23,
164
+ "step": 700
165
+ },
166
+ {
167
+ "epoch": 0.6666666666666666,
168
+ "grad_norm": 13.25,
169
+ "learning_rate": 4.2184231544782596e-05,
170
+ "loss": 2.0117,
171
+ "step": 750
172
+ },
173
+ {
174
+ "epoch": 0.7111111111111111,
175
+ "grad_norm": 13.25,
176
+ "learning_rate": 4.072547317320281e-05,
177
+ "loss": 1.9848,
178
+ "step": 800
179
+ },
180
+ {
181
+ "epoch": 0.7111111111111111,
182
+ "eval_loss": 1.9770301580429077,
183
+ "eval_runtime": 48.7445,
184
+ "eval_samples_per_second": 20.515,
185
+ "eval_steps_per_second": 10.258,
186
+ "step": 800
187
+ },
188
+ {
189
+ "epoch": 0.7555555555555555,
190
+ "grad_norm": 13.375,
191
+ "learning_rate": 3.9172139955630774e-05,
192
+ "loss": 2.0014,
193
+ "step": 850
194
+ },
195
+ {
196
+ "epoch": 0.8,
197
+ "grad_norm": 12.75,
198
+ "learning_rate": 3.7533573820809006e-05,
199
+ "loss": 1.9938,
200
+ "step": 900
201
+ },
202
+ {
203
+ "epoch": 0.8,
204
+ "eval_loss": 1.9461404085159302,
205
+ "eval_runtime": 48.8094,
206
+ "eval_samples_per_second": 20.488,
207
+ "eval_steps_per_second": 10.244,
208
+ "step": 900
209
+ },
210
+ {
211
+ "epoch": 0.8444444444444444,
212
+ "grad_norm": 13.8125,
213
+ "learning_rate": 3.5819629298273245e-05,
214
+ "loss": 1.9914,
215
+ "step": 950
216
+ },
217
+ {
218
+ "epoch": 0.8888888888888888,
219
+ "grad_norm": 13.6875,
220
+ "learning_rate": 3.4040614252052305e-05,
221
+ "loss": 1.9724,
222
+ "step": 1000
223
+ },
224
+ {
225
+ "epoch": 0.8888888888888888,
226
+ "eval_loss": 1.9275351762771606,
227
+ "eval_runtime": 48.8023,
228
+ "eval_samples_per_second": 20.491,
229
+ "eval_steps_per_second": 10.245,
230
+ "step": 1000
231
+ },
232
+ {
233
+ "epoch": 0.9333333333333333,
234
+ "grad_norm": 13.25,
235
+ "learning_rate": 3.2207227887960935e-05,
236
+ "loss": 1.9607,
237
+ "step": 1050
238
+ },
239
+ {
240
+ "epoch": 0.9777777777777777,
241
+ "grad_norm": 15.375,
242
+ "learning_rate": 3.033049640731711e-05,
243
+ "loss": 1.9418,
244
+ "step": 1100
245
+ },
246
+ {
247
+ "epoch": 0.9777777777777777,
248
+ "eval_loss": 1.9146583080291748,
249
+ "eval_runtime": 48.7562,
250
+ "eval_samples_per_second": 20.51,
251
+ "eval_steps_per_second": 10.255,
252
+ "step": 1100
253
+ },
254
+ {
255
+ "epoch": 1.0222222222222221,
256
+ "grad_norm": 11.875,
257
+ "learning_rate": 2.8421706694069926e-05,
258
+ "loss": 1.8229,
259
+ "step": 1150
260
+ },
261
+ {
262
+ "epoch": 1.0666666666666667,
263
+ "grad_norm": 13.625,
264
+ "learning_rate": 2.649233843415149e-05,
265
+ "loss": 1.6831,
266
+ "step": 1200
267
+ },
268
+ {
269
+ "epoch": 1.0666666666666667,
270
+ "eval_loss": 1.9151355028152466,
271
+ "eval_runtime": 48.9802,
272
+ "eval_samples_per_second": 20.416,
273
+ "eval_steps_per_second": 10.208,
274
+ "step": 1200
275
+ },
276
+ {
277
+ "epoch": 1.1111111111111112,
278
+ "grad_norm": 12.625,
279
+ "learning_rate": 2.4553995075294933e-05,
280
+ "loss": 1.6801,
281
+ "step": 1250
282
+ },
283
+ {
284
+ "epoch": 1.1555555555555554,
285
+ "grad_norm": 14.5,
286
+ "learning_rate": 2.2618334042534464e-05,
287
+ "loss": 1.7079,
288
+ "step": 1300
289
+ },
290
+ {
291
+ "epoch": 1.1555555555555554,
292
+ "eval_loss": 1.9095146656036377,
293
+ "eval_runtime": 48.6949,
294
+ "eval_samples_per_second": 20.536,
295
+ "eval_steps_per_second": 10.268,
296
+ "step": 1300
297
+ },
298
+ {
299
+ "epoch": 1.2,
300
+ "grad_norm": 11.75,
301
+ "learning_rate": 2.0696996629079526e-05,
302
+ "loss": 1.6667,
303
+ "step": 1350
304
+ },
305
+ {
306
+ "epoch": 1.2444444444444445,
307
+ "grad_norm": 13.375,
308
+ "learning_rate": 1.880153798420768e-05,
309
+ "loss": 1.6615,
310
+ "step": 1400
311
+ },
312
+ {
313
+ "epoch": 1.2444444444444445,
314
+ "eval_loss": 1.9029484987258911,
315
+ "eval_runtime": 48.6447,
316
+ "eval_samples_per_second": 20.557,
317
+ "eval_steps_per_second": 10.279,
318
+ "step": 1400
319
+ },
320
+ {
321
+ "epoch": 1.2888888888888888,
322
+ "grad_norm": 15.3125,
323
+ "learning_rate": 1.6943357619237226e-05,
324
+ "loss": 1.6417,
325
+ "step": 1450
326
+ },
327
+ {
328
+ "epoch": 1.3333333333333333,
329
+ "grad_norm": 12.75,
330
+ "learning_rate": 1.5133630849524793e-05,
331
+ "loss": 1.6729,
332
+ "step": 1500
333
+ },
334
+ {
335
+ "epoch": 1.3333333333333333,
336
+ "eval_loss": 1.8957328796386719,
337
+ "eval_runtime": 48.7052,
338
+ "eval_samples_per_second": 20.532,
339
+ "eval_steps_per_second": 10.266,
340
+ "step": 1500
341
+ },
342
+ {
343
+ "epoch": 1.3777777777777778,
344
+ "grad_norm": 12.8125,
345
+ "learning_rate": 1.3383241584803884e-05,
346
+ "loss": 1.6703,
347
+ "step": 1550
348
+ },
349
+ {
350
+ "epoch": 1.4222222222222223,
351
+ "grad_norm": 12.3125,
352
+ "learning_rate": 1.170271687207106e-05,
353
+ "loss": 1.6666,
354
+ "step": 1600
355
+ },
356
+ {
357
+ "epoch": 1.4222222222222223,
358
+ "eval_loss": 1.89302659034729,
359
+ "eval_runtime": 48.6027,
360
+ "eval_samples_per_second": 20.575,
361
+ "eval_steps_per_second": 10.287,
362
+ "step": 1600
363
+ },
364
+ {
365
+ "epoch": 1.4666666666666668,
366
+ "grad_norm": 15.5,
367
+ "learning_rate": 1.010216358468665e-05,
368
+ "loss": 1.6447,
369
+ "step": 1650
370
+ },
371
+ {
372
+ "epoch": 1.511111111111111,
373
+ "grad_norm": 12.625,
374
+ "learning_rate": 8.591207638449154e-06,
375
+ "loss": 1.6755,
376
+ "step": 1700
377
+ },
378
+ {
379
+ "epoch": 1.511111111111111,
380
+ "eval_loss": 1.891126275062561,
381
+ "eval_runtime": 48.5903,
382
+ "eval_samples_per_second": 20.58,
383
+ "eval_steps_per_second": 10.29,
384
+ "step": 1700
385
+ },
386
+ {
387
+ "epoch": 1.5555555555555556,
388
+ "grad_norm": 14.0625,
389
+ "learning_rate": 7.178936100204994e-06,
390
+ "loss": 1.6474,
391
+ "step": 1750
392
+ },
393
+ {
394
+ "epoch": 1.6,
395
+ "grad_norm": 12.6875,
396
+ "learning_rate": 5.873842537159274e-06,
397
+ "loss": 1.66,
398
+ "step": 1800
399
+ },
400
+ {
401
+ "epoch": 1.6,
402
+ "eval_loss": 1.8891392946243286,
403
+ "eval_runtime": 48.6956,
404
+ "eval_samples_per_second": 20.536,
405
+ "eval_steps_per_second": 10.268,
406
+ "step": 1800
407
+ },
408
+ {
409
+ "epoch": 1.6444444444444444,
410
+ "grad_norm": 14.0625,
411
+ "learning_rate": 4.683775935563222e-06,
412
+ "loss": 1.682,
413
+ "step": 1850
414
+ },
415
+ {
416
+ "epoch": 1.6888888888888889,
417
+ "grad_norm": 14.6875,
418
+ "learning_rate": 3.6158934959873353e-06,
419
+ "loss": 1.6299,
420
+ "step": 1900
421
+ },
422
+ {
423
+ "epoch": 1.6888888888888889,
424
+ "eval_loss": 1.8877633810043335,
425
+ "eval_runtime": 48.685,
426
+ "eval_samples_per_second": 20.54,
427
+ "eval_steps_per_second": 10.27,
428
+ "step": 1900
429
+ },
430
+ {
431
+ "epoch": 1.7333333333333334,
432
+ "grad_norm": 11.875,
433
+ "learning_rate": 2.6766175890749786e-06,
434
+ "loss": 1.6601,
435
+ "step": 1950
436
+ },
437
+ {
438
+ "epoch": 1.7777777777777777,
439
+ "grad_norm": 13.625,
440
+ "learning_rate": 1.8715971306496745e-06,
441
+ "loss": 1.6286,
442
+ "step": 2000
443
+ },
444
+ {
445
+ "epoch": 1.7777777777777777,
446
+ "eval_loss": 1.8876391649246216,
447
+ "eval_runtime": 48.4452,
448
+ "eval_samples_per_second": 20.642,
449
+ "eval_steps_per_second": 10.321,
450
+ "step": 2000
451
+ },
452
+ {
453
+ "epoch": 1.8222222222222222,
454
+ "grad_norm": 11.625,
455
+ "learning_rate": 1.2056736084706589e-06,
456
+ "loss": 1.6788,
457
+ "step": 2050
458
+ },
459
+ {
460
+ "epoch": 1.8666666666666667,
461
+ "grad_norm": 13.125,
462
+ "learning_rate": 6.828519649558191e-07,
463
+ "loss": 1.6811,
464
+ "step": 2100
465
+ },
466
+ {
467
+ "epoch": 1.8666666666666667,
468
+ "eval_loss": 1.8877239227294922,
469
+ "eval_runtime": 48.6643,
470
+ "eval_samples_per_second": 20.549,
471
+ "eval_steps_per_second": 10.274,
472
+ "step": 2100
473
+ },
474
+ {
475
+ "epoch": 1.911111111111111,
476
+ "grad_norm": 12.1875,
477
+ "learning_rate": 3.062765109867499e-07,
478
+ "loss": 1.6562,
479
+ "step": 2150
480
+ },
481
+ {
482
+ "epoch": 1.9555555555555557,
483
+ "grad_norm": 13.4375,
484
+ "learning_rate": 7.821201565316182e-08,
485
+ "loss": 1.6527,
486
+ "step": 2200
487
+ },
488
+ {
489
+ "epoch": 1.9555555555555557,
490
+ "eval_loss": 1.8878294229507446,
491
+ "eval_runtime": 48.65,
492
+ "eval_samples_per_second": 20.555,
493
+ "eval_steps_per_second": 10.277,
494
+ "step": 2200
495
+ }
496
+ ],
497
+ "logging_steps": 50,
498
+ "max_steps": 2250,
499
+ "num_input_tokens_seen": 0,
500
+ "num_train_epochs": 2,
501
+ "save_steps": 200,
502
+ "stateful_callbacks": {
503
+ "TrainerControl": {
504
+ "args": {
505
+ "should_epoch_stop": false,
506
+ "should_evaluate": false,
507
+ "should_log": false,
508
+ "should_save": true,
509
+ "should_training_stop": false
510
+ },
511
+ "attributes": {}
512
+ }
513
+ },
514
+ "total_flos": 2712173386137600.0,
515
+ "train_batch_size": 2,
516
+ "trial_name": null,
517
+ "trial_params": null
518
+ }
checkpoint-2200/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51ff9f61bcdaae5ac53531ca1cc31aab96bc0e3a0c0ccd88418c58ca3f44c82d
3
+ size 5304
checkpoint-2250/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "<image_soft_token>": 262144
3
+ }
checkpoint-2250/chat_template.jinja ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{ bos_token }}
2
+ {%- if messages[0]['role'] == 'system' -%}
3
+ {%- if messages[0]['content'] is string -%}
4
+ {%- set first_user_prefix = messages[0]['content'] + '
5
+
6
+ ' -%}
7
+ {%- else -%}
8
+ {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
9
+
10
+ ' -%}
11
+ {%- endif -%}
12
+ {%- set loop_messages = messages[1:] -%}
13
+ {%- else -%}
14
+ {%- set first_user_prefix = "" -%}
15
+ {%- set loop_messages = messages -%}
16
+ {%- endif -%}
17
+ {%- for message in loop_messages -%}
18
+ {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
19
+ {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
20
+ {%- endif -%}
21
+ {%- if (message['role'] == 'assistant') -%}
22
+ {%- set role = "model" -%}
23
+ {%- else -%}
24
+ {%- set role = message['role'] -%}
25
+ {%- endif -%}
26
+ {{ '<start_of_turn>' + role + '
27
+ ' + (first_user_prefix if loop.first else "") }}
28
+ {%- if message['content'] is string -%}
29
+ {{ message['content'] | trim }}
30
+ {%- elif message['content'] is iterable -%}
31
+ {%- for item in message['content'] -%}
32
+ {%- if item['type'] == 'image' -%}
33
+ {{ '<start_of_image>' }}
34
+ {%- elif item['type'] == 'text' -%}
35
+ {{ item['text'] | trim }}
36
+ {%- endif -%}
37
+ {%- endfor -%}
38
+ {%- else -%}
39
+ {{ raise_exception("Invalid content type") }}
40
+ {%- endif -%}
41
+ {{ '<end_of_turn>
42
+ ' }}
43
+ {%- endfor -%}
44
+ {%- if add_generation_prompt -%}
45
+ {{'<start_of_turn>model
46
+ '}}
47
+ {%- endif -%}
checkpoint-2250/config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_sliding_window_pattern": 6,
3
+ "architectures": [
4
+ "Gemma3ForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "attn_logit_softcapping": null,
9
+ "bos_token_id": 2,
10
+ "cache_implementation": "hybrid",
11
+ "eos_token_id": 1,
12
+ "final_logit_softcapping": null,
13
+ "head_dim": 256,
14
+ "hidden_activation": "gelu_pytorch_tanh",
15
+ "hidden_size": 640,
16
+ "initializer_range": 0.02,
17
+ "intermediate_size": 2048,
18
+ "layer_types": [
19
+ "sliding_attention",
20
+ "sliding_attention",
21
+ "sliding_attention",
22
+ "sliding_attention",
23
+ "sliding_attention",
24
+ "full_attention",
25
+ "sliding_attention",
26
+ "sliding_attention",
27
+ "sliding_attention",
28
+ "sliding_attention",
29
+ "sliding_attention",
30
+ "full_attention",
31
+ "sliding_attention",
32
+ "sliding_attention",
33
+ "sliding_attention",
34
+ "sliding_attention",
35
+ "sliding_attention",
36
+ "full_attention"
37
+ ],
38
+ "max_position_embeddings": 32768,
39
+ "model_type": "gemma3_text",
40
+ "num_attention_heads": 4,
41
+ "num_hidden_layers": 18,
42
+ "num_key_value_heads": 1,
43
+ "pad_token_id": 0,
44
+ "query_pre_attn_scalar": 256,
45
+ "rms_norm_eps": 1e-06,
46
+ "rope_local_base_freq": 10000.0,
47
+ "rope_scaling": null,
48
+ "rope_theta": 1000000.0,
49
+ "sliding_window": 512,
50
+ "sliding_window_pattern": 6,
51
+ "torch_dtype": "bfloat16",
52
+ "transformers_version": "4.52.4",
53
+ "use_bidirectional_attention": false,
54
+ "use_cache": true,
55
+ "vocab_size": 262144
56
+ }
checkpoint-2250/generation_config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cache_implementation": "hybrid",
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 1,
6
+ 106
7
+ ],
8
+ "top_k": 64,
9
+ "top_p": 0.95,
10
+ "transformers_version": "4.52.4"
11
+ }
checkpoint-2250/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:167a1b553a36c55710b27020272e1c73fa083db459266e49447d4a5f9fa0e99e
3
+ size 536223056
checkpoint-2250/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7ca38aa29fdc7d7fae28324f62f22d3b66a9327f3d1569d34cf169fd82dfd9d
3
+ size 1072590714
checkpoint-2250/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5f38f6446ba43bb9ae0be4911c150b41c5adfbe2712844ea3cb854b62ee2432
3
+ size 14244
checkpoint-2250/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1bec953c174e41850d61f3908d88b9683a2b8a87c5e9752e8ae44ce839aa2004
3
+ size 1064
checkpoint-2250/special_tokens_map.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "boi_token": "<start_of_image>",
3
+ "bos_token": {
4
+ "content": "<bos>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ "eoi_token": "<end_of_image>",
11
+ "eos_token": {
12
+ "content": "<eos>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false
17
+ },
18
+ "image_token": "<image_soft_token>",
19
+ "pad_token": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false
25
+ },
26
+ "unk_token": {
27
+ "content": "<unk>",
28
+ "lstrip": false,
29
+ "normalized": false,
30
+ "rstrip": false,
31
+ "single_word": false
32
+ }
33
+ }
checkpoint-2250/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e570d6288ff0afcea981a80492eddfa3e2239a79de89e5074cbb74b548fa5e2b
3
+ size 33384833
checkpoint-2250/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
3
+ size 4689074
checkpoint-2250/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-2250/trainer_state.json ADDED
@@ -0,0 +1,525 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 2000,
3
+ "best_metric": 1.8876391649246216,
4
+ "best_model_checkpoint": "./gemma3-270m-turkish_instructions-finetuned/checkpoint-2000",
5
+ "epoch": 2.0,
6
+ "eval_steps": 100,
7
+ "global_step": 2250,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.044444444444444446,
14
+ "grad_norm": 19.25,
15
+ "learning_rate": 1.088888888888889e-05,
16
+ "loss": 3.5828,
17
+ "step": 50
18
+ },
19
+ {
20
+ "epoch": 0.08888888888888889,
21
+ "grad_norm": 17.875,
22
+ "learning_rate": 2.2000000000000003e-05,
23
+ "loss": 2.4877,
24
+ "step": 100
25
+ },
26
+ {
27
+ "epoch": 0.08888888888888889,
28
+ "eval_loss": 2.275900363922119,
29
+ "eval_runtime": 48.3609,
30
+ "eval_samples_per_second": 20.678,
31
+ "eval_steps_per_second": 10.339,
32
+ "step": 100
33
+ },
34
+ {
35
+ "epoch": 0.13333333333333333,
36
+ "grad_norm": 18.25,
37
+ "learning_rate": 3.311111111111112e-05,
38
+ "loss": 2.255,
39
+ "step": 150
40
+ },
41
+ {
42
+ "epoch": 0.17777777777777778,
43
+ "grad_norm": 13.4375,
44
+ "learning_rate": 4.422222222222222e-05,
45
+ "loss": 2.2559,
46
+ "step": 200
47
+ },
48
+ {
49
+ "epoch": 0.17777777777777778,
50
+ "eval_loss": 2.1721303462982178,
51
+ "eval_runtime": 48.5933,
52
+ "eval_samples_per_second": 20.579,
53
+ "eval_steps_per_second": 10.289,
54
+ "step": 200
55
+ },
56
+ {
57
+ "epoch": 0.2222222222222222,
58
+ "grad_norm": 18.875,
59
+ "learning_rate": 4.99826726554013e-05,
60
+ "loss": 2.2218,
61
+ "step": 250
62
+ },
63
+ {
64
+ "epoch": 0.26666666666666666,
65
+ "grad_norm": 16.625,
66
+ "learning_rate": 4.983543173414964e-05,
67
+ "loss": 2.1805,
68
+ "step": 300
69
+ },
70
+ {
71
+ "epoch": 0.26666666666666666,
72
+ "eval_loss": 2.1423935890197754,
73
+ "eval_runtime": 48.7813,
74
+ "eval_samples_per_second": 20.5,
75
+ "eval_steps_per_second": 10.25,
76
+ "step": 300
77
+ },
78
+ {
79
+ "epoch": 0.3111111111111111,
80
+ "grad_norm": 15.0625,
81
+ "learning_rate": 4.953882760420223e-05,
82
+ "loss": 2.2345,
83
+ "step": 350
84
+ },
85
+ {
86
+ "epoch": 0.35555555555555557,
87
+ "grad_norm": 15.75,
88
+ "learning_rate": 4.909464407769633e-05,
89
+ "loss": 2.1545,
90
+ "step": 400
91
+ },
92
+ {
93
+ "epoch": 0.35555555555555557,
94
+ "eval_loss": 2.081547260284424,
95
+ "eval_runtime": 48.8759,
96
+ "eval_samples_per_second": 20.46,
97
+ "eval_steps_per_second": 10.23,
98
+ "step": 400
99
+ },
100
+ {
101
+ "epoch": 0.4,
102
+ "grad_norm": 17.875,
103
+ "learning_rate": 4.850555252662495e-05,
104
+ "loss": 2.0966,
105
+ "step": 450
106
+ },
107
+ {
108
+ "epoch": 0.4444444444444444,
109
+ "grad_norm": 13.1875,
110
+ "learning_rate": 4.7775095816891336e-05,
111
+ "loss": 2.0782,
112
+ "step": 500
113
+ },
114
+ {
115
+ "epoch": 0.4444444444444444,
116
+ "eval_loss": 2.0508856773376465,
117
+ "eval_runtime": 48.9079,
118
+ "eval_samples_per_second": 20.447,
119
+ "eval_steps_per_second": 10.223,
120
+ "step": 500
121
+ },
122
+ {
123
+ "epoch": 0.4888888888888889,
124
+ "grad_norm": 15.9375,
125
+ "learning_rate": 4.690766700109659e-05,
126
+ "loss": 2.115,
127
+ "step": 550
128
+ },
129
+ {
130
+ "epoch": 0.5333333333333333,
131
+ "grad_norm": 13.9375,
132
+ "learning_rate": 4.590848289820442e-05,
133
+ "loss": 2.0314,
134
+ "step": 600
135
+ },
136
+ {
137
+ "epoch": 0.5333333333333333,
138
+ "eval_loss": 2.0262327194213867,
139
+ "eval_runtime": 48.768,
140
+ "eval_samples_per_second": 20.505,
141
+ "eval_steps_per_second": 10.253,
142
+ "step": 600
143
+ },
144
+ {
145
+ "epoch": 0.5777777777777777,
146
+ "grad_norm": 14.5,
147
+ "learning_rate": 4.4783552718978e-05,
148
+ "loss": 2.0892,
149
+ "step": 650
150
+ },
151
+ {
152
+ "epoch": 0.6222222222222222,
153
+ "grad_norm": 14.0,
154
+ "learning_rate": 4.3539641925879495e-05,
155
+ "loss": 2.0352,
156
+ "step": 700
157
+ },
158
+ {
159
+ "epoch": 0.6222222222222222,
160
+ "eval_loss": 1.9995155334472656,
161
+ "eval_runtime": 48.8752,
162
+ "eval_samples_per_second": 20.46,
163
+ "eval_steps_per_second": 10.23,
164
+ "step": 700
165
+ },
166
+ {
167
+ "epoch": 0.6666666666666666,
168
+ "grad_norm": 13.25,
169
+ "learning_rate": 4.2184231544782596e-05,
170
+ "loss": 2.0117,
171
+ "step": 750
172
+ },
173
+ {
174
+ "epoch": 0.7111111111111111,
175
+ "grad_norm": 13.25,
176
+ "learning_rate": 4.072547317320281e-05,
177
+ "loss": 1.9848,
178
+ "step": 800
179
+ },
180
+ {
181
+ "epoch": 0.7111111111111111,
182
+ "eval_loss": 1.9770301580429077,
183
+ "eval_runtime": 48.7445,
184
+ "eval_samples_per_second": 20.515,
185
+ "eval_steps_per_second": 10.258,
186
+ "step": 800
187
+ },
188
+ {
189
+ "epoch": 0.7555555555555555,
190
+ "grad_norm": 13.375,
191
+ "learning_rate": 3.9172139955630774e-05,
192
+ "loss": 2.0014,
193
+ "step": 850
194
+ },
195
+ {
196
+ "epoch": 0.8,
197
+ "grad_norm": 12.75,
198
+ "learning_rate": 3.7533573820809006e-05,
199
+ "loss": 1.9938,
200
+ "step": 900
201
+ },
202
+ {
203
+ "epoch": 0.8,
204
+ "eval_loss": 1.9461404085159302,
205
+ "eval_runtime": 48.8094,
206
+ "eval_samples_per_second": 20.488,
207
+ "eval_steps_per_second": 10.244,
208
+ "step": 900
209
+ },
210
+ {
211
+ "epoch": 0.8444444444444444,
212
+ "grad_norm": 13.8125,
213
+ "learning_rate": 3.5819629298273245e-05,
214
+ "loss": 1.9914,
215
+ "step": 950
216
+ },
217
+ {
218
+ "epoch": 0.8888888888888888,
219
+ "grad_norm": 13.6875,
220
+ "learning_rate": 3.4040614252052305e-05,
221
+ "loss": 1.9724,
222
+ "step": 1000
223
+ },
224
+ {
225
+ "epoch": 0.8888888888888888,
226
+ "eval_loss": 1.9275351762771606,
227
+ "eval_runtime": 48.8023,
228
+ "eval_samples_per_second": 20.491,
229
+ "eval_steps_per_second": 10.245,
230
+ "step": 1000
231
+ },
232
+ {
233
+ "epoch": 0.9333333333333333,
234
+ "grad_norm": 13.25,
235
+ "learning_rate": 3.2207227887960935e-05,
236
+ "loss": 1.9607,
237
+ "step": 1050
238
+ },
239
+ {
240
+ "epoch": 0.9777777777777777,
241
+ "grad_norm": 15.375,
242
+ "learning_rate": 3.033049640731711e-05,
243
+ "loss": 1.9418,
244
+ "step": 1100
245
+ },
246
+ {
247
+ "epoch": 0.9777777777777777,
248
+ "eval_loss": 1.9146583080291748,
249
+ "eval_runtime": 48.7562,
250
+ "eval_samples_per_second": 20.51,
251
+ "eval_steps_per_second": 10.255,
252
+ "step": 1100
253
+ },
254
+ {
255
+ "epoch": 1.0222222222222221,
256
+ "grad_norm": 11.875,
257
+ "learning_rate": 2.8421706694069926e-05,
258
+ "loss": 1.8229,
259
+ "step": 1150
260
+ },
261
+ {
262
+ "epoch": 1.0666666666666667,
263
+ "grad_norm": 13.625,
264
+ "learning_rate": 2.649233843415149e-05,
265
+ "loss": 1.6831,
266
+ "step": 1200
267
+ },
268
+ {
269
+ "epoch": 1.0666666666666667,
270
+ "eval_loss": 1.9151355028152466,
271
+ "eval_runtime": 48.9802,
272
+ "eval_samples_per_second": 20.416,
273
+ "eval_steps_per_second": 10.208,
274
+ "step": 1200
275
+ },
276
+ {
277
+ "epoch": 1.1111111111111112,
278
+ "grad_norm": 12.625,
279
+ "learning_rate": 2.4553995075294933e-05,
280
+ "loss": 1.6801,
281
+ "step": 1250
282
+ },
283
+ {
284
+ "epoch": 1.1555555555555554,
285
+ "grad_norm": 14.5,
286
+ "learning_rate": 2.2618334042534464e-05,
287
+ "loss": 1.7079,
288
+ "step": 1300
289
+ },
290
+ {
291
+ "epoch": 1.1555555555555554,
292
+ "eval_loss": 1.9095146656036377,
293
+ "eval_runtime": 48.6949,
294
+ "eval_samples_per_second": 20.536,
295
+ "eval_steps_per_second": 10.268,
296
+ "step": 1300
297
+ },
298
+ {
299
+ "epoch": 1.2,
300
+ "grad_norm": 11.75,
301
+ "learning_rate": 2.0696996629079526e-05,
302
+ "loss": 1.6667,
303
+ "step": 1350
304
+ },
305
+ {
306
+ "epoch": 1.2444444444444445,
307
+ "grad_norm": 13.375,
308
+ "learning_rate": 1.880153798420768e-05,
309
+ "loss": 1.6615,
310
+ "step": 1400
311
+ },
312
+ {
313
+ "epoch": 1.2444444444444445,
314
+ "eval_loss": 1.9029484987258911,
315
+ "eval_runtime": 48.6447,
316
+ "eval_samples_per_second": 20.557,
317
+ "eval_steps_per_second": 10.279,
318
+ "step": 1400
319
+ },
320
+ {
321
+ "epoch": 1.2888888888888888,
322
+ "grad_norm": 15.3125,
323
+ "learning_rate": 1.6943357619237226e-05,
324
+ "loss": 1.6417,
325
+ "step": 1450
326
+ },
327
+ {
328
+ "epoch": 1.3333333333333333,
329
+ "grad_norm": 12.75,
330
+ "learning_rate": 1.5133630849524793e-05,
331
+ "loss": 1.6729,
332
+ "step": 1500
333
+ },
334
+ {
335
+ "epoch": 1.3333333333333333,
336
+ "eval_loss": 1.8957328796386719,
337
+ "eval_runtime": 48.7052,
338
+ "eval_samples_per_second": 20.532,
339
+ "eval_steps_per_second": 10.266,
340
+ "step": 1500
341
+ },
342
+ {
343
+ "epoch": 1.3777777777777778,
344
+ "grad_norm": 12.8125,
345
+ "learning_rate": 1.3383241584803884e-05,
346
+ "loss": 1.6703,
347
+ "step": 1550
348
+ },
349
+ {
350
+ "epoch": 1.4222222222222223,
351
+ "grad_norm": 12.3125,
352
+ "learning_rate": 1.170271687207106e-05,
353
+ "loss": 1.6666,
354
+ "step": 1600
355
+ },
356
+ {
357
+ "epoch": 1.4222222222222223,
358
+ "eval_loss": 1.89302659034729,
359
+ "eval_runtime": 48.6027,
360
+ "eval_samples_per_second": 20.575,
361
+ "eval_steps_per_second": 10.287,
362
+ "step": 1600
363
+ },
364
+ {
365
+ "epoch": 1.4666666666666668,
366
+ "grad_norm": 15.5,
367
+ "learning_rate": 1.010216358468665e-05,
368
+ "loss": 1.6447,
369
+ "step": 1650
370
+ },
371
+ {
372
+ "epoch": 1.511111111111111,
373
+ "grad_norm": 12.625,
374
+ "learning_rate": 8.591207638449154e-06,
375
+ "loss": 1.6755,
376
+ "step": 1700
377
+ },
378
+ {
379
+ "epoch": 1.511111111111111,
380
+ "eval_loss": 1.891126275062561,
381
+ "eval_runtime": 48.5903,
382
+ "eval_samples_per_second": 20.58,
383
+ "eval_steps_per_second": 10.29,
384
+ "step": 1700
385
+ },
386
+ {
387
+ "epoch": 1.5555555555555556,
388
+ "grad_norm": 14.0625,
389
+ "learning_rate": 7.178936100204994e-06,
390
+ "loss": 1.6474,
391
+ "step": 1750
392
+ },
393
+ {
394
+ "epoch": 1.6,
395
+ "grad_norm": 12.6875,
396
+ "learning_rate": 5.873842537159274e-06,
397
+ "loss": 1.66,
398
+ "step": 1800
399
+ },
400
+ {
401
+ "epoch": 1.6,
402
+ "eval_loss": 1.8891392946243286,
403
+ "eval_runtime": 48.6956,
404
+ "eval_samples_per_second": 20.536,
405
+ "eval_steps_per_second": 10.268,
406
+ "step": 1800
407
+ },
408
+ {
409
+ "epoch": 1.6444444444444444,
410
+ "grad_norm": 14.0625,
411
+ "learning_rate": 4.683775935563222e-06,
412
+ "loss": 1.682,
413
+ "step": 1850
414
+ },
415
+ {
416
+ "epoch": 1.6888888888888889,
417
+ "grad_norm": 14.6875,
418
+ "learning_rate": 3.6158934959873353e-06,
419
+ "loss": 1.6299,
420
+ "step": 1900
421
+ },
422
+ {
423
+ "epoch": 1.6888888888888889,
424
+ "eval_loss": 1.8877633810043335,
425
+ "eval_runtime": 48.685,
426
+ "eval_samples_per_second": 20.54,
427
+ "eval_steps_per_second": 10.27,
428
+ "step": 1900
429
+ },
430
+ {
431
+ "epoch": 1.7333333333333334,
432
+ "grad_norm": 11.875,
433
+ "learning_rate": 2.6766175890749786e-06,
434
+ "loss": 1.6601,
435
+ "step": 1950
436
+ },
437
+ {
438
+ "epoch": 1.7777777777777777,
439
+ "grad_norm": 13.625,
440
+ "learning_rate": 1.8715971306496745e-06,
441
+ "loss": 1.6286,
442
+ "step": 2000
443
+ },
444
+ {
445
+ "epoch": 1.7777777777777777,
446
+ "eval_loss": 1.8876391649246216,
447
+ "eval_runtime": 48.4452,
448
+ "eval_samples_per_second": 20.642,
449
+ "eval_steps_per_second": 10.321,
450
+ "step": 2000
451
+ },
452
+ {
453
+ "epoch": 1.8222222222222222,
454
+ "grad_norm": 11.625,
455
+ "learning_rate": 1.2056736084706589e-06,
456
+ "loss": 1.6788,
457
+ "step": 2050
458
+ },
459
+ {
460
+ "epoch": 1.8666666666666667,
461
+ "grad_norm": 13.125,
462
+ "learning_rate": 6.828519649558191e-07,
463
+ "loss": 1.6811,
464
+ "step": 2100
465
+ },
466
+ {
467
+ "epoch": 1.8666666666666667,
468
+ "eval_loss": 1.8877239227294922,
469
+ "eval_runtime": 48.6643,
470
+ "eval_samples_per_second": 20.549,
471
+ "eval_steps_per_second": 10.274,
472
+ "step": 2100
473
+ },
474
+ {
475
+ "epoch": 1.911111111111111,
476
+ "grad_norm": 12.1875,
477
+ "learning_rate": 3.062765109867499e-07,
478
+ "loss": 1.6562,
479
+ "step": 2150
480
+ },
481
+ {
482
+ "epoch": 1.9555555555555557,
483
+ "grad_norm": 13.4375,
484
+ "learning_rate": 7.821201565316182e-08,
485
+ "loss": 1.6527,
486
+ "step": 2200
487
+ },
488
+ {
489
+ "epoch": 1.9555555555555557,
490
+ "eval_loss": 1.8878294229507446,
491
+ "eval_runtime": 48.65,
492
+ "eval_samples_per_second": 20.555,
493
+ "eval_steps_per_second": 10.277,
494
+ "step": 2200
495
+ },
496
+ {
497
+ "epoch": 2.0,
498
+ "grad_norm": 13.75,
499
+ "learning_rate": 3.008566505646737e-11,
500
+ "loss": 1.6683,
501
+ "step": 2250
502
+ }
503
+ ],
504
+ "logging_steps": 50,
505
+ "max_steps": 2250,
506
+ "num_input_tokens_seen": 0,
507
+ "num_train_epochs": 2,
508
+ "save_steps": 200,
509
+ "stateful_callbacks": {
510
+ "TrainerControl": {
511
+ "args": {
512
+ "should_epoch_stop": false,
513
+ "should_evaluate": false,
514
+ "should_log": false,
515
+ "should_save": true,
516
+ "should_training_stop": true
517
+ },
518
+ "attributes": {}
519
+ }
520
+ },
521
+ "total_flos": 2773813690368000.0,
522
+ "train_batch_size": 2,
523
+ "trial_name": null,
524
+ "trial_params": null
525
+ }
checkpoint-2250/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51ff9f61bcdaae5ac53531ca1cc31aab96bc0e3a0c0ccd88418c58ca3f44c82d
3
+ size 5304
config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_sliding_window_pattern": 6,
3
+ "architectures": [
4
+ "Gemma3ForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "attn_logit_softcapping": null,
9
+ "bos_token_id": 2,
10
+ "cache_implementation": "hybrid",
11
+ "eos_token_id": 1,
12
+ "final_logit_softcapping": null,
13
+ "head_dim": 256,
14
+ "hidden_activation": "gelu_pytorch_tanh",
15
+ "hidden_size": 640,
16
+ "initializer_range": 0.02,
17
+ "intermediate_size": 2048,
18
+ "layer_types": [
19
+ "sliding_attention",
20
+ "sliding_attention",
21
+ "sliding_attention",
22
+ "sliding_attention",
23
+ "sliding_attention",
24
+ "full_attention",
25
+ "sliding_attention",
26
+ "sliding_attention",
27
+ "sliding_attention",
28
+ "sliding_attention",
29
+ "sliding_attention",
30
+ "full_attention",
31
+ "sliding_attention",
32
+ "sliding_attention",
33
+ "sliding_attention",
34
+ "sliding_attention",
35
+ "sliding_attention",
36
+ "full_attention"
37
+ ],
38
+ "max_position_embeddings": 32768,
39
+ "model_type": "gemma3_text",
40
+ "num_attention_heads": 4,
41
+ "num_hidden_layers": 18,
42
+ "num_key_value_heads": 1,
43
+ "pad_token_id": 0,
44
+ "query_pre_attn_scalar": 256,
45
+ "rms_norm_eps": 1e-06,
46
+ "rope_local_base_freq": 10000.0,
47
+ "rope_scaling": null,
48
+ "rope_theta": 1000000.0,
49
+ "sliding_window": 512,
50
+ "sliding_window_pattern": 6,
51
+ "torch_dtype": "bfloat16",
52
+ "transformers_version": "4.52.4",
53
+ "use_bidirectional_attention": false,
54
+ "use_cache": true,
55
+ "vocab_size": 262144
56
+ }
generation_config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cache_implementation": "hybrid",
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 1,
6
+ 106
7
+ ],
8
+ "top_k": 64,
9
+ "top_p": 0.95,
10
+ "transformers_version": "4.52.4"
11
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7204dfb6f3a944a032ca5ed20d71c923a4fed2c3b11dfabc2c1ff9f2fabe8af8
3
+ size 536223056
special_tokens_map.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "boi_token": "<start_of_image>",
3
+ "bos_token": {
4
+ "content": "<bos>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ "eoi_token": "<end_of_image>",
11
+ "eos_token": {
12
+ "content": "<eos>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false
17
+ },
18
+ "image_token": "<image_soft_token>",
19
+ "pad_token": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false
25
+ },
26
+ "unk_token": {
27
+ "content": "<unk>",
28
+ "lstrip": false,
29
+ "normalized": false,
30
+ "rstrip": false,
31
+ "single_word": false
32
+ }
33
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e570d6288ff0afcea981a80492eddfa3e2239a79de89e5074cbb74b548fa5e2b
3
+ size 33384833