hazentr committed on
Commit
fc00c82
·
verified ·
1 Parent(s): 616d559

End of training

Browse files
README.md CHANGED
@@ -1,20 +1,20 @@
1
  ---
2
- base_model: Gensyn/Qwen2.5-0.5B-Instruct
3
  library_name: transformers
4
  model_name: Qwen2.5-0.5B-Instruct-Gensyn-Swarm-quick_timid_frog
5
  tags:
6
  - generated_from_trainer
7
- - rl-swarm
8
  - grpo
9
  - gensyn
10
  - I am quick timid frog
11
  - trl
 
12
  licence: license
13
  ---
14
 
15
  # Model Card for Qwen2.5-0.5B-Instruct-Gensyn-Swarm-quick_timid_frog
16
 
17
- This model is a fine-tuned version of [Gensyn/Qwen2.5-0.5B-Instruct](https://huggingface.co/Gensyn/Qwen2.5-0.5B-Instruct).
18
  It has been trained using [TRL](https://github.com/huggingface/trl).
19
 
20
  ## Quick start
@@ -37,10 +37,10 @@ This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing
37
 
38
  ### Framework versions
39
 
40
- - TRL: 0.15.2
41
- - Transformers: 4.51.2
42
- - Pytorch: 2.5.1
43
- - Datasets: 3.5.0
44
  - Tokenizers: 0.21.1
45
 
46
  ## Citations
@@ -62,7 +62,7 @@ Cite TRL as:
62
  ```bibtex
63
  @misc{vonwerra2022trl,
64
  title = {{TRL: Transformer Reinforcement Learning}},
65
- author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
66
  year = 2020,
67
  journal = {GitHub repository},
68
  publisher = {GitHub},
 
1
  ---
2
+ base_model: unsloth/Qwen2.5-0.5B-Instruct
3
  library_name: transformers
4
  model_name: Qwen2.5-0.5B-Instruct-Gensyn-Swarm-quick_timid_frog
5
  tags:
6
  - generated_from_trainer
 
7
  - grpo
8
  - gensyn
9
  - I am quick timid frog
10
  - trl
11
+ - rl-swarm
12
  licence: license
13
  ---
14
 
15
  # Model Card for Qwen2.5-0.5B-Instruct-Gensyn-Swarm-quick_timid_frog
16
 
17
+ This model is a fine-tuned version of [unsloth/Qwen2.5-0.5B-Instruct](https://huggingface.co/unsloth/Qwen2.5-0.5B-Instruct).
18
  It has been trained using [TRL](https://github.com/huggingface/trl).
19
 
20
  ## Quick start
 
37
 
38
  ### Framework versions
39
 
40
+ - TRL: 0.19.0
41
+ - Transformers: 4.52.4
42
+ - Pytorch: 2.7.1
43
+ - Datasets: 3.6.0
44
  - Tokenizers: 0.21.1
45
 
46
  ## Citations
 
62
  ```bibtex
63
  @misc{vonwerra2022trl,
64
  title = {{TRL: Transformer Reinforcement Learning}},
65
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
66
  year = 2020,
67
  journal = {GitHub repository},
68
  publisher = {GitHub},
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.12956260753126117,
4
- "train_runtime": 123.8534,
5
- "train_samples": 28,
6
- "train_samples_per_second": 2.584,
7
- "train_steps_per_second": 0.161
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.03613492660224438,
4
+ "train_runtime": 1374.3618,
5
+ "train_samples": 83,
6
+ "train_samples_per_second": 0.116,
7
+ "train_steps_per_second": 0.015
8
  }
chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
config.json CHANGED
@@ -3,7 +3,6 @@
3
  "Qwen2ForCausalLM"
4
  ],
5
  "attention_dropout": 0.0,
6
- "bos_token_id": 151643,
7
  "eos_token_id": 151645,
8
  "hidden_act": "silu",
9
  "hidden_size": 896,
@@ -15,13 +14,15 @@
15
  "num_attention_heads": 14,
16
  "num_hidden_layers": 24,
17
  "num_key_value_heads": 2,
 
18
  "rms_norm_eps": 1e-06,
19
  "rope_scaling": null,
20
  "rope_theta": 1000000.0,
21
- "sliding_window": 32768,
22
  "tie_word_embeddings": true,
23
  "torch_dtype": "float32",
24
- "transformers_version": "4.51.2",
 
25
  "use_cache": true,
26
  "use_sliding_window": false,
27
  "vocab_size": 151936
 
3
  "Qwen2ForCausalLM"
4
  ],
5
  "attention_dropout": 0.0,
 
6
  "eos_token_id": 151645,
7
  "hidden_act": "silu",
8
  "hidden_size": 896,
 
14
  "num_attention_heads": 14,
15
  "num_hidden_layers": 24,
16
  "num_key_value_heads": 2,
17
+ "pad_token_id": 151654,
18
  "rms_norm_eps": 1e-06,
19
  "rope_scaling": null,
20
  "rope_theta": 1000000.0,
21
+ "sliding_window": null,
22
  "tie_word_embeddings": true,
23
  "torch_dtype": "float32",
24
+ "transformers_version": "4.52.4",
25
+ "unsloth_fixed": true,
26
  "use_cache": true,
27
  "use_sliding_window": false,
28
  "vocab_size": 151936
generation_config.json CHANGED
@@ -5,10 +5,11 @@
5
  151645,
6
  151643
7
  ],
8
- "pad_token_id": 151643,
 
9
  "repetition_penalty": 1.1,
10
  "temperature": 0.7,
11
  "top_k": 20,
12
  "top_p": 0.8,
13
- "transformers_version": "4.51.2"
14
  }
 
5
  151645,
6
  151643
7
  ],
8
+ "max_length": 32768,
9
+ "pad_token_id": 151654,
10
  "repetition_penalty": 1.1,
11
  "temperature": 0.7,
12
  "top_k": 20,
13
  "top_p": 0.8,
14
+ "transformers_version": "4.52.4"
15
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:19b9b6f34a86f4c7236d477f7ba60b1c001b120b58c0abea0fa85431e70e8c5c
3
  size 1976163472
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:866a08c94c86a17ef6575016471b9aaf5cca32cdb45de267c72663a76249e8ca
3
  size 1976163472
special_tokens_map.json CHANGED
@@ -22,7 +22,7 @@
22
  "single_word": false
23
  },
24
  "pad_token": {
25
- "content": "<|endoftext|>",
26
  "lstrip": false,
27
  "normalized": false,
28
  "rstrip": false,
 
22
  "single_word": false
23
  },
24
  "pad_token": {
25
+ "content": "<|vision_pad|>",
26
  "lstrip": false,
27
  "normalized": false,
28
  "rstrip": false,
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5eee858c5123a4279c3e1f7b81247343f356ac767940b2692a928ad929543214
3
- size 11422063
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64e71213db910f5cafa86d35091f37393dcc344b1bbc34091d1b3eed4cca01d5
3
+ size 11422064
tokenizer_config.json CHANGED
@@ -195,13 +195,13 @@
195
  "<|video_pad|>"
196
  ],
197
  "bos_token": null,
198
- "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. 
You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
199
  "clean_up_tokenization_spaces": false,
200
  "eos_token": "<|im_end|>",
201
  "errors": "replace",
202
  "extra_special_tokens": {},
203
- "model_max_length": 131072,
204
- "pad_token": "<|endoftext|>",
 
205
  "split_special_tokens": false,
206
  "tokenizer_class": "Qwen2Tokenizer",
207
  "unk_token": null
 
195
  "<|video_pad|>"
196
  ],
197
  "bos_token": null,
 
198
  "clean_up_tokenization_spaces": false,
199
  "eos_token": "<|im_end|>",
200
  "errors": "replace",
201
  "extra_special_tokens": {},
202
+ "model_max_length": 32768,
203
+ "pad_token": "<|vision_pad|>",
204
+ "padding_side": "left",
205
  "split_special_tokens": false,
206
  "tokenizer_class": "Qwen2Tokenizer",
207
  "unk_token": null
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.12956260753126117,
4
- "train_runtime": 123.8534,
5
- "train_samples": 28,
6
- "train_samples_per_second": 2.584,
7
- "train_steps_per_second": 0.161
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.03613492660224438,
4
+ "train_runtime": 1374.3618,
5
+ "train_samples": 83,
6
+ "train_samples_per_second": 0.116,
7
+ "train_steps_per_second": 0.015
8
  }
trainer_state.json CHANGED
@@ -2,7 +2,7 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 5.0,
6
  "eval_steps": 500,
7
  "global_step": 20,
8
  "is_hyper_param_search": false,
@@ -10,209 +10,419 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "completion_length": 249.875,
14
- "epoch": 0.5714285714285714,
15
- "grad_norm": 47.85331726074219,
16
- "kl": 0.0,
 
 
 
 
 
 
 
 
 
 
 
 
17
  "learning_rate": 5e-07,
18
- "loss": 0.0,
19
- "reward": 3.0323707722127438,
20
- "reward_std": 0.8131099180318415,
21
- "rewards/concensus_correctness_reward_func": 0.6197499856352806,
22
- "rewards/consensus_reward_func": 0.875,
23
- "rewards/cumulative_reward_2": 0.0,
24
- "rewards/final_correctness_reward_func": 0.0,
25
- "rewards/question_recreation_reward_func": 0.6093395496718585,
26
- "rewards/soft_format_reward_func": 0.015625,
27
- "rewards/strict_format_reward_func": 0.15625,
28
- "rewards/xmlcount_reward_func": 0.7564062550663948,
 
 
 
 
 
 
 
 
 
29
  "step": 2
30
  },
31
  {
32
- "completion_length": 149.91666666666666,
33
- "epoch": 1.0,
34
- "grad_norm": 103.94192504882812,
35
- "kl": 0.13347215698255846,
 
 
 
 
 
 
 
 
 
 
 
 
36
  "learning_rate": 4.864543104251586e-07,
37
- "loss": 0.0001,
38
- "reward": 5.739850004514058,
39
- "reward_std": 0.6970982489486536,
40
- "rewards/concensus_correctness_reward_func": 1.7566666553417842,
41
- "rewards/consensus_reward_func": 1.5833333333333333,
42
- "rewards/cumulative_reward_2": 0.0,
43
- "rewards/final_correctness_reward_func": 0.08333333333333333,
44
- "rewards/question_recreation_reward_func": 0.8079750432322422,
45
- "rewards/soft_format_reward_func": 0.0,
46
- "rewards/strict_format_reward_func": 0.3333333333333333,
47
- "rewards/xmlcount_reward_func": 1.175208330154419,
 
 
 
 
 
 
 
 
 
48
  "step": 4
49
  },
50
  {
51
- "completion_length": 158.9375,
52
- "epoch": 1.5714285714285714,
53
- "grad_norm": 89.15827941894531,
54
- "kl": 1.9806956076063216,
 
 
 
 
 
 
 
 
 
 
 
 
55
  "learning_rate": 4.472851273490984e-07,
56
- "loss": 0.002,
57
- "reward": 5.2741532772779465,
58
- "reward_std": 0.8553773319144966,
59
- "rewards/concensus_correctness_reward_func": 1.6266249865293503,
60
- "rewards/consensus_reward_func": 1.5,
61
- "rewards/cumulative_reward_2": 0.0,
62
- "rewards/final_correctness_reward_func": 0.0625,
63
- "rewards/question_recreation_reward_func": 0.7314033512957394,
64
- "rewards/soft_format_reward_func": 0.0,
65
- "rewards/strict_format_reward_func": 0.234375,
66
- "rewards/xmlcount_reward_func": 1.1192499995231628,
 
 
 
 
 
 
 
 
 
67
  "step": 6
68
  },
69
  {
70
- "completion_length": 211.54166666666666,
71
- "epoch": 2.0,
72
- "grad_norm": 115.98817443847656,
73
- "kl": 8.089183079699675,
 
 
 
 
 
 
 
 
 
 
 
 
74
  "learning_rate": 3.867370395306068e-07,
75
- "loss": 0.0061,
76
- "reward": 4.624983638525009,
77
- "reward_std": 0.8529471913352609,
78
- "rewards/concensus_correctness_reward_func": 1.2500833123922348,
79
- "rewards/consensus_reward_func": 1.5,
80
- "rewards/cumulative_reward_2": 0.0,
81
- "rewards/final_correctness_reward_func": 0.0,
82
- "rewards/question_recreation_reward_func": 0.6848169888059298,
83
- "rewards/soft_format_reward_func": 0.0,
84
- "rewards/strict_format_reward_func": 0.14583333333333334,
85
- "rewards/xmlcount_reward_func": 1.0442500114440918,
 
 
 
 
 
 
 
 
 
86
  "step": 8
87
  },
88
  {
89
- "completion_length": 158.53125,
90
- "epoch": 2.571428571428571,
91
- "grad_norm": 128.32345581054688,
92
- "kl": 10.754117728210986,
 
 
 
 
 
 
 
 
 
 
 
 
93
  "learning_rate": 3.1137137178519977e-07,
94
- "loss": 0.0108,
95
- "reward": 4.959709584712982,
96
- "reward_std": 1.3167340854997747,
97
- "rewards/concensus_correctness_reward_func": 1.4026249796152115,
98
- "rewards/consensus_reward_func": 1.3125,
99
- "rewards/cumulative_reward_2": 0.0,
100
- "rewards/final_correctness_reward_func": 0.0625,
101
- "rewards/question_recreation_reward_func": 0.7500846465118229,
102
- "rewards/soft_format_reward_func": 0.0,
103
- "rewards/strict_format_reward_func": 0.296875,
104
- "rewards/xmlcount_reward_func": 1.135124996304512,
 
 
 
 
 
 
 
 
 
105
  "step": 10
106
  },
107
  {
108
- "completion_length": 184.33333333333334,
109
- "epoch": 3.0,
110
- "grad_norm": 24.66672706604004,
111
- "kl": 3.1319144380589328,
 
 
 
 
 
 
 
 
 
 
 
 
112
  "learning_rate": 2.2935516363191693e-07,
113
- "loss": 0.0023,
114
- "reward": 5.048190355300903,
115
- "reward_std": 0.7422210735579332,
116
- "rewards/concensus_correctness_reward_func": 1.428333322207133,
117
- "rewards/consensus_reward_func": 1.6666666666666667,
118
- "rewards/cumulative_reward_2": 0.0,
119
- "rewards/final_correctness_reward_func": 0.0,
120
- "rewards/question_recreation_reward_func": 0.7871903777122498,
121
- "rewards/soft_format_reward_func": 0.0,
122
- "rewards/strict_format_reward_func": 0.20833333333333334,
123
- "rewards/xmlcount_reward_func": 0.9576666702826818,
 
 
 
 
 
 
 
 
 
124
  "step": 12
125
  },
126
  {
127
- "completion_length": 210.53125,
128
- "epoch": 3.571428571428571,
129
- "grad_norm": 440.6236267089844,
130
- "kl": 37.00089144241065,
 
 
 
 
 
 
 
 
 
 
 
 
131
  "learning_rate": 1.4957614383675767e-07,
132
- "loss": 0.037,
133
- "reward": 4.599987611174583,
134
- "reward_std": 1.3417337444698205,
135
- "rewards/concensus_correctness_reward_func": 1.2383124865591526,
136
- "rewards/consensus_reward_func": 1.4375,
137
- "rewards/cumulative_reward_2": 0.0,
138
- "rewards/final_correctness_reward_func": 0.0,
139
- "rewards/question_recreation_reward_func": 0.7561751045286655,
140
- "rewards/soft_format_reward_func": 0.0,
141
- "rewards/strict_format_reward_func": 0.203125,
142
- "rewards/xmlcount_reward_func": 0.9648750014603138,
 
 
 
 
 
 
 
 
 
143
  "step": 14
144
  },
145
  {
146
- "completion_length": 165.45833333333334,
147
- "epoch": 4.0,
148
- "grad_norm": 884.29345703125,
149
- "kl": 153.8786713940402,
 
 
 
 
 
 
 
 
 
 
 
 
150
  "learning_rate": 8.067960709356478e-08,
151
- "loss": 0.1154,
152
- "reward": 5.407021721204122,
153
- "reward_std": 0.9267256280872971,
154
- "rewards/concensus_correctness_reward_func": 1.5748333086570103,
155
- "rewards/consensus_reward_func": 1.5,
156
- "rewards/cumulative_reward_2": 0.0,
157
- "rewards/final_correctness_reward_func": 0.16666666666666666,
158
- "rewards/question_recreation_reward_func": 0.7176050413399935,
159
- "rewards/soft_format_reward_func": 0.0,
160
- "rewards/strict_format_reward_func": 0.3125,
161
- "rewards/xmlcount_reward_func": 1.1354166666666667,
 
 
 
 
 
 
 
 
 
162
  "step": 16
163
  },
164
  {
165
- "completion_length": 167.03125,
166
- "epoch": 4.571428571428571,
167
- "grad_norm": 433.138916015625,
168
- "kl": 51.80729316617362,
 
 
 
 
 
 
 
 
 
 
 
 
169
  "learning_rate": 3.013156219837776e-08,
170
- "loss": 0.0518,
171
- "reward": 4.95432373136282,
172
- "reward_std": 0.6955515777153778,
173
- "rewards/concensus_correctness_reward_func": 1.3982499837875366,
174
- "rewards/consensus_reward_func": 1.5,
175
- "rewards/cumulative_reward_2": 0.0,
176
- "rewards/final_correctness_reward_func": 0.0,
177
- "rewards/question_recreation_reward_func": 0.7722300551831722,
178
- "rewards/soft_format_reward_func": 0.0,
179
- "rewards/strict_format_reward_func": 0.25,
180
- "rewards/xmlcount_reward_func": 1.0338437519967556,
 
 
 
 
 
 
 
 
 
181
  "step": 18
182
  },
183
  {
184
- "completion_length": 213.45833333333334,
185
- "epoch": 5.0,
186
- "grad_norm": 30.303447723388672,
187
- "kl": 1426.8845755159855,
 
 
 
 
 
 
 
 
 
 
 
 
188
  "learning_rate": 3.4096741493194193e-09,
189
- "loss": 1.0702,
190
- "reward": 4.796473105748494,
191
- "reward_std": 0.7849306451777617,
192
- "rewards/concensus_correctness_reward_func": 1.431666652361552,
193
- "rewards/consensus_reward_func": 1.4166666666666667,
194
- "rewards/cumulative_reward_2": 0.0,
195
- "rewards/final_correctness_reward_func": 0.0,
196
- "rewards/question_recreation_reward_func": 0.7149731454749902,
197
- "rewards/soft_format_reward_func": 0.0,
198
- "rewards/strict_format_reward_func": 0.25,
199
- "rewards/xmlcount_reward_func": 0.9831666549046835,
 
 
 
 
 
 
 
 
 
200
  "step": 20
201
  },
202
  {
203
- "epoch": 5.0,
204
  "step": 20,
205
  "total_flos": 0.0,
206
- "train_loss": 0.12956260753126117,
207
- "train_runtime": 123.8534,
208
- "train_samples_per_second": 2.584,
209
- "train_steps_per_second": 0.161
210
  }
211
  ],
212
  "logging_steps": 2,
213
  "max_steps": 20,
214
- "num_input_tokens_seen": 0,
215
- "num_train_epochs": 7,
216
  "save_steps": 25,
217
  "stateful_callbacks": {
218
  "TrainerControl": {
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.963855421686747,
6
  "eval_steps": 500,
7
  "global_step": 20,
8
  "is_hyper_param_search": false,
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "clip_ratio/high_max": 0.0,
14
+ "clip_ratio/high_mean": 0.0,
15
+ "clip_ratio/low_mean": 0.0,
16
+ "clip_ratio/low_min": 0.0,
17
+ "clip_ratio/region_mean": 0.0,
18
+ "completions/clipped_ratio": 0.0625,
19
+ "completions/max_length": 768.5,
20
+ "completions/max_terminated_length": 544.0,
21
+ "completions/mean_length": 282.8125,
22
+ "completions/mean_terminated_length": 233.37500762939453,
23
+ "completions/min_length": 35.5,
24
+ "completions/min_terminated_length": 35.5,
25
+ "epoch": 0.0963855421686747,
26
+ "frac_reward_zero_std": 0.125,
27
+ "grad_norm": 113.29230499267578,
28
+ "kl": -6.7503988510075885e-09,
29
  "learning_rate": 5e-07,
30
+ "loss": -0.0304,
31
+ "num_tokens": 8621.0,
32
+ "reward": 0.05961298104375601,
33
+ "reward_std": 0.021443639416247606,
34
+ "rewards/concensus_correctness_reward_func/mean": 0.0,
35
+ "rewards/concensus_correctness_reward_func/std": 0.0,
36
+ "rewards/consensus_reward_func/mean": 0.0,
37
+ "rewards/consensus_reward_func/std": 0.0,
38
+ "rewards/cumulative_reward_2/mean": 0.0,
39
+ "rewards/cumulative_reward_2/std": 0.0,
40
+ "rewards/final_correctness_reward_func/mean": 0.0,
41
+ "rewards/final_correctness_reward_func/std": 0.0,
42
+ "rewards/question_recreation_reward_func/mean": 0.05961298104375601,
43
+ "rewards/question_recreation_reward_func/std": 0.03218379570171237,
44
+ "rewards/soft_format_reward_func/mean": 0.0,
45
+ "rewards/soft_format_reward_func/std": 0.0,
46
+ "rewards/strict_format_reward_func/mean": 0.0,
47
+ "rewards/strict_format_reward_func/std": 0.0,
48
+ "rewards/xmlcount_reward_func/mean": 0.0,
49
+ "rewards/xmlcount_reward_func/std": 0.0,
50
  "step": 2
51
  },
52
  {
53
+ "clip_ratio/high_max": 0.0,
54
+ "clip_ratio/high_mean": 0.0,
55
+ "clip_ratio/low_mean": 0.0,
56
+ "clip_ratio/low_min": 0.0,
57
+ "clip_ratio/region_mean": 0.0,
58
+ "completions/clipped_ratio": 0.0,
59
+ "completions/max_length": 881.0,
60
+ "completions/max_terminated_length": 881.0,
61
+ "completions/mean_length": 393.1875,
62
+ "completions/mean_terminated_length": 393.1875,
63
+ "completions/min_length": 62.0,
64
+ "completions/min_terminated_length": 62.0,
65
+ "epoch": 0.1927710843373494,
66
+ "frac_reward_zero_std": 0.0,
67
+ "grad_norm": 6.477648735046387,
68
+ "kl": 0.00010615352056220217,
69
  "learning_rate": 4.864543104251586e-07,
70
+ "loss": -0.0486,
71
+ "num_tokens": 19008.0,
72
+ "reward": 0.018797683529555798,
73
+ "reward_std": 0.007865483523346484,
74
+ "rewards/concensus_correctness_reward_func/mean": 0.0,
75
+ "rewards/concensus_correctness_reward_func/std": 0.0,
76
+ "rewards/consensus_reward_func/mean": 0.0,
77
+ "rewards/consensus_reward_func/std": 0.0,
78
+ "rewards/cumulative_reward_2/mean": 0.0,
79
+ "rewards/cumulative_reward_2/std": 0.0,
80
+ "rewards/final_correctness_reward_func/mean": 0.0,
81
+ "rewards/final_correctness_reward_func/std": 0.0,
82
+ "rewards/question_recreation_reward_func/mean": 0.018797683529555798,
83
+ "rewards/question_recreation_reward_func/std": 0.00972810504026711,
84
+ "rewards/soft_format_reward_func/mean": 0.0,
85
+ "rewards/soft_format_reward_func/std": 0.0,
86
+ "rewards/strict_format_reward_func/mean": 0.0,
87
+ "rewards/strict_format_reward_func/std": 0.0,
88
+ "rewards/xmlcount_reward_func/mean": 0.0,
89
+ "rewards/xmlcount_reward_func/std": 0.0,
90
  "step": 4
91
  },
92
  {
93
+ "clip_ratio/high_max": 0.0,
94
+ "clip_ratio/high_mean": 0.0,
95
+ "clip_ratio/low_mean": 0.0,
96
+ "clip_ratio/low_min": 0.0,
97
+ "clip_ratio/region_mean": 0.0,
98
+ "completions/clipped_ratio": 0.0,
99
+ "completions/max_length": 806.5,
100
+ "completions/max_terminated_length": 806.5,
101
+ "completions/mean_length": 288.1875,
102
+ "completions/mean_terminated_length": 288.1875,
103
+ "completions/min_length": 50.0,
104
+ "completions/min_terminated_length": 50.0,
105
+ "epoch": 0.2891566265060241,
106
+ "frac_reward_zero_std": 0.0,
107
+ "grad_norm": 7.433625221252441,
108
+ "kl": 0.0004048098953717272,
109
  "learning_rate": 4.472851273490984e-07,
110
+ "loss": 0.0337,
111
+ "num_tokens": 27715.0,
112
+ "reward": 0.08134639449417591,
113
+ "reward_std": 0.033843206241726875,
114
+ "rewards/concensus_correctness_reward_func/mean": 0.0,
115
+ "rewards/concensus_correctness_reward_func/std": 0.0,
116
+ "rewards/consensus_reward_func/mean": 0.0,
117
+ "rewards/consensus_reward_func/std": 0.0,
118
+ "rewards/cumulative_reward_2/mean": 0.0,
119
+ "rewards/cumulative_reward_2/std": 0.0,
120
+ "rewards/final_correctness_reward_func/mean": 0.0,
121
+ "rewards/final_correctness_reward_func/std": 0.0,
122
+ "rewards/question_recreation_reward_func/mean": 0.09159639663994312,
123
+ "rewards/question_recreation_reward_func/std": 0.03545556031167507,
124
+ "rewards/soft_format_reward_func/mean": 0.0,
125
+ "rewards/soft_format_reward_func/std": 0.0,
126
+ "rewards/strict_format_reward_func/mean": 0.0,
127
+ "rewards/strict_format_reward_func/std": 0.0,
128
+ "rewards/xmlcount_reward_func/mean": -0.010250000283122063,
129
+ "rewards/xmlcount_reward_func/std": 0.028991378843784332,
130
  "step": 6
131
  },
132
  {
133
+ "clip_ratio/high_max": 0.0,
134
+ "clip_ratio/high_mean": 0.0,
135
+ "clip_ratio/low_mean": 0.0,
136
+ "clip_ratio/low_min": 0.0,
137
+ "clip_ratio/region_mean": 0.0,
138
+ "completions/clipped_ratio": 0.0,
139
+ "completions/max_length": 443.0,
140
+ "completions/max_terminated_length": 443.0,
141
+ "completions/mean_length": 268.3125,
142
+ "completions/mean_terminated_length": 268.3125,
143
+ "completions/min_length": 39.0,
144
+ "completions/min_terminated_length": 39.0,
145
+ "epoch": 0.3855421686746988,
146
+ "frac_reward_zero_std": 0.0,
147
+ "grad_norm": 5.330082416534424,
148
+ "kl": 0.0007420043511956464,
149
  "learning_rate": 3.867370395306068e-07,
150
+ "loss": 0.0677,
151
+ "num_tokens": 36104.0,
152
+ "reward": 0.13727860897779465,
153
+ "reward_std": 0.04922554735094309,
154
+ "rewards/concensus_correctness_reward_func/mean": 0.0,
155
+ "rewards/concensus_correctness_reward_func/std": 0.0,
156
+ "rewards/consensus_reward_func/mean": 0.0,
157
+ "rewards/consensus_reward_func/std": 0.0,
158
+ "rewards/cumulative_reward_2/mean": 0.0,
159
+ "rewards/cumulative_reward_2/std": 0.0,
160
+ "rewards/final_correctness_reward_func/mean": 0.0,
161
+ "rewards/final_correctness_reward_func/std": 0.0,
162
+ "rewards/question_recreation_reward_func/mean": 0.12946611270308495,
163
+ "rewards/question_recreation_reward_func/std": 0.10384266264736652,
164
+ "rewards/soft_format_reward_func/mean": 0.0,
165
+ "rewards/soft_format_reward_func/std": 0.0,
166
+ "rewards/strict_format_reward_func/mean": 0.0,
167
+ "rewards/strict_format_reward_func/std": 0.0,
168
+ "rewards/xmlcount_reward_func/mean": 0.0078125,
169
+ "rewards/xmlcount_reward_func/std": 0.022097086533904076,
170
  "step": 8
171
  },
172
  {
173
+ "clip_ratio/high_max": 0.0,
174
+ "clip_ratio/high_mean": 0.0,
175
+ "clip_ratio/low_mean": 0.0,
176
+ "clip_ratio/low_min": 0.0,
177
+ "clip_ratio/region_mean": 0.0,
178
+ "completions/clipped_ratio": 0.1875,
179
+ "completions/max_length": 1024.0,
180
+ "completions/max_terminated_length": 889.5,
181
+ "completions/mean_length": 506.625,
182
+ "completions/mean_terminated_length": 393.5833435058594,
183
+ "completions/min_length": 29.5,
184
+ "completions/min_terminated_length": 29.5,
185
+ "epoch": 0.4819277108433735,
186
+ "frac_reward_zero_std": 0.0,
187
+ "grad_norm": 5.515347957611084,
188
+ "kl": 0.00031582830342813395,
189
  "learning_rate": 3.1137137178519977e-07,
190
+ "loss": 0.0551,
191
+ "num_tokens": 48306.0,
192
+ "reward": 0.04514514096081257,
193
+ "reward_std": 0.04106513550505042,
194
+ "rewards/concensus_correctness_reward_func/mean": 0.0,
195
+ "rewards/concensus_correctness_reward_func/std": 0.0,
196
+ "rewards/consensus_reward_func/mean": 0.0,
197
+ "rewards/consensus_reward_func/std": 0.0,
198
+ "rewards/cumulative_reward_2/mean": 0.0,
199
+ "rewards/cumulative_reward_2/std": 0.0,
200
+ "rewards/final_correctness_reward_func/mean": 0.0,
201
+ "rewards/final_correctness_reward_func/std": 0.0,
202
+ "rewards/question_recreation_reward_func/mean": 0.019582641310989857,
203
+ "rewards/question_recreation_reward_func/std": 0.013477418571710587,
204
+ "rewards/soft_format_reward_func/mean": 0.0,
205
+ "rewards/soft_format_reward_func/std": 0.0,
206
+ "rewards/strict_format_reward_func/mean": 0.0,
207
+ "rewards/strict_format_reward_func/std": 0.0,
208
+ "rewards/xmlcount_reward_func/mean": 0.025562500581145287,
209
+ "rewards/xmlcount_reward_func/std": 0.07230167090892792,
210
  "step": 10
211
  },
212
  {
213
+ "clip_ratio/high_max": 0.0,
214
+ "clip_ratio/high_mean": 0.0,
215
+ "clip_ratio/low_mean": 0.0,
216
+ "clip_ratio/low_min": 0.0,
217
+ "clip_ratio/region_mean": 0.0,
218
+ "completions/clipped_ratio": 0.0,
219
+ "completions/max_length": 685.0,
220
+ "completions/max_terminated_length": 685.0,
221
+ "completions/mean_length": 216.75,
222
+ "completions/mean_terminated_length": 216.75,
223
+ "completions/min_length": 35.5,
224
+ "completions/min_terminated_length": 35.5,
225
+ "epoch": 0.5783132530120482,
226
+ "frac_reward_zero_std": 0.125,
227
+ "grad_norm": 24.28751564025879,
228
+ "kl": 0.0019251212124800077,
229
  "learning_rate": 2.2935516363191693e-07,
230
+ "loss": 0.1978,
231
+ "num_tokens": 55870.0,
232
+ "reward": 0.1261079115793109,
233
+ "reward_std": 0.15609003114514053,
234
+ "rewards/concensus_correctness_reward_func/mean": 0.10837499797344208,
235
+ "rewards/concensus_correctness_reward_func/std": 0.30653080344200134,
236
+ "rewards/consensus_reward_func/mean": 0.0,
237
+ "rewards/consensus_reward_func/std": 0.0,
238
+ "rewards/cumulative_reward_2/mean": 0.0,
239
+ "rewards/cumulative_reward_2/std": 0.0,
240
+ "rewards/final_correctness_reward_func/mean": 0.0,
241
+ "rewards/final_correctness_reward_func/std": 0.0,
242
+ "rewards/question_recreation_reward_func/mean": 0.01773291453719139,
243
+ "rewards/question_recreation_reward_func/std": 0.008510306011885405,
244
+ "rewards/soft_format_reward_func/mean": 0.0,
245
+ "rewards/soft_format_reward_func/std": 0.0,
246
+ "rewards/strict_format_reward_func/mean": 0.0,
247
+ "rewards/strict_format_reward_func/std": 0.0,
248
+ "rewards/xmlcount_reward_func/mean": 0.0,
249
+ "rewards/xmlcount_reward_func/std": 0.0,
250
  "step": 12
251
  },
252
  {
253
+ "clip_ratio/high_max": 0.0,
254
+ "clip_ratio/high_mean": 0.0,
255
+ "clip_ratio/low_mean": 0.0,
256
+ "clip_ratio/low_min": 0.0,
257
+ "clip_ratio/region_mean": 0.0,
258
+ "completions/clipped_ratio": 0.3125,
259
+ "completions/max_length": 1024.0,
260
+ "completions/max_terminated_length": 774.5,
261
+ "completions/mean_length": 514.625,
262
+ "completions/mean_terminated_length": 292.6999969482422,
263
+ "completions/min_length": 18.0,
264
+ "completions/min_terminated_length": 18.0,
265
+ "epoch": 0.6746987951807228,
266
+ "frac_reward_zero_std": 0.0,
267
+ "grad_norm": 7.534204959869385,
268
+ "kl": 0.0008234772067226004,
269
  "learning_rate": 1.4957614383675767e-07,
270
+ "loss": 0.0264,
271
+ "num_tokens": 68200.0,
272
+ "reward": 0.15735165495425463,
273
+ "reward_std": 0.20545156858861446,
274
+ "rewards/concensus_correctness_reward_func/mean": 0.015625,
275
+ "rewards/concensus_correctness_reward_func/std": 0.04419417306780815,
276
+ "rewards/consensus_reward_func/mean": 0.125,
277
+ "rewards/consensus_reward_func/std": 0.3535533845424652,
278
+ "rewards/cumulative_reward_2/mean": 0.0,
279
+ "rewards/cumulative_reward_2/std": 0.0,
280
+ "rewards/final_correctness_reward_func/mean": 0.0,
281
+ "rewards/final_correctness_reward_func/std": 0.0,
282
+ "rewards/question_recreation_reward_func/mean": 0.016726648900657892,
283
+ "rewards/question_recreation_reward_func/std": 0.010374929523095489,
284
+ "rewards/soft_format_reward_func/mean": 0.0,
285
+ "rewards/soft_format_reward_func/std": 0.0,
286
+ "rewards/strict_format_reward_func/mean": 0.0,
287
+ "rewards/strict_format_reward_func/std": 0.0,
288
+ "rewards/xmlcount_reward_func/mean": 0.0,
289
+ "rewards/xmlcount_reward_func/std": 0.0,
290
  "step": 14
291
  },
292
  {
293
+ "clip_ratio/high_max": 0.0,
294
+ "clip_ratio/high_mean": 0.0,
295
+ "clip_ratio/low_mean": 0.0,
296
+ "clip_ratio/low_min": 0.0,
297
+ "clip_ratio/region_mean": 0.0,
298
+ "completions/clipped_ratio": 0.0625,
299
+ "completions/max_length": 749.5,
300
+ "completions/max_terminated_length": 605.5,
301
+ "completions/mean_length": 282.5,
302
+ "completions/mean_terminated_length": 231.04464721679688,
303
+ "completions/min_length": 12.5,
304
+ "completions/min_terminated_length": 12.5,
305
+ "epoch": 0.7710843373493976,
306
+ "frac_reward_zero_std": 0.0,
307
+ "grad_norm": 6.5093994140625,
308
+ "kl": 0.002072617062367499,
309
  "learning_rate": 8.067960709356478e-08,
310
+ "loss": 0.0046,
311
+ "num_tokens": 76816.0,
312
+ "reward": 0.017174751963466406,
313
+ "reward_std": 0.011310524307191372,
314
+ "rewards/concensus_correctness_reward_func/mean": 0.0,
315
+ "rewards/concensus_correctness_reward_func/std": 0.0,
316
+ "rewards/consensus_reward_func/mean": 0.0,
317
+ "rewards/consensus_reward_func/std": 0.0,
318
+ "rewards/cumulative_reward_2/mean": 0.0,
319
+ "rewards/cumulative_reward_2/std": 0.0,
320
+ "rewards/final_correctness_reward_func/mean": 0.0,
321
+ "rewards/final_correctness_reward_func/std": 0.0,
322
+ "rewards/question_recreation_reward_func/mean": 0.017174751963466406,
323
+ "rewards/question_recreation_reward_func/std": 0.011285829357802868,
324
+ "rewards/soft_format_reward_func/mean": 0.0,
325
+ "rewards/soft_format_reward_func/std": 0.0,
326
+ "rewards/strict_format_reward_func/mean": 0.0,
327
+ "rewards/strict_format_reward_func/std": 0.0,
328
+ "rewards/xmlcount_reward_func/mean": 0.0,
329
+ "rewards/xmlcount_reward_func/std": 0.0,
330
  "step": 16
331
  },
332
  {
333
+ "clip_ratio/high_max": 0.0,
334
+ "clip_ratio/high_mean": 0.0,
335
+ "clip_ratio/low_mean": 0.0,
336
+ "clip_ratio/low_min": 0.0,
337
+ "clip_ratio/region_mean": 0.0,
338
+ "completions/clipped_ratio": 0.0,
339
+ "completions/max_length": 941.5,
340
+ "completions/max_terminated_length": 941.5,
341
+ "completions/mean_length": 366.375,
342
+ "completions/mean_terminated_length": 366.375,
343
+ "completions/min_length": 4.5,
344
+ "completions/min_terminated_length": 4.5,
345
+ "epoch": 0.8674698795180723,
346
+ "frac_reward_zero_std": 0.0,
347
+ "grad_norm": 4.666317939758301,
348
+ "kl": 0.0007759865002299193,
349
  "learning_rate": 3.013156219837776e-08,
350
+ "loss": 0.1212,
351
+ "num_tokens": 86774.0,
352
+ "reward": -0.03930019214749336,
353
+ "reward_std": 0.15650326944887638,
354
+ "rewards/concensus_correctness_reward_func/mean": 0.0,
355
+ "rewards/concensus_correctness_reward_func/std": 0.0,
356
+ "rewards/consensus_reward_func/mean": 0.0,
357
+ "rewards/consensus_reward_func/std": 0.0,
358
+ "rewards/cumulative_reward_2/mean": 0.0,
359
+ "rewards/cumulative_reward_2/std": 0.0,
360
+ "rewards/final_correctness_reward_func/mean": 0.0,
361
+ "rewards/final_correctness_reward_func/std": 0.0,
362
+ "rewards/question_recreation_reward_func/mean": 0.05757479928433895,
363
+ "rewards/question_recreation_reward_func/std": 0.03582485252991319,
364
+ "rewards/soft_format_reward_func/mean": 0.0,
365
+ "rewards/soft_format_reward_func/std": 0.0,
366
+ "rewards/strict_format_reward_func/mean": 0.0,
367
+ "rewards/strict_format_reward_func/std": 0.0,
368
+ "rewards/xmlcount_reward_func/mean": -0.09687499701976776,
369
+ "rewards/xmlcount_reward_func/std": 0.27400386333465576,
370
  "step": 18
371
  },
372
  {
373
+ "clip_ratio/high_max": 0.0,
374
+ "clip_ratio/high_mean": 0.0,
375
+ "clip_ratio/low_mean": 0.0,
376
+ "clip_ratio/low_min": 0.0,
377
+ "clip_ratio/region_mean": 0.0,
378
+ "completions/clipped_ratio": 0.1875,
379
+ "completions/max_length": 928.0,
380
+ "completions/max_terminated_length": 814.0,
381
+ "completions/mean_length": 394.0625,
382
+ "completions/mean_terminated_length": 260.71250915527344,
383
+ "completions/min_length": 4.5,
384
+ "completions/min_terminated_length": 4.5,
385
+ "epoch": 0.963855421686747,
386
+ "frac_reward_zero_std": 0.0,
387
+ "grad_norm": 11.774459838867188,
388
+ "kl": 0.001760888408171013,
389
  "learning_rate": 3.4096741493194193e-09,
390
+ "loss": -0.0662,
391
+ "num_tokens": 97175.0,
392
+ "reward": 0.017644216306507587,
393
+ "reward_std": 0.008533301530405879,
394
+ "rewards/concensus_correctness_reward_func/mean": 0.0,
395
+ "rewards/concensus_correctness_reward_func/std": 0.0,
396
+ "rewards/consensus_reward_func/mean": 0.0,
397
+ "rewards/consensus_reward_func/std": 0.0,
398
+ "rewards/cumulative_reward_2/mean": 0.0,
399
+ "rewards/cumulative_reward_2/std": 0.0,
400
+ "rewards/final_correctness_reward_func/mean": 0.0,
401
+ "rewards/final_correctness_reward_func/std": 0.0,
402
+ "rewards/question_recreation_reward_func/mean": 0.017644216306507587,
403
+ "rewards/question_recreation_reward_func/std": 0.014659160049632192,
404
+ "rewards/soft_format_reward_func/mean": 0.0,
405
+ "rewards/soft_format_reward_func/std": 0.0,
406
+ "rewards/strict_format_reward_func/mean": 0.0,
407
+ "rewards/strict_format_reward_func/std": 0.0,
408
+ "rewards/xmlcount_reward_func/mean": 0.0,
409
+ "rewards/xmlcount_reward_func/std": 0.0,
410
  "step": 20
411
  },
412
  {
413
+ "epoch": 0.963855421686747,
414
  "step": 20,
415
  "total_flos": 0.0,
416
+ "train_loss": 0.03613492660224438,
417
+ "train_runtime": 1374.3618,
418
+ "train_samples_per_second": 0.116,
419
+ "train_steps_per_second": 0.015
420
  }
421
  ],
422
  "logging_steps": 2,
423
  "max_steps": 20,
424
+ "num_input_tokens_seen": 97175,
425
+ "num_train_epochs": 1,
426
  "save_steps": 25,
427
  "stateful_callbacks": {
428
  "TrainerControl": {
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:15a35ed0dc1b427b3d92ea468297276e7e5514301a2a4c403cab4aa16463d2e6
3
- size 5944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6bc2740e757e02dffaef9c67456e365b5d93e3b713615b66406fede2cc76013c
3
+ size 6929