Upload folder using huggingface_hub

Browse files

Files changed (16) hide show

.gitattributes +1 -0
README.md +210 -0
adapter_config.json +42 -0
adapter_model.safetensors +3 -0
added_tokens.json +28 -0
chat_template.jinja +86 -0
merges.txt +0 -0
optimizer.pt +3 -0
rng_state.pth +3 -0
scheduler.pt +3 -0
special_tokens_map.json +31 -0
tokenizer.json +3 -0
tokenizer_config.json +240 -0
trainer_state.json +2234 -0
training_args.bin +3 -0
vocab.json +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,210 @@

+---
+base_model: unsloth/Qwen3-4B-Instruct-2507
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/Qwen3-4B-Instruct-2507
+- grpo
+- lora
+- transformers
+- trl
+- unsloth
+---
+# Model Card for a GRPO-trained LoRA adapter of unsloth/Qwen3-4B-Instruct-2507
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** LoRA adapter (PEFT) for causal language modeling
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** unsloth/Qwen3-4B-Instruct-2507
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases, and limitations of the model. More information is needed before further recommendations can be made.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.17.0

adapter_config.json ADDED Viewed

	@@ -0,0 +1,42 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "unsloth/Qwen3-4B-Instruct-2507",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 1024,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "qalora_group_size": 16,
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "down_proj",
+    "gate_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bfaf48dd6ae2245f0352dec7342fa4979499f63d12753260746ae9239096e965
+size 264308896

added_tokens.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "</think>": 151668,
+  "</tool_call>": 151658,
+  "</tool_response>": 151666,
+  "<think>": 151667,
+  "<tool_call>": 151657,
+  "<tool_response>": 151665,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

chat_template.jinja ADDED Viewed

	@@ -0,0 +1,86 @@

+{#- Chat template: renders `messages` (plus optional `tools`) as a sequence of <|im_start|>role ... <|im_end|> turns. -#}
+{#- With tools: open a system turn holding the leading system message (if any), each tool serialized as JSON inside <tools></tools>, and the tool-call format instructions. -#}
+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{#- Without tools: a leading system message, if present, becomes its own system turn; otherwise nothing is emitted here. -#}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{#- Walk the messages from last to first and record the index of the last user message whose content is a string not wrapped in <tool_response>...</tool_response>. -#}
+{#- multi_step_tool is cleared on the first match so earlier user messages do not overwrite it; if nothing matches, last_query_index stays at the final message index. -#}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {#- Convert the position in the reversed list back to the index in the original list. -#}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{#- Main pass: render every message in order. -#}
+{%- for message in messages %}
+    {#- Non-string content is rendered as an empty string. -#}
+    {%- if message.content is string %}
+        {%- set content = message.content %}
+    {%- else %}
+        {%- set content = '' %}
+    {%- endif %}
+    {#- User messages, and system messages other than the first message, are emitted as plain turns; a first-position system message matches no branch here because it was already handled above. -#}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {#- Reasoning comes from message.reasoning_content when it is a string; otherwise it is split out of content around the </think> marker and content keeps only the text after it. -#}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is string %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in content %}
+                {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+                {%- set content = content.split('</think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {#- Assistant turns after the last query render a <think> block when they are the final message or carry non-empty reasoning; all other assistant turns render content only. -#}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {#- Each tool call is written as a JSON object inside <tool_call></tool_call> tags. -#}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {#- Emit a newline separator before every call except a first call that follows empty content. -#}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {#- Accept both a call wrapped in a `function` field and a flat call object. -#}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {#- String arguments are emitted verbatim; anything else is serialized with tojson. -#}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {#- A run of consecutive tool messages shares one user turn: the turn is opened at the first message of the run and closed at the last, with each result wrapped in <tool_response> tags. -#}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{#- When a generation prompt is requested, open an assistant turn for the model to continue. -#}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:88d0a3dd406909af1bf921e0d96c3b503f5a268acc1d29d503dc3f8d9b8bd983
+size 134777605

rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:61ce686ad42c944542a1fda9a3e81863ee7f6571f45c76d92dd8fd4703c04403
+size 14645

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:eed58a28305544c69e5eb98978132b6b7061f7d95ff30e1cbe45a25350f8e2ac
+size 1465

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|vision_pad|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:132c0fb88b2070b782a69e8833d01ab987b1198ec606df151512d91820abb758
+size 11422822

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,240 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151665": {
+      "content": "<tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151666": {
+      "content": "</tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151667": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151668": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 262144,
+  "pad_token": "<|vision_pad|>",
+  "padding_side": "right",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,2234 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 100,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "completion_length": 415.125,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 444.0,
+      "completions/mean_length": 415.125,
+      "completions/mean_terminated_length": 318.25,
+      "completions/min_length": 271.0,
+      "completions/min_terminated_length": 271.0,
+      "epoch": 0.01,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 16.844226837158203,
+      "kl": 0.0,
+      "learning_rate": 0.0,
+      "loss": -0.0,
+      "num_tokens": 3841.0,
+      "reward": 0.36250001192092896,
+      "reward_std": 0.42499998211860657,
+      "rewards/compute_privacy_reward/mean": 0.36250001192092896,
+      "rewards/compute_privacy_reward/std": 0.4172614812850952,
+      "step": 1
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.02,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 10.855765342712402,
+      "kl": 0.0,
+      "learning_rate": 1.5000000000000002e-07,
+      "loss": -0.0,
+      "num_tokens": 8457.0,
+      "reward": 0.737500011920929,
+      "reward_std": 0.17499998211860657,
+      "rewards/compute_privacy_reward/mean": 0.737500011920929,
+      "rewards/compute_privacy_reward/std": 0.36228442192077637,
+      "step": 2
+    },
+    {
+      "completion_length": 498.625,
+      "completions/clipped_ratio": 0.875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 405.0,
+      "completions/mean_length": 498.625,
+      "completions/mean_terminated_length": 405.0,
+      "completions/min_length": 405.0,
+      "completions/min_terminated_length": 405.0,
+      "epoch": 0.03,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 10.087930679321289,
+      "kl": 0.0012364824069663882,
+      "learning_rate": 3.0000000000000004e-07,
+      "loss": 0.0,
+      "num_tokens": 12966.0,
+      "reward": 0.824999988079071,
+      "reward_std": 0.20207259058952332,
+      "rewards/compute_privacy_reward/mean": 0.824999988079071,
+      "rewards/compute_privacy_reward/std": 0.32403701543807983,
+      "step": 3
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.04,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 13.167259216308594,
+      "kl": 0.0012778548407368362,
+      "learning_rate": 4.5e-07,
+      "loss": 0.0,
+      "num_tokens": 17582.0,
+      "reward": 0.9375,
+      "reward_std": 0.125,
+      "rewards/compute_privacy_reward/mean": 0.9375,
+      "rewards/compute_privacy_reward/std": 0.1767766922712326,
+      "step": 4
+    },
+    {
+      "completion_length": 477.125,
+      "completions/clipped_ratio": 0.875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 233.0,
+      "completions/mean_length": 477.125,
+      "completions/mean_terminated_length": 233.0,
+      "completions/min_length": 233.0,
+      "completions/min_terminated_length": 233.0,
+      "epoch": 0.05,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.007137409411370754,
+      "kl": 0.0012993840500712395,
+      "learning_rate": 6.000000000000001e-07,
+      "loss": 0.0,
+      "num_tokens": 21919.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 5
+    },
+    {
+      "completion_length": 502.875,
+      "completions/clipped_ratio": 0.875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 439.0,
+      "completions/mean_length": 502.875,
+      "completions/mean_terminated_length": 439.0,
+      "completions/min_length": 439.0,
+      "completions/min_terminated_length": 439.0,
+      "epoch": 0.06,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 15.382135391235352,
+      "kl": 0.0014336315216496587,
+      "learning_rate": 7.5e-07,
+      "loss": 0.0,
+      "num_tokens": 26462.0,
+      "reward": 0.8500000238418579,
+      "reward_std": 0.24716877937316895,
+      "rewards/compute_privacy_reward/mean": 0.8500000238418579,
+      "rewards/compute_privacy_reward/std": 0.24928468465805054,
+      "step": 6
+    },
+    {
+      "completion_length": 492.625,
+      "completions/clipped_ratio": 0.875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 357.0,
+      "completions/mean_length": 492.625,
+      "completions/mean_terminated_length": 357.0,
+      "completions/min_length": 357.0,
+      "completions/min_terminated_length": 357.0,
+      "epoch": 0.07,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 9.826014518737793,
+      "kl": 0.0017287725349888206,
+      "learning_rate": 9e-07,
+      "loss": 0.0,
+      "num_tokens": 30923.0,
+      "reward": 0.737500011920929,
+      "reward_std": 0.17499998211860657,
+      "rewards/compute_privacy_reward/mean": 0.737500011920929,
+      "rewards/compute_privacy_reward/std": 0.36228442192077637,
+      "step": 7
+    },
+    {
+      "completion_length": 467.625,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 410.0,
+      "completions/mean_length": 467.625,
+      "completions/mean_terminated_length": 334.5,
+      "completions/min_length": 259.0,
+      "completions/min_terminated_length": 259.0,
+      "epoch": 0.08,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 11.862555503845215,
+      "kl": 0.002132528112269938,
+      "learning_rate": 1.05e-06,
+      "loss": 0.0,
+      "num_tokens": 35184.0,
+      "reward": 0.9125000238418579,
+      "reward_std": 0.17499999701976776,
+      "rewards/compute_privacy_reward/mean": 0.9125000238418579,
+      "rewards/compute_privacy_reward/std": 0.2474873811006546,
+      "step": 8
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.09,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.010152203030884266,
+      "kl": 0.0027272438164800406,
+      "learning_rate": 1.2000000000000002e-06,
+      "loss": 0.0,
+      "num_tokens": 39800.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 9
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.1,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 11.472397804260254,
+      "kl": 0.0036284179659560323,
+      "learning_rate": 1.35e-06,
+      "loss": 0.0,
+      "num_tokens": 44416.0,
+      "reward": 0.7875000238418579,
+      "reward_std": 0.2528998553752899,
+      "rewards/compute_privacy_reward/mean": 0.7875000238418579,
+      "rewards/compute_privacy_reward/std": 0.401559442281723,
+      "step": 10
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.11,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 13.772138595581055,
+      "kl": 0.0046011891681700945,
+      "learning_rate": 1.5e-06,
+      "loss": 0.0,
+      "num_tokens": 49032.0,
+      "reward": 0.699999988079071,
+      "reward_std": 0.4520725905895233,
+      "rewards/compute_privacy_reward/mean": 0.699999988079071,
+      "rewards/compute_privacy_reward/std": 0.4242640733718872,
+      "step": 11
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.12,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 17.931562423706055,
+      "kl": 0.012186857871711254,
+      "learning_rate": 1.65e-06,
+      "loss": 0.0,
+      "num_tokens": 53648.0,
+      "reward": 0.6499999761581421,
+      "reward_std": 0.40414518117904663,
+      "rewards/compute_privacy_reward/mean": 0.6499999761581421,
+      "rewards/compute_privacy_reward/std": 0.37416574358940125,
+      "step": 12
+    },
+    {
+      "completion_length": 473.0,
+      "completions/clipped_ratio": 0.875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 200.0,
+      "completions/mean_length": 473.0,
+      "completions/mean_terminated_length": 200.0,
+      "completions/min_length": 200.0,
+      "completions/min_terminated_length": 200.0,
+      "epoch": 0.13,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 14.103535652160645,
+      "kl": 0.007157353218644857,
+      "learning_rate": 1.8e-06,
+      "loss": 0.0,
+      "num_tokens": 57952.0,
+      "reward": 0.8458333015441895,
+      "reward_std": 0.3083333373069763,
+      "rewards/compute_privacy_reward/mean": 0.8458333015441895,
+      "rewards/compute_privacy_reward/std": 0.28891560435295105,
+      "step": 13
+    },
+    {
+      "completion_length": 454.75,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 360.0,
+      "completions/mean_length": 454.75,
+      "completions/mean_terminated_length": 283.0,
+      "completions/min_length": 206.0,
+      "completions/min_terminated_length": 206.0,
+      "epoch": 0.14,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 9.746970176696777,
+      "kl": 0.010588065255433321,
+      "learning_rate": 1.95e-06,
+      "loss": 0.0,
+      "num_tokens": 62110.0,
+      "reward": 0.9125000238418579,
+      "reward_std": 0.17499999701976776,
+      "rewards/compute_privacy_reward/mean": 0.9125000238418579,
+      "rewards/compute_privacy_reward/std": 0.2474873811006546,
+      "step": 14
+    },
+    {
+      "completion_length": 464.25,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 372.0,
+      "completions/mean_length": 464.25,
+      "completions/mean_terminated_length": 321.0,
+      "completions/min_length": 270.0,
+      "completions/min_terminated_length": 270.0,
+      "epoch": 0.15,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.005599740892648697,
+      "kl": 0.009548019152134657,
+      "learning_rate": 2.1e-06,
+      "loss": 0.0,
+      "num_tokens": 66344.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 15
+    },
+    {
+      "completion_length": 487.25,
+      "completions/clipped_ratio": 0.875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 314.0,
+      "completions/mean_length": 487.25,
+      "completions/mean_terminated_length": 314.0,
+      "completions/min_length": 314.0,
+      "completions/min_terminated_length": 314.0,
+      "epoch": 0.16,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 15.274740219116211,
+      "kl": 0.01842383248731494,
+      "learning_rate": 2.25e-06,
+      "loss": 0.0,
+      "num_tokens": 70762.0,
+      "reward": 0.875,
+      "reward_std": 0.14433756470680237,
+      "rewards/compute_privacy_reward/mean": 0.875,
+      "rewards/compute_privacy_reward/std": 0.2314550280570984,
+      "step": 16
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.17,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.009588563814759254,
+      "kl": 0.01250272523611784,
+      "learning_rate": 2.4000000000000003e-06,
+      "loss": 0.0,
+      "num_tokens": 75378.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 17
+    },
+    {
+      "completion_length": 486.875,
+      "completions/clipped_ratio": 0.875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 311.0,
+      "completions/mean_length": 486.875,
+      "completions/mean_terminated_length": 311.0,
+      "completions/min_length": 311.0,
+      "completions/min_terminated_length": 311.0,
+      "epoch": 0.18,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.014890752732753754,
+      "kl": 0.028752848505973816,
+      "learning_rate": 2.55e-06,
+      "loss": 0.0,
+      "num_tokens": 79793.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 18
+    },
+    {
+      "completion_length": 501.625,
+      "completions/clipped_ratio": 0.875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 429.0,
+      "completions/mean_length": 501.625,
+      "completions/mean_terminated_length": 429.0,
+      "completions/min_length": 429.0,
+      "completions/min_terminated_length": 429.0,
+      "epoch": 0.19,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 9.796663284301758,
+      "kl": 0.02753235213458538,
+      "learning_rate": 2.7e-06,
+      "loss": 0.0,
+      "num_tokens": 84326.0,
+      "reward": 0.9375,
+      "reward_std": 0.125,
+      "rewards/compute_privacy_reward/mean": 0.9375,
+      "rewards/compute_privacy_reward/std": 0.1767766922712326,
+      "step": 19
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.2,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.018745651468634605,
+      "kl": 0.032726893201470375,
+      "learning_rate": 2.85e-06,
+      "loss": 0.0,
+      "num_tokens": 88942.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 20
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.21,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 13.171895027160645,
+      "kl": 0.03730332292616367,
+      "learning_rate": 3e-06,
+      "loss": 0.0,
+      "num_tokens": 93558.0,
+      "reward": 0.8500000238418579,
+      "reward_std": 0.30000001192092896,
+      "rewards/compute_privacy_reward/mean": 0.8500000238418579,
+      "rewards/compute_privacy_reward/std": 0.2828426957130432,
+      "step": 21
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.22,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.01750584878027439,
+      "kl": 0.034751106053590775,
+      "learning_rate": 2.999771542734587e-06,
+      "loss": 0.0,
+      "num_tokens": 98174.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 22
+    },
+    {
+      "completion_length": 485.125,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 406.0,
+      "completions/mean_length": 485.125,
+      "completions/mean_terminated_length": 404.5,
+      "completions/min_length": 403.0,
+      "completions/min_terminated_length": 403.0,
+      "epoch": 0.23,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.039246831089258194,
+      "kl": 0.04311051033437252,
+      "learning_rate": 2.9990862405286437e-06,
+      "loss": 0.0,
+      "num_tokens": 102575.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 23
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.24,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.015374306589365005,
+      "kl": 0.024610635824501514,
+      "learning_rate": 2.9979443021318607e-06,
+      "loss": 0.0,
+      "num_tokens": 107191.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 24
+    },
+    {
+      "completion_length": 487.375,
+      "completions/clipped_ratio": 0.875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 315.0,
+      "completions/mean_length": 487.375,
+      "completions/mean_terminated_length": 315.0,
+      "completions/min_length": 315.0,
+      "completions/min_terminated_length": 315.0,
+      "epoch": 0.25,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 14.616058349609375,
+      "kl": 0.03151147440075874,
+      "learning_rate": 2.9963460753897363e-06,
+      "loss": 0.0,
+      "num_tokens": 111610.0,
+      "reward": 0.9125000238418579,
+      "reward_std": 0.17499999701976776,
+      "rewards/compute_privacy_reward/mean": 0.9125000238418579,
+      "rewards/compute_privacy_reward/std": 0.2474873811006546,
+      "step": 25
+    },
+    {
+      "completion_length": 500.0,
+      "completions/clipped_ratio": 0.875,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 416.0,
+      "completions/mean_length": 500.0,
+      "completions/mean_terminated_length": 416.0,
+      "completions/min_length": 416.0,
+      "completions/min_terminated_length": 416.0,
+      "epoch": 0.26,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.04938950762152672,
+      "kl": 0.0381797980517149,
+      "learning_rate": 2.994292047137618e-06,
+      "loss": 0.0,
+      "num_tokens": 116130.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 26
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.27,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 10.795992851257324,
+      "kl": 0.021807033568620682,
+      "learning_rate": 2.99178284305241e-06,
+      "loss": 0.0,
+      "num_tokens": 120746.0,
+      "reward": 0.875,
+      "reward_std": 0.25,
+      "rewards/compute_privacy_reward/mean": 0.875,
+      "rewards/compute_privacy_reward/std": 0.3535533845424652,
+      "step": 27
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.28,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.01516253687441349,
+      "kl": 0.02713087946176529,
+      "learning_rate": 2.9888192274619833e-06,
+      "loss": 0.0,
+      "num_tokens": 125362.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 28
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.29,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.010867624543607235,
+      "kl": 0.0178441577591002,
+      "learning_rate": 2.9854021031123555e-06,
+      "loss": 0.0,
+      "num_tokens": 129978.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 29
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.3,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 14.238554954528809,
+      "kl": 0.029790643602609634,
+      "learning_rate": 2.981532510892707e-06,
+      "loss": 0.0,
+      "num_tokens": 134594.0,
+      "reward": 0.824999988079071,
+      "reward_std": 0.3499999940395355,
+      "rewards/compute_privacy_reward/mean": 0.824999988079071,
+      "rewards/compute_privacy_reward/std": 0.32403701543807983,
+      "step": 30
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.31,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 15.67749309539795,
+      "kl": 0.03798237256705761,
+      "learning_rate": 2.9772116295183124e-06,
+      "loss": 0.0,
+      "num_tokens": 139210.0,
+      "reward": 0.9125000238418579,
+      "reward_std": 0.17499999701976776,
+      "rewards/compute_privacy_reward/mean": 0.9125000238418579,
+      "rewards/compute_privacy_reward/std": 0.2474873811006546,
+      "step": 31
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.32,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.018243158236145973,
+      "kl": 0.04286060109734535,
+      "learning_rate": 2.972440775171496e-06,
+      "loss": 0.0,
+      "num_tokens": 143826.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 32
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.33,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 7.639444351196289,
+      "kl": 0.03161488939076662,
+      "learning_rate": 2.9672214011007086e-06,
+      "loss": 0.0,
+      "num_tokens": 148442.0,
+      "reward": 0.9125000238418579,
+      "reward_std": 0.17499999701976776,
+      "rewards/compute_privacy_reward/mean": 0.9125000238418579,
+      "rewards/compute_privacy_reward/std": 0.2474873811006546,
+      "step": 33
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.34,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.025024209171533585,
+      "kl": 0.046919526532292366,
+      "learning_rate": 2.961555097177853e-06,
+      "loss": 0.0,
+      "num_tokens": 153058.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 34
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.35,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 10.07824420928955,
+      "kl": 0.05792512744665146,
+      "learning_rate": 2.9554435894139947e-06,
+      "loss": 0.0001,
+      "num_tokens": 157674.0,
+      "reward": 0.9375,
+      "reward_std": 0.125,
+      "rewards/compute_privacy_reward/mean": 0.9375,
+      "rewards/compute_privacy_reward/std": 0.1767766922712326,
+      "step": 35
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.36,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.023909006267786026,
+      "kl": 0.0673011764883995,
+      "learning_rate": 2.9488887394336023e-06,
+      "loss": 0.0001,
+      "num_tokens": 162290.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 36
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.37,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.03504253923892975,
+      "kl": 0.06536441668868065,
+      "learning_rate": 2.9418925439074784e-06,
+      "loss": 0.0001,
+      "num_tokens": 166906.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 37
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.38,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 12.266355514526367,
+      "kl": 0.0688310619443655,
+      "learning_rate": 2.9344571339445534e-06,
+      "loss": 0.0001,
+      "num_tokens": 171522.0,
+      "reward": 0.875,
+      "reward_std": 0.14433756470680237,
+      "rewards/compute_privacy_reward/mean": 0.875,
+      "rewards/compute_privacy_reward/std": 0.2314550280570984,
+      "step": 38
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.39,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.02357407845556736,
+      "kl": 0.06263950653374195,
+      "learning_rate": 2.9265847744427307e-06,
+      "loss": 0.0001,
+      "num_tokens": 176138.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 39
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.4,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 9.83141040802002,
+      "kl": 0.06703359261155128,
+      "learning_rate": 2.9182778633989753e-06,
+      "loss": 0.0001,
+      "num_tokens": 180754.0,
+      "reward": 0.9125000238418579,
+      "reward_std": 0.17499999701976776,
+      "rewards/compute_privacy_reward/mean": 0.9125000238418579,
+      "rewards/compute_privacy_reward/std": 0.2474873811006546,
+      "step": 40
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.41,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 8.036130905151367,
+      "kl": 0.051177822053432465,
+      "learning_rate": 2.9095389311788626e-06,
+      "loss": 0.0001,
+      "num_tokens": 185370.0,
+      "reward": 0.9375,
+      "reward_std": 0.125,
+      "rewards/compute_privacy_reward/mean": 0.9375,
+      "rewards/compute_privacy_reward/std": 0.1767766922712326,
+      "step": 41
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.42,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.018833357840776443,
+      "kl": 0.06767978891730309,
+      "learning_rate": 2.9003706397458025e-06,
+      "loss": 0.0001,
+      "num_tokens": 189986.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 42
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.43,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.02449095994234085,
+      "kl": 0.05824752897024155,
+      "learning_rate": 2.8907757818501814e-06,
+      "loss": 0.0001,
+      "num_tokens": 194602.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 43
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.44,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.016382554545998573,
+      "kl": 0.05880605801939964,
+      "learning_rate": 2.880757280178661e-06,
+      "loss": 0.0001,
+      "num_tokens": 199218.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 44
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.45,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.021268218755722046,
+      "kl": 0.051731258630752563,
+      "learning_rate": 2.8703181864639013e-06,
+      "loss": 0.0001,
+      "num_tokens": 203834.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 45
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.46,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 7.906425476074219,
+      "kl": 0.04663049802184105,
+      "learning_rate": 2.859461680554975e-06,
+      "loss": 0.0,
+      "num_tokens": 208450.0,
+      "reward": 0.824999988079071,
+      "reward_std": 0.20207259058952332,
+      "rewards/compute_privacy_reward/mean": 0.824999988079071,
+      "rewards/compute_privacy_reward/std": 0.32403701543807983,
+      "step": 46
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.47,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 10.449911117553711,
+      "kl": 0.0463445782661438,
+      "learning_rate": 2.8481910694487506e-06,
+      "loss": 0.0,
+      "num_tokens": 213066.0,
+      "reward": 0.9375,
+      "reward_std": 0.125,
+      "rewards/compute_privacy_reward/mean": 0.9375,
+      "rewards/compute_privacy_reward/std": 0.1767766922712326,
+      "step": 47
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.48,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 8.364709854125977,
+      "kl": 0.04474816285073757,
+      "learning_rate": 2.8365097862825516e-06,
+      "loss": 0.0,
+      "num_tokens": 217682.0,
+      "reward": 0.9791666269302368,
+      "reward_std": 0.041666675359010696,
+      "rewards/compute_privacy_reward/mean": 0.9791666269302368,
+      "rewards/compute_privacy_reward/std": 0.0589255727827549,
+      "step": 48
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.49,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.011722804978489876,
+      "kl": 0.033881235867738724,
+      "learning_rate": 2.8244213892883906e-06,
+      "loss": 0.0,
+      "num_tokens": 222298.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 49
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.5,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.011438421905040741,
+      "kl": 0.04947184585034847,
+      "learning_rate": 2.811929560709094e-06,
+      "loss": 0.0,
+      "num_tokens": 226914.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 50
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.51,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.009610641747713089,
+      "kl": 0.04447856545448303,
+      "learning_rate": 2.7990381056766585e-06,
+      "loss": 0.0,
+      "num_tokens": 231530.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 51
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.52,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.012405481189489365,
+      "kl": 0.04076301120221615,
+      "learning_rate": 2.7857509510531684e-06,
+      "loss": 0.0,
+      "num_tokens": 236146.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 52
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.53,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.012948819436132908,
+      "kl": 0.036498142406344414,
+      "learning_rate": 2.772072144234639e-06,
+      "loss": 0.0,
+      "num_tokens": 240762.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 53
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.54,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.013138208538293839,
+      "kl": 0.04784107208251953,
+      "learning_rate": 2.758005851918136e-06,
+      "loss": 0.0,
+      "num_tokens": 245378.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 54
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.55,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.01497066393494606,
+      "kl": 0.04166642762720585,
+      "learning_rate": 2.7435563588325624e-06,
+      "loss": 0.0,
+      "num_tokens": 249994.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 55
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.56,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.013515869155526161,
+      "kl": 0.04080471396446228,
+      "learning_rate": 2.728728066433488e-06,
+      "loss": 0.0,
+      "num_tokens": 254610.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 56
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.57,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.013806214556097984,
+      "kl": 0.04461108334362507,
+      "learning_rate": 2.713525491562421e-06,
+      "loss": 0.0,
+      "num_tokens": 259226.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 57
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.58,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 35.78581237792969,
+      "kl": 0.4047622624784708,
+      "learning_rate": 2.6979532650709395e-06,
+      "loss": 0.0004,
+      "num_tokens": 263842.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 58
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.59,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 11.663408279418945,
+      "kl": 0.03375354781746864,
+      "learning_rate": 2.6820161304100827e-06,
+      "loss": 0.0,
+      "num_tokens": 268458.0,
+      "reward": 0.8500000238418579,
+      "reward_std": 0.30000001192092896,
+      "rewards/compute_privacy_reward/mean": 0.8500000238418579,
+      "rewards/compute_privacy_reward/std": 0.2828426957130432,
+      "step": 59
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.6,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 9.63570499420166,
+      "kl": 0.03328128904104233,
+      "learning_rate": 2.6657189421854562e-06,
+      "loss": 0.0,
+      "num_tokens": 273074.0,
+      "reward": 0.9125000238418579,
+      "reward_std": 0.17499999701976776,
+      "rewards/compute_privacy_reward/mean": 0.9125000238418579,
+      "rewards/compute_privacy_reward/std": 0.2474873811006546,
+      "step": 60
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.61,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 8.041156768798828,
+      "kl": 0.04519444517791271,
+      "learning_rate": 2.649066664678467e-06,
+      "loss": 0.0,
+      "num_tokens": 277690.0,
+      "reward": 0.8125,
+      "reward_std": 0.125,
+      "rewards/compute_privacy_reward/mean": 0.8125,
+      "rewards/compute_privacy_reward/std": 0.25877460837364197,
+      "step": 61
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.62,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 8.975259780883789,
+      "kl": 0.041264547035098076,
+      "learning_rate": 2.632064370334158e-06,
+      "loss": 0.0,
+      "num_tokens": 282306.0,
+      "reward": 0.875,
+      "reward_std": 0.14433756470680237,
+      "rewards/compute_privacy_reward/mean": 0.875,
+      "rewards/compute_privacy_reward/std": 0.2314550280570984,
+      "step": 62
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.63,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.01326824352145195,
+      "kl": 0.04475162923336029,
+      "learning_rate": 2.6147172382160914e-06,
+      "loss": 0.0,
+      "num_tokens": 286922.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 63
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.64,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.017783720046281815,
+      "kl": 0.04413033276796341,
+      "learning_rate": 2.597030552428756e-06,
+      "loss": 0.0,
+      "num_tokens": 291538.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 64
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.65,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.012393958866596222,
+      "kl": 0.04510576277971268,
+      "learning_rate": 2.5790097005079765e-06,
+      "loss": 0.0,
+      "num_tokens": 296154.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 65
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.66,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.010504755191504955,
+      "kl": 0.04213905707001686,
+      "learning_rate": 2.5606601717798212e-06,
+      "loss": 0.0,
+      "num_tokens": 300770.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 66
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.67,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.012795431539416313,
+      "kl": 0.04897330328822136,
+      "learning_rate": 2.5419875556884957e-06,
+      "loss": 0.0,
+      "num_tokens": 305386.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 67
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.68,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.010495081543922424,
+      "kl": 0.04848748818039894,
+      "learning_rate": 2.522997540093748e-06,
+      "loss": 0.0,
+      "num_tokens": 310002.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 68
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.69,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 6.685613632202148,
+      "kl": 0.05607937276363373,
+      "learning_rate": 2.5036959095382875e-06,
+      "loss": 0.0001,
+      "num_tokens": 314618.0,
+      "reward": 0.9375,
+      "reward_std": 0.125,
+      "rewards/compute_privacy_reward/mean": 0.9375,
+      "rewards/compute_privacy_reward/std": 0.1767766922712326,
+      "step": 69
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.7,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 10.363982200622559,
+      "kl": 0.04981439188122749,
+      "learning_rate": 2.484088543485761e-06,
+      "loss": 0.0,
+      "num_tokens": 319234.0,
+      "reward": 0.9125000238418579,
+      "reward_std": 0.17499999701976776,
+      "rewards/compute_privacy_reward/mean": 0.9125000238418579,
+      "rewards/compute_privacy_reward/std": 0.2474873811006546,
+      "step": 70
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.71,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 12.277480125427246,
+      "kl": 0.0549851655960083,
+      "learning_rate": 2.464181414529809e-06,
+      "loss": 0.0001,
+      "num_tokens": 323850.0,
+      "reward": 0.8125,
+      "reward_std": 0.26933756470680237,
+      "rewards/compute_privacy_reward/mean": 0.8125,
+      "rewards/compute_privacy_reward/std": 0.25877460837364197,
+      "step": 71
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.72,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.012248357757925987,
+      "kl": 0.05290329270064831,
+      "learning_rate": 2.4439805865747562e-06,
+      "loss": 0.0001,
+      "num_tokens": 328466.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 72
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.73,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.011897348798811436,
+      "kl": 0.048916060477495193,
+      "learning_rate": 2.4234922129884873e-06,
+      "loss": 0.0,
+      "num_tokens": 333082.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 73
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.74,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.00852968916296959,
+      "kl": 0.044865330681204796,
+      "learning_rate": 2.4027225347280728e-06,
+      "loss": 0.0,
+      "num_tokens": 337698.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 74
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.75,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.018844276666641235,
+      "kl": 0.07143920287489891,
+      "learning_rate": 2.3816778784387097e-06,
+      "loss": 0.0001,
+      "num_tokens": 342314.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 75
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.76,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.04624907672405243,
+      "kl": 0.04331217147409916,
+      "learning_rate": 2.3603646545265692e-06,
+      "loss": 0.0,
+      "num_tokens": 346930.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 76
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.77,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 7.180686950683594,
+      "kl": 0.05896450765430927,
+      "learning_rate": 2.3387893552061204e-06,
+      "loss": 0.0001,
+      "num_tokens": 351546.0,
+      "reward": 0.875,
+      "reward_std": 0.14433756470680237,
+      "rewards/compute_privacy_reward/mean": 0.875,
+      "rewards/compute_privacy_reward/std": 0.2314550280570984,
+      "step": 77
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.78,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.011330473236739635,
+      "kl": 0.03922133147716522,
+      "learning_rate": 2.316958552522541e-06,
+      "loss": 0.0,
+      "num_tokens": 356162.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 78
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.79,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.01726508140563965,
+      "kl": 0.07372071966528893,
+      "learning_rate": 2.2948788963498076e-06,
+      "loss": 0.0001,
+      "num_tokens": 360778.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 79
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.8,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.010915080085396767,
+      "kl": 0.037650689482688904,
+      "learning_rate": 2.2725571123650813e-06,
+      "loss": 0.0,
+      "num_tokens": 365394.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 80
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.81,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.008705356158316135,
+      "kl": 0.03181586228311062,
+      "learning_rate": 2.25e-06,
+      "loss": 0.0,
+      "num_tokens": 370010.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 81
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.82,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 5.754616737365723,
+      "kl": 0.06010713614523411,
+      "learning_rate": 2.227214430369506e-06,
+      "loss": 0.0001,
+      "num_tokens": 374626.0,
+      "reward": 0.9375,
+      "reward_std": 0.125,
+      "rewards/compute_privacy_reward/mean": 0.9375,
+      "rewards/compute_privacy_reward/std": 0.1767766922712326,
+      "step": 82
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.83,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.013759837485849857,
+      "kl": 0.045676082372665405,
+      "learning_rate": 2.204207344178836e-06,
+      "loss": 0.0,
+      "num_tokens": 379242.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 83
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.84,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.010396880097687244,
+      "kl": 0.03468891978263855,
+      "learning_rate": 2.18098574960932e-06,
+      "loss": 0.0,
+      "num_tokens": 383858.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 84
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.85,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.017123762518167496,
+      "kl": 0.06448491849005222,
+      "learning_rate": 2.157556720183616e-06,
+      "loss": 0.0001,
+      "num_tokens": 388474.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 85
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.86,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.013710317201912403,
+      "kl": 0.06623340398073196,
+      "learning_rate": 2.1339273926110494e-06,
+      "loss": 0.0001,
+      "num_tokens": 393090.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 86
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.87,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.011785466223955154,
+      "kl": 0.06992711499333382,
+      "learning_rate": 2.1101049646137005e-06,
+      "loss": 0.0001,
+      "num_tokens": 397706.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 87
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.88,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.02513691782951355,
+      "kl": 0.06381690315902233,
+      "learning_rate": 2.0860966927339105e-06,
+      "loss": 0.0001,
+      "num_tokens": 402322.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 88
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.89,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.014653935097157955,
+      "kl": 0.09172879531979561,
+      "learning_rate": 2.061909890123868e-06,
+      "loss": 0.0001,
+      "num_tokens": 406938.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 89
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.9,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.011389149352908134,
+      "kl": 0.06803522258996964,
+      "learning_rate": 2.03755192431795e-06,
+      "loss": 0.0001,
+      "num_tokens": 411554.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 90
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.91,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.013293416239321232,
+      "kl": 0.05595729127526283,
+      "learning_rate": 2.0130302149885033e-06,
+      "loss": 0.0001,
+      "num_tokens": 416170.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 91
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.92,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.011796104721724987,
+      "kl": 0.08676471933722496,
+      "learning_rate": 1.988352231685735e-06,
+      "loss": 0.0001,
+      "num_tokens": 420786.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 92
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.93,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 4.556504249572754,
+      "kl": 0.06344365514814854,
+      "learning_rate": 1.963525491562421e-06,
+      "loss": 0.0001,
+      "num_tokens": 425402.0,
+      "reward": 0.9125000238418579,
+      "reward_std": 0.17499999701976776,
+      "rewards/compute_privacy_reward/mean": 0.9125000238418579,
+      "rewards/compute_privacy_reward/std": 0.2474873811006546,
+      "step": 93
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.94,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.012822140008211136,
+      "kl": 0.04089527949690819,
+      "learning_rate": 1.9385575570841053e-06,
+      "loss": 0.0,
+      "num_tokens": 430018.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 94
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.95,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.014192065224051476,
+      "kl": 0.07499993592500687,
+      "learning_rate": 1.9134560337254986e-06,
+      "loss": 0.0001,
+      "num_tokens": 434634.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 95
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.96,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 8.906691551208496,
+      "kl": 0.08601418137550354,
+      "learning_rate": 1.888228567653781e-06,
+      "loss": 0.0001,
+      "num_tokens": 439250.0,
+      "reward": 0.9375,
+      "reward_std": 0.125,
+      "rewards/compute_privacy_reward/mean": 0.9375,
+      "rewards/compute_privacy_reward/std": 0.1767766922712326,
+      "step": 96
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.97,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.013597754761576653,
+      "kl": 0.057392850518226624,
+      "learning_rate": 1.8628828433995015e-06,
+      "loss": 0.0001,
+      "num_tokens": 443866.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 97
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.98,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.012854848057031631,
+      "kl": 0.08813724294304848,
+      "learning_rate": 1.8374265815157977e-06,
+      "loss": 0.0001,
+      "num_tokens": 448482.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 98
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.99,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.011715216562151909,
+      "kl": 0.12619851529598236,
+      "learning_rate": 1.8118675362266389e-06,
+      "loss": 0.0001,
+      "num_tokens": 453098.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 99
+    },
+    {
+      "completion_length": 512.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 512.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 1.0,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.012364454567432404,
+      "kl": 0.12042452394962311,
+      "learning_rate": 1.7862134930648174e-06,
+      "loss": 0.0001,
+      "num_tokens": 457714.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/compute_privacy_reward/mean": 1.0,
+      "rewards/compute_privacy_reward/std": 0.0,
+      "step": 100
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 200,
+  "num_input_tokens_seen": 457714,
+  "num_train_epochs": 2,
+  "save_steps": 50,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fc146d2f3740f3b64d0b0ae856e7009071840f7b41ed8ddd5f24f2fe6a34d959
+size 6929

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff