tahamajs
/

my-awesome-model_final_bitcoin_enhanced_prediction_dataset_with_local_comprehensive_news

tahamajs committed 7 days ago

Commit

068ceb7

verified ·

1 Parent(s): 8e52da4

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +7 -0
added_tokens.json +31 -0
chat_template.jinja +89 -0
checkpoint-1000/README.md +208 -0
checkpoint-1000/adapter_config.json +41 -0
checkpoint-1000/adapter_model.safetensors +3 -0
checkpoint-1000/added_tokens.json +31 -0
checkpoint-1000/chat_template.jinja +89 -0
checkpoint-1000/merges.txt +0 -0
checkpoint-1000/optimizer.pt +3 -0
checkpoint-1000/rng_state.pth +3 -0
checkpoint-1000/scaler.pt +3 -0
checkpoint-1000/scheduler.pt +3 -0
checkpoint-1000/special_tokens_map.json +39 -0
checkpoint-1000/tokenizer.json +3 -0
checkpoint-1000/tokenizer_config.json +254 -0
checkpoint-1000/trainer_state.json +3534 -0
checkpoint-1000/training_args.bin +3 -0
checkpoint-1000/vocab.json +0 -0
checkpoint-1152/README.md +208 -0
checkpoint-1152/adapter_config.json +41 -0
checkpoint-1152/adapter_model.safetensors +3 -0
checkpoint-1152/added_tokens.json +31 -0
checkpoint-1152/chat_template.jinja +89 -0
checkpoint-1152/merges.txt +0 -0
checkpoint-1152/optimizer.pt +3 -0
checkpoint-1152/rng_state.pth +3 -0
checkpoint-1152/scaler.pt +3 -0
checkpoint-1152/scheduler.pt +3 -0
checkpoint-1152/special_tokens_map.json +39 -0
checkpoint-1152/tokenizer.json +3 -0
checkpoint-1152/tokenizer_config.json +254 -0
checkpoint-1152/trainer_state.json +4066 -0
checkpoint-1152/training_args.bin +3 -0
checkpoint-1152/vocab.json +0 -0
checkpoint-200/README.md +208 -0
checkpoint-200/adapter_config.json +41 -0
checkpoint-200/adapter_model.safetensors +3 -0
checkpoint-200/added_tokens.json +31 -0
checkpoint-200/chat_template.jinja +89 -0
checkpoint-200/merges.txt +0 -0
checkpoint-200/optimizer.pt +3 -0
checkpoint-200/rng_state.pth +3 -0
checkpoint-200/scaler.pt +3 -0
checkpoint-200/scheduler.pt +3 -0
checkpoint-200/special_tokens_map.json +39 -0
checkpoint-200/tokenizer.json +3 -0
checkpoint-200/tokenizer_config.json +254 -0
checkpoint-200/trainer_state.json +734 -0
checkpoint-200/training_args.bin +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+checkpoint-1000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-1152/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-600/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

added_tokens.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "</think>": 151668,
+  "</tool_call>": 151658,
+  "</tool_response>": 151666,
+  "<think>": 151667,
+  "<tool_call>": 151657,
+  "<tool_response>": 151665,
+  "<|analysis|>": 151670,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|forecast|>": 151671,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|response|>": 151669,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

chat_template.jinja ADDED Viewed

	@@ -0,0 +1,89 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if message.content is string %}
+        {%- set content = message.content %}
+    {%- else %}
+        {%- set content = '' %}
+    {%- endif %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is string %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in content %}
+                {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+                {%- set content = content.split('</think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}

checkpoint-1000/README.md ADDED Viewed

	@@ -0,0 +1,208 @@

+---
+base_model: ./Qwen3-8B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:./Qwen3-8B
+- lora
+- transformers
+- unsloth
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.16.0

checkpoint-1000/adapter_config.json ADDED Viewed

	@@ -0,0 +1,41 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./Qwen3-8B",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "qalora_group_size": 16,
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "gate_proj",
+    "up_proj",
+    "down_proj",
+    "o_proj",
+    "k_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": true
+}

checkpoint-1000/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ad45661894700811a0a4f19acc6b8afc9d98db1406cc80d88fa464a18ce495ea
+size 2834238032

checkpoint-1000/added_tokens.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "</think>": 151668,
+  "</tool_call>": 151658,
+  "</tool_response>": 151666,
+  "<think>": 151667,
+  "<tool_call>": 151657,
+  "<tool_response>": 151665,
+  "<|analysis|>": 151670,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|forecast|>": 151671,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|response|>": 151669,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

checkpoint-1000/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,89 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if message.content is string %}
+        {%- set content = message.content %}
+    {%- else %}
+        {%- set content = '' %}
+    {%- endif %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is string %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in content %}
+                {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+                {%- set content = content.split('</think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}

checkpoint-1000/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-1000/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:888353d1da4d630234d41b91788dd2aafb767cfb17ebf85761b00e83607c23e1
+size 698777675

checkpoint-1000/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a8e2011629d8bed3ef560fa11175cac55684c4e12a72634bb24abf767b6c7399
+size 14645

checkpoint-1000/scaler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9779a733270277f15e820d84d3dfdfb3a66fd96b857f3f0109ac7f2b54244d67
+size 1383

checkpoint-1000/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7b938d276d47acf5a6dc15bc7c48a9e3e0ede2cc320ecd371c94b59541d8d616
+size 1465

checkpoint-1000/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "additional_special_tokens": [
+    {
+      "content": "<|response|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|analysis|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|forecast|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    }
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-1000/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:77247e5fb2e966d04e513068b17cca472e105e7c56953e9b1d27d70b93d77e6f
+size 11423221

checkpoint-1000/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,254 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151665": {
+      "content": "<tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151666": {
+      "content": "</tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151667": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151668": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151669": {
+      "content": "<|response|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151670": {
+      "content": "<|analysis|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151671": {
+      "content": "<|forecast|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|response|>",
+    "<|analysis|>",
+    "<|forecast|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 40960,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "right",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoint-1000/trainer_state.json ADDED Viewed

	@@ -0,0 +1,3534 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.4722222222222223,
+  "eval_steps": 500,
+  "global_step": 1000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.006944444444444444,
+      "grad_norm": 0.0001319512666668743,
+      "learning_rate": 3.448275862068966e-06,
+      "loss": 21.3796,
+      "step": 2
+    },
+    {
+      "epoch": 0.013888888888888888,
+      "grad_norm": 0.00012970938405487686,
+      "learning_rate": 1.0344827586206897e-05,
+      "loss": 2.5144,
+      "step": 4
+    },
+    {
+      "epoch": 0.020833333333333332,
+      "grad_norm": 0.00012933755351696163,
+      "learning_rate": 1.7241379310344828e-05,
+      "loss": 13.8026,
+      "step": 6
+    },
+    {
+      "epoch": 0.027777777777777776,
+      "grad_norm": 0.00012198727199574932,
+      "learning_rate": 2.413793103448276e-05,
+      "loss": 4.9835,
+      "step": 8
+    },
+    {
+      "epoch": 0.034722222222222224,
+      "grad_norm": 0.00010912115249084309,
+      "learning_rate": 3.103448275862069e-05,
+      "loss": 2.397,
+      "step": 10
+    },
+    {
+      "epoch": 0.041666666666666664,
+      "grad_norm": 9.947551734512672e-05,
+      "learning_rate": 3.793103448275862e-05,
+      "loss": 2.3169,
+      "step": 12
+    },
+    {
+      "epoch": 0.04861111111111111,
+      "grad_norm": 9.700353257358074e-05,
+      "learning_rate": 4.482758620689655e-05,
+      "loss": 2.2121,
+      "step": 14
+    },
+    {
+      "epoch": 0.05555555555555555,
+      "grad_norm": 0.00010787531937239692,
+      "learning_rate": 5.172413793103449e-05,
+      "loss": 2.5911,
+      "step": 16
+    },
+    {
+      "epoch": 0.0625,
+      "grad_norm": 0.00010262445721309632,
+      "learning_rate": 5.862068965517241e-05,
+      "loss": 1.9774,
+      "step": 18
+    },
+    {
+      "epoch": 0.06944444444444445,
+      "grad_norm": 8.186206105165184e-05,
+      "learning_rate": 6.551724137931034e-05,
+      "loss": 1.8391,
+      "step": 20
+    },
+    {
+      "epoch": 0.0763888888888889,
+      "grad_norm": 5.4703454225091264e-05,
+      "learning_rate": 7.241379310344828e-05,
+      "loss": 1.7289,
+      "step": 22
+    },
+    {
+      "epoch": 0.08333333333333333,
+      "grad_norm": 4.5757446059724316e-05,
+      "learning_rate": 7.931034482758621e-05,
+      "loss": 1.867,
+      "step": 24
+    },
+    {
+      "epoch": 0.09027777777777778,
+      "grad_norm": 8.86492634890601e-05,
+      "learning_rate": 8.620689655172413e-05,
+      "loss": 1.7276,
+      "step": 26
+    },
+    {
+      "epoch": 0.09722222222222222,
+      "grad_norm": 3.919301525456831e-05,
+      "learning_rate": 9.310344827586207e-05,
+      "loss": 1.6506,
+      "step": 28
+    },
+    {
+      "epoch": 0.10416666666666667,
+      "grad_norm": 5.642673568218015e-05,
+      "learning_rate": 0.0001,
+      "loss": 1.5732,
+      "step": 30
+    },
+    {
+      "epoch": 0.1111111111111111,
+      "grad_norm": 6.0812188166892156e-05,
+      "learning_rate": 0.00010689655172413792,
+      "loss": 1.5062,
+      "step": 32
+    },
+    {
+      "epoch": 0.11805555555555555,
+      "grad_norm": 2.352882074774243e-05,
+      "learning_rate": 0.00011379310344827588,
+      "loss": 1.393,
+      "step": 34
+    },
+    {
+      "epoch": 0.125,
+      "grad_norm": 3.3942505979212e-05,
+      "learning_rate": 0.0001206896551724138,
+      "loss": 1.3654,
+      "step": 36
+    },
+    {
+      "epoch": 0.13194444444444445,
+      "grad_norm": 2.9797614843118936e-05,
+      "learning_rate": 0.00012758620689655174,
+      "loss": 1.3662,
+      "step": 38
+    },
+    {
+      "epoch": 0.1388888888888889,
+      "grad_norm": 2.0893716282444075e-05,
+      "learning_rate": 0.00013448275862068965,
+      "loss": 1.344,
+      "step": 40
+    },
+    {
+      "epoch": 0.14583333333333334,
+      "grad_norm": 1.886891550384462e-05,
+      "learning_rate": 0.0001413793103448276,
+      "loss": 1.2809,
+      "step": 42
+    },
+    {
+      "epoch": 0.1527777777777778,
+      "grad_norm": 1.6420885003753938e-05,
+      "learning_rate": 0.00014827586206896554,
+      "loss": 1.2911,
+      "step": 44
+    },
+    {
+      "epoch": 0.1597222222222222,
+      "grad_norm": 2.6823576263268478e-05,
+      "learning_rate": 0.00015517241379310346,
+      "loss": 1.3095,
+      "step": 46
+    },
+    {
+      "epoch": 0.16666666666666666,
+      "grad_norm": 1.2686981790466234e-05,
+      "learning_rate": 0.00016206896551724137,
+      "loss": 1.2149,
+      "step": 48
+    },
+    {
+      "epoch": 0.1736111111111111,
+      "grad_norm": 1.0219813702860847e-05,
+      "learning_rate": 0.00016896551724137932,
+      "loss": 1.2522,
+      "step": 50
+    },
+    {
+      "epoch": 0.18055555555555555,
+      "grad_norm": 1.0080276297230739e-05,
+      "learning_rate": 0.00017586206896551723,
+      "loss": 1.2311,
+      "step": 52
+    },
+    {
+      "epoch": 0.1875,
+      "grad_norm": 9.221699656336568e-06,
+      "learning_rate": 0.00018275862068965518,
+      "loss": 1.2138,
+      "step": 54
+    },
+    {
+      "epoch": 0.19444444444444445,
+      "grad_norm": 1.0500927601242438e-05,
+      "learning_rate": 0.00018965517241379312,
+      "loss": 1.2149,
+      "step": 56
+    },
+    {
+      "epoch": 0.2013888888888889,
+      "grad_norm": 8.30852695798967e-06,
+      "learning_rate": 0.00019655172413793104,
+      "loss": 1.212,
+      "step": 58
+    },
+    {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 1.2315202184254304e-05,
+      "learning_rate": 0.0001999995876796145,
+      "loss": 1.2239,
+      "step": 60
+    },
+    {
+      "epoch": 0.2152777777777778,
+      "grad_norm": 1.1090536645497195e-05,
+      "learning_rate": 0.00019999628913693117,
+      "loss": 1.2075,
+      "step": 62
+    },
+    {
+      "epoch": 0.2222222222222222,
+      "grad_norm": 1.1589069799811114e-05,
+      "learning_rate": 0.00019998969216036892,
+      "loss": 1.2026,
+      "step": 64
+    },
+    {
+      "epoch": 0.22916666666666666,
+      "grad_norm": 1.0656535778252874e-05,
+      "learning_rate": 0.0001999797969675326,
+      "loss": 1.2126,
+      "step": 66
+    },
+    {
+      "epoch": 0.2361111111111111,
+      "grad_norm": 9.856509677774739e-06,
+      "learning_rate": 0.00019996660388482083,
+      "loss": 1.166,
+      "step": 68
+    },
+    {
+      "epoch": 0.24305555555555555,
+      "grad_norm": 1.5398893083329313e-05,
+      "learning_rate": 0.00019995011334741477,
+      "loss": 1.215,
+      "step": 70
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 8.422492101090029e-06,
+      "learning_rate": 0.00019993032589926414,
+      "loss": 1.1868,
+      "step": 72
+    },
+    {
+      "epoch": 0.2569444444444444,
+      "grad_norm": 8.810847248241771e-06,
+      "learning_rate": 0.00019990724219306902,
+      "loss": 1.1971,
+      "step": 74
+    },
+    {
+      "epoch": 0.2638888888888889,
+      "grad_norm": 1.1227813956793398e-05,
+      "learning_rate": 0.00019988086299025848,
+      "loss": 1.1684,
+      "step": 76
+    },
+    {
+      "epoch": 0.2708333333333333,
+      "grad_norm": 1.0562929674051702e-05,
+      "learning_rate": 0.00019985118916096534,
+      "loss": 1.1981,
+      "step": 78
+    },
+    {
+      "epoch": 0.2777777777777778,
+      "grad_norm": 1.521587728348095e-05,
+      "learning_rate": 0.00019981822168399756,
+      "loss": 1.1838,
+      "step": 80
+    },
+    {
+      "epoch": 0.2847222222222222,
+      "grad_norm": 1.0759257747849915e-05,
+      "learning_rate": 0.00019978196164680597,
+      "loss": 1.2032,
+      "step": 82
+    },
+    {
+      "epoch": 0.2916666666666667,
+      "grad_norm": 8.712796443433035e-06,
+      "learning_rate": 0.00019974241024544828,
+      "loss": 1.1937,
+      "step": 84
+    },
+    {
+      "epoch": 0.2986111111111111,
+      "grad_norm": 1.368098037346499e-05,
+      "learning_rate": 0.00019969956878454972,
+      "loss": 1.1965,
+      "step": 86
+    },
+    {
+      "epoch": 0.3055555555555556,
+      "grad_norm": 1.0317597116227262e-05,
+      "learning_rate": 0.00019965343867725998,
+      "loss": 1.1908,
+      "step": 88
+    },
+    {
+      "epoch": 0.3125,
+      "grad_norm": 1.0250181730953045e-05,
+      "learning_rate": 0.00019960402144520665,
+      "loss": 1.1983,
+      "step": 90
+    },
+    {
+      "epoch": 0.3194444444444444,
+      "grad_norm": 1.0102179658133537e-05,
+      "learning_rate": 0.00019955131871844488,
+      "loss": 1.1842,
+      "step": 92
+    },
+    {
+      "epoch": 0.3263888888888889,
+      "grad_norm": 8.509154213243164e-06,
+      "learning_rate": 0.00019949533223540385,
+      "loss": 1.1871,
+      "step": 94
+    },
+    {
+      "epoch": 0.3333333333333333,
+      "grad_norm": 8.434612027485855e-06,
+      "learning_rate": 0.00019943606384282916,
+      "loss": 1.2072,
+      "step": 96
+    },
+    {
+      "epoch": 0.3402777777777778,
+      "grad_norm": 1.0174206181545742e-05,
+      "learning_rate": 0.0001993735154957221,
+      "loss": 1.2088,
+      "step": 98
+    },
+    {
+      "epoch": 0.3472222222222222,
+      "grad_norm": 9.98695850285003e-06,
+      "learning_rate": 0.00019930768925727514,
+      "loss": 1.1847,
+      "step": 100
+    },
+    {
+      "epoch": 0.3541666666666667,
+      "grad_norm": 8.591785444878042e-06,
+      "learning_rate": 0.0001992385872988038,
+      "loss": 1.2041,
+      "step": 102
+    },
+    {
+      "epoch": 0.3611111111111111,
+      "grad_norm": 1.0436694537929725e-05,
+      "learning_rate": 0.00019916621189967502,
+      "loss": 1.2194,
+      "step": 104
+    },
+    {
+      "epoch": 0.3680555555555556,
+      "grad_norm": 1.137161689257482e-05,
+      "learning_rate": 0.00019909056544723213,
+      "loss": 1.1788,
+      "step": 106
+    },
+    {
+      "epoch": 0.375,
+      "grad_norm": 1.0015843145083636e-05,
+      "learning_rate": 0.00019901165043671593,
+      "loss": 1.1979,
+      "step": 108
+    },
+    {
+      "epoch": 0.3819444444444444,
+      "grad_norm": 1.0921584362222347e-05,
+      "learning_rate": 0.00019892946947118242,
+      "loss": 1.1836,
+      "step": 110
+    },
+    {
+      "epoch": 0.3888888888888889,
+      "grad_norm": 2.0685600247816183e-05,
+      "learning_rate": 0.00019884402526141709,
+      "loss": 1.1883,
+      "step": 112
+    },
+    {
+      "epoch": 0.3958333333333333,
+      "grad_norm": 1.0137908247997984e-05,
+      "learning_rate": 0.00019875532062584519,
+      "loss": 1.183,
+      "step": 114
+    },
+    {
+      "epoch": 0.4027777777777778,
+      "grad_norm": 1.1504852409416344e-05,
+      "learning_rate": 0.00019866335849043912,
+      "loss": 1.1957,
+      "step": 116
+    },
+    {
+      "epoch": 0.4097222222222222,
+      "grad_norm": 8.32590194477234e-06,
+      "learning_rate": 0.00019856814188862166,
+      "loss": 1.1605,
+      "step": 118
+    },
+    {
+      "epoch": 0.4166666666666667,
+      "grad_norm": 1.0243880751659162e-05,
+      "learning_rate": 0.000198469673961166,
+      "loss": 1.1787,
+      "step": 120
+    },
+    {
+      "epoch": 0.4236111111111111,
+      "grad_norm": 9.695215339888819e-06,
+      "learning_rate": 0.00019836795795609213,
+      "loss": 1.1849,
+      "step": 122
+    },
+    {
+      "epoch": 0.4305555555555556,
+      "grad_norm": 9.29382167669246e-06,
+      "learning_rate": 0.00019826299722855976,
+      "loss": 1.1779,
+      "step": 124
+    },
+    {
+      "epoch": 0.4375,
+      "grad_norm": 1.0769907930807676e-05,
+      "learning_rate": 0.00019815479524075758,
+      "loss": 1.1878,
+      "step": 126
+    },
+    {
+      "epoch": 0.4444444444444444,
+      "grad_norm": 1.0001721420849208e-05,
+      "learning_rate": 0.000198043355561789,
+      "loss": 1.1963,
+      "step": 128
+    },
+    {
+      "epoch": 0.4513888888888889,
+      "grad_norm": 1.2609498298843391e-05,
+      "learning_rate": 0.00019792868186755463,
+      "loss": 1.2135,
+      "step": 130
+    },
+    {
+      "epoch": 0.4583333333333333,
+      "grad_norm": 9.788008355826605e-06,
+      "learning_rate": 0.00019781077794063073,
+      "loss": 1.2,
+      "step": 132
+    },
+    {
+      "epoch": 0.4652777777777778,
+      "grad_norm": 1.0332081728847697e-05,
+      "learning_rate": 0.00019768964767014475,
+      "loss": 1.1747,
+      "step": 134
+    },
+    {
+      "epoch": 0.4722222222222222,
+      "grad_norm": 1.2005073585896753e-05,
+      "learning_rate": 0.00019756529505164682,
+      "loss": 1.1907,
+      "step": 136
+    },
+    {
+      "epoch": 0.4791666666666667,
+      "grad_norm": 8.512701242580079e-06,
+      "learning_rate": 0.00019743772418697806,
+      "loss": 1.2034,
+      "step": 138
+    },
+    {
+      "epoch": 0.4861111111111111,
+      "grad_norm": 1.081860773410881e-05,
+      "learning_rate": 0.0001973069392841352,
+      "loss": 1.1764,
+      "step": 140
+    },
+    {
+      "epoch": 0.4930555555555556,
+      "grad_norm": 1.0664197361620609e-05,
+      "learning_rate": 0.000197172944657132,
+      "loss": 1.169,
+      "step": 142
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 8.31518536870135e-06,
+      "learning_rate": 0.00019703574472585648,
+      "loss": 1.1787,
+      "step": 144
+    },
+    {
+      "epoch": 0.5069444444444444,
+      "grad_norm": 9.874113857222255e-06,
+      "learning_rate": 0.00019689534401592568,
+      "loss": 1.1908,
+      "step": 146
+    },
+    {
+      "epoch": 0.5138888888888888,
+      "grad_norm": 1.027604412229266e-05,
+      "learning_rate": 0.00019675174715853605,
+      "loss": 1.2001,
+      "step": 148
+    },
+    {
+      "epoch": 0.5208333333333334,
+      "grad_norm": 1.5908390196273103e-05,
+      "learning_rate": 0.00019660495889031073,
+      "loss": 1.1771,
+      "step": 150
+    },
+    {
+      "epoch": 0.5277777777777778,
+      "grad_norm": 1.0185674909735098e-05,
+      "learning_rate": 0.00019645498405314337,
+      "loss": 1.1809,
+      "step": 152
+    },
+    {
+      "epoch": 0.5347222222222222,
+      "grad_norm": 1.4728296264365781e-05,
+      "learning_rate": 0.0001963018275940384,
+      "loss": 1.2066,
+      "step": 154
+    },
+    {
+      "epoch": 0.5416666666666666,
+      "grad_norm": 1.1691463441820815e-05,
+      "learning_rate": 0.00019614549456494778,
+      "loss": 1.1879,
+      "step": 156
+    },
+    {
+      "epoch": 0.5486111111111112,
+      "grad_norm": 1.1677379916363861e-05,
+      "learning_rate": 0.0001959859901226045,
+      "loss": 1.1758,
+      "step": 158
+    },
+    {
+      "epoch": 0.5555555555555556,
+      "grad_norm": 1.2042312846460845e-05,
+      "learning_rate": 0.0001958233195283524,
+      "loss": 1.1741,
+      "step": 160
+    },
+    {
+      "epoch": 0.5625,
+      "grad_norm": 1.5415562302223407e-05,
+      "learning_rate": 0.00019565748814797252,
+      "loss": 1.1855,
+      "step": 162
+    },
+    {
+      "epoch": 0.5694444444444444,
+      "grad_norm": 1.4021643437445164e-05,
+      "learning_rate": 0.00019548850145150633,
+      "loss": 1.1937,
+      "step": 164
+    },
+    {
+      "epoch": 0.5763888888888888,
+      "grad_norm": 1.470915594836697e-05,
+      "learning_rate": 0.00019531636501307512,
+      "loss": 1.1946,
+      "step": 166
+    },
+    {
+      "epoch": 0.5833333333333334,
+      "grad_norm": 1.355435324512655e-05,
+      "learning_rate": 0.00019514108451069615,
+      "loss": 1.1898,
+      "step": 168
+    },
+    {
+      "epoch": 0.5902777777777778,
+      "grad_norm": 1.642305687710177e-05,
+      "learning_rate": 0.00019496266572609547,
+      "loss": 1.1822,
+      "step": 170
+    },
+    {
+      "epoch": 0.5972222222222222,
+      "grad_norm": 1.8154083591070957e-05,
+      "learning_rate": 0.00019478111454451712,
+      "loss": 1.1751,
+      "step": 172
+    },
+    {
+      "epoch": 0.6041666666666666,
+      "grad_norm": 1.3315224350662902e-05,
+      "learning_rate": 0.00019459643695452904,
+      "loss": 1.1826,
+      "step": 174
+    },
+    {
+      "epoch": 0.6111111111111112,
+      "grad_norm": 1.2248892744537443e-05,
+      "learning_rate": 0.00019440863904782543,
+      "loss": 1.213,
+      "step": 176
+    },
+    {
+      "epoch": 0.6180555555555556,
+      "grad_norm": 1.7752956409822218e-05,
+      "learning_rate": 0.00019421772701902596,
+      "loss": 1.1833,
+      "step": 178
+    },
+    {
+      "epoch": 0.625,
+      "grad_norm": 1.744980545481667e-05,
+      "learning_rate": 0.00019402370716547135,
+      "loss": 1.1974,
+      "step": 180
+    },
+    {
+      "epoch": 0.6319444444444444,
+      "grad_norm": 1.9521774447639473e-05,
+      "learning_rate": 0.00019382658588701568,
+      "loss": 1.1931,
+      "step": 182
+    },
+    {
+      "epoch": 0.6388888888888888,
+      "grad_norm": 1.5161932424234692e-05,
+      "learning_rate": 0.00019362636968581524,
+      "loss": 1.1901,
+      "step": 184
+    },
+    {
+      "epoch": 0.6458333333333334,
+      "grad_norm": 1.868332583399024e-05,
+      "learning_rate": 0.00019342306516611417,
+      "loss": 1.2045,
+      "step": 186
+    },
+    {
+      "epoch": 0.6527777777777778,
+      "grad_norm": 2.162260534532834e-05,
+      "learning_rate": 0.00019321667903402642,
+      "loss": 1.1899,
+      "step": 188
+    },
+    {
+      "epoch": 0.6597222222222222,
+      "grad_norm": 1.507936030975543e-05,
+      "learning_rate": 0.00019300721809731476,
+      "loss": 1.2029,
+      "step": 190
+    },
+    {
+      "epoch": 0.6666666666666666,
+      "grad_norm": 2.0704670532722957e-05,
+      "learning_rate": 0.00019279468926516606,
+      "loss": 1.2063,
+      "step": 192
+    },
+    {
+      "epoch": 0.6736111111111112,
+      "grad_norm": 1.8972095858771354e-05,
+      "learning_rate": 0.0001925790995479635,
+      "loss": 1.1861,
+      "step": 194
+    },
+    {
+      "epoch": 0.6805555555555556,
+      "grad_norm": 1.5989120583981276e-05,
+      "learning_rate": 0.0001923604560570552,
+      "loss": 1.2143,
+      "step": 196
+    },
+    {
+      "epoch": 0.6875,
+      "grad_norm": 1.15529483082355e-05,
+      "learning_rate": 0.00019213876600451978,
+      "loss": 1.1939,
+      "step": 198
+    },
+    {
+      "epoch": 0.6944444444444444,
+      "grad_norm": 1.474355212849332e-05,
+      "learning_rate": 0.0001919140367029284,
+      "loss": 1.1909,
+      "step": 200
+    },
+    {
+      "epoch": 0.7013888888888888,
+      "grad_norm": 1.4453116818913259e-05,
+      "learning_rate": 0.00019168627556510358,
+      "loss": 1.1669,
+      "step": 202
+    },
+    {
+      "epoch": 0.7083333333333334,
+      "grad_norm": 1.5761746908538043e-05,
+      "learning_rate": 0.00019145549010387463,
+      "loss": 1.1724,
+      "step": 204
+    },
+    {
+      "epoch": 0.7152777777777778,
+      "grad_norm": 1.6146143025252968e-05,
+      "learning_rate": 0.00019122168793182987,
+      "loss": 1.1755,
+      "step": 206
+    },
+    {
+      "epoch": 0.7222222222222222,
+      "grad_norm": 1.3478926121024415e-05,
+      "learning_rate": 0.00019098487676106558,
+      "loss": 1.1972,
+      "step": 208
+    },
+    {
+      "epoch": 0.7291666666666666,
+      "grad_norm": 1.650435478950385e-05,
+      "learning_rate": 0.00019074506440293148,
+      "loss": 1.1689,
+      "step": 210
+    },
+    {
+      "epoch": 0.7361111111111112,
+      "grad_norm": 1.2287140634725802e-05,
+      "learning_rate": 0.00019050225876777316,
+      "loss": 1.1904,
+      "step": 212
+    },
+    {
+      "epoch": 0.7430555555555556,
+      "grad_norm": 1.4642901078332216e-05,
+      "learning_rate": 0.00019025646786467116,
+      "loss": 1.179,
+      "step": 214
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 1.2564278222271241e-05,
+      "learning_rate": 0.00019000769980117682,
+      "loss": 1.1748,
+      "step": 216
+    },
+    {
+      "epoch": 0.7569444444444444,
+      "grad_norm": 1.6164931366802193e-05,
+      "learning_rate": 0.0001897559627830447,
+      "loss": 1.1975,
+      "step": 218
+    },
+    {
+      "epoch": 0.7638888888888888,
+      "grad_norm": 1.9057128156418912e-05,
+      "learning_rate": 0.000189501265113962,
+      "loss": 1.1759,
+      "step": 220
+    },
+    {
+      "epoch": 0.7708333333333334,
+      "grad_norm": 1.6131505617522635e-05,
+      "learning_rate": 0.00018924361519527473,
+      "loss": 1.1886,
+      "step": 222
+    },
+    {
+      "epoch": 0.7777777777777778,
+      "grad_norm": 1.875819725682959e-05,
+      "learning_rate": 0.00018898302152571043,
+      "loss": 1.198,
+      "step": 224
+    },
+    {
+      "epoch": 0.7847222222222222,
+      "grad_norm": 2.0776369638042524e-05,
+      "learning_rate": 0.000188719492701098,
+      "loss": 1.1851,
+      "step": 226
+    },
+    {
+      "epoch": 0.7916666666666666,
+      "grad_norm": 1.8655549865798093e-05,
+      "learning_rate": 0.000188453037414084,
+      "loss": 1.1942,
+      "step": 228
+    },
+    {
+      "epoch": 0.7986111111111112,
+      "grad_norm": 2.2143083697301336e-05,
+      "learning_rate": 0.0001881836644538461,
+      "loss": 1.1713,
+      "step": 230
+    },
+    {
+      "epoch": 0.8055555555555556,
+      "grad_norm": 1.975363375095185e-05,
+      "learning_rate": 0.000187911382705803,
+      "loss": 1.1768,
+      "step": 232
+    },
+    {
+      "epoch": 0.8125,
+      "grad_norm": 2.22298640437657e-05,
+      "learning_rate": 0.00018763620115132135,
+      "loss": 1.1878,
+      "step": 234
+    },
+    {
+      "epoch": 0.8194444444444444,
+      "grad_norm": 1.9831964891636744e-05,
+      "learning_rate": 0.00018735812886741968,
+      "loss": 1.1945,
+      "step": 236
+    },
+    {
+      "epoch": 0.8263888888888888,
+      "grad_norm": 2.3476728529203683e-05,
+      "learning_rate": 0.00018707717502646873,
+      "loss": 1.1759,
+      "step": 238
+    },
+    {
+      "epoch": 0.8333333333333334,
+      "grad_norm": 2.4707373086130247e-05,
+      "learning_rate": 0.0001867933488958891,
+      "loss": 1.207,
+      "step": 240
+    },
+    {
+      "epoch": 0.8402777777777778,
+      "grad_norm": 2.212941035395488e-05,
+      "learning_rate": 0.00018650665983784546,
+      "loss": 1.2087,
+      "step": 242
+    },
+    {
+      "epoch": 0.8472222222222222,
+      "grad_norm": 1.8686198018258438e-05,
+      "learning_rate": 0.00018621711730893776,
+      "loss": 1.1937,
+      "step": 244
+    },
+    {
+      "epoch": 0.8541666666666666,
+      "grad_norm": 2.0610183128155768e-05,
+      "learning_rate": 0.00018592473085988925,
+      "loss": 1.1864,
+      "step": 246
+    },
+    {
+      "epoch": 0.8611111111111112,
+      "grad_norm": 1.8564192941994406e-05,
+      "learning_rate": 0.00018562951013523154,
+      "loss": 1.1726,
+      "step": 248
+    },
+    {
+      "epoch": 0.8680555555555556,
+      "grad_norm": 2.2897967937751673e-05,
+      "learning_rate": 0.00018533146487298638,
+      "loss": 1.2127,
+      "step": 250
+    },
+    {
+      "epoch": 0.875,
+      "grad_norm": 2.0674704501288943e-05,
+      "learning_rate": 0.0001850306049043445,
+      "loss": 1.1884,
+      "step": 252
+    },
+    {
+      "epoch": 0.8819444444444444,
+      "grad_norm": 1.8763206753646955e-05,
+      "learning_rate": 0.00018472694015334132,
+      "loss": 1.1698,
+      "step": 254
+    },
+    {
+      "epoch": 0.8888888888888888,
+      "grad_norm": 4.040408748551272e-05,
+      "learning_rate": 0.00018442048063652952,
+      "loss": 1.1681,
+      "step": 256
+    },
+    {
+      "epoch": 0.8958333333333334,
+      "grad_norm": 2.0314189896453172e-05,
+      "learning_rate": 0.00018411123646264882,
+      "loss": 1.1708,
+      "step": 258
+    },
+    {
+      "epoch": 0.9027777777777778,
+      "grad_norm": 2.7322474124957807e-05,
+      "learning_rate": 0.0001837992178322923,
+      "loss": 1.1995,
+      "step": 260
+    },
+    {
+      "epoch": 0.9097222222222222,
+      "grad_norm": 2.6407045879750513e-05,
+      "learning_rate": 0.0001834844350375701,
+      "loss": 1.1898,
+      "step": 262
+    },
+    {
+      "epoch": 0.9166666666666666,
+      "grad_norm": 2.5243745767511427e-05,
+      "learning_rate": 0.00018316689846176992,
+      "loss": 1.1898,
+      "step": 264
+    },
+    {
+      "epoch": 0.9236111111111112,
+      "grad_norm": 2.753574517555535e-05,
+      "learning_rate": 0.00018284661857901436,
+      "loss": 1.187,
+      "step": 266
+    },
+    {
+      "epoch": 0.9305555555555556,
+      "grad_norm": 3.223833846277557e-05,
+      "learning_rate": 0.00018252360595391565,
+      "loss": 1.1908,
+      "step": 268
+    },
+    {
+      "epoch": 0.9375,
+      "grad_norm": 2.6311214242014103e-05,
+      "learning_rate": 0.00018219787124122708,
+      "loss": 1.206,
+      "step": 270
+    },
+    {
+      "epoch": 0.9444444444444444,
+      "grad_norm": 2.6744064598460682e-05,
+      "learning_rate": 0.00018186942518549145,
+      "loss": 1.2154,
+      "step": 272
+    },
+    {
+      "epoch": 0.9513888888888888,
+      "grad_norm": 2.5860512323561125e-05,
+      "learning_rate": 0.00018153827862068674,
+      "loss": 1.1825,
+      "step": 274
+    },
+    {
+      "epoch": 0.9583333333333334,
+      "grad_norm": 3.4333595976931974e-05,
+      "learning_rate": 0.00018120444246986882,
+      "loss": 1.1831,
+      "step": 276
+    },
+    {
+      "epoch": 0.9652777777777778,
+      "grad_norm": 3.32353483827319e-05,
+      "learning_rate": 0.00018086792774481102,
+      "loss": 1.2021,
+      "step": 278
+    },
+    {
+      "epoch": 0.9722222222222222,
+      "grad_norm": 3.6012690543429926e-05,
+      "learning_rate": 0.00018052874554564088,
+      "loss": 1.1799,
+      "step": 280
+    },
+    {
+      "epoch": 0.9791666666666666,
+      "grad_norm": 3.507096698740497e-05,
+      "learning_rate": 0.00018018690706047422,
+      "loss": 1.2028,
+      "step": 282
+    },
+    {
+      "epoch": 0.9861111111111112,
+      "grad_norm": 4.70953673357144e-05,
+      "learning_rate": 0.00017984242356504585,
+      "loss": 1.1677,
+      "step": 284
+    },
+    {
+      "epoch": 0.9930555555555556,
+      "grad_norm": 3.5310935345478356e-05,
+      "learning_rate": 0.00017949530642233773,
+      "loss": 1.2099,
+      "step": 286
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 4.8496305680600926e-05,
+      "learning_rate": 0.00017914556708220424,
+      "loss": 1.195,
+      "step": 288
+    },
+    {
+      "epoch": 1.0069444444444444,
+      "grad_norm": 4.527560668066144e-05,
+      "learning_rate": 0.00017879321708099433,
+      "loss": 1.2009,
+      "step": 290
+    },
+    {
+      "epoch": 1.0138888888888888,
+      "grad_norm": 4.206915036775172e-05,
+      "learning_rate": 0.0001784382680411711,
+      "loss": 1.1736,
+      "step": 292
+    },
+    {
+      "epoch": 1.0208333333333333,
+      "grad_norm": 4.1610346670495346e-05,
+      "learning_rate": 0.0001780807316709284,
+      "loss": 1.2031,
+      "step": 294
+    },
+    {
+      "epoch": 1.0277777777777777,
+      "grad_norm": 5.509778202394955e-05,
+      "learning_rate": 0.00017772061976380465,
+      "loss": 1.1989,
+      "step": 296
+    },
+    {
+      "epoch": 1.0347222222222223,
+      "grad_norm": 4.0269846067531034e-05,
+      "learning_rate": 0.0001773579441982938,
+      "loss": 1.1862,
+      "step": 298
+    },
+    {
+      "epoch": 1.0416666666666667,
+      "grad_norm": 4.829439421882853e-05,
+      "learning_rate": 0.0001769927169374535,
+      "loss": 1.1962,
+      "step": 300
+    },
+    {
+      "epoch": 1.0486111111111112,
+      "grad_norm": 4.680109486798756e-05,
+      "learning_rate": 0.00017662495002851049,
+      "loss": 1.1822,
+      "step": 302
+    },
+    {
+      "epoch": 1.0555555555555556,
+      "grad_norm": 4.947670822730288e-05,
+      "learning_rate": 0.0001762546556024633,
+      "loss": 1.214,
+      "step": 304
+    },
+    {
+      "epoch": 1.0625,
+      "grad_norm": 5.944677104707807e-05,
+      "learning_rate": 0.00017588184587368196,
+      "loss": 1.2075,
+      "step": 306
+    },
+    {
+      "epoch": 1.0694444444444444,
+      "grad_norm": 6.578410102520138e-05,
+      "learning_rate": 0.0001755065331395052,
+      "loss": 1.2299,
+      "step": 308
+    },
+    {
+      "epoch": 1.0763888888888888,
+      "grad_norm": 8.231692481786013e-05,
+      "learning_rate": 0.00017512872977983482,
+      "loss": 1.208,
+      "step": 310
+    },
+    {
+      "epoch": 1.0833333333333333,
+      "grad_norm": 6.243359530344605e-05,
+      "learning_rate": 0.00017474844825672727,
+      "loss": 1.2286,
+      "step": 312
+    },
+    {
+      "epoch": 1.0902777777777777,
+      "grad_norm": 8.204213372664526e-05,
+      "learning_rate": 0.00017436570111398263,
+      "loss": 1.2454,
+      "step": 314
+    },
+    {
+      "epoch": 1.0972222222222223,
+      "grad_norm": 7.384664058918133e-05,
+      "learning_rate": 0.00017398050097673081,
+      "loss": 1.2061,
+      "step": 316
+    },
+    {
+      "epoch": 1.1041666666666667,
+      "grad_norm": 7.188819290604442e-05,
+      "learning_rate": 0.0001735928605510152,
+      "loss": 1.2155,
+      "step": 318
+    },
+    {
+      "epoch": 1.1111111111111112,
+      "grad_norm": 0.00010495231254026294,
+      "learning_rate": 0.00017320279262337333,
+      "loss": 1.236,
+      "step": 320
+    },
+    {
+      "epoch": 1.1180555555555556,
+      "grad_norm": 9.512733231531456e-05,
+      "learning_rate": 0.00017281031006041538,
+      "loss": 1.253,
+      "step": 322
+    },
+    {
+      "epoch": 1.125,
+      "grad_norm": 9.048975334735587e-05,
+      "learning_rate": 0.00017241542580839964,
+      "loss": 1.2448,
+      "step": 324
+    },
+    {
+      "epoch": 1.1319444444444444,
+      "grad_norm": 8.587296906625852e-05,
+      "learning_rate": 0.0001720181528928054,
+      "loss": 1.2478,
+      "step": 326
+    },
+    {
+      "epoch": 1.1388888888888888,
+      "grad_norm": 9.011059592012316e-05,
+      "learning_rate": 0.00017161850441790332,
+      "loss": 1.2569,
+      "step": 328
+    },
+    {
+      "epoch": 1.1458333333333333,
+      "grad_norm": 9.588886314304546e-05,
+      "learning_rate": 0.00017121649356632333,
+      "loss": 1.2707,
+      "step": 330
+    },
+    {
+      "epoch": 1.1527777777777777,
+      "grad_norm": 9.867311746347696e-05,
+      "learning_rate": 0.00017081213359861964,
+      "loss": 1.2893,
+      "step": 332
+    },
+    {
+      "epoch": 1.1597222222222223,
+      "grad_norm": 0.00010548109275987372,
+      "learning_rate": 0.00017040543785283336,
+      "loss": 1.3716,
+      "step": 334
+    },
+    {
+      "epoch": 1.1666666666666667,
+      "grad_norm": 9.849516936810687e-05,
+      "learning_rate": 0.0001699964197440526,
+      "loss": 1.3677,
+      "step": 336
+    },
+    {
+      "epoch": 1.1736111111111112,
+      "grad_norm": 9.55751893343404e-05,
+      "learning_rate": 0.00016958509276396986,
+      "loss": 1.3871,
+      "step": 338
+    },
+    {
+      "epoch": 1.1805555555555556,
+      "grad_norm": 8.779441122896969e-05,
+      "learning_rate": 0.00016917147048043708,
+      "loss": 1.4266,
+      "step": 340
+    },
+    {
+      "epoch": 1.1875,
+      "grad_norm": 8.045154390856624e-05,
+      "learning_rate": 0.00016875556653701807,
+      "loss": 1.5035,
+      "step": 342
+    },
+    {
+      "epoch": 1.1944444444444444,
+      "grad_norm": 7.365510100498796e-05,
+      "learning_rate": 0.00016833739465253855,
+      "loss": 1.532,
+      "step": 344
+    },
+    {
+      "epoch": 1.2013888888888888,
+      "grad_norm": 6.366265006363392e-05,
+      "learning_rate": 0.00016791696862063343,
+      "loss": 1.5854,
+      "step": 346
+    },
+    {
+      "epoch": 1.2083333333333333,
+      "grad_norm": 5.384908217820339e-05,
+      "learning_rate": 0.0001674943023092921,
+      "loss": 1.6161,
+      "step": 348
+    },
+    {
+      "epoch": 1.2152777777777777,
+      "grad_norm": 4.3953212298220024e-05,
+      "learning_rate": 0.00016706940966040062,
+      "loss": 1.6505,
+      "step": 350
+    },
+    {
+      "epoch": 1.2222222222222223,
+      "grad_norm": 4.8670408432371914e-05,
+      "learning_rate": 0.00016664230468928226,
+      "loss": 1.6546,
+      "step": 352
+    },
+    {
+      "epoch": 1.2291666666666667,
+      "grad_norm": 4.31089210906066e-05,
+      "learning_rate": 0.0001662130014842348,
+      "loss": 1.6483,
+      "step": 354
+    },
+    {
+      "epoch": 1.2361111111111112,
+      "grad_norm": 3.783537613344379e-05,
+      "learning_rate": 0.0001657815142060661,
+      "loss": 1.6821,
+      "step": 356
+    },
+    {
+      "epoch": 1.2430555555555556,
+      "grad_norm": 3.4532826248323545e-05,
+      "learning_rate": 0.00016534785708762693,
+      "loss": 1.7036,
+      "step": 358
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 3.33928874169942e-05,
+      "learning_rate": 0.0001649120444333414,
+      "loss": 1.7133,
+      "step": 360
+    },
+    {
+      "epoch": 1.2569444444444444,
+      "grad_norm": 3.3881518902489915e-05,
+      "learning_rate": 0.0001644740906187352,
+      "loss": 1.7246,
+      "step": 362
+    },
+    {
+      "epoch": 1.2638888888888888,
+      "grad_norm": 2.9356864615692757e-05,
+      "learning_rate": 0.0001640340100899614,
+      "loss": 1.7653,
+      "step": 364
+    },
+    {
+      "epoch": 1.2708333333333333,
+      "grad_norm": 3.071104583796114e-05,
+      "learning_rate": 0.00016359181736332393,
+      "loss": 1.8163,
+      "step": 366
+    },
+    {
+      "epoch": 1.2777777777777777,
+      "grad_norm": 3.3303516829619184e-05,
+      "learning_rate": 0.00016314752702479882,
+      "loss": 1.8645,
+      "step": 368
+    },
+    {
+      "epoch": 1.2847222222222223,
+      "grad_norm": 3.376286258571781e-05,
+      "learning_rate": 0.00016270115372955286,
+      "loss": 1.904,
+      "step": 370
+    },
+    {
+      "epoch": 1.2916666666666667,
+      "grad_norm": 5.366417462937534e-05,
+      "learning_rate": 0.0001622527122014605,
+      "loss": 1.9253,
+      "step": 372
+    },
+    {
+      "epoch": 1.2986111111111112,
+      "grad_norm": 3.328424281789921e-05,
+      "learning_rate": 0.0001618022172326179,
+      "loss": 1.9897,
+      "step": 374
+    },
+    {
+      "epoch": 1.3055555555555556,
+      "grad_norm": 2.7745934858103283e-05,
+      "learning_rate": 0.00016134968368285518,
+      "loss": 2.0278,
+      "step": 376
+    },
+    {
+      "epoch": 1.3125,
+      "grad_norm": 3.089087113039568e-05,
+      "learning_rate": 0.0001608951264792462,
+      "loss": 2.0393,
+      "step": 378
+    },
+    {
+      "epoch": 1.3194444444444444,
+      "grad_norm": 3.455130718066357e-05,
+      "learning_rate": 0.00016043856061561613,
+      "loss": 2.0616,
+      "step": 380
+    },
+    {
+      "epoch": 1.3263888888888888,
+      "grad_norm": 2.891979420382995e-05,
+      "learning_rate": 0.000159980001152047,
+      "loss": 2.0735,
+      "step": 382
+    },
+    {
+      "epoch": 1.3333333333333333,
+      "grad_norm": 2.5224271666957065e-05,
+      "learning_rate": 0.00015951946321438073,
+      "loss": 2.1789,
+      "step": 384
+    },
+    {
+      "epoch": 1.3402777777777777,
+      "grad_norm": 2.7425830921856686e-05,
+      "learning_rate": 0.0001590569619937205,
+      "loss": 2.1456,
+      "step": 386
+    },
+    {
+      "epoch": 1.3472222222222223,
+      "grad_norm": 2.8204965929035097e-05,
+      "learning_rate": 0.00015859251274592934,
+      "loss": 2.1728,
+      "step": 388
+    },
+    {
+      "epoch": 1.3541666666666667,
+      "grad_norm": 2.821902853611391e-05,
+      "learning_rate": 0.00015812613079112708,
+      "loss": 2.2326,
+      "step": 390
+    },
+    {
+      "epoch": 1.3611111111111112,
+      "grad_norm": 2.6897825591731817e-05,
+      "learning_rate": 0.00015765783151318506,
+      "loss": 2.2963,
+      "step": 392
+    },
+    {
+      "epoch": 1.3680555555555556,
+      "grad_norm": 3.4906293876701966e-05,
+      "learning_rate": 0.00015718763035921847,
+      "loss": 2.4097,
+      "step": 394
+    },
+    {
+      "epoch": 1.375,
+      "grad_norm": 3.223119711037725e-05,
+      "learning_rate": 0.00015671554283907705,
+      "loss": 2.4404,
+      "step": 396
+    },
+    {
+      "epoch": 1.3819444444444444,
+      "grad_norm": 3.1298048270400614e-05,
+      "learning_rate": 0.00015624158452483337,
+      "loss": 2.5603,
+      "step": 398
+    },
+    {
+      "epoch": 1.3888888888888888,
+      "grad_norm": 4.206110679660924e-05,
+      "learning_rate": 0.00015576577105026916,
+      "loss": 2.695,
+      "step": 400
+    },
+    {
+      "epoch": 1.3958333333333333,
+      "grad_norm": 7.035292219370604e-05,
+      "learning_rate": 0.00015528811811035972,
+      "loss": 2.9149,
+      "step": 402
+    },
+    {
+      "epoch": 1.4027777777777777,
+      "grad_norm": 0.00011025326239177957,
+      "learning_rate": 0.00015480864146075608,
+      "loss": 3.4276,
+      "step": 404
+    },
+    {
+      "epoch": 1.4097222222222223,
+      "grad_norm": 0.0001375137799186632,
+      "learning_rate": 0.00015432735691726547,
+      "loss": 3.8393,
+      "step": 406
+    },
+    {
+      "epoch": 1.4166666666666667,
+      "grad_norm": 0.00015558676386717707,
+      "learning_rate": 0.00015384428035532932,
+      "loss": 4.228,
+      "step": 408
+    },
+    {
+      "epoch": 1.4236111111111112,
+      "grad_norm": 0.00016118814528454095,
+      "learning_rate": 0.00015335942770950003,
+      "loss": 4.5641,
+      "step": 410
+    },
+    {
+      "epoch": 1.4305555555555556,
+      "grad_norm": 0.0001592924090800807,
+      "learning_rate": 0.00015287281497291497,
+      "loss": 4.851,
+      "step": 412
+    },
+    {
+      "epoch": 1.4375,
+      "grad_norm": 0.00015159139002207667,
+      "learning_rate": 0.0001523844581967691,
+      "loss": 5.0476,
+      "step": 414
+    },
+    {
+      "epoch": 1.4444444444444444,
+      "grad_norm": 0.0001463508524466306,
+      "learning_rate": 0.00015189437348978561,
+      "loss": 5.1316,
+      "step": 416
+    },
+    {
+      "epoch": 1.4513888888888888,
+      "grad_norm": 0.00018889355123974383,
+      "learning_rate": 0.00015140257701768442,
+      "loss": 5.4955,
+      "step": 418
+    },
+    {
+      "epoch": 1.4583333333333333,
+      "grad_norm": 0.00014158397971186787,
+      "learning_rate": 0.0001509090850026489,
+      "loss": 5.5181,
+      "step": 420
+    },
+    {
+      "epoch": 1.4652777777777777,
+      "grad_norm": 0.00013335797120817006,
+      "learning_rate": 0.00015041391372279094,
+      "loss": 5.675,
+      "step": 422
+    },
+    {
+      "epoch": 1.4722222222222223,
+      "grad_norm": 0.00012022190639982,
+      "learning_rate": 0.0001499170795116139,
+      "loss": 5.6161,
+      "step": 424
+    },
+    {
+      "epoch": 1.4791666666666667,
+      "grad_norm": 0.00010211220069322735,
+      "learning_rate": 0.0001494185987574739,
+      "loss": 5.6191,
+      "step": 426
+    },
+    {
+      "epoch": 1.4861111111111112,
+      "grad_norm": 8.821386290946975e-05,
+      "learning_rate": 0.0001489184879030392,
+      "loss": 5.6616,
+      "step": 428
+    },
+    {
+      "epoch": 1.4930555555555556,
+      "grad_norm": 8.532748324796557e-05,
+      "learning_rate": 0.00014841676344474775,
+      "loss": 5.6971,
+      "step": 430
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 8.207001519622281e-05,
+      "learning_rate": 0.00014791344193226325,
+      "loss": 5.7875,
+      "step": 432
+    },
+    {
+      "epoch": 1.5069444444444444,
+      "grad_norm": 7.251853821799159e-05,
+      "learning_rate": 0.00014740853996792903,
+      "loss": 5.7305,
+      "step": 434
+    },
+    {
+      "epoch": 1.5138888888888888,
+      "grad_norm": 0.00016514182789251208,
+      "learning_rate": 0.00014690207420622062,
+      "loss": 5.7891,
+      "step": 436
+    },
+    {
+      "epoch": 1.5208333333333335,
+      "grad_norm": 6.044754263712093e-05,
+      "learning_rate": 0.00014639406135319617,
+      "loss": 6.0149,
+      "step": 438
+    },
+    {
+      "epoch": 1.5277777777777777,
+      "grad_norm": 7.798307342454791e-05,
+      "learning_rate": 0.0001458845181659456,
+      "loss": 5.9835,
+      "step": 440
+    },
+    {
+      "epoch": 1.5347222222222223,
+      "grad_norm": 5.688685632776469e-05,
+      "learning_rate": 0.00014537346145203776,
+      "loss": 6.145,
+      "step": 442
+    },
+    {
+      "epoch": 1.5416666666666665,
+      "grad_norm": 5.519447586266324e-05,
+      "learning_rate": 0.00014486090806896596,
+      "loss": 6.1921,
+      "step": 444
+    },
+    {
+      "epoch": 1.5486111111111112,
+      "grad_norm": 5.198594226385467e-05,
+      "learning_rate": 0.00014434687492359202,
+      "loss": 6.2428,
+      "step": 446
+    },
+    {
+      "epoch": 1.5555555555555556,
+      "grad_norm": 5.4393378377426416e-05,
+      "learning_rate": 0.00014383137897158857,
+      "loss": 6.2366,
+      "step": 448
+    },
+    {
+      "epoch": 1.5625,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014331443721687974,
+      "loss": 6.5184,
+      "step": 450
+    },
+    {
+      "epoch": 1.5694444444444444,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014331443721687974,
+      "loss": 6.3582,
+      "step": 452
+    },
+    {
+      "epoch": 1.5763888888888888,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014331443721687974,
+      "loss": 6.4165,
+      "step": 454
+    },
+    {
+      "epoch": 1.5833333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014331443721687974,
+      "loss": 6.5681,
+      "step": 456
+    },
+    {
+      "epoch": 1.5902777777777777,
+      "grad_norm": 0.013171161524951458,
+      "learning_rate": 0.00014331443721687974,
+      "loss": 6.4663,
+      "step": 458
+    },
+    {
+      "epoch": 1.5972222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 460
+    },
+    {
+      "epoch": 1.6041666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 462
+    },
+    {
+      "epoch": 1.6111111111111112,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 464
+    },
+    {
+      "epoch": 1.6180555555555556,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 466
+    },
+    {
+      "epoch": 1.625,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 468
+    },
+    {
+      "epoch": 1.6319444444444444,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 470
+    },
+    {
+      "epoch": 1.6388888888888888,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 472
+    },
+    {
+      "epoch": 1.6458333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 474
+    },
+    {
+      "epoch": 1.6527777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 476
+    },
+    {
+      "epoch": 1.6597222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 478
+    },
+    {
+      "epoch": 1.6666666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 480
+    },
+    {
+      "epoch": 1.6736111111111112,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 482
+    },
+    {
+      "epoch": 1.6805555555555556,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 484
+    },
+    {
+      "epoch": 1.6875,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 486
+    },
+    {
+      "epoch": 1.6944444444444444,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 488
+    },
+    {
+      "epoch": 1.7013888888888888,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 490
+    },
+    {
+      "epoch": 1.7083333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 492
+    },
+    {
+      "epoch": 1.7152777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 494
+    },
+    {
+      "epoch": 1.7222222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 496
+    },
+    {
+      "epoch": 1.7291666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 498
+    },
+    {
+      "epoch": 1.7361111111111112,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 500
+    },
+    {
+      "epoch": 1.7430555555555556,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 502
+    },
+    {
+      "epoch": 1.75,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 504
+    },
+    {
+      "epoch": 1.7569444444444444,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 506
+    },
+    {
+      "epoch": 1.7638888888888888,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 508
+    },
+    {
+      "epoch": 1.7708333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 510
+    },
+    {
+      "epoch": 1.7777777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 512
+    },
+    {
+      "epoch": 1.7847222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 514
+    },
+    {
+      "epoch": 1.7916666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 516
+    },
+    {
+      "epoch": 1.7986111111111112,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 518
+    },
+    {
+      "epoch": 1.8055555555555556,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 520
+    },
+    {
+      "epoch": 1.8125,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 522
+    },
+    {
+      "epoch": 1.8194444444444444,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 524
+    },
+    {
+      "epoch": 1.8263888888888888,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 526
+    },
+    {
+      "epoch": 1.8333333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 528
+    },
+    {
+      "epoch": 1.8402777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 530
+    },
+    {
+      "epoch": 1.8472222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 532
+    },
+    {
+      "epoch": 1.8541666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 534
+    },
+    {
+      "epoch": 1.8611111111111112,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 536
+    },
+    {
+      "epoch": 1.8680555555555556,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 538
+    },
+    {
+      "epoch": 1.875,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 540
+    },
+    {
+      "epoch": 1.8819444444444444,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 542
+    },
+    {
+      "epoch": 1.8888888888888888,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 544
+    },
+    {
+      "epoch": 1.8958333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 546
+    },
+    {
+      "epoch": 1.9027777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 548
+    },
+    {
+      "epoch": 1.9097222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 550
+    },
+    {
+      "epoch": 1.9166666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 552
+    },
+    {
+      "epoch": 1.9236111111111112,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 554
+    },
+    {
+      "epoch": 1.9305555555555556,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 556
+    },
+    {
+      "epoch": 1.9375,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 558
+    },
+    {
+      "epoch": 1.9444444444444444,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 560
+    },
+    {
+      "epoch": 1.9513888888888888,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 562
+    },
+    {
+      "epoch": 1.9583333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 564
+    },
+    {
+      "epoch": 1.9652777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 566
+    },
+    {
+      "epoch": 1.9722222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 568
+    },
+    {
+      "epoch": 1.9791666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 570
+    },
+    {
+      "epoch": 1.9861111111111112,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 572
+    },
+    {
+      "epoch": 1.9930555555555556,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 574
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 576
+    },
+    {
+      "epoch": 2.0069444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 578
+    },
+    {
+      "epoch": 2.013888888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 580
+    },
+    {
+      "epoch": 2.0208333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 582
+    },
+    {
+      "epoch": 2.0277777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 584
+    },
+    {
+      "epoch": 2.0347222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 586
+    },
+    {
+      "epoch": 2.0416666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 588
+    },
+    {
+      "epoch": 2.048611111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 590
+    },
+    {
+      "epoch": 2.0555555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 592
+    },
+    {
+      "epoch": 2.0625,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 594
+    },
+    {
+      "epoch": 2.0694444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 596
+    },
+    {
+      "epoch": 2.076388888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 598
+    },
+    {
+      "epoch": 2.0833333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 600
+    },
+    {
+      "epoch": 2.0902777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 602
+    },
+    {
+      "epoch": 2.0972222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 604
+    },
+    {
+      "epoch": 2.1041666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 606
+    },
+    {
+      "epoch": 2.111111111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 608
+    },
+    {
+      "epoch": 2.1180555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 610
+    },
+    {
+      "epoch": 2.125,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 612
+    },
+    {
+      "epoch": 2.1319444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 614
+    },
+    {
+      "epoch": 2.138888888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 616
+    },
+    {
+      "epoch": 2.1458333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 618
+    },
+    {
+      "epoch": 2.1527777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 620
+    },
+    {
+      "epoch": 2.1597222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 622
+    },
+    {
+      "epoch": 2.1666666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 624
+    },
+    {
+      "epoch": 2.173611111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 626
+    },
+    {
+      "epoch": 2.1805555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 628
+    },
+    {
+      "epoch": 2.1875,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 630
+    },
+    {
+      "epoch": 2.1944444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 632
+    },
+    {
+      "epoch": 2.201388888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 634
+    },
+    {
+      "epoch": 2.2083333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 636
+    },
+    {
+      "epoch": 2.2152777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 638
+    },
+    {
+      "epoch": 2.2222222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 640
+    },
+    {
+      "epoch": 2.2291666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 642
+    },
+    {
+      "epoch": 2.236111111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 644
+    },
+    {
+      "epoch": 2.2430555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 646
+    },
+    {
+      "epoch": 2.25,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 648
+    },
+    {
+      "epoch": 2.2569444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 650
+    },
+    {
+      "epoch": 2.263888888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 652
+    },
+    {
+      "epoch": 2.2708333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 654
+    },
+    {
+      "epoch": 2.2777777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 656
+    },
+    {
+      "epoch": 2.2847222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 658
+    },
+    {
+      "epoch": 2.2916666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 660
+    },
+    {
+      "epoch": 2.298611111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 662
+    },
+    {
+      "epoch": 2.3055555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 664
+    },
+    {
+      "epoch": 2.3125,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 666
+    },
+    {
+      "epoch": 2.3194444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 668
+    },
+    {
+      "epoch": 2.326388888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 670
+    },
+    {
+      "epoch": 2.3333333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 672
+    },
+    {
+      "epoch": 2.3402777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 674
+    },
+    {
+      "epoch": 2.3472222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 676
+    },
+    {
+      "epoch": 2.3541666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 678
+    },
+    {
+      "epoch": 2.361111111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 680
+    },
+    {
+      "epoch": 2.3680555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 682
+    },
+    {
+      "epoch": 2.375,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 684
+    },
+    {
+      "epoch": 2.3819444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 686
+    },
+    {
+      "epoch": 2.388888888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 688
+    },
+    {
+      "epoch": 2.3958333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 690
+    },
+    {
+      "epoch": 2.4027777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 692
+    },
+    {
+      "epoch": 2.4097222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 694
+    },
+    {
+      "epoch": 2.4166666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 696
+    },
+    {
+      "epoch": 2.423611111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 698
+    },
+    {
+      "epoch": 2.4305555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 700
+    },
+    {
+      "epoch": 2.4375,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 702
+    },
+    {
+      "epoch": 2.4444444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 704
+    },
+    {
+      "epoch": 2.451388888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 706
+    },
+    {
+      "epoch": 2.4583333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 708
+    },
+    {
+      "epoch": 2.4652777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 710
+    },
+    {
+      "epoch": 2.4722222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 712
+    },
+    {
+      "epoch": 2.4791666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 714
+    },
+    {
+      "epoch": 2.486111111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 716
+    },
+    {
+      "epoch": 2.4930555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 718
+    },
+    {
+      "epoch": 2.5,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 720
+    },
+    {
+      "epoch": 2.5069444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 722
+    },
+    {
+      "epoch": 2.513888888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 724
+    },
+    {
+      "epoch": 2.5208333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 726
+    },
+    {
+      "epoch": 2.5277777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 728
+    },
+    {
+      "epoch": 2.5347222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 730
+    },
+    {
+      "epoch": 2.5416666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 732
+    },
+    {
+      "epoch": 2.548611111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 734
+    },
+    {
+      "epoch": 2.5555555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 736
+    },
+    {
+      "epoch": 2.5625,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 738
+    },
+    {
+      "epoch": 2.5694444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 740
+    },
+    {
+      "epoch": 2.576388888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 742
+    },
+    {
+      "epoch": 2.5833333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 744
+    },
+    {
+      "epoch": 2.5902777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 746
+    },
+    {
+      "epoch": 2.5972222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 748
+    },
+    {
+      "epoch": 2.6041666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 750
+    },
+    {
+      "epoch": 2.611111111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 752
+    },
+    {
+      "epoch": 2.6180555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 754
+    },
+    {
+      "epoch": 2.625,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 756
+    },
+    {
+      "epoch": 2.6319444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 758
+    },
+    {
+      "epoch": 2.638888888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 760
+    },
+    {
+      "epoch": 2.6458333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 762
+    },
+    {
+      "epoch": 2.6527777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 764
+    },
+    {
+      "epoch": 2.6597222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 766
+    },
+    {
+      "epoch": 2.6666666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 768
+    },
+    {
+      "epoch": 2.673611111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 770
+    },
+    {
+      "epoch": 2.6805555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 772
+    },
+    {
+      "epoch": 2.6875,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 774
+    },
+    {
+      "epoch": 2.6944444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 776
+    },
+    {
+      "epoch": 2.701388888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 778
+    },
+    {
+      "epoch": 2.7083333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 780
+    },
+    {
+      "epoch": 2.7152777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 782
+    },
+    {
+      "epoch": 2.7222222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 784
+    },
+    {
+      "epoch": 2.7291666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 786
+    },
+    {
+      "epoch": 2.736111111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 788
+    },
+    {
+      "epoch": 2.7430555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 790
+    },
+    {
+      "epoch": 2.75,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 792
+    },
+    {
+      "epoch": 2.7569444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 794
+    },
+    {
+      "epoch": 2.763888888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 796
+    },
+    {
+      "epoch": 2.7708333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 798
+    },
+    {
+      "epoch": 2.7777777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 800
+    },
+    {
+      "epoch": 2.7847222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 802
+    },
+    {
+      "epoch": 2.7916666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 804
+    },
+    {
+      "epoch": 2.798611111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 806
+    },
+    {
+      "epoch": 2.8055555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 808
+    },
+    {
+      "epoch": 2.8125,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 810
+    },
+    {
+      "epoch": 2.8194444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 812
+    },
+    {
+      "epoch": 2.826388888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 814
+    },
+    {
+      "epoch": 2.8333333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 816
+    },
+    {
+      "epoch": 2.8402777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 818
+    },
+    {
+      "epoch": 2.8472222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 820
+    },
+    {
+      "epoch": 2.8541666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 822
+    },
+    {
+      "epoch": 2.861111111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 824
+    },
+    {
+      "epoch": 2.8680555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 826
+    },
+    {
+      "epoch": 2.875,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 828
+    },
+    {
+      "epoch": 2.8819444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 830
+    },
+    {
+      "epoch": 2.888888888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 832
+    },
+    {
+      "epoch": 2.8958333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 834
+    },
+    {
+      "epoch": 2.9027777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 836
+    },
+    {
+      "epoch": 2.9097222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 838
+    },
+    {
+      "epoch": 2.9166666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 840
+    },
+    {
+      "epoch": 2.923611111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 842
+    },
+    {
+      "epoch": 2.9305555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 844
+    },
+    {
+      "epoch": 2.9375,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 846
+    },
+    {
+      "epoch": 2.9444444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 848
+    },
+    {
+      "epoch": 2.951388888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 850
+    },
+    {
+      "epoch": 2.9583333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 852
+    },
+    {
+      "epoch": 2.9652777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 854
+    },
+    {
+      "epoch": 2.9722222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 856
+    },
+    {
+      "epoch": 2.9791666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 858
+    },
+    {
+      "epoch": 2.986111111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 860
+    },
+    {
+      "epoch": 2.9930555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 862
+    },
+    {
+      "epoch": 3.0,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 864
+    },
+    {
+      "epoch": 3.0069444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 866
+    },
+    {
+      "epoch": 3.013888888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 868
+    },
+    {
+      "epoch": 3.0208333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 870
+    },
+    {
+      "epoch": 3.0277777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 872
+    },
+    {
+      "epoch": 3.0347222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 874
+    },
+    {
+      "epoch": 3.0416666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 876
+    },
+    {
+      "epoch": 3.048611111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 878
+    },
+    {
+      "epoch": 3.0555555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 880
+    },
+    {
+      "epoch": 3.0625,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 882
+    },
+    {
+      "epoch": 3.0694444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 884
+    },
+    {
+      "epoch": 3.076388888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 886
+    },
+    {
+      "epoch": 3.0833333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 888
+    },
+    {
+      "epoch": 3.0902777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 890
+    },
+    {
+      "epoch": 3.0972222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 892
+    },
+    {
+      "epoch": 3.1041666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 894
+    },
+    {
+      "epoch": 3.111111111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 896
+    },
+    {
+      "epoch": 3.1180555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 898
+    },
+    {
+      "epoch": 3.125,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 900
+    },
+    {
+      "epoch": 3.1319444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 902
+    },
+    {
+      "epoch": 3.138888888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 904
+    },
+    {
+      "epoch": 3.1458333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 906
+    },
+    {
+      "epoch": 3.1527777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 908
+    },
+    {
+      "epoch": 3.1597222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 910
+    },
+    {
+      "epoch": 3.1666666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 912
+    },
+    {
+      "epoch": 3.173611111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 914
+    },
+    {
+      "epoch": 3.1805555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 916
+    },
+    {
+      "epoch": 3.1875,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 918
+    },
+    {
+      "epoch": 3.1944444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 920
+    },
+    {
+      "epoch": 3.201388888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 922
+    },
+    {
+      "epoch": 3.2083333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 924
+    },
+    {
+      "epoch": 3.2152777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 926
+    },
+    {
+      "epoch": 3.2222222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 928
+    },
+    {
+      "epoch": 3.2291666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 930
+    },
+    {
+      "epoch": 3.236111111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 932
+    },
+    {
+      "epoch": 3.2430555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 934
+    },
+    {
+      "epoch": 3.25,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 936
+    },
+    {
+      "epoch": 3.2569444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 938
+    },
+    {
+      "epoch": 3.263888888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 940
+    },
+    {
+      "epoch": 3.2708333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 942
+    },
+    {
+      "epoch": 3.2777777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 944
+    },
+    {
+      "epoch": 3.2847222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 946
+    },
+    {
+      "epoch": 3.2916666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 948
+    },
+    {
+      "epoch": 3.298611111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 950
+    },
+    {
+      "epoch": 3.3055555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 952
+    },
+    {
+      "epoch": 3.3125,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 954
+    },
+    {
+      "epoch": 3.3194444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 956
+    },
+    {
+      "epoch": 3.326388888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 958
+    },
+    {
+      "epoch": 3.3333333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 960
+    },
+    {
+      "epoch": 3.3402777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 962
+    },
+    {
+      "epoch": 3.3472222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 964
+    },
+    {
+      "epoch": 3.3541666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 966
+    },
+    {
+      "epoch": 3.361111111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 968
+    },
+    {
+      "epoch": 3.3680555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 970
+    },
+    {
+      "epoch": 3.375,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 972
+    },
+    {
+      "epoch": 3.3819444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 974
+    },
+    {
+      "epoch": 3.388888888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 976
+    },
+    {
+      "epoch": 3.3958333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 978
+    },
+    {
+      "epoch": 3.4027777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 980
+    },
+    {
+      "epoch": 3.4097222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 982
+    },
+    {
+      "epoch": 3.4166666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 984
+    },
+    {
+      "epoch": 3.423611111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 986
+    },
+    {
+      "epoch": 3.4305555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 988
+    },
+    {
+      "epoch": 3.4375,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 990
+    },
+    {
+      "epoch": 3.4444444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 992
+    },
+    {
+      "epoch": 3.451388888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 994
+    },
+    {
+      "epoch": 3.4583333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 996
+    },
+    {
+      "epoch": 3.4652777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 998
+    },
+    {
+      "epoch": 3.4722222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1000
+    }
+  ],
+  "logging_steps": 2,
+  "max_steps": 1152,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 4,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.5043947980799345e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-1000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:80311e2b6a4df1fda961527e50aa92bc3d64f6451e2aa8695c0c2905ab9601d6
+size 5713

checkpoint-1000/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-1152/README.md ADDED Viewed

	@@ -0,0 +1,208 @@

+---
+base_model: ./Qwen3-8B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:./Qwen3-8B
+- lora
+- transformers
+- unsloth
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.16.0

checkpoint-1152/adapter_config.json ADDED Viewed

	@@ -0,0 +1,41 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./Qwen3-8B",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "qalora_group_size": 16,
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "gate_proj",
+    "up_proj",
+    "down_proj",
+    "o_proj",
+    "k_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": true
+}

checkpoint-1152/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ad45661894700811a0a4f19acc6b8afc9d98db1406cc80d88fa464a18ce495ea
+size 2834238032

checkpoint-1152/added_tokens.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "</think>": 151668,
+  "</tool_call>": 151658,
+  "</tool_response>": 151666,
+  "<think>": 151667,
+  "<tool_call>": 151657,
+  "<tool_response>": 151665,
+  "<|analysis|>": 151670,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|forecast|>": 151671,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|response|>": 151669,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

checkpoint-1152/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,89 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if message.content is string %}
+        {%- set content = message.content %}
+    {%- else %}
+        {%- set content = '' %}
+    {%- endif %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is string %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in content %}
+                {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+                {%- set content = content.split('</think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}

checkpoint-1152/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-1152/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:888353d1da4d630234d41b91788dd2aafb767cfb17ebf85761b00e83607c23e1
+size 698777675

checkpoint-1152/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a8e2011629d8bed3ef560fa11175cac55684c4e12a72634bb24abf767b6c7399
+size 14645

checkpoint-1152/scaler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9779a733270277f15e820d84d3dfdfb3a66fd96b857f3f0109ac7f2b54244d67
+size 1383

checkpoint-1152/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7b938d276d47acf5a6dc15bc7c48a9e3e0ede2cc320ecd371c94b59541d8d616
+size 1465

checkpoint-1152/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "additional_special_tokens": [
+    {
+      "content": "<|response|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|analysis|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|forecast|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    }
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-1152/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:77247e5fb2e966d04e513068b17cca472e105e7c56953e9b1d27d70b93d77e6f
+size 11423221

checkpoint-1152/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,254 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151665": {
+      "content": "<tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151666": {
+      "content": "</tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151667": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151668": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151669": {
+      "content": "<|response|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151670": {
+      "content": "<|analysis|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151671": {
+      "content": "<|forecast|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|response|>",
+    "<|analysis|>",
+    "<|forecast|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 40960,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "right",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoint-1152/trainer_state.json ADDED Viewed

	@@ -0,0 +1,4066 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 4.0,
+  "eval_steps": 500,
+  "global_step": 1152,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.006944444444444444,
+      "grad_norm": 0.0001319512666668743,
+      "learning_rate": 3.448275862068966e-06,
+      "loss": 21.3796,
+      "step": 2
+    },
+    {
+      "epoch": 0.013888888888888888,
+      "grad_norm": 0.00012970938405487686,
+      "learning_rate": 1.0344827586206897e-05,
+      "loss": 2.5144,
+      "step": 4
+    },
+    {
+      "epoch": 0.020833333333333332,
+      "grad_norm": 0.00012933755351696163,
+      "learning_rate": 1.7241379310344828e-05,
+      "loss": 13.8026,
+      "step": 6
+    },
+    {
+      "epoch": 0.027777777777777776,
+      "grad_norm": 0.00012198727199574932,
+      "learning_rate": 2.413793103448276e-05,
+      "loss": 4.9835,
+      "step": 8
+    },
+    {
+      "epoch": 0.034722222222222224,
+      "grad_norm": 0.00010912115249084309,
+      "learning_rate": 3.103448275862069e-05,
+      "loss": 2.397,
+      "step": 10
+    },
+    {
+      "epoch": 0.041666666666666664,
+      "grad_norm": 9.947551734512672e-05,
+      "learning_rate": 3.793103448275862e-05,
+      "loss": 2.3169,
+      "step": 12
+    },
+    {
+      "epoch": 0.04861111111111111,
+      "grad_norm": 9.700353257358074e-05,
+      "learning_rate": 4.482758620689655e-05,
+      "loss": 2.2121,
+      "step": 14
+    },
+    {
+      "epoch": 0.05555555555555555,
+      "grad_norm": 0.00010787531937239692,
+      "learning_rate": 5.172413793103449e-05,
+      "loss": 2.5911,
+      "step": 16
+    },
+    {
+      "epoch": 0.0625,
+      "grad_norm": 0.00010262445721309632,
+      "learning_rate": 5.862068965517241e-05,
+      "loss": 1.9774,
+      "step": 18
+    },
+    {
+      "epoch": 0.06944444444444445,
+      "grad_norm": 8.186206105165184e-05,
+      "learning_rate": 6.551724137931034e-05,
+      "loss": 1.8391,
+      "step": 20
+    },
+    {
+      "epoch": 0.0763888888888889,
+      "grad_norm": 5.4703454225091264e-05,
+      "learning_rate": 7.241379310344828e-05,
+      "loss": 1.7289,
+      "step": 22
+    },
+    {
+      "epoch": 0.08333333333333333,
+      "grad_norm": 4.5757446059724316e-05,
+      "learning_rate": 7.931034482758621e-05,
+      "loss": 1.867,
+      "step": 24
+    },
+    {
+      "epoch": 0.09027777777777778,
+      "grad_norm": 8.86492634890601e-05,
+      "learning_rate": 8.620689655172413e-05,
+      "loss": 1.7276,
+      "step": 26
+    },
+    {
+      "epoch": 0.09722222222222222,
+      "grad_norm": 3.919301525456831e-05,
+      "learning_rate": 9.310344827586207e-05,
+      "loss": 1.6506,
+      "step": 28
+    },
+    {
+      "epoch": 0.10416666666666667,
+      "grad_norm": 5.642673568218015e-05,
+      "learning_rate": 0.0001,
+      "loss": 1.5732,
+      "step": 30
+    },
+    {
+      "epoch": 0.1111111111111111,
+      "grad_norm": 6.0812188166892156e-05,
+      "learning_rate": 0.00010689655172413792,
+      "loss": 1.5062,
+      "step": 32
+    },
+    {
+      "epoch": 0.11805555555555555,
+      "grad_norm": 2.352882074774243e-05,
+      "learning_rate": 0.00011379310344827588,
+      "loss": 1.393,
+      "step": 34
+    },
+    {
+      "epoch": 0.125,
+      "grad_norm": 3.3942505979212e-05,
+      "learning_rate": 0.0001206896551724138,
+      "loss": 1.3654,
+      "step": 36
+    },
+    {
+      "epoch": 0.13194444444444445,
+      "grad_norm": 2.9797614843118936e-05,
+      "learning_rate": 0.00012758620689655174,
+      "loss": 1.3662,
+      "step": 38
+    },
+    {
+      "epoch": 0.1388888888888889,
+      "grad_norm": 2.0893716282444075e-05,
+      "learning_rate": 0.00013448275862068965,
+      "loss": 1.344,
+      "step": 40
+    },
+    {
+      "epoch": 0.14583333333333334,
+      "grad_norm": 1.886891550384462e-05,
+      "learning_rate": 0.0001413793103448276,
+      "loss": 1.2809,
+      "step": 42
+    },
+    {
+      "epoch": 0.1527777777777778,
+      "grad_norm": 1.6420885003753938e-05,
+      "learning_rate": 0.00014827586206896554,
+      "loss": 1.2911,
+      "step": 44
+    },
+    {
+      "epoch": 0.1597222222222222,
+      "grad_norm": 2.6823576263268478e-05,
+      "learning_rate": 0.00015517241379310346,
+      "loss": 1.3095,
+      "step": 46
+    },
+    {
+      "epoch": 0.16666666666666666,
+      "grad_norm": 1.2686981790466234e-05,
+      "learning_rate": 0.00016206896551724137,
+      "loss": 1.2149,
+      "step": 48
+    },
+    {
+      "epoch": 0.1736111111111111,
+      "grad_norm": 1.0219813702860847e-05,
+      "learning_rate": 0.00016896551724137932,
+      "loss": 1.2522,
+      "step": 50
+    },
+    {
+      "epoch": 0.18055555555555555,
+      "grad_norm": 1.0080276297230739e-05,
+      "learning_rate": 0.00017586206896551723,
+      "loss": 1.2311,
+      "step": 52
+    },
+    {
+      "epoch": 0.1875,
+      "grad_norm": 9.221699656336568e-06,
+      "learning_rate": 0.00018275862068965518,
+      "loss": 1.2138,
+      "step": 54
+    },
+    {
+      "epoch": 0.19444444444444445,
+      "grad_norm": 1.0500927601242438e-05,
+      "learning_rate": 0.00018965517241379312,
+      "loss": 1.2149,
+      "step": 56
+    },
+    {
+      "epoch": 0.2013888888888889,
+      "grad_norm": 8.30852695798967e-06,
+      "learning_rate": 0.00019655172413793104,
+      "loss": 1.212,
+      "step": 58
+    },
+    {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 1.2315202184254304e-05,
+      "learning_rate": 0.0001999995876796145,
+      "loss": 1.2239,
+      "step": 60
+    },
+    {
+      "epoch": 0.2152777777777778,
+      "grad_norm": 1.1090536645497195e-05,
+      "learning_rate": 0.00019999628913693117,
+      "loss": 1.2075,
+      "step": 62
+    },
+    {
+      "epoch": 0.2222222222222222,
+      "grad_norm": 1.1589069799811114e-05,
+      "learning_rate": 0.00019998969216036892,
+      "loss": 1.2026,
+      "step": 64
+    },
+    {
+      "epoch": 0.22916666666666666,
+      "grad_norm": 1.0656535778252874e-05,
+      "learning_rate": 0.0001999797969675326,
+      "loss": 1.2126,
+      "step": 66
+    },
+    {
+      "epoch": 0.2361111111111111,
+      "grad_norm": 9.856509677774739e-06,
+      "learning_rate": 0.00019996660388482083,
+      "loss": 1.166,
+      "step": 68
+    },
+    {
+      "epoch": 0.24305555555555555,
+      "grad_norm": 1.5398893083329313e-05,
+      "learning_rate": 0.00019995011334741477,
+      "loss": 1.215,
+      "step": 70
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 8.422492101090029e-06,
+      "learning_rate": 0.00019993032589926414,
+      "loss": 1.1868,
+      "step": 72
+    },
+    {
+      "epoch": 0.2569444444444444,
+      "grad_norm": 8.810847248241771e-06,
+      "learning_rate": 0.00019990724219306902,
+      "loss": 1.1971,
+      "step": 74
+    },
+    {
+      "epoch": 0.2638888888888889,
+      "grad_norm": 1.1227813956793398e-05,
+      "learning_rate": 0.00019988086299025848,
+      "loss": 1.1684,
+      "step": 76
+    },
+    {
+      "epoch": 0.2708333333333333,
+      "grad_norm": 1.0562929674051702e-05,
+      "learning_rate": 0.00019985118916096534,
+      "loss": 1.1981,
+      "step": 78
+    },
+    {
+      "epoch": 0.2777777777777778,
+      "grad_norm": 1.521587728348095e-05,
+      "learning_rate": 0.00019981822168399756,
+      "loss": 1.1838,
+      "step": 80
+    },
+    {
+      "epoch": 0.2847222222222222,
+      "grad_norm": 1.0759257747849915e-05,
+      "learning_rate": 0.00019978196164680597,
+      "loss": 1.2032,
+      "step": 82
+    },
+    {
+      "epoch": 0.2916666666666667,
+      "grad_norm": 8.712796443433035e-06,
+      "learning_rate": 0.00019974241024544828,
+      "loss": 1.1937,
+      "step": 84
+    },
+    {
+      "epoch": 0.2986111111111111,
+      "grad_norm": 1.368098037346499e-05,
+      "learning_rate": 0.00019969956878454972,
+      "loss": 1.1965,
+      "step": 86
+    },
+    {
+      "epoch": 0.3055555555555556,
+      "grad_norm": 1.0317597116227262e-05,
+      "learning_rate": 0.00019965343867725998,
+      "loss": 1.1908,
+      "step": 88
+    },
+    {
+      "epoch": 0.3125,
+      "grad_norm": 1.0250181730953045e-05,
+      "learning_rate": 0.00019960402144520665,
+      "loss": 1.1983,
+      "step": 90
+    },
+    {
+      "epoch": 0.3194444444444444,
+      "grad_norm": 1.0102179658133537e-05,
+      "learning_rate": 0.00019955131871844488,
+      "loss": 1.1842,
+      "step": 92
+    },
+    {
+      "epoch": 0.3263888888888889,
+      "grad_norm": 8.509154213243164e-06,
+      "learning_rate": 0.00019949533223540385,
+      "loss": 1.1871,
+      "step": 94
+    },
+    {
+      "epoch": 0.3333333333333333,
+      "grad_norm": 8.434612027485855e-06,
+      "learning_rate": 0.00019943606384282916,
+      "loss": 1.2072,
+      "step": 96
+    },
+    {
+      "epoch": 0.3402777777777778,
+      "grad_norm": 1.0174206181545742e-05,
+      "learning_rate": 0.0001993735154957221,
+      "loss": 1.2088,
+      "step": 98
+    },
+    {
+      "epoch": 0.3472222222222222,
+      "grad_norm": 9.98695850285003e-06,
+      "learning_rate": 0.00019930768925727514,
+      "loss": 1.1847,
+      "step": 100
+    },
+    {
+      "epoch": 0.3541666666666667,
+      "grad_norm": 8.591785444878042e-06,
+      "learning_rate": 0.0001992385872988038,
+      "loss": 1.2041,
+      "step": 102
+    },
+    {
+      "epoch": 0.3611111111111111,
+      "grad_norm": 1.0436694537929725e-05,
+      "learning_rate": 0.00019916621189967502,
+      "loss": 1.2194,
+      "step": 104
+    },
+    {
+      "epoch": 0.3680555555555556,
+      "grad_norm": 1.137161689257482e-05,
+      "learning_rate": 0.00019909056544723213,
+      "loss": 1.1788,
+      "step": 106
+    },
+    {
+      "epoch": 0.375,
+      "grad_norm": 1.0015843145083636e-05,
+      "learning_rate": 0.00019901165043671593,
+      "loss": 1.1979,
+      "step": 108
+    },
+    {
+      "epoch": 0.3819444444444444,
+      "grad_norm": 1.0921584362222347e-05,
+      "learning_rate": 0.00019892946947118242,
+      "loss": 1.1836,
+      "step": 110
+    },
+    {
+      "epoch": 0.3888888888888889,
+      "grad_norm": 2.0685600247816183e-05,
+      "learning_rate": 0.00019884402526141709,
+      "loss": 1.1883,
+      "step": 112
+    },
+    {
+      "epoch": 0.3958333333333333,
+      "grad_norm": 1.0137908247997984e-05,
+      "learning_rate": 0.00019875532062584519,
+      "loss": 1.183,
+      "step": 114
+    },
+    {
+      "epoch": 0.4027777777777778,
+      "grad_norm": 1.1504852409416344e-05,
+      "learning_rate": 0.00019866335849043912,
+      "loss": 1.1957,
+      "step": 116
+    },
+    {
+      "epoch": 0.4097222222222222,
+      "grad_norm": 8.32590194477234e-06,
+      "learning_rate": 0.00019856814188862166,
+      "loss": 1.1605,
+      "step": 118
+    },
+    {
+      "epoch": 0.4166666666666667,
+      "grad_norm": 1.0243880751659162e-05,
+      "learning_rate": 0.000198469673961166,
+      "loss": 1.1787,
+      "step": 120
+    },
+    {
+      "epoch": 0.4236111111111111,
+      "grad_norm": 9.695215339888819e-06,
+      "learning_rate": 0.00019836795795609213,
+      "loss": 1.1849,
+      "step": 122
+    },
+    {
+      "epoch": 0.4305555555555556,
+      "grad_norm": 9.29382167669246e-06,
+      "learning_rate": 0.00019826299722855976,
+      "loss": 1.1779,
+      "step": 124
+    },
+    {
+      "epoch": 0.4375,
+      "grad_norm": 1.0769907930807676e-05,
+      "learning_rate": 0.00019815479524075758,
+      "loss": 1.1878,
+      "step": 126
+    },
+    {
+      "epoch": 0.4444444444444444,
+      "grad_norm": 1.0001721420849208e-05,
+      "learning_rate": 0.000198043355561789,
+      "loss": 1.1963,
+      "step": 128
+    },
+    {
+      "epoch": 0.4513888888888889,
+      "grad_norm": 1.2609498298843391e-05,
+      "learning_rate": 0.00019792868186755463,
+      "loss": 1.2135,
+      "step": 130
+    },
+    {
+      "epoch": 0.4583333333333333,
+      "grad_norm": 9.788008355826605e-06,
+      "learning_rate": 0.00019781077794063073,
+      "loss": 1.2,
+      "step": 132
+    },
+    {
+      "epoch": 0.4652777777777778,
+      "grad_norm": 1.0332081728847697e-05,
+      "learning_rate": 0.00019768964767014475,
+      "loss": 1.1747,
+      "step": 134
+    },
+    {
+      "epoch": 0.4722222222222222,
+      "grad_norm": 1.2005073585896753e-05,
+      "learning_rate": 0.00019756529505164682,
+      "loss": 1.1907,
+      "step": 136
+    },
+    {
+      "epoch": 0.4791666666666667,
+      "grad_norm": 8.512701242580079e-06,
+      "learning_rate": 0.00019743772418697806,
+      "loss": 1.2034,
+      "step": 138
+    },
+    {
+      "epoch": 0.4861111111111111,
+      "grad_norm": 1.081860773410881e-05,
+      "learning_rate": 0.0001973069392841352,
+      "loss": 1.1764,
+      "step": 140
+    },
+    {
+      "epoch": 0.4930555555555556,
+      "grad_norm": 1.0664197361620609e-05,
+      "learning_rate": 0.000197172944657132,
+      "loss": 1.169,
+      "step": 142
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 8.31518536870135e-06,
+      "learning_rate": 0.00019703574472585648,
+      "loss": 1.1787,
+      "step": 144
+    },
+    {
+      "epoch": 0.5069444444444444,
+      "grad_norm": 9.874113857222255e-06,
+      "learning_rate": 0.00019689534401592568,
+      "loss": 1.1908,
+      "step": 146
+    },
+    {
+      "epoch": 0.5138888888888888,
+      "grad_norm": 1.027604412229266e-05,
+      "learning_rate": 0.00019675174715853605,
+      "loss": 1.2001,
+      "step": 148
+    },
+    {
+      "epoch": 0.5208333333333334,
+      "grad_norm": 1.5908390196273103e-05,
+      "learning_rate": 0.00019660495889031073,
+      "loss": 1.1771,
+      "step": 150
+    },
+    {
+      "epoch": 0.5277777777777778,
+      "grad_norm": 1.0185674909735098e-05,
+      "learning_rate": 0.00019645498405314337,
+      "loss": 1.1809,
+      "step": 152
+    },
+    {
+      "epoch": 0.5347222222222222,
+      "grad_norm": 1.4728296264365781e-05,
+      "learning_rate": 0.0001963018275940384,
+      "loss": 1.2066,
+      "step": 154
+    },
+    {
+      "epoch": 0.5416666666666666,
+      "grad_norm": 1.1691463441820815e-05,
+      "learning_rate": 0.00019614549456494778,
+      "loss": 1.1879,
+      "step": 156
+    },
+    {
+      "epoch": 0.5486111111111112,
+      "grad_norm": 1.1677379916363861e-05,
+      "learning_rate": 0.0001959859901226045,
+      "loss": 1.1758,
+      "step": 158
+    },
+    {
+      "epoch": 0.5555555555555556,
+      "grad_norm": 1.2042312846460845e-05,
+      "learning_rate": 0.0001958233195283524,
+      "loss": 1.1741,
+      "step": 160
+    },
+    {
+      "epoch": 0.5625,
+      "grad_norm": 1.5415562302223407e-05,
+      "learning_rate": 0.00019565748814797252,
+      "loss": 1.1855,
+      "step": 162
+    },
+    {
+      "epoch": 0.5694444444444444,
+      "grad_norm": 1.4021643437445164e-05,
+      "learning_rate": 0.00019548850145150633,
+      "loss": 1.1937,
+      "step": 164
+    },
+    {
+      "epoch": 0.5763888888888888,
+      "grad_norm": 1.470915594836697e-05,
+      "learning_rate": 0.00019531636501307512,
+      "loss": 1.1946,
+      "step": 166
+    },
+    {
+      "epoch": 0.5833333333333334,
+      "grad_norm": 1.355435324512655e-05,
+      "learning_rate": 0.00019514108451069615,
+      "loss": 1.1898,
+      "step": 168
+    },
+    {
+      "epoch": 0.5902777777777778,
+      "grad_norm": 1.642305687710177e-05,
+      "learning_rate": 0.00019496266572609547,
+      "loss": 1.1822,
+      "step": 170
+    },
+    {
+      "epoch": 0.5972222222222222,
+      "grad_norm": 1.8154083591070957e-05,
+      "learning_rate": 0.00019478111454451712,
+      "loss": 1.1751,
+      "step": 172
+    },
+    {
+      "epoch": 0.6041666666666666,
+      "grad_norm": 1.3315224350662902e-05,
+      "learning_rate": 0.00019459643695452904,
+      "loss": 1.1826,
+      "step": 174
+    },
+    {
+      "epoch": 0.6111111111111112,
+      "grad_norm": 1.2248892744537443e-05,
+      "learning_rate": 0.00019440863904782543,
+      "loss": 1.213,
+      "step": 176
+    },
+    {
+      "epoch": 0.6180555555555556,
+      "grad_norm": 1.7752956409822218e-05,
+      "learning_rate": 0.00019421772701902596,
+      "loss": 1.1833,
+      "step": 178
+    },
+    {
+      "epoch": 0.625,
+      "grad_norm": 1.744980545481667e-05,
+      "learning_rate": 0.00019402370716547135,
+      "loss": 1.1974,
+      "step": 180
+    },
+    {
+      "epoch": 0.6319444444444444,
+      "grad_norm": 1.9521774447639473e-05,
+      "learning_rate": 0.00019382658588701568,
+      "loss": 1.1931,
+      "step": 182
+    },
+    {
+      "epoch": 0.6388888888888888,
+      "grad_norm": 1.5161932424234692e-05,
+      "learning_rate": 0.00019362636968581524,
+      "loss": 1.1901,
+      "step": 184
+    },
+    {
+      "epoch": 0.6458333333333334,
+      "grad_norm": 1.868332583399024e-05,
+      "learning_rate": 0.00019342306516611417,
+      "loss": 1.2045,
+      "step": 186
+    },
+    {
+      "epoch": 0.6527777777777778,
+      "grad_norm": 2.162260534532834e-05,
+      "learning_rate": 0.00019321667903402642,
+      "loss": 1.1899,
+      "step": 188
+    },
+    {
+      "epoch": 0.6597222222222222,
+      "grad_norm": 1.507936030975543e-05,
+      "learning_rate": 0.00019300721809731476,
+      "loss": 1.2029,
+      "step": 190
+    },
+    {
+      "epoch": 0.6666666666666666,
+      "grad_norm": 2.0704670532722957e-05,
+      "learning_rate": 0.00019279468926516606,
+      "loss": 1.2063,
+      "step": 192
+    },
+    {
+      "epoch": 0.6736111111111112,
+      "grad_norm": 1.8972095858771354e-05,
+      "learning_rate": 0.0001925790995479635,
+      "loss": 1.1861,
+      "step": 194
+    },
+    {
+      "epoch": 0.6805555555555556,
+      "grad_norm": 1.5989120583981276e-05,
+      "learning_rate": 0.0001923604560570552,
+      "loss": 1.2143,
+      "step": 196
+    },
+    {
+      "epoch": 0.6875,
+      "grad_norm": 1.15529483082355e-05,
+      "learning_rate": 0.00019213876600451978,
+      "loss": 1.1939,
+      "step": 198
+    },
+    {
+      "epoch": 0.6944444444444444,
+      "grad_norm": 1.474355212849332e-05,
+      "learning_rate": 0.0001919140367029284,
+      "loss": 1.1909,
+      "step": 200
+    },
+    {
+      "epoch": 0.7013888888888888,
+      "grad_norm": 1.4453116818913259e-05,
+      "learning_rate": 0.00019168627556510358,
+      "loss": 1.1669,
+      "step": 202
+    },
+    {
+      "epoch": 0.7083333333333334,
+      "grad_norm": 1.5761746908538043e-05,
+      "learning_rate": 0.00019145549010387463,
+      "loss": 1.1724,
+      "step": 204
+    },
+    {
+      "epoch": 0.7152777777777778,
+      "grad_norm": 1.6146143025252968e-05,
+      "learning_rate": 0.00019122168793182987,
+      "loss": 1.1755,
+      "step": 206
+    },
+    {
+      "epoch": 0.7222222222222222,
+      "grad_norm": 1.3478926121024415e-05,
+      "learning_rate": 0.00019098487676106558,
+      "loss": 1.1972,
+      "step": 208
+    },
+    {
+      "epoch": 0.7291666666666666,
+      "grad_norm": 1.650435478950385e-05,
+      "learning_rate": 0.00019074506440293148,
+      "loss": 1.1689,
+      "step": 210
+    },
+    {
+      "epoch": 0.7361111111111112,
+      "grad_norm": 1.2287140634725802e-05,
+      "learning_rate": 0.00019050225876777316,
+      "loss": 1.1904,
+      "step": 212
+    },
+    {
+      "epoch": 0.7430555555555556,
+      "grad_norm": 1.4642901078332216e-05,
+      "learning_rate": 0.00019025646786467116,
+      "loss": 1.179,
+      "step": 214
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 1.2564278222271241e-05,
+      "learning_rate": 0.00019000769980117682,
+      "loss": 1.1748,
+      "step": 216
+    },
+    {
+      "epoch": 0.7569444444444444,
+      "grad_norm": 1.6164931366802193e-05,
+      "learning_rate": 0.0001897559627830447,
+      "loss": 1.1975,
+      "step": 218
+    },
+    {
+      "epoch": 0.7638888888888888,
+      "grad_norm": 1.9057128156418912e-05,
+      "learning_rate": 0.000189501265113962,
+      "loss": 1.1759,
+      "step": 220
+    },
+    {
+      "epoch": 0.7708333333333334,
+      "grad_norm": 1.6131505617522635e-05,
+      "learning_rate": 0.00018924361519527473,
+      "loss": 1.1886,
+      "step": 222
+    },
+    {
+      "epoch": 0.7777777777777778,
+      "grad_norm": 1.875819725682959e-05,
+      "learning_rate": 0.00018898302152571043,
+      "loss": 1.198,
+      "step": 224
+    },
+    {
+      "epoch": 0.7847222222222222,
+      "grad_norm": 2.0776369638042524e-05,
+      "learning_rate": 0.000188719492701098,
+      "loss": 1.1851,
+      "step": 226
+    },
+    {
+      "epoch": 0.7916666666666666,
+      "grad_norm": 1.8655549865798093e-05,
+      "learning_rate": 0.000188453037414084,
+      "loss": 1.1942,
+      "step": 228
+    },
+    {
+      "epoch": 0.7986111111111112,
+      "grad_norm": 2.2143083697301336e-05,
+      "learning_rate": 0.0001881836644538461,
+      "loss": 1.1713,
+      "step": 230
+    },
+    {
+      "epoch": 0.8055555555555556,
+      "grad_norm": 1.975363375095185e-05,
+      "learning_rate": 0.000187911382705803,
+      "loss": 1.1768,
+      "step": 232
+    },
+    {
+      "epoch": 0.8125,
+      "grad_norm": 2.22298640437657e-05,
+      "learning_rate": 0.00018763620115132135,
+      "loss": 1.1878,
+      "step": 234
+    },
+    {
+      "epoch": 0.8194444444444444,
+      "grad_norm": 1.9831964891636744e-05,
+      "learning_rate": 0.00018735812886741968,
+      "loss": 1.1945,
+      "step": 236
+    },
+    {
+      "epoch": 0.8263888888888888,
+      "grad_norm": 2.3476728529203683e-05,
+      "learning_rate": 0.00018707717502646873,
+      "loss": 1.1759,
+      "step": 238
+    },
+    {
+      "epoch": 0.8333333333333334,
+      "grad_norm": 2.4707373086130247e-05,
+      "learning_rate": 0.0001867933488958891,
+      "loss": 1.207,
+      "step": 240
+    },
+    {
+      "epoch": 0.8402777777777778,
+      "grad_norm": 2.212941035395488e-05,
+      "learning_rate": 0.00018650665983784546,
+      "loss": 1.2087,
+      "step": 242
+    },
+    {
+      "epoch": 0.8472222222222222,
+      "grad_norm": 1.8686198018258438e-05,
+      "learning_rate": 0.00018621711730893776,
+      "loss": 1.1937,
+      "step": 244
+    },
+    {
+      "epoch": 0.8541666666666666,
+      "grad_norm": 2.0610183128155768e-05,
+      "learning_rate": 0.00018592473085988925,
+      "loss": 1.1864,
+      "step": 246
+    },
+    {
+      "epoch": 0.8611111111111112,
+      "grad_norm": 1.8564192941994406e-05,
+      "learning_rate": 0.00018562951013523154,
+      "loss": 1.1726,
+      "step": 248
+    },
+    {
+      "epoch": 0.8680555555555556,
+      "grad_norm": 2.2897967937751673e-05,
+      "learning_rate": 0.00018533146487298638,
+      "loss": 1.2127,
+      "step": 250
+    },
+    {
+      "epoch": 0.875,
+      "grad_norm": 2.0674704501288943e-05,
+      "learning_rate": 0.0001850306049043445,
+      "loss": 1.1884,
+      "step": 252
+    },
+    {
+      "epoch": 0.8819444444444444,
+      "grad_norm": 1.8763206753646955e-05,
+      "learning_rate": 0.00018472694015334132,
+      "loss": 1.1698,
+      "step": 254
+    },
+    {
+      "epoch": 0.8888888888888888,
+      "grad_norm": 4.040408748551272e-05,
+      "learning_rate": 0.00018442048063652952,
+      "loss": 1.1681,
+      "step": 256
+    },
+    {
+      "epoch": 0.8958333333333334,
+      "grad_norm": 2.0314189896453172e-05,
+      "learning_rate": 0.00018411123646264882,
+      "loss": 1.1708,
+      "step": 258
+    },
+    {
+      "epoch": 0.9027777777777778,
+      "grad_norm": 2.7322474124957807e-05,
+      "learning_rate": 0.0001837992178322923,
+      "loss": 1.1995,
+      "step": 260
+    },
+    {
+      "epoch": 0.9097222222222222,
+      "grad_norm": 2.6407045879750513e-05,
+      "learning_rate": 0.0001834844350375701,
+      "loss": 1.1898,
+      "step": 262
+    },
+    {
+      "epoch": 0.9166666666666666,
+      "grad_norm": 2.5243745767511427e-05,
+      "learning_rate": 0.00018316689846176992,
+      "loss": 1.1898,
+      "step": 264
+    },
+    {
+      "epoch": 0.9236111111111112,
+      "grad_norm": 2.753574517555535e-05,
+      "learning_rate": 0.00018284661857901436,
+      "loss": 1.187,
+      "step": 266
+    },
+    {
+      "epoch": 0.9305555555555556,
+      "grad_norm": 3.223833846277557e-05,
+      "learning_rate": 0.00018252360595391565,
+      "loss": 1.1908,
+      "step": 268
+    },
+    {
+      "epoch": 0.9375,
+      "grad_norm": 2.6311214242014103e-05,
+      "learning_rate": 0.00018219787124122708,
+      "loss": 1.206,
+      "step": 270
+    },
+    {
+      "epoch": 0.9444444444444444,
+      "grad_norm": 2.6744064598460682e-05,
+      "learning_rate": 0.00018186942518549145,
+      "loss": 1.2154,
+      "step": 272
+    },
+    {
+      "epoch": 0.9513888888888888,
+      "grad_norm": 2.5860512323561125e-05,
+      "learning_rate": 0.00018153827862068674,
+      "loss": 1.1825,
+      "step": 274
+    },
+    {
+      "epoch": 0.9583333333333334,
+      "grad_norm": 3.4333595976931974e-05,
+      "learning_rate": 0.00018120444246986882,
+      "loss": 1.1831,
+      "step": 276
+    },
+    {
+      "epoch": 0.9652777777777778,
+      "grad_norm": 3.32353483827319e-05,
+      "learning_rate": 0.00018086792774481102,
+      "loss": 1.2021,
+      "step": 278
+    },
+    {
+      "epoch": 0.9722222222222222,
+      "grad_norm": 3.6012690543429926e-05,
+      "learning_rate": 0.00018052874554564088,
+      "loss": 1.1799,
+      "step": 280
+    },
+    {
+      "epoch": 0.9791666666666666,
+      "grad_norm": 3.507096698740497e-05,
+      "learning_rate": 0.00018018690706047422,
+      "loss": 1.2028,
+      "step": 282
+    },
+    {
+      "epoch": 0.9861111111111112,
+      "grad_norm": 4.70953673357144e-05,
+      "learning_rate": 0.00017984242356504585,
+      "loss": 1.1677,
+      "step": 284
+    },
+    {
+      "epoch": 0.9930555555555556,
+      "grad_norm": 3.5310935345478356e-05,
+      "learning_rate": 0.00017949530642233773,
+      "loss": 1.2099,
+      "step": 286
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 4.8496305680600926e-05,
+      "learning_rate": 0.00017914556708220424,
+      "loss": 1.195,
+      "step": 288
+    },
+    {
+      "epoch": 1.0069444444444444,
+      "grad_norm": 4.527560668066144e-05,
+      "learning_rate": 0.00017879321708099433,
+      "loss": 1.2009,
+      "step": 290
+    },
+    {
+      "epoch": 1.0138888888888888,
+      "grad_norm": 4.206915036775172e-05,
+      "learning_rate": 0.0001784382680411711,
+      "loss": 1.1736,
+      "step": 292
+    },
+    {
+      "epoch": 1.0208333333333333,
+      "grad_norm": 4.1610346670495346e-05,
+      "learning_rate": 0.0001780807316709284,
+      "loss": 1.2031,
+      "step": 294
+    },
+    {
+      "epoch": 1.0277777777777777,
+      "grad_norm": 5.509778202394955e-05,
+      "learning_rate": 0.00017772061976380465,
+      "loss": 1.1989,
+      "step": 296
+    },
+    {
+      "epoch": 1.0347222222222223,
+      "grad_norm": 4.0269846067531034e-05,
+      "learning_rate": 0.0001773579441982938,
+      "loss": 1.1862,
+      "step": 298
+    },
+    {
+      "epoch": 1.0416666666666667,
+      "grad_norm": 4.829439421882853e-05,
+      "learning_rate": 0.0001769927169374535,
+      "loss": 1.1962,
+      "step": 300
+    },
+    {
+      "epoch": 1.0486111111111112,
+      "grad_norm": 4.680109486798756e-05,
+      "learning_rate": 0.00017662495002851049,
+      "loss": 1.1822,
+      "step": 302
+    },
+    {
+      "epoch": 1.0555555555555556,
+      "grad_norm": 4.947670822730288e-05,
+      "learning_rate": 0.0001762546556024633,
+      "loss": 1.214,
+      "step": 304
+    },
+    {
+      "epoch": 1.0625,
+      "grad_norm": 5.944677104707807e-05,
+      "learning_rate": 0.00017588184587368196,
+      "loss": 1.2075,
+      "step": 306
+    },
+    {
+      "epoch": 1.0694444444444444,
+      "grad_norm": 6.578410102520138e-05,
+      "learning_rate": 0.0001755065331395052,
+      "loss": 1.2299,
+      "step": 308
+    },
+    {
+      "epoch": 1.0763888888888888,
+      "grad_norm": 8.231692481786013e-05,
+      "learning_rate": 0.00017512872977983482,
+      "loss": 1.208,
+      "step": 310
+    },
+    {
+      "epoch": 1.0833333333333333,
+      "grad_norm": 6.243359530344605e-05,
+      "learning_rate": 0.00017474844825672727,
+      "loss": 1.2286,
+      "step": 312
+    },
+    {
+      "epoch": 1.0902777777777777,
+      "grad_norm": 8.204213372664526e-05,
+      "learning_rate": 0.00017436570111398263,
+      "loss": 1.2454,
+      "step": 314
+    },
+    {
+      "epoch": 1.0972222222222223,
+      "grad_norm": 7.384664058918133e-05,
+      "learning_rate": 0.00017398050097673081,
+      "loss": 1.2061,
+      "step": 316
+    },
+    {
+      "epoch": 1.1041666666666667,
+      "grad_norm": 7.188819290604442e-05,
+      "learning_rate": 0.0001735928605510152,
+      "loss": 1.2155,
+      "step": 318
+    },
+    {
+      "epoch": 1.1111111111111112,
+      "grad_norm": 0.00010495231254026294,
+      "learning_rate": 0.00017320279262337333,
+      "loss": 1.236,
+      "step": 320
+    },
+    {
+      "epoch": 1.1180555555555556,
+      "grad_norm": 9.512733231531456e-05,
+      "learning_rate": 0.00017281031006041538,
+      "loss": 1.253,
+      "step": 322
+    },
+    {
+      "epoch": 1.125,
+      "grad_norm": 9.048975334735587e-05,
+      "learning_rate": 0.00017241542580839964,
+      "loss": 1.2448,
+      "step": 324
+    },
+    {
+      "epoch": 1.1319444444444444,
+      "grad_norm": 8.587296906625852e-05,
+      "learning_rate": 0.0001720181528928054,
+      "loss": 1.2478,
+      "step": 326
+    },
+    {
+      "epoch": 1.1388888888888888,
+      "grad_norm": 9.011059592012316e-05,
+      "learning_rate": 0.00017161850441790332,
+      "loss": 1.2569,
+      "step": 328
+    },
+    {
+      "epoch": 1.1458333333333333,
+      "grad_norm": 9.588886314304546e-05,
+      "learning_rate": 0.00017121649356632333,
+      "loss": 1.2707,
+      "step": 330
+    },
+    {
+      "epoch": 1.1527777777777777,
+      "grad_norm": 9.867311746347696e-05,
+      "learning_rate": 0.00017081213359861964,
+      "loss": 1.2893,
+      "step": 332
+    },
+    {
+      "epoch": 1.1597222222222223,
+      "grad_norm": 0.00010548109275987372,
+      "learning_rate": 0.00017040543785283336,
+      "loss": 1.3716,
+      "step": 334
+    },
+    {
+      "epoch": 1.1666666666666667,
+      "grad_norm": 9.849516936810687e-05,
+      "learning_rate": 0.0001699964197440526,
+      "loss": 1.3677,
+      "step": 336
+    },
+    {
+      "epoch": 1.1736111111111112,
+      "grad_norm": 9.55751893343404e-05,
+      "learning_rate": 0.00016958509276396986,
+      "loss": 1.3871,
+      "step": 338
+    },
+    {
+      "epoch": 1.1805555555555556,
+      "grad_norm": 8.779441122896969e-05,
+      "learning_rate": 0.00016917147048043708,
+      "loss": 1.4266,
+      "step": 340
+    },
+    {
+      "epoch": 1.1875,
+      "grad_norm": 8.045154390856624e-05,
+      "learning_rate": 0.00016875556653701807,
+      "loss": 1.5035,
+      "step": 342
+    },
+    {
+      "epoch": 1.1944444444444444,
+      "grad_norm": 7.365510100498796e-05,
+      "learning_rate": 0.00016833739465253855,
+      "loss": 1.532,
+      "step": 344
+    },
+    {
+      "epoch": 1.2013888888888888,
+      "grad_norm": 6.366265006363392e-05,
+      "learning_rate": 0.00016791696862063343,
+      "loss": 1.5854,
+      "step": 346
+    },
+    {
+      "epoch": 1.2083333333333333,
+      "grad_norm": 5.384908217820339e-05,
+      "learning_rate": 0.0001674943023092921,
+      "loss": 1.6161,
+      "step": 348
+    },
+    {
+      "epoch": 1.2152777777777777,
+      "grad_norm": 4.3953212298220024e-05,
+      "learning_rate": 0.00016706940966040062,
+      "loss": 1.6505,
+      "step": 350
+    },
+    {
+      "epoch": 1.2222222222222223,
+      "grad_norm": 4.8670408432371914e-05,
+      "learning_rate": 0.00016664230468928226,
+      "loss": 1.6546,
+      "step": 352
+    },
+    {
+      "epoch": 1.2291666666666667,
+      "grad_norm": 4.31089210906066e-05,
+      "learning_rate": 0.0001662130014842348,
+      "loss": 1.6483,
+      "step": 354
+    },
+    {
+      "epoch": 1.2361111111111112,
+      "grad_norm": 3.783537613344379e-05,
+      "learning_rate": 0.0001657815142060661,
+      "loss": 1.6821,
+      "step": 356
+    },
+    {
+      "epoch": 1.2430555555555556,
+      "grad_norm": 3.4532826248323545e-05,
+      "learning_rate": 0.00016534785708762693,
+      "loss": 1.7036,
+      "step": 358
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 3.33928874169942e-05,
+      "learning_rate": 0.0001649120444333414,
+      "loss": 1.7133,
+      "step": 360
+    },
+    {
+      "epoch": 1.2569444444444444,
+      "grad_norm": 3.3881518902489915e-05,
+      "learning_rate": 0.0001644740906187352,
+      "loss": 1.7246,
+      "step": 362
+    },
+    {
+      "epoch": 1.2638888888888888,
+      "grad_norm": 2.9356864615692757e-05,
+      "learning_rate": 0.0001640340100899614,
+      "loss": 1.7653,
+      "step": 364
+    },
+    {
+      "epoch": 1.2708333333333333,
+      "grad_norm": 3.071104583796114e-05,
+      "learning_rate": 0.00016359181736332393,
+      "loss": 1.8163,
+      "step": 366
+    },
+    {
+      "epoch": 1.2777777777777777,
+      "grad_norm": 3.3303516829619184e-05,
+      "learning_rate": 0.00016314752702479882,
+      "loss": 1.8645,
+      "step": 368
+    },
+    {
+      "epoch": 1.2847222222222223,
+      "grad_norm": 3.376286258571781e-05,
+      "learning_rate": 0.00016270115372955286,
+      "loss": 1.904,
+      "step": 370
+    },
+    {
+      "epoch": 1.2916666666666667,
+      "grad_norm": 5.366417462937534e-05,
+      "learning_rate": 0.0001622527122014605,
+      "loss": 1.9253,
+      "step": 372
+    },
+    {
+      "epoch": 1.2986111111111112,
+      "grad_norm": 3.328424281789921e-05,
+      "learning_rate": 0.0001618022172326179,
+      "loss": 1.9897,
+      "step": 374
+    },
+    {
+      "epoch": 1.3055555555555556,
+      "grad_norm": 2.7745934858103283e-05,
+      "learning_rate": 0.00016134968368285518,
+      "loss": 2.0278,
+      "step": 376
+    },
+    {
+      "epoch": 1.3125,
+      "grad_norm": 3.089087113039568e-05,
+      "learning_rate": 0.0001608951264792462,
+      "loss": 2.0393,
+      "step": 378
+    },
+    {
+      "epoch": 1.3194444444444444,
+      "grad_norm": 3.455130718066357e-05,
+      "learning_rate": 0.00016043856061561613,
+      "loss": 2.0616,
+      "step": 380
+    },
+    {
+      "epoch": 1.3263888888888888,
+      "grad_norm": 2.891979420382995e-05,
+      "learning_rate": 0.000159980001152047,
+      "loss": 2.0735,
+      "step": 382
+    },
+    {
+      "epoch": 1.3333333333333333,
+      "grad_norm": 2.5224271666957065e-05,
+      "learning_rate": 0.00015951946321438073,
+      "loss": 2.1789,
+      "step": 384
+    },
+    {
+      "epoch": 1.3402777777777777,
+      "grad_norm": 2.7425830921856686e-05,
+      "learning_rate": 0.0001590569619937205,
+      "loss": 2.1456,
+      "step": 386
+    },
+    {
+      "epoch": 1.3472222222222223,
+      "grad_norm": 2.8204965929035097e-05,
+      "learning_rate": 0.00015859251274592934,
+      "loss": 2.1728,
+      "step": 388
+    },
+    {
+      "epoch": 1.3541666666666667,
+      "grad_norm": 2.821902853611391e-05,
+      "learning_rate": 0.00015812613079112708,
+      "loss": 2.2326,
+      "step": 390
+    },
+    {
+      "epoch": 1.3611111111111112,
+      "grad_norm": 2.6897825591731817e-05,
+      "learning_rate": 0.00015765783151318506,
+      "loss": 2.2963,
+      "step": 392
+    },
+    {
+      "epoch": 1.3680555555555556,
+      "grad_norm": 3.4906293876701966e-05,
+      "learning_rate": 0.00015718763035921847,
+      "loss": 2.4097,
+      "step": 394
+    },
+    {
+      "epoch": 1.375,
+      "grad_norm": 3.223119711037725e-05,
+      "learning_rate": 0.00015671554283907705,
+      "loss": 2.4404,
+      "step": 396
+    },
+    {
+      "epoch": 1.3819444444444444,
+      "grad_norm": 3.1298048270400614e-05,
+      "learning_rate": 0.00015624158452483337,
+      "loss": 2.5603,
+      "step": 398
+    },
+    {
+      "epoch": 1.3888888888888888,
+      "grad_norm": 4.206110679660924e-05,
+      "learning_rate": 0.00015576577105026916,
+      "loss": 2.695,
+      "step": 400
+    },
+    {
+      "epoch": 1.3958333333333333,
+      "grad_norm": 7.035292219370604e-05,
+      "learning_rate": 0.00015528811811035972,
+      "loss": 2.9149,
+      "step": 402
+    },
+    {
+      "epoch": 1.4027777777777777,
+      "grad_norm": 0.00011025326239177957,
+      "learning_rate": 0.00015480864146075608,
+      "loss": 3.4276,
+      "step": 404
+    },
+    {
+      "epoch": 1.4097222222222223,
+      "grad_norm": 0.0001375137799186632,
+      "learning_rate": 0.00015432735691726547,
+      "loss": 3.8393,
+      "step": 406
+    },
+    {
+      "epoch": 1.4166666666666667,
+      "grad_norm": 0.00015558676386717707,
+      "learning_rate": 0.00015384428035532932,
+      "loss": 4.228,
+      "step": 408
+    },
+    {
+      "epoch": 1.4236111111111112,
+      "grad_norm": 0.00016118814528454095,
+      "learning_rate": 0.00015335942770950003,
+      "loss": 4.5641,
+      "step": 410
+    },
+    {
+      "epoch": 1.4305555555555556,
+      "grad_norm": 0.0001592924090800807,
+      "learning_rate": 0.00015287281497291497,
+      "loss": 4.851,
+      "step": 412
+    },
+    {
+      "epoch": 1.4375,
+      "grad_norm": 0.00015159139002207667,
+      "learning_rate": 0.0001523844581967691,
+      "loss": 5.0476,
+      "step": 414
+    },
+    {
+      "epoch": 1.4444444444444444,
+      "grad_norm": 0.0001463508524466306,
+      "learning_rate": 0.00015189437348978561,
+      "loss": 5.1316,
+      "step": 416
+    },
+    {
+      "epoch": 1.4513888888888888,
+      "grad_norm": 0.00018889355123974383,
+      "learning_rate": 0.00015140257701768442,
+      "loss": 5.4955,
+      "step": 418
+    },
+    {
+      "epoch": 1.4583333333333333,
+      "grad_norm": 0.00014158397971186787,
+      "learning_rate": 0.0001509090850026489,
+      "loss": 5.5181,
+      "step": 420
+    },
+    {
+      "epoch": 1.4652777777777777,
+      "grad_norm": 0.00013335797120817006,
+      "learning_rate": 0.00015041391372279094,
+      "loss": 5.675,
+      "step": 422
+    },
+    {
+      "epoch": 1.4722222222222223,
+      "grad_norm": 0.00012022190639982,
+      "learning_rate": 0.0001499170795116139,
+      "loss": 5.6161,
+      "step": 424
+    },
+    {
+      "epoch": 1.4791666666666667,
+      "grad_norm": 0.00010211220069322735,
+      "learning_rate": 0.0001494185987574739,
+      "loss": 5.6191,
+      "step": 426
+    },
+    {
+      "epoch": 1.4861111111111112,
+      "grad_norm": 8.821386290946975e-05,
+      "learning_rate": 0.0001489184879030392,
+      "loss": 5.6616,
+      "step": 428
+    },
+    {
+      "epoch": 1.4930555555555556,
+      "grad_norm": 8.532748324796557e-05,
+      "learning_rate": 0.00014841676344474775,
+      "loss": 5.6971,
+      "step": 430
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 8.207001519622281e-05,
+      "learning_rate": 0.00014791344193226325,
+      "loss": 5.7875,
+      "step": 432
+    },
+    {
+      "epoch": 1.5069444444444444,
+      "grad_norm": 7.251853821799159e-05,
+      "learning_rate": 0.00014740853996792903,
+      "loss": 5.7305,
+      "step": 434
+    },
+    {
+      "epoch": 1.5138888888888888,
+      "grad_norm": 0.00016514182789251208,
+      "learning_rate": 0.00014690207420622062,
+      "loss": 5.7891,
+      "step": 436
+    },
+    {
+      "epoch": 1.5208333333333335,
+      "grad_norm": 6.044754263712093e-05,
+      "learning_rate": 0.00014639406135319617,
+      "loss": 6.0149,
+      "step": 438
+    },
+    {
+      "epoch": 1.5277777777777777,
+      "grad_norm": 7.798307342454791e-05,
+      "learning_rate": 0.0001458845181659456,
+      "loss": 5.9835,
+      "step": 440
+    },
+    {
+      "epoch": 1.5347222222222223,
+      "grad_norm": 5.688685632776469e-05,
+      "learning_rate": 0.00014537346145203776,
+      "loss": 6.145,
+      "step": 442
+    },
+    {
+      "epoch": 1.5416666666666665,
+      "grad_norm": 5.519447586266324e-05,
+      "learning_rate": 0.00014486090806896596,
+      "loss": 6.1921,
+      "step": 444
+    },
+    {
+      "epoch": 1.5486111111111112,
+      "grad_norm": 5.198594226385467e-05,
+      "learning_rate": 0.00014434687492359202,
+      "loss": 6.2428,
+      "step": 446
+    },
+    {
+      "epoch": 1.5555555555555556,
+      "grad_norm": 5.4393378377426416e-05,
+      "learning_rate": 0.00014383137897158857,
+      "loss": 6.2366,
+      "step": 448
+    },
+    {
+      "epoch": 1.5625,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014331443721687974,
+      "loss": 6.5184,
+      "step": 450
+    },
+    {
+      "epoch": 1.5694444444444444,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014331443721687974,
+      "loss": 6.3582,
+      "step": 452
+    },
+    {
+      "epoch": 1.5763888888888888,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014331443721687974,
+      "loss": 6.4165,
+      "step": 454
+    },
+    {
+      "epoch": 1.5833333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014331443721687974,
+      "loss": 6.5681,
+      "step": 456
+    },
+    {
+      "epoch": 1.5902777777777777,
+      "grad_norm": 0.013171161524951458,
+      "learning_rate": 0.00014331443721687974,
+      "loss": 6.4663,
+      "step": 458
+    },
+    {
+      "epoch": 1.5972222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 460
+    },
+    {
+      "epoch": 1.6041666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 462
+    },
+    {
+      "epoch": 1.6111111111111112,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 464
+    },
+    {
+      "epoch": 1.6180555555555556,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 466
+    },
+    {
+      "epoch": 1.625,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 468
+    },
+    {
+      "epoch": 1.6319444444444444,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 470
+    },
+    {
+      "epoch": 1.6388888888888888,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 472
+    },
+    {
+      "epoch": 1.6458333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 474
+    },
+    {
+      "epoch": 1.6527777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 476
+    },
+    {
+      "epoch": 1.6597222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 478
+    },
+    {
+      "epoch": 1.6666666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 480
+    },
+    {
+      "epoch": 1.6736111111111112,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 482
+    },
+    {
+      "epoch": 1.6805555555555556,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 484
+    },
+    {
+      "epoch": 1.6875,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 486
+    },
+    {
+      "epoch": 1.6944444444444444,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 488
+    },
+    {
+      "epoch": 1.7013888888888888,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 490
+    },
+    {
+      "epoch": 1.7083333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 492
+    },
+    {
+      "epoch": 1.7152777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 494
+    },
+    {
+      "epoch": 1.7222222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 496
+    },
+    {
+      "epoch": 1.7291666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 498
+    },
+    {
+      "epoch": 1.7361111111111112,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 500
+    },
+    {
+      "epoch": 1.7430555555555556,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 502
+    },
+    {
+      "epoch": 1.75,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 504
+    },
+    {
+      "epoch": 1.7569444444444444,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 506
+    },
+    {
+      "epoch": 1.7638888888888888,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 508
+    },
+    {
+      "epoch": 1.7708333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 510
+    },
+    {
+      "epoch": 1.7777777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 512
+    },
+    {
+      "epoch": 1.7847222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 514
+    },
+    {
+      "epoch": 1.7916666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 516
+    },
+    {
+      "epoch": 1.7986111111111112,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 518
+    },
+    {
+      "epoch": 1.8055555555555556,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 520
+    },
+    {
+      "epoch": 1.8125,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 522
+    },
+    {
+      "epoch": 1.8194444444444444,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 524
+    },
+    {
+      "epoch": 1.8263888888888888,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 526
+    },
+    {
+      "epoch": 1.8333333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 528
+    },
+    {
+      "epoch": 1.8402777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 530
+    },
+    {
+      "epoch": 1.8472222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 532
+    },
+    {
+      "epoch": 1.8541666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 534
+    },
+    {
+      "epoch": 1.8611111111111112,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 536
+    },
+    {
+      "epoch": 1.8680555555555556,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 538
+    },
+    {
+      "epoch": 1.875,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 540
+    },
+    {
+      "epoch": 1.8819444444444444,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 542
+    },
+    {
+      "epoch": 1.8888888888888888,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 544
+    },
+    {
+      "epoch": 1.8958333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 546
+    },
+    {
+      "epoch": 1.9027777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 548
+    },
+    {
+      "epoch": 1.9097222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 550
+    },
+    {
+      "epoch": 1.9166666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 552
+    },
+    {
+      "epoch": 1.9236111111111112,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 554
+    },
+    {
+      "epoch": 1.9305555555555556,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 556
+    },
+    {
+      "epoch": 1.9375,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 558
+    },
+    {
+      "epoch": 1.9444444444444444,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 560
+    },
+    {
+      "epoch": 1.9513888888888888,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 562
+    },
+    {
+      "epoch": 1.9583333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 564
+    },
+    {
+      "epoch": 1.9652777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 566
+    },
+    {
+      "epoch": 1.9722222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 568
+    },
+    {
+      "epoch": 1.9791666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 570
+    },
+    {
+      "epoch": 1.9861111111111112,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 572
+    },
+    {
+      "epoch": 1.9930555555555556,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 574
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 576
+    },
+    {
+      "epoch": 2.0069444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 578
+    },
+    {
+      "epoch": 2.013888888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 580
+    },
+    {
+      "epoch": 2.0208333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 582
+    },
+    {
+      "epoch": 2.0277777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 584
+    },
+    {
+      "epoch": 2.0347222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 586
+    },
+    {
+      "epoch": 2.0416666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 588
+    },
+    {
+      "epoch": 2.048611111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 590
+    },
+    {
+      "epoch": 2.0555555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 592
+    },
+    {
+      "epoch": 2.0625,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 594
+    },
+    {
+      "epoch": 2.0694444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 596
+    },
+    {
+      "epoch": 2.076388888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 598
+    },
+    {
+      "epoch": 2.0833333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 600
+    },
+    {
+      "epoch": 2.0902777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 602
+    },
+    {
+      "epoch": 2.0972222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 604
+    },
+    {
+      "epoch": 2.1041666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 606
+    },
+    {
+      "epoch": 2.111111111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 608
+    },
+    {
+      "epoch": 2.1180555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 610
+    },
+    {
+      "epoch": 2.125,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 612
+    },
+    {
+      "epoch": 2.1319444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 614
+    },
+    {
+      "epoch": 2.138888888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 616
+    },
+    {
+      "epoch": 2.1458333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 618
+    },
+    {
+      "epoch": 2.1527777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 620
+    },
+    {
+      "epoch": 2.1597222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 622
+    },
+    {
+      "epoch": 2.1666666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 624
+    },
+    {
+      "epoch": 2.173611111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 626
+    },
+    {
+      "epoch": 2.1805555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 628
+    },
+    {
+      "epoch": 2.1875,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 630
+    },
+    {
+      "epoch": 2.1944444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 632
+    },
+    {
+      "epoch": 2.201388888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 634
+    },
+    {
+      "epoch": 2.2083333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 636
+    },
+    {
+      "epoch": 2.2152777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 638
+    },
+    {
+      "epoch": 2.2222222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 640
+    },
+    {
+      "epoch": 2.2291666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 642
+    },
+    {
+      "epoch": 2.236111111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 644
+    },
+    {
+      "epoch": 2.2430555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 646
+    },
+    {
+      "epoch": 2.25,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 648
+    },
+    {
+      "epoch": 2.2569444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 650
+    },
+    {
+      "epoch": 2.263888888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 652
+    },
+    {
+      "epoch": 2.2708333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 654
+    },
+    {
+      "epoch": 2.2777777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 656
+    },
+    {
+      "epoch": 2.2847222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 658
+    },
+    {
+      "epoch": 2.2916666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 660
+    },
+    {
+      "epoch": 2.298611111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 662
+    },
+    {
+      "epoch": 2.3055555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 664
+    },
+    {
+      "epoch": 2.3125,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 666
+    },
+    {
+      "epoch": 2.3194444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 668
+    },
+    {
+      "epoch": 2.326388888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 670
+    },
+    {
+      "epoch": 2.3333333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 672
+    },
+    {
+      "epoch": 2.3402777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 674
+    },
+    {
+      "epoch": 2.3472222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 676
+    },
+    {
+      "epoch": 2.3541666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 678
+    },
+    {
+      "epoch": 2.361111111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 680
+    },
+    {
+      "epoch": 2.3680555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 682
+    },
+    {
+      "epoch": 2.375,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 684
+    },
+    {
+      "epoch": 2.3819444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 686
+    },
+    {
+      "epoch": 2.388888888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 688
+    },
+    {
+      "epoch": 2.3958333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 690
+    },
+    {
+      "epoch": 2.4027777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 692
+    },
+    {
+      "epoch": 2.4097222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 694
+    },
+    {
+      "epoch": 2.4166666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 696
+    },
+    {
+      "epoch": 2.423611111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 698
+    },
+    {
+      "epoch": 2.4305555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 700
+    },
+    {
+      "epoch": 2.4375,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 702
+    },
+    {
+      "epoch": 2.4444444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 704
+    },
+    {
+      "epoch": 2.451388888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 706
+    },
+    {
+      "epoch": 2.4583333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 708
+    },
+    {
+      "epoch": 2.4652777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 710
+    },
+    {
+      "epoch": 2.4722222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 712
+    },
+    {
+      "epoch": 2.4791666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 714
+    },
+    {
+      "epoch": 2.486111111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 716
+    },
+    {
+      "epoch": 2.4930555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 718
+    },
+    {
+      "epoch": 2.5,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 720
+    },
+    {
+      "epoch": 2.5069444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 722
+    },
+    {
+      "epoch": 2.513888888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 724
+    },
+    {
+      "epoch": 2.5208333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 726
+    },
+    {
+      "epoch": 2.5277777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 728
+    },
+    {
+      "epoch": 2.5347222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 730
+    },
+    {
+      "epoch": 2.5416666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 732
+    },
+    {
+      "epoch": 2.548611111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 734
+    },
+    {
+      "epoch": 2.5555555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 736
+    },
+    {
+      "epoch": 2.5625,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 738
+    },
+    {
+      "epoch": 2.5694444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 740
+    },
+    {
+      "epoch": 2.576388888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 742
+    },
+    {
+      "epoch": 2.5833333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 744
+    },
+    {
+      "epoch": 2.5902777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 746
+    },
+    {
+      "epoch": 2.5972222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 748
+    },
+    {
+      "epoch": 2.6041666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 750
+    },
+    {
+      "epoch": 2.611111111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 752
+    },
+    {
+      "epoch": 2.6180555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 754
+    },
+    {
+      "epoch": 2.625,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 756
+    },
+    {
+      "epoch": 2.6319444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 758
+    },
+    {
+      "epoch": 2.638888888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 760
+    },
+    {
+      "epoch": 2.6458333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 762
+    },
+    {
+      "epoch": 2.6527777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 764
+    },
+    {
+      "epoch": 2.6597222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 766
+    },
+    {
+      "epoch": 2.6666666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 768
+    },
+    {
+      "epoch": 2.673611111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 770
+    },
+    {
+      "epoch": 2.6805555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 772
+    },
+    {
+      "epoch": 2.6875,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 774
+    },
+    {
+      "epoch": 2.6944444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 776
+    },
+    {
+      "epoch": 2.701388888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 778
+    },
+    {
+      "epoch": 2.7083333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 780
+    },
+    {
+      "epoch": 2.7152777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 782
+    },
+    {
+      "epoch": 2.7222222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 784
+    },
+    {
+      "epoch": 2.7291666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 786
+    },
+    {
+      "epoch": 2.736111111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 788
+    },
+    {
+      "epoch": 2.7430555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 790
+    },
+    {
+      "epoch": 2.75,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 792
+    },
+    {
+      "epoch": 2.7569444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 794
+    },
+    {
+      "epoch": 2.763888888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 796
+    },
+    {
+      "epoch": 2.7708333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 798
+    },
+    {
+      "epoch": 2.7777777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 800
+    },
+    {
+      "epoch": 2.7847222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 802
+    },
+    {
+      "epoch": 2.7916666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 804
+    },
+    {
+      "epoch": 2.798611111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 806
+    },
+    {
+      "epoch": 2.8055555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 808
+    },
+    {
+      "epoch": 2.8125,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 810
+    },
+    {
+      "epoch": 2.8194444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 812
+    },
+    {
+      "epoch": 2.826388888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 814
+    },
+    {
+      "epoch": 2.8333333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 816
+    },
+    {
+      "epoch": 2.8402777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 818
+    },
+    {
+      "epoch": 2.8472222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 820
+    },
+    {
+      "epoch": 2.8541666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 822
+    },
+    {
+      "epoch": 2.861111111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 824
+    },
+    {
+      "epoch": 2.8680555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 826
+    },
+    {
+      "epoch": 2.875,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 828
+    },
+    {
+      "epoch": 2.8819444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 830
+    },
+    {
+      "epoch": 2.888888888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 832
+    },
+    {
+      "epoch": 2.8958333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 834
+    },
+    {
+      "epoch": 2.9027777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 836
+    },
+    {
+      "epoch": 2.9097222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 838
+    },
+    {
+      "epoch": 2.9166666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 840
+    },
+    {
+      "epoch": 2.923611111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 842
+    },
+    {
+      "epoch": 2.9305555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 844
+    },
+    {
+      "epoch": 2.9375,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 846
+    },
+    {
+      "epoch": 2.9444444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 848
+    },
+    {
+      "epoch": 2.951388888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 850
+    },
+    {
+      "epoch": 2.9583333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 852
+    },
+    {
+      "epoch": 2.9652777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 854
+    },
+    {
+      "epoch": 2.9722222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 856
+    },
+    {
+      "epoch": 2.9791666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 858
+    },
+    {
+      "epoch": 2.986111111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 860
+    },
+    {
+      "epoch": 2.9930555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 862
+    },
+    {
+      "epoch": 3.0,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 864
+    },
+    {
+      "epoch": 3.0069444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 866
+    },
+    {
+      "epoch": 3.013888888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 868
+    },
+    {
+      "epoch": 3.0208333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 870
+    },
+    {
+      "epoch": 3.0277777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 872
+    },
+    {
+      "epoch": 3.0347222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 874
+    },
+    {
+      "epoch": 3.0416666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 876
+    },
+    {
+      "epoch": 3.048611111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 878
+    },
+    {
+      "epoch": 3.0555555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 880
+    },
+    {
+      "epoch": 3.0625,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 882
+    },
+    {
+      "epoch": 3.0694444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 884
+    },
+    {
+      "epoch": 3.076388888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 886
+    },
+    {
+      "epoch": 3.0833333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 888
+    },
+    {
+      "epoch": 3.0902777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 890
+    },
+    {
+      "epoch": 3.0972222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 892
+    },
+    {
+      "epoch": 3.1041666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 894
+    },
+    {
+      "epoch": 3.111111111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 896
+    },
+    {
+      "epoch": 3.1180555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 898
+    },
+    {
+      "epoch": 3.125,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 900
+    },
+    {
+      "epoch": 3.1319444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 902
+    },
+    {
+      "epoch": 3.138888888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 904
+    },
+    {
+      "epoch": 3.1458333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 906
+    },
+    {
+      "epoch": 3.1527777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 908
+    },
+    {
+      "epoch": 3.1597222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 910
+    },
+    {
+      "epoch": 3.1666666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 912
+    },
+    {
+      "epoch": 3.173611111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 914
+    },
+    {
+      "epoch": 3.1805555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 916
+    },
+    {
+      "epoch": 3.1875,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 918
+    },
+    {
+      "epoch": 3.1944444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 920
+    },
+    {
+      "epoch": 3.201388888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 922
+    },
+    {
+      "epoch": 3.2083333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 924
+    },
+    {
+      "epoch": 3.2152777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 926
+    },
+    {
+      "epoch": 3.2222222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 928
+    },
+    {
+      "epoch": 3.2291666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 930
+    },
+    {
+      "epoch": 3.236111111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 932
+    },
+    {
+      "epoch": 3.2430555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 934
+    },
+    {
+      "epoch": 3.25,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 936
+    },
+    {
+      "epoch": 3.2569444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 938
+    },
+    {
+      "epoch": 3.263888888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 940
+    },
+    {
+      "epoch": 3.2708333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 942
+    },
+    {
+      "epoch": 3.2777777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 944
+    },
+    {
+      "epoch": 3.2847222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 946
+    },
+    {
+      "epoch": 3.2916666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 948
+    },
+    {
+      "epoch": 3.298611111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 950
+    },
+    {
+      "epoch": 3.3055555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 952
+    },
+    {
+      "epoch": 3.3125,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 954
+    },
+    {
+      "epoch": 3.3194444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 956
+    },
+    {
+      "epoch": 3.326388888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 958
+    },
+    {
+      "epoch": 3.3333333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 960
+    },
+    {
+      "epoch": 3.3402777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 962
+    },
+    {
+      "epoch": 3.3472222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 964
+    },
+    {
+      "epoch": 3.3541666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 966
+    },
+    {
+      "epoch": 3.361111111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 968
+    },
+    {
+      "epoch": 3.3680555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 970
+    },
+    {
+      "epoch": 3.375,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 972
+    },
+    {
+      "epoch": 3.3819444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 974
+    },
+    {
+      "epoch": 3.388888888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 976
+    },
+    {
+      "epoch": 3.3958333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 978
+    },
+    {
+      "epoch": 3.4027777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 980
+    },
+    {
+      "epoch": 3.4097222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 982
+    },
+    {
+      "epoch": 3.4166666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 984
+    },
+    {
+      "epoch": 3.423611111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 986
+    },
+    {
+      "epoch": 3.4305555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 988
+    },
+    {
+      "epoch": 3.4375,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 990
+    },
+    {
+      "epoch": 3.4444444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 992
+    },
+    {
+      "epoch": 3.451388888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 994
+    },
+    {
+      "epoch": 3.4583333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 996
+    },
+    {
+      "epoch": 3.4652777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 998
+    },
+    {
+      "epoch": 3.4722222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1000
+    },
+    {
+      "epoch": 3.4791666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1002
+    },
+    {
+      "epoch": 3.486111111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1004
+    },
+    {
+      "epoch": 3.4930555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1006
+    },
+    {
+      "epoch": 3.5,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1008
+    },
+    {
+      "epoch": 3.5069444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1010
+    },
+    {
+      "epoch": 3.513888888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1012
+    },
+    {
+      "epoch": 3.5208333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1014
+    },
+    {
+      "epoch": 3.5277777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1016
+    },
+    {
+      "epoch": 3.5347222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1018
+    },
+    {
+      "epoch": 3.5416666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1020
+    },
+    {
+      "epoch": 3.548611111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1022
+    },
+    {
+      "epoch": 3.5555555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1024
+    },
+    {
+      "epoch": 3.5625,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1026
+    },
+    {
+      "epoch": 3.5694444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1028
+    },
+    {
+      "epoch": 3.576388888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1030
+    },
+    {
+      "epoch": 3.5833333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1032
+    },
+    {
+      "epoch": 3.5902777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1034
+    },
+    {
+      "epoch": 3.5972222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1036
+    },
+    {
+      "epoch": 3.6041666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1038
+    },
+    {
+      "epoch": 3.611111111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1040
+    },
+    {
+      "epoch": 3.6180555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1042
+    },
+    {
+      "epoch": 3.625,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1044
+    },
+    {
+      "epoch": 3.6319444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1046
+    },
+    {
+      "epoch": 3.638888888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1048
+    },
+    {
+      "epoch": 3.6458333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1050
+    },
+    {
+      "epoch": 3.6527777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1052
+    },
+    {
+      "epoch": 3.6597222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1054
+    },
+    {
+      "epoch": 3.6666666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1056
+    },
+    {
+      "epoch": 3.673611111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1058
+    },
+    {
+      "epoch": 3.6805555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1060
+    },
+    {
+      "epoch": 3.6875,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1062
+    },
+    {
+      "epoch": 3.6944444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1064
+    },
+    {
+      "epoch": 3.701388888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1066
+    },
+    {
+      "epoch": 3.7083333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1068
+    },
+    {
+      "epoch": 3.7152777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1070
+    },
+    {
+      "epoch": 3.7222222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1072
+    },
+    {
+      "epoch": 3.7291666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1074
+    },
+    {
+      "epoch": 3.736111111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1076
+    },
+    {
+      "epoch": 3.7430555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1078
+    },
+    {
+      "epoch": 3.75,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1080
+    },
+    {
+      "epoch": 3.7569444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1082
+    },
+    {
+      "epoch": 3.763888888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1084
+    },
+    {
+      "epoch": 3.7708333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1086
+    },
+    {
+      "epoch": 3.7777777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1088
+    },
+    {
+      "epoch": 3.7847222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1090
+    },
+    {
+      "epoch": 3.7916666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1092
+    },
+    {
+      "epoch": 3.798611111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1094
+    },
+    {
+      "epoch": 3.8055555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1096
+    },
+    {
+      "epoch": 3.8125,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1098
+    },
+    {
+      "epoch": 3.8194444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1100
+    },
+    {
+      "epoch": 3.826388888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1102
+    },
+    {
+      "epoch": 3.8333333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1104
+    },
+    {
+      "epoch": 3.8402777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1106
+    },
+    {
+      "epoch": 3.8472222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1108
+    },
+    {
+      "epoch": 3.8541666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1110
+    },
+    {
+      "epoch": 3.861111111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1112
+    },
+    {
+      "epoch": 3.8680555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1114
+    },
+    {
+      "epoch": 3.875,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1116
+    },
+    {
+      "epoch": 3.8819444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1118
+    },
+    {
+      "epoch": 3.888888888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1120
+    },
+    {
+      "epoch": 3.8958333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1122
+    },
+    {
+      "epoch": 3.9027777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1124
+    },
+    {
+      "epoch": 3.9097222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1126
+    },
+    {
+      "epoch": 3.9166666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1128
+    },
+    {
+      "epoch": 3.923611111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1130
+    },
+    {
+      "epoch": 3.9305555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1132
+    },
+    {
+      "epoch": 3.9375,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1134
+    },
+    {
+      "epoch": 3.9444444444444446,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1136
+    },
+    {
+      "epoch": 3.951388888888889,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1138
+    },
+    {
+      "epoch": 3.9583333333333335,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1140
+    },
+    {
+      "epoch": 3.9652777777777777,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1142
+    },
+    {
+      "epoch": 3.9722222222222223,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1144
+    },
+    {
+      "epoch": 3.9791666666666665,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1146
+    },
+    {
+      "epoch": 3.986111111111111,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1148
+    },
+    {
+      "epoch": 3.9930555555555554,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1150
+    },
+    {
+      "epoch": 4.0,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014305542949029284,
+      "loss": 0.0,
+      "step": 1152
+    }
+  ],
+  "logging_steps": 2,
+  "max_steps": 1152,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 4,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.7329604701653565e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-1152/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:80311e2b6a4df1fda961527e50aa92bc3d64f6451e2aa8695c0c2905ab9601d6
+size 5713

checkpoint-1152/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-200/README.md ADDED Viewed

	@@ -0,0 +1,208 @@

+---
+base_model: ./Qwen3-8B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:./Qwen3-8B
+- lora
+- transformers
+- unsloth
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.16.0

checkpoint-200/adapter_config.json ADDED Viewed

	@@ -0,0 +1,41 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./Qwen3-8B",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "qalora_group_size": 16,
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "gate_proj",
+    "up_proj",
+    "down_proj",
+    "o_proj",
+    "k_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": true
+}

checkpoint-200/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5e5ce57df73faa719c30b2b30e6153ec831aaef6e5c3502c855f35199ec17b03
+size 2834238032

checkpoint-200/added_tokens.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "</think>": 151668,
+  "</tool_call>": 151658,
+  "</tool_response>": 151666,
+  "<think>": 151667,
+  "<tool_call>": 151657,
+  "<tool_response>": 151665,
+  "<|analysis|>": 151670,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|forecast|>": 151671,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|response|>": 151669,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

checkpoint-200/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,89 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if message.content is string %}
+        {%- set content = message.content %}
+    {%- else %}
+        {%- set content = '' %}
+    {%- endif %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is string %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in content %}
+                {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+                {%- set content = content.split('</think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}

checkpoint-200/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-200/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d0d3e224460c14d3500d1e957a6e59fc96ba82a5e7246d58a7a1a1a459c887da
+size 698777675

checkpoint-200/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:61c19bab1174704a4a4441475683bf1270277af15d2e2c95e964789128e482c4
+size 14645

checkpoint-200/scaler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:124625e167eb28acbfc793cfcb3e8a08b32e7fea06501462bc9e420a5e1beb2a
+size 1383

checkpoint-200/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8c2624154b69dde7e504cf59e5c9c1f1e77e854340e9a523f65e58b2f3f1dec3
+size 1465

checkpoint-200/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "additional_special_tokens": [
+    {
+      "content": "<|response|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|analysis|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|forecast|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    }
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-200/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:77247e5fb2e966d04e513068b17cca472e105e7c56953e9b1d27d70b93d77e6f
+size 11423221

checkpoint-200/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,254 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151665": {
+      "content": "<tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151666": {
+      "content": "</tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151667": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151668": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151669": {
+      "content": "<|response|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151670": {
+      "content": "<|analysis|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151671": {
+      "content": "<|forecast|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|response|>",
+    "<|analysis|>",
+    "<|forecast|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 40960,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "right",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoint-200/trainer_state.json ADDED Viewed

	@@ -0,0 +1,734 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.6944444444444444,
+  "eval_steps": 500,
+  "global_step": 200,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.006944444444444444,
+      "grad_norm": 0.0001319512666668743,
+      "learning_rate": 3.448275862068966e-06,
+      "loss": 21.3796,
+      "step": 2
+    },
+    {
+      "epoch": 0.013888888888888888,
+      "grad_norm": 0.00012970938405487686,
+      "learning_rate": 1.0344827586206897e-05,
+      "loss": 2.5144,
+      "step": 4
+    },
+    {
+      "epoch": 0.020833333333333332,
+      "grad_norm": 0.00012933755351696163,
+      "learning_rate": 1.7241379310344828e-05,
+      "loss": 13.8026,
+      "step": 6
+    },
+    {
+      "epoch": 0.027777777777777776,
+      "grad_norm": 0.00012198727199574932,
+      "learning_rate": 2.413793103448276e-05,
+      "loss": 4.9835,
+      "step": 8
+    },
+    {
+      "epoch": 0.034722222222222224,
+      "grad_norm": 0.00010912115249084309,
+      "learning_rate": 3.103448275862069e-05,
+      "loss": 2.397,
+      "step": 10
+    },
+    {
+      "epoch": 0.041666666666666664,
+      "grad_norm": 9.947551734512672e-05,
+      "learning_rate": 3.793103448275862e-05,
+      "loss": 2.3169,
+      "step": 12
+    },
+    {
+      "epoch": 0.04861111111111111,
+      "grad_norm": 9.700353257358074e-05,
+      "learning_rate": 4.482758620689655e-05,
+      "loss": 2.2121,
+      "step": 14
+    },
+    {
+      "epoch": 0.05555555555555555,
+      "grad_norm": 0.00010787531937239692,
+      "learning_rate": 5.172413793103449e-05,
+      "loss": 2.5911,
+      "step": 16
+    },
+    {
+      "epoch": 0.0625,
+      "grad_norm": 0.00010262445721309632,
+      "learning_rate": 5.862068965517241e-05,
+      "loss": 1.9774,
+      "step": 18
+    },
+    {
+      "epoch": 0.06944444444444445,
+      "grad_norm": 8.186206105165184e-05,
+      "learning_rate": 6.551724137931034e-05,
+      "loss": 1.8391,
+      "step": 20
+    },
+    {
+      "epoch": 0.0763888888888889,
+      "grad_norm": 5.4703454225091264e-05,
+      "learning_rate": 7.241379310344828e-05,
+      "loss": 1.7289,
+      "step": 22
+    },
+    {
+      "epoch": 0.08333333333333333,
+      "grad_norm": 4.5757446059724316e-05,
+      "learning_rate": 7.931034482758621e-05,
+      "loss": 1.867,
+      "step": 24
+    },
+    {
+      "epoch": 0.09027777777777778,
+      "grad_norm": 8.86492634890601e-05,
+      "learning_rate": 8.620689655172413e-05,
+      "loss": 1.7276,
+      "step": 26
+    },
+    {
+      "epoch": 0.09722222222222222,
+      "grad_norm": 3.919301525456831e-05,
+      "learning_rate": 9.310344827586207e-05,
+      "loss": 1.6506,
+      "step": 28
+    },
+    {
+      "epoch": 0.10416666666666667,
+      "grad_norm": 5.642673568218015e-05,
+      "learning_rate": 0.0001,
+      "loss": 1.5732,
+      "step": 30
+    },
+    {
+      "epoch": 0.1111111111111111,
+      "grad_norm": 6.0812188166892156e-05,
+      "learning_rate": 0.00010689655172413792,
+      "loss": 1.5062,
+      "step": 32
+    },
+    {
+      "epoch": 0.11805555555555555,
+      "grad_norm": 2.352882074774243e-05,
+      "learning_rate": 0.00011379310344827588,
+      "loss": 1.393,
+      "step": 34
+    },
+    {
+      "epoch": 0.125,
+      "grad_norm": 3.3942505979212e-05,
+      "learning_rate": 0.0001206896551724138,
+      "loss": 1.3654,
+      "step": 36
+    },
+    {
+      "epoch": 0.13194444444444445,
+      "grad_norm": 2.9797614843118936e-05,
+      "learning_rate": 0.00012758620689655174,
+      "loss": 1.3662,
+      "step": 38
+    },
+    {
+      "epoch": 0.1388888888888889,
+      "grad_norm": 2.0893716282444075e-05,
+      "learning_rate": 0.00013448275862068965,
+      "loss": 1.344,
+      "step": 40
+    },
+    {
+      "epoch": 0.14583333333333334,
+      "grad_norm": 1.886891550384462e-05,
+      "learning_rate": 0.0001413793103448276,
+      "loss": 1.2809,
+      "step": 42
+    },
+    {
+      "epoch": 0.1527777777777778,
+      "grad_norm": 1.6420885003753938e-05,
+      "learning_rate": 0.00014827586206896554,
+      "loss": 1.2911,
+      "step": 44
+    },
+    {
+      "epoch": 0.1597222222222222,
+      "grad_norm": 2.6823576263268478e-05,
+      "learning_rate": 0.00015517241379310346,
+      "loss": 1.3095,
+      "step": 46
+    },
+    {
+      "epoch": 0.16666666666666666,
+      "grad_norm": 1.2686981790466234e-05,
+      "learning_rate": 0.00016206896551724137,
+      "loss": 1.2149,
+      "step": 48
+    },
+    {
+      "epoch": 0.1736111111111111,
+      "grad_norm": 1.0219813702860847e-05,
+      "learning_rate": 0.00016896551724137932,
+      "loss": 1.2522,
+      "step": 50
+    },
+    {
+      "epoch": 0.18055555555555555,
+      "grad_norm": 1.0080276297230739e-05,
+      "learning_rate": 0.00017586206896551723,
+      "loss": 1.2311,
+      "step": 52
+    },
+    {
+      "epoch": 0.1875,
+      "grad_norm": 9.221699656336568e-06,
+      "learning_rate": 0.00018275862068965518,
+      "loss": 1.2138,
+      "step": 54
+    },
+    {
+      "epoch": 0.19444444444444445,
+      "grad_norm": 1.0500927601242438e-05,
+      "learning_rate": 0.00018965517241379312,
+      "loss": 1.2149,
+      "step": 56
+    },
+    {
+      "epoch": 0.2013888888888889,
+      "grad_norm": 8.30852695798967e-06,
+      "learning_rate": 0.00019655172413793104,
+      "loss": 1.212,
+      "step": 58
+    },
+    {
+      "epoch": 0.20833333333333334,
+      "grad_norm": 1.2315202184254304e-05,
+      "learning_rate": 0.0001999995876796145,
+      "loss": 1.2239,
+      "step": 60
+    },
+    {
+      "epoch": 0.2152777777777778,
+      "grad_norm": 1.1090536645497195e-05,
+      "learning_rate": 0.00019999628913693117,
+      "loss": 1.2075,
+      "step": 62
+    },
+    {
+      "epoch": 0.2222222222222222,
+      "grad_norm": 1.1589069799811114e-05,
+      "learning_rate": 0.00019998969216036892,
+      "loss": 1.2026,
+      "step": 64
+    },
+    {
+      "epoch": 0.22916666666666666,
+      "grad_norm": 1.0656535778252874e-05,
+      "learning_rate": 0.0001999797969675326,
+      "loss": 1.2126,
+      "step": 66
+    },
+    {
+      "epoch": 0.2361111111111111,
+      "grad_norm": 9.856509677774739e-06,
+      "learning_rate": 0.00019996660388482083,
+      "loss": 1.166,
+      "step": 68
+    },
+    {
+      "epoch": 0.24305555555555555,
+      "grad_norm": 1.5398893083329313e-05,
+      "learning_rate": 0.00019995011334741477,
+      "loss": 1.215,
+      "step": 70
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 8.422492101090029e-06,
+      "learning_rate": 0.00019993032589926414,
+      "loss": 1.1868,
+      "step": 72
+    },
+    {
+      "epoch": 0.2569444444444444,
+      "grad_norm": 8.810847248241771e-06,
+      "learning_rate": 0.00019990724219306902,
+      "loss": 1.1971,
+      "step": 74
+    },
+    {
+      "epoch": 0.2638888888888889,
+      "grad_norm": 1.1227813956793398e-05,
+      "learning_rate": 0.00019988086299025848,
+      "loss": 1.1684,
+      "step": 76
+    },
+    {
+      "epoch": 0.2708333333333333,
+      "grad_norm": 1.0562929674051702e-05,
+      "learning_rate": 0.00019985118916096534,
+      "loss": 1.1981,
+      "step": 78
+    },
+    {
+      "epoch": 0.2777777777777778,
+      "grad_norm": 1.521587728348095e-05,
+      "learning_rate": 0.00019981822168399756,
+      "loss": 1.1838,
+      "step": 80
+    },
+    {
+      "epoch": 0.2847222222222222,
+      "grad_norm": 1.0759257747849915e-05,
+      "learning_rate": 0.00019978196164680597,
+      "loss": 1.2032,
+      "step": 82
+    },
+    {
+      "epoch": 0.2916666666666667,
+      "grad_norm": 8.712796443433035e-06,
+      "learning_rate": 0.00019974241024544828,
+      "loss": 1.1937,
+      "step": 84
+    },
+    {
+      "epoch": 0.2986111111111111,
+      "grad_norm": 1.368098037346499e-05,
+      "learning_rate": 0.00019969956878454972,
+      "loss": 1.1965,
+      "step": 86
+    },
+    {
+      "epoch": 0.3055555555555556,
+      "grad_norm": 1.0317597116227262e-05,
+      "learning_rate": 0.00019965343867725998,
+      "loss": 1.1908,
+      "step": 88
+    },
+    {
+      "epoch": 0.3125,
+      "grad_norm": 1.0250181730953045e-05,
+      "learning_rate": 0.00019960402144520665,
+      "loss": 1.1983,
+      "step": 90
+    },
+    {
+      "epoch": 0.3194444444444444,
+      "grad_norm": 1.0102179658133537e-05,
+      "learning_rate": 0.00019955131871844488,
+      "loss": 1.1842,
+      "step": 92
+    },
+    {
+      "epoch": 0.3263888888888889,
+      "grad_norm": 8.509154213243164e-06,
+      "learning_rate": 0.00019949533223540385,
+      "loss": 1.1871,
+      "step": 94
+    },
+    {
+      "epoch": 0.3333333333333333,
+      "grad_norm": 8.434612027485855e-06,
+      "learning_rate": 0.00019943606384282916,
+      "loss": 1.2072,
+      "step": 96
+    },
+    {
+      "epoch": 0.3402777777777778,
+      "grad_norm": 1.0174206181545742e-05,
+      "learning_rate": 0.0001993735154957221,
+      "loss": 1.2088,
+      "step": 98
+    },
+    {
+      "epoch": 0.3472222222222222,
+      "grad_norm": 9.98695850285003e-06,
+      "learning_rate": 0.00019930768925727514,
+      "loss": 1.1847,
+      "step": 100
+    },
+    {
+      "epoch": 0.3541666666666667,
+      "grad_norm": 8.591785444878042e-06,
+      "learning_rate": 0.0001992385872988038,
+      "loss": 1.2041,
+      "step": 102
+    },
+    {
+      "epoch": 0.3611111111111111,
+      "grad_norm": 1.0436694537929725e-05,
+      "learning_rate": 0.00019916621189967502,
+      "loss": 1.2194,
+      "step": 104
+    },
+    {
+      "epoch": 0.3680555555555556,
+      "grad_norm": 1.137161689257482e-05,
+      "learning_rate": 0.00019909056544723213,
+      "loss": 1.1788,
+      "step": 106
+    },
+    {
+      "epoch": 0.375,
+      "grad_norm": 1.0015843145083636e-05,
+      "learning_rate": 0.00019901165043671593,
+      "loss": 1.1979,
+      "step": 108
+    },
+    {
+      "epoch": 0.3819444444444444,
+      "grad_norm": 1.0921584362222347e-05,
+      "learning_rate": 0.00019892946947118242,
+      "loss": 1.1836,
+      "step": 110
+    },
+    {
+      "epoch": 0.3888888888888889,
+      "grad_norm": 2.0685600247816183e-05,
+      "learning_rate": 0.00019884402526141709,
+      "loss": 1.1883,
+      "step": 112
+    },
+    {
+      "epoch": 0.3958333333333333,
+      "grad_norm": 1.0137908247997984e-05,
+      "learning_rate": 0.00019875532062584519,
+      "loss": 1.183,
+      "step": 114
+    },
+    {
+      "epoch": 0.4027777777777778,
+      "grad_norm": 1.1504852409416344e-05,
+      "learning_rate": 0.00019866335849043912,
+      "loss": 1.1957,
+      "step": 116
+    },
+    {
+      "epoch": 0.4097222222222222,
+      "grad_norm": 8.32590194477234e-06,
+      "learning_rate": 0.00019856814188862166,
+      "loss": 1.1605,
+      "step": 118
+    },
+    {
+      "epoch": 0.4166666666666667,
+      "grad_norm": 1.0243880751659162e-05,
+      "learning_rate": 0.000198469673961166,
+      "loss": 1.1787,
+      "step": 120
+    },
+    {
+      "epoch": 0.4236111111111111,
+      "grad_norm": 9.695215339888819e-06,
+      "learning_rate": 0.00019836795795609213,
+      "loss": 1.1849,
+      "step": 122
+    },
+    {
+      "epoch": 0.4305555555555556,
+      "grad_norm": 9.29382167669246e-06,
+      "learning_rate": 0.00019826299722855976,
+      "loss": 1.1779,
+      "step": 124
+    },
+    {
+      "epoch": 0.4375,
+      "grad_norm": 1.0769907930807676e-05,
+      "learning_rate": 0.00019815479524075758,
+      "loss": 1.1878,
+      "step": 126
+    },
+    {
+      "epoch": 0.4444444444444444,
+      "grad_norm": 1.0001721420849208e-05,
+      "learning_rate": 0.000198043355561789,
+      "loss": 1.1963,
+      "step": 128
+    },
+    {
+      "epoch": 0.4513888888888889,
+      "grad_norm": 1.2609498298843391e-05,
+      "learning_rate": 0.00019792868186755463,
+      "loss": 1.2135,
+      "step": 130
+    },
+    {
+      "epoch": 0.4583333333333333,
+      "grad_norm": 9.788008355826605e-06,
+      "learning_rate": 0.00019781077794063073,
+      "loss": 1.2,
+      "step": 132
+    },
+    {
+      "epoch": 0.4652777777777778,
+      "grad_norm": 1.0332081728847697e-05,
+      "learning_rate": 0.00019768964767014475,
+      "loss": 1.1747,
+      "step": 134
+    },
+    {
+      "epoch": 0.4722222222222222,
+      "grad_norm": 1.2005073585896753e-05,
+      "learning_rate": 0.00019756529505164682,
+      "loss": 1.1907,
+      "step": 136
+    },
+    {
+      "epoch": 0.4791666666666667,
+      "grad_norm": 8.512701242580079e-06,
+      "learning_rate": 0.00019743772418697806,
+      "loss": 1.2034,
+      "step": 138
+    },
+    {
+      "epoch": 0.4861111111111111,
+      "grad_norm": 1.081860773410881e-05,
+      "learning_rate": 0.0001973069392841352,
+      "loss": 1.1764,
+      "step": 140
+    },
+    {
+      "epoch": 0.4930555555555556,
+      "grad_norm": 1.0664197361620609e-05,
+      "learning_rate": 0.000197172944657132,
+      "loss": 1.169,
+      "step": 142
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 8.31518536870135e-06,
+      "learning_rate": 0.00019703574472585648,
+      "loss": 1.1787,
+      "step": 144
+    },
+    {
+      "epoch": 0.5069444444444444,
+      "grad_norm": 9.874113857222255e-06,
+      "learning_rate": 0.00019689534401592568,
+      "loss": 1.1908,
+      "step": 146
+    },
+    {
+      "epoch": 0.5138888888888888,
+      "grad_norm": 1.027604412229266e-05,
+      "learning_rate": 0.00019675174715853605,
+      "loss": 1.2001,
+      "step": 148
+    },
+    {
+      "epoch": 0.5208333333333334,
+      "grad_norm": 1.5908390196273103e-05,
+      "learning_rate": 0.00019660495889031073,
+      "loss": 1.1771,
+      "step": 150
+    },
+    {
+      "epoch": 0.5277777777777778,
+      "grad_norm": 1.0185674909735098e-05,
+      "learning_rate": 0.00019645498405314337,
+      "loss": 1.1809,
+      "step": 152
+    },
+    {
+      "epoch": 0.5347222222222222,
+      "grad_norm": 1.4728296264365781e-05,
+      "learning_rate": 0.0001963018275940384,
+      "loss": 1.2066,
+      "step": 154
+    },
+    {
+      "epoch": 0.5416666666666666,
+      "grad_norm": 1.1691463441820815e-05,
+      "learning_rate": 0.00019614549456494778,
+      "loss": 1.1879,
+      "step": 156
+    },
+    {
+      "epoch": 0.5486111111111112,
+      "grad_norm": 1.1677379916363861e-05,
+      "learning_rate": 0.0001959859901226045,
+      "loss": 1.1758,
+      "step": 158
+    },
+    {
+      "epoch": 0.5555555555555556,
+      "grad_norm": 1.2042312846460845e-05,
+      "learning_rate": 0.0001958233195283524,
+      "loss": 1.1741,
+      "step": 160
+    },
+    {
+      "epoch": 0.5625,
+      "grad_norm": 1.5415562302223407e-05,
+      "learning_rate": 0.00019565748814797252,
+      "loss": 1.1855,
+      "step": 162
+    },
+    {
+      "epoch": 0.5694444444444444,
+      "grad_norm": 1.4021643437445164e-05,
+      "learning_rate": 0.00019548850145150633,
+      "loss": 1.1937,
+      "step": 164
+    },
+    {
+      "epoch": 0.5763888888888888,
+      "grad_norm": 1.470915594836697e-05,
+      "learning_rate": 0.00019531636501307512,
+      "loss": 1.1946,
+      "step": 166
+    },
+    {
+      "epoch": 0.5833333333333334,
+      "grad_norm": 1.355435324512655e-05,
+      "learning_rate": 0.00019514108451069615,
+      "loss": 1.1898,
+      "step": 168
+    },
+    {
+      "epoch": 0.5902777777777778,
+      "grad_norm": 1.642305687710177e-05,
+      "learning_rate": 0.00019496266572609547,
+      "loss": 1.1822,
+      "step": 170
+    },
+    {
+      "epoch": 0.5972222222222222,
+      "grad_norm": 1.8154083591070957e-05,
+      "learning_rate": 0.00019478111454451712,
+      "loss": 1.1751,
+      "step": 172
+    },
+    {
+      "epoch": 0.6041666666666666,
+      "grad_norm": 1.3315224350662902e-05,
+      "learning_rate": 0.00019459643695452904,
+      "loss": 1.1826,
+      "step": 174
+    },
+    {
+      "epoch": 0.6111111111111112,
+      "grad_norm": 1.2248892744537443e-05,
+      "learning_rate": 0.00019440863904782543,
+      "loss": 1.213,
+      "step": 176
+    },
+    {
+      "epoch": 0.6180555555555556,
+      "grad_norm": 1.7752956409822218e-05,
+      "learning_rate": 0.00019421772701902596,
+      "loss": 1.1833,
+      "step": 178
+    },
+    {
+      "epoch": 0.625,
+      "grad_norm": 1.744980545481667e-05,
+      "learning_rate": 0.00019402370716547135,
+      "loss": 1.1974,
+      "step": 180
+    },
+    {
+      "epoch": 0.6319444444444444,
+      "grad_norm": 1.9521774447639473e-05,
+      "learning_rate": 0.00019382658588701568,
+      "loss": 1.1931,
+      "step": 182
+    },
+    {
+      "epoch": 0.6388888888888888,
+      "grad_norm": 1.5161932424234692e-05,
+      "learning_rate": 0.00019362636968581524,
+      "loss": 1.1901,
+      "step": 184
+    },
+    {
+      "epoch": 0.6458333333333334,
+      "grad_norm": 1.868332583399024e-05,
+      "learning_rate": 0.00019342306516611417,
+      "loss": 1.2045,
+      "step": 186
+    },
+    {
+      "epoch": 0.6527777777777778,
+      "grad_norm": 2.162260534532834e-05,
+      "learning_rate": 0.00019321667903402642,
+      "loss": 1.1899,
+      "step": 188
+    },
+    {
+      "epoch": 0.6597222222222222,
+      "grad_norm": 1.507936030975543e-05,
+      "learning_rate": 0.00019300721809731476,
+      "loss": 1.2029,
+      "step": 190
+    },
+    {
+      "epoch": 0.6666666666666666,
+      "grad_norm": 2.0704670532722957e-05,
+      "learning_rate": 0.00019279468926516606,
+      "loss": 1.2063,
+      "step": 192
+    },
+    {
+      "epoch": 0.6736111111111112,
+      "grad_norm": 1.8972095858771354e-05,
+      "learning_rate": 0.0001925790995479635,
+      "loss": 1.1861,
+      "step": 194
+    },
+    {
+      "epoch": 0.6805555555555556,
+      "grad_norm": 1.5989120583981276e-05,
+      "learning_rate": 0.0001923604560570552,
+      "loss": 1.2143,
+      "step": 196
+    },
+    {
+      "epoch": 0.6875,
+      "grad_norm": 1.15529483082355e-05,
+      "learning_rate": 0.00019213876600451978,
+      "loss": 1.1939,
+      "step": 198
+    },
+    {
+      "epoch": 0.6944444444444444,
+      "grad_norm": 1.474355212849332e-05,
+      "learning_rate": 0.0001919140367029284,
+      "loss": 1.1909,
+      "step": 200
+    }
+  ],
+  "logging_steps": 2,
+  "max_steps": 1152,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 4,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.009918315528192e+17,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-200/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:80311e2b6a4df1fda961527e50aa92bc3d64f6451e2aa8695c0c2905ab9601d6
+size 5713