Visdom9 committed on Mar 10

Commit

3254881

1 Parent(s): 196e35f

Pushing fine-tuned Norah model

Browse files

Files changed (29) hide show

download_dataset.py +11 -0
fine_tune_norah.py +54 -0
norah_lora/README.md +202 -0
norah_lora/adapter_config.json +32 -0
norah_lora/adapter_model.safetensors +3 -0
norah_lora/checkpoint-3500/README.md +202 -0
norah_lora/checkpoint-3500/adapter_config.json +32 -0
norah_lora/checkpoint-3500/adapter_model.safetensors +3 -0
norah_lora/checkpoint-3500/optimizer.pt +3 -0
norah_lora/checkpoint-3500/rng_state.pth +3 -0
norah_lora/checkpoint-3500/scheduler.pt +3 -0
norah_lora/checkpoint-3500/trainer_state.json +2483 -0
norah_lora/checkpoint-3500/training_args.bin +3 -0
norah_lora/checkpoint-3711/README.md +202 -0
norah_lora/checkpoint-3711/adapter_config.json +32 -0
norah_lora/checkpoint-3711/adapter_model.safetensors +3 -0
norah_lora/checkpoint-3711/optimizer.pt +3 -0
norah_lora/checkpoint-3711/rng_state.pth +3 -0
norah_lora/checkpoint-3711/scheduler.pt +3 -0
norah_lora/checkpoint-3711/trainer_state.json +2630 -0
norah_lora/checkpoint-3711/training_args.bin +3 -0
norah_lora/special_tokens_map.json +34 -0
norah_lora/tokenizer.json +0 -0
norah_lora/tokenizer_config.json +95 -0
test_norah.py +56 -0
tokenize_dataset.py +30 -0
tokenized_norah/data-00000-of-00001.arrow +3 -0
tokenized_norah/dataset_info.json +65 -0
tokenized_norah/state.json +13 -0

download_dataset.py ADDED Viewed

	@@ -0,0 +1,11 @@

+from datasets import load_dataset  # Script purpose: fetch oasst1 and preview its French subset
+# Download the "train" split of the OpenAssistant/oasst1 dataset
+dataset = load_dataset("OpenAssistant/oasst1", split="train")
+# Keep only the rows whose "lang" field equals "fr" (French)
+dataset = dataset.filter(lambda x: x["lang"] == "fr")
+# Print the first remaining row as a quick sanity check of the filter
+print("Example conversation from dataset:")
+print(dataset[0])  # NOTE(review): the filtered dataset is only printed, never saved here; presumably this also fails if no rows matched - confirm

fine_tune_norah.py ADDED Viewed

	@@ -0,0 +1,54 @@

+from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
+from peft import get_peft_model, LoraConfig, TaskType  # NOTE(review): TaskType is not referenced below; task_type is passed as a plain string
+from datasets import load_from_disk
+import torch
+# Load the tokenizer and the base model "Visdom9/Norah" (float32 weights, every module mapped to the CPU)
+model_name = "Visdom9/Norah"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32, device_map={"": "cpu"})
+# Wrap the base model with a LoRA (Low-Rank Adaptation) adapter
+config = LoraConfig(
+    task_type="CAUSAL_LM",  # Causal language-modelling task
+    r=8,  # LoRA rank
+    lora_alpha=32,  # LoRA scaling parameter
+    lora_dropout=0.1  # Dropout applied inside the LoRA layers
+)  # target_modules is not set here; the saved adapter_config.json lists "v_proj" and "q_proj"
+model = get_peft_model(model, config)
+# Load the pre-tokenized dataset from the local "tokenized_norah" directory
+tokenized_dataset = load_from_disk("tokenized_norah")
+# Training arguments (CPU-only run, small batches)
+training_args = TrainingArguments(
+    output_dir="./norah_lora",  # Checkpoints are written under this directory
+    per_device_train_batch_size=1,  # Batch size of 1 to avoid memory issues
+    gradient_accumulation_steps=2,  # Accumulate gradients over 2 batches per optimizer step
+    learning_rate=5e-5,
+    num_train_epochs=3,
+    save_steps=500,  # Write a checkpoint every 500 steps
+    save_total_limit=2,  # Keep at most 2 checkpoints on disk
+    logging_steps=10,  # Log training metrics every 10 steps
+    fp16=False  # FP16 disabled because training runs on the CPU
+)
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=tokenized_dataset,  # No eval dataset or data collator is passed; presumably the dataset already carries labels - confirm
+)
+# Run the training loop
+trainer.train()
+# Save the PEFT-wrapped model and the tokenizer into ./norah_lora (the same directory that holds the checkpoints)
+model.save_pretrained("./norah_lora")
+tokenizer.save_pretrained("./norah_lora")
+print("✅ Fine-tuning complete! Model saved in 'norah_lora'")

norah_lora/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: Visdom9/Norah
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and BibTeX information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.14.0

norah_lora/adapter_config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "Visdom9/Norah",
+  "bias": "none",
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

norah_lora/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6d59cff2a6bd8bd4549db675d631da3cdb3d83feba06da5fefc2970ca60dd38c
+size 1284192

norah_lora/checkpoint-3500/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: Visdom9/Norah
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and BibTeX information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.14.0

norah_lora/checkpoint-3500/adapter_config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "Visdom9/Norah",
+  "bias": "none",
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

norah_lora/checkpoint-3500/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:abb60d3c7375456653ede5d88728016b021fe37d0d0a4968d963ae7352d781c4
+size 1284192

norah_lora/checkpoint-3500/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c871457043d5fe60aa15c7184bc7bf0bf053b7bfb866342354591898b241cec6
+size 2595258

norah_lora/checkpoint-3500/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c7c4ae0dd11eccf6b8e9788335c054c607a3525d3b01061cc9e98dbb70689a4
+size 13990

norah_lora/checkpoint-3500/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a097afe2de16207c7fe1741d5d79682d0a7fcdebfa8a93b583133d901a5e6f89
+size 1064

norah_lora/checkpoint-3500/trainer_state.json ADDED Viewed

	@@ -0,0 +1,2483 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.8294260307194827,
+  "eval_steps": 500,
+  "global_step": 3500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.008084074373484237,
+      "grad_norm": Infinity,
+      "learning_rate": 4.98652654271086e-05,
+      "loss": 1.3186,
+      "step": 10
+    },
+    {
+      "epoch": 0.016168148746968473,
+      "grad_norm": Infinity,
+      "learning_rate": 4.973053085421719e-05,
+      "loss": 1.6363,
+      "step": 20
+    },
+    {
+      "epoch": 0.024252223120452707,
+      "grad_norm": Infinity,
+      "learning_rate": 4.959579628132579e-05,
+      "loss": 1.8313,
+      "step": 30
+    },
+    {
+      "epoch": 0.03233629749393695,
+      "grad_norm": Infinity,
+      "learning_rate": 4.946106170843439e-05,
+      "loss": 1.646,
+      "step": 40
+    },
+    {
+      "epoch": 0.04042037186742118,
+      "grad_norm": Infinity,
+      "learning_rate": 4.932632713554298e-05,
+      "loss": 1.4233,
+      "step": 50
+    },
+    {
+      "epoch": 0.04850444624090541,
+      "grad_norm": Infinity,
+      "learning_rate": 4.919159256265158e-05,
+      "loss": 1.8705,
+      "step": 60
+    },
+    {
+      "epoch": 0.056588520614389654,
+      "grad_norm": Infinity,
+      "learning_rate": 4.905685798976018e-05,
+      "loss": 1.7552,
+      "step": 70
+    },
+    {
+      "epoch": 0.0646725949878739,
+      "grad_norm": Infinity,
+      "learning_rate": 4.892212341686877e-05,
+      "loss": 1.1921,
+      "step": 80
+    },
+    {
+      "epoch": 0.07275666936135812,
+      "grad_norm": Infinity,
+      "learning_rate": 4.878738884397737e-05,
+      "loss": 1.4613,
+      "step": 90
+    },
+    {
+      "epoch": 0.08084074373484236,
+      "grad_norm": Infinity,
+      "learning_rate": 4.865265427108596e-05,
+      "loss": 0.6314,
+      "step": 100
+    },
+    {
+      "epoch": 0.0889248181083266,
+      "grad_norm": Infinity,
+      "learning_rate": 4.851791969819456e-05,
+      "loss": 1.8646,
+      "step": 110
+    },
+    {
+      "epoch": 0.09700889248181083,
+      "grad_norm": Infinity,
+      "learning_rate": 4.8383185125303156e-05,
+      "loss": 1.4315,
+      "step": 120
+    },
+    {
+      "epoch": 0.10509296685529507,
+      "grad_norm": Infinity,
+      "learning_rate": 4.824845055241175e-05,
+      "loss": 2.6826,
+      "step": 130
+    },
+    {
+      "epoch": 0.11317704122877931,
+      "grad_norm": Infinity,
+      "learning_rate": 4.8113715979520346e-05,
+      "loss": 2.2289,
+      "step": 140
+    },
+    {
+      "epoch": 0.12126111560226355,
+      "grad_norm": Infinity,
+      "learning_rate": 4.7978981406628945e-05,
+      "loss": 1.6823,
+      "step": 150
+    },
+    {
+      "epoch": 0.1293451899757478,
+      "grad_norm": Infinity,
+      "learning_rate": 4.7844246833737536e-05,
+      "loss": 0.7194,
+      "step": 160
+    },
+    {
+      "epoch": 0.137429264349232,
+      "grad_norm": Infinity,
+      "learning_rate": 4.7709512260846135e-05,
+      "loss": 2.28,
+      "step": 170
+    },
+    {
+      "epoch": 0.14551333872271624,
+      "grad_norm": Infinity,
+      "learning_rate": 4.757477768795473e-05,
+      "loss": 1.156,
+      "step": 180
+    },
+    {
+      "epoch": 0.15359741309620048,
+      "grad_norm": Infinity,
+      "learning_rate": 4.7440043115063325e-05,
+      "loss": 2.0865,
+      "step": 190
+    },
+    {
+      "epoch": 0.16168148746968472,
+      "grad_norm": Infinity,
+      "learning_rate": 4.730530854217192e-05,
+      "loss": 1.7647,
+      "step": 200
+    },
+    {
+      "epoch": 0.16976556184316896,
+      "grad_norm": Infinity,
+      "learning_rate": 4.717057396928052e-05,
+      "loss": 2.3384,
+      "step": 210
+    },
+    {
+      "epoch": 0.1778496362166532,
+      "grad_norm": Infinity,
+      "learning_rate": 4.703583939638911e-05,
+      "loss": 2.5532,
+      "step": 220
+    },
+    {
+      "epoch": 0.18593371059013744,
+      "grad_norm": Infinity,
+      "learning_rate": 4.690110482349771e-05,
+      "loss": 1.016,
+      "step": 230
+    },
+    {
+      "epoch": 0.19401778496362165,
+      "grad_norm": Infinity,
+      "learning_rate": 4.67663702506063e-05,
+      "loss": 2.1508,
+      "step": 240
+    },
+    {
+      "epoch": 0.2021018593371059,
+      "grad_norm": Infinity,
+      "learning_rate": 4.66316356777149e-05,
+      "loss": 1.267,
+      "step": 250
+    },
+    {
+      "epoch": 0.21018593371059013,
+      "grad_norm": Infinity,
+      "learning_rate": 4.64969011048235e-05,
+      "loss": 1.28,
+      "step": 260
+    },
+    {
+      "epoch": 0.21827000808407437,
+      "grad_norm": Infinity,
+      "learning_rate": 4.636216653193209e-05,
+      "loss": 1.6061,
+      "step": 270
+    },
+    {
+      "epoch": 0.22635408245755861,
+      "grad_norm": Infinity,
+      "learning_rate": 4.622743195904069e-05,
+      "loss": 0.7908,
+      "step": 280
+    },
+    {
+      "epoch": 0.23443815683104285,
+      "grad_norm": Infinity,
+      "learning_rate": 4.609269738614929e-05,
+      "loss": 1.8026,
+      "step": 290
+    },
+    {
+      "epoch": 0.2425222312045271,
+      "grad_norm": Infinity,
+      "learning_rate": 4.595796281325788e-05,
+      "loss": 1.2974,
+      "step": 300
+    },
+    {
+      "epoch": 0.25060630557801133,
+      "grad_norm": Infinity,
+      "learning_rate": 4.582322824036648e-05,
+      "loss": 1.885,
+      "step": 310
+    },
+    {
+      "epoch": 0.2586903799514956,
+      "grad_norm": Infinity,
+      "learning_rate": 4.568849366747508e-05,
+      "loss": 1.2212,
+      "step": 320
+    },
+    {
+      "epoch": 0.2667744543249798,
+      "grad_norm": Infinity,
+      "learning_rate": 4.555375909458367e-05,
+      "loss": 1.5874,
+      "step": 330
+    },
+    {
+      "epoch": 0.274858528698464,
+      "grad_norm": Infinity,
+      "learning_rate": 4.541902452169227e-05,
+      "loss": 0.8497,
+      "step": 340
+    },
+    {
+      "epoch": 0.28294260307194824,
+      "grad_norm": Infinity,
+      "learning_rate": 4.5284289948800865e-05,
+      "loss": 1.1971,
+      "step": 350
+    },
+    {
+      "epoch": 0.2910266774454325,
+      "grad_norm": Infinity,
+      "learning_rate": 4.514955537590946e-05,
+      "loss": 2.5525,
+      "step": 360
+    },
+    {
+      "epoch": 0.2991107518189167,
+      "grad_norm": Infinity,
+      "learning_rate": 4.5014820803018055e-05,
+      "loss": 0.5229,
+      "step": 370
+    },
+    {
+      "epoch": 0.30719482619240096,
+      "grad_norm": Infinity,
+      "learning_rate": 4.488008623012665e-05,
+      "loss": 1.9758,
+      "step": 380
+    },
+    {
+      "epoch": 0.3152789005658852,
+      "grad_norm": Infinity,
+      "learning_rate": 4.4745351657235245e-05,
+      "loss": 1.5789,
+      "step": 390
+    },
+    {
+      "epoch": 0.32336297493936944,
+      "grad_norm": Infinity,
+      "learning_rate": 4.4610617084343844e-05,
+      "loss": 2.2642,
+      "step": 400
+    },
+    {
+      "epoch": 0.3314470493128537,
+      "grad_norm": Infinity,
+      "learning_rate": 4.447588251145244e-05,
+      "loss": 2.1261,
+      "step": 410
+    },
+    {
+      "epoch": 0.3395311236863379,
+      "grad_norm": Infinity,
+      "learning_rate": 4.434114793856104e-05,
+      "loss": 2.9391,
+      "step": 420
+    },
+    {
+      "epoch": 0.34761519805982216,
+      "grad_norm": Infinity,
+      "learning_rate": 4.420641336566964e-05,
+      "loss": 1.7748,
+      "step": 430
+    },
+    {
+      "epoch": 0.3556992724333064,
+      "grad_norm": Infinity,
+      "learning_rate": 4.407167879277823e-05,
+      "loss": 1.3519,
+      "step": 440
+    },
+    {
+      "epoch": 0.36378334680679064,
+      "grad_norm": Infinity,
+      "learning_rate": 4.393694421988683e-05,
+      "loss": 1.6652,
+      "step": 450
+    },
+    {
+      "epoch": 0.3718674211802749,
+      "grad_norm": Infinity,
+      "learning_rate": 4.380220964699542e-05,
+      "loss": 1.7532,
+      "step": 460
+    },
+    {
+      "epoch": 0.3799514955537591,
+      "grad_norm": Infinity,
+      "learning_rate": 4.366747507410402e-05,
+      "loss": 1.8121,
+      "step": 470
+    },
+    {
+      "epoch": 0.3880355699272433,
+      "grad_norm": Infinity,
+      "learning_rate": 4.353274050121262e-05,
+      "loss": 1.0565,
+      "step": 480
+    },
+    {
+      "epoch": 0.39611964430072755,
+      "grad_norm": Infinity,
+      "learning_rate": 4.339800592832121e-05,
+      "loss": 2.5515,
+      "step": 490
+    },
+    {
+      "epoch": 0.4042037186742118,
+      "grad_norm": Infinity,
+      "learning_rate": 4.326327135542981e-05,
+      "loss": 1.8491,
+      "step": 500
+    },
+    {
+      "epoch": 0.412287793047696,
+      "grad_norm": Infinity,
+      "learning_rate": 4.3128536782538406e-05,
+      "loss": 1.3268,
+      "step": 510
+    },
+    {
+      "epoch": 0.42037186742118027,
+      "grad_norm": Infinity,
+      "learning_rate": 4.2993802209647e-05,
+      "loss": 2.3801,
+      "step": 520
+    },
+    {
+      "epoch": 0.4284559417946645,
+      "grad_norm": Infinity,
+      "learning_rate": 4.2859067636755596e-05,
+      "loss": 2.3338,
+      "step": 530
+    },
+    {
+      "epoch": 0.43654001616814875,
+      "grad_norm": Infinity,
+      "learning_rate": 4.2724333063864194e-05,
+      "loss": 1.5153,
+      "step": 540
+    },
+    {
+      "epoch": 0.444624090541633,
+      "grad_norm": Infinity,
+      "learning_rate": 4.2589598490972786e-05,
+      "loss": 0.8897,
+      "step": 550
+    },
+    {
+      "epoch": 0.45270816491511723,
+      "grad_norm": Infinity,
+      "learning_rate": 4.2454863918081384e-05,
+      "loss": 0.8557,
+      "step": 560
+    },
+    {
+      "epoch": 0.46079223928860147,
+      "grad_norm": Infinity,
+      "learning_rate": 4.232012934518998e-05,
+      "loss": 1.021,
+      "step": 570
+    },
+    {
+      "epoch": 0.4688763136620857,
+      "grad_norm": Infinity,
+      "learning_rate": 4.2185394772298574e-05,
+      "loss": 1.3295,
+      "step": 580
+    },
+    {
+      "epoch": 0.47696038803556995,
+      "grad_norm": Infinity,
+      "learning_rate": 4.205066019940717e-05,
+      "loss": 2.0716,
+      "step": 590
+    },
+    {
+      "epoch": 0.4850444624090542,
+      "grad_norm": Infinity,
+      "learning_rate": 4.1915925626515764e-05,
+      "loss": 2.5046,
+      "step": 600
+    },
+    {
+      "epoch": 0.4931285367825384,
+      "grad_norm": Infinity,
+      "learning_rate": 4.178119105362436e-05,
+      "loss": 1.4814,
+      "step": 610
+    },
+    {
+      "epoch": 0.5012126111560227,
+      "grad_norm": Infinity,
+      "learning_rate": 4.164645648073296e-05,
+      "loss": 1.5643,
+      "step": 620
+    },
+    {
+      "epoch": 0.5092966855295069,
+      "grad_norm": Infinity,
+      "learning_rate": 4.151172190784155e-05,
+      "loss": 1.4721,
+      "step": 630
+    },
+    {
+      "epoch": 0.5173807599029911,
+      "grad_norm": Infinity,
+      "learning_rate": 4.137698733495015e-05,
+      "loss": 2.1584,
+      "step": 640
+    },
+    {
+      "epoch": 0.5254648342764754,
+      "grad_norm": Infinity,
+      "learning_rate": 4.124225276205875e-05,
+      "loss": 0.9858,
+      "step": 650
+    },
+    {
+      "epoch": 0.5335489086499596,
+      "grad_norm": Infinity,
+      "learning_rate": 4.110751818916734e-05,
+      "loss": 2.2872,
+      "step": 660
+    },
+    {
+      "epoch": 0.5416329830234439,
+      "grad_norm": Infinity,
+      "learning_rate": 4.097278361627594e-05,
+      "loss": 1.8746,
+      "step": 670
+    },
+    {
+      "epoch": 0.549717057396928,
+      "grad_norm": Infinity,
+      "learning_rate": 4.083804904338454e-05,
+      "loss": 1.5985,
+      "step": 680
+    },
+    {
+      "epoch": 0.5578011317704122,
+      "grad_norm": Infinity,
+      "learning_rate": 4.070331447049313e-05,
+      "loss": 1.4411,
+      "step": 690
+    },
+    {
+      "epoch": 0.5658852061438965,
+      "grad_norm": Infinity,
+      "learning_rate": 4.056857989760173e-05,
+      "loss": 1.6405,
+      "step": 700
+    },
+    {
+      "epoch": 0.5739692805173807,
+      "grad_norm": Infinity,
+      "learning_rate": 4.0433845324710326e-05,
+      "loss": 0.9719,
+      "step": 710
+    },
+    {
+      "epoch": 0.582053354890865,
+      "grad_norm": Infinity,
+      "learning_rate": 4.029911075181892e-05,
+      "loss": 0.8405,
+      "step": 720
+    },
+    {
+      "epoch": 0.5901374292643492,
+      "grad_norm": Infinity,
+      "learning_rate": 4.0164376178927516e-05,
+      "loss": 0.5547,
+      "step": 730
+    },
+    {
+      "epoch": 0.5982215036378334,
+      "grad_norm": Infinity,
+      "learning_rate": 4.002964160603611e-05,
+      "loss": 1.0534,
+      "step": 740
+    },
+    {
+      "epoch": 0.6063055780113177,
+      "grad_norm": Infinity,
+      "learning_rate": 3.9894907033144707e-05,
+      "loss": 0.827,
+      "step": 750
+    },
+    {
+      "epoch": 0.6143896523848019,
+      "grad_norm": Infinity,
+      "learning_rate": 3.9760172460253305e-05,
+      "loss": 2.7736,
+      "step": 760
+    },
+    {
+      "epoch": 0.6224737267582862,
+      "grad_norm": Infinity,
+      "learning_rate": 3.9625437887361897e-05,
+      "loss": 1.637,
+      "step": 770
+    },
+    {
+      "epoch": 0.6305578011317704,
+      "grad_norm": Infinity,
+      "learning_rate": 3.9490703314470495e-05,
+      "loss": 2.0057,
+      "step": 780
+    },
+    {
+      "epoch": 0.6386418755052546,
+      "grad_norm": Infinity,
+      "learning_rate": 3.935596874157909e-05,
+      "loss": 1.1342,
+      "step": 790
+    },
+    {
+      "epoch": 0.6467259498787389,
+      "grad_norm": Infinity,
+      "learning_rate": 3.9221234168687685e-05,
+      "loss": 0.9694,
+      "step": 800
+    },
+    {
+      "epoch": 0.6548100242522231,
+      "grad_norm": Infinity,
+      "learning_rate": 3.908649959579628e-05,
+      "loss": 2.2358,
+      "step": 810
+    },
+    {
+      "epoch": 0.6628940986257074,
+      "grad_norm": Infinity,
+      "learning_rate": 3.895176502290488e-05,
+      "loss": 0.7282,
+      "step": 820
+    },
+    {
+      "epoch": 0.6709781729991916,
+      "grad_norm": Infinity,
+      "learning_rate": 3.8817030450013473e-05,
+      "loss": 1.0698,
+      "step": 830
+    },
+    {
+      "epoch": 0.6790622473726758,
+      "grad_norm": Infinity,
+      "learning_rate": 3.868229587712207e-05,
+      "loss": 1.3923,
+      "step": 840
+    },
+    {
+      "epoch": 0.6871463217461601,
+      "grad_norm": Infinity,
+      "learning_rate": 3.854756130423067e-05,
+      "loss": 0.8056,
+      "step": 850
+    },
+    {
+      "epoch": 0.6952303961196443,
+      "grad_norm": Infinity,
+      "learning_rate": 3.841282673133926e-05,
+      "loss": 1.6625,
+      "step": 860
+    },
+    {
+      "epoch": 0.7033144704931286,
+      "grad_norm": Infinity,
+      "learning_rate": 3.827809215844786e-05,
+      "loss": 1.2065,
+      "step": 870
+    },
+    {
+      "epoch": 0.7113985448666128,
+      "grad_norm": Infinity,
+      "learning_rate": 3.814335758555645e-05,
+      "loss": 1.1378,
+      "step": 880
+    },
+    {
+      "epoch": 0.719482619240097,
+      "grad_norm": Infinity,
+      "learning_rate": 3.800862301266505e-05,
+      "loss": 1.4192,
+      "step": 890
+    },
+    {
+      "epoch": 0.7275666936135813,
+      "grad_norm": Infinity,
+      "learning_rate": 3.787388843977365e-05,
+      "loss": 1.2827,
+      "step": 900
+    },
+    {
+      "epoch": 0.7356507679870655,
+      "grad_norm": Infinity,
+      "learning_rate": 3.773915386688224e-05,
+      "loss": 0.5413,
+      "step": 910
+    },
+    {
+      "epoch": 0.7437348423605498,
+      "grad_norm": Infinity,
+      "learning_rate": 3.760441929399084e-05,
+      "loss": 1.524,
+      "step": 920
+    },
+    {
+      "epoch": 0.751818916734034,
+      "grad_norm": Infinity,
+      "learning_rate": 3.746968472109944e-05,
+      "loss": 2.3918,
+      "step": 930
+    },
+    {
+      "epoch": 0.7599029911075182,
+      "grad_norm": Infinity,
+      "learning_rate": 3.733495014820803e-05,
+      "loss": 1.4762,
+      "step": 940
+    },
+    {
+      "epoch": 0.7679870654810024,
+      "grad_norm": Infinity,
+      "learning_rate": 3.720021557531663e-05,
+      "loss": 1.2758,
+      "step": 950
+    },
+    {
+      "epoch": 0.7760711398544866,
+      "grad_norm": Infinity,
+      "learning_rate": 3.7065481002425226e-05,
+      "loss": 1.2247,
+      "step": 960
+    },
+    {
+      "epoch": 0.7841552142279709,
+      "grad_norm": Infinity,
+      "learning_rate": 3.693074642953382e-05,
+      "loss": 1.3217,
+      "step": 970
+    },
+    {
+      "epoch": 0.7922392886014551,
+      "grad_norm": Infinity,
+      "learning_rate": 3.6796011856642416e-05,
+      "loss": 1.0781,
+      "step": 980
+    },
+    {
+      "epoch": 0.8003233629749393,
+      "grad_norm": Infinity,
+      "learning_rate": 3.6661277283751014e-05,
+      "loss": 1.1996,
+      "step": 990
+    },
+    {
+      "epoch": 0.8084074373484236,
+      "grad_norm": Infinity,
+      "learning_rate": 3.6526542710859606e-05,
+      "loss": 1.8774,
+      "step": 1000
+    },
+    {
+      "epoch": 0.8164915117219078,
+      "grad_norm": Infinity,
+      "learning_rate": 3.6391808137968204e-05,
+      "loss": 0.4993,
+      "step": 1010
+    },
+    {
+      "epoch": 0.824575586095392,
+      "grad_norm": Infinity,
+      "learning_rate": 3.6257073565076796e-05,
+      "loss": 1.1835,
+      "step": 1020
+    },
+    {
+      "epoch": 0.8326596604688763,
+      "grad_norm": Infinity,
+      "learning_rate": 3.6122338992185394e-05,
+      "loss": 1.6,
+      "step": 1030
+    },
+    {
+      "epoch": 0.8407437348423605,
+      "grad_norm": Infinity,
+      "learning_rate": 3.598760441929399e-05,
+      "loss": 2.5379,
+      "step": 1040
+    },
+    {
+      "epoch": 0.8488278092158448,
+      "grad_norm": Infinity,
+      "learning_rate": 3.5852869846402584e-05,
+      "loss": 1.0088,
+      "step": 1050
+    },
+    {
+      "epoch": 0.856911883589329,
+      "grad_norm": Infinity,
+      "learning_rate": 3.571813527351118e-05,
+      "loss": 2.2007,
+      "step": 1060
+    },
+    {
+      "epoch": 0.8649959579628133,
+      "grad_norm": Infinity,
+      "learning_rate": 3.558340070061978e-05,
+      "loss": 1.3587,
+      "step": 1070
+    },
+    {
+      "epoch": 0.8730800323362975,
+      "grad_norm": Infinity,
+      "learning_rate": 3.544866612772837e-05,
+      "loss": 3.0178,
+      "step": 1080
+    },
+    {
+      "epoch": 0.8811641067097817,
+      "grad_norm": Infinity,
+      "learning_rate": 3.531393155483697e-05,
+      "loss": 1.7664,
+      "step": 1090
+    },
+    {
+      "epoch": 0.889248181083266,
+      "grad_norm": Infinity,
+      "learning_rate": 3.517919698194557e-05,
+      "loss": 0.8585,
+      "step": 1100
+    },
+    {
+      "epoch": 0.8973322554567502,
+      "grad_norm": Infinity,
+      "learning_rate": 3.504446240905416e-05,
+      "loss": 1.5722,
+      "step": 1110
+    },
+    {
+      "epoch": 0.9054163298302345,
+      "grad_norm": Infinity,
+      "learning_rate": 3.490972783616276e-05,
+      "loss": 2.0158,
+      "step": 1120
+    },
+    {
+      "epoch": 0.9135004042037187,
+      "grad_norm": Infinity,
+      "learning_rate": 3.477499326327136e-05,
+      "loss": 1.8439,
+      "step": 1130
+    },
+    {
+      "epoch": 0.9215844785772029,
+      "grad_norm": Infinity,
+      "learning_rate": 3.464025869037995e-05,
+      "loss": 1.7193,
+      "step": 1140
+    },
+    {
+      "epoch": 0.9296685529506872,
+      "grad_norm": Infinity,
+      "learning_rate": 3.450552411748855e-05,
+      "loss": 0.8563,
+      "step": 1150
+    },
+    {
+      "epoch": 0.9377526273241714,
+      "grad_norm": Infinity,
+      "learning_rate": 3.4370789544597146e-05,
+      "loss": 1.2554,
+      "step": 1160
+    },
+    {
+      "epoch": 0.9458367016976557,
+      "grad_norm": Infinity,
+      "learning_rate": 3.423605497170574e-05,
+      "loss": 1.2612,
+      "step": 1170
+    },
+    {
+      "epoch": 0.9539207760711399,
+      "grad_norm": Infinity,
+      "learning_rate": 3.4101320398814336e-05,
+      "loss": 0.6833,
+      "step": 1180
+    },
+    {
+      "epoch": 0.9620048504446241,
+      "grad_norm": Infinity,
+      "learning_rate": 3.396658582592293e-05,
+      "loss": 0.7645,
+      "step": 1190
+    },
+    {
+      "epoch": 0.9700889248181084,
+      "grad_norm": Infinity,
+      "learning_rate": 3.3831851253031526e-05,
+      "loss": 1.7546,
+      "step": 1200
+    },
+    {
+      "epoch": 0.9781729991915926,
+      "grad_norm": Infinity,
+      "learning_rate": 3.3697116680140125e-05,
+      "loss": 2.0247,
+      "step": 1210
+    },
+    {
+      "epoch": 0.9862570735650767,
+      "grad_norm": Infinity,
+      "learning_rate": 3.356238210724872e-05,
+      "loss": 0.8708,
+      "step": 1220
+    },
+    {
+      "epoch": 0.994341147938561,
+      "grad_norm": Infinity,
+      "learning_rate": 3.342764753435732e-05,
+      "loss": 2.1135,
+      "step": 1230
+    },
+    {
+      "epoch": 1.0024252223120453,
+      "grad_norm": Infinity,
+      "learning_rate": 3.329291296146591e-05,
+      "loss": 1.73,
+      "step": 1240
+    },
+    {
+      "epoch": 1.0105092966855296,
+      "grad_norm": Infinity,
+      "learning_rate": 3.315817838857451e-05,
+      "loss": 1.5269,
+      "step": 1250
+    },
+    {
+      "epoch": 1.0185933710590138,
+      "grad_norm": Infinity,
+      "learning_rate": 3.302344381568311e-05,
+      "loss": 1.3657,
+      "step": 1260
+    },
+    {
+      "epoch": 1.026677445432498,
+      "grad_norm": Infinity,
+      "learning_rate": 3.28887092427917e-05,
+      "loss": 1.3771,
+      "step": 1270
+    },
+    {
+      "epoch": 1.0347615198059823,
+      "grad_norm": Infinity,
+      "learning_rate": 3.27539746699003e-05,
+      "loss": 0.9131,
+      "step": 1280
+    },
+    {
+      "epoch": 1.0428455941794665,
+      "grad_norm": Infinity,
+      "learning_rate": 3.26192400970089e-05,
+      "loss": 0.844,
+      "step": 1290
+    },
+    {
+      "epoch": 1.0509296685529508,
+      "grad_norm": Infinity,
+      "learning_rate": 3.248450552411749e-05,
+      "loss": 1.4511,
+      "step": 1300
+    },
+    {
+      "epoch": 1.059013742926435,
+      "grad_norm": Infinity,
+      "learning_rate": 3.234977095122609e-05,
+      "loss": 2.0453,
+      "step": 1310
+    },
+    {
+      "epoch": 1.0670978172999193,
+      "grad_norm": Infinity,
+      "learning_rate": 3.221503637833469e-05,
+      "loss": 1.4035,
+      "step": 1320
+    },
+    {
+      "epoch": 1.0751818916734033,
+      "grad_norm": Infinity,
+      "learning_rate": 3.208030180544328e-05,
+      "loss": 1.5244,
+      "step": 1330
+    },
+    {
+      "epoch": 1.0832659660468877,
+      "grad_norm": Infinity,
+      "learning_rate": 3.194556723255188e-05,
+      "loss": 0.8892,
+      "step": 1340
+    },
+    {
+      "epoch": 1.0913500404203718,
+      "grad_norm": Infinity,
+      "learning_rate": 3.1810832659660475e-05,
+      "loss": 1.6417,
+      "step": 1350
+    },
+    {
+      "epoch": 1.0994341147938562,
+      "grad_norm": Infinity,
+      "learning_rate": 3.167609808676907e-05,
+      "loss": 2.1219,
+      "step": 1360
+    },
+    {
+      "epoch": 1.1075181891673402,
+      "grad_norm": Infinity,
+      "learning_rate": 3.1541363513877665e-05,
+      "loss": 2.4549,
+      "step": 1370
+    },
+    {
+      "epoch": 1.1156022635408245,
+      "grad_norm": Infinity,
+      "learning_rate": 3.140662894098626e-05,
+      "loss": 1.1852,
+      "step": 1380
+    },
+    {
+      "epoch": 1.1236863379143087,
+      "grad_norm": Infinity,
+      "learning_rate": 3.1271894368094855e-05,
+      "loss": 2.5192,
+      "step": 1390
+    },
+    {
+      "epoch": 1.131770412287793,
+      "grad_norm": Infinity,
+      "learning_rate": 3.1137159795203454e-05,
+      "loss": 1.0584,
+      "step": 1400
+    },
+    {
+      "epoch": 1.1398544866612772,
+      "grad_norm": Infinity,
+      "learning_rate": 3.1002425222312045e-05,
+      "loss": 1.9475,
+      "step": 1410
+    },
+    {
+      "epoch": 1.1479385610347614,
+      "grad_norm": Infinity,
+      "learning_rate": 3.0867690649420644e-05,
+      "loss": 1.3349,
+      "step": 1420
+    },
+    {
+      "epoch": 1.1560226354082457,
+      "grad_norm": Infinity,
+      "learning_rate": 3.073295607652924e-05,
+      "loss": 2.005,
+      "step": 1430
+    },
+    {
+      "epoch": 1.16410670978173,
+      "grad_norm": Infinity,
+      "learning_rate": 3.0598221503637834e-05,
+      "loss": 0.8468,
+      "step": 1440
+    },
+    {
+      "epoch": 1.1721907841552142,
+      "grad_norm": Infinity,
+      "learning_rate": 3.0463486930746432e-05,
+      "loss": 1.3994,
+      "step": 1450
+    },
+    {
+      "epoch": 1.1802748585286984,
+      "grad_norm": Infinity,
+      "learning_rate": 3.0328752357855027e-05,
+      "loss": 0.5119,
+      "step": 1460
+    },
+    {
+      "epoch": 1.1883589329021826,
+      "grad_norm": Infinity,
+      "learning_rate": 3.0194017784963626e-05,
+      "loss": 0.7779,
+      "step": 1470
+    },
+    {
+      "epoch": 1.1964430072756669,
+      "grad_norm": Infinity,
+      "learning_rate": 3.005928321207222e-05,
+      "loss": 1.7018,
+      "step": 1480
+    },
+    {
+      "epoch": 1.2045270816491511,
+      "grad_norm": Infinity,
+      "learning_rate": 2.9924548639180816e-05,
+      "loss": 1.3685,
+      "step": 1490
+    },
+    {
+      "epoch": 1.2126111560226354,
+      "grad_norm": Infinity,
+      "learning_rate": 2.978981406628941e-05,
+      "loss": 1.361,
+      "step": 1500
+    },
+    {
+      "epoch": 1.2206952303961196,
+      "grad_norm": Infinity,
+      "learning_rate": 2.965507949339801e-05,
+      "loss": 1.5077,
+      "step": 1510
+    },
+    {
+      "epoch": 1.2287793047696038,
+      "grad_norm": Infinity,
+      "learning_rate": 2.9520344920506604e-05,
+      "loss": 1.0513,
+      "step": 1520
+    },
+    {
+      "epoch": 1.236863379143088,
+      "grad_norm": Infinity,
+      "learning_rate": 2.93856103476152e-05,
+      "loss": 1.6926,
+      "step": 1530
+    },
+    {
+      "epoch": 1.2449474535165723,
+      "grad_norm": Infinity,
+      "learning_rate": 2.9250875774723797e-05,
+      "loss": 1.3084,
+      "step": 1540
+    },
+    {
+      "epoch": 1.2530315278900566,
+      "grad_norm": Infinity,
+      "learning_rate": 2.9116141201832392e-05,
+      "loss": 1.8298,
+      "step": 1550
+    },
+    {
+      "epoch": 1.2611156022635408,
+      "grad_norm": Infinity,
+      "learning_rate": 2.8981406628940987e-05,
+      "loss": 0.9793,
+      "step": 1560
+    },
+    {
+      "epoch": 1.269199676637025,
+      "grad_norm": Infinity,
+      "learning_rate": 2.8846672056049582e-05,
+      "loss": 1.4149,
+      "step": 1570
+    },
+    {
+      "epoch": 1.2772837510105093,
+      "grad_norm": Infinity,
+      "learning_rate": 2.871193748315818e-05,
+      "loss": 0.9485,
+      "step": 1580
+    },
+    {
+      "epoch": 1.2853678253839935,
+      "grad_norm": Infinity,
+      "learning_rate": 2.8577202910266776e-05,
+      "loss": 1.6182,
+      "step": 1590
+    },
+    {
+      "epoch": 1.2934518997574778,
+      "grad_norm": Infinity,
+      "learning_rate": 2.844246833737537e-05,
+      "loss": 0.9473,
+      "step": 1600
+    },
+    {
+      "epoch": 1.301535974130962,
+      "grad_norm": Infinity,
+      "learning_rate": 2.830773376448397e-05,
+      "loss": 1.8231,
+      "step": 1610
+    },
+    {
+      "epoch": 1.3096200485044462,
+      "grad_norm": Infinity,
+      "learning_rate": 2.8172999191592564e-05,
+      "loss": 1.687,
+      "step": 1620
+    },
+    {
+      "epoch": 1.3177041228779305,
+      "grad_norm": Infinity,
+      "learning_rate": 2.803826461870116e-05,
+      "loss": 1.0405,
+      "step": 1630
+    },
+    {
+      "epoch": 1.3257881972514147,
+      "grad_norm": Infinity,
+      "learning_rate": 2.7903530045809754e-05,
+      "loss": 1.2729,
+      "step": 1640
+    },
+    {
+      "epoch": 1.333872271624899,
+      "grad_norm": Infinity,
+      "learning_rate": 2.7768795472918353e-05,
+      "loss": 1.7429,
+      "step": 1650
+    },
+    {
+      "epoch": 1.3419563459983832,
+      "grad_norm": Infinity,
+      "learning_rate": 2.7634060900026948e-05,
+      "loss": 1.1652,
+      "step": 1660
+    },
+    {
+      "epoch": 1.3500404203718674,
+      "grad_norm": Infinity,
+      "learning_rate": 2.7499326327135543e-05,
+      "loss": 1.6927,
+      "step": 1670
+    },
+    {
+      "epoch": 1.3581244947453517,
+      "grad_norm": Infinity,
+      "learning_rate": 2.736459175424414e-05,
+      "loss": 1.1215,
+      "step": 1680
+    },
+    {
+      "epoch": 1.366208569118836,
+      "grad_norm": Infinity,
+      "learning_rate": 2.7229857181352736e-05,
+      "loss": 1.0629,
+      "step": 1690
+    },
+    {
+      "epoch": 1.3742926434923202,
+      "grad_norm": Infinity,
+      "learning_rate": 2.709512260846133e-05,
+      "loss": 1.0104,
+      "step": 1700
+    },
+    {
+      "epoch": 1.3823767178658044,
+      "grad_norm": Infinity,
+      "learning_rate": 2.6960388035569926e-05,
+      "loss": 1.4327,
+      "step": 1710
+    },
+    {
+      "epoch": 1.3904607922392886,
+      "grad_norm": Infinity,
+      "learning_rate": 2.6825653462678525e-05,
+      "loss": 1.0857,
+      "step": 1720
+    },
+    {
+      "epoch": 1.3985448666127729,
+      "grad_norm": Infinity,
+      "learning_rate": 2.669091888978712e-05,
+      "loss": 1.7623,
+      "step": 1730
+    },
+    {
+      "epoch": 1.4066289409862571,
+      "grad_norm": Infinity,
+      "learning_rate": 2.6556184316895715e-05,
+      "loss": 1.4973,
+      "step": 1740
+    },
+    {
+      "epoch": 1.4147130153597414,
+      "grad_norm": Infinity,
+      "learning_rate": 2.6421449744004313e-05,
+      "loss": 1.313,
+      "step": 1750
+    },
+    {
+      "epoch": 1.4227970897332256,
+      "grad_norm": Infinity,
+      "learning_rate": 2.6286715171112908e-05,
+      "loss": 1.1965,
+      "step": 1760
+    },
+    {
+      "epoch": 1.4308811641067098,
+      "grad_norm": Infinity,
+      "learning_rate": 2.6151980598221503e-05,
+      "loss": 1.432,
+      "step": 1770
+    },
+    {
+      "epoch": 1.438965238480194,
+      "grad_norm": Infinity,
+      "learning_rate": 2.6017246025330098e-05,
+      "loss": 1.3331,
+      "step": 1780
+    },
+    {
+      "epoch": 1.4470493128536783,
+      "grad_norm": Infinity,
+      "learning_rate": 2.5882511452438697e-05,
+      "loss": 1.2561,
+      "step": 1790
+    },
+    {
+      "epoch": 1.4551333872271626,
+      "grad_norm": Infinity,
+      "learning_rate": 2.574777687954729e-05,
+      "loss": 1.5896,
+      "step": 1800
+    },
+    {
+      "epoch": 1.4632174616006468,
+      "grad_norm": Infinity,
+      "learning_rate": 2.5613042306655887e-05,
+      "loss": 2.9014,
+      "step": 1810
+    },
+    {
+      "epoch": 1.4713015359741308,
+      "grad_norm": Infinity,
+      "learning_rate": 2.5478307733764485e-05,
+      "loss": 1.5244,
+      "step": 1820
+    },
+    {
+      "epoch": 1.4793856103476153,
+      "grad_norm": Infinity,
+      "learning_rate": 2.534357316087308e-05,
+      "loss": 1.0577,
+      "step": 1830
+    },
+    {
+      "epoch": 1.4874696847210993,
+      "grad_norm": Infinity,
+      "learning_rate": 2.5208838587981675e-05,
+      "loss": 1.2323,
+      "step": 1840
+    },
+    {
+      "epoch": 1.4955537590945838,
+      "grad_norm": Infinity,
+      "learning_rate": 2.5074104015090273e-05,
+      "loss": 2.4222,
+      "step": 1850
+    },
+    {
+      "epoch": 1.5036378334680678,
+      "grad_norm": Infinity,
+      "learning_rate": 2.4939369442198872e-05,
+      "loss": 1.9402,
+      "step": 1860
+    },
+    {
+      "epoch": 1.5117219078415522,
+      "grad_norm": Infinity,
+      "learning_rate": 2.4804634869307467e-05,
+      "loss": 2.2911,
+      "step": 1870
+    },
+    {
+      "epoch": 1.5198059822150363,
+      "grad_norm": Infinity,
+      "learning_rate": 2.4669900296416062e-05,
+      "loss": 1.7301,
+      "step": 1880
+    },
+    {
+      "epoch": 1.5278900565885207,
+      "grad_norm": Infinity,
+      "learning_rate": 2.4535165723524657e-05,
+      "loss": 0.6863,
+      "step": 1890
+    },
+    {
+      "epoch": 1.5359741309620047,
+      "grad_norm": Infinity,
+      "learning_rate": 2.4400431150633255e-05,
+      "loss": 1.8456,
+      "step": 1900
+    },
+    {
+      "epoch": 1.5440582053354892,
+      "grad_norm": Infinity,
+      "learning_rate": 2.426569657774185e-05,
+      "loss": 2.3463,
+      "step": 1910
+    },
+    {
+      "epoch": 1.5521422797089732,
+      "grad_norm": Infinity,
+      "learning_rate": 2.4130962004850445e-05,
+      "loss": 1.63,
+      "step": 1920
+    },
+    {
+      "epoch": 1.5602263540824577,
+      "grad_norm": Infinity,
+      "learning_rate": 2.3996227431959044e-05,
+      "loss": 2.1095,
+      "step": 1930
+    },
+    {
+      "epoch": 1.5683104284559417,
+      "grad_norm": Infinity,
+      "learning_rate": 2.386149285906764e-05,
+      "loss": 0.9828,
+      "step": 1940
+    },
+    {
+      "epoch": 1.5763945028294262,
+      "grad_norm": Infinity,
+      "learning_rate": 2.3726758286176234e-05,
+      "loss": 0.7091,
+      "step": 1950
+    },
+    {
+      "epoch": 1.5844785772029102,
+      "grad_norm": Infinity,
+      "learning_rate": 2.359202371328483e-05,
+      "loss": 0.5691,
+      "step": 1960
+    },
+    {
+      "epoch": 1.5925626515763947,
+      "grad_norm": Infinity,
+      "learning_rate": 2.3457289140393427e-05,
+      "loss": 1.5768,
+      "step": 1970
+    },
+    {
+      "epoch": 1.6006467259498787,
+      "grad_norm": Infinity,
+      "learning_rate": 2.3322554567502022e-05,
+      "loss": 1.161,
+      "step": 1980
+    },
+    {
+      "epoch": 1.6087308003233631,
+      "grad_norm": Infinity,
+      "learning_rate": 2.3187819994610617e-05,
+      "loss": 0.9278,
+      "step": 1990
+    },
+    {
+      "epoch": 1.6168148746968471,
+      "grad_norm": Infinity,
+      "learning_rate": 2.3053085421719216e-05,
+      "loss": 1.0681,
+      "step": 2000
+    },
+    {
+      "epoch": 1.6248989490703316,
+      "grad_norm": Infinity,
+      "learning_rate": 2.291835084882781e-05,
+      "loss": 2.3668,
+      "step": 2010
+    },
+    {
+      "epoch": 1.6329830234438156,
+      "grad_norm": Infinity,
+      "learning_rate": 2.2783616275936406e-05,
+      "loss": 0.7226,
+      "step": 2020
+    },
+    {
+      "epoch": 1.6410670978172999,
+      "grad_norm": Infinity,
+      "learning_rate": 2.2648881703045e-05,
+      "loss": 0.5279,
+      "step": 2030
+    },
+    {
+      "epoch": 1.649151172190784,
+      "grad_norm": Infinity,
+      "learning_rate": 2.25141471301536e-05,
+      "loss": 0.7175,
+      "step": 2040
+    },
+    {
+      "epoch": 1.6572352465642683,
+      "grad_norm": Infinity,
+      "learning_rate": 2.2379412557262194e-05,
+      "loss": 2.026,
+      "step": 2050
+    },
+    {
+      "epoch": 1.6653193209377526,
+      "grad_norm": Infinity,
+      "learning_rate": 2.224467798437079e-05,
+      "loss": 1.204,
+      "step": 2060
+    },
+    {
+      "epoch": 1.6734033953112368,
+      "grad_norm": Infinity,
+      "learning_rate": 2.2109943411479387e-05,
+      "loss": 2.0731,
+      "step": 2070
+    },
+    {
+      "epoch": 1.681487469684721,
+      "grad_norm": Infinity,
+      "learning_rate": 2.1975208838587983e-05,
+      "loss": 1.9343,
+      "step": 2080
+    },
+    {
+      "epoch": 1.6895715440582053,
+      "grad_norm": Infinity,
+      "learning_rate": 2.1840474265696578e-05,
+      "loss": 2.1806,
+      "step": 2090
+    },
+    {
+      "epoch": 1.6976556184316896,
+      "grad_norm": Infinity,
+      "learning_rate": 2.1705739692805176e-05,
+      "loss": 2.457,
+      "step": 2100
+    },
+    {
+      "epoch": 1.7057396928051738,
+      "grad_norm": Infinity,
+      "learning_rate": 2.157100511991377e-05,
+      "loss": 1.7964,
+      "step": 2110
+    },
+    {
+      "epoch": 1.713823767178658,
+      "grad_norm": Infinity,
+      "learning_rate": 2.1436270547022366e-05,
+      "loss": 1.9725,
+      "step": 2120
+    },
+    {
+      "epoch": 1.7219078415521423,
+      "grad_norm": Infinity,
+      "learning_rate": 2.130153597413096e-05,
+      "loss": 1.4176,
+      "step": 2130
+    },
+    {
+      "epoch": 1.7299919159256265,
+      "grad_norm": Infinity,
+      "learning_rate": 2.116680140123956e-05,
+      "loss": 2.6819,
+      "step": 2140
+    },
+    {
+      "epoch": 1.7380759902991108,
+      "grad_norm": Infinity,
+      "learning_rate": 2.1032066828348154e-05,
+      "loss": 2.2248,
+      "step": 2150
+    },
+    {
+      "epoch": 1.746160064672595,
+      "grad_norm": Infinity,
+      "learning_rate": 2.089733225545675e-05,
+      "loss": 2.2926,
+      "step": 2160
+    },
+    {
+      "epoch": 1.7542441390460792,
+      "grad_norm": Infinity,
+      "learning_rate": 2.0762597682565348e-05,
+      "loss": 0.6392,
+      "step": 2170
+    },
+    {
+      "epoch": 1.7623282134195635,
+      "grad_norm": Infinity,
+      "learning_rate": 2.0627863109673943e-05,
+      "loss": 1.4321,
+      "step": 2180
+    },
+    {
+      "epoch": 1.7704122877930477,
+      "grad_norm": Infinity,
+      "learning_rate": 2.0493128536782538e-05,
+      "loss": 1.9084,
+      "step": 2190
+    },
+    {
+      "epoch": 1.778496362166532,
+      "grad_norm": Infinity,
+      "learning_rate": 2.0358393963891133e-05,
+      "loss": 2.2621,
+      "step": 2200
+    },
+    {
+      "epoch": 1.7865804365400162,
+      "grad_norm": Infinity,
+      "learning_rate": 2.022365939099973e-05,
+      "loss": 1.8285,
+      "step": 2210
+    },
+    {
+      "epoch": 1.7946645109135004,
+      "grad_norm": Infinity,
+      "learning_rate": 2.008892481810833e-05,
+      "loss": 1.5897,
+      "step": 2220
+    },
+    {
+      "epoch": 1.8027485852869847,
+      "grad_norm": Infinity,
+      "learning_rate": 1.9954190245216925e-05,
+      "loss": 1.6952,
+      "step": 2230
+    },
+    {
+      "epoch": 1.810832659660469,
+      "grad_norm": Infinity,
+      "learning_rate": 1.981945567232552e-05,
+      "loss": 2.6125,
+      "step": 2240
+    },
+    {
+      "epoch": 1.8189167340339532,
+      "grad_norm": Infinity,
+      "learning_rate": 1.9684721099434118e-05,
+      "loss": 1.2341,
+      "step": 2250
+    },
+    {
+      "epoch": 1.8270008084074374,
+      "grad_norm": Infinity,
+      "learning_rate": 1.9549986526542713e-05,
+      "loss": 1.9369,
+      "step": 2260
+    },
+    {
+      "epoch": 1.8350848827809216,
+      "grad_norm": Infinity,
+      "learning_rate": 1.9415251953651308e-05,
+      "loss": 2.6913,
+      "step": 2270
+    },
+    {
+      "epoch": 1.8431689571544059,
+      "grad_norm": Infinity,
+      "learning_rate": 1.9280517380759907e-05,
+      "loss": 2.0335,
+      "step": 2280
+    },
+    {
+      "epoch": 1.85125303152789,
+      "grad_norm": Infinity,
+      "learning_rate": 1.91457828078685e-05,
+      "loss": 1.2543,
+      "step": 2290
+    },
+    {
+      "epoch": 1.8593371059013744,
+      "grad_norm": Infinity,
+      "learning_rate": 1.9011048234977097e-05,
+      "loss": 1.6764,
+      "step": 2300
+    },
+    {
+      "epoch": 1.8674211802748584,
+      "grad_norm": Infinity,
+      "learning_rate": 1.887631366208569e-05,
+      "loss": 1.2704,
+      "step": 2310
+    },
+    {
+      "epoch": 1.8755052546483428,
+      "grad_norm": Infinity,
+      "learning_rate": 1.874157908919429e-05,
+      "loss": 2.1497,
+      "step": 2320
+    },
+    {
+      "epoch": 1.8835893290218269,
+      "grad_norm": Infinity,
+      "learning_rate": 1.8606844516302885e-05,
+      "loss": 1.7158,
+      "step": 2330
+    },
+    {
+      "epoch": 1.8916734033953113,
+      "grad_norm": Infinity,
+      "learning_rate": 1.847210994341148e-05,
+      "loss": 0.9835,
+      "step": 2340
+    },
+    {
+      "epoch": 1.8997574777687953,
+      "grad_norm": Infinity,
+      "learning_rate": 1.833737537052008e-05,
+      "loss": 1.6897,
+      "step": 2350
+    },
+    {
+      "epoch": 1.9078415521422798,
+      "grad_norm": Infinity,
+      "learning_rate": 1.8202640797628673e-05,
+      "loss": 1.5966,
+      "step": 2360
+    },
+    {
+      "epoch": 1.9159256265157638,
+      "grad_norm": Infinity,
+      "learning_rate": 1.806790622473727e-05,
+      "loss": 1.3293,
+      "step": 2370
+    },
+    {
+      "epoch": 1.9240097008892483,
+      "grad_norm": Infinity,
+      "learning_rate": 1.7933171651845863e-05,
+      "loss": 0.9033,
+      "step": 2380
+    },
+    {
+      "epoch": 1.9320937752627323,
+      "grad_norm": Infinity,
+      "learning_rate": 1.7798437078954462e-05,
+      "loss": 1.1496,
+      "step": 2390
+    },
+    {
+      "epoch": 1.9401778496362168,
+      "grad_norm": Infinity,
+      "learning_rate": 1.7663702506063057e-05,
+      "loss": 1.0576,
+      "step": 2400
+    },
+    {
+      "epoch": 1.9482619240097008,
+      "grad_norm": Infinity,
+      "learning_rate": 1.7528967933171652e-05,
+      "loss": 2.219,
+      "step": 2410
+    },
+    {
+      "epoch": 1.9563459983831852,
+      "grad_norm": Infinity,
+      "learning_rate": 1.739423336028025e-05,
+      "loss": 0.8811,
+      "step": 2420
+    },
+    {
+      "epoch": 1.9644300727566693,
+      "grad_norm": Infinity,
+      "learning_rate": 1.7259498787388845e-05,
+      "loss": 1.5159,
+      "step": 2430
+    },
+    {
+      "epoch": 1.9725141471301537,
+      "grad_norm": Infinity,
+      "learning_rate": 1.712476421449744e-05,
+      "loss": 1.5736,
+      "step": 2440
+    },
+    {
+      "epoch": 1.9805982215036377,
+      "grad_norm": Infinity,
+      "learning_rate": 1.6990029641606035e-05,
+      "loss": 2.0976,
+      "step": 2450
+    },
+    {
+      "epoch": 1.9886822958771222,
+      "grad_norm": Infinity,
+      "learning_rate": 1.6855295068714634e-05,
+      "loss": 2.2363,
+      "step": 2460
+    },
+    {
+      "epoch": 1.9967663702506062,
+      "grad_norm": Infinity,
+      "learning_rate": 1.672056049582323e-05,
+      "loss": 2.5238,
+      "step": 2470
+    },
+    {
+      "epoch": 2.0048504446240907,
+      "grad_norm": Infinity,
+      "learning_rate": 1.6585825922931824e-05,
+      "loss": 0.9174,
+      "step": 2480
+    },
+    {
+      "epoch": 2.0129345189975747,
+      "grad_norm": Infinity,
+      "learning_rate": 1.6451091350040422e-05,
+      "loss": 1.5876,
+      "step": 2490
+    },
+    {
+      "epoch": 2.021018593371059,
+      "grad_norm": Infinity,
+      "learning_rate": 1.6316356777149017e-05,
+      "loss": 1.129,
+      "step": 2500
+    },
+    {
+      "epoch": 2.029102667744543,
+      "grad_norm": Infinity,
+      "learning_rate": 1.6181622204257612e-05,
+      "loss": 1.6202,
+      "step": 2510
+    },
+    {
+      "epoch": 2.0371867421180276,
+      "grad_norm": Infinity,
+      "learning_rate": 1.6046887631366207e-05,
+      "loss": 0.9087,
+      "step": 2520
+    },
+    {
+      "epoch": 2.0452708164915117,
+      "grad_norm": Infinity,
+      "learning_rate": 1.5912153058474806e-05,
+      "loss": 2.0087,
+      "step": 2530
+    },
+    {
+      "epoch": 2.053354890864996,
+      "grad_norm": Infinity,
+      "learning_rate": 1.57774184855834e-05,
+      "loss": 2.1463,
+      "step": 2540
+    },
+    {
+      "epoch": 2.06143896523848,
+      "grad_norm": Infinity,
+      "learning_rate": 1.5642683912691996e-05,
+      "loss": 1.7217,
+      "step": 2550
+    },
+    {
+      "epoch": 2.0695230396119646,
+      "grad_norm": Infinity,
+      "learning_rate": 1.5507949339800594e-05,
+      "loss": 0.7802,
+      "step": 2560
+    },
+    {
+      "epoch": 2.0776071139854486,
+      "grad_norm": Infinity,
+      "learning_rate": 1.537321476690919e-05,
+      "loss": 2.4857,
+      "step": 2570
+    },
+    {
+      "epoch": 2.085691188358933,
+      "grad_norm": Infinity,
+      "learning_rate": 1.5238480194017784e-05,
+      "loss": 1.5868,
+      "step": 2580
+    },
+    {
+      "epoch": 2.093775262732417,
+      "grad_norm": Infinity,
+      "learning_rate": 1.510374562112638e-05,
+      "loss": 2.8292,
+      "step": 2590
+    },
+    {
+      "epoch": 2.1018593371059016,
+      "grad_norm": Infinity,
+      "learning_rate": 1.4969011048234976e-05,
+      "loss": 1.4174,
+      "step": 2600
+    },
+    {
+      "epoch": 2.1099434114793856,
+      "grad_norm": Infinity,
+      "learning_rate": 1.4834276475343573e-05,
+      "loss": 1.4931,
+      "step": 2610
+    },
+    {
+      "epoch": 2.11802748585287,
+      "grad_norm": Infinity,
+      "learning_rate": 1.469954190245217e-05,
+      "loss": 1.1888,
+      "step": 2620
+    },
+    {
+      "epoch": 2.126111560226354,
+      "grad_norm": Infinity,
+      "learning_rate": 1.4564807329560768e-05,
+      "loss": 2.5423,
+      "step": 2630
+    },
+    {
+      "epoch": 2.1341956345998385,
+      "grad_norm": Infinity,
+      "learning_rate": 1.4430072756669363e-05,
+      "loss": 1.0222,
+      "step": 2640
+    },
+    {
+      "epoch": 2.1422797089733225,
+      "grad_norm": Infinity,
+      "learning_rate": 1.429533818377796e-05,
+      "loss": 1.2646,
+      "step": 2650
+    },
+    {
+      "epoch": 2.1503637833468066,
+      "grad_norm": Infinity,
+      "learning_rate": 1.4160603610886556e-05,
+      "loss": 0.8661,
+      "step": 2660
+    },
+    {
+      "epoch": 2.158447857720291,
+      "grad_norm": Infinity,
+      "learning_rate": 1.4025869037995151e-05,
+      "loss": 1.0685,
+      "step": 2670
+    },
+    {
+      "epoch": 2.1665319320937755,
+      "grad_norm": Infinity,
+      "learning_rate": 1.3891134465103748e-05,
+      "loss": 1.6561,
+      "step": 2680
+    },
+    {
+      "epoch": 2.1746160064672595,
+      "grad_norm": Infinity,
+      "learning_rate": 1.3756399892212343e-05,
+      "loss": 1.4664,
+      "step": 2690
+    },
+    {
+      "epoch": 2.1827000808407435,
+      "grad_norm": Infinity,
+      "learning_rate": 1.362166531932094e-05,
+      "loss": 1.8332,
+      "step": 2700
+    },
+    {
+      "epoch": 2.190784155214228,
+      "grad_norm": Infinity,
+      "learning_rate": 1.3486930746429535e-05,
+      "loss": 1.2506,
+      "step": 2710
+    },
+    {
+      "epoch": 2.1988682295877124,
+      "grad_norm": Infinity,
+      "learning_rate": 1.3352196173538131e-05,
+      "loss": 1.1132,
+      "step": 2720
+    },
+    {
+      "epoch": 2.2069523039611965,
+      "grad_norm": Infinity,
+      "learning_rate": 1.3217461600646728e-05,
+      "loss": 0.8307,
+      "step": 2730
+    },
+    {
+      "epoch": 2.2150363783346805,
+      "grad_norm": Infinity,
+      "learning_rate": 1.3082727027755323e-05,
+      "loss": 1.706,
+      "step": 2740
+    },
+    {
+      "epoch": 2.223120452708165,
+      "grad_norm": Infinity,
+      "learning_rate": 1.294799245486392e-05,
+      "loss": 1.2033,
+      "step": 2750
+    },
+    {
+      "epoch": 2.231204527081649,
+      "grad_norm": Infinity,
+      "learning_rate": 1.2813257881972515e-05,
+      "loss": 1.5921,
+      "step": 2760
+    },
+    {
+      "epoch": 2.2392886014551334,
+      "grad_norm": Infinity,
+      "learning_rate": 1.2678523309081111e-05,
+      "loss": 1.1893,
+      "step": 2770
+    },
+    {
+      "epoch": 2.2473726758286174,
+      "grad_norm": Infinity,
+      "learning_rate": 1.2543788736189706e-05,
+      "loss": 1.9335,
+      "step": 2780
+    },
+    {
+      "epoch": 2.255456750202102,
+      "grad_norm": Infinity,
+      "learning_rate": 1.2409054163298303e-05,
+      "loss": 1.5802,
+      "step": 2790
+    },
+    {
+      "epoch": 2.263540824575586,
+      "grad_norm": Infinity,
+      "learning_rate": 1.22743195904069e-05,
+      "loss": 2.0087,
+      "step": 2800
+    },
+    {
+      "epoch": 2.2716248989490704,
+      "grad_norm": Infinity,
+      "learning_rate": 1.2139585017515495e-05,
+      "loss": 1.1383,
+      "step": 2810
+    },
+    {
+      "epoch": 2.2797089733225544,
+      "grad_norm": Infinity,
+      "learning_rate": 1.2004850444624092e-05,
+      "loss": 1.3281,
+      "step": 2820
+    },
+    {
+      "epoch": 2.287793047696039,
+      "grad_norm": Infinity,
+      "learning_rate": 1.1870115871732687e-05,
+      "loss": 1.0895,
+      "step": 2830
+    },
+    {
+      "epoch": 2.295877122069523,
+      "grad_norm": Infinity,
+      "learning_rate": 1.1735381298841283e-05,
+      "loss": 1.0182,
+      "step": 2840
+    },
+    {
+      "epoch": 2.3039611964430073,
+      "grad_norm": Infinity,
+      "learning_rate": 1.1600646725949878e-05,
+      "loss": 1.6568,
+      "step": 2850
+    },
+    {
+      "epoch": 2.3120452708164914,
+      "grad_norm": Infinity,
+      "learning_rate": 1.1465912153058475e-05,
+      "loss": 1.6142,
+      "step": 2860
+    },
+    {
+      "epoch": 2.320129345189976,
+      "grad_norm": Infinity,
+      "learning_rate": 1.1331177580167072e-05,
+      "loss": 1.9218,
+      "step": 2870
+    },
+    {
+      "epoch": 2.32821341956346,
+      "grad_norm": Infinity,
+      "learning_rate": 1.1196443007275667e-05,
+      "loss": 1.6749,
+      "step": 2880
+    },
+    {
+      "epoch": 2.3362974939369443,
+      "grad_norm": Infinity,
+      "learning_rate": 1.1061708434384263e-05,
+      "loss": 0.6671,
+      "step": 2890
+    },
+    {
+      "epoch": 2.3443815683104283,
+      "grad_norm": Infinity,
+      "learning_rate": 1.0926973861492859e-05,
+      "loss": 1.8723,
+      "step": 2900
+    },
+    {
+      "epoch": 2.352465642683913,
+      "grad_norm": Infinity,
+      "learning_rate": 1.0792239288601455e-05,
+      "loss": 1.8936,
+      "step": 2910
+    },
+    {
+      "epoch": 2.360549717057397,
+      "grad_norm": Infinity,
+      "learning_rate": 1.065750471571005e-05,
+      "loss": 1.7514,
+      "step": 2920
+    },
+    {
+      "epoch": 2.3686337914308813,
+      "grad_norm": Infinity,
+      "learning_rate": 1.0522770142818649e-05,
+      "loss": 1.6109,
+      "step": 2930
+    },
+    {
+      "epoch": 2.3767178658043653,
+      "grad_norm": Infinity,
+      "learning_rate": 1.0388035569927244e-05,
+      "loss": 2.3036,
+      "step": 2940
+    },
+    {
+      "epoch": 2.3848019401778497,
+      "grad_norm": Infinity,
+      "learning_rate": 1.025330099703584e-05,
+      "loss": 1.6592,
+      "step": 2950
+    },
+    {
+      "epoch": 2.3928860145513338,
+      "grad_norm": Infinity,
+      "learning_rate": 1.0118566424144437e-05,
+      "loss": 1.64,
+      "step": 2960
+    },
+    {
+      "epoch": 2.4009700889248182,
+      "grad_norm": Infinity,
+      "learning_rate": 9.983831851253032e-06,
+      "loss": 2.0912,
+      "step": 2970
+    },
+    {
+      "epoch": 2.4090541632983022,
+      "grad_norm": Infinity,
+      "learning_rate": 9.849097278361629e-06,
+      "loss": 1.4023,
+      "step": 2980
+    },
+    {
+      "epoch": 2.4171382376717867,
+      "grad_norm": Infinity,
+      "learning_rate": 9.714362705470224e-06,
+      "loss": 1.8342,
+      "step": 2990
+    },
+    {
+      "epoch": 2.4252223120452707,
+      "grad_norm": Infinity,
+      "learning_rate": 9.57962813257882e-06,
+      "loss": 0.9314,
+      "step": 3000
+    },
+    {
+      "epoch": 2.433306386418755,
+      "grad_norm": Infinity,
+      "learning_rate": 9.444893559687416e-06,
+      "loss": 1.271,
+      "step": 3010
+    },
+    {
+      "epoch": 2.441390460792239,
+      "grad_norm": Infinity,
+      "learning_rate": 9.310158986796012e-06,
+      "loss": 2.358,
+      "step": 3020
+    },
+    {
+      "epoch": 2.4494745351657237,
+      "grad_norm": Infinity,
+      "learning_rate": 9.175424413904609e-06,
+      "loss": 1.9627,
+      "step": 3030
+    },
+    {
+      "epoch": 2.4575586095392077,
+      "grad_norm": Infinity,
+      "learning_rate": 9.040689841013204e-06,
+      "loss": 1.1173,
+      "step": 3040
+    },
+    {
+      "epoch": 2.465642683912692,
+      "grad_norm": Infinity,
+      "learning_rate": 8.9059552681218e-06,
+      "loss": 0.91,
+      "step": 3050
+    },
+    {
+      "epoch": 2.473726758286176,
+      "grad_norm": Infinity,
+      "learning_rate": 8.771220695230396e-06,
+      "loss": 0.8115,
+      "step": 3060
+    },
+    {
+      "epoch": 2.4818108326596606,
+      "grad_norm": Infinity,
+      "learning_rate": 8.636486122338992e-06,
+      "loss": 1.3157,
+      "step": 3070
+    },
+    {
+      "epoch": 2.4898949070331446,
+      "grad_norm": Infinity,
+      "learning_rate": 8.501751549447589e-06,
+      "loss": 2.2141,
+      "step": 3080
+    },
+    {
+      "epoch": 2.497978981406629,
+      "grad_norm": Infinity,
+      "learning_rate": 8.367016976556184e-06,
+      "loss": 1.5275,
+      "step": 3090
+    },
+    {
+      "epoch": 2.506063055780113,
+      "grad_norm": Infinity,
+      "learning_rate": 8.23228240366478e-06,
+      "loss": 1.8859,
+      "step": 3100
+    },
+    {
+      "epoch": 2.5141471301535976,
+      "grad_norm": Infinity,
+      "learning_rate": 8.097547830773376e-06,
+      "loss": 1.6131,
+      "step": 3110
+    },
+    {
+      "epoch": 2.5222312045270816,
+      "grad_norm": Infinity,
+      "learning_rate": 7.962813257881973e-06,
+      "loss": 1.4654,
+      "step": 3120
+    },
+    {
+      "epoch": 2.5303152789005656,
+      "grad_norm": Infinity,
+      "learning_rate": 7.82807868499057e-06,
+      "loss": 1.4129,
+      "step": 3130
+    },
+    {
+      "epoch": 2.53839935327405,
+      "grad_norm": Infinity,
+      "learning_rate": 7.693344112099166e-06,
+      "loss": 1.2493,
+      "step": 3140
+    },
+    {
+      "epoch": 2.5464834276475345,
+      "grad_norm": Infinity,
+      "learning_rate": 7.558609539207762e-06,
+      "loss": 1.6047,
+      "step": 3150
+    },
+    {
+      "epoch": 2.5545675020210186,
+      "grad_norm": Infinity,
+      "learning_rate": 7.423874966316358e-06,
+      "loss": 0.6597,
+      "step": 3160
+    },
+    {
+      "epoch": 2.5626515763945026,
+      "grad_norm": Infinity,
+      "learning_rate": 7.2891403934249536e-06,
+      "loss": 1.8984,
+      "step": 3170
+    },
+    {
+      "epoch": 2.570735650767987,
+      "grad_norm": Infinity,
+      "learning_rate": 7.1544058205335494e-06,
+      "loss": 1.9384,
+      "step": 3180
+    },
+    {
+      "epoch": 2.5788197251414715,
+      "grad_norm": Infinity,
+      "learning_rate": 7.019671247642145e-06,
+      "loss": 2.2221,
+      "step": 3190
+    },
+    {
+      "epoch": 2.5869037995149555,
+      "grad_norm": Infinity,
+      "learning_rate": 6.884936674750741e-06,
+      "loss": 1.1601,
+      "step": 3200
+    },
+    {
+      "epoch": 2.5949878738884395,
+      "grad_norm": Infinity,
+      "learning_rate": 6.750202101859338e-06,
+      "loss": 2.0688,
+      "step": 3210
+    },
+    {
+      "epoch": 2.603071948261924,
+      "grad_norm": Infinity,
+      "learning_rate": 6.615467528967934e-06,
+      "loss": 1.5725,
+      "step": 3220
+    },
+    {
+      "epoch": 2.6111560226354085,
+      "grad_norm": Infinity,
+      "learning_rate": 6.48073295607653e-06,
+      "loss": 1.9509,
+      "step": 3230
+    },
+    {
+      "epoch": 2.6192400970088925,
+      "grad_norm": Infinity,
+      "learning_rate": 6.3459983831851255e-06,
+      "loss": 1.8586,
+      "step": 3240
+    },
+    {
+      "epoch": 2.6273241713823765,
+      "grad_norm": Infinity,
+      "learning_rate": 6.211263810293721e-06,
+      "loss": 1.516,
+      "step": 3250
+    },
+    {
+      "epoch": 2.635408245755861,
+      "grad_norm": Infinity,
+      "learning_rate": 6.076529237402317e-06,
+      "loss": 1.5061,
+      "step": 3260
+    },
+    {
+      "epoch": 2.6434923201293454,
+      "grad_norm": Infinity,
+      "learning_rate": 5.941794664510914e-06,
+      "loss": 0.9186,
+      "step": 3270
+    },
+    {
+      "epoch": 2.6515763945028294,
+      "grad_norm": Infinity,
+      "learning_rate": 5.80706009161951e-06,
+      "loss": 1.5769,
+      "step": 3280
+    },
+    {
+      "epoch": 2.6596604688763135,
+      "grad_norm": Infinity,
+      "learning_rate": 5.6723255187281065e-06,
+      "loss": 2.253,
+      "step": 3290
+    },
+    {
+      "epoch": 2.667744543249798,
+      "grad_norm": Infinity,
+      "learning_rate": 5.537590945836702e-06,
+      "loss": 0.7681,
+      "step": 3300
+    },
+    {
+      "epoch": 2.6758286176232824,
+      "grad_norm": Infinity,
+      "learning_rate": 5.402856372945298e-06,
+      "loss": 0.967,
+      "step": 3310
+    },
+    {
+      "epoch": 2.6839126919967664,
+      "grad_norm": Infinity,
+      "learning_rate": 5.268121800053894e-06,
+      "loss": 2.1412,
+      "step": 3320
+    },
+    {
+      "epoch": 2.6919967663702504,
+      "grad_norm": Infinity,
+      "learning_rate": 5.13338722716249e-06,
+      "loss": 3.0335,
+      "step": 3330
+    },
+    {
+      "epoch": 2.700080840743735,
+      "grad_norm": Infinity,
+      "learning_rate": 4.998652654271086e-06,
+      "loss": 1.7116,
+      "step": 3340
+    },
+    {
+      "epoch": 2.7081649151172194,
+      "grad_norm": Infinity,
+      "learning_rate": 4.8639180813796825e-06,
+      "loss": 1.5722,
+      "step": 3350
+    },
+    {
+      "epoch": 2.7162489894907034,
+      "grad_norm": Infinity,
+      "learning_rate": 4.729183508488278e-06,
+      "loss": 0.9447,
+      "step": 3360
+    },
+    {
+      "epoch": 2.7243330638641874,
+      "grad_norm": Infinity,
+      "learning_rate": 4.594448935596874e-06,
+      "loss": 1.5505,
+      "step": 3370
+    },
+    {
+      "epoch": 2.732417138237672,
+      "grad_norm": Infinity,
+      "learning_rate": 4.459714362705471e-06,
+      "loss": 1.4774,
+      "step": 3380
+    },
+    {
+      "epoch": 2.740501212611156,
+      "grad_norm": Infinity,
+      "learning_rate": 4.324979789814067e-06,
+      "loss": 0.9662,
+      "step": 3390
+    },
+    {
+      "epoch": 2.7485852869846403,
+      "grad_norm": Infinity,
+      "learning_rate": 4.190245216922663e-06,
+      "loss": 1.3972,
+      "step": 3400
+    },
+    {
+      "epoch": 2.7566693613581243,
+      "grad_norm": Infinity,
+      "learning_rate": 4.0555106440312585e-06,
+      "loss": 1.9129,
+      "step": 3410
+    },
+    {
+      "epoch": 2.764753435731609,
+      "grad_norm": Infinity,
+      "learning_rate": 3.920776071139854e-06,
+      "loss": 1.3831,
+      "step": 3420
+    },
+    {
+      "epoch": 2.772837510105093,
+      "grad_norm": Infinity,
+      "learning_rate": 3.7860414982484507e-06,
+      "loss": 1.2399,
+      "step": 3430
+    },
+    {
+      "epoch": 2.7809215844785773,
+      "grad_norm": Infinity,
+      "learning_rate": 3.6513069253570465e-06,
+      "loss": 2.1903,
+      "step": 3440
+    },
+    {
+      "epoch": 2.7890056588520613,
+      "grad_norm": Infinity,
+      "learning_rate": 3.516572352465643e-06,
+      "loss": 2.2974,
+      "step": 3450
+    },
+    {
+      "epoch": 2.7970897332255458,
+      "grad_norm": Infinity,
+      "learning_rate": 3.3818377795742387e-06,
+      "loss": 0.904,
+      "step": 3460
+    },
+    {
+      "epoch": 2.80517380759903,
+      "grad_norm": Infinity,
+      "learning_rate": 3.2471032066828345e-06,
+      "loss": 2.1944,
+      "step": 3470
+    },
+    {
+      "epoch": 2.8132578819725143,
+      "grad_norm": Infinity,
+      "learning_rate": 3.112368633791431e-06,
+      "loss": 0.5356,
+      "step": 3480
+    },
+    {
+      "epoch": 2.8213419563459983,
+      "grad_norm": Infinity,
+      "learning_rate": 2.977634060900027e-06,
+      "loss": 1.952,
+      "step": 3490
+    },
+    {
+      "epoch": 2.8294260307194827,
+      "grad_norm": Infinity,
+      "learning_rate": 2.842899488008623e-06,
+      "loss": 0.8518,
+      "step": 3500
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 3711,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4635626569728000.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

norah_lora/checkpoint-3500/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9f7af70a1c91c1728aececa3729ab0591b5edd689612a906672255dbec45ed35
+size 5304

norah_lora/checkpoint-3711/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: Visdom9/Norah
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.14.0

norah_lora/checkpoint-3711/adapter_config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "Visdom9/Norah",
+  "bias": "none",
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

norah_lora/checkpoint-3711/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6d59cff2a6bd8bd4549db675d631da3cdb3d83feba06da5fefc2970ca60dd38c
+size 1284192

norah_lora/checkpoint-3711/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b502946018db5f952bdf926c2b2311bc874507d1378725550a2a3b2e5a2b1bdd
+size 2595258

norah_lora/checkpoint-3711/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f6c6895b530907b08bbabfbbbd7bc9909b14d8b6d23c5d37900c2359ecd83b5a
+size 13990

norah_lora/checkpoint-3711/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5be38479cc6d4d7b05065d5ca01d483e46f63525134f3c6696acddfb309dee56
+size 1064

norah_lora/checkpoint-3711/trainer_state.json ADDED Viewed

	@@ -0,0 +1,2630 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.0,
+  "eval_steps": 500,
+  "global_step": 3711,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.008084074373484237,
+      "grad_norm": Infinity,
+      "learning_rate": 4.98652654271086e-05,
+      "loss": 1.3186,
+      "step": 10
+    },
+    {
+      "epoch": 0.016168148746968473,
+      "grad_norm": Infinity,
+      "learning_rate": 4.973053085421719e-05,
+      "loss": 1.6363,
+      "step": 20
+    },
+    {
+      "epoch": 0.024252223120452707,
+      "grad_norm": Infinity,
+      "learning_rate": 4.959579628132579e-05,
+      "loss": 1.8313,
+      "step": 30
+    },
+    {
+      "epoch": 0.03233629749393695,
+      "grad_norm": Infinity,
+      "learning_rate": 4.946106170843439e-05,
+      "loss": 1.646,
+      "step": 40
+    },
+    {
+      "epoch": 0.04042037186742118,
+      "grad_norm": Infinity,
+      "learning_rate": 4.932632713554298e-05,
+      "loss": 1.4233,
+      "step": 50
+    },
+    {
+      "epoch": 0.04850444624090541,
+      "grad_norm": Infinity,
+      "learning_rate": 4.919159256265158e-05,
+      "loss": 1.8705,
+      "step": 60
+    },
+    {
+      "epoch": 0.056588520614389654,
+      "grad_norm": Infinity,
+      "learning_rate": 4.905685798976018e-05,
+      "loss": 1.7552,
+      "step": 70
+    },
+    {
+      "epoch": 0.0646725949878739,
+      "grad_norm": Infinity,
+      "learning_rate": 4.892212341686877e-05,
+      "loss": 1.1921,
+      "step": 80
+    },
+    {
+      "epoch": 0.07275666936135812,
+      "grad_norm": Infinity,
+      "learning_rate": 4.878738884397737e-05,
+      "loss": 1.4613,
+      "step": 90
+    },
+    {
+      "epoch": 0.08084074373484236,
+      "grad_norm": Infinity,
+      "learning_rate": 4.865265427108596e-05,
+      "loss": 0.6314,
+      "step": 100
+    },
+    {
+      "epoch": 0.0889248181083266,
+      "grad_norm": Infinity,
+      "learning_rate": 4.851791969819456e-05,
+      "loss": 1.8646,
+      "step": 110
+    },
+    {
+      "epoch": 0.09700889248181083,
+      "grad_norm": Infinity,
+      "learning_rate": 4.8383185125303156e-05,
+      "loss": 1.4315,
+      "step": 120
+    },
+    {
+      "epoch": 0.10509296685529507,
+      "grad_norm": Infinity,
+      "learning_rate": 4.824845055241175e-05,
+      "loss": 2.6826,
+      "step": 130
+    },
+    {
+      "epoch": 0.11317704122877931,
+      "grad_norm": Infinity,
+      "learning_rate": 4.8113715979520346e-05,
+      "loss": 2.2289,
+      "step": 140
+    },
+    {
+      "epoch": 0.12126111560226355,
+      "grad_norm": Infinity,
+      "learning_rate": 4.7978981406628945e-05,
+      "loss": 1.6823,
+      "step": 150
+    },
+    {
+      "epoch": 0.1293451899757478,
+      "grad_norm": Infinity,
+      "learning_rate": 4.7844246833737536e-05,
+      "loss": 0.7194,
+      "step": 160
+    },
+    {
+      "epoch": 0.137429264349232,
+      "grad_norm": Infinity,
+      "learning_rate": 4.7709512260846135e-05,
+      "loss": 2.28,
+      "step": 170
+    },
+    {
+      "epoch": 0.14551333872271624,
+      "grad_norm": Infinity,
+      "learning_rate": 4.757477768795473e-05,
+      "loss": 1.156,
+      "step": 180
+    },
+    {
+      "epoch": 0.15359741309620048,
+      "grad_norm": Infinity,
+      "learning_rate": 4.7440043115063325e-05,
+      "loss": 2.0865,
+      "step": 190
+    },
+    {
+      "epoch": 0.16168148746968472,
+      "grad_norm": Infinity,
+      "learning_rate": 4.730530854217192e-05,
+      "loss": 1.7647,
+      "step": 200
+    },
+    {
+      "epoch": 0.16976556184316896,
+      "grad_norm": Infinity,
+      "learning_rate": 4.717057396928052e-05,
+      "loss": 2.3384,
+      "step": 210
+    },
+    {
+      "epoch": 0.1778496362166532,
+      "grad_norm": Infinity,
+      "learning_rate": 4.703583939638911e-05,
+      "loss": 2.5532,
+      "step": 220
+    },
+    {
+      "epoch": 0.18593371059013744,
+      "grad_norm": Infinity,
+      "learning_rate": 4.690110482349771e-05,
+      "loss": 1.016,
+      "step": 230
+    },
+    {
+      "epoch": 0.19401778496362165,
+      "grad_norm": Infinity,
+      "learning_rate": 4.67663702506063e-05,
+      "loss": 2.1508,
+      "step": 240
+    },
+    {
+      "epoch": 0.2021018593371059,
+      "grad_norm": Infinity,
+      "learning_rate": 4.66316356777149e-05,
+      "loss": 1.267,
+      "step": 250
+    },
+    {
+      "epoch": 0.21018593371059013,
+      "grad_norm": Infinity,
+      "learning_rate": 4.64969011048235e-05,
+      "loss": 1.28,
+      "step": 260
+    },
+    {
+      "epoch": 0.21827000808407437,
+      "grad_norm": Infinity,
+      "learning_rate": 4.636216653193209e-05,
+      "loss": 1.6061,
+      "step": 270
+    },
+    {
+      "epoch": 0.22635408245755861,
+      "grad_norm": Infinity,
+      "learning_rate": 4.622743195904069e-05,
+      "loss": 0.7908,
+      "step": 280
+    },
+    {
+      "epoch": 0.23443815683104285,
+      "grad_norm": Infinity,
+      "learning_rate": 4.609269738614929e-05,
+      "loss": 1.8026,
+      "step": 290
+    },
+    {
+      "epoch": 0.2425222312045271,
+      "grad_norm": Infinity,
+      "learning_rate": 4.595796281325788e-05,
+      "loss": 1.2974,
+      "step": 300
+    },
+    {
+      "epoch": 0.25060630557801133,
+      "grad_norm": Infinity,
+      "learning_rate": 4.582322824036648e-05,
+      "loss": 1.885,
+      "step": 310
+    },
+    {
+      "epoch": 0.2586903799514956,
+      "grad_norm": Infinity,
+      "learning_rate": 4.568849366747508e-05,
+      "loss": 1.2212,
+      "step": 320
+    },
+    {
+      "epoch": 0.2667744543249798,
+      "grad_norm": Infinity,
+      "learning_rate": 4.555375909458367e-05,
+      "loss": 1.5874,
+      "step": 330
+    },
+    {
+      "epoch": 0.274858528698464,
+      "grad_norm": Infinity,
+      "learning_rate": 4.541902452169227e-05,
+      "loss": 0.8497,
+      "step": 340
+    },
+    {
+      "epoch": 0.28294260307194824,
+      "grad_norm": Infinity,
+      "learning_rate": 4.5284289948800865e-05,
+      "loss": 1.1971,
+      "step": 350
+    },
+    {
+      "epoch": 0.2910266774454325,
+      "grad_norm": Infinity,
+      "learning_rate": 4.514955537590946e-05,
+      "loss": 2.5525,
+      "step": 360
+    },
+    {
+      "epoch": 0.2991107518189167,
+      "grad_norm": Infinity,
+      "learning_rate": 4.5014820803018055e-05,
+      "loss": 0.5229,
+      "step": 370
+    },
+    {
+      "epoch": 0.30719482619240096,
+      "grad_norm": Infinity,
+      "learning_rate": 4.488008623012665e-05,
+      "loss": 1.9758,
+      "step": 380
+    },
+    {
+      "epoch": 0.3152789005658852,
+      "grad_norm": Infinity,
+      "learning_rate": 4.4745351657235245e-05,
+      "loss": 1.5789,
+      "step": 390
+    },
+    {
+      "epoch": 0.32336297493936944,
+      "grad_norm": Infinity,
+      "learning_rate": 4.4610617084343844e-05,
+      "loss": 2.2642,
+      "step": 400
+    },
+    {
+      "epoch": 0.3314470493128537,
+      "grad_norm": Infinity,
+      "learning_rate": 4.447588251145244e-05,
+      "loss": 2.1261,
+      "step": 410
+    },
+    {
+      "epoch": 0.3395311236863379,
+      "grad_norm": Infinity,
+      "learning_rate": 4.434114793856104e-05,
+      "loss": 2.9391,
+      "step": 420
+    },
+    {
+      "epoch": 0.34761519805982216,
+      "grad_norm": Infinity,
+      "learning_rate": 4.420641336566964e-05,
+      "loss": 1.7748,
+      "step": 430
+    },
+    {
+      "epoch": 0.3556992724333064,
+      "grad_norm": Infinity,
+      "learning_rate": 4.407167879277823e-05,
+      "loss": 1.3519,
+      "step": 440
+    },
+    {
+      "epoch": 0.36378334680679064,
+      "grad_norm": Infinity,
+      "learning_rate": 4.393694421988683e-05,
+      "loss": 1.6652,
+      "step": 450
+    },
+    {
+      "epoch": 0.3718674211802749,
+      "grad_norm": Infinity,
+      "learning_rate": 4.380220964699542e-05,
+      "loss": 1.7532,
+      "step": 460
+    },
+    {
+      "epoch": 0.3799514955537591,
+      "grad_norm": Infinity,
+      "learning_rate": 4.366747507410402e-05,
+      "loss": 1.8121,
+      "step": 470
+    },
+    {
+      "epoch": 0.3880355699272433,
+      "grad_norm": Infinity,
+      "learning_rate": 4.353274050121262e-05,
+      "loss": 1.0565,
+      "step": 480
+    },
+    {
+      "epoch": 0.39611964430072755,
+      "grad_norm": Infinity,
+      "learning_rate": 4.339800592832121e-05,
+      "loss": 2.5515,
+      "step": 490
+    },
+    {
+      "epoch": 0.4042037186742118,
+      "grad_norm": Infinity,
+      "learning_rate": 4.326327135542981e-05,
+      "loss": 1.8491,
+      "step": 500
+    },
+    {
+      "epoch": 0.412287793047696,
+      "grad_norm": Infinity,
+      "learning_rate": 4.3128536782538406e-05,
+      "loss": 1.3268,
+      "step": 510
+    },
+    {
+      "epoch": 0.42037186742118027,
+      "grad_norm": Infinity,
+      "learning_rate": 4.2993802209647e-05,
+      "loss": 2.3801,
+      "step": 520
+    },
+    {
+      "epoch": 0.4284559417946645,
+      "grad_norm": Infinity,
+      "learning_rate": 4.2859067636755596e-05,
+      "loss": 2.3338,
+      "step": 530
+    },
+    {
+      "epoch": 0.43654001616814875,
+      "grad_norm": Infinity,
+      "learning_rate": 4.2724333063864194e-05,
+      "loss": 1.5153,
+      "step": 540
+    },
+    {
+      "epoch": 0.444624090541633,
+      "grad_norm": Infinity,
+      "learning_rate": 4.2589598490972786e-05,
+      "loss": 0.8897,
+      "step": 550
+    },
+    {
+      "epoch": 0.45270816491511723,
+      "grad_norm": Infinity,
+      "learning_rate": 4.2454863918081384e-05,
+      "loss": 0.8557,
+      "step": 560
+    },
+    {
+      "epoch": 0.46079223928860147,
+      "grad_norm": Infinity,
+      "learning_rate": 4.232012934518998e-05,
+      "loss": 1.021,
+      "step": 570
+    },
+    {
+      "epoch": 0.4688763136620857,
+      "grad_norm": Infinity,
+      "learning_rate": 4.2185394772298574e-05,
+      "loss": 1.3295,
+      "step": 580
+    },
+    {
+      "epoch": 0.47696038803556995,
+      "grad_norm": Infinity,
+      "learning_rate": 4.205066019940717e-05,
+      "loss": 2.0716,
+      "step": 590
+    },
+    {
+      "epoch": 0.4850444624090542,
+      "grad_norm": Infinity,
+      "learning_rate": 4.1915925626515764e-05,
+      "loss": 2.5046,
+      "step": 600
+    },
+    {
+      "epoch": 0.4931285367825384,
+      "grad_norm": Infinity,
+      "learning_rate": 4.178119105362436e-05,
+      "loss": 1.4814,
+      "step": 610
+    },
+    {
+      "epoch": 0.5012126111560227,
+      "grad_norm": Infinity,
+      "learning_rate": 4.164645648073296e-05,
+      "loss": 1.5643,
+      "step": 620
+    },
+    {
+      "epoch": 0.5092966855295069,
+      "grad_norm": Infinity,
+      "learning_rate": 4.151172190784155e-05,
+      "loss": 1.4721,
+      "step": 630
+    },
+    {
+      "epoch": 0.5173807599029911,
+      "grad_norm": Infinity,
+      "learning_rate": 4.137698733495015e-05,
+      "loss": 2.1584,
+      "step": 640
+    },
+    {
+      "epoch": 0.5254648342764754,
+      "grad_norm": Infinity,
+      "learning_rate": 4.124225276205875e-05,
+      "loss": 0.9858,
+      "step": 650
+    },
+    {
+      "epoch": 0.5335489086499596,
+      "grad_norm": Infinity,
+      "learning_rate": 4.110751818916734e-05,
+      "loss": 2.2872,
+      "step": 660
+    },
+    {
+      "epoch": 0.5416329830234439,
+      "grad_norm": Infinity,
+      "learning_rate": 4.097278361627594e-05,
+      "loss": 1.8746,
+      "step": 670
+    },
+    {
+      "epoch": 0.549717057396928,
+      "grad_norm": Infinity,
+      "learning_rate": 4.083804904338454e-05,
+      "loss": 1.5985,
+      "step": 680
+    },
+    {
+      "epoch": 0.5578011317704122,
+      "grad_norm": Infinity,
+      "learning_rate": 4.070331447049313e-05,
+      "loss": 1.4411,
+      "step": 690
+    },
+    {
+      "epoch": 0.5658852061438965,
+      "grad_norm": Infinity,
+      "learning_rate": 4.056857989760173e-05,
+      "loss": 1.6405,
+      "step": 700
+    },
+    {
+      "epoch": 0.5739692805173807,
+      "grad_norm": Infinity,
+      "learning_rate": 4.0433845324710326e-05,
+      "loss": 0.9719,
+      "step": 710
+    },
+    {
+      "epoch": 0.582053354890865,
+      "grad_norm": Infinity,
+      "learning_rate": 4.029911075181892e-05,
+      "loss": 0.8405,
+      "step": 720
+    },
+    {
+      "epoch": 0.5901374292643492,
+      "grad_norm": Infinity,
+      "learning_rate": 4.0164376178927516e-05,
+      "loss": 0.5547,
+      "step": 730
+    },
+    {
+      "epoch": 0.5982215036378334,
+      "grad_norm": Infinity,
+      "learning_rate": 4.002964160603611e-05,
+      "loss": 1.0534,
+      "step": 740
+    },
+    {
+      "epoch": 0.6063055780113177,
+      "grad_norm": Infinity,
+      "learning_rate": 3.9894907033144707e-05,
+      "loss": 0.827,
+      "step": 750
+    },
+    {
+      "epoch": 0.6143896523848019,
+      "grad_norm": Infinity,
+      "learning_rate": 3.9760172460253305e-05,
+      "loss": 2.7736,
+      "step": 760
+    },
+    {
+      "epoch": 0.6224737267582862,
+      "grad_norm": Infinity,
+      "learning_rate": 3.9625437887361897e-05,
+      "loss": 1.637,
+      "step": 770
+    },
+    {
+      "epoch": 0.6305578011317704,
+      "grad_norm": Infinity,
+      "learning_rate": 3.9490703314470495e-05,
+      "loss": 2.0057,
+      "step": 780
+    },
+    {
+      "epoch": 0.6386418755052546,
+      "grad_norm": Infinity,
+      "learning_rate": 3.935596874157909e-05,
+      "loss": 1.1342,
+      "step": 790
+    },
+    {
+      "epoch": 0.6467259498787389,
+      "grad_norm": Infinity,
+      "learning_rate": 3.9221234168687685e-05,
+      "loss": 0.9694,
+      "step": 800
+    },
+    {
+      "epoch": 0.6548100242522231,
+      "grad_norm": Infinity,
+      "learning_rate": 3.908649959579628e-05,
+      "loss": 2.2358,
+      "step": 810
+    },
+    {
+      "epoch": 0.6628940986257074,
+      "grad_norm": Infinity,
+      "learning_rate": 3.895176502290488e-05,
+      "loss": 0.7282,
+      "step": 820
+    },
+    {
+      "epoch": 0.6709781729991916,
+      "grad_norm": Infinity,
+      "learning_rate": 3.8817030450013473e-05,
+      "loss": 1.0698,
+      "step": 830
+    },
+    {
+      "epoch": 0.6790622473726758,
+      "grad_norm": Infinity,
+      "learning_rate": 3.868229587712207e-05,
+      "loss": 1.3923,
+      "step": 840
+    },
+    {
+      "epoch": 0.6871463217461601,
+      "grad_norm": Infinity,
+      "learning_rate": 3.854756130423067e-05,
+      "loss": 0.8056,
+      "step": 850
+    },
+    {
+      "epoch": 0.6952303961196443,
+      "grad_norm": Infinity,
+      "learning_rate": 3.841282673133926e-05,
+      "loss": 1.6625,
+      "step": 860
+    },
+    {
+      "epoch": 0.7033144704931286,
+      "grad_norm": Infinity,
+      "learning_rate": 3.827809215844786e-05,
+      "loss": 1.2065,
+      "step": 870
+    },
+    {
+      "epoch": 0.7113985448666128,
+      "grad_norm": Infinity,
+      "learning_rate": 3.814335758555645e-05,
+      "loss": 1.1378,
+      "step": 880
+    },
+    {
+      "epoch": 0.719482619240097,
+      "grad_norm": Infinity,
+      "learning_rate": 3.800862301266505e-05,
+      "loss": 1.4192,
+      "step": 890
+    },
+    {
+      "epoch": 0.7275666936135813,
+      "grad_norm": Infinity,
+      "learning_rate": 3.787388843977365e-05,
+      "loss": 1.2827,
+      "step": 900
+    },
+    {
+      "epoch": 0.7356507679870655,
+      "grad_norm": Infinity,
+      "learning_rate": 3.773915386688224e-05,
+      "loss": 0.5413,
+      "step": 910
+    },
+    {
+      "epoch": 0.7437348423605498,
+      "grad_norm": Infinity,
+      "learning_rate": 3.760441929399084e-05,
+      "loss": 1.524,
+      "step": 920
+    },
+    {
+      "epoch": 0.751818916734034,
+      "grad_norm": Infinity,
+      "learning_rate": 3.746968472109944e-05,
+      "loss": 2.3918,
+      "step": 930
+    },
+    {
+      "epoch": 0.7599029911075182,
+      "grad_norm": Infinity,
+      "learning_rate": 3.733495014820803e-05,
+      "loss": 1.4762,
+      "step": 940
+    },
+    {
+      "epoch": 0.7679870654810024,
+      "grad_norm": Infinity,
+      "learning_rate": 3.720021557531663e-05,
+      "loss": 1.2758,
+      "step": 950
+    },
+    {
+      "epoch": 0.7760711398544866,
+      "grad_norm": Infinity,
+      "learning_rate": 3.7065481002425226e-05,
+      "loss": 1.2247,
+      "step": 960
+    },
+    {
+      "epoch": 0.7841552142279709,
+      "grad_norm": Infinity,
+      "learning_rate": 3.693074642953382e-05,
+      "loss": 1.3217,
+      "step": 970
+    },
+    {
+      "epoch": 0.7922392886014551,
+      "grad_norm": Infinity,
+      "learning_rate": 3.6796011856642416e-05,
+      "loss": 1.0781,
+      "step": 980
+    },
+    {
+      "epoch": 0.8003233629749393,
+      "grad_norm": Infinity,
+      "learning_rate": 3.6661277283751014e-05,
+      "loss": 1.1996,
+      "step": 990
+    },
+    {
+      "epoch": 0.8084074373484236,
+      "grad_norm": Infinity,
+      "learning_rate": 3.6526542710859606e-05,
+      "loss": 1.8774,
+      "step": 1000
+    },
+    {
+      "epoch": 0.8164915117219078,
+      "grad_norm": Infinity,
+      "learning_rate": 3.6391808137968204e-05,
+      "loss": 0.4993,
+      "step": 1010
+    },
+    {
+      "epoch": 0.824575586095392,
+      "grad_norm": Infinity,
+      "learning_rate": 3.6257073565076796e-05,
+      "loss": 1.1835,
+      "step": 1020
+    },
+    {
+      "epoch": 0.8326596604688763,
+      "grad_norm": Infinity,
+      "learning_rate": 3.6122338992185394e-05,
+      "loss": 1.6,
+      "step": 1030
+    },
+    {
+      "epoch": 0.8407437348423605,
+      "grad_norm": Infinity,
+      "learning_rate": 3.598760441929399e-05,
+      "loss": 2.5379,
+      "step": 1040
+    },
+    {
+      "epoch": 0.8488278092158448,
+      "grad_norm": Infinity,
+      "learning_rate": 3.5852869846402584e-05,
+      "loss": 1.0088,
+      "step": 1050
+    },
+    {
+      "epoch": 0.856911883589329,
+      "grad_norm": Infinity,
+      "learning_rate": 3.571813527351118e-05,
+      "loss": 2.2007,
+      "step": 1060
+    },
+    {
+      "epoch": 0.8649959579628133,
+      "grad_norm": Infinity,
+      "learning_rate": 3.558340070061978e-05,
+      "loss": 1.3587,
+      "step": 1070
+    },
+    {
+      "epoch": 0.8730800323362975,
+      "grad_norm": Infinity,
+      "learning_rate": 3.544866612772837e-05,
+      "loss": 3.0178,
+      "step": 1080
+    },
+    {
+      "epoch": 0.8811641067097817,
+      "grad_norm": Infinity,
+      "learning_rate": 3.531393155483697e-05,
+      "loss": 1.7664,
+      "step": 1090
+    },
+    {
+      "epoch": 0.889248181083266,
+      "grad_norm": Infinity,
+      "learning_rate": 3.517919698194557e-05,
+      "loss": 0.8585,
+      "step": 1100
+    },
+    {
+      "epoch": 0.8973322554567502,
+      "grad_norm": Infinity,
+      "learning_rate": 3.504446240905416e-05,
+      "loss": 1.5722,
+      "step": 1110
+    },
+    {
+      "epoch": 0.9054163298302345,
+      "grad_norm": Infinity,
+      "learning_rate": 3.490972783616276e-05,
+      "loss": 2.0158,
+      "step": 1120
+    },
+    {
+      "epoch": 0.9135004042037187,
+      "grad_norm": Infinity,
+      "learning_rate": 3.477499326327136e-05,
+      "loss": 1.8439,
+      "step": 1130
+    },
+    {
+      "epoch": 0.9215844785772029,
+      "grad_norm": Infinity,
+      "learning_rate": 3.464025869037995e-05,
+      "loss": 1.7193,
+      "step": 1140
+    },
+    {
+      "epoch": 0.9296685529506872,
+      "grad_norm": Infinity,
+      "learning_rate": 3.450552411748855e-05,
+      "loss": 0.8563,
+      "step": 1150
+    },
+    {
+      "epoch": 0.9377526273241714,
+      "grad_norm": Infinity,
+      "learning_rate": 3.4370789544597146e-05,
+      "loss": 1.2554,
+      "step": 1160
+    },
+    {
+      "epoch": 0.9458367016976557,
+      "grad_norm": Infinity,
+      "learning_rate": 3.423605497170574e-05,
+      "loss": 1.2612,
+      "step": 1170
+    },
+    {
+      "epoch": 0.9539207760711399,
+      "grad_norm": Infinity,
+      "learning_rate": 3.4101320398814336e-05,
+      "loss": 0.6833,
+      "step": 1180
+    },
+    {
+      "epoch": 0.9620048504446241,
+      "grad_norm": Infinity,
+      "learning_rate": 3.396658582592293e-05,
+      "loss": 0.7645,
+      "step": 1190
+    },
+    {
+      "epoch": 0.9700889248181084,
+      "grad_norm": Infinity,
+      "learning_rate": 3.3831851253031526e-05,
+      "loss": 1.7546,
+      "step": 1200
+    },
+    {
+      "epoch": 0.9781729991915926,
+      "grad_norm": Infinity,
+      "learning_rate": 3.3697116680140125e-05,
+      "loss": 2.0247,
+      "step": 1210
+    },
+    {
+      "epoch": 0.9862570735650767,
+      "grad_norm": Infinity,
+      "learning_rate": 3.356238210724872e-05,
+      "loss": 0.8708,
+      "step": 1220
+    },
+    {
+      "epoch": 0.994341147938561,
+      "grad_norm": Infinity,
+      "learning_rate": 3.342764753435732e-05,
+      "loss": 2.1135,
+      "step": 1230
+    },
+    {
+      "epoch": 1.0024252223120453,
+      "grad_norm": Infinity,
+      "learning_rate": 3.329291296146591e-05,
+      "loss": 1.73,
+      "step": 1240
+    },
+    {
+      "epoch": 1.0105092966855296,
+      "grad_norm": Infinity,
+      "learning_rate": 3.315817838857451e-05,
+      "loss": 1.5269,
+      "step": 1250
+    },
+    {
+      "epoch": 1.0185933710590138,
+      "grad_norm": Infinity,
+      "learning_rate": 3.302344381568311e-05,
+      "loss": 1.3657,
+      "step": 1260
+    },
+    {
+      "epoch": 1.026677445432498,
+      "grad_norm": Infinity,
+      "learning_rate": 3.28887092427917e-05,
+      "loss": 1.3771,
+      "step": 1270
+    },
+    {
+      "epoch": 1.0347615198059823,
+      "grad_norm": Infinity,
+      "learning_rate": 3.27539746699003e-05,
+      "loss": 0.9131,
+      "step": 1280
+    },
+    {
+      "epoch": 1.0428455941794665,
+      "grad_norm": Infinity,
+      "learning_rate": 3.26192400970089e-05,
+      "loss": 0.844,
+      "step": 1290
+    },
+    {
+      "epoch": 1.0509296685529508,
+      "grad_norm": Infinity,
+      "learning_rate": 3.248450552411749e-05,
+      "loss": 1.4511,
+      "step": 1300
+    },
+    {
+      "epoch": 1.059013742926435,
+      "grad_norm": Infinity,
+      "learning_rate": 3.234977095122609e-05,
+      "loss": 2.0453,
+      "step": 1310
+    },
+    {
+      "epoch": 1.0670978172999193,
+      "grad_norm": Infinity,
+      "learning_rate": 3.221503637833469e-05,
+      "loss": 1.4035,
+      "step": 1320
+    },
+    {
+      "epoch": 1.0751818916734033,
+      "grad_norm": Infinity,
+      "learning_rate": 3.208030180544328e-05,
+      "loss": 1.5244,
+      "step": 1330
+    },
+    {
+      "epoch": 1.0832659660468877,
+      "grad_norm": Infinity,
+      "learning_rate": 3.194556723255188e-05,
+      "loss": 0.8892,
+      "step": 1340
+    },
+    {
+      "epoch": 1.0913500404203718,
+      "grad_norm": Infinity,
+      "learning_rate": 3.1810832659660475e-05,
+      "loss": 1.6417,
+      "step": 1350
+    },
+    {
+      "epoch": 1.0994341147938562,
+      "grad_norm": Infinity,
+      "learning_rate": 3.167609808676907e-05,
+      "loss": 2.1219,
+      "step": 1360
+    },
+    {
+      "epoch": 1.1075181891673402,
+      "grad_norm": Infinity,
+      "learning_rate": 3.1541363513877665e-05,
+      "loss": 2.4549,
+      "step": 1370
+    },
+    {
+      "epoch": 1.1156022635408245,
+      "grad_norm": Infinity,
+      "learning_rate": 3.140662894098626e-05,
+      "loss": 1.1852,
+      "step": 1380
+    },
+    {
+      "epoch": 1.1236863379143087,
+      "grad_norm": Infinity,
+      "learning_rate": 3.1271894368094855e-05,
+      "loss": 2.5192,
+      "step": 1390
+    },
+    {
+      "epoch": 1.131770412287793,
+      "grad_norm": Infinity,
+      "learning_rate": 3.1137159795203454e-05,
+      "loss": 1.0584,
+      "step": 1400
+    },
+    {
+      "epoch": 1.1398544866612772,
+      "grad_norm": Infinity,
+      "learning_rate": 3.1002425222312045e-05,
+      "loss": 1.9475,
+      "step": 1410
+    },
+    {
+      "epoch": 1.1479385610347614,
+      "grad_norm": Infinity,
+      "learning_rate": 3.0867690649420644e-05,
+      "loss": 1.3349,
+      "step": 1420
+    },
+    {
+      "epoch": 1.1560226354082457,
+      "grad_norm": Infinity,
+      "learning_rate": 3.073295607652924e-05,
+      "loss": 2.005,
+      "step": 1430
+    },
+    {
+      "epoch": 1.16410670978173,
+      "grad_norm": Infinity,
+      "learning_rate": 3.0598221503637834e-05,
+      "loss": 0.8468,
+      "step": 1440
+    },
+    {
+      "epoch": 1.1721907841552142,
+      "grad_norm": Infinity,
+      "learning_rate": 3.0463486930746432e-05,
+      "loss": 1.3994,
+      "step": 1450
+    },
+    {
+      "epoch": 1.1802748585286984,
+      "grad_norm": Infinity,
+      "learning_rate": 3.0328752357855027e-05,
+      "loss": 0.5119,
+      "step": 1460
+    },
+    {
+      "epoch": 1.1883589329021826,
+      "grad_norm": Infinity,
+      "learning_rate": 3.0194017784963626e-05,
+      "loss": 0.7779,
+      "step": 1470
+    },
+    {
+      "epoch": 1.1964430072756669,
+      "grad_norm": Infinity,
+      "learning_rate": 3.005928321207222e-05,
+      "loss": 1.7018,
+      "step": 1480
+    },
+    {
+      "epoch": 1.2045270816491511,
+      "grad_norm": Infinity,
+      "learning_rate": 2.9924548639180816e-05,
+      "loss": 1.3685,
+      "step": 1490
+    },
+    {
+      "epoch": 1.2126111560226354,
+      "grad_norm": Infinity,
+      "learning_rate": 2.978981406628941e-05,
+      "loss": 1.361,
+      "step": 1500
+    },
+    {
+      "epoch": 1.2206952303961196,
+      "grad_norm": Infinity,
+      "learning_rate": 2.965507949339801e-05,
+      "loss": 1.5077,
+      "step": 1510
+    },
+    {
+      "epoch": 1.2287793047696038,
+      "grad_norm": Infinity,
+      "learning_rate": 2.9520344920506604e-05,
+      "loss": 1.0513,
+      "step": 1520
+    },
+    {
+      "epoch": 1.236863379143088,
+      "grad_norm": Infinity,
+      "learning_rate": 2.93856103476152e-05,
+      "loss": 1.6926,
+      "step": 1530
+    },
+    {
+      "epoch": 1.2449474535165723,
+      "grad_norm": Infinity,
+      "learning_rate": 2.9250875774723797e-05,
+      "loss": 1.3084,
+      "step": 1540
+    },
+    {
+      "epoch": 1.2530315278900566,
+      "grad_norm": Infinity,
+      "learning_rate": 2.9116141201832392e-05,
+      "loss": 1.8298,
+      "step": 1550
+    },
+    {
+      "epoch": 1.2611156022635408,
+      "grad_norm": Infinity,
+      "learning_rate": 2.8981406628940987e-05,
+      "loss": 0.9793,
+      "step": 1560
+    },
+    {
+      "epoch": 1.269199676637025,
+      "grad_norm": Infinity,
+      "learning_rate": 2.8846672056049582e-05,
+      "loss": 1.4149,
+      "step": 1570
+    },
+    {
+      "epoch": 1.2772837510105093,
+      "grad_norm": Infinity,
+      "learning_rate": 2.871193748315818e-05,
+      "loss": 0.9485,
+      "step": 1580
+    },
+    {
+      "epoch": 1.2853678253839935,
+      "grad_norm": Infinity,
+      "learning_rate": 2.8577202910266776e-05,
+      "loss": 1.6182,
+      "step": 1590
+    },
+    {
+      "epoch": 1.2934518997574778,
+      "grad_norm": Infinity,
+      "learning_rate": 2.844246833737537e-05,
+      "loss": 0.9473,
+      "step": 1600
+    },
+    {
+      "epoch": 1.301535974130962,
+      "grad_norm": Infinity,
+      "learning_rate": 2.830773376448397e-05,
+      "loss": 1.8231,
+      "step": 1610
+    },
+    {
+      "epoch": 1.3096200485044462,
+      "grad_norm": Infinity,
+      "learning_rate": 2.8172999191592564e-05,
+      "loss": 1.687,
+      "step": 1620
+    },
+    {
+      "epoch": 1.3177041228779305,
+      "grad_norm": Infinity,
+      "learning_rate": 2.803826461870116e-05,
+      "loss": 1.0405,
+      "step": 1630
+    },
+    {
+      "epoch": 1.3257881972514147,
+      "grad_norm": Infinity,
+      "learning_rate": 2.7903530045809754e-05,
+      "loss": 1.2729,
+      "step": 1640
+    },
+    {
+      "epoch": 1.333872271624899,
+      "grad_norm": Infinity,
+      "learning_rate": 2.7768795472918353e-05,
+      "loss": 1.7429,
+      "step": 1650
+    },
+    {
+      "epoch": 1.3419563459983832,
+      "grad_norm": Infinity,
+      "learning_rate": 2.7634060900026948e-05,
+      "loss": 1.1652,
+      "step": 1660
+    },
+    {
+      "epoch": 1.3500404203718674,
+      "grad_norm": Infinity,
+      "learning_rate": 2.7499326327135543e-05,
+      "loss": 1.6927,
+      "step": 1670
+    },
+    {
+      "epoch": 1.3581244947453517,
+      "grad_norm": Infinity,
+      "learning_rate": 2.736459175424414e-05,
+      "loss": 1.1215,
+      "step": 1680
+    },
+    {
+      "epoch": 1.366208569118836,
+      "grad_norm": Infinity,
+      "learning_rate": 2.7229857181352736e-05,
+      "loss": 1.0629,
+      "step": 1690
+    },
+    {
+      "epoch": 1.3742926434923202,
+      "grad_norm": Infinity,
+      "learning_rate": 2.709512260846133e-05,
+      "loss": 1.0104,
+      "step": 1700
+    },
+    {
+      "epoch": 1.3823767178658044,
+      "grad_norm": Infinity,
+      "learning_rate": 2.6960388035569926e-05,
+      "loss": 1.4327,
+      "step": 1710
+    },
+    {
+      "epoch": 1.3904607922392886,
+      "grad_norm": Infinity,
+      "learning_rate": 2.6825653462678525e-05,
+      "loss": 1.0857,
+      "step": 1720
+    },
+    {
+      "epoch": 1.3985448666127729,
+      "grad_norm": Infinity,
+      "learning_rate": 2.669091888978712e-05,
+      "loss": 1.7623,
+      "step": 1730
+    },
+    {
+      "epoch": 1.4066289409862571,
+      "grad_norm": Infinity,
+      "learning_rate": 2.6556184316895715e-05,
+      "loss": 1.4973,
+      "step": 1740
+    },
+    {
+      "epoch": 1.4147130153597414,
+      "grad_norm": Infinity,
+      "learning_rate": 2.6421449744004313e-05,
+      "loss": 1.313,
+      "step": 1750
+    },
+    {
+      "epoch": 1.4227970897332256,
+      "grad_norm": Infinity,
+      "learning_rate": 2.6286715171112908e-05,
+      "loss": 1.1965,
+      "step": 1760
+    },
+    {
+      "epoch": 1.4308811641067098,
+      "grad_norm": Infinity,
+      "learning_rate": 2.6151980598221503e-05,
+      "loss": 1.432,
+      "step": 1770
+    },
+    {
+      "epoch": 1.438965238480194,
+      "grad_norm": Infinity,
+      "learning_rate": 2.6017246025330098e-05,
+      "loss": 1.3331,
+      "step": 1780
+    },
+    {
+      "epoch": 1.4470493128536783,
+      "grad_norm": Infinity,
+      "learning_rate": 2.5882511452438697e-05,
+      "loss": 1.2561,
+      "step": 1790
+    },
+    {
+      "epoch": 1.4551333872271626,
+      "grad_norm": Infinity,
+      "learning_rate": 2.574777687954729e-05,
+      "loss": 1.5896,
+      "step": 1800
+    },
+    {
+      "epoch": 1.4632174616006468,
+      "grad_norm": Infinity,
+      "learning_rate": 2.5613042306655887e-05,
+      "loss": 2.9014,
+      "step": 1810
+    },
+    {
+      "epoch": 1.4713015359741308,
+      "grad_norm": Infinity,
+      "learning_rate": 2.5478307733764485e-05,
+      "loss": 1.5244,
+      "step": 1820
+    },
+    {
+      "epoch": 1.4793856103476153,
+      "grad_norm": Infinity,
+      "learning_rate": 2.534357316087308e-05,
+      "loss": 1.0577,
+      "step": 1830
+    },
+    {
+      "epoch": 1.4874696847210993,
+      "grad_norm": Infinity,
+      "learning_rate": 2.5208838587981675e-05,
+      "loss": 1.2323,
+      "step": 1840
+    },
+    {
+      "epoch": 1.4955537590945838,
+      "grad_norm": Infinity,
+      "learning_rate": 2.5074104015090273e-05,
+      "loss": 2.4222,
+      "step": 1850
+    },
+    {
+      "epoch": 1.5036378334680678,
+      "grad_norm": Infinity,
+      "learning_rate": 2.4939369442198872e-05,
+      "loss": 1.9402,
+      "step": 1860
+    },
+    {
+      "epoch": 1.5117219078415522,
+      "grad_norm": Infinity,
+      "learning_rate": 2.4804634869307467e-05,
+      "loss": 2.2911,
+      "step": 1870
+    },
+    {
+      "epoch": 1.5198059822150363,
+      "grad_norm": Infinity,
+      "learning_rate": 2.4669900296416062e-05,
+      "loss": 1.7301,
+      "step": 1880
+    },
+    {
+      "epoch": 1.5278900565885207,
+      "grad_norm": Infinity,
+      "learning_rate": 2.4535165723524657e-05,
+      "loss": 0.6863,
+      "step": 1890
+    },
+    {
+      "epoch": 1.5359741309620047,
+      "grad_norm": Infinity,
+      "learning_rate": 2.4400431150633255e-05,
+      "loss": 1.8456,
+      "step": 1900
+    },
+    {
+      "epoch": 1.5440582053354892,
+      "grad_norm": Infinity,
+      "learning_rate": 2.426569657774185e-05,
+      "loss": 2.3463,
+      "step": 1910
+    },
+    {
+      "epoch": 1.5521422797089732,
+      "grad_norm": Infinity,
+      "learning_rate": 2.4130962004850445e-05,
+      "loss": 1.63,
+      "step": 1920
+    },
+    {
+      "epoch": 1.5602263540824577,
+      "grad_norm": Infinity,
+      "learning_rate": 2.3996227431959044e-05,
+      "loss": 2.1095,
+      "step": 1930
+    },
+    {
+      "epoch": 1.5683104284559417,
+      "grad_norm": Infinity,
+      "learning_rate": 2.386149285906764e-05,
+      "loss": 0.9828,
+      "step": 1940
+    },
+    {
+      "epoch": 1.5763945028294262,
+      "grad_norm": Infinity,
+      "learning_rate": 2.3726758286176234e-05,
+      "loss": 0.7091,
+      "step": 1950
+    },
+    {
+      "epoch": 1.5844785772029102,
+      "grad_norm": Infinity,
+      "learning_rate": 2.359202371328483e-05,
+      "loss": 0.5691,
+      "step": 1960
+    },
+    {
+      "epoch": 1.5925626515763947,
+      "grad_norm": Infinity,
+      "learning_rate": 2.3457289140393427e-05,
+      "loss": 1.5768,
+      "step": 1970
+    },
+    {
+      "epoch": 1.6006467259498787,
+      "grad_norm": Infinity,
+      "learning_rate": 2.3322554567502022e-05,
+      "loss": 1.161,
+      "step": 1980
+    },
+    {
+      "epoch": 1.6087308003233631,
+      "grad_norm": Infinity,
+      "learning_rate": 2.3187819994610617e-05,
+      "loss": 0.9278,
+      "step": 1990
+    },
+    {
+      "epoch": 1.6168148746968471,
+      "grad_norm": Infinity,
+      "learning_rate": 2.3053085421719216e-05,
+      "loss": 1.0681,
+      "step": 2000
+    },
+    {
+      "epoch": 1.6248989490703316,
+      "grad_norm": Infinity,
+      "learning_rate": 2.291835084882781e-05,
+      "loss": 2.3668,
+      "step": 2010
+    },
+    {
+      "epoch": 1.6329830234438156,
+      "grad_norm": Infinity,
+      "learning_rate": 2.2783616275936406e-05,
+      "loss": 0.7226,
+      "step": 2020
+    },
+    {
+      "epoch": 1.6410670978172999,
+      "grad_norm": Infinity,
+      "learning_rate": 2.2648881703045e-05,
+      "loss": 0.5279,
+      "step": 2030
+    },
+    {
+      "epoch": 1.649151172190784,
+      "grad_norm": Infinity,
+      "learning_rate": 2.25141471301536e-05,
+      "loss": 0.7175,
+      "step": 2040
+    },
+    {
+      "epoch": 1.6572352465642683,
+      "grad_norm": Infinity,
+      "learning_rate": 2.2379412557262194e-05,
+      "loss": 2.026,
+      "step": 2050
+    },
+    {
+      "epoch": 1.6653193209377526,
+      "grad_norm": Infinity,
+      "learning_rate": 2.224467798437079e-05,
+      "loss": 1.204,
+      "step": 2060
+    },
+    {
+      "epoch": 1.6734033953112368,
+      "grad_norm": Infinity,
+      "learning_rate": 2.2109943411479387e-05,
+      "loss": 2.0731,
+      "step": 2070
+    },
+    {
+      "epoch": 1.681487469684721,
+      "grad_norm": Infinity,
+      "learning_rate": 2.1975208838587983e-05,
+      "loss": 1.9343,
+      "step": 2080
+    },
+    {
+      "epoch": 1.6895715440582053,
+      "grad_norm": Infinity,
+      "learning_rate": 2.1840474265696578e-05,
+      "loss": 2.1806,
+      "step": 2090
+    },
+    {
+      "epoch": 1.6976556184316896,
+      "grad_norm": Infinity,
+      "learning_rate": 2.1705739692805176e-05,
+      "loss": 2.457,
+      "step": 2100
+    },
+    {
+      "epoch": 1.7057396928051738,
+      "grad_norm": Infinity,
+      "learning_rate": 2.157100511991377e-05,
+      "loss": 1.7964,
+      "step": 2110
+    },
+    {
+      "epoch": 1.713823767178658,
+      "grad_norm": Infinity,
+      "learning_rate": 2.1436270547022366e-05,
+      "loss": 1.9725,
+      "step": 2120
+    },
+    {
+      "epoch": 1.7219078415521423,
+      "grad_norm": Infinity,
+      "learning_rate": 2.130153597413096e-05,
+      "loss": 1.4176,
+      "step": 2130
+    },
+    {
+      "epoch": 1.7299919159256265,
+      "grad_norm": Infinity,
+      "learning_rate": 2.116680140123956e-05,
+      "loss": 2.6819,
+      "step": 2140
+    },
+    {
+      "epoch": 1.7380759902991108,
+      "grad_norm": Infinity,
+      "learning_rate": 2.1032066828348154e-05,
+      "loss": 2.2248,
+      "step": 2150
+    },
+    {
+      "epoch": 1.746160064672595,
+      "grad_norm": Infinity,
+      "learning_rate": 2.089733225545675e-05,
+      "loss": 2.2926,
+      "step": 2160
+    },
+    {
+      "epoch": 1.7542441390460792,
+      "grad_norm": Infinity,
+      "learning_rate": 2.0762597682565348e-05,
+      "loss": 0.6392,
+      "step": 2170
+    },
+    {
+      "epoch": 1.7623282134195635,
+      "grad_norm": Infinity,
+      "learning_rate": 2.0627863109673943e-05,
+      "loss": 1.4321,
+      "step": 2180
+    },
+    {
+      "epoch": 1.7704122877930477,
+      "grad_norm": Infinity,
+      "learning_rate": 2.0493128536782538e-05,
+      "loss": 1.9084,
+      "step": 2190
+    },
+    {
+      "epoch": 1.778496362166532,
+      "grad_norm": Infinity,
+      "learning_rate": 2.0358393963891133e-05,
+      "loss": 2.2621,
+      "step": 2200
+    },
+    {
+      "epoch": 1.7865804365400162,
+      "grad_norm": Infinity,
+      "learning_rate": 2.022365939099973e-05,
+      "loss": 1.8285,
+      "step": 2210
+    },
+    {
+      "epoch": 1.7946645109135004,
+      "grad_norm": Infinity,
+      "learning_rate": 2.008892481810833e-05,
+      "loss": 1.5897,
+      "step": 2220
+    },
+    {
+      "epoch": 1.8027485852869847,
+      "grad_norm": Infinity,
+      "learning_rate": 1.9954190245216925e-05,
+      "loss": 1.6952,
+      "step": 2230
+    },
+    {
+      "epoch": 1.810832659660469,
+      "grad_norm": Infinity,
+      "learning_rate": 1.981945567232552e-05,
+      "loss": 2.6125,
+      "step": 2240
+    },
+    {
+      "epoch": 1.8189167340339532,
+      "grad_norm": Infinity,
+      "learning_rate": 1.9684721099434118e-05,
+      "loss": 1.2341,
+      "step": 2250
+    },
+    {
+      "epoch": 1.8270008084074374,
+      "grad_norm": Infinity,
+      "learning_rate": 1.9549986526542713e-05,
+      "loss": 1.9369,
+      "step": 2260
+    },
+    {
+      "epoch": 1.8350848827809216,
+      "grad_norm": Infinity,
+      "learning_rate": 1.9415251953651308e-05,
+      "loss": 2.6913,
+      "step": 2270
+    },
+    {
+      "epoch": 1.8431689571544059,
+      "grad_norm": Infinity,
+      "learning_rate": 1.9280517380759907e-05,
+      "loss": 2.0335,
+      "step": 2280
+    },
+    {
+      "epoch": 1.85125303152789,
+      "grad_norm": Infinity,
+      "learning_rate": 1.91457828078685e-05,
+      "loss": 1.2543,
+      "step": 2290
+    },
+    {
+      "epoch": 1.8593371059013744,
+      "grad_norm": Infinity,
+      "learning_rate": 1.9011048234977097e-05,
+      "loss": 1.6764,
+      "step": 2300
+    },
+    {
+      "epoch": 1.8674211802748584,
+      "grad_norm": Infinity,
+      "learning_rate": 1.887631366208569e-05,
+      "loss": 1.2704,
+      "step": 2310
+    },
+    {
+      "epoch": 1.8755052546483428,
+      "grad_norm": Infinity,
+      "learning_rate": 1.874157908919429e-05,
+      "loss": 2.1497,
+      "step": 2320
+    },
+    {
+      "epoch": 1.8835893290218269,
+      "grad_norm": Infinity,
+      "learning_rate": 1.8606844516302885e-05,
+      "loss": 1.7158,
+      "step": 2330
+    },
+    {
+      "epoch": 1.8916734033953113,
+      "grad_norm": Infinity,
+      "learning_rate": 1.847210994341148e-05,
+      "loss": 0.9835,
+      "step": 2340
+    },
+    {
+      "epoch": 1.8997574777687953,
+      "grad_norm": Infinity,
+      "learning_rate": 1.833737537052008e-05,
+      "loss": 1.6897,
+      "step": 2350
+    },
+    {
+      "epoch": 1.9078415521422798,
+      "grad_norm": Infinity,
+      "learning_rate": 1.8202640797628673e-05,
+      "loss": 1.5966,
+      "step": 2360
+    },
+    {
+      "epoch": 1.9159256265157638,
+      "grad_norm": Infinity,
+      "learning_rate": 1.806790622473727e-05,
+      "loss": 1.3293,
+      "step": 2370
+    },
+    {
+      "epoch": 1.9240097008892483,
+      "grad_norm": Infinity,
+      "learning_rate": 1.7933171651845863e-05,
+      "loss": 0.9033,
+      "step": 2380
+    },
+    {
+      "epoch": 1.9320937752627323,
+      "grad_norm": Infinity,
+      "learning_rate": 1.7798437078954462e-05,
+      "loss": 1.1496,
+      "step": 2390
+    },
+    {
+      "epoch": 1.9401778496362168,
+      "grad_norm": Infinity,
+      "learning_rate": 1.7663702506063057e-05,
+      "loss": 1.0576,
+      "step": 2400
+    },
+    {
+      "epoch": 1.9482619240097008,
+      "grad_norm": Infinity,
+      "learning_rate": 1.7528967933171652e-05,
+      "loss": 2.219,
+      "step": 2410
+    },
+    {
+      "epoch": 1.9563459983831852,
+      "grad_norm": Infinity,
+      "learning_rate": 1.739423336028025e-05,
+      "loss": 0.8811,
+      "step": 2420
+    },
+    {
+      "epoch": 1.9644300727566693,
+      "grad_norm": Infinity,
+      "learning_rate": 1.7259498787388845e-05,
+      "loss": 1.5159,
+      "step": 2430
+    },
+    {
+      "epoch": 1.9725141471301537,
+      "grad_norm": Infinity,
+      "learning_rate": 1.712476421449744e-05,
+      "loss": 1.5736,
+      "step": 2440
+    },
+    {
+      "epoch": 1.9805982215036377,
+      "grad_norm": Infinity,
+      "learning_rate": 1.6990029641606035e-05,
+      "loss": 2.0976,
+      "step": 2450
+    },
+    {
+      "epoch": 1.9886822958771222,
+      "grad_norm": Infinity,
+      "learning_rate": 1.6855295068714634e-05,
+      "loss": 2.2363,
+      "step": 2460
+    },
+    {
+      "epoch": 1.9967663702506062,
+      "grad_norm": Infinity,
+      "learning_rate": 1.672056049582323e-05,
+      "loss": 2.5238,
+      "step": 2470
+    },
+    {
+      "epoch": 2.0048504446240907,
+      "grad_norm": Infinity,
+      "learning_rate": 1.6585825922931824e-05,
+      "loss": 0.9174,
+      "step": 2480
+    },
+    {
+      "epoch": 2.0129345189975747,
+      "grad_norm": Infinity,
+      "learning_rate": 1.6451091350040422e-05,
+      "loss": 1.5876,
+      "step": 2490
+    },
+    {
+      "epoch": 2.021018593371059,
+      "grad_norm": Infinity,
+      "learning_rate": 1.6316356777149017e-05,
+      "loss": 1.129,
+      "step": 2500
+    },
+    {
+      "epoch": 2.029102667744543,
+      "grad_norm": Infinity,
+      "learning_rate": 1.6181622204257612e-05,
+      "loss": 1.6202,
+      "step": 2510
+    },
+    {
+      "epoch": 2.0371867421180276,
+      "grad_norm": Infinity,
+      "learning_rate": 1.6046887631366207e-05,
+      "loss": 0.9087,
+      "step": 2520
+    },
+    {
+      "epoch": 2.0452708164915117,
+      "grad_norm": Infinity,
+      "learning_rate": 1.5912153058474806e-05,
+      "loss": 2.0087,
+      "step": 2530
+    },
+    {
+      "epoch": 2.053354890864996,
+      "grad_norm": Infinity,
+      "learning_rate": 1.57774184855834e-05,
+      "loss": 2.1463,
+      "step": 2540
+    },
+    {
+      "epoch": 2.06143896523848,
+      "grad_norm": Infinity,
+      "learning_rate": 1.5642683912691996e-05,
+      "loss": 1.7217,
+      "step": 2550
+    },
+    {
+      "epoch": 2.0695230396119646,
+      "grad_norm": Infinity,
+      "learning_rate": 1.5507949339800594e-05,
+      "loss": 0.7802,
+      "step": 2560
+    },
+    {
+      "epoch": 2.0776071139854486,
+      "grad_norm": Infinity,
+      "learning_rate": 1.537321476690919e-05,
+      "loss": 2.4857,
+      "step": 2570
+    },
+    {
+      "epoch": 2.085691188358933,
+      "grad_norm": Infinity,
+      "learning_rate": 1.5238480194017784e-05,
+      "loss": 1.5868,
+      "step": 2580
+    },
+    {
+      "epoch": 2.093775262732417,
+      "grad_norm": Infinity,
+      "learning_rate": 1.510374562112638e-05,
+      "loss": 2.8292,
+      "step": 2590
+    },
+    {
+      "epoch": 2.1018593371059016,
+      "grad_norm": Infinity,
+      "learning_rate": 1.4969011048234976e-05,
+      "loss": 1.4174,
+      "step": 2600
+    },
+    {
+      "epoch": 2.1099434114793856,
+      "grad_norm": Infinity,
+      "learning_rate": 1.4834276475343573e-05,
+      "loss": 1.4931,
+      "step": 2610
+    },
+    {
+      "epoch": 2.11802748585287,
+      "grad_norm": Infinity,
+      "learning_rate": 1.469954190245217e-05,
+      "loss": 1.1888,
+      "step": 2620
+    },
+    {
+      "epoch": 2.126111560226354,
+      "grad_norm": Infinity,
+      "learning_rate": 1.4564807329560768e-05,
+      "loss": 2.5423,
+      "step": 2630
+    },
+    {
+      "epoch": 2.1341956345998385,
+      "grad_norm": Infinity,
+      "learning_rate": 1.4430072756669363e-05,
+      "loss": 1.0222,
+      "step": 2640
+    },
+    {
+      "epoch": 2.1422797089733225,
+      "grad_norm": Infinity,
+      "learning_rate": 1.429533818377796e-05,
+      "loss": 1.2646,
+      "step": 2650
+    },
+    {
+      "epoch": 2.1503637833468066,
+      "grad_norm": Infinity,
+      "learning_rate": 1.4160603610886556e-05,
+      "loss": 0.8661,
+      "step": 2660
+    },
+    {
+      "epoch": 2.158447857720291,
+      "grad_norm": Infinity,
+      "learning_rate": 1.4025869037995151e-05,
+      "loss": 1.0685,
+      "step": 2670
+    },
+    {
+      "epoch": 2.1665319320937755,
+      "grad_norm": Infinity,
+      "learning_rate": 1.3891134465103748e-05,
+      "loss": 1.6561,
+      "step": 2680
+    },
+    {
+      "epoch": 2.1746160064672595,
+      "grad_norm": Infinity,
+      "learning_rate": 1.3756399892212343e-05,
+      "loss": 1.4664,
+      "step": 2690
+    },
+    {
+      "epoch": 2.1827000808407435,
+      "grad_norm": Infinity,
+      "learning_rate": 1.362166531932094e-05,
+      "loss": 1.8332,
+      "step": 2700
+    },
+    {
+      "epoch": 2.190784155214228,
+      "grad_norm": Infinity,
+      "learning_rate": 1.3486930746429535e-05,
+      "loss": 1.2506,
+      "step": 2710
+    },
+    {
+      "epoch": 2.1988682295877124,
+      "grad_norm": Infinity,
+      "learning_rate": 1.3352196173538131e-05,
+      "loss": 1.1132,
+      "step": 2720
+    },
+    {
+      "epoch": 2.2069523039611965,
+      "grad_norm": Infinity,
+      "learning_rate": 1.3217461600646728e-05,
+      "loss": 0.8307,
+      "step": 2730
+    },
+    {
+      "epoch": 2.2150363783346805,
+      "grad_norm": Infinity,
+      "learning_rate": 1.3082727027755323e-05,
+      "loss": 1.706,
+      "step": 2740
+    },
+    {
+      "epoch": 2.223120452708165,
+      "grad_norm": Infinity,
+      "learning_rate": 1.294799245486392e-05,
+      "loss": 1.2033,
+      "step": 2750
+    },
+    {
+      "epoch": 2.231204527081649,
+      "grad_norm": Infinity,
+      "learning_rate": 1.2813257881972515e-05,
+      "loss": 1.5921,
+      "step": 2760
+    },
+    {
+      "epoch": 2.2392886014551334,
+      "grad_norm": Infinity,
+      "learning_rate": 1.2678523309081111e-05,
+      "loss": 1.1893,
+      "step": 2770
+    },
+    {
+      "epoch": 2.2473726758286174,
+      "grad_norm": Infinity,
+      "learning_rate": 1.2543788736189706e-05,
+      "loss": 1.9335,
+      "step": 2780
+    },
+    {
+      "epoch": 2.255456750202102,
+      "grad_norm": Infinity,
+      "learning_rate": 1.2409054163298303e-05,
+      "loss": 1.5802,
+      "step": 2790
+    },
+    {
+      "epoch": 2.263540824575586,
+      "grad_norm": Infinity,
+      "learning_rate": 1.22743195904069e-05,
+      "loss": 2.0087,
+      "step": 2800
+    },
+    {
+      "epoch": 2.2716248989490704,
+      "grad_norm": Infinity,
+      "learning_rate": 1.2139585017515495e-05,
+      "loss": 1.1383,
+      "step": 2810
+    },
+    {
+      "epoch": 2.2797089733225544,
+      "grad_norm": Infinity,
+      "learning_rate": 1.2004850444624092e-05,
+      "loss": 1.3281,
+      "step": 2820
+    },
+    {
+      "epoch": 2.287793047696039,
+      "grad_norm": Infinity,
+      "learning_rate": 1.1870115871732687e-05,
+      "loss": 1.0895,
+      "step": 2830
+    },
+    {
+      "epoch": 2.295877122069523,
+      "grad_norm": Infinity,
+      "learning_rate": 1.1735381298841283e-05,
+      "loss": 1.0182,
+      "step": 2840
+    },
+    {
+      "epoch": 2.3039611964430073,
+      "grad_norm": Infinity,
+      "learning_rate": 1.1600646725949878e-05,
+      "loss": 1.6568,
+      "step": 2850
+    },
+    {
+      "epoch": 2.3120452708164914,
+      "grad_norm": Infinity,
+      "learning_rate": 1.1465912153058475e-05,
+      "loss": 1.6142,
+      "step": 2860
+    },
+    {
+      "epoch": 2.320129345189976,
+      "grad_norm": Infinity,
+      "learning_rate": 1.1331177580167072e-05,
+      "loss": 1.9218,
+      "step": 2870
+    },
+    {
+      "epoch": 2.32821341956346,
+      "grad_norm": Infinity,
+      "learning_rate": 1.1196443007275667e-05,
+      "loss": 1.6749,
+      "step": 2880
+    },
+    {
+      "epoch": 2.3362974939369443,
+      "grad_norm": Infinity,
+      "learning_rate": 1.1061708434384263e-05,
+      "loss": 0.6671,
+      "step": 2890
+    },
+    {
+      "epoch": 2.3443815683104283,
+      "grad_norm": Infinity,
+      "learning_rate": 1.0926973861492859e-05,
+      "loss": 1.8723,
+      "step": 2900
+    },
+    {
+      "epoch": 2.352465642683913,
+      "grad_norm": Infinity,
+      "learning_rate": 1.0792239288601455e-05,
+      "loss": 1.8936,
+      "step": 2910
+    },
+    {
+      "epoch": 2.360549717057397,
+      "grad_norm": Infinity,
+      "learning_rate": 1.065750471571005e-05,
+      "loss": 1.7514,
+      "step": 2920
+    },
+    {
+      "epoch": 2.3686337914308813,
+      "grad_norm": Infinity,
+      "learning_rate": 1.0522770142818649e-05,
+      "loss": 1.6109,
+      "step": 2930
+    },
+    {
+      "epoch": 2.3767178658043653,
+      "grad_norm": Infinity,
+      "learning_rate": 1.0388035569927244e-05,
+      "loss": 2.3036,
+      "step": 2940
+    },
+    {
+      "epoch": 2.3848019401778497,
+      "grad_norm": Infinity,
+      "learning_rate": 1.025330099703584e-05,
+      "loss": 1.6592,
+      "step": 2950
+    },
+    {
+      "epoch": 2.3928860145513338,
+      "grad_norm": Infinity,
+      "learning_rate": 1.0118566424144437e-05,
+      "loss": 1.64,
+      "step": 2960
+    },
+    {
+      "epoch": 2.4009700889248182,
+      "grad_norm": Infinity,
+      "learning_rate": 9.983831851253032e-06,
+      "loss": 2.0912,
+      "step": 2970
+    },
+    {
+      "epoch": 2.4090541632983022,
+      "grad_norm": Infinity,
+      "learning_rate": 9.849097278361629e-06,
+      "loss": 1.4023,
+      "step": 2980
+    },
+    {
+      "epoch": 2.4171382376717867,
+      "grad_norm": Infinity,
+      "learning_rate": 9.714362705470224e-06,
+      "loss": 1.8342,
+      "step": 2990
+    },
+    {
+      "epoch": 2.4252223120452707,
+      "grad_norm": Infinity,
+      "learning_rate": 9.57962813257882e-06,
+      "loss": 0.9314,
+      "step": 3000
+    },
+    {
+      "epoch": 2.433306386418755,
+      "grad_norm": Infinity,
+      "learning_rate": 9.444893559687416e-06,
+      "loss": 1.271,
+      "step": 3010
+    },
+    {
+      "epoch": 2.441390460792239,
+      "grad_norm": Infinity,
+      "learning_rate": 9.310158986796012e-06,
+      "loss": 2.358,
+      "step": 3020
+    },
+    {
+      "epoch": 2.4494745351657237,
+      "grad_norm": Infinity,
+      "learning_rate": 9.175424413904609e-06,
+      "loss": 1.9627,
+      "step": 3030
+    },
+    {
+      "epoch": 2.4575586095392077,
+      "grad_norm": Infinity,
+      "learning_rate": 9.040689841013204e-06,
+      "loss": 1.1173,
+      "step": 3040
+    },
+    {
+      "epoch": 2.465642683912692,
+      "grad_norm": Infinity,
+      "learning_rate": 8.9059552681218e-06,
+      "loss": 0.91,
+      "step": 3050
+    },
+    {
+      "epoch": 2.473726758286176,
+      "grad_norm": Infinity,
+      "learning_rate": 8.771220695230396e-06,
+      "loss": 0.8115,
+      "step": 3060
+    },
+    {
+      "epoch": 2.4818108326596606,
+      "grad_norm": Infinity,
+      "learning_rate": 8.636486122338992e-06,
+      "loss": 1.3157,
+      "step": 3070
+    },
+    {
+      "epoch": 2.4898949070331446,
+      "grad_norm": Infinity,
+      "learning_rate": 8.501751549447589e-06,
+      "loss": 2.2141,
+      "step": 3080
+    },
+    {
+      "epoch": 2.497978981406629,
+      "grad_norm": Infinity,
+      "learning_rate": 8.367016976556184e-06,
+      "loss": 1.5275,
+      "step": 3090
+    },
+    {
+      "epoch": 2.506063055780113,
+      "grad_norm": Infinity,
+      "learning_rate": 8.23228240366478e-06,
+      "loss": 1.8859,
+      "step": 3100
+    },
+    {
+      "epoch": 2.5141471301535976,
+      "grad_norm": Infinity,
+      "learning_rate": 8.097547830773376e-06,
+      "loss": 1.6131,
+      "step": 3110
+    },
+    {
+      "epoch": 2.5222312045270816,
+      "grad_norm": Infinity,
+      "learning_rate": 7.962813257881973e-06,
+      "loss": 1.4654,
+      "step": 3120
+    },
+    {
+      "epoch": 2.5303152789005656,
+      "grad_norm": Infinity,
+      "learning_rate": 7.82807868499057e-06,
+      "loss": 1.4129,
+      "step": 3130
+    },
+    {
+      "epoch": 2.53839935327405,
+      "grad_norm": Infinity,
+      "learning_rate": 7.693344112099166e-06,
+      "loss": 1.2493,
+      "step": 3140
+    },
+    {
+      "epoch": 2.5464834276475345,
+      "grad_norm": Infinity,
+      "learning_rate": 7.558609539207762e-06,
+      "loss": 1.6047,
+      "step": 3150
+    },
+    {
+      "epoch": 2.5545675020210186,
+      "grad_norm": Infinity,
+      "learning_rate": 7.423874966316358e-06,
+      "loss": 0.6597,
+      "step": 3160
+    },
+    {
+      "epoch": 2.5626515763945026,
+      "grad_norm": Infinity,
+      "learning_rate": 7.2891403934249536e-06,
+      "loss": 1.8984,
+      "step": 3170
+    },
+    {
+      "epoch": 2.570735650767987,
+      "grad_norm": Infinity,
+      "learning_rate": 7.1544058205335494e-06,
+      "loss": 1.9384,
+      "step": 3180
+    },
+    {
+      "epoch": 2.5788197251414715,
+      "grad_norm": Infinity,
+      "learning_rate": 7.019671247642145e-06,
+      "loss": 2.2221,
+      "step": 3190
+    },
+    {
+      "epoch": 2.5869037995149555,
+      "grad_norm": Infinity,
+      "learning_rate": 6.884936674750741e-06,
+      "loss": 1.1601,
+      "step": 3200
+    },
+    {
+      "epoch": 2.5949878738884395,
+      "grad_norm": Infinity,
+      "learning_rate": 6.750202101859338e-06,
+      "loss": 2.0688,
+      "step": 3210
+    },
+    {
+      "epoch": 2.603071948261924,
+      "grad_norm": Infinity,
+      "learning_rate": 6.615467528967934e-06,
+      "loss": 1.5725,
+      "step": 3220
+    },
+    {
+      "epoch": 2.6111560226354085,
+      "grad_norm": Infinity,
+      "learning_rate": 6.48073295607653e-06,
+      "loss": 1.9509,
+      "step": 3230
+    },
+    {
+      "epoch": 2.6192400970088925,
+      "grad_norm": Infinity,
+      "learning_rate": 6.3459983831851255e-06,
+      "loss": 1.8586,
+      "step": 3240
+    },
+    {
+      "epoch": 2.6273241713823765,
+      "grad_norm": Infinity,
+      "learning_rate": 6.211263810293721e-06,
+      "loss": 1.516,
+      "step": 3250
+    },
+    {
+      "epoch": 2.635408245755861,
+      "grad_norm": Infinity,
+      "learning_rate": 6.076529237402317e-06,
+      "loss": 1.5061,
+      "step": 3260
+    },
+    {
+      "epoch": 2.6434923201293454,
+      "grad_norm": Infinity,
+      "learning_rate": 5.941794664510914e-06,
+      "loss": 0.9186,
+      "step": 3270
+    },
+    {
+      "epoch": 2.6515763945028294,
+      "grad_norm": Infinity,
+      "learning_rate": 5.80706009161951e-06,
+      "loss": 1.5769,
+      "step": 3280
+    },
+    {
+      "epoch": 2.6596604688763135,
+      "grad_norm": Infinity,
+      "learning_rate": 5.6723255187281065e-06,
+      "loss": 2.253,
+      "step": 3290
+    },
+    {
+      "epoch": 2.667744543249798,
+      "grad_norm": Infinity,
+      "learning_rate": 5.537590945836702e-06,
+      "loss": 0.7681,
+      "step": 3300
+    },
+    {
+      "epoch": 2.6758286176232824,
+      "grad_norm": Infinity,
+      "learning_rate": 5.402856372945298e-06,
+      "loss": 0.967,
+      "step": 3310
+    },
+    {
+      "epoch": 2.6839126919967664,
+      "grad_norm": Infinity,
+      "learning_rate": 5.268121800053894e-06,
+      "loss": 2.1412,
+      "step": 3320
+    },
+    {
+      "epoch": 2.6919967663702504,
+      "grad_norm": Infinity,
+      "learning_rate": 5.13338722716249e-06,
+      "loss": 3.0335,
+      "step": 3330
+    },
+    {
+      "epoch": 2.700080840743735,
+      "grad_norm": Infinity,
+      "learning_rate": 4.998652654271086e-06,
+      "loss": 1.7116,
+      "step": 3340
+    },
+    {
+      "epoch": 2.7081649151172194,
+      "grad_norm": Infinity,
+      "learning_rate": 4.8639180813796825e-06,
+      "loss": 1.5722,
+      "step": 3350
+    },
+    {
+      "epoch": 2.7162489894907034,
+      "grad_norm": Infinity,
+      "learning_rate": 4.729183508488278e-06,
+      "loss": 0.9447,
+      "step": 3360
+    },
+    {
+      "epoch": 2.7243330638641874,
+      "grad_norm": Infinity,
+      "learning_rate": 4.594448935596874e-06,
+      "loss": 1.5505,
+      "step": 3370
+    },
+    {
+      "epoch": 2.732417138237672,
+      "grad_norm": Infinity,
+      "learning_rate": 4.459714362705471e-06,
+      "loss": 1.4774,
+      "step": 3380
+    },
+    {
+      "epoch": 2.740501212611156,
+      "grad_norm": Infinity,
+      "learning_rate": 4.324979789814067e-06,
+      "loss": 0.9662,
+      "step": 3390
+    },
+    {
+      "epoch": 2.7485852869846403,
+      "grad_norm": Infinity,
+      "learning_rate": 4.190245216922663e-06,
+      "loss": 1.3972,
+      "step": 3400
+    },
+    {
+      "epoch": 2.7566693613581243,
+      "grad_norm": Infinity,
+      "learning_rate": 4.0555106440312585e-06,
+      "loss": 1.9129,
+      "step": 3410
+    },
+    {
+      "epoch": 2.764753435731609,
+      "grad_norm": Infinity,
+      "learning_rate": 3.920776071139854e-06,
+      "loss": 1.3831,
+      "step": 3420
+    },
+    {
+      "epoch": 2.772837510105093,
+      "grad_norm": Infinity,
+      "learning_rate": 3.7860414982484507e-06,
+      "loss": 1.2399,
+      "step": 3430
+    },
+    {
+      "epoch": 2.7809215844785773,
+      "grad_norm": Infinity,
+      "learning_rate": 3.6513069253570465e-06,
+      "loss": 2.1903,
+      "step": 3440
+    },
+    {
+      "epoch": 2.7890056588520613,
+      "grad_norm": Infinity,
+      "learning_rate": 3.516572352465643e-06,
+      "loss": 2.2974,
+      "step": 3450
+    },
+    {
+      "epoch": 2.7970897332255458,
+      "grad_norm": Infinity,
+      "learning_rate": 3.3818377795742387e-06,
+      "loss": 0.904,
+      "step": 3460
+    },
+    {
+      "epoch": 2.80517380759903,
+      "grad_norm": Infinity,
+      "learning_rate": 3.2471032066828345e-06,
+      "loss": 2.1944,
+      "step": 3470
+    },
+    {
+      "epoch": 2.8132578819725143,
+      "grad_norm": Infinity,
+      "learning_rate": 3.112368633791431e-06,
+      "loss": 0.5356,
+      "step": 3480
+    },
+    {
+      "epoch": 2.8213419563459983,
+      "grad_norm": Infinity,
+      "learning_rate": 2.977634060900027e-06,
+      "loss": 1.952,
+      "step": 3490
+    },
+    {
+      "epoch": 2.8294260307194827,
+      "grad_norm": Infinity,
+      "learning_rate": 2.842899488008623e-06,
+      "loss": 0.8518,
+      "step": 3500
+    },
+    {
+      "epoch": 2.8375101050929668,
+      "grad_norm": Infinity,
+      "learning_rate": 2.7081649151172193e-06,
+      "loss": 1.0664,
+      "step": 3510
+    },
+    {
+      "epoch": 2.845594179466451,
+      "grad_norm": Infinity,
+      "learning_rate": 2.573430342225815e-06,
+      "loss": 1.7874,
+      "step": 3520
+    },
+    {
+      "epoch": 2.8536782538399352,
+      "grad_norm": Infinity,
+      "learning_rate": 2.4386957693344114e-06,
+      "loss": 1.4999,
+      "step": 3530
+    },
+    {
+      "epoch": 2.8617623282134197,
+      "grad_norm": Infinity,
+      "learning_rate": 2.3039611964430073e-06,
+      "loss": 1.01,
+      "step": 3540
+    },
+    {
+      "epoch": 2.8698464025869037,
+      "grad_norm": Infinity,
+      "learning_rate": 2.169226623551603e-06,
+      "loss": 1.6775,
+      "step": 3550
+    },
+    {
+      "epoch": 2.877930476960388,
+      "grad_norm": Infinity,
+      "learning_rate": 2.0344920506602e-06,
+      "loss": 0.7036,
+      "step": 3560
+    },
+    {
+      "epoch": 2.886014551333872,
+      "grad_norm": Infinity,
+      "learning_rate": 1.8997574777687957e-06,
+      "loss": 1.2962,
+      "step": 3570
+    },
+    {
+      "epoch": 2.8940986257073567,
+      "grad_norm": Infinity,
+      "learning_rate": 1.7650229048773916e-06,
+      "loss": 1.5148,
+      "step": 3580
+    },
+    {
+      "epoch": 2.9021827000808407,
+      "grad_norm": Infinity,
+      "learning_rate": 1.6302883319859876e-06,
+      "loss": 1.6301,
+      "step": 3590
+    },
+    {
+      "epoch": 2.910266774454325,
+      "grad_norm": Infinity,
+      "learning_rate": 1.4955537590945837e-06,
+      "loss": 1.4741,
+      "step": 3600
+    },
+    {
+      "epoch": 2.918350848827809,
+      "grad_norm": Infinity,
+      "learning_rate": 1.3608191862031798e-06,
+      "loss": 1.4487,
+      "step": 3610
+    },
+    {
+      "epoch": 2.9264349232012936,
+      "grad_norm": Infinity,
+      "learning_rate": 1.2260846133117759e-06,
+      "loss": 0.712,
+      "step": 3620
+    },
+    {
+      "epoch": 2.9345189975747776,
+      "grad_norm": Infinity,
+      "learning_rate": 1.091350040420372e-06,
+      "loss": 1.6328,
+      "step": 3630
+    },
+    {
+      "epoch": 2.9426030719482617,
+      "grad_norm": Infinity,
+      "learning_rate": 9.566154675289678e-07,
+      "loss": 1.4731,
+      "step": 3640
+    },
+    {
+      "epoch": 2.950687146321746,
+      "grad_norm": Infinity,
+      "learning_rate": 8.218808946375641e-07,
+      "loss": 1.1577,
+      "step": 3650
+    },
+    {
+      "epoch": 2.9587712206952306,
+      "grad_norm": Infinity,
+      "learning_rate": 6.871463217461601e-07,
+      "loss": 2.0929,
+      "step": 3660
+    },
+    {
+      "epoch": 2.9668552950687146,
+      "grad_norm": Infinity,
+      "learning_rate": 5.524117488547561e-07,
+      "loss": 1.4322,
+      "step": 3670
+    },
+    {
+      "epoch": 2.9749393694421986,
+      "grad_norm": Infinity,
+      "learning_rate": 4.176771759633522e-07,
+      "loss": 2.3774,
+      "step": 3680
+    },
+    {
+      "epoch": 2.983023443815683,
+      "grad_norm": Infinity,
+      "learning_rate": 2.8294260307194823e-07,
+      "loss": 1.5767,
+      "step": 3690
+    },
+    {
+      "epoch": 2.9911075181891675,
+      "grad_norm": Infinity,
+      "learning_rate": 1.4820803018054433e-07,
+      "loss": 2.7909,
+      "step": 3700
+    },
+    {
+      "epoch": 2.9991915925626516,
+      "grad_norm": Infinity,
+      "learning_rate": 1.3473457289140394e-08,
+      "loss": 1.3558,
+      "step": 3710
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 3711,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4915088628645888.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

norah_lora/checkpoint-3711/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9f7af70a1c91c1728aececa3729ab0591b5edd689612a906672255dbec45ed35
+size 5304

norah_lora/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "additional_special_tokens": [
+    "<|ASSISTANT|>",
+    "<|USER|>"
+  ],
+  "bos_token": {
+    "content": "<|bos|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

norah_lora/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

norah_lora/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,95 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32000": {
+      "content": "<|bos|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32001": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32002": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32003": {
+      "content": "<|ASSISTANT|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32004": {
+      "content": "<|USER|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|ASSISTANT|>",
+    "<|USER|>"
+  ],
+  "bos_token": "<|bos|>",
+  "chat_template": "{%- set ns = namespace(found=false) -%}{%- for message in messages -%}{%- if message['role'] == 'system' -%}{%- set ns.found = true -%}{%- endif -%}{%- endfor -%}{%- for message in messages %}{%- if message['role'] == 'system' -%}{{- '<|im_start|>system\n' + message['content'].rstrip() + '<|im_end|>\n' -}}{%- else -%}{%- if message['role'] == 'user' -%}{{-'<|im_start|>user\n' + message['content'].rstrip() + '<|im_end|>\n'-}}{%- else -%}{{-'<|im_start|>assistant\n' + message['content'] + '<|im_end|>\n' -}}{%- endif -%}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{-'<|im_start|>assistant\n'-}}{%- endif -%}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "legacy": true,
+  "max_length": 1536,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_to_multiple_of": null,
+  "pad_token": "[PAD]",
+  "pad_token_type_id": 0,
+  "padding_side": "left",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "stride": 0,
+  "tokenizer_class": "LlamaTokenizerFast",
+  "truncation_side": "right",
+  "truncation_strategy": "longest_first",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": true
+}

test_norah.py ADDED Viewed

	@@ -0,0 +1,56 @@

+from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteria, StoppingCriteriaList
+import torch
+model_name = r"C:\Users\HP-Victus\GVAIDAL\Norah"  # Use full path
+print("🔄 Loading tokenizer and model...")
+tokenizer = AutoTokenizer.from_pretrained(model_name, local_files_only=True)
+model = AutoModelForCausalLM.from_pretrained(model_name, local_files_only=True, torch_dtype=torch.float16, device_map="auto")
+def format_prompt(user_input):
+    return (
+        "Tu es un assistant IA utile et intelligent qui répond toujours en français avec des réponses courtes et claires.\n\n"
+        "Utilisateur: Bonjour, comment vas-tu ?\n"
+        "Assistant: Bonjour ! Je vais bien, merci. Comment puis-je vous aider ?\n\n"
+        f"Utilisateur: {user_input}\n"
+        "Assistant:"
+    )
+# Test conversation
+prompt = format_prompt("Bonjour, comment puis-je vous aider aujourd'hui ?")
+inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to("cuda" if torch.cuda.is_available() else "cpu")
+print("📝 Generating response...")
+# Encode names to block
+bad_words = ["Jean", "Marie", "Bouchard", "Pierre", "Louis", "Antoine", "Jacques", "Robert", "Roper"]
+bad_words_ids = [tokenizer.encode(word, add_special_tokens=False) for word in bad_words]
+# Stopping criteria: Stop at sentence completion
+class StopOnSentenceEnd(StoppingCriteria):
+    def __call__(self, input_ids, scores, **kwargs):
+        stop_tokens = [tokenizer.encode(".", add_special_tokens=False)[0],
+                       tokenizer.encode("!", add_special_tokens=False)[0],
+                       tokenizer.encode("?", add_special_tokens=False)[0]]
+        return any(input_ids[0, -1].item() == stop for stop in stop_tokens)
+stopping_criteria = StoppingCriteriaList([StopOnSentenceEnd()])
+# Generate response
+outputs = model.generate(
+    **inputs,
+    max_length=100,  # Allows complete sentences
+    min_length=10,   # Ensures at least some response
+    do_sample=True,  # Allows varied responses
+    temperature=0.7,  # More natural responses
+    top_p=0.9,       # Higher probability for relevant words
+    repetition_penalty=1.5,  # Prevents repetition but keeps coherence
+    eos_token_id=model.config.eos_token_id,
+    stopping_criteria=stopping_criteria  # Ensures sentence completion
+)
+# Decode and display response
+response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+print("💬 Model Response:", response)

tokenize_dataset.py ADDED Viewed

	@@ -0,0 +1,30 @@

+from transformers import AutoTokenizer
+from datasets import load_dataset
+# Load tokenizer and dataset
+model_name = "Visdom9/Norah"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+dataset = load_dataset("OpenAssistant/oasst1", split="train")
+# Keep only French examples
+dataset = dataset.filter(lambda x: x["lang"] == "fr")
+# Tokenize dataset
+def tokenize_function(examples):
+    model_inputs = tokenizer(
+        examples["text"], padding="max_length", truncation=True, max_length=512
+    )
+    model_inputs["labels"] = model_inputs["input_ids"][:]  # ✅ Copy input_ids as labels
+    return model_inputs
+# Apply tokenization
+tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)
+# Convert dataset to PyTorch tensors
+tokenized_dataset.set_format("torch")
+# Save tokenized dataset
+tokenized_dataset.save_to_disk("tokenized_norah")
+print("✅ Tokenization complete! Dataset saved to 'tokenized_norah'")

tokenized_norah/data-00000-of-00001.arrow ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d165415dd4278589d36555c08354f6bce3da3c6dddadd1ab72094ac5fe6d90ca
+size 16498592

tokenized_norah/dataset_info.json ADDED Viewed

	@@ -0,0 +1,65 @@

+{
+  "builder_name": "parquet",
+  "citation": "",
+  "config_name": "default",
+  "dataset_name": "oasst1",
+  "dataset_size": 106034902,
+  "description": "",
+  "download_checksums": {
+    "hf://datasets/OpenAssistant/oasst1@fdf72ae0827c1cda404aff25b6603abec9e3399b/data/train-00000-of-00001-b42a775f407cee45.parquet": {
+      "num_bytes": 39516251,
+      "checksum": null
+    },
+    "hf://datasets/OpenAssistant/oasst1@fdf72ae0827c1cda404aff25b6603abec9e3399b/data/validation-00000-of-00001-134b8fd0c89408b6.parquet": {
+      "num_bytes": 2080179,
+      "checksum": null
+    }
+  },
+  "download_size": 41596430,
+  "features": {
+    "labels": {
+      "feature": {
+        "dtype": "int64",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "input_ids": {
+      "feature": {
+        "dtype": "int32",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "attention_mask": {
+      "feature": {
+        "dtype": "int8",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    }
+  },
+  "homepage": "",
+  "license": "",
+  "size_in_bytes": 147631332,
+  "splits": {
+    "train": {
+      "name": "train",
+      "num_bytes": 100770129,
+      "num_examples": 84437,
+      "dataset_name": "oasst1"
+    },
+    "validation": {
+      "name": "validation",
+      "num_bytes": 5264773,
+      "num_examples": 4401,
+      "dataset_name": "oasst1"
+    }
+  },
+  "version": {
+    "version_str": "0.0.0",
+    "major": 0,
+    "minor": 0,
+    "patch": 0
+  }
+}

tokenized_norah/state.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "8b9ef9ee38e784ae",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": "torch",
+  "_output_all_columns": false,
+  "_split": "train"
+}