summerstars committed on Feb 9

Commit

e394e8f

verified ·

1 Parent(s): 1aac3e0

Initial commit of Summer1 model

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

README.md +202 -3
adapter_config.json +35 -0
adapter_model.safetensors +3 -0
checkpoint-100/README.md +202 -0
checkpoint-100/adapter_config.json +35 -0
checkpoint-100/adapter_model.safetensors +3 -0
checkpoint-100/optimizer.pt +3 -0
checkpoint-100/rng_state.pth +3 -0
checkpoint-100/scheduler.pt +3 -0
checkpoint-100/trainer_state.json +103 -0
checkpoint-100/training_args.bin +3 -0
checkpoint-1000/README.md +202 -0
checkpoint-1000/adapter_config.json +35 -0
checkpoint-1000/adapter_model.safetensors +3 -0
checkpoint-1000/optimizer.pt +3 -0
checkpoint-1000/rng_state.pth +3 -0
checkpoint-1000/scheduler.pt +3 -0
checkpoint-1000/trainer_state.json +733 -0
checkpoint-1000/training_args.bin +3 -0
checkpoint-1100/README.md +202 -0
checkpoint-1100/adapter_config.json +35 -0
checkpoint-1100/adapter_model.safetensors +3 -0
checkpoint-1100/optimizer.pt +3 -0
checkpoint-1100/rng_state.pth +3 -0
checkpoint-1100/scheduler.pt +3 -0
checkpoint-1100/trainer_state.json +803 -0
checkpoint-1100/training_args.bin +3 -0
checkpoint-1200/README.md +202 -0
checkpoint-1200/adapter_config.json +35 -0
checkpoint-1200/adapter_model.safetensors +3 -0
checkpoint-1200/optimizer.pt +3 -0
checkpoint-1200/rng_state.pth +3 -0
checkpoint-1200/scheduler.pt +3 -0
checkpoint-1200/trainer_state.json +873 -0
checkpoint-1200/training_args.bin +3 -0
checkpoint-1300/README.md +202 -0
checkpoint-1300/adapter_config.json +35 -0
checkpoint-1300/adapter_model.safetensors +3 -0
checkpoint-1300/optimizer.pt +3 -0
checkpoint-1300/rng_state.pth +3 -0
checkpoint-1300/scheduler.pt +3 -0
checkpoint-1300/trainer_state.json +943 -0
checkpoint-1300/training_args.bin +3 -0
checkpoint-1400/README.md +202 -0
checkpoint-1400/adapter_config.json +35 -0
checkpoint-1400/adapter_model.safetensors +3 -0
checkpoint-1400/optimizer.pt +3 -0
checkpoint-1400/rng_state.pth +3 -0
checkpoint-1400/scheduler.pt +3 -0
checkpoint-1400/trainer_state.json +1013 -0

README.md CHANGED Viewed

@@ -1,3 +1,202 @@
----
-license: apache-2.0
----

+---
+base_model: HuggingFaceTB/SmolLM2-135M-Instruct
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and BibTeX information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.14.0

adapter_config.json ADDED Viewed

	@@ -0,0 +1,35 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": {
+    "base_model_class": "LlamaForCausalLM",
+    "parent_library": "transformers.models.llama.modeling_llama"
+  },
+  "base_model_name_or_path": "HuggingFaceTB/SmolLM2-135M-Instruct",
+  "bias": "none",
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj"
+  ],
+  "task_type": null,
+  "use_dora": false,
+  "use_rslora": false
+}

adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cbcd33481137169e48f638b0daa2ba49b98156eaedc28a0edb7209336f1cf9ad
+size 1858776

checkpoint-100/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: HuggingFaceTB/SmolLM2-135M-Instruct
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and BibTeX information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.14.0

checkpoint-100/adapter_config.json ADDED Viewed

	@@ -0,0 +1,35 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": {
+    "base_model_class": "LlamaForCausalLM",
+    "parent_library": "transformers.models.llama.modeling_llama"
+  },
+  "base_model_name_or_path": "HuggingFaceTB/SmolLM2-135M-Instruct",
+  "bias": "none",
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj"
+  ],
+  "task_type": null,
+  "use_dora": false,
+  "use_rslora": false
+}

checkpoint-100/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a2be3be0b0635ec6385192e4136cb7f39d291ecb63701ed3fe77de5f4e4e73ac
+size 1858776

checkpoint-100/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:75e08660ee1df4c5c8e58df009fe826019efa6dfdfdf5447c86a24e16df2a072
+size 3787258

checkpoint-100/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c4ea0db29e69b85396427a93cd331a1afb55371c9224ea9162d234eedce8adc
+size 14244

checkpoint-100/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:58f0e1735c06782c1d02870e59a87a12a575dafb1e5549b6c0c531da7d3857af
+size 1064

checkpoint-100/trainer_state.json ADDED Viewed

	@@ -0,0 +1,103 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.2,
+  "eval_steps": 500,
+  "global_step": 100,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.28473100066185,
+      "learning_rate": 4.966666666666667e-05,
+      "loss": 1.3783,
+      "step": 10
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.09849760681390762,
+      "learning_rate": 4.933333333333334e-05,
+      "loss": 1.319,
+      "step": 20
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.12863746285438538,
+      "learning_rate": 4.9e-05,
+      "loss": 1.4715,
+      "step": 30
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.16797196865081787,
+      "learning_rate": 4.866666666666667e-05,
+      "loss": 1.4091,
+      "step": 40
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.2071741819381714,
+      "learning_rate": 4.8333333333333334e-05,
+      "loss": 1.5629,
+      "step": 50
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.11463215947151184,
+      "learning_rate": 4.8e-05,
+      "loss": 1.4113,
+      "step": 60
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.4405534565448761,
+      "learning_rate": 4.766666666666667e-05,
+      "loss": 1.5984,
+      "step": 70
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.12359176576137543,
+      "learning_rate": 4.7333333333333336e-05,
+      "loss": 1.6352,
+      "step": 80
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.4452320635318756,
+      "learning_rate": 4.7e-05,
+      "loss": 1.4261,
+      "step": 90
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.15276187658309937,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 1.575,
+      "step": 100
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 101981695161600.0,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-100/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bb8f7cf8d1e00c5133d5f1e9ff7888824b7c793be6dd6747a41226c47d016d32
+size 5304

checkpoint-1000/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: HuggingFaceTB/SmolLM2-135M-Instruct
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and BibTeX information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.14.0

checkpoint-1000/adapter_config.json ADDED Viewed

	@@ -0,0 +1,35 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": {
+    "base_model_class": "LlamaForCausalLM",
+    "parent_library": "transformers.models.llama.modeling_llama"
+  },
+  "base_model_name_or_path": "HuggingFaceTB/SmolLM2-135M-Instruct",
+  "bias": "none",
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj"
+  ],
+  "task_type": null,
+  "use_dora": false,
+  "use_rslora": false
+}

checkpoint-1000/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a703bd273862b8782c3be550362d9c9e9d32a165d6978af851531190632537d0
+size 1858776

checkpoint-1000/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:151148e103c0bb074369cb016faa8dc1a7067ced3855710667f27fe6ff20f29a
+size 3787258

checkpoint-1000/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7fcc3937efda7abcd37be4671aea947059cce155d7dbbae9620db44d82cc553b
+size 14244

checkpoint-1000/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e6229a91bbb3de67e9bf9bca4736a64084e8170a1fa112c1923289fbbf29c61c
+size 1064

checkpoint-1000/trainer_state.json ADDED Viewed

	@@ -0,0 +1,733 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.0,
+  "eval_steps": 500,
+  "global_step": 1000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.28473100066185,
+      "learning_rate": 4.966666666666667e-05,
+      "loss": 1.3783,
+      "step": 10
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.09849760681390762,
+      "learning_rate": 4.933333333333334e-05,
+      "loss": 1.319,
+      "step": 20
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.12863746285438538,
+      "learning_rate": 4.9e-05,
+      "loss": 1.4715,
+      "step": 30
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.16797196865081787,
+      "learning_rate": 4.866666666666667e-05,
+      "loss": 1.4091,
+      "step": 40
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.2071741819381714,
+      "learning_rate": 4.8333333333333334e-05,
+      "loss": 1.5629,
+      "step": 50
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.11463215947151184,
+      "learning_rate": 4.8e-05,
+      "loss": 1.4113,
+      "step": 60
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.4405534565448761,
+      "learning_rate": 4.766666666666667e-05,
+      "loss": 1.5984,
+      "step": 70
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.12359176576137543,
+      "learning_rate": 4.7333333333333336e-05,
+      "loss": 1.6352,
+      "step": 80
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.4452320635318756,
+      "learning_rate": 4.7e-05,
+      "loss": 1.4261,
+      "step": 90
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.15276187658309937,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 1.575,
+      "step": 100
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.24705903232097626,
+      "learning_rate": 4.633333333333333e-05,
+      "loss": 1.3795,
+      "step": 110
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.1368730366230011,
+      "learning_rate": 4.600000000000001e-05,
+      "loss": 1.4378,
+      "step": 120
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.40468767285346985,
+      "learning_rate": 4.566666666666667e-05,
+      "loss": 1.4759,
+      "step": 130
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.18973663449287415,
+      "learning_rate": 4.5333333333333335e-05,
+      "loss": 1.7495,
+      "step": 140
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.19974994659423828,
+      "learning_rate": 4.5e-05,
+      "loss": 1.4329,
+      "step": 150
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.1658446341753006,
+      "learning_rate": 4.466666666666667e-05,
+      "loss": 1.4386,
+      "step": 160
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.20768527686595917,
+      "learning_rate": 4.433333333333334e-05,
+      "loss": 1.5677,
+      "step": 170
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.12603804469108582,
+      "learning_rate": 4.4000000000000006e-05,
+      "loss": 1.3503,
+      "step": 180
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.11383192241191864,
+      "learning_rate": 4.3666666666666666e-05,
+      "loss": 1.536,
+      "step": 190
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.1635710746049881,
+      "learning_rate": 4.3333333333333334e-05,
+      "loss": 1.2382,
+      "step": 200
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.1817488968372345,
+      "learning_rate": 4.3e-05,
+      "loss": 1.442,
+      "step": 210
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.22358669340610504,
+      "learning_rate": 4.266666666666667e-05,
+      "loss": 1.6387,
+      "step": 220
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.242269828915596,
+      "learning_rate": 4.233333333333334e-05,
+      "loss": 1.289,
+      "step": 230
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.09938176721334457,
+      "learning_rate": 4.2e-05,
+      "loss": 1.6958,
+      "step": 240
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.24452297389507294,
+      "learning_rate": 4.166666666666667e-05,
+      "loss": 1.7457,
+      "step": 250
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.6737746596336365,
+      "learning_rate": 4.133333333333333e-05,
+      "loss": 1.4782,
+      "step": 260
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.3342950642108917,
+      "learning_rate": 4.1e-05,
+      "loss": 1.5557,
+      "step": 270
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.2859005928039551,
+      "learning_rate": 4.066666666666667e-05,
+      "loss": 1.6331,
+      "step": 280
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.196051225066185,
+      "learning_rate": 4.0333333333333336e-05,
+      "loss": 1.1884,
+      "step": 290
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.13173438608646393,
+      "learning_rate": 4e-05,
+      "loss": 1.5273,
+      "step": 300
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.15074381232261658,
+      "learning_rate": 3.966666666666667e-05,
+      "loss": 1.2017,
+      "step": 310
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.43569642305374146,
+      "learning_rate": 3.933333333333333e-05,
+      "loss": 1.4196,
+      "step": 320
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.13135027885437012,
+      "learning_rate": 3.9000000000000006e-05,
+      "loss": 1.2985,
+      "step": 330
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.683555543422699,
+      "learning_rate": 3.866666666666667e-05,
+      "loss": 2.0534,
+      "step": 340
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.18177451193332672,
+      "learning_rate": 3.8333333333333334e-05,
+      "loss": 1.4249,
+      "step": 350
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.8321959376335144,
+      "learning_rate": 3.8e-05,
+      "loss": 1.3113,
+      "step": 360
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.2439054697751999,
+      "learning_rate": 3.766666666666667e-05,
+      "loss": 1.569,
+      "step": 370
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.3370145261287689,
+      "learning_rate": 3.733333333333334e-05,
+      "loss": 1.6309,
+      "step": 380
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.17391540110111237,
+      "learning_rate": 3.7e-05,
+      "loss": 1.2326,
+      "step": 390
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.31036776304244995,
+      "learning_rate": 3.6666666666666666e-05,
+      "loss": 1.267,
+      "step": 400
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.16713590919971466,
+      "learning_rate": 3.633333333333333e-05,
+      "loss": 1.2097,
+      "step": 410
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.6574969291687012,
+      "learning_rate": 3.6e-05,
+      "loss": 1.7074,
+      "step": 420
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.21429121494293213,
+      "learning_rate": 3.566666666666667e-05,
+      "loss": 1.4363,
+      "step": 430
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.35936588048934937,
+      "learning_rate": 3.5333333333333336e-05,
+      "loss": 1.2745,
+      "step": 440
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.46381500363349915,
+      "learning_rate": 3.5e-05,
+      "loss": 1.3376,
+      "step": 450
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.1676020473241806,
+      "learning_rate": 3.466666666666667e-05,
+      "loss": 1.4441,
+      "step": 460
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.17079180479049683,
+      "learning_rate": 3.433333333333333e-05,
+      "loss": 1.4486,
+      "step": 470
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.35042911767959595,
+      "learning_rate": 3.4000000000000007e-05,
+      "loss": 1.6119,
+      "step": 480
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.13189171254634857,
+      "learning_rate": 3.366666666666667e-05,
+      "loss": 1.3443,
+      "step": 490
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.48539966344833374,
+      "learning_rate": 3.3333333333333335e-05,
+      "loss": 1.3611,
+      "step": 500
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 0.28593960404396057,
+      "learning_rate": 3.3e-05,
+      "loss": 1.6098,
+      "step": 510
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 0.16387881338596344,
+      "learning_rate": 3.266666666666667e-05,
+      "loss": 1.3215,
+      "step": 520
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 0.17869319021701813,
+      "learning_rate": 3.233333333333333e-05,
+      "loss": 1.6028,
+      "step": 530
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 0.26113930344581604,
+      "learning_rate": 3.2000000000000005e-05,
+      "loss": 1.4372,
+      "step": 540
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 0.16049513220787048,
+      "learning_rate": 3.1666666666666666e-05,
+      "loss": 1.3916,
+      "step": 550
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.14353357255458832,
+      "learning_rate": 3.1333333333333334e-05,
+      "loss": 1.3971,
+      "step": 560
+    },
+    {
+      "epoch": 1.1400000000000001,
+      "grad_norm": 0.348002552986145,
+      "learning_rate": 3.1e-05,
+      "loss": 1.5419,
+      "step": 570
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 0.4313276410102844,
+      "learning_rate": 3.066666666666667e-05,
+      "loss": 1.4511,
+      "step": 580
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 0.20719008147716522,
+      "learning_rate": 3.0333333333333337e-05,
+      "loss": 1.3274,
+      "step": 590
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.21788401901721954,
+      "learning_rate": 3e-05,
+      "loss": 1.2425,
+      "step": 600
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 0.4731273949146271,
+      "learning_rate": 2.9666666666666672e-05,
+      "loss": 1.4746,
+      "step": 610
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 0.2933167517185211,
+      "learning_rate": 2.9333333333333336e-05,
+      "loss": 1.475,
+      "step": 620
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 0.22523891925811768,
+      "learning_rate": 2.9e-05,
+      "loss": 1.3062,
+      "step": 630
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 0.20424093306064606,
+      "learning_rate": 2.8666666666666668e-05,
+      "loss": 1.4697,
+      "step": 640
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 0.5376447439193726,
+      "learning_rate": 2.8333333333333335e-05,
+      "loss": 1.5247,
+      "step": 650
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 0.3490164875984192,
+      "learning_rate": 2.8000000000000003e-05,
+      "loss": 1.3842,
+      "step": 660
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 0.18214678764343262,
+      "learning_rate": 2.7666666666666667e-05,
+      "loss": 1.3442,
+      "step": 670
+    },
+    {
+      "epoch": 1.3599999999999999,
+      "grad_norm": 0.25049060583114624,
+      "learning_rate": 2.733333333333333e-05,
+      "loss": 1.6505,
+      "step": 680
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 0.6477521657943726,
+      "learning_rate": 2.7000000000000002e-05,
+      "loss": 1.5945,
+      "step": 690
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 0.17422887682914734,
+      "learning_rate": 2.6666666666666667e-05,
+      "loss": 1.0883,
+      "step": 700
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 0.2029237151145935,
+      "learning_rate": 2.633333333333333e-05,
+      "loss": 1.3454,
+      "step": 710
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 0.2694133222103119,
+      "learning_rate": 2.6000000000000002e-05,
+      "loss": 1.0081,
+      "step": 720
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 0.1709485799074173,
+      "learning_rate": 2.57e-05,
+      "loss": 1.5033,
+      "step": 730
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 0.23790444433689117,
+      "learning_rate": 2.5366666666666665e-05,
+      "loss": 1.3424,
+      "step": 740
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 0.16748583316802979,
+      "learning_rate": 2.5033333333333336e-05,
+      "loss": 1.2721,
+      "step": 750
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 0.7728779315948486,
+      "learning_rate": 2.47e-05,
+      "loss": 1.818,
+      "step": 760
+    },
+    {
+      "epoch": 1.54,
+      "grad_norm": 0.900970458984375,
+      "learning_rate": 2.4366666666666668e-05,
+      "loss": 1.3781,
+      "step": 770
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 0.21673746407032013,
+      "learning_rate": 2.4033333333333336e-05,
+      "loss": 1.3427,
+      "step": 780
+    },
+    {
+      "epoch": 1.58,
+      "grad_norm": 0.2009885162115097,
+      "learning_rate": 2.37e-05,
+      "loss": 1.265,
+      "step": 790
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 0.2574044466018677,
+      "learning_rate": 2.3366666666666668e-05,
+      "loss": 1.4462,
+      "step": 800
+    },
+    {
+      "epoch": 1.62,
+      "grad_norm": 0.27698543667793274,
+      "learning_rate": 2.3033333333333335e-05,
+      "loss": 1.3308,
+      "step": 810
+    },
+    {
+      "epoch": 1.6400000000000001,
+      "grad_norm": 0.8570265173912048,
+      "learning_rate": 2.2700000000000003e-05,
+      "loss": 1.2596,
+      "step": 820
+    },
+    {
+      "epoch": 1.6600000000000001,
+      "grad_norm": 0.28962960839271545,
+      "learning_rate": 2.236666666666667e-05,
+      "loss": 1.3297,
+      "step": 830
+    },
+    {
+      "epoch": 1.6800000000000002,
+      "grad_norm": 0.22726690769195557,
+      "learning_rate": 2.2033333333333335e-05,
+      "loss": 1.3379,
+      "step": 840
+    },
+    {
+      "epoch": 1.7,
+      "grad_norm": 0.2658108174800873,
+      "learning_rate": 2.1700000000000002e-05,
+      "loss": 1.3958,
+      "step": 850
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 0.5850313305854797,
+      "learning_rate": 2.1366666666666667e-05,
+      "loss": 1.6927,
+      "step": 860
+    },
+    {
+      "epoch": 1.74,
+      "grad_norm": 0.45299115777015686,
+      "learning_rate": 2.1033333333333334e-05,
+      "loss": 1.4111,
+      "step": 870
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 0.19067615270614624,
+      "learning_rate": 2.07e-05,
+      "loss": 1.5171,
+      "step": 880
+    },
+    {
+      "epoch": 1.78,
+      "grad_norm": 0.19123299419879913,
+      "learning_rate": 2.0366666666666666e-05,
+      "loss": 1.4562,
+      "step": 890
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 0.1748710572719574,
+      "learning_rate": 2.0033333333333334e-05,
+      "loss": 1.366,
+      "step": 900
+    },
+    {
+      "epoch": 1.8199999999999998,
+      "grad_norm": 0.27076292037963867,
+      "learning_rate": 1.97e-05,
+      "loss": 1.3784,
+      "step": 910
+    },
+    {
+      "epoch": 1.8399999999999999,
+      "grad_norm": 0.7414858937263489,
+      "learning_rate": 1.9366666666666665e-05,
+      "loss": 1.4304,
+      "step": 920
+    },
+    {
+      "epoch": 1.8599999999999999,
+      "grad_norm": 0.8297638893127441,
+      "learning_rate": 1.9033333333333333e-05,
+      "loss": 1.3251,
+      "step": 930
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 0.17869873344898224,
+      "learning_rate": 1.87e-05,
+      "loss": 1.3164,
+      "step": 940
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 0.37470296025276184,
+      "learning_rate": 1.8366666666666668e-05,
+      "loss": 1.4136,
+      "step": 950
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 0.345380038022995,
+      "learning_rate": 1.8033333333333336e-05,
+      "loss": 1.4664,
+      "step": 960
+    },
+    {
+      "epoch": 1.94,
+      "grad_norm": 0.22281818091869354,
+      "learning_rate": 1.77e-05,
+      "loss": 1.505,
+      "step": 970
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 0.14282165467739105,
+      "learning_rate": 1.7366666666666668e-05,
+      "loss": 1.6581,
+      "step": 980
+    },
+    {
+      "epoch": 1.98,
+      "grad_norm": 0.2700720429420471,
+      "learning_rate": 1.7033333333333335e-05,
+      "loss": 1.4055,
+      "step": 990
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.20198316872119904,
+      "learning_rate": 1.6700000000000003e-05,
+      "loss": 1.1787,
+      "step": 1000
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1088098288422912.0,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-1000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bb8f7cf8d1e00c5133d5f1e9ff7888824b7c793be6dd6747a41226c47d016d32
+size 5304

checkpoint-1100/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: HuggingFaceTB/SmolLM2-135M-Instruct
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.14.0

checkpoint-1100/adapter_config.json ADDED Viewed

	@@ -0,0 +1,35 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": {
+    "base_model_class": "LlamaForCausalLM",
+    "parent_library": "transformers.models.llama.modeling_llama"
+  },
+  "base_model_name_or_path": "HuggingFaceTB/SmolLM2-135M-Instruct",
+  "bias": "none",
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj"
+  ],
+  "task_type": null,
+  "use_dora": false,
+  "use_rslora": false
+}

checkpoint-1100/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d4d53a2b5f5340a7c40bbfaa04a80530f52755a286eae58c7684f11b8cf9074f
+size 1858776

checkpoint-1100/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5dc7f680e72109f3cfbc3b542fe5787a9999fe09daba7a49770cda6b66b51c6a
+size 3787258

checkpoint-1100/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7b85dec5584bd1278556c8a6ae87342668c710605837ef11880fbfe90d712b45
+size 14244

checkpoint-1100/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4ec5b2dcbbb1b08718ca6e437fab029b7183b68c4fd6277a91d282cc0d7cff18
+size 1064

checkpoint-1100/trainer_state.json ADDED Viewed

	@@ -0,0 +1,803 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.2,
+  "eval_steps": 500,
+  "global_step": 1100,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.28473100066185,
+      "learning_rate": 4.966666666666667e-05,
+      "loss": 1.3783,
+      "step": 10
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.09849760681390762,
+      "learning_rate": 4.933333333333334e-05,
+      "loss": 1.319,
+      "step": 20
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.12863746285438538,
+      "learning_rate": 4.9e-05,
+      "loss": 1.4715,
+      "step": 30
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.16797196865081787,
+      "learning_rate": 4.866666666666667e-05,
+      "loss": 1.4091,
+      "step": 40
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.2071741819381714,
+      "learning_rate": 4.8333333333333334e-05,
+      "loss": 1.5629,
+      "step": 50
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.11463215947151184,
+      "learning_rate": 4.8e-05,
+      "loss": 1.4113,
+      "step": 60
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.4405534565448761,
+      "learning_rate": 4.766666666666667e-05,
+      "loss": 1.5984,
+      "step": 70
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.12359176576137543,
+      "learning_rate": 4.7333333333333336e-05,
+      "loss": 1.6352,
+      "step": 80
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.4452320635318756,
+      "learning_rate": 4.7e-05,
+      "loss": 1.4261,
+      "step": 90
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.15276187658309937,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 1.575,
+      "step": 100
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.24705903232097626,
+      "learning_rate": 4.633333333333333e-05,
+      "loss": 1.3795,
+      "step": 110
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.1368730366230011,
+      "learning_rate": 4.600000000000001e-05,
+      "loss": 1.4378,
+      "step": 120
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.40468767285346985,
+      "learning_rate": 4.566666666666667e-05,
+      "loss": 1.4759,
+      "step": 130
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.18973663449287415,
+      "learning_rate": 4.5333333333333335e-05,
+      "loss": 1.7495,
+      "step": 140
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.19974994659423828,
+      "learning_rate": 4.5e-05,
+      "loss": 1.4329,
+      "step": 150
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.1658446341753006,
+      "learning_rate": 4.466666666666667e-05,
+      "loss": 1.4386,
+      "step": 160
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.20768527686595917,
+      "learning_rate": 4.433333333333334e-05,
+      "loss": 1.5677,
+      "step": 170
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.12603804469108582,
+      "learning_rate": 4.4000000000000006e-05,
+      "loss": 1.3503,
+      "step": 180
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.11383192241191864,
+      "learning_rate": 4.3666666666666666e-05,
+      "loss": 1.536,
+      "step": 190
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.1635710746049881,
+      "learning_rate": 4.3333333333333334e-05,
+      "loss": 1.2382,
+      "step": 200
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.1817488968372345,
+      "learning_rate": 4.3e-05,
+      "loss": 1.442,
+      "step": 210
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.22358669340610504,
+      "learning_rate": 4.266666666666667e-05,
+      "loss": 1.6387,
+      "step": 220
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.242269828915596,
+      "learning_rate": 4.233333333333334e-05,
+      "loss": 1.289,
+      "step": 230
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.09938176721334457,
+      "learning_rate": 4.2e-05,
+      "loss": 1.6958,
+      "step": 240
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.24452297389507294,
+      "learning_rate": 4.166666666666667e-05,
+      "loss": 1.7457,
+      "step": 250
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.6737746596336365,
+      "learning_rate": 4.133333333333333e-05,
+      "loss": 1.4782,
+      "step": 260
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.3342950642108917,
+      "learning_rate": 4.1e-05,
+      "loss": 1.5557,
+      "step": 270
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.2859005928039551,
+      "learning_rate": 4.066666666666667e-05,
+      "loss": 1.6331,
+      "step": 280
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.196051225066185,
+      "learning_rate": 4.0333333333333336e-05,
+      "loss": 1.1884,
+      "step": 290
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.13173438608646393,
+      "learning_rate": 4e-05,
+      "loss": 1.5273,
+      "step": 300
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.15074381232261658,
+      "learning_rate": 3.966666666666667e-05,
+      "loss": 1.2017,
+      "step": 310
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.43569642305374146,
+      "learning_rate": 3.933333333333333e-05,
+      "loss": 1.4196,
+      "step": 320
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.13135027885437012,
+      "learning_rate": 3.9000000000000006e-05,
+      "loss": 1.2985,
+      "step": 330
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.683555543422699,
+      "learning_rate": 3.866666666666667e-05,
+      "loss": 2.0534,
+      "step": 340
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.18177451193332672,
+      "learning_rate": 3.8333333333333334e-05,
+      "loss": 1.4249,
+      "step": 350
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.8321959376335144,
+      "learning_rate": 3.8e-05,
+      "loss": 1.3113,
+      "step": 360
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.2439054697751999,
+      "learning_rate": 3.766666666666667e-05,
+      "loss": 1.569,
+      "step": 370
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.3370145261287689,
+      "learning_rate": 3.733333333333334e-05,
+      "loss": 1.6309,
+      "step": 380
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.17391540110111237,
+      "learning_rate": 3.7e-05,
+      "loss": 1.2326,
+      "step": 390
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.31036776304244995,
+      "learning_rate": 3.6666666666666666e-05,
+      "loss": 1.267,
+      "step": 400
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.16713590919971466,
+      "learning_rate": 3.633333333333333e-05,
+      "loss": 1.2097,
+      "step": 410
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.6574969291687012,
+      "learning_rate": 3.6e-05,
+      "loss": 1.7074,
+      "step": 420
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.21429121494293213,
+      "learning_rate": 3.566666666666667e-05,
+      "loss": 1.4363,
+      "step": 430
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.35936588048934937,
+      "learning_rate": 3.5333333333333336e-05,
+      "loss": 1.2745,
+      "step": 440
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.46381500363349915,
+      "learning_rate": 3.5e-05,
+      "loss": 1.3376,
+      "step": 450
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.1676020473241806,
+      "learning_rate": 3.466666666666667e-05,
+      "loss": 1.4441,
+      "step": 460
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.17079180479049683,
+      "learning_rate": 3.433333333333333e-05,
+      "loss": 1.4486,
+      "step": 470
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.35042911767959595,
+      "learning_rate": 3.4000000000000007e-05,
+      "loss": 1.6119,
+      "step": 480
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.13189171254634857,
+      "learning_rate": 3.366666666666667e-05,
+      "loss": 1.3443,
+      "step": 490
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.48539966344833374,
+      "learning_rate": 3.3333333333333335e-05,
+      "loss": 1.3611,
+      "step": 500
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 0.28593960404396057,
+      "learning_rate": 3.3e-05,
+      "loss": 1.6098,
+      "step": 510
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 0.16387881338596344,
+      "learning_rate": 3.266666666666667e-05,
+      "loss": 1.3215,
+      "step": 520
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 0.17869319021701813,
+      "learning_rate": 3.233333333333333e-05,
+      "loss": 1.6028,
+      "step": 530
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 0.26113930344581604,
+      "learning_rate": 3.2000000000000005e-05,
+      "loss": 1.4372,
+      "step": 540
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 0.16049513220787048,
+      "learning_rate": 3.1666666666666666e-05,
+      "loss": 1.3916,
+      "step": 550
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.14353357255458832,
+      "learning_rate": 3.1333333333333334e-05,
+      "loss": 1.3971,
+      "step": 560
+    },
+    {
+      "epoch": 1.1400000000000001,
+      "grad_norm": 0.348002552986145,
+      "learning_rate": 3.1e-05,
+      "loss": 1.5419,
+      "step": 570
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 0.4313276410102844,
+      "learning_rate": 3.066666666666667e-05,
+      "loss": 1.4511,
+      "step": 580
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 0.20719008147716522,
+      "learning_rate": 3.0333333333333337e-05,
+      "loss": 1.3274,
+      "step": 590
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.21788401901721954,
+      "learning_rate": 3e-05,
+      "loss": 1.2425,
+      "step": 600
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 0.4731273949146271,
+      "learning_rate": 2.9666666666666672e-05,
+      "loss": 1.4746,
+      "step": 610
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 0.2933167517185211,
+      "learning_rate": 2.9333333333333336e-05,
+      "loss": 1.475,
+      "step": 620
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 0.22523891925811768,
+      "learning_rate": 2.9e-05,
+      "loss": 1.3062,
+      "step": 630
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 0.20424093306064606,
+      "learning_rate": 2.8666666666666668e-05,
+      "loss": 1.4697,
+      "step": 640
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 0.5376447439193726,
+      "learning_rate": 2.8333333333333335e-05,
+      "loss": 1.5247,
+      "step": 650
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 0.3490164875984192,
+      "learning_rate": 2.8000000000000003e-05,
+      "loss": 1.3842,
+      "step": 660
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 0.18214678764343262,
+      "learning_rate": 2.7666666666666667e-05,
+      "loss": 1.3442,
+      "step": 670
+    },
+    {
+      "epoch": 1.3599999999999999,
+      "grad_norm": 0.25049060583114624,
+      "learning_rate": 2.733333333333333e-05,
+      "loss": 1.6505,
+      "step": 680
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 0.6477521657943726,
+      "learning_rate": 2.7000000000000002e-05,
+      "loss": 1.5945,
+      "step": 690
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 0.17422887682914734,
+      "learning_rate": 2.6666666666666667e-05,
+      "loss": 1.0883,
+      "step": 700
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 0.2029237151145935,
+      "learning_rate": 2.633333333333333e-05,
+      "loss": 1.3454,
+      "step": 710
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 0.2694133222103119,
+      "learning_rate": 2.6000000000000002e-05,
+      "loss": 1.0081,
+      "step": 720
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 0.1709485799074173,
+      "learning_rate": 2.57e-05,
+      "loss": 1.5033,
+      "step": 730
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 0.23790444433689117,
+      "learning_rate": 2.5366666666666665e-05,
+      "loss": 1.3424,
+      "step": 740
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 0.16748583316802979,
+      "learning_rate": 2.5033333333333336e-05,
+      "loss": 1.2721,
+      "step": 750
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 0.7728779315948486,
+      "learning_rate": 2.47e-05,
+      "loss": 1.818,
+      "step": 760
+    },
+    {
+      "epoch": 1.54,
+      "grad_norm": 0.900970458984375,
+      "learning_rate": 2.4366666666666668e-05,
+      "loss": 1.3781,
+      "step": 770
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 0.21673746407032013,
+      "learning_rate": 2.4033333333333336e-05,
+      "loss": 1.3427,
+      "step": 780
+    },
+    {
+      "epoch": 1.58,
+      "grad_norm": 0.2009885162115097,
+      "learning_rate": 2.37e-05,
+      "loss": 1.265,
+      "step": 790
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 0.2574044466018677,
+      "learning_rate": 2.3366666666666668e-05,
+      "loss": 1.4462,
+      "step": 800
+    },
+    {
+      "epoch": 1.62,
+      "grad_norm": 0.27698543667793274,
+      "learning_rate": 2.3033333333333335e-05,
+      "loss": 1.3308,
+      "step": 810
+    },
+    {
+      "epoch": 1.6400000000000001,
+      "grad_norm": 0.8570265173912048,
+      "learning_rate": 2.2700000000000003e-05,
+      "loss": 1.2596,
+      "step": 820
+    },
+    {
+      "epoch": 1.6600000000000001,
+      "grad_norm": 0.28962960839271545,
+      "learning_rate": 2.236666666666667e-05,
+      "loss": 1.3297,
+      "step": 830
+    },
+    {
+      "epoch": 1.6800000000000002,
+      "grad_norm": 0.22726690769195557,
+      "learning_rate": 2.2033333333333335e-05,
+      "loss": 1.3379,
+      "step": 840
+    },
+    {
+      "epoch": 1.7,
+      "grad_norm": 0.2658108174800873,
+      "learning_rate": 2.1700000000000002e-05,
+      "loss": 1.3958,
+      "step": 850
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 0.5850313305854797,
+      "learning_rate": 2.1366666666666667e-05,
+      "loss": 1.6927,
+      "step": 860
+    },
+    {
+      "epoch": 1.74,
+      "grad_norm": 0.45299115777015686,
+      "learning_rate": 2.1033333333333334e-05,
+      "loss": 1.4111,
+      "step": 870
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 0.19067615270614624,
+      "learning_rate": 2.07e-05,
+      "loss": 1.5171,
+      "step": 880
+    },
+    {
+      "epoch": 1.78,
+      "grad_norm": 0.19123299419879913,
+      "learning_rate": 2.0366666666666666e-05,
+      "loss": 1.4562,
+      "step": 890
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 0.1748710572719574,
+      "learning_rate": 2.0033333333333334e-05,
+      "loss": 1.366,
+      "step": 900
+    },
+    {
+      "epoch": 1.8199999999999998,
+      "grad_norm": 0.27076292037963867,
+      "learning_rate": 1.97e-05,
+      "loss": 1.3784,
+      "step": 910
+    },
+    {
+      "epoch": 1.8399999999999999,
+      "grad_norm": 0.7414858937263489,
+      "learning_rate": 1.9366666666666665e-05,
+      "loss": 1.4304,
+      "step": 920
+    },
+    {
+      "epoch": 1.8599999999999999,
+      "grad_norm": 0.8297638893127441,
+      "learning_rate": 1.9033333333333333e-05,
+      "loss": 1.3251,
+      "step": 930
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 0.17869873344898224,
+      "learning_rate": 1.87e-05,
+      "loss": 1.3164,
+      "step": 940
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 0.37470296025276184,
+      "learning_rate": 1.8366666666666668e-05,
+      "loss": 1.4136,
+      "step": 950
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 0.345380038022995,
+      "learning_rate": 1.8033333333333336e-05,
+      "loss": 1.4664,
+      "step": 960
+    },
+    {
+      "epoch": 1.94,
+      "grad_norm": 0.22281818091869354,
+      "learning_rate": 1.77e-05,
+      "loss": 1.505,
+      "step": 970
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 0.14282165467739105,
+      "learning_rate": 1.7366666666666668e-05,
+      "loss": 1.6581,
+      "step": 980
+    },
+    {
+      "epoch": 1.98,
+      "grad_norm": 0.2700720429420471,
+      "learning_rate": 1.7033333333333335e-05,
+      "loss": 1.4055,
+      "step": 990
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.20198316872119904,
+      "learning_rate": 1.6700000000000003e-05,
+      "loss": 1.1787,
+      "step": 1000
+    },
+    {
+      "epoch": 2.02,
+      "grad_norm": 0.249458909034729,
+      "learning_rate": 1.6366666666666667e-05,
+      "loss": 1.292,
+      "step": 1010
+    },
+    {
+      "epoch": 2.04,
+      "grad_norm": 0.2311285138130188,
+      "learning_rate": 1.6033333333333335e-05,
+      "loss": 1.1209,
+      "step": 1020
+    },
+    {
+      "epoch": 2.06,
+      "grad_norm": 0.21848829090595245,
+      "learning_rate": 1.5700000000000002e-05,
+      "loss": 1.3446,
+      "step": 1030
+    },
+    {
+      "epoch": 2.08,
+      "grad_norm": 0.1550074964761734,
+      "learning_rate": 1.536666666666667e-05,
+      "loss": 1.4103,
+      "step": 1040
+    },
+    {
+      "epoch": 2.1,
+      "grad_norm": 0.5638441443443298,
+      "learning_rate": 1.5033333333333336e-05,
+      "loss": 1.5772,
+      "step": 1050
+    },
+    {
+      "epoch": 2.12,
+      "grad_norm": 0.3584989905357361,
+      "learning_rate": 1.47e-05,
+      "loss": 1.488,
+      "step": 1060
+    },
+    {
+      "epoch": 2.14,
+      "grad_norm": 0.2701733410358429,
+      "learning_rate": 1.4366666666666667e-05,
+      "loss": 1.4173,
+      "step": 1070
+    },
+    {
+      "epoch": 2.16,
+      "grad_norm": 0.1940249353647232,
+      "learning_rate": 1.4033333333333335e-05,
+      "loss": 1.209,
+      "step": 1080
+    },
+    {
+      "epoch": 2.18,
+      "grad_norm": 0.46034470200538635,
+      "learning_rate": 1.3700000000000001e-05,
+      "loss": 1.7403,
+      "step": 1090
+    },
+    {
+      "epoch": 2.2,
+      "grad_norm": 0.5914195775985718,
+      "learning_rate": 1.3366666666666667e-05,
+      "loss": 1.6092,
+      "step": 1100
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1201575403782144.0,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-1100/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bb8f7cf8d1e00c5133d5f1e9ff7888824b7c793be6dd6747a41226c47d016d32
+size 5304

checkpoint-1200/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: HuggingFaceTB/SmolLM2-135M-Instruct
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.14.0

checkpoint-1200/adapter_config.json ADDED Viewed

	@@ -0,0 +1,35 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": {
+    "base_model_class": "LlamaForCausalLM",
+    "parent_library": "transformers.models.llama.modeling_llama"
+  },
+  "base_model_name_or_path": "HuggingFaceTB/SmolLM2-135M-Instruct",
+  "bias": "none",
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj"
+  ],
+  "task_type": null,
+  "use_dora": false,
+  "use_rslora": false
+}

checkpoint-1200/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:df6cf1a042166dced131a03ae1c5b46bcbfe6de8894f4addb0456036e163d1c5
+size 1858776

checkpoint-1200/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:df2a447496eff071edbe194ba82318d5c5ab8d9ab1cf52cf239a3b4a7b158c01
+size 3787258

checkpoint-1200/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a395c0f7f3c73f88a175167d89069fb942f9a81fb568cfc934ae554d98464db8
+size 14244

checkpoint-1200/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dccc4d87a1c6c7ce2924d7d53e15349e17f271a954cb24fcf4c613a7d82d210d
+size 1064

checkpoint-1200/trainer_state.json ADDED Viewed

	@@ -0,0 +1,873 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.4,
+  "eval_steps": 500,
+  "global_step": 1200,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.28473100066185,
+      "learning_rate": 4.966666666666667e-05,
+      "loss": 1.3783,
+      "step": 10
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.09849760681390762,
+      "learning_rate": 4.933333333333334e-05,
+      "loss": 1.319,
+      "step": 20
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.12863746285438538,
+      "learning_rate": 4.9e-05,
+      "loss": 1.4715,
+      "step": 30
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.16797196865081787,
+      "learning_rate": 4.866666666666667e-05,
+      "loss": 1.4091,
+      "step": 40
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.2071741819381714,
+      "learning_rate": 4.8333333333333334e-05,
+      "loss": 1.5629,
+      "step": 50
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.11463215947151184,
+      "learning_rate": 4.8e-05,
+      "loss": 1.4113,
+      "step": 60
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.4405534565448761,
+      "learning_rate": 4.766666666666667e-05,
+      "loss": 1.5984,
+      "step": 70
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.12359176576137543,
+      "learning_rate": 4.7333333333333336e-05,
+      "loss": 1.6352,
+      "step": 80
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.4452320635318756,
+      "learning_rate": 4.7e-05,
+      "loss": 1.4261,
+      "step": 90
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.15276187658309937,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 1.575,
+      "step": 100
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.24705903232097626,
+      "learning_rate": 4.633333333333333e-05,
+      "loss": 1.3795,
+      "step": 110
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.1368730366230011,
+      "learning_rate": 4.600000000000001e-05,
+      "loss": 1.4378,
+      "step": 120
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.40468767285346985,
+      "learning_rate": 4.566666666666667e-05,
+      "loss": 1.4759,
+      "step": 130
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.18973663449287415,
+      "learning_rate": 4.5333333333333335e-05,
+      "loss": 1.7495,
+      "step": 140
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.19974994659423828,
+      "learning_rate": 4.5e-05,
+      "loss": 1.4329,
+      "step": 150
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.1658446341753006,
+      "learning_rate": 4.466666666666667e-05,
+      "loss": 1.4386,
+      "step": 160
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.20768527686595917,
+      "learning_rate": 4.433333333333334e-05,
+      "loss": 1.5677,
+      "step": 170
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.12603804469108582,
+      "learning_rate": 4.4000000000000006e-05,
+      "loss": 1.3503,
+      "step": 180
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.11383192241191864,
+      "learning_rate": 4.3666666666666666e-05,
+      "loss": 1.536,
+      "step": 190
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.1635710746049881,
+      "learning_rate": 4.3333333333333334e-05,
+      "loss": 1.2382,
+      "step": 200
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.1817488968372345,
+      "learning_rate": 4.3e-05,
+      "loss": 1.442,
+      "step": 210
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.22358669340610504,
+      "learning_rate": 4.266666666666667e-05,
+      "loss": 1.6387,
+      "step": 220
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.242269828915596,
+      "learning_rate": 4.233333333333334e-05,
+      "loss": 1.289,
+      "step": 230
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.09938176721334457,
+      "learning_rate": 4.2e-05,
+      "loss": 1.6958,
+      "step": 240
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.24452297389507294,
+      "learning_rate": 4.166666666666667e-05,
+      "loss": 1.7457,
+      "step": 250
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.6737746596336365,
+      "learning_rate": 4.133333333333333e-05,
+      "loss": 1.4782,
+      "step": 260
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.3342950642108917,
+      "learning_rate": 4.1e-05,
+      "loss": 1.5557,
+      "step": 270
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.2859005928039551,
+      "learning_rate": 4.066666666666667e-05,
+      "loss": 1.6331,
+      "step": 280
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.196051225066185,
+      "learning_rate": 4.0333333333333336e-05,
+      "loss": 1.1884,
+      "step": 290
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.13173438608646393,
+      "learning_rate": 4e-05,
+      "loss": 1.5273,
+      "step": 300
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.15074381232261658,
+      "learning_rate": 3.966666666666667e-05,
+      "loss": 1.2017,
+      "step": 310
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.43569642305374146,
+      "learning_rate": 3.933333333333333e-05,
+      "loss": 1.4196,
+      "step": 320
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.13135027885437012,
+      "learning_rate": 3.9000000000000006e-05,
+      "loss": 1.2985,
+      "step": 330
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.683555543422699,
+      "learning_rate": 3.866666666666667e-05,
+      "loss": 2.0534,
+      "step": 340
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.18177451193332672,
+      "learning_rate": 3.8333333333333334e-05,
+      "loss": 1.4249,
+      "step": 350
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.8321959376335144,
+      "learning_rate": 3.8e-05,
+      "loss": 1.3113,
+      "step": 360
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.2439054697751999,
+      "learning_rate": 3.766666666666667e-05,
+      "loss": 1.569,
+      "step": 370
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.3370145261287689,
+      "learning_rate": 3.733333333333334e-05,
+      "loss": 1.6309,
+      "step": 380
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.17391540110111237,
+      "learning_rate": 3.7e-05,
+      "loss": 1.2326,
+      "step": 390
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.31036776304244995,
+      "learning_rate": 3.6666666666666666e-05,
+      "loss": 1.267,
+      "step": 400
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.16713590919971466,
+      "learning_rate": 3.633333333333333e-05,
+      "loss": 1.2097,
+      "step": 410
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.6574969291687012,
+      "learning_rate": 3.6e-05,
+      "loss": 1.7074,
+      "step": 420
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.21429121494293213,
+      "learning_rate": 3.566666666666667e-05,
+      "loss": 1.4363,
+      "step": 430
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.35936588048934937,
+      "learning_rate": 3.5333333333333336e-05,
+      "loss": 1.2745,
+      "step": 440
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.46381500363349915,
+      "learning_rate": 3.5e-05,
+      "loss": 1.3376,
+      "step": 450
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.1676020473241806,
+      "learning_rate": 3.466666666666667e-05,
+      "loss": 1.4441,
+      "step": 460
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.17079180479049683,
+      "learning_rate": 3.433333333333333e-05,
+      "loss": 1.4486,
+      "step": 470
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.35042911767959595,
+      "learning_rate": 3.4000000000000007e-05,
+      "loss": 1.6119,
+      "step": 480
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.13189171254634857,
+      "learning_rate": 3.366666666666667e-05,
+      "loss": 1.3443,
+      "step": 490
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.48539966344833374,
+      "learning_rate": 3.3333333333333335e-05,
+      "loss": 1.3611,
+      "step": 500
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 0.28593960404396057,
+      "learning_rate": 3.3e-05,
+      "loss": 1.6098,
+      "step": 510
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 0.16387881338596344,
+      "learning_rate": 3.266666666666667e-05,
+      "loss": 1.3215,
+      "step": 520
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 0.17869319021701813,
+      "learning_rate": 3.233333333333333e-05,
+      "loss": 1.6028,
+      "step": 530
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 0.26113930344581604,
+      "learning_rate": 3.2000000000000005e-05,
+      "loss": 1.4372,
+      "step": 540
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 0.16049513220787048,
+      "learning_rate": 3.1666666666666666e-05,
+      "loss": 1.3916,
+      "step": 550
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.14353357255458832,
+      "learning_rate": 3.1333333333333334e-05,
+      "loss": 1.3971,
+      "step": 560
+    },
+    {
+      "epoch": 1.1400000000000001,
+      "grad_norm": 0.348002552986145,
+      "learning_rate": 3.1e-05,
+      "loss": 1.5419,
+      "step": 570
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 0.4313276410102844,
+      "learning_rate": 3.066666666666667e-05,
+      "loss": 1.4511,
+      "step": 580
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 0.20719008147716522,
+      "learning_rate": 3.0333333333333337e-05,
+      "loss": 1.3274,
+      "step": 590
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.21788401901721954,
+      "learning_rate": 3e-05,
+      "loss": 1.2425,
+      "step": 600
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 0.4731273949146271,
+      "learning_rate": 2.9666666666666672e-05,
+      "loss": 1.4746,
+      "step": 610
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 0.2933167517185211,
+      "learning_rate": 2.9333333333333336e-05,
+      "loss": 1.475,
+      "step": 620
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 0.22523891925811768,
+      "learning_rate": 2.9e-05,
+      "loss": 1.3062,
+      "step": 630
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 0.20424093306064606,
+      "learning_rate": 2.8666666666666668e-05,
+      "loss": 1.4697,
+      "step": 640
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 0.5376447439193726,
+      "learning_rate": 2.8333333333333335e-05,
+      "loss": 1.5247,
+      "step": 650
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 0.3490164875984192,
+      "learning_rate": 2.8000000000000003e-05,
+      "loss": 1.3842,
+      "step": 660
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 0.18214678764343262,
+      "learning_rate": 2.7666666666666667e-05,
+      "loss": 1.3442,
+      "step": 670
+    },
+    {
+      "epoch": 1.3599999999999999,
+      "grad_norm": 0.25049060583114624,
+      "learning_rate": 2.733333333333333e-05,
+      "loss": 1.6505,
+      "step": 680
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 0.6477521657943726,
+      "learning_rate": 2.7000000000000002e-05,
+      "loss": 1.5945,
+      "step": 690
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 0.17422887682914734,
+      "learning_rate": 2.6666666666666667e-05,
+      "loss": 1.0883,
+      "step": 700
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 0.2029237151145935,
+      "learning_rate": 2.633333333333333e-05,
+      "loss": 1.3454,
+      "step": 710
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 0.2694133222103119,
+      "learning_rate": 2.6000000000000002e-05,
+      "loss": 1.0081,
+      "step": 720
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 0.1709485799074173,
+      "learning_rate": 2.57e-05,
+      "loss": 1.5033,
+      "step": 730
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 0.23790444433689117,
+      "learning_rate": 2.5366666666666665e-05,
+      "loss": 1.3424,
+      "step": 740
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 0.16748583316802979,
+      "learning_rate": 2.5033333333333336e-05,
+      "loss": 1.2721,
+      "step": 750
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 0.7728779315948486,
+      "learning_rate": 2.47e-05,
+      "loss": 1.818,
+      "step": 760
+    },
+    {
+      "epoch": 1.54,
+      "grad_norm": 0.900970458984375,
+      "learning_rate": 2.4366666666666668e-05,
+      "loss": 1.3781,
+      "step": 770
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 0.21673746407032013,
+      "learning_rate": 2.4033333333333336e-05,
+      "loss": 1.3427,
+      "step": 780
+    },
+    {
+      "epoch": 1.58,
+      "grad_norm": 0.2009885162115097,
+      "learning_rate": 2.37e-05,
+      "loss": 1.265,
+      "step": 790
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 0.2574044466018677,
+      "learning_rate": 2.3366666666666668e-05,
+      "loss": 1.4462,
+      "step": 800
+    },
+    {
+      "epoch": 1.62,
+      "grad_norm": 0.27698543667793274,
+      "learning_rate": 2.3033333333333335e-05,
+      "loss": 1.3308,
+      "step": 810
+    },
+    {
+      "epoch": 1.6400000000000001,
+      "grad_norm": 0.8570265173912048,
+      "learning_rate": 2.2700000000000003e-05,
+      "loss": 1.2596,
+      "step": 820
+    },
+    {
+      "epoch": 1.6600000000000001,
+      "grad_norm": 0.28962960839271545,
+      "learning_rate": 2.236666666666667e-05,
+      "loss": 1.3297,
+      "step": 830
+    },
+    {
+      "epoch": 1.6800000000000002,
+      "grad_norm": 0.22726690769195557,
+      "learning_rate": 2.2033333333333335e-05,
+      "loss": 1.3379,
+      "step": 840
+    },
+    {
+      "epoch": 1.7,
+      "grad_norm": 0.2658108174800873,
+      "learning_rate": 2.1700000000000002e-05,
+      "loss": 1.3958,
+      "step": 850
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 0.5850313305854797,
+      "learning_rate": 2.1366666666666667e-05,
+      "loss": 1.6927,
+      "step": 860
+    },
+    {
+      "epoch": 1.74,
+      "grad_norm": 0.45299115777015686,
+      "learning_rate": 2.1033333333333334e-05,
+      "loss": 1.4111,
+      "step": 870
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 0.19067615270614624,
+      "learning_rate": 2.07e-05,
+      "loss": 1.5171,
+      "step": 880
+    },
+    {
+      "epoch": 1.78,
+      "grad_norm": 0.19123299419879913,
+      "learning_rate": 2.0366666666666666e-05,
+      "loss": 1.4562,
+      "step": 890
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 0.1748710572719574,
+      "learning_rate": 2.0033333333333334e-05,
+      "loss": 1.366,
+      "step": 900
+    },
+    {
+      "epoch": 1.8199999999999998,
+      "grad_norm": 0.27076292037963867,
+      "learning_rate": 1.97e-05,
+      "loss": 1.3784,
+      "step": 910
+    },
+    {
+      "epoch": 1.8399999999999999,
+      "grad_norm": 0.7414858937263489,
+      "learning_rate": 1.9366666666666665e-05,
+      "loss": 1.4304,
+      "step": 920
+    },
+    {
+      "epoch": 1.8599999999999999,
+      "grad_norm": 0.8297638893127441,
+      "learning_rate": 1.9033333333333333e-05,
+      "loss": 1.3251,
+      "step": 930
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 0.17869873344898224,
+      "learning_rate": 1.87e-05,
+      "loss": 1.3164,
+      "step": 940
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 0.37470296025276184,
+      "learning_rate": 1.8366666666666668e-05,
+      "loss": 1.4136,
+      "step": 950
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 0.345380038022995,
+      "learning_rate": 1.8033333333333336e-05,
+      "loss": 1.4664,
+      "step": 960
+    },
+    {
+      "epoch": 1.94,
+      "grad_norm": 0.22281818091869354,
+      "learning_rate": 1.77e-05,
+      "loss": 1.505,
+      "step": 970
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 0.14282165467739105,
+      "learning_rate": 1.7366666666666668e-05,
+      "loss": 1.6581,
+      "step": 980
+    },
+    {
+      "epoch": 1.98,
+      "grad_norm": 0.2700720429420471,
+      "learning_rate": 1.7033333333333335e-05,
+      "loss": 1.4055,
+      "step": 990
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.20198316872119904,
+      "learning_rate": 1.6700000000000003e-05,
+      "loss": 1.1787,
+      "step": 1000
+    },
+    {
+      "epoch": 2.02,
+      "grad_norm": 0.249458909034729,
+      "learning_rate": 1.6366666666666667e-05,
+      "loss": 1.292,
+      "step": 1010
+    },
+    {
+      "epoch": 2.04,
+      "grad_norm": 0.2311285138130188,
+      "learning_rate": 1.6033333333333335e-05,
+      "loss": 1.1209,
+      "step": 1020
+    },
+    {
+      "epoch": 2.06,
+      "grad_norm": 0.21848829090595245,
+      "learning_rate": 1.5700000000000002e-05,
+      "loss": 1.3446,
+      "step": 1030
+    },
+    {
+      "epoch": 2.08,
+      "grad_norm": 0.1550074964761734,
+      "learning_rate": 1.536666666666667e-05,
+      "loss": 1.4103,
+      "step": 1040
+    },
+    {
+      "epoch": 2.1,
+      "grad_norm": 0.5638441443443298,
+      "learning_rate": 1.5033333333333336e-05,
+      "loss": 1.5772,
+      "step": 1050
+    },
+    {
+      "epoch": 2.12,
+      "grad_norm": 0.3584989905357361,
+      "learning_rate": 1.47e-05,
+      "loss": 1.488,
+      "step": 1060
+    },
+    {
+      "epoch": 2.14,
+      "grad_norm": 0.2701733410358429,
+      "learning_rate": 1.4366666666666667e-05,
+      "loss": 1.4173,
+      "step": 1070
+    },
+    {
+      "epoch": 2.16,
+      "grad_norm": 0.1940249353647232,
+      "learning_rate": 1.4033333333333335e-05,
+      "loss": 1.209,
+      "step": 1080
+    },
+    {
+      "epoch": 2.18,
+      "grad_norm": 0.46034470200538635,
+      "learning_rate": 1.3700000000000001e-05,
+      "loss": 1.7403,
+      "step": 1090
+    },
+    {
+      "epoch": 2.2,
+      "grad_norm": 0.5914195775985718,
+      "learning_rate": 1.3366666666666667e-05,
+      "loss": 1.6092,
+      "step": 1100
+    },
+    {
+      "epoch": 2.22,
+      "grad_norm": 0.25916194915771484,
+      "learning_rate": 1.3033333333333333e-05,
+      "loss": 1.2053,
+      "step": 1110
+    },
+    {
+      "epoch": 2.24,
+      "grad_norm": 0.2523491084575653,
+      "learning_rate": 1.27e-05,
+      "loss": 1.0867,
+      "step": 1120
+    },
+    {
+      "epoch": 2.26,
+      "grad_norm": 0.219766303896904,
+      "learning_rate": 1.2366666666666666e-05,
+      "loss": 1.2608,
+      "step": 1130
+    },
+    {
+      "epoch": 2.2800000000000002,
+      "grad_norm": 0.2709806561470032,
+      "learning_rate": 1.2033333333333334e-05,
+      "loss": 1.4731,
+      "step": 1140
+    },
+    {
+      "epoch": 2.3,
+      "grad_norm": 0.5920602679252625,
+      "learning_rate": 1.1700000000000001e-05,
+      "loss": 1.5593,
+      "step": 1150
+    },
+    {
+      "epoch": 2.32,
+      "grad_norm": 0.3652634620666504,
+      "learning_rate": 1.1366666666666667e-05,
+      "loss": 1.3867,
+      "step": 1160
+    },
+    {
+      "epoch": 2.34,
+      "grad_norm": 0.29644161462783813,
+      "learning_rate": 1.1033333333333335e-05,
+      "loss": 1.6093,
+      "step": 1170
+    },
+    {
+      "epoch": 2.36,
+      "grad_norm": 0.23178377747535706,
+      "learning_rate": 1.0700000000000001e-05,
+      "loss": 1.5047,
+      "step": 1180
+    },
+    {
+      "epoch": 2.38,
+      "grad_norm": 0.571533739566803,
+      "learning_rate": 1.0366666666666667e-05,
+      "loss": 1.5696,
+      "step": 1190
+    },
+    {
+      "epoch": 2.4,
+      "grad_norm": 0.4471207857131958,
+      "learning_rate": 1.0033333333333333e-05,
+      "loss": 1.4618,
+      "step": 1200
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1309135212875520.0,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-1200/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bb8f7cf8d1e00c5133d5f1e9ff7888824b7c793be6dd6747a41226c47d016d32
+size 5304

checkpoint-1300/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: HuggingFaceTB/SmolLM2-135M-Instruct
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.14.0

checkpoint-1300/adapter_config.json ADDED Viewed

	@@ -0,0 +1,35 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": {
+    "base_model_class": "LlamaForCausalLM",
+    "parent_library": "transformers.models.llama.modeling_llama"
+  },
+  "base_model_name_or_path": "HuggingFaceTB/SmolLM2-135M-Instruct",
+  "bias": "none",
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj"
+  ],
+  "task_type": null,
+  "use_dora": false,
+  "use_rslora": false
+}

checkpoint-1300/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bb3e8cdaa930b7c2eca15d3095c630a1c4e3effb28ef7fcc8b77c191ed881466
+size 1858776

checkpoint-1300/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:07122209c567e872a53870f85cea0de4a5fc99235b284db6dab6beb82adba4af
+size 3787258

checkpoint-1300/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:66b4911e0df8fb18bad039615da17421c8ecf279f4d20538a007939be8f38431
+size 14244

checkpoint-1300/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a1193fc73299a47dcee7e1ac822c76d88af1f8de0bb6d3e8686f1a97b4729548
+size 1064

checkpoint-1300/trainer_state.json ADDED Viewed

	@@ -0,0 +1,943 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.6,
+  "eval_steps": 500,
+  "global_step": 1300,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.28473100066185,
+      "learning_rate": 4.966666666666667e-05,
+      "loss": 1.3783,
+      "step": 10
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.09849760681390762,
+      "learning_rate": 4.933333333333334e-05,
+      "loss": 1.319,
+      "step": 20
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.12863746285438538,
+      "learning_rate": 4.9e-05,
+      "loss": 1.4715,
+      "step": 30
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.16797196865081787,
+      "learning_rate": 4.866666666666667e-05,
+      "loss": 1.4091,
+      "step": 40
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.2071741819381714,
+      "learning_rate": 4.8333333333333334e-05,
+      "loss": 1.5629,
+      "step": 50
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.11463215947151184,
+      "learning_rate": 4.8e-05,
+      "loss": 1.4113,
+      "step": 60
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.4405534565448761,
+      "learning_rate": 4.766666666666667e-05,
+      "loss": 1.5984,
+      "step": 70
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.12359176576137543,
+      "learning_rate": 4.7333333333333336e-05,
+      "loss": 1.6352,
+      "step": 80
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.4452320635318756,
+      "learning_rate": 4.7e-05,
+      "loss": 1.4261,
+      "step": 90
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.15276187658309937,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 1.575,
+      "step": 100
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.24705903232097626,
+      "learning_rate": 4.633333333333333e-05,
+      "loss": 1.3795,
+      "step": 110
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.1368730366230011,
+      "learning_rate": 4.600000000000001e-05,
+      "loss": 1.4378,
+      "step": 120
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.40468767285346985,
+      "learning_rate": 4.566666666666667e-05,
+      "loss": 1.4759,
+      "step": 130
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.18973663449287415,
+      "learning_rate": 4.5333333333333335e-05,
+      "loss": 1.7495,
+      "step": 140
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.19974994659423828,
+      "learning_rate": 4.5e-05,
+      "loss": 1.4329,
+      "step": 150
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.1658446341753006,
+      "learning_rate": 4.466666666666667e-05,
+      "loss": 1.4386,
+      "step": 160
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.20768527686595917,
+      "learning_rate": 4.433333333333334e-05,
+      "loss": 1.5677,
+      "step": 170
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.12603804469108582,
+      "learning_rate": 4.4000000000000006e-05,
+      "loss": 1.3503,
+      "step": 180
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.11383192241191864,
+      "learning_rate": 4.3666666666666666e-05,
+      "loss": 1.536,
+      "step": 190
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.1635710746049881,
+      "learning_rate": 4.3333333333333334e-05,
+      "loss": 1.2382,
+      "step": 200
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.1817488968372345,
+      "learning_rate": 4.3e-05,
+      "loss": 1.442,
+      "step": 210
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.22358669340610504,
+      "learning_rate": 4.266666666666667e-05,
+      "loss": 1.6387,
+      "step": 220
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.242269828915596,
+      "learning_rate": 4.233333333333334e-05,
+      "loss": 1.289,
+      "step": 230
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.09938176721334457,
+      "learning_rate": 4.2e-05,
+      "loss": 1.6958,
+      "step": 240
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.24452297389507294,
+      "learning_rate": 4.166666666666667e-05,
+      "loss": 1.7457,
+      "step": 250
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.6737746596336365,
+      "learning_rate": 4.133333333333333e-05,
+      "loss": 1.4782,
+      "step": 260
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.3342950642108917,
+      "learning_rate": 4.1e-05,
+      "loss": 1.5557,
+      "step": 270
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.2859005928039551,
+      "learning_rate": 4.066666666666667e-05,
+      "loss": 1.6331,
+      "step": 280
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.196051225066185,
+      "learning_rate": 4.0333333333333336e-05,
+      "loss": 1.1884,
+      "step": 290
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.13173438608646393,
+      "learning_rate": 4e-05,
+      "loss": 1.5273,
+      "step": 300
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.15074381232261658,
+      "learning_rate": 3.966666666666667e-05,
+      "loss": 1.2017,
+      "step": 310
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.43569642305374146,
+      "learning_rate": 3.933333333333333e-05,
+      "loss": 1.4196,
+      "step": 320
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.13135027885437012,
+      "learning_rate": 3.9000000000000006e-05,
+      "loss": 1.2985,
+      "step": 330
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.683555543422699,
+      "learning_rate": 3.866666666666667e-05,
+      "loss": 2.0534,
+      "step": 340
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.18177451193332672,
+      "learning_rate": 3.8333333333333334e-05,
+      "loss": 1.4249,
+      "step": 350
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.8321959376335144,
+      "learning_rate": 3.8e-05,
+      "loss": 1.3113,
+      "step": 360
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.2439054697751999,
+      "learning_rate": 3.766666666666667e-05,
+      "loss": 1.569,
+      "step": 370
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.3370145261287689,
+      "learning_rate": 3.733333333333334e-05,
+      "loss": 1.6309,
+      "step": 380
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.17391540110111237,
+      "learning_rate": 3.7e-05,
+      "loss": 1.2326,
+      "step": 390
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.31036776304244995,
+      "learning_rate": 3.6666666666666666e-05,
+      "loss": 1.267,
+      "step": 400
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.16713590919971466,
+      "learning_rate": 3.633333333333333e-05,
+      "loss": 1.2097,
+      "step": 410
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.6574969291687012,
+      "learning_rate": 3.6e-05,
+      "loss": 1.7074,
+      "step": 420
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.21429121494293213,
+      "learning_rate": 3.566666666666667e-05,
+      "loss": 1.4363,
+      "step": 430
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.35936588048934937,
+      "learning_rate": 3.5333333333333336e-05,
+      "loss": 1.2745,
+      "step": 440
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.46381500363349915,
+      "learning_rate": 3.5e-05,
+      "loss": 1.3376,
+      "step": 450
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.1676020473241806,
+      "learning_rate": 3.466666666666667e-05,
+      "loss": 1.4441,
+      "step": 460
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.17079180479049683,
+      "learning_rate": 3.433333333333333e-05,
+      "loss": 1.4486,
+      "step": 470
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.35042911767959595,
+      "learning_rate": 3.4000000000000007e-05,
+      "loss": 1.6119,
+      "step": 480
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.13189171254634857,
+      "learning_rate": 3.366666666666667e-05,
+      "loss": 1.3443,
+      "step": 490
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.48539966344833374,
+      "learning_rate": 3.3333333333333335e-05,
+      "loss": 1.3611,
+      "step": 500
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 0.28593960404396057,
+      "learning_rate": 3.3e-05,
+      "loss": 1.6098,
+      "step": 510
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 0.16387881338596344,
+      "learning_rate": 3.266666666666667e-05,
+      "loss": 1.3215,
+      "step": 520
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 0.17869319021701813,
+      "learning_rate": 3.233333333333333e-05,
+      "loss": 1.6028,
+      "step": 530
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 0.26113930344581604,
+      "learning_rate": 3.2000000000000005e-05,
+      "loss": 1.4372,
+      "step": 540
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 0.16049513220787048,
+      "learning_rate": 3.1666666666666666e-05,
+      "loss": 1.3916,
+      "step": 550
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.14353357255458832,
+      "learning_rate": 3.1333333333333334e-05,
+      "loss": 1.3971,
+      "step": 560
+    },
+    {
+      "epoch": 1.1400000000000001,
+      "grad_norm": 0.348002552986145,
+      "learning_rate": 3.1e-05,
+      "loss": 1.5419,
+      "step": 570
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 0.4313276410102844,
+      "learning_rate": 3.066666666666667e-05,
+      "loss": 1.4511,
+      "step": 580
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 0.20719008147716522,
+      "learning_rate": 3.0333333333333337e-05,
+      "loss": 1.3274,
+      "step": 590
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.21788401901721954,
+      "learning_rate": 3e-05,
+      "loss": 1.2425,
+      "step": 600
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 0.4731273949146271,
+      "learning_rate": 2.9666666666666672e-05,
+      "loss": 1.4746,
+      "step": 610
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 0.2933167517185211,
+      "learning_rate": 2.9333333333333336e-05,
+      "loss": 1.475,
+      "step": 620
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 0.22523891925811768,
+      "learning_rate": 2.9e-05,
+      "loss": 1.3062,
+      "step": 630
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 0.20424093306064606,
+      "learning_rate": 2.8666666666666668e-05,
+      "loss": 1.4697,
+      "step": 640
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 0.5376447439193726,
+      "learning_rate": 2.8333333333333335e-05,
+      "loss": 1.5247,
+      "step": 650
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 0.3490164875984192,
+      "learning_rate": 2.8000000000000003e-05,
+      "loss": 1.3842,
+      "step": 660
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 0.18214678764343262,
+      "learning_rate": 2.7666666666666667e-05,
+      "loss": 1.3442,
+      "step": 670
+    },
+    {
+      "epoch": 1.3599999999999999,
+      "grad_norm": 0.25049060583114624,
+      "learning_rate": 2.733333333333333e-05,
+      "loss": 1.6505,
+      "step": 680
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 0.6477521657943726,
+      "learning_rate": 2.7000000000000002e-05,
+      "loss": 1.5945,
+      "step": 690
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 0.17422887682914734,
+      "learning_rate": 2.6666666666666667e-05,
+      "loss": 1.0883,
+      "step": 700
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 0.2029237151145935,
+      "learning_rate": 2.633333333333333e-05,
+      "loss": 1.3454,
+      "step": 710
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 0.2694133222103119,
+      "learning_rate": 2.6000000000000002e-05,
+      "loss": 1.0081,
+      "step": 720
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 0.1709485799074173,
+      "learning_rate": 2.57e-05,
+      "loss": 1.5033,
+      "step": 730
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 0.23790444433689117,
+      "learning_rate": 2.5366666666666665e-05,
+      "loss": 1.3424,
+      "step": 740
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 0.16748583316802979,
+      "learning_rate": 2.5033333333333336e-05,
+      "loss": 1.2721,
+      "step": 750
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 0.7728779315948486,
+      "learning_rate": 2.47e-05,
+      "loss": 1.818,
+      "step": 760
+    },
+    {
+      "epoch": 1.54,
+      "grad_norm": 0.900970458984375,
+      "learning_rate": 2.4366666666666668e-05,
+      "loss": 1.3781,
+      "step": 770
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 0.21673746407032013,
+      "learning_rate": 2.4033333333333336e-05,
+      "loss": 1.3427,
+      "step": 780
+    },
+    {
+      "epoch": 1.58,
+      "grad_norm": 0.2009885162115097,
+      "learning_rate": 2.37e-05,
+      "loss": 1.265,
+      "step": 790
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 0.2574044466018677,
+      "learning_rate": 2.3366666666666668e-05,
+      "loss": 1.4462,
+      "step": 800
+    },
+    {
+      "epoch": 1.62,
+      "grad_norm": 0.27698543667793274,
+      "learning_rate": 2.3033333333333335e-05,
+      "loss": 1.3308,
+      "step": 810
+    },
+    {
+      "epoch": 1.6400000000000001,
+      "grad_norm": 0.8570265173912048,
+      "learning_rate": 2.2700000000000003e-05,
+      "loss": 1.2596,
+      "step": 820
+    },
+    {
+      "epoch": 1.6600000000000001,
+      "grad_norm": 0.28962960839271545,
+      "learning_rate": 2.236666666666667e-05,
+      "loss": 1.3297,
+      "step": 830
+    },
+    {
+      "epoch": 1.6800000000000002,
+      "grad_norm": 0.22726690769195557,
+      "learning_rate": 2.2033333333333335e-05,
+      "loss": 1.3379,
+      "step": 840
+    },
+    {
+      "epoch": 1.7,
+      "grad_norm": 0.2658108174800873,
+      "learning_rate": 2.1700000000000002e-05,
+      "loss": 1.3958,
+      "step": 850
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 0.5850313305854797,
+      "learning_rate": 2.1366666666666667e-05,
+      "loss": 1.6927,
+      "step": 860
+    },
+    {
+      "epoch": 1.74,
+      "grad_norm": 0.45299115777015686,
+      "learning_rate": 2.1033333333333334e-05,
+      "loss": 1.4111,
+      "step": 870
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 0.19067615270614624,
+      "learning_rate": 2.07e-05,
+      "loss": 1.5171,
+      "step": 880
+    },
+    {
+      "epoch": 1.78,
+      "grad_norm": 0.19123299419879913,
+      "learning_rate": 2.0366666666666666e-05,
+      "loss": 1.4562,
+      "step": 890
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 0.1748710572719574,
+      "learning_rate": 2.0033333333333334e-05,
+      "loss": 1.366,
+      "step": 900
+    },
+    {
+      "epoch": 1.8199999999999998,
+      "grad_norm": 0.27076292037963867,
+      "learning_rate": 1.97e-05,
+      "loss": 1.3784,
+      "step": 910
+    },
+    {
+      "epoch": 1.8399999999999999,
+      "grad_norm": 0.7414858937263489,
+      "learning_rate": 1.9366666666666665e-05,
+      "loss": 1.4304,
+      "step": 920
+    },
+    {
+      "epoch": 1.8599999999999999,
+      "grad_norm": 0.8297638893127441,
+      "learning_rate": 1.9033333333333333e-05,
+      "loss": 1.3251,
+      "step": 930
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 0.17869873344898224,
+      "learning_rate": 1.87e-05,
+      "loss": 1.3164,
+      "step": 940
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 0.37470296025276184,
+      "learning_rate": 1.8366666666666668e-05,
+      "loss": 1.4136,
+      "step": 950
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 0.345380038022995,
+      "learning_rate": 1.8033333333333336e-05,
+      "loss": 1.4664,
+      "step": 960
+    },
+    {
+      "epoch": 1.94,
+      "grad_norm": 0.22281818091869354,
+      "learning_rate": 1.77e-05,
+      "loss": 1.505,
+      "step": 970
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 0.14282165467739105,
+      "learning_rate": 1.7366666666666668e-05,
+      "loss": 1.6581,
+      "step": 980
+    },
+    {
+      "epoch": 1.98,
+      "grad_norm": 0.2700720429420471,
+      "learning_rate": 1.7033333333333335e-05,
+      "loss": 1.4055,
+      "step": 990
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.20198316872119904,
+      "learning_rate": 1.6700000000000003e-05,
+      "loss": 1.1787,
+      "step": 1000
+    },
+    {
+      "epoch": 2.02,
+      "grad_norm": 0.249458909034729,
+      "learning_rate": 1.6366666666666667e-05,
+      "loss": 1.292,
+      "step": 1010
+    },
+    {
+      "epoch": 2.04,
+      "grad_norm": 0.2311285138130188,
+      "learning_rate": 1.6033333333333335e-05,
+      "loss": 1.1209,
+      "step": 1020
+    },
+    {
+      "epoch": 2.06,
+      "grad_norm": 0.21848829090595245,
+      "learning_rate": 1.5700000000000002e-05,
+      "loss": 1.3446,
+      "step": 1030
+    },
+    {
+      "epoch": 2.08,
+      "grad_norm": 0.1550074964761734,
+      "learning_rate": 1.536666666666667e-05,
+      "loss": 1.4103,
+      "step": 1040
+    },
+    {
+      "epoch": 2.1,
+      "grad_norm": 0.5638441443443298,
+      "learning_rate": 1.5033333333333336e-05,
+      "loss": 1.5772,
+      "step": 1050
+    },
+    {
+      "epoch": 2.12,
+      "grad_norm": 0.3584989905357361,
+      "learning_rate": 1.47e-05,
+      "loss": 1.488,
+      "step": 1060
+    },
+    {
+      "epoch": 2.14,
+      "grad_norm": 0.2701733410358429,
+      "learning_rate": 1.4366666666666667e-05,
+      "loss": 1.4173,
+      "step": 1070
+    },
+    {
+      "epoch": 2.16,
+      "grad_norm": 0.1940249353647232,
+      "learning_rate": 1.4033333333333335e-05,
+      "loss": 1.209,
+      "step": 1080
+    },
+    {
+      "epoch": 2.18,
+      "grad_norm": 0.46034470200538635,
+      "learning_rate": 1.3700000000000001e-05,
+      "loss": 1.7403,
+      "step": 1090
+    },
+    {
+      "epoch": 2.2,
+      "grad_norm": 0.5914195775985718,
+      "learning_rate": 1.3366666666666667e-05,
+      "loss": 1.6092,
+      "step": 1100
+    },
+    {
+      "epoch": 2.22,
+      "grad_norm": 0.25916194915771484,
+      "learning_rate": 1.3033333333333333e-05,
+      "loss": 1.2053,
+      "step": 1110
+    },
+    {
+      "epoch": 2.24,
+      "grad_norm": 0.2523491084575653,
+      "learning_rate": 1.27e-05,
+      "loss": 1.0867,
+      "step": 1120
+    },
+    {
+      "epoch": 2.26,
+      "grad_norm": 0.219766303896904,
+      "learning_rate": 1.2366666666666666e-05,
+      "loss": 1.2608,
+      "step": 1130
+    },
+    {
+      "epoch": 2.2800000000000002,
+      "grad_norm": 0.2709806561470032,
+      "learning_rate": 1.2033333333333334e-05,
+      "loss": 1.4731,
+      "step": 1140
+    },
+    {
+      "epoch": 2.3,
+      "grad_norm": 0.5920602679252625,
+      "learning_rate": 1.1700000000000001e-05,
+      "loss": 1.5593,
+      "step": 1150
+    },
+    {
+      "epoch": 2.32,
+      "grad_norm": 0.3652634620666504,
+      "learning_rate": 1.1366666666666667e-05,
+      "loss": 1.3867,
+      "step": 1160
+    },
+    {
+      "epoch": 2.34,
+      "grad_norm": 0.29644161462783813,
+      "learning_rate": 1.1033333333333335e-05,
+      "loss": 1.6093,
+      "step": 1170
+    },
+    {
+      "epoch": 2.36,
+      "grad_norm": 0.23178377747535706,
+      "learning_rate": 1.0700000000000001e-05,
+      "loss": 1.5047,
+      "step": 1180
+    },
+    {
+      "epoch": 2.38,
+      "grad_norm": 0.571533739566803,
+      "learning_rate": 1.0366666666666667e-05,
+      "loss": 1.5696,
+      "step": 1190
+    },
+    {
+      "epoch": 2.4,
+      "grad_norm": 0.4471207857131958,
+      "learning_rate": 1.0033333333333333e-05,
+      "loss": 1.4618,
+      "step": 1200
+    },
+    {
+      "epoch": 2.42,
+      "grad_norm": 0.7827824354171753,
+      "learning_rate": 9.7e-06,
+      "loss": 1.4032,
+      "step": 1210
+    },
+    {
+      "epoch": 2.44,
+      "grad_norm": 0.24467045068740845,
+      "learning_rate": 9.366666666666666e-06,
+      "loss": 1.1998,
+      "step": 1220
+    },
+    {
+      "epoch": 2.46,
+      "grad_norm": 0.18102020025253296,
+      "learning_rate": 9.033333333333334e-06,
+      "loss": 1.1606,
+      "step": 1230
+    },
+    {
+      "epoch": 2.48,
+      "grad_norm": 0.9267832636833191,
+      "learning_rate": 8.733333333333333e-06,
+      "loss": 1.7444,
+      "step": 1240
+    },
+    {
+      "epoch": 2.5,
+      "grad_norm": 0.6565711498260498,
+      "learning_rate": 8.400000000000001e-06,
+      "loss": 1.2445,
+      "step": 1250
+    },
+    {
+      "epoch": 2.52,
+      "grad_norm": 0.6287052631378174,
+      "learning_rate": 8.066666666666667e-06,
+      "loss": 1.329,
+      "step": 1260
+    },
+    {
+      "epoch": 2.54,
+      "grad_norm": 0.24446581304073334,
+      "learning_rate": 7.733333333333334e-06,
+      "loss": 1.3563,
+      "step": 1270
+    },
+    {
+      "epoch": 2.56,
+      "grad_norm": 0.26736754179000854,
+      "learning_rate": 7.4e-06,
+      "loss": 1.5502,
+      "step": 1280
+    },
+    {
+      "epoch": 2.58,
+      "grad_norm": 0.36966007947921753,
+      "learning_rate": 7.066666666666667e-06,
+      "loss": 1.3426,
+      "step": 1290
+    },
+    {
+      "epoch": 2.6,
+      "grad_norm": 0.16758154332637787,
+      "learning_rate": 6.733333333333333e-06,
+      "loss": 1.4869,
+      "step": 1300
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1416003837590016.0,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-1300/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bb8f7cf8d1e00c5133d5f1e9ff7888824b7c793be6dd6747a41226c47d016d32
+size 5304

checkpoint-1400/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: HuggingFaceTB/SmolLM2-135M-Instruct
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+[More Information Needed]
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.14.0

checkpoint-1400/adapter_config.json ADDED Viewed

	@@ -0,0 +1,35 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": {
+    "base_model_class": "LlamaForCausalLM",
+    "parent_library": "transformers.models.llama.modeling_llama"
+  },
+  "base_model_name_or_path": "HuggingFaceTB/SmolLM2-135M-Instruct",
+  "bias": "none",
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj"
+  ],
+  "task_type": null,
+  "use_dora": false,
+  "use_rslora": false
+}

checkpoint-1400/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:28e7d895439768d6dca0bfbc24cf8abb805afefe6620c0fcc5818124b6422d89
+size 1858776

checkpoint-1400/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4dee45b1881846d2737491113535e1d218ad90e197662cad8eac9587f2357743
+size 3787258

checkpoint-1400/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6d182853cc2bc1b2b09d8407374427b2c057a479d38422e8373ffb16328b6a7f
+size 14244

checkpoint-1400/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8ae08d1367027f0374d995457918376df0903e9b0acf886cb9d092ed10feca26
+size 1064

checkpoint-1400/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1013 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.8,
+  "eval_steps": 500,
+  "global_step": 1400,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.28473100066185,
+      "learning_rate": 4.966666666666667e-05,
+      "loss": 1.3783,
+      "step": 10
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.09849760681390762,
+      "learning_rate": 4.933333333333334e-05,
+      "loss": 1.319,
+      "step": 20
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.12863746285438538,
+      "learning_rate": 4.9e-05,
+      "loss": 1.4715,
+      "step": 30
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.16797196865081787,
+      "learning_rate": 4.866666666666667e-05,
+      "loss": 1.4091,
+      "step": 40
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.2071741819381714,
+      "learning_rate": 4.8333333333333334e-05,
+      "loss": 1.5629,
+      "step": 50
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.11463215947151184,
+      "learning_rate": 4.8e-05,
+      "loss": 1.4113,
+      "step": 60
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.4405534565448761,
+      "learning_rate": 4.766666666666667e-05,
+      "loss": 1.5984,
+      "step": 70
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.12359176576137543,
+      "learning_rate": 4.7333333333333336e-05,
+      "loss": 1.6352,
+      "step": 80
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.4452320635318756,
+      "learning_rate": 4.7e-05,
+      "loss": 1.4261,
+      "step": 90
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.15276187658309937,
+      "learning_rate": 4.666666666666667e-05,
+      "loss": 1.575,
+      "step": 100
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.24705903232097626,
+      "learning_rate": 4.633333333333333e-05,
+      "loss": 1.3795,
+      "step": 110
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.1368730366230011,
+      "learning_rate": 4.600000000000001e-05,
+      "loss": 1.4378,
+      "step": 120
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.40468767285346985,
+      "learning_rate": 4.566666666666667e-05,
+      "loss": 1.4759,
+      "step": 130
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.18973663449287415,
+      "learning_rate": 4.5333333333333335e-05,
+      "loss": 1.7495,
+      "step": 140
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.19974994659423828,
+      "learning_rate": 4.5e-05,
+      "loss": 1.4329,
+      "step": 150
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.1658446341753006,
+      "learning_rate": 4.466666666666667e-05,
+      "loss": 1.4386,
+      "step": 160
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.20768527686595917,
+      "learning_rate": 4.433333333333334e-05,
+      "loss": 1.5677,
+      "step": 170
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.12603804469108582,
+      "learning_rate": 4.4000000000000006e-05,
+      "loss": 1.3503,
+      "step": 180
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.11383192241191864,
+      "learning_rate": 4.3666666666666666e-05,
+      "loss": 1.536,
+      "step": 190
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.1635710746049881,
+      "learning_rate": 4.3333333333333334e-05,
+      "loss": 1.2382,
+      "step": 200
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.1817488968372345,
+      "learning_rate": 4.3e-05,
+      "loss": 1.442,
+      "step": 210
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.22358669340610504,
+      "learning_rate": 4.266666666666667e-05,
+      "loss": 1.6387,
+      "step": 220
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.242269828915596,
+      "learning_rate": 4.233333333333334e-05,
+      "loss": 1.289,
+      "step": 230
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.09938176721334457,
+      "learning_rate": 4.2e-05,
+      "loss": 1.6958,
+      "step": 240
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.24452297389507294,
+      "learning_rate": 4.166666666666667e-05,
+      "loss": 1.7457,
+      "step": 250
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.6737746596336365,
+      "learning_rate": 4.133333333333333e-05,
+      "loss": 1.4782,
+      "step": 260
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.3342950642108917,
+      "learning_rate": 4.1e-05,
+      "loss": 1.5557,
+      "step": 270
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.2859005928039551,
+      "learning_rate": 4.066666666666667e-05,
+      "loss": 1.6331,
+      "step": 280
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.196051225066185,
+      "learning_rate": 4.0333333333333336e-05,
+      "loss": 1.1884,
+      "step": 290
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.13173438608646393,
+      "learning_rate": 4e-05,
+      "loss": 1.5273,
+      "step": 300
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.15074381232261658,
+      "learning_rate": 3.966666666666667e-05,
+      "loss": 1.2017,
+      "step": 310
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.43569642305374146,
+      "learning_rate": 3.933333333333333e-05,
+      "loss": 1.4196,
+      "step": 320
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.13135027885437012,
+      "learning_rate": 3.9000000000000006e-05,
+      "loss": 1.2985,
+      "step": 330
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.683555543422699,
+      "learning_rate": 3.866666666666667e-05,
+      "loss": 2.0534,
+      "step": 340
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.18177451193332672,
+      "learning_rate": 3.8333333333333334e-05,
+      "loss": 1.4249,
+      "step": 350
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.8321959376335144,
+      "learning_rate": 3.8e-05,
+      "loss": 1.3113,
+      "step": 360
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.2439054697751999,
+      "learning_rate": 3.766666666666667e-05,
+      "loss": 1.569,
+      "step": 370
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.3370145261287689,
+      "learning_rate": 3.733333333333334e-05,
+      "loss": 1.6309,
+      "step": 380
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.17391540110111237,
+      "learning_rate": 3.7e-05,
+      "loss": 1.2326,
+      "step": 390
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.31036776304244995,
+      "learning_rate": 3.6666666666666666e-05,
+      "loss": 1.267,
+      "step": 400
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.16713590919971466,
+      "learning_rate": 3.633333333333333e-05,
+      "loss": 1.2097,
+      "step": 410
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.6574969291687012,
+      "learning_rate": 3.6e-05,
+      "loss": 1.7074,
+      "step": 420
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.21429121494293213,
+      "learning_rate": 3.566666666666667e-05,
+      "loss": 1.4363,
+      "step": 430
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.35936588048934937,
+      "learning_rate": 3.5333333333333336e-05,
+      "loss": 1.2745,
+      "step": 440
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.46381500363349915,
+      "learning_rate": 3.5e-05,
+      "loss": 1.3376,
+      "step": 450
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.1676020473241806,
+      "learning_rate": 3.466666666666667e-05,
+      "loss": 1.4441,
+      "step": 460
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.17079180479049683,
+      "learning_rate": 3.433333333333333e-05,
+      "loss": 1.4486,
+      "step": 470
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.35042911767959595,
+      "learning_rate": 3.4000000000000007e-05,
+      "loss": 1.6119,
+      "step": 480
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.13189171254634857,
+      "learning_rate": 3.366666666666667e-05,
+      "loss": 1.3443,
+      "step": 490
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.48539966344833374,
+      "learning_rate": 3.3333333333333335e-05,
+      "loss": 1.3611,
+      "step": 500
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 0.28593960404396057,
+      "learning_rate": 3.3e-05,
+      "loss": 1.6098,
+      "step": 510
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 0.16387881338596344,
+      "learning_rate": 3.266666666666667e-05,
+      "loss": 1.3215,
+      "step": 520
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 0.17869319021701813,
+      "learning_rate": 3.233333333333333e-05,
+      "loss": 1.6028,
+      "step": 530
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 0.26113930344581604,
+      "learning_rate": 3.2000000000000005e-05,
+      "loss": 1.4372,
+      "step": 540
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 0.16049513220787048,
+      "learning_rate": 3.1666666666666666e-05,
+      "loss": 1.3916,
+      "step": 550
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.14353357255458832,
+      "learning_rate": 3.1333333333333334e-05,
+      "loss": 1.3971,
+      "step": 560
+    },
+    {
+      "epoch": 1.1400000000000001,
+      "grad_norm": 0.348002552986145,
+      "learning_rate": 3.1e-05,
+      "loss": 1.5419,
+      "step": 570
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 0.4313276410102844,
+      "learning_rate": 3.066666666666667e-05,
+      "loss": 1.4511,
+      "step": 580
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 0.20719008147716522,
+      "learning_rate": 3.0333333333333337e-05,
+      "loss": 1.3274,
+      "step": 590
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.21788401901721954,
+      "learning_rate": 3e-05,
+      "loss": 1.2425,
+      "step": 600
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 0.4731273949146271,
+      "learning_rate": 2.9666666666666672e-05,
+      "loss": 1.4746,
+      "step": 610
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 0.2933167517185211,
+      "learning_rate": 2.9333333333333336e-05,
+      "loss": 1.475,
+      "step": 620
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 0.22523891925811768,
+      "learning_rate": 2.9e-05,
+      "loss": 1.3062,
+      "step": 630
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 0.20424093306064606,
+      "learning_rate": 2.8666666666666668e-05,
+      "loss": 1.4697,
+      "step": 640
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 0.5376447439193726,
+      "learning_rate": 2.8333333333333335e-05,
+      "loss": 1.5247,
+      "step": 650
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 0.3490164875984192,
+      "learning_rate": 2.8000000000000003e-05,
+      "loss": 1.3842,
+      "step": 660
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 0.18214678764343262,
+      "learning_rate": 2.7666666666666667e-05,
+      "loss": 1.3442,
+      "step": 670
+    },
+    {
+      "epoch": 1.3599999999999999,
+      "grad_norm": 0.25049060583114624,
+      "learning_rate": 2.733333333333333e-05,
+      "loss": 1.6505,
+      "step": 680
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 0.6477521657943726,
+      "learning_rate": 2.7000000000000002e-05,
+      "loss": 1.5945,
+      "step": 690
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 0.17422887682914734,
+      "learning_rate": 2.6666666666666667e-05,
+      "loss": 1.0883,
+      "step": 700
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 0.2029237151145935,
+      "learning_rate": 2.633333333333333e-05,
+      "loss": 1.3454,
+      "step": 710
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 0.2694133222103119,
+      "learning_rate": 2.6000000000000002e-05,
+      "loss": 1.0081,
+      "step": 720
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 0.1709485799074173,
+      "learning_rate": 2.57e-05,
+      "loss": 1.5033,
+      "step": 730
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 0.23790444433689117,
+      "learning_rate": 2.5366666666666665e-05,
+      "loss": 1.3424,
+      "step": 740
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 0.16748583316802979,
+      "learning_rate": 2.5033333333333336e-05,
+      "loss": 1.2721,
+      "step": 750
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 0.7728779315948486,
+      "learning_rate": 2.47e-05,
+      "loss": 1.818,
+      "step": 760
+    },
+    {
+      "epoch": 1.54,
+      "grad_norm": 0.900970458984375,
+      "learning_rate": 2.4366666666666668e-05,
+      "loss": 1.3781,
+      "step": 770
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 0.21673746407032013,
+      "learning_rate": 2.4033333333333336e-05,
+      "loss": 1.3427,
+      "step": 780
+    },
+    {
+      "epoch": 1.58,
+      "grad_norm": 0.2009885162115097,
+      "learning_rate": 2.37e-05,
+      "loss": 1.265,
+      "step": 790
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 0.2574044466018677,
+      "learning_rate": 2.3366666666666668e-05,
+      "loss": 1.4462,
+      "step": 800
+    },
+    {
+      "epoch": 1.62,
+      "grad_norm": 0.27698543667793274,
+      "learning_rate": 2.3033333333333335e-05,
+      "loss": 1.3308,
+      "step": 810
+    },
+    {
+      "epoch": 1.6400000000000001,
+      "grad_norm": 0.8570265173912048,
+      "learning_rate": 2.2700000000000003e-05,
+      "loss": 1.2596,
+      "step": 820
+    },
+    {
+      "epoch": 1.6600000000000001,
+      "grad_norm": 0.28962960839271545,
+      "learning_rate": 2.236666666666667e-05,
+      "loss": 1.3297,
+      "step": 830
+    },
+    {
+      "epoch": 1.6800000000000002,
+      "grad_norm": 0.22726690769195557,
+      "learning_rate": 2.2033333333333335e-05,
+      "loss": 1.3379,
+      "step": 840
+    },
+    {
+      "epoch": 1.7,
+      "grad_norm": 0.2658108174800873,
+      "learning_rate": 2.1700000000000002e-05,
+      "loss": 1.3958,
+      "step": 850
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 0.5850313305854797,
+      "learning_rate": 2.1366666666666667e-05,
+      "loss": 1.6927,
+      "step": 860
+    },
+    {
+      "epoch": 1.74,
+      "grad_norm": 0.45299115777015686,
+      "learning_rate": 2.1033333333333334e-05,
+      "loss": 1.4111,
+      "step": 870
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 0.19067615270614624,
+      "learning_rate": 2.07e-05,
+      "loss": 1.5171,
+      "step": 880
+    },
+    {
+      "epoch": 1.78,
+      "grad_norm": 0.19123299419879913,
+      "learning_rate": 2.0366666666666666e-05,
+      "loss": 1.4562,
+      "step": 890
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 0.1748710572719574,
+      "learning_rate": 2.0033333333333334e-05,
+      "loss": 1.366,
+      "step": 900
+    },
+    {
+      "epoch": 1.8199999999999998,
+      "grad_norm": 0.27076292037963867,
+      "learning_rate": 1.97e-05,
+      "loss": 1.3784,
+      "step": 910
+    },
+    {
+      "epoch": 1.8399999999999999,
+      "grad_norm": 0.7414858937263489,
+      "learning_rate": 1.9366666666666665e-05,
+      "loss": 1.4304,
+      "step": 920
+    },
+    {
+      "epoch": 1.8599999999999999,
+      "grad_norm": 0.8297638893127441,
+      "learning_rate": 1.9033333333333333e-05,
+      "loss": 1.3251,
+      "step": 930
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 0.17869873344898224,
+      "learning_rate": 1.87e-05,
+      "loss": 1.3164,
+      "step": 940
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 0.37470296025276184,
+      "learning_rate": 1.8366666666666668e-05,
+      "loss": 1.4136,
+      "step": 950
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 0.345380038022995,
+      "learning_rate": 1.8033333333333336e-05,
+      "loss": 1.4664,
+      "step": 960
+    },
+    {
+      "epoch": 1.94,
+      "grad_norm": 0.22281818091869354,
+      "learning_rate": 1.77e-05,
+      "loss": 1.505,
+      "step": 970
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 0.14282165467739105,
+      "learning_rate": 1.7366666666666668e-05,
+      "loss": 1.6581,
+      "step": 980
+    },
+    {
+      "epoch": 1.98,
+      "grad_norm": 0.2700720429420471,
+      "learning_rate": 1.7033333333333335e-05,
+      "loss": 1.4055,
+      "step": 990
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.20198316872119904,
+      "learning_rate": 1.6700000000000003e-05,
+      "loss": 1.1787,
+      "step": 1000
+    },
+    {
+      "epoch": 2.02,
+      "grad_norm": 0.249458909034729,
+      "learning_rate": 1.6366666666666667e-05,
+      "loss": 1.292,
+      "step": 1010
+    },
+    {
+      "epoch": 2.04,
+      "grad_norm": 0.2311285138130188,
+      "learning_rate": 1.6033333333333335e-05,
+      "loss": 1.1209,
+      "step": 1020
+    },
+    {
+      "epoch": 2.06,
+      "grad_norm": 0.21848829090595245,
+      "learning_rate": 1.5700000000000002e-05,
+      "loss": 1.3446,
+      "step": 1030
+    },
+    {
+      "epoch": 2.08,
+      "grad_norm": 0.1550074964761734,
+      "learning_rate": 1.536666666666667e-05,
+      "loss": 1.4103,
+      "step": 1040
+    },
+    {
+      "epoch": 2.1,
+      "grad_norm": 0.5638441443443298,
+      "learning_rate": 1.5033333333333336e-05,
+      "loss": 1.5772,
+      "step": 1050
+    },
+    {
+      "epoch": 2.12,
+      "grad_norm": 0.3584989905357361,
+      "learning_rate": 1.47e-05,
+      "loss": 1.488,
+      "step": 1060
+    },
+    {
+      "epoch": 2.14,
+      "grad_norm": 0.2701733410358429,
+      "learning_rate": 1.4366666666666667e-05,
+      "loss": 1.4173,
+      "step": 1070
+    },
+    {
+      "epoch": 2.16,
+      "grad_norm": 0.1940249353647232,
+      "learning_rate": 1.4033333333333335e-05,
+      "loss": 1.209,
+      "step": 1080
+    },
+    {
+      "epoch": 2.18,
+      "grad_norm": 0.46034470200538635,
+      "learning_rate": 1.3700000000000001e-05,
+      "loss": 1.7403,
+      "step": 1090
+    },
+    {
+      "epoch": 2.2,
+      "grad_norm": 0.5914195775985718,
+      "learning_rate": 1.3366666666666667e-05,
+      "loss": 1.6092,
+      "step": 1100
+    },
+    {
+      "epoch": 2.22,
+      "grad_norm": 0.25916194915771484,
+      "learning_rate": 1.3033333333333333e-05,
+      "loss": 1.2053,
+      "step": 1110
+    },
+    {
+      "epoch": 2.24,
+      "grad_norm": 0.2523491084575653,
+      "learning_rate": 1.27e-05,
+      "loss": 1.0867,
+      "step": 1120
+    },
+    {
+      "epoch": 2.26,
+      "grad_norm": 0.219766303896904,
+      "learning_rate": 1.2366666666666666e-05,
+      "loss": 1.2608,
+      "step": 1130
+    },
+    {
+      "epoch": 2.2800000000000002,
+      "grad_norm": 0.2709806561470032,
+      "learning_rate": 1.2033333333333334e-05,
+      "loss": 1.4731,
+      "step": 1140
+    },
+    {
+      "epoch": 2.3,
+      "grad_norm": 0.5920602679252625,
+      "learning_rate": 1.1700000000000001e-05,
+      "loss": 1.5593,
+      "step": 1150
+    },
+    {
+      "epoch": 2.32,
+      "grad_norm": 0.3652634620666504,
+      "learning_rate": 1.1366666666666667e-05,
+      "loss": 1.3867,
+      "step": 1160
+    },
+    {
+      "epoch": 2.34,
+      "grad_norm": 0.29644161462783813,
+      "learning_rate": 1.1033333333333335e-05,
+      "loss": 1.6093,
+      "step": 1170
+    },
+    {
+      "epoch": 2.36,
+      "grad_norm": 0.23178377747535706,
+      "learning_rate": 1.0700000000000001e-05,
+      "loss": 1.5047,
+      "step": 1180
+    },
+    {
+      "epoch": 2.38,
+      "grad_norm": 0.571533739566803,
+      "learning_rate": 1.0366666666666667e-05,
+      "loss": 1.5696,
+      "step": 1190
+    },
+    {
+      "epoch": 2.4,
+      "grad_norm": 0.4471207857131958,
+      "learning_rate": 1.0033333333333333e-05,
+      "loss": 1.4618,
+      "step": 1200
+    },
+    {
+      "epoch": 2.42,
+      "grad_norm": 0.7827824354171753,
+      "learning_rate": 9.7e-06,
+      "loss": 1.4032,
+      "step": 1210
+    },
+    {
+      "epoch": 2.44,
+      "grad_norm": 0.24467045068740845,
+      "learning_rate": 9.366666666666666e-06,
+      "loss": 1.1998,
+      "step": 1220
+    },
+    {
+      "epoch": 2.46,
+      "grad_norm": 0.18102020025253296,
+      "learning_rate": 9.033333333333334e-06,
+      "loss": 1.1606,
+      "step": 1230
+    },
+    {
+      "epoch": 2.48,
+      "grad_norm": 0.9267832636833191,
+      "learning_rate": 8.733333333333333e-06,
+      "loss": 1.7444,
+      "step": 1240
+    },
+    {
+      "epoch": 2.5,
+      "grad_norm": 0.6565711498260498,
+      "learning_rate": 8.400000000000001e-06,
+      "loss": 1.2445,
+      "step": 1250
+    },
+    {
+      "epoch": 2.52,
+      "grad_norm": 0.6287052631378174,
+      "learning_rate": 8.066666666666667e-06,
+      "loss": 1.329,
+      "step": 1260
+    },
+    {
+      "epoch": 2.54,
+      "grad_norm": 0.24446581304073334,
+      "learning_rate": 7.733333333333334e-06,
+      "loss": 1.3563,
+      "step": 1270
+    },
+    {
+      "epoch": 2.56,
+      "grad_norm": 0.26736754179000854,
+      "learning_rate": 7.4e-06,
+      "loss": 1.5502,
+      "step": 1280
+    },
+    {
+      "epoch": 2.58,
+      "grad_norm": 0.36966007947921753,
+      "learning_rate": 7.066666666666667e-06,
+      "loss": 1.3426,
+      "step": 1290
+    },
+    {
+      "epoch": 2.6,
+      "grad_norm": 0.16758154332637787,
+      "learning_rate": 6.733333333333333e-06,
+      "loss": 1.4869,
+      "step": 1300
+    },
+    {
+      "epoch": 2.62,
+      "grad_norm": 0.4700581729412079,
+      "learning_rate": 6.4000000000000006e-06,
+      "loss": 1.6992,
+      "step": 1310
+    },
+    {
+      "epoch": 2.64,
+      "grad_norm": 0.533340334892273,
+      "learning_rate": 6.066666666666667e-06,
+      "loss": 1.6744,
+      "step": 1320
+    },
+    {
+      "epoch": 2.66,
+      "grad_norm": 0.1966978758573532,
+      "learning_rate": 5.733333333333333e-06,
+      "loss": 1.4327,
+      "step": 1330
+    },
+    {
+      "epoch": 2.68,
+      "grad_norm": 0.5327418446540833,
+      "learning_rate": 5.4e-06,
+      "loss": 1.2062,
+      "step": 1340
+    },
+    {
+      "epoch": 2.7,
+      "grad_norm": 0.5234915614128113,
+      "learning_rate": 5.066666666666667e-06,
+      "loss": 1.157,
+      "step": 1350
+    },
+    {
+      "epoch": 2.7199999999999998,
+      "grad_norm": 0.6626607179641724,
+      "learning_rate": 4.7333333333333335e-06,
+      "loss": 1.3966,
+      "step": 1360
+    },
+    {
+      "epoch": 2.74,
+      "grad_norm": 0.31440094113349915,
+      "learning_rate": 4.4e-06,
+      "loss": 1.1864,
+      "step": 1370
+    },
+    {
+      "epoch": 2.76,
+      "grad_norm": 0.21158050000667572,
+      "learning_rate": 4.066666666666666e-06,
+      "loss": 1.2655,
+      "step": 1380
+    },
+    {
+      "epoch": 2.7800000000000002,
+      "grad_norm": 0.20304274559020996,
+      "learning_rate": 3.7333333333333337e-06,
+      "loss": 1.3801,
+      "step": 1390
+    },
+    {
+      "epoch": 2.8,
+      "grad_norm": 0.4777492582798004,
+      "learning_rate": 3.4000000000000005e-06,
+      "loss": 1.5293,
+      "step": 1400
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1528382737769472.0,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}