Model save
Browse files- README.md +4 -6
- all_results.json +4 -9
- train_results.json +4 -4
- trainer_state.json +87 -87
    	
        README.md
    CHANGED
    
    | @@ -1,19 +1,17 @@ | |
| 1 | 
             
            ---
         | 
| 2 | 
             
            base_model: Qwen/Qwen2.5-0.5B-Instruct
         | 
| 3 | 
            -
            datasets: MelinaLaimon/stream-of-search
         | 
| 4 | 
             
            library_name: transformers
         | 
| 5 | 
            -
            model_name:  | 
| 6 | 
             
            tags:
         | 
| 7 | 
             
            - generated_from_trainer
         | 
| 8 | 
            -
            - alignment-handbook
         | 
| 9 | 
             
            - trl
         | 
| 10 | 
             
            - sft
         | 
| 11 | 
             
            licence: license
         | 
| 12 | 
             
            ---
         | 
| 13 |  | 
| 14 | 
            -
            # Model Card for  | 
| 15 |  | 
| 16 | 
            -
            This model is a fine-tuned version of [Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct) | 
| 17 | 
             
            It has been trained using [TRL](https://github.com/huggingface/trl).
         | 
| 18 |  | 
| 19 | 
             
            ## Quick start
         | 
| @@ -29,7 +27,7 @@ print(output["generated_text"]) | |
| 29 |  | 
| 30 | 
             
            ## Training procedure
         | 
| 31 |  | 
| 32 | 
            -
            [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/chloeli/huggingface/runs/ | 
| 33 |  | 
| 34 |  | 
| 35 | 
             
            This model was trained with SFT.
         | 
|  | |
| 1 | 
             
            ---
         | 
| 2 | 
             
            base_model: Qwen/Qwen2.5-0.5B-Instruct
         | 
|  | |
| 3 | 
             
            library_name: transformers
         | 
| 4 | 
            +
            model_name: qwen-2.5-0.5B-instruct-sft-lora-countdown-search-1k
         | 
| 5 | 
             
            tags:
         | 
| 6 | 
             
            - generated_from_trainer
         | 
|  | |
| 7 | 
             
            - trl
         | 
| 8 | 
             
            - sft
         | 
| 9 | 
             
            licence: license
         | 
| 10 | 
             
            ---
         | 
| 11 |  | 
| 12 | 
            +
            # Model Card for qwen-2.5-0.5B-instruct-sft-lora-countdown-search-1k
         | 
| 13 |  | 
| 14 | 
            +
            This model is a fine-tuned version of [Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct).
         | 
| 15 | 
             
            It has been trained using [TRL](https://github.com/huggingface/trl).
         | 
| 16 |  | 
| 17 | 
             
            ## Quick start
         | 
|  | |
| 27 |  | 
| 28 | 
             
            ## Training procedure
         | 
| 29 |  | 
| 30 | 
            +
            [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/chloeli/huggingface/runs/fi2orchz) 
         | 
| 31 |  | 
| 32 |  | 
| 33 | 
             
            This model was trained with SFT.
         | 
    	
        all_results.json
    CHANGED
    
    | @@ -1,13 +1,8 @@ | |
| 1 | 
             
            {
         | 
| 2 | 
            -
                "eval_loss": 0.042979925870895386,
         | 
| 3 | 
            -
                "eval_runtime": 84.0981,
         | 
| 4 | 
            -
                "eval_samples": 1000,
         | 
| 5 | 
            -
                "eval_samples_per_second": 11.891,
         | 
| 6 | 
            -
                "eval_steps_per_second": 1.486,
         | 
| 7 | 
             
                "total_flos": 4505964279496704.0,
         | 
| 8 | 
            -
                "train_loss": 0. | 
| 9 | 
            -
                "train_runtime":  | 
| 10 | 
             
                "train_samples": 1000,
         | 
| 11 | 
            -
                "train_samples_per_second": 2. | 
| 12 | 
            -
                "train_steps_per_second": 0. | 
| 13 | 
             
            }
         | 
|  | |
| 1 | 
             
            {
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
| 2 | 
             
                "total_flos": 4505964279496704.0,
         | 
| 3 | 
            +
                "train_loss": 0.1011103401184082,
         | 
| 4 | 
            +
                "train_runtime": 450.8333,
         | 
| 5 | 
             
                "train_samples": 1000,
         | 
| 6 | 
            +
                "train_samples_per_second": 2.218,
         | 
| 7 | 
            +
                "train_steps_per_second": 0.277
         | 
| 8 | 
             
            }
         | 
    	
        train_results.json
    CHANGED
    
    | @@ -1,8 +1,8 @@ | |
| 1 | 
             
            {
         | 
| 2 | 
             
                "total_flos": 4505964279496704.0,
         | 
| 3 | 
            -
                "train_loss": 0. | 
| 4 | 
            -
                "train_runtime":  | 
| 5 | 
             
                "train_samples": 1000,
         | 
| 6 | 
            -
                "train_samples_per_second": 2. | 
| 7 | 
            -
                "train_steps_per_second": 0. | 
| 8 | 
             
            }
         | 
|  | |
| 1 | 
             
            {
         | 
| 2 | 
             
                "total_flos": 4505964279496704.0,
         | 
| 3 | 
            +
                "train_loss": 0.1011103401184082,
         | 
| 4 | 
            +
                "train_runtime": 450.8333,
         | 
| 5 | 
             
                "train_samples": 1000,
         | 
| 6 | 
            +
                "train_samples_per_second": 2.218,
         | 
| 7 | 
            +
                "train_steps_per_second": 0.277
         | 
| 8 | 
             
            }
         | 
    	
        trainer_state.json
    CHANGED
    
    | @@ -10,229 +10,229 @@ | |
| 10 | 
             
              "log_history": [
         | 
| 11 | 
             
                {
         | 
| 12 | 
             
                  "epoch": 0.008,
         | 
| 13 | 
            -
                  "grad_norm": 1. | 
| 14 | 
             
                  "learning_rate": 1.5384615384615387e-05,
         | 
| 15 | 
            -
                  "loss": 0. | 
| 16 | 
            -
                  "mean_token_accuracy": 0. | 
| 17 | 
             
                  "step": 1
         | 
| 18 | 
             
                },
         | 
| 19 | 
             
                {
         | 
| 20 | 
             
                  "epoch": 0.04,
         | 
| 21 | 
            -
                  "grad_norm": 0. | 
| 22 | 
             
                  "learning_rate": 7.692307692307693e-05,
         | 
| 23 | 
            -
                  "loss": 0. | 
| 24 | 
            -
                  "mean_token_accuracy": 0. | 
| 25 | 
             
                  "step": 5
         | 
| 26 | 
             
                },
         | 
| 27 | 
             
                {
         | 
| 28 | 
             
                  "epoch": 0.08,
         | 
| 29 | 
            -
                  "grad_norm": 0. | 
| 30 | 
             
                  "learning_rate": 0.00015384615384615385,
         | 
| 31 | 
            -
                  "loss": 0. | 
| 32 | 
            -
                  "mean_token_accuracy": 0. | 
| 33 | 
             
                  "step": 10
         | 
| 34 | 
             
                },
         | 
| 35 | 
             
                {
         | 
| 36 | 
             
                  "epoch": 0.12,
         | 
| 37 | 
            -
                  "grad_norm": 0. | 
| 38 | 
             
                  "learning_rate": 0.00019984268150178167,
         | 
| 39 | 
            -
                  "loss": 0. | 
| 40 | 
            -
                  "mean_token_accuracy": 0. | 
| 41 | 
             
                  "step": 15
         | 
| 42 | 
             
                },
         | 
| 43 | 
             
                {
         | 
| 44 | 
             
                  "epoch": 0.16,
         | 
| 45 | 
            -
                  "grad_norm": 0. | 
| 46 | 
             
                  "learning_rate": 0.00019807852804032305,
         | 
| 47 | 
            -
                  "loss": 0. | 
| 48 | 
            -
                  "mean_token_accuracy": 0. | 
| 49 | 
             
                  "step": 20
         | 
| 50 | 
             
                },
         | 
| 51 | 
             
                {
         | 
| 52 | 
             
                  "epoch": 0.2,
         | 
| 53 | 
            -
                  "grad_norm": 0. | 
| 54 | 
             
                  "learning_rate": 0.00019438833303083678,
         | 
| 55 | 
            -
                  "loss": 0. | 
| 56 | 
            -
                  "mean_token_accuracy": 0. | 
| 57 | 
             
                  "step": 25
         | 
| 58 | 
             
                },
         | 
| 59 | 
             
                {
         | 
| 60 | 
             
                  "epoch": 0.24,
         | 
| 61 | 
            -
                  "grad_norm": 0. | 
| 62 | 
             
                  "learning_rate": 0.00018884456359788724,
         | 
| 63 | 
            -
                  "loss": 0. | 
| 64 | 
            -
                  "mean_token_accuracy": 0. | 
| 65 | 
             
                  "step": 30
         | 
| 66 | 
             
                },
         | 
| 67 | 
             
                {
         | 
| 68 | 
             
                  "epoch": 0.28,
         | 
| 69 | 
            -
                  "grad_norm": 0. | 
| 70 | 
             
                  "learning_rate": 0.00018155608689592604,
         | 
| 71 | 
            -
                  "loss": 0. | 
| 72 | 
            -
                  "mean_token_accuracy": 0. | 
| 73 | 
             
                  "step": 35
         | 
| 74 | 
             
                },
         | 
| 75 | 
             
                {
         | 
| 76 | 
             
                  "epoch": 0.32,
         | 
| 77 | 
            -
                  "grad_norm": 0. | 
| 78 | 
             
                  "learning_rate": 0.0001726660322034027,
         | 
| 79 | 
            -
                  "loss": 0. | 
| 80 | 
            -
                  "mean_token_accuracy": 0. | 
| 81 | 
             
                  "step": 40
         | 
| 82 | 
             
                },
         | 
| 83 | 
             
                {
         | 
| 84 | 
             
                  "epoch": 0.36,
         | 
| 85 | 
            -
                  "grad_norm": 0. | 
| 86 | 
             
                  "learning_rate": 0.00016234898018587337,
         | 
| 87 | 
            -
                  "loss": 0. | 
| 88 | 
            -
                  "mean_token_accuracy": 0. | 
| 89 | 
             
                  "step": 45
         | 
| 90 | 
             
                },
         | 
| 91 | 
             
                {
         | 
| 92 | 
             
                  "epoch": 0.4,
         | 
| 93 | 
            -
                  "grad_norm": 0. | 
| 94 | 
             
                  "learning_rate": 0.00015080753452465296,
         | 
| 95 | 
            -
                  "loss": 0. | 
| 96 | 
            -
                  "mean_token_accuracy": 0. | 
| 97 | 
             
                  "step": 50
         | 
| 98 | 
             
                },
         | 
| 99 | 
             
                {
         | 
| 100 | 
             
                  "epoch": 0.44,
         | 
| 101 | 
            -
                  "grad_norm": 0. | 
| 102 | 
             
                  "learning_rate": 0.000138268343236509,
         | 
| 103 | 
            -
                  "loss": 0. | 
| 104 | 
            -
                  "mean_token_accuracy": 0. | 
| 105 | 
             
                  "step": 55
         | 
| 106 | 
             
                },
         | 
| 107 | 
             
                {
         | 
| 108 | 
             
                  "epoch": 0.48,
         | 
| 109 | 
            -
                  "grad_norm": 0. | 
| 110 | 
             
                  "learning_rate": 0.0001249776478167227,
         | 
| 111 | 
            -
                  "loss": 0. | 
| 112 | 
            -
                  "mean_token_accuracy": 0. | 
| 113 | 
             
                  "step": 60
         | 
| 114 | 
             
                },
         | 
| 115 | 
             
                {
         | 
| 116 | 
             
                  "epoch": 0.52,
         | 
| 117 | 
            -
                  "grad_norm": 0. | 
| 118 | 
             
                  "learning_rate": 0.00011119644761033078,
         | 
| 119 | 
            -
                  "loss": 0. | 
| 120 | 
            -
                  "mean_token_accuracy": 0. | 
| 121 | 
             
                  "step": 65
         | 
| 122 | 
             
                },
         | 
| 123 | 
             
                {
         | 
| 124 | 
             
                  "epoch": 0.56,
         | 
| 125 | 
            -
                  "grad_norm": 0. | 
| 126 | 
             
                  "learning_rate": 9.719537437241312e-05,
         | 
| 127 | 
            -
                  "loss": 0. | 
| 128 | 
            -
                  "mean_token_accuracy": 0. | 
| 129 | 
             
                  "step": 70
         | 
| 130 | 
             
                },
         | 
| 131 | 
             
                {
         | 
| 132 | 
             
                  "epoch": 0.6,
         | 
| 133 | 
            -
                  "grad_norm": 0. | 
| 134 | 
             
                  "learning_rate": 8.324937766952638e-05,
         | 
| 135 | 
            -
                  "loss": 0. | 
| 136 | 
            -
                  "mean_token_accuracy": 0. | 
| 137 | 
             
                  "step": 75
         | 
| 138 | 
             
                },
         | 
| 139 | 
             
                {
         | 
| 140 | 
             
                  "epoch": 0.64,
         | 
| 141 | 
            -
                  "grad_norm": 0. | 
| 142 | 
             
                  "learning_rate": 6.963232548903853e-05,
         | 
| 143 | 
            -
                  "loss": 0. | 
| 144 | 
            -
                  "mean_token_accuracy": 0. | 
| 145 | 
             
                  "step": 80
         | 
| 146 | 
             
                },
         | 
| 147 | 
             
                {
         | 
| 148 | 
             
                  "epoch": 0.68,
         | 
| 149 | 
            -
                  "grad_norm": 0. | 
| 150 | 
             
                  "learning_rate": 5.6611626088244194e-05,
         | 
| 151 | 
            -
                  "loss": 0. | 
| 152 | 
            -
                  "mean_token_accuracy": 0. | 
| 153 | 
             
                  "step": 85
         | 
| 154 | 
             
                },
         | 
| 155 | 
             
                {
         | 
| 156 | 
             
                  "epoch": 0.72,
         | 
| 157 | 
            -
                  "grad_norm": 0. | 
| 158 | 
             
                  "learning_rate": 4.444297669803981e-05,
         | 
| 159 | 
            -
                  "loss": 0. | 
| 160 | 
            -
                  "mean_token_accuracy": 0. | 
| 161 | 
             
                  "step": 90
         | 
| 162 | 
             
                },
         | 
| 163 | 
             
                {
         | 
| 164 | 
             
                  "epoch": 0.76,
         | 
| 165 | 
            -
                  "grad_norm": 0. | 
| 166 | 
             
                  "learning_rate": 3.336534220479961e-05,
         | 
| 167 | 
            -
                  "loss": 0. | 
| 168 | 
            -
                  "mean_token_accuracy": 0. | 
| 169 | 
             
                  "step": 95
         | 
| 170 | 
             
                },
         | 
| 171 | 
             
                {
         | 
| 172 | 
             
                  "epoch": 0.8,
         | 
| 173 | 
            -
                  "grad_norm": 0. | 
| 174 | 
             
                  "learning_rate": 2.3596262417839255e-05,
         | 
| 175 | 
            -
                  "loss": 0. | 
| 176 | 
            -
                  "mean_token_accuracy": 0. | 
| 177 | 
             
                  "step": 100
         | 
| 178 | 
             
                },
         | 
| 179 | 
             
                {
         | 
| 180 | 
             
                  "epoch": 0.84,
         | 
| 181 | 
            -
                  "grad_norm": 0. | 
| 182 | 
             
                  "learning_rate": 1.5327580077171587e-05,
         | 
| 183 | 
            -
                  "loss": 0. | 
| 184 | 
            -
                  "mean_token_accuracy": 0. | 
| 185 | 
             
                  "step": 105
         | 
| 186 | 
             
                },
         | 
| 187 | 
             
                {
         | 
| 188 | 
             
                  "epoch": 0.88,
         | 
| 189 | 
            -
                  "grad_norm": 0. | 
| 190 | 
             
                  "learning_rate": 8.72167349386811e-06,
         | 
| 191 | 
            -
                  "loss": 0. | 
| 192 | 
            -
                  "mean_token_accuracy": 0. | 
| 193 | 
             
                  "step": 110
         | 
| 194 | 
             
                },
         | 
| 195 | 
             
                {
         | 
| 196 | 
             
                  "epoch": 0.92,
         | 
| 197 | 
            -
                  "grad_norm": 0. | 
| 198 | 
             
                  "learning_rate": 3.908267805490051e-06,
         | 
| 199 | 
            -
                  "loss": 0. | 
| 200 | 
            -
                  "mean_token_accuracy": 0. | 
| 201 | 
             
                  "step": 115
         | 
| 202 | 
             
                },
         | 
| 203 | 
             
                {
         | 
| 204 | 
             
                  "epoch": 0.96,
         | 
| 205 | 
            -
                  "grad_norm": 0. | 
| 206 | 
             
                  "learning_rate": 9.818874663554357e-07,
         | 
| 207 | 
            -
                  "loss": 0. | 
| 208 | 
            -
                  "mean_token_accuracy": 0. | 
| 209 | 
             
                  "step": 120
         | 
| 210 | 
             
                },
         | 
| 211 | 
             
                {
         | 
| 212 | 
             
                  "epoch": 1.0,
         | 
| 213 | 
            -
                  "grad_norm": 0. | 
| 214 | 
             
                  "learning_rate": 0.0,
         | 
| 215 | 
            -
                  "loss": 0. | 
| 216 | 
            -
                  "mean_token_accuracy": 0. | 
| 217 | 
             
                  "step": 125
         | 
| 218 | 
             
                },
         | 
| 219 | 
             
                {
         | 
| 220 | 
             
                  "epoch": 1.0,
         | 
| 221 | 
            -
                  "eval_loss": 0. | 
| 222 | 
            -
                  "eval_mean_token_accuracy": 0. | 
| 223 | 
            -
                  "eval_runtime":  | 
| 224 | 
            -
                  "eval_samples_per_second": 11. | 
| 225 | 
            -
                  "eval_steps_per_second": 1. | 
| 226 | 
             
                  "step": 125
         | 
| 227 | 
             
                },
         | 
| 228 | 
             
                {
         | 
| 229 | 
             
                  "epoch": 1.0,
         | 
| 230 | 
             
                  "step": 125,
         | 
| 231 | 
             
                  "total_flos": 4505964279496704.0,
         | 
| 232 | 
            -
                  "train_loss": 0. | 
| 233 | 
            -
                  "train_runtime":  | 
| 234 | 
            -
                  "train_samples_per_second": 2. | 
| 235 | 
            -
                  "train_steps_per_second": 0. | 
| 236 | 
             
                }
         | 
| 237 | 
             
              ],
         | 
| 238 | 
             
              "logging_steps": 5,
         | 
|  | |
| 10 | 
             
              "log_history": [
         | 
| 11 | 
             
                {
         | 
| 12 | 
             
                  "epoch": 0.008,
         | 
| 13 | 
            +
                  "grad_norm": 1.0864604711532593,
         | 
| 14 | 
             
                  "learning_rate": 1.5384615384615387e-05,
         | 
| 15 | 
            +
                  "loss": 0.4927,
         | 
| 16 | 
            +
                  "mean_token_accuracy": 0.8949072062969208,
         | 
| 17 | 
             
                  "step": 1
         | 
| 18 | 
             
                },
         | 
| 19 | 
             
                {
         | 
| 20 | 
             
                  "epoch": 0.04,
         | 
| 21 | 
            +
                  "grad_norm": 0.8317855596542358,
         | 
| 22 | 
             
                  "learning_rate": 7.692307692307693e-05,
         | 
| 23 | 
            +
                  "loss": 0.5321,
         | 
| 24 | 
            +
                  "mean_token_accuracy": 0.8870031237602234,
         | 
| 25 | 
             
                  "step": 5
         | 
| 26 | 
             
                },
         | 
| 27 | 
             
                {
         | 
| 28 | 
             
                  "epoch": 0.08,
         | 
| 29 | 
            +
                  "grad_norm": 0.35360249876976013,
         | 
| 30 | 
             
                  "learning_rate": 0.00015384615384615385,
         | 
| 31 | 
            +
                  "loss": 0.4318,
         | 
| 32 | 
            +
                  "mean_token_accuracy": 0.8946158409118652,
         | 
| 33 | 
             
                  "step": 10
         | 
| 34 | 
             
                },
         | 
| 35 | 
             
                {
         | 
| 36 | 
             
                  "epoch": 0.12,
         | 
| 37 | 
            +
                  "grad_norm": 0.26475989818573,
         | 
| 38 | 
             
                  "learning_rate": 0.00019984268150178167,
         | 
| 39 | 
            +
                  "loss": 0.3243,
         | 
| 40 | 
            +
                  "mean_token_accuracy": 0.9152041494846344,
         | 
| 41 | 
             
                  "step": 15
         | 
| 42 | 
             
                },
         | 
| 43 | 
             
                {
         | 
| 44 | 
             
                  "epoch": 0.16,
         | 
| 45 | 
            +
                  "grad_norm": 0.21459443867206573,
         | 
| 46 | 
             
                  "learning_rate": 0.00019807852804032305,
         | 
| 47 | 
            +
                  "loss": 0.2103,
         | 
| 48 | 
            +
                  "mean_token_accuracy": 0.941685950756073,
         | 
| 49 | 
             
                  "step": 20
         | 
| 50 | 
             
                },
         | 
| 51 | 
             
                {
         | 
| 52 | 
             
                  "epoch": 0.2,
         | 
| 53 | 
            +
                  "grad_norm": 0.15868614614009857,
         | 
| 54 | 
             
                  "learning_rate": 0.00019438833303083678,
         | 
| 55 | 
            +
                  "loss": 0.1155,
         | 
| 56 | 
            +
                  "mean_token_accuracy": 0.9637313485145569,
         | 
| 57 | 
             
                  "step": 25
         | 
| 58 | 
             
                },
         | 
| 59 | 
             
                {
         | 
| 60 | 
             
                  "epoch": 0.24,
         | 
| 61 | 
            +
                  "grad_norm": 0.15498687326908112,
         | 
| 62 | 
             
                  "learning_rate": 0.00018884456359788724,
         | 
| 63 | 
            +
                  "loss": 0.0816,
         | 
| 64 | 
            +
                  "mean_token_accuracy": 0.9711011052131653,
         | 
| 65 | 
             
                  "step": 30
         | 
| 66 | 
             
                },
         | 
| 67 | 
             
                {
         | 
| 68 | 
             
                  "epoch": 0.28,
         | 
| 69 | 
            +
                  "grad_norm": 0.11740818619728088,
         | 
| 70 | 
             
                  "learning_rate": 0.00018155608689592604,
         | 
| 71 | 
            +
                  "loss": 0.0622,
         | 
| 72 | 
            +
                  "mean_token_accuracy": 0.9760455787181854,
         | 
| 73 | 
             
                  "step": 35
         | 
| 74 | 
             
                },
         | 
| 75 | 
             
                {
         | 
| 76 | 
             
                  "epoch": 0.32,
         | 
| 77 | 
            +
                  "grad_norm": 0.10971173644065857,
         | 
| 78 | 
             
                  "learning_rate": 0.0001726660322034027,
         | 
| 79 | 
            +
                  "loss": 0.0545,
         | 
| 80 | 
            +
                  "mean_token_accuracy": 0.9776363372802734,
         | 
| 81 | 
             
                  "step": 40
         | 
| 82 | 
             
                },
         | 
| 83 | 
             
                {
         | 
| 84 | 
             
                  "epoch": 0.36,
         | 
| 85 | 
            +
                  "grad_norm": 0.08847042918205261,
         | 
| 86 | 
             
                  "learning_rate": 0.00016234898018587337,
         | 
| 87 | 
            +
                  "loss": 0.0519,
         | 
| 88 | 
            +
                  "mean_token_accuracy": 0.977908480167389,
         | 
| 89 | 
             
                  "step": 45
         | 
| 90 | 
             
                },
         | 
| 91 | 
             
                {
         | 
| 92 | 
             
                  "epoch": 0.4,
         | 
| 93 | 
            +
                  "grad_norm": 0.09057971835136414,
         | 
| 94 | 
             
                  "learning_rate": 0.00015080753452465296,
         | 
| 95 | 
            +
                  "loss": 0.0497,
         | 
| 96 | 
            +
                  "mean_token_accuracy": 0.9789802670478821,
         | 
| 97 | 
             
                  "step": 50
         | 
| 98 | 
             
                },
         | 
| 99 | 
             
                {
         | 
| 100 | 
             
                  "epoch": 0.44,
         | 
| 101 | 
            +
                  "grad_norm": 0.08262317627668381,
         | 
| 102 | 
             
                  "learning_rate": 0.000138268343236509,
         | 
| 103 | 
            +
                  "loss": 0.0434,
         | 
| 104 | 
            +
                  "mean_token_accuracy": 0.9816033959388732,
         | 
| 105 | 
             
                  "step": 55
         | 
| 106 | 
             
                },
         | 
| 107 | 
             
                {
         | 
| 108 | 
             
                  "epoch": 0.48,
         | 
| 109 | 
            +
                  "grad_norm": 0.08489084988832474,
         | 
| 110 | 
             
                  "learning_rate": 0.0001249776478167227,
         | 
| 111 | 
            +
                  "loss": 0.043,
         | 
| 112 | 
            +
                  "mean_token_accuracy": 0.9817807137966156,
         | 
| 113 | 
             
                  "step": 60
         | 
| 114 | 
             
                },
         | 
| 115 | 
             
                {
         | 
| 116 | 
             
                  "epoch": 0.52,
         | 
| 117 | 
            +
                  "grad_norm": 0.07146206498146057,
         | 
| 118 | 
             
                  "learning_rate": 0.00011119644761033078,
         | 
| 119 | 
            +
                  "loss": 0.0397,
         | 
| 120 | 
            +
                  "mean_token_accuracy": 0.9835689246654511,
         | 
| 121 | 
             
                  "step": 65
         | 
| 122 | 
             
                },
         | 
| 123 | 
             
                {
         | 
| 124 | 
             
                  "epoch": 0.56,
         | 
| 125 | 
            +
                  "grad_norm": 0.08493078500032425,
         | 
| 126 | 
             
                  "learning_rate": 9.719537437241312e-05,
         | 
| 127 | 
            +
                  "loss": 0.0432,
         | 
| 128 | 
            +
                  "mean_token_accuracy": 0.981769073009491,
         | 
| 129 | 
             
                  "step": 70
         | 
| 130 | 
             
                },
         | 
| 131 | 
             
                {
         | 
| 132 | 
             
                  "epoch": 0.6,
         | 
| 133 | 
            +
                  "grad_norm": 0.06853792816400528,
         | 
| 134 | 
             
                  "learning_rate": 8.324937766952638e-05,
         | 
| 135 | 
            +
                  "loss": 0.0416,
         | 
| 136 | 
            +
                  "mean_token_accuracy": 0.9825034320354462,
         | 
| 137 | 
             
                  "step": 75
         | 
| 138 | 
             
                },
         | 
| 139 | 
             
                {
         | 
| 140 | 
             
                  "epoch": 0.64,
         | 
| 141 | 
            +
                  "grad_norm": 0.07850378751754761,
         | 
| 142 | 
             
                  "learning_rate": 6.963232548903853e-05,
         | 
| 143 | 
            +
                  "loss": 0.0416,
         | 
| 144 | 
            +
                  "mean_token_accuracy": 0.9826254367828369,
         | 
| 145 | 
             
                  "step": 80
         | 
| 146 | 
             
                },
         | 
| 147 | 
             
                {
         | 
| 148 | 
             
                  "epoch": 0.68,
         | 
| 149 | 
            +
                  "grad_norm": 0.10064064711332321,
         | 
| 150 | 
             
                  "learning_rate": 5.6611626088244194e-05,
         | 
| 151 | 
            +
                  "loss": 0.0411,
         | 
| 152 | 
            +
                  "mean_token_accuracy": 0.9827791035175324,
         | 
| 153 | 
             
                  "step": 85
         | 
| 154 | 
             
                },
         | 
| 155 | 
             
                {
         | 
| 156 | 
             
                  "epoch": 0.72,
         | 
| 157 | 
            +
                  "grad_norm": 0.07430274784564972,
         | 
| 158 | 
             
                  "learning_rate": 4.444297669803981e-05,
         | 
| 159 | 
            +
                  "loss": 0.0432,
         | 
| 160 | 
            +
                  "mean_token_accuracy": 0.9819111526012421,
         | 
| 161 | 
             
                  "step": 90
         | 
| 162 | 
             
                },
         | 
| 163 | 
             
                {
         | 
| 164 | 
             
                  "epoch": 0.76,
         | 
| 165 | 
            +
                  "grad_norm": 0.05630122497677803,
         | 
| 166 | 
             
                  "learning_rate": 3.336534220479961e-05,
         | 
| 167 | 
            +
                  "loss": 0.0381,
         | 
| 168 | 
            +
                  "mean_token_accuracy": 0.984023529291153,
         | 
| 169 | 
             
                  "step": 95
         | 
| 170 | 
             
                },
         | 
| 171 | 
             
                {
         | 
| 172 | 
             
                  "epoch": 0.8,
         | 
| 173 | 
            +
                  "grad_norm": 0.07422107458114624,
         | 
| 174 | 
             
                  "learning_rate": 2.3596262417839255e-05,
         | 
| 175 | 
            +
                  "loss": 0.041,
         | 
| 176 | 
            +
                  "mean_token_accuracy": 0.9828759372234345,
         | 
| 177 | 
             
                  "step": 100
         | 
| 178 | 
             
                },
         | 
| 179 | 
             
                {
         | 
| 180 | 
             
                  "epoch": 0.84,
         | 
| 181 | 
            +
                  "grad_norm": 0.06742699444293976,
         | 
| 182 | 
             
                  "learning_rate": 1.5327580077171587e-05,
         | 
| 183 | 
            +
                  "loss": 0.0435,
         | 
| 184 | 
            +
                  "mean_token_accuracy": 0.9813436925411224,
         | 
| 185 | 
             
                  "step": 105
         | 
| 186 | 
             
                },
         | 
| 187 | 
             
                {
         | 
| 188 | 
             
                  "epoch": 0.88,
         | 
| 189 | 
            +
                  "grad_norm": 0.07175164669752121,
         | 
| 190 | 
             
                  "learning_rate": 8.72167349386811e-06,
         | 
| 191 | 
            +
                  "loss": 0.0406,
         | 
| 192 | 
            +
                  "mean_token_accuracy": 0.9831586062908173,
         | 
| 193 | 
             
                  "step": 110
         | 
| 194 | 
             
                },
         | 
| 195 | 
             
                {
         | 
| 196 | 
             
                  "epoch": 0.92,
         | 
| 197 | 
            +
                  "grad_norm": 0.06535231322050095,
         | 
| 198 | 
             
                  "learning_rate": 3.908267805490051e-06,
         | 
| 199 | 
            +
                  "loss": 0.0411,
         | 
| 200 | 
            +
                  "mean_token_accuracy": 0.9826524317264557,
         | 
| 201 | 
             
                  "step": 115
         | 
| 202 | 
             
                },
         | 
| 203 | 
             
                {
         | 
| 204 | 
             
                  "epoch": 0.96,
         | 
| 205 | 
            +
                  "grad_norm": 0.07355163991451263,
         | 
| 206 | 
             
                  "learning_rate": 9.818874663554357e-07,
         | 
| 207 | 
            +
                  "loss": 0.0407,
         | 
| 208 | 
            +
                  "mean_token_accuracy": 0.9827761054039001,
         | 
| 209 | 
             
                  "step": 120
         | 
| 210 | 
             
                },
         | 
| 211 | 
             
                {
         | 
| 212 | 
             
                  "epoch": 1.0,
         | 
| 213 | 
            +
                  "grad_norm": 0.07261276245117188,
         | 
| 214 | 
             
                  "learning_rate": 0.0,
         | 
| 215 | 
            +
                  "loss": 0.04,
         | 
| 216 | 
            +
                  "mean_token_accuracy": 0.9830402076244354,
         | 
| 217 | 
             
                  "step": 125
         | 
| 218 | 
             
                },
         | 
| 219 | 
             
                {
         | 
| 220 | 
             
                  "epoch": 1.0,
         | 
| 221 | 
            +
                  "eval_loss": 0.0398690365254879,
         | 
| 222 | 
            +
                  "eval_mean_token_accuracy": 0.9831665050983429,
         | 
| 223 | 
            +
                  "eval_runtime": 167.2529,
         | 
| 224 | 
            +
                  "eval_samples_per_second": 11.958,
         | 
| 225 | 
            +
                  "eval_steps_per_second": 1.495,
         | 
| 226 | 
             
                  "step": 125
         | 
| 227 | 
             
                },
         | 
| 228 | 
             
                {
         | 
| 229 | 
             
                  "epoch": 1.0,
         | 
| 230 | 
             
                  "step": 125,
         | 
| 231 | 
             
                  "total_flos": 4505964279496704.0,
         | 
| 232 | 
            +
                  "train_loss": 0.1011103401184082,
         | 
| 233 | 
            +
                  "train_runtime": 450.8333,
         | 
| 234 | 
            +
                  "train_samples_per_second": 2.218,
         | 
| 235 | 
            +
                  "train_steps_per_second": 0.277
         | 
| 236 | 
             
                }
         | 
| 237 | 
             
              ],
         | 
| 238 | 
             
              "logging_steps": 5,
         |