souging committed on
Commit 5f8071d · verified · 1 Parent(s): 690ac02

Training in progress, epoch 2, checkpoint

last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:9730145d2e9a1b74d8107dab3e186e5c5dc5621b1d97cf6a4edc6d9b0ef0261b
+ oid sha256:c009ddc1d94f455e9195d7d4be1972903acc93c240854a54f2b7dca65b168152
  size 671149168
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:dcb54acfd27792170354a562e02e97c6bff5c7389ae5521101a073cc1718a8be
+ oid sha256:15011166716c02678454558a3c1c8063eab1a441164609250a4f7a413c1a9a0c
  size 341314644
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:cd5c5a7042ca11efb427ead8ba2dbc4f73c398056754e15d1b2f16c2fa47e857
+ oid sha256:1c9e823615e7791da434ea2bf3e0b21d7f2555e1b7211e05c760d0e064f52187
  size 15984
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:3c783f814e8f5dba82df3104c7a8bfcce2c0498f44858b8fe4a7f335a282b54f
+ oid sha256:c43446093e1628094db12fedaa5d20a058b0e404459bac127359c7aff7c16508
  size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:d9ecbda77e0b1544b8770b935616dcc3668a63ca2309a66bb26abd2bac10d18d
+ oid sha256:7c903c87f6cd97e09254fee79c1ec265e6625798f1d33889dd1dfc725ce5e997
  size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:a07ba7bc7222af908f414510bbff1c4bc53360767ee977a47a928ec6bbd939c3
+ oid sha256:cdebfbcdc58146d93665b48de36547123b030e8c298a5acfdb0be3eb482d6737
  size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:04de1d72f96661cbe84acc9ab7e840acfc58c9b32b1162803aaa7851a02fb75e
+ oid sha256:f54cdade05a89000248252a038952b1cdc8afd44da2194ff958ed676c185eec3
  size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:8222f06b98ce214cf357131289a86875c950e8de9792e62f8fface6f682e382f
+ oid sha256:90f5450644fcd4b776ed8591ec6cf0c4bcd4ecf66ea9ac014125200926a408e5
  size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:6dc1b0ee641c6f411ec29d9d4f3961edff193cb98b33f58d9eb196d55ff26baa
+ oid sha256:66ecc6eb41950527408a67f0b15598d30cacff86b3cf28d7afc6a699306482df
  size 15984
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:dbde01187e8a684883c59dab05ed31aba8f1d6e286bfcd82ad8ca281d2654993
+ oid sha256:5f8af120907e390b29e30bc78422d8123341c5043c7535f1b5bddd4248bf24bf
  size 15984
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:d6803654b121ed3c4b32b12ffbae5f8254f376b259c3e6ff7de4d767dd5529f1
+ oid sha256:b4ac31dcfba988823188c44099086577dd09e162577217892c3f4fd5ad8489c2
  size 1064
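
Each pointer file above records only an LFS oid (the SHA-256 of the stored object) and its byte size. A minimal sketch of how a locally pulled checkpoint file can be checked against its pointer; the path and expected values are copied from the adapter_model.safetensors hunk, and the file must already be materialized (e.g. via git lfs pull):

import hashlib
import os

# Values copied from the adapter_model.safetensors pointer in this commit.
path = "last-checkpoint/adapter_model.safetensors"
expected_oid = "c009ddc1d94f455e9195d7d4be1972903acc93c240854a54f2b7dca65b168152"
expected_size = 671149168

sha256 = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        sha256.update(chunk)

assert os.path.getsize(path) == expected_size, "size mismatch with LFS pointer"
assert sha256.hexdigest() == expected_oid, "sha256 mismatch with LFS pointer"
print("adapter_model.safetensors matches its LFS pointer")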
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
  {
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 1.9982285208148804,
+ "epoch": 2.4800708591674048,
  "eval_steps": 500,
- "global_step": 564,
+ "global_step": 700,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -3955,6 +3955,958 @@
3955
  "learning_rate": 2.4543499379172615e-05,
3956
  "loss": 0.0016,
3957
  "step": 564
3958
+ },
3959
+ {
3960
+ "epoch": 2.0017714791851198,
3961
+ "grad_norm": 0.08274471014738083,
3962
+ "learning_rate": 2.4198997474396877e-05,
3963
+ "loss": 0.0003,
3964
+ "step": 565
3965
+ },
3966
+ {
3967
+ "epoch": 2.005314437555359,
3968
+ "grad_norm": 0.060062225908041,
3969
+ "learning_rate": 2.3856601107901166e-05,
3970
+ "loss": 0.0003,
3971
+ "step": 566
3972
+ },
3973
+ {
3974
+ "epoch": 2.008857395925598,
3975
+ "grad_norm": 0.1853446364402771,
3976
+ "learning_rate": 2.351631966665476e-05,
3977
+ "loss": 0.0015,
3978
+ "step": 567
3979
+ },
3980
+ {
3981
+ "epoch": 2.012400354295837,
3982
+ "grad_norm": 0.0592714287340641,
3983
+ "learning_rate": 2.31781624796453e-05,
3984
+ "loss": 0.0001,
3985
+ "step": 568
3986
+ },
3987
+ {
3988
+ "epoch": 2.015943312666076,
3989
+ "grad_norm": 0.13232074677944183,
3990
+ "learning_rate": 2.2842138817622883e-05,
3991
+ "loss": 0.0005,
3992
+ "step": 569
3993
+ },
3994
+ {
3995
+ "epoch": 2.0194862710363153,
3996
+ "grad_norm": 0.24643929302692413,
3997
+ "learning_rate": 2.250825789284594e-05,
3998
+ "loss": 0.0012,
3999
+ "step": 570
4000
+ },
4001
+ {
4002
+ "epoch": 2.0230292294065544,
4003
+ "grad_norm": 0.17991432547569275,
4004
+ "learning_rate": 2.217652885882869e-05,
4005
+ "loss": 0.0004,
4006
+ "step": 571
4007
+ },
4008
+ {
4009
+ "epoch": 2.0265721877767935,
4010
+ "grad_norm": 0.144062802195549,
4011
+ "learning_rate": 2.1846960810090188e-05,
4012
+ "loss": 0.0005,
4013
+ "step": 572
4014
+ },
4015
+ {
4016
+ "epoch": 2.0301151461470326,
4017
+ "grad_norm": 0.5604081153869629,
4018
+ "learning_rate": 2.151956278190494e-05,
4019
+ "loss": 0.0017,
4020
+ "step": 573
4021
+ },
4022
+ {
4023
+ "epoch": 2.033658104517272,
4024
+ "grad_norm": 0.4855101406574249,
4025
+ "learning_rate": 2.119434375005527e-05,
4026
+ "loss": 0.0008,
4027
+ "step": 574
4028
+ },
4029
+ {
4030
+ "epoch": 2.0372010628875112,
4031
+ "grad_norm": 0.9981550574302673,
4032
+ "learning_rate": 2.087131263058526e-05,
4033
+ "loss": 0.004,
4034
+ "step": 575
4035
+ },
4036
+ {
4037
+ "epoch": 2.0407440212577503,
4038
+ "grad_norm": 0.12432882189750671,
4039
+ "learning_rate": 2.055047827955618e-05,
4040
+ "loss": 0.0003,
4041
+ "step": 576
4042
+ },
4043
+ {
4044
+ "epoch": 2.0442869796279894,
4045
+ "grad_norm": 0.009903721511363983,
4046
+ "learning_rate": 2.0231849492803852e-05,
4047
+ "loss": 0.0001,
4048
+ "step": 577
4049
+ },
4050
+ {
4051
+ "epoch": 2.0478299379982285,
4052
+ "grad_norm": 0.044433970004320145,
4053
+ "learning_rate": 1.991543500569745e-05,
4054
+ "loss": 0.0002,
4055
+ "step": 578
4056
+ },
4057
+ {
4058
+ "epoch": 2.0513728963684676,
4059
+ "grad_norm": 0.2867472171783447,
4060
+ "learning_rate": 1.960124349289992e-05,
4061
+ "loss": 0.0019,
4062
+ "step": 579
4063
+ },
4064
+ {
4065
+ "epoch": 2.0549158547387067,
4066
+ "grad_norm": 0.371579110622406,
4067
+ "learning_rate": 1.928928356813032e-05,
4068
+ "loss": 0.0012,
4069
+ "step": 580
4070
+ },
4071
+ {
4072
+ "epoch": 2.058458813108946,
4073
+ "grad_norm": 0.07519207894802094,
4074
+ "learning_rate": 1.8979563783927565e-05,
4075
+ "loss": 0.0002,
4076
+ "step": 581
4077
+ },
4078
+ {
4079
+ "epoch": 2.062001771479185,
4080
+ "grad_norm": 0.20066216588020325,
4081
+ "learning_rate": 1.8672092631416013e-05,
4082
+ "loss": 0.0006,
4083
+ "step": 582
4084
+ },
4085
+ {
4086
+ "epoch": 2.065544729849424,
4087
+ "grad_norm": 0.19873085618019104,
4088
+ "learning_rate": 1.8366878540072614e-05,
4089
+ "loss": 0.0008,
4090
+ "step": 583
4091
+ },
4092
+ {
4093
+ "epoch": 2.0690876882196636,
4094
+ "grad_norm": 0.014256162568926811,
4095
+ "learning_rate": 1.8063929877495892e-05,
4096
+ "loss": 0.0001,
4097
+ "step": 584
4098
+ },
4099
+ {
4100
+ "epoch": 2.0726306465899027,
4101
+ "grad_norm": 0.03786277398467064,
4102
+ "learning_rate": 1.7763254949176414e-05,
4103
+ "loss": 0.0001,
4104
+ "step": 585
4105
+ },
4106
+ {
4107
+ "epoch": 2.076173604960142,
4108
+ "grad_norm": 0.15015755593776703,
4109
+ "learning_rate": 1.7464861998269243e-05,
4110
+ "loss": 0.0003,
4111
+ "step": 586
4112
+ },
4113
+ {
4114
+ "epoch": 2.079716563330381,
4115
+ "grad_norm": 0.3068810999393463,
4116
+ "learning_rate": 1.7168759205367893e-05,
4117
+ "loss": 0.0003,
4118
+ "step": 587
4119
+ },
4120
+ {
4121
+ "epoch": 2.08325952170062,
4122
+ "grad_norm": 0.32651999592781067,
4123
+ "learning_rate": 1.6874954688279956e-05,
4124
+ "loss": 0.0007,
4125
+ "step": 588
4126
+ },
4127
+ {
4128
+ "epoch": 2.086802480070859,
4129
+ "grad_norm": 0.5359931588172913,
4130
+ "learning_rate": 1.6583456501804725e-05,
4131
+ "loss": 0.0018,
4132
+ "step": 589
4133
+ },
4134
+ {
4135
+ "epoch": 2.090345438441098,
4136
+ "grad_norm": 0.08258194476366043,
4137
+ "learning_rate": 1.6294272637512183e-05,
4138
+ "loss": 0.0003,
4139
+ "step": 590
4140
+ },
4141
+ {
4142
+ "epoch": 2.0938883968113373,
4143
+ "grad_norm": 0.09279931336641312,
4144
+ "learning_rate": 1.600741102352409e-05,
4145
+ "loss": 0.0009,
4146
+ "step": 591
4147
+ },
4148
+ {
4149
+ "epoch": 2.0974313551815764,
4150
+ "grad_norm": 0.7681455016136169,
4151
+ "learning_rate": 1.57228795242965e-05,
4152
+ "loss": 0.0008,
4153
+ "step": 592
4154
+ },
4155
+ {
4156
+ "epoch": 2.100974313551816,
4157
+ "grad_norm": 0.5113418698310852,
4158
+ "learning_rate": 1.544068594040417e-05,
4159
+ "loss": 0.0006,
4160
+ "step": 593
4161
+ },
4162
+ {
4163
+ "epoch": 2.104517271922055,
4164
+ "grad_norm": 0.11657733470201492,
4165
+ "learning_rate": 1.516083800832676e-05,
4166
+ "loss": 0.0002,
4167
+ "step": 594
4168
+ },
4169
+ {
4170
+ "epoch": 2.108060230292294,
4171
+ "grad_norm": 0.06766359508037567,
4172
+ "learning_rate": 1.488334340023669e-05,
4173
+ "loss": 0.0002,
4174
+ "step": 595
4175
+ },
4176
+ {
4177
+ "epoch": 2.1116031886625333,
4178
+ "grad_norm": 0.01443282887339592,
4179
+ "learning_rate": 1.4608209723788835e-05,
4180
+ "loss": 0.0001,
4181
+ "step": 596
4182
+ },
4183
+ {
4184
+ "epoch": 2.1151461470327724,
4185
+ "grad_norm": 0.005601163953542709,
4186
+ "learning_rate": 1.4335444521911899e-05,
4187
+ "loss": 0.0,
4188
+ "step": 597
4189
+ },
4190
+ {
4191
+ "epoch": 2.1186891054030115,
4192
+ "grad_norm": 0.032016199082136154,
4193
+ "learning_rate": 1.4065055272601703e-05,
4194
+ "loss": 0.0001,
4195
+ "step": 598
4196
+ },
4197
+ {
4198
+ "epoch": 2.1222320637732506,
4199
+ "grad_norm": 0.04691418632864952,
4200
+ "learning_rate": 1.3797049388716065e-05,
4201
+ "loss": 0.0001,
4202
+ "step": 599
4203
+ },
4204
+ {
4205
+ "epoch": 2.1257750221434897,
4206
+ "grad_norm": 0.037374719977378845,
4207
+ "learning_rate": 1.3531434217771692e-05,
4208
+ "loss": 0.0002,
4209
+ "step": 600
4210
+ },
4211
+ {
4212
+ "epoch": 2.129317980513729,
4213
+ "grad_norm": 0.0651448667049408,
4214
+ "learning_rate": 1.3268217041742701e-05,
4215
+ "loss": 0.0001,
4216
+ "step": 601
4217
+ },
4218
+ {
4219
+ "epoch": 2.132860938883968,
4220
+ "grad_norm": 0.053037695586681366,
4221
+ "learning_rate": 1.3007405076860875e-05,
4222
+ "loss": 0.0001,
4223
+ "step": 602
4224
+ },
4225
+ {
4226
+ "epoch": 2.1364038972542074,
4227
+ "grad_norm": 0.056240785866975784,
4228
+ "learning_rate": 1.2749005473418015e-05,
4229
+ "loss": 0.0002,
4230
+ "step": 603
4231
+ },
4232
+ {
4233
+ "epoch": 2.1399468556244465,
4234
+ "grad_norm": 0.21008822321891785,
4235
+ "learning_rate": 1.2493025315569801e-05,
4236
+ "loss": 0.001,
4237
+ "step": 604
4238
+ },
4239
+ {
4240
+ "epoch": 2.1434898139946856,
4241
+ "grad_norm": 0.1465017944574356,
4242
+ "learning_rate": 1.2239471621141508e-05,
4243
+ "loss": 0.0005,
4244
+ "step": 605
4245
+ },
4246
+ {
4247
+ "epoch": 2.1470327723649247,
4248
+ "grad_norm": 0.017339682206511497,
4249
+ "learning_rate": 1.1988351341435792e-05,
4250
+ "loss": 0.0001,
4251
+ "step": 606
4252
+ },
4253
+ {
4254
+ "epoch": 2.150575730735164,
4255
+ "grad_norm": 0.5210103392601013,
4256
+ "learning_rate": 1.173967136104196e-05,
4257
+ "loss": 0.0013,
4258
+ "step": 607
4259
+ },
4260
+ {
4261
+ "epoch": 2.154118689105403,
4262
+ "grad_norm": 0.004422559402883053,
4263
+ "learning_rate": 1.1493438497647313e-05,
4264
+ "loss": 0.0,
4265
+ "step": 608
4266
+ },
4267
+ {
4268
+ "epoch": 2.157661647475642,
4269
+ "grad_norm": 0.03284211456775665,
4270
+ "learning_rate": 1.1249659501850155e-05,
4271
+ "loss": 0.0002,
4272
+ "step": 609
4273
+ },
4274
+ {
4275
+ "epoch": 2.161204605845881,
4276
+ "grad_norm": 0.03831435367465019,
4277
+ "learning_rate": 1.1008341056974854e-05,
4278
+ "loss": 0.0001,
4279
+ "step": 610
4280
+ },
4281
+ {
4282
+ "epoch": 2.1647475642161202,
4283
+ "grad_norm": 0.15469998121261597,
4284
+ "learning_rate": 1.0769489778888405e-05,
4285
+ "loss": 0.0003,
4286
+ "step": 611
4287
+ },
4288
+ {
4289
+ "epoch": 2.16829052258636,
4290
+ "grad_norm": 0.01614706963300705,
4291
+ "learning_rate": 1.0533112215819298e-05,
4292
+ "loss": 0.0,
4293
+ "step": 612
4294
+ },
4295
+ {
4296
+ "epoch": 2.171833480956599,
4297
+ "grad_norm": 0.10824459046125412,
4298
+ "learning_rate": 1.029921484817783e-05,
4299
+ "loss": 0.0004,
4300
+ "step": 613
4301
+ },
4302
+ {
4303
+ "epoch": 2.175376439326838,
4304
+ "grad_norm": 0.2749079465866089,
4305
+ "learning_rate": 1.0067804088378455e-05,
4306
+ "loss": 0.0013,
4307
+ "step": 614
4308
+ },
4309
+ {
4310
+ "epoch": 2.178919397697077,
4311
+ "grad_norm": 0.00494280643761158,
4312
+ "learning_rate": 9.8388862806641e-06,
4313
+ "loss": 0.0,
4314
+ "step": 615
4315
+ },
4316
+ {
4317
+ "epoch": 2.182462356067316,
4318
+ "grad_norm": 0.008706189692020416,
4319
+ "learning_rate": 9.612467700932045e-06,
4320
+ "loss": 0.0001,
4321
+ "step": 616
4322
+ },
4323
+ {
4324
+ "epoch": 2.1860053144375553,
4325
+ "grad_norm": 0.021209556609392166,
4326
+ "learning_rate": 9.388554556562049e-06,
4327
+ "loss": 0.0001,
4328
+ "step": 617
4329
+ },
4330
+ {
4331
+ "epoch": 2.1895482728077944,
4332
+ "grad_norm": 0.6736172437667847,
4333
+ "learning_rate": 9.167152986246078e-06,
4334
+ "loss": 0.001,
4335
+ "step": 618
4336
+ },
4337
+ {
4338
+ "epoch": 2.1930912311780335,
4339
+ "grad_norm": 0.09763076156377792,
4340
+ "learning_rate": 8.948269059820025e-06,
4341
+ "loss": 0.0007,
4342
+ "step": 619
4343
+ },
4344
+ {
4345
+ "epoch": 2.1966341895482726,
4346
+ "grad_norm": 0.38925012946128845,
4347
+ "learning_rate": 8.731908778097302e-06,
4348
+ "loss": 0.0025,
4349
+ "step": 620
4350
+ },
4351
+ {
4352
+ "epoch": 2.200177147918512,
4353
+ "grad_norm": 0.11619476974010468,
4354
+ "learning_rate": 8.518078072704338e-06,
4355
+ "loss": 0.0005,
4356
+ "step": 621
4357
+ },
4358
+ {
4359
+ "epoch": 2.2037201062887513,
4360
+ "grad_norm": 0.24150855839252472,
4361
+ "learning_rate": 8.306782805917904e-06,
4362
+ "loss": 0.0025,
4363
+ "step": 622
4364
+ },
4365
+ {
4366
+ "epoch": 2.2072630646589904,
4367
+ "grad_norm": 0.04961521923542023,
4368
+ "learning_rate": 8.098028770504494e-06,
4369
+ "loss": 0.0001,
4370
+ "step": 623
4371
+ },
4372
+ {
4373
+ "epoch": 2.2108060230292295,
4374
+ "grad_norm": 0.0838765874505043,
4375
+ "learning_rate": 7.891821689561459e-06,
4376
+ "loss": 0.0006,
4377
+ "step": 624
4378
+ },
4379
+ {
4380
+ "epoch": 2.2143489813994686,
4381
+ "grad_norm": 0.9362369179725647,
4382
+ "learning_rate": 7.68816721636004e-06,
4383
+ "loss": 0.0019,
4384
+ "step": 625
4385
+ },
4386
+ {
4387
+ "epoch": 2.2178919397697077,
4388
+ "grad_norm": 0.03214077651500702,
4389
+ "learning_rate": 7.487070934190532e-06,
4390
+ "loss": 0.0002,
4391
+ "step": 626
4392
+ },
4393
+ {
4394
+ "epoch": 2.2214348981399468,
4395
+ "grad_norm": 0.1560969352722168,
4396
+ "learning_rate": 7.288538356209092e-06,
4397
+ "loss": 0.0003,
4398
+ "step": 627
4399
+ },
4400
+ {
4401
+ "epoch": 2.224977856510186,
4402
+ "grad_norm": 0.05944027379155159,
4403
+ "learning_rate": 7.092574925286614e-06,
4404
+ "loss": 0.0002,
4405
+ "step": 628
4406
+ },
4407
+ {
4408
+ "epoch": 2.228520814880425,
4409
+ "grad_norm": 0.269961416721344,
4410
+ "learning_rate": 6.899186013859561e-06,
4411
+ "loss": 0.001,
4412
+ "step": 629
4413
+ },
4414
+ {
4415
+ "epoch": 2.2320637732506645,
4416
+ "grad_norm": 0.005772658158093691,
4417
+ "learning_rate": 6.708376923782635e-06,
4418
+ "loss": 0.0,
4419
+ "step": 630
4420
+ },
4421
+ {
4422
+ "epoch": 2.2356067316209036,
4423
+ "grad_norm": 0.0038862484507262707,
4424
+ "learning_rate": 6.520152886183406e-06,
4425
+ "loss": 0.0,
4426
+ "step": 631
4427
+ },
4428
+ {
4429
+ "epoch": 2.2391496899911427,
4430
+ "grad_norm": 0.055060192942619324,
4431
+ "learning_rate": 6.3345190613189635e-06,
4432
+ "loss": 0.0001,
4433
+ "step": 632
4434
+ },
4435
+ {
4436
+ "epoch": 2.242692648361382,
4437
+ "grad_norm": 0.020991992205381393,
4438
+ "learning_rate": 6.151480538434382e-06,
4439
+ "loss": 0.0001,
4440
+ "step": 633
4441
+ },
4442
+ {
4443
+ "epoch": 2.246235606731621,
4444
+ "grad_norm": 0.008830989710986614,
4445
+ "learning_rate": 5.971042335623229e-06,
4446
+ "loss": 0.0,
4447
+ "step": 634
4448
+ },
4449
+ {
4450
+ "epoch": 2.24977856510186,
4451
+ "grad_norm": 0.2199835330247879,
4452
+ "learning_rate": 5.793209399689978e-06,
4453
+ "loss": 0.0004,
4454
+ "step": 635
4455
+ },
4456
+ {
4457
+ "epoch": 2.253321523472099,
4458
+ "grad_norm": 0.08529902994632721,
4459
+ "learning_rate": 5.617986606014419e-06,
4460
+ "loss": 0.0004,
4461
+ "step": 636
4462
+ },
4463
+ {
4464
+ "epoch": 2.2568644818423382,
4465
+ "grad_norm": 0.19139614701271057,
4466
+ "learning_rate": 5.445378758417925e-06,
4467
+ "loss": 0.0016,
4468
+ "step": 637
4469
+ },
4470
+ {
4471
+ "epoch": 2.2604074402125773,
4472
+ "grad_norm": 0.009827593341469765,
4473
+ "learning_rate": 5.275390589031859e-06,
4474
+ "loss": 0.0001,
4475
+ "step": 638
4476
+ },
4477
+ {
4478
+ "epoch": 2.263950398582817,
4479
+ "grad_norm": 0.1290542483329773,
4480
+ "learning_rate": 5.108026758167719e-06,
4481
+ "loss": 0.0002,
4482
+ "step": 639
4483
+ },
4484
+ {
4485
+ "epoch": 2.267493356953056,
4486
+ "grad_norm": 0.17350342869758606,
4487
+ "learning_rate": 4.943291854189493e-06,
4488
+ "loss": 0.0005,
4489
+ "step": 640
4490
+ },
4491
+ {
4492
+ "epoch": 2.271036315323295,
4493
+ "grad_norm": 0.3507162928581238,
4494
+ "learning_rate": 4.781190393387796e-06,
4495
+ "loss": 0.0003,
4496
+ "step": 641
4497
+ },
4498
+ {
4499
+ "epoch": 2.274579273693534,
4500
+ "grad_norm": 0.02020534686744213,
4501
+ "learning_rate": 4.6217268198560404e-06,
4502
+ "loss": 0.0001,
4503
+ "step": 642
4504
+ },
4505
+ {
4506
+ "epoch": 2.2781222320637733,
4507
+ "grad_norm": 0.025796467438340187,
4508
+ "learning_rate": 4.464905505368658e-06,
4509
+ "loss": 0.0001,
4510
+ "step": 643
4511
+ },
4512
+ {
4513
+ "epoch": 2.2816651904340124,
4514
+ "grad_norm": 0.07381541281938553,
4515
+ "learning_rate": 4.3107307492612086e-06,
4516
+ "loss": 0.0003,
4517
+ "step": 644
4518
+ },
4519
+ {
4520
+ "epoch": 2.2852081488042515,
4521
+ "grad_norm": 0.03729734942317009,
4522
+ "learning_rate": 4.1592067783125015e-06,
4523
+ "loss": 0.0001,
4524
+ "step": 645
4525
+ },
4526
+ {
4527
+ "epoch": 2.2887511071744906,
4528
+ "grad_norm": 0.009395123459398746,
4529
+ "learning_rate": 4.010337746628751e-06,
4530
+ "loss": 0.0,
4531
+ "step": 646
4532
+ },
4533
+ {
4534
+ "epoch": 2.2922940655447297,
4535
+ "grad_norm": 0.1595831662416458,
4536
+ "learning_rate": 3.864127735529656e-06,
4537
+ "loss": 0.0004,
4538
+ "step": 647
4539
+ },
4540
+ {
4541
+ "epoch": 2.2958370239149692,
4542
+ "grad_norm": 0.008252732455730438,
4543
+ "learning_rate": 3.7205807534365315e-06,
4544
+ "loss": 0.0,
4545
+ "step": 648
4546
+ },
4547
+ {
4548
+ "epoch": 2.299379982285208,
4549
+ "grad_norm": 0.517535924911499,
4550
+ "learning_rate": 3.5797007357623945e-06,
4551
+ "loss": 0.0005,
4552
+ "step": 649
4553
+ },
4554
+ {
4555
+ "epoch": 2.3029229406554474,
4556
+ "grad_norm": 0.012122491374611855,
4557
+ "learning_rate": 3.441491544804112e-06,
4558
+ "loss": 0.0001,
4559
+ "step": 650
4560
+ },
4561
+ {
4562
+ "epoch": 2.3064658990256866,
4563
+ "grad_norm": 0.09782838821411133,
4564
+ "learning_rate": 3.3059569696364502e-06,
4565
+ "loss": 0.0004,
4566
+ "step": 651
4567
+ },
4568
+ {
4569
+ "epoch": 2.3100088573959257,
4570
+ "grad_norm": 0.19513198733329773,
4571
+ "learning_rate": 3.1731007260082616e-06,
4572
+ "loss": 0.0012,
4573
+ "step": 652
4574
+ },
4575
+ {
4576
+ "epoch": 2.3135518157661648,
4577
+ "grad_norm": 0.017000757157802582,
4578
+ "learning_rate": 3.0429264562405776e-06,
4579
+ "loss": 0.0001,
4580
+ "step": 653
4581
+ },
4582
+ {
4583
+ "epoch": 2.317094774136404,
4584
+ "grad_norm": 0.0052710892632603645,
4585
+ "learning_rate": 2.9154377291267674e-06,
4586
+ "loss": 0.0,
4587
+ "step": 654
4588
+ },
4589
+ {
4590
+ "epoch": 2.320637732506643,
4591
+ "grad_norm": 0.06341676414012909,
4592
+ "learning_rate": 2.790638039834668e-06,
4593
+ "loss": 0.0002,
4594
+ "step": 655
4595
+ },
4596
+ {
4597
+ "epoch": 2.324180690876882,
4598
+ "grad_norm": 0.018487611785531044,
4599
+ "learning_rate": 2.6685308098108106e-06,
4600
+ "loss": 0.0001,
4601
+ "step": 656
4602
+ },
4603
+ {
4604
+ "epoch": 2.327723649247121,
4605
+ "grad_norm": 0.19819696247577667,
4606
+ "learning_rate": 2.5491193866866025e-06,
4607
+ "loss": 0.0008,
4608
+ "step": 657
4609
+ },
4610
+ {
4611
+ "epoch": 2.3312666076173603,
4612
+ "grad_norm": 0.9627290964126587,
4613
+ "learning_rate": 2.432407044186509e-06,
4614
+ "loss": 0.0032,
4615
+ "step": 658
4616
+ },
4617
+ {
4618
+ "epoch": 2.3348095659876,
4619
+ "grad_norm": 0.26399293541908264,
4620
+ "learning_rate": 2.3183969820383735e-06,
4621
+ "loss": 0.0002,
4622
+ "step": 659
4623
+ },
4624
+ {
4625
+ "epoch": 2.338352524357839,
4626
+ "grad_norm": 0.0048317075707018375,
4627
+ "learning_rate": 2.2070923258856255e-06,
4628
+ "loss": 0.0,
4629
+ "step": 660
4630
+ },
4631
+ {
4632
+ "epoch": 2.341895482728078,
4633
+ "grad_norm": 0.019670935347676277,
4634
+ "learning_rate": 2.098496127201648e-06,
4635
+ "loss": 0.0001,
4636
+ "step": 661
4637
+ },
4638
+ {
4639
+ "epoch": 2.345438441098317,
4640
+ "grad_norm": 0.0395568385720253,
4641
+ "learning_rate": 1.992611363206103e-06,
4642
+ "loss": 0.0001,
4643
+ "step": 662
4644
+ },
4645
+ {
4646
+ "epoch": 2.348981399468556,
4647
+ "grad_norm": 0.2325470745563507,
4648
+ "learning_rate": 1.889440936783242e-06,
4649
+ "loss": 0.0026,
4650
+ "step": 663
4651
+ },
4652
+ {
4653
+ "epoch": 2.3525243578387953,
4654
+ "grad_norm": 0.019612450152635574,
4655
+ "learning_rate": 1.7889876764024505e-06,
4656
+ "loss": 0.0001,
4657
+ "step": 664
4658
+ },
4659
+ {
4660
+ "epoch": 2.3560673162090344,
4661
+ "grad_norm": 0.11476591974496841,
4662
+ "learning_rate": 1.691254336040595e-06,
4663
+ "loss": 0.0004,
4664
+ "step": 665
4665
+ },
4666
+ {
4667
+ "epoch": 2.3596102745792735,
4668
+ "grad_norm": 0.022656535729765892,
4669
+ "learning_rate": 1.59624359510657e-06,
4670
+ "loss": 0.0001,
4671
+ "step": 666
4672
+ },
4673
+ {
4674
+ "epoch": 2.3631532329495126,
4675
+ "grad_norm": 0.04269755259156227,
4676
+ "learning_rate": 1.5039580583678393e-06,
4677
+ "loss": 0.0003,
4678
+ "step": 667
4679
+ },
4680
+ {
4681
+ "epoch": 2.366696191319752,
4682
+ "grad_norm": 0.6594712734222412,
4683
+ "learning_rate": 1.414400255879008e-06,
4684
+ "loss": 0.0011,
4685
+ "step": 668
4686
+ },
4687
+ {
4688
+ "epoch": 2.3702391496899913,
4689
+ "grad_norm": 0.10037083178758621,
4690
+ "learning_rate": 1.327572642912468e-06,
4691
+ "loss": 0.0002,
4692
+ "step": 669
4693
+ },
4694
+ {
4695
+ "epoch": 2.3737821080602304,
4696
+ "grad_norm": 0.21993687748908997,
4697
+ "learning_rate": 1.2434775998910964e-06,
4698
+ "loss": 0.0002,
4699
+ "step": 670
4700
+ },
4701
+ {
4702
+ "epoch": 2.3773250664304695,
4703
+ "grad_norm": 0.018617277964949608,
4704
+ "learning_rate": 1.1621174323229612e-06,
4705
+ "loss": 0.0001,
4706
+ "step": 671
4707
+ },
4708
+ {
4709
+ "epoch": 2.3808680248007086,
4710
+ "grad_norm": 0.019156964495778084,
4711
+ "learning_rate": 1.0834943707381784e-06,
4712
+ "loss": 0.0001,
4713
+ "step": 672
4714
+ },
4715
+ {
4716
+ "epoch": 2.3844109831709477,
4717
+ "grad_norm": 0.017957083880901337,
4718
+ "learning_rate": 1.0076105706276888e-06,
4719
+ "loss": 0.0001,
4720
+ "step": 673
4721
+ },
4722
+ {
4723
+ "epoch": 2.387953941541187,
4724
+ "grad_norm": 0.0515175499022007,
4725
+ "learning_rate": 9.344681123841967e-07,
4726
+ "loss": 0.0002,
4727
+ "step": 674
4728
+ },
4729
+ {
4730
+ "epoch": 2.391496899911426,
4731
+ "grad_norm": 0.05943327769637108,
4732
+ "learning_rate": 8.640690012451515e-07,
4733
+ "loss": 0.0004,
4734
+ "step": 675
4735
+ },
4736
+ {
4737
+ "epoch": 2.395039858281665,
4738
+ "grad_norm": 0.24372032284736633,
4739
+ "learning_rate": 7.964151672377458e-07,
4740
+ "loss": 0.0015,
4741
+ "step": 676
4742
+ },
4743
+ {
4744
+ "epoch": 2.3985828166519045,
4745
+ "grad_norm": 0.07449764758348465,
4746
+ "learning_rate": 7.315084651260009e-07,
4747
+ "loss": 0.0002,
4748
+ "step": 677
4749
+ },
4750
+ {
4751
+ "epoch": 2.4021257750221436,
4752
+ "grad_norm": 0.021373214200139046,
4753
+ "learning_rate": 6.69350674359959e-07,
4754
+ "loss": 0.0001,
4755
+ "step": 678
4756
+ },
4757
+ {
4758
+ "epoch": 2.4056687333923827,
4759
+ "grad_norm": 0.008149687200784683,
4760
+ "learning_rate": 6.099434990268609e-07,
4761
+ "loss": 0.0001,
4762
+ "step": 679
4763
+ },
4764
+ {
4765
+ "epoch": 2.409211691762622,
4766
+ "grad_norm": 0.10549762845039368,
4767
+ "learning_rate": 5.532885678043977e-07,
4768
+ "loss": 0.0003,
4769
+ "step": 680
4770
+ },
4771
+ {
4772
+ "epoch": 2.412754650132861,
4773
+ "grad_norm": 0.08827023208141327,
4774
+ "learning_rate": 4.9938743391615e-07,
4775
+ "loss": 0.0003,
4776
+ "step": 681
4777
+ },
4778
+ {
4779
+ "epoch": 2.4162976085031,
4780
+ "grad_norm": 0.011174162849783897,
4781
+ "learning_rate": 4.482415750889204e-07,
4782
+ "loss": 0.0,
4783
+ "step": 682
4784
+ },
4785
+ {
4786
+ "epoch": 2.419840566873339,
4787
+ "grad_norm": 0.0026113265193998814,
4788
+ "learning_rate": 3.998523935122772e-07,
4789
+ "loss": 0.0,
4790
+ "step": 683
4791
+ },
4792
+ {
4793
+ "epoch": 2.4233835252435783,
4794
+ "grad_norm": 0.03312141075730324,
4795
+ "learning_rate": 3.5422121580005864e-07,
4796
+ "loss": 0.0001,
4797
+ "step": 684
4798
+ },
4799
+ {
4800
+ "epoch": 2.4269264836138174,
4801
+ "grad_norm": 0.08379507064819336,
4802
+ "learning_rate": 3.1134929295407564e-07,
4803
+ "loss": 0.0003,
4804
+ "step": 685
4805
+ },
4806
+ {
4807
+ "epoch": 2.430469441984057,
4808
+ "grad_norm": 0.025159165263175964,
4809
+ "learning_rate": 2.7123780032973235e-07,
4810
+ "loss": 0.0001,
4811
+ "step": 686
4812
+ },
4813
+ {
4814
+ "epoch": 2.434012400354296,
4815
+ "grad_norm": 0.025299694389104843,
4816
+ "learning_rate": 2.3388783760386601e-07,
4817
+ "loss": 0.0001,
4818
+ "step": 687
4819
+ },
4820
+ {
4821
+ "epoch": 2.437555358724535,
4822
+ "grad_norm": 0.6148785352706909,
4823
+ "learning_rate": 1.9930042874457254e-07,
4824
+ "loss": 0.0163,
4825
+ "step": 688
4826
+ },
4827
+ {
4828
+ "epoch": 2.441098317094774,
4829
+ "grad_norm": 0.0018735540797933936,
4830
+ "learning_rate": 1.6747652198313957e-07,
4831
+ "loss": 0.0,
4832
+ "step": 689
4833
+ },
4834
+ {
4835
+ "epoch": 2.4446412754650133,
4836
+ "grad_norm": 0.11185324192047119,
4837
+ "learning_rate": 1.3841698978804285e-07,
4838
+ "loss": 0.0003,
4839
+ "step": 690
4840
+ },
4841
+ {
4842
+ "epoch": 2.4481842338352524,
4843
+ "grad_norm": 0.47679388523101807,
4844
+ "learning_rate": 1.1212262884103974e-07,
4845
+ "loss": 0.001,
4846
+ "step": 691
4847
+ },
4848
+ {
4849
+ "epoch": 2.4517271922054915,
4850
+ "grad_norm": 0.017726508900523186,
4851
+ "learning_rate": 8.85941600153033e-08,
4852
+ "loss": 0.0001,
4853
+ "step": 692
4854
+ },
4855
+ {
4856
+ "epoch": 2.4552701505757306,
4857
+ "grad_norm": 0.40522223711013794,
4858
+ "learning_rate": 6.783222835572055e-08,
4859
+ "loss": 0.0018,
4860
+ "step": 693
4861
+ },
4862
+ {
4863
+ "epoch": 2.4588131089459697,
4864
+ "grad_norm": 0.01428899448364973,
4865
+ "learning_rate": 4.98374030611084e-08,
4866
+ "loss": 0.0,
4867
+ "step": 694
4868
+ },
4869
+ {
4870
+ "epoch": 2.4623560673162093,
4871
+ "grad_norm": 1.708270788192749,
4872
+ "learning_rate": 3.461017746871675e-08,
4873
+ "loss": 0.0083,
4874
+ "step": 695
4875
+ },
4876
+ {
4877
+ "epoch": 2.4658990256864484,
4878
+ "grad_norm": 0.04217607527971268,
4879
+ "learning_rate": 2.215096904060454e-08,
4880
+ "loss": 0.0001,
4881
+ "step": 696
4882
+ },
4883
+ {
4884
+ "epoch": 2.4694419840566875,
4885
+ "grad_norm": 0.007940036244690418,
4886
+ "learning_rate": 1.246011935228064e-08,
4887
+ "loss": 0.0,
4888
+ "step": 697
4889
+ },
4890
+ {
4891
+ "epoch": 2.4729849424269266,
4892
+ "grad_norm": 0.14349356293678284,
4893
+ "learning_rate": 5.537894083273543e-09,
4894
+ "loss": 0.0007,
4895
+ "step": 698
4896
+ },
4897
+ {
4898
+ "epoch": 2.4765279007971657,
4899
+ "grad_norm": 0.017691675573587418,
4900
+ "learning_rate": 1.384483009898796e-09,
4901
+ "loss": 0.0001,
4902
+ "step": 699
4903
+ },
4904
+ {
4905
+ "epoch": 2.4800708591674048,
4906
+ "grad_norm": 0.371564656496048,
4907
+ "learning_rate": 0.0,
4908
+ "loss": 0.0009,
4909
+ "step": 700
4910
  }
4911
  ],
4912
  "logging_steps": 1,
 
@@ -3969,12 +4921,12 @@
  "should_evaluate": false,
  "should_log": false,
  "should_save": true,
- "should_training_stop": false
+ "should_training_stop": true
  },
  "attributes": {}
  }
  },
- "total_flos": 2.2350396661537178e+18,
+ "total_flos": 2.77336688291532e+18,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null