Romain-XV committed
Commit d79bd85 · verified · 1 Parent(s): bbe546b

Training in progress, step 200, checkpoint

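The training script itself is not part of this commit, so the exact arguments are unknown. As a rough illustration only, a Trainer configuration consistent with the state recorded below (eval every 50 steps, logging every step, train batch size 4, early stopping with threshold 0.0) might look like the following sketch; the save interval and patience value are guesses inferred from the checkpoint numbers and the stopping behaviour, not facts from the commit.

from transformers import TrainingArguments, EarlyStoppingCallback

# Values marked "from trainer_state.json" appear in the state below; the rest are assumptions.
args = TrainingArguments(
    output_dir="miner_id_24",            # inferred from best_model_checkpoint's parent directory
    per_device_train_batch_size=4,       # from trainer_state.json: train_batch_size
    eval_strategy="steps",
    eval_steps=50,                       # from trainer_state.json: eval_steps
    logging_steps=1,                     # from trainer_state.json: logging_steps
    save_steps=100,                      # assumed from checkpoint-100 / the step-200 checkpoint
    load_best_model_at_end=True,         # required for early stopping on eval_loss
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

early_stopping = EarlyStoppingCallback(
    early_stopping_patience=2,           # assumed: the patience counter below reaches 2 and training stops
    early_stopping_threshold=0.0,        # from trainer_state.json
)
# These objects would then be passed to Trainer(..., args=args, callbacks=[early_stopping]).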
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:32de89f64bfbdf5257adb9297417907f059af3bb9a4f224da336aef2a6b7bfa8
+oid sha256:267408b82b60dab4b44412e9284bc1f5648d09dec0c8f209b3cf6c1f8a9f63c8
 size 9864
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:206339865b007a0db7499757f0cc0552961328fa94fd512a48b5732e4bb1ebcb
+oid sha256:0c7aed16210dbfe7d4bef5bb06de34a5b5301fc935c146c1b94fd2dbb39fdcb7
 size 24006
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6bca983b309063a996168bc9ba0246dee10aad731d5eafae85ac843af75455c4
+oid sha256:ee42b226fbd651b63bde6fca880bc3d2b1843f2955205d066e9bd7809f09c0a9
 size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6e9a495185b30e410553401cbf647ae58e45b1f7a5b4cfd1421665ad738e6aa1
+oid sha256:9c1efa6588a3e275e4071dc95251b6e117e499882aa80f6f8f82ea2ac95bdaef
 size 1064
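An LFS pointer like the ones above records only the object's SHA-256 (oid) and byte size, so a downloaded blob can be checked against its pointer with a few lines of Python. A minimal sketch, assuming the checkpoint files have been fetched into a local last-checkpoint/ directory (the path is illustrative):

import hashlib
import os

def matches_lfs_pointer(blob_path, expected_oid, expected_size):
    # Stream the file so large checkpoints do not have to fit in memory.
    digest = hashlib.sha256()
    with open(blob_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest() == expected_oid and os.path.getsize(blob_path) == expected_size

# oid and size taken from the scheduler.pt pointer in this commit.
print(matches_lfs_pointer(
    "last-checkpoint/scheduler.pt",
    "9c1efa6588a3e275e4071dc95251b6e117e499882aa80f6f8f82ea2ac95bdaef",
    1064,
))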
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
 {
 "best_metric": 11.5,
 "best_model_checkpoint": "miner_id_24/checkpoint-100",
-"epoch": 0.20717337822089862,
+"epoch": 0.41434675644179725,
 "eval_steps": 50,
-"global_step": 100,
+"global_step": 200,
 "is_hyper_param_search": false,
 "is_local_process_zero": true,
 "is_world_process_zero": true,
@@ -731,6 +731,722 @@
 "eval_samples_per_second": 238.303,
 "eval_steps_per_second": 59.649,
 "step": 100
734
+ },
735
+ {
736
+ "epoch": 0.2092451120031076,
737
+ "grad_norm": 0.009546121582388878,
738
+ "learning_rate": 0.00018228387237361244,
739
+ "loss": 184.0,
740
+ "step": 101
741
+ },
742
+ {
743
+ "epoch": 0.2113168457853166,
744
+ "grad_norm": 0.010459087789058685,
745
+ "learning_rate": 0.00018190462079662896,
746
+ "loss": 184.0,
747
+ "step": 102
748
+ },
749
+ {
750
+ "epoch": 0.21338857956752558,
751
+ "grad_norm": 0.009236454963684082,
752
+ "learning_rate": 0.00018152175608395814,
753
+ "loss": 184.0,
754
+ "step": 103
755
+ },
756
+ {
757
+ "epoch": 0.21546031334973456,
758
+ "grad_norm": 0.009751472622156143,
759
+ "learning_rate": 0.0001811352951252717,
760
+ "loss": 184.0,
761
+ "step": 104
762
+ },
763
+ {
764
+ "epoch": 0.21753204713194355,
765
+ "grad_norm": 0.01094972062855959,
766
+ "learning_rate": 0.0001807452549688859,
767
+ "loss": 184.0,
768
+ "step": 105
769
+ },
770
+ {
771
+ "epoch": 0.21960378091415253,
772
+ "grad_norm": 0.009608612395823002,
773
+ "learning_rate": 0.0001803516528210096,
774
+ "loss": 184.0,
775
+ "step": 106
776
+ },
777
+ {
778
+ "epoch": 0.22167551469636151,
779
+ "grad_norm": 0.011158065870404243,
780
+ "learning_rate": 0.00017995450604498512,
781
+ "loss": 184.0,
782
+ "step": 107
783
+ },
784
+ {
785
+ "epoch": 0.2237472484785705,
786
+ "grad_norm": 0.011936160735785961,
787
+ "learning_rate": 0.0001795538321605222,
788
+ "loss": 184.0,
789
+ "step": 108
790
+ },
791
+ {
792
+ "epoch": 0.22581898226077948,
793
+ "grad_norm": 0.01062457449734211,
794
+ "learning_rate": 0.00017914964884292544,
795
+ "loss": 184.0,
796
+ "step": 109
797
+ },
798
+ {
799
+ "epoch": 0.22789071604298847,
800
+ "grad_norm": 0.010308627970516682,
801
+ "learning_rate": 0.00017874197392231414,
802
+ "loss": 184.0,
803
+ "step": 110
804
+ },
805
+ {
806
+ "epoch": 0.22996244982519745,
807
+ "grad_norm": 0.009443109855055809,
808
+ "learning_rate": 0.00017833082538283614,
809
+ "loss": 184.0,
810
+ "step": 111
811
+ },
812
+ {
813
+ "epoch": 0.23203418360740644,
814
+ "grad_norm": 0.012800839729607105,
815
+ "learning_rate": 0.00017791622136187422,
816
+ "loss": 184.0,
817
+ "step": 112
818
+ },
819
+ {
820
+ "epoch": 0.23410591738961545,
821
+ "grad_norm": 0.013194529339671135,
822
+ "learning_rate": 0.0001774981801492461,
823
+ "loss": 184.0,
824
+ "step": 113
825
+ },
826
+ {
827
+ "epoch": 0.23617765117182443,
828
+ "grad_norm": 0.013125723227858543,
829
+ "learning_rate": 0.00017707672018639758,
830
+ "loss": 184.0,
831
+ "step": 114
832
+ },
833
+ {
834
+ "epoch": 0.23824938495403342,
835
+ "grad_norm": 0.011432023718953133,
836
+ "learning_rate": 0.000176651860065589,
837
+ "loss": 184.0,
838
+ "step": 115
839
+ },
840
+ {
841
+ "epoch": 0.2403211187362424,
842
+ "grad_norm": 0.00989463273435831,
843
+ "learning_rate": 0.00017622361852907505,
844
+ "loss": 184.0,
845
+ "step": 116
846
+ },
847
+ {
848
+ "epoch": 0.24239285251845138,
849
+ "grad_norm": 0.01200682483613491,
850
+ "learning_rate": 0.000175792014468278,
851
+ "loss": 184.0,
852
+ "step": 117
853
+ },
854
+ {
855
+ "epoch": 0.24446458630066037,
856
+ "grad_norm": 0.010150207206606865,
857
+ "learning_rate": 0.00017535706692295436,
858
+ "loss": 184.0,
859
+ "step": 118
860
+ },
861
+ {
862
+ "epoch": 0.24653632008286935,
863
+ "grad_norm": 0.012173088267445564,
864
+ "learning_rate": 0.0001749187950803549,
865
+ "loss": 184.0,
866
+ "step": 119
867
+ },
868
+ {
869
+ "epoch": 0.24860805386507834,
870
+ "grad_norm": 0.012775926850736141,
871
+ "learning_rate": 0.0001744772182743782,
872
+ "loss": 184.0,
873
+ "step": 120
874
+ },
875
+ {
876
+ "epoch": 0.2506797876472873,
877
+ "grad_norm": 0.01158383209258318,
878
+ "learning_rate": 0.0001740323559847179,
879
+ "loss": 184.0,
880
+ "step": 121
881
+ },
882
+ {
883
+ "epoch": 0.2527515214294963,
884
+ "grad_norm": 0.01167603861540556,
885
+ "learning_rate": 0.0001735842278360032,
886
+ "loss": 184.0,
887
+ "step": 122
888
+ },
889
+ {
890
+ "epoch": 0.2548232552117053,
891
+ "grad_norm": 0.012568404898047447,
892
+ "learning_rate": 0.0001731328535969332,
893
+ "loss": 184.0,
894
+ "step": 123
895
+ },
896
+ {
897
+ "epoch": 0.2568949889939143,
898
+ "grad_norm": 0.01397041417658329,
899
+ "learning_rate": 0.00017267825317940493,
900
+ "loss": 184.0,
901
+ "step": 124
902
+ },
903
+ {
904
+ "epoch": 0.25896672277612326,
905
+ "grad_norm": 0.011861158534884453,
906
+ "learning_rate": 0.00017222044663763484,
907
+ "loss": 184.0,
908
+ "step": 125
909
+ },
910
+ {
911
+ "epoch": 0.26103845655833224,
912
+ "grad_norm": 0.012289059348404408,
913
+ "learning_rate": 0.00017175945416727405,
914
+ "loss": 184.0,
915
+ "step": 126
916
+ },
917
+ {
918
+ "epoch": 0.26311019034054123,
919
+ "grad_norm": 0.013283212669193745,
920
+ "learning_rate": 0.00017129529610451774,
921
+ "loss": 184.0,
922
+ "step": 127
923
+ },
924
+ {
925
+ "epoch": 0.2651819241227502,
926
+ "grad_norm": 0.010840194299817085,
927
+ "learning_rate": 0.00017082799292520768,
928
+ "loss": 184.0,
929
+ "step": 128
930
+ },
931
+ {
932
+ "epoch": 0.2672536579049592,
933
+ "grad_norm": 0.01217116229236126,
934
+ "learning_rate": 0.00017035756524392924,
935
+ "loss": 184.0,
936
+ "step": 129
937
+ },
938
+ {
939
+ "epoch": 0.2693253916871682,
940
+ "grad_norm": 0.014737384393811226,
941
+ "learning_rate": 0.00016988403381310176,
942
+ "loss": 184.0,
943
+ "step": 130
944
+ },
945
+ {
946
+ "epoch": 0.27139712546937717,
947
+ "grad_norm": 0.013226029463112354,
948
+ "learning_rate": 0.0001694074195220634,
949
+ "loss": 184.0,
950
+ "step": 131
951
+ },
952
+ {
953
+ "epoch": 0.27346885925158615,
954
+ "grad_norm": 0.013064621016383171,
955
+ "learning_rate": 0.00016892774339614928,
956
+ "loss": 184.0,
957
+ "step": 132
958
+ },
959
+ {
960
+ "epoch": 0.27554059303379513,
961
+ "grad_norm": 0.013637942261993885,
962
+ "learning_rate": 0.00016844502659576414,
963
+ "loss": 184.0,
964
+ "step": 133
965
+ },
966
+ {
967
+ "epoch": 0.2776123268160041,
968
+ "grad_norm": 0.011332944966852665,
969
+ "learning_rate": 0.0001679592904154489,
970
+ "loss": 184.0,
971
+ "step": 134
972
+ },
973
+ {
974
+ "epoch": 0.2796840605982131,
975
+ "grad_norm": 0.014374660328030586,
976
+ "learning_rate": 0.00016747055628294134,
977
+ "loss": 184.0,
978
+ "step": 135
979
+ },
980
+ {
981
+ "epoch": 0.28175579438042214,
982
+ "grad_norm": 0.010553686879575253,
983
+ "learning_rate": 0.00016697884575823043,
984
+ "loss": 184.0,
985
+ "step": 136
986
+ },
987
+ {
988
+ "epoch": 0.2838275281626311,
989
+ "grad_norm": 0.01155412383377552,
990
+ "learning_rate": 0.00016648418053260585,
991
+ "loss": 184.0,
992
+ "step": 137
993
+ },
994
+ {
995
+ "epoch": 0.2858992619448401,
996
+ "grad_norm": 0.012205103412270546,
997
+ "learning_rate": 0.00016598658242770054,
998
+ "loss": 184.0,
999
+ "step": 138
1000
+ },
1001
+ {
1002
+ "epoch": 0.2879709957270491,
1003
+ "grad_norm": 0.013882125727832317,
1004
+ "learning_rate": 0.00016548607339452853,
1005
+ "loss": 184.0,
1006
+ "step": 139
1007
+ },
1008
+ {
1009
+ "epoch": 0.2900427295092581,
1010
+ "grad_norm": 0.013000452890992165,
1011
+ "learning_rate": 0.00016498267551251616,
1012
+ "loss": 184.0,
1013
+ "step": 140
1014
+ },
1015
+ {
1016
+ "epoch": 0.29211446329146706,
1017
+ "grad_norm": 0.011942530982196331,
1018
+ "learning_rate": 0.0001644764109885284,
1019
+ "loss": 184.0,
1020
+ "step": 141
1021
+ },
1022
+ {
1023
+ "epoch": 0.29418619707367605,
1024
+ "grad_norm": 0.013372701592743397,
1025
+ "learning_rate": 0.00016396730215588915,
1026
+ "loss": 184.0,
1027
+ "step": 142
1028
+ },
1029
+ {
1030
+ "epoch": 0.29625793085588503,
1031
+ "grad_norm": 0.011299816891551018,
1032
+ "learning_rate": 0.00016345537147339579,
1033
+ "loss": 184.0,
1034
+ "step": 143
1035
+ },
1036
+ {
1037
+ "epoch": 0.298329664638094,
1038
+ "grad_norm": 0.013057565316557884,
1039
+ "learning_rate": 0.00016294064152432879,
1040
+ "loss": 184.0,
1041
+ "step": 144
1042
+ },
1043
+ {
1044
+ "epoch": 0.300401398420303,
1045
+ "grad_norm": 0.013032233342528343,
1046
+ "learning_rate": 0.0001624231350154552,
1047
+ "loss": 184.0,
1048
+ "step": 145
1049
+ },
1050
+ {
1051
+ "epoch": 0.302473132202512,
1052
+ "grad_norm": 0.012773418799042702,
1053
+ "learning_rate": 0.00016190287477602718,
1054
+ "loss": 184.0,
1055
+ "step": 146
1056
+ },
1057
+ {
1058
+ "epoch": 0.30454486598472097,
1059
+ "grad_norm": 0.013914387673139572,
1060
+ "learning_rate": 0.00016137988375677467,
1061
+ "loss": 184.0,
1062
+ "step": 147
1063
+ },
1064
+ {
1065
+ "epoch": 0.30661659976692995,
1066
+ "grad_norm": 0.012964308261871338,
1067
+ "learning_rate": 0.00016085418502889316,
1068
+ "loss": 184.0,
1069
+ "step": 148
1070
+ },
1071
+ {
1072
+ "epoch": 0.30868833354913894,
1073
+ "grad_norm": 0.012943675741553307,
1074
+ "learning_rate": 0.00016032580178302583,
1075
+ "loss": 184.0,
1076
+ "step": 149
1077
+ },
1078
+ {
1079
+ "epoch": 0.3107600673313479,
1080
+ "grad_norm": 0.013705256395041943,
1081
+ "learning_rate": 0.00015979475732824048,
1082
+ "loss": 184.0,
1083
+ "step": 150
1084
+ },
1085
+ {
1086
+ "epoch": 0.3107600673313479,
1087
+ "eval_loss": 11.5,
1088
+ "eval_runtime": 6.845,
1089
+ "eval_samples_per_second": 237.545,
1090
+ "eval_steps_per_second": 59.459,
1091
+ "step": 150
1092
+ },
1093
+ {
1094
+ "epoch": 0.3128318011135569,
1095
+ "grad_norm": 0.012948175892233849,
1096
+ "learning_rate": 0.00015926107509100137,
1097
+ "loss": 184.0,
1098
+ "step": 151
1099
+ },
1100
+ {
1101
+ "epoch": 0.3149035348957659,
1102
+ "grad_norm": 0.012896777130663395,
1103
+ "learning_rate": 0.00015872477861413576,
1104
+ "loss": 184.0,
1105
+ "step": 152
1106
+ },
1107
+ {
1108
+ "epoch": 0.3169752686779749,
1109
+ "grad_norm": 0.011514625512063503,
1110
+ "learning_rate": 0.0001581858915557953,
1111
+ "loss": 184.0,
1112
+ "step": 153
1113
+ },
1114
+ {
1115
+ "epoch": 0.31904700246018386,
1116
+ "grad_norm": 0.011680962517857552,
1117
+ "learning_rate": 0.00015764443768841234,
1118
+ "loss": 184.0,
1119
+ "step": 154
1120
+ },
1121
+ {
1122
+ "epoch": 0.32111873624239284,
1123
+ "grad_norm": 0.013176261447370052,
1124
+ "learning_rate": 0.00015710044089765145,
1125
+ "loss": 184.0,
1126
+ "step": 155
1127
+ },
1128
+ {
1129
+ "epoch": 0.32319047002460183,
1130
+ "grad_norm": 0.011944272555410862,
1131
+ "learning_rate": 0.00015655392518135539,
1132
+ "loss": 184.0,
1133
+ "step": 156
1134
+ },
1135
+ {
1136
+ "epoch": 0.3252622038068108,
1137
+ "grad_norm": 0.013128337450325489,
1138
+ "learning_rate": 0.00015600491464848678,
1139
+ "loss": 184.0,
1140
+ "step": 157
1141
+ },
1142
+ {
1143
+ "epoch": 0.3273339375890198,
1144
+ "grad_norm": 0.010075918398797512,
1145
+ "learning_rate": 0.00015545343351806444,
1146
+ "loss": 184.0,
1147
+ "step": 158
1148
+ },
1149
+ {
1150
+ "epoch": 0.3294056713712288,
1151
+ "grad_norm": 0.011055225506424904,
1152
+ "learning_rate": 0.00015489950611809484,
1153
+ "loss": 184.0,
1154
+ "step": 159
1155
+ },
1156
+ {
1157
+ "epoch": 0.33147740515343777,
1158
+ "grad_norm": 0.012351332232356071,
1159
+ "learning_rate": 0.00015434315688449924,
1160
+ "loss": 184.0,
1161
+ "step": 160
1162
+ },
1163
+ {
1164
+ "epoch": 0.33354913893564675,
1165
+ "grad_norm": 0.011134196072816849,
1166
+ "learning_rate": 0.0001537844103600354,
1167
+ "loss": 184.0,
1168
+ "step": 161
1169
+ },
1170
+ {
1171
+ "epoch": 0.33562087271785573,
1172
+ "grad_norm": 0.012073795311152935,
1173
+ "learning_rate": 0.00015322329119321507,
1174
+ "loss": 184.0,
1175
+ "step": 162
1176
+ },
1177
+ {
1178
+ "epoch": 0.3376926065000647,
1179
+ "grad_norm": 0.012623626738786697,
1180
+ "learning_rate": 0.00015265982413721662,
1181
+ "loss": 184.0,
1182
+ "step": 163
1183
+ },
1184
+ {
1185
+ "epoch": 0.3397643402822737,
1186
+ "grad_norm": 0.011703762225806713,
1187
+ "learning_rate": 0.00015209403404879303,
1188
+ "loss": 184.0,
1189
+ "step": 164
1190
+ },
1191
+ {
1192
+ "epoch": 0.3418360740644827,
1193
+ "grad_norm": 0.011324395425617695,
1194
+ "learning_rate": 0.00015152594588717543,
1195
+ "loss": 184.0,
1196
+ "step": 165
1197
+ },
1198
+ {
1199
+ "epoch": 0.3439078078466917,
1200
+ "grad_norm": 0.012606433592736721,
1201
+ "learning_rate": 0.00015095558471297195,
1202
+ "loss": 184.0,
1203
+ "step": 166
1204
+ },
1205
+ {
1206
+ "epoch": 0.3459795416289007,
1207
+ "grad_norm": 0.013505402021110058,
1208
+ "learning_rate": 0.00015038297568706243,
1209
+ "loss": 184.0,
1210
+ "step": 167
1211
+ },
1212
+ {
1213
+ "epoch": 0.3480512754111097,
1214
+ "grad_norm": 0.01219989825040102,
1215
+ "learning_rate": 0.00014980814406948806,
1216
+ "loss": 184.0,
1217
+ "step": 168
1218
+ },
1219
+ {
1220
+ "epoch": 0.3501230091933187,
1221
+ "grad_norm": 0.01135861687362194,
1222
+ "learning_rate": 0.00014923111521833758,
1223
+ "loss": 184.0,
1224
+ "step": 169
1225
+ },
1226
+ {
1227
+ "epoch": 0.35219474297552766,
1228
+ "grad_norm": 0.011894915252923965,
1229
+ "learning_rate": 0.00014865191458862816,
1230
+ "loss": 184.0,
1231
+ "step": 170
1232
+ },
1233
+ {
1234
+ "epoch": 0.35426647675773665,
1235
+ "grad_norm": 0.011499403044581413,
1236
+ "learning_rate": 0.00014807056773118274,
1237
+ "loss": 184.0,
1238
+ "step": 171
1239
+ },
1240
+ {
1241
+ "epoch": 0.35633821053994563,
1242
+ "grad_norm": 0.01057684887200594,
1243
+ "learning_rate": 0.00014748710029150293,
1244
+ "loss": 184.0,
1245
+ "step": 172
1246
+ },
1247
+ {
1248
+ "epoch": 0.3584099443221546,
1249
+ "grad_norm": 0.010864908806979656,
1250
+ "learning_rate": 0.0001469015380086374,
1251
+ "loss": 184.0,
1252
+ "step": 173
1253
+ },
1254
+ {
1255
+ "epoch": 0.3604816781043636,
1256
+ "grad_norm": 0.011844526045024395,
1257
+ "learning_rate": 0.0001463139067140468,
1258
+ "loss": 184.0,
1259
+ "step": 174
1260
+ },
1261
+ {
1262
+ "epoch": 0.3625534118865726,
1263
+ "grad_norm": 0.012101615779101849,
1264
+ "learning_rate": 0.00014572423233046386,
1265
+ "loss": 184.0,
1266
+ "step": 175
1267
+ },
1268
+ {
1269
+ "epoch": 0.36462514566878157,
1270
+ "grad_norm": 0.010460996069014072,
1271
+ "learning_rate": 0.00014513254087075014,
1272
+ "loss": 184.0,
1273
+ "step": 176
1274
+ },
1275
+ {
1276
+ "epoch": 0.36669687945099055,
1277
+ "grad_norm": 0.011071057058870792,
1278
+ "learning_rate": 0.00014453885843674838,
1279
+ "loss": 184.0,
1280
+ "step": 177
1281
+ },
1282
+ {
1283
+ "epoch": 0.36876861323319954,
1284
+ "grad_norm": 0.010831611230969429,
1285
+ "learning_rate": 0.00014394321121813093,
1286
+ "loss": 184.0,
1287
+ "step": 178
1288
+ },
1289
+ {
1290
+ "epoch": 0.3708403470154085,
1291
+ "grad_norm": 0.012190107256174088,
1292
+ "learning_rate": 0.00014334562549124467,
1293
+ "loss": 184.0,
1294
+ "step": 179
1295
+ },
1296
+ {
1297
+ "epoch": 0.3729120807976175,
1298
+ "grad_norm": 0.011053929105401039,
1299
+ "learning_rate": 0.0001427461276179517,
1300
+ "loss": 184.0,
1301
+ "step": 180
1302
+ },
1303
+ {
1304
+ "epoch": 0.3749838145798265,
1305
+ "grad_norm": 0.009843365289270878,
1306
+ "learning_rate": 0.0001421447440444663,
1307
+ "loss": 184.0,
1308
+ "step": 181
1309
+ },
1310
+ {
1311
+ "epoch": 0.3770555483620355,
1312
+ "grad_norm": 0.011174674145877361,
1313
+ "learning_rate": 0.00014154150130018866,
1314
+ "loss": 184.0,
1315
+ "step": 182
1316
+ },
1317
+ {
1318
+ "epoch": 0.37912728214424446,
1319
+ "grad_norm": 0.011116673238575459,
1320
+ "learning_rate": 0.00014093642599653406,
1321
+ "loss": 184.0,
1322
+ "step": 183
1323
+ },
1324
+ {
1325
+ "epoch": 0.38119901592645344,
1326
+ "grad_norm": 0.010997000150382519,
1327
+ "learning_rate": 0.00014032954482575937,
1328
+ "loss": 184.0,
1329
+ "step": 184
1330
+ },
1331
+ {
1332
+ "epoch": 0.38327074970866243,
1333
+ "grad_norm": 0.010498798452317715,
1334
+ "learning_rate": 0.00013972088455978536,
1335
+ "loss": 184.0,
1336
+ "step": 185
1337
+ },
1338
+ {
1339
+ "epoch": 0.3853424834908714,
1340
+ "grad_norm": 0.011176199652254581,
1341
+ "learning_rate": 0.0001391104720490156,
1342
+ "loss": 184.0,
1343
+ "step": 186
1344
+ },
1345
+ {
1346
+ "epoch": 0.3874142172730804,
1347
+ "grad_norm": 0.01269106101244688,
1348
+ "learning_rate": 0.00013849833422115222,
1349
+ "loss": 184.0,
1350
+ "step": 187
1351
+ },
1352
+ {
1353
+ "epoch": 0.3894859510552894,
1354
+ "grad_norm": 0.010353796184062958,
1355
+ "learning_rate": 0.0001378844980800078,
1356
+ "loss": 184.0,
1357
+ "step": 188
1358
+ },
1359
+ {
1360
+ "epoch": 0.39155768483749837,
1361
+ "grad_norm": 0.009553619660437107,
1362
+ "learning_rate": 0.00013726899070431423,
1363
+ "loss": 184.0,
1364
+ "step": 189
1365
+ },
1366
+ {
1367
+ "epoch": 0.39362941861970735,
1368
+ "grad_norm": 0.010300654917955399,
1369
+ "learning_rate": 0.00013665183924652815,
1370
+ "loss": 184.0,
1371
+ "step": 190
1372
+ },
1373
+ {
1374
+ "epoch": 0.39570115240191633,
1375
+ "grad_norm": 0.010640212334692478,
1376
+ "learning_rate": 0.00013603307093163318,
1377
+ "loss": 184.0,
1378
+ "step": 191
1379
+ },
1380
+ {
1381
+ "epoch": 0.3977728861841253,
1382
+ "grad_norm": 0.00991272833198309,
1383
+ "learning_rate": 0.00013541271305593877,
1384
+ "loss": 184.0,
1385
+ "step": 192
1386
+ },
1387
+ {
1388
+ "epoch": 0.3998446199663343,
1389
+ "grad_norm": 0.010035275481641293,
1390
+ "learning_rate": 0.00013479079298587635,
1391
+ "loss": 184.0,
1392
+ "step": 193
1393
+ },
1394
+ {
1395
+ "epoch": 0.4019163537485433,
1396
+ "grad_norm": 0.010469825938344002,
1397
+ "learning_rate": 0.00013416733815679166,
1398
+ "loss": 184.0,
1399
+ "step": 194
1400
+ },
1401
+ {
1402
+ "epoch": 0.40398808753075227,
1403
+ "grad_norm": 0.010510447435081005,
1404
+ "learning_rate": 0.00013354237607173495,
1405
+ "loss": 184.0,
1406
+ "step": 195
1407
+ },
1408
+ {
1409
+ "epoch": 0.4060598213129613,
1410
+ "grad_norm": 0.011126340366899967,
1411
+ "learning_rate": 0.00013291593430024727,
1412
+ "loss": 184.0,
1413
+ "step": 196
1414
+ },
1415
+ {
1416
+ "epoch": 0.4081315550951703,
1417
+ "grad_norm": 0.009824325330555439,
1418
+ "learning_rate": 0.00013228804047714463,
1419
+ "loss": 184.0,
1420
+ "step": 197
1421
+ },
1422
+ {
1423
+ "epoch": 0.4102032888773793,
1424
+ "grad_norm": 0.009497965686023235,
1425
+ "learning_rate": 0.00013165872230129868,
1426
+ "loss": 184.0,
1427
+ "step": 198
1428
+ },
1429
+ {
1430
+ "epoch": 0.41227502265958826,
1431
+ "grad_norm": 0.010396288707852364,
1432
+ "learning_rate": 0.00013102800753441487,
1433
+ "loss": 184.0,
1434
+ "step": 199
1435
+ },
1436
+ {
1437
+ "epoch": 0.41434675644179725,
1438
+ "grad_norm": 0.0102333789691329,
1439
+ "learning_rate": 0.00013039592399980785,
1440
+ "loss": 184.0,
1441
+ "step": 200
1442
+ },
1443
+ {
1444
+ "epoch": 0.41434675644179725,
1445
+ "eval_loss": 11.5,
1446
+ "eval_runtime": 6.8462,
1447
+ "eval_samples_per_second": 237.505,
1448
+ "eval_steps_per_second": 59.449,
1449
+ "step": 200
1450
  }
 ],
 "logging_steps": 1,
@@ -745,7 +1461,7 @@
 "early_stopping_threshold": 0.0
 },
 "attributes": {
-"early_stopping_patience_counter": 0
+"early_stopping_patience_counter": 2
 }
 },
 "TrainerControl": {
@@ -754,12 +1470,12 @@
 "should_evaluate": false,
 "should_log": false,
 "should_save": true,
-"should_training_stop": false
+"should_training_stop": true
 },
 "attributes": {}
 }
 },
-"total_flos": 31916870860800.0,
+"total_flos": 63833741721600.0,
 "train_batch_size": 4,
 "trial_name": null,
 "trial_params": null