Training in progress, step 10000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 2384234968
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0036e61ddac96c13d28af5b7348463838da31642973c16e5370deba79e225fb7
|
3 |
size 2384234968
|
last-checkpoint/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 4768662910
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:81c5b9d8b1806de7455aa1e925033cdc78ae0c9f0b199eac035d87169284a120
|
3 |
size 4768662910
|
last-checkpoint/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3ba427af78f54355503e8fb146121e9f936d278226f07d5bf09468fc62083d77
|
3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
@@ -2,9 +2,9 @@
|
|
2 |
"best_global_step": null,
|
3 |
"best_metric": null,
|
4 |
"best_model_checkpoint": null,
|
5 |
-
"epoch": 0.
|
6 |
"eval_steps": 100,
|
7 |
-
"global_step":
|
8 |
"is_hyper_param_search": false,
|
9 |
"is_local_process_zero": true,
|
10 |
"is_world_process_zero": true,
|
@@ -8558,6 +8558,456 @@
|
|
8558 |
"mean_token_accuracy": 0.9684931464493275,
|
8559 |
"num_tokens": 38912000.0,
|
8560 |
"step": 9500
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8561 |
}
|
8562 |
],
|
8563 |
"logging_steps": 10,
|
@@ -8577,7 +9027,7 @@
|
|
8577 |
"attributes": {}
|
8578 |
}
|
8579 |
},
|
8580 |
-
"total_flos": 1.
|
8581 |
"train_batch_size": 1,
|
8582 |
"trial_name": null,
|
8583 |
"trial_params": null
|
|
|
2 |
"best_global_step": null,
|
3 |
"best_metric": null,
|
4 |
"best_model_checkpoint": null,
|
5 |
+
"epoch": 0.9666038374172345,
|
6 |
"eval_steps": 100,
|
7 |
+
"global_step": 10000,
|
8 |
"is_hyper_param_search": false,
|
9 |
"is_local_process_zero": true,
|
10 |
"is_world_process_zero": true,
|
|
|
8558 |
"mean_token_accuracy": 0.9684931464493275,
|
8559 |
"num_tokens": 38912000.0,
|
8560 |
"step": 9500
|
8561 |
+
},
|
8562 |
+
{
|
8563 |
+
"epoch": 0.91924024938379,
|
8564 |
+
"grad_norm": 0.8302338123321533,
|
8565 |
+
"learning_rate": 4.489795918367347e-06,
|
8566 |
+
"loss": 0.1492,
|
8567 |
+
"mean_token_accuracy": 0.9706213280558587,
|
8568 |
+
"num_tokens": 38952960.0,
|
8569 |
+
"step": 9510
|
8570 |
+
},
|
8571 |
+
{
|
8572 |
+
"epoch": 0.9202068532212073,
|
8573 |
+
"grad_norm": 0.7152886390686035,
|
8574 |
+
"learning_rate": 4.436090225563911e-06,
|
8575 |
+
"loss": 0.123,
|
8576 |
+
"mean_token_accuracy": 0.9749999962747097,
|
8577 |
+
"num_tokens": 38993920.0,
|
8578 |
+
"step": 9520
|
8579 |
+
},
|
8580 |
+
{
|
8581 |
+
"epoch": 0.9211734570586245,
|
8582 |
+
"grad_norm": 0.5830357074737549,
|
8583 |
+
"learning_rate": 4.382384532760473e-06,
|
8584 |
+
"loss": 0.1219,
|
8585 |
+
"mean_token_accuracy": 0.9751712322235108,
|
8586 |
+
"num_tokens": 39034880.0,
|
8587 |
+
"step": 9530
|
8588 |
+
},
|
8589 |
+
{
|
8590 |
+
"epoch": 0.9221400608960417,
|
8591 |
+
"grad_norm": 0.8357058763504028,
|
8592 |
+
"learning_rate": 4.328678839957036e-06,
|
8593 |
+
"loss": 0.1569,
|
8594 |
+
"mean_token_accuracy": 0.9674412876367569,
|
8595 |
+
"num_tokens": 39075840.0,
|
8596 |
+
"step": 9540
|
8597 |
+
},
|
8598 |
+
{
|
8599 |
+
"epoch": 0.923106664733459,
|
8600 |
+
"grad_norm": 0.8819809556007385,
|
8601 |
+
"learning_rate": 4.274973147153598e-06,
|
8602 |
+
"loss": 0.143,
|
8603 |
+
"mean_token_accuracy": 0.9700097791850567,
|
8604 |
+
"num_tokens": 39116800.0,
|
8605 |
+
"step": 9550
|
8606 |
+
},
|
8607 |
+
{
|
8608 |
+
"epoch": 0.9240732685708762,
|
8609 |
+
"grad_norm": 0.6170474290847778,
|
8610 |
+
"learning_rate": 4.221267454350161e-06,
|
8611 |
+
"loss": 0.1453,
|
8612 |
+
"mean_token_accuracy": 0.9699363976716995,
|
8613 |
+
"num_tokens": 39157760.0,
|
8614 |
+
"step": 9560
|
8615 |
+
},
|
8616 |
+
{
|
8617 |
+
"epoch": 0.9250398724082934,
|
8618 |
+
"grad_norm": 0.7918187975883484,
|
8619 |
+
"learning_rate": 4.1675617615467236e-06,
|
8620 |
+
"loss": 0.1473,
|
8621 |
+
"mean_token_accuracy": 0.9694960817694664,
|
8622 |
+
"num_tokens": 39198720.0,
|
8623 |
+
"step": 9570
|
8624 |
+
},
|
8625 |
+
{
|
8626 |
+
"epoch": 0.9260064762457106,
|
8627 |
+
"grad_norm": 0.7999791502952576,
|
8628 |
+
"learning_rate": 4.113856068743287e-06,
|
8629 |
+
"loss": 0.12,
|
8630 |
+
"mean_token_accuracy": 0.9749755367636681,
|
8631 |
+
"num_tokens": 39239680.0,
|
8632 |
+
"step": 9580
|
8633 |
+
},
|
8634 |
+
{
|
8635 |
+
"epoch": 0.926973080083128,
|
8636 |
+
"grad_norm": 0.794882595539093,
|
8637 |
+
"learning_rate": 4.06015037593985e-06,
|
8638 |
+
"loss": 0.1322,
|
8639 |
+
"mean_token_accuracy": 0.9725782789289952,
|
8640 |
+
"num_tokens": 39280640.0,
|
8641 |
+
"step": 9590
|
8642 |
+
},
|
8643 |
+
{
|
8644 |
+
"epoch": 0.9279396839205452,
|
8645 |
+
"grad_norm": 0.6439830660820007,
|
8646 |
+
"learning_rate": 4.006444683136413e-06,
|
8647 |
+
"loss": 0.1463,
|
8648 |
+
"mean_token_accuracy": 0.9704745531082153,
|
8649 |
+
"num_tokens": 39321600.0,
|
8650 |
+
"step": 9600
|
8651 |
+
},
|
8652 |
+
{
|
8653 |
+
"epoch": 0.9289062877579624,
|
8654 |
+
"grad_norm": 0.7629963159561157,
|
8655 |
+
"learning_rate": 3.952738990332976e-06,
|
8656 |
+
"loss": 0.1556,
|
8657 |
+
"mean_token_accuracy": 0.9674657486379147,
|
8658 |
+
"num_tokens": 39362560.0,
|
8659 |
+
"step": 9610
|
8660 |
+
},
|
8661 |
+
{
|
8662 |
+
"epoch": 0.9298728915953797,
|
8663 |
+
"grad_norm": 0.7481008172035217,
|
8664 |
+
"learning_rate": 3.899033297529538e-06,
|
8665 |
+
"loss": 0.1459,
|
8666 |
+
"mean_token_accuracy": 0.9704256355762482,
|
8667 |
+
"num_tokens": 39403520.0,
|
8668 |
+
"step": 9620
|
8669 |
+
},
|
8670 |
+
{
|
8671 |
+
"epoch": 0.9308394954327969,
|
8672 |
+
"grad_norm": 0.7382989525794983,
|
8673 |
+
"learning_rate": 3.845327604726101e-06,
|
8674 |
+
"loss": 0.1481,
|
8675 |
+
"mean_token_accuracy": 0.9690802298486233,
|
8676 |
+
"num_tokens": 39444480.0,
|
8677 |
+
"step": 9630
|
8678 |
+
},
|
8679 |
+
{
|
8680 |
+
"epoch": 0.9318060992702141,
|
8681 |
+
"grad_norm": 0.7181780934333801,
|
8682 |
+
"learning_rate": 3.791621911922664e-06,
|
8683 |
+
"loss": 0.1432,
|
8684 |
+
"mean_token_accuracy": 0.9703767105937005,
|
8685 |
+
"num_tokens": 39485440.0,
|
8686 |
+
"step": 9640
|
8687 |
+
},
|
8688 |
+
{
|
8689 |
+
"epoch": 0.9327727031076314,
|
8690 |
+
"grad_norm": 0.7073920369148254,
|
8691 |
+
"learning_rate": 3.7379162191192266e-06,
|
8692 |
+
"loss": 0.1423,
|
8693 |
+
"mean_token_accuracy": 0.9701320916414261,
|
8694 |
+
"num_tokens": 39526400.0,
|
8695 |
+
"step": 9650
|
8696 |
+
},
|
8697 |
+
{
|
8698 |
+
"epoch": 0.9337393069450486,
|
8699 |
+
"grad_norm": 0.665108859539032,
|
8700 |
+
"learning_rate": 3.6842105263157892e-06,
|
8701 |
+
"loss": 0.1367,
|
8702 |
+
"mean_token_accuracy": 0.9713796436786651,
|
8703 |
+
"num_tokens": 39567360.0,
|
8704 |
+
"step": 9660
|
8705 |
+
},
|
8706 |
+
{
|
8707 |
+
"epoch": 0.9347059107824658,
|
8708 |
+
"grad_norm": 0.6856437921524048,
|
8709 |
+
"learning_rate": 3.6305048335123527e-06,
|
8710 |
+
"loss": 0.1487,
|
8711 |
+
"mean_token_accuracy": 0.9685665339231491,
|
8712 |
+
"num_tokens": 39608320.0,
|
8713 |
+
"step": 9670
|
8714 |
+
},
|
8715 |
+
{
|
8716 |
+
"epoch": 0.935672514619883,
|
8717 |
+
"grad_norm": 0.6998845934867859,
|
8718 |
+
"learning_rate": 3.5767991407089154e-06,
|
8719 |
+
"loss": 0.1325,
|
8720 |
+
"mean_token_accuracy": 0.9735078237950802,
|
8721 |
+
"num_tokens": 39649280.0,
|
8722 |
+
"step": 9680
|
8723 |
+
},
|
8724 |
+
{
|
8725 |
+
"epoch": 0.9366391184573003,
|
8726 |
+
"grad_norm": 0.7220867276191711,
|
8727 |
+
"learning_rate": 3.523093447905478e-06,
|
8728 |
+
"loss": 0.1322,
|
8729 |
+
"mean_token_accuracy": 0.9729941241443157,
|
8730 |
+
"num_tokens": 39690240.0,
|
8731 |
+
"step": 9690
|
8732 |
+
},
|
8733 |
+
{
|
8734 |
+
"epoch": 0.9376057222947175,
|
8735 |
+
"grad_norm": 0.750056803226471,
|
8736 |
+
"learning_rate": 3.469387755102041e-06,
|
8737 |
+
"loss": 0.1321,
|
8738 |
+
"mean_token_accuracy": 0.9733365938067436,
|
8739 |
+
"num_tokens": 39731200.0,
|
8740 |
+
"step": 9700
|
8741 |
+
},
|
8742 |
+
{
|
8743 |
+
"epoch": 0.9385723261321347,
|
8744 |
+
"grad_norm": 0.5716467499732971,
|
8745 |
+
"learning_rate": 3.415682062298604e-06,
|
8746 |
+
"loss": 0.1513,
|
8747 |
+
"mean_token_accuracy": 0.9702299371361732,
|
8748 |
+
"num_tokens": 39772160.0,
|
8749 |
+
"step": 9710
|
8750 |
+
},
|
8751 |
+
{
|
8752 |
+
"epoch": 0.939538929969552,
|
8753 |
+
"grad_norm": 0.7372239828109741,
|
8754 |
+
"learning_rate": 3.3619763694951665e-06,
|
8755 |
+
"loss": 0.1608,
|
8756 |
+
"mean_token_accuracy": 0.9661448121070861,
|
8757 |
+
"num_tokens": 39813120.0,
|
8758 |
+
"step": 9720
|
8759 |
+
},
|
8760 |
+
{
|
8761 |
+
"epoch": 0.9405055338069692,
|
8762 |
+
"grad_norm": 0.6863879561424255,
|
8763 |
+
"learning_rate": 3.308270676691729e-06,
|
8764 |
+
"loss": 0.141,
|
8765 |
+
"mean_token_accuracy": 0.9709148697555066,
|
8766 |
+
"num_tokens": 39854080.0,
|
8767 |
+
"step": 9730
|
8768 |
+
},
|
8769 |
+
{
|
8770 |
+
"epoch": 0.9414721376443864,
|
8771 |
+
"grad_norm": 0.7031144499778748,
|
8772 |
+
"learning_rate": 3.2545649838882926e-06,
|
8773 |
+
"loss": 0.1355,
|
8774 |
+
"mean_token_accuracy": 0.9717954933643341,
|
8775 |
+
"num_tokens": 39895040.0,
|
8776 |
+
"step": 9740
|
8777 |
+
},
|
8778 |
+
{
|
8779 |
+
"epoch": 0.9424387414818037,
|
8780 |
+
"grad_norm": 0.6682131886482239,
|
8781 |
+
"learning_rate": 3.2008592910848553e-06,
|
8782 |
+
"loss": 0.145,
|
8783 |
+
"mean_token_accuracy": 0.9700342446565628,
|
8784 |
+
"num_tokens": 39936000.0,
|
8785 |
+
"step": 9750
|
8786 |
+
},
|
8787 |
+
{
|
8788 |
+
"epoch": 0.9434053453192209,
|
8789 |
+
"grad_norm": 0.6650647521018982,
|
8790 |
+
"learning_rate": 3.147153598281418e-06,
|
8791 |
+
"loss": 0.1413,
|
8792 |
+
"mean_token_accuracy": 0.9703767091035843,
|
8793 |
+
"num_tokens": 39976960.0,
|
8794 |
+
"step": 9760
|
8795 |
+
},
|
8796 |
+
{
|
8797 |
+
"epoch": 0.9443719491566381,
|
8798 |
+
"grad_norm": 0.8530674576759338,
|
8799 |
+
"learning_rate": 3.0934479054779806e-06,
|
8800 |
+
"loss": 0.151,
|
8801 |
+
"mean_token_accuracy": 0.9692025408148766,
|
8802 |
+
"num_tokens": 40017920.0,
|
8803 |
+
"step": 9770
|
8804 |
+
},
|
8805 |
+
{
|
8806 |
+
"epoch": 0.9453385529940553,
|
8807 |
+
"grad_norm": 0.7780562043190002,
|
8808 |
+
"learning_rate": 3.0397422126745437e-06,
|
8809 |
+
"loss": 0.1308,
|
8810 |
+
"mean_token_accuracy": 0.9729207396507263,
|
8811 |
+
"num_tokens": 40058880.0,
|
8812 |
+
"step": 9780
|
8813 |
+
},
|
8814 |
+
{
|
8815 |
+
"epoch": 0.9463051568314726,
|
8816 |
+
"grad_norm": 0.613500714302063,
|
8817 |
+
"learning_rate": 2.9860365198711068e-06,
|
8818 |
+
"loss": 0.1301,
|
8819 |
+
"mean_token_accuracy": 0.9727984338998794,
|
8820 |
+
"num_tokens": 40099840.0,
|
8821 |
+
"step": 9790
|
8822 |
+
},
|
8823 |
+
{
|
8824 |
+
"epoch": 0.9472717606688899,
|
8825 |
+
"grad_norm": 0.629189670085907,
|
8826 |
+
"learning_rate": 2.9323308270676694e-06,
|
8827 |
+
"loss": 0.1198,
|
8828 |
+
"mean_token_accuracy": 0.9759295471012592,
|
8829 |
+
"num_tokens": 40140800.0,
|
8830 |
+
"step": 9800
|
8831 |
+
},
|
8832 |
+
{
|
8833 |
+
"epoch": 0.948238364506307,
|
8834 |
+
"grad_norm": 0.7294339537620544,
|
8835 |
+
"learning_rate": 2.878625134264232e-06,
|
8836 |
+
"loss": 0.1376,
|
8837 |
+
"mean_token_accuracy": 0.9716731905937195,
|
8838 |
+
"num_tokens": 40181760.0,
|
8839 |
+
"step": 9810
|
8840 |
+
},
|
8841 |
+
{
|
8842 |
+
"epoch": 0.9492049683437244,
|
8843 |
+
"grad_norm": 0.7860731482505798,
|
8844 |
+
"learning_rate": 2.8249194414607948e-06,
|
8845 |
+
"loss": 0.1398,
|
8846 |
+
"mean_token_accuracy": 0.9713307186961174,
|
8847 |
+
"num_tokens": 40222720.0,
|
8848 |
+
"step": 9820
|
8849 |
+
},
|
8850 |
+
{
|
8851 |
+
"epoch": 0.9501715721811416,
|
8852 |
+
"grad_norm": 0.6915313601493835,
|
8853 |
+
"learning_rate": 2.771213748657358e-06,
|
8854 |
+
"loss": 0.1543,
|
8855 |
+
"mean_token_accuracy": 0.9675146743655205,
|
8856 |
+
"num_tokens": 40263680.0,
|
8857 |
+
"step": 9830
|
8858 |
+
},
|
8859 |
+
{
|
8860 |
+
"epoch": 0.9511381760185588,
|
8861 |
+
"grad_norm": 0.6553166508674622,
|
8862 |
+
"learning_rate": 2.7175080558539205e-06,
|
8863 |
+
"loss": 0.143,
|
8864 |
+
"mean_token_accuracy": 0.9713307216763496,
|
8865 |
+
"num_tokens": 40304640.0,
|
8866 |
+
"step": 9840
|
8867 |
+
},
|
8868 |
+
{
|
8869 |
+
"epoch": 0.9521047798559761,
|
8870 |
+
"grad_norm": 0.5625308752059937,
|
8871 |
+
"learning_rate": 2.6638023630504836e-06,
|
8872 |
+
"loss": 0.1415,
|
8873 |
+
"mean_token_accuracy": 0.9711105637252331,
|
8874 |
+
"num_tokens": 40345600.0,
|
8875 |
+
"step": 9850
|
8876 |
+
},
|
8877 |
+
{
|
8878 |
+
"epoch": 0.9530713836933933,
|
8879 |
+
"grad_norm": 0.7295253872871399,
|
8880 |
+
"learning_rate": 2.6100966702470467e-06,
|
8881 |
+
"loss": 0.1567,
|
8882 |
+
"mean_token_accuracy": 0.9681262217462063,
|
8883 |
+
"num_tokens": 40386560.0,
|
8884 |
+
"step": 9860
|
8885 |
+
},
|
8886 |
+
{
|
8887 |
+
"epoch": 0.9540379875308105,
|
8888 |
+
"grad_norm": 0.5923715233802795,
|
8889 |
+
"learning_rate": 2.5563909774436093e-06,
|
8890 |
+
"loss": 0.1241,
|
8891 |
+
"mean_token_accuracy": 0.9743150658905506,
|
8892 |
+
"num_tokens": 40427520.0,
|
8893 |
+
"step": 9870
|
8894 |
+
},
|
8895 |
+
{
|
8896 |
+
"epoch": 0.9550045913682277,
|
8897 |
+
"grad_norm": 0.6577922105789185,
|
8898 |
+
"learning_rate": 2.502685284640172e-06,
|
8899 |
+
"loss": 0.1238,
|
8900 |
+
"mean_token_accuracy": 0.9746086105704308,
|
8901 |
+
"num_tokens": 40468480.0,
|
8902 |
+
"step": 9880
|
8903 |
+
},
|
8904 |
+
{
|
8905 |
+
"epoch": 0.955971195205645,
|
8906 |
+
"grad_norm": 0.607589066028595,
|
8907 |
+
"learning_rate": 2.4489795918367347e-06,
|
8908 |
+
"loss": 0.1403,
|
8909 |
+
"mean_token_accuracy": 0.9714041076600551,
|
8910 |
+
"num_tokens": 40509440.0,
|
8911 |
+
"step": 9890
|
8912 |
+
},
|
8913 |
+
{
|
8914 |
+
"epoch": 0.9569377990430622,
|
8915 |
+
"grad_norm": 0.620296835899353,
|
8916 |
+
"learning_rate": 2.3952738990332978e-06,
|
8917 |
+
"loss": 0.1438,
|
8918 |
+
"mean_token_accuracy": 0.9697896286845207,
|
8919 |
+
"num_tokens": 40550400.0,
|
8920 |
+
"step": 9900
|
8921 |
+
},
|
8922 |
+
{
|
8923 |
+
"epoch": 0.9579044028804794,
|
8924 |
+
"grad_norm": 0.783877968788147,
|
8925 |
+
"learning_rate": 2.3415682062298604e-06,
|
8926 |
+
"loss": 0.1415,
|
8927 |
+
"mean_token_accuracy": 0.9718688800930977,
|
8928 |
+
"num_tokens": 40591360.0,
|
8929 |
+
"step": 9910
|
8930 |
+
},
|
8931 |
+
{
|
8932 |
+
"epoch": 0.9588710067178967,
|
8933 |
+
"grad_norm": 0.6063189506530762,
|
8934 |
+
"learning_rate": 2.287862513426423e-06,
|
8935 |
+
"loss": 0.1497,
|
8936 |
+
"mean_token_accuracy": 0.9689579211175442,
|
8937 |
+
"num_tokens": 40632320.0,
|
8938 |
+
"step": 9920
|
8939 |
+
},
|
8940 |
+
{
|
8941 |
+
"epoch": 0.9598376105553139,
|
8942 |
+
"grad_norm": 0.7489660382270813,
|
8943 |
+
"learning_rate": 2.234156820622986e-06,
|
8944 |
+
"loss": 0.135,
|
8945 |
+
"mean_token_accuracy": 0.9724559679627418,
|
8946 |
+
"num_tokens": 40673280.0,
|
8947 |
+
"step": 9930
|
8948 |
+
},
|
8949 |
+
{
|
8950 |
+
"epoch": 0.9608042143927311,
|
8951 |
+
"grad_norm": 0.5833399891853333,
|
8952 |
+
"learning_rate": 2.1804511278195492e-06,
|
8953 |
+
"loss": 0.1408,
|
8954 |
+
"mean_token_accuracy": 0.9712328761816025,
|
8955 |
+
"num_tokens": 40714240.0,
|
8956 |
+
"step": 9940
|
8957 |
+
},
|
8958 |
+
{
|
8959 |
+
"epoch": 0.9617708182301484,
|
8960 |
+
"grad_norm": 0.6912499666213989,
|
8961 |
+
"learning_rate": 2.126745435016112e-06,
|
8962 |
+
"loss": 0.1114,
|
8963 |
+
"mean_token_accuracy": 0.975831700116396,
|
8964 |
+
"num_tokens": 40755200.0,
|
8965 |
+
"step": 9950
|
8966 |
+
},
|
8967 |
+
{
|
8968 |
+
"epoch": 0.9627374220675656,
|
8969 |
+
"grad_norm": 0.8001022934913635,
|
8970 |
+
"learning_rate": 2.0730397422126746e-06,
|
8971 |
+
"loss": 0.1424,
|
8972 |
+
"mean_token_accuracy": 0.9710371777415275,
|
8973 |
+
"num_tokens": 40796160.0,
|
8974 |
+
"step": 9960
|
8975 |
+
},
|
8976 |
+
{
|
8977 |
+
"epoch": 0.9637040259049828,
|
8978 |
+
"grad_norm": 0.6807326674461365,
|
8979 |
+
"learning_rate": 2.0193340494092377e-06,
|
8980 |
+
"loss": 0.1397,
|
8981 |
+
"mean_token_accuracy": 0.9719911940395832,
|
8982 |
+
"num_tokens": 40837120.0,
|
8983 |
+
"step": 9970
|
8984 |
+
},
|
8985 |
+
{
|
8986 |
+
"epoch": 0.9646706297424,
|
8987 |
+
"grad_norm": 0.603573203086853,
|
8988 |
+
"learning_rate": 1.9656283566058003e-06,
|
8989 |
+
"loss": 0.143,
|
8990 |
+
"mean_token_accuracy": 0.9709393292665481,
|
8991 |
+
"num_tokens": 40878080.0,
|
8992 |
+
"step": 9980
|
8993 |
+
},
|
8994 |
+
{
|
8995 |
+
"epoch": 0.9656372335798173,
|
8996 |
+
"grad_norm": 0.7243348360061646,
|
8997 |
+
"learning_rate": 1.911922663802363e-06,
|
8998 |
+
"loss": 0.1455,
|
8999 |
+
"mean_token_accuracy": 0.9693737730383873,
|
9000 |
+
"num_tokens": 40919040.0,
|
9001 |
+
"step": 9990
|
9002 |
+
},
|
9003 |
+
{
|
9004 |
+
"epoch": 0.9666038374172345,
|
9005 |
+
"grad_norm": 0.6356106996536255,
|
9006 |
+
"learning_rate": 1.8582169709989259e-06,
|
9007 |
+
"loss": 0.1502,
|
9008 |
+
"mean_token_accuracy": 0.968175146728754,
|
9009 |
+
"num_tokens": 40960000.0,
|
9010 |
+
"step": 10000
|
9011 |
}
|
9012 |
],
|
9013 |
"logging_steps": 10,
|
|
|
9027 |
"attributes": {}
|
9028 |
}
|
9029 |
},
|
9030 |
+
"total_flos": 1.0824928198656e+17,
|
9031 |
"train_batch_size": 1,
|
9032 |
"trial_name": null,
|
9033 |
"trial_params": null
|