Commit 27be3d0 by xiulinyang
Parent: d503c9b

Adding model checkpoints and config files

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full set.
Files changed (50)
  1. .gitattributes +35 -0
  2. checkpoint-0/config.json +38 -0
  3. checkpoint-0/pytorch_model.bin +3 -0
  4. checkpoint-0/special_tokens_map.json +1 -0
  5. checkpoint-0/tokenizer_config.json +1 -0
  6. checkpoint-0/training_args.bin +3 -0
  7. checkpoint-100/config.json +38 -0
  8. checkpoint-100/optimizer.pt +3 -0
  9. checkpoint-100/pytorch_model.bin +3 -0
  10. checkpoint-100/rng_state.pth +3 -0
  11. checkpoint-100/scaler.pt +3 -0
  12. checkpoint-100/scheduler.pt +3 -0
  13. checkpoint-100/special_tokens_map.json +1 -0
  14. checkpoint-100/tokenizer_config.json +1 -0
  15. checkpoint-100/trainer_state.json +34 -0
  16. checkpoint-100/training_args.bin +3 -0
  17. checkpoint-1000/config.json +38 -0
  18. checkpoint-1000/optimizer.pt +3 -0
  19. checkpoint-1000/pytorch_model.bin +3 -0
  20. checkpoint-1000/rng_state.pth +3 -0
  21. checkpoint-1000/scaler.pt +3 -0
  22. checkpoint-1000/scheduler.pt +3 -0
  23. checkpoint-1000/special_tokens_map.json +1 -0
  24. checkpoint-1000/tokenizer_config.json +1 -0
  25. checkpoint-1000/trainer_state.json +158 -0
  26. checkpoint-1000/training_args.bin +3 -0
  27. checkpoint-1100/config.json +38 -0
  28. checkpoint-1100/optimizer.pt +3 -0
  29. checkpoint-1100/pytorch_model.bin +3 -0
  30. checkpoint-1100/rng_state.pth +3 -0
  31. checkpoint-1100/scaler.pt +3 -0
  32. checkpoint-1100/scheduler.pt +3 -0
  33. checkpoint-1100/special_tokens_map.json +1 -0
  34. checkpoint-1100/tokenizer_config.json +1 -0
  35. checkpoint-1100/trainer_state.json +170 -0
  36. checkpoint-1100/training_args.bin +3 -0
  37. checkpoint-1200/config.json +38 -0
  38. checkpoint-1200/optimizer.pt +3 -0
  39. checkpoint-1200/pytorch_model.bin +3 -0
  40. checkpoint-1200/rng_state.pth +3 -0
  41. checkpoint-1200/scaler.pt +3 -0
  42. checkpoint-1200/scheduler.pt +3 -0
  43. checkpoint-1200/special_tokens_map.json +1 -0
  44. checkpoint-1200/tokenizer_config.json +1 -0
  45. checkpoint-1200/trainer_state.json +182 -0
  46. checkpoint-1200/training_args.bin +3 -0
  47. checkpoint-200/config.json +38 -0
  48. checkpoint-200/optimizer.pt +3 -0
  49. checkpoint-200/pytorch_model.bin +3 -0
  50. checkpoint-200/rng_state.pth +3 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
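Every checkpoint artifact in this commit (*.bin, *.pt, *.pth) is matched by one of the rules above, so git history holds only small pointer stubs while Git LFS stores the payloads. A minimal sketch of how such glob rules apply, using Python's fnmatch as a rough stand-in for git's own matcher (git's pattern semantics differ in details such as `**`, so this is illustrative only):

```python
# Illustrative sketch: check which .gitattributes LFS patterns a path's
# basename would match. fnmatch approximates, but is not identical to,
# git's attribute matching (the directory rule saved_model/**/* is omitted).
from fnmatch import fnmatch

LFS_PATTERNS = ["*.7z", "*.arrow", "*.bin", "*.bz2", "*.ckpt", "*.ftz", "*.gz",
                "*.h5", "*.joblib", "*.lfs.*", "*.mlmodel", "*.model",
                "*.msgpack", "*.npy", "*.npz", "*.onnx", "*.ot", "*.parquet",
                "*.pb", "*.pickle", "*.pkl", "*.pt", "*.pth", "*.rar",
                "*.safetensors", "*.tar.*", "*.tar", "*.tflite", "*.tgz",
                "*.wasm", "*.xz", "*.zip", "*.zst", "*tfevents*"]

def tracked_by_lfs(path: str) -> bool:
    """True if the basename matches any LFS pattern from .gitattributes."""
    name = path.rsplit("/", 1)[-1]
    return any(fnmatch(name, pat) for pat in LFS_PATTERNS)

print(tracked_by_lfs("checkpoint-100/pytorch_model.bin"))   # True
print(tracked_by_lfs("checkpoint-100/trainer_state.json"))  # False
```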
checkpoint-0/config.json ADDED
@@ -0,0 +1,38 @@
+ {
+   "activation_function": "gelu_new",
+   "architectures": [
+     "GPT2LMHeadModel"
+   ],
+   "attn_pdrop": 0.1,
+   "bos_token_id": 50256,
+   "embd_pdrop": 0.1,
+   "eos_token_id": 50256,
+   "initializer_range": 0.02,
+   "layer_norm_epsilon": 1e-05,
+   "model_type": "gpt2",
+   "n_ctx": 1024,
+   "n_embd": 768,
+   "n_head": 12,
+   "n_inner": null,
+   "n_layer": 12,
+   "n_positions": 1024,
+   "reorder_and_upcast_attn": true,
+   "resid_pdrop": 0.1,
+   "scale_attn_by_inverse_layer_idx": true,
+   "scale_attn_weights": true,
+   "summary_activation": null,
+   "summary_first_dropout": 0.2,
+   "summary_proj_to_labels": true,
+   "summary_type": "cls_index",
+   "summary_use_proj": true,
+   "task_specific_params": {
+     "text-generation": {
+       "do_sample": true,
+       "max_length": 1024
+     }
+   },
+   "torch_dtype": "float32",
+   "transformers_version": "4.18.0",
+   "use_cache": false,
+   "vocab_size": 21128
+ }
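The config describes a 12-layer, 12-head, 768-dimensional GPT-2 with a reduced 21,128-token vocabulary, roughly 102M parameters, which is consistent with the ~421 MB float32 pytorch_model.bin below (the file also carries the per-layer causal-mask buffers). A minimal loading sketch, assuming a local clone of this repo and the transformers library; the config pins transformers 4.18.0, but newer releases read it as well:

```python
from transformers import GPT2Config, GPT2LMHeadModel

config = GPT2Config.from_pretrained("checkpoint-0")  # reads config.json
model = GPT2LMHeadModel(config)  # architecture only; weights random-init

n_params = sum(p.numel() for p in model.parameters())
print(f"{n_params / 1e6:.1f}M parameters")  # ~102M with vocab_size=21128
```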
checkpoint-0/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b4cb7ddeb977797468ae0b2e8a977bb66b33b53cf9ce0a85051cfaa03f3f32eb
+ size 420912233
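As with all LFS-tracked files in this diff, what git records is this three-line pointer stub (spec version, SHA-256 of the payload, size in bytes), not the weight file itself. A small sketch parsing the stub format (assumes Python 3.9+ for str.removeprefix):

```python
# Parse a Git LFS pointer stub like the one above into its three fields.
def parse_lfs_pointer(text: str) -> dict:
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    return {
        "spec": fields["version"],
        "sha256": fields["oid"].removeprefix("sha256:"),
        "size_bytes": int(fields["size"]),
    }

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:b4cb7ddeb977797468ae0b2e8a977bb66b33b53cf9ce0a85051cfaa03f3f32eb
size 420912233"""

info = parse_lfs_pointer(pointer)
print(info["size_bytes"] / 1e6, "MB")  # ~420.9 MB of float32 weights
```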
checkpoint-0/special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {}
checkpoint-0/tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"tokenizer_class": "PassthroughTokenizer"}
checkpoint-0/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b987d18c87bab512217174f78ebb9d5fdbaa7c1ed0879e0503d5e121c6da3324
+ size 3183
checkpoint-100/config.json ADDED
@@ -0,0 +1,38 @@
+ {
+   "activation_function": "gelu_new",
+   "architectures": [
+     "GPT2LMHeadModel"
+   ],
+   "attn_pdrop": 0.1,
+   "bos_token_id": 50256,
+   "embd_pdrop": 0.1,
+   "eos_token_id": 50256,
+   "initializer_range": 0.02,
+   "layer_norm_epsilon": 1e-05,
+   "model_type": "gpt2",
+   "n_ctx": 1024,
+   "n_embd": 768,
+   "n_head": 12,
+   "n_inner": null,
+   "n_layer": 12,
+   "n_positions": 1024,
+   "reorder_and_upcast_attn": true,
+   "resid_pdrop": 0.1,
+   "scale_attn_by_inverse_layer_idx": true,
+   "scale_attn_weights": true,
+   "summary_activation": null,
+   "summary_first_dropout": 0.2,
+   "summary_proj_to_labels": true,
+   "summary_type": "cls_index",
+   "summary_use_proj": true,
+   "task_specific_params": {
+     "text-generation": {
+       "do_sample": true,
+       "max_length": 1024
+     }
+   },
+   "torch_dtype": "float32",
+   "transformers_version": "4.18.0",
+   "use_cache": false,
+   "vocab_size": 21128
+ }
checkpoint-100/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2944bdd606d2287376b234d038b877f771ea4a64be138f6cb68d1dedbe1400a4
+ size 816635249
checkpoint-100/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d8e7e0e12f325854a067c6536843560d935df112d53de1ad598d85a15b5d99aa
+ size 420912233
checkpoint-100/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bf2b9119c5ee54f7d5de06e880d4157349bc34f034dd2f1f90bb759f212bdc47
+ size 14567
checkpoint-100/scaler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:13a3423b2fe42f204bc8fe2c666ff379f9fd753a0f13613064a5e71e86b519e8
+ size 559
checkpoint-100/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:75e9b9d31d11c624d89b0c04ad496adf4b5addd3e703848d2583972c703e8da6
+ size 623
checkpoint-100/special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {}
checkpoint-100/tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"tokenizer_class": "PassthroughTokenizer"}
checkpoint-100/trainer_state.json ADDED
@@ -0,0 +1,34 @@
+ {
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 2.0233333333333334,
+   "global_step": 100,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "epoch": 0.0,
+       "learning_rate": 4.9999999999999996e-06,
+       "loss": 10.1063,
+       "step": 1
+     },
+     {
+       "epoch": 1.01,
+       "learning_rate": 0.00025,
+       "loss": 6.9252,
+       "step": 50
+     },
+     {
+       "epoch": 2.02,
+       "learning_rate": 0.0005,
+       "loss": 4.9715,
+       "step": 100
+     }
+   ],
+   "max_steps": 1200,
+   "num_train_epochs": 9223372036854775807,
+   "total_flos": 2.7043725312e+16,
+   "trial_name": null,
+   "trial_params": null
+ }
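trainer_state.json is the Trainer's progress log: epoch/step counters, the log_history entries, and the run limits. num_train_epochs shows 9223372036854775807 (2^63 - 1) because the run is bounded by max_steps, not by epochs. A sketch pulling the loss curve out of the file, assuming a local clone:

```python
# Extract the (step, loss) curve from a checkpoint's trainer_state.json.
import json

with open("checkpoint-100/trainer_state.json") as f:
    state = json.load(f)

curve = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
print(curve)  # [(1, 10.1063), (50, 6.9252), (100, 4.9715)]
```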
checkpoint-100/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b987d18c87bab512217174f78ebb9d5fdbaa7c1ed0879e0503d5e121c6da3324
+ size 3183
checkpoint-1000/config.json ADDED
@@ -0,0 +1,38 @@
+ {
+   "activation_function": "gelu_new",
+   "architectures": [
+     "GPT2LMHeadModel"
+   ],
+   "attn_pdrop": 0.1,
+   "bos_token_id": 50256,
+   "embd_pdrop": 0.1,
+   "eos_token_id": 50256,
+   "initializer_range": 0.02,
+   "layer_norm_epsilon": 1e-05,
+   "model_type": "gpt2",
+   "n_ctx": 1024,
+   "n_embd": 768,
+   "n_head": 12,
+   "n_inner": null,
+   "n_layer": 12,
+   "n_positions": 1024,
+   "reorder_and_upcast_attn": true,
+   "resid_pdrop": 0.1,
+   "scale_attn_by_inverse_layer_idx": true,
+   "scale_attn_weights": true,
+   "summary_activation": null,
+   "summary_first_dropout": 0.2,
+   "summary_proj_to_labels": true,
+   "summary_type": "cls_index",
+   "summary_use_proj": true,
+   "task_specific_params": {
+     "text-generation": {
+       "do_sample": true,
+       "max_length": 1024
+     }
+   },
+   "torch_dtype": "float32",
+   "transformers_version": "4.18.0",
+   "use_cache": false,
+   "vocab_size": 21128
+ }
checkpoint-1000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7e3e80f8f1986212c54e3a756ff3314ba4dabcf709be6d5ec55ccea7c4a07359
+ size 816635441
checkpoint-1000/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:992b81662beb246902e95b3b4b06bce3eb0e59186062fdbfd09664204e904f30
+ size 420912233
checkpoint-1000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:073bb4d4e2cdffa2f0d4373aa6e4d7efc3b064cb8395beb8f55fbc1a2d560b58
+ size 14567
checkpoint-1000/scaler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f810fc7b695697c440d8985f6042b4ba23a9e1027604c265718b518ca29f1b2b
+ size 559
checkpoint-1000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0691206f4bd9ca409d6e7104087a4e0eb05df8f8f555a400f6ecc532edba52d8
+ size 623
checkpoint-1000/special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {}
checkpoint-1000/tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"tokenizer_class": "PassthroughTokenizer"}
checkpoint-1000/trainer_state.json ADDED
@@ -0,0 +1,158 @@
+ {
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 27.023333333333333,
+   "global_step": 1000,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "epoch": 0.0,
+       "learning_rate": 4.9999999999999996e-06,
+       "loss": 10.1063,
+       "step": 1
+     },
+     {
+       "epoch": 1.01,
+       "learning_rate": 0.00025,
+       "loss": 6.9252,
+       "step": 50
+     },
+     {
+       "epoch": 2.02,
+       "learning_rate": 0.0005,
+       "loss": 4.9715,
+       "step": 100
+     },
+     {
+       "epoch": 4.0,
+       "learning_rate": 0.0005833333333333333,
+       "loss": 4.3779,
+       "step": 150
+     },
+     {
+       "epoch": 5.02,
+       "learning_rate": 0.0005555555555555556,
+       "loss": 4.0428,
+       "step": 200
+     },
+     {
+       "epoch": 6.03,
+       "learning_rate": 0.0005277777777777777,
+       "loss": 3.925,
+       "step": 250
+     },
+     {
+       "epoch": 8.01,
+       "learning_rate": 0.0005,
+       "loss": 3.8187,
+       "step": 300
+     },
+     {
+       "epoch": 9.02,
+       "learning_rate": 0.00047222222222222224,
+       "loss": 3.6091,
+       "step": 350
+     },
+     {
+       "epoch": 11.0,
+       "learning_rate": 0.00044444444444444436,
+       "loss": 3.4778,
+       "step": 400
+     },
+     {
+       "epoch": 12.02,
+       "learning_rate": 0.00041666666666666664,
+       "loss": 3.3148,
+       "step": 450
+     },
+     {
+       "epoch": 13.03,
+       "learning_rate": 0.00038888888888888887,
+       "loss": 3.2144,
+       "step": 500
+     },
+     {
+       "epoch": 15.01,
+       "learning_rate": 0.0003611111111111111,
+       "loss": 3.1636,
+       "step": 550
+     },
+     {
+       "epoch": 16.02,
+       "learning_rate": 0.0003333333333333333,
+       "loss": 3.0596,
+       "step": 600
+     },
+     {
+       "epoch": 18.0,
+       "learning_rate": 0.00030555555555555555,
+       "loss": 3.0281,
+       "step": 650
+     },
+     {
+       "epoch": 19.01,
+       "learning_rate": 0.0002777777777777778,
+       "loss": 2.9411,
+       "step": 700
+     },
+     {
+       "epoch": 20.02,
+       "learning_rate": 0.00025,
+       "loss": 2.8897,
+       "step": 750
+     },
+     {
+       "epoch": 22.01,
+       "learning_rate": 0.00022222222222222218,
+       "loss": 2.8768,
+       "step": 800
+     },
+     {
+       "epoch": 23.02,
+       "learning_rate": 0.00019444444444444443,
+       "loss": 2.8048,
+       "step": 850
+     },
+     {
+       "epoch": 24.03,
+       "learning_rate": 0.00016666666666666666,
+       "loss": 2.7668,
+       "step": 900
+     },
+     {
+       "epoch": 26.01,
+       "learning_rate": 0.0001388888888888889,
+       "loss": 2.7633,
+       "step": 950
+     },
+     {
+       "epoch": 27.02,
+       "learning_rate": 0.00011111111111111109,
+       "loss": 2.7004,
+       "step": 1000
+     },
+     {
+       "epoch": 27.02,
+       "eval_loss": 3.4081130027770996,
+       "eval_runtime": 2.3193,
+       "eval_samples_per_second": 56.914,
+       "eval_steps_per_second": 3.881,
+       "step": 1000
+     },
+     {
+       "epoch": 27.02,
+       "eval_/scratch/xiulyang/multilingual-LM/training/multilingual_dataset.py_loss": 3.4081130027770996,
+       "eval_/scratch/xiulyang/multilingual-LM/training/multilingual_dataset.py_ppl": 30.208187677268597,
+       "eval_/scratch/xiulyang/multilingual-LM/training/multilingual_dataset.py_runtime": 2.3193,
+       "eval_/scratch/xiulyang/multilingual-LM/training/multilingual_dataset.py_samples_per_second": 56.914,
+       "step": 1000
+     }
+   ],
+   "max_steps": 1200,
+   "num_train_epochs": 9223372036854775807,
+   "total_flos": 2.714432274432e+17,
+   "trial_name": null,
+   "trial_params": null
+ }
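From this checkpoint on, log_history also records an eval pass, including a dataset-specific perplexity metric keyed by the eval script's path. That perplexity is just the exponential of the eval loss, which is easy to verify against the values logged at step 1000:

```python
# The logged ppl is exp(eval_loss); check against the step-1000 record above.
import math

eval_loss = 3.4081130027770996
print(math.exp(eval_loss))  # ~30.2082, matching the logged 30.208187677268597
```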
checkpoint-1000/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b987d18c87bab512217174f78ebb9d5fdbaa7c1ed0879e0503d5e121c6da3324
+ size 3183
checkpoint-1100/config.json ADDED
@@ -0,0 +1,38 @@
+ {
+   "activation_function": "gelu_new",
+   "architectures": [
+     "GPT2LMHeadModel"
+   ],
+   "attn_pdrop": 0.1,
+   "bos_token_id": 50256,
+   "embd_pdrop": 0.1,
+   "eos_token_id": 50256,
+   "initializer_range": 0.02,
+   "layer_norm_epsilon": 1e-05,
+   "model_type": "gpt2",
+   "n_ctx": 1024,
+   "n_embd": 768,
+   "n_head": 12,
+   "n_inner": null,
+   "n_layer": 12,
+   "n_positions": 1024,
+   "reorder_and_upcast_attn": true,
+   "resid_pdrop": 0.1,
+   "scale_attn_by_inverse_layer_idx": true,
+   "scale_attn_weights": true,
+   "summary_activation": null,
+   "summary_first_dropout": 0.2,
+   "summary_proj_to_labels": true,
+   "summary_type": "cls_index",
+   "summary_use_proj": true,
+   "task_specific_params": {
+     "text-generation": {
+       "do_sample": true,
+       "max_length": 1024
+     }
+   },
+   "torch_dtype": "float32",
+   "transformers_version": "4.18.0",
+   "use_cache": false,
+   "vocab_size": 21128
+ }
checkpoint-1100/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:be864f3447e51f3429408ce1a52d87ce512f8c1fcd017925d2ef0dea1d4129d3
+ size 816635441
checkpoint-1100/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f637ac80b4e442386a6aab7aad0302fae9f06b49ecabb43762bcff27346e382e
+ size 420912233
checkpoint-1100/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2c7d341a5b762c20e215d6aed5ce9bd5b1a68f4a7b69498b4e1dae71559aa493
+ size 14567
checkpoint-1100/scaler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9fb16c30b686aa43e110b0d33f9d46bf3127b7124542ca8dc34831233d4675a0
+ size 559
checkpoint-1100/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4f1b7713e4bb40428f29080b7d08d4a52f779ac863737861e4724292b2cf6c59
+ size 623
checkpoint-1100/special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {}
checkpoint-1100/tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"tokenizer_class": "PassthroughTokenizer"}
checkpoint-1100/trainer_state.json ADDED
@@ -0,0 +1,170 @@
+ {
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 30.016666666666666,
+   "global_step": 1100,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "epoch": 0.0,
+       "learning_rate": 4.9999999999999996e-06,
+       "loss": 10.1063,
+       "step": 1
+     },
+     {
+       "epoch": 1.01,
+       "learning_rate": 0.00025,
+       "loss": 6.9252,
+       "step": 50
+     },
+     {
+       "epoch": 2.02,
+       "learning_rate": 0.0005,
+       "loss": 4.9715,
+       "step": 100
+     },
+     {
+       "epoch": 4.0,
+       "learning_rate": 0.0005833333333333333,
+       "loss": 4.3779,
+       "step": 150
+     },
+     {
+       "epoch": 5.02,
+       "learning_rate": 0.0005555555555555556,
+       "loss": 4.0428,
+       "step": 200
+     },
+     {
+       "epoch": 6.03,
+       "learning_rate": 0.0005277777777777777,
+       "loss": 3.925,
+       "step": 250
+     },
+     {
+       "epoch": 8.01,
+       "learning_rate": 0.0005,
+       "loss": 3.8187,
+       "step": 300
+     },
+     {
+       "epoch": 9.02,
+       "learning_rate": 0.00047222222222222224,
+       "loss": 3.6091,
+       "step": 350
+     },
+     {
+       "epoch": 11.0,
+       "learning_rate": 0.00044444444444444436,
+       "loss": 3.4778,
+       "step": 400
+     },
+     {
+       "epoch": 12.02,
+       "learning_rate": 0.00041666666666666664,
+       "loss": 3.3148,
+       "step": 450
+     },
+     {
+       "epoch": 13.03,
+       "learning_rate": 0.00038888888888888887,
+       "loss": 3.2144,
+       "step": 500
+     },
+     {
+       "epoch": 15.01,
+       "learning_rate": 0.0003611111111111111,
+       "loss": 3.1636,
+       "step": 550
+     },
+     {
+       "epoch": 16.02,
+       "learning_rate": 0.0003333333333333333,
+       "loss": 3.0596,
+       "step": 600
+     },
+     {
+       "epoch": 18.0,
+       "learning_rate": 0.00030555555555555555,
+       "loss": 3.0281,
+       "step": 650
+     },
+     {
+       "epoch": 19.01,
+       "learning_rate": 0.0002777777777777778,
+       "loss": 2.9411,
+       "step": 700
+     },
+     {
+       "epoch": 20.02,
+       "learning_rate": 0.00025,
+       "loss": 2.8897,
+       "step": 750
+     },
+     {
+       "epoch": 22.01,
+       "learning_rate": 0.00022222222222222218,
+       "loss": 2.8768,
+       "step": 800
+     },
+     {
+       "epoch": 23.02,
+       "learning_rate": 0.00019444444444444443,
+       "loss": 2.8048,
+       "step": 850
+     },
+     {
+       "epoch": 24.03,
+       "learning_rate": 0.00016666666666666666,
+       "loss": 2.7668,
+       "step": 900
+     },
+     {
+       "epoch": 26.01,
+       "learning_rate": 0.0001388888888888889,
+       "loss": 2.7633,
+       "step": 950
+     },
+     {
+       "epoch": 27.02,
+       "learning_rate": 0.00011111111111111109,
+       "loss": 2.7004,
+       "step": 1000
+     },
+     {
+       "epoch": 27.02,
+       "eval_loss": 3.4081130027770996,
+       "eval_runtime": 2.3193,
+       "eval_samples_per_second": 56.914,
+       "eval_steps_per_second": 3.881,
+       "step": 1000
+     },
+     {
+       "epoch": 27.02,
+       "eval_/scratch/xiulyang/multilingual-LM/training/multilingual_dataset.py_loss": 3.4081130027770996,
+       "eval_/scratch/xiulyang/multilingual-LM/training/multilingual_dataset.py_ppl": 30.208187677268597,
+       "eval_/scratch/xiulyang/multilingual-LM/training/multilingual_dataset.py_runtime": 2.3193,
+       "eval_/scratch/xiulyang/multilingual-LM/training/multilingual_dataset.py_samples_per_second": 56.914,
+       "step": 1000
+     },
+     {
+       "epoch": 29.0,
+       "learning_rate": 8.333333333333333e-05,
+       "loss": 2.6996,
+       "step": 1050
+     },
+     {
+       "epoch": 30.02,
+       "learning_rate": 5.5555555555555545e-05,
+       "loss": 2.6446,
+       "step": 1100
+     }
+   ],
+   "max_steps": 1200,
+   "num_train_epochs": 9223372036854775807,
+   "total_flos": 2.986306633728e+17,
+   "trial_name": null,
+   "trial_params": null
+ }
checkpoint-1100/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b987d18c87bab512217174f78ebb9d5fdbaa7c1ed0879e0503d5e121c6da3324
+ size 3183
checkpoint-1200/config.json ADDED
@@ -0,0 +1,38 @@
+ {
+   "activation_function": "gelu_new",
+   "architectures": [
+     "GPT2LMHeadModel"
+   ],
+   "attn_pdrop": 0.1,
+   "bos_token_id": 50256,
+   "embd_pdrop": 0.1,
+   "eos_token_id": 50256,
+   "initializer_range": 0.02,
+   "layer_norm_epsilon": 1e-05,
+   "model_type": "gpt2",
+   "n_ctx": 1024,
+   "n_embd": 768,
+   "n_head": 12,
+   "n_inner": null,
+   "n_layer": 12,
+   "n_positions": 1024,
+   "reorder_and_upcast_attn": true,
+   "resid_pdrop": 0.1,
+   "scale_attn_by_inverse_layer_idx": true,
+   "scale_attn_weights": true,
+   "summary_activation": null,
+   "summary_first_dropout": 0.2,
+   "summary_proj_to_labels": true,
+   "summary_type": "cls_index",
+   "summary_use_proj": true,
+   "task_specific_params": {
+     "text-generation": {
+       "do_sample": true,
+       "max_length": 1024
+     }
+   },
+   "torch_dtype": "float32",
+   "transformers_version": "4.18.0",
+   "use_cache": false,
+   "vocab_size": 21128
+ }
checkpoint-1200/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b5fb146d56e1997dcad4f1b7b6639ecec91b4de3345f8d5b36cdcdc4cd773b6a
+ size 816635441
checkpoint-1200/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5d50c4be2b63889ee1fe830b16255ee452b2ac2d80aaeab60a755c1adf42a2cf
+ size 420912233
checkpoint-1200/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4370a90bd99585f0d7c30a5291ac27b6d766574cccf1ec17689cbd0f2533b864
+ size 14567
checkpoint-1200/scaler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:19c7277eaca0850ae3e9b6790b3d002d820169cce0671185e672c28c8ae8e056
+ size 559
checkpoint-1200/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:935a8fb09a6e9698d9894853b05e181b3f56098deaaecddde08e55f06bf000c4
+ size 623
checkpoint-1200/special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {}
checkpoint-1200/tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"tokenizer_class": "PassthroughTokenizer"}
checkpoint-1200/trainer_state.json ADDED
@@ -0,0 +1,182 @@
+ {
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 33.01,
+   "global_step": 1200,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "epoch": 0.0,
+       "learning_rate": 4.9999999999999996e-06,
+       "loss": 10.1063,
+       "step": 1
+     },
+     {
+       "epoch": 1.01,
+       "learning_rate": 0.00025,
+       "loss": 6.9252,
+       "step": 50
+     },
+     {
+       "epoch": 2.02,
+       "learning_rate": 0.0005,
+       "loss": 4.9715,
+       "step": 100
+     },
+     {
+       "epoch": 4.0,
+       "learning_rate": 0.0005833333333333333,
+       "loss": 4.3779,
+       "step": 150
+     },
+     {
+       "epoch": 5.02,
+       "learning_rate": 0.0005555555555555556,
+       "loss": 4.0428,
+       "step": 200
+     },
+     {
+       "epoch": 6.03,
+       "learning_rate": 0.0005277777777777777,
+       "loss": 3.925,
+       "step": 250
+     },
+     {
+       "epoch": 8.01,
+       "learning_rate": 0.0005,
+       "loss": 3.8187,
+       "step": 300
+     },
+     {
+       "epoch": 9.02,
+       "learning_rate": 0.00047222222222222224,
+       "loss": 3.6091,
+       "step": 350
+     },
+     {
+       "epoch": 11.0,
+       "learning_rate": 0.00044444444444444436,
+       "loss": 3.4778,
+       "step": 400
+     },
+     {
+       "epoch": 12.02,
+       "learning_rate": 0.00041666666666666664,
+       "loss": 3.3148,
+       "step": 450
+     },
+     {
+       "epoch": 13.03,
+       "learning_rate": 0.00038888888888888887,
+       "loss": 3.2144,
+       "step": 500
+     },
+     {
+       "epoch": 15.01,
+       "learning_rate": 0.0003611111111111111,
+       "loss": 3.1636,
+       "step": 550
+     },
+     {
+       "epoch": 16.02,
+       "learning_rate": 0.0003333333333333333,
+       "loss": 3.0596,
+       "step": 600
+     },
+     {
+       "epoch": 18.0,
+       "learning_rate": 0.00030555555555555555,
+       "loss": 3.0281,
+       "step": 650
+     },
+     {
+       "epoch": 19.01,
+       "learning_rate": 0.0002777777777777778,
+       "loss": 2.9411,
+       "step": 700
+     },
+     {
+       "epoch": 20.02,
+       "learning_rate": 0.00025,
+       "loss": 2.8897,
+       "step": 750
+     },
+     {
+       "epoch": 22.01,
+       "learning_rate": 0.00022222222222222218,
+       "loss": 2.8768,
+       "step": 800
+     },
+     {
+       "epoch": 23.02,
+       "learning_rate": 0.00019444444444444443,
+       "loss": 2.8048,
+       "step": 850
+     },
+     {
+       "epoch": 24.03,
+       "learning_rate": 0.00016666666666666666,
+       "loss": 2.7668,
+       "step": 900
+     },
+     {
+       "epoch": 26.01,
+       "learning_rate": 0.0001388888888888889,
+       "loss": 2.7633,
+       "step": 950
+     },
+     {
+       "epoch": 27.02,
+       "learning_rate": 0.00011111111111111109,
+       "loss": 2.7004,
+       "step": 1000
+     },
+     {
+       "epoch": 27.02,
+       "eval_loss": 3.4081130027770996,
+       "eval_runtime": 2.3193,
+       "eval_samples_per_second": 56.914,
+       "eval_steps_per_second": 3.881,
+       "step": 1000
+     },
+     {
+       "epoch": 27.02,
+       "eval_/scratch/xiulyang/multilingual-LM/training/multilingual_dataset.py_loss": 3.4081130027770996,
+       "eval_/scratch/xiulyang/multilingual-LM/training/multilingual_dataset.py_ppl": 30.208187677268597,
+       "eval_/scratch/xiulyang/multilingual-LM/training/multilingual_dataset.py_runtime": 2.3193,
+       "eval_/scratch/xiulyang/multilingual-LM/training/multilingual_dataset.py_samples_per_second": 56.914,
+       "step": 1000
+     },
+     {
+       "epoch": 29.0,
+       "learning_rate": 8.333333333333333e-05,
+       "loss": 2.6996,
+       "step": 1050
+     },
+     {
+       "epoch": 30.02,
+       "learning_rate": 5.5555555555555545e-05,
+       "loss": 2.6446,
+       "step": 1100
+     },
+     {
+       "epoch": 31.03,
+       "learning_rate": 2.7777777777777772e-05,
+       "loss": 2.6211,
+       "step": 1150
+     },
+     {
+       "epoch": 33.01,
+       "learning_rate": 0.0,
+       "loss": 2.6316,
+       "step": 1200
+     }
+   ],
+   "max_steps": 1200,
+   "num_train_epochs": 9223372036854775807,
+   "total_flos": 3.258180993024e+17,
+   "trial_name": null,
+   "trial_params": null
+ }
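This final state closes the run: the learning rate reaches 0.0 exactly at max_steps 1200, consistent with a short warm-up followed by linear decay. The sibling files optimizer.pt, scheduler.pt, scaler.pt, and rng_state.pth are what let Trainer resume a run exactly. A hedged inspection sketch, assuming a local clone (map_location keeps it CPU-safe):

```python
# Peek inside the auxiliary state files written by transformers' Trainer.
import torch

sched = torch.load("checkpoint-1200/scheduler.pt", map_location="cpu")
opt = torch.load("checkpoint-1200/optimizer.pt", map_location="cpu")

print(sched)              # LR scheduler state dict; last LR is 0.0 here
print(len(opt["state"]))  # one AdamW entry (moments) per trainable tensor
# Resuming would be trainer.train(resume_from_checkpoint="checkpoint-1100")
# with the same Trainer setup that produced this run (not reconstructed here).
```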
checkpoint-1200/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b987d18c87bab512217174f78ebb9d5fdbaa7c1ed0879e0503d5e121c6da3324
+ size 3183
checkpoint-200/config.json ADDED
@@ -0,0 +1,38 @@
+ {
+   "activation_function": "gelu_new",
+   "architectures": [
+     "GPT2LMHeadModel"
+   ],
+   "attn_pdrop": 0.1,
+   "bos_token_id": 50256,
+   "embd_pdrop": 0.1,
+   "eos_token_id": 50256,
+   "initializer_range": 0.02,
+   "layer_norm_epsilon": 1e-05,
+   "model_type": "gpt2",
+   "n_ctx": 1024,
+   "n_embd": 768,
+   "n_head": 12,
+   "n_inner": null,
+   "n_layer": 12,
+   "n_positions": 1024,
+   "reorder_and_upcast_attn": true,
+   "resid_pdrop": 0.1,
+   "scale_attn_by_inverse_layer_idx": true,
+   "scale_attn_weights": true,
+   "summary_activation": null,
+   "summary_first_dropout": 0.2,
+   "summary_proj_to_labels": true,
+   "summary_type": "cls_index",
+   "summary_use_proj": true,
+   "task_specific_params": {
+     "text-generation": {
+       "do_sample": true,
+       "max_length": 1024
+     }
+   },
+   "torch_dtype": "float32",
+   "transformers_version": "4.18.0",
+   "use_cache": false,
+   "vocab_size": 21128
+ }
checkpoint-200/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:596ead937fd4ddd3901963f0e0404e640d687515725187168c4af7c973f0db69
+ size 816635249
checkpoint-200/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9a65b63301ebcc28e6ac4d258ebe45886aea9dc3b2faf81b7df2fa688eb3a3e5
+ size 420912233
checkpoint-200/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d39f44e4b5be7446f136f301364ee8d940bbcc5f80d28b465050f2a5f6747422
+ size 14567