tim-lawson committed on
Commit
b1e55cc
·
verified ·
1 Parent(s): aa5645c

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. config.json +60 -19
  2. train_config.json +61 -1
config.json CHANGED
@@ -1,20 +1,61 @@
1
  {
2
- "architectures": [
3
- "SkipMiddleModel"
4
- ],
5
- "dim": 768,
6
- "ffn_dim_multiplier": 4,
7
- "initializer_range": 0.02,
8
- "max_seq_len": 1024,
9
- "multiple_of": 256,
10
- "n_heads": 12,
11
- "n_kv_heads": 12,
12
- "n_layers": 4,
13
- "norm_eps": 1e-05,
14
- "rope_theta": 10000,
15
- "torch_dtype": "bfloat16",
16
- "transformers_version": "4.51.1",
17
- "use_scaled_rope": false,
18
- "vocab_size": 50257,
19
- "zero_init_masks": true
20
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  {
2
+ "data": {
3
+ "train_files": "data/fineweb_10B_gpt2/fineweb_train_*.bin",
4
+ "val_files": "data/fineweb_10B_gpt2/fineweb_val_*.bin",
5
+ "batch_size": 512,
6
+ "device_batch_size": 32
7
+ },
8
+ "model": {
9
+ "dim": 768,
10
+ "n_layers": 4,
11
+ "n_heads": 12,
12
+ "n_kv_heads": 12,
13
+ "vocab_size": 50257,
14
+ "multiple_of": 256,
15
+ "ffn_dim_multiplier": 4,
16
+ "norm_eps": 1e-05,
17
+ "rope_theta": 10000,
18
+ "use_scaled_rope": false,
19
+ "max_seq_len": 1024,
20
+ "initializer_range": 0.02,
21
+ "zero_init_masks": true
22
+ },
23
+ "optimizer": {
24
+ "default": {
25
+ "lr": 0.001,
26
+ "beta1": 0.8,
27
+ "beta2": 0.95,
28
+ "eps": 1e-10,
29
+ "weight_decay": 0
30
+ },
31
+ "masks": {
32
+ "lr": 0.001,
33
+ "beta1": 0.8,
34
+ "beta2": 0.95,
35
+ "eps": 1e-10,
36
+ "weight_decay": 0
37
+ },
38
+ "norms": {
39
+ "lr": 0.001,
40
+ "beta1": 0.8,
41
+ "beta2": 0.95,
42
+ "eps": 1e-10,
43
+ "weight_decay": 0
44
+ }
45
+ },
46
+ "scheduler": {
47
+ "warmup_steps": 0.1,
48
+ "start_factor": 0.1
49
+ },
50
+ "gates": {},
51
+ "gates_zero_eps": 1e-08,
52
+ "seed": 0,
53
+ "project": "fineweb-baseline",
54
+ "run_id": null,
55
+ "logdir": "logs/fineweb-baseline",
56
+ "log_gradients": false,
57
+ "log_params": false,
58
+ "log_every_steps": 1,
59
+ "val_every_steps": 100,
60
+ "save_every_steps": -1
61
+ }
train_config.json CHANGED
@@ -1 +1,61 @@
1
- {"data": {"train_files": "data/fineweb_10B_gpt2/fineweb_train_*.bin", "val_files": "data/fineweb_10B_gpt2/fineweb_val_*.bin", "batch_size": 512, "device_batch_size": 32}, "model": {"dim": 768, "n_layers": 4, "n_heads": 12, "n_kv_heads": 12, "vocab_size": 50257, "multiple_of": 256, "ffn_dim_multiplier": 4, "norm_eps": 1e-05, "rope_theta": 10000, "use_scaled_rope": false, "max_seq_len": 1024, "initializer_range": 0.02, "zero_init_masks": true}, "optimizer": {"default": {"lr": 0.001, "beta1": 0.8, "beta2": 0.95, "eps": 1e-10, "weight_decay": 0}, "masks": {"lr": 0.001, "beta1": 0.8, "beta2": 0.95, "eps": 1e-10, "weight_decay": 0}, "norms": {"lr": 0.001, "beta1": 0.8, "beta2": 0.95, "eps": 1e-10, "weight_decay": 0}}, "scheduler": {"warmup_steps": 0.1, "start_factor": 0.1}, "gates": {}, "gates_zero_eps": 1e-08, "seed": 0, "project": "fineweb-baseline", "run_id": null, "logdir": "logs/fineweb-baseline", "log_gradients": false, "log_params": false, "log_every_steps": 1, "val_every_steps": 100, "save_every_steps": -1}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "data": {
3
+ "train_files": "data/fineweb_10B_gpt2/fineweb_train_*.bin",
4
+ "val_files": "data/fineweb_10B_gpt2/fineweb_val_*.bin",
5
+ "batch_size": 512,
6
+ "device_batch_size": 32
7
+ },
8
+ "model": {
9
+ "dim": 768,
10
+ "n_layers": 4,
11
+ "n_heads": 12,
12
+ "n_kv_heads": 12,
13
+ "vocab_size": 50257,
14
+ "multiple_of": 256,
15
+ "ffn_dim_multiplier": 4,
16
+ "norm_eps": 1e-05,
17
+ "rope_theta": 10000,
18
+ "use_scaled_rope": false,
19
+ "max_seq_len": 1024,
20
+ "initializer_range": 0.02,
21
+ "zero_init_masks": true
22
+ },
23
+ "optimizer": {
24
+ "default": {
25
+ "lr": 0.001,
26
+ "beta1": 0.8,
27
+ "beta2": 0.95,
28
+ "eps": 1e-10,
29
+ "weight_decay": 0
30
+ },
31
+ "masks": {
32
+ "lr": 0.001,
33
+ "beta1": 0.8,
34
+ "beta2": 0.95,
35
+ "eps": 1e-10,
36
+ "weight_decay": 0
37
+ },
38
+ "norms": {
39
+ "lr": 0.001,
40
+ "beta1": 0.8,
41
+ "beta2": 0.95,
42
+ "eps": 1e-10,
43
+ "weight_decay": 0
44
+ }
45
+ },
46
+ "scheduler": {
47
+ "warmup_steps": 0.1,
48
+ "start_factor": 0.1
49
+ },
50
+ "gates": {},
51
+ "gates_zero_eps": 1e-08,
52
+ "seed": 0,
53
+ "project": "fineweb-baseline",
54
+ "run_id": null,
55
+ "logdir": "logs/fineweb-baseline",
56
+ "log_gradients": false,
57
+ "log_params": false,
58
+ "log_every_steps": 1,
59
+ "val_every_steps": 100,
60
+ "save_every_steps": -1
61
+ }