Commit a481f88 (verified) · Parent: 21efdf9
pietrolesci committed: Upload folder using huggingface_hub
README.md ADDED
@@ -0,0 +1,110 @@
## Experiment Configuration
```yaml
callbacks:
  grad_norm:
    _target_: primer.callbacks.grad_norm.GradNorm
    check_clipping: false
    group_separator: /
    histogram_freq: null
    log_weight_distribution: false
    norm_type: 2
    only_total: true
  lr_monitor:
    _target_: primer.callbacks.lr_monitor.SimpleLearningRateMonitor
  model_checkpoint:
    _target_: primer.callbacks.model_checkpoint.ModelCheckpoint
    dirpath: .checkpoints
    enable_version_counter: false
    every_n_train_steps: 2000
    filename: '{step}'
    save_initial_checkpoint: true
    save_last: link
    save_top_k: -1
    verbose: true
  speed_monitor:
    _target_: primer.callbacks.speed_monitor.SpeedMonitor
data:
  batch_size: 64
  drop_last: true
  eval_batch_size: 64
  intra_doc_causal_mask: true
  multiprocessing_context: null
  num_workers: 8
  persistent_workers: false
  pin_memory: true
  prefetch_factor: 2
  shuffle_seed: 42
loggers:
  tensorboard:
    _target_: primer.trainer.TensorBoardLogger
    name: ''
    save_dir: ./
    version: null
model:
  attention_bias: false
  attention_dropout: 0.0
  head_dim: 128
  hidden_act: silu
  hidden_size: 768
  initializer_range: 0.02
  intermediate_size: 2048
  mlp_bias: false
  model_type: llama
  name: small
  num_attention_heads: 6
  num_hidden_layers: 6
  num_key_value_heads: 6
  pretraining_tp: 1
  rms_norm_eps: 1.0e-05
  rope_scaling: null
  rope_theta: 10000.0
  tie_word_embeddings: true
optim:
  grad_acc_schedule:
    0: 2
  lr: 0.0006
  num_warmup_steps: 2000
  optim_kwargs:
    betas:
    - 0.9
    - 0.95
    capturable: true
    eps: 1.0e-08
    fused: true
  optim_name: adamw
  scheduler_kwargs:
    min_lr_ratio: 0.0
    num_decay_steps: 4000
  scheduler_name: warmup_stable_decay
  set_grad_to_none: true
  weight_decay: 0.01
  weight_decay_embedding: false
  zloss_factor: null
out_parent_folder: model_train
pwd: ./unimixlm
resume_from_checkpoint: .checkpoints/last.ckpt
run_folder: small_tokmix128k__2025-07-24T17-27-55
save_initial_checkpoint: true
seed: 42
tok_name: tokmix128k
tok_path: ./unimixlm/tokenizers/tokmix128k
tok_subfolder: null
torch_compile: true
train_data_path: ./unimixlm/data/tokmix128k/train
trainer:
  accelerator: gpu
  deterministic: false
  devices: 1
  enable_progress_bar: true
  fast_dev_run: false
  gradient_clip_algorithm: norm
  gradient_clip_val: 1.0
  limit_train_batches: null
  limit_val_batches: 500
  log_every_n_steps: 1
  max_steps: 50000
  precision: bf16-true
  val_check_interval: 2000
use_liger: true
val_data_path: ./unimixlm/data/tokmix128k/validation
```
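The `optim` block above pairs AdamW with a warmup-stable-decay schedule: 2,000 linear warmup steps, a flat plateau at the base `lr` of 6e-4, then a 4,000-step decay to `min_lr_ratio: 0.0` ending at `max_steps: 50000`. A minimal sketch of the implied multiplier, assuming linear warmup and linear decay (primer's exact scheduler implementation is not part of this commit):

```python
# Sketch of the LR multiplier implied by num_warmup_steps=2000,
# num_decay_steps=4000, min_lr_ratio=0.0, and max_steps=50000.
# Assumes linear warmup and linear decay; primer's exact shape may differ.
def wsd_multiplier(
    step: int,
    num_warmup_steps: int = 2000,
    num_decay_steps: int = 4000,
    max_steps: int = 50_000,
    min_lr_ratio: float = 0.0,
) -> float:
    if step < num_warmup_steps:  # warmup: 0 -> 1
        return step / max(1, num_warmup_steps)
    decay_start = max_steps - num_decay_steps
    if step < decay_start:  # stable plateau at 1
        return 1.0
    # decay: 1 -> min_lr_ratio over the final num_decay_steps
    frac = min(1.0, (step - decay_start) / max(1, num_decay_steps))
    return max(min_lr_ratio, 1.0 - (1.0 - min_lr_ratio) * frac)

# e.g. wrap in torch.optim.lr_scheduler.LambdaLR around the base lr of 6e-4
```

Under these numbers the peak learning rate holds from step 2,000 to step 46,000 before the final anneal to zero.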
hparams.yaml ADDED
@@ -0,0 +1,107 @@
loggers:
  tensorboard:
    _target_: primer.trainer.TensorBoardLogger
    save_dir: ./
    name: ''
    version: null
callbacks:
  lr_monitor:
    _target_: primer.callbacks.lr_monitor.SimpleLearningRateMonitor
  grad_norm:
    _target_: primer.callbacks.grad_norm.GradNorm
    norm_type: 2
    group_separator: /
    histogram_freq: null
    check_clipping: false
    log_weight_distribution: false
    only_total: true
  speed_monitor:
    _target_: primer.callbacks.speed_monitor.SpeedMonitor
  model_checkpoint:
    _target_: primer.callbacks.model_checkpoint.ModelCheckpoint
    dirpath: .checkpoints
    filename: '{step}'
    enable_version_counter: false
    every_n_train_steps: 2000
    save_top_k: -1
    save_last: link
    verbose: true
    save_initial_checkpoint: true
model:
  name: small
  model_type: llama
  head_dim: 128
  hidden_size: 768
  hidden_act: silu
  intermediate_size: 2048
  initializer_range: 0.02
  num_hidden_layers: 6
  num_attention_heads: 6
  num_key_value_heads: 6
  rms_norm_eps: 1.0e-05
  tie_word_embeddings: true
  rope_theta: 10000.0
  rope_scaling: null
  attention_bias: false
  mlp_bias: false
  attention_dropout: 0.0
  pretraining_tp: 1
pwd: /home/pl487/unimixlm
out_parent_folder: model_train
run_folder: small_tokmix128k__2025-07-24T17-27-55
tok_path: /home/pl487/unimixlm/tokenizers/tokmix128k
tok_subfolder: null
train_data_path: /home/pl487/unimixlm/data/tokmix128k/train
val_data_path: /home/pl487/unimixlm/data/tokmix128k/validation
resume_from_checkpoint: .checkpoints/last.ckpt
save_initial_checkpoint: true
seed: 42
torch_compile: true
use_liger: true
data:
  batch_size: 64
  eval_batch_size: 64
  shuffle_seed: 42
  drop_last: true
  num_workers: 8
  pin_memory: true
  persistent_workers: false
  prefetch_factor: 2
  multiprocessing_context: null
  intra_doc_causal_mask: true
optim:
  optim_name: adamw
  lr: 0.0006
  grad_acc_schedule:
    0: 2
  zloss_factor: null
  weight_decay: 0.01
  optim_kwargs:
    fused: true
    eps: 1.0e-08
    betas:
    - 0.9
    - 0.95
    capturable: true
  scheduler_name: warmup_stable_decay
  num_warmup_steps: 2000
  scheduler_kwargs:
    num_decay_steps: 4000
    min_lr_ratio: 0.0
  weight_decay_embedding: false
  set_grad_to_none: true
trainer:
  accelerator: gpu
  devices: 1
  precision: bf16-true
  deterministic: false
  log_every_n_steps: 1
  enable_progress_bar: true
  fast_dev_run: false
  gradient_clip_val: 1.0
  gradient_clip_algorithm: norm
  val_check_interval: 2000
  max_steps: 50000
  limit_val_batches: 500
  limit_train_batches: null
tok_name: tokmix128k
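For orientation, the token budget implied by these hyperparameters: `batch_size: 64` with a gradient-accumulation factor of 2 (`grad_acc_schedule: {0: 2}`) gives an effective batch of 128 sequences per optimizer step, and at `seq_len: 2048` (see `version_0/hparams.yaml` below) that is 262,144 tokens per step, about 13.1B tokens over the 50,000-step run. A quick sketch of the arithmetic, assuming the accumulation factor stays at 2 throughout and every sequence is packed to full length:

```python
# Token-budget arithmetic from the values in hparams.yaml; assumes
# grad_acc_schedule {0: 2} means a constant factor of 2 from step 0
# and that each sequence is fully packed to 2048 tokens.
batch_size = 64       # data.batch_size
grad_acc = 2          # optim.grad_acc_schedule[0]
seq_len = 2048        # seq_len in version_0/hparams.yaml
max_steps = 50_000    # trainer.max_steps

tokens_per_step = batch_size * grad_acc * seq_len  # 262,144
total_tokens = tokens_per_step * max_steps         # 13,107,200,000 (~13.1B)
print(f"{tokens_per_step:,} tokens/step, {total_tokens:,} total")
```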
tb_logs.parquet ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:41970db3b6a8d5d97551b5de981afe130fd222f4e91efe838ab371ba5c277dd5
size 2393529
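`tb_logs.parquet` is stored as a Git LFS pointer; the actual ~2.4 MB Parquet file lives on the Hub. A minimal sketch for pulling and inspecting it (the `repo_id` is an assumption, not stated in the commit; substitute the repository this commit belongs to):

```python
from huggingface_hub import hf_hub_download
import pandas as pd

# repo_id is hypothetical -- replace with the actual Hub repository.
path = hf_hub_download(
    repo_id="pietrolesci/unimixlm",
    filename="tb_logs.parquet",
)
df = pd.read_parquet(path)  # the LFS pointer resolves to the real file
print(df.head())
```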
version_0/events.out.tfevents.1753374478.dev-gpu-pl487.1896758.0 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:eee2f849a1e61d1ee9141156164fdd1d5df138e026523698d5a73a8c6245056b
size 30575909
version_0/events.out.tfevents.1753457634.dev-gpu-pl487.1896758.1 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ad2461c3692276aeea280cb312220735e8a954756be63612989eea1ea986e0c6
size 14907
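The two raw `.tfevents` files (also LFS pointers) can be read directly once downloaded into a local `version_0/` directory, using TensorBoard's own reader. A sketch; the scalar tag names depend on what primer's callbacks log, so treat them as unknowns:

```python
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

acc = EventAccumulator("version_0")  # directory holding the .tfevents files
acc.Reload()
for tag in acc.Tags()["scalars"]:
    events = acc.Scalars(tag)
    print(tag, len(events), events[-1].value if events else None)
```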
version_0/hparams.yaml ADDED
@@ -0,0 +1,65 @@
config:
  vocab_size: 128000
  bos_token_id: 0
  eos_token_id: 1
  pad_token_id: 2
  torch_dtype: bfloat16
  use_cache: false
  max_position_embeddings: 2048
  _attn_implementation: flash_attention_2
  name: small
  model_type: llama
  head_dim: 128
  hidden_size: 768
  hidden_act: silu
  intermediate_size: 2048
  initializer_range: 0.02
  num_hidden_layers: 6
  num_attention_heads: 6
  num_key_value_heads: 6
  rms_norm_eps: 1.0e-05
  tie_word_embeddings: true
  rope_theta: 10000.0
  rope_scaling: null
  attention_bias: false
  mlp_bias: false
  attention_dropout: 0.0
  pretraining_tp: 1
optim_config:
  optim_name: adamw
  lr: 0.0006
  weight_decay: 0.01
  weight_decay_embedding: false
  set_grad_to_none: true
  optim_kwargs:
    fused: true
    eps: 1.0e-08
    betas:
    - 0.9
    - 0.95
    capturable: true
  scheduler_name: warmup_stable_decay
  num_warmup_steps: 2000
  scheduler_kwargs:
    num_decay_steps: 4000
    min_lr_ratio: 0.0
  grad_acc_schedule:
    0: 2
  zloss_factor: null
use_torch_compile: true
use_liger: true
train_data_path: /home/pl487/unimixlm/data/tokmix128k/train
val_data_path: /home/pl487/unimixlm/data/tokmix128k/validation
seq_len: 2048
eos_token_id: 1
dataloader_config:
  batch_size: 64
  eval_batch_size: 64
  shuffle_seed: 42
  intra_doc_causal_mask: true
  num_workers: 8
  pin_memory: true
  drop_last: true
  persistent_workers: false
  multiprocessing_context: null
  prefetch_factor: 2
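The `config` block mirrors Hugging Face's `LlamaConfig` fields, so the "small" architecture can be rebuilt for inspection independently of primer. A sketch with values copied from above (assumes a recent `transformers` release that accepts `head_dim`; `torch_dtype` and `_attn_implementation` are omitted for simplicity):

```python
from transformers import LlamaConfig, LlamaForCausalLM

# Values taken from version_0/hparams.yaml's `config` block.
config = LlamaConfig(
    vocab_size=128_000,
    hidden_size=768,
    intermediate_size=2048,
    hidden_act="silu",
    num_hidden_layers=6,
    num_attention_heads=6,
    num_key_value_heads=6,
    head_dim=128,
    max_position_embeddings=2048,
    rms_norm_eps=1e-5,
    rope_theta=10_000.0,
    tie_word_embeddings=True,
    attention_bias=False,
    mlp_bias=False,
    attention_dropout=0.0,
    bos_token_id=0,
    eos_token_id=1,
    pad_token_id=2,
)
model = LlamaForCausalLM(config)
print(sum(p.numel() for p in model.parameters()))
```

This works out to roughly 141M parameters, most of them in the tied 128k x 768 embedding matrix; the exact count may vary slightly with how your `transformers` version handles `head_dim`.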