Joosep Pata committed on
Commit
c0bca33
·
1 Parent(s): baf0aea

update readme

Browse files
clic/clusters/v2.3.0/README.md CHANGED
@@ -9,4 +9,9 @@ pyg-clic_20250209_100514_187330 - transformer + flash attention, 4M events from
9
  pyg-clic_20250130_214007_333962 - transformer + flash attention, full dataset, 10 epochs / ~80 hours, 1st run
10
  pyg-clic_20250306_105311_290722 - transformer + flash attention, full dataset, 10 epochs / ~80 hours, 2nd run
11
  pyg-clic_20250309_173756_957486 - transformer + flash attention, full dataset, 10 epochs / ~80 hours, 3rd run
 
 
 
 
 
12
  ```
 
9
  pyg-clic_20250130_214007_333962 - transformer + flash attention, full dataset, 10 epochs / ~80 hours, 1st run
10
  pyg-clic_20250306_105311_290722 - transformer + flash attention, full dataset, 10 epochs / ~80 hours, 2nd run
11
  pyg-clic_20250309_173756_957486 - transformer + flash attention, full dataset, 10 epochs / ~80 hours, 3rd run
12
+
13
+ #multi-GPU tests
14
+ largebatch_study_gpus4_notscaledLR0.0001_epochs30_bsm256_adamw_a100_cu124_fulldataset_pyg-clic-v230_20250219_055135_172489 - run on 4x GPUs, learning rate not scaled (kept at 0.0001)
15
+ largebatch_study_gpus4_linearscaledLR0.0004_epochs30_bsm256_adamw_a100_cu124_fulldataset_pyg-clic-v230_20250217_082738_406721 - run on 4x GPUs, scale learning rate by 4x
16
+ largebatch_clic_wd3eneg2_gpus4_lr4eneg4_epochs10_pyg-clic-v230_adamw_tunedweightdecay_20250314_085408_738888 - run on 4x GPUs, scale learning rate by 4x, scale weight decay by 3x
17
  ```
clic/clusters/v2.3.0/largebatch_clic_wd3eneg2_gpus4_lr4eneg4_epochs10_pyg-clic-v230_adamw_tunedweightdecay_20250314_085408_738888/.gitattributes ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ plots_checkpoint*/** filter=lfs diff=lfs merge=lfs -text
2
+ preds_checkpoint*/** filter=lfs diff=lfs merge=lfs -text
3
+ runs/** filter=lfs diff=lfs merge=lfs -text
4
+ checkpoints/** filter=lfs diff=lfs merge=lfs -text
clic/clusters/v2.3.0/largebatch_clic_wd3eneg2_gpus4_lr4eneg4_epochs10_pyg-clic-v230_adamw_tunedweightdecay_20250314_085408_738888/hyperparameters.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"num_mlpf_params": 52630547, "checkpoint_freq": 1, "comet": true, "comet_name": "particleflow-pt", "comet_offline": false, "comet_step_freq": 1000, "conv_type": "attention", "data_dir": "/mnt/ceph/users/ewulff/tensorflow_datasets/clic", "dataset": "clic", "dtype": "bfloat16", "enabled_test_datasets": ["clic_edm_qq_pf"], "finetune": null, "gpu_batch_multiplier": 256, "gpus": 4, "load": null, "lr": 0.0004, "optimizer": "adamw", "weight_decay": 0.03, "lr_schedule": "cosinedecay", "lr_schedule_config": {"onecycle": {"pct_start": 0.3}}, "make_plots": null, "model": {"attention": {"activation": "relu", "attention_type": "flash", "conv_type": "attention", "dropout_conv_id_ff": 0.0, "dropout_conv_id_mha": 0.0, "dropout_conv_reg_ff": 0.0, "dropout_conv_reg_mha": 0.0, "dropout_ff": 0.0, "head_dim": 32, "num_convs": 3, "num_heads": 32, "use_pre_layernorm": true}, "cos_phi_mode": "linear", "energy_mode": "direct-elemtype-split", "eta_mode": "linear", "gnn_lsh": {"activation": "elu", "bin_size": 32, "conv_type": "gnn_lsh", "distance_dim": 128, "embedding_dim": 512, "ffn_dist_hidden_dim": 128, "ffn_dist_num_layers": 2, "layernorm": true, "max_num_bins": 200, "num_convs": 8, "num_node_messages": 2, "width": 512}, "input_encoding": "split", "learned_representation_mode": "last", "mamba": {"activation": "elu", "conv_type": "mamba", "d_conv": 4, "d_state": 16, "dropout": 0.0, "embedding_dim": 128, "expand": 2, "num_convs": 2, "num_heads": 2, "width": 128}, "pt_mode": "direct-elemtype-split", "sin_phi_mode": "linear", "trainable": "all"}, "ntest": 2000, "ntrain": null, "num_epochs": 10, "num_workers": 12, "nvalid": null, "patience": 20, "prefetch_factor": 100, "raytune": {"asha": {"brackets": 1, "grace_period": 4, "max_t": 200, "reduction_factor": 4}, "default_metric": "val_loss", "default_mode": "min", "hyperband": {"max_t": 200, "reduction_factor": 4}, "hyperopt": {"n_random_steps": 10}, "local_dir": "/mnt/ceph/users/ewulff/ray_results", "nevergrad": {"n_random_steps": 10}, 
"sched": null, "search_alg": null}, "save_attention": true, "sort_data": false, "test": null, "test_dataset": {"clic_edm_qq_pf": {"splits": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "version": "2.5.0"}, "clic_edm_ttbar_pf": {"splits": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "version": "2.5.0"}, "clic_edm_ww_fullhad_pf": {"splits": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "version": "2.5.0"}}, "train": true, "train_dataset": {"clic": {"physical": {"batch_size": 1, "samples": {"clic_edm_qq_pf": {"splits": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "version": "2.5.0"}, "clic_edm_ttbar_pf": {"splits": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "version": "2.5.0"}, "clic_edm_ww_fullhad_pf": {"splits": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "version": "2.5.0"}}}}}, "val_freq": null, "valid_dataset": {"clic": {"physical": {"batch_size": 1, "samples": {"clic_edm_qq_pf": {"splits": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "version": "2.5.0"}, "clic_edm_ttbar_pf": {"splits": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "version": "2.5.0"}, "clic_edm_ww_fullhad_pf": {"splits": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "version": "2.5.0"}}}}}}
clic/clusters/v2.3.0/largebatch_clic_wd3eneg2_gpus4_lr4eneg4_epochs10_pyg-clic-v230_adamw_tunedweightdecay_20250314_085408_738888/model_kwargs.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08d9fedbafe13195772b70373f3ad6d7dcff24c98624585cc27f59b557bb5ae7
3
+ size 553
clic/clusters/v2.3.0/largebatch_clic_wd3eneg2_gpus4_lr4eneg4_epochs10_pyg-clic-v230_adamw_tunedweightdecay_20250314_085408_738888/overridden_config.yaml ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ checkpoint_freq: 1
2
+ comet: true
3
+ comet_name: particleflow-pt
4
+ comet_offline: false
5
+ comet_step_freq: 1000
6
+ conv_type: attention
7
+ data_dir: /mnt/ceph/users/ewulff/tensorflow_datasets/clic
8
+ dataset: clic
9
+ dtype: bfloat16
10
+ enabled_test_datasets:
11
+ - clic_edm_qq_pf
12
+ finetune: null
13
+ gpu_batch_multiplier: 256
14
+ gpus: 4
15
+ load: null
16
+ lr: 0.0004
17
+ lr_schedule: cosinedecay
18
+ lr_schedule_config:
19
+ onecycle:
20
+ pct_start: 0.3
21
+ make_plots: null
22
+ model:
23
+ attention:
24
+ activation: relu
25
+ attention_type: flash
26
+ conv_type: attention
27
+ dropout_conv_id_ff: 0.0
28
+ dropout_conv_id_mha: 0.0
29
+ dropout_conv_reg_ff: 0.0
30
+ dropout_conv_reg_mha: 0.0
31
+ dropout_ff: 0.0
32
+ head_dim: 32
33
+ num_convs: 3
34
+ num_heads: 32
35
+ use_pre_layernorm: true
36
+ cos_phi_mode: linear
37
+ energy_mode: direct-elemtype-split
38
+ eta_mode: linear
39
+ gnn_lsh:
40
+ activation: elu
41
+ bin_size: 32
42
+ conv_type: gnn_lsh
43
+ distance_dim: 128
44
+ embedding_dim: 512
45
+ ffn_dist_hidden_dim: 128
46
+ ffn_dist_num_layers: 2
47
+ layernorm: true
48
+ max_num_bins: 200
49
+ num_convs: 8
50
+ num_node_messages: 2
51
+ width: 512
52
+ input_encoding: split
53
+ learned_representation_mode: last
54
+ mamba:
55
+ activation: elu
56
+ conv_type: mamba
57
+ d_conv: 4
58
+ d_state: 16
59
+ dropout: 0.0
60
+ embedding_dim: 128
61
+ expand: 2
62
+ num_convs: 2
63
+ num_heads: 2
64
+ width: 128
65
+ pt_mode: direct-elemtype-split
66
+ sin_phi_mode: linear
67
+ trainable: all
68
+ ntest: 2000
69
+ ntrain: null
70
+ num_epochs: 10
71
+ num_workers: 12
72
+ nvalid: null
73
+ optimizer: adamw
74
+ patience: 20
75
+ prefetch_factor: 100
76
+ raytune:
77
+ asha:
78
+ brackets: 1
79
+ grace_period: 4
80
+ max_t: 200
81
+ reduction_factor: 4
82
+ default_metric: val_loss
83
+ default_mode: min
84
+ hyperband:
85
+ max_t: 200
86
+ reduction_factor: 4
87
+ hyperopt:
88
+ n_random_steps: 10
89
+ local_dir: /mnt/ceph/users/ewulff/ray_results
90
+ nevergrad:
91
+ n_random_steps: 10
92
+ sched: null
93
+ search_alg: null
94
+ save_attention: true
95
+ sort_data: false
96
+ test: null
97
+ test_dataset:
98
+ clic_edm_qq_pf:
99
+ splits:
100
+ - 1
101
+ - 2
102
+ - 3
103
+ - 4
104
+ - 5
105
+ - 6
106
+ - 7
107
+ - 8
108
+ - 9
109
+ - 10
110
+ version: 2.5.0
111
+ clic_edm_ttbar_pf:
112
+ splits:
113
+ - 1
114
+ - 2
115
+ - 3
116
+ - 4
117
+ - 5
118
+ - 6
119
+ - 7
120
+ - 8
121
+ - 9
122
+ - 10
123
+ version: 2.5.0
124
+ clic_edm_ww_fullhad_pf:
125
+ splits:
126
+ - 1
127
+ - 2
128
+ - 3
129
+ - 4
130
+ - 5
131
+ - 6
132
+ - 7
133
+ - 8
134
+ - 9
135
+ - 10
136
+ version: 2.5.0
137
+ train: true
138
+ train_dataset:
139
+ clic:
140
+ physical:
141
+ batch_size: 1
142
+ samples:
143
+ clic_edm_qq_pf:
144
+ splits:
145
+ - 1
146
+ - 2
147
+ - 3
148
+ - 4
149
+ - 5
150
+ - 6
151
+ - 7
152
+ - 8
153
+ - 9
154
+ - 10
155
+ version: 2.5.0
156
+ clic_edm_ttbar_pf:
157
+ splits:
158
+ - 1
159
+ - 2
160
+ - 3
161
+ - 4
162
+ - 5
163
+ - 6
164
+ - 7
165
+ - 8
166
+ - 9
167
+ - 10
168
+ version: 2.5.0
169
+ clic_edm_ww_fullhad_pf:
170
+ splits:
171
+ - 1
172
+ - 2
173
+ - 3
174
+ - 4
175
+ - 5
176
+ - 6
177
+ - 7
178
+ - 8
179
+ - 9
180
+ - 10
181
+ version: 2.5.0
182
+ val_freq: null
183
+ valid_dataset:
184
+ clic:
185
+ physical:
186
+ batch_size: 1
187
+ samples:
188
+ clic_edm_qq_pf:
189
+ splits:
190
+ - 1
191
+ - 2
192
+ - 3
193
+ - 4
194
+ - 5
195
+ - 6
196
+ - 7
197
+ - 8
198
+ - 9
199
+ - 10
200
+ version: 2.5.0
201
+ clic_edm_ttbar_pf:
202
+ splits:
203
+ - 1
204
+ - 2
205
+ - 3
206
+ - 4
207
+ - 5
208
+ - 6
209
+ - 7
210
+ - 8
211
+ - 9
212
+ - 10
213
+ version: 2.5.0
214
+ clic_edm_ww_fullhad_pf:
215
+ splits:
216
+ - 1
217
+ - 2
218
+ - 3
219
+ - 4
220
+ - 5
221
+ - 6
222
+ - 7
223
+ - 8
224
+ - 9
225
+ - 10
226
+ version: 2.5.0
227
+ weight_decay: 0.03
clic/clusters/v2.3.0/largebatch_clic_wd3eneg2_gpus4_lr4eneg4_epochs10_pyg-clic-v230_adamw_tunedweightdecay_20250314_085408_738888/runs/train/events.out.tfevents.1741957152.workergpu072.934288.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18f9ea316e72db389ce5a1a6bd076330c12189182a3d9de30699ccb472188ca8
3
+ size 52833
clic/clusters/v2.3.0/largebatch_clic_wd3eneg2_gpus4_lr4eneg4_epochs10_pyg-clic-v230_adamw_tunedweightdecay_20250314_085408_738888/runs/valid/events.out.tfevents.1741957152.workergpu072.934288.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15eac2fffca0b2c3b3c51d82c685ff3d2e41d521b9e4bed220b30e5cac13f451
3
+ size 11978655
clic/clusters/v2.3.0/largebatch_clic_wd3eneg2_gpus4_lr4eneg4_epochs10_pyg-clic-v230_adamw_tunedweightdecay_20250314_085408_738888/train-config.yaml ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ checkpoint_freq: 1
2
+ comet: true
3
+ comet_name: particleflow-pt
4
+ comet_offline: false
5
+ comet_step_freq: 1000
6
+ conv_type: attention
7
+ data_dir: /mnt/ceph/users/ewulff/tensorflow_datasets/clic
8
+ dataset: clic
9
+ dtype: bfloat16
10
+ enabled_test_datasets:
11
+ - clic_edm_qq_pf
12
+ finetune: null
13
+ gpu_batch_multiplier: 256
14
+ gpus: 4
15
+ load: null
16
+ lr: 0.0004
17
+ lr_schedule: cosinedecay
18
+ lr_schedule_config:
19
+ onecycle:
20
+ pct_start: 0.3
21
+ make_plots: null
22
+ model:
23
+ attention:
24
+ activation: relu
25
+ attention_type: flash
26
+ conv_type: attention
27
+ dropout_conv_id_ff: 0.0
28
+ dropout_conv_id_mha: 0.0
29
+ dropout_conv_reg_ff: 0.0
30
+ dropout_conv_reg_mha: 0.0
31
+ dropout_ff: 0.0
32
+ head_dim: 32
33
+ num_convs: 3
34
+ num_heads: 32
35
+ use_pre_layernorm: true
36
+ cos_phi_mode: linear
37
+ energy_mode: direct-elemtype-split
38
+ eta_mode: linear
39
+ gnn_lsh:
40
+ activation: elu
41
+ bin_size: 32
42
+ conv_type: gnn_lsh
43
+ distance_dim: 128
44
+ embedding_dim: 512
45
+ ffn_dist_hidden_dim: 128
46
+ ffn_dist_num_layers: 2
47
+ layernorm: true
48
+ max_num_bins: 200
49
+ num_convs: 8
50
+ num_node_messages: 2
51
+ width: 512
52
+ input_encoding: split
53
+ learned_representation_mode: last
54
+ mamba:
55
+ activation: elu
56
+ conv_type: mamba
57
+ d_conv: 4
58
+ d_state: 16
59
+ dropout: 0.0
60
+ embedding_dim: 128
61
+ expand: 2
62
+ num_convs: 2
63
+ num_heads: 2
64
+ width: 128
65
+ pt_mode: direct-elemtype-split
66
+ sin_phi_mode: linear
67
+ trainable: all
68
+ ntest: 2000
69
+ ntrain: null
70
+ num_epochs: 10
71
+ num_workers: 12
72
+ nvalid: null
73
+ optimizer: adamw
74
+ patience: 20
75
+ prefetch_factor: 100
76
+ raytune:
77
+ asha:
78
+ brackets: 1
79
+ grace_period: 4
80
+ max_t: 200
81
+ reduction_factor: 4
82
+ default_metric: val_loss
83
+ default_mode: min
84
+ hyperband:
85
+ max_t: 200
86
+ reduction_factor: 4
87
+ hyperopt:
88
+ n_random_steps: 10
89
+ local_dir: /mnt/ceph/users/ewulff/ray_results
90
+ nevergrad:
91
+ n_random_steps: 10
92
+ sched: null
93
+ search_alg: null
94
+ save_attention: true
95
+ sort_data: false
96
+ test: null
97
+ test_dataset:
98
+ clic_edm_qq_pf:
99
+ splits:
100
+ - 1
101
+ - 2
102
+ - 3
103
+ - 4
104
+ - 5
105
+ - 6
106
+ - 7
107
+ - 8
108
+ - 9
109
+ - 10
110
+ version: 2.5.0
111
+ clic_edm_ttbar_pf:
112
+ splits:
113
+ - 1
114
+ - 2
115
+ - 3
116
+ - 4
117
+ - 5
118
+ - 6
119
+ - 7
120
+ - 8
121
+ - 9
122
+ - 10
123
+ version: 2.5.0
124
+ clic_edm_ww_fullhad_pf:
125
+ splits:
126
+ - 1
127
+ - 2
128
+ - 3
129
+ - 4
130
+ - 5
131
+ - 6
132
+ - 7
133
+ - 8
134
+ - 9
135
+ - 10
136
+ version: 2.5.0
137
+ train: true
138
+ train_dataset:
139
+ clic:
140
+ physical:
141
+ batch_size: 1
142
+ samples:
143
+ clic_edm_qq_pf:
144
+ splits:
145
+ - 1
146
+ - 2
147
+ - 3
148
+ - 4
149
+ - 5
150
+ - 6
151
+ - 7
152
+ - 8
153
+ - 9
154
+ - 10
155
+ version: 2.5.0
156
+ clic_edm_ttbar_pf:
157
+ splits:
158
+ - 1
159
+ - 2
160
+ - 3
161
+ - 4
162
+ - 5
163
+ - 6
164
+ - 7
165
+ - 8
166
+ - 9
167
+ - 10
168
+ version: 2.5.0
169
+ clic_edm_ww_fullhad_pf:
170
+ splits:
171
+ - 1
172
+ - 2
173
+ - 3
174
+ - 4
175
+ - 5
176
+ - 6
177
+ - 7
178
+ - 8
179
+ - 9
180
+ - 10
181
+ version: 2.5.0
182
+ val_freq: null
183
+ valid_dataset:
184
+ clic:
185
+ physical:
186
+ batch_size: 1
187
+ samples:
188
+ clic_edm_qq_pf:
189
+ splits:
190
+ - 1
191
+ - 2
192
+ - 3
193
+ - 4
194
+ - 5
195
+ - 6
196
+ - 7
197
+ - 8
198
+ - 9
199
+ - 10
200
+ version: 2.5.0
201
+ clic_edm_ttbar_pf:
202
+ splits:
203
+ - 1
204
+ - 2
205
+ - 3
206
+ - 4
207
+ - 5
208
+ - 6
209
+ - 7
210
+ - 8
211
+ - 9
212
+ - 10
213
+ version: 2.5.0
214
+ clic_edm_ww_fullhad_pf:
215
+ splits:
216
+ - 1
217
+ - 2
218
+ - 3
219
+ - 4
220
+ - 5
221
+ - 6
222
+ - 7
223
+ - 8
224
+ - 9
225
+ - 10
226
+ version: 2.5.0
227
+ weight_decay: 0.03