update readme

Files changed (8) hide show

clic/clusters/v2.3.0/README.md +5 -0
clic/clusters/v2.3.0/largebatch_clic_wd3eneg2_gpus4_lr4eneg4_epochs10_pyg-clic-v230_adamw_tunedweightdecay_20250314_085408_738888/.gitattributes +4 -0
clic/clusters/v2.3.0/largebatch_clic_wd3eneg2_gpus4_lr4eneg4_epochs10_pyg-clic-v230_adamw_tunedweightdecay_20250314_085408_738888/hyperparameters.json +1 -0
clic/clusters/v2.3.0/largebatch_clic_wd3eneg2_gpus4_lr4eneg4_epochs10_pyg-clic-v230_adamw_tunedweightdecay_20250314_085408_738888/model_kwargs.pkl +3 -0
clic/clusters/v2.3.0/largebatch_clic_wd3eneg2_gpus4_lr4eneg4_epochs10_pyg-clic-v230_adamw_tunedweightdecay_20250314_085408_738888/overridden_config.yaml +227 -0
clic/clusters/v2.3.0/largebatch_clic_wd3eneg2_gpus4_lr4eneg4_epochs10_pyg-clic-v230_adamw_tunedweightdecay_20250314_085408_738888/runs/train/events.out.tfevents.1741957152.workergpu072.934288.0 +3 -0
clic/clusters/v2.3.0/largebatch_clic_wd3eneg2_gpus4_lr4eneg4_epochs10_pyg-clic-v230_adamw_tunedweightdecay_20250314_085408_738888/runs/valid/events.out.tfevents.1741957152.workergpu072.934288.1 +3 -0
clic/clusters/v2.3.0/largebatch_clic_wd3eneg2_gpus4_lr4eneg4_epochs10_pyg-clic-v230_adamw_tunedweightdecay_20250314_085408_738888/train-config.yaml +227 -0

clic/clusters/v2.3.0/README.md CHANGED Viewed

@@ -9,4 +9,9 @@ pyg-clic_20250209_100514_187330 - transformer + flash attention, 4M events from
 pyg-clic_20250130_214007_333962 - transformer + flash attention, full dataset, 10 epochs / ~80 hours, 1st run
 pyg-clic_20250306_105311_290722 - transformer + flash attention, full dataset, 10 epochs / ~80 hours, 2nd run
 pyg-clic_20250309_173756_957486 - transformer + flash attention, full dataset, 10 epochs / ~80 hours, 3rd run
 ```

 pyg-clic_20250130_214007_333962 - transformer + flash attention, full dataset, 10 epochs / ~80 hours, 1st run
 pyg-clic_20250306_105311_290722 - transformer + flash attention, full dataset, 10 epochs / ~80 hours, 2nd run
 pyg-clic_20250309_173756_957486 - transformer + flash attention, full dataset, 10 epochs / ~80 hours, 3rd run
+# multi-GPU tests
+largebatch_study_gpus4_notscaledLR0.0001_epochs30_bsm256_adamw_a100_cu124_fulldataset_pyg-clic-v230_20250219_055135_172489 - run on 4x GPUs, learning rate not scaled (kept at 1e-4)
+largebatch_study_gpus4_linearscaledLR0.0004_epochs30_bsm256_adamw_a100_cu124_fulldataset_pyg-clic-v230_20250217_082738_406721 - run on 4x GPUs, learning rate scaled linearly by 4x (to 4e-4)
+largebatch_clic_wd3eneg2_gpus4_lr4eneg4_epochs10_pyg-clic-v230_adamw_tunedweightdecay_20250314_085408_738888 - run on 4x GPUs for 10 epochs, learning rate scaled by 4x (to 4e-4), weight decay scaled by 3x (to 0.03)
 ```

clic/clusters/v2.3.0/largebatch_clic_wd3eneg2_gpus4_lr4eneg4_epochs10_pyg-clic-v230_adamw_tunedweightdecay_20250314_085408_738888/.gitattributes ADDED Viewed

	@@ -0,0 +1,4 @@

+plots_checkpoint*/** filter=lfs diff=lfs merge=lfs -text
+preds_checkpoint*/** filter=lfs diff=lfs merge=lfs -text
+runs/** filter=lfs diff=lfs merge=lfs -text
+checkpoints/** filter=lfs diff=lfs merge=lfs -text

clic/clusters/v2.3.0/largebatch_clic_wd3eneg2_gpus4_lr4eneg4_epochs10_pyg-clic-v230_adamw_tunedweightdecay_20250314_085408_738888/hyperparameters.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"num_mlpf_params": 52630547, "checkpoint_freq": 1, "comet": true, "comet_name": "particleflow-pt", "comet_offline": false, "comet_step_freq": 1000, "conv_type": "attention", "data_dir": "/mnt/ceph/users/ewulff/tensorflow_datasets/clic", "dataset": "clic", "dtype": "bfloat16", "enabled_test_datasets": ["clic_edm_qq_pf"], "finetune": null, "gpu_batch_multiplier": 256, "gpus": 4, "load": null, "lr": 0.0004, "optimizer": "adamw", "weight_decay": 0.03, "lr_schedule": "cosinedecay", "lr_schedule_config": {"onecycle": {"pct_start": 0.3}}, "make_plots": null, "model": {"attention": {"activation": "relu", "attention_type": "flash", "conv_type": "attention", "dropout_conv_id_ff": 0.0, "dropout_conv_id_mha": 0.0, "dropout_conv_reg_ff": 0.0, "dropout_conv_reg_mha": 0.0, "dropout_ff": 0.0, "head_dim": 32, "num_convs": 3, "num_heads": 32, "use_pre_layernorm": true}, "cos_phi_mode": "linear", "energy_mode": "direct-elemtype-split", "eta_mode": "linear", "gnn_lsh": {"activation": "elu", "bin_size": 32, "conv_type": "gnn_lsh", "distance_dim": 128, "embedding_dim": 512, "ffn_dist_hidden_dim": 128, "ffn_dist_num_layers": 2, "layernorm": true, "max_num_bins": 200, "num_convs": 8, "num_node_messages": 2, "width": 512}, "input_encoding": "split", "learned_representation_mode": "last", "mamba": {"activation": "elu", "conv_type": "mamba", "d_conv": 4, "d_state": 16, "dropout": 0.0, "embedding_dim": 128, "expand": 2, "num_convs": 2, "num_heads": 2, "width": 128}, "pt_mode": "direct-elemtype-split", "sin_phi_mode": "linear", "trainable": "all"}, "ntest": 2000, "ntrain": null, "num_epochs": 10, "num_workers": 12, "nvalid": null, "patience": 20, "prefetch_factor": 100, "raytune": {"asha": {"brackets": 1, "grace_period": 4, "max_t": 200, "reduction_factor": 4}, "default_metric": "val_loss", "default_mode": "min", "hyperband": {"max_t": 200, "reduction_factor": 4}, "hyperopt": {"n_random_steps": 10}, "local_dir": "/mnt/ceph/users/ewulff/ray_results", "nevergrad": {"n_random_steps": 10}, 
"sched": null, "search_alg": null}, "save_attention": true, "sort_data": false, "test": null, "test_dataset": {"clic_edm_qq_pf": {"splits": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "version": "2.5.0"}, "clic_edm_ttbar_pf": {"splits": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "version": "2.5.0"}, "clic_edm_ww_fullhad_pf": {"splits": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "version": "2.5.0"}}, "train": true, "train_dataset": {"clic": {"physical": {"batch_size": 1, "samples": {"clic_edm_qq_pf": {"splits": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "version": "2.5.0"}, "clic_edm_ttbar_pf": {"splits": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "version": "2.5.0"}, "clic_edm_ww_fullhad_pf": {"splits": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "version": "2.5.0"}}}}}, "val_freq": null, "valid_dataset": {"clic": {"physical": {"batch_size": 1, "samples": {"clic_edm_qq_pf": {"splits": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "version": "2.5.0"}, "clic_edm_ttbar_pf": {"splits": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "version": "2.5.0"}, "clic_edm_ww_fullhad_pf": {"splits": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "version": "2.5.0"}}}}}}

clic/clusters/v2.3.0/largebatch_clic_wd3eneg2_gpus4_lr4eneg4_epochs10_pyg-clic-v230_adamw_tunedweightdecay_20250314_085408_738888/model_kwargs.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:08d9fedbafe13195772b70373f3ad6d7dcff24c98624585cc27f59b557bb5ae7
+size 553

clic/clusters/v2.3.0/largebatch_clic_wd3eneg2_gpus4_lr4eneg4_epochs10_pyg-clic-v230_adamw_tunedweightdecay_20250314_085408_738888/overridden_config.yaml ADDED Viewed

	@@ -0,0 +1,227 @@

+checkpoint_freq: 1
+comet: true
+comet_name: particleflow-pt
+comet_offline: false
+comet_step_freq: 1000
+conv_type: attention
+data_dir: /mnt/ceph/users/ewulff/tensorflow_datasets/clic
+dataset: clic
+dtype: bfloat16
+enabled_test_datasets:
+- clic_edm_qq_pf
+finetune: null
+gpu_batch_multiplier: 256
+gpus: 4
+load: null
+lr: 0.0004
+lr_schedule: cosinedecay
+lr_schedule_config:
+  onecycle:
+    pct_start: 0.3
+make_plots: null
+model:
+  attention:
+    activation: relu
+    attention_type: flash
+    conv_type: attention
+    dropout_conv_id_ff: 0.0
+    dropout_conv_id_mha: 0.0
+    dropout_conv_reg_ff: 0.0
+    dropout_conv_reg_mha: 0.0
+    dropout_ff: 0.0
+    head_dim: 32
+    num_convs: 3
+    num_heads: 32
+    use_pre_layernorm: true
+  cos_phi_mode: linear
+  energy_mode: direct-elemtype-split
+  eta_mode: linear
+  gnn_lsh:
+    activation: elu
+    bin_size: 32
+    conv_type: gnn_lsh
+    distance_dim: 128
+    embedding_dim: 512
+    ffn_dist_hidden_dim: 128
+    ffn_dist_num_layers: 2
+    layernorm: true
+    max_num_bins: 200
+    num_convs: 8
+    num_node_messages: 2
+    width: 512
+  input_encoding: split
+  learned_representation_mode: last
+  mamba:
+    activation: elu
+    conv_type: mamba
+    d_conv: 4
+    d_state: 16
+    dropout: 0.0
+    embedding_dim: 128
+    expand: 2
+    num_convs: 2
+    num_heads: 2
+    width: 128
+  pt_mode: direct-elemtype-split
+  sin_phi_mode: linear
+  trainable: all
+ntest: 2000
+ntrain: null
+num_epochs: 10
+num_workers: 12
+nvalid: null
+optimizer: adamw
+patience: 20
+prefetch_factor: 100
+raytune:
+  asha:
+    brackets: 1
+    grace_period: 4
+    max_t: 200
+    reduction_factor: 4
+  default_metric: val_loss
+  default_mode: min
+  hyperband:
+    max_t: 200
+    reduction_factor: 4
+  hyperopt:
+    n_random_steps: 10
+  local_dir: /mnt/ceph/users/ewulff/ray_results
+  nevergrad:
+    n_random_steps: 10
+  sched: null
+  search_alg: null
+save_attention: true
+sort_data: false
+test: null
+test_dataset:
+  clic_edm_qq_pf:
+    splits:
+    - 1
+    - 2
+    - 3
+    - 4
+    - 5
+    - 6
+    - 7
+    - 8
+    - 9
+    - 10
+    version: 2.5.0
+  clic_edm_ttbar_pf:
+    splits:
+    - 1
+    - 2
+    - 3
+    - 4
+    - 5
+    - 6
+    - 7
+    - 8
+    - 9
+    - 10
+    version: 2.5.0
+  clic_edm_ww_fullhad_pf:
+    splits:
+    - 1
+    - 2
+    - 3
+    - 4
+    - 5
+    - 6
+    - 7
+    - 8
+    - 9
+    - 10
+    version: 2.5.0
+train: true
+train_dataset:
+  clic:
+    physical:
+      batch_size: 1
+      samples:
+        clic_edm_qq_pf:
+          splits:
+          - 1
+          - 2
+          - 3
+          - 4
+          - 5
+          - 6
+          - 7
+          - 8
+          - 9
+          - 10
+          version: 2.5.0
+        clic_edm_ttbar_pf:
+          splits:
+          - 1
+          - 2
+          - 3
+          - 4
+          - 5
+          - 6
+          - 7
+          - 8
+          - 9
+          - 10
+          version: 2.5.0
+        clic_edm_ww_fullhad_pf:
+          splits:
+          - 1
+          - 2
+          - 3
+          - 4
+          - 5
+          - 6
+          - 7
+          - 8
+          - 9
+          - 10
+          version: 2.5.0
+val_freq: null
+valid_dataset:
+  clic:
+    physical:
+      batch_size: 1
+      samples:
+        clic_edm_qq_pf:
+          splits:
+          - 1
+          - 2
+          - 3
+          - 4
+          - 5
+          - 6
+          - 7
+          - 8
+          - 9
+          - 10
+          version: 2.5.0
+        clic_edm_ttbar_pf:
+          splits:
+          - 1
+          - 2
+          - 3
+          - 4
+          - 5
+          - 6
+          - 7
+          - 8
+          - 9
+          - 10
+          version: 2.5.0
+        clic_edm_ww_fullhad_pf:
+          splits:
+          - 1
+          - 2
+          - 3
+          - 4
+          - 5
+          - 6
+          - 7
+          - 8
+          - 9
+          - 10
+          version: 2.5.0
+weight_decay: 0.03

clic/clusters/v2.3.0/largebatch_clic_wd3eneg2_gpus4_lr4eneg4_epochs10_pyg-clic-v230_adamw_tunedweightdecay_20250314_085408_738888/runs/train/events.out.tfevents.1741957152.workergpu072.934288.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:18f9ea316e72db389ce5a1a6bd076330c12189182a3d9de30699ccb472188ca8
+size 52833

clic/clusters/v2.3.0/largebatch_clic_wd3eneg2_gpus4_lr4eneg4_epochs10_pyg-clic-v230_adamw_tunedweightdecay_20250314_085408_738888/runs/valid/events.out.tfevents.1741957152.workergpu072.934288.1 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:15eac2fffca0b2c3b3c51d82c685ff3d2e41d521b9e4bed220b30e5cac13f451
+size 11978655

clic/clusters/v2.3.0/largebatch_clic_wd3eneg2_gpus4_lr4eneg4_epochs10_pyg-clic-v230_adamw_tunedweightdecay_20250314_085408_738888/train-config.yaml ADDED Viewed

	@@ -0,0 +1,227 @@

+checkpoint_freq: 1
+comet: true
+comet_name: particleflow-pt
+comet_offline: false
+comet_step_freq: 1000
+conv_type: attention
+data_dir: /mnt/ceph/users/ewulff/tensorflow_datasets/clic
+dataset: clic
+dtype: bfloat16
+enabled_test_datasets:
+- clic_edm_qq_pf
+finetune: null
+gpu_batch_multiplier: 256
+gpus: 4
+load: null
+lr: 0.0004
+lr_schedule: cosinedecay
+lr_schedule_config:
+  onecycle:
+    pct_start: 0.3
+make_plots: null
+model:
+  attention:
+    activation: relu
+    attention_type: flash
+    conv_type: attention
+    dropout_conv_id_ff: 0.0
+    dropout_conv_id_mha: 0.0
+    dropout_conv_reg_ff: 0.0
+    dropout_conv_reg_mha: 0.0
+    dropout_ff: 0.0
+    head_dim: 32
+    num_convs: 3
+    num_heads: 32
+    use_pre_layernorm: true
+  cos_phi_mode: linear
+  energy_mode: direct-elemtype-split
+  eta_mode: linear
+  gnn_lsh:
+    activation: elu
+    bin_size: 32
+    conv_type: gnn_lsh
+    distance_dim: 128
+    embedding_dim: 512
+    ffn_dist_hidden_dim: 128
+    ffn_dist_num_layers: 2
+    layernorm: true
+    max_num_bins: 200
+    num_convs: 8
+    num_node_messages: 2
+    width: 512
+  input_encoding: split
+  learned_representation_mode: last
+  mamba:
+    activation: elu
+    conv_type: mamba
+    d_conv: 4
+    d_state: 16
+    dropout: 0.0
+    embedding_dim: 128
+    expand: 2
+    num_convs: 2
+    num_heads: 2
+    width: 128
+  pt_mode: direct-elemtype-split
+  sin_phi_mode: linear
+  trainable: all
+ntest: 2000
+ntrain: null
+num_epochs: 10
+num_workers: 12
+nvalid: null
+optimizer: adamw
+patience: 20
+prefetch_factor: 100
+raytune:
+  asha:
+    brackets: 1
+    grace_period: 4
+    max_t: 200
+    reduction_factor: 4
+  default_metric: val_loss
+  default_mode: min
+  hyperband:
+    max_t: 200
+    reduction_factor: 4
+  hyperopt:
+    n_random_steps: 10
+  local_dir: /mnt/ceph/users/ewulff/ray_results
+  nevergrad:
+    n_random_steps: 10
+  sched: null
+  search_alg: null
+save_attention: true
+sort_data: false
+test: null
+test_dataset:
+  clic_edm_qq_pf:
+    splits:
+    - 1
+    - 2
+    - 3
+    - 4
+    - 5
+    - 6
+    - 7
+    - 8
+    - 9
+    - 10
+    version: 2.5.0
+  clic_edm_ttbar_pf:
+    splits:
+    - 1
+    - 2
+    - 3
+    - 4
+    - 5
+    - 6
+    - 7
+    - 8
+    - 9
+    - 10
+    version: 2.5.0
+  clic_edm_ww_fullhad_pf:
+    splits:
+    - 1
+    - 2
+    - 3
+    - 4
+    - 5
+    - 6
+    - 7
+    - 8
+    - 9
+    - 10
+    version: 2.5.0
+train: true
+train_dataset:
+  clic:
+    physical:
+      batch_size: 1
+      samples:
+        clic_edm_qq_pf:
+          splits:
+          - 1
+          - 2
+          - 3
+          - 4
+          - 5
+          - 6
+          - 7
+          - 8
+          - 9
+          - 10
+          version: 2.5.0
+        clic_edm_ttbar_pf:
+          splits:
+          - 1
+          - 2
+          - 3
+          - 4
+          - 5
+          - 6
+          - 7
+          - 8
+          - 9
+          - 10
+          version: 2.5.0
+        clic_edm_ww_fullhad_pf:
+          splits:
+          - 1
+          - 2
+          - 3
+          - 4
+          - 5
+          - 6
+          - 7
+          - 8
+          - 9
+          - 10
+          version: 2.5.0
+val_freq: null
+valid_dataset:
+  clic:
+    physical:
+      batch_size: 1
+      samples:
+        clic_edm_qq_pf:
+          splits:
+          - 1
+          - 2
+          - 3
+          - 4
+          - 5
+          - 6
+          - 7
+          - 8
+          - 9
+          - 10
+          version: 2.5.0
+        clic_edm_ttbar_pf:
+          splits:
+          - 1
+          - 2
+          - 3
+          - 4
+          - 5
+          - 6
+          - 7
+          - 8
+          - 9
+          - 10
+          version: 2.5.0
+        clic_edm_ww_fullhad_pf:
+          splits:
+          - 1
+          - 2
+          - 3
+          - 4
+          - 5
+          - 6
+          - 7
+          - 8
+          - 9
+          - 10
+          version: 2.5.0
+weight_decay: 0.03