Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +38 -33
- context_encoding_model/_tp0_bk0/command.txt +1 -0
- context_encoding_model/_tp0_bk0/compile_flags.MODULE_f4171003694760566af4+a9cd68fb.json +1 -0
- context_encoding_model/_tp0_bk0/global_metric_store.json +1079 -0
- context_encoding_model/_tp0_bk0/graph.neff +3 -0
- context_encoding_model/_tp0_bk0/log-neuron-cc.txt +0 -0
- context_encoding_model/_tp0_bk0/metaneff.pb +3 -0
- context_encoding_model/_tp0_bk0/model.MODULE_f4171003694760566af4+a9cd68fb.hlo_module.pb +3 -0
- context_encoding_model/_tp0_bk0/model.MODULE_f4171003694760566af4+a9cd68fb.neff +3 -0
- context_encoding_model/_tp0_bk0/neuron_config.json +220 -0
- context_encoding_model/_tp0_bk1/command.txt +1 -0
- context_encoding_model/_tp0_bk1/compile_flags.MODULE_2914133a46cb7b4660ab+d7af8a84.json +1 -0
- context_encoding_model/_tp0_bk1/global_metric_store.json +1079 -0
- context_encoding_model/_tp0_bk1/graph.neff +3 -0
- context_encoding_model/_tp0_bk1/log-neuron-cc.txt +0 -0
- context_encoding_model/_tp0_bk1/metaneff.pb +3 -0
- context_encoding_model/_tp0_bk1/model.MODULE_2914133a46cb7b4660ab+d7af8a84.hlo_module.pb +3 -0
- context_encoding_model/_tp0_bk1/model.MODULE_2914133a46cb7b4660ab+d7af8a84.neff +3 -0
- context_encoding_model/_tp0_bk1/neuron_config.json +220 -0
- context_encoding_model/_tp0_bk2/command.txt +1 -0
- context_encoding_model/_tp0_bk2/compile_flags.MODULE_00594b8bc68e927f3dbe+1ad60ced.json +1 -0
- context_encoding_model/_tp0_bk2/global_metric_store.json +1079 -0
- context_encoding_model/_tp0_bk2/graph.neff +3 -0
- context_encoding_model/_tp0_bk2/log-neuron-cc.txt +0 -0
- context_encoding_model/_tp0_bk2/metaneff.pb +3 -0
- context_encoding_model/_tp0_bk2/model.MODULE_00594b8bc68e927f3dbe+1ad60ced.hlo_module.pb +3 -0
- context_encoding_model/_tp0_bk2/model.MODULE_00594b8bc68e927f3dbe+1ad60ced.neff +3 -0
- context_encoding_model/_tp0_bk2/neuron_config.json +220 -0
- context_encoding_model/_tp0_bk3/command.txt +1 -0
- context_encoding_model/_tp0_bk3/compile_flags.MODULE_b3ddbc97e5f0d1d64c82+155de413.json +1 -0
- context_encoding_model/_tp0_bk3/global_metric_store.json +1079 -0
- context_encoding_model/_tp0_bk3/graph.neff +3 -0
- context_encoding_model/_tp0_bk3/log-neuron-cc.txt +0 -0
- context_encoding_model/_tp0_bk3/metaneff.pb +3 -0
- context_encoding_model/_tp0_bk3/model.MODULE_b3ddbc97e5f0d1d64c82+155de413.hlo_module.pb +3 -0
- context_encoding_model/_tp0_bk3/model.MODULE_b3ddbc97e5f0d1d64c82+155de413.neff +3 -0
- context_encoding_model/_tp0_bk3/neuron_config.json +220 -0
- layout_opt/command.txt +1 -0
- layout_opt/graph.neff +3 -0
- layout_opt/log-neuron-cc.txt +0 -0
- layout_opt/metaneff +1198 -0
- layout_opt/model/graph.hlo +3 -0
- model.pt +3 -0
- neuron_config.json +218 -0
- token_generation_model/_tp0_bk0/command.txt +1 -0
- token_generation_model/_tp0_bk0/compile_flags.MODULE_6ef5ba8b41fbbe77f080+74ae8282.json +1 -0
- token_generation_model/_tp0_bk0/global_metric_store.json +540 -0
- token_generation_model/_tp0_bk0/graph.neff +3 -0
- token_generation_model/_tp0_bk0/log-neuron-cc.txt +0 -0
- token_generation_model/_tp0_bk0/metaneff.pb +3 -0
.gitattributes
CHANGED
@@ -1,35 +1,40 @@
|
|
1 |
-
*.
|
2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
-
*.
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
*.h5 filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
*.onnx filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
5 |
+
context_encoding_model/_tp0_bk0/graph.neff filter=lfs diff=lfs merge=lfs -text
|
6 |
+
context_encoding_model/_tp0_bk0/metaneff.pb filter=lfs diff=lfs merge=lfs -text
|
7 |
+
context_encoding_model/_tp0_bk0/model.MODULE_f4171003694760566af4+a9cd68fb.hlo_module.pb filter=lfs diff=lfs merge=lfs -text
|
8 |
+
context_encoding_model/_tp0_bk0/model.MODULE_f4171003694760566af4+a9cd68fb.neff filter=lfs diff=lfs merge=lfs -text
|
9 |
+
context_encoding_model/_tp0_bk1/graph.neff filter=lfs diff=lfs merge=lfs -text
|
10 |
+
context_encoding_model/_tp0_bk1/metaneff.pb filter=lfs diff=lfs merge=lfs -text
|
11 |
+
context_encoding_model/_tp0_bk1/model.MODULE_2914133a46cb7b4660ab+d7af8a84.hlo_module.pb filter=lfs diff=lfs merge=lfs -text
|
12 |
+
context_encoding_model/_tp0_bk1/model.MODULE_2914133a46cb7b4660ab+d7af8a84.neff filter=lfs diff=lfs merge=lfs -text
|
13 |
+
context_encoding_model/_tp0_bk2/graph.neff filter=lfs diff=lfs merge=lfs -text
|
14 |
+
context_encoding_model/_tp0_bk2/metaneff.pb filter=lfs diff=lfs merge=lfs -text
|
15 |
+
context_encoding_model/_tp0_bk2/model.MODULE_00594b8bc68e927f3dbe+1ad60ced.hlo_module.pb filter=lfs diff=lfs merge=lfs -text
|
16 |
+
context_encoding_model/_tp0_bk2/model.MODULE_00594b8bc68e927f3dbe+1ad60ced.neff filter=lfs diff=lfs merge=lfs -text
|
17 |
+
context_encoding_model/_tp0_bk3/graph.neff filter=lfs diff=lfs merge=lfs -text
|
18 |
+
context_encoding_model/_tp0_bk3/metaneff.pb filter=lfs diff=lfs merge=lfs -text
|
19 |
+
context_encoding_model/_tp0_bk3/model.MODULE_b3ddbc97e5f0d1d64c82+155de413.hlo_module.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
context_encoding_model/_tp0_bk3/model.MODULE_b3ddbc97e5f0d1d64c82+155de413.neff filter=lfs diff=lfs merge=lfs -text
|
21 |
+
layout_opt/graph.neff filter=lfs diff=lfs merge=lfs -text
|
22 |
+
layout_opt/model/graph.hlo filter=lfs diff=lfs merge=lfs -text
|
23 |
+
model.pt filter=lfs diff=lfs merge=lfs -text
|
24 |
+
token_generation_model/_tp0_bk0/graph.neff filter=lfs diff=lfs merge=lfs -text
|
25 |
+
token_generation_model/_tp0_bk0/metaneff.pb filter=lfs diff=lfs merge=lfs -text
|
26 |
+
token_generation_model/_tp0_bk0/model.MODULE_6ef5ba8b41fbbe77f080+74ae8282.hlo_module.pb filter=lfs diff=lfs merge=lfs -text
|
27 |
+
token_generation_model/_tp0_bk0/model.MODULE_6ef5ba8b41fbbe77f080+74ae8282.neff filter=lfs diff=lfs merge=lfs -text
|
28 |
+
token_generation_model/_tp0_bk0/wrapped_neff.hlo filter=lfs diff=lfs merge=lfs -text
|
29 |
+
token_generation_model/_tp0_bk1/graph.neff filter=lfs diff=lfs merge=lfs -text
|
30 |
+
token_generation_model/_tp0_bk1/metaneff.pb filter=lfs diff=lfs merge=lfs -text
|
31 |
+
token_generation_model/_tp0_bk1/model.MODULE_d608453625db6ed38994+e5eecdd4.hlo_module.pb filter=lfs diff=lfs merge=lfs -text
|
32 |
+
token_generation_model/_tp0_bk1/model.MODULE_d608453625db6ed38994+e5eecdd4.neff filter=lfs diff=lfs merge=lfs -text
|
33 |
+
token_generation_model/_tp0_bk2/graph.neff filter=lfs diff=lfs merge=lfs -text
|
34 |
+
token_generation_model/_tp0_bk2/metaneff.pb filter=lfs diff=lfs merge=lfs -text
|
35 |
+
token_generation_model/_tp0_bk2/model.MODULE_0ae1021f5dbf9cbac54d+2aa9c8c9.hlo_module.pb filter=lfs diff=lfs merge=lfs -text
|
36 |
+
token_generation_model/_tp0_bk2/model.MODULE_0ae1021f5dbf9cbac54d+2aa9c8c9.neff filter=lfs diff=lfs merge=lfs -text
|
37 |
+
token_generation_model/_tp0_bk3/graph.neff filter=lfs diff=lfs merge=lfs -text
|
38 |
+
token_generation_model/_tp0_bk3/metaneff.pb filter=lfs diff=lfs merge=lfs -text
|
39 |
+
token_generation_model/_tp0_bk3/model.MODULE_d3ed4857bd8baeff8023+b05cff0a.hlo_module.pb filter=lfs diff=lfs merge=lfs -text
|
40 |
+
token_generation_model/_tp0_bk3/model.MODULE_d3ed4857bd8baeff8023+b05cff0a.neff filter=lfs diff=lfs merge=lfs -text
|
context_encoding_model/_tp0_bk0/command.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
neuronx-cc compile --framework=XLA model.MODULE_f4171003694760566af4+a9cd68fb.hlo_module.pb --output model.MODULE_f4171003694760566af4+a9cd68fb.neff --target=trn1 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ' --lnc=1 -O1 '--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true' --logfile=log-neuron-cc.txt --verbose=35
|
context_encoding_model/_tp0_bk0/compile_flags.MODULE_f4171003694760566af4+a9cd68fb.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "--lnc=1", "-O1", "--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true", "--logfile=/home/ubuntu/qwen3/context_encoding_model/_tp0_bk0/log-neuron-cc.txt"]
|
context_encoding_model/_tp0_bk0/global_metric_store.json
ADDED
@@ -0,0 +1,1079 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"Average": {
|
3 |
+
"tensorizer": {
|
4 |
+
"StaticProfiler::AverageFractalPeUtilization": 99.65389251708984,
|
5 |
+
"StaticProfiler::AveragePartitionUtilization": 97.55139923095703,
|
6 |
+
"StaticProfiler::AveragePeUtilization": 98.60253143310547,
|
7 |
+
"StaticProfiler::LocalizationEfficiency": 99.04553985595703,
|
8 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 100.20111846923828,
|
9 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
|
10 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 0.0
|
11 |
+
}
|
12 |
+
},
|
13 |
+
"Count": {
|
14 |
+
"tensorizer": {
|
15 |
+
"StaticProfiler::AverageFractalPeUtilization": 1.0,
|
16 |
+
"StaticProfiler::AveragePartitionUtilization": 1.0,
|
17 |
+
"StaticProfiler::AveragePeUtilization": 1.0,
|
18 |
+
"StaticProfiler::LocalizationEfficiency": 1.0,
|
19 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1.0,
|
20 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 1.0,
|
21 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 1.0
|
22 |
+
}
|
23 |
+
},
|
24 |
+
"Sum": {
|
25 |
+
"compiletime": {
|
26 |
+
"AGOrderingAnalysisPass": 0.018787622451782227,
|
27 |
+
"AffinePredicateResolution": 0.0011818408966064453,
|
28 |
+
"AliasDependencyElimination": 0.00011801719665527344,
|
29 |
+
"AliasDependencyInduction": 0.005483388900756836,
|
30 |
+
"AliasDependencyReset": 0.026019811630249023,
|
31 |
+
"BFComputeCutting": 0.00225830078125,
|
32 |
+
"BirCodeGenLoop": 0.4621126651763916,
|
33 |
+
"CCOpFusion": 0.01928091049194336,
|
34 |
+
"CanonicalizeConv": 3.7000001611886546e-05,
|
35 |
+
"CanonicalizeDAGForPGTiling": 0.004612922668457031,
|
36 |
+
"CanonicalizeForTensorizer": 4.099999932805076e-05,
|
37 |
+
"CanonicalizeIR": 0.0017774105072021484,
|
38 |
+
"Canonicalizer": 0.0009619999909773469,
|
39 |
+
"CoalesceCCOp": 0.0146026611328125,
|
40 |
+
"CommuteConcat": 0.0020241737365722656,
|
41 |
+
"DMALocalityOpt": 0.005425214767456055,
|
42 |
+
"DMAProfiler": 0.012541055679321289,
|
43 |
+
"DMATilingProfiler": 0.004782676696777344,
|
44 |
+
"DataLocalityOpt": 0.06629562377929688,
|
45 |
+
"DataStreaming": 0.03773355484008789,
|
46 |
+
"DeConcat": 0.0006563663482666016,
|
47 |
+
"DeadCodeElimination": 0.002358675003051758,
|
48 |
+
"DeadStoreElimination": 0.0055620670318603516,
|
49 |
+
"DelinearIndices": 0.004741668701171875,
|
50 |
+
"Delinearization": 0.0036110877990722656,
|
51 |
+
"DoNothing": 0.00022459030151367188,
|
52 |
+
"DramToDramTranspose": 0.016016721725463867,
|
53 |
+
"DumpGraphAndMetadata": 0.0853111743927002,
|
54 |
+
"EliminateDivs": 0.0025675296783447266,
|
55 |
+
"ExpandBatchNorm": 0.002092123031616211,
|
56 |
+
"ExpandISAMacro": 0.011052370071411133,
|
57 |
+
"FactorizeBlkDims": 0.00814366340637207,
|
58 |
+
"FactorizeThreadAxesInFreeDims": 0.002122640609741211,
|
59 |
+
"FlattenMacroLoop": 0.002187013626098633,
|
60 |
+
"GenericAccessSimplifier": 0.0009529590606689453,
|
61 |
+
"HoistCompute": 6.000000212225132e-06,
|
62 |
+
"IdentifyCrossPassTensors": 7.700000423938036e-05,
|
63 |
+
"InferInitValue": 0.0242159366607666,
|
64 |
+
"InferIntrinsicOnCC": 0.009269952774047852,
|
65 |
+
"InferNeuronTensor": 0.020155906677246094,
|
66 |
+
"InferNonlocalTensors": 0.015646696090698242,
|
67 |
+
"InferPSumTensor": 0.3081786632537842,
|
68 |
+
"InlineNativeKernels": 0.009155511856079102,
|
69 |
+
"InsertIOTransposes": 0.015281438827514648,
|
70 |
+
"InsertLocalTransposes": 0.006501436233520508,
|
71 |
+
"InsertOffloadedTransposes": 0.002702474594116211,
|
72 |
+
"LICM": 0.002913951873779297,
|
73 |
+
"LateLegalizeInst": 0.014158487319946289,
|
74 |
+
"LateLegalizePostSplit": 0.012693405151367188,
|
75 |
+
"LateLowerReshapeOp": 0.0025734901428222656,
|
76 |
+
"LateLowerTensorOp": 0.001531362533569336,
|
77 |
+
"LateNeuronInstComb": 0.008838176727294922,
|
78 |
+
"LayoutPreprocessing": 0.026634931564331055,
|
79 |
+
"LayoutPreprocessingAndAnalysis": 0.5595176219940186,
|
80 |
+
"LayoutRequirementAnalysis": 0.005538463592529297,
|
81 |
+
"LegalizeCCOpLayout": 0.0022728443145751953,
|
82 |
+
"LegalizeOpLevelAlias": 0.001255035400390625,
|
83 |
+
"LegalizePartitionReduce": 0.001256704330444336,
|
84 |
+
"LegalizeSundaAccess": 0.07711672782897949,
|
85 |
+
"LegalizeSundaMacro": 0.010920286178588867,
|
86 |
+
"LegalizeType": 0.01314401626586914,
|
87 |
+
"LocalLayoutOpt": 0.012011289596557617,
|
88 |
+
"LoopFusion": 0.006572723388671875,
|
89 |
+
"LoopSplitting": 0.0003001689910888672,
|
90 |
+
"LowerBroadcast": 0.0018808841705322266,
|
91 |
+
"LowerCCOpBlockAxis": 0.0050678253173828125,
|
92 |
+
"LowerComplexBroadcast": 0.0025262832641601563,
|
93 |
+
"LowerIntrinsics": 0.3039369583129883,
|
94 |
+
"LowerTensorOp": 0.011744022369384766,
|
95 |
+
"LowerTranspose": 0.011741399765014648,
|
96 |
+
"MacroGeneration": 0.026911020278930664,
|
97 |
+
"MaskPropagation": 0.0031325817108154297,
|
98 |
+
"MemcastMotion": 2.2000000171829015e-05,
|
99 |
+
"MemcpyElimination": 0.027472257614135742,
|
100 |
+
"MutateDataType": 0.0015196800231933594,
|
101 |
+
"NeuronAliasDependencyInduction": 0.00016927719116210938,
|
102 |
+
"NeuronAliasDependencyReset": 0.0242006778717041,
|
103 |
+
"NeuronInstComb": 0.00468754768371582,
|
104 |
+
"NeuronLICM": 0.03664875030517578,
|
105 |
+
"NeuronLoopFusion": 0.00889277458190918,
|
106 |
+
"NeuronLoopInterchange": 0.002141237258911133,
|
107 |
+
"NeuronSimplifier": 0.00720524787902832,
|
108 |
+
"NeuronSimplifyPredicates": 0.12209796905517578,
|
109 |
+
"NeuronValueNumbering": 0.003449678421020508,
|
110 |
+
"OptimizeAliasedCopyChain": 0.0006387233734130859,
|
111 |
+
"OptimizeNKIKernels": 0.5260024070739746,
|
112 |
+
"PAGLayoutOpt": 0.5680239200592041,
|
113 |
+
"PComputeCutting": 0.0048143863677978516,
|
114 |
+
"PGLayoutTilingPipeline": 1.6304676532745361,
|
115 |
+
"PGTiling": 0.1616363525390625,
|
116 |
+
"PadElimination": 0.0003521442413330078,
|
117 |
+
"ParAxesAnnotation": 0.0544736385345459,
|
118 |
+
"PartialLoopFusion": 0.005907773971557617,
|
119 |
+
"PartialSimdFusion": 0.0038967132568359375,
|
120 |
+
"PenguinizeFunctions": 3.900000228895806e-05,
|
121 |
+
"PerfectLoopNest": 0.0021576881408691406,
|
122 |
+
"PruneFunctions": 3.5000000934815034e-05,
|
123 |
+
"RecognizeOpIdiom": 0.0039520263671875,
|
124 |
+
"Recompute": 0.0002884864807128906,
|
125 |
+
"RelaxPredicates": 0.013870716094970703,
|
126 |
+
"Rematerialization": 0.0024657249450683594,
|
127 |
+
"RemoveOptimizationBarriers": 6.500000017695129e-05,
|
128 |
+
"ReshapeWeights": 0.0006930828094482422,
|
129 |
+
"ResolveAccessConflict": 0.0038983821868896484,
|
130 |
+
"ResolveComplicatePredicates": 0.0012950897216796875,
|
131 |
+
"RewriteReplicationMatmul": 0.002060413360595703,
|
132 |
+
"RewriteWeights": 0.0028791427612304688,
|
133 |
+
"SFKVectorizer": 0.2904393672943115,
|
134 |
+
"ScatterMotion": 2.8000000384054147e-05,
|
135 |
+
"SimpleAllReduceTiling": 0.008909463882446289,
|
136 |
+
"Simplifier": 0.003449678421020508,
|
137 |
+
"SimplifyMacroPredicates": 0.010317325592041016,
|
138 |
+
"SimplifyNeuronTensor": 1.038323163986206,
|
139 |
+
"SimplifySlice": 0.0008852481842041016,
|
140 |
+
"SimplifyTensor": 0.005218982696533203,
|
141 |
+
"SpillPSum": 0.010073423385620117,
|
142 |
+
"SplitAPUnionSets": 0.10591006278991699,
|
143 |
+
"SplitAccGrp": 0.0011169910430908203,
|
144 |
+
"StaticProfiler": 0.01290583610534668,
|
145 |
+
"StaticTransposeLocalTensor": 0.003824472427368164,
|
146 |
+
"SundaISel": 0.041872262954711914,
|
147 |
+
"TCTransform": 0.0008666515350341797,
|
148 |
+
"TensorInitialization": 0.013058185577392578,
|
149 |
+
"TensorOpSimplifier": 0.0061550140380859375,
|
150 |
+
"TensorOpTransform": 0.020328521728515625,
|
151 |
+
"TensorizerLegalizationPass": 6.900000153109431e-05,
|
152 |
+
"TileCCOps": 0.006834983825683594,
|
153 |
+
"TilingProfiler": 0.0072863101959228516,
|
154 |
+
"TransformConvOp": 0.0032320022583007813,
|
155 |
+
"TritiumFusion": 0.03062152862548828,
|
156 |
+
"ValueNumbering": 0.0023603439331054688,
|
157 |
+
"VectorizeDMA": 0.004430294036865234,
|
158 |
+
"VectorizeMatMult": 0.0021605491638183594,
|
159 |
+
"VerifySupportedOps": 3.300000025774352e-05,
|
160 |
+
"WeightCoalescing": 0.00846409797668457,
|
161 |
+
"ZeroSizeTensorElimination": 0.00011014938354492188,
|
162 |
+
"algsimp": 0.004399999976158142,
|
163 |
+
"batchnorm_expander": 3.600000127335079e-05,
|
164 |
+
"boundary-marker-removal": 1.2000000424450263e-05,
|
165 |
+
"call-inliner": 0.0007670000777579844,
|
166 |
+
"canonicalize-boundary-marker": 1.4999999621068127e-05,
|
167 |
+
"collective-stream-id-checker": 7.300000288523734e-05,
|
168 |
+
"comparison-expander": 0.0006099999882280827,
|
169 |
+
"computation-deduplicator": 5.8999998145736754e-05,
|
170 |
+
"conditional-to-select": 1.5999999959603883e-05,
|
171 |
+
"config-lowering": 8.70000003487803e-05,
|
172 |
+
"constant-statistics": 0.0005649999948218465,
|
173 |
+
"constant_folding": 0.0005520000122487545,
|
174 |
+
"cse": 3.600000127335079e-05,
|
175 |
+
"dce": 0.00014599999121855944,
|
176 |
+
"dot_decomposer": 0.0013859999598935246,
|
177 |
+
"dynamic-slice-transpose": 1.2000000424450263e-05,
|
178 |
+
"eliminate-redundant-compare": 0.0004949999856762588,
|
179 |
+
"emit-offloaded-dropout": 3.80000019504223e-05,
|
180 |
+
"flatten-call-graph": 0.0009339999523945153,
|
181 |
+
"fuse-send-recv": 7.100000220816582e-05,
|
182 |
+
"hilo::LegalizeAlias": 1.1999999514955562e-05,
|
183 |
+
"hilo::NeuronInstCombine": 0.00010099999781232327,
|
184 |
+
"hilo::NeuronOpFusion": 4.7999998059822246e-05,
|
185 |
+
"hilo::ReplaceTokenTypeWithU8Pass": 3.899999865097925e-05,
|
186 |
+
"hilo::ScheduleFusion": 1.9999999949504854e-06,
|
187 |
+
"hilo::SixtyFourHack": 6.200000643730164e-05,
|
188 |
+
"hilo::VerifyAliasing": 4.999999873689376e-06,
|
189 |
+
"hlo-mac-count": 0.0018479999853298068,
|
190 |
+
"hlo-verifier": 0.007563999388366938,
|
191 |
+
"instruction-histogram": 0.002553999889642,
|
192 |
+
"io-con-pipe-begin": 4.999999873689376e-06,
|
193 |
+
"io-con-pipe-end": 9.999999974752427e-07,
|
194 |
+
"io-layout-normalization": 0.0013040000339969993,
|
195 |
+
"io-statistics": 6.500000017695129e-05,
|
196 |
+
"legalize-ccops": 3.000000106112566e-06,
|
197 |
+
"legalize-compare": 1.1000000085914508e-05,
|
198 |
+
"lower-argminmax-custom-call": 1.2000000424450263e-05,
|
199 |
+
"map-inline": 0.0008759999764151871,
|
200 |
+
"metadata-naming": 6.0999998822808266e-05,
|
201 |
+
"mlir::detail::OpToOpPassAdaptor": 7.200000254670158e-05,
|
202 |
+
"mlir::hlo::MhloToPyPenguin": 0.002776999957859516,
|
203 |
+
"mlir::mhlo::LowerComplexExtraPass": 0.00023499999952036887,
|
204 |
+
"mlir::mhlo::LowerComplexPass": 0.00032500000088475645,
|
205 |
+
"native-to-custom-softmax": 0.0007319999858736992,
|
206 |
+
"native-to-custom-softmax-dx": 0.000678999989759177,
|
207 |
+
"operand_upcaster": 4.900000203633681e-05,
|
208 |
+
"opt-barrier-removal": 0.0005629999795928597,
|
209 |
+
"post-par-pipe-begin": 9.000000318337698e-06,
|
210 |
+
"post-par-pipe-end": 0.0,
|
211 |
+
"post-partition-simplification": 0.001663000090047717,
|
212 |
+
"pre-par-pipe-begin": 9.999999974752427e-07,
|
213 |
+
"pre-par-pipe-end": 9.999999974752427e-07,
|
214 |
+
"pre-partition-simplification": 0.13888800144195557,
|
215 |
+
"replace-minimum-constant": 0.0007169999880716205,
|
216 |
+
"reshape-mover": 0.00021499999274965376,
|
217 |
+
"simplify-concat": 0.00014099999680183828,
|
218 |
+
"simplify-while-loops": 0.00017800000205170363,
|
219 |
+
"transform-variadic-reduce": 6.70000008540228e-05,
|
220 |
+
"tuple-simplifier": 0.0005469999159686267,
|
221 |
+
"unpack-nested-aws-ntwsr": 0.00046300000394694507,
|
222 |
+
"unroll-while-loop": 3.099999958067201e-05,
|
223 |
+
"zero_sized_hlo_elimination": 0.0008880000095814466
|
224 |
+
},
|
225 |
+
"hilo": {
|
226 |
+
"ConstantSize": 304437.0,
|
227 |
+
"HloInputCount": 475.0,
|
228 |
+
"HloMacCount": 25141444608.0,
|
229 |
+
"HloOutputCount": 73.0,
|
230 |
+
"IfmapSize": 8266542080.0,
|
231 |
+
"OfmapSize": 75497472.0,
|
232 |
+
"OutputsReadFromCount": 0.0,
|
233 |
+
"PassthroughTensorsCount": 0.0,
|
234 |
+
"RedundantOutputCount": 0.0,
|
235 |
+
"Traffic": 1649111936.0
|
236 |
+
},
|
237 |
+
"tensorizer": {
|
238 |
+
"DMATilingProfiler::TotalInstructionsAfterTiling": 42834.0,
|
239 |
+
"StaticProfiler::AifUb": 129.43267822265625,
|
240 |
+
"StaticProfiler::ArithmeticIntensityTensorizer": 128.19729614257813,
|
241 |
+
"StaticProfiler::AverageDmaLength": 4810.17578125,
|
242 |
+
"StaticProfiler::DDRTransferBytes": 782946624.0,
|
243 |
+
"StaticProfiler::InternalTransferBytes": 629086720.0,
|
244 |
+
"StaticProfiler::LoadExpanded": 97814.0,
|
245 |
+
"StaticProfiler::StoreExpanded": 1757.0,
|
246 |
+
"StaticProfiler::TotalDMAExpanded": 99571.0,
|
247 |
+
"StaticProfiler::TotalDynamicInstancesCount": 50031.0,
|
248 |
+
"StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 49585.0,
|
249 |
+
"StaticProfiler::TotalLNCComm": 0.0,
|
250 |
+
"StaticProfiler::TotalLNCCommTransfer": 0.0,
|
251 |
+
"TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
|
252 |
+
"TilingProfiler::DmaInstructionsAfterTiling": 0.0,
|
253 |
+
"TilingProfiler::GenericInstructionsAfterTiling": 4.0,
|
254 |
+
"TilingProfiler::MatMultInstructionsAfterTiling": 22464.0,
|
255 |
+
"TilingProfiler::NumPfTransposes": 5.0,
|
256 |
+
"TilingProfiler::NumPfTransposesForIo": 1.0,
|
257 |
+
"TilingProfiler::NumPfTransposesForLocal": 1.0,
|
258 |
+
"TilingProfiler::NumPfTransposesForNonlocal": 3.0,
|
259 |
+
"TilingProfiler::PfTransposeInstructions": 19105.0,
|
260 |
+
"TilingProfiler::PfTransposeInstructionsForIo": 19008.0,
|
261 |
+
"TilingProfiler::PfTransposeInstructionsForLocal": 1.0,
|
262 |
+
"TilingProfiler::PfTransposeInstructionsForNonlocal": 96.0,
|
263 |
+
"TilingProfiler::ReduceInstructionsAfterTiling": 4.0,
|
264 |
+
"TilingProfiler::SimdInstructionsAfterTiling": 158.0,
|
265 |
+
"TilingProfiler::TotalInstructionsAfterTiling": 0.0,
|
266 |
+
"TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
|
267 |
+
"TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
|
268 |
+
"TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
|
269 |
+
"TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
|
270 |
+
"TransformConvOp::conv2d_column_packing": 0.0,
|
271 |
+
"TransformConvOp::conv2d_column_packing_1": 0.0,
|
272 |
+
"TransformConvOp::conv2d_column_packing_io10": 0.0,
|
273 |
+
"TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
|
274 |
+
}
|
275 |
+
},
|
276 |
+
"all": {
|
277 |
+
"compiletime": {
|
278 |
+
"algsimp": 0.004207999911159277,
|
279 |
+
"call-inliner": 0.0007350000087171793,
|
280 |
+
"collective-stream-id-checker": 6.399999983841553e-05,
|
281 |
+
"comparison-expander": 0.0005949999904260039,
|
282 |
+
"constant-statistics": 0.0005649999948218465,
|
283 |
+
"constant_folding": 0.0005249999812804163,
|
284 |
+
"dce": 0.0001429999974789098,
|
285 |
+
"dot_decomposer": 0.0013859999598935246,
|
286 |
+
"eliminate-redundant-compare": 0.0004839999892283231,
|
287 |
+
"flatten-call-graph": 0.000901999999769032,
|
288 |
+
"hlo-mac-count": 0.0016270000487565994,
|
289 |
+
"hlo-verifier": 0.007029999978840351,
|
290 |
+
"instruction-histogram": 0.002553999889642,
|
291 |
+
"io-con-pipe-begin": 4.999999873689376e-06,
|
292 |
+
"io-con-pipe-end": 9.999999974752427e-07,
|
293 |
+
"io-layout-normalization": 0.0013040000339969993,
|
294 |
+
"io-statistics": 6.500000017695129e-05,
|
295 |
+
"map-inline": 0.000838999985717237,
|
296 |
+
"native-to-custom-softmax": 0.0007050000131130219,
|
297 |
+
"native-to-custom-softmax-dx": 0.0005089999758638442,
|
298 |
+
"opt-barrier-removal": 0.0005629999795928597,
|
299 |
+
"pre-par-pipe-begin": 9.999999974752427e-07,
|
300 |
+
"pre-par-pipe-end": 9.999999974752427e-07,
|
301 |
+
"pre-partition-simplification": 0.13888800144195557,
|
302 |
+
"replace-minimum-constant": 0.0006949999951757491,
|
303 |
+
"reshape-mover": 0.00020500000391621143,
|
304 |
+
"simplify-while-loops": 0.0001720000000204891,
|
305 |
+
"tuple-simplifier": 0.0005319999763742089,
|
306 |
+
"unpack-nested-aws-ntwsr": 0.000450999999884516,
|
307 |
+
"unroll-while-loop": 2.9000000722589903e-05,
|
308 |
+
"zero_sized_hlo_elimination": 0.0008880000095814466
|
309 |
+
}
|
310 |
+
},
|
311 |
+
"cumsum": {
|
312 |
+
"compiletime": {
|
313 |
+
"CoalesceCCOp": 0.00020933151245117188,
|
314 |
+
"DMALocalityOpt": 0.0001666545867919922,
|
315 |
+
"DMAProfiler": 0.0008401870727539063,
|
316 |
+
"DataStreaming": 0.0002658367156982422,
|
317 |
+
"DoNothing": 0.00014090538024902344,
|
318 |
+
"ExpandISAMacro": 0.0004999637603759766,
|
319 |
+
"FactorizeBlkDims": 0.00046062469482421875,
|
320 |
+
"InferPSumTensor": 0.0004820823669433594,
|
321 |
+
"LateLegalizeInst": 0.0004343986511230469,
|
322 |
+
"LateNeuronInstComb": 0.0004832744598388672,
|
323 |
+
"LegalizeSundaAccess": 0.002238750457763672,
|
324 |
+
"LegalizeType": 0.0002429485321044922,
|
325 |
+
"LowerBroadcast": 0.0002453327178955078,
|
326 |
+
"LowerIntrinsics": 0.00021791458129882813,
|
327 |
+
"LowerTranspose": 0.00022292137145996094,
|
328 |
+
"NeuronInstComb": 0.0005400180816650391,
|
329 |
+
"NeuronLICM": 0.0003840923309326172,
|
330 |
+
"NeuronSimplifyPredicates": 0.0028014183044433594,
|
331 |
+
"NeuronValueNumbering": 0.00042724609375,
|
332 |
+
"SFKVectorizer": 0.0028204917907714844,
|
333 |
+
"SimpleAllReduceTiling": 0.0002048015594482422,
|
334 |
+
"SimplifyNeuronTensor": 0.00043082237243652344,
|
335 |
+
"SpillPSum": 0.0005221366882324219,
|
336 |
+
"WeightCoalescing": 0.00020456314086914063
|
337 |
+
}
|
338 |
+
},
|
339 |
+
"sg00": {
|
340 |
+
"compiletime": {
|
341 |
+
"CanonicalizeConv": 1.4000000192027073e-05,
|
342 |
+
"CanonicalizeForTensorizer": 1.4999999621068127e-05,
|
343 |
+
"Canonicalizer": 0.0003440000000409782,
|
344 |
+
"HoistCompute": 3.000000106112566e-06,
|
345 |
+
"IdentifyCrossPassTensors": 3.099999958067201e-05,
|
346 |
+
"MemcastMotion": 1.2000000424450263e-05,
|
347 |
+
"PenguinizeFunctions": 1.5999999959603883e-05,
|
348 |
+
"PruneFunctions": 1.2999999853491317e-05,
|
349 |
+
"RemoveOptimizationBarriers": 2.4000000848900527e-05,
|
350 |
+
"ScatterMotion": 1.1000000085914508e-05,
|
351 |
+
"TensorizerLegalizationPass": 2.9000000722589903e-05,
|
352 |
+
"VerifySupportedOps": 1.1000000085914508e-05,
|
353 |
+
"algsimp": 6.70000008540228e-05,
|
354 |
+
"batchnorm_expander": 1.2999999853491317e-05,
|
355 |
+
"boundary-marker-removal": 3.999999989900971e-06,
|
356 |
+
"call-inliner": 1.1000000085914508e-05,
|
357 |
+
"canonicalize-boundary-marker": 6.000000212225132e-06,
|
358 |
+
"collective-stream-id-checker": 3.000000106112566e-06,
|
359 |
+
"comparison-expander": 4.999999873689376e-06,
|
360 |
+
"computation-deduplicator": 1.700000029813964e-05,
|
361 |
+
"conditional-to-select": 4.999999873689376e-06,
|
362 |
+
"config-lowering": 3.199999991920777e-05,
|
363 |
+
"constant_folding": 9.000000318337698e-06,
|
364 |
+
"cse": 1.2999999853491317e-05,
|
365 |
+
"dce": 9.999999974752427e-07,
|
366 |
+
"dynamic-slice-transpose": 3.999999989900971e-06,
|
367 |
+
"eliminate-redundant-compare": 3.999999989900971e-06,
|
368 |
+
"emit-offloaded-dropout": 1.2999999853491317e-05,
|
369 |
+
"flatten-call-graph": 1.1000000085914508e-05,
|
370 |
+
"fuse-send-recv": 2.4000000848900527e-05,
|
371 |
+
"hilo::LegalizeAlias": 4.999999873689376e-06,
|
372 |
+
"hilo::NeuronInstCombine": 6.0999998822808266e-05,
|
373 |
+
"hilo::NeuronOpFusion": 7.000000096013537e-06,
|
374 |
+
"hilo::ReplaceTokenTypeWithU8Pass": 1.5999999959603883e-05,
|
375 |
+
"hilo::ScheduleFusion": 9.999999974752427e-07,
|
376 |
+
"hilo::SixtyFourHack": 1.4999999621068127e-05,
|
377 |
+
"hilo::VerifyAliasing": 1.9999999949504854e-06,
|
378 |
+
"hlo-mac-count": 2.5999999706982635e-05,
|
379 |
+
"hlo-verifier": 0.00018699999782256782,
|
380 |
+
"legalize-ccops": 9.999999974752427e-07,
|
381 |
+
"legalize-compare": 3.999999989900971e-06,
|
382 |
+
"lower-argminmax-custom-call": 3.999999989900971e-06,
|
383 |
+
"map-inline": 1.2000000424450263e-05,
|
384 |
+
"metadata-naming": 2.099999983329326e-05,
|
385 |
+
"mlir::detail::OpToOpPassAdaptor": 2.300000051036477e-05,
|
386 |
+
"mlir::hlo::MhloToPyPenguin": 0.0010349999647587538,
|
387 |
+
"mlir::mhlo::LowerComplexExtraPass": 8.600000001024455e-05,
|
388 |
+
"mlir::mhlo::LowerComplexPass": 0.0001740000006975606,
|
389 |
+
"native-to-custom-softmax": 1.4000000192027073e-05,
|
390 |
+
"native-to-custom-softmax-dx": 0.0001340000017080456,
|
391 |
+
"operand_upcaster": 1.8000000636675395e-05,
|
392 |
+
"post-par-pipe-begin": 6.000000212225132e-06,
|
393 |
+
"post-par-pipe-end": 0.0,
|
394 |
+
"post-partition-simplification": 0.0005830000154674053,
|
395 |
+
"replace-minimum-constant": 7.000000096013537e-06,
|
396 |
+
"reshape-mover": 3.999999989900971e-06,
|
397 |
+
"simplify-concat": 4.8000001697801054e-05,
|
398 |
+
"simplify-while-loops": 1.9999999949504854e-06,
|
399 |
+
"transform-variadic-reduce": 9.000000318337698e-06,
|
400 |
+
"tuple-simplifier": 4.999999873689376e-06,
|
401 |
+
"unpack-nested-aws-ntwsr": 3.999999989900971e-06,
|
402 |
+
"unroll-while-loop": 0.0
|
403 |
+
},
|
404 |
+
"hilo": {
|
405 |
+
"ArithmeticIntensity": 8.198826789855957,
|
406 |
+
"ConstantSize": 304437.0,
|
407 |
+
"HloInputCount": 475.0,
|
408 |
+
"HloMacCount": 2751463424.0,
|
409 |
+
"HloOutputCount": 73.0,
|
410 |
+
"IfmapSize": 8266542080.0,
|
411 |
+
"OfmapSize": 75497472.0,
|
412 |
+
"OutputsReadFromCount": 0.0,
|
413 |
+
"PassthroughTensorsCount": 0.0,
|
414 |
+
"RedundantOutputCount": 0.0,
|
415 |
+
"Traffic": 671184704.0
|
416 |
+
}
|
417 |
+
},
|
418 |
+
"sg0000": {
|
419 |
+
"compiletime": {
|
420 |
+
"AGOrderingAnalysisPass": 0.04074835777282715,
|
421 |
+
"AffinePredicateResolution": 0.002183198928833008,
|
422 |
+
"AliasDependencyElimination": 0.00012922286987304688,
|
423 |
+
"AliasDependencyInduction": 0.008634567260742188,
|
424 |
+
"AliasDependencyReset": 0.03679013252258301,
|
425 |
+
"BFComputeCutting": 0.0019538402557373047,
|
426 |
+
"BirCodeGenLoop": 0.04571366310119629,
|
427 |
+
"CCOpFusion": 0.01575756072998047,
|
428 |
+
"CanonicalizeDAGForPGTiling": 0.003149271011352539,
|
429 |
+
"CanonicalizeIR": 0.002719879150390625,
|
430 |
+
"CoalesceCCOp": 0.0047032833099365234,
|
431 |
+
"CommuteConcat": 0.0013585090637207031,
|
432 |
+
"DMALocalityOpt": 0.001116037368774414,
|
433 |
+
"DMAProfiler": 0.0047032833099365234,
|
434 |
+
"DMATilingProfiler": 0.004144191741943359,
|
435 |
+
"DataLocalityOpt": 0.10100674629211426,
|
436 |
+
"DataStreaming": 0.0033788681030273438,
|
437 |
+
"DeConcat": 0.0007069110870361328,
|
438 |
+
"DeadCodeElimination": 0.0010058879852294922,
|
439 |
+
"DeadStoreElimination": 0.031080961227416992,
|
440 |
+
"DelinearIndices": 0.007829427719116211,
|
441 |
+
"Delinearization": 0.003365039825439453,
|
442 |
+
"DoNothing": 7.033348083496094e-05,
|
443 |
+
"DramToDramTranspose": 0.024500370025634766,
|
444 |
+
"DumpGraphAndMetadata": 0.005262136459350586,
|
445 |
+
"EliminateDivs": 0.005412578582763672,
|
446 |
+
"ExpandBatchNorm": 0.0019643306732177734,
|
447 |
+
"ExpandISAMacro": 0.002582550048828125,
|
448 |
+
"FactorizeBlkDims": 0.00794839859008789,
|
449 |
+
"FactorizeThreadAxesInFreeDims": 0.0020449161529541016,
|
450 |
+
"FlattenMacroLoop": 0.0028934478759765625,
|
451 |
+
"GenericAccessSimplifier": 0.0009298324584960938,
|
452 |
+
"InferInitValue": 0.026146411895751953,
|
453 |
+
"InferIntrinsicOnCC": 0.010050058364868164,
|
454 |
+
"InferNeuronTensor": 0.03407764434814453,
|
455 |
+
"InferNonlocalTensors": 0.06189298629760742,
|
456 |
+
"InferPSumTensor": 0.03060150146484375,
|
457 |
+
"InlineNativeKernels": 0.0014431476593017578,
|
458 |
+
"InsertIOTransposes": 0.009805679321289063,
|
459 |
+
"InsertLocalTransposes": 0.007609128952026367,
|
460 |
+
"InsertOffloadedTransposes": 0.004189968109130859,
|
461 |
+
"LICM": 0.0029850006103515625,
|
462 |
+
"LateLegalizeInst": 0.004921674728393555,
|
463 |
+
"LateLegalizePostSplit": 0.0025641918182373047,
|
464 |
+
"LateLowerReshapeOp": 0.002185821533203125,
|
465 |
+
"LateLowerTensorOp": 0.00531768798828125,
|
466 |
+
"LateNeuronInstComb": 0.02812671661376953,
|
467 |
+
"LayoutPreprocessing": 0.11982965469360352,
|
468 |
+
"LayoutPreprocessingAndAnalysis": 0.24928760528564453,
|
469 |
+
"LayoutRequirementAnalysis": 0.007187366485595703,
|
470 |
+
"LegalizeCCOpLayout": 0.0035941600799560547,
|
471 |
+
"LegalizeOpLevelAlias": 0.0022826194763183594,
|
472 |
+
"LegalizePartitionReduce": 0.002084970474243164,
|
473 |
+
"LegalizeSundaAccess": 0.03499269485473633,
|
474 |
+
"LegalizeSundaMacro": 0.00858449935913086,
|
475 |
+
"LegalizeType": 0.0038924217224121094,
|
476 |
+
"LocalLayoutOpt": 0.015146255493164063,
|
477 |
+
"LoopFusion": 0.00600433349609375,
|
478 |
+
"LoopSplitting": 0.0003192424774169922,
|
479 |
+
"LowerBroadcast": 0.0030934810638427734,
|
480 |
+
"LowerCCOpBlockAxis": 0.0053822994232177734,
|
481 |
+
"LowerComplexBroadcast": 0.0017805099487304688,
|
482 |
+
"LowerIntrinsics": 0.03145861625671387,
|
483 |
+
"LowerTensorOp": 0.013553142547607422,
|
484 |
+
"LowerTranspose": 0.008147954940795898,
|
485 |
+
"MacroGeneration": 0.10158348083496094,
|
486 |
+
"MaskPropagation": 0.004988193511962891,
|
487 |
+
"MemcpyElimination": 0.1091456413269043,
|
488 |
+
"MutateDataType": 0.002095937728881836,
|
489 |
+
"NeuronAliasDependencyInduction": 0.00023055076599121094,
|
490 |
+
"NeuronAliasDependencyReset": 0.036977291107177734,
|
491 |
+
"NeuronInstComb": 0.01214146614074707,
|
492 |
+
"NeuronLICM": 0.007807016372680664,
|
493 |
+
"NeuronLoopFusion": 0.014447927474975586,
|
494 |
+
"NeuronLoopInterchange": 0.0015079975128173828,
|
495 |
+
"NeuronSimplifier": 0.009031057357788086,
|
496 |
+
"NeuronSimplifyPredicates": 0.0026018619537353516,
|
497 |
+
"NeuronValueNumbering": 0.00443577766418457,
|
498 |
+
"OptimizeAliasedCopyChain": 0.0012700557708740234,
|
499 |
+
"OptimizeNKIKernels": 0.00177764892578125,
|
500 |
+
"PAGLayoutOpt": 0.3914484977722168,
|
501 |
+
"PComputeCutting": 0.005900144577026367,
|
502 |
+
"PGLayoutTilingPipeline": 1.2139532566070557,
|
503 |
+
"PGTiling": 0.2603449821472168,
|
504 |
+
"PadElimination": 0.00040340423583984375,
|
505 |
+
"ParAxesAnnotation": 0.2578258514404297,
|
506 |
+
"PartialLoopFusion": 0.010677099227905273,
|
507 |
+
"PartialSimdFusion": 0.011437177658081055,
|
508 |
+
"PerfectLoopNest": 0.001963376998901367,
|
509 |
+
"RecognizeOpIdiom": 0.004378318786621094,
|
510 |
+
"Recompute": 0.0002574920654296875,
|
511 |
+
"RelaxPredicates": 0.003600597381591797,
|
512 |
+
"Rematerialization": 0.004474163055419922,
|
513 |
+
"ReshapeWeights": 0.0006759166717529297,
|
514 |
+
"ResolveAccessConflict": 0.003798246383666992,
|
515 |
+
"ResolveComplicatePredicates": 0.002101421356201172,
|
516 |
+
"RewriteReplicationMatmul": 0.0012481212615966797,
|
517 |
+
"RewriteWeights": 0.004036903381347656,
|
518 |
+
"SFKVectorizer": 0.09602093696594238,
|
519 |
+
"SimpleAllReduceTiling": 0.0017740726470947266,
|
520 |
+
"Simplifier": 0.004450559616088867,
|
521 |
+
"SimplifyMacroPredicates": 0.010053157806396484,
|
522 |
+
"SimplifyNeuronTensor": 0.00724029541015625,
|
523 |
+
"SimplifySlice": 0.001153707504272461,
|
524 |
+
"SimplifyTensor": 0.005860805511474609,
|
525 |
+
"SpillPSum": 0.011501789093017578,
|
526 |
+
"SplitAPUnionSets": 0.03104996681213379,
|
527 |
+
"SplitAccGrp": 0.002181529998779297,
|
528 |
+
"StaticProfiler": 0.004481792449951172,
|
529 |
+
"StaticTransposeLocalTensor": 0.006117343902587891,
|
530 |
+
"SundaISel": 0.041422128677368164,
|
531 |
+
"TCTransform": 0.0022428035736083984,
|
532 |
+
"TensorInitialization": 0.00680994987487793,
|
533 |
+
"TensorOpSimplifier": 0.008346796035766602,
|
534 |
+
"TensorOpTransform": 0.030104398727416992,
|
535 |
+
"TileCCOps": 0.005553245544433594,
|
536 |
+
"TilingProfiler": 0.009899139404296875,
|
537 |
+
"TransformConvOp": 0.0027108192443847656,
|
538 |
+
"TritiumFusion": 0.020798206329345703,
|
539 |
+
"ValueNumbering": 0.003211498260498047,
|
540 |
+
"VectorizeDMA": 0.004341602325439453,
|
541 |
+
"VectorizeMatMult": 0.0021800994873046875,
|
542 |
+
"WeightCoalescing": 0.0030617713928222656,
|
543 |
+
"ZeroSizeTensorElimination": 0.00011968612670898438
|
544 |
+
},
|
545 |
+
"tensorizer": {
|
546 |
+
"DMATilingProfiler::TotalInstructionsAfterTiling": 1396.0,
|
547 |
+
"StaticProfiler::AifUb": 8.992382049560547,
|
548 |
+
"StaticProfiler::ArithmeticIntensityTensorizer": 75.54261016845703,
|
549 |
+
"StaticProfiler::AverageDmaLength": 9594.294921875,
|
550 |
+
"StaticProfiler::AverageFractalPeUtilization": 99.893310546875,
|
551 |
+
"StaticProfiler::AveragePartitionUtilization": 94.61784362792969,
|
552 |
+
"StaticProfiler::AveragePeUtilization": 99.893310546875,
|
553 |
+
"StaticProfiler::DDRTransferBytes": 79837440.0,
|
554 |
+
"StaticProfiler::InternalTransferBytes": 9797632.0,
|
555 |
+
"StaticProfiler::LoadExpanded": 11010.0,
|
556 |
+
"StaticProfiler::LocalizationEfficiency": 840.0734252929688,
|
557 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1527.2347412109375,
|
558 |
+
"StaticProfiler::StoreExpanded": 3073.0,
|
559 |
+
"StaticProfiler::TotalDMAExpanded": 14083.0,
|
560 |
+
"StaticProfiler::TotalDynamicInstancesCount": 1442.0,
|
561 |
+
"StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 1442.0,
|
562 |
+
"StaticProfiler::TotalLNCComm": 0.0,
|
563 |
+
"StaticProfiler::TotalLNCCommTransfer": 0.0,
|
564 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
|
565 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
|
566 |
+
"TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
|
567 |
+
"TilingProfiler::DmaInstructionsAfterTiling": 0.0,
|
568 |
+
"TilingProfiler::GenericInstructionsAfterTiling": 12.0,
|
569 |
+
"TilingProfiler::MatMultInstructionsAfterTiling": 821.0,
|
570 |
+
"TilingProfiler::NumPfTransposes": 6.0,
|
571 |
+
"TilingProfiler::NumPfTransposesForIo": 0.0,
|
572 |
+
"TilingProfiler::NumPfTransposesForLocal": 5.0,
|
573 |
+
"TilingProfiler::NumPfTransposesForNonlocal": 1.0,
|
574 |
+
"TilingProfiler::PfTransposeInstructions": 72.0,
|
575 |
+
"TilingProfiler::PfTransposeInstructionsForIo": 0.0,
|
576 |
+
"TilingProfiler::PfTransposeInstructionsForLocal": 56.0,
|
577 |
+
"TilingProfiler::PfTransposeInstructionsForNonlocal": 16.0,
|
578 |
+
"TilingProfiler::ReduceInstructionsAfterTiling": 0.0,
|
579 |
+
"TilingProfiler::SimdInstructionsAfterTiling": 101.0,
|
580 |
+
"TilingProfiler::TotalInstructionsAfterTiling": 0.0,
|
581 |
+
"TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
|
582 |
+
"TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
|
583 |
+
"TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
|
584 |
+
"TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
|
585 |
+
"TransformConvOp::conv2d_column_packing": 0.0,
|
586 |
+
"TransformConvOp::conv2d_column_packing_1": 0.0,
|
587 |
+
"TransformConvOp::conv2d_column_packing_io10": 0.0,
|
588 |
+
"TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
|
589 |
+
}
|
590 |
+
},
|
591 |
+
"sg0001": {
|
592 |
+
"compiletime": {
|
593 |
+
"AGOrderingAnalysisPass": 0.034732818603515625,
|
594 |
+
"AffinePredicateResolution": 0.0015087127685546875,
|
595 |
+
"AliasDependencyElimination": 0.0001227855682373047,
|
596 |
+
"AliasDependencyInduction": 0.008313894271850586,
|
597 |
+
"AliasDependencyReset": 0.044220924377441406,
|
598 |
+
"BFComputeCutting": 0.001974344253540039,
|
599 |
+
"BirCodeGenLoop": 0.03118896484375,
|
600 |
+
"CCOpFusion": 0.018246889114379883,
|
601 |
+
"CanonicalizeDAGForPGTiling": 0.003057718276977539,
|
602 |
+
"CanonicalizeIR": 0.0027036666870117188,
|
603 |
+
"CoalesceCCOp": 0.0046405792236328125,
|
604 |
+
"CommuteConcat": 0.0015790462493896484,
|
605 |
+
"DMALocalityOpt": 0.0015497207641601563,
|
606 |
+
"DMAProfiler": 0.004349708557128906,
|
607 |
+
"DMATilingProfiler": 0.003928422927856445,
|
608 |
+
"DataLocalityOpt": 0.12123703956604004,
|
609 |
+
"DataStreaming": 0.0025773048400878906,
|
610 |
+
"DeConcat": 0.0008485317230224609,
|
611 |
+
"DeadCodeElimination": 0.0012981891632080078,
|
612 |
+
"DeadStoreElimination": 0.034687042236328125,
|
613 |
+
"DelinearIndices": 0.009628772735595703,
|
614 |
+
"Delinearization": 0.003772258758544922,
|
615 |
+
"DoNothing": 7.009506225585938e-05,
|
616 |
+
"DramToDramTranspose": 0.028621673583984375,
|
617 |
+
"DumpGraphAndMetadata": 0.003651142120361328,
|
618 |
+
"EliminateDivs": 0.004262447357177734,
|
619 |
+
"ExpandBatchNorm": 0.002134084701538086,
|
620 |
+
"ExpandISAMacro": 0.0026290416717529297,
|
621 |
+
"FactorizeBlkDims": 0.009716033935546875,
|
622 |
+
"FactorizeThreadAxesInFreeDims": 0.0013210773468017578,
|
623 |
+
"FlattenMacroLoop": 0.002851247787475586,
|
624 |
+
"GenericAccessSimplifier": 0.002216815948486328,
|
625 |
+
"InferInitValue": 0.03134632110595703,
|
626 |
+
"InferIntrinsicOnCC": 0.011671781539916992,
|
627 |
+
"InferNeuronTensor": 0.039717674255371094,
|
628 |
+
"InferNonlocalTensors": 0.030872583389282227,
|
629 |
+
"InferPSumTensor": 0.022834062576293945,
|
630 |
+
"InlineNativeKernels": 0.0021605491638183594,
|
631 |
+
"InsertIOTransposes": 0.017906904220581055,
|
632 |
+
"InsertLocalTransposes": 0.007941961288452148,
|
633 |
+
"InsertOffloadedTransposes": 0.0032515525817871094,
|
634 |
+
"LICM": 0.003479480743408203,
|
635 |
+
"LateLegalizeInst": 0.003596782684326172,
|
636 |
+
"LateLegalizePostSplit": 0.002257108688354492,
|
637 |
+
"LateLowerReshapeOp": 0.0018393993377685547,
|
638 |
+
"LateLowerTensorOp": 0.005475044250488281,
|
639 |
+
"LateNeuronInstComb": 0.017774581909179688,
|
640 |
+
"LayoutPreprocessing": 0.03530263900756836,
|
641 |
+
"LayoutPreprocessingAndAnalysis": 0.11916303634643555,
|
642 |
+
"LayoutRequirementAnalysis": 0.007796525955200195,
|
643 |
+
"LegalizeCCOpLayout": 0.0019328594207763672,
|
644 |
+
"LegalizeOpLevelAlias": 0.001219034194946289,
|
645 |
+
"LegalizePartitionReduce": 0.0009839534759521484,
|
646 |
+
"LegalizeSundaAccess": 0.015137434005737305,
|
647 |
+
"LegalizeSundaMacro": 0.010521173477172852,
|
648 |
+
"LegalizeType": 0.004090547561645508,
|
649 |
+
"LocalLayoutOpt": 0.020325422286987305,
|
650 |
+
"LoopFusion": 0.006730556488037109,
|
651 |
+
"LoopSplitting": 0.00034809112548828125,
|
652 |
+
"LowerBroadcast": 0.001789093017578125,
|
653 |
+
"LowerCCOpBlockAxis": 0.005074977874755859,
|
654 |
+
"LowerComplexBroadcast": 0.0019309520721435547,
|
655 |
+
"LowerIntrinsics": 0.03209352493286133,
|
656 |
+
"LowerTensorOp": 0.012279510498046875,
|
657 |
+
"LowerTranspose": 0.010157585144042969,
|
658 |
+
"MacroGeneration": 0.09246373176574707,
|
659 |
+
"MaskPropagation": 0.003335237503051758,
|
660 |
+
"MemcpyElimination": 0.10414385795593262,
|
661 |
+
"MutateDataType": 0.00220489501953125,
|
662 |
+
"NeuronAliasDependencyInduction": 0.0002532005310058594,
|
663 |
+
"NeuronAliasDependencyReset": 0.03873252868652344,
|
664 |
+
"NeuronInstComb": 0.012767791748046875,
|
665 |
+
"NeuronLICM": 0.006428241729736328,
|
666 |
+
"NeuronLoopFusion": 0.01547694206237793,
|
667 |
+
"NeuronLoopInterchange": 0.0012590885162353516,
|
668 |
+
"NeuronSimplifier": 0.009620428085327148,
|
669 |
+
"NeuronSimplifyPredicates": 0.0022652149200439453,
|
670 |
+
"NeuronValueNumbering": 0.0031261444091796875,
|
671 |
+
"OptimizeAliasedCopyChain": 0.0007045269012451172,
|
672 |
+
"OptimizeNKIKernels": 0.0022683143615722656,
|
673 |
+
"PAGLayoutOpt": 0.11684298515319824,
|
674 |
+
"PComputeCutting": 0.0060575008392333984,
|
675 |
+
"PGLayoutTilingPipeline": 1.5194215774536133,
|
676 |
+
"PGTiling": 0.5792257785797119,
|
677 |
+
"PadElimination": 0.0004138946533203125,
|
678 |
+
"ParAxesAnnotation": 0.08577656745910645,
|
679 |
+
"PartialLoopFusion": 0.010853052139282227,
|
680 |
+
"PartialSimdFusion": 0.010831356048583984,
|
681 |
+
"PerfectLoopNest": 0.0021359920501708984,
|
682 |
+
"RecognizeOpIdiom": 0.004781246185302734,
|
683 |
+
"Recompute": 0.00029349327087402344,
|
684 |
+
"RelaxPredicates": 0.0031125545501708984,
|
685 |
+
"Rematerialization": 0.002535104751586914,
|
686 |
+
"ReshapeWeights": 0.0007915496826171875,
|
687 |
+
"ResolveAccessConflict": 0.004204988479614258,
|
688 |
+
"ResolveComplicatePredicates": 0.0014605522155761719,
|
689 |
+
"RewriteReplicationMatmul": 0.0014035701751708984,
|
690 |
+
"RewriteWeights": 0.0033304691314697266,
|
691 |
+
"SFKVectorizer": 0.11060166358947754,
|
692 |
+
"SimpleAllReduceTiling": 0.0013706684112548828,
|
693 |
+
"Simplifier": 0.004431247711181641,
|
694 |
+
"SimplifyMacroPredicates": 0.005709409713745117,
|
695 |
+
"SimplifyNeuronTensor": 0.005321979522705078,
|
696 |
+
"SimplifySlice": 0.0020780563354492188,
|
697 |
+
"SimplifyTensor": 0.00576329231262207,
|
698 |
+
"SpillPSum": 0.01259160041809082,
|
699 |
+
"SplitAPUnionSets": 0.009907007217407227,
|
700 |
+
"SplitAccGrp": 0.0010552406311035156,
|
701 |
+
"StaticProfiler": 0.0033452510833740234,
|
702 |
+
"StaticTransposeLocalTensor": 0.005699634552001953,
|
703 |
+
"SundaISel": 0.04179859161376953,
|
704 |
+
"TCTransform": 0.0024602413177490234,
|
705 |
+
"TensorInitialization": 0.0022628307342529297,
|
706 |
+
"TensorOpSimplifier": 0.006663322448730469,
|
707 |
+
"TensorOpTransform": 0.03399252891540527,
|
708 |
+
"TileCCOps": 0.0057027339935302734,
|
709 |
+
"TilingProfiler": 0.01235508918762207,
|
710 |
+
"TransformConvOp": 0.0025110244750976563,
|
711 |
+
"TritiumFusion": 0.04343080520629883,
|
712 |
+
"ValueNumbering": 0.0041046142578125,
|
713 |
+
"VectorizeDMA": 0.0033228397369384766,
|
714 |
+
"VectorizeMatMult": 0.0033872127532958984,
|
715 |
+
"WeightCoalescing": 0.0023338794708251953,
|
716 |
+
"ZeroSizeTensorElimination": 0.00011372566223144531
|
717 |
+
},
|
718 |
+
"tensorizer": {
|
719 |
+
"DMATilingProfiler::TotalInstructionsAfterTiling": 5268.0,
|
720 |
+
"StaticProfiler::AifUb": 127.58392333984375,
|
721 |
+
"StaticProfiler::ArithmeticIntensityTensorizer": 129.38287353515625,
|
722 |
+
"StaticProfiler::AverageDmaLength": 6718.79638671875,
|
723 |
+
"StaticProfiler::AverageFractalPeUtilization": 100.0,
|
724 |
+
"StaticProfiler::AveragePartitionUtilization": 99.92172241210938,
|
725 |
+
"StaticProfiler::AveragePeUtilization": 100.0,
|
726 |
+
"StaticProfiler::DDRTransferBytes": 198661120.0,
|
727 |
+
"StaticProfiler::InternalTransferBytes": 10321920.0,
|
728 |
+
"StaticProfiler::LoadExpanded": 27264.0,
|
729 |
+
"StaticProfiler::LocalizationEfficiency": 101.41001892089844,
|
730 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 103.59725189208984,
|
731 |
+
"StaticProfiler::StoreExpanded": 1153.0,
|
732 |
+
"StaticProfiler::TotalDMAExpanded": 28417.0,
|
733 |
+
"StaticProfiler::TotalDynamicInstancesCount": 5111.0,
|
734 |
+
"StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 5111.0,
|
735 |
+
"StaticProfiler::TotalLNCComm": 0.0,
|
736 |
+
"StaticProfiler::TotalLNCCommTransfer": 0.0,
|
737 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
|
738 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
|
739 |
+
"TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
|
740 |
+
"TilingProfiler::DmaInstructionsAfterTiling": 0.0,
|
741 |
+
"TilingProfiler::GenericInstructionsAfterTiling": 8.0,
|
742 |
+
"TilingProfiler::MatMultInstructionsAfterTiling": 4276.0,
|
743 |
+
"TilingProfiler::NumPfTransposes": 8.0,
|
744 |
+
"TilingProfiler::NumPfTransposesForIo": 3.0,
|
745 |
+
"TilingProfiler::NumPfTransposesForLocal": 3.0,
|
746 |
+
"TilingProfiler::NumPfTransposesForNonlocal": 2.0,
|
747 |
+
"TilingProfiler::PfTransposeInstructions": 122.0,
|
748 |
+
"TilingProfiler::PfTransposeInstructionsForIo": 34.0,
|
749 |
+
"TilingProfiler::PfTransposeInstructionsForLocal": 24.0,
|
750 |
+
"TilingProfiler::PfTransposeInstructionsForNonlocal": 64.0,
|
751 |
+
"TilingProfiler::ReduceInstructionsAfterTiling": 0.0,
|
752 |
+
"TilingProfiler::SimdInstructionsAfterTiling": 180.0,
|
753 |
+
"TilingProfiler::TotalInstructionsAfterTiling": 0.0,
|
754 |
+
"TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
|
755 |
+
"TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
|
756 |
+
"TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
|
757 |
+
"TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
|
758 |
+
"TransformConvOp::conv2d_column_packing": 0.0,
|
759 |
+
"TransformConvOp::conv2d_column_packing_1": 0.0,
|
760 |
+
"TransformConvOp::conv2d_column_packing_io10": 0.0,
|
761 |
+
"TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
|
762 |
+
}
|
763 |
+
},
|
764 |
+
"sg0002": {
|
765 |
+
"compiletime": {
|
766 |
+
"AGOrderingAnalysisPass": 0.018787622451782227,
|
767 |
+
"AffinePredicateResolution": 0.0011818408966064453,
|
768 |
+
"AliasDependencyElimination": 0.00011801719665527344,
|
769 |
+
"AliasDependencyInduction": 0.005483388900756836,
|
770 |
+
"AliasDependencyReset": 0.026019811630249023,
|
771 |
+
"BFComputeCutting": 0.00225830078125,
|
772 |
+
"BirCodeGenLoop": 0.4621126651763916,
|
773 |
+
"CCOpFusion": 0.01928091049194336,
|
774 |
+
"CanonicalizeDAGForPGTiling": 0.004612922668457031,
|
775 |
+
"CanonicalizeIR": 0.0017774105072021484,
|
776 |
+
"CoalesceCCOp": 0.014393329620361328,
|
777 |
+
"CommuteConcat": 0.0020241737365722656,
|
778 |
+
"DMALocalityOpt": 0.0052585601806640625,
|
779 |
+
"DMAProfiler": 0.011700868606567383,
|
780 |
+
"DMATilingProfiler": 0.004782676696777344,
|
781 |
+
"DataLocalityOpt": 0.06629562377929688,
|
782 |
+
"DataStreaming": 0.03746771812438965,
|
783 |
+
"DeConcat": 0.0006563663482666016,
|
784 |
+
"DeadCodeElimination": 0.002358675003051758,
|
785 |
+
"DeadStoreElimination": 0.0055620670318603516,
|
786 |
+
"DelinearIndices": 0.004741668701171875,
|
787 |
+
"Delinearization": 0.0036110877990722656,
|
788 |
+
"DoNothing": 8.368492126464844e-05,
|
789 |
+
"DramToDramTranspose": 0.016016721725463867,
|
790 |
+
"DumpGraphAndMetadata": 0.0853111743927002,
|
791 |
+
"EliminateDivs": 0.0025675296783447266,
|
792 |
+
"ExpandBatchNorm": 0.002092123031616211,
|
793 |
+
"ExpandISAMacro": 0.010552406311035156,
|
794 |
+
"FactorizeBlkDims": 0.0076830387115478516,
|
795 |
+
"FactorizeThreadAxesInFreeDims": 0.002122640609741211,
|
796 |
+
"FlattenMacroLoop": 0.002187013626098633,
|
797 |
+
"GenericAccessSimplifier": 0.0009529590606689453,
|
798 |
+
"InferInitValue": 0.0242159366607666,
|
799 |
+
"InferIntrinsicOnCC": 0.009269952774047852,
|
800 |
+
"InferNeuronTensor": 0.020155906677246094,
|
801 |
+
"InferNonlocalTensors": 0.015646696090698242,
|
802 |
+
"InferPSumTensor": 0.3076965808868408,
|
803 |
+
"InlineNativeKernels": 0.009155511856079102,
|
804 |
+
"InsertIOTransposes": 0.015281438827514648,
|
805 |
+
"InsertLocalTransposes": 0.006501436233520508,
|
806 |
+
"InsertOffloadedTransposes": 0.002702474594116211,
|
807 |
+
"LICM": 0.002913951873779297,
|
808 |
+
"LateLegalizeInst": 0.013724088668823242,
|
809 |
+
"LateLegalizePostSplit": 0.012693405151367188,
|
810 |
+
"LateLowerReshapeOp": 0.0025734901428222656,
|
811 |
+
"LateLowerTensorOp": 0.001531362533569336,
|
812 |
+
"LateNeuronInstComb": 0.008354902267456055,
|
813 |
+
"LayoutPreprocessing": 0.026634931564331055,
|
814 |
+
"LayoutPreprocessingAndAnalysis": 0.5595176219940186,
|
815 |
+
"LayoutRequirementAnalysis": 0.005538463592529297,
|
816 |
+
"LegalizeCCOpLayout": 0.0022728443145751953,
|
817 |
+
"LegalizeOpLevelAlias": 0.001255035400390625,
|
818 |
+
"LegalizePartitionReduce": 0.001256704330444336,
|
819 |
+
"LegalizeSundaAccess": 0.07487797737121582,
|
820 |
+
"LegalizeSundaMacro": 0.010920286178588867,
|
821 |
+
"LegalizeType": 0.012901067733764648,
|
822 |
+
"LocalLayoutOpt": 0.012011289596557617,
|
823 |
+
"LoopFusion": 0.006572723388671875,
|
824 |
+
"LoopSplitting": 0.0003001689910888672,
|
825 |
+
"LowerBroadcast": 0.0016355514526367188,
|
826 |
+
"LowerCCOpBlockAxis": 0.0050678253173828125,
|
827 |
+
"LowerComplexBroadcast": 0.0025262832641601563,
|
828 |
+
"LowerIntrinsics": 0.30371904373168945,
|
829 |
+
"LowerTensorOp": 0.011744022369384766,
|
830 |
+
"LowerTranspose": 0.011518478393554688,
|
831 |
+
"MacroGeneration": 0.026911020278930664,
|
832 |
+
"MaskPropagation": 0.0031325817108154297,
|
833 |
+
"MemcpyElimination": 0.027472257614135742,
|
834 |
+
"MutateDataType": 0.0015196800231933594,
|
835 |
+
"NeuronAliasDependencyInduction": 0.00016927719116210938,
|
836 |
+
"NeuronAliasDependencyReset": 0.0242006778717041,
|
837 |
+
"NeuronInstComb": 0.004147529602050781,
|
838 |
+
"NeuronLICM": 0.036264657974243164,
|
839 |
+
"NeuronLoopFusion": 0.00889277458190918,
|
840 |
+
"NeuronLoopInterchange": 0.002141237258911133,
|
841 |
+
"NeuronSimplifier": 0.00720524787902832,
|
842 |
+
"NeuronSimplifyPredicates": 0.11929655075073242,
|
843 |
+
"NeuronValueNumbering": 0.003022432327270508,
|
844 |
+
"OptimizeAliasedCopyChain": 0.0006387233734130859,
|
845 |
+
"OptimizeNKIKernels": 0.5260024070739746,
|
846 |
+
"PAGLayoutOpt": 0.5680239200592041,
|
847 |
+
"PComputeCutting": 0.0048143863677978516,
|
848 |
+
"PGLayoutTilingPipeline": 1.6304676532745361,
|
849 |
+
"PGTiling": 0.1616363525390625,
|
850 |
+
"PadElimination": 0.0003521442413330078,
|
851 |
+
"ParAxesAnnotation": 0.0544736385345459,
|
852 |
+
"PartialLoopFusion": 0.005907773971557617,
|
853 |
+
"PartialSimdFusion": 0.0038967132568359375,
|
854 |
+
"PerfectLoopNest": 0.0021576881408691406,
|
855 |
+
"RecognizeOpIdiom": 0.0039520263671875,
|
856 |
+
"Recompute": 0.0002884864807128906,
|
857 |
+
"RelaxPredicates": 0.013870716094970703,
|
858 |
+
"Rematerialization": 0.0024657249450683594,
|
859 |
+
"ReshapeWeights": 0.0006930828094482422,
|
860 |
+
"ResolveAccessConflict": 0.0038983821868896484,
|
861 |
+
"ResolveComplicatePredicates": 0.0012950897216796875,
|
862 |
+
"RewriteReplicationMatmul": 0.002060413360595703,
|
863 |
+
"RewriteWeights": 0.0028791427612304688,
|
864 |
+
"SFKVectorizer": 0.28761887550354004,
|
865 |
+
"SimpleAllReduceTiling": 0.008704662322998047,
|
866 |
+
"Simplifier": 0.003449678421020508,
|
867 |
+
"SimplifyMacroPredicates": 0.010317325592041016,
|
868 |
+
"SimplifyNeuronTensor": 1.0378923416137695,
|
869 |
+
"SimplifySlice": 0.0008852481842041016,
|
870 |
+
"SimplifyTensor": 0.005218982696533203,
|
871 |
+
"SpillPSum": 0.009551286697387695,
|
872 |
+
"SplitAPUnionSets": 0.10591006278991699,
|
873 |
+
"SplitAccGrp": 0.0011169910430908203,
|
874 |
+
"StaticProfiler": 0.01290583610534668,
|
875 |
+
"StaticTransposeLocalTensor": 0.003824472427368164,
|
876 |
+
"SundaISel": 0.041872262954711914,
|
877 |
+
"TCTransform": 0.0008666515350341797,
|
878 |
+
"TensorInitialization": 0.013058185577392578,
|
879 |
+
"TensorOpSimplifier": 0.0061550140380859375,
|
880 |
+
"TensorOpTransform": 0.020328521728515625,
|
881 |
+
"TileCCOps": 0.006834983825683594,
|
882 |
+
"TilingProfiler": 0.0072863101959228516,
|
883 |
+
"TransformConvOp": 0.0032320022583007813,
|
884 |
+
"TritiumFusion": 0.03062152862548828,
|
885 |
+
"ValueNumbering": 0.0023603439331054688,
|
886 |
+
"VectorizeDMA": 0.004430294036865234,
|
887 |
+
"VectorizeMatMult": 0.0021605491638183594,
|
888 |
+
"WeightCoalescing": 0.00825953483581543,
|
889 |
+
"ZeroSizeTensorElimination": 0.00011014938354492188
|
890 |
+
},
|
891 |
+
"tensorizer": {
|
892 |
+
"DMATilingProfiler::TotalInstructionsAfterTiling": 42834.0,
|
893 |
+
"StaticProfiler::AifUb": 129.43267822265625,
|
894 |
+
"StaticProfiler::ArithmeticIntensityTensorizer": 128.19729614257813,
|
895 |
+
"StaticProfiler::AverageDmaLength": 4810.17578125,
|
896 |
+
"StaticProfiler::AverageFractalPeUtilization": 99.65389251708984,
|
897 |
+
"StaticProfiler::AveragePartitionUtilization": 97.55139923095703,
|
898 |
+
"StaticProfiler::AveragePeUtilization": 98.60253143310547,
|
899 |
+
"StaticProfiler::DDRTransferBytes": 782946624.0,
|
900 |
+
"StaticProfiler::InternalTransferBytes": 629086720.0,
|
901 |
+
"StaticProfiler::LoadExpanded": 97814.0,
|
902 |
+
"StaticProfiler::LocalizationEfficiency": 99.04553985595703,
|
903 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 100.20111846923828,
|
904 |
+
"StaticProfiler::StoreExpanded": 1757.0,
|
905 |
+
"StaticProfiler::TotalDMAExpanded": 99571.0,
|
906 |
+
"StaticProfiler::TotalDynamicInstancesCount": 50031.0,
|
907 |
+
"StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 49585.0,
|
908 |
+
"StaticProfiler::TotalLNCComm": 0.0,
|
909 |
+
"StaticProfiler::TotalLNCCommTransfer": 0.0,
|
910 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
|
911 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
|
912 |
+
"TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
|
913 |
+
"TilingProfiler::DmaInstructionsAfterTiling": 0.0,
|
914 |
+
"TilingProfiler::GenericInstructionsAfterTiling": 4.0,
|
915 |
+
"TilingProfiler::MatMultInstructionsAfterTiling": 22464.0,
|
916 |
+
"TilingProfiler::NumPfTransposes": 5.0,
|
917 |
+
"TilingProfiler::NumPfTransposesForIo": 1.0,
|
918 |
+
"TilingProfiler::NumPfTransposesForLocal": 1.0,
|
919 |
+
"TilingProfiler::NumPfTransposesForNonlocal": 3.0,
|
920 |
+
"TilingProfiler::PfTransposeInstructions": 19105.0,
|
921 |
+
"TilingProfiler::PfTransposeInstructionsForIo": 19008.0,
|
922 |
+
"TilingProfiler::PfTransposeInstructionsForLocal": 1.0,
|
923 |
+
"TilingProfiler::PfTransposeInstructionsForNonlocal": 96.0,
|
924 |
+
"TilingProfiler::ReduceInstructionsAfterTiling": 4.0,
|
925 |
+
"TilingProfiler::SimdInstructionsAfterTiling": 158.0,
|
926 |
+
"TilingProfiler::TotalInstructionsAfterTiling": 0.0,
|
927 |
+
"TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
|
928 |
+
"TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
|
929 |
+
"TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
|
930 |
+
"TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
|
931 |
+
"TransformConvOp::conv2d_column_packing": 0.0,
|
932 |
+
"TransformConvOp::conv2d_column_packing_1": 0.0,
|
933 |
+
"TransformConvOp::conv2d_column_packing_io10": 0.0,
|
934 |
+
"TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
|
935 |
+
}
|
936 |
+
},
|
937 |
+
"sg01": {
|
938 |
+
"compiletime": {
|
939 |
+
"CanonicalizeConv": 2.300000051036477e-05,
|
940 |
+
"CanonicalizeForTensorizer": 1.2999999853491317e-05,
|
941 |
+
"Canonicalizer": 0.00028199999360367656,
|
942 |
+
"HoistCompute": 3.000000106112566e-06,
|
943 |
+
"IdentifyCrossPassTensors": 2.4000000848900527e-05,
|
944 |
+
"MemcastMotion": 9.999999747378752e-06,
|
945 |
+
"PenguinizeFunctions": 1.4000000192027073e-05,
|
946 |
+
"PruneFunctions": 1.2000000424450263e-05,
|
947 |
+
"RemoveOptimizationBarriers": 2.2000000171829015e-05,
|
948 |
+
"ScatterMotion": 1.700000029813964e-05,
|
949 |
+
"TensorizerLegalizationPass": 3.400000059627928e-05,
|
950 |
+
"VerifySupportedOps": 1.1000000085914508e-05,
|
951 |
+
"algsimp": 6.500000017695129e-05,
|
952 |
+
"batchnorm_expander": 1.2000000424450263e-05,
|
953 |
+
"boundary-marker-removal": 3.999999989900971e-06,
|
954 |
+
"call-inliner": 9.999999747378752e-06,
|
955 |
+
"canonicalize-boundary-marker": 4.999999873689376e-06,
|
956 |
+
"collective-stream-id-checker": 3.999999989900971e-06,
|
957 |
+
"comparison-expander": 4.999999873689376e-06,
|
958 |
+
"computation-deduplicator": 2.099999983329326e-05,
|
959 |
+
"conditional-to-select": 4.999999873689376e-06,
|
960 |
+
"config-lowering": 2.9000000722589903e-05,
|
961 |
+
"constant_folding": 9.000000318337698e-06,
|
962 |
+
"cse": 1.2000000424450263e-05,
|
963 |
+
"dce": 9.999999974752427e-07,
|
964 |
+
"dynamic-slice-transpose": 3.999999989900971e-06,
|
965 |
+
"eliminate-redundant-compare": 3.999999989900971e-06,
|
966 |
+
"emit-offloaded-dropout": 1.2999999853491317e-05,
|
967 |
+
"flatten-call-graph": 9.000000318337698e-06,
|
968 |
+
"fuse-send-recv": 2.9000000722589903e-05,
|
969 |
+
"hilo::LegalizeAlias": 4.999999873689376e-06,
|
970 |
+
"hilo::NeuronInstCombine": 3.5000000934815034e-05,
|
971 |
+
"hilo::NeuronOpFusion": 1.2999999853491317e-05,
|
972 |
+
"hilo::ReplaceTokenTypeWithU8Pass": 1.5999999959603883e-05,
|
973 |
+
"hilo::ScheduleFusion": 9.999999974752427e-07,
|
974 |
+
"hilo::SixtyFourHack": 9.999999747378752e-06,
|
975 |
+
"hilo::VerifyAliasing": 1.9999999949504854e-06,
|
976 |
+
"hlo-mac-count": 2.5999999706982635e-05,
|
977 |
+
"hlo-verifier": 0.00018899999849963933,
|
978 |
+
"legalize-ccops": 9.999999974752427e-07,
|
979 |
+
"legalize-compare": 3.999999989900971e-06,
|
980 |
+
"lower-argminmax-custom-call": 3.999999989900971e-06,
|
981 |
+
"map-inline": 1.2000000424450263e-05,
|
982 |
+
"metadata-naming": 2.4000000848900527e-05,
|
983 |
+
"mlir::detail::OpToOpPassAdaptor": 2.2000000171829015e-05,
|
984 |
+
"mlir::hlo::MhloToPyPenguin": 0.0008980000275187194,
|
985 |
+
"mlir::mhlo::LowerComplexExtraPass": 7.999999797903001e-05,
|
986 |
+
"mlir::mhlo::LowerComplexPass": 0.00013499999477062374,
|
987 |
+
"native-to-custom-softmax": 7.000000096013537e-06,
|
988 |
+
"native-to-custom-softmax-dx": 1.5999999959603883e-05,
|
989 |
+
"operand_upcaster": 1.8999999156221747e-05,
|
990 |
+
"post-par-pipe-begin": 1.9999999949504854e-06,
|
991 |
+
"post-par-pipe-end": 0.0,
|
992 |
+
"post-partition-simplification": 0.0005530000198632479,
|
993 |
+
"replace-minimum-constant": 6.000000212225132e-06,
|
994 |
+
"reshape-mover": 3.000000106112566e-06,
|
995 |
+
"simplify-concat": 4.8999998398358e-05,
|
996 |
+
"simplify-while-loops": 1.9999999949504854e-06,
|
997 |
+
"transform-variadic-reduce": 9.000000318337698e-06,
|
998 |
+
"tuple-simplifier": 4.999999873689376e-06,
|
999 |
+
"unpack-nested-aws-ntwsr": 3.999999989900971e-06,
|
1000 |
+
"unroll-while-loop": 9.999999974752427e-07
|
1001 |
+
},
|
1002 |
+
"hilo": {
|
1003 |
+
"ArithmeticIntensity": 123.27030181884766,
|
1004 |
+
"HloMacCount": 12415139840.0,
|
1005 |
+
"Traffic": 201429536.0
|
1006 |
+
}
|
1007 |
+
},
|
1008 |
+
"sg02": {
|
1009 |
+
"compiletime": {
|
1010 |
+
"CanonicalizeConv": 0.0,
|
1011 |
+
"CanonicalizeForTensorizer": 1.2999999853491317e-05,
|
1012 |
+
"Canonicalizer": 0.00033599999733269215,
|
1013 |
+
"HoistCompute": 0.0,
|
1014 |
+
"IdentifyCrossPassTensors": 2.2000000171829015e-05,
|
1015 |
+
"MemcastMotion": 0.0,
|
1016 |
+
"PenguinizeFunctions": 9.000000318337698e-06,
|
1017 |
+
"PruneFunctions": 9.999999747378752e-06,
|
1018 |
+
"RemoveOptimizationBarriers": 1.8999999156221747e-05,
|
1019 |
+
"ScatterMotion": 0.0,
|
1020 |
+
"TensorizerLegalizationPass": 6.000000212225132e-06,
|
1021 |
+
"VerifySupportedOps": 1.1000000085914508e-05,
|
1022 |
+
"algsimp": 5.999999848427251e-05,
|
1023 |
+
"batchnorm_expander": 1.1000000085914508e-05,
|
1024 |
+
"boundary-marker-removal": 3.999999989900971e-06,
|
1025 |
+
"call-inliner": 1.1000000085914508e-05,
|
1026 |
+
"canonicalize-boundary-marker": 3.999999989900971e-06,
|
1027 |
+
"collective-stream-id-checker": 1.9999999949504854e-06,
|
1028 |
+
"comparison-expander": 4.999999873689376e-06,
|
1029 |
+
"computation-deduplicator": 2.099999983329326e-05,
|
1030 |
+
"conditional-to-select": 6.000000212225132e-06,
|
1031 |
+
"config-lowering": 2.5999999706982635e-05,
|
1032 |
+
"constant_folding": 9.000000318337698e-06,
|
1033 |
+
"cse": 1.1000000085914508e-05,
|
1034 |
+
"dce": 9.999999974752427e-07,
|
1035 |
+
"dynamic-slice-transpose": 3.999999989900971e-06,
|
1036 |
+
"eliminate-redundant-compare": 3.000000106112566e-06,
|
1037 |
+
"emit-offloaded-dropout": 1.2000000424450263e-05,
|
1038 |
+
"flatten-call-graph": 1.2000000424450263e-05,
|
1039 |
+
"fuse-send-recv": 1.8000000636675395e-05,
|
1040 |
+
"hilo::LegalizeAlias": 1.9999999949504854e-06,
|
1041 |
+
"hilo::NeuronInstCombine": 4.999999873689376e-06,
|
1042 |
+
"hilo::NeuronOpFusion": 2.8000000384054147e-05,
|
1043 |
+
"hilo::ReplaceTokenTypeWithU8Pass": 7.000000096013537e-06,
|
1044 |
+
"hilo::ScheduleFusion": 0.0,
|
1045 |
+
"hilo::SixtyFourHack": 3.7000001611886546e-05,
|
1046 |
+
"hilo::VerifyAliasing": 9.999999974752427e-07,
|
1047 |
+
"hlo-mac-count": 0.00016900000628083944,
|
1048 |
+
"hlo-verifier": 0.00015799999528098851,
|
1049 |
+
"legalize-ccops": 9.999999974752427e-07,
|
1050 |
+
"legalize-compare": 3.000000106112566e-06,
|
1051 |
+
"lower-argminmax-custom-call": 3.999999989900971e-06,
|
1052 |
+
"map-inline": 1.2999999853491317e-05,
|
1053 |
+
"metadata-naming": 1.5999999959603883e-05,
|
1054 |
+
"mlir::detail::OpToOpPassAdaptor": 2.700000004551839e-05,
|
1055 |
+
"mlir::hlo::MhloToPyPenguin": 0.0008440000237897038,
|
1056 |
+
"mlir::mhlo::LowerComplexExtraPass": 6.900000153109431e-05,
|
1057 |
+
"mlir::mhlo::LowerComplexPass": 1.5999999959603883e-05,
|
1058 |
+
"native-to-custom-softmax": 6.000000212225132e-06,
|
1059 |
+
"native-to-custom-softmax-dx": 1.9999999494757503e-05,
|
1060 |
+
"operand_upcaster": 1.2000000424450263e-05,
|
1061 |
+
"post-par-pipe-begin": 9.999999974752427e-07,
|
1062 |
+
"post-par-pipe-end": 0.0,
|
1063 |
+
"post-partition-simplification": 0.000526999996509403,
|
1064 |
+
"replace-minimum-constant": 9.000000318337698e-06,
|
1065 |
+
"reshape-mover": 3.000000106112566e-06,
|
1066 |
+
"simplify-concat": 4.400000034365803e-05,
|
1067 |
+
"simplify-while-loops": 1.9999999949504854e-06,
|
1068 |
+
"transform-variadic-reduce": 4.8999998398358e-05,
|
1069 |
+
"tuple-simplifier": 4.999999873689376e-06,
|
1070 |
+
"unpack-nested-aws-ntwsr": 3.999999989900971e-06,
|
1071 |
+
"unroll-while-loop": 9.999999974752427e-07
|
1072 |
+
},
|
1073 |
+
"hilo": {
|
1074 |
+
"ArithmeticIntensity": 25.691875457763672,
|
1075 |
+
"HloMacCount": 9974841344.0,
|
1076 |
+
"Traffic": 776497728.0
|
1077 |
+
}
|
1078 |
+
}
|
1079 |
+
}
|
context_encoding_model/_tp0_bk0/graph.neff
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c0aeace703e08ac36bdcb2027d9a278403cb96ef39f48bddc999b077215e8a36
|
3 |
+
size 1557504
|
context_encoding_model/_tp0_bk0/log-neuron-cc.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
context_encoding_model/_tp0_bk0/metaneff.pb
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0aef68f833b52be82fd0e17410bcfd279e5719338cb746c0619d5139fc4a3d02
|
3 |
+
size 1042690
|
context_encoding_model/_tp0_bk0/model.MODULE_f4171003694760566af4+a9cd68fb.hlo_module.pb
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d89b9e073981a0b1b7d0bbd0a24f147e9df13c5706d9d6be9971b857124c9496
|
3 |
+
size 1119812
|
context_encoding_model/_tp0_bk0/model.MODULE_f4171003694760566af4+a9cd68fb.neff
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c0aeace703e08ac36bdcb2027d9a278403cb96ef39f48bddc999b077215e8a36
|
3 |
+
size 1557504
|
context_encoding_model/_tp0_bk0/neuron_config.json
ADDED
@@ -0,0 +1,220 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_attn_implementation_autoset": false,
|
3 |
+
"_name_or_path": "Qwen/Qwen3-8B",
|
4 |
+
"add_cross_attention": false,
|
5 |
+
"architectures": [
|
6 |
+
"Qwen3ForCausalLM"
|
7 |
+
],
|
8 |
+
"attention_bias": false,
|
9 |
+
"attention_dropout": 0.0,
|
10 |
+
"attribute_map": {},
|
11 |
+
"bad_words_ids": null,
|
12 |
+
"begin_suppress_tokens": null,
|
13 |
+
"bos_token_id": 151643,
|
14 |
+
"chunk_size_feed_forward": 0,
|
15 |
+
"cross_attention_hidden_size": null,
|
16 |
+
"decoder_start_token_id": null,
|
17 |
+
"diversity_penalty": 0.0,
|
18 |
+
"do_sample": false,
|
19 |
+
"early_stopping": false,
|
20 |
+
"encoder_no_repeat_ngram_size": 0,
|
21 |
+
"eos_token_id": 151645,
|
22 |
+
"exponential_decay_length_penalty": null,
|
23 |
+
"finetuning_task": null,
|
24 |
+
"forced_bos_token_id": null,
|
25 |
+
"forced_eos_token_id": null,
|
26 |
+
"fused_spec_config": null,
|
27 |
+
"head_dim": 128,
|
28 |
+
"hidden_act": "silu",
|
29 |
+
"hidden_size": 4096,
|
30 |
+
"id2label": {
|
31 |
+
"0": "LABEL_0",
|
32 |
+
"1": "LABEL_1"
|
33 |
+
},
|
34 |
+
"initializer_range": 0.02,
|
35 |
+
"intermediate_size": 12288,
|
36 |
+
"is_decoder": false,
|
37 |
+
"is_encoder_decoder": false,
|
38 |
+
"label2id": {
|
39 |
+
"LABEL_0": 0,
|
40 |
+
"LABEL_1": 1
|
41 |
+
},
|
42 |
+
"length_penalty": 1.0,
|
43 |
+
"max_length": 20,
|
44 |
+
"max_position_embeddings": 40960,
|
45 |
+
"max_window_layers": 36,
|
46 |
+
"metadata": null,
|
47 |
+
"min_length": 0,
|
48 |
+
"model_type": "qwen3",
|
49 |
+
"neuron_config": {
|
50 |
+
"activation_quantization_type": null,
|
51 |
+
"allow_input_truncation": false,
|
52 |
+
"apply_seq_ids_mask": false,
|
53 |
+
"async_mode": false,
|
54 |
+
"attention_dp_degree": 1,
|
55 |
+
"attention_dtype": null,
|
56 |
+
"attn_block_cte_nki_kernel_enabled": false,
|
57 |
+
"attn_block_tkg_nki_kernel_cache_update": false,
|
58 |
+
"attn_block_tkg_nki_kernel_enabled": false,
|
59 |
+
"attn_cls": {
|
60 |
+
"__module__": "neuronx_distributed_inference.models.qwen3.modeling_qwen3",
|
61 |
+
"__name__": "NeuronQwen3Attention"
|
62 |
+
},
|
63 |
+
"attn_kernel_enabled": null,
|
64 |
+
"attn_tkg_builtin_kernel_enabled": false,
|
65 |
+
"attn_tkg_nki_kernel_enabled": false,
|
66 |
+
"batch_size": 1,
|
67 |
+
"bucket_n_active_tokens": true,
|
68 |
+
"buckets": [
|
69 |
+
128
|
70 |
+
],
|
71 |
+
"cast_type": "config",
|
72 |
+
"cc_pipeline_tiling_factor": 2,
|
73 |
+
"chunked_prefill_config": null,
|
74 |
+
"context_encoding_buckets": [
|
75 |
+
128
|
76 |
+
],
|
77 |
+
"cp_degree": 1,
|
78 |
+
"ctx_batch_size": 1,
|
79 |
+
"disable_kv_cache_tiling": false,
|
80 |
+
"draft_model_modules_to_not_convert": null,
|
81 |
+
"enable_bucketing": true,
|
82 |
+
"enable_eagle_draft_input_norm": false,
|
83 |
+
"enable_eagle_speculation": false,
|
84 |
+
"enable_fused_speculation": false,
|
85 |
+
"enable_long_context_mode": false,
|
86 |
+
"enable_output_completion_notifications": false,
|
87 |
+
"enable_spill_reload_dge": false,
|
88 |
+
"enable_token_tree": false,
|
89 |
+
"ep_degree": 1,
|
90 |
+
"expert_mlp_nki_kernel_enabled": null,
|
91 |
+
"flash_decoding_enabled": false,
|
92 |
+
"fused_qkv": false,
|
93 |
+
"fused_rmsnorm_skip_gamma": false,
|
94 |
+
"is_block_kv_layout": null,
|
95 |
+
"is_chunked_prefill": false,
|
96 |
+
"is_continuous_batching": true,
|
97 |
+
"is_eagle_draft": false,
|
98 |
+
"is_medusa": false,
|
99 |
+
"is_prefill_stage": true,
|
100 |
+
"is_prefix_caching": false,
|
101 |
+
"k_cache_transposed": false,
|
102 |
+
"kv_cache_batch_size": 1,
|
103 |
+
"kv_cache_padding_size": 0,
|
104 |
+
"kv_cache_quant": false,
|
105 |
+
"kv_cache_tiling": false,
|
106 |
+
"layer_boundary_markers": false,
|
107 |
+
"lm_head_pad": false,
|
108 |
+
"lm_head_pad_alignment_size": 1,
|
109 |
+
"local_ranks_size": 2,
|
110 |
+
"logical_nc_config": 1,
|
111 |
+
"lora_config": null,
|
112 |
+
"max_batch_size": 1,
|
113 |
+
"max_context_length": 1024,
|
114 |
+
"max_length": 1024,
|
115 |
+
"max_new_tokens": null,
|
116 |
+
"medusa_speculation_length": 0,
|
117 |
+
"medusa_tree": null,
|
118 |
+
"mlp_kernel_enabled": false,
|
119 |
+
"mlp_kernel_fuse_residual_add": false,
|
120 |
+
"modules_to_not_convert": null,
|
121 |
+
"moe_fused_nki_kernel_enabled": null,
|
122 |
+
"n_active_tokens": 1024,
|
123 |
+
"n_positions": 1024,
|
124 |
+
"num_medusa_heads": 0,
|
125 |
+
"on_cpu": false,
|
126 |
+
"on_device_sampling_config": {
|
127 |
+
"deterministic": false,
|
128 |
+
"do_sample": false,
|
129 |
+
"dynamic": true,
|
130 |
+
"global_topk": 256,
|
131 |
+
"on_device_sampling_config": true,
|
132 |
+
"temperature": 1.0,
|
133 |
+
"top_k": 1,
|
134 |
+
"top_k_kernel_enabled": false,
|
135 |
+
"top_p": 1.0
|
136 |
+
},
|
137 |
+
"output_logits": false,
|
138 |
+
"overrides_torch_dtype": true,
|
139 |
+
"pa_block_size": 1024,
|
140 |
+
"pa_num_blocks": 1,
|
141 |
+
"padding_side": "right",
|
142 |
+
"pp_degree": 1,
|
143 |
+
"prefix_buckets": null,
|
144 |
+
"qk_layernorm": false,
|
145 |
+
"qkv_kernel_enabled": false,
|
146 |
+
"qkv_kernel_fuse_residual_add": false,
|
147 |
+
"qkv_kernel_nbsd_layout": false,
|
148 |
+
"quantization_dtype": "int8",
|
149 |
+
"quantization_type": "per_tensor_symmetric",
|
150 |
+
"quantize_clamp_bound": Infinity,
|
151 |
+
"quantized": false,
|
152 |
+
"quantized_checkpoints_path": null,
|
153 |
+
"quantized_mlp_kernel_enabled": false,
|
154 |
+
"rmsnorm_quantize_kernel_enabled": false,
|
155 |
+
"router_topk_nki_kernel_enabled": null,
|
156 |
+
"rpl_reduce_dtype": null,
|
157 |
+
"save_sharded_checkpoint": true,
|
158 |
+
"scratchpad_page_size": null,
|
159 |
+
"seq_len": 1024,
|
160 |
+
"seq_len_threshold_for_cc_tiling": 16384,
|
161 |
+
"sequence_parallel_enabled": false,
|
162 |
+
"shared_mlp_nki_kernel_enabled": null,
|
163 |
+
"skip_sharding": false,
|
164 |
+
"skip_warmup": false,
|
165 |
+
"spec_batch_size": 1,
|
166 |
+
"speculation_length": 0,
|
167 |
+
"start_rank_id": 0,
|
168 |
+
"target": null,
|
169 |
+
"tile_cc": false,
|
170 |
+
"tkg_batch_size": 1,
|
171 |
+
"token_generation_buckets": null,
|
172 |
+
"token_tree_config": null,
|
173 |
+
"torch_dtype": "bfloat16",
|
174 |
+
"tp_degree": 2,
|
175 |
+
"vocab_parallel": false,
|
176 |
+
"weight_gather_seq_len_threshold": 32768,
|
177 |
+
"weights_to_skip_layout_optimization": [],
|
178 |
+
"world_size": 2
|
179 |
+
},
|
180 |
+
"no_repeat_ngram_size": 0,
|
181 |
+
"num_attention_heads": 32,
|
182 |
+
"num_beam_groups": 1,
|
183 |
+
"num_beams": 1,
|
184 |
+
"num_cores_per_group": 1,
|
185 |
+
"num_hidden_layers": 36,
|
186 |
+
"num_key_value_heads": 8,
|
187 |
+
"num_return_sequences": 1,
|
188 |
+
"output_attentions": false,
|
189 |
+
"output_hidden_states": false,
|
190 |
+
"output_scores": false,
|
191 |
+
"pad_token_id": 0,
|
192 |
+
"prefix": null,
|
193 |
+
"problem_type": null,
|
194 |
+
"pruned_heads": {},
|
195 |
+
"remove_invalid_values": false,
|
196 |
+
"repetition_penalty": 1.0,
|
197 |
+
"return_dict": true,
|
198 |
+
"return_dict_in_generate": false,
|
199 |
+
"rms_norm_eps": 1e-06,
|
200 |
+
"rope_scaling": null,
|
201 |
+
"rope_theta": 1000000,
|
202 |
+
"sep_token_id": null,
|
203 |
+
"sliding_window": null,
|
204 |
+
"suppress_tokens": null,
|
205 |
+
"task_specific_params": null,
|
206 |
+
"temperature": 1.0,
|
207 |
+
"tf_legacy_loss": false,
|
208 |
+
"tie_encoder_decoder": false,
|
209 |
+
"tie_word_embeddings": false,
|
210 |
+
"tokenizer_class": null,
|
211 |
+
"top_k": 50,
|
212 |
+
"top_p": 1.0,
|
213 |
+
"torchscript": false,
|
214 |
+
"transformers_version": "4.51.0",
|
215 |
+
"typical_p": 1.0,
|
216 |
+
"use_bfloat16": false,
|
217 |
+
"use_cache": true,
|
218 |
+
"use_sliding_window": false,
|
219 |
+
"vocab_size": 151936
|
220 |
+
}
|
context_encoding_model/_tp0_bk1/command.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
neuronx-cc compile --framework=XLA model.MODULE_2914133a46cb7b4660ab+d7af8a84.hlo_module.pb --output model.MODULE_2914133a46cb7b4660ab+d7af8a84.neff --target=trn1 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ' --lnc=1 -O1 '--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true' --logfile=log-neuron-cc.txt --verbose=35
|
context_encoding_model/_tp0_bk1/compile_flags.MODULE_2914133a46cb7b4660ab+d7af8a84.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "--lnc=1", "-O1", "--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true", "--logfile=/home/ubuntu/qwen3/context_encoding_model/_tp0_bk1/log-neuron-cc.txt"]
|
context_encoding_model/_tp0_bk1/global_metric_store.json
ADDED
@@ -0,0 +1,1079 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"Average": {
|
3 |
+
"tensorizer": {
|
4 |
+
"StaticProfiler::AverageFractalPeUtilization": 99.65782165527344,
|
5 |
+
"StaticProfiler::AveragePartitionUtilization": 97.58238220214844,
|
6 |
+
"StaticProfiler::AveragePeUtilization": 98.61824035644531,
|
7 |
+
"StaticProfiler::LocalizationEfficiency": 98.78419494628906,
|
8 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 100.47209167480469,
|
9 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
|
10 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 0.0
|
11 |
+
}
|
12 |
+
},
|
13 |
+
"Count": {
|
14 |
+
"tensorizer": {
|
15 |
+
"StaticProfiler::AverageFractalPeUtilization": 1.0,
|
16 |
+
"StaticProfiler::AveragePartitionUtilization": 1.0,
|
17 |
+
"StaticProfiler::AveragePeUtilization": 1.0,
|
18 |
+
"StaticProfiler::LocalizationEfficiency": 1.0,
|
19 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1.0,
|
20 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 1.0,
|
21 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 1.0
|
22 |
+
}
|
23 |
+
},
|
24 |
+
"Sum": {
|
25 |
+
"compiletime": {
|
26 |
+
"AGOrderingAnalysisPass": 0.019578933715820313,
|
27 |
+
"AffinePredicateResolution": 0.0019481182098388672,
|
28 |
+
"AliasDependencyElimination": 0.0001239776611328125,
|
29 |
+
"AliasDependencyInduction": 0.00577092170715332,
|
30 |
+
"AliasDependencyReset": 0.027690649032592773,
|
31 |
+
"BFComputeCutting": 0.0023322105407714844,
|
32 |
+
"BirCodeGenLoop": 0.4628438949584961,
|
33 |
+
"CCOpFusion": 0.022275209426879883,
|
34 |
+
"CanonicalizeConv": 3.300000025774352e-05,
|
35 |
+
"CanonicalizeDAGForPGTiling": 0.005593061447143555,
|
36 |
+
"CanonicalizeForTensorizer": 4.400000034365803e-05,
|
37 |
+
"CanonicalizeIR": 0.001634359359741211,
|
38 |
+
"Canonicalizer": 0.0008999999845400453,
|
39 |
+
"CoalesceCCOp": 0.015577077865600586,
|
40 |
+
"CommuteConcat": 0.0008616447448730469,
|
41 |
+
"DMALocalityOpt": 0.007327079772949219,
|
42 |
+
"DMAProfiler": 0.012569665908813477,
|
43 |
+
"DMATilingProfiler": 0.0037431716918945313,
|
44 |
+
"DataLocalityOpt": 0.06741714477539063,
|
45 |
+
"DataStreaming": 0.03615880012512207,
|
46 |
+
"DeConcat": 0.0005049705505371094,
|
47 |
+
"DeadCodeElimination": 0.0009002685546875,
|
48 |
+
"DeadStoreElimination": 0.0056514739990234375,
|
49 |
+
"DelinearIndices": 0.004773139953613281,
|
50 |
+
"Delinearization": 0.0026137828826904297,
|
51 |
+
"DoNothing": 0.0001933574676513672,
|
52 |
+
"DramToDramTranspose": 0.019293546676635742,
|
53 |
+
"DumpGraphAndMetadata": 0.10360383987426758,
|
54 |
+
"EliminateDivs": 0.003831148147583008,
|
55 |
+
"ExpandBatchNorm": 0.0019576549530029297,
|
56 |
+
"ExpandISAMacro": 0.012068033218383789,
|
57 |
+
"FactorizeBlkDims": 0.008942604064941406,
|
58 |
+
"FactorizeThreadAxesInFreeDims": 0.001847982406616211,
|
59 |
+
"FlattenMacroLoop": 0.003529787063598633,
|
60 |
+
"GenericAccessSimplifier": 0.0008223056793212891,
|
61 |
+
"HoistCompute": 7.999999979801942e-06,
|
62 |
+
"IdentifyCrossPassTensors": 4.8000001697801054e-05,
|
63 |
+
"InferInitValue": 0.025947093963623047,
|
64 |
+
"InferIntrinsicOnCC": 0.00908350944519043,
|
65 |
+
"InferNeuronTensor": 0.02371978759765625,
|
66 |
+
"InferNonlocalTensors": 0.014753341674804688,
|
67 |
+
"InferPSumTensor": 0.309035062789917,
|
68 |
+
"InlineNativeKernels": 0.008690595626831055,
|
69 |
+
"InsertIOTransposes": 0.01906275749206543,
|
70 |
+
"InsertLocalTransposes": 0.004312276840209961,
|
71 |
+
"InsertOffloadedTransposes": 0.002802133560180664,
|
72 |
+
"LICM": 0.003081083297729492,
|
73 |
+
"LateLegalizeInst": 0.014100313186645508,
|
74 |
+
"LateLegalizePostSplit": 0.012533903121948242,
|
75 |
+
"LateLowerReshapeOp": 0.001035451889038086,
|
76 |
+
"LateLowerTensorOp": 0.002605438232421875,
|
77 |
+
"LateNeuronInstComb": 0.009373188018798828,
|
78 |
+
"LayoutPreprocessing": 0.03434133529663086,
|
79 |
+
"LayoutPreprocessingAndAnalysis": 0.07319903373718262,
|
80 |
+
"LayoutRequirementAnalysis": 0.005194187164306641,
|
81 |
+
"LegalizeCCOpLayout": 0.0025322437286376953,
|
82 |
+
"LegalizeOpLevelAlias": 0.0020308494567871094,
|
83 |
+
"LegalizePartitionReduce": 0.0010001659393310547,
|
84 |
+
"LegalizeSundaAccess": 0.0786747932434082,
|
85 |
+
"LegalizeSundaMacro": 0.011176109313964844,
|
86 |
+
"LegalizeType": 0.014636754989624023,
|
87 |
+
"LocalLayoutOpt": 0.014019250869750977,
|
88 |
+
"LoopFusion": 0.005472898483276367,
|
89 |
+
"LoopSplitting": 0.00038623809814453125,
|
90 |
+
"LowerBroadcast": 0.0027265548706054688,
|
91 |
+
"LowerCCOpBlockAxis": 0.0058476924896240234,
|
92 |
+
"LowerComplexBroadcast": 0.00213623046875,
|
93 |
+
"LowerIntrinsics": 0.3070671558380127,
|
94 |
+
"LowerTensorOp": 0.010679960250854492,
|
95 |
+
"LowerTranspose": 0.012553691864013672,
|
96 |
+
"MacroGeneration": 0.029733657836914063,
|
97 |
+
"MaskPropagation": 0.0028328895568847656,
|
98 |
+
"MemcastMotion": 1.8999999156221747e-05,
|
99 |
+
"MemcpyElimination": 0.026583433151245117,
|
100 |
+
"MutateDataType": 0.0020093917846679688,
|
101 |
+
"NeuronAliasDependencyInduction": 0.00018548965454101563,
|
102 |
+
"NeuronAliasDependencyReset": 0.02524423599243164,
|
103 |
+
"NeuronInstComb": 0.004286766052246094,
|
104 |
+
"NeuronLICM": 0.03554058074951172,
|
105 |
+
"NeuronLoopFusion": 0.007987260818481445,
|
106 |
+
"NeuronLoopInterchange": 0.0023233890533447266,
|
107 |
+
"NeuronSimplifier": 0.0075054168701171875,
|
108 |
+
"NeuronSimplifyPredicates": 0.12207841873168945,
|
109 |
+
"NeuronValueNumbering": 0.0038213729858398438,
|
110 |
+
"OptimizeAliasedCopyChain": 0.0005936622619628906,
|
111 |
+
"OptimizeNKIKernels": 0.44962644577026367,
|
112 |
+
"PAGLayoutOpt": 0.0999138355255127,
|
113 |
+
"PComputeCutting": 0.005170106887817383,
|
114 |
+
"PGLayoutTilingPipeline": 0.7408750057220459,
|
115 |
+
"PGTiling": 0.29245758056640625,
|
116 |
+
"PadElimination": 0.000308990478515625,
|
117 |
+
"ParAxesAnnotation": 0.05283546447753906,
|
118 |
+
"PartialLoopFusion": 0.0043125152587890625,
|
119 |
+
"PartialSimdFusion": 0.004901885986328125,
|
120 |
+
"PenguinizeFunctions": 4.3000000005122274e-05,
|
121 |
+
"PerfectLoopNest": 0.001722574234008789,
|
122 |
+
"PruneFunctions": 4.199999966658652e-05,
|
123 |
+
"RecognizeOpIdiom": 0.004076480865478516,
|
124 |
+
"Recompute": 0.0002620220184326172,
|
125 |
+
"RelaxPredicates": 0.013286828994750977,
|
126 |
+
"Rematerialization": 0.0021238327026367188,
|
127 |
+
"RemoveOptimizationBarriers": 4.3000000005122274e-05,
|
128 |
+
"ReshapeWeights": 0.0006799697875976563,
|
129 |
+
"ResolveAccessConflict": 0.0040090084075927734,
|
130 |
+
"ResolveComplicatePredicates": 0.001981496810913086,
|
131 |
+
"RewriteReplicationMatmul": 0.0021796226501464844,
|
132 |
+
"RewriteWeights": 0.0022602081298828125,
|
133 |
+
"SFKVectorizer": 0.274188756942749,
|
134 |
+
"ScatterMotion": 5.7999997807201e-05,
|
135 |
+
"SimpleAllReduceTiling": 0.009164094924926758,
|
136 |
+
"Simplifier": 0.0046122074127197266,
|
137 |
+
"SimplifyMacroPredicates": 0.010458230972290039,
|
138 |
+
"SimplifyNeuronTensor": 1.0516629219055176,
|
139 |
+
"SimplifySlice": 0.0009145736694335938,
|
140 |
+
"SimplifyTensor": 0.00577855110168457,
|
141 |
+
"SpillPSum": 0.012692689895629883,
|
142 |
+
"SplitAPUnionSets": 0.10518908500671387,
|
143 |
+
"SplitAccGrp": 0.001172780990600586,
|
144 |
+
"StaticProfiler": 0.0124053955078125,
|
145 |
+
"StaticTransposeLocalTensor": 0.0038576126098632813,
|
146 |
+
"SundaISel": 0.04396390914916992,
|
147 |
+
"TCTransform": 0.0018804073333740234,
|
148 |
+
"TensorInitialization": 0.012793779373168945,
|
149 |
+
"TensorOpSimplifier": 0.0045316219329833984,
|
150 |
+
"TensorOpTransform": 0.021115541458129883,
|
151 |
+
"TensorizerLegalizationPass": 6.999999459367245e-05,
|
152 |
+
"TileCCOps": 0.0056231021881103516,
|
153 |
+
"TilingProfiler": 0.00790858268737793,
|
154 |
+
"TransformConvOp": 0.0030431747436523438,
|
155 |
+
"TritiumFusion": 0.03186154365539551,
|
156 |
+
"ValueNumbering": 0.0038623809814453125,
|
157 |
+
"VectorizeDMA": 0.0021522045135498047,
|
158 |
+
"VectorizeMatMult": 0.003453969955444336,
|
159 |
+
"VerifySupportedOps": 3.300000025774352e-05,
|
160 |
+
"WeightCoalescing": 0.009244203567504883,
|
161 |
+
"ZeroSizeTensorElimination": 0.00011420249938964844,
|
162 |
+
"algsimp": 0.0026100000832229853,
|
163 |
+
"batchnorm_expander": 3.9999998989515007e-05,
|
164 |
+
"boundary-marker-removal": 1.2000000424450263e-05,
|
165 |
+
"call-inliner": 0.00046499999007210135,
|
166 |
+
"canonicalize-boundary-marker": 1.8000000636675395e-05,
|
167 |
+
"collective-stream-id-checker": 9.200000204145908e-05,
|
168 |
+
"comparison-expander": 0.0005959999980404973,
|
169 |
+
"computation-deduplicator": 6.900000153109431e-05,
|
170 |
+
"conditional-to-select": 1.700000029813964e-05,
|
171 |
+
"config-lowering": 7.79999973019585e-05,
|
172 |
+
"constant-statistics": 0.0005530000198632479,
|
173 |
+
"constant_folding": 0.0003320000250823796,
|
174 |
+
"cse": 3.7000001611886546e-05,
|
175 |
+
"dce": 7.800000457791612e-05,
|
176 |
+
"dot_decomposer": 0.0014440000522881746,
|
177 |
+
"dynamic-slice-transpose": 1.2000000424450263e-05,
|
178 |
+
"eliminate-redundant-compare": 0.00028100001509301364,
|
179 |
+
"emit-offloaded-dropout": 4.099999932805076e-05,
|
180 |
+
"flatten-call-graph": 0.0009379999246448278,
|
181 |
+
"fuse-send-recv": 7.200000254670158e-05,
|
182 |
+
"hilo::LegalizeAlias": 1.2999999853491317e-05,
|
183 |
+
"hilo::NeuronInstCombine": 0.00010099999781232327,
|
184 |
+
"hilo::NeuronOpFusion": 2.5000001187436283e-05,
|
185 |
+
"hilo::ReplaceTokenTypeWithU8Pass": 4.3000000005122274e-05,
|
186 |
+
"hilo::ScheduleFusion": 1.9999999949504854e-06,
|
187 |
+
"hilo::SixtyFourHack": 8.900000102585182e-05,
|
188 |
+
"hilo::VerifyAliasing": 4.999999873689376e-06,
|
189 |
+
"hlo-mac-count": 0.0012799999676644802,
|
190 |
+
"hlo-verifier": 0.007751000113785267,
|
191 |
+
"instruction-histogram": 0.0006590000120922923,
|
192 |
+
"io-con-pipe-begin": 6.000000212225132e-06,
|
193 |
+
"io-con-pipe-end": 9.999999974752427e-07,
|
194 |
+
"io-layout-normalization": 0.0014029999729245901,
|
195 |
+
"io-statistics": 6.199999916134402e-05,
|
196 |
+
"legalize-ccops": 3.999999989900971e-06,
|
197 |
+
"legalize-compare": 1.1000000085914508e-05,
|
198 |
+
"lower-argminmax-custom-call": 1.2000000424450263e-05,
|
199 |
+
"map-inline": 0.0008909999742172658,
|
200 |
+
"metadata-naming": 5.7999997807201e-05,
|
201 |
+
"mlir::detail::OpToOpPassAdaptor": 0.00016799999866634607,
|
202 |
+
"mlir::hlo::MhloToPyPenguin": 0.0028260000981390476,
|
203 |
+
"mlir::mhlo::LowerComplexExtraPass": 0.00026000000070780516,
|
204 |
+
"mlir::mhlo::LowerComplexPass": 0.0002699999895412475,
|
205 |
+
"native-to-custom-softmax": 0.0007219999679364264,
|
206 |
+
"native-to-custom-softmax-dx": 0.0005740000051446259,
|
207 |
+
"operand_upcaster": 6.399999983841553e-05,
|
208 |
+
"opt-barrier-removal": 0.0005649999948218465,
|
209 |
+
"post-par-pipe-begin": 9.600000339560211e-05,
|
210 |
+
"post-par-pipe-end": 0.0,
|
211 |
+
"post-partition-simplification": 0.0016929999692365527,
|
212 |
+
"pre-par-pipe-begin": 9.999999974752427e-07,
|
213 |
+
"pre-par-pipe-end": 0.0,
|
214 |
+
"pre-partition-simplification": 0.1934960037469864,
|
215 |
+
"replace-minimum-constant": 0.00044299999717622995,
|
216 |
+
"reshape-mover": 0.00010800000018207356,
|
217 |
+
"simplify-concat": 0.00014099999680183828,
|
218 |
+
"simplify-while-loops": 9.600000339560211e-05,
|
219 |
+
"transform-variadic-reduce": 7.900000491645187e-05,
|
220 |
+
"tuple-simplifier": 0.0002980000281240791,
|
221 |
+
"unpack-nested-aws-ntwsr": 0.0004720000142697245,
|
222 |
+
"unroll-while-loop": 1.8999999156221747e-05,
|
223 |
+
"zero_sized_hlo_elimination": 0.0008989999769255519
|
224 |
+
},
|
225 |
+
"hilo": {
|
226 |
+
"ConstantSize": 599333.0,
|
227 |
+
"HloInputCount": 475.0,
|
228 |
+
"HloMacCount": 50240159744.0,
|
229 |
+
"HloOutputCount": 73.0,
|
230 |
+
"IfmapSize": 8266543104.0,
|
231 |
+
"OfmapSize": 75497472.0,
|
232 |
+
"OutputsReadFromCount": 0.0,
|
233 |
+
"PassthroughTensorsCount": 0.0,
|
234 |
+
"RedundantOutputCount": 0.0,
|
235 |
+
"Traffic": 1663506816.0
|
236 |
+
},
|
237 |
+
"tensorizer": {
|
238 |
+
"DMATilingProfiler::TotalInstructionsAfterTiling": 43318.0,
|
239 |
+
"StaticProfiler::AifUb": 154.8094024658203,
|
240 |
+
"StaticProfiler::ArithmeticIntensityTensorizer": 152.92723083496094,
|
241 |
+
"StaticProfiler::AverageDmaLength": 4809.89794921875,
|
242 |
+
"StaticProfiler::DDRTransferBytes": 787141440.0,
|
243 |
+
"StaticProfiler::InternalTransferBytes": 634853888.0,
|
244 |
+
"StaticProfiler::LoadExpanded": 98070.0,
|
245 |
+
"StaticProfiler::StoreExpanded": 2397.0,
|
246 |
+
"StaticProfiler::TotalDMAExpanded": 100467.0,
|
247 |
+
"StaticProfiler::TotalDynamicInstancesCount": 50670.0,
|
248 |
+
"StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 50224.0,
|
249 |
+
"StaticProfiler::TotalLNCComm": 0.0,
|
250 |
+
"StaticProfiler::TotalLNCCommTransfer": 0.0,
|
251 |
+
"TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
|
252 |
+
"TilingProfiler::DmaInstructionsAfterTiling": 0.0,
|
253 |
+
"TilingProfiler::GenericInstructionsAfterTiling": 4.0,
|
254 |
+
"TilingProfiler::MatMultInstructionsAfterTiling": 22848.0,
|
255 |
+
"TilingProfiler::NumPfTransposes": 5.0,
|
256 |
+
"TilingProfiler::NumPfTransposesForIo": 1.0,
|
257 |
+
"TilingProfiler::NumPfTransposesForLocal": 1.0,
|
258 |
+
"TilingProfiler::NumPfTransposesForNonlocal": 3.0,
|
259 |
+
"TilingProfiler::PfTransposeInstructions": 19201.0,
|
260 |
+
"TilingProfiler::PfTransposeInstructionsForIo": 19008.0,
|
261 |
+
"TilingProfiler::PfTransposeInstructionsForLocal": 1.0,
|
262 |
+
"TilingProfiler::PfTransposeInstructionsForNonlocal": 192.0,
|
263 |
+
"TilingProfiler::ReduceInstructionsAfterTiling": 4.0,
|
264 |
+
"TilingProfiler::SimdInstructionsAfterTiling": 158.0,
|
265 |
+
"TilingProfiler::TotalInstructionsAfterTiling": 0.0,
|
266 |
+
"TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
|
267 |
+
"TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
|
268 |
+
"TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
|
269 |
+
"TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
|
270 |
+
"TransformConvOp::conv2d_column_packing": 0.0,
|
271 |
+
"TransformConvOp::conv2d_column_packing_1": 0.0,
|
272 |
+
"TransformConvOp::conv2d_column_packing_io10": 0.0,
|
273 |
+
"TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
|
274 |
+
}
|
275 |
+
},
|
276 |
+
"all": {
|
277 |
+
"compiletime": {
|
278 |
+
"algsimp": 0.0024220000486820936,
|
279 |
+
"call-inliner": 0.0004349999944679439,
|
280 |
+
"collective-stream-id-checker": 8.199999865610152e-05,
|
281 |
+
"comparison-expander": 0.0005810000002384186,
|
282 |
+
"constant-statistics": 0.0005530000198632479,
|
283 |
+
"constant_folding": 0.0003060000017285347,
|
284 |
+
"dce": 7.500000356230885e-05,
|
285 |
+
"dot_decomposer": 0.0014440000522881746,
|
286 |
+
"eliminate-redundant-compare": 0.0002690000110305846,
|
287 |
+
"flatten-call-graph": 0.0009069999796338379,
|
288 |
+
"hlo-mac-count": 0.0010560000082477927,
|
289 |
+
"hlo-verifier": 0.007164000067859888,
|
290 |
+
"instruction-histogram": 0.0006590000120922923,
|
291 |
+
"io-con-pipe-begin": 6.000000212225132e-06,
|
292 |
+
"io-con-pipe-end": 9.999999974752427e-07,
|
293 |
+
"io-layout-normalization": 0.0014029999729245901,
|
294 |
+
"io-statistics": 6.199999916134402e-05,
|
295 |
+
"map-inline": 0.0008549999911338091,
|
296 |
+
"native-to-custom-softmax": 0.0007029999978840351,
|
297 |
+
"native-to-custom-softmax-dx": 0.000522000016644597,
|
298 |
+
"opt-barrier-removal": 0.0005649999948218465,
|
299 |
+
"pre-par-pipe-begin": 9.999999974752427e-07,
|
300 |
+
"pre-par-pipe-end": 0.0,
|
301 |
+
"pre-partition-simplification": 0.1934960037469864,
|
302 |
+
"replace-minimum-constant": 0.00042100000428035855,
|
303 |
+
"reshape-mover": 9.7999996796716e-05,
|
304 |
+
"simplify-while-loops": 9.000000136438757e-05,
|
305 |
+
"tuple-simplifier": 0.00028300000121816993,
|
306 |
+
"unpack-nested-aws-ntwsr": 0.0004600000102072954,
|
307 |
+
"unroll-while-loop": 1.8999999156221747e-05,
|
308 |
+
"zero_sized_hlo_elimination": 0.0008989999769255519
|
309 |
+
}
|
310 |
+
},
|
311 |
+
"cumsum": {
|
312 |
+
"compiletime": {
|
313 |
+
"CoalesceCCOp": 0.0002090930938720703,
|
314 |
+
"DMALocalityOpt": 0.00018835067749023438,
|
315 |
+
"DMAProfiler": 0.0008924007415771484,
|
316 |
+
"DataStreaming": 0.0002593994140625,
|
317 |
+
"DoNothing": 0.00011873245239257813,
|
318 |
+
"ExpandISAMacro": 0.0005505084991455078,
|
319 |
+
"FactorizeBlkDims": 0.0004696846008300781,
|
320 |
+
"InferPSumTensor": 0.0004990100860595703,
|
321 |
+
"LateLegalizeInst": 0.0004222393035888672,
|
322 |
+
"LateNeuronInstComb": 0.0005340576171875,
|
323 |
+
"LegalizeSundaAccess": 0.0017271041870117188,
|
324 |
+
"LegalizeType": 0.0002815723419189453,
|
325 |
+
"LowerBroadcast": 0.0002243518829345703,
|
326 |
+
"LowerIntrinsics": 0.0002181529998779297,
|
327 |
+
"LowerTranspose": 0.00024199485778808594,
|
328 |
+
"NeuronInstComb": 0.0004971027374267578,
|
329 |
+
"NeuronLICM": 0.0004258155822753906,
|
330 |
+
"NeuronSimplifyPredicates": 0.002941608428955078,
|
331 |
+
"NeuronValueNumbering": 0.0004222393035888672,
|
332 |
+
"SFKVectorizer": 0.002941131591796875,
|
333 |
+
"SimpleAllReduceTiling": 0.00019812583923339844,
|
334 |
+
"SimplifyNeuronTensor": 0.00045800209045410156,
|
335 |
+
"SpillPSum": 0.0005657672882080078,
|
336 |
+
"WeightCoalescing": 0.00020837783813476563
|
337 |
+
}
|
338 |
+
},
|
339 |
+
"sg00": {
|
340 |
+
"compiletime": {
|
341 |
+
"CanonicalizeConv": 3.099999958067201e-05,
|
342 |
+
"CanonicalizeForTensorizer": 1.5999999959603883e-05,
|
343 |
+
"Canonicalizer": 0.00032900000223889947,
|
344 |
+
"HoistCompute": 3.000000106112566e-06,
|
345 |
+
"IdentifyCrossPassTensors": 2.8000000384054147e-05,
|
346 |
+
"MemcastMotion": 1.1000000085914508e-05,
|
347 |
+
"PenguinizeFunctions": 1.5999999959603883e-05,
|
348 |
+
"PruneFunctions": 1.2999999853491317e-05,
|
349 |
+
"RemoveOptimizationBarriers": 2.300000051036477e-05,
|
350 |
+
"ScatterMotion": 1.9999999494757503e-05,
|
351 |
+
"TensorizerLegalizationPass": 4.3000000005122274e-05,
|
352 |
+
"VerifySupportedOps": 1.2000000424450263e-05,
|
353 |
+
"algsimp": 6.600000051548705e-05,
|
354 |
+
"batchnorm_expander": 1.4000000192027073e-05,
|
355 |
+
"boundary-marker-removal": 3.999999989900971e-06,
|
356 |
+
"call-inliner": 9.000000318337698e-06,
|
357 |
+
"canonicalize-boundary-marker": 6.000000212225132e-06,
|
358 |
+
"collective-stream-id-checker": 3.000000106112566e-06,
|
359 |
+
"comparison-expander": 4.999999873689376e-06,
|
360 |
+
"computation-deduplicator": 1.9999999494757503e-05,
|
361 |
+
"conditional-to-select": 4.999999873689376e-06,
|
362 |
+
"config-lowering": 3.099999958067201e-05,
|
363 |
+
"constant_folding": 9.000000318337698e-06,
|
364 |
+
"cse": 1.2999999853491317e-05,
|
365 |
+
"dce": 9.999999974752427e-07,
|
366 |
+
"dynamic-slice-transpose": 3.999999989900971e-06,
|
367 |
+
"eliminate-redundant-compare": 3.999999989900971e-06,
|
368 |
+
"emit-offloaded-dropout": 1.2999999853491317e-05,
|
369 |
+
"flatten-call-graph": 9.999999747378752e-06,
|
370 |
+
"fuse-send-recv": 2.499999936844688e-05,
|
371 |
+
"hilo::LegalizeAlias": 6.000000212225132e-06,
|
372 |
+
"hilo::NeuronInstCombine": 4.3000000005122274e-05,
|
373 |
+
"hilo::NeuronOpFusion": 9.000000318337698e-06,
|
374 |
+
"hilo::ReplaceTokenTypeWithU8Pass": 1.5999999959603883e-05,
|
375 |
+
"hilo::ScheduleFusion": 9.999999974752427e-07,
|
376 |
+
"hilo::SixtyFourHack": 1.4999999621068127e-05,
|
377 |
+
"hilo::VerifyAliasing": 1.9999999949504854e-06,
|
378 |
+
"hlo-mac-count": 2.5999999706982635e-05,
|
379 |
+
"hlo-verifier": 0.0001939999929163605,
|
380 |
+
"legalize-ccops": 1.9999999949504854e-06,
|
381 |
+
"legalize-compare": 3.999999989900971e-06,
|
382 |
+
"lower-argminmax-custom-call": 3.999999989900971e-06,
|
383 |
+
"map-inline": 1.2000000424450263e-05,
|
384 |
+
"metadata-naming": 1.8000000636675395e-05,
|
385 |
+
"mlir::detail::OpToOpPassAdaptor": 1.9999999494757503e-05,
|
386 |
+
"mlir::hlo::MhloToPyPenguin": 0.0009980000322684646,
|
387 |
+
"mlir::mhlo::LowerComplexExtraPass": 8.800000068731606e-05,
|
388 |
+
"mlir::mhlo::LowerComplexPass": 0.00015999999595806003,
|
389 |
+
"native-to-custom-softmax": 7.000000096013537e-06,
|
390 |
+
"native-to-custom-softmax-dx": 1.2999999853491317e-05,
|
391 |
+
"operand_upcaster": 1.9999999494757503e-05,
|
392 |
+
"post-par-pipe-begin": 8.900000102585182e-05,
|
393 |
+
"post-par-pipe-end": 0.0,
|
394 |
+
"post-partition-simplification": 0.000582000007852912,
|
395 |
+
"replace-minimum-constant": 7.000000096013537e-06,
|
396 |
+
"reshape-mover": 3.999999989900971e-06,
|
397 |
+
"simplify-concat": 4.8000001697801054e-05,
|
398 |
+
"simplify-while-loops": 1.9999999949504854e-06,
|
399 |
+
"transform-variadic-reduce": 9.000000318337698e-06,
|
400 |
+
"tuple-simplifier": 4.999999873689376e-06,
|
401 |
+
"unpack-nested-aws-ntwsr": 3.999999989900971e-06,
|
402 |
+
"unroll-while-loop": 0.0
|
403 |
+
},
|
404 |
+
"hilo": {
|
405 |
+
"ArithmeticIntensity": 16.6773738861084,
|
406 |
+
"ConstantSize": 599333.0,
|
407 |
+
"HloInputCount": 475.0,
|
408 |
+
"HloMacCount": 5637144576.0,
|
409 |
+
"HloOutputCount": 73.0,
|
410 |
+
"IfmapSize": 8266543104.0,
|
411 |
+
"OfmapSize": 75497472.0,
|
412 |
+
"OutputsReadFromCount": 0.0,
|
413 |
+
"PassthroughTensorsCount": 0.0,
|
414 |
+
"RedundantOutputCount": 0.0,
|
415 |
+
"Traffic": 676023104.0
|
416 |
+
}
|
417 |
+
},
|
418 |
+
"sg0000": {
|
419 |
+
"compiletime": {
|
420 |
+
"AGOrderingAnalysisPass": 0.08161520957946777,
|
421 |
+
"AffinePredicateResolution": 0.001527547836303711,
|
422 |
+
"AliasDependencyElimination": 0.00012493133544921875,
|
423 |
+
"AliasDependencyInduction": 0.008615970611572266,
|
424 |
+
"AliasDependencyReset": 0.03425288200378418,
|
425 |
+
"BFComputeCutting": 0.003037691116333008,
|
426 |
+
"BirCodeGenLoop": 0.05175900459289551,
|
427 |
+
"CCOpFusion": 0.024791479110717773,
|
428 |
+
"CanonicalizeDAGForPGTiling": 0.003105640411376953,
|
429 |
+
"CanonicalizeIR": 0.0020570755004882813,
|
430 |
+
"CoalesceCCOp": 0.005420684814453125,
|
431 |
+
"CommuteConcat": 0.0015554428100585938,
|
432 |
+
"DMALocalityOpt": 0.0025992393493652344,
|
433 |
+
"DMAProfiler": 0.004426240921020508,
|
434 |
+
"DMATilingProfiler": 0.00414586067199707,
|
435 |
+
"DataLocalityOpt": 0.11810016632080078,
|
436 |
+
"DataStreaming": 0.0053942203521728516,
|
437 |
+
"DeConcat": 0.0011267662048339844,
|
438 |
+
"DeadCodeElimination": 0.0016050338745117188,
|
439 |
+
"DeadStoreElimination": 0.030996084213256836,
|
440 |
+
"DelinearIndices": 0.007958412170410156,
|
441 |
+
"Delinearization": 0.003355741500854492,
|
442 |
+
"DoNothing": 7.987022399902344e-05,
|
443 |
+
"DramToDramTranspose": 0.03346753120422363,
|
444 |
+
"DumpGraphAndMetadata": 0.005443096160888672,
|
445 |
+
"EliminateDivs": 0.004342555999755859,
|
446 |
+
"ExpandBatchNorm": 0.0018055438995361328,
|
447 |
+
"ExpandISAMacro": 0.003648519515991211,
|
448 |
+
"FactorizeBlkDims": 0.019720077514648438,
|
449 |
+
"FactorizeThreadAxesInFreeDims": 0.0019965171813964844,
|
450 |
+
"FlattenMacroLoop": 0.003274679183959961,
|
451 |
+
"GenericAccessSimplifier": 0.0009877681732177734,
|
452 |
+
"InferInitValue": 0.032111167907714844,
|
453 |
+
"InferIntrinsicOnCC": 0.014227867126464844,
|
454 |
+
"InferNeuronTensor": 0.04684329032897949,
|
455 |
+
"InferNonlocalTensors": 0.10579586029052734,
|
456 |
+
"InferPSumTensor": 0.04808926582336426,
|
457 |
+
"InlineNativeKernels": 0.0025835037231445313,
|
458 |
+
"InsertIOTransposes": 0.012038707733154297,
|
459 |
+
"InsertLocalTransposes": 0.007574796676635742,
|
460 |
+
"InsertOffloadedTransposes": 0.003882884979248047,
|
461 |
+
"LICM": 0.003116607666015625,
|
462 |
+
"LateLegalizeInst": 0.006630420684814453,
|
463 |
+
"LateLegalizePostSplit": 0.0030584335327148438,
|
464 |
+
"LateLowerReshapeOp": 0.002176046371459961,
|
465 |
+
"LateLowerTensorOp": 0.005063295364379883,
|
466 |
+
"LateNeuronInstComb": 0.024392366409301758,
|
467 |
+
"LayoutPreprocessing": 0.03173065185546875,
|
468 |
+
"LayoutPreprocessingAndAnalysis": 0.07484269142150879,
|
469 |
+
"LayoutRequirementAnalysis": 0.007186174392700195,
|
470 |
+
"LegalizeCCOpLayout": 0.003088235855102539,
|
471 |
+
"LegalizeOpLevelAlias": 0.0011813640594482422,
|
472 |
+
"LegalizePartitionReduce": 0.0013763904571533203,
|
473 |
+
"LegalizeSundaAccess": 0.04270172119140625,
|
474 |
+
"LegalizeSundaMacro": 0.009444236755371094,
|
475 |
+
"LegalizeType": 0.004534721374511719,
|
476 |
+
"LocalLayoutOpt": 0.01777815818786621,
|
477 |
+
"LoopFusion": 0.0060007572174072266,
|
478 |
+
"LoopSplitting": 0.000377655029296875,
|
479 |
+
"LowerBroadcast": 0.0016138553619384766,
|
480 |
+
"LowerCCOpBlockAxis": 0.004978179931640625,
|
481 |
+
"LowerComplexBroadcast": 0.0023903846740722656,
|
482 |
+
"LowerIntrinsics": 0.034012556076049805,
|
483 |
+
"LowerTensorOp": 0.01333928108215332,
|
484 |
+
"LowerTranspose": 0.011911869049072266,
|
485 |
+
"MacroGeneration": 0.07152104377746582,
|
486 |
+
"MaskPropagation": 0.004988193511962891,
|
487 |
+
"MemcpyElimination": 0.11162376403808594,
|
488 |
+
"MutateDataType": 0.0014476776123046875,
|
489 |
+
"NeuronAliasDependencyInduction": 0.0002269744873046875,
|
490 |
+
"NeuronAliasDependencyReset": 0.15035724639892578,
|
491 |
+
"NeuronInstComb": 0.015686750411987305,
|
492 |
+
"NeuronLICM": 0.011453866958618164,
|
493 |
+
"NeuronLoopFusion": 0.018696069717407227,
|
494 |
+
"NeuronLoopInterchange": 0.0018415451049804688,
|
495 |
+
"NeuronSimplifier": 0.011624336242675781,
|
496 |
+
"NeuronSimplifyPredicates": 0.005795955657958984,
|
497 |
+
"NeuronValueNumbering": 0.0040967464447021484,
|
498 |
+
"OptimizeAliasedCopyChain": 0.0014064311981201172,
|
499 |
+
"OptimizeNKIKernels": 0.0021300315856933594,
|
500 |
+
"PAGLayoutOpt": 0.33215951919555664,
|
501 |
+
"PComputeCutting": 0.008408308029174805,
|
502 |
+
"PGLayoutTilingPipeline": 1.3294909000396729,
|
503 |
+
"PGTiling": 0.3412203788757324,
|
504 |
+
"PadElimination": 0.0018661022186279297,
|
505 |
+
"ParAxesAnnotation": 0.29718852043151855,
|
506 |
+
"PartialLoopFusion": 0.024113893508911133,
|
507 |
+
"PartialSimdFusion": 0.029590368270874023,
|
508 |
+
"PerfectLoopNest": 0.0021219253540039063,
|
509 |
+
"RecognizeOpIdiom": 0.004444122314453125,
|
510 |
+
"Recompute": 0.00028204917907714844,
|
511 |
+
"RelaxPredicates": 0.004793405532836914,
|
512 |
+
"Rematerialization": 0.004267692565917969,
|
513 |
+
"ReshapeWeights": 0.0014717578887939453,
|
514 |
+
"ResolveAccessConflict": 0.0038602352142333984,
|
515 |
+
"ResolveComplicatePredicates": 0.001505136489868164,
|
516 |
+
"RewriteReplicationMatmul": 0.0020885467529296875,
|
517 |
+
"RewriteWeights": 0.003512144088745117,
|
518 |
+
"SFKVectorizer": 0.3296499252319336,
|
519 |
+
"SimpleAllReduceTiling": 0.002294301986694336,
|
520 |
+
"Simplifier": 0.004443168640136719,
|
521 |
+
"SimplifyMacroPredicates": 0.013223648071289063,
|
522 |
+
"SimplifyNeuronTensor": 0.011357307434082031,
|
523 |
+
"SimplifySlice": 0.0010068416595458984,
|
524 |
+
"SimplifyTensor": 0.006380319595336914,
|
525 |
+
"SpillPSum": 0.018645763397216797,
|
526 |
+
"SplitAPUnionSets": 0.031983375549316406,
|
527 |
+
"SplitAccGrp": 0.0017464160919189453,
|
528 |
+
"StaticProfiler": 0.004789590835571289,
|
529 |
+
"StaticTransposeLocalTensor": 0.0048563480377197266,
|
530 |
+
"SundaISel": 0.046004533767700195,
|
531 |
+
"TCTransform": 0.0017864704132080078,
|
532 |
+
"TensorInitialization": 0.015267372131347656,
|
533 |
+
"TensorOpSimplifier": 0.006502866744995117,
|
534 |
+
"TensorOpTransform": 0.029101848602294922,
|
535 |
+
"TileCCOps": 0.0055658817291259766,
|
536 |
+
"TilingProfiler": 0.014283895492553711,
|
537 |
+
"TransformConvOp": 0.0028002262115478516,
|
538 |
+
"TritiumFusion": 0.037850379943847656,
|
539 |
+
"ValueNumbering": 0.002534627914428711,
|
540 |
+
"VectorizeDMA": 0.0056002140045166016,
|
541 |
+
"VectorizeMatMult": 0.004069805145263672,
|
542 |
+
"WeightCoalescing": 0.0033059120178222656,
|
543 |
+
"ZeroSizeTensorElimination": 0.00012040138244628906
|
544 |
+
},
|
545 |
+
"tensorizer": {
|
546 |
+
"DMATilingProfiler::TotalInstructionsAfterTiling": 1945.0,
|
547 |
+
"StaticProfiler::AifUb": 18.54642677307129,
|
548 |
+
"StaticProfiler::ArithmeticIntensityTensorizer": 234.4757080078125,
|
549 |
+
"StaticProfiler::AverageDmaLength": 3607.790283203125,
|
550 |
+
"StaticProfiler::AverageFractalPeUtilization": 99.84349822998047,
|
551 |
+
"StaticProfiler::AveragePartitionUtilization": 96.70350646972656,
|
552 |
+
"StaticProfiler::AveragePeUtilization": 99.51932525634766,
|
553 |
+
"StaticProfiler::DDRTransferBytes": 53226752.0,
|
554 |
+
"StaticProfiler::InternalTransferBytes": 27462656.0,
|
555 |
+
"StaticProfiler::LoadExpanded": 10244.0,
|
556 |
+
"StaticProfiler::LocalizationEfficiency": 1264.2635498046875,
|
557 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1466.4949951171875,
|
558 |
+
"StaticProfiler::StoreExpanded": 3713.0,
|
559 |
+
"StaticProfiler::TotalDMAExpanded": 13957.0,
|
560 |
+
"StaticProfiler::TotalDynamicInstancesCount": 2107.0,
|
561 |
+
"StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 2103.0,
|
562 |
+
"StaticProfiler::TotalLNCComm": 0.0,
|
563 |
+
"StaticProfiler::TotalLNCCommTransfer": 0.0,
|
564 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
|
565 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
|
566 |
+
"TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
|
567 |
+
"TilingProfiler::DmaInstructionsAfterTiling": 0.0,
|
568 |
+
"TilingProfiler::GenericInstructionsAfterTiling": 24.0,
|
569 |
+
"TilingProfiler::MatMultInstructionsAfterTiling": 1010.0,
|
570 |
+
"TilingProfiler::NumPfTransposes": 6.0,
|
571 |
+
"TilingProfiler::NumPfTransposesForIo": 0.0,
|
572 |
+
"TilingProfiler::NumPfTransposesForLocal": 5.0,
|
573 |
+
"TilingProfiler::NumPfTransposesForNonlocal": 1.0,
|
574 |
+
"TilingProfiler::PfTransposeInstructions": 176.0,
|
575 |
+
"TilingProfiler::PfTransposeInstructionsForIo": 0.0,
|
576 |
+
"TilingProfiler::PfTransposeInstructionsForLocal": 144.0,
|
577 |
+
"TilingProfiler::PfTransposeInstructionsForNonlocal": 32.0,
|
578 |
+
"TilingProfiler::ReduceInstructionsAfterTiling": 0.0,
|
579 |
+
"TilingProfiler::SimdInstructionsAfterTiling": 177.0,
|
580 |
+
"TilingProfiler::TotalInstructionsAfterTiling": 0.0,
|
581 |
+
"TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
|
582 |
+
"TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
|
583 |
+
"TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
|
584 |
+
"TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
|
585 |
+
"TransformConvOp::conv2d_column_packing": 0.0,
|
586 |
+
"TransformConvOp::conv2d_column_packing_1": 0.0,
|
587 |
+
"TransformConvOp::conv2d_column_packing_io10": 0.0,
|
588 |
+
"TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
|
589 |
+
}
|
590 |
+
},
|
591 |
+
"sg0001": {
|
592 |
+
"compiletime": {
|
593 |
+
"AGOrderingAnalysisPass": 0.035902976989746094,
|
594 |
+
"AffinePredicateResolution": 0.0021402835845947266,
|
595 |
+
"AliasDependencyElimination": 0.0001494884490966797,
|
596 |
+
"AliasDependencyInduction": 0.00843667984008789,
|
597 |
+
"AliasDependencyReset": 0.07715225219726563,
|
598 |
+
"BFComputeCutting": 0.002821207046508789,
|
599 |
+
"BirCodeGenLoop": 0.03489971160888672,
|
600 |
+
"CCOpFusion": 0.03669166564941406,
|
601 |
+
"CanonicalizeDAGForPGTiling": 0.0034394264221191406,
|
602 |
+
"CanonicalizeIR": 0.001888275146484375,
|
603 |
+
"CoalesceCCOp": 0.0048944950103759766,
|
604 |
+
"CommuteConcat": 0.001985788345336914,
|
605 |
+
"DMALocalityOpt": 0.0010595321655273438,
|
606 |
+
"DMAProfiler": 0.0038537979125976563,
|
607 |
+
"DMATilingProfiler": 0.0052776336669921875,
|
608 |
+
"DataLocalityOpt": 0.13663840293884277,
|
609 |
+
"DataStreaming": 0.004033327102661133,
|
610 |
+
"DeConcat": 0.0017592906951904297,
|
611 |
+
"DeadCodeElimination": 0.0027074813842773438,
|
612 |
+
"DeadStoreElimination": 0.03486442565917969,
|
613 |
+
"DelinearIndices": 0.010581493377685547,
|
614 |
+
"Delinearization": 0.004877567291259766,
|
615 |
+
"DoNothing": 6.914138793945313e-05,
|
616 |
+
"DramToDramTranspose": 0.03982400894165039,
|
617 |
+
"DumpGraphAndMetadata": 0.004088640213012695,
|
618 |
+
"EliminateDivs": 0.0045583248138427734,
|
619 |
+
"ExpandBatchNorm": 0.0018122196197509766,
|
620 |
+
"ExpandISAMacro": 0.0023725032806396484,
|
621 |
+
"FactorizeBlkDims": 0.013248920440673828,
|
622 |
+
"FactorizeThreadAxesInFreeDims": 0.0023849010467529297,
|
623 |
+
"FlattenMacroLoop": 0.0036728382110595703,
|
624 |
+
"GenericAccessSimplifier": 0.0026085376739501953,
|
625 |
+
"InferInitValue": 0.038416147232055664,
|
626 |
+
"InferIntrinsicOnCC": 0.010096549987792969,
|
627 |
+
"InferNeuronTensor": 0.05150651931762695,
|
628 |
+
"InferNonlocalTensors": 0.031507015228271484,
|
629 |
+
"InferPSumTensor": 0.03166079521179199,
|
630 |
+
"InlineNativeKernels": 0.0021262168884277344,
|
631 |
+
"InsertIOTransposes": 0.022419452667236328,
|
632 |
+
"InsertLocalTransposes": 0.0071408748626708984,
|
633 |
+
"InsertOffloadedTransposes": 0.0034465789794921875,
|
634 |
+
"LICM": 0.004317283630371094,
|
635 |
+
"LateLegalizeInst": 0.004563570022583008,
|
636 |
+
"LateLegalizePostSplit": 0.0027570724487304688,
|
637 |
+
"LateLowerReshapeOp": 0.0013232231140136719,
|
638 |
+
"LateLowerTensorOp": 0.004618406295776367,
|
639 |
+
"LateNeuronInstComb": 0.020873546600341797,
|
640 |
+
"LayoutPreprocessing": 0.037287235260009766,
|
641 |
+
"LayoutPreprocessingAndAnalysis": 0.10860347747802734,
|
642 |
+
"LayoutRequirementAnalysis": 0.007799863815307617,
|
643 |
+
"LegalizeCCOpLayout": 0.001935720443725586,
|
644 |
+
"LegalizeOpLevelAlias": 0.0012698173522949219,
|
645 |
+
"LegalizePartitionReduce": 0.002346515655517578,
|
646 |
+
"LegalizeSundaAccess": 0.016484975814819336,
|
647 |
+
"LegalizeSundaMacro": 0.011503934860229492,
|
648 |
+
"LegalizeType": 0.0047261714935302734,
|
649 |
+
"LocalLayoutOpt": 0.02424001693725586,
|
650 |
+
"LoopFusion": 0.007829427719116211,
|
651 |
+
"LoopSplitting": 0.00044846534729003906,
|
652 |
+
"LowerBroadcast": 0.0014789104461669922,
|
653 |
+
"LowerCCOpBlockAxis": 0.0059947967529296875,
|
654 |
+
"LowerComplexBroadcast": 0.0023598670959472656,
|
655 |
+
"LowerIntrinsics": 0.035590410232543945,
|
656 |
+
"LowerTensorOp": 0.012118339538574219,
|
657 |
+
"LowerTranspose": 0.011335611343383789,
|
658 |
+
"MacroGeneration": 0.11938071250915527,
|
659 |
+
"MaskPropagation": 0.003367900848388672,
|
660 |
+
"MemcpyElimination": 0.10591435432434082,
|
661 |
+
"MutateDataType": 0.002183198928833008,
|
662 |
+
"NeuronAliasDependencyInduction": 0.0002372264862060547,
|
663 |
+
"NeuronAliasDependencyReset": 0.02314162254333496,
|
664 |
+
"NeuronInstComb": 0.01471090316772461,
|
665 |
+
"NeuronLICM": 0.007970094680786133,
|
666 |
+
"NeuronLoopFusion": 0.022555112838745117,
|
667 |
+
"NeuronLoopInterchange": 0.0015497207641601563,
|
668 |
+
"NeuronSimplifier": 0.012836694717407227,
|
669 |
+
"NeuronSimplifyPredicates": 0.001605987548828125,
|
670 |
+
"NeuronValueNumbering": 0.0046231746673583984,
|
671 |
+
"OptimizeAliasedCopyChain": 0.00162506103515625,
|
672 |
+
"OptimizeNKIKernels": 0.0015685558319091797,
|
673 |
+
"PAGLayoutOpt": 0.14427471160888672,
|
674 |
+
"PComputeCutting": 0.00727081298828125,
|
675 |
+
"PGLayoutTilingPipeline": 1.2423913478851318,
|
676 |
+
"PGTiling": 0.5181164741516113,
|
677 |
+
"PadElimination": 0.00038051605224609375,
|
678 |
+
"ParAxesAnnotation": 0.09470343589782715,
|
679 |
+
"PartialLoopFusion": 0.018784761428833008,
|
680 |
+
"PartialSimdFusion": 0.027338027954101563,
|
681 |
+
"PerfectLoopNest": 0.0021829605102539063,
|
682 |
+
"RecognizeOpIdiom": 0.0048656463623046875,
|
683 |
+
"Recompute": 0.0002601146697998047,
|
684 |
+
"RelaxPredicates": 0.0033593177795410156,
|
685 |
+
"Rematerialization": 0.0023822784423828125,
|
686 |
+
"ReshapeWeights": 0.0014538764953613281,
|
687 |
+
"ResolveAccessConflict": 0.0047032833099365234,
|
688 |
+
"ResolveComplicatePredicates": 0.0019354820251464844,
|
689 |
+
"RewriteReplicationMatmul": 0.002605438232421875,
|
690 |
+
"RewriteWeights": 0.004354715347290039,
|
691 |
+
"SFKVectorizer": 0.16805624961853027,
|
692 |
+
"SimpleAllReduceTiling": 0.0025529861450195313,
|
693 |
+
"Simplifier": 0.00439763069152832,
|
694 |
+
"SimplifyMacroPredicates": 0.007683992385864258,
|
695 |
+
"SimplifyNeuronTensor": 0.0066149234771728516,
|
696 |
+
"SimplifySlice": 0.0023670196533203125,
|
697 |
+
"SimplifyTensor": 0.0063228607177734375,
|
698 |
+
"SpillPSum": 0.01709151268005371,
|
699 |
+
"SplitAPUnionSets": 0.018975019454956055,
|
700 |
+
"SplitAccGrp": 0.002074003219604492,
|
701 |
+
"StaticProfiler": 0.0037796497344970703,
|
702 |
+
"StaticTransposeLocalTensor": 0.005953311920166016,
|
703 |
+
"SundaISel": 0.0426335334777832,
|
704 |
+
"TCTransform": 0.0011513233184814453,
|
705 |
+
"TensorInitialization": 0.002532958984375,
|
706 |
+
"TensorOpSimplifier": 0.006600379943847656,
|
707 |
+
"TensorOpTransform": 0.034122467041015625,
|
708 |
+
"TileCCOps": 0.0059397220611572266,
|
709 |
+
"TilingProfiler": 0.013670921325683594,
|
710 |
+
"TransformConvOp": 0.002622365951538086,
|
711 |
+
"TritiumFusion": 0.05379676818847656,
|
712 |
+
"ValueNumbering": 0.0030698776245117188,
|
713 |
+
"VectorizeDMA": 0.0016117095947265625,
|
714 |
+
"VectorizeMatMult": 0.005866289138793945,
|
715 |
+
"WeightCoalescing": 0.0026290416717529297,
|
716 |
+
"ZeroSizeTensorElimination": 0.00011897087097167969
|
717 |
+
},
|
718 |
+
"tensorizer": {
|
719 |
+
"DMATilingProfiler::TotalInstructionsAfterTiling": 6049.0,
|
720 |
+
"StaticProfiler::AifUb": 251.7889862060547,
|
721 |
+
"StaticProfiler::ArithmeticIntensityTensorizer": 253.54466247558594,
|
722 |
+
"StaticProfiler::AverageDmaLength": 6385.9599609375,
|
723 |
+
"StaticProfiler::AverageFractalPeUtilization": 100.0,
|
724 |
+
"StaticProfiler::AveragePartitionUtilization": 99.86996459960938,
|
725 |
+
"StaticProfiler::AveragePeUtilization": 100.0,
|
726 |
+
"StaticProfiler::DDRTransferBytes": 204350464.0,
|
727 |
+
"StaticProfiler::InternalTransferBytes": 21430272.0,
|
728 |
+
"StaticProfiler::LoadExpanded": 27520.0,
|
729 |
+
"StaticProfiler::LocalizationEfficiency": 100.69728088378906,
|
730 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 105.00786590576172,
|
731 |
+
"StaticProfiler::StoreExpanded": 2305.0,
|
732 |
+
"StaticProfiler::TotalDMAExpanded": 29825.0,
|
733 |
+
"StaticProfiler::TotalDynamicInstancesCount": 6153.0,
|
734 |
+
"StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 6153.0,
|
735 |
+
"StaticProfiler::TotalLNCComm": 0.0,
|
736 |
+
"StaticProfiler::TotalLNCCommTransfer": 0.0,
|
737 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
|
738 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
|
739 |
+
"TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
|
740 |
+
"TilingProfiler::DmaInstructionsAfterTiling": 0.0,
|
741 |
+
"TilingProfiler::GenericInstructionsAfterTiling": 16.0,
|
742 |
+
"TilingProfiler::MatMultInstructionsAfterTiling": 4848.0,
|
743 |
+
"TilingProfiler::NumPfTransposes": 8.0,
|
744 |
+
"TilingProfiler::NumPfTransposesForIo": 3.0,
|
745 |
+
"TilingProfiler::NumPfTransposesForLocal": 3.0,
|
746 |
+
"TilingProfiler::NumPfTransposesForNonlocal": 2.0,
|
747 |
+
"TilingProfiler::PfTransposeInstructions": 276.0,
|
748 |
+
"TilingProfiler::PfTransposeInstructionsForIo": 68.0,
|
749 |
+
"TilingProfiler::PfTransposeInstructionsForLocal": 80.0,
|
750 |
+
"TilingProfiler::PfTransposeInstructionsForNonlocal": 128.0,
|
751 |
+
"TilingProfiler::ReduceInstructionsAfterTiling": 0.0,
|
752 |
+
"TilingProfiler::SimdInstructionsAfterTiling": 216.0,
|
753 |
+
"TilingProfiler::TotalInstructionsAfterTiling": 0.0,
|
754 |
+
"TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
|
755 |
+
"TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
|
756 |
+
"TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
|
757 |
+
"TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
|
758 |
+
"TransformConvOp::conv2d_column_packing": 0.0,
|
759 |
+
"TransformConvOp::conv2d_column_packing_1": 0.0,
|
760 |
+
"TransformConvOp::conv2d_column_packing_io10": 0.0,
|
761 |
+
"TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
|
762 |
+
}
|
763 |
+
},
|
764 |
+
"sg0002": {
|
765 |
+
"compiletime": {
|
766 |
+
"AGOrderingAnalysisPass": 0.019578933715820313,
|
767 |
+
"AffinePredicateResolution": 0.0019481182098388672,
|
768 |
+
"AliasDependencyElimination": 0.0001239776611328125,
|
769 |
+
"AliasDependencyInduction": 0.00577092170715332,
|
770 |
+
"AliasDependencyReset": 0.027690649032592773,
|
771 |
+
"BFComputeCutting": 0.0023322105407714844,
|
772 |
+
"BirCodeGenLoop": 0.4628438949584961,
|
773 |
+
"CCOpFusion": 0.022275209426879883,
|
774 |
+
"CanonicalizeDAGForPGTiling": 0.005593061447143555,
|
775 |
+
"CanonicalizeIR": 0.001634359359741211,
|
776 |
+
"CoalesceCCOp": 0.015367984771728516,
|
777 |
+
"CommuteConcat": 0.0008616447448730469,
|
778 |
+
"DMALocalityOpt": 0.007138729095458984,
|
779 |
+
"DMAProfiler": 0.011677265167236328,
|
780 |
+
"DMATilingProfiler": 0.0037431716918945313,
|
781 |
+
"DataLocalityOpt": 0.06741714477539063,
|
782 |
+
"DataStreaming": 0.03589940071105957,
|
783 |
+
"DeConcat": 0.0005049705505371094,
|
784 |
+
"DeadCodeElimination": 0.0009002685546875,
|
785 |
+
"DeadStoreElimination": 0.0056514739990234375,
|
786 |
+
"DelinearIndices": 0.004773139953613281,
|
787 |
+
"Delinearization": 0.0026137828826904297,
|
788 |
+
"DoNothing": 7.462501525878906e-05,
|
789 |
+
"DramToDramTranspose": 0.019293546676635742,
|
790 |
+
"DumpGraphAndMetadata": 0.10360383987426758,
|
791 |
+
"EliminateDivs": 0.003831148147583008,
|
792 |
+
"ExpandBatchNorm": 0.0019576549530029297,
|
793 |
+
"ExpandISAMacro": 0.011517524719238281,
|
794 |
+
"FactorizeBlkDims": 0.008472919464111328,
|
795 |
+
"FactorizeThreadAxesInFreeDims": 0.001847982406616211,
|
796 |
+
"FlattenMacroLoop": 0.003529787063598633,
|
797 |
+
"GenericAccessSimplifier": 0.0008223056793212891,
|
798 |
+
"InferInitValue": 0.025947093963623047,
|
799 |
+
"InferIntrinsicOnCC": 0.00908350944519043,
|
800 |
+
"InferNeuronTensor": 0.02371978759765625,
|
801 |
+
"InferNonlocalTensors": 0.014753341674804688,
|
802 |
+
"InferPSumTensor": 0.3085360527038574,
|
803 |
+
"InlineNativeKernels": 0.008690595626831055,
|
804 |
+
"InsertIOTransposes": 0.01906275749206543,
|
805 |
+
"InsertLocalTransposes": 0.004312276840209961,
|
806 |
+
"InsertOffloadedTransposes": 0.002802133560180664,
|
807 |
+
"LICM": 0.003081083297729492,
|
808 |
+
"LateLegalizeInst": 0.01367807388305664,
|
809 |
+
"LateLegalizePostSplit": 0.012533903121948242,
|
810 |
+
"LateLowerReshapeOp": 0.001035451889038086,
|
811 |
+
"LateLowerTensorOp": 0.002605438232421875,
|
812 |
+
"LateNeuronInstComb": 0.008839130401611328,
|
813 |
+
"LayoutPreprocessing": 0.03434133529663086,
|
814 |
+
"LayoutPreprocessingAndAnalysis": 0.07319903373718262,
|
815 |
+
"LayoutRequirementAnalysis": 0.005194187164306641,
|
816 |
+
"LegalizeCCOpLayout": 0.0025322437286376953,
|
817 |
+
"LegalizeOpLevelAlias": 0.0020308494567871094,
|
818 |
+
"LegalizePartitionReduce": 0.0010001659393310547,
|
819 |
+
"LegalizeSundaAccess": 0.07694768905639648,
|
820 |
+
"LegalizeSundaMacro": 0.011176109313964844,
|
821 |
+
"LegalizeType": 0.014355182647705078,
|
822 |
+
"LocalLayoutOpt": 0.014019250869750977,
|
823 |
+
"LoopFusion": 0.005472898483276367,
|
824 |
+
"LoopSplitting": 0.00038623809814453125,
|
825 |
+
"LowerBroadcast": 0.0025022029876708984,
|
826 |
+
"LowerCCOpBlockAxis": 0.0058476924896240234,
|
827 |
+
"LowerComplexBroadcast": 0.00213623046875,
|
828 |
+
"LowerIntrinsics": 0.30684900283813477,
|
829 |
+
"LowerTensorOp": 0.010679960250854492,
|
830 |
+
"LowerTranspose": 0.012311697006225586,
|
831 |
+
"MacroGeneration": 0.029733657836914063,
|
832 |
+
"MaskPropagation": 0.0028328895568847656,
|
833 |
+
"MemcpyElimination": 0.026583433151245117,
|
834 |
+
"MutateDataType": 0.0020093917846679688,
|
835 |
+
"NeuronAliasDependencyInduction": 0.00018548965454101563,
|
836 |
+
"NeuronAliasDependencyReset": 0.02524423599243164,
|
837 |
+
"NeuronInstComb": 0.003789663314819336,
|
838 |
+
"NeuronLICM": 0.03511476516723633,
|
839 |
+
"NeuronLoopFusion": 0.007987260818481445,
|
840 |
+
"NeuronLoopInterchange": 0.0023233890533447266,
|
841 |
+
"NeuronSimplifier": 0.0075054168701171875,
|
842 |
+
"NeuronSimplifyPredicates": 0.11913681030273438,
|
843 |
+
"NeuronValueNumbering": 0.0033991336822509766,
|
844 |
+
"OptimizeAliasedCopyChain": 0.0005936622619628906,
|
845 |
+
"OptimizeNKIKernels": 0.44962644577026367,
|
846 |
+
"PAGLayoutOpt": 0.0999138355255127,
|
847 |
+
"PComputeCutting": 0.005170106887817383,
|
848 |
+
"PGLayoutTilingPipeline": 0.7408750057220459,
|
849 |
+
"PGTiling": 0.29245758056640625,
|
850 |
+
"PadElimination": 0.000308990478515625,
|
851 |
+
"ParAxesAnnotation": 0.05283546447753906,
|
852 |
+
"PartialLoopFusion": 0.0043125152587890625,
|
853 |
+
"PartialSimdFusion": 0.004901885986328125,
|
854 |
+
"PerfectLoopNest": 0.001722574234008789,
|
855 |
+
"RecognizeOpIdiom": 0.004076480865478516,
|
856 |
+
"Recompute": 0.0002620220184326172,
|
857 |
+
"RelaxPredicates": 0.013286828994750977,
|
858 |
+
"Rematerialization": 0.0021238327026367188,
|
859 |
+
"ReshapeWeights": 0.0006799697875976563,
|
860 |
+
"ResolveAccessConflict": 0.0040090084075927734,
|
861 |
+
"ResolveComplicatePredicates": 0.001981496810913086,
|
862 |
+
"RewriteReplicationMatmul": 0.0021796226501464844,
|
863 |
+
"RewriteWeights": 0.0022602081298828125,
|
864 |
+
"SFKVectorizer": 0.27124762535095215,
|
865 |
+
"SimpleAllReduceTiling": 0.00896596908569336,
|
866 |
+
"Simplifier": 0.0046122074127197266,
|
867 |
+
"SimplifyMacroPredicates": 0.010458230972290039,
|
868 |
+
"SimplifyNeuronTensor": 1.0512049198150635,
|
869 |
+
"SimplifySlice": 0.0009145736694335938,
|
870 |
+
"SimplifyTensor": 0.00577855110168457,
|
871 |
+
"SpillPSum": 0.012126922607421875,
|
872 |
+
"SplitAPUnionSets": 0.10518908500671387,
|
873 |
+
"SplitAccGrp": 0.001172780990600586,
|
874 |
+
"StaticProfiler": 0.0124053955078125,
|
875 |
+
"StaticTransposeLocalTensor": 0.0038576126098632813,
|
876 |
+
"SundaISel": 0.04396390914916992,
|
877 |
+
"TCTransform": 0.0018804073333740234,
|
878 |
+
"TensorInitialization": 0.012793779373168945,
|
879 |
+
"TensorOpSimplifier": 0.0045316219329833984,
|
880 |
+
"TensorOpTransform": 0.021115541458129883,
|
881 |
+
"TileCCOps": 0.0056231021881103516,
|
882 |
+
"TilingProfiler": 0.00790858268737793,
|
883 |
+
"TransformConvOp": 0.0030431747436523438,
|
884 |
+
"TritiumFusion": 0.03186154365539551,
|
885 |
+
"ValueNumbering": 0.0038623809814453125,
|
886 |
+
"VectorizeDMA": 0.0021522045135498047,
|
887 |
+
"VectorizeMatMult": 0.003453969955444336,
|
888 |
+
"WeightCoalescing": 0.009035825729370117,
|
889 |
+
"ZeroSizeTensorElimination": 0.00011420249938964844
|
890 |
+
},
|
891 |
+
"tensorizer": {
|
892 |
+
"DMATilingProfiler::TotalInstructionsAfterTiling": 43318.0,
|
893 |
+
"StaticProfiler::AifUb": 154.8094024658203,
|
894 |
+
"StaticProfiler::ArithmeticIntensityTensorizer": 152.92723083496094,
|
895 |
+
"StaticProfiler::AverageDmaLength": 4809.89794921875,
|
896 |
+
"StaticProfiler::AverageFractalPeUtilization": 99.65782165527344,
|
897 |
+
"StaticProfiler::AveragePartitionUtilization": 97.58238220214844,
|
898 |
+
"StaticProfiler::AveragePeUtilization": 98.61824035644531,
|
899 |
+
"StaticProfiler::DDRTransferBytes": 787141440.0,
|
900 |
+
"StaticProfiler::InternalTransferBytes": 634853888.0,
|
901 |
+
"StaticProfiler::LoadExpanded": 98070.0,
|
902 |
+
"StaticProfiler::LocalizationEfficiency": 98.78419494628906,
|
903 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 100.47209167480469,
|
904 |
+
"StaticProfiler::StoreExpanded": 2397.0,
|
905 |
+
"StaticProfiler::TotalDMAExpanded": 100467.0,
|
906 |
+
"StaticProfiler::TotalDynamicInstancesCount": 50670.0,
|
907 |
+
"StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 50224.0,
|
908 |
+
"StaticProfiler::TotalLNCComm": 0.0,
|
909 |
+
"StaticProfiler::TotalLNCCommTransfer": 0.0,
|
910 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
|
911 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
|
912 |
+
"TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
|
913 |
+
"TilingProfiler::DmaInstructionsAfterTiling": 0.0,
|
914 |
+
"TilingProfiler::GenericInstructionsAfterTiling": 4.0,
|
915 |
+
"TilingProfiler::MatMultInstructionsAfterTiling": 22848.0,
|
916 |
+
"TilingProfiler::NumPfTransposes": 5.0,
|
917 |
+
"TilingProfiler::NumPfTransposesForIo": 1.0,
|
918 |
+
"TilingProfiler::NumPfTransposesForLocal": 1.0,
|
919 |
+
"TilingProfiler::NumPfTransposesForNonlocal": 3.0,
|
920 |
+
"TilingProfiler::PfTransposeInstructions": 19201.0,
|
921 |
+
"TilingProfiler::PfTransposeInstructionsForIo": 19008.0,
|
922 |
+
"TilingProfiler::PfTransposeInstructionsForLocal": 1.0,
|
923 |
+
"TilingProfiler::PfTransposeInstructionsForNonlocal": 192.0,
|
924 |
+
"TilingProfiler::ReduceInstructionsAfterTiling": 4.0,
|
925 |
+
"TilingProfiler::SimdInstructionsAfterTiling": 158.0,
|
926 |
+
"TilingProfiler::TotalInstructionsAfterTiling": 0.0,
|
927 |
+
"TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
|
928 |
+
"TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
|
929 |
+
"TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
|
930 |
+
"TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
|
931 |
+
"TransformConvOp::conv2d_column_packing": 0.0,
|
932 |
+
"TransformConvOp::conv2d_column_packing_1": 0.0,
|
933 |
+
"TransformConvOp::conv2d_column_packing_io10": 0.0,
|
934 |
+
"TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
|
935 |
+
}
|
936 |
+
},
|
937 |
+
"sg01": {
|
938 |
+
"compiletime": {
|
939 |
+
"CanonicalizeConv": 1.9999999949504854e-06,
|
940 |
+
"CanonicalizeForTensorizer": 1.4999999621068127e-05,
|
941 |
+
"Canonicalizer": 0.0002589999930933118,
|
942 |
+
"HoistCompute": 4.999999873689376e-06,
|
943 |
+
"IdentifyCrossPassTensors": 7.999999979801942e-06,
|
944 |
+
"MemcastMotion": 7.999999979801942e-06,
|
945 |
+
"PenguinizeFunctions": 1.5999999959603883e-05,
|
946 |
+
"PruneFunctions": 2.099999983329326e-05,
|
947 |
+
"RemoveOptimizationBarriers": 7.999999979801942e-06,
|
948 |
+
"ScatterMotion": 3.7999998312443495e-05,
|
949 |
+
"TensorizerLegalizationPass": 1.9999999494757503e-05,
|
950 |
+
"VerifySupportedOps": 9.999999747378752e-06,
|
951 |
+
"algsimp": 6.199999916134402e-05,
|
952 |
+
"batchnorm_expander": 1.2999999853491317e-05,
|
953 |
+
"boundary-marker-removal": 3.999999989900971e-06,
|
954 |
+
"call-inliner": 9.999999747378752e-06,
|
955 |
+
"canonicalize-boundary-marker": 6.000000212225132e-06,
|
956 |
+
"collective-stream-id-checker": 3.999999989900971e-06,
|
957 |
+
"comparison-expander": 4.999999873689376e-06,
|
958 |
+
"computation-deduplicator": 2.4000000848900527e-05,
|
959 |
+
"conditional-to-select": 4.999999873689376e-06,
|
960 |
+
"config-lowering": 2.099999983329326e-05,
|
961 |
+
"constant_folding": 7.999999979801942e-06,
|
962 |
+
"cse": 1.2000000424450263e-05,
|
963 |
+
"dce": 9.999999974752427e-07,
|
964 |
+
"dynamic-slice-transpose": 3.999999989900971e-06,
|
965 |
+
"eliminate-redundant-compare": 3.999999989900971e-06,
|
966 |
+
"emit-offloaded-dropout": 1.4000000192027073e-05,
|
967 |
+
"flatten-call-graph": 9.000000318337698e-06,
|
968 |
+
"fuse-send-recv": 2.9000000722589903e-05,
|
969 |
+
"hilo::LegalizeAlias": 4.999999873689376e-06,
|
970 |
+
"hilo::NeuronInstCombine": 4.5000000682193786e-05,
|
971 |
+
"hilo::NeuronOpFusion": 1.5999999959603883e-05,
|
972 |
+
"hilo::ReplaceTokenTypeWithU8Pass": 1.1000000085914508e-05,
|
973 |
+
"hilo::ScheduleFusion": 9.999999974752427e-07,
|
974 |
+
"hilo::SixtyFourHack": 1.4999999621068127e-05,
|
975 |
+
"hilo::VerifyAliasing": 1.9999999949504854e-06,
|
976 |
+
"hlo-mac-count": 2.5999999706982635e-05,
|
977 |
+
"hlo-verifier": 0.00020500000391621143,
|
978 |
+
"legalize-ccops": 9.999999974752427e-07,
|
979 |
+
"legalize-compare": 3.999999989900971e-06,
|
980 |
+
"lower-argminmax-custom-call": 3.999999989900971e-06,
|
981 |
+
"map-inline": 1.2000000424450263e-05,
|
982 |
+
"metadata-naming": 2.499999936844688e-05,
|
983 |
+
"mlir::detail::OpToOpPassAdaptor": 0.00012799999967683107,
|
984 |
+
"mlir::hlo::MhloToPyPenguin": 0.0009619999909773469,
|
985 |
+
"mlir::mhlo::LowerComplexExtraPass": 8.099999831756577e-05,
|
986 |
+
"mlir::mhlo::LowerComplexPass": 3.999999989900971e-06,
|
987 |
+
"native-to-custom-softmax": 6.000000212225132e-06,
|
988 |
+
"native-to-custom-softmax-dx": 1.5999999959603883e-05,
|
989 |
+
"operand_upcaster": 2.099999983329326e-05,
|
990 |
+
"post-par-pipe-begin": 4.999999873689376e-06,
|
991 |
+
"post-par-pipe-end": 0.0,
|
992 |
+
"post-partition-simplification": 0.0005779999773949385,
|
993 |
+
"replace-minimum-constant": 6.000000212225132e-06,
|
994 |
+
"reshape-mover": 3.000000106112566e-06,
|
995 |
+
"simplify-concat": 4.8999998398358e-05,
|
996 |
+
"simplify-while-loops": 1.9999999949504854e-06,
|
997 |
+
"transform-variadic-reduce": 9.000000318337698e-06,
|
998 |
+
"tuple-simplifier": 4.999999873689376e-06,
|
999 |
+
"unpack-nested-aws-ntwsr": 3.999999989900971e-06,
|
1000 |
+
"unroll-while-loop": 0.0
|
1001 |
+
},
|
1002 |
+
"hilo": {
|
1003 |
+
"ArithmeticIntensity": 240.22828674316406,
|
1004 |
+
"HloMacCount": 24964497408.0,
|
1005 |
+
"Traffic": 207839776.0
|
1006 |
+
}
|
1007 |
+
},
|
1008 |
+
"sg02": {
|
1009 |
+
"compiletime": {
|
1010 |
+
"CanonicalizeConv": 0.0,
|
1011 |
+
"CanonicalizeForTensorizer": 1.2999999853491317e-05,
|
1012 |
+
"Canonicalizer": 0.000311999989207834,
|
1013 |
+
"HoistCompute": 0.0,
|
1014 |
+
"IdentifyCrossPassTensors": 1.2000000424450263e-05,
|
1015 |
+
"MemcastMotion": 0.0,
|
1016 |
+
"PenguinizeFunctions": 1.1000000085914508e-05,
|
1017 |
+
"PruneFunctions": 7.999999979801942e-06,
|
1018 |
+
"RemoveOptimizationBarriers": 1.2000000424450263e-05,
|
1019 |
+
"ScatterMotion": 0.0,
|
1020 |
+
"TensorizerLegalizationPass": 7.000000096013537e-06,
|
1021 |
+
"VerifySupportedOps": 1.1000000085914508e-05,
|
1022 |
+
"algsimp": 5.999999848427251e-05,
|
1023 |
+
"batchnorm_expander": 1.2999999853491317e-05,
|
1024 |
+
"boundary-marker-removal": 3.999999989900971e-06,
|
1025 |
+
"call-inliner": 1.1000000085914508e-05,
|
1026 |
+
"canonicalize-boundary-marker": 6.000000212225132e-06,
|
1027 |
+
"collective-stream-id-checker": 3.000000106112566e-06,
|
1028 |
+
"comparison-expander": 4.999999873689376e-06,
|
1029 |
+
"computation-deduplicator": 2.499999936844688e-05,
|
1030 |
+
"conditional-to-select": 7.000000096013537e-06,
|
1031 |
+
"config-lowering": 2.5999999706982635e-05,
|
1032 |
+
"constant_folding": 9.000000318337698e-06,
|
1033 |
+
"cse": 1.2000000424450263e-05,
|
1034 |
+
"dce": 9.999999974752427e-07,
|
1035 |
+
"dynamic-slice-transpose": 3.999999989900971e-06,
|
1036 |
+
"eliminate-redundant-compare": 3.999999989900971e-06,
|
1037 |
+
"emit-offloaded-dropout": 1.4000000192027073e-05,
|
1038 |
+
"flatten-call-graph": 1.2000000424450263e-05,
|
1039 |
+
"fuse-send-recv": 1.8000000636675395e-05,
|
1040 |
+
"hilo::LegalizeAlias": 1.9999999949504854e-06,
|
1041 |
+
"hilo::NeuronInstCombine": 1.2999999853491317e-05,
|
1042 |
+
"hilo::NeuronOpFusion": 0.0,
|
1043 |
+
"hilo::ReplaceTokenTypeWithU8Pass": 1.5999999959603883e-05,
|
1044 |
+
"hilo::ScheduleFusion": 0.0,
|
1045 |
+
"hilo::SixtyFourHack": 5.900000178371556e-05,
|
1046 |
+
"hilo::VerifyAliasing": 9.999999974752427e-07,
|
1047 |
+
"hlo-mac-count": 0.0001720000000204891,
|
1048 |
+
"hlo-verifier": 0.0001880000054370612,
|
1049 |
+
"legalize-ccops": 9.999999974752427e-07,
|
1050 |
+
"legalize-compare": 3.000000106112566e-06,
|
1051 |
+
"lower-argminmax-custom-call": 3.999999989900971e-06,
|
1052 |
+
"map-inline": 1.2000000424450263e-05,
|
1053 |
+
"metadata-naming": 1.4999999621068127e-05,
|
1054 |
+
"mlir::detail::OpToOpPassAdaptor": 1.9999999494757503e-05,
|
1055 |
+
"mlir::hlo::MhloToPyPenguin": 0.0008660000166855752,
|
1056 |
+
"mlir::mhlo::LowerComplexExtraPass": 9.100000170292333e-05,
|
1057 |
+
"mlir::mhlo::LowerComplexPass": 0.00010599999950500205,
|
1058 |
+
"native-to-custom-softmax": 6.000000212225132e-06,
|
1059 |
+
"native-to-custom-softmax-dx": 2.300000051036477e-05,
|
1060 |
+
"operand_upcaster": 2.300000051036477e-05,
|
1061 |
+
"post-par-pipe-begin": 1.9999999949504854e-06,
|
1062 |
+
"post-par-pipe-end": 0.0,
|
1063 |
+
"post-partition-simplification": 0.0005329999839887023,
|
1064 |
+
"replace-minimum-constant": 9.000000318337698e-06,
|
1065 |
+
"reshape-mover": 3.000000106112566e-06,
|
1066 |
+
"simplify-concat": 4.400000034365803e-05,
|
1067 |
+
"simplify-while-loops": 1.9999999949504854e-06,
|
1068 |
+
"transform-variadic-reduce": 6.0999998822808266e-05,
|
1069 |
+
"tuple-simplifier": 4.999999873689376e-06,
|
1070 |
+
"unpack-nested-aws-ntwsr": 3.999999989900971e-06,
|
1071 |
+
"unroll-while-loop": 0.0
|
1072 |
+
},
|
1073 |
+
"hilo": {
|
1074 |
+
"ArithmeticIntensity": 50.378170013427734,
|
1075 |
+
"HloMacCount": 19638517760.0,
|
1076 |
+
"Traffic": 779643968.0
|
1077 |
+
}
|
1078 |
+
}
|
1079 |
+
}
|
context_encoding_model/_tp0_bk1/graph.neff
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:60a8e4c285a690a146d149c675038f0498f62f761e4e3893706941d7ca8af583
|
3 |
+
size 1659904
|
context_encoding_model/_tp0_bk1/log-neuron-cc.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
context_encoding_model/_tp0_bk1/metaneff.pb
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:24459c80d98d706b0a4aca22eda28ff6c09f03a08393e76b58ee0ca668d1b851
|
3 |
+
size 1152551
|
context_encoding_model/_tp0_bk1/model.MODULE_2914133a46cb7b4660ab+d7af8a84.hlo_module.pb
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b03debb723d63387ea26771f63729d616ac71a0dbfcb78d21d2194ff723fcbc1
|
3 |
+
size 1229637
|
context_encoding_model/_tp0_bk1/model.MODULE_2914133a46cb7b4660ab+d7af8a84.neff
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:60a8e4c285a690a146d149c675038f0498f62f761e4e3893706941d7ca8af583
|
3 |
+
size 1659904
|
context_encoding_model/_tp0_bk1/neuron_config.json
ADDED
@@ -0,0 +1,220 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_attn_implementation_autoset": false,
|
3 |
+
"_name_or_path": "Qwen/Qwen3-8B",
|
4 |
+
"add_cross_attention": false,
|
5 |
+
"architectures": [
|
6 |
+
"Qwen3ForCausalLM"
|
7 |
+
],
|
8 |
+
"attention_bias": false,
|
9 |
+
"attention_dropout": 0.0,
|
10 |
+
"attribute_map": {},
|
11 |
+
"bad_words_ids": null,
|
12 |
+
"begin_suppress_tokens": null,
|
13 |
+
"bos_token_id": 151643,
|
14 |
+
"chunk_size_feed_forward": 0,
|
15 |
+
"cross_attention_hidden_size": null,
|
16 |
+
"decoder_start_token_id": null,
|
17 |
+
"diversity_penalty": 0.0,
|
18 |
+
"do_sample": false,
|
19 |
+
"early_stopping": false,
|
20 |
+
"encoder_no_repeat_ngram_size": 0,
|
21 |
+
"eos_token_id": 151645,
|
22 |
+
"exponential_decay_length_penalty": null,
|
23 |
+
"finetuning_task": null,
|
24 |
+
"forced_bos_token_id": null,
|
25 |
+
"forced_eos_token_id": null,
|
26 |
+
"fused_spec_config": null,
|
27 |
+
"head_dim": 128,
|
28 |
+
"hidden_act": "silu",
|
29 |
+
"hidden_size": 4096,
|
30 |
+
"id2label": {
|
31 |
+
"0": "LABEL_0",
|
32 |
+
"1": "LABEL_1"
|
33 |
+
},
|
34 |
+
"initializer_range": 0.02,
|
35 |
+
"intermediate_size": 12288,
|
36 |
+
"is_decoder": false,
|
37 |
+
"is_encoder_decoder": false,
|
38 |
+
"label2id": {
|
39 |
+
"LABEL_0": 0,
|
40 |
+
"LABEL_1": 1
|
41 |
+
},
|
42 |
+
"length_penalty": 1.0,
|
43 |
+
"max_length": 20,
|
44 |
+
"max_position_embeddings": 40960,
|
45 |
+
"max_window_layers": 36,
|
46 |
+
"metadata": null,
|
47 |
+
"min_length": 0,
|
48 |
+
"model_type": "qwen3",
|
49 |
+
"neuron_config": {
|
50 |
+
"activation_quantization_type": null,
|
51 |
+
"allow_input_truncation": false,
|
52 |
+
"apply_seq_ids_mask": false,
|
53 |
+
"async_mode": false,
|
54 |
+
"attention_dp_degree": 1,
|
55 |
+
"attention_dtype": null,
|
56 |
+
"attn_block_cte_nki_kernel_enabled": false,
|
57 |
+
"attn_block_tkg_nki_kernel_cache_update": false,
|
58 |
+
"attn_block_tkg_nki_kernel_enabled": false,
|
59 |
+
"attn_cls": {
|
60 |
+
"__module__": "neuronx_distributed_inference.models.qwen3.modeling_qwen3",
|
61 |
+
"__name__": "NeuronQwen3Attention"
|
62 |
+
},
|
63 |
+
"attn_kernel_enabled": null,
|
64 |
+
"attn_tkg_builtin_kernel_enabled": false,
|
65 |
+
"attn_tkg_nki_kernel_enabled": false,
|
66 |
+
"batch_size": 1,
|
67 |
+
"bucket_n_active_tokens": true,
|
68 |
+
"buckets": [
|
69 |
+
256
|
70 |
+
],
|
71 |
+
"cast_type": "config",
|
72 |
+
"cc_pipeline_tiling_factor": 2,
|
73 |
+
"chunked_prefill_config": null,
|
74 |
+
"context_encoding_buckets": [
|
75 |
+
256
|
76 |
+
],
|
77 |
+
"cp_degree": 1,
|
78 |
+
"ctx_batch_size": 1,
|
79 |
+
"disable_kv_cache_tiling": false,
|
80 |
+
"draft_model_modules_to_not_convert": null,
|
81 |
+
"enable_bucketing": true,
|
82 |
+
"enable_eagle_draft_input_norm": false,
|
83 |
+
"enable_eagle_speculation": false,
|
84 |
+
"enable_fused_speculation": false,
|
85 |
+
"enable_long_context_mode": false,
|
86 |
+
"enable_output_completion_notifications": false,
|
87 |
+
"enable_spill_reload_dge": false,
|
88 |
+
"enable_token_tree": false,
|
89 |
+
"ep_degree": 1,
|
90 |
+
"expert_mlp_nki_kernel_enabled": null,
|
91 |
+
"flash_decoding_enabled": false,
|
92 |
+
"fused_qkv": false,
|
93 |
+
"fused_rmsnorm_skip_gamma": false,
|
94 |
+
"is_block_kv_layout": null,
|
95 |
+
"is_chunked_prefill": false,
|
96 |
+
"is_continuous_batching": true,
|
97 |
+
"is_eagle_draft": false,
|
98 |
+
"is_medusa": false,
|
99 |
+
"is_prefill_stage": true,
|
100 |
+
"is_prefix_caching": false,
|
101 |
+
"k_cache_transposed": false,
|
102 |
+
"kv_cache_batch_size": 1,
|
103 |
+
"kv_cache_padding_size": 0,
|
104 |
+
"kv_cache_quant": false,
|
105 |
+
"kv_cache_tiling": false,
|
106 |
+
"layer_boundary_markers": false,
|
107 |
+
"lm_head_pad": false,
|
108 |
+
"lm_head_pad_alignment_size": 1,
|
109 |
+
"local_ranks_size": 2,
|
110 |
+
"logical_nc_config": 1,
|
111 |
+
"lora_config": null,
|
112 |
+
"max_batch_size": 1,
|
113 |
+
"max_context_length": 1024,
|
114 |
+
"max_length": 1024,
|
115 |
+
"max_new_tokens": null,
|
116 |
+
"medusa_speculation_length": 0,
|
117 |
+
"medusa_tree": null,
|
118 |
+
"mlp_kernel_enabled": false,
|
119 |
+
"mlp_kernel_fuse_residual_add": false,
|
120 |
+
"modules_to_not_convert": null,
|
121 |
+
"moe_fused_nki_kernel_enabled": null,
|
122 |
+
"n_active_tokens": 1024,
|
123 |
+
"n_positions": 1024,
|
124 |
+
"num_medusa_heads": 0,
|
125 |
+
"on_cpu": false,
|
126 |
+
"on_device_sampling_config": {
|
127 |
+
"deterministic": false,
|
128 |
+
"do_sample": false,
|
129 |
+
"dynamic": true,
|
130 |
+
"global_topk": 256,
|
131 |
+
"on_device_sampling_config": true,
|
132 |
+
"temperature": 1.0,
|
133 |
+
"top_k": 1,
|
134 |
+
"top_k_kernel_enabled": false,
|
135 |
+
"top_p": 1.0
|
136 |
+
},
|
137 |
+
"output_logits": false,
|
138 |
+
"overrides_torch_dtype": true,
|
139 |
+
"pa_block_size": 1024,
|
140 |
+
"pa_num_blocks": 1,
|
141 |
+
"padding_side": "right",
|
142 |
+
"pp_degree": 1,
|
143 |
+
"prefix_buckets": null,
|
144 |
+
"qk_layernorm": false,
|
145 |
+
"qkv_kernel_enabled": false,
|
146 |
+
"qkv_kernel_fuse_residual_add": false,
|
147 |
+
"qkv_kernel_nbsd_layout": false,
|
148 |
+
"quantization_dtype": "int8",
|
149 |
+
"quantization_type": "per_tensor_symmetric",
|
150 |
+
"quantize_clamp_bound": Infinity,
|
151 |
+
"quantized": false,
|
152 |
+
"quantized_checkpoints_path": null,
|
153 |
+
"quantized_mlp_kernel_enabled": false,
|
154 |
+
"rmsnorm_quantize_kernel_enabled": false,
|
155 |
+
"router_topk_nki_kernel_enabled": null,
|
156 |
+
"rpl_reduce_dtype": null,
|
157 |
+
"save_sharded_checkpoint": true,
|
158 |
+
"scratchpad_page_size": null,
|
159 |
+
"seq_len": 1024,
|
160 |
+
"seq_len_threshold_for_cc_tiling": 16384,
|
161 |
+
"sequence_parallel_enabled": false,
|
162 |
+
"shared_mlp_nki_kernel_enabled": null,
|
163 |
+
"skip_sharding": false,
|
164 |
+
"skip_warmup": false,
|
165 |
+
"spec_batch_size": 1,
|
166 |
+
"speculation_length": 0,
|
167 |
+
"start_rank_id": 0,
|
168 |
+
"target": null,
|
169 |
+
"tile_cc": false,
|
170 |
+
"tkg_batch_size": 1,
|
171 |
+
"token_generation_buckets": null,
|
172 |
+
"token_tree_config": null,
|
173 |
+
"torch_dtype": "bfloat16",
|
174 |
+
"tp_degree": 2,
|
175 |
+
"vocab_parallel": false,
|
176 |
+
"weight_gather_seq_len_threshold": 32768,
|
177 |
+
"weights_to_skip_layout_optimization": [],
|
178 |
+
"world_size": 2
|
179 |
+
},
|
180 |
+
"no_repeat_ngram_size": 0,
|
181 |
+
"num_attention_heads": 32,
|
182 |
+
"num_beam_groups": 1,
|
183 |
+
"num_beams": 1,
|
184 |
+
"num_cores_per_group": 1,
|
185 |
+
"num_hidden_layers": 36,
|
186 |
+
"num_key_value_heads": 8,
|
187 |
+
"num_return_sequences": 1,
|
188 |
+
"output_attentions": false,
|
189 |
+
"output_hidden_states": false,
|
190 |
+
"output_scores": false,
|
191 |
+
"pad_token_id": 0,
|
192 |
+
"prefix": null,
|
193 |
+
"problem_type": null,
|
194 |
+
"pruned_heads": {},
|
195 |
+
"remove_invalid_values": false,
|
196 |
+
"repetition_penalty": 1.0,
|
197 |
+
"return_dict": true,
|
198 |
+
"return_dict_in_generate": false,
|
199 |
+
"rms_norm_eps": 1e-06,
|
200 |
+
"rope_scaling": null,
|
201 |
+
"rope_theta": 1000000,
|
202 |
+
"sep_token_id": null,
|
203 |
+
"sliding_window": null,
|
204 |
+
"suppress_tokens": null,
|
205 |
+
"task_specific_params": null,
|
206 |
+
"temperature": 1.0,
|
207 |
+
"tf_legacy_loss": false,
|
208 |
+
"tie_encoder_decoder": false,
|
209 |
+
"tie_word_embeddings": false,
|
210 |
+
"tokenizer_class": null,
|
211 |
+
"top_k": 50,
|
212 |
+
"top_p": 1.0,
|
213 |
+
"torchscript": false,
|
214 |
+
"transformers_version": "4.51.0",
|
215 |
+
"typical_p": 1.0,
|
216 |
+
"use_bfloat16": false,
|
217 |
+
"use_cache": true,
|
218 |
+
"use_sliding_window": false,
|
219 |
+
"vocab_size": 151936
|
220 |
+
}
|
context_encoding_model/_tp0_bk2/command.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
neuronx-cc compile --framework=XLA model.MODULE_00594b8bc68e927f3dbe+1ad60ced.hlo_module.pb --output model.MODULE_00594b8bc68e927f3dbe+1ad60ced.neff --target=trn1 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ' --lnc=1 -O1 '--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true' --logfile=log-neuron-cc.txt --verbose=35
|
context_encoding_model/_tp0_bk2/compile_flags.MODULE_00594b8bc68e927f3dbe+1ad60ced.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "--lnc=1", "-O1", "--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true", "--logfile=/home/ubuntu/qwen3/context_encoding_model/_tp0_bk2/log-neuron-cc.txt"]
|
context_encoding_model/_tp0_bk2/global_metric_store.json
ADDED
@@ -0,0 +1,1079 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"Average": {
|
3 |
+
"tensorizer": {
|
4 |
+
"StaticProfiler::AverageFractalPeUtilization": 99.66542053222656,
|
5 |
+
"StaticProfiler::AveragePartitionUtilization": 97.7269515991211,
|
6 |
+
"StaticProfiler::AveragePeUtilization": 98.64861297607422,
|
7 |
+
"StaticProfiler::LocalizationEfficiency": 98.26979064941406,
|
8 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 101.01405334472656,
|
9 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
|
10 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 0.0
|
11 |
+
}
|
12 |
+
},
|
13 |
+
"Count": {
|
14 |
+
"tensorizer": {
|
15 |
+
"StaticProfiler::AverageFractalPeUtilization": 1.0,
|
16 |
+
"StaticProfiler::AveragePartitionUtilization": 1.0,
|
17 |
+
"StaticProfiler::AveragePeUtilization": 1.0,
|
18 |
+
"StaticProfiler::LocalizationEfficiency": 1.0,
|
19 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1.0,
|
20 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 1.0,
|
21 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 1.0
|
22 |
+
}
|
23 |
+
},
|
24 |
+
"Sum": {
|
25 |
+
"compiletime": {
|
26 |
+
"AGOrderingAnalysisPass": 0.018257856369018555,
|
27 |
+
"AffinePredicateResolution": 0.0011677742004394531,
|
28 |
+
"AliasDependencyElimination": 0.0001201629638671875,
|
29 |
+
"AliasDependencyInduction": 0.0052988529205322266,
|
30 |
+
"AliasDependencyReset": 0.029210567474365234,
|
31 |
+
"BFComputeCutting": 0.0032625198364257813,
|
32 |
+
"BirCodeGenLoop": 0.4527714252471924,
|
33 |
+
"CCOpFusion": 0.02410125732421875,
|
34 |
+
"CanonicalizeConv": 0.00029399999766610563,
|
35 |
+
"CanonicalizeDAGForPGTiling": 0.004324913024902344,
|
36 |
+
"CanonicalizeForTensorizer": 4.8000001697801054e-05,
|
37 |
+
"CanonicalizeIR": 0.0019502639770507813,
|
38 |
+
"Canonicalizer": 0.0010809999657794833,
|
39 |
+
"CoalesceCCOp": 0.014672040939331055,
|
40 |
+
"CommuteConcat": 0.0008339881896972656,
|
41 |
+
"DMALocalityOpt": 0.005767107009887695,
|
42 |
+
"DMAProfiler": 0.012850046157836914,
|
43 |
+
"DMATilingProfiler": 0.004332065582275391,
|
44 |
+
"DataLocalityOpt": 0.07260942459106445,
|
45 |
+
"DataStreaming": 0.03969836235046387,
|
46 |
+
"DeConcat": 0.0005326271057128906,
|
47 |
+
"DeadCodeElimination": 0.0009255409240722656,
|
48 |
+
"DeadStoreElimination": 0.0055675506591796875,
|
49 |
+
"DelinearIndices": 0.004735231399536133,
|
50 |
+
"Delinearization": 0.0030374526977539063,
|
51 |
+
"DoNothing": 0.00018930435180664063,
|
52 |
+
"DramToDramTranspose": 0.018135547637939453,
|
53 |
+
"DumpGraphAndMetadata": 0.09476375579833984,
|
54 |
+
"EliminateDivs": 0.002595663070678711,
|
55 |
+
"ExpandBatchNorm": 0.002063274383544922,
|
56 |
+
"ExpandISAMacro": 0.011973381042480469,
|
57 |
+
"FactorizeBlkDims": 0.009292840957641602,
|
58 |
+
"FactorizeThreadAxesInFreeDims": 0.0010046958923339844,
|
59 |
+
"FlattenMacroLoop": 0.002232074737548828,
|
60 |
+
"GenericAccessSimplifier": 0.0018167495727539063,
|
61 |
+
"HoistCompute": 7.999999979801942e-06,
|
62 |
+
"IdentifyCrossPassTensors": 7.79999973019585e-05,
|
63 |
+
"InferInitValue": 0.024865150451660156,
|
64 |
+
"InferIntrinsicOnCC": 0.009101152420043945,
|
65 |
+
"InferNeuronTensor": 0.023293495178222656,
|
66 |
+
"InferNonlocalTensors": 0.01632833480834961,
|
67 |
+
"InferPSumTensor": 0.27726316452026367,
|
68 |
+
"InlineNativeKernels": 0.0081634521484375,
|
69 |
+
"InsertIOTransposes": 0.019203901290893555,
|
70 |
+
"InsertLocalTransposes": 0.0042340755462646484,
|
71 |
+
"InsertOffloadedTransposes": 0.002811431884765625,
|
72 |
+
"LICM": 0.0029730796813964844,
|
73 |
+
"LateLegalizeInst": 0.014307022094726563,
|
74 |
+
"LateLegalizePostSplit": 0.012536048889160156,
|
75 |
+
"LateLowerReshapeOp": 0.0018641948699951172,
|
76 |
+
"LateLowerTensorOp": 0.0014081001281738281,
|
77 |
+
"LateNeuronInstComb": 0.00915217399597168,
|
78 |
+
"LayoutPreprocessing": 0.02658390998840332,
|
79 |
+
"LayoutPreprocessingAndAnalysis": 0.10707235336303711,
|
80 |
+
"LayoutRequirementAnalysis": 0.005135536193847656,
|
81 |
+
"LegalizeCCOpLayout": 0.002307415008544922,
|
82 |
+
"LegalizeOpLevelAlias": 0.0012297630310058594,
|
83 |
+
"LegalizePartitionReduce": 0.0010194778442382813,
|
84 |
+
"LegalizeSundaAccess": 0.07808256149291992,
|
85 |
+
"LegalizeSundaMacro": 0.010968446731567383,
|
86 |
+
"LegalizeType": 0.012074947357177734,
|
87 |
+
"LocalLayoutOpt": 0.013799905776977539,
|
88 |
+
"LoopFusion": 0.0052182674407958984,
|
89 |
+
"LoopSplitting": 0.0003161430358886719,
|
90 |
+
"LowerBroadcast": 0.0015821456909179688,
|
91 |
+
"LowerCCOpBlockAxis": 0.0040547847747802734,
|
92 |
+
"LowerComplexBroadcast": 0.002165079116821289,
|
93 |
+
"LowerIntrinsics": 0.31156492233276367,
|
94 |
+
"LowerTensorOp": 0.010558843612670898,
|
95 |
+
"LowerTranspose": 0.012494325637817383,
|
96 |
+
"MacroGeneration": 0.029862642288208008,
|
97 |
+
"MaskPropagation": 0.002757549285888672,
|
98 |
+
"MemcastMotion": 3.400000059627928e-05,
|
99 |
+
"MemcpyElimination": 0.025969266891479492,
|
100 |
+
"MutateDataType": 0.002087831497192383,
|
101 |
+
"NeuronAliasDependencyInduction": 0.00016880035400390625,
|
102 |
+
"NeuronAliasDependencyReset": 0.020352602005004883,
|
103 |
+
"NeuronInstComb": 0.004656076431274414,
|
104 |
+
"NeuronLICM": 0.03560137748718262,
|
105 |
+
"NeuronLoopFusion": 0.007991313934326172,
|
106 |
+
"NeuronLoopInterchange": 0.002409219741821289,
|
107 |
+
"NeuronSimplifier": 0.007069587707519531,
|
108 |
+
"NeuronSimplifyPredicates": 0.12419009208679199,
|
109 |
+
"NeuronValueNumbering": 0.0032753944396972656,
|
110 |
+
"OptimizeAliasedCopyChain": 0.0005936622619628906,
|
111 |
+
"OptimizeNKIKernels": 0.5374257564544678,
|
112 |
+
"PAGLayoutOpt": 0.08115577697753906,
|
113 |
+
"PComputeCutting": 0.004801273345947266,
|
114 |
+
"PGLayoutTilingPipeline": 0.5454635620117188,
|
115 |
+
"PGTiling": 0.14933419227600098,
|
116 |
+
"PadElimination": 0.00034046173095703125,
|
117 |
+
"ParAxesAnnotation": 0.053552865982055664,
|
118 |
+
"PartialLoopFusion": 0.0067539215087890625,
|
119 |
+
"PartialSimdFusion": 0.00693058967590332,
|
120 |
+
"PenguinizeFunctions": 4.5000000682193786e-05,
|
121 |
+
"PerfectLoopNest": 0.0035321712493896484,
|
122 |
+
"PruneFunctions": 5.199999941396527e-05,
|
123 |
+
"RecognizeOpIdiom": 0.003947257995605469,
|
124 |
+
"Recompute": 0.00024962425231933594,
|
125 |
+
"RelaxPredicates": 0.013285398483276367,
|
126 |
+
"Rematerialization": 0.002062082290649414,
|
127 |
+
"RemoveOptimizationBarriers": 8.70000003487803e-05,
|
128 |
+
"ReshapeWeights": 0.002131223678588867,
|
129 |
+
"ResolveAccessConflict": 0.0038597583770751953,
|
130 |
+
"ResolveComplicatePredicates": 0.002032756805419922,
|
131 |
+
"RewriteReplicationMatmul": 0.001924753189086914,
|
132 |
+
"RewriteWeights": 0.002452373504638672,
|
133 |
+
"SFKVectorizer": 0.2718319892883301,
|
134 |
+
"ScatterMotion": 3.7999998312443495e-05,
|
135 |
+
"SimpleAllReduceTiling": 0.008960247039794922,
|
136 |
+
"Simplifier": 0.004038810729980469,
|
137 |
+
"SimplifyMacroPredicates": 0.010622739791870117,
|
138 |
+
"SimplifyNeuronTensor": 1.0594146251678467,
|
139 |
+
"SimplifySlice": 0.0009577274322509766,
|
140 |
+
"SimplifyTensor": 0.005341768264770508,
|
141 |
+
"SpillPSum": 0.012076139450073242,
|
142 |
+
"SplitAPUnionSets": 0.10771751403808594,
|
143 |
+
"SplitAccGrp": 0.002201557159423828,
|
144 |
+
"StaticProfiler": 0.012447118759155273,
|
145 |
+
"StaticTransposeLocalTensor": 0.0038712024688720703,
|
146 |
+
"SundaISel": 0.04214668273925781,
|
147 |
+
"TCTransform": 0.0008432865142822266,
|
148 |
+
"TensorInitialization": 0.012825727462768555,
|
149 |
+
"TensorOpSimplifier": 0.004651308059692383,
|
150 |
+
"TensorOpTransform": 0.019537687301635742,
|
151 |
+
"TensorizerLegalizationPass": 5.7999997807201e-05,
|
152 |
+
"TileCCOps": 0.006766319274902344,
|
153 |
+
"TilingProfiler": 0.006911277770996094,
|
154 |
+
"TransformConvOp": 0.0030303001403808594,
|
155 |
+
"TritiumFusion": 0.04502224922180176,
|
156 |
+
"ValueNumbering": 0.001996755599975586,
|
157 |
+
"VectorizeDMA": 0.0019402503967285156,
|
158 |
+
"VectorizeMatMult": 0.0027413368225097656,
|
159 |
+
"VerifySupportedOps": 3.7000001611886546e-05,
|
160 |
+
"WeightCoalescing": 0.008520841598510742,
|
161 |
+
"ZeroSizeTensorElimination": 0.00013709068298339844,
|
162 |
+
"algsimp": 0.0026940000243484974,
|
163 |
+
"batchnorm_expander": 4.400000034365803e-05,
|
164 |
+
"boundary-marker-removal": 1.5999999959603883e-05,
|
165 |
+
"call-inliner": 0.00046999999904073775,
|
166 |
+
"canonicalize-boundary-marker": 1.8999999156221747e-05,
|
167 |
+
"collective-stream-id-checker": 7.300000288523734e-05,
|
168 |
+
"comparison-expander": 0.0005740000051446259,
|
169 |
+
"computation-deduplicator": 7.999999797903001e-05,
|
170 |
+
"conditional-to-select": 1.8000000636675395e-05,
|
171 |
+
"config-lowering": 0.0003279999946244061,
|
172 |
+
"constant-statistics": 0.0005329999839887023,
|
173 |
+
"constant_folding": 0.0003260000084992498,
|
174 |
+
"cse": 4.5000000682193786e-05,
|
175 |
+
"dce": 8.399999933317304e-05,
|
176 |
+
"dot_decomposer": 0.0013409999664872885,
|
177 |
+
"dynamic-slice-transpose": 1.3999999282532372e-05,
|
178 |
+
"eliminate-redundant-compare": 0.0002959999837912619,
|
179 |
+
"emit-offloaded-dropout": 6.399999983841553e-05,
|
180 |
+
"flatten-call-graph": 0.0009319999953731894,
|
181 |
+
"fuse-send-recv": 6.999999459367245e-05,
|
182 |
+
"hilo::LegalizeAlias": 1.3999999282532372e-05,
|
183 |
+
"hilo::NeuronInstCombine": 0.0001660000125411898,
|
184 |
+
"hilo::NeuronOpFusion": 2.5000001187436283e-05,
|
185 |
+
"hilo::ReplaceTokenTypeWithU8Pass": 5.2999999752501026e-05,
|
186 |
+
"hilo::ScheduleFusion": 7.000000096013537e-06,
|
187 |
+
"hilo::SixtyFourHack": 7.299999560927972e-05,
|
188 |
+
"hilo::VerifyAliasing": 6.000000212225132e-06,
|
189 |
+
"hlo-mac-count": 0.0013429999817162752,
|
190 |
+
"hlo-verifier": 0.007542999926954508,
|
191 |
+
"instruction-histogram": 0.0006709999870508909,
|
192 |
+
"io-con-pipe-begin": 4.999999873689376e-06,
|
193 |
+
"io-con-pipe-end": 9.999999974752427e-07,
|
194 |
+
"io-layout-normalization": 0.001310999970883131,
|
195 |
+
"io-statistics": 8.499999967170879e-05,
|
196 |
+
"legalize-ccops": 3.999999989900971e-06,
|
197 |
+
"legalize-compare": 1.2999999853491317e-05,
|
198 |
+
"lower-argminmax-custom-call": 1.300000076298602e-05,
|
199 |
+
"map-inline": 0.0008850000449456275,
|
200 |
+
"metadata-naming": 5.999999848427251e-05,
|
201 |
+
"mlir::detail::OpToOpPassAdaptor": 0.00014399999054148793,
|
202 |
+
"mlir::hlo::MhloToPyPenguin": 0.004429999738931656,
|
203 |
+
"mlir::mhlo::LowerComplexExtraPass": 0.00027299998328089714,
|
204 |
+
"mlir::mhlo::LowerComplexPass": 0.0004909999552182853,
|
205 |
+
"native-to-custom-softmax": 0.0007070000283420086,
|
206 |
+
"native-to-custom-softmax-dx": 0.0005990000208839774,
|
207 |
+
"operand_upcaster": 4.900000203633681e-05,
|
208 |
+
"opt-barrier-removal": 0.0005510000046342611,
|
209 |
+
"post-par-pipe-begin": 8.999999408842996e-06,
|
210 |
+
"post-par-pipe-end": 0.0,
|
211 |
+
"post-partition-simplification": 0.0018570000538602471,
|
212 |
+
"pre-par-pipe-begin": 9.999999974752427e-07,
|
213 |
+
"pre-par-pipe-end": 0.0,
|
214 |
+
"pre-partition-simplification": 0.12893199920654297,
|
215 |
+
"replace-minimum-constant": 0.0004569999873638153,
|
216 |
+
"reshape-mover": 0.00012599999899975955,
|
217 |
+
"simplify-concat": 0.00015899998834356666,
|
218 |
+
"simplify-while-loops": 0.00010400000610388815,
|
219 |
+
"transform-variadic-reduce": 7.000000186963007e-05,
|
220 |
+
"tuple-simplifier": 0.0003150000120513141,
|
221 |
+
"unpack-nested-aws-ntwsr": 0.0004349999944679439,
|
222 |
+
"unroll-while-loop": 2.099999983329326e-05,
|
223 |
+
"zero_sized_hlo_elimination": 0.0008670000243000686
|
224 |
+
},
|
225 |
+
"hilo": {
|
226 |
+
"ConstantSize": 1189157.0,
|
227 |
+
"HloInputCount": 475.0,
|
228 |
+
"HloMacCount": 101242896384.0,
|
229 |
+
"HloOutputCount": 73.0,
|
230 |
+
"IfmapSize": 8266545152.0,
|
231 |
+
"OfmapSize": 75497472.0,
|
232 |
+
"OutputsReadFromCount": 0.0,
|
233 |
+
"PassthroughTensorsCount": 0.0,
|
234 |
+
"RedundantOutputCount": 0.0,
|
235 |
+
"Traffic": 1692493184.0
|
236 |
+
},
|
237 |
+
"tensorizer": {
|
238 |
+
"DMATilingProfiler::TotalInstructionsAfterTiling": 44382.0,
|
239 |
+
"StaticProfiler::AifUb": 205.154296875,
|
240 |
+
"StaticProfiler::ArithmeticIntensityTensorizer": 201.6046905517578,
|
241 |
+
"StaticProfiler::AverageDmaLength": 1901.806396484375,
|
242 |
+
"StaticProfiler::DDRTransferBytes": 795531072.0,
|
243 |
+
"StaticProfiler::InternalTransferBytes": 646388224.0,
|
244 |
+
"StaticProfiler::LoadExpanded": 376342.0,
|
245 |
+
"StaticProfiler::StoreExpanded": 4189.0,
|
246 |
+
"StaticProfiler::TotalDMAExpanded": 380531.0,
|
247 |
+
"StaticProfiler::TotalDynamicInstancesCount": 53882.0,
|
248 |
+
"StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 53436.0,
|
249 |
+
"StaticProfiler::TotalLNCComm": 0.0,
|
250 |
+
"StaticProfiler::TotalLNCCommTransfer": 0.0,
|
251 |
+
"TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
|
252 |
+
"TilingProfiler::DmaInstructionsAfterTiling": 0.0,
|
253 |
+
"TilingProfiler::GenericInstructionsAfterTiling": 4.0,
|
254 |
+
"TilingProfiler::MatMultInstructionsAfterTiling": 23616.0,
|
255 |
+
"TilingProfiler::NumPfTransposes": 5.0,
|
256 |
+
"TilingProfiler::NumPfTransposesForIo": 1.0,
|
257 |
+
"TilingProfiler::NumPfTransposesForLocal": 1.0,
|
258 |
+
"TilingProfiler::NumPfTransposesForNonlocal": 3.0,
|
259 |
+
"TilingProfiler::PfTransposeInstructions": 19393.0,
|
260 |
+
"TilingProfiler::PfTransposeInstructionsForIo": 19008.0,
|
261 |
+
"TilingProfiler::PfTransposeInstructionsForLocal": 1.0,
|
262 |
+
"TilingProfiler::PfTransposeInstructionsForNonlocal": 384.0,
|
263 |
+
"TilingProfiler::ReduceInstructionsAfterTiling": 4.0,
|
264 |
+
"TilingProfiler::SimdInstructionsAfterTiling": 158.0,
|
265 |
+
"TilingProfiler::TotalInstructionsAfterTiling": 0.0,
|
266 |
+
"TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
|
267 |
+
"TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
|
268 |
+
"TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
|
269 |
+
"TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
|
270 |
+
"TransformConvOp::conv2d_column_packing": 0.0,
|
271 |
+
"TransformConvOp::conv2d_column_packing_1": 0.0,
|
272 |
+
"TransformConvOp::conv2d_column_packing_io10": 0.0,
|
273 |
+
"TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
|
274 |
+
}
|
275 |
+
},
|
276 |
+
"all": {
|
277 |
+
"compiletime": {
|
278 |
+
"algsimp": 0.002466999925673008,
|
279 |
+
"call-inliner": 0.0004360000020824373,
|
280 |
+
"collective-stream-id-checker": 6.299999949987978e-05,
|
281 |
+
"comparison-expander": 0.0005569999921135604,
|
282 |
+
"constant-statistics": 0.0005329999839887023,
|
283 |
+
"constant_folding": 0.0002969999914057553,
|
284 |
+
"dce": 7.999999797903001e-05,
|
285 |
+
"dot_decomposer": 0.0013409999664872885,
|
286 |
+
"eliminate-redundant-compare": 0.00028199999360367656,
|
287 |
+
"flatten-call-graph": 0.0008999999845400453,
|
288 |
+
"hlo-mac-count": 0.0010720000136643648,
|
289 |
+
"hlo-verifier": 0.0069679999724030495,
|
290 |
+
"instruction-histogram": 0.0006709999870508909,
|
291 |
+
"io-con-pipe-begin": 4.999999873689376e-06,
|
292 |
+
"io-con-pipe-end": 9.999999974752427e-07,
|
293 |
+
"io-layout-normalization": 0.001310999970883131,
|
294 |
+
"io-statistics": 8.499999967170879e-05,
|
295 |
+
"map-inline": 0.0008440000237897038,
|
296 |
+
"native-to-custom-softmax": 0.0006750000175088644,
|
297 |
+
"native-to-custom-softmax-dx": 0.0005000000237487257,
|
298 |
+
"opt-barrier-removal": 0.0005510000046342611,
|
299 |
+
"pre-par-pipe-begin": 9.999999974752427e-07,
|
300 |
+
"pre-par-pipe-end": 0.0,
|
301 |
+
"pre-partition-simplification": 0.12893199920654297,
|
302 |
+
"replace-minimum-constant": 0.0004309999931138009,
|
303 |
+
"reshape-mover": 0.00011500000255182385,
|
304 |
+
"simplify-while-loops": 9.600000339560211e-05,
|
305 |
+
"tuple-simplifier": 0.0002969999914057553,
|
306 |
+
"unpack-nested-aws-ntwsr": 0.00042100000428035855,
|
307 |
+
"unroll-while-loop": 1.9999999494757503e-05,
|
308 |
+
"zero_sized_hlo_elimination": 0.0008670000243000686
|
309 |
+
}
|
310 |
+
},
|
311 |
+
"cumsum": {
|
312 |
+
"compiletime": {
|
313 |
+
"CoalesceCCOp": 0.00020885467529296875,
|
314 |
+
"DMALocalityOpt": 0.00016832351684570313,
|
315 |
+
"DMAProfiler": 0.0007588863372802734,
|
316 |
+
"DataStreaming": 0.00029587745666503906,
|
317 |
+
"DoNothing": 0.00011897087097167969,
|
318 |
+
"ExpandISAMacro": 0.0005011558532714844,
|
319 |
+
"FactorizeBlkDims": 0.00043463706970214844,
|
320 |
+
"InferPSumTensor": 0.00044608116149902344,
|
321 |
+
"LateLegalizeInst": 0.0004031658172607422,
|
322 |
+
"LateNeuronInstComb": 0.0005033016204833984,
|
323 |
+
"LegalizeSundaAccess": 0.0021431446075439453,
|
324 |
+
"LegalizeType": 0.00024056434631347656,
|
325 |
+
"LowerBroadcast": 0.00022101402282714844,
|
326 |
+
"LowerIntrinsics": 0.00023508071899414063,
|
327 |
+
"LowerTranspose": 0.0002219676971435547,
|
328 |
+
"NeuronInstComb": 0.0005297660827636719,
|
329 |
+
"NeuronLICM": 0.00041484832763671875,
|
330 |
+
"NeuronSimplifyPredicates": 0.0028023719787597656,
|
331 |
+
"NeuronValueNumbering": 0.00043582916259765625,
|
332 |
+
"SFKVectorizer": 0.002759695053100586,
|
333 |
+
"SimpleAllReduceTiling": 0.00020432472229003906,
|
334 |
+
"SimplifyNeuronTensor": 0.0004029273986816406,
|
335 |
+
"SpillPSum": 0.0005388259887695313,
|
336 |
+
"WeightCoalescing": 0.0002307891845703125
|
337 |
+
}
|
338 |
+
},
|
339 |
+
"sg00": {
|
340 |
+
"compiletime": {
|
341 |
+
"CanonicalizeConv": 2.300000051036477e-05,
|
342 |
+
"CanonicalizeForTensorizer": 2.300000051036477e-05,
|
343 |
+
"Canonicalizer": 0.0005249999812804163,
|
344 |
+
"HoistCompute": 3.000000106112566e-06,
|
345 |
+
"IdentifyCrossPassTensors": 3.099999958067201e-05,
|
346 |
+
"MemcastMotion": 9.999999747378752e-06,
|
347 |
+
"PenguinizeFunctions": 2.2000000171829015e-05,
|
348 |
+
"PruneFunctions": 1.2999999853491317e-05,
|
349 |
+
"RemoveOptimizationBarriers": 4.400000034365803e-05,
|
350 |
+
"ScatterMotion": 6.000000212225132e-06,
|
351 |
+
"TensorizerLegalizationPass": 3.600000127335079e-05,
|
352 |
+
"VerifySupportedOps": 1.700000029813964e-05,
|
353 |
+
"algsimp": 0.0001049999991664663,
|
354 |
+
"batchnorm_expander": 1.8999999156221747e-05,
|
355 |
+
"boundary-marker-removal": 7.000000096013537e-06,
|
356 |
+
"call-inliner": 1.4000000192027073e-05,
|
357 |
+
"canonicalize-boundary-marker": 7.999999979801942e-06,
|
358 |
+
"collective-stream-id-checker": 3.999999989900971e-06,
|
359 |
+
"comparison-expander": 7.000000096013537e-06,
|
360 |
+
"computation-deduplicator": 2.099999983329326e-05,
|
361 |
+
"conditional-to-select": 7.000000096013537e-06,
|
362 |
+
"config-lowering": 0.00027600000612437725,
|
363 |
+
"constant_folding": 1.2000000424450263e-05,
|
364 |
+
"cse": 2.2000000171829015e-05,
|
365 |
+
"dce": 1.9999999949504854e-06,
|
366 |
+
"dynamic-slice-transpose": 6.000000212225132e-06,
|
367 |
+
"eliminate-redundant-compare": 6.000000212225132e-06,
|
368 |
+
"emit-offloaded-dropout": 3.7999998312443495e-05,
|
369 |
+
"flatten-call-graph": 1.2999999853491317e-05,
|
370 |
+
"fuse-send-recv": 3.099999958067201e-05,
|
371 |
+
"hilo::LegalizeAlias": 7.000000096013537e-06,
|
372 |
+
"hilo::NeuronInstCombine": 6.299999949987978e-05,
|
373 |
+
"hilo::NeuronOpFusion": 6.000000212225132e-06,
|
374 |
+
"hilo::ReplaceTokenTypeWithU8Pass": 2.300000051036477e-05,
|
375 |
+
"hilo::ScheduleFusion": 1.9999999949504854e-06,
|
376 |
+
"hilo::SixtyFourHack": 2.099999983329326e-05,
|
377 |
+
"hilo::VerifyAliasing": 3.000000106112566e-06,
|
378 |
+
"hlo-mac-count": 7.300000288523734e-05,
|
379 |
+
"hlo-verifier": 0.00023600000713486224,
|
380 |
+
"legalize-ccops": 1.9999999949504854e-06,
|
381 |
+
"legalize-compare": 6.000000212225132e-06,
|
382 |
+
"lower-argminmax-custom-call": 6.000000212225132e-06,
|
383 |
+
"map-inline": 1.700000029813964e-05,
|
384 |
+
"metadata-naming": 2.499999936844688e-05,
|
385 |
+
"mlir::detail::OpToOpPassAdaptor": 2.2000000171829015e-05,
|
386 |
+
"mlir::hlo::MhloToPyPenguin": 0.002633000025525689,
|
387 |
+
"mlir::mhlo::LowerComplexExtraPass": 0.0001049999991664663,
|
388 |
+
"mlir::mhlo::LowerComplexPass": 0.00017299999308306724,
|
389 |
+
"native-to-custom-softmax": 2.099999983329326e-05,
|
390 |
+
"native-to-custom-softmax-dx": 6.600000051548705e-05,
|
391 |
+
"operand_upcaster": 2.2000000171829015e-05,
|
392 |
+
"post-par-pipe-begin": 4.999999873689376e-06,
|
393 |
+
"post-par-pipe-end": 0.0,
|
394 |
+
"post-partition-simplification": 0.0008430000161752105,
|
395 |
+
"replace-minimum-constant": 1.1000000085914508e-05,
|
396 |
+
"reshape-mover": 4.999999873689376e-06,
|
397 |
+
"simplify-concat": 6.70000008540228e-05,
|
398 |
+
"simplify-while-loops": 3.999999989900971e-06,
|
399 |
+
"transform-variadic-reduce": 1.2999999853491317e-05,
|
400 |
+
"tuple-simplifier": 7.999999979801942e-06,
|
401 |
+
"unpack-nested-aws-ntwsr": 6.000000212225132e-06,
|
402 |
+
"unroll-while-loop": 9.999999974752427e-07
|
403 |
+
},
|
404 |
+
"hilo": {
|
405 |
+
"ArithmeticIntensity": 34.445003509521484,
|
406 |
+
"ConstantSize": 1189157.0,
|
407 |
+
"HloInputCount": 475.0,
|
408 |
+
"HloMacCount": 11811160064.0,
|
409 |
+
"HloOutputCount": 73.0,
|
410 |
+
"IfmapSize": 8266545152.0,
|
411 |
+
"OfmapSize": 75497472.0,
|
412 |
+
"OutputsReadFromCount": 0.0,
|
413 |
+
"PassthroughTensorsCount": 0.0,
|
414 |
+
"RedundantOutputCount": 0.0,
|
415 |
+
"Traffic": 685798208.0
|
416 |
+
}
|
417 |
+
},
|
418 |
+
"sg0000": {
|
419 |
+
"compiletime": {
|
420 |
+
"AGOrderingAnalysisPass": 0.07801461219787598,
|
421 |
+
"AffinePredicateResolution": 0.0017647743225097656,
|
422 |
+
"AliasDependencyElimination": 0.0001277923583984375,
|
423 |
+
"AliasDependencyInduction": 0.00855708122253418,
|
424 |
+
"AliasDependencyReset": 0.08457040786743164,
|
425 |
+
"BFComputeCutting": 0.003294229507446289,
|
426 |
+
"BirCodeGenLoop": 0.05274701118469238,
|
427 |
+
"CCOpFusion": 0.030017614364624023,
|
428 |
+
"CanonicalizeDAGForPGTiling": 0.003341197967529297,
|
429 |
+
"CanonicalizeIR": 0.0022792816162109375,
|
430 |
+
"CoalesceCCOp": 0.0053555965423583984,
|
431 |
+
"CommuteConcat": 0.0023560523986816406,
|
432 |
+
"DMALocalityOpt": 0.0013885498046875,
|
433 |
+
"DMAProfiler": 0.00625157356262207,
|
434 |
+
"DMATilingProfiler": 0.003763914108276367,
|
435 |
+
"DataLocalityOpt": 0.09786868095397949,
|
436 |
+
"DataStreaming": 0.004992246627807617,
|
437 |
+
"DeConcat": 0.002264261245727539,
|
438 |
+
"DeadCodeElimination": 0.002042531967163086,
|
439 |
+
"DeadStoreElimination": 0.030755043029785156,
|
440 |
+
"DelinearIndices": 0.009100914001464844,
|
441 |
+
"Delinearization": 0.004424571990966797,
|
442 |
+
"DoNothing": 6.914138793945313e-05,
|
443 |
+
"DramToDramTranspose": 0.03130936622619629,
|
444 |
+
"DumpGraphAndMetadata": 0.005283832550048828,
|
445 |
+
"EliminateDivs": 0.0042150020599365234,
|
446 |
+
"ExpandBatchNorm": 0.0019366741180419922,
|
447 |
+
"ExpandISAMacro": 0.002724170684814453,
|
448 |
+
"FactorizeBlkDims": 0.011873722076416016,
|
449 |
+
"FactorizeThreadAxesInFreeDims": 0.002283811569213867,
|
450 |
+
"FlattenMacroLoop": 0.0031974315643310547,
|
451 |
+
"GenericAccessSimplifier": 0.002216339111328125,
|
452 |
+
"InferInitValue": 0.030458927154541016,
|
453 |
+
"InferIntrinsicOnCC": 0.011402368545532227,
|
454 |
+
"InferNeuronTensor": 0.04513859748840332,
|
455 |
+
"InferNonlocalTensors": 0.10613727569580078,
|
456 |
+
"InferPSumTensor": 0.037427663803100586,
|
457 |
+
"InlineNativeKernels": 0.00368499755859375,
|
458 |
+
"InsertIOTransposes": 0.012629508972167969,
|
459 |
+
"InsertLocalTransposes": 0.007400989532470703,
|
460 |
+
"InsertOffloadedTransposes": 0.0025758743286132813,
|
461 |
+
"LICM": 0.0031554698944091797,
|
462 |
+
"LateLegalizeInst": 0.005858182907104492,
|
463 |
+
"LateLegalizePostSplit": 0.0029172897338867188,
|
464 |
+
"LateLowerReshapeOp": 0.0018696784973144531,
|
465 |
+
"LateLowerTensorOp": 0.004997968673706055,
|
466 |
+
"LateNeuronInstComb": 0.019808530807495117,
|
467 |
+
"LayoutPreprocessing": 0.04119300842285156,
|
468 |
+
"LayoutPreprocessingAndAnalysis": 0.10642147064208984,
|
469 |
+
"LayoutRequirementAnalysis": 0.0070705413818359375,
|
470 |
+
"LegalizeCCOpLayout": 0.004191398620605469,
|
471 |
+
"LegalizeOpLevelAlias": 0.0015521049499511719,
|
472 |
+
"LegalizePartitionReduce": 0.002257108688354492,
|
473 |
+
"LegalizeSundaAccess": 0.03900027275085449,
|
474 |
+
"LegalizeSundaMacro": 0.010483741760253906,
|
475 |
+
"LegalizeType": 0.0038602352142333984,
|
476 |
+
"LocalLayoutOpt": 0.01764845848083496,
|
477 |
+
"LoopFusion": 0.006066322326660156,
|
478 |
+
"LoopSplitting": 0.0015685558319091797,
|
479 |
+
"LowerBroadcast": 0.0020384788513183594,
|
480 |
+
"LowerCCOpBlockAxis": 0.005359172821044922,
|
481 |
+
"LowerComplexBroadcast": 0.0019440650939941406,
|
482 |
+
"LowerIntrinsics": 0.030491113662719727,
|
483 |
+
"LowerTensorOp": 0.012917041778564453,
|
484 |
+
"LowerTranspose": 0.010635852813720703,
|
485 |
+
"MacroGeneration": 0.06435012817382813,
|
486 |
+
"MaskPropagation": 0.0051097869873046875,
|
487 |
+
"MemcpyElimination": 0.11022067070007324,
|
488 |
+
"MutateDataType": 0.0014224052429199219,
|
489 |
+
"NeuronAliasDependencyInduction": 0.00023031234741210938,
|
490 |
+
"NeuronAliasDependencyReset": 0.021604061126708984,
|
491 |
+
"NeuronInstComb": 0.013072729110717773,
|
492 |
+
"NeuronLICM": 0.01006174087524414,
|
493 |
+
"NeuronLoopFusion": 0.017573833465576172,
|
494 |
+
"NeuronLoopInterchange": 0.0020608901977539063,
|
495 |
+
"NeuronSimplifier": 0.010074615478515625,
|
496 |
+
"NeuronSimplifyPredicates": 0.0060672760009765625,
|
497 |
+
"NeuronValueNumbering": 0.0041046142578125,
|
498 |
+
"OptimizeAliasedCopyChain": 0.0014190673828125,
|
499 |
+
"OptimizeNKIKernels": 0.0021109580993652344,
|
500 |
+
"PAGLayoutOpt": 0.3779466152191162,
|
501 |
+
"PComputeCutting": 0.008729696273803711,
|
502 |
+
"PGLayoutTilingPipeline": 1.5334703922271729,
|
503 |
+
"PGTiling": 0.47260475158691406,
|
504 |
+
"PadElimination": 0.0015625953674316406,
|
505 |
+
"ParAxesAnnotation": 0.2937772274017334,
|
506 |
+
"PartialLoopFusion": 0.016366004943847656,
|
507 |
+
"PartialSimdFusion": 0.01980447769165039,
|
508 |
+
"PerfectLoopNest": 0.0021877288818359375,
|
509 |
+
"RecognizeOpIdiom": 0.004831075668334961,
|
510 |
+
"Recompute": 0.00025010108947753906,
|
511 |
+
"RelaxPredicates": 0.0039484500885009766,
|
512 |
+
"Rematerialization": 0.004274129867553711,
|
513 |
+
"ReshapeWeights": 0.000804901123046875,
|
514 |
+
"ResolveAccessConflict": 0.0038733482360839844,
|
515 |
+
"ResolveComplicatePredicates": 0.0016858577728271484,
|
516 |
+
"RewriteReplicationMatmul": 0.0014014244079589844,
|
517 |
+
"RewriteWeights": 0.00405120849609375,
|
518 |
+
"SFKVectorizer": 0.20196890830993652,
|
519 |
+
"SimpleAllReduceTiling": 0.002203702926635742,
|
520 |
+
"Simplifier": 0.004297018051147461,
|
521 |
+
"SimplifyMacroPredicates": 0.01361393928527832,
|
522 |
+
"SimplifyNeuronTensor": 0.009984970092773438,
|
523 |
+
"SimplifySlice": 0.0010356903076171875,
|
524 |
+
"SimplifyTensor": 0.006205558776855469,
|
525 |
+
"SpillPSum": 0.016466140747070313,
|
526 |
+
"SplitAPUnionSets": 0.029446840286254883,
|
527 |
+
"SplitAccGrp": 0.0020453929901123047,
|
528 |
+
"StaticProfiler": 0.004591464996337891,
|
529 |
+
"StaticTransposeLocalTensor": 0.005173683166503906,
|
530 |
+
"SundaISel": 0.04554462432861328,
|
531 |
+
"TCTransform": 0.002426624298095703,
|
532 |
+
"TensorInitialization": 0.009510517120361328,
|
533 |
+
"TensorOpSimplifier": 0.0067560672760009766,
|
534 |
+
"TensorOpTransform": 0.028885841369628906,
|
535 |
+
"TileCCOps": 0.005466938018798828,
|
536 |
+
"TilingProfiler": 0.013426065444946289,
|
537 |
+
"TransformConvOp": 0.002458810806274414,
|
538 |
+
"TritiumFusion": 0.0620732307434082,
|
539 |
+
"ValueNumbering": 0.002520322799682617,
|
540 |
+
"VectorizeDMA": 0.005783796310424805,
|
541 |
+
"VectorizeMatMult": 0.005175352096557617,
|
542 |
+
"WeightCoalescing": 0.0029850006103515625,
|
543 |
+
"ZeroSizeTensorElimination": 0.00011801719665527344
|
544 |
+
},
|
545 |
+
"tensorizer": {
|
546 |
+
"DMATilingProfiler::TotalInstructionsAfterTiling": 2597.0,
|
547 |
+
"StaticProfiler::AifUb": 40.028141021728516,
|
548 |
+
"StaticProfiler::ArithmeticIntensityTensorizer": 420.0349426269531,
|
549 |
+
"StaticProfiler::AverageDmaLength": 1921.007568359375,
|
550 |
+
"StaticProfiler::AverageFractalPeUtilization": 99.95317840576172,
|
551 |
+
"StaticProfiler::AveragePartitionUtilization": 99.87249755859375,
|
552 |
+
"StaticProfiler::AveragePeUtilization": 99.80845642089844,
|
553 |
+
"StaticProfiler::DDRTransferBytes": 64558336.0,
|
554 |
+
"StaticProfiler::InternalTransferBytes": 52297728.0,
|
555 |
+
"StaticProfiler::LoadExpanded": 23298.0,
|
556 |
+
"StaticProfiler::LocalizationEfficiency": 1049.3489990234375,
|
557 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1358.191162109375,
|
558 |
+
"StaticProfiler::StoreExpanded": 5505.0,
|
559 |
+
"StaticProfiler::TotalDMAExpanded": 28803.0,
|
560 |
+
"StaticProfiler::TotalDynamicInstancesCount": 3692.0,
|
561 |
+
"StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 3689.0,
|
562 |
+
"StaticProfiler::TotalLNCComm": 0.0,
|
563 |
+
"StaticProfiler::TotalLNCCommTransfer": 0.0,
|
564 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
|
565 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
|
566 |
+
"TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
|
567 |
+
"TilingProfiler::DmaInstructionsAfterTiling": 0.0,
|
568 |
+
"TilingProfiler::GenericInstructionsAfterTiling": 48.0,
|
569 |
+
"TilingProfiler::MatMultInstructionsAfterTiling": 1412.0,
|
570 |
+
"TilingProfiler::NumPfTransposes": 7.0,
|
571 |
+
"TilingProfiler::NumPfTransposesForIo": 1.0,
|
572 |
+
"TilingProfiler::NumPfTransposesForLocal": 5.0,
|
573 |
+
"TilingProfiler::NumPfTransposesForNonlocal": 1.0,
|
574 |
+
"TilingProfiler::PfTransposeInstructions": 608.0,
|
575 |
+
"TilingProfiler::PfTransposeInstructionsForIo": 128.0,
|
576 |
+
"TilingProfiler::PfTransposeInstructionsForLocal": 416.0,
|
577 |
+
"TilingProfiler::PfTransposeInstructionsForNonlocal": 64.0,
|
578 |
+
"TilingProfiler::ReduceInstructionsAfterTiling": 0.0,
|
579 |
+
"TilingProfiler::SimdInstructionsAfterTiling": 257.0,
|
580 |
+
"TilingProfiler::TotalInstructionsAfterTiling": 0.0,
|
581 |
+
"TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
|
582 |
+
"TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
|
583 |
+
"TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
|
584 |
+
"TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
|
585 |
+
"TransformConvOp::conv2d_column_packing": 0.0,
|
586 |
+
"TransformConvOp::conv2d_column_packing_1": 0.0,
|
587 |
+
"TransformConvOp::conv2d_column_packing_io10": 0.0,
|
588 |
+
"TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
|
589 |
+
}
|
590 |
+
},
|
591 |
+
"sg0001": {
|
592 |
+
"compiletime": {
|
593 |
+
"AGOrderingAnalysisPass": 0.03313565254211426,
|
594 |
+
"AffinePredicateResolution": 0.0015239715576171875,
|
595 |
+
"AliasDependencyElimination": 0.00011467933654785156,
|
596 |
+
"AliasDependencyInduction": 0.009088993072509766,
|
597 |
+
"AliasDependencyReset": 1.062025547027588,
|
598 |
+
"BFComputeCutting": 0.0024559497833251953,
|
599 |
+
"BirCodeGenLoop": 0.03748297691345215,
|
600 |
+
"CCOpFusion": 0.04092240333557129,
|
601 |
+
"CanonicalizeDAGForPGTiling": 0.004329681396484375,
|
602 |
+
"CanonicalizeIR": 0.002464771270751953,
|
603 |
+
"CoalesceCCOp": 0.004778146743774414,
|
604 |
+
"CommuteConcat": 0.0011680126190185547,
|
605 |
+
"DMALocalityOpt": 0.0016834735870361328,
|
606 |
+
"DMAProfiler": 0.0039997100830078125,
|
607 |
+
"DMATilingProfiler": 0.004555702209472656,
|
608 |
+
"DataLocalityOpt": 0.13762187957763672,
|
609 |
+
"DataStreaming": 0.0044286251068115234,
|
610 |
+
"DeConcat": 0.0015981197357177734,
|
611 |
+
"DeadCodeElimination": 0.0020780563354492188,
|
612 |
+
"DeadStoreElimination": 0.03435230255126953,
|
613 |
+
"DelinearIndices": 0.00969839096069336,
|
614 |
+
"Delinearization": 0.0038826465606689453,
|
615 |
+
"DoNothing": 9.846687316894531e-05,
|
616 |
+
"DramToDramTranspose": 0.03438973426818848,
|
617 |
+
"DumpGraphAndMetadata": 0.00426793098449707,
|
618 |
+
"EliminateDivs": 0.004217386245727539,
|
619 |
+
"ExpandBatchNorm": 0.0019202232360839844,
|
620 |
+
"ExpandISAMacro": 0.0024042129516601563,
|
621 |
+
"FactorizeBlkDims": 0.01425933837890625,
|
622 |
+
"FactorizeThreadAxesInFreeDims": 0.0026972293853759766,
|
623 |
+
"FlattenMacroLoop": 0.002768993377685547,
|
624 |
+
"GenericAccessSimplifier": 0.001058816909790039,
|
625 |
+
"InferInitValue": 0.03559255599975586,
|
626 |
+
"InferIntrinsicOnCC": 0.009636163711547852,
|
627 |
+
"InferNeuronTensor": 0.04922318458557129,
|
628 |
+
"InferNonlocalTensors": 0.030732393264770508,
|
629 |
+
"InferPSumTensor": 0.03249359130859375,
|
630 |
+
"InlineNativeKernels": 0.0014734268188476563,
|
631 |
+
"InsertIOTransposes": 0.021765470504760742,
|
632 |
+
"InsertLocalTransposes": 0.006593465805053711,
|
633 |
+
"InsertOffloadedTransposes": 0.0034906864166259766,
|
634 |
+
"LICM": 0.003262758255004883,
|
635 |
+
"LateLegalizeInst": 0.00400543212890625,
|
636 |
+
"LateLegalizePostSplit": 0.00289154052734375,
|
637 |
+
"LateLowerReshapeOp": 0.002287149429321289,
|
638 |
+
"LateLowerTensorOp": 0.0046651363372802734,
|
639 |
+
"LateNeuronInstComb": 0.019269704818725586,
|
640 |
+
"LayoutPreprocessing": 0.03711414337158203,
|
641 |
+
"LayoutPreprocessingAndAnalysis": 0.2516040802001953,
|
642 |
+
"LayoutRequirementAnalysis": 0.007753133773803711,
|
643 |
+
"LegalizeCCOpLayout": 0.003732919692993164,
|
644 |
+
"LegalizeOpLevelAlias": 0.0016019344329833984,
|
645 |
+
"LegalizePartitionReduce": 0.0020945072174072266,
|
646 |
+
"LegalizeSundaAccess": 0.016069650650024414,
|
647 |
+
"LegalizeSundaMacro": 0.010806083679199219,
|
648 |
+
"LegalizeType": 0.004706859588623047,
|
649 |
+
"LocalLayoutOpt": 0.02442765235900879,
|
650 |
+
"LoopFusion": 0.0067822933197021484,
|
651 |
+
"LoopSplitting": 0.00033974647521972656,
|
652 |
+
"LowerBroadcast": 0.0019419193267822266,
|
653 |
+
"LowerCCOpBlockAxis": 0.005570650100708008,
|
654 |
+
"LowerComplexBroadcast": 0.0020999908447265625,
|
655 |
+
"LowerIntrinsics": 0.03607368469238281,
|
656 |
+
"LowerTensorOp": 0.011876583099365234,
|
657 |
+
"LowerTranspose": 0.011530637741088867,
|
658 |
+
"MacroGeneration": 0.10653066635131836,
|
659 |
+
"MaskPropagation": 0.003092050552368164,
|
660 |
+
"MemcpyElimination": 0.10495471954345703,
|
661 |
+
"MutateDataType": 0.0014193058013916016,
|
662 |
+
"NeuronAliasDependencyInduction": 0.0002295970916748047,
|
663 |
+
"NeuronAliasDependencyReset": 0.021070480346679688,
|
664 |
+
"NeuronInstComb": 0.012903451919555664,
|
665 |
+
"NeuronLICM": 0.00844264030456543,
|
666 |
+
"NeuronLoopFusion": 0.020880460739135742,
|
667 |
+
"NeuronLoopInterchange": 0.0021686553955078125,
|
668 |
+
"NeuronSimplifier": 0.011090755462646484,
|
669 |
+
"NeuronSimplifyPredicates": 0.0016274452209472656,
|
670 |
+
"NeuronValueNumbering": 0.004062652587890625,
|
671 |
+
"OptimizeAliasedCopyChain": 0.0014641284942626953,
|
672 |
+
"OptimizeNKIKernels": 0.0023856163024902344,
|
673 |
+
"PAGLayoutOpt": 0.17638587951660156,
|
674 |
+
"PComputeCutting": 0.00709986686706543,
|
675 |
+
"PGLayoutTilingPipeline": 1.142796516418457,
|
676 |
+
"PGTiling": 0.39766955375671387,
|
677 |
+
"PadElimination": 0.0015380382537841797,
|
678 |
+
"ParAxesAnnotation": 0.09186458587646484,
|
679 |
+
"PartialLoopFusion": 0.015995025634765625,
|
680 |
+
"PartialSimdFusion": 0.026766300201416016,
|
681 |
+
"PerfectLoopNest": 0.002192258834838867,
|
682 |
+
"RecognizeOpIdiom": 0.004943370819091797,
|
683 |
+
"Recompute": 0.00025773048400878906,
|
684 |
+
"RelaxPredicates": 0.003591299057006836,
|
685 |
+
"Rematerialization": 0.0025196075439453125,
|
686 |
+
"ReshapeWeights": 0.0007069110870361328,
|
687 |
+
"ResolveAccessConflict": 0.00481104850769043,
|
688 |
+
"ResolveComplicatePredicates": 0.002285003662109375,
|
689 |
+
"RewriteReplicationMatmul": 0.0021715164184570313,
|
690 |
+
"RewriteWeights": 0.003401041030883789,
|
691 |
+
"SFKVectorizer": 0.14661574363708496,
|
692 |
+
"SimpleAllReduceTiling": 0.0016207695007324219,
|
693 |
+
"Simplifier": 0.00443577766418457,
|
694 |
+
"SimplifyMacroPredicates": 0.006165742874145508,
|
695 |
+
"SimplifyNeuronTensor": 0.006829500198364258,
|
696 |
+
"SimplifySlice": 0.0013000965118408203,
|
697 |
+
"SimplifyTensor": 0.0061337947845458984,
|
698 |
+
"SpillPSum": 0.018761634826660156,
|
699 |
+
"SplitAPUnionSets": 0.017923593521118164,
|
700 |
+
"SplitAccGrp": 0.002531290054321289,
|
701 |
+
"StaticProfiler": 0.003990888595581055,
|
702 |
+
"StaticTransposeLocalTensor": 0.004915952682495117,
|
703 |
+
"SundaISel": 0.04209589958190918,
|
704 |
+
"TCTransform": 0.0012347698211669922,
|
705 |
+
"TensorInitialization": 0.002599954605102539,
|
706 |
+
"TensorOpSimplifier": 0.006845712661743164,
|
707 |
+
"TensorOpTransform": 0.03345227241516113,
|
708 |
+
"TileCCOps": 0.005617856979370117,
|
709 |
+
"TilingProfiler": 0.015013933181762695,
|
710 |
+
"TransformConvOp": 0.002393960952758789,
|
711 |
+
"TritiumFusion": 0.09340715408325195,
|
712 |
+
"ValueNumbering": 0.0031540393829345703,
|
713 |
+
"VectorizeDMA": 0.0015842914581298828,
|
714 |
+
"VectorizeMatMult": 0.0071103572845458984,
|
715 |
+
"WeightCoalescing": 0.0026235580444335938,
|
716 |
+
"ZeroSizeTensorElimination": 0.0001163482666015625
|
717 |
+
},
|
718 |
+
"tensorizer": {
|
719 |
+
"DMATilingProfiler::TotalInstructionsAfterTiling": 7847.0,
|
720 |
+
"StaticProfiler::AifUb": 490.6532287597656,
|
721 |
+
"StaticProfiler::ArithmeticIntensityTensorizer": 487.63507080078125,
|
722 |
+
"StaticProfiler::AverageDmaLength": 869.1515502929688,
|
723 |
+
"StaticProfiler::AverageFractalPeUtilization": 100.0,
|
724 |
+
"StaticProfiler::AveragePartitionUtilization": 99.83790588378906,
|
725 |
+
"StaticProfiler::AveragePeUtilization": 100.0,
|
726 |
+
"StaticProfiler::DDRTransferBytes": 215827456.0,
|
727 |
+
"StaticProfiler::InternalTransferBytes": 43515904.0,
|
728 |
+
"StaticProfiler::LoadExpanded": 238976.0,
|
729 |
+
"StaticProfiler::LocalizationEfficiency": 99.38487243652344,
|
730 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 107.76165771484375,
|
731 |
+
"StaticProfiler::StoreExpanded": 5121.0,
|
732 |
+
"StaticProfiler::TotalDMAExpanded": 244097.0,
|
733 |
+
"StaticProfiler::TotalDynamicInstancesCount": 9872.0,
|
734 |
+
"StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 9872.0,
|
735 |
+
"StaticProfiler::TotalLNCComm": 0.0,
|
736 |
+
"StaticProfiler::TotalLNCCommTransfer": 0.0,
|
737 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
|
738 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
|
739 |
+
"TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
|
740 |
+
"TilingProfiler::DmaInstructionsAfterTiling": 0.0,
|
741 |
+
"TilingProfiler::GenericInstructionsAfterTiling": 32.0,
|
742 |
+
"TilingProfiler::MatMultInstructionsAfterTiling": 6016.0,
|
743 |
+
"TilingProfiler::NumPfTransposes": 8.0,
|
744 |
+
"TilingProfiler::NumPfTransposesForIo": 3.0,
|
745 |
+
"TilingProfiler::NumPfTransposesForLocal": 3.0,
|
746 |
+
"TilingProfiler::NumPfTransposesForNonlocal": 2.0,
|
747 |
+
"TilingProfiler::PfTransposeInstructions": 680.0,
|
748 |
+
"TilingProfiler::PfTransposeInstructionsForIo": 136.0,
|
749 |
+
"TilingProfiler::PfTransposeInstructionsForLocal": 288.0,
|
750 |
+
"TilingProfiler::PfTransposeInstructionsForNonlocal": 256.0,
|
751 |
+
"TilingProfiler::ReduceInstructionsAfterTiling": 0.0,
|
752 |
+
"TilingProfiler::SimdInstructionsAfterTiling": 288.0,
|
753 |
+
"TilingProfiler::TotalInstructionsAfterTiling": 0.0,
|
754 |
+
"TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
|
755 |
+
"TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
|
756 |
+
"TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
|
757 |
+
"TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
|
758 |
+
"TransformConvOp::conv2d_column_packing": 0.0,
|
759 |
+
"TransformConvOp::conv2d_column_packing_1": 0.0,
|
760 |
+
"TransformConvOp::conv2d_column_packing_io10": 0.0,
|
761 |
+
"TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
|
762 |
+
}
|
763 |
+
},
|
764 |
+
"sg0002": {
|
765 |
+
"compiletime": {
|
766 |
+
"AGOrderingAnalysisPass": 0.018257856369018555,
|
767 |
+
"AffinePredicateResolution": 0.0011677742004394531,
|
768 |
+
"AliasDependencyElimination": 0.0001201629638671875,
|
769 |
+
"AliasDependencyInduction": 0.0052988529205322266,
|
770 |
+
"AliasDependencyReset": 0.029210567474365234,
|
771 |
+
"BFComputeCutting": 0.0032625198364257813,
|
772 |
+
"BirCodeGenLoop": 0.4527714252471924,
|
773 |
+
"CCOpFusion": 0.02410125732421875,
|
774 |
+
"CanonicalizeDAGForPGTiling": 0.004324913024902344,
|
775 |
+
"CanonicalizeIR": 0.0019502639770507813,
|
776 |
+
"CoalesceCCOp": 0.014463186264038086,
|
777 |
+
"CommuteConcat": 0.0008339881896972656,
|
778 |
+
"DMALocalityOpt": 0.005598783493041992,
|
779 |
+
"DMAProfiler": 0.01209115982055664,
|
780 |
+
"DMATilingProfiler": 0.004332065582275391,
|
781 |
+
"DataLocalityOpt": 0.07260942459106445,
|
782 |
+
"DataStreaming": 0.03940248489379883,
|
783 |
+
"DeConcat": 0.0005326271057128906,
|
784 |
+
"DeadCodeElimination": 0.0009255409240722656,
|
785 |
+
"DeadStoreElimination": 0.0055675506591796875,
|
786 |
+
"DelinearIndices": 0.004735231399536133,
|
787 |
+
"Delinearization": 0.0030374526977539063,
|
788 |
+
"DoNothing": 7.033348083496094e-05,
|
789 |
+
"DramToDramTranspose": 0.018135547637939453,
|
790 |
+
"DumpGraphAndMetadata": 0.09476375579833984,
|
791 |
+
"EliminateDivs": 0.002595663070678711,
|
792 |
+
"ExpandBatchNorm": 0.002063274383544922,
|
793 |
+
"ExpandISAMacro": 0.011472225189208984,
|
794 |
+
"FactorizeBlkDims": 0.008858203887939453,
|
795 |
+
"FactorizeThreadAxesInFreeDims": 0.0010046958923339844,
|
796 |
+
"FlattenMacroLoop": 0.002232074737548828,
|
797 |
+
"GenericAccessSimplifier": 0.0018167495727539063,
|
798 |
+
"InferInitValue": 0.024865150451660156,
|
799 |
+
"InferIntrinsicOnCC": 0.009101152420043945,
|
800 |
+
"InferNeuronTensor": 0.023293495178222656,
|
801 |
+
"InferNonlocalTensors": 0.01632833480834961,
|
802 |
+
"InferPSumTensor": 0.27681708335876465,
|
803 |
+
"InlineNativeKernels": 0.0081634521484375,
|
804 |
+
"InsertIOTransposes": 0.019203901290893555,
|
805 |
+
"InsertLocalTransposes": 0.0042340755462646484,
|
806 |
+
"InsertOffloadedTransposes": 0.002811431884765625,
|
807 |
+
"LICM": 0.0029730796813964844,
|
808 |
+
"LateLegalizeInst": 0.01390385627746582,
|
809 |
+
"LateLegalizePostSplit": 0.012536048889160156,
|
810 |
+
"LateLowerReshapeOp": 0.0018641948699951172,
|
811 |
+
"LateLowerTensorOp": 0.0014081001281738281,
|
812 |
+
"LateNeuronInstComb": 0.008648872375488281,
|
813 |
+
"LayoutPreprocessing": 0.02658390998840332,
|
814 |
+
"LayoutPreprocessingAndAnalysis": 0.10707235336303711,
|
815 |
+
"LayoutRequirementAnalysis": 0.005135536193847656,
|
816 |
+
"LegalizeCCOpLayout": 0.002307415008544922,
|
817 |
+
"LegalizeOpLevelAlias": 0.0012297630310058594,
|
818 |
+
"LegalizePartitionReduce": 0.0010194778442382813,
|
819 |
+
"LegalizeSundaAccess": 0.07593941688537598,
|
820 |
+
"LegalizeSundaMacro": 0.010968446731567383,
|
821 |
+
"LegalizeType": 0.011834383010864258,
|
822 |
+
"LocalLayoutOpt": 0.013799905776977539,
|
823 |
+
"LoopFusion": 0.0052182674407958984,
|
824 |
+
"LoopSplitting": 0.0003161430358886719,
|
825 |
+
"LowerBroadcast": 0.0013611316680908203,
|
826 |
+
"LowerCCOpBlockAxis": 0.0040547847747802734,
|
827 |
+
"LowerComplexBroadcast": 0.002165079116821289,
|
828 |
+
"LowerIntrinsics": 0.31132984161376953,
|
829 |
+
"LowerTensorOp": 0.010558843612670898,
|
830 |
+
"LowerTranspose": 0.012272357940673828,
|
831 |
+
"MacroGeneration": 0.029862642288208008,
|
832 |
+
"MaskPropagation": 0.002757549285888672,
|
833 |
+
"MemcpyElimination": 0.025969266891479492,
|
834 |
+
"MutateDataType": 0.002087831497192383,
|
835 |
+
"NeuronAliasDependencyInduction": 0.00016880035400390625,
|
836 |
+
"NeuronAliasDependencyReset": 0.020352602005004883,
|
837 |
+
"NeuronInstComb": 0.004126310348510742,
|
838 |
+
"NeuronLICM": 0.0351865291595459,
|
839 |
+
"NeuronLoopFusion": 0.007991313934326172,
|
840 |
+
"NeuronLoopInterchange": 0.002409219741821289,
|
841 |
+
"NeuronSimplifier": 0.007069587707519531,
|
842 |
+
"NeuronSimplifyPredicates": 0.12138772010803223,
|
843 |
+
"NeuronValueNumbering": 0.0028395652770996094,
|
844 |
+
"OptimizeAliasedCopyChain": 0.0005936622619628906,
|
845 |
+
"OptimizeNKIKernels": 0.5374257564544678,
|
846 |
+
"PAGLayoutOpt": 0.08115577697753906,
|
847 |
+
"PComputeCutting": 0.004801273345947266,
|
848 |
+
"PGLayoutTilingPipeline": 0.5454635620117188,
|
849 |
+
"PGTiling": 0.14933419227600098,
|
850 |
+
"PadElimination": 0.00034046173095703125,
|
851 |
+
"ParAxesAnnotation": 0.053552865982055664,
|
852 |
+
"PartialLoopFusion": 0.0067539215087890625,
|
853 |
+
"PartialSimdFusion": 0.00693058967590332,
|
854 |
+
"PerfectLoopNest": 0.0035321712493896484,
|
855 |
+
"RecognizeOpIdiom": 0.003947257995605469,
|
856 |
+
"Recompute": 0.00024962425231933594,
|
857 |
+
"RelaxPredicates": 0.013285398483276367,
|
858 |
+
"Rematerialization": 0.002062082290649414,
|
859 |
+
"ReshapeWeights": 0.002131223678588867,
|
860 |
+
"ResolveAccessConflict": 0.0038597583770751953,
|
861 |
+
"ResolveComplicatePredicates": 0.002032756805419922,
|
862 |
+
"RewriteReplicationMatmul": 0.001924753189086914,
|
863 |
+
"RewriteWeights": 0.002452373504638672,
|
864 |
+
"SFKVectorizer": 0.2690722942352295,
|
865 |
+
"SimpleAllReduceTiling": 0.008755922317504883,
|
866 |
+
"Simplifier": 0.004038810729980469,
|
867 |
+
"SimplifyMacroPredicates": 0.010622739791870117,
|
868 |
+
"SimplifyNeuronTensor": 1.059011697769165,
|
869 |
+
"SimplifySlice": 0.0009577274322509766,
|
870 |
+
"SimplifyTensor": 0.005341768264770508,
|
871 |
+
"SpillPSum": 0.011537313461303711,
|
872 |
+
"SplitAPUnionSets": 0.10771751403808594,
|
873 |
+
"SplitAccGrp": 0.002201557159423828,
|
874 |
+
"StaticProfiler": 0.012447118759155273,
|
875 |
+
"StaticTransposeLocalTensor": 0.0038712024688720703,
|
876 |
+
"SundaISel": 0.04214668273925781,
|
877 |
+
"TCTransform": 0.0008432865142822266,
|
878 |
+
"TensorInitialization": 0.012825727462768555,
|
879 |
+
"TensorOpSimplifier": 0.004651308059692383,
|
880 |
+
"TensorOpTransform": 0.019537687301635742,
|
881 |
+
"TileCCOps": 0.006766319274902344,
|
882 |
+
"TilingProfiler": 0.006911277770996094,
|
883 |
+
"TransformConvOp": 0.0030303001403808594,
|
884 |
+
"TritiumFusion": 0.04502224922180176,
|
885 |
+
"ValueNumbering": 0.001996755599975586,
|
886 |
+
"VectorizeDMA": 0.0019402503967285156,
|
887 |
+
"VectorizeMatMult": 0.0027413368225097656,
|
888 |
+
"WeightCoalescing": 0.00829005241394043,
|
889 |
+
"ZeroSizeTensorElimination": 0.00013709068298339844
|
890 |
+
},
|
891 |
+
"tensorizer": {
|
892 |
+
"DMATilingProfiler::TotalInstructionsAfterTiling": 44382.0,
|
893 |
+
"StaticProfiler::AifUb": 205.154296875,
|
894 |
+
"StaticProfiler::ArithmeticIntensityTensorizer": 201.6046905517578,
|
895 |
+
"StaticProfiler::AverageDmaLength": 1901.806396484375,
|
896 |
+
"StaticProfiler::AverageFractalPeUtilization": 99.66542053222656,
|
897 |
+
"StaticProfiler::AveragePartitionUtilization": 97.7269515991211,
|
898 |
+
"StaticProfiler::AveragePeUtilization": 98.64861297607422,
|
899 |
+
"StaticProfiler::DDRTransferBytes": 795531072.0,
|
900 |
+
"StaticProfiler::InternalTransferBytes": 646388224.0,
|
901 |
+
"StaticProfiler::LoadExpanded": 376342.0,
|
902 |
+
"StaticProfiler::LocalizationEfficiency": 98.26979064941406,
|
903 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 101.01405334472656,
|
904 |
+
"StaticProfiler::StoreExpanded": 4189.0,
|
905 |
+
"StaticProfiler::TotalDMAExpanded": 380531.0,
|
906 |
+
"StaticProfiler::TotalDynamicInstancesCount": 53882.0,
|
907 |
+
"StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 53436.0,
|
908 |
+
"StaticProfiler::TotalLNCComm": 0.0,
|
909 |
+
"StaticProfiler::TotalLNCCommTransfer": 0.0,
|
910 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
|
911 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
|
912 |
+
"TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
|
913 |
+
"TilingProfiler::DmaInstructionsAfterTiling": 0.0,
|
914 |
+
"TilingProfiler::GenericInstructionsAfterTiling": 4.0,
|
915 |
+
"TilingProfiler::MatMultInstructionsAfterTiling": 23616.0,
|
916 |
+
"TilingProfiler::NumPfTransposes": 5.0,
|
917 |
+
"TilingProfiler::NumPfTransposesForIo": 1.0,
|
918 |
+
"TilingProfiler::NumPfTransposesForLocal": 1.0,
|
919 |
+
"TilingProfiler::NumPfTransposesForNonlocal": 3.0,
|
920 |
+
"TilingProfiler::PfTransposeInstructions": 19393.0,
|
921 |
+
"TilingProfiler::PfTransposeInstructionsForIo": 19008.0,
|
922 |
+
"TilingProfiler::PfTransposeInstructionsForLocal": 1.0,
|
923 |
+
"TilingProfiler::PfTransposeInstructionsForNonlocal": 384.0,
|
924 |
+
"TilingProfiler::ReduceInstructionsAfterTiling": 4.0,
|
925 |
+
"TilingProfiler::SimdInstructionsAfterTiling": 158.0,
|
926 |
+
"TilingProfiler::TotalInstructionsAfterTiling": 0.0,
|
927 |
+
"TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
|
928 |
+
"TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
|
929 |
+
"TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
|
930 |
+
"TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
|
931 |
+
"TransformConvOp::conv2d_column_packing": 0.0,
|
932 |
+
"TransformConvOp::conv2d_column_packing_1": 0.0,
|
933 |
+
"TransformConvOp::conv2d_column_packing_io10": 0.0,
|
934 |
+
"TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
|
935 |
+
}
|
936 |
+
},
|
937 |
+
"sg01": {
|
938 |
+
"compiletime": {
|
939 |
+
"CanonicalizeConv": 1.2000000424450263e-05,
|
940 |
+
"CanonicalizeForTensorizer": 1.2999999853491317e-05,
|
941 |
+
"Canonicalizer": 0.0002500000118743628,
|
942 |
+
"HoistCompute": 3.000000106112566e-06,
|
943 |
+
"IdentifyCrossPassTensors": 2.300000051036477e-05,
|
944 |
+
"MemcastMotion": 1.1000000085914508e-05,
|
945 |
+
"PenguinizeFunctions": 1.4000000192027073e-05,
|
946 |
+
"PruneFunctions": 3.099999958067201e-05,
|
947 |
+
"RemoveOptimizationBarriers": 2.2000000171829015e-05,
|
948 |
+
"ScatterMotion": 2.9999999242136255e-05,
|
949 |
+
"TensorizerLegalizationPass": 1.700000029813964e-05,
|
950 |
+
"VerifySupportedOps": 9.000000318337698e-06,
|
951 |
+
"algsimp": 6.299999949987978e-05,
|
952 |
+
"batchnorm_expander": 1.2999999853491317e-05,
|
953 |
+
"boundary-marker-removal": 4.999999873689376e-06,
|
954 |
+
"call-inliner": 9.000000318337698e-06,
|
955 |
+
"canonicalize-boundary-marker": 6.000000212225132e-06,
|
956 |
+
"collective-stream-id-checker": 3.000000106112566e-06,
|
957 |
+
"comparison-expander": 4.999999873689376e-06,
|
958 |
+
"computation-deduplicator": 1.8000000636675395e-05,
|
959 |
+
"conditional-to-select": 4.999999873689376e-06,
|
960 |
+
"config-lowering": 2.5999999706982635e-05,
|
961 |
+
"constant_folding": 7.999999979801942e-06,
|
962 |
+
"cse": 1.2000000424450263e-05,
|
963 |
+
"dce": 9.999999974752427e-07,
|
964 |
+
"dynamic-slice-transpose": 3.999999989900971e-06,
|
965 |
+
"eliminate-redundant-compare": 3.999999989900971e-06,
|
966 |
+
"emit-offloaded-dropout": 1.2999999853491317e-05,
|
967 |
+
"flatten-call-graph": 7.999999979801942e-06,
|
968 |
+
"fuse-send-recv": 2.099999983329326e-05,
|
969 |
+
"hilo::LegalizeAlias": 4.999999873689376e-06,
|
970 |
+
"hilo::NeuronInstCombine": 4.5000000682193786e-05,
|
971 |
+
"hilo::NeuronOpFusion": 1.700000029813964e-05,
|
972 |
+
"hilo::ReplaceTokenTypeWithU8Pass": 2.099999983329326e-05,
|
973 |
+
"hilo::ScheduleFusion": 9.999999974752427e-07,
|
974 |
+
"hilo::SixtyFourHack": 1.2999999853491317e-05,
|
975 |
+
"hilo::VerifyAliasing": 1.9999999949504854e-06,
|
976 |
+
"hlo-mac-count": 2.9999999242136255e-05,
|
977 |
+
"hlo-verifier": 0.00018000000272877514,
|
978 |
+
"legalize-ccops": 9.999999974752427e-07,
|
979 |
+
"legalize-compare": 3.999999989900971e-06,
|
980 |
+
"lower-argminmax-custom-call": 3.999999989900971e-06,
|
981 |
+
"map-inline": 1.2000000424450263e-05,
|
982 |
+
"metadata-naming": 1.8000000636675395e-05,
|
983 |
+
"mlir::detail::OpToOpPassAdaptor": 9.999999747378752e-05,
|
984 |
+
"mlir::hlo::MhloToPyPenguin": 0.0009420000133104622,
|
985 |
+
"mlir::mhlo::LowerComplexExtraPass": 7.999999797903001e-05,
|
986 |
+
"mlir::mhlo::LowerComplexPass": 0.00015799999528098851,
|
987 |
+
"native-to-custom-softmax": 6.000000212225132e-06,
|
988 |
+
"native-to-custom-softmax-dx": 1.2999999853491317e-05,
|
989 |
+
"operand_upcaster": 1.4999999621068127e-05,
|
990 |
+
"post-par-pipe-begin": 1.9999999949504854e-06,
|
991 |
+
"post-par-pipe-end": 0.0,
|
992 |
+
"post-partition-simplification": 0.0005130000063218176,
|
993 |
+
"replace-minimum-constant": 6.000000212225132e-06,
|
994 |
+
"reshape-mover": 3.000000106112566e-06,
|
995 |
+
"simplify-concat": 4.8999998398358e-05,
|
996 |
+
"simplify-while-loops": 1.9999999949504854e-06,
|
997 |
+
"transform-variadic-reduce": 9.000000318337698e-06,
|
998 |
+
"tuple-simplifier": 4.999999873689376e-06,
|
999 |
+
"unpack-nested-aws-ntwsr": 3.999999989900971e-06,
|
1000 |
+
"unroll-while-loop": 0.0
|
1001 |
+
},
|
1002 |
+
"hilo": {
|
1003 |
+
"ArithmeticIntensity": 457.20416259765625,
|
1004 |
+
"HloMacCount": 50465865728.0,
|
1005 |
+
"Traffic": 220758560.0
|
1006 |
+
}
|
1007 |
+
},
|
1008 |
+
"sg02": {
|
1009 |
+
"compiletime": {
|
1010 |
+
"CanonicalizeConv": 0.0002589999930933118,
|
1011 |
+
"CanonicalizeForTensorizer": 1.2000000424450263e-05,
|
1012 |
+
"Canonicalizer": 0.0003060000017285347,
|
1013 |
+
"HoistCompute": 1.9999999949504854e-06,
|
1014 |
+
"IdentifyCrossPassTensors": 2.4000000848900527e-05,
|
1015 |
+
"MemcastMotion": 1.2999999853491317e-05,
|
1016 |
+
"PenguinizeFunctions": 9.000000318337698e-06,
|
1017 |
+
"PruneFunctions": 7.999999979801942e-06,
|
1018 |
+
"RemoveOptimizationBarriers": 2.099999983329326e-05,
|
1019 |
+
"ScatterMotion": 1.9999999949504854e-06,
|
1020 |
+
"TensorizerLegalizationPass": 4.999999873689376e-06,
|
1021 |
+
"VerifySupportedOps": 1.1000000085914508e-05,
|
1022 |
+
"algsimp": 5.900000178371556e-05,
|
1023 |
+
"batchnorm_expander": 1.2000000424450263e-05,
|
1024 |
+
"boundary-marker-removal": 3.999999989900971e-06,
|
1025 |
+
"call-inliner": 1.1000000085914508e-05,
|
1026 |
+
"canonicalize-boundary-marker": 4.999999873689376e-06,
|
1027 |
+
"collective-stream-id-checker": 3.000000106112566e-06,
|
1028 |
+
"comparison-expander": 4.999999873689376e-06,
|
1029 |
+
"computation-deduplicator": 4.099999932805076e-05,
|
1030 |
+
"conditional-to-select": 6.000000212225132e-06,
|
1031 |
+
"config-lowering": 2.5999999706982635e-05,
|
1032 |
+
"constant_folding": 9.000000318337698e-06,
|
1033 |
+
"cse": 1.1000000085914508e-05,
|
1034 |
+
"dce": 9.999999974752427e-07,
|
1035 |
+
"dynamic-slice-transpose": 3.999999989900971e-06,
|
1036 |
+
"eliminate-redundant-compare": 3.999999989900971e-06,
|
1037 |
+
"emit-offloaded-dropout": 1.2999999853491317e-05,
|
1038 |
+
"flatten-call-graph": 1.1000000085914508e-05,
|
1039 |
+
"fuse-send-recv": 1.8000000636675395e-05,
|
1040 |
+
"hilo::LegalizeAlias": 1.9999999949504854e-06,
|
1041 |
+
"hilo::NeuronInstCombine": 5.8000001445179805e-05,
|
1042 |
+
"hilo::NeuronOpFusion": 1.9999999949504854e-06,
|
1043 |
+
"hilo::ReplaceTokenTypeWithU8Pass": 9.000000318337698e-06,
|
1044 |
+
"hilo::ScheduleFusion": 3.999999989900971e-06,
|
1045 |
+
"hilo::SixtyFourHack": 3.899999865097925e-05,
|
1046 |
+
"hilo::VerifyAliasing": 9.999999974752427e-07,
|
1047 |
+
"hlo-mac-count": 0.00016799999866634607,
|
1048 |
+
"hlo-verifier": 0.00015900000289548188,
|
1049 |
+
"legalize-ccops": 9.999999974752427e-07,
|
1050 |
+
"legalize-compare": 3.000000106112566e-06,
|
1051 |
+
"lower-argminmax-custom-call": 3.000000106112566e-06,
|
1052 |
+
"map-inline": 1.2000000424450263e-05,
|
1053 |
+
"metadata-naming": 1.700000029813964e-05,
|
1054 |
+
"mlir::detail::OpToOpPassAdaptor": 2.2000000171829015e-05,
|
1055 |
+
"mlir::hlo::MhloToPyPenguin": 0.0008549999911338091,
|
1056 |
+
"mlir::mhlo::LowerComplexExtraPass": 8.800000068731606e-05,
|
1057 |
+
"mlir::mhlo::LowerComplexPass": 0.00015999999595806003,
|
1058 |
+
"native-to-custom-softmax": 4.999999873689376e-06,
|
1059 |
+
"native-to-custom-softmax-dx": 1.9999999494757503e-05,
|
1060 |
+
"operand_upcaster": 1.2000000424450263e-05,
|
1061 |
+
"post-par-pipe-begin": 1.9999999949504854e-06,
|
1062 |
+
"post-par-pipe-end": 0.0,
|
1063 |
+
"post-partition-simplification": 0.0005009999731555581,
|
1064 |
+
"replace-minimum-constant": 9.000000318337698e-06,
|
1065 |
+
"reshape-mover": 3.000000106112566e-06,
|
1066 |
+
"simplify-concat": 4.3000000005122274e-05,
|
1067 |
+
"simplify-while-loops": 1.9999999949504854e-06,
|
1068 |
+
"transform-variadic-reduce": 4.8000001697801054e-05,
|
1069 |
+
"tuple-simplifier": 4.999999873689376e-06,
|
1070 |
+
"unpack-nested-aws-ntwsr": 3.999999989900971e-06,
|
1071 |
+
"unroll-while-loop": 0.0
|
1072 |
+
},
|
1073 |
+
"hilo": {
|
1074 |
+
"ArithmeticIntensity": 99.1578140258789,
|
1075 |
+
"HloMacCount": 38965870592.0,
|
1076 |
+
"Traffic": 785936448.0
|
1077 |
+
}
|
1078 |
+
}
|
1079 |
+
}
|
context_encoding_model/_tp0_bk2/graph.neff
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d7e216fd8f0f2acfef59524e7cdb4ead506b2c17c584ce45dd222cd4dc4e3f4f
|
3 |
+
size 1987584
|
context_encoding_model/_tp0_bk2/log-neuron-cc.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
context_encoding_model/_tp0_bk2/metaneff.pb
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:99c279a1a32451ce56757879c7a74b6ff23378ae19871f2aee2c2746ceda57f3
|
3 |
+
size 1373735
|
context_encoding_model/_tp0_bk2/model.MODULE_00594b8bc68e927f3dbe+1ad60ced.hlo_module.pb
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:970c5138d61d773fc00bacb9090fbc05a05573925b8d91068006c211596d3f78
|
3 |
+
size 1450821
|
context_encoding_model/_tp0_bk2/model.MODULE_00594b8bc68e927f3dbe+1ad60ced.neff
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d7e216fd8f0f2acfef59524e7cdb4ead506b2c17c584ce45dd222cd4dc4e3f4f
|
3 |
+
size 1987584
|
context_encoding_model/_tp0_bk2/neuron_config.json
ADDED
@@ -0,0 +1,220 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_attn_implementation_autoset": false,
|
3 |
+
"_name_or_path": "Qwen/Qwen3-8B",
|
4 |
+
"add_cross_attention": false,
|
5 |
+
"architectures": [
|
6 |
+
"Qwen3ForCausalLM"
|
7 |
+
],
|
8 |
+
"attention_bias": false,
|
9 |
+
"attention_dropout": 0.0,
|
10 |
+
"attribute_map": {},
|
11 |
+
"bad_words_ids": null,
|
12 |
+
"begin_suppress_tokens": null,
|
13 |
+
"bos_token_id": 151643,
|
14 |
+
"chunk_size_feed_forward": 0,
|
15 |
+
"cross_attention_hidden_size": null,
|
16 |
+
"decoder_start_token_id": null,
|
17 |
+
"diversity_penalty": 0.0,
|
18 |
+
"do_sample": false,
|
19 |
+
"early_stopping": false,
|
20 |
+
"encoder_no_repeat_ngram_size": 0,
|
21 |
+
"eos_token_id": 151645,
|
22 |
+
"exponential_decay_length_penalty": null,
|
23 |
+
"finetuning_task": null,
|
24 |
+
"forced_bos_token_id": null,
|
25 |
+
"forced_eos_token_id": null,
|
26 |
+
"fused_spec_config": null,
|
27 |
+
"head_dim": 128,
|
28 |
+
"hidden_act": "silu",
|
29 |
+
"hidden_size": 4096,
|
30 |
+
"id2label": {
|
31 |
+
"0": "LABEL_0",
|
32 |
+
"1": "LABEL_1"
|
33 |
+
},
|
34 |
+
"initializer_range": 0.02,
|
35 |
+
"intermediate_size": 12288,
|
36 |
+
"is_decoder": false,
|
37 |
+
"is_encoder_decoder": false,
|
38 |
+
"label2id": {
|
39 |
+
"LABEL_0": 0,
|
40 |
+
"LABEL_1": 1
|
41 |
+
},
|
42 |
+
"length_penalty": 1.0,
|
43 |
+
"max_length": 20,
|
44 |
+
"max_position_embeddings": 40960,
|
45 |
+
"max_window_layers": 36,
|
46 |
+
"metadata": null,
|
47 |
+
"min_length": 0,
|
48 |
+
"model_type": "qwen3",
|
49 |
+
"neuron_config": {
|
50 |
+
"activation_quantization_type": null,
|
51 |
+
"allow_input_truncation": false,
|
52 |
+
"apply_seq_ids_mask": false,
|
53 |
+
"async_mode": false,
|
54 |
+
"attention_dp_degree": 1,
|
55 |
+
"attention_dtype": null,
|
56 |
+
"attn_block_cte_nki_kernel_enabled": false,
|
57 |
+
"attn_block_tkg_nki_kernel_cache_update": false,
|
58 |
+
"attn_block_tkg_nki_kernel_enabled": false,
|
59 |
+
"attn_cls": {
|
60 |
+
"__module__": "neuronx_distributed_inference.models.qwen3.modeling_qwen3",
|
61 |
+
"__name__": "NeuronQwen3Attention"
|
62 |
+
},
|
63 |
+
"attn_kernel_enabled": null,
|
64 |
+
"attn_tkg_builtin_kernel_enabled": false,
|
65 |
+
"attn_tkg_nki_kernel_enabled": false,
|
66 |
+
"batch_size": 1,
|
67 |
+
"bucket_n_active_tokens": true,
|
68 |
+
"buckets": [
|
69 |
+
512
|
70 |
+
],
|
71 |
+
"cast_type": "config",
|
72 |
+
"cc_pipeline_tiling_factor": 2,
|
73 |
+
"chunked_prefill_config": null,
|
74 |
+
"context_encoding_buckets": [
|
75 |
+
512
|
76 |
+
],
|
77 |
+
"cp_degree": 1,
|
78 |
+
"ctx_batch_size": 1,
|
79 |
+
"disable_kv_cache_tiling": false,
|
80 |
+
"draft_model_modules_to_not_convert": null,
|
81 |
+
"enable_bucketing": true,
|
82 |
+
"enable_eagle_draft_input_norm": false,
|
83 |
+
"enable_eagle_speculation": false,
|
84 |
+
"enable_fused_speculation": false,
|
85 |
+
"enable_long_context_mode": false,
|
86 |
+
"enable_output_completion_notifications": false,
|
87 |
+
"enable_spill_reload_dge": false,
|
88 |
+
"enable_token_tree": false,
|
89 |
+
"ep_degree": 1,
|
90 |
+
"expert_mlp_nki_kernel_enabled": null,
|
91 |
+
"flash_decoding_enabled": false,
|
92 |
+
"fused_qkv": false,
|
93 |
+
"fused_rmsnorm_skip_gamma": false,
|
94 |
+
"is_block_kv_layout": null,
|
95 |
+
"is_chunked_prefill": false,
|
96 |
+
"is_continuous_batching": true,
|
97 |
+
"is_eagle_draft": false,
|
98 |
+
"is_medusa": false,
|
99 |
+
"is_prefill_stage": true,
|
100 |
+
"is_prefix_caching": false,
|
101 |
+
"k_cache_transposed": false,
|
102 |
+
"kv_cache_batch_size": 1,
|
103 |
+
"kv_cache_padding_size": 0,
|
104 |
+
"kv_cache_quant": false,
|
105 |
+
"kv_cache_tiling": false,
|
106 |
+
"layer_boundary_markers": false,
|
107 |
+
"lm_head_pad": false,
|
108 |
+
"lm_head_pad_alignment_size": 1,
|
109 |
+
"local_ranks_size": 2,
|
110 |
+
"logical_nc_config": 1,
|
111 |
+
"lora_config": null,
|
112 |
+
"max_batch_size": 1,
|
113 |
+
"max_context_length": 1024,
|
114 |
+
"max_length": 1024,
|
115 |
+
"max_new_tokens": null,
|
116 |
+
"medusa_speculation_length": 0,
|
117 |
+
"medusa_tree": null,
|
118 |
+
"mlp_kernel_enabled": false,
|
119 |
+
"mlp_kernel_fuse_residual_add": false,
|
120 |
+
"modules_to_not_convert": null,
|
121 |
+
"moe_fused_nki_kernel_enabled": null,
|
122 |
+
"n_active_tokens": 1024,
|
123 |
+
"n_positions": 1024,
|
124 |
+
"num_medusa_heads": 0,
|
125 |
+
"on_cpu": false,
|
126 |
+
"on_device_sampling_config": {
|
127 |
+
"deterministic": false,
|
128 |
+
"do_sample": false,
|
129 |
+
"dynamic": true,
|
130 |
+
"global_topk": 256,
|
131 |
+
"on_device_sampling_config": true,
|
132 |
+
"temperature": 1.0,
|
133 |
+
"top_k": 1,
|
134 |
+
"top_k_kernel_enabled": false,
|
135 |
+
"top_p": 1.0
|
136 |
+
},
|
137 |
+
"output_logits": false,
|
138 |
+
"overrides_torch_dtype": true,
|
139 |
+
"pa_block_size": 1024,
|
140 |
+
"pa_num_blocks": 1,
|
141 |
+
"padding_side": "right",
|
142 |
+
"pp_degree": 1,
|
143 |
+
"prefix_buckets": null,
|
144 |
+
"qk_layernorm": false,
|
145 |
+
"qkv_kernel_enabled": false,
|
146 |
+
"qkv_kernel_fuse_residual_add": false,
|
147 |
+
"qkv_kernel_nbsd_layout": false,
|
148 |
+
"quantization_dtype": "int8",
|
149 |
+
"quantization_type": "per_tensor_symmetric",
|
150 |
+
"quantize_clamp_bound": Infinity,
|
151 |
+
"quantized": false,
|
152 |
+
"quantized_checkpoints_path": null,
|
153 |
+
"quantized_mlp_kernel_enabled": false,
|
154 |
+
"rmsnorm_quantize_kernel_enabled": false,
|
155 |
+
"router_topk_nki_kernel_enabled": null,
|
156 |
+
"rpl_reduce_dtype": null,
|
157 |
+
"save_sharded_checkpoint": true,
|
158 |
+
"scratchpad_page_size": null,
|
159 |
+
"seq_len": 1024,
|
160 |
+
"seq_len_threshold_for_cc_tiling": 16384,
|
161 |
+
"sequence_parallel_enabled": false,
|
162 |
+
"shared_mlp_nki_kernel_enabled": null,
|
163 |
+
"skip_sharding": false,
|
164 |
+
"skip_warmup": false,
|
165 |
+
"spec_batch_size": 1,
|
166 |
+
"speculation_length": 0,
|
167 |
+
"start_rank_id": 0,
|
168 |
+
"target": null,
|
169 |
+
"tile_cc": false,
|
170 |
+
"tkg_batch_size": 1,
|
171 |
+
"token_generation_buckets": null,
|
172 |
+
"token_tree_config": null,
|
173 |
+
"torch_dtype": "bfloat16",
|
174 |
+
"tp_degree": 2,
|
175 |
+
"vocab_parallel": false,
|
176 |
+
"weight_gather_seq_len_threshold": 32768,
|
177 |
+
"weights_to_skip_layout_optimization": [],
|
178 |
+
"world_size": 2
|
179 |
+
},
|
180 |
+
"no_repeat_ngram_size": 0,
|
181 |
+
"num_attention_heads": 32,
|
182 |
+
"num_beam_groups": 1,
|
183 |
+
"num_beams": 1,
|
184 |
+
"num_cores_per_group": 1,
|
185 |
+
"num_hidden_layers": 36,
|
186 |
+
"num_key_value_heads": 8,
|
187 |
+
"num_return_sequences": 1,
|
188 |
+
"output_attentions": false,
|
189 |
+
"output_hidden_states": false,
|
190 |
+
"output_scores": false,
|
191 |
+
"pad_token_id": 0,
|
192 |
+
"prefix": null,
|
193 |
+
"problem_type": null,
|
194 |
+
"pruned_heads": {},
|
195 |
+
"remove_invalid_values": false,
|
196 |
+
"repetition_penalty": 1.0,
|
197 |
+
"return_dict": true,
|
198 |
+
"return_dict_in_generate": false,
|
199 |
+
"rms_norm_eps": 1e-06,
|
200 |
+
"rope_scaling": null,
|
201 |
+
"rope_theta": 1000000,
|
202 |
+
"sep_token_id": null,
|
203 |
+
"sliding_window": null,
|
204 |
+
"suppress_tokens": null,
|
205 |
+
"task_specific_params": null,
|
206 |
+
"temperature": 1.0,
|
207 |
+
"tf_legacy_loss": false,
|
208 |
+
"tie_encoder_decoder": false,
|
209 |
+
"tie_word_embeddings": false,
|
210 |
+
"tokenizer_class": null,
|
211 |
+
"top_k": 50,
|
212 |
+
"top_p": 1.0,
|
213 |
+
"torchscript": false,
|
214 |
+
"transformers_version": "4.51.0",
|
215 |
+
"typical_p": 1.0,
|
216 |
+
"use_bfloat16": false,
|
217 |
+
"use_cache": true,
|
218 |
+
"use_sliding_window": false,
|
219 |
+
"vocab_size": 151936
|
220 |
+
}
|
context_encoding_model/_tp0_bk3/command.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
neuronx-cc compile --framework=XLA model.MODULE_b3ddbc97e5f0d1d64c82+155de413.hlo_module.pb --output model.MODULE_b3ddbc97e5f0d1d64c82+155de413.neff --target=trn1 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ' --lnc=1 -O1 '--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true' --logfile=log-neuron-cc.txt --verbose=35
|
context_encoding_model/_tp0_bk3/compile_flags.MODULE_b3ddbc97e5f0d1d64c82+155de413.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "--lnc=1", "-O1", "--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true", "--logfile=/home/ubuntu/qwen3/context_encoding_model/_tp0_bk3/log-neuron-cc.txt"]
|
context_encoding_model/_tp0_bk3/global_metric_store.json
ADDED
@@ -0,0 +1,1079 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"Average": {
|
3 |
+
"tensorizer": {
|
4 |
+
"StaticProfiler::AverageFractalPeUtilization": 99.7004623413086,
|
5 |
+
"StaticProfiler::AveragePartitionUtilization": 97.94140625,
|
6 |
+
"StaticProfiler::AveragePeUtilization": 98.78884887695313,
|
7 |
+
"StaticProfiler::LocalizationEfficiency": 91.59693145751953,
|
8 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 95.863037109375,
|
9 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
|
10 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 0.0
|
11 |
+
}
|
12 |
+
},
|
13 |
+
"Count": {
|
14 |
+
"tensorizer": {
|
15 |
+
"StaticProfiler::AverageFractalPeUtilization": 1.0,
|
16 |
+
"StaticProfiler::AveragePartitionUtilization": 1.0,
|
17 |
+
"StaticProfiler::AveragePeUtilization": 1.0,
|
18 |
+
"StaticProfiler::LocalizationEfficiency": 1.0,
|
19 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1.0,
|
20 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 1.0,
|
21 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 1.0
|
22 |
+
}
|
23 |
+
},
|
24 |
+
"Sum": {
|
25 |
+
"compiletime": {
|
26 |
+
"AGOrderingAnalysisPass": 0.01837611198425293,
|
27 |
+
"AffinePredicateResolution": 0.0011184215545654297,
|
28 |
+
"AliasDependencyElimination": 0.00015664100646972656,
|
29 |
+
"AliasDependencyInduction": 0.005170583724975586,
|
30 |
+
"AliasDependencyReset": 0.027508020401000977,
|
31 |
+
"BFComputeCutting": 0.0036101341247558594,
|
32 |
+
"BirCodeGenLoop": 0.4774467945098877,
|
33 |
+
"CCOpFusion": 0.033265113830566406,
|
34 |
+
"CanonicalizeConv": 2.300000051036477e-05,
|
35 |
+
"CanonicalizeDAGForPGTiling": 0.004282712936401367,
|
36 |
+
"CanonicalizeForTensorizer": 4.600000102072954e-05,
|
37 |
+
"CanonicalizeIR": 0.0024569034576416016,
|
38 |
+
"Canonicalizer": 0.0009039999567903578,
|
39 |
+
"CoalesceCCOp": 0.014229059219360352,
|
40 |
+
"CommuteConcat": 0.0017316341400146484,
|
41 |
+
"DMALocalityOpt": 0.005630016326904297,
|
42 |
+
"DMAProfiler": 0.012981653213500977,
|
43 |
+
"DMATilingProfiler": 0.0037560462951660156,
|
44 |
+
"DataLocalityOpt": 0.07645320892333984,
|
45 |
+
"DataStreaming": 0.03730320930480957,
|
46 |
+
"DeConcat": 0.0018520355224609375,
|
47 |
+
"DeadCodeElimination": 0.0020148754119873047,
|
48 |
+
"DeadStoreElimination": 0.006912708282470703,
|
49 |
+
"DelinearIndices": 0.004647254943847656,
|
50 |
+
"Delinearization": 0.003908872604370117,
|
51 |
+
"DoNothing": 0.0001888275146484375,
|
52 |
+
"DramToDramTranspose": 0.02015542984008789,
|
53 |
+
"DumpGraphAndMetadata": 0.08691883087158203,
|
54 |
+
"EliminateDivs": 0.0025060176849365234,
|
55 |
+
"ExpandBatchNorm": 0.0027189254760742188,
|
56 |
+
"ExpandISAMacro": 0.011646032333374023,
|
57 |
+
"FactorizeBlkDims": 0.010123252868652344,
|
58 |
+
"FactorizeThreadAxesInFreeDims": 0.0023202896118164063,
|
59 |
+
"FlattenMacroLoop": 0.00232696533203125,
|
60 |
+
"GenericAccessSimplifier": 0.0008094310760498047,
|
61 |
+
"HoistCompute": 5.999999757477781e-06,
|
62 |
+
"IdentifyCrossPassTensors": 5.2999999752501026e-05,
|
63 |
+
"InferInitValue": 0.02833867073059082,
|
64 |
+
"InferIntrinsicOnCC": 0.008923768997192383,
|
65 |
+
"InferNeuronTensor": 0.025766372680664063,
|
66 |
+
"InferNonlocalTensors": 0.014599800109863281,
|
67 |
+
"InferPSumTensor": 0.28418898582458496,
|
68 |
+
"InlineNativeKernels": 0.00860905647277832,
|
69 |
+
"InsertIOTransposes": 0.01989889144897461,
|
70 |
+
"InsertLocalTransposes": 0.004229307174682617,
|
71 |
+
"InsertOffloadedTransposes": 0.0029871463775634766,
|
72 |
+
"LICM": 0.0030870437622070313,
|
73 |
+
"LateLegalizeInst": 0.014106035232543945,
|
74 |
+
"LateLegalizePostSplit": 0.014872312545776367,
|
75 |
+
"LateLowerReshapeOp": 0.0010464191436767578,
|
76 |
+
"LateLowerTensorOp": 0.002707242965698242,
|
77 |
+
"LateNeuronInstComb": 0.010563373565673828,
|
78 |
+
"LayoutPreprocessing": 0.026853561401367188,
|
79 |
+
"LayoutPreprocessingAndAnalysis": 0.0556035041809082,
|
80 |
+
"LayoutRequirementAnalysis": 0.004946470260620117,
|
81 |
+
"LegalizeCCOpLayout": 0.0025353431701660156,
|
82 |
+
"LegalizeOpLevelAlias": 0.0018966197967529297,
|
83 |
+
"LegalizePartitionReduce": 0.0017490386962890625,
|
84 |
+
"LegalizeSundaAccess": 0.07800722122192383,
|
85 |
+
"LegalizeSundaMacro": 0.012125253677368164,
|
86 |
+
"LegalizeType": 0.012685060501098633,
|
87 |
+
"LocalLayoutOpt": 0.013860225677490234,
|
88 |
+
"LoopFusion": 0.005201578140258789,
|
89 |
+
"LoopSplitting": 0.0003204345703125,
|
90 |
+
"LowerBroadcast": 0.002086162567138672,
|
91 |
+
"LowerCCOpBlockAxis": 0.0040171146392822266,
|
92 |
+
"LowerComplexBroadcast": 0.002280712127685547,
|
93 |
+
"LowerIntrinsics": 0.3143951892852783,
|
94 |
+
"LowerTensorOp": 0.01141357421875,
|
95 |
+
"LowerTranspose": 0.012923002243041992,
|
96 |
+
"MacroGeneration": 0.034410953521728516,
|
97 |
+
"MaskPropagation": 0.0028192996978759766,
|
98 |
+
"MemcastMotion": 1.8000000636675395e-05,
|
99 |
+
"MemcpyElimination": 0.02788853645324707,
|
100 |
+
"MutateDataType": 0.0012311935424804688,
|
101 |
+
"NeuronAliasDependencyInduction": 0.0001773834228515625,
|
102 |
+
"NeuronAliasDependencyReset": 0.024976015090942383,
|
103 |
+
"NeuronInstComb": 0.005156517028808594,
|
104 |
+
"NeuronLICM": 0.036696434020996094,
|
105 |
+
"NeuronLoopFusion": 0.008457422256469727,
|
106 |
+
"NeuronLoopInterchange": 0.001413106918334961,
|
107 |
+
"NeuronSimplifier": 0.007856369018554688,
|
108 |
+
"NeuronSimplifyPredicates": 0.12235808372497559,
|
109 |
+
"NeuronValueNumbering": 0.004765748977661133,
|
110 |
+
"OptimizeAliasedCopyChain": 0.0006341934204101563,
|
111 |
+
"OptimizeNKIKernels": 0.38834357261657715,
|
112 |
+
"PAGLayoutOpt": 0.0889735221862793,
|
113 |
+
"PComputeCutting": 0.005109071731567383,
|
114 |
+
"PGLayoutTilingPipeline": 0.6248171329498291,
|
115 |
+
"PGTiling": 0.1645822525024414,
|
116 |
+
"PadElimination": 0.0003485679626464844,
|
117 |
+
"ParAxesAnnotation": 0.05196070671081543,
|
118 |
+
"PartialLoopFusion": 0.011112451553344727,
|
119 |
+
"PartialSimdFusion": 0.012138128280639648,
|
120 |
+
"PenguinizeFunctions": 4.3000000005122274e-05,
|
121 |
+
"PerfectLoopNest": 0.002288341522216797,
|
122 |
+
"PruneFunctions": 4.099999932805076e-05,
|
123 |
+
"RecognizeOpIdiom": 0.0041277408599853516,
|
124 |
+
"Recompute": 0.00026416778564453125,
|
125 |
+
"RelaxPredicates": 0.01356959342956543,
|
126 |
+
"Rematerialization": 0.0024864673614501953,
|
127 |
+
"RemoveOptimizationBarriers": 4.900000203633681e-05,
|
128 |
+
"ReshapeWeights": 0.0007522106170654297,
|
129 |
+
"ResolveAccessConflict": 0.0048482418060302734,
|
130 |
+
"ResolveComplicatePredicates": 0.0015094280242919922,
|
131 |
+
"RewriteReplicationMatmul": 0.0015668869018554688,
|
132 |
+
"RewriteWeights": 0.0027174949645996094,
|
133 |
+
"SFKVectorizer": 0.2781519889831543,
|
134 |
+
"ScatterMotion": 4.70000013592653e-05,
|
135 |
+
"SimpleAllReduceTiling": 0.009549379348754883,
|
136 |
+
"Simplifier": 0.003630399703979492,
|
137 |
+
"SimplifyMacroPredicates": 0.011396646499633789,
|
138 |
+
"SimplifyNeuronTensor": 1.0561063289642334,
|
139 |
+
"SimplifySlice": 0.0023348331451416016,
|
140 |
+
"SimplifyTensor": 0.005601167678833008,
|
141 |
+
"SpillPSum": 0.013618230819702148,
|
142 |
+
"SplitAPUnionSets": 0.11336159706115723,
|
143 |
+
"SplitAccGrp": 0.001394510269165039,
|
144 |
+
"StaticProfiler": 0.014252662658691406,
|
145 |
+
"StaticTransposeLocalTensor": 0.003930330276489258,
|
146 |
+
"SundaISel": 0.04436635971069336,
|
147 |
+
"TCTransform": 0.0008757114410400391,
|
148 |
+
"TensorInitialization": 0.01558232307434082,
|
149 |
+
"TensorOpSimplifier": 0.004608869552612305,
|
150 |
+
"TensorOpTransform": 0.01923346519470215,
|
151 |
+
"TensorizerLegalizationPass": 5.2999999752501026e-05,
|
152 |
+
"TileCCOps": 0.005507707595825195,
|
153 |
+
"TilingProfiler": 0.007405757904052734,
|
154 |
+
"TransformConvOp": 0.0030219554901123047,
|
155 |
+
"TritiumFusion": 0.05425119400024414,
|
156 |
+
"ValueNumbering": 0.0020017623901367188,
|
157 |
+
"VectorizeDMA": 0.002228975296020508,
|
158 |
+
"VectorizeMatMult": 0.006806135177612305,
|
159 |
+
"VerifySupportedOps": 3.5000000934815034e-05,
|
160 |
+
"WeightCoalescing": 0.008660554885864258,
|
161 |
+
"ZeroSizeTensorElimination": 0.00014281272888183594,
|
162 |
+
"algsimp": 0.0027209999971091747,
|
163 |
+
"batchnorm_expander": 4.099999932805076e-05,
|
164 |
+
"boundary-marker-removal": 1.2999998943996616e-05,
|
165 |
+
"call-inliner": 0.0004540000227279961,
|
166 |
+
"canonicalize-boundary-marker": 1.700000029813964e-05,
|
167 |
+
"collective-stream-id-checker": 8.000000525498763e-05,
|
168 |
+
"comparison-expander": 0.0005869999877177179,
|
169 |
+
"computation-deduplicator": 7.500000356230885e-05,
|
170 |
+
"conditional-to-select": 1.700000029813964e-05,
|
171 |
+
"config-lowering": 8.800000068731606e-05,
|
172 |
+
"constant-statistics": 0.0005440000095404685,
|
173 |
+
"constant_folding": 0.00032700004521757364,
|
174 |
+
"cse": 3.7000001611886546e-05,
|
175 |
+
"dce": 9.100000170292333e-05,
|
176 |
+
"dot_decomposer": 0.0013370000524446368,
|
177 |
+
"dynamic-slice-transpose": 1.2000000424450263e-05,
|
178 |
+
"eliminate-redundant-compare": 0.0003020000003743917,
|
179 |
+
"emit-offloaded-dropout": 3.9999998989515007e-05,
|
180 |
+
"flatten-call-graph": 0.0009239999344572425,
|
181 |
+
"fuse-send-recv": 7.79999973019585e-05,
|
182 |
+
"hilo::LegalizeAlias": 1.1999999514955562e-05,
|
183 |
+
"hilo::NeuronInstCombine": 0.00018899999849963933,
|
184 |
+
"hilo::NeuronOpFusion": 4.5000000682193786e-05,
|
185 |
+
"hilo::ReplaceTokenTypeWithU8Pass": 5.7999997807201e-05,
|
186 |
+
"hilo::ScheduleFusion": 0.00016099998902063817,
|
187 |
+
"hilo::SixtyFourHack": 6.70000008540228e-05,
|
188 |
+
"hilo::VerifyAliasing": 4.999999873689376e-06,
|
189 |
+
"hlo-mac-count": 0.0013409999664872885,
|
190 |
+
"hlo-verifier": 0.007716999854892492,
|
191 |
+
"instruction-histogram": 0.0007719999994151294,
|
192 |
+
"io-con-pipe-begin": 4.999999873689376e-06,
|
193 |
+
"io-con-pipe-end": 9.999999974752427e-07,
|
194 |
+
"io-layout-normalization": 0.00139999995008111,
|
195 |
+
"io-statistics": 6.299999949987978e-05,
|
196 |
+
"legalize-ccops": 3.999999989900971e-06,
|
197 |
+
"legalize-compare": 1.1000000085914508e-05,
|
198 |
+
"lower-argminmax-custom-call": 1.1000000085914508e-05,
|
199 |
+
"map-inline": 0.0008809999562799931,
|
200 |
+
"metadata-naming": 6.70000008540228e-05,
|
201 |
+
"mlir::detail::OpToOpPassAdaptor": 0.00020599999697878957,
|
202 |
+
"mlir::hlo::MhloToPyPenguin": 0.00291300006210804,
|
203 |
+
"mlir::mhlo::LowerComplexExtraPass": 0.00027200000477023423,
|
204 |
+
"mlir::mhlo::LowerComplexPass": 0.0003980000037699938,
|
205 |
+
"native-to-custom-softmax": 0.0007730000070296228,
|
206 |
+
"native-to-custom-softmax-dx": 0.0006189999985508621,
|
207 |
+
"operand_upcaster": 6.299999949987978e-05,
|
208 |
+
"opt-barrier-removal": 0.0005789999850094318,
|
209 |
+
"post-par-pipe-begin": 7.999999979801942e-06,
|
210 |
+
"post-par-pipe-end": 0.0,
|
211 |
+
"post-partition-simplification": 0.0017419999931007624,
|
212 |
+
"pre-par-pipe-begin": 1.9999999949504854e-06,
|
213 |
+
"pre-par-pipe-end": 0.0,
|
214 |
+
"pre-partition-simplification": 0.1384889930486679,
|
215 |
+
"replace-minimum-constant": 0.0004579999949783087,
|
216 |
+
"reshape-mover": 0.00011000000085914508,
|
217 |
+
"simplify-concat": 0.00014099999680183828,
|
218 |
+
"simplify-while-loops": 9.40000027185306e-05,
|
219 |
+
"transform-variadic-reduce": 8.100000559352338e-05,
|
220 |
+
"tuple-simplifier": 0.00030600003083236516,
|
221 |
+
"unpack-nested-aws-ntwsr": 0.000438000017311424,
|
222 |
+
"unroll-while-loop": 1.8999999156221747e-05,
|
223 |
+
"zero_sized_hlo_elimination": 0.0008750000270083547
|
224 |
+
},
|
225 |
+
"hilo": {
|
226 |
+
"ConstantSize": 2368805.0,
|
227 |
+
"HloInputCount": 475.0,
|
228 |
+
"HloMacCount": 206469595136.0,
|
229 |
+
"HloOutputCount": 73.0,
|
230 |
+
"IfmapSize": 8266549248.0,
|
231 |
+
"OfmapSize": 75497472.0,
|
232 |
+
"OutputsReadFromCount": 0.0,
|
233 |
+
"PassthroughTensorsCount": 0.0,
|
234 |
+
"RedundantOutputCount": 0.0,
|
235 |
+
"Traffic": 1751252352.0
|
236 |
+
},
|
237 |
+
"tensorizer": {
|
238 |
+
"DMATilingProfiler::TotalInstructionsAfterTiling": 49538.0,
|
239 |
+
"StaticProfiler::AifUb": 304.240234375,
|
240 |
+
"StaticProfiler::ArithmeticIntensityTensorizer": 278.67474365234375,
|
241 |
+
"StaticProfiler::AverageDmaLength": 1974.1033935546875,
|
242 |
+
"StaticProfiler::DDRTransferBytes": 862646080.0,
|
243 |
+
"StaticProfiler::InternalTransferBytes": 669456896.0,
|
244 |
+
"StaticProfiler::LoadExpanded": 390679.0,
|
245 |
+
"StaticProfiler::StoreExpanded": 7261.0,
|
246 |
+
"StaticProfiler::TotalDMAExpanded": 397940.0,
|
247 |
+
"StaticProfiler::TotalDynamicInstancesCount": 59578.0,
|
248 |
+
"StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 59132.0,
|
249 |
+
"StaticProfiler::TotalLNCComm": 0.0,
|
250 |
+
"StaticProfiler::TotalLNCCommTransfer": 0.0,
|
251 |
+
"TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
|
252 |
+
"TilingProfiler::DmaInstructionsAfterTiling": 0.0,
|
253 |
+
"TilingProfiler::GenericInstructionsAfterTiling": 4.0,
|
254 |
+
"TilingProfiler::MatMultInstructionsAfterTiling": 28224.0,
|
255 |
+
"TilingProfiler::NumPfTransposes": 5.0,
|
256 |
+
"TilingProfiler::NumPfTransposesForIo": 1.0,
|
257 |
+
"TilingProfiler::NumPfTransposesForLocal": 1.0,
|
258 |
+
"TilingProfiler::NumPfTransposesForNonlocal": 3.0,
|
259 |
+
"TilingProfiler::PfTransposeInstructions": 19777.0,
|
260 |
+
"TilingProfiler::PfTransposeInstructionsForIo": 19008.0,
|
261 |
+
"TilingProfiler::PfTransposeInstructionsForLocal": 1.0,
|
262 |
+
"TilingProfiler::PfTransposeInstructionsForNonlocal": 768.0,
|
263 |
+
"TilingProfiler::ReduceInstructionsAfterTiling": 6.0,
|
264 |
+
"TilingProfiler::SimdInstructionsAfterTiling": 303.0,
|
265 |
+
"TilingProfiler::TotalInstructionsAfterTiling": 0.0,
|
266 |
+
"TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
|
267 |
+
"TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
|
268 |
+
"TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
|
269 |
+
"TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
|
270 |
+
"TransformConvOp::conv2d_column_packing": 0.0,
|
271 |
+
"TransformConvOp::conv2d_column_packing_1": 0.0,
|
272 |
+
"TransformConvOp::conv2d_column_packing_io10": 0.0,
|
273 |
+
"TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
|
274 |
+
}
|
275 |
+
},
|
276 |
+
"all": {
|
277 |
+
"compiletime": {
|
278 |
+
"algsimp": 0.002532999962568283,
|
279 |
+
"call-inliner": 0.00042600001324899495,
|
280 |
+
"collective-stream-id-checker": 6.70000008540228e-05,
|
281 |
+
"comparison-expander": 0.0005719999899156392,
|
282 |
+
"constant-statistics": 0.0005440000095404685,
|
283 |
+
"constant_folding": 0.0003000000142492354,
|
284 |
+
"dce": 8.800000068731606e-05,
|
285 |
+
"dot_decomposer": 0.0013370000524446368,
|
286 |
+
"eliminate-redundant-compare": 0.000291000003926456,
|
287 |
+
"flatten-call-graph": 0.0008929999894462526,
|
288 |
+
"hlo-mac-count": 0.0010870000114664435,
|
289 |
+
"hlo-verifier": 0.007048000115901232,
|
290 |
+
"instruction-histogram": 0.0007719999994151294,
|
291 |
+
"io-con-pipe-begin": 4.999999873689376e-06,
|
292 |
+
"io-con-pipe-end": 9.999999974752427e-07,
|
293 |
+
"io-layout-normalization": 0.00139999995008111,
|
294 |
+
"io-statistics": 6.299999949987978e-05,
|
295 |
+
"map-inline": 0.0008459999808110297,
|
296 |
+
"native-to-custom-softmax": 0.0006709999870508909,
|
297 |
+
"native-to-custom-softmax-dx": 0.0005300000193528831,
|
298 |
+
"opt-barrier-removal": 0.0005789999850094318,
|
299 |
+
"pre-par-pipe-begin": 1.9999999949504854e-06,
|
300 |
+
"pre-par-pipe-end": 0.0,
|
301 |
+
"pre-partition-simplification": 0.1384889930486679,
|
302 |
+
"replace-minimum-constant": 0.00041700000292621553,
|
303 |
+
"reshape-mover": 9.999999747378752e-05,
|
304 |
+
"simplify-while-loops": 8.800000068731606e-05,
|
305 |
+
"tuple-simplifier": 0.000291000003926456,
|
306 |
+
"unpack-nested-aws-ntwsr": 0.00042600001324899495,
|
307 |
+
"unroll-while-loop": 1.8999999156221747e-05,
|
308 |
+
"zero_sized_hlo_elimination": 0.0008750000270083547
|
309 |
+
}
|
310 |
+
},
|
311 |
+
"cumsum": {
|
312 |
+
"compiletime": {
|
313 |
+
"CoalesceCCOp": 0.00023508071899414063,
|
314 |
+
"DMALocalityOpt": 0.00017404556274414063,
|
315 |
+
"DMAProfiler": 0.0008785724639892578,
|
316 |
+
"DataStreaming": 0.0002880096435546875,
|
317 |
+
"DoNothing": 0.00011467933654785156,
|
318 |
+
"ExpandISAMacro": 0.0006787776947021484,
|
319 |
+
"FactorizeBlkDims": 0.0004444122314453125,
|
320 |
+
"InferPSumTensor": 0.0004467964172363281,
|
321 |
+
"LateLegalizeInst": 0.000461578369140625,
|
322 |
+
"LateNeuronInstComb": 0.0004818439483642578,
|
323 |
+
"LegalizeSundaAccess": 0.0016222000122070313,
|
324 |
+
"LegalizeType": 0.0002703666687011719,
|
325 |
+
"LowerBroadcast": 0.00025391578674316406,
|
326 |
+
"LowerIntrinsics": 0.00021457672119140625,
|
327 |
+
"LowerTranspose": 0.00024318695068359375,
|
328 |
+
"NeuronInstComb": 0.00048065185546875,
|
329 |
+
"NeuronLICM": 0.00038552284240722656,
|
330 |
+
"NeuronSimplifyPredicates": 0.0027823448181152344,
|
331 |
+
"NeuronValueNumbering": 0.00043129920959472656,
|
332 |
+
"SFKVectorizer": 0.003134012222290039,
|
333 |
+
"SimpleAllReduceTiling": 0.00022721290588378906,
|
334 |
+
"SimplifyNeuronTensor": 0.0005092620849609375,
|
335 |
+
"SpillPSum": 0.0005443096160888672,
|
336 |
+
"WeightCoalescing": 0.00020051002502441406
|
337 |
+
}
|
338 |
+
},
|
339 |
+
"sg00": {
|
340 |
+
"compiletime": {
|
341 |
+
"CanonicalizeConv": 9.999999974752427e-07,
|
342 |
+
"CanonicalizeForTensorizer": 1.700000029813964e-05,
|
343 |
+
"Canonicalizer": 0.00033599999733269215,
|
344 |
+
"HoistCompute": 3.000000106112566e-06,
|
345 |
+
"IdentifyCrossPassTensors": 1.5999999959603883e-05,
|
346 |
+
"MemcastMotion": 1.1000000085914508e-05,
|
347 |
+
"PenguinizeFunctions": 1.8000000636675395e-05,
|
348 |
+
"PruneFunctions": 1.4000000192027073e-05,
|
349 |
+
"RemoveOptimizationBarriers": 1.2999999853491317e-05,
|
350 |
+
"ScatterMotion": 2.4000000848900527e-05,
|
351 |
+
"TensorizerLegalizationPass": 2.700000004551839e-05,
|
352 |
+
"VerifySupportedOps": 1.2000000424450263e-05,
|
353 |
+
"algsimp": 6.500000017695129e-05,
|
354 |
+
"batchnorm_expander": 1.4000000192027073e-05,
|
355 |
+
"boundary-marker-removal": 3.999999989900971e-06,
|
356 |
+
"call-inliner": 9.000000318337698e-06,
|
357 |
+
"canonicalize-boundary-marker": 6.000000212225132e-06,
|
358 |
+
"collective-stream-id-checker": 3.999999989900971e-06,
|
359 |
+
"comparison-expander": 4.999999873689376e-06,
|
360 |
+
"computation-deduplicator": 2.300000051036477e-05,
|
361 |
+
"conditional-to-select": 4.999999873689376e-06,
|
362 |
+
"config-lowering": 3.9999998989515007e-05,
|
363 |
+
"constant_folding": 9.000000318337698e-06,
|
364 |
+
"cse": 1.2999999853491317e-05,
|
365 |
+
"dce": 9.999999974752427e-07,
|
366 |
+
"dynamic-slice-transpose": 3.999999989900971e-06,
|
367 |
+
"eliminate-redundant-compare": 3.999999989900971e-06,
|
368 |
+
"emit-offloaded-dropout": 1.4000000192027073e-05,
|
369 |
+
"flatten-call-graph": 9.999999747378752e-06,
|
370 |
+
"fuse-send-recv": 2.8000000384054147e-05,
|
371 |
+
"hilo::LegalizeAlias": 4.999999873689376e-06,
|
372 |
+
"hilo::NeuronInstCombine": 8.499999967170879e-05,
|
373 |
+
"hilo::NeuronOpFusion": 2.700000004551839e-05,
|
374 |
+
"hilo::ReplaceTokenTypeWithU8Pass": 1.4999999621068127e-05,
|
375 |
+
"hilo::ScheduleFusion": 9.999999974752427e-07,
|
376 |
+
"hilo::SixtyFourHack": 1.2999999853491317e-05,
|
377 |
+
"hilo::VerifyAliasing": 1.9999999949504854e-06,
|
378 |
+
"hlo-mac-count": 3.099999958067201e-05,
|
379 |
+
"hlo-verifier": 0.0002530000056140125,
|
380 |
+
"legalize-ccops": 9.999999974752427e-07,
|
381 |
+
"legalize-compare": 3.999999989900971e-06,
|
382 |
+
"lower-argminmax-custom-call": 3.999999989900971e-06,
|
383 |
+
"map-inline": 1.2000000424450263e-05,
|
384 |
+
"metadata-naming": 2.4000000848900527e-05,
|
385 |
+
"mlir::detail::OpToOpPassAdaptor": 2.2000000171829015e-05,
|
386 |
+
"mlir::hlo::MhloToPyPenguin": 0.0010389999952167273,
|
387 |
+
"mlir::mhlo::LowerComplexExtraPass": 8.800000068731606e-05,
|
388 |
+
"mlir::mhlo::LowerComplexPass": 0.00014200000441633165,
|
389 |
+
"native-to-custom-softmax": 9.000000136438757e-05,
|
390 |
+
"native-to-custom-softmax-dx": 4.3000000005122274e-05,
|
391 |
+
"operand_upcaster": 2.300000051036477e-05,
|
392 |
+
"post-par-pipe-begin": 3.000000106112566e-06,
|
393 |
+
"post-par-pipe-end": 0.0,
|
394 |
+
"post-partition-simplification": 0.0006249999860301614,
|
395 |
+
"replace-minimum-constant": 2.5999999706982635e-05,
|
396 |
+
"reshape-mover": 3.999999989900971e-06,
|
397 |
+
"simplify-concat": 4.8000001697801054e-05,
|
398 |
+
"simplify-while-loops": 1.9999999949504854e-06,
|
399 |
+
"transform-variadic-reduce": 9.000000318337698e-06,
|
400 |
+
"tuple-simplifier": 4.999999873689376e-06,
|
401 |
+
"unpack-nested-aws-ntwsr": 3.999999989900971e-06,
|
402 |
+
"unroll-while-loop": 0.0
|
403 |
+
},
|
404 |
+
"hilo": {
|
405 |
+
"ArithmeticIntensity": 73.02900695800781,
|
406 |
+
"ConstantSize": 2368805.0,
|
407 |
+
"HloInputCount": 475.0,
|
408 |
+
"HloMacCount": 25769803776.0,
|
409 |
+
"HloOutputCount": 73.0,
|
410 |
+
"IfmapSize": 8266549248.0,
|
411 |
+
"OfmapSize": 75497472.0,
|
412 |
+
"OutputsReadFromCount": 0.0,
|
413 |
+
"PassthroughTensorsCount": 0.0,
|
414 |
+
"RedundantOutputCount": 0.0,
|
415 |
+
"Traffic": 705741632.0
|
416 |
+
}
|
417 |
+
},
|
418 |
+
"sg0000": {
|
419 |
+
"compiletime": {
|
420 |
+
"AGOrderingAnalysisPass": 0.0818486213684082,
|
421 |
+
"AffinePredicateResolution": 0.001665353775024414,
|
422 |
+
"AliasDependencyElimination": 0.00012683868408203125,
|
423 |
+
"AliasDependencyInduction": 0.008559942245483398,
|
424 |
+
"AliasDependencyReset": 0.03254294395446777,
|
425 |
+
"BFComputeCutting": 0.003969907760620117,
|
426 |
+
"BirCodeGenLoop": 0.06339025497436523,
|
427 |
+
"CCOpFusion": 0.029911041259765625,
|
428 |
+
"CanonicalizeDAGForPGTiling": 0.003092050552368164,
|
429 |
+
"CanonicalizeIR": 0.002637147903442383,
|
430 |
+
"CoalesceCCOp": 0.0051479339599609375,
|
431 |
+
"CommuteConcat": 0.001478433609008789,
|
432 |
+
"DMALocalityOpt": 0.0016412734985351563,
|
433 |
+
"DMAProfiler": 0.004613637924194336,
|
434 |
+
"DMATilingProfiler": 0.004850864410400391,
|
435 |
+
"DataLocalityOpt": 0.11357831954956055,
|
436 |
+
"DataStreaming": 0.0061092376708984375,
|
437 |
+
"DeConcat": 0.0013332366943359375,
|
438 |
+
"DeadCodeElimination": 0.0018727779388427734,
|
439 |
+
"DeadStoreElimination": 0.03094482421875,
|
440 |
+
"DelinearIndices": 0.008640289306640625,
|
441 |
+
"Delinearization": 0.0035429000854492188,
|
442 |
+
"DoNothing": 8.106231689453125e-05,
|
443 |
+
"DramToDramTranspose": 0.03549051284790039,
|
444 |
+
"DumpGraphAndMetadata": 0.005577564239501953,
|
445 |
+
"EliminateDivs": 0.003966331481933594,
|
446 |
+
"ExpandBatchNorm": 0.0017447471618652344,
|
447 |
+
"ExpandISAMacro": 0.002687692642211914,
|
448 |
+
"FactorizeBlkDims": 0.026469945907592773,
|
449 |
+
"FactorizeThreadAxesInFreeDims": 0.0014863014221191406,
|
450 |
+
"FlattenMacroLoop": 0.00392913818359375,
|
451 |
+
"GenericAccessSimplifier": 0.0018973350524902344,
|
452 |
+
"InferInitValue": 0.03517007827758789,
|
453 |
+
"InferIntrinsicOnCC": 0.010237932205200195,
|
454 |
+
"InferNeuronTensor": 0.051462411880493164,
|
455 |
+
"InferNonlocalTensors": 0.14991235733032227,
|
456 |
+
"InferPSumTensor": 0.053685903549194336,
|
457 |
+
"InlineNativeKernels": 0.002433300018310547,
|
458 |
+
"InsertIOTransposes": 0.015550613403320313,
|
459 |
+
"InsertLocalTransposes": 0.007843017578125,
|
460 |
+
"InsertOffloadedTransposes": 0.002854585647583008,
|
461 |
+
"LICM": 0.003381490707397461,
|
462 |
+
"LateLegalizeInst": 0.0069310665130615234,
|
463 |
+
"LateLegalizePostSplit": 0.00308990478515625,
|
464 |
+
"LateLowerReshapeOp": 0.0017940998077392578,
|
465 |
+
"LateLowerTensorOp": 0.005001068115234375,
|
466 |
+
"LateNeuronInstComb": 0.016704320907592773,
|
467 |
+
"LayoutPreprocessing": 0.033296823501586914,
|
468 |
+
"LayoutPreprocessingAndAnalysis": 0.12302517890930176,
|
469 |
+
"LayoutRequirementAnalysis": 0.007364988327026367,
|
470 |
+
"LegalizeCCOpLayout": 0.0029296875,
|
471 |
+
"LegalizeOpLevelAlias": 0.0016987323760986328,
|
472 |
+
"LegalizePartitionReduce": 0.0014727115631103516,
|
473 |
+
"LegalizeSundaAccess": 0.04025077819824219,
|
474 |
+
"LegalizeSundaMacro": 0.009906291961669922,
|
475 |
+
"LegalizeType": 0.004493236541748047,
|
476 |
+
"LocalLayoutOpt": 0.017308473587036133,
|
477 |
+
"LoopFusion": 0.005831241607666016,
|
478 |
+
"LoopSplitting": 0.00037789344787597656,
|
479 |
+
"LowerBroadcast": 0.0016851425170898438,
|
480 |
+
"LowerCCOpBlockAxis": 0.005655765533447266,
|
481 |
+
"LowerComplexBroadcast": 0.0020987987518310547,
|
482 |
+
"LowerIntrinsics": 0.040236473083496094,
|
483 |
+
"LowerTensorOp": 0.012641191482543945,
|
484 |
+
"LowerTranspose": 0.0125579833984375,
|
485 |
+
"MacroGeneration": 0.08074021339416504,
|
486 |
+
"MaskPropagation": 0.005038022994995117,
|
487 |
+
"MemcpyElimination": 0.10875082015991211,
|
488 |
+
"MutateDataType": 0.0013315677642822266,
|
489 |
+
"NeuronAliasDependencyInduction": 0.00025200843811035156,
|
490 |
+
"NeuronAliasDependencyReset": 0.021958112716674805,
|
491 |
+
"NeuronInstComb": 0.009703636169433594,
|
492 |
+
"NeuronLICM": 0.011526823043823242,
|
493 |
+
"NeuronLoopFusion": 0.017663955688476563,
|
494 |
+
"NeuronLoopInterchange": 0.002567291259765625,
|
495 |
+
"NeuronSimplifier": 0.011670589447021484,
|
496 |
+
"NeuronSimplifyPredicates": 0.017385244369506836,
|
497 |
+
"NeuronValueNumbering": 0.004181623458862305,
|
498 |
+
"OptimizeAliasedCopyChain": 0.0017867088317871094,
|
499 |
+
"OptimizeNKIKernels": 0.0020456314086914063,
|
500 |
+
"PAGLayoutOpt": 0.3681519031524658,
|
501 |
+
"PComputeCutting": 0.008620262145996094,
|
502 |
+
"PGLayoutTilingPipeline": 1.3210320472717285,
|
503 |
+
"PGTiling": 0.27039527893066406,
|
504 |
+
"PadElimination": 0.0003745555877685547,
|
505 |
+
"ParAxesAnnotation": 0.33005595207214355,
|
506 |
+
"PartialLoopFusion": 0.026912212371826172,
|
507 |
+
"PartialSimdFusion": 0.03544425964355469,
|
508 |
+
"PerfectLoopNest": 0.0021703243255615234,
|
509 |
+
"RecognizeOpIdiom": 0.004334926605224609,
|
510 |
+
"Recompute": 0.0002522468566894531,
|
511 |
+
"RelaxPredicates": 0.004270076751708984,
|
512 |
+
"Rematerialization": 0.005487918853759766,
|
513 |
+
"ReshapeWeights": 0.0006825923919677734,
|
514 |
+
"ResolveAccessConflict": 0.003779888153076172,
|
515 |
+
"ResolveComplicatePredicates": 0.0018131732940673828,
|
516 |
+
"RewriteReplicationMatmul": 0.002633333206176758,
|
517 |
+
"RewriteWeights": 0.0036499500274658203,
|
518 |
+
"SFKVectorizer": 0.2772994041442871,
|
519 |
+
"SimpleAllReduceTiling": 0.002454519271850586,
|
520 |
+
"Simplifier": 0.0045070648193359375,
|
521 |
+
"SimplifyMacroPredicates": 0.016190290451049805,
|
522 |
+
"SimplifyNeuronTensor": 0.01452183723449707,
|
523 |
+
"SimplifySlice": 0.0010039806365966797,
|
524 |
+
"SimplifyTensor": 0.00657200813293457,
|
525 |
+
"SpillPSum": 0.02208685874938965,
|
526 |
+
"SplitAPUnionSets": 0.04095458984375,
|
527 |
+
"SplitAccGrp": 0.0018160343170166016,
|
528 |
+
"StaticProfiler": 0.004816770553588867,
|
529 |
+
"StaticTransposeLocalTensor": 0.004886150360107422,
|
530 |
+
"SundaISel": 0.04611611366271973,
|
531 |
+
"TCTransform": 0.001667022705078125,
|
532 |
+
"TensorInitialization": 0.022374629974365234,
|
533 |
+
"TensorOpSimplifier": 0.006697177886962891,
|
534 |
+
"TensorOpTransform": 0.02793574333190918,
|
535 |
+
"TileCCOps": 0.007641792297363281,
|
536 |
+
"TilingProfiler": 0.015750885009765625,
|
537 |
+
"TransformConvOp": 0.0026845932006835938,
|
538 |
+
"TritiumFusion": 0.08186149597167969,
|
539 |
+
"ValueNumbering": 0.0026755332946777344,
|
540 |
+
"VectorizeDMA": 0.007223367691040039,
|
541 |
+
"VectorizeMatMult": 0.018305540084838867,
|
542 |
+
"WeightCoalescing": 0.003328561782836914,
|
543 |
+
"ZeroSizeTensorElimination": 0.00011229515075683594
|
544 |
+
},
|
545 |
+
"tensorizer": {
|
546 |
+
"DMATilingProfiler::TotalInstructionsAfterTiling": 5862.0,
|
547 |
+
"StaticProfiler::AifUb": 88.59026336669922,
|
548 |
+
"StaticProfiler::ArithmeticIntensityTensorizer": 582.7418823242188,
|
549 |
+
"StaticProfiler::AverageDmaLength": 2248.2685546875,
|
550 |
+
"StaticProfiler::AverageFractalPeUtilization": 99.96076202392578,
|
551 |
+
"StaticProfiler::AveragePartitionUtilization": 99.90216827392578,
|
552 |
+
"StaticProfiler::AveragePeUtilization": 99.8394546508789,
|
553 |
+
"StaticProfiler::DDRTransferBytes": 104424704.0,
|
554 |
+
"StaticProfiler::InternalTransferBytes": 122421248.0,
|
555 |
+
"StaticProfiler::LoadExpanded": 25346.0,
|
556 |
+
"StaticProfiler::LocalizationEfficiency": 657.7944946289063,
|
557 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 915.0787353515625,
|
558 |
+
"StaticProfiler::StoreExpanded": 10753.0,
|
559 |
+
"StaticProfiler::TotalDMAExpanded": 36099.0,
|
560 |
+
"StaticProfiler::TotalDynamicInstancesCount": 8866.0,
|
561 |
+
"StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 8860.0,
|
562 |
+
"StaticProfiler::TotalLNCComm": 0.0,
|
563 |
+
"StaticProfiler::TotalLNCCommTransfer": 0.0,
|
564 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
|
565 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
|
566 |
+
"TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
|
567 |
+
"TilingProfiler::DmaInstructionsAfterTiling": 0.0,
|
568 |
+
"TilingProfiler::GenericInstructionsAfterTiling": 96.0,
|
569 |
+
"TilingProfiler::MatMultInstructionsAfterTiling": 3080.0,
|
570 |
+
"TilingProfiler::NumPfTransposes": 8.0,
|
571 |
+
"TilingProfiler::NumPfTransposesForIo": 1.0,
|
572 |
+
"TilingProfiler::NumPfTransposesForLocal": 6.0,
|
573 |
+
"TilingProfiler::NumPfTransposesForNonlocal": 1.0,
|
574 |
+
"TilingProfiler::PfTransposeInstructions": 1760.0,
|
575 |
+
"TilingProfiler::PfTransposeInstructionsForIo": 256.0,
|
576 |
+
"TilingProfiler::PfTransposeInstructionsForLocal": 1376.0,
|
577 |
+
"TilingProfiler::PfTransposeInstructionsForNonlocal": 128.0,
|
578 |
+
"TilingProfiler::ReduceInstructionsAfterTiling": 0.0,
|
579 |
+
"TilingProfiler::SimdInstructionsAfterTiling": 649.0,
|
580 |
+
"TilingProfiler::TotalInstructionsAfterTiling": 0.0,
|
581 |
+
"TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
|
582 |
+
"TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
|
583 |
+
"TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
|
584 |
+
"TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
|
585 |
+
"TransformConvOp::conv2d_column_packing": 0.0,
|
586 |
+
"TransformConvOp::conv2d_column_packing_1": 0.0,
|
587 |
+
"TransformConvOp::conv2d_column_packing_io10": 0.0,
|
588 |
+
"TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
|
589 |
+
}
|
590 |
+
},
|
591 |
+
"sg0001": {
|
592 |
+
"compiletime": {
|
593 |
+
"AGOrderingAnalysisPass": 0.03383040428161621,
|
594 |
+
"AffinePredicateResolution": 0.0015320777893066406,
|
595 |
+
"AliasDependencyElimination": 0.0001316070556640625,
|
596 |
+
"AliasDependencyInduction": 0.00819253921508789,
|
597 |
+
"AliasDependencyReset": 0.02862405776977539,
|
598 |
+
"BFComputeCutting": 0.004217624664306641,
|
599 |
+
"BirCodeGenLoop": 0.0443270206451416,
|
600 |
+
"CCOpFusion": 0.04336118698120117,
|
601 |
+
"CanonicalizeDAGForPGTiling": 0.0031616687774658203,
|
602 |
+
"CanonicalizeIR": 0.0021500587463378906,
|
603 |
+
"CoalesceCCOp": 0.005389690399169922,
|
604 |
+
"CommuteConcat": 0.0024237632751464844,
|
605 |
+
"DMALocalityOpt": 0.002274751663208008,
|
606 |
+
"DMAProfiler": 0.003973484039306641,
|
607 |
+
"DMATilingProfiler": 0.005924701690673828,
|
608 |
+
"DataLocalityOpt": 0.15027260780334473,
|
609 |
+
"DataStreaming": 0.004762887954711914,
|
610 |
+
"DeConcat": 0.0018739700317382813,
|
611 |
+
"DeadCodeElimination": 0.001882314682006836,
|
612 |
+
"DeadStoreElimination": 0.03486776351928711,
|
613 |
+
"DelinearIndices": 0.009628534317016602,
|
614 |
+
"Delinearization": 0.0037381649017333984,
|
615 |
+
"DoNothing": 6.985664367675781e-05,
|
616 |
+
"DramToDramTranspose": 0.04212188720703125,
|
617 |
+
"DumpGraphAndMetadata": 0.004312038421630859,
|
618 |
+
"EliminateDivs": 0.005432844161987305,
|
619 |
+
"ExpandBatchNorm": 0.002119302749633789,
|
620 |
+
"ExpandISAMacro": 0.0024309158325195313,
|
621 |
+
"FactorizeBlkDims": 0.02235579490661621,
|
622 |
+
"FactorizeThreadAxesInFreeDims": 0.0018169879913330078,
|
623 |
+
"FlattenMacroLoop": 0.0030968189239501953,
|
624 |
+
"GenericAccessSimplifier": 0.0016777515411376953,
|
625 |
+
"InferInitValue": 0.043079376220703125,
|
626 |
+
"InferIntrinsicOnCC": 0.009890556335449219,
|
627 |
+
"InferNeuronTensor": 0.05600404739379883,
|
628 |
+
"InferNonlocalTensors": 0.03101515769958496,
|
629 |
+
"InferPSumTensor": 0.04645681381225586,
|
630 |
+
"InlineNativeKernels": 0.0015399456024169922,
|
631 |
+
"InsertIOTransposes": 0.02417731285095215,
|
632 |
+
"InsertLocalTransposes": 0.0070497989654541016,
|
633 |
+
"InsertOffloadedTransposes": 0.003525972366333008,
|
634 |
+
"LICM": 0.0035805702209472656,
|
635 |
+
"LateLegalizeInst": 0.0041539669036865234,
|
636 |
+
"LateLegalizePostSplit": 0.0027403831481933594,
|
637 |
+
"LateLowerReshapeOp": 0.0014560222625732422,
|
638 |
+
"LateLowerTensorOp": 0.004617452621459961,
|
639 |
+
"LateNeuronInstComb": 0.015344619750976563,
|
640 |
+
"LayoutPreprocessing": 0.030884981155395508,
|
641 |
+
"LayoutPreprocessingAndAnalysis": 0.06435275077819824,
|
642 |
+
"LayoutRequirementAnalysis": 0.007463693618774414,
|
643 |
+
"LegalizeCCOpLayout": 0.002064943313598633,
|
644 |
+
"LegalizeOpLevelAlias": 0.0011925697326660156,
|
645 |
+
"LegalizePartitionReduce": 0.0026116371154785156,
|
646 |
+
"LegalizeSundaAccess": 0.015822887420654297,
|
647 |
+
"LegalizeSundaMacro": 0.012560844421386719,
|
648 |
+
"LegalizeType": 0.004744291305541992,
|
649 |
+
"LocalLayoutOpt": 0.023772239685058594,
|
650 |
+
"LoopFusion": 0.0066835880279541016,
|
651 |
+
"LoopSplitting": 0.0003638267517089844,
|
652 |
+
"LowerBroadcast": 0.002238750457763672,
|
653 |
+
"LowerCCOpBlockAxis": 0.005678653717041016,
|
654 |
+
"LowerComplexBroadcast": 0.0019271373748779297,
|
655 |
+
"LowerIntrinsics": 0.042801856994628906,
|
656 |
+
"LowerTensorOp": 0.012106895446777344,
|
657 |
+
"LowerTranspose": 0.012960433959960938,
|
658 |
+
"MacroGeneration": 0.12800955772399902,
|
659 |
+
"MaskPropagation": 0.0031516551971435547,
|
660 |
+
"MemcpyElimination": 0.10379505157470703,
|
661 |
+
"MutateDataType": 0.0014393329620361328,
|
662 |
+
"NeuronAliasDependencyInduction": 0.00022101402282714844,
|
663 |
+
"NeuronAliasDependencyReset": 0.020102262496948242,
|
664 |
+
"NeuronInstComb": 0.009283781051635742,
|
665 |
+
"NeuronLICM": 0.009867429733276367,
|
666 |
+
"NeuronLoopFusion": 0.022713661193847656,
|
667 |
+
"NeuronLoopInterchange": 0.002709627151489258,
|
668 |
+
"NeuronSimplifier": 0.01328134536743164,
|
669 |
+
"NeuronSimplifyPredicates": 0.001683950424194336,
|
670 |
+
"NeuronValueNumbering": 0.0033235549926757813,
|
671 |
+
"OptimizeAliasedCopyChain": 0.0007724761962890625,
|
672 |
+
"OptimizeNKIKernels": 0.001729726791381836,
|
673 |
+
"PAGLayoutOpt": 0.13172507286071777,
|
674 |
+
"PComputeCutting": 0.007474422454833984,
|
675 |
+
"PGLayoutTilingPipeline": 0.9329550266265869,
|
676 |
+
"PGTiling": 0.4518747329711914,
|
677 |
+
"PadElimination": 0.00040411949157714844,
|
678 |
+
"ParAxesAnnotation": 0.0915369987487793,
|
679 |
+
"PartialLoopFusion": 0.020573854446411133,
|
680 |
+
"PartialSimdFusion": 0.04284977912902832,
|
681 |
+
"PerfectLoopNest": 0.002377033233642578,
|
682 |
+
"RecognizeOpIdiom": 0.0049991607666015625,
|
683 |
+
"Recompute": 0.00026345252990722656,
|
684 |
+
"RelaxPredicates": 0.0034220218658447266,
|
685 |
+
"Rematerialization": 0.0021615028381347656,
|
686 |
+
"ReshapeWeights": 0.0007557868957519531,
|
687 |
+
"ResolveAccessConflict": 0.004181861877441406,
|
688 |
+
"ResolveComplicatePredicates": 0.0015151500701904297,
|
689 |
+
"RewriteReplicationMatmul": 0.0020759105682373047,
|
690 |
+
"RewriteWeights": 0.0036649703979492188,
|
691 |
+
"SFKVectorizer": 0.20148277282714844,
|
692 |
+
"SimpleAllReduceTiling": 0.003732442855834961,
|
693 |
+
"Simplifier": 0.004697084426879883,
|
694 |
+
"SimplifyMacroPredicates": 0.007361888885498047,
|
695 |
+
"SimplifyNeuronTensor": 0.009825944900512695,
|
696 |
+
"SimplifySlice": 0.0017888545989990234,
|
697 |
+
"SimplifyTensor": 0.006832122802734375,
|
698 |
+
"SpillPSum": 0.022799968719482422,
|
699 |
+
"SplitAPUnionSets": 0.020108938217163086,
|
700 |
+
"SplitAccGrp": 0.0015766620635986328,
|
701 |
+
"StaticProfiler": 0.004146099090576172,
|
702 |
+
"StaticTransposeLocalTensor": 0.004926919937133789,
|
703 |
+
"SundaISel": 0.04472494125366211,
|
704 |
+
"TCTransform": 0.0018138885498046875,
|
705 |
+
"TensorInitialization": 0.004791736602783203,
|
706 |
+
"TensorOpSimplifier": 0.0064849853515625,
|
707 |
+
"TensorOpTransform": 0.0333099365234375,
|
708 |
+
"TileCCOps": 0.0056035518646240234,
|
709 |
+
"TilingProfiler": 0.01600933074951172,
|
710 |
+
"TransformConvOp": 0.002446413040161133,
|
711 |
+
"TritiumFusion": 0.1239166259765625,
|
712 |
+
"ValueNumbering": 0.0030901432037353516,
|
713 |
+
"VectorizeDMA": 0.0017311573028564453,
|
714 |
+
"VectorizeMatMult": 0.018932580947875977,
|
715 |
+
"WeightCoalescing": 0.0027513504028320313,
|
716 |
+
"ZeroSizeTensorElimination": 0.00011587142944335938
|
717 |
+
},
|
718 |
+
"tensorizer": {
|
719 |
+
"DMATilingProfiler::TotalInstructionsAfterTiling": 15811.0,
|
720 |
+
"StaticProfiler::AifUb": 934.4357299804688,
|
721 |
+
"StaticProfiler::ArithmeticIntensityTensorizer": 708.8487548828125,
|
722 |
+
"StaticProfiler::AverageDmaLength": 1109.3380126953125,
|
723 |
+
"StaticProfiler::AverageFractalPeUtilization": 100.0,
|
724 |
+
"StaticProfiler::AveragePartitionUtilization": 99.8372802734375,
|
725 |
+
"StaticProfiler::AveragePeUtilization": 100.0,
|
726 |
+
"StaticProfiler::DDRTransferBytes": 306283520.0,
|
727 |
+
"StaticProfiler::InternalTransferBytes": 104595456.0,
|
728 |
+
"StaticProfiler::LoadExpanded": 257536.0,
|
729 |
+
"StaticProfiler::LocalizationEfficiency": 75.85848236083984,
|
730 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 85.1915054321289,
|
731 |
+
"StaticProfiler::StoreExpanded": 10241.0,
|
732 |
+
"StaticProfiler::TotalDMAExpanded": 267777.0,
|
733 |
+
"StaticProfiler::TotalDynamicInstancesCount": 19667.0,
|
734 |
+
"StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 19667.0,
|
735 |
+
"StaticProfiler::TotalLNCComm": 0.0,
|
736 |
+
"StaticProfiler::TotalLNCCommTransfer": 0.0,
|
737 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
|
738 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
|
739 |
+
"TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
|
740 |
+
"TilingProfiler::DmaInstructionsAfterTiling": 0.0,
|
741 |
+
"TilingProfiler::GenericInstructionsAfterTiling": 64.0,
|
742 |
+
"TilingProfiler::MatMultInstructionsAfterTiling": 12288.0,
|
743 |
+
"TilingProfiler::NumPfTransposes": 9.0,
|
744 |
+
"TilingProfiler::NumPfTransposesForIo": 3.0,
|
745 |
+
"TilingProfiler::NumPfTransposesForLocal": 4.0,
|
746 |
+
"TilingProfiler::NumPfTransposesForNonlocal": 2.0,
|
747 |
+
"TilingProfiler::PfTransposeInstructions": 1904.0,
|
748 |
+
"TilingProfiler::PfTransposeInstructionsForIo": 272.0,
|
749 |
+
"TilingProfiler::PfTransposeInstructionsForLocal": 1120.0,
|
750 |
+
"TilingProfiler::PfTransposeInstructionsForNonlocal": 512.0,
|
751 |
+
"TilingProfiler::ReduceInstructionsAfterTiling": 0.0,
|
752 |
+
"TilingProfiler::SimdInstructionsAfterTiling": 704.0,
|
753 |
+
"TilingProfiler::TotalInstructionsAfterTiling": 0.0,
|
754 |
+
"TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
|
755 |
+
"TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
|
756 |
+
"TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
|
757 |
+
"TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
|
758 |
+
"TransformConvOp::conv2d_column_packing": 0.0,
|
759 |
+
"TransformConvOp::conv2d_column_packing_1": 0.0,
|
760 |
+
"TransformConvOp::conv2d_column_packing_io10": 0.0,
|
761 |
+
"TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
|
762 |
+
}
|
763 |
+
},
|
764 |
+
"sg0002": {
|
765 |
+
"compiletime": {
|
766 |
+
"AGOrderingAnalysisPass": 0.01837611198425293,
|
767 |
+
"AffinePredicateResolution": 0.0011184215545654297,
|
768 |
+
"AliasDependencyElimination": 0.00015664100646972656,
|
769 |
+
"AliasDependencyInduction": 0.005170583724975586,
|
770 |
+
"AliasDependencyReset": 0.027508020401000977,
|
771 |
+
"BFComputeCutting": 0.0036101341247558594,
|
772 |
+
"BirCodeGenLoop": 0.4774467945098877,
|
773 |
+
"CCOpFusion": 0.033265113830566406,
|
774 |
+
"CanonicalizeDAGForPGTiling": 0.004282712936401367,
|
775 |
+
"CanonicalizeIR": 0.0024569034576416016,
|
776 |
+
"CoalesceCCOp": 0.013993978500366211,
|
777 |
+
"CommuteConcat": 0.0017316341400146484,
|
778 |
+
"DMALocalityOpt": 0.005455970764160156,
|
779 |
+
"DMAProfiler": 0.012103080749511719,
|
780 |
+
"DMATilingProfiler": 0.0037560462951660156,
|
781 |
+
"DataLocalityOpt": 0.07645320892333984,
|
782 |
+
"DataStreaming": 0.03701519966125488,
|
783 |
+
"DeConcat": 0.0018520355224609375,
|
784 |
+
"DeadCodeElimination": 0.0020148754119873047,
|
785 |
+
"DeadStoreElimination": 0.006912708282470703,
|
786 |
+
"DelinearIndices": 0.004647254943847656,
|
787 |
+
"Delinearization": 0.003908872604370117,
|
788 |
+
"DoNothing": 7.414817810058594e-05,
|
789 |
+
"DramToDramTranspose": 0.02015542984008789,
|
790 |
+
"DumpGraphAndMetadata": 0.08691883087158203,
|
791 |
+
"EliminateDivs": 0.0025060176849365234,
|
792 |
+
"ExpandBatchNorm": 0.0027189254760742188,
|
793 |
+
"ExpandISAMacro": 0.010967254638671875,
|
794 |
+
"FactorizeBlkDims": 0.009678840637207031,
|
795 |
+
"FactorizeThreadAxesInFreeDims": 0.0023202896118164063,
|
796 |
+
"FlattenMacroLoop": 0.00232696533203125,
|
797 |
+
"GenericAccessSimplifier": 0.0008094310760498047,
|
798 |
+
"InferInitValue": 0.02833867073059082,
|
799 |
+
"InferIntrinsicOnCC": 0.008923768997192383,
|
800 |
+
"InferNeuronTensor": 0.025766372680664063,
|
801 |
+
"InferNonlocalTensors": 0.014599800109863281,
|
802 |
+
"InferPSumTensor": 0.28374218940734863,
|
803 |
+
"InlineNativeKernels": 0.00860905647277832,
|
804 |
+
"InsertIOTransposes": 0.01989889144897461,
|
805 |
+
"InsertLocalTransposes": 0.004229307174682617,
|
806 |
+
"InsertOffloadedTransposes": 0.0029871463775634766,
|
807 |
+
"LICM": 0.0030870437622070313,
|
808 |
+
"LateLegalizeInst": 0.01364445686340332,
|
809 |
+
"LateLegalizePostSplit": 0.014872312545776367,
|
810 |
+
"LateLowerReshapeOp": 0.0010464191436767578,
|
811 |
+
"LateLowerTensorOp": 0.002707242965698242,
|
812 |
+
"LateNeuronInstComb": 0.01008152961730957,
|
813 |
+
"LayoutPreprocessing": 0.026853561401367188,
|
814 |
+
"LayoutPreprocessingAndAnalysis": 0.0556035041809082,
|
815 |
+
"LayoutRequirementAnalysis": 0.004946470260620117,
|
816 |
+
"LegalizeCCOpLayout": 0.0025353431701660156,
|
817 |
+
"LegalizeOpLevelAlias": 0.0018966197967529297,
|
818 |
+
"LegalizePartitionReduce": 0.0017490386962890625,
|
819 |
+
"LegalizeSundaAccess": 0.0763850212097168,
|
820 |
+
"LegalizeSundaMacro": 0.012125253677368164,
|
821 |
+
"LegalizeType": 0.012414693832397461,
|
822 |
+
"LocalLayoutOpt": 0.013860225677490234,
|
823 |
+
"LoopFusion": 0.005201578140258789,
|
824 |
+
"LoopSplitting": 0.0003204345703125,
|
825 |
+
"LowerBroadcast": 0.0018322467803955078,
|
826 |
+
"LowerCCOpBlockAxis": 0.0040171146392822266,
|
827 |
+
"LowerComplexBroadcast": 0.002280712127685547,
|
828 |
+
"LowerIntrinsics": 0.3141806125640869,
|
829 |
+
"LowerTensorOp": 0.01141357421875,
|
830 |
+
"LowerTranspose": 0.012679815292358398,
|
831 |
+
"MacroGeneration": 0.034410953521728516,
|
832 |
+
"MaskPropagation": 0.0028192996978759766,
|
833 |
+
"MemcpyElimination": 0.02788853645324707,
|
834 |
+
"MutateDataType": 0.0012311935424804688,
|
835 |
+
"NeuronAliasDependencyInduction": 0.0001773834228515625,
|
836 |
+
"NeuronAliasDependencyReset": 0.024976015090942383,
|
837 |
+
"NeuronInstComb": 0.004675865173339844,
|
838 |
+
"NeuronLICM": 0.03631091117858887,
|
839 |
+
"NeuronLoopFusion": 0.008457422256469727,
|
840 |
+
"NeuronLoopInterchange": 0.001413106918334961,
|
841 |
+
"NeuronSimplifier": 0.007856369018554688,
|
842 |
+
"NeuronSimplifyPredicates": 0.11957573890686035,
|
843 |
+
"NeuronValueNumbering": 0.004334449768066406,
|
844 |
+
"OptimizeAliasedCopyChain": 0.0006341934204101563,
|
845 |
+
"OptimizeNKIKernels": 0.38834357261657715,
|
846 |
+
"PAGLayoutOpt": 0.0889735221862793,
|
847 |
+
"PComputeCutting": 0.005109071731567383,
|
848 |
+
"PGLayoutTilingPipeline": 0.6248171329498291,
|
849 |
+
"PGTiling": 0.1645822525024414,
|
850 |
+
"PadElimination": 0.0003485679626464844,
|
851 |
+
"ParAxesAnnotation": 0.05196070671081543,
|
852 |
+
"PartialLoopFusion": 0.011112451553344727,
|
853 |
+
"PartialSimdFusion": 0.012138128280639648,
|
854 |
+
"PerfectLoopNest": 0.002288341522216797,
|
855 |
+
"RecognizeOpIdiom": 0.0041277408599853516,
|
856 |
+
"Recompute": 0.00026416778564453125,
|
857 |
+
"RelaxPredicates": 0.01356959342956543,
|
858 |
+
"Rematerialization": 0.0024864673614501953,
|
859 |
+
"ReshapeWeights": 0.0007522106170654297,
|
860 |
+
"ResolveAccessConflict": 0.0048482418060302734,
|
861 |
+
"ResolveComplicatePredicates": 0.0015094280242919922,
|
862 |
+
"RewriteReplicationMatmul": 0.0015668869018554688,
|
863 |
+
"RewriteWeights": 0.0027174949645996094,
|
864 |
+
"SFKVectorizer": 0.27501797676086426,
|
865 |
+
"SimpleAllReduceTiling": 0.009322166442871094,
|
866 |
+
"Simplifier": 0.003630399703979492,
|
867 |
+
"SimplifyMacroPredicates": 0.011396646499633789,
|
868 |
+
"SimplifyNeuronTensor": 1.0555970668792725,
|
869 |
+
"SimplifySlice": 0.0023348331451416016,
|
870 |
+
"SimplifyTensor": 0.005601167678833008,
|
871 |
+
"SpillPSum": 0.013073921203613281,
|
872 |
+
"SplitAPUnionSets": 0.11336159706115723,
|
873 |
+
"SplitAccGrp": 0.001394510269165039,
|
874 |
+
"StaticProfiler": 0.014252662658691406,
|
875 |
+
"StaticTransposeLocalTensor": 0.003930330276489258,
|
876 |
+
"SundaISel": 0.04436635971069336,
|
877 |
+
"TCTransform": 0.0008757114410400391,
|
878 |
+
"TensorInitialization": 0.01558232307434082,
|
879 |
+
"TensorOpSimplifier": 0.004608869552612305,
|
880 |
+
"TensorOpTransform": 0.01923346519470215,
|
881 |
+
"TileCCOps": 0.005507707595825195,
|
882 |
+
"TilingProfiler": 0.007405757904052734,
|
883 |
+
"TransformConvOp": 0.0030219554901123047,
|
884 |
+
"TritiumFusion": 0.05425119400024414,
|
885 |
+
"ValueNumbering": 0.0020017623901367188,
|
886 |
+
"VectorizeDMA": 0.002228975296020508,
|
887 |
+
"VectorizeMatMult": 0.006806135177612305,
|
888 |
+
"WeightCoalescing": 0.008460044860839844,
|
889 |
+
"ZeroSizeTensorElimination": 0.00014281272888183594
|
890 |
+
},
|
891 |
+
"tensorizer": {
|
892 |
+
"DMATilingProfiler::TotalInstructionsAfterTiling": 49538.0,
|
893 |
+
"StaticProfiler::AifUb": 304.240234375,
|
894 |
+
"StaticProfiler::ArithmeticIntensityTensorizer": 278.67474365234375,
|
895 |
+
"StaticProfiler::AverageDmaLength": 1974.1033935546875,
|
896 |
+
"StaticProfiler::AverageFractalPeUtilization": 99.7004623413086,
|
897 |
+
"StaticProfiler::AveragePartitionUtilization": 97.94140625,
|
898 |
+
"StaticProfiler::AveragePeUtilization": 98.78884887695313,
|
899 |
+
"StaticProfiler::DDRTransferBytes": 862646080.0,
|
900 |
+
"StaticProfiler::InternalTransferBytes": 669456896.0,
|
901 |
+
"StaticProfiler::LoadExpanded": 390679.0,
|
902 |
+
"StaticProfiler::LocalizationEfficiency": 91.59693145751953,
|
903 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 95.863037109375,
|
904 |
+
"StaticProfiler::StoreExpanded": 7261.0,
|
905 |
+
"StaticProfiler::TotalDMAExpanded": 397940.0,
|
906 |
+
"StaticProfiler::TotalDynamicInstancesCount": 59578.0,
|
907 |
+
"StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 59132.0,
|
908 |
+
"StaticProfiler::TotalLNCComm": 0.0,
|
909 |
+
"StaticProfiler::TotalLNCCommTransfer": 0.0,
|
910 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
|
911 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
|
912 |
+
"TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
|
913 |
+
"TilingProfiler::DmaInstructionsAfterTiling": 0.0,
|
914 |
+
"TilingProfiler::GenericInstructionsAfterTiling": 4.0,
|
915 |
+
"TilingProfiler::MatMultInstructionsAfterTiling": 28224.0,
|
916 |
+
"TilingProfiler::NumPfTransposes": 5.0,
|
917 |
+
"TilingProfiler::NumPfTransposesForIo": 1.0,
|
918 |
+
"TilingProfiler::NumPfTransposesForLocal": 1.0,
|
919 |
+
"TilingProfiler::NumPfTransposesForNonlocal": 3.0,
|
920 |
+
"TilingProfiler::PfTransposeInstructions": 19777.0,
|
921 |
+
"TilingProfiler::PfTransposeInstructionsForIo": 19008.0,
|
922 |
+
"TilingProfiler::PfTransposeInstructionsForLocal": 1.0,
|
923 |
+
"TilingProfiler::PfTransposeInstructionsForNonlocal": 768.0,
|
924 |
+
"TilingProfiler::ReduceInstructionsAfterTiling": 6.0,
|
925 |
+
"TilingProfiler::SimdInstructionsAfterTiling": 303.0,
|
926 |
+
"TilingProfiler::TotalInstructionsAfterTiling": 0.0,
|
927 |
+
"TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
|
928 |
+
"TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
|
929 |
+
"TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
|
930 |
+
"TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
|
931 |
+
"TransformConvOp::conv2d_column_packing": 0.0,
|
932 |
+
"TransformConvOp::conv2d_column_packing_1": 0.0,
|
933 |
+
"TransformConvOp::conv2d_column_packing_io10": 0.0,
|
934 |
+
"TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
|
935 |
+
}
|
936 |
+
},
|
937 |
+
"sg01": {
|
938 |
+
"compiletime": {
|
939 |
+
"CanonicalizeConv": 2.2000000171829015e-05,
|
940 |
+
"CanonicalizeForTensorizer": 1.4999999621068127e-05,
|
941 |
+
"Canonicalizer": 0.00025499999173916876,
|
942 |
+
"HoistCompute": 1.9999999949504854e-06,
|
943 |
+
"IdentifyCrossPassTensors": 2.499999936844688e-05,
|
944 |
+
"MemcastMotion": 7.000000096013537e-06,
|
945 |
+
"PenguinizeFunctions": 1.4999999621068127e-05,
|
946 |
+
"PruneFunctions": 1.8999999156221747e-05,
|
947 |
+
"RemoveOptimizationBarriers": 2.700000004551839e-05,
|
948 |
+
"ScatterMotion": 1.9999999494757503e-05,
|
949 |
+
"TensorizerLegalizationPass": 1.9999999494757503e-05,
|
950 |
+
"VerifySupportedOps": 1.1000000085914508e-05,
|
951 |
+
"algsimp": 6.299999949987978e-05,
|
952 |
+
"batchnorm_expander": 1.4000000192027073e-05,
|
953 |
+
"boundary-marker-removal": 4.999999873689376e-06,
|
954 |
+
"call-inliner": 9.000000318337698e-06,
|
955 |
+
"canonicalize-boundary-marker": 6.000000212225132e-06,
|
956 |
+
"collective-stream-id-checker": 4.999999873689376e-06,
|
957 |
+
"comparison-expander": 4.999999873689376e-06,
|
958 |
+
"computation-deduplicator": 2.5999999706982635e-05,
|
959 |
+
"conditional-to-select": 4.999999873689376e-06,
|
960 |
+
"config-lowering": 2.2000000171829015e-05,
|
961 |
+
"constant_folding": 9.000000318337698e-06,
|
962 |
+
"cse": 1.2000000424450263e-05,
|
963 |
+
"dce": 9.999999974752427e-07,
|
964 |
+
"dynamic-slice-transpose": 3.999999989900971e-06,
|
965 |
+
"eliminate-redundant-compare": 3.999999989900971e-06,
|
966 |
+
"emit-offloaded-dropout": 1.2999999853491317e-05,
|
967 |
+
"flatten-call-graph": 9.000000318337698e-06,
|
968 |
+
"fuse-send-recv": 2.9999999242136255e-05,
|
969 |
+
"hilo::LegalizeAlias": 4.999999873689376e-06,
|
970 |
+
"hilo::NeuronInstCombine": 3.600000127335079e-05,
|
971 |
+
"hilo::NeuronOpFusion": 1.4000000192027073e-05,
|
972 |
+
"hilo::ReplaceTokenTypeWithU8Pass": 2.099999983329326e-05,
|
973 |
+
"hilo::ScheduleFusion": 9.999999974752427e-07,
|
974 |
+
"hilo::SixtyFourHack": 1.4000000192027073e-05,
|
975 |
+
"hilo::VerifyAliasing": 1.9999999949504854e-06,
|
976 |
+
"hlo-mac-count": 4.600000102072954e-05,
|
977 |
+
"hlo-verifier": 0.00023299999884329736,
|
978 |
+
"legalize-ccops": 9.999999974752427e-07,
|
979 |
+
"legalize-compare": 3.999999989900971e-06,
|
980 |
+
"lower-argminmax-custom-call": 3.999999989900971e-06,
|
981 |
+
"map-inline": 1.1000000085914508e-05,
|
982 |
+
"metadata-naming": 2.700000004551839e-05,
|
983 |
+
"mlir::detail::OpToOpPassAdaptor": 0.00017299999308306724,
|
984 |
+
"mlir::hlo::MhloToPyPenguin": 0.0009840000420808792,
|
985 |
+
"mlir::mhlo::LowerComplexExtraPass": 9.600000339560211e-05,
|
986 |
+
"mlir::mhlo::LowerComplexPass": 0.00013600000238511711,
|
987 |
+
"native-to-custom-softmax": 6.000000212225132e-06,
|
988 |
+
"native-to-custom-softmax-dx": 2.2000000171829015e-05,
|
989 |
+
"operand_upcaster": 2.4000000848900527e-05,
|
990 |
+
"post-par-pipe-begin": 3.000000106112566e-06,
|
991 |
+
"post-par-pipe-end": 0.0,
|
992 |
+
"post-partition-simplification": 0.0005660000024363399,
|
993 |
+
"replace-minimum-constant": 6.000000212225132e-06,
|
994 |
+
"reshape-mover": 3.000000106112566e-06,
|
995 |
+
"simplify-concat": 4.8999998398358e-05,
|
996 |
+
"simplify-while-loops": 1.9999999949504854e-06,
|
997 |
+
"transform-variadic-reduce": 9.000000318337698e-06,
|
998 |
+
"tuple-simplifier": 4.999999873689376e-06,
|
999 |
+
"unpack-nested-aws-ntwsr": 3.999999989900971e-06,
|
1000 |
+
"unroll-while-loop": 0.0
|
1001 |
+
},
|
1002 |
+
"hilo": {
|
1003 |
+
"ArithmeticIntensity": 834.6854858398438,
|
1004 |
+
"HloMacCount": 103079215104.0,
|
1005 |
+
"Traffic": 246989344.0
|
1006 |
+
}
|
1007 |
+
},
|
1008 |
+
"sg02": {
|
1009 |
+
"compiletime": {
|
1010 |
+
"CanonicalizeConv": 0.0,
|
1011 |
+
"CanonicalizeForTensorizer": 1.4000000192027073e-05,
|
1012 |
+
"Canonicalizer": 0.0003129999968223274,
|
1013 |
+
"HoistCompute": 9.999999974752427e-07,
|
1014 |
+
"IdentifyCrossPassTensors": 1.2000000424450263e-05,
|
1015 |
+
"MemcastMotion": 0.0,
|
1016 |
+
"PenguinizeFunctions": 9.999999747378752e-06,
|
1017 |
+
"PruneFunctions": 7.999999979801942e-06,
|
1018 |
+
"RemoveOptimizationBarriers": 9.000000318337698e-06,
|
1019 |
+
"ScatterMotion": 3.000000106112566e-06,
|
1020 |
+
"TensorizerLegalizationPass": 6.000000212225132e-06,
|
1021 |
+
"VerifySupportedOps": 1.2000000424450263e-05,
|
1022 |
+
"algsimp": 5.999999848427251e-05,
|
1023 |
+
"batchnorm_expander": 1.2999999853491317e-05,
|
1024 |
+
"boundary-marker-removal": 3.999999989900971e-06,
|
1025 |
+
"call-inliner": 9.999999747378752e-06,
|
1026 |
+
"canonicalize-boundary-marker": 4.999999873689376e-06,
|
1027 |
+
"collective-stream-id-checker": 3.999999989900971e-06,
|
1028 |
+
"comparison-expander": 4.999999873689376e-06,
|
1029 |
+
"computation-deduplicator": 2.5999999706982635e-05,
|
1030 |
+
"conditional-to-select": 7.000000096013537e-06,
|
1031 |
+
"config-lowering": 2.5999999706982635e-05,
|
1032 |
+
"constant_folding": 9.000000318337698e-06,
|
1033 |
+
"cse": 1.2000000424450263e-05,
|
1034 |
+
"dce": 9.999999974752427e-07,
|
1035 |
+
"dynamic-slice-transpose": 3.999999989900971e-06,
|
1036 |
+
"eliminate-redundant-compare": 3.000000106112566e-06,
|
1037 |
+
"emit-offloaded-dropout": 1.2999999853491317e-05,
|
1038 |
+
"flatten-call-graph": 1.2000000424450263e-05,
|
1039 |
+
"fuse-send-recv": 1.9999999494757503e-05,
|
1040 |
+
"hilo::LegalizeAlias": 1.9999999949504854e-06,
|
1041 |
+
"hilo::NeuronInstCombine": 6.800000119255856e-05,
|
1042 |
+
"hilo::NeuronOpFusion": 3.999999989900971e-06,
|
1043 |
+
"hilo::ReplaceTokenTypeWithU8Pass": 2.2000000171829015e-05,
|
1044 |
+
"hilo::ScheduleFusion": 0.00015900000289548188,
|
1045 |
+
"hilo::SixtyFourHack": 3.9999998989515007e-05,
|
1046 |
+
"hilo::VerifyAliasing": 9.999999974752427e-07,
|
1047 |
+
"hlo-mac-count": 0.00017699999443721026,
|
1048 |
+
"hlo-verifier": 0.0001829999964684248,
|
1049 |
+
"legalize-ccops": 1.9999999949504854e-06,
|
1050 |
+
"legalize-compare": 3.000000106112566e-06,
|
1051 |
+
"lower-argminmax-custom-call": 3.000000106112566e-06,
|
1052 |
+
"map-inline": 1.2000000424450263e-05,
|
1053 |
+
"metadata-naming": 1.5999999959603883e-05,
|
1054 |
+
"mlir::detail::OpToOpPassAdaptor": 1.1000000085914508e-05,
|
1055 |
+
"mlir::hlo::MhloToPyPenguin": 0.0008900000248104334,
|
1056 |
+
"mlir::mhlo::LowerComplexExtraPass": 8.800000068731606e-05,
|
1057 |
+
"mlir::mhlo::LowerComplexPass": 0.00011999999696854502,
|
1058 |
+
"native-to-custom-softmax": 6.000000212225132e-06,
|
1059 |
+
"native-to-custom-softmax-dx": 2.4000000848900527e-05,
|
1060 |
+
"operand_upcaster": 1.5999999959603883e-05,
|
1061 |
+
"post-par-pipe-begin": 1.9999999949504854e-06,
|
1062 |
+
"post-par-pipe-end": 0.0,
|
1063 |
+
"post-partition-simplification": 0.0005510000046342611,
|
1064 |
+
"replace-minimum-constant": 9.000000318337698e-06,
|
1065 |
+
"reshape-mover": 3.000000106112566e-06,
|
1066 |
+
"simplify-concat": 4.400000034365803e-05,
|
1067 |
+
"simplify-while-loops": 1.9999999949504854e-06,
|
1068 |
+
"transform-variadic-reduce": 6.299999949987978e-05,
|
1069 |
+
"tuple-simplifier": 4.999999873689376e-06,
|
1070 |
+
"unpack-nested-aws-ntwsr": 3.999999989900971e-06,
|
1071 |
+
"unroll-while-loop": 0.0
|
1072 |
+
},
|
1073 |
+
"hilo": {
|
1074 |
+
"ArithmeticIntensity": 194.41075134277344,
|
1075 |
+
"HloMacCount": 77620576256.0,
|
1076 |
+
"Traffic": 798521408.0
|
1077 |
+
}
|
1078 |
+
}
|
1079 |
+
}
|
context_encoding_model/_tp0_bk3/graph.neff
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3602ab29177b01531c0dbdb62bc869556ef53a934ba98dd3bd846e75e171cc3a
|
3 |
+
size 2561024
|
context_encoding_model/_tp0_bk3/log-neuron-cc.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
context_encoding_model/_tp0_bk3/metaneff.pb
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3ea141404110996ab61ca5ba70e86499e6c4390e0b31c1ef947cf95911917766
|
3 |
+
size 1816103
|
context_encoding_model/_tp0_bk3/model.MODULE_b3ddbc97e5f0d1d64c82+155de413.hlo_module.pb
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9068f3ba4f55e1b8b35adde74efc6a9e617baa344783aaee62353f9181c3092c
|
3 |
+
size 1893189
|
context_encoding_model/_tp0_bk3/model.MODULE_b3ddbc97e5f0d1d64c82+155de413.neff
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3602ab29177b01531c0dbdb62bc869556ef53a934ba98dd3bd846e75e171cc3a
|
3 |
+
size 2561024
|
context_encoding_model/_tp0_bk3/neuron_config.json
ADDED
@@ -0,0 +1,220 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_attn_implementation_autoset": false,
|
3 |
+
"_name_or_path": "Qwen/Qwen3-8B",
|
4 |
+
"add_cross_attention": false,
|
5 |
+
"architectures": [
|
6 |
+
"Qwen3ForCausalLM"
|
7 |
+
],
|
8 |
+
"attention_bias": false,
|
9 |
+
"attention_dropout": 0.0,
|
10 |
+
"attribute_map": {},
|
11 |
+
"bad_words_ids": null,
|
12 |
+
"begin_suppress_tokens": null,
|
13 |
+
"bos_token_id": 151643,
|
14 |
+
"chunk_size_feed_forward": 0,
|
15 |
+
"cross_attention_hidden_size": null,
|
16 |
+
"decoder_start_token_id": null,
|
17 |
+
"diversity_penalty": 0.0,
|
18 |
+
"do_sample": false,
|
19 |
+
"early_stopping": false,
|
20 |
+
"encoder_no_repeat_ngram_size": 0,
|
21 |
+
"eos_token_id": 151645,
|
22 |
+
"exponential_decay_length_penalty": null,
|
23 |
+
"finetuning_task": null,
|
24 |
+
"forced_bos_token_id": null,
|
25 |
+
"forced_eos_token_id": null,
|
26 |
+
"fused_spec_config": null,
|
27 |
+
"head_dim": 128,
|
28 |
+
"hidden_act": "silu",
|
29 |
+
"hidden_size": 4096,
|
30 |
+
"id2label": {
|
31 |
+
"0": "LABEL_0",
|
32 |
+
"1": "LABEL_1"
|
33 |
+
},
|
34 |
+
"initializer_range": 0.02,
|
35 |
+
"intermediate_size": 12288,
|
36 |
+
"is_decoder": false,
|
37 |
+
"is_encoder_decoder": false,
|
38 |
+
"label2id": {
|
39 |
+
"LABEL_0": 0,
|
40 |
+
"LABEL_1": 1
|
41 |
+
},
|
42 |
+
"length_penalty": 1.0,
|
43 |
+
"max_length": 20,
|
44 |
+
"max_position_embeddings": 40960,
|
45 |
+
"max_window_layers": 36,
|
46 |
+
"metadata": null,
|
47 |
+
"min_length": 0,
|
48 |
+
"model_type": "qwen3",
|
49 |
+
"neuron_config": {
|
50 |
+
"activation_quantization_type": null,
|
51 |
+
"allow_input_truncation": false,
|
52 |
+
"apply_seq_ids_mask": false,
|
53 |
+
"async_mode": false,
|
54 |
+
"attention_dp_degree": 1,
|
55 |
+
"attention_dtype": null,
|
56 |
+
"attn_block_cte_nki_kernel_enabled": false,
|
57 |
+
"attn_block_tkg_nki_kernel_cache_update": false,
|
58 |
+
"attn_block_tkg_nki_kernel_enabled": false,
|
59 |
+
"attn_cls": {
|
60 |
+
"__module__": "neuronx_distributed_inference.models.qwen3.modeling_qwen3",
|
61 |
+
"__name__": "NeuronQwen3Attention"
|
62 |
+
},
|
63 |
+
"attn_kernel_enabled": null,
|
64 |
+
"attn_tkg_builtin_kernel_enabled": false,
|
65 |
+
"attn_tkg_nki_kernel_enabled": false,
|
66 |
+
"batch_size": 1,
|
67 |
+
"bucket_n_active_tokens": true,
|
68 |
+
"buckets": [
|
69 |
+
1024
|
70 |
+
],
|
71 |
+
"cast_type": "config",
|
72 |
+
"cc_pipeline_tiling_factor": 2,
|
73 |
+
"chunked_prefill_config": null,
|
74 |
+
"context_encoding_buckets": [
|
75 |
+
1024
|
76 |
+
],
|
77 |
+
"cp_degree": 1,
|
78 |
+
"ctx_batch_size": 1,
|
79 |
+
"disable_kv_cache_tiling": false,
|
80 |
+
"draft_model_modules_to_not_convert": null,
|
81 |
+
"enable_bucketing": true,
|
82 |
+
"enable_eagle_draft_input_norm": false,
|
83 |
+
"enable_eagle_speculation": false,
|
84 |
+
"enable_fused_speculation": false,
|
85 |
+
"enable_long_context_mode": false,
|
86 |
+
"enable_output_completion_notifications": false,
|
87 |
+
"enable_spill_reload_dge": false,
|
88 |
+
"enable_token_tree": false,
|
89 |
+
"ep_degree": 1,
|
90 |
+
"expert_mlp_nki_kernel_enabled": null,
|
91 |
+
"flash_decoding_enabled": false,
|
92 |
+
"fused_qkv": false,
|
93 |
+
"fused_rmsnorm_skip_gamma": false,
|
94 |
+
"is_block_kv_layout": null,
|
95 |
+
"is_chunked_prefill": false,
|
96 |
+
"is_continuous_batching": true,
|
97 |
+
"is_eagle_draft": false,
|
98 |
+
"is_medusa": false,
|
99 |
+
"is_prefill_stage": true,
|
100 |
+
"is_prefix_caching": false,
|
101 |
+
"k_cache_transposed": false,
|
102 |
+
"kv_cache_batch_size": 1,
|
103 |
+
"kv_cache_padding_size": 0,
|
104 |
+
"kv_cache_quant": false,
|
105 |
+
"kv_cache_tiling": false,
|
106 |
+
"layer_boundary_markers": false,
|
107 |
+
"lm_head_pad": false,
|
108 |
+
"lm_head_pad_alignment_size": 1,
|
109 |
+
"local_ranks_size": 2,
|
110 |
+
"logical_nc_config": 1,
|
111 |
+
"lora_config": null,
|
112 |
+
"max_batch_size": 1,
|
113 |
+
"max_context_length": 1024,
|
114 |
+
"max_length": 1024,
|
115 |
+
"max_new_tokens": null,
|
116 |
+
"medusa_speculation_length": 0,
|
117 |
+
"medusa_tree": null,
|
118 |
+
"mlp_kernel_enabled": false,
|
119 |
+
"mlp_kernel_fuse_residual_add": false,
|
120 |
+
"modules_to_not_convert": null,
|
121 |
+
"moe_fused_nki_kernel_enabled": null,
|
122 |
+
"n_active_tokens": 1024,
|
123 |
+
"n_positions": 1024,
|
124 |
+
"num_medusa_heads": 0,
|
125 |
+
"on_cpu": false,
|
126 |
+
"on_device_sampling_config": {
|
127 |
+
"deterministic": false,
|
128 |
+
"do_sample": false,
|
129 |
+
"dynamic": true,
|
130 |
+
"global_topk": 256,
|
131 |
+
"on_device_sampling_config": true,
|
132 |
+
"temperature": 1.0,
|
133 |
+
"top_k": 1,
|
134 |
+
"top_k_kernel_enabled": false,
|
135 |
+
"top_p": 1.0
|
136 |
+
},
|
137 |
+
"output_logits": false,
|
138 |
+
"overrides_torch_dtype": true,
|
139 |
+
"pa_block_size": 1024,
|
140 |
+
"pa_num_blocks": 1,
|
141 |
+
"padding_side": "right",
|
142 |
+
"pp_degree": 1,
|
143 |
+
"prefix_buckets": null,
|
144 |
+
"qk_layernorm": false,
|
145 |
+
"qkv_kernel_enabled": false,
|
146 |
+
"qkv_kernel_fuse_residual_add": false,
|
147 |
+
"qkv_kernel_nbsd_layout": false,
|
148 |
+
"quantization_dtype": "int8",
|
149 |
+
"quantization_type": "per_tensor_symmetric",
|
150 |
+
"quantize_clamp_bound": Infinity,
|
151 |
+
"quantized": false,
|
152 |
+
"quantized_checkpoints_path": null,
|
153 |
+
"quantized_mlp_kernel_enabled": false,
|
154 |
+
"rmsnorm_quantize_kernel_enabled": false,
|
155 |
+
"router_topk_nki_kernel_enabled": null,
|
156 |
+
"rpl_reduce_dtype": null,
|
157 |
+
"save_sharded_checkpoint": true,
|
158 |
+
"scratchpad_page_size": null,
|
159 |
+
"seq_len": 1024,
|
160 |
+
"seq_len_threshold_for_cc_tiling": 16384,
|
161 |
+
"sequence_parallel_enabled": false,
|
162 |
+
"shared_mlp_nki_kernel_enabled": null,
|
163 |
+
"skip_sharding": false,
|
164 |
+
"skip_warmup": false,
|
165 |
+
"spec_batch_size": 1,
|
166 |
+
"speculation_length": 0,
|
167 |
+
"start_rank_id": 0,
|
168 |
+
"target": null,
|
169 |
+
"tile_cc": false,
|
170 |
+
"tkg_batch_size": 1,
|
171 |
+
"token_generation_buckets": null,
|
172 |
+
"token_tree_config": null,
|
173 |
+
"torch_dtype": "bfloat16",
|
174 |
+
"tp_degree": 2,
|
175 |
+
"vocab_parallel": false,
|
176 |
+
"weight_gather_seq_len_threshold": 32768,
|
177 |
+
"weights_to_skip_layout_optimization": [],
|
178 |
+
"world_size": 2
|
179 |
+
},
|
180 |
+
"no_repeat_ngram_size": 0,
|
181 |
+
"num_attention_heads": 32,
|
182 |
+
"num_beam_groups": 1,
|
183 |
+
"num_beams": 1,
|
184 |
+
"num_cores_per_group": 1,
|
185 |
+
"num_hidden_layers": 36,
|
186 |
+
"num_key_value_heads": 8,
|
187 |
+
"num_return_sequences": 1,
|
188 |
+
"output_attentions": false,
|
189 |
+
"output_hidden_states": false,
|
190 |
+
"output_scores": false,
|
191 |
+
"pad_token_id": 0,
|
192 |
+
"prefix": null,
|
193 |
+
"problem_type": null,
|
194 |
+
"pruned_heads": {},
|
195 |
+
"remove_invalid_values": false,
|
196 |
+
"repetition_penalty": 1.0,
|
197 |
+
"return_dict": true,
|
198 |
+
"return_dict_in_generate": false,
|
199 |
+
"rms_norm_eps": 1e-06,
|
200 |
+
"rope_scaling": null,
|
201 |
+
"rope_theta": 1000000,
|
202 |
+
"sep_token_id": null,
|
203 |
+
"sliding_window": null,
|
204 |
+
"suppress_tokens": null,
|
205 |
+
"task_specific_params": null,
|
206 |
+
"temperature": 1.0,
|
207 |
+
"tf_legacy_loss": false,
|
208 |
+
"tie_encoder_decoder": false,
|
209 |
+
"tie_word_embeddings": false,
|
210 |
+
"tokenizer_class": null,
|
211 |
+
"top_k": 50,
|
212 |
+
"top_p": 1.0,
|
213 |
+
"torchscript": false,
|
214 |
+
"transformers_version": "4.51.0",
|
215 |
+
"typical_p": 1.0,
|
216 |
+
"use_bfloat16": false,
|
217 |
+
"use_cache": true,
|
218 |
+
"use_sliding_window": false,
|
219 |
+
"vocab_size": 151936
|
220 |
+
}
|
layout_opt/command.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
neuronx-cc compile graph.hlo --framework XLA --target trn1 --output graph.neff --model-type=transformer -O1 --lnc=1 '--internal-hlo2tensorizer-options=--experimental-unsafe-fp8e4m3fn-as-fp8e4m3 --verify-hlo=false' --logfile=log-neuron-cc.txt --verbose=35
|
layout_opt/graph.neff
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:eafae43287bda2aa58740df223d211d8e3638af29e402c9cc6cbcadcf302ddde
|
3 |
+
size 5786624
|
layout_opt/log-neuron-cc.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
layout_opt/metaneff
ADDED
@@ -0,0 +1,1198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
(
|
3 |
+
input0�� �2embed_tokens.weight8
|
4 |
+
;
|
5 |
+
input1� �2'layers.0.self_attn.o_proj.o_proj.weight8
|
6 |
+
=
|
7 |
+
input2�� 2)layers.0.self_attn.qkv_proj.v_proj.weight8
|
8 |
+
1
|
9 |
+
input3� 2layers.0.input_layernorm.weight8
|
10 |
+
7
|
11 |
+
input4�2%layers.0.self_attn.k_layernorm.weight8
|
12 |
+
=
|
13 |
+
input5�� 2)layers.0.self_attn.qkv_proj.k_proj.weight8
|
14 |
+
7
|
15 |
+
input6�2%layers.0.self_attn.q_layernorm.weight8
|
16 |
+
=
|
17 |
+
input7�� 2)layers.0.self_attn.qkv_proj.q_proj.weight8
|
18 |
+
1
|
19 |
+
input8� �02layers.0.mlp.down_proj.weight8
|
20 |
+
/
|
21 |
+
input9�0� 2layers.0.mlp.up_proj.weight8
|
22 |
+
;
|
23 |
+
input10� 2(layers.0.post_attention_layernorm.weight8
|
24 |
+
2
|
25 |
+
input11�0� 2layers.0.mlp.gate_proj.weight8
|
26 |
+
<
|
27 |
+
input12� �2'layers.1.self_attn.o_proj.o_proj.weight8
|
28 |
+
>
|
29 |
+
input13�� 2)layers.1.self_attn.qkv_proj.v_proj.weight8
|
30 |
+
2
|
31 |
+
input14� 2layers.1.input_layernorm.weight8
|
32 |
+
8
|
33 |
+
input15�2%layers.1.self_attn.k_layernorm.weight8
|
34 |
+
>
|
35 |
+
input16�� 2)layers.1.self_attn.qkv_proj.k_proj.weight8
|
36 |
+
8
|
37 |
+
input17�2%layers.1.self_attn.q_layernorm.weight8
|
38 |
+
>
|
39 |
+
input18�� 2)layers.1.self_attn.qkv_proj.q_proj.weight8
|
40 |
+
2
|
41 |
+
input19� �02layers.1.mlp.down_proj.weight8
|
42 |
+
0
|
43 |
+
input20�0� 2layers.1.mlp.up_proj.weight8
|
44 |
+
;
|
45 |
+
input21� 2(layers.1.post_attention_layernorm.weight8
|
46 |
+
2
|
47 |
+
input22�0� 2layers.1.mlp.gate_proj.weight8
|
48 |
+
<
|
49 |
+
input23� �2'layers.2.self_attn.o_proj.o_proj.weight8
|
50 |
+
>
|
51 |
+
input24�� 2)layers.2.self_attn.qkv_proj.v_proj.weight8
|
52 |
+
2
|
53 |
+
input25� 2layers.2.input_layernorm.weight8
|
54 |
+
8
|
55 |
+
input26�2%layers.2.self_attn.k_layernorm.weight8
|
56 |
+
>
|
57 |
+
input27�� 2)layers.2.self_attn.qkv_proj.k_proj.weight8
|
58 |
+
8
|
59 |
+
input28�2%layers.2.self_attn.q_layernorm.weight8
|
60 |
+
>
|
61 |
+
input29�� 2)layers.2.self_attn.qkv_proj.q_proj.weight8
|
62 |
+
2
|
63 |
+
input30� �02layers.2.mlp.down_proj.weight8
|
64 |
+
0
|
65 |
+
input31�0� 2layers.2.mlp.up_proj.weight8
|
66 |
+
;
|
67 |
+
input32� 2(layers.2.post_attention_layernorm.weight8
|
68 |
+
2
|
69 |
+
input33�0� 2layers.2.mlp.gate_proj.weight8
|
70 |
+
<
|
71 |
+
input34� �2'layers.3.self_attn.o_proj.o_proj.weight8
|
72 |
+
>
|
73 |
+
input35�� 2)layers.3.self_attn.qkv_proj.v_proj.weight8
|
74 |
+
2
|
75 |
+
input36� 2layers.3.input_layernorm.weight8
|
76 |
+
8
|
77 |
+
input37�2%layers.3.self_attn.k_layernorm.weight8
|
78 |
+
>
|
79 |
+
input38�� 2)layers.3.self_attn.qkv_proj.k_proj.weight8
|
80 |
+
8
|
81 |
+
input39�2%layers.3.self_attn.q_layernorm.weight8
|
82 |
+
>
|
83 |
+
input40�� 2)layers.3.self_attn.qkv_proj.q_proj.weight8
|
84 |
+
2
|
85 |
+
input41� �02layers.3.mlp.down_proj.weight8
|
86 |
+
0
|
87 |
+
input42�0� 2layers.3.mlp.up_proj.weight8
|
88 |
+
;
|
89 |
+
input43� 2(layers.3.post_attention_layernorm.weight8
|
90 |
+
2
|
91 |
+
input44�0� 2layers.3.mlp.gate_proj.weight8
|
92 |
+
<
|
93 |
+
input45� �2'layers.4.self_attn.o_proj.o_proj.weight8
|
94 |
+
>
|
95 |
+
input46�� 2)layers.4.self_attn.qkv_proj.v_proj.weight8
|
96 |
+
2
|
97 |
+
input47� 2layers.4.input_layernorm.weight8
|
98 |
+
8
|
99 |
+
input48�2%layers.4.self_attn.k_layernorm.weight8
|
100 |
+
>
|
101 |
+
input49�� 2)layers.4.self_attn.qkv_proj.k_proj.weight8
|
102 |
+
8
|
103 |
+
input50�2%layers.4.self_attn.q_layernorm.weight8
|
104 |
+
>
|
105 |
+
input51�� 2)layers.4.self_attn.qkv_proj.q_proj.weight8
|
106 |
+
2
|
107 |
+
input52� �02layers.4.mlp.down_proj.weight8
|
108 |
+
0
|
109 |
+
input53�0� 2layers.4.mlp.up_proj.weight8
|
110 |
+
;
|
111 |
+
input54� 2(layers.4.post_attention_layernorm.weight8
|
112 |
+
2
|
113 |
+
input55�0� 2layers.4.mlp.gate_proj.weight8
|
114 |
+
<
|
115 |
+
input56� �2'layers.5.self_attn.o_proj.o_proj.weight8
|
116 |
+
>
|
117 |
+
input57�� 2)layers.5.self_attn.qkv_proj.v_proj.weight8
|
118 |
+
2
|
119 |
+
input58� 2layers.5.input_layernorm.weight8
|
120 |
+
8
|
121 |
+
input59�2%layers.5.self_attn.k_layernorm.weight8
|
122 |
+
>
|
123 |
+
input60�� 2)layers.5.self_attn.qkv_proj.k_proj.weight8
|
124 |
+
8
|
125 |
+
input61�2%layers.5.self_attn.q_layernorm.weight8
|
126 |
+
>
|
127 |
+
input62�� 2)layers.5.self_attn.qkv_proj.q_proj.weight8
|
128 |
+
2
|
129 |
+
input63� �02layers.5.mlp.down_proj.weight8
|
130 |
+
0
|
131 |
+
input64�0� 2layers.5.mlp.up_proj.weight8
|
132 |
+
;
|
133 |
+
input65� 2(layers.5.post_attention_layernorm.weight8
|
134 |
+
2
|
135 |
+
input66�0� 2layers.5.mlp.gate_proj.weight8
|
136 |
+
<
|
137 |
+
input67� �2'layers.6.self_attn.o_proj.o_proj.weight8
|
138 |
+
>
|
139 |
+
input68�� 2)layers.6.self_attn.qkv_proj.v_proj.weight8
|
140 |
+
2
|
141 |
+
input69� 2layers.6.input_layernorm.weight8
|
142 |
+
8
|
143 |
+
input70�2%layers.6.self_attn.k_layernorm.weight8
|
144 |
+
>
|
145 |
+
input71�� 2)layers.6.self_attn.qkv_proj.k_proj.weight8
|
146 |
+
8
|
147 |
+
input72�2%layers.6.self_attn.q_layernorm.weight8
|
148 |
+
>
|
149 |
+
input73�� 2)layers.6.self_attn.qkv_proj.q_proj.weight8
|
150 |
+
2
|
151 |
+
input74� �02layers.6.mlp.down_proj.weight8
|
152 |
+
0
|
153 |
+
input75�0� 2layers.6.mlp.up_proj.weight8
|
154 |
+
;
|
155 |
+
input76� 2(layers.6.post_attention_layernorm.weight8
|
156 |
+
2
|
157 |
+
input77�0� 2layers.6.mlp.gate_proj.weight8
|
158 |
+
<
|
159 |
+
input78� �2'layers.7.self_attn.o_proj.o_proj.weight8
|
160 |
+
>
|
161 |
+
input79�� 2)layers.7.self_attn.qkv_proj.v_proj.weight8
|
162 |
+
2
|
163 |
+
input80� 2layers.7.input_layernorm.weight8
|
164 |
+
8
|
165 |
+
input81�2%layers.7.self_attn.k_layernorm.weight8
|
166 |
+
>
|
167 |
+
input82�� 2)layers.7.self_attn.qkv_proj.k_proj.weight8
|
168 |
+
8
|
169 |
+
input83�2%layers.7.self_attn.q_layernorm.weight8
|
170 |
+
>
|
171 |
+
input84�� 2)layers.7.self_attn.qkv_proj.q_proj.weight8
|
172 |
+
2
|
173 |
+
input85� �02layers.7.mlp.down_proj.weight8
|
174 |
+
0
|
175 |
+
input86�0� 2layers.7.mlp.up_proj.weight8
|
176 |
+
;
|
177 |
+
input87� 2(layers.7.post_attention_layernorm.weight8
|
178 |
+
2
|
179 |
+
input88�0� 2layers.7.mlp.gate_proj.weight8
|
180 |
+
<
|
181 |
+
input89� �2'layers.8.self_attn.o_proj.o_proj.weight8
|
182 |
+
>
|
183 |
+
input90�� 2)layers.8.self_attn.qkv_proj.v_proj.weight8
|
184 |
+
2
|
185 |
+
input91� 2layers.8.input_layernorm.weight8
|
186 |
+
8
|
187 |
+
input92�2%layers.8.self_attn.k_layernorm.weight8
|
188 |
+
>
|
189 |
+
input93�� 2)layers.8.self_attn.qkv_proj.k_proj.weight8
|
190 |
+
8
|
191 |
+
input94�2%layers.8.self_attn.q_layernorm.weight8
|
192 |
+
>
|
193 |
+
input95�� 2)layers.8.self_attn.qkv_proj.q_proj.weight8
|
194 |
+
2
|
195 |
+
input96� �02layers.8.mlp.down_proj.weight8
|
196 |
+
0
|
197 |
+
input97�0� 2layers.8.mlp.up_proj.weight8
|
198 |
+
;
|
199 |
+
input98� 2(layers.8.post_attention_layernorm.weight8
|
200 |
+
2
|
201 |
+
input99�0� 2layers.8.mlp.gate_proj.weight8
|
202 |
+
=
|
203 |
+
input100� �2'layers.9.self_attn.o_proj.o_proj.weight8
|
204 |
+
?
|
205 |
+
input101�� 2)layers.9.self_attn.qkv_proj.v_proj.weight8
|
206 |
+
3
|
207 |
+
input102� 2layers.9.input_layernorm.weight8
|
208 |
+
9
|
209 |
+
input103�2%layers.9.self_attn.k_layernorm.weight8
|
210 |
+
?
|
211 |
+
input104�� 2)layers.9.self_attn.qkv_proj.k_proj.weight8
|
212 |
+
9
|
213 |
+
input105�2%layers.9.self_attn.q_layernorm.weight8
|
214 |
+
?
|
215 |
+
input106�� 2)layers.9.self_attn.qkv_proj.q_proj.weight8
|
216 |
+
3
|
217 |
+
input107� �02layers.9.mlp.down_proj.weight8
|
218 |
+
1
|
219 |
+
input108�0� 2layers.9.mlp.up_proj.weight8
|
220 |
+
<
|
221 |
+
input109� 2(layers.9.post_attention_layernorm.weight8
|
222 |
+
3
|
223 |
+
input110�0� 2layers.9.mlp.gate_proj.weight8
|
224 |
+
>
|
225 |
+
input111� �2(layers.10.self_attn.o_proj.o_proj.weight8
|
226 |
+
@
|
227 |
+
input112�� 2*layers.10.self_attn.qkv_proj.v_proj.weight8
|
228 |
+
4
|
229 |
+
input113� 2 layers.10.input_layernorm.weight8
|
230 |
+
:
|
231 |
+
input114�2&layers.10.self_attn.k_layernorm.weight8
|
232 |
+
@
|
233 |
+
input115�� 2*layers.10.self_attn.qkv_proj.k_proj.weight8
|
234 |
+
:
|
235 |
+
input116�2&layers.10.self_attn.q_layernorm.weight8
|
236 |
+
@
|
237 |
+
input117�� 2*layers.10.self_attn.qkv_proj.q_proj.weight8
|
238 |
+
4
|
239 |
+
input118� �02layers.10.mlp.down_proj.weight8
|
240 |
+
2
|
241 |
+
input119�0� 2layers.10.mlp.up_proj.weight8
|
242 |
+
=
|
243 |
+
input120� 2)layers.10.post_attention_layernorm.weight8
|
244 |
+
4
|
245 |
+
input121�0� 2layers.10.mlp.gate_proj.weight8
|
246 |
+
>
|
247 |
+
input122� �2(layers.11.self_attn.o_proj.o_proj.weight8
|
248 |
+
@
|
249 |
+
input123�� 2*layers.11.self_attn.qkv_proj.v_proj.weight8
|
250 |
+
4
|
251 |
+
input124� 2 layers.11.input_layernorm.weight8
|
252 |
+
:
|
253 |
+
input125�2&layers.11.self_attn.k_layernorm.weight8
|
254 |
+
@
|
255 |
+
input126�� 2*layers.11.self_attn.qkv_proj.k_proj.weight8
|
256 |
+
:
|
257 |
+
input127�2&layers.11.self_attn.q_layernorm.weight8
|
258 |
+
@
|
259 |
+
input128�� 2*layers.11.self_attn.qkv_proj.q_proj.weight8
|
260 |
+
4
|
261 |
+
input129� �02layers.11.mlp.down_proj.weight8
|
262 |
+
2
|
263 |
+
input130�0� 2layers.11.mlp.up_proj.weight8
|
264 |
+
=
|
265 |
+
input131� 2)layers.11.post_attention_layernorm.weight8
|
266 |
+
4
|
267 |
+
input132�0� 2layers.11.mlp.gate_proj.weight8
|
268 |
+
>
|
269 |
+
input133� �2(layers.12.self_attn.o_proj.o_proj.weight8
|
270 |
+
@
|
271 |
+
input134�� 2*layers.12.self_attn.qkv_proj.v_proj.weight8
|
272 |
+
4
|
273 |
+
input135� 2 layers.12.input_layernorm.weight8
|
274 |
+
:
|
275 |
+
input136�2&layers.12.self_attn.k_layernorm.weight8
|
276 |
+
@
|
277 |
+
input137�� 2*layers.12.self_attn.qkv_proj.k_proj.weight8
|
278 |
+
:
|
279 |
+
input138�2&layers.12.self_attn.q_layernorm.weight8
|
280 |
+
@
|
281 |
+
input139�� 2*layers.12.self_attn.qkv_proj.q_proj.weight8
|
282 |
+
4
|
283 |
+
input140� �02layers.12.mlp.down_proj.weight8
|
284 |
+
2
|
285 |
+
input141�0� 2layers.12.mlp.up_proj.weight8
|
286 |
+
=
|
287 |
+
input142� 2)layers.12.post_attention_layernorm.weight8
|
288 |
+
4
|
289 |
+
input143�0� 2layers.12.mlp.gate_proj.weight8
|
290 |
+
>
|
291 |
+
input144� �2(layers.13.self_attn.o_proj.o_proj.weight8
|
292 |
+
@
|
293 |
+
input145�� 2*layers.13.self_attn.qkv_proj.v_proj.weight8
|
294 |
+
4
|
295 |
+
input146� 2 layers.13.input_layernorm.weight8
|
296 |
+
:
|
297 |
+
input147�2&layers.13.self_attn.k_layernorm.weight8
|
298 |
+
@
|
299 |
+
input148�� 2*layers.13.self_attn.qkv_proj.k_proj.weight8
|
300 |
+
:
|
301 |
+
input149�2&layers.13.self_attn.q_layernorm.weight8
|
302 |
+
@
|
303 |
+
input150�� 2*layers.13.self_attn.qkv_proj.q_proj.weight8
|
304 |
+
4
|
305 |
+
input151� �02layers.13.mlp.down_proj.weight8
|
306 |
+
2
|
307 |
+
input152�0� 2layers.13.mlp.up_proj.weight8
|
308 |
+
=
|
309 |
+
input153� 2)layers.13.post_attention_layernorm.weight8
|
310 |
+
4
|
311 |
+
input154�0� 2layers.13.mlp.gate_proj.weight8
|
312 |
+
>
|
313 |
+
input155� �2(layers.14.self_attn.o_proj.o_proj.weight8
|
314 |
+
@
|
315 |
+
input156�� 2*layers.14.self_attn.qkv_proj.v_proj.weight8
|
316 |
+
4
|
317 |
+
input157� 2 layers.14.input_layernorm.weight8
|
318 |
+
:
|
319 |
+
input158�2&layers.14.self_attn.k_layernorm.weight8
|
320 |
+
@
|
321 |
+
input159�� 2*layers.14.self_attn.qkv_proj.k_proj.weight8
|
322 |
+
:
|
323 |
+
input160�2&layers.14.self_attn.q_layernorm.weight8
|
324 |
+
@
|
325 |
+
input161�� 2*layers.14.self_attn.qkv_proj.q_proj.weight8
|
326 |
+
4
|
327 |
+
input162� �02layers.14.mlp.down_proj.weight8
|
328 |
+
2
|
329 |
+
input163�0� 2layers.14.mlp.up_proj.weight8
|
330 |
+
=
|
331 |
+
input164� 2)layers.14.post_attention_layernorm.weight8
|
332 |
+
4
|
333 |
+
input165�0� 2layers.14.mlp.gate_proj.weight8
|
334 |
+
>
|
335 |
+
input166� �2(layers.15.self_attn.o_proj.o_proj.weight8
|
336 |
+
@
|
337 |
+
input167�� 2*layers.15.self_attn.qkv_proj.v_proj.weight8
|
338 |
+
4
|
339 |
+
input168� 2 layers.15.input_layernorm.weight8
|
340 |
+
:
|
341 |
+
input169�2&layers.15.self_attn.k_layernorm.weight8
|
342 |
+
@
|
343 |
+
input170�� 2*layers.15.self_attn.qkv_proj.k_proj.weight8
|
344 |
+
:
|
345 |
+
input171�2&layers.15.self_attn.q_layernorm.weight8
|
346 |
+
@
|
347 |
+
input172�� 2*layers.15.self_attn.qkv_proj.q_proj.weight8
|
348 |
+
4
|
349 |
+
input173� �02layers.15.mlp.down_proj.weight8
|
350 |
+
2
|
351 |
+
input174�0� 2layers.15.mlp.up_proj.weight8
|
352 |
+
=
|
353 |
+
input175� 2)layers.15.post_attention_layernorm.weight8
|
354 |
+
4
|
355 |
+
input176�0� 2layers.15.mlp.gate_proj.weight8
|
356 |
+
>
|
357 |
+
input177� �2(layers.16.self_attn.o_proj.o_proj.weight8
|
358 |
+
@
|
359 |
+
input178�� 2*layers.16.self_attn.qkv_proj.v_proj.weight8
|
360 |
+
4
|
361 |
+
input179� 2 layers.16.input_layernorm.weight8
|
362 |
+
:
|
363 |
+
input180�2&layers.16.self_attn.k_layernorm.weight8
|
364 |
+
@
|
365 |
+
input181�� 2*layers.16.self_attn.qkv_proj.k_proj.weight8
|
366 |
+
:
|
367 |
+
input182�2&layers.16.self_attn.q_layernorm.weight8
|
368 |
+
@
|
369 |
+
input183�� 2*layers.16.self_attn.qkv_proj.q_proj.weight8
|
370 |
+
4
|
371 |
+
input184� �02layers.16.mlp.down_proj.weight8
|
372 |
+
2
|
373 |
+
input185�0� 2layers.16.mlp.up_proj.weight8
|
374 |
+
=
|
375 |
+
input186� 2)layers.16.post_attention_layernorm.weight8
|
376 |
+
4
|
377 |
+
input187�0� 2layers.16.mlp.gate_proj.weight8
|
378 |
+
>
|
379 |
+
input188� �2(layers.17.self_attn.o_proj.o_proj.weight8
|
380 |
+
@
|
381 |
+
input189�� 2*layers.17.self_attn.qkv_proj.v_proj.weight8
|
382 |
+
4
|
383 |
+
input190� 2 layers.17.input_layernorm.weight8
|
384 |
+
:
|
385 |
+
input191�2&layers.17.self_attn.k_layernorm.weight8
|
386 |
+
@
|
387 |
+
input192�� 2*layers.17.self_attn.qkv_proj.k_proj.weight8
|
388 |
+
:
|
389 |
+
input193�2&layers.17.self_attn.q_layernorm.weight8
|
390 |
+
@
|
391 |
+
input194�� 2*layers.17.self_attn.qkv_proj.q_proj.weight8
|
392 |
+
4
|
393 |
+
input195� �02layers.17.mlp.down_proj.weight8
|
394 |
+
2
|
395 |
+
input196�0� 2layers.17.mlp.up_proj.weight8
|
396 |
+
=
|
397 |
+
input197� 2)layers.17.post_attention_layernorm.weight8
|
398 |
+
4
|
399 |
+
input198�0� 2layers.17.mlp.gate_proj.weight8
|
400 |
+
>
|
401 |
+
input199� �2(layers.18.self_attn.o_proj.o_proj.weight8
|
402 |
+
@
|
403 |
+
input200�� 2*layers.18.self_attn.qkv_proj.v_proj.weight8
|
404 |
+
4
|
405 |
+
input201� 2 layers.18.input_layernorm.weight8
|
406 |
+
:
|
407 |
+
input202�2&layers.18.self_attn.k_layernorm.weight8
|
408 |
+
@
|
409 |
+
input203�� 2*layers.18.self_attn.qkv_proj.k_proj.weight8
|
410 |
+
:
|
411 |
+
input204�2&layers.18.self_attn.q_layernorm.weight8
|
412 |
+
@
|
413 |
+
input205�� 2*layers.18.self_attn.qkv_proj.q_proj.weight8
|
414 |
+
4
|
415 |
+
input206� �02layers.18.mlp.down_proj.weight8
|
416 |
+
2
|
417 |
+
input207�0� 2layers.18.mlp.up_proj.weight8
|
418 |
+
=
|
419 |
+
input208� 2)layers.18.post_attention_layernorm.weight8
|
420 |
+
4
|
421 |
+
input209�0� 2layers.18.mlp.gate_proj.weight8
|
422 |
+
>
|
423 |
+
input210� �2(layers.19.self_attn.o_proj.o_proj.weight8
|
424 |
+
@
|
425 |
+
input211�� 2*layers.19.self_attn.qkv_proj.v_proj.weight8
|
426 |
+
4
|
427 |
+
input212� 2 layers.19.input_layernorm.weight8
|
428 |
+
:
|
429 |
+
input213�2&layers.19.self_attn.k_layernorm.weight8
|
430 |
+
@
|
431 |
+
input214�� 2*layers.19.self_attn.qkv_proj.k_proj.weight8
|
432 |
+
:
|
433 |
+
input215�2&layers.19.self_attn.q_layernorm.weight8
|
434 |
+
@
|
435 |
+
input216�� 2*layers.19.self_attn.qkv_proj.q_proj.weight8
|
436 |
+
4
|
437 |
+
input217� �02layers.19.mlp.down_proj.weight8
|
438 |
+
2
|
439 |
+
input218�0� 2layers.19.mlp.up_proj.weight8
|
440 |
+
=
|
441 |
+
input219� 2)layers.19.post_attention_layernorm.weight8
|
442 |
+
4
|
443 |
+
input220�0� 2layers.19.mlp.gate_proj.weight8
|
444 |
+
>
|
445 |
+
input221� �2(layers.20.self_attn.o_proj.o_proj.weight8
|
446 |
+
@
|
447 |
+
input222�� 2*layers.20.self_attn.qkv_proj.v_proj.weight8
|
448 |
+
4
|
449 |
+
input223� 2 layers.20.input_layernorm.weight8
|
450 |
+
:
|
451 |
+
input224�2&layers.20.self_attn.k_layernorm.weight8
|
452 |
+
@
|
453 |
+
input225�� 2*layers.20.self_attn.qkv_proj.k_proj.weight8
|
454 |
+
:
|
455 |
+
input226�2&layers.20.self_attn.q_layernorm.weight8
|
456 |
+
@
|
457 |
+
input227�� 2*layers.20.self_attn.qkv_proj.q_proj.weight8
|
458 |
+
4
|
459 |
+
input228� �02layers.20.mlp.down_proj.weight8
|
460 |
+
2
|
461 |
+
input229�0� 2layers.20.mlp.up_proj.weight8
|
462 |
+
=
|
463 |
+
input230� 2)layers.20.post_attention_layernorm.weight8
|
464 |
+
4
|
465 |
+
input231�0� 2layers.20.mlp.gate_proj.weight8
|
466 |
+
>
|
467 |
+
input232� �2(layers.21.self_attn.o_proj.o_proj.weight8
|
468 |
+
@
|
469 |
+
input233�� 2*layers.21.self_attn.qkv_proj.v_proj.weight8
|
470 |
+
4
|
471 |
+
input234� 2 layers.21.input_layernorm.weight8
|
472 |
+
:
|
473 |
+
input235�2&layers.21.self_attn.k_layernorm.weight8
|
474 |
+
@
|
475 |
+
input236�� 2*layers.21.self_attn.qkv_proj.k_proj.weight8
|
476 |
+
:
|
477 |
+
input237�2&layers.21.self_attn.q_layernorm.weight8
|
478 |
+
@
|
479 |
+
input238�� 2*layers.21.self_attn.qkv_proj.q_proj.weight8
|
480 |
+
4
|
481 |
+
input239� �02layers.21.mlp.down_proj.weight8
|
482 |
+
2
|
483 |
+
input240�0� 2layers.21.mlp.up_proj.weight8
|
484 |
+
=
|
485 |
+
input241� 2)layers.21.post_attention_layernorm.weight8
|
486 |
+
4
|
487 |
+
input242�0� 2layers.21.mlp.gate_proj.weight8
|
488 |
+
>
|
489 |
+
input243� �2(layers.22.self_attn.o_proj.o_proj.weight8
|
490 |
+
@
|
491 |
+
input244�� 2*layers.22.self_attn.qkv_proj.v_proj.weight8
|
492 |
+
4
|
493 |
+
input245� 2 layers.22.input_layernorm.weight8
|
494 |
+
:
|
495 |
+
input246�2&layers.22.self_attn.k_layernorm.weight8
|
496 |
+
@
|
497 |
+
input247�� 2*layers.22.self_attn.qkv_proj.k_proj.weight8
|
498 |
+
:
|
499 |
+
input248�2&layers.22.self_attn.q_layernorm.weight8
|
500 |
+
@
|
501 |
+
input249�� 2*layers.22.self_attn.qkv_proj.q_proj.weight8
|
502 |
+
4
|
503 |
+
input250� �02layers.22.mlp.down_proj.weight8
|
504 |
+
2
|
505 |
+
input251�0� 2layers.22.mlp.up_proj.weight8
|
506 |
+
=
|
507 |
+
input252� 2)layers.22.post_attention_layernorm.weight8
|
508 |
+
4
|
509 |
+
input253�0� 2layers.22.mlp.gate_proj.weight8
|
510 |
+
>
|
511 |
+
input254� �2(layers.23.self_attn.o_proj.o_proj.weight8
|
512 |
+
@
|
513 |
+
input255�� 2*layers.23.self_attn.qkv_proj.v_proj.weight8
|
514 |
+
4
|
515 |
+
input256� 2 layers.23.input_layernorm.weight8
|
516 |
+
:
|
517 |
+
input257�2&layers.23.self_attn.k_layernorm.weight8
|
518 |
+
@
|
519 |
+
input258�� 2*layers.23.self_attn.qkv_proj.k_proj.weight8
|
520 |
+
:
|
521 |
+
input259�2&layers.23.self_attn.q_layernorm.weight8
|
522 |
+
@
|
523 |
+
input260�� 2*layers.23.self_attn.qkv_proj.q_proj.weight8
|
524 |
+
4
|
525 |
+
input261� �02layers.23.mlp.down_proj.weight8
|
526 |
+
2
|
527 |
+
input262�0� 2layers.23.mlp.up_proj.weight8
|
528 |
+
=
|
529 |
+
input263� 2)layers.23.post_attention_layernorm.weight8
|
530 |
+
4
|
531 |
+
input264�0� 2layers.23.mlp.gate_proj.weight8
|
532 |
+
>
|
533 |
+
input265� �2(layers.24.self_attn.o_proj.o_proj.weight8
|
534 |
+
@
|
535 |
+
input266�� 2*layers.24.self_attn.qkv_proj.v_proj.weight8
|
536 |
+
4
|
537 |
+
input267� 2 layers.24.input_layernorm.weight8
|
538 |
+
:
|
539 |
+
input268�2&layers.24.self_attn.k_layernorm.weight8
|
540 |
+
@
|
541 |
+
input269�� 2*layers.24.self_attn.qkv_proj.k_proj.weight8
|
542 |
+
:
|
543 |
+
input270�2&layers.24.self_attn.q_layernorm.weight8
|
544 |
+
@
|
545 |
+
input271�� 2*layers.24.self_attn.qkv_proj.q_proj.weight8
|
546 |
+
4
|
547 |
+
input272� �02layers.24.mlp.down_proj.weight8
|
548 |
+
2
|
549 |
+
input273�0� 2layers.24.mlp.up_proj.weight8
|
550 |
+
=
|
551 |
+
input274� 2)layers.24.post_attention_layernorm.weight8
|
552 |
+
4
|
553 |
+
input275�0� 2layers.24.mlp.gate_proj.weight8
|
554 |
+
>
|
555 |
+
input276� �2(layers.25.self_attn.o_proj.o_proj.weight8
|
556 |
+
@
|
557 |
+
input277�� 2*layers.25.self_attn.qkv_proj.v_proj.weight8
|
558 |
+
4
|
559 |
+
input278� 2 layers.25.input_layernorm.weight8
|
560 |
+
:
|
561 |
+
input279�2&layers.25.self_attn.k_layernorm.weight8
|
562 |
+
@
|
563 |
+
input280�� 2*layers.25.self_attn.qkv_proj.k_proj.weight8
|
564 |
+
:
|
565 |
+
input281�2&layers.25.self_attn.q_layernorm.weight8
|
566 |
+
@
|
567 |
+
input282�� 2*layers.25.self_attn.qkv_proj.q_proj.weight8
|
568 |
+
4
|
569 |
+
input283� �02layers.25.mlp.down_proj.weight8
|
570 |
+
2
|
571 |
+
input284�0� 2layers.25.mlp.up_proj.weight8
|
572 |
+
=
|
573 |
+
input285� 2)layers.25.post_attention_layernorm.weight8
|
574 |
+
4
|
575 |
+
input286�0� 2layers.25.mlp.gate_proj.weight8
|
576 |
+
>
|
577 |
+
input287� �2(layers.26.self_attn.o_proj.o_proj.weight8
|
578 |
+
@
|
579 |
+
input288�� 2*layers.26.self_attn.qkv_proj.v_proj.weight8
|
580 |
+
4
|
581 |
+
input289� 2 layers.26.input_layernorm.weight8
|
582 |
+
:
|
583 |
+
input290�2&layers.26.self_attn.k_layernorm.weight8
|
584 |
+
@
|
585 |
+
input291�� 2*layers.26.self_attn.qkv_proj.k_proj.weight8
|
586 |
+
:
|
587 |
+
input292�2&layers.26.self_attn.q_layernorm.weight8
|
588 |
+
@
|
589 |
+
input293�� 2*layers.26.self_attn.qkv_proj.q_proj.weight8
|
590 |
+
4
|
591 |
+
input294� �02layers.26.mlp.down_proj.weight8
|
592 |
+
2
|
593 |
+
input295�0� 2layers.26.mlp.up_proj.weight8
|
594 |
+
=
|
595 |
+
input296� 2)layers.26.post_attention_layernorm.weight8
|
596 |
+
4
|
597 |
+
input297�0� 2layers.26.mlp.gate_proj.weight8
|
598 |
+
>
|
599 |
+
input298� �2(layers.27.self_attn.o_proj.o_proj.weight8
|
600 |
+
@
|
601 |
+
input299�� 2*layers.27.self_attn.qkv_proj.v_proj.weight8
|
602 |
+
4
|
603 |
+
input300� 2 layers.27.input_layernorm.weight8
|
604 |
+
:
|
605 |
+
input301�2&layers.27.self_attn.k_layernorm.weight8
|
606 |
+
@
|
607 |
+
input302�� 2*layers.27.self_attn.qkv_proj.k_proj.weight8
|
608 |
+
:
|
609 |
+
input303�2&layers.27.self_attn.q_layernorm.weight8
|
610 |
+
@
|
611 |
+
input304�� 2*layers.27.self_attn.qkv_proj.q_proj.weight8
|
612 |
+
4
|
613 |
+
input305� �02layers.27.mlp.down_proj.weight8
|
614 |
+
2
|
615 |
+
input306�0� 2layers.27.mlp.up_proj.weight8
|
616 |
+
=
|
617 |
+
input307� 2)layers.27.post_attention_layernorm.weight8
|
618 |
+
4
|
619 |
+
input308�0� 2layers.27.mlp.gate_proj.weight8
|
620 |
+
>
|
621 |
+
input309� �2(layers.28.self_attn.o_proj.o_proj.weight8
|
622 |
+
@
|
623 |
+
input310�� 2*layers.28.self_attn.qkv_proj.v_proj.weight8
|
624 |
+
4
|
625 |
+
input311� 2 layers.28.input_layernorm.weight8
|
626 |
+
:
|
627 |
+
input312�2&layers.28.self_attn.k_layernorm.weight8
|
628 |
+
@
|
629 |
+
input313�� 2*layers.28.self_attn.qkv_proj.k_proj.weight8
|
630 |
+
:
|
631 |
+
input314�2&layers.28.self_attn.q_layernorm.weight8
|
632 |
+
@
|
633 |
+
input315�� 2*layers.28.self_attn.qkv_proj.q_proj.weight8
|
634 |
+
4
|
635 |
+
input316� �02layers.28.mlp.down_proj.weight8
|
636 |
+
2
|
637 |
+
input317�0� 2layers.28.mlp.up_proj.weight8
|
638 |
+
=
|
639 |
+
input318� 2)layers.28.post_attention_layernorm.weight8
|
640 |
+
4
|
641 |
+
input319�0� 2layers.28.mlp.gate_proj.weight8
|
642 |
+
>
|
643 |
+
input320� �2(layers.29.self_attn.o_proj.o_proj.weight8
|
644 |
+
@
|
645 |
+
input321�� 2*layers.29.self_attn.qkv_proj.v_proj.weight8
|
646 |
+
4
|
647 |
+
input322� 2 layers.29.input_layernorm.weight8
|
648 |
+
:
|
649 |
+
input323�2&layers.29.self_attn.k_layernorm.weight8
|
650 |
+
@
|
651 |
+
input324�� 2*layers.29.self_attn.qkv_proj.k_proj.weight8
|
652 |
+
:
|
653 |
+
input325�2&layers.29.self_attn.q_layernorm.weight8
|
654 |
+
@
|
655 |
+
input326�� 2*layers.29.self_attn.qkv_proj.q_proj.weight8
|
656 |
+
4
|
657 |
+
input327� �02layers.29.mlp.down_proj.weight8
|
658 |
+
2
|
659 |
+
input328�0� 2layers.29.mlp.up_proj.weight8
|
660 |
+
=
|
661 |
+
input329� 2)layers.29.post_attention_layernorm.weight8
|
662 |
+
4
|
663 |
+
input330�0� 2layers.29.mlp.gate_proj.weight8
|
664 |
+
>
|
665 |
+
input331� �2(layers.30.self_attn.o_proj.o_proj.weight8
|
666 |
+
@
|
667 |
+
input332�� 2*layers.30.self_attn.qkv_proj.v_proj.weight8
|
668 |
+
4
|
669 |
+
input333� 2 layers.30.input_layernorm.weight8
|
670 |
+
:
|
671 |
+
input334�2&layers.30.self_attn.k_layernorm.weight8
|
672 |
+
@
|
673 |
+
input335�� 2*layers.30.self_attn.qkv_proj.k_proj.weight8
|
674 |
+
:
|
675 |
+
input336�2&layers.30.self_attn.q_layernorm.weight8
|
676 |
+
@
|
677 |
+
input337�� 2*layers.30.self_attn.qkv_proj.q_proj.weight8
|
678 |
+
4
|
679 |
+
input338� �02layers.30.mlp.down_proj.weight8
|
680 |
+
2
|
681 |
+
input339�0� 2layers.30.mlp.up_proj.weight8
|
682 |
+
=
|
683 |
+
input340� 2)layers.30.post_attention_layernorm.weight8
|
684 |
+
4
|
685 |
+
input341�0� 2layers.30.mlp.gate_proj.weight8
|
686 |
+
>
|
687 |
+
input342� �2(layers.31.self_attn.o_proj.o_proj.weight8
|
688 |
+
@
|
689 |
+
input343�� 2*layers.31.self_attn.qkv_proj.v_proj.weight8
|
690 |
+
4
|
691 |
+
input344� 2 layers.31.input_layernorm.weight8
|
692 |
+
:
|
693 |
+
input345�2&layers.31.self_attn.k_layernorm.weight8
|
694 |
+
@
|
695 |
+
input346�� 2*layers.31.self_attn.qkv_proj.k_proj.weight8
|
696 |
+
:
|
697 |
+
input347�2&layers.31.self_attn.q_layernorm.weight8
|
698 |
+
@
|
699 |
+
input348�� 2*layers.31.self_attn.qkv_proj.q_proj.weight8
|
700 |
+
4
|
701 |
+
input349� �02layers.31.mlp.down_proj.weight8
|
702 |
+
2
|
703 |
+
input350�0� 2layers.31.mlp.up_proj.weight8
|
704 |
+
=
|
705 |
+
input351� 2)layers.31.post_attention_layernorm.weight8
|
706 |
+
4
|
707 |
+
input352�0� 2layers.31.mlp.gate_proj.weight8
|
708 |
+
>
|
709 |
+
input353� �2(layers.32.self_attn.o_proj.o_proj.weight8
|
710 |
+
@
|
711 |
+
input354�� 2*layers.32.self_attn.qkv_proj.v_proj.weight8
|
712 |
+
4
|
713 |
+
input355� 2 layers.32.input_layernorm.weight8
|
714 |
+
:
|
715 |
+
input356�2&layers.32.self_attn.k_layernorm.weight8
|
716 |
+
@
|
717 |
+
input357�� 2*layers.32.self_attn.qkv_proj.k_proj.weight8
|
718 |
+
:
|
719 |
+
input358�2&layers.32.self_attn.q_layernorm.weight8
|
720 |
+
@
|
721 |
+
input359�� 2*layers.32.self_attn.qkv_proj.q_proj.weight8
|
722 |
+
4
|
723 |
+
input360� �02layers.32.mlp.down_proj.weight8
|
724 |
+
2
|
725 |
+
input361�0� 2layers.32.mlp.up_proj.weight8
|
726 |
+
=
|
727 |
+
input362� 2)layers.32.post_attention_layernorm.weight8
|
728 |
+
4
|
729 |
+
input363�0� 2layers.32.mlp.gate_proj.weight8
|
730 |
+
>
|
731 |
+
input364� �2(layers.33.self_attn.o_proj.o_proj.weight8
|
732 |
+
@
|
733 |
+
input365�� 2*layers.33.self_attn.qkv_proj.v_proj.weight8
|
734 |
+
4
|
735 |
+
input366� 2 layers.33.input_layernorm.weight8
|
736 |
+
:
|
737 |
+
input367�2&layers.33.self_attn.k_layernorm.weight8
|
738 |
+
@
|
739 |
+
input368�� 2*layers.33.self_attn.qkv_proj.k_proj.weight8
|
740 |
+
:
|
741 |
+
input369�2&layers.33.self_attn.q_layernorm.weight8
|
742 |
+
@
|
743 |
+
input370�� 2*layers.33.self_attn.qkv_proj.q_proj.weight8
|
744 |
+
4
|
745 |
+
input371� �02layers.33.mlp.down_proj.weight8
|
746 |
+
2
|
747 |
+
input372�0� 2layers.33.mlp.up_proj.weight8
|
748 |
+
=
|
749 |
+
input373� 2)layers.33.post_attention_layernorm.weight8
|
750 |
+
4
|
751 |
+
input374�0� 2layers.33.mlp.gate_proj.weight8
|
752 |
+
>
|
753 |
+
input375� �2(layers.34.self_attn.o_proj.o_proj.weight8
|
754 |
+
@
|
755 |
+
input376�� 2*layers.34.self_attn.qkv_proj.v_proj.weight8
|
756 |
+
4
|
757 |
+
input377� 2 layers.34.input_layernorm.weight8
|
758 |
+
:
|
759 |
+
input378�2&layers.34.self_attn.k_layernorm.weight8
|
760 |
+
@
|
761 |
+
input379�� 2*layers.34.self_attn.qkv_proj.k_proj.weight8
|
762 |
+
:
|
763 |
+
input380�2&layers.34.self_attn.q_layernorm.weight8
|
764 |
+
@
|
765 |
+
input381�� 2*layers.34.self_attn.qkv_proj.q_proj.weight8
|
766 |
+
4
|
767 |
+
input382� �02layers.34.mlp.down_proj.weight8
|
768 |
+
2
|
769 |
+
input383�0� 2layers.34.mlp.up_proj.weight8
|
770 |
+
=
|
771 |
+
input384� 2)layers.34.post_attention_layernorm.weight8
|
772 |
+
4
|
773 |
+
input385�0� 2layers.34.mlp.gate_proj.weight8
|
774 |
+
>
|
775 |
+
input386� �2(layers.35.self_attn.o_proj.o_proj.weight8
|
776 |
+
@
|
777 |
+
input387�� 2*layers.35.self_attn.qkv_proj.v_proj.weight8
|
778 |
+
4
|
779 |
+
input388� 2 layers.35.input_layernorm.weight8
|
780 |
+
:
|
781 |
+
input389�2&layers.35.self_attn.k_layernorm.weight8
|
782 |
+
@
|
783 |
+
input390�� 2*layers.35.self_attn.qkv_proj.k_proj.weight8
|
784 |
+
:
|
785 |
+
input391�2&layers.35.self_attn.q_layernorm.weight8
|
786 |
+
@
|
787 |
+
input392�� 2*layers.35.self_attn.qkv_proj.q_proj.weight8
|
788 |
+
4
|
789 |
+
input393� �02layers.35.mlp.down_proj.weight8
|
790 |
+
2
|
791 |
+
input394�0� 2layers.35.mlp.up_proj.weight8
|
792 |
+
=
|
793 |
+
input395� 2)layers.35.post_attention_layernorm.weight8
|
794 |
+
4
|
795 |
+
input396�0� 2layers.35.mlp.gate_proj.weight8
|
796 |
+
%
|
797 |
+
input397��� 2lm_head.weight8
|
798 |
+
|
799 |
+
input398� 2norm.weight8'
|
800 |
+
output0�� �2embed_tokens.weight>
|
801 |
+
output1��2'layers.0.self_attn.o_proj.o_proj.weight>
|
802 |
+
output2� �2)layers.0.self_attn.qkv_proj.v_proj.weight1
|
803 |
+
output3� 2layers.0.input_layernorm.weight6
|
804 |
+
output4�2%layers.0.self_attn.k_layernorm.weight>
|
805 |
+
output5� @2)layers.0.self_attn.qkv_proj.k_proj.weight6
|
806 |
+
output6�2%layers.0.self_attn.q_layernorm.weight?
|
807 |
+
output7� @2)layers.0.self_attn.qkv_proj.q_proj.weight3
|
808 |
+
output8 ��2layers.0.mlp.down_proj.weight0
|
809 |
+
output90� �2layers.0.mlp.up_proj.weight;
|
810 |
+
output10� 2(layers.0.post_attention_layernorm.weight3
|
811 |
+
output110� �2layers.0.mlp.gate_proj.weight?
|
812 |
+
output12��2'layers.1.self_attn.o_proj.o_proj.weight?
|
813 |
+
output13� �2)layers.1.self_attn.qkv_proj.v_proj.weight2
|
814 |
+
output14� 2layers.1.input_layernorm.weight7
|
815 |
+
output15�2%layers.1.self_attn.k_layernorm.weight?
|
816 |
+
output16� @2)layers.1.self_attn.qkv_proj.k_proj.weight7
|
817 |
+
output17�2%layers.1.self_attn.q_layernorm.weight@
|
818 |
+
output18� @2)layers.1.self_attn.qkv_proj.q_proj.weight4
|
819 |
+
output19 ��2layers.1.mlp.down_proj.weight1
|
820 |
+
output200� �2layers.1.mlp.up_proj.weight;
|
821 |
+
output21� 2(layers.1.post_attention_layernorm.weight3
|
822 |
+
output220� �2layers.1.mlp.gate_proj.weight?
|
823 |
+
output23��2'layers.2.self_attn.o_proj.o_proj.weight?
|
824 |
+
output24� �2)layers.2.self_attn.qkv_proj.v_proj.weight2
|
825 |
+
output25� 2layers.2.input_layernorm.weight7
|
826 |
+
output26�2%layers.2.self_attn.k_layernorm.weight?
|
827 |
+
output27� @2)layers.2.self_attn.qkv_proj.k_proj.weight7
|
828 |
+
output28�2%layers.2.self_attn.q_layernorm.weight@
|
829 |
+
output29� @2)layers.2.self_attn.qkv_proj.q_proj.weight4
|
830 |
+
output30 ��2layers.2.mlp.down_proj.weight1
|
831 |
+
output310� �2layers.2.mlp.up_proj.weight;
|
832 |
+
output32� 2(layers.2.post_attention_layernorm.weight3
|
833 |
+
output330� �2layers.2.mlp.gate_proj.weight?
|
834 |
+
output34��2'layers.3.self_attn.o_proj.o_proj.weight?
|
835 |
+
output35� �2)layers.3.self_attn.qkv_proj.v_proj.weight2
|
836 |
+
output36� 2layers.3.input_layernorm.weight7
|
837 |
+
output37�2%layers.3.self_attn.k_layernorm.weight?
|
838 |
+
output38� @2)layers.3.self_attn.qkv_proj.k_proj.weight7
|
839 |
+
output39�2%layers.3.self_attn.q_layernorm.weight@
|
840 |
+
output40� @2)layers.3.self_attn.qkv_proj.q_proj.weight4
|
841 |
+
output41 ��2layers.3.mlp.down_proj.weight1
|
842 |
+
output420� �2layers.3.mlp.up_proj.weight;
|
843 |
+
output43� 2(layers.3.post_attention_layernorm.weight3
|
844 |
+
output440� �2layers.3.mlp.gate_proj.weight?
|
845 |
+
output45��2'layers.4.self_attn.o_proj.o_proj.weight?
|
846 |
+
output46� �2)layers.4.self_attn.qkv_proj.v_proj.weight2
|
847 |
+
output47� 2layers.4.input_layernorm.weight7
|
848 |
+
output48�2%layers.4.self_attn.k_layernorm.weight?
|
849 |
+
output49� @2)layers.4.self_attn.qkv_proj.k_proj.weight7
|
850 |
+
output50�2%layers.4.self_attn.q_layernorm.weight@
|
851 |
+
output51� @2)layers.4.self_attn.qkv_proj.q_proj.weight4
|
852 |
+
output52 ��2layers.4.mlp.down_proj.weight1
|
853 |
+
output530� �2layers.4.mlp.up_proj.weight;
|
854 |
+
output54� 2(layers.4.post_attention_layernorm.weight3
|
855 |
+
output550� �2layers.4.mlp.gate_proj.weight?
|
856 |
+
output56��2'layers.5.self_attn.o_proj.o_proj.weight?
|
857 |
+
output57� �2)layers.5.self_attn.qkv_proj.v_proj.weight2
|
858 |
+
output58� 2layers.5.input_layernorm.weight7
|
859 |
+
output59�2%layers.5.self_attn.k_layernorm.weight?
|
860 |
+
output60� @2)layers.5.self_attn.qkv_proj.k_proj.weight7
|
861 |
+
output61�2%layers.5.self_attn.q_layernorm.weight@
|
862 |
+
output62� @2)layers.5.self_attn.qkv_proj.q_proj.weight4
|
863 |
+
output63 ��2layers.5.mlp.down_proj.weight1
|
864 |
+
output640� �2layers.5.mlp.up_proj.weight;
|
865 |
+
output65� 2(layers.5.post_attention_layernorm.weight3
|
866 |
+
output660� �2layers.5.mlp.gate_proj.weight?
|
867 |
+
output67��2'layers.6.self_attn.o_proj.o_proj.weight?
|
868 |
+
output68� �2)layers.6.self_attn.qkv_proj.v_proj.weight2
|
869 |
+
output69� 2layers.6.input_layernorm.weight7
|
870 |
+
output70�2%layers.6.self_attn.k_layernorm.weight?
|
871 |
+
output71� @2)layers.6.self_attn.qkv_proj.k_proj.weight7
|
872 |
+
output72�2%layers.6.self_attn.q_layernorm.weight@
|
873 |
+
output73� @2)layers.6.self_attn.qkv_proj.q_proj.weight4
|
874 |
+
output74 ��2layers.6.mlp.down_proj.weight1
|
875 |
+
output750� �2layers.6.mlp.up_proj.weight;
|
876 |
+
output76� 2(layers.6.post_attention_layernorm.weight3
|
877 |
+
output770� �2layers.6.mlp.gate_proj.weight?
|
878 |
+
output78��2'layers.7.self_attn.o_proj.o_proj.weight?
|
879 |
+
output79� �2)layers.7.self_attn.qkv_proj.v_proj.weight2
|
880 |
+
output80� 2layers.7.input_layernorm.weight7
|
881 |
+
output81�2%layers.7.self_attn.k_layernorm.weight?
|
882 |
+
output82� @2)layers.7.self_attn.qkv_proj.k_proj.weight7
|
883 |
+
output83�2%layers.7.self_attn.q_layernorm.weight@
|
884 |
+
output84� @2)layers.7.self_attn.qkv_proj.q_proj.weight4
|
885 |
+
output85 ��2layers.7.mlp.down_proj.weight1
|
886 |
+
output860� �2layers.7.mlp.up_proj.weight;
|
887 |
+
output87� 2(layers.7.post_attention_layernorm.weight3
|
888 |
+
output880� �2layers.7.mlp.gate_proj.weight?
|
889 |
+
output89��2'layers.8.self_attn.o_proj.o_proj.weight?
|
890 |
+
output90� �2)layers.8.self_attn.qkv_proj.v_proj.weight2
|
891 |
+
output91� 2layers.8.input_layernorm.weight7
|
892 |
+
output92�2%layers.8.self_attn.k_layernorm.weight?
|
893 |
+
output93� @2)layers.8.self_attn.qkv_proj.k_proj.weight7
|
894 |
+
output94�2%layers.8.self_attn.q_layernorm.weight@
|
895 |
+
output95� @2)layers.8.self_attn.qkv_proj.q_proj.weight4
|
896 |
+
output96 ��2layers.8.mlp.down_proj.weight1
|
897 |
+
output970� �2layers.8.mlp.up_proj.weight;
|
898 |
+
output98� 2(layers.8.post_attention_layernorm.weight3
|
899 |
+
output990� �2layers.8.mlp.gate_proj.weight@
|
900 |
+
output100��2'layers.9.self_attn.o_proj.o_proj.weight@
|
901 |
+
output101� �2)layers.9.self_attn.qkv_proj.v_proj.weight3
|
902 |
+
output102� 2layers.9.input_layernorm.weight8
|
903 |
+
output103�2%layers.9.self_attn.k_layernorm.weight@
|
904 |
+
output104� @2)layers.9.self_attn.qkv_proj.k_proj.weight8
|
905 |
+
output105�2%layers.9.self_attn.q_layernorm.weightA
|
906 |
+
output106� @2)layers.9.self_attn.qkv_proj.q_proj.weight5
|
907 |
+
output107 ��2layers.9.mlp.down_proj.weight2
|
908 |
+
output1080� �2layers.9.mlp.up_proj.weight<
|
909 |
+
output109� 2(layers.9.post_attention_layernorm.weight4
|
910 |
+
output1100� �2layers.9.mlp.gate_proj.weightA
|
911 |
+
output111��2(layers.10.self_attn.o_proj.o_proj.weightA
|
912 |
+
output112� �2*layers.10.self_attn.qkv_proj.v_proj.weight4
|
913 |
+
output113� 2 layers.10.input_layernorm.weight9
|
914 |
+
output114�2&layers.10.self_attn.k_layernorm.weightA
|
915 |
+
output115� @2*layers.10.self_attn.qkv_proj.k_proj.weight9
|
916 |
+
output116�2&layers.10.self_attn.q_layernorm.weightB
|
917 |
+
output117� @2*layers.10.self_attn.qkv_proj.q_proj.weight6
|
918 |
+
output118 ��2layers.10.mlp.down_proj.weight3
|
919 |
+
output1190� �2layers.10.mlp.up_proj.weight=
|
920 |
+
output120� 2)layers.10.post_attention_layernorm.weight5
|
921 |
+
output1210� �2layers.10.mlp.gate_proj.weightA
|
922 |
+
output122��2(layers.11.self_attn.o_proj.o_proj.weightA
|
923 |
+
output123� �2*layers.11.self_attn.qkv_proj.v_proj.weight4
|
924 |
+
output124� 2 layers.11.input_layernorm.weight9
|
925 |
+
output125�2&layers.11.self_attn.k_layernorm.weightA
|
926 |
+
output126� @2*layers.11.self_attn.qkv_proj.k_proj.weight9
|
927 |
+
output127�2&layers.11.self_attn.q_layernorm.weightB
|
928 |
+
output128� @2*layers.11.self_attn.qkv_proj.q_proj.weight6
|
929 |
+
output129 ��2layers.11.mlp.down_proj.weight3
|
930 |
+
output1300� �2layers.11.mlp.up_proj.weight=
|
931 |
+
output131� 2)layers.11.post_attention_layernorm.weight5
|
932 |
+
output1320� �2layers.11.mlp.gate_proj.weightA
|
933 |
+
output133��2(layers.12.self_attn.o_proj.o_proj.weightA
|
934 |
+
output134� �2*layers.12.self_attn.qkv_proj.v_proj.weight4
|
935 |
+
output135� 2 layers.12.input_layernorm.weight9
|
936 |
+
output136�2&layers.12.self_attn.k_layernorm.weightA
|
937 |
+
output137� @2*layers.12.self_attn.qkv_proj.k_proj.weight9
|
938 |
+
output138�2&layers.12.self_attn.q_layernorm.weightB
|
939 |
+
output139� @2*layers.12.self_attn.qkv_proj.q_proj.weight6
|
940 |
+
output140 ��2layers.12.mlp.down_proj.weight3
|
941 |
+
output1410� �2layers.12.mlp.up_proj.weight=
|
942 |
+
output142� 2)layers.12.post_attention_layernorm.weight5
|
943 |
+
output1430� �2layers.12.mlp.gate_proj.weightA
|
944 |
+
output144��2(layers.13.self_attn.o_proj.o_proj.weightA
|
945 |
+
output145� �2*layers.13.self_attn.qkv_proj.v_proj.weight4
|
946 |
+
output146� 2 layers.13.input_layernorm.weight9
|
947 |
+
output147�2&layers.13.self_attn.k_layernorm.weightA
|
948 |
+
output148� @2*layers.13.self_attn.qkv_proj.k_proj.weight9
|
949 |
+
output149�2&layers.13.self_attn.q_layernorm.weightB
|
950 |
+
output150� @2*layers.13.self_attn.qkv_proj.q_proj.weight6
|
951 |
+
output151 ��2layers.13.mlp.down_proj.weight3
|
952 |
+
output1520� �2layers.13.mlp.up_proj.weight=
|
953 |
+
output153� 2)layers.13.post_attention_layernorm.weight5
|
954 |
+
output1540� �2layers.13.mlp.gate_proj.weightA
|
955 |
+
output155��2(layers.14.self_attn.o_proj.o_proj.weightA
|
956 |
+
output156� �2*layers.14.self_attn.qkv_proj.v_proj.weight4
|
957 |
+
output157� 2 layers.14.input_layernorm.weight9
|
958 |
+
output158�2&layers.14.self_attn.k_layernorm.weightA
|
959 |
+
output159� @2*layers.14.self_attn.qkv_proj.k_proj.weight9
|
960 |
+
output160�2&layers.14.self_attn.q_layernorm.weightB
|
961 |
+
output161� @2*layers.14.self_attn.qkv_proj.q_proj.weight6
|
962 |
+
output162 ��2layers.14.mlp.down_proj.weight3
|
963 |
+
output1630� �2layers.14.mlp.up_proj.weight=
|
964 |
+
output164� 2)layers.14.post_attention_layernorm.weight5
|
965 |
+
output1650� �2layers.14.mlp.gate_proj.weightA
|
966 |
+
output166��2(layers.15.self_attn.o_proj.o_proj.weightA
|
967 |
+
output167� �2*layers.15.self_attn.qkv_proj.v_proj.weight4
|
968 |
+
output168� 2 layers.15.input_layernorm.weight9
|
969 |
+
output169�2&layers.15.self_attn.k_layernorm.weightA
|
970 |
+
output170� @2*layers.15.self_attn.qkv_proj.k_proj.weight9
|
971 |
+
output171�2&layers.15.self_attn.q_layernorm.weightB
|
972 |
+
output172� @2*layers.15.self_attn.qkv_proj.q_proj.weight6
|
973 |
+
output173 ��2layers.15.mlp.down_proj.weight3
|
974 |
+
output1740� �2layers.15.mlp.up_proj.weight=
|
975 |
+
output175� 2)layers.15.post_attention_layernorm.weight5
|
976 |
+
output1760� �2layers.15.mlp.gate_proj.weightA
|
977 |
+
output177��2(layers.16.self_attn.o_proj.o_proj.weightA
|
978 |
+
output178� �2*layers.16.self_attn.qkv_proj.v_proj.weight4
|
979 |
+
output179� 2 layers.16.input_layernorm.weight9
|
980 |
+
output180�2&layers.16.self_attn.k_layernorm.weightA
|
981 |
+
output181� @2*layers.16.self_attn.qkv_proj.k_proj.weight9
|
982 |
+
output182�2&layers.16.self_attn.q_layernorm.weightB
|
983 |
+
output183� @2*layers.16.self_attn.qkv_proj.q_proj.weight6
|
984 |
+
output184 ��2layers.16.mlp.down_proj.weight3
|
985 |
+
output1850� �2layers.16.mlp.up_proj.weight=
|
986 |
+
output186� 2)layers.16.post_attention_layernorm.weight5
|
987 |
+
output1870� �2layers.16.mlp.gate_proj.weightA
|
988 |
+
output188��2(layers.17.self_attn.o_proj.o_proj.weightA
|
989 |
+
output189� �2*layers.17.self_attn.qkv_proj.v_proj.weight4
|
990 |
+
output190� 2 layers.17.input_layernorm.weight9
|
991 |
+
output191�2&layers.17.self_attn.k_layernorm.weightA
|
992 |
+
output192� @2*layers.17.self_attn.qkv_proj.k_proj.weight9
|
993 |
+
output193�2&layers.17.self_attn.q_layernorm.weightB
|
994 |
+
output194� @2*layers.17.self_attn.qkv_proj.q_proj.weight6
|
995 |
+
output195 ��2layers.17.mlp.down_proj.weight3
|
996 |
+
output1960� �2layers.17.mlp.up_proj.weight=
|
997 |
+
output197� 2)layers.17.post_attention_layernorm.weight5
|
998 |
+
output1980� �2layers.17.mlp.gate_proj.weightA
|
999 |
+
output199��2(layers.18.self_attn.o_proj.o_proj.weightA
|
1000 |
+
output200� �2*layers.18.self_attn.qkv_proj.v_proj.weight4
|
1001 |
+
output201� 2 layers.18.input_layernorm.weight9
|
1002 |
+
output202�2&layers.18.self_attn.k_layernorm.weightA
|
1003 |
+
output203� @2*layers.18.self_attn.qkv_proj.k_proj.weight9
|
1004 |
+
output204�2&layers.18.self_attn.q_layernorm.weightB
|
1005 |
+
output205� @2*layers.18.self_attn.qkv_proj.q_proj.weight6
|
1006 |
+
output206 ��2layers.18.mlp.down_proj.weight3
|
1007 |
+
output2070� �2layers.18.mlp.up_proj.weight=
|
1008 |
+
output208� 2)layers.18.post_attention_layernorm.weight5
|
1009 |
+
output2090� �2layers.18.mlp.gate_proj.weightA
|
1010 |
+
output210��2(layers.19.self_attn.o_proj.o_proj.weightA
|
1011 |
+
output211� �2*layers.19.self_attn.qkv_proj.v_proj.weight4
|
1012 |
+
output212� 2 layers.19.input_layernorm.weight9
|
1013 |
+
output213�2&layers.19.self_attn.k_layernorm.weightA
|
1014 |
+
output214� @2*layers.19.self_attn.qkv_proj.k_proj.weight9
|
1015 |
+
output215�2&layers.19.self_attn.q_layernorm.weightB
|
1016 |
+
output216� @2*layers.19.self_attn.qkv_proj.q_proj.weight6
|
1017 |
+
output217 ��2layers.19.mlp.down_proj.weight3
|
1018 |
+
output2180� �2layers.19.mlp.up_proj.weight=
|
1019 |
+
output219� 2)layers.19.post_attention_layernorm.weight5
|
1020 |
+
output2200� �2layers.19.mlp.gate_proj.weightA
|
1021 |
+
output221��2(layers.20.self_attn.o_proj.o_proj.weightA
|
1022 |
+
output222� �2*layers.20.self_attn.qkv_proj.v_proj.weight4
|
1023 |
+
output223� 2 layers.20.input_layernorm.weight9
|
1024 |
+
output224�2&layers.20.self_attn.k_layernorm.weightA
|
1025 |
+
output225� @2*layers.20.self_attn.qkv_proj.k_proj.weight9
|
1026 |
+
output226�2&layers.20.self_attn.q_layernorm.weightB
|
1027 |
+
output227� @2*layers.20.self_attn.qkv_proj.q_proj.weight6
|
1028 |
+
output228 ��2layers.20.mlp.down_proj.weight3
|
1029 |
+
output2290� �2layers.20.mlp.up_proj.weight=
|
1030 |
+
output230� 2)layers.20.post_attention_layernorm.weight5
|
1031 |
+
output2310� �2layers.20.mlp.gate_proj.weightA
|
1032 |
+
output232��2(layers.21.self_attn.o_proj.o_proj.weightA
|
1033 |
+
output233� �2*layers.21.self_attn.qkv_proj.v_proj.weight4
|
1034 |
+
output234� 2 layers.21.input_layernorm.weight9
|
1035 |
+
output235�2&layers.21.self_attn.k_layernorm.weightA
|
1036 |
+
output236� @2*layers.21.self_attn.qkv_proj.k_proj.weight9
|
1037 |
+
output237�2&layers.21.self_attn.q_layernorm.weightB
|
1038 |
+
output238� @2*layers.21.self_attn.qkv_proj.q_proj.weight6
|
1039 |
+
output239 ��2layers.21.mlp.down_proj.weight3
|
1040 |
+
output2400� �2layers.21.mlp.up_proj.weight=
|
1041 |
+
output241� 2)layers.21.post_attention_layernorm.weight5
|
1042 |
+
output2420� �2layers.21.mlp.gate_proj.weightA
|
1043 |
+
output243��2(layers.22.self_attn.o_proj.o_proj.weightA
|
1044 |
+
output244� �2*layers.22.self_attn.qkv_proj.v_proj.weight4
|
1045 |
+
output245� 2 layers.22.input_layernorm.weight9
|
1046 |
+
output246�2&layers.22.self_attn.k_layernorm.weightA
|
1047 |
+
output247� @2*layers.22.self_attn.qkv_proj.k_proj.weight9
|
1048 |
+
output248�2&layers.22.self_attn.q_layernorm.weightB
|
1049 |
+
output249� @2*layers.22.self_attn.qkv_proj.q_proj.weight6
|
1050 |
+
output250 ��2layers.22.mlp.down_proj.weight3
|
1051 |
+
output2510� �2layers.22.mlp.up_proj.weight=
|
1052 |
+
output252� 2)layers.22.post_attention_layernorm.weight5
|
1053 |
+
output2530� �2layers.22.mlp.gate_proj.weightA
|
1054 |
+
output254��2(layers.23.self_attn.o_proj.o_proj.weightA
|
1055 |
+
output255� �2*layers.23.self_attn.qkv_proj.v_proj.weight4
|
1056 |
+
output256� 2 layers.23.input_layernorm.weight9
|
1057 |
+
output257�2&layers.23.self_attn.k_layernorm.weightA
|
1058 |
+
output258� @2*layers.23.self_attn.qkv_proj.k_proj.weight9
|
1059 |
+
output259�2&layers.23.self_attn.q_layernorm.weightB
|
1060 |
+
output260� @2*layers.23.self_attn.qkv_proj.q_proj.weight6
|
1061 |
+
output261 ��2layers.23.mlp.down_proj.weight3
|
1062 |
+
output2620� �2layers.23.mlp.up_proj.weight=
|
1063 |
+
output263� 2)layers.23.post_attention_layernorm.weight5
|
1064 |
+
output2640� �2layers.23.mlp.gate_proj.weightA
|
1065 |
+
output265��2(layers.24.self_attn.o_proj.o_proj.weightA
|
1066 |
+
output266� �2*layers.24.self_attn.qkv_proj.v_proj.weight4
|
1067 |
+
output267� 2 layers.24.input_layernorm.weight9
|
1068 |
+
output268�2&layers.24.self_attn.k_layernorm.weightA
|
1069 |
+
output269� @2*layers.24.self_attn.qkv_proj.k_proj.weight9
|
1070 |
+
output270�2&layers.24.self_attn.q_layernorm.weightB
|
1071 |
+
output271� @2*layers.24.self_attn.qkv_proj.q_proj.weight6
|
1072 |
+
output272 ��2layers.24.mlp.down_proj.weight3
|
1073 |
+
output2730� �2layers.24.mlp.up_proj.weight=
|
1074 |
+
output274� 2)layers.24.post_attention_layernorm.weight5
|
1075 |
+
output2750� �2layers.24.mlp.gate_proj.weightA
|
1076 |
+
output276��2(layers.25.self_attn.o_proj.o_proj.weightA
|
1077 |
+
output277� �2*layers.25.self_attn.qkv_proj.v_proj.weight4
|
1078 |
+
output278� 2 layers.25.input_layernorm.weight9
|
1079 |
+
output279�2&layers.25.self_attn.k_layernorm.weightA
|
1080 |
+
output280� @2*layers.25.self_attn.qkv_proj.k_proj.weight9
|
1081 |
+
output281�2&layers.25.self_attn.q_layernorm.weightB
|
1082 |
+
output282� @2*layers.25.self_attn.qkv_proj.q_proj.weight6
|
1083 |
+
output283 ��2layers.25.mlp.down_proj.weight3
|
1084 |
+
output2840� �2layers.25.mlp.up_proj.weight=
|
1085 |
+
output285� 2)layers.25.post_attention_layernorm.weight5
|
1086 |
+
output2860� �2layers.25.mlp.gate_proj.weightA
|
1087 |
+
output287��2(layers.26.self_attn.o_proj.o_proj.weightA
|
1088 |
+
output288� �2*layers.26.self_attn.qkv_proj.v_proj.weight4
|
1089 |
+
output289� 2 layers.26.input_layernorm.weight9
|
1090 |
+
output290�2&layers.26.self_attn.k_layernorm.weightA
|
1091 |
+
output291� @2*layers.26.self_attn.qkv_proj.k_proj.weight9
|
1092 |
+
output292�2&layers.26.self_attn.q_layernorm.weightB
|
1093 |
+
output293� @2*layers.26.self_attn.qkv_proj.q_proj.weight6
|
1094 |
+
output294 ��2layers.26.mlp.down_proj.weight3
|
1095 |
+
output2950� �2layers.26.mlp.up_proj.weight=
|
1096 |
+
output296� 2)layers.26.post_attention_layernorm.weight5
|
1097 |
+
output2970� �2layers.26.mlp.gate_proj.weightA
|
1098 |
+
output298��2(layers.27.self_attn.o_proj.o_proj.weightA
|
1099 |
+
output299� �2*layers.27.self_attn.qkv_proj.v_proj.weight4
|
1100 |
+
output300� 2 layers.27.input_layernorm.weight9
|
1101 |
+
output301�2&layers.27.self_attn.k_layernorm.weightA
|
1102 |
+
output302� @2*layers.27.self_attn.qkv_proj.k_proj.weight9
|
1103 |
+
output303�2&layers.27.self_attn.q_layernorm.weightB
|
1104 |
+
output304� @2*layers.27.self_attn.qkv_proj.q_proj.weight6
|
1105 |
+
output305 ��2layers.27.mlp.down_proj.weight3
|
1106 |
+
output3060� �2layers.27.mlp.up_proj.weight=
|
1107 |
+
output307� 2)layers.27.post_attention_layernorm.weight5
|
1108 |
+
output3080� �2layers.27.mlp.gate_proj.weightA
|
1109 |
+
output309��2(layers.28.self_attn.o_proj.o_proj.weightA
|
1110 |
+
output310� �2*layers.28.self_attn.qkv_proj.v_proj.weight4
|
1111 |
+
output311� 2 layers.28.input_layernorm.weight9
|
1112 |
+
output312�2&layers.28.self_attn.k_layernorm.weightA
|
1113 |
+
output313� @2*layers.28.self_attn.qkv_proj.k_proj.weight9
|
1114 |
+
output314�2&layers.28.self_attn.q_layernorm.weightB
|
1115 |
+
output315� @2*layers.28.self_attn.qkv_proj.q_proj.weight6
|
1116 |
+
output316 ��2layers.28.mlp.down_proj.weight3
|
1117 |
+
output3170� �2layers.28.mlp.up_proj.weight=
|
1118 |
+
output318� 2)layers.28.post_attention_layernorm.weight5
|
1119 |
+
output3190� �2layers.28.mlp.gate_proj.weightA
|
1120 |
+
output320��2(layers.29.self_attn.o_proj.o_proj.weightA
|
1121 |
+
output321� �2*layers.29.self_attn.qkv_proj.v_proj.weight4
|
1122 |
+
output322� 2 layers.29.input_layernorm.weight9
|
1123 |
+
output323�2&layers.29.self_attn.k_layernorm.weightA
|
1124 |
+
output324� @2*layers.29.self_attn.qkv_proj.k_proj.weight9
|
1125 |
+
output325�2&layers.29.self_attn.q_layernorm.weightB
|
1126 |
+
output326� @2*layers.29.self_attn.qkv_proj.q_proj.weight6
|
1127 |
+
output327 ��2layers.29.mlp.down_proj.weight3
|
1128 |
+
output3280� �2layers.29.mlp.up_proj.weight=
|
1129 |
+
output329� 2)layers.29.post_attention_layernorm.weight5
|
1130 |
+
output3300� �2layers.29.mlp.gate_proj.weightA
|
1131 |
+
output331��2(layers.30.self_attn.o_proj.o_proj.weightA
|
1132 |
+
output332� �2*layers.30.self_attn.qkv_proj.v_proj.weight4
|
1133 |
+
output333� 2 layers.30.input_layernorm.weight9
|
1134 |
+
output334�2&layers.30.self_attn.k_layernorm.weightA
|
1135 |
+
output335� @2*layers.30.self_attn.qkv_proj.k_proj.weight9
|
1136 |
+
output336�2&layers.30.self_attn.q_layernorm.weightB
|
1137 |
+
output337� @2*layers.30.self_attn.qkv_proj.q_proj.weight6
|
1138 |
+
output338 ��2layers.30.mlp.down_proj.weight3
|
1139 |
+
output3390� �2layers.30.mlp.up_proj.weight=
|
1140 |
+
output340� 2)layers.30.post_attention_layernorm.weight5
|
1141 |
+
output3410� �2layers.30.mlp.gate_proj.weightA
|
1142 |
+
output342��2(layers.31.self_attn.o_proj.o_proj.weightA
|
1143 |
+
output343� �2*layers.31.self_attn.qkv_proj.v_proj.weight4
|
1144 |
+
output344� 2 layers.31.input_layernorm.weight9
|
1145 |
+
output345�2&layers.31.self_attn.k_layernorm.weightA
|
1146 |
+
output346� @2*layers.31.self_attn.qkv_proj.k_proj.weight9
|
1147 |
+
output347�2&layers.31.self_attn.q_layernorm.weightB
|
1148 |
+
output348� @2*layers.31.self_attn.qkv_proj.q_proj.weight6
|
1149 |
+
output349 ��2layers.31.mlp.down_proj.weight3
|
1150 |
+
output3500� �2layers.31.mlp.up_proj.weight=
|
1151 |
+
output351� 2)layers.31.post_attention_layernorm.weight5
|
1152 |
+
output3520� �2layers.31.mlp.gate_proj.weightA
|
1153 |
+
output353��2(layers.32.self_attn.o_proj.o_proj.weightA
|
1154 |
+
output354� �2*layers.32.self_attn.qkv_proj.v_proj.weight4
|
1155 |
+
output355� 2 layers.32.input_layernorm.weight9
|
1156 |
+
output356�2&layers.32.self_attn.k_layernorm.weightA
|
1157 |
+
output357� @2*layers.32.self_attn.qkv_proj.k_proj.weight9
|
1158 |
+
output358�2&layers.32.self_attn.q_layernorm.weightB
|
1159 |
+
output359� @2*layers.32.self_attn.qkv_proj.q_proj.weight6
|
1160 |
+
output360 ��2layers.32.mlp.down_proj.weight3
|
1161 |
+
output3610� �2layers.32.mlp.up_proj.weight=
|
1162 |
+
output362� 2)layers.32.post_attention_layernorm.weight5
|
1163 |
+
output3630� �2layers.32.mlp.gate_proj.weightA
|
1164 |
+
output364��2(layers.33.self_attn.o_proj.o_proj.weightA
|
1165 |
+
output365� �2*layers.33.self_attn.qkv_proj.v_proj.weight4
|
1166 |
+
output366� 2 layers.33.input_layernorm.weight9
|
1167 |
+
output367�2&layers.33.self_attn.k_layernorm.weightA
|
1168 |
+
output368� @2*layers.33.self_attn.qkv_proj.k_proj.weight9
|
1169 |
+
output369�2&layers.33.self_attn.q_layernorm.weightB
|
1170 |
+
output370� @2*layers.33.self_attn.qkv_proj.q_proj.weight6
|
1171 |
+
output371 ��2layers.33.mlp.down_proj.weight3
|
1172 |
+
output3720� �2layers.33.mlp.up_proj.weight=
|
1173 |
+
output373� 2)layers.33.post_attention_layernorm.weight5
|
1174 |
+
output3740� �2layers.33.mlp.gate_proj.weightA
|
1175 |
+
output375��2(layers.34.self_attn.o_proj.o_proj.weightA
|
1176 |
+
output376� �2*layers.34.self_attn.qkv_proj.v_proj.weight4
|
1177 |
+
output377� 2 layers.34.input_layernorm.weight9
|
1178 |
+
output378�2&layers.34.self_attn.k_layernorm.weightA
|
1179 |
+
output379� @2*layers.34.self_attn.qkv_proj.k_proj.weight9
|
1180 |
+
output380�2&layers.34.self_attn.q_layernorm.weightB
|
1181 |
+
output381� @2*layers.34.self_attn.qkv_proj.q_proj.weight6
|
1182 |
+
output382 ��2layers.34.mlp.down_proj.weight3
|
1183 |
+
output3830� �2layers.34.mlp.up_proj.weight=
|
1184 |
+
output384� 2)layers.34.post_attention_layernorm.weight5
|
1185 |
+
output3850� �2layers.34.mlp.gate_proj.weightA
|
1186 |
+
output386��2(layers.35.self_attn.o_proj.o_proj.weightA
|
1187 |
+
output387� �2*layers.35.self_attn.qkv_proj.v_proj.weight4
|
1188 |
+
output388� 2 layers.35.input_layernorm.weight9
|
1189 |
+
output389�2&layers.35.self_attn.k_layernorm.weightA
|
1190 |
+
output390� @2*layers.35.self_attn.qkv_proj.k_proj.weight9
|
1191 |
+
output391�2&layers.35.self_attn.q_layernorm.weightB
|
1192 |
+
output392� @2*layers.35.self_attn.qkv_proj.q_proj.weight6
|
1193 |
+
output393 ��2layers.35.mlp.down_proj.weight3
|
1194 |
+
output3940� �2layers.35.mlp.up_proj.weight=
|
1195 |
+
output395� 2)layers.35.post_attention_layernorm.weight5
|
1196 |
+
output3960� �2layers.35.mlp.gate_proj.weight$
|
1197 |
+
output397��� 2lm_head.weight
|
1198 |
+
output398� 2norm.weight
|
layout_opt/model/graph.hlo
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:12b45b028e502b2dd8c42c1287fbdbea434454143a30d473806853bc18673d98
|
3 |
+
size 211060
|
model.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c36077018a9f85728962cc73bfcba755ce1d5d5b6f608dacf65d7b95596eb109
|
3 |
+
size 47198475
|
neuron_config.json
ADDED
@@ -0,0 +1,218 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_attn_implementation_autoset": false,
|
3 |
+
"_name_or_path": "Qwen/Qwen3-8B",
|
4 |
+
"add_cross_attention": false,
|
5 |
+
"architectures": [
|
6 |
+
"Qwen3ForCausalLM"
|
7 |
+
],
|
8 |
+
"attention_bias": false,
|
9 |
+
"attention_dropout": 0.0,
|
10 |
+
"attribute_map": {},
|
11 |
+
"bad_words_ids": null,
|
12 |
+
"begin_suppress_tokens": null,
|
13 |
+
"bos_token_id": 151643,
|
14 |
+
"chunk_size_feed_forward": 0,
|
15 |
+
"cross_attention_hidden_size": null,
|
16 |
+
"decoder_start_token_id": null,
|
17 |
+
"diversity_penalty": 0.0,
|
18 |
+
"do_sample": false,
|
19 |
+
"early_stopping": false,
|
20 |
+
"encoder_no_repeat_ngram_size": 0,
|
21 |
+
"eos_token_id": 151645,
|
22 |
+
"exponential_decay_length_penalty": null,
|
23 |
+
"finetuning_task": null,
|
24 |
+
"forced_bos_token_id": null,
|
25 |
+
"forced_eos_token_id": null,
|
26 |
+
"fused_spec_config": null,
|
27 |
+
"head_dim": 128,
|
28 |
+
"hidden_act": "silu",
|
29 |
+
"hidden_size": 4096,
|
30 |
+
"id2label": {
|
31 |
+
"0": "LABEL_0",
|
32 |
+
"1": "LABEL_1"
|
33 |
+
},
|
34 |
+
"initializer_range": 0.02,
|
35 |
+
"intermediate_size": 12288,
|
36 |
+
"is_decoder": false,
|
37 |
+
"is_encoder_decoder": false,
|
38 |
+
"label2id": {
|
39 |
+
"LABEL_0": 0,
|
40 |
+
"LABEL_1": 1
|
41 |
+
},
|
42 |
+
"length_penalty": 1.0,
|
43 |
+
"max_length": 20,
|
44 |
+
"max_position_embeddings": 40960,
|
45 |
+
"max_window_layers": 36,
|
46 |
+
"metadata": null,
|
47 |
+
"min_length": 0,
|
48 |
+
"model_type": "qwen3",
|
49 |
+
"neuron_config": {
|
50 |
+
"activation_quantization_type": null,
|
51 |
+
"allow_input_truncation": false,
|
52 |
+
"apply_seq_ids_mask": false,
|
53 |
+
"async_mode": false,
|
54 |
+
"attention_dp_degree": 1,
|
55 |
+
"attention_dtype": null,
|
56 |
+
"attn_block_cte_nki_kernel_enabled": false,
|
57 |
+
"attn_block_tkg_nki_kernel_cache_update": false,
|
58 |
+
"attn_block_tkg_nki_kernel_enabled": false,
|
59 |
+
"attn_cls": {
|
60 |
+
"__module__": "neuronx_distributed_inference.models.qwen3.modeling_qwen3",
|
61 |
+
"__name__": "NeuronQwen3Attention"
|
62 |
+
},
|
63 |
+
"attn_kernel_enabled": null,
|
64 |
+
"attn_tkg_builtin_kernel_enabled": false,
|
65 |
+
"attn_tkg_nki_kernel_enabled": false,
|
66 |
+
"batch_size": 1,
|
67 |
+
"bucket_n_active_tokens": false,
|
68 |
+
"buckets": [
|
69 |
+
1024
|
70 |
+
],
|
71 |
+
"cast_type": "config",
|
72 |
+
"cc_pipeline_tiling_factor": 2,
|
73 |
+
"chunked_prefill_config": null,
|
74 |
+
"context_encoding_buckets": null,
|
75 |
+
"cp_degree": 1,
|
76 |
+
"ctx_batch_size": 1,
|
77 |
+
"disable_kv_cache_tiling": false,
|
78 |
+
"draft_model_modules_to_not_convert": null,
|
79 |
+
"enable_bucketing": true,
|
80 |
+
"enable_eagle_draft_input_norm": false,
|
81 |
+
"enable_eagle_speculation": false,
|
82 |
+
"enable_fused_speculation": false,
|
83 |
+
"enable_long_context_mode": false,
|
84 |
+
"enable_output_completion_notifications": false,
|
85 |
+
"enable_spill_reload_dge": false,
|
86 |
+
"enable_token_tree": false,
|
87 |
+
"ep_degree": 1,
|
88 |
+
"expert_mlp_nki_kernel_enabled": null,
|
89 |
+
"flash_decoding_enabled": false,
|
90 |
+
"fused_qkv": false,
|
91 |
+
"fused_rmsnorm_skip_gamma": false,
|
92 |
+
"is_block_kv_layout": null,
|
93 |
+
"is_chunked_prefill": false,
|
94 |
+
"is_continuous_batching": true,
|
95 |
+
"is_eagle_draft": false,
|
96 |
+
"is_medusa": false,
|
97 |
+
"is_prefill_stage": null,
|
98 |
+
"is_prefix_caching": false,
|
99 |
+
"k_cache_transposed": false,
|
100 |
+
"kv_cache_batch_size": 1,
|
101 |
+
"kv_cache_padding_size": 0,
|
102 |
+
"kv_cache_quant": false,
|
103 |
+
"kv_cache_tiling": false,
|
104 |
+
"layer_boundary_markers": false,
|
105 |
+
"lm_head_pad": false,
|
106 |
+
"lm_head_pad_alignment_size": 1,
|
107 |
+
"local_ranks_size": 2,
|
108 |
+
"logical_nc_config": 1,
|
109 |
+
"lora_config": null,
|
110 |
+
"max_batch_size": 1,
|
111 |
+
"max_context_length": 1024,
|
112 |
+
"max_length": 1024,
|
113 |
+
"max_new_tokens": null,
|
114 |
+
"medusa_speculation_length": 0,
|
115 |
+
"medusa_tree": null,
|
116 |
+
"mlp_kernel_enabled": false,
|
117 |
+
"mlp_kernel_fuse_residual_add": false,
|
118 |
+
"modules_to_not_convert": null,
|
119 |
+
"moe_fused_nki_kernel_enabled": null,
|
120 |
+
"n_active_tokens": 1024,
|
121 |
+
"n_positions": 1024,
|
122 |
+
"num_medusa_heads": 0,
|
123 |
+
"on_cpu": false,
|
124 |
+
"on_device_sampling_config": {
|
125 |
+
"deterministic": false,
|
126 |
+
"do_sample": false,
|
127 |
+
"dynamic": true,
|
128 |
+
"global_topk": 256,
|
129 |
+
"on_device_sampling_config": true,
|
130 |
+
"temperature": 1.0,
|
131 |
+
"top_k": 1,
|
132 |
+
"top_k_kernel_enabled": false,
|
133 |
+
"top_p": 1.0
|
134 |
+
},
|
135 |
+
"output_logits": false,
|
136 |
+
"overrides_torch_dtype": true,
|
137 |
+
"pa_block_size": 1024,
|
138 |
+
"pa_num_blocks": 1,
|
139 |
+
"padding_side": "right",
|
140 |
+
"pp_degree": 1,
|
141 |
+
"prefix_buckets": null,
|
142 |
+
"qk_layernorm": false,
|
143 |
+
"qkv_kernel_enabled": false,
|
144 |
+
"qkv_kernel_fuse_residual_add": false,
|
145 |
+
"qkv_kernel_nbsd_layout": false,
|
146 |
+
"quantization_dtype": "int8",
|
147 |
+
"quantization_type": "per_tensor_symmetric",
|
148 |
+
"quantize_clamp_bound": Infinity,
|
149 |
+
"quantized": false,
|
150 |
+
"quantized_checkpoints_path": null,
|
151 |
+
"quantized_mlp_kernel_enabled": false,
|
152 |
+
"rmsnorm_quantize_kernel_enabled": false,
|
153 |
+
"router_topk_nki_kernel_enabled": null,
|
154 |
+
"rpl_reduce_dtype": null,
|
155 |
+
"save_sharded_checkpoint": true,
|
156 |
+
"scratchpad_page_size": null,
|
157 |
+
"seq_len": 1024,
|
158 |
+
"seq_len_threshold_for_cc_tiling": 16384,
|
159 |
+
"sequence_parallel_enabled": false,
|
160 |
+
"shared_mlp_nki_kernel_enabled": null,
|
161 |
+
"skip_sharding": false,
|
162 |
+
"skip_warmup": false,
|
163 |
+
"spec_batch_size": 1,
|
164 |
+
"speculation_length": 0,
|
165 |
+
"start_rank_id": 0,
|
166 |
+
"target": null,
|
167 |
+
"tile_cc": false,
|
168 |
+
"tkg_batch_size": 1,
|
169 |
+
"token_generation_buckets": null,
|
170 |
+
"token_tree_config": null,
|
171 |
+
"torch_dtype": "bfloat16",
|
172 |
+
"tp_degree": 2,
|
173 |
+
"vocab_parallel": false,
|
174 |
+
"weight_gather_seq_len_threshold": 32768,
|
175 |
+
"weights_to_skip_layout_optimization": [],
|
176 |
+
"world_size": 2
|
177 |
+
},
|
178 |
+
"no_repeat_ngram_size": 0,
|
179 |
+
"num_attention_heads": 32,
|
180 |
+
"num_beam_groups": 1,
|
181 |
+
"num_beams": 1,
|
182 |
+
"num_cores_per_group": 1,
|
183 |
+
"num_hidden_layers": 36,
|
184 |
+
"num_key_value_heads": 8,
|
185 |
+
"num_return_sequences": 1,
|
186 |
+
"output_attentions": false,
|
187 |
+
"output_hidden_states": false,
|
188 |
+
"output_scores": false,
|
189 |
+
"pad_token_id": null,
|
190 |
+
"prefix": null,
|
191 |
+
"problem_type": null,
|
192 |
+
"pruned_heads": {},
|
193 |
+
"remove_invalid_values": false,
|
194 |
+
"repetition_penalty": 1.0,
|
195 |
+
"return_dict": true,
|
196 |
+
"return_dict_in_generate": false,
|
197 |
+
"rms_norm_eps": 1e-06,
|
198 |
+
"rope_scaling": null,
|
199 |
+
"rope_theta": 1000000,
|
200 |
+
"sep_token_id": null,
|
201 |
+
"sliding_window": null,
|
202 |
+
"suppress_tokens": null,
|
203 |
+
"task_specific_params": null,
|
204 |
+
"temperature": 1.0,
|
205 |
+
"tf_legacy_loss": false,
|
206 |
+
"tie_encoder_decoder": false,
|
207 |
+
"tie_word_embeddings": false,
|
208 |
+
"tokenizer_class": null,
|
209 |
+
"top_k": 50,
|
210 |
+
"top_p": 1.0,
|
211 |
+
"torchscript": false,
|
212 |
+
"transformers_version": "4.51.0",
|
213 |
+
"typical_p": 1.0,
|
214 |
+
"use_bfloat16": false,
|
215 |
+
"use_cache": true,
|
216 |
+
"use_sliding_window": false,
|
217 |
+
"vocab_size": 151936
|
218 |
+
}
|
token_generation_model/_tp0_bk0/command.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
neuronx-cc compile --framework=XLA model.MODULE_6ef5ba8b41fbbe77f080+74ae8282.hlo_module.pb --output model.MODULE_6ef5ba8b41fbbe77f080+74ae8282.neff --target=trn1 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma ' --lnc=1 -O2 --internal-hlo2tensorizer-options=--verify-hlo=true --logfile=log-neuron-cc.txt --enable-internal-neff-wrapper --verbose=35
|
token_generation_model/_tp0_bk0/compile_flags.MODULE_6ef5ba8b41fbbe77f080+74ae8282.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma ", "--lnc=1", "-O2", "--internal-hlo2tensorizer-options=--verify-hlo=true", "--logfile=/home/ubuntu/qwen3/token_generation_model/_tp0_bk0/log-neuron-cc.txt", "--enable-internal-neff-wrapper"]
|
token_generation_model/_tp0_bk0/global_metric_store.json
ADDED
@@ -0,0 +1,540 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"Average": {
|
3 |
+
"tensorizer": {
|
4 |
+
"StaticProfiler::AverageFractalPeUtilization": 99.8321762084961,
|
5 |
+
"StaticProfiler::AveragePartitionUtilization": 99.3888168334961,
|
6 |
+
"StaticProfiler::AveragePeUtilization": 99.65400695800781,
|
7 |
+
"StaticProfiler::LocalizationEfficiency": 109.9806137084961,
|
8 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 110.06793212890625,
|
9 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 0,
|
10 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 0
|
11 |
+
}
|
12 |
+
},
|
13 |
+
"Count": {
|
14 |
+
"tensorizer": {
|
15 |
+
"StaticProfiler::AverageFractalPeUtilization": 1,
|
16 |
+
"StaticProfiler::AveragePartitionUtilization": 1,
|
17 |
+
"StaticProfiler::AveragePeUtilization": 1,
|
18 |
+
"StaticProfiler::LocalizationEfficiency": 1,
|
19 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1,
|
20 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 1,
|
21 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 1
|
22 |
+
}
|
23 |
+
},
|
24 |
+
"Sum": {
|
25 |
+
"compiletime": {
|
26 |
+
"AGOrderingAnalysisPass": 1.4457588195800781,
|
27 |
+
"AffinePredicateResolution": 0.05167531967163086,
|
28 |
+
"AliasDependencyElimination": 0.0026276111602783203,
|
29 |
+
"AliasDependencyInduction": 0.44934630393981934,
|
30 |
+
"AliasDependencyReset": 1.2677826881408691,
|
31 |
+
"BFComputeCutting": 0.06423807144165039,
|
32 |
+
"BirCodeGenLoop": 2.421293258666992,
|
33 |
+
"CCOpFusion": 0.41050028800964355,
|
34 |
+
"CanonicalizeConv": 9.999999974752427e-07,
|
35 |
+
"CanonicalizeDAGForPGTiling": 0.21233797073364258,
|
36 |
+
"CanonicalizeForTensorizer": 0.0003640000068116933,
|
37 |
+
"CanonicalizeIR": 0.06626629829406738,
|
38 |
+
"Canonicalizer": 0.007044999860227108,
|
39 |
+
"CoalesceCCOp": 0.19146490097045898,
|
40 |
+
"CommuteConcat": 0.03319668769836426,
|
41 |
+
"DMALocalityOpt": 0.035207271575927734,
|
42 |
+
"DMAProfiler": 0.08866691589355469,
|
43 |
+
"DMATilingProfiler": 0.07109546661376953,
|
44 |
+
"DataLocalityOpt": 1.910703182220459,
|
45 |
+
"DataStreaming": 0.15389323234558105,
|
46 |
+
"DeConcat": 0.012087583541870117,
|
47 |
+
"DeadCodeElimination": 0.035611867904663086,
|
48 |
+
"DeadStoreElimination": 0.37193870544433594,
|
49 |
+
"DelinearIndices": 0.2894127368927002,
|
50 |
+
"Delinearization": 0.1295926570892334,
|
51 |
+
"DoNothing": 0.00019550323486328125,
|
52 |
+
"DramToDramTranspose": 1.0679569244384766,
|
53 |
+
"DumpGraphAndMetadata": 0.24142217636108398,
|
54 |
+
"EliminateDivs": 0.17337489128112793,
|
55 |
+
"ExpandBatchNorm": 0.06027984619140625,
|
56 |
+
"ExpandISAMacro": 0.0909569263458252,
|
57 |
+
"FactorizeBlkDims": 0.24945974349975586,
|
58 |
+
"FactorizeThreadAxesInFreeDims": 0.03613853454589844,
|
59 |
+
"FlattenMacroLoop": 0.26774168014526367,
|
60 |
+
"GenericAccessSimplifier": 0.03175926208496094,
|
61 |
+
"HoistCompute": 4.8000001697801054e-05,
|
62 |
+
"IdentifyCrossPassTensors": 0.00013600000238511711,
|
63 |
+
"InferInitValue": 1.029360294342041,
|
64 |
+
"InferIntrinsicOnCC": 0.34307408332824707,
|
65 |
+
"InferNeuronTensor": 1.7935998439788818,
|
66 |
+
"InferNonlocalTensors": 3.6307339668273926,
|
67 |
+
"InferPSumTensor": 0.9782986640930176,
|
68 |
+
"InlineNativeKernels": 0.05374264717102051,
|
69 |
+
"InsertIOTransposes": 1.162278652191162,
|
70 |
+
"InsertLocalTransposes": 1.0349645614624023,
|
71 |
+
"InsertOffloadedTransposes": 0.0943443775177002,
|
72 |
+
"LICM": 0.1061861515045166,
|
73 |
+
"LateLegalizeInst": 0.22754216194152832,
|
74 |
+
"LateLegalizePostSplit": 0.09247255325317383,
|
75 |
+
"LateLowerReshapeOp": 0.04053616523742676,
|
76 |
+
"LateLowerTensorOp": 0.3356895446777344,
|
77 |
+
"LateNeuronInstComb": 0.4516925811767578,
|
78 |
+
"LayoutPreprocessing": 0.9441671371459961,
|
79 |
+
"LayoutPreprocessingAndAnalysis": 1.2680203914642334,
|
80 |
+
"LayoutRequirementAnalysis": 0.309098482131958,
|
81 |
+
"LegalizeCCOpLayout": 0.07318258285522461,
|
82 |
+
"LegalizeOpLevelAlias": 0.03343796730041504,
|
83 |
+
"LegalizePartitionReduce": 0.034781694412231445,
|
84 |
+
"LegalizeSundaAccess": 1.4558701515197754,
|
85 |
+
"LegalizeSundaMacro": 0.37755250930786133,
|
86 |
+
"LegalizeType": 0.20858454704284668,
|
87 |
+
"LocalLayoutOpt": 0.36218762397766113,
|
88 |
+
"LoopFusion": 0.31240200996398926,
|
89 |
+
"LoopSplitting": 0.013066768646240234,
|
90 |
+
"LowerBroadcast": 0.047890663146972656,
|
91 |
+
"LowerCCOpBlockAxis": 0.23094987869262695,
|
92 |
+
"LowerComplexBroadcast": 0.15572404861450195,
|
93 |
+
"LowerIntrinsics": 1.228858470916748,
|
94 |
+
"LowerTensorOp": 0.4897449016571045,
|
95 |
+
"LowerTranspose": 0.3995330333709717,
|
96 |
+
"MacroGeneration": 2.335334062576294,
|
97 |
+
"MaskPropagation": 0.14433836936950684,
|
98 |
+
"MemcastMotion": 0.00013000000035390258,
|
99 |
+
"MemcpyElimination": 3.9867260456085205,
|
100 |
+
"MutateDataType": 0.04344511032104492,
|
101 |
+
"NeuronAliasDependencyInduction": 0.025929927825927734,
|
102 |
+
"NeuronAliasDependencyReset": 0.04254412651062012,
|
103 |
+
"NeuronInstComb": 0.19350981712341309,
|
104 |
+
"NeuronLICM": 0.2897522449493408,
|
105 |
+
"NeuronLoopFusion": 0.4089043140411377,
|
106 |
+
"NeuronLoopInterchange": 0.04476189613342285,
|
107 |
+
"NeuronSimplifier": 0.30055856704711914,
|
108 |
+
"NeuronSimplifyPredicates": 0.18221426010131836,
|
109 |
+
"NeuronValueNumbering": 0.10663247108459473,
|
110 |
+
"OptimizeAliasedCopyChain": 0.01511383056640625,
|
111 |
+
"OptimizeNKIKernels": 0.4606451988220215,
|
112 |
+
"PAGLayoutOpt": 26.32272720336914,
|
113 |
+
"PComputeCutting": 0.302201509475708,
|
114 |
+
"PGLayoutTilingPipeline": 38.88710403442383,
|
115 |
+
"PGTiling": 4.423768043518066,
|
116 |
+
"PadElimination": 0.008622884750366211,
|
117 |
+
"ParAxesAnnotation": 25.272018432617188,
|
118 |
+
"PartialLoopFusion": 0.2368309497833252,
|
119 |
+
"PartialSimdFusion": 0.20722246170043945,
|
120 |
+
"PenguinizeFunctions": 0.00015999999595806003,
|
121 |
+
"PerfectLoopNest": 0.06273055076599121,
|
122 |
+
"PruneFunctions": 0.00016700000560376793,
|
123 |
+
"RecognizeOpIdiom": 0.20455479621887207,
|
124 |
+
"Recompute": 0.00649714469909668,
|
125 |
+
"RelaxPredicates": 0.154876708984375,
|
126 |
+
"Rematerialization": 0.16764259338378906,
|
127 |
+
"RemoveOptimizationBarriers": 0.00014099999680183828,
|
128 |
+
"ReshapeWeights": 0.021569013595581055,
|
129 |
+
"ResolveAccessConflict": 0.24012255668640137,
|
130 |
+
"ResolveComplicatePredicates": 0.05034017562866211,
|
131 |
+
"RewriteReplicationMatmul": 0.04589343070983887,
|
132 |
+
"RewriteWeights": 0.05840659141540527,
|
133 |
+
"SFKVectorizer": 3.1227571964263916,
|
134 |
+
"ScatterMotion": 0.0041600000113248825,
|
135 |
+
"SimpleAllReduceTiling": 0.06594347953796387,
|
136 |
+
"Simplifier": 0.11366057395935059,
|
137 |
+
"SimplifyMacroPredicates": 0.18840670585632324,
|
138 |
+
"SimplifyNeuronTensor": 1.3299446105957031,
|
139 |
+
"SimplifySlice": 0.03386688232421875,
|
140 |
+
"SimplifyTensor": 0.21405529975891113,
|
141 |
+
"SpillPSum": 0.5441117286682129,
|
142 |
+
"SplitAPUnionSets": 0.3313255310058594,
|
143 |
+
"SplitAccGrp": 0.03839588165283203,
|
144 |
+
"StaticProfiler": 0.13296246528625488,
|
145 |
+
"StaticTransposeLocalTensor": 0.21724367141723633,
|
146 |
+
"SundaISel": 1.6302134990692139,
|
147 |
+
"TCTransform": 0.03438615798950195,
|
148 |
+
"TensorInitialization": 0.13414645195007324,
|
149 |
+
"TensorOpSimplifier": 0.27712535858154297,
|
150 |
+
"TensorOpTransform": 0.8646912574768066,
|
151 |
+
"TensorizerLegalizationPass": 0.000155999994603917,
|
152 |
+
"TileCCOps": 0.263721227645874,
|
153 |
+
"TilingProfiler": 0.39296984672546387,
|
154 |
+
"TransformConvOp": 0.06336498260498047,
|
155 |
+
"TritiumFusion": 1.0901517868041992,
|
156 |
+
"ValueNumbering": 0.09328150749206543,
|
157 |
+
"VectorizeDMA": 0.03394460678100586,
|
158 |
+
"VectorizeMatMult": 0.0209348201751709,
|
159 |
+
"VerifySupportedOps": 0.00023200000578071922,
|
160 |
+
"WeightCoalescing": 0.05484199523925781,
|
161 |
+
"ZeroSizeTensorElimination": 0.0004336833953857422,
|
162 |
+
"algsimp": 0.0020280000753700733,
|
163 |
+
"batchnorm_expander": 0.0007249999907799065,
|
164 |
+
"boundary-marker-removal": 0.0004140000091865659,
|
165 |
+
"call-inliner": 0.0002570000069681555,
|
166 |
+
"canonicalize-boundary-marker": 0.00044800000614486635,
|
167 |
+
"collective-stream-id-checker": 7.000000186963007e-05,
|
168 |
+
"comparison-expander": 0.00041700000292621553,
|
169 |
+
"computation-deduplicator": 0.0004440000047907233,
|
170 |
+
"conditional-to-select": 8.70000003487803e-05,
|
171 |
+
"config-lowering": 0.00020700000459328294,
|
172 |
+
"constant_folding": 0.00016900000628083944,
|
173 |
+
"cse": 0.00043799998820759356,
|
174 |
+
"dce": 3.899999865097925e-05,
|
175 |
+
"dynamic-slice-transpose": 0.00015799999528098851,
|
176 |
+
"eliminate-redundant-compare": 0.0001539999939268455,
|
177 |
+
"emit-offloaded-dropout": 0.0002770000137388706,
|
178 |
+
"flatten-call-graph": 0.000299000006634742,
|
179 |
+
"fuse-send-recv": 0.0015030000358819962,
|
180 |
+
"hilo::LegalizeAlias": 0.003281000070273876,
|
181 |
+
"hilo::NeuronInstCombine": 0.0011020000092685223,
|
182 |
+
"hilo::NeuronOpFusion": 0.0003429999924264848,
|
183 |
+
"hilo::ReplaceTokenTypeWithU8Pass": 0.00018600000475998968,
|
184 |
+
"hilo::ScheduleFusion": 3.5000000934815034e-05,
|
185 |
+
"hilo::SixtyFourHack": 0.00020599999697878957,
|
186 |
+
"hilo::VerifyAliasing": 7.000000186963007e-05,
|
187 |
+
"hlo-mac-count": 0.0006559999892488122,
|
188 |
+
"hlo-verifier": 0.006031000055372715,
|
189 |
+
"io-con-pipe-begin": 4.999999873689376e-06,
|
190 |
+
"io-con-pipe-end": 9.999999974752427e-07,
|
191 |
+
"io-layout-normalization": 0.0009500000160187483,
|
192 |
+
"legalize-ccops": 1.700000029813964e-05,
|
193 |
+
"legalize-compare": 0.00036899998667649925,
|
194 |
+
"lower-argminmax-custom-call": 0.00013800000306218863,
|
195 |
+
"map-inline": 0.0006319999811239541,
|
196 |
+
"metadata-naming": 0.0009749999735504389,
|
197 |
+
"mlir::detail::OpToOpPassAdaptor": 0.00022499999613501132,
|
198 |
+
"mlir::hlo::MhloToPyPenguin": 0.025104999542236328,
|
199 |
+
"mlir::mhlo::LowerComplexExtraPass": 0.002770999912172556,
|
200 |
+
"mlir::mhlo::LowerComplexPass": 0.001180000021122396,
|
201 |
+
"native-to-custom-softmax": 0.00041199999395757914,
|
202 |
+
"native-to-custom-softmax-dx": 0.00042600001324899495,
|
203 |
+
"operand_upcaster": 0.0007089999853633344,
|
204 |
+
"post-par-pipe-begin": 9.999999974752427e-07,
|
205 |
+
"post-par-pipe-end": 0.0,
|
206 |
+
"post-partition-simplification": 0.05639899894595146,
|
207 |
+
"pre-hlo-begin": 4.999999873689376e-06,
|
208 |
+
"pre-hlo-end": 9.999999974752427e-07,
|
209 |
+
"replace-minimum-constant": 0.0002209999947808683,
|
210 |
+
"reshape-mover": 7.400000322377309e-05,
|
211 |
+
"simplify-concat": 0.0018210000125691295,
|
212 |
+
"simplify-while-loops": 5.500000042957254e-05,
|
213 |
+
"transform-variadic-reduce": 0.0006440000142902136,
|
214 |
+
"tuple-simplifier": 0.00016700000560376793,
|
215 |
+
"unpack-nested-aws-ntwsr": 0.00035700001171790063,
|
216 |
+
"unroll-while-loop": 1.1000000085914508e-05
|
217 |
+
},
|
218 |
+
"hilo": {
|
219 |
+
"HloMacCount": 3802996736.0,
|
220 |
+
"Traffic": 8267154432.0
|
221 |
+
},
|
222 |
+
"tensorizer": {
|
223 |
+
"DMATilingProfiler::TotalInstructionsAfterTiling": 262321,
|
224 |
+
"StaticProfiler::AifUb": 10.559271812438965,
|
225 |
+
"StaticProfiler::ArithmeticIntensityTensorizer": 11.613152503967285,
|
226 |
+
"StaticProfiler::AverageDmaLength": 6652.8759765625,
|
227 |
+
"StaticProfiler::DDRTransferBytes": 7587185496,
|
228 |
+
"StaticProfiler::InternalTransferBytes": 632323092,
|
229 |
+
"StaticProfiler::LoadExpanded": 1033407,
|
230 |
+
"StaticProfiler::StoreExpanded": 3422,
|
231 |
+
"StaticProfiler::TotalDMAExpanded": 1036829,
|
232 |
+
"StaticProfiler::TotalDynamicInstancesCount": 275548,
|
233 |
+
"StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 274994,
|
234 |
+
"StaticProfiler::TotalLNCComm": 0,
|
235 |
+
"StaticProfiler::TotalLNCCommTransfer": 0,
|
236 |
+
"TilingProfiler::BatchnormInstructionsAfterTiling": 0,
|
237 |
+
"TilingProfiler::DmaInstructionsAfterTiling": 0,
|
238 |
+
"TilingProfiler::GenericInstructionsAfterTiling": 79,
|
239 |
+
"TilingProfiler::MatMultInstructionsAfterTiling": 231408,
|
240 |
+
"TilingProfiler::NumPfTransposes": 398,
|
241 |
+
"TilingProfiler::NumPfTransposesForIo": 37,
|
242 |
+
"TilingProfiler::NumPfTransposesForLocal": 216,
|
243 |
+
"TilingProfiler::NumPfTransposesForNonlocal": 145,
|
244 |
+
"TilingProfiler::PfTransposeInstructions": 19513,
|
245 |
+
"TilingProfiler::PfTransposeInstructionsForIo": 19152,
|
246 |
+
"TilingProfiler::PfTransposeInstructionsForLocal": 216,
|
247 |
+
"TilingProfiler::PfTransposeInstructionsForNonlocal": 145,
|
248 |
+
"TilingProfiler::ReduceInstructionsAfterTiling": 74,
|
249 |
+
"TilingProfiler::SimdInstructionsAfterTiling": 2999,
|
250 |
+
"TilingProfiler::TotalInstructionsAfterTiling": 0,
|
251 |
+
"TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0,
|
252 |
+
"TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0,
|
253 |
+
"TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0,
|
254 |
+
"TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0,
|
255 |
+
"TransformConvOp::conv2d_column_packing": 0,
|
256 |
+
"TransformConvOp::conv2d_column_packing_1": 0,
|
257 |
+
"TransformConvOp::conv2d_column_packing_io10": 0,
|
258 |
+
"TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0
|
259 |
+
}
|
260 |
+
},
|
261 |
+
"all": {
|
262 |
+
"compiletime": {
|
263 |
+
"CanonicalizeConv": 9.999999974752427e-07,
|
264 |
+
"CanonicalizeForTensorizer": 0.0003640000068116933,
|
265 |
+
"Canonicalizer": 0.007044999860227108,
|
266 |
+
"HoistCompute": 4.8000001697801054e-05,
|
267 |
+
"IdentifyCrossPassTensors": 0.00013600000238511711,
|
268 |
+
"MemcastMotion": 0.00013000000035390258,
|
269 |
+
"PenguinizeFunctions": 0.00015999999595806003,
|
270 |
+
"PruneFunctions": 0.00016700000560376793,
|
271 |
+
"RemoveOptimizationBarriers": 0.00014099999680183828,
|
272 |
+
"ScatterMotion": 0.0041600000113248825,
|
273 |
+
"TensorizerLegalizationPass": 0.000155999994603917,
|
274 |
+
"VerifySupportedOps": 0.00023200000578071922,
|
275 |
+
"algsimp": 0.0020280000753700733,
|
276 |
+
"batchnorm_expander": 0.0007249999907799065,
|
277 |
+
"boundary-marker-removal": 0.0004140000091865659,
|
278 |
+
"call-inliner": 0.0002570000069681555,
|
279 |
+
"canonicalize-boundary-marker": 0.00044800000614486635,
|
280 |
+
"collective-stream-id-checker": 7.000000186963007e-05,
|
281 |
+
"comparison-expander": 0.00041700000292621553,
|
282 |
+
"computation-deduplicator": 0.0004440000047907233,
|
283 |
+
"conditional-to-select": 8.70000003487803e-05,
|
284 |
+
"config-lowering": 0.00020700000459328294,
|
285 |
+
"constant_folding": 0.00016900000628083944,
|
286 |
+
"cse": 0.00043799998820759356,
|
287 |
+
"dce": 3.899999865097925e-05,
|
288 |
+
"dynamic-slice-transpose": 0.00015799999528098851,
|
289 |
+
"eliminate-redundant-compare": 0.0001539999939268455,
|
290 |
+
"emit-offloaded-dropout": 0.0002770000137388706,
|
291 |
+
"flatten-call-graph": 0.000299000006634742,
|
292 |
+
"fuse-send-recv": 0.0015030000358819962,
|
293 |
+
"hilo::LegalizeAlias": 0.003281000070273876,
|
294 |
+
"hilo::NeuronInstCombine": 0.0011020000092685223,
|
295 |
+
"hilo::NeuronOpFusion": 0.0003429999924264848,
|
296 |
+
"hilo::ReplaceTokenTypeWithU8Pass": 0.00018600000475998968,
|
297 |
+
"hilo::ScheduleFusion": 3.5000000934815034e-05,
|
298 |
+
"hilo::SixtyFourHack": 0.00020599999697878957,
|
299 |
+
"hilo::VerifyAliasing": 7.000000186963007e-05,
|
300 |
+
"hlo-mac-count": 0.0006559999892488122,
|
301 |
+
"hlo-verifier": 0.006031000055372715,
|
302 |
+
"io-con-pipe-begin": 4.999999873689376e-06,
|
303 |
+
"io-con-pipe-end": 9.999999974752427e-07,
|
304 |
+
"io-layout-normalization": 0.0009500000160187483,
|
305 |
+
"legalize-ccops": 1.700000029813964e-05,
|
306 |
+
"legalize-compare": 0.00036899998667649925,
|
307 |
+
"lower-argminmax-custom-call": 0.00013800000306218863,
|
308 |
+
"map-inline": 0.0006319999811239541,
|
309 |
+
"metadata-naming": 0.0009749999735504389,
|
310 |
+
"mlir::detail::OpToOpPassAdaptor": 0.00022499999613501132,
|
311 |
+
"mlir::hlo::MhloToPyPenguin": 0.025104999542236328,
|
312 |
+
"mlir::mhlo::LowerComplexExtraPass": 0.002770999912172556,
|
313 |
+
"mlir::mhlo::LowerComplexPass": 0.001180000021122396,
|
314 |
+
"native-to-custom-softmax": 0.00041199999395757914,
|
315 |
+
"native-to-custom-softmax-dx": 0.00042600001324899495,
|
316 |
+
"operand_upcaster": 0.0007089999853633344,
|
317 |
+
"post-par-pipe-begin": 9.999999974752427e-07,
|
318 |
+
"post-par-pipe-end": 0.0,
|
319 |
+
"post-partition-simplification": 0.05639899894595146,
|
320 |
+
"pre-hlo-begin": 4.999999873689376e-06,
|
321 |
+
"pre-hlo-end": 9.999999974752427e-07,
|
322 |
+
"replace-minimum-constant": 0.0002209999947808683,
|
323 |
+
"reshape-mover": 7.400000322377309e-05,
|
324 |
+
"simplify-concat": 0.0018210000125691295,
|
325 |
+
"simplify-while-loops": 5.500000042957254e-05,
|
326 |
+
"transform-variadic-reduce": 0.0006440000142902136,
|
327 |
+
"tuple-simplifier": 0.00016700000560376793,
|
328 |
+
"unpack-nested-aws-ntwsr": 0.00035700001171790063,
|
329 |
+
"unroll-while-loop": 1.1000000085914508e-05
|
330 |
+
}
|
331 |
+
},
|
332 |
+
"cumsum": {
|
333 |
+
"compiletime": {
|
334 |
+
"CoalesceCCOp": 0.0008378028869628906,
|
335 |
+
"DMALocalityOpt": 0.0003306865692138672,
|
336 |
+
"DMAProfiler": 0.0007596015930175781,
|
337 |
+
"DataStreaming": 0.0002918243408203125,
|
338 |
+
"DoNothing": 0.00012636184692382813,
|
339 |
+
"ExpandISAMacro": 0.0005497932434082031,
|
340 |
+
"FactorizeBlkDims": 0.0004723072052001953,
|
341 |
+
"InferPSumTensor": 0.000583648681640625,
|
342 |
+
"LateLegalizeInst": 0.00040459632873535156,
|
343 |
+
"LateNeuronInstComb": 0.0004837512969970703,
|
344 |
+
"LegalizeSundaAccess": 0.0015611648559570313,
|
345 |
+
"LegalizeType": 0.00025010108947753906,
|
346 |
+
"LowerBroadcast": 0.0009808540344238281,
|
347 |
+
"LowerIntrinsics": 0.0002262592315673828,
|
348 |
+
"LowerTranspose": 0.00021767616271972656,
|
349 |
+
"NeuronInstComb": 0.0004963874816894531,
|
350 |
+
"NeuronLICM": 0.0006859302520751953,
|
351 |
+
"NeuronSimplifyPredicates": 0.002815723419189453,
|
352 |
+
"NeuronValueNumbering": 0.0004124641418457031,
|
353 |
+
"SFKVectorizer": 0.0027742385864257813,
|
354 |
+
"SimpleAllReduceTiling": 0.000209808349609375,
|
355 |
+
"SimplifyNeuronTensor": 0.00040721893310546875,
|
356 |
+
"SpillPSum": 0.0009286403656005859,
|
357 |
+
"WeightCoalescing": 0.0002105236053466797
|
358 |
+
}
|
359 |
+
},
|
360 |
+
"sg00": {
|
361 |
+
"hilo": {
|
362 |
+
"ArithmeticIntensity": 0.9200255870819092,
|
363 |
+
"HloMacCount": 3802996736.0,
|
364 |
+
"Traffic": 8267154432.0
|
365 |
+
}
|
366 |
+
},
|
367 |
+
"sg0000": {
|
368 |
+
"compiletime": {
|
369 |
+
"AGOrderingAnalysisPass": 1.4457588195800781,
|
370 |
+
"AffinePredicateResolution": 0.05167531967163086,
|
371 |
+
"AliasDependencyElimination": 0.0026276111602783203,
|
372 |
+
"AliasDependencyInduction": 0.44934630393981934,
|
373 |
+
"AliasDependencyReset": 1.2677826881408691,
|
374 |
+
"BFComputeCutting": 0.06423807144165039,
|
375 |
+
"BirCodeGenLoop": 2.421293258666992,
|
376 |
+
"CCOpFusion": 0.41050028800964355,
|
377 |
+
"CanonicalizeDAGForPGTiling": 0.21233797073364258,
|
378 |
+
"CanonicalizeIR": 0.06626629829406738,
|
379 |
+
"CoalesceCCOp": 0.1906270980834961,
|
380 |
+
"CommuteConcat": 0.03319668769836426,
|
381 |
+
"DMALocalityOpt": 0.03487658500671387,
|
382 |
+
"DMAProfiler": 0.08790731430053711,
|
383 |
+
"DMATilingProfiler": 0.07109546661376953,
|
384 |
+
"DataLocalityOpt": 1.910703182220459,
|
385 |
+
"DataStreaming": 0.15360140800476074,
|
386 |
+
"DeConcat": 0.012087583541870117,
|
387 |
+
"DeadCodeElimination": 0.035611867904663086,
|
388 |
+
"DeadStoreElimination": 0.37193870544433594,
|
389 |
+
"DelinearIndices": 0.2894127368927002,
|
390 |
+
"Delinearization": 0.1295926570892334,
|
391 |
+
"DoNothing": 6.914138793945313e-05,
|
392 |
+
"DramToDramTranspose": 1.0679569244384766,
|
393 |
+
"DumpGraphAndMetadata": 0.24142217636108398,
|
394 |
+
"EliminateDivs": 0.17337489128112793,
|
395 |
+
"ExpandBatchNorm": 0.06027984619140625,
|
396 |
+
"ExpandISAMacro": 0.09040713310241699,
|
397 |
+
"FactorizeBlkDims": 0.24898743629455566,
|
398 |
+
"FactorizeThreadAxesInFreeDims": 0.03613853454589844,
|
399 |
+
"FlattenMacroLoop": 0.26774168014526367,
|
400 |
+
"GenericAccessSimplifier": 0.03175926208496094,
|
401 |
+
"InferInitValue": 1.029360294342041,
|
402 |
+
"InferIntrinsicOnCC": 0.34307408332824707,
|
403 |
+
"InferNeuronTensor": 1.7935998439788818,
|
404 |
+
"InferNonlocalTensors": 3.6307339668273926,
|
405 |
+
"InferPSumTensor": 0.977715015411377,
|
406 |
+
"InlineNativeKernels": 0.05374264717102051,
|
407 |
+
"InsertIOTransposes": 1.162278652191162,
|
408 |
+
"InsertLocalTransposes": 1.0349645614624023,
|
409 |
+
"InsertOffloadedTransposes": 0.0943443775177002,
|
410 |
+
"LICM": 0.1061861515045166,
|
411 |
+
"LateLegalizeInst": 0.22713756561279297,
|
412 |
+
"LateLegalizePostSplit": 0.09247255325317383,
|
413 |
+
"LateLowerReshapeOp": 0.04053616523742676,
|
414 |
+
"LateLowerTensorOp": 0.3356895446777344,
|
415 |
+
"LateNeuronInstComb": 0.45120882987976074,
|
416 |
+
"LayoutPreprocessing": 0.9441671371459961,
|
417 |
+
"LayoutPreprocessingAndAnalysis": 1.2680203914642334,
|
418 |
+
"LayoutRequirementAnalysis": 0.309098482131958,
|
419 |
+
"LegalizeCCOpLayout": 0.07318258285522461,
|
420 |
+
"LegalizeOpLevelAlias": 0.03343796730041504,
|
421 |
+
"LegalizePartitionReduce": 0.034781694412231445,
|
422 |
+
"LegalizeSundaAccess": 1.4543089866638184,
|
423 |
+
"LegalizeSundaMacro": 0.37755250930786133,
|
424 |
+
"LegalizeType": 0.20833444595336914,
|
425 |
+
"LocalLayoutOpt": 0.36218762397766113,
|
426 |
+
"LoopFusion": 0.31240200996398926,
|
427 |
+
"LoopSplitting": 0.013066768646240234,
|
428 |
+
"LowerBroadcast": 0.04690980911254883,
|
429 |
+
"LowerCCOpBlockAxis": 0.23094987869262695,
|
430 |
+
"LowerComplexBroadcast": 0.15572404861450195,
|
431 |
+
"LowerIntrinsics": 1.2286322116851807,
|
432 |
+
"LowerTensorOp": 0.4897449016571045,
|
433 |
+
"LowerTranspose": 0.39931535720825195,
|
434 |
+
"MacroGeneration": 2.335334062576294,
|
435 |
+
"MaskPropagation": 0.14433836936950684,
|
436 |
+
"MemcpyElimination": 3.9867260456085205,
|
437 |
+
"MutateDataType": 0.04344511032104492,
|
438 |
+
"NeuronAliasDependencyInduction": 0.025929927825927734,
|
439 |
+
"NeuronAliasDependencyReset": 0.04254412651062012,
|
440 |
+
"NeuronInstComb": 0.19301342964172363,
|
441 |
+
"NeuronLICM": 0.2890663146972656,
|
442 |
+
"NeuronLoopFusion": 0.4089043140411377,
|
443 |
+
"NeuronLoopInterchange": 0.04476189613342285,
|
444 |
+
"NeuronSimplifier": 0.30055856704711914,
|
445 |
+
"NeuronSimplifyPredicates": 0.1793985366821289,
|
446 |
+
"NeuronValueNumbering": 0.10622000694274902,
|
447 |
+
"OptimizeAliasedCopyChain": 0.01511383056640625,
|
448 |
+
"OptimizeNKIKernels": 0.4606451988220215,
|
449 |
+
"PAGLayoutOpt": 26.32272720336914,
|
450 |
+
"PComputeCutting": 0.302201509475708,
|
451 |
+
"PGLayoutTilingPipeline": 38.88710403442383,
|
452 |
+
"PGTiling": 4.423768043518066,
|
453 |
+
"PadElimination": 0.008622884750366211,
|
454 |
+
"ParAxesAnnotation": 25.272018432617188,
|
455 |
+
"PartialLoopFusion": 0.2368309497833252,
|
456 |
+
"PartialSimdFusion": 0.20722246170043945,
|
457 |
+
"PerfectLoopNest": 0.06273055076599121,
|
458 |
+
"RecognizeOpIdiom": 0.20455479621887207,
|
459 |
+
"Recompute": 0.00649714469909668,
|
460 |
+
"RelaxPredicates": 0.154876708984375,
|
461 |
+
"Rematerialization": 0.16764259338378906,
|
462 |
+
"ReshapeWeights": 0.021569013595581055,
|
463 |
+
"ResolveAccessConflict": 0.24012255668640137,
|
464 |
+
"ResolveComplicatePredicates": 0.05034017562866211,
|
465 |
+
"RewriteReplicationMatmul": 0.04589343070983887,
|
466 |
+
"RewriteWeights": 0.05840659141540527,
|
467 |
+
"SFKVectorizer": 3.119982957839966,
|
468 |
+
"SimpleAllReduceTiling": 0.06573367118835449,
|
469 |
+
"Simplifier": 0.11366057395935059,
|
470 |
+
"SimplifyMacroPredicates": 0.18840670585632324,
|
471 |
+
"SimplifyNeuronTensor": 1.3295373916625977,
|
472 |
+
"SimplifySlice": 0.03386688232421875,
|
473 |
+
"SimplifyTensor": 0.21405529975891113,
|
474 |
+
"SpillPSum": 0.5431830883026123,
|
475 |
+
"SplitAPUnionSets": 0.3313255310058594,
|
476 |
+
"SplitAccGrp": 0.03839588165283203,
|
477 |
+
"StaticProfiler": 0.13296246528625488,
|
478 |
+
"StaticTransposeLocalTensor": 0.21724367141723633,
|
479 |
+
"SundaISel": 1.6302134990692139,
|
480 |
+
"TCTransform": 0.03438615798950195,
|
481 |
+
"TensorInitialization": 0.13414645195007324,
|
482 |
+
"TensorOpSimplifier": 0.27712535858154297,
|
483 |
+
"TensorOpTransform": 0.8646912574768066,
|
484 |
+
"TileCCOps": 0.263721227645874,
|
485 |
+
"TilingProfiler": 0.39296984672546387,
|
486 |
+
"TransformConvOp": 0.06336498260498047,
|
487 |
+
"TritiumFusion": 1.0901517868041992,
|
488 |
+
"ValueNumbering": 0.09328150749206543,
|
489 |
+
"VectorizeDMA": 0.03394460678100586,
|
490 |
+
"VectorizeMatMult": 0.0209348201751709,
|
491 |
+
"WeightCoalescing": 0.05463147163391113,
|
492 |
+
"ZeroSizeTensorElimination": 0.0004336833953857422
|
493 |
+
},
|
494 |
+
"tensorizer": {
|
495 |
+
"DMATilingProfiler::TotalInstructionsAfterTiling": 262321,
|
496 |
+
"StaticProfiler::AifUb": 10.559271812438965,
|
497 |
+
"StaticProfiler::ArithmeticIntensityTensorizer": 11.613152503967285,
|
498 |
+
"StaticProfiler::AverageDmaLength": 6652.8759765625,
|
499 |
+
"StaticProfiler::AverageFractalPeUtilization": 99.8321762084961,
|
500 |
+
"StaticProfiler::AveragePartitionUtilization": 99.3888168334961,
|
501 |
+
"StaticProfiler::AveragePeUtilization": 99.65400695800781,
|
502 |
+
"StaticProfiler::DDRTransferBytes": 7587185496,
|
503 |
+
"StaticProfiler::InternalTransferBytes": 632323092,
|
504 |
+
"StaticProfiler::LoadExpanded": 1033407,
|
505 |
+
"StaticProfiler::LocalizationEfficiency": 109.9806137084961,
|
506 |
+
"StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 110.06793212890625,
|
507 |
+
"StaticProfiler::StoreExpanded": 3422,
|
508 |
+
"StaticProfiler::TotalDMAExpanded": 1036829,
|
509 |
+
"StaticProfiler::TotalDynamicInstancesCount": 275548,
|
510 |
+
"StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 274994,
|
511 |
+
"StaticProfiler::TotalLNCComm": 0,
|
512 |
+
"StaticProfiler::TotalLNCCommTransfer": 0,
|
513 |
+
"TilingProfiler::AveragePartitionUtilizationAfterTiling": 0,
|
514 |
+
"TilingProfiler::AveragePeUtilizationAfterTiling": 0,
|
515 |
+
"TilingProfiler::BatchnormInstructionsAfterTiling": 0,
|
516 |
+
"TilingProfiler::DmaInstructionsAfterTiling": 0,
|
517 |
+
"TilingProfiler::GenericInstructionsAfterTiling": 79,
|
518 |
+
"TilingProfiler::MatMultInstructionsAfterTiling": 231408,
|
519 |
+
"TilingProfiler::NumPfTransposes": 398,
|
520 |
+
"TilingProfiler::NumPfTransposesForIo": 37,
|
521 |
+
"TilingProfiler::NumPfTransposesForLocal": 216,
|
522 |
+
"TilingProfiler::NumPfTransposesForNonlocal": 145,
|
523 |
+
"TilingProfiler::PfTransposeInstructions": 19513,
|
524 |
+
"TilingProfiler::PfTransposeInstructionsForIo": 19152,
|
525 |
+
"TilingProfiler::PfTransposeInstructionsForLocal": 216,
|
526 |
+
"TilingProfiler::PfTransposeInstructionsForNonlocal": 145,
|
527 |
+
"TilingProfiler::ReduceInstructionsAfterTiling": 74,
|
528 |
+
"TilingProfiler::SimdInstructionsAfterTiling": 2999,
|
529 |
+
"TilingProfiler::TotalInstructionsAfterTiling": 0,
|
530 |
+
"TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0,
|
531 |
+
"TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0,
|
532 |
+
"TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0,
|
533 |
+
"TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0,
|
534 |
+
"TransformConvOp::conv2d_column_packing": 0,
|
535 |
+
"TransformConvOp::conv2d_column_packing_1": 0,
|
536 |
+
"TransformConvOp::conv2d_column_packing_io10": 0,
|
537 |
+
"TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0
|
538 |
+
}
|
539 |
+
}
|
540 |
+
}
|
token_generation_model/_tp0_bk0/graph.neff
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:82be447a0a308a6e83990d1f3d193b4dc43ab835b136e7c27647ecf6cde94383
|
3 |
+
size 6001664
|
token_generation_model/_tp0_bk0/log-neuron-cc.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
token_generation_model/_tp0_bk0/metaneff.pb
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8f42b279a662fc21e6bb94ab8bdb96ad553535cec385b6c8909a4e7622fad939
|
3 |
+
size 985283
|