jburtoft commited on
Commit
ee61cf7
·
verified ·
1 Parent(s): d713d23

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +38 -33
  2. context_encoding_model/_tp0_bk0/command.txt +1 -0
  3. context_encoding_model/_tp0_bk0/compile_flags.MODULE_f4171003694760566af4+a9cd68fb.json +1 -0
  4. context_encoding_model/_tp0_bk0/global_metric_store.json +1079 -0
  5. context_encoding_model/_tp0_bk0/graph.neff +3 -0
  6. context_encoding_model/_tp0_bk0/log-neuron-cc.txt +0 -0
  7. context_encoding_model/_tp0_bk0/metaneff.pb +3 -0
  8. context_encoding_model/_tp0_bk0/model.MODULE_f4171003694760566af4+a9cd68fb.hlo_module.pb +3 -0
  9. context_encoding_model/_tp0_bk0/model.MODULE_f4171003694760566af4+a9cd68fb.neff +3 -0
  10. context_encoding_model/_tp0_bk0/neuron_config.json +220 -0
  11. context_encoding_model/_tp0_bk1/command.txt +1 -0
  12. context_encoding_model/_tp0_bk1/compile_flags.MODULE_2914133a46cb7b4660ab+d7af8a84.json +1 -0
  13. context_encoding_model/_tp0_bk1/global_metric_store.json +1079 -0
  14. context_encoding_model/_tp0_bk1/graph.neff +3 -0
  15. context_encoding_model/_tp0_bk1/log-neuron-cc.txt +0 -0
  16. context_encoding_model/_tp0_bk1/metaneff.pb +3 -0
  17. context_encoding_model/_tp0_bk1/model.MODULE_2914133a46cb7b4660ab+d7af8a84.hlo_module.pb +3 -0
  18. context_encoding_model/_tp0_bk1/model.MODULE_2914133a46cb7b4660ab+d7af8a84.neff +3 -0
  19. context_encoding_model/_tp0_bk1/neuron_config.json +220 -0
  20. context_encoding_model/_tp0_bk2/command.txt +1 -0
  21. context_encoding_model/_tp0_bk2/compile_flags.MODULE_00594b8bc68e927f3dbe+1ad60ced.json +1 -0
  22. context_encoding_model/_tp0_bk2/global_metric_store.json +1079 -0
  23. context_encoding_model/_tp0_bk2/graph.neff +3 -0
  24. context_encoding_model/_tp0_bk2/log-neuron-cc.txt +0 -0
  25. context_encoding_model/_tp0_bk2/metaneff.pb +3 -0
  26. context_encoding_model/_tp0_bk2/model.MODULE_00594b8bc68e927f3dbe+1ad60ced.hlo_module.pb +3 -0
  27. context_encoding_model/_tp0_bk2/model.MODULE_00594b8bc68e927f3dbe+1ad60ced.neff +3 -0
  28. context_encoding_model/_tp0_bk2/neuron_config.json +220 -0
  29. context_encoding_model/_tp0_bk3/command.txt +1 -0
  30. context_encoding_model/_tp0_bk3/compile_flags.MODULE_b3ddbc97e5f0d1d64c82+155de413.json +1 -0
  31. context_encoding_model/_tp0_bk3/global_metric_store.json +1079 -0
  32. context_encoding_model/_tp0_bk3/graph.neff +3 -0
  33. context_encoding_model/_tp0_bk3/log-neuron-cc.txt +0 -0
  34. context_encoding_model/_tp0_bk3/metaneff.pb +3 -0
  35. context_encoding_model/_tp0_bk3/model.MODULE_b3ddbc97e5f0d1d64c82+155de413.hlo_module.pb +3 -0
  36. context_encoding_model/_tp0_bk3/model.MODULE_b3ddbc97e5f0d1d64c82+155de413.neff +3 -0
  37. context_encoding_model/_tp0_bk3/neuron_config.json +220 -0
  38. layout_opt/command.txt +1 -0
  39. layout_opt/graph.neff +3 -0
  40. layout_opt/log-neuron-cc.txt +0 -0
  41. layout_opt/metaneff +1198 -0
  42. layout_opt/model/graph.hlo +3 -0
  43. model.pt +3 -0
  44. neuron_config.json +218 -0
  45. token_generation_model/_tp0_bk0/command.txt +1 -0
  46. token_generation_model/_tp0_bk0/compile_flags.MODULE_6ef5ba8b41fbbe77f080+74ae8282.json +1 -0
  47. token_generation_model/_tp0_bk0/global_metric_store.json +540 -0
  48. token_generation_model/_tp0_bk0/graph.neff +3 -0
  49. token_generation_model/_tp0_bk0/log-neuron-cc.txt +0 -0
  50. token_generation_model/_tp0_bk0/metaneff.pb +3 -0
.gitattributes CHANGED
@@ -1,35 +1,40 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
  *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
  *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
2
  *.h5 filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
3
  *.onnx filter=lfs diff=lfs merge=lfs -text
4
+ *.bin filter=lfs diff=lfs merge=lfs -text
5
+ context_encoding_model/_tp0_bk0/graph.neff filter=lfs diff=lfs merge=lfs -text
6
+ context_encoding_model/_tp0_bk0/metaneff.pb filter=lfs diff=lfs merge=lfs -text
7
+ context_encoding_model/_tp0_bk0/model.MODULE_f4171003694760566af4+a9cd68fb.hlo_module.pb filter=lfs diff=lfs merge=lfs -text
8
+ context_encoding_model/_tp0_bk0/model.MODULE_f4171003694760566af4+a9cd68fb.neff filter=lfs diff=lfs merge=lfs -text
9
+ context_encoding_model/_tp0_bk1/graph.neff filter=lfs diff=lfs merge=lfs -text
10
+ context_encoding_model/_tp0_bk1/metaneff.pb filter=lfs diff=lfs merge=lfs -text
11
+ context_encoding_model/_tp0_bk1/model.MODULE_2914133a46cb7b4660ab+d7af8a84.hlo_module.pb filter=lfs diff=lfs merge=lfs -text
12
+ context_encoding_model/_tp0_bk1/model.MODULE_2914133a46cb7b4660ab+d7af8a84.neff filter=lfs diff=lfs merge=lfs -text
13
+ context_encoding_model/_tp0_bk2/graph.neff filter=lfs diff=lfs merge=lfs -text
14
+ context_encoding_model/_tp0_bk2/metaneff.pb filter=lfs diff=lfs merge=lfs -text
15
+ context_encoding_model/_tp0_bk2/model.MODULE_00594b8bc68e927f3dbe+1ad60ced.hlo_module.pb filter=lfs diff=lfs merge=lfs -text
16
+ context_encoding_model/_tp0_bk2/model.MODULE_00594b8bc68e927f3dbe+1ad60ced.neff filter=lfs diff=lfs merge=lfs -text
17
+ context_encoding_model/_tp0_bk3/graph.neff filter=lfs diff=lfs merge=lfs -text
18
+ context_encoding_model/_tp0_bk3/metaneff.pb filter=lfs diff=lfs merge=lfs -text
19
+ context_encoding_model/_tp0_bk3/model.MODULE_b3ddbc97e5f0d1d64c82+155de413.hlo_module.pb filter=lfs diff=lfs merge=lfs -text
20
+ context_encoding_model/_tp0_bk3/model.MODULE_b3ddbc97e5f0d1d64c82+155de413.neff filter=lfs diff=lfs merge=lfs -text
21
+ layout_opt/graph.neff filter=lfs diff=lfs merge=lfs -text
22
+ layout_opt/model/graph.hlo filter=lfs diff=lfs merge=lfs -text
23
+ model.pt filter=lfs diff=lfs merge=lfs -text
24
+ token_generation_model/_tp0_bk0/graph.neff filter=lfs diff=lfs merge=lfs -text
25
+ token_generation_model/_tp0_bk0/metaneff.pb filter=lfs diff=lfs merge=lfs -text
26
+ token_generation_model/_tp0_bk0/model.MODULE_6ef5ba8b41fbbe77f080+74ae8282.hlo_module.pb filter=lfs diff=lfs merge=lfs -text
27
+ token_generation_model/_tp0_bk0/model.MODULE_6ef5ba8b41fbbe77f080+74ae8282.neff filter=lfs diff=lfs merge=lfs -text
28
+ token_generation_model/_tp0_bk0/wrapped_neff.hlo filter=lfs diff=lfs merge=lfs -text
29
+ token_generation_model/_tp0_bk1/graph.neff filter=lfs diff=lfs merge=lfs -text
30
+ token_generation_model/_tp0_bk1/metaneff.pb filter=lfs diff=lfs merge=lfs -text
31
+ token_generation_model/_tp0_bk1/model.MODULE_d608453625db6ed38994+e5eecdd4.hlo_module.pb filter=lfs diff=lfs merge=lfs -text
32
+ token_generation_model/_tp0_bk1/model.MODULE_d608453625db6ed38994+e5eecdd4.neff filter=lfs diff=lfs merge=lfs -text
33
+ token_generation_model/_tp0_bk2/graph.neff filter=lfs diff=lfs merge=lfs -text
34
+ token_generation_model/_tp0_bk2/metaneff.pb filter=lfs diff=lfs merge=lfs -text
35
+ token_generation_model/_tp0_bk2/model.MODULE_0ae1021f5dbf9cbac54d+2aa9c8c9.hlo_module.pb filter=lfs diff=lfs merge=lfs -text
36
+ token_generation_model/_tp0_bk2/model.MODULE_0ae1021f5dbf9cbac54d+2aa9c8c9.neff filter=lfs diff=lfs merge=lfs -text
37
+ token_generation_model/_tp0_bk3/graph.neff filter=lfs diff=lfs merge=lfs -text
38
+ token_generation_model/_tp0_bk3/metaneff.pb filter=lfs diff=lfs merge=lfs -text
39
+ token_generation_model/_tp0_bk3/model.MODULE_d3ed4857bd8baeff8023+b05cff0a.hlo_module.pb filter=lfs diff=lfs merge=lfs -text
40
+ token_generation_model/_tp0_bk3/model.MODULE_d3ed4857bd8baeff8023+b05cff0a.neff filter=lfs diff=lfs merge=lfs -text
context_encoding_model/_tp0_bk0/command.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ neuronx-cc compile --framework=XLA model.MODULE_f4171003694760566af4+a9cd68fb.hlo_module.pb --output model.MODULE_f4171003694760566af4+a9cd68fb.neff --target=trn1 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ' --lnc=1 -O1 '--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true' --logfile=log-neuron-cc.txt --verbose=35
context_encoding_model/_tp0_bk0/compile_flags.MODULE_f4171003694760566af4+a9cd68fb.json ADDED
@@ -0,0 +1 @@
 
 
1
+ ["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "--lnc=1", "-O1", "--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true", "--logfile=/home/ubuntu/qwen3/context_encoding_model/_tp0_bk0/log-neuron-cc.txt"]
context_encoding_model/_tp0_bk0/global_metric_store.json ADDED
@@ -0,0 +1,1079 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Average": {
3
+ "tensorizer": {
4
+ "StaticProfiler::AverageFractalPeUtilization": 99.65389251708984,
5
+ "StaticProfiler::AveragePartitionUtilization": 97.55139923095703,
6
+ "StaticProfiler::AveragePeUtilization": 98.60253143310547,
7
+ "StaticProfiler::LocalizationEfficiency": 99.04553985595703,
8
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 100.20111846923828,
9
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
10
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0
11
+ }
12
+ },
13
+ "Count": {
14
+ "tensorizer": {
15
+ "StaticProfiler::AverageFractalPeUtilization": 1.0,
16
+ "StaticProfiler::AveragePartitionUtilization": 1.0,
17
+ "StaticProfiler::AveragePeUtilization": 1.0,
18
+ "StaticProfiler::LocalizationEfficiency": 1.0,
19
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1.0,
20
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 1.0,
21
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 1.0
22
+ }
23
+ },
24
+ "Sum": {
25
+ "compiletime": {
26
+ "AGOrderingAnalysisPass": 0.018787622451782227,
27
+ "AffinePredicateResolution": 0.0011818408966064453,
28
+ "AliasDependencyElimination": 0.00011801719665527344,
29
+ "AliasDependencyInduction": 0.005483388900756836,
30
+ "AliasDependencyReset": 0.026019811630249023,
31
+ "BFComputeCutting": 0.00225830078125,
32
+ "BirCodeGenLoop": 0.4621126651763916,
33
+ "CCOpFusion": 0.01928091049194336,
34
+ "CanonicalizeConv": 3.7000001611886546e-05,
35
+ "CanonicalizeDAGForPGTiling": 0.004612922668457031,
36
+ "CanonicalizeForTensorizer": 4.099999932805076e-05,
37
+ "CanonicalizeIR": 0.0017774105072021484,
38
+ "Canonicalizer": 0.0009619999909773469,
39
+ "CoalesceCCOp": 0.0146026611328125,
40
+ "CommuteConcat": 0.0020241737365722656,
41
+ "DMALocalityOpt": 0.005425214767456055,
42
+ "DMAProfiler": 0.012541055679321289,
43
+ "DMATilingProfiler": 0.004782676696777344,
44
+ "DataLocalityOpt": 0.06629562377929688,
45
+ "DataStreaming": 0.03773355484008789,
46
+ "DeConcat": 0.0006563663482666016,
47
+ "DeadCodeElimination": 0.002358675003051758,
48
+ "DeadStoreElimination": 0.0055620670318603516,
49
+ "DelinearIndices": 0.004741668701171875,
50
+ "Delinearization": 0.0036110877990722656,
51
+ "DoNothing": 0.00022459030151367188,
52
+ "DramToDramTranspose": 0.016016721725463867,
53
+ "DumpGraphAndMetadata": 0.0853111743927002,
54
+ "EliminateDivs": 0.0025675296783447266,
55
+ "ExpandBatchNorm": 0.002092123031616211,
56
+ "ExpandISAMacro": 0.011052370071411133,
57
+ "FactorizeBlkDims": 0.00814366340637207,
58
+ "FactorizeThreadAxesInFreeDims": 0.002122640609741211,
59
+ "FlattenMacroLoop": 0.002187013626098633,
60
+ "GenericAccessSimplifier": 0.0009529590606689453,
61
+ "HoistCompute": 6.000000212225132e-06,
62
+ "IdentifyCrossPassTensors": 7.700000423938036e-05,
63
+ "InferInitValue": 0.0242159366607666,
64
+ "InferIntrinsicOnCC": 0.009269952774047852,
65
+ "InferNeuronTensor": 0.020155906677246094,
66
+ "InferNonlocalTensors": 0.015646696090698242,
67
+ "InferPSumTensor": 0.3081786632537842,
68
+ "InlineNativeKernels": 0.009155511856079102,
69
+ "InsertIOTransposes": 0.015281438827514648,
70
+ "InsertLocalTransposes": 0.006501436233520508,
71
+ "InsertOffloadedTransposes": 0.002702474594116211,
72
+ "LICM": 0.002913951873779297,
73
+ "LateLegalizeInst": 0.014158487319946289,
74
+ "LateLegalizePostSplit": 0.012693405151367188,
75
+ "LateLowerReshapeOp": 0.0025734901428222656,
76
+ "LateLowerTensorOp": 0.001531362533569336,
77
+ "LateNeuronInstComb": 0.008838176727294922,
78
+ "LayoutPreprocessing": 0.026634931564331055,
79
+ "LayoutPreprocessingAndAnalysis": 0.5595176219940186,
80
+ "LayoutRequirementAnalysis": 0.005538463592529297,
81
+ "LegalizeCCOpLayout": 0.0022728443145751953,
82
+ "LegalizeOpLevelAlias": 0.001255035400390625,
83
+ "LegalizePartitionReduce": 0.001256704330444336,
84
+ "LegalizeSundaAccess": 0.07711672782897949,
85
+ "LegalizeSundaMacro": 0.010920286178588867,
86
+ "LegalizeType": 0.01314401626586914,
87
+ "LocalLayoutOpt": 0.012011289596557617,
88
+ "LoopFusion": 0.006572723388671875,
89
+ "LoopSplitting": 0.0003001689910888672,
90
+ "LowerBroadcast": 0.0018808841705322266,
91
+ "LowerCCOpBlockAxis": 0.0050678253173828125,
92
+ "LowerComplexBroadcast": 0.0025262832641601563,
93
+ "LowerIntrinsics": 0.3039369583129883,
94
+ "LowerTensorOp": 0.011744022369384766,
95
+ "LowerTranspose": 0.011741399765014648,
96
+ "MacroGeneration": 0.026911020278930664,
97
+ "MaskPropagation": 0.0031325817108154297,
98
+ "MemcastMotion": 2.2000000171829015e-05,
99
+ "MemcpyElimination": 0.027472257614135742,
100
+ "MutateDataType": 0.0015196800231933594,
101
+ "NeuronAliasDependencyInduction": 0.00016927719116210938,
102
+ "NeuronAliasDependencyReset": 0.0242006778717041,
103
+ "NeuronInstComb": 0.00468754768371582,
104
+ "NeuronLICM": 0.03664875030517578,
105
+ "NeuronLoopFusion": 0.00889277458190918,
106
+ "NeuronLoopInterchange": 0.002141237258911133,
107
+ "NeuronSimplifier": 0.00720524787902832,
108
+ "NeuronSimplifyPredicates": 0.12209796905517578,
109
+ "NeuronValueNumbering": 0.003449678421020508,
110
+ "OptimizeAliasedCopyChain": 0.0006387233734130859,
111
+ "OptimizeNKIKernels": 0.5260024070739746,
112
+ "PAGLayoutOpt": 0.5680239200592041,
113
+ "PComputeCutting": 0.0048143863677978516,
114
+ "PGLayoutTilingPipeline": 1.6304676532745361,
115
+ "PGTiling": 0.1616363525390625,
116
+ "PadElimination": 0.0003521442413330078,
117
+ "ParAxesAnnotation": 0.0544736385345459,
118
+ "PartialLoopFusion": 0.005907773971557617,
119
+ "PartialSimdFusion": 0.0038967132568359375,
120
+ "PenguinizeFunctions": 3.900000228895806e-05,
121
+ "PerfectLoopNest": 0.0021576881408691406,
122
+ "PruneFunctions": 3.5000000934815034e-05,
123
+ "RecognizeOpIdiom": 0.0039520263671875,
124
+ "Recompute": 0.0002884864807128906,
125
+ "RelaxPredicates": 0.013870716094970703,
126
+ "Rematerialization": 0.0024657249450683594,
127
+ "RemoveOptimizationBarriers": 6.500000017695129e-05,
128
+ "ReshapeWeights": 0.0006930828094482422,
129
+ "ResolveAccessConflict": 0.0038983821868896484,
130
+ "ResolveComplicatePredicates": 0.0012950897216796875,
131
+ "RewriteReplicationMatmul": 0.002060413360595703,
132
+ "RewriteWeights": 0.0028791427612304688,
133
+ "SFKVectorizer": 0.2904393672943115,
134
+ "ScatterMotion": 2.8000000384054147e-05,
135
+ "SimpleAllReduceTiling": 0.008909463882446289,
136
+ "Simplifier": 0.003449678421020508,
137
+ "SimplifyMacroPredicates": 0.010317325592041016,
138
+ "SimplifyNeuronTensor": 1.038323163986206,
139
+ "SimplifySlice": 0.0008852481842041016,
140
+ "SimplifyTensor": 0.005218982696533203,
141
+ "SpillPSum": 0.010073423385620117,
142
+ "SplitAPUnionSets": 0.10591006278991699,
143
+ "SplitAccGrp": 0.0011169910430908203,
144
+ "StaticProfiler": 0.01290583610534668,
145
+ "StaticTransposeLocalTensor": 0.003824472427368164,
146
+ "SundaISel": 0.041872262954711914,
147
+ "TCTransform": 0.0008666515350341797,
148
+ "TensorInitialization": 0.013058185577392578,
149
+ "TensorOpSimplifier": 0.0061550140380859375,
150
+ "TensorOpTransform": 0.020328521728515625,
151
+ "TensorizerLegalizationPass": 6.900000153109431e-05,
152
+ "TileCCOps": 0.006834983825683594,
153
+ "TilingProfiler": 0.0072863101959228516,
154
+ "TransformConvOp": 0.0032320022583007813,
155
+ "TritiumFusion": 0.03062152862548828,
156
+ "ValueNumbering": 0.0023603439331054688,
157
+ "VectorizeDMA": 0.004430294036865234,
158
+ "VectorizeMatMult": 0.0021605491638183594,
159
+ "VerifySupportedOps": 3.300000025774352e-05,
160
+ "WeightCoalescing": 0.00846409797668457,
161
+ "ZeroSizeTensorElimination": 0.00011014938354492188,
162
+ "algsimp": 0.004399999976158142,
163
+ "batchnorm_expander": 3.600000127335079e-05,
164
+ "boundary-marker-removal": 1.2000000424450263e-05,
165
+ "call-inliner": 0.0007670000777579844,
166
+ "canonicalize-boundary-marker": 1.4999999621068127e-05,
167
+ "collective-stream-id-checker": 7.300000288523734e-05,
168
+ "comparison-expander": 0.0006099999882280827,
169
+ "computation-deduplicator": 5.8999998145736754e-05,
170
+ "conditional-to-select": 1.5999999959603883e-05,
171
+ "config-lowering": 8.70000003487803e-05,
172
+ "constant-statistics": 0.0005649999948218465,
173
+ "constant_folding": 0.0005520000122487545,
174
+ "cse": 3.600000127335079e-05,
175
+ "dce": 0.00014599999121855944,
176
+ "dot_decomposer": 0.0013859999598935246,
177
+ "dynamic-slice-transpose": 1.2000000424450263e-05,
178
+ "eliminate-redundant-compare": 0.0004949999856762588,
179
+ "emit-offloaded-dropout": 3.80000019504223e-05,
180
+ "flatten-call-graph": 0.0009339999523945153,
181
+ "fuse-send-recv": 7.100000220816582e-05,
182
+ "hilo::LegalizeAlias": 1.1999999514955562e-05,
183
+ "hilo::NeuronInstCombine": 0.00010099999781232327,
184
+ "hilo::NeuronOpFusion": 4.7999998059822246e-05,
185
+ "hilo::ReplaceTokenTypeWithU8Pass": 3.899999865097925e-05,
186
+ "hilo::ScheduleFusion": 1.9999999949504854e-06,
187
+ "hilo::SixtyFourHack": 6.200000643730164e-05,
188
+ "hilo::VerifyAliasing": 4.999999873689376e-06,
189
+ "hlo-mac-count": 0.0018479999853298068,
190
+ "hlo-verifier": 0.007563999388366938,
191
+ "instruction-histogram": 0.002553999889642,
192
+ "io-con-pipe-begin": 4.999999873689376e-06,
193
+ "io-con-pipe-end": 9.999999974752427e-07,
194
+ "io-layout-normalization": 0.0013040000339969993,
195
+ "io-statistics": 6.500000017695129e-05,
196
+ "legalize-ccops": 3.000000106112566e-06,
197
+ "legalize-compare": 1.1000000085914508e-05,
198
+ "lower-argminmax-custom-call": 1.2000000424450263e-05,
199
+ "map-inline": 0.0008759999764151871,
200
+ "metadata-naming": 6.0999998822808266e-05,
201
+ "mlir::detail::OpToOpPassAdaptor": 7.200000254670158e-05,
202
+ "mlir::hlo::MhloToPyPenguin": 0.002776999957859516,
203
+ "mlir::mhlo::LowerComplexExtraPass": 0.00023499999952036887,
204
+ "mlir::mhlo::LowerComplexPass": 0.00032500000088475645,
205
+ "native-to-custom-softmax": 0.0007319999858736992,
206
+ "native-to-custom-softmax-dx": 0.000678999989759177,
207
+ "operand_upcaster": 4.900000203633681e-05,
208
+ "opt-barrier-removal": 0.0005629999795928597,
209
+ "post-par-pipe-begin": 9.000000318337698e-06,
210
+ "post-par-pipe-end": 0.0,
211
+ "post-partition-simplification": 0.001663000090047717,
212
+ "pre-par-pipe-begin": 9.999999974752427e-07,
213
+ "pre-par-pipe-end": 9.999999974752427e-07,
214
+ "pre-partition-simplification": 0.13888800144195557,
215
+ "replace-minimum-constant": 0.0007169999880716205,
216
+ "reshape-mover": 0.00021499999274965376,
217
+ "simplify-concat": 0.00014099999680183828,
218
+ "simplify-while-loops": 0.00017800000205170363,
219
+ "transform-variadic-reduce": 6.70000008540228e-05,
220
+ "tuple-simplifier": 0.0005469999159686267,
221
+ "unpack-nested-aws-ntwsr": 0.00046300000394694507,
222
+ "unroll-while-loop": 3.099999958067201e-05,
223
+ "zero_sized_hlo_elimination": 0.0008880000095814466
224
+ },
225
+ "hilo": {
226
+ "ConstantSize": 304437.0,
227
+ "HloInputCount": 475.0,
228
+ "HloMacCount": 25141444608.0,
229
+ "HloOutputCount": 73.0,
230
+ "IfmapSize": 8266542080.0,
231
+ "OfmapSize": 75497472.0,
232
+ "OutputsReadFromCount": 0.0,
233
+ "PassthroughTensorsCount": 0.0,
234
+ "RedundantOutputCount": 0.0,
235
+ "Traffic": 1649111936.0
236
+ },
237
+ "tensorizer": {
238
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 42834.0,
239
+ "StaticProfiler::AifUb": 129.43267822265625,
240
+ "StaticProfiler::ArithmeticIntensityTensorizer": 128.19729614257813,
241
+ "StaticProfiler::AverageDmaLength": 4810.17578125,
242
+ "StaticProfiler::DDRTransferBytes": 782946624.0,
243
+ "StaticProfiler::InternalTransferBytes": 629086720.0,
244
+ "StaticProfiler::LoadExpanded": 97814.0,
245
+ "StaticProfiler::StoreExpanded": 1757.0,
246
+ "StaticProfiler::TotalDMAExpanded": 99571.0,
247
+ "StaticProfiler::TotalDynamicInstancesCount": 50031.0,
248
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 49585.0,
249
+ "StaticProfiler::TotalLNCComm": 0.0,
250
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
251
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
252
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
253
+ "TilingProfiler::GenericInstructionsAfterTiling": 4.0,
254
+ "TilingProfiler::MatMultInstructionsAfterTiling": 22464.0,
255
+ "TilingProfiler::NumPfTransposes": 5.0,
256
+ "TilingProfiler::NumPfTransposesForIo": 1.0,
257
+ "TilingProfiler::NumPfTransposesForLocal": 1.0,
258
+ "TilingProfiler::NumPfTransposesForNonlocal": 3.0,
259
+ "TilingProfiler::PfTransposeInstructions": 19105.0,
260
+ "TilingProfiler::PfTransposeInstructionsForIo": 19008.0,
261
+ "TilingProfiler::PfTransposeInstructionsForLocal": 1.0,
262
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 96.0,
263
+ "TilingProfiler::ReduceInstructionsAfterTiling": 4.0,
264
+ "TilingProfiler::SimdInstructionsAfterTiling": 158.0,
265
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
266
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
267
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
268
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
269
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
270
+ "TransformConvOp::conv2d_column_packing": 0.0,
271
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
272
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
273
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
274
+ }
275
+ },
276
+ "all": {
277
+ "compiletime": {
278
+ "algsimp": 0.004207999911159277,
279
+ "call-inliner": 0.0007350000087171793,
280
+ "collective-stream-id-checker": 6.399999983841553e-05,
281
+ "comparison-expander": 0.0005949999904260039,
282
+ "constant-statistics": 0.0005649999948218465,
283
+ "constant_folding": 0.0005249999812804163,
284
+ "dce": 0.0001429999974789098,
285
+ "dot_decomposer": 0.0013859999598935246,
286
+ "eliminate-redundant-compare": 0.0004839999892283231,
287
+ "flatten-call-graph": 0.000901999999769032,
288
+ "hlo-mac-count": 0.0016270000487565994,
289
+ "hlo-verifier": 0.007029999978840351,
290
+ "instruction-histogram": 0.002553999889642,
291
+ "io-con-pipe-begin": 4.999999873689376e-06,
292
+ "io-con-pipe-end": 9.999999974752427e-07,
293
+ "io-layout-normalization": 0.0013040000339969993,
294
+ "io-statistics": 6.500000017695129e-05,
295
+ "map-inline": 0.000838999985717237,
296
+ "native-to-custom-softmax": 0.0007050000131130219,
297
+ "native-to-custom-softmax-dx": 0.0005089999758638442,
298
+ "opt-barrier-removal": 0.0005629999795928597,
299
+ "pre-par-pipe-begin": 9.999999974752427e-07,
300
+ "pre-par-pipe-end": 9.999999974752427e-07,
301
+ "pre-partition-simplification": 0.13888800144195557,
302
+ "replace-minimum-constant": 0.0006949999951757491,
303
+ "reshape-mover": 0.00020500000391621143,
304
+ "simplify-while-loops": 0.0001720000000204891,
305
+ "tuple-simplifier": 0.0005319999763742089,
306
+ "unpack-nested-aws-ntwsr": 0.000450999999884516,
307
+ "unroll-while-loop": 2.9000000722589903e-05,
308
+ "zero_sized_hlo_elimination": 0.0008880000095814466
309
+ }
310
+ },
311
+ "cumsum": {
312
+ "compiletime": {
313
+ "CoalesceCCOp": 0.00020933151245117188,
314
+ "DMALocalityOpt": 0.0001666545867919922,
315
+ "DMAProfiler": 0.0008401870727539063,
316
+ "DataStreaming": 0.0002658367156982422,
317
+ "DoNothing": 0.00014090538024902344,
318
+ "ExpandISAMacro": 0.0004999637603759766,
319
+ "FactorizeBlkDims": 0.00046062469482421875,
320
+ "InferPSumTensor": 0.0004820823669433594,
321
+ "LateLegalizeInst": 0.0004343986511230469,
322
+ "LateNeuronInstComb": 0.0004832744598388672,
323
+ "LegalizeSundaAccess": 0.002238750457763672,
324
+ "LegalizeType": 0.0002429485321044922,
325
+ "LowerBroadcast": 0.0002453327178955078,
326
+ "LowerIntrinsics": 0.00021791458129882813,
327
+ "LowerTranspose": 0.00022292137145996094,
328
+ "NeuronInstComb": 0.0005400180816650391,
329
+ "NeuronLICM": 0.0003840923309326172,
330
+ "NeuronSimplifyPredicates": 0.0028014183044433594,
331
+ "NeuronValueNumbering": 0.00042724609375,
332
+ "SFKVectorizer": 0.0028204917907714844,
333
+ "SimpleAllReduceTiling": 0.0002048015594482422,
334
+ "SimplifyNeuronTensor": 0.00043082237243652344,
335
+ "SpillPSum": 0.0005221366882324219,
336
+ "WeightCoalescing": 0.00020456314086914063
337
+ }
338
+ },
339
+ "sg00": {
340
+ "compiletime": {
341
+ "CanonicalizeConv": 1.4000000192027073e-05,
342
+ "CanonicalizeForTensorizer": 1.4999999621068127e-05,
343
+ "Canonicalizer": 0.0003440000000409782,
344
+ "HoistCompute": 3.000000106112566e-06,
345
+ "IdentifyCrossPassTensors": 3.099999958067201e-05,
346
+ "MemcastMotion": 1.2000000424450263e-05,
347
+ "PenguinizeFunctions": 1.5999999959603883e-05,
348
+ "PruneFunctions": 1.2999999853491317e-05,
349
+ "RemoveOptimizationBarriers": 2.4000000848900527e-05,
350
+ "ScatterMotion": 1.1000000085914508e-05,
351
+ "TensorizerLegalizationPass": 2.9000000722589903e-05,
352
+ "VerifySupportedOps": 1.1000000085914508e-05,
353
+ "algsimp": 6.70000008540228e-05,
354
+ "batchnorm_expander": 1.2999999853491317e-05,
355
+ "boundary-marker-removal": 3.999999989900971e-06,
356
+ "call-inliner": 1.1000000085914508e-05,
357
+ "canonicalize-boundary-marker": 6.000000212225132e-06,
358
+ "collective-stream-id-checker": 3.000000106112566e-06,
359
+ "comparison-expander": 4.999999873689376e-06,
360
+ "computation-deduplicator": 1.700000029813964e-05,
361
+ "conditional-to-select": 4.999999873689376e-06,
362
+ "config-lowering": 3.199999991920777e-05,
363
+ "constant_folding": 9.000000318337698e-06,
364
+ "cse": 1.2999999853491317e-05,
365
+ "dce": 9.999999974752427e-07,
366
+ "dynamic-slice-transpose": 3.999999989900971e-06,
367
+ "eliminate-redundant-compare": 3.999999989900971e-06,
368
+ "emit-offloaded-dropout": 1.2999999853491317e-05,
369
+ "flatten-call-graph": 1.1000000085914508e-05,
370
+ "fuse-send-recv": 2.4000000848900527e-05,
371
+ "hilo::LegalizeAlias": 4.999999873689376e-06,
372
+ "hilo::NeuronInstCombine": 6.0999998822808266e-05,
373
+ "hilo::NeuronOpFusion": 7.000000096013537e-06,
374
+ "hilo::ReplaceTokenTypeWithU8Pass": 1.5999999959603883e-05,
375
+ "hilo::ScheduleFusion": 9.999999974752427e-07,
376
+ "hilo::SixtyFourHack": 1.4999999621068127e-05,
377
+ "hilo::VerifyAliasing": 1.9999999949504854e-06,
378
+ "hlo-mac-count": 2.5999999706982635e-05,
379
+ "hlo-verifier": 0.00018699999782256782,
380
+ "legalize-ccops": 9.999999974752427e-07,
381
+ "legalize-compare": 3.999999989900971e-06,
382
+ "lower-argminmax-custom-call": 3.999999989900971e-06,
383
+ "map-inline": 1.2000000424450263e-05,
384
+ "metadata-naming": 2.099999983329326e-05,
385
+ "mlir::detail::OpToOpPassAdaptor": 2.300000051036477e-05,
386
+ "mlir::hlo::MhloToPyPenguin": 0.0010349999647587538,
387
+ "mlir::mhlo::LowerComplexExtraPass": 8.600000001024455e-05,
388
+ "mlir::mhlo::LowerComplexPass": 0.0001740000006975606,
389
+ "native-to-custom-softmax": 1.4000000192027073e-05,
390
+ "native-to-custom-softmax-dx": 0.0001340000017080456,
391
+ "operand_upcaster": 1.8000000636675395e-05,
392
+ "post-par-pipe-begin": 6.000000212225132e-06,
393
+ "post-par-pipe-end": 0.0,
394
+ "post-partition-simplification": 0.0005830000154674053,
395
+ "replace-minimum-constant": 7.000000096013537e-06,
396
+ "reshape-mover": 3.999999989900971e-06,
397
+ "simplify-concat": 4.8000001697801054e-05,
398
+ "simplify-while-loops": 1.9999999949504854e-06,
399
+ "transform-variadic-reduce": 9.000000318337698e-06,
400
+ "tuple-simplifier": 4.999999873689376e-06,
401
+ "unpack-nested-aws-ntwsr": 3.999999989900971e-06,
402
+ "unroll-while-loop": 0.0
403
+ },
404
+ "hilo": {
405
+ "ArithmeticIntensity": 8.198826789855957,
406
+ "ConstantSize": 304437.0,
407
+ "HloInputCount": 475.0,
408
+ "HloMacCount": 2751463424.0,
409
+ "HloOutputCount": 73.0,
410
+ "IfmapSize": 8266542080.0,
411
+ "OfmapSize": 75497472.0,
412
+ "OutputsReadFromCount": 0.0,
413
+ "PassthroughTensorsCount": 0.0,
414
+ "RedundantOutputCount": 0.0,
415
+ "Traffic": 671184704.0
416
+ }
417
+ },
418
+ "sg0000": {
419
+ "compiletime": {
420
+ "AGOrderingAnalysisPass": 0.04074835777282715,
421
+ "AffinePredicateResolution": 0.002183198928833008,
422
+ "AliasDependencyElimination": 0.00012922286987304688,
423
+ "AliasDependencyInduction": 0.008634567260742188,
424
+ "AliasDependencyReset": 0.03679013252258301,
425
+ "BFComputeCutting": 0.0019538402557373047,
426
+ "BirCodeGenLoop": 0.04571366310119629,
427
+ "CCOpFusion": 0.01575756072998047,
428
+ "CanonicalizeDAGForPGTiling": 0.003149271011352539,
429
+ "CanonicalizeIR": 0.002719879150390625,
430
+ "CoalesceCCOp": 0.0047032833099365234,
431
+ "CommuteConcat": 0.0013585090637207031,
432
+ "DMALocalityOpt": 0.001116037368774414,
433
+ "DMAProfiler": 0.0047032833099365234,
434
+ "DMATilingProfiler": 0.004144191741943359,
435
+ "DataLocalityOpt": 0.10100674629211426,
436
+ "DataStreaming": 0.0033788681030273438,
437
+ "DeConcat": 0.0007069110870361328,
438
+ "DeadCodeElimination": 0.0010058879852294922,
439
+ "DeadStoreElimination": 0.031080961227416992,
440
+ "DelinearIndices": 0.007829427719116211,
441
+ "Delinearization": 0.003365039825439453,
442
+ "DoNothing": 7.033348083496094e-05,
443
+ "DramToDramTranspose": 0.024500370025634766,
444
+ "DumpGraphAndMetadata": 0.005262136459350586,
445
+ "EliminateDivs": 0.005412578582763672,
446
+ "ExpandBatchNorm": 0.0019643306732177734,
447
+ "ExpandISAMacro": 0.002582550048828125,
448
+ "FactorizeBlkDims": 0.00794839859008789,
449
+ "FactorizeThreadAxesInFreeDims": 0.0020449161529541016,
450
+ "FlattenMacroLoop": 0.0028934478759765625,
451
+ "GenericAccessSimplifier": 0.0009298324584960938,
452
+ "InferInitValue": 0.026146411895751953,
453
+ "InferIntrinsicOnCC": 0.010050058364868164,
454
+ "InferNeuronTensor": 0.03407764434814453,
455
+ "InferNonlocalTensors": 0.06189298629760742,
456
+ "InferPSumTensor": 0.03060150146484375,
457
+ "InlineNativeKernels": 0.0014431476593017578,
458
+ "InsertIOTransposes": 0.009805679321289063,
459
+ "InsertLocalTransposes": 0.007609128952026367,
460
+ "InsertOffloadedTransposes": 0.004189968109130859,
461
+ "LICM": 0.0029850006103515625,
462
+ "LateLegalizeInst": 0.004921674728393555,
463
+ "LateLegalizePostSplit": 0.0025641918182373047,
464
+ "LateLowerReshapeOp": 0.002185821533203125,
465
+ "LateLowerTensorOp": 0.00531768798828125,
466
+ "LateNeuronInstComb": 0.02812671661376953,
467
+ "LayoutPreprocessing": 0.11982965469360352,
468
+ "LayoutPreprocessingAndAnalysis": 0.24928760528564453,
469
+ "LayoutRequirementAnalysis": 0.007187366485595703,
470
+ "LegalizeCCOpLayout": 0.0035941600799560547,
471
+ "LegalizeOpLevelAlias": 0.0022826194763183594,
472
+ "LegalizePartitionReduce": 0.002084970474243164,
473
+ "LegalizeSundaAccess": 0.03499269485473633,
474
+ "LegalizeSundaMacro": 0.00858449935913086,
475
+ "LegalizeType": 0.0038924217224121094,
476
+ "LocalLayoutOpt": 0.015146255493164063,
477
+ "LoopFusion": 0.00600433349609375,
478
+ "LoopSplitting": 0.0003192424774169922,
479
+ "LowerBroadcast": 0.0030934810638427734,
480
+ "LowerCCOpBlockAxis": 0.0053822994232177734,
481
+ "LowerComplexBroadcast": 0.0017805099487304688,
482
+ "LowerIntrinsics": 0.03145861625671387,
483
+ "LowerTensorOp": 0.013553142547607422,
484
+ "LowerTranspose": 0.008147954940795898,
485
+ "MacroGeneration": 0.10158348083496094,
486
+ "MaskPropagation": 0.004988193511962891,
487
+ "MemcpyElimination": 0.1091456413269043,
488
+ "MutateDataType": 0.002095937728881836,
489
+ "NeuronAliasDependencyInduction": 0.00023055076599121094,
490
+ "NeuronAliasDependencyReset": 0.036977291107177734,
491
+ "NeuronInstComb": 0.01214146614074707,
492
+ "NeuronLICM": 0.007807016372680664,
493
+ "NeuronLoopFusion": 0.014447927474975586,
494
+ "NeuronLoopInterchange": 0.0015079975128173828,
495
+ "NeuronSimplifier": 0.009031057357788086,
496
+ "NeuronSimplifyPredicates": 0.0026018619537353516,
497
+ "NeuronValueNumbering": 0.00443577766418457,
498
+ "OptimizeAliasedCopyChain": 0.0012700557708740234,
499
+ "OptimizeNKIKernels": 0.00177764892578125,
500
+ "PAGLayoutOpt": 0.3914484977722168,
501
+ "PComputeCutting": 0.005900144577026367,
502
+ "PGLayoutTilingPipeline": 1.2139532566070557,
503
+ "PGTiling": 0.2603449821472168,
504
+ "PadElimination": 0.00040340423583984375,
505
+ "ParAxesAnnotation": 0.2578258514404297,
506
+ "PartialLoopFusion": 0.010677099227905273,
507
+ "PartialSimdFusion": 0.011437177658081055,
508
+ "PerfectLoopNest": 0.001963376998901367,
509
+ "RecognizeOpIdiom": 0.004378318786621094,
510
+ "Recompute": 0.0002574920654296875,
511
+ "RelaxPredicates": 0.003600597381591797,
512
+ "Rematerialization": 0.004474163055419922,
513
+ "ReshapeWeights": 0.0006759166717529297,
514
+ "ResolveAccessConflict": 0.003798246383666992,
515
+ "ResolveComplicatePredicates": 0.002101421356201172,
516
+ "RewriteReplicationMatmul": 0.0012481212615966797,
517
+ "RewriteWeights": 0.004036903381347656,
518
+ "SFKVectorizer": 0.09602093696594238,
519
+ "SimpleAllReduceTiling": 0.0017740726470947266,
520
+ "Simplifier": 0.004450559616088867,
521
+ "SimplifyMacroPredicates": 0.010053157806396484,
522
+ "SimplifyNeuronTensor": 0.00724029541015625,
523
+ "SimplifySlice": 0.001153707504272461,
524
+ "SimplifyTensor": 0.005860805511474609,
525
+ "SpillPSum": 0.011501789093017578,
526
+ "SplitAPUnionSets": 0.03104996681213379,
527
+ "SplitAccGrp": 0.002181529998779297,
528
+ "StaticProfiler": 0.004481792449951172,
529
+ "StaticTransposeLocalTensor": 0.006117343902587891,
530
+ "SundaISel": 0.041422128677368164,
531
+ "TCTransform": 0.0022428035736083984,
532
+ "TensorInitialization": 0.00680994987487793,
533
+ "TensorOpSimplifier": 0.008346796035766602,
534
+ "TensorOpTransform": 0.030104398727416992,
535
+ "TileCCOps": 0.005553245544433594,
536
+ "TilingProfiler": 0.009899139404296875,
537
+ "TransformConvOp": 0.0027108192443847656,
538
+ "TritiumFusion": 0.020798206329345703,
539
+ "ValueNumbering": 0.003211498260498047,
540
+ "VectorizeDMA": 0.004341602325439453,
541
+ "VectorizeMatMult": 0.0021800994873046875,
542
+ "WeightCoalescing": 0.0030617713928222656,
543
+ "ZeroSizeTensorElimination": 0.00011968612670898438
544
+ },
545
+ "tensorizer": {
546
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 1396.0,
547
+ "StaticProfiler::AifUb": 8.992382049560547,
548
+ "StaticProfiler::ArithmeticIntensityTensorizer": 75.54261016845703,
549
+ "StaticProfiler::AverageDmaLength": 9594.294921875,
550
+ "StaticProfiler::AverageFractalPeUtilization": 99.893310546875,
551
+ "StaticProfiler::AveragePartitionUtilization": 94.61784362792969,
552
+ "StaticProfiler::AveragePeUtilization": 99.893310546875,
553
+ "StaticProfiler::DDRTransferBytes": 79837440.0,
554
+ "StaticProfiler::InternalTransferBytes": 9797632.0,
555
+ "StaticProfiler::LoadExpanded": 11010.0,
556
+ "StaticProfiler::LocalizationEfficiency": 840.0734252929688,
557
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1527.2347412109375,
558
+ "StaticProfiler::StoreExpanded": 3073.0,
559
+ "StaticProfiler::TotalDMAExpanded": 14083.0,
560
+ "StaticProfiler::TotalDynamicInstancesCount": 1442.0,
561
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 1442.0,
562
+ "StaticProfiler::TotalLNCComm": 0.0,
563
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
564
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
565
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
566
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
567
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
568
+ "TilingProfiler::GenericInstructionsAfterTiling": 12.0,
569
+ "TilingProfiler::MatMultInstructionsAfterTiling": 821.0,
570
+ "TilingProfiler::NumPfTransposes": 6.0,
571
+ "TilingProfiler::NumPfTransposesForIo": 0.0,
572
+ "TilingProfiler::NumPfTransposesForLocal": 5.0,
573
+ "TilingProfiler::NumPfTransposesForNonlocal": 1.0,
574
+ "TilingProfiler::PfTransposeInstructions": 72.0,
575
+ "TilingProfiler::PfTransposeInstructionsForIo": 0.0,
576
+ "TilingProfiler::PfTransposeInstructionsForLocal": 56.0,
577
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 16.0,
578
+ "TilingProfiler::ReduceInstructionsAfterTiling": 0.0,
579
+ "TilingProfiler::SimdInstructionsAfterTiling": 101.0,
580
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
581
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
582
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
583
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
584
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
585
+ "TransformConvOp::conv2d_column_packing": 0.0,
586
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
587
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
588
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
589
+ }
590
+ },
591
+ "sg0001": {
592
+ "compiletime": {
593
+ "AGOrderingAnalysisPass": 0.034732818603515625,
594
+ "AffinePredicateResolution": 0.0015087127685546875,
595
+ "AliasDependencyElimination": 0.0001227855682373047,
596
+ "AliasDependencyInduction": 0.008313894271850586,
597
+ "AliasDependencyReset": 0.044220924377441406,
598
+ "BFComputeCutting": 0.001974344253540039,
599
+ "BirCodeGenLoop": 0.03118896484375,
600
+ "CCOpFusion": 0.018246889114379883,
601
+ "CanonicalizeDAGForPGTiling": 0.003057718276977539,
602
+ "CanonicalizeIR": 0.0027036666870117188,
603
+ "CoalesceCCOp": 0.0046405792236328125,
604
+ "CommuteConcat": 0.0015790462493896484,
605
+ "DMALocalityOpt": 0.0015497207641601563,
606
+ "DMAProfiler": 0.004349708557128906,
607
+ "DMATilingProfiler": 0.003928422927856445,
608
+ "DataLocalityOpt": 0.12123703956604004,
609
+ "DataStreaming": 0.0025773048400878906,
610
+ "DeConcat": 0.0008485317230224609,
611
+ "DeadCodeElimination": 0.0012981891632080078,
612
+ "DeadStoreElimination": 0.034687042236328125,
613
+ "DelinearIndices": 0.009628772735595703,
614
+ "Delinearization": 0.003772258758544922,
615
+ "DoNothing": 7.009506225585938e-05,
616
+ "DramToDramTranspose": 0.028621673583984375,
617
+ "DumpGraphAndMetadata": 0.003651142120361328,
618
+ "EliminateDivs": 0.004262447357177734,
619
+ "ExpandBatchNorm": 0.002134084701538086,
620
+ "ExpandISAMacro": 0.0026290416717529297,
621
+ "FactorizeBlkDims": 0.009716033935546875,
622
+ "FactorizeThreadAxesInFreeDims": 0.0013210773468017578,
623
+ "FlattenMacroLoop": 0.002851247787475586,
624
+ "GenericAccessSimplifier": 0.002216815948486328,
625
+ "InferInitValue": 0.03134632110595703,
626
+ "InferIntrinsicOnCC": 0.011671781539916992,
627
+ "InferNeuronTensor": 0.039717674255371094,
628
+ "InferNonlocalTensors": 0.030872583389282227,
629
+ "InferPSumTensor": 0.022834062576293945,
630
+ "InlineNativeKernels": 0.0021605491638183594,
631
+ "InsertIOTransposes": 0.017906904220581055,
632
+ "InsertLocalTransposes": 0.007941961288452148,
633
+ "InsertOffloadedTransposes": 0.0032515525817871094,
634
+ "LICM": 0.003479480743408203,
635
+ "LateLegalizeInst": 0.003596782684326172,
636
+ "LateLegalizePostSplit": 0.002257108688354492,
637
+ "LateLowerReshapeOp": 0.0018393993377685547,
638
+ "LateLowerTensorOp": 0.005475044250488281,
639
+ "LateNeuronInstComb": 0.017774581909179688,
640
+ "LayoutPreprocessing": 0.03530263900756836,
641
+ "LayoutPreprocessingAndAnalysis": 0.11916303634643555,
642
+ "LayoutRequirementAnalysis": 0.007796525955200195,
643
+ "LegalizeCCOpLayout": 0.0019328594207763672,
644
+ "LegalizeOpLevelAlias": 0.001219034194946289,
645
+ "LegalizePartitionReduce": 0.0009839534759521484,
646
+ "LegalizeSundaAccess": 0.015137434005737305,
647
+ "LegalizeSundaMacro": 0.010521173477172852,
648
+ "LegalizeType": 0.004090547561645508,
649
+ "LocalLayoutOpt": 0.020325422286987305,
650
+ "LoopFusion": 0.006730556488037109,
651
+ "LoopSplitting": 0.00034809112548828125,
652
+ "LowerBroadcast": 0.001789093017578125,
653
+ "LowerCCOpBlockAxis": 0.005074977874755859,
654
+ "LowerComplexBroadcast": 0.0019309520721435547,
655
+ "LowerIntrinsics": 0.03209352493286133,
656
+ "LowerTensorOp": 0.012279510498046875,
657
+ "LowerTranspose": 0.010157585144042969,
658
+ "MacroGeneration": 0.09246373176574707,
659
+ "MaskPropagation": 0.003335237503051758,
660
+ "MemcpyElimination": 0.10414385795593262,
661
+ "MutateDataType": 0.00220489501953125,
662
+ "NeuronAliasDependencyInduction": 0.0002532005310058594,
663
+ "NeuronAliasDependencyReset": 0.03873252868652344,
664
+ "NeuronInstComb": 0.012767791748046875,
665
+ "NeuronLICM": 0.006428241729736328,
666
+ "NeuronLoopFusion": 0.01547694206237793,
667
+ "NeuronLoopInterchange": 0.0012590885162353516,
668
+ "NeuronSimplifier": 0.009620428085327148,
669
+ "NeuronSimplifyPredicates": 0.0022652149200439453,
670
+ "NeuronValueNumbering": 0.0031261444091796875,
671
+ "OptimizeAliasedCopyChain": 0.0007045269012451172,
672
+ "OptimizeNKIKernels": 0.0022683143615722656,
673
+ "PAGLayoutOpt": 0.11684298515319824,
674
+ "PComputeCutting": 0.0060575008392333984,
675
+ "PGLayoutTilingPipeline": 1.5194215774536133,
676
+ "PGTiling": 0.5792257785797119,
677
+ "PadElimination": 0.0004138946533203125,
678
+ "ParAxesAnnotation": 0.08577656745910645,
679
+ "PartialLoopFusion": 0.010853052139282227,
680
+ "PartialSimdFusion": 0.010831356048583984,
681
+ "PerfectLoopNest": 0.0021359920501708984,
682
+ "RecognizeOpIdiom": 0.004781246185302734,
683
+ "Recompute": 0.00029349327087402344,
684
+ "RelaxPredicates": 0.0031125545501708984,
685
+ "Rematerialization": 0.002535104751586914,
686
+ "ReshapeWeights": 0.0007915496826171875,
687
+ "ResolveAccessConflict": 0.004204988479614258,
688
+ "ResolveComplicatePredicates": 0.0014605522155761719,
689
+ "RewriteReplicationMatmul": 0.0014035701751708984,
690
+ "RewriteWeights": 0.0033304691314697266,
691
+ "SFKVectorizer": 0.11060166358947754,
692
+ "SimpleAllReduceTiling": 0.0013706684112548828,
693
+ "Simplifier": 0.004431247711181641,
694
+ "SimplifyMacroPredicates": 0.005709409713745117,
695
+ "SimplifyNeuronTensor": 0.005321979522705078,
696
+ "SimplifySlice": 0.0020780563354492188,
697
+ "SimplifyTensor": 0.00576329231262207,
698
+ "SpillPSum": 0.01259160041809082,
699
+ "SplitAPUnionSets": 0.009907007217407227,
700
+ "SplitAccGrp": 0.0010552406311035156,
701
+ "StaticProfiler": 0.0033452510833740234,
702
+ "StaticTransposeLocalTensor": 0.005699634552001953,
703
+ "SundaISel": 0.04179859161376953,
704
+ "TCTransform": 0.0024602413177490234,
705
+ "TensorInitialization": 0.0022628307342529297,
706
+ "TensorOpSimplifier": 0.006663322448730469,
707
+ "TensorOpTransform": 0.03399252891540527,
708
+ "TileCCOps": 0.0057027339935302734,
709
+ "TilingProfiler": 0.01235508918762207,
710
+ "TransformConvOp": 0.0025110244750976563,
711
+ "TritiumFusion": 0.04343080520629883,
712
+ "ValueNumbering": 0.0041046142578125,
713
+ "VectorizeDMA": 0.0033228397369384766,
714
+ "VectorizeMatMult": 0.0033872127532958984,
715
+ "WeightCoalescing": 0.0023338794708251953,
716
+ "ZeroSizeTensorElimination": 0.00011372566223144531
717
+ },
718
+ "tensorizer": {
719
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 5268.0,
720
+ "StaticProfiler::AifUb": 127.58392333984375,
721
+ "StaticProfiler::ArithmeticIntensityTensorizer": 129.38287353515625,
722
+ "StaticProfiler::AverageDmaLength": 6718.79638671875,
723
+ "StaticProfiler::AverageFractalPeUtilization": 100.0,
724
+ "StaticProfiler::AveragePartitionUtilization": 99.92172241210938,
725
+ "StaticProfiler::AveragePeUtilization": 100.0,
726
+ "StaticProfiler::DDRTransferBytes": 198661120.0,
727
+ "StaticProfiler::InternalTransferBytes": 10321920.0,
728
+ "StaticProfiler::LoadExpanded": 27264.0,
729
+ "StaticProfiler::LocalizationEfficiency": 101.41001892089844,
730
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 103.59725189208984,
731
+ "StaticProfiler::StoreExpanded": 1153.0,
732
+ "StaticProfiler::TotalDMAExpanded": 28417.0,
733
+ "StaticProfiler::TotalDynamicInstancesCount": 5111.0,
734
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 5111.0,
735
+ "StaticProfiler::TotalLNCComm": 0.0,
736
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
737
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
738
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
739
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
740
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
741
+ "TilingProfiler::GenericInstructionsAfterTiling": 8.0,
742
+ "TilingProfiler::MatMultInstructionsAfterTiling": 4276.0,
743
+ "TilingProfiler::NumPfTransposes": 8.0,
744
+ "TilingProfiler::NumPfTransposesForIo": 3.0,
745
+ "TilingProfiler::NumPfTransposesForLocal": 3.0,
746
+ "TilingProfiler::NumPfTransposesForNonlocal": 2.0,
747
+ "TilingProfiler::PfTransposeInstructions": 122.0,
748
+ "TilingProfiler::PfTransposeInstructionsForIo": 34.0,
749
+ "TilingProfiler::PfTransposeInstructionsForLocal": 24.0,
750
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 64.0,
751
+ "TilingProfiler::ReduceInstructionsAfterTiling": 0.0,
752
+ "TilingProfiler::SimdInstructionsAfterTiling": 180.0,
753
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
754
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
755
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
756
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
757
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
758
+ "TransformConvOp::conv2d_column_packing": 0.0,
759
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
760
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
761
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
762
+ }
763
+ },
764
+ "sg0002": {
765
+ "compiletime": {
766
+ "AGOrderingAnalysisPass": 0.018787622451782227,
767
+ "AffinePredicateResolution": 0.0011818408966064453,
768
+ "AliasDependencyElimination": 0.00011801719665527344,
769
+ "AliasDependencyInduction": 0.005483388900756836,
770
+ "AliasDependencyReset": 0.026019811630249023,
771
+ "BFComputeCutting": 0.00225830078125,
772
+ "BirCodeGenLoop": 0.4621126651763916,
773
+ "CCOpFusion": 0.01928091049194336,
774
+ "CanonicalizeDAGForPGTiling": 0.004612922668457031,
775
+ "CanonicalizeIR": 0.0017774105072021484,
776
+ "CoalesceCCOp": 0.014393329620361328,
777
+ "CommuteConcat": 0.0020241737365722656,
778
+ "DMALocalityOpt": 0.0052585601806640625,
779
+ "DMAProfiler": 0.011700868606567383,
780
+ "DMATilingProfiler": 0.004782676696777344,
781
+ "DataLocalityOpt": 0.06629562377929688,
782
+ "DataStreaming": 0.03746771812438965,
783
+ "DeConcat": 0.0006563663482666016,
784
+ "DeadCodeElimination": 0.002358675003051758,
785
+ "DeadStoreElimination": 0.0055620670318603516,
786
+ "DelinearIndices": 0.004741668701171875,
787
+ "Delinearization": 0.0036110877990722656,
788
+ "DoNothing": 8.368492126464844e-05,
789
+ "DramToDramTranspose": 0.016016721725463867,
790
+ "DumpGraphAndMetadata": 0.0853111743927002,
791
+ "EliminateDivs": 0.0025675296783447266,
792
+ "ExpandBatchNorm": 0.002092123031616211,
793
+ "ExpandISAMacro": 0.010552406311035156,
794
+ "FactorizeBlkDims": 0.0076830387115478516,
795
+ "FactorizeThreadAxesInFreeDims": 0.002122640609741211,
796
+ "FlattenMacroLoop": 0.002187013626098633,
797
+ "GenericAccessSimplifier": 0.0009529590606689453,
798
+ "InferInitValue": 0.0242159366607666,
799
+ "InferIntrinsicOnCC": 0.009269952774047852,
800
+ "InferNeuronTensor": 0.020155906677246094,
801
+ "InferNonlocalTensors": 0.015646696090698242,
802
+ "InferPSumTensor": 0.3076965808868408,
803
+ "InlineNativeKernels": 0.009155511856079102,
804
+ "InsertIOTransposes": 0.015281438827514648,
805
+ "InsertLocalTransposes": 0.006501436233520508,
806
+ "InsertOffloadedTransposes": 0.002702474594116211,
807
+ "LICM": 0.002913951873779297,
808
+ "LateLegalizeInst": 0.013724088668823242,
809
+ "LateLegalizePostSplit": 0.012693405151367188,
810
+ "LateLowerReshapeOp": 0.0025734901428222656,
811
+ "LateLowerTensorOp": 0.001531362533569336,
812
+ "LateNeuronInstComb": 0.008354902267456055,
813
+ "LayoutPreprocessing": 0.026634931564331055,
814
+ "LayoutPreprocessingAndAnalysis": 0.5595176219940186,
815
+ "LayoutRequirementAnalysis": 0.005538463592529297,
816
+ "LegalizeCCOpLayout": 0.0022728443145751953,
817
+ "LegalizeOpLevelAlias": 0.001255035400390625,
818
+ "LegalizePartitionReduce": 0.001256704330444336,
819
+ "LegalizeSundaAccess": 0.07487797737121582,
820
+ "LegalizeSundaMacro": 0.010920286178588867,
821
+ "LegalizeType": 0.012901067733764648,
822
+ "LocalLayoutOpt": 0.012011289596557617,
823
+ "LoopFusion": 0.006572723388671875,
824
+ "LoopSplitting": 0.0003001689910888672,
825
+ "LowerBroadcast": 0.0016355514526367188,
826
+ "LowerCCOpBlockAxis": 0.0050678253173828125,
827
+ "LowerComplexBroadcast": 0.0025262832641601563,
828
+ "LowerIntrinsics": 0.30371904373168945,
829
+ "LowerTensorOp": 0.011744022369384766,
830
+ "LowerTranspose": 0.011518478393554688,
831
+ "MacroGeneration": 0.026911020278930664,
832
+ "MaskPropagation": 0.0031325817108154297,
833
+ "MemcpyElimination": 0.027472257614135742,
834
+ "MutateDataType": 0.0015196800231933594,
835
+ "NeuronAliasDependencyInduction": 0.00016927719116210938,
836
+ "NeuronAliasDependencyReset": 0.0242006778717041,
837
+ "NeuronInstComb": 0.004147529602050781,
838
+ "NeuronLICM": 0.036264657974243164,
839
+ "NeuronLoopFusion": 0.00889277458190918,
840
+ "NeuronLoopInterchange": 0.002141237258911133,
841
+ "NeuronSimplifier": 0.00720524787902832,
842
+ "NeuronSimplifyPredicates": 0.11929655075073242,
843
+ "NeuronValueNumbering": 0.003022432327270508,
844
+ "OptimizeAliasedCopyChain": 0.0006387233734130859,
845
+ "OptimizeNKIKernels": 0.5260024070739746,
846
+ "PAGLayoutOpt": 0.5680239200592041,
847
+ "PComputeCutting": 0.0048143863677978516,
848
+ "PGLayoutTilingPipeline": 1.6304676532745361,
849
+ "PGTiling": 0.1616363525390625,
850
+ "PadElimination": 0.0003521442413330078,
851
+ "ParAxesAnnotation": 0.0544736385345459,
852
+ "PartialLoopFusion": 0.005907773971557617,
853
+ "PartialSimdFusion": 0.0038967132568359375,
854
+ "PerfectLoopNest": 0.0021576881408691406,
855
+ "RecognizeOpIdiom": 0.0039520263671875,
856
+ "Recompute": 0.0002884864807128906,
857
+ "RelaxPredicates": 0.013870716094970703,
858
+ "Rematerialization": 0.0024657249450683594,
859
+ "ReshapeWeights": 0.0006930828094482422,
860
+ "ResolveAccessConflict": 0.0038983821868896484,
861
+ "ResolveComplicatePredicates": 0.0012950897216796875,
862
+ "RewriteReplicationMatmul": 0.002060413360595703,
863
+ "RewriteWeights": 0.0028791427612304688,
864
+ "SFKVectorizer": 0.28761887550354004,
865
+ "SimpleAllReduceTiling": 0.008704662322998047,
866
+ "Simplifier": 0.003449678421020508,
867
+ "SimplifyMacroPredicates": 0.010317325592041016,
868
+ "SimplifyNeuronTensor": 1.0378923416137695,
869
+ "SimplifySlice": 0.0008852481842041016,
870
+ "SimplifyTensor": 0.005218982696533203,
871
+ "SpillPSum": 0.009551286697387695,
872
+ "SplitAPUnionSets": 0.10591006278991699,
873
+ "SplitAccGrp": 0.0011169910430908203,
874
+ "StaticProfiler": 0.01290583610534668,
875
+ "StaticTransposeLocalTensor": 0.003824472427368164,
876
+ "SundaISel": 0.041872262954711914,
877
+ "TCTransform": 0.0008666515350341797,
878
+ "TensorInitialization": 0.013058185577392578,
879
+ "TensorOpSimplifier": 0.0061550140380859375,
880
+ "TensorOpTransform": 0.020328521728515625,
881
+ "TileCCOps": 0.006834983825683594,
882
+ "TilingProfiler": 0.0072863101959228516,
883
+ "TransformConvOp": 0.0032320022583007813,
884
+ "TritiumFusion": 0.03062152862548828,
885
+ "ValueNumbering": 0.0023603439331054688,
886
+ "VectorizeDMA": 0.004430294036865234,
887
+ "VectorizeMatMult": 0.0021605491638183594,
888
+ "WeightCoalescing": 0.00825953483581543,
889
+ "ZeroSizeTensorElimination": 0.00011014938354492188
890
+ },
891
+ "tensorizer": {
892
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 42834.0,
893
+ "StaticProfiler::AifUb": 129.43267822265625,
894
+ "StaticProfiler::ArithmeticIntensityTensorizer": 128.19729614257813,
895
+ "StaticProfiler::AverageDmaLength": 4810.17578125,
896
+ "StaticProfiler::AverageFractalPeUtilization": 99.65389251708984,
897
+ "StaticProfiler::AveragePartitionUtilization": 97.55139923095703,
898
+ "StaticProfiler::AveragePeUtilization": 98.60253143310547,
899
+ "StaticProfiler::DDRTransferBytes": 782946624.0,
900
+ "StaticProfiler::InternalTransferBytes": 629086720.0,
901
+ "StaticProfiler::LoadExpanded": 97814.0,
902
+ "StaticProfiler::LocalizationEfficiency": 99.04553985595703,
903
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 100.20111846923828,
904
+ "StaticProfiler::StoreExpanded": 1757.0,
905
+ "StaticProfiler::TotalDMAExpanded": 99571.0,
906
+ "StaticProfiler::TotalDynamicInstancesCount": 50031.0,
907
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 49585.0,
908
+ "StaticProfiler::TotalLNCComm": 0.0,
909
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
910
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
911
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
912
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
913
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
914
+ "TilingProfiler::GenericInstructionsAfterTiling": 4.0,
915
+ "TilingProfiler::MatMultInstructionsAfterTiling": 22464.0,
916
+ "TilingProfiler::NumPfTransposes": 5.0,
917
+ "TilingProfiler::NumPfTransposesForIo": 1.0,
918
+ "TilingProfiler::NumPfTransposesForLocal": 1.0,
919
+ "TilingProfiler::NumPfTransposesForNonlocal": 3.0,
920
+ "TilingProfiler::PfTransposeInstructions": 19105.0,
921
+ "TilingProfiler::PfTransposeInstructionsForIo": 19008.0,
922
+ "TilingProfiler::PfTransposeInstructionsForLocal": 1.0,
923
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 96.0,
924
+ "TilingProfiler::ReduceInstructionsAfterTiling": 4.0,
925
+ "TilingProfiler::SimdInstructionsAfterTiling": 158.0,
926
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
927
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
928
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
929
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
930
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
931
+ "TransformConvOp::conv2d_column_packing": 0.0,
932
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
933
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
934
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
935
+ }
936
+ },
937
+ "sg01": {
938
+ "compiletime": {
939
+ "CanonicalizeConv": 2.300000051036477e-05,
940
+ "CanonicalizeForTensorizer": 1.2999999853491317e-05,
941
+ "Canonicalizer": 0.00028199999360367656,
942
+ "HoistCompute": 3.000000106112566e-06,
943
+ "IdentifyCrossPassTensors": 2.4000000848900527e-05,
944
+ "MemcastMotion": 9.999999747378752e-06,
945
+ "PenguinizeFunctions": 1.4000000192027073e-05,
946
+ "PruneFunctions": 1.2000000424450263e-05,
947
+ "RemoveOptimizationBarriers": 2.2000000171829015e-05,
948
+ "ScatterMotion": 1.700000029813964e-05,
949
+ "TensorizerLegalizationPass": 3.400000059627928e-05,
950
+ "VerifySupportedOps": 1.1000000085914508e-05,
951
+ "algsimp": 6.500000017695129e-05,
952
+ "batchnorm_expander": 1.2000000424450263e-05,
953
+ "boundary-marker-removal": 3.999999989900971e-06,
954
+ "call-inliner": 9.999999747378752e-06,
955
+ "canonicalize-boundary-marker": 4.999999873689376e-06,
956
+ "collective-stream-id-checker": 3.999999989900971e-06,
957
+ "comparison-expander": 4.999999873689376e-06,
958
+ "computation-deduplicator": 2.099999983329326e-05,
959
+ "conditional-to-select": 4.999999873689376e-06,
960
+ "config-lowering": 2.9000000722589903e-05,
961
+ "constant_folding": 9.000000318337698e-06,
962
+ "cse": 1.2000000424450263e-05,
963
+ "dce": 9.999999974752427e-07,
964
+ "dynamic-slice-transpose": 3.999999989900971e-06,
965
+ "eliminate-redundant-compare": 3.999999989900971e-06,
966
+ "emit-offloaded-dropout": 1.2999999853491317e-05,
967
+ "flatten-call-graph": 9.000000318337698e-06,
968
+ "fuse-send-recv": 2.9000000722589903e-05,
969
+ "hilo::LegalizeAlias": 4.999999873689376e-06,
970
+ "hilo::NeuronInstCombine": 3.5000000934815034e-05,
971
+ "hilo::NeuronOpFusion": 1.2999999853491317e-05,
972
+ "hilo::ReplaceTokenTypeWithU8Pass": 1.5999999959603883e-05,
973
+ "hilo::ScheduleFusion": 9.999999974752427e-07,
974
+ "hilo::SixtyFourHack": 9.999999747378752e-06,
975
+ "hilo::VerifyAliasing": 1.9999999949504854e-06,
976
+ "hlo-mac-count": 2.5999999706982635e-05,
977
+ "hlo-verifier": 0.00018899999849963933,
978
+ "legalize-ccops": 9.999999974752427e-07,
979
+ "legalize-compare": 3.999999989900971e-06,
980
+ "lower-argminmax-custom-call": 3.999999989900971e-06,
981
+ "map-inline": 1.2000000424450263e-05,
982
+ "metadata-naming": 2.4000000848900527e-05,
983
+ "mlir::detail::OpToOpPassAdaptor": 2.2000000171829015e-05,
984
+ "mlir::hlo::MhloToPyPenguin": 0.0008980000275187194,
985
+ "mlir::mhlo::LowerComplexExtraPass": 7.999999797903001e-05,
986
+ "mlir::mhlo::LowerComplexPass": 0.00013499999477062374,
987
+ "native-to-custom-softmax": 7.000000096013537e-06,
988
+ "native-to-custom-softmax-dx": 1.5999999959603883e-05,
989
+ "operand_upcaster": 1.8999999156221747e-05,
990
+ "post-par-pipe-begin": 1.9999999949504854e-06,
991
+ "post-par-pipe-end": 0.0,
992
+ "post-partition-simplification": 0.0005530000198632479,
993
+ "replace-minimum-constant": 6.000000212225132e-06,
994
+ "reshape-mover": 3.000000106112566e-06,
995
+ "simplify-concat": 4.8999998398358e-05,
996
+ "simplify-while-loops": 1.9999999949504854e-06,
997
+ "transform-variadic-reduce": 9.000000318337698e-06,
998
+ "tuple-simplifier": 4.999999873689376e-06,
999
+ "unpack-nested-aws-ntwsr": 3.999999989900971e-06,
1000
+ "unroll-while-loop": 9.999999974752427e-07
1001
+ },
1002
+ "hilo": {
1003
+ "ArithmeticIntensity": 123.27030181884766,
1004
+ "HloMacCount": 12415139840.0,
1005
+ "Traffic": 201429536.0
1006
+ }
1007
+ },
1008
+ "sg02": {
1009
+ "compiletime": {
1010
+ "CanonicalizeConv": 0.0,
1011
+ "CanonicalizeForTensorizer": 1.2999999853491317e-05,
1012
+ "Canonicalizer": 0.00033599999733269215,
1013
+ "HoistCompute": 0.0,
1014
+ "IdentifyCrossPassTensors": 2.2000000171829015e-05,
1015
+ "MemcastMotion": 0.0,
1016
+ "PenguinizeFunctions": 9.000000318337698e-06,
1017
+ "PruneFunctions": 9.999999747378752e-06,
1018
+ "RemoveOptimizationBarriers": 1.8999999156221747e-05,
1019
+ "ScatterMotion": 0.0,
1020
+ "TensorizerLegalizationPass": 6.000000212225132e-06,
1021
+ "VerifySupportedOps": 1.1000000085914508e-05,
1022
+ "algsimp": 5.999999848427251e-05,
1023
+ "batchnorm_expander": 1.1000000085914508e-05,
1024
+ "boundary-marker-removal": 3.999999989900971e-06,
1025
+ "call-inliner": 1.1000000085914508e-05,
1026
+ "canonicalize-boundary-marker": 3.999999989900971e-06,
1027
+ "collective-stream-id-checker": 1.9999999949504854e-06,
1028
+ "comparison-expander": 4.999999873689376e-06,
1029
+ "computation-deduplicator": 2.099999983329326e-05,
1030
+ "conditional-to-select": 6.000000212225132e-06,
1031
+ "config-lowering": 2.5999999706982635e-05,
1032
+ "constant_folding": 9.000000318337698e-06,
1033
+ "cse": 1.1000000085914508e-05,
1034
+ "dce": 9.999999974752427e-07,
1035
+ "dynamic-slice-transpose": 3.999999989900971e-06,
1036
+ "eliminate-redundant-compare": 3.000000106112566e-06,
1037
+ "emit-offloaded-dropout": 1.2000000424450263e-05,
1038
+ "flatten-call-graph": 1.2000000424450263e-05,
1039
+ "fuse-send-recv": 1.8000000636675395e-05,
1040
+ "hilo::LegalizeAlias": 1.9999999949504854e-06,
1041
+ "hilo::NeuronInstCombine": 4.999999873689376e-06,
1042
+ "hilo::NeuronOpFusion": 2.8000000384054147e-05,
1043
+ "hilo::ReplaceTokenTypeWithU8Pass": 7.000000096013537e-06,
1044
+ "hilo::ScheduleFusion": 0.0,
1045
+ "hilo::SixtyFourHack": 3.7000001611886546e-05,
1046
+ "hilo::VerifyAliasing": 9.999999974752427e-07,
1047
+ "hlo-mac-count": 0.00016900000628083944,
1048
+ "hlo-verifier": 0.00015799999528098851,
1049
+ "legalize-ccops": 9.999999974752427e-07,
1050
+ "legalize-compare": 3.000000106112566e-06,
1051
+ "lower-argminmax-custom-call": 3.999999989900971e-06,
1052
+ "map-inline": 1.2999999853491317e-05,
1053
+ "metadata-naming": 1.5999999959603883e-05,
1054
+ "mlir::detail::OpToOpPassAdaptor": 2.700000004551839e-05,
1055
+ "mlir::hlo::MhloToPyPenguin": 0.0008440000237897038,
1056
+ "mlir::mhlo::LowerComplexExtraPass": 6.900000153109431e-05,
1057
+ "mlir::mhlo::LowerComplexPass": 1.5999999959603883e-05,
1058
+ "native-to-custom-softmax": 6.000000212225132e-06,
1059
+ "native-to-custom-softmax-dx": 1.9999999494757503e-05,
1060
+ "operand_upcaster": 1.2000000424450263e-05,
1061
+ "post-par-pipe-begin": 9.999999974752427e-07,
1062
+ "post-par-pipe-end": 0.0,
1063
+ "post-partition-simplification": 0.000526999996509403,
1064
+ "replace-minimum-constant": 9.000000318337698e-06,
1065
+ "reshape-mover": 3.000000106112566e-06,
1066
+ "simplify-concat": 4.400000034365803e-05,
1067
+ "simplify-while-loops": 1.9999999949504854e-06,
1068
+ "transform-variadic-reduce": 4.8999998398358e-05,
1069
+ "tuple-simplifier": 4.999999873689376e-06,
1070
+ "unpack-nested-aws-ntwsr": 3.999999989900971e-06,
1071
+ "unroll-while-loop": 9.999999974752427e-07
1072
+ },
1073
+ "hilo": {
1074
+ "ArithmeticIntensity": 25.691875457763672,
1075
+ "HloMacCount": 9974841344.0,
1076
+ "Traffic": 776497728.0
1077
+ }
1078
+ }
1079
+ }
context_encoding_model/_tp0_bk0/graph.neff ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0aeace703e08ac36bdcb2027d9a278403cb96ef39f48bddc999b077215e8a36
3
+ size 1557504
context_encoding_model/_tp0_bk0/log-neuron-cc.txt ADDED
The diff for this file is too large to render. See raw diff
 
context_encoding_model/_tp0_bk0/metaneff.pb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0aef68f833b52be82fd0e17410bcfd279e5719338cb746c0619d5139fc4a3d02
3
+ size 1042690
context_encoding_model/_tp0_bk0/model.MODULE_f4171003694760566af4+a9cd68fb.hlo_module.pb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d89b9e073981a0b1b7d0bbd0a24f147e9df13c5706d9d6be9971b857124c9496
3
+ size 1119812
context_encoding_model/_tp0_bk0/model.MODULE_f4171003694760566af4+a9cd68fb.neff ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0aeace703e08ac36bdcb2027d9a278403cb96ef39f48bddc999b077215e8a36
3
+ size 1557504
context_encoding_model/_tp0_bk0/neuron_config.json ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation_autoset": false,
3
+ "_name_or_path": "Qwen/Qwen3-8B",
4
+ "add_cross_attention": false,
5
+ "architectures": [
6
+ "Qwen3ForCausalLM"
7
+ ],
8
+ "attention_bias": false,
9
+ "attention_dropout": 0.0,
10
+ "attribute_map": {},
11
+ "bad_words_ids": null,
12
+ "begin_suppress_tokens": null,
13
+ "bos_token_id": 151643,
14
+ "chunk_size_feed_forward": 0,
15
+ "cross_attention_hidden_size": null,
16
+ "decoder_start_token_id": null,
17
+ "diversity_penalty": 0.0,
18
+ "do_sample": false,
19
+ "early_stopping": false,
20
+ "encoder_no_repeat_ngram_size": 0,
21
+ "eos_token_id": 151645,
22
+ "exponential_decay_length_penalty": null,
23
+ "finetuning_task": null,
24
+ "forced_bos_token_id": null,
25
+ "forced_eos_token_id": null,
26
+ "fused_spec_config": null,
27
+ "head_dim": 128,
28
+ "hidden_act": "silu",
29
+ "hidden_size": 4096,
30
+ "id2label": {
31
+ "0": "LABEL_0",
32
+ "1": "LABEL_1"
33
+ },
34
+ "initializer_range": 0.02,
35
+ "intermediate_size": 12288,
36
+ "is_decoder": false,
37
+ "is_encoder_decoder": false,
38
+ "label2id": {
39
+ "LABEL_0": 0,
40
+ "LABEL_1": 1
41
+ },
42
+ "length_penalty": 1.0,
43
+ "max_length": 20,
44
+ "max_position_embeddings": 40960,
45
+ "max_window_layers": 36,
46
+ "metadata": null,
47
+ "min_length": 0,
48
+ "model_type": "qwen3",
49
+ "neuron_config": {
50
+ "activation_quantization_type": null,
51
+ "allow_input_truncation": false,
52
+ "apply_seq_ids_mask": false,
53
+ "async_mode": false,
54
+ "attention_dp_degree": 1,
55
+ "attention_dtype": null,
56
+ "attn_block_cte_nki_kernel_enabled": false,
57
+ "attn_block_tkg_nki_kernel_cache_update": false,
58
+ "attn_block_tkg_nki_kernel_enabled": false,
59
+ "attn_cls": {
60
+ "__module__": "neuronx_distributed_inference.models.qwen3.modeling_qwen3",
61
+ "__name__": "NeuronQwen3Attention"
62
+ },
63
+ "attn_kernel_enabled": null,
64
+ "attn_tkg_builtin_kernel_enabled": false,
65
+ "attn_tkg_nki_kernel_enabled": false,
66
+ "batch_size": 1,
67
+ "bucket_n_active_tokens": true,
68
+ "buckets": [
69
+ 128
70
+ ],
71
+ "cast_type": "config",
72
+ "cc_pipeline_tiling_factor": 2,
73
+ "chunked_prefill_config": null,
74
+ "context_encoding_buckets": [
75
+ 128
76
+ ],
77
+ "cp_degree": 1,
78
+ "ctx_batch_size": 1,
79
+ "disable_kv_cache_tiling": false,
80
+ "draft_model_modules_to_not_convert": null,
81
+ "enable_bucketing": true,
82
+ "enable_eagle_draft_input_norm": false,
83
+ "enable_eagle_speculation": false,
84
+ "enable_fused_speculation": false,
85
+ "enable_long_context_mode": false,
86
+ "enable_output_completion_notifications": false,
87
+ "enable_spill_reload_dge": false,
88
+ "enable_token_tree": false,
89
+ "ep_degree": 1,
90
+ "expert_mlp_nki_kernel_enabled": null,
91
+ "flash_decoding_enabled": false,
92
+ "fused_qkv": false,
93
+ "fused_rmsnorm_skip_gamma": false,
94
+ "is_block_kv_layout": null,
95
+ "is_chunked_prefill": false,
96
+ "is_continuous_batching": true,
97
+ "is_eagle_draft": false,
98
+ "is_medusa": false,
99
+ "is_prefill_stage": true,
100
+ "is_prefix_caching": false,
101
+ "k_cache_transposed": false,
102
+ "kv_cache_batch_size": 1,
103
+ "kv_cache_padding_size": 0,
104
+ "kv_cache_quant": false,
105
+ "kv_cache_tiling": false,
106
+ "layer_boundary_markers": false,
107
+ "lm_head_pad": false,
108
+ "lm_head_pad_alignment_size": 1,
109
+ "local_ranks_size": 2,
110
+ "logical_nc_config": 1,
111
+ "lora_config": null,
112
+ "max_batch_size": 1,
113
+ "max_context_length": 1024,
114
+ "max_length": 1024,
115
+ "max_new_tokens": null,
116
+ "medusa_speculation_length": 0,
117
+ "medusa_tree": null,
118
+ "mlp_kernel_enabled": false,
119
+ "mlp_kernel_fuse_residual_add": false,
120
+ "modules_to_not_convert": null,
121
+ "moe_fused_nki_kernel_enabled": null,
122
+ "n_active_tokens": 1024,
123
+ "n_positions": 1024,
124
+ "num_medusa_heads": 0,
125
+ "on_cpu": false,
126
+ "on_device_sampling_config": {
127
+ "deterministic": false,
128
+ "do_sample": false,
129
+ "dynamic": true,
130
+ "global_topk": 256,
131
+ "on_device_sampling_config": true,
132
+ "temperature": 1.0,
133
+ "top_k": 1,
134
+ "top_k_kernel_enabled": false,
135
+ "top_p": 1.0
136
+ },
137
+ "output_logits": false,
138
+ "overrides_torch_dtype": true,
139
+ "pa_block_size": 1024,
140
+ "pa_num_blocks": 1,
141
+ "padding_side": "right",
142
+ "pp_degree": 1,
143
+ "prefix_buckets": null,
144
+ "qk_layernorm": false,
145
+ "qkv_kernel_enabled": false,
146
+ "qkv_kernel_fuse_residual_add": false,
147
+ "qkv_kernel_nbsd_layout": false,
148
+ "quantization_dtype": "int8",
149
+ "quantization_type": "per_tensor_symmetric",
150
+ "quantize_clamp_bound": Infinity,
151
+ "quantized": false,
152
+ "quantized_checkpoints_path": null,
153
+ "quantized_mlp_kernel_enabled": false,
154
+ "rmsnorm_quantize_kernel_enabled": false,
155
+ "router_topk_nki_kernel_enabled": null,
156
+ "rpl_reduce_dtype": null,
157
+ "save_sharded_checkpoint": true,
158
+ "scratchpad_page_size": null,
159
+ "seq_len": 1024,
160
+ "seq_len_threshold_for_cc_tiling": 16384,
161
+ "sequence_parallel_enabled": false,
162
+ "shared_mlp_nki_kernel_enabled": null,
163
+ "skip_sharding": false,
164
+ "skip_warmup": false,
165
+ "spec_batch_size": 1,
166
+ "speculation_length": 0,
167
+ "start_rank_id": 0,
168
+ "target": null,
169
+ "tile_cc": false,
170
+ "tkg_batch_size": 1,
171
+ "token_generation_buckets": null,
172
+ "token_tree_config": null,
173
+ "torch_dtype": "bfloat16",
174
+ "tp_degree": 2,
175
+ "vocab_parallel": false,
176
+ "weight_gather_seq_len_threshold": 32768,
177
+ "weights_to_skip_layout_optimization": [],
178
+ "world_size": 2
179
+ },
180
+ "no_repeat_ngram_size": 0,
181
+ "num_attention_heads": 32,
182
+ "num_beam_groups": 1,
183
+ "num_beams": 1,
184
+ "num_cores_per_group": 1,
185
+ "num_hidden_layers": 36,
186
+ "num_key_value_heads": 8,
187
+ "num_return_sequences": 1,
188
+ "output_attentions": false,
189
+ "output_hidden_states": false,
190
+ "output_scores": false,
191
+ "pad_token_id": 0,
192
+ "prefix": null,
193
+ "problem_type": null,
194
+ "pruned_heads": {},
195
+ "remove_invalid_values": false,
196
+ "repetition_penalty": 1.0,
197
+ "return_dict": true,
198
+ "return_dict_in_generate": false,
199
+ "rms_norm_eps": 1e-06,
200
+ "rope_scaling": null,
201
+ "rope_theta": 1000000,
202
+ "sep_token_id": null,
203
+ "sliding_window": null,
204
+ "suppress_tokens": null,
205
+ "task_specific_params": null,
206
+ "temperature": 1.0,
207
+ "tf_legacy_loss": false,
208
+ "tie_encoder_decoder": false,
209
+ "tie_word_embeddings": false,
210
+ "tokenizer_class": null,
211
+ "top_k": 50,
212
+ "top_p": 1.0,
213
+ "torchscript": false,
214
+ "transformers_version": "4.51.0",
215
+ "typical_p": 1.0,
216
+ "use_bfloat16": false,
217
+ "use_cache": true,
218
+ "use_sliding_window": false,
219
+ "vocab_size": 151936
220
+ }
context_encoding_model/_tp0_bk1/command.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ neuronx-cc compile --framework=XLA model.MODULE_2914133a46cb7b4660ab+d7af8a84.hlo_module.pb --output model.MODULE_2914133a46cb7b4660ab+d7af8a84.neff --target=trn1 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ' --lnc=1 -O1 '--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true' --logfile=log-neuron-cc.txt --verbose=35
context_encoding_model/_tp0_bk1/compile_flags.MODULE_2914133a46cb7b4660ab+d7af8a84.json ADDED
@@ -0,0 +1 @@
 
 
1
+ ["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "--lnc=1", "-O1", "--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true", "--logfile=/home/ubuntu/qwen3/context_encoding_model/_tp0_bk1/log-neuron-cc.txt"]
context_encoding_model/_tp0_bk1/global_metric_store.json ADDED
@@ -0,0 +1,1079 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Average": {
3
+ "tensorizer": {
4
+ "StaticProfiler::AverageFractalPeUtilization": 99.65782165527344,
5
+ "StaticProfiler::AveragePartitionUtilization": 97.58238220214844,
6
+ "StaticProfiler::AveragePeUtilization": 98.61824035644531,
7
+ "StaticProfiler::LocalizationEfficiency": 98.78419494628906,
8
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 100.47209167480469,
9
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
10
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0
11
+ }
12
+ },
13
+ "Count": {
14
+ "tensorizer": {
15
+ "StaticProfiler::AverageFractalPeUtilization": 1.0,
16
+ "StaticProfiler::AveragePartitionUtilization": 1.0,
17
+ "StaticProfiler::AveragePeUtilization": 1.0,
18
+ "StaticProfiler::LocalizationEfficiency": 1.0,
19
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1.0,
20
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 1.0,
21
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 1.0
22
+ }
23
+ },
24
+ "Sum": {
25
+ "compiletime": {
26
+ "AGOrderingAnalysisPass": 0.019578933715820313,
27
+ "AffinePredicateResolution": 0.0019481182098388672,
28
+ "AliasDependencyElimination": 0.0001239776611328125,
29
+ "AliasDependencyInduction": 0.00577092170715332,
30
+ "AliasDependencyReset": 0.027690649032592773,
31
+ "BFComputeCutting": 0.0023322105407714844,
32
+ "BirCodeGenLoop": 0.4628438949584961,
33
+ "CCOpFusion": 0.022275209426879883,
34
+ "CanonicalizeConv": 3.300000025774352e-05,
35
+ "CanonicalizeDAGForPGTiling": 0.005593061447143555,
36
+ "CanonicalizeForTensorizer": 4.400000034365803e-05,
37
+ "CanonicalizeIR": 0.001634359359741211,
38
+ "Canonicalizer": 0.0008999999845400453,
39
+ "CoalesceCCOp": 0.015577077865600586,
40
+ "CommuteConcat": 0.0008616447448730469,
41
+ "DMALocalityOpt": 0.007327079772949219,
42
+ "DMAProfiler": 0.012569665908813477,
43
+ "DMATilingProfiler": 0.0037431716918945313,
44
+ "DataLocalityOpt": 0.06741714477539063,
45
+ "DataStreaming": 0.03615880012512207,
46
+ "DeConcat": 0.0005049705505371094,
47
+ "DeadCodeElimination": 0.0009002685546875,
48
+ "DeadStoreElimination": 0.0056514739990234375,
49
+ "DelinearIndices": 0.004773139953613281,
50
+ "Delinearization": 0.0026137828826904297,
51
+ "DoNothing": 0.0001933574676513672,
52
+ "DramToDramTranspose": 0.019293546676635742,
53
+ "DumpGraphAndMetadata": 0.10360383987426758,
54
+ "EliminateDivs": 0.003831148147583008,
55
+ "ExpandBatchNorm": 0.0019576549530029297,
56
+ "ExpandISAMacro": 0.012068033218383789,
57
+ "FactorizeBlkDims": 0.008942604064941406,
58
+ "FactorizeThreadAxesInFreeDims": 0.001847982406616211,
59
+ "FlattenMacroLoop": 0.003529787063598633,
60
+ "GenericAccessSimplifier": 0.0008223056793212891,
61
+ "HoistCompute": 7.999999979801942e-06,
62
+ "IdentifyCrossPassTensors": 4.8000001697801054e-05,
63
+ "InferInitValue": 0.025947093963623047,
64
+ "InferIntrinsicOnCC": 0.00908350944519043,
65
+ "InferNeuronTensor": 0.02371978759765625,
66
+ "InferNonlocalTensors": 0.014753341674804688,
67
+ "InferPSumTensor": 0.309035062789917,
68
+ "InlineNativeKernels": 0.008690595626831055,
69
+ "InsertIOTransposes": 0.01906275749206543,
70
+ "InsertLocalTransposes": 0.004312276840209961,
71
+ "InsertOffloadedTransposes": 0.002802133560180664,
72
+ "LICM": 0.003081083297729492,
73
+ "LateLegalizeInst": 0.014100313186645508,
74
+ "LateLegalizePostSplit": 0.012533903121948242,
75
+ "LateLowerReshapeOp": 0.001035451889038086,
76
+ "LateLowerTensorOp": 0.002605438232421875,
77
+ "LateNeuronInstComb": 0.009373188018798828,
78
+ "LayoutPreprocessing": 0.03434133529663086,
79
+ "LayoutPreprocessingAndAnalysis": 0.07319903373718262,
80
+ "LayoutRequirementAnalysis": 0.005194187164306641,
81
+ "LegalizeCCOpLayout": 0.0025322437286376953,
82
+ "LegalizeOpLevelAlias": 0.0020308494567871094,
83
+ "LegalizePartitionReduce": 0.0010001659393310547,
84
+ "LegalizeSundaAccess": 0.0786747932434082,
85
+ "LegalizeSundaMacro": 0.011176109313964844,
86
+ "LegalizeType": 0.014636754989624023,
87
+ "LocalLayoutOpt": 0.014019250869750977,
88
+ "LoopFusion": 0.005472898483276367,
89
+ "LoopSplitting": 0.00038623809814453125,
90
+ "LowerBroadcast": 0.0027265548706054688,
91
+ "LowerCCOpBlockAxis": 0.0058476924896240234,
92
+ "LowerComplexBroadcast": 0.00213623046875,
93
+ "LowerIntrinsics": 0.3070671558380127,
94
+ "LowerTensorOp": 0.010679960250854492,
95
+ "LowerTranspose": 0.012553691864013672,
96
+ "MacroGeneration": 0.029733657836914063,
97
+ "MaskPropagation": 0.0028328895568847656,
98
+ "MemcastMotion": 1.8999999156221747e-05,
99
+ "MemcpyElimination": 0.026583433151245117,
100
+ "MutateDataType": 0.0020093917846679688,
101
+ "NeuronAliasDependencyInduction": 0.00018548965454101563,
102
+ "NeuronAliasDependencyReset": 0.02524423599243164,
103
+ "NeuronInstComb": 0.004286766052246094,
104
+ "NeuronLICM": 0.03554058074951172,
105
+ "NeuronLoopFusion": 0.007987260818481445,
106
+ "NeuronLoopInterchange": 0.0023233890533447266,
107
+ "NeuronSimplifier": 0.0075054168701171875,
108
+ "NeuronSimplifyPredicates": 0.12207841873168945,
109
+ "NeuronValueNumbering": 0.0038213729858398438,
110
+ "OptimizeAliasedCopyChain": 0.0005936622619628906,
111
+ "OptimizeNKIKernels": 0.44962644577026367,
112
+ "PAGLayoutOpt": 0.0999138355255127,
113
+ "PComputeCutting": 0.005170106887817383,
114
+ "PGLayoutTilingPipeline": 0.7408750057220459,
115
+ "PGTiling": 0.29245758056640625,
116
+ "PadElimination": 0.000308990478515625,
117
+ "ParAxesAnnotation": 0.05283546447753906,
118
+ "PartialLoopFusion": 0.0043125152587890625,
119
+ "PartialSimdFusion": 0.004901885986328125,
120
+ "PenguinizeFunctions": 4.3000000005122274e-05,
121
+ "PerfectLoopNest": 0.001722574234008789,
122
+ "PruneFunctions": 4.199999966658652e-05,
123
+ "RecognizeOpIdiom": 0.004076480865478516,
124
+ "Recompute": 0.0002620220184326172,
125
+ "RelaxPredicates": 0.013286828994750977,
126
+ "Rematerialization": 0.0021238327026367188,
127
+ "RemoveOptimizationBarriers": 4.3000000005122274e-05,
128
+ "ReshapeWeights": 0.0006799697875976563,
129
+ "ResolveAccessConflict": 0.0040090084075927734,
130
+ "ResolveComplicatePredicates": 0.001981496810913086,
131
+ "RewriteReplicationMatmul": 0.0021796226501464844,
132
+ "RewriteWeights": 0.0022602081298828125,
133
+ "SFKVectorizer": 0.274188756942749,
134
+ "ScatterMotion": 5.7999997807201e-05,
135
+ "SimpleAllReduceTiling": 0.009164094924926758,
136
+ "Simplifier": 0.0046122074127197266,
137
+ "SimplifyMacroPredicates": 0.010458230972290039,
138
+ "SimplifyNeuronTensor": 1.0516629219055176,
139
+ "SimplifySlice": 0.0009145736694335938,
140
+ "SimplifyTensor": 0.00577855110168457,
141
+ "SpillPSum": 0.012692689895629883,
142
+ "SplitAPUnionSets": 0.10518908500671387,
143
+ "SplitAccGrp": 0.001172780990600586,
144
+ "StaticProfiler": 0.0124053955078125,
145
+ "StaticTransposeLocalTensor": 0.0038576126098632813,
146
+ "SundaISel": 0.04396390914916992,
147
+ "TCTransform": 0.0018804073333740234,
148
+ "TensorInitialization": 0.012793779373168945,
149
+ "TensorOpSimplifier": 0.0045316219329833984,
150
+ "TensorOpTransform": 0.021115541458129883,
151
+ "TensorizerLegalizationPass": 6.999999459367245e-05,
152
+ "TileCCOps": 0.0056231021881103516,
153
+ "TilingProfiler": 0.00790858268737793,
154
+ "TransformConvOp": 0.0030431747436523438,
155
+ "TritiumFusion": 0.03186154365539551,
156
+ "ValueNumbering": 0.0038623809814453125,
157
+ "VectorizeDMA": 0.0021522045135498047,
158
+ "VectorizeMatMult": 0.003453969955444336,
159
+ "VerifySupportedOps": 3.300000025774352e-05,
160
+ "WeightCoalescing": 0.009244203567504883,
161
+ "ZeroSizeTensorElimination": 0.00011420249938964844,
162
+ "algsimp": 0.0026100000832229853,
163
+ "batchnorm_expander": 3.9999998989515007e-05,
164
+ "boundary-marker-removal": 1.2000000424450263e-05,
165
+ "call-inliner": 0.00046499999007210135,
166
+ "canonicalize-boundary-marker": 1.8000000636675395e-05,
167
+ "collective-stream-id-checker": 9.200000204145908e-05,
168
+ "comparison-expander": 0.0005959999980404973,
169
+ "computation-deduplicator": 6.900000153109431e-05,
170
+ "conditional-to-select": 1.700000029813964e-05,
171
+ "config-lowering": 7.79999973019585e-05,
172
+ "constant-statistics": 0.0005530000198632479,
173
+ "constant_folding": 0.0003320000250823796,
174
+ "cse": 3.7000001611886546e-05,
175
+ "dce": 7.800000457791612e-05,
176
+ "dot_decomposer": 0.0014440000522881746,
177
+ "dynamic-slice-transpose": 1.2000000424450263e-05,
178
+ "eliminate-redundant-compare": 0.00028100001509301364,
179
+ "emit-offloaded-dropout": 4.099999932805076e-05,
180
+ "flatten-call-graph": 0.0009379999246448278,
181
+ "fuse-send-recv": 7.200000254670158e-05,
182
+ "hilo::LegalizeAlias": 1.2999999853491317e-05,
183
+ "hilo::NeuronInstCombine": 0.00010099999781232327,
184
+ "hilo::NeuronOpFusion": 2.5000001187436283e-05,
185
+ "hilo::ReplaceTokenTypeWithU8Pass": 4.3000000005122274e-05,
186
+ "hilo::ScheduleFusion": 1.9999999949504854e-06,
187
+ "hilo::SixtyFourHack": 8.900000102585182e-05,
188
+ "hilo::VerifyAliasing": 4.999999873689376e-06,
189
+ "hlo-mac-count": 0.0012799999676644802,
190
+ "hlo-verifier": 0.007751000113785267,
191
+ "instruction-histogram": 0.0006590000120922923,
192
+ "io-con-pipe-begin": 6.000000212225132e-06,
193
+ "io-con-pipe-end": 9.999999974752427e-07,
194
+ "io-layout-normalization": 0.0014029999729245901,
195
+ "io-statistics": 6.199999916134402e-05,
196
+ "legalize-ccops": 3.999999989900971e-06,
197
+ "legalize-compare": 1.1000000085914508e-05,
198
+ "lower-argminmax-custom-call": 1.2000000424450263e-05,
199
+ "map-inline": 0.0008909999742172658,
200
+ "metadata-naming": 5.7999997807201e-05,
201
+ "mlir::detail::OpToOpPassAdaptor": 0.00016799999866634607,
202
+ "mlir::hlo::MhloToPyPenguin": 0.0028260000981390476,
203
+ "mlir::mhlo::LowerComplexExtraPass": 0.00026000000070780516,
204
+ "mlir::mhlo::LowerComplexPass": 0.0002699999895412475,
205
+ "native-to-custom-softmax": 0.0007219999679364264,
206
+ "native-to-custom-softmax-dx": 0.0005740000051446259,
207
+ "operand_upcaster": 6.399999983841553e-05,
208
+ "opt-barrier-removal": 0.0005649999948218465,
209
+ "post-par-pipe-begin": 9.600000339560211e-05,
210
+ "post-par-pipe-end": 0.0,
211
+ "post-partition-simplification": 0.0016929999692365527,
212
+ "pre-par-pipe-begin": 9.999999974752427e-07,
213
+ "pre-par-pipe-end": 0.0,
214
+ "pre-partition-simplification": 0.1934960037469864,
215
+ "replace-minimum-constant": 0.00044299999717622995,
216
+ "reshape-mover": 0.00010800000018207356,
217
+ "simplify-concat": 0.00014099999680183828,
218
+ "simplify-while-loops": 9.600000339560211e-05,
219
+ "transform-variadic-reduce": 7.900000491645187e-05,
220
+ "tuple-simplifier": 0.0002980000281240791,
221
+ "unpack-nested-aws-ntwsr": 0.0004720000142697245,
222
+ "unroll-while-loop": 1.8999999156221747e-05,
223
+ "zero_sized_hlo_elimination": 0.0008989999769255519
224
+ },
225
+ "hilo": {
226
+ "ConstantSize": 599333.0,
227
+ "HloInputCount": 475.0,
228
+ "HloMacCount": 50240159744.0,
229
+ "HloOutputCount": 73.0,
230
+ "IfmapSize": 8266543104.0,
231
+ "OfmapSize": 75497472.0,
232
+ "OutputsReadFromCount": 0.0,
233
+ "PassthroughTensorsCount": 0.0,
234
+ "RedundantOutputCount": 0.0,
235
+ "Traffic": 1663506816.0
236
+ },
237
+ "tensorizer": {
238
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 43318.0,
239
+ "StaticProfiler::AifUb": 154.8094024658203,
240
+ "StaticProfiler::ArithmeticIntensityTensorizer": 152.92723083496094,
241
+ "StaticProfiler::AverageDmaLength": 4809.89794921875,
242
+ "StaticProfiler::DDRTransferBytes": 787141440.0,
243
+ "StaticProfiler::InternalTransferBytes": 634853888.0,
244
+ "StaticProfiler::LoadExpanded": 98070.0,
245
+ "StaticProfiler::StoreExpanded": 2397.0,
246
+ "StaticProfiler::TotalDMAExpanded": 100467.0,
247
+ "StaticProfiler::TotalDynamicInstancesCount": 50670.0,
248
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 50224.0,
249
+ "StaticProfiler::TotalLNCComm": 0.0,
250
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
251
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
252
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
253
+ "TilingProfiler::GenericInstructionsAfterTiling": 4.0,
254
+ "TilingProfiler::MatMultInstructionsAfterTiling": 22848.0,
255
+ "TilingProfiler::NumPfTransposes": 5.0,
256
+ "TilingProfiler::NumPfTransposesForIo": 1.0,
257
+ "TilingProfiler::NumPfTransposesForLocal": 1.0,
258
+ "TilingProfiler::NumPfTransposesForNonlocal": 3.0,
259
+ "TilingProfiler::PfTransposeInstructions": 19201.0,
260
+ "TilingProfiler::PfTransposeInstructionsForIo": 19008.0,
261
+ "TilingProfiler::PfTransposeInstructionsForLocal": 1.0,
262
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 192.0,
263
+ "TilingProfiler::ReduceInstructionsAfterTiling": 4.0,
264
+ "TilingProfiler::SimdInstructionsAfterTiling": 158.0,
265
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
266
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
267
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
268
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
269
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
270
+ "TransformConvOp::conv2d_column_packing": 0.0,
271
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
272
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
273
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
274
+ }
275
+ },
276
+ "all": {
277
+ "compiletime": {
278
+ "algsimp": 0.0024220000486820936,
279
+ "call-inliner": 0.0004349999944679439,
280
+ "collective-stream-id-checker": 8.199999865610152e-05,
281
+ "comparison-expander": 0.0005810000002384186,
282
+ "constant-statistics": 0.0005530000198632479,
283
+ "constant_folding": 0.0003060000017285347,
284
+ "dce": 7.500000356230885e-05,
285
+ "dot_decomposer": 0.0014440000522881746,
286
+ "eliminate-redundant-compare": 0.0002690000110305846,
287
+ "flatten-call-graph": 0.0009069999796338379,
288
+ "hlo-mac-count": 0.0010560000082477927,
289
+ "hlo-verifier": 0.007164000067859888,
290
+ "instruction-histogram": 0.0006590000120922923,
291
+ "io-con-pipe-begin": 6.000000212225132e-06,
292
+ "io-con-pipe-end": 9.999999974752427e-07,
293
+ "io-layout-normalization": 0.0014029999729245901,
294
+ "io-statistics": 6.199999916134402e-05,
295
+ "map-inline": 0.0008549999911338091,
296
+ "native-to-custom-softmax": 0.0007029999978840351,
297
+ "native-to-custom-softmax-dx": 0.000522000016644597,
298
+ "opt-barrier-removal": 0.0005649999948218465,
299
+ "pre-par-pipe-begin": 9.999999974752427e-07,
300
+ "pre-par-pipe-end": 0.0,
301
+ "pre-partition-simplification": 0.1934960037469864,
302
+ "replace-minimum-constant": 0.00042100000428035855,
303
+ "reshape-mover": 9.7999996796716e-05,
304
+ "simplify-while-loops": 9.000000136438757e-05,
305
+ "tuple-simplifier": 0.00028300000121816993,
306
+ "unpack-nested-aws-ntwsr": 0.0004600000102072954,
307
+ "unroll-while-loop": 1.8999999156221747e-05,
308
+ "zero_sized_hlo_elimination": 0.0008989999769255519
309
+ }
310
+ },
311
+ "cumsum": {
312
+ "compiletime": {
313
+ "CoalesceCCOp": 0.0002090930938720703,
314
+ "DMALocalityOpt": 0.00018835067749023438,
315
+ "DMAProfiler": 0.0008924007415771484,
316
+ "DataStreaming": 0.0002593994140625,
317
+ "DoNothing": 0.00011873245239257813,
318
+ "ExpandISAMacro": 0.0005505084991455078,
319
+ "FactorizeBlkDims": 0.0004696846008300781,
320
+ "InferPSumTensor": 0.0004990100860595703,
321
+ "LateLegalizeInst": 0.0004222393035888672,
322
+ "LateNeuronInstComb": 0.0005340576171875,
323
+ "LegalizeSundaAccess": 0.0017271041870117188,
324
+ "LegalizeType": 0.0002815723419189453,
325
+ "LowerBroadcast": 0.0002243518829345703,
326
+ "LowerIntrinsics": 0.0002181529998779297,
327
+ "LowerTranspose": 0.00024199485778808594,
328
+ "NeuronInstComb": 0.0004971027374267578,
329
+ "NeuronLICM": 0.0004258155822753906,
330
+ "NeuronSimplifyPredicates": 0.002941608428955078,
331
+ "NeuronValueNumbering": 0.0004222393035888672,
332
+ "SFKVectorizer": 0.002941131591796875,
333
+ "SimpleAllReduceTiling": 0.00019812583923339844,
334
+ "SimplifyNeuronTensor": 0.00045800209045410156,
335
+ "SpillPSum": 0.0005657672882080078,
336
+ "WeightCoalescing": 0.00020837783813476563
337
+ }
338
+ },
339
+ "sg00": {
340
+ "compiletime": {
341
+ "CanonicalizeConv": 3.099999958067201e-05,
342
+ "CanonicalizeForTensorizer": 1.5999999959603883e-05,
343
+ "Canonicalizer": 0.00032900000223889947,
344
+ "HoistCompute": 3.000000106112566e-06,
345
+ "IdentifyCrossPassTensors": 2.8000000384054147e-05,
346
+ "MemcastMotion": 1.1000000085914508e-05,
347
+ "PenguinizeFunctions": 1.5999999959603883e-05,
348
+ "PruneFunctions": 1.2999999853491317e-05,
349
+ "RemoveOptimizationBarriers": 2.300000051036477e-05,
350
+ "ScatterMotion": 1.9999999494757503e-05,
351
+ "TensorizerLegalizationPass": 4.3000000005122274e-05,
352
+ "VerifySupportedOps": 1.2000000424450263e-05,
353
+ "algsimp": 6.600000051548705e-05,
354
+ "batchnorm_expander": 1.4000000192027073e-05,
355
+ "boundary-marker-removal": 3.999999989900971e-06,
356
+ "call-inliner": 9.000000318337698e-06,
357
+ "canonicalize-boundary-marker": 6.000000212225132e-06,
358
+ "collective-stream-id-checker": 3.000000106112566e-06,
359
+ "comparison-expander": 4.999999873689376e-06,
360
+ "computation-deduplicator": 1.9999999494757503e-05,
361
+ "conditional-to-select": 4.999999873689376e-06,
362
+ "config-lowering": 3.099999958067201e-05,
363
+ "constant_folding": 9.000000318337698e-06,
364
+ "cse": 1.2999999853491317e-05,
365
+ "dce": 9.999999974752427e-07,
366
+ "dynamic-slice-transpose": 3.999999989900971e-06,
367
+ "eliminate-redundant-compare": 3.999999989900971e-06,
368
+ "emit-offloaded-dropout": 1.2999999853491317e-05,
369
+ "flatten-call-graph": 9.999999747378752e-06,
370
+ "fuse-send-recv": 2.499999936844688e-05,
371
+ "hilo::LegalizeAlias": 6.000000212225132e-06,
372
+ "hilo::NeuronInstCombine": 4.3000000005122274e-05,
373
+ "hilo::NeuronOpFusion": 9.000000318337698e-06,
374
+ "hilo::ReplaceTokenTypeWithU8Pass": 1.5999999959603883e-05,
375
+ "hilo::ScheduleFusion": 9.999999974752427e-07,
376
+ "hilo::SixtyFourHack": 1.4999999621068127e-05,
377
+ "hilo::VerifyAliasing": 1.9999999949504854e-06,
378
+ "hlo-mac-count": 2.5999999706982635e-05,
379
+ "hlo-verifier": 0.0001939999929163605,
380
+ "legalize-ccops": 1.9999999949504854e-06,
381
+ "legalize-compare": 3.999999989900971e-06,
382
+ "lower-argminmax-custom-call": 3.999999989900971e-06,
383
+ "map-inline": 1.2000000424450263e-05,
384
+ "metadata-naming": 1.8000000636675395e-05,
385
+ "mlir::detail::OpToOpPassAdaptor": 1.9999999494757503e-05,
386
+ "mlir::hlo::MhloToPyPenguin": 0.0009980000322684646,
387
+ "mlir::mhlo::LowerComplexExtraPass": 8.800000068731606e-05,
388
+ "mlir::mhlo::LowerComplexPass": 0.00015999999595806003,
389
+ "native-to-custom-softmax": 7.000000096013537e-06,
390
+ "native-to-custom-softmax-dx": 1.2999999853491317e-05,
391
+ "operand_upcaster": 1.9999999494757503e-05,
392
+ "post-par-pipe-begin": 8.900000102585182e-05,
393
+ "post-par-pipe-end": 0.0,
394
+ "post-partition-simplification": 0.000582000007852912,
395
+ "replace-minimum-constant": 7.000000096013537e-06,
396
+ "reshape-mover": 3.999999989900971e-06,
397
+ "simplify-concat": 4.8000001697801054e-05,
398
+ "simplify-while-loops": 1.9999999949504854e-06,
399
+ "transform-variadic-reduce": 9.000000318337698e-06,
400
+ "tuple-simplifier": 4.999999873689376e-06,
401
+ "unpack-nested-aws-ntwsr": 3.999999989900971e-06,
402
+ "unroll-while-loop": 0.0
403
+ },
404
+ "hilo": {
405
+ "ArithmeticIntensity": 16.6773738861084,
406
+ "ConstantSize": 599333.0,
407
+ "HloInputCount": 475.0,
408
+ "HloMacCount": 5637144576.0,
409
+ "HloOutputCount": 73.0,
410
+ "IfmapSize": 8266543104.0,
411
+ "OfmapSize": 75497472.0,
412
+ "OutputsReadFromCount": 0.0,
413
+ "PassthroughTensorsCount": 0.0,
414
+ "RedundantOutputCount": 0.0,
415
+ "Traffic": 676023104.0
416
+ }
417
+ },
418
+ "sg0000": {
419
+ "compiletime": {
420
+ "AGOrderingAnalysisPass": 0.08161520957946777,
421
+ "AffinePredicateResolution": 0.001527547836303711,
422
+ "AliasDependencyElimination": 0.00012493133544921875,
423
+ "AliasDependencyInduction": 0.008615970611572266,
424
+ "AliasDependencyReset": 0.03425288200378418,
425
+ "BFComputeCutting": 0.003037691116333008,
426
+ "BirCodeGenLoop": 0.05175900459289551,
427
+ "CCOpFusion": 0.024791479110717773,
428
+ "CanonicalizeDAGForPGTiling": 0.003105640411376953,
429
+ "CanonicalizeIR": 0.0020570755004882813,
430
+ "CoalesceCCOp": 0.005420684814453125,
431
+ "CommuteConcat": 0.0015554428100585938,
432
+ "DMALocalityOpt": 0.0025992393493652344,
433
+ "DMAProfiler": 0.004426240921020508,
434
+ "DMATilingProfiler": 0.00414586067199707,
435
+ "DataLocalityOpt": 0.11810016632080078,
436
+ "DataStreaming": 0.0053942203521728516,
437
+ "DeConcat": 0.0011267662048339844,
438
+ "DeadCodeElimination": 0.0016050338745117188,
439
+ "DeadStoreElimination": 0.030996084213256836,
440
+ "DelinearIndices": 0.007958412170410156,
441
+ "Delinearization": 0.003355741500854492,
442
+ "DoNothing": 7.987022399902344e-05,
443
+ "DramToDramTranspose": 0.03346753120422363,
444
+ "DumpGraphAndMetadata": 0.005443096160888672,
445
+ "EliminateDivs": 0.004342555999755859,
446
+ "ExpandBatchNorm": 0.0018055438995361328,
447
+ "ExpandISAMacro": 0.003648519515991211,
448
+ "FactorizeBlkDims": 0.019720077514648438,
449
+ "FactorizeThreadAxesInFreeDims": 0.0019965171813964844,
450
+ "FlattenMacroLoop": 0.003274679183959961,
451
+ "GenericAccessSimplifier": 0.0009877681732177734,
452
+ "InferInitValue": 0.032111167907714844,
453
+ "InferIntrinsicOnCC": 0.014227867126464844,
454
+ "InferNeuronTensor": 0.04684329032897949,
455
+ "InferNonlocalTensors": 0.10579586029052734,
456
+ "InferPSumTensor": 0.04808926582336426,
457
+ "InlineNativeKernels": 0.0025835037231445313,
458
+ "InsertIOTransposes": 0.012038707733154297,
459
+ "InsertLocalTransposes": 0.007574796676635742,
460
+ "InsertOffloadedTransposes": 0.003882884979248047,
461
+ "LICM": 0.003116607666015625,
462
+ "LateLegalizeInst": 0.006630420684814453,
463
+ "LateLegalizePostSplit": 0.0030584335327148438,
464
+ "LateLowerReshapeOp": 0.002176046371459961,
465
+ "LateLowerTensorOp": 0.005063295364379883,
466
+ "LateNeuronInstComb": 0.024392366409301758,
467
+ "LayoutPreprocessing": 0.03173065185546875,
468
+ "LayoutPreprocessingAndAnalysis": 0.07484269142150879,
469
+ "LayoutRequirementAnalysis": 0.007186174392700195,
470
+ "LegalizeCCOpLayout": 0.003088235855102539,
471
+ "LegalizeOpLevelAlias": 0.0011813640594482422,
472
+ "LegalizePartitionReduce": 0.0013763904571533203,
473
+ "LegalizeSundaAccess": 0.04270172119140625,
474
+ "LegalizeSundaMacro": 0.009444236755371094,
475
+ "LegalizeType": 0.004534721374511719,
476
+ "LocalLayoutOpt": 0.01777815818786621,
477
+ "LoopFusion": 0.0060007572174072266,
478
+ "LoopSplitting": 0.000377655029296875,
479
+ "LowerBroadcast": 0.0016138553619384766,
480
+ "LowerCCOpBlockAxis": 0.004978179931640625,
481
+ "LowerComplexBroadcast": 0.0023903846740722656,
482
+ "LowerIntrinsics": 0.034012556076049805,
483
+ "LowerTensorOp": 0.01333928108215332,
484
+ "LowerTranspose": 0.011911869049072266,
485
+ "MacroGeneration": 0.07152104377746582,
486
+ "MaskPropagation": 0.004988193511962891,
487
+ "MemcpyElimination": 0.11162376403808594,
488
+ "MutateDataType": 0.0014476776123046875,
489
+ "NeuronAliasDependencyInduction": 0.0002269744873046875,
490
+ "NeuronAliasDependencyReset": 0.15035724639892578,
491
+ "NeuronInstComb": 0.015686750411987305,
492
+ "NeuronLICM": 0.011453866958618164,
493
+ "NeuronLoopFusion": 0.018696069717407227,
494
+ "NeuronLoopInterchange": 0.0018415451049804688,
495
+ "NeuronSimplifier": 0.011624336242675781,
496
+ "NeuronSimplifyPredicates": 0.005795955657958984,
497
+ "NeuronValueNumbering": 0.0040967464447021484,
498
+ "OptimizeAliasedCopyChain": 0.0014064311981201172,
499
+ "OptimizeNKIKernels": 0.0021300315856933594,
500
+ "PAGLayoutOpt": 0.33215951919555664,
501
+ "PComputeCutting": 0.008408308029174805,
502
+ "PGLayoutTilingPipeline": 1.3294909000396729,
503
+ "PGTiling": 0.3412203788757324,
504
+ "PadElimination": 0.0018661022186279297,
505
+ "ParAxesAnnotation": 0.29718852043151855,
506
+ "PartialLoopFusion": 0.024113893508911133,
507
+ "PartialSimdFusion": 0.029590368270874023,
508
+ "PerfectLoopNest": 0.0021219253540039063,
509
+ "RecognizeOpIdiom": 0.004444122314453125,
510
+ "Recompute": 0.00028204917907714844,
511
+ "RelaxPredicates": 0.004793405532836914,
512
+ "Rematerialization": 0.004267692565917969,
513
+ "ReshapeWeights": 0.0014717578887939453,
514
+ "ResolveAccessConflict": 0.0038602352142333984,
515
+ "ResolveComplicatePredicates": 0.001505136489868164,
516
+ "RewriteReplicationMatmul": 0.0020885467529296875,
517
+ "RewriteWeights": 0.003512144088745117,
518
+ "SFKVectorizer": 0.3296499252319336,
519
+ "SimpleAllReduceTiling": 0.002294301986694336,
520
+ "Simplifier": 0.004443168640136719,
521
+ "SimplifyMacroPredicates": 0.013223648071289063,
522
+ "SimplifyNeuronTensor": 0.011357307434082031,
523
+ "SimplifySlice": 0.0010068416595458984,
524
+ "SimplifyTensor": 0.006380319595336914,
525
+ "SpillPSum": 0.018645763397216797,
526
+ "SplitAPUnionSets": 0.031983375549316406,
527
+ "SplitAccGrp": 0.0017464160919189453,
528
+ "StaticProfiler": 0.004789590835571289,
529
+ "StaticTransposeLocalTensor": 0.0048563480377197266,
530
+ "SundaISel": 0.046004533767700195,
531
+ "TCTransform": 0.0017864704132080078,
532
+ "TensorInitialization": 0.015267372131347656,
533
+ "TensorOpSimplifier": 0.006502866744995117,
534
+ "TensorOpTransform": 0.029101848602294922,
535
+ "TileCCOps": 0.0055658817291259766,
536
+ "TilingProfiler": 0.014283895492553711,
537
+ "TransformConvOp": 0.0028002262115478516,
538
+ "TritiumFusion": 0.037850379943847656,
539
+ "ValueNumbering": 0.002534627914428711,
540
+ "VectorizeDMA": 0.0056002140045166016,
541
+ "VectorizeMatMult": 0.004069805145263672,
542
+ "WeightCoalescing": 0.0033059120178222656,
543
+ "ZeroSizeTensorElimination": 0.00012040138244628906
544
+ },
545
+ "tensorizer": {
546
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 1945.0,
547
+ "StaticProfiler::AifUb": 18.54642677307129,
548
+ "StaticProfiler::ArithmeticIntensityTensorizer": 234.4757080078125,
549
+ "StaticProfiler::AverageDmaLength": 3607.790283203125,
550
+ "StaticProfiler::AverageFractalPeUtilization": 99.84349822998047,
551
+ "StaticProfiler::AveragePartitionUtilization": 96.70350646972656,
552
+ "StaticProfiler::AveragePeUtilization": 99.51932525634766,
553
+ "StaticProfiler::DDRTransferBytes": 53226752.0,
554
+ "StaticProfiler::InternalTransferBytes": 27462656.0,
555
+ "StaticProfiler::LoadExpanded": 10244.0,
556
+ "StaticProfiler::LocalizationEfficiency": 1264.2635498046875,
557
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1466.4949951171875,
558
+ "StaticProfiler::StoreExpanded": 3713.0,
559
+ "StaticProfiler::TotalDMAExpanded": 13957.0,
560
+ "StaticProfiler::TotalDynamicInstancesCount": 2107.0,
561
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 2103.0,
562
+ "StaticProfiler::TotalLNCComm": 0.0,
563
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
564
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
565
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
566
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
567
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
568
+ "TilingProfiler::GenericInstructionsAfterTiling": 24.0,
569
+ "TilingProfiler::MatMultInstructionsAfterTiling": 1010.0,
570
+ "TilingProfiler::NumPfTransposes": 6.0,
571
+ "TilingProfiler::NumPfTransposesForIo": 0.0,
572
+ "TilingProfiler::NumPfTransposesForLocal": 5.0,
573
+ "TilingProfiler::NumPfTransposesForNonlocal": 1.0,
574
+ "TilingProfiler::PfTransposeInstructions": 176.0,
575
+ "TilingProfiler::PfTransposeInstructionsForIo": 0.0,
576
+ "TilingProfiler::PfTransposeInstructionsForLocal": 144.0,
577
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 32.0,
578
+ "TilingProfiler::ReduceInstructionsAfterTiling": 0.0,
579
+ "TilingProfiler::SimdInstructionsAfterTiling": 177.0,
580
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
581
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
582
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
583
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
584
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
585
+ "TransformConvOp::conv2d_column_packing": 0.0,
586
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
587
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
588
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
589
+ }
590
+ },
591
+ "sg0001": {
592
+ "compiletime": {
593
+ "AGOrderingAnalysisPass": 0.035902976989746094,
594
+ "AffinePredicateResolution": 0.0021402835845947266,
595
+ "AliasDependencyElimination": 0.0001494884490966797,
596
+ "AliasDependencyInduction": 0.00843667984008789,
597
+ "AliasDependencyReset": 0.07715225219726563,
598
+ "BFComputeCutting": 0.002821207046508789,
599
+ "BirCodeGenLoop": 0.03489971160888672,
600
+ "CCOpFusion": 0.03669166564941406,
601
+ "CanonicalizeDAGForPGTiling": 0.0034394264221191406,
602
+ "CanonicalizeIR": 0.001888275146484375,
603
+ "CoalesceCCOp": 0.0048944950103759766,
604
+ "CommuteConcat": 0.001985788345336914,
605
+ "DMALocalityOpt": 0.0010595321655273438,
606
+ "DMAProfiler": 0.0038537979125976563,
607
+ "DMATilingProfiler": 0.0052776336669921875,
608
+ "DataLocalityOpt": 0.13663840293884277,
609
+ "DataStreaming": 0.004033327102661133,
610
+ "DeConcat": 0.0017592906951904297,
611
+ "DeadCodeElimination": 0.0027074813842773438,
612
+ "DeadStoreElimination": 0.03486442565917969,
613
+ "DelinearIndices": 0.010581493377685547,
614
+ "Delinearization": 0.004877567291259766,
615
+ "DoNothing": 6.914138793945313e-05,
616
+ "DramToDramTranspose": 0.03982400894165039,
617
+ "DumpGraphAndMetadata": 0.004088640213012695,
618
+ "EliminateDivs": 0.0045583248138427734,
619
+ "ExpandBatchNorm": 0.0018122196197509766,
620
+ "ExpandISAMacro": 0.0023725032806396484,
621
+ "FactorizeBlkDims": 0.013248920440673828,
622
+ "FactorizeThreadAxesInFreeDims": 0.0023849010467529297,
623
+ "FlattenMacroLoop": 0.0036728382110595703,
624
+ "GenericAccessSimplifier": 0.0026085376739501953,
625
+ "InferInitValue": 0.038416147232055664,
626
+ "InferIntrinsicOnCC": 0.010096549987792969,
627
+ "InferNeuronTensor": 0.05150651931762695,
628
+ "InferNonlocalTensors": 0.031507015228271484,
629
+ "InferPSumTensor": 0.03166079521179199,
630
+ "InlineNativeKernels": 0.0021262168884277344,
631
+ "InsertIOTransposes": 0.022419452667236328,
632
+ "InsertLocalTransposes": 0.0071408748626708984,
633
+ "InsertOffloadedTransposes": 0.0034465789794921875,
634
+ "LICM": 0.004317283630371094,
635
+ "LateLegalizeInst": 0.004563570022583008,
636
+ "LateLegalizePostSplit": 0.0027570724487304688,
637
+ "LateLowerReshapeOp": 0.0013232231140136719,
638
+ "LateLowerTensorOp": 0.004618406295776367,
639
+ "LateNeuronInstComb": 0.020873546600341797,
640
+ "LayoutPreprocessing": 0.037287235260009766,
641
+ "LayoutPreprocessingAndAnalysis": 0.10860347747802734,
642
+ "LayoutRequirementAnalysis": 0.007799863815307617,
643
+ "LegalizeCCOpLayout": 0.001935720443725586,
644
+ "LegalizeOpLevelAlias": 0.0012698173522949219,
645
+ "LegalizePartitionReduce": 0.002346515655517578,
646
+ "LegalizeSundaAccess": 0.016484975814819336,
647
+ "LegalizeSundaMacro": 0.011503934860229492,
648
+ "LegalizeType": 0.0047261714935302734,
649
+ "LocalLayoutOpt": 0.02424001693725586,
650
+ "LoopFusion": 0.007829427719116211,
651
+ "LoopSplitting": 0.00044846534729003906,
652
+ "LowerBroadcast": 0.0014789104461669922,
653
+ "LowerCCOpBlockAxis": 0.0059947967529296875,
654
+ "LowerComplexBroadcast": 0.0023598670959472656,
655
+ "LowerIntrinsics": 0.035590410232543945,
656
+ "LowerTensorOp": 0.012118339538574219,
657
+ "LowerTranspose": 0.011335611343383789,
658
+ "MacroGeneration": 0.11938071250915527,
659
+ "MaskPropagation": 0.003367900848388672,
660
+ "MemcpyElimination": 0.10591435432434082,
661
+ "MutateDataType": 0.002183198928833008,
662
+ "NeuronAliasDependencyInduction": 0.0002372264862060547,
663
+ "NeuronAliasDependencyReset": 0.02314162254333496,
664
+ "NeuronInstComb": 0.01471090316772461,
665
+ "NeuronLICM": 0.007970094680786133,
666
+ "NeuronLoopFusion": 0.022555112838745117,
667
+ "NeuronLoopInterchange": 0.0015497207641601563,
668
+ "NeuronSimplifier": 0.012836694717407227,
669
+ "NeuronSimplifyPredicates": 0.001605987548828125,
670
+ "NeuronValueNumbering": 0.0046231746673583984,
671
+ "OptimizeAliasedCopyChain": 0.00162506103515625,
672
+ "OptimizeNKIKernels": 0.0015685558319091797,
673
+ "PAGLayoutOpt": 0.14427471160888672,
674
+ "PComputeCutting": 0.00727081298828125,
675
+ "PGLayoutTilingPipeline": 1.2423913478851318,
676
+ "PGTiling": 0.5181164741516113,
677
+ "PadElimination": 0.00038051605224609375,
678
+ "ParAxesAnnotation": 0.09470343589782715,
679
+ "PartialLoopFusion": 0.018784761428833008,
680
+ "PartialSimdFusion": 0.027338027954101563,
681
+ "PerfectLoopNest": 0.0021829605102539063,
682
+ "RecognizeOpIdiom": 0.0048656463623046875,
683
+ "Recompute": 0.0002601146697998047,
684
+ "RelaxPredicates": 0.0033593177795410156,
685
+ "Rematerialization": 0.0023822784423828125,
686
+ "ReshapeWeights": 0.0014538764953613281,
687
+ "ResolveAccessConflict": 0.0047032833099365234,
688
+ "ResolveComplicatePredicates": 0.0019354820251464844,
689
+ "RewriteReplicationMatmul": 0.002605438232421875,
690
+ "RewriteWeights": 0.004354715347290039,
691
+ "SFKVectorizer": 0.16805624961853027,
692
+ "SimpleAllReduceTiling": 0.0025529861450195313,
693
+ "Simplifier": 0.00439763069152832,
694
+ "SimplifyMacroPredicates": 0.007683992385864258,
695
+ "SimplifyNeuronTensor": 0.0066149234771728516,
696
+ "SimplifySlice": 0.0023670196533203125,
697
+ "SimplifyTensor": 0.0063228607177734375,
698
+ "SpillPSum": 0.01709151268005371,
699
+ "SplitAPUnionSets": 0.018975019454956055,
700
+ "SplitAccGrp": 0.002074003219604492,
701
+ "StaticProfiler": 0.0037796497344970703,
702
+ "StaticTransposeLocalTensor": 0.005953311920166016,
703
+ "SundaISel": 0.0426335334777832,
704
+ "TCTransform": 0.0011513233184814453,
705
+ "TensorInitialization": 0.002532958984375,
706
+ "TensorOpSimplifier": 0.006600379943847656,
707
+ "TensorOpTransform": 0.034122467041015625,
708
+ "TileCCOps": 0.0059397220611572266,
709
+ "TilingProfiler": 0.013670921325683594,
710
+ "TransformConvOp": 0.002622365951538086,
711
+ "TritiumFusion": 0.05379676818847656,
712
+ "ValueNumbering": 0.0030698776245117188,
713
+ "VectorizeDMA": 0.0016117095947265625,
714
+ "VectorizeMatMult": 0.005866289138793945,
715
+ "WeightCoalescing": 0.0026290416717529297,
716
+ "ZeroSizeTensorElimination": 0.00011897087097167969
717
+ },
718
+ "tensorizer": {
719
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 6049.0,
720
+ "StaticProfiler::AifUb": 251.7889862060547,
721
+ "StaticProfiler::ArithmeticIntensityTensorizer": 253.54466247558594,
722
+ "StaticProfiler::AverageDmaLength": 6385.9599609375,
723
+ "StaticProfiler::AverageFractalPeUtilization": 100.0,
724
+ "StaticProfiler::AveragePartitionUtilization": 99.86996459960938,
725
+ "StaticProfiler::AveragePeUtilization": 100.0,
726
+ "StaticProfiler::DDRTransferBytes": 204350464.0,
727
+ "StaticProfiler::InternalTransferBytes": 21430272.0,
728
+ "StaticProfiler::LoadExpanded": 27520.0,
729
+ "StaticProfiler::LocalizationEfficiency": 100.69728088378906,
730
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 105.00786590576172,
731
+ "StaticProfiler::StoreExpanded": 2305.0,
732
+ "StaticProfiler::TotalDMAExpanded": 29825.0,
733
+ "StaticProfiler::TotalDynamicInstancesCount": 6153.0,
734
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 6153.0,
735
+ "StaticProfiler::TotalLNCComm": 0.0,
736
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
737
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
738
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
739
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
740
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
741
+ "TilingProfiler::GenericInstructionsAfterTiling": 16.0,
742
+ "TilingProfiler::MatMultInstructionsAfterTiling": 4848.0,
743
+ "TilingProfiler::NumPfTransposes": 8.0,
744
+ "TilingProfiler::NumPfTransposesForIo": 3.0,
745
+ "TilingProfiler::NumPfTransposesForLocal": 3.0,
746
+ "TilingProfiler::NumPfTransposesForNonlocal": 2.0,
747
+ "TilingProfiler::PfTransposeInstructions": 276.0,
748
+ "TilingProfiler::PfTransposeInstructionsForIo": 68.0,
749
+ "TilingProfiler::PfTransposeInstructionsForLocal": 80.0,
750
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 128.0,
751
+ "TilingProfiler::ReduceInstructionsAfterTiling": 0.0,
752
+ "TilingProfiler::SimdInstructionsAfterTiling": 216.0,
753
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
754
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
755
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
756
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
757
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
758
+ "TransformConvOp::conv2d_column_packing": 0.0,
759
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
760
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
761
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
762
+ }
763
+ },
764
+ "sg0002": {
765
+ "compiletime": {
766
+ "AGOrderingAnalysisPass": 0.019578933715820313,
767
+ "AffinePredicateResolution": 0.0019481182098388672,
768
+ "AliasDependencyElimination": 0.0001239776611328125,
769
+ "AliasDependencyInduction": 0.00577092170715332,
770
+ "AliasDependencyReset": 0.027690649032592773,
771
+ "BFComputeCutting": 0.0023322105407714844,
772
+ "BirCodeGenLoop": 0.4628438949584961,
773
+ "CCOpFusion": 0.022275209426879883,
774
+ "CanonicalizeDAGForPGTiling": 0.005593061447143555,
775
+ "CanonicalizeIR": 0.001634359359741211,
776
+ "CoalesceCCOp": 0.015367984771728516,
777
+ "CommuteConcat": 0.0008616447448730469,
778
+ "DMALocalityOpt": 0.007138729095458984,
779
+ "DMAProfiler": 0.011677265167236328,
780
+ "DMATilingProfiler": 0.0037431716918945313,
781
+ "DataLocalityOpt": 0.06741714477539063,
782
+ "DataStreaming": 0.03589940071105957,
783
+ "DeConcat": 0.0005049705505371094,
784
+ "DeadCodeElimination": 0.0009002685546875,
785
+ "DeadStoreElimination": 0.0056514739990234375,
786
+ "DelinearIndices": 0.004773139953613281,
787
+ "Delinearization": 0.0026137828826904297,
788
+ "DoNothing": 7.462501525878906e-05,
789
+ "DramToDramTranspose": 0.019293546676635742,
790
+ "DumpGraphAndMetadata": 0.10360383987426758,
791
+ "EliminateDivs": 0.003831148147583008,
792
+ "ExpandBatchNorm": 0.0019576549530029297,
793
+ "ExpandISAMacro": 0.011517524719238281,
794
+ "FactorizeBlkDims": 0.008472919464111328,
795
+ "FactorizeThreadAxesInFreeDims": 0.001847982406616211,
796
+ "FlattenMacroLoop": 0.003529787063598633,
797
+ "GenericAccessSimplifier": 0.0008223056793212891,
798
+ "InferInitValue": 0.025947093963623047,
799
+ "InferIntrinsicOnCC": 0.00908350944519043,
800
+ "InferNeuronTensor": 0.02371978759765625,
801
+ "InferNonlocalTensors": 0.014753341674804688,
802
+ "InferPSumTensor": 0.3085360527038574,
803
+ "InlineNativeKernels": 0.008690595626831055,
804
+ "InsertIOTransposes": 0.01906275749206543,
805
+ "InsertLocalTransposes": 0.004312276840209961,
806
+ "InsertOffloadedTransposes": 0.002802133560180664,
807
+ "LICM": 0.003081083297729492,
808
+ "LateLegalizeInst": 0.01367807388305664,
809
+ "LateLegalizePostSplit": 0.012533903121948242,
810
+ "LateLowerReshapeOp": 0.001035451889038086,
811
+ "LateLowerTensorOp": 0.002605438232421875,
812
+ "LateNeuronInstComb": 0.008839130401611328,
813
+ "LayoutPreprocessing": 0.03434133529663086,
814
+ "LayoutPreprocessingAndAnalysis": 0.07319903373718262,
815
+ "LayoutRequirementAnalysis": 0.005194187164306641,
816
+ "LegalizeCCOpLayout": 0.0025322437286376953,
817
+ "LegalizeOpLevelAlias": 0.0020308494567871094,
818
+ "LegalizePartitionReduce": 0.0010001659393310547,
819
+ "LegalizeSundaAccess": 0.07694768905639648,
820
+ "LegalizeSundaMacro": 0.011176109313964844,
821
+ "LegalizeType": 0.014355182647705078,
822
+ "LocalLayoutOpt": 0.014019250869750977,
823
+ "LoopFusion": 0.005472898483276367,
824
+ "LoopSplitting": 0.00038623809814453125,
825
+ "LowerBroadcast": 0.0025022029876708984,
826
+ "LowerCCOpBlockAxis": 0.0058476924896240234,
827
+ "LowerComplexBroadcast": 0.00213623046875,
828
+ "LowerIntrinsics": 0.30684900283813477,
829
+ "LowerTensorOp": 0.010679960250854492,
830
+ "LowerTranspose": 0.012311697006225586,
831
+ "MacroGeneration": 0.029733657836914063,
832
+ "MaskPropagation": 0.0028328895568847656,
833
+ "MemcpyElimination": 0.026583433151245117,
834
+ "MutateDataType": 0.0020093917846679688,
835
+ "NeuronAliasDependencyInduction": 0.00018548965454101563,
836
+ "NeuronAliasDependencyReset": 0.02524423599243164,
837
+ "NeuronInstComb": 0.003789663314819336,
838
+ "NeuronLICM": 0.03511476516723633,
839
+ "NeuronLoopFusion": 0.007987260818481445,
840
+ "NeuronLoopInterchange": 0.0023233890533447266,
841
+ "NeuronSimplifier": 0.0075054168701171875,
842
+ "NeuronSimplifyPredicates": 0.11913681030273438,
843
+ "NeuronValueNumbering": 0.0033991336822509766,
844
+ "OptimizeAliasedCopyChain": 0.0005936622619628906,
845
+ "OptimizeNKIKernels": 0.44962644577026367,
846
+ "PAGLayoutOpt": 0.0999138355255127,
847
+ "PComputeCutting": 0.005170106887817383,
848
+ "PGLayoutTilingPipeline": 0.7408750057220459,
849
+ "PGTiling": 0.29245758056640625,
850
+ "PadElimination": 0.000308990478515625,
851
+ "ParAxesAnnotation": 0.05283546447753906,
852
+ "PartialLoopFusion": 0.0043125152587890625,
853
+ "PartialSimdFusion": 0.004901885986328125,
854
+ "PerfectLoopNest": 0.001722574234008789,
855
+ "RecognizeOpIdiom": 0.004076480865478516,
856
+ "Recompute": 0.0002620220184326172,
857
+ "RelaxPredicates": 0.013286828994750977,
858
+ "Rematerialization": 0.0021238327026367188,
859
+ "ReshapeWeights": 0.0006799697875976563,
860
+ "ResolveAccessConflict": 0.0040090084075927734,
861
+ "ResolveComplicatePredicates": 0.001981496810913086,
862
+ "RewriteReplicationMatmul": 0.0021796226501464844,
863
+ "RewriteWeights": 0.0022602081298828125,
864
+ "SFKVectorizer": 0.27124762535095215,
865
+ "SimpleAllReduceTiling": 0.00896596908569336,
866
+ "Simplifier": 0.0046122074127197266,
867
+ "SimplifyMacroPredicates": 0.010458230972290039,
868
+ "SimplifyNeuronTensor": 1.0512049198150635,
869
+ "SimplifySlice": 0.0009145736694335938,
870
+ "SimplifyTensor": 0.00577855110168457,
871
+ "SpillPSum": 0.012126922607421875,
872
+ "SplitAPUnionSets": 0.10518908500671387,
873
+ "SplitAccGrp": 0.001172780990600586,
874
+ "StaticProfiler": 0.0124053955078125,
875
+ "StaticTransposeLocalTensor": 0.0038576126098632813,
876
+ "SundaISel": 0.04396390914916992,
877
+ "TCTransform": 0.0018804073333740234,
878
+ "TensorInitialization": 0.012793779373168945,
879
+ "TensorOpSimplifier": 0.0045316219329833984,
880
+ "TensorOpTransform": 0.021115541458129883,
881
+ "TileCCOps": 0.0056231021881103516,
882
+ "TilingProfiler": 0.00790858268737793,
883
+ "TransformConvOp": 0.0030431747436523438,
884
+ "TritiumFusion": 0.03186154365539551,
885
+ "ValueNumbering": 0.0038623809814453125,
886
+ "VectorizeDMA": 0.0021522045135498047,
887
+ "VectorizeMatMult": 0.003453969955444336,
888
+ "WeightCoalescing": 0.009035825729370117,
889
+ "ZeroSizeTensorElimination": 0.00011420249938964844
890
+ },
891
+ "tensorizer": {
892
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 43318.0,
893
+ "StaticProfiler::AifUb": 154.8094024658203,
894
+ "StaticProfiler::ArithmeticIntensityTensorizer": 152.92723083496094,
895
+ "StaticProfiler::AverageDmaLength": 4809.89794921875,
896
+ "StaticProfiler::AverageFractalPeUtilization": 99.65782165527344,
897
+ "StaticProfiler::AveragePartitionUtilization": 97.58238220214844,
898
+ "StaticProfiler::AveragePeUtilization": 98.61824035644531,
899
+ "StaticProfiler::DDRTransferBytes": 787141440.0,
900
+ "StaticProfiler::InternalTransferBytes": 634853888.0,
901
+ "StaticProfiler::LoadExpanded": 98070.0,
902
+ "StaticProfiler::LocalizationEfficiency": 98.78419494628906,
903
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 100.47209167480469,
904
+ "StaticProfiler::StoreExpanded": 2397.0,
905
+ "StaticProfiler::TotalDMAExpanded": 100467.0,
906
+ "StaticProfiler::TotalDynamicInstancesCount": 50670.0,
907
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 50224.0,
908
+ "StaticProfiler::TotalLNCComm": 0.0,
909
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
910
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
911
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
912
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
913
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
914
+ "TilingProfiler::GenericInstructionsAfterTiling": 4.0,
915
+ "TilingProfiler::MatMultInstructionsAfterTiling": 22848.0,
916
+ "TilingProfiler::NumPfTransposes": 5.0,
917
+ "TilingProfiler::NumPfTransposesForIo": 1.0,
918
+ "TilingProfiler::NumPfTransposesForLocal": 1.0,
919
+ "TilingProfiler::NumPfTransposesForNonlocal": 3.0,
920
+ "TilingProfiler::PfTransposeInstructions": 19201.0,
921
+ "TilingProfiler::PfTransposeInstructionsForIo": 19008.0,
922
+ "TilingProfiler::PfTransposeInstructionsForLocal": 1.0,
923
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 192.0,
924
+ "TilingProfiler::ReduceInstructionsAfterTiling": 4.0,
925
+ "TilingProfiler::SimdInstructionsAfterTiling": 158.0,
926
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
927
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
928
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
929
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
930
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
931
+ "TransformConvOp::conv2d_column_packing": 0.0,
932
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
933
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
934
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
935
+ }
936
+ },
937
+ "sg01": {
938
+ "compiletime": {
939
+ "CanonicalizeConv": 1.9999999949504854e-06,
940
+ "CanonicalizeForTensorizer": 1.4999999621068127e-05,
941
+ "Canonicalizer": 0.0002589999930933118,
942
+ "HoistCompute": 4.999999873689376e-06,
943
+ "IdentifyCrossPassTensors": 7.999999979801942e-06,
944
+ "MemcastMotion": 7.999999979801942e-06,
945
+ "PenguinizeFunctions": 1.5999999959603883e-05,
946
+ "PruneFunctions": 2.099999983329326e-05,
947
+ "RemoveOptimizationBarriers": 7.999999979801942e-06,
948
+ "ScatterMotion": 3.7999998312443495e-05,
949
+ "TensorizerLegalizationPass": 1.9999999494757503e-05,
950
+ "VerifySupportedOps": 9.999999747378752e-06,
951
+ "algsimp": 6.199999916134402e-05,
952
+ "batchnorm_expander": 1.2999999853491317e-05,
953
+ "boundary-marker-removal": 3.999999989900971e-06,
954
+ "call-inliner": 9.999999747378752e-06,
955
+ "canonicalize-boundary-marker": 6.000000212225132e-06,
956
+ "collective-stream-id-checker": 3.999999989900971e-06,
957
+ "comparison-expander": 4.999999873689376e-06,
958
+ "computation-deduplicator": 2.4000000848900527e-05,
959
+ "conditional-to-select": 4.999999873689376e-06,
960
+ "config-lowering": 2.099999983329326e-05,
961
+ "constant_folding": 7.999999979801942e-06,
962
+ "cse": 1.2000000424450263e-05,
963
+ "dce": 9.999999974752427e-07,
964
+ "dynamic-slice-transpose": 3.999999989900971e-06,
965
+ "eliminate-redundant-compare": 3.999999989900971e-06,
966
+ "emit-offloaded-dropout": 1.4000000192027073e-05,
967
+ "flatten-call-graph": 9.000000318337698e-06,
968
+ "fuse-send-recv": 2.9000000722589903e-05,
969
+ "hilo::LegalizeAlias": 4.999999873689376e-06,
970
+ "hilo::NeuronInstCombine": 4.5000000682193786e-05,
971
+ "hilo::NeuronOpFusion": 1.5999999959603883e-05,
972
+ "hilo::ReplaceTokenTypeWithU8Pass": 1.1000000085914508e-05,
973
+ "hilo::ScheduleFusion": 9.999999974752427e-07,
974
+ "hilo::SixtyFourHack": 1.4999999621068127e-05,
975
+ "hilo::VerifyAliasing": 1.9999999949504854e-06,
976
+ "hlo-mac-count": 2.5999999706982635e-05,
977
+ "hlo-verifier": 0.00020500000391621143,
978
+ "legalize-ccops": 9.999999974752427e-07,
979
+ "legalize-compare": 3.999999989900971e-06,
980
+ "lower-argminmax-custom-call": 3.999999989900971e-06,
981
+ "map-inline": 1.2000000424450263e-05,
982
+ "metadata-naming": 2.499999936844688e-05,
983
+ "mlir::detail::OpToOpPassAdaptor": 0.00012799999967683107,
984
+ "mlir::hlo::MhloToPyPenguin": 0.0009619999909773469,
985
+ "mlir::mhlo::LowerComplexExtraPass": 8.099999831756577e-05,
986
+ "mlir::mhlo::LowerComplexPass": 3.999999989900971e-06,
987
+ "native-to-custom-softmax": 6.000000212225132e-06,
988
+ "native-to-custom-softmax-dx": 1.5999999959603883e-05,
989
+ "operand_upcaster": 2.099999983329326e-05,
990
+ "post-par-pipe-begin": 4.999999873689376e-06,
991
+ "post-par-pipe-end": 0.0,
992
+ "post-partition-simplification": 0.0005779999773949385,
993
+ "replace-minimum-constant": 6.000000212225132e-06,
994
+ "reshape-mover": 3.000000106112566e-06,
995
+ "simplify-concat": 4.8999998398358e-05,
996
+ "simplify-while-loops": 1.9999999949504854e-06,
997
+ "transform-variadic-reduce": 9.000000318337698e-06,
998
+ "tuple-simplifier": 4.999999873689376e-06,
999
+ "unpack-nested-aws-ntwsr": 3.999999989900971e-06,
1000
+ "unroll-while-loop": 0.0
1001
+ },
1002
+ "hilo": {
1003
+ "ArithmeticIntensity": 240.22828674316406,
1004
+ "HloMacCount": 24964497408.0,
1005
+ "Traffic": 207839776.0
1006
+ }
1007
+ },
1008
+ "sg02": {
1009
+ "compiletime": {
1010
+ "CanonicalizeConv": 0.0,
1011
+ "CanonicalizeForTensorizer": 1.2999999853491317e-05,
1012
+ "Canonicalizer": 0.000311999989207834,
1013
+ "HoistCompute": 0.0,
1014
+ "IdentifyCrossPassTensors": 1.2000000424450263e-05,
1015
+ "MemcastMotion": 0.0,
1016
+ "PenguinizeFunctions": 1.1000000085914508e-05,
1017
+ "PruneFunctions": 7.999999979801942e-06,
1018
+ "RemoveOptimizationBarriers": 1.2000000424450263e-05,
1019
+ "ScatterMotion": 0.0,
1020
+ "TensorizerLegalizationPass": 7.000000096013537e-06,
1021
+ "VerifySupportedOps": 1.1000000085914508e-05,
1022
+ "algsimp": 5.999999848427251e-05,
1023
+ "batchnorm_expander": 1.2999999853491317e-05,
1024
+ "boundary-marker-removal": 3.999999989900971e-06,
1025
+ "call-inliner": 1.1000000085914508e-05,
1026
+ "canonicalize-boundary-marker": 6.000000212225132e-06,
1027
+ "collective-stream-id-checker": 3.000000106112566e-06,
1028
+ "comparison-expander": 4.999999873689376e-06,
1029
+ "computation-deduplicator": 2.499999936844688e-05,
1030
+ "conditional-to-select": 7.000000096013537e-06,
1031
+ "config-lowering": 2.5999999706982635e-05,
1032
+ "constant_folding": 9.000000318337698e-06,
1033
+ "cse": 1.2000000424450263e-05,
1034
+ "dce": 9.999999974752427e-07,
1035
+ "dynamic-slice-transpose": 3.999999989900971e-06,
1036
+ "eliminate-redundant-compare": 3.999999989900971e-06,
1037
+ "emit-offloaded-dropout": 1.4000000192027073e-05,
1038
+ "flatten-call-graph": 1.2000000424450263e-05,
1039
+ "fuse-send-recv": 1.8000000636675395e-05,
1040
+ "hilo::LegalizeAlias": 1.9999999949504854e-06,
1041
+ "hilo::NeuronInstCombine": 1.2999999853491317e-05,
1042
+ "hilo::NeuronOpFusion": 0.0,
1043
+ "hilo::ReplaceTokenTypeWithU8Pass": 1.5999999959603883e-05,
1044
+ "hilo::ScheduleFusion": 0.0,
1045
+ "hilo::SixtyFourHack": 5.900000178371556e-05,
1046
+ "hilo::VerifyAliasing": 9.999999974752427e-07,
1047
+ "hlo-mac-count": 0.0001720000000204891,
1048
+ "hlo-verifier": 0.0001880000054370612,
1049
+ "legalize-ccops": 9.999999974752427e-07,
1050
+ "legalize-compare": 3.000000106112566e-06,
1051
+ "lower-argminmax-custom-call": 3.999999989900971e-06,
1052
+ "map-inline": 1.2000000424450263e-05,
1053
+ "metadata-naming": 1.4999999621068127e-05,
1054
+ "mlir::detail::OpToOpPassAdaptor": 1.9999999494757503e-05,
1055
+ "mlir::hlo::MhloToPyPenguin": 0.0008660000166855752,
1056
+ "mlir::mhlo::LowerComplexExtraPass": 9.100000170292333e-05,
1057
+ "mlir::mhlo::LowerComplexPass": 0.00010599999950500205,
1058
+ "native-to-custom-softmax": 6.000000212225132e-06,
1059
+ "native-to-custom-softmax-dx": 2.300000051036477e-05,
1060
+ "operand_upcaster": 2.300000051036477e-05,
1061
+ "post-par-pipe-begin": 1.9999999949504854e-06,
1062
+ "post-par-pipe-end": 0.0,
1063
+ "post-partition-simplification": 0.0005329999839887023,
1064
+ "replace-minimum-constant": 9.000000318337698e-06,
1065
+ "reshape-mover": 3.000000106112566e-06,
1066
+ "simplify-concat": 4.400000034365803e-05,
1067
+ "simplify-while-loops": 1.9999999949504854e-06,
1068
+ "transform-variadic-reduce": 6.0999998822808266e-05,
1069
+ "tuple-simplifier": 4.999999873689376e-06,
1070
+ "unpack-nested-aws-ntwsr": 3.999999989900971e-06,
1071
+ "unroll-while-loop": 0.0
1072
+ },
1073
+ "hilo": {
1074
+ "ArithmeticIntensity": 50.378170013427734,
1075
+ "HloMacCount": 19638517760.0,
1076
+ "Traffic": 779643968.0
1077
+ }
1078
+ }
1079
+ }
context_encoding_model/_tp0_bk1/graph.neff ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60a8e4c285a690a146d149c675038f0498f62f761e4e3893706941d7ca8af583
3
+ size 1659904
context_encoding_model/_tp0_bk1/log-neuron-cc.txt ADDED
The diff for this file is too large to render. See raw diff
 
context_encoding_model/_tp0_bk1/metaneff.pb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24459c80d98d706b0a4aca22eda28ff6c09f03a08393e76b58ee0ca668d1b851
3
+ size 1152551
context_encoding_model/_tp0_bk1/model.MODULE_2914133a46cb7b4660ab+d7af8a84.hlo_module.pb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b03debb723d63387ea26771f63729d616ac71a0dbfcb78d21d2194ff723fcbc1
3
+ size 1229637
context_encoding_model/_tp0_bk1/model.MODULE_2914133a46cb7b4660ab+d7af8a84.neff ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60a8e4c285a690a146d149c675038f0498f62f761e4e3893706941d7ca8af583
3
+ size 1659904
context_encoding_model/_tp0_bk1/neuron_config.json ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation_autoset": false,
3
+ "_name_or_path": "Qwen/Qwen3-8B",
4
+ "add_cross_attention": false,
5
+ "architectures": [
6
+ "Qwen3ForCausalLM"
7
+ ],
8
+ "attention_bias": false,
9
+ "attention_dropout": 0.0,
10
+ "attribute_map": {},
11
+ "bad_words_ids": null,
12
+ "begin_suppress_tokens": null,
13
+ "bos_token_id": 151643,
14
+ "chunk_size_feed_forward": 0,
15
+ "cross_attention_hidden_size": null,
16
+ "decoder_start_token_id": null,
17
+ "diversity_penalty": 0.0,
18
+ "do_sample": false,
19
+ "early_stopping": false,
20
+ "encoder_no_repeat_ngram_size": 0,
21
+ "eos_token_id": 151645,
22
+ "exponential_decay_length_penalty": null,
23
+ "finetuning_task": null,
24
+ "forced_bos_token_id": null,
25
+ "forced_eos_token_id": null,
26
+ "fused_spec_config": null,
27
+ "head_dim": 128,
28
+ "hidden_act": "silu",
29
+ "hidden_size": 4096,
30
+ "id2label": {
31
+ "0": "LABEL_0",
32
+ "1": "LABEL_1"
33
+ },
34
+ "initializer_range": 0.02,
35
+ "intermediate_size": 12288,
36
+ "is_decoder": false,
37
+ "is_encoder_decoder": false,
38
+ "label2id": {
39
+ "LABEL_0": 0,
40
+ "LABEL_1": 1
41
+ },
42
+ "length_penalty": 1.0,
43
+ "max_length": 20,
44
+ "max_position_embeddings": 40960,
45
+ "max_window_layers": 36,
46
+ "metadata": null,
47
+ "min_length": 0,
48
+ "model_type": "qwen3",
49
+ "neuron_config": {
50
+ "activation_quantization_type": null,
51
+ "allow_input_truncation": false,
52
+ "apply_seq_ids_mask": false,
53
+ "async_mode": false,
54
+ "attention_dp_degree": 1,
55
+ "attention_dtype": null,
56
+ "attn_block_cte_nki_kernel_enabled": false,
57
+ "attn_block_tkg_nki_kernel_cache_update": false,
58
+ "attn_block_tkg_nki_kernel_enabled": false,
59
+ "attn_cls": {
60
+ "__module__": "neuronx_distributed_inference.models.qwen3.modeling_qwen3",
61
+ "__name__": "NeuronQwen3Attention"
62
+ },
63
+ "attn_kernel_enabled": null,
64
+ "attn_tkg_builtin_kernel_enabled": false,
65
+ "attn_tkg_nki_kernel_enabled": false,
66
+ "batch_size": 1,
67
+ "bucket_n_active_tokens": true,
68
+ "buckets": [
69
+ 256
70
+ ],
71
+ "cast_type": "config",
72
+ "cc_pipeline_tiling_factor": 2,
73
+ "chunked_prefill_config": null,
74
+ "context_encoding_buckets": [
75
+ 256
76
+ ],
77
+ "cp_degree": 1,
78
+ "ctx_batch_size": 1,
79
+ "disable_kv_cache_tiling": false,
80
+ "draft_model_modules_to_not_convert": null,
81
+ "enable_bucketing": true,
82
+ "enable_eagle_draft_input_norm": false,
83
+ "enable_eagle_speculation": false,
84
+ "enable_fused_speculation": false,
85
+ "enable_long_context_mode": false,
86
+ "enable_output_completion_notifications": false,
87
+ "enable_spill_reload_dge": false,
88
+ "enable_token_tree": false,
89
+ "ep_degree": 1,
90
+ "expert_mlp_nki_kernel_enabled": null,
91
+ "flash_decoding_enabled": false,
92
+ "fused_qkv": false,
93
+ "fused_rmsnorm_skip_gamma": false,
94
+ "is_block_kv_layout": null,
95
+ "is_chunked_prefill": false,
96
+ "is_continuous_batching": true,
97
+ "is_eagle_draft": false,
98
+ "is_medusa": false,
99
+ "is_prefill_stage": true,
100
+ "is_prefix_caching": false,
101
+ "k_cache_transposed": false,
102
+ "kv_cache_batch_size": 1,
103
+ "kv_cache_padding_size": 0,
104
+ "kv_cache_quant": false,
105
+ "kv_cache_tiling": false,
106
+ "layer_boundary_markers": false,
107
+ "lm_head_pad": false,
108
+ "lm_head_pad_alignment_size": 1,
109
+ "local_ranks_size": 2,
110
+ "logical_nc_config": 1,
111
+ "lora_config": null,
112
+ "max_batch_size": 1,
113
+ "max_context_length": 1024,
114
+ "max_length": 1024,
115
+ "max_new_tokens": null,
116
+ "medusa_speculation_length": 0,
117
+ "medusa_tree": null,
118
+ "mlp_kernel_enabled": false,
119
+ "mlp_kernel_fuse_residual_add": false,
120
+ "modules_to_not_convert": null,
121
+ "moe_fused_nki_kernel_enabled": null,
122
+ "n_active_tokens": 1024,
123
+ "n_positions": 1024,
124
+ "num_medusa_heads": 0,
125
+ "on_cpu": false,
126
+ "on_device_sampling_config": {
127
+ "deterministic": false,
128
+ "do_sample": false,
129
+ "dynamic": true,
130
+ "global_topk": 256,
131
+ "on_device_sampling_config": true,
132
+ "temperature": 1.0,
133
+ "top_k": 1,
134
+ "top_k_kernel_enabled": false,
135
+ "top_p": 1.0
136
+ },
137
+ "output_logits": false,
138
+ "overrides_torch_dtype": true,
139
+ "pa_block_size": 1024,
140
+ "pa_num_blocks": 1,
141
+ "padding_side": "right",
142
+ "pp_degree": 1,
143
+ "prefix_buckets": null,
144
+ "qk_layernorm": false,
145
+ "qkv_kernel_enabled": false,
146
+ "qkv_kernel_fuse_residual_add": false,
147
+ "qkv_kernel_nbsd_layout": false,
148
+ "quantization_dtype": "int8",
149
+ "quantization_type": "per_tensor_symmetric",
150
+ "quantize_clamp_bound": Infinity,
151
+ "quantized": false,
152
+ "quantized_checkpoints_path": null,
153
+ "quantized_mlp_kernel_enabled": false,
154
+ "rmsnorm_quantize_kernel_enabled": false,
155
+ "router_topk_nki_kernel_enabled": null,
156
+ "rpl_reduce_dtype": null,
157
+ "save_sharded_checkpoint": true,
158
+ "scratchpad_page_size": null,
159
+ "seq_len": 1024,
160
+ "seq_len_threshold_for_cc_tiling": 16384,
161
+ "sequence_parallel_enabled": false,
162
+ "shared_mlp_nki_kernel_enabled": null,
163
+ "skip_sharding": false,
164
+ "skip_warmup": false,
165
+ "spec_batch_size": 1,
166
+ "speculation_length": 0,
167
+ "start_rank_id": 0,
168
+ "target": null,
169
+ "tile_cc": false,
170
+ "tkg_batch_size": 1,
171
+ "token_generation_buckets": null,
172
+ "token_tree_config": null,
173
+ "torch_dtype": "bfloat16",
174
+ "tp_degree": 2,
175
+ "vocab_parallel": false,
176
+ "weight_gather_seq_len_threshold": 32768,
177
+ "weights_to_skip_layout_optimization": [],
178
+ "world_size": 2
179
+ },
180
+ "no_repeat_ngram_size": 0,
181
+ "num_attention_heads": 32,
182
+ "num_beam_groups": 1,
183
+ "num_beams": 1,
184
+ "num_cores_per_group": 1,
185
+ "num_hidden_layers": 36,
186
+ "num_key_value_heads": 8,
187
+ "num_return_sequences": 1,
188
+ "output_attentions": false,
189
+ "output_hidden_states": false,
190
+ "output_scores": false,
191
+ "pad_token_id": 0,
192
+ "prefix": null,
193
+ "problem_type": null,
194
+ "pruned_heads": {},
195
+ "remove_invalid_values": false,
196
+ "repetition_penalty": 1.0,
197
+ "return_dict": true,
198
+ "return_dict_in_generate": false,
199
+ "rms_norm_eps": 1e-06,
200
+ "rope_scaling": null,
201
+ "rope_theta": 1000000,
202
+ "sep_token_id": null,
203
+ "sliding_window": null,
204
+ "suppress_tokens": null,
205
+ "task_specific_params": null,
206
+ "temperature": 1.0,
207
+ "tf_legacy_loss": false,
208
+ "tie_encoder_decoder": false,
209
+ "tie_word_embeddings": false,
210
+ "tokenizer_class": null,
211
+ "top_k": 50,
212
+ "top_p": 1.0,
213
+ "torchscript": false,
214
+ "transformers_version": "4.51.0",
215
+ "typical_p": 1.0,
216
+ "use_bfloat16": false,
217
+ "use_cache": true,
218
+ "use_sliding_window": false,
219
+ "vocab_size": 151936
220
+ }
context_encoding_model/_tp0_bk2/command.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ neuronx-cc compile --framework=XLA model.MODULE_00594b8bc68e927f3dbe+1ad60ced.hlo_module.pb --output model.MODULE_00594b8bc68e927f3dbe+1ad60ced.neff --target=trn1 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ' --lnc=1 -O1 '--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true' --logfile=log-neuron-cc.txt --verbose=35
context_encoding_model/_tp0_bk2/compile_flags.MODULE_00594b8bc68e927f3dbe+1ad60ced.json ADDED
@@ -0,0 +1 @@
 
 
1
+ ["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "--lnc=1", "-O1", "--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true", "--logfile=/home/ubuntu/qwen3/context_encoding_model/_tp0_bk2/log-neuron-cc.txt"]
context_encoding_model/_tp0_bk2/global_metric_store.json ADDED
@@ -0,0 +1,1079 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Average": {
3
+ "tensorizer": {
4
+ "StaticProfiler::AverageFractalPeUtilization": 99.66542053222656,
5
+ "StaticProfiler::AveragePartitionUtilization": 97.7269515991211,
6
+ "StaticProfiler::AveragePeUtilization": 98.64861297607422,
7
+ "StaticProfiler::LocalizationEfficiency": 98.26979064941406,
8
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 101.01405334472656,
9
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
10
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0
11
+ }
12
+ },
13
+ "Count": {
14
+ "tensorizer": {
15
+ "StaticProfiler::AverageFractalPeUtilization": 1.0,
16
+ "StaticProfiler::AveragePartitionUtilization": 1.0,
17
+ "StaticProfiler::AveragePeUtilization": 1.0,
18
+ "StaticProfiler::LocalizationEfficiency": 1.0,
19
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1.0,
20
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 1.0,
21
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 1.0
22
+ }
23
+ },
24
+ "Sum": {
25
+ "compiletime": {
26
+ "AGOrderingAnalysisPass": 0.018257856369018555,
27
+ "AffinePredicateResolution": 0.0011677742004394531,
28
+ "AliasDependencyElimination": 0.0001201629638671875,
29
+ "AliasDependencyInduction": 0.0052988529205322266,
30
+ "AliasDependencyReset": 0.029210567474365234,
31
+ "BFComputeCutting": 0.0032625198364257813,
32
+ "BirCodeGenLoop": 0.4527714252471924,
33
+ "CCOpFusion": 0.02410125732421875,
34
+ "CanonicalizeConv": 0.00029399999766610563,
35
+ "CanonicalizeDAGForPGTiling": 0.004324913024902344,
36
+ "CanonicalizeForTensorizer": 4.8000001697801054e-05,
37
+ "CanonicalizeIR": 0.0019502639770507813,
38
+ "Canonicalizer": 0.0010809999657794833,
39
+ "CoalesceCCOp": 0.014672040939331055,
40
+ "CommuteConcat": 0.0008339881896972656,
41
+ "DMALocalityOpt": 0.005767107009887695,
42
+ "DMAProfiler": 0.012850046157836914,
43
+ "DMATilingProfiler": 0.004332065582275391,
44
+ "DataLocalityOpt": 0.07260942459106445,
45
+ "DataStreaming": 0.03969836235046387,
46
+ "DeConcat": 0.0005326271057128906,
47
+ "DeadCodeElimination": 0.0009255409240722656,
48
+ "DeadStoreElimination": 0.0055675506591796875,
49
+ "DelinearIndices": 0.004735231399536133,
50
+ "Delinearization": 0.0030374526977539063,
51
+ "DoNothing": 0.00018930435180664063,
52
+ "DramToDramTranspose": 0.018135547637939453,
53
+ "DumpGraphAndMetadata": 0.09476375579833984,
54
+ "EliminateDivs": 0.002595663070678711,
55
+ "ExpandBatchNorm": 0.002063274383544922,
56
+ "ExpandISAMacro": 0.011973381042480469,
57
+ "FactorizeBlkDims": 0.009292840957641602,
58
+ "FactorizeThreadAxesInFreeDims": 0.0010046958923339844,
59
+ "FlattenMacroLoop": 0.002232074737548828,
60
+ "GenericAccessSimplifier": 0.0018167495727539063,
61
+ "HoistCompute": 7.999999979801942e-06,
62
+ "IdentifyCrossPassTensors": 7.79999973019585e-05,
63
+ "InferInitValue": 0.024865150451660156,
64
+ "InferIntrinsicOnCC": 0.009101152420043945,
65
+ "InferNeuronTensor": 0.023293495178222656,
66
+ "InferNonlocalTensors": 0.01632833480834961,
67
+ "InferPSumTensor": 0.27726316452026367,
68
+ "InlineNativeKernels": 0.0081634521484375,
69
+ "InsertIOTransposes": 0.019203901290893555,
70
+ "InsertLocalTransposes": 0.0042340755462646484,
71
+ "InsertOffloadedTransposes": 0.002811431884765625,
72
+ "LICM": 0.0029730796813964844,
73
+ "LateLegalizeInst": 0.014307022094726563,
74
+ "LateLegalizePostSplit": 0.012536048889160156,
75
+ "LateLowerReshapeOp": 0.0018641948699951172,
76
+ "LateLowerTensorOp": 0.0014081001281738281,
77
+ "LateNeuronInstComb": 0.00915217399597168,
78
+ "LayoutPreprocessing": 0.02658390998840332,
79
+ "LayoutPreprocessingAndAnalysis": 0.10707235336303711,
80
+ "LayoutRequirementAnalysis": 0.005135536193847656,
81
+ "LegalizeCCOpLayout": 0.002307415008544922,
82
+ "LegalizeOpLevelAlias": 0.0012297630310058594,
83
+ "LegalizePartitionReduce": 0.0010194778442382813,
84
+ "LegalizeSundaAccess": 0.07808256149291992,
85
+ "LegalizeSundaMacro": 0.010968446731567383,
86
+ "LegalizeType": 0.012074947357177734,
87
+ "LocalLayoutOpt": 0.013799905776977539,
88
+ "LoopFusion": 0.0052182674407958984,
89
+ "LoopSplitting": 0.0003161430358886719,
90
+ "LowerBroadcast": 0.0015821456909179688,
91
+ "LowerCCOpBlockAxis": 0.0040547847747802734,
92
+ "LowerComplexBroadcast": 0.002165079116821289,
93
+ "LowerIntrinsics": 0.31156492233276367,
94
+ "LowerTensorOp": 0.010558843612670898,
95
+ "LowerTranspose": 0.012494325637817383,
96
+ "MacroGeneration": 0.029862642288208008,
97
+ "MaskPropagation": 0.002757549285888672,
98
+ "MemcastMotion": 3.400000059627928e-05,
99
+ "MemcpyElimination": 0.025969266891479492,
100
+ "MutateDataType": 0.002087831497192383,
101
+ "NeuronAliasDependencyInduction": 0.00016880035400390625,
102
+ "NeuronAliasDependencyReset": 0.020352602005004883,
103
+ "NeuronInstComb": 0.004656076431274414,
104
+ "NeuronLICM": 0.03560137748718262,
105
+ "NeuronLoopFusion": 0.007991313934326172,
106
+ "NeuronLoopInterchange": 0.002409219741821289,
107
+ "NeuronSimplifier": 0.007069587707519531,
108
+ "NeuronSimplifyPredicates": 0.12419009208679199,
109
+ "NeuronValueNumbering": 0.0032753944396972656,
110
+ "OptimizeAliasedCopyChain": 0.0005936622619628906,
111
+ "OptimizeNKIKernels": 0.5374257564544678,
112
+ "PAGLayoutOpt": 0.08115577697753906,
113
+ "PComputeCutting": 0.004801273345947266,
114
+ "PGLayoutTilingPipeline": 0.5454635620117188,
115
+ "PGTiling": 0.14933419227600098,
116
+ "PadElimination": 0.00034046173095703125,
117
+ "ParAxesAnnotation": 0.053552865982055664,
118
+ "PartialLoopFusion": 0.0067539215087890625,
119
+ "PartialSimdFusion": 0.00693058967590332,
120
+ "PenguinizeFunctions": 4.5000000682193786e-05,
121
+ "PerfectLoopNest": 0.0035321712493896484,
122
+ "PruneFunctions": 5.199999941396527e-05,
123
+ "RecognizeOpIdiom": 0.003947257995605469,
124
+ "Recompute": 0.00024962425231933594,
125
+ "RelaxPredicates": 0.013285398483276367,
126
+ "Rematerialization": 0.002062082290649414,
127
+ "RemoveOptimizationBarriers": 8.70000003487803e-05,
128
+ "ReshapeWeights": 0.002131223678588867,
129
+ "ResolveAccessConflict": 0.0038597583770751953,
130
+ "ResolveComplicatePredicates": 0.002032756805419922,
131
+ "RewriteReplicationMatmul": 0.001924753189086914,
132
+ "RewriteWeights": 0.002452373504638672,
133
+ "SFKVectorizer": 0.2718319892883301,
134
+ "ScatterMotion": 3.7999998312443495e-05,
135
+ "SimpleAllReduceTiling": 0.008960247039794922,
136
+ "Simplifier": 0.004038810729980469,
137
+ "SimplifyMacroPredicates": 0.010622739791870117,
138
+ "SimplifyNeuronTensor": 1.0594146251678467,
139
+ "SimplifySlice": 0.0009577274322509766,
140
+ "SimplifyTensor": 0.005341768264770508,
141
+ "SpillPSum": 0.012076139450073242,
142
+ "SplitAPUnionSets": 0.10771751403808594,
143
+ "SplitAccGrp": 0.002201557159423828,
144
+ "StaticProfiler": 0.012447118759155273,
145
+ "StaticTransposeLocalTensor": 0.0038712024688720703,
146
+ "SundaISel": 0.04214668273925781,
147
+ "TCTransform": 0.0008432865142822266,
148
+ "TensorInitialization": 0.012825727462768555,
149
+ "TensorOpSimplifier": 0.004651308059692383,
150
+ "TensorOpTransform": 0.019537687301635742,
151
+ "TensorizerLegalizationPass": 5.7999997807201e-05,
152
+ "TileCCOps": 0.006766319274902344,
153
+ "TilingProfiler": 0.006911277770996094,
154
+ "TransformConvOp": 0.0030303001403808594,
155
+ "TritiumFusion": 0.04502224922180176,
156
+ "ValueNumbering": 0.001996755599975586,
157
+ "VectorizeDMA": 0.0019402503967285156,
158
+ "VectorizeMatMult": 0.0027413368225097656,
159
+ "VerifySupportedOps": 3.7000001611886546e-05,
160
+ "WeightCoalescing": 0.008520841598510742,
161
+ "ZeroSizeTensorElimination": 0.00013709068298339844,
162
+ "algsimp": 0.0026940000243484974,
163
+ "batchnorm_expander": 4.400000034365803e-05,
164
+ "boundary-marker-removal": 1.5999999959603883e-05,
165
+ "call-inliner": 0.00046999999904073775,
166
+ "canonicalize-boundary-marker": 1.8999999156221747e-05,
167
+ "collective-stream-id-checker": 7.300000288523734e-05,
168
+ "comparison-expander": 0.0005740000051446259,
169
+ "computation-deduplicator": 7.999999797903001e-05,
170
+ "conditional-to-select": 1.8000000636675395e-05,
171
+ "config-lowering": 0.0003279999946244061,
172
+ "constant-statistics": 0.0005329999839887023,
173
+ "constant_folding": 0.0003260000084992498,
174
+ "cse": 4.5000000682193786e-05,
175
+ "dce": 8.399999933317304e-05,
176
+ "dot_decomposer": 0.0013409999664872885,
177
+ "dynamic-slice-transpose": 1.3999999282532372e-05,
178
+ "eliminate-redundant-compare": 0.0002959999837912619,
179
+ "emit-offloaded-dropout": 6.399999983841553e-05,
180
+ "flatten-call-graph": 0.0009319999953731894,
181
+ "fuse-send-recv": 6.999999459367245e-05,
182
+ "hilo::LegalizeAlias": 1.3999999282532372e-05,
183
+ "hilo::NeuronInstCombine": 0.0001660000125411898,
184
+ "hilo::NeuronOpFusion": 2.5000001187436283e-05,
185
+ "hilo::ReplaceTokenTypeWithU8Pass": 5.2999999752501026e-05,
186
+ "hilo::ScheduleFusion": 7.000000096013537e-06,
187
+ "hilo::SixtyFourHack": 7.299999560927972e-05,
188
+ "hilo::VerifyAliasing": 6.000000212225132e-06,
189
+ "hlo-mac-count": 0.0013429999817162752,
190
+ "hlo-verifier": 0.007542999926954508,
191
+ "instruction-histogram": 0.0006709999870508909,
192
+ "io-con-pipe-begin": 4.999999873689376e-06,
193
+ "io-con-pipe-end": 9.999999974752427e-07,
194
+ "io-layout-normalization": 0.001310999970883131,
195
+ "io-statistics": 8.499999967170879e-05,
196
+ "legalize-ccops": 3.999999989900971e-06,
197
+ "legalize-compare": 1.2999999853491317e-05,
198
+ "lower-argminmax-custom-call": 1.300000076298602e-05,
199
+ "map-inline": 0.0008850000449456275,
200
+ "metadata-naming": 5.999999848427251e-05,
201
+ "mlir::detail::OpToOpPassAdaptor": 0.00014399999054148793,
202
+ "mlir::hlo::MhloToPyPenguin": 0.004429999738931656,
203
+ "mlir::mhlo::LowerComplexExtraPass": 0.00027299998328089714,
204
+ "mlir::mhlo::LowerComplexPass": 0.0004909999552182853,
205
+ "native-to-custom-softmax": 0.0007070000283420086,
206
+ "native-to-custom-softmax-dx": 0.0005990000208839774,
207
+ "operand_upcaster": 4.900000203633681e-05,
208
+ "opt-barrier-removal": 0.0005510000046342611,
209
+ "post-par-pipe-begin": 8.999999408842996e-06,
210
+ "post-par-pipe-end": 0.0,
211
+ "post-partition-simplification": 0.0018570000538602471,
212
+ "pre-par-pipe-begin": 9.999999974752427e-07,
213
+ "pre-par-pipe-end": 0.0,
214
+ "pre-partition-simplification": 0.12893199920654297,
215
+ "replace-minimum-constant": 0.0004569999873638153,
216
+ "reshape-mover": 0.00012599999899975955,
217
+ "simplify-concat": 0.00015899998834356666,
218
+ "simplify-while-loops": 0.00010400000610388815,
219
+ "transform-variadic-reduce": 7.000000186963007e-05,
220
+ "tuple-simplifier": 0.0003150000120513141,
221
+ "unpack-nested-aws-ntwsr": 0.0004349999944679439,
222
+ "unroll-while-loop": 2.099999983329326e-05,
223
+ "zero_sized_hlo_elimination": 0.0008670000243000686
224
+ },
225
+ "hilo": {
226
+ "ConstantSize": 1189157.0,
227
+ "HloInputCount": 475.0,
228
+ "HloMacCount": 101242896384.0,
229
+ "HloOutputCount": 73.0,
230
+ "IfmapSize": 8266545152.0,
231
+ "OfmapSize": 75497472.0,
232
+ "OutputsReadFromCount": 0.0,
233
+ "PassthroughTensorsCount": 0.0,
234
+ "RedundantOutputCount": 0.0,
235
+ "Traffic": 1692493184.0
236
+ },
237
+ "tensorizer": {
238
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 44382.0,
239
+ "StaticProfiler::AifUb": 205.154296875,
240
+ "StaticProfiler::ArithmeticIntensityTensorizer": 201.6046905517578,
241
+ "StaticProfiler::AverageDmaLength": 1901.806396484375,
242
+ "StaticProfiler::DDRTransferBytes": 795531072.0,
243
+ "StaticProfiler::InternalTransferBytes": 646388224.0,
244
+ "StaticProfiler::LoadExpanded": 376342.0,
245
+ "StaticProfiler::StoreExpanded": 4189.0,
246
+ "StaticProfiler::TotalDMAExpanded": 380531.0,
247
+ "StaticProfiler::TotalDynamicInstancesCount": 53882.0,
248
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 53436.0,
249
+ "StaticProfiler::TotalLNCComm": 0.0,
250
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
251
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
252
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
253
+ "TilingProfiler::GenericInstructionsAfterTiling": 4.0,
254
+ "TilingProfiler::MatMultInstructionsAfterTiling": 23616.0,
255
+ "TilingProfiler::NumPfTransposes": 5.0,
256
+ "TilingProfiler::NumPfTransposesForIo": 1.0,
257
+ "TilingProfiler::NumPfTransposesForLocal": 1.0,
258
+ "TilingProfiler::NumPfTransposesForNonlocal": 3.0,
259
+ "TilingProfiler::PfTransposeInstructions": 19393.0,
260
+ "TilingProfiler::PfTransposeInstructionsForIo": 19008.0,
261
+ "TilingProfiler::PfTransposeInstructionsForLocal": 1.0,
262
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 384.0,
263
+ "TilingProfiler::ReduceInstructionsAfterTiling": 4.0,
264
+ "TilingProfiler::SimdInstructionsAfterTiling": 158.0,
265
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
266
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
267
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
268
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
269
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
270
+ "TransformConvOp::conv2d_column_packing": 0.0,
271
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
272
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
273
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
274
+ }
275
+ },
276
+ "all": {
277
+ "compiletime": {
278
+ "algsimp": 0.002466999925673008,
279
+ "call-inliner": 0.0004360000020824373,
280
+ "collective-stream-id-checker": 6.299999949987978e-05,
281
+ "comparison-expander": 0.0005569999921135604,
282
+ "constant-statistics": 0.0005329999839887023,
283
+ "constant_folding": 0.0002969999914057553,
284
+ "dce": 7.999999797903001e-05,
285
+ "dot_decomposer": 0.0013409999664872885,
286
+ "eliminate-redundant-compare": 0.00028199999360367656,
287
+ "flatten-call-graph": 0.0008999999845400453,
288
+ "hlo-mac-count": 0.0010720000136643648,
289
+ "hlo-verifier": 0.0069679999724030495,
290
+ "instruction-histogram": 0.0006709999870508909,
291
+ "io-con-pipe-begin": 4.999999873689376e-06,
292
+ "io-con-pipe-end": 9.999999974752427e-07,
293
+ "io-layout-normalization": 0.001310999970883131,
294
+ "io-statistics": 8.499999967170879e-05,
295
+ "map-inline": 0.0008440000237897038,
296
+ "native-to-custom-softmax": 0.0006750000175088644,
297
+ "native-to-custom-softmax-dx": 0.0005000000237487257,
298
+ "opt-barrier-removal": 0.0005510000046342611,
299
+ "pre-par-pipe-begin": 9.999999974752427e-07,
300
+ "pre-par-pipe-end": 0.0,
301
+ "pre-partition-simplification": 0.12893199920654297,
302
+ "replace-minimum-constant": 0.0004309999931138009,
303
+ "reshape-mover": 0.00011500000255182385,
304
+ "simplify-while-loops": 9.600000339560211e-05,
305
+ "tuple-simplifier": 0.0002969999914057553,
306
+ "unpack-nested-aws-ntwsr": 0.00042100000428035855,
307
+ "unroll-while-loop": 1.9999999494757503e-05,
308
+ "zero_sized_hlo_elimination": 0.0008670000243000686
309
+ }
310
+ },
311
+ "cumsum": {
312
+ "compiletime": {
313
+ "CoalesceCCOp": 0.00020885467529296875,
314
+ "DMALocalityOpt": 0.00016832351684570313,
315
+ "DMAProfiler": 0.0007588863372802734,
316
+ "DataStreaming": 0.00029587745666503906,
317
+ "DoNothing": 0.00011897087097167969,
318
+ "ExpandISAMacro": 0.0005011558532714844,
319
+ "FactorizeBlkDims": 0.00043463706970214844,
320
+ "InferPSumTensor": 0.00044608116149902344,
321
+ "LateLegalizeInst": 0.0004031658172607422,
322
+ "LateNeuronInstComb": 0.0005033016204833984,
323
+ "LegalizeSundaAccess": 0.0021431446075439453,
324
+ "LegalizeType": 0.00024056434631347656,
325
+ "LowerBroadcast": 0.00022101402282714844,
326
+ "LowerIntrinsics": 0.00023508071899414063,
327
+ "LowerTranspose": 0.0002219676971435547,
328
+ "NeuronInstComb": 0.0005297660827636719,
329
+ "NeuronLICM": 0.00041484832763671875,
330
+ "NeuronSimplifyPredicates": 0.0028023719787597656,
331
+ "NeuronValueNumbering": 0.00043582916259765625,
332
+ "SFKVectorizer": 0.002759695053100586,
333
+ "SimpleAllReduceTiling": 0.00020432472229003906,
334
+ "SimplifyNeuronTensor": 0.0004029273986816406,
335
+ "SpillPSum": 0.0005388259887695313,
336
+ "WeightCoalescing": 0.0002307891845703125
337
+ }
338
+ },
339
+ "sg00": {
340
+ "compiletime": {
341
+ "CanonicalizeConv": 2.300000051036477e-05,
342
+ "CanonicalizeForTensorizer": 2.300000051036477e-05,
343
+ "Canonicalizer": 0.0005249999812804163,
344
+ "HoistCompute": 3.000000106112566e-06,
345
+ "IdentifyCrossPassTensors": 3.099999958067201e-05,
346
+ "MemcastMotion": 9.999999747378752e-06,
347
+ "PenguinizeFunctions": 2.2000000171829015e-05,
348
+ "PruneFunctions": 1.2999999853491317e-05,
349
+ "RemoveOptimizationBarriers": 4.400000034365803e-05,
350
+ "ScatterMotion": 6.000000212225132e-06,
351
+ "TensorizerLegalizationPass": 3.600000127335079e-05,
352
+ "VerifySupportedOps": 1.700000029813964e-05,
353
+ "algsimp": 0.0001049999991664663,
354
+ "batchnorm_expander": 1.8999999156221747e-05,
355
+ "boundary-marker-removal": 7.000000096013537e-06,
356
+ "call-inliner": 1.4000000192027073e-05,
357
+ "canonicalize-boundary-marker": 7.999999979801942e-06,
358
+ "collective-stream-id-checker": 3.999999989900971e-06,
359
+ "comparison-expander": 7.000000096013537e-06,
360
+ "computation-deduplicator": 2.099999983329326e-05,
361
+ "conditional-to-select": 7.000000096013537e-06,
362
+ "config-lowering": 0.00027600000612437725,
363
+ "constant_folding": 1.2000000424450263e-05,
364
+ "cse": 2.2000000171829015e-05,
365
+ "dce": 1.9999999949504854e-06,
366
+ "dynamic-slice-transpose": 6.000000212225132e-06,
367
+ "eliminate-redundant-compare": 6.000000212225132e-06,
368
+ "emit-offloaded-dropout": 3.7999998312443495e-05,
369
+ "flatten-call-graph": 1.2999999853491317e-05,
370
+ "fuse-send-recv": 3.099999958067201e-05,
371
+ "hilo::LegalizeAlias": 7.000000096013537e-06,
372
+ "hilo::NeuronInstCombine": 6.299999949987978e-05,
373
+ "hilo::NeuronOpFusion": 6.000000212225132e-06,
374
+ "hilo::ReplaceTokenTypeWithU8Pass": 2.300000051036477e-05,
375
+ "hilo::ScheduleFusion": 1.9999999949504854e-06,
376
+ "hilo::SixtyFourHack": 2.099999983329326e-05,
377
+ "hilo::VerifyAliasing": 3.000000106112566e-06,
378
+ "hlo-mac-count": 7.300000288523734e-05,
379
+ "hlo-verifier": 0.00023600000713486224,
380
+ "legalize-ccops": 1.9999999949504854e-06,
381
+ "legalize-compare": 6.000000212225132e-06,
382
+ "lower-argminmax-custom-call": 6.000000212225132e-06,
383
+ "map-inline": 1.700000029813964e-05,
384
+ "metadata-naming": 2.499999936844688e-05,
385
+ "mlir::detail::OpToOpPassAdaptor": 2.2000000171829015e-05,
386
+ "mlir::hlo::MhloToPyPenguin": 0.002633000025525689,
387
+ "mlir::mhlo::LowerComplexExtraPass": 0.0001049999991664663,
388
+ "mlir::mhlo::LowerComplexPass": 0.00017299999308306724,
389
+ "native-to-custom-softmax": 2.099999983329326e-05,
390
+ "native-to-custom-softmax-dx": 6.600000051548705e-05,
391
+ "operand_upcaster": 2.2000000171829015e-05,
392
+ "post-par-pipe-begin": 4.999999873689376e-06,
393
+ "post-par-pipe-end": 0.0,
394
+ "post-partition-simplification": 0.0008430000161752105,
395
+ "replace-minimum-constant": 1.1000000085914508e-05,
396
+ "reshape-mover": 4.999999873689376e-06,
397
+ "simplify-concat": 6.70000008540228e-05,
398
+ "simplify-while-loops": 3.999999989900971e-06,
399
+ "transform-variadic-reduce": 1.2999999853491317e-05,
400
+ "tuple-simplifier": 7.999999979801942e-06,
401
+ "unpack-nested-aws-ntwsr": 6.000000212225132e-06,
402
+ "unroll-while-loop": 9.999999974752427e-07
403
+ },
404
+ "hilo": {
405
+ "ArithmeticIntensity": 34.445003509521484,
406
+ "ConstantSize": 1189157.0,
407
+ "HloInputCount": 475.0,
408
+ "HloMacCount": 11811160064.0,
409
+ "HloOutputCount": 73.0,
410
+ "IfmapSize": 8266545152.0,
411
+ "OfmapSize": 75497472.0,
412
+ "OutputsReadFromCount": 0.0,
413
+ "PassthroughTensorsCount": 0.0,
414
+ "RedundantOutputCount": 0.0,
415
+ "Traffic": 685798208.0
416
+ }
417
+ },
418
+ "sg0000": {
419
+ "compiletime": {
420
+ "AGOrderingAnalysisPass": 0.07801461219787598,
421
+ "AffinePredicateResolution": 0.0017647743225097656,
422
+ "AliasDependencyElimination": 0.0001277923583984375,
423
+ "AliasDependencyInduction": 0.00855708122253418,
424
+ "AliasDependencyReset": 0.08457040786743164,
425
+ "BFComputeCutting": 0.003294229507446289,
426
+ "BirCodeGenLoop": 0.05274701118469238,
427
+ "CCOpFusion": 0.030017614364624023,
428
+ "CanonicalizeDAGForPGTiling": 0.003341197967529297,
429
+ "CanonicalizeIR": 0.0022792816162109375,
430
+ "CoalesceCCOp": 0.0053555965423583984,
431
+ "CommuteConcat": 0.0023560523986816406,
432
+ "DMALocalityOpt": 0.0013885498046875,
433
+ "DMAProfiler": 0.00625157356262207,
434
+ "DMATilingProfiler": 0.003763914108276367,
435
+ "DataLocalityOpt": 0.09786868095397949,
436
+ "DataStreaming": 0.004992246627807617,
437
+ "DeConcat": 0.002264261245727539,
438
+ "DeadCodeElimination": 0.002042531967163086,
439
+ "DeadStoreElimination": 0.030755043029785156,
440
+ "DelinearIndices": 0.009100914001464844,
441
+ "Delinearization": 0.004424571990966797,
442
+ "DoNothing": 6.914138793945313e-05,
443
+ "DramToDramTranspose": 0.03130936622619629,
444
+ "DumpGraphAndMetadata": 0.005283832550048828,
445
+ "EliminateDivs": 0.0042150020599365234,
446
+ "ExpandBatchNorm": 0.0019366741180419922,
447
+ "ExpandISAMacro": 0.002724170684814453,
448
+ "FactorizeBlkDims": 0.011873722076416016,
449
+ "FactorizeThreadAxesInFreeDims": 0.002283811569213867,
450
+ "FlattenMacroLoop": 0.0031974315643310547,
451
+ "GenericAccessSimplifier": 0.002216339111328125,
452
+ "InferInitValue": 0.030458927154541016,
453
+ "InferIntrinsicOnCC": 0.011402368545532227,
454
+ "InferNeuronTensor": 0.04513859748840332,
455
+ "InferNonlocalTensors": 0.10613727569580078,
456
+ "InferPSumTensor": 0.037427663803100586,
457
+ "InlineNativeKernels": 0.00368499755859375,
458
+ "InsertIOTransposes": 0.012629508972167969,
459
+ "InsertLocalTransposes": 0.007400989532470703,
460
+ "InsertOffloadedTransposes": 0.0025758743286132813,
461
+ "LICM": 0.0031554698944091797,
462
+ "LateLegalizeInst": 0.005858182907104492,
463
+ "LateLegalizePostSplit": 0.0029172897338867188,
464
+ "LateLowerReshapeOp": 0.0018696784973144531,
465
+ "LateLowerTensorOp": 0.004997968673706055,
466
+ "LateNeuronInstComb": 0.019808530807495117,
467
+ "LayoutPreprocessing": 0.04119300842285156,
468
+ "LayoutPreprocessingAndAnalysis": 0.10642147064208984,
469
+ "LayoutRequirementAnalysis": 0.0070705413818359375,
470
+ "LegalizeCCOpLayout": 0.004191398620605469,
471
+ "LegalizeOpLevelAlias": 0.0015521049499511719,
472
+ "LegalizePartitionReduce": 0.002257108688354492,
473
+ "LegalizeSundaAccess": 0.03900027275085449,
474
+ "LegalizeSundaMacro": 0.010483741760253906,
475
+ "LegalizeType": 0.0038602352142333984,
476
+ "LocalLayoutOpt": 0.01764845848083496,
477
+ "LoopFusion": 0.006066322326660156,
478
+ "LoopSplitting": 0.0015685558319091797,
479
+ "LowerBroadcast": 0.0020384788513183594,
480
+ "LowerCCOpBlockAxis": 0.005359172821044922,
481
+ "LowerComplexBroadcast": 0.0019440650939941406,
482
+ "LowerIntrinsics": 0.030491113662719727,
483
+ "LowerTensorOp": 0.012917041778564453,
484
+ "LowerTranspose": 0.010635852813720703,
485
+ "MacroGeneration": 0.06435012817382813,
486
+ "MaskPropagation": 0.0051097869873046875,
487
+ "MemcpyElimination": 0.11022067070007324,
488
+ "MutateDataType": 0.0014224052429199219,
489
+ "NeuronAliasDependencyInduction": 0.00023031234741210938,
490
+ "NeuronAliasDependencyReset": 0.021604061126708984,
491
+ "NeuronInstComb": 0.013072729110717773,
492
+ "NeuronLICM": 0.01006174087524414,
493
+ "NeuronLoopFusion": 0.017573833465576172,
494
+ "NeuronLoopInterchange": 0.0020608901977539063,
495
+ "NeuronSimplifier": 0.010074615478515625,
496
+ "NeuronSimplifyPredicates": 0.0060672760009765625,
497
+ "NeuronValueNumbering": 0.0041046142578125,
498
+ "OptimizeAliasedCopyChain": 0.0014190673828125,
499
+ "OptimizeNKIKernels": 0.0021109580993652344,
500
+ "PAGLayoutOpt": 0.3779466152191162,
501
+ "PComputeCutting": 0.008729696273803711,
502
+ "PGLayoutTilingPipeline": 1.5334703922271729,
503
+ "PGTiling": 0.47260475158691406,
504
+ "PadElimination": 0.0015625953674316406,
505
+ "ParAxesAnnotation": 0.2937772274017334,
506
+ "PartialLoopFusion": 0.016366004943847656,
507
+ "PartialSimdFusion": 0.01980447769165039,
508
+ "PerfectLoopNest": 0.0021877288818359375,
509
+ "RecognizeOpIdiom": 0.004831075668334961,
510
+ "Recompute": 0.00025010108947753906,
511
+ "RelaxPredicates": 0.0039484500885009766,
512
+ "Rematerialization": 0.004274129867553711,
513
+ "ReshapeWeights": 0.000804901123046875,
514
+ "ResolveAccessConflict": 0.0038733482360839844,
515
+ "ResolveComplicatePredicates": 0.0016858577728271484,
516
+ "RewriteReplicationMatmul": 0.0014014244079589844,
517
+ "RewriteWeights": 0.00405120849609375,
518
+ "SFKVectorizer": 0.20196890830993652,
519
+ "SimpleAllReduceTiling": 0.002203702926635742,
520
+ "Simplifier": 0.004297018051147461,
521
+ "SimplifyMacroPredicates": 0.01361393928527832,
522
+ "SimplifyNeuronTensor": 0.009984970092773438,
523
+ "SimplifySlice": 0.0010356903076171875,
524
+ "SimplifyTensor": 0.006205558776855469,
525
+ "SpillPSum": 0.016466140747070313,
526
+ "SplitAPUnionSets": 0.029446840286254883,
527
+ "SplitAccGrp": 0.0020453929901123047,
528
+ "StaticProfiler": 0.004591464996337891,
529
+ "StaticTransposeLocalTensor": 0.005173683166503906,
530
+ "SundaISel": 0.04554462432861328,
531
+ "TCTransform": 0.002426624298095703,
532
+ "TensorInitialization": 0.009510517120361328,
533
+ "TensorOpSimplifier": 0.0067560672760009766,
534
+ "TensorOpTransform": 0.028885841369628906,
535
+ "TileCCOps": 0.005466938018798828,
536
+ "TilingProfiler": 0.013426065444946289,
537
+ "TransformConvOp": 0.002458810806274414,
538
+ "TritiumFusion": 0.0620732307434082,
539
+ "ValueNumbering": 0.002520322799682617,
540
+ "VectorizeDMA": 0.005783796310424805,
541
+ "VectorizeMatMult": 0.005175352096557617,
542
+ "WeightCoalescing": 0.0029850006103515625,
543
+ "ZeroSizeTensorElimination": 0.00011801719665527344
544
+ },
545
+ "tensorizer": {
546
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 2597.0,
547
+ "StaticProfiler::AifUb": 40.028141021728516,
548
+ "StaticProfiler::ArithmeticIntensityTensorizer": 420.0349426269531,
549
+ "StaticProfiler::AverageDmaLength": 1921.007568359375,
550
+ "StaticProfiler::AverageFractalPeUtilization": 99.95317840576172,
551
+ "StaticProfiler::AveragePartitionUtilization": 99.87249755859375,
552
+ "StaticProfiler::AveragePeUtilization": 99.80845642089844,
553
+ "StaticProfiler::DDRTransferBytes": 64558336.0,
554
+ "StaticProfiler::InternalTransferBytes": 52297728.0,
555
+ "StaticProfiler::LoadExpanded": 23298.0,
556
+ "StaticProfiler::LocalizationEfficiency": 1049.3489990234375,
557
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1358.191162109375,
558
+ "StaticProfiler::StoreExpanded": 5505.0,
559
+ "StaticProfiler::TotalDMAExpanded": 28803.0,
560
+ "StaticProfiler::TotalDynamicInstancesCount": 3692.0,
561
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 3689.0,
562
+ "StaticProfiler::TotalLNCComm": 0.0,
563
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
564
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
565
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
566
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
567
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
568
+ "TilingProfiler::GenericInstructionsAfterTiling": 48.0,
569
+ "TilingProfiler::MatMultInstructionsAfterTiling": 1412.0,
570
+ "TilingProfiler::NumPfTransposes": 7.0,
571
+ "TilingProfiler::NumPfTransposesForIo": 1.0,
572
+ "TilingProfiler::NumPfTransposesForLocal": 5.0,
573
+ "TilingProfiler::NumPfTransposesForNonlocal": 1.0,
574
+ "TilingProfiler::PfTransposeInstructions": 608.0,
575
+ "TilingProfiler::PfTransposeInstructionsForIo": 128.0,
576
+ "TilingProfiler::PfTransposeInstructionsForLocal": 416.0,
577
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 64.0,
578
+ "TilingProfiler::ReduceInstructionsAfterTiling": 0.0,
579
+ "TilingProfiler::SimdInstructionsAfterTiling": 257.0,
580
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
581
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
582
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
583
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
584
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
585
+ "TransformConvOp::conv2d_column_packing": 0.0,
586
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
587
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
588
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
589
+ }
590
+ },
591
+ "sg0001": {
592
+ "compiletime": {
593
+ "AGOrderingAnalysisPass": 0.03313565254211426,
594
+ "AffinePredicateResolution": 0.0015239715576171875,
595
+ "AliasDependencyElimination": 0.00011467933654785156,
596
+ "AliasDependencyInduction": 0.009088993072509766,
597
+ "AliasDependencyReset": 1.062025547027588,
598
+ "BFComputeCutting": 0.0024559497833251953,
599
+ "BirCodeGenLoop": 0.03748297691345215,
600
+ "CCOpFusion": 0.04092240333557129,
601
+ "CanonicalizeDAGForPGTiling": 0.004329681396484375,
602
+ "CanonicalizeIR": 0.002464771270751953,
603
+ "CoalesceCCOp": 0.004778146743774414,
604
+ "CommuteConcat": 0.0011680126190185547,
605
+ "DMALocalityOpt": 0.0016834735870361328,
606
+ "DMAProfiler": 0.0039997100830078125,
607
+ "DMATilingProfiler": 0.004555702209472656,
608
+ "DataLocalityOpt": 0.13762187957763672,
609
+ "DataStreaming": 0.0044286251068115234,
610
+ "DeConcat": 0.0015981197357177734,
611
+ "DeadCodeElimination": 0.0020780563354492188,
612
+ "DeadStoreElimination": 0.03435230255126953,
613
+ "DelinearIndices": 0.00969839096069336,
614
+ "Delinearization": 0.0038826465606689453,
615
+ "DoNothing": 9.846687316894531e-05,
616
+ "DramToDramTranspose": 0.03438973426818848,
617
+ "DumpGraphAndMetadata": 0.00426793098449707,
618
+ "EliminateDivs": 0.004217386245727539,
619
+ "ExpandBatchNorm": 0.0019202232360839844,
620
+ "ExpandISAMacro": 0.0024042129516601563,
621
+ "FactorizeBlkDims": 0.01425933837890625,
622
+ "FactorizeThreadAxesInFreeDims": 0.0026972293853759766,
623
+ "FlattenMacroLoop": 0.002768993377685547,
624
+ "GenericAccessSimplifier": 0.001058816909790039,
625
+ "InferInitValue": 0.03559255599975586,
626
+ "InferIntrinsicOnCC": 0.009636163711547852,
627
+ "InferNeuronTensor": 0.04922318458557129,
628
+ "InferNonlocalTensors": 0.030732393264770508,
629
+ "InferPSumTensor": 0.03249359130859375,
630
+ "InlineNativeKernels": 0.0014734268188476563,
631
+ "InsertIOTransposes": 0.021765470504760742,
632
+ "InsertLocalTransposes": 0.006593465805053711,
633
+ "InsertOffloadedTransposes": 0.0034906864166259766,
634
+ "LICM": 0.003262758255004883,
635
+ "LateLegalizeInst": 0.00400543212890625,
636
+ "LateLegalizePostSplit": 0.00289154052734375,
637
+ "LateLowerReshapeOp": 0.002287149429321289,
638
+ "LateLowerTensorOp": 0.0046651363372802734,
639
+ "LateNeuronInstComb": 0.019269704818725586,
640
+ "LayoutPreprocessing": 0.03711414337158203,
641
+ "LayoutPreprocessingAndAnalysis": 0.2516040802001953,
642
+ "LayoutRequirementAnalysis": 0.007753133773803711,
643
+ "LegalizeCCOpLayout": 0.003732919692993164,
644
+ "LegalizeOpLevelAlias": 0.0016019344329833984,
645
+ "LegalizePartitionReduce": 0.0020945072174072266,
646
+ "LegalizeSundaAccess": 0.016069650650024414,
647
+ "LegalizeSundaMacro": 0.010806083679199219,
648
+ "LegalizeType": 0.004706859588623047,
649
+ "LocalLayoutOpt": 0.02442765235900879,
650
+ "LoopFusion": 0.0067822933197021484,
651
+ "LoopSplitting": 0.00033974647521972656,
652
+ "LowerBroadcast": 0.0019419193267822266,
653
+ "LowerCCOpBlockAxis": 0.005570650100708008,
654
+ "LowerComplexBroadcast": 0.0020999908447265625,
655
+ "LowerIntrinsics": 0.03607368469238281,
656
+ "LowerTensorOp": 0.011876583099365234,
657
+ "LowerTranspose": 0.011530637741088867,
658
+ "MacroGeneration": 0.10653066635131836,
659
+ "MaskPropagation": 0.003092050552368164,
660
+ "MemcpyElimination": 0.10495471954345703,
661
+ "MutateDataType": 0.0014193058013916016,
662
+ "NeuronAliasDependencyInduction": 0.0002295970916748047,
663
+ "NeuronAliasDependencyReset": 0.021070480346679688,
664
+ "NeuronInstComb": 0.012903451919555664,
665
+ "NeuronLICM": 0.00844264030456543,
666
+ "NeuronLoopFusion": 0.020880460739135742,
667
+ "NeuronLoopInterchange": 0.0021686553955078125,
668
+ "NeuronSimplifier": 0.011090755462646484,
669
+ "NeuronSimplifyPredicates": 0.0016274452209472656,
670
+ "NeuronValueNumbering": 0.004062652587890625,
671
+ "OptimizeAliasedCopyChain": 0.0014641284942626953,
672
+ "OptimizeNKIKernels": 0.0023856163024902344,
673
+ "PAGLayoutOpt": 0.17638587951660156,
674
+ "PComputeCutting": 0.00709986686706543,
675
+ "PGLayoutTilingPipeline": 1.142796516418457,
676
+ "PGTiling": 0.39766955375671387,
677
+ "PadElimination": 0.0015380382537841797,
678
+ "ParAxesAnnotation": 0.09186458587646484,
679
+ "PartialLoopFusion": 0.015995025634765625,
680
+ "PartialSimdFusion": 0.026766300201416016,
681
+ "PerfectLoopNest": 0.002192258834838867,
682
+ "RecognizeOpIdiom": 0.004943370819091797,
683
+ "Recompute": 0.00025773048400878906,
684
+ "RelaxPredicates": 0.003591299057006836,
685
+ "Rematerialization": 0.0025196075439453125,
686
+ "ReshapeWeights": 0.0007069110870361328,
687
+ "ResolveAccessConflict": 0.00481104850769043,
688
+ "ResolveComplicatePredicates": 0.002285003662109375,
689
+ "RewriteReplicationMatmul": 0.0021715164184570313,
690
+ "RewriteWeights": 0.003401041030883789,
691
+ "SFKVectorizer": 0.14661574363708496,
692
+ "SimpleAllReduceTiling": 0.0016207695007324219,
693
+ "Simplifier": 0.00443577766418457,
694
+ "SimplifyMacroPredicates": 0.006165742874145508,
695
+ "SimplifyNeuronTensor": 0.006829500198364258,
696
+ "SimplifySlice": 0.0013000965118408203,
697
+ "SimplifyTensor": 0.0061337947845458984,
698
+ "SpillPSum": 0.018761634826660156,
699
+ "SplitAPUnionSets": 0.017923593521118164,
700
+ "SplitAccGrp": 0.002531290054321289,
701
+ "StaticProfiler": 0.003990888595581055,
702
+ "StaticTransposeLocalTensor": 0.004915952682495117,
703
+ "SundaISel": 0.04209589958190918,
704
+ "TCTransform": 0.0012347698211669922,
705
+ "TensorInitialization": 0.002599954605102539,
706
+ "TensorOpSimplifier": 0.006845712661743164,
707
+ "TensorOpTransform": 0.03345227241516113,
708
+ "TileCCOps": 0.005617856979370117,
709
+ "TilingProfiler": 0.015013933181762695,
710
+ "TransformConvOp": 0.002393960952758789,
711
+ "TritiumFusion": 0.09340715408325195,
712
+ "ValueNumbering": 0.0031540393829345703,
713
+ "VectorizeDMA": 0.0015842914581298828,
714
+ "VectorizeMatMult": 0.0071103572845458984,
715
+ "WeightCoalescing": 0.0026235580444335938,
716
+ "ZeroSizeTensorElimination": 0.0001163482666015625
717
+ },
718
+ "tensorizer": {
719
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 7847.0,
720
+ "StaticProfiler::AifUb": 490.6532287597656,
721
+ "StaticProfiler::ArithmeticIntensityTensorizer": 487.63507080078125,
722
+ "StaticProfiler::AverageDmaLength": 869.1515502929688,
723
+ "StaticProfiler::AverageFractalPeUtilization": 100.0,
724
+ "StaticProfiler::AveragePartitionUtilization": 99.83790588378906,
725
+ "StaticProfiler::AveragePeUtilization": 100.0,
726
+ "StaticProfiler::DDRTransferBytes": 215827456.0,
727
+ "StaticProfiler::InternalTransferBytes": 43515904.0,
728
+ "StaticProfiler::LoadExpanded": 238976.0,
729
+ "StaticProfiler::LocalizationEfficiency": 99.38487243652344,
730
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 107.76165771484375,
731
+ "StaticProfiler::StoreExpanded": 5121.0,
732
+ "StaticProfiler::TotalDMAExpanded": 244097.0,
733
+ "StaticProfiler::TotalDynamicInstancesCount": 9872.0,
734
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 9872.0,
735
+ "StaticProfiler::TotalLNCComm": 0.0,
736
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
737
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
738
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
739
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
740
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
741
+ "TilingProfiler::GenericInstructionsAfterTiling": 32.0,
742
+ "TilingProfiler::MatMultInstructionsAfterTiling": 6016.0,
743
+ "TilingProfiler::NumPfTransposes": 8.0,
744
+ "TilingProfiler::NumPfTransposesForIo": 3.0,
745
+ "TilingProfiler::NumPfTransposesForLocal": 3.0,
746
+ "TilingProfiler::NumPfTransposesForNonlocal": 2.0,
747
+ "TilingProfiler::PfTransposeInstructions": 680.0,
748
+ "TilingProfiler::PfTransposeInstructionsForIo": 136.0,
749
+ "TilingProfiler::PfTransposeInstructionsForLocal": 288.0,
750
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 256.0,
751
+ "TilingProfiler::ReduceInstructionsAfterTiling": 0.0,
752
+ "TilingProfiler::SimdInstructionsAfterTiling": 288.0,
753
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
754
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
755
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
756
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
757
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
758
+ "TransformConvOp::conv2d_column_packing": 0.0,
759
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
760
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
761
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
762
+ }
763
+ },
764
+ "sg0002": {
765
+ "compiletime": {
766
+ "AGOrderingAnalysisPass": 0.018257856369018555,
767
+ "AffinePredicateResolution": 0.0011677742004394531,
768
+ "AliasDependencyElimination": 0.0001201629638671875,
769
+ "AliasDependencyInduction": 0.0052988529205322266,
770
+ "AliasDependencyReset": 0.029210567474365234,
771
+ "BFComputeCutting": 0.0032625198364257813,
772
+ "BirCodeGenLoop": 0.4527714252471924,
773
+ "CCOpFusion": 0.02410125732421875,
774
+ "CanonicalizeDAGForPGTiling": 0.004324913024902344,
775
+ "CanonicalizeIR": 0.0019502639770507813,
776
+ "CoalesceCCOp": 0.014463186264038086,
777
+ "CommuteConcat": 0.0008339881896972656,
778
+ "DMALocalityOpt": 0.005598783493041992,
779
+ "DMAProfiler": 0.01209115982055664,
780
+ "DMATilingProfiler": 0.004332065582275391,
781
+ "DataLocalityOpt": 0.07260942459106445,
782
+ "DataStreaming": 0.03940248489379883,
783
+ "DeConcat": 0.0005326271057128906,
784
+ "DeadCodeElimination": 0.0009255409240722656,
785
+ "DeadStoreElimination": 0.0055675506591796875,
786
+ "DelinearIndices": 0.004735231399536133,
787
+ "Delinearization": 0.0030374526977539063,
788
+ "DoNothing": 7.033348083496094e-05,
789
+ "DramToDramTranspose": 0.018135547637939453,
790
+ "DumpGraphAndMetadata": 0.09476375579833984,
791
+ "EliminateDivs": 0.002595663070678711,
792
+ "ExpandBatchNorm": 0.002063274383544922,
793
+ "ExpandISAMacro": 0.011472225189208984,
794
+ "FactorizeBlkDims": 0.008858203887939453,
795
+ "FactorizeThreadAxesInFreeDims": 0.0010046958923339844,
796
+ "FlattenMacroLoop": 0.002232074737548828,
797
+ "GenericAccessSimplifier": 0.0018167495727539063,
798
+ "InferInitValue": 0.024865150451660156,
799
+ "InferIntrinsicOnCC": 0.009101152420043945,
800
+ "InferNeuronTensor": 0.023293495178222656,
801
+ "InferNonlocalTensors": 0.01632833480834961,
802
+ "InferPSumTensor": 0.27681708335876465,
803
+ "InlineNativeKernels": 0.0081634521484375,
804
+ "InsertIOTransposes": 0.019203901290893555,
805
+ "InsertLocalTransposes": 0.0042340755462646484,
806
+ "InsertOffloadedTransposes": 0.002811431884765625,
807
+ "LICM": 0.0029730796813964844,
808
+ "LateLegalizeInst": 0.01390385627746582,
809
+ "LateLegalizePostSplit": 0.012536048889160156,
810
+ "LateLowerReshapeOp": 0.0018641948699951172,
811
+ "LateLowerTensorOp": 0.0014081001281738281,
812
+ "LateNeuronInstComb": 0.008648872375488281,
813
+ "LayoutPreprocessing": 0.02658390998840332,
814
+ "LayoutPreprocessingAndAnalysis": 0.10707235336303711,
815
+ "LayoutRequirementAnalysis": 0.005135536193847656,
816
+ "LegalizeCCOpLayout": 0.002307415008544922,
817
+ "LegalizeOpLevelAlias": 0.0012297630310058594,
818
+ "LegalizePartitionReduce": 0.0010194778442382813,
819
+ "LegalizeSundaAccess": 0.07593941688537598,
820
+ "LegalizeSundaMacro": 0.010968446731567383,
821
+ "LegalizeType": 0.011834383010864258,
822
+ "LocalLayoutOpt": 0.013799905776977539,
823
+ "LoopFusion": 0.0052182674407958984,
824
+ "LoopSplitting": 0.0003161430358886719,
825
+ "LowerBroadcast": 0.0013611316680908203,
826
+ "LowerCCOpBlockAxis": 0.0040547847747802734,
827
+ "LowerComplexBroadcast": 0.002165079116821289,
828
+ "LowerIntrinsics": 0.31132984161376953,
829
+ "LowerTensorOp": 0.010558843612670898,
830
+ "LowerTranspose": 0.012272357940673828,
831
+ "MacroGeneration": 0.029862642288208008,
832
+ "MaskPropagation": 0.002757549285888672,
833
+ "MemcpyElimination": 0.025969266891479492,
834
+ "MutateDataType": 0.002087831497192383,
835
+ "NeuronAliasDependencyInduction": 0.00016880035400390625,
836
+ "NeuronAliasDependencyReset": 0.020352602005004883,
837
+ "NeuronInstComb": 0.004126310348510742,
838
+ "NeuronLICM": 0.0351865291595459,
839
+ "NeuronLoopFusion": 0.007991313934326172,
840
+ "NeuronLoopInterchange": 0.002409219741821289,
841
+ "NeuronSimplifier": 0.007069587707519531,
842
+ "NeuronSimplifyPredicates": 0.12138772010803223,
843
+ "NeuronValueNumbering": 0.0028395652770996094,
844
+ "OptimizeAliasedCopyChain": 0.0005936622619628906,
845
+ "OptimizeNKIKernels": 0.5374257564544678,
846
+ "PAGLayoutOpt": 0.08115577697753906,
847
+ "PComputeCutting": 0.004801273345947266,
848
+ "PGLayoutTilingPipeline": 0.5454635620117188,
849
+ "PGTiling": 0.14933419227600098,
850
+ "PadElimination": 0.00034046173095703125,
851
+ "ParAxesAnnotation": 0.053552865982055664,
852
+ "PartialLoopFusion": 0.0067539215087890625,
853
+ "PartialSimdFusion": 0.00693058967590332,
854
+ "PerfectLoopNest": 0.0035321712493896484,
855
+ "RecognizeOpIdiom": 0.003947257995605469,
856
+ "Recompute": 0.00024962425231933594,
857
+ "RelaxPredicates": 0.013285398483276367,
858
+ "Rematerialization": 0.002062082290649414,
859
+ "ReshapeWeights": 0.002131223678588867,
860
+ "ResolveAccessConflict": 0.0038597583770751953,
861
+ "ResolveComplicatePredicates": 0.002032756805419922,
862
+ "RewriteReplicationMatmul": 0.001924753189086914,
863
+ "RewriteWeights": 0.002452373504638672,
864
+ "SFKVectorizer": 0.2690722942352295,
865
+ "SimpleAllReduceTiling": 0.008755922317504883,
866
+ "Simplifier": 0.004038810729980469,
867
+ "SimplifyMacroPredicates": 0.010622739791870117,
868
+ "SimplifyNeuronTensor": 1.059011697769165,
869
+ "SimplifySlice": 0.0009577274322509766,
870
+ "SimplifyTensor": 0.005341768264770508,
871
+ "SpillPSum": 0.011537313461303711,
872
+ "SplitAPUnionSets": 0.10771751403808594,
873
+ "SplitAccGrp": 0.002201557159423828,
874
+ "StaticProfiler": 0.012447118759155273,
875
+ "StaticTransposeLocalTensor": 0.0038712024688720703,
876
+ "SundaISel": 0.04214668273925781,
877
+ "TCTransform": 0.0008432865142822266,
878
+ "TensorInitialization": 0.012825727462768555,
879
+ "TensorOpSimplifier": 0.004651308059692383,
880
+ "TensorOpTransform": 0.019537687301635742,
881
+ "TileCCOps": 0.006766319274902344,
882
+ "TilingProfiler": 0.006911277770996094,
883
+ "TransformConvOp": 0.0030303001403808594,
884
+ "TritiumFusion": 0.04502224922180176,
885
+ "ValueNumbering": 0.001996755599975586,
886
+ "VectorizeDMA": 0.0019402503967285156,
887
+ "VectorizeMatMult": 0.0027413368225097656,
888
+ "WeightCoalescing": 0.00829005241394043,
889
+ "ZeroSizeTensorElimination": 0.00013709068298339844
890
+ },
891
+ "tensorizer": {
892
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 44382.0,
893
+ "StaticProfiler::AifUb": 205.154296875,
894
+ "StaticProfiler::ArithmeticIntensityTensorizer": 201.6046905517578,
895
+ "StaticProfiler::AverageDmaLength": 1901.806396484375,
896
+ "StaticProfiler::AverageFractalPeUtilization": 99.66542053222656,
897
+ "StaticProfiler::AveragePartitionUtilization": 97.7269515991211,
898
+ "StaticProfiler::AveragePeUtilization": 98.64861297607422,
899
+ "StaticProfiler::DDRTransferBytes": 795531072.0,
900
+ "StaticProfiler::InternalTransferBytes": 646388224.0,
901
+ "StaticProfiler::LoadExpanded": 376342.0,
902
+ "StaticProfiler::LocalizationEfficiency": 98.26979064941406,
903
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 101.01405334472656,
904
+ "StaticProfiler::StoreExpanded": 4189.0,
905
+ "StaticProfiler::TotalDMAExpanded": 380531.0,
906
+ "StaticProfiler::TotalDynamicInstancesCount": 53882.0,
907
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 53436.0,
908
+ "StaticProfiler::TotalLNCComm": 0.0,
909
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
910
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
911
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
912
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
913
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
914
+ "TilingProfiler::GenericInstructionsAfterTiling": 4.0,
915
+ "TilingProfiler::MatMultInstructionsAfterTiling": 23616.0,
916
+ "TilingProfiler::NumPfTransposes": 5.0,
917
+ "TilingProfiler::NumPfTransposesForIo": 1.0,
918
+ "TilingProfiler::NumPfTransposesForLocal": 1.0,
919
+ "TilingProfiler::NumPfTransposesForNonlocal": 3.0,
920
+ "TilingProfiler::PfTransposeInstructions": 19393.0,
921
+ "TilingProfiler::PfTransposeInstructionsForIo": 19008.0,
922
+ "TilingProfiler::PfTransposeInstructionsForLocal": 1.0,
923
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 384.0,
924
+ "TilingProfiler::ReduceInstructionsAfterTiling": 4.0,
925
+ "TilingProfiler::SimdInstructionsAfterTiling": 158.0,
926
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
927
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
928
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
929
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
930
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
931
+ "TransformConvOp::conv2d_column_packing": 0.0,
932
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
933
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
934
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
935
+ }
936
+ },
937
+ "sg01": {
938
+ "compiletime": {
939
+ "CanonicalizeConv": 1.2000000424450263e-05,
940
+ "CanonicalizeForTensorizer": 1.2999999853491317e-05,
941
+ "Canonicalizer": 0.0002500000118743628,
942
+ "HoistCompute": 3.000000106112566e-06,
943
+ "IdentifyCrossPassTensors": 2.300000051036477e-05,
944
+ "MemcastMotion": 1.1000000085914508e-05,
945
+ "PenguinizeFunctions": 1.4000000192027073e-05,
946
+ "PruneFunctions": 3.099999958067201e-05,
947
+ "RemoveOptimizationBarriers": 2.2000000171829015e-05,
948
+ "ScatterMotion": 2.9999999242136255e-05,
949
+ "TensorizerLegalizationPass": 1.700000029813964e-05,
950
+ "VerifySupportedOps": 9.000000318337698e-06,
951
+ "algsimp": 6.299999949987978e-05,
952
+ "batchnorm_expander": 1.2999999853491317e-05,
953
+ "boundary-marker-removal": 4.999999873689376e-06,
954
+ "call-inliner": 9.000000318337698e-06,
955
+ "canonicalize-boundary-marker": 6.000000212225132e-06,
956
+ "collective-stream-id-checker": 3.000000106112566e-06,
957
+ "comparison-expander": 4.999999873689376e-06,
958
+ "computation-deduplicator": 1.8000000636675395e-05,
959
+ "conditional-to-select": 4.999999873689376e-06,
960
+ "config-lowering": 2.5999999706982635e-05,
961
+ "constant_folding": 7.999999979801942e-06,
962
+ "cse": 1.2000000424450263e-05,
963
+ "dce": 9.999999974752427e-07,
964
+ "dynamic-slice-transpose": 3.999999989900971e-06,
965
+ "eliminate-redundant-compare": 3.999999989900971e-06,
966
+ "emit-offloaded-dropout": 1.2999999853491317e-05,
967
+ "flatten-call-graph": 7.999999979801942e-06,
968
+ "fuse-send-recv": 2.099999983329326e-05,
969
+ "hilo::LegalizeAlias": 4.999999873689376e-06,
970
+ "hilo::NeuronInstCombine": 4.5000000682193786e-05,
971
+ "hilo::NeuronOpFusion": 1.700000029813964e-05,
972
+ "hilo::ReplaceTokenTypeWithU8Pass": 2.099999983329326e-05,
973
+ "hilo::ScheduleFusion": 9.999999974752427e-07,
974
+ "hilo::SixtyFourHack": 1.2999999853491317e-05,
975
+ "hilo::VerifyAliasing": 1.9999999949504854e-06,
976
+ "hlo-mac-count": 2.9999999242136255e-05,
977
+ "hlo-verifier": 0.00018000000272877514,
978
+ "legalize-ccops": 9.999999974752427e-07,
979
+ "legalize-compare": 3.999999989900971e-06,
980
+ "lower-argminmax-custom-call": 3.999999989900971e-06,
981
+ "map-inline": 1.2000000424450263e-05,
982
+ "metadata-naming": 1.8000000636675395e-05,
983
+ "mlir::detail::OpToOpPassAdaptor": 9.999999747378752e-05,
984
+ "mlir::hlo::MhloToPyPenguin": 0.0009420000133104622,
985
+ "mlir::mhlo::LowerComplexExtraPass": 7.999999797903001e-05,
986
+ "mlir::mhlo::LowerComplexPass": 0.00015799999528098851,
987
+ "native-to-custom-softmax": 6.000000212225132e-06,
988
+ "native-to-custom-softmax-dx": 1.2999999853491317e-05,
989
+ "operand_upcaster": 1.4999999621068127e-05,
990
+ "post-par-pipe-begin": 1.9999999949504854e-06,
991
+ "post-par-pipe-end": 0.0,
992
+ "post-partition-simplification": 0.0005130000063218176,
993
+ "replace-minimum-constant": 6.000000212225132e-06,
994
+ "reshape-mover": 3.000000106112566e-06,
995
+ "simplify-concat": 4.8999998398358e-05,
996
+ "simplify-while-loops": 1.9999999949504854e-06,
997
+ "transform-variadic-reduce": 9.000000318337698e-06,
998
+ "tuple-simplifier": 4.999999873689376e-06,
999
+ "unpack-nested-aws-ntwsr": 3.999999989900971e-06,
1000
+ "unroll-while-loop": 0.0
1001
+ },
1002
+ "hilo": {
1003
+ "ArithmeticIntensity": 457.20416259765625,
1004
+ "HloMacCount": 50465865728.0,
1005
+ "Traffic": 220758560.0
1006
+ }
1007
+ },
1008
+ "sg02": {
1009
+ "compiletime": {
1010
+ "CanonicalizeConv": 0.0002589999930933118,
1011
+ "CanonicalizeForTensorizer": 1.2000000424450263e-05,
1012
+ "Canonicalizer": 0.0003060000017285347,
1013
+ "HoistCompute": 1.9999999949504854e-06,
1014
+ "IdentifyCrossPassTensors": 2.4000000848900527e-05,
1015
+ "MemcastMotion": 1.2999999853491317e-05,
1016
+ "PenguinizeFunctions": 9.000000318337698e-06,
1017
+ "PruneFunctions": 7.999999979801942e-06,
1018
+ "RemoveOptimizationBarriers": 2.099999983329326e-05,
1019
+ "ScatterMotion": 1.9999999949504854e-06,
1020
+ "TensorizerLegalizationPass": 4.999999873689376e-06,
1021
+ "VerifySupportedOps": 1.1000000085914508e-05,
1022
+ "algsimp": 5.900000178371556e-05,
1023
+ "batchnorm_expander": 1.2000000424450263e-05,
1024
+ "boundary-marker-removal": 3.999999989900971e-06,
1025
+ "call-inliner": 1.1000000085914508e-05,
1026
+ "canonicalize-boundary-marker": 4.999999873689376e-06,
1027
+ "collective-stream-id-checker": 3.000000106112566e-06,
1028
+ "comparison-expander": 4.999999873689376e-06,
1029
+ "computation-deduplicator": 4.099999932805076e-05,
1030
+ "conditional-to-select": 6.000000212225132e-06,
1031
+ "config-lowering": 2.5999999706982635e-05,
1032
+ "constant_folding": 9.000000318337698e-06,
1033
+ "cse": 1.1000000085914508e-05,
1034
+ "dce": 9.999999974752427e-07,
1035
+ "dynamic-slice-transpose": 3.999999989900971e-06,
1036
+ "eliminate-redundant-compare": 3.999999989900971e-06,
1037
+ "emit-offloaded-dropout": 1.2999999853491317e-05,
1038
+ "flatten-call-graph": 1.1000000085914508e-05,
1039
+ "fuse-send-recv": 1.8000000636675395e-05,
1040
+ "hilo::LegalizeAlias": 1.9999999949504854e-06,
1041
+ "hilo::NeuronInstCombine": 5.8000001445179805e-05,
1042
+ "hilo::NeuronOpFusion": 1.9999999949504854e-06,
1043
+ "hilo::ReplaceTokenTypeWithU8Pass": 9.000000318337698e-06,
1044
+ "hilo::ScheduleFusion": 3.999999989900971e-06,
1045
+ "hilo::SixtyFourHack": 3.899999865097925e-05,
1046
+ "hilo::VerifyAliasing": 9.999999974752427e-07,
1047
+ "hlo-mac-count": 0.00016799999866634607,
1048
+ "hlo-verifier": 0.00015900000289548188,
1049
+ "legalize-ccops": 9.999999974752427e-07,
1050
+ "legalize-compare": 3.000000106112566e-06,
1051
+ "lower-argminmax-custom-call": 3.000000106112566e-06,
1052
+ "map-inline": 1.2000000424450263e-05,
1053
+ "metadata-naming": 1.700000029813964e-05,
1054
+ "mlir::detail::OpToOpPassAdaptor": 2.2000000171829015e-05,
1055
+ "mlir::hlo::MhloToPyPenguin": 0.0008549999911338091,
1056
+ "mlir::mhlo::LowerComplexExtraPass": 8.800000068731606e-05,
1057
+ "mlir::mhlo::LowerComplexPass": 0.00015999999595806003,
1058
+ "native-to-custom-softmax": 4.999999873689376e-06,
1059
+ "native-to-custom-softmax-dx": 1.9999999494757503e-05,
1060
+ "operand_upcaster": 1.2000000424450263e-05,
1061
+ "post-par-pipe-begin": 1.9999999949504854e-06,
1062
+ "post-par-pipe-end": 0.0,
1063
+ "post-partition-simplification": 0.0005009999731555581,
1064
+ "replace-minimum-constant": 9.000000318337698e-06,
1065
+ "reshape-mover": 3.000000106112566e-06,
1066
+ "simplify-concat": 4.3000000005122274e-05,
1067
+ "simplify-while-loops": 1.9999999949504854e-06,
1068
+ "transform-variadic-reduce": 4.8000001697801054e-05,
1069
+ "tuple-simplifier": 4.999999873689376e-06,
1070
+ "unpack-nested-aws-ntwsr": 3.999999989900971e-06,
1071
+ "unroll-while-loop": 0.0
1072
+ },
1073
+ "hilo": {
1074
+ "ArithmeticIntensity": 99.1578140258789,
1075
+ "HloMacCount": 38965870592.0,
1076
+ "Traffic": 785936448.0
1077
+ }
1078
+ }
1079
+ }
context_encoding_model/_tp0_bk2/graph.neff ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7e216fd8f0f2acfef59524e7cdb4ead506b2c17c584ce45dd222cd4dc4e3f4f
3
+ size 1987584
context_encoding_model/_tp0_bk2/log-neuron-cc.txt ADDED
The diff for this file is too large to render. See raw diff
 
context_encoding_model/_tp0_bk2/metaneff.pb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99c279a1a32451ce56757879c7a74b6ff23378ae19871f2aee2c2746ceda57f3
3
+ size 1373735
context_encoding_model/_tp0_bk2/model.MODULE_00594b8bc68e927f3dbe+1ad60ced.hlo_module.pb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:970c5138d61d773fc00bacb9090fbc05a05573925b8d91068006c211596d3f78
3
+ size 1450821
context_encoding_model/_tp0_bk2/model.MODULE_00594b8bc68e927f3dbe+1ad60ced.neff ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7e216fd8f0f2acfef59524e7cdb4ead506b2c17c584ce45dd222cd4dc4e3f4f
3
+ size 1987584
context_encoding_model/_tp0_bk2/neuron_config.json ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation_autoset": false,
3
+ "_name_or_path": "Qwen/Qwen3-8B",
4
+ "add_cross_attention": false,
5
+ "architectures": [
6
+ "Qwen3ForCausalLM"
7
+ ],
8
+ "attention_bias": false,
9
+ "attention_dropout": 0.0,
10
+ "attribute_map": {},
11
+ "bad_words_ids": null,
12
+ "begin_suppress_tokens": null,
13
+ "bos_token_id": 151643,
14
+ "chunk_size_feed_forward": 0,
15
+ "cross_attention_hidden_size": null,
16
+ "decoder_start_token_id": null,
17
+ "diversity_penalty": 0.0,
18
+ "do_sample": false,
19
+ "early_stopping": false,
20
+ "encoder_no_repeat_ngram_size": 0,
21
+ "eos_token_id": 151645,
22
+ "exponential_decay_length_penalty": null,
23
+ "finetuning_task": null,
24
+ "forced_bos_token_id": null,
25
+ "forced_eos_token_id": null,
26
+ "fused_spec_config": null,
27
+ "head_dim": 128,
28
+ "hidden_act": "silu",
29
+ "hidden_size": 4096,
30
+ "id2label": {
31
+ "0": "LABEL_0",
32
+ "1": "LABEL_1"
33
+ },
34
+ "initializer_range": 0.02,
35
+ "intermediate_size": 12288,
36
+ "is_decoder": false,
37
+ "is_encoder_decoder": false,
38
+ "label2id": {
39
+ "LABEL_0": 0,
40
+ "LABEL_1": 1
41
+ },
42
+ "length_penalty": 1.0,
43
+ "max_length": 20,
44
+ "max_position_embeddings": 40960,
45
+ "max_window_layers": 36,
46
+ "metadata": null,
47
+ "min_length": 0,
48
+ "model_type": "qwen3",
49
+ "neuron_config": {
50
+ "activation_quantization_type": null,
51
+ "allow_input_truncation": false,
52
+ "apply_seq_ids_mask": false,
53
+ "async_mode": false,
54
+ "attention_dp_degree": 1,
55
+ "attention_dtype": null,
56
+ "attn_block_cte_nki_kernel_enabled": false,
57
+ "attn_block_tkg_nki_kernel_cache_update": false,
58
+ "attn_block_tkg_nki_kernel_enabled": false,
59
+ "attn_cls": {
60
+ "__module__": "neuronx_distributed_inference.models.qwen3.modeling_qwen3",
61
+ "__name__": "NeuronQwen3Attention"
62
+ },
63
+ "attn_kernel_enabled": null,
64
+ "attn_tkg_builtin_kernel_enabled": false,
65
+ "attn_tkg_nki_kernel_enabled": false,
66
+ "batch_size": 1,
67
+ "bucket_n_active_tokens": true,
68
+ "buckets": [
69
+ 512
70
+ ],
71
+ "cast_type": "config",
72
+ "cc_pipeline_tiling_factor": 2,
73
+ "chunked_prefill_config": null,
74
+ "context_encoding_buckets": [
75
+ 512
76
+ ],
77
+ "cp_degree": 1,
78
+ "ctx_batch_size": 1,
79
+ "disable_kv_cache_tiling": false,
80
+ "draft_model_modules_to_not_convert": null,
81
+ "enable_bucketing": true,
82
+ "enable_eagle_draft_input_norm": false,
83
+ "enable_eagle_speculation": false,
84
+ "enable_fused_speculation": false,
85
+ "enable_long_context_mode": false,
86
+ "enable_output_completion_notifications": false,
87
+ "enable_spill_reload_dge": false,
88
+ "enable_token_tree": false,
89
+ "ep_degree": 1,
90
+ "expert_mlp_nki_kernel_enabled": null,
91
+ "flash_decoding_enabled": false,
92
+ "fused_qkv": false,
93
+ "fused_rmsnorm_skip_gamma": false,
94
+ "is_block_kv_layout": null,
95
+ "is_chunked_prefill": false,
96
+ "is_continuous_batching": true,
97
+ "is_eagle_draft": false,
98
+ "is_medusa": false,
99
+ "is_prefill_stage": true,
100
+ "is_prefix_caching": false,
101
+ "k_cache_transposed": false,
102
+ "kv_cache_batch_size": 1,
103
+ "kv_cache_padding_size": 0,
104
+ "kv_cache_quant": false,
105
+ "kv_cache_tiling": false,
106
+ "layer_boundary_markers": false,
107
+ "lm_head_pad": false,
108
+ "lm_head_pad_alignment_size": 1,
109
+ "local_ranks_size": 2,
110
+ "logical_nc_config": 1,
111
+ "lora_config": null,
112
+ "max_batch_size": 1,
113
+ "max_context_length": 1024,
114
+ "max_length": 1024,
115
+ "max_new_tokens": null,
116
+ "medusa_speculation_length": 0,
117
+ "medusa_tree": null,
118
+ "mlp_kernel_enabled": false,
119
+ "mlp_kernel_fuse_residual_add": false,
120
+ "modules_to_not_convert": null,
121
+ "moe_fused_nki_kernel_enabled": null,
122
+ "n_active_tokens": 1024,
123
+ "n_positions": 1024,
124
+ "num_medusa_heads": 0,
125
+ "on_cpu": false,
126
+ "on_device_sampling_config": {
127
+ "deterministic": false,
128
+ "do_sample": false,
129
+ "dynamic": true,
130
+ "global_topk": 256,
131
+ "on_device_sampling_config": true,
132
+ "temperature": 1.0,
133
+ "top_k": 1,
134
+ "top_k_kernel_enabled": false,
135
+ "top_p": 1.0
136
+ },
137
+ "output_logits": false,
138
+ "overrides_torch_dtype": true,
139
+ "pa_block_size": 1024,
140
+ "pa_num_blocks": 1,
141
+ "padding_side": "right",
142
+ "pp_degree": 1,
143
+ "prefix_buckets": null,
144
+ "qk_layernorm": false,
145
+ "qkv_kernel_enabled": false,
146
+ "qkv_kernel_fuse_residual_add": false,
147
+ "qkv_kernel_nbsd_layout": false,
148
+ "quantization_dtype": "int8",
149
+ "quantization_type": "per_tensor_symmetric",
150
+ "quantize_clamp_bound": Infinity,
151
+ "quantized": false,
152
+ "quantized_checkpoints_path": null,
153
+ "quantized_mlp_kernel_enabled": false,
154
+ "rmsnorm_quantize_kernel_enabled": false,
155
+ "router_topk_nki_kernel_enabled": null,
156
+ "rpl_reduce_dtype": null,
157
+ "save_sharded_checkpoint": true,
158
+ "scratchpad_page_size": null,
159
+ "seq_len": 1024,
160
+ "seq_len_threshold_for_cc_tiling": 16384,
161
+ "sequence_parallel_enabled": false,
162
+ "shared_mlp_nki_kernel_enabled": null,
163
+ "skip_sharding": false,
164
+ "skip_warmup": false,
165
+ "spec_batch_size": 1,
166
+ "speculation_length": 0,
167
+ "start_rank_id": 0,
168
+ "target": null,
169
+ "tile_cc": false,
170
+ "tkg_batch_size": 1,
171
+ "token_generation_buckets": null,
172
+ "token_tree_config": null,
173
+ "torch_dtype": "bfloat16",
174
+ "tp_degree": 2,
175
+ "vocab_parallel": false,
176
+ "weight_gather_seq_len_threshold": 32768,
177
+ "weights_to_skip_layout_optimization": [],
178
+ "world_size": 2
179
+ },
180
+ "no_repeat_ngram_size": 0,
181
+ "num_attention_heads": 32,
182
+ "num_beam_groups": 1,
183
+ "num_beams": 1,
184
+ "num_cores_per_group": 1,
185
+ "num_hidden_layers": 36,
186
+ "num_key_value_heads": 8,
187
+ "num_return_sequences": 1,
188
+ "output_attentions": false,
189
+ "output_hidden_states": false,
190
+ "output_scores": false,
191
+ "pad_token_id": 0,
192
+ "prefix": null,
193
+ "problem_type": null,
194
+ "pruned_heads": {},
195
+ "remove_invalid_values": false,
196
+ "repetition_penalty": 1.0,
197
+ "return_dict": true,
198
+ "return_dict_in_generate": false,
199
+ "rms_norm_eps": 1e-06,
200
+ "rope_scaling": null,
201
+ "rope_theta": 1000000,
202
+ "sep_token_id": null,
203
+ "sliding_window": null,
204
+ "suppress_tokens": null,
205
+ "task_specific_params": null,
206
+ "temperature": 1.0,
207
+ "tf_legacy_loss": false,
208
+ "tie_encoder_decoder": false,
209
+ "tie_word_embeddings": false,
210
+ "tokenizer_class": null,
211
+ "top_k": 50,
212
+ "top_p": 1.0,
213
+ "torchscript": false,
214
+ "transformers_version": "4.51.0",
215
+ "typical_p": 1.0,
216
+ "use_bfloat16": false,
217
+ "use_cache": true,
218
+ "use_sliding_window": false,
219
+ "vocab_size": 151936
220
+ }
context_encoding_model/_tp0_bk3/command.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ neuronx-cc compile --framework=XLA model.MODULE_b3ddbc97e5f0d1d64c82+155de413.hlo_module.pb --output model.MODULE_b3ddbc97e5f0d1d64c82+155de413.neff --target=trn1 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ' --lnc=1 -O1 '--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true' --logfile=log-neuron-cc.txt --verbose=35
context_encoding_model/_tp0_bk3/compile_flags.MODULE_b3ddbc97e5f0d1d64c82+155de413.json ADDED
@@ -0,0 +1 @@
 
 
1
+ ["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "--lnc=1", "-O1", "--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true", "--logfile=/home/ubuntu/qwen3/context_encoding_model/_tp0_bk3/log-neuron-cc.txt"]
context_encoding_model/_tp0_bk3/global_metric_store.json ADDED
@@ -0,0 +1,1079 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Average": {
3
+ "tensorizer": {
4
+ "StaticProfiler::AverageFractalPeUtilization": 99.7004623413086,
5
+ "StaticProfiler::AveragePartitionUtilization": 97.94140625,
6
+ "StaticProfiler::AveragePeUtilization": 98.78884887695313,
7
+ "StaticProfiler::LocalizationEfficiency": 91.59693145751953,
8
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 95.863037109375,
9
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
10
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0
11
+ }
12
+ },
13
+ "Count": {
14
+ "tensorizer": {
15
+ "StaticProfiler::AverageFractalPeUtilization": 1.0,
16
+ "StaticProfiler::AveragePartitionUtilization": 1.0,
17
+ "StaticProfiler::AveragePeUtilization": 1.0,
18
+ "StaticProfiler::LocalizationEfficiency": 1.0,
19
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1.0,
20
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 1.0,
21
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 1.0
22
+ }
23
+ },
24
+ "Sum": {
25
+ "compiletime": {
26
+ "AGOrderingAnalysisPass": 0.01837611198425293,
27
+ "AffinePredicateResolution": 0.0011184215545654297,
28
+ "AliasDependencyElimination": 0.00015664100646972656,
29
+ "AliasDependencyInduction": 0.005170583724975586,
30
+ "AliasDependencyReset": 0.027508020401000977,
31
+ "BFComputeCutting": 0.0036101341247558594,
32
+ "BirCodeGenLoop": 0.4774467945098877,
33
+ "CCOpFusion": 0.033265113830566406,
34
+ "CanonicalizeConv": 2.300000051036477e-05,
35
+ "CanonicalizeDAGForPGTiling": 0.004282712936401367,
36
+ "CanonicalizeForTensorizer": 4.600000102072954e-05,
37
+ "CanonicalizeIR": 0.0024569034576416016,
38
+ "Canonicalizer": 0.0009039999567903578,
39
+ "CoalesceCCOp": 0.014229059219360352,
40
+ "CommuteConcat": 0.0017316341400146484,
41
+ "DMALocalityOpt": 0.005630016326904297,
42
+ "DMAProfiler": 0.012981653213500977,
43
+ "DMATilingProfiler": 0.0037560462951660156,
44
+ "DataLocalityOpt": 0.07645320892333984,
45
+ "DataStreaming": 0.03730320930480957,
46
+ "DeConcat": 0.0018520355224609375,
47
+ "DeadCodeElimination": 0.0020148754119873047,
48
+ "DeadStoreElimination": 0.006912708282470703,
49
+ "DelinearIndices": 0.004647254943847656,
50
+ "Delinearization": 0.003908872604370117,
51
+ "DoNothing": 0.0001888275146484375,
52
+ "DramToDramTranspose": 0.02015542984008789,
53
+ "DumpGraphAndMetadata": 0.08691883087158203,
54
+ "EliminateDivs": 0.0025060176849365234,
55
+ "ExpandBatchNorm": 0.0027189254760742188,
56
+ "ExpandISAMacro": 0.011646032333374023,
57
+ "FactorizeBlkDims": 0.010123252868652344,
58
+ "FactorizeThreadAxesInFreeDims": 0.0023202896118164063,
59
+ "FlattenMacroLoop": 0.00232696533203125,
60
+ "GenericAccessSimplifier": 0.0008094310760498047,
61
+ "HoistCompute": 5.999999757477781e-06,
62
+ "IdentifyCrossPassTensors": 5.2999999752501026e-05,
63
+ "InferInitValue": 0.02833867073059082,
64
+ "InferIntrinsicOnCC": 0.008923768997192383,
65
+ "InferNeuronTensor": 0.025766372680664063,
66
+ "InferNonlocalTensors": 0.014599800109863281,
67
+ "InferPSumTensor": 0.28418898582458496,
68
+ "InlineNativeKernels": 0.00860905647277832,
69
+ "InsertIOTransposes": 0.01989889144897461,
70
+ "InsertLocalTransposes": 0.004229307174682617,
71
+ "InsertOffloadedTransposes": 0.0029871463775634766,
72
+ "LICM": 0.0030870437622070313,
73
+ "LateLegalizeInst": 0.014106035232543945,
74
+ "LateLegalizePostSplit": 0.014872312545776367,
75
+ "LateLowerReshapeOp": 0.0010464191436767578,
76
+ "LateLowerTensorOp": 0.002707242965698242,
77
+ "LateNeuronInstComb": 0.010563373565673828,
78
+ "LayoutPreprocessing": 0.026853561401367188,
79
+ "LayoutPreprocessingAndAnalysis": 0.0556035041809082,
80
+ "LayoutRequirementAnalysis": 0.004946470260620117,
81
+ "LegalizeCCOpLayout": 0.0025353431701660156,
82
+ "LegalizeOpLevelAlias": 0.0018966197967529297,
83
+ "LegalizePartitionReduce": 0.0017490386962890625,
84
+ "LegalizeSundaAccess": 0.07800722122192383,
85
+ "LegalizeSundaMacro": 0.012125253677368164,
86
+ "LegalizeType": 0.012685060501098633,
87
+ "LocalLayoutOpt": 0.013860225677490234,
88
+ "LoopFusion": 0.005201578140258789,
89
+ "LoopSplitting": 0.0003204345703125,
90
+ "LowerBroadcast": 0.002086162567138672,
91
+ "LowerCCOpBlockAxis": 0.0040171146392822266,
92
+ "LowerComplexBroadcast": 0.002280712127685547,
93
+ "LowerIntrinsics": 0.3143951892852783,
94
+ "LowerTensorOp": 0.01141357421875,
95
+ "LowerTranspose": 0.012923002243041992,
96
+ "MacroGeneration": 0.034410953521728516,
97
+ "MaskPropagation": 0.0028192996978759766,
98
+ "MemcastMotion": 1.8000000636675395e-05,
99
+ "MemcpyElimination": 0.02788853645324707,
100
+ "MutateDataType": 0.0012311935424804688,
101
+ "NeuronAliasDependencyInduction": 0.0001773834228515625,
102
+ "NeuronAliasDependencyReset": 0.024976015090942383,
103
+ "NeuronInstComb": 0.005156517028808594,
104
+ "NeuronLICM": 0.036696434020996094,
105
+ "NeuronLoopFusion": 0.008457422256469727,
106
+ "NeuronLoopInterchange": 0.001413106918334961,
107
+ "NeuronSimplifier": 0.007856369018554688,
108
+ "NeuronSimplifyPredicates": 0.12235808372497559,
109
+ "NeuronValueNumbering": 0.004765748977661133,
110
+ "OptimizeAliasedCopyChain": 0.0006341934204101563,
111
+ "OptimizeNKIKernels": 0.38834357261657715,
112
+ "PAGLayoutOpt": 0.0889735221862793,
113
+ "PComputeCutting": 0.005109071731567383,
114
+ "PGLayoutTilingPipeline": 0.6248171329498291,
115
+ "PGTiling": 0.1645822525024414,
116
+ "PadElimination": 0.0003485679626464844,
117
+ "ParAxesAnnotation": 0.05196070671081543,
118
+ "PartialLoopFusion": 0.011112451553344727,
119
+ "PartialSimdFusion": 0.012138128280639648,
120
+ "PenguinizeFunctions": 4.3000000005122274e-05,
121
+ "PerfectLoopNest": 0.002288341522216797,
122
+ "PruneFunctions": 4.099999932805076e-05,
123
+ "RecognizeOpIdiom": 0.0041277408599853516,
124
+ "Recompute": 0.00026416778564453125,
125
+ "RelaxPredicates": 0.01356959342956543,
126
+ "Rematerialization": 0.0024864673614501953,
127
+ "RemoveOptimizationBarriers": 4.900000203633681e-05,
128
+ "ReshapeWeights": 0.0007522106170654297,
129
+ "ResolveAccessConflict": 0.0048482418060302734,
130
+ "ResolveComplicatePredicates": 0.0015094280242919922,
131
+ "RewriteReplicationMatmul": 0.0015668869018554688,
132
+ "RewriteWeights": 0.0027174949645996094,
133
+ "SFKVectorizer": 0.2781519889831543,
134
+ "ScatterMotion": 4.70000013592653e-05,
135
+ "SimpleAllReduceTiling": 0.009549379348754883,
136
+ "Simplifier": 0.003630399703979492,
137
+ "SimplifyMacroPredicates": 0.011396646499633789,
138
+ "SimplifyNeuronTensor": 1.0561063289642334,
139
+ "SimplifySlice": 0.0023348331451416016,
140
+ "SimplifyTensor": 0.005601167678833008,
141
+ "SpillPSum": 0.013618230819702148,
142
+ "SplitAPUnionSets": 0.11336159706115723,
143
+ "SplitAccGrp": 0.001394510269165039,
144
+ "StaticProfiler": 0.014252662658691406,
145
+ "StaticTransposeLocalTensor": 0.003930330276489258,
146
+ "SundaISel": 0.04436635971069336,
147
+ "TCTransform": 0.0008757114410400391,
148
+ "TensorInitialization": 0.01558232307434082,
149
+ "TensorOpSimplifier": 0.004608869552612305,
150
+ "TensorOpTransform": 0.01923346519470215,
151
+ "TensorizerLegalizationPass": 5.2999999752501026e-05,
152
+ "TileCCOps": 0.005507707595825195,
153
+ "TilingProfiler": 0.007405757904052734,
154
+ "TransformConvOp": 0.0030219554901123047,
155
+ "TritiumFusion": 0.05425119400024414,
156
+ "ValueNumbering": 0.0020017623901367188,
157
+ "VectorizeDMA": 0.002228975296020508,
158
+ "VectorizeMatMult": 0.006806135177612305,
159
+ "VerifySupportedOps": 3.5000000934815034e-05,
160
+ "WeightCoalescing": 0.008660554885864258,
161
+ "ZeroSizeTensorElimination": 0.00014281272888183594,
162
+ "algsimp": 0.0027209999971091747,
163
+ "batchnorm_expander": 4.099999932805076e-05,
164
+ "boundary-marker-removal": 1.2999998943996616e-05,
165
+ "call-inliner": 0.0004540000227279961,
166
+ "canonicalize-boundary-marker": 1.700000029813964e-05,
167
+ "collective-stream-id-checker": 8.000000525498763e-05,
168
+ "comparison-expander": 0.0005869999877177179,
169
+ "computation-deduplicator": 7.500000356230885e-05,
170
+ "conditional-to-select": 1.700000029813964e-05,
171
+ "config-lowering": 8.800000068731606e-05,
172
+ "constant-statistics": 0.0005440000095404685,
173
+ "constant_folding": 0.00032700004521757364,
174
+ "cse": 3.7000001611886546e-05,
175
+ "dce": 9.100000170292333e-05,
176
+ "dot_decomposer": 0.0013370000524446368,
177
+ "dynamic-slice-transpose": 1.2000000424450263e-05,
178
+ "eliminate-redundant-compare": 0.0003020000003743917,
179
+ "emit-offloaded-dropout": 3.9999998989515007e-05,
180
+ "flatten-call-graph": 0.0009239999344572425,
181
+ "fuse-send-recv": 7.79999973019585e-05,
182
+ "hilo::LegalizeAlias": 1.1999999514955562e-05,
183
+ "hilo::NeuronInstCombine": 0.00018899999849963933,
184
+ "hilo::NeuronOpFusion": 4.5000000682193786e-05,
185
+ "hilo::ReplaceTokenTypeWithU8Pass": 5.7999997807201e-05,
186
+ "hilo::ScheduleFusion": 0.00016099998902063817,
187
+ "hilo::SixtyFourHack": 6.70000008540228e-05,
188
+ "hilo::VerifyAliasing": 4.999999873689376e-06,
189
+ "hlo-mac-count": 0.0013409999664872885,
190
+ "hlo-verifier": 0.007716999854892492,
191
+ "instruction-histogram": 0.0007719999994151294,
192
+ "io-con-pipe-begin": 4.999999873689376e-06,
193
+ "io-con-pipe-end": 9.999999974752427e-07,
194
+ "io-layout-normalization": 0.00139999995008111,
195
+ "io-statistics": 6.299999949987978e-05,
196
+ "legalize-ccops": 3.999999989900971e-06,
197
+ "legalize-compare": 1.1000000085914508e-05,
198
+ "lower-argminmax-custom-call": 1.1000000085914508e-05,
199
+ "map-inline": 0.0008809999562799931,
200
+ "metadata-naming": 6.70000008540228e-05,
201
+ "mlir::detail::OpToOpPassAdaptor": 0.00020599999697878957,
202
+ "mlir::hlo::MhloToPyPenguin": 0.00291300006210804,
203
+ "mlir::mhlo::LowerComplexExtraPass": 0.00027200000477023423,
204
+ "mlir::mhlo::LowerComplexPass": 0.0003980000037699938,
205
+ "native-to-custom-softmax": 0.0007730000070296228,
206
+ "native-to-custom-softmax-dx": 0.0006189999985508621,
207
+ "operand_upcaster": 6.299999949987978e-05,
208
+ "opt-barrier-removal": 0.0005789999850094318,
209
+ "post-par-pipe-begin": 7.999999979801942e-06,
210
+ "post-par-pipe-end": 0.0,
211
+ "post-partition-simplification": 0.0017419999931007624,
212
+ "pre-par-pipe-begin": 1.9999999949504854e-06,
213
+ "pre-par-pipe-end": 0.0,
214
+ "pre-partition-simplification": 0.1384889930486679,
215
+ "replace-minimum-constant": 0.0004579999949783087,
216
+ "reshape-mover": 0.00011000000085914508,
217
+ "simplify-concat": 0.00014099999680183828,
218
+ "simplify-while-loops": 9.40000027185306e-05,
219
+ "transform-variadic-reduce": 8.100000559352338e-05,
220
+ "tuple-simplifier": 0.00030600003083236516,
221
+ "unpack-nested-aws-ntwsr": 0.000438000017311424,
222
+ "unroll-while-loop": 1.8999999156221747e-05,
223
+ "zero_sized_hlo_elimination": 0.0008750000270083547
224
+ },
225
+ "hilo": {
226
+ "ConstantSize": 2368805.0,
227
+ "HloInputCount": 475.0,
228
+ "HloMacCount": 206469595136.0,
229
+ "HloOutputCount": 73.0,
230
+ "IfmapSize": 8266549248.0,
231
+ "OfmapSize": 75497472.0,
232
+ "OutputsReadFromCount": 0.0,
233
+ "PassthroughTensorsCount": 0.0,
234
+ "RedundantOutputCount": 0.0,
235
+ "Traffic": 1751252352.0
236
+ },
237
+ "tensorizer": {
238
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 49538.0,
239
+ "StaticProfiler::AifUb": 304.240234375,
240
+ "StaticProfiler::ArithmeticIntensityTensorizer": 278.67474365234375,
241
+ "StaticProfiler::AverageDmaLength": 1974.1033935546875,
242
+ "StaticProfiler::DDRTransferBytes": 862646080.0,
243
+ "StaticProfiler::InternalTransferBytes": 669456896.0,
244
+ "StaticProfiler::LoadExpanded": 390679.0,
245
+ "StaticProfiler::StoreExpanded": 7261.0,
246
+ "StaticProfiler::TotalDMAExpanded": 397940.0,
247
+ "StaticProfiler::TotalDynamicInstancesCount": 59578.0,
248
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 59132.0,
249
+ "StaticProfiler::TotalLNCComm": 0.0,
250
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
251
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
252
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
253
+ "TilingProfiler::GenericInstructionsAfterTiling": 4.0,
254
+ "TilingProfiler::MatMultInstructionsAfterTiling": 28224.0,
255
+ "TilingProfiler::NumPfTransposes": 5.0,
256
+ "TilingProfiler::NumPfTransposesForIo": 1.0,
257
+ "TilingProfiler::NumPfTransposesForLocal": 1.0,
258
+ "TilingProfiler::NumPfTransposesForNonlocal": 3.0,
259
+ "TilingProfiler::PfTransposeInstructions": 19777.0,
260
+ "TilingProfiler::PfTransposeInstructionsForIo": 19008.0,
261
+ "TilingProfiler::PfTransposeInstructionsForLocal": 1.0,
262
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 768.0,
263
+ "TilingProfiler::ReduceInstructionsAfterTiling": 6.0,
264
+ "TilingProfiler::SimdInstructionsAfterTiling": 303.0,
265
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
266
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
267
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
268
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
269
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
270
+ "TransformConvOp::conv2d_column_packing": 0.0,
271
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
272
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
273
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
274
+ }
275
+ },
276
+ "all": {
277
+ "compiletime": {
278
+ "algsimp": 0.002532999962568283,
279
+ "call-inliner": 0.00042600001324899495,
280
+ "collective-stream-id-checker": 6.70000008540228e-05,
281
+ "comparison-expander": 0.0005719999899156392,
282
+ "constant-statistics": 0.0005440000095404685,
283
+ "constant_folding": 0.0003000000142492354,
284
+ "dce": 8.800000068731606e-05,
285
+ "dot_decomposer": 0.0013370000524446368,
286
+ "eliminate-redundant-compare": 0.000291000003926456,
287
+ "flatten-call-graph": 0.0008929999894462526,
288
+ "hlo-mac-count": 0.0010870000114664435,
289
+ "hlo-verifier": 0.007048000115901232,
290
+ "instruction-histogram": 0.0007719999994151294,
291
+ "io-con-pipe-begin": 4.999999873689376e-06,
292
+ "io-con-pipe-end": 9.999999974752427e-07,
293
+ "io-layout-normalization": 0.00139999995008111,
294
+ "io-statistics": 6.299999949987978e-05,
295
+ "map-inline": 0.0008459999808110297,
296
+ "native-to-custom-softmax": 0.0006709999870508909,
297
+ "native-to-custom-softmax-dx": 0.0005300000193528831,
298
+ "opt-barrier-removal": 0.0005789999850094318,
299
+ "pre-par-pipe-begin": 1.9999999949504854e-06,
300
+ "pre-par-pipe-end": 0.0,
301
+ "pre-partition-simplification": 0.1384889930486679,
302
+ "replace-minimum-constant": 0.00041700000292621553,
303
+ "reshape-mover": 9.999999747378752e-05,
304
+ "simplify-while-loops": 8.800000068731606e-05,
305
+ "tuple-simplifier": 0.000291000003926456,
306
+ "unpack-nested-aws-ntwsr": 0.00042600001324899495,
307
+ "unroll-while-loop": 1.8999999156221747e-05,
308
+ "zero_sized_hlo_elimination": 0.0008750000270083547
309
+ }
310
+ },
311
+ "cumsum": {
312
+ "compiletime": {
313
+ "CoalesceCCOp": 0.00023508071899414063,
314
+ "DMALocalityOpt": 0.00017404556274414063,
315
+ "DMAProfiler": 0.0008785724639892578,
316
+ "DataStreaming": 0.0002880096435546875,
317
+ "DoNothing": 0.00011467933654785156,
318
+ "ExpandISAMacro": 0.0006787776947021484,
319
+ "FactorizeBlkDims": 0.0004444122314453125,
320
+ "InferPSumTensor": 0.0004467964172363281,
321
+ "LateLegalizeInst": 0.000461578369140625,
322
+ "LateNeuronInstComb": 0.0004818439483642578,
323
+ "LegalizeSundaAccess": 0.0016222000122070313,
324
+ "LegalizeType": 0.0002703666687011719,
325
+ "LowerBroadcast": 0.00025391578674316406,
326
+ "LowerIntrinsics": 0.00021457672119140625,
327
+ "LowerTranspose": 0.00024318695068359375,
328
+ "NeuronInstComb": 0.00048065185546875,
329
+ "NeuronLICM": 0.00038552284240722656,
330
+ "NeuronSimplifyPredicates": 0.0027823448181152344,
331
+ "NeuronValueNumbering": 0.00043129920959472656,
332
+ "SFKVectorizer": 0.003134012222290039,
333
+ "SimpleAllReduceTiling": 0.00022721290588378906,
334
+ "SimplifyNeuronTensor": 0.0005092620849609375,
335
+ "SpillPSum": 0.0005443096160888672,
336
+ "WeightCoalescing": 0.00020051002502441406
337
+ }
338
+ },
339
+ "sg00": {
340
+ "compiletime": {
341
+ "CanonicalizeConv": 9.999999974752427e-07,
342
+ "CanonicalizeForTensorizer": 1.700000029813964e-05,
343
+ "Canonicalizer": 0.00033599999733269215,
344
+ "HoistCompute": 3.000000106112566e-06,
345
+ "IdentifyCrossPassTensors": 1.5999999959603883e-05,
346
+ "MemcastMotion": 1.1000000085914508e-05,
347
+ "PenguinizeFunctions": 1.8000000636675395e-05,
348
+ "PruneFunctions": 1.4000000192027073e-05,
349
+ "RemoveOptimizationBarriers": 1.2999999853491317e-05,
350
+ "ScatterMotion": 2.4000000848900527e-05,
351
+ "TensorizerLegalizationPass": 2.700000004551839e-05,
352
+ "VerifySupportedOps": 1.2000000424450263e-05,
353
+ "algsimp": 6.500000017695129e-05,
354
+ "batchnorm_expander": 1.4000000192027073e-05,
355
+ "boundary-marker-removal": 3.999999989900971e-06,
356
+ "call-inliner": 9.000000318337698e-06,
357
+ "canonicalize-boundary-marker": 6.000000212225132e-06,
358
+ "collective-stream-id-checker": 3.999999989900971e-06,
359
+ "comparison-expander": 4.999999873689376e-06,
360
+ "computation-deduplicator": 2.300000051036477e-05,
361
+ "conditional-to-select": 4.999999873689376e-06,
362
+ "config-lowering": 3.9999998989515007e-05,
363
+ "constant_folding": 9.000000318337698e-06,
364
+ "cse": 1.2999999853491317e-05,
365
+ "dce": 9.999999974752427e-07,
366
+ "dynamic-slice-transpose": 3.999999989900971e-06,
367
+ "eliminate-redundant-compare": 3.999999989900971e-06,
368
+ "emit-offloaded-dropout": 1.4000000192027073e-05,
369
+ "flatten-call-graph": 9.999999747378752e-06,
370
+ "fuse-send-recv": 2.8000000384054147e-05,
371
+ "hilo::LegalizeAlias": 4.999999873689376e-06,
372
+ "hilo::NeuronInstCombine": 8.499999967170879e-05,
373
+ "hilo::NeuronOpFusion": 2.700000004551839e-05,
374
+ "hilo::ReplaceTokenTypeWithU8Pass": 1.4999999621068127e-05,
375
+ "hilo::ScheduleFusion": 9.999999974752427e-07,
376
+ "hilo::SixtyFourHack": 1.2999999853491317e-05,
377
+ "hilo::VerifyAliasing": 1.9999999949504854e-06,
378
+ "hlo-mac-count": 3.099999958067201e-05,
379
+ "hlo-verifier": 0.0002530000056140125,
380
+ "legalize-ccops": 9.999999974752427e-07,
381
+ "legalize-compare": 3.999999989900971e-06,
382
+ "lower-argminmax-custom-call": 3.999999989900971e-06,
383
+ "map-inline": 1.2000000424450263e-05,
384
+ "metadata-naming": 2.4000000848900527e-05,
385
+ "mlir::detail::OpToOpPassAdaptor": 2.2000000171829015e-05,
386
+ "mlir::hlo::MhloToPyPenguin": 0.0010389999952167273,
387
+ "mlir::mhlo::LowerComplexExtraPass": 8.800000068731606e-05,
388
+ "mlir::mhlo::LowerComplexPass": 0.00014200000441633165,
389
+ "native-to-custom-softmax": 9.000000136438757e-05,
390
+ "native-to-custom-softmax-dx": 4.3000000005122274e-05,
391
+ "operand_upcaster": 2.300000051036477e-05,
392
+ "post-par-pipe-begin": 3.000000106112566e-06,
393
+ "post-par-pipe-end": 0.0,
394
+ "post-partition-simplification": 0.0006249999860301614,
395
+ "replace-minimum-constant": 2.5999999706982635e-05,
396
+ "reshape-mover": 3.999999989900971e-06,
397
+ "simplify-concat": 4.8000001697801054e-05,
398
+ "simplify-while-loops": 1.9999999949504854e-06,
399
+ "transform-variadic-reduce": 9.000000318337698e-06,
400
+ "tuple-simplifier": 4.999999873689376e-06,
401
+ "unpack-nested-aws-ntwsr": 3.999999989900971e-06,
402
+ "unroll-while-loop": 0.0
403
+ },
404
+ "hilo": {
405
+ "ArithmeticIntensity": 73.02900695800781,
406
+ "ConstantSize": 2368805.0,
407
+ "HloInputCount": 475.0,
408
+ "HloMacCount": 25769803776.0,
409
+ "HloOutputCount": 73.0,
410
+ "IfmapSize": 8266549248.0,
411
+ "OfmapSize": 75497472.0,
412
+ "OutputsReadFromCount": 0.0,
413
+ "PassthroughTensorsCount": 0.0,
414
+ "RedundantOutputCount": 0.0,
415
+ "Traffic": 705741632.0
416
+ }
417
+ },
418
+ "sg0000": {
419
+ "compiletime": {
420
+ "AGOrderingAnalysisPass": 0.0818486213684082,
421
+ "AffinePredicateResolution": 0.001665353775024414,
422
+ "AliasDependencyElimination": 0.00012683868408203125,
423
+ "AliasDependencyInduction": 0.008559942245483398,
424
+ "AliasDependencyReset": 0.03254294395446777,
425
+ "BFComputeCutting": 0.003969907760620117,
426
+ "BirCodeGenLoop": 0.06339025497436523,
427
+ "CCOpFusion": 0.029911041259765625,
428
+ "CanonicalizeDAGForPGTiling": 0.003092050552368164,
429
+ "CanonicalizeIR": 0.002637147903442383,
430
+ "CoalesceCCOp": 0.0051479339599609375,
431
+ "CommuteConcat": 0.001478433609008789,
432
+ "DMALocalityOpt": 0.0016412734985351563,
433
+ "DMAProfiler": 0.004613637924194336,
434
+ "DMATilingProfiler": 0.004850864410400391,
435
+ "DataLocalityOpt": 0.11357831954956055,
436
+ "DataStreaming": 0.0061092376708984375,
437
+ "DeConcat": 0.0013332366943359375,
438
+ "DeadCodeElimination": 0.0018727779388427734,
439
+ "DeadStoreElimination": 0.03094482421875,
440
+ "DelinearIndices": 0.008640289306640625,
441
+ "Delinearization": 0.0035429000854492188,
442
+ "DoNothing": 8.106231689453125e-05,
443
+ "DramToDramTranspose": 0.03549051284790039,
444
+ "DumpGraphAndMetadata": 0.005577564239501953,
445
+ "EliminateDivs": 0.003966331481933594,
446
+ "ExpandBatchNorm": 0.0017447471618652344,
447
+ "ExpandISAMacro": 0.002687692642211914,
448
+ "FactorizeBlkDims": 0.026469945907592773,
449
+ "FactorizeThreadAxesInFreeDims": 0.0014863014221191406,
450
+ "FlattenMacroLoop": 0.00392913818359375,
451
+ "GenericAccessSimplifier": 0.0018973350524902344,
452
+ "InferInitValue": 0.03517007827758789,
453
+ "InferIntrinsicOnCC": 0.010237932205200195,
454
+ "InferNeuronTensor": 0.051462411880493164,
455
+ "InferNonlocalTensors": 0.14991235733032227,
456
+ "InferPSumTensor": 0.053685903549194336,
457
+ "InlineNativeKernels": 0.002433300018310547,
458
+ "InsertIOTransposes": 0.015550613403320313,
459
+ "InsertLocalTransposes": 0.007843017578125,
460
+ "InsertOffloadedTransposes": 0.002854585647583008,
461
+ "LICM": 0.003381490707397461,
462
+ "LateLegalizeInst": 0.0069310665130615234,
463
+ "LateLegalizePostSplit": 0.00308990478515625,
464
+ "LateLowerReshapeOp": 0.0017940998077392578,
465
+ "LateLowerTensorOp": 0.005001068115234375,
466
+ "LateNeuronInstComb": 0.016704320907592773,
467
+ "LayoutPreprocessing": 0.033296823501586914,
468
+ "LayoutPreprocessingAndAnalysis": 0.12302517890930176,
469
+ "LayoutRequirementAnalysis": 0.007364988327026367,
470
+ "LegalizeCCOpLayout": 0.0029296875,
471
+ "LegalizeOpLevelAlias": 0.0016987323760986328,
472
+ "LegalizePartitionReduce": 0.0014727115631103516,
473
+ "LegalizeSundaAccess": 0.04025077819824219,
474
+ "LegalizeSundaMacro": 0.009906291961669922,
475
+ "LegalizeType": 0.004493236541748047,
476
+ "LocalLayoutOpt": 0.017308473587036133,
477
+ "LoopFusion": 0.005831241607666016,
478
+ "LoopSplitting": 0.00037789344787597656,
479
+ "LowerBroadcast": 0.0016851425170898438,
480
+ "LowerCCOpBlockAxis": 0.005655765533447266,
481
+ "LowerComplexBroadcast": 0.0020987987518310547,
482
+ "LowerIntrinsics": 0.040236473083496094,
483
+ "LowerTensorOp": 0.012641191482543945,
484
+ "LowerTranspose": 0.0125579833984375,
485
+ "MacroGeneration": 0.08074021339416504,
486
+ "MaskPropagation": 0.005038022994995117,
487
+ "MemcpyElimination": 0.10875082015991211,
488
+ "MutateDataType": 0.0013315677642822266,
489
+ "NeuronAliasDependencyInduction": 0.00025200843811035156,
490
+ "NeuronAliasDependencyReset": 0.021958112716674805,
491
+ "NeuronInstComb": 0.009703636169433594,
492
+ "NeuronLICM": 0.011526823043823242,
493
+ "NeuronLoopFusion": 0.017663955688476563,
494
+ "NeuronLoopInterchange": 0.002567291259765625,
495
+ "NeuronSimplifier": 0.011670589447021484,
496
+ "NeuronSimplifyPredicates": 0.017385244369506836,
497
+ "NeuronValueNumbering": 0.004181623458862305,
498
+ "OptimizeAliasedCopyChain": 0.0017867088317871094,
499
+ "OptimizeNKIKernels": 0.0020456314086914063,
500
+ "PAGLayoutOpt": 0.3681519031524658,
501
+ "PComputeCutting": 0.008620262145996094,
502
+ "PGLayoutTilingPipeline": 1.3210320472717285,
503
+ "PGTiling": 0.27039527893066406,
504
+ "PadElimination": 0.0003745555877685547,
505
+ "ParAxesAnnotation": 0.33005595207214355,
506
+ "PartialLoopFusion": 0.026912212371826172,
507
+ "PartialSimdFusion": 0.03544425964355469,
508
+ "PerfectLoopNest": 0.0021703243255615234,
509
+ "RecognizeOpIdiom": 0.004334926605224609,
510
+ "Recompute": 0.0002522468566894531,
511
+ "RelaxPredicates": 0.004270076751708984,
512
+ "Rematerialization": 0.005487918853759766,
513
+ "ReshapeWeights": 0.0006825923919677734,
514
+ "ResolveAccessConflict": 0.003779888153076172,
515
+ "ResolveComplicatePredicates": 0.0018131732940673828,
516
+ "RewriteReplicationMatmul": 0.002633333206176758,
517
+ "RewriteWeights": 0.0036499500274658203,
518
+ "SFKVectorizer": 0.2772994041442871,
519
+ "SimpleAllReduceTiling": 0.002454519271850586,
520
+ "Simplifier": 0.0045070648193359375,
521
+ "SimplifyMacroPredicates": 0.016190290451049805,
522
+ "SimplifyNeuronTensor": 0.01452183723449707,
523
+ "SimplifySlice": 0.0010039806365966797,
524
+ "SimplifyTensor": 0.00657200813293457,
525
+ "SpillPSum": 0.02208685874938965,
526
+ "SplitAPUnionSets": 0.04095458984375,
527
+ "SplitAccGrp": 0.0018160343170166016,
528
+ "StaticProfiler": 0.004816770553588867,
529
+ "StaticTransposeLocalTensor": 0.004886150360107422,
530
+ "SundaISel": 0.04611611366271973,
531
+ "TCTransform": 0.001667022705078125,
532
+ "TensorInitialization": 0.022374629974365234,
533
+ "TensorOpSimplifier": 0.006697177886962891,
534
+ "TensorOpTransform": 0.02793574333190918,
535
+ "TileCCOps": 0.007641792297363281,
536
+ "TilingProfiler": 0.015750885009765625,
537
+ "TransformConvOp": 0.0026845932006835938,
538
+ "TritiumFusion": 0.08186149597167969,
539
+ "ValueNumbering": 0.0026755332946777344,
540
+ "VectorizeDMA": 0.007223367691040039,
541
+ "VectorizeMatMult": 0.018305540084838867,
542
+ "WeightCoalescing": 0.003328561782836914,
543
+ "ZeroSizeTensorElimination": 0.00011229515075683594
544
+ },
545
+ "tensorizer": {
546
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 5862.0,
547
+ "StaticProfiler::AifUb": 88.59026336669922,
548
+ "StaticProfiler::ArithmeticIntensityTensorizer": 582.7418823242188,
549
+ "StaticProfiler::AverageDmaLength": 2248.2685546875,
550
+ "StaticProfiler::AverageFractalPeUtilization": 99.96076202392578,
551
+ "StaticProfiler::AveragePartitionUtilization": 99.90216827392578,
552
+ "StaticProfiler::AveragePeUtilization": 99.8394546508789,
553
+ "StaticProfiler::DDRTransferBytes": 104424704.0,
554
+ "StaticProfiler::InternalTransferBytes": 122421248.0,
555
+ "StaticProfiler::LoadExpanded": 25346.0,
556
+ "StaticProfiler::LocalizationEfficiency": 657.7944946289063,
557
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 915.0787353515625,
558
+ "StaticProfiler::StoreExpanded": 10753.0,
559
+ "StaticProfiler::TotalDMAExpanded": 36099.0,
560
+ "StaticProfiler::TotalDynamicInstancesCount": 8866.0,
561
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 8860.0,
562
+ "StaticProfiler::TotalLNCComm": 0.0,
563
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
564
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
565
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
566
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
567
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
568
+ "TilingProfiler::GenericInstructionsAfterTiling": 96.0,
569
+ "TilingProfiler::MatMultInstructionsAfterTiling": 3080.0,
570
+ "TilingProfiler::NumPfTransposes": 8.0,
571
+ "TilingProfiler::NumPfTransposesForIo": 1.0,
572
+ "TilingProfiler::NumPfTransposesForLocal": 6.0,
573
+ "TilingProfiler::NumPfTransposesForNonlocal": 1.0,
574
+ "TilingProfiler::PfTransposeInstructions": 1760.0,
575
+ "TilingProfiler::PfTransposeInstructionsForIo": 256.0,
576
+ "TilingProfiler::PfTransposeInstructionsForLocal": 1376.0,
577
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 128.0,
578
+ "TilingProfiler::ReduceInstructionsAfterTiling": 0.0,
579
+ "TilingProfiler::SimdInstructionsAfterTiling": 649.0,
580
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
581
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
582
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
583
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
584
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
585
+ "TransformConvOp::conv2d_column_packing": 0.0,
586
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
587
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
588
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
589
+ }
590
+ },
591
+ "sg0001": {
592
+ "compiletime": {
593
+ "AGOrderingAnalysisPass": 0.03383040428161621,
594
+ "AffinePredicateResolution": 0.0015320777893066406,
595
+ "AliasDependencyElimination": 0.0001316070556640625,
596
+ "AliasDependencyInduction": 0.00819253921508789,
597
+ "AliasDependencyReset": 0.02862405776977539,
598
+ "BFComputeCutting": 0.004217624664306641,
599
+ "BirCodeGenLoop": 0.0443270206451416,
600
+ "CCOpFusion": 0.04336118698120117,
601
+ "CanonicalizeDAGForPGTiling": 0.0031616687774658203,
602
+ "CanonicalizeIR": 0.0021500587463378906,
603
+ "CoalesceCCOp": 0.005389690399169922,
604
+ "CommuteConcat": 0.0024237632751464844,
605
+ "DMALocalityOpt": 0.002274751663208008,
606
+ "DMAProfiler": 0.003973484039306641,
607
+ "DMATilingProfiler": 0.005924701690673828,
608
+ "DataLocalityOpt": 0.15027260780334473,
609
+ "DataStreaming": 0.004762887954711914,
610
+ "DeConcat": 0.0018739700317382813,
611
+ "DeadCodeElimination": 0.001882314682006836,
612
+ "DeadStoreElimination": 0.03486776351928711,
613
+ "DelinearIndices": 0.009628534317016602,
614
+ "Delinearization": 0.0037381649017333984,
615
+ "DoNothing": 6.985664367675781e-05,
616
+ "DramToDramTranspose": 0.04212188720703125,
617
+ "DumpGraphAndMetadata": 0.004312038421630859,
618
+ "EliminateDivs": 0.005432844161987305,
619
+ "ExpandBatchNorm": 0.002119302749633789,
620
+ "ExpandISAMacro": 0.0024309158325195313,
621
+ "FactorizeBlkDims": 0.02235579490661621,
622
+ "FactorizeThreadAxesInFreeDims": 0.0018169879913330078,
623
+ "FlattenMacroLoop": 0.0030968189239501953,
624
+ "GenericAccessSimplifier": 0.0016777515411376953,
625
+ "InferInitValue": 0.043079376220703125,
626
+ "InferIntrinsicOnCC": 0.009890556335449219,
627
+ "InferNeuronTensor": 0.05600404739379883,
628
+ "InferNonlocalTensors": 0.03101515769958496,
629
+ "InferPSumTensor": 0.04645681381225586,
630
+ "InlineNativeKernels": 0.0015399456024169922,
631
+ "InsertIOTransposes": 0.02417731285095215,
632
+ "InsertLocalTransposes": 0.0070497989654541016,
633
+ "InsertOffloadedTransposes": 0.003525972366333008,
634
+ "LICM": 0.0035805702209472656,
635
+ "LateLegalizeInst": 0.0041539669036865234,
636
+ "LateLegalizePostSplit": 0.0027403831481933594,
637
+ "LateLowerReshapeOp": 0.0014560222625732422,
638
+ "LateLowerTensorOp": 0.004617452621459961,
639
+ "LateNeuronInstComb": 0.015344619750976563,
640
+ "LayoutPreprocessing": 0.030884981155395508,
641
+ "LayoutPreprocessingAndAnalysis": 0.06435275077819824,
642
+ "LayoutRequirementAnalysis": 0.007463693618774414,
643
+ "LegalizeCCOpLayout": 0.002064943313598633,
644
+ "LegalizeOpLevelAlias": 0.0011925697326660156,
645
+ "LegalizePartitionReduce": 0.0026116371154785156,
646
+ "LegalizeSundaAccess": 0.015822887420654297,
647
+ "LegalizeSundaMacro": 0.012560844421386719,
648
+ "LegalizeType": 0.004744291305541992,
649
+ "LocalLayoutOpt": 0.023772239685058594,
650
+ "LoopFusion": 0.0066835880279541016,
651
+ "LoopSplitting": 0.0003638267517089844,
652
+ "LowerBroadcast": 0.002238750457763672,
653
+ "LowerCCOpBlockAxis": 0.005678653717041016,
654
+ "LowerComplexBroadcast": 0.0019271373748779297,
655
+ "LowerIntrinsics": 0.042801856994628906,
656
+ "LowerTensorOp": 0.012106895446777344,
657
+ "LowerTranspose": 0.012960433959960938,
658
+ "MacroGeneration": 0.12800955772399902,
659
+ "MaskPropagation": 0.0031516551971435547,
660
+ "MemcpyElimination": 0.10379505157470703,
661
+ "MutateDataType": 0.0014393329620361328,
662
+ "NeuronAliasDependencyInduction": 0.00022101402282714844,
663
+ "NeuronAliasDependencyReset": 0.020102262496948242,
664
+ "NeuronInstComb": 0.009283781051635742,
665
+ "NeuronLICM": 0.009867429733276367,
666
+ "NeuronLoopFusion": 0.022713661193847656,
667
+ "NeuronLoopInterchange": 0.002709627151489258,
668
+ "NeuronSimplifier": 0.01328134536743164,
669
+ "NeuronSimplifyPredicates": 0.001683950424194336,
670
+ "NeuronValueNumbering": 0.0033235549926757813,
671
+ "OptimizeAliasedCopyChain": 0.0007724761962890625,
672
+ "OptimizeNKIKernels": 0.001729726791381836,
673
+ "PAGLayoutOpt": 0.13172507286071777,
674
+ "PComputeCutting": 0.007474422454833984,
675
+ "PGLayoutTilingPipeline": 0.9329550266265869,
676
+ "PGTiling": 0.4518747329711914,
677
+ "PadElimination": 0.00040411949157714844,
678
+ "ParAxesAnnotation": 0.0915369987487793,
679
+ "PartialLoopFusion": 0.020573854446411133,
680
+ "PartialSimdFusion": 0.04284977912902832,
681
+ "PerfectLoopNest": 0.002377033233642578,
682
+ "RecognizeOpIdiom": 0.0049991607666015625,
683
+ "Recompute": 0.00026345252990722656,
684
+ "RelaxPredicates": 0.0034220218658447266,
685
+ "Rematerialization": 0.0021615028381347656,
686
+ "ReshapeWeights": 0.0007557868957519531,
687
+ "ResolveAccessConflict": 0.004181861877441406,
688
+ "ResolveComplicatePredicates": 0.0015151500701904297,
689
+ "RewriteReplicationMatmul": 0.0020759105682373047,
690
+ "RewriteWeights": 0.0036649703979492188,
691
+ "SFKVectorizer": 0.20148277282714844,
692
+ "SimpleAllReduceTiling": 0.003732442855834961,
693
+ "Simplifier": 0.004697084426879883,
694
+ "SimplifyMacroPredicates": 0.007361888885498047,
695
+ "SimplifyNeuronTensor": 0.009825944900512695,
696
+ "SimplifySlice": 0.0017888545989990234,
697
+ "SimplifyTensor": 0.006832122802734375,
698
+ "SpillPSum": 0.022799968719482422,
699
+ "SplitAPUnionSets": 0.020108938217163086,
700
+ "SplitAccGrp": 0.0015766620635986328,
701
+ "StaticProfiler": 0.004146099090576172,
702
+ "StaticTransposeLocalTensor": 0.004926919937133789,
703
+ "SundaISel": 0.04472494125366211,
704
+ "TCTransform": 0.0018138885498046875,
705
+ "TensorInitialization": 0.004791736602783203,
706
+ "TensorOpSimplifier": 0.0064849853515625,
707
+ "TensorOpTransform": 0.0333099365234375,
708
+ "TileCCOps": 0.0056035518646240234,
709
+ "TilingProfiler": 0.01600933074951172,
710
+ "TransformConvOp": 0.002446413040161133,
711
+ "TritiumFusion": 0.1239166259765625,
712
+ "ValueNumbering": 0.0030901432037353516,
713
+ "VectorizeDMA": 0.0017311573028564453,
714
+ "VectorizeMatMult": 0.018932580947875977,
715
+ "WeightCoalescing": 0.0027513504028320313,
716
+ "ZeroSizeTensorElimination": 0.00011587142944335938
717
+ },
718
+ "tensorizer": {
719
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 15811.0,
720
+ "StaticProfiler::AifUb": 934.4357299804688,
721
+ "StaticProfiler::ArithmeticIntensityTensorizer": 708.8487548828125,
722
+ "StaticProfiler::AverageDmaLength": 1109.3380126953125,
723
+ "StaticProfiler::AverageFractalPeUtilization": 100.0,
724
+ "StaticProfiler::AveragePartitionUtilization": 99.8372802734375,
725
+ "StaticProfiler::AveragePeUtilization": 100.0,
726
+ "StaticProfiler::DDRTransferBytes": 306283520.0,
727
+ "StaticProfiler::InternalTransferBytes": 104595456.0,
728
+ "StaticProfiler::LoadExpanded": 257536.0,
729
+ "StaticProfiler::LocalizationEfficiency": 75.85848236083984,
730
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 85.1915054321289,
731
+ "StaticProfiler::StoreExpanded": 10241.0,
732
+ "StaticProfiler::TotalDMAExpanded": 267777.0,
733
+ "StaticProfiler::TotalDynamicInstancesCount": 19667.0,
734
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 19667.0,
735
+ "StaticProfiler::TotalLNCComm": 0.0,
736
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
737
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
738
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
739
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
740
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
741
+ "TilingProfiler::GenericInstructionsAfterTiling": 64.0,
742
+ "TilingProfiler::MatMultInstructionsAfterTiling": 12288.0,
743
+ "TilingProfiler::NumPfTransposes": 9.0,
744
+ "TilingProfiler::NumPfTransposesForIo": 3.0,
745
+ "TilingProfiler::NumPfTransposesForLocal": 4.0,
746
+ "TilingProfiler::NumPfTransposesForNonlocal": 2.0,
747
+ "TilingProfiler::PfTransposeInstructions": 1904.0,
748
+ "TilingProfiler::PfTransposeInstructionsForIo": 272.0,
749
+ "TilingProfiler::PfTransposeInstructionsForLocal": 1120.0,
750
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 512.0,
751
+ "TilingProfiler::ReduceInstructionsAfterTiling": 0.0,
752
+ "TilingProfiler::SimdInstructionsAfterTiling": 704.0,
753
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
754
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
755
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
756
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
757
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
758
+ "TransformConvOp::conv2d_column_packing": 0.0,
759
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
760
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
761
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
762
+ }
763
+ },
764
+ "sg0002": {
765
+ "compiletime": {
766
+ "AGOrderingAnalysisPass": 0.01837611198425293,
767
+ "AffinePredicateResolution": 0.0011184215545654297,
768
+ "AliasDependencyElimination": 0.00015664100646972656,
769
+ "AliasDependencyInduction": 0.005170583724975586,
770
+ "AliasDependencyReset": 0.027508020401000977,
771
+ "BFComputeCutting": 0.0036101341247558594,
772
+ "BirCodeGenLoop": 0.4774467945098877,
773
+ "CCOpFusion": 0.033265113830566406,
774
+ "CanonicalizeDAGForPGTiling": 0.004282712936401367,
775
+ "CanonicalizeIR": 0.0024569034576416016,
776
+ "CoalesceCCOp": 0.013993978500366211,
777
+ "CommuteConcat": 0.0017316341400146484,
778
+ "DMALocalityOpt": 0.005455970764160156,
779
+ "DMAProfiler": 0.012103080749511719,
780
+ "DMATilingProfiler": 0.0037560462951660156,
781
+ "DataLocalityOpt": 0.07645320892333984,
782
+ "DataStreaming": 0.03701519966125488,
783
+ "DeConcat": 0.0018520355224609375,
784
+ "DeadCodeElimination": 0.0020148754119873047,
785
+ "DeadStoreElimination": 0.006912708282470703,
786
+ "DelinearIndices": 0.004647254943847656,
787
+ "Delinearization": 0.003908872604370117,
788
+ "DoNothing": 7.414817810058594e-05,
789
+ "DramToDramTranspose": 0.02015542984008789,
790
+ "DumpGraphAndMetadata": 0.08691883087158203,
791
+ "EliminateDivs": 0.0025060176849365234,
792
+ "ExpandBatchNorm": 0.0027189254760742188,
793
+ "ExpandISAMacro": 0.010967254638671875,
794
+ "FactorizeBlkDims": 0.009678840637207031,
795
+ "FactorizeThreadAxesInFreeDims": 0.0023202896118164063,
796
+ "FlattenMacroLoop": 0.00232696533203125,
797
+ "GenericAccessSimplifier": 0.0008094310760498047,
798
+ "InferInitValue": 0.02833867073059082,
799
+ "InferIntrinsicOnCC": 0.008923768997192383,
800
+ "InferNeuronTensor": 0.025766372680664063,
801
+ "InferNonlocalTensors": 0.014599800109863281,
802
+ "InferPSumTensor": 0.28374218940734863,
803
+ "InlineNativeKernels": 0.00860905647277832,
804
+ "InsertIOTransposes": 0.01989889144897461,
805
+ "InsertLocalTransposes": 0.004229307174682617,
806
+ "InsertOffloadedTransposes": 0.0029871463775634766,
807
+ "LICM": 0.0030870437622070313,
808
+ "LateLegalizeInst": 0.01364445686340332,
809
+ "LateLegalizePostSplit": 0.014872312545776367,
810
+ "LateLowerReshapeOp": 0.0010464191436767578,
811
+ "LateLowerTensorOp": 0.002707242965698242,
812
+ "LateNeuronInstComb": 0.01008152961730957,
813
+ "LayoutPreprocessing": 0.026853561401367188,
814
+ "LayoutPreprocessingAndAnalysis": 0.0556035041809082,
815
+ "LayoutRequirementAnalysis": 0.004946470260620117,
816
+ "LegalizeCCOpLayout": 0.0025353431701660156,
817
+ "LegalizeOpLevelAlias": 0.0018966197967529297,
818
+ "LegalizePartitionReduce": 0.0017490386962890625,
819
+ "LegalizeSundaAccess": 0.0763850212097168,
820
+ "LegalizeSundaMacro": 0.012125253677368164,
821
+ "LegalizeType": 0.012414693832397461,
822
+ "LocalLayoutOpt": 0.013860225677490234,
823
+ "LoopFusion": 0.005201578140258789,
824
+ "LoopSplitting": 0.0003204345703125,
825
+ "LowerBroadcast": 0.0018322467803955078,
826
+ "LowerCCOpBlockAxis": 0.0040171146392822266,
827
+ "LowerComplexBroadcast": 0.002280712127685547,
828
+ "LowerIntrinsics": 0.3141806125640869,
829
+ "LowerTensorOp": 0.01141357421875,
830
+ "LowerTranspose": 0.012679815292358398,
831
+ "MacroGeneration": 0.034410953521728516,
832
+ "MaskPropagation": 0.0028192996978759766,
833
+ "MemcpyElimination": 0.02788853645324707,
834
+ "MutateDataType": 0.0012311935424804688,
835
+ "NeuronAliasDependencyInduction": 0.0001773834228515625,
836
+ "NeuronAliasDependencyReset": 0.024976015090942383,
837
+ "NeuronInstComb": 0.004675865173339844,
838
+ "NeuronLICM": 0.03631091117858887,
839
+ "NeuronLoopFusion": 0.008457422256469727,
840
+ "NeuronLoopInterchange": 0.001413106918334961,
841
+ "NeuronSimplifier": 0.007856369018554688,
842
+ "NeuronSimplifyPredicates": 0.11957573890686035,
843
+ "NeuronValueNumbering": 0.004334449768066406,
844
+ "OptimizeAliasedCopyChain": 0.0006341934204101563,
845
+ "OptimizeNKIKernels": 0.38834357261657715,
846
+ "PAGLayoutOpt": 0.0889735221862793,
847
+ "PComputeCutting": 0.005109071731567383,
848
+ "PGLayoutTilingPipeline": 0.6248171329498291,
849
+ "PGTiling": 0.1645822525024414,
850
+ "PadElimination": 0.0003485679626464844,
851
+ "ParAxesAnnotation": 0.05196070671081543,
852
+ "PartialLoopFusion": 0.011112451553344727,
853
+ "PartialSimdFusion": 0.012138128280639648,
854
+ "PerfectLoopNest": 0.002288341522216797,
855
+ "RecognizeOpIdiom": 0.0041277408599853516,
856
+ "Recompute": 0.00026416778564453125,
857
+ "RelaxPredicates": 0.01356959342956543,
858
+ "Rematerialization": 0.0024864673614501953,
859
+ "ReshapeWeights": 0.0007522106170654297,
860
+ "ResolveAccessConflict": 0.0048482418060302734,
861
+ "ResolveComplicatePredicates": 0.0015094280242919922,
862
+ "RewriteReplicationMatmul": 0.0015668869018554688,
863
+ "RewriteWeights": 0.0027174949645996094,
864
+ "SFKVectorizer": 0.27501797676086426,
865
+ "SimpleAllReduceTiling": 0.009322166442871094,
866
+ "Simplifier": 0.003630399703979492,
867
+ "SimplifyMacroPredicates": 0.011396646499633789,
868
+ "SimplifyNeuronTensor": 1.0555970668792725,
869
+ "SimplifySlice": 0.0023348331451416016,
870
+ "SimplifyTensor": 0.005601167678833008,
871
+ "SpillPSum": 0.013073921203613281,
872
+ "SplitAPUnionSets": 0.11336159706115723,
873
+ "SplitAccGrp": 0.001394510269165039,
874
+ "StaticProfiler": 0.014252662658691406,
875
+ "StaticTransposeLocalTensor": 0.003930330276489258,
876
+ "SundaISel": 0.04436635971069336,
877
+ "TCTransform": 0.0008757114410400391,
878
+ "TensorInitialization": 0.01558232307434082,
879
+ "TensorOpSimplifier": 0.004608869552612305,
880
+ "TensorOpTransform": 0.01923346519470215,
881
+ "TileCCOps": 0.005507707595825195,
882
+ "TilingProfiler": 0.007405757904052734,
883
+ "TransformConvOp": 0.0030219554901123047,
884
+ "TritiumFusion": 0.05425119400024414,
885
+ "ValueNumbering": 0.0020017623901367188,
886
+ "VectorizeDMA": 0.002228975296020508,
887
+ "VectorizeMatMult": 0.006806135177612305,
888
+ "WeightCoalescing": 0.008460044860839844,
889
+ "ZeroSizeTensorElimination": 0.00014281272888183594
890
+ },
891
+ "tensorizer": {
892
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 49538.0,
893
+ "StaticProfiler::AifUb": 304.240234375,
894
+ "StaticProfiler::ArithmeticIntensityTensorizer": 278.67474365234375,
895
+ "StaticProfiler::AverageDmaLength": 1974.1033935546875,
896
+ "StaticProfiler::AverageFractalPeUtilization": 99.7004623413086,
897
+ "StaticProfiler::AveragePartitionUtilization": 97.94140625,
898
+ "StaticProfiler::AveragePeUtilization": 98.78884887695313,
899
+ "StaticProfiler::DDRTransferBytes": 862646080.0,
900
+ "StaticProfiler::InternalTransferBytes": 669456896.0,
901
+ "StaticProfiler::LoadExpanded": 390679.0,
902
+ "StaticProfiler::LocalizationEfficiency": 91.59693145751953,
903
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 95.863037109375,
904
+ "StaticProfiler::StoreExpanded": 7261.0,
905
+ "StaticProfiler::TotalDMAExpanded": 397940.0,
906
+ "StaticProfiler::TotalDynamicInstancesCount": 59578.0,
907
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 59132.0,
908
+ "StaticProfiler::TotalLNCComm": 0.0,
909
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
910
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
911
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
912
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
913
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
914
+ "TilingProfiler::GenericInstructionsAfterTiling": 4.0,
915
+ "TilingProfiler::MatMultInstructionsAfterTiling": 28224.0,
916
+ "TilingProfiler::NumPfTransposes": 5.0,
917
+ "TilingProfiler::NumPfTransposesForIo": 1.0,
918
+ "TilingProfiler::NumPfTransposesForLocal": 1.0,
919
+ "TilingProfiler::NumPfTransposesForNonlocal": 3.0,
920
+ "TilingProfiler::PfTransposeInstructions": 19777.0,
921
+ "TilingProfiler::PfTransposeInstructionsForIo": 19008.0,
922
+ "TilingProfiler::PfTransposeInstructionsForLocal": 1.0,
923
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 768.0,
924
+ "TilingProfiler::ReduceInstructionsAfterTiling": 6.0,
925
+ "TilingProfiler::SimdInstructionsAfterTiling": 303.0,
926
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
927
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
928
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
929
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
930
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
931
+ "TransformConvOp::conv2d_column_packing": 0.0,
932
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
933
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
934
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
935
+ }
936
+ },
937
+ "sg01": {
938
+ "compiletime": {
939
+ "CanonicalizeConv": 2.2000000171829015e-05,
940
+ "CanonicalizeForTensorizer": 1.4999999621068127e-05,
941
+ "Canonicalizer": 0.00025499999173916876,
942
+ "HoistCompute": 1.9999999949504854e-06,
943
+ "IdentifyCrossPassTensors": 2.499999936844688e-05,
944
+ "MemcastMotion": 7.000000096013537e-06,
945
+ "PenguinizeFunctions": 1.4999999621068127e-05,
946
+ "PruneFunctions": 1.8999999156221747e-05,
947
+ "RemoveOptimizationBarriers": 2.700000004551839e-05,
948
+ "ScatterMotion": 1.9999999494757503e-05,
949
+ "TensorizerLegalizationPass": 1.9999999494757503e-05,
950
+ "VerifySupportedOps": 1.1000000085914508e-05,
951
+ "algsimp": 6.299999949987978e-05,
952
+ "batchnorm_expander": 1.4000000192027073e-05,
953
+ "boundary-marker-removal": 4.999999873689376e-06,
954
+ "call-inliner": 9.000000318337698e-06,
955
+ "canonicalize-boundary-marker": 6.000000212225132e-06,
956
+ "collective-stream-id-checker": 4.999999873689376e-06,
957
+ "comparison-expander": 4.999999873689376e-06,
958
+ "computation-deduplicator": 2.5999999706982635e-05,
959
+ "conditional-to-select": 4.999999873689376e-06,
960
+ "config-lowering": 2.2000000171829015e-05,
961
+ "constant_folding": 9.000000318337698e-06,
962
+ "cse": 1.2000000424450263e-05,
963
+ "dce": 9.999999974752427e-07,
964
+ "dynamic-slice-transpose": 3.999999989900971e-06,
965
+ "eliminate-redundant-compare": 3.999999989900971e-06,
966
+ "emit-offloaded-dropout": 1.2999999853491317e-05,
967
+ "flatten-call-graph": 9.000000318337698e-06,
968
+ "fuse-send-recv": 2.9999999242136255e-05,
969
+ "hilo::LegalizeAlias": 4.999999873689376e-06,
970
+ "hilo::NeuronInstCombine": 3.600000127335079e-05,
971
+ "hilo::NeuronOpFusion": 1.4000000192027073e-05,
972
+ "hilo::ReplaceTokenTypeWithU8Pass": 2.099999983329326e-05,
973
+ "hilo::ScheduleFusion": 9.999999974752427e-07,
974
+ "hilo::SixtyFourHack": 1.4000000192027073e-05,
975
+ "hilo::VerifyAliasing": 1.9999999949504854e-06,
976
+ "hlo-mac-count": 4.600000102072954e-05,
977
+ "hlo-verifier": 0.00023299999884329736,
978
+ "legalize-ccops": 9.999999974752427e-07,
979
+ "legalize-compare": 3.999999989900971e-06,
980
+ "lower-argminmax-custom-call": 3.999999989900971e-06,
981
+ "map-inline": 1.1000000085914508e-05,
982
+ "metadata-naming": 2.700000004551839e-05,
983
+ "mlir::detail::OpToOpPassAdaptor": 0.00017299999308306724,
984
+ "mlir::hlo::MhloToPyPenguin": 0.0009840000420808792,
985
+ "mlir::mhlo::LowerComplexExtraPass": 9.600000339560211e-05,
986
+ "mlir::mhlo::LowerComplexPass": 0.00013600000238511711,
987
+ "native-to-custom-softmax": 6.000000212225132e-06,
988
+ "native-to-custom-softmax-dx": 2.2000000171829015e-05,
989
+ "operand_upcaster": 2.4000000848900527e-05,
990
+ "post-par-pipe-begin": 3.000000106112566e-06,
991
+ "post-par-pipe-end": 0.0,
992
+ "post-partition-simplification": 0.0005660000024363399,
993
+ "replace-minimum-constant": 6.000000212225132e-06,
994
+ "reshape-mover": 3.000000106112566e-06,
995
+ "simplify-concat": 4.8999998398358e-05,
996
+ "simplify-while-loops": 1.9999999949504854e-06,
997
+ "transform-variadic-reduce": 9.000000318337698e-06,
998
+ "tuple-simplifier": 4.999999873689376e-06,
999
+ "unpack-nested-aws-ntwsr": 3.999999989900971e-06,
1000
+ "unroll-while-loop": 0.0
1001
+ },
1002
+ "hilo": {
1003
+ "ArithmeticIntensity": 834.6854858398438,
1004
+ "HloMacCount": 103079215104.0,
1005
+ "Traffic": 246989344.0
1006
+ }
1007
+ },
1008
+ "sg02": {
1009
+ "compiletime": {
1010
+ "CanonicalizeConv": 0.0,
1011
+ "CanonicalizeForTensorizer": 1.4000000192027073e-05,
1012
+ "Canonicalizer": 0.0003129999968223274,
1013
+ "HoistCompute": 9.999999974752427e-07,
1014
+ "IdentifyCrossPassTensors": 1.2000000424450263e-05,
1015
+ "MemcastMotion": 0.0,
1016
+ "PenguinizeFunctions": 9.999999747378752e-06,
1017
+ "PruneFunctions": 7.999999979801942e-06,
1018
+ "RemoveOptimizationBarriers": 9.000000318337698e-06,
1019
+ "ScatterMotion": 3.000000106112566e-06,
1020
+ "TensorizerLegalizationPass": 6.000000212225132e-06,
1021
+ "VerifySupportedOps": 1.2000000424450263e-05,
1022
+ "algsimp": 5.999999848427251e-05,
1023
+ "batchnorm_expander": 1.2999999853491317e-05,
1024
+ "boundary-marker-removal": 3.999999989900971e-06,
1025
+ "call-inliner": 9.999999747378752e-06,
1026
+ "canonicalize-boundary-marker": 4.999999873689376e-06,
1027
+ "collective-stream-id-checker": 3.999999989900971e-06,
1028
+ "comparison-expander": 4.999999873689376e-06,
1029
+ "computation-deduplicator": 2.5999999706982635e-05,
1030
+ "conditional-to-select": 7.000000096013537e-06,
1031
+ "config-lowering": 2.5999999706982635e-05,
1032
+ "constant_folding": 9.000000318337698e-06,
1033
+ "cse": 1.2000000424450263e-05,
1034
+ "dce": 9.999999974752427e-07,
1035
+ "dynamic-slice-transpose": 3.999999989900971e-06,
1036
+ "eliminate-redundant-compare": 3.000000106112566e-06,
1037
+ "emit-offloaded-dropout": 1.2999999853491317e-05,
1038
+ "flatten-call-graph": 1.2000000424450263e-05,
1039
+ "fuse-send-recv": 1.9999999494757503e-05,
1040
+ "hilo::LegalizeAlias": 1.9999999949504854e-06,
1041
+ "hilo::NeuronInstCombine": 6.800000119255856e-05,
1042
+ "hilo::NeuronOpFusion": 3.999999989900971e-06,
1043
+ "hilo::ReplaceTokenTypeWithU8Pass": 2.2000000171829015e-05,
1044
+ "hilo::ScheduleFusion": 0.00015900000289548188,
1045
+ "hilo::SixtyFourHack": 3.9999998989515007e-05,
1046
+ "hilo::VerifyAliasing": 9.999999974752427e-07,
1047
+ "hlo-mac-count": 0.00017699999443721026,
1048
+ "hlo-verifier": 0.0001829999964684248,
1049
+ "legalize-ccops": 1.9999999949504854e-06,
1050
+ "legalize-compare": 3.000000106112566e-06,
1051
+ "lower-argminmax-custom-call": 3.000000106112566e-06,
1052
+ "map-inline": 1.2000000424450263e-05,
1053
+ "metadata-naming": 1.5999999959603883e-05,
1054
+ "mlir::detail::OpToOpPassAdaptor": 1.1000000085914508e-05,
1055
+ "mlir::hlo::MhloToPyPenguin": 0.0008900000248104334,
1056
+ "mlir::mhlo::LowerComplexExtraPass": 8.800000068731606e-05,
1057
+ "mlir::mhlo::LowerComplexPass": 0.00011999999696854502,
1058
+ "native-to-custom-softmax": 6.000000212225132e-06,
1059
+ "native-to-custom-softmax-dx": 2.4000000848900527e-05,
1060
+ "operand_upcaster": 1.5999999959603883e-05,
1061
+ "post-par-pipe-begin": 1.9999999949504854e-06,
1062
+ "post-par-pipe-end": 0.0,
1063
+ "post-partition-simplification": 0.0005510000046342611,
1064
+ "replace-minimum-constant": 9.000000318337698e-06,
1065
+ "reshape-mover": 3.000000106112566e-06,
1066
+ "simplify-concat": 4.400000034365803e-05,
1067
+ "simplify-while-loops": 1.9999999949504854e-06,
1068
+ "transform-variadic-reduce": 6.299999949987978e-05,
1069
+ "tuple-simplifier": 4.999999873689376e-06,
1070
+ "unpack-nested-aws-ntwsr": 3.999999989900971e-06,
1071
+ "unroll-while-loop": 0.0
1072
+ },
1073
+ "hilo": {
1074
+ "ArithmeticIntensity": 194.41075134277344,
1075
+ "HloMacCount": 77620576256.0,
1076
+ "Traffic": 798521408.0
1077
+ }
1078
+ }
1079
+ }
context_encoding_model/_tp0_bk3/graph.neff ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3602ab29177b01531c0dbdb62bc869556ef53a934ba98dd3bd846e75e171cc3a
3
+ size 2561024
context_encoding_model/_tp0_bk3/log-neuron-cc.txt ADDED
The diff for this file is too large to render. See raw diff
 
context_encoding_model/_tp0_bk3/metaneff.pb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ea141404110996ab61ca5ba70e86499e6c4390e0b31c1ef947cf95911917766
3
+ size 1816103
context_encoding_model/_tp0_bk3/model.MODULE_b3ddbc97e5f0d1d64c82+155de413.hlo_module.pb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9068f3ba4f55e1b8b35adde74efc6a9e617baa344783aaee62353f9181c3092c
3
+ size 1893189
context_encoding_model/_tp0_bk3/model.MODULE_b3ddbc97e5f0d1d64c82+155de413.neff ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3602ab29177b01531c0dbdb62bc869556ef53a934ba98dd3bd846e75e171cc3a
3
+ size 2561024
context_encoding_model/_tp0_bk3/neuron_config.json ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation_autoset": false,
3
+ "_name_or_path": "Qwen/Qwen3-8B",
4
+ "add_cross_attention": false,
5
+ "architectures": [
6
+ "Qwen3ForCausalLM"
7
+ ],
8
+ "attention_bias": false,
9
+ "attention_dropout": 0.0,
10
+ "attribute_map": {},
11
+ "bad_words_ids": null,
12
+ "begin_suppress_tokens": null,
13
+ "bos_token_id": 151643,
14
+ "chunk_size_feed_forward": 0,
15
+ "cross_attention_hidden_size": null,
16
+ "decoder_start_token_id": null,
17
+ "diversity_penalty": 0.0,
18
+ "do_sample": false,
19
+ "early_stopping": false,
20
+ "encoder_no_repeat_ngram_size": 0,
21
+ "eos_token_id": 151645,
22
+ "exponential_decay_length_penalty": null,
23
+ "finetuning_task": null,
24
+ "forced_bos_token_id": null,
25
+ "forced_eos_token_id": null,
26
+ "fused_spec_config": null,
27
+ "head_dim": 128,
28
+ "hidden_act": "silu",
29
+ "hidden_size": 4096,
30
+ "id2label": {
31
+ "0": "LABEL_0",
32
+ "1": "LABEL_1"
33
+ },
34
+ "initializer_range": 0.02,
35
+ "intermediate_size": 12288,
36
+ "is_decoder": false,
37
+ "is_encoder_decoder": false,
38
+ "label2id": {
39
+ "LABEL_0": 0,
40
+ "LABEL_1": 1
41
+ },
42
+ "length_penalty": 1.0,
43
+ "max_length": 20,
44
+ "max_position_embeddings": 40960,
45
+ "max_window_layers": 36,
46
+ "metadata": null,
47
+ "min_length": 0,
48
+ "model_type": "qwen3",
49
+ "neuron_config": {
50
+ "activation_quantization_type": null,
51
+ "allow_input_truncation": false,
52
+ "apply_seq_ids_mask": false,
53
+ "async_mode": false,
54
+ "attention_dp_degree": 1,
55
+ "attention_dtype": null,
56
+ "attn_block_cte_nki_kernel_enabled": false,
57
+ "attn_block_tkg_nki_kernel_cache_update": false,
58
+ "attn_block_tkg_nki_kernel_enabled": false,
59
+ "attn_cls": {
60
+ "__module__": "neuronx_distributed_inference.models.qwen3.modeling_qwen3",
61
+ "__name__": "NeuronQwen3Attention"
62
+ },
63
+ "attn_kernel_enabled": null,
64
+ "attn_tkg_builtin_kernel_enabled": false,
65
+ "attn_tkg_nki_kernel_enabled": false,
66
+ "batch_size": 1,
67
+ "bucket_n_active_tokens": true,
68
+ "buckets": [
69
+ 1024
70
+ ],
71
+ "cast_type": "config",
72
+ "cc_pipeline_tiling_factor": 2,
73
+ "chunked_prefill_config": null,
74
+ "context_encoding_buckets": [
75
+ 1024
76
+ ],
77
+ "cp_degree": 1,
78
+ "ctx_batch_size": 1,
79
+ "disable_kv_cache_tiling": false,
80
+ "draft_model_modules_to_not_convert": null,
81
+ "enable_bucketing": true,
82
+ "enable_eagle_draft_input_norm": false,
83
+ "enable_eagle_speculation": false,
84
+ "enable_fused_speculation": false,
85
+ "enable_long_context_mode": false,
86
+ "enable_output_completion_notifications": false,
87
+ "enable_spill_reload_dge": false,
88
+ "enable_token_tree": false,
89
+ "ep_degree": 1,
90
+ "expert_mlp_nki_kernel_enabled": null,
91
+ "flash_decoding_enabled": false,
92
+ "fused_qkv": false,
93
+ "fused_rmsnorm_skip_gamma": false,
94
+ "is_block_kv_layout": null,
95
+ "is_chunked_prefill": false,
96
+ "is_continuous_batching": true,
97
+ "is_eagle_draft": false,
98
+ "is_medusa": false,
99
+ "is_prefill_stage": true,
100
+ "is_prefix_caching": false,
101
+ "k_cache_transposed": false,
102
+ "kv_cache_batch_size": 1,
103
+ "kv_cache_padding_size": 0,
104
+ "kv_cache_quant": false,
105
+ "kv_cache_tiling": false,
106
+ "layer_boundary_markers": false,
107
+ "lm_head_pad": false,
108
+ "lm_head_pad_alignment_size": 1,
109
+ "local_ranks_size": 2,
110
+ "logical_nc_config": 1,
111
+ "lora_config": null,
112
+ "max_batch_size": 1,
113
+ "max_context_length": 1024,
114
+ "max_length": 1024,
115
+ "max_new_tokens": null,
116
+ "medusa_speculation_length": 0,
117
+ "medusa_tree": null,
118
+ "mlp_kernel_enabled": false,
119
+ "mlp_kernel_fuse_residual_add": false,
120
+ "modules_to_not_convert": null,
121
+ "moe_fused_nki_kernel_enabled": null,
122
+ "n_active_tokens": 1024,
123
+ "n_positions": 1024,
124
+ "num_medusa_heads": 0,
125
+ "on_cpu": false,
126
+ "on_device_sampling_config": {
127
+ "deterministic": false,
128
+ "do_sample": false,
129
+ "dynamic": true,
130
+ "global_topk": 256,
131
+ "on_device_sampling_config": true,
132
+ "temperature": 1.0,
133
+ "top_k": 1,
134
+ "top_k_kernel_enabled": false,
135
+ "top_p": 1.0
136
+ },
137
+ "output_logits": false,
138
+ "overrides_torch_dtype": true,
139
+ "pa_block_size": 1024,
140
+ "pa_num_blocks": 1,
141
+ "padding_side": "right",
142
+ "pp_degree": 1,
143
+ "prefix_buckets": null,
144
+ "qk_layernorm": false,
145
+ "qkv_kernel_enabled": false,
146
+ "qkv_kernel_fuse_residual_add": false,
147
+ "qkv_kernel_nbsd_layout": false,
148
+ "quantization_dtype": "int8",
149
+ "quantization_type": "per_tensor_symmetric",
150
+ "quantize_clamp_bound": Infinity,
151
+ "quantized": false,
152
+ "quantized_checkpoints_path": null,
153
+ "quantized_mlp_kernel_enabled": false,
154
+ "rmsnorm_quantize_kernel_enabled": false,
155
+ "router_topk_nki_kernel_enabled": null,
156
+ "rpl_reduce_dtype": null,
157
+ "save_sharded_checkpoint": true,
158
+ "scratchpad_page_size": null,
159
+ "seq_len": 1024,
160
+ "seq_len_threshold_for_cc_tiling": 16384,
161
+ "sequence_parallel_enabled": false,
162
+ "shared_mlp_nki_kernel_enabled": null,
163
+ "skip_sharding": false,
164
+ "skip_warmup": false,
165
+ "spec_batch_size": 1,
166
+ "speculation_length": 0,
167
+ "start_rank_id": 0,
168
+ "target": null,
169
+ "tile_cc": false,
170
+ "tkg_batch_size": 1,
171
+ "token_generation_buckets": null,
172
+ "token_tree_config": null,
173
+ "torch_dtype": "bfloat16",
174
+ "tp_degree": 2,
175
+ "vocab_parallel": false,
176
+ "weight_gather_seq_len_threshold": 32768,
177
+ "weights_to_skip_layout_optimization": [],
178
+ "world_size": 2
179
+ },
180
+ "no_repeat_ngram_size": 0,
181
+ "num_attention_heads": 32,
182
+ "num_beam_groups": 1,
183
+ "num_beams": 1,
184
+ "num_cores_per_group": 1,
185
+ "num_hidden_layers": 36,
186
+ "num_key_value_heads": 8,
187
+ "num_return_sequences": 1,
188
+ "output_attentions": false,
189
+ "output_hidden_states": false,
190
+ "output_scores": false,
191
+ "pad_token_id": 0,
192
+ "prefix": null,
193
+ "problem_type": null,
194
+ "pruned_heads": {},
195
+ "remove_invalid_values": false,
196
+ "repetition_penalty": 1.0,
197
+ "return_dict": true,
198
+ "return_dict_in_generate": false,
199
+ "rms_norm_eps": 1e-06,
200
+ "rope_scaling": null,
201
+ "rope_theta": 1000000,
202
+ "sep_token_id": null,
203
+ "sliding_window": null,
204
+ "suppress_tokens": null,
205
+ "task_specific_params": null,
206
+ "temperature": 1.0,
207
+ "tf_legacy_loss": false,
208
+ "tie_encoder_decoder": false,
209
+ "tie_word_embeddings": false,
210
+ "tokenizer_class": null,
211
+ "top_k": 50,
212
+ "top_p": 1.0,
213
+ "torchscript": false,
214
+ "transformers_version": "4.51.0",
215
+ "typical_p": 1.0,
216
+ "use_bfloat16": false,
217
+ "use_cache": true,
218
+ "use_sliding_window": false,
219
+ "vocab_size": 151936
220
+ }
layout_opt/command.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ neuronx-cc compile graph.hlo --framework XLA --target trn1 --output graph.neff --model-type=transformer -O1 --lnc=1 '--internal-hlo2tensorizer-options=--experimental-unsafe-fp8e4m3fn-as-fp8e4m3 --verify-hlo=false' --logfile=log-neuron-cc.txt --verbose=35
layout_opt/graph.neff ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eafae43287bda2aa58740df223d211d8e3638af29e402c9cc6cbcadcf302ddde
3
+ size 5786624
layout_opt/log-neuron-cc.txt ADDED
The diff for this file is too large to render. See raw diff
 
layout_opt/metaneff ADDED
@@ -0,0 +1,1198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ (
3
+ input0�� �2embed_tokens.weight8
4
+ ;
5
+ input1� �2'layers.0.self_attn.o_proj.o_proj.weight8
6
+ =
7
+ input2�� 2)layers.0.self_attn.qkv_proj.v_proj.weight8
8
+ 1
9
+ input3� 2layers.0.input_layernorm.weight8
10
+ 7
11
+ input4�2%layers.0.self_attn.k_layernorm.weight8
12
+ =
13
+ input5�� 2)layers.0.self_attn.qkv_proj.k_proj.weight8
14
+ 7
15
+ input6�2%layers.0.self_attn.q_layernorm.weight8
16
+ =
17
+ input7�� 2)layers.0.self_attn.qkv_proj.q_proj.weight8
18
+ 1
19
+ input8� �02layers.0.mlp.down_proj.weight8
20
+ /
21
+ input9�0� 2layers.0.mlp.up_proj.weight8
22
+ ;
23
+ input10� 2(layers.0.post_attention_layernorm.weight8
24
+ 2
25
+ input11�0� 2layers.0.mlp.gate_proj.weight8
26
+ <
27
+ input12� �2'layers.1.self_attn.o_proj.o_proj.weight8
28
+ >
29
+ input13�� 2)layers.1.self_attn.qkv_proj.v_proj.weight8
30
+ 2
31
+ input14� 2layers.1.input_layernorm.weight8
32
+ 8
33
+ input15�2%layers.1.self_attn.k_layernorm.weight8
34
+ >
35
+ input16�� 2)layers.1.self_attn.qkv_proj.k_proj.weight8
36
+ 8
37
+ input17�2%layers.1.self_attn.q_layernorm.weight8
38
+ >
39
+ input18�� 2)layers.1.self_attn.qkv_proj.q_proj.weight8
40
+ 2
41
+ input19� �02layers.1.mlp.down_proj.weight8
42
+ 0
43
+ input20�0� 2layers.1.mlp.up_proj.weight8
44
+ ;
45
+ input21� 2(layers.1.post_attention_layernorm.weight8
46
+ 2
47
+ input22�0� 2layers.1.mlp.gate_proj.weight8
48
+ <
49
+ input23� �2'layers.2.self_attn.o_proj.o_proj.weight8
50
+ >
51
+ input24�� 2)layers.2.self_attn.qkv_proj.v_proj.weight8
52
+ 2
53
+ input25� 2layers.2.input_layernorm.weight8
54
+ 8
55
+ input26�2%layers.2.self_attn.k_layernorm.weight8
56
+ >
57
+ input27�� 2)layers.2.self_attn.qkv_proj.k_proj.weight8
58
+ 8
59
+ input28�2%layers.2.self_attn.q_layernorm.weight8
60
+ >
61
+ input29�� 2)layers.2.self_attn.qkv_proj.q_proj.weight8
62
+ 2
63
+ input30� �02layers.2.mlp.down_proj.weight8
64
+ 0
65
+ input31�0� 2layers.2.mlp.up_proj.weight8
66
+ ;
67
+ input32� 2(layers.2.post_attention_layernorm.weight8
68
+ 2
69
+ input33�0� 2layers.2.mlp.gate_proj.weight8
70
+ <
71
+ input34� �2'layers.3.self_attn.o_proj.o_proj.weight8
72
+ >
73
+ input35�� 2)layers.3.self_attn.qkv_proj.v_proj.weight8
74
+ 2
75
+ input36� 2layers.3.input_layernorm.weight8
76
+ 8
77
+ input37�2%layers.3.self_attn.k_layernorm.weight8
78
+ >
79
+ input38�� 2)layers.3.self_attn.qkv_proj.k_proj.weight8
80
+ 8
81
+ input39�2%layers.3.self_attn.q_layernorm.weight8
82
+ >
83
+ input40�� 2)layers.3.self_attn.qkv_proj.q_proj.weight8
84
+ 2
85
+ input41� �02layers.3.mlp.down_proj.weight8
86
+ 0
87
+ input42�0� 2layers.3.mlp.up_proj.weight8
88
+ ;
89
+ input43� 2(layers.3.post_attention_layernorm.weight8
90
+ 2
91
+ input44�0� 2layers.3.mlp.gate_proj.weight8
92
+ <
93
+ input45� �2'layers.4.self_attn.o_proj.o_proj.weight8
94
+ >
95
+ input46�� 2)layers.4.self_attn.qkv_proj.v_proj.weight8
96
+ 2
97
+ input47� 2layers.4.input_layernorm.weight8
98
+ 8
99
+ input48�2%layers.4.self_attn.k_layernorm.weight8
100
+ >
101
+ input49�� 2)layers.4.self_attn.qkv_proj.k_proj.weight8
102
+ 8
103
+ input50�2%layers.4.self_attn.q_layernorm.weight8
104
+ >
105
+ input51�� 2)layers.4.self_attn.qkv_proj.q_proj.weight8
106
+ 2
107
+ input52� �02layers.4.mlp.down_proj.weight8
108
+ 0
109
+ input53�0� 2layers.4.mlp.up_proj.weight8
110
+ ;
111
+ input54� 2(layers.4.post_attention_layernorm.weight8
112
+ 2
113
+ input55�0� 2layers.4.mlp.gate_proj.weight8
114
+ <
115
+ input56� �2'layers.5.self_attn.o_proj.o_proj.weight8
116
+ >
117
+ input57�� 2)layers.5.self_attn.qkv_proj.v_proj.weight8
118
+ 2
119
+ input58� 2layers.5.input_layernorm.weight8
120
+ 8
121
+ input59�2%layers.5.self_attn.k_layernorm.weight8
122
+ >
123
+ input60�� 2)layers.5.self_attn.qkv_proj.k_proj.weight8
124
+ 8
125
+ input61�2%layers.5.self_attn.q_layernorm.weight8
126
+ >
127
+ input62�� 2)layers.5.self_attn.qkv_proj.q_proj.weight8
128
+ 2
129
+ input63� �02layers.5.mlp.down_proj.weight8
130
+ 0
131
+ input64�0� 2layers.5.mlp.up_proj.weight8
132
+ ;
133
+ input65� 2(layers.5.post_attention_layernorm.weight8
134
+ 2
135
+ input66�0� 2layers.5.mlp.gate_proj.weight8
136
+ <
137
+ input67� �2'layers.6.self_attn.o_proj.o_proj.weight8
138
+ >
139
+ input68�� 2)layers.6.self_attn.qkv_proj.v_proj.weight8
140
+ 2
141
+ input69� 2layers.6.input_layernorm.weight8
142
+ 8
143
+ input70�2%layers.6.self_attn.k_layernorm.weight8
144
+ >
145
+ input71�� 2)layers.6.self_attn.qkv_proj.k_proj.weight8
146
+ 8
147
+ input72�2%layers.6.self_attn.q_layernorm.weight8
148
+ >
149
+ input73�� 2)layers.6.self_attn.qkv_proj.q_proj.weight8
150
+ 2
151
+ input74� �02layers.6.mlp.down_proj.weight8
152
+ 0
153
+ input75�0� 2layers.6.mlp.up_proj.weight8
154
+ ;
155
+ input76� 2(layers.6.post_attention_layernorm.weight8
156
+ 2
157
+ input77�0� 2layers.6.mlp.gate_proj.weight8
158
+ <
159
+ input78� �2'layers.7.self_attn.o_proj.o_proj.weight8
160
+ >
161
+ input79�� 2)layers.7.self_attn.qkv_proj.v_proj.weight8
162
+ 2
163
+ input80� 2layers.7.input_layernorm.weight8
164
+ 8
165
+ input81�2%layers.7.self_attn.k_layernorm.weight8
166
+ >
167
+ input82�� 2)layers.7.self_attn.qkv_proj.k_proj.weight8
168
+ 8
169
+ input83�2%layers.7.self_attn.q_layernorm.weight8
170
+ >
171
+ input84�� 2)layers.7.self_attn.qkv_proj.q_proj.weight8
172
+ 2
173
+ input85� �02layers.7.mlp.down_proj.weight8
174
+ 0
175
+ input86�0� 2layers.7.mlp.up_proj.weight8
176
+ ;
177
+ input87� 2(layers.7.post_attention_layernorm.weight8
178
+ 2
179
+ input88�0� 2layers.7.mlp.gate_proj.weight8
180
+ <
181
+ input89� �2'layers.8.self_attn.o_proj.o_proj.weight8
182
+ >
183
+ input90�� 2)layers.8.self_attn.qkv_proj.v_proj.weight8
184
+ 2
185
+ input91� 2layers.8.input_layernorm.weight8
186
+ 8
187
+ input92�2%layers.8.self_attn.k_layernorm.weight8
188
+ >
189
+ input93�� 2)layers.8.self_attn.qkv_proj.k_proj.weight8
190
+ 8
191
+ input94�2%layers.8.self_attn.q_layernorm.weight8
192
+ >
193
+ input95�� 2)layers.8.self_attn.qkv_proj.q_proj.weight8
194
+ 2
195
+ input96� �02layers.8.mlp.down_proj.weight8
196
+ 0
197
+ input97�0� 2layers.8.mlp.up_proj.weight8
198
+ ;
199
+ input98� 2(layers.8.post_attention_layernorm.weight8
200
+ 2
201
+ input99�0� 2layers.8.mlp.gate_proj.weight8
202
+ =
203
+ input100� �2'layers.9.self_attn.o_proj.o_proj.weight8
204
+ ?
205
+ input101�� 2)layers.9.self_attn.qkv_proj.v_proj.weight8
206
+ 3
207
+ input102� 2layers.9.input_layernorm.weight8
208
+ 9
209
+ input103�2%layers.9.self_attn.k_layernorm.weight8
210
+ ?
211
+ input104�� 2)layers.9.self_attn.qkv_proj.k_proj.weight8
212
+ 9
213
+ input105�2%layers.9.self_attn.q_layernorm.weight8
214
+ ?
215
+ input106�� 2)layers.9.self_attn.qkv_proj.q_proj.weight8
216
+ 3
217
+ input107� �02layers.9.mlp.down_proj.weight8
218
+ 1
219
+ input108�0� 2layers.9.mlp.up_proj.weight8
220
+ <
221
+ input109� 2(layers.9.post_attention_layernorm.weight8
222
+ 3
223
+ input110�0� 2layers.9.mlp.gate_proj.weight8
224
+ >
225
+ input111� �2(layers.10.self_attn.o_proj.o_proj.weight8
226
+ @
227
+ input112�� 2*layers.10.self_attn.qkv_proj.v_proj.weight8
228
+ 4
229
+ input113� 2 layers.10.input_layernorm.weight8
230
+ :
231
+ input114�2&layers.10.self_attn.k_layernorm.weight8
232
+ @
233
+ input115�� 2*layers.10.self_attn.qkv_proj.k_proj.weight8
234
+ :
235
+ input116�2&layers.10.self_attn.q_layernorm.weight8
236
+ @
237
+ input117�� 2*layers.10.self_attn.qkv_proj.q_proj.weight8
238
+ 4
239
+ input118� �02layers.10.mlp.down_proj.weight8
240
+ 2
241
+ input119�0� 2layers.10.mlp.up_proj.weight8
242
+ =
243
+ input120� 2)layers.10.post_attention_layernorm.weight8
244
+ 4
245
+ input121�0� 2layers.10.mlp.gate_proj.weight8
246
+ >
247
+ input122� �2(layers.11.self_attn.o_proj.o_proj.weight8
248
+ @
249
+ input123�� 2*layers.11.self_attn.qkv_proj.v_proj.weight8
250
+ 4
251
+ input124� 2 layers.11.input_layernorm.weight8
252
+ :
253
+ input125�2&layers.11.self_attn.k_layernorm.weight8
254
+ @
255
+ input126�� 2*layers.11.self_attn.qkv_proj.k_proj.weight8
256
+ :
257
+ input127�2&layers.11.self_attn.q_layernorm.weight8
258
+ @
259
+ input128�� 2*layers.11.self_attn.qkv_proj.q_proj.weight8
260
+ 4
261
+ input129� �02layers.11.mlp.down_proj.weight8
262
+ 2
263
+ input130�0� 2layers.11.mlp.up_proj.weight8
264
+ =
265
+ input131� 2)layers.11.post_attention_layernorm.weight8
266
+ 4
267
+ input132�0� 2layers.11.mlp.gate_proj.weight8
268
+ >
269
+ input133� �2(layers.12.self_attn.o_proj.o_proj.weight8
270
+ @
271
+ input134�� 2*layers.12.self_attn.qkv_proj.v_proj.weight8
272
+ 4
273
+ input135� 2 layers.12.input_layernorm.weight8
274
+ :
275
+ input136�2&layers.12.self_attn.k_layernorm.weight8
276
+ @
277
+ input137�� 2*layers.12.self_attn.qkv_proj.k_proj.weight8
278
+ :
279
+ input138�2&layers.12.self_attn.q_layernorm.weight8
280
+ @
281
+ input139�� 2*layers.12.self_attn.qkv_proj.q_proj.weight8
282
+ 4
283
+ input140� �02layers.12.mlp.down_proj.weight8
284
+ 2
285
+ input141�0� 2layers.12.mlp.up_proj.weight8
286
+ =
287
+ input142� 2)layers.12.post_attention_layernorm.weight8
288
+ 4
289
+ input143�0� 2layers.12.mlp.gate_proj.weight8
290
+ >
291
+ input144� �2(layers.13.self_attn.o_proj.o_proj.weight8
292
+ @
293
+ input145�� 2*layers.13.self_attn.qkv_proj.v_proj.weight8
294
+ 4
295
+ input146� 2 layers.13.input_layernorm.weight8
296
+ :
297
+ input147�2&layers.13.self_attn.k_layernorm.weight8
298
+ @
299
+ input148�� 2*layers.13.self_attn.qkv_proj.k_proj.weight8
300
+ :
301
+ input149�2&layers.13.self_attn.q_layernorm.weight8
302
+ @
303
+ input150�� 2*layers.13.self_attn.qkv_proj.q_proj.weight8
304
+ 4
305
+ input151� �02layers.13.mlp.down_proj.weight8
306
+ 2
307
+ input152�0� 2layers.13.mlp.up_proj.weight8
308
+ =
309
+ input153� 2)layers.13.post_attention_layernorm.weight8
310
+ 4
311
+ input154�0� 2layers.13.mlp.gate_proj.weight8
312
+ >
313
+ input155� �2(layers.14.self_attn.o_proj.o_proj.weight8
314
+ @
315
+ input156�� 2*layers.14.self_attn.qkv_proj.v_proj.weight8
316
+ 4
317
+ input157� 2 layers.14.input_layernorm.weight8
318
+ :
319
+ input158�2&layers.14.self_attn.k_layernorm.weight8
320
+ @
321
+ input159�� 2*layers.14.self_attn.qkv_proj.k_proj.weight8
322
+ :
323
+ input160�2&layers.14.self_attn.q_layernorm.weight8
324
+ @
325
+ input161�� 2*layers.14.self_attn.qkv_proj.q_proj.weight8
326
+ 4
327
+ input162� �02layers.14.mlp.down_proj.weight8
328
+ 2
329
+ input163�0� 2layers.14.mlp.up_proj.weight8
330
+ =
331
+ input164� 2)layers.14.post_attention_layernorm.weight8
332
+ 4
333
+ input165�0� 2layers.14.mlp.gate_proj.weight8
334
+ >
335
+ input166� �2(layers.15.self_attn.o_proj.o_proj.weight8
336
+ @
337
+ input167�� 2*layers.15.self_attn.qkv_proj.v_proj.weight8
338
+ 4
339
+ input168� 2 layers.15.input_layernorm.weight8
340
+ :
341
+ input169�2&layers.15.self_attn.k_layernorm.weight8
342
+ @
343
+ input170�� 2*layers.15.self_attn.qkv_proj.k_proj.weight8
344
+ :
345
+ input171�2&layers.15.self_attn.q_layernorm.weight8
346
+ @
347
+ input172�� 2*layers.15.self_attn.qkv_proj.q_proj.weight8
348
+ 4
349
+ input173� �02layers.15.mlp.down_proj.weight8
350
+ 2
351
+ input174�0� 2layers.15.mlp.up_proj.weight8
352
+ =
353
+ input175� 2)layers.15.post_attention_layernorm.weight8
354
+ 4
355
+ input176�0� 2layers.15.mlp.gate_proj.weight8
356
+ >
357
+ input177� �2(layers.16.self_attn.o_proj.o_proj.weight8
358
+ @
359
+ input178�� 2*layers.16.self_attn.qkv_proj.v_proj.weight8
360
+ 4
361
+ input179� 2 layers.16.input_layernorm.weight8
362
+ :
363
+ input180�2&layers.16.self_attn.k_layernorm.weight8
364
+ @
365
+ input181�� 2*layers.16.self_attn.qkv_proj.k_proj.weight8
366
+ :
367
+ input182�2&layers.16.self_attn.q_layernorm.weight8
368
+ @
369
+ input183�� 2*layers.16.self_attn.qkv_proj.q_proj.weight8
370
+ 4
371
+ input184� �02layers.16.mlp.down_proj.weight8
372
+ 2
373
+ input185�0� 2layers.16.mlp.up_proj.weight8
374
+ =
375
+ input186� 2)layers.16.post_attention_layernorm.weight8
376
+ 4
377
+ input187�0� 2layers.16.mlp.gate_proj.weight8
378
+ >
379
+ input188� �2(layers.17.self_attn.o_proj.o_proj.weight8
380
+ @
381
+ input189�� 2*layers.17.self_attn.qkv_proj.v_proj.weight8
382
+ 4
383
+ input190� 2 layers.17.input_layernorm.weight8
384
+ :
385
+ input191�2&layers.17.self_attn.k_layernorm.weight8
386
+ @
387
+ input192�� 2*layers.17.self_attn.qkv_proj.k_proj.weight8
388
+ :
389
+ input193�2&layers.17.self_attn.q_layernorm.weight8
390
+ @
391
+ input194�� 2*layers.17.self_attn.qkv_proj.q_proj.weight8
392
+ 4
393
+ input195� �02layers.17.mlp.down_proj.weight8
394
+ 2
395
+ input196�0� 2layers.17.mlp.up_proj.weight8
396
+ =
397
+ input197� 2)layers.17.post_attention_layernorm.weight8
398
+ 4
399
+ input198�0� 2layers.17.mlp.gate_proj.weight8
400
+ >
401
+ input199� �2(layers.18.self_attn.o_proj.o_proj.weight8
402
+ @
403
+ input200�� 2*layers.18.self_attn.qkv_proj.v_proj.weight8
404
+ 4
405
+ input201� 2 layers.18.input_layernorm.weight8
406
+ :
407
+ input202�2&layers.18.self_attn.k_layernorm.weight8
408
+ @
409
+ input203�� 2*layers.18.self_attn.qkv_proj.k_proj.weight8
410
+ :
411
+ input204�2&layers.18.self_attn.q_layernorm.weight8
412
+ @
413
+ input205�� 2*layers.18.self_attn.qkv_proj.q_proj.weight8
414
+ 4
415
+ input206� �02layers.18.mlp.down_proj.weight8
416
+ 2
417
+ input207�0� 2layers.18.mlp.up_proj.weight8
418
+ =
419
+ input208� 2)layers.18.post_attention_layernorm.weight8
420
+ 4
421
+ input209�0� 2layers.18.mlp.gate_proj.weight8
422
+ >
423
+ input210� �2(layers.19.self_attn.o_proj.o_proj.weight8
424
+ @
425
+ input211�� 2*layers.19.self_attn.qkv_proj.v_proj.weight8
426
+ 4
427
+ input212� 2 layers.19.input_layernorm.weight8
428
+ :
429
+ input213�2&layers.19.self_attn.k_layernorm.weight8
430
+ @
431
+ input214�� 2*layers.19.self_attn.qkv_proj.k_proj.weight8
432
+ :
433
+ input215�2&layers.19.self_attn.q_layernorm.weight8
434
+ @
435
+ input216�� 2*layers.19.self_attn.qkv_proj.q_proj.weight8
436
+ 4
437
+ input217� �02layers.19.mlp.down_proj.weight8
438
+ 2
439
+ input218�0� 2layers.19.mlp.up_proj.weight8
440
+ =
441
+ input219� 2)layers.19.post_attention_layernorm.weight8
442
+ 4
443
+ input220�0� 2layers.19.mlp.gate_proj.weight8
444
+ >
445
+ input221� �2(layers.20.self_attn.o_proj.o_proj.weight8
446
+ @
447
+ input222�� 2*layers.20.self_attn.qkv_proj.v_proj.weight8
448
+ 4
449
+ input223� 2 layers.20.input_layernorm.weight8
450
+ :
451
+ input224�2&layers.20.self_attn.k_layernorm.weight8
452
+ @
453
+ input225�� 2*layers.20.self_attn.qkv_proj.k_proj.weight8
454
+ :
455
+ input226�2&layers.20.self_attn.q_layernorm.weight8
456
+ @
457
+ input227�� 2*layers.20.self_attn.qkv_proj.q_proj.weight8
458
+ 4
459
+ input228� �02layers.20.mlp.down_proj.weight8
460
+ 2
461
+ input229�0� 2layers.20.mlp.up_proj.weight8
462
+ =
463
+ input230� 2)layers.20.post_attention_layernorm.weight8
464
+ 4
465
+ input231�0� 2layers.20.mlp.gate_proj.weight8
466
+ >
467
+ input232� �2(layers.21.self_attn.o_proj.o_proj.weight8
468
+ @
469
+ input233�� 2*layers.21.self_attn.qkv_proj.v_proj.weight8
470
+ 4
471
+ input234� 2 layers.21.input_layernorm.weight8
472
+ :
473
+ input235�2&layers.21.self_attn.k_layernorm.weight8
474
+ @
475
+ input236�� 2*layers.21.self_attn.qkv_proj.k_proj.weight8
476
+ :
477
+ input237�2&layers.21.self_attn.q_layernorm.weight8
478
+ @
479
+ input238�� 2*layers.21.self_attn.qkv_proj.q_proj.weight8
480
+ 4
481
+ input239� �02layers.21.mlp.down_proj.weight8
482
+ 2
483
+ input240�0� 2layers.21.mlp.up_proj.weight8
484
+ =
485
+ input241� 2)layers.21.post_attention_layernorm.weight8
486
+ 4
487
+ input242�0� 2layers.21.mlp.gate_proj.weight8
488
+ >
489
+ input243� �2(layers.22.self_attn.o_proj.o_proj.weight8
490
+ @
491
+ input244�� 2*layers.22.self_attn.qkv_proj.v_proj.weight8
492
+ 4
493
+ input245� 2 layers.22.input_layernorm.weight8
494
+ :
495
+ input246�2&layers.22.self_attn.k_layernorm.weight8
496
+ @
497
+ input247�� 2*layers.22.self_attn.qkv_proj.k_proj.weight8
498
+ :
499
+ input248�2&layers.22.self_attn.q_layernorm.weight8
500
+ @
501
+ input249�� 2*layers.22.self_attn.qkv_proj.q_proj.weight8
502
+ 4
503
+ input250� �02layers.22.mlp.down_proj.weight8
504
+ 2
505
+ input251�0� 2layers.22.mlp.up_proj.weight8
506
+ =
507
+ input252� 2)layers.22.post_attention_layernorm.weight8
508
+ 4
509
+ input253�0� 2layers.22.mlp.gate_proj.weight8
510
+ >
511
+ input254� �2(layers.23.self_attn.o_proj.o_proj.weight8
512
+ @
513
+ input255�� 2*layers.23.self_attn.qkv_proj.v_proj.weight8
514
+ 4
515
+ input256� 2 layers.23.input_layernorm.weight8
516
+ :
517
+ input257�2&layers.23.self_attn.k_layernorm.weight8
518
+ @
519
+ input258�� 2*layers.23.self_attn.qkv_proj.k_proj.weight8
520
+ :
521
+ input259�2&layers.23.self_attn.q_layernorm.weight8
522
+ @
523
+ input260�� 2*layers.23.self_attn.qkv_proj.q_proj.weight8
524
+ 4
525
+ input261� �02layers.23.mlp.down_proj.weight8
526
+ 2
527
+ input262�0� 2layers.23.mlp.up_proj.weight8
528
+ =
529
+ input263� 2)layers.23.post_attention_layernorm.weight8
530
+ 4
531
+ input264�0� 2layers.23.mlp.gate_proj.weight8
532
+ >
533
+ input265� �2(layers.24.self_attn.o_proj.o_proj.weight8
534
+ @
535
+ input266�� 2*layers.24.self_attn.qkv_proj.v_proj.weight8
536
+ 4
537
+ input267� 2 layers.24.input_layernorm.weight8
538
+ :
539
+ input268�2&layers.24.self_attn.k_layernorm.weight8
540
+ @
541
+ input269�� 2*layers.24.self_attn.qkv_proj.k_proj.weight8
542
+ :
543
+ input270�2&layers.24.self_attn.q_layernorm.weight8
544
+ @
545
+ input271�� 2*layers.24.self_attn.qkv_proj.q_proj.weight8
546
+ 4
547
+ input272� �02layers.24.mlp.down_proj.weight8
548
+ 2
549
+ input273�0� 2layers.24.mlp.up_proj.weight8
550
+ =
551
+ input274� 2)layers.24.post_attention_layernorm.weight8
552
+ 4
553
+ input275�0� 2layers.24.mlp.gate_proj.weight8
554
+ >
555
+ input276� �2(layers.25.self_attn.o_proj.o_proj.weight8
556
+ @
557
+ input277�� 2*layers.25.self_attn.qkv_proj.v_proj.weight8
558
+ 4
559
+ input278� 2 layers.25.input_layernorm.weight8
560
+ :
561
+ input279�2&layers.25.self_attn.k_layernorm.weight8
562
+ @
563
+ input280�� 2*layers.25.self_attn.qkv_proj.k_proj.weight8
564
+ :
565
+ input281�2&layers.25.self_attn.q_layernorm.weight8
566
+ @
567
+ input282�� 2*layers.25.self_attn.qkv_proj.q_proj.weight8
568
+ 4
569
+ input283� �02layers.25.mlp.down_proj.weight8
570
+ 2
571
+ input284�0� 2layers.25.mlp.up_proj.weight8
572
+ =
573
+ input285� 2)layers.25.post_attention_layernorm.weight8
574
+ 4
575
+ input286�0� 2layers.25.mlp.gate_proj.weight8
576
+ >
577
+ input287� �2(layers.26.self_attn.o_proj.o_proj.weight8
578
+ @
579
+ input288�� 2*layers.26.self_attn.qkv_proj.v_proj.weight8
580
+ 4
581
+ input289� 2 layers.26.input_layernorm.weight8
582
+ :
583
+ input290�2&layers.26.self_attn.k_layernorm.weight8
584
+ @
585
+ input291�� 2*layers.26.self_attn.qkv_proj.k_proj.weight8
586
+ :
587
+ input292�2&layers.26.self_attn.q_layernorm.weight8
588
+ @
589
+ input293�� 2*layers.26.self_attn.qkv_proj.q_proj.weight8
590
+ 4
591
+ input294� �02layers.26.mlp.down_proj.weight8
592
+ 2
593
+ input295�0� 2layers.26.mlp.up_proj.weight8
594
+ =
595
+ input296� 2)layers.26.post_attention_layernorm.weight8
596
+ 4
597
+ input297�0� 2layers.26.mlp.gate_proj.weight8
598
+ >
599
+ input298� �2(layers.27.self_attn.o_proj.o_proj.weight8
600
+ @
601
+ input299�� 2*layers.27.self_attn.qkv_proj.v_proj.weight8
602
+ 4
603
+ input300� 2 layers.27.input_layernorm.weight8
604
+ :
605
+ input301�2&layers.27.self_attn.k_layernorm.weight8
606
+ @
607
+ input302�� 2*layers.27.self_attn.qkv_proj.k_proj.weight8
608
+ :
609
+ input303�2&layers.27.self_attn.q_layernorm.weight8
610
+ @
611
+ input304�� 2*layers.27.self_attn.qkv_proj.q_proj.weight8
612
+ 4
613
+ input305� �02layers.27.mlp.down_proj.weight8
614
+ 2
615
+ input306�0� 2layers.27.mlp.up_proj.weight8
616
+ =
617
+ input307� 2)layers.27.post_attention_layernorm.weight8
618
+ 4
619
+ input308�0� 2layers.27.mlp.gate_proj.weight8
620
+ >
621
+ input309� �2(layers.28.self_attn.o_proj.o_proj.weight8
622
+ @
623
+ input310�� 2*layers.28.self_attn.qkv_proj.v_proj.weight8
624
+ 4
625
+ input311� 2 layers.28.input_layernorm.weight8
626
+ :
627
+ input312�2&layers.28.self_attn.k_layernorm.weight8
628
+ @
629
+ input313�� 2*layers.28.self_attn.qkv_proj.k_proj.weight8
630
+ :
631
+ input314�2&layers.28.self_attn.q_layernorm.weight8
632
+ @
633
+ input315�� 2*layers.28.self_attn.qkv_proj.q_proj.weight8
634
+ 4
635
+ input316� �02layers.28.mlp.down_proj.weight8
636
+ 2
637
+ input317�0� 2layers.28.mlp.up_proj.weight8
638
+ =
639
+ input318� 2)layers.28.post_attention_layernorm.weight8
640
+ 4
641
+ input319�0� 2layers.28.mlp.gate_proj.weight8
642
+ >
643
+ input320� �2(layers.29.self_attn.o_proj.o_proj.weight8
644
+ @
645
+ input321�� 2*layers.29.self_attn.qkv_proj.v_proj.weight8
646
+ 4
647
+ input322� 2 layers.29.input_layernorm.weight8
648
+ :
649
+ input323�2&layers.29.self_attn.k_layernorm.weight8
650
+ @
651
+ input324�� 2*layers.29.self_attn.qkv_proj.k_proj.weight8
652
+ :
653
+ input325�2&layers.29.self_attn.q_layernorm.weight8
654
+ @
655
+ input326�� 2*layers.29.self_attn.qkv_proj.q_proj.weight8
656
+ 4
657
+ input327� �02layers.29.mlp.down_proj.weight8
658
+ 2
659
+ input328�0� 2layers.29.mlp.up_proj.weight8
660
+ =
661
+ input329� 2)layers.29.post_attention_layernorm.weight8
662
+ 4
663
+ input330�0� 2layers.29.mlp.gate_proj.weight8
664
+ >
665
+ input331� �2(layers.30.self_attn.o_proj.o_proj.weight8
666
+ @
667
+ input332�� 2*layers.30.self_attn.qkv_proj.v_proj.weight8
668
+ 4
669
+ input333� 2 layers.30.input_layernorm.weight8
670
+ :
671
+ input334�2&layers.30.self_attn.k_layernorm.weight8
672
+ @
673
+ input335�� 2*layers.30.self_attn.qkv_proj.k_proj.weight8
674
+ :
675
+ input336�2&layers.30.self_attn.q_layernorm.weight8
676
+ @
677
+ input337�� 2*layers.30.self_attn.qkv_proj.q_proj.weight8
678
+ 4
679
+ input338� �02layers.30.mlp.down_proj.weight8
680
+ 2
681
+ input339�0� 2layers.30.mlp.up_proj.weight8
682
+ =
683
+ input340� 2)layers.30.post_attention_layernorm.weight8
684
+ 4
685
+ input341�0� 2layers.30.mlp.gate_proj.weight8
686
+ >
687
+ input342� �2(layers.31.self_attn.o_proj.o_proj.weight8
688
+ @
689
+ input343�� 2*layers.31.self_attn.qkv_proj.v_proj.weight8
690
+ 4
691
+ input344� 2 layers.31.input_layernorm.weight8
692
+ :
693
+ input345�2&layers.31.self_attn.k_layernorm.weight8
694
+ @
695
+ input346�� 2*layers.31.self_attn.qkv_proj.k_proj.weight8
696
+ :
697
+ input347�2&layers.31.self_attn.q_layernorm.weight8
698
+ @
699
+ input348�� 2*layers.31.self_attn.qkv_proj.q_proj.weight8
700
+ 4
701
+ input349� �02layers.31.mlp.down_proj.weight8
702
+ 2
703
+ input350�0� 2layers.31.mlp.up_proj.weight8
704
+ =
705
+ input351� 2)layers.31.post_attention_layernorm.weight8
706
+ 4
707
+ input352�0� 2layers.31.mlp.gate_proj.weight8
708
+ >
709
+ input353� �2(layers.32.self_attn.o_proj.o_proj.weight8
710
+ @
711
+ input354�� 2*layers.32.self_attn.qkv_proj.v_proj.weight8
712
+ 4
713
+ input355� 2 layers.32.input_layernorm.weight8
714
+ :
715
+ input356�2&layers.32.self_attn.k_layernorm.weight8
716
+ @
717
+ input357�� 2*layers.32.self_attn.qkv_proj.k_proj.weight8
718
+ :
719
+ input358�2&layers.32.self_attn.q_layernorm.weight8
720
+ @
721
+ input359�� 2*layers.32.self_attn.qkv_proj.q_proj.weight8
722
+ 4
723
+ input360� �02layers.32.mlp.down_proj.weight8
724
+ 2
725
+ input361�0� 2layers.32.mlp.up_proj.weight8
726
+ =
727
+ input362� 2)layers.32.post_attention_layernorm.weight8
728
+ 4
729
+ input363�0� 2layers.32.mlp.gate_proj.weight8
730
+ >
731
+ input364� �2(layers.33.self_attn.o_proj.o_proj.weight8
732
+ @
733
+ input365�� 2*layers.33.self_attn.qkv_proj.v_proj.weight8
734
+ 4
735
+ input366� 2 layers.33.input_layernorm.weight8
736
+ :
737
+ input367�2&layers.33.self_attn.k_layernorm.weight8
738
+ @
739
+ input368�� 2*layers.33.self_attn.qkv_proj.k_proj.weight8
740
+ :
741
+ input369�2&layers.33.self_attn.q_layernorm.weight8
742
+ @
743
+ input370�� 2*layers.33.self_attn.qkv_proj.q_proj.weight8
744
+ 4
745
+ input371� �02layers.33.mlp.down_proj.weight8
746
+ 2
747
+ input372�0� 2layers.33.mlp.up_proj.weight8
748
+ =
749
+ input373� 2)layers.33.post_attention_layernorm.weight8
750
+ 4
751
+ input374�0� 2layers.33.mlp.gate_proj.weight8
752
+ >
753
+ input375� �2(layers.34.self_attn.o_proj.o_proj.weight8
754
+ @
755
+ input376�� 2*layers.34.self_attn.qkv_proj.v_proj.weight8
756
+ 4
757
+ input377� 2 layers.34.input_layernorm.weight8
758
+ :
759
+ input378�2&layers.34.self_attn.k_layernorm.weight8
760
+ @
761
+ input379�� 2*layers.34.self_attn.qkv_proj.k_proj.weight8
762
+ :
763
+ input380�2&layers.34.self_attn.q_layernorm.weight8
764
+ @
765
+ input381�� 2*layers.34.self_attn.qkv_proj.q_proj.weight8
766
+ 4
767
+ input382� �02layers.34.mlp.down_proj.weight8
768
+ 2
769
+ input383�0� 2layers.34.mlp.up_proj.weight8
770
+ =
771
+ input384� 2)layers.34.post_attention_layernorm.weight8
772
+ 4
773
+ input385�0� 2layers.34.mlp.gate_proj.weight8
774
+ >
775
+ input386� �2(layers.35.self_attn.o_proj.o_proj.weight8
776
+ @
777
+ input387�� 2*layers.35.self_attn.qkv_proj.v_proj.weight8
778
+ 4
779
+ input388� 2 layers.35.input_layernorm.weight8
780
+ :
781
+ input389�2&layers.35.self_attn.k_layernorm.weight8
782
+ @
783
+ input390�� 2*layers.35.self_attn.qkv_proj.k_proj.weight8
784
+ :
785
+ input391�2&layers.35.self_attn.q_layernorm.weight8
786
+ @
787
+ input392�� 2*layers.35.self_attn.qkv_proj.q_proj.weight8
788
+ 4
789
+ input393� �02layers.35.mlp.down_proj.weight8
790
+ 2
791
+ input394�0� 2layers.35.mlp.up_proj.weight8
792
+ =
793
+ input395� 2)layers.35.post_attention_layernorm.weight8
794
+ 4
795
+ input396�0� 2layers.35.mlp.gate_proj.weight8
796
+ %
797
+ input397��� 2lm_head.weight8
798
+ 
799
+ input398� 2 norm.weight8'
800
+ output0�� �2embed_tokens.weight>
801
+ output1��2'layers.0.self_attn.o_proj.o_proj.weight>
802
+ output2� �2)layers.0.self_attn.qkv_proj.v_proj.weight1
803
+ output3� 2layers.0.input_layernorm.weight6
804
+ output4�2%layers.0.self_attn.k_layernorm.weight>
805
+ output5� @2)layers.0.self_attn.qkv_proj.k_proj.weight6
806
+ output6�2%layers.0.self_attn.q_layernorm.weight?
807
+ output7� @2)layers.0.self_attn.qkv_proj.q_proj.weight3
808
+ output8 ��2layers.0.mlp.down_proj.weight0
809
+ output90� �2layers.0.mlp.up_proj.weight;
810
+ output10� 2(layers.0.post_attention_layernorm.weight3
811
+ output110� �2layers.0.mlp.gate_proj.weight?
812
+ output12��2'layers.1.self_attn.o_proj.o_proj.weight?
813
+ output13� �2)layers.1.self_attn.qkv_proj.v_proj.weight2
814
+ output14� 2layers.1.input_layernorm.weight7
815
+ output15�2%layers.1.self_attn.k_layernorm.weight?
816
+ output16� @2)layers.1.self_attn.qkv_proj.k_proj.weight7
817
+ output17�2%layers.1.self_attn.q_layernorm.weight@
818
+ output18� @2)layers.1.self_attn.qkv_proj.q_proj.weight4
819
+ output19 ��2layers.1.mlp.down_proj.weight1
820
+ output200� �2layers.1.mlp.up_proj.weight;
821
+ output21� 2(layers.1.post_attention_layernorm.weight3
822
+ output220� �2layers.1.mlp.gate_proj.weight?
823
+ output23��2'layers.2.self_attn.o_proj.o_proj.weight?
824
+ output24� �2)layers.2.self_attn.qkv_proj.v_proj.weight2
825
+ output25� 2layers.2.input_layernorm.weight7
826
+ output26�2%layers.2.self_attn.k_layernorm.weight?
827
+ output27� @2)layers.2.self_attn.qkv_proj.k_proj.weight7
828
+ output28�2%layers.2.self_attn.q_layernorm.weight@
829
+ output29� @2)layers.2.self_attn.qkv_proj.q_proj.weight4
830
+ output30 ��2layers.2.mlp.down_proj.weight1
831
+ output310� �2layers.2.mlp.up_proj.weight;
832
+ output32� 2(layers.2.post_attention_layernorm.weight3
833
+ output330� �2layers.2.mlp.gate_proj.weight?
834
+ output34��2'layers.3.self_attn.o_proj.o_proj.weight?
835
+ output35� �2)layers.3.self_attn.qkv_proj.v_proj.weight2
836
+ output36� 2layers.3.input_layernorm.weight7
837
+ output37�2%layers.3.self_attn.k_layernorm.weight?
838
+ output38� @2)layers.3.self_attn.qkv_proj.k_proj.weight7
839
+ output39�2%layers.3.self_attn.q_layernorm.weight@
840
+ output40� @2)layers.3.self_attn.qkv_proj.q_proj.weight4
841
+ output41 ��2layers.3.mlp.down_proj.weight1
842
+ output420� �2layers.3.mlp.up_proj.weight;
843
+ output43� 2(layers.3.post_attention_layernorm.weight3
844
+ output440� �2layers.3.mlp.gate_proj.weight?
845
+ output45��2'layers.4.self_attn.o_proj.o_proj.weight?
846
+ output46� �2)layers.4.self_attn.qkv_proj.v_proj.weight2
847
+ output47� 2layers.4.input_layernorm.weight7
848
+ output48�2%layers.4.self_attn.k_layernorm.weight?
849
+ output49� @2)layers.4.self_attn.qkv_proj.k_proj.weight7
850
+ output50�2%layers.4.self_attn.q_layernorm.weight@
851
+ output51� @2)layers.4.self_attn.qkv_proj.q_proj.weight4
852
+ output52 ��2layers.4.mlp.down_proj.weight1
853
+ output530� �2layers.4.mlp.up_proj.weight;
854
+ output54� 2(layers.4.post_attention_layernorm.weight3
855
+ output550� �2layers.4.mlp.gate_proj.weight?
856
+ output56��2'layers.5.self_attn.o_proj.o_proj.weight?
857
+ output57� �2)layers.5.self_attn.qkv_proj.v_proj.weight2
858
+ output58� 2layers.5.input_layernorm.weight7
859
+ output59�2%layers.5.self_attn.k_layernorm.weight?
860
+ output60� @2)layers.5.self_attn.qkv_proj.k_proj.weight7
861
+ output61�2%layers.5.self_attn.q_layernorm.weight@
862
+ output62� @2)layers.5.self_attn.qkv_proj.q_proj.weight4
863
+ output63 ��2layers.5.mlp.down_proj.weight1
864
+ output640� �2layers.5.mlp.up_proj.weight;
865
+ output65� 2(layers.5.post_attention_layernorm.weight3
866
+ output660� �2layers.5.mlp.gate_proj.weight?
867
+ output67��2'layers.6.self_attn.o_proj.o_proj.weight?
868
+ output68� �2)layers.6.self_attn.qkv_proj.v_proj.weight2
869
+ output69� 2layers.6.input_layernorm.weight7
870
+ output70�2%layers.6.self_attn.k_layernorm.weight?
871
+ output71� @2)layers.6.self_attn.qkv_proj.k_proj.weight7
872
+ output72�2%layers.6.self_attn.q_layernorm.weight@
873
+ output73� @2)layers.6.self_attn.qkv_proj.q_proj.weight4
874
+ output74 ��2layers.6.mlp.down_proj.weight1
875
+ output750� �2layers.6.mlp.up_proj.weight;
876
+ output76� 2(layers.6.post_attention_layernorm.weight3
877
+ output770� �2layers.6.mlp.gate_proj.weight?
878
+ output78��2'layers.7.self_attn.o_proj.o_proj.weight?
879
+ output79� �2)layers.7.self_attn.qkv_proj.v_proj.weight2
880
+ output80� 2layers.7.input_layernorm.weight7
881
+ output81�2%layers.7.self_attn.k_layernorm.weight?
882
+ output82� @2)layers.7.self_attn.qkv_proj.k_proj.weight7
883
+ output83�2%layers.7.self_attn.q_layernorm.weight@
884
+ output84� @2)layers.7.self_attn.qkv_proj.q_proj.weight4
885
+ output85 ��2layers.7.mlp.down_proj.weight1
886
+ output860� �2layers.7.mlp.up_proj.weight;
887
+ output87� 2(layers.7.post_attention_layernorm.weight3
888
+ output880� �2layers.7.mlp.gate_proj.weight?
889
+ output89��2'layers.8.self_attn.o_proj.o_proj.weight?
890
+ output90� �2)layers.8.self_attn.qkv_proj.v_proj.weight2
891
+ output91� 2layers.8.input_layernorm.weight7
892
+ output92�2%layers.8.self_attn.k_layernorm.weight?
893
+ output93� @2)layers.8.self_attn.qkv_proj.k_proj.weight7
894
+ output94�2%layers.8.self_attn.q_layernorm.weight@
895
+ output95� @2)layers.8.self_attn.qkv_proj.q_proj.weight4
896
+ output96 ��2layers.8.mlp.down_proj.weight1
897
+ output970� �2layers.8.mlp.up_proj.weight;
898
+ output98� 2(layers.8.post_attention_layernorm.weight3
899
+ output990� �2layers.8.mlp.gate_proj.weight@
900
+ output100��2'layers.9.self_attn.o_proj.o_proj.weight@
901
+ output101� �2)layers.9.self_attn.qkv_proj.v_proj.weight3
902
+ output102� 2layers.9.input_layernorm.weight8
903
+ output103�2%layers.9.self_attn.k_layernorm.weight@
904
+ output104� @2)layers.9.self_attn.qkv_proj.k_proj.weight8
905
+ output105�2%layers.9.self_attn.q_layernorm.weightA
906
+ output106� @2)layers.9.self_attn.qkv_proj.q_proj.weight5
907
+ output107 ��2layers.9.mlp.down_proj.weight2
908
+ output1080� �2layers.9.mlp.up_proj.weight<
909
+ output109� 2(layers.9.post_attention_layernorm.weight4
910
+ output1100� �2layers.9.mlp.gate_proj.weightA
911
+ output111��2(layers.10.self_attn.o_proj.o_proj.weightA
912
+ output112� �2*layers.10.self_attn.qkv_proj.v_proj.weight4
913
+ output113� 2 layers.10.input_layernorm.weight9
914
+ output114�2&layers.10.self_attn.k_layernorm.weightA
915
+ output115� @2*layers.10.self_attn.qkv_proj.k_proj.weight9
916
+ output116�2&layers.10.self_attn.q_layernorm.weightB
917
+ output117� @2*layers.10.self_attn.qkv_proj.q_proj.weight6
918
+ output118 ��2layers.10.mlp.down_proj.weight3
919
+ output1190� �2layers.10.mlp.up_proj.weight=
920
+ output120� 2)layers.10.post_attention_layernorm.weight5
921
+ output1210� �2layers.10.mlp.gate_proj.weightA
922
+ output122��2(layers.11.self_attn.o_proj.o_proj.weightA
923
+ output123� �2*layers.11.self_attn.qkv_proj.v_proj.weight4
924
+ output124� 2 layers.11.input_layernorm.weight9
925
+ output125�2&layers.11.self_attn.k_layernorm.weightA
926
+ output126� @2*layers.11.self_attn.qkv_proj.k_proj.weight9
927
+ output127�2&layers.11.self_attn.q_layernorm.weightB
928
+ output128� @2*layers.11.self_attn.qkv_proj.q_proj.weight6
929
+ output129 ��2layers.11.mlp.down_proj.weight3
930
+ output1300� �2layers.11.mlp.up_proj.weight=
931
+ output131� 2)layers.11.post_attention_layernorm.weight5
932
+ output1320� �2layers.11.mlp.gate_proj.weightA
933
+ output133��2(layers.12.self_attn.o_proj.o_proj.weightA
934
+ output134� �2*layers.12.self_attn.qkv_proj.v_proj.weight4
935
+ output135� 2 layers.12.input_layernorm.weight9
936
+ output136�2&layers.12.self_attn.k_layernorm.weightA
937
+ output137� @2*layers.12.self_attn.qkv_proj.k_proj.weight9
938
+ output138�2&layers.12.self_attn.q_layernorm.weightB
939
+ output139� @2*layers.12.self_attn.qkv_proj.q_proj.weight6
940
+ output140 ��2layers.12.mlp.down_proj.weight3
941
+ output1410� �2layers.12.mlp.up_proj.weight=
942
+ output142� 2)layers.12.post_attention_layernorm.weight5
943
+ output1430� �2layers.12.mlp.gate_proj.weightA
944
+ output144��2(layers.13.self_attn.o_proj.o_proj.weightA
945
+ output145� �2*layers.13.self_attn.qkv_proj.v_proj.weight4
946
+ output146� 2 layers.13.input_layernorm.weight9
947
+ output147�2&layers.13.self_attn.k_layernorm.weightA
948
+ output148� @2*layers.13.self_attn.qkv_proj.k_proj.weight9
949
+ output149�2&layers.13.self_attn.q_layernorm.weightB
950
+ output150� @2*layers.13.self_attn.qkv_proj.q_proj.weight6
951
+ output151 ��2layers.13.mlp.down_proj.weight3
952
+ output1520� �2layers.13.mlp.up_proj.weight=
953
+ output153� 2)layers.13.post_attention_layernorm.weight5
954
+ output1540� �2layers.13.mlp.gate_proj.weightA
955
+ output155��2(layers.14.self_attn.o_proj.o_proj.weightA
956
+ output156� �2*layers.14.self_attn.qkv_proj.v_proj.weight4
957
+ output157� 2 layers.14.input_layernorm.weight9
958
+ output158�2&layers.14.self_attn.k_layernorm.weightA
959
+ output159� @2*layers.14.self_attn.qkv_proj.k_proj.weight9
960
+ output160�2&layers.14.self_attn.q_layernorm.weightB
961
+ output161� @2*layers.14.self_attn.qkv_proj.q_proj.weight6
962
+ output162 ��2layers.14.mlp.down_proj.weight3
963
+ output1630� �2layers.14.mlp.up_proj.weight=
964
+ output164� 2)layers.14.post_attention_layernorm.weight5
965
+ output1650� �2layers.14.mlp.gate_proj.weightA
966
+ output166��2(layers.15.self_attn.o_proj.o_proj.weightA
967
+ output167� �2*layers.15.self_attn.qkv_proj.v_proj.weight4
968
+ output168� 2 layers.15.input_layernorm.weight9
969
+ output169�2&layers.15.self_attn.k_layernorm.weightA
970
+ output170� @2*layers.15.self_attn.qkv_proj.k_proj.weight9
971
+ output171�2&layers.15.self_attn.q_layernorm.weightB
972
+ output172� @2*layers.15.self_attn.qkv_proj.q_proj.weight6
973
+ output173 ��2layers.15.mlp.down_proj.weight3
974
+ output1740� �2layers.15.mlp.up_proj.weight=
975
+ output175� 2)layers.15.post_attention_layernorm.weight5
976
+ output1760� �2layers.15.mlp.gate_proj.weightA
977
+ output177��2(layers.16.self_attn.o_proj.o_proj.weightA
978
+ output178� �2*layers.16.self_attn.qkv_proj.v_proj.weight4
979
+ output179� 2 layers.16.input_layernorm.weight9
980
+ output180�2&layers.16.self_attn.k_layernorm.weightA
981
+ output181� @2*layers.16.self_attn.qkv_proj.k_proj.weight9
982
+ output182�2&layers.16.self_attn.q_layernorm.weightB
983
+ output183� @2*layers.16.self_attn.qkv_proj.q_proj.weight6
984
+ output184 ��2layers.16.mlp.down_proj.weight3
985
+ output1850� �2layers.16.mlp.up_proj.weight=
986
+ output186� 2)layers.16.post_attention_layernorm.weight5
987
+ output1870� �2layers.16.mlp.gate_proj.weightA
988
+ output188��2(layers.17.self_attn.o_proj.o_proj.weightA
989
+ output189� �2*layers.17.self_attn.qkv_proj.v_proj.weight4
990
+ output190� 2 layers.17.input_layernorm.weight9
991
+ output191�2&layers.17.self_attn.k_layernorm.weightA
992
+ output192� @2*layers.17.self_attn.qkv_proj.k_proj.weight9
993
+ output193�2&layers.17.self_attn.q_layernorm.weightB
994
+ output194� @2*layers.17.self_attn.qkv_proj.q_proj.weight6
995
+ output195 ��2layers.17.mlp.down_proj.weight3
996
+ output1960� �2layers.17.mlp.up_proj.weight=
997
+ output197� 2)layers.17.post_attention_layernorm.weight5
998
+ output1980� �2layers.17.mlp.gate_proj.weightA
999
+ output199��2(layers.18.self_attn.o_proj.o_proj.weightA
1000
+ output200� �2*layers.18.self_attn.qkv_proj.v_proj.weight4
1001
+ output201� 2 layers.18.input_layernorm.weight9
1002
+ output202�2&layers.18.self_attn.k_layernorm.weightA
1003
+ output203� @2*layers.18.self_attn.qkv_proj.k_proj.weight9
1004
+ output204�2&layers.18.self_attn.q_layernorm.weightB
1005
+ output205� @2*layers.18.self_attn.qkv_proj.q_proj.weight6
1006
+ output206 ��2layers.18.mlp.down_proj.weight3
1007
+ output2070� �2layers.18.mlp.up_proj.weight=
1008
+ output208� 2)layers.18.post_attention_layernorm.weight5
1009
+ output2090� �2layers.18.mlp.gate_proj.weightA
1010
+ output210��2(layers.19.self_attn.o_proj.o_proj.weightA
1011
+ output211� �2*layers.19.self_attn.qkv_proj.v_proj.weight4
1012
+ output212� 2 layers.19.input_layernorm.weight9
1013
+ output213�2&layers.19.self_attn.k_layernorm.weightA
1014
+ output214� @2*layers.19.self_attn.qkv_proj.k_proj.weight9
1015
+ output215�2&layers.19.self_attn.q_layernorm.weightB
1016
+ output216� @2*layers.19.self_attn.qkv_proj.q_proj.weight6
1017
+ output217 ��2layers.19.mlp.down_proj.weight3
1018
+ output2180� �2layers.19.mlp.up_proj.weight=
1019
+ output219� 2)layers.19.post_attention_layernorm.weight5
1020
+ output2200� �2layers.19.mlp.gate_proj.weightA
1021
+ output221��2(layers.20.self_attn.o_proj.o_proj.weightA
1022
+ output222� �2*layers.20.self_attn.qkv_proj.v_proj.weight4
1023
+ output223� 2 layers.20.input_layernorm.weight9
1024
+ output224�2&layers.20.self_attn.k_layernorm.weightA
1025
+ output225� @2*layers.20.self_attn.qkv_proj.k_proj.weight9
1026
+ output226�2&layers.20.self_attn.q_layernorm.weightB
1027
+ output227� @2*layers.20.self_attn.qkv_proj.q_proj.weight6
1028
+ output228 ��2layers.20.mlp.down_proj.weight3
1029
+ output2290� �2layers.20.mlp.up_proj.weight=
1030
+ output230� 2)layers.20.post_attention_layernorm.weight5
1031
+ output2310� �2layers.20.mlp.gate_proj.weightA
1032
+ output232��2(layers.21.self_attn.o_proj.o_proj.weightA
1033
+ output233� �2*layers.21.self_attn.qkv_proj.v_proj.weight4
1034
+ output234� 2 layers.21.input_layernorm.weight9
1035
+ output235�2&layers.21.self_attn.k_layernorm.weightA
1036
+ output236� @2*layers.21.self_attn.qkv_proj.k_proj.weight9
1037
+ output237�2&layers.21.self_attn.q_layernorm.weightB
1038
+ output238� @2*layers.21.self_attn.qkv_proj.q_proj.weight6
1039
+ output239 ��2layers.21.mlp.down_proj.weight3
1040
+ output2400� �2layers.21.mlp.up_proj.weight=
1041
+ output241� 2)layers.21.post_attention_layernorm.weight5
1042
+ output2420� �2layers.21.mlp.gate_proj.weightA
1043
+ output243��2(layers.22.self_attn.o_proj.o_proj.weightA
1044
+ output244� �2*layers.22.self_attn.qkv_proj.v_proj.weight4
1045
+ output245� 2 layers.22.input_layernorm.weight9
1046
+ output246�2&layers.22.self_attn.k_layernorm.weightA
1047
+ output247� @2*layers.22.self_attn.qkv_proj.k_proj.weight9
1048
+ output248�2&layers.22.self_attn.q_layernorm.weightB
1049
+ output249� @2*layers.22.self_attn.qkv_proj.q_proj.weight6
1050
+ output250 ��2layers.22.mlp.down_proj.weight3
1051
+ output2510� �2layers.22.mlp.up_proj.weight=
1052
+ output252� 2)layers.22.post_attention_layernorm.weight5
1053
+ output2530� �2layers.22.mlp.gate_proj.weightA
1054
+ output254��2(layers.23.self_attn.o_proj.o_proj.weightA
1055
+ output255� �2*layers.23.self_attn.qkv_proj.v_proj.weight4
1056
+ output256� 2 layers.23.input_layernorm.weight9
1057
+ output257�2&layers.23.self_attn.k_layernorm.weightA
1058
+ output258� @2*layers.23.self_attn.qkv_proj.k_proj.weight9
1059
+ output259�2&layers.23.self_attn.q_layernorm.weightB
1060
+ output260� @2*layers.23.self_attn.qkv_proj.q_proj.weight6
1061
+ output261 ��2layers.23.mlp.down_proj.weight3
1062
+ output2620� �2layers.23.mlp.up_proj.weight=
1063
+ output263� 2)layers.23.post_attention_layernorm.weight5
1064
+ output2640� �2layers.23.mlp.gate_proj.weightA
1065
+ output265��2(layers.24.self_attn.o_proj.o_proj.weightA
1066
+ output266� �2*layers.24.self_attn.qkv_proj.v_proj.weight4
1067
+ output267� 2 layers.24.input_layernorm.weight9
1068
+ output268�2&layers.24.self_attn.k_layernorm.weightA
1069
+ output269� @2*layers.24.self_attn.qkv_proj.k_proj.weight9
1070
+ output270�2&layers.24.self_attn.q_layernorm.weightB
1071
+ output271� @2*layers.24.self_attn.qkv_proj.q_proj.weight6
1072
+ output272 ��2layers.24.mlp.down_proj.weight3
1073
+ output2730� �2layers.24.mlp.up_proj.weight=
1074
+ output274� 2)layers.24.post_attention_layernorm.weight5
1075
+ output2750� �2layers.24.mlp.gate_proj.weightA
1076
+ output276��2(layers.25.self_attn.o_proj.o_proj.weightA
1077
+ output277� �2*layers.25.self_attn.qkv_proj.v_proj.weight4
1078
+ output278� 2 layers.25.input_layernorm.weight9
1079
+ output279�2&layers.25.self_attn.k_layernorm.weightA
1080
+ output280� @2*layers.25.self_attn.qkv_proj.k_proj.weight9
1081
+ output281�2&layers.25.self_attn.q_layernorm.weightB
1082
+ output282� @2*layers.25.self_attn.qkv_proj.q_proj.weight6
1083
+ output283 ��2layers.25.mlp.down_proj.weight3
1084
+ output2840� �2layers.25.mlp.up_proj.weight=
1085
+ output285� 2)layers.25.post_attention_layernorm.weight5
1086
+ output2860� �2layers.25.mlp.gate_proj.weightA
1087
+ output287��2(layers.26.self_attn.o_proj.o_proj.weightA
1088
+ output288� �2*layers.26.self_attn.qkv_proj.v_proj.weight4
1089
+ output289� 2 layers.26.input_layernorm.weight9
1090
+ output290�2&layers.26.self_attn.k_layernorm.weightA
1091
+ output291� @2*layers.26.self_attn.qkv_proj.k_proj.weight9
1092
+ output292�2&layers.26.self_attn.q_layernorm.weightB
1093
+ output293� @2*layers.26.self_attn.qkv_proj.q_proj.weight6
1094
+ output294 ��2layers.26.mlp.down_proj.weight3
1095
+ output2950� �2layers.26.mlp.up_proj.weight=
1096
+ output296� 2)layers.26.post_attention_layernorm.weight5
1097
+ output2970� �2layers.26.mlp.gate_proj.weightA
1098
+ output298��2(layers.27.self_attn.o_proj.o_proj.weightA
1099
+ output299� �2*layers.27.self_attn.qkv_proj.v_proj.weight4
1100
+ output300� 2 layers.27.input_layernorm.weight9
1101
+ output301�2&layers.27.self_attn.k_layernorm.weightA
1102
+ output302� @2*layers.27.self_attn.qkv_proj.k_proj.weight9
1103
+ output303�2&layers.27.self_attn.q_layernorm.weightB
1104
+ output304� @2*layers.27.self_attn.qkv_proj.q_proj.weight6
1105
+ output305 ��2layers.27.mlp.down_proj.weight3
1106
+ output3060� �2layers.27.mlp.up_proj.weight=
1107
+ output307� 2)layers.27.post_attention_layernorm.weight5
1108
+ output3080� �2layers.27.mlp.gate_proj.weightA
1109
+ output309��2(layers.28.self_attn.o_proj.o_proj.weightA
1110
+ output310� �2*layers.28.self_attn.qkv_proj.v_proj.weight4
1111
+ output311� 2 layers.28.input_layernorm.weight9
1112
+ output312�2&layers.28.self_attn.k_layernorm.weightA
1113
+ output313� @2*layers.28.self_attn.qkv_proj.k_proj.weight9
1114
+ output314�2&layers.28.self_attn.q_layernorm.weightB
1115
+ output315� @2*layers.28.self_attn.qkv_proj.q_proj.weight6
1116
+ output316 ��2layers.28.mlp.down_proj.weight3
1117
+ output3170� �2layers.28.mlp.up_proj.weight=
1118
+ output318� 2)layers.28.post_attention_layernorm.weight5
1119
+ output3190� �2layers.28.mlp.gate_proj.weightA
1120
+ output320��2(layers.29.self_attn.o_proj.o_proj.weightA
1121
+ output321� �2*layers.29.self_attn.qkv_proj.v_proj.weight4
1122
+ output322� 2 layers.29.input_layernorm.weight9
1123
+ output323�2&layers.29.self_attn.k_layernorm.weightA
1124
+ output324� @2*layers.29.self_attn.qkv_proj.k_proj.weight9
1125
+ output325�2&layers.29.self_attn.q_layernorm.weightB
1126
+ output326� @2*layers.29.self_attn.qkv_proj.q_proj.weight6
1127
+ output327 ��2layers.29.mlp.down_proj.weight3
1128
+ output3280� �2layers.29.mlp.up_proj.weight=
1129
+ output329� 2)layers.29.post_attention_layernorm.weight5
1130
+ output3300� �2layers.29.mlp.gate_proj.weightA
1131
+ output331��2(layers.30.self_attn.o_proj.o_proj.weightA
1132
+ output332� �2*layers.30.self_attn.qkv_proj.v_proj.weight4
1133
+ output333� 2 layers.30.input_layernorm.weight9
1134
+ output334�2&layers.30.self_attn.k_layernorm.weightA
1135
+ output335� @2*layers.30.self_attn.qkv_proj.k_proj.weight9
1136
+ output336�2&layers.30.self_attn.q_layernorm.weightB
1137
+ output337� @2*layers.30.self_attn.qkv_proj.q_proj.weight6
1138
+ output338 ��2layers.30.mlp.down_proj.weight3
1139
+ output3390� �2layers.30.mlp.up_proj.weight=
1140
+ output340� 2)layers.30.post_attention_layernorm.weight5
1141
+ output3410� �2layers.30.mlp.gate_proj.weightA
1142
+ output342��2(layers.31.self_attn.o_proj.o_proj.weightA
1143
+ output343� �2*layers.31.self_attn.qkv_proj.v_proj.weight4
1144
+ output344� 2 layers.31.input_layernorm.weight9
1145
+ output345�2&layers.31.self_attn.k_layernorm.weightA
1146
+ output346� @2*layers.31.self_attn.qkv_proj.k_proj.weight9
1147
+ output347�2&layers.31.self_attn.q_layernorm.weightB
1148
+ output348� @2*layers.31.self_attn.qkv_proj.q_proj.weight6
1149
+ output349 ��2layers.31.mlp.down_proj.weight3
1150
+ output3500� �2layers.31.mlp.up_proj.weight=
1151
+ output351� 2)layers.31.post_attention_layernorm.weight5
1152
+ output3520� �2layers.31.mlp.gate_proj.weightA
1153
+ output353��2(layers.32.self_attn.o_proj.o_proj.weightA
1154
+ output354� �2*layers.32.self_attn.qkv_proj.v_proj.weight4
1155
+ output355� 2 layers.32.input_layernorm.weight9
1156
+ output356�2&layers.32.self_attn.k_layernorm.weightA
1157
+ output357� @2*layers.32.self_attn.qkv_proj.k_proj.weight9
1158
+ output358�2&layers.32.self_attn.q_layernorm.weightB
1159
+ output359� @2*layers.32.self_attn.qkv_proj.q_proj.weight6
1160
+ output360 ��2layers.32.mlp.down_proj.weight3
1161
+ output3610� �2layers.32.mlp.up_proj.weight=
1162
+ output362� 2)layers.32.post_attention_layernorm.weight5
1163
+ output3630� �2layers.32.mlp.gate_proj.weightA
1164
+ output364��2(layers.33.self_attn.o_proj.o_proj.weightA
1165
+ output365� �2*layers.33.self_attn.qkv_proj.v_proj.weight4
1166
+ output366� 2 layers.33.input_layernorm.weight9
1167
+ output367�2&layers.33.self_attn.k_layernorm.weightA
1168
+ output368� @2*layers.33.self_attn.qkv_proj.k_proj.weight9
1169
+ output369�2&layers.33.self_attn.q_layernorm.weightB
1170
+ output370� @2*layers.33.self_attn.qkv_proj.q_proj.weight6
1171
+ output371 ��2layers.33.mlp.down_proj.weight3
1172
+ output3720� �2layers.33.mlp.up_proj.weight=
1173
+ output373� 2)layers.33.post_attention_layernorm.weight5
1174
+ output3740� �2layers.33.mlp.gate_proj.weightA
1175
+ output375��2(layers.34.self_attn.o_proj.o_proj.weightA
1176
+ output376� �2*layers.34.self_attn.qkv_proj.v_proj.weight4
1177
+ output377� 2 layers.34.input_layernorm.weight9
1178
+ output378�2&layers.34.self_attn.k_layernorm.weightA
1179
+ output379� @2*layers.34.self_attn.qkv_proj.k_proj.weight9
1180
+ output380�2&layers.34.self_attn.q_layernorm.weightB
1181
+ output381� @2*layers.34.self_attn.qkv_proj.q_proj.weight6
1182
+ output382 ��2layers.34.mlp.down_proj.weight3
1183
+ output3830� �2layers.34.mlp.up_proj.weight=
1184
+ output384� 2)layers.34.post_attention_layernorm.weight5
1185
+ output3850� �2layers.34.mlp.gate_proj.weightA
1186
+ output386��2(layers.35.self_attn.o_proj.o_proj.weightA
1187
+ output387� �2*layers.35.self_attn.qkv_proj.v_proj.weight4
1188
+ output388� 2 layers.35.input_layernorm.weight9
1189
+ output389�2&layers.35.self_attn.k_layernorm.weightA
1190
+ output390� @2*layers.35.self_attn.qkv_proj.k_proj.weight9
1191
+ output391�2&layers.35.self_attn.q_layernorm.weightB
1192
+ output392� @2*layers.35.self_attn.qkv_proj.q_proj.weight6
1193
+ output393 ��2layers.35.mlp.down_proj.weight3
1194
+ output3940� �2layers.35.mlp.up_proj.weight=
1195
+ output395� 2)layers.35.post_attention_layernorm.weight5
1196
+ output3960� �2layers.35.mlp.gate_proj.weight$
1197
+ output397��� 2lm_head.weight
1198
+ output398� 2 norm.weight
layout_opt/model/graph.hlo ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12b45b028e502b2dd8c42c1287fbdbea434454143a30d473806853bc18673d98
3
+ size 211060
model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c36077018a9f85728962cc73bfcba755ce1d5d5b6f608dacf65d7b95596eb109
3
+ size 47198475
neuron_config.json ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation_autoset": false,
3
+ "_name_or_path": "Qwen/Qwen3-8B",
4
+ "add_cross_attention": false,
5
+ "architectures": [
6
+ "Qwen3ForCausalLM"
7
+ ],
8
+ "attention_bias": false,
9
+ "attention_dropout": 0.0,
10
+ "attribute_map": {},
11
+ "bad_words_ids": null,
12
+ "begin_suppress_tokens": null,
13
+ "bos_token_id": 151643,
14
+ "chunk_size_feed_forward": 0,
15
+ "cross_attention_hidden_size": null,
16
+ "decoder_start_token_id": null,
17
+ "diversity_penalty": 0.0,
18
+ "do_sample": false,
19
+ "early_stopping": false,
20
+ "encoder_no_repeat_ngram_size": 0,
21
+ "eos_token_id": 151645,
22
+ "exponential_decay_length_penalty": null,
23
+ "finetuning_task": null,
24
+ "forced_bos_token_id": null,
25
+ "forced_eos_token_id": null,
26
+ "fused_spec_config": null,
27
+ "head_dim": 128,
28
+ "hidden_act": "silu",
29
+ "hidden_size": 4096,
30
+ "id2label": {
31
+ "0": "LABEL_0",
32
+ "1": "LABEL_1"
33
+ },
34
+ "initializer_range": 0.02,
35
+ "intermediate_size": 12288,
36
+ "is_decoder": false,
37
+ "is_encoder_decoder": false,
38
+ "label2id": {
39
+ "LABEL_0": 0,
40
+ "LABEL_1": 1
41
+ },
42
+ "length_penalty": 1.0,
43
+ "max_length": 20,
44
+ "max_position_embeddings": 40960,
45
+ "max_window_layers": 36,
46
+ "metadata": null,
47
+ "min_length": 0,
48
+ "model_type": "qwen3",
49
+ "neuron_config": {
50
+ "activation_quantization_type": null,
51
+ "allow_input_truncation": false,
52
+ "apply_seq_ids_mask": false,
53
+ "async_mode": false,
54
+ "attention_dp_degree": 1,
55
+ "attention_dtype": null,
56
+ "attn_block_cte_nki_kernel_enabled": false,
57
+ "attn_block_tkg_nki_kernel_cache_update": false,
58
+ "attn_block_tkg_nki_kernel_enabled": false,
59
+ "attn_cls": {
60
+ "__module__": "neuronx_distributed_inference.models.qwen3.modeling_qwen3",
61
+ "__name__": "NeuronQwen3Attention"
62
+ },
63
+ "attn_kernel_enabled": null,
64
+ "attn_tkg_builtin_kernel_enabled": false,
65
+ "attn_tkg_nki_kernel_enabled": false,
66
+ "batch_size": 1,
67
+ "bucket_n_active_tokens": false,
68
+ "buckets": [
69
+ 1024
70
+ ],
71
+ "cast_type": "config",
72
+ "cc_pipeline_tiling_factor": 2,
73
+ "chunked_prefill_config": null,
74
+ "context_encoding_buckets": null,
75
+ "cp_degree": 1,
76
+ "ctx_batch_size": 1,
77
+ "disable_kv_cache_tiling": false,
78
+ "draft_model_modules_to_not_convert": null,
79
+ "enable_bucketing": true,
80
+ "enable_eagle_draft_input_norm": false,
81
+ "enable_eagle_speculation": false,
82
+ "enable_fused_speculation": false,
83
+ "enable_long_context_mode": false,
84
+ "enable_output_completion_notifications": false,
85
+ "enable_spill_reload_dge": false,
86
+ "enable_token_tree": false,
87
+ "ep_degree": 1,
88
+ "expert_mlp_nki_kernel_enabled": null,
89
+ "flash_decoding_enabled": false,
90
+ "fused_qkv": false,
91
+ "fused_rmsnorm_skip_gamma": false,
92
+ "is_block_kv_layout": null,
93
+ "is_chunked_prefill": false,
94
+ "is_continuous_batching": true,
95
+ "is_eagle_draft": false,
96
+ "is_medusa": false,
97
+ "is_prefill_stage": null,
98
+ "is_prefix_caching": false,
99
+ "k_cache_transposed": false,
100
+ "kv_cache_batch_size": 1,
101
+ "kv_cache_padding_size": 0,
102
+ "kv_cache_quant": false,
103
+ "kv_cache_tiling": false,
104
+ "layer_boundary_markers": false,
105
+ "lm_head_pad": false,
106
+ "lm_head_pad_alignment_size": 1,
107
+ "local_ranks_size": 2,
108
+ "logical_nc_config": 1,
109
+ "lora_config": null,
110
+ "max_batch_size": 1,
111
+ "max_context_length": 1024,
112
+ "max_length": 1024,
113
+ "max_new_tokens": null,
114
+ "medusa_speculation_length": 0,
115
+ "medusa_tree": null,
116
+ "mlp_kernel_enabled": false,
117
+ "mlp_kernel_fuse_residual_add": false,
118
+ "modules_to_not_convert": null,
119
+ "moe_fused_nki_kernel_enabled": null,
120
+ "n_active_tokens": 1024,
121
+ "n_positions": 1024,
122
+ "num_medusa_heads": 0,
123
+ "on_cpu": false,
124
+ "on_device_sampling_config": {
125
+ "deterministic": false,
126
+ "do_sample": false,
127
+ "dynamic": true,
128
+ "global_topk": 256,
129
+ "on_device_sampling_config": true,
130
+ "temperature": 1.0,
131
+ "top_k": 1,
132
+ "top_k_kernel_enabled": false,
133
+ "top_p": 1.0
134
+ },
135
+ "output_logits": false,
136
+ "overrides_torch_dtype": true,
137
+ "pa_block_size": 1024,
138
+ "pa_num_blocks": 1,
139
+ "padding_side": "right",
140
+ "pp_degree": 1,
141
+ "prefix_buckets": null,
142
+ "qk_layernorm": false,
143
+ "qkv_kernel_enabled": false,
144
+ "qkv_kernel_fuse_residual_add": false,
145
+ "qkv_kernel_nbsd_layout": false,
146
+ "quantization_dtype": "int8",
147
+ "quantization_type": "per_tensor_symmetric",
148
+ "quantize_clamp_bound": Infinity,
149
+ "quantized": false,
150
+ "quantized_checkpoints_path": null,
151
+ "quantized_mlp_kernel_enabled": false,
152
+ "rmsnorm_quantize_kernel_enabled": false,
153
+ "router_topk_nki_kernel_enabled": null,
154
+ "rpl_reduce_dtype": null,
155
+ "save_sharded_checkpoint": true,
156
+ "scratchpad_page_size": null,
157
+ "seq_len": 1024,
158
+ "seq_len_threshold_for_cc_tiling": 16384,
159
+ "sequence_parallel_enabled": false,
160
+ "shared_mlp_nki_kernel_enabled": null,
161
+ "skip_sharding": false,
162
+ "skip_warmup": false,
163
+ "spec_batch_size": 1,
164
+ "speculation_length": 0,
165
+ "start_rank_id": 0,
166
+ "target": null,
167
+ "tile_cc": false,
168
+ "tkg_batch_size": 1,
169
+ "token_generation_buckets": null,
170
+ "token_tree_config": null,
171
+ "torch_dtype": "bfloat16",
172
+ "tp_degree": 2,
173
+ "vocab_parallel": false,
174
+ "weight_gather_seq_len_threshold": 32768,
175
+ "weights_to_skip_layout_optimization": [],
176
+ "world_size": 2
177
+ },
178
+ "no_repeat_ngram_size": 0,
179
+ "num_attention_heads": 32,
180
+ "num_beam_groups": 1,
181
+ "num_beams": 1,
182
+ "num_cores_per_group": 1,
183
+ "num_hidden_layers": 36,
184
+ "num_key_value_heads": 8,
185
+ "num_return_sequences": 1,
186
+ "output_attentions": false,
187
+ "output_hidden_states": false,
188
+ "output_scores": false,
189
+ "pad_token_id": null,
190
+ "prefix": null,
191
+ "problem_type": null,
192
+ "pruned_heads": {},
193
+ "remove_invalid_values": false,
194
+ "repetition_penalty": 1.0,
195
+ "return_dict": true,
196
+ "return_dict_in_generate": false,
197
+ "rms_norm_eps": 1e-06,
198
+ "rope_scaling": null,
199
+ "rope_theta": 1000000,
200
+ "sep_token_id": null,
201
+ "sliding_window": null,
202
+ "suppress_tokens": null,
203
+ "task_specific_params": null,
204
+ "temperature": 1.0,
205
+ "tf_legacy_loss": false,
206
+ "tie_encoder_decoder": false,
207
+ "tie_word_embeddings": false,
208
+ "tokenizer_class": null,
209
+ "top_k": 50,
210
+ "top_p": 1.0,
211
+ "torchscript": false,
212
+ "transformers_version": "4.51.0",
213
+ "typical_p": 1.0,
214
+ "use_bfloat16": false,
215
+ "use_cache": true,
216
+ "use_sliding_window": false,
217
+ "vocab_size": 151936
218
+ }
token_generation_model/_tp0_bk0/command.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ neuronx-cc compile --framework=XLA model.MODULE_6ef5ba8b41fbbe77f080+74ae8282.hlo_module.pb --output model.MODULE_6ef5ba8b41fbbe77f080+74ae8282.neff --target=trn1 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma ' --lnc=1 -O2 --internal-hlo2tensorizer-options=--verify-hlo=true --logfile=log-neuron-cc.txt --enable-internal-neff-wrapper --verbose=35
token_generation_model/_tp0_bk0/compile_flags.MODULE_6ef5ba8b41fbbe77f080+74ae8282.json ADDED
@@ -0,0 +1 @@
 
 
1
+ ["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma ", "--lnc=1", "-O2", "--internal-hlo2tensorizer-options=--verify-hlo=true", "--logfile=/home/ubuntu/qwen3/token_generation_model/_tp0_bk0/log-neuron-cc.txt", "--enable-internal-neff-wrapper"]
token_generation_model/_tp0_bk0/global_metric_store.json ADDED
@@ -0,0 +1,540 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Average": {
3
+ "tensorizer": {
4
+ "StaticProfiler::AverageFractalPeUtilization": 99.8321762084961,
5
+ "StaticProfiler::AveragePartitionUtilization": 99.3888168334961,
6
+ "StaticProfiler::AveragePeUtilization": 99.65400695800781,
7
+ "StaticProfiler::LocalizationEfficiency": 109.9806137084961,
8
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 110.06793212890625,
9
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0,
10
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0
11
+ }
12
+ },
13
+ "Count": {
14
+ "tensorizer": {
15
+ "StaticProfiler::AverageFractalPeUtilization": 1,
16
+ "StaticProfiler::AveragePartitionUtilization": 1,
17
+ "StaticProfiler::AveragePeUtilization": 1,
18
+ "StaticProfiler::LocalizationEfficiency": 1,
19
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1,
20
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 1,
21
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 1
22
+ }
23
+ },
24
+ "Sum": {
25
+ "compiletime": {
26
+ "AGOrderingAnalysisPass": 1.4457588195800781,
27
+ "AffinePredicateResolution": 0.05167531967163086,
28
+ "AliasDependencyElimination": 0.0026276111602783203,
29
+ "AliasDependencyInduction": 0.44934630393981934,
30
+ "AliasDependencyReset": 1.2677826881408691,
31
+ "BFComputeCutting": 0.06423807144165039,
32
+ "BirCodeGenLoop": 2.421293258666992,
33
+ "CCOpFusion": 0.41050028800964355,
34
+ "CanonicalizeConv": 9.999999974752427e-07,
35
+ "CanonicalizeDAGForPGTiling": 0.21233797073364258,
36
+ "CanonicalizeForTensorizer": 0.0003640000068116933,
37
+ "CanonicalizeIR": 0.06626629829406738,
38
+ "Canonicalizer": 0.007044999860227108,
39
+ "CoalesceCCOp": 0.19146490097045898,
40
+ "CommuteConcat": 0.03319668769836426,
41
+ "DMALocalityOpt": 0.035207271575927734,
42
+ "DMAProfiler": 0.08866691589355469,
43
+ "DMATilingProfiler": 0.07109546661376953,
44
+ "DataLocalityOpt": 1.910703182220459,
45
+ "DataStreaming": 0.15389323234558105,
46
+ "DeConcat": 0.012087583541870117,
47
+ "DeadCodeElimination": 0.035611867904663086,
48
+ "DeadStoreElimination": 0.37193870544433594,
49
+ "DelinearIndices": 0.2894127368927002,
50
+ "Delinearization": 0.1295926570892334,
51
+ "DoNothing": 0.00019550323486328125,
52
+ "DramToDramTranspose": 1.0679569244384766,
53
+ "DumpGraphAndMetadata": 0.24142217636108398,
54
+ "EliminateDivs": 0.17337489128112793,
55
+ "ExpandBatchNorm": 0.06027984619140625,
56
+ "ExpandISAMacro": 0.0909569263458252,
57
+ "FactorizeBlkDims": 0.24945974349975586,
58
+ "FactorizeThreadAxesInFreeDims": 0.03613853454589844,
59
+ "FlattenMacroLoop": 0.26774168014526367,
60
+ "GenericAccessSimplifier": 0.03175926208496094,
61
+ "HoistCompute": 4.8000001697801054e-05,
62
+ "IdentifyCrossPassTensors": 0.00013600000238511711,
63
+ "InferInitValue": 1.029360294342041,
64
+ "InferIntrinsicOnCC": 0.34307408332824707,
65
+ "InferNeuronTensor": 1.7935998439788818,
66
+ "InferNonlocalTensors": 3.6307339668273926,
67
+ "InferPSumTensor": 0.9782986640930176,
68
+ "InlineNativeKernels": 0.05374264717102051,
69
+ "InsertIOTransposes": 1.162278652191162,
70
+ "InsertLocalTransposes": 1.0349645614624023,
71
+ "InsertOffloadedTransposes": 0.0943443775177002,
72
+ "LICM": 0.1061861515045166,
73
+ "LateLegalizeInst": 0.22754216194152832,
74
+ "LateLegalizePostSplit": 0.09247255325317383,
75
+ "LateLowerReshapeOp": 0.04053616523742676,
76
+ "LateLowerTensorOp": 0.3356895446777344,
77
+ "LateNeuronInstComb": 0.4516925811767578,
78
+ "LayoutPreprocessing": 0.9441671371459961,
79
+ "LayoutPreprocessingAndAnalysis": 1.2680203914642334,
80
+ "LayoutRequirementAnalysis": 0.309098482131958,
81
+ "LegalizeCCOpLayout": 0.07318258285522461,
82
+ "LegalizeOpLevelAlias": 0.03343796730041504,
83
+ "LegalizePartitionReduce": 0.034781694412231445,
84
+ "LegalizeSundaAccess": 1.4558701515197754,
85
+ "LegalizeSundaMacro": 0.37755250930786133,
86
+ "LegalizeType": 0.20858454704284668,
87
+ "LocalLayoutOpt": 0.36218762397766113,
88
+ "LoopFusion": 0.31240200996398926,
89
+ "LoopSplitting": 0.013066768646240234,
90
+ "LowerBroadcast": 0.047890663146972656,
91
+ "LowerCCOpBlockAxis": 0.23094987869262695,
92
+ "LowerComplexBroadcast": 0.15572404861450195,
93
+ "LowerIntrinsics": 1.228858470916748,
94
+ "LowerTensorOp": 0.4897449016571045,
95
+ "LowerTranspose": 0.3995330333709717,
96
+ "MacroGeneration": 2.335334062576294,
97
+ "MaskPropagation": 0.14433836936950684,
98
+ "MemcastMotion": 0.00013000000035390258,
99
+ "MemcpyElimination": 3.9867260456085205,
100
+ "MutateDataType": 0.04344511032104492,
101
+ "NeuronAliasDependencyInduction": 0.025929927825927734,
102
+ "NeuronAliasDependencyReset": 0.04254412651062012,
103
+ "NeuronInstComb": 0.19350981712341309,
104
+ "NeuronLICM": 0.2897522449493408,
105
+ "NeuronLoopFusion": 0.4089043140411377,
106
+ "NeuronLoopInterchange": 0.04476189613342285,
107
+ "NeuronSimplifier": 0.30055856704711914,
108
+ "NeuronSimplifyPredicates": 0.18221426010131836,
109
+ "NeuronValueNumbering": 0.10663247108459473,
110
+ "OptimizeAliasedCopyChain": 0.01511383056640625,
111
+ "OptimizeNKIKernels": 0.4606451988220215,
112
+ "PAGLayoutOpt": 26.32272720336914,
113
+ "PComputeCutting": 0.302201509475708,
114
+ "PGLayoutTilingPipeline": 38.88710403442383,
115
+ "PGTiling": 4.423768043518066,
116
+ "PadElimination": 0.008622884750366211,
117
+ "ParAxesAnnotation": 25.272018432617188,
118
+ "PartialLoopFusion": 0.2368309497833252,
119
+ "PartialSimdFusion": 0.20722246170043945,
120
+ "PenguinizeFunctions": 0.00015999999595806003,
121
+ "PerfectLoopNest": 0.06273055076599121,
122
+ "PruneFunctions": 0.00016700000560376793,
123
+ "RecognizeOpIdiom": 0.20455479621887207,
124
+ "Recompute": 0.00649714469909668,
125
+ "RelaxPredicates": 0.154876708984375,
126
+ "Rematerialization": 0.16764259338378906,
127
+ "RemoveOptimizationBarriers": 0.00014099999680183828,
128
+ "ReshapeWeights": 0.021569013595581055,
129
+ "ResolveAccessConflict": 0.24012255668640137,
130
+ "ResolveComplicatePredicates": 0.05034017562866211,
131
+ "RewriteReplicationMatmul": 0.04589343070983887,
132
+ "RewriteWeights": 0.05840659141540527,
133
+ "SFKVectorizer": 3.1227571964263916,
134
+ "ScatterMotion": 0.0041600000113248825,
135
+ "SimpleAllReduceTiling": 0.06594347953796387,
136
+ "Simplifier": 0.11366057395935059,
137
+ "SimplifyMacroPredicates": 0.18840670585632324,
138
+ "SimplifyNeuronTensor": 1.3299446105957031,
139
+ "SimplifySlice": 0.03386688232421875,
140
+ "SimplifyTensor": 0.21405529975891113,
141
+ "SpillPSum": 0.5441117286682129,
142
+ "SplitAPUnionSets": 0.3313255310058594,
143
+ "SplitAccGrp": 0.03839588165283203,
144
+ "StaticProfiler": 0.13296246528625488,
145
+ "StaticTransposeLocalTensor": 0.21724367141723633,
146
+ "SundaISel": 1.6302134990692139,
147
+ "TCTransform": 0.03438615798950195,
148
+ "TensorInitialization": 0.13414645195007324,
149
+ "TensorOpSimplifier": 0.27712535858154297,
150
+ "TensorOpTransform": 0.8646912574768066,
151
+ "TensorizerLegalizationPass": 0.000155999994603917,
152
+ "TileCCOps": 0.263721227645874,
153
+ "TilingProfiler": 0.39296984672546387,
154
+ "TransformConvOp": 0.06336498260498047,
155
+ "TritiumFusion": 1.0901517868041992,
156
+ "ValueNumbering": 0.09328150749206543,
157
+ "VectorizeDMA": 0.03394460678100586,
158
+ "VectorizeMatMult": 0.0209348201751709,
159
+ "VerifySupportedOps": 0.00023200000578071922,
160
+ "WeightCoalescing": 0.05484199523925781,
161
+ "ZeroSizeTensorElimination": 0.0004336833953857422,
162
+ "algsimp": 0.0020280000753700733,
163
+ "batchnorm_expander": 0.0007249999907799065,
164
+ "boundary-marker-removal": 0.0004140000091865659,
165
+ "call-inliner": 0.0002570000069681555,
166
+ "canonicalize-boundary-marker": 0.00044800000614486635,
167
+ "collective-stream-id-checker": 7.000000186963007e-05,
168
+ "comparison-expander": 0.00041700000292621553,
169
+ "computation-deduplicator": 0.0004440000047907233,
170
+ "conditional-to-select": 8.70000003487803e-05,
171
+ "config-lowering": 0.00020700000459328294,
172
+ "constant_folding": 0.00016900000628083944,
173
+ "cse": 0.00043799998820759356,
174
+ "dce": 3.899999865097925e-05,
175
+ "dynamic-slice-transpose": 0.00015799999528098851,
176
+ "eliminate-redundant-compare": 0.0001539999939268455,
177
+ "emit-offloaded-dropout": 0.0002770000137388706,
178
+ "flatten-call-graph": 0.000299000006634742,
179
+ "fuse-send-recv": 0.0015030000358819962,
180
+ "hilo::LegalizeAlias": 0.003281000070273876,
181
+ "hilo::NeuronInstCombine": 0.0011020000092685223,
182
+ "hilo::NeuronOpFusion": 0.0003429999924264848,
183
+ "hilo::ReplaceTokenTypeWithU8Pass": 0.00018600000475998968,
184
+ "hilo::ScheduleFusion": 3.5000000934815034e-05,
185
+ "hilo::SixtyFourHack": 0.00020599999697878957,
186
+ "hilo::VerifyAliasing": 7.000000186963007e-05,
187
+ "hlo-mac-count": 0.0006559999892488122,
188
+ "hlo-verifier": 0.006031000055372715,
189
+ "io-con-pipe-begin": 4.999999873689376e-06,
190
+ "io-con-pipe-end": 9.999999974752427e-07,
191
+ "io-layout-normalization": 0.0009500000160187483,
192
+ "legalize-ccops": 1.700000029813964e-05,
193
+ "legalize-compare": 0.00036899998667649925,
194
+ "lower-argminmax-custom-call": 0.00013800000306218863,
195
+ "map-inline": 0.0006319999811239541,
196
+ "metadata-naming": 0.0009749999735504389,
197
+ "mlir::detail::OpToOpPassAdaptor": 0.00022499999613501132,
198
+ "mlir::hlo::MhloToPyPenguin": 0.025104999542236328,
199
+ "mlir::mhlo::LowerComplexExtraPass": 0.002770999912172556,
200
+ "mlir::mhlo::LowerComplexPass": 0.001180000021122396,
201
+ "native-to-custom-softmax": 0.00041199999395757914,
202
+ "native-to-custom-softmax-dx": 0.00042600001324899495,
203
+ "operand_upcaster": 0.0007089999853633344,
204
+ "post-par-pipe-begin": 9.999999974752427e-07,
205
+ "post-par-pipe-end": 0.0,
206
+ "post-partition-simplification": 0.05639899894595146,
207
+ "pre-hlo-begin": 4.999999873689376e-06,
208
+ "pre-hlo-end": 9.999999974752427e-07,
209
+ "replace-minimum-constant": 0.0002209999947808683,
210
+ "reshape-mover": 7.400000322377309e-05,
211
+ "simplify-concat": 0.0018210000125691295,
212
+ "simplify-while-loops": 5.500000042957254e-05,
213
+ "transform-variadic-reduce": 0.0006440000142902136,
214
+ "tuple-simplifier": 0.00016700000560376793,
215
+ "unpack-nested-aws-ntwsr": 0.00035700001171790063,
216
+ "unroll-while-loop": 1.1000000085914508e-05
217
+ },
218
+ "hilo": {
219
+ "HloMacCount": 3802996736.0,
220
+ "Traffic": 8267154432.0
221
+ },
222
+ "tensorizer": {
223
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 262321,
224
+ "StaticProfiler::AifUb": 10.559271812438965,
225
+ "StaticProfiler::ArithmeticIntensityTensorizer": 11.613152503967285,
226
+ "StaticProfiler::AverageDmaLength": 6652.8759765625,
227
+ "StaticProfiler::DDRTransferBytes": 7587185496,
228
+ "StaticProfiler::InternalTransferBytes": 632323092,
229
+ "StaticProfiler::LoadExpanded": 1033407,
230
+ "StaticProfiler::StoreExpanded": 3422,
231
+ "StaticProfiler::TotalDMAExpanded": 1036829,
232
+ "StaticProfiler::TotalDynamicInstancesCount": 275548,
233
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 274994,
234
+ "StaticProfiler::TotalLNCComm": 0,
235
+ "StaticProfiler::TotalLNCCommTransfer": 0,
236
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0,
237
+ "TilingProfiler::DmaInstructionsAfterTiling": 0,
238
+ "TilingProfiler::GenericInstructionsAfterTiling": 79,
239
+ "TilingProfiler::MatMultInstructionsAfterTiling": 231408,
240
+ "TilingProfiler::NumPfTransposes": 398,
241
+ "TilingProfiler::NumPfTransposesForIo": 37,
242
+ "TilingProfiler::NumPfTransposesForLocal": 216,
243
+ "TilingProfiler::NumPfTransposesForNonlocal": 145,
244
+ "TilingProfiler::PfTransposeInstructions": 19513,
245
+ "TilingProfiler::PfTransposeInstructionsForIo": 19152,
246
+ "TilingProfiler::PfTransposeInstructionsForLocal": 216,
247
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 145,
248
+ "TilingProfiler::ReduceInstructionsAfterTiling": 74,
249
+ "TilingProfiler::SimdInstructionsAfterTiling": 2999,
250
+ "TilingProfiler::TotalInstructionsAfterTiling": 0,
251
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0,
252
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0,
253
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0,
254
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0,
255
+ "TransformConvOp::conv2d_column_packing": 0,
256
+ "TransformConvOp::conv2d_column_packing_1": 0,
257
+ "TransformConvOp::conv2d_column_packing_io10": 0,
258
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0
259
+ }
260
+ },
261
+ "all": {
262
+ "compiletime": {
263
+ "CanonicalizeConv": 9.999999974752427e-07,
264
+ "CanonicalizeForTensorizer": 0.0003640000068116933,
265
+ "Canonicalizer": 0.007044999860227108,
266
+ "HoistCompute": 4.8000001697801054e-05,
267
+ "IdentifyCrossPassTensors": 0.00013600000238511711,
268
+ "MemcastMotion": 0.00013000000035390258,
269
+ "PenguinizeFunctions": 0.00015999999595806003,
270
+ "PruneFunctions": 0.00016700000560376793,
271
+ "RemoveOptimizationBarriers": 0.00014099999680183828,
272
+ "ScatterMotion": 0.0041600000113248825,
273
+ "TensorizerLegalizationPass": 0.000155999994603917,
274
+ "VerifySupportedOps": 0.00023200000578071922,
275
+ "algsimp": 0.0020280000753700733,
276
+ "batchnorm_expander": 0.0007249999907799065,
277
+ "boundary-marker-removal": 0.0004140000091865659,
278
+ "call-inliner": 0.0002570000069681555,
279
+ "canonicalize-boundary-marker": 0.00044800000614486635,
280
+ "collective-stream-id-checker": 7.000000186963007e-05,
281
+ "comparison-expander": 0.00041700000292621553,
282
+ "computation-deduplicator": 0.0004440000047907233,
283
+ "conditional-to-select": 8.70000003487803e-05,
284
+ "config-lowering": 0.00020700000459328294,
285
+ "constant_folding": 0.00016900000628083944,
286
+ "cse": 0.00043799998820759356,
287
+ "dce": 3.899999865097925e-05,
288
+ "dynamic-slice-transpose": 0.00015799999528098851,
289
+ "eliminate-redundant-compare": 0.0001539999939268455,
290
+ "emit-offloaded-dropout": 0.0002770000137388706,
291
+ "flatten-call-graph": 0.000299000006634742,
292
+ "fuse-send-recv": 0.0015030000358819962,
293
+ "hilo::LegalizeAlias": 0.003281000070273876,
294
+ "hilo::NeuronInstCombine": 0.0011020000092685223,
295
+ "hilo::NeuronOpFusion": 0.0003429999924264848,
296
+ "hilo::ReplaceTokenTypeWithU8Pass": 0.00018600000475998968,
297
+ "hilo::ScheduleFusion": 3.5000000934815034e-05,
298
+ "hilo::SixtyFourHack": 0.00020599999697878957,
299
+ "hilo::VerifyAliasing": 7.000000186963007e-05,
300
+ "hlo-mac-count": 0.0006559999892488122,
301
+ "hlo-verifier": 0.006031000055372715,
302
+ "io-con-pipe-begin": 4.999999873689376e-06,
303
+ "io-con-pipe-end": 9.999999974752427e-07,
304
+ "io-layout-normalization": 0.0009500000160187483,
305
+ "legalize-ccops": 1.700000029813964e-05,
306
+ "legalize-compare": 0.00036899998667649925,
307
+ "lower-argminmax-custom-call": 0.00013800000306218863,
308
+ "map-inline": 0.0006319999811239541,
309
+ "metadata-naming": 0.0009749999735504389,
310
+ "mlir::detail::OpToOpPassAdaptor": 0.00022499999613501132,
311
+ "mlir::hlo::MhloToPyPenguin": 0.025104999542236328,
312
+ "mlir::mhlo::LowerComplexExtraPass": 0.002770999912172556,
313
+ "mlir::mhlo::LowerComplexPass": 0.001180000021122396,
314
+ "native-to-custom-softmax": 0.00041199999395757914,
315
+ "native-to-custom-softmax-dx": 0.00042600001324899495,
316
+ "operand_upcaster": 0.0007089999853633344,
317
+ "post-par-pipe-begin": 9.999999974752427e-07,
318
+ "post-par-pipe-end": 0.0,
319
+ "post-partition-simplification": 0.05639899894595146,
320
+ "pre-hlo-begin": 4.999999873689376e-06,
321
+ "pre-hlo-end": 9.999999974752427e-07,
322
+ "replace-minimum-constant": 0.0002209999947808683,
323
+ "reshape-mover": 7.400000322377309e-05,
324
+ "simplify-concat": 0.0018210000125691295,
325
+ "simplify-while-loops": 5.500000042957254e-05,
326
+ "transform-variadic-reduce": 0.0006440000142902136,
327
+ "tuple-simplifier": 0.00016700000560376793,
328
+ "unpack-nested-aws-ntwsr": 0.00035700001171790063,
329
+ "unroll-while-loop": 1.1000000085914508e-05
330
+ }
331
+ },
332
+ "cumsum": {
333
+ "compiletime": {
334
+ "CoalesceCCOp": 0.0008378028869628906,
335
+ "DMALocalityOpt": 0.0003306865692138672,
336
+ "DMAProfiler": 0.0007596015930175781,
337
+ "DataStreaming": 0.0002918243408203125,
338
+ "DoNothing": 0.00012636184692382813,
339
+ "ExpandISAMacro": 0.0005497932434082031,
340
+ "FactorizeBlkDims": 0.0004723072052001953,
341
+ "InferPSumTensor": 0.000583648681640625,
342
+ "LateLegalizeInst": 0.00040459632873535156,
343
+ "LateNeuronInstComb": 0.0004837512969970703,
344
+ "LegalizeSundaAccess": 0.0015611648559570313,
345
+ "LegalizeType": 0.00025010108947753906,
346
+ "LowerBroadcast": 0.0009808540344238281,
347
+ "LowerIntrinsics": 0.0002262592315673828,
348
+ "LowerTranspose": 0.00021767616271972656,
349
+ "NeuronInstComb": 0.0004963874816894531,
350
+ "NeuronLICM": 0.0006859302520751953,
351
+ "NeuronSimplifyPredicates": 0.002815723419189453,
352
+ "NeuronValueNumbering": 0.0004124641418457031,
353
+ "SFKVectorizer": 0.0027742385864257813,
354
+ "SimpleAllReduceTiling": 0.000209808349609375,
355
+ "SimplifyNeuronTensor": 0.00040721893310546875,
356
+ "SpillPSum": 0.0009286403656005859,
357
+ "WeightCoalescing": 0.0002105236053466797
358
+ }
359
+ },
360
+ "sg00": {
361
+ "hilo": {
362
+ "ArithmeticIntensity": 0.9200255870819092,
363
+ "HloMacCount": 3802996736.0,
364
+ "Traffic": 8267154432.0
365
+ }
366
+ },
367
+ "sg0000": {
368
+ "compiletime": {
369
+ "AGOrderingAnalysisPass": 1.4457588195800781,
370
+ "AffinePredicateResolution": 0.05167531967163086,
371
+ "AliasDependencyElimination": 0.0026276111602783203,
372
+ "AliasDependencyInduction": 0.44934630393981934,
373
+ "AliasDependencyReset": 1.2677826881408691,
374
+ "BFComputeCutting": 0.06423807144165039,
375
+ "BirCodeGenLoop": 2.421293258666992,
376
+ "CCOpFusion": 0.41050028800964355,
377
+ "CanonicalizeDAGForPGTiling": 0.21233797073364258,
378
+ "CanonicalizeIR": 0.06626629829406738,
379
+ "CoalesceCCOp": 0.1906270980834961,
380
+ "CommuteConcat": 0.03319668769836426,
381
+ "DMALocalityOpt": 0.03487658500671387,
382
+ "DMAProfiler": 0.08790731430053711,
383
+ "DMATilingProfiler": 0.07109546661376953,
384
+ "DataLocalityOpt": 1.910703182220459,
385
+ "DataStreaming": 0.15360140800476074,
386
+ "DeConcat": 0.012087583541870117,
387
+ "DeadCodeElimination": 0.035611867904663086,
388
+ "DeadStoreElimination": 0.37193870544433594,
389
+ "DelinearIndices": 0.2894127368927002,
390
+ "Delinearization": 0.1295926570892334,
391
+ "DoNothing": 6.914138793945313e-05,
392
+ "DramToDramTranspose": 1.0679569244384766,
393
+ "DumpGraphAndMetadata": 0.24142217636108398,
394
+ "EliminateDivs": 0.17337489128112793,
395
+ "ExpandBatchNorm": 0.06027984619140625,
396
+ "ExpandISAMacro": 0.09040713310241699,
397
+ "FactorizeBlkDims": 0.24898743629455566,
398
+ "FactorizeThreadAxesInFreeDims": 0.03613853454589844,
399
+ "FlattenMacroLoop": 0.26774168014526367,
400
+ "GenericAccessSimplifier": 0.03175926208496094,
401
+ "InferInitValue": 1.029360294342041,
402
+ "InferIntrinsicOnCC": 0.34307408332824707,
403
+ "InferNeuronTensor": 1.7935998439788818,
404
+ "InferNonlocalTensors": 3.6307339668273926,
405
+ "InferPSumTensor": 0.977715015411377,
406
+ "InlineNativeKernels": 0.05374264717102051,
407
+ "InsertIOTransposes": 1.162278652191162,
408
+ "InsertLocalTransposes": 1.0349645614624023,
409
+ "InsertOffloadedTransposes": 0.0943443775177002,
410
+ "LICM": 0.1061861515045166,
411
+ "LateLegalizeInst": 0.22713756561279297,
412
+ "LateLegalizePostSplit": 0.09247255325317383,
413
+ "LateLowerReshapeOp": 0.04053616523742676,
414
+ "LateLowerTensorOp": 0.3356895446777344,
415
+ "LateNeuronInstComb": 0.45120882987976074,
416
+ "LayoutPreprocessing": 0.9441671371459961,
417
+ "LayoutPreprocessingAndAnalysis": 1.2680203914642334,
418
+ "LayoutRequirementAnalysis": 0.309098482131958,
419
+ "LegalizeCCOpLayout": 0.07318258285522461,
420
+ "LegalizeOpLevelAlias": 0.03343796730041504,
421
+ "LegalizePartitionReduce": 0.034781694412231445,
422
+ "LegalizeSundaAccess": 1.4543089866638184,
423
+ "LegalizeSundaMacro": 0.37755250930786133,
424
+ "LegalizeType": 0.20833444595336914,
425
+ "LocalLayoutOpt": 0.36218762397766113,
426
+ "LoopFusion": 0.31240200996398926,
427
+ "LoopSplitting": 0.013066768646240234,
428
+ "LowerBroadcast": 0.04690980911254883,
429
+ "LowerCCOpBlockAxis": 0.23094987869262695,
430
+ "LowerComplexBroadcast": 0.15572404861450195,
431
+ "LowerIntrinsics": 1.2286322116851807,
432
+ "LowerTensorOp": 0.4897449016571045,
433
+ "LowerTranspose": 0.39931535720825195,
434
+ "MacroGeneration": 2.335334062576294,
435
+ "MaskPropagation": 0.14433836936950684,
436
+ "MemcpyElimination": 3.9867260456085205,
437
+ "MutateDataType": 0.04344511032104492,
438
+ "NeuronAliasDependencyInduction": 0.025929927825927734,
439
+ "NeuronAliasDependencyReset": 0.04254412651062012,
440
+ "NeuronInstComb": 0.19301342964172363,
441
+ "NeuronLICM": 0.2890663146972656,
442
+ "NeuronLoopFusion": 0.4089043140411377,
443
+ "NeuronLoopInterchange": 0.04476189613342285,
444
+ "NeuronSimplifier": 0.30055856704711914,
445
+ "NeuronSimplifyPredicates": 0.1793985366821289,
446
+ "NeuronValueNumbering": 0.10622000694274902,
447
+ "OptimizeAliasedCopyChain": 0.01511383056640625,
448
+ "OptimizeNKIKernels": 0.4606451988220215,
449
+ "PAGLayoutOpt": 26.32272720336914,
450
+ "PComputeCutting": 0.302201509475708,
451
+ "PGLayoutTilingPipeline": 38.88710403442383,
452
+ "PGTiling": 4.423768043518066,
453
+ "PadElimination": 0.008622884750366211,
454
+ "ParAxesAnnotation": 25.272018432617188,
455
+ "PartialLoopFusion": 0.2368309497833252,
456
+ "PartialSimdFusion": 0.20722246170043945,
457
+ "PerfectLoopNest": 0.06273055076599121,
458
+ "RecognizeOpIdiom": 0.20455479621887207,
459
+ "Recompute": 0.00649714469909668,
460
+ "RelaxPredicates": 0.154876708984375,
461
+ "Rematerialization": 0.16764259338378906,
462
+ "ReshapeWeights": 0.021569013595581055,
463
+ "ResolveAccessConflict": 0.24012255668640137,
464
+ "ResolveComplicatePredicates": 0.05034017562866211,
465
+ "RewriteReplicationMatmul": 0.04589343070983887,
466
+ "RewriteWeights": 0.05840659141540527,
467
+ "SFKVectorizer": 3.119982957839966,
468
+ "SimpleAllReduceTiling": 0.06573367118835449,
469
+ "Simplifier": 0.11366057395935059,
470
+ "SimplifyMacroPredicates": 0.18840670585632324,
471
+ "SimplifyNeuronTensor": 1.3295373916625977,
472
+ "SimplifySlice": 0.03386688232421875,
473
+ "SimplifyTensor": 0.21405529975891113,
474
+ "SpillPSum": 0.5431830883026123,
475
+ "SplitAPUnionSets": 0.3313255310058594,
476
+ "SplitAccGrp": 0.03839588165283203,
477
+ "StaticProfiler": 0.13296246528625488,
478
+ "StaticTransposeLocalTensor": 0.21724367141723633,
479
+ "SundaISel": 1.6302134990692139,
480
+ "TCTransform": 0.03438615798950195,
481
+ "TensorInitialization": 0.13414645195007324,
482
+ "TensorOpSimplifier": 0.27712535858154297,
483
+ "TensorOpTransform": 0.8646912574768066,
484
+ "TileCCOps": 0.263721227645874,
485
+ "TilingProfiler": 0.39296984672546387,
486
+ "TransformConvOp": 0.06336498260498047,
487
+ "TritiumFusion": 1.0901517868041992,
488
+ "ValueNumbering": 0.09328150749206543,
489
+ "VectorizeDMA": 0.03394460678100586,
490
+ "VectorizeMatMult": 0.0209348201751709,
491
+ "WeightCoalescing": 0.05463147163391113,
492
+ "ZeroSizeTensorElimination": 0.0004336833953857422
493
+ },
494
+ "tensorizer": {
495
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 262321,
496
+ "StaticProfiler::AifUb": 10.559271812438965,
497
+ "StaticProfiler::ArithmeticIntensityTensorizer": 11.613152503967285,
498
+ "StaticProfiler::AverageDmaLength": 6652.8759765625,
499
+ "StaticProfiler::AverageFractalPeUtilization": 99.8321762084961,
500
+ "StaticProfiler::AveragePartitionUtilization": 99.3888168334961,
501
+ "StaticProfiler::AveragePeUtilization": 99.65400695800781,
502
+ "StaticProfiler::DDRTransferBytes": 7587185496,
503
+ "StaticProfiler::InternalTransferBytes": 632323092,
504
+ "StaticProfiler::LoadExpanded": 1033407,
505
+ "StaticProfiler::LocalizationEfficiency": 109.9806137084961,
506
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 110.06793212890625,
507
+ "StaticProfiler::StoreExpanded": 3422,
508
+ "StaticProfiler::TotalDMAExpanded": 1036829,
509
+ "StaticProfiler::TotalDynamicInstancesCount": 275548,
510
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 274994,
511
+ "StaticProfiler::TotalLNCComm": 0,
512
+ "StaticProfiler::TotalLNCCommTransfer": 0,
513
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0,
514
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0,
515
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0,
516
+ "TilingProfiler::DmaInstructionsAfterTiling": 0,
517
+ "TilingProfiler::GenericInstructionsAfterTiling": 79,
518
+ "TilingProfiler::MatMultInstructionsAfterTiling": 231408,
519
+ "TilingProfiler::NumPfTransposes": 398,
520
+ "TilingProfiler::NumPfTransposesForIo": 37,
521
+ "TilingProfiler::NumPfTransposesForLocal": 216,
522
+ "TilingProfiler::NumPfTransposesForNonlocal": 145,
523
+ "TilingProfiler::PfTransposeInstructions": 19513,
524
+ "TilingProfiler::PfTransposeInstructionsForIo": 19152,
525
+ "TilingProfiler::PfTransposeInstructionsForLocal": 216,
526
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 145,
527
+ "TilingProfiler::ReduceInstructionsAfterTiling": 74,
528
+ "TilingProfiler::SimdInstructionsAfterTiling": 2999,
529
+ "TilingProfiler::TotalInstructionsAfterTiling": 0,
530
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0,
531
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0,
532
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0,
533
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0,
534
+ "TransformConvOp::conv2d_column_packing": 0,
535
+ "TransformConvOp::conv2d_column_packing_1": 0,
536
+ "TransformConvOp::conv2d_column_packing_io10": 0,
537
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0
538
+ }
539
+ }
540
+ }
token_generation_model/_tp0_bk0/graph.neff ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82be447a0a308a6e83990d1f3d193b4dc43ab835b136e7c27647ecf6cde94383
3
+ size 6001664
token_generation_model/_tp0_bk0/log-neuron-cc.txt ADDED
The diff for this file is too large to render. See raw diff
 
token_generation_model/_tp0_bk0/metaneff.pb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f42b279a662fc21e6bb94ab8bdb96ad553535cec385b6c8909a4e7622fad939
3
+ size 985283