dimabr committed on
Commit
e8d3353
·
verified ·
1 Parent(s): e56438a

Add files using upload-large-folder tool

Browse files
Files changed (49) hide show
  1. README.md +200 -0
  2. context_encoding_model/_tp0_bk0/command.txt +1 -0
  3. context_encoding_model/_tp0_bk0/compile_flags.MODULE_c6824be80aab0b095843+cc19d8a1.json +1 -0
  4. context_encoding_model/_tp0_bk0/global_metric_store.json +1079 -0
  5. context_encoding_model/_tp0_bk0/log-neuron-cc.txt +0 -0
  6. context_encoding_model/_tp0_bk0/neuron_config.json +213 -0
  7. context_encoding_model/_tp0_bk1/command.txt +1 -0
  8. context_encoding_model/_tp0_bk1/compile_flags.MODULE_68c159ab1fef44a40212+6a9a7e72.json +1 -0
  9. context_encoding_model/_tp0_bk1/global_metric_store.json +1079 -0
  10. context_encoding_model/_tp0_bk1/log-neuron-cc.txt +0 -0
  11. context_encoding_model/_tp0_bk1/neuron_config.json +213 -0
  12. context_encoding_model/_tp0_bk2/command.txt +1 -0
  13. context_encoding_model/_tp0_bk2/compile_flags.MODULE_78e5291800ea5b96a03b+442879bd.json +1 -0
  14. context_encoding_model/_tp0_bk2/global_metric_store.json +1079 -0
  15. context_encoding_model/_tp0_bk2/log-neuron-cc.txt +0 -0
  16. context_encoding_model/_tp0_bk2/neuron_config.json +213 -0
  17. context_encoding_model/_tp0_bk3/command.txt +1 -0
  18. context_encoding_model/_tp0_bk3/log-neuron-cc.txt +0 -0
  19. generation_config.json +6 -0
  20. neuron_config.json +211 -0
  21. params.json +11 -0
  22. special_tokens_map.json +23 -0
  23. token_generation_model/_tp0_bk0/command.txt +1 -0
  24. token_generation_model/_tp0_bk0/compile_flags.MODULE_67d3774d5bacfe6ba851+72d461cc.json +1 -0
  25. token_generation_model/_tp0_bk0/global_metric_store.json +540 -0
  26. token_generation_model/_tp0_bk0/log-neuron-cc.txt +0 -0
  27. token_generation_model/_tp0_bk0/neuron_config.json +213 -0
  28. token_generation_model/_tp0_bk1/command.txt +1 -0
  29. token_generation_model/_tp0_bk1/compile_flags.MODULE_92bbfea7801df2fea75e+4948da29.json +1 -0
  30. token_generation_model/_tp0_bk1/global_metric_store.json +540 -0
  31. token_generation_model/_tp0_bk1/log-neuron-cc.txt +0 -0
  32. token_generation_model/_tp0_bk1/neuron_config.json +213 -0
  33. token_generation_model/_tp0_bk2/command.txt +1 -0
  34. token_generation_model/_tp0_bk2/compile_flags.MODULE_2f686dc6ba7ef3326a56+6113de8c.json +1 -0
  35. token_generation_model/_tp0_bk2/global_metric_store.json +540 -0
  36. token_generation_model/_tp0_bk2/log-neuron-cc.txt +0 -0
  37. token_generation_model/_tp0_bk2/neuron_config.json +213 -0
  38. token_generation_model/_tp0_bk3/command.txt +1 -0
  39. token_generation_model/_tp0_bk3/compile_flags.MODULE_668122c92a86c0ce6817+f94fe8ed.json +1 -0
  40. token_generation_model/_tp0_bk3/global_metric_store.json +540 -0
  41. token_generation_model/_tp0_bk3/log-neuron-cc.txt +0 -0
  42. token_generation_model/_tp0_bk3/neuron_config.json +213 -0
  43. token_generation_model/_tp0_bk4/command.txt +1 -0
  44. token_generation_model/_tp0_bk4/compile_flags.MODULE_fb6decaa94b1936d08da+1b5847e3.json +1 -0
  45. token_generation_model/_tp0_bk4/global_metric_store.json +540 -0
  46. token_generation_model/_tp0_bk4/log-neuron-cc.txt +0 -0
  47. token_generation_model/_tp0_bk4/neuron_config.json +213 -0
  48. tokenizer.json +0 -0
  49. tokenizer_config.json +0 -0
README.md ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: vllm
3
+ license: apache-2.0
4
+ base_model: mistralai/Mistral-7B-v0.3
5
+ extra_gated_description: >-
6
+ If you want to learn more about how we process your personal data, please read
7
+ our <a href="https://mistral.ai/terms/">Privacy Policy</a>.
8
+ tags:
9
+ - vllm
10
+ - mistral-common
11
+ ---
12
+
13
+ # Model Card for Mistral-7B-Instruct-v0.3
14
+
15
+ The Mistral-7B-Instruct-v0.3 Large Language Model (LLM) is an instruct fine-tuned version of the Mistral-7B-v0.3.
16
+
17
+ Mistral-7B-v0.3 has the following changes compared to [Mistral-7B-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2):
18
+ - Extended vocabulary to 32768
19
+ - Supports v3 Tokenizer
20
+ - Supports function calling
21
+
22
+ ## Installation
23
+
24
+ It is recommended to use `mistralai/Mistral-7B-Instruct-v0.3` with [mistral-inference](https://github.com/mistralai/mistral-inference). For HF transformers code snippets, please keep scrolling.
25
+
26
+ ```
27
+ pip install mistral_inference
28
+ ```
29
+
30
+ ## Download
31
+
32
+ ```py
33
+ from huggingface_hub import snapshot_download
34
+ from pathlib import Path
35
+
36
+ mistral_models_path = Path.home().joinpath('mistral_models', '7B-Instruct-v0.3')
37
+ mistral_models_path.mkdir(parents=True, exist_ok=True)
38
+
39
+ snapshot_download(repo_id="mistralai/Mistral-7B-Instruct-v0.3", allow_patterns=["params.json", "consolidated.safetensors", "tokenizer.model.v3"], local_dir=mistral_models_path)
40
+ ```
41
+
42
+ ### Chat
43
+
44
+ After installing `mistral_inference`, a `mistral-chat` CLI command should be available in your environment. You can chat with the model using
45
+
46
+ ```
47
+ mistral-chat $HOME/mistral_models/7B-Instruct-v0.3 --instruct --max_tokens 256
48
+ ```
49
+
50
+ ### Instruction following
51
+
52
+ ```py
53
+ from mistral_inference.transformer import Transformer
54
+ from mistral_inference.generate import generate
55
+
56
+ from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
57
+ from mistral_common.protocol.instruct.messages import UserMessage
58
+ from mistral_common.protocol.instruct.request import ChatCompletionRequest
59
+
60
+
61
+ tokenizer = MistralTokenizer.from_file(f"{mistral_models_path}/tokenizer.model.v3")
62
+ model = Transformer.from_folder(mistral_models_path)
63
+
64
+ completion_request = ChatCompletionRequest(messages=[UserMessage(content="Explain Machine Learning to me in a nutshell.")])
65
+
66
+ tokens = tokenizer.encode_chat_completion(completion_request).tokens
67
+
68
+ out_tokens, _ = generate([tokens], model, max_tokens=64, temperature=0.0, eos_id=tokenizer.instruct_tokenizer.tokenizer.eos_id)
69
+ result = tokenizer.instruct_tokenizer.tokenizer.decode(out_tokens[0])
70
+
71
+ print(result)
72
+ ```
73
+
74
+ ### Function calling
75
+
76
+ ```py
77
+ from mistral_common.protocol.instruct.tool_calls import Function, Tool
78
+ from mistral_inference.transformer import Transformer
79
+ from mistral_inference.generate import generate
80
+
81
+ from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
82
+ from mistral_common.protocol.instruct.messages import UserMessage
83
+ from mistral_common.protocol.instruct.request import ChatCompletionRequest
84
+
85
+
86
+ tokenizer = MistralTokenizer.from_file(f"{mistral_models_path}/tokenizer.model.v3")
87
+ model = Transformer.from_folder(mistral_models_path)
88
+
89
+ completion_request = ChatCompletionRequest(
90
+ tools=[
91
+ Tool(
92
+ function=Function(
93
+ name="get_current_weather",
94
+ description="Get the current weather",
95
+ parameters={
96
+ "type": "object",
97
+ "properties": {
98
+ "location": {
99
+ "type": "string",
100
+ "description": "The city and state, e.g. San Francisco, CA",
101
+ },
102
+ "format": {
103
+ "type": "string",
104
+ "enum": ["celsius", "fahrenheit"],
105
+ "description": "The temperature unit to use. Infer this from the users location.",
106
+ },
107
+ },
108
+ "required": ["location", "format"],
109
+ },
110
+ )
111
+ )
112
+ ],
113
+ messages=[
114
+ UserMessage(content="What's the weather like today in Paris?"),
115
+ ],
116
+ )
117
+
118
+ tokens = tokenizer.encode_chat_completion(completion_request).tokens
119
+
120
+ out_tokens, _ = generate([tokens], model, max_tokens=64, temperature=0.0, eos_id=tokenizer.instruct_tokenizer.tokenizer.eos_id)
121
+ result = tokenizer.instruct_tokenizer.tokenizer.decode(out_tokens[0])
122
+
123
+ print(result)
124
+ ```
125
+
126
+ ## Generate with `transformers`
127
+
128
+ If you want to use Hugging Face `transformers` to generate text, you can do something like this.
129
+
130
+ ```py
131
+ from transformers import pipeline
132
+
133
+ messages = [
134
+ {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
135
+ {"role": "user", "content": "Who are you?"},
136
+ ]
137
+ chatbot = pipeline("text-generation", model="mistralai/Mistral-7B-Instruct-v0.3")
138
+ chatbot(messages)
139
+ ```
140
+
141
+
142
+ ## Function calling with `transformers`
143
+
144
+ To use this example, you'll need `transformers` version 4.42.0 or higher. Please see the
145
+ [function calling guide](https://huggingface.co/docs/transformers/main/chat_templating#advanced-tool-use--function-calling)
146
+ in the `transformers` docs for more information.
147
+
148
+ ```python
149
+ from transformers import AutoModelForCausalLM, AutoTokenizer
150
+ import torch
151
+
152
+ model_id = "mistralai/Mistral-7B-Instruct-v0.3"
153
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
154
+
155
+ def get_current_weather(location: str, format: str):
156
+ """
157
+ Get the current weather
158
+
159
+ Args:
160
+ location: The city and state, e.g. San Francisco, CA
161
+ format: The temperature unit to use. Infer this from the users location. (choices: ["celsius", "fahrenheit"])
162
+ """
163
+ pass
164
+
165
+ conversation = [{"role": "user", "content": "What's the weather like in Paris?"}]
166
+ tools = [get_current_weather]
167
+
168
+
169
+ # format and tokenize the tool use prompt
170
+ inputs = tokenizer.apply_chat_template(
171
+ conversation,
172
+ tools=tools,
173
+ add_generation_prompt=True,
174
+ return_dict=True,
175
+ return_tensors="pt",
176
+ )
177
+
178
+ model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")
179
+
180
+ inputs.to(model.device)
181
+ outputs = model.generate(**inputs, max_new_tokens=1000)
182
+ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
183
+ ```
184
+
185
+ Note that, for reasons of space, this example does not show a complete cycle of calling a tool and adding the tool call and tool
186
+ results to the chat history so that the model can use them in its next generation. For a full tool calling example, please
187
+ see the [function calling guide](https://huggingface.co/docs/transformers/main/chat_templating#advanced-tool-use--function-calling),
188
+ and note that Mistral **does** use tool call IDs, so these must be included in your tool calls and tool results. They should be
189
+ exactly 9 alphanumeric characters.
190
+
191
+
192
+ ## Limitations
193
+
194
+ The Mistral 7B Instruct model is a quick demonstration that the base model can be easily fine-tuned to achieve compelling performance.
195
+ It does not have any moderation mechanisms. We're looking forward to engaging with the community on ways to
196
+ make the model respect guardrails in a fine-grained way, allowing for deployment in environments requiring moderated outputs.
197
+
198
+ ## The Mistral AI Team
199
+
200
+ Albert Jiang, Alexandre Sablayrolles, Alexis Tacnet, Antoine Roux, Arthur Mensch, Audrey Herblin-Stoop, Baptiste Bout, Baudouin de Monicault, Blanche Savary, Bam4d, Caroline Feldman, Devendra Singh Chaplot, Diego de las Casas, Eleonore Arcelin, Emma Bou Hanna, Etienne Metzger, Gianna Lengyel, Guillaume Bour, Guillaume Lample, Harizo Rajaona, Jean-Malo Delignon, Jia Li, Justus Murke, Louis Martin, Louis Ternon, Lucile Saulnier, Lélio Renard Lavaud, Margaret Jennings, Marie Pellat, Marie Torelli, Marie-Anne Lachaux, Nicolas Schuhl, Patrick von Platen, Pierre Stock, Sandeep Subramanian, Sophia Yang, Szymon Antoniak, Teven Le Scao, Thibaut Lavril, Timothée Lacroix, Théophile Gervet, Thomas Wang, Valera Nemychnikova, William El Sayed, William Marshall
context_encoding_model/_tp0_bk0/command.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ neuronx-cc compile --framework=XLA model.MODULE_c6824be80aab0b095843+cc19d8a1.hlo_module.pb --output model.MODULE_c6824be80aab0b095843+cc19d8a1.neff --target=trn1 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ' --lnc=1 -O1 '--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true' --logfile=log-neuron-cc.txt --verbose=35
context_encoding_model/_tp0_bk0/compile_flags.MODULE_c6824be80aab0b095843+cc19d8a1.json ADDED
@@ -0,0 +1 @@
 
 
1
+ ["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "--lnc=1", "-O1", "--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true", "--logfile=/models/mistral-7b-v0.3-instruct-neuronx/context_encoding_model/_tp0_bk0/log-neuron-cc.txt"]
context_encoding_model/_tp0_bk0/global_metric_store.json ADDED
@@ -0,0 +1,1079 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Average": {
3
+ "tensorizer": {
4
+ "StaticProfiler::AverageFractalPeUtilization": 99.61843872070313,
5
+ "StaticProfiler::AveragePartitionUtilization": 98.97808837890625,
6
+ "StaticProfiler::AveragePeUtilization": 98.48223876953125,
7
+ "StaticProfiler::LocalizationEfficiency": 97.97124481201172,
8
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 100.67024993896484,
9
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
10
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0
11
+ }
12
+ },
13
+ "Count": {
14
+ "tensorizer": {
15
+ "StaticProfiler::AverageFractalPeUtilization": 1.0,
16
+ "StaticProfiler::AveragePartitionUtilization": 1.0,
17
+ "StaticProfiler::AveragePeUtilization": 1.0,
18
+ "StaticProfiler::LocalizationEfficiency": 1.0,
19
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1.0,
20
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 1.0,
21
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 1.0
22
+ }
23
+ },
24
+ "Sum": {
25
+ "compiletime": {
26
+ "AGOrderingAnalysisPass": 0.04956221580505371,
27
+ "AffinePredicateResolution": 0.002319812774658203,
28
+ "AliasDependencyElimination": 0.00019097328186035156,
29
+ "AliasDependencyInduction": 0.01746201515197754,
30
+ "AliasDependencyReset": 0.04065680503845215,
31
+ "BFComputeCutting": 0.0013394355773925781,
32
+ "BirCodeGenLoop": 0.11406064033508301,
33
+ "CCOpFusion": 0.03461766242980957,
34
+ "CanonicalizeConv": 4.900000203633681e-05,
35
+ "CanonicalizeDAGForPGTiling": 0.004343748092651367,
36
+ "CanonicalizeForTensorizer": 4.600000102072954e-05,
37
+ "CanonicalizeIR": 0.003444671630859375,
38
+ "Canonicalizer": 0.0009449999779462814,
39
+ "CoalesceCCOp": 0.04709577560424805,
40
+ "CommuteConcat": 0.0010838508605957031,
41
+ "DMALocalityOpt": 0.002311229705810547,
42
+ "DMAProfiler": 0.006924152374267578,
43
+ "DMATilingProfiler": 0.004760265350341797,
44
+ "DataLocalityOpt": 0.11827898025512695,
45
+ "DataStreaming": 0.006251335144042969,
46
+ "DeConcat": 0.0006871223449707031,
47
+ "DeadCodeElimination": 0.0018725395202636719,
48
+ "DeadStoreElimination": 0.00615692138671875,
49
+ "DelinearIndices": 0.004624128341674805,
50
+ "Delinearization": 0.0023467540740966797,
51
+ "DoNothing": 0.0003032684326171875,
52
+ "DramToDramTranspose": 0.015562057495117188,
53
+ "DumpGraphAndMetadata": 0.021119117736816406,
54
+ "EliminateDivs": 0.0027742385864257813,
55
+ "ExpandBatchNorm": 0.0035254955291748047,
56
+ "ExpandISAMacro": 0.005174160003662109,
57
+ "FactorizeBlkDims": 0.00865316390991211,
58
+ "FactorizeThreadAxesInFreeDims": 0.0011937618255615234,
59
+ "FlattenMacroLoop": 0.002167940139770508,
60
+ "GenericAccessSimplifier": 0.0008442401885986328,
61
+ "HoistCompute": 7.000000096013537e-06,
62
+ "IdentifyCrossPassTensors": 3.899999865097925e-05,
63
+ "InferInitValue": 0.024956941604614258,
64
+ "InferIntrinsicOnCC": 0.015187978744506836,
65
+ "InferNeuronTensor": 0.01759958267211914,
66
+ "InferNonlocalTensors": 0.014631509780883789,
67
+ "InferPSumTensor": 0.03635692596435547,
68
+ "InlineNativeKernels": 0.0049402713775634766,
69
+ "InsertIOTransposes": 0.012220144271850586,
70
+ "InsertLocalTransposes": 0.0043489933013916016,
71
+ "InsertOffloadedTransposes": 0.0029261112213134766,
72
+ "LICM": 0.002863168716430664,
73
+ "LateLegalizeInst": 0.00691986083984375,
74
+ "LateLegalizePostSplit": 0.004354715347290039,
75
+ "LateLowerReshapeOp": 0.0013675689697265625,
76
+ "LateLowerTensorOp": 0.0017099380493164063,
77
+ "LateNeuronInstComb": 0.008911371231079102,
78
+ "LayoutPreprocessing": 0.024985551834106445,
79
+ "LayoutPreprocessingAndAnalysis": 0.05915069580078125,
80
+ "LayoutRequirementAnalysis": 0.004924297332763672,
81
+ "LegalizeCCOpLayout": 0.003811359405517578,
82
+ "LegalizeOpLevelAlias": 0.0012297630310058594,
83
+ "LegalizePartitionReduce": 0.001873016357421875,
84
+ "LegalizeSundaAccess": 0.06697678565979004,
85
+ "LegalizeSundaMacro": 0.010838985443115234,
86
+ "LegalizeType": 0.007873058319091797,
87
+ "LocalLayoutOpt": 0.01294255256652832,
88
+ "LoopFusion": 0.005301713943481445,
89
+ "LoopSplitting": 0.0006639957427978516,
90
+ "LowerBroadcast": 0.00147247314453125,
91
+ "LowerCCOpBlockAxis": 0.003854513168334961,
92
+ "LowerComplexBroadcast": 0.0023345947265625,
93
+ "LowerIntrinsics": 0.31359052658081055,
94
+ "LowerTensorOp": 0.013880491256713867,
95
+ "LowerTranspose": 0.007946014404296875,
96
+ "MacroGeneration": 0.04993605613708496,
97
+ "MaskPropagation": 0.0027527809143066406,
98
+ "MemcastMotion": 3.600000127335079e-05,
99
+ "MemcpyElimination": 0.02739572525024414,
100
+ "MutateDataType": 0.0013971328735351563,
101
+ "NeuronAliasDependencyInduction": 0.0002524852752685547,
102
+ "NeuronAliasDependencyReset": 0.046864986419677734,
103
+ "NeuronInstComb": 0.007117271423339844,
104
+ "NeuronLICM": 0.035208702087402344,
105
+ "NeuronLoopFusion": 0.008702993392944336,
106
+ "NeuronLoopInterchange": 0.0014033317565917969,
107
+ "NeuronSimplifier": 0.0068361759185791016,
108
+ "NeuronSimplifyPredicates": 0.006758928298950195,
109
+ "NeuronValueNumbering": 0.003446817398071289,
110
+ "OptimizeAliasedCopyChain": 0.00058746337890625,
111
+ "OptimizeNKIKernels": 0.5256326198577881,
112
+ "PAGLayoutOpt": 0.07949376106262207,
113
+ "PComputeCutting": 0.005449533462524414,
114
+ "PGLayoutTilingPipeline": 0.5191597938537598,
115
+ "PGTiling": 0.1754302978515625,
116
+ "PadElimination": 0.0005822181701660156,
117
+ "ParAxesAnnotation": 0.0432279109954834,
118
+ "PartialLoopFusion": 0.0034580230712890625,
119
+ "PartialSimdFusion": 0.003893136978149414,
120
+ "PenguinizeFunctions": 4.099999932805076e-05,
121
+ "PerfectLoopNest": 0.0017859935760498047,
122
+ "PruneFunctions": 3.300000025774352e-05,
123
+ "RecognizeOpIdiom": 0.003717660903930664,
124
+ "Recompute": 0.0006694793701171875,
125
+ "RelaxPredicates": 0.004067182540893555,
126
+ "Rematerialization": 0.002377748489379883,
127
+ "RemoveOptimizationBarriers": 4.199999966658652e-05,
128
+ "ReshapeWeights": 0.0010006427764892578,
129
+ "ResolveAccessConflict": 0.003880739212036133,
130
+ "ResolveComplicatePredicates": 0.0018019676208496094,
131
+ "RewriteReplicationMatmul": 0.0015823841094970703,
132
+ "RewriteWeights": 0.002522706985473633,
133
+ "SFKVectorizer": 0.12057852745056152,
134
+ "ScatterMotion": 3.999999989900971e-06,
135
+ "SimpleAllReduceTiling": 0.003117799758911133,
136
+ "Simplifier": 0.0035266876220703125,
137
+ "SimplifyMacroPredicates": 0.005109071731567383,
138
+ "SimplifyNeuronTensor": 0.2755560874938965,
139
+ "SimplifySlice": 0.0010409355163574219,
140
+ "SimplifyTensor": 0.025161027908325195,
141
+ "SpillPSum": 0.05051732063293457,
142
+ "SplitAPUnionSets": 0.013710737228393555,
143
+ "SplitAccGrp": 0.001070261001586914,
144
+ "StaticProfiler": 0.005475759506225586,
145
+ "StaticTransposeLocalTensor": 0.0038781166076660156,
146
+ "SundaISel": 0.04041314125061035,
147
+ "TCTransform": 0.0008709430694580078,
148
+ "TensorInitialization": 0.004585742950439453,
149
+ "TensorOpSimplifier": 0.005696535110473633,
150
+ "TensorOpTransform": 0.021961212158203125,
151
+ "TensorizerLegalizationPass": 5.199999577598646e-05,
152
+ "TileCCOps": 0.010190010070800781,
153
+ "TilingProfiler": 0.04765009880065918,
154
+ "TransformConvOp": 0.0030345916748046875,
155
+ "TritiumFusion": 0.07207250595092773,
156
+ "ValueNumbering": 0.002035379409790039,
157
+ "VectorizeDMA": 0.006017446517944336,
158
+ "VectorizeMatMult": 0.0027844905853271484,
159
+ "VerifySupportedOps": 3.199999991920777e-05,
160
+ "WeightCoalescing": 0.0033817291259765625,
161
+ "ZeroSizeTensorElimination": 0.00019311904907226563,
162
+ "algsimp": 0.002274000085890293,
163
+ "batchnorm_expander": 4.3000000005122274e-05,
164
+ "boundary-marker-removal": 1.2000000424450263e-05,
165
+ "call-inliner": 0.00041299997246824205,
166
+ "canonicalize-boundary-marker": 1.3999999282532372e-05,
167
+ "collective-stream-id-checker": 0.00012900000729132444,
168
+ "comparison-expander": 0.0003989999822806567,
169
+ "computation-deduplicator": 5.999999848427251e-05,
170
+ "conditional-to-select": 1.8999999156221747e-05,
171
+ "config-lowering": 0.00018099999579135329,
172
+ "constant-statistics": 0.00039400000241585076,
173
+ "constant_folding": 0.0002840000088326633,
174
+ "cse": 5.699999746866524e-05,
175
+ "dce": 7.900000491645187e-05,
176
+ "dot_decomposer": 0.0011419999646022916,
177
+ "dynamic-slice-transpose": 1.4999999621068127e-05,
178
+ "eliminate-redundant-compare": 0.0002559999702498317,
179
+ "emit-offloaded-dropout": 6.399999983841553e-05,
180
+ "flatten-call-graph": 0.0006720000528730452,
181
+ "fuse-send-recv": 6.70000008540228e-05,
182
+ "hilo::LegalizeAlias": 1.2999999853491317e-05,
183
+ "hilo::NeuronInstCombine": 0.00018000000272877514,
184
+ "hilo::NeuronOpFusion": 7.100000220816582e-05,
185
+ "hilo::ReplaceTokenTypeWithU8Pass": 1.8000000636675395e-05,
186
+ "hilo::ScheduleFusion": 1.2000000424450263e-05,
187
+ "hilo::SixtyFourHack": 6.299999949987978e-05,
188
+ "hilo::VerifyAliasing": 6.000000212225132e-06,
189
+ "hlo-mac-count": 0.0011450001038610935,
190
+ "hlo-verifier": 0.00938900001347065,
191
+ "instruction-histogram": 0.001053999993018806,
192
+ "io-con-pipe-begin": 9.000000318337698e-06,
193
+ "io-con-pipe-end": 9.999999974752427e-07,
194
+ "io-layout-normalization": 0.0016039999900385737,
195
+ "io-statistics": 6.900000153109431e-05,
196
+ "legalize-ccops": 3.999999989900971e-06,
197
+ "legalize-compare": 1.1000000085914508e-05,
198
+ "lower-argminmax-custom-call": 1.2000000424450263e-05,
199
+ "map-inline": 0.0006389999762177467,
200
+ "metadata-naming": 6.299999949987978e-05,
201
+ "mlir::detail::OpToOpPassAdaptor": 9.600000339560211e-05,
202
+ "mlir::hlo::MhloToPyPenguin": 0.07229000329971313,
203
+ "mlir::mhlo::LowerComplexExtraPass": 0.0003330000035930425,
204
+ "mlir::mhlo::LowerComplexPass": 0.0003640000068116933,
205
+ "native-to-custom-softmax": 0.0007670000195503235,
206
+ "native-to-custom-softmax-dx": 0.00069500005338341,
207
+ "operand_upcaster": 5.0000002374872565e-05,
208
+ "opt-barrier-removal": 0.0005799999926239252,
209
+ "post-par-pipe-begin": 3.000000106112566e-06,
210
+ "post-par-pipe-end": 0.0,
211
+ "post-partition-simplification": 0.0016440000617876649,
212
+ "pre-par-pipe-begin": 1.9999999949504854e-06,
213
+ "pre-par-pipe-end": 0.0,
214
+ "pre-partition-simplification": 0.2085230052471161,
215
+ "replace-minimum-constant": 0.0003719999804161489,
216
+ "reshape-mover": 0.0001140000022132881,
217
+ "simplify-concat": 0.00015900000289548188,
218
+ "simplify-while-loops": 9.700000373413786e-05,
219
+ "transform-variadic-reduce": 6.900000153109431e-05,
220
+ "tuple-simplifier": 0.000295000005280599,
221
+ "unpack-nested-aws-ntwsr": 0.00043299997923895717,
222
+ "unroll-while-loop": 1.700000029813964e-05,
223
+ "zero_sized_hlo_elimination": 0.0006750000175088644
224
+ },
225
+ "hilo": {
226
+ "ConstantSize": 271333.0,
227
+ "HloInputCount": 359.0,
228
+ "HloMacCount": 28118614016.0,
229
+ "HloOutputCount": 65.0,
230
+ "IfmapSize": 7785161728.0,
231
+ "OfmapSize": 536870912.0,
232
+ "OutputsReadFromCount": 0.0,
233
+ "PassthroughTensorsCount": 0.0,
234
+ "RedundantOutputCount": 0.0,
235
+ "Traffic": 752578432.0
236
+ },
237
+ "tensorizer": {
238
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 9240.0,
239
+ "StaticProfiler::AifUb": 74.75788116455078,
240
+ "StaticProfiler::ArithmeticIntensityTensorizer": 73.24122619628906,
241
+ "StaticProfiler::AverageDmaLength": 6860.4052734375,
242
+ "StaticProfiler::DDRTransferBytes": 318999072.0,
243
+ "StaticProfiler::InternalTransferBytes": 6136852.0,
244
+ "StaticProfiler::LoadExpanded": 40210.0,
245
+ "StaticProfiler::StoreExpanded": 1195.0,
246
+ "StaticProfiler::TotalDMAExpanded": 41405.0,
247
+ "StaticProfiler::TotalDynamicInstancesCount": 9321.0,
248
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 9225.0,
249
+ "StaticProfiler::TotalLNCComm": 0.0,
250
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
251
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
252
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
253
+ "TilingProfiler::GenericInstructionsAfterTiling": 4.0,
254
+ "TilingProfiler::MatMultInstructionsAfterTiling": 8128.0,
255
+ "TilingProfiler::NumPfTransposes": 4.0,
256
+ "TilingProfiler::NumPfTransposesForIo": 0.0,
257
+ "TilingProfiler::NumPfTransposesForLocal": 1.0,
258
+ "TilingProfiler::NumPfTransposesForNonlocal": 3.0,
259
+ "TilingProfiler::PfTransposeInstructions": 97.0,
260
+ "TilingProfiler::PfTransposeInstructionsForIo": 0.0,
261
+ "TilingProfiler::PfTransposeInstructionsForLocal": 1.0,
262
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 96.0,
263
+ "TilingProfiler::ReduceInstructionsAfterTiling": 4.0,
264
+ "TilingProfiler::SimdInstructionsAfterTiling": 166.0,
265
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
266
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
267
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
268
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
269
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
270
+ "TransformConvOp::conv2d_column_packing": 0.0,
271
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
272
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
273
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
274
+ }
275
+ },
276
+ "all": {
277
+ "compiletime": {
278
+ "algsimp": 0.0020540000405162573,
279
+ "call-inliner": 0.0003819999983534217,
280
+ "collective-stream-id-checker": 0.00010900000052060932,
281
+ "comparison-expander": 0.00038499999209307134,
282
+ "constant-statistics": 0.00039400000241585076,
283
+ "constant_folding": 0.0002589999930933118,
284
+ "dce": 7.500000356230885e-05,
285
+ "dot_decomposer": 0.0011419999646022916,
286
+ "eliminate-redundant-compare": 0.00024399999529123306,
287
+ "flatten-call-graph": 0.0006419999990612268,
288
+ "hlo-mac-count": 0.0008709999965503812,
289
+ "hlo-verifier": 0.008843000046908855,
290
+ "instruction-histogram": 0.001053999993018806,
291
+ "io-con-pipe-begin": 9.000000318337698e-06,
292
+ "io-con-pipe-end": 9.999999974752427e-07,
293
+ "io-layout-normalization": 0.0016039999900385737,
294
+ "io-statistics": 6.900000153109431e-05,
295
+ "map-inline": 0.0006040000007487833,
296
+ "native-to-custom-softmax": 0.0007200000109151006,
297
+ "native-to-custom-softmax-dx": 0.0005460000247694552,
298
+ "opt-barrier-removal": 0.0005799999926239252,
299
+ "pre-par-pipe-begin": 1.9999999949504854e-06,
300
+ "pre-par-pipe-end": 0.0,
301
+ "pre-partition-simplification": 0.2085230052471161,
302
+ "replace-minimum-constant": 0.0003429999924264848,
303
+ "reshape-mover": 0.00010299999848939478,
304
+ "simplify-while-loops": 9.100000170292333e-05,
305
+ "tuple-simplifier": 0.0002800000074785203,
306
+ "unpack-nested-aws-ntwsr": 0.00042299999040551484,
307
+ "unroll-while-loop": 1.700000029813964e-05,
308
+ "zero_sized_hlo_elimination": 0.0006750000175088644
309
+ }
310
+ },
311
+ "cumsum": {
312
+ "compiletime": {
313
+ "CoalesceCCOp": 0.0002715587615966797,
314
+ "DMALocalityOpt": 0.00022292137145996094,
315
+ "DMAProfiler": 0.001041412353515625,
316
+ "DataStreaming": 0.0003218650817871094,
317
+ "DoNothing": 0.0001633167266845703,
318
+ "ExpandISAMacro": 0.0005700588226318359,
319
+ "FactorizeBlkDims": 0.00047659873962402344,
320
+ "InferPSumTensor": 0.0006239414215087891,
321
+ "LateLegalizeInst": 0.00047588348388671875,
322
+ "LateNeuronInstComb": 0.0006630420684814453,
323
+ "LegalizeSundaAccess": 0.0018274784088134766,
324
+ "LegalizeType": 0.0003712177276611328,
325
+ "LowerBroadcast": 0.0002789497375488281,
326
+ "LowerIntrinsics": 0.00029349327087402344,
327
+ "LowerTranspose": 0.00027632713317871094,
328
+ "NeuronInstComb": 0.0005421638488769531,
329
+ "NeuronLICM": 0.0004737377166748047,
330
+ "NeuronSimplifyPredicates": 0.0035429000854492188,
331
+ "NeuronValueNumbering": 0.000461578369140625,
332
+ "SFKVectorizer": 0.0029914379119873047,
333
+ "SimpleAllReduceTiling": 0.0002620220184326172,
334
+ "SimplifyNeuronTensor": 0.0004684925079345703,
335
+ "SpillPSum": 0.0006010532379150391,
336
+ "WeightCoalescing": 0.00029349327087402344
337
+ }
338
+ },
339
+ "sg00": {
340
+ "compiletime": {
341
+ "CanonicalizeConv": 3.7000001611886546e-05,
342
+ "CanonicalizeForTensorizer": 1.9999999494757503e-05,
343
+ "Canonicalizer": 0.000371000001905486,
344
+ "HoistCompute": 3.999999989900971e-06,
345
+ "IdentifyCrossPassTensors": 1.5999999959603883e-05,
346
+ "MemcastMotion": 1.4000000192027073e-05,
347
+ "PenguinizeFunctions": 1.8999999156221747e-05,
348
+ "PruneFunctions": 1.2999999853491317e-05,
349
+ "RemoveOptimizationBarriers": 1.4000000192027073e-05,
350
+ "ScatterMotion": 0.0,
351
+ "TensorizerLegalizationPass": 2.9999999242136255e-05,
352
+ "VerifySupportedOps": 1.1000000085914508e-05,
353
+ "algsimp": 7.000000186963007e-05,
354
+ "batchnorm_expander": 1.2999999853491317e-05,
355
+ "boundary-marker-removal": 3.000000106112566e-06,
356
+ "call-inliner": 9.999999747378752e-06,
357
+ "canonicalize-boundary-marker": 4.999999873689376e-06,
358
+ "collective-stream-id-checker": 3.000000106112566e-06,
359
+ "comparison-expander": 3.999999989900971e-06,
360
+ "computation-deduplicator": 1.4999999621068127e-05,
361
+ "conditional-to-select": 4.999999873689376e-06,
362
+ "config-lowering": 5.2999999752501026e-05,
363
+ "constant_folding": 7.999999979801942e-06,
364
+ "cse": 1.4999999621068127e-05,
365
+ "dce": 9.999999974752427e-07,
366
+ "dynamic-slice-transpose": 3.999999989900971e-06,
367
+ "eliminate-redundant-compare": 3.000000106112566e-06,
368
+ "emit-offloaded-dropout": 2.099999983329326e-05,
369
+ "flatten-call-graph": 9.999999747378752e-06,
370
+ "fuse-send-recv": 1.9999999494757503e-05,
371
+ "hilo::LegalizeAlias": 6.000000212225132e-06,
372
+ "hilo::NeuronInstCombine": 6.399999983841553e-05,
373
+ "hilo::NeuronOpFusion": 4.199999966658652e-05,
374
+ "hilo::ReplaceTokenTypeWithU8Pass": 6.000000212225132e-06,
375
+ "hilo::ScheduleFusion": 9.000000318337698e-06,
376
+ "hilo::SixtyFourHack": 1.4999999621068127e-05,
377
+ "hilo::VerifyAliasing": 3.000000106112566e-06,
378
+ "hlo-mac-count": 3.400000059627928e-05,
379
+ "hlo-verifier": 0.00014699999883305281,
380
+ "legalize-ccops": 1.9999999949504854e-06,
381
+ "legalize-compare": 3.999999989900971e-06,
382
+ "lower-argminmax-custom-call": 3.000000106112566e-06,
383
+ "map-inline": 9.999999747378752e-06,
384
+ "metadata-naming": 1.5999999959603883e-05,
385
+ "mlir::detail::OpToOpPassAdaptor": 3.300000025774352e-05,
386
+ "mlir::hlo::MhloToPyPenguin": 0.03224699944257736,
387
+ "mlir::mhlo::LowerComplexExtraPass": 0.00011999999696854502,
388
+ "mlir::mhlo::LowerComplexPass": 0.00016799999866634607,
389
+ "native-to-custom-softmax": 2.099999983329326e-05,
390
+ "native-to-custom-softmax-dx": 8.600000001024455e-05,
391
+ "operand_upcaster": 1.4000000192027073e-05,
392
+ "post-par-pipe-begin": 9.999999974752427e-07,
393
+ "post-par-pipe-end": 0.0,
394
+ "post-partition-simplification": 0.0005360000068321824,
395
+ "replace-minimum-constant": 7.999999979801942e-06,
396
+ "reshape-mover": 3.999999989900971e-06,
397
+ "simplify-concat": 5.199999941396527e-05,
398
+ "simplify-while-loops": 1.9999999949504854e-06,
399
+ "transform-variadic-reduce": 9.000000318337698e-06,
400
+ "tuple-simplifier": 3.999999989900971e-06,
401
+ "unpack-nested-aws-ntwsr": 3.000000106112566e-06,
402
+ "unroll-while-loop": 0.0
403
+ },
404
+ "hilo": {
405
+ "ArithmeticIntensity": 27.82740020751953,
406
+ "ConstantSize": 271333.0,
407
+ "HloInputCount": 359.0,
408
+ "HloMacCount": 2751463424.0,
409
+ "HloOutputCount": 65.0,
410
+ "IfmapSize": 7785161728.0,
411
+ "OfmapSize": 536870912.0,
412
+ "OutputsReadFromCount": 0.0,
413
+ "PassthroughTensorsCount": 0.0,
414
+ "RedundantOutputCount": 0.0,
415
+ "Traffic": 197752096.0
416
+ }
417
+ },
418
+ "sg0000": {
419
+ "compiletime": {
420
+ "AGOrderingAnalysisPass": 0.06467413902282715,
421
+ "AffinePredicateResolution": 0.002021312713623047,
422
+ "AliasDependencyElimination": 0.00022792816162109375,
423
+ "AliasDependencyInduction": 0.009432077407836914,
424
+ "AliasDependencyReset": 0.055196523666381836,
425
+ "BFComputeCutting": 0.0026645660400390625,
426
+ "BirCodeGenLoop": 0.04398298263549805,
427
+ "CCOpFusion": 0.013358116149902344,
428
+ "CanonicalizeDAGForPGTiling": 0.005773782730102539,
429
+ "CanonicalizeIR": 0.002256631851196289,
430
+ "CoalesceCCOp": 0.0045413970947265625,
431
+ "CommuteConcat": 0.0009562969207763672,
432
+ "DMALocalityOpt": 0.0011527538299560547,
433
+ "DMAProfiler": 0.00419306755065918,
434
+ "DMATilingProfiler": 0.003909587860107422,
435
+ "DataLocalityOpt": 0.18979620933532715,
436
+ "DataStreaming": 0.003125905990600586,
437
+ "DeConcat": 0.0009047985076904297,
438
+ "DeadCodeElimination": 0.0010743141174316406,
439
+ "DeadStoreElimination": 0.04163408279418945,
440
+ "DelinearIndices": 0.006937980651855469,
441
+ "Delinearization": 0.0029773712158203125,
442
+ "DoNothing": 0.0001316070556640625,
443
+ "DramToDramTranspose": 0.10994625091552734,
444
+ "DumpGraphAndMetadata": 0.012879133224487305,
445
+ "EliminateDivs": 0.00676274299621582,
446
+ "ExpandBatchNorm": 0.0029730796813964844,
447
+ "ExpandISAMacro": 0.003587961196899414,
448
+ "FactorizeBlkDims": 0.007817268371582031,
449
+ "FactorizeThreadAxesInFreeDims": 0.001369476318359375,
450
+ "FlattenMacroLoop": 0.002986907958984375,
451
+ "GenericAccessSimplifier": 0.0009241104125976563,
452
+ "InferInitValue": 0.06494665145874023,
453
+ "InferIntrinsicOnCC": 0.010074853897094727,
454
+ "InferNeuronTensor": 0.05807805061340332,
455
+ "InferNonlocalTensors": 0.07637619972229004,
456
+ "InferPSumTensor": 0.06549668312072754,
457
+ "InlineNativeKernels": 0.001531362533569336,
458
+ "InsertIOTransposes": 0.03947162628173828,
459
+ "InsertLocalTransposes": 0.011230707168579102,
460
+ "InsertOffloadedTransposes": 0.0025644302368164063,
461
+ "LICM": 0.0028715133666992188,
462
+ "LateLegalizeInst": 0.005754709243774414,
463
+ "LateLegalizePostSplit": 0.0025861263275146484,
464
+ "LateLowerReshapeOp": 0.0012161731719970703,
465
+ "LateLowerTensorOp": 0.010146856307983398,
466
+ "LateNeuronInstComb": 0.053855180740356445,
467
+ "LayoutPreprocessing": 0.02685070037841797,
468
+ "LayoutPreprocessingAndAnalysis": 0.050562143325805664,
469
+ "LayoutRequirementAnalysis": 0.0064203739166259766,
470
+ "LegalizeCCOpLayout": 0.002747058868408203,
471
+ "LegalizeOpLevelAlias": 0.0020356178283691406,
472
+ "LegalizePartitionReduce": 0.0014362335205078125,
473
+ "LegalizeSundaAccess": 0.07279443740844727,
474
+ "LegalizeSundaMacro": 0.0071544647216796875,
475
+ "LegalizeType": 0.003951549530029297,
476
+ "LocalLayoutOpt": 0.04533553123474121,
477
+ "LoopFusion": 0.005699634552001953,
478
+ "LoopSplitting": 0.0004475116729736328,
479
+ "LowerBroadcast": 0.002071380615234375,
480
+ "LowerCCOpBlockAxis": 0.0065212249755859375,
481
+ "LowerComplexBroadcast": 0.002064228057861328,
482
+ "LowerIntrinsics": 0.024514198303222656,
483
+ "LowerTensorOp": 0.01275944709777832,
484
+ "LowerTranspose": 0.007394313812255859,
485
+ "MacroGeneration": 0.11238408088684082,
486
+ "MaskPropagation": 0.03646087646484375,
487
+ "MemcpyElimination": 0.2132580280303955,
488
+ "MutateDataType": 0.001348257064819336,
489
+ "NeuronAliasDependencyInduction": 0.00030231475830078125,
490
+ "NeuronAliasDependencyReset": 0.047522783279418945,
491
+ "NeuronInstComb": 0.05169367790222168,
492
+ "NeuronLICM": 0.006502866744995117,
493
+ "NeuronLoopFusion": 0.05488872528076172,
494
+ "NeuronLoopInterchange": 0.0018024444580078125,
495
+ "NeuronSimplifier": 0.008179903030395508,
496
+ "NeuronSimplifyPredicates": 0.003264904022216797,
497
+ "NeuronValueNumbering": 0.003409862518310547,
498
+ "OptimizeAliasedCopyChain": 0.0009076595306396484,
499
+ "OptimizeNKIKernels": 0.001665353775024414,
500
+ "PAGLayoutOpt": 0.563495397567749,
501
+ "PComputeCutting": 0.009307861328125,
502
+ "PGLayoutTilingPipeline": 1.5276920795440674,
503
+ "PGTiling": 0.28220415115356445,
504
+ "PadElimination": 0.0005035400390625,
505
+ "ParAxesAnnotation": 0.5283262729644775,
506
+ "PartialLoopFusion": 0.008737325668334961,
507
+ "PartialSimdFusion": 0.0075261592864990234,
508
+ "PerfectLoopNest": 0.0016448497772216797,
509
+ "RecognizeOpIdiom": 0.0037012100219726563,
510
+ "Recompute": 0.0005586147308349609,
511
+ "RelaxPredicates": 0.0031774044036865234,
512
+ "Rematerialization": 0.0065402984619140625,
513
+ "ReshapeWeights": 0.000885009765625,
514
+ "ResolveAccessConflict": 0.0038557052612304688,
515
+ "ResolveComplicatePredicates": 0.0025780200958251953,
516
+ "RewriteReplicationMatmul": 0.0013949871063232422,
517
+ "RewriteWeights": 0.003153562545776367,
518
+ "SFKVectorizer": 0.08175945281982422,
519
+ "SimpleAllReduceTiling": 0.0017082691192626953,
520
+ "Simplifier": 0.015218257904052734,
521
+ "SimplifyMacroPredicates": 0.010445356369018555,
522
+ "SimplifyNeuronTensor": 0.006752490997314453,
523
+ "SimplifySlice": 0.0012371540069580078,
524
+ "SimplifyTensor": 0.005439281463623047,
525
+ "SpillPSum": 0.01030278205871582,
526
+ "SplitAPUnionSets": 0.03318023681640625,
527
+ "SplitAccGrp": 0.0014679431915283203,
528
+ "StaticProfiler": 0.003851652145385742,
529
+ "StaticTransposeLocalTensor": 0.008411169052124023,
530
+ "SundaISel": 0.08733582496643066,
531
+ "TCTransform": 0.0011320114135742188,
532
+ "TensorInitialization": 0.009679555892944336,
533
+ "TensorOpSimplifier": 0.008788108825683594,
534
+ "TensorOpTransform": 0.04109930992126465,
535
+ "TileCCOps": 0.006173372268676758,
536
+ "TilingProfiler": 0.009527444839477539,
537
+ "TransformConvOp": 0.002694368362426758,
538
+ "TritiumFusion": 0.01809835433959961,
539
+ "ValueNumbering": 0.002146005630493164,
540
+ "VectorizeDMA": 0.004664897918701172,
541
+ "VectorizeMatMult": 0.008771419525146484,
542
+ "WeightCoalescing": 0.0025610923767089844,
543
+ "ZeroSizeTensorElimination": 0.00019073486328125
544
+ },
545
+ "tensorizer": {
546
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 1100.0,
547
+ "StaticProfiler::AifUb": 27.658906936645508,
548
+ "StaticProfiler::ArithmeticIntensityTensorizer": 73.42570495605469,
549
+ "StaticProfiler::AverageDmaLength": 9659.0947265625,
550
+ "StaticProfiler::AverageFractalPeUtilization": 99.76654052734375,
551
+ "StaticProfiler::AveragePartitionUtilization": 91.43461608886719,
552
+ "StaticProfiler::AveragePeUtilization": 99.76654052734375,
553
+ "StaticProfiler::DDRTransferBytes": 79836424.0,
554
+ "StaticProfiler::InternalTransferBytes": 8093696.0,
555
+ "StaticProfiler::LoadExpanded": 10755.0,
556
+ "StaticProfiler::LocalizationEfficiency": 265.46856689453125,
557
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 482.6210021972656,
558
+ "StaticProfiler::StoreExpanded": 3073.0,
559
+ "StaticProfiler::TotalDMAExpanded": 13828.0,
560
+ "StaticProfiler::TotalDynamicInstancesCount": 910.0,
561
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 910.0,
562
+ "StaticProfiler::TotalLNCComm": 0.0,
563
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
564
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
565
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
566
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
567
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
568
+ "TilingProfiler::GenericInstructionsAfterTiling": 12.0,
569
+ "TilingProfiler::MatMultInstructionsAfterTiling": 437.0,
570
+ "TilingProfiler::NumPfTransposes": 4.0,
571
+ "TilingProfiler::NumPfTransposesForIo": 0.0,
572
+ "TilingProfiler::NumPfTransposesForLocal": 3.0,
573
+ "TilingProfiler::NumPfTransposesForNonlocal": 1.0,
574
+ "TilingProfiler::PfTransposeInstructions": 52.0,
575
+ "TilingProfiler::PfTransposeInstructionsForIo": 0.0,
576
+ "TilingProfiler::PfTransposeInstructionsForLocal": 36.0,
577
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 16.0,
578
+ "TilingProfiler::ReduceInstructionsAfterTiling": 0.0,
579
+ "TilingProfiler::SimdInstructionsAfterTiling": 99.0,
580
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
581
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
582
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
583
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
584
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
585
+ "TransformConvOp::conv2d_column_packing": 0.0,
586
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
587
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
588
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
589
+ }
590
+ },
591
+ "sg0001": {
592
+ "compiletime": {
593
+ "AGOrderingAnalysisPass": 0.11036324501037598,
594
+ "AffinePredicateResolution": 0.002354860305786133,
595
+ "AliasDependencyElimination": 0.0002741813659667969,
596
+ "AliasDependencyInduction": 0.008609294891357422,
597
+ "AliasDependencyReset": 0.020910978317260742,
598
+ "BFComputeCutting": 0.001337289810180664,
599
+ "BirCodeGenLoop": 0.06340932846069336,
600
+ "CCOpFusion": 0.018311500549316406,
601
+ "CanonicalizeDAGForPGTiling": 0.004097461700439453,
602
+ "CanonicalizeIR": 0.001764535903930664,
603
+ "CoalesceCCOp": 0.004993915557861328,
604
+ "CommuteConcat": 0.0019958019256591797,
605
+ "DMALocalityOpt": 0.0014808177947998047,
606
+ "DMAProfiler": 0.0038518905639648438,
607
+ "DMATilingProfiler": 0.0041544437408447266,
608
+ "DataLocalityOpt": 0.20078325271606445,
609
+ "DataStreaming": 0.004034519195556641,
610
+ "DeConcat": 0.0015177726745605469,
611
+ "DeadCodeElimination": 0.0010747909545898438,
612
+ "DeadStoreElimination": 0.039188385009765625,
613
+ "DelinearIndices": 0.008949041366577148,
614
+ "Delinearization": 0.0033283233642578125,
615
+ "DoNothing": 0.00011873245239257813,
616
+ "DramToDramTranspose": 0.0655205249786377,
617
+ "DumpGraphAndMetadata": 0.012286186218261719,
618
+ "EliminateDivs": 0.0033621788024902344,
619
+ "ExpandBatchNorm": 0.0016183853149414063,
620
+ "ExpandISAMacro": 0.003101825714111328,
621
+ "FactorizeBlkDims": 0.008794307708740234,
622
+ "FactorizeThreadAxesInFreeDims": 0.0022835731506347656,
623
+ "FlattenMacroLoop": 0.0027937889099121094,
624
+ "GenericAccessSimplifier": 0.0009398460388183594,
625
+ "InferInitValue": 0.06454229354858398,
626
+ "InferIntrinsicOnCC": 0.009270429611206055,
627
+ "InferNeuronTensor": 0.033622026443481445,
628
+ "InferNonlocalTensors": 0.028959989547729492,
629
+ "InferPSumTensor": 0.09024310111999512,
630
+ "InlineNativeKernels": 0.0021810531616210938,
631
+ "InsertIOTransposes": 0.016814231872558594,
632
+ "InsertLocalTransposes": 0.009081840515136719,
633
+ "InsertOffloadedTransposes": 0.003053903579711914,
634
+ "LICM": 0.003207683563232422,
635
+ "LateLegalizeInst": 0.0053195953369140625,
636
+ "LateLegalizePostSplit": 0.0034935474395751953,
637
+ "LateLowerReshapeOp": 0.0013909339904785156,
638
+ "LateLowerTensorOp": 0.005151987075805664,
639
+ "LateNeuronInstComb": 0.026895761489868164,
640
+ "LayoutPreprocessing": 0.02971482276916504,
641
+ "LayoutPreprocessingAndAnalysis": 0.05191612243652344,
642
+ "LayoutRequirementAnalysis": 0.008739709854125977,
643
+ "LegalizeCCOpLayout": 0.00177764892578125,
644
+ "LegalizeOpLevelAlias": 0.0013723373413085938,
645
+ "LegalizePartitionReduce": 0.0010402202606201172,
646
+ "LegalizeSundaAccess": 0.07228899002075195,
647
+ "LegalizeSundaMacro": 0.009223222732543945,
648
+ "LegalizeType": 0.007097959518432617,
649
+ "LocalLayoutOpt": 0.020928382873535156,
650
+ "LoopFusion": 0.006183147430419922,
651
+ "LoopSplitting": 0.0005662441253662109,
652
+ "LowerBroadcast": 0.002042055130004883,
653
+ "LowerCCOpBlockAxis": 0.00509333610534668,
654
+ "LowerComplexBroadcast": 0.001802206039428711,
655
+ "LowerIntrinsics": 0.08197450637817383,
656
+ "LowerTensorOp": 0.012150287628173828,
657
+ "LowerTranspose": 0.009961605072021484,
658
+ "MacroGeneration": 0.07223057746887207,
659
+ "MaskPropagation": 0.003228425979614258,
660
+ "MemcpyElimination": 0.14747166633605957,
661
+ "MutateDataType": 0.0013821125030517578,
662
+ "NeuronAliasDependencyInduction": 0.0003688335418701172,
663
+ "NeuronAliasDependencyReset": 0.025783538818359375,
664
+ "NeuronInstComb": 0.014778375625610352,
665
+ "NeuronLICM": 0.029512643814086914,
666
+ "NeuronLoopFusion": 0.012999534606933594,
667
+ "NeuronLoopInterchange": 0.0011935234069824219,
668
+ "NeuronSimplifier": 0.009111881256103516,
669
+ "NeuronSimplifyPredicates": 0.0014109611511230469,
670
+ "NeuronValueNumbering": 0.05046653747558594,
671
+ "OptimizeAliasedCopyChain": 0.0009007453918457031,
672
+ "OptimizeNKIKernels": 0.0016131401062011719,
673
+ "PAGLayoutOpt": 0.41016316413879395,
674
+ "PComputeCutting": 0.035622358322143555,
675
+ "PGLayoutTilingPipeline": 1.0860846042633057,
676
+ "PGTiling": 0.28725552558898926,
677
+ "PadElimination": 0.0004394054412841797,
678
+ "ParAxesAnnotation": 0.33440423011779785,
679
+ "PartialLoopFusion": 0.01318979263305664,
680
+ "PartialSimdFusion": 0.013974428176879883,
681
+ "PerfectLoopNest": 0.0018496513366699219,
682
+ "RecognizeOpIdiom": 0.007380247116088867,
683
+ "Recompute": 0.00032448768615722656,
684
+ "RelaxPredicates": 0.004439115524291992,
685
+ "Rematerialization": 0.0023751258850097656,
686
+ "ReshapeWeights": 0.0008261203765869141,
687
+ "ResolveAccessConflict": 0.0039043426513671875,
688
+ "ResolveComplicatePredicates": 0.0013427734375,
689
+ "RewriteReplicationMatmul": 0.0014929771423339844,
690
+ "RewriteWeights": 0.0029494762420654297,
691
+ "SFKVectorizer": 0.257702112197876,
692
+ "SimpleAllReduceTiling": 0.0015501976013183594,
693
+ "Simplifier": 0.004350900650024414,
694
+ "SimplifyMacroPredicates": 0.0053751468658447266,
695
+ "SimplifyNeuronTensor": 0.0093841552734375,
696
+ "SimplifySlice": 0.0010516643524169922,
697
+ "SimplifyTensor": 0.0060961246490478516,
698
+ "SpillPSum": 0.02084517478942871,
699
+ "SplitAPUnionSets": 0.010507822036743164,
700
+ "SplitAccGrp": 0.0010433197021484375,
701
+ "StaticProfiler": 0.04431486129760742,
702
+ "StaticTransposeLocalTensor": 0.0048198699951171875,
703
+ "SundaISel": 0.08904266357421875,
704
+ "TCTransform": 0.0012717247009277344,
705
+ "TensorInitialization": 0.004460811614990234,
706
+ "TensorOpSimplifier": 0.006735324859619141,
707
+ "TensorOpTransform": 0.03174185752868652,
708
+ "TileCCOps": 0.00526738166809082,
709
+ "TilingProfiler": 0.010722160339355469,
710
+ "TransformConvOp": 0.002729177474975586,
711
+ "TritiumFusion": 0.10425376892089844,
712
+ "ValueNumbering": 0.0030641555786132813,
713
+ "VectorizeDMA": 0.003632783889770508,
714
+ "VectorizeMatMult": 0.0032911300659179688,
715
+ "WeightCoalescing": 0.0043370723724365234,
716
+ "ZeroSizeTensorElimination": 0.0002262592315673828
717
+ },
718
+ "tensorizer": {
719
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 5856.0,
720
+ "StaticProfiler::AifUb": 112.5947036743164,
721
+ "StaticProfiler::ArithmeticIntensityTensorizer": 128.7735595703125,
722
+ "StaticProfiler::AverageDmaLength": 7108.31689453125,
723
+ "StaticProfiler::AverageFractalPeUtilization": 100.0,
724
+ "StaticProfiler::AveragePartitionUtilization": 99.92850494384766,
725
+ "StaticProfiler::AveragePeUtilization": 100.0,
726
+ "StaticProfiler::DDRTransferBytes": 223825920.0,
727
+ "StaticProfiler::InternalTransferBytes": 9797632.0,
728
+ "StaticProfiler::LoadExpanded": 29057.0,
729
+ "StaticProfiler::LocalizationEfficiency": 114.36911010742188,
730
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 116.55321502685547,
731
+ "StaticProfiler::StoreExpanded": 1153.0,
732
+ "StaticProfiler::TotalDMAExpanded": 30210.0,
733
+ "StaticProfiler::TotalDynamicInstancesCount": 5599.0,
734
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 5599.0,
735
+ "StaticProfiler::TotalLNCComm": 0.0,
736
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
737
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
738
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
739
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
740
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
741
+ "TilingProfiler::GenericInstructionsAfterTiling": 8.0,
742
+ "TilingProfiler::MatMultInstructionsAfterTiling": 4756.0,
743
+ "TilingProfiler::NumPfTransposes": 7.0,
744
+ "TilingProfiler::NumPfTransposesForIo": 3.0,
745
+ "TilingProfiler::NumPfTransposesForLocal": 2.0,
746
+ "TilingProfiler::NumPfTransposesForNonlocal": 2.0,
747
+ "TilingProfiler::PfTransposeInstructions": 118.0,
748
+ "TilingProfiler::PfTransposeInstructionsForIo": 34.0,
749
+ "TilingProfiler::PfTransposeInstructionsForLocal": 20.0,
750
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 64.0,
751
+ "TilingProfiler::ReduceInstructionsAfterTiling": 0.0,
752
+ "TilingProfiler::SimdInstructionsAfterTiling": 186.0,
753
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
754
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
755
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
756
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
757
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
758
+ "TransformConvOp::conv2d_column_packing": 0.0,
759
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
760
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
761
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
762
+ }
763
+ },
764
+ "sg0002": {
765
+ "compiletime": {
766
+ "AGOrderingAnalysisPass": 0.04956221580505371,
767
+ "AffinePredicateResolution": 0.002319812774658203,
768
+ "AliasDependencyElimination": 0.00019097328186035156,
769
+ "AliasDependencyInduction": 0.01746201515197754,
770
+ "AliasDependencyReset": 0.04065680503845215,
771
+ "BFComputeCutting": 0.0013394355773925781,
772
+ "BirCodeGenLoop": 0.11406064033508301,
773
+ "CCOpFusion": 0.03461766242980957,
774
+ "CanonicalizeDAGForPGTiling": 0.004343748092651367,
775
+ "CanonicalizeIR": 0.003444671630859375,
776
+ "CoalesceCCOp": 0.04682421684265137,
777
+ "CommuteConcat": 0.0010838508605957031,
778
+ "DMALocalityOpt": 0.002088308334350586,
779
+ "DMAProfiler": 0.005882740020751953,
780
+ "DMATilingProfiler": 0.004760265350341797,
781
+ "DataLocalityOpt": 0.11827898025512695,
782
+ "DataStreaming": 0.005929470062255859,
783
+ "DeConcat": 0.0006871223449707031,
784
+ "DeadCodeElimination": 0.0018725395202636719,
785
+ "DeadStoreElimination": 0.00615692138671875,
786
+ "DelinearIndices": 0.004624128341674805,
787
+ "Delinearization": 0.0023467540740966797,
788
+ "DoNothing": 0.0001399517059326172,
789
+ "DramToDramTranspose": 0.015562057495117188,
790
+ "DumpGraphAndMetadata": 0.021119117736816406,
791
+ "EliminateDivs": 0.0027742385864257813,
792
+ "ExpandBatchNorm": 0.0035254955291748047,
793
+ "ExpandISAMacro": 0.0046041011810302734,
794
+ "FactorizeBlkDims": 0.008176565170288086,
795
+ "FactorizeThreadAxesInFreeDims": 0.0011937618255615234,
796
+ "FlattenMacroLoop": 0.002167940139770508,
797
+ "GenericAccessSimplifier": 0.0008442401885986328,
798
+ "InferInitValue": 0.024956941604614258,
799
+ "InferIntrinsicOnCC": 0.015187978744506836,
800
+ "InferNeuronTensor": 0.01759958267211914,
801
+ "InferNonlocalTensors": 0.014631509780883789,
802
+ "InferPSumTensor": 0.03573298454284668,
803
+ "InlineNativeKernels": 0.0049402713775634766,
804
+ "InsertIOTransposes": 0.012220144271850586,
805
+ "InsertLocalTransposes": 0.0043489933013916016,
806
+ "InsertOffloadedTransposes": 0.0029261112213134766,
807
+ "LICM": 0.002863168716430664,
808
+ "LateLegalizeInst": 0.006443977355957031,
809
+ "LateLegalizePostSplit": 0.004354715347290039,
810
+ "LateLowerReshapeOp": 0.0013675689697265625,
811
+ "LateLowerTensorOp": 0.0017099380493164063,
812
+ "LateNeuronInstComb": 0.008248329162597656,
813
+ "LayoutPreprocessing": 0.024985551834106445,
814
+ "LayoutPreprocessingAndAnalysis": 0.05915069580078125,
815
+ "LayoutRequirementAnalysis": 0.004924297332763672,
816
+ "LegalizeCCOpLayout": 0.003811359405517578,
817
+ "LegalizeOpLevelAlias": 0.0012297630310058594,
818
+ "LegalizePartitionReduce": 0.001873016357421875,
819
+ "LegalizeSundaAccess": 0.06514930725097656,
820
+ "LegalizeSundaMacro": 0.010838985443115234,
821
+ "LegalizeType": 0.007501840591430664,
822
+ "LocalLayoutOpt": 0.01294255256652832,
823
+ "LoopFusion": 0.005301713943481445,
824
+ "LoopSplitting": 0.0006639957427978516,
825
+ "LowerBroadcast": 0.0011935234069824219,
826
+ "LowerCCOpBlockAxis": 0.003854513168334961,
827
+ "LowerComplexBroadcast": 0.0023345947265625,
828
+ "LowerIntrinsics": 0.3132970333099365,
829
+ "LowerTensorOp": 0.013880491256713867,
830
+ "LowerTranspose": 0.007669687271118164,
831
+ "MacroGeneration": 0.04993605613708496,
832
+ "MaskPropagation": 0.0027527809143066406,
833
+ "MemcpyElimination": 0.02739572525024414,
834
+ "MutateDataType": 0.0013971328735351563,
835
+ "NeuronAliasDependencyInduction": 0.0002524852752685547,
836
+ "NeuronAliasDependencyReset": 0.046864986419677734,
837
+ "NeuronInstComb": 0.006575107574462891,
838
+ "NeuronLICM": 0.03473496437072754,
839
+ "NeuronLoopFusion": 0.008702993392944336,
840
+ "NeuronLoopInterchange": 0.0014033317565917969,
841
+ "NeuronSimplifier": 0.0068361759185791016,
842
+ "NeuronSimplifyPredicates": 0.0032160282135009766,
843
+ "NeuronValueNumbering": 0.002985239028930664,
844
+ "OptimizeAliasedCopyChain": 0.00058746337890625,
845
+ "OptimizeNKIKernels": 0.5256326198577881,
846
+ "PAGLayoutOpt": 0.07949376106262207,
847
+ "PComputeCutting": 0.005449533462524414,
848
+ "PGLayoutTilingPipeline": 0.5191597938537598,
849
+ "PGTiling": 0.1754302978515625,
850
+ "PadElimination": 0.0005822181701660156,
851
+ "ParAxesAnnotation": 0.0432279109954834,
852
+ "PartialLoopFusion": 0.0034580230712890625,
853
+ "PartialSimdFusion": 0.003893136978149414,
854
+ "PerfectLoopNest": 0.0017859935760498047,
855
+ "RecognizeOpIdiom": 0.003717660903930664,
856
+ "Recompute": 0.0006694793701171875,
857
+ "RelaxPredicates": 0.004067182540893555,
858
+ "Rematerialization": 0.002377748489379883,
859
+ "ReshapeWeights": 0.0010006427764892578,
860
+ "ResolveAccessConflict": 0.003880739212036133,
861
+ "ResolveComplicatePredicates": 0.0018019676208496094,
862
+ "RewriteReplicationMatmul": 0.0015823841094970703,
863
+ "RewriteWeights": 0.002522706985473633,
864
+ "SFKVectorizer": 0.11758708953857422,
865
+ "SimpleAllReduceTiling": 0.0028557777404785156,
866
+ "Simplifier": 0.0035266876220703125,
867
+ "SimplifyMacroPredicates": 0.005109071731567383,
868
+ "SimplifyNeuronTensor": 0.2750875949859619,
869
+ "SimplifySlice": 0.0010409355163574219,
870
+ "SimplifyTensor": 0.025161027908325195,
871
+ "SpillPSum": 0.04991626739501953,
872
+ "SplitAPUnionSets": 0.013710737228393555,
873
+ "SplitAccGrp": 0.001070261001586914,
874
+ "StaticProfiler": 0.005475759506225586,
875
+ "StaticTransposeLocalTensor": 0.0038781166076660156,
876
+ "SundaISel": 0.04041314125061035,
877
+ "TCTransform": 0.0008709430694580078,
878
+ "TensorInitialization": 0.004585742950439453,
879
+ "TensorOpSimplifier": 0.005696535110473633,
880
+ "TensorOpTransform": 0.021961212158203125,
881
+ "TileCCOps": 0.010190010070800781,
882
+ "TilingProfiler": 0.04765009880065918,
883
+ "TransformConvOp": 0.0030345916748046875,
884
+ "TritiumFusion": 0.07207250595092773,
885
+ "ValueNumbering": 0.002035379409790039,
886
+ "VectorizeDMA": 0.006017446517944336,
887
+ "VectorizeMatMult": 0.0027844905853271484,
888
+ "WeightCoalescing": 0.003088235855102539,
889
+ "ZeroSizeTensorElimination": 0.00019311904907226563
890
+ },
891
+ "tensorizer": {
892
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 9240.0,
893
+ "StaticProfiler::AifUb": 74.75788116455078,
894
+ "StaticProfiler::ArithmeticIntensityTensorizer": 73.24122619628906,
895
+ "StaticProfiler::AverageDmaLength": 6860.4052734375,
896
+ "StaticProfiler::AverageFractalPeUtilization": 99.61843872070313,
897
+ "StaticProfiler::AveragePartitionUtilization": 98.97808837890625,
898
+ "StaticProfiler::AveragePeUtilization": 98.48223876953125,
899
+ "StaticProfiler::DDRTransferBytes": 318999072.0,
900
+ "StaticProfiler::InternalTransferBytes": 6136852.0,
901
+ "StaticProfiler::LoadExpanded": 40210.0,
902
+ "StaticProfiler::LocalizationEfficiency": 97.97124481201172,
903
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 100.67024993896484,
904
+ "StaticProfiler::StoreExpanded": 1195.0,
905
+ "StaticProfiler::TotalDMAExpanded": 41405.0,
906
+ "StaticProfiler::TotalDynamicInstancesCount": 9321.0,
907
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 9225.0,
908
+ "StaticProfiler::TotalLNCComm": 0.0,
909
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
910
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
911
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
912
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
913
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
914
+ "TilingProfiler::GenericInstructionsAfterTiling": 4.0,
915
+ "TilingProfiler::MatMultInstructionsAfterTiling": 8128.0,
916
+ "TilingProfiler::NumPfTransposes": 4.0,
917
+ "TilingProfiler::NumPfTransposesForIo": 0.0,
918
+ "TilingProfiler::NumPfTransposesForLocal": 1.0,
919
+ "TilingProfiler::NumPfTransposesForNonlocal": 3.0,
920
+ "TilingProfiler::PfTransposeInstructions": 97.0,
921
+ "TilingProfiler::PfTransposeInstructionsForIo": 0.0,
922
+ "TilingProfiler::PfTransposeInstructionsForLocal": 1.0,
923
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 96.0,
924
+ "TilingProfiler::ReduceInstructionsAfterTiling": 4.0,
925
+ "TilingProfiler::SimdInstructionsAfterTiling": 166.0,
926
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
927
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
928
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
929
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
930
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
931
+ "TransformConvOp::conv2d_column_packing": 0.0,
932
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
933
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
934
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
935
+ }
936
+ },
937
+ "sg01": {
938
+ "compiletime": {
939
+ "CanonicalizeConv": 1.2000000424450263e-05,
940
+ "CanonicalizeForTensorizer": 1.2999999853491317e-05,
941
+ "Canonicalizer": 0.00025499999173916876,
942
+ "HoistCompute": 3.000000106112566e-06,
943
+ "IdentifyCrossPassTensors": 1.1000000085914508e-05,
944
+ "MemcastMotion": 1.1000000085914508e-05,
945
+ "PenguinizeFunctions": 1.2000000424450263e-05,
946
+ "PruneFunctions": 1.2999999853491317e-05,
947
+ "RemoveOptimizationBarriers": 1.2999999853491317e-05,
948
+ "ScatterMotion": 3.000000106112566e-06,
949
+ "TensorizerLegalizationPass": 1.5999999959603883e-05,
950
+ "VerifySupportedOps": 9.999999747378752e-06,
951
+ "algsimp": 8.299999899463728e-05,
952
+ "batchnorm_expander": 1.700000029813964e-05,
953
+ "boundary-marker-removal": 6.000000212225132e-06,
954
+ "call-inliner": 9.999999747378752e-06,
955
+ "canonicalize-boundary-marker": 4.999999873689376e-06,
956
+ "collective-stream-id-checker": 1.2000000424450263e-05,
957
+ "comparison-expander": 4.999999873689376e-06,
958
+ "computation-deduplicator": 2.300000051036477e-05,
959
+ "conditional-to-select": 7.999999979801942e-06,
960
+ "config-lowering": 6.800000119255856e-05,
961
+ "constant_folding": 9.000000318337698e-06,
962
+ "cse": 2.9000000722589903e-05,
963
+ "dce": 1.9999999949504854e-06,
964
+ "dynamic-slice-transpose": 7.000000096013537e-06,
965
+ "eliminate-redundant-compare": 6.000000212225132e-06,
966
+ "emit-offloaded-dropout": 2.2000000171829015e-05,
967
+ "flatten-call-graph": 9.000000318337698e-06,
968
+ "fuse-send-recv": 2.5999999706982635e-05,
969
+ "hilo::LegalizeAlias": 4.999999873689376e-06,
970
+ "hilo::NeuronInstCombine": 5.2999999752501026e-05,
971
+ "hilo::NeuronOpFusion": 9.999999747378752e-06,
972
+ "hilo::ReplaceTokenTypeWithU8Pass": 4.999999873689376e-06,
973
+ "hilo::ScheduleFusion": 9.999999974752427e-07,
974
+ "hilo::SixtyFourHack": 9.999999747378752e-06,
975
+ "hilo::VerifyAliasing": 1.9999999949504854e-06,
976
+ "hlo-mac-count": 4.8999998398358e-05,
977
+ "hlo-verifier": 0.00022499999613501132,
978
+ "legalize-ccops": 9.999999974752427e-07,
979
+ "legalize-compare": 3.999999989900971e-06,
980
+ "lower-argminmax-custom-call": 6.000000212225132e-06,
981
+ "map-inline": 1.2000000424450263e-05,
982
+ "metadata-naming": 2.9999999242136255e-05,
983
+ "mlir::detail::OpToOpPassAdaptor": 2.9999999242136255e-05,
984
+ "mlir::hlo::MhloToPyPenguin": 0.02389100007712841,
985
+ "mlir::mhlo::LowerComplexExtraPass": 9.000000136438757e-05,
986
+ "mlir::mhlo::LowerComplexPass": 0.0001140000022132881,
987
+ "native-to-custom-softmax": 1.4999999621068127e-05,
988
+ "native-to-custom-softmax-dx": 3.199999991920777e-05,
989
+ "operand_upcaster": 2.2000000171829015e-05,
990
+ "post-par-pipe-begin": 9.999999974752427e-07,
991
+ "post-par-pipe-end": 0.0,
992
+ "post-partition-simplification": 0.0006050000083632767,
993
+ "replace-minimum-constant": 1.1000000085914508e-05,
994
+ "reshape-mover": 3.999999989900971e-06,
995
+ "simplify-concat": 5.700000110664405e-05,
996
+ "simplify-while-loops": 1.9999999949504854e-06,
997
+ "transform-variadic-reduce": 1.2999999853491317e-05,
998
+ "tuple-simplifier": 4.999999873689376e-06,
999
+ "unpack-nested-aws-ntwsr": 3.000000106112566e-06,
1000
+ "unroll-while-loop": 0.0
1001
+ },
1002
+ "hilo": {
1003
+ "ArithmeticIntensity": 116.26366424560547,
1004
+ "HloMacCount": 14025752576.0,
1005
+ "Traffic": 241274912.0
1006
+ }
1007
+ },
1008
+ "sg02": {
1009
+ "compiletime": {
1010
+ "CanonicalizeConv": 0.0,
1011
+ "CanonicalizeForTensorizer": 1.2999999853491317e-05,
1012
+ "Canonicalizer": 0.00031900001340545714,
1013
+ "HoistCompute": 0.0,
1014
+ "IdentifyCrossPassTensors": 1.2000000424450263e-05,
1015
+ "MemcastMotion": 1.1000000085914508e-05,
1016
+ "PenguinizeFunctions": 9.999999747378752e-06,
1017
+ "PruneFunctions": 7.000000096013537e-06,
1018
+ "RemoveOptimizationBarriers": 1.4999999621068127e-05,
1019
+ "ScatterMotion": 9.999999974752427e-07,
1020
+ "TensorizerLegalizationPass": 6.000000212225132e-06,
1021
+ "VerifySupportedOps": 1.1000000085914508e-05,
1022
+ "algsimp": 6.70000008540228e-05,
1023
+ "batchnorm_expander": 1.2999999853491317e-05,
1024
+ "boundary-marker-removal": 3.000000106112566e-06,
1025
+ "call-inliner": 1.1000000085914508e-05,
1026
+ "canonicalize-boundary-marker": 3.999999989900971e-06,
1027
+ "collective-stream-id-checker": 4.999999873689376e-06,
1028
+ "comparison-expander": 4.999999873689376e-06,
1029
+ "computation-deduplicator": 2.2000000171829015e-05,
1030
+ "conditional-to-select": 6.000000212225132e-06,
1031
+ "config-lowering": 5.999999848427251e-05,
1032
+ "constant_folding": 7.999999979801942e-06,
1033
+ "cse": 1.2999999853491317e-05,
1034
+ "dce": 9.999999974752427e-07,
1035
+ "dynamic-slice-transpose": 3.999999989900971e-06,
1036
+ "eliminate-redundant-compare": 3.000000106112566e-06,
1037
+ "emit-offloaded-dropout": 2.099999983329326e-05,
1038
+ "flatten-call-graph": 1.1000000085914508e-05,
1039
+ "fuse-send-recv": 2.099999983329326e-05,
1040
+ "hilo::LegalizeAlias": 1.9999999949504854e-06,
1041
+ "hilo::NeuronInstCombine": 6.299999949987978e-05,
1042
+ "hilo::NeuronOpFusion": 1.8999999156221747e-05,
1043
+ "hilo::ReplaceTokenTypeWithU8Pass": 7.000000096013537e-06,
1044
+ "hilo::ScheduleFusion": 1.9999999949504854e-06,
1045
+ "hilo::SixtyFourHack": 3.7999998312443495e-05,
1046
+ "hilo::VerifyAliasing": 9.999999974752427e-07,
1047
+ "hlo-mac-count": 0.00019099999917671084,
1048
+ "hlo-verifier": 0.0001740000006975606,
1049
+ "legalize-ccops": 9.999999974752427e-07,
1050
+ "legalize-compare": 3.000000106112566e-06,
1051
+ "lower-argminmax-custom-call": 3.000000106112566e-06,
1052
+ "map-inline": 1.2999999853491317e-05,
1053
+ "metadata-naming": 1.700000029813964e-05,
1054
+ "mlir::detail::OpToOpPassAdaptor": 3.300000025774352e-05,
1055
+ "mlir::hlo::MhloToPyPenguin": 0.016152000054717064,
1056
+ "mlir::mhlo::LowerComplexExtraPass": 0.0001230000052601099,
1057
+ "mlir::mhlo::LowerComplexPass": 8.199999865610152e-05,
1058
+ "native-to-custom-softmax": 1.1000000085914508e-05,
1059
+ "native-to-custom-softmax-dx": 3.099999958067201e-05,
1060
+ "operand_upcaster": 1.4000000192027073e-05,
1061
+ "post-par-pipe-begin": 9.999999974752427e-07,
1062
+ "post-par-pipe-end": 0.0,
1063
+ "post-partition-simplification": 0.0005029999883845448,
1064
+ "replace-minimum-constant": 9.999999747378752e-06,
1065
+ "reshape-mover": 3.000000106112566e-06,
1066
+ "simplify-concat": 4.999999873689376e-05,
1067
+ "simplify-while-loops": 1.9999999949504854e-06,
1068
+ "transform-variadic-reduce": 4.70000013592653e-05,
1069
+ "tuple-simplifier": 6.000000212225132e-06,
1070
+ "unpack-nested-aws-ntwsr": 3.999999989900971e-06,
1071
+ "unroll-while-loop": 0.0
1072
+ },
1073
+ "hilo": {
1074
+ "ArithmeticIntensity": 72.34154510498047,
1075
+ "HloMacCount": 11341398016.0,
1076
+ "Traffic": 313551424.0
1077
+ }
1078
+ }
1079
+ }
context_encoding_model/_tp0_bk0/log-neuron-cc.txt ADDED
The diff for this file is too large to render. See raw diff
 
context_encoding_model/_tp0_bk0/neuron_config.json ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation_autoset": false,
3
+ "_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
4
+ "add_cross_attention": false,
5
+ "architectures": [
6
+ "MistralForCausalLM"
7
+ ],
8
+ "attention_dropout": 0.0,
9
+ "attribute_map": {},
10
+ "bad_words_ids": null,
11
+ "begin_suppress_tokens": null,
12
+ "bos_token_id": 1,
13
+ "chunk_size_feed_forward": 0,
14
+ "cross_attention_hidden_size": null,
15
+ "decoder_start_token_id": null,
16
+ "diversity_penalty": 0.0,
17
+ "do_sample": false,
18
+ "early_stopping": false,
19
+ "encoder_no_repeat_ngram_size": 0,
20
+ "eos_token_id": 2,
21
+ "exponential_decay_length_penalty": null,
22
+ "finetuning_task": null,
23
+ "forced_bos_token_id": null,
24
+ "forced_eos_token_id": null,
25
+ "fused_spec_config": null,
26
+ "head_dim": 128,
27
+ "hidden_act": "silu",
28
+ "hidden_size": 4096,
29
+ "id2label": {
30
+ "0": "LABEL_0",
31
+ "1": "LABEL_1"
32
+ },
33
+ "initializer_range": 0.02,
34
+ "intermediate_size": 14336,
35
+ "is_decoder": false,
36
+ "is_encoder_decoder": false,
37
+ "label2id": {
38
+ "LABEL_0": 0,
39
+ "LABEL_1": 1
40
+ },
41
+ "length_penalty": 1.0,
42
+ "max_length": 20,
43
+ "max_position_embeddings": 32768,
44
+ "metadata": null,
45
+ "min_length": 0,
46
+ "model_type": "mistral",
47
+ "neuron_config": {
48
+ "activation_quantization_type": null,
49
+ "allow_input_truncation": false,
50
+ "apply_seq_ids_mask": false,
51
+ "async_mode": false,
52
+ "attention_dp_degree": 1,
53
+ "attention_dtype": null,
54
+ "attn_block_cte_nki_kernel_enabled": false,
55
+ "attn_block_tkg_nki_kernel_cache_update": false,
56
+ "attn_block_tkg_nki_kernel_enabled": false,
57
+ "attn_cls": "NeuronLlamaAttention",
58
+ "attn_kernel_enabled": null,
59
+ "attn_tkg_builtin_kernel_enabled": false,
60
+ "attn_tkg_nki_kernel_enabled": false,
61
+ "batch_size": 1,
62
+ "bucket_n_active_tokens": true,
63
+ "buckets": [
64
+ 128
65
+ ],
66
+ "cast_type": "config",
67
+ "cc_pipeline_tiling_factor": 2,
68
+ "chunked_prefill_config": null,
69
+ "context_encoding_buckets": [
70
+ 128
71
+ ],
72
+ "cp_degree": 1,
73
+ "ctx_batch_size": 1,
74
+ "disable_kv_cache_tiling": false,
75
+ "draft_model_modules_to_not_convert": null,
76
+ "enable_bucketing": true,
77
+ "enable_eagle_draft_input_norm": false,
78
+ "enable_eagle_speculation": false,
79
+ "enable_fused_speculation": false,
80
+ "enable_long_context_mode": false,
81
+ "enable_output_completion_notifications": false,
82
+ "enable_spill_reload_dge": false,
83
+ "enable_token_tree": false,
84
+ "ep_degree": 1,
85
+ "expert_mlp_nki_kernel_enabled": null,
86
+ "flash_decoding_enabled": false,
87
+ "fused_qkv": false,
88
+ "fused_rmsnorm_skip_gamma": false,
89
+ "is_block_kv_layout": null,
90
+ "is_chunked_prefill": false,
91
+ "is_continuous_batching": true,
92
+ "is_eagle_draft": false,
93
+ "is_medusa": false,
94
+ "is_prefill_stage": true,
95
+ "is_prefix_caching": false,
96
+ "k_cache_transposed": false,
97
+ "kv_cache_batch_size": 4,
98
+ "kv_cache_padding_size": 0,
99
+ "kv_cache_quant": false,
100
+ "kv_cache_tiling": false,
101
+ "layer_boundary_markers": false,
102
+ "lm_head_pad": false,
103
+ "lm_head_pad_alignment_size": 1,
104
+ "local_ranks_size": 2,
105
+ "logical_nc_config": 1,
106
+ "lora_config": null,
107
+ "max_batch_size": 4,
108
+ "max_context_length": 2048,
109
+ "max_length": 2048,
110
+ "max_new_tokens": null,
111
+ "medusa_speculation_length": 0,
112
+ "medusa_tree": null,
113
+ "mlp_kernel_enabled": false,
114
+ "mlp_kernel_fuse_residual_add": false,
115
+ "modules_to_not_convert": null,
116
+ "moe_fused_nki_kernel_enabled": null,
117
+ "n_active_tokens": 2048,
118
+ "n_positions": 2048,
119
+ "num_medusa_heads": 0,
120
+ "on_cpu": false,
121
+ "on_device_sampling_config": {
122
+ "deterministic": false,
123
+ "do_sample": false,
124
+ "dynamic": true,
125
+ "global_topk": 256,
126
+ "on_device_sampling_config": true,
127
+ "temperature": 1.0,
128
+ "top_k": 1,
129
+ "top_k_kernel_enabled": false,
130
+ "top_p": 1.0
131
+ },
132
+ "output_logits": false,
133
+ "overrides_torch_dtype": true,
134
+ "pa_block_size": 2048,
135
+ "pa_num_blocks": 4,
136
+ "padding_side": "right",
137
+ "pp_degree": 1,
138
+ "prefix_buckets": null,
139
+ "qk_layernorm": false,
140
+ "qkv_kernel_enabled": false,
141
+ "qkv_kernel_fuse_residual_add": false,
142
+ "qkv_kernel_nbsd_layout": false,
143
+ "quantization_dtype": "int8",
144
+ "quantization_type": "per_tensor_symmetric",
145
+ "quantize_clamp_bound": Infinity,
146
+ "quantized": false,
147
+ "quantized_checkpoints_path": null,
148
+ "quantized_mlp_kernel_enabled": false,
149
+ "rmsnorm_quantize_kernel_enabled": false,
150
+ "router_topk_nki_kernel_enabled": null,
151
+ "rpl_reduce_dtype": null,
152
+ "save_sharded_checkpoint": true,
153
+ "scratchpad_page_size": null,
154
+ "seq_len": 2048,
155
+ "seq_len_threshold_for_cc_tiling": 16384,
156
+ "sequence_parallel_enabled": false,
157
+ "shared_mlp_nki_kernel_enabled": null,
158
+ "skip_sharding": false,
159
+ "skip_warmup": false,
160
+ "spec_batch_size": 4,
161
+ "speculation_length": 0,
162
+ "start_rank_id": 0,
163
+ "target": null,
164
+ "tile_cc": false,
165
+ "tkg_batch_size": 4,
166
+ "token_generation_buckets": null,
167
+ "token_tree_config": null,
168
+ "torch_dtype": "bfloat16",
169
+ "tp_degree": 2,
170
+ "vocab_parallel": false,
171
+ "weight_gather_seq_len_threshold": 32768,
172
+ "weights_to_skip_layout_optimization": [],
173
+ "world_size": 2
174
+ },
175
+ "no_repeat_ngram_size": 0,
176
+ "num_attention_heads": 32,
177
+ "num_beam_groups": 1,
178
+ "num_beams": 1,
179
+ "num_cores_per_group": 1,
180
+ "num_hidden_layers": 32,
181
+ "num_key_value_heads": 8,
182
+ "num_return_sequences": 1,
183
+ "output_attentions": false,
184
+ "output_hidden_states": false,
185
+ "output_scores": false,
186
+ "pad_token_id": 0,
187
+ "prefix": null,
188
+ "problem_type": null,
189
+ "pruned_heads": {},
190
+ "remove_invalid_values": false,
191
+ "repetition_penalty": 1.0,
192
+ "return_dict": true,
193
+ "return_dict_in_generate": false,
194
+ "rms_norm_eps": 1e-05,
195
+ "rope_theta": 1000000.0,
196
+ "sep_token_id": null,
197
+ "sliding_window": null,
198
+ "suppress_tokens": null,
199
+ "task_specific_params": null,
200
+ "temperature": 1.0,
201
+ "tf_legacy_loss": false,
202
+ "tie_encoder_decoder": false,
203
+ "tie_word_embeddings": false,
204
+ "tokenizer_class": null,
205
+ "top_k": 50,
206
+ "top_p": 1.0,
207
+ "torchscript": false,
208
+ "transformers_version": "4.42.0.dev0",
209
+ "typical_p": 1.0,
210
+ "use_bfloat16": false,
211
+ "use_cache": true,
212
+ "vocab_size": 32768
213
+ }
context_encoding_model/_tp0_bk1/command.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ neuronx-cc compile --framework=XLA model.MODULE_68c159ab1fef44a40212+6a9a7e72.hlo_module.pb --output model.MODULE_68c159ab1fef44a40212+6a9a7e72.neff --target=trn1 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ' --lnc=1 -O1 '--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true' --logfile=log-neuron-cc.txt --verbose=35
context_encoding_model/_tp0_bk1/compile_flags.MODULE_68c159ab1fef44a40212+6a9a7e72.json ADDED
@@ -0,0 +1 @@
 
 
1
+ ["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "--lnc=1", "-O1", "--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true", "--logfile=/models/mistral-7b-v0.3-instruct-neuronx/context_encoding_model/_tp0_bk1/log-neuron-cc.txt"]
context_encoding_model/_tp0_bk1/global_metric_store.json ADDED
@@ -0,0 +1,1079 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Average": {
3
+ "tensorizer": {
4
+ "StaticProfiler::AverageFractalPeUtilization": 99.64185333251953,
5
+ "StaticProfiler::AveragePartitionUtilization": 99.05009460449219,
6
+ "StaticProfiler::AveragePeUtilization": 98.57437896728516,
7
+ "StaticProfiler::LocalizationEfficiency": 97.34869384765625,
8
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 101.34577178955078,
9
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
10
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0
11
+ }
12
+ },
13
+ "Count": {
14
+ "tensorizer": {
15
+ "StaticProfiler::AverageFractalPeUtilization": 1.0,
16
+ "StaticProfiler::AveragePartitionUtilization": 1.0,
17
+ "StaticProfiler::AveragePeUtilization": 1.0,
18
+ "StaticProfiler::LocalizationEfficiency": 1.0,
19
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1.0,
20
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 1.0,
21
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 1.0
22
+ }
23
+ },
24
+ "Sum": {
25
+ "compiletime": {
26
+ "AGOrderingAnalysisPass": 0.023177146911621094,
27
+ "AffinePredicateResolution": 0.0010716915130615234,
28
+ "AliasDependencyElimination": 0.0001862049102783203,
29
+ "AliasDependencyInduction": 0.006262540817260742,
30
+ "AliasDependencyReset": 0.030141830444335938,
31
+ "BFComputeCutting": 0.0014030933380126953,
32
+ "BirCodeGenLoop": 0.14523863792419434,
33
+ "CCOpFusion": 0.015876293182373047,
34
+ "CanonicalizeConv": 6.399999983841553e-05,
35
+ "CanonicalizeDAGForPGTiling": 0.004258632659912109,
36
+ "CanonicalizeForTensorizer": 4.5000000682193786e-05,
37
+ "CanonicalizeIR": 0.0015943050384521484,
38
+ "Canonicalizer": 0.0009469999931752682,
39
+ "CoalesceCCOp": 0.006041288375854492,
40
+ "CommuteConcat": 0.0009081363677978516,
41
+ "DMALocalityOpt": 0.002227783203125,
42
+ "DMAProfiler": 0.006844043731689453,
43
+ "DMATilingProfiler": 0.004567146301269531,
44
+ "DataLocalityOpt": 0.1468503475189209,
45
+ "DataStreaming": 0.05744600296020508,
46
+ "DeConcat": 0.0009293556213378906,
47
+ "DeadCodeElimination": 0.0011928081512451172,
48
+ "DeadStoreElimination": 0.007216453552246094,
49
+ "DelinearIndices": 0.008558034896850586,
50
+ "Delinearization": 0.004823923110961914,
51
+ "DoNothing": 0.00029468536376953125,
52
+ "DramToDramTranspose": 0.03753399848937988,
53
+ "DumpGraphAndMetadata": 0.021843910217285156,
54
+ "EliminateDivs": 0.0021796226501464844,
55
+ "ExpandBatchNorm": 0.001444101333618164,
56
+ "ExpandISAMacro": 0.00778961181640625,
57
+ "FactorizeBlkDims": 0.00781702995300293,
58
+ "FactorizeThreadAxesInFreeDims": 0.002001047134399414,
59
+ "FlattenMacroLoop": 0.002486705780029297,
60
+ "GenericAccessSimplifier": 0.0008151531219482422,
61
+ "HoistCompute": 4.999999873689376e-06,
62
+ "IdentifyCrossPassTensors": 3.600000127335079e-05,
63
+ "InferInitValue": 0.03303098678588867,
64
+ "InferIntrinsicOnCC": 0.008953571319580078,
65
+ "InferNeuronTensor": 0.06123208999633789,
66
+ "InferNonlocalTensors": 0.015596389770507813,
67
+ "InferPSumTensor": 0.11387157440185547,
68
+ "InlineNativeKernels": 0.003874063491821289,
69
+ "InsertIOTransposes": 0.015367746353149414,
70
+ "InsertLocalTransposes": 0.00810861587524414,
71
+ "InsertOffloadedTransposes": 0.0030138492584228516,
72
+ "LICM": 0.002691507339477539,
73
+ "LateLegalizeInst": 0.007213115692138672,
74
+ "LateLegalizePostSplit": 0.0043141841888427734,
75
+ "LateLowerReshapeOp": 0.0011355876922607422,
76
+ "LateLowerTensorOp": 0.0013995170593261719,
77
+ "LateNeuronInstComb": 0.009235620498657227,
78
+ "LayoutPreprocessing": 0.03390383720397949,
79
+ "LayoutPreprocessingAndAnalysis": 0.11866283416748047,
80
+ "LayoutRequirementAnalysis": 0.009894132614135742,
81
+ "LegalizeCCOpLayout": 0.0019948482513427734,
82
+ "LegalizeOpLevelAlias": 0.0011715888977050781,
83
+ "LegalizePartitionReduce": 0.0018146038055419922,
84
+ "LegalizeSundaAccess": 0.036304473876953125,
85
+ "LegalizeSundaMacro": 0.015540599822998047,
86
+ "LegalizeType": 0.006517171859741211,
87
+ "LocalLayoutOpt": 0.023403167724609375,
88
+ "LoopFusion": 0.018198251724243164,
89
+ "LoopSplitting": 0.0003662109375,
90
+ "LowerBroadcast": 0.001718282699584961,
91
+ "LowerCCOpBlockAxis": 0.0071604251861572266,
92
+ "LowerComplexBroadcast": 0.0025353431701660156,
93
+ "LowerIntrinsics": 0.30090999603271484,
94
+ "LowerTensorOp": 0.010535478591918945,
95
+ "LowerTranspose": 0.0088653564453125,
96
+ "MacroGeneration": 0.02730083465576172,
97
+ "MaskPropagation": 0.0026755332946777344,
98
+ "MemcastMotion": 2.5000001187436283e-05,
99
+ "MemcpyElimination": 0.04188370704650879,
100
+ "MutateDataType": 0.0019674301147460938,
101
+ "NeuronAliasDependencyInduction": 0.00026416778564453125,
102
+ "NeuronAliasDependencyReset": 0.010982990264892578,
103
+ "NeuronInstComb": 0.004952669143676758,
104
+ "NeuronLICM": 0.0615081787109375,
105
+ "NeuronLoopFusion": 0.009150981903076172,
106
+ "NeuronLoopInterchange": 0.002344846725463867,
107
+ "NeuronSimplifier": 0.0074083805084228516,
108
+ "NeuronSimplifyPredicates": 0.009469747543334961,
109
+ "NeuronValueNumbering": 0.0031404495239257813,
110
+ "OptimizeAliasedCopyChain": 0.0016322135925292969,
111
+ "OptimizeNKIKernels": 0.6652572154998779,
112
+ "PAGLayoutOpt": 0.22568559646606445,
113
+ "PComputeCutting": 0.005068063735961914,
114
+ "PGLayoutTilingPipeline": 0.725257158279419,
115
+ "PGTiling": 0.16728448867797852,
116
+ "PadElimination": 0.0007500648498535156,
117
+ "ParAxesAnnotation": 0.05478692054748535,
118
+ "PartialLoopFusion": 0.006835222244262695,
119
+ "PartialSimdFusion": 0.007959842681884766,
120
+ "PenguinizeFunctions": 3.900000228895806e-05,
121
+ "PerfectLoopNest": 0.0017228126525878906,
122
+ "PruneFunctions": 3.300000025774352e-05,
123
+ "RecognizeOpIdiom": 0.0035974979400634766,
124
+ "Recompute": 0.00035452842712402344,
125
+ "RelaxPredicates": 0.005320310592651367,
126
+ "Rematerialization": 0.0019502639770507813,
127
+ "RemoveOptimizationBarriers": 3.9999998989515007e-05,
128
+ "ReshapeWeights": 0.0009279251098632813,
129
+ "ResolveAccessConflict": 0.0035495758056640625,
130
+ "ResolveComplicatePredicates": 0.0010292530059814453,
131
+ "RewriteReplicationMatmul": 0.0021431446075439453,
132
+ "RewriteWeights": 0.0024466514587402344,
133
+ "SFKVectorizer": 0.13248300552368164,
134
+ "ScatterMotion": 3.7000001611886546e-05,
135
+ "SimpleAllReduceTiling": 0.0030663013458251953,
136
+ "Simplifier": 0.0042324066162109375,
137
+ "SimplifyMacroPredicates": 0.0056989192962646484,
138
+ "SimplifyNeuronTensor": 0.2761991024017334,
139
+ "SimplifySlice": 0.0008549690246582031,
140
+ "SimplifyTensor": 0.004892826080322266,
141
+ "SpillPSum": 0.010103225708007813,
142
+ "SplitAPUnionSets": 0.013408899307250977,
143
+ "SplitAccGrp": 0.0011219978332519531,
144
+ "StaticProfiler": 0.0052869319915771484,
145
+ "StaticTransposeLocalTensor": 0.003991365432739258,
146
+ "SundaISel": 0.07178783416748047,
147
+ "TCTransform": 0.0011143684387207031,
148
+ "TensorInitialization": 0.03500223159790039,
149
+ "TensorOpSimplifier": 0.00494384765625,
150
+ "TensorOpTransform": 0.01940751075744629,
151
+ "TensorizerLegalizationPass": 4.3000000005122274e-05,
152
+ "TileCCOps": 0.007318258285522461,
153
+ "TilingProfiler": 0.007637739181518555,
154
+ "TransformConvOp": 0.002967357635498047,
155
+ "TritiumFusion": 0.06856966018676758,
156
+ "ValueNumbering": 0.001973867416381836,
157
+ "VectorizeDMA": 0.002585887908935547,
158
+ "VectorizeMatMult": 0.004343986511230469,
159
+ "VerifySupportedOps": 2.9999999242136255e-05,
160
+ "WeightCoalescing": 0.003629446029663086,
161
+ "ZeroSizeTensorElimination": 0.00018334388732910156,
162
+ "algsimp": 0.002331000054255128,
163
+ "batchnorm_expander": 4.5000000682193786e-05,
164
+ "boundary-marker-removal": 9.999999747378752e-06,
165
+ "call-inliner": 0.00041500001680105925,
166
+ "canonicalize-boundary-marker": 1.2999999853491317e-05,
167
+ "collective-stream-id-checker": 0.00011100000119768083,
168
+ "comparison-expander": 0.0005629999795928597,
169
+ "computation-deduplicator": 5.499999679159373e-05,
170
+ "conditional-to-select": 1.5999999959603883e-05,
171
+ "config-lowering": 0.00015900000289548188,
172
+ "constant-statistics": 0.00039400000241585076,
173
+ "constant_folding": 0.0002890000178012997,
174
+ "cse": 3.9999998989515007e-05,
175
+ "dce": 7.899999764049426e-05,
176
+ "dot_decomposer": 0.0016339999856427312,
177
+ "dynamic-slice-transpose": 1.799999881768599e-05,
178
+ "eliminate-redundant-compare": 0.00026000000070780516,
179
+ "emit-offloaded-dropout": 6.399999983841553e-05,
180
+ "flatten-call-graph": 0.0006720000528730452,
181
+ "fuse-send-recv": 6.800000119255856e-05,
182
+ "hilo::LegalizeAlias": 1.2000000424450263e-05,
183
+ "hilo::NeuronInstCombine": 0.00011300000187475234,
184
+ "hilo::NeuronOpFusion": 2.700000004551839e-05,
185
+ "hilo::ReplaceTokenTypeWithU8Pass": 2.099999983329326e-05,
186
+ "hilo::ScheduleFusion": 3.999999989900971e-06,
187
+ "hilo::SixtyFourHack": 6.0999998822808266e-05,
188
+ "hilo::VerifyAliasing": 4.999999873689376e-06,
189
+ "hlo-mac-count": 0.001120000029914081,
190
+ "hlo-verifier": 0.009619999676942825,
191
+ "instruction-histogram": 0.0010300000431016088,
192
+ "io-con-pipe-begin": 4.999999873689376e-06,
193
+ "io-con-pipe-end": 9.999999974752427e-07,
194
+ "io-layout-normalization": 0.0015699999639764428,
195
+ "io-statistics": 7.500000356230885e-05,
196
+ "legalize-ccops": 3.000000106112566e-06,
197
+ "legalize-compare": 1.1000000085914508e-05,
198
+ "lower-argminmax-custom-call": 9.000000318337698e-06,
199
+ "map-inline": 0.0009160000481642783,
200
+ "metadata-naming": 5.5999997130129486e-05,
201
+ "mlir::detail::OpToOpPassAdaptor": 8.299999899463728e-05,
202
+ "mlir::hlo::MhloToPyPenguin": 0.0715779960155487,
203
+ "mlir::mhlo::LowerComplexExtraPass": 0.00030700000934302807,
204
+ "mlir::mhlo::LowerComplexPass": 0.0004949999856762588,
205
+ "native-to-custom-softmax": 0.0007829999667592347,
206
+ "native-to-custom-softmax-dx": 0.0006629999843426049,
207
+ "operand_upcaster": 4.70000013592653e-05,
208
+ "opt-barrier-removal": 0.0005810000002384186,
209
+ "post-par-pipe-begin": 4.999999873689376e-06,
210
+ "post-par-pipe-end": 0.0,
211
+ "post-partition-simplification": 0.00171000009868294,
212
+ "pre-par-pipe-begin": 9.999999974752427e-07,
213
+ "pre-par-pipe-end": 0.0,
214
+ "pre-partition-simplification": 0.17589299380779266,
215
+ "replace-minimum-constant": 0.00033999999868683517,
216
+ "reshape-mover": 0.0001140000022132881,
217
+ "simplify-concat": 0.00015700000221841037,
218
+ "simplify-while-loops": 9.800000407267362e-05,
219
+ "transform-variadic-reduce": 6.199999916134402e-05,
220
+ "tuple-simplifier": 0.00028500001644715667,
221
+ "unpack-nested-aws-ntwsr": 0.0004239999980200082,
222
+ "unroll-while-loop": 1.700000029813964e-05,
223
+ "zero_sized_hlo_elimination": 0.0009629999985918403
224
+ },
225
+ "hilo": {
226
+ "ConstantSize": 533461.0,
227
+ "HloInputCount": 359.0,
228
+ "HloMacCount": 56438554624.0,
229
+ "HloOutputCount": 65.0,
230
+ "IfmapSize": 7785162752.0,
231
+ "OfmapSize": 536870912.0,
232
+ "OutputsReadFromCount": 0.0,
233
+ "PassthroughTensorsCount": 0.0,
234
+ "RedundantOutputCount": 0.0,
235
+ "Traffic": 766973312.0
236
+ },
237
+ "tensorizer": {
238
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 9788.0,
239
+ "StaticProfiler::AifUb": 148.07814025878906,
240
+ "StaticProfiler::ArithmeticIntensityTensorizer": 144.15213012695313,
241
+ "StaticProfiler::AverageDmaLength": 6822.7958984375,
242
+ "StaticProfiler::DDRTransferBytes": 323193888.0,
243
+ "StaticProfiler::InternalTransferBytes": 12166164.0,
244
+ "StaticProfiler::LoadExpanded": 40466.0,
245
+ "StaticProfiler::StoreExpanded": 1835.0,
246
+ "StaticProfiler::TotalDMAExpanded": 42301.0,
247
+ "StaticProfiler::TotalDynamicInstancesCount": 10024.0,
248
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 9928.0,
249
+ "StaticProfiler::TotalLNCComm": 0.0,
250
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
251
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
252
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
253
+ "TilingProfiler::GenericInstructionsAfterTiling": 4.0,
254
+ "TilingProfiler::MatMultInstructionsAfterTiling": 8576.0,
255
+ "TilingProfiler::NumPfTransposes": 4.0,
256
+ "TilingProfiler::NumPfTransposesForIo": 0.0,
257
+ "TilingProfiler::NumPfTransposesForLocal": 1.0,
258
+ "TilingProfiler::NumPfTransposesForNonlocal": 3.0,
259
+ "TilingProfiler::PfTransposeInstructions": 193.0,
260
+ "TilingProfiler::PfTransposeInstructionsForIo": 0.0,
261
+ "TilingProfiler::PfTransposeInstructionsForLocal": 1.0,
262
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 192.0,
263
+ "TilingProfiler::ReduceInstructionsAfterTiling": 4.0,
264
+ "TilingProfiler::SimdInstructionsAfterTiling": 166.0,
265
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
266
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
267
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
268
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
269
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
270
+ "TransformConvOp::conv2d_column_packing": 0.0,
271
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
272
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
273
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
274
+ }
275
+ },
276
+ "all": {
277
+ "compiletime": {
278
+ "algsimp": 0.002076999982818961,
279
+ "call-inliner": 0.0003870000073220581,
280
+ "collective-stream-id-checker": 9.7999996796716e-05,
281
+ "comparison-expander": 0.0005489999894052744,
282
+ "constant-statistics": 0.00039400000241585076,
283
+ "constant_folding": 0.0002629999944474548,
284
+ "dce": 7.599999662488699e-05,
285
+ "dot_decomposer": 0.0016339999856427312,
286
+ "eliminate-redundant-compare": 0.0002500000118743628,
287
+ "flatten-call-graph": 0.0006440000142902136,
288
+ "hlo-mac-count": 0.0008500000112690032,
289
+ "hlo-verifier": 0.009083000011742115,
290
+ "instruction-histogram": 0.0010300000431016088,
291
+ "io-con-pipe-begin": 4.999999873689376e-06,
292
+ "io-con-pipe-end": 9.999999974752427e-07,
293
+ "io-layout-normalization": 0.0015699999639764428,
294
+ "io-statistics": 7.500000356230885e-05,
295
+ "map-inline": 0.0008820000221021473,
296
+ "native-to-custom-softmax": 0.0007539999787695706,
297
+ "native-to-custom-softmax-dx": 0.0005559999844990671,
298
+ "opt-barrier-removal": 0.0005810000002384186,
299
+ "pre-par-pipe-begin": 9.999999974752427e-07,
300
+ "pre-par-pipe-end": 0.0,
301
+ "pre-partition-simplification": 0.17589299380779266,
302
+ "replace-minimum-constant": 0.0003129999968223274,
303
+ "reshape-mover": 0.00010399999882793054,
304
+ "simplify-while-loops": 9.200000204145908e-05,
305
+ "tuple-simplifier": 0.0002699999895412475,
306
+ "unpack-nested-aws-ntwsr": 0.0004140000091865659,
307
+ "unroll-while-loop": 1.700000029813964e-05,
308
+ "zero_sized_hlo_elimination": 0.0009629999985918403
309
+ }
310
+ },
311
+ "cumsum": {
312
+ "compiletime": {
313
+ "CoalesceCCOp": 0.0002796649932861328,
314
+ "DMALocalityOpt": 0.0002567768096923828,
315
+ "DMAProfiler": 0.001027822494506836,
316
+ "DataStreaming": 0.0003445148468017578,
317
+ "DoNothing": 0.0001685619354248047,
318
+ "ExpandISAMacro": 0.0005648136138916016,
319
+ "FactorizeBlkDims": 0.0005543231964111328,
320
+ "InferPSumTensor": 0.0006105899810791016,
321
+ "LateLegalizeInst": 0.0005190372467041016,
322
+ "LateNeuronInstComb": 0.0005986690521240234,
323
+ "LegalizeSundaAccess": 0.0017101764678955078,
324
+ "LegalizeType": 0.0003490447998046875,
325
+ "LowerBroadcast": 0.00031828880310058594,
326
+ "LowerIntrinsics": 0.0002765655517578125,
327
+ "LowerTranspose": 0.0002956390380859375,
328
+ "NeuronInstComb": 0.0006248950958251953,
329
+ "NeuronLICM": 0.0005059242248535156,
330
+ "NeuronSimplifyPredicates": 0.003926515579223633,
331
+ "NeuronValueNumbering": 0.0004730224609375,
332
+ "SFKVectorizer": 0.0031404495239257813,
333
+ "SimpleAllReduceTiling": 0.0002665519714355469,
334
+ "SimplifyNeuronTensor": 0.000499725341796875,
335
+ "SpillPSum": 0.0006055831909179688,
336
+ "WeightCoalescing": 0.00028896331787109375
337
+ }
338
+ },
339
+ "sg00": {
340
+ "compiletime": {
341
+ "CanonicalizeConv": 2.8000000384054147e-05,
342
+ "CanonicalizeForTensorizer": 1.8999999156221747e-05,
343
+ "Canonicalizer": 0.0003549999964889139,
344
+ "HoistCompute": 1.9999999949504854e-06,
345
+ "IdentifyCrossPassTensors": 1.4000000192027073e-05,
346
+ "MemcastMotion": 1.4000000192027073e-05,
347
+ "PenguinizeFunctions": 1.700000029813964e-05,
348
+ "PruneFunctions": 1.4000000192027073e-05,
349
+ "RemoveOptimizationBarriers": 1.4999999621068127e-05,
350
+ "ScatterMotion": 9.000000318337698e-06,
351
+ "TensorizerLegalizationPass": 2.300000051036477e-05,
352
+ "VerifySupportedOps": 9.999999747378752e-06,
353
+ "algsimp": 9.40000027185306e-05,
354
+ "batchnorm_expander": 1.8000000636675395e-05,
355
+ "boundary-marker-removal": 3.000000106112566e-06,
356
+ "call-inliner": 9.000000318337698e-06,
357
+ "canonicalize-boundary-marker": 3.999999989900971e-06,
358
+ "collective-stream-id-checker": 1.9999999949504854e-06,
359
+ "comparison-expander": 4.999999873689376e-06,
360
+ "computation-deduplicator": 1.4999999621068127e-05,
361
+ "conditional-to-select": 4.999999873689376e-06,
362
+ "config-lowering": 5.400000009103678e-05,
363
+ "constant_folding": 7.999999979801942e-06,
364
+ "cse": 1.4999999621068127e-05,
365
+ "dce": 9.999999974752427e-07,
366
+ "dynamic-slice-transpose": 7.999999979801942e-06,
367
+ "eliminate-redundant-compare": 3.000000106112566e-06,
368
+ "emit-offloaded-dropout": 2.300000051036477e-05,
369
+ "flatten-call-graph": 9.000000318337698e-06,
370
+ "fuse-send-recv": 2.2000000171829015e-05,
371
+ "hilo::LegalizeAlias": 6.000000212225132e-06,
372
+ "hilo::NeuronInstCombine": 5.199999941396527e-05,
373
+ "hilo::NeuronOpFusion": 9.999999974752427e-07,
374
+ "hilo::ReplaceTokenTypeWithU8Pass": 7.999999979801942e-06,
375
+ "hilo::ScheduleFusion": 9.999999974752427e-07,
376
+ "hilo::SixtyFourHack": 1.2000000424450263e-05,
377
+ "hilo::VerifyAliasing": 1.9999999949504854e-06,
378
+ "hlo-mac-count": 3.7000001611886546e-05,
379
+ "hlo-verifier": 0.00017800000205170363,
380
+ "legalize-ccops": 9.999999974752427e-07,
381
+ "legalize-compare": 3.999999989900971e-06,
382
+ "lower-argminmax-custom-call": 3.000000106112566e-06,
383
+ "map-inline": 9.999999747378752e-06,
384
+ "metadata-naming": 1.5999999959603883e-05,
385
+ "mlir::detail::OpToOpPassAdaptor": 3.600000127335079e-05,
386
+ "mlir::hlo::MhloToPyPenguin": 0.031582001596689224,
387
+ "mlir::mhlo::LowerComplexExtraPass": 0.00010399999882793054,
388
+ "mlir::mhlo::LowerComplexPass": 0.00015100000018719584,
389
+ "native-to-custom-softmax": 9.999999747378752e-06,
390
+ "native-to-custom-softmax-dx": 4.400000034365803e-05,
391
+ "operand_upcaster": 1.4000000192027073e-05,
392
+ "post-par-pipe-begin": 9.999999974752427e-07,
393
+ "post-par-pipe-end": 0.0,
394
+ "post-partition-simplification": 0.0006259999936446548,
395
+ "replace-minimum-constant": 9.000000318337698e-06,
396
+ "reshape-mover": 3.999999989900971e-06,
397
+ "simplify-concat": 5.2999999752501026e-05,
398
+ "simplify-while-loops": 1.9999999949504854e-06,
399
+ "transform-variadic-reduce": 7.999999979801942e-06,
400
+ "tuple-simplifier": 4.999999873689376e-06,
401
+ "unpack-nested-aws-ntwsr": 3.000000106112566e-06,
402
+ "unroll-while-loop": 0.0
403
+ },
404
+ "hilo": {
405
+ "ArithmeticIntensity": 55.650630950927734,
406
+ "ConstantSize": 533461.0,
407
+ "HloInputCount": 359.0,
408
+ "HloMacCount": 5637144576.0,
409
+ "HloOutputCount": 65.0,
410
+ "IfmapSize": 7785162752.0,
411
+ "OfmapSize": 536870912.0,
412
+ "OutputsReadFromCount": 0.0,
413
+ "PassthroughTensorsCount": 0.0,
414
+ "RedundantOutputCount": 0.0,
415
+ "Traffic": 202590496.0
416
+ }
417
+ },
418
+ "sg0000": {
419
+ "compiletime": {
420
+ "AGOrderingAnalysisPass": 0.03828167915344238,
421
+ "AffinePredicateResolution": 0.0017518997192382813,
422
+ "AliasDependencyElimination": 0.00022554397583007813,
423
+ "AliasDependencyInduction": 0.014617204666137695,
424
+ "AliasDependencyReset": 0.1334824562072754,
425
+ "BFComputeCutting": 0.0033521652221679688,
426
+ "BirCodeGenLoop": 0.049553871154785156,
427
+ "CCOpFusion": 0.058977603912353516,
428
+ "CanonicalizeDAGForPGTiling": 0.005285024642944336,
429
+ "CanonicalizeIR": 0.003525257110595703,
430
+ "CoalesceCCOp": 0.0051691532135009766,
431
+ "CommuteConcat": 0.0010175704956054688,
432
+ "DMALocalityOpt": 0.0012638568878173828,
433
+ "DMAProfiler": 0.00444793701171875,
434
+ "DMATilingProfiler": 0.0043413639068603516,
435
+ "DataLocalityOpt": 0.21851897239685059,
436
+ "DataStreaming": 0.003998756408691406,
437
+ "DeConcat": 0.0009417533874511719,
438
+ "DeadCodeElimination": 0.001861572265625,
439
+ "DeadStoreElimination": 0.027139902114868164,
440
+ "DelinearIndices": 0.009176254272460938,
441
+ "Delinearization": 0.002630472183227539,
442
+ "DoNothing": 0.000125885009765625,
443
+ "DramToDramTranspose": 0.026975631713867188,
444
+ "DumpGraphAndMetadata": 0.012708663940429688,
445
+ "EliminateDivs": 0.02721691131591797,
446
+ "ExpandBatchNorm": 0.0034477710723876953,
447
+ "ExpandISAMacro": 0.0028188228607177734,
448
+ "FactorizeBlkDims": 0.020637035369873047,
449
+ "FactorizeThreadAxesInFreeDims": 0.0013651847839355469,
450
+ "FlattenMacroLoop": 0.004672050476074219,
451
+ "GenericAccessSimplifier": 0.0009248256683349609,
452
+ "InferInitValue": 0.03484296798706055,
453
+ "InferIntrinsicOnCC": 0.01741480827331543,
454
+ "InferNeuronTensor": 0.09466671943664551,
455
+ "InferNonlocalTensors": 0.16442584991455078,
456
+ "InferPSumTensor": 0.03455662727355957,
457
+ "InlineNativeKernels": 0.0014760494232177734,
458
+ "InsertIOTransposes": 0.04430508613586426,
459
+ "InsertLocalTransposes": 0.03803539276123047,
460
+ "InsertOffloadedTransposes": 0.004397869110107422,
461
+ "LICM": 0.0029926300048828125,
462
+ "LateLegalizeInst": 0.0474398136138916,
463
+ "LateLegalizePostSplit": 0.0027294158935546875,
464
+ "LateLowerReshapeOp": 0.002092599868774414,
465
+ "LateLowerTensorOp": 0.010335206985473633,
466
+ "LateNeuronInstComb": 0.07859206199645996,
467
+ "LayoutPreprocessing": 0.028697729110717773,
468
+ "LayoutPreprocessingAndAnalysis": 0.10470056533813477,
469
+ "LayoutRequirementAnalysis": 0.00815582275390625,
470
+ "LegalizeCCOpLayout": 0.003177165985107422,
471
+ "LegalizeOpLevelAlias": 0.0010378360748291016,
472
+ "LegalizePartitionReduce": 0.0013747215270996094,
473
+ "LegalizeSundaAccess": 0.04130220413208008,
474
+ "LegalizeSundaMacro": 0.010147333145141602,
475
+ "LegalizeType": 0.0038063526153564453,
476
+ "LocalLayoutOpt": 0.026745080947875977,
477
+ "LoopFusion": 0.006239652633666992,
478
+ "LoopSplitting": 0.00046253204345703125,
479
+ "LowerBroadcast": 0.003262758255004883,
480
+ "LowerCCOpBlockAxis": 0.044937849044799805,
481
+ "LowerComplexBroadcast": 0.0021724700927734375,
482
+ "LowerIntrinsics": 0.05569720268249512,
483
+ "LowerTensorOp": 0.016431331634521484,
484
+ "LowerTranspose": 0.04205179214477539,
485
+ "MacroGeneration": 0.18246054649353027,
486
+ "MaskPropagation": 0.008913278579711914,
487
+ "MemcpyElimination": 0.14683842658996582,
488
+ "MutateDataType": 0.0012350082397460938,
489
+ "NeuronAliasDependencyInduction": 0.00030422210693359375,
490
+ "NeuronAliasDependencyReset": 0.1192784309387207,
491
+ "NeuronInstComb": 0.013055801391601563,
492
+ "NeuronLICM": 0.008586406707763672,
493
+ "NeuronLoopFusion": 0.015465021133422852,
494
+ "NeuronLoopInterchange": 0.0020461082458496094,
495
+ "NeuronSimplifier": 0.05031108856201172,
496
+ "NeuronSimplifyPredicates": 0.005561351776123047,
497
+ "NeuronValueNumbering": 0.004122495651245117,
498
+ "OptimizeAliasedCopyChain": 0.0008661746978759766,
499
+ "OptimizeNKIKernels": 0.001760721206665039,
500
+ "PAGLayoutOpt": 0.49636244773864746,
501
+ "PComputeCutting": 0.008408069610595703,
502
+ "PGLayoutTilingPipeline": 1.6429040431976318,
503
+ "PGTiling": 0.6523551940917969,
504
+ "PadElimination": 0.0006270408630371094,
505
+ "ParAxesAnnotation": 0.32938289642333984,
506
+ "PartialLoopFusion": 0.016936302185058594,
507
+ "PartialSimdFusion": 0.012967824935913086,
508
+ "PerfectLoopNest": 0.002012968063354492,
509
+ "RecognizeOpIdiom": 0.0038352012634277344,
510
+ "Recompute": 0.0003256797790527344,
511
+ "RelaxPredicates": 0.0037698745727539063,
512
+ "Rematerialization": 0.003920078277587891,
513
+ "ReshapeWeights": 0.0009107589721679688,
514
+ "ResolveAccessConflict": 0.027127504348754883,
515
+ "ResolveComplicatePredicates": 0.0018274784088134766,
516
+ "RewriteReplicationMatmul": 0.002187490463256836,
517
+ "RewriteWeights": 0.006139278411865234,
518
+ "SFKVectorizer": 0.28095126152038574,
519
+ "SimpleAllReduceTiling": 0.0017786026000976563,
520
+ "Simplifier": 0.007991552352905273,
521
+ "SimplifyMacroPredicates": 0.044501304626464844,
522
+ "SimplifyNeuronTensor": 0.009800195693969727,
523
+ "SimplifySlice": 0.0012738704681396484,
524
+ "SimplifyTensor": 0.005684375762939453,
525
+ "SpillPSum": 0.013557672500610352,
526
+ "SplitAPUnionSets": 0.030807971954345703,
527
+ "SplitAccGrp": 0.0014066696166992188,
528
+ "StaticProfiler": 0.004358768463134766,
529
+ "StaticTransposeLocalTensor": 0.03508925437927246,
530
+ "SundaISel": 0.08105349540710449,
531
+ "TCTransform": 0.0011637210845947266,
532
+ "TensorInitialization": 0.012772321701049805,
533
+ "TensorOpSimplifier": 0.006896257400512695,
534
+ "TensorOpTransform": 0.07355618476867676,
535
+ "TileCCOps": 0.010430574417114258,
536
+ "TilingProfiler": 0.011465787887573242,
537
+ "TransformConvOp": 0.002571582794189453,
538
+ "TritiumFusion": 0.035936594009399414,
539
+ "ValueNumbering": 0.0023512840270996094,
540
+ "VectorizeDMA": 0.005614042282104492,
541
+ "VectorizeMatMult": 0.01862645149230957,
542
+ "WeightCoalescing": 0.0029726028442382813,
543
+ "ZeroSizeTensorElimination": 0.00018477439880371094
544
+ },
545
+ "tensorizer": {
546
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 1880.0,
547
+ "StaticProfiler::AifUb": 56.6817626953125,
548
+ "StaticProfiler::ArithmeticIntensityTensorizer": 228.08059692382813,
549
+ "StaticProfiler::AverageDmaLength": 3633.1689453125,
550
+ "StaticProfiler::AverageFractalPeUtilization": 99.77600860595703,
551
+ "StaticProfiler::AveragePartitionUtilization": 95.7789077758789,
552
+ "StaticProfiler::AveragePeUtilization": 99.3130111694336,
553
+ "StaticProfiler::DDRTransferBytes": 53225736.0,
554
+ "StaticProfiler::InternalTransferBytes": 22481920.0,
555
+ "StaticProfiler::LoadExpanded": 9989.0,
556
+ "StaticProfiler::LocalizationEfficiency": 402.3879699707031,
557
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 466.7553405761719,
558
+ "StaticProfiler::StoreExpanded": 3713.0,
559
+ "StaticProfiler::TotalDMAExpanded": 13702.0,
560
+ "StaticProfiler::TotalDynamicInstancesCount": 1649.0,
561
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 1645.0,
562
+ "StaticProfiler::TotalLNCComm": 0.0,
563
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
564
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
565
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
566
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
567
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
568
+ "TilingProfiler::GenericInstructionsAfterTiling": 24.0,
569
+ "TilingProfiler::MatMultInstructionsAfterTiling": 882.0,
570
+ "TilingProfiler::NumPfTransposes": 4.0,
571
+ "TilingProfiler::NumPfTransposesForIo": 0.0,
572
+ "TilingProfiler::NumPfTransposesForLocal": 3.0,
573
+ "TilingProfiler::NumPfTransposesForNonlocal": 1.0,
574
+ "TilingProfiler::PfTransposeInstructions": 136.0,
575
+ "TilingProfiler::PfTransposeInstructionsForIo": 0.0,
576
+ "TilingProfiler::PfTransposeInstructionsForLocal": 104.0,
577
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 32.0,
578
+ "TilingProfiler::ReduceInstructionsAfterTiling": 0.0,
579
+ "TilingProfiler::SimdInstructionsAfterTiling": 170.0,
580
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
581
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
582
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
583
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
584
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
585
+ "TransformConvOp::conv2d_column_packing": 0.0,
586
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
587
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
588
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
589
+ }
590
+ },
591
+ "sg0001": {
592
+ "compiletime": {
593
+ "AGOrderingAnalysisPass": 0.1372241973876953,
594
+ "AffinePredicateResolution": 0.0012900829315185547,
595
+ "AliasDependencyElimination": 0.00016880035400390625,
596
+ "AliasDependencyInduction": 0.00756525993347168,
597
+ "AliasDependencyReset": 0.05401206016540527,
598
+ "BFComputeCutting": 0.0025975704193115234,
599
+ "BirCodeGenLoop": 0.06449413299560547,
600
+ "CCOpFusion": 0.05572986602783203,
601
+ "CanonicalizeDAGForPGTiling": 0.005880832672119141,
602
+ "CanonicalizeIR": 0.0017058849334716797,
603
+ "CoalesceCCOp": 0.007529258728027344,
604
+ "CommuteConcat": 0.0010385513305664063,
605
+ "DMALocalityOpt": 0.0021440982818603516,
606
+ "DMAProfiler": 0.006747007369995117,
607
+ "DMATilingProfiler": 0.0044765472412109375,
608
+ "DataLocalityOpt": 0.2653350830078125,
609
+ "DataStreaming": 0.044264793395996094,
610
+ "DeConcat": 0.0012521743774414063,
611
+ "DeadCodeElimination": 0.0011494159698486328,
612
+ "DeadStoreElimination": 0.030911922454833984,
613
+ "DelinearIndices": 0.008881092071533203,
614
+ "Delinearization": 0.0041217803955078125,
615
+ "DoNothing": 0.00018095970153808594,
616
+ "DramToDramTranspose": 0.0739753246307373,
617
+ "DumpGraphAndMetadata": 0.012173652648925781,
618
+ "EliminateDivs": 0.003404378890991211,
619
+ "ExpandBatchNorm": 0.0016205310821533203,
620
+ "ExpandISAMacro": 0.004185914993286133,
621
+ "FactorizeBlkDims": 0.016408443450927734,
622
+ "FactorizeThreadAxesInFreeDims": 0.002920866012573242,
623
+ "FlattenMacroLoop": 0.003094196319580078,
624
+ "GenericAccessSimplifier": 0.0010533332824707031,
625
+ "InferInitValue": 0.07179093360900879,
626
+ "InferIntrinsicOnCC": 0.011938810348510742,
627
+ "InferNeuronTensor": 0.08669257164001465,
628
+ "InferNonlocalTensors": 0.05255532264709473,
629
+ "InferPSumTensor": 0.028987407684326172,
630
+ "InlineNativeKernels": 0.0014684200286865234,
631
+ "InsertIOTransposes": 0.02220773696899414,
632
+ "InsertLocalTransposes": 0.009196281433105469,
633
+ "InsertOffloadedTransposes": 0.003507852554321289,
634
+ "LICM": 0.0032300949096679688,
635
+ "LateLegalizeInst": 0.006733417510986328,
636
+ "LateLegalizePostSplit": 0.003352642059326172,
637
+ "LateLowerReshapeOp": 0.0015027523040771484,
638
+ "LateLowerTensorOp": 0.005047321319580078,
639
+ "LateNeuronInstComb": 0.025189876556396484,
640
+ "LayoutPreprocessing": 0.07440996170043945,
641
+ "LayoutPreprocessingAndAnalysis": 0.11858534812927246,
642
+ "LayoutRequirementAnalysis": 0.009042739868164063,
643
+ "LegalizeCCOpLayout": 0.0016829967498779297,
644
+ "LegalizeOpLevelAlias": 0.0010685920715332031,
645
+ "LegalizePartitionReduce": 0.0022325515747070313,
646
+ "LegalizeSundaAccess": 0.03753972053527832,
647
+ "LegalizeSundaMacro": 0.009591341018676758,
648
+ "LegalizeType": 0.015005111694335938,
649
+ "LocalLayoutOpt": 0.04438447952270508,
650
+ "LoopFusion": 0.006495237350463867,
651
+ "LoopSplitting": 0.00040650367736816406,
652
+ "LowerBroadcast": 0.0017704963684082031,
653
+ "LowerCCOpBlockAxis": 0.027849197387695313,
654
+ "LowerComplexBroadcast": 0.0019054412841796875,
655
+ "LowerIntrinsics": 0.031024932861328125,
656
+ "LowerTensorOp": 0.010584592819213867,
657
+ "LowerTranspose": 0.011832475662231445,
658
+ "MacroGeneration": 0.09017419815063477,
659
+ "MaskPropagation": 0.0037479400634765625,
660
+ "MemcpyElimination": 0.12743902206420898,
661
+ "MutateDataType": 0.0015845298767089844,
662
+ "NeuronAliasDependencyInduction": 0.0002963542938232422,
663
+ "NeuronAliasDependencyReset": 0.011981010437011719,
664
+ "NeuronInstComb": 0.0240938663482666,
665
+ "NeuronLICM": 0.006990671157836914,
666
+ "NeuronLoopFusion": 0.03093886375427246,
667
+ "NeuronLoopInterchange": 0.0030279159545898438,
668
+ "NeuronSimplifier": 0.011322021484375,
669
+ "NeuronSimplifyPredicates": 0.0017275810241699219,
670
+ "NeuronValueNumbering": 0.0033507347106933594,
671
+ "OptimizeAliasedCopyChain": 0.0006761550903320313,
672
+ "OptimizeNKIKernels": 0.0017766952514648438,
673
+ "PAGLayoutOpt": 0.6475393772125244,
674
+ "PComputeCutting": 0.006641864776611328,
675
+ "PGLayoutTilingPipeline": 1.6072959899902344,
676
+ "PGTiling": 0.49286890029907227,
677
+ "PadElimination": 0.0004642009735107422,
678
+ "ParAxesAnnotation": 0.6226985454559326,
679
+ "PartialLoopFusion": 0.045456886291503906,
680
+ "PartialSimdFusion": 0.052849769592285156,
681
+ "PerfectLoopNest": 0.004052400588989258,
682
+ "RecognizeOpIdiom": 0.004006147384643555,
683
+ "Recompute": 0.0004963874816894531,
684
+ "RelaxPredicates": 0.0033147335052490234,
685
+ "Rematerialization": 0.002171039581298828,
686
+ "ReshapeWeights": 0.0008096694946289063,
687
+ "ResolveAccessConflict": 0.007338285446166992,
688
+ "ResolveComplicatePredicates": 0.00125885009765625,
689
+ "RewriteReplicationMatmul": 0.001424551010131836,
690
+ "RewriteWeights": 0.00428318977355957,
691
+ "SFKVectorizer": 0.2604203224182129,
692
+ "SimpleAllReduceTiling": 0.0015559196472167969,
693
+ "Simplifier": 0.004492521286010742,
694
+ "SimplifyMacroPredicates": 0.006475210189819336,
695
+ "SimplifyNeuronTensor": 0.012001514434814453,
696
+ "SimplifySlice": 0.0011138916015625,
697
+ "SimplifyTensor": 0.04663205146789551,
698
+ "SpillPSum": 0.017986297607421875,
699
+ "SplitAPUnionSets": 0.0433957576751709,
700
+ "SplitAccGrp": 0.0013687610626220703,
701
+ "StaticProfiler": 0.004006624221801758,
702
+ "StaticTransposeLocalTensor": 0.004962921142578125,
703
+ "SundaISel": 0.08249163627624512,
704
+ "TCTransform": 0.001107931137084961,
705
+ "TensorInitialization": 0.002826690673828125,
706
+ "TensorOpSimplifier": 0.005883216857910156,
707
+ "TensorOpTransform": 0.03051924705505371,
708
+ "TileCCOps": 0.005392313003540039,
709
+ "TilingProfiler": 0.05383157730102539,
710
+ "TransformConvOp": 0.0025300979614257813,
711
+ "TritiumFusion": 0.1005716323852539,
712
+ "ValueNumbering": 0.002640962600708008,
713
+ "VectorizeDMA": 0.0018579959869384766,
714
+ "VectorizeMatMult": 0.04578280448913574,
715
+ "WeightCoalescing": 0.0028417110443115234,
716
+ "ZeroSizeTensorElimination": 0.00020384788513183594
717
+ },
718
+ "tensorizer": {
719
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 6724.0,
720
+ "StaticProfiler::AifUb": 223.01504516601563,
721
+ "StaticProfiler::ArithmeticIntensityTensorizer": 252.92869567871094,
722
+ "StaticProfiler::AverageDmaLength": 6766.392578125,
723
+ "StaticProfiler::AverageFractalPeUtilization": 100.0,
724
+ "StaticProfiler::AveragePartitionUtilization": 99.88066864013672,
725
+ "StaticProfiler::AveragePeUtilization": 100.0,
726
+ "StaticProfiler::DDRTransferBytes": 229515264.0,
727
+ "StaticProfiler::InternalTransferBytes": 20119552.0,
728
+ "StaticProfiler::LoadExpanded": 29313.0,
729
+ "StaticProfiler::LocalizationEfficiency": 113.41328430175781,
730
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 117.71570587158203,
731
+ "StaticProfiler::StoreExpanded": 2305.0,
732
+ "StaticProfiler::TotalDMAExpanded": 31618.0,
733
+ "StaticProfiler::TotalDynamicInstancesCount": 6708.0,
734
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 6708.0,
735
+ "StaticProfiler::TotalLNCComm": 0.0,
736
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
737
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
738
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
739
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
740
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
741
+ "TilingProfiler::GenericInstructionsAfterTiling": 16.0,
742
+ "TilingProfiler::MatMultInstructionsAfterTiling": 5424.0,
743
+ "TilingProfiler::NumPfTransposes": 7.0,
744
+ "TilingProfiler::NumPfTransposesForIo": 3.0,
745
+ "TilingProfiler::NumPfTransposesForLocal": 2.0,
746
+ "TilingProfiler::NumPfTransposesForNonlocal": 2.0,
747
+ "TilingProfiler::PfTransposeInstructions": 268.0,
748
+ "TilingProfiler::PfTransposeInstructionsForIo": 68.0,
749
+ "TilingProfiler::PfTransposeInstructionsForLocal": 72.0,
750
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 128.0,
751
+ "TilingProfiler::ReduceInstructionsAfterTiling": 0.0,
752
+ "TilingProfiler::SimdInstructionsAfterTiling": 217.0,
753
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
754
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
755
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
756
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
757
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
758
+ "TransformConvOp::conv2d_column_packing": 0.0,
759
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
760
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
761
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
762
+ }
763
+ },
764
+ "sg0002": {
765
+ "compiletime": {
766
+ "AGOrderingAnalysisPass": 0.023177146911621094,
767
+ "AffinePredicateResolution": 0.0010716915130615234,
768
+ "AliasDependencyElimination": 0.0001862049102783203,
769
+ "AliasDependencyInduction": 0.006262540817260742,
770
+ "AliasDependencyReset": 0.030141830444335938,
771
+ "BFComputeCutting": 0.0014030933380126953,
772
+ "BirCodeGenLoop": 0.14523863792419434,
773
+ "CCOpFusion": 0.015876293182373047,
774
+ "CanonicalizeDAGForPGTiling": 0.004258632659912109,
775
+ "CanonicalizeIR": 0.0015943050384521484,
776
+ "CoalesceCCOp": 0.005761623382568359,
777
+ "CommuteConcat": 0.0009081363677978516,
778
+ "DMALocalityOpt": 0.001971006393432617,
779
+ "DMAProfiler": 0.005816221237182617,
780
+ "DMATilingProfiler": 0.004567146301269531,
781
+ "DataLocalityOpt": 0.1468503475189209,
782
+ "DataStreaming": 0.05710148811340332,
783
+ "DeConcat": 0.0009293556213378906,
784
+ "DeadCodeElimination": 0.0011928081512451172,
785
+ "DeadStoreElimination": 0.007216453552246094,
786
+ "DelinearIndices": 0.008558034896850586,
787
+ "Delinearization": 0.004823923110961914,
788
+ "DoNothing": 0.00012612342834472656,
789
+ "DramToDramTranspose": 0.03753399848937988,
790
+ "DumpGraphAndMetadata": 0.021843910217285156,
791
+ "EliminateDivs": 0.0021796226501464844,
792
+ "ExpandBatchNorm": 0.001444101333618164,
793
+ "ExpandISAMacro": 0.0072247982025146484,
794
+ "FactorizeBlkDims": 0.007262706756591797,
795
+ "FactorizeThreadAxesInFreeDims": 0.002001047134399414,
796
+ "FlattenMacroLoop": 0.002486705780029297,
797
+ "GenericAccessSimplifier": 0.0008151531219482422,
798
+ "InferInitValue": 0.03303098678588867,
799
+ "InferIntrinsicOnCC": 0.008953571319580078,
800
+ "InferNeuronTensor": 0.06123208999633789,
801
+ "InferNonlocalTensors": 0.015596389770507813,
802
+ "InferPSumTensor": 0.11326098442077637,
803
+ "InlineNativeKernels": 0.003874063491821289,
804
+ "InsertIOTransposes": 0.015367746353149414,
805
+ "InsertLocalTransposes": 0.00810861587524414,
806
+ "InsertOffloadedTransposes": 0.0030138492584228516,
807
+ "LICM": 0.002691507339477539,
808
+ "LateLegalizeInst": 0.00669407844543457,
809
+ "LateLegalizePostSplit": 0.0043141841888427734,
810
+ "LateLowerReshapeOp": 0.0011355876922607422,
811
+ "LateLowerTensorOp": 0.0013995170593261719,
812
+ "LateNeuronInstComb": 0.008636951446533203,
813
+ "LayoutPreprocessing": 0.03390383720397949,
814
+ "LayoutPreprocessingAndAnalysis": 0.11866283416748047,
815
+ "LayoutRequirementAnalysis": 0.009894132614135742,
816
+ "LegalizeCCOpLayout": 0.0019948482513427734,
817
+ "LegalizeOpLevelAlias": 0.0011715888977050781,
818
+ "LegalizePartitionReduce": 0.0018146038055419922,
819
+ "LegalizeSundaAccess": 0.03459429740905762,
820
+ "LegalizeSundaMacro": 0.015540599822998047,
821
+ "LegalizeType": 0.0061681270599365234,
822
+ "LocalLayoutOpt": 0.023403167724609375,
823
+ "LoopFusion": 0.018198251724243164,
824
+ "LoopSplitting": 0.0003662109375,
825
+ "LowerBroadcast": 0.001399993896484375,
826
+ "LowerCCOpBlockAxis": 0.0071604251861572266,
827
+ "LowerComplexBroadcast": 0.0025353431701660156,
828
+ "LowerIntrinsics": 0.30063343048095703,
829
+ "LowerTensorOp": 0.010535478591918945,
830
+ "LowerTranspose": 0.008569717407226563,
831
+ "MacroGeneration": 0.02730083465576172,
832
+ "MaskPropagation": 0.0026755332946777344,
833
+ "MemcpyElimination": 0.04188370704650879,
834
+ "MutateDataType": 0.0019674301147460938,
835
+ "NeuronAliasDependencyInduction": 0.00026416778564453125,
836
+ "NeuronAliasDependencyReset": 0.010982990264892578,
837
+ "NeuronInstComb": 0.0043277740478515625,
838
+ "NeuronLICM": 0.061002254486083984,
839
+ "NeuronLoopFusion": 0.009150981903076172,
840
+ "NeuronLoopInterchange": 0.002344846725463867,
841
+ "NeuronSimplifier": 0.0074083805084228516,
842
+ "NeuronSimplifyPredicates": 0.005543231964111328,
843
+ "NeuronValueNumbering": 0.0026674270629882813,
844
+ "OptimizeAliasedCopyChain": 0.0016322135925292969,
845
+ "OptimizeNKIKernels": 0.6652572154998779,
846
+ "PAGLayoutOpt": 0.22568559646606445,
847
+ "PComputeCutting": 0.005068063735961914,
848
+ "PGLayoutTilingPipeline": 0.725257158279419,
849
+ "PGTiling": 0.16728448867797852,
850
+ "PadElimination": 0.0007500648498535156,
851
+ "ParAxesAnnotation": 0.05478692054748535,
852
+ "PartialLoopFusion": 0.006835222244262695,
853
+ "PartialSimdFusion": 0.007959842681884766,
854
+ "PerfectLoopNest": 0.0017228126525878906,
855
+ "RecognizeOpIdiom": 0.0035974979400634766,
856
+ "Recompute": 0.00035452842712402344,
857
+ "RelaxPredicates": 0.005320310592651367,
858
+ "Rematerialization": 0.0019502639770507813,
859
+ "ReshapeWeights": 0.0009279251098632813,
860
+ "ResolveAccessConflict": 0.0035495758056640625,
861
+ "ResolveComplicatePredicates": 0.0010292530059814453,
862
+ "RewriteReplicationMatmul": 0.0021431446075439453,
863
+ "RewriteWeights": 0.0024466514587402344,
864
+ "SFKVectorizer": 0.12934255599975586,
865
+ "SimpleAllReduceTiling": 0.0027997493743896484,
866
+ "Simplifier": 0.0042324066162109375,
867
+ "SimplifyMacroPredicates": 0.0056989192962646484,
868
+ "SimplifyNeuronTensor": 0.2756993770599365,
869
+ "SimplifySlice": 0.0008549690246582031,
870
+ "SimplifyTensor": 0.004892826080322266,
871
+ "SpillPSum": 0.009497642517089844,
872
+ "SplitAPUnionSets": 0.013408899307250977,
873
+ "SplitAccGrp": 0.0011219978332519531,
874
+ "StaticProfiler": 0.0052869319915771484,
875
+ "StaticTransposeLocalTensor": 0.003991365432739258,
876
+ "SundaISel": 0.07178783416748047,
877
+ "TCTransform": 0.0011143684387207031,
878
+ "TensorInitialization": 0.03500223159790039,
879
+ "TensorOpSimplifier": 0.00494384765625,
880
+ "TensorOpTransform": 0.01940751075744629,
881
+ "TileCCOps": 0.007318258285522461,
882
+ "TilingProfiler": 0.007637739181518555,
883
+ "TransformConvOp": 0.002967357635498047,
884
+ "TritiumFusion": 0.06856966018676758,
885
+ "ValueNumbering": 0.001973867416381836,
886
+ "VectorizeDMA": 0.002585887908935547,
887
+ "VectorizeMatMult": 0.004343986511230469,
888
+ "WeightCoalescing": 0.003340482711791992,
889
+ "ZeroSizeTensorElimination": 0.00018334388732910156
890
+ },
891
+ "tensorizer": {
892
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 9788.0,
893
+ "StaticProfiler::AifUb": 148.07814025878906,
894
+ "StaticProfiler::ArithmeticIntensityTensorizer": 144.15213012695313,
895
+ "StaticProfiler::AverageDmaLength": 6822.7958984375,
896
+ "StaticProfiler::AverageFractalPeUtilization": 99.64185333251953,
897
+ "StaticProfiler::AveragePartitionUtilization": 99.05009460449219,
898
+ "StaticProfiler::AveragePeUtilization": 98.57437896728516,
899
+ "StaticProfiler::DDRTransferBytes": 323193888.0,
900
+ "StaticProfiler::InternalTransferBytes": 12166164.0,
901
+ "StaticProfiler::LoadExpanded": 40466.0,
902
+ "StaticProfiler::LocalizationEfficiency": 97.34869384765625,
903
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 101.34577178955078,
904
+ "StaticProfiler::StoreExpanded": 1835.0,
905
+ "StaticProfiler::TotalDMAExpanded": 42301.0,
906
+ "StaticProfiler::TotalDynamicInstancesCount": 10024.0,
907
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 9928.0,
908
+ "StaticProfiler::TotalLNCComm": 0.0,
909
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
910
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
911
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
912
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
913
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
914
+ "TilingProfiler::GenericInstructionsAfterTiling": 4.0,
915
+ "TilingProfiler::MatMultInstructionsAfterTiling": 8576.0,
916
+ "TilingProfiler::NumPfTransposes": 4.0,
917
+ "TilingProfiler::NumPfTransposesForIo": 0.0,
918
+ "TilingProfiler::NumPfTransposesForLocal": 1.0,
919
+ "TilingProfiler::NumPfTransposesForNonlocal": 3.0,
920
+ "TilingProfiler::PfTransposeInstructions": 193.0,
921
+ "TilingProfiler::PfTransposeInstructionsForIo": 0.0,
922
+ "TilingProfiler::PfTransposeInstructionsForLocal": 1.0,
923
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 192.0,
924
+ "TilingProfiler::ReduceInstructionsAfterTiling": 4.0,
925
+ "TilingProfiler::SimdInstructionsAfterTiling": 166.0,
926
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
927
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
928
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
929
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
930
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
931
+ "TransformConvOp::conv2d_column_packing": 0.0,
932
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
933
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
934
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
935
+ }
936
+ },
937
+ "sg01": {
938
+ "compiletime": {
939
+ "CanonicalizeConv": 1.5999999959603883e-05,
940
+ "CanonicalizeForTensorizer": 1.2000000424450263e-05,
941
+ "Canonicalizer": 0.00024399999529123306,
942
+ "HoistCompute": 1.9999999949504854e-06,
943
+ "IdentifyCrossPassTensors": 9.999999747378752e-06,
944
+ "MemcastMotion": 1.1000000085914508e-05,
945
+ "PenguinizeFunctions": 1.2999999853491317e-05,
946
+ "PruneFunctions": 1.2999999853491317e-05,
947
+ "RemoveOptimizationBarriers": 1.1000000085914508e-05,
948
+ "ScatterMotion": 2.700000004551839e-05,
949
+ "TensorizerLegalizationPass": 1.4000000192027073e-05,
950
+ "VerifySupportedOps": 9.000000318337698e-06,
951
+ "algsimp": 9.200000204145908e-05,
952
+ "batchnorm_expander": 1.4000000192027073e-05,
953
+ "boundary-marker-removal": 3.999999989900971e-06,
954
+ "call-inliner": 9.000000318337698e-06,
955
+ "canonicalize-boundary-marker": 4.999999873689376e-06,
956
+ "collective-stream-id-checker": 7.000000096013537e-06,
957
+ "comparison-expander": 3.999999989900971e-06,
958
+ "computation-deduplicator": 1.9999999494757503e-05,
959
+ "conditional-to-select": 4.999999873689376e-06,
960
+ "config-lowering": 4.5000000682193786e-05,
961
+ "constant_folding": 9.000000318337698e-06,
962
+ "cse": 1.2000000424450263e-05,
963
+ "dce": 9.999999974752427e-07,
964
+ "dynamic-slice-transpose": 4.999999873689376e-06,
965
+ "eliminate-redundant-compare": 3.999999989900971e-06,
966
+ "emit-offloaded-dropout": 2.099999983329326e-05,
967
+ "flatten-call-graph": 7.999999979801942e-06,
968
+ "fuse-send-recv": 2.700000004551839e-05,
969
+ "hilo::LegalizeAlias": 3.999999989900971e-06,
970
+ "hilo::NeuronInstCombine": 5.6000000768108293e-05,
971
+ "hilo::NeuronOpFusion": 1.5999999959603883e-05,
972
+ "hilo::ReplaceTokenTypeWithU8Pass": 6.000000212225132e-06,
973
+ "hilo::ScheduleFusion": 9.999999974752427e-07,
974
+ "hilo::SixtyFourHack": 9.999999747378752e-06,
975
+ "hilo::VerifyAliasing": 1.9999999949504854e-06,
976
+ "hlo-mac-count": 3.600000127335079e-05,
977
+ "hlo-verifier": 0.00019299999985378236,
978
+ "legalize-ccops": 9.999999974752427e-07,
979
+ "legalize-compare": 3.999999989900971e-06,
980
+ "lower-argminmax-custom-call": 3.000000106112566e-06,
981
+ "map-inline": 1.2000000424450263e-05,
982
+ "metadata-naming": 2.4000000848900527e-05,
983
+ "mlir::detail::OpToOpPassAdaptor": 2.9000000722589903e-05,
984
+ "mlir::hlo::MhloToPyPenguin": 0.023865999653935432,
985
+ "mlir::mhlo::LowerComplexExtraPass": 0.0001049999991664663,
986
+ "mlir::mhlo::LowerComplexPass": 0.00016999999934341758,
987
+ "native-to-custom-softmax": 9.999999747378752e-06,
988
+ "native-to-custom-softmax-dx": 3.300000025774352e-05,
989
+ "operand_upcaster": 1.9999999494757503e-05,
990
+ "post-par-pipe-begin": 1.9999999949504854e-06,
991
+ "post-par-pipe-end": 0.0,
992
+ "post-partition-simplification": 0.000577000027988106,
993
+ "replace-minimum-constant": 7.999999979801942e-06,
994
+ "reshape-mover": 3.000000106112566e-06,
995
+ "simplify-concat": 5.400000009103678e-05,
996
+ "simplify-while-loops": 1.9999999949504854e-06,
997
+ "transform-variadic-reduce": 9.000000318337698e-06,
998
+ "tuple-simplifier": 4.999999873689376e-06,
999
+ "unpack-nested-aws-ntwsr": 3.000000106112566e-06,
1000
+ "unroll-while-loop": 0.0
1001
+ },
1002
+ "hilo": {
1003
+ "ArithmeticIntensity": 227.59315490722656,
1004
+ "HloMacCount": 28185722880.0,
1005
+ "Traffic": 247685152.0
1006
+ }
1007
+ },
1008
+ "sg02": {
1009
+ "compiletime": {
1010
+ "CanonicalizeConv": 1.9999999494757503e-05,
1011
+ "CanonicalizeForTensorizer": 1.4000000192027073e-05,
1012
+ "Canonicalizer": 0.0003480000013951212,
1013
+ "HoistCompute": 9.999999974752427e-07,
1014
+ "IdentifyCrossPassTensors": 1.2000000424450263e-05,
1015
+ "MemcastMotion": 0.0,
1016
+ "PenguinizeFunctions": 9.000000318337698e-06,
1017
+ "PruneFunctions": 6.000000212225132e-06,
1018
+ "RemoveOptimizationBarriers": 1.4000000192027073e-05,
1019
+ "ScatterMotion": 9.999999974752427e-07,
1020
+ "TensorizerLegalizationPass": 6.000000212225132e-06,
1021
+ "VerifySupportedOps": 1.1000000085914508e-05,
1022
+ "algsimp": 6.800000119255856e-05,
1023
+ "batchnorm_expander": 1.2999999853491317e-05,
1024
+ "boundary-marker-removal": 3.000000106112566e-06,
1025
+ "call-inliner": 9.999999747378752e-06,
1026
+ "canonicalize-boundary-marker": 3.999999989900971e-06,
1027
+ "collective-stream-id-checker": 3.999999989900971e-06,
1028
+ "comparison-expander": 4.999999873689376e-06,
1029
+ "computation-deduplicator": 1.9999999494757503e-05,
1030
+ "conditional-to-select": 6.000000212225132e-06,
1031
+ "config-lowering": 5.999999848427251e-05,
1032
+ "constant_folding": 9.000000318337698e-06,
1033
+ "cse": 1.2999999853491317e-05,
1034
+ "dce": 9.999999974752427e-07,
1035
+ "dynamic-slice-transpose": 4.999999873689376e-06,
1036
+ "eliminate-redundant-compare": 3.000000106112566e-06,
1037
+ "emit-offloaded-dropout": 1.9999999494757503e-05,
1038
+ "flatten-call-graph": 1.1000000085914508e-05,
1039
+ "fuse-send-recv": 1.8999999156221747e-05,
1040
+ "hilo::LegalizeAlias": 1.9999999949504854e-06,
1041
+ "hilo::NeuronInstCombine": 4.999999873689376e-06,
1042
+ "hilo::NeuronOpFusion": 9.999999747378752e-06,
1043
+ "hilo::ReplaceTokenTypeWithU8Pass": 7.000000096013537e-06,
1044
+ "hilo::ScheduleFusion": 1.9999999949504854e-06,
1045
+ "hilo::SixtyFourHack": 3.899999865097925e-05,
1046
+ "hilo::VerifyAliasing": 9.999999974752427e-07,
1047
+ "hlo-mac-count": 0.00019700000120792538,
1048
+ "hlo-verifier": 0.00016599999798927456,
1049
+ "legalize-ccops": 9.999999974752427e-07,
1050
+ "legalize-compare": 3.000000106112566e-06,
1051
+ "lower-argminmax-custom-call": 3.000000106112566e-06,
1052
+ "map-inline": 1.2000000424450263e-05,
1053
+ "metadata-naming": 1.5999999959603883e-05,
1054
+ "mlir::detail::OpToOpPassAdaptor": 1.8000000636675395e-05,
1055
+ "mlir::hlo::MhloToPyPenguin": 0.016130000352859497,
1056
+ "mlir::mhlo::LowerComplexExtraPass": 9.7999996796716e-05,
1057
+ "mlir::mhlo::LowerComplexPass": 0.0001740000006975606,
1058
+ "native-to-custom-softmax": 9.000000318337698e-06,
1059
+ "native-to-custom-softmax-dx": 2.9999999242136255e-05,
1060
+ "operand_upcaster": 1.2999999853491317e-05,
1061
+ "post-par-pipe-begin": 1.9999999949504854e-06,
1062
+ "post-par-pipe-end": 0.0,
1063
+ "post-partition-simplification": 0.0005070000188425183,
1064
+ "replace-minimum-constant": 9.999999747378752e-06,
1065
+ "reshape-mover": 3.000000106112566e-06,
1066
+ "simplify-concat": 4.999999873689376e-05,
1067
+ "simplify-while-loops": 1.9999999949504854e-06,
1068
+ "transform-variadic-reduce": 4.5000000682193786e-05,
1069
+ "tuple-simplifier": 4.999999873689376e-06,
1070
+ "unpack-nested-aws-ntwsr": 3.999999989900971e-06,
1071
+ "unroll-while-loop": 0.0
1072
+ },
1073
+ "hilo": {
1074
+ "ArithmeticIntensity": 142.82192993164063,
1075
+ "HloMacCount": 22615687168.0,
1076
+ "Traffic": 316697664.0
1077
+ }
1078
+ }
1079
+ }
context_encoding_model/_tp0_bk1/log-neuron-cc.txt ADDED
The diff for this file is too large to render. See raw diff
 
context_encoding_model/_tp0_bk1/neuron_config.json ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation_autoset": false,
3
+ "_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
4
+ "add_cross_attention": false,
5
+ "architectures": [
6
+ "MistralForCausalLM"
7
+ ],
8
+ "attention_dropout": 0.0,
9
+ "attribute_map": {},
10
+ "bad_words_ids": null,
11
+ "begin_suppress_tokens": null,
12
+ "bos_token_id": 1,
13
+ "chunk_size_feed_forward": 0,
14
+ "cross_attention_hidden_size": null,
15
+ "decoder_start_token_id": null,
16
+ "diversity_penalty": 0.0,
17
+ "do_sample": false,
18
+ "early_stopping": false,
19
+ "encoder_no_repeat_ngram_size": 0,
20
+ "eos_token_id": 2,
21
+ "exponential_decay_length_penalty": null,
22
+ "finetuning_task": null,
23
+ "forced_bos_token_id": null,
24
+ "forced_eos_token_id": null,
25
+ "fused_spec_config": null,
26
+ "head_dim": 128,
27
+ "hidden_act": "silu",
28
+ "hidden_size": 4096,
29
+ "id2label": {
30
+ "0": "LABEL_0",
31
+ "1": "LABEL_1"
32
+ },
33
+ "initializer_range": 0.02,
34
+ "intermediate_size": 14336,
35
+ "is_decoder": false,
36
+ "is_encoder_decoder": false,
37
+ "label2id": {
38
+ "LABEL_0": 0,
39
+ "LABEL_1": 1
40
+ },
41
+ "length_penalty": 1.0,
42
+ "max_length": 20,
43
+ "max_position_embeddings": 32768,
44
+ "metadata": null,
45
+ "min_length": 0,
46
+ "model_type": "mistral",
47
+ "neuron_config": {
48
+ "activation_quantization_type": null,
49
+ "allow_input_truncation": false,
50
+ "apply_seq_ids_mask": false,
51
+ "async_mode": false,
52
+ "attention_dp_degree": 1,
53
+ "attention_dtype": null,
54
+ "attn_block_cte_nki_kernel_enabled": false,
55
+ "attn_block_tkg_nki_kernel_cache_update": false,
56
+ "attn_block_tkg_nki_kernel_enabled": false,
57
+ "attn_cls": "NeuronLlamaAttention",
58
+ "attn_kernel_enabled": null,
59
+ "attn_tkg_builtin_kernel_enabled": false,
60
+ "attn_tkg_nki_kernel_enabled": false,
61
+ "batch_size": 1,
62
+ "bucket_n_active_tokens": true,
63
+ "buckets": [
64
+ 256
65
+ ],
66
+ "cast_type": "config",
67
+ "cc_pipeline_tiling_factor": 2,
68
+ "chunked_prefill_config": null,
69
+ "context_encoding_buckets": [
70
+ 256
71
+ ],
72
+ "cp_degree": 1,
73
+ "ctx_batch_size": 1,
74
+ "disable_kv_cache_tiling": false,
75
+ "draft_model_modules_to_not_convert": null,
76
+ "enable_bucketing": true,
77
+ "enable_eagle_draft_input_norm": false,
78
+ "enable_eagle_speculation": false,
79
+ "enable_fused_speculation": false,
80
+ "enable_long_context_mode": false,
81
+ "enable_output_completion_notifications": false,
82
+ "enable_spill_reload_dge": false,
83
+ "enable_token_tree": false,
84
+ "ep_degree": 1,
85
+ "expert_mlp_nki_kernel_enabled": null,
86
+ "flash_decoding_enabled": false,
87
+ "fused_qkv": false,
88
+ "fused_rmsnorm_skip_gamma": false,
89
+ "is_block_kv_layout": null,
90
+ "is_chunked_prefill": false,
91
+ "is_continuous_batching": true,
92
+ "is_eagle_draft": false,
93
+ "is_medusa": false,
94
+ "is_prefill_stage": true,
95
+ "is_prefix_caching": false,
96
+ "k_cache_transposed": false,
97
+ "kv_cache_batch_size": 4,
98
+ "kv_cache_padding_size": 0,
99
+ "kv_cache_quant": false,
100
+ "kv_cache_tiling": false,
101
+ "layer_boundary_markers": false,
102
+ "lm_head_pad": false,
103
+ "lm_head_pad_alignment_size": 1,
104
+ "local_ranks_size": 2,
105
+ "logical_nc_config": 1,
106
+ "lora_config": null,
107
+ "max_batch_size": 4,
108
+ "max_context_length": 2048,
109
+ "max_length": 2048,
110
+ "max_new_tokens": null,
111
+ "medusa_speculation_length": 0,
112
+ "medusa_tree": null,
113
+ "mlp_kernel_enabled": false,
114
+ "mlp_kernel_fuse_residual_add": false,
115
+ "modules_to_not_convert": null,
116
+ "moe_fused_nki_kernel_enabled": null,
117
+ "n_active_tokens": 2048,
118
+ "n_positions": 2048,
119
+ "num_medusa_heads": 0,
120
+ "on_cpu": false,
121
+ "on_device_sampling_config": {
122
+ "deterministic": false,
123
+ "do_sample": false,
124
+ "dynamic": true,
125
+ "global_topk": 256,
126
+ "on_device_sampling_config": true,
127
+ "temperature": 1.0,
128
+ "top_k": 1,
129
+ "top_k_kernel_enabled": false,
130
+ "top_p": 1.0
131
+ },
132
+ "output_logits": false,
133
+ "overrides_torch_dtype": true,
134
+ "pa_block_size": 2048,
135
+ "pa_num_blocks": 4,
136
+ "padding_side": "right",
137
+ "pp_degree": 1,
138
+ "prefix_buckets": null,
139
+ "qk_layernorm": false,
140
+ "qkv_kernel_enabled": false,
141
+ "qkv_kernel_fuse_residual_add": false,
142
+ "qkv_kernel_nbsd_layout": false,
143
+ "quantization_dtype": "int8",
144
+ "quantization_type": "per_tensor_symmetric",
145
+ "quantize_clamp_bound": Infinity,
146
+ "quantized": false,
147
+ "quantized_checkpoints_path": null,
148
+ "quantized_mlp_kernel_enabled": false,
149
+ "rmsnorm_quantize_kernel_enabled": false,
150
+ "router_topk_nki_kernel_enabled": null,
151
+ "rpl_reduce_dtype": null,
152
+ "save_sharded_checkpoint": true,
153
+ "scratchpad_page_size": null,
154
+ "seq_len": 2048,
155
+ "seq_len_threshold_for_cc_tiling": 16384,
156
+ "sequence_parallel_enabled": false,
157
+ "shared_mlp_nki_kernel_enabled": null,
158
+ "skip_sharding": false,
159
+ "skip_warmup": false,
160
+ "spec_batch_size": 4,
161
+ "speculation_length": 0,
162
+ "start_rank_id": 0,
163
+ "target": null,
164
+ "tile_cc": false,
165
+ "tkg_batch_size": 4,
166
+ "token_generation_buckets": null,
167
+ "token_tree_config": null,
168
+ "torch_dtype": "bfloat16",
169
+ "tp_degree": 2,
170
+ "vocab_parallel": false,
171
+ "weight_gather_seq_len_threshold": 32768,
172
+ "weights_to_skip_layout_optimization": [],
173
+ "world_size": 2
174
+ },
175
+ "no_repeat_ngram_size": 0,
176
+ "num_attention_heads": 32,
177
+ "num_beam_groups": 1,
178
+ "num_beams": 1,
179
+ "num_cores_per_group": 1,
180
+ "num_hidden_layers": 32,
181
+ "num_key_value_heads": 8,
182
+ "num_return_sequences": 1,
183
+ "output_attentions": false,
184
+ "output_hidden_states": false,
185
+ "output_scores": false,
186
+ "pad_token_id": 0,
187
+ "prefix": null,
188
+ "problem_type": null,
189
+ "pruned_heads": {},
190
+ "remove_invalid_values": false,
191
+ "repetition_penalty": 1.0,
192
+ "return_dict": true,
193
+ "return_dict_in_generate": false,
194
+ "rms_norm_eps": 1e-05,
195
+ "rope_theta": 1000000.0,
196
+ "sep_token_id": null,
197
+ "sliding_window": null,
198
+ "suppress_tokens": null,
199
+ "task_specific_params": null,
200
+ "temperature": 1.0,
201
+ "tf_legacy_loss": false,
202
+ "tie_encoder_decoder": false,
203
+ "tie_word_embeddings": false,
204
+ "tokenizer_class": null,
205
+ "top_k": 50,
206
+ "top_p": 1.0,
207
+ "torchscript": false,
208
+ "transformers_version": "4.42.0.dev0",
209
+ "typical_p": 1.0,
210
+ "use_bfloat16": false,
211
+ "use_cache": true,
212
+ "vocab_size": 32768
213
+ }
context_encoding_model/_tp0_bk2/command.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ neuronx-cc compile --framework=XLA model.MODULE_78e5291800ea5b96a03b+442879bd.hlo_module.pb --output model.MODULE_78e5291800ea5b96a03b+442879bd.neff --target=trn1 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ' --lnc=1 -O1 '--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true' --logfile=log-neuron-cc.txt --verbose=35
context_encoding_model/_tp0_bk2/compile_flags.MODULE_78e5291800ea5b96a03b+442879bd.json ADDED
@@ -0,0 +1 @@
 
 
1
+ ["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "--lnc=1", "-O1", "--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true", "--logfile=/models/mistral-7b-v0.3-instruct-neuronx/context_encoding_model/_tp0_bk2/log-neuron-cc.txt"]
context_encoding_model/_tp0_bk2/global_metric_store.json ADDED
@@ -0,0 +1,1079 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Average": {
3
+ "tensorizer": {
4
+ "StaticProfiler::AverageFractalPeUtilization": 99.68099975585938,
5
+ "StaticProfiler::AveragePartitionUtilization": 99.27783203125,
6
+ "StaticProfiler::AveragePeUtilization": 98.72872924804688,
7
+ "StaticProfiler::LocalizationEfficiency": 96.15084838867188,
8
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 102.69681549072266,
9
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
10
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0
11
+ }
12
+ },
13
+ "Count": {
14
+ "tensorizer": {
15
+ "StaticProfiler::AverageFractalPeUtilization": 1.0,
16
+ "StaticProfiler::AveragePartitionUtilization": 1.0,
17
+ "StaticProfiler::AveragePeUtilization": 1.0,
18
+ "StaticProfiler::LocalizationEfficiency": 1.0,
19
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1.0,
20
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 1.0,
21
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 1.0
22
+ }
23
+ },
24
+ "Sum": {
25
+ "compiletime": {
26
+ "AGOrderingAnalysisPass": 0.018404245376586914,
27
+ "AffinePredicateResolution": 0.0012030601501464844,
28
+ "AliasDependencyElimination": 0.00032448768615722656,
29
+ "AliasDependencyInduction": 0.006628751754760742,
30
+ "AliasDependencyReset": 0.04145312309265137,
31
+ "BFComputeCutting": 0.001703023910522461,
32
+ "BirCodeGenLoop": 0.21541404724121094,
33
+ "CCOpFusion": 0.017800331115722656,
34
+ "CanonicalizeConv": 1.5999999959603883e-05,
35
+ "CanonicalizeDAGForPGTiling": 0.004300355911254883,
36
+ "CanonicalizeForTensorizer": 5.0000002374872565e-05,
37
+ "CanonicalizeIR": 0.0014739036560058594,
38
+ "Canonicalizer": 0.000981000019237399,
39
+ "CoalesceCCOp": 0.006689786911010742,
40
+ "CommuteConcat": 0.0008399486541748047,
41
+ "DMALocalityOpt": 0.0020704269409179688,
42
+ "DMAProfiler": 0.0075225830078125,
43
+ "DMATilingProfiler": 0.04760599136352539,
44
+ "DataLocalityOpt": 0.12700510025024414,
45
+ "DataStreaming": 0.006787538528442383,
46
+ "DeConcat": 0.00067138671875,
47
+ "DeadCodeElimination": 0.0010039806365966797,
48
+ "DeadStoreElimination": 0.006083250045776367,
49
+ "DelinearIndices": 0.004990339279174805,
50
+ "Delinearization": 0.0024585723876953125,
51
+ "DoNothing": 0.0002791881561279297,
52
+ "DramToDramTranspose": 0.01742863655090332,
53
+ "DumpGraphAndMetadata": 0.022494077682495117,
54
+ "EliminateDivs": 0.0031578540802001953,
55
+ "ExpandBatchNorm": 0.0016427040100097656,
56
+ "ExpandISAMacro": 0.005059242248535156,
57
+ "FactorizeBlkDims": 0.016604900360107422,
58
+ "FactorizeThreadAxesInFreeDims": 0.0011897087097167969,
59
+ "FlattenMacroLoop": 0.0030732154846191406,
60
+ "GenericAccessSimplifier": 0.0007944107055664063,
61
+ "HoistCompute": 1.4000000192027073e-05,
62
+ "IdentifyCrossPassTensors": 3.899999865097925e-05,
63
+ "InferInitValue": 0.08164691925048828,
64
+ "InferIntrinsicOnCC": 0.00891733169555664,
65
+ "InferNeuronTensor": 0.021415233612060547,
66
+ "InferNonlocalTensors": 0.020433664321899414,
67
+ "InferPSumTensor": 0.06488466262817383,
68
+ "InlineNativeKernels": 0.002740621566772461,
69
+ "InsertIOTransposes": 0.01563739776611328,
70
+ "InsertLocalTransposes": 0.004204511642456055,
71
+ "InsertOffloadedTransposes": 0.0030889511108398438,
72
+ "LICM": 0.0025815963745117188,
73
+ "LateLegalizeInst": 0.007286548614501953,
74
+ "LateLegalizePostSplit": 0.004240512847900391,
75
+ "LateLowerReshapeOp": 0.0011327266693115234,
76
+ "LateLowerTensorOp": 0.0026912689208984375,
77
+ "LateNeuronInstComb": 0.060045719146728516,
78
+ "LayoutPreprocessing": 0.0245664119720459,
79
+ "LayoutPreprocessingAndAnalysis": 0.07019805908203125,
80
+ "LayoutRequirementAnalysis": 0.004818916320800781,
81
+ "LegalizeCCOpLayout": 0.0020072460174560547,
82
+ "LegalizeOpLevelAlias": 0.0011057853698730469,
83
+ "LegalizePartitionReduce": 0.0015609264373779297,
84
+ "LegalizeSundaAccess": 0.0679481029510498,
85
+ "LegalizeSundaMacro": 0.018359661102294922,
86
+ "LegalizeType": 0.006083011627197266,
87
+ "LocalLayoutOpt": 0.014153003692626953,
88
+ "LoopFusion": 0.005875825881958008,
89
+ "LoopSplitting": 0.0004029273986816406,
90
+ "LowerBroadcast": 0.0018463134765625,
91
+ "LowerCCOpBlockAxis": 0.003938198089599609,
92
+ "LowerComplexBroadcast": 0.004299163818359375,
93
+ "LowerIntrinsics": 0.26036787033081055,
94
+ "LowerTensorOp": 0.010293006896972656,
95
+ "LowerTranspose": 0.009520769119262695,
96
+ "MacroGeneration": 0.03194475173950195,
97
+ "MaskPropagation": 0.0027740001678466797,
98
+ "MemcastMotion": 1.1000000085914508e-05,
99
+ "MemcpyElimination": 0.02656841278076172,
100
+ "MutateDataType": 0.0012118816375732422,
101
+ "NeuronAliasDependencyInduction": 0.0003993511199951172,
102
+ "NeuronAliasDependencyReset": 0.017134428024291992,
103
+ "NeuronInstComb": 0.005475759506225586,
104
+ "NeuronLICM": 0.013751506805419922,
105
+ "NeuronLoopFusion": 0.011570215225219727,
106
+ "NeuronLoopInterchange": 0.002625703811645508,
107
+ "NeuronSimplifier": 0.012082099914550781,
108
+ "NeuronSimplifyPredicates": 0.006382942199707031,
109
+ "NeuronValueNumbering": 0.005650520324707031,
110
+ "OptimizeAliasedCopyChain": 0.0005443096160888672,
111
+ "OptimizeNKIKernels": 0.6168310642242432,
112
+ "PAGLayoutOpt": 0.10135579109191895,
113
+ "PComputeCutting": 0.004828453063964844,
114
+ "PGLayoutTilingPipeline": 0.6252057552337646,
115
+ "PGTiling": 0.1626896858215332,
116
+ "PadElimination": 0.00044655799865722656,
117
+ "ParAxesAnnotation": 0.07457947731018066,
118
+ "PartialLoopFusion": 0.047167062759399414,
119
+ "PartialSimdFusion": 0.006536245346069336,
120
+ "PenguinizeFunctions": 4.3000000005122274e-05,
121
+ "PerfectLoopNest": 0.003509998321533203,
122
+ "PruneFunctions": 1.8999999156221747e-05,
123
+ "RecognizeOpIdiom": 0.0035545825958251953,
124
+ "Recompute": 0.00031566619873046875,
125
+ "RelaxPredicates": 0.004423379898071289,
126
+ "Rematerialization": 0.001989126205444336,
127
+ "RemoveOptimizationBarriers": 4.400000034365803e-05,
128
+ "ReshapeWeights": 0.0011234283447265625,
129
+ "ResolveAccessConflict": 0.003659963607788086,
130
+ "ResolveComplicatePredicates": 0.0010039806365966797,
131
+ "RewriteReplicationMatmul": 0.0014824867248535156,
132
+ "RewriteWeights": 0.0052568912506103516,
133
+ "SFKVectorizer": 0.18593907356262207,
134
+ "ScatterMotion": 7.000000096013537e-06,
135
+ "SimpleAllReduceTiling": 0.0033435821533203125,
136
+ "Simplifier": 0.003216266632080078,
137
+ "SimplifyMacroPredicates": 0.005851030349731445,
138
+ "SimplifyNeuronTensor": 0.2244417667388916,
139
+ "SimplifySlice": 0.0011301040649414063,
140
+ "SimplifyTensor": 0.009292364120483398,
141
+ "SpillPSum": 0.01206660270690918,
142
+ "SplitAPUnionSets": 0.03329944610595703,
143
+ "SplitAccGrp": 0.0012478828430175781,
144
+ "StaticProfiler": 0.005914926528930664,
145
+ "StaticTransposeLocalTensor": 0.0035593509674072266,
146
+ "SundaISel": 0.09800505638122559,
147
+ "TCTransform": 0.0008752346038818359,
148
+ "TensorInitialization": 0.004550933837890625,
149
+ "TensorOpSimplifier": 0.004429817199707031,
150
+ "TensorOpTransform": 0.035314083099365234,
151
+ "TensorizerLegalizationPass": 4.70000013592653e-05,
152
+ "TileCCOps": 0.006109952926635742,
153
+ "TilingProfiler": 0.0074079036712646484,
154
+ "TransformConvOp": 0.0027616024017333984,
155
+ "TritiumFusion": 0.07009291648864746,
156
+ "ValueNumbering": 0.0019168853759765625,
157
+ "VectorizeDMA": 0.002183198928833008,
158
+ "VectorizeMatMult": 0.0026412010192871094,
159
+ "VerifySupportedOps": 3.9999998989515007e-05,
160
+ "WeightCoalescing": 0.003337383270263672,
161
+ "ZeroSizeTensorElimination": 0.00018453598022460938,
162
+ "algsimp": 0.002238999819383025,
163
+ "batchnorm_expander": 4.099999932805076e-05,
164
+ "boundary-marker-removal": 1.1000000085914508e-05,
165
+ "call-inliner": 0.00045200000749900937,
166
+ "canonicalize-boundary-marker": 1.4999999621068127e-05,
167
+ "collective-stream-id-checker": 7.100000220816582e-05,
168
+ "comparison-expander": 0.0004180000105407089,
169
+ "computation-deduplicator": 6.399999983841553e-05,
170
+ "conditional-to-select": 1.5999999959603883e-05,
171
+ "config-lowering": 0.00016199999663513154,
172
+ "constant-statistics": 0.0004290000069886446,
173
+ "constant_folding": 0.0003000000142492354,
174
+ "cse": 3.9999998989515007e-05,
175
+ "dce": 8.099999831756577e-05,
176
+ "dot_decomposer": 0.0011439999798312783,
177
+ "dynamic-slice-transpose": 1.3999999282532372e-05,
178
+ "eliminate-redundant-compare": 0.0002579999854788184,
179
+ "emit-offloaded-dropout": 6.70000008540228e-05,
180
+ "flatten-call-graph": 0.0006840000278316438,
181
+ "fuse-send-recv": 6.70000008540228e-05,
182
+ "hilo::LegalizeAlias": 1.3999999282532372e-05,
183
+ "hilo::NeuronInstCombine": 0.00015199999324977398,
184
+ "hilo::NeuronOpFusion": 2.700000004551839e-05,
185
+ "hilo::ReplaceTokenTypeWithU8Pass": 4.3000000005122274e-05,
186
+ "hilo::ScheduleFusion": 2.9000000722589903e-05,
187
+ "hilo::SixtyFourHack": 8.199999865610152e-05,
188
+ "hilo::VerifyAliasing": 6.000000212225132e-06,
189
+ "hlo-mac-count": 0.0011340000201016665,
190
+ "hlo-verifier": 0.006370999850332737,
191
+ "instruction-histogram": 0.001025000005029142,
192
+ "io-con-pipe-begin": 3.999999989900971e-06,
193
+ "io-con-pipe-end": 9.999999974752427e-07,
194
+ "io-layout-normalization": 0.00108299998100847,
195
+ "io-statistics": 9.600000339560211e-05,
196
+ "legalize-ccops": 3.999999989900971e-06,
197
+ "legalize-compare": 1.1999999514955562e-05,
198
+ "lower-argminmax-custom-call": 9.000000318337698e-06,
199
+ "map-inline": 0.00063699996098876,
200
+ "metadata-naming": 4.8000001697801054e-05,
201
+ "mlir::detail::OpToOpPassAdaptor": 7.599999662488699e-05,
202
+ "mlir::hlo::MhloToPyPenguin": 0.0729999989271164,
203
+ "mlir::mhlo::LowerComplexExtraPass": 0.0003330000035930425,
204
+ "mlir::mhlo::LowerComplexPass": 0.00041099998634308577,
205
+ "native-to-custom-softmax": 0.0005629999795928597,
206
+ "native-to-custom-softmax-dx": 0.000522000016644597,
207
+ "operand_upcaster": 7.500000356230885e-05,
208
+ "opt-barrier-removal": 0.0004529999860096723,
209
+ "post-par-pipe-begin": 4.999999873689376e-06,
210
+ "post-par-pipe-end": 0.0,
211
+ "post-partition-simplification": 0.001686000032350421,
212
+ "pre-par-pipe-begin": 9.999999974752427e-07,
213
+ "pre-par-pipe-end": 0.0,
214
+ "pre-partition-simplification": 0.11061400175094604,
215
+ "replace-minimum-constant": 0.00034699999378062785,
216
+ "reshape-mover": 0.00011300000187475234,
217
+ "simplify-concat": 0.000155999994603917,
218
+ "simplify-while-loops": 9.800000407267362e-05,
219
+ "transform-variadic-reduce": 7.700000423938036e-05,
220
+ "tuple-simplifier": 0.00028600002406165004,
221
+ "unpack-nested-aws-ntwsr": 0.000311999989207834,
222
+ "unroll-while-loop": 1.8000000636675395e-05,
223
+ "zero_sized_hlo_elimination": 0.000653000024612993
224
+ },
225
+ "hilo": {
226
+ "ConstantSize": 1057749.0,
227
+ "HloInputCount": 359.0,
228
+ "HloMacCount": 113883742208.0,
229
+ "HloOutputCount": 65.0,
230
+ "IfmapSize": 7785164800.0,
231
+ "OfmapSize": 536870912.0,
232
+ "OutputsReadFromCount": 0.0,
233
+ "PassthroughTensorsCount": 0.0,
234
+ "RedundantOutputCount": 0.0,
235
+ "Traffic": 795959680.0
236
+ },
237
+ "tensorizer": {
238
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 11124.0,
239
+ "StaticProfiler::AifUb": 291.8241882324219,
240
+ "StaticProfiler::ArithmeticIntensityTensorizer": 280.5914306640625,
241
+ "StaticProfiler::AverageDmaLength": 1148.8248291015625,
242
+ "StaticProfiler::DDRTransferBytes": 331583520.0,
243
+ "StaticProfiler::InternalTransferBytes": 24224788.0,
244
+ "StaticProfiler::LoadExpanded": 293906.0,
245
+ "StaticProfiler::StoreExpanded": 3627.0,
246
+ "StaticProfiler::TotalDMAExpanded": 297533.0,
247
+ "StaticProfiler::TotalDynamicInstancesCount": 13170.0,
248
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 13074.0,
249
+ "StaticProfiler::TotalLNCComm": 0.0,
250
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
251
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
252
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
253
+ "TilingProfiler::GenericInstructionsAfterTiling": 4.0,
254
+ "TilingProfiler::MatMultInstructionsAfterTiling": 9472.0,
255
+ "TilingProfiler::NumPfTransposes": 4.0,
256
+ "TilingProfiler::NumPfTransposesForIo": 0.0,
257
+ "TilingProfiler::NumPfTransposesForLocal": 1.0,
258
+ "TilingProfiler::NumPfTransposesForNonlocal": 3.0,
259
+ "TilingProfiler::PfTransposeInstructions": 385.0,
260
+ "TilingProfiler::PfTransposeInstructionsForIo": 0.0,
261
+ "TilingProfiler::PfTransposeInstructionsForLocal": 1.0,
262
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 384.0,
263
+ "TilingProfiler::ReduceInstructionsAfterTiling": 4.0,
264
+ "TilingProfiler::SimdInstructionsAfterTiling": 166.0,
265
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
266
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
267
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
268
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
269
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
270
+ "TransformConvOp::conv2d_column_packing": 0.0,
271
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
272
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
273
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
274
+ }
275
+ },
276
+ "all": {
277
+ "compiletime": {
278
+ "algsimp": 0.0020379999186843634,
279
+ "call-inliner": 0.0004239999980200082,
280
+ "collective-stream-id-checker": 5.8000001445179805e-05,
281
+ "comparison-expander": 0.0004030000127386302,
282
+ "constant-statistics": 0.0004290000069886446,
283
+ "constant_folding": 0.0002739999908953905,
284
+ "dce": 7.79999973019585e-05,
285
+ "dot_decomposer": 0.0011439999798312783,
286
+ "eliminate-redundant-compare": 0.0002479999966453761,
287
+ "flatten-call-graph": 0.0006559999892488122,
288
+ "hlo-mac-count": 0.0008549999911338091,
289
+ "hlo-verifier": 0.005834000185132027,
290
+ "instruction-histogram": 0.001025000005029142,
291
+ "io-con-pipe-begin": 3.999999989900971e-06,
292
+ "io-con-pipe-end": 9.999999974752427e-07,
293
+ "io-layout-normalization": 0.00108299998100847,
294
+ "io-statistics": 9.600000339560211e-05,
295
+ "map-inline": 0.0006009999779053032,
296
+ "native-to-custom-softmax": 0.0005319999763742089,
297
+ "native-to-custom-softmax-dx": 0.0003640000068116933,
298
+ "opt-barrier-removal": 0.0004529999860096723,
299
+ "pre-par-pipe-begin": 9.999999974752427e-07,
300
+ "pre-par-pipe-end": 0.0,
301
+ "pre-partition-simplification": 0.11061400175094604,
302
+ "replace-minimum-constant": 0.0003220000071451068,
303
+ "reshape-mover": 0.00010199999815085903,
304
+ "simplify-while-loops": 9.200000204145908e-05,
305
+ "tuple-simplifier": 0.00027200000477023423,
306
+ "unpack-nested-aws-ntwsr": 0.0003009999927598983,
307
+ "unroll-while-loop": 1.8000000636675395e-05,
308
+ "zero_sized_hlo_elimination": 0.000653000024612993
309
+ }
310
+ },
311
+ "cumsum": {
312
+ "compiletime": {
313
+ "CoalesceCCOp": 0.0002663135528564453,
314
+ "DMALocalityOpt": 0.00022459030151367188,
315
+ "DMAProfiler": 0.0010578632354736328,
316
+ "DataStreaming": 0.0003883838653564453,
317
+ "DoNothing": 0.00015854835510253906,
318
+ "ExpandISAMacro": 0.0006091594696044922,
319
+ "FactorizeBlkDims": 0.00047659873962402344,
320
+ "InferPSumTensor": 0.0010039806365966797,
321
+ "LateLegalizeInst": 0.0005145072937011719,
322
+ "LateNeuronInstComb": 0.0005576610565185547,
323
+ "LegalizeSundaAccess": 0.0016772747039794922,
324
+ "LegalizeType": 0.00028705596923828125,
325
+ "LowerBroadcast": 0.0002655982971191406,
326
+ "LowerIntrinsics": 0.0002892017364501953,
327
+ "LowerTranspose": 0.0002830028533935547,
328
+ "NeuronInstComb": 0.0005538463592529297,
329
+ "NeuronLICM": 0.00041985511779785156,
330
+ "NeuronSimplifyPredicates": 0.003001689910888672,
331
+ "NeuronValueNumbering": 0.0004851818084716797,
332
+ "SFKVectorizer": 0.003008604049682617,
333
+ "SimpleAllReduceTiling": 0.00024580955505371094,
334
+ "SimplifyNeuronTensor": 0.0004801750183105469,
335
+ "SpillPSum": 0.0005950927734375,
336
+ "WeightCoalescing": 0.00028634071350097656
337
+ }
338
+ },
339
+ "sg00": {
340
+ "compiletime": {
341
+ "CanonicalizeConv": 7.999999979801942e-06,
342
+ "CanonicalizeForTensorizer": 1.4999999621068127e-05,
343
+ "Canonicalizer": 0.00033999999868683517,
344
+ "HoistCompute": 3.000000106112566e-06,
345
+ "IdentifyCrossPassTensors": 1.2000000424450263e-05,
346
+ "MemcastMotion": 3.999999989900971e-06,
347
+ "PenguinizeFunctions": 1.4999999621068127e-05,
348
+ "PruneFunctions": 4.999999873689376e-06,
349
+ "RemoveOptimizationBarriers": 1.4000000192027073e-05,
350
+ "ScatterMotion": 0.0,
351
+ "TensorizerLegalizationPass": 2.2000000171829015e-05,
352
+ "VerifySupportedOps": 1.1000000085914508e-05,
353
+ "algsimp": 6.800000119255856e-05,
354
+ "batchnorm_expander": 1.4000000192027073e-05,
355
+ "boundary-marker-removal": 3.000000106112566e-06,
356
+ "call-inliner": 7.999999979801942e-06,
357
+ "canonicalize-boundary-marker": 4.999999873689376e-06,
358
+ "collective-stream-id-checker": 1.9999999949504854e-06,
359
+ "comparison-expander": 3.999999989900971e-06,
360
+ "computation-deduplicator": 1.4000000192027073e-05,
361
+ "conditional-to-select": 4.999999873689376e-06,
362
+ "config-lowering": 5.2999999752501026e-05,
363
+ "constant_folding": 7.999999979801942e-06,
364
+ "cse": 1.2999999853491317e-05,
365
+ "dce": 9.999999974752427e-07,
366
+ "dynamic-slice-transpose": 4.999999873689376e-06,
367
+ "eliminate-redundant-compare": 3.000000106112566e-06,
368
+ "emit-offloaded-dropout": 2.2000000171829015e-05,
369
+ "flatten-call-graph": 9.000000318337698e-06,
370
+ "fuse-send-recv": 1.9999999494757503e-05,
371
+ "hilo::LegalizeAlias": 4.999999873689376e-06,
372
+ "hilo::NeuronInstCombine": 8.900000102585182e-05,
373
+ "hilo::NeuronOpFusion": 1.8000000636675395e-05,
374
+ "hilo::ReplaceTokenTypeWithU8Pass": 1.4999999621068127e-05,
375
+ "hilo::ScheduleFusion": 6.000000212225132e-06,
376
+ "hilo::SixtyFourHack": 1.1000000085914508e-05,
377
+ "hilo::VerifyAliasing": 1.9999999949504854e-06,
378
+ "hlo-mac-count": 3.7999998312443495e-05,
379
+ "hlo-verifier": 0.00014899999951012433,
380
+ "legalize-ccops": 1.9999999949504854e-06,
381
+ "legalize-compare": 3.999999989900971e-06,
382
+ "lower-argminmax-custom-call": 3.000000106112566e-06,
383
+ "map-inline": 1.1000000085914508e-05,
384
+ "metadata-naming": 1.4999999621068127e-05,
385
+ "mlir::detail::OpToOpPassAdaptor": 2.099999983329326e-05,
386
+ "mlir::hlo::MhloToPyPenguin": 0.03171300143003464,
387
+ "mlir::mhlo::LowerComplexExtraPass": 0.00011700000322889537,
388
+ "mlir::mhlo::LowerComplexPass": 0.00014899999951012433,
389
+ "native-to-custom-softmax": 1.1000000085914508e-05,
390
+ "native-to-custom-softmax-dx": 9.100000170292333e-05,
391
+ "operand_upcaster": 2.4000000848900527e-05,
392
+ "post-par-pipe-begin": 9.999999974752427e-07,
393
+ "post-par-pipe-end": 0.0,
394
+ "post-partition-simplification": 0.0005540000274777412,
395
+ "replace-minimum-constant": 7.999999979801942e-06,
396
+ "reshape-mover": 3.999999989900971e-06,
397
+ "simplify-concat": 5.199999941396527e-05,
398
+ "simplify-while-loops": 1.9999999949504854e-06,
399
+ "transform-variadic-reduce": 9.000000318337698e-06,
400
+ "tuple-simplifier": 3.999999989900971e-06,
401
+ "unpack-nested-aws-ntwsr": 3.000000106112566e-06,
402
+ "unroll-while-loop": 0.0
403
+ },
404
+ "hilo": {
405
+ "ArithmeticIntensity": 111.23420715332031,
406
+ "ConstantSize": 1057749.0,
407
+ "HloInputCount": 359.0,
408
+ "HloMacCount": 11811160064.0,
409
+ "HloOutputCount": 65.0,
410
+ "IfmapSize": 7785164800.0,
411
+ "OfmapSize": 536870912.0,
412
+ "OutputsReadFromCount": 0.0,
413
+ "PassthroughTensorsCount": 0.0,
414
+ "RedundantOutputCount": 0.0,
415
+ "Traffic": 212365600.0
416
+ }
417
+ },
418
+ "sg0000": {
419
+ "compiletime": {
420
+ "AGOrderingAnalysisPass": 0.027149438858032227,
421
+ "AffinePredicateResolution": 0.0014154911041259766,
422
+ "AliasDependencyElimination": 0.00019693374633789063,
423
+ "AliasDependencyInduction": 0.008726119995117188,
424
+ "AliasDependencyReset": 0.11011934280395508,
425
+ "BFComputeCutting": 0.002534627914428711,
426
+ "BirCodeGenLoop": 0.04940390586853027,
427
+ "CCOpFusion": 0.03955817222595215,
428
+ "CanonicalizeDAGForPGTiling": 0.0033364295959472656,
429
+ "CanonicalizeIR": 0.0017902851104736328,
430
+ "CoalesceCCOp": 0.004707813262939453,
431
+ "CommuteConcat": 0.0011563301086425781,
432
+ "DMALocalityOpt": 0.0012772083282470703,
433
+ "DMAProfiler": 0.00436711311340332,
434
+ "DMATilingProfiler": 0.0046231746673583984,
435
+ "DataLocalityOpt": 0.12483954429626465,
436
+ "DataStreaming": 0.004174470901489258,
437
+ "DeConcat": 0.001886606216430664,
438
+ "DeadCodeElimination": 0.0011522769927978516,
439
+ "DeadStoreElimination": 0.040479183197021484,
440
+ "DelinearIndices": 0.006760358810424805,
441
+ "Delinearization": 0.003210306167602539,
442
+ "DoNothing": 0.00013256072998046875,
443
+ "DramToDramTranspose": 0.024677276611328125,
444
+ "DumpGraphAndMetadata": 0.012624502182006836,
445
+ "EliminateDivs": 0.0033044815063476563,
446
+ "ExpandBatchNorm": 0.0016093254089355469,
447
+ "ExpandISAMacro": 0.002683877944946289,
448
+ "FactorizeBlkDims": 0.02043914794921875,
449
+ "FactorizeThreadAxesInFreeDims": 0.0014965534210205078,
450
+ "FlattenMacroLoop": 0.005756378173828125,
451
+ "GenericAccessSimplifier": 0.0009303092956542969,
452
+ "InferInitValue": 0.057647705078125,
453
+ "InferIntrinsicOnCC": 0.009967327117919922,
454
+ "InferNeuronTensor": 0.037697553634643555,
455
+ "InferNonlocalTensors": 0.19877171516418457,
456
+ "InferPSumTensor": 0.05426788330078125,
457
+ "InlineNativeKernels": 0.0014045238494873047,
458
+ "InsertIOTransposes": 0.012149333953857422,
459
+ "InsertLocalTransposes": 0.006654024124145508,
460
+ "InsertOffloadedTransposes": 0.042548418045043945,
461
+ "LICM": 0.004395484924316406,
462
+ "LateLegalizeInst": 0.006590127944946289,
463
+ "LateLegalizePostSplit": 0.0027794837951660156,
464
+ "LateLowerReshapeOp": 0.021605253219604492,
465
+ "LateLowerTensorOp": 0.02600717544555664,
466
+ "LateNeuronInstComb": 0.03907942771911621,
467
+ "LayoutPreprocessing": 0.07296419143676758,
468
+ "LayoutPreprocessingAndAnalysis": 0.09938645362854004,
469
+ "LayoutRequirementAnalysis": 0.013081550598144531,
470
+ "LegalizeCCOpLayout": 0.002216339111328125,
471
+ "LegalizeOpLevelAlias": 0.001260519027709961,
472
+ "LegalizePartitionReduce": 0.0013256072998046875,
473
+ "LegalizeSundaAccess": 0.057663917541503906,
474
+ "LegalizeSundaMacro": 0.007729530334472656,
475
+ "LegalizeType": 0.003155231475830078,
476
+ "LocalLayoutOpt": 0.05669593811035156,
477
+ "LoopFusion": 0.007777690887451172,
478
+ "LoopSplitting": 0.0003819465637207031,
479
+ "LowerBroadcast": 0.0032148361206054688,
480
+ "LowerCCOpBlockAxis": 0.004834890365600586,
481
+ "LowerComplexBroadcast": 0.003167390823364258,
482
+ "LowerIntrinsics": 0.025065183639526367,
483
+ "LowerTensorOp": 0.011690616607666016,
484
+ "LowerTranspose": 0.026025772094726563,
485
+ "MacroGeneration": 0.19495820999145508,
486
+ "MaskPropagation": 0.00906229019165039,
487
+ "MemcpyElimination": 0.16968393325805664,
488
+ "MutateDataType": 0.0013375282287597656,
489
+ "NeuronAliasDependencyInduction": 0.00027489662170410156,
490
+ "NeuronAliasDependencyReset": 0.049456119537353516,
491
+ "NeuronInstComb": 0.013614177703857422,
492
+ "NeuronLICM": 0.007915735244750977,
493
+ "NeuronLoopFusion": 0.020767688751220703,
494
+ "NeuronLoopInterchange": 0.0029578208923339844,
495
+ "NeuronSimplifier": 0.015317201614379883,
496
+ "NeuronSimplifyPredicates": 0.005019426345825195,
497
+ "NeuronValueNumbering": 0.0039823055267333984,
498
+ "OptimizeAliasedCopyChain": 0.0007677078247070313,
499
+ "OptimizeNKIKernels": 0.0017440319061279297,
500
+ "PAGLayoutOpt": 0.6516931056976318,
501
+ "PComputeCutting": 0.008367061614990234,
502
+ "PGLayoutTilingPipeline": 1.5081679821014404,
503
+ "PGTiling": 0.33455348014831543,
504
+ "PadElimination": 0.0004961490631103516,
505
+ "ParAxesAnnotation": 0.39901041984558105,
506
+ "PartialLoopFusion": 0.024018049240112305,
507
+ "PartialSimdFusion": 0.012691020965576172,
508
+ "PerfectLoopNest": 0.001743316650390625,
509
+ "RecognizeOpIdiom": 0.004468679428100586,
510
+ "Recompute": 0.0003910064697265625,
511
+ "RelaxPredicates": 0.0034019947052001953,
512
+ "Rematerialization": 0.006110429763793945,
513
+ "ReshapeWeights": 0.0015015602111816406,
514
+ "ResolveAccessConflict": 0.0035288333892822266,
515
+ "ResolveComplicatePredicates": 0.00148773193359375,
516
+ "RewriteReplicationMatmul": 0.0015423297882080078,
517
+ "RewriteWeights": 0.0054814815521240234,
518
+ "SFKVectorizer": 0.27728796005249023,
519
+ "SimpleAllReduceTiling": 0.0017549991607666016,
520
+ "Simplifier": 0.00467371940612793,
521
+ "SimplifyMacroPredicates": 0.01883554458618164,
522
+ "SimplifyNeuronTensor": 0.03848075866699219,
523
+ "SimplifySlice": 0.0010480880737304688,
524
+ "SimplifyTensor": 0.0058362483978271484,
525
+ "SpillPSum": 0.017147064208984375,
526
+ "SplitAPUnionSets": 0.02936863899230957,
527
+ "SplitAccGrp": 0.0024695396423339844,
528
+ "StaticProfiler": 0.004050016403198242,
529
+ "StaticTransposeLocalTensor": 0.004538059234619141,
530
+ "SundaISel": 0.10911178588867188,
531
+ "TCTransform": 0.0014197826385498047,
532
+ "TensorInitialization": 0.009113788604736328,
533
+ "TensorOpSimplifier": 0.00628209114074707,
534
+ "TensorOpTransform": 0.027051925659179688,
535
+ "TileCCOps": 0.025897979736328125,
536
+ "TilingProfiler": 0.010443687438964844,
537
+ "TransformConvOp": 0.0024106502532958984,
538
+ "TritiumFusion": 0.15155267715454102,
539
+ "ValueNumbering": 0.0042994022369384766,
540
+ "VectorizeDMA": 0.006690025329589844,
541
+ "VectorizeMatMult": 0.06819844245910645,
542
+ "WeightCoalescing": 0.003052949905395508,
543
+ "ZeroSizeTensorElimination": 0.00017309188842773438
544
+ },
545
+ "tensorizer": {
546
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 2978.0,
547
+ "StaticProfiler::AifUb": 120.88131713867188,
548
+ "StaticProfiler::ArithmeticIntensityTensorizer": 409.4889831542969,
549
+ "StaticProfiler::AverageDmaLength": 1928.296630859375,
550
+ "StaticProfiler::AverageFractalPeUtilization": 99.95079803466797,
551
+ "StaticProfiler::AveragePartitionUtilization": 99.86166381835938,
552
+ "StaticProfiler::AveragePeUtilization": 99.79874420166016,
553
+ "StaticProfiler::DDRTransferBytes": 64557320.0,
554
+ "StaticProfiler::InternalTransferBytes": 44957696.0,
555
+ "StaticProfiler::LoadExpanded": 23043.0,
556
+ "StaticProfiler::LocalizationEfficiency": 338.7529296875,
557
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 438.4559631347656,
558
+ "StaticProfiler::StoreExpanded": 5505.0,
559
+ "StaticProfiler::TotalDMAExpanded": 28548.0,
560
+ "StaticProfiler::TotalDynamicInstancesCount": 3406.0,
561
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 3403.0,
562
+ "StaticProfiler::TotalLNCComm": 0.0,
563
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
564
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
565
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
566
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
567
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
568
+ "TilingProfiler::GenericInstructionsAfterTiling": 48.0,
569
+ "TilingProfiler::MatMultInstructionsAfterTiling": 1796.0,
570
+ "TilingProfiler::NumPfTransposes": 5.0,
571
+ "TilingProfiler::NumPfTransposesForIo": 1.0,
572
+ "TilingProfiler::NumPfTransposesForLocal": 3.0,
573
+ "TilingProfiler::NumPfTransposesForNonlocal": 1.0,
574
+ "TilingProfiler::PfTransposeInstructions": 528.0,
575
+ "TilingProfiler::PfTransposeInstructionsForIo": 128.0,
576
+ "TilingProfiler::PfTransposeInstructionsForLocal": 336.0,
577
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 64.0,
578
+ "TilingProfiler::ReduceInstructionsAfterTiling": 0.0,
579
+ "TilingProfiler::SimdInstructionsAfterTiling": 240.0,
580
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
581
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
582
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
583
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
584
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
585
+ "TransformConvOp::conv2d_column_packing": 0.0,
586
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
587
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
588
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
589
+ }
590
+ },
591
+ "sg0001": {
592
+ "compiletime": {
593
+ "AGOrderingAnalysisPass": 0.11623430252075195,
594
+ "AffinePredicateResolution": 0.0013043880462646484,
595
+ "AliasDependencyElimination": 0.0002994537353515625,
596
+ "AliasDependencyInduction": 0.008661746978759766,
597
+ "AliasDependencyReset": 0.10846662521362305,
598
+ "BFComputeCutting": 0.002373218536376953,
599
+ "BirCodeGenLoop": 0.03558611869812012,
600
+ "CCOpFusion": 0.07521343231201172,
601
+ "CanonicalizeDAGForPGTiling": 0.003203153610229492,
602
+ "CanonicalizeIR": 0.001676797866821289,
603
+ "CoalesceCCOp": 0.00453639030456543,
604
+ "CommuteConcat": 0.0011301040649414063,
605
+ "DMALocalityOpt": 0.001081228256225586,
606
+ "DMAProfiler": 0.004008054733276367,
607
+ "DMATilingProfiler": 0.005870342254638672,
608
+ "DataLocalityOpt": 0.28061389923095703,
609
+ "DataStreaming": 0.003554821014404297,
610
+ "DeConcat": 0.0023250579833984375,
611
+ "DeadCodeElimination": 0.0011475086212158203,
612
+ "DeadStoreElimination": 0.03395438194274902,
613
+ "DelinearIndices": 0.008394002914428711,
614
+ "Delinearization": 0.0034415721893310547,
615
+ "DoNothing": 0.00012373924255371094,
616
+ "DramToDramTranspose": 0.08242917060852051,
617
+ "DumpGraphAndMetadata": 0.013036966323852539,
618
+ "EliminateDivs": 0.003282785415649414,
619
+ "ExpandBatchNorm": 0.001613616943359375,
620
+ "ExpandISAMacro": 0.0023279190063476563,
621
+ "FactorizeBlkDims": 0.013802051544189453,
622
+ "FactorizeThreadAxesInFreeDims": 0.002902507781982422,
623
+ "FlattenMacroLoop": 0.003271341323852539,
624
+ "GenericAccessSimplifier": 0.0010917186737060547,
625
+ "InferInitValue": 0.07010388374328613,
626
+ "InferIntrinsicOnCC": 0.009655475616455078,
627
+ "InferNeuronTensor": 0.13204073905944824,
628
+ "InferNonlocalTensors": 0.07003545761108398,
629
+ "InferPSumTensor": 0.02702808380126953,
630
+ "InlineNativeKernels": 0.0015347003936767578,
631
+ "InsertIOTransposes": 0.06269145011901855,
632
+ "InsertLocalTransposes": 0.03624296188354492,
633
+ "InsertOffloadedTransposes": 0.0031883716583251953,
634
+ "LICM": 0.03521370887756348,
635
+ "LateLegalizeInst": 0.004965066909790039,
636
+ "LateLegalizePostSplit": 0.002466440200805664,
637
+ "LateLowerReshapeOp": 0.0013051033020019531,
638
+ "LateLowerTensorOp": 0.005318164825439453,
639
+ "LateNeuronInstComb": 0.052803993225097656,
640
+ "LayoutPreprocessing": 0.09067535400390625,
641
+ "LayoutPreprocessingAndAnalysis": 0.11781954765319824,
642
+ "LayoutRequirementAnalysis": 0.007211446762084961,
643
+ "LegalizeCCOpLayout": 0.0017924308776855469,
644
+ "LegalizeOpLevelAlias": 0.0011508464813232422,
645
+ "LegalizePartitionReduce": 0.0017359256744384766,
646
+ "LegalizeSundaAccess": 0.015620946884155273,
647
+ "LegalizeSundaMacro": 0.018268585205078125,
648
+ "LegalizeType": 0.0046215057373046875,
649
+ "LocalLayoutOpt": 0.043848276138305664,
650
+ "LoopFusion": 0.0060999393463134766,
651
+ "LoopSplitting": 0.0003905296325683594,
652
+ "LowerBroadcast": 0.0017726421356201172,
653
+ "LowerCCOpBlockAxis": 0.004726886749267578,
654
+ "LowerComplexBroadcast": 0.0022859573364257813,
655
+ "LowerIntrinsics": 0.03918600082397461,
656
+ "LowerTensorOp": 0.010795831680297852,
657
+ "LowerTranspose": 0.05237984657287598,
658
+ "MacroGeneration": 0.09652948379516602,
659
+ "MaskPropagation": 0.0031325817108154297,
660
+ "MemcpyElimination": 0.17755985260009766,
661
+ "MutateDataType": 0.0014410018920898438,
662
+ "NeuronAliasDependencyInduction": 0.0003838539123535156,
663
+ "NeuronAliasDependencyReset": 0.013648271560668945,
664
+ "NeuronInstComb": 0.013708114624023438,
665
+ "NeuronLICM": 0.007151603698730469,
666
+ "NeuronLoopFusion": 0.025348663330078125,
667
+ "NeuronLoopInterchange": 0.001642465591430664,
668
+ "NeuronSimplifier": 0.02402472496032715,
669
+ "NeuronSimplifyPredicates": 0.001932382583618164,
670
+ "NeuronValueNumbering": 0.0032088756561279297,
671
+ "OptimizeAliasedCopyChain": 0.0006577968597412109,
672
+ "OptimizeNKIKernels": 0.0014371871948242188,
673
+ "PAGLayoutOpt": 0.4579346179962158,
674
+ "PComputeCutting": 0.01232600212097168,
675
+ "PGLayoutTilingPipeline": 1.3759241104125977,
676
+ "PGTiling": 0.3235757350921631,
677
+ "PadElimination": 0.00040984153747558594,
678
+ "ParAxesAnnotation": 0.3829355239868164,
679
+ "PartialLoopFusion": 0.013192892074584961,
680
+ "PartialSimdFusion": 0.04269862174987793,
681
+ "PerfectLoopNest": 0.00379180908203125,
682
+ "RecognizeOpIdiom": 0.008028745651245117,
683
+ "Recompute": 0.00033402442932128906,
684
+ "RelaxPredicates": 0.002958059310913086,
685
+ "Rematerialization": 0.002005338668823242,
686
+ "ReshapeWeights": 0.0014390945434570313,
687
+ "ResolveAccessConflict": 0.004492282867431641,
688
+ "ResolveComplicatePredicates": 0.0012967586517333984,
689
+ "RewriteReplicationMatmul": 0.002475738525390625,
690
+ "RewriteWeights": 0.005257844924926758,
691
+ "SFKVectorizer": 0.19685626029968262,
692
+ "SimpleAllReduceTiling": 0.0014569759368896484,
693
+ "Simplifier": 0.004656791687011719,
694
+ "SimplifyMacroPredicates": 0.006787300109863281,
695
+ "SimplifyNeuronTensor": 0.006094932556152344,
696
+ "SimplifySlice": 0.0012688636779785156,
697
+ "SimplifyTensor": 0.0069119930267333984,
698
+ "SpillPSum": 0.020178794860839844,
699
+ "SplitAPUnionSets": 0.016407012939453125,
700
+ "SplitAccGrp": 0.0013232231140136719,
701
+ "StaticProfiler": 0.003663778305053711,
702
+ "StaticTransposeLocalTensor": 0.004960536956787109,
703
+ "SundaISel": 0.04571676254272461,
704
+ "TCTransform": 0.001123189926147461,
705
+ "TensorInitialization": 0.002666473388671875,
706
+ "TensorOpSimplifier": 0.006299495697021484,
707
+ "TensorOpTransform": 0.031577110290527344,
708
+ "TileCCOps": 0.0058362483978271484,
709
+ "TilingProfiler": 0.05374789237976074,
710
+ "TransformConvOp": 0.0022611618041992188,
711
+ "TritiumFusion": 0.11932778358459473,
712
+ "ValueNumbering": 0.0037584304809570313,
713
+ "VectorizeDMA": 0.0016014575958251953,
714
+ "VectorizeMatMult": 0.007041454315185547,
715
+ "WeightCoalescing": 0.0025255680084228516,
716
+ "ZeroSizeTensorElimination": 0.0001938343048095703
717
+ },
718
+ "tensorizer": {
719
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 8708.0,
720
+ "StaticProfiler::AifUb": 437.5911865234375,
721
+ "StaticProfiler::ArithmeticIntensityTensorizer": 488.4932556152344,
722
+ "StaticProfiler::AverageDmaLength": 845.4234619140625,
723
+ "StaticProfiler::AverageFractalPeUtilization": 100.0,
724
+ "StaticProfiler::AveragePartitionUtilization": 99.8521957397461,
725
+ "StaticProfiler::AveragePeUtilization": 100.0,
726
+ "StaticProfiler::DDRTransferBytes": 240992256.0,
727
+ "StaticProfiler::InternalTransferBytes": 43515904.0,
728
+ "StaticProfiler::LoadExpanded": 275585.0,
729
+ "StaticProfiler::LocalizationEfficiency": 111.63233184814453,
730
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 119.98538208007813,
731
+ "StaticProfiler::StoreExpanded": 5121.0,
732
+ "StaticProfiler::TotalDMAExpanded": 280706.0,
733
+ "StaticProfiler::TotalDynamicInstancesCount": 10829.0,
734
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 10829.0,
735
+ "StaticProfiler::TotalLNCComm": 0.0,
736
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
737
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
738
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
739
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
740
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
741
+ "TilingProfiler::GenericInstructionsAfterTiling": 32.0,
742
+ "TilingProfiler::MatMultInstructionsAfterTiling": 6784.0,
743
+ "TilingProfiler::NumPfTransposes": 7.0,
744
+ "TilingProfiler::NumPfTransposesForIo": 3.0,
745
+ "TilingProfiler::NumPfTransposesForLocal": 2.0,
746
+ "TilingProfiler::NumPfTransposesForNonlocal": 2.0,
747
+ "TilingProfiler::PfTransposeInstructions": 664.0,
748
+ "TilingProfiler::PfTransposeInstructionsForIo": 136.0,
749
+ "TilingProfiler::PfTransposeInstructionsForLocal": 272.0,
750
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 256.0,
751
+ "TilingProfiler::ReduceInstructionsAfterTiling": 0.0,
752
+ "TilingProfiler::SimdInstructionsAfterTiling": 279.0,
753
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
754
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
755
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
756
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
757
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
758
+ "TransformConvOp::conv2d_column_packing": 0.0,
759
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
760
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
761
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
762
+ }
763
+ },
764
+ "sg0002": {
765
+ "compiletime": {
766
+ "AGOrderingAnalysisPass": 0.018404245376586914,
767
+ "AffinePredicateResolution": 0.0012030601501464844,
768
+ "AliasDependencyElimination": 0.00032448768615722656,
769
+ "AliasDependencyInduction": 0.006628751754760742,
770
+ "AliasDependencyReset": 0.04145312309265137,
771
+ "BFComputeCutting": 0.001703023910522461,
772
+ "BirCodeGenLoop": 0.21541404724121094,
773
+ "CCOpFusion": 0.017800331115722656,
774
+ "CanonicalizeDAGForPGTiling": 0.004300355911254883,
775
+ "CanonicalizeIR": 0.0014739036560058594,
776
+ "CoalesceCCOp": 0.006423473358154297,
777
+ "CommuteConcat": 0.0008399486541748047,
778
+ "DMALocalityOpt": 0.0018458366394042969,
779
+ "DMAProfiler": 0.006464719772338867,
780
+ "DMATilingProfiler": 0.04760599136352539,
781
+ "DataLocalityOpt": 0.12700510025024414,
782
+ "DataStreaming": 0.0063991546630859375,
783
+ "DeConcat": 0.00067138671875,
784
+ "DeadCodeElimination": 0.0010039806365966797,
785
+ "DeadStoreElimination": 0.006083250045776367,
786
+ "DelinearIndices": 0.004990339279174805,
787
+ "Delinearization": 0.0024585723876953125,
788
+ "DoNothing": 0.00012063980102539063,
789
+ "DramToDramTranspose": 0.01742863655090332,
790
+ "DumpGraphAndMetadata": 0.022494077682495117,
791
+ "EliminateDivs": 0.0031578540802001953,
792
+ "ExpandBatchNorm": 0.0016427040100097656,
793
+ "ExpandISAMacro": 0.004450082778930664,
794
+ "FactorizeBlkDims": 0.0161283016204834,
795
+ "FactorizeThreadAxesInFreeDims": 0.0011897087097167969,
796
+ "FlattenMacroLoop": 0.0030732154846191406,
797
+ "GenericAccessSimplifier": 0.0007944107055664063,
798
+ "InferInitValue": 0.08164691925048828,
799
+ "InferIntrinsicOnCC": 0.00891733169555664,
800
+ "InferNeuronTensor": 0.021415233612060547,
801
+ "InferNonlocalTensors": 0.020433664321899414,
802
+ "InferPSumTensor": 0.06388068199157715,
803
+ "InlineNativeKernels": 0.002740621566772461,
804
+ "InsertIOTransposes": 0.01563739776611328,
805
+ "InsertLocalTransposes": 0.004204511642456055,
806
+ "InsertOffloadedTransposes": 0.0030889511108398438,
807
+ "LICM": 0.0025815963745117188,
808
+ "LateLegalizeInst": 0.006772041320800781,
809
+ "LateLegalizePostSplit": 0.004240512847900391,
810
+ "LateLowerReshapeOp": 0.0011327266693115234,
811
+ "LateLowerTensorOp": 0.0026912689208984375,
812
+ "LateNeuronInstComb": 0.05948805809020996,
813
+ "LayoutPreprocessing": 0.0245664119720459,
814
+ "LayoutPreprocessingAndAnalysis": 0.07019805908203125,
815
+ "LayoutRequirementAnalysis": 0.004818916320800781,
816
+ "LegalizeCCOpLayout": 0.0020072460174560547,
817
+ "LegalizeOpLevelAlias": 0.0011057853698730469,
818
+ "LegalizePartitionReduce": 0.0015609264373779297,
819
+ "LegalizeSundaAccess": 0.06627082824707031,
820
+ "LegalizeSundaMacro": 0.018359661102294922,
821
+ "LegalizeType": 0.005795955657958984,
822
+ "LocalLayoutOpt": 0.014153003692626953,
823
+ "LoopFusion": 0.005875825881958008,
824
+ "LoopSplitting": 0.0004029273986816406,
825
+ "LowerBroadcast": 0.0015807151794433594,
826
+ "LowerCCOpBlockAxis": 0.003938198089599609,
827
+ "LowerComplexBroadcast": 0.004299163818359375,
828
+ "LowerIntrinsics": 0.26007866859436035,
829
+ "LowerTensorOp": 0.010293006896972656,
830
+ "LowerTranspose": 0.00923776626586914,
831
+ "MacroGeneration": 0.03194475173950195,
832
+ "MaskPropagation": 0.0027740001678466797,
833
+ "MemcpyElimination": 0.02656841278076172,
834
+ "MutateDataType": 0.0012118816375732422,
835
+ "NeuronAliasDependencyInduction": 0.0003993511199951172,
836
+ "NeuronAliasDependencyReset": 0.017134428024291992,
837
+ "NeuronInstComb": 0.004921913146972656,
838
+ "NeuronLICM": 0.01333165168762207,
839
+ "NeuronLoopFusion": 0.011570215225219727,
840
+ "NeuronLoopInterchange": 0.002625703811645508,
841
+ "NeuronSimplifier": 0.012082099914550781,
842
+ "NeuronSimplifyPredicates": 0.0033812522888183594,
843
+ "NeuronValueNumbering": 0.0051653385162353516,
844
+ "OptimizeAliasedCopyChain": 0.0005443096160888672,
845
+ "OptimizeNKIKernels": 0.6168310642242432,
846
+ "PAGLayoutOpt": 0.10135579109191895,
847
+ "PComputeCutting": 0.004828453063964844,
848
+ "PGLayoutTilingPipeline": 0.6252057552337646,
849
+ "PGTiling": 0.1626896858215332,
850
+ "PadElimination": 0.00044655799865722656,
851
+ "ParAxesAnnotation": 0.07457947731018066,
852
+ "PartialLoopFusion": 0.047167062759399414,
853
+ "PartialSimdFusion": 0.006536245346069336,
854
+ "PerfectLoopNest": 0.003509998321533203,
855
+ "RecognizeOpIdiom": 0.0035545825958251953,
856
+ "Recompute": 0.00031566619873046875,
857
+ "RelaxPredicates": 0.004423379898071289,
858
+ "Rematerialization": 0.001989126205444336,
859
+ "ReshapeWeights": 0.0011234283447265625,
860
+ "ResolveAccessConflict": 0.003659963607788086,
861
+ "ResolveComplicatePredicates": 0.0010039806365966797,
862
+ "RewriteReplicationMatmul": 0.0014824867248535156,
863
+ "RewriteWeights": 0.0052568912506103516,
864
+ "SFKVectorizer": 0.18293046951293945,
865
+ "SimpleAllReduceTiling": 0.0030977725982666016,
866
+ "Simplifier": 0.003216266632080078,
867
+ "SimplifyMacroPredicates": 0.005851030349731445,
868
+ "SimplifyNeuronTensor": 0.22396159172058105,
869
+ "SimplifySlice": 0.0011301040649414063,
870
+ "SimplifyTensor": 0.009292364120483398,
871
+ "SpillPSum": 0.01147150993347168,
872
+ "SplitAPUnionSets": 0.03329944610595703,
873
+ "SplitAccGrp": 0.0012478828430175781,
874
+ "StaticProfiler": 0.005914926528930664,
875
+ "StaticTransposeLocalTensor": 0.0035593509674072266,
876
+ "SundaISel": 0.09800505638122559,
877
+ "TCTransform": 0.0008752346038818359,
878
+ "TensorInitialization": 0.004550933837890625,
879
+ "TensorOpSimplifier": 0.004429817199707031,
880
+ "TensorOpTransform": 0.035314083099365234,
881
+ "TileCCOps": 0.006109952926635742,
882
+ "TilingProfiler": 0.0074079036712646484,
883
+ "TransformConvOp": 0.0027616024017333984,
884
+ "TritiumFusion": 0.07009291648864746,
885
+ "ValueNumbering": 0.0019168853759765625,
886
+ "VectorizeDMA": 0.002183198928833008,
887
+ "VectorizeMatMult": 0.0026412010192871094,
888
+ "WeightCoalescing": 0.0030510425567626953,
889
+ "ZeroSizeTensorElimination": 0.00018453598022460938
890
+ },
891
+ "tensorizer": {
892
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 11124.0,
893
+ "StaticProfiler::AifUb": 291.8241882324219,
894
+ "StaticProfiler::ArithmeticIntensityTensorizer": 280.5914306640625,
895
+ "StaticProfiler::AverageDmaLength": 1148.8248291015625,
896
+ "StaticProfiler::AverageFractalPeUtilization": 99.68099975585938,
897
+ "StaticProfiler::AveragePartitionUtilization": 99.27783203125,
898
+ "StaticProfiler::AveragePeUtilization": 98.72872924804688,
899
+ "StaticProfiler::DDRTransferBytes": 331583520.0,
900
+ "StaticProfiler::InternalTransferBytes": 24224788.0,
901
+ "StaticProfiler::LoadExpanded": 293906.0,
902
+ "StaticProfiler::LocalizationEfficiency": 96.15084838867188,
903
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 102.69681549072266,
904
+ "StaticProfiler::StoreExpanded": 3627.0,
905
+ "StaticProfiler::TotalDMAExpanded": 297533.0,
906
+ "StaticProfiler::TotalDynamicInstancesCount": 13170.0,
907
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 13074.0,
908
+ "StaticProfiler::TotalLNCComm": 0.0,
909
+ "StaticProfiler::TotalLNCCommTransfer": 0.0,
910
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0,
911
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0,
912
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0,
913
+ "TilingProfiler::DmaInstructionsAfterTiling": 0.0,
914
+ "TilingProfiler::GenericInstructionsAfterTiling": 4.0,
915
+ "TilingProfiler::MatMultInstructionsAfterTiling": 9472.0,
916
+ "TilingProfiler::NumPfTransposes": 4.0,
917
+ "TilingProfiler::NumPfTransposesForIo": 0.0,
918
+ "TilingProfiler::NumPfTransposesForLocal": 1.0,
919
+ "TilingProfiler::NumPfTransposesForNonlocal": 3.0,
920
+ "TilingProfiler::PfTransposeInstructions": 385.0,
921
+ "TilingProfiler::PfTransposeInstructionsForIo": 0.0,
922
+ "TilingProfiler::PfTransposeInstructionsForLocal": 1.0,
923
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 384.0,
924
+ "TilingProfiler::ReduceInstructionsAfterTiling": 4.0,
925
+ "TilingProfiler::SimdInstructionsAfterTiling": 166.0,
926
+ "TilingProfiler::TotalInstructionsAfterTiling": 0.0,
927
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0,
928
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0,
929
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0,
930
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0,
931
+ "TransformConvOp::conv2d_column_packing": 0.0,
932
+ "TransformConvOp::conv2d_column_packing_1": 0.0,
933
+ "TransformConvOp::conv2d_column_packing_io10": 0.0,
934
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0
935
+ }
936
+ },
937
+ "sg01": {
938
+ "compiletime": {
939
+ "CanonicalizeConv": 7.000000096013537e-06,
940
+ "CanonicalizeForTensorizer": 1.4000000192027073e-05,
941
+ "Canonicalizer": 0.0002880000101868063,
942
+ "HoistCompute": 1.1000000085914508e-05,
943
+ "IdentifyCrossPassTensors": 1.1000000085914508e-05,
944
+ "MemcastMotion": 7.000000096013537e-06,
945
+ "PenguinizeFunctions": 1.2999999853491317e-05,
946
+ "PruneFunctions": 4.999999873689376e-06,
947
+ "RemoveOptimizationBarriers": 1.2000000424450263e-05,
948
+ "ScatterMotion": 6.000000212225132e-06,
949
+ "TensorizerLegalizationPass": 1.4999999621068127e-05,
950
+ "VerifySupportedOps": 1.1000000085914508e-05,
951
+ "algsimp": 6.600000051548705e-05,
952
+ "batchnorm_expander": 1.4000000192027073e-05,
953
+ "boundary-marker-removal": 3.999999989900971e-06,
954
+ "call-inliner": 9.000000318337698e-06,
955
+ "canonicalize-boundary-marker": 4.999999873689376e-06,
956
+ "collective-stream-id-checker": 6.000000212225132e-06,
957
+ "comparison-expander": 4.999999873689376e-06,
958
+ "computation-deduplicator": 2.499999936844688e-05,
959
+ "conditional-to-select": 4.999999873689376e-06,
960
+ "config-lowering": 4.8999998398358e-05,
961
+ "constant_folding": 9.000000318337698e-06,
962
+ "cse": 1.2999999853491317e-05,
963
+ "dce": 9.999999974752427e-07,
964
+ "dynamic-slice-transpose": 3.999999989900971e-06,
965
+ "eliminate-redundant-compare": 3.999999989900971e-06,
966
+ "emit-offloaded-dropout": 2.2000000171829015e-05,
967
+ "flatten-call-graph": 7.999999979801942e-06,
968
+ "fuse-send-recv": 2.5999999706982635e-05,
969
+ "hilo::LegalizeAlias": 4.999999873689376e-06,
970
+ "hilo::NeuronInstCombine": 5.0999999075429514e-05,
971
+ "hilo::NeuronOpFusion": 4.999999873689376e-06,
972
+ "hilo::ReplaceTokenTypeWithU8Pass": 1.5999999959603883e-05,
973
+ "hilo::ScheduleFusion": 6.000000212225132e-06,
974
+ "hilo::SixtyFourHack": 9.999999747378752e-06,
975
+ "hilo::VerifyAliasing": 1.9999999949504854e-06,
976
+ "hlo-mac-count": 3.5000000934815034e-05,
977
+ "hlo-verifier": 0.00019299999985378236,
978
+ "legalize-ccops": 9.999999974752427e-07,
979
+ "legalize-compare": 4.999999873689376e-06,
980
+ "lower-argminmax-custom-call": 3.000000106112566e-06,
981
+ "map-inline": 1.2000000424450263e-05,
982
+ "metadata-naming": 1.8000000636675395e-05,
983
+ "mlir::detail::OpToOpPassAdaptor": 2.300000051036477e-05,
984
+ "mlir::hlo::MhloToPyPenguin": 0.02390900067985058,
985
+ "mlir::mhlo::LowerComplexExtraPass": 0.0001140000022132881,
986
+ "mlir::mhlo::LowerComplexPass": 0.0002209999947808683,
987
+ "native-to-custom-softmax": 9.999999747378752e-06,
988
+ "native-to-custom-softmax-dx": 2.9999999242136255e-05,
989
+ "operand_upcaster": 3.300000025774352e-05,
990
+ "post-par-pipe-begin": 1.9999999949504854e-06,
991
+ "post-par-pipe-end": 0.0,
992
+ "post-partition-simplification": 0.0005729999975301325,
993
+ "replace-minimum-constant": 7.000000096013537e-06,
994
+ "reshape-mover": 3.999999989900971e-06,
995
+ "simplify-concat": 5.400000009103678e-05,
996
+ "simplify-while-loops": 1.9999999949504854e-06,
997
+ "transform-variadic-reduce": 9.000000318337698e-06,
998
+ "tuple-simplifier": 4.999999873689376e-06,
999
+ "unpack-nested-aws-ntwsr": 3.000000106112566e-06,
1000
+ "unroll-while-loop": 0.0
1001
+ },
1002
+ "hilo": {
1003
+ "ArithmeticIntensity": 436.7417907714844,
1004
+ "HloMacCount": 56908316672.0,
1005
+ "Traffic": 260603936.0
1006
+ }
1007
+ },
1008
+ "sg02": {
1009
+ "compiletime": {
1010
+ "CanonicalizeConv": 9.999999974752427e-07,
1011
+ "CanonicalizeForTensorizer": 2.099999983329326e-05,
1012
+ "Canonicalizer": 0.0003530000103637576,
1013
+ "HoistCompute": 0.0,
1014
+ "IdentifyCrossPassTensors": 1.5999999959603883e-05,
1015
+ "MemcastMotion": 0.0,
1016
+ "PenguinizeFunctions": 1.4999999621068127e-05,
1017
+ "PruneFunctions": 9.000000318337698e-06,
1018
+ "RemoveOptimizationBarriers": 1.8000000636675395e-05,
1019
+ "ScatterMotion": 9.999999974752427e-07,
1020
+ "TensorizerLegalizationPass": 9.999999747378752e-06,
1021
+ "VerifySupportedOps": 1.8000000636675395e-05,
1022
+ "algsimp": 6.70000008540228e-05,
1023
+ "batchnorm_expander": 1.2999999853491317e-05,
1024
+ "boundary-marker-removal": 3.999999989900971e-06,
1025
+ "call-inliner": 1.1000000085914508e-05,
1026
+ "canonicalize-boundary-marker": 4.999999873689376e-06,
1027
+ "collective-stream-id-checker": 4.999999873689376e-06,
1028
+ "comparison-expander": 6.000000212225132e-06,
1029
+ "computation-deduplicator": 2.499999936844688e-05,
1030
+ "conditional-to-select": 6.000000212225132e-06,
1031
+ "config-lowering": 5.999999848427251e-05,
1032
+ "constant_folding": 9.000000318337698e-06,
1033
+ "cse": 1.4000000192027073e-05,
1034
+ "dce": 9.999999974752427e-07,
1035
+ "dynamic-slice-transpose": 4.999999873689376e-06,
1036
+ "eliminate-redundant-compare": 3.000000106112566e-06,
1037
+ "emit-offloaded-dropout": 2.300000051036477e-05,
1038
+ "flatten-call-graph": 1.1000000085914508e-05,
1039
+ "fuse-send-recv": 2.099999983329326e-05,
1040
+ "hilo::LegalizeAlias": 3.999999989900971e-06,
1041
+ "hilo::NeuronInstCombine": 1.2000000424450263e-05,
1042
+ "hilo::NeuronOpFusion": 3.999999989900971e-06,
1043
+ "hilo::ReplaceTokenTypeWithU8Pass": 1.2000000424450263e-05,
1044
+ "hilo::ScheduleFusion": 1.700000029813964e-05,
1045
+ "hilo::SixtyFourHack": 6.0999998822808266e-05,
1046
+ "hilo::VerifyAliasing": 1.9999999949504854e-06,
1047
+ "hlo-mac-count": 0.00020599999697878957,
1048
+ "hlo-verifier": 0.00019500000053085387,
1049
+ "legalize-ccops": 9.999999974752427e-07,
1050
+ "legalize-compare": 3.000000106112566e-06,
1051
+ "lower-argminmax-custom-call": 3.000000106112566e-06,
1052
+ "map-inline": 1.2999999853491317e-05,
1053
+ "metadata-naming": 1.4999999621068127e-05,
1054
+ "mlir::detail::OpToOpPassAdaptor": 3.199999991920777e-05,
1055
+ "mlir::hlo::MhloToPyPenguin": 0.017378000542521477,
1056
+ "mlir::mhlo::LowerComplexExtraPass": 0.00010199999815085903,
1057
+ "mlir::mhlo::LowerComplexPass": 4.099999932805076e-05,
1058
+ "native-to-custom-softmax": 9.999999747378752e-06,
1059
+ "native-to-custom-softmax-dx": 3.7000001611886546e-05,
1060
+ "operand_upcaster": 1.8000000636675395e-05,
1061
+ "post-par-pipe-begin": 1.9999999949504854e-06,
1062
+ "post-par-pipe-end": 0.0,
1063
+ "post-partition-simplification": 0.0005590000073425472,
1064
+ "replace-minimum-constant": 9.999999747378752e-06,
1065
+ "reshape-mover": 3.000000106112566e-06,
1066
+ "simplify-concat": 4.999999873689376e-05,
1067
+ "simplify-while-loops": 1.9999999949504854e-06,
1068
+ "transform-variadic-reduce": 5.900000178371556e-05,
1069
+ "tuple-simplifier": 4.999999873689376e-06,
1070
+ "unpack-nested-aws-ntwsr": 4.999999873689376e-06,
1071
+ "unroll-while-loop": 0.0
1072
+ },
1073
+ "hilo": {
1074
+ "ArithmeticIntensity": 279.6634216308594,
1075
+ "HloMacCount": 45164265472.0,
1076
+ "Traffic": 322990144.0
1077
+ }
1078
+ }
1079
+ }
context_encoding_model/_tp0_bk2/log-neuron-cc.txt ADDED
The diff for this file is too large to render. See raw diff
 
context_encoding_model/_tp0_bk2/neuron_config.json ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation_autoset": false,
3
+ "_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
4
+ "add_cross_attention": false,
5
+ "architectures": [
6
+ "MistralForCausalLM"
7
+ ],
8
+ "attention_dropout": 0.0,
9
+ "attribute_map": {},
10
+ "bad_words_ids": null,
11
+ "begin_suppress_tokens": null,
12
+ "bos_token_id": 1,
13
+ "chunk_size_feed_forward": 0,
14
+ "cross_attention_hidden_size": null,
15
+ "decoder_start_token_id": null,
16
+ "diversity_penalty": 0.0,
17
+ "do_sample": false,
18
+ "early_stopping": false,
19
+ "encoder_no_repeat_ngram_size": 0,
20
+ "eos_token_id": 2,
21
+ "exponential_decay_length_penalty": null,
22
+ "finetuning_task": null,
23
+ "forced_bos_token_id": null,
24
+ "forced_eos_token_id": null,
25
+ "fused_spec_config": null,
26
+ "head_dim": 128,
27
+ "hidden_act": "silu",
28
+ "hidden_size": 4096,
29
+ "id2label": {
30
+ "0": "LABEL_0",
31
+ "1": "LABEL_1"
32
+ },
33
+ "initializer_range": 0.02,
34
+ "intermediate_size": 14336,
35
+ "is_decoder": false,
36
+ "is_encoder_decoder": false,
37
+ "label2id": {
38
+ "LABEL_0": 0,
39
+ "LABEL_1": 1
40
+ },
41
+ "length_penalty": 1.0,
42
+ "max_length": 20,
43
+ "max_position_embeddings": 32768,
44
+ "metadata": null,
45
+ "min_length": 0,
46
+ "model_type": "mistral",
47
+ "neuron_config": {
48
+ "activation_quantization_type": null,
49
+ "allow_input_truncation": false,
50
+ "apply_seq_ids_mask": false,
51
+ "async_mode": false,
52
+ "attention_dp_degree": 1,
53
+ "attention_dtype": null,
54
+ "attn_block_cte_nki_kernel_enabled": false,
55
+ "attn_block_tkg_nki_kernel_cache_update": false,
56
+ "attn_block_tkg_nki_kernel_enabled": false,
57
+ "attn_cls": "NeuronLlamaAttention",
58
+ "attn_kernel_enabled": null,
59
+ "attn_tkg_builtin_kernel_enabled": false,
60
+ "attn_tkg_nki_kernel_enabled": false,
61
+ "batch_size": 1,
62
+ "bucket_n_active_tokens": true,
63
+ "buckets": [
64
+ 512
65
+ ],
66
+ "cast_type": "config",
67
+ "cc_pipeline_tiling_factor": 2,
68
+ "chunked_prefill_config": null,
69
+ "context_encoding_buckets": [
70
+ 512
71
+ ],
72
+ "cp_degree": 1,
73
+ "ctx_batch_size": 1,
74
+ "disable_kv_cache_tiling": false,
75
+ "draft_model_modules_to_not_convert": null,
76
+ "enable_bucketing": true,
77
+ "enable_eagle_draft_input_norm": false,
78
+ "enable_eagle_speculation": false,
79
+ "enable_fused_speculation": false,
80
+ "enable_long_context_mode": false,
81
+ "enable_output_completion_notifications": false,
82
+ "enable_spill_reload_dge": false,
83
+ "enable_token_tree": false,
84
+ "ep_degree": 1,
85
+ "expert_mlp_nki_kernel_enabled": null,
86
+ "flash_decoding_enabled": false,
87
+ "fused_qkv": false,
88
+ "fused_rmsnorm_skip_gamma": false,
89
+ "is_block_kv_layout": null,
90
+ "is_chunked_prefill": false,
91
+ "is_continuous_batching": true,
92
+ "is_eagle_draft": false,
93
+ "is_medusa": false,
94
+ "is_prefill_stage": true,
95
+ "is_prefix_caching": false,
96
+ "k_cache_transposed": false,
97
+ "kv_cache_batch_size": 4,
98
+ "kv_cache_padding_size": 0,
99
+ "kv_cache_quant": false,
100
+ "kv_cache_tiling": false,
101
+ "layer_boundary_markers": false,
102
+ "lm_head_pad": false,
103
+ "lm_head_pad_alignment_size": 1,
104
+ "local_ranks_size": 2,
105
+ "logical_nc_config": 1,
106
+ "lora_config": null,
107
+ "max_batch_size": 4,
108
+ "max_context_length": 2048,
109
+ "max_length": 2048,
110
+ "max_new_tokens": null,
111
+ "medusa_speculation_length": 0,
112
+ "medusa_tree": null,
113
+ "mlp_kernel_enabled": false,
114
+ "mlp_kernel_fuse_residual_add": false,
115
+ "modules_to_not_convert": null,
116
+ "moe_fused_nki_kernel_enabled": null,
117
+ "n_active_tokens": 2048,
118
+ "n_positions": 2048,
119
+ "num_medusa_heads": 0,
120
+ "on_cpu": false,
121
+ "on_device_sampling_config": {
122
+ "deterministic": false,
123
+ "do_sample": false,
124
+ "dynamic": true,
125
+ "global_topk": 256,
126
+ "on_device_sampling_config": true,
127
+ "temperature": 1.0,
128
+ "top_k": 1,
129
+ "top_k_kernel_enabled": false,
130
+ "top_p": 1.0
131
+ },
132
+ "output_logits": false,
133
+ "overrides_torch_dtype": true,
134
+ "pa_block_size": 2048,
135
+ "pa_num_blocks": 4,
136
+ "padding_side": "right",
137
+ "pp_degree": 1,
138
+ "prefix_buckets": null,
139
+ "qk_layernorm": false,
140
+ "qkv_kernel_enabled": false,
141
+ "qkv_kernel_fuse_residual_add": false,
142
+ "qkv_kernel_nbsd_layout": false,
143
+ "quantization_dtype": "int8",
144
+ "quantization_type": "per_tensor_symmetric",
145
+ "quantize_clamp_bound": Infinity,
146
+ "quantized": false,
147
+ "quantized_checkpoints_path": null,
148
+ "quantized_mlp_kernel_enabled": false,
149
+ "rmsnorm_quantize_kernel_enabled": false,
150
+ "router_topk_nki_kernel_enabled": null,
151
+ "rpl_reduce_dtype": null,
152
+ "save_sharded_checkpoint": true,
153
+ "scratchpad_page_size": null,
154
+ "seq_len": 2048,
155
+ "seq_len_threshold_for_cc_tiling": 16384,
156
+ "sequence_parallel_enabled": false,
157
+ "shared_mlp_nki_kernel_enabled": null,
158
+ "skip_sharding": false,
159
+ "skip_warmup": false,
160
+ "spec_batch_size": 4,
161
+ "speculation_length": 0,
162
+ "start_rank_id": 0,
163
+ "target": null,
164
+ "tile_cc": false,
165
+ "tkg_batch_size": 4,
166
+ "token_generation_buckets": null,
167
+ "token_tree_config": null,
168
+ "torch_dtype": "bfloat16",
169
+ "tp_degree": 2,
170
+ "vocab_parallel": false,
171
+ "weight_gather_seq_len_threshold": 32768,
172
+ "weights_to_skip_layout_optimization": [],
173
+ "world_size": 2
174
+ },
175
+ "no_repeat_ngram_size": 0,
176
+ "num_attention_heads": 32,
177
+ "num_beam_groups": 1,
178
+ "num_beams": 1,
179
+ "num_cores_per_group": 1,
180
+ "num_hidden_layers": 32,
181
+ "num_key_value_heads": 8,
182
+ "num_return_sequences": 1,
183
+ "output_attentions": false,
184
+ "output_hidden_states": false,
185
+ "output_scores": false,
186
+ "pad_token_id": 0,
187
+ "prefix": null,
188
+ "problem_type": null,
189
+ "pruned_heads": {},
190
+ "remove_invalid_values": false,
191
+ "repetition_penalty": 1.0,
192
+ "return_dict": true,
193
+ "return_dict_in_generate": false,
194
+ "rms_norm_eps": 1e-05,
195
+ "rope_theta": 1000000.0,
196
+ "sep_token_id": null,
197
+ "sliding_window": null,
198
+ "suppress_tokens": null,
199
+ "task_specific_params": null,
200
+ "temperature": 1.0,
201
+ "tf_legacy_loss": false,
202
+ "tie_encoder_decoder": false,
203
+ "tie_word_embeddings": false,
204
+ "tokenizer_class": null,
205
+ "top_k": 50,
206
+ "top_p": 1.0,
207
+ "torchscript": false,
208
+ "transformers_version": "4.42.0.dev0",
209
+ "typical_p": 1.0,
210
+ "use_bfloat16": false,
211
+ "use_cache": true,
212
+ "vocab_size": 32768
213
+ }
context_encoding_model/_tp0_bk3/command.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ neuronx-cc compile --framework=XLA model.MODULE_2e1f11fbf72d40b46e64+5ae2bfda.hlo_module.pb --output model.MODULE_2e1f11fbf72d40b46e64+5ae2bfda.neff --target=trn1 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ' --lnc=1 -O1 '--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true' --logfile=log-neuron-cc.txt --verbose=35
context_encoding_model/_tp0_bk3/log-neuron-cc.txt ADDED
The diff for this file is too large to render. See raw diff
 
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "transformers_version": "4.42.0.dev0"
6
+ }
neuron_config.json ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation_autoset": false,
3
+ "_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
4
+ "add_cross_attention": false,
5
+ "architectures": [
6
+ "MistralForCausalLM"
7
+ ],
8
+ "attention_dropout": 0.0,
9
+ "attribute_map": {},
10
+ "bad_words_ids": null,
11
+ "begin_suppress_tokens": null,
12
+ "bos_token_id": 1,
13
+ "chunk_size_feed_forward": 0,
14
+ "cross_attention_hidden_size": null,
15
+ "decoder_start_token_id": null,
16
+ "diversity_penalty": 0.0,
17
+ "do_sample": false,
18
+ "early_stopping": false,
19
+ "encoder_no_repeat_ngram_size": 0,
20
+ "eos_token_id": 2,
21
+ "exponential_decay_length_penalty": null,
22
+ "finetuning_task": null,
23
+ "forced_bos_token_id": null,
24
+ "forced_eos_token_id": null,
25
+ "fused_spec_config": null,
26
+ "head_dim": 128,
27
+ "hidden_act": "silu",
28
+ "hidden_size": 4096,
29
+ "id2label": {
30
+ "0": "LABEL_0",
31
+ "1": "LABEL_1"
32
+ },
33
+ "initializer_range": 0.02,
34
+ "intermediate_size": 14336,
35
+ "is_decoder": false,
36
+ "is_encoder_decoder": false,
37
+ "label2id": {
38
+ "LABEL_0": 0,
39
+ "LABEL_1": 1
40
+ },
41
+ "length_penalty": 1.0,
42
+ "max_length": 20,
43
+ "max_position_embeddings": 32768,
44
+ "metadata": null,
45
+ "min_length": 0,
46
+ "model_type": "mistral",
47
+ "neuron_config": {
48
+ "activation_quantization_type": null,
49
+ "allow_input_truncation": false,
50
+ "apply_seq_ids_mask": false,
51
+ "async_mode": false,
52
+ "attention_dp_degree": 1,
53
+ "attention_dtype": null,
54
+ "attn_block_cte_nki_kernel_enabled": false,
55
+ "attn_block_tkg_nki_kernel_cache_update": false,
56
+ "attn_block_tkg_nki_kernel_enabled": false,
57
+ "attn_cls": "NeuronLlamaAttention",
58
+ "attn_kernel_enabled": null,
59
+ "attn_tkg_builtin_kernel_enabled": false,
60
+ "attn_tkg_nki_kernel_enabled": false,
61
+ "batch_size": 4,
62
+ "bucket_n_active_tokens": false,
63
+ "buckets": [
64
+ 2048
65
+ ],
66
+ "cast_type": "config",
67
+ "cc_pipeline_tiling_factor": 2,
68
+ "chunked_prefill_config": null,
69
+ "context_encoding_buckets": null,
70
+ "cp_degree": 1,
71
+ "ctx_batch_size": 1,
72
+ "disable_kv_cache_tiling": false,
73
+ "draft_model_modules_to_not_convert": null,
74
+ "enable_bucketing": true,
75
+ "enable_eagle_draft_input_norm": false,
76
+ "enable_eagle_speculation": false,
77
+ "enable_fused_speculation": false,
78
+ "enable_long_context_mode": false,
79
+ "enable_output_completion_notifications": false,
80
+ "enable_spill_reload_dge": false,
81
+ "enable_token_tree": false,
82
+ "ep_degree": 1,
83
+ "expert_mlp_nki_kernel_enabled": null,
84
+ "flash_decoding_enabled": false,
85
+ "fused_qkv": false,
86
+ "fused_rmsnorm_skip_gamma": false,
87
+ "is_block_kv_layout": null,
88
+ "is_chunked_prefill": false,
89
+ "is_continuous_batching": true,
90
+ "is_eagle_draft": false,
91
+ "is_medusa": false,
92
+ "is_prefill_stage": null,
93
+ "is_prefix_caching": false,
94
+ "k_cache_transposed": false,
95
+ "kv_cache_batch_size": 4,
96
+ "kv_cache_padding_size": 0,
97
+ "kv_cache_quant": false,
98
+ "kv_cache_tiling": false,
99
+ "layer_boundary_markers": false,
100
+ "lm_head_pad": false,
101
+ "lm_head_pad_alignment_size": 1,
102
+ "local_ranks_size": 2,
103
+ "logical_nc_config": 1,
104
+ "lora_config": null,
105
+ "max_batch_size": 4,
106
+ "max_context_length": 2048,
107
+ "max_length": 2048,
108
+ "max_new_tokens": null,
109
+ "medusa_speculation_length": 0,
110
+ "medusa_tree": null,
111
+ "mlp_kernel_enabled": false,
112
+ "mlp_kernel_fuse_residual_add": false,
113
+ "modules_to_not_convert": null,
114
+ "moe_fused_nki_kernel_enabled": null,
115
+ "n_active_tokens": 2048,
116
+ "n_positions": 2048,
117
+ "num_medusa_heads": 0,
118
+ "on_cpu": false,
119
+ "on_device_sampling_config": {
120
+ "deterministic": false,
121
+ "do_sample": false,
122
+ "dynamic": true,
123
+ "global_topk": 256,
124
+ "on_device_sampling_config": true,
125
+ "temperature": 1.0,
126
+ "top_k": 1,
127
+ "top_k_kernel_enabled": false,
128
+ "top_p": 1.0
129
+ },
130
+ "output_logits": false,
131
+ "overrides_torch_dtype": true,
132
+ "pa_block_size": 2048,
133
+ "pa_num_blocks": 4,
134
+ "padding_side": "right",
135
+ "pp_degree": 1,
136
+ "prefix_buckets": null,
137
+ "qk_layernorm": false,
138
+ "qkv_kernel_enabled": false,
139
+ "qkv_kernel_fuse_residual_add": false,
140
+ "qkv_kernel_nbsd_layout": false,
141
+ "quantization_dtype": "int8",
142
+ "quantization_type": "per_tensor_symmetric",
143
+ "quantize_clamp_bound": Infinity,
144
+ "quantized": false,
145
+ "quantized_checkpoints_path": null,
146
+ "quantized_mlp_kernel_enabled": false,
147
+ "rmsnorm_quantize_kernel_enabled": false,
148
+ "router_topk_nki_kernel_enabled": null,
149
+ "rpl_reduce_dtype": null,
150
+ "save_sharded_checkpoint": true,
151
+ "scratchpad_page_size": null,
152
+ "seq_len": 2048,
153
+ "seq_len_threshold_for_cc_tiling": 16384,
154
+ "sequence_parallel_enabled": false,
155
+ "shared_mlp_nki_kernel_enabled": null,
156
+ "skip_sharding": false,
157
+ "skip_warmup": false,
158
+ "spec_batch_size": 4,
159
+ "speculation_length": 0,
160
+ "start_rank_id": 0,
161
+ "target": null,
162
+ "tile_cc": false,
163
+ "tkg_batch_size": 4,
164
+ "token_generation_buckets": null,
165
+ "token_tree_config": null,
166
+ "torch_dtype": "bfloat16",
167
+ "tp_degree": 2,
168
+ "vocab_parallel": false,
169
+ "weight_gather_seq_len_threshold": 32768,
170
+ "weights_to_skip_layout_optimization": [],
171
+ "world_size": 2
172
+ },
173
+ "no_repeat_ngram_size": 0,
174
+ "num_attention_heads": 32,
175
+ "num_beam_groups": 1,
176
+ "num_beams": 1,
177
+ "num_cores_per_group": 1,
178
+ "num_hidden_layers": 32,
179
+ "num_key_value_heads": 8,
180
+ "num_return_sequences": 1,
181
+ "output_attentions": false,
182
+ "output_hidden_states": false,
183
+ "output_scores": false,
184
+ "pad_token_id": null,
185
+ "prefix": null,
186
+ "problem_type": null,
187
+ "pruned_heads": {},
188
+ "remove_invalid_values": false,
189
+ "repetition_penalty": 1.0,
190
+ "return_dict": true,
191
+ "return_dict_in_generate": false,
192
+ "rms_norm_eps": 1e-05,
193
+ "rope_theta": 1000000.0,
194
+ "sep_token_id": null,
195
+ "sliding_window": null,
196
+ "suppress_tokens": null,
197
+ "task_specific_params": null,
198
+ "temperature": 1.0,
199
+ "tf_legacy_loss": false,
200
+ "tie_encoder_decoder": false,
201
+ "tie_word_embeddings": false,
202
+ "tokenizer_class": null,
203
+ "top_k": 50,
204
+ "top_p": 1.0,
205
+ "torchscript": false,
206
+ "transformers_version": "4.42.0.dev0",
207
+ "typical_p": 1.0,
208
+ "use_bfloat16": false,
209
+ "use_cache": true,
210
+ "vocab_size": 32768
211
+ }
params.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dim": 4096,
3
+ "n_layers": 32,
4
+ "head_dim": 128,
5
+ "hidden_dim": 14336,
6
+ "n_heads": 32,
7
+ "n_kv_heads": 8,
8
+ "norm_eps": 1e-05,
9
+ "vocab_size": 32768,
10
+ "rope_theta": 1000000.0
11
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "unk_token": {
17
+ "content": "<unk>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
token_generation_model/_tp0_bk0/command.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ neuronx-cc compile --framework=XLA model.MODULE_67d3774d5bacfe6ba851+72d461cc.hlo_module.pb --output model.MODULE_67d3774d5bacfe6ba851+72d461cc.neff --target=trn1 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma ' --lnc=1 -O2 --internal-hlo2tensorizer-options=--verify-hlo=true --logfile=log-neuron-cc.txt --enable-internal-neff-wrapper --verbose=35
token_generation_model/_tp0_bk0/compile_flags.MODULE_67d3774d5bacfe6ba851+72d461cc.json ADDED
@@ -0,0 +1 @@
 
 
1
+ ["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma ", "--lnc=1", "-O2", "--internal-hlo2tensorizer-options=--verify-hlo=true", "--logfile=/models/mistral-7b-v0.3-instruct-neuronx/token_generation_model/_tp0_bk0/log-neuron-cc.txt", "--enable-internal-neff-wrapper"]
token_generation_model/_tp0_bk0/global_metric_store.json ADDED
@@ -0,0 +1,540 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Average": {
3
+ "tensorizer": {
4
+ "StaticProfiler::AverageFractalPeUtilization": 99.3995361328125,
5
+ "StaticProfiler::AveragePartitionUtilization": 98.55674743652344,
6
+ "StaticProfiler::AveragePeUtilization": 97.5706558227539,
7
+ "StaticProfiler::LocalizationEfficiency": 116.26432037353516,
8
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 116.41236877441406,
9
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0,
10
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0
11
+ }
12
+ },
13
+ "Count": {
14
+ "tensorizer": {
15
+ "StaticProfiler::AverageFractalPeUtilization": 1,
16
+ "StaticProfiler::AveragePartitionUtilization": 1,
17
+ "StaticProfiler::AveragePeUtilization": 1,
18
+ "StaticProfiler::LocalizationEfficiency": 1,
19
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1,
20
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 1,
21
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 1
22
+ }
23
+ },
24
+ "Sum": {
25
+ "compiletime": {
26
+ "AGOrderingAnalysisPass": 1.7505581378936768,
27
+ "AffinePredicateResolution": 0.046631813049316406,
28
+ "AliasDependencyElimination": 0.002095460891723633,
29
+ "AliasDependencyInduction": 0.340775728225708,
30
+ "AliasDependencyReset": 0.35799717903137207,
31
+ "BFComputeCutting": 0.0918271541595459,
32
+ "BirCodeGenLoop": 1.9524388313293457,
33
+ "CCOpFusion": 0.6108810901641846,
34
+ "CanonicalizeConv": 3.999999989900971e-06,
35
+ "CanonicalizeDAGForPGTiling": 0.16879653930664063,
36
+ "CanonicalizeForTensorizer": 0.00046999999904073775,
37
+ "CanonicalizeIR": 0.061083316802978516,
38
+ "Canonicalizer": 0.006062000058591366,
39
+ "CoalesceCCOp": 0.1506364345550537,
40
+ "CommuteConcat": 0.030129432678222656,
41
+ "DMALocalityOpt": 0.028980255126953125,
42
+ "DMAProfiler": 0.07041788101196289,
43
+ "DMATilingProfiler": 0.06604599952697754,
44
+ "DataLocalityOpt": 2.1674563884735107,
45
+ "DataStreaming": 0.10460638999938965,
46
+ "DeConcat": 0.016613245010375977,
47
+ "DeadCodeElimination": 0.03136706352233887,
48
+ "DeadStoreElimination": 1.2213473320007324,
49
+ "DelinearIndices": 0.28975749015808105,
50
+ "Delinearization": 0.18038702011108398,
51
+ "DoNothing": 0.00029587745666503906,
52
+ "DramToDramTranspose": 1.0997114181518555,
53
+ "DumpGraphAndMetadata": 0.15712499618530273,
54
+ "EliminateDivs": 0.14302921295166016,
55
+ "ExpandBatchNorm": 0.05438518524169922,
56
+ "ExpandISAMacro": 0.06612992286682129,
57
+ "FactorizeBlkDims": 0.26380443572998047,
58
+ "FactorizeThreadAxesInFreeDims": 0.03626608848571777,
59
+ "FlattenMacroLoop": 0.0791018009185791,
60
+ "GenericAccessSimplifier": 0.028739452362060547,
61
+ "HoistCompute": 4.999999873689376e-05,
62
+ "IdentifyCrossPassTensors": 0.00021100000594742596,
63
+ "InferInitValue": 1.245818853378296,
64
+ "InferIntrinsicOnCC": 0.3838510513305664,
65
+ "InferNeuronTensor": 1.782684326171875,
66
+ "InferNonlocalTensors": 4.216822624206543,
67
+ "InferPSumTensor": 0.8002936840057373,
68
+ "InlineNativeKernels": 0.04331803321838379,
69
+ "InsertIOTransposes": 0.9620308876037598,
70
+ "InsertLocalTransposes": 0.9075572490692139,
71
+ "InsertOffloadedTransposes": 0.06905245780944824,
72
+ "LICM": 0.09245109558105469,
73
+ "LateLegalizeInst": 0.1371767520904541,
74
+ "LateLegalizePostSplit": 0.07115364074707031,
75
+ "LateLowerReshapeOp": 0.037413835525512695,
76
+ "LateLowerTensorOp": 0.2697789669036865,
77
+ "LateNeuronInstComb": 0.36163973808288574,
78
+ "LayoutPreprocessing": 0.8670649528503418,
79
+ "LayoutPreprocessingAndAnalysis": 1.1549065113067627,
80
+ "LayoutRequirementAnalysis": 0.27439332008361816,
81
+ "LegalizeCCOpLayout": 0.0667877197265625,
82
+ "LegalizeOpLevelAlias": 0.02463841438293457,
83
+ "LegalizePartitionReduce": 0.031178951263427734,
84
+ "LegalizeSundaAccess": 1.0370941162109375,
85
+ "LegalizeSundaMacro": 0.30040812492370605,
86
+ "LegalizeType": 0.15511703491210938,
87
+ "LocalLayoutOpt": 0.5859987735748291,
88
+ "LoopFusion": 0.31392574310302734,
89
+ "LoopSplitting": 0.03730463981628418,
90
+ "LowerBroadcast": 0.043883323669433594,
91
+ "LowerCCOpBlockAxis": 0.21113038063049316,
92
+ "LowerComplexBroadcast": 0.13543224334716797,
93
+ "LowerIntrinsics": 0.5533030033111572,
94
+ "LowerTensorOp": 0.4427661895751953,
95
+ "LowerTranspose": 0.445873498916626,
96
+ "MacroGeneration": 2.473928213119507,
97
+ "MaskPropagation": 0.11900115013122559,
98
+ "MemcastMotion": 0.00013499999477062374,
99
+ "MemcpyElimination": 3.895864963531494,
100
+ "MutateDataType": 0.0398097038269043,
101
+ "NeuronAliasDependencyInduction": 0.018038034439086914,
102
+ "NeuronAliasDependencyReset": 0.03026747703552246,
103
+ "NeuronInstComb": 0.2801692485809326,
104
+ "NeuronLICM": 0.2402355670928955,
105
+ "NeuronLoopFusion": 0.578554630279541,
106
+ "NeuronLoopInterchange": 0.04063010215759277,
107
+ "NeuronSimplifier": 0.338550329208374,
108
+ "NeuronSimplifyPredicates": 0.05692338943481445,
109
+ "NeuronValueNumbering": 0.08049511909484863,
110
+ "OptimizeAliasedCopyChain": 0.010602712631225586,
111
+ "OptimizeNKIKernels": 0.35573911666870117,
112
+ "PAGLayoutOpt": 41.70356750488281,
113
+ "PComputeCutting": 0.31862449645996094,
114
+ "PGLayoutTilingPipeline": 54.89517593383789,
115
+ "PGTiling": 4.955427169799805,
116
+ "PadElimination": 0.008247613906860352,
117
+ "ParAxesAnnotation": 40.78300476074219,
118
+ "PartialLoopFusion": 0.27280497550964355,
119
+ "PartialSimdFusion": 0.23333501815795898,
120
+ "PenguinizeFunctions": 0.00019500000053085387,
121
+ "PerfectLoopNest": 0.054427146911621094,
122
+ "PruneFunctions": 0.0002859999949578196,
123
+ "RecognizeOpIdiom": 0.14151334762573242,
124
+ "Recompute": 0.006689548492431641,
125
+ "RelaxPredicates": 0.11504864692687988,
126
+ "Rematerialization": 0.1454627513885498,
127
+ "RemoveOptimizationBarriers": 0.0001289999927394092,
128
+ "ReshapeWeights": 0.02218008041381836,
129
+ "ResolveAccessConflict": 0.2085890769958496,
130
+ "ResolveComplicatePredicates": 0.04809713363647461,
131
+ "RewriteReplicationMatmul": 0.04198598861694336,
132
+ "RewriteWeights": 0.06269216537475586,
133
+ "SFKVectorizer": 4.823036193847656,
134
+ "ScatterMotion": 0.004162999801337719,
135
+ "SimpleAllReduceTiling": 0.050234317779541016,
136
+ "Simplifier": 0.09822559356689453,
137
+ "SimplifyMacroPredicates": 0.17580604553222656,
138
+ "SimplifyNeuronTensor": 0.3997023105621338,
139
+ "SimplifySlice": 0.030458927154541016,
140
+ "SimplifyTensor": 0.19034838676452637,
141
+ "SpillPSum": 0.750556230545044,
142
+ "SplitAPUnionSets": 0.21661949157714844,
143
+ "SplitAccGrp": 0.034674882888793945,
144
+ "StaticProfiler": 0.1068570613861084,
145
+ "StaticTransposeLocalTensor": 0.23836159706115723,
146
+ "SundaISel": 1.3462426662445068,
147
+ "TCTransform": 0.03340411186218262,
148
+ "TensorInitialization": 0.08121585845947266,
149
+ "TensorOpSimplifier": 0.1906282901763916,
150
+ "TensorOpTransform": 0.768507719039917,
151
+ "TensorizerLegalizationPass": 0.00017600000137463212,
152
+ "TileCCOps": 0.1881403923034668,
153
+ "TilingProfiler": 0.4166131019592285,
154
+ "TransformConvOp": 0.04472208023071289,
155
+ "TritiumFusion": 1.3044919967651367,
156
+ "ValueNumbering": 0.0843496322631836,
157
+ "VectorizeDMA": 0.1174325942993164,
158
+ "VectorizeMatMult": 0.018784761428833008,
159
+ "VerifySupportedOps": 0.00020500000391621143,
160
+ "WeightCoalescing": 0.041903018951416016,
161
+ "ZeroSizeTensorElimination": 0.0003848075866699219,
162
+ "algsimp": 0.0015930000226944685,
163
+ "batchnorm_expander": 0.0005460000247694552,
164
+ "boundary-marker-removal": 0.0002610000083222985,
165
+ "call-inliner": 0.0002479999966453761,
166
+ "canonicalize-boundary-marker": 0.0002809999859891832,
167
+ "collective-stream-id-checker": 4.8000001697801054e-05,
168
+ "comparison-expander": 0.0002800000074785203,
169
+ "computation-deduplicator": 0.0003530000103637576,
170
+ "conditional-to-select": 8.399999933317304e-05,
171
+ "config-lowering": 0.00025499999173916876,
172
+ "constant_folding": 0.00017600000137463212,
173
+ "cse": 0.00037900000461377203,
174
+ "dce": 4.5000000682193786e-05,
175
+ "dynamic-slice-transpose": 0.0001610000035725534,
176
+ "eliminate-redundant-compare": 0.0001610000035725534,
177
+ "emit-offloaded-dropout": 0.0002629999944474548,
178
+ "flatten-call-graph": 0.00025599999935366213,
179
+ "fuse-send-recv": 0.0010339999571442604,
180
+ "hilo::LegalizeAlias": 0.0022499999031424522,
181
+ "hilo::NeuronInstCombine": 0.0009289999725297093,
182
+ "hilo::NeuronOpFusion": 0.00037399999564513564,
183
+ "hilo::ReplaceTokenTypeWithU8Pass": 0.0003389999910723418,
184
+ "hilo::ScheduleFusion": 3.9999998989515007e-05,
185
+ "hilo::SixtyFourHack": 0.00022699999681208283,
186
+ "hilo::VerifyAliasing": 0.00013600000238511711,
187
+ "hlo-mac-count": 0.0006539999740198255,
188
+ "hlo-verifier": 0.004976999945938587,
189
+ "io-con-pipe-begin": 3.000000106112566e-06,
190
+ "io-con-pipe-end": 9.999999974752427e-07,
191
+ "io-layout-normalization": 0.0007140000234358013,
192
+ "legalize-ccops": 2.2000000171829015e-05,
193
+ "legalize-compare": 0.00023799999326001853,
194
+ "lower-argminmax-custom-call": 0.00014899999951012433,
195
+ "map-inline": 0.0005290000117383897,
196
+ "metadata-naming": 0.0007159999804571271,
197
+ "mlir::detail::OpToOpPassAdaptor": 0.00019799999427050352,
198
+ "mlir::hlo::MhloToPyPenguin": 0.04695200175046921,
199
+ "mlir::mhlo::LowerComplexExtraPass": 0.002549000084400177,
200
+ "mlir::mhlo::LowerComplexPass": 0.0030499999411404133,
201
+ "native-to-custom-softmax": 0.0002849999873433262,
202
+ "native-to-custom-softmax-dx": 0.00030399998649954796,
203
+ "operand_upcaster": 0.0005130000063218176,
204
+ "post-par-pipe-begin": 9.999999974752427e-07,
205
+ "post-par-pipe-end": 0.0,
206
+ "post-partition-simplification": 0.05249100178480148,
207
+ "pre-hlo-begin": 1.9999999949504854e-06,
208
+ "pre-hlo-end": 9.999999974752427e-07,
209
+ "replace-minimum-constant": 0.0001740000006975606,
210
+ "reshape-mover": 7.500000356230885e-05,
211
+ "simplify-concat": 0.0017800000496208668,
212
+ "simplify-while-loops": 5.700000110664405e-05,
213
+ "transform-variadic-reduce": 0.00047900000936351717,
214
+ "tuple-simplifier": 0.00017699999443721026,
215
+ "unpack-nested-aws-ntwsr": 0.00023999999393709004,
216
+ "unroll-while-loop": 1.1000000085914508e-05
217
+ },
218
+ "hilo": {
219
+ "HloMacCount": 14294450176.0,
220
+ "Traffic": 7787338240.0
221
+ },
222
+ "tensorizer": {
223
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 239380,
224
+ "StaticProfiler::AifUb": 3.780517101287842,
225
+ "StaticProfiler::ArithmeticIntensityTensorizer": 4.395392417907715,
226
+ "StaticProfiler::AverageDmaLength": 7173.484375,
227
+ "StaticProfiler::DDRTransferBytes": 7159761404,
228
+ "StaticProfiler::InternalTransferBytes": 29110320,
229
+ "StaticProfiler::LoadExpanded": 931327,
230
+ "StaticProfiler::StoreExpanded": 2344,
231
+ "StaticProfiler::TotalDMAExpanded": 933671,
232
+ "StaticProfiler::TotalDynamicInstancesCount": 249148,
233
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 244610,
234
+ "StaticProfiler::TotalLNCComm": 0,
235
+ "StaticProfiler::TotalLNCCommTransfer": 0,
236
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0,
237
+ "TilingProfiler::DmaInstructionsAfterTiling": 0,
238
+ "TilingProfiler::GenericInstructionsAfterTiling": 329,
239
+ "TilingProfiler::MatMultInstructionsAfterTiling": 219136,
240
+ "TilingProfiler::NumPfTransposes": 394,
241
+ "TilingProfiler::NumPfTransposesForIo": 33,
242
+ "TilingProfiler::NumPfTransposesForLocal": 226,
243
+ "TilingProfiler::NumPfTransposesForNonlocal": 135,
244
+ "TilingProfiler::PfTransposeInstructions": 5649,
245
+ "TilingProfiler::PfTransposeInstructionsForIo": 513,
246
+ "TilingProfiler::PfTransposeInstructionsForLocal": 900,
247
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 4236,
248
+ "TilingProfiler::ReduceInstructionsAfterTiling": 67,
249
+ "TilingProfiler::SimdInstructionsAfterTiling": 6366,
250
+ "TilingProfiler::TotalInstructionsAfterTiling": 0,
251
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0,
252
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0,
253
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0,
254
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0,
255
+ "TransformConvOp::conv2d_column_packing": 0,
256
+ "TransformConvOp::conv2d_column_packing_1": 0,
257
+ "TransformConvOp::conv2d_column_packing_io10": 0,
258
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0
259
+ }
260
+ },
261
+ "all": {
262
+ "compiletime": {
263
+ "CanonicalizeConv": 3.999999989900971e-06,
264
+ "CanonicalizeForTensorizer": 0.00046999999904073775,
265
+ "Canonicalizer": 0.006062000058591366,
266
+ "HoistCompute": 4.999999873689376e-05,
267
+ "IdentifyCrossPassTensors": 0.00021100000594742596,
268
+ "MemcastMotion": 0.00013499999477062374,
269
+ "PenguinizeFunctions": 0.00019500000053085387,
270
+ "PruneFunctions": 0.0002859999949578196,
271
+ "RemoveOptimizationBarriers": 0.0001289999927394092,
272
+ "ScatterMotion": 0.004162999801337719,
273
+ "TensorizerLegalizationPass": 0.00017600000137463212,
274
+ "VerifySupportedOps": 0.00020500000391621143,
275
+ "algsimp": 0.0015930000226944685,
276
+ "batchnorm_expander": 0.0005460000247694552,
277
+ "boundary-marker-removal": 0.0002610000083222985,
278
+ "call-inliner": 0.0002479999966453761,
279
+ "canonicalize-boundary-marker": 0.0002809999859891832,
280
+ "collective-stream-id-checker": 4.8000001697801054e-05,
281
+ "comparison-expander": 0.0002800000074785203,
282
+ "computation-deduplicator": 0.0003530000103637576,
283
+ "conditional-to-select": 8.399999933317304e-05,
284
+ "config-lowering": 0.00025499999173916876,
285
+ "constant_folding": 0.00017600000137463212,
286
+ "cse": 0.00037900000461377203,
287
+ "dce": 4.5000000682193786e-05,
288
+ "dynamic-slice-transpose": 0.0001610000035725534,
289
+ "eliminate-redundant-compare": 0.0001610000035725534,
290
+ "emit-offloaded-dropout": 0.0002629999944474548,
291
+ "flatten-call-graph": 0.00025599999935366213,
292
+ "fuse-send-recv": 0.0010339999571442604,
293
+ "hilo::LegalizeAlias": 0.0022499999031424522,
294
+ "hilo::NeuronInstCombine": 0.0009289999725297093,
295
+ "hilo::NeuronOpFusion": 0.00037399999564513564,
296
+ "hilo::ReplaceTokenTypeWithU8Pass": 0.0003389999910723418,
297
+ "hilo::ScheduleFusion": 3.9999998989515007e-05,
298
+ "hilo::SixtyFourHack": 0.00022699999681208283,
299
+ "hilo::VerifyAliasing": 0.00013600000238511711,
300
+ "hlo-mac-count": 0.0006539999740198255,
301
+ "hlo-verifier": 0.004976999945938587,
302
+ "io-con-pipe-begin": 3.000000106112566e-06,
303
+ "io-con-pipe-end": 9.999999974752427e-07,
304
+ "io-layout-normalization": 0.0007140000234358013,
305
+ "legalize-ccops": 2.2000000171829015e-05,
306
+ "legalize-compare": 0.00023799999326001853,
307
+ "lower-argminmax-custom-call": 0.00014899999951012433,
308
+ "map-inline": 0.0005290000117383897,
309
+ "metadata-naming": 0.0007159999804571271,
310
+ "mlir::detail::OpToOpPassAdaptor": 0.00019799999427050352,
311
+ "mlir::hlo::MhloToPyPenguin": 0.04695200175046921,
312
+ "mlir::mhlo::LowerComplexExtraPass": 0.002549000084400177,
313
+ "mlir::mhlo::LowerComplexPass": 0.0030499999411404133,
314
+ "native-to-custom-softmax": 0.0002849999873433262,
315
+ "native-to-custom-softmax-dx": 0.00030399998649954796,
316
+ "operand_upcaster": 0.0005130000063218176,
317
+ "post-par-pipe-begin": 9.999999974752427e-07,
318
+ "post-par-pipe-end": 0.0,
319
+ "post-partition-simplification": 0.05249100178480148,
320
+ "pre-hlo-begin": 1.9999999949504854e-06,
321
+ "pre-hlo-end": 9.999999974752427e-07,
322
+ "replace-minimum-constant": 0.0001740000006975606,
323
+ "reshape-mover": 7.500000356230885e-05,
324
+ "simplify-concat": 0.0017800000496208668,
325
+ "simplify-while-loops": 5.700000110664405e-05,
326
+ "transform-variadic-reduce": 0.00047900000936351717,
327
+ "tuple-simplifier": 0.00017699999443721026,
328
+ "unpack-nested-aws-ntwsr": 0.00023999999393709004,
329
+ "unroll-while-loop": 1.1000000085914508e-05
330
+ }
331
+ },
332
+ "cumsum": {
333
+ "compiletime": {
334
+ "CoalesceCCOp": 0.0002486705780029297,
335
+ "DMALocalityOpt": 0.00020575523376464844,
336
+ "DMAProfiler": 0.0008375644683837891,
337
+ "DataStreaming": 0.0002942085266113281,
338
+ "DoNothing": 0.000156402587890625,
339
+ "ExpandISAMacro": 0.0005459785461425781,
340
+ "FactorizeBlkDims": 0.00045418739318847656,
341
+ "InferPSumTensor": 0.00048661231994628906,
342
+ "LateLegalizeInst": 0.00043702125549316406,
343
+ "LateNeuronInstComb": 0.0004830360412597656,
344
+ "LegalizeSundaAccess": 0.0015192031860351563,
345
+ "LegalizeType": 0.0002753734588623047,
346
+ "LowerBroadcast": 0.00025272369384765625,
347
+ "LowerIntrinsics": 0.000255584716796875,
348
+ "LowerTranspose": 0.0002532005310058594,
349
+ "NeuronInstComb": 0.0005066394805908203,
350
+ "NeuronLICM": 0.00046443939208984375,
351
+ "NeuronSimplifyPredicates": 0.002785921096801758,
352
+ "NeuronValueNumbering": 0.00043272972106933594,
353
+ "SFKVectorizer": 0.002558469772338867,
354
+ "SimpleAllReduceTiling": 0.00026297569274902344,
355
+ "SimplifyNeuronTensor": 0.00043082237243652344,
356
+ "SpillPSum": 0.0005290508270263672,
357
+ "WeightCoalescing": 0.00025010108947753906
358
+ }
359
+ },
360
+ "sg00": {
361
+ "hilo": {
362
+ "ArithmeticIntensity": 3.671203136444092,
363
+ "HloMacCount": 14294450176.0,
364
+ "Traffic": 7787338240.0
365
+ }
366
+ },
367
+ "sg0000": {
368
+ "compiletime": {
369
+ "AGOrderingAnalysisPass": 1.7505581378936768,
370
+ "AffinePredicateResolution": 0.046631813049316406,
371
+ "AliasDependencyElimination": 0.002095460891723633,
372
+ "AliasDependencyInduction": 0.340775728225708,
373
+ "AliasDependencyReset": 0.35799717903137207,
374
+ "BFComputeCutting": 0.0918271541595459,
375
+ "BirCodeGenLoop": 1.9524388313293457,
376
+ "CCOpFusion": 0.6108810901641846,
377
+ "CanonicalizeDAGForPGTiling": 0.16879653930664063,
378
+ "CanonicalizeIR": 0.061083316802978516,
379
+ "CoalesceCCOp": 0.15038776397705078,
380
+ "CommuteConcat": 0.030129432678222656,
381
+ "DMALocalityOpt": 0.028774499893188477,
382
+ "DMAProfiler": 0.0695803165435791,
383
+ "DMATilingProfiler": 0.06604599952697754,
384
+ "DataLocalityOpt": 2.1674563884735107,
385
+ "DataStreaming": 0.10431218147277832,
386
+ "DeConcat": 0.016613245010375977,
387
+ "DeadCodeElimination": 0.03136706352233887,
388
+ "DeadStoreElimination": 1.2213473320007324,
389
+ "DelinearIndices": 0.28975749015808105,
390
+ "Delinearization": 0.18038702011108398,
391
+ "DoNothing": 0.00013947486877441406,
392
+ "DramToDramTranspose": 1.0997114181518555,
393
+ "DumpGraphAndMetadata": 0.15712499618530273,
394
+ "EliminateDivs": 0.14302921295166016,
395
+ "ExpandBatchNorm": 0.05438518524169922,
396
+ "ExpandISAMacro": 0.06558394432067871,
397
+ "FactorizeBlkDims": 0.263350248336792,
398
+ "FactorizeThreadAxesInFreeDims": 0.03626608848571777,
399
+ "FlattenMacroLoop": 0.0791018009185791,
400
+ "GenericAccessSimplifier": 0.028739452362060547,
401
+ "InferInitValue": 1.245818853378296,
402
+ "InferIntrinsicOnCC": 0.3838510513305664,
403
+ "InferNeuronTensor": 1.782684326171875,
404
+ "InferNonlocalTensors": 4.216822624206543,
405
+ "InferPSumTensor": 0.799807071685791,
406
+ "InlineNativeKernels": 0.04331803321838379,
407
+ "InsertIOTransposes": 0.9620308876037598,
408
+ "InsertLocalTransposes": 0.9075572490692139,
409
+ "InsertOffloadedTransposes": 0.06905245780944824,
410
+ "LICM": 0.09245109558105469,
411
+ "LateLegalizeInst": 0.13673973083496094,
412
+ "LateLegalizePostSplit": 0.07115364074707031,
413
+ "LateLowerReshapeOp": 0.037413835525512695,
414
+ "LateLowerTensorOp": 0.2697789669036865,
415
+ "LateNeuronInstComb": 0.361156702041626,
416
+ "LayoutPreprocessing": 0.8670649528503418,
417
+ "LayoutPreprocessingAndAnalysis": 1.1549065113067627,
418
+ "LayoutRequirementAnalysis": 0.27439332008361816,
419
+ "LegalizeCCOpLayout": 0.0667877197265625,
420
+ "LegalizeOpLevelAlias": 0.02463841438293457,
421
+ "LegalizePartitionReduce": 0.031178951263427734,
422
+ "LegalizeSundaAccess": 1.0355749130249023,
423
+ "LegalizeSundaMacro": 0.30040812492370605,
424
+ "LegalizeType": 0.15484166145324707,
425
+ "LocalLayoutOpt": 0.5859987735748291,
426
+ "LoopFusion": 0.31392574310302734,
427
+ "LoopSplitting": 0.03730463981628418,
428
+ "LowerBroadcast": 0.04363059997558594,
429
+ "LowerCCOpBlockAxis": 0.21113038063049316,
430
+ "LowerComplexBroadcast": 0.13543224334716797,
431
+ "LowerIntrinsics": 0.5530474185943604,
432
+ "LowerTensorOp": 0.4427661895751953,
433
+ "LowerTranspose": 0.4456202983856201,
434
+ "MacroGeneration": 2.473928213119507,
435
+ "MaskPropagation": 0.11900115013122559,
436
+ "MemcpyElimination": 3.895864963531494,
437
+ "MutateDataType": 0.0398097038269043,
438
+ "NeuronAliasDependencyInduction": 0.018038034439086914,
439
+ "NeuronAliasDependencyReset": 0.03026747703552246,
440
+ "NeuronInstComb": 0.2796626091003418,
441
+ "NeuronLICM": 0.23977112770080566,
442
+ "NeuronLoopFusion": 0.578554630279541,
443
+ "NeuronLoopInterchange": 0.04063010215759277,
444
+ "NeuronSimplifier": 0.338550329208374,
445
+ "NeuronSimplifyPredicates": 0.054137468338012695,
446
+ "NeuronValueNumbering": 0.0800623893737793,
447
+ "OptimizeAliasedCopyChain": 0.010602712631225586,
448
+ "OptimizeNKIKernels": 0.35573911666870117,
449
+ "PAGLayoutOpt": 41.70356750488281,
450
+ "PComputeCutting": 0.31862449645996094,
451
+ "PGLayoutTilingPipeline": 54.89517593383789,
452
+ "PGTiling": 4.955427169799805,
453
+ "PadElimination": 0.008247613906860352,
454
+ "ParAxesAnnotation": 40.78300476074219,
455
+ "PartialLoopFusion": 0.27280497550964355,
456
+ "PartialSimdFusion": 0.23333501815795898,
457
+ "PerfectLoopNest": 0.054427146911621094,
458
+ "RecognizeOpIdiom": 0.14151334762573242,
459
+ "Recompute": 0.006689548492431641,
460
+ "RelaxPredicates": 0.11504864692687988,
461
+ "Rematerialization": 0.1454627513885498,
462
+ "ReshapeWeights": 0.02218008041381836,
463
+ "ResolveAccessConflict": 0.2085890769958496,
464
+ "ResolveComplicatePredicates": 0.04809713363647461,
465
+ "RewriteReplicationMatmul": 0.04198598861694336,
466
+ "RewriteWeights": 0.06269216537475586,
467
+ "SFKVectorizer": 4.820477485656738,
468
+ "SimpleAllReduceTiling": 0.04997134208679199,
469
+ "Simplifier": 0.09822559356689453,
470
+ "SimplifyMacroPredicates": 0.17580604553222656,
471
+ "SimplifyNeuronTensor": 0.39927148818969727,
472
+ "SimplifySlice": 0.030458927154541016,
473
+ "SimplifyTensor": 0.19034838676452637,
474
+ "SpillPSum": 0.7500271797180176,
475
+ "SplitAPUnionSets": 0.21661949157714844,
476
+ "SplitAccGrp": 0.034674882888793945,
477
+ "StaticProfiler": 0.1068570613861084,
478
+ "StaticTransposeLocalTensor": 0.23836159706115723,
479
+ "SundaISel": 1.3462426662445068,
480
+ "TCTransform": 0.03340411186218262,
481
+ "TensorInitialization": 0.08121585845947266,
482
+ "TensorOpSimplifier": 0.1906282901763916,
483
+ "TensorOpTransform": 0.768507719039917,
484
+ "TileCCOps": 0.1881403923034668,
485
+ "TilingProfiler": 0.4166131019592285,
486
+ "TransformConvOp": 0.04472208023071289,
487
+ "TritiumFusion": 1.3044919967651367,
488
+ "ValueNumbering": 0.0843496322631836,
489
+ "VectorizeDMA": 0.1174325942993164,
490
+ "VectorizeMatMult": 0.018784761428833008,
491
+ "WeightCoalescing": 0.04165291786193848,
492
+ "ZeroSizeTensorElimination": 0.0003848075866699219
493
+ },
494
+ "tensorizer": {
495
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 239380,
496
+ "StaticProfiler::AifUb": 3.780517101287842,
497
+ "StaticProfiler::ArithmeticIntensityTensorizer": 4.395392417907715,
498
+ "StaticProfiler::AverageDmaLength": 7173.484375,
499
+ "StaticProfiler::AverageFractalPeUtilization": 99.3995361328125,
500
+ "StaticProfiler::AveragePartitionUtilization": 98.55674743652344,
501
+ "StaticProfiler::AveragePeUtilization": 97.5706558227539,
502
+ "StaticProfiler::DDRTransferBytes": 7159761404,
503
+ "StaticProfiler::InternalTransferBytes": 29110320,
504
+ "StaticProfiler::LoadExpanded": 931327,
505
+ "StaticProfiler::LocalizationEfficiency": 116.26432037353516,
506
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 116.41236877441406,
507
+ "StaticProfiler::StoreExpanded": 2344,
508
+ "StaticProfiler::TotalDMAExpanded": 933671,
509
+ "StaticProfiler::TotalDynamicInstancesCount": 249148,
510
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 244610,
511
+ "StaticProfiler::TotalLNCComm": 0,
512
+ "StaticProfiler::TotalLNCCommTransfer": 0,
513
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0,
514
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0,
515
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0,
516
+ "TilingProfiler::DmaInstructionsAfterTiling": 0,
517
+ "TilingProfiler::GenericInstructionsAfterTiling": 329,
518
+ "TilingProfiler::MatMultInstructionsAfterTiling": 219136,
519
+ "TilingProfiler::NumPfTransposes": 394,
520
+ "TilingProfiler::NumPfTransposesForIo": 33,
521
+ "TilingProfiler::NumPfTransposesForLocal": 226,
522
+ "TilingProfiler::NumPfTransposesForNonlocal": 135,
523
+ "TilingProfiler::PfTransposeInstructions": 5649,
524
+ "TilingProfiler::PfTransposeInstructionsForIo": 513,
525
+ "TilingProfiler::PfTransposeInstructionsForLocal": 900,
526
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 4236,
527
+ "TilingProfiler::ReduceInstructionsAfterTiling": 67,
528
+ "TilingProfiler::SimdInstructionsAfterTiling": 6366,
529
+ "TilingProfiler::TotalInstructionsAfterTiling": 0,
530
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0,
531
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0,
532
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0,
533
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0,
534
+ "TransformConvOp::conv2d_column_packing": 0,
535
+ "TransformConvOp::conv2d_column_packing_1": 0,
536
+ "TransformConvOp::conv2d_column_packing_io10": 0,
537
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0
538
+ }
539
+ }
540
+ }
token_generation_model/_tp0_bk0/log-neuron-cc.txt ADDED
The diff for this file is too large to render. See raw diff
 
token_generation_model/_tp0_bk0/neuron_config.json ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation_autoset": false,
3
+ "_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
4
+ "add_cross_attention": false,
5
+ "architectures": [
6
+ "MistralForCausalLM"
7
+ ],
8
+ "attention_dropout": 0.0,
9
+ "attribute_map": {},
10
+ "bad_words_ids": null,
11
+ "begin_suppress_tokens": null,
12
+ "bos_token_id": 1,
13
+ "chunk_size_feed_forward": 0,
14
+ "cross_attention_hidden_size": null,
15
+ "decoder_start_token_id": null,
16
+ "diversity_penalty": 0.0,
17
+ "do_sample": false,
18
+ "early_stopping": false,
19
+ "encoder_no_repeat_ngram_size": 0,
20
+ "eos_token_id": 2,
21
+ "exponential_decay_length_penalty": null,
22
+ "finetuning_task": null,
23
+ "forced_bos_token_id": null,
24
+ "forced_eos_token_id": null,
25
+ "fused_spec_config": null,
26
+ "head_dim": 128,
27
+ "hidden_act": "silu",
28
+ "hidden_size": 4096,
29
+ "id2label": {
30
+ "0": "LABEL_0",
31
+ "1": "LABEL_1"
32
+ },
33
+ "initializer_range": 0.02,
34
+ "intermediate_size": 14336,
35
+ "is_decoder": false,
36
+ "is_encoder_decoder": false,
37
+ "label2id": {
38
+ "LABEL_0": 0,
39
+ "LABEL_1": 1
40
+ },
41
+ "length_penalty": 1.0,
42
+ "max_length": 20,
43
+ "max_position_embeddings": 32768,
44
+ "metadata": null,
45
+ "min_length": 0,
46
+ "model_type": "mistral",
47
+ "neuron_config": {
48
+ "activation_quantization_type": null,
49
+ "allow_input_truncation": false,
50
+ "apply_seq_ids_mask": false,
51
+ "async_mode": false,
52
+ "attention_dp_degree": 1,
53
+ "attention_dtype": null,
54
+ "attn_block_cte_nki_kernel_enabled": false,
55
+ "attn_block_tkg_nki_kernel_cache_update": false,
56
+ "attn_block_tkg_nki_kernel_enabled": false,
57
+ "attn_cls": "NeuronLlamaAttention",
58
+ "attn_kernel_enabled": null,
59
+ "attn_tkg_builtin_kernel_enabled": false,
60
+ "attn_tkg_nki_kernel_enabled": false,
61
+ "batch_size": 4,
62
+ "bucket_n_active_tokens": false,
63
+ "buckets": [
64
+ 128
65
+ ],
66
+ "cast_type": "config",
67
+ "cc_pipeline_tiling_factor": 1,
68
+ "chunked_prefill_config": null,
69
+ "context_encoding_buckets": null,
70
+ "cp_degree": 1,
71
+ "ctx_batch_size": 1,
72
+ "disable_kv_cache_tiling": false,
73
+ "draft_model_modules_to_not_convert": null,
74
+ "enable_bucketing": true,
75
+ "enable_eagle_draft_input_norm": false,
76
+ "enable_eagle_speculation": false,
77
+ "enable_fused_speculation": false,
78
+ "enable_long_context_mode": false,
79
+ "enable_output_completion_notifications": false,
80
+ "enable_spill_reload_dge": false,
81
+ "enable_token_tree": false,
82
+ "ep_degree": 1,
83
+ "expert_mlp_nki_kernel_enabled": null,
84
+ "flash_decoding_enabled": false,
85
+ "fused_qkv": false,
86
+ "fused_rmsnorm_skip_gamma": false,
87
+ "is_block_kv_layout": null,
88
+ "is_chunked_prefill": false,
89
+ "is_continuous_batching": true,
90
+ "is_eagle_draft": false,
91
+ "is_medusa": false,
92
+ "is_prefill_stage": false,
93
+ "is_prefix_caching": false,
94
+ "k_cache_transposed": false,
95
+ "kv_cache_batch_size": 4,
96
+ "kv_cache_padding_size": 0,
97
+ "kv_cache_quant": false,
98
+ "kv_cache_tiling": false,
99
+ "layer_boundary_markers": false,
100
+ "lm_head_pad": false,
101
+ "lm_head_pad_alignment_size": 1,
102
+ "local_ranks_size": 2,
103
+ "logical_nc_config": 1,
104
+ "lora_config": null,
105
+ "max_batch_size": 4,
106
+ "max_context_length": 2048,
107
+ "max_length": 2048,
108
+ "max_new_tokens": null,
109
+ "medusa_speculation_length": 0,
110
+ "medusa_tree": null,
111
+ "mlp_kernel_enabled": false,
112
+ "mlp_kernel_fuse_residual_add": false,
113
+ "modules_to_not_convert": null,
114
+ "moe_fused_nki_kernel_enabled": null,
115
+ "n_active_tokens": 1,
116
+ "n_positions": 2048,
117
+ "num_medusa_heads": 0,
118
+ "on_cpu": false,
119
+ "on_device_sampling_config": {
120
+ "deterministic": false,
121
+ "do_sample": false,
122
+ "dynamic": true,
123
+ "global_topk": 256,
124
+ "on_device_sampling_config": true,
125
+ "temperature": 1.0,
126
+ "top_k": 1,
127
+ "top_k_kernel_enabled": false,
128
+ "top_p": 1.0
129
+ },
130
+ "output_logits": false,
131
+ "overrides_torch_dtype": true,
132
+ "pa_block_size": 2048,
133
+ "pa_num_blocks": 4,
134
+ "padding_side": "right",
135
+ "pp_degree": 1,
136
+ "prefix_buckets": null,
137
+ "qk_layernorm": false,
138
+ "qkv_kernel_enabled": false,
139
+ "qkv_kernel_fuse_residual_add": false,
140
+ "qkv_kernel_nbsd_layout": false,
141
+ "quantization_dtype": "int8",
142
+ "quantization_type": "per_tensor_symmetric",
143
+ "quantize_clamp_bound": Infinity,
144
+ "quantized": false,
145
+ "quantized_checkpoints_path": null,
146
+ "quantized_mlp_kernel_enabled": false,
147
+ "rmsnorm_quantize_kernel_enabled": false,
148
+ "router_topk_nki_kernel_enabled": null,
149
+ "rpl_reduce_dtype": null,
150
+ "save_sharded_checkpoint": true,
151
+ "scratchpad_page_size": null,
152
+ "seq_len": 2048,
153
+ "seq_len_threshold_for_cc_tiling": 16384,
154
+ "sequence_parallel_enabled": false,
155
+ "shared_mlp_nki_kernel_enabled": null,
156
+ "skip_sharding": false,
157
+ "skip_warmup": false,
158
+ "spec_batch_size": 4,
159
+ "speculation_length": 0,
160
+ "start_rank_id": 0,
161
+ "target": null,
162
+ "tile_cc": false,
163
+ "tkg_batch_size": 4,
164
+ "token_generation_buckets": [
165
+ 128
166
+ ],
167
+ "token_tree_config": null,
168
+ "torch_dtype": "bfloat16",
169
+ "tp_degree": 2,
170
+ "vocab_parallel": false,
171
+ "weight_gather_seq_len_threshold": 32768,
172
+ "weights_to_skip_layout_optimization": [],
173
+ "world_size": 2
174
+ },
175
+ "no_repeat_ngram_size": 0,
176
+ "num_attention_heads": 32,
177
+ "num_beam_groups": 1,
178
+ "num_beams": 1,
179
+ "num_cores_per_group": 1,
180
+ "num_hidden_layers": 32,
181
+ "num_key_value_heads": 8,
182
+ "num_return_sequences": 1,
183
+ "output_attentions": false,
184
+ "output_hidden_states": false,
185
+ "output_scores": false,
186
+ "pad_token_id": 0,
187
+ "prefix": null,
188
+ "problem_type": null,
189
+ "pruned_heads": {},
190
+ "remove_invalid_values": false,
191
+ "repetition_penalty": 1.0,
192
+ "return_dict": true,
193
+ "return_dict_in_generate": false,
194
+ "rms_norm_eps": 1e-05,
195
+ "rope_theta": 1000000.0,
196
+ "sep_token_id": null,
197
+ "sliding_window": null,
198
+ "suppress_tokens": null,
199
+ "task_specific_params": null,
200
+ "temperature": 1.0,
201
+ "tf_legacy_loss": false,
202
+ "tie_encoder_decoder": false,
203
+ "tie_word_embeddings": false,
204
+ "tokenizer_class": null,
205
+ "top_k": 50,
206
+ "top_p": 1.0,
207
+ "torchscript": false,
208
+ "transformers_version": "4.42.0.dev0",
209
+ "typical_p": 1.0,
210
+ "use_bfloat16": false,
211
+ "use_cache": true,
212
+ "vocab_size": 32768
213
+ }
token_generation_model/_tp0_bk1/command.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ neuronx-cc compile --framework=XLA model.MODULE_92bbfea7801df2fea75e+4948da29.hlo_module.pb --output model.MODULE_92bbfea7801df2fea75e+4948da29.neff --target=trn1 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma ' --lnc=1 -O2 --internal-hlo2tensorizer-options=--verify-hlo=true --logfile=log-neuron-cc.txt --verbose=35
token_generation_model/_tp0_bk1/compile_flags.MODULE_92bbfea7801df2fea75e+4948da29.json ADDED
@@ -0,0 +1 @@
 
 
1
+ ["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma ", "--lnc=1", "-O2", "--internal-hlo2tensorizer-options=--verify-hlo=true", "--logfile=/models/mistral-7b-v0.3-instruct-neuronx/token_generation_model/_tp0_bk1/log-neuron-cc.txt"]
token_generation_model/_tp0_bk1/global_metric_store.json ADDED
@@ -0,0 +1,540 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Average": {
3
+ "tensorizer": {
4
+ "StaticProfiler::AverageFractalPeUtilization": 99.41757202148438,
5
+ "StaticProfiler::AveragePartitionUtilization": 98.54876708984375,
6
+ "StaticProfiler::AveragePeUtilization": 97.57201385498047,
7
+ "StaticProfiler::LocalizationEfficiency": 115.72197723388672,
8
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 115.86865234375,
9
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0,
10
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0
11
+ }
12
+ },
13
+ "Count": {
14
+ "tensorizer": {
15
+ "StaticProfiler::AverageFractalPeUtilization": 1,
16
+ "StaticProfiler::AveragePartitionUtilization": 1,
17
+ "StaticProfiler::AveragePeUtilization": 1,
18
+ "StaticProfiler::LocalizationEfficiency": 1,
19
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1,
20
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 1,
21
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 1
22
+ }
23
+ },
24
+ "Sum": {
25
+ "compiletime": {
26
+ "AGOrderingAnalysisPass": 2.5377185344696045,
27
+ "AffinePredicateResolution": 0.09365105628967285,
28
+ "AliasDependencyElimination": 0.0029387474060058594,
29
+ "AliasDependencyInduction": 0.9149439334869385,
30
+ "AliasDependencyReset": 0.9402039051055908,
31
+ "BFComputeCutting": 0.08754968643188477,
32
+ "BirCodeGenLoop": 1.745743751525879,
33
+ "CCOpFusion": 0.6600353717803955,
34
+ "CanonicalizeConv": 7.100000220816582e-05,
35
+ "CanonicalizeDAGForPGTiling": 0.18796682357788086,
36
+ "CanonicalizeForTensorizer": 0.0007590000168420374,
37
+ "CanonicalizeIR": 0.11113405227661133,
38
+ "Canonicalizer": 0.007284999825060368,
39
+ "CoalesceCCOp": 0.17181134223937988,
40
+ "CommuteConcat": 0.03446626663208008,
41
+ "DMALocalityOpt": 0.0312197208404541,
42
+ "DMAProfiler": 0.07886791229248047,
43
+ "DMATilingProfiler": 0.07289671897888184,
44
+ "DataLocalityOpt": 2.36210560798645,
45
+ "DataStreaming": 0.11742329597473145,
46
+ "DeConcat": 0.018580198287963867,
47
+ "DeadCodeElimination": 0.034860849380493164,
48
+ "DeadStoreElimination": 1.3254823684692383,
49
+ "DelinearIndices": 0.3446221351623535,
50
+ "Delinearization": 0.1506659984588623,
51
+ "DoNothing": 0.0002808570861816406,
52
+ "DramToDramTranspose": 1.202136516571045,
53
+ "DumpGraphAndMetadata": 0.16636061668395996,
54
+ "EliminateDivs": 0.2120811939239502,
55
+ "ExpandBatchNorm": 0.08646512031555176,
56
+ "ExpandISAMacro": 0.07142186164855957,
57
+ "FactorizeBlkDims": 0.2816791534423828,
58
+ "FactorizeThreadAxesInFreeDims": 0.04656529426574707,
59
+ "FlattenMacroLoop": 0.09647870063781738,
60
+ "GenericAccessSimplifier": 0.03339672088623047,
61
+ "HoistCompute": 0.00010299999848939478,
62
+ "IdentifyCrossPassTensors": 0.00017699999443721026,
63
+ "InferInitValue": 1.0866944789886475,
64
+ "InferIntrinsicOnCC": 0.36792969703674316,
65
+ "InferNeuronTensor": 1.8716816902160645,
66
+ "InferNonlocalTensors": 4.868690013885498,
67
+ "InferPSumTensor": 1.1845901012420654,
68
+ "InlineNativeKernels": 0.046529531478881836,
69
+ "InsertIOTransposes": 0.8822572231292725,
70
+ "InsertLocalTransposes": 0.9775772094726563,
71
+ "InsertOffloadedTransposes": 0.07813811302185059,
72
+ "LICM": 0.09977531433105469,
73
+ "LateLegalizeInst": 0.15209054946899414,
74
+ "LateLegalizePostSplit": 0.07948708534240723,
75
+ "LateLowerReshapeOp": 0.044101715087890625,
76
+ "LateLowerTensorOp": 0.9023852348327637,
77
+ "LateNeuronInstComb": 0.3997476100921631,
78
+ "LayoutPreprocessing": 1.0312676429748535,
79
+ "LayoutPreprocessingAndAnalysis": 1.378429651260376,
80
+ "LayoutRequirementAnalysis": 0.3332996368408203,
81
+ "LegalizeCCOpLayout": 0.11418366432189941,
82
+ "LegalizeOpLevelAlias": 0.030529260635375977,
83
+ "LegalizePartitionReduce": 0.03310394287109375,
84
+ "LegalizeSundaAccess": 1.0588877201080322,
85
+ "LegalizeSundaMacro": 0.32631635665893555,
86
+ "LegalizeType": 0.15178585052490234,
87
+ "LocalLayoutOpt": 0.736790657043457,
88
+ "LoopFusion": 0.3599417209625244,
89
+ "LoopSplitting": 0.03815197944641113,
90
+ "LowerBroadcast": 0.05054831504821777,
91
+ "LowerCCOpBlockAxis": 0.24752306938171387,
92
+ "LowerComplexBroadcast": 0.15025925636291504,
93
+ "LowerIntrinsics": 0.5705869197845459,
94
+ "LowerTensorOp": 0.6496686935424805,
95
+ "LowerTranspose": 0.47455263137817383,
96
+ "MacroGeneration": 2.619112730026245,
97
+ "MaskPropagation": 0.13018035888671875,
98
+ "MemcastMotion": 0.00015799999528098851,
99
+ "MemcpyElimination": 9.560661315917969,
100
+ "MutateDataType": 0.04511690139770508,
101
+ "NeuronAliasDependencyInduction": 0.01844191551208496,
102
+ "NeuronAliasDependencyReset": 0.032405853271484375,
103
+ "NeuronInstComb": 0.31444406509399414,
104
+ "NeuronLICM": 0.2590310573577881,
105
+ "NeuronLoopFusion": 0.7092890739440918,
106
+ "NeuronLoopInterchange": 0.0486292839050293,
107
+ "NeuronSimplifier": 0.3838067054748535,
108
+ "NeuronSimplifyPredicates": 0.06155133247375488,
109
+ "NeuronValueNumbering": 0.09124898910522461,
110
+ "OptimizeAliasedCopyChain": 0.01573491096496582,
111
+ "OptimizeNKIKernels": 0.37357568740844727,
112
+ "PAGLayoutOpt": 43.78666687011719,
113
+ "PComputeCutting": 0.2951951026916504,
114
+ "PGLayoutTilingPipeline": 58.915504455566406,
115
+ "PGTiling": 5.879387855529785,
116
+ "PadElimination": 0.015177726745605469,
117
+ "ParAxesAnnotation": 42.794742584228516,
118
+ "PartialLoopFusion": 0.3791334629058838,
119
+ "PartialSimdFusion": 0.28896236419677734,
120
+ "PenguinizeFunctions": 0.00035099999513477087,
121
+ "PerfectLoopNest": 0.06396722793579102,
122
+ "PruneFunctions": 0.0005990000208839774,
123
+ "RecognizeOpIdiom": 0.14191770553588867,
124
+ "Recompute": 0.008079290390014648,
125
+ "RelaxPredicates": 0.12253475189208984,
126
+ "Rematerialization": 0.15925836563110352,
127
+ "RemoveOptimizationBarriers": 0.0001049999991664663,
128
+ "ReshapeWeights": 0.02360057830810547,
129
+ "ResolveAccessConflict": 0.23374605178833008,
130
+ "ResolveComplicatePredicates": 0.09412980079650879,
131
+ "RewriteReplicationMatmul": 0.05128669738769531,
132
+ "RewriteWeights": 0.069183349609375,
133
+ "SFKVectorizer": 5.670078754425049,
134
+ "ScatterMotion": 0.007300000172108412,
135
+ "SimpleAllReduceTiling": 0.05962991714477539,
136
+ "Simplifier": 0.11846542358398438,
137
+ "SimplifyMacroPredicates": 0.20037221908569336,
138
+ "SimplifyNeuronTensor": 0.4303016662597656,
139
+ "SimplifySlice": 0.03480982780456543,
140
+ "SimplifyTensor": 0.21754741668701172,
141
+ "SpillPSum": 0.5786852836608887,
142
+ "SplitAPUnionSets": 0.2246379852294922,
143
+ "SplitAccGrp": 0.041144371032714844,
144
+ "StaticProfiler": 0.11391091346740723,
145
+ "StaticTransposeLocalTensor": 0.2581653594970703,
146
+ "SundaISel": 1.714808702468872,
147
+ "TCTransform": 0.03960013389587402,
148
+ "TensorInitialization": 0.08710885047912598,
149
+ "TensorOpSimplifier": 0.5631639957427979,
150
+ "TensorOpTransform": 1.7516539096832275,
151
+ "TensorizerLegalizationPass": 0.00033099998836405575,
152
+ "TileCCOps": 0.2776148319244385,
153
+ "TilingProfiler": 0.4766700267791748,
154
+ "TransformConvOp": 0.05919003486633301,
155
+ "TritiumFusion": 1.33201265335083,
156
+ "ValueNumbering": 0.09897065162658691,
157
+ "VectorizeDMA": 0.1225745677947998,
158
+ "VectorizeMatMult": 0.0306241512298584,
159
+ "VerifySupportedOps": 0.0002410000015515834,
160
+ "WeightCoalescing": 0.047278642654418945,
161
+ "ZeroSizeTensorElimination": 0.0004246234893798828,
162
+ "algsimp": 0.003000000026077032,
163
+ "batchnorm_expander": 0.0011419999646022916,
164
+ "boundary-marker-removal": 0.0003319999959785491,
165
+ "call-inliner": 0.0004710000066552311,
166
+ "canonicalize-boundary-marker": 0.00036800000816583633,
167
+ "collective-stream-id-checker": 8.399999933317304e-05,
168
+ "comparison-expander": 0.00036199999158270657,
169
+ "computation-deduplicator": 0.000446999998530373,
170
+ "conditional-to-select": 0.00016700000560376793,
171
+ "config-lowering": 0.0003150000120513141,
172
+ "constant_folding": 0.0003420000139158219,
173
+ "cse": 0.0009110000100918114,
174
+ "dce": 0.00011300000187475234,
175
+ "dynamic-slice-transpose": 0.00029799999902024865,
176
+ "eliminate-redundant-compare": 0.00032900000223889947,
177
+ "emit-offloaded-dropout": 0.0004870000120718032,
178
+ "flatten-call-graph": 0.00035600000410340726,
179
+ "fuse-send-recv": 0.002119000069797039,
180
+ "hilo::LegalizeAlias": 0.007114000152796507,
181
+ "hilo::NeuronInstCombine": 0.002025000052526593,
182
+ "hilo::NeuronOpFusion": 1.4999999621068127e-05,
183
+ "hilo::ReplaceTokenTypeWithU8Pass": 0.0002280000044265762,
184
+ "hilo::ScheduleFusion": 7.100000220816582e-05,
185
+ "hilo::SixtyFourHack": 0.0005000000237487257,
186
+ "hilo::VerifyAliasing": 0.000295000005280599,
187
+ "hlo-mac-count": 0.0010900000343099236,
188
+ "hlo-verifier": 0.008117999881505966,
189
+ "io-con-pipe-begin": 1.1000000085914508e-05,
190
+ "io-con-pipe-end": 9.999999974752427e-07,
191
+ "io-layout-normalization": 0.0011190000222995877,
192
+ "legalize-ccops": 4.70000013592653e-05,
193
+ "legalize-compare": 0.00031999999191612005,
194
+ "lower-argminmax-custom-call": 0.0002849999873433262,
195
+ "map-inline": 0.0006060000159777701,
196
+ "metadata-naming": 0.001366000040434301,
197
+ "mlir::detail::OpToOpPassAdaptor": 0.00028899998869746923,
198
+ "mlir::hlo::MhloToPyPenguin": 0.06228100135922432,
199
+ "mlir::mhlo::LowerComplexExtraPass": 0.0029380000196397305,
200
+ "mlir::mhlo::LowerComplexPass": 0.004118999931961298,
201
+ "native-to-custom-softmax": 0.000371000001905486,
202
+ "native-to-custom-softmax-dx": 0.0003800000122282654,
203
+ "operand_upcaster": 0.0007779999868944287,
204
+ "post-par-pipe-begin": 1.9999999949504854e-06,
205
+ "post-par-pipe-end": 0.0,
206
+ "post-partition-simplification": 0.09592299908399582,
207
+ "pre-hlo-begin": 3.999999989900971e-06,
208
+ "pre-hlo-end": 9.999999974752427e-07,
209
+ "replace-minimum-constant": 0.0003220000071451068,
210
+ "reshape-mover": 0.00015500000154133886,
211
+ "simplify-concat": 0.0029800001066178083,
212
+ "simplify-while-loops": 0.00015199999324977398,
213
+ "transform-variadic-reduce": 0.000577000027988106,
214
+ "tuple-simplifier": 0.00037200000951997936,
215
+ "unpack-nested-aws-ntwsr": 0.000307999987853691,
216
+ "unroll-while-loop": 2.700000004551839e-05
217
+ },
218
+ "hilo": {
219
+ "HloMacCount": 14361559040.0,
220
+ "Traffic": 7787340288.0
221
+ },
222
+ "tensorizer": {
223
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 241479,
224
+ "StaticProfiler::AifUb": 4.066871166229248,
225
+ "StaticProfiler::ArithmeticIntensityTensorizer": 4.706264019012451,
226
+ "StaticProfiler::AverageDmaLength": 6434.17578125,
227
+ "StaticProfiler::DDRTransferBytes": 7193317884,
228
+ "StaticProfiler::InternalTransferBytes": 47988784,
229
+ "StaticProfiler::LoadExpanded": 1054211,
230
+ "StaticProfiler::StoreExpanded": 2218,
231
+ "StaticProfiler::TotalDMAExpanded": 1056429,
232
+ "StaticProfiler::TotalDynamicInstancesCount": 251892,
233
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 247287,
234
+ "StaticProfiler::TotalLNCComm": 0,
235
+ "StaticProfiler::TotalLNCCommTransfer": 0,
236
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0,
237
+ "TilingProfiler::DmaInstructionsAfterTiling": 0,
238
+ "TilingProfiler::GenericInstructionsAfterTiling": 281,
239
+ "TilingProfiler::MatMultInstructionsAfterTiling": 220160,
240
+ "TilingProfiler::NumPfTransposes": 394,
241
+ "TilingProfiler::NumPfTransposesForIo": 33,
242
+ "TilingProfiler::NumPfTransposesForLocal": 226,
243
+ "TilingProfiler::NumPfTransposesForNonlocal": 135,
244
+ "TilingProfiler::PfTransposeInstructions": 6226,
245
+ "TilingProfiler::PfTransposeInstructionsForIo": 1026,
246
+ "TilingProfiler::PfTransposeInstructionsForLocal": 964,
247
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 4236,
248
+ "TilingProfiler::ReduceInstructionsAfterTiling": 67,
249
+ "TilingProfiler::SimdInstructionsAfterTiling": 6399,
250
+ "TilingProfiler::TotalInstructionsAfterTiling": 0,
251
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0,
252
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0,
253
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0,
254
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0,
255
+ "TransformConvOp::conv2d_column_packing": 0,
256
+ "TransformConvOp::conv2d_column_packing_1": 0,
257
+ "TransformConvOp::conv2d_column_packing_io10": 0,
258
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0
259
+ }
260
+ },
261
+ "all": {
262
+ "compiletime": {
263
+ "CanonicalizeConv": 7.100000220816582e-05,
264
+ "CanonicalizeForTensorizer": 0.0007590000168420374,
265
+ "Canonicalizer": 0.007284999825060368,
266
+ "HoistCompute": 0.00010299999848939478,
267
+ "IdentifyCrossPassTensors": 0.00017699999443721026,
268
+ "MemcastMotion": 0.00015799999528098851,
269
+ "PenguinizeFunctions": 0.00035099999513477087,
270
+ "PruneFunctions": 0.0005990000208839774,
271
+ "RemoveOptimizationBarriers": 0.0001049999991664663,
272
+ "ScatterMotion": 0.007300000172108412,
273
+ "TensorizerLegalizationPass": 0.00033099998836405575,
274
+ "VerifySupportedOps": 0.0002410000015515834,
275
+ "algsimp": 0.003000000026077032,
276
+ "batchnorm_expander": 0.0011419999646022916,
277
+ "boundary-marker-removal": 0.0003319999959785491,
278
+ "call-inliner": 0.0004710000066552311,
279
+ "canonicalize-boundary-marker": 0.00036800000816583633,
280
+ "collective-stream-id-checker": 8.399999933317304e-05,
281
+ "comparison-expander": 0.00036199999158270657,
282
+ "computation-deduplicator": 0.000446999998530373,
283
+ "conditional-to-select": 0.00016700000560376793,
284
+ "config-lowering": 0.0003150000120513141,
285
+ "constant_folding": 0.0003420000139158219,
286
+ "cse": 0.0009110000100918114,
287
+ "dce": 0.00011300000187475234,
288
+ "dynamic-slice-transpose": 0.00029799999902024865,
289
+ "eliminate-redundant-compare": 0.00032900000223889947,
290
+ "emit-offloaded-dropout": 0.0004870000120718032,
291
+ "flatten-call-graph": 0.00035600000410340726,
292
+ "fuse-send-recv": 0.002119000069797039,
293
+ "hilo::LegalizeAlias": 0.007114000152796507,
294
+ "hilo::NeuronInstCombine": 0.002025000052526593,
295
+ "hilo::NeuronOpFusion": 1.4999999621068127e-05,
296
+ "hilo::ReplaceTokenTypeWithU8Pass": 0.0002280000044265762,
297
+ "hilo::ScheduleFusion": 7.100000220816582e-05,
298
+ "hilo::SixtyFourHack": 0.0005000000237487257,
299
+ "hilo::VerifyAliasing": 0.000295000005280599,
300
+ "hlo-mac-count": 0.0010900000343099236,
301
+ "hlo-verifier": 0.008117999881505966,
302
+ "io-con-pipe-begin": 1.1000000085914508e-05,
303
+ "io-con-pipe-end": 9.999999974752427e-07,
304
+ "io-layout-normalization": 0.0011190000222995877,
305
+ "legalize-ccops": 4.70000013592653e-05,
306
+ "legalize-compare": 0.00031999999191612005,
307
+ "lower-argminmax-custom-call": 0.0002849999873433262,
308
+ "map-inline": 0.0006060000159777701,
309
+ "metadata-naming": 0.001366000040434301,
310
+ "mlir::detail::OpToOpPassAdaptor": 0.00028899998869746923,
311
+ "mlir::hlo::MhloToPyPenguin": 0.06228100135922432,
312
+ "mlir::mhlo::LowerComplexExtraPass": 0.0029380000196397305,
313
+ "mlir::mhlo::LowerComplexPass": 0.004118999931961298,
314
+ "native-to-custom-softmax": 0.000371000001905486,
315
+ "native-to-custom-softmax-dx": 0.0003800000122282654,
316
+ "operand_upcaster": 0.0007779999868944287,
317
+ "post-par-pipe-begin": 1.9999999949504854e-06,
318
+ "post-par-pipe-end": 0.0,
319
+ "post-partition-simplification": 0.09592299908399582,
320
+ "pre-hlo-begin": 3.999999989900971e-06,
321
+ "pre-hlo-end": 9.999999974752427e-07,
322
+ "replace-minimum-constant": 0.0003220000071451068,
323
+ "reshape-mover": 0.00015500000154133886,
324
+ "simplify-concat": 0.0029800001066178083,
325
+ "simplify-while-loops": 0.00015199999324977398,
326
+ "transform-variadic-reduce": 0.000577000027988106,
327
+ "tuple-simplifier": 0.00037200000951997936,
328
+ "unpack-nested-aws-ntwsr": 0.000307999987853691,
329
+ "unroll-while-loop": 2.700000004551839e-05
330
+ }
331
+ },
332
+ "cumsum": {
333
+ "compiletime": {
334
+ "CoalesceCCOp": 0.0002815723419189453,
335
+ "DMALocalityOpt": 0.00020623207092285156,
336
+ "DMAProfiler": 0.0009174346923828125,
337
+ "DataStreaming": 0.0003020763397216797,
338
+ "DoNothing": 0.00015735626220703125,
339
+ "ExpandISAMacro": 0.000522613525390625,
340
+ "FactorizeBlkDims": 0.00048041343688964844,
341
+ "InferPSumTensor": 0.0004773139953613281,
342
+ "LateLegalizeInst": 0.0004329681396484375,
343
+ "LateNeuronInstComb": 0.00048470497131347656,
344
+ "LegalizeSundaAccess": 0.0015790462493896484,
345
+ "LegalizeType": 0.0002918243408203125,
346
+ "LowerBroadcast": 0.0002551078796386719,
347
+ "LowerIntrinsics": 0.00031304359436035156,
348
+ "LowerTranspose": 0.0002570152282714844,
349
+ "NeuronInstComb": 0.0004956722259521484,
350
+ "NeuronLICM": 0.00045418739318847656,
351
+ "NeuronSimplifyPredicates": 0.0029306411743164063,
352
+ "NeuronValueNumbering": 0.00044155120849609375,
353
+ "SFKVectorizer": 0.002784252166748047,
354
+ "SimpleAllReduceTiling": 0.000247955322265625,
355
+ "SimplifyNeuronTensor": 0.0004968643188476563,
356
+ "SpillPSum": 0.0005667209625244141,
357
+ "WeightCoalescing": 0.000274658203125
358
+ }
359
+ },
360
+ "sg00": {
361
+ "hilo": {
362
+ "ArithmeticIntensity": 3.6884374618530273,
363
+ "HloMacCount": 14361559040.0,
364
+ "Traffic": 7787340288.0
365
+ }
366
+ },
367
+ "sg0000": {
368
+ "compiletime": {
369
+ "AGOrderingAnalysisPass": 2.5377185344696045,
370
+ "AffinePredicateResolution": 0.09365105628967285,
371
+ "AliasDependencyElimination": 0.0029387474060058594,
372
+ "AliasDependencyInduction": 0.9149439334869385,
373
+ "AliasDependencyReset": 0.9402039051055908,
374
+ "BFComputeCutting": 0.08754968643188477,
375
+ "BirCodeGenLoop": 1.745743751525879,
376
+ "CCOpFusion": 0.6600353717803955,
377
+ "CanonicalizeDAGForPGTiling": 0.18796682357788086,
378
+ "CanonicalizeIR": 0.11113405227661133,
379
+ "CoalesceCCOp": 0.17152976989746094,
380
+ "CommuteConcat": 0.03446626663208008,
381
+ "DMALocalityOpt": 0.03101348876953125,
382
+ "DMAProfiler": 0.07795047760009766,
383
+ "DMATilingProfiler": 0.07289671897888184,
384
+ "DataLocalityOpt": 2.36210560798645,
385
+ "DataStreaming": 0.11712121963500977,
386
+ "DeConcat": 0.018580198287963867,
387
+ "DeadCodeElimination": 0.034860849380493164,
388
+ "DeadStoreElimination": 1.3254823684692383,
389
+ "DelinearIndices": 0.3446221351623535,
390
+ "Delinearization": 0.1506659984588623,
391
+ "DoNothing": 0.00012350082397460938,
392
+ "DramToDramTranspose": 1.202136516571045,
393
+ "DumpGraphAndMetadata": 0.16636061668395996,
394
+ "EliminateDivs": 0.2120811939239502,
395
+ "ExpandBatchNorm": 0.08646512031555176,
396
+ "ExpandISAMacro": 0.07089924812316895,
397
+ "FactorizeBlkDims": 0.28119874000549316,
398
+ "FactorizeThreadAxesInFreeDims": 0.04656529426574707,
399
+ "FlattenMacroLoop": 0.09647870063781738,
400
+ "GenericAccessSimplifier": 0.03339672088623047,
401
+ "InferInitValue": 1.0866944789886475,
402
+ "InferIntrinsicOnCC": 0.36792969703674316,
403
+ "InferNeuronTensor": 1.8716816902160645,
404
+ "InferNonlocalTensors": 4.868690013885498,
405
+ "InferPSumTensor": 1.184112787246704,
406
+ "InlineNativeKernels": 0.046529531478881836,
407
+ "InsertIOTransposes": 0.8822572231292725,
408
+ "InsertLocalTransposes": 0.9775772094726563,
409
+ "InsertOffloadedTransposes": 0.07813811302185059,
410
+ "LICM": 0.09977531433105469,
411
+ "LateLegalizeInst": 0.1516575813293457,
412
+ "LateLegalizePostSplit": 0.07948708534240723,
413
+ "LateLowerReshapeOp": 0.044101715087890625,
414
+ "LateLowerTensorOp": 0.9023852348327637,
415
+ "LateNeuronInstComb": 0.3992629051208496,
416
+ "LayoutPreprocessing": 1.0312676429748535,
417
+ "LayoutPreprocessingAndAnalysis": 1.378429651260376,
418
+ "LayoutRequirementAnalysis": 0.3332996368408203,
419
+ "LegalizeCCOpLayout": 0.11418366432189941,
420
+ "LegalizeOpLevelAlias": 0.030529260635375977,
421
+ "LegalizePartitionReduce": 0.03310394287109375,
422
+ "LegalizeSundaAccess": 1.0573086738586426,
423
+ "LegalizeSundaMacro": 0.32631635665893555,
424
+ "LegalizeType": 0.15149402618408203,
425
+ "LocalLayoutOpt": 0.736790657043457,
426
+ "LoopFusion": 0.3599417209625244,
427
+ "LoopSplitting": 0.03815197944641113,
428
+ "LowerBroadcast": 0.0502932071685791,
429
+ "LowerCCOpBlockAxis": 0.24752306938171387,
430
+ "LowerComplexBroadcast": 0.15025925636291504,
431
+ "LowerIntrinsics": 0.5702738761901855,
432
+ "LowerTensorOp": 0.6496686935424805,
433
+ "LowerTranspose": 0.47429561614990234,
434
+ "MacroGeneration": 2.619112730026245,
435
+ "MaskPropagation": 0.13018035888671875,
436
+ "MemcpyElimination": 9.560661315917969,
437
+ "MutateDataType": 0.04511690139770508,
438
+ "NeuronAliasDependencyInduction": 0.01844191551208496,
439
+ "NeuronAliasDependencyReset": 0.032405853271484375,
440
+ "NeuronInstComb": 0.313948392868042,
441
+ "NeuronLICM": 0.2585768699645996,
442
+ "NeuronLoopFusion": 0.7092890739440918,
443
+ "NeuronLoopInterchange": 0.0486292839050293,
444
+ "NeuronSimplifier": 0.3838067054748535,
445
+ "NeuronSimplifyPredicates": 0.05862069129943848,
446
+ "NeuronValueNumbering": 0.09080743789672852,
447
+ "OptimizeAliasedCopyChain": 0.01573491096496582,
448
+ "OptimizeNKIKernels": 0.37357568740844727,
449
+ "PAGLayoutOpt": 43.78666687011719,
450
+ "PComputeCutting": 0.2951951026916504,
451
+ "PGLayoutTilingPipeline": 58.915504455566406,
452
+ "PGTiling": 5.879387855529785,
453
+ "PadElimination": 0.015177726745605469,
454
+ "ParAxesAnnotation": 42.794742584228516,
455
+ "PartialLoopFusion": 0.3791334629058838,
456
+ "PartialSimdFusion": 0.28896236419677734,
457
+ "PerfectLoopNest": 0.06396722793579102,
458
+ "RecognizeOpIdiom": 0.14191770553588867,
459
+ "Recompute": 0.008079290390014648,
460
+ "RelaxPredicates": 0.12253475189208984,
461
+ "Rematerialization": 0.15925836563110352,
462
+ "ReshapeWeights": 0.02360057830810547,
463
+ "ResolveAccessConflict": 0.23374605178833008,
464
+ "ResolveComplicatePredicates": 0.09412980079650879,
465
+ "RewriteReplicationMatmul": 0.05128669738769531,
466
+ "RewriteWeights": 0.069183349609375,
467
+ "SFKVectorizer": 5.667294502258301,
468
+ "SimpleAllReduceTiling": 0.059381961822509766,
469
+ "Simplifier": 0.11846542358398438,
470
+ "SimplifyMacroPredicates": 0.20037221908569336,
471
+ "SimplifyNeuronTensor": 0.42980480194091797,
472
+ "SimplifySlice": 0.03480982780456543,
473
+ "SimplifyTensor": 0.21754741668701172,
474
+ "SpillPSum": 0.5781185626983643,
475
+ "SplitAPUnionSets": 0.2246379852294922,
476
+ "SplitAccGrp": 0.041144371032714844,
477
+ "StaticProfiler": 0.11391091346740723,
478
+ "StaticTransposeLocalTensor": 0.2581653594970703,
479
+ "SundaISel": 1.714808702468872,
480
+ "TCTransform": 0.03960013389587402,
481
+ "TensorInitialization": 0.08710885047912598,
482
+ "TensorOpSimplifier": 0.5631639957427979,
483
+ "TensorOpTransform": 1.7516539096832275,
484
+ "TileCCOps": 0.2776148319244385,
485
+ "TilingProfiler": 0.4766700267791748,
486
+ "TransformConvOp": 0.05919003486633301,
487
+ "TritiumFusion": 1.33201265335083,
488
+ "ValueNumbering": 0.09897065162658691,
489
+ "VectorizeDMA": 0.1225745677947998,
490
+ "VectorizeMatMult": 0.0306241512298584,
491
+ "WeightCoalescing": 0.047003984451293945,
492
+ "ZeroSizeTensorElimination": 0.0004246234893798828
493
+ },
494
+ "tensorizer": {
495
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 241479,
496
+ "StaticProfiler::AifUb": 4.066871166229248,
497
+ "StaticProfiler::ArithmeticIntensityTensorizer": 4.706264019012451,
498
+ "StaticProfiler::AverageDmaLength": 6434.17578125,
499
+ "StaticProfiler::AverageFractalPeUtilization": 99.41757202148438,
500
+ "StaticProfiler::AveragePartitionUtilization": 98.54876708984375,
501
+ "StaticProfiler::AveragePeUtilization": 97.57201385498047,
502
+ "StaticProfiler::DDRTransferBytes": 7193317884,
503
+ "StaticProfiler::InternalTransferBytes": 47988784,
504
+ "StaticProfiler::LoadExpanded": 1054211,
505
+ "StaticProfiler::LocalizationEfficiency": 115.72197723388672,
506
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 115.86865234375,
507
+ "StaticProfiler::StoreExpanded": 2218,
508
+ "StaticProfiler::TotalDMAExpanded": 1056429,
509
+ "StaticProfiler::TotalDynamicInstancesCount": 251892,
510
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 247287,
511
+ "StaticProfiler::TotalLNCComm": 0,
512
+ "StaticProfiler::TotalLNCCommTransfer": 0,
513
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0,
514
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0,
515
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0,
516
+ "TilingProfiler::DmaInstructionsAfterTiling": 0,
517
+ "TilingProfiler::GenericInstructionsAfterTiling": 281,
518
+ "TilingProfiler::MatMultInstructionsAfterTiling": 220160,
519
+ "TilingProfiler::NumPfTransposes": 394,
520
+ "TilingProfiler::NumPfTransposesForIo": 33,
521
+ "TilingProfiler::NumPfTransposesForLocal": 226,
522
+ "TilingProfiler::NumPfTransposesForNonlocal": 135,
523
+ "TilingProfiler::PfTransposeInstructions": 6226,
524
+ "TilingProfiler::PfTransposeInstructionsForIo": 1026,
525
+ "TilingProfiler::PfTransposeInstructionsForLocal": 964,
526
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 4236,
527
+ "TilingProfiler::ReduceInstructionsAfterTiling": 67,
528
+ "TilingProfiler::SimdInstructionsAfterTiling": 6399,
529
+ "TilingProfiler::TotalInstructionsAfterTiling": 0,
530
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0,
531
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0,
532
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0,
533
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0,
534
+ "TransformConvOp::conv2d_column_packing": 0,
535
+ "TransformConvOp::conv2d_column_packing_1": 0,
536
+ "TransformConvOp::conv2d_column_packing_io10": 0,
537
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0
538
+ }
539
+ }
540
+ }
token_generation_model/_tp0_bk1/log-neuron-cc.txt ADDED
The diff for this file is too large to render. See raw diff
 
token_generation_model/_tp0_bk1/neuron_config.json ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation_autoset": false,
3
+ "_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
4
+ "add_cross_attention": false,
5
+ "architectures": [
6
+ "MistralForCausalLM"
7
+ ],
8
+ "attention_dropout": 0.0,
9
+ "attribute_map": {},
10
+ "bad_words_ids": null,
11
+ "begin_suppress_tokens": null,
12
+ "bos_token_id": 1,
13
+ "chunk_size_feed_forward": 0,
14
+ "cross_attention_hidden_size": null,
15
+ "decoder_start_token_id": null,
16
+ "diversity_penalty": 0.0,
17
+ "do_sample": false,
18
+ "early_stopping": false,
19
+ "encoder_no_repeat_ngram_size": 0,
20
+ "eos_token_id": 2,
21
+ "exponential_decay_length_penalty": null,
22
+ "finetuning_task": null,
23
+ "forced_bos_token_id": null,
24
+ "forced_eos_token_id": null,
25
+ "fused_spec_config": null,
26
+ "head_dim": 128,
27
+ "hidden_act": "silu",
28
+ "hidden_size": 4096,
29
+ "id2label": {
30
+ "0": "LABEL_0",
31
+ "1": "LABEL_1"
32
+ },
33
+ "initializer_range": 0.02,
34
+ "intermediate_size": 14336,
35
+ "is_decoder": false,
36
+ "is_encoder_decoder": false,
37
+ "label2id": {
38
+ "LABEL_0": 0,
39
+ "LABEL_1": 1
40
+ },
41
+ "length_penalty": 1.0,
42
+ "max_length": 20,
43
+ "max_position_embeddings": 32768,
44
+ "metadata": null,
45
+ "min_length": 0,
46
+ "model_type": "mistral",
47
+ "neuron_config": {
48
+ "activation_quantization_type": null,
49
+ "allow_input_truncation": false,
50
+ "apply_seq_ids_mask": false,
51
+ "async_mode": false,
52
+ "attention_dp_degree": 1,
53
+ "attention_dtype": null,
54
+ "attn_block_cte_nki_kernel_enabled": false,
55
+ "attn_block_tkg_nki_kernel_cache_update": false,
56
+ "attn_block_tkg_nki_kernel_enabled": false,
57
+ "attn_cls": "NeuronLlamaAttention",
58
+ "attn_kernel_enabled": null,
59
+ "attn_tkg_builtin_kernel_enabled": false,
60
+ "attn_tkg_nki_kernel_enabled": false,
61
+ "batch_size": 4,
62
+ "bucket_n_active_tokens": false,
63
+ "buckets": [
64
+ 256
65
+ ],
66
+ "cast_type": "config",
67
+ "cc_pipeline_tiling_factor": 1,
68
+ "chunked_prefill_config": null,
69
+ "context_encoding_buckets": null,
70
+ "cp_degree": 1,
71
+ "ctx_batch_size": 1,
72
+ "disable_kv_cache_tiling": false,
73
+ "draft_model_modules_to_not_convert": null,
74
+ "enable_bucketing": true,
75
+ "enable_eagle_draft_input_norm": false,
76
+ "enable_eagle_speculation": false,
77
+ "enable_fused_speculation": false,
78
+ "enable_long_context_mode": false,
79
+ "enable_output_completion_notifications": false,
80
+ "enable_spill_reload_dge": false,
81
+ "enable_token_tree": false,
82
+ "ep_degree": 1,
83
+ "expert_mlp_nki_kernel_enabled": null,
84
+ "flash_decoding_enabled": false,
85
+ "fused_qkv": false,
86
+ "fused_rmsnorm_skip_gamma": false,
87
+ "is_block_kv_layout": null,
88
+ "is_chunked_prefill": false,
89
+ "is_continuous_batching": true,
90
+ "is_eagle_draft": false,
91
+ "is_medusa": false,
92
+ "is_prefill_stage": false,
93
+ "is_prefix_caching": false,
94
+ "k_cache_transposed": false,
95
+ "kv_cache_batch_size": 4,
96
+ "kv_cache_padding_size": 0,
97
+ "kv_cache_quant": false,
98
+ "kv_cache_tiling": false,
99
+ "layer_boundary_markers": false,
100
+ "lm_head_pad": false,
101
+ "lm_head_pad_alignment_size": 1,
102
+ "local_ranks_size": 2,
103
+ "logical_nc_config": 1,
104
+ "lora_config": null,
105
+ "max_batch_size": 4,
106
+ "max_context_length": 2048,
107
+ "max_length": 2048,
108
+ "max_new_tokens": null,
109
+ "medusa_speculation_length": 0,
110
+ "medusa_tree": null,
111
+ "mlp_kernel_enabled": false,
112
+ "mlp_kernel_fuse_residual_add": false,
113
+ "modules_to_not_convert": null,
114
+ "moe_fused_nki_kernel_enabled": null,
115
+ "n_active_tokens": 1,
116
+ "n_positions": 2048,
117
+ "num_medusa_heads": 0,
118
+ "on_cpu": false,
119
+ "on_device_sampling_config": {
120
+ "deterministic": false,
121
+ "do_sample": false,
122
+ "dynamic": true,
123
+ "global_topk": 256,
124
+ "on_device_sampling_config": true,
125
+ "temperature": 1.0,
126
+ "top_k": 1,
127
+ "top_k_kernel_enabled": false,
128
+ "top_p": 1.0
129
+ },
130
+ "output_logits": false,
131
+ "overrides_torch_dtype": true,
132
+ "pa_block_size": 2048,
133
+ "pa_num_blocks": 4,
134
+ "padding_side": "right",
135
+ "pp_degree": 1,
136
+ "prefix_buckets": null,
137
+ "qk_layernorm": false,
138
+ "qkv_kernel_enabled": false,
139
+ "qkv_kernel_fuse_residual_add": false,
140
+ "qkv_kernel_nbsd_layout": false,
141
+ "quantization_dtype": "int8",
142
+ "quantization_type": "per_tensor_symmetric",
143
+ "quantize_clamp_bound": Infinity,
144
+ "quantized": false,
145
+ "quantized_checkpoints_path": null,
146
+ "quantized_mlp_kernel_enabled": false,
147
+ "rmsnorm_quantize_kernel_enabled": false,
148
+ "router_topk_nki_kernel_enabled": null,
149
+ "rpl_reduce_dtype": null,
150
+ "save_sharded_checkpoint": true,
151
+ "scratchpad_page_size": null,
152
+ "seq_len": 2048,
153
+ "seq_len_threshold_for_cc_tiling": 16384,
154
+ "sequence_parallel_enabled": false,
155
+ "shared_mlp_nki_kernel_enabled": null,
156
+ "skip_sharding": false,
157
+ "skip_warmup": false,
158
+ "spec_batch_size": 4,
159
+ "speculation_length": 0,
160
+ "start_rank_id": 0,
161
+ "target": null,
162
+ "tile_cc": false,
163
+ "tkg_batch_size": 4,
164
+ "token_generation_buckets": [
165
+ 256
166
+ ],
167
+ "token_tree_config": null,
168
+ "torch_dtype": "bfloat16",
169
+ "tp_degree": 2,
170
+ "vocab_parallel": false,
171
+ "weight_gather_seq_len_threshold": 32768,
172
+ "weights_to_skip_layout_optimization": [],
173
+ "world_size": 2
174
+ },
175
+ "no_repeat_ngram_size": 0,
176
+ "num_attention_heads": 32,
177
+ "num_beam_groups": 1,
178
+ "num_beams": 1,
179
+ "num_cores_per_group": 1,
180
+ "num_hidden_layers": 32,
181
+ "num_key_value_heads": 8,
182
+ "num_return_sequences": 1,
183
+ "output_attentions": false,
184
+ "output_hidden_states": false,
185
+ "output_scores": false,
186
+ "pad_token_id": 0,
187
+ "prefix": null,
188
+ "problem_type": null,
189
+ "pruned_heads": {},
190
+ "remove_invalid_values": false,
191
+ "repetition_penalty": 1.0,
192
+ "return_dict": true,
193
+ "return_dict_in_generate": false,
194
+ "rms_norm_eps": 1e-05,
195
+ "rope_theta": 1000000.0,
196
+ "sep_token_id": null,
197
+ "sliding_window": null,
198
+ "suppress_tokens": null,
199
+ "task_specific_params": null,
200
+ "temperature": 1.0,
201
+ "tf_legacy_loss": false,
202
+ "tie_encoder_decoder": false,
203
+ "tie_word_embeddings": false,
204
+ "tokenizer_class": null,
205
+ "top_k": 50,
206
+ "top_p": 1.0,
207
+ "torchscript": false,
208
+ "transformers_version": "4.42.0.dev0",
209
+ "typical_p": 1.0,
210
+ "use_bfloat16": false,
211
+ "use_cache": true,
212
+ "vocab_size": 32768
213
+ }
token_generation_model/_tp0_bk2/command.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ neuronx-cc compile --framework=XLA model.MODULE_2f686dc6ba7ef3326a56+6113de8c.hlo_module.pb --output model.MODULE_2f686dc6ba7ef3326a56+6113de8c.neff --target=trn1 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma ' --lnc=1 -O2 --internal-hlo2tensorizer-options=--verify-hlo=true --logfile=log-neuron-cc.txt --verbose=35
token_generation_model/_tp0_bk2/compile_flags.MODULE_2f686dc6ba7ef3326a56+6113de8c.json ADDED
@@ -0,0 +1 @@
 
 
1
+ ["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma ", "--lnc=1", "-O2", "--internal-hlo2tensorizer-options=--verify-hlo=true", "--logfile=/models/mistral-7b-v0.3-instruct-neuronx/token_generation_model/_tp0_bk2/log-neuron-cc.txt"]
token_generation_model/_tp0_bk2/global_metric_store.json ADDED
@@ -0,0 +1,540 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Average": {
3
+ "tensorizer": {
4
+ "StaticProfiler::AverageFractalPeUtilization": 99.39725494384766,
5
+ "StaticProfiler::AveragePartitionUtilization": 98.54637908935547,
6
+ "StaticProfiler::AveragePeUtilization": 97.55433654785156,
7
+ "StaticProfiler::LocalizationEfficiency": 114.65234375,
8
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 114.79631042480469,
9
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0,
10
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0
11
+ }
12
+ },
13
+ "Count": {
14
+ "tensorizer": {
15
+ "StaticProfiler::AverageFractalPeUtilization": 1,
16
+ "StaticProfiler::AveragePartitionUtilization": 1,
17
+ "StaticProfiler::AveragePeUtilization": 1,
18
+ "StaticProfiler::LocalizationEfficiency": 1,
19
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1,
20
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 1,
21
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 1
22
+ }
23
+ },
24
+ "Sum": {
25
+ "compiletime": {
26
+ "AGOrderingAnalysisPass": 2.565244436264038,
27
+ "AffinePredicateResolution": 0.08378434181213379,
28
+ "AliasDependencyElimination": 0.003157377243041992,
29
+ "AliasDependencyInduction": 0.6296179294586182,
30
+ "AliasDependencyReset": 0.6559426784515381,
31
+ "BFComputeCutting": 0.08006763458251953,
32
+ "BirCodeGenLoop": 1.8334097862243652,
33
+ "CCOpFusion": 0.7150745391845703,
34
+ "CanonicalizeConv": 3.999999989900971e-06,
35
+ "CanonicalizeDAGForPGTiling": 0.18795156478881836,
36
+ "CanonicalizeForTensorizer": 0.0006089999806135893,
37
+ "CanonicalizeIR": 0.0996088981628418,
38
+ "Canonicalizer": 0.007222999818623066,
39
+ "CoalesceCCOp": 0.17608022689819336,
40
+ "CommuteConcat": 0.036467552185058594,
41
+ "DMALocalityOpt": 0.03174924850463867,
42
+ "DMAProfiler": 0.07987165451049805,
43
+ "DMATilingProfiler": 0.07628154754638672,
44
+ "DataLocalityOpt": 2.366703510284424,
45
+ "DataStreaming": 0.11896848678588867,
46
+ "DeConcat": 0.024710893630981445,
47
+ "DeadCodeElimination": 0.03867483139038086,
48
+ "DeadStoreElimination": 1.3142352104187012,
49
+ "DelinearIndices": 0.33995985984802246,
50
+ "Delinearization": 0.1597750186920166,
51
+ "DoNothing": 0.00028634071350097656,
52
+ "DramToDramTranspose": 1.1566178798675537,
53
+ "DumpGraphAndMetadata": 0.16752195358276367,
54
+ "EliminateDivs": 0.16330885887145996,
55
+ "ExpandBatchNorm": 0.11280179023742676,
56
+ "ExpandISAMacro": 0.07273101806640625,
57
+ "FactorizeBlkDims": 0.5880463123321533,
58
+ "FactorizeThreadAxesInFreeDims": 0.05299186706542969,
59
+ "FlattenMacroLoop": 0.08956408500671387,
60
+ "GenericAccessSimplifier": 0.033487796783447266,
61
+ "HoistCompute": 7.000000186963007e-05,
62
+ "IdentifyCrossPassTensors": 0.0003000000142492354,
63
+ "InferInitValue": 1.0861613750457764,
64
+ "InferIntrinsicOnCC": 0.3759927749633789,
65
+ "InferNeuronTensor": 1.848968744277954,
66
+ "InferNonlocalTensors": 4.863279342651367,
67
+ "InferPSumTensor": 1.23720383644104,
68
+ "InlineNativeKernels": 0.053342580795288086,
69
+ "InsertIOTransposes": 0.9273536205291748,
70
+ "InsertLocalTransposes": 0.7733509540557861,
71
+ "InsertOffloadedTransposes": 0.07759976387023926,
72
+ "LICM": 0.10009574890136719,
73
+ "LateLegalizeInst": 0.15100622177124023,
74
+ "LateLegalizePostSplit": 0.08344554901123047,
75
+ "LateLowerReshapeOp": 0.04735827445983887,
76
+ "LateLowerTensorOp": 0.9971563816070557,
77
+ "LateNeuronInstComb": 0.4363434314727783,
78
+ "LayoutPreprocessing": 1.0707452297210693,
79
+ "LayoutPreprocessingAndAnalysis": 1.4177396297454834,
80
+ "LayoutRequirementAnalysis": 0.33483314514160156,
81
+ "LegalizeCCOpLayout": 0.12552833557128906,
82
+ "LegalizeOpLevelAlias": 0.03364896774291992,
83
+ "LegalizePartitionReduce": 0.07460308074951172,
84
+ "LegalizeSundaAccess": 1.0685956478118896,
85
+ "LegalizeSundaMacro": 0.3450655937194824,
86
+ "LegalizeType": 0.16722607612609863,
87
+ "LocalLayoutOpt": 0.7447531223297119,
88
+ "LoopFusion": 0.3385646343231201,
89
+ "LoopSplitting": 0.03500247001647949,
90
+ "LowerBroadcast": 0.09404349327087402,
91
+ "LowerCCOpBlockAxis": 0.24126768112182617,
92
+ "LowerComplexBroadcast": 0.15259552001953125,
93
+ "LowerIntrinsics": 0.5980756282806396,
94
+ "LowerTensorOp": 0.6645591259002686,
95
+ "LowerTranspose": 0.4301431179046631,
96
+ "MacroGeneration": 2.585885763168335,
97
+ "MaskPropagation": 0.13078856468200684,
98
+ "MemcastMotion": 0.0001500000071246177,
99
+ "MemcpyElimination": 9.53384780883789,
100
+ "MutateDataType": 0.04887223243713379,
101
+ "NeuronAliasDependencyInduction": 0.01924896240234375,
102
+ "NeuronAliasDependencyReset": 0.034535884857177734,
103
+ "NeuronInstComb": 0.3494391441345215,
104
+ "NeuronLICM": 0.252178430557251,
105
+ "NeuronLoopFusion": 0.7606415748596191,
106
+ "NeuronLoopInterchange": 0.04935908317565918,
107
+ "NeuronSimplifier": 0.3867678642272949,
108
+ "NeuronSimplifyPredicates": 0.06488251686096191,
109
+ "NeuronValueNumbering": 0.0907585620880127,
110
+ "OptimizeAliasedCopyChain": 0.028003215789794922,
111
+ "OptimizeNKIKernels": 0.3752737045288086,
112
+ "PAGLayoutOpt": 43.734619140625,
113
+ "PComputeCutting": 0.3206455707550049,
114
+ "PGLayoutTilingPipeline": 58.89807891845703,
115
+ "PGTiling": 5.892523765563965,
116
+ "PadElimination": 0.015540599822998047,
117
+ "ParAxesAnnotation": 42.94718551635742,
118
+ "PartialLoopFusion": 0.41214418411254883,
119
+ "PartialSimdFusion": 0.3176724910736084,
120
+ "PenguinizeFunctions": 0.00025400001322850585,
121
+ "PerfectLoopNest": 0.06399226188659668,
122
+ "PruneFunctions": 0.0001340000017080456,
123
+ "RecognizeOpIdiom": 0.15635156631469727,
124
+ "Recompute": 0.009923696517944336,
125
+ "RelaxPredicates": 0.12309074401855469,
126
+ "Rematerialization": 0.22021842002868652,
127
+ "RemoveOptimizationBarriers": 0.00010699999984353781,
128
+ "ReshapeWeights": 0.026267051696777344,
129
+ "ResolveAccessConflict": 0.22898530960083008,
130
+ "ResolveComplicatePredicates": 0.09242081642150879,
131
+ "RewriteReplicationMatmul": 0.051862239837646484,
132
+ "RewriteWeights": 0.0745856761932373,
133
+ "SFKVectorizer": 6.074093341827393,
134
+ "ScatterMotion": 0.007073000073432922,
135
+ "SimpleAllReduceTiling": 0.0595548152923584,
136
+ "Simplifier": 0.1168510913848877,
137
+ "SimplifyMacroPredicates": 0.19426846504211426,
138
+ "SimplifyNeuronTensor": 0.43988728523254395,
139
+ "SimplifySlice": 0.07861804962158203,
140
+ "SimplifyTensor": 0.22563552856445313,
141
+ "SpillPSum": 0.6065900325775146,
142
+ "SplitAPUnionSets": 0.2475893497467041,
143
+ "SplitAccGrp": 0.040537118911743164,
144
+ "StaticProfiler": 0.12209963798522949,
145
+ "StaticTransposeLocalTensor": 0.26148557662963867,
146
+ "SundaISel": 1.426959753036499,
147
+ "TCTransform": 0.04041242599487305,
148
+ "TensorInitialization": 0.0903174877166748,
149
+ "TensorOpSimplifier": 0.5430335998535156,
150
+ "TensorOpTransform": 2.182173252105713,
151
+ "TensorizerLegalizationPass": 0.0002280000044265762,
152
+ "TileCCOps": 0.2870340347290039,
153
+ "TilingProfiler": 0.4631321430206299,
154
+ "TransformConvOp": 0.07256841659545898,
155
+ "TritiumFusion": 1.387312412261963,
156
+ "ValueNumbering": 0.10227847099304199,
157
+ "VectorizeDMA": 0.12761187553405762,
158
+ "VectorizeMatMult": 0.02546095848083496,
159
+ "VerifySupportedOps": 0.0002390000008745119,
160
+ "WeightCoalescing": 0.04958319664001465,
161
+ "ZeroSizeTensorElimination": 0.00046133995056152344,
162
+ "algsimp": 0.0017790000420063734,
163
+ "batchnorm_expander": 0.0009469999931752682,
164
+ "boundary-marker-removal": 0.0003819999983534217,
165
+ "call-inliner": 0.0003330000035930425,
166
+ "canonicalize-boundary-marker": 0.0004419999895617366,
167
+ "collective-stream-id-checker": 7.999999797903001e-05,
168
+ "comparison-expander": 0.00040600000647827983,
169
+ "computation-deduplicator": 0.0006760000251233578,
170
+ "conditional-to-select": 0.00012599999899975955,
171
+ "config-lowering": 0.0003220000071451068,
172
+ "constant_folding": 0.00023200000578071922,
173
+ "cse": 0.0005840000230818987,
174
+ "dce": 6.500000017695129e-05,
175
+ "dynamic-slice-transpose": 0.00021100000594742596,
176
+ "eliminate-redundant-compare": 0.00021699999342672527,
177
+ "emit-offloaded-dropout": 0.00032699998700991273,
178
+ "flatten-call-graph": 0.0005499999970197678,
179
+ "fuse-send-recv": 0.002495999913662672,
180
+ "hilo::LegalizeAlias": 0.0030300000216811895,
181
+ "hilo::NeuronInstCombine": 0.0010979999788105488,
182
+ "hilo::NeuronOpFusion": 0.0006970000104047358,
183
+ "hilo::ReplaceTokenTypeWithU8Pass": 0.00018200000340584666,
184
+ "hilo::ScheduleFusion": 5.8000001445179805e-05,
185
+ "hilo::SixtyFourHack": 0.0002469999890308827,
186
+ "hilo::VerifyAliasing": 0.00012199999764561653,
187
+ "hlo-mac-count": 0.0007970000151544809,
188
+ "hlo-verifier": 0.009019999764859676,
189
+ "io-con-pipe-begin": 7.000000096013537e-06,
190
+ "io-con-pipe-end": 9.999999974752427e-07,
191
+ "io-layout-normalization": 0.0010720000136643648,
192
+ "legalize-ccops": 2.5999999706982635e-05,
193
+ "legalize-compare": 0.0003650000144261867,
194
+ "lower-argminmax-custom-call": 0.0002280000044265762,
195
+ "map-inline": 0.0006910000229254365,
196
+ "metadata-naming": 0.002133999951183796,
197
+ "mlir::detail::OpToOpPassAdaptor": 0.00021600000036414713,
198
+ "mlir::hlo::MhloToPyPenguin": 0.05840799957513809,
199
+ "mlir::mhlo::LowerComplexExtraPass": 0.004083000123500824,
200
+ "mlir::mhlo::LowerComplexPass": 0.004681000020354986,
201
+ "native-to-custom-softmax": 0.00060299999313429,
202
+ "native-to-custom-softmax-dx": 0.05077800154685974,
203
+ "operand_upcaster": 0.0009350000182166696,
204
+ "post-par-pipe-begin": 9.999999974752427e-07,
205
+ "post-par-pipe-end": 0.0,
206
+ "post-partition-simplification": 0.10879900306463242,
207
+ "pre-hlo-begin": 1.9999999949504854e-06,
208
+ "pre-hlo-end": 9.999999974752427e-07,
209
+ "replace-minimum-constant": 0.00022600000374950469,
210
+ "reshape-mover": 9.300000237999484e-05,
211
+ "simplify-concat": 0.0023070001043379307,
212
+ "simplify-while-loops": 9.000000136438757e-05,
213
+ "transform-variadic-reduce": 0.0006370000191964209,
214
+ "tuple-simplifier": 0.0002800000074785203,
215
+ "unpack-nested-aws-ntwsr": 0.0003440000000409782,
216
+ "unroll-while-loop": 1.5999999959603883e-05
217
+ },
218
+ "hilo": {
219
+ "HloMacCount": 14495776768.0,
220
+ "Traffic": 7787344384.0
221
+ },
222
+ "tensorizer": {
223
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 245899,
224
+ "StaticProfiler::AifUb": 4.623864650726318,
225
+ "StaticProfiler::ArithmeticIntensityTensorizer": 5.3013691902160645,
226
+ "StaticProfiler::AverageDmaLength": 6491.51513671875,
227
+ "StaticProfiler::DDRTransferBytes": 7260430844,
228
+ "StaticProfiler::InternalTransferBytes": 82595888,
229
+ "StaticProfiler::LoadExpanded": 1054219,
230
+ "StaticProfiler::StoreExpanded": 2218,
231
+ "StaticProfiler::TotalDMAExpanded": 1056437,
232
+ "StaticProfiler::TotalDynamicInstancesCount": 255460,
233
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 250753,
234
+ "StaticProfiler::TotalLNCComm": 0,
235
+ "StaticProfiler::TotalLNCCommTransfer": 0,
236
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0,
237
+ "TilingProfiler::DmaInstructionsAfterTiling": 0,
238
+ "TilingProfiler::GenericInstructionsAfterTiling": 281,
239
+ "TilingProfiler::MatMultInstructionsAfterTiling": 222208,
240
+ "TilingProfiler::NumPfTransposes": 330,
241
+ "TilingProfiler::NumPfTransposesForIo": 33,
242
+ "TilingProfiler::NumPfTransposesForLocal": 162,
243
+ "TilingProfiler::NumPfTransposesForNonlocal": 135,
244
+ "TilingProfiler::PfTransposeInstructions": 7220,
245
+ "TilingProfiler::PfTransposeInstructionsForIo": 2052,
246
+ "TilingProfiler::PfTransposeInstructionsForLocal": 932,
247
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 4236,
248
+ "TilingProfiler::ReduceInstructionsAfterTiling": 163,
249
+ "TilingProfiler::SimdInstructionsAfterTiling": 6657,
250
+ "TilingProfiler::TotalInstructionsAfterTiling": 0,
251
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0,
252
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0,
253
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0,
254
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0,
255
+ "TransformConvOp::conv2d_column_packing": 0,
256
+ "TransformConvOp::conv2d_column_packing_1": 0,
257
+ "TransformConvOp::conv2d_column_packing_io10": 0,
258
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0
259
+ }
260
+ },
261
+ "all": {
262
+ "compiletime": {
263
+ "CanonicalizeConv": 3.999999989900971e-06,
264
+ "CanonicalizeForTensorizer": 0.0006089999806135893,
265
+ "Canonicalizer": 0.007222999818623066,
266
+ "HoistCompute": 7.000000186963007e-05,
267
+ "IdentifyCrossPassTensors": 0.0003000000142492354,
268
+ "MemcastMotion": 0.0001500000071246177,
269
+ "PenguinizeFunctions": 0.00025400001322850585,
270
+ "PruneFunctions": 0.0001340000017080456,
271
+ "RemoveOptimizationBarriers": 0.00010699999984353781,
272
+ "ScatterMotion": 0.007073000073432922,
273
+ "TensorizerLegalizationPass": 0.0002280000044265762,
274
+ "VerifySupportedOps": 0.0002390000008745119,
275
+ "algsimp": 0.0017790000420063734,
276
+ "batchnorm_expander": 0.0009469999931752682,
277
+ "boundary-marker-removal": 0.0003819999983534217,
278
+ "call-inliner": 0.0003330000035930425,
279
+ "canonicalize-boundary-marker": 0.0004419999895617366,
280
+ "collective-stream-id-checker": 7.999999797903001e-05,
281
+ "comparison-expander": 0.00040600000647827983,
282
+ "computation-deduplicator": 0.0006760000251233578,
283
+ "conditional-to-select": 0.00012599999899975955,
284
+ "config-lowering": 0.0003220000071451068,
285
+ "constant_folding": 0.00023200000578071922,
286
+ "cse": 0.0005840000230818987,
287
+ "dce": 6.500000017695129e-05,
288
+ "dynamic-slice-transpose": 0.00021100000594742596,
289
+ "eliminate-redundant-compare": 0.00021699999342672527,
290
+ "emit-offloaded-dropout": 0.00032699998700991273,
291
+ "flatten-call-graph": 0.0005499999970197678,
292
+ "fuse-send-recv": 0.002495999913662672,
293
+ "hilo::LegalizeAlias": 0.0030300000216811895,
294
+ "hilo::NeuronInstCombine": 0.0010979999788105488,
295
+ "hilo::NeuronOpFusion": 0.0006970000104047358,
296
+ "hilo::ReplaceTokenTypeWithU8Pass": 0.00018200000340584666,
297
+ "hilo::ScheduleFusion": 5.8000001445179805e-05,
298
+ "hilo::SixtyFourHack": 0.0002469999890308827,
299
+ "hilo::VerifyAliasing": 0.00012199999764561653,
300
+ "hlo-mac-count": 0.0007970000151544809,
301
+ "hlo-verifier": 0.009019999764859676,
302
+ "io-con-pipe-begin": 7.000000096013537e-06,
303
+ "io-con-pipe-end": 9.999999974752427e-07,
304
+ "io-layout-normalization": 0.0010720000136643648,
305
+ "legalize-ccops": 2.5999999706982635e-05,
306
+ "legalize-compare": 0.0003650000144261867,
307
+ "lower-argminmax-custom-call": 0.0002280000044265762,
308
+ "map-inline": 0.0006910000229254365,
309
+ "metadata-naming": 0.002133999951183796,
310
+ "mlir::detail::OpToOpPassAdaptor": 0.00021600000036414713,
311
+ "mlir::hlo::MhloToPyPenguin": 0.05840799957513809,
312
+ "mlir::mhlo::LowerComplexExtraPass": 0.004083000123500824,
313
+ "mlir::mhlo::LowerComplexPass": 0.004681000020354986,
314
+ "native-to-custom-softmax": 0.00060299999313429,
315
+ "native-to-custom-softmax-dx": 0.05077800154685974,
316
+ "operand_upcaster": 0.0009350000182166696,
317
+ "post-par-pipe-begin": 9.999999974752427e-07,
318
+ "post-par-pipe-end": 0.0,
319
+ "post-partition-simplification": 0.10879900306463242,
320
+ "pre-hlo-begin": 1.9999999949504854e-06,
321
+ "pre-hlo-end": 9.999999974752427e-07,
322
+ "replace-minimum-constant": 0.00022600000374950469,
323
+ "reshape-mover": 9.300000237999484e-05,
324
+ "simplify-concat": 0.0023070001043379307,
325
+ "simplify-while-loops": 9.000000136438757e-05,
326
+ "transform-variadic-reduce": 0.0006370000191964209,
327
+ "tuple-simplifier": 0.0002800000074785203,
328
+ "unpack-nested-aws-ntwsr": 0.0003440000000409782,
329
+ "unroll-while-loop": 1.5999999959603883e-05
330
+ }
331
+ },
332
+ "cumsum": {
333
+ "compiletime": {
334
+ "CoalesceCCOp": 0.00025391578674316406,
335
+ "DMALocalityOpt": 0.00021147727966308594,
336
+ "DMAProfiler": 0.0009748935699462891,
337
+ "DataStreaming": 0.0003063678741455078,
338
+ "DoNothing": 0.00016117095947265625,
339
+ "ExpandISAMacro": 0.0005402565002441406,
340
+ "FactorizeBlkDims": 0.00048279762268066406,
341
+ "InferPSumTensor": 0.0004856586456298828,
342
+ "LateLegalizeInst": 0.0004565715789794922,
343
+ "LateNeuronInstComb": 0.0005049705505371094,
344
+ "LegalizeSundaAccess": 0.0016589164733886719,
345
+ "LegalizeType": 0.000293731689453125,
346
+ "LowerBroadcast": 0.00036835670471191406,
347
+ "LowerIntrinsics": 0.0002689361572265625,
348
+ "LowerTranspose": 0.0002696514129638672,
349
+ "NeuronInstComb": 0.0005173683166503906,
350
+ "NeuronLICM": 0.000446319580078125,
351
+ "NeuronSimplifyPredicates": 0.002989530563354492,
352
+ "NeuronValueNumbering": 0.0004544258117675781,
353
+ "SFKVectorizer": 0.002811908721923828,
354
+ "SimpleAllReduceTiling": 0.00024199485778808594,
355
+ "SimplifyNeuronTensor": 0.0004520416259765625,
356
+ "SpillPSum": 0.0006034374237060547,
357
+ "WeightCoalescing": 0.0002529621124267578
358
+ }
359
+ },
360
+ "sg00": {
361
+ "hilo": {
362
+ "ArithmeticIntensity": 3.7229063510894775,
363
+ "HloMacCount": 14495776768.0,
364
+ "Traffic": 7787344384.0
365
+ }
366
+ },
367
+ "sg0000": {
368
+ "compiletime": {
369
+ "AGOrderingAnalysisPass": 2.565244436264038,
370
+ "AffinePredicateResolution": 0.08378434181213379,
371
+ "AliasDependencyElimination": 0.003157377243041992,
372
+ "AliasDependencyInduction": 0.6296179294586182,
373
+ "AliasDependencyReset": 0.6559426784515381,
374
+ "BFComputeCutting": 0.08006763458251953,
375
+ "BirCodeGenLoop": 1.8334097862243652,
376
+ "CCOpFusion": 0.7150745391845703,
377
+ "CanonicalizeDAGForPGTiling": 0.18795156478881836,
378
+ "CanonicalizeIR": 0.0996088981628418,
379
+ "CoalesceCCOp": 0.1758263111114502,
380
+ "CommuteConcat": 0.036467552185058594,
381
+ "DMALocalityOpt": 0.031537771224975586,
382
+ "DMAProfiler": 0.07889676094055176,
383
+ "DMATilingProfiler": 0.07628154754638672,
384
+ "DataLocalityOpt": 2.366703510284424,
385
+ "DataStreaming": 0.11866211891174316,
386
+ "DeConcat": 0.024710893630981445,
387
+ "DeadCodeElimination": 0.03867483139038086,
388
+ "DeadStoreElimination": 1.3142352104187012,
389
+ "DelinearIndices": 0.33995985984802246,
390
+ "Delinearization": 0.1597750186920166,
391
+ "DoNothing": 0.0001251697540283203,
392
+ "DramToDramTranspose": 1.1566178798675537,
393
+ "DumpGraphAndMetadata": 0.16752195358276367,
394
+ "EliminateDivs": 0.16330885887145996,
395
+ "ExpandBatchNorm": 0.11280179023742676,
396
+ "ExpandISAMacro": 0.07219076156616211,
397
+ "FactorizeBlkDims": 0.5875635147094727,
398
+ "FactorizeThreadAxesInFreeDims": 0.05299186706542969,
399
+ "FlattenMacroLoop": 0.08956408500671387,
400
+ "GenericAccessSimplifier": 0.033487796783447266,
401
+ "InferInitValue": 1.0861613750457764,
402
+ "InferIntrinsicOnCC": 0.3759927749633789,
403
+ "InferNeuronTensor": 1.848968744277954,
404
+ "InferNonlocalTensors": 4.863279342651367,
405
+ "InferPSumTensor": 1.2367181777954102,
406
+ "InlineNativeKernels": 0.053342580795288086,
407
+ "InsertIOTransposes": 0.9273536205291748,
408
+ "InsertLocalTransposes": 0.7733509540557861,
409
+ "InsertOffloadedTransposes": 0.07759976387023926,
410
+ "LICM": 0.10009574890136719,
411
+ "LateLegalizeInst": 0.15054965019226074,
412
+ "LateLegalizePostSplit": 0.08344554901123047,
413
+ "LateLowerReshapeOp": 0.04735827445983887,
414
+ "LateLowerTensorOp": 0.9971563816070557,
415
+ "LateNeuronInstComb": 0.4358384609222412,
416
+ "LayoutPreprocessing": 1.0707452297210693,
417
+ "LayoutPreprocessingAndAnalysis": 1.4177396297454834,
418
+ "LayoutRequirementAnalysis": 0.33483314514160156,
419
+ "LegalizeCCOpLayout": 0.12552833557128906,
420
+ "LegalizeOpLevelAlias": 0.03364896774291992,
421
+ "LegalizePartitionReduce": 0.07460308074951172,
422
+ "LegalizeSundaAccess": 1.066936731338501,
423
+ "LegalizeSundaMacro": 0.3450655937194824,
424
+ "LegalizeType": 0.1669323444366455,
425
+ "LocalLayoutOpt": 0.7447531223297119,
426
+ "LoopFusion": 0.3385646343231201,
427
+ "LoopSplitting": 0.03500247001647949,
428
+ "LowerBroadcast": 0.09367513656616211,
429
+ "LowerCCOpBlockAxis": 0.24126768112182617,
430
+ "LowerComplexBroadcast": 0.15259552001953125,
431
+ "LowerIntrinsics": 0.5978066921234131,
432
+ "LowerTensorOp": 0.6645591259002686,
433
+ "LowerTranspose": 0.4298734664916992,
434
+ "MacroGeneration": 2.585885763168335,
435
+ "MaskPropagation": 0.13078856468200684,
436
+ "MemcpyElimination": 9.53384780883789,
437
+ "MutateDataType": 0.04887223243713379,
438
+ "NeuronAliasDependencyInduction": 0.01924896240234375,
439
+ "NeuronAliasDependencyReset": 0.034535884857177734,
440
+ "NeuronInstComb": 0.3489217758178711,
441
+ "NeuronLICM": 0.25173211097717285,
442
+ "NeuronLoopFusion": 0.7606415748596191,
443
+ "NeuronLoopInterchange": 0.04935908317565918,
444
+ "NeuronSimplifier": 0.3867678642272949,
445
+ "NeuronSimplifyPredicates": 0.06189298629760742,
446
+ "NeuronValueNumbering": 0.09030413627624512,
447
+ "OptimizeAliasedCopyChain": 0.028003215789794922,
448
+ "OptimizeNKIKernels": 0.3752737045288086,
449
+ "PAGLayoutOpt": 43.734619140625,
450
+ "PComputeCutting": 0.3206455707550049,
451
+ "PGLayoutTilingPipeline": 58.89807891845703,
452
+ "PGTiling": 5.892523765563965,
453
+ "PadElimination": 0.015540599822998047,
454
+ "ParAxesAnnotation": 42.94718551635742,
455
+ "PartialLoopFusion": 0.41214418411254883,
456
+ "PartialSimdFusion": 0.3176724910736084,
457
+ "PerfectLoopNest": 0.06399226188659668,
458
+ "RecognizeOpIdiom": 0.15635156631469727,
459
+ "Recompute": 0.009923696517944336,
460
+ "RelaxPredicates": 0.12309074401855469,
461
+ "Rematerialization": 0.22021842002868652,
462
+ "ReshapeWeights": 0.026267051696777344,
463
+ "ResolveAccessConflict": 0.22898530960083008,
464
+ "ResolveComplicatePredicates": 0.09242081642150879,
465
+ "RewriteReplicationMatmul": 0.051862239837646484,
466
+ "RewriteWeights": 0.0745856761932373,
467
+ "SFKVectorizer": 6.071281433105469,
468
+ "SimpleAllReduceTiling": 0.05931282043457031,
469
+ "Simplifier": 0.1168510913848877,
470
+ "SimplifyMacroPredicates": 0.19426846504211426,
471
+ "SimplifyNeuronTensor": 0.4394352436065674,
472
+ "SimplifySlice": 0.07861804962158203,
473
+ "SimplifyTensor": 0.22563552856445313,
474
+ "SpillPSum": 0.6059865951538086,
475
+ "SplitAPUnionSets": 0.2475893497467041,
476
+ "SplitAccGrp": 0.040537118911743164,
477
+ "StaticProfiler": 0.12209963798522949,
478
+ "StaticTransposeLocalTensor": 0.26148557662963867,
479
+ "SundaISel": 1.426959753036499,
480
+ "TCTransform": 0.04041242599487305,
481
+ "TensorInitialization": 0.0903174877166748,
482
+ "TensorOpSimplifier": 0.5430335998535156,
483
+ "TensorOpTransform": 2.182173252105713,
484
+ "TileCCOps": 0.2870340347290039,
485
+ "TilingProfiler": 0.4631321430206299,
486
+ "TransformConvOp": 0.07256841659545898,
487
+ "TritiumFusion": 1.387312412261963,
488
+ "ValueNumbering": 0.10227847099304199,
489
+ "VectorizeDMA": 0.12761187553405762,
490
+ "VectorizeMatMult": 0.02546095848083496,
491
+ "WeightCoalescing": 0.04933023452758789,
492
+ "ZeroSizeTensorElimination": 0.00046133995056152344
493
+ },
494
+ "tensorizer": {
495
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 245899,
496
+ "StaticProfiler::AifUb": 4.623864650726318,
497
+ "StaticProfiler::ArithmeticIntensityTensorizer": 5.3013691902160645,
498
+ "StaticProfiler::AverageDmaLength": 6491.51513671875,
499
+ "StaticProfiler::AverageFractalPeUtilization": 99.39725494384766,
500
+ "StaticProfiler::AveragePartitionUtilization": 98.54637908935547,
501
+ "StaticProfiler::AveragePeUtilization": 97.55433654785156,
502
+ "StaticProfiler::DDRTransferBytes": 7260430844,
503
+ "StaticProfiler::InternalTransferBytes": 82595888,
504
+ "StaticProfiler::LoadExpanded": 1054219,
505
+ "StaticProfiler::LocalizationEfficiency": 114.65234375,
506
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 114.79631042480469,
507
+ "StaticProfiler::StoreExpanded": 2218,
508
+ "StaticProfiler::TotalDMAExpanded": 1056437,
509
+ "StaticProfiler::TotalDynamicInstancesCount": 255460,
510
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 250753,
511
+ "StaticProfiler::TotalLNCComm": 0,
512
+ "StaticProfiler::TotalLNCCommTransfer": 0,
513
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0,
514
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0,
515
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0,
516
+ "TilingProfiler::DmaInstructionsAfterTiling": 0,
517
+ "TilingProfiler::GenericInstructionsAfterTiling": 281,
518
+ "TilingProfiler::MatMultInstructionsAfterTiling": 222208,
519
+ "TilingProfiler::NumPfTransposes": 330,
520
+ "TilingProfiler::NumPfTransposesForIo": 33,
521
+ "TilingProfiler::NumPfTransposesForLocal": 162,
522
+ "TilingProfiler::NumPfTransposesForNonlocal": 135,
523
+ "TilingProfiler::PfTransposeInstructions": 7220,
524
+ "TilingProfiler::PfTransposeInstructionsForIo": 2052,
525
+ "TilingProfiler::PfTransposeInstructionsForLocal": 932,
526
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 4236,
527
+ "TilingProfiler::ReduceInstructionsAfterTiling": 163,
528
+ "TilingProfiler::SimdInstructionsAfterTiling": 6657,
529
+ "TilingProfiler::TotalInstructionsAfterTiling": 0,
530
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0,
531
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0,
532
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0,
533
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0,
534
+ "TransformConvOp::conv2d_column_packing": 0,
535
+ "TransformConvOp::conv2d_column_packing_1": 0,
536
+ "TransformConvOp::conv2d_column_packing_io10": 0,
537
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0
538
+ }
539
+ }
540
+ }
token_generation_model/_tp0_bk2/log-neuron-cc.txt ADDED
The diff for this file is too large to render. See raw diff
 
token_generation_model/_tp0_bk2/neuron_config.json ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation_autoset": false,
3
+ "_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
4
+ "add_cross_attention": false,
5
+ "architectures": [
6
+ "MistralForCausalLM"
7
+ ],
8
+ "attention_dropout": 0.0,
9
+ "attribute_map": {},
10
+ "bad_words_ids": null,
11
+ "begin_suppress_tokens": null,
12
+ "bos_token_id": 1,
13
+ "chunk_size_feed_forward": 0,
14
+ "cross_attention_hidden_size": null,
15
+ "decoder_start_token_id": null,
16
+ "diversity_penalty": 0.0,
17
+ "do_sample": false,
18
+ "early_stopping": false,
19
+ "encoder_no_repeat_ngram_size": 0,
20
+ "eos_token_id": 2,
21
+ "exponential_decay_length_penalty": null,
22
+ "finetuning_task": null,
23
+ "forced_bos_token_id": null,
24
+ "forced_eos_token_id": null,
25
+ "fused_spec_config": null,
26
+ "head_dim": 128,
27
+ "hidden_act": "silu",
28
+ "hidden_size": 4096,
29
+ "id2label": {
30
+ "0": "LABEL_0",
31
+ "1": "LABEL_1"
32
+ },
33
+ "initializer_range": 0.02,
34
+ "intermediate_size": 14336,
35
+ "is_decoder": false,
36
+ "is_encoder_decoder": false,
37
+ "label2id": {
38
+ "LABEL_0": 0,
39
+ "LABEL_1": 1
40
+ },
41
+ "length_penalty": 1.0,
42
+ "max_length": 20,
43
+ "max_position_embeddings": 32768,
44
+ "metadata": null,
45
+ "min_length": 0,
46
+ "model_type": "mistral",
47
+ "neuron_config": {
48
+ "activation_quantization_type": null,
49
+ "allow_input_truncation": false,
50
+ "apply_seq_ids_mask": false,
51
+ "async_mode": false,
52
+ "attention_dp_degree": 1,
53
+ "attention_dtype": null,
54
+ "attn_block_cte_nki_kernel_enabled": false,
55
+ "attn_block_tkg_nki_kernel_cache_update": false,
56
+ "attn_block_tkg_nki_kernel_enabled": false,
57
+ "attn_cls": "NeuronLlamaAttention",
58
+ "attn_kernel_enabled": null,
59
+ "attn_tkg_builtin_kernel_enabled": false,
60
+ "attn_tkg_nki_kernel_enabled": false,
61
+ "batch_size": 4,
62
+ "bucket_n_active_tokens": false,
63
+ "buckets": [
64
+ 512
65
+ ],
66
+ "cast_type": "config",
67
+ "cc_pipeline_tiling_factor": 1,
68
+ "chunked_prefill_config": null,
69
+ "context_encoding_buckets": null,
70
+ "cp_degree": 1,
71
+ "ctx_batch_size": 1,
72
+ "disable_kv_cache_tiling": false,
73
+ "draft_model_modules_to_not_convert": null,
74
+ "enable_bucketing": true,
75
+ "enable_eagle_draft_input_norm": false,
76
+ "enable_eagle_speculation": false,
77
+ "enable_fused_speculation": false,
78
+ "enable_long_context_mode": false,
79
+ "enable_output_completion_notifications": false,
80
+ "enable_spill_reload_dge": false,
81
+ "enable_token_tree": false,
82
+ "ep_degree": 1,
83
+ "expert_mlp_nki_kernel_enabled": null,
84
+ "flash_decoding_enabled": false,
85
+ "fused_qkv": false,
86
+ "fused_rmsnorm_skip_gamma": false,
87
+ "is_block_kv_layout": null,
88
+ "is_chunked_prefill": false,
89
+ "is_continuous_batching": true,
90
+ "is_eagle_draft": false,
91
+ "is_medusa": false,
92
+ "is_prefill_stage": false,
93
+ "is_prefix_caching": false,
94
+ "k_cache_transposed": false,
95
+ "kv_cache_batch_size": 4,
96
+ "kv_cache_padding_size": 0,
97
+ "kv_cache_quant": false,
98
+ "kv_cache_tiling": false,
99
+ "layer_boundary_markers": false,
100
+ "lm_head_pad": false,
101
+ "lm_head_pad_alignment_size": 1,
102
+ "local_ranks_size": 2,
103
+ "logical_nc_config": 1,
104
+ "lora_config": null,
105
+ "max_batch_size": 4,
106
+ "max_context_length": 2048,
107
+ "max_length": 2048,
108
+ "max_new_tokens": null,
109
+ "medusa_speculation_length": 0,
110
+ "medusa_tree": null,
111
+ "mlp_kernel_enabled": false,
112
+ "mlp_kernel_fuse_residual_add": false,
113
+ "modules_to_not_convert": null,
114
+ "moe_fused_nki_kernel_enabled": null,
115
+ "n_active_tokens": 1,
116
+ "n_positions": 2048,
117
+ "num_medusa_heads": 0,
118
+ "on_cpu": false,
119
+ "on_device_sampling_config": {
120
+ "deterministic": false,
121
+ "do_sample": false,
122
+ "dynamic": true,
123
+ "global_topk": 256,
124
+ "on_device_sampling_config": true,
125
+ "temperature": 1.0,
126
+ "top_k": 1,
127
+ "top_k_kernel_enabled": false,
128
+ "top_p": 1.0
129
+ },
130
+ "output_logits": false,
131
+ "overrides_torch_dtype": true,
132
+ "pa_block_size": 2048,
133
+ "pa_num_blocks": 4,
134
+ "padding_side": "right",
135
+ "pp_degree": 1,
136
+ "prefix_buckets": null,
137
+ "qk_layernorm": false,
138
+ "qkv_kernel_enabled": false,
139
+ "qkv_kernel_fuse_residual_add": false,
140
+ "qkv_kernel_nbsd_layout": false,
141
+ "quantization_dtype": "int8",
142
+ "quantization_type": "per_tensor_symmetric",
143
+ "quantize_clamp_bound": Infinity,
144
+ "quantized": false,
145
+ "quantized_checkpoints_path": null,
146
+ "quantized_mlp_kernel_enabled": false,
147
+ "rmsnorm_quantize_kernel_enabled": false,
148
+ "router_topk_nki_kernel_enabled": null,
149
+ "rpl_reduce_dtype": null,
150
+ "save_sharded_checkpoint": true,
151
+ "scratchpad_page_size": null,
152
+ "seq_len": 2048,
153
+ "seq_len_threshold_for_cc_tiling": 16384,
154
+ "sequence_parallel_enabled": false,
155
+ "shared_mlp_nki_kernel_enabled": null,
156
+ "skip_sharding": false,
157
+ "skip_warmup": false,
158
+ "spec_batch_size": 4,
159
+ "speculation_length": 0,
160
+ "start_rank_id": 0,
161
+ "target": null,
162
+ "tile_cc": false,
163
+ "tkg_batch_size": 4,
164
+ "token_generation_buckets": [
165
+ 512
166
+ ],
167
+ "token_tree_config": null,
168
+ "torch_dtype": "bfloat16",
169
+ "tp_degree": 2,
170
+ "vocab_parallel": false,
171
+ "weight_gather_seq_len_threshold": 32768,
172
+ "weights_to_skip_layout_optimization": [],
173
+ "world_size": 2
174
+ },
175
+ "no_repeat_ngram_size": 0,
176
+ "num_attention_heads": 32,
177
+ "num_beam_groups": 1,
178
+ "num_beams": 1,
179
+ "num_cores_per_group": 1,
180
+ "num_hidden_layers": 32,
181
+ "num_key_value_heads": 8,
182
+ "num_return_sequences": 1,
183
+ "output_attentions": false,
184
+ "output_hidden_states": false,
185
+ "output_scores": false,
186
+ "pad_token_id": 0,
187
+ "prefix": null,
188
+ "problem_type": null,
189
+ "pruned_heads": {},
190
+ "remove_invalid_values": false,
191
+ "repetition_penalty": 1.0,
192
+ "return_dict": true,
193
+ "return_dict_in_generate": false,
194
+ "rms_norm_eps": 1e-05,
195
+ "rope_theta": 1000000.0,
196
+ "sep_token_id": null,
197
+ "sliding_window": null,
198
+ "suppress_tokens": null,
199
+ "task_specific_params": null,
200
+ "temperature": 1.0,
201
+ "tf_legacy_loss": false,
202
+ "tie_encoder_decoder": false,
203
+ "tie_word_embeddings": false,
204
+ "tokenizer_class": null,
205
+ "top_k": 50,
206
+ "top_p": 1.0,
207
+ "torchscript": false,
208
+ "transformers_version": "4.42.0.dev0",
209
+ "typical_p": 1.0,
210
+ "use_bfloat16": false,
211
+ "use_cache": true,
212
+ "vocab_size": 32768
213
+ }
token_generation_model/_tp0_bk3/command.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ neuronx-cc compile --framework=XLA model.MODULE_668122c92a86c0ce6817+f94fe8ed.hlo_module.pb --output model.MODULE_668122c92a86c0ce6817+f94fe8ed.neff --target=trn1 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma ' --lnc=1 -O2 --internal-hlo2tensorizer-options=--verify-hlo=true --logfile=log-neuron-cc.txt --verbose=35
token_generation_model/_tp0_bk3/compile_flags.MODULE_668122c92a86c0ce6817+f94fe8ed.json ADDED
@@ -0,0 +1 @@
 
 
1
+ ["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma ", "--lnc=1", "-O2", "--internal-hlo2tensorizer-options=--verify-hlo=true", "--logfile=/models/mistral-7b-v0.3-instruct-neuronx/token_generation_model/_tp0_bk3/log-neuron-cc.txt"]
token_generation_model/_tp0_bk3/global_metric_store.json ADDED
@@ -0,0 +1,540 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Average": {
3
+ "tensorizer": {
4
+ "StaticProfiler::AverageFractalPeUtilization": 99.4054183959961,
5
+ "StaticProfiler::AveragePartitionUtilization": 98.58908081054688,
6
+ "StaticProfiler::AveragePeUtilization": 97.55974578857422,
7
+ "StaticProfiler::LocalizationEfficiency": 112.5713119506836,
8
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 112.71009826660156,
9
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0,
10
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0
11
+ }
12
+ },
13
+ "Count": {
14
+ "tensorizer": {
15
+ "StaticProfiler::AverageFractalPeUtilization": 1,
16
+ "StaticProfiler::AveragePartitionUtilization": 1,
17
+ "StaticProfiler::AveragePeUtilization": 1,
18
+ "StaticProfiler::LocalizationEfficiency": 1,
19
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1,
20
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 1,
21
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 1
22
+ }
23
+ },
24
+ "Sum": {
25
+ "compiletime": {
26
+ "AGOrderingAnalysisPass": 2.305126667022705,
27
+ "AffinePredicateResolution": 0.09757089614868164,
28
+ "AliasDependencyElimination": 0.0027077198028564453,
29
+ "AliasDependencyInduction": 0.6443400382995605,
30
+ "AliasDependencyReset": 0.6698822975158691,
31
+ "BFComputeCutting": 0.07641983032226563,
32
+ "BirCodeGenLoop": 2.047102689743042,
33
+ "CCOpFusion": 0.708749532699585,
34
+ "CanonicalizeConv": 0.0,
35
+ "CanonicalizeDAGForPGTiling": 0.18178820610046387,
36
+ "CanonicalizeForTensorizer": 0.0006220000213943422,
37
+ "CanonicalizeIR": 0.11043667793273926,
38
+ "Canonicalizer": 0.006990000139921904,
39
+ "CoalesceCCOp": 0.1608116626739502,
40
+ "CommuteConcat": 0.036214590072631836,
41
+ "DMALocalityOpt": 0.02865910530090332,
42
+ "DMAProfiler": 0.07138633728027344,
43
+ "DMATilingProfiler": 0.06614851951599121,
44
+ "DataLocalityOpt": 2.321251392364502,
45
+ "DataStreaming": 0.1097254753112793,
46
+ "DeConcat": 0.02409839630126953,
47
+ "DeadCodeElimination": 0.035909414291381836,
48
+ "DeadStoreElimination": 1.309647560119629,
49
+ "DelinearIndices": 0.3166069984436035,
50
+ "Delinearization": 0.12859582901000977,
51
+ "DoNothing": 0.00027942657470703125,
52
+ "DramToDramTranspose": 1.095766544342041,
53
+ "DumpGraphAndMetadata": 0.16588640213012695,
54
+ "EliminateDivs": 0.2634701728820801,
55
+ "ExpandBatchNorm": 0.09694433212280273,
56
+ "ExpandISAMacro": 0.06791496276855469,
57
+ "FactorizeBlkDims": 0.3919196128845215,
58
+ "FactorizeThreadAxesInFreeDims": 0.04469728469848633,
59
+ "FlattenMacroLoop": 0.07828259468078613,
60
+ "GenericAccessSimplifier": 0.03183865547180176,
61
+ "HoistCompute": 0.00010399999882793054,
62
+ "IdentifyCrossPassTensors": 0.0005099999834783375,
63
+ "InferInitValue": 1.2782227993011475,
64
+ "InferIntrinsicOnCC": 0.35162997245788574,
65
+ "InferNeuronTensor": 1.7735645771026611,
66
+ "InferNonlocalTensors": 4.640493392944336,
67
+ "InferPSumTensor": 0.9281036853790283,
68
+ "InlineNativeKernels": 0.04500627517700195,
69
+ "InsertIOTransposes": 0.8502795696258545,
70
+ "InsertLocalTransposes": 0.7141804695129395,
71
+ "InsertOffloadedTransposes": 0.06985926628112793,
72
+ "LICM": 0.09515905380249023,
73
+ "LateLegalizeInst": 0.13829779624938965,
74
+ "LateLegalizePostSplit": 0.07408785820007324,
75
+ "LateLowerReshapeOp": 0.03981423377990723,
76
+ "LateLowerTensorOp": 0.9939045906066895,
77
+ "LateNeuronInstComb": 0.3960134983062744,
78
+ "LayoutPreprocessing": 0.9070339202880859,
79
+ "LayoutPreprocessingAndAnalysis": 1.2123572826385498,
80
+ "LayoutRequirementAnalysis": 0.2940239906311035,
81
+ "LegalizeCCOpLayout": 0.1556408405303955,
82
+ "LegalizeOpLevelAlias": 0.03197050094604492,
83
+ "LegalizePartitionReduce": 0.06994032859802246,
84
+ "LegalizeSundaAccess": 1.0174872875213623,
85
+ "LegalizeSundaMacro": 0.344529390335083,
86
+ "LegalizeType": 0.1504347324371338,
87
+ "LocalLayoutOpt": 0.7021169662475586,
88
+ "LoopFusion": 0.3479132652282715,
89
+ "LoopSplitting": 0.031319618225097656,
90
+ "LowerBroadcast": 0.08377981185913086,
91
+ "LowerCCOpBlockAxis": 0.21303391456604004,
92
+ "LowerComplexBroadcast": 0.13837647438049316,
93
+ "LowerIntrinsics": 0.8480772972106934,
94
+ "LowerTensorOp": 0.6734035015106201,
95
+ "LowerTranspose": 0.4170114994049072,
96
+ "MacroGeneration": 2.3995394706726074,
97
+ "MaskPropagation": 0.11813521385192871,
98
+ "MemcastMotion": 0.00015700000221841037,
99
+ "MemcpyElimination": 9.78618049621582,
100
+ "MutateDataType": 0.04571127891540527,
101
+ "NeuronAliasDependencyInduction": 0.018005847930908203,
102
+ "NeuronAliasDependencyReset": 0.033030033111572266,
103
+ "NeuronInstComb": 0.31530284881591797,
104
+ "NeuronLICM": 0.2629721164703369,
105
+ "NeuronLoopFusion": 0.7358701229095459,
106
+ "NeuronLoopInterchange": 0.04501771926879883,
107
+ "NeuronSimplifier": 0.37318873405456543,
108
+ "NeuronSimplifyPredicates": 0.059458255767822266,
109
+ "NeuronValueNumbering": 0.08360004425048828,
110
+ "OptimizeAliasedCopyChain": 0.016149282455444336,
111
+ "OptimizeNKIKernels": 0.36123156547546387,
112
+ "PAGLayoutOpt": 43.12562561035156,
113
+ "PComputeCutting": 0.31317949295043945,
114
+ "PGLayoutTilingPipeline": 57.15171813964844,
115
+ "PGTiling": 5.401483535766602,
116
+ "PadElimination": 0.01123952865600586,
117
+ "ParAxesAnnotation": 42.39716339111328,
118
+ "PartialLoopFusion": 0.4405984878540039,
119
+ "PartialSimdFusion": 0.33167028427124023,
120
+ "PenguinizeFunctions": 0.00023700000019744039,
121
+ "PerfectLoopNest": 0.057257890701293945,
122
+ "PruneFunctions": 0.00019799999427050352,
123
+ "RecognizeOpIdiom": 0.14631319046020508,
124
+ "Recompute": 0.008225679397583008,
125
+ "RelaxPredicates": 0.11752939224243164,
126
+ "Rematerialization": 0.1648578643798828,
127
+ "RemoveOptimizationBarriers": 0.0005300000193528831,
128
+ "ReshapeWeights": 0.02127981185913086,
129
+ "ResolveAccessConflict": 0.24170351028442383,
130
+ "ResolveComplicatePredicates": 0.05225372314453125,
131
+ "RewriteReplicationMatmul": 0.042751312255859375,
132
+ "RewriteWeights": 0.06114816665649414,
133
+ "SFKVectorizer": 6.08488655090332,
134
+ "ScatterMotion": 0.007275999989360571,
135
+ "SimpleAllReduceTiling": 0.0534822940826416,
136
+ "Simplifier": 0.11982178688049316,
137
+ "SimplifyMacroPredicates": 0.18191838264465332,
138
+ "SimplifyNeuronTensor": 0.40709972381591797,
139
+ "SimplifySlice": 0.07960724830627441,
140
+ "SimplifyTensor": 0.19757962226867676,
141
+ "SpillPSum": 0.577000617980957,
142
+ "SplitAPUnionSets": 0.2315382957458496,
143
+ "SplitAccGrp": 0.03763127326965332,
144
+ "StaticProfiler": 0.11375260353088379,
145
+ "StaticTransposeLocalTensor": 0.2317829132080078,
146
+ "SundaISel": 1.3966825008392334,
147
+ "TCTransform": 0.039675235748291016,
148
+ "TensorInitialization": 0.13313651084899902,
149
+ "TensorOpSimplifier": 0.5821249485015869,
150
+ "TensorOpTransform": 2.0534775257110596,
151
+ "TensorizerLegalizationPass": 0.0002500000118743628,
152
+ "TileCCOps": 0.2945268154144287,
153
+ "TilingProfiler": 0.41756296157836914,
154
+ "TransformConvOp": 0.069244384765625,
155
+ "TritiumFusion": 1.3019096851348877,
156
+ "ValueNumbering": 0.09937286376953125,
157
+ "VectorizeDMA": 0.11787676811218262,
158
+ "VectorizeMatMult": 0.0218658447265625,
159
+ "VerifySupportedOps": 0.00025599999935366213,
160
+ "WeightCoalescing": 0.046278953552246094,
161
+ "ZeroSizeTensorElimination": 0.00039458274841308594,
162
+ "algsimp": 0.0017810000572353601,
163
+ "batchnorm_expander": 0.0009689999860711396,
164
+ "boundary-marker-removal": 0.00036299999919719994,
165
+ "call-inliner": 0.0004079999926034361,
166
+ "canonicalize-boundary-marker": 0.0004490000137593597,
167
+ "collective-stream-id-checker": 0.00010399999882793054,
168
+ "comparison-expander": 0.0004090000002179295,
169
+ "computation-deduplicator": 0.0005169999785721302,
170
+ "conditional-to-select": 0.00012099999730708078,
171
+ "config-lowering": 0.0003060000017285347,
172
+ "constant_folding": 0.00023099999816622585,
173
+ "cse": 0.0005119999987073243,
174
+ "dce": 6.900000153109431e-05,
175
+ "dynamic-slice-transpose": 0.00020799999765586108,
176
+ "eliminate-redundant-compare": 0.00024300000222865492,
177
+ "emit-offloaded-dropout": 0.0003650000144261867,
178
+ "flatten-call-graph": 0.0003650000144261867,
179
+ "fuse-send-recv": 0.002230999991297722,
180
+ "hilo::LegalizeAlias": 0.0036810000892728567,
181
+ "hilo::NeuronInstCombine": 0.001101000001654029,
182
+ "hilo::NeuronOpFusion": 0.00039599998854100704,
183
+ "hilo::ReplaceTokenTypeWithU8Pass": 0.0004830000107176602,
184
+ "hilo::ScheduleFusion": 5.900000178371556e-05,
185
+ "hilo::SixtyFourHack": 0.00026000000070780516,
186
+ "hilo::VerifyAliasing": 0.00015300000086426735,
187
+ "hlo-mac-count": 0.0007660000119358301,
188
+ "hlo-verifier": 0.00834800023585558,
189
+ "io-con-pipe-begin": 7.999999979801942e-06,
190
+ "io-con-pipe-end": 9.999999974752427e-07,
191
+ "io-layout-normalization": 0.0012280000373721123,
192
+ "legalize-ccops": 2.5999999706982635e-05,
193
+ "legalize-compare": 0.00031900001340545714,
194
+ "lower-argminmax-custom-call": 0.00019999999494757503,
195
+ "map-inline": 0.000674000009894371,
196
+ "metadata-naming": 0.0016090000281110406,
197
+ "mlir::detail::OpToOpPassAdaptor": 0.00021300000662449747,
198
+ "mlir::hlo::MhloToPyPenguin": 0.05411199852824211,
199
+ "mlir::mhlo::LowerComplexExtraPass": 0.00292299990542233,
200
+ "mlir::mhlo::LowerComplexPass": 0.0035620001144707203,
201
+ "native-to-custom-softmax": 0.00039999998989515007,
202
+ "native-to-custom-softmax-dx": 0.00041599999531172216,
203
+ "operand_upcaster": 0.0007249999907799065,
204
+ "post-par-pipe-begin": 9.999999974752427e-07,
205
+ "post-par-pipe-end": 0.0,
206
+ "post-partition-simplification": 0.10395599901676178,
207
+ "pre-hlo-begin": 3.000000106112566e-06,
208
+ "pre-hlo-end": 9.999999974752427e-07,
209
+ "replace-minimum-constant": 0.0002280000044265762,
210
+ "reshape-mover": 9.699999645818025e-05,
211
+ "simplify-concat": 0.002274000085890293,
212
+ "simplify-while-loops": 9.200000204145908e-05,
213
+ "transform-variadic-reduce": 0.0007149999728426337,
214
+ "tuple-simplifier": 0.0002570000069681555,
215
+ "unpack-nested-aws-ntwsr": 0.0003380000125616789,
216
+ "unroll-while-loop": 1.9999999494757503e-05
217
+ },
218
+ "hilo": {
219
+ "HloMacCount": 14764212224.0,
220
+ "Traffic": 7787352576.0
221
+ },
222
+ "tensorizer": {
223
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 254771,
224
+ "StaticProfiler::AifUb": 5.7534074783325195,
225
+ "StaticProfiler::ArithmeticIntensityTensorizer": 6.476686000823975,
226
+ "StaticProfiler::AverageDmaLength": 6606.11767578125,
227
+ "StaticProfiler::DDRTransferBytes": 7394656764,
228
+ "StaticProfiler::InternalTransferBytes": 153915440,
229
+ "StaticProfiler::LoadExpanded": 1054235,
230
+ "StaticProfiler::StoreExpanded": 2218,
231
+ "StaticProfiler::TotalDMAExpanded": 1056453,
232
+ "StaticProfiler::TotalDynamicInstancesCount": 262628,
233
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 257749,
234
+ "StaticProfiler::TotalLNCComm": 0,
235
+ "StaticProfiler::TotalLNCCommTransfer": 0,
236
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0,
237
+ "TilingProfiler::DmaInstructionsAfterTiling": 0,
238
+ "TilingProfiler::GenericInstructionsAfterTiling": 281,
239
+ "TilingProfiler::MatMultInstructionsAfterTiling": 226304,
240
+ "TilingProfiler::NumPfTransposes": 330,
241
+ "TilingProfiler::NumPfTransposesForIo": 33,
242
+ "TilingProfiler::NumPfTransposesForLocal": 162,
243
+ "TilingProfiler::NumPfTransposesForNonlocal": 135,
244
+ "TilingProfiler::PfTransposeInstructions": 9400,
245
+ "TilingProfiler::PfTransposeInstructionsForIo": 4104,
246
+ "TilingProfiler::PfTransposeInstructionsForLocal": 1060,
247
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 4236,
248
+ "TilingProfiler::ReduceInstructionsAfterTiling": 323,
249
+ "TilingProfiler::SimdInstructionsAfterTiling": 7045,
250
+ "TilingProfiler::TotalInstructionsAfterTiling": 0,
251
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0,
252
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0,
253
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0,
254
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0,
255
+ "TransformConvOp::conv2d_column_packing": 0,
256
+ "TransformConvOp::conv2d_column_packing_1": 0,
257
+ "TransformConvOp::conv2d_column_packing_io10": 0,
258
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0
259
+ }
260
+ },
261
+ "all": {
262
+ "compiletime": {
263
+ "CanonicalizeConv": 0.0,
264
+ "CanonicalizeForTensorizer": 0.0006220000213943422,
265
+ "Canonicalizer": 0.006990000139921904,
266
+ "HoistCompute": 0.00010399999882793054,
267
+ "IdentifyCrossPassTensors": 0.0005099999834783375,
268
+ "MemcastMotion": 0.00015700000221841037,
269
+ "PenguinizeFunctions": 0.00023700000019744039,
270
+ "PruneFunctions": 0.00019799999427050352,
271
+ "RemoveOptimizationBarriers": 0.0005300000193528831,
272
+ "ScatterMotion": 0.007275999989360571,
273
+ "TensorizerLegalizationPass": 0.0002500000118743628,
274
+ "VerifySupportedOps": 0.00025599999935366213,
275
+ "algsimp": 0.0017810000572353601,
276
+ "batchnorm_expander": 0.0009689999860711396,
277
+ "boundary-marker-removal": 0.00036299999919719994,
278
+ "call-inliner": 0.0004079999926034361,
279
+ "canonicalize-boundary-marker": 0.0004490000137593597,
280
+ "collective-stream-id-checker": 0.00010399999882793054,
281
+ "comparison-expander": 0.0004090000002179295,
282
+ "computation-deduplicator": 0.0005169999785721302,
283
+ "conditional-to-select": 0.00012099999730708078,
284
+ "config-lowering": 0.0003060000017285347,
285
+ "constant_folding": 0.00023099999816622585,
286
+ "cse": 0.0005119999987073243,
287
+ "dce": 6.900000153109431e-05,
288
+ "dynamic-slice-transpose": 0.00020799999765586108,
289
+ "eliminate-redundant-compare": 0.00024300000222865492,
290
+ "emit-offloaded-dropout": 0.0003650000144261867,
291
+ "flatten-call-graph": 0.0003650000144261867,
292
+ "fuse-send-recv": 0.002230999991297722,
293
+ "hilo::LegalizeAlias": 0.0036810000892728567,
294
+ "hilo::NeuronInstCombine": 0.001101000001654029,
295
+ "hilo::NeuronOpFusion": 0.00039599998854100704,
296
+ "hilo::ReplaceTokenTypeWithU8Pass": 0.0004830000107176602,
297
+ "hilo::ScheduleFusion": 5.900000178371556e-05,
298
+ "hilo::SixtyFourHack": 0.00026000000070780516,
299
+ "hilo::VerifyAliasing": 0.00015300000086426735,
300
+ "hlo-mac-count": 0.0007660000119358301,
301
+ "hlo-verifier": 0.00834800023585558,
302
+ "io-con-pipe-begin": 7.999999979801942e-06,
303
+ "io-con-pipe-end": 9.999999974752427e-07,
304
+ "io-layout-normalization": 0.0012280000373721123,
305
+ "legalize-ccops": 2.5999999706982635e-05,
306
+ "legalize-compare": 0.00031900001340545714,
307
+ "lower-argminmax-custom-call": 0.00019999999494757503,
308
+ "map-inline": 0.000674000009894371,
309
+ "metadata-naming": 0.0016090000281110406,
310
+ "mlir::detail::OpToOpPassAdaptor": 0.00021300000662449747,
311
+ "mlir::hlo::MhloToPyPenguin": 0.05411199852824211,
312
+ "mlir::mhlo::LowerComplexExtraPass": 0.00292299990542233,
313
+ "mlir::mhlo::LowerComplexPass": 0.0035620001144707203,
314
+ "native-to-custom-softmax": 0.00039999998989515007,
315
+ "native-to-custom-softmax-dx": 0.00041599999531172216,
316
+ "operand_upcaster": 0.0007249999907799065,
317
+ "post-par-pipe-begin": 9.999999974752427e-07,
318
+ "post-par-pipe-end": 0.0,
319
+ "post-partition-simplification": 0.10395599901676178,
320
+ "pre-hlo-begin": 3.000000106112566e-06,
321
+ "pre-hlo-end": 9.999999974752427e-07,
322
+ "replace-minimum-constant": 0.0002280000044265762,
323
+ "reshape-mover": 9.699999645818025e-05,
324
+ "simplify-concat": 0.002274000085890293,
325
+ "simplify-while-loops": 9.200000204145908e-05,
326
+ "transform-variadic-reduce": 0.0007149999728426337,
327
+ "tuple-simplifier": 0.0002570000069681555,
328
+ "unpack-nested-aws-ntwsr": 0.0003380000125616789,
329
+ "unroll-while-loop": 1.9999999494757503e-05
330
+ }
331
+ },
332
+ "cumsum": {
333
+ "compiletime": {
334
+ "CoalesceCCOp": 0.0002467632293701172,
335
+ "DMALocalityOpt": 0.0002090930938720703,
336
+ "DMAProfiler": 0.0008313655853271484,
337
+ "DataStreaming": 0.0002913475036621094,
338
+ "DoNothing": 0.00015354156494140625,
339
+ "ExpandISAMacro": 0.0005238056182861328,
340
+ "FactorizeBlkDims": 0.00044465065002441406,
341
+ "InferPSumTensor": 0.00044846534729003906,
342
+ "LateLegalizeInst": 0.00041985511779785156,
343
+ "LateNeuronInstComb": 0.0004794597625732422,
344
+ "LegalizeSundaAccess": 0.0014634132385253906,
345
+ "LegalizeType": 0.00027251243591308594,
346
+ "LowerBroadcast": 0.00026226043701171875,
347
+ "LowerIntrinsics": 0.0002529621124267578,
348
+ "LowerTranspose": 0.0002646446228027344,
349
+ "NeuronInstComb": 0.0004792213439941406,
350
+ "NeuronLICM": 0.0004215240478515625,
351
+ "NeuronSimplifyPredicates": 0.002805471420288086,
352
+ "NeuronValueNumbering": 0.0004172325134277344,
353
+ "SFKVectorizer": 0.002515554428100586,
354
+ "SimpleAllReduceTiling": 0.00023794174194335938,
355
+ "SimplifyNeuronTensor": 0.0004241466522216797,
356
+ "SpillPSum": 0.0005099773406982422,
357
+ "WeightCoalescing": 0.00024390220642089844
358
+ }
359
+ },
360
+ "sg00": {
361
+ "hilo": {
362
+ "ArithmeticIntensity": 3.7918436527252197,
363
+ "HloMacCount": 14764212224.0,
364
+ "Traffic": 7787352576.0
365
+ }
366
+ },
367
+ "sg0000": {
368
+ "compiletime": {
369
+ "AGOrderingAnalysisPass": 2.305126667022705,
370
+ "AffinePredicateResolution": 0.09757089614868164,
371
+ "AliasDependencyElimination": 0.0027077198028564453,
372
+ "AliasDependencyInduction": 0.6443400382995605,
373
+ "AliasDependencyReset": 0.6698822975158691,
374
+ "BFComputeCutting": 0.07641983032226563,
375
+ "BirCodeGenLoop": 2.047102689743042,
376
+ "CCOpFusion": 0.708749532699585,
377
+ "CanonicalizeDAGForPGTiling": 0.18178820610046387,
378
+ "CanonicalizeIR": 0.11043667793273926,
379
+ "CoalesceCCOp": 0.16056489944458008,
380
+ "CommuteConcat": 0.036214590072631836,
381
+ "DMALocalityOpt": 0.02845001220703125,
382
+ "DMAProfiler": 0.07055497169494629,
383
+ "DMATilingProfiler": 0.06614851951599121,
384
+ "DataLocalityOpt": 2.321251392364502,
385
+ "DataStreaming": 0.10943412780761719,
386
+ "DeConcat": 0.02409839630126953,
387
+ "DeadCodeElimination": 0.035909414291381836,
388
+ "DeadStoreElimination": 1.309647560119629,
389
+ "DelinearIndices": 0.3166069984436035,
390
+ "Delinearization": 0.12859582901000977,
391
+ "DoNothing": 0.000125885009765625,
392
+ "DramToDramTranspose": 1.095766544342041,
393
+ "DumpGraphAndMetadata": 0.16588640213012695,
394
+ "EliminateDivs": 0.2634701728820801,
395
+ "ExpandBatchNorm": 0.09694433212280273,
396
+ "ExpandISAMacro": 0.06739115715026855,
397
+ "FactorizeBlkDims": 0.39147496223449707,
398
+ "FactorizeThreadAxesInFreeDims": 0.04469728469848633,
399
+ "FlattenMacroLoop": 0.07828259468078613,
400
+ "GenericAccessSimplifier": 0.03183865547180176,
401
+ "InferInitValue": 1.2782227993011475,
402
+ "InferIntrinsicOnCC": 0.35162997245788574,
403
+ "InferNeuronTensor": 1.7735645771026611,
404
+ "InferNonlocalTensors": 4.640493392944336,
405
+ "InferPSumTensor": 0.9276552200317383,
406
+ "InlineNativeKernels": 0.04500627517700195,
407
+ "InsertIOTransposes": 0.8502795696258545,
408
+ "InsertLocalTransposes": 0.7141804695129395,
409
+ "InsertOffloadedTransposes": 0.06985926628112793,
410
+ "LICM": 0.09515905380249023,
411
+ "LateLegalizeInst": 0.1378779411315918,
412
+ "LateLegalizePostSplit": 0.07408785820007324,
413
+ "LateLowerReshapeOp": 0.03981423377990723,
414
+ "LateLowerTensorOp": 0.9939045906066895,
415
+ "LateNeuronInstComb": 0.39553403854370117,
416
+ "LayoutPreprocessing": 0.9070339202880859,
417
+ "LayoutPreprocessingAndAnalysis": 1.2123572826385498,
418
+ "LayoutRequirementAnalysis": 0.2940239906311035,
419
+ "LegalizeCCOpLayout": 0.1556408405303955,
420
+ "LegalizeOpLevelAlias": 0.03197050094604492,
421
+ "LegalizePartitionReduce": 0.06994032859802246,
422
+ "LegalizeSundaAccess": 1.016023874282837,
423
+ "LegalizeSundaMacro": 0.344529390335083,
424
+ "LegalizeType": 0.1501622200012207,
425
+ "LocalLayoutOpt": 0.7021169662475586,
426
+ "LoopFusion": 0.3479132652282715,
427
+ "LoopSplitting": 0.031319618225097656,
428
+ "LowerBroadcast": 0.08351755142211914,
429
+ "LowerCCOpBlockAxis": 0.21303391456604004,
430
+ "LowerComplexBroadcast": 0.13837647438049316,
431
+ "LowerIntrinsics": 0.8478243350982666,
432
+ "LowerTensorOp": 0.6734035015106201,
433
+ "LowerTranspose": 0.4167468547821045,
434
+ "MacroGeneration": 2.3995394706726074,
435
+ "MaskPropagation": 0.11813521385192871,
436
+ "MemcpyElimination": 9.78618049621582,
437
+ "MutateDataType": 0.04571127891540527,
438
+ "NeuronAliasDependencyInduction": 0.018005847930908203,
439
+ "NeuronAliasDependencyReset": 0.033030033111572266,
440
+ "NeuronInstComb": 0.31482362747192383,
441
+ "NeuronLICM": 0.26255059242248535,
442
+ "NeuronLoopFusion": 0.7358701229095459,
443
+ "NeuronLoopInterchange": 0.04501771926879883,
444
+ "NeuronSimplifier": 0.37318873405456543,
445
+ "NeuronSimplifyPredicates": 0.05665278434753418,
446
+ "NeuronValueNumbering": 0.08318281173706055,
447
+ "OptimizeAliasedCopyChain": 0.016149282455444336,
448
+ "OptimizeNKIKernels": 0.36123156547546387,
449
+ "PAGLayoutOpt": 43.12562561035156,
450
+ "PComputeCutting": 0.31317949295043945,
451
+ "PGLayoutTilingPipeline": 57.15171813964844,
452
+ "PGTiling": 5.401483535766602,
453
+ "PadElimination": 0.01123952865600586,
454
+ "ParAxesAnnotation": 42.39716339111328,
455
+ "PartialLoopFusion": 0.4405984878540039,
456
+ "PartialSimdFusion": 0.33167028427124023,
457
+ "PerfectLoopNest": 0.057257890701293945,
458
+ "RecognizeOpIdiom": 0.14631319046020508,
459
+ "Recompute": 0.008225679397583008,
460
+ "RelaxPredicates": 0.11752939224243164,
461
+ "Rematerialization": 0.1648578643798828,
462
+ "ReshapeWeights": 0.02127981185913086,
463
+ "ResolveAccessConflict": 0.24170351028442383,
464
+ "ResolveComplicatePredicates": 0.05225372314453125,
465
+ "RewriteReplicationMatmul": 0.042751312255859375,
466
+ "RewriteWeights": 0.06114816665649414,
467
+ "SFKVectorizer": 6.082370758056641,
468
+ "SimpleAllReduceTiling": 0.05324435234069824,
469
+ "Simplifier": 0.11982178688049316,
470
+ "SimplifyMacroPredicates": 0.18191838264465332,
471
+ "SimplifyNeuronTensor": 0.4066755771636963,
472
+ "SimplifySlice": 0.07960724830627441,
473
+ "SimplifyTensor": 0.19757962226867676,
474
+ "SpillPSum": 0.5764906406402588,
475
+ "SplitAPUnionSets": 0.2315382957458496,
476
+ "SplitAccGrp": 0.03763127326965332,
477
+ "StaticProfiler": 0.11375260353088379,
478
+ "StaticTransposeLocalTensor": 0.2317829132080078,
479
+ "SundaISel": 1.3966825008392334,
480
+ "TCTransform": 0.039675235748291016,
481
+ "TensorInitialization": 0.13313651084899902,
482
+ "TensorOpSimplifier": 0.5821249485015869,
483
+ "TensorOpTransform": 2.0534775257110596,
484
+ "TileCCOps": 0.2945268154144287,
485
+ "TilingProfiler": 0.41756296157836914,
486
+ "TransformConvOp": 0.069244384765625,
487
+ "TritiumFusion": 1.3019096851348877,
488
+ "ValueNumbering": 0.09937286376953125,
489
+ "VectorizeDMA": 0.11787676811218262,
490
+ "VectorizeMatMult": 0.0218658447265625,
491
+ "WeightCoalescing": 0.046035051345825195,
492
+ "ZeroSizeTensorElimination": 0.00039458274841308594
493
+ },
494
+ "tensorizer": {
495
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 254771,
496
+ "StaticProfiler::AifUb": 5.7534074783325195,
497
+ "StaticProfiler::ArithmeticIntensityTensorizer": 6.476686000823975,
498
+ "StaticProfiler::AverageDmaLength": 6606.11767578125,
499
+ "StaticProfiler::AverageFractalPeUtilization": 99.4054183959961,
500
+ "StaticProfiler::AveragePartitionUtilization": 98.58908081054688,
501
+ "StaticProfiler::AveragePeUtilization": 97.55974578857422,
502
+ "StaticProfiler::DDRTransferBytes": 7394656764,
503
+ "StaticProfiler::InternalTransferBytes": 153915440,
504
+ "StaticProfiler::LoadExpanded": 1054235,
505
+ "StaticProfiler::LocalizationEfficiency": 112.5713119506836,
506
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 112.71009826660156,
507
+ "StaticProfiler::StoreExpanded": 2218,
508
+ "StaticProfiler::TotalDMAExpanded": 1056453,
509
+ "StaticProfiler::TotalDynamicInstancesCount": 262628,
510
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 257749,
511
+ "StaticProfiler::TotalLNCComm": 0,
512
+ "StaticProfiler::TotalLNCCommTransfer": 0,
513
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0,
514
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0,
515
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0,
516
+ "TilingProfiler::DmaInstructionsAfterTiling": 0,
517
+ "TilingProfiler::GenericInstructionsAfterTiling": 281,
518
+ "TilingProfiler::MatMultInstructionsAfterTiling": 226304,
519
+ "TilingProfiler::NumPfTransposes": 330,
520
+ "TilingProfiler::NumPfTransposesForIo": 33,
521
+ "TilingProfiler::NumPfTransposesForLocal": 162,
522
+ "TilingProfiler::NumPfTransposesForNonlocal": 135,
523
+ "TilingProfiler::PfTransposeInstructions": 9400,
524
+ "TilingProfiler::PfTransposeInstructionsForIo": 4104,
525
+ "TilingProfiler::PfTransposeInstructionsForLocal": 1060,
526
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 4236,
527
+ "TilingProfiler::ReduceInstructionsAfterTiling": 323,
528
+ "TilingProfiler::SimdInstructionsAfterTiling": 7045,
529
+ "TilingProfiler::TotalInstructionsAfterTiling": 0,
530
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0,
531
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0,
532
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0,
533
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0,
534
+ "TransformConvOp::conv2d_column_packing": 0,
535
+ "TransformConvOp::conv2d_column_packing_1": 0,
536
+ "TransformConvOp::conv2d_column_packing_io10": 0,
537
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0
538
+ }
539
+ }
540
+ }
token_generation_model/_tp0_bk3/log-neuron-cc.txt ADDED
The diff for this file is too large to render. See raw diff
 
token_generation_model/_tp0_bk3/neuron_config.json ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation_autoset": false,
3
+ "_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
4
+ "add_cross_attention": false,
5
+ "architectures": [
6
+ "MistralForCausalLM"
7
+ ],
8
+ "attention_dropout": 0.0,
9
+ "attribute_map": {},
10
+ "bad_words_ids": null,
11
+ "begin_suppress_tokens": null,
12
+ "bos_token_id": 1,
13
+ "chunk_size_feed_forward": 0,
14
+ "cross_attention_hidden_size": null,
15
+ "decoder_start_token_id": null,
16
+ "diversity_penalty": 0.0,
17
+ "do_sample": false,
18
+ "early_stopping": false,
19
+ "encoder_no_repeat_ngram_size": 0,
20
+ "eos_token_id": 2,
21
+ "exponential_decay_length_penalty": null,
22
+ "finetuning_task": null,
23
+ "forced_bos_token_id": null,
24
+ "forced_eos_token_id": null,
25
+ "fused_spec_config": null,
26
+ "head_dim": 128,
27
+ "hidden_act": "silu",
28
+ "hidden_size": 4096,
29
+ "id2label": {
30
+ "0": "LABEL_0",
31
+ "1": "LABEL_1"
32
+ },
33
+ "initializer_range": 0.02,
34
+ "intermediate_size": 14336,
35
+ "is_decoder": false,
36
+ "is_encoder_decoder": false,
37
+ "label2id": {
38
+ "LABEL_0": 0,
39
+ "LABEL_1": 1
40
+ },
41
+ "length_penalty": 1.0,
42
+ "max_length": 20,
43
+ "max_position_embeddings": 32768,
44
+ "metadata": null,
45
+ "min_length": 0,
46
+ "model_type": "mistral",
47
+ "neuron_config": {
48
+ "activation_quantization_type": null,
49
+ "allow_input_truncation": false,
50
+ "apply_seq_ids_mask": false,
51
+ "async_mode": false,
52
+ "attention_dp_degree": 1,
53
+ "attention_dtype": null,
54
+ "attn_block_cte_nki_kernel_enabled": false,
55
+ "attn_block_tkg_nki_kernel_cache_update": false,
56
+ "attn_block_tkg_nki_kernel_enabled": false,
57
+ "attn_cls": "NeuronLlamaAttention",
58
+ "attn_kernel_enabled": null,
59
+ "attn_tkg_builtin_kernel_enabled": false,
60
+ "attn_tkg_nki_kernel_enabled": false,
61
+ "batch_size": 4,
62
+ "bucket_n_active_tokens": false,
63
+ "buckets": [
64
+ 1024
65
+ ],
66
+ "cast_type": "config",
67
+ "cc_pipeline_tiling_factor": 1,
68
+ "chunked_prefill_config": null,
69
+ "context_encoding_buckets": null,
70
+ "cp_degree": 1,
71
+ "ctx_batch_size": 1,
72
+ "disable_kv_cache_tiling": false,
73
+ "draft_model_modules_to_not_convert": null,
74
+ "enable_bucketing": true,
75
+ "enable_eagle_draft_input_norm": false,
76
+ "enable_eagle_speculation": false,
77
+ "enable_fused_speculation": false,
78
+ "enable_long_context_mode": false,
79
+ "enable_output_completion_notifications": false,
80
+ "enable_spill_reload_dge": false,
81
+ "enable_token_tree": false,
82
+ "ep_degree": 1,
83
+ "expert_mlp_nki_kernel_enabled": null,
84
+ "flash_decoding_enabled": false,
85
+ "fused_qkv": false,
86
+ "fused_rmsnorm_skip_gamma": false,
87
+ "is_block_kv_layout": null,
88
+ "is_chunked_prefill": false,
89
+ "is_continuous_batching": true,
90
+ "is_eagle_draft": false,
91
+ "is_medusa": false,
92
+ "is_prefill_stage": false,
93
+ "is_prefix_caching": false,
94
+ "k_cache_transposed": false,
95
+ "kv_cache_batch_size": 4,
96
+ "kv_cache_padding_size": 0,
97
+ "kv_cache_quant": false,
98
+ "kv_cache_tiling": false,
99
+ "layer_boundary_markers": false,
100
+ "lm_head_pad": false,
101
+ "lm_head_pad_alignment_size": 1,
102
+ "local_ranks_size": 2,
103
+ "logical_nc_config": 1,
104
+ "lora_config": null,
105
+ "max_batch_size": 4,
106
+ "max_context_length": 2048,
107
+ "max_length": 2048,
108
+ "max_new_tokens": null,
109
+ "medusa_speculation_length": 0,
110
+ "medusa_tree": null,
111
+ "mlp_kernel_enabled": false,
112
+ "mlp_kernel_fuse_residual_add": false,
113
+ "modules_to_not_convert": null,
114
+ "moe_fused_nki_kernel_enabled": null,
115
+ "n_active_tokens": 1,
116
+ "n_positions": 2048,
117
+ "num_medusa_heads": 0,
118
+ "on_cpu": false,
119
+ "on_device_sampling_config": {
120
+ "deterministic": false,
121
+ "do_sample": false,
122
+ "dynamic": true,
123
+ "global_topk": 256,
124
+ "on_device_sampling_config": true,
125
+ "temperature": 1.0,
126
+ "top_k": 1,
127
+ "top_k_kernel_enabled": false,
128
+ "top_p": 1.0
129
+ },
130
+ "output_logits": false,
131
+ "overrides_torch_dtype": true,
132
+ "pa_block_size": 2048,
133
+ "pa_num_blocks": 4,
134
+ "padding_side": "right",
135
+ "pp_degree": 1,
136
+ "prefix_buckets": null,
137
+ "qk_layernorm": false,
138
+ "qkv_kernel_enabled": false,
139
+ "qkv_kernel_fuse_residual_add": false,
140
+ "qkv_kernel_nbsd_layout": false,
141
+ "quantization_dtype": "int8",
142
+ "quantization_type": "per_tensor_symmetric",
143
+ "quantize_clamp_bound": Infinity,
144
+ "quantized": false,
145
+ "quantized_checkpoints_path": null,
146
+ "quantized_mlp_kernel_enabled": false,
147
+ "rmsnorm_quantize_kernel_enabled": false,
148
+ "router_topk_nki_kernel_enabled": null,
149
+ "rpl_reduce_dtype": null,
150
+ "save_sharded_checkpoint": true,
151
+ "scratchpad_page_size": null,
152
+ "seq_len": 2048,
153
+ "seq_len_threshold_for_cc_tiling": 16384,
154
+ "sequence_parallel_enabled": false,
155
+ "shared_mlp_nki_kernel_enabled": null,
156
+ "skip_sharding": false,
157
+ "skip_warmup": false,
158
+ "spec_batch_size": 4,
159
+ "speculation_length": 0,
160
+ "start_rank_id": 0,
161
+ "target": null,
162
+ "tile_cc": false,
163
+ "tkg_batch_size": 4,
164
+ "token_generation_buckets": [
165
+ 1024
166
+ ],
167
+ "token_tree_config": null,
168
+ "torch_dtype": "bfloat16",
169
+ "tp_degree": 2,
170
+ "vocab_parallel": false,
171
+ "weight_gather_seq_len_threshold": 32768,
172
+ "weights_to_skip_layout_optimization": [],
173
+ "world_size": 2
174
+ },
175
+ "no_repeat_ngram_size": 0,
176
+ "num_attention_heads": 32,
177
+ "num_beam_groups": 1,
178
+ "num_beams": 1,
179
+ "num_cores_per_group": 1,
180
+ "num_hidden_layers": 32,
181
+ "num_key_value_heads": 8,
182
+ "num_return_sequences": 1,
183
+ "output_attentions": false,
184
+ "output_hidden_states": false,
185
+ "output_scores": false,
186
+ "pad_token_id": 0,
187
+ "prefix": null,
188
+ "problem_type": null,
189
+ "pruned_heads": {},
190
+ "remove_invalid_values": false,
191
+ "repetition_penalty": 1.0,
192
+ "return_dict": true,
193
+ "return_dict_in_generate": false,
194
+ "rms_norm_eps": 1e-05,
195
+ "rope_theta": 1000000.0,
196
+ "sep_token_id": null,
197
+ "sliding_window": null,
198
+ "suppress_tokens": null,
199
+ "task_specific_params": null,
200
+ "temperature": 1.0,
201
+ "tf_legacy_loss": false,
202
+ "tie_encoder_decoder": false,
203
+ "tie_word_embeddings": false,
204
+ "tokenizer_class": null,
205
+ "top_k": 50,
206
+ "top_p": 1.0,
207
+ "torchscript": false,
208
+ "transformers_version": "4.42.0.dev0",
209
+ "typical_p": 1.0,
210
+ "use_bfloat16": false,
211
+ "use_cache": true,
212
+ "vocab_size": 32768
213
+ }
token_generation_model/_tp0_bk4/command.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ neuronx-cc compile --framework=XLA model.MODULE_fb6decaa94b1936d08da+1b5847e3.hlo_module.pb --output model.MODULE_fb6decaa94b1936d08da+1b5847e3.neff --target=trn1 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma ' --lnc=1 -O2 --internal-hlo2tensorizer-options=--verify-hlo=true --logfile=log-neuron-cc.txt --verbose=35
token_generation_model/_tp0_bk4/compile_flags.MODULE_fb6decaa94b1936d08da+1b5847e3.json ADDED
@@ -0,0 +1 @@
 
 
1
+ ["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma ", "--lnc=1", "-O2", "--internal-hlo2tensorizer-options=--verify-hlo=true", "--logfile=/models/mistral-7b-v0.3-instruct-neuronx/token_generation_model/_tp0_bk4/log-neuron-cc.txt"]
token_generation_model/_tp0_bk4/global_metric_store.json ADDED
@@ -0,0 +1,540 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Average": {
3
+ "tensorizer": {
4
+ "StaticProfiler::AverageFractalPeUtilization": 99.4205093383789,
5
+ "StaticProfiler::AveragePartitionUtilization": 98.67665100097656,
6
+ "StaticProfiler::AveragePeUtilization": 97.56974029541016,
7
+ "StaticProfiler::LocalizationEfficiency": 108.62796020507813,
8
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 108.75718688964844,
9
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0,
10
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0
11
+ }
12
+ },
13
+ "Count": {
14
+ "tensorizer": {
15
+ "StaticProfiler::AverageFractalPeUtilization": 1,
16
+ "StaticProfiler::AveragePartitionUtilization": 1,
17
+ "StaticProfiler::AveragePeUtilization": 1,
18
+ "StaticProfiler::LocalizationEfficiency": 1,
19
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1,
20
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 1,
21
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 1
22
+ }
23
+ },
24
+ "Sum": {
25
+ "compiletime": {
26
+ "AGOrderingAnalysisPass": 2.275869369506836,
27
+ "AffinePredicateResolution": 0.10508346557617188,
28
+ "AliasDependencyElimination": 0.0029473304748535156,
29
+ "AliasDependencyInduction": 1.0161068439483643,
30
+ "AliasDependencyReset": 1.0964906215667725,
31
+ "BFComputeCutting": 0.3273308277130127,
32
+ "BirCodeGenLoop": 1.817042350769043,
33
+ "CCOpFusion": 0.759657621383667,
34
+ "CanonicalizeConv": 1.8000000636675395e-05,
35
+ "CanonicalizeDAGForPGTiling": 0.18803882598876953,
36
+ "CanonicalizeForTensorizer": 0.0005799999926239252,
37
+ "CanonicalizeIR": 0.11030387878417969,
38
+ "Canonicalizer": 0.007720999885350466,
39
+ "CoalesceCCOp": 0.17146921157836914,
40
+ "CommuteConcat": 0.03537917137145996,
41
+ "DMALocalityOpt": 0.03381848335266113,
42
+ "DMAProfiler": 0.08129215240478516,
43
+ "DMATilingProfiler": 0.07448625564575195,
44
+ "DataLocalityOpt": 2.595104932785034,
45
+ "DataStreaming": 0.12432718276977539,
46
+ "DeConcat": 0.026020050048828125,
47
+ "DeadCodeElimination": 0.03551626205444336,
48
+ "DeadStoreElimination": 1.3150177001953125,
49
+ "DelinearIndices": 0.3447244167327881,
50
+ "Delinearization": 0.1470789909362793,
51
+ "DoNothing": 0.00028705596923828125,
52
+ "DramToDramTranspose": 1.1585099697113037,
53
+ "DumpGraphAndMetadata": 0.16599822044372559,
54
+ "EliminateDivs": 0.16966032981872559,
55
+ "ExpandBatchNorm": 0.08444547653198242,
56
+ "ExpandISAMacro": 0.07590079307556152,
57
+ "FactorizeBlkDims": 0.41552281379699707,
58
+ "FactorizeThreadAxesInFreeDims": 0.05351138114929199,
59
+ "FlattenMacroLoop": 0.08939695358276367,
60
+ "GenericAccessSimplifier": 0.03192710876464844,
61
+ "HoistCompute": 9.600000339560211e-05,
62
+ "IdentifyCrossPassTensors": 0.00026000000070780516,
63
+ "InferInitValue": 1.1084926128387451,
64
+ "InferIntrinsicOnCC": 0.36765527725219727,
65
+ "InferNeuronTensor": 1.879042387008667,
66
+ "InferNonlocalTensors": 4.959750175476074,
67
+ "InferPSumTensor": 0.9908087253570557,
68
+ "InlineNativeKernels": 0.04849410057067871,
69
+ "InsertIOTransposes": 0.9103872776031494,
70
+ "InsertLocalTransposes": 0.7676966190338135,
71
+ "InsertOffloadedTransposes": 0.0744314193725586,
72
+ "LICM": 0.10265731811523438,
73
+ "LateLegalizeInst": 0.1494448184967041,
74
+ "LateLegalizePostSplit": 0.08162188529968262,
75
+ "LateLowerReshapeOp": 0.043623924255371094,
76
+ "LateLowerTensorOp": 0.7195582389831543,
77
+ "LateNeuronInstComb": 0.4356348514556885,
78
+ "LayoutPreprocessing": 1.0318124294281006,
79
+ "LayoutPreprocessingAndAnalysis": 1.369814157485962,
80
+ "LayoutRequirementAnalysis": 0.32599711418151855,
81
+ "LegalizeCCOpLayout": 0.11543631553649902,
82
+ "LegalizeOpLevelAlias": 0.042886972427368164,
83
+ "LegalizePartitionReduce": 0.07607841491699219,
84
+ "LegalizeSundaAccess": 1.0659985542297363,
85
+ "LegalizeSundaMacro": 0.3716259002685547,
86
+ "LegalizeType": 0.17189598083496094,
87
+ "LocalLayoutOpt": 0.7340552806854248,
88
+ "LoopFusion": 0.35437893867492676,
89
+ "LoopSplitting": 0.03284788131713867,
90
+ "LowerBroadcast": 0.0972299575805664,
91
+ "LowerCCOpBlockAxis": 0.4246203899383545,
92
+ "LowerComplexBroadcast": 0.1469728946685791,
93
+ "LowerIntrinsics": 0.607001543045044,
94
+ "LowerTensorOp": 0.625375509262085,
95
+ "LowerTranspose": 0.4381232261657715,
96
+ "MacroGeneration": 2.3300650119781494,
97
+ "MaskPropagation": 0.13726162910461426,
98
+ "MemcastMotion": 0.00020199999562464654,
99
+ "MemcpyElimination": 9.993162155151367,
100
+ "MutateDataType": 0.04567551612854004,
101
+ "NeuronAliasDependencyInduction": 0.018932819366455078,
102
+ "NeuronAliasDependencyReset": 0.03390645980834961,
103
+ "NeuronInstComb": 0.35080647468566895,
104
+ "NeuronLICM": 0.26523780822753906,
105
+ "NeuronLoopFusion": 0.7789182662963867,
106
+ "NeuronLoopInterchange": 0.04993152618408203,
107
+ "NeuronSimplifier": 0.3916890621185303,
108
+ "NeuronSimplifyPredicates": 0.06646299362182617,
109
+ "NeuronValueNumbering": 0.09050369262695313,
110
+ "OptimizeAliasedCopyChain": 0.024803638458251953,
111
+ "OptimizeNKIKernels": 0.3809840679168701,
112
+ "PAGLayoutOpt": 44.665672302246094,
113
+ "PComputeCutting": 0.327991247177124,
114
+ "PGLayoutTilingPipeline": 59.74672317504883,
115
+ "PGTiling": 5.59335994720459,
116
+ "PadElimination": 0.014229774475097656,
117
+ "ParAxesAnnotation": 43.884578704833984,
118
+ "PartialLoopFusion": 0.4160308837890625,
119
+ "PartialSimdFusion": 0.35861706733703613,
120
+ "PenguinizeFunctions": 0.00023999999393709004,
121
+ "PerfectLoopNest": 0.06206011772155762,
122
+ "PruneFunctions": 0.00014600000577047467,
123
+ "RecognizeOpIdiom": 0.14852476119995117,
124
+ "Recompute": 0.00867772102355957,
125
+ "RelaxPredicates": 0.12411117553710938,
126
+ "Rematerialization": 0.15559935569763184,
127
+ "RemoveOptimizationBarriers": 0.00010399999882793054,
128
+ "ReshapeWeights": 0.0252685546875,
129
+ "ResolveAccessConflict": 0.23229002952575684,
130
+ "ResolveComplicatePredicates": 0.08260774612426758,
131
+ "RewriteReplicationMatmul": 0.04953479766845703,
132
+ "RewriteWeights": 0.06951451301574707,
133
+ "SFKVectorizer": 5.916876792907715,
134
+ "ScatterMotion": 0.009108999744057655,
135
+ "SimpleAllReduceTiling": 0.05846905708312988,
136
+ "Simplifier": 0.12264633178710938,
137
+ "SimplifyMacroPredicates": 0.19836044311523438,
138
+ "SimplifyNeuronTensor": 0.7509462833404541,
139
+ "SimplifySlice": 0.033946990966796875,
140
+ "SimplifyTensor": 0.20671844482421875,
141
+ "SpillPSum": 0.6513323783874512,
142
+ "SplitAPUnionSets": 0.25449252128601074,
143
+ "SplitAccGrp": 0.04361081123352051,
144
+ "StaticProfiler": 0.44228219985961914,
145
+ "StaticTransposeLocalTensor": 0.25648951530456543,
146
+ "SundaISel": 1.4593234062194824,
147
+ "TCTransform": 0.0390927791595459,
148
+ "TensorInitialization": 0.1486976146697998,
149
+ "TensorOpSimplifier": 0.5177226066589355,
150
+ "TensorOpTransform": 1.8292319774627686,
151
+ "TensorizerLegalizationPass": 0.00022499999613501132,
152
+ "TileCCOps": 0.2758169174194336,
153
+ "TilingProfiler": 0.45922422409057617,
154
+ "TransformConvOp": 0.05972599983215332,
155
+ "TritiumFusion": 1.8706681728363037,
156
+ "ValueNumbering": 0.09980320930480957,
157
+ "VectorizeDMA": 0.12941217422485352,
158
+ "VectorizeMatMult": 0.02786874771118164,
159
+ "VerifySupportedOps": 0.0002690000110305846,
160
+ "WeightCoalescing": 0.04975485801696777,
161
+ "ZeroSizeTensorElimination": 0.00039505958557128906,
162
+ "algsimp": 0.0017940000398084521,
163
+ "batchnorm_expander": 0.0008820000221021473,
164
+ "boundary-marker-removal": 0.00037900000461377203,
165
+ "call-inliner": 0.0003760000108741224,
166
+ "canonicalize-boundary-marker": 0.0004459999909158796,
167
+ "collective-stream-id-checker": 8.099999831756577e-05,
168
+ "comparison-expander": 0.00043700000969693065,
169
+ "computation-deduplicator": 0.0005830000154674053,
170
+ "conditional-to-select": 0.00013899999612476677,
171
+ "config-lowering": 0.0003220000071451068,
172
+ "constant_folding": 0.00022699999681208283,
173
+ "cse": 0.0005879999953322113,
174
+ "dce": 0.00010299999848939478,
175
+ "dynamic-slice-transpose": 0.00020300000323913991,
176
+ "eliminate-redundant-compare": 0.00022000000171829015,
177
+ "emit-offloaded-dropout": 0.000371000001905486,
178
+ "flatten-call-graph": 0.0004889999981969595,
179
+ "fuse-send-recv": 0.0023360000923275948,
180
+ "hilo::LegalizeAlias": 0.003705000039190054,
181
+ "hilo::NeuronInstCombine": 0.0010809999657794833,
182
+ "hilo::NeuronOpFusion": 0.0005680000176653266,
183
+ "hilo::ReplaceTokenTypeWithU8Pass": 0.0003440000000409782,
184
+ "hilo::ScheduleFusion": 6.199999916134402e-05,
185
+ "hilo::SixtyFourHack": 0.00025599999935366213,
186
+ "hilo::VerifyAliasing": 0.00011899999663000926,
187
+ "hlo-mac-count": 0.0007440000190399587,
188
+ "hlo-verifier": 0.008407999761402607,
189
+ "io-con-pipe-begin": 7.000000096013537e-06,
190
+ "io-con-pipe-end": 9.999999974752427e-07,
191
+ "io-layout-normalization": 0.0013689999468624592,
192
+ "legalize-ccops": 2.4000000848900527e-05,
193
+ "legalize-compare": 0.0004239999980200082,
194
+ "lower-argminmax-custom-call": 0.00019700000120792538,
195
+ "map-inline": 0.0007559999939985573,
196
+ "metadata-naming": 0.0019259999971836805,
197
+ "mlir::detail::OpToOpPassAdaptor": 0.00028199999360367656,
198
+ "mlir::hlo::MhloToPyPenguin": 0.058632999658584595,
199
+ "mlir::mhlo::LowerComplexExtraPass": 0.0029980000108480453,
200
+ "mlir::mhlo::LowerComplexPass": 0.0030219999607652426,
201
+ "native-to-custom-softmax": 0.00038400001358240843,
202
+ "native-to-custom-softmax-dx": 0.0004239999980200082,
203
+ "operand_upcaster": 0.0008040000102482736,
204
+ "post-par-pipe-begin": 9.999999974752427e-07,
205
+ "post-par-pipe-end": 0.0,
206
+ "post-partition-simplification": 0.10732399672269821,
207
+ "pre-hlo-begin": 3.000000106112566e-06,
208
+ "pre-hlo-end": 9.999999974752427e-07,
209
+ "replace-minimum-constant": 0.0003169999981764704,
210
+ "reshape-mover": 9.300000237999484e-05,
211
+ "simplify-concat": 0.0022960000205785036,
212
+ "simplify-while-loops": 7.899999764049426e-05,
213
+ "transform-variadic-reduce": 0.000707999977748841,
214
+ "tuple-simplifier": 0.0002300000051036477,
215
+ "unpack-nested-aws-ntwsr": 0.000371000001905486,
216
+ "unroll-while-loop": 1.8000000636675395e-05
217
+ },
218
+ "hilo": {
219
+ "HloMacCount": 15301083136.0,
220
+ "Traffic": 7787368960.0
221
+ },
222
+ "tensorizer": {
223
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 272515,
224
+ "StaticProfiler::AifUb": 8.01248550415039,
225
+ "StaticProfiler::ArithmeticIntensityTensorizer": 8.703800201416016,
226
+ "StaticProfiler::AverageDmaLength": 5833.71923828125,
227
+ "StaticProfiler::DDRTransferBytes": 7663108604,
228
+ "StaticProfiler::InternalTransferBytes": 296554544,
229
+ "StaticProfiler::LoadExpanded": 1250875,
230
+ "StaticProfiler::StoreExpanded": 2218,
231
+ "StaticProfiler::TotalDMAExpanded": 1253093,
232
+ "StaticProfiler::TotalDynamicInstancesCount": 278788,
233
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 273565,
234
+ "StaticProfiler::TotalLNCComm": 0,
235
+ "StaticProfiler::TotalLNCCommTransfer": 0,
236
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0,
237
+ "TilingProfiler::DmaInstructionsAfterTiling": 0,
238
+ "TilingProfiler::GenericInstructionsAfterTiling": 281,
239
+ "TilingProfiler::MatMultInstructionsAfterTiling": 234496,
240
+ "TilingProfiler::NumPfTransposes": 330,
241
+ "TilingProfiler::NumPfTransposesForIo": 33,
242
+ "TilingProfiler::NumPfTransposesForLocal": 162,
243
+ "TilingProfiler::NumPfTransposesForNonlocal": 135,
244
+ "TilingProfiler::PfTransposeInstructions": 13760,
245
+ "TilingProfiler::PfTransposeInstructionsForIo": 8208,
246
+ "TilingProfiler::PfTransposeInstructionsForLocal": 1316,
247
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 4236,
248
+ "TilingProfiler::ReduceInstructionsAfterTiling": 643,
249
+ "TilingProfiler::SimdInstructionsAfterTiling": 7821,
250
+ "TilingProfiler::TotalInstructionsAfterTiling": 0,
251
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0,
252
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0,
253
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0,
254
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0,
255
+ "TransformConvOp::conv2d_column_packing": 0,
256
+ "TransformConvOp::conv2d_column_packing_1": 0,
257
+ "TransformConvOp::conv2d_column_packing_io10": 0,
258
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0
259
+ }
260
+ },
261
+ "all": {
262
+ "compiletime": {
263
+ "CanonicalizeConv": 1.8000000636675395e-05,
264
+ "CanonicalizeForTensorizer": 0.0005799999926239252,
265
+ "Canonicalizer": 0.007720999885350466,
266
+ "HoistCompute": 9.600000339560211e-05,
267
+ "IdentifyCrossPassTensors": 0.00026000000070780516,
268
+ "MemcastMotion": 0.00020199999562464654,
269
+ "PenguinizeFunctions": 0.00023999999393709004,
270
+ "PruneFunctions": 0.00014600000577047467,
271
+ "RemoveOptimizationBarriers": 0.00010399999882793054,
272
+ "ScatterMotion": 0.009108999744057655,
273
+ "TensorizerLegalizationPass": 0.00022499999613501132,
274
+ "VerifySupportedOps": 0.0002690000110305846,
275
+ "algsimp": 0.0017940000398084521,
276
+ "batchnorm_expander": 0.0008820000221021473,
277
+ "boundary-marker-removal": 0.00037900000461377203,
278
+ "call-inliner": 0.0003760000108741224,
279
+ "canonicalize-boundary-marker": 0.0004459999909158796,
280
+ "collective-stream-id-checker": 8.099999831756577e-05,
281
+ "comparison-expander": 0.00043700000969693065,
282
+ "computation-deduplicator": 0.0005830000154674053,
283
+ "conditional-to-select": 0.00013899999612476677,
284
+ "config-lowering": 0.0003220000071451068,
285
+ "constant_folding": 0.00022699999681208283,
286
+ "cse": 0.0005879999953322113,
287
+ "dce": 0.00010299999848939478,
288
+ "dynamic-slice-transpose": 0.00020300000323913991,
289
+ "eliminate-redundant-compare": 0.00022000000171829015,
290
+ "emit-offloaded-dropout": 0.000371000001905486,
291
+ "flatten-call-graph": 0.0004889999981969595,
292
+ "fuse-send-recv": 0.0023360000923275948,
293
+ "hilo::LegalizeAlias": 0.003705000039190054,
294
+ "hilo::NeuronInstCombine": 0.0010809999657794833,
295
+ "hilo::NeuronOpFusion": 0.0005680000176653266,
296
+ "hilo::ReplaceTokenTypeWithU8Pass": 0.0003440000000409782,
297
+ "hilo::ScheduleFusion": 6.199999916134402e-05,
298
+ "hilo::SixtyFourHack": 0.00025599999935366213,
299
+ "hilo::VerifyAliasing": 0.00011899999663000926,
300
+ "hlo-mac-count": 0.0007440000190399587,
301
+ "hlo-verifier": 0.008407999761402607,
302
+ "io-con-pipe-begin": 7.000000096013537e-06,
303
+ "io-con-pipe-end": 9.999999974752427e-07,
304
+ "io-layout-normalization": 0.0013689999468624592,
305
+ "legalize-ccops": 2.4000000848900527e-05,
306
+ "legalize-compare": 0.0004239999980200082,
307
+ "lower-argminmax-custom-call": 0.00019700000120792538,
308
+ "map-inline": 0.0007559999939985573,
309
+ "metadata-naming": 0.0019259999971836805,
310
+ "mlir::detail::OpToOpPassAdaptor": 0.00028199999360367656,
311
+ "mlir::hlo::MhloToPyPenguin": 0.058632999658584595,
312
+ "mlir::mhlo::LowerComplexExtraPass": 0.0029980000108480453,
313
+ "mlir::mhlo::LowerComplexPass": 0.0030219999607652426,
314
+ "native-to-custom-softmax": 0.00038400001358240843,
315
+ "native-to-custom-softmax-dx": 0.0004239999980200082,
316
+ "operand_upcaster": 0.0008040000102482736,
317
+ "post-par-pipe-begin": 9.999999974752427e-07,
318
+ "post-par-pipe-end": 0.0,
319
+ "post-partition-simplification": 0.10732399672269821,
320
+ "pre-hlo-begin": 3.000000106112566e-06,
321
+ "pre-hlo-end": 9.999999974752427e-07,
322
+ "replace-minimum-constant": 0.0003169999981764704,
323
+ "reshape-mover": 9.300000237999484e-05,
324
+ "simplify-concat": 0.0022960000205785036,
325
+ "simplify-while-loops": 7.899999764049426e-05,
326
+ "transform-variadic-reduce": 0.000707999977748841,
327
+ "tuple-simplifier": 0.0002300000051036477,
328
+ "unpack-nested-aws-ntwsr": 0.000371000001905486,
329
+ "unroll-while-loop": 1.8000000636675395e-05
330
+ }
331
+ },
332
+ "cumsum": {
333
+ "compiletime": {
334
+ "CoalesceCCOp": 0.00024366378784179688,
335
+ "DMALocalityOpt": 0.00020456314086914063,
336
+ "DMAProfiler": 0.000926971435546875,
337
+ "DataStreaming": 0.00028705596923828125,
338
+ "DoNothing": 0.00016045570373535156,
339
+ "ExpandISAMacro": 0.0005159378051757813,
340
+ "FactorizeBlkDims": 0.0005528926849365234,
341
+ "InferPSumTensor": 0.0005333423614501953,
342
+ "LateLegalizeInst": 0.00042557716369628906,
343
+ "LateNeuronInstComb": 0.0005631446838378906,
344
+ "LegalizeSundaAccess": 0.0017147064208984375,
345
+ "LegalizeType": 0.0003027915954589844,
346
+ "LowerBroadcast": 0.00026917457580566406,
347
+ "LowerIntrinsics": 0.00027251243591308594,
348
+ "LowerTranspose": 0.0002696514129638672,
349
+ "NeuronInstComb": 0.0005166530609130859,
350
+ "NeuronLICM": 0.0004801750183105469,
351
+ "NeuronSimplifyPredicates": 0.002925872802734375,
352
+ "NeuronValueNumbering": 0.0004703998565673828,
353
+ "SFKVectorizer": 0.002753019332885742,
354
+ "SimpleAllReduceTiling": 0.00023436546325683594,
355
+ "SimplifyNeuronTensor": 0.0004436969757080078,
356
+ "SpillPSum": 0.0006051063537597656,
357
+ "WeightCoalescing": 0.0002589225769042969
358
+ }
359
+ },
360
+ "sg00": {
361
+ "hilo": {
362
+ "ArithmeticIntensity": 3.929718255996704,
363
+ "HloMacCount": 15301083136.0,
364
+ "Traffic": 7787368960.0
365
+ }
366
+ },
367
+ "sg0000": {
368
+ "compiletime": {
369
+ "AGOrderingAnalysisPass": 2.275869369506836,
370
+ "AffinePredicateResolution": 0.10508346557617188,
371
+ "AliasDependencyElimination": 0.0029473304748535156,
372
+ "AliasDependencyInduction": 1.0161068439483643,
373
+ "AliasDependencyReset": 1.0964906215667725,
374
+ "BFComputeCutting": 0.3273308277130127,
375
+ "BirCodeGenLoop": 1.817042350769043,
376
+ "CCOpFusion": 0.759657621383667,
377
+ "CanonicalizeDAGForPGTiling": 0.18803882598876953,
378
+ "CanonicalizeIR": 0.11030387878417969,
379
+ "CoalesceCCOp": 0.17122554779052734,
380
+ "CommuteConcat": 0.03537917137145996,
381
+ "DMALocalityOpt": 0.03361392021179199,
382
+ "DMAProfiler": 0.08036518096923828,
383
+ "DMATilingProfiler": 0.07448625564575195,
384
+ "DataLocalityOpt": 2.595104932785034,
385
+ "DataStreaming": 0.12404012680053711,
386
+ "DeConcat": 0.026020050048828125,
387
+ "DeadCodeElimination": 0.03551626205444336,
388
+ "DeadStoreElimination": 1.3150177001953125,
389
+ "DelinearIndices": 0.3447244167327881,
390
+ "Delinearization": 0.1470789909362793,
391
+ "DoNothing": 0.0001266002655029297,
392
+ "DramToDramTranspose": 1.1585099697113037,
393
+ "DumpGraphAndMetadata": 0.16599822044372559,
394
+ "EliminateDivs": 0.16966032981872559,
395
+ "ExpandBatchNorm": 0.08444547653198242,
396
+ "ExpandISAMacro": 0.07538485527038574,
397
+ "FactorizeBlkDims": 0.41496992111206055,
398
+ "FactorizeThreadAxesInFreeDims": 0.05351138114929199,
399
+ "FlattenMacroLoop": 0.08939695358276367,
400
+ "GenericAccessSimplifier": 0.03192710876464844,
401
+ "InferInitValue": 1.1084926128387451,
402
+ "InferIntrinsicOnCC": 0.36765527725219727,
403
+ "InferNeuronTensor": 1.879042387008667,
404
+ "InferNonlocalTensors": 4.959750175476074,
405
+ "InferPSumTensor": 0.9902753829956055,
406
+ "InlineNativeKernels": 0.04849410057067871,
407
+ "InsertIOTransposes": 0.9103872776031494,
408
+ "InsertLocalTransposes": 0.7676966190338135,
409
+ "InsertOffloadedTransposes": 0.0744314193725586,
410
+ "LICM": 0.10265731811523438,
411
+ "LateLegalizeInst": 0.1490192413330078,
412
+ "LateLegalizePostSplit": 0.08162188529968262,
413
+ "LateLowerReshapeOp": 0.043623924255371094,
414
+ "LateLowerTensorOp": 0.7195582389831543,
415
+ "LateNeuronInstComb": 0.4350717067718506,
416
+ "LayoutPreprocessing": 1.0318124294281006,
417
+ "LayoutPreprocessingAndAnalysis": 1.369814157485962,
418
+ "LayoutRequirementAnalysis": 0.32599711418151855,
419
+ "LegalizeCCOpLayout": 0.11543631553649902,
420
+ "LegalizeOpLevelAlias": 0.042886972427368164,
421
+ "LegalizePartitionReduce": 0.07607841491699219,
422
+ "LegalizeSundaAccess": 1.064283847808838,
423
+ "LegalizeSundaMacro": 0.3716259002685547,
424
+ "LegalizeType": 0.17159318923950195,
425
+ "LocalLayoutOpt": 0.7340552806854248,
426
+ "LoopFusion": 0.35437893867492676,
427
+ "LoopSplitting": 0.03284788131713867,
428
+ "LowerBroadcast": 0.09696078300476074,
429
+ "LowerCCOpBlockAxis": 0.4246203899383545,
430
+ "LowerComplexBroadcast": 0.1469728946685791,
431
+ "LowerIntrinsics": 0.6067290306091309,
432
+ "LowerTensorOp": 0.625375509262085,
433
+ "LowerTranspose": 0.4378535747528076,
434
+ "MacroGeneration": 2.3300650119781494,
435
+ "MaskPropagation": 0.13726162910461426,
436
+ "MemcpyElimination": 9.993162155151367,
437
+ "MutateDataType": 0.04567551612854004,
438
+ "NeuronAliasDependencyInduction": 0.018932819366455078,
439
+ "NeuronAliasDependencyReset": 0.03390645980834961,
440
+ "NeuronInstComb": 0.35028982162475586,
441
+ "NeuronLICM": 0.2647576332092285,
442
+ "NeuronLoopFusion": 0.7789182662963867,
443
+ "NeuronLoopInterchange": 0.04993152618408203,
444
+ "NeuronSimplifier": 0.3916890621185303,
445
+ "NeuronSimplifyPredicates": 0.0635371208190918,
446
+ "NeuronValueNumbering": 0.09003329277038574,
447
+ "OptimizeAliasedCopyChain": 0.024803638458251953,
448
+ "OptimizeNKIKernels": 0.3809840679168701,
449
+ "PAGLayoutOpt": 44.665672302246094,
450
+ "PComputeCutting": 0.327991247177124,
451
+ "PGLayoutTilingPipeline": 59.74672317504883,
452
+ "PGTiling": 5.59335994720459,
453
+ "PadElimination": 0.014229774475097656,
454
+ "ParAxesAnnotation": 43.884578704833984,
455
+ "PartialLoopFusion": 0.4160308837890625,
456
+ "PartialSimdFusion": 0.35861706733703613,
457
+ "PerfectLoopNest": 0.06206011772155762,
458
+ "RecognizeOpIdiom": 0.14852476119995117,
459
+ "Recompute": 0.00867772102355957,
460
+ "RelaxPredicates": 0.12411117553710938,
461
+ "Rematerialization": 0.15559935569763184,
462
+ "ReshapeWeights": 0.0252685546875,
463
+ "ResolveAccessConflict": 0.23229002952575684,
464
+ "ResolveComplicatePredicates": 0.08260774612426758,
465
+ "RewriteReplicationMatmul": 0.04953479766845703,
466
+ "RewriteWeights": 0.06951451301574707,
467
+ "SFKVectorizer": 5.91412353515625,
468
+ "SimpleAllReduceTiling": 0.05823469161987305,
469
+ "Simplifier": 0.12264633178710938,
470
+ "SimplifyMacroPredicates": 0.19836044311523438,
471
+ "SimplifyNeuronTensor": 0.7505025863647461,
472
+ "SimplifySlice": 0.033946990966796875,
473
+ "SimplifyTensor": 0.20671844482421875,
474
+ "SpillPSum": 0.6507272720336914,
475
+ "SplitAPUnionSets": 0.25449252128601074,
476
+ "SplitAccGrp": 0.04361081123352051,
477
+ "StaticProfiler": 0.44228219985961914,
478
+ "StaticTransposeLocalTensor": 0.25648951530456543,
479
+ "SundaISel": 1.4593234062194824,
480
+ "TCTransform": 0.0390927791595459,
481
+ "TensorInitialization": 0.1486976146697998,
482
+ "TensorOpSimplifier": 0.5177226066589355,
483
+ "TensorOpTransform": 1.8292319774627686,
484
+ "TileCCOps": 0.2758169174194336,
485
+ "TilingProfiler": 0.45922422409057617,
486
+ "TransformConvOp": 0.05972599983215332,
487
+ "TritiumFusion": 1.8706681728363037,
488
+ "ValueNumbering": 0.09980320930480957,
489
+ "VectorizeDMA": 0.12941217422485352,
490
+ "VectorizeMatMult": 0.02786874771118164,
491
+ "WeightCoalescing": 0.04949593544006348,
492
+ "ZeroSizeTensorElimination": 0.00039505958557128906
493
+ },
494
+ "tensorizer": {
495
+ "DMATilingProfiler::TotalInstructionsAfterTiling": 272515,
496
+ "StaticProfiler::AifUb": 8.01248550415039,
497
+ "StaticProfiler::ArithmeticIntensityTensorizer": 8.703800201416016,
498
+ "StaticProfiler::AverageDmaLength": 5833.71923828125,
499
+ "StaticProfiler::AverageFractalPeUtilization": 99.4205093383789,
500
+ "StaticProfiler::AveragePartitionUtilization": 98.67665100097656,
501
+ "StaticProfiler::AveragePeUtilization": 97.56974029541016,
502
+ "StaticProfiler::DDRTransferBytes": 7663108604,
503
+ "StaticProfiler::InternalTransferBytes": 296554544,
504
+ "StaticProfiler::LoadExpanded": 1250875,
505
+ "StaticProfiler::LocalizationEfficiency": 108.62796020507813,
506
+ "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 108.75718688964844,
507
+ "StaticProfiler::StoreExpanded": 2218,
508
+ "StaticProfiler::TotalDMAExpanded": 1253093,
509
+ "StaticProfiler::TotalDynamicInstancesCount": 278788,
510
+ "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 273565,
511
+ "StaticProfiler::TotalLNCComm": 0,
512
+ "StaticProfiler::TotalLNCCommTransfer": 0,
513
+ "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0,
514
+ "TilingProfiler::AveragePeUtilizationAfterTiling": 0,
515
+ "TilingProfiler::BatchnormInstructionsAfterTiling": 0,
516
+ "TilingProfiler::DmaInstructionsAfterTiling": 0,
517
+ "TilingProfiler::GenericInstructionsAfterTiling": 281,
518
+ "TilingProfiler::MatMultInstructionsAfterTiling": 234496,
519
+ "TilingProfiler::NumPfTransposes": 330,
520
+ "TilingProfiler::NumPfTransposesForIo": 33,
521
+ "TilingProfiler::NumPfTransposesForLocal": 162,
522
+ "TilingProfiler::NumPfTransposesForNonlocal": 135,
523
+ "TilingProfiler::PfTransposeInstructions": 13760,
524
+ "TilingProfiler::PfTransposeInstructionsForIo": 8208,
525
+ "TilingProfiler::PfTransposeInstructionsForLocal": 1316,
526
+ "TilingProfiler::PfTransposeInstructionsForNonlocal": 4236,
527
+ "TilingProfiler::ReduceInstructionsAfterTiling": 643,
528
+ "TilingProfiler::SimdInstructionsAfterTiling": 7821,
529
+ "TilingProfiler::TotalInstructionsAfterTiling": 0,
530
+ "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0,
531
+ "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0,
532
+ "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0,
533
+ "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0,
534
+ "TransformConvOp::conv2d_column_packing": 0,
535
+ "TransformConvOp::conv2d_column_packing_1": 0,
536
+ "TransformConvOp::conv2d_column_packing_io10": 0,
537
+ "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0
538
+ }
539
+ }
540
+ }
token_generation_model/_tp0_bk4/log-neuron-cc.txt ADDED
The diff for this file is too large to render. See raw diff
 
token_generation_model/_tp0_bk4/neuron_config.json ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation_autoset": false,
3
+ "_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
4
+ "add_cross_attention": false,
5
+ "architectures": [
6
+ "MistralForCausalLM"
7
+ ],
8
+ "attention_dropout": 0.0,
9
+ "attribute_map": {},
10
+ "bad_words_ids": null,
11
+ "begin_suppress_tokens": null,
12
+ "bos_token_id": 1,
13
+ "chunk_size_feed_forward": 0,
14
+ "cross_attention_hidden_size": null,
15
+ "decoder_start_token_id": null,
16
+ "diversity_penalty": 0.0,
17
+ "do_sample": false,
18
+ "early_stopping": false,
19
+ "encoder_no_repeat_ngram_size": 0,
20
+ "eos_token_id": 2,
21
+ "exponential_decay_length_penalty": null,
22
+ "finetuning_task": null,
23
+ "forced_bos_token_id": null,
24
+ "forced_eos_token_id": null,
25
+ "fused_spec_config": null,
26
+ "head_dim": 128,
27
+ "hidden_act": "silu",
28
+ "hidden_size": 4096,
29
+ "id2label": {
30
+ "0": "LABEL_0",
31
+ "1": "LABEL_1"
32
+ },
33
+ "initializer_range": 0.02,
34
+ "intermediate_size": 14336,
35
+ "is_decoder": false,
36
+ "is_encoder_decoder": false,
37
+ "label2id": {
38
+ "LABEL_0": 0,
39
+ "LABEL_1": 1
40
+ },
41
+ "length_penalty": 1.0,
42
+ "max_length": 20,
43
+ "max_position_embeddings": 32768,
44
+ "metadata": null,
45
+ "min_length": 0,
46
+ "model_type": "mistral",
47
+ "neuron_config": {
48
+ "activation_quantization_type": null,
49
+ "allow_input_truncation": false,
50
+ "apply_seq_ids_mask": false,
51
+ "async_mode": false,
52
+ "attention_dp_degree": 1,
53
+ "attention_dtype": null,
54
+ "attn_block_cte_nki_kernel_enabled": false,
55
+ "attn_block_tkg_nki_kernel_cache_update": false,
56
+ "attn_block_tkg_nki_kernel_enabled": false,
57
+ "attn_cls": "NeuronLlamaAttention",
58
+ "attn_kernel_enabled": null,
59
+ "attn_tkg_builtin_kernel_enabled": false,
60
+ "attn_tkg_nki_kernel_enabled": false,
61
+ "batch_size": 4,
62
+ "bucket_n_active_tokens": false,
63
+ "buckets": [
64
+ 2048
65
+ ],
66
+ "cast_type": "config",
67
+ "cc_pipeline_tiling_factor": 1,
68
+ "chunked_prefill_config": null,
69
+ "context_encoding_buckets": null,
70
+ "cp_degree": 1,
71
+ "ctx_batch_size": 1,
72
+ "disable_kv_cache_tiling": false,
73
+ "draft_model_modules_to_not_convert": null,
74
+ "enable_bucketing": true,
75
+ "enable_eagle_draft_input_norm": false,
76
+ "enable_eagle_speculation": false,
77
+ "enable_fused_speculation": false,
78
+ "enable_long_context_mode": false,
79
+ "enable_output_completion_notifications": false,
80
+ "enable_spill_reload_dge": false,
81
+ "enable_token_tree": false,
82
+ "ep_degree": 1,
83
+ "expert_mlp_nki_kernel_enabled": null,
84
+ "flash_decoding_enabled": false,
85
+ "fused_qkv": false,
86
+ "fused_rmsnorm_skip_gamma": false,
87
+ "is_block_kv_layout": null,
88
+ "is_chunked_prefill": false,
89
+ "is_continuous_batching": true,
90
+ "is_eagle_draft": false,
91
+ "is_medusa": false,
92
+ "is_prefill_stage": false,
93
+ "is_prefix_caching": false,
94
+ "k_cache_transposed": false,
95
+ "kv_cache_batch_size": 4,
96
+ "kv_cache_padding_size": 0,
97
+ "kv_cache_quant": false,
98
+ "kv_cache_tiling": false,
99
+ "layer_boundary_markers": false,
100
+ "lm_head_pad": false,
101
+ "lm_head_pad_alignment_size": 1,
102
+ "local_ranks_size": 2,
103
+ "logical_nc_config": 1,
104
+ "lora_config": null,
105
+ "max_batch_size": 4,
106
+ "max_context_length": 2048,
107
+ "max_length": 2048,
108
+ "max_new_tokens": null,
109
+ "medusa_speculation_length": 0,
110
+ "medusa_tree": null,
111
+ "mlp_kernel_enabled": false,
112
+ "mlp_kernel_fuse_residual_add": false,
113
+ "modules_to_not_convert": null,
114
+ "moe_fused_nki_kernel_enabled": null,
115
+ "n_active_tokens": 1,
116
+ "n_positions": 2048,
117
+ "num_medusa_heads": 0,
118
+ "on_cpu": false,
119
+ "on_device_sampling_config": {
120
+ "deterministic": false,
121
+ "do_sample": false,
122
+ "dynamic": true,
123
+ "global_topk": 256,
124
+ "on_device_sampling_config": true,
125
+ "temperature": 1.0,
126
+ "top_k": 1,
127
+ "top_k_kernel_enabled": false,
128
+ "top_p": 1.0
129
+ },
130
+ "output_logits": false,
131
+ "overrides_torch_dtype": true,
132
+ "pa_block_size": 2048,
133
+ "pa_num_blocks": 4,
134
+ "padding_side": "right",
135
+ "pp_degree": 1,
136
+ "prefix_buckets": null,
137
+ "qk_layernorm": false,
138
+ "qkv_kernel_enabled": false,
139
+ "qkv_kernel_fuse_residual_add": false,
140
+ "qkv_kernel_nbsd_layout": false,
141
+ "quantization_dtype": "int8",
142
+ "quantization_type": "per_tensor_symmetric",
143
+ "quantize_clamp_bound": Infinity,
144
+ "quantized": false,
145
+ "quantized_checkpoints_path": null,
146
+ "quantized_mlp_kernel_enabled": false,
147
+ "rmsnorm_quantize_kernel_enabled": false,
148
+ "router_topk_nki_kernel_enabled": null,
149
+ "rpl_reduce_dtype": null,
150
+ "save_sharded_checkpoint": true,
151
+ "scratchpad_page_size": null,
152
+ "seq_len": 2048,
153
+ "seq_len_threshold_for_cc_tiling": 16384,
154
+ "sequence_parallel_enabled": false,
155
+ "shared_mlp_nki_kernel_enabled": null,
156
+ "skip_sharding": false,
157
+ "skip_warmup": false,
158
+ "spec_batch_size": 4,
159
+ "speculation_length": 0,
160
+ "start_rank_id": 0,
161
+ "target": null,
162
+ "tile_cc": false,
163
+ "tkg_batch_size": 4,
164
+ "token_generation_buckets": [
165
+ 2048
166
+ ],
167
+ "token_tree_config": null,
168
+ "torch_dtype": "bfloat16",
169
+ "tp_degree": 2,
170
+ "vocab_parallel": false,
171
+ "weight_gather_seq_len_threshold": 32768,
172
+ "weights_to_skip_layout_optimization": [],
173
+ "world_size": 2
174
+ },
175
+ "no_repeat_ngram_size": 0,
176
+ "num_attention_heads": 32,
177
+ "num_beam_groups": 1,
178
+ "num_beams": 1,
179
+ "num_cores_per_group": 1,
180
+ "num_hidden_layers": 32,
181
+ "num_key_value_heads": 8,
182
+ "num_return_sequences": 1,
183
+ "output_attentions": false,
184
+ "output_hidden_states": false,
185
+ "output_scores": false,
186
+ "pad_token_id": 0,
187
+ "prefix": null,
188
+ "problem_type": null,
189
+ "pruned_heads": {},
190
+ "remove_invalid_values": false,
191
+ "repetition_penalty": 1.0,
192
+ "return_dict": true,
193
+ "return_dict_in_generate": false,
194
+ "rms_norm_eps": 1e-05,
195
+ "rope_theta": 1000000.0,
196
+ "sep_token_id": null,
197
+ "sliding_window": null,
198
+ "suppress_tokens": null,
199
+ "task_specific_params": null,
200
+ "temperature": 1.0,
201
+ "tf_legacy_loss": false,
202
+ "tie_encoder_decoder": false,
203
+ "tie_word_embeddings": false,
204
+ "tokenizer_class": null,
205
+ "top_k": 50,
206
+ "top_p": 1.0,
207
+ "torchscript": false,
208
+ "transformers_version": "4.42.0.dev0",
209
+ "typical_p": 1.0,
210
+ "use_bfloat16": false,
211
+ "use_cache": true,
212
+ "vocab_size": 32768
213
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff