diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..676657bad56e970af117f85e4b50236807780f4e 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,27 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.model.v3 filter=lfs diff=lfs merge=lfs -text +token_generation_model/_tp0_bk3/graph.neff filter=lfs diff=lfs merge=lfs -text +token_generation_model/_tp0_bk3/model.MODULE_668122c92a86c0ce6817+f94fe8ed.neff filter=lfs diff=lfs merge=lfs -text +token_generation_model/_tp0_bk4/model.MODULE_fb6decaa94b1936d08da+1b5847e3.neff filter=lfs diff=lfs merge=lfs -text +token_generation_model/_tp0_bk4/graph.neff filter=lfs diff=lfs merge=lfs -text +token_generation_model/_tp0_bk0/wrapped_neff.hlo filter=lfs diff=lfs merge=lfs -text +token_generation_model/_tp0_bk0/graph.neff filter=lfs diff=lfs merge=lfs -text +token_generation_model/_tp0_bk0/model.MODULE_67d3774d5bacfe6ba851+72d461cc.neff filter=lfs diff=lfs merge=lfs -text +token_generation_model/_tp0_bk2/model.MODULE_2f686dc6ba7ef3326a56+6113de8c.neff filter=lfs diff=lfs merge=lfs -text +token_generation_model/_tp0_bk2/graph.neff filter=lfs diff=lfs merge=lfs -text +token_generation_model/_tp0_bk1/graph.neff filter=lfs diff=lfs merge=lfs -text +token_generation_model/_tp0_bk1/model.MODULE_92bbfea7801df2fea75e+4948da29.neff filter=lfs diff=lfs merge=lfs -text +context_encoding_model/_tp0_bk1/graph.neff filter=lfs diff=lfs merge=lfs -text +context_encoding_model/_tp0_bk1/model.MODULE_68c159ab1fef44a40212+6a9a7e72.neff filter=lfs diff=lfs merge=lfs -text +context_encoding_model/_tp0_bk2/graph.neff filter=lfs diff=lfs merge=lfs -text +context_encoding_model/_tp0_bk2/model.MODULE_78e5291800ea5b96a03b+442879bd.neff filter=lfs diff=lfs merge=lfs -text +context_encoding_model/_tp0_bk0/model.MODULE_c6824be80aab0b095843+cc19d8a1.neff filter=lfs diff=lfs merge=lfs -text +context_encoding_model/_tp0_bk0/graph.neff filter=lfs diff=lfs merge=lfs -text +context_encoding_model/_tp0_bk3/model.MODULE_2e1f11fbf72d40b46e64+5ae2bfda.neff filter=lfs diff=lfs merge=lfs -text +context_encoding_model/_tp0_bk3/graph.neff filter=lfs diff=lfs merge=lfs -text +context_encoding_model/_tp0_bk4/model.MODULE_d342327da795afc2aa68+5e8b788a.neff filter=lfs diff=lfs merge=lfs -text +context_encoding_model/_tp0_bk4/graph.neff filter=lfs diff=lfs merge=lfs -text +layout_opt/graph.neff filter=lfs diff=lfs merge=lfs -text +layout_opt/model/graph.hlo filter=lfs diff=lfs merge=lfs -text diff --git a/context_encoding_model/_tp0_bk0/graph.neff b/context_encoding_model/_tp0_bk0/graph.neff new file mode 100644 index 0000000000000000000000000000000000000000..1071a63590675160614ee61d656a3b69accf63bf --- /dev/null +++ b/context_encoding_model/_tp0_bk0/graph.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4204eb4a15cdd349ac9a8e820ca7e3720613827e792ac79e7a5dd1055080e37 +size 625664 diff --git a/context_encoding_model/_tp0_bk0/metaneff.pb b/context_encoding_model/_tp0_bk0/metaneff.pb new file mode 100644 index 0000000000000000000000000000000000000000..c401e6ee11682e17a11c2242a388670ce5c5f430 --- /dev/null +++ b/context_encoding_model/_tp0_bk0/metaneff.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:026981247cc92ae3d4098052e6e5cd96444bcad2ad94540d0cedbaf5978e6a67 +size 873633 diff --git a/context_encoding_model/_tp0_bk0/model.MODULE_c6824be80aab0b095843+cc19d8a1.hlo_module.pb b/context_encoding_model/_tp0_bk0/model.MODULE_c6824be80aab0b095843+cc19d8a1.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..6b7868b629f451ab575198397354a5d025ea33b1 --- /dev/null +++ b/context_encoding_model/_tp0_bk0/model.MODULE_c6824be80aab0b095843+cc19d8a1.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b83c27a0c2c3a9734291ce7f47544f4494b27f1c8a6c5b171a2abaead1f7e45c +size 939543 diff --git a/context_encoding_model/_tp0_bk0/model.MODULE_c6824be80aab0b095843+cc19d8a1.neff b/context_encoding_model/_tp0_bk0/model.MODULE_c6824be80aab0b095843+cc19d8a1.neff new file mode 100644 index 0000000000000000000000000000000000000000..1071a63590675160614ee61d656a3b69accf63bf --- /dev/null +++ b/context_encoding_model/_tp0_bk0/model.MODULE_c6824be80aab0b095843+cc19d8a1.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4204eb4a15cdd349ac9a8e820ca7e3720613827e792ac79e7a5dd1055080e37 +size 625664 diff --git a/context_encoding_model/_tp0_bk1/graph.neff b/context_encoding_model/_tp0_bk1/graph.neff new file mode 100644 index 0000000000000000000000000000000000000000..0aa09045d7a26f44b7d1f0d1021b1c3c493d643d --- /dev/null +++ b/context_encoding_model/_tp0_bk1/graph.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97771a78aebed34c313542b68a55aa0b7ad1bcc196ef7859e9c6d32f2aca5755 +size 728064 diff --git a/context_encoding_model/_tp0_bk1/metaneff.pb b/context_encoding_model/_tp0_bk1/metaneff.pb new file mode 100644 index 0000000000000000000000000000000000000000..7c41b7d747bbfa227ebcc30c05d7940ddc7c7533 --- /dev/null +++ b/context_encoding_model/_tp0_bk1/metaneff.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7919366f46c6b8e36ccac5f786a1a8c01287cf244d988f1b58a68be4356face6 +size 971205 diff --git a/context_encoding_model/_tp0_bk1/model.MODULE_68c159ab1fef44a40212+6a9a7e72.hlo_module.pb b/context_encoding_model/_tp0_bk1/model.MODULE_68c159ab1fef44a40212+6a9a7e72.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..046fa8492cee476fb0333ce67c65a381e736def6 --- /dev/null +++ b/context_encoding_model/_tp0_bk1/model.MODULE_68c159ab1fef44a40212+6a9a7e72.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:925d5603c197f320b5a97bcd1bb270fee71d58a600ff2ac6d2ac1c4ce205b7b6 +size 1037079 diff --git a/context_encoding_model/_tp0_bk1/model.MODULE_68c159ab1fef44a40212+6a9a7e72.neff b/context_encoding_model/_tp0_bk1/model.MODULE_68c159ab1fef44a40212+6a9a7e72.neff new file mode 100644 index 0000000000000000000000000000000000000000..0aa09045d7a26f44b7d1f0d1021b1c3c493d643d --- /dev/null +++ b/context_encoding_model/_tp0_bk1/model.MODULE_68c159ab1fef44a40212+6a9a7e72.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97771a78aebed34c313542b68a55aa0b7ad1bcc196ef7859e9c6d32f2aca5755 +size 728064 diff --git a/context_encoding_model/_tp0_bk2/graph.neff b/context_encoding_model/_tp0_bk2/graph.neff new file mode 100644 index 0000000000000000000000000000000000000000..6ccb36b858b4e44af9c8bedcee63824f7da780f5 --- /dev/null +++ b/context_encoding_model/_tp0_bk2/graph.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:984d840f5e118d1ed3bba502a877aa785002b074a22a45b384cd2172958beb3f +size 1035264 diff --git a/context_encoding_model/_tp0_bk2/metaneff.pb b/context_encoding_model/_tp0_bk2/metaneff.pb new file mode 100644 index 0000000000000000000000000000000000000000..a5784a2fc5a8ca6d5c3e8503445ab61192b071af --- /dev/null +++ b/context_encoding_model/_tp0_bk2/metaneff.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0cb712ca363e2b1bd7dcf4027263b606df90abb05f672ae1e8fb4af5f2b3616 +size 1167813 diff --git a/context_encoding_model/_tp0_bk2/model.MODULE_78e5291800ea5b96a03b+442879bd.hlo_module.pb b/context_encoding_model/_tp0_bk2/model.MODULE_78e5291800ea5b96a03b+442879bd.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..0296c67f16bdac666099f79e8d679939d95c961a --- /dev/null +++ b/context_encoding_model/_tp0_bk2/model.MODULE_78e5291800ea5b96a03b+442879bd.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdf8c3a993bb11cf1a28c6df55c6f130c69c4908ba4dde20d7bbc2356b5f2f53 +size 1233687 diff --git a/context_encoding_model/_tp0_bk2/model.MODULE_78e5291800ea5b96a03b+442879bd.neff b/context_encoding_model/_tp0_bk2/model.MODULE_78e5291800ea5b96a03b+442879bd.neff new file mode 100644 index 0000000000000000000000000000000000000000..6ccb36b858b4e44af9c8bedcee63824f7da780f5 --- /dev/null +++ b/context_encoding_model/_tp0_bk2/model.MODULE_78e5291800ea5b96a03b+442879bd.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:984d840f5e118d1ed3bba502a877aa785002b074a22a45b384cd2172958beb3f +size 1035264 diff --git a/context_encoding_model/_tp0_bk3/compile_flags.MODULE_2e1f11fbf72d40b46e64+5ae2bfda.json b/context_encoding_model/_tp0_bk3/compile_flags.MODULE_2e1f11fbf72d40b46e64+5ae2bfda.json new file mode 100644 index 0000000000000000000000000000000000000000..23d9792160eb2da485e0d2aedc7983d2f4373ec0 --- /dev/null +++ b/context_encoding_model/_tp0_bk3/compile_flags.MODULE_2e1f11fbf72d40b46e64+5ae2bfda.json @@ -0,0 +1 @@ +["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "--lnc=1", "-O1", "--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true", "--logfile=/models/mistral-7b-v0.3-instruct-neuronx/context_encoding_model/_tp0_bk3/log-neuron-cc.txt"] \ No newline at end of file diff --git a/context_encoding_model/_tp0_bk3/global_metric_store.json b/context_encoding_model/_tp0_bk3/global_metric_store.json new file mode 100644 index 0000000000000000000000000000000000000000..124e8d14a6243693dcef7768a26de3381f1a160e --- /dev/null +++ b/context_encoding_model/_tp0_bk3/global_metric_store.json @@ -0,0 +1,1079 @@ +{ + "Average": { + "tensorizer": { + "StaticProfiler::AverageFractalPeUtilization": 99.79875946044922, + "StaticProfiler::AveragePartitionUtilization": 99.50694274902344, + "StaticProfiler::AveragePeUtilization": 99.19517517089844, + "StaticProfiler::LocalizationEfficiency": 80.37861633300781, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 88.63314819335938, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0 + } + }, + "Count": { + "tensorizer": { + "StaticProfiler::AverageFractalPeUtilization": 1.0, + "StaticProfiler::AveragePartitionUtilization": 1.0, + "StaticProfiler::AveragePeUtilization": 1.0, + "StaticProfiler::LocalizationEfficiency": 1.0, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 1.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 1.0 + } + }, + "Sum": { + "compiletime": { + "AGOrderingAnalysisPass": 0.01833963394165039, + "AffinePredicateResolution": 0.0011298656463623047, + "AliasDependencyElimination": 0.0003044605255126953, + "AliasDependencyInduction": 0.0059871673583984375, + "AliasDependencyReset": 0.024695634841918945, + "BFComputeCutting": 0.0022745132446289063, + "BirCodeGenLoop": 0.12499594688415527, + "CCOpFusion": 0.025257110595703125, + "CanonicalizeConv": 2.300000051036477e-05, + "CanonicalizeDAGForPGTiling": 0.0039975643157958984, + "CanonicalizeForTensorizer": 5.6000000768108293e-05, + "CanonicalizeIR": 0.0017023086547851563, + "Canonicalizer": 0.0013979999348521233, + "CoalesceCCOp": 0.006863117218017578, + "CommuteConcat": 0.0009205341339111328, + "DMALocalityOpt": 0.0034034252166748047, + "DMAProfiler": 0.007045269012451172, + "DMATilingProfiler": 0.004607439041137695, + "DataLocalityOpt": 0.15957880020141602, + "DataStreaming": 0.037320613861083984, + "DeConcat": 0.0007259845733642578, + "DeadCodeElimination": 0.0009546279907226563, + "DeadStoreElimination": 0.006250619888305664, + "DelinearIndices": 0.005332231521606445, + "Delinearization": 0.0033500194549560547, + "DoNothing": 0.00037598609924316406, + "DramToDramTranspose": 0.020763397216796875, + "DumpGraphAndMetadata": 0.025223493576049805, + "EliminateDivs": 0.0023469924926757813, + "ExpandBatchNorm": 0.001692056655883789, + "ExpandISAMacro": 0.009050607681274414, + "FactorizeBlkDims": 0.009798526763916016, + "FactorizeThreadAxesInFreeDims": 0.002184152603149414, + "FlattenMacroLoop": 0.0022482872009277344, + "GenericAccessSimplifier": 0.0009622573852539063, + "HoistCompute": 6.000000212225132e-06, + "IdentifyCrossPassTensors": 5.999999848427251e-05, + "InferInitValue": 0.027300357818603516, + "InferIntrinsicOnCC": 0.009199380874633789, + "InferNeuronTensor": 0.028067350387573242, + "InferNonlocalTensors": 0.014671802520751953, + "InferPSumTensor": 0.08141279220581055, + "InlineNativeKernels": 0.002727031707763672, + "InsertIOTransposes": 0.017727136611938477, + "InsertLocalTransposes": 0.004176616668701172, + "InsertOffloadedTransposes": 0.002771615982055664, + "LICM": 0.005248069763183594, + "LateLegalizeInst": 0.007282733917236328, + "LateLegalizePostSplit": 0.0045223236083984375, + "LateLowerReshapeOp": 0.0012927055358886719, + "LateLowerTensorOp": 0.0014028549194335938, + "LateNeuronInstComb": 0.016957759857177734, + "LayoutPreprocessing": 0.026221275329589844, + "LayoutPreprocessingAndAnalysis": 0.07468867301940918, + "LayoutRequirementAnalysis": 0.004823446273803711, + "LegalizeCCOpLayout": 0.0023353099822998047, + "LegalizeOpLevelAlias": 0.0013494491577148438, + "LegalizePartitionReduce": 0.0018906593322753906, + "LegalizeSundaAccess": 0.06240987777709961, + "LegalizeSundaMacro": 0.04256129264831543, + "LegalizeType": 0.006028175354003906, + "LocalLayoutOpt": 0.016018390655517578, + "LoopFusion": 0.005109071731567383, + "LoopSplitting": 0.00048542022705078125, + "LowerBroadcast": 0.003258943557739258, + "LowerCCOpBlockAxis": 0.0038700103759765625, + "LowerComplexBroadcast": 0.004511594772338867, + "LowerIntrinsics": 0.32482099533081055, + "LowerTensorOp": 0.010710477828979492, + "LowerTranspose": 0.054924726486206055, + "MacroGeneration": 0.061620473861694336, + "MaskPropagation": 0.002919435501098633, + "MemcastMotion": 3.400000059627928e-05, + "MemcpyElimination": 0.02559375762939453, + "MutateDataType": 0.0014896392822265625, + "NeuronAliasDependencyInduction": 0.0002808570861816406, + "NeuronAliasDependencyReset": 0.05649685859680176, + "NeuronInstComb": 0.005097627639770508, + "NeuronLICM": 0.014602899551391602, + "NeuronLoopFusion": 0.009732246398925781, + "NeuronLoopInterchange": 0.0025072097778320313, + "NeuronSimplifier": 0.03835606575012207, + "NeuronSimplifyPredicates": 0.009032487869262695, + "NeuronValueNumbering": 0.003210306167602539, + "OptimizeAliasedCopyChain": 0.0007545948028564453, + "OptimizeNKIKernels": 0.6443507671356201, + "PAGLayoutOpt": 0.20021605491638184, + "PComputeCutting": 0.0046160221099853516, + "PGLayoutTilingPipeline": 0.6925618648529053, + "PGTiling": 0.21065187454223633, + "PadElimination": 0.00038623809814453125, + "ParAxesAnnotation": 0.052834510803222656, + "PartialLoopFusion": 0.051622629165649414, + "PartialSimdFusion": 0.014065980911254883, + "PenguinizeFunctions": 5.199999941396527e-05, + "PerfectLoopNest": 0.0019462108612060547, + "PruneFunctions": 4.3000000005122274e-05, + "RecognizeOpIdiom": 0.0037450790405273438, + "Recompute": 0.0004031658172607422, + "RelaxPredicates": 0.03561973571777344, + "Rematerialization": 0.0018870830535888672, + "RemoveOptimizationBarriers": 6.500000017695129e-05, + "ReshapeWeights": 0.0009450912475585938, + "ResolveAccessConflict": 0.0038840770721435547, + "ResolveComplicatePredicates": 0.0011222362518310547, + "RewriteReplicationMatmul": 0.0017135143280029297, + "RewriteWeights": 0.0024623870849609375, + "SFKVectorizer": 0.19468188285827637, + "ScatterMotion": 1.0000000656873453e-05, + "SimpleAllReduceTiling": 0.0037994384765625, + "Simplifier": 0.0030031204223632813, + "SimplifyMacroPredicates": 0.005193233489990234, + "SimplifyNeuronTensor": 0.38555216789245605, + "SimplifySlice": 0.001062631607055664, + "SimplifyTensor": 0.009534358978271484, + "SpillPSum": 0.053937673568725586, + "SplitAPUnionSets": 0.013537406921386719, + "SplitAccGrp": 0.0014171600341796875, + "StaticProfiler": 0.005720615386962891, + "StaticTransposeLocalTensor": 0.003614664077758789, + "SundaISel": 0.09031486511230469, + "TCTransform": 0.0008947849273681641, + "TensorInitialization": 0.010958433151245117, + "TensorOpSimplifier": 0.005278110504150391, + "TensorOpTransform": 0.020787477493286133, + "TensorizerLegalizationPass": 6.299999949987978e-05, + "TileCCOps": 0.005544900894165039, + "TilingProfiler": 0.007747173309326172, + "TransformConvOp": 0.003238677978515625, + "TritiumFusion": 0.16130614280700684, + "ValueNumbering": 0.0018999576568603516, + "VectorizeDMA": 0.0017979145050048828, + "VectorizeMatMult": 0.007079362869262695, + "VerifySupportedOps": 4.900000203633681e-05, + "WeightCoalescing": 0.0033416748046875, + "ZeroSizeTensorElimination": 0.00022983551025390625, + "algsimp": 0.0024079999420791864, + "batchnorm_expander": 4.999999873689376e-05, + "boundary-marker-removal": 1.3999999282532372e-05, + "call-inliner": 0.0004330000083427876, + "canonicalize-boundary-marker": 1.6999998479150236e-05, + "collective-stream-id-checker": 9.699999645818025e-05, + "comparison-expander": 0.0005000000237487257, + "computation-deduplicator": 7.700000423938036e-05, + "conditional-to-select": 2.099999983329326e-05, + "config-lowering": 0.00019799999427050352, + "constant-statistics": 0.0005200000014156103, + "constant_folding": 0.000295000005280599, + "cse": 5.499999679159373e-05, + "dce": 8.099999831756577e-05, + "dot_decomposer": 0.0013620000099763274, + "dynamic-slice-transpose": 1.5999999959603883e-05, + "eliminate-redundant-compare": 0.00025499999173916876, + "emit-offloaded-dropout": 7.100000220816582e-05, + "flatten-call-graph": 0.0007510000141337514, + "fuse-send-recv": 8.70000003487803e-05, + "hilo::LegalizeAlias": 1.5999999959603883e-05, + "hilo::NeuronInstCombine": 0.00015199999324977398, + "hilo::NeuronOpFusion": 5.0000002374872565e-05, + "hilo::ReplaceTokenTypeWithU8Pass": 6.70000008540228e-05, + "hilo::ScheduleFusion": 3.999999989900971e-06, + "hilo::SixtyFourHack": 7.599999662488699e-05, + "hilo::VerifyAliasing": 8.999999408842996e-06, + "hlo-mac-count": 0.0012550000101327896, + "hlo-verifier": 0.008069000206887722, + "instruction-histogram": 0.001006999984383583, + "io-con-pipe-begin": 7.999999979801942e-06, + "io-con-pipe-end": 9.999999974752427e-07, + "io-layout-normalization": 0.001221999991685152, + "io-statistics": 9.200000204145908e-05, + "legalize-ccops": 3.999999989900971e-06, + "legalize-compare": 1.4999999621068127e-05, + "lower-argminmax-custom-call": 1.300000076298602e-05, + "map-inline": 0.0007819999591447413, + "metadata-naming": 6.800000119255856e-05, + "mlir::detail::OpToOpPassAdaptor": 0.00011300000187475234, + "mlir::hlo::MhloToPyPenguin": 0.07539799809455872, + "mlir::mhlo::LowerComplexExtraPass": 0.00035600000410340726, + "mlir::mhlo::LowerComplexPass": 0.0005510000046342611, + "native-to-custom-softmax": 0.0006350000621750951, + "native-to-custom-softmax-dx": 0.0006360000115819275, + "operand_upcaster": 6.799999391660094e-05, + "opt-barrier-removal": 0.0004710000066552311, + "post-par-pipe-begin": 1.4000000192027073e-05, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.0020860000513494015, + "pre-par-pipe-begin": 9.999999974752427e-07, + "pre-par-pipe-end": 0.0, + "pre-partition-simplification": 0.21597300469875336, + "replace-minimum-constant": 0.00034199998481199145, + "reshape-mover": 0.00011600000289035961, + "simplify-concat": 0.00017500000831205398, + "simplify-while-loops": 0.00010400000610388815, + "transform-variadic-reduce": 8.299999899463728e-05, + "tuple-simplifier": 0.00028500001644715667, + "unpack-nested-aws-ntwsr": 0.0003440000000409782, + "unroll-while-loop": 1.900000097521115e-05, + "zero_sized_hlo_elimination": 0.0008210000232793391 + }, + "hilo": { + "ConstantSize": 2106325.0, + "HloInputCount": 359.0, + "HloMacCount": 231995342848.0, + "HloOutputCount": 65.0, + "IfmapSize": 7785168896.0, + "OfmapSize": 536870912.0, + "OutputsReadFromCount": 0.0, + "PassthroughTensorsCount": 0.0, + "RedundantOutputCount": 0.0, + "Traffic": 854718848.0 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 17056.0, + "StaticProfiler::AifUb": 568.2581176757813, + "StaticProfiler::ArithmeticIntensityTensorizer": 456.7580261230469, + "StaticProfiler::AverageDmaLength": 1314.3221435546875, + "StaticProfiler::DDRTransferBytes": 407087136.0, + "StaticProfiler::InternalTransferBytes": 48342036.0, + "StaticProfiler::LoadExpanded": 310291.0, + "StaticProfiler::StoreExpanded": 6699.0, + "StaticProfiler::TotalDMAExpanded": 316990.0, + "StaticProfiler::TotalDynamicInstancesCount": 19674.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 19578.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 4.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 14848.0, + "TilingProfiler::NumPfTransposes": 4.0, + "TilingProfiler::NumPfTransposesForIo": 0.0, + "TilingProfiler::NumPfTransposesForLocal": 1.0, + "TilingProfiler::NumPfTransposesForNonlocal": 3.0, + "TilingProfiler::PfTransposeInstructions": 769.0, + "TilingProfiler::PfTransposeInstructionsForIo": 0.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 1.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 768.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 6.0, + "TilingProfiler::SimdInstructionsAfterTiling": 319.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "all": { + "compiletime": { + "algsimp": 0.0021410000044852495, + "call-inliner": 0.00039599998854100704, + "collective-stream-id-checker": 7.79999973019585e-05, + "comparison-expander": 0.00048099999548867345, + "constant-statistics": 0.0005200000014156103, + "constant_folding": 0.0002629999944474548, + "dce": 7.699999696342275e-05, + "dot_decomposer": 0.0013620000099763274, + "eliminate-redundant-compare": 0.00024199999461416155, + "flatten-call-graph": 0.0007140000234358013, + "hlo-mac-count": 0.0009169999975711107, + "hlo-verifier": 0.007406999822705984, + "instruction-histogram": 0.001006999984383583, + "io-con-pipe-begin": 7.999999979801942e-06, + "io-con-pipe-end": 9.999999974752427e-07, + "io-layout-normalization": 0.001221999991685152, + "io-statistics": 9.200000204145908e-05, + "map-inline": 0.0007389999809674919, + "native-to-custom-softmax": 0.0005990000208839774, + "native-to-custom-softmax-dx": 0.0004440000047907233, + "opt-barrier-removal": 0.0004710000066552311, + "pre-par-pipe-begin": 9.999999974752427e-07, + "pre-par-pipe-end": 0.0, + "pre-partition-simplification": 0.21597300469875336, + "replace-minimum-constant": 0.00030899999546818435, + "reshape-mover": 0.00010299999848939478, + "simplify-while-loops": 9.40000027185306e-05, + "tuple-simplifier": 0.00026699999580159783, + "unpack-nested-aws-ntwsr": 0.0003319999959785491, + "unroll-while-loop": 1.8000000636675395e-05, + "zero_sized_hlo_elimination": 0.0008210000232793391 + } + }, + "cumsum": { + "compiletime": { + "CoalesceCCOp": 0.000293731689453125, + "DMALocalityOpt": 0.00022101402282714844, + "DMAProfiler": 0.0010464191436767578, + "DataStreaming": 0.00040221214294433594, + "DoNothing": 0.00025200843811035156, + "ExpandISAMacro": 0.0005903244018554688, + "FactorizeBlkDims": 0.0005807876586914063, + "InferPSumTensor": 0.0005562305450439453, + "LateLegalizeInst": 0.00046944618225097656, + "LateNeuronInstComb": 0.0006792545318603516, + "LegalizeSundaAccess": 0.0017774105072021484, + "LegalizeType": 0.00032138824462890625, + "LowerBroadcast": 0.0003333091735839844, + "LowerIntrinsics": 0.0002849102020263672, + "LowerTranspose": 0.00046753883361816406, + "NeuronInstComb": 0.0008723735809326172, + "NeuronLICM": 0.00047659873962402344, + "NeuronSimplifyPredicates": 0.0030825138092041016, + "NeuronValueNumbering": 0.0004870891571044922, + "SFKVectorizer": 0.003458738327026367, + "SimpleAllReduceTiling": 0.0002646446228027344, + "SimplifyNeuronTensor": 0.0004863739013671875, + "SpillPSum": 0.0005884170532226563, + "WeightCoalescing": 0.00028324127197265625 + } + }, + "sg00": { + "compiletime": { + "CanonicalizeConv": 2.300000051036477e-05, + "CanonicalizeForTensorizer": 2.2000000171829015e-05, + "Canonicalizer": 0.0005029999883845448, + "HoistCompute": 9.999999974752427e-07, + "IdentifyCrossPassTensors": 2.2000000171829015e-05, + "MemcastMotion": 7.999999979801942e-06, + "PenguinizeFunctions": 2.2000000171829015e-05, + "PruneFunctions": 1.5999999959603883e-05, + "RemoveOptimizationBarriers": 2.4000000848900527e-05, + "ScatterMotion": 1.9999999949504854e-06, + "TensorizerLegalizationPass": 3.400000059627928e-05, + "VerifySupportedOps": 1.5999999959603883e-05, + "algsimp": 6.900000153109431e-05, + "batchnorm_expander": 1.2999999853491317e-05, + "boundary-marker-removal": 3.000000106112566e-06, + "call-inliner": 7.999999979801942e-06, + "canonicalize-boundary-marker": 3.999999989900971e-06, + "collective-stream-id-checker": 3.000000106112566e-06, + "comparison-expander": 3.999999989900971e-06, + "computation-deduplicator": 1.5999999959603883e-05, + "conditional-to-select": 4.999999873689376e-06, + "config-lowering": 5.8000001445179805e-05, + "constant_folding": 7.999999979801942e-06, + "cse": 1.4999999621068127e-05, + "dce": 9.999999974752427e-07, + "dynamic-slice-transpose": 3.999999989900971e-06, + "eliminate-redundant-compare": 3.000000106112566e-06, + "emit-offloaded-dropout": 2.099999983329326e-05, + "flatten-call-graph": 7.999999979801942e-06, + "fuse-send-recv": 2.499999936844688e-05, + "hilo::LegalizeAlias": 7.000000096013537e-06, + "hilo::NeuronInstCombine": 5.400000009103678e-05, + "hilo::NeuronOpFusion": 6.000000212225132e-06, + "hilo::ReplaceTokenTypeWithU8Pass": 2.4000000848900527e-05, + "hilo::ScheduleFusion": 0.0, + "hilo::SixtyFourHack": 1.4999999621068127e-05, + "hilo::VerifyAliasing": 3.999999989900971e-06, + "hlo-mac-count": 3.7999998312443495e-05, + "hlo-verifier": 0.00017100000695791095, + "legalize-ccops": 9.999999974752427e-07, + "legalize-compare": 3.999999989900971e-06, + "lower-argminmax-custom-call": 3.000000106112566e-06, + "map-inline": 1.1000000085914508e-05, + "metadata-naming": 1.700000029813964e-05, + "mlir::detail::OpToOpPassAdaptor": 3.7000001611886546e-05, + "mlir::hlo::MhloToPyPenguin": 0.033358000218868256, + "mlir::mhlo::LowerComplexExtraPass": 0.00013499999477062374, + "mlir::mhlo::LowerComplexPass": 0.00014200000441633165, + "native-to-custom-softmax": 7.999999979801942e-06, + "native-to-custom-softmax-dx": 9.999999747378752e-05, + "operand_upcaster": 1.8999999156221747e-05, + "post-par-pipe-begin": 1.9999999949504854e-06, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.0005610000225715339, + "replace-minimum-constant": 9.000000318337698e-06, + "reshape-mover": 3.999999989900971e-06, + "simplify-concat": 5.199999941396527e-05, + "simplify-while-loops": 1.9999999949504854e-06, + "transform-variadic-reduce": 9.000000318337698e-06, + "tuple-simplifier": 3.999999989900971e-06, + "unpack-nested-aws-ntwsr": 3.000000106112566e-06, + "unroll-while-loop": 0.0 + }, + "hilo": { + "ArithmeticIntensity": 221.8579559326172, + "ConstantSize": 2106325.0, + "HloInputCount": 359.0, + "HloMacCount": 25769803776.0, + "HloOutputCount": 65.0, + "IfmapSize": 7785168896.0, + "OfmapSize": 536870912.0, + "OutputsReadFromCount": 0.0, + "PassthroughTensorsCount": 0.0, + "RedundantOutputCount": 0.0, + "Traffic": 232309024.0 + } + }, + "sg0000": { + "compiletime": { + "AGOrderingAnalysisPass": 0.07846212387084961, + "AffinePredicateResolution": 0.0015842914581298828, + "AliasDependencyElimination": 0.0002803802490234375, + "AliasDependencyInduction": 0.03549337387084961, + "AliasDependencyReset": 0.06158638000488281, + "BFComputeCutting": 0.003358125686645508, + "BirCodeGenLoop": 0.06645083427429199, + "CCOpFusion": 0.03297877311706543, + "CanonicalizeDAGForPGTiling": 0.0029740333557128906, + "CanonicalizeIR": 0.0038878917694091797, + "CoalesceCCOp": 0.0058116912841796875, + "CommuteConcat": 0.0010180473327636719, + "DMALocalityOpt": 0.0015497207641601563, + "DMAProfiler": 0.005065441131591797, + "DMATilingProfiler": 0.004613637924194336, + "DataLocalityOpt": 0.16799569129943848, + "DataStreaming": 0.00627899169921875, + "DeConcat": 0.0015079975128173828, + "DeadCodeElimination": 0.0011029243469238281, + "DeadStoreElimination": 0.06819939613342285, + "DelinearIndices": 0.0475771427154541, + "Delinearization": 0.003088235855102539, + "DoNothing": 0.0001838207244873047, + "DramToDramTranspose": 0.08775472640991211, + "DumpGraphAndMetadata": 0.013874053955078125, + "EliminateDivs": 0.006442070007324219, + "ExpandBatchNorm": 0.00305938720703125, + "ExpandISAMacro": 0.00470423698425293, + "FactorizeBlkDims": 0.026311397552490234, + "FactorizeThreadAxesInFreeDims": 0.0019838809967041016, + "FlattenMacroLoop": 0.004168987274169922, + "GenericAccessSimplifier": 0.0016493797302246094, + "InferInitValue": 0.05328845977783203, + "InferIntrinsicOnCC": 0.009886503219604492, + "InferNeuronTensor": 0.08689069747924805, + "InferNonlocalTensors": 0.2075808048248291, + "InferPSumTensor": 0.12219834327697754, + "InlineNativeKernels": 0.002942323684692383, + "InsertIOTransposes": 0.019949674606323242, + "InsertLocalTransposes": 0.0066678524017333984, + "InsertOffloadedTransposes": 0.005246877670288086, + "LICM": 0.002876758575439453, + "LateLegalizeInst": 0.009313821792602539, + "LateLegalizePostSplit": 0.0034275054931640625, + "LateLowerReshapeOp": 0.001237630844116211, + "LateLowerTensorOp": 0.036368370056152344, + "LateNeuronInstComb": 0.019298315048217773, + "LayoutPreprocessing": 0.0656280517578125, + "LayoutPreprocessingAndAnalysis": 0.0845177173614502, + "LayoutRequirementAnalysis": 0.006539821624755859, + "LegalizeCCOpLayout": 0.002690553665161133, + "LegalizeOpLevelAlias": 0.002089977264404297, + "LegalizePartitionReduce": 0.0019116401672363281, + "LegalizeSundaAccess": 0.04238390922546387, + "LegalizeSundaMacro": 0.008917093276977539, + "LegalizeType": 0.00662541389465332, + "LocalLayoutOpt": 0.017171859741210938, + "LoopFusion": 0.04693031311035156, + "LoopSplitting": 0.0004513263702392578, + "LowerBroadcast": 0.0021796226501464844, + "LowerCCOpBlockAxis": 0.005298614501953125, + "LowerComplexBroadcast": 0.002663135528564453, + "LowerIntrinsics": 0.08481836318969727, + "LowerTensorOp": 0.05078911781311035, + "LowerTranspose": 0.052706241607666016, + "MacroGeneration": 0.16595196723937988, + "MaskPropagation": 0.00496983528137207, + "MemcpyElimination": 0.27239394187927246, + "MutateDataType": 0.0022711753845214844, + "NeuronAliasDependencyInduction": 0.00037479400634765625, + "NeuronAliasDependencyReset": 0.012241363525390625, + "NeuronInstComb": 0.010676145553588867, + "NeuronLICM": 0.01803445816040039, + "NeuronLoopFusion": 0.01843857765197754, + "NeuronLoopInterchange": 0.0022115707397460938, + "NeuronSimplifier": 0.011580228805541992, + "NeuronSimplifyPredicates": 0.017709970474243164, + "NeuronValueNumbering": 0.045330047607421875, + "OptimizeAliasedCopyChain": 0.0012116432189941406, + "OptimizeNKIKernels": 0.04246807098388672, + "PAGLayoutOpt": 0.38617491722106934, + "PComputeCutting": 0.008383512496948242, + "PGLayoutTilingPipeline": 1.3029937744140625, + "PGTiling": 0.34752726554870605, + "PadElimination": 0.0006172657012939453, + "ParAxesAnnotation": 0.36298155784606934, + "PartialLoopFusion": 0.024132490158081055, + "PartialSimdFusion": 0.026205062866210938, + "PerfectLoopNest": 0.0019898414611816406, + "RecognizeOpIdiom": 0.006145477294921875, + "Recompute": 0.00034356117248535156, + "RelaxPredicates": 0.0044634342193603516, + "Rematerialization": 0.004605531692504883, + "ReshapeWeights": 0.0008733272552490234, + "ResolveAccessConflict": 0.003629446029663086, + "ResolveComplicatePredicates": 0.0018143653869628906, + "RewriteReplicationMatmul": 0.001529693603515625, + "RewriteWeights": 0.0036728382110595703, + "SFKVectorizer": 0.5580539703369141, + "SimpleAllReduceTiling": 0.0026845932006835938, + "Simplifier": 0.0046727657318115234, + "SimplifyMacroPredicates": 0.01622939109802246, + "SimplifyNeuronTensor": 0.015488386154174805, + "SimplifySlice": 0.0018961429595947266, + "SimplifyTensor": 0.006178140640258789, + "SpillPSum": 0.06065011024475098, + "SplitAPUnionSets": 0.075592041015625, + "SplitAccGrp": 0.0017442703247070313, + "StaticProfiler": 0.00494384765625, + "StaticTransposeLocalTensor": 0.0146331787109375, + "SundaISel": 0.056458473205566406, + "TCTransform": 0.001115560531616211, + "TensorInitialization": 0.021691322326660156, + "TensorOpSimplifier": 0.010814189910888672, + "TensorOpTransform": 0.07015466690063477, + "TileCCOps": 0.007310152053833008, + "TilingProfiler": 0.012901067733764648, + "TransformConvOp": 0.00453495979309082, + "TritiumFusion": 0.10158801078796387, + "ValueNumbering": 0.0044324398040771484, + "VectorizeDMA": 0.006791114807128906, + "VectorizeMatMult": 0.01838517189025879, + "WeightCoalescing": 0.004769086837768555, + "ZeroSizeTensorElimination": 0.00017833709716796875 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 5791.0, + "StaticProfiler::AifUb": 261.38446044921875, + "StaticProfiler::ArithmeticIntensityTensorizer": 680.3948364257813, + "StaticProfiler::AverageDmaLength": 2076.933837890625, + "StaticProfiler::AverageFractalPeUtilization": 99.95938110351563, + "StaticProfiler::AveragePartitionUtilization": 99.89742279052734, + "StaticProfiler::AveragePeUtilization": 99.83380126953125, + "StaticProfiler::DDRTransferBytes": 87646472.0, + "StaticProfiler::InternalTransferBytes": 114032640.0, + "StaticProfiler::LoadExpanded": 20995.0, + "StaticProfiler::LocalizationEfficiency": 260.3042297363281, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 391.4252624511719, + "StaticProfiler::StoreExpanded": 10753.0, + "StaticProfiler::TotalDMAExpanded": 31748.0, + "StaticProfiler::TotalDynamicInstancesCount": 8459.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 8453.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 96.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 3080.0, + "TilingProfiler::NumPfTransposes": 7.0, + "TilingProfiler::NumPfTransposesForIo": 1.0, + "TilingProfiler::NumPfTransposesForLocal": 5.0, + "TilingProfiler::NumPfTransposesForNonlocal": 1.0, + "TilingProfiler::PfTransposeInstructions": 1632.0, + "TilingProfiler::PfTransposeInstructionsForIo": 256.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 1248.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 128.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 0.0, + "TilingProfiler::SimdInstructionsAfterTiling": 612.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "sg0001": { + "compiletime": { + "AGOrderingAnalysisPass": 0.19782710075378418, + "AffinePredicateResolution": 0.0014352798461914063, + "AliasDependencyElimination": 0.00022602081298828125, + "AliasDependencyInduction": 0.008897542953491211, + "AliasDependencyReset": 0.07564544677734375, + "BFComputeCutting": 0.0038797855377197266, + "BirCodeGenLoop": 0.08420419692993164, + "CCOpFusion": 0.0411074161529541, + "CanonicalizeDAGForPGTiling": 0.004708290100097656, + "CanonicalizeIR": 0.001739501953125, + "CoalesceCCOp": 0.005135774612426758, + "CommuteConcat": 0.0010938644409179688, + "DMALocalityOpt": 0.0010821819305419922, + "DMAProfiler": 0.03509354591369629, + "DMATilingProfiler": 0.008334875106811523, + "DataLocalityOpt": 0.3732140064239502, + "DataStreaming": 0.004484653472900391, + "DeConcat": 0.0014607906341552734, + "DeadCodeElimination": 0.002012491226196289, + "DeadStoreElimination": 0.06306838989257813, + "DelinearIndices": 0.03899812698364258, + "Delinearization": 0.015190839767456055, + "DoNothing": 0.00013589859008789063, + "DramToDramTranspose": 0.05379915237426758, + "DumpGraphAndMetadata": 0.053969621658325195, + "EliminateDivs": 0.005895376205444336, + "ExpandBatchNorm": 0.0030879974365234375, + "ExpandISAMacro": 0.002570629119873047, + "FactorizeBlkDims": 0.03216910362243652, + "FactorizeThreadAxesInFreeDims": 0.0017580986022949219, + "FlattenMacroLoop": 0.004896402359008789, + "GenericAccessSimplifier": 0.001070261001586914, + "InferInitValue": 0.09278488159179688, + "InferIntrinsicOnCC": 0.010787725448608398, + "InferNeuronTensor": 0.16329479217529297, + "InferNonlocalTensors": 0.08827400207519531, + "InferPSumTensor": 0.041254281997680664, + "InlineNativeKernels": 0.002732515335083008, + "InsertIOTransposes": 0.030591964721679688, + "InsertLocalTransposes": 0.0069196224212646484, + "InsertOffloadedTransposes": 0.0034880638122558594, + "LICM": 0.0034477710723876953, + "LateLegalizeInst": 0.005655765533447266, + "LateLegalizePostSplit": 0.003046751022338867, + "LateLowerReshapeOp": 0.0013928413391113281, + "LateLowerTensorOp": 0.0053386688232421875, + "LateNeuronInstComb": 0.027225971221923828, + "LayoutPreprocessing": 0.047040700912475586, + "LayoutPreprocessingAndAnalysis": 0.12968659400939941, + "LayoutRequirementAnalysis": 0.01332712173461914, + "LegalizeCCOpLayout": 0.0019299983978271484, + "LegalizeOpLevelAlias": 0.0019905567169189453, + "LegalizePartitionReduce": 0.0013320446014404297, + "LegalizeSundaAccess": 0.0154571533203125, + "LegalizeSundaMacro": 0.018419265747070313, + "LegalizeType": 0.0047800540924072266, + "LocalLayoutOpt": 0.029850482940673828, + "LoopFusion": 0.006402492523193359, + "LoopSplitting": 0.0006403923034667969, + "LowerBroadcast": 0.0029153823852539063, + "LowerCCOpBlockAxis": 0.005182743072509766, + "LowerComplexBroadcast": 0.0022389888763427734, + "LowerIntrinsics": 0.056134939193725586, + "LowerTensorOp": 0.01170802116394043, + "LowerTranspose": 0.0226747989654541, + "MacroGeneration": 0.12812113761901855, + "MaskPropagation": 0.003968477249145508, + "MemcpyElimination": 0.1272127628326416, + "MutateDataType": 0.0016314983367919922, + "NeuronAliasDependencyInduction": 0.0003142356872558594, + "NeuronAliasDependencyReset": 0.011624336242675781, + "NeuronInstComb": 0.00946044921875, + "NeuronLICM": 0.008498668670654297, + "NeuronLoopFusion": 0.01998734474182129, + "NeuronLoopInterchange": 0.0018498897552490234, + "NeuronSimplifier": 0.03274989128112793, + "NeuronSimplifyPredicates": 0.001984834671020508, + "NeuronValueNumbering": 0.03443026542663574, + "OptimizeAliasedCopyChain": 0.0008573532104492188, + "OptimizeNKIKernels": 0.0016489028930664063, + "PAGLayoutOpt": 0.52590012550354, + "PComputeCutting": 0.007617473602294922, + "PGLayoutTilingPipeline": 1.6884160041809082, + "PGTiling": 0.42557621002197266, + "PadElimination": 0.0004146099090576172, + "ParAxesAnnotation": 0.49584078788757324, + "PartialLoopFusion": 0.04620671272277832, + "PartialSimdFusion": 0.04396200180053711, + "PerfectLoopNest": 0.002160310745239258, + "RecognizeOpIdiom": 0.004221677780151367, + "Recompute": 0.0006210803985595703, + "RelaxPredicates": 0.0031533241271972656, + "Rematerialization": 0.0020017623901367188, + "ReshapeWeights": 0.0012595653533935547, + "ResolveAccessConflict": 0.034206390380859375, + "ResolveComplicatePredicates": 0.001447916030883789, + "RewriteReplicationMatmul": 0.003072500228881836, + "RewriteWeights": 0.005293369293212891, + "SFKVectorizer": 0.31648850440979004, + "SimpleAllReduceTiling": 0.0026230812072753906, + "Simplifier": 0.00507354736328125, + "SimplifyMacroPredicates": 0.011813640594482422, + "SimplifyNeuronTensor": 0.029469728469848633, + "SimplifySlice": 0.0010852813720703125, + "SimplifyTensor": 0.006476879119873047, + "SpillPSum": 0.047782182693481445, + "SplitAPUnionSets": 0.022653579711914063, + "SplitAccGrp": 0.0025262832641601563, + "StaticProfiler": 0.03480696678161621, + "StaticTransposeLocalTensor": 0.006014108657836914, + "SundaISel": 0.05354189872741699, + "TCTransform": 0.0011737346649169922, + "TensorInitialization": 0.004692554473876953, + "TensorOpSimplifier": 0.007290840148925781, + "TensorOpTransform": 0.039176225662231445, + "TileCCOps": 0.009789466857910156, + "TilingProfiler": 0.02116703987121582, + "TransformConvOp": 0.002421855926513672, + "TritiumFusion": 0.24414300918579102, + "ValueNumbering": 0.002656698226928711, + "VectorizeDMA": 0.0018146038055419922, + "VectorizeMatMult": 0.034119606018066406, + "WeightCoalescing": 0.002785921096801758, + "ZeroSizeTensorElimination": 0.00019216537475585938 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 17420.0, + "StaticProfiler::AifUb": 844.2889404296875, + "StaticProfiler::ArithmeticIntensityTensorizer": 712.69189453125, + "StaticProfiler::AverageDmaLength": 1079.260986328125, + "StaticProfiler::AverageFractalPeUtilization": 100.0, + "StaticProfiler::AveragePartitionUtilization": 99.85012817382813, + "StaticProfiler::AveragePeUtilization": 100.0, + "StaticProfiler::DDRTransferBytes": 339836928.0, + "StaticProfiler::InternalTransferBytes": 106692608.0, + "StaticProfiler::LoadExpanded": 296193.0, + "StaticProfiler::LocalizationEfficiency": 84.41326904296875, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 93.66107177734375, + "StaticProfiler::StoreExpanded": 10241.0, + "StaticProfiler::TotalDMAExpanded": 306434.0, + "StaticProfiler::TotalDynamicInstancesCount": 21356.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 21356.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 64.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 13824.0, + "TilingProfiler::NumPfTransposes": 9.0, + "TilingProfiler::NumPfTransposesForIo": 3.0, + "TilingProfiler::NumPfTransposesForLocal": 4.0, + "TilingProfiler::NumPfTransposesForNonlocal": 2.0, + "TilingProfiler::PfTransposeInstructions": 1904.0, + "TilingProfiler::PfTransposeInstructionsForIo": 272.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 1120.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 512.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 0.0, + "TilingProfiler::SimdInstructionsAfterTiling": 683.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "sg0002": { + "compiletime": { + "AGOrderingAnalysisPass": 0.01833963394165039, + "AffinePredicateResolution": 0.0011298656463623047, + "AliasDependencyElimination": 0.0003044605255126953, + "AliasDependencyInduction": 0.0059871673583984375, + "AliasDependencyReset": 0.024695634841918945, + "BFComputeCutting": 0.0022745132446289063, + "BirCodeGenLoop": 0.12499594688415527, + "CCOpFusion": 0.025257110595703125, + "CanonicalizeDAGForPGTiling": 0.0039975643157958984, + "CanonicalizeIR": 0.0017023086547851563, + "CoalesceCCOp": 0.006569385528564453, + "CommuteConcat": 0.0009205341339111328, + "DMALocalityOpt": 0.0031824111938476563, + "DMAProfiler": 0.005998849868774414, + "DMATilingProfiler": 0.004607439041137695, + "DataLocalityOpt": 0.15957880020141602, + "DataStreaming": 0.03691840171813965, + "DeConcat": 0.0007259845733642578, + "DeadCodeElimination": 0.0009546279907226563, + "DeadStoreElimination": 0.006250619888305664, + "DelinearIndices": 0.005332231521606445, + "Delinearization": 0.0033500194549560547, + "DoNothing": 0.0001239776611328125, + "DramToDramTranspose": 0.020763397216796875, + "DumpGraphAndMetadata": 0.025223493576049805, + "EliminateDivs": 0.0023469924926757813, + "ExpandBatchNorm": 0.001692056655883789, + "ExpandISAMacro": 0.008460283279418945, + "FactorizeBlkDims": 0.00921773910522461, + "FactorizeThreadAxesInFreeDims": 0.002184152603149414, + "FlattenMacroLoop": 0.0022482872009277344, + "GenericAccessSimplifier": 0.0009622573852539063, + "InferInitValue": 0.027300357818603516, + "InferIntrinsicOnCC": 0.009199380874633789, + "InferNeuronTensor": 0.028067350387573242, + "InferNonlocalTensors": 0.014671802520751953, + "InferPSumTensor": 0.0808565616607666, + "InlineNativeKernels": 0.002727031707763672, + "InsertIOTransposes": 0.017727136611938477, + "InsertLocalTransposes": 0.004176616668701172, + "InsertOffloadedTransposes": 0.002771615982055664, + "LICM": 0.005248069763183594, + "LateLegalizeInst": 0.0068132877349853516, + "LateLegalizePostSplit": 0.0045223236083984375, + "LateLowerReshapeOp": 0.0012927055358886719, + "LateLowerTensorOp": 0.0014028549194335938, + "LateNeuronInstComb": 0.016278505325317383, + "LayoutPreprocessing": 0.026221275329589844, + "LayoutPreprocessingAndAnalysis": 0.07468867301940918, + "LayoutRequirementAnalysis": 0.004823446273803711, + "LegalizeCCOpLayout": 0.0023353099822998047, + "LegalizeOpLevelAlias": 0.0013494491577148438, + "LegalizePartitionReduce": 0.0018906593322753906, + "LegalizeSundaAccess": 0.06063246726989746, + "LegalizeSundaMacro": 0.04256129264831543, + "LegalizeType": 0.005706787109375, + "LocalLayoutOpt": 0.016018390655517578, + "LoopFusion": 0.005109071731567383, + "LoopSplitting": 0.00048542022705078125, + "LowerBroadcast": 0.0029256343841552734, + "LowerCCOpBlockAxis": 0.0038700103759765625, + "LowerComplexBroadcast": 0.004511594772338867, + "LowerIntrinsics": 0.3245360851287842, + "LowerTensorOp": 0.010710477828979492, + "LowerTranspose": 0.05445718765258789, + "MacroGeneration": 0.061620473861694336, + "MaskPropagation": 0.002919435501098633, + "MemcpyElimination": 0.02559375762939453, + "MutateDataType": 0.0014896392822265625, + "NeuronAliasDependencyInduction": 0.0002808570861816406, + "NeuronAliasDependencyReset": 0.05649685859680176, + "NeuronInstComb": 0.004225254058837891, + "NeuronLICM": 0.014126300811767578, + "NeuronLoopFusion": 0.009732246398925781, + "NeuronLoopInterchange": 0.0025072097778320313, + "NeuronSimplifier": 0.03835606575012207, + "NeuronSimplifyPredicates": 0.005949974060058594, + "NeuronValueNumbering": 0.002723217010498047, + "OptimizeAliasedCopyChain": 0.0007545948028564453, + "OptimizeNKIKernels": 0.6443507671356201, + "PAGLayoutOpt": 0.20021605491638184, + "PComputeCutting": 0.0046160221099853516, + "PGLayoutTilingPipeline": 0.6925618648529053, + "PGTiling": 0.21065187454223633, + "PadElimination": 0.00038623809814453125, + "ParAxesAnnotation": 0.052834510803222656, + "PartialLoopFusion": 0.051622629165649414, + "PartialSimdFusion": 0.014065980911254883, + "PerfectLoopNest": 0.0019462108612060547, + "RecognizeOpIdiom": 0.0037450790405273438, + "Recompute": 0.0004031658172607422, + "RelaxPredicates": 0.03561973571777344, + "Rematerialization": 0.0018870830535888672, + "ReshapeWeights": 0.0009450912475585938, + "ResolveAccessConflict": 0.0038840770721435547, + "ResolveComplicatePredicates": 0.0011222362518310547, + "RewriteReplicationMatmul": 0.0017135143280029297, + "RewriteWeights": 0.0024623870849609375, + "SFKVectorizer": 0.19122314453125, + "SimpleAllReduceTiling": 0.0035347938537597656, + "Simplifier": 0.0030031204223632813, + "SimplifyMacroPredicates": 0.005193233489990234, + "SimplifyNeuronTensor": 0.38506579399108887, + "SimplifySlice": 0.001062631607055664, + "SimplifyTensor": 0.009534358978271484, + "SpillPSum": 0.05334925651550293, + "SplitAPUnionSets": 0.013537406921386719, + "SplitAccGrp": 0.0014171600341796875, + "StaticProfiler": 0.005720615386962891, + "StaticTransposeLocalTensor": 0.003614664077758789, + "SundaISel": 0.09031486511230469, + "TCTransform": 0.0008947849273681641, + "TensorInitialization": 0.010958433151245117, + "TensorOpSimplifier": 0.005278110504150391, + "TensorOpTransform": 0.020787477493286133, + "TileCCOps": 0.005544900894165039, + "TilingProfiler": 0.007747173309326172, + "TransformConvOp": 0.003238677978515625, + "TritiumFusion": 0.16130614280700684, + "ValueNumbering": 0.0018999576568603516, + "VectorizeDMA": 0.0017979145050048828, + "VectorizeMatMult": 0.007079362869262695, + "WeightCoalescing": 0.0030584335327148438, + "ZeroSizeTensorElimination": 0.00022983551025390625 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 17056.0, + "StaticProfiler::AifUb": 568.2581176757813, + "StaticProfiler::ArithmeticIntensityTensorizer": 456.7580261230469, + "StaticProfiler::AverageDmaLength": 1314.3221435546875, + "StaticProfiler::AverageFractalPeUtilization": 99.79875946044922, + "StaticProfiler::AveragePartitionUtilization": 99.50694274902344, + "StaticProfiler::AveragePeUtilization": 99.19517517089844, + "StaticProfiler::DDRTransferBytes": 407087136.0, + "StaticProfiler::InternalTransferBytes": 48342036.0, + "StaticProfiler::LoadExpanded": 310291.0, + "StaticProfiler::LocalizationEfficiency": 80.37861633300781, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 88.63314819335938, + "StaticProfiler::StoreExpanded": 6699.0, + "StaticProfiler::TotalDMAExpanded": 316990.0, + "StaticProfiler::TotalDynamicInstancesCount": 19674.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 19578.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 4.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 14848.0, + "TilingProfiler::NumPfTransposes": 4.0, + "TilingProfiler::NumPfTransposesForIo": 0.0, + "TilingProfiler::NumPfTransposesForLocal": 1.0, + "TilingProfiler::NumPfTransposesForNonlocal": 3.0, + "TilingProfiler::PfTransposeInstructions": 769.0, + "TilingProfiler::PfTransposeInstructionsForIo": 0.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 1.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 768.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 6.0, + "TilingProfiler::SimdInstructionsAfterTiling": 319.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "sg01": { + "compiletime": { + "CanonicalizeConv": 0.0, + "CanonicalizeForTensorizer": 1.9999999494757503e-05, + "Canonicalizer": 0.0003800000122282654, + "HoistCompute": 3.000000106112566e-06, + "IdentifyCrossPassTensors": 1.9999999494757503e-05, + "MemcastMotion": 1.2000000424450263e-05, + "PenguinizeFunctions": 1.9999999494757503e-05, + "PruneFunctions": 1.700000029813964e-05, + "RemoveOptimizationBarriers": 2.499999936844688e-05, + "ScatterMotion": 7.000000096013537e-06, + "TensorizerLegalizationPass": 2.2000000171829015e-05, + "VerifySupportedOps": 1.4999999621068127e-05, + "algsimp": 0.00012199999764561653, + "batchnorm_expander": 2.2000000171829015e-05, + "boundary-marker-removal": 7.000000096013537e-06, + "call-inliner": 1.700000029813964e-05, + "canonicalize-boundary-marker": 7.999999979801942e-06, + "collective-stream-id-checker": 9.999999747378752e-06, + "comparison-expander": 9.000000318337698e-06, + "computation-deduplicator": 3.600000127335079e-05, + "conditional-to-select": 9.000000318337698e-06, + "config-lowering": 7.699999696342275e-05, + "constant_folding": 1.2999999853491317e-05, + "cse": 2.5999999706982635e-05, + "dce": 1.9999999949504854e-06, + "dynamic-slice-transpose": 7.999999979801942e-06, + "eliminate-redundant-compare": 7.000000096013537e-06, + "emit-offloaded-dropout": 2.9000000722589903e-05, + "flatten-call-graph": 1.700000029813964e-05, + "fuse-send-recv": 3.7999998312443495e-05, + "hilo::LegalizeAlias": 7.000000096013537e-06, + "hilo::NeuronInstCombine": 4.8000001697801054e-05, + "hilo::NeuronOpFusion": 2.300000051036477e-05, + "hilo::ReplaceTokenTypeWithU8Pass": 2.499999936844688e-05, + "hilo::ScheduleFusion": 0.0, + "hilo::SixtyFourHack": 1.8000000636675395e-05, + "hilo::VerifyAliasing": 3.999999989900971e-06, + "hlo-mac-count": 5.900000178371556e-05, + "hlo-verifier": 0.00028700000257231295, + "legalize-ccops": 1.9999999949504854e-06, + "legalize-compare": 7.000000096013537e-06, + "lower-argminmax-custom-call": 7.000000096013537e-06, + "map-inline": 1.8000000636675395e-05, + "metadata-naming": 3.400000059627928e-05, + "mlir::detail::OpToOpPassAdaptor": 4.099999932805076e-05, + "mlir::hlo::MhloToPyPenguin": 0.025769000872969627, + "mlir::mhlo::LowerComplexExtraPass": 0.00010399999882793054, + "mlir::mhlo::LowerComplexPass": 0.00024399999529123306, + "native-to-custom-softmax": 1.700000029813964e-05, + "native-to-custom-softmax-dx": 4.70000013592653e-05, + "operand_upcaster": 2.9999999242136255e-05, + "post-par-pipe-begin": 9.000000318337698e-06, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.0009069999796338379, + "replace-minimum-constant": 1.4000000192027073e-05, + "reshape-mover": 6.000000212225132e-06, + "simplify-concat": 7.300000288523734e-05, + "simplify-while-loops": 4.999999873689376e-06, + "transform-variadic-reduce": 1.2999999853491317e-05, + "tuple-simplifier": 9.000000318337698e-06, + "unpack-nested-aws-ntwsr": 4.999999873689376e-06, + "unroll-while-loop": 9.999999974752427e-07 + }, + "hilo": { + "ArithmeticIntensity": 808.5779418945313, + "HloMacCount": 115964116992.0, + "Traffic": 286834720.0 + } + }, + "sg02": { + "compiletime": { + "CanonicalizeConv": 0.0, + "CanonicalizeForTensorizer": 1.4000000192027073e-05, + "Canonicalizer": 0.0005150000215508044, + "HoistCompute": 1.9999999949504854e-06, + "IdentifyCrossPassTensors": 1.8000000636675395e-05, + "MemcastMotion": 1.4000000192027073e-05, + "PenguinizeFunctions": 9.999999747378752e-06, + "PruneFunctions": 9.999999747378752e-06, + "RemoveOptimizationBarriers": 1.5999999959603883e-05, + "ScatterMotion": 9.999999974752427e-07, + "TensorizerLegalizationPass": 7.000000096013537e-06, + "VerifySupportedOps": 1.8000000636675395e-05, + "algsimp": 7.599999662488699e-05, + "batchnorm_expander": 1.4999999621068127e-05, + "boundary-marker-removal": 3.999999989900971e-06, + "call-inliner": 1.2000000424450263e-05, + "canonicalize-boundary-marker": 4.999999873689376e-06, + "collective-stream-id-checker": 6.000000212225132e-06, + "comparison-expander": 6.000000212225132e-06, + "computation-deduplicator": 2.499999936844688e-05, + "conditional-to-select": 7.000000096013537e-06, + "config-lowering": 6.299999949987978e-05, + "constant_folding": 1.1000000085914508e-05, + "cse": 1.4000000192027073e-05, + "dce": 9.999999974752427e-07, + "dynamic-slice-transpose": 3.999999989900971e-06, + "eliminate-redundant-compare": 3.000000106112566e-06, + "emit-offloaded-dropout": 2.099999983329326e-05, + "flatten-call-graph": 1.2000000424450263e-05, + "fuse-send-recv": 2.4000000848900527e-05, + "hilo::LegalizeAlias": 1.9999999949504854e-06, + "hilo::NeuronInstCombine": 4.999999873689376e-05, + "hilo::NeuronOpFusion": 2.099999983329326e-05, + "hilo::ReplaceTokenTypeWithU8Pass": 1.8000000636675395e-05, + "hilo::ScheduleFusion": 3.999999989900971e-06, + "hilo::SixtyFourHack": 4.3000000005122274e-05, + "hilo::VerifyAliasing": 9.999999974752427e-07, + "hlo-mac-count": 0.0002410000015515834, + "hlo-verifier": 0.00020399999630171806, + "legalize-ccops": 9.999999974752427e-07, + "legalize-compare": 3.999999989900971e-06, + "lower-argminmax-custom-call": 3.000000106112566e-06, + "map-inline": 1.4000000192027073e-05, + "metadata-naming": 1.700000029813964e-05, + "mlir::detail::OpToOpPassAdaptor": 3.5000000934815034e-05, + "mlir::hlo::MhloToPyPenguin": 0.01627100072801113, + "mlir::mhlo::LowerComplexExtraPass": 0.00011700000322889537, + "mlir::mhlo::LowerComplexPass": 0.00016500000492669642, + "native-to-custom-softmax": 1.1000000085914508e-05, + "native-to-custom-softmax-dx": 4.5000000682193786e-05, + "operand_upcaster": 1.8999999156221747e-05, + "post-par-pipe-begin": 3.000000106112566e-06, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.0006179999909363687, + "replace-minimum-constant": 9.999999747378752e-06, + "reshape-mover": 3.000000106112566e-06, + "simplify-concat": 4.999999873689376e-05, + "simplify-while-loops": 3.000000106112566e-06, + "transform-variadic-reduce": 6.0999998822808266e-05, + "tuple-simplifier": 4.999999873689376e-06, + "unpack-nested-aws-ntwsr": 3.999999989900971e-06, + "unroll-while-loop": 0.0 + }, + "hilo": { + "ArithmeticIntensity": 537.9506225585938, + "HloMacCount": 90261422080.0, + "Traffic": 335575104.0 + } + } +} \ No newline at end of file diff --git a/context_encoding_model/_tp0_bk3/graph.neff b/context_encoding_model/_tp0_bk3/graph.neff new file mode 100644 index 0000000000000000000000000000000000000000..ef1a5a984745b5bb2ff3a3b3207a01a52af7c78f --- /dev/null +++ b/context_encoding_model/_tp0_bk3/graph.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9330f87daaab052682ce2a183d9908828cede116976ffca894ecbd7ea31a028c +size 1731584 diff --git a/context_encoding_model/_tp0_bk3/metaneff.pb b/context_encoding_model/_tp0_bk3/metaneff.pb new file mode 100644 index 0000000000000000000000000000000000000000..b00d28114ad7fa426c9ee7f6734337760aa3692d --- /dev/null +++ b/context_encoding_model/_tp0_bk3/metaneff.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:592de62bcecb744077fb8bd9e5363e57ea2543bf82083638f8ad2039a512933c +size 1561029 diff --git a/context_encoding_model/_tp0_bk3/model.MODULE_2e1f11fbf72d40b46e64+5ae2bfda.hlo_module.pb b/context_encoding_model/_tp0_bk3/model.MODULE_2e1f11fbf72d40b46e64+5ae2bfda.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..d80720b21799bc9a70a7b8305a9e1523b155a417 --- /dev/null +++ b/context_encoding_model/_tp0_bk3/model.MODULE_2e1f11fbf72d40b46e64+5ae2bfda.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba393ab52af446df672f41632e2c112cb7e051a45d99da03d33fb1f12262cca6 +size 1626903 diff --git a/context_encoding_model/_tp0_bk3/model.MODULE_2e1f11fbf72d40b46e64+5ae2bfda.neff b/context_encoding_model/_tp0_bk3/model.MODULE_2e1f11fbf72d40b46e64+5ae2bfda.neff new file mode 100644 index 0000000000000000000000000000000000000000..ef1a5a984745b5bb2ff3a3b3207a01a52af7c78f --- /dev/null +++ b/context_encoding_model/_tp0_bk3/model.MODULE_2e1f11fbf72d40b46e64+5ae2bfda.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9330f87daaab052682ce2a183d9908828cede116976ffca894ecbd7ea31a028c +size 1731584 diff --git a/context_encoding_model/_tp0_bk3/neuron_config.json b/context_encoding_model/_tp0_bk3/neuron_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b89b797b515a5377cedf6ec09f32672a4c27666c --- /dev/null +++ b/context_encoding_model/_tp0_bk3/neuron_config.json @@ -0,0 +1,213 @@ +{ + "_attn_implementation_autoset": false, + "_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3", + "add_cross_attention": false, + "architectures": [ + "MistralForCausalLM" + ], + "attention_dropout": 0.0, + "attribute_map": {}, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 1, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 2, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "fused_spec_config": null, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 14336, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 32768, + "metadata": null, + "min_length": 0, + "model_type": "mistral", + "neuron_config": { + "activation_quantization_type": null, + "allow_input_truncation": false, + "apply_seq_ids_mask": false, + "async_mode": false, + "attention_dp_degree": 1, + "attention_dtype": null, + "attn_block_cte_nki_kernel_enabled": false, + "attn_block_tkg_nki_kernel_cache_update": false, + "attn_block_tkg_nki_kernel_enabled": false, + "attn_cls": "NeuronLlamaAttention", + "attn_kernel_enabled": null, + "attn_tkg_builtin_kernel_enabled": false, + "attn_tkg_nki_kernel_enabled": false, + "batch_size": 1, + "bucket_n_active_tokens": true, + "buckets": [ + 1024 + ], + "cast_type": "config", + "cc_pipeline_tiling_factor": 2, + "chunked_prefill_config": null, + "context_encoding_buckets": [ + 1024 + ], + "cp_degree": 1, + "ctx_batch_size": 1, + "disable_kv_cache_tiling": false, + "draft_model_modules_to_not_convert": null, + "enable_bucketing": true, + "enable_eagle_draft_input_norm": false, + "enable_eagle_speculation": false, + "enable_fused_speculation": false, + "enable_long_context_mode": false, + "enable_output_completion_notifications": false, + "enable_spill_reload_dge": false, + "enable_token_tree": false, + "ep_degree": 1, + "expert_mlp_nki_kernel_enabled": null, + "flash_decoding_enabled": false, + "fused_qkv": false, + "fused_rmsnorm_skip_gamma": false, + "is_block_kv_layout": null, + "is_chunked_prefill": false, + "is_continuous_batching": true, + "is_eagle_draft": false, + "is_medusa": false, + "is_prefill_stage": true, + "is_prefix_caching": false, + "k_cache_transposed": false, + "kv_cache_batch_size": 4, + "kv_cache_padding_size": 0, + "kv_cache_quant": false, + "kv_cache_tiling": false, + "layer_boundary_markers": false, + "lm_head_pad": false, + "lm_head_pad_alignment_size": 1, + "local_ranks_size": 2, + "logical_nc_config": 1, + "lora_config": null, + "max_batch_size": 4, + "max_context_length": 2048, + "max_length": 2048, + "max_new_tokens": null, + "medusa_speculation_length": 0, + "medusa_tree": null, + "mlp_kernel_enabled": false, + "mlp_kernel_fuse_residual_add": false, + "modules_to_not_convert": null, + "moe_fused_nki_kernel_enabled": null, + "n_active_tokens": 2048, + "n_positions": 2048, + "num_medusa_heads": 0, + "on_cpu": false, + "on_device_sampling_config": { + "deterministic": false, + "do_sample": false, + "dynamic": true, + "global_topk": 256, + "on_device_sampling_config": true, + "temperature": 1.0, + "top_k": 1, + "top_k_kernel_enabled": false, + "top_p": 1.0 + }, + "output_logits": false, + "overrides_torch_dtype": true, + "pa_block_size": 2048, + "pa_num_blocks": 4, + "padding_side": "right", + "pp_degree": 1, + "prefix_buckets": null, + "qk_layernorm": false, + "qkv_kernel_enabled": false, + "qkv_kernel_fuse_residual_add": false, + "qkv_kernel_nbsd_layout": false, + "quantization_dtype": "int8", + "quantization_type": "per_tensor_symmetric", + "quantize_clamp_bound": Infinity, + "quantized": false, + "quantized_checkpoints_path": null, + "quantized_mlp_kernel_enabled": false, + "rmsnorm_quantize_kernel_enabled": false, + "router_topk_nki_kernel_enabled": null, + "rpl_reduce_dtype": null, + "save_sharded_checkpoint": true, + "scratchpad_page_size": null, + "seq_len": 2048, + "seq_len_threshold_for_cc_tiling": 16384, + "sequence_parallel_enabled": false, + "shared_mlp_nki_kernel_enabled": null, + "skip_sharding": false, + "skip_warmup": false, + "spec_batch_size": 4, + "speculation_length": 0, + "start_rank_id": 0, + "target": null, + "tile_cc": false, + "tkg_batch_size": 4, + "token_generation_buckets": null, + "token_tree_config": null, + "torch_dtype": "bfloat16", + "tp_degree": 2, + "vocab_parallel": false, + "weight_gather_seq_len_threshold": 32768, + "weights_to_skip_layout_optimization": [], + "world_size": 2 + }, + "no_repeat_ngram_size": 0, + "num_attention_heads": 32, + "num_beam_groups": 1, + "num_beams": 1, + "num_cores_per_group": 1, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-05, + "rope_theta": 1000000.0, + "sep_token_id": null, + "sliding_window": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": false, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torchscript": false, + "transformers_version": "4.42.0.dev0", + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 32768 +} diff --git a/context_encoding_model/_tp0_bk4/command.txt b/context_encoding_model/_tp0_bk4/command.txt new file mode 100644 index 0000000000000000000000000000000000000000..4895e56e8e3a98bc5ed18ca1baaebda301447ec9 --- /dev/null +++ b/context_encoding_model/_tp0_bk4/command.txt @@ -0,0 +1 @@ +neuronx-cc compile --framework=XLA model.MODULE_d342327da795afc2aa68+5e8b788a.hlo_module.pb --output model.MODULE_d342327da795afc2aa68+5e8b788a.neff --target=trn1 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ' --lnc=1 -O1 '--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true' --logfile=log-neuron-cc.txt --verbose=35 \ No newline at end of file diff --git a/context_encoding_model/_tp0_bk4/compile_flags.MODULE_d342327da795afc2aa68+5e8b788a.json b/context_encoding_model/_tp0_bk4/compile_flags.MODULE_d342327da795afc2aa68+5e8b788a.json new file mode 100644 index 0000000000000000000000000000000000000000..12baf68476e8d35c577570b0a530a8244e79b804 --- /dev/null +++ b/context_encoding_model/_tp0_bk4/compile_flags.MODULE_d342327da795afc2aa68+5e8b788a.json @@ -0,0 +1 @@ +["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "--lnc=1", "-O1", "--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true", "--logfile=/models/mistral-7b-v0.3-instruct-neuronx/context_encoding_model/_tp0_bk4/log-neuron-cc.txt"] \ No newline at end of file diff --git a/context_encoding_model/_tp0_bk4/global_metric_store.json b/context_encoding_model/_tp0_bk4/global_metric_store.json new file mode 100644 index 0000000000000000000000000000000000000000..468e5b3355e69b77a2088727aecc8b459edfaeec --- /dev/null +++ b/context_encoding_model/_tp0_bk4/global_metric_store.json @@ -0,0 +1,1079 @@ +{ + "Average": { + "tensorizer": { + "StaticProfiler::AverageFractalPeUtilization": 99.88423156738281, + "StaticProfiler::AveragePartitionUtilization": 99.71043395996094, + "StaticProfiler::AveragePeUtilization": 99.53581237792969, + "StaticProfiler::LocalizationEfficiency": 41.61907196044922, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 45.55835723876953, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0 + } + }, + "Count": { + "tensorizer": { + "StaticProfiler::AverageFractalPeUtilization": 1.0, + "StaticProfiler::AveragePartitionUtilization": 1.0, + "StaticProfiler::AveragePeUtilization": 1.0, + "StaticProfiler::LocalizationEfficiency": 1.0, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 1.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 1.0 + } + }, + "Sum": { + "compiletime": { + "AGOrderingAnalysisPass": 0.06499266624450684, + "AffinePredicateResolution": 0.00141143798828125, + "AliasDependencyElimination": 0.00017595291137695313, + "AliasDependencyInduction": 0.006516218185424805, + "AliasDependencyReset": 0.02024674415588379, + "BFComputeCutting": 0.0023620128631591797, + "BirCodeGenLoop": 0.13731598854064941, + "CCOpFusion": 0.02620387077331543, + "CanonicalizeConv": 6.0999998822808266e-05, + "CanonicalizeDAGForPGTiling": 0.0074574947357177734, + "CanonicalizeForTensorizer": 4.999999509891495e-05, + "CanonicalizeIR": 0.0019347667694091797, + "Canonicalizer": 0.0009759999811649323, + "CoalesceCCOp": 0.005659818649291992, + "CommuteConcat": 0.0009889602661132813, + "DMALocalityOpt": 0.0024099349975585938, + "DMAProfiler": 0.008657455444335938, + "DMATilingProfiler": 0.04570889472961426, + "DataLocalityOpt": 0.1127479076385498, + "DataStreaming": 0.007959365844726563, + "DeConcat": 0.0007421970367431641, + "DeadCodeElimination": 0.002073049545288086, + "DeadStoreElimination": 0.006093263626098633, + "DelinearIndices": 0.010124444961547852, + "Delinearization": 0.005106449127197266, + "DoNothing": 0.0003044605255126953, + "DramToDramTranspose": 0.03771638870239258, + "DumpGraphAndMetadata": 0.05296611785888672, + "EliminateDivs": 0.0021944046020507813, + "ExpandBatchNorm": 0.0015587806701660156, + "ExpandISAMacro": 0.005168437957763672, + "FactorizeBlkDims": 0.011832475662231445, + "FactorizeThreadAxesInFreeDims": 0.0014889240264892578, + "FlattenMacroLoop": 0.0025510787963867188, + "GenericAccessSimplifier": 0.0009717941284179688, + "HoistCompute": 1.3999999282532372e-05, + "IdentifyCrossPassTensors": 4.099999932805076e-05, + "InferInitValue": 0.030786514282226563, + "InferIntrinsicOnCC": 0.012189865112304688, + "InferNeuronTensor": 0.0819096565246582, + "InferNonlocalTensors": 0.025629520416259766, + "InferPSumTensor": 0.08477997779846191, + "InlineNativeKernels": 0.003083944320678711, + "InsertIOTransposes": 0.02764296531677246, + "InsertLocalTransposes": 0.0040624141693115234, + "InsertOffloadedTransposes": 0.005682229995727539, + "LICM": 0.003050565719604492, + "LateLegalizeInst": 0.02321314811706543, + "LateLegalizePostSplit": 0.004519462585449219, + "LateLowerReshapeOp": 0.0023851394653320313, + "LateLowerTensorOp": 0.0016567707061767578, + "LateNeuronInstComb": 0.011125564575195313, + "LayoutPreprocessing": 0.06753706932067871, + "LayoutPreprocessingAndAnalysis": 0.16236424446105957, + "LayoutRequirementAnalysis": 0.005420684814453125, + "LegalizeCCOpLayout": 0.0023717880249023438, + "LegalizeOpLevelAlias": 0.0012898445129394531, + "LegalizePartitionReduce": 0.0011932849884033203, + "LegalizeSundaAccess": 0.026709556579589844, + "LegalizeSundaMacro": 0.012512683868408203, + "LegalizeType": 0.04736900329589844, + "LocalLayoutOpt": 0.0263979434967041, + "LoopFusion": 0.005193948745727539, + "LoopSplitting": 0.0005512237548828125, + "LowerBroadcast": 0.04221224784851074, + "LowerCCOpBlockAxis": 0.008313655853271484, + "LowerComplexBroadcast": 0.0025756359100341797, + "LowerIntrinsics": 0.11752676963806152, + "LowerTensorOp": 0.010608196258544922, + "LowerTranspose": 0.08257818222045898, + "MacroGeneration": 0.07271862030029297, + "MaskPropagation": 0.005186557769775391, + "MemcastMotion": 2.2000000171829015e-05, + "MemcpyElimination": 0.026259899139404297, + "MutateDataType": 0.0013203620910644531, + "NeuronAliasDependencyInduction": 0.0002338886260986328, + "NeuronAliasDependencyReset": 0.029464006423950195, + "NeuronInstComb": 0.004740476608276367, + "NeuronLICM": 0.01508331298828125, + "NeuronLoopFusion": 0.00891876220703125, + "NeuronLoopInterchange": 0.0014586448669433594, + "NeuronSimplifier": 0.009086847305297852, + "NeuronSimplifyPredicates": 0.006235837936401367, + "NeuronValueNumbering": 0.0030777454376220703, + "OptimizeAliasedCopyChain": 0.0006422996520996094, + "OptimizeNKIKernels": 0.5174376964569092, + "PAGLayoutOpt": 0.12734031677246094, + "PComputeCutting": 0.005000591278076172, + "PGLayoutTilingPipeline": 0.8229436874389648, + "PGTiling": 0.26772499084472656, + "PadElimination": 0.0005135536193847656, + "ParAxesAnnotation": 0.07412934303283691, + "PartialLoopFusion": 0.013575553894042969, + "PartialSimdFusion": 0.011231422424316406, + "PenguinizeFunctions": 4.099999932805076e-05, + "PerfectLoopNest": 0.0019729137420654297, + "PruneFunctions": 2.5000001187436283e-05, + "RecognizeOpIdiom": 0.0038080215454101563, + "Recompute": 0.00034308433532714844, + "RelaxPredicates": 0.004430532455444336, + "Rematerialization": 0.002201557159423828, + "RemoveOptimizationBarriers": 7.999999797903001e-05, + "ReshapeWeights": 0.0009114742279052734, + "ResolveAccessConflict": 0.027348041534423828, + "ResolveComplicatePredicates": 0.0011477470397949219, + "RewriteReplicationMatmul": 0.0025103092193603516, + "RewriteWeights": 0.0029447078704833984, + "SFKVectorizer": 0.19977569580078125, + "ScatterMotion": 3.400000059627928e-05, + "SimpleAllReduceTiling": 0.0034945011138916016, + "Simplifier": 0.003106832504272461, + "SimplifyMacroPredicates": 0.03599357604980469, + "SimplifyNeuronTensor": 0.18126153945922852, + "SimplifySlice": 0.0016787052154541016, + "SimplifyTensor": 0.04330563545227051, + "SpillPSum": 0.06513023376464844, + "SplitAPUnionSets": 0.012967586517333984, + "SplitAccGrp": 0.0015358924865722656, + "StaticProfiler": 0.00551915168762207, + "StaticTransposeLocalTensor": 0.004834890365600586, + "SundaISel": 0.0945746898651123, + "TCTransform": 0.0009295940399169922, + "TensorInitialization": 0.006634950637817383, + "TensorOpSimplifier": 0.005204439163208008, + "TensorOpTransform": 0.02082967758178711, + "TensorizerLegalizationPass": 5.0000002374872565e-05, + "TileCCOps": 0.006725311279296875, + "TilingProfiler": 0.016322612762451172, + "TransformConvOp": 0.0029544830322265625, + "TritiumFusion": 0.09467315673828125, + "ValueNumbering": 0.0020852088928222656, + "VectorizeDMA": 0.0017535686492919922, + "VectorizeMatMult": 0.008865118026733398, + "VerifySupportedOps": 3.300000025774352e-05, + "WeightCoalescing": 0.003345489501953125, + "ZeroSizeTensorElimination": 0.00018644332885742188, + "algsimp": 0.0030140001326799393, + "batchnorm_expander": 4.400000034365803e-05, + "boundary-marker-removal": 1.1000000085914508e-05, + "call-inliner": 0.0004670000053010881, + "canonicalize-boundary-marker": 1.4999999621068127e-05, + "collective-stream-id-checker": 0.00010199999815085903, + "comparison-expander": 0.0005569999921135604, + "computation-deduplicator": 6.500000017695129e-05, + "conditional-to-select": 1.700000029813964e-05, + "config-lowering": 0.0001630000042496249, + "constant-statistics": 0.0005039999959990382, + "constant_folding": 0.0002969999914057553, + "cse": 6.0999998822808266e-05, + "dce": 8.600000001024455e-05, + "dot_decomposer": 0.001433999976143241, + "dynamic-slice-transpose": 1.2999999853491317e-05, + "eliminate-redundant-compare": 0.0002640000020619482, + "emit-offloaded-dropout": 6.500000017695129e-05, + "flatten-call-graph": 0.0007960000075399876, + "fuse-send-recv": 7.000000186963007e-05, + "hilo::LegalizeAlias": 1.4999999621068127e-05, + "hilo::NeuronInstCombine": 0.00012399999832268804, + "hilo::NeuronOpFusion": 6.399999983841553e-05, + "hilo::ReplaceTokenTypeWithU8Pass": 4.5000000682193786e-05, + "hilo::ScheduleFusion": 1.300000076298602e-05, + "hilo::SixtyFourHack": 6.800000119255856e-05, + "hilo::VerifyAliasing": 6.000000212225132e-06, + "hlo-mac-count": 0.0012410000199452043, + "hlo-verifier": 0.010365999303758144, + "instruction-histogram": 0.0010479999473318458, + "io-con-pipe-begin": 7.999999979801942e-06, + "io-con-pipe-end": 9.999999974752427e-07, + "io-layout-normalization": 0.0016609999584034085, + "io-statistics": 0.0001049999991664663, + "legalize-ccops": 4.999999873689376e-06, + "legalize-compare": 1.1000000085914508e-05, + "lower-argminmax-custom-call": 9.999999747378752e-06, + "map-inline": 0.0009129999671131372, + "metadata-naming": 5.400000372901559e-05, + "mlir::detail::OpToOpPassAdaptor": 8.199999865610152e-05, + "mlir::hlo::MhloToPyPenguin": 0.07495799660682678, + "mlir::mhlo::LowerComplexExtraPass": 0.00035899996873922646, + "mlir::mhlo::LowerComplexPass": 0.0005389999714680016, + "native-to-custom-softmax": 0.000842000066768378, + "native-to-custom-softmax-dx": 0.0008819999638944864, + "operand_upcaster": 6.800000119255856e-05, + "opt-barrier-removal": 0.0005799999926239252, + "post-par-pipe-begin": 1.700000029813964e-05, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.0018259999342262745, + "pre-par-pipe-begin": 9.999999974752427e-07, + "pre-par-pipe-end": 0.0, + "pre-partition-simplification": 0.2598330080509186, + "replace-minimum-constant": 0.0004039999912492931, + "reshape-mover": 0.00012399999832268804, + "simplify-concat": 0.0001630000042496249, + "simplify-while-loops": 0.00010000000474974513, + "transform-variadic-reduce": 0.0001939999929163605, + "tuple-simplifier": 0.0003140000335406512, + "unpack-nested-aws-ntwsr": 0.0003929999948013574, + "unroll-while-loop": 1.8000000636675395e-05, + "zero_sized_hlo_elimination": 0.0009759999811649323 + }, + "hilo": { + "ConstantSize": 4203477.0, + "HloInputCount": 359.0, + "HloMacCount": 481103446016.0, + "HloOutputCount": 65.0, + "IfmapSize": 7785177088.0, + "OfmapSize": 536870912.0, + "OutputsReadFromCount": 0.0, + "PassthroughTensorsCount": 0.0, + "RedundantOutputCount": 0.0, + "Traffic": 975382912.0 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 28921.0, + "StaticProfiler::AifUb": 1080.6693115234375, + "StaticProfiler::ArithmeticIntensityTensorizer": 449.7645263671875, + "StaticProfiler::AverageDmaLength": 1323.6162109375, + "StaticProfiler::DDRTransferBytes": 826525760.0, + "StaticProfiler::InternalTransferBytes": 96576528.0, + "StaticProfiler::LoadExpanded": 619540.0, + "StaticProfiler::StoreExpanded": 12842.0, + "StaticProfiler::TotalDMAExpanded": 632382.0, + "StaticProfiler::TotalDynamicInstancesCount": 34834.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 34738.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 4.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 25600.0, + "TilingProfiler::NumPfTransposes": 4.0, + "TilingProfiler::NumPfTransposesForIo": 0.0, + "TilingProfiler::NumPfTransposesForLocal": 1.0, + "TilingProfiler::NumPfTransposesForNonlocal": 3.0, + "TilingProfiler::PfTransposeInstructions": 1537.0, + "TilingProfiler::PfTransposeInstructionsForIo": 0.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 1.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 1536.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 10.0, + "TilingProfiler::SimdInstructionsAfterTiling": 626.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "all": { + "compiletime": { + "algsimp": 0.00279300007969141, + "call-inliner": 0.00043799998820759356, + "collective-stream-id-checker": 8.600000001024455e-05, + "comparison-expander": 0.0005419999943114817, + "constant-statistics": 0.0005039999959990382, + "constant_folding": 0.0002699999895412475, + "dce": 8.099999831756577e-05, + "dot_decomposer": 0.001433999976143241, + "eliminate-redundant-compare": 0.00025400001322850585, + "flatten-call-graph": 0.0007660000119358301, + "hlo-mac-count": 0.0009599999757483602, + "hlo-verifier": 0.009782999753952026, + "instruction-histogram": 0.0010479999473318458, + "io-con-pipe-begin": 7.999999979801942e-06, + "io-con-pipe-end": 9.999999974752427e-07, + "io-layout-normalization": 0.0016609999584034085, + "io-statistics": 0.0001049999991664663, + "map-inline": 0.0008759999764151871, + "native-to-custom-softmax": 0.0008140000281855464, + "native-to-custom-softmax-dx": 0.0007149999728426337, + "opt-barrier-removal": 0.0005799999926239252, + "pre-par-pipe-begin": 9.999999974752427e-07, + "pre-par-pipe-end": 0.0, + "pre-partition-simplification": 0.2598330080509186, + "replace-minimum-constant": 0.0003769999893847853, + "reshape-mover": 0.00011300000187475234, + "simplify-while-loops": 9.300000237999484e-05, + "tuple-simplifier": 0.0003000000142492354, + "unpack-nested-aws-ntwsr": 0.0003809999907389283, + "unroll-while-loop": 1.8000000636675395e-05, + "zero_sized_hlo_elimination": 0.0009759999811649323 + } + }, + "cumsum": { + "compiletime": { + "CoalesceCCOp": 0.00027561187744140625, + "DMALocalityOpt": 0.0002129077911376953, + "DMAProfiler": 0.0009992122650146484, + "DataStreaming": 0.0003039836883544922, + "DoNothing": 0.0001742839813232422, + "ExpandISAMacro": 0.0005218982696533203, + "FactorizeBlkDims": 0.0004630088806152344, + "InferPSumTensor": 0.0004932880401611328, + "LateLegalizeInst": 0.0005190372467041016, + "LateNeuronInstComb": 0.0005123615264892578, + "LegalizeSundaAccess": 0.0015988349914550781, + "LegalizeType": 0.00028014183044433594, + "LowerBroadcast": 0.00025653839111328125, + "LowerIntrinsics": 0.0002598762512207031, + "LowerTranspose": 0.00026535987854003906, + "NeuronInstComb": 0.0005023479461669922, + "NeuronLICM": 0.00043654441833496094, + "NeuronSimplifyPredicates": 0.0028448104858398438, + "NeuronValueNumbering": 0.0004410743713378906, + "SFKVectorizer": 0.0033159255981445313, + "SimpleAllReduceTiling": 0.00028634071350097656, + "SimplifyNeuronTensor": 0.0004749298095703125, + "SpillPSum": 0.0005846023559570313, + "WeightCoalescing": 0.00024771690368652344 + } + }, + "sg00": { + "compiletime": { + "CanonicalizeConv": 4.3000000005122274e-05, + "CanonicalizeForTensorizer": 1.4999999621068127e-05, + "Canonicalizer": 0.00034500000765547156, + "HoistCompute": 1.9999999949504854e-06, + "IdentifyCrossPassTensors": 1.2999999853491317e-05, + "MemcastMotion": 1.2999999853491317e-05, + "PenguinizeFunctions": 1.2999999853491317e-05, + "PruneFunctions": 1.2000000424450263e-05, + "RemoveOptimizationBarriers": 1.2999999853491317e-05, + "ScatterMotion": 2.9000000722589903e-05, + "TensorizerLegalizationPass": 2.099999983329326e-05, + "VerifySupportedOps": 9.999999747378752e-06, + "algsimp": 7.699999696342275e-05, + "batchnorm_expander": 1.4000000192027073e-05, + "boundary-marker-removal": 3.000000106112566e-06, + "call-inliner": 9.000000318337698e-06, + "canonicalize-boundary-marker": 4.999999873689376e-06, + "collective-stream-id-checker": 3.999999989900971e-06, + "comparison-expander": 3.999999989900971e-06, + "computation-deduplicator": 1.8000000636675395e-05, + "conditional-to-select": 4.999999873689376e-06, + "config-lowering": 5.6000000768108293e-05, + "constant_folding": 7.999999979801942e-06, + "cse": 2.499999936844688e-05, + "dce": 1.9999999949504854e-06, + "dynamic-slice-transpose": 3.999999989900971e-06, + "eliminate-redundant-compare": 3.999999989900971e-06, + "emit-offloaded-dropout": 2.099999983329326e-05, + "flatten-call-graph": 7.999999979801942e-06, + "fuse-send-recv": 2.300000051036477e-05, + "hilo::LegalizeAlias": 4.999999873689376e-06, + "hilo::NeuronInstCombine": 2.9999999242136255e-05, + "hilo::NeuronOpFusion": 1.8000000636675395e-05, + "hilo::ReplaceTokenTypeWithU8Pass": 4.999999873689376e-06, + "hilo::ScheduleFusion": 9.999999974752427e-07, + "hilo::SixtyFourHack": 1.1000000085914508e-05, + "hilo::VerifyAliasing": 1.9999999949504854e-06, + "hlo-mac-count": 3.7000001611886546e-05, + "hlo-verifier": 0.00017499999376013875, + "legalize-ccops": 1.9999999949504854e-06, + "legalize-compare": 3.999999989900971e-06, + "lower-argminmax-custom-call": 3.000000106112566e-06, + "map-inline": 1.1000000085914508e-05, + "metadata-naming": 1.700000029813964e-05, + "mlir::detail::OpToOpPassAdaptor": 3.300000025774352e-05, + "mlir::hlo::MhloToPyPenguin": 0.03136799857020378, + "mlir::mhlo::LowerComplexExtraPass": 9.899999713525176e-05, + "mlir::mhlo::LowerComplexPass": 0.00019999999494757503, + "native-to-custom-softmax": 7.999999979801942e-06, + "native-to-custom-softmax-dx": 9.300000237999484e-05, + "operand_upcaster": 2.700000004551839e-05, + "post-par-pipe-begin": 1.9999999949504854e-06, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.0006479999865405262, + "replace-minimum-constant": 9.000000318337698e-06, + "reshape-mover": 3.999999989900971e-06, + "simplify-concat": 5.900000178371556e-05, + "simplify-while-loops": 1.9999999949504854e-06, + "transform-variadic-reduce": 9.000000318337698e-06, + "tuple-simplifier": 3.999999989900971e-06, + "unpack-nested-aws-ntwsr": 3.000000106112566e-06, + "unroll-while-loop": 0.0 + }, + "hilo": { + "ArithmeticIntensity": 439.27252197265625, + "ConstantSize": 4203477.0, + "HloInputCount": 359.0, + "HloMacCount": 60129542144.0, + "HloOutputCount": 65.0, + "IfmapSize": 7785177088.0, + "OfmapSize": 536870912.0, + "OutputsReadFromCount": 0.0, + "PassthroughTensorsCount": 0.0, + "RedundantOutputCount": 0.0, + "Traffic": 273768736.0 + } + }, + "sg0000": { + "compiletime": { + "AGOrderingAnalysisPass": 0.13596534729003906, + "AffinePredicateResolution": 0.0015311241149902344, + "AliasDependencyElimination": 0.0001938343048095703, + "AliasDependencyInduction": 0.007838010787963867, + "AliasDependencyReset": 0.15939617156982422, + "BFComputeCutting": 0.006036996841430664, + "BirCodeGenLoop": 0.38369321823120117, + "CCOpFusion": 0.15093040466308594, + "CanonicalizeDAGForPGTiling": 0.014190196990966797, + "CanonicalizeIR": 0.0019371509552001953, + "CoalesceCCOp": 0.0029022693634033203, + "CommuteConcat": 0.0010671615600585938, + "DMALocalityOpt": 0.0018265247344970703, + "DMAProfiler": 0.006582021713256836, + "DMATilingProfiler": 0.005391597747802734, + "DataLocalityOpt": 0.20601868629455566, + "DataStreaming": 0.00843048095703125, + "DeConcat": 0.0018315315246582031, + "DeadCodeElimination": 0.0020117759704589844, + "DeadStoreElimination": 0.027777433395385742, + "DelinearIndices": 0.029506444931030273, + "Delinearization": 0.003535747528076172, + "DoNothing": 0.0001571178436279297, + "DramToDramTranspose": 0.07804989814758301, + "DumpGraphAndMetadata": 0.04837989807128906, + "EliminateDivs": 0.0034132003784179688, + "ExpandBatchNorm": 0.0020427703857421875, + "ExpandISAMacro": 0.0035333633422851563, + "FactorizeBlkDims": 0.06211543083190918, + "FactorizeThreadAxesInFreeDims": 0.0018017292022705078, + "FlattenMacroLoop": 0.005364418029785156, + "GenericAccessSimplifier": 0.0018382072448730469, + "InferInitValue": 0.04181218147277832, + "InferIntrinsicOnCC": 0.05515456199645996, + "InferNeuronTensor": 0.08455061912536621, + "InferNonlocalTensors": 0.3793964385986328, + "InferPSumTensor": 0.06014227867126465, + "InlineNativeKernels": 0.0018780231475830078, + "InsertIOTransposes": 0.05663871765136719, + "InsertLocalTransposes": 0.013693094253540039, + "InsertOffloadedTransposes": 0.003034353256225586, + "LICM": 0.0034589767456054688, + "LateLegalizeInst": 0.01206350326538086, + "LateLegalizePostSplit": 0.004300355911254883, + "LateLowerReshapeOp": 0.001447439193725586, + "LateLowerTensorOp": 0.005361080169677734, + "LateNeuronInstComb": 0.028362512588500977, + "LayoutPreprocessing": 0.17102479934692383, + "LayoutPreprocessingAndAnalysis": 0.20053863525390625, + "LayoutRequirementAnalysis": 0.00810098648071289, + "LegalizeCCOpLayout": 0.002534151077270508, + "LegalizeOpLevelAlias": 0.0013082027435302734, + "LegalizePartitionReduce": 0.0018541812896728516, + "LegalizeSundaAccess": 0.06417489051818848, + "LegalizeSundaMacro": 0.011395931243896484, + "LegalizeType": 0.004536867141723633, + "LocalLayoutOpt": 0.019284486770629883, + "LoopFusion": 0.005501747131347656, + "LoopSplitting": 0.0007183551788330078, + "LowerBroadcast": 0.0020034313201904297, + "LowerCCOpBlockAxis": 0.006723642349243164, + "LowerComplexBroadcast": 0.0025110244750976563, + "LowerIntrinsics": 0.04395008087158203, + "LowerTensorOp": 0.01201629638671875, + "LowerTranspose": 0.015764951705932617, + "MacroGeneration": 0.1732039451599121, + "MaskPropagation": 0.006498575210571289, + "MemcpyElimination": 0.13526344299316406, + "MutateDataType": 0.0024404525756835938, + "NeuronAliasDependencyInduction": 0.00028133392333984375, + "NeuronAliasDependencyReset": 0.027801036834716797, + "NeuronInstComb": 0.014089107513427734, + "NeuronLICM": 0.011513233184814453, + "NeuronLoopFusion": 0.018094778060913086, + "NeuronLoopInterchange": 0.002248525619506836, + "NeuronSimplifier": 0.014221668243408203, + "NeuronSimplifyPredicates": 0.04183816909790039, + "NeuronValueNumbering": 0.010004520416259766, + "OptimizeAliasedCopyChain": 0.0007202625274658203, + "OptimizeNKIKernels": 0.0027985572814941406, + "PAGLayoutOpt": 0.6076157093048096, + "PComputeCutting": 0.01562190055847168, + "PGLayoutTilingPipeline": 1.8925251960754395, + "PGTiling": 0.4175417423248291, + "PadElimination": 0.0005469322204589844, + "ParAxesAnnotation": 0.5765500068664551, + "PartialLoopFusion": 0.06665897369384766, + "PartialSimdFusion": 0.06845211982727051, + "PerfectLoopNest": 0.002520322799682617, + "RecognizeOpIdiom": 0.0038416385650634766, + "Recompute": 0.00042510032653808594, + "RelaxPredicates": 0.004330158233642578, + "Rematerialization": 0.0048253536224365234, + "ReshapeWeights": 0.0009126663208007813, + "ResolveAccessConflict": 0.007032871246337891, + "ResolveComplicatePredicates": 0.0016722679138183594, + "RewriteReplicationMatmul": 0.0017805099487304688, + "RewriteWeights": 0.00464630126953125, + "SFKVectorizer": 0.6191775798797607, + "SimpleAllReduceTiling": 0.0028734207153320313, + "Simplifier": 0.04510617256164551, + "SimplifyMacroPredicates": 0.03182697296142578, + "SimplifyNeuronTensor": 0.018846511840820313, + "SimplifySlice": 0.0010728836059570313, + "SimplifyTensor": 0.00718235969543457, + "SpillPSum": 0.02726292610168457, + "SplitAPUnionSets": 0.20770835876464844, + "SplitAccGrp": 0.0018444061279296875, + "StaticProfiler": 0.009473562240600586, + "StaticTransposeLocalTensor": 0.0051805973052978516, + "SundaISel": 0.0508725643157959, + "TCTransform": 0.0011992454528808594, + "TensorInitialization": 0.02745676040649414, + "TensorOpSimplifier": 0.006608009338378906, + "TensorOpTransform": 0.026006698608398438, + "TileCCOps": 0.008507728576660156, + "TilingProfiler": 0.015691757202148438, + "TransformConvOp": 0.002533435821533203, + "TritiumFusion": 0.1403183937072754, + "ValueNumbering": 0.0023522377014160156, + "VectorizeDMA": 0.006667613983154297, + "VectorizeMatMult": 0.025510072708129883, + "WeightCoalescing": 0.002580881118774414, + "ZeroSizeTensorElimination": 0.0002429485321044922 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 15146.0, + "StaticProfiler::AifUb": 590.4973754882813, + "StaticProfiler::ArithmeticIntensityTensorizer": 748.2540283203125, + "StaticProfiler::AverageDmaLength": 2622.051025390625, + "StaticProfiler::AverageFractalPeUtilization": 99.97018432617188, + "StaticProfiler::AveragePartitionUtilization": 99.92617797851563, + "StaticProfiler::AveragePeUtilization": 99.87796020507813, + "StaticProfiler::DDRTransferBytes": 196215040.0, + "StaticProfiler::InternalTransferBytes": 332922880.0, + "StaticProfiler::LoadExpanded": 37252.0, + "StaticProfiler::LocalizationEfficiency": 126.71589660644531, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 180.83277893066406, + "StaticProfiler::StoreExpanded": 16897.0, + "StaticProfiler::TotalDMAExpanded": 54149.0, + "StaticProfiler::TotalDynamicInstancesCount": 23848.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 23836.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 192.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 7184.0, + "TilingProfiler::NumPfTransposes": 8.0, + "TilingProfiler::NumPfTransposesForIo": 0.0, + "TilingProfiler::NumPfTransposesForLocal": 6.0, + "TilingProfiler::NumPfTransposesForNonlocal": 2.0, + "TilingProfiler::PfTransposeInstructions": 5568.0, + "TilingProfiler::PfTransposeInstructionsForIo": 0.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 4800.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 768.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 0.0, + "TilingProfiler::SimdInstructionsAfterTiling": 1764.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "sg0001": { + "compiletime": { + "AGOrderingAnalysisPass": 0.1609482765197754, + "AffinePredicateResolution": 0.0013859272003173828, + "AliasDependencyElimination": 0.00018930435180664063, + "AliasDependencyInduction": 0.01599717140197754, + "AliasDependencyReset": 0.031088829040527344, + "BFComputeCutting": 0.004532814025878906, + "BirCodeGenLoop": 0.21514463424682617, + "CCOpFusion": 0.2648317813873291, + "CanonicalizeDAGForPGTiling": 0.003528594970703125, + "CanonicalizeIR": 0.0019960403442382813, + "CoalesceCCOp": 0.00243377685546875, + "CommuteConcat": 0.0011005401611328125, + "DMALocalityOpt": 0.0013654232025146484, + "DMAProfiler": 0.008886098861694336, + "DMATilingProfiler": 0.005682706832885742, + "DataLocalityOpt": 0.2774043083190918, + "DataStreaming": 0.007985115051269531, + "DeConcat": 0.001863241195678711, + "DeadCodeElimination": 0.0013644695281982422, + "DeadStoreElimination": 0.07262182235717773, + "DelinearIndices": 0.047678232192993164, + "Delinearization": 0.004866838455200195, + "DoNothing": 0.00013303756713867188, + "DramToDramTranspose": 0.0971534252166748, + "DumpGraphAndMetadata": 0.013672351837158203, + "EliminateDivs": 0.003657102584838867, + "ExpandBatchNorm": 0.0016169548034667969, + "ExpandISAMacro": 0.0034465789794921875, + "FactorizeBlkDims": 0.060559749603271484, + "FactorizeThreadAxesInFreeDims": 0.0021708011627197266, + "FlattenMacroLoop": 0.004648447036743164, + "GenericAccessSimplifier": 0.000980377197265625, + "InferInitValue": 0.05812406539916992, + "InferIntrinsicOnCC": 0.010819196701049805, + "InferNeuronTensor": 0.14679336547851563, + "InferNonlocalTensors": 0.034285783767700195, + "InferPSumTensor": 0.09114336967468262, + "InlineNativeKernels": 0.0017209053039550781, + "InsertIOTransposes": 0.0731968879699707, + "InsertLocalTransposes": 0.0275421142578125, + "InsertOffloadedTransposes": 0.007097005844116211, + "LICM": 0.0033905506134033203, + "LateLegalizeInst": 0.006936788558959961, + "LateLegalizePostSplit": 0.003220081329345703, + "LateLowerReshapeOp": 0.0016317367553710938, + "LateLowerTensorOp": 0.005948543548583984, + "LateNeuronInstComb": 0.018251657485961914, + "LayoutPreprocessing": 0.09319257736206055, + "LayoutPreprocessingAndAnalysis": 0.11977434158325195, + "LayoutRequirementAnalysis": 0.009629964828491211, + "LegalizeCCOpLayout": 0.0020868778228759766, + "LegalizeOpLevelAlias": 0.0011761188507080078, + "LegalizePartitionReduce": 0.001623392105102539, + "LegalizeSundaAccess": 0.021021366119384766, + "LegalizeSundaMacro": 0.012225627899169922, + "LegalizeType": 0.02536749839782715, + "LocalLayoutOpt": 0.04628801345825195, + "LoopFusion": 0.005954742431640625, + "LoopSplitting": 0.0006933212280273438, + "LowerBroadcast": 0.0018084049224853516, + "LowerCCOpBlockAxis": 0.006256580352783203, + "LowerComplexBroadcast": 0.002477884292602539, + "LowerIntrinsics": 0.03852725028991699, + "LowerTensorOp": 0.010782480239868164, + "LowerTranspose": 0.018457412719726563, + "MacroGeneration": 0.1307680606842041, + "MaskPropagation": 0.0035936832427978516, + "MemcpyElimination": 0.15900325775146484, + "MutateDataType": 0.001459360122680664, + "NeuronAliasDependencyInduction": 0.00030994415283203125, + "NeuronAliasDependencyReset": 0.0227048397064209, + "NeuronInstComb": 0.01124882698059082, + "NeuronLICM": 0.010287761688232422, + "NeuronLoopFusion": 0.06714057922363281, + "NeuronLoopInterchange": 0.0033617019653320313, + "NeuronSimplifier": 0.015295267105102539, + "NeuronSimplifyPredicates": 0.002671957015991211, + "NeuronValueNumbering": 0.004712104797363281, + "OptimizeAliasedCopyChain": 0.0008287429809570313, + "OptimizeNKIKernels": 0.0030798912048339844, + "PAGLayoutOpt": 0.4701688289642334, + "PComputeCutting": 0.008523941040039063, + "PGLayoutTilingPipeline": 1.527449607849121, + "PGTiling": 0.562786340713501, + "PadElimination": 0.0005154609680175781, + "ParAxesAnnotation": 0.4113032817840576, + "PartialLoopFusion": 0.03786206245422363, + "PartialSimdFusion": 0.09660077095031738, + "PerfectLoopNest": 0.0025701522827148438, + "RecognizeOpIdiom": 0.004408836364746094, + "Recompute": 0.0004405975341796875, + "RelaxPredicates": 0.004298210144042969, + "Rematerialization": 0.0020570755004882813, + "ReshapeWeights": 0.0008633136749267578, + "ResolveAccessConflict": 0.004068136215209961, + "ResolveComplicatePredicates": 0.0015447139739990234, + "RewriteReplicationMatmul": 0.0018274784088134766, + "RewriteWeights": 0.024018287658691406, + "SFKVectorizer": 0.5714495182037354, + "SimpleAllReduceTiling": 0.05605673789978027, + "Simplifier": 0.03458523750305176, + "SimplifyMacroPredicates": 0.007905960083007813, + "SimplifyNeuronTensor": 0.05205702781677246, + "SimplifySlice": 0.0012252330780029297, + "SimplifyTensor": 0.007117748260498047, + "SpillPSum": 0.0394134521484375, + "SplitAPUnionSets": 0.0830078125, + "SplitAccGrp": 0.0015587806701660156, + "StaticProfiler": 0.008753538131713867, + "StaticTransposeLocalTensor": 0.03607439994812012, + "SundaISel": 0.06672215461730957, + "TCTransform": 0.0011696815490722656, + "TensorInitialization": 0.006832122802734375, + "TensorOpSimplifier": 0.0061838626861572266, + "TensorOpTransform": 0.03341221809387207, + "TileCCOps": 0.00767970085144043, + "TilingProfiler": 0.07469630241394043, + "TransformConvOp": 0.00249481201171875, + "TritiumFusion": 0.3289809226989746, + "ValueNumbering": 0.0027396678924560547, + "VectorizeDMA": 0.0023260116577148438, + "VectorizeMatMult": 0.05879783630371094, + "WeightCoalescing": 0.002382993698120117, + "ZeroSizeTensorElimination": 0.0001971721649169922 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 37569.0, + "StaticProfiler::AifUb": 1576.160400390625, + "StaticProfiler::ArithmeticIntensityTensorizer": 624.52294921875, + "StaticProfiler::AverageDmaLength": 1256.79248046875, + "StaticProfiler::AverageFractalPeUtilization": 100.0, + "StaticProfiler::AveragePartitionUtilization": 99.870361328125, + "StaticProfiler::AveragePeUtilization": 100.0, + "StaticProfiler::DDRTransferBytes": 818020352.0, + "StaticProfiler::InternalTransferBytes": 284688384.0, + "StaticProfiler::LoadExpanded": 616833.0, + "StaticProfiler::LocalizationEfficiency": 39.6230583190918, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 43.16416549682617, + "StaticProfiler::StoreExpanded": 17409.0, + "StaticProfiler::TotalDMAExpanded": 634242.0, + "StaticProfiler::TotalDynamicInstancesCount": 49371.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 49371.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 128.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 28672.0, + "TilingProfiler::NumPfTransposes": 9.0, + "TilingProfiler::NumPfTransposesForIo": 3.0, + "TilingProfiler::NumPfTransposesForLocal": 4.0, + "TilingProfiler::NumPfTransposesForNonlocal": 2.0, + "TilingProfiler::PfTransposeInstructions": 5856.0, + "TilingProfiler::PfTransposeInstructionsForIo": 544.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 4288.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 1024.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 0.0, + "TilingProfiler::SimdInstructionsAfterTiling": 1876.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "sg0002": { + "compiletime": { + "AGOrderingAnalysisPass": 0.06499266624450684, + "AffinePredicateResolution": 0.00141143798828125, + "AliasDependencyElimination": 0.00017595291137695313, + "AliasDependencyInduction": 0.006516218185424805, + "AliasDependencyReset": 0.02024674415588379, + "BFComputeCutting": 0.0023620128631591797, + "BirCodeGenLoop": 0.13731598854064941, + "CCOpFusion": 0.02620387077331543, + "CanonicalizeDAGForPGTiling": 0.0074574947357177734, + "CanonicalizeIR": 0.0019347667694091797, + "CoalesceCCOp": 0.005384206771850586, + "CommuteConcat": 0.0009889602661132813, + "DMALocalityOpt": 0.0021970272064208984, + "DMAProfiler": 0.007658243179321289, + "DMATilingProfiler": 0.04570889472961426, + "DataLocalityOpt": 0.1127479076385498, + "DataStreaming": 0.00765538215637207, + "DeConcat": 0.0007421970367431641, + "DeadCodeElimination": 0.002073049545288086, + "DeadStoreElimination": 0.006093263626098633, + "DelinearIndices": 0.010124444961547852, + "Delinearization": 0.005106449127197266, + "DoNothing": 0.00013017654418945313, + "DramToDramTranspose": 0.03771638870239258, + "DumpGraphAndMetadata": 0.05296611785888672, + "EliminateDivs": 0.0021944046020507813, + "ExpandBatchNorm": 0.0015587806701660156, + "ExpandISAMacro": 0.0046465396881103516, + "FactorizeBlkDims": 0.011369466781616211, + "FactorizeThreadAxesInFreeDims": 0.0014889240264892578, + "FlattenMacroLoop": 0.0025510787963867188, + "GenericAccessSimplifier": 0.0009717941284179688, + "InferInitValue": 0.030786514282226563, + "InferIntrinsicOnCC": 0.012189865112304688, + "InferNeuronTensor": 0.0819096565246582, + "InferNonlocalTensors": 0.025629520416259766, + "InferPSumTensor": 0.08428668975830078, + "InlineNativeKernels": 0.003083944320678711, + "InsertIOTransposes": 0.02764296531677246, + "InsertLocalTransposes": 0.0040624141693115234, + "InsertOffloadedTransposes": 0.005682229995727539, + "LICM": 0.003050565719604492, + "LateLegalizeInst": 0.022694110870361328, + "LateLegalizePostSplit": 0.004519462585449219, + "LateLowerReshapeOp": 0.0023851394653320313, + "LateLowerTensorOp": 0.0016567707061767578, + "LateNeuronInstComb": 0.010613203048706055, + "LayoutPreprocessing": 0.06753706932067871, + "LayoutPreprocessingAndAnalysis": 0.16236424446105957, + "LayoutRequirementAnalysis": 0.005420684814453125, + "LegalizeCCOpLayout": 0.0023717880249023438, + "LegalizeOpLevelAlias": 0.0012898445129394531, + "LegalizePartitionReduce": 0.0011932849884033203, + "LegalizeSundaAccess": 0.025110721588134766, + "LegalizeSundaMacro": 0.012512683868408203, + "LegalizeType": 0.0470888614654541, + "LocalLayoutOpt": 0.0263979434967041, + "LoopFusion": 0.005193948745727539, + "LoopSplitting": 0.0005512237548828125, + "LowerBroadcast": 0.04195570945739746, + "LowerCCOpBlockAxis": 0.008313655853271484, + "LowerComplexBroadcast": 0.0025756359100341797, + "LowerIntrinsics": 0.11726689338684082, + "LowerTensorOp": 0.010608196258544922, + "LowerTranspose": 0.08231282234191895, + "MacroGeneration": 0.07271862030029297, + "MaskPropagation": 0.005186557769775391, + "MemcpyElimination": 0.026259899139404297, + "MutateDataType": 0.0013203620910644531, + "NeuronAliasDependencyInduction": 0.0002338886260986328, + "NeuronAliasDependencyReset": 0.029464006423950195, + "NeuronInstComb": 0.004238128662109375, + "NeuronLICM": 0.014646768569946289, + "NeuronLoopFusion": 0.00891876220703125, + "NeuronLoopInterchange": 0.0014586448669433594, + "NeuronSimplifier": 0.009086847305297852, + "NeuronSimplifyPredicates": 0.0033910274505615234, + "NeuronValueNumbering": 0.0026366710662841797, + "OptimizeAliasedCopyChain": 0.0006422996520996094, + "OptimizeNKIKernels": 0.5174376964569092, + "PAGLayoutOpt": 0.12734031677246094, + "PComputeCutting": 0.005000591278076172, + "PGLayoutTilingPipeline": 0.8229436874389648, + "PGTiling": 0.26772499084472656, + "PadElimination": 0.0005135536193847656, + "ParAxesAnnotation": 0.07412934303283691, + "PartialLoopFusion": 0.013575553894042969, + "PartialSimdFusion": 0.011231422424316406, + "PerfectLoopNest": 0.0019729137420654297, + "RecognizeOpIdiom": 0.0038080215454101563, + "Recompute": 0.00034308433532714844, + "RelaxPredicates": 0.004430532455444336, + "Rematerialization": 0.002201557159423828, + "ReshapeWeights": 0.0009114742279052734, + "ResolveAccessConflict": 0.027348041534423828, + "ResolveComplicatePredicates": 0.0011477470397949219, + "RewriteReplicationMatmul": 0.0025103092193603516, + "RewriteWeights": 0.0029447078704833984, + "SFKVectorizer": 0.19645977020263672, + "SimpleAllReduceTiling": 0.003208160400390625, + "Simplifier": 0.003106832504272461, + "SimplifyMacroPredicates": 0.03599357604980469, + "SimplifyNeuronTensor": 0.1807866096496582, + "SimplifySlice": 0.0016787052154541016, + "SimplifyTensor": 0.04330563545227051, + "SpillPSum": 0.0645456314086914, + "SplitAPUnionSets": 0.012967586517333984, + "SplitAccGrp": 0.0015358924865722656, + "StaticProfiler": 0.00551915168762207, + "StaticTransposeLocalTensor": 0.004834890365600586, + "SundaISel": 0.0945746898651123, + "TCTransform": 0.0009295940399169922, + "TensorInitialization": 0.006634950637817383, + "TensorOpSimplifier": 0.005204439163208008, + "TensorOpTransform": 0.02082967758178711, + "TileCCOps": 0.006725311279296875, + "TilingProfiler": 0.016322612762451172, + "TransformConvOp": 0.0029544830322265625, + "TritiumFusion": 0.09467315673828125, + "ValueNumbering": 0.0020852088928222656, + "VectorizeDMA": 0.0017535686492919922, + "VectorizeMatMult": 0.008865118026733398, + "WeightCoalescing": 0.0030977725982666016, + "ZeroSizeTensorElimination": 0.00018644332885742188 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 28921.0, + "StaticProfiler::AifUb": 1080.6693115234375, + "StaticProfiler::ArithmeticIntensityTensorizer": 449.7645263671875, + "StaticProfiler::AverageDmaLength": 1323.6162109375, + "StaticProfiler::AverageFractalPeUtilization": 99.88423156738281, + "StaticProfiler::AveragePartitionUtilization": 99.71043395996094, + "StaticProfiler::AveragePeUtilization": 99.53581237792969, + "StaticProfiler::DDRTransferBytes": 826525760.0, + "StaticProfiler::InternalTransferBytes": 96576528.0, + "StaticProfiler::LoadExpanded": 619540.0, + "StaticProfiler::LocalizationEfficiency": 41.61907196044922, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 45.55835723876953, + "StaticProfiler::StoreExpanded": 12842.0, + "StaticProfiler::TotalDMAExpanded": 632382.0, + "StaticProfiler::TotalDynamicInstancesCount": 34834.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 34738.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 4.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 25600.0, + "TilingProfiler::NumPfTransposes": 4.0, + "TilingProfiler::NumPfTransposesForIo": 0.0, + "TilingProfiler::NumPfTransposesForLocal": 1.0, + "TilingProfiler::NumPfTransposesForNonlocal": 3.0, + "TilingProfiler::PfTransposeInstructions": 1537.0, + "TilingProfiler::PfTransposeInstructionsForIo": 0.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 1.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 1536.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 10.0, + "TilingProfiler::SimdInstructionsAfterTiling": 626.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "sg01": { + "compiletime": { + "CanonicalizeConv": 7.000000096013537e-06, + "CanonicalizeForTensorizer": 1.9999999494757503e-05, + "Canonicalizer": 0.00028300000121816993, + "HoistCompute": 1.9999999949504854e-06, + "IdentifyCrossPassTensors": 1.4999999621068127e-05, + "MemcastMotion": 9.000000318337698e-06, + "PenguinizeFunctions": 1.8000000636675395e-05, + "PruneFunctions": 4.999999873689376e-06, + "RemoveOptimizationBarriers": 5.0999999075429514e-05, + "ScatterMotion": 3.999999989900971e-06, + "TensorizerLegalizationPass": 2.2000000171829015e-05, + "VerifySupportedOps": 1.1000000085914508e-05, + "algsimp": 6.800000119255856e-05, + "batchnorm_expander": 1.5999999959603883e-05, + "boundary-marker-removal": 3.999999989900971e-06, + "call-inliner": 9.000000318337698e-06, + "canonicalize-boundary-marker": 4.999999873689376e-06, + "collective-stream-id-checker": 7.000000096013537e-06, + "comparison-expander": 4.999999873689376e-06, + "computation-deduplicator": 2.4000000848900527e-05, + "conditional-to-select": 4.999999873689376e-06, + "config-lowering": 4.600000102072954e-05, + "constant_folding": 9.999999747378752e-06, + "cse": 2.2000000171829015e-05, + "dce": 1.9999999949504854e-06, + "dynamic-slice-transpose": 3.999999989900971e-06, + "eliminate-redundant-compare": 3.000000106112566e-06, + "emit-offloaded-dropout": 2.300000051036477e-05, + "flatten-call-graph": 9.000000318337698e-06, + "fuse-send-recv": 2.5999999706982635e-05, + "hilo::LegalizeAlias": 7.999999979801942e-06, + "hilo::NeuronInstCombine": 7.79999973019585e-05, + "hilo::NeuronOpFusion": 2.8000000384054147e-05, + "hilo::ReplaceTokenTypeWithU8Pass": 2.300000051036477e-05, + "hilo::ScheduleFusion": 9.999999974752427e-07, + "hilo::SixtyFourHack": 1.5999999959603883e-05, + "hilo::VerifyAliasing": 3.000000106112566e-06, + "hlo-mac-count": 3.899999865097925e-05, + "hlo-verifier": 0.00021499999274965376, + "legalize-ccops": 1.9999999949504854e-06, + "legalize-compare": 3.999999989900971e-06, + "lower-argminmax-custom-call": 3.999999989900971e-06, + "map-inline": 1.1000000085914508e-05, + "metadata-naming": 2.300000051036477e-05, + "mlir::detail::OpToOpPassAdaptor": 2.9999999242136255e-05, + "mlir::hlo::MhloToPyPenguin": 0.025178000330924988, + "mlir::mhlo::LowerComplexExtraPass": 8.70000003487803e-05, + "mlir::mhlo::LowerComplexPass": 0.00013099999341648072, + "native-to-custom-softmax": 9.999999747378752e-06, + "native-to-custom-softmax-dx": 3.600000127335079e-05, + "operand_upcaster": 2.300000051036477e-05, + "post-par-pipe-begin": 1.1000000085914508e-05, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.000590000010561198, + "replace-minimum-constant": 7.000000096013537e-06, + "reshape-mover": 3.999999989900971e-06, + "simplify-concat": 5.400000009103678e-05, + "simplify-while-loops": 1.9999999949504854e-06, + "transform-variadic-reduce": 9.999999747378752e-06, + "tuple-simplifier": 4.999999873689376e-06, + "unpack-nested-aws-ntwsr": 3.999999989900971e-06, + "unroll-while-loop": 0.0 + }, + "hilo": { + "ArithmeticIntensity": 1411.2052001953125, + "HloMacCount": 240518168576.0, + "Traffic": 340869152.0 + } + }, + "sg02": { + "compiletime": { + "CanonicalizeConv": 1.1000000085914508e-05, + "CanonicalizeForTensorizer": 1.4999999621068127e-05, + "Canonicalizer": 0.0003480000013951212, + "HoistCompute": 9.999999747378752e-06, + "IdentifyCrossPassTensors": 1.2999999853491317e-05, + "MemcastMotion": 0.0, + "PenguinizeFunctions": 9.999999747378752e-06, + "PruneFunctions": 7.999999979801942e-06, + "RemoveOptimizationBarriers": 1.5999999959603883e-05, + "ScatterMotion": 9.999999974752427e-07, + "TensorizerLegalizationPass": 7.000000096013537e-06, + "VerifySupportedOps": 1.2000000424450263e-05, + "algsimp": 7.599999662488699e-05, + "batchnorm_expander": 1.4000000192027073e-05, + "boundary-marker-removal": 3.999999989900971e-06, + "call-inliner": 1.1000000085914508e-05, + "canonicalize-boundary-marker": 4.999999873689376e-06, + "collective-stream-id-checker": 4.999999873689376e-06, + "comparison-expander": 6.000000212225132e-06, + "computation-deduplicator": 2.300000051036477e-05, + "conditional-to-select": 7.000000096013537e-06, + "config-lowering": 6.0999998822808266e-05, + "constant_folding": 9.000000318337698e-06, + "cse": 1.4000000192027073e-05, + "dce": 9.999999974752427e-07, + "dynamic-slice-transpose": 4.999999873689376e-06, + "eliminate-redundant-compare": 3.000000106112566e-06, + "emit-offloaded-dropout": 2.099999983329326e-05, + "flatten-call-graph": 1.2999999853491317e-05, + "fuse-send-recv": 2.099999983329326e-05, + "hilo::LegalizeAlias": 1.9999999949504854e-06, + "hilo::NeuronInstCombine": 1.5999999959603883e-05, + "hilo::NeuronOpFusion": 1.8000000636675395e-05, + "hilo::ReplaceTokenTypeWithU8Pass": 1.700000029813964e-05, + "hilo::ScheduleFusion": 1.1000000085914508e-05, + "hilo::SixtyFourHack": 4.099999932805076e-05, + "hilo::VerifyAliasing": 9.999999974752427e-07, + "hlo-mac-count": 0.00020500000391621143, + "hlo-verifier": 0.00019299999985378236, + "legalize-ccops": 9.999999974752427e-07, + "legalize-compare": 3.000000106112566e-06, + "lower-argminmax-custom-call": 3.000000106112566e-06, + "map-inline": 1.4999999621068127e-05, + "metadata-naming": 1.4000000192027073e-05, + "mlir::detail::OpToOpPassAdaptor": 1.8999999156221747e-05, + "mlir::hlo::MhloToPyPenguin": 0.018411999568343163, + "mlir::mhlo::LowerComplexExtraPass": 0.00017299999308306724, + "mlir::mhlo::LowerComplexPass": 0.00020799999765586108, + "native-to-custom-softmax": 9.999999747378752e-06, + "native-to-custom-softmax-dx": 3.7999998312443495e-05, + "operand_upcaster": 1.8000000636675395e-05, + "post-par-pipe-begin": 3.999999989900971e-06, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.0005879999953322113, + "replace-minimum-constant": 1.1000000085914508e-05, + "reshape-mover": 3.000000106112566e-06, + "simplify-concat": 4.999999873689376e-05, + "simplify-while-loops": 3.000000106112566e-06, + "transform-variadic-reduce": 0.00017499999376013875, + "tuple-simplifier": 4.999999873689376e-06, + "unpack-nested-aws-ntwsr": 4.999999873689376e-06, + "unroll-while-loop": 0.0 + }, + "hilo": { + "ArithmeticIntensity": 1000.4613647460938, + "HloMacCount": 180455735296.0, + "Traffic": 360745024.0 + } + } +} \ No newline at end of file diff --git a/context_encoding_model/_tp0_bk4/graph.neff b/context_encoding_model/_tp0_bk4/graph.neff new file mode 100644 index 0000000000000000000000000000000000000000..5ed302ca091dd9016ef3fabb2b6ddebfddb3fe5b --- /dev/null +++ b/context_encoding_model/_tp0_bk4/graph.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93d468f8c91c8ae558da4744c631e0351092b98d3698d8a39f05082867c022a7 +size 3298304 diff --git a/context_encoding_model/_tp0_bk4/log-neuron-cc.txt b/context_encoding_model/_tp0_bk4/log-neuron-cc.txt new file mode 100644 index 0000000000000000000000000000000000000000..d720479635a684ed29739e6b917ed2afdf269100 --- /dev/null +++ b/context_encoding_model/_tp0_bk4/log-neuron-cc.txt @@ -0,0 +1,6284 @@ +2025-09-05T19:14:57Z INFO 750 [root]: /opt/conda/bin/neuronx-cc compile --framework=XLA /models/mistral-7b-v0.3-instruct-neuronx/context_encoding_model/_tp0_bk4/model.MODULE_d342327da795afc2aa68+5e8b788a.hlo_module.pb --output /models/mistral-7b-v0.3-instruct-neuronx/context_encoding_model/_tp0_bk4/model.MODULE_d342327da795afc2aa68+5e8b788a.neff --target=trn1 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma' --lnc=1 -O1 '--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true' --logfile=/models/mistral-7b-v0.3-instruct-neuronx/context_encoding_model/_tp0_bk4/log-neuron-cc.txt --verbose=35 +2025-09-05T19:15:03Z INFO 750 [root]: NeuronX Compiler version 2.20.9961.0+0acef03a Python version 3.10.17 HWM version 2.20.0.9961+0acef03a NumPy version 1.26.4 +2025-09-05T19:15:03Z INFO 1166 [root]: XLA detected +2025-09-05T19:15:03Z INFO 1166 [root]: Pipeline: HLOToTensorizer Frontend StaticIOTranspose WalrusDriver BIRLinker Kelper NeffWrapper +2025-09-05T19:15:03Z INFO 1166 [root]: Intermediate files stored in /models/mistral-7b-v0.3-instruct-neuronx/context_encoding_model/_tp0_bk4/neuronxcc-p52odp_y, output in /models/mistral-7b-v0.3-instruct-neuronx/context_encoding_model/_tp0_bk4 +2025-09-05T19:15:03Z INFO 1166 [pipeline.Pipeline.0]: Job Pipeline len(in_states) 1 +2025-09-05T19:15:03Z INFO 1166 [pipeline.Pipeline.0]: Processing input #0 +2025-09-05T19:15:03Z INFO 1166 [pipeline.Pipeline.0]: Running pipeline Pipeline.0 +2025-09-05T19:15:03Z INFO 1166 [pipeline.Pipeline.0]: Starting job job.HLOToTensorizer.0 +2025-09-05T19:15:03Z INFO 1166 [job.HLOToTensorizer.0]: Job HLOToTensorizer len(in_states) 1 +2025-09-05T19:15:03Z INFO 1166 [job.HLOToTensorizer.0]: Processing input #0 +2025-09-05T19:15:03Z INFO 1166 [job.HLOToTensorizer.0]: IR signature: 592a9cdc4c9b4697249af595e7e4e7ae477f80acdebaede8842f0734e5baf50e for model.MODULE_d342327da795afc2aa68+5e8b788a.hlo_module.pb +2025-09-05T19:15:03Z INFO 1166 [job.HLOToTensorizer.0]: Executing: /opt/conda/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo2penguin --input /models/mistral-7b-v0.3-instruct-neuronx/context_encoding_model/_tp0_bk4/model.MODULE_d342327da795afc2aa68+5e8b788a.hlo_module.pb --out-dir ./ --output penguin.py --remat --max-costly-ops=2 --max-live-in-size=5 --max-remat-chain-size=10 --max-mem-multiple=1.8 --min-def-use-distance=500 --remat-policy=transformer --allow-same-pass-remat=true --layers-per-module=1 --partition --emit-tensor-level-dropout-ops --modular-flow-mac-threshold=10 --verify-hlo=true --native-to-custom-softmax --partitioner-opts='--transformer' +2025-09-05T19:15:04Z INFO 1166 [job.HLOToTensorizer.0]: DEBUG: needsModular_PreSplit? Yes. macCnt 7696648634368 threshold 4398046511104 num non-trivial Ops 3259 +INFO: Number of Native SoftmaxDx's detected and replaced: 0 +INFO: Number of Native Softmax's detected and replaced: 34 + +Pre-Partition Pre-Opt Histogram: +total HLO instructions: 9335 + reshape 1992 21.34% ################################################################ + broadcast 1543 16.53% ################################################# + transpose 1129 12.09% #################################### + convert 1013 10.85% ################################ + constant 727 7.79% ####################### + slice 397 4.25% ############ + parameter 359 3.85% ########### + add 325 3.48% ########## + multiply 291 3.12% ######### + dot 290 3.11% ######### + get-tuple-element 263 2.82% ######## + select 227 2.43% ####### + compare 198 2.12% ###### + concatenate 132 1.41% #### + call 102 1.09% ### + tuple 65 0.70% ## + scatter 65 0.70% ## + negate 64 0.69% ## + all-reduce 64 0.69% ## + custom-call 34 0.36% # + divide 33 0.35% # + iota 7 0.07% + gather 6 0.06% + all-gather 3 0.03% + reduce 3 0.03% + sine 1 0.01% + cosine 1 0.01% + maximum 1 0.01% + +INFO: IoStatistics: total inputs: 359 +INFO: IoStatistics: total outputs: 65 +INFO: IoStatistics: total passthrough tensors: 0 +INFO: IoStatistics: total outputs read from: 0 +INFO: IoStatistics: total redundant outputs: 0 +INFO: IoStatistics: total ifmap size (KiB): 7602712 +INFO: IoStatistics: total ofmap size (KiB): 524288 +INFO: IoStatistics: total must-alias size (KiB): 524288 +INFO: IoStatistics: total may-alias size (KiB): 0 +INFO: HloMacCount has found 7696648503296 +INFO: Traffic has found 8888291037 +INFO: AIF 1731.86 + +Pre-Partition Post-Op Histogram: +total HLO instructions: 5713 + reshape 1397 24.45% ################################################################ + transpose 838 14.67% ###################################### + convert 756 13.23% ################################## + constant 403 7.05% ################## + broadcast 366 6.41% ################ + parameter 359 6.28% ################ + dot 289 5.06% ############# + multiply 195 3.41% ######## + add 195 3.41% ######## + custom-call 135 2.36% ###### + get-tuple-element 135 2.36% ###### + slice 131 2.29% ###### + concatenate 130 2.28% ##### + select 98 1.72% #### + compare 68 1.19% ### + scatter 65 1.14% ## + negate 64 1.12% ## + all-reduce 64 1.12% ## + gather 6 0.11% + iota 5 0.09% + all-gather 3 0.05% + reduce 3 0.05% + pad 2 0.04% + sine 1 0.02% + divide 1 0.02% + tuple 1 0.02% + maximum 1 0.02% + rng 1 0.02% + cosine 1 0.02% + +INFO: Found compute bound graph +DEBUG: needsModular_PreSplit? Yes. macCnt 7696648503296 threshold 4398046511104 num non-trivial Ops 2218 +DEBUG: transformer model +INFO: Partitioner configs:ModularFlow BO LBL SA ConcatGraphs: 1 MaxDisj:2 MaxSep:4 LPM:1 +INFO: Markers NOT detected +Potential split-points stats: #CC 67 #AR 64 #AG 3 #BN 0 nClamp 0 +DEBUG: needsModular_SplitFinder? Yes. +ModuleSplitter initial partitioning... #parts 67 +ModuleSplitter initial partitioning... Done. +INFO: Num of unique Module Definitions: 6 +DEBUG: DefMap: 0 1 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 65 66 +New disjoint wave: start 2 len 62 NumReps: 31 macs 7456063225856 +INFO: Attempting to identify and split optimizer at end +First non-zero-mac/used part from the end is 65 +Not enough zero-mac parts. skip +INFO: Optimized 0 all-reduce split instructions +INFO: Number of splitPoints: 33 +ModuleSplitter initial partitioning... #parts 33 +ModuleSplitter initial partitioning... Done. +Remat: gather-iota 0 matches, 0 ops rematted +INFO: Alias legality verification of partitions PASSED. +INFO: No transposable_weight_idx attrs found +INFO: Peak intermediate memory demand is at Partition 1. Num live intermediates at peak is 9 and memory usage is 72351748 bytes. +INFO: Please refer to LiveRangeReport_PostHloPart.txt for detailed intermediate lifetime info. +DEBUG: DefMap: 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 32 +Wrote HLO netlist to hlo_netlist.json +Wrote graph partitions in debug_info_hlo_partitions.json +Processing partition 0 +INFO: Number of Native SoftmaxDx's detected and replaced: 0 +INFO: Number of Native Softmax's detected and replaced: 0 +Replaced 0 dropout sequences with OffloadedDropout +INFO: HloMacCount has found 60129542144 +INFO: Traffic has found 273768742 +INFO: AIF 439.27 +HLO Ops used in computation: add all-gather all-reduce broadcast compare concatenate constant convert cosine custom-call dot gather get-tuple-element iota multiply negate parameter reshape scatter select sine slice transpose tuple +Invoking RemoveOptimizationBarriers pass +Processing partition 1 +INFO: Number of Native SoftmaxDx's detected and replaced: 0 +INFO: Number of Native Softmax's detected and replaced: 0 +Replaced 0 dropout sequences with OffloadedDropout +INFO: HloMacCount has found 240518168576 +INFO: Traffic has found 340869156 +INFO: AIF 1411.21 +HLO Ops used in computation: add all-reduce broadcast compare concatenate constant convert custom-call dot get-tuple-element multiply negate parameter reshape scatter select slice transpose tuple +Invoking RemoveOptimizationBarriers pass +Processing partition 2 +INFO: Number of Native SoftmaxDx's detected and replaced: 0 +INFO: Number of Native Softmax's detected and replaced: 0 +Replaced 0 dropout sequences with OffloadedDropout +INFO: HloMacCount has found 180455735296 +INFO: Traffic has found 360745035 +INFO: AIF 1000.46 +HLO Ops used in computation: add all-gather all-reduce broadcast compare concatenate constant convert custom-call divide dot gather get-tuple-element iota maximum multiply pad parameter reduce reshape rng scatter select slice transpose tuple +Invoking RemoveOptimizationBarriers pass + +2025-09-05T19:15:04Z INFO 1166 [job.HLOToTensorizer.0]: IR signature: bf07dad5917813daf757494aab7b0ee22029850b02b556444eb2043a265b4c62 for sg0000/HLOToTensorizer +2025-09-05T19:15:04Z INFO 1166 [job.HLOToTensorizer.0]: IR signature: fb76e04136f1f669dcc2f10b0451e90637062afb3c3b258df7aa58c4c0e54150 for sg0001/HLOToTensorizer +2025-09-05T19:15:04Z INFO 1166 [job.HLOToTensorizer.0]: IR signature: daae429ba70769f08ee27a2c9da1c835bf765295909c37620157070f06beaa1c for sg0002/HLOToTensorizer +2025-09-05T19:15:04Z INFO 1166 [job.HLOToTensorizer.0]: Job #0 finished +2025-09-05T19:15:04Z INFO 1166 [pipeline.Pipeline.0]: Finished job job.HLOToTensorizer.0 +2025-09-05T19:15:04Z INFO 1166 [pipeline.Pipeline.0]: Starting job job.Frontend.0 +2025-09-05T19:15:04Z INFO 1166 [job.Frontend.0]: Job Frontend len(in_states) 1 +2025-09-05T19:15:04Z INFO 1166 [job.Frontend.0]: Processing input #0 +2025-09-05T19:15:04Z INFO 1166 [job.Frontend.0]: Start model loading +2025-09-05T19:15:04Z INFO 1166 [job.Frontend.0]: Start tensorization +2025-09-05T19:15:04Z INFO 1166 [job.Frontend.0]: Num jobs: 32 +2025-09-05T19:15:04Z USER 1166 [root/Tensorizer/Tensorizer]: Running Tensorizer +2025-09-05T19:15:04Z INFO 1166 [Tensorizer]: Max workers: 3 +2025-09-05T19:15:04Z INFO 1617 [Tensorizer]: Building model from Penguin script "penguin.py.000000"... +2025-09-05T19:15:04Z INFO 1618 [Tensorizer]: Building model from Penguin script "penguin.py.000001"... +2025-09-05T19:15:04Z INFO 1619 [Tensorizer]: Building model from Penguin script "penguin.py.000002"... +2025-09-05T19:15:05Z INFO 1618 [Tensorizer]: Tensorizer options: --enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma --run-pg-layout-and-tiling --enable-dse-after-mask-propagation --disable-concat-delinearizer --num-neuroncores-per-sengine=1 --num-neuroncores-per-sengine=1 --internal_dynamic_dma_scratch_size_per_partition=16384 --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=none --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --hbm-scratchpad-page-size-in-bytes=536870912 --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --enable-tritium-loopfusion --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check --enable-dge-on-io-dma --enable-dge-on-indirect-dma --enable-dge-on-vector-indirect-dma --keep-rng-tensor-op +2025-09-05T19:15:05Z INFO 1619 [Tensorizer]: Tensorizer options: --enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma --run-pg-layout-and-tiling --enable-dse-after-mask-propagation --disable-concat-delinearizer --num-neuroncores-per-sengine=1 --num-neuroncores-per-sengine=1 --internal_dynamic_dma_scratch_size_per_partition=16384 --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=none --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --hbm-scratchpad-page-size-in-bytes=536870912 --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --enable-tritium-loopfusion --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check --enable-dge-on-io-dma --enable-dge-on-indirect-dma --enable-dge-on-vector-indirect-dma --keep-rng-tensor-op +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/DoNothing]: Running DoNothing +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/DoNothing]: Finished (changed=True) +2025-09-05T19:15:05Z INFO 1617 [Tensorizer]: Tensorizer options: --enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma --run-pg-layout-and-tiling --enable-dse-after-mask-propagation --disable-concat-delinearizer --num-neuroncores-per-sengine=1 --num-neuroncores-per-sengine=1 --internal_dynamic_dma_scratch_size_per_partition=16384 --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=none --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --hbm-scratchpad-page-size-in-bytes=536870912 --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --enable-tritium-loopfusion --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check --enable-dge-on-io-dma --enable-dge-on-indirect-dma --enable-dge-on-vector-indirect-dma --keep-rng-tensor-op +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/DoNothing]: Running DoNothing +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/DoNothing]: Finished (changed=True) +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/LegalizeOpLevelAlias]: Running LegalizeOpLevelAlias +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/LegalizeOpLevelAlias]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/LegalizeOpLevelAlias]: Running LegalizeOpLevelAlias +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/LegalizeOpLevelAlias]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/LegalizeOpLevelAlias]: LegalizeOpLevelAlias finished after 0.001 seconds +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/OptimizeAliasedCopyChain]: Running OptimizeAliasedCopyChain +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/OptimizeAliasedCopyChain]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/OptimizeAliasedCopyChain]: OptimizeAliasedCopyChain finished after 0.001 seconds +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.002 seconds +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/TransformConvOp]: Running TransformConvOp +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/TransformConvOp]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/TransformConvOp]: TransformConvOp finished after 0.003 seconds +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Running LegalizeOpLevelAlias +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/LowerTensorOp]: Finished (changed=True) +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/LegalizeOpLevelAlias]: LegalizeOpLevelAlias finished after 0.001 seconds +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Running OptimizeAliasedCopyChain +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: OptimizeAliasedCopyChain finished after 0.001 seconds +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.002 seconds +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/TransformConvOp]: Running TransformConvOp +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/TransformConvOp]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/TransformConvOp]: TransformConvOp finished after 0.003 seconds +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.011 seconds +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/LowerTensorOp]: Finished (changed=True) +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.012 seconds +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.005 seconds +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.042 seconds +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/TensorOpSimplifier]: Running TensorOpSimplifier +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/TensorOpSimplifier]: Finished (changed=True) +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.013 seconds +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.039 seconds +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/TensorOpSimplifier]: Running TensorOpSimplifier +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/TensorOpSimplifier]: Finished (changed=True) +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/TensorOpSimplifier]: TensorOpSimplifier finished after 0.007 seconds +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/CanonicalizeIR]: Finished (changed=True) +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.002 seconds +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/LegalizeCCOpLayout]: Finished (changed=True) +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.003 seconds +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/ResolveComplicatePredicates]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.002 seconds +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/AffinePredicateResolution]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.002 seconds +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.016 seconds +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.001 seconds +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/TensorOpSimplifier]: TensorOpSimplifier finished after 0.005 seconds +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/CanonicalizeIR]: Finished (changed=True) +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.002 seconds +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/LegalizeCCOpLayout]: Finished (changed=True) +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.002 seconds +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/ResolveComplicatePredicates]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.001 seconds +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/AffinePredicateResolution]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.015 seconds +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/LegalizeOpLevelAlias]: LegalizeOpLevelAlias finished after 0.001 seconds +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/OptimizeAliasedCopyChain]: Running OptimizeAliasedCopyChain +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/OptimizeAliasedCopyChain]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.001 seconds +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/OptimizeAliasedCopyChain]: OptimizeAliasedCopyChain finished after 0.001 seconds +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.002 seconds +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/TransformConvOp]: Running TransformConvOp +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/TransformConvOp]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/TransformConvOp]: TransformConvOp finished after 0.002 seconds +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/LowerTensorOp]: Finished (changed=True) +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.003 seconds +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.001 seconds +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/Simplifier]: Finished (changed=True) +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.011 seconds +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.012 seconds +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.006 seconds +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.020 seconds +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/TensorOpSimplifier]: Running TensorOpSimplifier +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/TensorOpSimplifier]: Finished (changed=True) +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/TCTransform]: Running TCTransform +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/TCTransform]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/TCTransform]: TCTransform finished after 0.001 seconds +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.001 seconds +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/ExpandBatchNorm]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.002 seconds +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/TCTransform]: Running TCTransform +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/TCTransform]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/TCTransform]: TCTransform finished after 0.001 seconds +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.002 seconds +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/TensorOpSimplifier]: TensorOpSimplifier finished after 0.006 seconds +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/CanonicalizeIR]: Finished (changed=True) +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.002 seconds +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/ExpandBatchNorm]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.002 seconds +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.002 seconds +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/LegalizeCCOpLayout]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.003 seconds +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/TensorOpTransform]: Finished (changed=True) +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 0.021 seconds +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/LateLowerTensorOp]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.002 seconds +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/TensorOpTransform]: Finished (changed=True) +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 0.026 seconds +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/LateLowerTensorOp]: Finished (changed=True) +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.007 seconds +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.005 seconds +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.020 seconds +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.002 seconds +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/ResolveComplicatePredicates]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.002 seconds +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/AffinePredicateResolution]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/MemcpyElimination]: Finished (changed=True) +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.001 seconds +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 0.026 seconds +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.004 seconds +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/LoopFusion]: Finished (changed=True) +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.001 seconds +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/LoopFusion]: LoopFusion finished after 0.015 seconds +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/Rematerialization]: Running Rematerialization +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/Rematerialization]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/Simplifier]: Finished (changed=True) +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/Rematerialization]: Rematerialization finished after 0.002 seconds +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.014 seconds +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/Simplifier]: Finished (changed=True) +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/TCTransform]: Running TCTransform +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/TCTransform]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/TCTransform]: TCTransform finished after 0.001 seconds +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.001 seconds +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/ExpandBatchNorm]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.002 seconds +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/TCTransform]: Running TCTransform +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/TCTransform]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.009 seconds +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/TCTransform]: TCTransform finished after 0.001 seconds +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/Delinearization]: Finished (changed=True) +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.009 seconds +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.008 seconds +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.159 seconds +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.004 seconds +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/DeadStoreElimination]: Finished (changed=True) +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.003 seconds +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.028 seconds +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/Simplifier]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.003 seconds +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/LICM]: Running LICM +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/Delinearization]: Finished (changed=True) +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/TensorOpTransform]: Finished (changed=True) +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.003 seconds +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 0.033 seconds +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/LateLowerTensorOp]: Finished (changed=True) +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/LoopFusion]: Finished (changed=True) +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.006 seconds +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/LoopFusion]: LoopFusion finished after 0.012 seconds +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/SimplifySlice]: Running SimplifySlice +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/SimplifySlice]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.002 seconds +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/LICM]: Running LICM +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.016 seconds +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.031 seconds +2025-09-05T19:15:05Z INFO 1618 [sg0001/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/MemcpyElimination]: Finished (changed=True) +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 0.135 seconds +2025-09-05T19:15:05Z INFO 1617 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/Simplifier]: Finished (changed=True) +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.009 seconds +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.004 seconds +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/LICM]: Running LICM +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/LICM]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/PadElimination]: Running PadElimination +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/PadElimination]: Finished (changed=False) +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/PadElimination]: PadElimination finished after 0.001 seconds +2025-09-05T19:15:05Z INFO 1619 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/Delinearization]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.003 seconds +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/LoopFusion]: Finished (changed=True) +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/LoopFusion]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.059 seconds +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/Rematerialization]: Running Rematerialization +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/Rematerialization]: Finished (changed=True) +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/LoopFusion]: LoopFusion finished after 0.005 seconds +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/Rematerialization]: Rematerialization finished after 0.005 seconds +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/Simplifier]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.004 seconds +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/LICM]: Running LICM +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.019 seconds +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/Delinearization]: Finished (changed=True) +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/MemcpyElimination]: Finished (changed=True) +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.002 seconds +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/TCTransform]: Running TCTransform +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/TCTransform]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 0.159 seconds +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/TCTransform]: TCTransform finished after 0.001 seconds +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.028 seconds +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.001 seconds +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/RecognizeOpIdiom]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/LoopFusion]: Finished (changed=True) +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.004 seconds +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/LoopFusion]: LoopFusion finished after 0.028 seconds +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/Rematerialization]: Running Rematerialization +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/Rematerialization]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.004 seconds +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.036 seconds +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.006 seconds +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/Recompute]: Running Recompute +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/Recompute]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.004 seconds +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/LICM]: Running LICM +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.003 seconds +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/LoopFusion]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/Recompute]: Recompute finished after 0.000 seconds +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.001 seconds +2025-09-05T19:15:06Z INFO 1619 [Tensorizer]: After optimization: 38 statements +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/DoNothing]: Running DoNothing +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/DoNothing]: Finished (changed=True) +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/MutateDataType]: Running MutateDataType +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/MutateDataType]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.006 seconds +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/SimplifySlice]: Running SimplifySlice +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/SimplifySlice]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.001 seconds +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/LICM]: Running LICM +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.008 seconds +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=True) +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.003 seconds +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/LICM]: Running LICM +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/LICM]: LICM finished after 0.001 seconds +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/PadElimination]: Running PadElimination +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/PadElimination]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/PadElimination]: PadElimination finished after 0.001 seconds +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.003 seconds +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/LoopFusion]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/Rematerialization]: Rematerialization finished after 0.002 seconds +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.006 seconds +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/Simplifier]: Finished (changed=True) +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/MutateDataType]: MutateDataType finished after 0.001 seconds +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/Simplifier]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.003 seconds +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/TileCCOps]: Running TileCCOps +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `All gather output tensor check failed` +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/TileCCOps]: in float32 (512,) %'all_gather.2' = AllGatherOp-153 AllGather_add(float32 (256,) %'add.11', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((512,), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.7175 | hlo_id: 103 | , id = 153 +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `multi_rank_size=2048 is not above min_allgather_tile_size_in_bytes=8388608` +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/TileCCOps]: in uint32 (512,) %'all_gather.3' = AllGatherOp-169 AllGather_add(uint32 (256,) %'add.12', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((512,), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.7310 | hlo_id: 112 | , id = 169 +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/TileCCOps]: Finished (changed=True) +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.010 seconds +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/TileCCOps]: TileCCOps finished after 0.007 seconds +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/Delinearization]: Finished (changed=True) +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.011 seconds +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.015 seconds +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/Delinearization]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.004 seconds +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/LICM]: Running LICM +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.003 seconds +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.039 seconds +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/Simplifier]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.002 seconds +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/TCTransform]: Finished (changed=True) +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.001 seconds +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.001 seconds +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/RecognizeOpIdiom]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.004 seconds +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=True) +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.005 seconds +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.010 seconds +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.005 seconds +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/LICM]: Running LICM +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/LICM]: Finished (changed=True) +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.028 seconds +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/Recompute]: Running Recompute +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/Recompute]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.002 seconds +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/LateLowerReshapeOp]: Running LateLowerReshapeOp +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/LateLowerReshapeOp]: Finished (changed=True) +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.004 seconds +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/LoopFusion]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/LateLowerReshapeOp]: LateLowerReshapeOp finished after 0.002 seconds +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/LoopFusion]: LoopFusion finished after 0.007 seconds +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/SimplifySlice]: Running SimplifySlice +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/SimplifySlice]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/InferIntrinsicOnCC]: Finished (changed=True) +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.001 seconds +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/LICM]: Running LICM +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/LICM]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/Recompute]: Recompute finished after 0.000 seconds +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/Simplifier]: Finished (changed=True) +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.012 seconds +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.002 seconds +2025-09-05T19:15:06Z INFO 1617 [Tensorizer]: After optimization: 30 statements +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/ResolveAccessConflict]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/MutateDataType]: Running MutateDataType +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/MutateDataType]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.027 seconds +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/LICM]: Running LICM +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.008 seconds +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/ValueNumbering]: Finished (changed=True) +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/MutateDataType]: MutateDataType finished after 0.002 seconds +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.004 seconds +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/LICM]: Running LICM +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/LICM]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/PadElimination]: Running PadElimination +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/PadElimination]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/PadElimination]: PadElimination finished after 0.001 seconds +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/LocalLayoutOpt]: Finished (changed=True) +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.004 seconds +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/LoopFusion]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.026 seconds +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/LoopFusion]: LoopFusion finished after 0.006 seconds +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/Simplifier]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.010 seconds +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.004 seconds +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/LICM]: Running LICM +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=True) +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/LICM]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.008 seconds +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/Delinearization]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.005 seconds +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.003 seconds +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/TCTransform]: Running TCTransform +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/TCTransform]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/TCTransform]: TCTransform finished after 0.001 seconds +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/LayoutPreprocessing]: Finished (changed=True) +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.045 seconds +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/TileCCOps]: Running TileCCOps +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.001 seconds +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/TileCCOps]: Finished (changed=True) +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/RecognizeOpIdiom]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/TileCCOps]: TileCCOps finished after 0.009 seconds +2025-09-05T19:15:06Z INFO 1617 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.004 seconds +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.004 seconds +2025-09-05T19:15:06Z INFO 1618 [sg0001/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 0.068 seconds +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.005 seconds +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 0.162 seconds +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/InferNonlocalTensors]: Running InferNonlocalTensors +2025-09-05T19:15:06Z INFO 1619 [sg0002/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.073 seconds +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/Recompute]: Running Recompute +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/Recompute]: Finished (changed=False) +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/Recompute]: Recompute finished after 0.000 seconds +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/InferNonlocalTensors]: Finished (changed=False) +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.001 seconds +2025-09-05T19:15:07Z INFO 1618 [Tensorizer]: After optimization: 29 statements +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/DoNothing]: Running DoNothing +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/DoNothing]: Finished (changed=True) +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/MutateDataType]: Running MutateDataType +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/MutateDataType]: Finished (changed=False) +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/MutateDataType]: MutateDataType finished after 0.001 seconds +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/InferNonlocalTensors]: InferNonlocalTensors finished after 0.026 seconds +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/ParAxesAnnotation]: Running ParAxesAnnotation +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/Simplifier]: Finished (changed=False) +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/LayoutSearchAlgorithm]: prefer_non_broadcast_par: True +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.090 seconds +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.035 seconds +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/TileCCOps]: Running TileCCOps +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.004 seconds +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/TileCCOps]: Finished (changed=True) +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/TileCCOps]: TileCCOps finished after 0.008 seconds +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.008 seconds +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=True) +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.002 seconds +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/LateLowerReshapeOp]: Running LateLowerReshapeOp +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/LateLowerReshapeOp]: Finished (changed=True) +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/ParAxesAnnotation]: Finished (changed=True) +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/LateLowerReshapeOp]: LateLowerReshapeOp finished after 0.001 seconds +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/ParAxesAnnotation]: ParAxesAnnotation finished after 0.074 seconds +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/InsertLocalTransposes]: Running InsertLocalTransposes +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/InsertLocalTransposes]: Finished (changed=True) +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/InsertLocalTransposes]: InsertLocalTransposes finished after 0.004 seconds +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/InferIntrinsicOnCC]: Finished (changed=False) +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 0.127 seconds +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.055 seconds +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/ResolveAccessConflict]: Finished (changed=False) +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.005 seconds +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.007 seconds +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/LICM]: Running LICM +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/CanonicalizeDAGForPGTiling]: Finished (changed=True) +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.007 seconds +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.008 seconds +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/PGTiling]: Running PGTiling +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/LocalLayoutOpt]: Finished (changed=True) +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.081 seconds +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.019 seconds +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.004 seconds +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 591 of IO tensor {'CrossPassTensor': ''}bfloat16 %input355|NC|(128, 32) is not sorted, index list (w/ AG ids): [(18, 'AG54'), (14, 'AG55')] +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.030 seconds +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 592 of IO tensor {'CrossPassTensor': ''}bfloat16 %input356|NHWC|(2, 28, 128, 32, 128) is not sorted, index list (w/ AG ids): [(18, 'AG54'), (14, 'AG55')] +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 593 of IO tensor {'CrossPassTensor': ''}bfloat16 %input354|NHWC|(2, 28, 128, 32, 128) is not sorted, index list (w/ AG ids): [(18, 'AG54'), (14, 'AG55')] +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 594 of IO tensor {'CrossPassTensor': ''}bfloat16 %input353(32, 2, 128, 28, 128) is not sorted, index list (w/ AG ids): [(11, 'AG60'), (17, 'AG58'), (12, 'AG59')] +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 595 of IO tensor {'CrossPassTensor': ''}bfloat16 %input358|NC|(128, 32) is not sorted, index list (w/ AG ids): [(18, 'AG54'), (14, 'AG55')] +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 596 of IO tensor {'CrossPassTensor': ''}bfloat16 %input357|NHWC|(128, 128, 32, 128) is not sorted, index list (w/ AG ids): [(15, 'AG66'), (14, 'AG55'), (16, 'AG65')] +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=True) +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 0.065 seconds +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/StaticTransposeLocalTensor]: Running StaticTransposeLocalTensor +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/StaticTransposeLocalTensor]: Finished (changed=True) +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.007 seconds +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/StaticTransposeLocalTensor]: StaticTransposeLocalTensor finished after 0.005 seconds +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/PComputeCutting]: Running PComputeCutting +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/PComputeCutting]: Finished (changed=True) +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/PComputeCutting]: PComputeCutting finished after 0.005 seconds +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/BFComputeCutting]: Running BFComputeCutting +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/BFComputeCutting]: Finished (changed=True) +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.010 seconds +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.001 seconds +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/LateLowerReshapeOp]: Running LateLowerReshapeOp +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/LateLowerReshapeOp]: Finished (changed=True) +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/LateLowerReshapeOp]: LateLowerReshapeOp finished after 0.002 seconds +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/InferIntrinsicOnCC]: Finished (changed=True) +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.011 seconds +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/ResolveAccessConflict]: Finished (changed=False) +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/BFComputeCutting]: BFComputeCutting finished after 0.002 seconds +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/LoopSplitting]: Running LoopSplitting +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/LoopSplitting]: Finished (changed=False) +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/LoopSplitting]: LoopSplitting finished after 0.001 seconds +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/MacroGeneration]: Running MacroGeneration +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.004 seconds +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/LICM]: Running LICM +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/LICM]: Finished (changed=True) +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.004 seconds +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/LayoutPreprocessing]: Finished (changed=True) +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 0.171 seconds +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/MacroGeneration]: Finished (changed=True) +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/LocalLayoutOpt]: Finished (changed=True) +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/MacroGeneration]: MacroGeneration finished after 0.073 seconds +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/PGTiling]: PGTiling finished after 0.268 seconds +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.008 seconds +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 0.201 seconds +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/InferNonlocalTensors]: Running InferNonlocalTensors +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/InsertIOTransposes]: Finished (changed=True) +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.046 seconds +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 0.028 seconds +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/InsertOffloadedTransposes]: Running InsertOffloadedTransposes +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/InsertOffloadedTransposes]: Finished (changed=False) +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/InsertOffloadedTransposes]: InsertOffloadedTransposes finished after 0.006 seconds +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/DramToDramTranspose]: Running DramToDramTranspose +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.048 seconds +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=True) +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.013 seconds +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/DramToDramTranspose]: Finished (changed=False) +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/DramToDramTranspose]: DramToDramTranspose finished after 0.038 seconds +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 0.823 seconds +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/TilingProfiler]: Running TilingProfiler +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.005 seconds +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/TilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/TilingBottleneck]: 7168: matmul_128x128x512 +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/TilingBottleneck]: 7168: matmul_128x128x512 +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/TilingBottleneck]: 7168: matmul_128x128x512 +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/TilingBottleneck]: 4096: matmul_128x128x1 +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/TilingBottleneck]: 512: transpose_128x128 +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/TilingBottleneck]: 512: transpose_128x128 +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/TilingBottleneck]: 512: transpose_128x128 +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/TilingBottleneck]: 224: simd128x512 +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/TilingBottleneck]: 128: rmsnorm128x512x128 +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/TilingBottleneck]: 128: simd128x512 +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/TilingBottleneck]: 128: rmsnorm128x512x128 +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/TilingBottleneck]: 4: reduce512x1x1 +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/TilingBottleneck]: 4: simd1x512 +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/TilingBottleneck]: 4: reduce512x1x1 +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/TilingBottleneck]: 2: indirect_load128x1 +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/TilingBottleneck]: 1: simd1x1 +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/TilingBottleneck]: 1: simd1x1 +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/TilingBottleneck]: 1: simd1x1 +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/TilingBottleneck]: 1: simd1x1 +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/TilingBottleneck]: 1: indirect_load32x128 +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/TilingProfiler]: Finished (changed=False) +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/LayoutPreprocessing]: Finished (changed=True) +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.016 seconds +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 0.093 seconds +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.007 seconds +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.010 seconds +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 0.120 seconds +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/InferNonlocalTensors]: Running InferNonlocalTensors +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/InferNeuronTensor]: Finished (changed=True) +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 0.082 seconds +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/InferNonlocalTensors]: Finished (changed=False) +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/InferNonlocalTensors]: InferNonlocalTensors finished after 0.034 seconds +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/ParAxesAnnotation]: Running ParAxesAnnotation +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.010 seconds +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/LICM]: Running LICM +2025-09-05T19:15:07Z INFO 1618 [sg0001/Tensorizer/LayoutSearchAlgorithm]: prefer_non_broadcast_par: True +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/RewriteReplicationMatmul]: Finished (changed=False) +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.003 seconds +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/InferNonlocalTensors]: Finished (changed=False) +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.006 seconds +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/SimplifyMacroPredicates]: Finished (changed=False) +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/InferNonlocalTensors]: InferNonlocalTensors finished after 0.379 seconds +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/ParAxesAnnotation]: Running ParAxesAnnotation +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.005 seconds +2025-09-05T19:15:07Z INFO 1619 [sg0002/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2025-09-05T19:15:07Z INFO 1617 [sg0000/Tensorizer/LayoutSearchAlgorithm]: prefer_non_broadcast_par: True +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/DataLocalityOpt]: Finished (changed=True) +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 0.113 seconds +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/PostDLOTilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 7168: matmul_128x128x512 +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 7168: matmul_128x128x512 +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 7168: matmul_128x128x512 +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 4096: matmul_128x128x1 +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 512: transpose_128x128 +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 512: transpose_128x128 +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 512: transpose_128x128 +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 448: dma128x512 +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 256: dma128x2048 +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 224: simd128x512 +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 128: rmsnorm128x512x128 +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 128: simd128x512 +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 128: rmsnorm128x512x128 +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 128: transpose_128x1 +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 112: dma128x2048 +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 112: dma128x2048 +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 64: dma128x1024 +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 4: reduce512x1x1 +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 4: simd1x512 +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 4: reduce512x1x1 +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/DMATilingProfiler]: Finished (changed=False) +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.046 seconds +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.008 seconds +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/LegalizeSundaMacro]: Finished (changed=True) +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.013 seconds +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.060 seconds +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.002 seconds +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.006 seconds +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/RewriteWeights]: Running RewriteWeights +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/RewriteWeights]: Finished (changed=True) +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.003 seconds +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/ReshapeWeights]: Finished (changed=True) +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.001 seconds +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-09-05T19:15:08Z INFO 1618 [sg0001/Tensorizer/ParAxesAnnotation]: Finished (changed=True) +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/FlattenMacroLoop]: Finished (changed=False) +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.003 seconds +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-09-05T19:15:08Z INFO 1618 [sg0001/Tensorizer/ParAxesAnnotation]: ParAxesAnnotation finished after 0.411 seconds +2025-09-05T19:15:08Z INFO 1618 [sg0001/Tensorizer/InsertLocalTransposes]: Running InsertLocalTransposes +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/SimplifyMacroPredicates]: Finished (changed=False) +2025-09-05T19:15:08Z INFO 1618 [sg0001/Tensorizer/InsertLocalTransposes]: Finished (changed=True) +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.036 seconds +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/InferInitValue]: Running InferInitValue +2025-09-05T19:15:08Z INFO 1618 [sg0001/Tensorizer/InsertLocalTransposes]: InsertLocalTransposes finished after 0.028 seconds +2025-09-05T19:15:08Z INFO 1618 [sg0001/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 0.470 seconds +2025-09-05T19:15:08Z INFO 1618 [sg0001/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-09-05T19:15:08Z INFO 1618 [sg0001/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/InferInitValue]: Finished (changed=True) +2025-09-05T19:15:08Z INFO 1618 [sg0001/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.004 seconds +2025-09-05T19:15:08Z INFO 1618 [sg0001/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2025-09-05T19:15:08Z INFO 1618 [sg0001/Tensorizer/CanonicalizeDAGForPGTiling]: Finished (changed=True) +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/InferInitValue]: InferInitValue finished after 0.031 seconds +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-09-05T19:15:08Z INFO 1618 [sg0001/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.004 seconds +2025-09-05T19:15:08Z INFO 1618 [sg0001/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-09-05T19:15:08Z INFO 1618 [sg0001/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=True) +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.009 seconds +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2025-09-05T19:15:08Z INFO 1618 [sg0001/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.006 seconds +2025-09-05T19:15:08Z INFO 1618 [sg0001/Tensorizer/PGTiling]: Running PGTiling +2025-09-05T19:15:08Z INFO 1618 [sg0001/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/SimplifyTensor]: Finished (changed=False) +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.043 seconds +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/LICM]: Running LICM +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/SundaISel]: Running SundaISel +2025-09-05T19:15:08Z INFO 1617 [sg0000/Tensorizer/ParAxesAnnotation]: Finished (changed=True) +2025-09-05T19:15:08Z INFO 1617 [sg0000/Tensorizer/ParAxesAnnotation]: ParAxesAnnotation finished after 0.577 seconds +2025-09-05T19:15:08Z INFO 1617 [sg0000/Tensorizer/InsertLocalTransposes]: Running InsertLocalTransposes +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/SundaISel]: Finished (changed=True) +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/SundaISel]: SundaISel finished after 0.095 seconds +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/NeuronAliasDependencyReset]: Running NeuronAliasDependencyReset +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-09-05T19:15:08Z INFO 1617 [sg0000/Tensorizer/InsertLocalTransposes]: Finished (changed=True) +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/NeuronAliasDependencyInduction]: Running NeuronAliasDependencyInduction +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/NeuronAliasDependencyInduction]: Finished (changed=False) +2025-09-05T19:15:08Z INFO 1618 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 629 of IO tensor {'CrossPassTensor': ''}bfloat16 %input76|NC|(128, 32) is not sorted, index list (w/ AG ids): [(16, 'AG104'), (12, 'AG105')] +2025-09-05T19:15:08Z INFO 1618 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 630 of IO tensor {'CrossPassTensor': ''}bfloat16 %input77|NHWC|(2, 28, 128, 32, 128) is not sorted, index list (w/ AG ids): [(16, 'AG104'), (12, 'AG105')] +2025-09-05T19:15:08Z INFO 1618 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 631 of IO tensor {'CrossPassTensor': ''}bfloat16 %input75|NHWC|(2, 28, 128, 32, 128) is not sorted, index list (w/ AG ids): [(16, 'AG104'), (12, 'AG105')] +2025-09-05T19:15:08Z INFO 1618 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 632 of IO tensor {'CrossPassTensor': ''}bfloat16 %input74(32, 2, 128, 28, 128) is not sorted, index list (w/ AG ids): [(8, 'AG110'), (15, 'AG108'), (9, 'AG109')] +2025-09-05T19:15:08Z INFO 1618 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 633 of IO tensor {'CrossPassTensor': ''}bfloat16 %input80|NC|(128, 32) is not sorted, index list (w/ AG ids): [(16, 'AG104'), (12, 'AG105')] +2025-09-05T19:15:08Z INFO 1618 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 634 of IO tensor {'CrossPassTensor': ''}bfloat16 %input82(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(16, 'AG104'), (12, 'AG105')] +2025-09-05T19:15:08Z INFO 1618 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 641 of IO tensor {'CrossPassTensor': ''}bfloat16 %input81(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(16, 'AG104'), (12, 'AG105')] +2025-09-05T19:15:08Z INFO 1618 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 644 of IO tensor {'CrossPassTensor': ''}bfloat16 %input79|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(16, 'AG104'), (12, 'AG105')] +2025-09-05T19:15:08Z INFO 1618 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 423 of IO tensor {'CrossPassTensor': ''}bfloat16 %input78(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(3, 'AG120'), (1, 'AG116'), (2, 'AG115'), (4, 'AG119'), (5, 'AG118')] +2025-09-05T19:15:08Z INFO 1617 [sg0000/Tensorizer/InsertLocalTransposes]: InsertLocalTransposes finished after 0.014 seconds +2025-09-05T19:15:08Z INFO 1617 [sg0000/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 0.608 seconds +2025-09-05T19:15:08Z INFO 1617 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-09-05T19:15:08Z INFO 1617 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=True) +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/NeuronAliasDependencyInduction]: NeuronAliasDependencyInduction finished after 0.000 seconds +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/NeuronAliasDependencyReset]: NeuronAliasDependencyReset finished after 0.029 seconds +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/LowerComplexBroadcast]: Running LowerComplexBroadcast +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/LowerComplexBroadcast]: Finished (changed=False) +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/LowerComplexBroadcast]: LowerComplexBroadcast finished after 0.003 seconds +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/NeuronLoopInterchange]: Finished (changed=True) +2025-09-05T19:15:08Z INFO 1618 [sg0001/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 0.161 seconds +2025-09-05T19:15:08Z INFO 1618 [sg0001/Tensorizer/StaticTransposeLocalTensor]: Running StaticTransposeLocalTensor +2025-09-05T19:15:08Z INFO 1617 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.006 seconds +2025-09-05T19:15:08Z INFO 1617 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2025-09-05T19:15:08Z INFO 1617 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Finished (changed=True) +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.002 seconds +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-09-05T19:15:08Z INFO 1618 [sg0001/Tensorizer/StaticTransposeLocalTensor]: Finished (changed=True) +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-09-05T19:15:08Z INFO 1617 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.014 seconds +2025-09-05T19:15:08Z INFO 1617 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-09-05T19:15:08Z INFO 1617 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=True) +2025-09-05T19:15:08Z INFO 1618 [sg0001/Tensorizer/StaticTransposeLocalTensor]: StaticTransposeLocalTensor finished after 0.036 seconds +2025-09-05T19:15:08Z INFO 1618 [sg0001/Tensorizer/PComputeCutting]: Running PComputeCutting +2025-09-05T19:15:08Z INFO 1617 [sg0000/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.007 seconds +2025-09-05T19:15:08Z INFO 1617 [sg0000/Tensorizer/PGTiling]: Running PGTiling +2025-09-05T19:15:08Z INFO 1617 [sg0000/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2025-09-05T19:15:08Z INFO 1618 [sg0001/Tensorizer/PComputeCutting]: Finished (changed=True) +2025-09-05T19:15:08Z INFO 1618 [sg0001/Tensorizer/PComputeCutting]: PComputeCutting finished after 0.009 seconds +2025-09-05T19:15:08Z INFO 1618 [sg0001/Tensorizer/BFComputeCutting]: Running BFComputeCutting +2025-09-05T19:15:08Z INFO 1618 [sg0001/Tensorizer/BFComputeCutting]: Finished (changed=True) +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.002 seconds +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/NeuronLoopFusion]: Finished (changed=True) +2025-09-05T19:15:08Z INFO 1618 [sg0001/Tensorizer/BFComputeCutting]: BFComputeCutting finished after 0.005 seconds +2025-09-05T19:15:08Z INFO 1618 [sg0001/Tensorizer/LoopSplitting]: Running LoopSplitting +2025-09-05T19:15:08Z INFO 1618 [sg0001/Tensorizer/LoopSplitting]: Finished (changed=False) +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 0.009 seconds +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.001 seconds +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-09-05T19:15:08Z INFO 1618 [sg0001/Tensorizer/LoopSplitting]: LoopSplitting finished after 0.001 seconds +2025-09-05T19:15:08Z INFO 1618 [sg0001/Tensorizer/MacroGeneration]: Running MacroGeneration +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.007 seconds +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/FactorizeBlkDims]: Finished (changed=True) +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.011 seconds +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-09-05T19:15:08Z INFO 1617 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 662 of IO tensor {'CrossPassTensor': ''}bfloat16 %input71|NC|(128, 2, 2, 8) is not sorted, index list (w/ AG ids): [(20, 'AG113'), (14, 'AG116'), (10, 'AG115'), (17, 'AG114')] +2025-09-05T19:15:08Z INFO 1617 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 663 of IO tensor {'CrossPassTensor': ''}bfloat16 %input73(4, 4, 128, 2, 2, 8, 2, 64) is not sorted, index list (w/ AG ids): [(20, 'AG113'), (14, 'AG116'), (10, 'AG115'), (17, 'AG114')] +2025-09-05T19:15:08Z INFO 1617 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 666 of IO tensor {'CrossPassTensor': ''}bfloat16 %input72(4, 128, 2, 2, 8, 2, 64) is not sorted, index list (w/ AG ids): [(20, 'AG113'), (14, 'AG116'), (10, 'AG115'), (17, 'AG114')] +2025-09-05T19:15:08Z INFO 1617 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 669 of IO tensor {'CrossPassTensor': ''}bfloat16 %input70|NHWC|(4, 128, 2, 2, 8, 128) is not sorted, index list (w/ AG ids): [(20, 'AG113'), (14, 'AG116'), (10, 'AG115'), (17, 'AG114')] +2025-09-05T19:15:08Z INFO 1617 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 435 of IO tensor {'CrossPassTensor': ''}bfloat16 %input69(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(11, 'AG128'), (8, 'AG124'), (9, 'AG123'), (13, 'AG127'), (16, 'AG126')] +2025-09-05T19:15:08Z INFO 1617 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 658 of IO tensor non_local bfloat16 %all_gather.1(2, 2, 8, 128, 2, 1024) is not sorted, index list (w/ AG ids): [(10, 'AG115'), (14, 'AG116'), (17, 'AG114'), (20, 'AG113'), (3, 'AG118')] +2025-09-05T19:15:08Z INFO 1617 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 545 of IO tensor {'IntermediateTensor': ''}bfloat16 %intermediate1(2, 1024, 2, 2, 8, 128) is not sorted, index list (w/ AG ids): [(3, 'AG118'), (14, 'AG116'), (10, 'AG115'), (17, 'AG114'), (20, 'AG113')] +2025-09-05T19:15:08Z INFO 1617 [sg0000/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 0.136 seconds +2025-09-05T19:15:08Z INFO 1617 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Running StaticTransposeLocalTensor +2025-09-05T19:15:08Z INFO 1617 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Finished (changed=True) +2025-09-05T19:15:08Z INFO 1617 [sg0000/Tensorizer/StaticTransposeLocalTensor]: StaticTransposeLocalTensor finished after 0.005 seconds +2025-09-05T19:15:08Z INFO 1617 [sg0000/Tensorizer/PComputeCutting]: Running PComputeCutting +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.064 seconds +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/NeuronValueNumbering]: Finished (changed=True) +2025-09-05T19:15:08Z INFO 1617 [sg0000/Tensorizer/PComputeCutting]: Finished (changed=True) +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.003 seconds +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-09-05T19:15:08Z INFO 1617 [sg0000/Tensorizer/PComputeCutting]: PComputeCutting finished after 0.016 seconds +2025-09-05T19:15:08Z INFO 1617 [sg0000/Tensorizer/BFComputeCutting]: Running BFComputeCutting +2025-09-05T19:15:08Z INFO 1617 [sg0000/Tensorizer/BFComputeCutting]: Finished (changed=True) +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.004 seconds +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/VectorizeDMA]: Finished (changed=False) +2025-09-05T19:15:08Z INFO 1618 [sg0001/Tensorizer/MacroGeneration]: Finished (changed=True) +2025-09-05T19:15:08Z INFO 1617 [sg0000/Tensorizer/BFComputeCutting]: BFComputeCutting finished after 0.006 seconds +2025-09-05T19:15:08Z INFO 1617 [sg0000/Tensorizer/LoopSplitting]: Running LoopSplitting +2025-09-05T19:15:08Z INFO 1617 [sg0000/Tensorizer/LoopSplitting]: Finished (changed=False) +2025-09-05T19:15:08Z INFO 1617 [sg0000/Tensorizer/LoopSplitting]: LoopSplitting finished after 0.001 seconds +2025-09-05T19:15:08Z INFO 1617 [sg0000/Tensorizer/MacroGeneration]: Running MacroGeneration +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.002 seconds +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.001 seconds +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/LegalizePartitionReduce]: Finished (changed=False) +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.001 seconds +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/DeConcat]: Running DeConcat +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/DeConcat]: Finished (changed=False) +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/DeConcat]: DeConcat finished after 0.001 seconds +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/FactorizeThreadAxesInFreeDims]: Running FactorizeThreadAxesInFreeDims +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/FactorizeThreadAxesInFreeDims]: Finished (changed=False) +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/FactorizeThreadAxesInFreeDims]: FactorizeThreadAxesInFreeDims finished after 0.001 seconds +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2025-09-05T19:15:08Z INFO 1618 [sg0001/Tensorizer/MacroGeneration]: MacroGeneration finished after 0.131 seconds +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/PartialSimdFusion]: Finished (changed=True) +2025-09-05T19:15:08Z INFO 1618 [sg0001/Tensorizer/PGTiling]: PGTiling finished after 0.563 seconds +2025-09-05T19:15:08Z INFO 1618 [sg0001/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.011 seconds +2025-09-05T19:15:08Z INFO 1619 [sg0002/Tensorizer/TritiumFusion]: Running TritiumFusion +2025-09-05T19:15:08Z INFO 1618 [sg0001/Tensorizer/InsertIOTransposes]: Finished (changed=True) +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 0.073 seconds +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/InsertOffloadedTransposes]: Running InsertOffloadedTransposes +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/InsertOffloadedTransposes]: Finished (changed=False) +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/MacroGeneration]: Finished (changed=True) +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/InsertOffloadedTransposes]: InsertOffloadedTransposes finished after 0.007 seconds +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/DramToDramTranspose]: Running DramToDramTranspose +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/TritiumFusion]: Finished (changed=True) +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/MacroGeneration]: MacroGeneration finished after 0.173 seconds +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/PGTiling]: PGTiling finished after 0.418 seconds +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/TritiumFusion]: TritiumFusion finished after 0.095 seconds +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/InsertIOTransposes]: Finished (changed=True) +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 0.057 seconds +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/InsertOffloadedTransposes]: Running InsertOffloadedTransposes +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/InsertOffloadedTransposes]: Finished (changed=False) +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.048 seconds +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/InsertOffloadedTransposes]: InsertOffloadedTransposes finished after 0.003 seconds +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/DramToDramTranspose]: Running DramToDramTranspose +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/VectorizeMatMult]: Finished (changed=False) +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/DramToDramTranspose]: Finished (changed=False) +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.009 seconds +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/DramToDramTranspose]: DramToDramTranspose finished after 0.097 seconds +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/PartialLoopFusion]: Finished (changed=True) +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 1.527 seconds +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/TilingProfiler]: Running TilingProfiler +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 0.014 seconds +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.006 seconds +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/DramToDramTranspose]: Finished (changed=False) +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/TilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/TilingBottleneck]: 7168: matmul_128x128x512 +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/TilingBottleneck]: 7168: matmul_128x128x512 +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/TilingBottleneck]: 7168: matmul_128x128x512 +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/TilingBottleneck]: 4096: transpose_128x128 +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/TilingBottleneck]: 2048: matmul_128x128x512 +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/DramToDramTranspose]: DramToDramTranspose finished after 0.078 seconds +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/TilingBottleneck]: 2048: matmul_128x128x512 +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/TilingBottleneck]: 1024: matmul_128x128x512 +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/TilingBottleneck]: 1024: softmax512x1x128 +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/TilingBottleneck]: 1024: matmul_128x128x512 +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/TilingBottleneck]: 512: transpose_128x128 +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/TilingBottleneck]: 512: transpose_128x128 +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/TilingBottleneck]: 512: transpose_128x128 +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/TilingBottleneck]: 512: matmul_128x128x512 +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/TilingBottleneck]: 512: matmul_128x128x512 +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/TilingBottleneck]: 224: simd128x512 +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/TilingBottleneck]: 128: rmsnorm128x512x128 +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/TilingBottleneck]: 128: simd128x512 +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/TilingBottleneck]: 128: rmsnorm128x512x128 +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/TilingBottleneck]: 64: simd64x512 +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/TilingBottleneck]: 64: simd64x512 +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 1.893 seconds +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/TilingProfiler]: Running TilingProfiler +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/TilingProfiler]: Finished (changed=False) +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/TilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/TilingBottleneck]: 4096: transpose_128x128 +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/TilingBottleneck]: 2048: matmul_128x128x512 +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/TilingBottleneck]: 2048: matmul_128x128x512 +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/TilingBottleneck]: 1024: matmul_128x128x512 +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/TilingBottleneck]: 1024: softmax512x1x128 +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/TilingBottleneck]: 1024: matmul_128x128x512 +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/TilingBottleneck]: 512: transpose_128x128 +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/TilingBottleneck]: 512: matmul_128x128x512 +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/TilingBottleneck]: 512: matmul_128x128x512 +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/TilingBottleneck]: 256: transpose_128x128 +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/TilingBottleneck]: 256: transpose_128x128 +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/TilingBottleneck]: 256: transpose_128x128 +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/TilingBottleneck]: 128: simd128x512 +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/TilingBottleneck]: 128: rmsnorm128x512x128 +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/TilingBottleneck]: 64: simd128x512 +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/TilingBottleneck]: 64: indirect_load128x512 +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/TilingBottleneck]: 64: simd128x256 +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/TilingBottleneck]: 64: simd128x256 +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/TilingBottleneck]: 64: simd128x512 +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/TilingBottleneck]: 64: transpose_128x128 +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.075 seconds +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/TilingProfiler]: Finished (changed=False) +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.016 seconds +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/LowerTranspose]: Finished (changed=True) +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.082 seconds +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.013 seconds +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.022 seconds +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.042 seconds +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/LateNeuronInstComb]: Finished (changed=True) +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.011 seconds +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/SplitAccGrp]: Finished (changed=False) +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.002 seconds +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/SpillPSum]: Running SpillPSum +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/InferNeuronTensor]: Finished (changed=True) +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 0.085 seconds +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/SpillPSum]: Finished (changed=True) +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/SpillPSum]: SpillPSum finished after 0.065 seconds +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.050 seconds +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/LICM]: Running LICM +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/InferNeuronTensor]: Finished (changed=True) +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/RewriteReplicationMatmul]: Finished (changed=False) +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 0.147 seconds +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.002 seconds +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.013 seconds +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/LICM]: Running LICM +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/LICM]: Finished (changed=True) +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.008 seconds +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/LICM]: LICM finished after 0.005 seconds +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/RewriteReplicationMatmul]: Finished (changed=False) +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.002 seconds +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.009 seconds +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/SimplifyMacroPredicates]: Finished (changed=False) +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.049 seconds +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.117 seconds +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/InlineNativeKernels]: Running InlineNativeKernels +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/InlineNativeKernels]: Finished (changed=False) +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.005 seconds +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/InlineNativeKernels]: InlineNativeKernels finished after 0.003 seconds +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/LegalizeType]: Running LegalizeType +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/LegalizeType]: Finished (changed=True) +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/LegalizeType]: LegalizeType finished after 0.047 seconds +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.015 seconds +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/InferPSumTensor]: Finished (changed=True) +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.084 seconds +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.003 seconds +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/DataLocalityOpt]: Finished (changed=True) +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 0.206 seconds +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/PostDLOTilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 4096: transpose_128x128 +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 2048: matmul_128x128x512 +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 2048: matmul_128x128x512 +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1024: matmul_128x128x512 +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1024: softmax512x1x128 +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1024: matmul_128x128x512 +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 512: transpose_128x128 +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 512: matmul_128x128x512 +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 512: matmul_128x128x512 +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 256: transpose_128x128 +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 256: transpose_128x128 +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 256: transpose_128x128 +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 128: simd128x512 +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 128: rmsnorm128x512x128 +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 128: dma128x512 +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 64: simd128x512 +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 64: indirect_load128x512 +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 64: dma128x1024 +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 64: dma128x1024 +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 64: simd128x256 +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/DMATilingProfiler]: Finished (changed=False) +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.005 seconds +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.025 seconds +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/RelaxPredicates]: Finished (changed=False) +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.004 seconds +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/TensorInitialization]: Running TensorInitialization +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/TensorInitialization]: Finished (changed=True) +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.007 seconds +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.014 seconds +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.003 seconds +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/DataLocalityOpt]: Finished (changed=True) +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/LegalizeSundaMacro]: Finished (changed=True) +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.005 seconds +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 0.277 seconds +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/PostDLOTilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 7168: matmul_128x128x512 +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 7168: matmul_128x128x512 +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 7168: matmul_128x128x512 +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 4096: transpose_128x128 +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 2048: matmul_128x128x512 +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 2048: matmul_128x128x512 +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 1024: matmul_128x128x512 +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 1024: softmax512x1x128 +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 1024: matmul_128x128x512 +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 512: transpose_128x128 +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 512: transpose_128x128 +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 512: transpose_128x128 +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 512: matmul_128x128x512 +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 512: matmul_128x128x512 +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 448: dma128x512 +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 224: simd128x512 +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 128: rmsnorm128x512x128 +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 128: simd128x512 +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 128: rmsnorm128x512x128 +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 128: dma128x512 +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/DMATilingProfiler]: Finished (changed=False) +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.006 seconds +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.011 seconds +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.015 seconds +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/LegalizeSundaMacro]: Finished (changed=True) +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.012 seconds +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.036 seconds +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.003 seconds +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.016 seconds +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.011 seconds +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/RewriteWeights]: Running RewriteWeights +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/RewriteWeights]: Finished (changed=True) +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.003 seconds +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.005 seconds +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/ReshapeWeights]: Finished (changed=True) +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.001 seconds +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.011 seconds +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/RewriteWeights]: Running RewriteWeights +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.005 seconds +2025-09-05T19:15:09Z INFO 1617 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/RewriteWeights]: Finished (changed=True) +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.024 seconds +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2025-09-05T19:15:09Z INFO 1618 [sg0001/Tensorizer/ReshapeWeights]: Finished (changed=True) +2025-09-05T19:15:09Z INFO 1619 [sg0002/Tensorizer/SimplifyNeuronTensor]: Finished (changed=True) +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.001 seconds +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.032 seconds +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/InferInitValue]: Running InferInitValue +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.005 seconds +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/SimplifyMacroPredicates]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1619 [sg0002/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.181 seconds +2025-09-05T19:15:10Z INFO 1619 [sg0002/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-09-05T19:15:10Z INFO 1619 [sg0002/Tensorizer/DMALocalityOpt]: Finished (changed=True) +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.008 seconds +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/InferInitValue]: Running InferInitValue +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/InferInitValue]: Finished (changed=True) +2025-09-05T19:15:10Z INFO 1619 [sg0002/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.002 seconds +2025-09-05T19:15:10Z INFO 1619 [sg0002/Tensorizer/DataStreaming]: Running DataStreaming +2025-09-05T19:15:10Z INFO 1619 [sg0002/Tensorizer/DataStreaming]: Finished (changed=True) +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/InferInitValue]: InferInitValue finished after 0.042 seconds +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-09-05T19:15:10Z INFO 1619 [sg0002/Tensorizer/DataStreaming]: DataStreaming finished after 0.008 seconds +2025-09-05T19:15:10Z INFO 1619 [sg0002/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.014 seconds +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/SimplifyTensor]: Finished (changed=True) +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.007 seconds +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/LICM]: Running LICM +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/InferInitValue]: Finished (changed=True) +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/SundaISel]: Running SundaISel +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/InferInitValue]: InferInitValue finished after 0.058 seconds +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.015 seconds +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/SimplifyTensor]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.007 seconds +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/LICM]: Running LICM +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/LICM]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/SundaISel]: Finished (changed=True) +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/SundaISel]: Running SundaISel +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/SundaISel]: SundaISel finished after 0.051 seconds +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/NeuronAliasDependencyReset]: Running NeuronAliasDependencyReset +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Running NeuronAliasDependencyInduction +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: NeuronAliasDependencyInduction finished after 0.000 seconds +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/NeuronAliasDependencyReset]: NeuronAliasDependencyReset finished after 0.028 seconds +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/LowerComplexBroadcast]: Running LowerComplexBroadcast +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/LowerComplexBroadcast]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/LowerComplexBroadcast]: LowerComplexBroadcast finished after 0.003 seconds +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=True) +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.003 seconds +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/SundaISel]: Finished (changed=True) +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/SundaISel]: SundaISel finished after 0.067 seconds +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/NeuronAliasDependencyReset]: Running NeuronAliasDependencyReset +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.012 seconds +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/NeuronAliasDependencyInduction]: Running NeuronAliasDependencyInduction +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/NeuronAliasDependencyInduction]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/NeuronAliasDependencyInduction]: NeuronAliasDependencyInduction finished after 0.000 seconds +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/NeuronLoopFusion]: Finished (changed=True) +2025-09-05T19:15:10Z INFO 1619 [sg0002/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/NeuronAliasDependencyReset]: NeuronAliasDependencyReset finished after 0.023 seconds +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/LowerComplexBroadcast]: Running LowerComplexBroadcast +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/LowerComplexBroadcast]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1619 [sg0002/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.196 seconds +2025-09-05T19:15:10Z INFO 1619 [sg0002/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 0.018 seconds +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1619 [sg0002/Tensorizer/LateLegalizeInst]: Finished (changed=True) +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/LowerComplexBroadcast]: LowerComplexBroadcast finished after 0.002 seconds +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/NeuronLoopInterchange]: Finished (changed=True) +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.002 seconds +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.003 seconds +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.002 seconds +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2025-09-05T19:15:10Z INFO 1619 [sg0002/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.023 seconds +2025-09-05T19:15:10Z INFO 1619 [sg0002/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-09-05T19:15:10Z INFO 1619 [sg0002/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.015 seconds +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-09-05T19:15:10Z INFO 1619 [sg0002/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.005 seconds +2025-09-05T19:15:10Z INFO 1619 [sg0002/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-09-05T19:15:10Z INFO 1619 [sg0002/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1619 [sg0002/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.003 seconds +2025-09-05T19:15:10Z INFO 1619 [sg0002/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-09-05T19:15:10Z INFO 1619 [sg0002/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-09-05T19:15:10Z INFO 1619 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 2.427ms (112.000MiB, est bw: 48.383GB/s, 41.581% of tot. time) for bfloat16<128 x 128> TongaSB partitions[6] bfloat16 (2, 2, 2, 2, 2, 28, 128, 512) %1075[i16_0_768,i15_0_0_768_0_0_1074,i15_0_0_1,i15_0_0_0_1,c1_756,c2_757,i0.128,i1.128+128p_1409] = load bfloat16<128 x 128> {'CrossPassTensor': ''}bfloat16 (8, 4, 2, 128, 28, 128) %'input353'[4i15_0_0_768_0_0_1074+i15_0_0_1+2i15_0_0_0_1,p_1409,c1_756,i0.128,c2_757,i1.128] # id=946, src_id=None, , instances=3584 # dl = tensor_op_name: _dot.228 | hlo_id: 59 | [[i0.128];[i1.128]] -> [[i0.128];[i1.128]] +2025-09-05T19:15:10Z INFO 1619 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 1.137ms (224.000MiB, est bw: 206.489GB/s, 19.486% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[5] bfloat16 (2, 2, 2, 28, 2, 128, 2048) %1076[i11_0,i11_1_0,i10_0_0,i10_0_1,c2_737,i0.128,i1.2048] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 28, 128, 4096) %'input354'[i10_0_0,i10_0_1,i0.128,i1.2048+2048c2_737] # id=937, src_id=None, , instances=448 # dl = tensor_op_name: _dot.226 | hlo_id: 49 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-09-05T19:15:10Z INFO 1619 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 1.137ms (224.000MiB, est bw: 206.489GB/s, 19.486% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[5] bfloat16 (2, 2, 2, 28, 2, 128, 2048) %1073[i16_0_768,i13_1_0,i12_0_0,i12_0_1,c2_747,i0.128,i1.2048] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 28, 128, 4096) %'input356'[i12_0_0,i12_0_1,i0.128,i1.2048+2048c2_747] # id=940, src_id=None, , instances=448 # dl = tensor_op_name: _dot.227 | hlo_id: 40 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-09-05T19:15:10Z INFO 1619 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 650.557us (128.000MiB, est bw: 206.312GB/s, 11.144% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[2] bfloat16 (128, 2, 128, 2048) %'input357_local_788'[4i31_0_0+i31_0_1,i30_0_0_792,i0.128,i1.2048] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (128, 128, 4096) %'input357'[4i31_0_0+i31_0_1,i0.128,i1.2048+2048i30_0_0_792] # id=978, src_id=None, , instances=256 # dl = tensor_op_name: _dot.229 | hlo_id: 92 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-09-05T19:15:10Z INFO 1619 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 82.457us (16.000MiB, est bw: 203.466GB/s, 1.413% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[4] bfloat16 (2, 2, 4, 2, 128, 2048) %'693.1388'[i11_0,i11_1_0,T_i2_0,T_i3_0_1713,i0.128,i1.2048] = load bfloat16<128 x 2048> non_local bfloat16 (2, 2, 4, 128, 4096) %'add.9'[i11_0,i11_1_0,T_i2_0,i0.128,i1.2048+2048T_i3_0_1713] # id=1079, src_id=None, , instances=32 # dl = tensor_op_name: add.9_pftranspose_693 | hlo_id: 27 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-09-05T19:15:10Z INFO 1619 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 82.457us (16.000MiB, est bw: 203.466GB/s, 1.413% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[4] bfloat16 (2, 2, 2, 4, 128, 2048) %'_reload_1069'[i16_0_768,i13_1_0,i4_0_601_0_1072,i4_0_601_1_1072_0,i0.128,i1.2048] = load bfloat16<128 x 2048> DRAM3DBlk partitions[4] bfloat16 (2, 4, 2, 2, 128, 2048) %'_spill_1066'[i4_0_601_0_1072,i4_0_601_1_1072_0,i16_0_768,i13_1_0,i0.128,i1.2048] # id=1071, src_id=None, , instances=32 # dl = tensor_op_name: _dot.227 | hlo_id: 40 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-09-05T19:15:10Z INFO 1619 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 82.457us (16.000MiB, est bw: 203.466GB/s, 1.413% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[4] bfloat16 (2, 2, 4, 2, 128, 2048) %'697.1393'[T_i20_0_705,T_i20_1_0_705,T_i2_0,T_i3_0_1714,i0.128,i1.2048] = load bfloat16<128 x 2048> DRAM2DBlk partitions[1] bfloat16 (2, 1, 2, 4, 128, 4096) %'all_reduce.3'[T_i20_0_705,0,T_i20_1_0_705,T_i2_0,i0.128,i1.2048+2048T_i3_0_1714] # id=1088, src_id=None, , instances=32 # dl = tensor_op_name: all_reduce.3_pftranspose_697 | hlo_id: 62 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-09-05T19:15:10Z INFO 1619 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 49.765us (16.000MiB, est bw: 337.130GB/s, 0.852% of tot. time) for bfloat16<128 x 2048> DRAM3DBlk partitions[4] bfloat16 (2, 4, 2, 2, 128, 2048) %'_spill_1066'[i2_0_0_1144,i2_0_1_1144_0,i11_0,i11_1_0,i0.128,i1.2048] = store bfloat16<128 x 2048> TongaSB partitions[4] bfloat16 (2, 2, 2, 4, 128, 2048) %712[i11_0,i11_1_0,i2_0_0_1144,i2_0_1_1144_0,i0.128,i1.2048] # id=1068, src_id=None, , instances=32 # dl = tensor_op_name: _custom-call.230 | hlo_id: 33 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-09-05T19:15:10Z INFO 1619 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 49.765us (16.000MiB, est bw: 337.130GB/s, 0.852% of tot. time) for bfloat16<128 x 2048> DRAM2DBlk partitions[1] bfloat16 (2, 1, 8, 128, 4096) %'dot.14'[i16_0_768,0,4i16_1_0_0_768_1074+i16_1_0_1_768_1074,i0.128,2048i15_0_0_768_0_0_1074+i1.2048] = store bfloat16<128 x 2048> TongaSB partitions[4] bfloat16 (2, 2, 2, 4, 128, 2048) %769[i16_0_768,i15_0_0_768_0_0_1074,i16_1_0_0_768_1074,i16_1_0_1_768_1074,i0.128,i1.2048] # id=949, src_id=None, , instances=32 # dl = tensor_op_name: _dot.228 | hlo_id: 59 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/NeuronLoopFusion]: Finished (changed=True) +2025-09-05T19:15:10Z INFO 1619 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 49.765us (16.000MiB, est bw: 337.130GB/s, 0.852% of tot. time) for bfloat16<128 x 2048> non_local bfloat16 (2048, 32, 128) %'convert.49'[1024T_i20_0_705+i0.128+512T_i20_1_0_705+128T_i20_1_1_705_0,16T_i19_0_705_0_1166+i2.4+4i3.4,i1.128] = store bfloat16<128 x 2048> TongaSB partitions[4] bfloat16 (2, 2, 2, 4, 128, 4, 512) %'701.1539'[T_i20_0_705,T_i20_1_0_705,T_i19_0_705_0_1166,T_i20_1_1_705_0,i0.128,i3.4,i1.128+128i2.4] # id=1092, src_id=None, , instances=32 # dl = tensor_op_name: convert.49_pftranspose_701 | hlo_id: 70 | [[i0.128];[i1.128, i2.4, i3.4]] -> [[i0.128];[i1.128, i2.4, i3.4]] +2025-09-05T19:15:10Z INFO 1619 [sg0002/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 0.067 seconds +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/FactorizeBlkDims]: Finished (changed=True) +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.003 seconds +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.062 seconds +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-09-05T19:15:10Z INFO 1619 [sg0002/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.008 seconds +2025-09-05T19:15:10Z INFO 1619 [sg0002/Tensorizer/OptimizeNKIKernels]: Running OptimizeNKIKernels +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/DoNothing]: Running DoNothing +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/DoNothing]: Finished (changed=True) +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.016 seconds +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.001 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/FactorizeBlkDims]: Finished (changed=True) +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.001 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.001 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/SpillPSum]: Running SpillPSum +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/SpillPSum]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/SpillPSum]: SpillPSum finished after 0.001 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/LegalizeType]: Running LegalizeType +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/LegalizeType]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.001 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.061 seconds +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.001 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.002 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.003 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.001 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.001 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/DataStreaming]: Running DataStreaming +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/DataStreaming]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/DataStreaming]: DataStreaming finished after 0.001 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.004 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.000 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 5.852us (1.000MiB, est bw: 179.191GB/s, 59.288% of tot. time) for float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %13[i0.128,i1.2048] = load float32<128 x 2048> float32 (1, 256) %'x'[i0.128,i1.2048] # id=8, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 4.018us (1.000MiB, est bw: 260.951GB/s, 40.712% of tot. time) for float32<128 x 2048> float32 (1, 256) %'y'[i0.128,i1.2048] = store float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %11[i0.128,i1.2048] # id=10, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.001 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/DoNothing]: Running DoNothing +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/DoNothing]: Finished (changed=True) +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.000 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.000 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.114 seconds +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/NeuronValueNumbering]: Finished (changed=True) +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.001 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/SpillPSum]: Running SpillPSum +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/SpillPSum]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/SpillPSum]: SpillPSum finished after 0.001 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/LegalizeType]: Running LegalizeType +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/LegalizeType]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.000 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.000 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.002 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.003 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.001 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.000 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/DataStreaming]: Running DataStreaming +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/DataStreaming]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.005 seconds +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/DataStreaming]: DataStreaming finished after 0.000 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.003 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.001 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 5.852us (1.000MiB, est bw: 179.191GB/s, 59.288% of tot. time) for float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %13[i0.128,i1.2048] = load float32<128 x 2048> float32 (1, 256) %'x'[i0.128,i1.2048] # id=8, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 4.018us (1.000MiB, est bw: 260.951GB/s, 40.712% of tot. time) for float32<128 x 2048> float32 (1, 256) %'y'[i0.128,i1.2048] = store float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %11[i0.128,i1.2048] # id=10, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1619 [cumsum/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.001 seconds +2025-09-05T19:15:10Z INFO 1619 [sg0002/Tensorizer/OptimizeNKIKernels]: Finished (changed=True) +2025-09-05T19:15:10Z INFO 1619 [sg0002/Tensorizer/OptimizeNKIKernels]: OptimizeNKIKernels finished after 0.517 seconds +2025-09-05T19:15:10Z INFO 1619 [sg0002/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.011 seconds +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/VectorizeDMA]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.120 seconds +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.002 seconds +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-09-05T19:15:10Z INFO 1619 [sg0002/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-09-05T19:15:10Z INFO 1618 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/NeuronValueNumbering]: Finished (changed=True) +2025-09-05T19:15:10Z INFO 1619 [sg0002/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.026 seconds +2025-09-05T19:15:10Z INFO 1619 [sg0002/Tensorizer/StaticProfiler]: Running StaticProfiler +2025-09-05T19:15:10Z INFO 1619 [sg0002/Tensorizer/StaticProfiler]: Finished (changed=False) +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.010 seconds +2025-09-05T19:15:10Z INFO 1617 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-09-05T19:15:11Z INFO 1619 [sg0002/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.006 seconds +2025-09-05T19:15:11Z INFO 1619 [sg0002/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.002 seconds +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/LegalizePartitionReduce]: Finished (changed=False) +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.002 seconds +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/DeConcat]: Running DeConcat +2025-09-05T19:15:11Z INFO 1619 [sg0002/Tensorizer/SplitAPUnionSets]: Finished (changed=True) +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/DeConcat]: Finished (changed=False) +2025-09-05T19:15:11Z INFO 1619 [sg0002/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.013 seconds +2025-09-05T19:15:11Z INFO 1619 [sg0002/Tensorizer/LateLegalizePostSplit]: Running LateLegalizePostSplit +2025-09-05T19:15:11Z INFO 1619 [sg0002/Tensorizer/LateLegalizePostSplit]: Finished (changed=False) +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/DeConcat]: DeConcat finished after 0.002 seconds +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/FactorizeThreadAxesInFreeDims]: Running FactorizeThreadAxesInFreeDims +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/FactorizeThreadAxesInFreeDims]: Finished (changed=False) +2025-09-05T19:15:11Z INFO 1619 [sg0002/Tensorizer/LateLegalizePostSplit]: LateLegalizePostSplit finished after 0.005 seconds +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/FactorizeThreadAxesInFreeDims]: FactorizeThreadAxesInFreeDims finished after 0.002 seconds +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2025-09-05T19:15:11Z INFO 1619 [sg0002/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.014 seconds +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/VectorizeDMA]: Finished (changed=True) +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.007 seconds +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-09-05T19:15:11Z INFO 1619 [sg0002/Tensorizer/DumpGraphAndMetadata]: Finished (changed=False) +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.010 seconds +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/LegalizePartitionReduce]: Finished (changed=False) +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.002 seconds +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/DeConcat]: Running DeConcat +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/DeConcat]: Finished (changed=False) +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/DeConcat]: DeConcat finished after 0.002 seconds +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Running FactorizeThreadAxesInFreeDims +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Finished (changed=False) +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: FactorizeThreadAxesInFreeDims finished after 0.002 seconds +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/PartialSimdFusion]: Finished (changed=True) +2025-09-05T19:15:11Z INFO 1619 [sg0002/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.053 seconds +2025-09-05T19:15:11Z INFO 1619 [sg0002/Tensorizer/ZeroSizeTensorElimination]: Running ZeroSizeTensorElimination +2025-09-05T19:15:11Z INFO 1619 [sg0002/Tensorizer/ZeroSizeTensorElimination]: Finished (changed=False) +2025-09-05T19:15:11Z INFO 1619 [sg0002/Tensorizer/ZeroSizeTensorElimination]: ZeroSizeTensorElimination finished after 0.000 seconds +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.097 seconds +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/TritiumFusion]: Running TritiumFusion +2025-09-05T19:15:11Z INFO 1619 [sg0002/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/PartialSimdFusion]: Finished (changed=True) +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.068 seconds +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/TritiumFusion]: Running TritiumFusion +2025-09-05T19:15:11Z INFO 1619 [sg0002/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-09-05T19:15:11Z INFO 1619 [sg0002/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.137 seconds +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/TritiumFusion]: Finished (changed=True) +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/TritiumFusion]: TritiumFusion finished after 0.140 seconds +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.062 seconds +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2025-09-05T19:15:11Z INFO 1619 [Tensorizer]: BirCodeGen estimate #instances=35075 in sg0002 +2025-09-05T19:15:11Z INFO 1619 [Tensorizer]: IR signature: 8c05f5e1b6ba355928b785f9aae0e858ab021248bd1218f27305b4cf636bc35b for nc00/sg0002/TensorizerBIR +2025-09-05T19:15:11Z INFO 1619 [Tensorizer]: Weights total number of bytes: 135176 +2025-09-05T19:15:11Z INFO 1619 [Tensorizer]: Successfully built model. +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/VectorizeMatMult]: Finished (changed=False) +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.026 seconds +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/TritiumFusion]: Finished (changed=True) +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/TritiumFusion]: TritiumFusion finished after 0.329 seconds +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/PartialLoopFusion]: Finished (changed=True) +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 0.067 seconds +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.009 seconds +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/LowerTranspose]: Finished (changed=True) +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.049 seconds +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.016 seconds +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.002 seconds +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/LateNeuronInstComb]: Finished (changed=True) +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.028 seconds +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/SplitAccGrp]: Finished (changed=False) +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.002 seconds +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/SpillPSum]: Running SpillPSum +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/VectorizeMatMult]: Finished (changed=False) +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.059 seconds +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/SpillPSum]: Finished (changed=True) +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/SpillPSum]: SpillPSum finished after 0.027 seconds +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/PartialLoopFusion]: Finished (changed=True) +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 0.038 seconds +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.019 seconds +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.044 seconds +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/InlineNativeKernels]: Running InlineNativeKernels +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/InlineNativeKernels]: Finished (changed=False) +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/InlineNativeKernels]: InlineNativeKernels finished after 0.002 seconds +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/LegalizeType]: Running LegalizeType +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/LowerTranspose]: Finished (changed=True) +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/LegalizeType]: Finished (changed=True) +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.018 seconds +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/LegalizeType]: LegalizeType finished after 0.005 seconds +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.002 seconds +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.012 seconds +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/LateNeuronInstComb]: Finished (changed=True) +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.018 seconds +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/SplitAccGrp]: Finished (changed=False) +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.002 seconds +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/SpillPSum]: Running SpillPSum +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/SpillPSum]: Finished (changed=True) +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/InferPSumTensor]: Finished (changed=True) +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/SpillPSum]: SpillPSum finished after 0.039 seconds +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.060 seconds +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.003 seconds +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.039 seconds +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/InlineNativeKernels]: Running InlineNativeKernels +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/InlineNativeKernels]: Finished (changed=False) +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/InlineNativeKernels]: InlineNativeKernels finished after 0.002 seconds +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/LegalizeType]: Running LegalizeType +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/LegalizeType]: Finished (changed=True) +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/LegalizeType]: LegalizeType finished after 0.025 seconds +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.064 seconds +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/RelaxPredicates]: Finished (changed=False) +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.004 seconds +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/TensorInitialization]: Running TensorInitialization +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.010 seconds +2025-09-05T19:15:11Z INFO 1618 [sg0001/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/TensorInitialization]: Finished (changed=True) +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.027 seconds +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-09-05T19:15:11Z INFO 1617 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-09-05T19:15:12Z INFO 1617 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.042 seconds +2025-09-05T19:15:12Z INFO 1617 [sg0000/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-09-05T19:15:12Z INFO 1617 [sg0000/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/InferPSumTensor]: Finished (changed=True) +2025-09-05T19:15:12Z INFO 1617 [sg0000/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.004 seconds +2025-09-05T19:15:12Z INFO 1617 [sg0000/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.091 seconds +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-09-05T19:15:12Z INFO 1617 [sg0000/Tensorizer/SimplifyNeuronTensor]: Finished (changed=True) +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.002 seconds +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-09-05T19:15:12Z INFO 1617 [sg0000/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.019 seconds +2025-09-05T19:15:12Z INFO 1617 [sg0000/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-09-05T19:15:12Z INFO 1617 [sg0000/Tensorizer/DMALocalityOpt]: Finished (changed=True) +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-09-05T19:15:12Z INFO 1617 [sg0000/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.002 seconds +2025-09-05T19:15:12Z INFO 1617 [sg0000/Tensorizer/DataStreaming]: Running DataStreaming +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.021 seconds +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2025-09-05T19:15:12Z INFO 1617 [sg0000/Tensorizer/DataStreaming]: Finished (changed=True) +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/RelaxPredicates]: Finished (changed=False) +2025-09-05T19:15:12Z INFO 1617 [sg0000/Tensorizer/DataStreaming]: DataStreaming finished after 0.008 seconds +2025-09-05T19:15:12Z INFO 1617 [sg0000/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.004 seconds +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/TensorInitialization]: Running TensorInitialization +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/TensorInitialization]: Finished (changed=False) +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.007 seconds +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.003 seconds +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.003 seconds +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/SimplifyNeuronTensor]: Finished (changed=True) +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.052 seconds +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/DMALocalityOpt]: Finished (changed=True) +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.001 seconds +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/DataStreaming]: Running DataStreaming +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/DataStreaming]: Finished (changed=True) +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/DataStreaming]: DataStreaming finished after 0.008 seconds +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-09-05T19:15:12Z INFO 1617 [sg0000/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-09-05T19:15:12Z INFO 1617 [sg0000/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.619 seconds +2025-09-05T19:15:12Z INFO 1617 [sg0000/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-09-05T19:15:12Z INFO 1617 [sg0000/Tensorizer/LateLegalizeInst]: Finished (changed=True) +2025-09-05T19:15:12Z INFO 1617 [sg0000/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.012 seconds +2025-09-05T19:15:12Z INFO 1617 [sg0000/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-09-05T19:15:12Z INFO 1617 [sg0000/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-09-05T19:15:12Z INFO 1617 [sg0000/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.003 seconds +2025-09-05T19:15:12Z INFO 1617 [sg0000/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-09-05T19:15:12Z INFO 1617 [sg0000/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.571 seconds +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/LateLegalizeInst]: Finished (changed=True) +2025-09-05T19:15:12Z INFO 1617 [sg0000/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.003 seconds +2025-09-05T19:15:12Z INFO 1617 [sg0000/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-09-05T19:15:12Z INFO 1617 [sg0000/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-09-05T19:15:12Z INFO 1617 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 325.929us (64.000MiB, est bw: 205.901GB/s, 34.095% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[5] bfloat16 (2, 2, 4, 4, 2, 128, 2048) %1602[i44_0,i44_1_0_0,i45_0_1601,T_i3,c2_1298,i0.128,i1.2048] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (4, 4, 128, 2, 2048) %'input73'[i45_0_1601,T_i3,i0.128,c2_1298,i1.2048] # id=1458, src_id=None, , instances=128 # dl = tensor_op_name: _dot.2 | hlo_id: 34 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-09-05T19:15:12Z INFO 1617 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 99.175us (16.000MiB, est bw: 169.167GB/s, 10.375% of tot. time) for bfloat16<128 x 1024> TongaSB partitions[4] bfloat16 (2, 2, 2, 4, 128, 4, 512) %'input69_local_1335'[c0_1327_0_0,c0_1327_0_1,c0_1327_1,c1_1328,i0.128,i3.4,i1.128+128i2.2+256p_1851] = load bfloat16<128 x 1024> {'CrossPassTensor': ''}bfloat16 (8, 2, 128, 4, 4, 2, 128) %'input69'[4c0_1327_0_0+2c0_1327_0_1+c0_1327_1,p_1851,i0.128,c1_1328,i3.4,i2.2,i1.128] # id=1557, src_id=None, , instances=64 # dl = tensor_op_name: _dot.3 | hlo_id: 139 | [[i0.128];[i1.128, i2.2, i3.4]] -> [[i0.128];[i1.128, i2.2, i3.4]] +2025-09-05T19:15:12Z INFO 1617 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 82.457us (16.000MiB, est bw: 203.466GB/s, 8.626% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[4] bfloat16 (2, 2, 2, 4, 128, 2, 2, 4, 128) %'1217.1727'[i1_1_0_1258,i1_0_1258,T_i2,2T_i3_0_2297+T_i3_1_2297,i0.128,i4.2,i3.2,i2.4,i1.128] = load bfloat16<128 x 2048> DRAM2DBlk partitions[1] bfloat16 (2, 1, 2, 2, 4, 128, 2, 1024) %'all_gather.1'[i1_1_0_1258,0,i1_0_1258,T_i2,2T_i3_0_2297+T_i3_1_2297,i0.128,i4.2,i1.128+128i2.4+512i3.2] # id=1626, src_id=None, , instances=32 # dl = tensor_op_name: all_gather.1_pftranspose_1217 | hlo_id: 19 | [[i0.128];[i1.128, i2.4, i3.2, i4.2]] -> [[i0.128];[i1.128, i2.4, i3.2, i4.2]] +2025-09-05T19:15:12Z INFO 1617 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 82.457us (16.000MiB, est bw: 203.466GB/s, 8.626% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[4] bfloat16 (2, 2, 2, 4, 128, 2, 1024) %'all_gather.1_local_1281'[i44_0,i29_0_1_0_1285,i29_0_0_1285,i29_0_1_1_1_1285,i0.128,i2.2,i1.1024] = load bfloat16<128 x 2048> DRAM2DBlk partitions[1] bfloat16 (2, 1, 2, 2, 4, 128, 2, 1024) %'all_gather.1'[i29_0_1_0_1285,0,i29_0_0_1285,i2.2,i29_0_1_1_1_1285,i0.128,i44_0,i1.1024] # id=1453, src_id=None, , instances=32 # dl = tensor_op_name: _custom-call.136 | hlo_id: 27 | [[i0.128];[i1.1024, i2.2]] -> [[i0.128];[i1.1024, i2.2]] +2025-09-05T19:15:12Z INFO 1617 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 59.748us (16.000MiB, est bw: 280.799GB/s, 6.250% of tot. time) for bfloat16<128 x 1024> {'IntermediateTensor': ''}bfloat16 (2, 8, 128, 2, 2, 1024) %'intermediate1'(init=0.0)[i0_0_1258,4i0_1_0_0_1258+i0_1_0_1_1258,i0.128,i1_0_1258,i1_1_0_1258,i1.1024] = store bfloat16<128 x 1024> TongaSB partitions[5] bfloat16 (2, 2, 2, 2, 4, 128, 1024) %1259[i1_1_0_1258,i1_0_1258,i0_0_1258,i0_1_0_0_1258,i0_1_0_1_1258,i0.128,i1.1024] # id=1418, src_id=None, , instances=64 # dl = tensor_op_name: UnnamedModule | hlo_id: 1 | [[i0.128];[i1.1024]] -> [[i0.128];[i1.1024]] +2025-09-05T19:15:12Z INFO 1617 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 50.238us (8.000MiB, est bw: 166.979GB/s, 5.255% of tot. time) for bfloat16<128 x 1024> TongaSB partitions[2] bfloat16 (2, 16, 128, 1024) %'transpose.1_pftranspose_1212'[T_i12_0_1216,i13_0,i0.128,i1.1024] = indirect_load bfloat16<128 x 1024> {'CrossPassTensor': ''}bfloat16 (32768, 2, 1024) %'input68'[i0.128,T_i12_0_1216,i1.1024] generic generic_dims:[0] generic_addrs: int32<128 x 1> TongaSB partitions[0] int32 (128, 16, 1) %'input0_local_1252'[i0.128,i13_0,0] # id=1412, src_id=None, , attrs={'mode': OOBMode.ERROR}, instances=32 # dl = tensor_op_name: _gather.41 | hlo_id: 16 | [[i0.128];[i1.1024]] -> [[i0.128];[i1.1024]] +2025-09-05T19:15:12Z INFO 1617 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 49.765us (16.000MiB, est bw: 337.130GB/s, 5.206% of tot. time) for bfloat16<128 x 2048> DRAM2DBlk partitions[1] bfloat16 (2, 1, 8, 128, 4096) %'dot.4'[i117_0_1341,0,4i117_1_0_0_1341+i117_1_0_1_1341,i0.128,i1.2048+2048i116_0_0_0_1341_0_0_2348] = store bfloat16<128 x 2048> TongaSB partitions[4] bfloat16 (2, 2, 4, 2, 128, 2048) %1342[i117_0_1341,i117_1_0_0_1341,i117_1_0_1_1341,i116_0_0_0_1341_0_0_2348,i0.128,i1.2048] # id=1560, src_id=None, , instances=32 # dl = tensor_op_name: _dot.3 | hlo_id: 139 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-09-05T19:15:12Z INFO 1617 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 26.236us (2.000MiB, est bw: 79.934GB/s, 2.745% of tot. time) for bfloat16<128 x 128> bfloat16 (4, 4, 2048, 128) %'output1'[i0.128,i1.128] generic, generic_dims:[0] generic_addrs: int32<128 x 1> TongaSB partitions[4] int32 (2, 4, 2, 4, 128, 1) %'scatter.7381.1898'[i132_0_2310,i133_2303_2310,i132_1_0_0_2303_2310,i132_1_0_1_2303_2310,i0.128,0] = indirect_save bfloat16<128 x 128> TongaSB partitions[3] bfloat16 (2, 2, 4, 128, 512) %'add.2'[i132_0_2310,i132_1_0_0_2303_2310,i133_2303_2310,i0.128,i1.128+128i132_1_0_1_2303_2310] # id=1581, src_id=None, , attrs={'mode': OOBMode.ERROR}, instances=64 # dl = tensor_op_name: _scatter.7381 | hlo_id: 164 | [[i0.128];[i1.128]] -> [[i0.128];[i1.128]] +2025-09-05T19:15:12Z INFO 1617 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 26.236us (2.000MiB, est bw: 79.934GB/s, 2.745% of tot. time) for bfloat16<128 x 128> bfloat16 (4, 4, 2048, 128) %'output2'[i0.128,i1.128] generic, generic_dims:[0] generic_addrs: int32<128 x 1> TongaSB partitions[4] int32 (2, 2, 4, 4, 128, 1) %'scatter.7433.1894'[i132_0_2309,i125_1_0_0_2302_2309,i125_1_0_1_2302_2309,2i126_2302_2309_0+i126_2302_2309_1,i0.128,0] = indirect_save bfloat16<128 x 128> TongaSB partitions[2] bfloat16 (2, 2, 128, 4, 4, 128) %'scatter.7433.1935'[i132_0_2309,i125_1_0_0_2302_2309,i0.128,i125_1_0_1_2302_2309,2i126_2302_2309_0+i126_2302_2309_1,i1.128] # id=1575, src_id=None, , attrs={'mode': OOBMode.ERROR}, instances=64 # dl = tensor_op_name: _scatter.7433 | hlo_id: 179 | [[i0.128];[i1.128]] -> [[i0.128];[i1.128]] +2025-09-05T19:15:12Z INFO 1617 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 25.532us (8.000MiB, est bw: 328.547GB/s, 2.671% of tot. time) for bfloat16<128 x 2048> DRAM2DBlk partitions[1] bfloat16 (2, 1, 2, 4, 128, 16, 128) %'transpose.1'[T_i12_0_1216,0,T_i12_1_0_1216,2T_i12_1_1_1216_0_0+T_i12_1_1_1216_0_1,i0.128,i2.4+4i3.4,i1.128] = store bfloat16<128 x 2048> TongaSB partitions[3] bfloat16 (2, 2, 4, 128, 4, 512) %'1212.1911'[T_i12_0_1216,T_i12_1_0_1216,2T_i12_1_1_1216_0_0+T_i12_1_1_1216_0_1,i0.128,i3.4,i1.128+128i2.4] # id=1613, src_id=None, , instances=16 # dl = tensor_op_name: transpose.1_pftranspose_1212 | hlo_id: 16 | [[i0.128];[i1.128, i2.4, i3.4]] -> [[i0.128];[i1.128, i2.4, i3.4]] +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.007 seconds +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-09-05T19:15:12Z INFO 1617 [sg0000/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-09-05T19:15:12Z INFO 1617 [sg0000/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.007 seconds +2025-09-05T19:15:12Z INFO 1617 [sg0000/Tensorizer/OptimizeNKIKernels]: Running OptimizeNKIKernels +2025-09-05T19:15:12Z INFO 1617 [sg0000/Tensorizer/OptimizeNKIKernels]: Finished (changed=False) +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.002 seconds +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-09-05T19:15:12Z INFO 1617 [sg0000/Tensorizer/OptimizeNKIKernels]: OptimizeNKIKernels finished after 0.003 seconds +2025-09-05T19:15:12Z INFO 1617 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.056 seconds +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 2.427ms (112.000MiB, est bw: 48.383GB/s, 42.061% of tot. time) for bfloat16<128 x 128> TongaSB partitions[6] bfloat16 (2, 2, 2, 2, 2, 28, 128, 512) %1341[i16_0_1054,i15_0_0_1054_0_0_1340,i15_0_0_1,i15_0_0_0_1,c1_1042,c2_1043,i0.128,i1.128+128p_1526] = load bfloat16<128 x 128> {'CrossPassTensor': ''}bfloat16 (8, 4, 2, 128, 28, 128) %'input74'[4i15_0_0_1054_0_0_1340+i15_0_0_1+2i15_0_0_0_1,p_1526,c1_1042,i0.128,c2_1043,i1.128] # id=1218, src_id=None, , instances=3584 # dl = tensor_op_name: _dot.6 | hlo_id: 49 | [[i0.128];[i1.128]] -> [[i0.128];[i1.128]] +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 1.137ms (224.000MiB, est bw: 206.489GB/s, 19.711% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[5] bfloat16 (2, 2, 2, 28, 2, 128, 2048) %1342[i11_0,i11_1_0,i10_0_0,i10_0_1,c2_1023,i0.128,i1.2048] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 28, 128, 4096) %'input75'[i10_0_0,i10_0_1,i0.128,i1.2048+2048c2_1023] # id=1209, src_id=None, , instances=448 # dl = tensor_op_name: _dot.4 | hlo_id: 39 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 1.137ms (224.000MiB, est bw: 206.489GB/s, 19.711% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[5] bfloat16 (2, 2, 2, 28, 2, 128, 2048) %1339[i16_0_1054,i13_1_0,i12_0_0,i12_0_1,c2_1033,i0.128,i1.2048] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 28, 128, 4096) %'input77'[i12_0_0,i12_0_1,i0.128,i1.2048+2048c2_1033] # id=1212, src_id=None, , instances=448 # dl = tensor_op_name: _dot.5 | hlo_id: 30 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 325.929us (64.000MiB, est bw: 205.901GB/s, 5.648% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[5] bfloat16 (2, 2, 4, 4, 2, 128, 2048) %1343[i37_0,i37_1_0,i38_0,i38_1,c2_1064,i0.128,i1.2048] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (4, 4, 128, 4096) %'input82'[i38_0,i38_1,i0.128,i1.2048+2048c2_1064] # id=1233, src_id=None, , instances=128 # dl = tensor_op_name: _dot.9 | hlo_id: 67 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 99.175us (16.000MiB, est bw: 169.167GB/s, 1.719% of tot. time) for bfloat16<128 x 1024> TongaSB partitions[4] bfloat16 (2, 2, 2, 4, 128, 4, 512) %'input78_local_1128'[c0_1120_0_0,c0_1120_0_1,c0_1120_1,c1_1121,i0.128,i3.4,i1.128+128i2.2+256p_1531] = load bfloat16<128 x 1024> {'CrossPassTensor': ''}bfloat16 (8, 2, 128, 4, 4, 2, 128) %'input78'[4c0_1120_0_0+2c0_1120_0_1+c0_1120_1,p_1531,i0.128,c1_1121,i3.4,i2.2,i1.128] # id=1282, src_id=None, , instances=64 # dl = tensor_op_name: _dot.10 | hlo_id: 159 | [[i0.128];[i1.128, i2.2, i3.4]] -> [[i0.128];[i1.128, i2.2, i3.4]] +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 82.457us (16.000MiB, est bw: 203.466GB/s, 1.429% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[4] bfloat16 (2, 2, 4, 2, 128, 2048) %'974.1502'[i11_0,i11_1_0,T_i2_0,T_i3_0_1988,i0.128,i1.2048] = load bfloat16<128 x 2048> non_local bfloat16 (2, 2, 4, 128, 4096) %'add.4'[i11_0,i11_1_0,T_i2_0,i0.128,i1.2048+2048T_i3_0_1988] # id=1354, src_id=None, , instances=32 # dl = tensor_op_name: add.4_pftranspose_974 | hlo_id: 17 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 82.457us (16.000MiB, est bw: 203.466GB/s, 1.429% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[4] bfloat16 (2, 2, 2, 4, 128, 2048) %'_reload_1335'[i16_0_1054,i13_1_0,i4_0_710_0_1338,i4_0_710_1_1338_0,i0.128,i1.2048] = load bfloat16<128 x 2048> DRAM3DBlk partitions[4] bfloat16 (2, 4, 2, 2, 128, 2048) %'_spill_1332'[i4_0_710_0_1338,i4_0_710_1_1338_0,i16_0_1054,i13_1_0,i0.128,i1.2048] # id=1337, src_id=None, , instances=32 # dl = tensor_op_name: _dot.5 | hlo_id: 30 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 82.457us (16.000MiB, est bw: 203.466GB/s, 1.429% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[4] bfloat16 (2, 2, 2, 4, 128, 2048) %'978.1507'[i37_0,i37_1_0,T_i17_0_986_0,2T_i2_0_0_1989+T_i2_0_1_1989,i0.128,i1.2048] = load bfloat16<128 x 2048> DRAM2DBlk partitions[1] bfloat16 (2, 1, 2, 4, 128, 4096) %'all_reduce.1'[i37_0,0,i37_1_0,2T_i2_0_0_1989+T_i2_0_1_1989,i0.128,i1.2048+2048T_i17_0_986_0] # id=1363, src_id=None, , instances=32 # dl = tensor_op_name: all_reduce.1_pftranspose_978 | hlo_id: 52 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 49.765us (16.000MiB, est bw: 337.130GB/s, 0.862% of tot. time) for bfloat16<128 x 2048> DRAM3DBlk partitions[4] bfloat16 (2, 4, 2, 2, 128, 2048) %'_spill_1332'[i2_0_0_1455,i2_0_1_1455_0,i11_0,i11_1_0,i0.128,i1.2048] = store bfloat16<128 x 2048> TongaSB partitions[4] bfloat16 (2, 2, 2, 4, 128, 2048) %995[i11_0,i11_1_0,i2_0_0_1455,i2_0_1_1455_0,i0.128,i1.2048] # id=1334, src_id=None, , instances=32 # dl = tensor_op_name: _custom-call.137 | hlo_id: 23 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 49.765us (16.000MiB, est bw: 337.130GB/s, 0.862% of tot. time) for bfloat16<128 x 2048> DRAM2DBlk partitions[1] bfloat16 (2, 1, 8, 128, 4096) %'dot.7'[i16_0_1054,0,4i16_1_0_0_1054_1340+i16_1_0_1_1054_1340,i0.128,2048i15_0_0_1054_0_0_1340+i1.2048] = store bfloat16<128 x 2048> TongaSB partitions[4] bfloat16 (2, 2, 2, 4, 128, 2048) %1055[i16_0_1054,i15_0_0_1054_0_0_1340,i16_1_0_0_1054_1340,i16_1_0_1_1054_1340,i0.128,i1.2048] # id=1221, src_id=None, , instances=32 # dl = tensor_op_name: _dot.6 | hlo_id: 49 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.009 seconds +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/OptimizeNKIKernels]: Running OptimizeNKIKernels +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/OptimizeNKIKernels]: Finished (changed=False) +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/OptimizeNKIKernels]: OptimizeNKIKernels finished after 0.003 seconds +2025-09-05T19:15:12Z INFO 1618 [sg0001/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-09-05T19:15:13Z INFO 1617 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-09-05T19:15:13Z INFO 1617 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.151 seconds +2025-09-05T19:15:13Z INFO 1617 [sg0000/Tensorizer/StaticProfiler]: Running StaticProfiler +2025-09-05T19:15:13Z INFO 1617 [sg0000/Tensorizer/StaticProfiler]: Finished (changed=False) +2025-09-05T19:15:13Z INFO 1617 [sg0000/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.009 seconds +2025-09-05T19:15:13Z INFO 1617 [sg0000/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2025-09-05T19:15:13Z INFO 1618 [sg0001/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-09-05T19:15:13Z INFO 1618 [sg0001/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.265 seconds +2025-09-05T19:15:13Z INFO 1618 [sg0001/Tensorizer/StaticProfiler]: Running StaticProfiler +2025-09-05T19:15:13Z INFO 1618 [sg0001/Tensorizer/StaticProfiler]: Finished (changed=False) +2025-09-05T19:15:13Z INFO 1618 [sg0001/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.009 seconds +2025-09-05T19:15:13Z INFO 1618 [sg0001/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2025-09-05T19:15:13Z INFO 1618 [sg0001/Tensorizer/SplitAPUnionSets]: Finished (changed=True) +2025-09-05T19:15:13Z INFO 1618 [sg0001/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.083 seconds +2025-09-05T19:15:13Z INFO 1618 [sg0001/Tensorizer/LateLegalizePostSplit]: Running LateLegalizePostSplit +2025-09-05T19:15:13Z INFO 1618 [sg0001/Tensorizer/LateLegalizePostSplit]: Finished (changed=False) +2025-09-05T19:15:13Z INFO 1618 [sg0001/Tensorizer/LateLegalizePostSplit]: LateLegalizePostSplit finished after 0.003 seconds +2025-09-05T19:15:13Z INFO 1617 [sg0000/Tensorizer/SplitAPUnionSets]: Finished (changed=True) +2025-09-05T19:15:13Z INFO 1617 [sg0000/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.208 seconds +2025-09-05T19:15:13Z INFO 1617 [sg0000/Tensorizer/LateLegalizePostSplit]: Running LateLegalizePostSplit +2025-09-05T19:15:13Z INFO 1618 [sg0001/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2025-09-05T19:15:13Z INFO 1617 [sg0000/Tensorizer/LateLegalizePostSplit]: Finished (changed=False) +2025-09-05T19:15:13Z INFO 1617 [sg0000/Tensorizer/LateLegalizePostSplit]: LateLegalizePostSplit finished after 0.004 seconds +2025-09-05T19:15:13Z INFO 1618 [sg0001/Tensorizer/DumpGraphAndMetadata]: Finished (changed=False) +2025-09-05T19:15:13Z INFO 1618 [sg0001/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.014 seconds +2025-09-05T19:15:13Z INFO 1618 [sg0001/Tensorizer/ZeroSizeTensorElimination]: Running ZeroSizeTensorElimination +2025-09-05T19:15:13Z INFO 1618 [sg0001/Tensorizer/ZeroSizeTensorElimination]: Finished (changed=False) +2025-09-05T19:15:13Z INFO 1617 [sg0000/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2025-09-05T19:15:13Z INFO 1618 [sg0001/Tensorizer/ZeroSizeTensorElimination]: ZeroSizeTensorElimination finished after 0.000 seconds +2025-09-05T19:15:13Z INFO 1618 [sg0001/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-09-05T19:15:13Z INFO 1617 [sg0000/Tensorizer/DumpGraphAndMetadata]: Finished (changed=False) +2025-09-05T19:15:13Z INFO 1617 [sg0000/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.048 seconds +2025-09-05T19:15:13Z INFO 1617 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Running ZeroSizeTensorElimination +2025-09-05T19:15:13Z INFO 1617 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Finished (changed=False) +2025-09-05T19:15:13Z INFO 1617 [sg0000/Tensorizer/ZeroSizeTensorElimination]: ZeroSizeTensorElimination finished after 0.000 seconds +2025-09-05T19:15:13Z INFO 1617 [sg0000/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-09-05T19:15:13Z INFO 1618 [sg0001/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-09-05T19:15:13Z INFO 1618 [sg0001/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.215 seconds +2025-09-05T19:15:13Z INFO 1617 [sg0000/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-09-05T19:15:13Z INFO 1617 [sg0000/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.384 seconds +2025-09-05T19:15:13Z INFO 1618 [Tensorizer]: BirCodeGen estimate #instances=50675 in sg0001 +2025-09-05T19:15:13Z INFO 1618 [Tensorizer]: IR signature: f8bb07e50b77a50190fcdb85b29ee8455dcc3f14b7d669ca25f791cb80f6204e for nc00/sg0001/TensorizerBIR +2025-09-05T19:15:13Z INFO 1618 [Tensorizer]: Weights total number of bytes: 262144 +2025-09-05T19:15:13Z INFO 1618 [Tensorizer]: Successfully built model. +2025-09-05T19:15:13Z INFO 1617 [Tensorizer]: BirCodeGen estimate #instances=55418 in sg0000 +2025-09-05T19:15:14Z INFO 1617 [Tensorizer]: IR signature: 9934a1dffc30797ebb3c57cc753d949b6e63e56cc42a89f9a794b20e183d0b09 for nc00/sg0000/TensorizerBIR +2025-09-05T19:15:14Z INFO 1617 [Tensorizer]: Weights total number of bytes: 262400 +2025-09-05T19:15:14Z INFO 1617 [Tensorizer]: Successfully built model. +2025-09-05T19:15:14Z USER 1166 [root/Tensorizer/Tensorizer]: Tensorizer finished after 9.159 seconds +2025-09-05T19:15:14Z INFO 1166 [job.Frontend.0]: End tensorization +2025-09-05T19:15:14Z INFO 1166 [job.Frontend.0]: Network input: input68 +2025-09-05T19:15:14Z INFO 1166 [job.Frontend.0]: Network input: input0 +2025-09-05T19:15:14Z INFO 1166 [job.Frontend.0]: Network input: input71 +2025-09-05T19:15:14Z INFO 1166 [job.Frontend.0]: Network input: input73 +2025-09-05T19:15:14Z INFO 1166 [job.Frontend.0]: Network input: input1 +2025-09-05T19:15:14Z INFO 1166 [job.Frontend.0]: Network input: input72 +2025-09-05T19:15:14Z INFO 1166 [job.Frontend.0]: Network input: input70 +2025-09-05T19:15:14Z INFO 1166 [job.Frontend.0]: Network input: input69 +2025-09-05T19:15:14Z INFO 1166 [job.Frontend.0]: Network input: input4 +2025-09-05T19:15:14Z INFO 1166 [job.Frontend.0]: Network input: input2 +2025-09-05T19:15:14Z INFO 1166 [job.Frontend.0]: Network input: input5 +2025-09-05T19:15:14Z INFO 1166 [job.Frontend.0]: Network input: input76 +2025-09-05T19:15:14Z INFO 1166 [job.Frontend.0]: Network input: input77 +2025-09-05T19:15:14Z INFO 1166 [job.Frontend.0]: Network input: input75 +2025-09-05T19:15:14Z INFO 1166 [job.Frontend.0]: Network input: input74 +2025-09-05T19:15:14Z INFO 1166 [job.Frontend.0]: Network input: input80 +2025-09-05T19:15:14Z INFO 1166 [job.Frontend.0]: Network input: input82 +2025-09-05T19:15:14Z INFO 1166 [job.Frontend.0]: Network input: input81 +2025-09-05T19:15:14Z INFO 1166 [job.Frontend.0]: Network input: input79 +2025-09-05T19:15:14Z INFO 1166 [job.Frontend.0]: Network input: input78 +2025-09-05T19:15:14Z INFO 1166 [job.Frontend.0]: Network input: input6 +2025-09-05T19:15:14Z INFO 1166 [job.Frontend.0]: Network input: input2 +2025-09-05T19:15:14Z INFO 1166 [job.Frontend.0]: Network input: input7 +2025-09-05T19:15:14Z INFO 1166 [job.Frontend.0]: Network input: input355 +2025-09-05T19:15:14Z INFO 1166 [job.Frontend.0]: Network input: input356 +2025-09-05T19:15:14Z INFO 1166 [job.Frontend.0]: Network input: input354 +2025-09-05T19:15:14Z INFO 1166 [job.Frontend.0]: Network input: input353 +2025-09-05T19:15:14Z INFO 1166 [job.Frontend.0]: Network input: input358 +2025-09-05T19:15:14Z INFO 1166 [job.Frontend.0]: Network input: input1 +2025-09-05T19:15:14Z INFO 1166 [job.Frontend.0]: Network input: input357 +2025-09-05T19:15:14Z INFO 1166 [job.Frontend.0]: Network input: input3 +2025-09-05T19:15:14Z INFO 1166 [job.Frontend.0]: wrote bir.json +2025-09-05T19:15:14Z INFO 1166 [job.Frontend.0]: wrote tensor_map.json +2025-09-05T19:15:14Z INFO 1166 [job.Frontend.0]: wrote bir.json +2025-09-05T19:15:14Z INFO 1166 [job.Frontend.0]: wrote tensor_map.json +2025-09-05T19:15:14Z INFO 1166 [job.Frontend.0]: wrote bir.json +2025-09-05T19:15:14Z INFO 1166 [job.Frontend.0]: wrote tensor_map.json +2025-09-05T19:15:14Z INFO 1166 [job.Frontend.0]: Job #0 finished +2025-09-05T19:15:14Z INFO 1166 [pipeline.Pipeline.0]: Finished job job.Frontend.0 +2025-09-05T19:15:14Z INFO 1166 [pipeline.Pipeline.0]: Starting job job.StaticIOTranspose.0 +2025-09-05T19:15:14Z INFO 1166 [pipeline.Pipeline.0]: Finished job job.StaticIOTranspose.0 +2025-09-05T19:15:14Z INFO 1166 [pipeline.Pipeline.0]: Starting job job.WalrusDriver.0 +2025-09-05T19:15:14Z INFO 1166 [job.WalrusDriver.0]: BackendDriver has 3 states with 1 core LNC +2025-09-05T19:15:14Z INFO 1166 [job.WalrusDriver.0]: BackendDriver MT cwd: /models/mistral-7b-v0.3-instruct-neuronx/context_encoding_model/_tp0_bk4/neuronxcc-p52odp_y +2025-09-05T19:15:14Z INFO 1166 [job.BIRLinker.1]: Creating directory sgLnk/sg00 +2025-09-05T19:15:14Z INFO 1166 [job.WalrusDriver.0]: StateId sg00 Dir /models/mistral-7b-v0.3-instruct-neuronx/context_encoding_model/_tp0_bk4/neuronxcc-p52odp_y/sg00 +2025-09-05T19:15:14Z INFO 1166 [job.WalrusDriver.0]: StateId sg01 Dir /models/mistral-7b-v0.3-instruct-neuronx/context_encoding_model/_tp0_bk4/neuronxcc-p52odp_y/sg01 +2025-09-05T19:15:14Z INFO 1166 [job.WalrusDriver.0]: StateId sg02 Dir /models/mistral-7b-v0.3-instruct-neuronx/context_encoding_model/_tp0_bk4/neuronxcc-p52odp_y/sg02 +2025-09-05T19:15:14Z INFO 1166 [job.WalrusDriver.0]: Number of subgraphs to link: 3 +2025-09-05T19:15:14Z INFO 1166 [job.WalrusDriver.0]: lnkState: {"model": ["/models/mistral-7b-v0.3-instruct-neuronx/context_encoding_model/_tp0_bk4/model.MODULE_d342327da795afc2aa68+5e8b788a.hlo_module.pb"], "tensormap": "tensor_map.json", "bir": "bir.json", "lorean_sg_key": null, "input_name_map": null, "output_name_map": null, "constant_tensors": null, "state_dir": "/models/mistral-7b-v0.3-instruct-neuronx/context_encoding_model/_tp0_bk4/neuronxcc-p52odp_y/sgLnk/sg00", "state_id": "sgLnk"} +2025-09-05T19:15:14Z INFO 1166 [job.WalrusDriver.0]: BackendDriver in_state.num_states 3 with 1 core LNC +2025-09-05T19:15:14Z INFO 1166 [job.WalrusDriver.0]: Executing /opt/conda/lib/python3.10/site-packages/neuronxcc/starfish/bin/walrus_driver --optlevel 2 --allocator coloring --verbose 35 --logfile-verbose 20 --logfile /models/mistral-7b-v0.3-instruct-neuronx/context_encoding_model/_tp0_bk4/log-neuron-cc.txt -o walrus_bir.out.json --enable-call-graph --enable-mt-backend --link-subgraphs sg00,sg01,sg02 --link-dir sgLnk/sg00 --execute-repetition 1 -i bir.json --min_split_size 10240 --skip_split_vns '' --no_split_dram --split_huge_dram_tensor 1.0 --preprocessing_only --max_tensorizer_distance 64 --pack_same_shape_only --instruction_fetch_latency 511 --max-partitions 1 --policy 3 --auxflag 0 --interleave none --schedule-delayed-latency 1 --postsched-mm-accum-reorder=false --max-load-lower-bound 0.14 --force-prefetch-follow-incoming-order -1 --allreduce-buffer-size 500 --dram-page-size 512 --dram-rotation-size -1 --allreduce-rotation-dis 8 --repeat-load-thres 4 --enable-mm-transpose-remat-optimization=true --save-len-thres 512 --save-dma-cnt-thres 32 --relaxed-order=true --enable-anti-dependence-reduction=false --num-semaphores-per-queue 16 --numcores 1 --act-root-json /opt/conda/lib/python3.10/site-packages/neuronxcc/pwp/pwp_bin_trainium/act_info.json --dve-root-json /opt/conda/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen2/dve_info.json --enable-verifier=true --enable-birsim=false --enable-birsim-sync-only=false --enable-data-race-checker=false --enable-new-backend=true --inject-error=NONE --enable-internal-partitioner --dge-levels vector_dynamic_offsets,scalar_dynamic_offset,io --dynamic-dma-scratch-size-per-partition=16384 --neff-output-filename /models/mistral-7b-v0.3-instruct-neuronx/context_encoding_model/_tp0_bk4/model.MODULE_d342327da795afc2aa68+5e8b788a.neff +2025-09-05T19:15:14Z INFO 1166 [job.WalrusDriver.0]: Working directory is /models/mistral-7b-v0.3-instruct-neuronx/context_encoding_model/_tp0_bk4/neuronxcc-p52odp_y +2025-09-05T19:15:14Z INFO 1166 [job.WalrusDriver.0]: propagate_exit=True +2025-09-05T19:15:14Z INFO 1166 [job.WalrusDriver.0]: use_logger=False +2025-09-05T19:15:14Z INFO 1166 [job.WalrusDriver.0]: expose_stderr=True +2025-09-05T19:15:14Z INFO 1724 [Logging]: Logging to ../log-neuron-cc.txt at level 'INFO' +2025-09-05T19:15:14Z INFO 1724 [BackendDriver]: max_allowed_parallelism=32 +2025-09-05T19:15:14Z INFO 1724 [BackendDriver]: Loading module from sg00/bir.json +2025-09-05T19:15:14Z INFO 1724 [BackendDriver]: Loading module from sg01/bir.json +2025-09-05T19:15:14Z INFO 1724 [BackendDriver]: Loading module from sg02/bir.json +2025-09-05T19:15:14Z INFO 1724 [BackendDriver]: Backend driver mtBackend: true numModules: 3 Cwd: "/models/mistral-7b-v0.3-instruct-neuronx/context_encoding_model/_tp0_bk4/neuronxcc-p52odp_y" +2025-09-05T19:15:14Z INFO 1724 [BackendDriver]: DynamicDMA is enabled +2025-09-05T19:15:14Z INFO 1724 [BackendDriver]: DynamicDMA levels being enabled: io, scalar_dynamic_offset, vector_dynamic_offsets, +2025-09-05T19:15:14Z INFO 1724 [BackendDriver]: Modular flow call graph is enabled +2025-09-05T19:15:14Z INFO 1724 [BackendDriver]: Internal partitioner is enabled +2025-09-05T19:15:14Z USER 1724 [BackendPassManager]: Running mod_parallel_pass +2025-09-05T19:15:14Z INFO 1724 [BackendPassManager]: Inputs to mod_parallel_pass: modules=3 functions=3 allocs=503 blocks=3 instructions=390 Max writers: 32 Max Readers: 95 +2025-09-05T19:15:14Z USER 1724 (sg00) [ModuleForkPass]: Running do_nothing +2025-09-05T19:15:14Z USER 1724 (sg02) [ModuleForkPass]: Running do_nothing +2025-09-05T19:15:14Z USER 1724 (sg01) [ModuleForkPass]: Running do_nothing +2025-09-05T19:15:14Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=177 blocks=1 instructions=313 Max writers: 32 Max Readers: 95 +2025-09-05T19:15:14Z USER 1724 (sg02) [ModuleForkPass]: do_nothing finished after 0.000 seconds +2025-09-05T19:15:14Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 96mb, ru_maxrss: 209mb (delta=0mb) +2025-09-05T19:15:14Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=189 blocks=1 instructions=37 Max writers: 30 Max Readers: 29 +2025-09-05T19:15:14Z USER 1724 (sg00) [ModuleForkPass]: do_nothing finished after 0.000 seconds +2025-09-05T19:15:14Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 96mb, ru_maxrss: 209mb (delta=0mb) +2025-09-05T19:15:14Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 177 memory location(s), 1 block(s), and 313 instruction(s). Max writers: 32 Max Readers: 95 +2025-09-05T19:15:14Z USER 1724 (sg02) [ModuleForkPass]: Running birverifier +2025-09-05T19:15:14Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=137 blocks=1 instructions=40 Max writers: 4 Max Readers: 10 +2025-09-05T19:15:14Z USER 1724 (sg01) [ModuleForkPass]: do_nothing finished after 0.001 seconds +2025-09-05T19:15:14Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 96mb, ru_maxrss: 209mb (delta=0mb) +2025-09-05T19:15:14Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=177 blocks=1 instructions=313 Max writers: 32 Max Readers: 95 +2025-09-05T19:15:14Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 189 memory location(s), 1 block(s), and 37 instruction(s). Max writers: 30 Max Readers: 29 +2025-09-05T19:15:14Z USER 1724 (sg00) [ModuleForkPass]: Running birverifier +2025-09-05T19:15:14Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 137 memory location(s), 1 block(s), and 40 instruction(s). Max writers: 4 Max Readers: 10 +2025-09-05T19:15:14Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=189 blocks=1 instructions=37 Max writers: 30 Max Readers: 29 +2025-09-05T19:15:14Z USER 1724 (sg01) [ModuleForkPass]: Running birverifier +2025-09-05T19:15:14Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=137 blocks=1 instructions=40 Max writers: 4 Max Readers: 10 +2025-09-05T19:15:14Z USER 1724 (sg00) [ModuleForkPass]: birverifier finished after 0.044 seconds +2025-09-05T19:15:14Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 234mb, ru_maxrss: 234mb (delta=25mb) +2025-09-05T19:15:14Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 189 memory location(s), 1 block(s), and 37 instruction(s). Max writers: 30 Max Readers: 29 +2025-09-05T19:15:15Z USER 1724 (sg02) [ModuleForkPass]: birverifier finished after 0.115 seconds +2025-09-05T19:15:15Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 368mb, ru_maxrss: 368mb (delta=159mb) +2025-09-05T19:15:15Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 177 memory location(s), 1 block(s), and 313 instruction(s). Max writers: 32 Max Readers: 95 +2025-09-05T19:15:15Z USER 1724 (sg01) [ModuleForkPass]: birverifier finished after 0.129 seconds +2025-09-05T19:15:15Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 391mb, ru_maxrss: 391mb (delta=182mb) +2025-09-05T19:15:15Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 137 memory location(s), 1 block(s), and 40 instruction(s). Max writers: 4 Max Readers: 10 +2025-09-05T19:15:15Z USER 1724 [ModuleForkPass]: Compilation status: Total modules: 3, Passed: 3, Failed: 0 +2025-09-05T19:15:15Z USER 1724 [BackendPassManager]: mod_parallel_pass finished after 0.132 seconds +2025-09-05T19:15:15Z INFO 1724 [BackendPassManager]: curr_vmrss: 391mb, ru_maxrss: 391mb (delta=182mb) +2025-09-05T19:15:15Z INFO 1724 [BackendPassManager]: Output has 3 module(s), 3 function(s), 503 memory location(s), 3 block(s), and 390 instruction(s). Max writers: 32 Max Readers: 95 +2025-09-05T19:15:15Z USER 1724 [BackendPassManager]: Running subgraph_parallel_pass +2025-09-05T19:15:15Z INFO 1724 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=3 functions=3 allocs=503 blocks=3 instructions=390 Max writers: 32 Max Readers: 95 +2025-09-05T19:15:15Z USER 1724 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-09-05T19:15:15Z USER 1724 (sg02) [SubgraphForkPass]: Running lnc_verifier +2025-09-05T19:15:15Z USER 1724 (sg01) [SubgraphForkPass]: Running lnc_verifier +2025-09-05T19:15:15Z INFO 1724 (sg02) [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=177 blocks=1 instructions=313 Max writers: 32 Max Readers: 95 +2025-09-05T19:15:15Z USER 1724 (sg02) [SubgraphForkPass]: lnc_verifier finished after 0.000 seconds +2025-09-05T19:15:15Z INFO 1724 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=189 blocks=1 instructions=37 Max writers: 30 Max Readers: 29 +2025-09-05T19:15:15Z USER 1724 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.000 seconds +2025-09-05T19:15:15Z INFO 1724 (sg02) [SubgraphForkPass]: curr_vmrss: 391mb, ru_maxrss: 391mb (delta=0mb) +2025-09-05T19:15:15Z INFO 1724 (sg01) [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=137 blocks=1 instructions=40 Max writers: 4 Max Readers: 10 +2025-09-05T19:15:15Z INFO 1724 (sg00) [SubgraphForkPass]: curr_vmrss: 391mb, ru_maxrss: 391mb (delta=0mb) +2025-09-05T19:15:15Z USER 1724 (sg01) [SubgraphForkPass]: lnc_verifier finished after 0.000 seconds +2025-09-05T19:15:15Z INFO 1724 (sg01) [SubgraphForkPass]: curr_vmrss: 391mb, ru_maxrss: 391mb (delta=0mb) +2025-09-05T19:15:15Z INFO 1724 (sg02) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 177 memory location(s), 1 block(s), and 313 instruction(s). Max writers: 32 Max Readers: 95 +2025-09-05T19:15:15Z INFO 1724 (sg00) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 189 memory location(s), 1 block(s), and 37 instruction(s). Max writers: 30 Max Readers: 29 +2025-09-05T19:15:15Z INFO 1724 (sg01) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 137 memory location(s), 1 block(s), and 40 instruction(s). Max writers: 4 Max Readers: 10 +2025-09-05T19:15:15Z USER 1724 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-09-05T19:15:15Z USER 1724 [BackendPassManager]: subgraph_parallel_pass finished after 0.001 seconds +2025-09-05T19:15:15Z INFO 1724 [BackendPassManager]: curr_vmrss: 391mb, ru_maxrss: 391mb (delta=0mb) +2025-09-05T19:15:15Z INFO 1724 [BackendPassManager]: Output has 3 module(s), 3 function(s), 503 memory location(s), 3 block(s), and 390 instruction(s). Max writers: 32 Max Readers: 95 +2025-09-05T19:15:15Z USER 1724 [BackendPassManager]: Running mod_parallel_pass +2025-09-05T19:15:15Z INFO 1724 [BackendPassManager]: Inputs to mod_parallel_pass: modules=3 functions=3 allocs=503 blocks=3 instructions=390 Max writers: 32 Max Readers: 95 +2025-09-05T19:15:15Z USER 1724 (sg00) [ModuleForkPass]: Running expand_replication +2025-09-05T19:15:15Z USER 1724 (sg01) [ModuleForkPass]: Running expand_replication +2025-09-05T19:15:15Z USER 1724 (sg02) [ModuleForkPass]: Running expand_replication +2025-09-05T19:15:15Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=177 blocks=1 instructions=313 Max writers: 32 Max Readers: 95 +2025-09-05T19:15:15Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=189 blocks=1 instructions=37 Max writers: 30 Max Readers: 29 +2025-09-05T19:15:15Z INFO 1724 (sg02) [ExpandReplication]: Found 0 replicated matmults +2025-09-05T19:15:15Z INFO 1724 (sg00) [ExpandReplication]: Found 0 replicated matmults +2025-09-05T19:15:15Z USER 1724 (sg00) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-09-05T19:15:15Z USER 1724 (sg02) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-09-05T19:15:15Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=137 blocks=1 instructions=40 Max writers: 4 Max Readers: 10 +2025-09-05T19:15:15Z INFO 1724 (sg01) [ExpandReplication]: Found 0 replicated matmults +2025-09-05T19:15:15Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 391mb, ru_maxrss: 391mb (delta=0mb) +2025-09-05T19:15:15Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 391mb, ru_maxrss: 391mb (delta=0mb) +2025-09-05T19:15:15Z USER 1724 (sg01) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-09-05T19:15:15Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 391mb, ru_maxrss: 391mb (delta=0mb) +2025-09-05T19:15:15Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 177 memory location(s), 1 block(s), and 313 instruction(s). Max writers: 32 Max Readers: 95 +2025-09-05T19:15:15Z USER 1724 (sg02) [ModuleForkPass]: Running unroll +2025-09-05T19:15:15Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 189 memory location(s), 1 block(s), and 37 instruction(s). Max writers: 30 Max Readers: 29 +2025-09-05T19:15:15Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=177 blocks=1 instructions=313 Max writers: 32 Max Readers: 95 +2025-09-05T19:15:15Z USER 1724 (sg00) [ModuleForkPass]: Running unroll +2025-09-05T19:15:15Z INFO 1724 (sg02) [Unroll]: INFO (Unroll) Start unrolling at Fri Sep 5 19:15:15 2025 +2025-09-05T19:15:15Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 137 memory location(s), 1 block(s), and 40 instruction(s). Max writers: 4 Max Readers: 10 +2025-09-05T19:15:15Z USER 1724 (sg01) [ModuleForkPass]: Running unroll +2025-09-05T19:15:15Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=189 blocks=1 instructions=37 Max writers: 30 Max Readers: 29 +2025-09-05T19:15:15Z INFO 1724 (sg00) [Unroll]: INFO (Unroll) Start unrolling at Fri Sep 5 19:15:15 2025 +2025-09-05T19:15:15Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=137 blocks=1 instructions=40 Max writers: 4 Max Readers: 10 +2025-09-05T19:15:15Z INFO 1724 (sg01) [Unroll]: INFO (Unroll) Start unrolling at Fri Sep 5 19:15:15 2025 +2025-09-05T19:15:15Z INFO 1724 (sg00) [Unroll]: INFO (Unroll) DONE unrolling Fri Sep 5 19:15:15 2025 + +2025-09-05T19:15:15Z INFO 1724 (sg00) [Unroll]: sg0000 Instruction count after Unroll: +2025-09-05T19:15:15Z INFO 1724 (sg00) [Unroll]: Total count: 20602 +2025-09-05T19:15:15Z INFO 1724 (sg00) [Unroll]: Matmult: 10320 +2025-09-05T19:15:15Z INFO 1724 (sg00) [Unroll]: TensorScalarPtr: 2566 +2025-09-05T19:15:15Z INFO 1724 (sg00) [Unroll]: GenericCopy: 1826 +2025-09-05T19:15:15Z INFO 1724 (sg00) [Unroll]: TensorReduce: 1536 +2025-09-05T19:15:15Z INFO 1724 (sg00) [Unroll]: TensorTensor: 1208 +2025-09-05T19:15:15Z INFO 1724 (sg00) [Unroll]: Activation: 940 +2025-09-05T19:15:15Z INFO 1724 (sg00) [Unroll]: Memset: 775 +2025-09-05T19:15:15Z INFO 1724 (sg00) [Unroll]: TensorScalarAffineSelect: 640 +2025-09-05T19:15:15Z INFO 1724 (sg00) [Unroll]: Load: 294 +2025-09-05T19:15:15Z INFO 1724 (sg00) [Unroll]: DMACopy: 163 +2025-09-05T19:15:15Z INFO 1724 (sg00) [Unroll]: Save: 133 +2025-09-05T19:15:15Z INFO 1724 (sg00) [Unroll]: Reciprocal: 128 +2025-09-05T19:15:15Z INFO 1724 (sg00) [Unroll]: Iota: 64 +2025-09-05T19:15:15Z INFO 1724 (sg00) [Unroll]: CollectiveCompute: 4 +2025-09-05T19:15:15Z INFO 1724 (sg00) [Unroll]: StreamShuffle: 4 +2025-09-05T19:15:15Z INFO 1724 (sg00) [Unroll]: Select: 1 +2025-09-05T19:15:15Z INFO 1724 (sg00) [Unroll]: Unrolled DGE count with Dynamic AP: 160 +2025-09-05T19:15:15Z USER 1724 (sg00) [ModuleForkPass]: unroll finished after 0.485 seconds +2025-09-05T19:15:15Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 776mb, ru_maxrss: 776mb (delta=385mb) +2025-09-05T19:15:15Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11749 memory location(s), 1 block(s), and 20602 instruction(s). Max writers: 64 Max Readers: 3776 +2025-09-05T19:15:15Z INFO 1724 (sg02) [Unroll]: INFO (Unroll) DONE unrolling Fri Sep 5 19:15:15 2025 + +2025-09-05T19:15:15Z INFO 1724 (sg02) [Unroll]: sg0002 Instruction count after Unroll: +2025-09-05T19:15:15Z INFO 1724 (sg02) [Unroll]: Total count: 35073 +2025-09-05T19:15:15Z INFO 1724 (sg02) [Unroll]: Matmult: 27521 +2025-09-05T19:15:15Z INFO 1724 (sg02) [Unroll]: Load: 4860 +2025-09-05T19:15:15Z INFO 1724 (sg02) [Unroll]: GenericCopy: 902 +2025-09-05T19:15:15Z INFO 1724 (sg02) [Unroll]: TensorTensor: 625 +2025-09-05T19:15:15Z INFO 1724 (sg02) [Unroll]: Activation: 491 +2025-09-05T19:15:15Z INFO 1724 (sg02) [Unroll]: TensorScalarPtr: 281 +2025-09-05T19:15:15Z INFO 1724 (sg02) [Unroll]: Save: 142 +2025-09-05T19:15:15Z INFO 1724 (sg02) [Unroll]: Max: 64 +2025-09-05T19:15:15Z INFO 1724 (sg02) [Unroll]: MaxIndex: 64 +2025-09-05T19:15:15Z INFO 1724 (sg02) [Unroll]: MatchReplace: 62 +2025-09-05T19:15:15Z INFO 1724 (sg02) [Unroll]: Memset: 20 +2025-09-05T19:15:15Z INFO 1724 (sg02) [Unroll]: TensorReduce: 14 +2025-09-05T19:15:15Z INFO 1724 (sg02) [Unroll]: Select: 6 +2025-09-05T19:15:15Z INFO 1724 (sg02) [Unroll]: Iota: 5 +2025-09-05T19:15:15Z INFO 1724 (sg02) [Unroll]: CollectiveCompute: 4 +2025-09-05T19:15:15Z INFO 1724 (sg02) [Unroll]: StreamShuffle: 4 +2025-09-05T19:15:15Z INFO 1724 (sg02) [Unroll]: Gather: 3 +2025-09-05T19:15:15Z INFO 1724 (sg02) [Unroll]: Reciprocal: 3 +2025-09-05T19:15:15Z INFO 1724 (sg02) [Unroll]: DMACopy: 2 +2025-09-05T19:15:15Z INFO 1724 (sg02) [Unroll]: Unrolled DGE count with Dynamic AP: 1 +2025-09-05T19:15:15Z USER 1724 (sg02) [ModuleForkPass]: unroll finished after 0.664 seconds +2025-09-05T19:15:15Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 825mb, ru_maxrss: 825mb (delta=434mb) +2025-09-05T19:15:15Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5285 memory location(s), 1 block(s), and 35073 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:15Z INFO 1724 (sg01) [Unroll]: INFO (Unroll) DONE unrolling Fri Sep 5 19:15:15 2025 + +2025-09-05T19:15:15Z INFO 1724 (sg01) [Unroll]: sg0001 Instruction count after Unroll: +2025-09-05T19:15:15Z INFO 1724 (sg01) [Unroll]: Total count: 50675 +2025-09-05T19:15:15Z INFO 1724 (sg01) [Unroll]: Matmult: 34912 +2025-09-05T19:15:15Z INFO 1724 (sg01) [Unroll]: Load: 4820 +2025-09-05T19:15:15Z INFO 1724 (sg01) [Unroll]: TensorScalarPtr: 2634 +2025-09-05T19:15:15Z INFO 1724 (sg01) [Unroll]: TensorReduce: 2304 +2025-09-05T19:15:15Z INFO 1724 (sg01) [Unroll]: GenericCopy: 2204 +2025-09-05T19:15:15Z INFO 1724 (sg01) [Unroll]: Activation: 1512 +2025-09-05T19:15:15Z INFO 1724 (sg01) [Unroll]: Select: 1025 +2025-09-05T19:15:15Z INFO 1724 (sg01) [Unroll]: TensorTensor: 848 +2025-09-05T19:15:15Z INFO 1724 (sg01) [Unroll]: Save: 137 +2025-09-05T19:15:15Z INFO 1724 (sg01) [Unroll]: DMACopy: 131 +2025-09-05T19:15:15Z INFO 1724 (sg01) [Unroll]: Reciprocal: 128 +2025-09-05T19:15:15Z INFO 1724 (sg01) [Unroll]: Memset: 12 +2025-09-05T19:15:15Z INFO 1724 (sg01) [Unroll]: CollectiveCompute: 4 +2025-09-05T19:15:15Z INFO 1724 (sg01) [Unroll]: StreamShuffle: 4 +2025-09-05T19:15:15Z INFO 1724 (sg01) [Unroll]: Unrolled DGE count with Dynamic AP: 128 +2025-09-05T19:15:15Z USER 1724 (sg01) [ModuleForkPass]: unroll finished after 0.964 seconds +2025-09-05T19:15:15Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 823mb, ru_maxrss: 825mb (delta=434mb) +2025-09-05T19:15:16Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13073 memory location(s), 1 block(s), and 50675 instruction(s). Max writers: 64 Max Readers: 5984 +2025-09-05T19:15:16Z USER 1724 [ModuleForkPass]: Compilation status: Total modules: 3, Passed: 3, Failed: 0 +2025-09-05T19:15:16Z USER 1724 [BackendPassManager]: mod_parallel_pass finished after 0.994 seconds +2025-09-05T19:15:16Z INFO 1724 [BackendPassManager]: curr_vmrss: 671mb, ru_maxrss: 825mb (delta=434mb) +2025-09-05T19:15:16Z INFO 1724 [BackendPassManager]: Output has 3 module(s), 3 function(s), 30107 memory location(s), 3 block(s), and 106350 instruction(s). Max writers: 64 Max Readers: 5984 +2025-09-05T19:15:16Z USER 1724 [BackendPassManager]: Running subgraph_parallel_pass +2025-09-05T19:15:16Z INFO 1724 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=3 functions=3 allocs=30107 blocks=3 instructions=106350 Max writers: 64 Max Readers: 5984 +2025-09-05T19:15:16Z USER 1724 (sg00) [SubgraphForkPass]: Running dead_code_elim +2025-09-05T19:15:16Z USER 1724 (sg01) [SubgraphForkPass]: Running dead_code_elim +2025-09-05T19:15:16Z USER 1724 (sg02) [SubgraphForkPass]: Running dead_code_elim +2025-09-05T19:15:16Z INFO 1724 (sg02) [SubgraphForkPass]: Inputs to dead_code_elim: modules=1 functions=1 allocs=5285 blocks=1 instructions=35073 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:16Z INFO 1724 (sg01) [SubgraphForkPass]: Inputs to dead_code_elim: modules=1 functions=1 allocs=13073 blocks=1 instructions=50675 Max writers: 64 Max Readers: 5984 +2025-09-05T19:15:16Z INFO 1724 (sg00) [SubgraphForkPass]: Inputs to dead_code_elim: modules=1 functions=1 allocs=11749 blocks=1 instructions=20602 Max writers: 64 Max Readers: 3776 +2025-09-05T19:15:16Z INFO 1724 (sg00) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-09-05T19:15:16Z INFO 1724 (sg00) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-09-05T19:15:16Z INFO 1724 (sg00) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-09-05T19:15:16Z INFO 1724 (sg00) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-09-05T19:15:16Z USER 1724 (sg00) [SubgraphForkPass]: dead_code_elim finished after 0.078 seconds +2025-09-05T19:15:16Z INFO 1724 (sg00) [SubgraphForkPass]: curr_vmrss: 702mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:16Z INFO 1724 (sg00) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 9178 memory location(s), 1 block(s), and 20176 instruction(s). Max writers: 64 Max Readers: 3744 +2025-09-05T19:15:16Z INFO 1724 (sg02) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-09-05T19:15:16Z INFO 1724 (sg01) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-09-05T19:15:16Z INFO 1724 (sg02) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-09-05T19:15:16Z INFO 1724 (sg02) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-09-05T19:15:16Z INFO 1724 (sg02) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-09-05T19:15:16Z USER 1724 (sg02) [SubgraphForkPass]: dead_code_elim finished after 0.177 seconds +2025-09-05T19:15:16Z INFO 1724 (sg02) [SubgraphForkPass]: curr_vmrss: 727mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:16Z INFO 1724 (sg02) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 5238 memory location(s), 1 block(s), and 35073 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:16Z INFO 1724 (sg01) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-09-05T19:15:16Z INFO 1724 (sg01) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-09-05T19:15:16Z INFO 1724 (sg01) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-09-05T19:15:16Z USER 1724 (sg01) [SubgraphForkPass]: dead_code_elim finished after 0.195 seconds +2025-09-05T19:15:16Z INFO 1724 (sg01) [SubgraphForkPass]: curr_vmrss: 727mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:16Z INFO 1724 (sg01) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 12976 memory location(s), 1 block(s), and 50675 instruction(s). Max writers: 64 Max Readers: 5984 +2025-09-05T19:15:16Z USER 1724 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-09-05T19:15:16Z USER 1724 [BackendPassManager]: subgraph_parallel_pass finished after 0.197 seconds +2025-09-05T19:15:16Z INFO 1724 [BackendPassManager]: curr_vmrss: 727mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:16Z INFO 1724 [BackendPassManager]: Output has 3 module(s), 3 function(s), 27392 memory location(s), 3 block(s), and 105924 instruction(s). Max writers: 64 Max Readers: 5984 +2025-09-05T19:15:16Z USER 1724 [BackendPassManager]: Running mod_parallel_pass +2025-09-05T19:15:16Z INFO 1724 [BackendPassManager]: Inputs to mod_parallel_pass: modules=3 functions=3 allocs=27392 blocks=3 instructions=105924 Max writers: 64 Max Readers: 5984 +2025-09-05T19:15:16Z USER 1724 (sg00) [ModuleForkPass]: Running birverifier +2025-09-05T19:15:16Z USER 1724 (sg01) [ModuleForkPass]: Running birverifier +2025-09-05T19:15:16Z USER 1724 (sg02) [ModuleForkPass]: Running birverifier +2025-09-05T19:15:16Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=5238 blocks=1 instructions=35073 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:16Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=12976 blocks=1 instructions=50675 Max writers: 64 Max Readers: 5984 +2025-09-05T19:15:16Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=9178 blocks=1 instructions=20176 Max writers: 64 Max Readers: 3744 +2025-09-05T19:15:16Z USER 1724 (sg00) [ModuleForkPass]: birverifier finished after 0.062 seconds +2025-09-05T19:15:16Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 734mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:16Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9178 memory location(s), 1 block(s), and 20176 instruction(s). Max writers: 64 Max Readers: 3744 +2025-09-05T19:15:16Z USER 1724 (sg02) [ModuleForkPass]: birverifier finished after 0.065 seconds +2025-09-05T19:15:16Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 735mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:16Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5238 memory location(s), 1 block(s), and 35073 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:16Z USER 1724 (sg01) [ModuleForkPass]: birverifier finished after 0.080 seconds +2025-09-05T19:15:16Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 752mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:16Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12976 memory location(s), 1 block(s), and 50675 instruction(s). Max writers: 64 Max Readers: 5984 +2025-09-05T19:15:16Z USER 1724 [ModuleForkPass]: Compilation status: Total modules: 3, Passed: 3, Failed: 0 +2025-09-05T19:15:16Z USER 1724 [BackendPassManager]: mod_parallel_pass finished after 0.083 seconds +2025-09-05T19:15:16Z INFO 1724 [BackendPassManager]: curr_vmrss: 747mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:16Z INFO 1724 [BackendPassManager]: Output has 3 module(s), 3 function(s), 27392 memory location(s), 3 block(s), and 105924 instruction(s). Max writers: 64 Max Readers: 5984 +2025-09-05T19:15:16Z USER 1724 [BackendPassManager]: Running subgraph_parallel_pass +2025-09-05T19:15:16Z INFO 1724 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=3 functions=3 allocs=27392 blocks=3 instructions=105924 Max writers: 64 Max Readers: 5984 +2025-09-05T19:15:16Z USER 1724 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-09-05T19:15:16Z USER 1724 (sg01) [SubgraphForkPass]: Running lnc_verifier +2025-09-05T19:15:16Z USER 1724 (sg02) [SubgraphForkPass]: Running lnc_verifier +2025-09-05T19:15:16Z INFO 1724 (sg02) [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=5238 blocks=1 instructions=35073 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:16Z USER 1724 (sg02) [SubgraphForkPass]: lnc_verifier finished after 0.001 seconds +2025-09-05T19:15:16Z INFO 1724 (sg02) [SubgraphForkPass]: curr_vmrss: 747mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:16Z INFO 1724 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=9178 blocks=1 instructions=20176 Max writers: 64 Max Readers: 3744 +2025-09-05T19:15:16Z USER 1724 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.001 seconds +2025-09-05T19:15:16Z INFO 1724 (sg00) [SubgraphForkPass]: curr_vmrss: 747mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:16Z INFO 1724 (sg01) [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=12976 blocks=1 instructions=50675 Max writers: 64 Max Readers: 5984 +2025-09-05T19:15:16Z INFO 1724 (sg02) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 5238 memory location(s), 1 block(s), and 35073 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:16Z USER 1724 (sg01) [SubgraphForkPass]: lnc_verifier finished after 0.001 seconds +2025-09-05T19:15:16Z INFO 1724 (sg01) [SubgraphForkPass]: curr_vmrss: 747mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:16Z INFO 1724 (sg01) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 12976 memory location(s), 1 block(s), and 50675 instruction(s). Max writers: 64 Max Readers: 5984 +2025-09-05T19:15:16Z INFO 1724 (sg00) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 9178 memory location(s), 1 block(s), and 20176 instruction(s). Max writers: 64 Max Readers: 3744 +2025-09-05T19:15:16Z USER 1724 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-09-05T19:15:16Z USER 1724 [BackendPassManager]: subgraph_parallel_pass finished after 0.004 seconds +2025-09-05T19:15:16Z INFO 1724 [BackendPassManager]: curr_vmrss: 747mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:16Z INFO 1724 [BackendPassManager]: Output has 3 module(s), 3 function(s), 27392 memory location(s), 3 block(s), and 105924 instruction(s). Max writers: 64 Max Readers: 5984 +2025-09-05T19:15:16Z USER 1724 [BackendPassManager]: Running mod_parallel_pass +2025-09-05T19:15:16Z INFO 1724 [BackendPassManager]: Inputs to mod_parallel_pass: modules=3 functions=3 allocs=27392 blocks=3 instructions=105924 Max writers: 64 Max Readers: 5984 +2025-09-05T19:15:16Z USER 1724 (sg00) [ModuleForkPass]: Running instruction_reorder +2025-09-05T19:15:16Z USER 1724 (sg02) [ModuleForkPass]: Running instruction_reorder +2025-09-05T19:15:16Z USER 1724 (sg01) [ModuleForkPass]: Running instruction_reorder +2025-09-05T19:15:16Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=5238 blocks=1 instructions=35073 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:16Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=9178 blocks=1 instructions=20176 Max writers: 64 Max Readers: 3744 +2025-09-05T19:15:16Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=12976 blocks=1 instructions=50675 Max writers: 64 Max Readers: 5984 +2025-09-05T19:15:16Z USER 1724 (sg00) [ModuleForkPass]: instruction_reorder finished after 0.003 seconds +2025-09-05T19:15:16Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 747mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:16Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9178 memory location(s), 1 block(s), and 20176 instruction(s). Max writers: 64 Max Readers: 3744 +2025-09-05T19:15:16Z USER 1724 (sg00) [ModuleForkPass]: Running psum_legalization +2025-09-05T19:15:16Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=9178 blocks=1 instructions=20176 Max writers: 64 Max Readers: 3744 +2025-09-05T19:15:16Z USER 1724 (sg02) [ModuleForkPass]: instruction_reorder finished after 0.005 seconds +2025-09-05T19:15:16Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 747mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:16Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5238 memory location(s), 1 block(s), and 35073 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:16Z USER 1724 (sg02) [ModuleForkPass]: Running psum_legalization +2025-09-05T19:15:16Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=5238 blocks=1 instructions=35073 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:16Z USER 1724 (sg01) [ModuleForkPass]: instruction_reorder finished after 0.007 seconds +2025-09-05T19:15:16Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 747mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:16Z USER 1724 (sg00) [ModuleForkPass]: psum_legalization finished after 0.003 seconds +2025-09-05T19:15:16Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 747mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:16Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9178 memory location(s), 1 block(s), and 20176 instruction(s). Max writers: 64 Max Readers: 3744 +2025-09-05T19:15:16Z USER 1724 (sg00) [ModuleForkPass]: Running legalize_cce_dma +2025-09-05T19:15:16Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12976 memory location(s), 1 block(s), and 50675 instruction(s). Max writers: 64 Max Readers: 5984 +2025-09-05T19:15:16Z USER 1724 (sg01) [ModuleForkPass]: Running psum_legalization +2025-09-05T19:15:16Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=9178 blocks=1 instructions=20176 Max writers: 64 Max Readers: 3744 +2025-09-05T19:15:16Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=12976 blocks=1 instructions=50675 Max writers: 64 Max Readers: 5984 +2025-09-05T19:15:16Z USER 1724 (sg02) [ModuleForkPass]: psum_legalization finished after 0.003 seconds +2025-09-05T19:15:16Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 747mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:16Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5238 memory location(s), 1 block(s), and 35073 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:16Z USER 1724 (sg02) [ModuleForkPass]: Running legalize_cce_dma +2025-09-05T19:15:16Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=5238 blocks=1 instructions=35073 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:16Z USER 1724 (sg00) [ModuleForkPass]: legalize_cce_dma finished after 0.002 seconds +2025-09-05T19:15:16Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 747mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:16Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9178 memory location(s), 1 block(s), and 20176 instruction(s). Max writers: 64 Max Readers: 3744 +2025-09-05T19:15:16Z USER 1724 (sg00) [ModuleForkPass]: Running pre_opts +2025-09-05T19:15:16Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=9178 blocks=1 instructions=20176 Max writers: 64 Max Readers: 3744 +2025-09-05T19:15:16Z INFO 1724 (sg00) [PreOpts]: Skipped. No pre-opt passes enabled +2025-09-05T19:15:16Z USER 1724 (sg00) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-09-05T19:15:16Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 747mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:16Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9178 memory location(s), 1 block(s), and 20176 instruction(s). Max writers: 64 Max Readers: 3744 +2025-09-05T19:15:16Z USER 1724 (sg00) [ModuleForkPass]: Running error_injector +2025-09-05T19:15:16Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=9178 blocks=1 instructions=20176 Max writers: 64 Max Readers: 3744 +2025-09-05T19:15:16Z WARNING 1724 (sg00) [ErrorInjector]: Unrecognized injected error value "0" +2025-09-05T19:15:16Z USER 1724 (sg00) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-09-05T19:15:16Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 747mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:16Z USER 1724 (sg02) [ModuleForkPass]: legalize_cce_dma finished after 0.002 seconds +2025-09-05T19:15:16Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9178 memory location(s), 1 block(s), and 20176 instruction(s). Max writers: 64 Max Readers: 3744 +2025-09-05T19:15:16Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 747mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:16Z USER 1724 (sg00) [ModuleForkPass]: Running vn_splitter +2025-09-05T19:15:16Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5238 memory location(s), 1 block(s), and 35073 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:16Z USER 1724 (sg02) [ModuleForkPass]: Running pre_opts +2025-09-05T19:15:16Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=9178 blocks=1 instructions=20176 Max writers: 64 Max Readers: 3744 +2025-09-05T19:15:16Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=5238 blocks=1 instructions=35073 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:16Z INFO 1724 (sg02) [PreOpts]: Skipped. No pre-opt passes enabled +2025-09-05T19:15:16Z USER 1724 (sg02) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-09-05T19:15:16Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 747mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:16Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5238 memory location(s), 1 block(s), and 35073 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:16Z USER 1724 (sg02) [ModuleForkPass]: Running error_injector +2025-09-05T19:15:16Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=5238 blocks=1 instructions=35073 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:16Z WARNING 1724 (sg02) [ErrorInjector]: Unrecognized injected error value "0" +2025-09-05T19:15:16Z USER 1724 (sg02) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-09-05T19:15:16Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 747mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:16Z INFO 1724 (sg00) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 6 +2025-09-05T19:15:16Z INFO 1724 (sg00) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-09-05T19:15:16Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5238 memory location(s), 1 block(s), and 35073 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:16Z USER 1724 (sg02) [ModuleForkPass]: Running vn_splitter +2025-09-05T19:15:16Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=5238 blocks=1 instructions=35073 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:16Z INFO 1724 (sg02) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 45 +2025-09-05T19:15:16Z INFO 1724 (sg02) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-09-05T19:15:16Z USER 1724 (sg01) [ModuleForkPass]: psum_legalization finished after 0.005 seconds +2025-09-05T19:15:16Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 747mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:16Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12976 memory location(s), 1 block(s), and 50675 instruction(s). Max writers: 64 Max Readers: 5984 +2025-09-05T19:15:16Z USER 1724 (sg01) [ModuleForkPass]: Running legalize_cce_dma +2025-09-05T19:15:16Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=12976 blocks=1 instructions=50675 Max writers: 64 Max Readers: 5984 +2025-09-05T19:15:16Z USER 1724 (sg01) [ModuleForkPass]: legalize_cce_dma finished after 0.003 seconds +2025-09-05T19:15:16Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 749mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:16Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12976 memory location(s), 1 block(s), and 50675 instruction(s). Max writers: 64 Max Readers: 5984 +2025-09-05T19:15:16Z USER 1724 (sg01) [ModuleForkPass]: Running pre_opts +2025-09-05T19:15:16Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=12976 blocks=1 instructions=50675 Max writers: 64 Max Readers: 5984 +2025-09-05T19:15:16Z INFO 1724 (sg01) [PreOpts]: Skipped. No pre-opt passes enabled +2025-09-05T19:15:16Z USER 1724 (sg01) [ModuleForkPass]: pre_opts finished after 0.001 seconds +2025-09-05T19:15:16Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 749mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:16Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12976 memory location(s), 1 block(s), and 50675 instruction(s). Max writers: 64 Max Readers: 5984 +2025-09-05T19:15:16Z USER 1724 (sg01) [ModuleForkPass]: Running error_injector +2025-09-05T19:15:16Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=12976 blocks=1 instructions=50675 Max writers: 64 Max Readers: 5984 +2025-09-05T19:15:16Z WARNING 1724 (sg01) [ErrorInjector]: Unrecognized injected error value "0" +2025-09-05T19:15:16Z USER 1724 (sg01) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-09-05T19:15:16Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 749mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:16Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12976 memory location(s), 1 block(s), and 50675 instruction(s). Max writers: 64 Max Readers: 5984 +2025-09-05T19:15:16Z USER 1724 (sg01) [ModuleForkPass]: Running vn_splitter +2025-09-05T19:15:16Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=12976 blocks=1 instructions=50675 Max writers: 64 Max Readers: 5984 +2025-09-05T19:15:16Z INFO 1724 (sg01) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 39 +2025-09-05T19:15:16Z INFO 1724 (sg01) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-09-05T19:15:16Z INFO 1724 (sg02) [ShrinkDN]: INFO (ShrinkDN): Shrunk 2 nodes. Total savings 14336 bytes/partition +2025-09-05T19:15:16Z INFO 1724 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-09-05T19:15:16Z INFO 1724 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-09-05T19:15:16Z INFO 1724 (sg02) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-09-05T19:15:16Z INFO 1724 (sg02) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.007 seconds +2025-09-05T19:15:16Z INFO 1724 (sg02) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.007 seconds +2025-09-05T19:15:16Z USER 1724 (sg02) [ModuleForkPass]: vn_splitter finished after 0.021 seconds +2025-09-05T19:15:16Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 751mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:16Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5238 memory location(s), 1 block(s), and 35073 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:16Z USER 1724 (sg02) [ModuleForkPass]: Running constant_propagate +2025-09-05T19:15:16Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=5238 blocks=1 instructions=35073 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:16Z INFO 1724 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-09-05T19:15:16Z INFO 1724 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-09-05T19:15:16Z INFO 1724 (sg00) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-09-05T19:15:16Z INFO 1724 (sg00) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.012 seconds +2025-09-05T19:15:16Z INFO 1724 (sg00) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.006 seconds +2025-09-05T19:15:16Z USER 1724 (sg00) [ModuleForkPass]: vn_splitter finished after 0.025 seconds +2025-09-05T19:15:16Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 751mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:16Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9178 memory location(s), 1 block(s), and 20176 instruction(s). Max writers: 64 Max Readers: 3744 +2025-09-05T19:15:16Z USER 1724 (sg00) [ModuleForkPass]: Running constant_propagate +2025-09-05T19:15:16Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=9178 blocks=1 instructions=20176 Max writers: 64 Max Readers: 3744 +2025-09-05T19:15:16Z INFO 1724 (sg02) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-09-05T19:15:16Z INFO 1724 (sg00) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-09-05T19:15:16Z INFO 1724 (sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-09-05T19:15:16Z INFO 1724 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-09-05T19:15:16Z INFO 1724 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-09-05T19:15:16Z INFO 1724 (sg01) [VNSplitterPass]: INFO (VNSplitter) Time: 0.001 seconds +2025-09-05T19:15:16Z INFO 1724 (sg01) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.016 seconds +2025-09-05T19:15:16Z INFO 1724 (sg01) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.011 seconds +2025-09-05T19:15:16Z USER 1724 (sg01) [ModuleForkPass]: vn_splitter finished after 0.039 seconds +2025-09-05T19:15:16Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 752mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:16Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12976 memory location(s), 1 block(s), and 50675 instruction(s). Max writers: 64 Max Readers: 5984 +2025-09-05T19:15:16Z USER 1724 (sg01) [ModuleForkPass]: Running constant_propagate +2025-09-05T19:15:16Z INFO 1724 (sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-09-05T19:15:16Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=12976 blocks=1 instructions=50675 Max writers: 64 Max Readers: 5984 +2025-09-05T19:15:16Z INFO 1724 (sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-09-05T19:15:16Z INFO 1724 (sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-09-05T19:15:16Z INFO 1724 (sg01) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-09-05T19:15:16Z INFO 1724 (sg00) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 384 +2025-09-05T19:15:16Z INFO 1724 (sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-09-05T19:15:16Z INFO 1724 (sg02) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-09-05T19:15:16Z INFO 1724 (sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-09-05T19:15:16Z INFO 1724 (sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-09-05T19:15:16Z INFO 1724 (sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-09-05T19:15:16Z USER 1724 (sg00) [ModuleForkPass]: constant_propagate finished after 0.100 seconds +2025-09-05T19:15:16Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 751mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:16Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 8794 memory location(s), 1 block(s), and 19792 instruction(s). Max writers: 64 Max Readers: 3744 +2025-09-05T19:15:16Z USER 1724 (sg00) [ModuleForkPass]: Running lower_ac +2025-09-05T19:15:16Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=8794 blocks=1 instructions=19792 Max writers: 64 Max Readers: 3744 +2025-09-05T19:15:16Z INFO 1724 (sg00) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-09-05T19:15:16Z USER 1724 (sg00) [ModuleForkPass]: lower_ac finished after 0.003 seconds +2025-09-05T19:15:16Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 750mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:16Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 8794 memory location(s), 1 block(s), and 19792 instruction(s). Max writers: 64 Max Readers: 3744 +2025-09-05T19:15:16Z USER 1724 (sg00) [ModuleForkPass]: Running input_dma_coalescing +2025-09-05T19:15:16Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=8794 blocks=1 instructions=19792 Max writers: 64 Max Readers: 3744 +2025-09-05T19:15:16Z INFO 1724 (sg02) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-09-05T19:15:16Z INFO 1724 (sg02) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-09-05T19:15:16Z INFO 1724 (sg02) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-09-05T19:15:16Z INFO 1724 (sg01) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-09-05T19:15:16Z INFO 1724 (sg00) [DMAOptimizationBase]: DMA input Coalescing combined 4 input loads +2025-09-05T19:15:16Z USER 1724 (sg00) [ModuleForkPass]: input_dma_coalescing finished after 0.065 seconds +2025-09-05T19:15:16Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 751mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:16Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 8792 memory location(s), 1 block(s), and 19790 instruction(s). Max writers: 64 Max Readers: 3744 +2025-09-05T19:15:16Z USER 1724 (sg00) [ModuleForkPass]: Running remat_optimization +2025-09-05T19:15:16Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=8792 blocks=1 instructions=19790 Max writers: 64 Max Readers: 3744 +2025-09-05T19:15:16Z INFO 1724 (sg02) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-09-05T19:15:16Z INFO 1724 (sg00) [RematOpt]: Remove a block of remat writer and replaced with reload transpose.16_pftranspose_1195-t1600_i0_SpillSave0_Reload +2025-09-05T19:15:16Z INFO 1724 (sg00) [RematOpt]: Remove a block of remat writer and replaced with reload transpose.16_pftranspose_1195-t1600_i1_SpillSave1_Reload +2025-09-05T19:15:16Z INFO 1724 (sg00) [RematOpt]: Remove a block of remat writer and replaced with reload transpose.16_pftranspose_1195-t1600_i4_SpillSave2_Reload +2025-09-05T19:15:16Z INFO 1724 (sg00) [RematOpt]: Remove a block of remat writer and replaced with reload transpose.16_pftranspose_1195-t1600_i5_SpillSave3_Reload +2025-09-05T19:15:16Z INFO 1724 (sg00) [RematOpt]: Remove a block of remat writer and replaced with reload transpose.16_pftranspose_1195-t1600_i8_SpillSave4_Reload +2025-09-05T19:15:16Z INFO 1724 (sg00) [RematOpt]: Remove a block of remat writer and replaced with reload transpose.16_pftranspose_1195-t1600_i9_SpillSave5_Reload +2025-09-05T19:15:16Z INFO 1724 (sg00) [RematOpt]: Remove a block of remat writer and replaced with reload transpose.16_pftranspose_1195-t1600_i12_SpillSave6_Reload +2025-09-05T19:15:16Z INFO 1724 (sg00) [RematOpt]: Remove a block of remat writer and replaced with reload transpose.16_pftranspose_1195-t1600_i13_SpillSave7_Reload +2025-09-05T19:15:16Z INFO 1724 (sg00) [RematOpt]: Removed 8 remat instructions +2025-09-05T19:15:16Z USER 1724 (sg00) [ModuleForkPass]: remat_optimization finished after 0.014 seconds +2025-09-05T19:15:16Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 752mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:16Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 8800 memory location(s), 1 block(s), and 19798 instruction(s). Max writers: 64 Max Readers: 3744 +2025-09-05T19:15:16Z USER 1724 (sg00) [ModuleForkPass]: Running early_peephole_opts +2025-09-05T19:15:16Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to early_peephole_opts: modules=1 functions=1 allocs=8800 blocks=1 instructions=19798 Max writers: 64 Max Readers: 3744 +2025-09-05T19:15:16Z INFO 1724 (sg00) [EarlyPeepholeOpts]: PeepholeOpts enabled? ActivationAccumulate: true +2025-09-05T19:15:16Z INFO 1724 (sg01) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-09-05T19:15:16Z INFO 1724 (sg00) [EarlyPeepholeOpts]: Activation Accumulate: 640 +2025-09-05T19:15:16Z INFO 1724 (sg01) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-09-05T19:15:16Z USER 1724 (sg00) [ModuleForkPass]: early_peephole_opts finished after 0.090 seconds +2025-09-05T19:15:16Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 751mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:16Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 8800 memory location(s), 1 block(s), and 19798 instruction(s). Max writers: 64 Max Readers: 3744 +2025-09-05T19:15:16Z USER 1724 (sg00) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-09-05T19:15:16Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=8800 blocks=1 instructions=19798 Max writers: 64 Max Readers: 3744 +2025-09-05T19:15:16Z USER 1724 (sg00) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.002 seconds +2025-09-05T19:15:16Z INFO 1724 (sg01) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-09-05T19:15:16Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 750mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:16Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 8800 memory location(s), 1 block(s), and 19798 instruction(s). Max writers: 64 Max Readers: 3744 +2025-09-05T19:15:16Z USER 1724 (sg00) [ModuleForkPass]: Running infer_stream_ids +2025-09-05T19:15:16Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=8800 blocks=1 instructions=19798 Max writers: 64 Max Readers: 3744 +2025-09-05T19:15:16Z USER 1724 (sg00) [ModuleForkPass]: infer_stream_ids finished after 0.002 seconds +2025-09-05T19:15:16Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 750mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:16Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 8800 memory location(s), 1 block(s), and 19798 instruction(s). Max writers: 64 Max Readers: 3744 +2025-09-05T19:15:16Z USER 1724 (sg00) [ModuleForkPass]: Running pre_sched +2025-09-05T19:15:16Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=8800 blocks=1 instructions=19798 Max writers: 64 Max Readers: 3744 +2025-09-05T19:15:16Z INFO 1724 (sg00) [PreSched]: Start PRE scheduling 2 cores: 1 at: Fri Sep 5 19:15:16 2025 +2025-09-05T19:15:16Z INFO 1724 [LayerSpiller]: LayerSpill: Start... +2025-09-05T19:15:16Z INFO 1724 [LayerSpiller]: LayerSpill: Found 2 Splits CCs +2025-09-05T19:15:16Z INFO 1724 [LayerSpiller]: Grouped CCs to 1 clusters. +2025-09-05T19:15:16Z INFO 1724 (sg01) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-09-05T19:15:16Z INFO 1724 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-09-05T19:15:16Z INFO 1724 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-09-05T19:15:16Z INFO 1724 [LayerSpiller]: LayerSpill: Done. +2025-09-05T19:15:16Z INFO 1724 (sg00) [PreSched]: Start split live ranges Fri Sep 5 19:15:16 2025 +2025-09-05T19:15:16Z INFO 1724 (sg00) [PreSched]: Num_Splits: 0 +2025-09-05T19:15:16Z INFO 1724 (sg00) [PreSched]: End split live ranges Fri Sep 5 19:15:16 2025 +2025-09-05T19:15:16Z INFO 1724 (sg00) [PreSched]: Strt remove redundncies Fri Sep 5 19:15:16 2025 +2025-09-05T19:15:16Z INFO 1724 (sg00) [PreSched]: remove_redundant_memsets +2025-09-05T19:15:16Z INFO 1724 (sg02) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-09-05T19:15:16Z INFO 1724 (sg02) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-09-05T19:15:16Z INFO 1724 (sg02) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-09-05T19:15:16Z INFO 1724 (sg02) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-09-05T19:15:16Z USER 1724 (sg02) [ModuleForkPass]: constant_propagate finished after 0.380 seconds +2025-09-05T19:15:16Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 759mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:16Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5238 memory location(s), 1 block(s), and 35073 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:16Z USER 1724 (sg02) [ModuleForkPass]: Running lower_ac +2025-09-05T19:15:16Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=5238 blocks=1 instructions=35073 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:16Z INFO 1724 (sg00) [PreSched]: remove_redundant_memsets: 0 +2025-09-05T19:15:16Z INFO 1724 (sg00) [PreSched]: remove_redundant_loads +2025-09-05T19:15:16Z INFO 1724 (sg02) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-09-05T19:15:16Z USER 1724 (sg02) [ModuleForkPass]: lower_ac finished after 0.004 seconds +2025-09-05T19:15:16Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 760mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:16Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5238 memory location(s), 1 block(s), and 35073 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:16Z USER 1724 (sg02) [ModuleForkPass]: Running input_dma_coalescing +2025-09-05T19:15:16Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=5238 blocks=1 instructions=35073 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:16Z INFO 1724 (sg00) [PreSched]: remove_redundant_loads: 0 +2025-09-05T19:15:16Z INFO 1724 (sg00) [PreSched]: End remove redundncies Fri Sep 5 19:15:16 2025 +2025-09-05T19:15:16Z INFO 1724 (sg00) [PreSched]: Start DCE Fri Sep 5 19:15:16 2025 +2025-09-05T19:15:16Z INFO 1724 (sg02) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-09-05T19:15:16Z USER 1724 (sg02) [ModuleForkPass]: input_dma_coalescing finished after 0.012 seconds +2025-09-05T19:15:16Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 760mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:16Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5238 memory location(s), 1 block(s), and 35073 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:16Z USER 1724 (sg02) [ModuleForkPass]: Running remat_optimization +2025-09-05T19:15:16Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=5238 blocks=1 instructions=35073 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:16Z INFO 1724 (sg01) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-09-05T19:15:16Z INFO 1724 (sg00) [PreSched]: eliminateDeadStore removed 0 instructions +2025-09-05T19:15:16Z INFO 1724 (sg00) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-09-05T19:15:16Z INFO 1724 (sg02) [RematOpt]: Removed 0 remat instructions +2025-09-05T19:15:16Z USER 1724 (sg02) [ModuleForkPass]: remat_optimization finished after 0.020 seconds +2025-09-05T19:15:16Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 761mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:16Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5238 memory location(s), 1 block(s), and 35073 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:16Z USER 1724 (sg02) [ModuleForkPass]: Running early_peephole_opts +2025-09-05T19:15:16Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to early_peephole_opts: modules=1 functions=1 allocs=5238 blocks=1 instructions=35073 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:16Z INFO 1724 (sg02) [EarlyPeepholeOpts]: PeepholeOpts enabled? ActivationAccumulate: true +2025-09-05T19:15:16Z INFO 1724 (sg00) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-09-05T19:15:16Z INFO 1724 (sg00) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-09-05T19:15:16Z INFO 1724 (sg02) [EarlyPeepholeOpts]: Activation Accumulate: 0 +2025-09-05T19:15:16Z INFO 1724 (sg00) [PreSched]: End DCE Fri Sep 5 19:15:16 2025 +2025-09-05T19:15:16Z INFO 1724 (sg00) [PreSched]: Start build flow dependencies Fri Sep 5 19:15:16 2025 +2025-09-05T19:15:16Z INFO 1724 (sg00) [build_flow_deps]: Start build fdeps. Invocation: 1Fri Sep 5 19:15:16 2025 +2025-09-05T19:15:16Z INFO 1724 (sg00) [build_flow_deps]: Allocs: 8790 instructions: 19764 +2025-09-05T19:15:16Z USER 1724 (sg02) [ModuleForkPass]: early_peephole_opts finished after 0.049 seconds +2025-09-05T19:15:16Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 761mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:16Z INFO 1724 (sg01) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-09-05T19:15:16Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5238 memory location(s), 1 block(s), and 35073 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:16Z USER 1724 (sg02) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-09-05T19:15:16Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=5238 blocks=1 instructions=35073 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:16Z USER 1724 (sg02) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.002 seconds +2025-09-05T19:15:16Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 761mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:16Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5238 memory location(s), 1 block(s), and 35073 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:16Z USER 1724 (sg02) [ModuleForkPass]: Running infer_stream_ids +2025-09-05T19:15:16Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=5238 blocks=1 instructions=35073 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:16Z USER 1724 (sg02) [ModuleForkPass]: infer_stream_ids finished after 0.002 seconds +2025-09-05T19:15:16Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 761mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:16Z INFO 1724 (sg01) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-09-05T19:15:16Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5238 memory location(s), 1 block(s), and 35073 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:16Z USER 1724 (sg02) [ModuleForkPass]: Running pre_sched +2025-09-05T19:15:16Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=5238 blocks=1 instructions=35073 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:16Z INFO 1724 (sg02) [PreSched]: Start PRE scheduling 2 cores: 1 at: Fri Sep 5 19:15:16 2025 +2025-09-05T19:15:16Z INFO 1724 [LayerSpiller]: LayerSpill: Start... +2025-09-05T19:15:16Z INFO 1724 [LayerSpiller]: LayerSpill: Found 2 Splits CCs +2025-09-05T19:15:16Z INFO 1724 [LayerSpiller]: Grouped CCs to 1 clusters. +2025-09-05T19:15:16Z INFO 1724 (sg01) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-09-05T19:15:16Z USER 1724 (sg01) [ModuleForkPass]: constant_propagate finished after 0.418 seconds +2025-09-05T19:15:16Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 761mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:16Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12976 memory location(s), 1 block(s), and 50675 instruction(s). Max writers: 64 Max Readers: 5984 +2025-09-05T19:15:16Z USER 1724 (sg01) [ModuleForkPass]: Running lower_ac +2025-09-05T19:15:16Z INFO 1724 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-09-05T19:15:16Z INFO 1724 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-09-05T19:15:16Z INFO 1724 [LayerSpiller]: LayerSpill: Done. +2025-09-05T19:15:16Z INFO 1724 (sg02) [PreSched]: Start split live ranges Fri Sep 5 19:15:16 2025 +2025-09-05T19:15:16Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=12976 blocks=1 instructions=50675 Max writers: 64 Max Readers: 5984 +2025-09-05T19:15:16Z INFO 1724 (sg01) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-09-05T19:15:16Z USER 1724 (sg01) [ModuleForkPass]: lower_ac finished after 0.006 seconds +2025-09-05T19:15:16Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 762mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:16Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12976 memory location(s), 1 block(s), and 50675 instruction(s). Max writers: 64 Max Readers: 5984 +2025-09-05T19:15:16Z USER 1724 (sg01) [ModuleForkPass]: Running input_dma_coalescing +2025-09-05T19:15:16Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=12976 blocks=1 instructions=50675 Max writers: 64 Max Readers: 5984 +2025-09-05T19:15:16Z INFO 1724 (sg02) [PreSched]: Num_Splits: 8 +2025-09-05T19:15:16Z INFO 1724 (sg02) [PreSched]: End split live ranges Fri Sep 5 19:15:16 2025 +2025-09-05T19:15:16Z INFO 1724 (sg02) [PreSched]: Strt remove redundncies Fri Sep 5 19:15:16 2025 +2025-09-05T19:15:16Z INFO 1724 (sg02) [PreSched]: remove_redundant_memsets +2025-09-05T19:15:16Z INFO 1724 (sg02) [PreSched]: remove_redundant_memsets: 0 +2025-09-05T19:15:16Z INFO 1724 (sg02) [PreSched]: remove_redundant_loads +2025-09-05T19:15:16Z INFO 1724 (sg00) [build_flow_deps]: Build fdeps inserted 52376 edges +2025-09-05T19:15:16Z INFO 1724 (sg00) [build_flow_deps]: Done build fdeps 52376 Fri Sep 5 19:15:17 2025 +2025-09-05T19:15:16Z INFO 1724 (sg00) [PreSched]: End build flow dependencies Fri Sep 5 19:15:17 2025 +2025-09-05T19:15:16Z INFO 1724 (sg00) [PreSched]: Start remove useless insts Fri Sep 5 19:15:17 2025 +2025-09-05T19:15:17Z INFO 1724 (sg00) [PreSched]: remove_useless_insts +2025-09-05T19:15:17Z INFO 1724 (sg01) [DMAOptimizationBase]: DMA input Coalescing combined 4 input loads +2025-09-05T19:15:17Z USER 1724 (sg01) [ModuleForkPass]: input_dma_coalescing finished after 0.094 seconds +2025-09-05T19:15:17Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 770mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:17Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12974 memory location(s), 1 block(s), and 50673 instruction(s). Max writers: 64 Max Readers: 5984 +2025-09-05T19:15:17Z USER 1724 (sg01) [ModuleForkPass]: Running remat_optimization +2025-09-05T19:15:17Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=12974 blocks=1 instructions=50673 Max writers: 64 Max Readers: 5984 +2025-09-05T19:15:17Z INFO 1724 (sg00) [PreSched]: remove Useless Instructions: 0 +2025-09-05T19:15:17Z INFO 1724 (sg00) [PreSched]: End remove useless insts Fri Sep 5 19:15:17 2025 +2025-09-05T19:15:17Z INFO 1724 (sg00) [PreSched]: Start scratchpad optimization Fri Sep 5 19:15:17 2025 +2025-09-05T19:15:17Z INFO 1724 (sg02) [PreSched]: remove_redundant_loads: 0 +2025-09-05T19:15:17Z INFO 1724 (sg02) [PreSched]: End remove redundncies Fri Sep 5 19:15:17 2025 +2025-09-05T19:15:17Z INFO 1724 (sg02) [PreSched]: Start DCE Fri Sep 5 19:15:17 2025 +2025-09-05T19:15:17Z INFO 1724 (sg00) [PreSched]: End scratchpad optimization Fri Sep 5 19:15:17 2025 +2025-09-05T19:15:17Z INFO 1724 (sg01) [RematOpt]: Remove a block of remat writer and replaced with reload transpose.39_pftranspose_942-t1331_i0_SpillSave0_Reload +2025-09-05T19:15:17Z INFO 1724 (sg01) [RematOpt]: Remove a block of remat writer and replaced with reload transpose.39_pftranspose_942-t1331_i1_SpillSave1_Reload +2025-09-05T19:15:17Z INFO 1724 (sg01) [RematOpt]: Remove a block of remat writer and replaced with reload transpose.39_pftranspose_942-t1331_i2_SpillSave2_Reload +2025-09-05T19:15:17Z INFO 1724 (sg01) [RematOpt]: Remove a block of remat writer and replaced with reload transpose.39_pftranspose_942-t1331_i3_SpillSave3_Reload +2025-09-05T19:15:17Z INFO 1724 (sg01) [RematOpt]: Remove a block of remat writer and replaced with reload transpose.39_pftranspose_942-t1331_i4_SpillSave4_Reload +2025-09-05T19:15:17Z INFO 1724 (sg01) [RematOpt]: Remove a block of remat writer and replaced with reload transpose.39_pftranspose_942-t1331_i5_SpillSave5_Reload +2025-09-05T19:15:17Z INFO 1724 (sg01) [RematOpt]: Remove a block of remat writer and replaced with reload transpose.39_pftranspose_942-t1331_i6_SpillSave6_Reload +2025-09-05T19:15:17Z INFO 1724 (sg01) [RematOpt]: Remove a block of remat writer and replaced with reload transpose.39_pftranspose_942-t1331_i7_SpillSave7_Reload +2025-09-05T19:15:17Z INFO 1724 (sg01) [RematOpt]: Remove a block of remat writer and replaced with reload transpose.39_pftranspose_942-t1331_i8_SpillSave8_Reload +2025-09-05T19:15:17Z INFO 1724 (sg01) [RematOpt]: Remove a block of remat writer and replaced with reload transpose.39_pftranspose_942-t1331_i9_SpillSave9_Reload +2025-09-05T19:15:17Z INFO 1724 (sg01) [RematOpt]: Remove a block of remat writer and replaced with reload transpose.39_pftranspose_942-t1331_i10_SpillSave10_Reload +2025-09-05T19:15:17Z INFO 1724 (sg01) [RematOpt]: Remove a block of remat writer and replaced with reload transpose.39_pftranspose_942-t1331_i11_SpillSave11_Reload +2025-09-05T19:15:17Z INFO 1724 (sg01) [RematOpt]: Remove a block of remat writer and replaced with reload transpose.39_pftranspose_942-t1331_i12_SpillSave12_Reload +2025-09-05T19:15:17Z INFO 1724 (sg01) [RematOpt]: Remove a block of remat writer and replaced with reload transpose.39_pftranspose_942-t1331_i13_SpillSave13_Reload +2025-09-05T19:15:17Z INFO 1724 (sg01) [RematOpt]: Remove a block of remat writer and replaced with reload transpose.39_pftranspose_942-t1331_i14_SpillSave14_Reload +2025-09-05T19:15:17Z INFO 1724 (sg01) [RematOpt]: Remove a block of remat writer and replaced with reload transpose.39_pftranspose_942-t1331_i15_SpillSave15_Reload +2025-09-05T19:15:17Z INFO 1724 (sg01) [RematOpt]: Removed 16 remat instructions +2025-09-05T19:15:17Z USER 1724 (sg01) [ModuleForkPass]: remat_optimization finished after 0.086 seconds +2025-09-05T19:15:17Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 771mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:17Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12990 memory location(s), 1 block(s), and 50689 instruction(s). Max writers: 64 Max Readers: 5984 +2025-09-05T19:15:17Z USER 1724 (sg01) [ModuleForkPass]: Running early_peephole_opts +2025-09-05T19:15:17Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to early_peephole_opts: modules=1 functions=1 allocs=12990 blocks=1 instructions=50689 Max writers: 64 Max Readers: 5984 +2025-09-05T19:15:17Z INFO 1724 (sg01) [EarlyPeepholeOpts]: PeepholeOpts enabled? ActivationAccumulate: true +2025-09-05T19:15:17Z INFO 1724 (sg00) [PreSched]: DONE PRE scheduling Fri Sep 5 19:15:17 2025 +2025-09-05T19:15:17Z USER 1724 (sg00) [ModuleForkPass]: pre_sched finished after 0.397 seconds +2025-09-05T19:15:17Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 769mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:17Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 8790 memory location(s), 1 block(s), and 19764 instruction(s). Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:17Z USER 1724 (sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-09-05T19:15:17Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=8790 blocks=1 instructions=19764 Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:17Z INFO 1724 (sg00) [TensorCopyElim]: Tensor CP elimination: 0 +2025-09-05T19:15:17Z INFO 1724 (sg01) [EarlyPeepholeOpts]: Activation Accumulate: 1024 +2025-09-05T19:15:17Z INFO 1724 (sg02) [PreSched]: eliminateDeadStore removed 0 instructions +2025-09-05T19:15:17Z USER 1724 (sg01) [ModuleForkPass]: early_peephole_opts finished after 0.092 seconds +2025-09-05T19:15:17Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 768mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:17Z INFO 1724 (sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-09-05T19:15:17Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12990 memory location(s), 1 block(s), and 50689 instruction(s). Max writers: 64 Max Readers: 5984 +2025-09-05T19:15:17Z USER 1724 (sg01) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-09-05T19:15:17Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=12990 blocks=1 instructions=50689 Max writers: 64 Max Readers: 5984 +2025-09-05T19:15:17Z USER 1724 (sg01) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.004 seconds +2025-09-05T19:15:17Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 767mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:17Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12990 memory location(s), 1 block(s), and 50689 instruction(s). Max writers: 64 Max Readers: 5984 +2025-09-05T19:15:17Z USER 1724 (sg01) [ModuleForkPass]: Running infer_stream_ids +2025-09-05T19:15:17Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=12990 blocks=1 instructions=50689 Max writers: 64 Max Readers: 5984 +2025-09-05T19:15:17Z USER 1724 (sg01) [ModuleForkPass]: infer_stream_ids finished after 0.004 seconds +2025-09-05T19:15:17Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 767mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:17Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12990 memory location(s), 1 block(s), and 50689 instruction(s). Max writers: 64 Max Readers: 5984 +2025-09-05T19:15:17Z USER 1724 (sg01) [ModuleForkPass]: Running pre_sched +2025-09-05T19:15:17Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=12990 blocks=1 instructions=50689 Max writers: 64 Max Readers: 5984 +2025-09-05T19:15:17Z INFO 1724 (sg01) [PreSched]: Start PRE scheduling 2 cores: 1 at: Fri Sep 5 19:15:17 2025 +2025-09-05T19:15:17Z INFO 1724 [LayerSpiller]: LayerSpill: Start... +2025-09-05T19:15:17Z INFO 1724 (sg00) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-09-05T19:15:17Z INFO 1724 (sg02) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-09-05T19:15:17Z INFO 1724 (sg00) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-09-05T19:15:17Z INFO 1724 (sg00) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-09-05T19:15:17Z INFO 1724 [LayerSpiller]: LayerSpill: Found 4 Splits CCs +2025-09-05T19:15:17Z INFO 1724 [LayerSpiller]: Grouped CCs to 2 clusters. +2025-09-05T19:15:17Z INFO 1724 (sg02) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-09-05T19:15:17Z USER 1724 (sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.105 seconds +2025-09-05T19:15:17Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 767mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:17Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 8790 memory location(s), 1 block(s), and 19764 instruction(s). Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:17Z USER 1724 (sg00) [ModuleForkPass]: Running dynamic_dma_setup +2025-09-05T19:15:17Z INFO 1724 (sg02) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-09-05T19:15:17Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=8790 blocks=1 instructions=19764 Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:17Z USER 1724 (sg00) [ModuleForkPass]: dynamic_dma_setup finished after 0.001 seconds +2025-09-05T19:15:17Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 766mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:17Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 8791 memory location(s), 1 block(s), and 19764 instruction(s). Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:17Z USER 1724 (sg00) [ModuleForkPass]: Running runtime_memory_reservation +2025-09-05T19:15:17Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=8791 blocks=1 instructions=19764 Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:17Z USER 1724 (sg00) [ModuleForkPass]: runtime_memory_reservation finished after 0.001 seconds +2025-09-05T19:15:17Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 766mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:17Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 8791 memory location(s), 1 block(s), and 19764 instruction(s). Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:17Z USER 1724 (sg00) [ModuleForkPass]: Running coloring_allocator_psum +2025-09-05T19:15:17Z INFO 1724 (sg02) [PreSched]: End DCE Fri Sep 5 19:15:17 2025 +2025-09-05T19:15:17Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=8791 blocks=1 instructions=19764 Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:17Z INFO 1724 (sg00) [ColoringAllocator::Rep]: Allocating functions +2025-09-05T19:15:17Z INFO 1724 (sg00) [ColoringAllocator::Rep]: linearize and check +2025-09-05T19:15:17Z INFO 1724 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-09-05T19:15:17Z INFO 1724 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-09-05T19:15:17Z INFO 1724 [LayerSpiller]: LayerSpill: Done. +2025-09-05T19:15:17Z INFO 1724 (sg01) [PreSched]: Start split live ranges Fri Sep 5 19:15:17 2025 +2025-09-05T19:15:17Z INFO 1724 (sg02) [PreSched]: Start build flow dependencies Fri Sep 5 19:15:17 2025 +2025-09-05T19:15:17Z INFO 1724 (sg02) [build_flow_deps]: Start build fdeps. Invocation: 2Fri Sep 5 19:15:17 2025 +2025-09-05T19:15:17Z INFO 1724 (sg02) [build_flow_deps]: Allocs: 5254 instructions: 35089 +2025-09-05T19:15:17Z INFO 1724 (sg00) [PSUM_Allocator]: allocating PSUM +2025-09-05T19:15:17Z INFO 1724 (sg01) [PreSched]: Num_Splits: 0 +2025-09-05T19:15:17Z INFO 1724 (sg01) [PreSched]: End split live ranges Fri Sep 5 19:15:17 2025 +2025-09-05T19:15:17Z INFO 1724 (sg01) [PreSched]: Strt remove redundncies Fri Sep 5 19:15:17 2025 +2025-09-05T19:15:17Z INFO 1724 (sg01) [PreSched]: remove_redundant_memsets +2025-09-05T19:15:17Z INFO 1724 (sg00) [PSUM_Allocator]: main loop +2025-09-05T19:15:17Z INFO 1724 (sg00) [PSUM_Allocator]: renumber locations +2025-09-05T19:15:17Z INFO 1724 (sg00) [PSUM_Allocator]: size = 2562 +2025-09-05T19:15:17Z INFO 1724 (sg01) [PreSched]: remove_redundant_memsets: 0 +2025-09-05T19:15:17Z INFO 1724 (sg01) [PreSched]: remove_redundant_loads +2025-09-05T19:15:17Z INFO 1724 (sg00) [PSUM_Allocator]: build_no_bitmap start +2025-09-05T19:15:17Z INFO 1724 (sg01) [PreSched]: remove_redundant_loads: 0 +2025-09-05T19:15:17Z INFO 1724 (sg01) [PreSched]: End remove redundncies Fri Sep 5 19:15:17 2025 +2025-09-05T19:15:17Z INFO 1724 (sg01) [PreSched]: Start DCE Fri Sep 5 19:15:17 2025 +2025-09-05T19:15:17Z INFO 1724 (sg00) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-09-05T19:15:17Z INFO 1724 (sg00) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-09-05T19:15:17Z INFO 1724 (sg00) [PSUM_Allocator]: found 5904 edges +2025-09-05T19:15:17Z INFO 1724 (sg00) [PSUM_Allocator]: mean: 4.6089 +2025-09-05T19:15:17Z INFO 1724 (sg00) [PSUM_Allocator]: median: 4.72011 +2025-09-05T19:15:17Z INFO 1724 (sg00) [PSUM_Allocator]: adjacency vectors require 47232 bytes +2025-09-05T19:15:17Z INFO 1724 (sg00) [PSUM_Allocator]: build_no_bitmap done +2025-09-05T19:15:17Z INFO 1724 (sg00) [PSUM_Allocator]: find costs +2025-09-05T19:15:17Z INFO 1724 (sg00) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-09-05T19:15:17Z INFO 1724 (sg00) [PSUM_Allocator]: simplify interference graph +2025-09-05T19:15:17Z INFO 1724 (sg00) [PSUM_Allocator]: initialize low and high +2025-09-05T19:15:17Z INFO 1724 (sg00) [PSUM_Allocator]: lo = 2562 +2025-09-05T19:15:17Z INFO 1724 (sg00) [PSUM_Allocator]: hi = 0 +2025-09-05T19:15:17Z INFO 1724 (sg00) [PSUM_Allocator]: inf = 0 +2025-09-05T19:15:17Z INFO 1724 (sg00) [PSUM_Allocator]: total = 2562 +2025-09-05T19:15:17Z INFO 1724 (sg00) [PSUM_Allocator]: simplify +2025-09-05T19:15:17Z INFO 1724 (sg00) [PSUM_Allocator]: new candidates = 0 +2025-09-05T19:15:17Z INFO 1724 (sg00) [PSUM_Allocator]: select ranges +2025-09-05T19:15:17Z INFO 1724 (sg00) [PSUM_Allocator]: no more spills +2025-09-05T19:15:17Z INFO 1724 (sg00) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-09-05T19:15:17Z INFO 1724 (sg00) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-09-05T19:15:17Z INFO 1724 (sg00) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-09-05T19:15:17Z USER 1724 (sg00) [ModuleForkPass]: coloring_allocator_psum finished after 0.188 seconds +2025-09-05T19:15:17Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 781mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:17Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 8791 memory location(s), 1 block(s), and 19764 instruction(s). Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:17Z USER 1724 (sg00) [ModuleForkPass]: Running dma_optimization_psum +2025-09-05T19:15:17Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=8791 blocks=1 instructions=19764 Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:17Z INFO 1724 (sg02) [build_flow_deps]: Build fdeps inserted 114076 edges +2025-09-05T19:15:17Z INFO 1724 (sg02) [build_flow_deps]: Done build fdeps 114076 Fri Sep 5 19:15:17 2025 +2025-09-05T19:15:17Z INFO 1724 (sg02) [PreSched]: End build flow dependencies Fri Sep 5 19:15:17 2025 +2025-09-05T19:15:17Z INFO 1724 (sg02) [PreSched]: Start remove useless insts Fri Sep 5 19:15:17 2025 +2025-09-05T19:15:17Z INFO 1724 (sg02) [PreSched]: remove_useless_insts +2025-09-05T19:15:17Z INFO 1724 (sg02) [PreSched]: remove Useless Instructions: 0 +2025-09-05T19:15:17Z INFO 1724 (sg02) [PreSched]: End remove useless insts Fri Sep 5 19:15:17 2025 +2025-09-05T19:15:17Z INFO 1724 (sg02) [PreSched]: Start scratchpad optimization Fri Sep 5 19:15:17 2025 +2025-09-05T19:15:17Z INFO 1724 (sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-09-05T19:15:17Z INFO 1724 (sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-09-05T19:15:17Z USER 1724 (sg00) [ModuleForkPass]: dma_optimization_psum finished after 0.018 seconds +2025-09-05T19:15:17Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 780mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:17Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 8791 memory location(s), 1 block(s), and 19764 instruction(s). Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:17Z USER 1724 (sg00) [ModuleForkPass]: Running address_rotation_psum +2025-09-05T19:15:17Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=8791 blocks=1 instructions=19764 Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:17Z INFO 1724 (sg02) [PreSched]: End scratchpad optimization Fri Sep 5 19:15:17 2025 +2025-09-05T19:15:17Z INFO 1724 (sg01) [PreSched]: eliminateDeadStore removed 0 instructions +2025-09-05T19:15:17Z INFO 1724 (sg00) [DMAOptimizationBase]: PSUM Rotation rotated 0 PSUM Banks +2025-09-05T19:15:17Z INFO 1724 (sg01) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-09-05T19:15:17Z INFO 1724 (sg01) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-09-05T19:15:17Z INFO 1724 (sg01) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-09-05T19:15:17Z INFO 1724 (sg00) [DMAOptimizationBase]: PSUM Rotation rotated 0 PSUM Banks +2025-09-05T19:15:17Z INFO 1724 (sg02) [PreSched]: DONE PRE scheduling Fri Sep 5 19:15:17 2025 +2025-09-05T19:15:17Z INFO 1724 (sg01) [PreSched]: End DCE Fri Sep 5 19:15:17 2025 +2025-09-05T19:15:17Z USER 1724 (sg02) [ModuleForkPass]: pre_sched finished after 0.620 seconds +2025-09-05T19:15:17Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 782mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:17Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5254 memory location(s), 1 block(s), and 35089 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:17Z USER 1724 (sg02) [ModuleForkPass]: Running tensor_copy_elim +2025-09-05T19:15:17Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=5254 blocks=1 instructions=35089 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:17Z INFO 1724 (sg02) [TensorCopyElim]: Tensor CP elimination: 1 +2025-09-05T19:15:17Z INFO 1724 (sg01) [PreSched]: Start build flow dependencies Fri Sep 5 19:15:17 2025 +2025-09-05T19:15:17Z INFO 1724 (sg01) [build_flow_deps]: Start build fdeps. Invocation: 3Fri Sep 5 19:15:17 2025 +2025-09-05T19:15:17Z INFO 1724 (sg00) [DMAOptimizationBase]: PSUM Rotation rotated 516 PSUM Banks +2025-09-05T19:15:17Z USER 1724 (sg00) [ModuleForkPass]: address_rotation_psum finished after 0.162 seconds +2025-09-05T19:15:17Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 774mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:17Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 8791 memory location(s), 1 block(s), and 19764 instruction(s). Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:17Z USER 1724 (sg00) [ModuleForkPass]: Running coloring_allocator_sb +2025-09-05T19:15:17Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=8791 blocks=1 instructions=19764 Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:17Z INFO 1724 (sg01) [build_flow_deps]: Allocs: 12970 instructions: 50621 +2025-09-05T19:15:17Z INFO 1724 (sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 131186948 +2025-09-05T19:15:17Z INFO 1724 (sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 3509 bytes +2025-09-05T19:15:17Z INFO 1724 (sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 52428802 +2025-09-05T19:15:17Z INFO 1724 (sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 2925 bytes +2025-09-05T19:15:17Z INFO 1724 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 12673024 +2025-09-05T19:15:17Z INFO 1724 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 308 bytes +2025-09-05T19:15:17Z INFO 1724 (sg00) [ColoringAllocator::Rep]: Allocating functions +2025-09-05T19:15:17Z INFO 1724 (sg00) [ColoringAllocator::Rep]: linearize and check +2025-09-05T19:15:17Z INFO 1724 (sg00) [SB_Allocator]: allocating SB +2025-09-05T19:15:17Z INFO 1724 (sg00) [SB_Allocator]: main loop +2025-09-05T19:15:17Z INFO 1724 (sg00) [SB_Allocator]: renumber locations +2025-09-05T19:15:17Z INFO 1724 (sg00) [SB_Allocator]: size = 6183 +2025-09-05T19:15:17Z INFO 1724 (sg00) [SB_Allocator]: find partners +2025-09-05T19:15:17Z INFO 1724 (sg00) [SB_Allocator]: found 1958 accumulation groups +2025-09-05T19:15:17Z INFO 1724 (sg00) [SB_Allocator]: largest = custom-call.136.1691_i1 +2025-09-05T19:15:17Z INFO 1724 (sg00) [SB_Allocator]: tensors = 33 +2025-09-05T19:15:17Z INFO 1724 (sg00) [SB_Allocator]: requires 66048 bytes/partition +2025-09-05T19:15:17Z INFO 1724 (sg00) [SB_Allocator]: expanding partners +2025-09-05T19:15:17Z INFO 1724 []: find first defs for local +2025-09-05T19:15:17Z INFO 1724 (sg02) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-09-05T19:15:17Z INFO 1724 []: find first defs for global +2025-09-05T19:15:17Z INFO 1724 (sg02) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-09-05T19:15:17Z INFO 1724 (sg02) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-09-05T19:15:17Z INFO 1724 (sg02) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-09-05T19:15:17Z INFO 1724 (sg00) [SB_Allocator]: find loads +2025-09-05T19:15:17Z USER 1724 (sg02) [ModuleForkPass]: tensor_copy_elim finished after 0.298 seconds +2025-09-05T19:15:17Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 798mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:17Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5253 memory location(s), 1 block(s), and 35088 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:17Z USER 1724 (sg02) [ModuleForkPass]: Running dynamic_dma_setup +2025-09-05T19:15:17Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=5253 blocks=1 instructions=35088 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:17Z USER 1724 (sg02) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-09-05T19:15:17Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 797mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:17Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5254 memory location(s), 1 block(s), and 35088 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:17Z USER 1724 (sg02) [ModuleForkPass]: Running runtime_memory_reservation +2025-09-05T19:15:17Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=5254 blocks=1 instructions=35088 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:17Z USER 1724 (sg02) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-09-05T19:15:17Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 797mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:17Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5254 memory location(s), 1 block(s), and 35088 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:17Z USER 1724 (sg02) [ModuleForkPass]: Running coloring_allocator_psum +2025-09-05T19:15:17Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=5254 blocks=1 instructions=35088 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:17Z INFO 1724 (sg02) [ColoringAllocator::Rep]: Allocating functions +2025-09-05T19:15:17Z INFO 1724 (sg02) [ColoringAllocator::Rep]: linearize and check +2025-09-05T19:15:17Z INFO 1724 (sg00) [SB_Allocator]: 1 pin count +2025-09-05T19:15:17Z INFO 1724 (sg00) [SB_Allocator]: 229 remat count +2025-09-05T19:15:17Z INFO 1724 (sg00) [SB_Allocator]: 1 pinned tensors will require about 16384 bytes/partition +2025-09-05T19:15:17Z INFO 1724 (sg00) [SB_Allocator]: build interference graph +2025-09-05T19:15:17Z INFO 1724 (sg00) [SB_Allocator]: pass 1 int-tree +2025-09-05T19:15:17Z INFO 1724 (sg00) [SB_Allocator]: Num intervals 6183 Num locations 6183 +2025-09-05T19:15:17Z INFO 1724 (sg00) [SB_Allocator]: IntervalTree Build Done +2025-09-05T19:15:17Z INFO 1724 (sg00) [SB_Allocator]: info.neighbors init Done +2025-09-05T19:15:17Z INFO 1724 (sg01) [build_flow_deps]: Build fdeps inserted 154576 edges +2025-09-05T19:15:17Z INFO 1724 (sg01) [build_flow_deps]: Done build fdeps 154576 Fri Sep 5 19:15:17 2025 +2025-09-05T19:15:17Z INFO 1724 (sg01) [PreSched]: End build flow dependencies Fri Sep 5 19:15:17 2025 +2025-09-05T19:15:17Z INFO 1724 (sg01) [PreSched]: Start remove useless insts Fri Sep 5 19:15:17 2025 +2025-09-05T19:15:17Z INFO 1724 (sg01) [PreSched]: remove_useless_insts +2025-09-05T19:15:17Z INFO 1724 (sg00) [SB_Allocator]: info.neighbors partners Done +2025-09-05T19:15:17Z INFO 1724 (sg00) [SB_Allocator]: IntervalTree readback Done +2025-09-05T19:15:17Z INFO 1724 (sg00) [SB_Allocator]: edge: 714258 +2025-09-05T19:15:17Z INFO 1724 (sg00) [SB_Allocator]: mean: 231.039 +2025-09-05T19:15:17Z INFO 1724 (sg00) [SB_Allocator]: median: 144.084 +2025-09-05T19:15:17Z INFO 1724 (sg00) [SB_Allocator]: find costs +2025-09-05T19:15:17Z INFO 1724 (sg01) [PreSched]: remove Useless Instructions: 0 +2025-09-05T19:15:17Z INFO 1724 (sg01) [PreSched]: End remove useless insts Fri Sep 5 19:15:17 2025 +2025-09-05T19:15:17Z INFO 1724 (sg01) [PreSched]: Start scratchpad optimization Fri Sep 5 19:15:17 2025 +2025-09-05T19:15:17Z INFO 1724 (sg02) [PSUM_Allocator]: allocating PSUM +2025-09-05T19:15:17Z INFO 1724 (sg02) [PSUM_Allocator]: main loop +2025-09-05T19:15:17Z INFO 1724 (sg00) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-09-05T19:15:17Z INFO 1724 (sg00) [SB_Allocator]: simplify interference graph +2025-09-05T19:15:17Z INFO 1724 (sg00) [SB_Allocator]: initialize safe & unsafe +2025-09-05T19:15:17Z INFO 1724 (sg00) [SB_Allocator]: safe = 440 +2025-09-05T19:15:17Z INFO 1724 (sg00) [SB_Allocator]: unsafe = 4895 +2025-09-05T19:15:17Z INFO 1724 (sg00) [SB_Allocator]: inf = 847 +2025-09-05T19:15:17Z INFO 1724 (sg00) [SB_Allocator]: total = 6182 +2025-09-05T19:15:17Z INFO 1724 (sg00) [SB_Allocator]: simplify +2025-09-05T19:15:17Z INFO 1724 (sg00) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 4879 #Pinned 0 #Safe 0 minCost 0.000296304 maxCost 0.077986 locations 6183 +2025-09-05T19:15:17Z INFO 1724 (sg01) [PreSched]: End scratchpad optimization Fri Sep 5 19:15:17 2025 +2025-09-05T19:15:17Z INFO 1724 (sg02) [PSUM_Allocator]: renumber locations +2025-09-05T19:15:17Z INFO 1724 (sg02) [PSUM_Allocator]: size = 1139 +2025-09-05T19:15:17Z INFO 1724 (sg00) [SB_Allocator]: new candidates = 441 +2025-09-05T19:15:17Z INFO 1724 (sg00) [SB_Allocator]: select ranges +2025-09-05T19:15:17Z INFO 1724 (sg02) [PSUM_Allocator]: build_no_bitmap start +2025-09-05T19:15:17Z INFO 1724 (sg02) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-09-05T19:15:17Z INFO 1724 (sg02) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-09-05T19:15:17Z INFO 1724 (sg02) [PSUM_Allocator]: found 1544 edges +2025-09-05T19:15:17Z INFO 1724 (sg02) [PSUM_Allocator]: mean: 2.71115 +2025-09-05T19:15:17Z INFO 1724 (sg02) [PSUM_Allocator]: median: 2.51994 +2025-09-05T19:15:17Z INFO 1724 (sg02) [PSUM_Allocator]: adjacency vectors require 12352 bytes +2025-09-05T19:15:17Z INFO 1724 (sg02) [PSUM_Allocator]: build_no_bitmap done +2025-09-05T19:15:17Z INFO 1724 (sg02) [PSUM_Allocator]: find costs +2025-09-05T19:15:18Z INFO 1724 (sg01) [PreSched]: DONE PRE scheduling Fri Sep 5 19:15:18 2025 +2025-09-05T19:15:18Z USER 1724 (sg01) [ModuleForkPass]: pre_sched finished after 0.885 seconds +2025-09-05T19:15:18Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 807mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:18Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12970 memory location(s), 1 block(s), and 50621 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:18Z USER 1724 (sg01) [ModuleForkPass]: Running tensor_copy_elim +2025-09-05T19:15:18Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=12970 blocks=1 instructions=50621 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: Total: 6182 +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: Spilled: 0.027 (166) +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: Allocated: 0.973 (6016) +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: Rover zone: 0.092 (554) +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: Pre-rover zone: 0.001 (4) +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: Post-rover zone: 0.907 (5458) +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: Slice zone: 0.000 (0) +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: Blocks nothing: 0.000 (1) +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: Blocks tall: 1.000 (6015) +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: Visited until tall blocking (mean): 0.999 +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: Success +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: SB spills = 166 tensors +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: size = 293120 bytes/partition +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: remats = 3 tensors +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: unpinned = 0 tensors +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: size = 0 bytes/partition +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: SB score = 2.57399e+06 +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: best SB heuristic = 0 +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: collect spills +2025-09-05T19:15:18Z INFO 1724 (sg01) [TensorCopyElim]: Tensor CP elimination: 0 +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: insert spills +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: deleting loads #loadsToDelete: 3 +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: deleting locs #locationsToDelete: 3 +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: locationsToDelete done +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: main loop +2025-09-05T19:15:18Z INFO 1724 (sg01) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: renumber locations +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: size = 7129 +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: find partners +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: found 1958 accumulation groups +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: largest = custom-call.136.1691_i1 +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: tensors = 33 +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: requires 66048 bytes/partition +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: expanding partners +2025-09-05T19:15:18Z INFO 1724 (sg01) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-09-05T19:15:18Z INFO 1724 (sg01) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-09-05T19:15:18Z INFO 1724 (sg01) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-09-05T19:15:18Z INFO 1724 []: find first defs for local +2025-09-05T19:15:18Z USER 1724 (sg01) [ModuleForkPass]: tensor_copy_elim finished after 0.243 seconds +2025-09-05T19:15:18Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 797mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:18Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12970 memory location(s), 1 block(s), and 50621 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:18Z USER 1724 (sg01) [ModuleForkPass]: Running dynamic_dma_setup +2025-09-05T19:15:18Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=12970 blocks=1 instructions=50621 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:18Z USER 1724 (sg01) [ModuleForkPass]: dynamic_dma_setup finished after 0.001 seconds +2025-09-05T19:15:18Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 787mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:18Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12971 memory location(s), 1 block(s), and 50621 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:18Z USER 1724 (sg01) [ModuleForkPass]: Running runtime_memory_reservation +2025-09-05T19:15:18Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=12971 blocks=1 instructions=50621 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:18Z USER 1724 (sg01) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-09-05T19:15:18Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 787mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:18Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12971 memory location(s), 1 block(s), and 50621 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:18Z USER 1724 (sg01) [ModuleForkPass]: Running coloring_allocator_psum +2025-09-05T19:15:18Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=12971 blocks=1 instructions=50621 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:18Z INFO 1724 (sg01) [ColoringAllocator::Rep]: Allocating functions +2025-09-05T19:15:18Z INFO 1724 (sg01) [ColoringAllocator::Rep]: linearize and check +2025-09-05T19:15:18Z INFO 1724 (sg02) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-09-05T19:15:18Z INFO 1724 (sg02) [PSUM_Allocator]: simplify interference graph +2025-09-05T19:15:18Z INFO 1724 (sg02) [PSUM_Allocator]: initialize low and high +2025-09-05T19:15:18Z INFO 1724 (sg02) [PSUM_Allocator]: lo = 1135 +2025-09-05T19:15:18Z INFO 1724 (sg02) [PSUM_Allocator]: hi = 4 +2025-09-05T19:15:18Z INFO 1724 (sg02) [PSUM_Allocator]: inf = 0 +2025-09-05T19:15:18Z INFO 1724 (sg02) [PSUM_Allocator]: total = 1139 +2025-09-05T19:15:18Z INFO 1724 (sg02) [PSUM_Allocator]: simplify +2025-09-05T19:15:18Z INFO 1724 (sg02) [PSUM_Allocator]: new candidates = 0 +2025-09-05T19:15:18Z INFO 1724 (sg02) [PSUM_Allocator]: select ranges +2025-09-05T19:15:18Z INFO 1724 []: find first defs for global +2025-09-05T19:15:18Z INFO 1724 (sg02) [PSUM_Allocator]: no more spills +2025-09-05T19:15:18Z INFO 1724 (sg02) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-09-05T19:15:18Z INFO 1724 (sg02) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-09-05T19:15:18Z INFO 1724 (sg02) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-09-05T19:15:18Z USER 1724 (sg02) [ModuleForkPass]: coloring_allocator_psum finished after 0.584 seconds +2025-09-05T19:15:18Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 787mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:18Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5254 memory location(s), 1 block(s), and 35088 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:18Z USER 1724 (sg02) [ModuleForkPass]: Running dma_optimization_psum +2025-09-05T19:15:18Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=5254 blocks=1 instructions=35088 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: find loads +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: 1 pin count +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: 1107 remat count +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: 1 pinned tensors will require about 16384 bytes/partition +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: build interference graph +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: pass 1 int-tree +2025-09-05T19:15:18Z INFO 1724 (sg01) [PSUM_Allocator]: allocating PSUM +2025-09-05T19:15:18Z INFO 1724 (sg01) [PSUM_Allocator]: main loop +2025-09-05T19:15:18Z INFO 1724 (sg02) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-09-05T19:15:18Z INFO 1724 (sg02) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-09-05T19:15:18Z INFO 1724 (sg01) [PSUM_Allocator]: renumber locations +2025-09-05T19:15:18Z INFO 1724 (sg01) [PSUM_Allocator]: size = 3400 +2025-09-05T19:15:18Z USER 1724 (sg02) [ModuleForkPass]: dma_optimization_psum finished after 0.032 seconds +2025-09-05T19:15:18Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 788mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:18Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5254 memory location(s), 1 block(s), and 35088 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:18Z USER 1724 (sg02) [ModuleForkPass]: Running address_rotation_psum +2025-09-05T19:15:18Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=5254 blocks=1 instructions=35088 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: Num intervals 7129 Num locations 7129 +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: IntervalTree Build Done +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: info.neighbors init Done +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: info.neighbors partners Done +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: IntervalTree readback Done +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: edge: 426534 +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: mean: 119.662 +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: median: 77.3259 +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: find costs +2025-09-05T19:15:18Z INFO 1724 (sg01) [PSUM_Allocator]: build_no_bitmap start +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: simplify interference graph +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: initialize safe & unsafe +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: safe = 385 +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: unsafe = 150 +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: inf = 577 +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: total = 1112 +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: simplify +2025-09-05T19:15:18Z INFO 1724 (sg01) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-09-05T19:15:18Z INFO 1724 (sg01) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 150 #Pinned 0 #Safe 0 minCost 0.00645532 maxCost 0.0809151 locations 7129 +2025-09-05T19:15:18Z INFO 1724 (sg01) [PSUM_Allocator]: found 9600 edges +2025-09-05T19:15:18Z INFO 1724 (sg01) [PSUM_Allocator]: mean: 5.64706 +2025-09-05T19:15:18Z INFO 1724 (sg01) [PSUM_Allocator]: median: 6.99957 +2025-09-05T19:15:18Z INFO 1724 (sg01) [PSUM_Allocator]: adjacency vectors require 76800 bytes +2025-09-05T19:15:18Z INFO 1724 (sg01) [PSUM_Allocator]: build_no_bitmap done +2025-09-05T19:15:18Z INFO 1724 (sg01) [PSUM_Allocator]: find costs +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: new candidates = 105 +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: (including 562 infinite cost tensors) +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: select ranges +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: Total: 1112 +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: Spilled: 0.000 (0) +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: Allocated: 1.000 (1112) +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: Rover zone: 0.489 (544) +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: Pre-rover zone: 0.017 (19) +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: Post-rover zone: 0.494 (549) +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: Slice zone: 0.000 (0) +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: Blocks nothing: 0.000 (0) +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: Blocks tall: 1.000 (1112) +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: Visited until tall blocking (mean): 0.998 +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: Success +2025-09-05T19:15:18Z INFO 1724 (sg02) [DMAOptimizationBase]: PSUM Rotation rotated 110 PSUM Banks +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: SB spills = 0 tensors +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: size = 0 bytes/partition +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: remats = 0 tensors +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: unpinned = 0 tensors +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: size = 0 bytes/partition +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: SB score = 0 +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: spilling from SB cost about 2.57399e+06 cycles +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: number of tensors spilled from SB = 166 +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: total size of spilled tensors = 293120 bytes/partition +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: 16384 bytes/partition (100%) successfully pinned +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: pinning saved approximately 9010 cycles +2025-09-05T19:15:18Z INFO 1724 (sg00) [SB_Allocator]: 0% SB utilization after allocation +2025-09-05T19:15:18Z INFO 1724 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 352699140 +2025-09-05T19:15:18Z INFO 1724 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 2229 bytes +2025-09-05T19:15:18Z INFO 1724 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 112460034 +2025-09-05T19:15:18Z INFO 1724 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 2374 bytes +2025-09-05T19:15:18Z INFO 1724 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 12673024 +2025-09-05T19:15:18Z INFO 1724 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 308 bytes +2025-09-05T19:15:18Z USER 1724 (sg00) [ModuleForkPass]: coloring_allocator_sb finished after 0.956 seconds +2025-09-05T19:15:18Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 794mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:18Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9900 memory location(s), 1 block(s), and 20941 instruction(s). Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:18Z USER 1724 (sg00) [ModuleForkPass]: Running address_rotation_sb +2025-09-05T19:15:18Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=9900 blocks=1 instructions=20941 Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:18Z INFO 1724 (sg02) [DMAOptimizationBase]: PSUM Rotation rotated 4 PSUM Banks +2025-09-05T19:15:18Z INFO 1724 (sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-09-05T19:15:18Z USER 1724 (sg00) [ModuleForkPass]: address_rotation_sb finished after 0.069 seconds +2025-09-05T19:15:18Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 788mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:18Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9900 memory location(s), 1 block(s), and 20941 instruction(s). Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:18Z USER 1724 (sg00) [ModuleForkPass]: Running dma_optimization_sb +2025-09-05T19:15:18Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=9900 blocks=1 instructions=20941 Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:18Z INFO 1724 (sg00) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 465159174, 19.8619% input load, 4.73388% output write, 75.4042% spill/reload [sg0000] +2025-09-05T19:15:18Z INFO 1724 (sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-09-05T19:15:18Z INFO 1724 (sg02) [DMAOptimizationBase]: PSUM Rotation rotated 1 PSUM Banks +2025-09-05T19:15:18Z USER 1724 (sg02) [ModuleForkPass]: address_rotation_psum finished after 0.210 seconds +2025-09-05T19:15:18Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 788mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:18Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5254 memory location(s), 1 block(s), and 35088 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:18Z INFO 1724 (sg00) [DMAOptimizationBase]: removed 0 identical load +2025-09-05T19:15:18Z INFO 1724 (sg01) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-09-05T19:15:18Z INFO 1724 (sg01) [PSUM_Allocator]: simplify interference graph +2025-09-05T19:15:18Z INFO 1724 (sg01) [PSUM_Allocator]: initialize low and high +2025-09-05T19:15:18Z INFO 1724 (sg01) [PSUM_Allocator]: lo = 3400 +2025-09-05T19:15:18Z INFO 1724 (sg01) [PSUM_Allocator]: hi = 0 +2025-09-05T19:15:18Z INFO 1724 (sg01) [PSUM_Allocator]: inf = 0 +2025-09-05T19:15:18Z INFO 1724 (sg01) [PSUM_Allocator]: total = 3400 +2025-09-05T19:15:18Z INFO 1724 (sg01) [PSUM_Allocator]: simplify +2025-09-05T19:15:18Z INFO 1724 (sg01) [PSUM_Allocator]: new candidates = 0 +2025-09-05T19:15:18Z INFO 1724 (sg01) [PSUM_Allocator]: select ranges +2025-09-05T19:15:18Z USER 1724 (sg02) [ModuleForkPass]: Running coloring_allocator_sb +2025-09-05T19:15:18Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=5254 blocks=1 instructions=35088 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:18Z INFO 1724 (sg00) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-09-05T19:15:18Z INFO 1724 (sg00) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-09-05T19:15:18Z INFO 1724 (sg02) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 776094748 +2025-09-05T19:15:18Z INFO 1724 (sg02) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 1251 bytes +2025-09-05T19:15:18Z INFO 1724 (sg02) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 54604808 +2025-09-05T19:15:18Z INFO 1724 (sg02) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 4011 bytes +2025-09-05T19:15:18Z INFO 1724 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 8196 +2025-09-05T19:15:18Z INFO 1724 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 248 bytes +2025-09-05T19:15:18Z INFO 1724 (sg02) [ColoringAllocator::Rep]: Allocating functions +2025-09-05T19:15:18Z INFO 1724 (sg02) [ColoringAllocator::Rep]: linearize and check +2025-09-05T19:15:18Z INFO 1724 (sg01) [PSUM_Allocator]: no more spills +2025-09-05T19:15:18Z INFO 1724 (sg01) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-09-05T19:15:18Z INFO 1724 (sg01) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-09-05T19:15:18Z INFO 1724 (sg01) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-09-05T19:15:18Z USER 1724 (sg01) [ModuleForkPass]: coloring_allocator_psum finished after 0.360 seconds +2025-09-05T19:15:18Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 785mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:18Z INFO 1724 (sg00) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-09-05T19:15:18Z INFO 1724 (sg00) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-09-05T19:15:18Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12971 memory location(s), 1 block(s), and 50621 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:18Z USER 1724 (sg01) [ModuleForkPass]: Running dma_optimization_psum +2025-09-05T19:15:18Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=12971 blocks=1 instructions=50621 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:18Z INFO 1724 (sg02) [SB_Allocator]: allocating SB +2025-09-05T19:15:18Z INFO 1724 (sg02) [SB_Allocator]: main loop +2025-09-05T19:15:18Z INFO 1724 (sg00) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-09-05T19:15:18Z INFO 1724 (sg02) [SB_Allocator]: renumber locations +2025-09-05T19:15:18Z INFO 1724 (sg02) [SB_Allocator]: size = 4036 +2025-09-05T19:15:18Z INFO 1724 (sg00) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 1572864, 0.338135% out of total dma traffic(9.23896e+07) +2025-09-05T19:15:18Z INFO 1724 (sg02) [SB_Allocator]: find partners +2025-09-05T19:15:18Z INFO 1724 (sg02) [SB_Allocator]: found 1129 accumulation groups +2025-09-05T19:15:18Z INFO 1724 (sg02) [SB_Allocator]: largest = _dot.228-t859_i59 +2025-09-05T19:15:18Z INFO 1724 (sg02) [SB_Allocator]: tensors = 112 +2025-09-05T19:15:18Z INFO 1724 (sg02) [SB_Allocator]: requires 114688 bytes/partition +2025-09-05T19:15:18Z INFO 1724 (sg02) [SB_Allocator]: expanding partners +2025-09-05T19:15:18Z INFO 1724 (sg01) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-09-05T19:15:18Z INFO 1724 (sg01) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-09-05T19:15:18Z USER 1724 (sg01) [ModuleForkPass]: dma_optimization_psum finished after 0.044 seconds +2025-09-05T19:15:18Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 789mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:18Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12971 memory location(s), 1 block(s), and 50621 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:18Z USER 1724 (sg01) [ModuleForkPass]: Running address_rotation_psum +2025-09-05T19:15:18Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=12971 blocks=1 instructions=50621 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:18Z INFO 1724 (sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 471 spill/reload instructions +2025-09-05T19:15:18Z INFO 1724 (sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 459 spill/reload memory locations +2025-09-05T19:15:18Z INFO 1724 []: find first defs for local +2025-09-05T19:15:18Z INFO 1724 []: find first defs for global +2025-09-05T19:15:18Z INFO 1724 (sg01) [DMAOptimizationBase]: PSUM Rotation rotated 0 PSUM Banks +2025-09-05T19:15:18Z INFO 1724 (sg00) [DMAOptimizationBase]: [spill optimization round 1]: removed 40 spill/reload instructions +2025-09-05T19:15:18Z INFO 1724 (sg00) [DMAOptimizationBase]: [spill optimization round 1]: removed 36 spill/reload memory locations +2025-09-05T19:15:18Z INFO 1724 (sg02) [SB_Allocator]: find loads +2025-09-05T19:15:18Z INFO 1724 (sg02) [SB_Allocator]: 1 pin count +2025-09-05T19:15:18Z INFO 1724 (sg02) [SB_Allocator]: 1276 remat count +2025-09-05T19:15:18Z INFO 1724 (sg02) [SB_Allocator]: 1 pinned tensors will require about 16384 bytes/partition +2025-09-05T19:15:18Z INFO 1724 (sg02) [SB_Allocator]: build interference graph +2025-09-05T19:15:18Z INFO 1724 (sg02) [SB_Allocator]: pass 1 int-tree +2025-09-05T19:15:18Z INFO 1724 (sg02) [SB_Allocator]: Num intervals 4036 Num locations 4036 +2025-09-05T19:15:18Z INFO 1724 (sg02) [SB_Allocator]: IntervalTree Build Done +2025-09-05T19:15:18Z INFO 1724 (sg02) [SB_Allocator]: info.neighbors init Done +2025-09-05T19:15:18Z INFO 1724 (sg00) [DMAOptimizationBase]: [spill optimization round 2]: removed 2 spill/reload instructions +2025-09-05T19:15:18Z INFO 1724 (sg00) [DMAOptimizationBase]: [spill optimization round 2]: removed 2 spill/reload memory locations +2025-09-05T19:15:18Z INFO 1724 (sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 91095040, 25.9715% out of total spill/reload dma traffic +2025-09-05T19:15:18Z INFO 1724 (sg02) [SB_Allocator]: info.neighbors partners Done +2025-09-05T19:15:18Z INFO 1724 (sg02) [SB_Allocator]: IntervalTree readback Done +2025-09-05T19:15:18Z INFO 1724 (sg02) [SB_Allocator]: edge: 566434 +2025-09-05T19:15:18Z INFO 1724 (sg02) [SB_Allocator]: mean: 280.691 +2025-09-05T19:15:18Z INFO 1724 (sg02) [SB_Allocator]: median: 239.556 +2025-09-05T19:15:18Z INFO 1724 (sg02) [SB_Allocator]: find costs +2025-09-05T19:15:18Z INFO 1724 (sg01) [DMAOptimizationBase]: PSUM Rotation rotated 0 PSUM Banks +2025-09-05T19:15:18Z INFO 1724 (sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-09-05T19:15:18Z INFO 1724 (sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-09-05T19:15:18Z INFO 1724 (sg02) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-09-05T19:15:18Z INFO 1724 (sg02) [SB_Allocator]: simplify interference graph +2025-09-05T19:15:18Z INFO 1724 (sg02) [SB_Allocator]: initialize safe & unsafe +2025-09-05T19:15:18Z INFO 1724 (sg00) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-09-05T19:15:18Z INFO 1724 (sg02) [SB_Allocator]: safe = 755 +2025-09-05T19:15:18Z INFO 1724 (sg02) [SB_Allocator]: unsafe = 964 +2025-09-05T19:15:18Z INFO 1724 (sg02) [SB_Allocator]: inf = 2316 +2025-09-05T19:15:18Z INFO 1724 (sg02) [SB_Allocator]: total = 4035 +2025-09-05T19:15:18Z INFO 1724 (sg02) [SB_Allocator]: simplify +2025-09-05T19:15:18Z INFO 1724 (sg02) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 902 #Pinned 0 #Safe 0 minCost 0.000709421 maxCost 0.12305 locations 4036 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: new candidates = 452 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: select ranges +2025-09-05T19:15:19Z INFO 1724 (sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-09-05T19:15:19Z INFO 1724 (sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Total: 4035 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Spilled: 0.080 (324) +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Allocated: 0.920 (3711) +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Rover zone: 0.657 (2437) +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Pre-rover zone: 0.023 (84) +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Post-rover zone: 0.320 (1186) +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Slice zone: 0.001 (4) +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Blocks nothing: 0.014 (51) +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Blocks medium: 0.003 (10) +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Visited until medium blocking (mean): 0.541 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Visited until medium blocking (median): 0.664 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Visited until medium blocking (p95): 0.770 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Blocks tall: 0.984 (3650) +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Visited until tall blocking (mean): 0.973 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Success +2025-09-05T19:15:19Z INFO 1724 (sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: SB spills = 324 tensors +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: size = 390144 bytes/partition +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: remats = 0 tensors +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: unpinned = 0 tensors +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: size = 0 bytes/partition +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: SB score = 1.79852e+06 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: best SB heuristic = 0 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: collect spills +2025-09-05T19:15:19Z INFO 1724 (sg00) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-09-05T19:15:19Z INFO 1724 (sg01) [DMAOptimizationBase]: PSUM Rotation rotated 3 PSUM Banks +2025-09-05T19:15:19Z USER 1724 (sg01) [ModuleForkPass]: address_rotation_psum finished after 0.327 seconds +2025-09-05T19:15:19Z INFO 1724 (sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-09-05T19:15:19Z INFO 1724 (sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-09-05T19:15:19Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 807mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:19Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12971 memory location(s), 1 block(s), and 50621 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:19Z USER 1724 (sg01) [ModuleForkPass]: Running coloring_allocator_sb +2025-09-05T19:15:19Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=12971 blocks=1 instructions=50621 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:19Z INFO 1724 (sg01) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 742506500 +2025-09-05T19:15:19Z INFO 1724 (sg01) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 1201 bytes +2025-09-05T19:15:19Z INFO 1724 (sg01) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 73400322 +2025-09-05T19:15:19Z INFO 1724 (sg01) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 3772 bytes +2025-09-05T19:15:19Z INFO 1724 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 4259840 +2025-09-05T19:15:19Z INFO 1724 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 130 bytes +2025-09-05T19:15:19Z INFO 1724 (sg01) [ColoringAllocator::Rep]: Allocating functions +2025-09-05T19:15:19Z INFO 1724 (sg01) [ColoringAllocator::Rep]: linearize and check +2025-09-05T19:15:19Z INFO 1724 (sg00) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: insert spills +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: deleting loads #loadsToDelete: 0 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: deleting locs #locationsToDelete: 0 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: locationsToDelete done +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: main loop +2025-09-05T19:15:19Z INFO 1724 (sg01) [SB_Allocator]: allocating SB +2025-09-05T19:15:19Z INFO 1724 (sg01) [SB_Allocator]: main loop +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: renumber locations +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: size = 4526 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: find partners +2025-09-05T19:15:19Z INFO 1724 (sg01) [SB_Allocator]: renumber locations +2025-09-05T19:15:19Z INFO 1724 (sg01) [SB_Allocator]: size = 9481 +2025-09-05T19:15:19Z INFO 1724 (sg01) [SB_Allocator]: find partners +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: found 1129 accumulation groups +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: largest = _dot.228-t859_i59 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: tensors = 112 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: requires 114688 bytes/partition +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: expanding partners +2025-09-05T19:15:19Z INFO 1724 (sg01) [SB_Allocator]: found 3376 accumulation groups +2025-09-05T19:15:19Z INFO 1724 (sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 228 SpillSaves and Reloads +2025-09-05T19:15:19Z INFO 1724 (sg01) [SB_Allocator]: largest = _dot.6-t1177_i33 +2025-09-05T19:15:19Z INFO 1724 (sg01) [SB_Allocator]: tensors = 112 +2025-09-05T19:15:19Z INFO 1724 (sg01) [SB_Allocator]: requires 114688 bytes/partition +2025-09-05T19:15:19Z INFO 1724 (sg01) [SB_Allocator]: expanding partners +2025-09-05T19:15:19Z INFO 1724 (sg00) [DMAOptimizationBase]: average loaded DMA size 3277 bytes +2025-09-05T19:15:19Z INFO 1724 (sg00) [DMAOptimizationBase]: average saved DMA size 2843 bytes +2025-09-05T19:15:19Z INFO 1724 (sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 1 combined 92 SpillSaves and Reloads +2025-09-05T19:15:19Z INFO 1724 (sg00) [DMAOptimizationBase]: average loaded DMA size 3517 bytes +2025-09-05T19:15:19Z INFO 1724 (sg00) [DMAOptimizationBase]: average saved DMA size 3083 bytes +2025-09-05T19:15:19Z INFO 1724 []: find first defs for local +2025-09-05T19:15:19Z INFO 1724 (sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 2 combined 16 SpillSaves and Reloads +2025-09-05T19:15:19Z INFO 1724 []: find first defs for global +2025-09-05T19:15:19Z INFO 1724 (sg00) [DMAOptimizationBase]: average loaded DMA size 3565 bytes +2025-09-05T19:15:19Z INFO 1724 (sg00) [DMAOptimizationBase]: average saved DMA size 3128 bytes +2025-09-05T19:15:19Z INFO 1724 (sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 264749828 +2025-09-05T19:15:19Z INFO 1724 (sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 3565 bytes +2025-09-05T19:15:19Z INFO 1724 (sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 107741442 +2025-09-05T19:15:19Z INFO 1724 (sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 3128 bytes +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: find loads +2025-09-05T19:15:19Z INFO 1724 []: find first defs for local +2025-09-05T19:15:19Z INFO 1724 (sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-09-05T19:15:19Z INFO 1724 (sg00) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 27246592, 7.76811% out of total spill/reload dma traffic +2025-09-05T19:15:19Z INFO 1724 (sg00) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 119914496, 25.7792% out of total dma traffic +2025-09-05T19:15:19Z INFO 1724 (sg00) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 345244678, 26.305% input load, 6.37811% output write, 67.3168% spill/reload [sg0000] +2025-09-05T19:15:19Z INFO 1724 (sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 243467012 +2025-09-05T19:15:19Z INFO 1724 (sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 3279 bytes +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: 1 pin count +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: 1709 remat count +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: 1 pinned tensors will require about 16384 bytes/partition +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: build interference graph +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: pass 1 int-tree +2025-09-05T19:15:19Z INFO 1724 (sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 101777666 +2025-09-05T19:15:19Z INFO 1724 (sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 2955 bytes +2025-09-05T19:15:19Z INFO 1724 (sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 12673024 +2025-09-05T19:15:19Z INFO 1724 (sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 308 bytes +2025-09-05T19:15:19Z INFO 1724 (sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 2388 bytes +2025-09-05T19:15:19Z INFO 1724 (sg00) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-09-05T19:15:19Z INFO 1724 (sg00) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-09-05T19:15:19Z USER 1724 (sg00) [ModuleForkPass]: dma_optimization_sb finished after 0.788 seconds +2025-09-05T19:15:19Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 822mb, ru_maxrss: 825mb (delta=0mb) +2025-09-05T19:15:19Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9074 memory location(s), 1 block(s), and 20184 instruction(s). Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:19Z USER 1724 (sg00) [ModuleForkPass]: Running address_rotation_sb +2025-09-05T19:15:19Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=9074 blocks=1 instructions=20184 Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:19Z INFO 1724 []: find first defs for global +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Num intervals 4526 Num locations 4526 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: IntervalTree Build Done +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: info.neighbors init Done +2025-09-05T19:15:19Z INFO 1724 (sg00) [DMAOptimizationBase]: SB Rotation rotated 70 Sb address +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: info.neighbors partners Done +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: IntervalTree readback Done +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: edge: 255899 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: mean: 113.08 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: median: 94.7787 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: find costs +2025-09-05T19:15:19Z INFO 1724 (sg01) [SB_Allocator]: find loads +2025-09-05T19:15:19Z INFO 1724 (sg01) [SB_Allocator]: 1 pin count +2025-09-05T19:15:19Z INFO 1724 (sg01) [SB_Allocator]: 1179 remat count +2025-09-05T19:15:19Z INFO 1724 (sg01) [SB_Allocator]: 1 pinned tensors will require about 16384 bytes/partition +2025-09-05T19:15:19Z INFO 1724 (sg01) [SB_Allocator]: build interference graph +2025-09-05T19:15:19Z INFO 1724 (sg01) [SB_Allocator]: pass 1 int-tree +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: simplify interference graph +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: initialize safe & unsafe +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: safe = 568 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: unsafe = 16 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: inf = 230 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: total = 814 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: simplify +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 16 #Pinned 0 #Safe 0 minCost 0.0186565 maxCost 0.0331975 locations 4526 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: new candidates = 13 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: (including 229 infinite cost tensors) +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: select ranges +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Total: 814 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Spilled: 0.221 (180) +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Allocated: 0.779 (634) +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Rover zone: 0.970 (615) +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Pre-rover zone: 0.003 (2) +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Post-rover zone: 0.027 (17) +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Slice zone: 0.000 (0) +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Blocks nothing: 0.000 (0) +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Blocks tall: 1.000 (634) +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Visited until tall blocking (mean): 1.000 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Success +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: SB spills = 180 tensors +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: size = 184320 bytes/partition +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: remats = 180 tensors +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: unpinned = 0 tensors +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: size = 0 bytes/partition +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: SB score = inf +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Incremental select cannot allocate, start a complete select iteration +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: simplify interference graph +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: initialize safe & unsafe +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: safe = 2266 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: unsafe = 687 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: inf = 1572 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: total = 4525 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: simplify +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 595 #Pinned 0 #Safe 0 minCost 0.00250196 maxCost 0.913296 locations 4526 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: new candidates = 181 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: (including 1076 infinite cost tensors) +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: select ranges +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Total: 4525 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Spilled: 0.012 (56) +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Allocated: 0.988 (4469) +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Rover zone: 0.738 (3298) +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Pre-rover zone: 0.021 (92) +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Post-rover zone: 0.241 (1075) +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Slice zone: 0.001 (4) +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Blocks nothing: 0.011 (51) +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Blocks medium: 0.002 (10) +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Visited until medium blocking (mean): 0.547 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Visited until medium blocking (median): 0.686 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Visited until medium blocking (p95): 0.737 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Blocks tall: 0.986 (4408) +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Visited until tall blocking (mean): 0.978 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: Success +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: SB spills = 56 tensors +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: size = 71936 bytes/partition +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: remats = 0 tensors +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: unpinned = 0 tensors +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: size = 0 bytes/partition +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: SB score = 641114 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: best SB heuristic = 0 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: collect spills +2025-09-05T19:15:19Z INFO 1724 (sg00) [DMAOptimizationBase]: SB Rotation rotated 167 Sb address +2025-09-05T19:15:19Z INFO 1724 (sg01) [SB_Allocator]: Num intervals 9481 Num locations 9481 +2025-09-05T19:15:19Z INFO 1724 (sg01) [SB_Allocator]: IntervalTree Build Done +2025-09-05T19:15:19Z INFO 1724 (sg01) [SB_Allocator]: info.neighbors init Done +2025-09-05T19:15:19Z INFO 1724 (sg01) [SB_Allocator]: info.neighbors partners Done +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: insert spills +2025-09-05T19:15:19Z INFO 1724 (sg01) [SB_Allocator]: IntervalTree readback Done +2025-09-05T19:15:19Z INFO 1724 (sg01) [SB_Allocator]: edge: 1478544 +2025-09-05T19:15:19Z INFO 1724 (sg01) [SB_Allocator]: mean: 311.896 +2025-09-05T19:15:19Z INFO 1724 (sg01) [SB_Allocator]: median: 194.596 +2025-09-05T19:15:19Z INFO 1724 (sg01) [SB_Allocator]: find costs +2025-09-05T19:15:19Z INFO 1724 (sg00) [DMAOptimizationBase]: SB Rotation rotated 142 Sb address +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: deleting loads #loadsToDelete: 0 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: deleting locs #locationsToDelete: 0 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: locationsToDelete done +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: main loop +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: renumber locations +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: size = 4791 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: find partners +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: found 1129 accumulation groups +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: largest = _dot.228-t859_i59 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: tensors = 112 +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: requires 114688 bytes/partition +2025-09-05T19:15:19Z INFO 1724 (sg02) [SB_Allocator]: expanding partners +2025-09-05T19:15:19Z INFO 1724 (sg01) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-09-05T19:15:19Z INFO 1724 (sg01) [SB_Allocator]: simplify interference graph +2025-09-05T19:15:19Z INFO 1724 (sg01) [SB_Allocator]: initialize safe & unsafe +2025-09-05T19:15:19Z INFO 1724 (sg01) [SB_Allocator]: safe = 286 +2025-09-05T19:15:19Z INFO 1724 (sg01) [SB_Allocator]: unsafe = 6220 +2025-09-05T19:15:19Z INFO 1724 (sg01) [SB_Allocator]: inf = 2974 +2025-09-05T19:15:19Z INFO 1724 (sg01) [SB_Allocator]: total = 9480 +2025-09-05T19:15:19Z INFO 1724 (sg01) [SB_Allocator]: simplify +2025-09-05T19:15:19Z INFO 1724 (sg01) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 6216 #Pinned 0 #Safe 0 minCost 0.000460461 maxCost 0.0757071 locations 9481 +2025-09-05T19:15:19Z INFO 1724 (sg00) [DMAOptimizationBase]: SB Rotation rotated 63 Sb address +2025-09-05T19:15:19Z INFO 1724 (sg01) [SB_Allocator]: new candidates = 953 +2025-09-05T19:15:19Z INFO 1724 (sg01) [SB_Allocator]: select ranges +2025-09-05T19:15:20Z INFO 1724 []: find first defs for local +2025-09-05T19:15:20Z INFO 1724 (sg00) [DMAOptimizationBase]: SB Rotation rotated 1489 Sb address +2025-09-05T19:15:20Z INFO 1724 []: find first defs for global +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: find loads +2025-09-05T19:15:20Z INFO 1724 (sg01) [SB_Allocator]: Total: 9480 +2025-09-05T19:15:20Z INFO 1724 (sg01) [SB_Allocator]: Spilled: 0.058 (552) +2025-09-05T19:15:20Z INFO 1724 (sg01) [SB_Allocator]: Allocated: 0.942 (8928) +2025-09-05T19:15:20Z INFO 1724 (sg01) [SB_Allocator]: Rover zone: 0.240 (2147) +2025-09-05T19:15:20Z INFO 1724 (sg01) [SB_Allocator]: Pre-rover zone: 0.007 (66) +2025-09-05T19:15:20Z INFO 1724 (sg01) [SB_Allocator]: Post-rover zone: 0.752 (6715) +2025-09-05T19:15:20Z INFO 1724 (sg01) [SB_Allocator]: Slice zone: 0.000 (0) +2025-09-05T19:15:20Z INFO 1724 (sg01) [SB_Allocator]: Blocks nothing: 0.000 (1) +2025-09-05T19:15:20Z INFO 1724 (sg01) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-09-05T19:15:20Z INFO 1724 (sg01) [SB_Allocator]: Blocks tall: 1.000 (8927) +2025-09-05T19:15:20Z INFO 1724 (sg01) [SB_Allocator]: Visited until tall blocking (mean): 1.000 +2025-09-05T19:15:20Z INFO 1724 (sg01) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-09-05T19:15:20Z INFO 1724 (sg01) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-09-05T19:15:20Z INFO 1724 (sg01) [SB_Allocator]: Success +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: 1 pin count +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: 1959 remat count +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: 1 pinned tensors will require about 16384 bytes/partition +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: build interference graph +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: pass 1 int-tree +2025-09-05T19:15:20Z INFO 1724 (sg01) [SB_Allocator]: SB spills = 552 tensors +2025-09-05T19:15:20Z INFO 1724 (sg01) [SB_Allocator]: size = 720000 bytes/partition +2025-09-05T19:15:20Z INFO 1724 (sg01) [SB_Allocator]: remats = 11 tensors +2025-09-05T19:15:20Z INFO 1724 (sg01) [SB_Allocator]: unpinned = 0 tensors +2025-09-05T19:15:20Z INFO 1724 (sg01) [SB_Allocator]: size = 0 bytes/partition +2025-09-05T19:15:20Z INFO 1724 (sg01) [SB_Allocator]: SB score = 4.54307e+06 +2025-09-05T19:15:20Z INFO 1724 (sg01) [SB_Allocator]: best SB heuristic = 0 +2025-09-05T19:15:20Z INFO 1724 (sg01) [SB_Allocator]: collect spills +2025-09-05T19:15:20Z INFO 1724 (sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-09-05T19:15:20Z USER 1724 (sg00) [ModuleForkPass]: address_rotation_sb finished after 0.814 seconds +2025-09-05T19:15:20Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 849mb, ru_maxrss: 849mb (delta=24mb) +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: Num intervals 4791 Num locations 4791 +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: IntervalTree Build Done +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: info.neighbors init Done +2025-09-05T19:15:20Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9074 memory location(s), 1 block(s), and 20184 instruction(s). Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:20Z USER 1724 (sg00) [ModuleForkPass]: Running coloring_allocator_dram +2025-09-05T19:15:20Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=9074 blocks=1 instructions=20184 Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:20Z INFO 1724 (sg00) [ColoringAllocator::Rep]: Allocating functions +2025-09-05T19:15:20Z INFO 1724 (sg00) [ColoringAllocator::Rep]: linearize and check +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: info.neighbors partners Done +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: IntervalTree readback Done +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: edge: 232334 +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: mean: 96.9877 +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: median: 94.0509 +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: find costs +2025-09-05T19:15:20Z INFO 1724 (sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-09-05T19:15:20Z INFO 1724 (sg00) [DRAM_Allocator]: reserved space = 248635654 bytes +2025-09-05T19:15:20Z INFO 1724 (sg00) [DRAM_Allocator]: spill space = 99352832 bytes +2025-09-05T19:15:20Z INFO 1724 (sg00) [DRAM_Allocator]: aligned spill space = 99356672 bytes +2025-09-05T19:15:20Z INFO 1724 (sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-09-05T19:15:20Z INFO 1724 (sg00) [DRAM_Allocator]: renumber locations +2025-09-05T19:15:20Z INFO 1724 (sg00) [DRAM_Allocator]: size = 100 +2025-09-05T19:15:20Z INFO 1724 []: find first defs for local +2025-09-05T19:15:20Z INFO 1724 []: find first defs for global +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: simplify interference graph +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: initialize safe & unsafe +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: safe = 161 +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: unsafe = 0 +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: inf = 160 +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: total = 321 +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: simplify +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 0 #Pinned 0 #Safe 0 minCost 1.79769e+308 maxCost 2.22507e-308 locations 4791 +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: new candidates = 0 +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: (including 160 infinite cost tensors) +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: select ranges +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: Total: 321 +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: Spilled: 0.312 (100) +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: Allocated: 0.688 (221) +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: Rover zone: 0.742 (164) +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: Pre-rover zone: 0.005 (1) +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: Post-rover zone: 0.253 (56) +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: Slice zone: 0.000 (0) +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: Blocks nothing: 0.000 (0) +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: Blocks tall: 1.000 (221) +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: Visited until tall blocking (mean): 1.000 +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: Success +2025-09-05T19:15:20Z INFO 1724 (sg01) [SB_Allocator]: insert spills +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: SB spills = 100 tensors +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: size = 102400 bytes/partition +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: remats = 98 tensors +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: unpinned = 0 tensors +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: size = 0 bytes/partition +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: SB score = inf +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: Incremental select cannot allocate, start a complete select iteration +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: simplify interference graph +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: initialize safe & unsafe +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: safe = 2721 +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: unsafe = 549 +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: inf = 1520 +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: total = 4790 +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: simplify +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 227 #Pinned 0 #Safe 0 minCost 0.0051418 maxCost 0.947387 locations 4791 +2025-09-05T19:15:20Z INFO 1724 (sg00) [DRAM_Allocator]: Num intervals 100 Num locations 100 +2025-09-05T19:15:20Z INFO 1724 (sg00) [DRAM_Allocator]: IntervalTree Build Done +2025-09-05T19:15:20Z INFO 1724 (sg00) [DRAM_Allocator]: info.neighbors init Done +2025-09-05T19:15:20Z INFO 1724 (sg00) [DRAM_Allocator]: IntervalTree readback Done +2025-09-05T19:15:20Z INFO 1724 (sg00) [DRAM_Allocator]: simplify interference graph +2025-09-05T19:15:20Z INFO 1724 (sg00) [DRAM_Allocator]: initialize low and high +2025-09-05T19:15:20Z INFO 1724 (sg00) [DRAM_Allocator]: lo = 100 +2025-09-05T19:15:20Z INFO 1724 (sg00) [DRAM_Allocator]: hi = 0 +2025-09-05T19:15:20Z INFO 1724 (sg00) [DRAM_Allocator]: total = 100 +2025-09-05T19:15:20Z INFO 1724 (sg00) [DRAM_Allocator]: simplify +2025-09-05T19:15:20Z INFO 1724 (sg00) [DRAM_Allocator]: new candidates = 0 +2025-09-05T19:15:20Z INFO 1724 (sg00) [DRAM_Allocator]: select ranges +2025-09-05T19:15:20Z INFO 1724 (sg00) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: new candidates = 124 +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: (including 1176 infinite cost tensors) +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: select ranges +2025-09-05T19:15:20Z INFO 1724 (sg00) [DRAM_Allocator]: allreduce_dram_hwm 58720256 +2025-09-05T19:15:20Z INFO 1724 (sg00) [DRAM_Allocator]: Real CC buffer size 58720256 +2025-09-05T19:15:20Z INFO 1724 (sg00) [DRAM_Allocator]: DRAM hwm after allocation: 90701824 +2025-09-05T19:15:20Z INFO 1724 (sg00) [DRAM_Allocator]: DRAM allocation successful +2025-09-05T19:15:20Z USER 1724 (sg00) [ModuleForkPass]: coloring_allocator_dram finished after 0.107 seconds +2025-09-05T19:15:20Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 855mb, ru_maxrss: 855mb (delta=6mb) +2025-09-05T19:15:20Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9074 memory location(s), 1 block(s), and 20184 instruction(s). Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:20Z USER 1724 (sg00) [ModuleForkPass]: Running address_rotation_dram +2025-09-05T19:15:20Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=9074 blocks=1 instructions=20184 Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:20Z INFO 1724 (sg00) [DMAOptimizationBase]: Runtime page size at 512MB +2025-09-05T19:15:20Z INFO 1724 (sg00) [DMAOptimizationBase]: DRAM hwm before rotation 90701824 +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: Total: 4790 +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: Spilled: 0.011 (52) +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: Allocated: 0.989 (4738) +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: Rover zone: 0.789 (3737) +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: Pre-rover zone: 0.014 (64) +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: Post-rover zone: 0.197 (933) +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: Slice zone: 0.001 (4) +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: Blocks nothing: 0.011 (50) +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: Blocks medium: 0.002 (10) +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: Visited until medium blocking (mean): 0.567 +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: Visited until medium blocking (median): 0.651 +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: Visited until medium blocking (p95): 0.816 +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: Blocks tall: 0.987 (4678) +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: Visited until tall blocking (mean): 0.979 +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: Success +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: SB spills = 52 tensors +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: size = 53248 bytes/partition +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: remats = 0 tensors +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: unpinned = 0 tensors +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: size = 0 bytes/partition +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: SB score = 463060 +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: best SB heuristic = 0 +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: collect spills +2025-09-05T19:15:20Z INFO 1724 (sg00) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-09-05T19:15:20Z INFO 1724 (sg00) [DMAOptimizationBase]: allreduce hwm 58720256 +2025-09-05T19:15:20Z INFO 1724 (sg00) [DMAOptimizationBase]: Real CC buffer size 58720256 +2025-09-05T19:15:20Z INFO 1724 (sg00) [DMAOptimizationBase]: DRAM hwm after rotation 90701824 +2025-09-05T19:15:20Z INFO 1724 (sg00) [DMAOptimizationBase]: DRAM Rotation rotated 1 Dram address +2025-09-05T19:15:20Z USER 1724 (sg00) [ModuleForkPass]: address_rotation_dram finished after 0.064 seconds +2025-09-05T19:15:20Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 862mb, ru_maxrss: 862mb (delta=7mb) +2025-09-05T19:15:20Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9074 memory location(s), 1 block(s), and 20184 instruction(s). Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:20Z USER 1724 (sg00) [ModuleForkPass]: Running tensorcopy_accel +2025-09-05T19:15:20Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=9074 blocks=1 instructions=20184 Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:20Z INFO 1724 (sg00) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-09-05T19:15:20Z INFO 1724 (sg00) [TensorCopyAccel::Impl]: Accelerated 128 out of 2201 tensorcopy in Function: sg0000 average acceleration factor: 1 +2025-09-05T19:15:20Z USER 1724 (sg00) [ModuleForkPass]: tensorcopy_accel finished after 0.004 seconds +2025-09-05T19:15:20Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 865mb, ru_maxrss: 865mb (delta=3mb) +2025-09-05T19:15:20Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9074 memory location(s), 1 block(s), and 20184 instruction(s). Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:20Z USER 1724 (sg00) [ModuleForkPass]: Running peephole_opts +2025-09-05T19:15:20Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=9074 blocks=1 instructions=20184 Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:20Z INFO 1724 (sg00) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-09-05T19:15:20Z USER 1724 (sg00) [ModuleForkPass]: peephole_opts finished after 0.016 seconds +2025-09-05T19:15:20Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 869mb, ru_maxrss: 869mb (delta=4mb) +2025-09-05T19:15:20Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9074 memory location(s), 1 block(s), and 20185 instruction(s). Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:20Z USER 1724 (sg00) [ModuleForkPass]: Running lower_kernel +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: insert spills +2025-09-05T19:15:20Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=9074 blocks=1 instructions=20185 Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:20Z INFO 1724 (sg00) [LowerKernel]: Started running LowerKernel +2025-09-05T19:15:20Z INFO 1724 (sg00) [LowerKernel]: Start of kernel lowering pass, number of insts: 20185, number of allocs: 9074 +2025-09-05T19:15:20Z INFO 1724 (sg00) [LowerKernel]: Scan BKs time (s): 0.001672 +2025-09-05T19:15:20Z INFO 1724 (sg00) [LowerKernel]: Lower BKs time (s): 4e-06 +2025-09-05T19:15:20Z USER 1724 (sg00) [ModuleForkPass]: lower_kernel finished after 0.003 seconds +2025-09-05T19:15:20Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 871mb, ru_maxrss: 871mb (delta=1mb) +2025-09-05T19:15:20Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9074 memory location(s), 1 block(s), and 20185 instruction(s). Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:20Z USER 1724 (sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-09-05T19:15:20Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=9074 blocks=1 instructions=20185 Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:20Z INFO 1724 (sg01) [SB_Allocator]: deleting loads #loadsToDelete: 11 +2025-09-05T19:15:20Z INFO 1724 (sg01) [SB_Allocator]: deleting locs #locationsToDelete: 11 +2025-09-05T19:15:20Z INFO 1724 (sg01) [SB_Allocator]: locationsToDelete done +2025-09-05T19:15:20Z USER 1724 (sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.002 seconds +2025-09-05T19:15:20Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 872mb, ru_maxrss: 872mb (delta=1mb) +2025-09-05T19:15:20Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9074 memory location(s), 1 block(s), and 20185 instruction(s). Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:20Z USER 1724 (sg00) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-09-05T19:15:20Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=9074 blocks=1 instructions=20185 Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:20Z USER 1724 (sg00) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.003 seconds +2025-09-05T19:15:20Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 872mb, ru_maxrss: 872mb (delta=0mb) +2025-09-05T19:15:20Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9074 memory location(s), 1 block(s), and 20185 instruction(s). Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:20Z USER 1724 (sg00) [ModuleForkPass]: Running birverifier +2025-09-05T19:15:20Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=9074 blocks=1 instructions=20185 Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: deleting loads #loadsToDelete: 0 +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: deleting locs #locationsToDelete: 0 +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: locationsToDelete done +2025-09-05T19:15:20Z INFO 1724 (sg01) [SB_Allocator]: main loop +2025-09-05T19:15:20Z INFO 1724 (sg01) [SB_Allocator]: renumber locations +2025-09-05T19:15:20Z INFO 1724 (sg01) [SB_Allocator]: size = 10867 +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: main loop +2025-09-05T19:15:20Z INFO 1724 (sg01) [SB_Allocator]: find partners +2025-09-05T19:15:20Z USER 1724 (sg00) [ModuleForkPass]: birverifier finished after 0.065 seconds +2025-09-05T19:15:20Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 872mb, ru_maxrss: 872mb (delta=0mb) +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: renumber locations +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: size = 4999 +2025-09-05T19:15:20Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9074 memory location(s), 1 block(s), and 20185 instruction(s). Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:20Z USER 1724 (sg00) [ModuleForkPass]: Running dynamic_dma_scan +2025-09-05T19:15:20Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=9074 blocks=1 instructions=20185 Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: find partners +2025-09-05T19:15:20Z USER 1724 (sg00) [ModuleForkPass]: dynamic_dma_scan finished after 0.003 seconds +2025-09-05T19:15:20Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 843mb, ru_maxrss: 872mb (delta=0mb) +2025-09-05T19:15:20Z INFO 1724 (sg01) [SB_Allocator]: found 3376 accumulation groups +2025-09-05T19:15:20Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9074 memory location(s), 1 block(s), and 20185 instruction(s). Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:20Z USER 1724 (sg00) [ModuleForkPass]: Running build_fdeps +2025-09-05T19:15:20Z INFO 1724 (sg01) [SB_Allocator]: largest = _dot.6-t1177_i33 +2025-09-05T19:15:20Z INFO 1724 (sg01) [SB_Allocator]: tensors = 112 +2025-09-05T19:15:20Z INFO 1724 (sg01) [SB_Allocator]: requires 114688 bytes/partition +2025-09-05T19:15:20Z INFO 1724 (sg01) [SB_Allocator]: expanding partners +2025-09-05T19:15:20Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=9074 blocks=1 instructions=20185 Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:20Z INFO 1724 (sg00) [build_flow_deps]: Start build fdeps. Invocation: 4Fri Sep 5 19:15:20 2025 +2025-09-05T19:15:20Z INFO 1724 (sg00) [build_flow_deps]: Allocs: 9074 instructions: 20185 +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: found 1129 accumulation groups +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: largest = _dot.228-t859_i59 +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: tensors = 112 +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: requires 114688 bytes/partition +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: expanding partners +2025-09-05T19:15:20Z INFO 1724 (sg00) [build_flow_deps]: Build fdeps inserted 52336 edges +2025-09-05T19:15:20Z INFO 1724 (sg00) [build_flow_deps]: Done build fdeps 52336 Fri Sep 5 19:15:20 2025 +2025-09-05T19:15:20Z USER 1724 (sg00) [ModuleForkPass]: build_fdeps finished after 0.108 seconds +2025-09-05T19:15:20Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 864mb, ru_maxrss: 872mb (delta=0mb) +2025-09-05T19:15:20Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9074 memory location(s), 1 block(s), and 20185 instruction(s). Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:20Z USER 1724 (sg00) [ModuleForkPass]: Running remove_redundancies +2025-09-05T19:15:20Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=9074 blocks=1 instructions=20185 Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:20Z INFO 1724 (sg00) [RemoveRedundancies]: remove_clobbered_writes +2025-09-05T19:15:20Z INFO 1724 (sg00) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-09-05T19:15:20Z INFO 1724 (sg00) [RemoveRedundancies]: remove_useless_insts +2025-09-05T19:15:20Z INFO 1724 (sg00) [RemoveRedundancies]: remove Useless Instructions: 0 +2025-09-05T19:15:20Z USER 1724 (sg00) [ModuleForkPass]: remove_redundancies finished after 0.073 seconds +2025-09-05T19:15:20Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 864mb, ru_maxrss: 872mb (delta=0mb) +2025-09-05T19:15:20Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9074 memory location(s), 1 block(s), and 20185 instruction(s). Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:20Z USER 1724 (sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-09-05T19:15:20Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=9074 blocks=1 instructions=20185 Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:20Z INFO 1724 (sg00) [AntiDependencyAnalyzer]: Batch size: 1000 +2025-09-05T19:15:20Z INFO 1724 (sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-09-05T19:15:20Z INFO 1724 (sg00) [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-09-05T19:15:20Z INFO 1724 []: find first defs for local +2025-09-05T19:15:20Z INFO 1724 []: find first defs for global +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: find loads +2025-09-05T19:15:20Z USER 1724 (sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.215 seconds +2025-09-05T19:15:20Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 886mb, ru_maxrss: 886mb (delta=14mb) +2025-09-05T19:15:20Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9074 memory location(s), 1 block(s), and 20185 instruction(s). Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:20Z USER 1724 (sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-09-05T19:15:20Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=9074 blocks=1 instructions=20185 Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: 1 pin count +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: 2167 remat count +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: 1 pinned tensors will require about 16384 bytes/partition +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: build interference graph +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: pass 1 int-tree +2025-09-05T19:15:20Z INFO 1724 (sg00) [TensorCopyElim]: Tensor CP elimination: 0 +2025-09-05T19:15:20Z INFO 1724 []: find first defs for local +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: Num intervals 4999 Num locations 4999 +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: IntervalTree Build Done +2025-09-05T19:15:20Z INFO 1724 (sg02) [SB_Allocator]: info.neighbors init Done +2025-09-05T19:15:20Z INFO 1724 (sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-09-05T19:15:20Z USER 1724 (sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.095 seconds +2025-09-05T19:15:20Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 880mb, ru_maxrss: 886mb (delta=0mb) +2025-09-05T19:15:21Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9074 memory location(s), 1 block(s), and 20185 instruction(s). Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:21Z USER 1724 (sg00) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-09-05T19:15:21Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=9074 blocks=1 instructions=20185 Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:21Z USER 1724 (sg00) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.001 seconds +2025-09-05T19:15:21Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 881mb, ru_maxrss: 886mb (delta=0mb) +2025-09-05T19:15:21Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9074 memory location(s), 1 block(s), and 20185 instruction(s). Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:21Z USER 1724 (sg00) [ModuleForkPass]: Running post_sched +2025-09-05T19:15:21Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=9074 blocks=1 instructions=20185 Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:21Z INFO 1724 [post_scheduler]: Start PosT ScheD 3 sunda Fri Sep 5 19:15:21 2025 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: info.neighbors partners Done +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: IntervalTree readback Done +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: edge: 231892 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: mean: 92.7754 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: median: 99.7345 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: find costs +2025-09-05T19:15:21Z INFO 1724 []: find first defs for global +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: simplify interference graph +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: initialize safe & unsafe +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: safe = 52 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: unsafe = 0 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: inf = 208 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: total = 260 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: simplify +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 0 #Pinned 0 #Safe 0 minCost 1.79769e+308 maxCost 2.22507e-308 locations 4999 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: new candidates = 0 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: (including 208 infinite cost tensors) +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: select ranges +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Total: 260 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Spilled: 0.385 (100) +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Allocated: 0.615 (160) +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Rover zone: 0.325 (52) +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Pre-rover zone: 0.000 (0) +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Post-rover zone: 0.675 (108) +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Slice zone: 0.000 (0) +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Blocks nothing: 0.000 (0) +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Blocks tall: 1.000 (160) +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Visited until tall blocking (mean): 1.000 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Success +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: SB spills = 100 tensors +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: size = 102400 bytes/partition +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: remats = 100 tensors +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: unpinned = 0 tensors +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: size = 0 bytes/partition +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: SB score = inf +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Incremental select cannot allocate, start a complete select iteration +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: simplify interference graph +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: initialize safe & unsafe +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: safe = 2869 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: unsafe = 497 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: inf = 1632 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: total = 4998 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: simplify +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 159 #Pinned 0 #Safe 0 minCost 0.00554868 maxCost 0.947387 locations 4999 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: new candidates = 72 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: (including 1440 infinite cost tensors) +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: select ranges +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: find loads +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Total: 4998 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Spilled: 0.008 (40) +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Allocated: 0.992 (4958) +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Rover zone: 0.735 (3643) +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Pre-rover zone: 0.010 (48) +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Post-rover zone: 0.255 (1263) +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Slice zone: 0.001 (4) +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Blocks nothing: 0.010 (50) +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Blocks medium: 0.002 (10) +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Visited until medium blocking (mean): 0.567 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Visited until medium blocking (median): 0.651 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Visited until medium blocking (p95): 0.816 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Blocks tall: 0.988 (4898) +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Visited until tall blocking (mean): 0.980 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Success +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: SB spills = 40 tensors +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: size = 77824 bytes/partition +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: remats = 0 tensors +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: unpinned = 0 tensors +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: size = 0 bytes/partition +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: SB score = 559132 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: best SB heuristic = 0 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: collect spills +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: 1 pin count +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: 2424 remat count +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: 1 pinned tensors will require about 16384 bytes/partition +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: build interference graph +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: pass 1 int-tree +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: insert spills +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: Num intervals 10867 Num locations 10867 +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: IntervalTree Build Done +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: info.neighbors init Done +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: deleting loads #loadsToDelete: 0 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: deleting locs #locationsToDelete: 0 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: locationsToDelete done +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: main loop +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: info.neighbors partners Done +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: IntervalTree readback Done +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: edge: 724095 +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: mean: 133.265 +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: median: 85.4095 +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: find costs +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: renumber locations +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: size = 5159 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: find partners +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: found 1129 accumulation groups +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: largest = _dot.228-t859_i59 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: tensors = 112 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: requires 114688 bytes/partition +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: expanding partners +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: simplify interference graph +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: initialize safe & unsafe +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: safe = 940 +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: unsafe = 311 +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: inf = 687 +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: total = 1938 +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: simplify +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 303 #Pinned 0 #Safe 0 minCost 0.00648629 maxCost 0.0283161 locations 10867 +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: new candidates = 292 +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: (including 677 infinite cost tensors) +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: select ranges +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: Total: 1938 +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: Spilled: 0.093 (180) +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: Allocated: 0.907 (1758) +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: Rover zone: 0.803 (1412) +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: Pre-rover zone: 0.073 (128) +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: Post-rover zone: 0.124 (218) +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: Slice zone: 0.000 (0) +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: Blocks nothing: 0.000 (0) +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: Blocks tall: 1.000 (1758) +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: Visited until tall blocking (mean): 1.000 +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: Success +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: SB spills = 180 tensors +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: size = 184320 bytes/partition +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: remats = 180 tensors +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: unpinned = 0 tensors +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: size = 0 bytes/partition +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: SB score = inf +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: Incremental select cannot allocate, start a complete select iteration +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: simplify interference graph +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: initialize safe & unsafe +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: safe = 2242 +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: unsafe = 5945 +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: inf = 2679 +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: total = 10866 +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: simplify +2025-09-05T19:15:21Z INFO 1724 [post_scheduler]: Time-aware hwm post-sched +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 5865 #Pinned 0 #Safe 0 minCost 0.000902565 maxCost 0.911128 locations 10867 +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: new candidates = 726 +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: (including 1076 infinite cost tensors) +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: select ranges +2025-09-05T19:15:21Z INFO 1724 []: find first defs for local +2025-09-05T19:15:21Z INFO 1724 []: find first defs for global +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: Total: 10866 +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: Spilled: 0.007 (81) +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: Allocated: 0.993 (10785) +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: Rover zone: 0.354 (3822) +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: Pre-rover zone: 0.008 (86) +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: Post-rover zone: 0.638 (6877) +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: Slice zone: 0.000 (0) +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: Blocks nothing: 0.000 (1) +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: Blocks tall: 1.000 (10784) +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: Visited until tall blocking (mean): 1.000 +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: Success +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: SB spills = 81 tensors +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: size = 98304 bytes/partition +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: remats = 0 tensors +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: unpinned = 0 tensors +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: size = 0 bytes/partition +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: SB score = 818433 +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: best SB heuristic = 0 +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: collect spills +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: find loads +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: 1 pin count +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: 2291 remat count +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: 1 pinned tensors will require about 16384 bytes/partition +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: build interference graph +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: pass 1 int-tree +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Num intervals 5159 Num locations 5159 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: IntervalTree Build Done +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: info.neighbors init Done +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: insert spills +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: info.neighbors partners Done +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: deleting loads #loadsToDelete: 0 +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: deleting locs #locationsToDelete: 0 +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: locationsToDelete done +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: IntervalTree readback Done +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: edge: 233912 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: mean: 90.6811 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: median: 102.573 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: find costs +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: main loop +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: renumber locations +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: size = 11194 +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: find partners +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: simplify interference graph +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: initialize safe & unsafe +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: safe = 40 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: unsafe = 56 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: inf = 104 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: total = 200 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: simplify +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 56 #Pinned 0 #Safe 0 minCost 0.00551513 maxCost 0.00551513 locations 5159 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: new candidates = 56 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: (including 104 infinite cost tensors) +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: select ranges +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Total: 200 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Spilled: 0.380 (76) +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Allocated: 0.620 (124) +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Rover zone: 0.419 (52) +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Pre-rover zone: 0.000 (0) +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Post-rover zone: 0.581 (72) +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Slice zone: 0.000 (0) +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Blocks nothing: 0.000 (0) +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Blocks tall: 1.000 (124) +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Visited until tall blocking (mean): 1.000 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Success +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: SB spills = 76 tensors +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: size = 151552 bytes/partition +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: remats = 52 tensors +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: unpinned = 0 tensors +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: size = 0 bytes/partition +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: SB score = inf +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Incremental select cannot allocate, start a complete select iteration +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: simplify interference graph +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: initialize safe & unsafe +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: safe = 2909 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: unsafe = 1793 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: inf = 456 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: total = 5158 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: simplify +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 1455 #Pinned 0 #Safe 0 minCost 0.00551513 maxCost 0.947387 locations 5159 +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: found 3376 accumulation groups +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: largest = _dot.6-t1177_i33 +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: tensors = 112 +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: requires 114688 bytes/partition +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: expanding partners +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: new candidates = 558 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: select ranges +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Total: 5158 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Spilled: 0.019 (100) +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Allocated: 0.981 (5058) +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Rover zone: 0.734 (3713) +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Pre-rover zone: 0.010 (52) +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Post-rover zone: 0.255 (1289) +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Slice zone: 0.001 (4) +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Blocks nothing: 0.010 (50) +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Blocks medium: 0.002 (10) +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Visited until medium blocking (mean): 0.573 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Visited until medium blocking (median): 0.711 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Visited until medium blocking (p95): 0.785 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Blocks tall: 0.988 (4998) +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Visited until tall blocking (mean): 0.980 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Success +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: SB spills = 100 tensors +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: size = 102400 bytes/partition +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: remats = 100 tensors +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: unpinned = 0 tensors +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: size = 0 bytes/partition +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: SB score = 391820 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: best SB heuristic = 0 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: collect spills +2025-09-05T19:15:21Z INFO 1724 [post_scheduler]: Time-aware simulation time: 4394455 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: insert spills +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: deleting loads #loadsToDelete: 0 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: deleting locs #locationsToDelete: 0 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: locationsToDelete done +2025-09-05T19:15:21Z INFO 1724 [post_scheduler]: Done PosT ScheD Fri Sep 5 19:15:21 2025 +2025-09-05T19:15:21Z USER 1724 (sg00) [ModuleForkPass]: post_sched finished after 0.727 seconds +2025-09-05T19:15:21Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 907mb, ru_maxrss: 907mb (delta=21mb) +2025-09-05T19:15:21Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9074 memory location(s), 1 block(s), and 20185 instruction(s). Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:21Z USER 1724 (sg00) [ModuleForkPass]: Running expand_scheduling_units +2025-09-05T19:15:21Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=9074 blocks=1 instructions=20185 Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:21Z USER 1724 (sg00) [ModuleForkPass]: expand_scheduling_units finished after 0.004 seconds +2025-09-05T19:15:21Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 892mb, ru_maxrss: 907mb (delta=0mb) +2025-09-05T19:15:21Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9074 memory location(s), 1 block(s), and 20185 instruction(s). Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:21Z USER 1724 (sg00) [ModuleForkPass]: Running address_rotation_sb +2025-09-05T19:15:21Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=9074 blocks=1 instructions=20185 Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: main loop +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: renumber locations +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: size = 5379 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: find partners +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: found 1129 accumulation groups +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: largest = _dot.228-t859_i59 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: tensors = 112 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: requires 114688 bytes/partition +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: expanding partners +2025-09-05T19:15:21Z INFO 1724 []: find first defs for local +2025-09-05T19:15:21Z INFO 1724 []: find first defs for global +2025-09-05T19:15:21Z INFO 1724 []: find first defs for local +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: find loads +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: 1 pin count +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: 2736 remat count +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: 1 pinned tensors will require about 16384 bytes/partition +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: build interference graph +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: pass 1 int-tree +2025-09-05T19:15:21Z INFO 1724 []: find first defs for global +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: find loads +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: 1 pin count +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: 2511 remat count +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: 1 pinned tensors will require about 16384 bytes/partition +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: build interference graph +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: pass 1 int-tree +2025-09-05T19:15:21Z INFO 1724 (sg00) [DMAOptimizationBase]: PSUM Rotation rotated 933 PSUM Banks +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: Num intervals 11194 Num locations 11194 +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: IntervalTree Build Done +2025-09-05T19:15:21Z INFO 1724 (sg01) [SB_Allocator]: info.neighbors init Done +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: Num intervals 5379 Num locations 5379 +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: IntervalTree Build Done +2025-09-05T19:15:21Z INFO 1724 (sg02) [SB_Allocator]: info.neighbors init Done +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: info.neighbors partners Done +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: IntervalTree readback Done +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: edge: 680458 +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: mean: 121.575 +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: median: 81.4087 +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: find costs +2025-09-05T19:15:22Z INFO 1724 (sg00) [DMAOptimizationBase]: PSUM Rotation rotated 849 PSUM Banks +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: info.neighbors partners Done +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: IntervalTree readback Done +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: edge: 253410 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: mean: 94.222 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: median: 111.953 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: find costs +2025-09-05T19:15:22Z INFO 1724 (sg00) [DMAOptimizationBase]: PSUM Rotation rotated 298 PSUM Banks +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: simplify interference graph +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: initialize safe & unsafe +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: safe = 0 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: unsafe = 0 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: inf = 320 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: total = 320 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: simplify +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 0 #Pinned 0 #Safe 0 minCost 1.79769e+308 maxCost 2.22507e-308 locations 5379 +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: simplify interference graph +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: initialize safe & unsafe +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: safe = 177 +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: unsafe = 2 +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: inf = 229 +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: total = 408 +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: simplify +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 0 #Pinned 0 #Safe 0 minCost 1.79769e+308 maxCost 2.22507e-308 locations 11194 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: new candidates = 0 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: (including 320 infinite cost tensors) +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: select ranges +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: Total: 320 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: Spilled: 0.200 (64) +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: Allocated: 0.800 (256) +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: Rover zone: 0.562 (144) +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: Pre-rover zone: 0.047 (12) +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: Post-rover zone: 0.391 (100) +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: Slice zone: 0.000 (0) +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: Blocks nothing: 0.000 (0) +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: Blocks tall: 1.000 (256) +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: Visited until tall blocking (mean): 1.000 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: Success +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: new candidates = 0 +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: (including 229 infinite cost tensors) +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: select ranges +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: Total: 408 +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: Spilled: 0.245 (100) +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: Allocated: 0.755 (308) +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: Rover zone: 0.656 (202) +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: Pre-rover zone: 0.019 (6) +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: Post-rover zone: 0.325 (100) +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: Slice zone: 0.000 (0) +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: Blocks nothing: 0.000 (0) +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: Blocks tall: 1.000 (308) +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: Visited until tall blocking (mean): 1.000 +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: Success +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: SB spills = 64 tensors +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: size = 65536 bytes/partition +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: remats = 64 tensors +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: unpinned = 0 tensors +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: size = 0 bytes/partition +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: SB score = inf +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: Incremental select cannot allocate, start a complete select iteration +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: simplify interference graph +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: initialize safe & unsafe +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: safe = 2909 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: unsafe = 1693 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: inf = 776 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: total = 5378 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: simplify +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 1355 #Pinned 0 #Safe 0 minCost 0.00481337 maxCost 0.947387 locations 5379 +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: SB spills = 100 tensors +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: size = 102400 bytes/partition +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: remats = 98 tensors +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: unpinned = 0 tensors +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: size = 0 bytes/partition +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: SB score = inf +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: Incremental select cannot allocate, start a complete select iteration +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: new candidates = 500 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: select ranges +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: simplify interference graph +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: initialize safe & unsafe +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: safe = 2783 +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: unsafe = 5848 +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: inf = 2562 +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: total = 11193 +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: simplify +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 5714 #Pinned 0 #Safe 0 minCost 0.000949358 maxCost 0.294184 locations 11194 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: Total: 5378 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: Spilled: 0.008 (44) +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: Allocated: 0.992 (5334) +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: Rover zone: 0.738 (3938) +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: Pre-rover zone: 0.009 (47) +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: Post-rover zone: 0.252 (1345) +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: Slice zone: 0.001 (4) +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: Blocks nothing: 0.009 (50) +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: Blocks medium: 0.002 (10) +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: Visited until medium blocking (mean): 0.573 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: Visited until medium blocking (median): 0.711 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: Visited until medium blocking (p95): 0.785 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: Blocks tall: 0.989 (5274) +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: Visited until tall blocking (mean): 0.981 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: Success +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: new candidates = 643 +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: (including 1176 infinite cost tensors) +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: select ranges +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: SB spills = 44 tensors +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: size = 45056 bytes/partition +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: remats = 44 tensors +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: unpinned = 0 tensors +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: size = 0 bytes/partition +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: SB score = 217282 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: best SB heuristic = 0 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: collect spills +2025-09-05T19:15:22Z INFO 1724 (sg00) [DMAOptimizationBase]: SB Rotation rotated 20 Sb address +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: Total: 11193 +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: Spilled: 0.005 (52) +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: Allocated: 0.995 (11141) +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: Rover zone: 0.359 (4003) +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: Pre-rover zone: 0.011 (127) +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: Post-rover zone: 0.629 (7011) +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: Slice zone: 0.000 (0) +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: Blocks nothing: 0.000 (1) +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: Blocks tall: 1.000 (11140) +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: Visited until tall blocking (mean): 1.000 +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: Success +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: insert spills +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: SB spills = 52 tensors +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: size = 53248 bytes/partition +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: remats = 0 tensors +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: unpinned = 0 tensors +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: size = 0 bytes/partition +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: SB score = 463060 +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: best SB heuristic = 0 +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: collect spills +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: deleting loads #loadsToDelete: 0 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: deleting locs #locationsToDelete: 0 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: locationsToDelete done +2025-09-05T19:15:22Z INFO 1724 (sg00) [DMAOptimizationBase]: SB Rotation rotated 45 Sb address +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: main loop +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: renumber locations +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: size = 5501 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: find partners +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: found 1129 accumulation groups +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: largest = _dot.228-t859_i59 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: tensors = 112 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: requires 114688 bytes/partition +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: expanding partners +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: insert spills +2025-09-05T19:15:22Z INFO 1724 (sg00) [DMAOptimizationBase]: SB Rotation rotated 132 Sb address +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: deleting loads #loadsToDelete: 0 +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: deleting locs #locationsToDelete: 0 +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: locationsToDelete done +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: main loop +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: renumber locations +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: size = 11402 +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: find partners +2025-09-05T19:15:22Z INFO 1724 []: find first defs for local +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: found 3376 accumulation groups +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: largest = _dot.6-t1177_i33 +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: tensors = 112 +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: requires 114688 bytes/partition +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: expanding partners +2025-09-05T19:15:22Z INFO 1724 (sg00) [DMAOptimizationBase]: SB Rotation rotated 156 Sb address +2025-09-05T19:15:22Z INFO 1724 []: find first defs for global +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: find loads +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: 1 pin count +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: 2633 remat count +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: 1 pinned tensors will require about 16384 bytes/partition +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: build interference graph +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: pass 1 int-tree +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: Num intervals 5501 Num locations 5501 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: IntervalTree Build Done +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: info.neighbors init Done +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: info.neighbors partners Done +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: IntervalTree readback Done +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: edge: 259002 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: mean: 94.1654 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: median: 110.66 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: find costs +2025-09-05T19:15:22Z INFO 1724 (sg00) [DMAOptimizationBase]: SB Rotation rotated 862 Sb address +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: simplify interference graph +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: initialize safe & unsafe +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: safe = 0 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: unsafe = 0 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: inf = 166 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: total = 166 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: simplify +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 0 #Pinned 0 #Safe 0 minCost 1.79769e+308 maxCost 2.22507e-308 locations 5501 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: new candidates = 0 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: (including 166 infinite cost tensors) +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: select ranges +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: Total: 166 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: Spilled: 0.000 (0) +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: Allocated: 1.000 (166) +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: Rover zone: 0.831 (138) +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: Pre-rover zone: 0.048 (8) +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: Post-rover zone: 0.120 (20) +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: Slice zone: 0.000 (0) +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: Blocks nothing: 0.000 (0) +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: Blocks tall: 1.000 (166) +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: Visited until tall blocking (mean): 1.000 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-09-05T19:15:22Z INFO 1724 (sg02) [SB_Allocator]: Success +2025-09-05T19:15:22Z INFO 1724 (sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-09-05T19:15:22Z INFO 1724 []: find first defs for local +2025-09-05T19:15:22Z INFO 1724 []: find first defs for global +2025-09-05T19:15:22Z INFO 1724 (sg00) [DMAOptimizationBase]: SB Rotation rotated 5 Sb address +2025-09-05T19:15:22Z USER 1724 (sg00) [ModuleForkPass]: address_rotation_sb finished after 1.187 seconds +2025-09-05T19:15:22Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 909mb, ru_maxrss: 909mb (delta=2mb) +2025-09-05T19:15:22Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9074 memory location(s), 1 block(s), and 20185 instruction(s). Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:22Z USER 1724 (sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-09-05T19:15:22Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=9074 blocks=1 instructions=20185 Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:22Z INFO 1724 (sg00) [AntiDependencyAnalyzer]: Batch size: 1000 +2025-09-05T19:15:22Z INFO 1724 (sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-09-05T19:15:22Z INFO 1724 (sg00) [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-09-05T19:15:22Z INFO 1724 (sg01) [SB_Allocator]: find loads +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: 1 pin count +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: 2944 remat count +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: 1 pinned tensors will require about 16384 bytes/partition +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: build interference graph +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: pass 1 int-tree +2025-09-05T19:15:23Z USER 1724 (sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.183 seconds +2025-09-05T19:15:23Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 912mb, ru_maxrss: 912mb (delta=3mb) +2025-09-05T19:15:23Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9074 memory location(s), 1 block(s), and 20185 instruction(s). Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:23Z USER 1724 (sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-09-05T19:15:23Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=9074 blocks=1 instructions=20185 Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:23Z INFO 1724 (sg00) [AntiDependencyAnalyzer]: Batch size: 1000 +2025-09-05T19:15:23Z INFO 1724 (sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-09-05T19:15:23Z INFO 1724 (sg00) [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: Num intervals 11402 Num locations 11402 +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: IntervalTree Build Done +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: info.neighbors init Done +2025-09-05T19:15:23Z USER 1724 (sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.079 seconds +2025-09-05T19:15:23Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 906mb, ru_maxrss: 912mb (delta=0mb) +2025-09-05T19:15:23Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9074 memory location(s), 1 block(s), and 20185 instruction(s). Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:23Z USER 1724 (sg00) [ModuleForkPass]: Running dep_opt +2025-09-05T19:15:23Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=9074 blocks=1 instructions=20185 Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:23Z INFO 1724 (sg00) [build_flow_deps]: Start build fdeps. Invocation: 5Fri Sep 5 19:15:23 2025 +2025-09-05T19:15:23Z INFO 1724 (sg00) [build_flow_deps]: Allocs: 9074 instructions: 20185 +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: info.neighbors partners Done +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: IntervalTree readback Done +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: edge: 680224 +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: mean: 119.317 +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: median: 83.2723 +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: find costs +2025-09-05T19:15:23Z INFO 1724 (sg00) [build_flow_deps]: Build fdeps inserted 50129 edges +2025-09-05T19:15:23Z INFO 1724 (sg00) [build_flow_deps]: Done build fdeps 50129 Fri Sep 5 19:15:23 2025 +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: simplify interference graph +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: initialize safe & unsafe +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: safe = 52 +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: unsafe = 0 +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: inf = 208 +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: total = 260 +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: simplify +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 0 #Pinned 0 #Safe 0 minCost 1.79769e+308 maxCost 2.22507e-308 locations 11402 +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: new candidates = 0 +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: (including 208 infinite cost tensors) +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: select ranges +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: Total: 260 +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: Spilled: 0.385 (100) +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: Allocated: 0.615 (160) +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: Rover zone: 0.325 (52) +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: Pre-rover zone: 0.000 (0) +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: Post-rover zone: 0.675 (108) +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: Slice zone: 0.000 (0) +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: Blocks nothing: 0.000 (0) +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: Blocks tall: 1.000 (160) +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: Visited until tall blocking (mean): 1.000 +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: Success +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: SB spills = 100 tensors +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: size = 102400 bytes/partition +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: remats = 100 tensors +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: unpinned = 0 tensors +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: size = 0 bytes/partition +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: SB score = inf +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: Incremental select cannot allocate, start a complete select iteration +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: simplify interference graph +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: initialize safe & unsafe +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: safe = 2935 +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: unsafe = 5796 +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: inf = 2670 +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: total = 11401 +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: simplify +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 5654 #Pinned 0 #Safe 0 minCost 0.000949358 maxCost 0.756153 locations 11402 +2025-09-05T19:15:23Z USER 1724 (sg00) [ModuleForkPass]: dep_opt finished after 0.137 seconds +2025-09-05T19:15:23Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 908mb, ru_maxrss: 912mb (delta=0mb) +2025-09-05T19:15:23Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9074 memory location(s), 1 block(s), and 20185 instruction(s). Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:23Z USER 1724 (sg00) [ModuleForkPass]: Running report_stats +2025-09-05T19:15:23Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=9074 blocks=1 instructions=20185 Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:23Z INFO 1724 (sg00) [ReportStats]: Data Movement Statistics: sg0000 +┌──────────────┬────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├──────────────┼────────────────────────────┼───────┼────────────┤ +│ DMACopy │ ExternalInput -> Internal │ 33 │ 4294975488 │ +│ DMACopy │ Internal -> ExternalOutput │ 128 │ 1073741824 │ +│ DMACopy │ Internal -> Output │ 2 │ 33554432 │ +│ Load │ Const -> Internal │ 4 │ 98560 │ +│ Load │ ExternalInput -> Internal │ 209 │ 90718212 │ +│ Load │ Internal │ 373 │ 152650240 │ +│ Save │ Internal │ 97 │ 47972608 │ +│ Save │ Internal -> Output │ 77 │ 22020098 │ +│ Save (Spill) │ Internal │ 97 │ 31784960 │ +└──────────────┴────────────────────────────┴───────┴────────────┘ + +2025-09-05T19:15:23Z INFO 1724 (sg00) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 1 │ +│ 4 │ 1 │ +│ 64 │ 1 │ +│ 256 │ 7 │ +│ 512 │ 1 │ +│ 896 │ 24 │ +│ 1024 │ 71 │ +│ 1920 │ 64 │ +│ 2048 │ 199 │ +│ 4096 │ 502 │ +│ 8064 │ 7 │ +│ 8192 │ 12 │ +│ 524288 │ 128 │ +│ 8388608 │ 4 │ +└─────────────────────┴───────┘ + +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: new candidates = 591 +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: (including 1440 infinite cost tensors) +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: select ranges +2025-09-05T19:15:23Z INFO 1724 (sg00) [ReportStats]: MM Stats: #MatMults 10256 #MatMult-Transposes 3712 +2025-09-05T19:15:23Z INFO 1724 (sg00) [ReportStats]: IO Tensor size combined: 209739780 +2025-09-05T19:15:23Z INFO 1724 (sg00) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input68 │ ExternalInput │ bfloat16 │ 134217728 │ +│ input69 │ ExternalInput │ bfloat16 │ 16777216 │ +│ input73 │ ExternalInput │ bfloat16 │ 16777216 │ +│ input4 │ ExternalInput │ bfloat16 │ 8388608 │ +│ output2 │ ExternalOutput │ bfloat16 │ 8388608 │ +│ input5 │ ExternalInput │ bfloat16 │ 8388608 │ +│ output1 │ ExternalOutput │ bfloat16 │ 8388608 │ +│ input72 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input70 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input1 │ ExternalInput │ int32 │ 8192 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-09-05T19:15:23Z INFO 1724 (sg00) [ReportStats]: Large (Internal) Tensor Statistics: +┌──────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├──────────────────────────┼──────────┼──────────┼──────────────┤ +│ intermediate1 │ Output │ bfloat16 │ 16777216 │ +│ intermediate4 │ Output │ bfloat16 │ 16777216 │ +│ dot.4_i0 │ Internal │ bfloat16 │ 8388608 │ +│ _all-reduce.256-t2355_i1 │ Internal │ bfloat16 │ 8388608 │ +│ all_gather.1_i0 │ Internal │ bfloat16 │ 8388608 │ +│ _all-reduce.256-t2355_i0 │ Internal │ bfloat16 │ 8388608 │ +│ dot.4_i1 │ Internal │ bfloat16 │ 8388608 │ +│ all_gather.1_i1 │ Internal │ bfloat16 │ 8388608 │ +│ transpose.1_i1 │ Internal │ bfloat16 │ 4194304 │ +│ transpose.1_i0 │ Internal │ bfloat16 │ 4194304 │ +└──────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-09-05T19:15:23Z USER 1724 (sg00) [ModuleForkPass]: report_stats finished after 0.006 seconds +2025-09-05T19:15:23Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 906mb, ru_maxrss: 912mb (delta=0mb) +2025-09-05T19:15:23Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9074 memory location(s), 1 block(s), and 20185 instruction(s). Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: Total: 11401 +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: Spilled: 0.004 (40) +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: Allocated: 0.996 (11361) +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: Rover zone: 0.403 (4584) +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: Pre-rover zone: 0.010 (114) +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: Post-rover zone: 0.586 (6663) +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: Slice zone: 0.000 (0) +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: Blocks nothing: 0.000 (1) +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: Blocks tall: 1.000 (11360) +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: Visited until tall blocking (mean): 1.000 +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: Success +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: SB spills = 40 tensors +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: size = 77824 bytes/partition +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: remats = 0 tensors +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: unpinned = 0 tensors +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: size = 0 bytes/partition +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: SB score = 559132 +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: best SB heuristic = 0 +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: collect spills +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: insert spills +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: deleting loads #loadsToDelete: 0 +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: deleting locs #locationsToDelete: 0 +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: locationsToDelete done +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: main loop +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: renumber locations +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: size = 11562 +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: find partners +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: found 3376 accumulation groups +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: largest = _dot.6-t1177_i33 +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: tensors = 112 +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: requires 114688 bytes/partition +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: expanding partners +2025-09-05T19:15:23Z INFO 1724 []: find first defs for local +2025-09-05T19:15:23Z INFO 1724 []: find first defs for global +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: find loads +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: 1 pin count +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: 3068 remat count +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: 1 pinned tensors will require about 16384 bytes/partition +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: build interference graph +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: pass 1 int-tree +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: Num intervals 11562 Num locations 11562 +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: IntervalTree Build Done +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: info.neighbors init Done +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: info.neighbors partners Done +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: IntervalTree readback Done +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: edge: 682404 +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: mean: 118.043 +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: median: 83.2008 +2025-09-05T19:15:23Z INFO 1724 (sg01) [SB_Allocator]: find costs +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: simplify interference graph +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: initialize safe & unsafe +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: safe = 40 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: unsafe = 56 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: inf = 104 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: total = 200 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: simplify +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 56 #Pinned 0 #Safe 0 minCost 0.00550423 maxCost 0.00550423 locations 11562 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: new candidates = 56 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: (including 104 infinite cost tensors) +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: select ranges +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Total: 200 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Spilled: 0.380 (76) +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Allocated: 0.620 (124) +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Rover zone: 0.419 (52) +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Pre-rover zone: 0.000 (0) +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Post-rover zone: 0.581 (72) +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Slice zone: 0.000 (0) +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Blocks nothing: 0.000 (0) +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Blocks tall: 1.000 (124) +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Visited until tall blocking (mean): 1.000 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Success +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: SB spills = 76 tensors +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: size = 151552 bytes/partition +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: remats = 52 tensors +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: unpinned = 0 tensors +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: size = 0 bytes/partition +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: SB score = inf +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Incremental select cannot allocate, start a complete select iteration +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: simplify interference graph +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: initialize safe & unsafe +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: safe = 2975 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: unsafe = 7092 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: inf = 1494 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: total = 11561 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: simplify +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 6942 #Pinned 0 #Safe 0 minCost 0.000949358 maxCost 0.294184 locations 11562 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: new candidates = 1083 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: select ranges +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Total: 11561 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Spilled: 0.009 (101) +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Allocated: 0.991 (11460) +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Rover zone: 0.358 (4105) +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Pre-rover zone: 0.006 (66) +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Post-rover zone: 0.636 (7289) +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Slice zone: 0.000 (0) +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Blocks nothing: 0.000 (1) +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Blocks tall: 1.000 (11459) +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Visited until tall blocking (mean): 1.000 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Success +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: SB spills = 101 tensors +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: size = 102912 bytes/partition +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: remats = 100 tensors +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: unpinned = 0 tensors +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: size = 0 bytes/partition +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: SB score = 406644 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: best SB heuristic = 0 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: collect spills +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: insert spills +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: deleting loads #loadsToDelete: 0 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: deleting locs #locationsToDelete: 0 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: locationsToDelete done +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: main loop +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: renumber locations +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: size = 11790 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: find partners +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: found 3376 accumulation groups +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: largest = _dot.6-t1177_i33 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: tensors = 112 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: requires 114688 bytes/partition +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: expanding partners +2025-09-05T19:15:24Z INFO 1724 []: find first defs for local +2025-09-05T19:15:24Z INFO 1724 []: find first defs for global +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: find loads +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: 1 pin count +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: 3296 remat count +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: 1 pinned tensors will require about 16384 bytes/partition +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: build interference graph +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: pass 1 int-tree +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Num intervals 11790 Num locations 11790 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: IntervalTree Build Done +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: info.neighbors init Done +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: info.neighbors partners Done +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: IntervalTree readback Done +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: edge: 701397 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: mean: 118.982 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: median: 89.5684 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: find costs +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: simplify interference graph +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: initialize safe & unsafe +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: safe = 5 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: unsafe = 0 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: inf = 324 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: total = 329 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: simplify +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 0 #Pinned 0 #Safe 0 minCost 1.79769e+308 maxCost 2.22507e-308 locations 11790 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: new candidates = 0 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: (including 324 infinite cost tensors) +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: select ranges +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Total: 329 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Spilled: 0.170 (56) +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Allocated: 0.830 (273) +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Rover zone: 0.593 (162) +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Pre-rover zone: 0.040 (11) +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Post-rover zone: 0.366 (100) +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Slice zone: 0.000 (0) +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Blocks nothing: 0.000 (0) +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Blocks tall: 1.000 (273) +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Visited until tall blocking (mean): 1.000 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Success +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: SB spills = 56 tensors +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: size = 57344 bytes/partition +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: remats = 56 tensors +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: unpinned = 0 tensors +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: size = 0 bytes/partition +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: SB score = inf +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Incremental select cannot allocate, start a complete select iteration +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: simplify interference graph +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: initialize safe & unsafe +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: safe = 2984 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: unsafe = 6991 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: inf = 1814 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: total = 11789 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: simplify +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 6789 #Pinned 0 #Safe 0 minCost 0.000949358 maxCost 0.294184 locations 11790 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: new candidates = 1020 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: select ranges +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Total: 11789 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Spilled: 0.003 (40) +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Allocated: 0.997 (11749) +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Rover zone: 0.372 (4365) +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Pre-rover zone: 0.005 (62) +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Post-rover zone: 0.623 (7322) +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Slice zone: 0.000 (0) +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Blocks nothing: 0.000 (1) +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Blocks tall: 1.000 (11748) +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Visited until tall blocking (mean): 1.000 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: Success +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: SB spills = 40 tensors +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: size = 40960 bytes/partition +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: remats = 40 tensors +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: unpinned = 0 tensors +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: size = 0 bytes/partition +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: SB score = 195910 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: best SB heuristic = 0 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: collect spills +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: insert spills +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: deleting loads #loadsToDelete: 0 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: deleting locs #locationsToDelete: 0 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: locationsToDelete done +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: main loop +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: renumber locations +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: size = 11900 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: find partners +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: found 3376 accumulation groups +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: largest = _dot.6-t1177_i33 +2025-09-05T19:15:24Z INFO 1724 (sg01) [SB_Allocator]: tensors = 112 +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: requires 114688 bytes/partition +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: expanding partners +2025-09-05T19:15:25Z INFO 1724 []: find first defs for local +2025-09-05T19:15:25Z INFO 1724 []: find first defs for global +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: find loads +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: 1 pin count +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: 3406 remat count +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: 1 pinned tensors will require about 16384 bytes/partition +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: build interference graph +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: pass 1 int-tree +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: Num intervals 11900 Num locations 11900 +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: IntervalTree Build Done +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: info.neighbors init Done +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: info.neighbors partners Done +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: IntervalTree readback Done +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: edge: 706689 +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: mean: 118.771 +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: median: 90.2331 +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: find costs +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: simplify interference graph +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: initialize safe & unsafe +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: safe = 0 +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: unsafe = 0 +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: inf = 150 +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: total = 150 +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: simplify +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 0 #Pinned 0 #Safe 0 minCost 1.79769e+308 maxCost 2.22507e-308 locations 11900 +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: new candidates = 0 +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: (including 150 infinite cost tensors) +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: select ranges +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: Total: 150 +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: Spilled: 0.000 (0) +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: Allocated: 1.000 (150) +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: Rover zone: 0.787 (118) +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: Pre-rover zone: 0.047 (7) +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: Post-rover zone: 0.167 (25) +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: Slice zone: 0.000 (0) +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: Blocks nothing: 0.000 (0) +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: Blocks tall: 1.000 (150) +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: Visited until tall blocking (mean): 1.000 +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: Success +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: SB spills = 0 tensors +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: size = 0 bytes/partition +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: remats = 0 tensors +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: unpinned = 0 tensors +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: size = 0 bytes/partition +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: SB score = 0 +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: spilling from SB cost about 6.98625e+06 cycles +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: number of tensors spilled from SB = 866 +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: total size of spilled tensors = 949888 bytes/partition +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: 16384 bytes/partition (100%) successfully pinned +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: pinning saved approximately 9010 cycles +2025-09-05T19:15:25Z INFO 1724 (sg01) [SB_Allocator]: 0% SB utilization after allocation +2025-09-05T19:15:25Z INFO 1724 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 1264852996 +2025-09-05T19:15:25Z INFO 1724 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 1363 bytes +2025-09-05T19:15:25Z INFO 1724 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 267321346 +2025-09-05T19:15:25Z INFO 1724 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 1972 bytes +2025-09-05T19:15:25Z INFO 1724 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 4259840 +2025-09-05T19:15:25Z INFO 1724 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 130 bytes +2025-09-05T19:15:25Z USER 1724 (sg01) [ModuleForkPass]: coloring_allocator_sb finished after 6.235 seconds +2025-09-05T19:15:25Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 916mb, ru_maxrss: 916mb (delta=91mb) +2025-09-05T19:15:25Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16105 memory location(s), 1 block(s), and 53947 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:25Z USER 1724 (sg01) [ModuleForkPass]: Running address_rotation_sb +2025-09-05T19:15:25Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=16105 blocks=1 instructions=53947 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:25Z INFO 1724 (sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-09-05T19:15:25Z USER 1724 (sg01) [ModuleForkPass]: address_rotation_sb finished after 0.083 seconds +2025-09-05T19:15:25Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 885mb, ru_maxrss: 916mb (delta=0mb) +2025-09-05T19:15:25Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16105 memory location(s), 1 block(s), and 53947 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:25Z USER 1724 (sg01) [ModuleForkPass]: Running dma_optimization_sb +2025-09-05T19:15:25Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=16105 blocks=1 instructions=53947 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:25Z INFO 1724 (sg01) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 1532174342, 46.8543% input load, 1.09499% output write, 52.0507% spill/reload [sg0001] +2025-09-05T19:15:25Z INFO 1724 (sg01) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-09-05T19:15:25Z INFO 1724 (sg01) [DMAOptimizationBase]: removed 0 identical load +2025-09-05T19:15:25Z INFO 1724 (sg01) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-09-05T19:15:25Z INFO 1724 (sg01) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-09-05T19:15:25Z INFO 1724 (sg01) [DMAOptimizationBase]: sub-graph will get execute 31 times +2025-09-05T19:15:25Z INFO 1724 (sg01) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-09-05T19:15:25Z INFO 1724 (sg01) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-09-05T19:15:25Z INFO 1724 (sg01) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 8192, 0.000534665% out of total dma traffic(7.1789e+08) +2025-09-05T19:15:25Z INFO 1724 (sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 828 spill/reload instructions +2025-09-05T19:15:25Z INFO 1724 (sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 729 spill/reload memory locations +2025-09-05T19:15:25Z INFO 1724 (sg01) [DMAOptimizationBase]: [spill optimization round 1]: removed 21 spill/reload instructions +2025-09-05T19:15:25Z INFO 1724 (sg01) [DMAOptimizationBase]: [spill optimization round 1]: removed 16 spill/reload memory locations +2025-09-05T19:15:25Z INFO 1724 (sg01) [DMAOptimizationBase]: [spill optimization round 2]: removed 2 spill/reload instructions +2025-09-05T19:15:25Z INFO 1724 (sg01) [DMAOptimizationBase]: [spill optimization round 2]: removed 1 spill/reload memory locations +2025-09-05T19:15:25Z INFO 1724 (sg01) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 204210176, 25.606% out of total spill/reload dma traffic +2025-09-05T19:15:25Z INFO 1724 (sg01) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-09-05T19:15:25Z INFO 1724 (sg01) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-09-05T19:15:25Z INFO 1724 (sg01) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-09-05T19:15:26Z INFO 1724 (sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-09-05T19:15:26Z INFO 1724 (sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-09-05T19:15:26Z INFO 1724 (sg01) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-09-05T19:15:26Z INFO 1724 (sg01) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-09-05T19:15:26Z INFO 1724 (sg01) [DMAOptimizationBase]: [remove_memset_spill]: removed 3 spill/reload instructions +2025-09-05T19:15:26Z INFO 1724 (sg01) [DMAOptimizationBase]: [remove_memset_spill]: removed 1 spill/reload memory locations +2025-09-05T19:15:26Z INFO 1724 (sg01) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-09-05T19:15:26Z INFO 1724 (sg01) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 1092 SpillSaves and Reloads +2025-09-05T19:15:26Z INFO 1724 (sg01) [DMAOptimizationBase]: average loaded DMA size 1465 bytes +2025-09-05T19:15:26Z INFO 1724 (sg01) [DMAOptimizationBase]: average saved DMA size 2617 bytes +2025-09-05T19:15:26Z INFO 1724 (sg01) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 1 combined 472 SpillSaves and Reloads +2025-09-05T19:15:26Z INFO 1724 (sg01) [DMAOptimizationBase]: average loaded DMA size 1523 bytes +2025-09-05T19:15:26Z INFO 1724 (sg01) [DMAOptimizationBase]: average saved DMA size 3233 bytes +2025-09-05T19:15:26Z INFO 1724 (sg01) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 2 combined 4 SpillSaves and Reloads +2025-09-05T19:15:26Z INFO 1724 (sg01) [DMAOptimizationBase]: average loaded DMA size 1523 bytes +2025-09-05T19:15:26Z INFO 1724 (sg01) [DMAOptimizationBase]: average saved DMA size 3239 bytes +2025-09-05T19:15:26Z INFO 1724 (sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 1120403460 +2025-09-05T19:15:26Z INFO 1724 (sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 1523 bytes +2025-09-05T19:15:26Z INFO 1724 (sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 207355906 +2025-09-05T19:15:26Z INFO 1724 (sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 3239 bytes +2025-09-05T19:15:26Z INFO 1724 (sg01) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-09-05T19:15:26Z INFO 1724 (sg01) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 38207488, 4.79086% out of total spill/reload dma traffic +2025-09-05T19:15:26Z INFO 1724 (sg01) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 242425856, 15.8223% out of total dma traffic +2025-09-05T19:15:26Z INFO 1724 (sg01) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 1289748486, 55.6606% input load, 1.30081% output write, 43.0386% spill/reload [sg0001] +2025-09-05T19:15:26Z INFO 1724 (sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 1098776580 +2025-09-05T19:15:26Z INFO 1724 (sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 1502 bytes +2025-09-05T19:15:26Z INFO 1724 (sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 190971906 +2025-09-05T19:15:26Z INFO 1724 (sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 3147 bytes +2025-09-05T19:15:26Z INFO 1724 (sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 4259840 +2025-09-05T19:15:26Z INFO 1724 (sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 130 bytes +2025-09-05T19:15:26Z INFO 1724 (sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 1568 bytes +2025-09-05T19:15:26Z INFO 1724 (sg01) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-09-05T19:15:26Z INFO 1724 (sg01) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-09-05T19:15:26Z USER 1724 (sg01) [ModuleForkPass]: dma_optimization_sb finished after 1.097 seconds +2025-09-05T19:15:26Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 898mb, ru_maxrss: 916mb (delta=0mb) +2025-09-05T19:15:26Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13756 memory location(s), 1 block(s), and 51886 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:26Z USER 1724 (sg01) [ModuleForkPass]: Running address_rotation_sb +2025-09-05T19:15:26Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=13756 blocks=1 instructions=51886 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:26Z INFO 1724 (sg01) [DMAOptimizationBase]: SB Rotation rotated 149 Sb address +2025-09-05T19:15:26Z INFO 1724 (sg01) [DMAOptimizationBase]: SB Rotation rotated 737 Sb address +2025-09-05T19:15:26Z INFO 1724 (sg01) [DMAOptimizationBase]: SB Rotation rotated 169 Sb address +2025-09-05T19:15:26Z INFO 1724 (sg01) [DMAOptimizationBase]: SB Rotation rotated 39 Sb address +2025-09-05T19:15:26Z INFO 1724 (sg01) [DMAOptimizationBase]: SB Rotation rotated 1297 Sb address +2025-09-05T19:15:26Z INFO 1724 (sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-09-05T19:15:26Z USER 1724 (sg01) [ModuleForkPass]: address_rotation_sb finished after 0.374 seconds +2025-09-05T19:15:26Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 890mb, ru_maxrss: 916mb (delta=0mb) +2025-09-05T19:15:26Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13756 memory location(s), 1 block(s), and 51886 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:26Z USER 1724 (sg01) [ModuleForkPass]: Running coloring_allocator_dram +2025-09-05T19:15:26Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=13756 blocks=1 instructions=51886 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:26Z INFO 1724 (sg01) [ColoringAllocator::Rep]: Allocating functions +2025-09-05T19:15:26Z INFO 1724 (sg01) [ColoringAllocator::Rep]: linearize and check +2025-09-05T19:15:26Z INFO 1724 (sg01) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-09-05T19:15:26Z INFO 1724 (sg01) [DRAM_Allocator]: reserved space = 324124680 bytes +2025-09-05T19:15:26Z INFO 1724 (sg01) [DRAM_Allocator]: spill space = 215744512 bytes +2025-09-05T19:15:26Z INFO 1724 (sg01) [DRAM_Allocator]: aligned spill space = 215744512 bytes +2025-09-05T19:15:26Z INFO 1724 (sg01) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-09-05T19:15:26Z INFO 1724 (sg01) [DRAM_Allocator]: renumber locations +2025-09-05T19:15:26Z INFO 1724 (sg01) [DRAM_Allocator]: size = 324 +2025-09-05T19:15:26Z INFO 1724 []: find first defs for local +2025-09-05T19:15:26Z INFO 1724 []: find first defs for global +2025-09-05T19:15:27Z INFO 1724 (sg01) [DRAM_Allocator]: Num intervals 324 Num locations 324 +2025-09-05T19:15:27Z INFO 1724 (sg01) [DRAM_Allocator]: IntervalTree Build Done +2025-09-05T19:15:27Z INFO 1724 (sg01) [DRAM_Allocator]: info.neighbors init Done +2025-09-05T19:15:27Z INFO 1724 (sg01) [DRAM_Allocator]: IntervalTree readback Done +2025-09-05T19:15:27Z INFO 1724 (sg01) [DRAM_Allocator]: simplify interference graph +2025-09-05T19:15:27Z INFO 1724 (sg01) [DRAM_Allocator]: initialize low and high +2025-09-05T19:15:27Z INFO 1724 (sg01) [DRAM_Allocator]: lo = 324 +2025-09-05T19:15:27Z INFO 1724 (sg01) [DRAM_Allocator]: hi = 0 +2025-09-05T19:15:27Z INFO 1724 (sg01) [DRAM_Allocator]: total = 324 +2025-09-05T19:15:27Z INFO 1724 (sg01) [DRAM_Allocator]: simplify +2025-09-05T19:15:27Z INFO 1724 (sg01) [DRAM_Allocator]: new candidates = 0 +2025-09-05T19:15:27Z INFO 1724 (sg01) [DRAM_Allocator]: select ranges +2025-09-05T19:15:27Z INFO 1724 (sg01) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-09-05T19:15:27Z INFO 1724 (sg01) [DRAM_Allocator]: allreduce_dram_hwm 67108864 +2025-09-05T19:15:27Z INFO 1724 (sg01) [DRAM_Allocator]: Real CC buffer size 67108864 +2025-09-05T19:15:27Z INFO 1724 (sg01) [DRAM_Allocator]: DRAM hwm after allocation: 134610944 +2025-09-05T19:15:27Z INFO 1724 (sg01) [DRAM_Allocator]: DRAM allocation successful +2025-09-05T19:15:27Z USER 1724 (sg01) [ModuleForkPass]: coloring_allocator_dram finished after 0.165 seconds +2025-09-05T19:15:27Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 892mb, ru_maxrss: 916mb (delta=0mb) +2025-09-05T19:15:27Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13756 memory location(s), 1 block(s), and 51886 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:27Z USER 1724 (sg01) [ModuleForkPass]: Running address_rotation_dram +2025-09-05T19:15:27Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=13756 blocks=1 instructions=51886 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:27Z INFO 1724 (sg01) [DMAOptimizationBase]: Runtime page size at 512MB +2025-09-05T19:15:27Z INFO 1724 (sg01) [DMAOptimizationBase]: DRAM hwm before rotation 134610944 +2025-09-05T19:15:27Z INFO 1724 (sg01) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-09-05T19:15:27Z INFO 1724 (sg01) [DMAOptimizationBase]: allreduce hwm 67108864 +2025-09-05T19:15:27Z INFO 1724 (sg01) [DMAOptimizationBase]: Real CC buffer size 67108864 +2025-09-05T19:15:27Z INFO 1724 (sg01) [DMAOptimizationBase]: DRAM hwm after rotation 134610944 +2025-09-05T19:15:27Z INFO 1724 (sg01) [DMAOptimizationBase]: DRAM Rotation rotated 20 Dram address +2025-09-05T19:15:27Z USER 1724 (sg01) [ModuleForkPass]: address_rotation_dram finished after 0.048 seconds +2025-09-05T19:15:27Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 890mb, ru_maxrss: 916mb (delta=0mb) +2025-09-05T19:15:27Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13756 memory location(s), 1 block(s), and 51886 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:27Z USER 1724 (sg01) [ModuleForkPass]: Running tensorcopy_accel +2025-09-05T19:15:27Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=13756 blocks=1 instructions=51886 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:27Z INFO 1724 (sg01) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-09-05T19:15:27Z INFO 1724 (sg01) [TensorCopyAccel::Impl]: Accelerated 64 out of 2202 tensorcopy in Function: sg0001 average acceleration factor: 1 +2025-09-05T19:15:27Z USER 1724 (sg01) [ModuleForkPass]: tensorcopy_accel finished after 0.005 seconds +2025-09-05T19:15:27Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 890mb, ru_maxrss: 916mb (delta=0mb) +2025-09-05T19:15:27Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13756 memory location(s), 1 block(s), and 51886 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:27Z USER 1724 (sg01) [ModuleForkPass]: Running peephole_opts +2025-09-05T19:15:27Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=13756 blocks=1 instructions=51886 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:27Z INFO 1724 (sg01) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-09-05T19:15:27Z USER 1724 (sg01) [ModuleForkPass]: peephole_opts finished after 0.031 seconds +2025-09-05T19:15:27Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 890mb, ru_maxrss: 916mb (delta=0mb) +2025-09-05T19:15:27Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13756 memory location(s), 1 block(s), and 52911 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:27Z USER 1724 (sg01) [ModuleForkPass]: Running lower_kernel +2025-09-05T19:15:27Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=13756 blocks=1 instructions=52911 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:27Z INFO 1724 (sg01) [LowerKernel]: Started running LowerKernel +2025-09-05T19:15:27Z INFO 1724 (sg01) [LowerKernel]: Start of kernel lowering pass, number of insts: 52911, number of allocs: 13756 +2025-09-05T19:15:27Z INFO 1724 (sg01) [LowerKernel]: Scan BKs time (s): 0.003436 +2025-09-05T19:15:27Z INFO 1724 (sg01) [LowerKernel]: Lower BKs time (s): 3e-06 +2025-09-05T19:15:27Z USER 1724 (sg01) [ModuleForkPass]: lower_kernel finished after 0.005 seconds +2025-09-05T19:15:27Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 890mb, ru_maxrss: 916mb (delta=0mb) +2025-09-05T19:15:27Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13756 memory location(s), 1 block(s), and 52911 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:27Z USER 1724 (sg01) [ModuleForkPass]: Running lower_nki_kernel +2025-09-05T19:15:27Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=13756 blocks=1 instructions=52911 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:27Z USER 1724 (sg01) [ModuleForkPass]: lower_nki_kernel finished after 0.004 seconds +2025-09-05T19:15:27Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 890mb, ru_maxrss: 916mb (delta=0mb) +2025-09-05T19:15:27Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13756 memory location(s), 1 block(s), and 52911 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:27Z USER 1724 (sg01) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-09-05T19:15:27Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=13756 blocks=1 instructions=52911 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:27Z USER 1724 (sg01) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.008 seconds +2025-09-05T19:15:27Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 891mb, ru_maxrss: 916mb (delta=0mb) +2025-09-05T19:15:27Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13756 memory location(s), 1 block(s), and 52911 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:27Z USER 1724 (sg01) [ModuleForkPass]: Running birverifier +2025-09-05T19:15:27Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=13756 blocks=1 instructions=52911 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:27Z USER 1724 (sg01) [ModuleForkPass]: birverifier finished after 0.065 seconds +2025-09-05T19:15:27Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 891mb, ru_maxrss: 916mb (delta=0mb) +2025-09-05T19:15:27Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13756 memory location(s), 1 block(s), and 52911 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:27Z USER 1724 (sg01) [ModuleForkPass]: Running dynamic_dma_scan +2025-09-05T19:15:27Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=13756 blocks=1 instructions=52911 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:27Z USER 1724 (sg01) [ModuleForkPass]: dynamic_dma_scan finished after 0.007 seconds +2025-09-05T19:15:27Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 890mb, ru_maxrss: 916mb (delta=0mb) +2025-09-05T19:15:27Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13756 memory location(s), 1 block(s), and 52911 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:27Z USER 1724 (sg01) [ModuleForkPass]: Running build_fdeps +2025-09-05T19:15:27Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=13756 blocks=1 instructions=52911 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:27Z INFO 1724 (sg01) [build_flow_deps]: Start build fdeps. Invocation: 6Fri Sep 5 19:15:27 2025 +2025-09-05T19:15:27Z INFO 1724 (sg01) [build_flow_deps]: Allocs: 13756 instructions: 52911 +2025-09-05T19:15:27Z INFO 1724 (sg01) [build_flow_deps]: Build fdeps inserted 157074 edges +2025-09-05T19:15:27Z INFO 1724 (sg01) [build_flow_deps]: Done build fdeps 157074 Fri Sep 5 19:15:27 2025 +2025-09-05T19:15:27Z USER 1724 (sg01) [ModuleForkPass]: build_fdeps finished after 0.215 seconds +2025-09-05T19:15:27Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 905mb, ru_maxrss: 916mb (delta=0mb) +2025-09-05T19:15:27Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13756 memory location(s), 1 block(s), and 52911 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:27Z USER 1724 (sg01) [ModuleForkPass]: Running remove_redundancies +2025-09-05T19:15:27Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=13756 blocks=1 instructions=52911 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:27Z INFO 1724 (sg01) [RemoveRedundancies]: remove_clobbered_writes +2025-09-05T19:15:27Z INFO 1724 (sg01) [RemoveRedundancies]: remove_clobbered_writes: 52 +2025-09-05T19:15:27Z INFO 1724 (sg01) [RemoveRedundancies]: remove_useless_insts +2025-09-05T19:15:27Z INFO 1724 (sg01) [RemoveRedundancies]: remove Useless Instructions: 0 +2025-09-05T19:15:27Z USER 1724 (sg01) [ModuleForkPass]: remove_redundancies finished after 0.021 seconds +2025-09-05T19:15:27Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 905mb, ru_maxrss: 916mb (delta=0mb) +2025-09-05T19:15:27Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13756 memory location(s), 1 block(s), and 52859 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:27Z USER 1724 (sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-09-05T19:15:27Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=13756 blocks=1 instructions=52859 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:27Z INFO 1724 (sg01) [AntiDependencyAnalyzer]: Batch size: 1000 +2025-09-05T19:15:27Z INFO 1724 (sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-09-05T19:15:27Z INFO 1724 (sg01) [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-09-05T19:15:27Z USER 1724 (sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.439 seconds +2025-09-05T19:15:27Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 945mb, ru_maxrss: 945mb (delta=29mb) +2025-09-05T19:15:27Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13756 memory location(s), 1 block(s), and 52859 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:27Z USER 1724 (sg01) [ModuleForkPass]: Running tensor_copy_elim +2025-09-05T19:15:27Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=13756 blocks=1 instructions=52859 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:28Z INFO 1724 (sg01) [TensorCopyElim]: Tensor CP elimination: 0 +2025-09-05T19:15:28Z INFO 1724 (sg01) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-09-05T19:15:28Z USER 1724 (sg01) [ModuleForkPass]: tensor_copy_elim finished after 0.101 seconds +2025-09-05T19:15:28Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 929mb, ru_maxrss: 945mb (delta=0mb) +2025-09-05T19:15:28Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13756 memory location(s), 1 block(s), and 52859 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:28Z USER 1724 (sg01) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-09-05T19:15:28Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=13756 blocks=1 instructions=52859 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:28Z USER 1724 (sg01) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.001 seconds +2025-09-05T19:15:28Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 927mb, ru_maxrss: 945mb (delta=0mb) +2025-09-05T19:15:28Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13756 memory location(s), 1 block(s), and 52859 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:28Z USER 1724 (sg01) [ModuleForkPass]: Running post_sched +2025-09-05T19:15:28Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=13756 blocks=1 instructions=52859 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:28Z INFO 1724 [post_scheduler]: Start PosT ScheD 3 sunda Fri Sep 5 19:15:28 2025 +2025-09-05T19:15:28Z INFO 1724 (sg02) [SB_Allocator]: SB spills = 0 tensors +2025-09-05T19:15:28Z INFO 1724 (sg02) [SB_Allocator]: size = 0 bytes/partition +2025-09-05T19:15:28Z INFO 1724 (sg02) [SB_Allocator]: remats = 0 tensors +2025-09-05T19:15:28Z INFO 1724 (sg02) [SB_Allocator]: unpinned = 0 tensors +2025-09-05T19:15:28Z INFO 1724 (sg02) [SB_Allocator]: size = 0 bytes/partition +2025-09-05T19:15:28Z INFO 1724 (sg02) [SB_Allocator]: SB score = 0 +2025-09-05T19:15:28Z INFO 1724 (sg02) [SB_Allocator]: spilling from SB cost about 4.07092e+06 cycles +2025-09-05T19:15:28Z INFO 1724 (sg02) [SB_Allocator]: number of tensors spilled from SB = 616 +2025-09-05T19:15:28Z INFO 1724 (sg02) [SB_Allocator]: total size of spilled tensors = 593152 bytes/partition +2025-09-05T19:15:28Z INFO 1724 (sg02) [SB_Allocator]: 16384 bytes/partition (100%) successfully pinned +2025-09-05T19:15:28Z INFO 1724 (sg02) [SB_Allocator]: pinning saved approximately 9010 cycles +2025-09-05T19:15:28Z INFO 1724 (sg02) [SB_Allocator]: 0% SB utilization after allocation +2025-09-05T19:15:28Z INFO 1724 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 1028178972 +2025-09-05T19:15:28Z INFO 1724 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 1272 bytes +2025-09-05T19:15:28Z INFO 1724 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 187151368 +2025-09-05T19:15:28Z INFO 1724 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 2130 bytes +2025-09-05T19:15:28Z INFO 1724 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 8196 +2025-09-05T19:15:28Z INFO 1724 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 248 bytes +2025-09-05T19:15:28Z USER 1724 (sg02) [ModuleForkPass]: coloring_allocator_sb finished after 9.795 seconds +2025-09-05T19:15:28Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 947mb, ru_maxrss: 947mb (delta=122mb) +2025-09-05T19:15:28Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 7191 memory location(s), 1 block(s), and 37133 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:28Z USER 1724 (sg02) [ModuleForkPass]: Running address_rotation_sb +2025-09-05T19:15:28Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=7191 blocks=1 instructions=37133 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:28Z INFO 1724 (sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-09-05T19:15:28Z USER 1724 (sg02) [ModuleForkPass]: address_rotation_sb finished after 0.043 seconds +2025-09-05T19:15:28Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 930mb, ru_maxrss: 947mb (delta=0mb) +2025-09-05T19:15:28Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 7191 memory location(s), 1 block(s), and 37133 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:28Z USER 1724 (sg02) [ModuleForkPass]: Running dma_optimization_sb +2025-09-05T19:15:28Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=7191 blocks=1 instructions=37133 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:28Z INFO 1724 (sg02) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 1215330340, 59.3656% input load, 3.29129e-07% output write, 40.6344% spill/reload [sg0002] +2025-09-05T19:15:28Z INFO 1724 (sg02) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-09-05T19:15:28Z INFO 1724 (sg02) [DMAOptimizationBase]: removed 0 identical load +2025-09-05T19:15:28Z INFO 1724 (sg02) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-09-05T19:15:28Z INFO 1724 (sg02) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-09-05T19:15:28Z INFO 1724 (sg02) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-09-05T19:15:28Z INFO 1724 (sg02) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-09-05T19:15:28Z INFO 1724 (sg02) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-09-05T19:15:28Z INFO 1724 (sg02) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 4096, 0.000337028% out of total dma traffic(7.21488e+08) +2025-09-05T19:15:28Z INFO 1724 [post_scheduler]: Time-aware hwm post-sched +2025-09-05T19:15:28Z INFO 1724 (sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 563 spill/reload instructions +2025-09-05T19:15:28Z INFO 1724 (sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 500 spill/reload memory locations +2025-09-05T19:15:28Z INFO 1724 (sg02) [DMAOptimizationBase]: [spill optimization round 1]: removed 36 spill/reload instructions +2025-09-05T19:15:28Z INFO 1724 (sg02) [DMAOptimizationBase]: [spill optimization round 1]: removed 24 spill/reload memory locations +2025-09-05T19:15:28Z INFO 1724 (sg02) [DMAOptimizationBase]: [spill optimization round 2]: removed 0 spill/reload instructions +2025-09-05T19:15:28Z INFO 1724 (sg02) [DMAOptimizationBase]: [spill optimization round 2]: removed 0 spill/reload memory locations +2025-09-05T19:15:28Z INFO 1724 (sg02) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 140183552, 28.3863% out of total spill/reload dma traffic +2025-09-05T19:15:28Z INFO 1724 (sg02) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-09-05T19:15:28Z INFO 1724 (sg02) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-09-05T19:15:28Z INFO 1724 (sg02) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-09-05T19:15:28Z INFO 1724 (sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-09-05T19:15:28Z INFO 1724 (sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-09-05T19:15:28Z INFO 1724 (sg02) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-09-05T19:15:28Z INFO 1724 (sg02) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-09-05T19:15:28Z INFO 1724 (sg02) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-09-05T19:15:28Z INFO 1724 (sg02) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-09-05T19:15:28Z INFO 1724 (sg02) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-09-05T19:15:29Z INFO 1724 (sg02) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 728 SpillSaves and Reloads +2025-09-05T19:15:29Z INFO 1724 (sg02) [DMAOptimizationBase]: average loaded DMA size 1346 bytes +2025-09-05T19:15:29Z INFO 1724 (sg02) [DMAOptimizationBase]: average saved DMA size 2981 bytes +2025-09-05T19:15:29Z INFO 1724 (sg02) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 1 combined 328 SpillSaves and Reloads +2025-09-05T19:15:29Z INFO 1724 (sg02) [DMAOptimizationBase]: average loaded DMA size 1391 bytes +2025-09-05T19:15:29Z INFO 1724 (sg02) [DMAOptimizationBase]: average saved DMA size 3844 bytes +2025-09-05T19:15:29Z INFO 1724 (sg02) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 2 combined 0 SpillSaves and Reloads +2025-09-05T19:15:29Z INFO 1724 (sg02) [DMAOptimizationBase]: average loaded DMA size 1391 bytes +2025-09-05T19:15:29Z INFO 1724 (sg02) [DMAOptimizationBase]: average saved DMA size 3844 bytes +2025-09-05T19:15:29Z INFO 1724 (sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 935702556 +2025-09-05T19:15:29Z INFO 1724 (sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 1391 bytes +2025-09-05T19:15:29Z INFO 1724 (sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 139440136 +2025-09-05T19:15:29Z INFO 1724 (sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 3844 bytes +2025-09-05T19:15:29Z INFO 1724 (sg02) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-09-05T19:15:29Z INFO 1724 (sg02) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 25165824, 5.09592% out of total spill/reload dma traffic +2025-09-05T19:15:29Z INFO 1724 (sg02) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 165353472, 13.6056% out of total dma traffic +2025-09-05T19:15:29Z INFO 1724 (sg02) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 1049976868, 68.7143% input load, 3.80961e-07% output write, 31.2857% spill/reload [sg0002] +2025-09-05T19:15:29Z INFO 1724 (sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 923119644 +2025-09-05T19:15:29Z INFO 1724 (sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 1372 bytes +2025-09-05T19:15:29Z INFO 1724 (sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 126857224 +2025-09-05T19:15:29Z INFO 1724 (sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 3497 bytes +2025-09-05T19:15:29Z INFO 1724 (sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 8196 +2025-09-05T19:15:29Z INFO 1724 (sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 248 bytes +2025-09-05T19:15:29Z INFO 1724 (sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 1480 bytes +2025-09-05T19:15:29Z INFO 1724 (sg02) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-09-05T19:15:29Z INFO 1724 (sg02) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-09-05T19:15:29Z USER 1724 (sg02) [ModuleForkPass]: dma_optimization_sb finished after 0.634 seconds +2025-09-05T19:15:29Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 944mb, ru_maxrss: 947mb (delta=0mb) +2025-09-05T19:15:29Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5538 memory location(s), 1 block(s), and 35670 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:29Z USER 1724 (sg02) [ModuleForkPass]: Running address_rotation_sb +2025-09-05T19:15:29Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=5538 blocks=1 instructions=35670 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:29Z INFO 1724 (sg02) [DMAOptimizationBase]: SB Rotation rotated 114 Sb address +2025-09-05T19:15:29Z INFO 1724 (sg02) [DMAOptimizationBase]: SB Rotation rotated 609 Sb address +2025-09-05T19:15:29Z INFO 1724 (sg02) [DMAOptimizationBase]: SB Rotation rotated 144 Sb address +2025-09-05T19:15:29Z INFO 1724 [post_scheduler]: Time-aware simulation time: 335709199 +2025-09-05T19:15:29Z INFO 1724 (sg02) [DMAOptimizationBase]: SB Rotation rotated 39 Sb address +2025-09-05T19:15:29Z INFO 1724 (sg02) [DMAOptimizationBase]: SB Rotation rotated 120 Sb address +2025-09-05T19:15:29Z INFO 1724 (sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-09-05T19:15:29Z USER 1724 (sg02) [ModuleForkPass]: address_rotation_sb finished after 0.226 seconds +2025-09-05T19:15:29Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 940mb, ru_maxrss: 947mb (delta=0mb) +2025-09-05T19:15:29Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5538 memory location(s), 1 block(s), and 35670 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:29Z USER 1724 (sg02) [ModuleForkPass]: Running coloring_allocator_dram +2025-09-05T19:15:29Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=5538 blocks=1 instructions=35670 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:29Z INFO 1724 (sg02) [ColoringAllocator::Rep]: Allocating functions +2025-09-05T19:15:29Z INFO 1724 (sg02) [ColoringAllocator::Rep]: linearize and check +2025-09-05T19:15:29Z INFO 1724 (sg02) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-09-05T19:15:29Z INFO 1724 (sg02) [DRAM_Allocator]: reserved space = 343992346 bytes +2025-09-05T19:15:29Z INFO 1724 (sg02) [DRAM_Allocator]: spill space = 155697156 bytes +2025-09-05T19:15:29Z INFO 1724 (sg02) [DRAM_Allocator]: aligned spill space = 155738112 bytes +2025-09-05T19:15:29Z INFO 1724 (sg02) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-09-05T19:15:29Z INFO 1724 (sg02) [DRAM_Allocator]: renumber locations +2025-09-05T19:15:29Z INFO 1724 (sg02) [DRAM_Allocator]: size = 205 +2025-09-05T19:15:29Z INFO 1724 []: find first defs for local +2025-09-05T19:15:29Z INFO 1724 [post_scheduler]: Done PosT ScheD Fri Sep 5 19:15:29 2025 +2025-09-05T19:15:29Z USER 1724 (sg01) [ModuleForkPass]: post_sched finished after 1.316 seconds +2025-09-05T19:15:29Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 932mb, ru_maxrss: 947mb (delta=2mb) +2025-09-05T19:15:29Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13756 memory location(s), 1 block(s), and 52859 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:29Z USER 1724 (sg01) [ModuleForkPass]: Running expand_scheduling_units +2025-09-05T19:15:29Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=13756 blocks=1 instructions=52859 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:29Z INFO 1724 []: find first defs for global +2025-09-05T19:15:29Z USER 1724 (sg01) [ModuleForkPass]: expand_scheduling_units finished after 0.007 seconds +2025-09-05T19:15:29Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 922mb, ru_maxrss: 947mb (delta=0mb) +2025-09-05T19:15:29Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13756 memory location(s), 1 block(s), and 52859 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:29Z USER 1724 (sg01) [ModuleForkPass]: Running address_rotation_sb +2025-09-05T19:15:29Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=13756 blocks=1 instructions=52859 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:29Z INFO 1724 (sg02) [DRAM_Allocator]: Num intervals 205 Num locations 205 +2025-09-05T19:15:29Z INFO 1724 (sg02) [DRAM_Allocator]: IntervalTree Build Done +2025-09-05T19:15:29Z INFO 1724 (sg02) [DRAM_Allocator]: info.neighbors init Done +2025-09-05T19:15:29Z INFO 1724 (sg02) [DRAM_Allocator]: IntervalTree readback Done +2025-09-05T19:15:29Z INFO 1724 (sg02) [DRAM_Allocator]: simplify interference graph +2025-09-05T19:15:29Z INFO 1724 (sg02) [DRAM_Allocator]: initialize low and high +2025-09-05T19:15:29Z INFO 1724 (sg02) [DRAM_Allocator]: lo = 205 +2025-09-05T19:15:29Z INFO 1724 (sg02) [DRAM_Allocator]: hi = 0 +2025-09-05T19:15:29Z INFO 1724 (sg02) [DRAM_Allocator]: total = 205 +2025-09-05T19:15:29Z INFO 1724 (sg02) [DRAM_Allocator]: simplify +2025-09-05T19:15:29Z INFO 1724 (sg02) [DRAM_Allocator]: new candidates = 0 +2025-09-05T19:15:29Z INFO 1724 (sg02) [DRAM_Allocator]: select ranges +2025-09-05T19:15:29Z INFO 1724 (sg02) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-09-05T19:15:29Z INFO 1724 (sg02) [DRAM_Allocator]: allreduce_dram_hwm 33570816 +2025-09-05T19:15:29Z INFO 1724 (sg02) [DRAM_Allocator]: Real CC buffer size 33570816 +2025-09-05T19:15:29Z INFO 1724 (sg02) [DRAM_Allocator]: DRAM hwm after allocation: 104775680 +2025-09-05T19:15:29Z INFO 1724 (sg02) [DRAM_Allocator]: DRAM allocation successful +2025-09-05T19:15:29Z USER 1724 (sg02) [ModuleForkPass]: coloring_allocator_dram finished after 0.063 seconds +2025-09-05T19:15:29Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 923mb, ru_maxrss: 947mb (delta=0mb) +2025-09-05T19:15:29Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5538 memory location(s), 1 block(s), and 35670 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:29Z USER 1724 (sg02) [ModuleForkPass]: Running address_rotation_dram +2025-09-05T19:15:29Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=5538 blocks=1 instructions=35670 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:29Z INFO 1724 (sg02) [DMAOptimizationBase]: Runtime page size at 512MB +2025-09-05T19:15:29Z INFO 1724 (sg02) [DMAOptimizationBase]: DRAM hwm before rotation 104775680 +2025-09-05T19:15:29Z INFO 1724 (sg02) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-09-05T19:15:29Z INFO 1724 (sg02) [DMAOptimizationBase]: allreduce hwm 33570816 +2025-09-05T19:15:29Z INFO 1724 (sg02) [DMAOptimizationBase]: Real CC buffer size 33570816 +2025-09-05T19:15:29Z INFO 1724 (sg02) [DMAOptimizationBase]: DRAM hwm after rotation 104775680 +2025-09-05T19:15:29Z INFO 1724 (sg02) [DMAOptimizationBase]: DRAM Rotation rotated 22 Dram address +2025-09-05T19:15:29Z USER 1724 (sg02) [ModuleForkPass]: address_rotation_dram finished after 0.027 seconds +2025-09-05T19:15:29Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 921mb, ru_maxrss: 947mb (delta=0mb) +2025-09-05T19:15:29Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5538 memory location(s), 1 block(s), and 35670 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:29Z USER 1724 (sg02) [ModuleForkPass]: Running tensorcopy_accel +2025-09-05T19:15:29Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=5538 blocks=1 instructions=35670 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:29Z INFO 1724 (sg02) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-09-05T19:15:29Z INFO 1724 (sg02) [TensorCopyAccel::Impl]: Accelerated 0 out of 921 tensorcopy in Function: sg0002 average acceleration factor: -nan +2025-09-05T19:15:29Z USER 1724 (sg02) [ModuleForkPass]: tensorcopy_accel finished after 0.003 seconds +2025-09-05T19:15:29Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 922mb, ru_maxrss: 947mb (delta=0mb) +2025-09-05T19:15:29Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5538 memory location(s), 1 block(s), and 35670 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:29Z USER 1724 (sg02) [ModuleForkPass]: Running peephole_opts +2025-09-05T19:15:29Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=5538 blocks=1 instructions=35670 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:29Z INFO 1724 (sg02) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-09-05T19:15:29Z USER 1724 (sg02) [ModuleForkPass]: peephole_opts finished after 0.009 seconds +2025-09-05T19:15:29Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 922mb, ru_maxrss: 947mb (delta=0mb) +2025-09-05T19:15:29Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5538 memory location(s), 1 block(s), and 35676 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:29Z USER 1724 (sg02) [ModuleForkPass]: Running lower_kernel +2025-09-05T19:15:29Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=5538 blocks=1 instructions=35676 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:29Z INFO 1724 (sg02) [LowerKernel]: Started running LowerKernel +2025-09-05T19:15:29Z INFO 1724 (sg02) [LowerKernel]: Start of kernel lowering pass, number of insts: 35676, number of allocs: 5538 +2025-09-05T19:15:29Z INFO 1724 (sg02) [LowerKernel]: Scan BKs time (s): 0.001885 +2025-09-05T19:15:29Z INFO 1724 (sg02) [LowerKernel]: Lower BKs time (s): 3e-06 +2025-09-05T19:15:29Z USER 1724 (sg02) [ModuleForkPass]: lower_kernel finished after 0.002 seconds +2025-09-05T19:15:29Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 922mb, ru_maxrss: 947mb (delta=0mb) +2025-09-05T19:15:29Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5538 memory location(s), 1 block(s), and 35676 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:29Z USER 1724 (sg02) [ModuleForkPass]: Running lower_nki_kernel +2025-09-05T19:15:29Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=5538 blocks=1 instructions=35676 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:29Z USER 1724 (sg02) [ModuleForkPass]: lower_nki_kernel finished after 0.002 seconds +2025-09-05T19:15:29Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 922mb, ru_maxrss: 947mb (delta=0mb) +2025-09-05T19:15:29Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5538 memory location(s), 1 block(s), and 35676 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:29Z USER 1724 (sg02) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-09-05T19:15:29Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=5538 blocks=1 instructions=35676 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:29Z USER 1724 (sg02) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.005 seconds +2025-09-05T19:15:29Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 922mb, ru_maxrss: 947mb (delta=0mb) +2025-09-05T19:15:29Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5538 memory location(s), 1 block(s), and 35676 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:29Z USER 1724 (sg02) [ModuleForkPass]: Running birverifier +2025-09-05T19:15:29Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=5538 blocks=1 instructions=35676 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:29Z USER 1724 (sg02) [ModuleForkPass]: birverifier finished after 0.023 seconds +2025-09-05T19:15:29Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 922mb, ru_maxrss: 947mb (delta=0mb) +2025-09-05T19:15:29Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5538 memory location(s), 1 block(s), and 35676 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:29Z USER 1724 (sg02) [ModuleForkPass]: Running dynamic_dma_scan +2025-09-05T19:15:29Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=5538 blocks=1 instructions=35676 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:29Z USER 1724 (sg02) [ModuleForkPass]: dynamic_dma_scan finished after 0.005 seconds +2025-09-05T19:15:29Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 921mb, ru_maxrss: 947mb (delta=0mb) +2025-09-05T19:15:29Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5538 memory location(s), 1 block(s), and 35676 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:29Z USER 1724 (sg02) [ModuleForkPass]: Running build_fdeps +2025-09-05T19:15:29Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=5538 blocks=1 instructions=35676 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:29Z INFO 1724 (sg02) [build_flow_deps]: Start build fdeps. Invocation: 7Fri Sep 5 19:15:29 2025 +2025-09-05T19:15:29Z INFO 1724 (sg02) [build_flow_deps]: Allocs: 5538 instructions: 35676 +2025-09-05T19:15:29Z INFO 1724 (sg02) [build_flow_deps]: Build fdeps inserted 114740 edges +2025-09-05T19:15:29Z INFO 1724 (sg02) [build_flow_deps]: Done build fdeps 114740 Fri Sep 5 19:15:29 2025 +2025-09-05T19:15:29Z USER 1724 (sg02) [ModuleForkPass]: build_fdeps finished after 0.089 seconds +2025-09-05T19:15:29Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 925mb, ru_maxrss: 947mb (delta=0mb) +2025-09-05T19:15:29Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5538 memory location(s), 1 block(s), and 35676 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:29Z USER 1724 (sg02) [ModuleForkPass]: Running remove_redundancies +2025-09-05T19:15:29Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=5538 blocks=1 instructions=35676 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:29Z INFO 1724 (sg02) [RemoveRedundancies]: remove_clobbered_writes +2025-09-05T19:15:29Z INFO 1724 (sg02) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-09-05T19:15:29Z INFO 1724 (sg02) [RemoveRedundancies]: remove_useless_insts +2025-09-05T19:15:29Z INFO 1724 (sg02) [RemoveRedundancies]: remove Useless Instructions: 0 +2025-09-05T19:15:29Z USER 1724 (sg02) [ModuleForkPass]: remove_redundancies finished after 0.014 seconds +2025-09-05T19:15:29Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 925mb, ru_maxrss: 947mb (delta=0mb) +2025-09-05T19:15:29Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5538 memory location(s), 1 block(s), and 35676 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:29Z USER 1724 (sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-09-05T19:15:29Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=5538 blocks=1 instructions=35676 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:29Z INFO 1724 (sg02) [AntiDependencyAnalyzer]: Batch size: 1000 +2025-09-05T19:15:29Z INFO 1724 (sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-09-05T19:15:29Z INFO 1724 (sg02) [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-09-05T19:15:29Z USER 1724 (sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.355 seconds +2025-09-05T19:15:29Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 944mb, ru_maxrss: 947mb (delta=0mb) +2025-09-05T19:15:29Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5538 memory location(s), 1 block(s), and 35676 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:29Z USER 1724 (sg02) [ModuleForkPass]: Running tensor_copy_elim +2025-09-05T19:15:29Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=5538 blocks=1 instructions=35676 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:30Z INFO 1724 (sg02) [TensorCopyElim]: Tensor CP elimination: 0 +2025-09-05T19:15:30Z INFO 1724 (sg01) [DMAOptimizationBase]: PSUM Rotation rotated 1437 PSUM Banks +2025-09-05T19:15:30Z INFO 1724 (sg02) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-09-05T19:15:30Z USER 1724 (sg02) [ModuleForkPass]: tensor_copy_elim finished after 0.086 seconds +2025-09-05T19:15:30Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 932mb, ru_maxrss: 947mb (delta=0mb) +2025-09-05T19:15:30Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5538 memory location(s), 1 block(s), and 35676 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:30Z USER 1724 (sg02) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-09-05T19:15:30Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=5538 blocks=1 instructions=35676 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:30Z USER 1724 (sg02) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-09-05T19:15:30Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 928mb, ru_maxrss: 947mb (delta=0mb) +2025-09-05T19:15:30Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5538 memory location(s), 1 block(s), and 35676 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:30Z USER 1724 (sg02) [ModuleForkPass]: Running post_sched +2025-09-05T19:15:30Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=5538 blocks=1 instructions=35676 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:30Z INFO 1724 [post_scheduler]: Start PosT ScheD 3 sunda Fri Sep 5 19:15:30 2025 +2025-09-05T19:15:30Z INFO 1724 (sg01) [DMAOptimizationBase]: PSUM Rotation rotated 1627 PSUM Banks +2025-09-05T19:15:30Z INFO 1724 (sg01) [DMAOptimizationBase]: PSUM Rotation rotated 1 PSUM Banks +2025-09-05T19:15:30Z INFO 1724 (sg01) [DMAOptimizationBase]: SB Rotation rotated 31 Sb address +2025-09-05T19:15:30Z INFO 1724 (sg01) [DMAOptimizationBase]: SB Rotation rotated 210 Sb address +2025-09-05T19:15:30Z INFO 1724 (sg01) [DMAOptimizationBase]: SB Rotation rotated 215 Sb address +2025-09-05T19:15:30Z INFO 1724 [post_scheduler]: Time-aware hwm post-sched +2025-09-05T19:15:30Z INFO 1724 (sg01) [DMAOptimizationBase]: SB Rotation rotated 93 Sb address +2025-09-05T19:15:30Z INFO 1724 (sg01) [DMAOptimizationBase]: SB Rotation rotated 493 Sb address +2025-09-05T19:15:30Z INFO 1724 (sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-09-05T19:15:30Z INFO 1724 (sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-09-05T19:15:30Z USER 1724 (sg01) [ModuleForkPass]: address_rotation_sb finished after 1.433 seconds +2025-09-05T19:15:30Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 942mb, ru_maxrss: 947mb (delta=0mb) +2025-09-05T19:15:30Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13756 memory location(s), 1 block(s), and 52859 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:30Z USER 1724 (sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-09-05T19:15:30Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=13756 blocks=1 instructions=52859 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:30Z INFO 1724 (sg01) [AntiDependencyAnalyzer]: Batch size: 1000 +2025-09-05T19:15:30Z INFO 1724 (sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-09-05T19:15:30Z INFO 1724 (sg01) [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-09-05T19:15:31Z USER 1724 (sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.352 seconds +2025-09-05T19:15:31Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 970mb, ru_maxrss: 970mb (delta=23mb) +2025-09-05T19:15:31Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13756 memory location(s), 1 block(s), and 52859 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:31Z USER 1724 (sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-09-05T19:15:31Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=13756 blocks=1 instructions=52859 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:31Z INFO 1724 (sg01) [AntiDependencyAnalyzer]: Batch size: 1000 +2025-09-05T19:15:31Z INFO 1724 (sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-09-05T19:15:31Z INFO 1724 (sg01) [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-09-05T19:15:31Z USER 1724 (sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.043 seconds +2025-09-05T19:15:31Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 955mb, ru_maxrss: 970mb (delta=0mb) +2025-09-05T19:15:31Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13756 memory location(s), 1 block(s), and 52859 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:31Z USER 1724 (sg01) [ModuleForkPass]: Running dep_opt +2025-09-05T19:15:31Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=13756 blocks=1 instructions=52859 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:31Z INFO 1724 [post_scheduler]: Time-aware simulation time: 7731449 +2025-09-05T19:15:31Z INFO 1724 (sg01) [build_flow_deps]: Start build fdeps. Invocation: 8Fri Sep 5 19:15:31 2025 +2025-09-05T19:15:31Z INFO 1724 (sg01) [build_flow_deps]: Allocs: 13756 instructions: 52859 +2025-09-05T19:15:31Z INFO 1724 [post_scheduler]: Done PosT ScheD Fri Sep 5 19:15:31 2025 +2025-09-05T19:15:31Z USER 1724 (sg02) [ModuleForkPass]: post_sched finished after 1.333 seconds +2025-09-05T19:15:31Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 957mb, ru_maxrss: 970mb (delta=23mb) +2025-09-05T19:15:31Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5538 memory location(s), 1 block(s), and 35676 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:31Z USER 1724 (sg02) [ModuleForkPass]: Running expand_scheduling_units +2025-09-05T19:15:31Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=5538 blocks=1 instructions=35676 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:31Z USER 1724 (sg02) [ModuleForkPass]: expand_scheduling_units finished after 0.005 seconds +2025-09-05T19:15:31Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 948mb, ru_maxrss: 970mb (delta=0mb) +2025-09-05T19:15:31Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5538 memory location(s), 1 block(s), and 35676 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:31Z USER 1724 (sg02) [ModuleForkPass]: Running address_rotation_sb +2025-09-05T19:15:31Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=5538 blocks=1 instructions=35676 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:31Z INFO 1724 (sg01) [build_flow_deps]: Build fdeps inserted 155806 edges +2025-09-05T19:15:31Z INFO 1724 (sg01) [build_flow_deps]: Done build fdeps 155806 Fri Sep 5 19:15:31 2025 +2025-09-05T19:15:31Z USER 1724 (sg01) [ModuleForkPass]: dep_opt finished after 0.276 seconds +2025-09-05T19:15:31Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 948mb, ru_maxrss: 970mb (delta=0mb) +2025-09-05T19:15:31Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13756 memory location(s), 1 block(s), and 52859 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:31Z USER 1724 (sg01) [ModuleForkPass]: Running report_stats +2025-09-05T19:15:31Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=13756 blocks=1 instructions=52859 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:31Z INFO 1724 (sg01) [ReportStats]: Data Movement Statistics: sg0001 +┌──────────────┬────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├──────────────┼────────────────────────────┼───────┼────────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 50331648 │ +│ DMACopy │ Internal -> ExternalOutput │ 128 │ 1073741824 │ +│ DMACopy │ Internal -> Output │ 2 │ 33554432 │ +│ Load │ Const -> Internal │ 3 │ 98304 │ +│ Load │ ExternalInput -> Internal │ 4693 │ 679510020 │ +│ Load │ Input -> Internal │ 78 │ 38273024 │ +│ Load │ Internal │ 919 │ 377487360 │ +│ Save │ Internal │ 275 │ 132382720 │ +│ Save │ Internal -> Output │ 33 │ 16777218 │ +│ Save (Spill) │ Internal │ 193 │ 41811968 │ +└──────────────┴────────────────────────────┴───────┴────────────┘ + +2025-09-05T19:15:31Z INFO 1724 (sg01) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 1 │ +│ 4 │ 1 │ +│ 64 │ 4 │ +│ 256 │ 3587 │ +│ 1024 │ 301 │ +│ 2048 │ 267 │ +│ 4096 │ 2033 │ +│ 524288 │ 128 │ +│ 8388608 │ 4 │ +│ 16777216 │ 3 │ +└─────────────────────┴───────┘ + +2025-09-05T19:15:31Z INFO 1724 (sg01) [ReportStats]: MM Stats: #MatMults 34848 #MatMult-Transposes 5920 +2025-09-05T19:15:31Z INFO 1724 (sg01) [ReportStats]: IO Tensor size combined: 251674628 +2025-09-05T19:15:31Z INFO 1724 (sg01) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input74 │ ExternalInput │ bfloat16 │ 58720256 │ +│ input77 │ ExternalInput │ bfloat16 │ 58720256 │ +│ input75 │ ExternalInput │ bfloat16 │ 58720256 │ +│ input78 │ ExternalInput │ bfloat16 │ 16777216 │ +│ input82 │ ExternalInput │ bfloat16 │ 16777216 │ +│ output4 │ ExternalOutput │ bfloat16 │ 8388608 │ +│ input6 │ ExternalInput │ bfloat16 │ 8388608 │ +│ output3 │ ExternalOutput │ bfloat16 │ 8388608 │ +│ input7 │ ExternalInput │ bfloat16 │ 8388608 │ +│ input79 │ ExternalInput │ bfloat16 │ 4194304 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-09-05T19:15:31Z INFO 1724 (sg01) [ReportStats]: Large (Internal) Tensor Statistics: +┌─────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├─────────────────┼──────────┼──────────┼──────────────┤ +│ intermediate1 │ Input │ bfloat16 │ 16777216 │ +│ intermediate6 │ Output │ bfloat16 │ 16777216 │ +│ intermediate7 │ Output │ bfloat16 │ 16777216 │ +│ intermediate4 │ Input │ bfloat16 │ 16777216 │ +│ add.4 │ Internal │ bfloat16 │ 16777216 │ +│ dot.7_i1 │ Internal │ bfloat16 │ 8388608 │ +│ dot.7_i0 │ Internal │ bfloat16 │ 8388608 │ +│ dot.11_i0 │ Internal │ bfloat16 │ 8388608 │ +│ all_reduce.1_i1 │ Internal │ bfloat16 │ 8388608 │ +│ all_reduce.1_i0 │ Internal │ bfloat16 │ 8388608 │ +└─────────────────┴──────────┴──────────┴──────────────┘ + +2025-09-05T19:15:31Z USER 1724 (sg01) [ModuleForkPass]: report_stats finished after 0.016 seconds +2025-09-05T19:15:31Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 948mb, ru_maxrss: 970mb (delta=0mb) +2025-09-05T19:15:31Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13756 memory location(s), 1 block(s), and 52859 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:31Z INFO 1724 (sg02) [DMAOptimizationBase]: PSUM Rotation rotated 478 PSUM Banks +2025-09-05T19:15:31Z INFO 1724 (sg02) [DMAOptimizationBase]: PSUM Rotation rotated 438 PSUM Banks +2025-09-05T19:15:31Z INFO 1724 (sg02) [DMAOptimizationBase]: PSUM Rotation rotated 1 PSUM Banks +2025-09-05T19:15:31Z INFO 1724 (sg02) [DMAOptimizationBase]: SB Rotation rotated 21 Sb address +2025-09-05T19:15:31Z INFO 1724 (sg02) [DMAOptimizationBase]: SB Rotation rotated 271 Sb address +2025-09-05T19:15:31Z INFO 1724 (sg02) [DMAOptimizationBase]: SB Rotation rotated 139 Sb address +2025-09-05T19:15:31Z INFO 1724 (sg02) [DMAOptimizationBase]: SB Rotation rotated 61 Sb address +2025-09-05T19:15:32Z INFO 1724 (sg02) [DMAOptimizationBase]: SB Rotation rotated 168 Sb address +2025-09-05T19:15:32Z INFO 1724 (sg02) [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2025-09-05T19:15:32Z INFO 1724 (sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-09-05T19:15:32Z USER 1724 (sg02) [ModuleForkPass]: address_rotation_sb finished after 0.665 seconds +2025-09-05T19:15:32Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 948mb, ru_maxrss: 970mb (delta=0mb) +2025-09-05T19:15:32Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5538 memory location(s), 1 block(s), and 35676 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:32Z USER 1724 (sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-09-05T19:15:32Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=5538 blocks=1 instructions=35676 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:32Z INFO 1724 (sg02) [AntiDependencyAnalyzer]: Batch size: 1000 +2025-09-05T19:15:32Z INFO 1724 (sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-09-05T19:15:32Z INFO 1724 (sg02) [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-09-05T19:15:32Z USER 1724 (sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.219 seconds +2025-09-05T19:15:32Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 951mb, ru_maxrss: 970mb (delta=0mb) +2025-09-05T19:15:32Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5538 memory location(s), 1 block(s), and 35676 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:32Z USER 1724 (sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-09-05T19:15:32Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=5538 blocks=1 instructions=35676 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:32Z INFO 1724 (sg02) [AntiDependencyAnalyzer]: Batch size: 1000 +2025-09-05T19:15:32Z INFO 1724 (sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-09-05T19:15:32Z INFO 1724 (sg02) [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-09-05T19:15:32Z USER 1724 (sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.025 seconds +2025-09-05T19:15:32Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 932mb, ru_maxrss: 970mb (delta=0mb) +2025-09-05T19:15:32Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5538 memory location(s), 1 block(s), and 35676 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:32Z USER 1724 (sg02) [ModuleForkPass]: Running dep_opt +2025-09-05T19:15:32Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=5538 blocks=1 instructions=35676 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:32Z INFO 1724 (sg02) [build_flow_deps]: Start build fdeps. Invocation: 9Fri Sep 5 19:15:32 2025 +2025-09-05T19:15:32Z INFO 1724 (sg02) [build_flow_deps]: Allocs: 5538 instructions: 35676 +2025-09-05T19:15:32Z INFO 1724 (sg02) [build_flow_deps]: Build fdeps inserted 114195 edges +2025-09-05T19:15:32Z INFO 1724 (sg02) [build_flow_deps]: Done build fdeps 114195 Fri Sep 5 19:15:32 2025 +2025-09-05T19:15:32Z USER 1724 (sg02) [ModuleForkPass]: dep_opt finished after 0.194 seconds +2025-09-05T19:15:32Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 932mb, ru_maxrss: 970mb (delta=0mb) +2025-09-05T19:15:32Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5538 memory location(s), 1 block(s), and 35676 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:32Z USER 1724 (sg02) [ModuleForkPass]: Running report_stats +2025-09-05T19:15:32Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=5538 blocks=1 instructions=35676 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:32Z INFO 1724 (sg02) [ReportStats]: Data Movement Statistics: sg0002 +┌──────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├──────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 50331648 │ +│ DMACopy │ Internal │ 1 │ 16777216 │ +│ Load │ Const -> Internal │ 4 │ 34824 │ +│ Load │ ExternalInput -> Internal │ 4744 │ 721448972 │ +│ Load │ Internal │ 526 │ 201635848 │ +│ Save │ Internal │ 256 │ 103264260 │ +│ Save │ Internal -> ExternalOutput │ 1 │ 4 │ +│ Save (Spill) │ Internal │ 69 │ 23592960 │ +└──────────────┴────────────────────────────┴───────┴───────────┘ + +2025-09-05T19:15:32Z INFO 1724 (sg02) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 1 │ +│ 4 │ 9 │ +│ 8 │ 2 │ +│ 16 │ 3 │ +│ 64 │ 2 │ +│ 256 │ 3597 │ +│ 1024 │ 158 │ +│ 2048 │ 132 │ +│ 4096 │ 1696 │ +│ 65536 │ 1 │ +│ 16777216 │ 3 │ +└─────────────────────┴───────┘ + +2025-09-05T19:15:32Z INFO 1724 (sg02) [ReportStats]: MM Stats: #MatMults 27521 #MatMult-Transposes 1665 +2025-09-05T19:15:32Z INFO 1724 (sg02) [ReportStats]: IO Tensor size combined: 310403088 +2025-09-05T19:15:32Z INFO 1724 (sg02) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input357 │ ExternalInput │ bfloat16 │ 134217728 │ +│ input356 │ ExternalInput │ bfloat16 │ 58720256 │ +│ input353 │ ExternalInput │ bfloat16 │ 58720256 │ +│ input354 │ ExternalInput │ bfloat16 │ 58720256 │ +│ input358 │ ExternalInput │ bfloat16 │ 8192 │ +│ input355 │ ExternalInput │ bfloat16 │ 8192 │ +│ input1 │ ExternalInput │ int32 │ 8192 │ +│ input3 │ ExternalInput │ float32 │ 12 │ +│ output0 │ ExternalOutput │ int32 │ 4 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-09-05T19:15:32Z INFO 1724 (sg02) [ReportStats]: Large (Internal) Tensor Statistics: +┌──────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├──────────────────────┼──────────┼──────────┼──────────────┤ +│ convert.49 │ Internal │ bfloat16 │ 16777216 │ +│ intermediate97 │ Input │ bfloat16 │ 16777216 │ +│ add.9 │ Internal │ bfloat16 │ 16777216 │ +│ intermediate96 │ Input │ bfloat16 │ 16777216 │ +│ dot.14_i1 │ Internal │ bfloat16 │ 8388608 │ +│ all_reduce.3_i0 │ Internal │ bfloat16 │ 8388608 │ +│ all_reduce.3_i1 │ Internal │ bfloat16 │ 8388608 │ +│ dot.14_i0 │ Internal │ bfloat16 │ 8388608 │ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ -t1743 │ Internal │ float32 │ 1048576 │ +└──────────────────────┴──────────┴──────────┴──────────────┘ + +2025-09-05T19:15:32Z USER 1724 (sg02) [ModuleForkPass]: report_stats finished after 0.010 seconds +2025-09-05T19:15:32Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 932mb, ru_maxrss: 970mb (delta=0mb) +2025-09-05T19:15:32Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5538 memory location(s), 1 block(s), and 35676 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:32Z USER 1724 [ModuleForkPass]: Compilation status: Total modules: 3, Passed: 3, Failed: 0 +2025-09-05T19:15:32Z USER 1724 [BackendPassManager]: mod_parallel_pass finished after 16.172 seconds +2025-09-05T19:15:32Z INFO 1724 [BackendPassManager]: curr_vmrss: 932mb, ru_maxrss: 970mb (delta=145mb) +2025-09-05T19:15:32Z INFO 1724 [BackendPassManager]: Output has 3 module(s), 3 function(s), 28368 memory location(s), 3 block(s), and 108720 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:32Z USER 1724 [BackendPassManager]: Running assign_trigger_engine +2025-09-05T19:15:32Z INFO 1724 [BackendPassManager]: Inputs to assign_trigger_engine: modules=3 functions=3 allocs=28368 blocks=3 instructions=108720 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:32Z INFO 1724 (sg00) [AssignTriggerEngine]: Assigned trigger engine for 260 DMA instructions. Moved 66 DMA instructions to CC's engines. +2025-09-05T19:15:32Z INFO 1724 (sg01) [AssignTriggerEngine]: Assigned trigger engine for 502 DMA instructions. Moved 34 DMA instructions to CC's engines. +2025-09-05T19:15:32Z INFO 1724 (sg02) [AssignTriggerEngine]: Assigned trigger engine for 360 DMA instructions. Moved 35 DMA instructions to CC's engines. +2025-09-05T19:15:32Z USER 1724 [BackendPassManager]: assign_trigger_engine finished after 0.042 seconds +2025-09-05T19:15:32Z INFO 1724 [BackendPassManager]: curr_vmrss: 927mb, ru_maxrss: 970mb (delta=0mb) +2025-09-05T19:15:32Z INFO 1724 [BackendPassManager]: Output has 3 module(s), 3 function(s), 28368 memory location(s), 3 block(s), and 108720 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:32Z USER 1724 [BackendPassManager]: Running subgraph_parallel_pass +2025-09-05T19:15:32Z INFO 1724 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=3 functions=3 allocs=28368 blocks=3 instructions=108720 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:32Z USER 1724 (sg00) [SubgraphForkPass]: Running lower_local_collectives +2025-09-05T19:15:32Z USER 1724 (sg02) [SubgraphForkPass]: Running lower_local_collectives +2025-09-05T19:15:32Z USER 1724 (sg01) [SubgraphForkPass]: Running lower_local_collectives +2025-09-05T19:15:32Z INFO 1724 (sg00) [SubgraphForkPass]: Inputs to lower_local_collectives: modules=1 functions=1 allocs=9074 blocks=1 instructions=20185 Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:32Z INFO 1724 (sg02) [SubgraphForkPass]: Inputs to lower_local_collectives: modules=1 functions=1 allocs=5538 blocks=1 instructions=35676 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:32Z USER 1724 (sg00) [SubgraphForkPass]: lower_local_collectives finished after 0.001 seconds +2025-09-05T19:15:32Z USER 1724 (sg02) [SubgraphForkPass]: lower_local_collectives finished after 0.001 seconds +2025-09-05T19:15:32Z INFO 1724 (sg00) [SubgraphForkPass]: curr_vmrss: 927mb, ru_maxrss: 970mb (delta=0mb) +2025-09-05T19:15:32Z INFO 1724 (sg02) [SubgraphForkPass]: curr_vmrss: 927mb, ru_maxrss: 970mb (delta=0mb) +2025-09-05T19:15:32Z INFO 1724 (sg02) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 5538 memory location(s), 1 block(s), and 35676 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:32Z USER 1724 (sg02) [SubgraphForkPass]: Running extend_shared_lifetimes +2025-09-05T19:15:32Z INFO 1724 (sg02) [SubgraphForkPass]: Inputs to extend_shared_lifetimes: modules=1 functions=1 allocs=5538 blocks=1 instructions=35676 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:32Z USER 1724 (sg02) [SubgraphForkPass]: extend_shared_lifetimes finished after 0.000 seconds +2025-09-05T19:15:32Z INFO 1724 (sg02) [SubgraphForkPass]: curr_vmrss: 927mb, ru_maxrss: 970mb (delta=0mb) +2025-09-05T19:15:32Z INFO 1724 (sg00) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 9074 memory location(s), 1 block(s), and 20185 instruction(s). Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:32Z USER 1724 (sg00) [SubgraphForkPass]: Running extend_shared_lifetimes +2025-09-05T19:15:32Z INFO 1724 (sg02) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 5538 memory location(s), 1 block(s), and 35676 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:32Z USER 1724 (sg02) [SubgraphForkPass]: Running dead_code_elim +2025-09-05T19:15:32Z INFO 1724 (sg02) [SubgraphForkPass]: Inputs to dead_code_elim: modules=1 functions=1 allocs=5538 blocks=1 instructions=35676 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:32Z INFO 1724 (sg01) [SubgraphForkPass]: Inputs to lower_local_collectives: modules=1 functions=1 allocs=13756 blocks=1 instructions=52859 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:32Z USER 1724 (sg01) [SubgraphForkPass]: lower_local_collectives finished after 0.002 seconds +2025-09-05T19:15:32Z INFO 1724 (sg01) [SubgraphForkPass]: curr_vmrss: 927mb, ru_maxrss: 970mb (delta=0mb) +2025-09-05T19:15:32Z INFO 1724 (sg00) [SubgraphForkPass]: Inputs to extend_shared_lifetimes: modules=1 functions=1 allocs=9074 blocks=1 instructions=20185 Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:32Z USER 1724 (sg00) [SubgraphForkPass]: extend_shared_lifetimes finished after 0.001 seconds +2025-09-05T19:15:32Z INFO 1724 (sg00) [SubgraphForkPass]: curr_vmrss: 927mb, ru_maxrss: 970mb (delta=0mb) +2025-09-05T19:15:32Z INFO 1724 (sg00) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 9074 memory location(s), 1 block(s), and 20185 instruction(s). Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:32Z USER 1724 (sg00) [SubgraphForkPass]: Running dead_code_elim +2025-09-05T19:15:32Z INFO 1724 (sg01) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 13756 memory location(s), 1 block(s), and 52859 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:32Z USER 1724 (sg01) [SubgraphForkPass]: Running extend_shared_lifetimes +2025-09-05T19:15:32Z INFO 1724 (sg00) [SubgraphForkPass]: Inputs to dead_code_elim: modules=1 functions=1 allocs=9074 blocks=1 instructions=20185 Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:32Z INFO 1724 (sg01) [SubgraphForkPass]: Inputs to extend_shared_lifetimes: modules=1 functions=1 allocs=13756 blocks=1 instructions=52859 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:32Z USER 1724 (sg01) [SubgraphForkPass]: extend_shared_lifetimes finished after 0.001 seconds +2025-09-05T19:15:32Z INFO 1724 (sg01) [SubgraphForkPass]: curr_vmrss: 927mb, ru_maxrss: 970mb (delta=0mb) +2025-09-05T19:15:32Z INFO 1724 (sg01) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 13756 memory location(s), 1 block(s), and 52859 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:32Z USER 1724 (sg01) [SubgraphForkPass]: Running dead_code_elim +2025-09-05T19:15:32Z INFO 1724 (sg01) [SubgraphForkPass]: Inputs to dead_code_elim: modules=1 functions=1 allocs=13756 blocks=1 instructions=52859 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:32Z INFO 1724 (sg00) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-09-05T19:15:32Z USER 1724 (sg00) [SubgraphForkPass]: dead_code_elim finished after 0.029 seconds +2025-09-05T19:15:32Z INFO 1724 (sg00) [SubgraphForkPass]: curr_vmrss: 928mb, ru_maxrss: 970mb (delta=0mb) +2025-09-05T19:15:32Z INFO 1724 (sg00) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 9074 memory location(s), 1 block(s), and 20185 instruction(s). Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:32Z INFO 1724 (sg02) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-09-05T19:15:32Z INFO 1724 (sg01) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-09-05T19:15:32Z USER 1724 (sg02) [SubgraphForkPass]: dead_code_elim finished after 0.080 seconds +2025-09-05T19:15:32Z INFO 1724 (sg02) [SubgraphForkPass]: curr_vmrss: 925mb, ru_maxrss: 970mb (delta=0mb) +2025-09-05T19:15:32Z INFO 1724 (sg02) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 5538 memory location(s), 1 block(s), and 35676 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:32Z USER 1724 (sg01) [SubgraphForkPass]: dead_code_elim finished after 0.108 seconds +2025-09-05T19:15:32Z INFO 1724 (sg01) [SubgraphForkPass]: curr_vmrss: 926mb, ru_maxrss: 970mb (delta=0mb) +2025-09-05T19:15:32Z INFO 1724 (sg01) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 13756 memory location(s), 1 block(s), and 52859 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:32Z USER 1724 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-09-05T19:15:32Z USER 1724 [BackendPassManager]: subgraph_parallel_pass finished after 0.115 seconds +2025-09-05T19:15:32Z INFO 1724 [BackendPassManager]: curr_vmrss: 925mb, ru_maxrss: 970mb (delta=0mb) +2025-09-05T19:15:32Z INFO 1724 [BackendPassManager]: Output has 3 module(s), 3 function(s), 28368 memory location(s), 3 block(s), and 108720 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:32Z USER 1724 [BackendPassManager]: Running assign_hwdge_engine +2025-09-05T19:15:32Z INFO 1724 [BackendPassManager]: Inputs to assign_hwdge_engine: modules=3 functions=3 allocs=28368 blocks=3 instructions=108720 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:32Z USER 1724 [BackendPassManager]: assign_hwdge_engine finished after 0.013 seconds +2025-09-05T19:15:32Z INFO 1724 [BackendPassManager]: curr_vmrss: 925mb, ru_maxrss: 970mb (delta=0mb) +2025-09-05T19:15:32Z INFO 1724 [BackendPassManager]: Output has 3 module(s), 3 function(s), 28368 memory location(s), 3 block(s), and 108720 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:32Z USER 1724 [BackendPassManager]: Running mod_parallel_pass +2025-09-05T19:15:32Z INFO 1724 [BackendPassManager]: Inputs to mod_parallel_pass: modules=3 functions=3 allocs=28368 blocks=3 instructions=108720 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:32Z USER 1724 (sg00) [ModuleForkPass]: Running alloc_queues +2025-09-05T19:15:32Z USER 1724 (sg01) [ModuleForkPass]: Running alloc_queues +2025-09-05T19:15:32Z USER 1724 (sg02) [ModuleForkPass]: Running alloc_queues +2025-09-05T19:15:32Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=9074 blocks=1 instructions=20185 Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:32Z INFO 1724 (sg00) [AllocQueues]: DMACopy transpose will be triggered from multiple engines +2025-09-05T19:15:32Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=5538 blocks=1 instructions=35676 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:32Z INFO 1724 (sg02) [AllocQueues]: DMACopy transpose will be triggered from multiple engines +2025-09-05T19:15:32Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=13756 blocks=1 instructions=52859 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:32Z INFO 1724 (sg01) [AllocQueues]: DMACopy transpose will be triggered from multiple engines +2025-09-05T19:15:32Z INFO 1724 (sg00) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 4 │ +│ qPoolIO0 │ input │ Pool │ 16 │ 1 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 313 │ +│ qActSpillReload0 │ data │ Activation │ 16 │ 147 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 73 │ +│ qDVESpillReload0 │ data │ DVE │ 16 │ 38 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 444 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-09-05T19:15:32Z USER 1724 (sg00) [ModuleForkPass]: alloc_queues finished after 0.004 seconds +2025-09-05T19:15:32Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 925mb, ru_maxrss: 970mb (delta=0mb) +2025-09-05T19:15:32Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9074 memory location(s), 1 block(s), and 20185 instruction(s). Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:32Z USER 1724 (sg00) [ModuleForkPass]: Running chain_dma_transposes +2025-09-05T19:15:32Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=9074 blocks=1 instructions=20185 Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:32Z USER 1724 (sg00) [ModuleForkPass]: chain_dma_transposes finished after 0.000 seconds +2025-09-05T19:15:32Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 925mb, ru_maxrss: 970mb (delta=0mb) +2025-09-05T19:15:32Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9074 memory location(s), 1 block(s), and 20185 instruction(s). Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:32Z USER 1724 (sg00) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-09-05T19:15:32Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=9074 blocks=1 instructions=20185 Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:32Z USER 1724 (sg00) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-09-05T19:15:32Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 925mb, ru_maxrss: 970mb (delta=0mb) +2025-09-05T19:15:32Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9074 memory location(s), 1 block(s), and 20185 instruction(s). Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:32Z USER 1724 (sg00) [ModuleForkPass]: Running lower_control +2025-09-05T19:15:32Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=9074 blocks=1 instructions=20185 Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:32Z INFO 1724 (sg02) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 7 │ +│ qPoolIO0 │ input │ Pool │ 16 │ 1 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 495 │ +│ qActSpillReload0 │ data │ Activation │ 16 │ 224 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 47 │ +│ qDVESpillReload0 │ data │ DVE │ 16 │ 89 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 4739 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-09-05T19:15:32Z USER 1724 (sg02) [ModuleForkPass]: alloc_queues finished after 0.007 seconds +2025-09-05T19:15:32Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 925mb, ru_maxrss: 970mb (delta=0mb) +2025-09-05T19:15:32Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5538 memory location(s), 1 block(s), and 35676 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:32Z USER 1724 (sg02) [ModuleForkPass]: Running chain_dma_transposes +2025-09-05T19:15:32Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=5538 blocks=1 instructions=35676 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:32Z USER 1724 (sg02) [ModuleForkPass]: chain_dma_transposes finished after 0.000 seconds +2025-09-05T19:15:32Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 925mb, ru_maxrss: 970mb (delta=0mb) +2025-09-05T19:15:32Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5538 memory location(s), 1 block(s), and 35676 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:32Z USER 1724 (sg02) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-09-05T19:15:32Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=5538 blocks=1 instructions=35676 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:32Z USER 1724 (sg02) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-09-05T19:15:32Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 925mb, ru_maxrss: 970mb (delta=0mb) +2025-09-05T19:15:32Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5538 memory location(s), 1 block(s), and 35676 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:32Z USER 1724 (sg02) [ModuleForkPass]: Running lower_control +2025-09-05T19:15:32Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=5538 blocks=1 instructions=35676 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:32Z INFO 1724 (sg01) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 2 │ +│ qPoolIO0 │ input │ Pool │ 16 │ 1 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 890 │ +│ qActSpillReload0 │ data │ Activation │ 16 │ 268 │ +│ qDVESpillReload0 │ data │ DVE │ 16 │ 176 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 56 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 4932 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-09-05T19:15:32Z USER 1724 (sg01) [ModuleForkPass]: alloc_queues finished after 0.010 seconds +2025-09-05T19:15:32Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 925mb, ru_maxrss: 970mb (delta=0mb) +2025-09-05T19:15:32Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13756 memory location(s), 1 block(s), and 52859 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:32Z USER 1724 (sg01) [ModuleForkPass]: Running chain_dma_transposes +2025-09-05T19:15:32Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=13756 blocks=1 instructions=52859 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:32Z USER 1724 (sg01) [ModuleForkPass]: chain_dma_transposes finished after 0.001 seconds +2025-09-05T19:15:32Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 925mb, ru_maxrss: 970mb (delta=0mb) +2025-09-05T19:15:32Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13756 memory location(s), 1 block(s), and 52859 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:32Z USER 1724 (sg01) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-09-05T19:15:32Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=13756 blocks=1 instructions=52859 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:32Z USER 1724 (sg01) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.001 seconds +2025-09-05T19:15:32Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 925mb, ru_maxrss: 970mb (delta=0mb) +2025-09-05T19:15:32Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13756 memory location(s), 1 block(s), and 52859 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:32Z USER 1724 (sg01) [ModuleForkPass]: Running lower_control +2025-09-05T19:15:32Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=13756 blocks=1 instructions=52859 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:32Z INFO 1724 (sg00) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-09-05T19:15:32Z USER 1724 (sg00) [ModuleForkPass]: lower_control finished after 0.018 seconds +2025-09-05T19:15:32Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 925mb, ru_maxrss: 970mb (delta=0mb) +2025-09-05T19:15:32Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9074 memory location(s), 1 block(s), and 20185 instruction(s). Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:32Z USER 1724 (sg00) [ModuleForkPass]: Running dep_reduction +2025-09-05T19:15:32Z INFO 1724 (sg00) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=9074 blocks=1 instructions=20185 Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:32Z INFO 1724 (sg00) [DepReduction]: Start Dependency Reduction +2025-09-05T19:15:32Z INFO 1724 (sg00) [DepReduction]: Processing async instrs... +2025-09-05T19:15:32Z INFO 1724 (sg00) [DepReduction]: Processing secondary edges per engine... +2025-09-05T19:15:32Z INFO 1724 (sg02) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-09-05T19:15:32Z INFO 1724 (sg00) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 18314 +2025-09-05T19:15:32Z INFO 1724 (sg01) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-09-05T19:15:32Z USER 1724 (sg02) [ModuleForkPass]: lower_control finished after 0.043 seconds +2025-09-05T19:15:32Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 926mb, ru_maxrss: 970mb (delta=0mb) +2025-09-05T19:15:32Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5538 memory location(s), 1 block(s), and 35676 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:32Z USER 1724 (sg02) [ModuleForkPass]: Running dep_reduction +2025-09-05T19:15:32Z INFO 1724 (sg02) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=5538 blocks=1 instructions=35676 Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:32Z INFO 1724 (sg02) [DepReduction]: Start Dependency Reduction +2025-09-05T19:15:32Z INFO 1724 (sg00) [DepReduction]: Processing redundant descendants, Done. Num edges removed 19206 +2025-09-05T19:15:32Z INFO 1724 (sg00) [DepReduction]: Processing async instrs, Done. Num edges removed 19206 +2025-09-05T19:15:32Z USER 1724 (sg01) [ModuleForkPass]: lower_control finished after 0.050 seconds +2025-09-05T19:15:32Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 927mb, ru_maxrss: 970mb (delta=0mb) +2025-09-05T19:15:32Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13756 memory location(s), 1 block(s), and 52859 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:32Z USER 1724 (sg01) [ModuleForkPass]: Running dep_reduction +2025-09-05T19:15:32Z INFO 1724 (sg01) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=13756 blocks=1 instructions=52859 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:32Z INFO 1724 (sg01) [DepReduction]: Start Dependency Reduction +2025-09-05T19:15:32Z INFO 1724 (sg02) [DepReduction]: Processing async instrs... +2025-09-05T19:15:32Z INFO 1724 (sg02) [DepReduction]: Processing secondary edges per engine... +2025-09-05T19:15:32Z INFO 1724 (sg01) [DepReduction]: Processing async instrs... +2025-09-05T19:15:32Z INFO 1724 (sg01) [DepReduction]: Processing secondary edges per engine... +2025-09-05T19:15:32Z INFO 1724 (sg02) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 60836 +2025-09-05T19:15:32Z INFO 1724 (sg01) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 74653 +2025-09-05T19:15:32Z INFO 1724 (sg02) [DepReduction]: Processing redundant descendants, Done. Num edges removed 66697 +2025-09-05T19:15:32Z INFO 1724 (sg02) [DepReduction]: Processing async instrs, Done. Num edges removed 66697 +2025-09-05T19:15:32Z INFO 1724 (sg00) [DepReduction]: Num Async removed: 0 +2025-09-05T19:15:32Z INFO 1724 (sg00) [DepReduction]: Finished dependency reduction: 111563 removed, new total 8884 +2025-09-05T19:15:32Z INFO 1724 (sg00) [DepReduction]: Finished Dependency Reduction +2025-09-05T19:15:32Z USER 1724 (sg00) [ModuleForkPass]: dep_reduction finished after 0.176 seconds +2025-09-05T19:15:32Z INFO 1724 (sg00) [ModuleForkPass]: curr_vmrss: 948mb, ru_maxrss: 970mb (delta=0mb) +2025-09-05T19:15:32Z INFO 1724 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9074 memory location(s), 1 block(s), and 20185 instruction(s). Max writers: 64 Max Readers: 3712 +2025-09-05T19:15:32Z INFO 1724 (sg01) [DepReduction]: Processing redundant descendants, Done. Num edges removed 81407 +2025-09-05T19:15:32Z INFO 1724 (sg01) [DepReduction]: Processing async instrs, Done. Num edges removed 81407 +2025-09-05T19:15:33Z INFO 1724 (sg02) [DepReduction]: Num Async removed: 0 +2025-09-05T19:15:33Z INFO 1724 (sg02) [DepReduction]: Finished dependency reduction: 236027 removed, new total 10895 +2025-09-05T19:15:33Z INFO 1724 (sg02) [DepReduction]: Finished Dependency Reduction +2025-09-05T19:15:33Z USER 1724 (sg02) [ModuleForkPass]: dep_reduction finished after 0.373 seconds +2025-09-05T19:15:33Z INFO 1724 (sg02) [ModuleForkPass]: curr_vmrss: 972mb, ru_maxrss: 972mb (delta=2mb) +2025-09-05T19:15:33Z INFO 1724 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5538 memory location(s), 1 block(s), and 35676 instruction(s). Max writers: 56 Max Readers: 4096 +2025-09-05T19:15:33Z INFO 1724 (sg01) [DepReduction]: Num Async removed: 0 +2025-09-05T19:15:33Z INFO 1724 (sg01) [DepReduction]: Finished dependency reduction: 325467 removed, new total 19163 +2025-09-05T19:15:33Z INFO 1724 (sg01) [DepReduction]: Finished Dependency Reduction +2025-09-05T19:15:33Z USER 1724 (sg01) [ModuleForkPass]: dep_reduction finished after 0.512 seconds +2025-09-05T19:15:33Z INFO 1724 (sg01) [ModuleForkPass]: curr_vmrss: 972mb, ru_maxrss: 972mb (delta=2mb) +2025-09-05T19:15:33Z INFO 1724 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13756 memory location(s), 1 block(s), and 52859 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:33Z USER 1724 [ModuleForkPass]: Compilation status: Total modules: 3, Passed: 3, Failed: 0 +2025-09-05T19:15:33Z USER 1724 [BackendPassManager]: mod_parallel_pass finished after 0.628 seconds +2025-09-05T19:15:33Z INFO 1724 [BackendPassManager]: curr_vmrss: 966mb, ru_maxrss: 972mb (delta=2mb) +2025-09-05T19:15:33Z INFO 1724 [BackendPassManager]: Output has 3 module(s), 3 function(s), 28368 memory location(s), 3 block(s), and 108720 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:33Z USER 1724 [BackendPassManager]: Running nc_parallel_pass +2025-09-05T19:15:33Z INFO 1724 [BackendPassManager]: Inputs to nc_parallel_pass: modules=3 functions=3 allocs=28368 blocks=3 instructions=108720 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:33Z USER 1724 [CoreForkPass]: Running bir_linker +2025-09-05T19:15:33Z INFO 1724 [CoreForkPass]: Inputs to bir_linker: modules=3 functions=3 allocs=28368 blocks=3 instructions=108720 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:33Z INFO 1724 (sgLnk) [BirLinker]: bir_linker cwd: +2025-09-05T19:15:33Z INFO 1724 (sgLnk) [BirLinker]: Num intermediates 99 +2025-09-05T19:15:33Z INFO 1724 (sgLnk) [BirLinker]: Num Module Definitions 3 +2025-09-05T19:15:33Z INFO 1724 (sgLnk) [BirLinker]: Linking to a call-graph structure +2025-09-05T19:15:33Z INFO 1724 (sgLnk) [BirLinker]: Added a new SpillReload Que qPoolPIOParam0 +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [BirLinker]: tensor_map verification successful. +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [BirLinker]: Writing updated tensor_map /models/mistral-7b-v0.3-instruct-neuronx/context_encoding_model/_tp0_bk4/neuronxcc-p52odp_y/sgLnk/sg00/tensor_map.json +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [BirLinker]: PostLink Stats: #MatMults 1118065 #MatMult-Transposes 188897 +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [BirLinker]: Total Intermediate MMTs 16864 #out: 15872 #inp: 992 #symmetric: 0 +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [BirLinker]: Total Intermediate IOs with MMTs: 33 #out: 31 #inp: 2 #both: 0 +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [BirLinker]: releasing pre-link modules +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [BirLinker]: linking Done. +2025-09-05T19:15:34Z USER 1724 [CoreForkPass]: bir_linker finished after 1.309 seconds +2025-09-05T19:15:34Z INFO 1724 [CoreForkPass]: curr_vmrss: 1515mb, ru_maxrss: 1515mb (delta=543mb) +2025-09-05T19:15:34Z INFO 1724 [CoreForkPass]: Output has 1 module(s), 4 function(s), 28891 memory location(s), 4 block(s), and 108780 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:34Z USER 1724 [CoreForkPass]: Running postlnk_dma_report +2025-09-05T19:15:34Z INFO 1724 [CoreForkPass]: Inputs to postlnk_dma_report: modules=1 functions=4 allocs=28891 blocks=4 instructions=108780 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [DMAReport]: DMA Report: Bytes loaded or saved 2681562160, 57.0631% input load, 1.44682% output write, 41.4901% spill/reload +2025-09-05T19:15:34Z USER 1724 [CoreForkPass]: postlnk_dma_report finished after 0.009 seconds +2025-09-05T19:15:34Z INFO 1724 [CoreForkPass]: curr_vmrss: 838mb, ru_maxrss: 1515mb (delta=0mb) +2025-09-05T19:15:34Z INFO 1724 [CoreForkPass]: Output has 1 module(s), 4 function(s), 28891 memory location(s), 4 block(s), and 108780 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:34Z USER 1724 [CoreForkPass]: Running report_stats +2025-09-05T19:15:34Z INFO 1724 [CoreForkPass]: Inputs to report_stats: modules=1 functions=4 allocs=28891 blocks=4 instructions=108780 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [ReportStats]: Data Movement Statistics: main +┌─────────────┬──────┬───────┬───────┐ +│ Instruction │ Kind │ Count │ Bytes │ +└─────────────┴──────┴───────┴───────┘ + +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +└─────────────────────┴───────┘ + +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [ReportStats]: Data Movement Statistics: sg0000 +┌──────────────┬────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├──────────────┼────────────────────────────┼───────┼────────────┤ +│ DMACopy │ ExternalInput -> Internal │ 33 │ 4294975488 │ +│ DMACopy │ Internal -> ExternalOutput │ 128 │ 1073741824 │ +│ DMACopy │ Internal -> Output │ 2 │ 33554432 │ +│ Load │ Const -> Internal │ 4 │ 98560 │ +│ Load │ ExternalInput -> Internal │ 209 │ 90718212 │ +│ Load │ Internal │ 373 │ 152650240 │ +│ Save │ Internal │ 97 │ 47972608 │ +│ Save │ Internal -> Output │ 77 │ 22020098 │ +│ Save (Spill) │ Internal │ 97 │ 31784960 │ +└──────────────┴────────────────────────────┴───────┴────────────┘ + +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 1 │ +│ 4 │ 1 │ +│ 64 │ 1 │ +│ 256 │ 7 │ +│ 512 │ 1 │ +│ 896 │ 24 │ +│ 1024 │ 71 │ +│ 1920 │ 64 │ +│ 2048 │ 199 │ +│ 4096 │ 502 │ +│ 8064 │ 7 │ +│ 8192 │ 12 │ +│ 524288 │ 128 │ +│ 8388608 │ 4 │ +└─────────────────────┴───────┘ + +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [ReportStats]: Data Movement Statistics: sg0001 +┌──────────────┬────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├──────────────┼────────────────────────────┼───────┼────────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 50331648 │ +│ DMACopy │ Internal -> ExternalOutput │ 128 │ 1073741824 │ +│ DMACopy │ Internal -> Output │ 2 │ 33554432 │ +│ Load │ Const -> Internal │ 3 │ 98304 │ +│ Load │ ExternalInput -> Internal │ 4693 │ 679510020 │ +│ Load │ Input -> Internal │ 78 │ 38273024 │ +│ Load │ Internal │ 919 │ 377487360 │ +│ Save │ Internal │ 275 │ 132382720 │ +│ Save │ Internal -> Output │ 33 │ 16777218 │ +│ Save (Spill) │ Internal │ 193 │ 41811968 │ +└──────────────┴────────────────────────────┴───────┴────────────┘ + +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 1 │ +│ 4 │ 1 │ +│ 64 │ 4 │ +│ 256 │ 3587 │ +│ 1024 │ 301 │ +│ 2048 │ 267 │ +│ 4096 │ 2033 │ +│ 524288 │ 128 │ +│ 8388608 │ 4 │ +│ 16777216 │ 3 │ +└─────────────────────┴───────┘ + +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [ReportStats]: Data Movement Statistics: sg0002 +┌──────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├──────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 50331648 │ +│ DMACopy │ Internal │ 1 │ 16777216 │ +│ Load │ Const -> Internal │ 4 │ 34824 │ +│ Load │ ExternalInput -> Internal │ 4744 │ 721448972 │ +│ Load │ Internal │ 526 │ 201635848 │ +│ Save │ Internal │ 256 │ 103264260 │ +│ Save │ Internal -> ExternalOutput │ 1 │ 4 │ +│ Save (Spill) │ Internal │ 69 │ 23592960 │ +└──────────────┴────────────────────────────┴───────┴───────────┘ + +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 1 │ +│ 4 │ 9 │ +│ 8 │ 2 │ +│ 16 │ 3 │ +│ 64 │ 2 │ +│ 256 │ 3597 │ +│ 1024 │ 158 │ +│ 2048 │ 132 │ +│ 4096 │ 1696 │ +│ 65536 │ 1 │ +│ 16777216 │ 3 │ +└─────────────────────┴───────┘ + +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [ReportStats]: MM Stats: #MatMults 72625 #MatMult-Transposes 11297 +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [ReportStats]: IO Tensor size combined: 9093865516 +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬───────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼───────────────┼──────────┼──────────────┤ +│ input68_sg0000 │ ExternalInput │ bfloat16 │ 134217728 │ +│ input357_sg0002 │ ExternalInput │ bfloat16 │ 134217728 │ +│ input68 │ ExternalInput │ bfloat16 │ 134217728 │ +│ input357 │ ExternalInput │ bfloat16 │ 134217728 │ +│ input104 │ ExternalInput │ bfloat16 │ 58720256 │ +│ input95 │ ExternalInput │ bfloat16 │ 58720256 │ +│ input86 │ ExternalInput │ bfloat16 │ 58720256 │ +│ input113 │ ExternalInput │ bfloat16 │ 58720256 │ +│ input77 │ ExternalInput │ bfloat16 │ 58720256 │ +│ input149 │ ExternalInput │ bfloat16 │ 58720256 │ +└────────────────────┴───────────────┴──────────┴──────────────┘ + +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [ReportStats]: Large (Internal) Tensor Statistics: +┌─────────────────┬───────────────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├─────────────────┼───────────────────┼──────────┼──────────────┤ +│ intermediate1 │ InternalInterface │ bfloat16 │ 16777216 │ +│ intermediate4 │ InternalInterface │ bfloat16 │ 16777216 │ +│ intermediate18 │ InternalInterface │ bfloat16 │ 16777216 │ +│ intermediate9 │ InternalInterface │ bfloat16 │ 16777216 │ +│ intermediate15 │ InternalInterface │ bfloat16 │ 16777216 │ +│ intermediate12 │ InternalInterface │ bfloat16 │ 16777216 │ +│ intermediate27 │ InternalInterface │ bfloat16 │ 16777216 │ +│ intermediate24 │ InternalInterface │ bfloat16 │ 16777216 │ +│ intermediate21 │ InternalInterface │ bfloat16 │ 16777216 │ +│ intermediate6 │ InternalInterface │ bfloat16 │ 16777216 │ +└─────────────────┴───────────────────┴──────────┴──────────────┘ + +2025-09-05T19:15:34Z USER 1724 [CoreForkPass]: report_stats finished after 0.022 seconds +2025-09-05T19:15:34Z INFO 1724 [CoreForkPass]: curr_vmrss: 838mb, ru_maxrss: 1515mb (delta=0mb) +2025-09-05T19:15:34Z INFO 1724 [CoreForkPass]: Output has 1 module(s), 4 function(s), 28891 memory location(s), 4 block(s), and 108780 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:34Z USER 1724 [CoreForkPass]: Running coloring_allocator_dram_post_lnk +2025-09-05T19:15:34Z INFO 1724 [CoreForkPass]: Inputs to coloring_allocator_dram_post_lnk: modules=1 functions=4 allocs=28891 blocks=4 instructions=108780 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [DRAM_Allocator]: allocating spills in DRAM post_link mode for address space Local +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [DRAM_Allocator]: reserved space = 8322048020 bytes +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [DRAM_Allocator]: spill space = 1078984768 bytes +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [DRAM_Allocator]: aligned spill space = 1079115776 bytes +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [DRAM_Allocator]: renumber locations +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [DRAM_Allocator]: size = 99 +2025-09-05T19:15:34Z INFO 1724 []: find first defs for local +2025-09-05T19:15:34Z INFO 1724 []: find first defs for global +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [DRAM_Allocator]: Num intervals 99 Num locations 99 +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [DRAM_Allocator]: IntervalTree Build Done +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [DRAM_Allocator]: info.neighbors init Done +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [DRAM_Allocator]: IntervalTree readback Done +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [DRAM_Allocator]: simplify interference graph +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [DRAM_Allocator]: initialize low and high +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [DRAM_Allocator]: lo = 99 +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [DRAM_Allocator]: hi = 0 +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [DRAM_Allocator]: total = 99 +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [DRAM_Allocator]: simplify +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [DRAM_Allocator]: new candidates = 0 +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [DRAM_Allocator]: Already used DRAM hwm: 134610944 +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [DRAM_Allocator]: select ranges +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [DRAM_Allocator]: allreduce_dram_hwm 134610944 +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [DRAM_Allocator]: Real CC buffer size 134610944 +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [DRAM_Allocator]: DRAM hwm after allocation: 223752192 +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [DRAM_Allocator]: DRAM allocation successful +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-09-05T19:15:34Z USER 1724 [CoreForkPass]: coloring_allocator_dram_post_lnk finished after 0.066 seconds +2025-09-05T19:15:34Z INFO 1724 [CoreForkPass]: curr_vmrss: 838mb, ru_maxrss: 1515mb (delta=0mb) +2025-09-05T19:15:34Z INFO 1724 [CoreForkPass]: Output has 1 module(s), 4 function(s), 28891 memory location(s), 4 block(s), and 108780 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:34Z USER 1724 [CoreForkPass]: Running memory_analysis_after_coloring_allocator_dram_post_lnk +2025-09-05T19:15:34Z INFO 1724 [CoreForkPass]: Inputs to memory_analysis_after_coloring_allocator_dram_post_lnk: modules=1 functions=4 allocs=28891 blocks=4 instructions=108780 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:34Z USER 1724 [CoreForkPass]: memory_analysis_after_coloring_allocator_dram_post_lnk finished after 0.065 seconds +2025-09-05T19:15:34Z INFO 1724 [CoreForkPass]: curr_vmrss: 840mb, ru_maxrss: 1515mb (delta=0mb) +2025-09-05T19:15:34Z INFO 1724 [CoreForkPass]: Output has 1 module(s), 4 function(s), 28891 memory location(s), 4 block(s), and 108780 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:34Z USER 1724 [CoreForkPass]: Running lower_dynamic_dma +2025-09-05T19:15:34Z INFO 1724 [CoreForkPass]: Inputs to lower_dynamic_dma: modules=1 functions=4 allocs=28891 blocks=4 instructions=108780 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:34Z USER 1724 [CoreForkPass]: lower_dynamic_dma finished after 0.014 seconds +2025-09-05T19:15:34Z INFO 1724 [CoreForkPass]: curr_vmrss: 840mb, ru_maxrss: 1515mb (delta=0mb) +2025-09-05T19:15:34Z INFO 1724 [CoreForkPass]: Output has 1 module(s), 4 function(s), 28891 memory location(s), 4 block(s), and 108780 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:34Z USER 1724 [CoreForkPass]: Running legalize_dynamic_dma +2025-09-05T19:15:34Z INFO 1724 [CoreForkPass]: Inputs to legalize_dynamic_dma: modules=1 functions=4 allocs=28891 blocks=4 instructions=108780 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [LegalizeDynamicDMA]: Legalize Dynamic DMA scanned 1 DGE instructions +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [LegalizeDynamicDMA]: After Legalize Dynamic DMA, 1 DGE instructions were scanned +2025-09-05T19:15:34Z INFO 1724 (sgLnk) [LegalizeDynamicDMA]: +┌───────────┬───────────────────────────────┬────────────────────────────┐ +│ Sub-Pass │ Illegal Instructions Detected │ New Instructions Generated │ +├───────────┼───────────────────────────────┼────────────────────────────┤ +│ Peeling │ 0 │ 0 │ +│ Unrolling │ 0 │ 0 │ +│ Splitting │ 0 │ 0 │ +└───────────┴───────────────────────────────┴────────────────────────────┘ + +2025-09-05T19:15:34Z USER 1724 [CoreForkPass]: legalize_dynamic_dma finished after 0.025 seconds +2025-09-05T19:15:34Z INFO 1724 [CoreForkPass]: curr_vmrss: 840mb, ru_maxrss: 1515mb (delta=0mb) +2025-09-05T19:15:34Z INFO 1724 [CoreForkPass]: Output has 1 module(s), 4 function(s), 28891 memory location(s), 4 block(s), and 108780 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:34Z USER 1724 [CoreForkPass]: Running lower_dma +2025-09-05T19:15:34Z INFO 1724 [CoreForkPass]: Inputs to lower_dma: modules=1 functions=4 allocs=28891 blocks=4 instructions=108780 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [LowerDMA]: lower_dma metrics start + IO + Copy (DGE/DMA) + 128 partition : 153755/153755 (100% DGE) + power-of-2 partition : 153819/153890 (99.9539% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 153819/153890 (99.9539% DGE) + Cast (DGE/DMA) + 128 partition : 127/127 (100% DGE) + power-of-2 partition : 127/129 (98.4496% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 127/129 (98.4496% DGE) + Spill/Reload + Copy (DGE/DMA) + 128 partition : 0/42472 (0% DGE) + power-of-2 partition : 0/44516 (0% DGE) + > 3 dimensional : 0/128 (0% DGE) + non-integer desc size : 0/0 + total : 0/44516 (0% DGE) + Cast (DGE/DMA) + 128 partition : 0/0 + power-of-2 partition : 0/0 + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/0 + CopyMode + CCE : 32 + Transpose : 1 + Replicate : 0 + Dynamic (DGE/DMA) + scalar : 1/1 (100% DGE) + vector : 4128/4128 (100% DGE) + Opcode + ReadVarAddr : 0 + IndirectLoad : 0 + IndirectSave : 0 + IndirectSaveAccumulate : 0 + DstReduceDGE : 0 +lower_dma metrics end +2025-09-05T19:15:35Z USER 1724 [CoreForkPass]: lower_dma finished after 0.357 seconds +2025-09-05T19:15:35Z INFO 1724 [CoreForkPass]: curr_vmrss: 840mb, ru_maxrss: 1515mb (delta=0mb) +2025-09-05T19:15:35Z INFO 1724 [CoreForkPass]: Output has 1 module(s), 4 function(s), 28891 memory location(s), 4 block(s), and 108780 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:35Z USER 1724 [CoreForkPass]: Running expand_all_engine +2025-09-05T19:15:35Z INFO 1724 [CoreForkPass]: Inputs to expand_all_engine: modules=1 functions=4 allocs=28891 blocks=4 instructions=108780 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:35Z USER 1724 [CoreForkPass]: expand_all_engine finished after 0.014 seconds +2025-09-05T19:15:35Z INFO 1724 [CoreForkPass]: curr_vmrss: 840mb, ru_maxrss: 1515mb (delta=0mb) +2025-09-05T19:15:35Z INFO 1724 [CoreForkPass]: Output has 1 module(s), 4 function(s), 28891 memory location(s), 4 block(s), and 108780 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:35Z USER 1724 [CoreForkPass]: Running alloc_semaphores +2025-09-05T19:15:35Z INFO 1724 [CoreForkPass]: Inputs to alloc_semaphores: modules=1 functions=4 allocs=28891 blocks=4 instructions=108780 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:35Z USER 1724 [CoreForkPass]: alloc_semaphores finished after 0.083 seconds +2025-09-05T19:15:35Z INFO 1724 [CoreForkPass]: curr_vmrss: 840mb, ru_maxrss: 1515mb (delta=0mb) +2025-09-05T19:15:35Z INFO 1724 [CoreForkPass]: Output has 1 module(s), 4 function(s), 28891 memory location(s), 4 block(s), and 108780 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:35Z USER 1724 [CoreForkPass]: Running expand_inst_late +2025-09-05T19:15:35Z INFO 1724 [CoreForkPass]: Inputs to expand_inst_late: modules=1 functions=4 allocs=28891 blocks=4 instructions=108780 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:35Z USER 1724 [CoreForkPass]: expand_inst_late finished after 0.094 seconds +2025-09-05T19:15:35Z INFO 1724 [CoreForkPass]: curr_vmrss: 840mb, ru_maxrss: 1515mb (delta=0mb) +2025-09-05T19:15:35Z INFO 1724 [CoreForkPass]: Output has 1 module(s), 4 function(s), 28891 memory location(s), 4 block(s), and 109327 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:35Z USER 1724 [CoreForkPass]: Running seq_inst_opt +2025-09-05T19:15:35Z INFO 1724 [CoreForkPass]: Inputs to seq_inst_opt: modules=1 functions=4 allocs=28891 blocks=4 instructions=109327 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [SeqInstOpt]: Removing 0 unnecessary InstRegisterMove instruction(s) from Block1 +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [SeqInstOpt]: Removing 414 unnecessary InstRegisterMove instruction(s) from Block1 +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [SeqInstOpt]: Removing 127 unnecessary InstRegisterMove instruction(s) from Block1 +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [SeqInstOpt]: Removing 0 unnecessary InstRegisterMove instruction(s) from Block1 +2025-09-05T19:15:35Z USER 1724 [CoreForkPass]: seq_inst_opt finished after 0.009 seconds +2025-09-05T19:15:35Z INFO 1724 [CoreForkPass]: curr_vmrss: 840mb, ru_maxrss: 1515mb (delta=0mb) +2025-09-05T19:15:35Z INFO 1724 [CoreForkPass]: Output has 1 module(s), 4 function(s), 28891 memory location(s), 4 block(s), and 108786 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:35Z USER 1724 [CoreForkPass]: Running lower_sync +2025-09-05T19:15:35Z INFO 1724 [CoreForkPass]: Inputs to lower_sync: modules=1 functions=4 allocs=28891 blocks=4 instructions=108786 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:35Z USER 1724 [CoreForkPass]: lower_sync finished after 0.063 seconds +2025-09-05T19:15:35Z INFO 1724 [CoreForkPass]: curr_vmrss: 872mb, ru_maxrss: 1515mb (delta=0mb) +2025-09-05T19:15:35Z INFO 1724 [CoreForkPass]: Output has 1 module(s), 4 function(s), 28891 memory location(s), 4 block(s), and 120197 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:35Z USER 1724 [CoreForkPass]: Running lower_act +2025-09-05T19:15:35Z INFO 1724 [CoreForkPass]: Inputs to lower_act: modules=1 functions=4 allocs=28891 blocks=4 instructions=120197 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:35Z USER 1724 [CoreForkPass]: lower_act finished after 0.050 seconds +2025-09-05T19:15:35Z INFO 1724 [CoreForkPass]: curr_vmrss: 872mb, ru_maxrss: 1515mb (delta=0mb) +2025-09-05T19:15:35Z INFO 1724 [CoreForkPass]: Output has 1 module(s), 4 function(s), 28891 memory location(s), 4 block(s), and 120238 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:35Z USER 1724 [CoreForkPass]: Running lower_dve +2025-09-05T19:15:35Z INFO 1724 [CoreForkPass]: Inputs to lower_dve: modules=1 functions=4 allocs=28891 blocks=4 instructions=120238 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [LowerDVE]: Loading DVE opcodes table dve_info.json from /opt/conda/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen2/dve_info.json +2025-09-05T19:15:35Z USER 1724 [CoreForkPass]: lower_dve finished after 0.142 seconds +2025-09-05T19:15:35Z INFO 1724 [CoreForkPass]: curr_vmrss: 878mb, ru_maxrss: 1515mb (delta=0mb) +2025-09-05T19:15:35Z INFO 1724 [CoreForkPass]: Output has 1 module(s), 4 function(s), 28891 memory location(s), 4 block(s), and 120238 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:35Z USER 1724 [CoreForkPass]: Running lower_ap +2025-09-05T19:15:35Z INFO 1724 [CoreForkPass]: Inputs to lower_ap: modules=1 functions=4 allocs=28891 blocks=4 instructions=120238 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:35Z USER 1724 [CoreForkPass]: lower_ap finished after 0.019 seconds +2025-09-05T19:15:35Z INFO 1724 [CoreForkPass]: curr_vmrss: 878mb, ru_maxrss: 1515mb (delta=0mb) +2025-09-05T19:15:35Z INFO 1724 [CoreForkPass]: Output has 1 module(s), 4 function(s), 28891 memory location(s), 4 block(s), and 120238 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:35Z USER 1724 [CoreForkPass]: Running coloring_allocator_reg +2025-09-05T19:15:35Z INFO 1724 [CoreForkPass]: Inputs to coloring_allocator_reg: modules=1 functions=4 allocs=28891 blocks=4 instructions=120238 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: allocating REG +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: main loop iteration 1 +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: allocating REG +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: main loop iteration 1 +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: renumber registers +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: size = 2 +2025-09-05T19:15:35Z INFO 1724 []: find first defs for local reg +2025-09-05T19:15:35Z INFO 1724 []: find first defs for global reg +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: live range analysis +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: find costs +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: simplify interference graph +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: initialize low and high +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: lo = 2 +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: hi = 0 +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: inf = 0 +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: total = 2 +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: simplify +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: new candidates = 0 +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: select ranges +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: no more spills +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: allocating REG +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: main loop iteration 1 +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: renumber registers +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: size = 1 +2025-09-05T19:15:35Z INFO 1724 []: find first defs for local reg +2025-09-05T19:15:35Z INFO 1724 []: find first defs for global reg +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: live range analysis +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: find costs +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: simplify interference graph +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: initialize low and high +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: lo = 1 +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: hi = 0 +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: inf = 0 +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: total = 1 +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: simplify +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: new candidates = 0 +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: select ranges +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: no more spills +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: allocating REG +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: main loop iteration 1 +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: renumber registers +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: size = 4 +2025-09-05T19:15:35Z INFO 1724 []: find first defs for local reg +2025-09-05T19:15:35Z INFO 1724 []: find first defs for global reg +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: live range analysis +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: find costs +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: simplify interference graph +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: initialize low and high +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: lo = 4 +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: hi = 0 +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: inf = 0 +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: total = 4 +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: simplify +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: new candidates = 0 +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: select ranges +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: no more spills +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-09-05T19:15:35Z INFO 1724 (sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-09-05T19:15:35Z USER 1724 [CoreForkPass]: coloring_allocator_reg finished after 0.155 seconds +2025-09-05T19:15:35Z INFO 1724 [CoreForkPass]: curr_vmrss: 879mb, ru_maxrss: 1515mb (delta=0mb) +2025-09-05T19:15:35Z INFO 1724 [CoreForkPass]: Output has 1 module(s), 4 function(s), 28891 memory location(s), 4 block(s), and 120238 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:35Z USER 1724 [BackendPassManager]: nc_parallel_pass finished after 2.623 seconds +2025-09-05T19:15:35Z INFO 1724 [BackendPassManager]: curr_vmrss: 879mb, ru_maxrss: 1515mb (delta=543mb) +2025-09-05T19:15:35Z INFO 1724 [BackendPassManager]: Output has 1 module(s), 4 function(s), 28891 memory location(s), 4 block(s), and 120238 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:35Z USER 1724 [BackendPassManager]: Running mod_parallel_pass +2025-09-05T19:15:35Z INFO 1724 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=4 allocs=28891 blocks=4 instructions=120238 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:35Z USER 1724 [ModuleForkPass]: Running birverifier +2025-09-05T19:15:36Z INFO 1724 [ModuleForkPass]: Inputs to birverifier: modules=1 functions=4 allocs=28891 blocks=4 instructions=120238 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:36Z USER 1724 [ModuleForkPass]: birverifier finished after 0.097 seconds +2025-09-05T19:15:36Z INFO 1724 [ModuleForkPass]: curr_vmrss: 893mb, ru_maxrss: 1515mb (delta=0mb) +2025-09-05T19:15:36Z INFO 1724 [ModuleForkPass]: Output has 1 module(s), 4 function(s), 28891 memory location(s), 4 block(s), and 120238 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:36Z USER 1724 [BackendPassManager]: mod_parallel_pass finished after 0.101 seconds +2025-09-05T19:15:36Z INFO 1724 [BackendPassManager]: curr_vmrss: 893mb, ru_maxrss: 1515mb (delta=0mb) +2025-09-05T19:15:36Z INFO 1724 [BackendPassManager]: Output has 1 module(s), 4 function(s), 28891 memory location(s), 4 block(s), and 120238 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:36Z USER 1724 [BackendPassManager]: Running subgraph_parallel_pass +2025-09-05T19:15:36Z INFO 1724 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=1 functions=4 allocs=28891 blocks=4 instructions=120238 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:36Z USER 1724 [SubgraphForkPass]: Running lnc_verifier +2025-09-05T19:15:36Z INFO 1724 [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=4 allocs=28891 blocks=4 instructions=120238 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:36Z USER 1724 [SubgraphForkPass]: lnc_verifier finished after 0.001 seconds +2025-09-05T19:15:36Z INFO 1724 [SubgraphForkPass]: curr_vmrss: 893mb, ru_maxrss: 1515mb (delta=0mb) +2025-09-05T19:15:36Z INFO 1724 [SubgraphForkPass]: Output has 1 module(s), 4 function(s), 28891 memory location(s), 4 block(s), and 120238 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:36Z USER 1724 [BackendPassManager]: subgraph_parallel_pass finished after 0.003 seconds +2025-09-05T19:15:36Z INFO 1724 [BackendPassManager]: curr_vmrss: 893mb, ru_maxrss: 1515mb (delta=0mb) +2025-09-05T19:15:36Z INFO 1724 [BackendPassManager]: Output has 1 module(s), 4 function(s), 28891 memory location(s), 4 block(s), and 120238 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:36Z USER 1724 [BackendPassManager]: Running mod_parallel_pass +2025-09-05T19:15:36Z INFO 1724 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=4 allocs=28891 blocks=4 instructions=120238 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:36Z USER 1724 [ModuleForkPass]: Running codegen +2025-09-05T19:15:36Z INFO 1724 [ModuleForkPass]: Inputs to codegen: modules=1 functions=4 allocs=28891 blocks=4 instructions=120238 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:36Z INFO 1724 (sgLnk) [Codegen]: Total compiler allocated DRAM tensors: 0.208385 GB +2025-09-05T19:15:36Z INFO 1724 (sgLnk) [Codegen]: Total un-allocated DRAM tensors by kind: +2025-09-05T19:15:36Z INFO 1724 (sgLnk) [Codegen]: +┌────────────────┬─────────────┐ +│ TensorKind │ Size (GB) │ +├────────────────┼─────────────┤ +│ ExternalInput │ 6.75051 │ +│ ExternalOutput │ 0.5 │ +│ Const │ 0.000215776 │ +└────────────────┴─────────────┘ + +2025-09-05T19:15:36Z INFO 1724 (sgLnk) [Codegen]: Total runtime managed DRAM tensors: 7.25073 GB +2025-09-05T19:15:36Z INFO 1724 (sgLnk) [Codegen]: Instruction Stats: +2025-09-05T19:15:36Z INFO 1724 (sgLnk) [Codegen]: +┌─────────────────────┬───────┐ +│ Opcode │ Count │ +├─────────────────────┼───────┤ +│ MATMUL │ 72641 │ +│ LDWEIGHTS │ 71477 │ +│ ACTIVATE │ 12070 │ +│ EVENT_SEMAPHORE │ 11411 │ +│ UNKNOWN(0xd4) │ 10115 │ +│ PSEUDO_DMA_TRIGGER │ 2832 │ +│ TENSOR_TENSOR │ 1881 │ +│ UNKNOWN(0x24) │ 1664 │ +│ UNKNOWN(0x8d) │ 1664 │ +│ UNKNOWN(0xe8) │ 1032 │ +│ COPY │ 1001 │ +│ TENSOR_SCALAR_ADDR │ 683 │ +│ UNKNOWN(0x8b) │ 544 │ +│ TENSOR_REDUCE │ 526 │ +│ MEMSET │ 424 │ +│ TENSOR_SCALAR │ 415 │ +│ UNKNOWN(0xda) │ 301 │ +│ RECIPROCAL │ 259 │ +│ UNKNOWN(0x92) │ 256 │ +│ UNKNOWN(0x8a) │ 256 │ +│ CAST │ 183 │ +│ UNKNOWN(0xd3) │ 165 │ +│ MATCH_VALUE_LOAD │ 126 │ +│ IOTA │ 69 │ +│ FIND_INDEX8 │ 64 │ +│ MAX8 │ 64 │ +│ MATCH_REPLACE8 │ 62 │ +│ ACT_TABLE_LOAD │ 41 │ +│ PSEUDO_BRANCH_LABEL │ 20 │ +│ UNKNOWN(0xd2) │ 15 │ +│ PSEUDO_DMA_REARM │ 12 │ +│ UNKNOWN(0xcf) │ 12 │ +│ LOAD_MASK_SELECT │ 12 │ +│ UNKNOWN(0xd9) │ 12 │ +│ STREAM_SHUFFLE │ 12 │ +│ GATHER │ 3 │ +│ MOVE │ 3 │ +│ POOL_BUFFER_LOAD │ 3 │ +│ ALU_OP │ 2 │ +│ UNKNOWN(0xe5) │ 2 │ +│ PSEUDO_TENSOR_LOAD │ 1 │ +│ TENSOR_SCALAR │ 1 │ +│ RNG │ 1 │ +└─────────────────────┴───────┘ + +2025-09-05T19:15:36Z INFO 1724 (sgLnk) [Codegen]: +┌────────────┬────────┐ +│ Engine │ Count │ +├────────────┼────────┤ +│ Unassigned │ 0 │ +│ GPSIMD │ 15591 │ +│ Scalar │ 15443 │ +│ Tensor │ 148708 │ +│ SyncDMA │ 0 │ +│ Vector │ 9643 │ +│ Sync │ 2972 │ +│ All │ 0 │ +└────────────┴────────┘ + +2025-09-05T19:15:36Z INFO 1724 (sgLnk) [Codegen]: Total instructions: 192357 (0.0114654 GB) +2025-09-05T19:15:36Z INFO 1724 (sgLnk) [Codegen]: Total DynamicDMA instruction count: 10115 +2025-09-05T19:15:36Z USER 1724 (sgLnk) [Codegen]: isa_gen finished after 0.482 seconds +2025-09-05T19:15:36Z INFO 1724 (sgLnk) [Codegen]: Number of DMA descriptors on each queue instance: +┌───────────────────────────┬────────────────┐ +│ Queue Instance │ RT Descriptors │ +├───────────────────────────┼────────────────┤ +│ qActSpillReload0_defId_0 │ 37632 │ +│ qActSpillReload0_defId_1 │ 68608 │ +│ qActSpillReload0_defId_2 │ 48200 │ +│ qDVESpillReload0_defId_0 │ 9728 │ +│ qDVESpillReload0_defId_1 │ 38400 │ +│ qDVESpillReload0_defId_2 │ 21514 │ +│ qPoolIO0 │ 2 │ +│ qPoolPIOParam0 │ 64 │ +│ qPoolSpillReload0_defId_0 │ 26626 │ +│ qPoolSpillReload0_defId_1 │ 14336 │ +│ qPoolSpillReload0_defId_2 │ 11270 │ +│ qSPIO0 │ 262384 │ +│ qSPSpillReload0_defId_0 │ 94726 │ +│ qSPSpillReload0_defId_1 │ 241664 │ +│ qSPSpillReload0_defId_2 │ 123672 │ +└───────────────────────────┴────────────────┘ + +Total descriptors: 998826 (0.0148837 GB) +2025-09-05T19:15:36Z INFO 1724 (sgLnk) [Codegen]: Number of DMA engines used by each queue: +┌───────────────────┬──────────────────────┐ +│ Queue │ DMA Engines │ +├───────────────────┼──────────────────────┤ +│ qSPIO0 │ 16 │ +│ qSPSpillReload0 │ 16 │ +│ qPoolDynamic │ 16 │ +│ qActSpillReload0 │ 16 │ +│ qPoolSpillReload0 │ 16 │ +│ qDVESpillReload0 │ 16 │ +│ qPoolIO0 │ 16 │ +│ qPoolPIOParam0 │ 16 │ +├───────────────────┼──────────────────────┤ +│ TOTAL │ 128 (must be <= 176) │ +└───────────────────┴──────────────────────┘ + +2025-09-05T19:15:36Z INFO 1724 (sgLnk) [Codegen]: Tensors with largest descriptor count: +┌────────────────────────────────────────────┬───────────────┬──────────┬──────────────────┐ +│ Tensor Name │ Kind │ Src Type │ Descriptor Count │ +├────────────────────────────────────────────┼───────────────┼──────────┼──────────────────┤ +│ input78_local_1128_i14_SpillSave527_sg0001 │ Internal │ bfloat16 │ 17 │ +│ all-reduce.475.1564_sg0001 │ Internal │ bfloat16 │ 31 │ +│ compare.2.1542_sg0001 │ Internal │ int32 │ 31 │ +│ input2 │ ExternalInput │ int32 │ 32 │ +│ all_gather.1_i1_sg0000 │ Internal │ bfloat16 │ 32 │ +│ all_gather.1_i0_sg0000 │ Internal │ bfloat16 │ 32 │ +│ convert.49_sg0002 │ Internal │ bfloat16 │ 32 │ +│ convert.51_sg0002 │ Internal │ float32 │ 33 │ +│ add.9_sg0002 │ Internal │ bfloat16 │ 33 │ +│ add.4_sg0001 │ Internal │ bfloat16 │ 63 │ +└────────────────────────────────────────────┴───────────────┴──────────┴──────────────────┘ + +2025-09-05T19:15:36Z USER 1724 (sgLnk) [Codegen]: dma_desc_gen finished after 0.075 seconds +2025-09-05T19:15:36Z INFO 1724 (sgLnk) [Codegen]: Estimated peak DRAM usage: 7.48546 GB +2025-09-05T19:15:36Z INFO 1724 (sgLnk) [Codegen]: Generating debug info +2025-09-05T19:15:36Z USER 1724 (sgLnk) [Codegen]: debug_info_gen finished after 0.316 seconds +2025-09-05T19:15:37Z USER 1724 [ModuleForkPass]: codegen finished after 0.913 seconds +2025-09-05T19:15:37Z INFO 1724 [ModuleForkPass]: curr_vmrss: 993mb, ru_maxrss: 1515mb (delta=0mb) +2025-09-05T19:15:37Z INFO 1724 [ModuleForkPass]: Output has 1 module(s), 4 function(s), 28891 memory location(s), 4 block(s), and 120238 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:37Z USER 1724 [BackendPassManager]: mod_parallel_pass finished after 0.923 seconds +2025-09-05T19:15:37Z INFO 1724 [BackendPassManager]: curr_vmrss: 993mb, ru_maxrss: 1515mb (delta=0mb) +2025-09-05T19:15:37Z INFO 1724 [BackendPassManager]: Output has 1 module(s), 4 function(s), 28891 memory location(s), 4 block(s), and 120238 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:37Z USER 1724 [BackendPassManager]: Running neff_packager +2025-09-05T19:15:37Z INFO 1724 [BackendPassManager]: Inputs to neff_packager: modules=1 functions=4 allocs=28891 blocks=4 instructions=120238 Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:37Z INFO 1724 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.9-1401_CRSM.npy +2025-09-05T19:15:37Z INFO 1724 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.4-1343-1403_CRSM.npy +2025-09-05T19:15:37Z INFO 1724 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.3-1353-1405_CRSM.npy +2025-09-05T19:15:37Z INFO 1724 [NeffPackager]: FileDeDuper file not found value_sg0000_identity_1620_CRSM.npy +2025-09-05T19:15:37Z INFO 1724 [NeffPackager]: FileDeDuper file not found value_sg0001_constant.13-1136-1199_CRSM.npy +2025-09-05T19:15:37Z INFO 1724 [NeffPackager]: FileDeDuper file not found value_sg0001_constant.12-1146-1201_CRSM.npy +2025-09-05T19:15:37Z INFO 1724 [NeffPackager]: FileDeDuper file not found value_sg0001_identity_1357_CRSM.npy +2025-09-05T19:15:37Z INFO 1724 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.24_CRSM.npy +2025-09-05T19:15:37Z INFO 1724 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.25_CRSM.npy +2025-09-05T19:15:37Z INFO 1724 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.26-817-929_CRSM.npy +2025-09-05T19:15:37Z INFO 1724 [NeffPackager]: FileDeDuper file not found value_sg0002_identity_1082_CRSM.npy +2025-09-05T19:15:37Z INFO 1724 [NeffPackager]: Const File de-dup saved 0 KB of memory footprint +2025-09-05T19:15:37Z WARNING 1724 [NeffFileWriter]: writeKelp missing file /local/p4clients/pkgbuild-const/workspace/build/KaenaCompiler/KaenaCompiler-2.x.169490.0/AL2_x86_64/DEV.STD.PTHREAD/build/private/_skbuild/linux-x86_64-3.10/cmake-build/neuronxcc/walrus/neff_packager/MetricMetadata.json +2025-09-05T19:15:37Z INFO 1724 [NeffFileWriter]: Neff will be written to: /models/mistral-7b-v0.3-instruct-neuronx/context_encoding_model/_tp0_bk4/model.MODULE_d342327da795afc2aa68+5e8b788a.neff +2025-09-05T19:15:37Z INFO 1724 [NeffFileWriter]: IR signature: 39ecba9b318b0e559a3c43c6ea6cdee8 for neff artifacts +2025-09-05T19:15:37Z USER 1724 [BackendPassManager]: neff_packager finished after 0.438 seconds +2025-09-05T19:15:37Z INFO 1724 [BackendPassManager]: curr_vmrss: 993mb, ru_maxrss: 1515mb (delta=0mb) +2025-09-05T19:15:37Z INFO 1724 [BackendPassManager]: Output has 1 module(s), 4 function(s), 28891 memory location(s), 4 block(s), and 120238 instruction(s). Max writers: 64 Max Readers: 5920 +2025-09-05T19:15:37Z INFO 1724 [BackendDriver]: HBM scratchpad usage summary (post-allocation): +┌──────┬───────────┬────────────────────────────────────────────────────────────┬─────────────┐ +│ Core │ Subgraph │ Description │ Value │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ nc00 │ sg00 │ Peak scratchpad usage: local │ 0.084473 GB │ +│ nc00 │ sg00 │ Total size of allocated tensors: local │ 0.092533 GB │ +│ nc00 │ sg01 │ Peak scratchpad usage: local │ 0.125366 GB │ +│ nc00 │ sg01 │ Total size of allocated tensors: local │ 0.200928 GB │ +│ nc00 │ sg02 │ Peak scratchpad usage: local │ 0.097580 GB │ +│ nc00 │ sg02 │ Total size of allocated tensors: local │ 0.145042 GB │ +│ nc00 │ Max │ Peak scratchpad usage: local │ 0.125366 GB │ +│ nc00 │ Post-link │ Peak scratchpad usage after intermediate tensor allocation │ 0.208385 GB │ +│ nc00 │ Post-link │ Total size of allocated intermediate tensors │ 1.005005 GB │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ Max │ Max │ Peak scratchpad usage │ 0.208385 GB │ +│ Max │ Max │ Peak scratchpad usage (page-aligned) │ 0.500000 GB │ +└──────┴───────────┴────────────────────────────────────────────────────────────┴─────────────┘ + +2025-09-05T19:15:37Z INFO 1724 [BackendDriver]: Backend completed successfully, tearing down. +2025-09-05T19:15:37Z INFO 1166 [job.WalrusDriver.0]: new_lnkState: {"model": ["/models/mistral-7b-v0.3-instruct-neuronx/context_encoding_model/_tp0_bk4/model.MODULE_d342327da795afc2aa68+5e8b788a.hlo_module.pb"], "tensormap": "tensor_map.json", "bir": "walrus_bir.out.json", "lorean_sg_key": null, "input_name_map": null, "output_name_map": null, "constant_tensors": null, "cached_wavegraph": "walrus_bir.out.json", "state_dir": "/models/mistral-7b-v0.3-instruct-neuronx/context_encoding_model/_tp0_bk4/neuronxcc-p52odp_y/sgLnk/sg00", "state_id": "sgLnk"} +2025-09-05T19:15:37Z INFO 1166 [job.WalrusDriver.0]: MTBackend: completed successfully. +2025-09-05T19:15:37Z INFO 1166 [pipeline.Pipeline.0]: Finished job job.WalrusDriver.0 +2025-09-05T19:15:37Z INFO 1166 [pipeline.Pipeline.0]: Starting job job.BIRLinker.0 +2025-09-05T19:15:37Z INFO 1166 [job.BIRLinker.0]: Replay this job by calling: /opt/conda/bin/neuronx-cc compile --framework XLA --state '{"model": ["/models/mistral-7b-v0.3-instruct-neuronx/context_encoding_model/_tp0_bk4/model.MODULE_d342327da795afc2aa68+5e8b788a.hlo_module.pb"], "tensormap": "tensor_map.json", "bir": "walrus_bir.out.json", "lorean_sg_key": null, "input_name_map": null, "output_name_map": null, "constant_tensors": null, "cached_wavegraph": "walrus_bir.out.json", "state_dir": "/models/mistral-7b-v0.3-instruct-neuronx/context_encoding_model/_tp0_bk4/neuronxcc-p52odp_y/sgLnk/sg00", "state_id": "sgLnk"}' --pipeline BIRLinker +2025-09-05T19:15:37Z INFO 1166 [job.BIRLinker.0]: BIRLinker cwd: /models/mistral-7b-v0.3-instruct-neuronx/context_encoding_model/_tp0_bk4/neuronxcc-p52odp_y +2025-09-05T19:15:37Z INFO 1166 [job.BIRLinker.0]: Linking already done. +2025-09-05T19:15:37Z INFO 1166 [pipeline.Pipeline.0]: Finished job job.BIRLinker.0 +2025-09-05T19:15:37Z INFO 1166 [pipeline.Pipeline.0]: Starting job job.Kelper.0 +2025-09-05T19:15:37Z INFO 1166 [job.Kelper.0]: Skipping neff generation which was already performed by neff_packager +2025-09-05T19:15:37Z INFO 1166 [pipeline.Pipeline.0]: Finished job job.Kelper.0 +2025-09-05T19:15:37Z INFO 1166 [pipeline.Pipeline.0]: Starting job job.NeffWrapper.0 +2025-09-05T19:15:37Z INFO 1166 [job.NeffWrapper.0]: Job NeffWrapper len(in_states) 1 +2025-09-05T19:15:37Z INFO 1166 [job.NeffWrapper.0]: Processing input #0 +2025-09-05T19:15:37Z INFO 1166 [job.NeffWrapper.0]: Start NeffWrapper +2025-09-05T19:15:37Z INFO 1166 [job.NeffWrapper.0]: Executing: /opt/conda/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo-neff-wrapper --hlo /models/mistral-7b-v0.3-instruct-neuronx/context_encoding_model/_tp0_bk4/model.MODULE_d342327da795afc2aa68+5e8b788a.hlo_module.pb --neff /models/mistral-7b-v0.3-instruct-neuronx/context_encoding_model/_tp0_bk4/model.MODULE_d342327da795afc2aa68+5e8b788a.neff --io_transposes /models/mistral-7b-v0.3-instruct-neuronx/context_encoding_model/_tp0_bk4/neuronxcc-p52odp_y/io_transposes.json --output /models/mistral-7b-v0.3-instruct-neuronx/context_encoding_model/_tp0_bk4/wrapped_neff.hlo --netlist /models/mistral-7b-v0.3-instruct-neuronx/context_encoding_model/_tp0_bk4/neuronxcc-p52odp_y/hlo_netlist.json +2025-09-05T19:15:37Z INFO 1166 [job.NeffWrapper.0]: There are no io transposes nor zero-sized parameters. Output will not be produced. +Hlo neff wrapper finished successfully. Have a wonderful day :D + +2025-09-05T19:15:37Z INFO 1166 [job.NeffWrapper.0]: Job #0 finished +2025-09-05T19:15:37Z INFO 1166 [pipeline.Pipeline.0]: Finished job job.NeffWrapper.0 +2025-09-05T19:15:37Z INFO 1166 [pipeline.Pipeline.0]: Finished pipeline Pipeline +2025-09-05T19:15:37Z INFO 1166 [pipeline.Pipeline.0]: Job #0 finished +2025-09-05T19:15:38Z INFO 750 [root]: Subcommand returned with exitcode=0 diff --git a/context_encoding_model/_tp0_bk4/metaneff.pb b/context_encoding_model/_tp0_bk4/metaneff.pb new file mode 100644 index 0000000000000000000000000000000000000000..2a7e668455074a247d53defe2d3ead071cc5bbfd --- /dev/null +++ b/context_encoding_model/_tp0_bk4/metaneff.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14460b3b6b308407432a80fca62093da7cc19d3c26c9d018e923b97ffe30fde0 +size 2347463 diff --git a/context_encoding_model/_tp0_bk4/model.MODULE_d342327da795afc2aa68+5e8b788a.hlo_module.pb b/context_encoding_model/_tp0_bk4/model.MODULE_d342327da795afc2aa68+5e8b788a.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..db8a9d717f6f8a7008e23c12e567627c5801f8e3 --- /dev/null +++ b/context_encoding_model/_tp0_bk4/model.MODULE_d342327da795afc2aa68+5e8b788a.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:592a9cdc4c9b4697249af595e7e4e7ae477f80acdebaede8842f0734e5baf50e +size 2413336 diff --git a/context_encoding_model/_tp0_bk4/model.MODULE_d342327da795afc2aa68+5e8b788a.neff b/context_encoding_model/_tp0_bk4/model.MODULE_d342327da795afc2aa68+5e8b788a.neff new file mode 100644 index 0000000000000000000000000000000000000000..5ed302ca091dd9016ef3fabb2b6ddebfddb3fe5b --- /dev/null +++ b/context_encoding_model/_tp0_bk4/model.MODULE_d342327da795afc2aa68+5e8b788a.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93d468f8c91c8ae558da4744c631e0351092b98d3698d8a39f05082867c022a7 +size 3298304 diff --git a/context_encoding_model/_tp0_bk4/neuron_config.json b/context_encoding_model/_tp0_bk4/neuron_config.json new file mode 100644 index 0000000000000000000000000000000000000000..cf772551a5e5ee8f32b808629897f763e444bee3 --- /dev/null +++ b/context_encoding_model/_tp0_bk4/neuron_config.json @@ -0,0 +1,213 @@ +{ + "_attn_implementation_autoset": false, + "_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3", + "add_cross_attention": false, + "architectures": [ + "MistralForCausalLM" + ], + "attention_dropout": 0.0, + "attribute_map": {}, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 1, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 2, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "fused_spec_config": null, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 14336, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 32768, + "metadata": null, + "min_length": 0, + "model_type": "mistral", + "neuron_config": { + "activation_quantization_type": null, + "allow_input_truncation": false, + "apply_seq_ids_mask": false, + "async_mode": false, + "attention_dp_degree": 1, + "attention_dtype": null, + "attn_block_cte_nki_kernel_enabled": false, + "attn_block_tkg_nki_kernel_cache_update": false, + "attn_block_tkg_nki_kernel_enabled": false, + "attn_cls": "NeuronLlamaAttention", + "attn_kernel_enabled": null, + "attn_tkg_builtin_kernel_enabled": false, + "attn_tkg_nki_kernel_enabled": false, + "batch_size": 1, + "bucket_n_active_tokens": true, + "buckets": [ + 2048 + ], + "cast_type": "config", + "cc_pipeline_tiling_factor": 2, + "chunked_prefill_config": null, + "context_encoding_buckets": [ + 2048 + ], + "cp_degree": 1, + "ctx_batch_size": 1, + "disable_kv_cache_tiling": false, + "draft_model_modules_to_not_convert": null, + "enable_bucketing": true, + "enable_eagle_draft_input_norm": false, + "enable_eagle_speculation": false, + "enable_fused_speculation": false, + "enable_long_context_mode": false, + "enable_output_completion_notifications": false, + "enable_spill_reload_dge": false, + "enable_token_tree": false, + "ep_degree": 1, + "expert_mlp_nki_kernel_enabled": null, + "flash_decoding_enabled": false, + "fused_qkv": false, + "fused_rmsnorm_skip_gamma": false, + "is_block_kv_layout": null, + "is_chunked_prefill": false, + "is_continuous_batching": true, + "is_eagle_draft": false, + "is_medusa": false, + "is_prefill_stage": true, + "is_prefix_caching": false, + "k_cache_transposed": false, + "kv_cache_batch_size": 4, + "kv_cache_padding_size": 0, + "kv_cache_quant": false, + "kv_cache_tiling": false, + "layer_boundary_markers": false, + "lm_head_pad": false, + "lm_head_pad_alignment_size": 1, + "local_ranks_size": 2, + "logical_nc_config": 1, + "lora_config": null, + "max_batch_size": 4, + "max_context_length": 2048, + "max_length": 2048, + "max_new_tokens": null, + "medusa_speculation_length": 0, + "medusa_tree": null, + "mlp_kernel_enabled": false, + "mlp_kernel_fuse_residual_add": false, + "modules_to_not_convert": null, + "moe_fused_nki_kernel_enabled": null, + "n_active_tokens": 2048, + "n_positions": 2048, + "num_medusa_heads": 0, + "on_cpu": false, + "on_device_sampling_config": { + "deterministic": false, + "do_sample": false, + "dynamic": true, + "global_topk": 256, + "on_device_sampling_config": true, + "temperature": 1.0, + "top_k": 1, + "top_k_kernel_enabled": false, + "top_p": 1.0 + }, + "output_logits": false, + "overrides_torch_dtype": true, + "pa_block_size": 2048, + "pa_num_blocks": 4, + "padding_side": "right", + "pp_degree": 1, + "prefix_buckets": null, + "qk_layernorm": false, + "qkv_kernel_enabled": false, + "qkv_kernel_fuse_residual_add": false, + "qkv_kernel_nbsd_layout": false, + "quantization_dtype": "int8", + "quantization_type": "per_tensor_symmetric", + "quantize_clamp_bound": Infinity, + "quantized": false, + "quantized_checkpoints_path": null, + "quantized_mlp_kernel_enabled": false, + "rmsnorm_quantize_kernel_enabled": false, + "router_topk_nki_kernel_enabled": null, + "rpl_reduce_dtype": null, + "save_sharded_checkpoint": true, + "scratchpad_page_size": null, + "seq_len": 2048, + "seq_len_threshold_for_cc_tiling": 16384, + "sequence_parallel_enabled": false, + "shared_mlp_nki_kernel_enabled": null, + "skip_sharding": false, + "skip_warmup": false, + "spec_batch_size": 4, + "speculation_length": 0, + "start_rank_id": 0, + "target": null, + "tile_cc": false, + "tkg_batch_size": 4, + "token_generation_buckets": null, + "token_tree_config": null, + "torch_dtype": "bfloat16", + "tp_degree": 2, + "vocab_parallel": false, + "weight_gather_seq_len_threshold": 32768, + "weights_to_skip_layout_optimization": [], + "world_size": 2 + }, + "no_repeat_ngram_size": 0, + "num_attention_heads": 32, + "num_beam_groups": 1, + "num_beams": 1, + "num_cores_per_group": 1, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-05, + "rope_theta": 1000000.0, + "sep_token_id": null, + "sliding_window": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": false, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torchscript": false, + "transformers_version": "4.42.0.dev0", + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 32768 +} diff --git a/layout_opt/command.txt b/layout_opt/command.txt new file mode 100644 index 0000000000000000000000000000000000000000..46d6fa42a12e872a9e04b4e312b9fc70e847859c --- /dev/null +++ b/layout_opt/command.txt @@ -0,0 +1 @@ +neuronx-cc compile graph.hlo --framework XLA --target trn1 --output graph.neff --model-type=transformer -O1 --lnc=1 '--internal-hlo2tensorizer-options=--experimental-unsafe-fp8e4m3fn-as-fp8e4m3 --verify-hlo=false' --logfile=log-neuron-cc.txt --verbose=35 \ No newline at end of file diff --git a/layout_opt/graph.neff b/layout_opt/graph.neff new file mode 100644 index 0000000000000000000000000000000000000000..5f754c76818fbaa3c53adb5b4e24e928f4c815f0 --- /dev/null +++ b/layout_opt/graph.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:140da06783df36b3d25d8903dc194df46247db9b5b03ef10b1abebe30d252275 +size 5848064 diff --git a/layout_opt/log-neuron-cc.txt b/layout_opt/log-neuron-cc.txt new file mode 100644 index 0000000000000000000000000000000000000000..2a1a173aae03961fbe4efde51f872238c3034656 --- /dev/null +++ b/layout_opt/log-neuron-cc.txt @@ -0,0 +1,1958 @@ +2025-09-05T19:18:35Z INFO 1939 [root]: /opt/conda/bin/neuronx-cc compile /models/mistral-7b-v0.3-instruct-neuronx/layout_opt/model/graph.hlo --framework XLA --target trn1 --output /models/mistral-7b-v0.3-instruct-neuronx/layout_opt/graph.neff --model-type=transformer -O1 --lnc=1 '--internal-hlo2tensorizer-options=--experimental-unsafe-fp8e4m3fn-as-fp8e4m3 --verify-hlo=false' --logfile=/models/mistral-7b-v0.3-instruct-neuronx/layout_opt/log-neuron-cc.txt --verbose=35 +2025-09-05T19:18:41Z INFO 1939 [root]: NeuronX Compiler version 2.20.9961.0+0acef03a Python version 3.10.17 HWM version 2.20.0.9961+0acef03a NumPy version 1.26.4 +2025-09-05T19:18:41Z INFO 1971 [root]: XLA detected +2025-09-05T19:18:41Z INFO 1971 [root]: Pipeline: HLOToTensorizer Frontend StaticIOTranspose WalrusDriver BIRLinker Kelper NeffWrapper +2025-09-05T19:18:41Z INFO 1971 [root]: Intermediate files stored in /opt/vllm/neuronxcc-gss0wi_w, output in /opt/vllm +2025-09-05T19:18:41Z INFO 1971 [pipeline.Pipeline.0]: Job Pipeline len(in_states) 1 +2025-09-05T19:18:41Z INFO 1971 [pipeline.Pipeline.0]: Processing input #0 +2025-09-05T19:18:41Z INFO 1971 [pipeline.Pipeline.0]: Running pipeline Pipeline.0 +2025-09-05T19:18:41Z INFO 1971 [pipeline.Pipeline.0]: Starting job job.HLOToTensorizer.0 +2025-09-05T19:18:41Z INFO 1971 [job.HLOToTensorizer.0]: Job HLOToTensorizer len(in_states) 1 +2025-09-05T19:18:41Z INFO 1971 [job.HLOToTensorizer.0]: Processing input #0 +2025-09-05T19:18:41Z INFO 1971 [job.HLOToTensorizer.0]: IR signature: b7a76fc5f3f76d1d69d57e0e784721bafd07e3a61734f6594e8c815123a8a771 for graph.hlo +2025-09-05T19:18:41Z INFO 1971 [job.HLOToTensorizer.0]: Executing: /opt/conda/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo2penguin --input /models/mistral-7b-v0.3-instruct-neuronx/layout_opt/model/graph.hlo --out-dir ./ --output penguin.py --remat --max-costly-ops=2 --max-live-in-size=5 --max-remat-chain-size=10 --max-mem-multiple=1.8 --min-def-use-distance=500 --remat-policy=transformer --allow-same-pass-remat=true --layers-per-module=1 --partition --emit-tensor-level-dropout-ops --experimental-unsafe-fp8e4m3fn-as-fp8e4m3 --verify-hlo=false --native-to-custom-softmax --partitioner-opts='--transformer' +2025-09-05T19:18:41Z INFO 1971 [job.HLOToTensorizer.0]: DEBUG: needsModular? No. macCnt 0 num non-trivial Ops 290 +INFO: Switching to single-module compile. PrePartitionPipe skipped. +INFO: Found memory bound graph +INFO: Number of Native SoftmaxDx's detected and replaced: 0 +INFO: Number of Native Softmax's detected and replaced: 0 +Replaced 0 dropout sequences with OffloadedDropout +INFO: HloMacCount has found 0 +INFO: Traffic has found 7248289792 +INFO: AIF 0 +HLO Ops used in computation: parameter reshape transpose tuple +Warning: Could not open file debug_info_hlo_partitions.json +2025-09-05 19:18:41.368875: W hilo/hlo2penguin/utils/DumpDebugInfo.cc:52] Truncating long HLO operator name %last = tuple(%p68, %transpose.290, %transpose.291, %transpose.292, %transpose.293, %transpose.294, %transpose.295, %transpose.296, %transpose.297, %transpose.298, %transpose.299, %transpose.300, %transpose.301, %transpose.302, %transpose.303, %transpose.304, %transpose.305, %transpose.306, %transpose.307, %transpose.308, %transpose.309, %transpose.310, %transpose.311, %transpose.312, %transpose.313, %transpose.314, %transpose.315, %transpose.316, %transpose.317, %transpose.318, %transpose.319, %transpos... to 512 characters in the compiler's debug metadata +Invoking RemoveOptimizationBarriers pass + +2025-09-05T19:18:41Z INFO 1971 [job.HLOToTensorizer.0]: IR signature: 2e0dd598350cceee9da216d825f016375db09d3b92ca63d286d74eb4016c4d07 for sg0000/HLOToTensorizer +2025-09-05T19:18:41Z INFO 1971 [job.HLOToTensorizer.0]: Job #0 finished +2025-09-05T19:18:41Z INFO 1971 [pipeline.Pipeline.0]: Finished job job.HLOToTensorizer.0 +2025-09-05T19:18:41Z INFO 1971 [pipeline.Pipeline.0]: Starting job job.Frontend.0 +2025-09-05T19:18:41Z INFO 1971 [job.Frontend.0]: Job Frontend len(in_states) 1 +2025-09-05T19:18:41Z INFO 1971 [job.Frontend.0]: Processing input #0 +2025-09-05T19:18:41Z INFO 1971 [job.Frontend.0]: Start model loading +2025-09-05T19:18:41Z INFO 1971 [job.Frontend.0]: Start tensorization +2025-09-05T19:18:41Z INFO 1971 [job.Frontend.0]: Num jobs: 32 +2025-09-05T19:18:41Z USER 1971 [root/Tensorizer/Tensorizer]: Running Tensorizer +2025-09-05T19:18:41Z INFO 1971 [Tensorizer]: Frontend did not find netlist info. Switching to flat flow. +2025-09-05T19:18:41Z INFO 1971 [Tensorizer]: Building model from Penguin script "penguin.py"... +2025-09-05T19:18:41Z INFO 1971 [Tensorizer]: Tensorizer options: --run-pg-layout-and-tiling --enable-dse-after-mask-propagation --disable-concat-delinearizer --num-neuroncores-per-sengine=1 --num-neuroncores-per-sengine=1 --internal_dynamic_dma_scratch_size_per_partition=16384 --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=matmult-bf16 --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --hbm-scratchpad-page-size-in-bytes=536870912 --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --enable-tritium-loopfusion --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check --enable-dge-on-io-dma --enable-dge-on-indirect-dma --enable-dge-on-vector-indirect-dma --keep-rng-tensor-op +2025-09-05T19:18:41Z INFO 1971 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-09-05T19:18:41Z INFO 1971 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-09-05T19:18:41Z INFO 1971 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-09-05T19:18:41Z INFO 1971 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Running LegalizeOpLevelAlias +2025-09-05T19:18:41Z INFO 1971 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Finished (changed=False) +2025-09-05T19:18:41Z INFO 1971 [sg0000/Tensorizer/LegalizeOpLevelAlias]: LegalizeOpLevelAlias finished after 0.003 seconds +2025-09-05T19:18:41Z INFO 1971 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Running OptimizeAliasedCopyChain +2025-09-05T19:18:41Z INFO 1971 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Finished (changed=False) +2025-09-05T19:18:41Z INFO 1971 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: OptimizeAliasedCopyChain finished after 0.004 seconds +2025-09-05T19:18:41Z INFO 1971 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-09-05T19:18:41Z INFO 1971 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-09-05T19:18:41Z INFO 1971 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.032 seconds +2025-09-05T19:18:41Z INFO 1971 [sg0000/Tensorizer/TransformConvOp]: Running TransformConvOp +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/TransformConvOp]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/TransformConvOp]: TransformConvOp finished after 0.011 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LowerTensorOp]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.004 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.003 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.033 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.047 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/TensorOpSimplifier]: Running TensorOpSimplifier +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/TensorOpSimplifier]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/TensorOpSimplifier]: TensorOpSimplifier finished after 0.014 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/CanonicalizeIR]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.003 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LegalizeCCOpLayout]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.004 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/ResolveComplicatePredicates]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.003 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/AffinePredicateResolution]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.003 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.003 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.003 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.068 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.003 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.003 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.003 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/ExpandBatchNorm]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.006 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.003 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.003 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.003 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/TensorOpTransform]: Finished (changed=True) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 0.060 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LateLowerTensorOp]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.005 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.005 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.016 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/MemcpyElimination]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 0.001 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LoopFusion]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.003 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/Rematerialization]: Running Rematerialization +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/Rematerialization]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/Rematerialization]: Rematerialization finished after 0.002 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.001 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.002 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.002 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.001 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LICM]: Running LICM +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LICM]: LICM finished after 0.001 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.002 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LoopFusion]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.002 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/SimplifySlice]: Running SimplifySlice +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/SimplifySlice]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.001 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LICM]: Running LICM +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LICM]: LICM finished after 0.001 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.001 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.001 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LICM]: Running LICM +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LICM]: LICM finished after 0.001 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/PadElimination]: Running PadElimination +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/PadElimination]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/PadElimination]: PadElimination finished after 0.001 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.002 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LoopFusion]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.003 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.001 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LICM]: Running LICM +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LICM]: LICM finished after 0.001 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.001 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.001 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.001 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/RecognizeOpIdiom]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.001 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.002 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.002 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/Recompute]: Running Recompute +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/Recompute]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/Recompute]: Recompute finished after 0.000 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.001 seconds +2025-09-05T19:18:42Z INFO 1971 [Tensorizer]: After optimization: 290 statements +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/MutateDataType]: Running MutateDataType +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/MutateDataType]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/MutateDataType]: MutateDataType finished after 0.003 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/AutoCastTCInputs]: Running AutoCastTCInputs +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/AutoCastTCInputs]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/AutoCastTCInputs]: AutoCastTCInputs finished after 0.002 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.001 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.002 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.002 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.002 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.001 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LateLowerReshapeOp]: Running LateLowerReshapeOp +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LateLowerReshapeOp]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LateLowerReshapeOp]: LateLowerReshapeOp finished after 0.003 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/InferIntrinsicOnCC]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.007 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/ResolveAccessConflict]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.004 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LICM]: Running LICM +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LICM]: LICM finished after 0.001 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LocalLayoutOpt]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.038 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.002 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.002 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LayoutPreprocessing]: Finished (changed=True) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 0.019 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.005 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 0.035 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/InferNonlocalTensors]: Running InferNonlocalTensors +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/InferNonlocalTensors]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/InferNonlocalTensors]: InferNonlocalTensors finished after 0.017 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/ParAxesAnnotation]: Running ParAxesAnnotation +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LayoutSearchAlgorithm]: prefer_non_broadcast_par: True +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/ParAxesAnnotation]: Finished (changed=True) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/ParAxesAnnotation]: ParAxesAnnotation finished after 0.016 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/InsertLocalTransposes]: Running InsertLocalTransposes +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/InsertLocalTransposes]: Finished (changed=True) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/InsertLocalTransposes]: InsertLocalTransposes finished after 0.004 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 0.031 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.002 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.002 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/PGTiling]: Running PGTiling +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 0.048 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Running StaticTransposeLocalTensor +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/StaticTransposeLocalTensor]: StaticTransposeLocalTensor finished after 0.003 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/PComputeCutting]: Running PComputeCutting +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/PComputeCutting]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/PComputeCutting]: PComputeCutting finished after 0.007 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/BFComputeCutting]: Running BFComputeCutting +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/BFComputeCutting]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/BFComputeCutting]: BFComputeCutting finished after 0.003 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LoopSplitting]: Running LoopSplitting +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LoopSplitting]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/LoopSplitting]: LoopSplitting finished after 0.001 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/MacroGeneration]: Running MacroGeneration +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/MacroGeneration]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/MacroGeneration]: MacroGeneration finished after 0.019 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/PGTiling]: PGTiling finished after 0.114 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/InsertIOTransposes]: Finished (changed=True) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 0.002 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/InsertOffloadedTransposes]: Running InsertOffloadedTransposes +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/InsertOffloadedTransposes]: Finished (changed=False) +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/InsertOffloadedTransposes]: InsertOffloadedTransposes finished after 0.001 seconds +2025-09-05T19:18:42Z INFO 1971 [sg0000/Tensorizer/DramToDramTranspose]: Running DramToDramTranspose +2025-09-05T19:18:57Z INFO 1971 [sg0000/Tensorizer/DramToDramTranspose]: Finished (changed=True) +2025-09-05T19:18:57Z INFO 1971 [sg0000/Tensorizer/DramToDramTranspose]: DramToDramTranspose finished after 14.818 seconds +2025-09-05T19:18:57Z INFO 1971 [sg0000/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 15.072 seconds +2025-09-05T19:18:57Z INFO 1971 [sg0000/Tensorizer/TilingProfiler]: Running TilingProfiler +2025-09-05T19:18:57Z INFO 1971 [sg0000/Tensorizer/TilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-09-05T19:18:57Z INFO 1971 [sg0000/Tensorizer/TilingBottleneck]: 4096: transpose_128x128 +2025-09-05T19:18:57Z INFO 1971 [sg0000/Tensorizer/TilingBottleneck]: 1792: transpose_128x128 +2025-09-05T19:18:57Z INFO 1971 [sg0000/Tensorizer/TilingBottleneck]: 1792: transpose_128x128 +2025-09-05T19:18:57Z INFO 1971 [sg0000/Tensorizer/TilingBottleneck]: 1792: transpose_128x128 +2025-09-05T19:18:57Z INFO 1971 [sg0000/Tensorizer/TilingBottleneck]: 1792: transpose_128x128 +2025-09-05T19:18:57Z INFO 1971 [sg0000/Tensorizer/TilingBottleneck]: 1792: transpose_128x128 +2025-09-05T19:18:57Z INFO 1971 [sg0000/Tensorizer/TilingBottleneck]: 1792: transpose_128x128 +2025-09-05T19:18:57Z INFO 1971 [sg0000/Tensorizer/TilingBottleneck]: 1792: transpose_128x128 +2025-09-05T19:18:57Z INFO 1971 [sg0000/Tensorizer/TilingBottleneck]: 1792: transpose_128x128 +2025-09-05T19:18:57Z INFO 1971 [sg0000/Tensorizer/TilingBottleneck]: 1792: transpose_128x128 +2025-09-05T19:18:57Z INFO 1971 [sg0000/Tensorizer/TilingBottleneck]: 1792: transpose_128x128 +2025-09-05T19:18:57Z INFO 1971 [sg0000/Tensorizer/TilingBottleneck]: 1792: transpose_128x128 +2025-09-05T19:18:57Z INFO 1971 [sg0000/Tensorizer/TilingBottleneck]: 1792: transpose_128x128 +2025-09-05T19:18:57Z INFO 1971 [sg0000/Tensorizer/TilingBottleneck]: 1792: transpose_128x128 +2025-09-05T19:18:57Z INFO 1971 [sg0000/Tensorizer/TilingBottleneck]: 1792: transpose_128x128 +2025-09-05T19:18:57Z INFO 1971 [sg0000/Tensorizer/TilingBottleneck]: 1792: transpose_128x128 +2025-09-05T19:18:57Z INFO 1971 [sg0000/Tensorizer/TilingBottleneck]: 1792: transpose_128x128 +2025-09-05T19:18:57Z INFO 1971 [sg0000/Tensorizer/TilingBottleneck]: 1792: transpose_128x128 +2025-09-05T19:18:57Z INFO 1971 [sg0000/Tensorizer/TilingBottleneck]: 1792: transpose_128x128 +2025-09-05T19:18:57Z INFO 1971 [sg0000/Tensorizer/TilingBottleneck]: 1792: transpose_128x128 +2025-09-05T19:18:57Z INFO 1971 [sg0000/Tensorizer/TilingProfiler]: Finished (changed=False) +2025-09-05T19:18:57Z INFO 1971 [sg0000/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.174 seconds +2025-09-05T19:18:57Z INFO 1971 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-09-05T19:18:58Z INFO 1971 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-09-05T19:18:58Z INFO 1971 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.101 seconds +2025-09-05T19:18:58Z INFO 1971 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2025-09-05T19:18:58Z INFO 1971 [sg0000/Tensorizer/InferNeuronTensor]: Finished (changed=True) +2025-09-05T19:18:58Z INFO 1971 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 0.654 seconds +2025-09-05T19:18:58Z INFO 1971 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-09-05T19:18:58Z INFO 1971 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-09-05T19:18:58Z INFO 1971 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.101 seconds +2025-09-05T19:18:58Z INFO 1971 [sg0000/Tensorizer/LICM]: Running LICM +2025-09-05T19:18:58Z INFO 1971 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-09-05T19:18:58Z INFO 1971 [sg0000/Tensorizer/LICM]: LICM finished after 0.030 seconds +2025-09-05T19:18:58Z INFO 1971 [sg0000/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2025-09-05T19:18:58Z INFO 1971 [sg0000/Tensorizer/RewriteReplicationMatmul]: Finished (changed=False) +2025-09-05T19:18:58Z INFO 1971 [sg0000/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.023 seconds +2025-09-05T19:18:58Z INFO 1971 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-09-05T19:18:58Z INFO 1971 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=False) +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.073 seconds +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=False) +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.084 seconds +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/DataLocalityOpt]: Finished (changed=True) +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 0.148 seconds +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/PostDLOTilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 4096: transpose_128x128 +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1792: transpose_128x128 +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1792: transpose_128x128 +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1792: transpose_128x128 +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1792: transpose_128x128 +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1792: transpose_128x128 +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1792: transpose_128x128 +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1792: transpose_128x128 +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1792: transpose_128x128 +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1792: transpose_128x128 +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1792: transpose_128x128 +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1792: transpose_128x128 +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1792: transpose_128x128 +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1792: transpose_128x128 +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1792: transpose_128x128 +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1792: transpose_128x128 +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1792: transpose_128x128 +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1792: transpose_128x128 +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1792: transpose_128x128 +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1792: transpose_128x128 +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/DMATilingProfiler]: Finished (changed=False) +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.029 seconds +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.107 seconds +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/LegalizeSundaMacro]: Finished (changed=False) +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.053 seconds +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.106 seconds +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.022 seconds +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.080 seconds +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/RewriteWeights]: Running RewriteWeights +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/RewriteWeights]: Finished (changed=False) +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.018 seconds +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/ReshapeWeights]: Finished (changed=False) +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.006 seconds +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=False) +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.066 seconds +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=False) +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.087 seconds +2025-09-05T19:18:59Z INFO 1971 [sg0000/Tensorizer/InferInitValue]: Running InferInitValue +2025-09-05T19:19:00Z INFO 1971 [sg0000/Tensorizer/InferInitValue]: Finished (changed=True) +2025-09-05T19:19:00Z INFO 1971 [sg0000/Tensorizer/InferInitValue]: InferInitValue finished after 0.437 seconds +2025-09-05T19:19:00Z INFO 1971 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-09-05T19:19:00Z INFO 1971 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-09-05T19:19:00Z INFO 1971 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.109 seconds +2025-09-05T19:19:00Z INFO 1971 [sg0000/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2025-09-05T19:19:00Z INFO 1971 [sg0000/Tensorizer/SimplifyTensor]: Finished (changed=False) +2025-09-05T19:19:00Z INFO 1971 [sg0000/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.068 seconds +2025-09-05T19:19:00Z INFO 1971 [sg0000/Tensorizer/LICM]: Running LICM +2025-09-05T19:19:00Z INFO 1971 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-09-05T19:19:00Z INFO 1971 [sg0000/Tensorizer/LICM]: LICM finished after 0.031 seconds +2025-09-05T19:19:00Z INFO 1971 [sg0000/Tensorizer/SundaISel]: Running SundaISel +2025-09-05T19:19:01Z INFO 1971 [sg0000/Tensorizer/SundaISel]: Finished (changed=True) +2025-09-05T19:19:01Z INFO 1971 [sg0000/Tensorizer/SundaISel]: SundaISel finished after 0.472 seconds +2025-09-05T19:19:01Z INFO 1971 [sg0000/Tensorizer/NeuronAliasDependencyReset]: Running NeuronAliasDependencyReset +2025-09-05T19:19:01Z INFO 1971 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-09-05T19:19:01Z INFO 1971 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-09-05T19:19:01Z INFO 1971 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-09-05T19:19:01Z INFO 1971 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Running NeuronAliasDependencyInduction +2025-09-05T19:19:01Z INFO 1971 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Finished (changed=True) +2025-09-05T19:19:01Z INFO 1971 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: NeuronAliasDependencyInduction finished after 0.039 seconds +2025-09-05T19:19:01Z INFO 1971 [sg0000/Tensorizer/NeuronAliasDependencyReset]: NeuronAliasDependencyReset finished after 0.051 seconds +2025-09-05T19:19:01Z INFO 1971 [sg0000/Tensorizer/LowerComplexBroadcast]: Running LowerComplexBroadcast +2025-09-05T19:19:01Z INFO 1971 [sg0000/Tensorizer/LowerComplexBroadcast]: Finished (changed=False) +2025-09-05T19:19:01Z INFO 1971 [sg0000/Tensorizer/LowerComplexBroadcast]: LowerComplexBroadcast finished after 0.022 seconds +2025-09-05T19:19:01Z INFO 1971 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-09-05T19:19:01Z INFO 1971 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-09-05T19:19:01Z INFO 1971 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.018 seconds +2025-09-05T19:19:01Z INFO 1971 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-09-05T19:19:01Z INFO 1971 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-09-05T19:19:01Z INFO 1971 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.014 seconds +2025-09-05T19:19:01Z INFO 1971 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2025-09-05T19:19:01Z INFO 1971 [sg0000/Tensorizer/NeuronLoopFusion]: Finished (changed=True) +2025-09-05T19:19:01Z INFO 1971 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 0.076 seconds +2025-09-05T19:19:01Z INFO 1971 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-09-05T19:19:01Z INFO 1971 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-09-05T19:19:01Z INFO 1971 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.018 seconds +2025-09-05T19:19:01Z INFO 1971 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-09-05T19:19:01Z INFO 1971 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-09-05T19:19:01Z INFO 1971 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.070 seconds +2025-09-05T19:19:01Z INFO 1971 [sg0000/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-09-05T19:19:01Z INFO 1971 [sg0000/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-09-05T19:19:01Z INFO 1971 [sg0000/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.092 seconds +2025-09-05T19:19:01Z INFO 1971 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-09-05T19:19:02Z INFO 1971 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-09-05T19:19:02Z INFO 1971 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 1.250 seconds +2025-09-05T19:19:02Z INFO 1971 [sg0000/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-09-05T19:19:02Z INFO 1971 [sg0000/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-09-05T19:19:02Z INFO 1971 [sg0000/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.035 seconds +2025-09-05T19:19:02Z INFO 1971 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-09-05T19:19:02Z INFO 1971 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-09-05T19:19:02Z INFO 1971 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.016 seconds +2025-09-05T19:19:02Z INFO 1971 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2025-09-05T19:19:02Z INFO 1971 [sg0000/Tensorizer/VectorizeDMA]: Finished (changed=False) +2025-09-05T19:19:02Z INFO 1971 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.023 seconds +2025-09-05T19:19:02Z INFO 1971 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-09-05T19:19:02Z INFO 1971 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-09-05T19:19:02Z INFO 1971 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.009 seconds +2025-09-05T19:19:02Z INFO 1971 [sg0000/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2025-09-05T19:19:02Z INFO 1971 [sg0000/Tensorizer/LegalizePartitionReduce]: Finished (changed=False) +2025-09-05T19:19:02Z INFO 1971 [sg0000/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.008 seconds +2025-09-05T19:19:02Z INFO 1971 [sg0000/Tensorizer/DeConcat]: Running DeConcat +2025-09-05T19:19:02Z INFO 1971 [sg0000/Tensorizer/DeConcat]: Finished (changed=False) +2025-09-05T19:19:02Z INFO 1971 [sg0000/Tensorizer/DeConcat]: DeConcat finished after 0.001 seconds +2025-09-05T19:19:02Z INFO 1971 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Running FactorizeThreadAxesInFreeDims +2025-09-05T19:19:02Z INFO 1971 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Finished (changed=False) +2025-09-05T19:19:02Z INFO 1971 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: FactorizeThreadAxesInFreeDims finished after 0.017 seconds +2025-09-05T19:19:02Z INFO 1971 [sg0000/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2025-09-05T19:19:02Z INFO 1971 [sg0000/Tensorizer/PartialSimdFusion]: Finished (changed=False) +2025-09-05T19:19:02Z INFO 1971 [sg0000/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.007 seconds +2025-09-05T19:19:02Z INFO 1971 [sg0000/Tensorizer/TritiumFusion]: Running TritiumFusion +2025-09-05T19:19:02Z INFO 1971 [sg0000/Tensorizer/TritiumFusion]: Finished (changed=False) +2025-09-05T19:19:02Z INFO 1971 [sg0000/Tensorizer/TritiumFusion]: TritiumFusion finished after 0.008 seconds +2025-09-05T19:19:02Z INFO 1971 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-09-05T19:19:02Z INFO 1971 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-09-05T19:19:02Z INFO 1971 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.069 seconds +2025-09-05T19:19:02Z INFO 1971 [sg0000/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2025-09-05T19:19:02Z INFO 1971 [sg0000/Tensorizer/VectorizeMatMult]: Finished (changed=False) +2025-09-05T19:19:02Z INFO 1971 [sg0000/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.004 seconds +2025-09-05T19:19:02Z INFO 1971 [sg0000/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2025-09-05T19:19:03Z INFO 1971 [sg0000/Tensorizer/PartialLoopFusion]: Finished (changed=False) +2025-09-05T19:19:03Z INFO 1971 [sg0000/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 0.122 seconds +2025-09-05T19:19:03Z INFO 1971 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-09-05T19:19:03Z INFO 1971 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-09-05T19:19:03Z INFO 1971 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.038 seconds +2025-09-05T19:19:03Z INFO 1971 [sg0000/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-09-05T19:19:03Z INFO 1971 [sg0000/Tensorizer/LowerTranspose]: Finished (changed=True) +2025-09-05T19:19:03Z INFO 1971 [sg0000/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.415 seconds +2025-09-05T19:19:03Z INFO 1971 [sg0000/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-09-05T19:19:03Z INFO 1971 [sg0000/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-09-05T19:19:03Z INFO 1971 [sg0000/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.016 seconds +2025-09-05T19:19:03Z INFO 1971 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-09-05T19:19:03Z INFO 1971 [sg0000/Tensorizer/LateNeuronInstComb]: Finished (changed=True) +2025-09-05T19:19:03Z INFO 1971 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.107 seconds +2025-09-05T19:19:03Z INFO 1971 [sg0000/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2025-09-05T19:19:03Z INFO 1971 [sg0000/Tensorizer/SplitAccGrp]: Finished (changed=False) +2025-09-05T19:19:03Z INFO 1971 [sg0000/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.013 seconds +2025-09-05T19:19:03Z INFO 1971 [sg0000/Tensorizer/SpillPSum]: Running SpillPSum +2025-09-05T19:19:03Z INFO 1971 [sg0000/Tensorizer/SpillPSum]: Finished (changed=False) +2025-09-05T19:19:03Z INFO 1971 [sg0000/Tensorizer/SpillPSum]: SpillPSum finished after 0.120 seconds +2025-09-05T19:19:03Z INFO 1971 [sg0000/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-09-05T19:19:03Z INFO 1971 [sg0000/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-09-05T19:19:03Z INFO 1971 [sg0000/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.015 seconds +2025-09-05T19:19:03Z INFO 1971 [sg0000/Tensorizer/InlineNativeKernels]: Running InlineNativeKernels +2025-09-05T19:19:03Z INFO 1971 [sg0000/Tensorizer/InlineNativeKernels]: Finished (changed=False) +2025-09-05T19:19:03Z INFO 1971 [sg0000/Tensorizer/InlineNativeKernels]: InlineNativeKernels finished after 0.012 seconds +2025-09-05T19:19:03Z INFO 1971 [sg0000/Tensorizer/LegalizeType]: Running LegalizeType +2025-09-05T19:19:03Z INFO 1971 [sg0000/Tensorizer/LegalizeType]: Finished (changed=True) +2025-09-05T19:19:03Z INFO 1971 [sg0000/Tensorizer/LegalizeType]: LegalizeType finished after 0.089 seconds +2025-09-05T19:19:03Z INFO 1971 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-09-05T19:19:04Z INFO 1971 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-09-05T19:19:04Z INFO 1971 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.063 seconds +2025-09-05T19:19:04Z INFO 1971 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-09-05T19:19:04Z INFO 1971 [sg0000/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-09-05T19:19:04Z INFO 1971 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.137 seconds +2025-09-05T19:19:04Z INFO 1971 [sg0000/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-09-05T19:19:04Z INFO 1971 [sg0000/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-09-05T19:19:04Z INFO 1971 [sg0000/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.013 seconds +2025-09-05T19:19:04Z INFO 1971 [sg0000/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-09-05T19:19:04Z INFO 1971 [sg0000/Tensorizer/LegalizeSundaAccess]: Finished (changed=False) +2025-09-05T19:19:04Z INFO 1971 [sg0000/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.114 seconds +2025-09-05T19:19:04Z INFO 1971 [sg0000/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2025-09-05T19:19:04Z INFO 1971 [sg0000/Tensorizer/RelaxPredicates]: Finished (changed=False) +2025-09-05T19:19:04Z INFO 1971 [sg0000/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.029 seconds +2025-09-05T19:19:04Z INFO 1971 [sg0000/Tensorizer/TensorInitialization]: Running TensorInitialization +2025-09-05T19:19:04Z INFO 1971 [sg0000/Tensorizer/TensorInitialization]: Finished (changed=False) +2025-09-05T19:19:04Z INFO 1971 [sg0000/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.015 seconds +2025-09-05T19:19:04Z INFO 1971 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-09-05T19:19:04Z INFO 1971 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-09-05T19:19:04Z INFO 1971 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.014 seconds +2025-09-05T19:19:04Z INFO 1971 [sg0000/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-09-05T19:19:04Z INFO 1971 [sg0000/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-09-05T19:19:04Z INFO 1971 [sg0000/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.025 seconds +2025-09-05T19:19:04Z INFO 1971 [sg0000/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-09-05T19:19:04Z INFO 1971 [sg0000/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-09-05T19:19:04Z INFO 1971 [sg0000/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.050 seconds +2025-09-05T19:19:04Z INFO 1971 [sg0000/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-09-05T19:19:04Z INFO 1971 [sg0000/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-09-05T19:19:04Z INFO 1971 [sg0000/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.010 seconds +2025-09-05T19:19:04Z INFO 1971 [sg0000/Tensorizer/DataStreaming]: Running DataStreaming +2025-09-05T19:19:04Z INFO 1971 [sg0000/Tensorizer/DataStreaming]: Finished (changed=False) +2025-09-05T19:19:04Z INFO 1971 [sg0000/Tensorizer/DataStreaming]: DataStreaming finished after 0.026 seconds +2025-09-05T19:19:04Z INFO 1971 [sg0000/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-09-05T19:19:07Z INFO 1971 [sg0000/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-09-05T19:19:07Z INFO 1971 [sg0000/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 2.728 seconds +2025-09-05T19:19:07Z INFO 1971 [sg0000/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-09-05T19:19:07Z INFO 1971 [sg0000/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-09-05T19:19:07Z INFO 1971 [sg0000/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.050 seconds +2025-09-05T19:19:07Z INFO 1971 [sg0000/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-09-05T19:19:07Z INFO 1971 [sg0000/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-09-05T19:19:07Z INFO 1971 [sg0000/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.015 seconds +2025-09-05T19:19:07Z INFO 1971 [sg0000/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-09-05T19:19:07Z INFO 1971 [sg0000/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-09-05T19:19:07Z INFO 1971 [sg0000/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.015 seconds +2025-09-05T19:19:07Z INFO 1971 [sg0000/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-09-05T19:19:07Z INFO 1971 [sg0000/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-09-05T19:19:07Z INFO 1971 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 583.920us (128.000MiB, est bw: 229.856GB/s, 1.157% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[1] bfloat16 (128, 128, 4096) %'19629.24343'[T_i0,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (128, 128, 4096) %'input289'[T_i0,i0.128,i1.4096] # id=23881, src_id=None, , instances=128 # dl = tensor_op_name: t4051_pftranspose_19629 | hlo_id: 2097 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-09-05T19:19:07Z INFO 1971 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 349.240us (128.000MiB, est bw: 384.314GB/s, 0.692% of tot. time) for bfloat16<128 x 4096> bfloat16 (128, 128, 32, 128) %'output289'[16i0_18312_0+i0_18312_1,i0.128,i2.32,i1.128] = store bfloat16<128 x 4096> TongaSB partitions[1] bfloat16 (128, 128, 4096) %'t4051_pftranspose_19629'[16i0_18312_0+i0_18312_1,i0.128,i1.128+128i2.32] # id=21501, src_id=None, , instances=128 # dl = tensor_op_name: _transpose.578 | hlo_id: 2097 | [[i0.128];[i1.128, i2.32]] -> [[i0.128];[i1.128, i2.32]] +2025-09-05T19:19:07Z INFO 1971 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 262.710us (56.000MiB, est bw: 223.518GB/s, 0.521% of tot. time) for bfloat16<128 x 3584> TongaSB partitions[2] bfloat16 (32, 2, 128, 3584) %'18341.23903'[T_i0,T_i2_26087,i0.128,i1.3584] = load bfloat16<128 x 3584> {'CrossPassTensor': ''}bfloat16 (32, 128, 2, 3584) %'input6'[T_i0,i0.128,T_i2_26087,i1.3584] # id=22057, src_id=None, , instances=64 # dl = tensor_op_name: t2070_pftranspose_18341 | hlo_id: 1531 | [[i0.128];[i1.3584]] -> [[i0.128];[i1.3584]] +2025-09-05T19:19:07Z INFO 1971 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 262.710us (56.000MiB, est bw: 223.518GB/s, 0.521% of tot. time) for bfloat16<128 x 3584> TongaSB partitions[2] bfloat16 (32, 2, 128, 3584) %'18382.23917'[T_i0,T_i2_26095,i0.128,i1.3584] = load bfloat16<128 x 3584> {'CrossPassTensor': ''}bfloat16 (32, 128, 2, 3584) %'input15'[T_i0,i0.128,T_i2_26095,i1.3584] # id=22115, src_id=None, , instances=64 # dl = tensor_op_name: t2133_pftranspose_18382 | hlo_id: 1549 | [[i0.128];[i1.3584]] -> [[i0.128];[i1.3584]] +2025-09-05T19:19:07Z INFO 1971 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 262.710us (56.000MiB, est bw: 223.518GB/s, 0.521% of tot. time) for bfloat16<128 x 3584> TongaSB partitions[2] bfloat16 (32, 2, 128, 3584) %'18423.23931'[T_i0,T_i2_26103,i0.128,i1.3584] = load bfloat16<128 x 3584> {'CrossPassTensor': ''}bfloat16 (32, 128, 2, 3584) %'input24'[T_i0,i0.128,T_i2_26103,i1.3584] # id=22173, src_id=None, , instances=64 # dl = tensor_op_name: t2196_pftranspose_18423 | hlo_id: 1567 | [[i0.128];[i1.3584]] -> [[i0.128];[i1.3584]] +2025-09-05T19:19:07Z INFO 1971 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 262.710us (56.000MiB, est bw: 223.518GB/s, 0.521% of tot. time) for bfloat16<128 x 3584> TongaSB partitions[2] bfloat16 (32, 2, 128, 3584) %'18464.23945'[T_i0,T_i2_26111,i0.128,i1.3584] = load bfloat16<128 x 3584> {'CrossPassTensor': ''}bfloat16 (32, 128, 2, 3584) %'input33'[T_i0,i0.128,T_i2_26111,i1.3584] # id=22231, src_id=None, , instances=64 # dl = tensor_op_name: t2259_pftranspose_18464 | hlo_id: 1585 | [[i0.128];[i1.3584]] -> [[i0.128];[i1.3584]] +2025-09-05T19:19:07Z INFO 1971 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 262.710us (56.000MiB, est bw: 223.518GB/s, 0.521% of tot. time) for bfloat16<128 x 3584> TongaSB partitions[2] bfloat16 (32, 2, 128, 3584) %'18505.23959'[T_i0,T_i2_26119,i0.128,i1.3584] = load bfloat16<128 x 3584> {'CrossPassTensor': ''}bfloat16 (32, 128, 2, 3584) %'input42'[T_i0,i0.128,T_i2_26119,i1.3584] # id=22289, src_id=None, , instances=64 # dl = tensor_op_name: t2322_pftranspose_18505 | hlo_id: 1603 | [[i0.128];[i1.3584]] -> [[i0.128];[i1.3584]] +2025-09-05T19:19:07Z INFO 1971 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 262.710us (56.000MiB, est bw: 223.518GB/s, 0.521% of tot. time) for bfloat16<128 x 3584> TongaSB partitions[2] bfloat16 (32, 2, 128, 3584) %'18546.23973'[T_i0,T_i2_26127,i0.128,i1.3584] = load bfloat16<128 x 3584> {'CrossPassTensor': ''}bfloat16 (32, 128, 2, 3584) %'input51'[T_i0,i0.128,T_i2_26127,i1.3584] # id=22347, src_id=None, , instances=64 # dl = tensor_op_name: t2385_pftranspose_18546 | hlo_id: 1621 | [[i0.128];[i1.3584]] -> [[i0.128];[i1.3584]] +2025-09-05T19:19:07Z INFO 1971 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 262.710us (56.000MiB, est bw: 223.518GB/s, 0.521% of tot. time) for bfloat16<128 x 3584> TongaSB partitions[2] bfloat16 (32, 2, 128, 3584) %'18587.23987'[T_i0,T_i2_26135,i0.128,i1.3584] = load bfloat16<128 x 3584> {'CrossPassTensor': ''}bfloat16 (32, 128, 2, 3584) %'input60'[T_i0,i0.128,T_i2_26135,i1.3584] # id=22405, src_id=None, , instances=64 # dl = tensor_op_name: t2448_pftranspose_18587 | hlo_id: 1639 | [[i0.128];[i1.3584]] -> [[i0.128];[i1.3584]] +2025-09-05T19:19:07Z INFO 1971 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 262.710us (56.000MiB, est bw: 223.518GB/s, 0.521% of tot. time) for bfloat16<128 x 3584> TongaSB partitions[2] bfloat16 (32, 2, 128, 3584) %'18628.24001'[T_i0,T_i2_26143,i0.128,i1.3584] = load bfloat16<128 x 3584> {'CrossPassTensor': ''}bfloat16 (32, 128, 2, 3584) %'input69'[T_i0,i0.128,T_i2_26143,i1.3584] # id=22463, src_id=None, , instances=64 # dl = tensor_op_name: t2511_pftranspose_18628 | hlo_id: 1657 | [[i0.128];[i1.3584]] -> [[i0.128];[i1.3584]] +2025-09-05T19:19:07Z INFO 1971 [sg0000/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-09-05T19:19:07Z INFO 1971 [sg0000/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.029 seconds +2025-09-05T19:19:07Z INFO 1971 [sg0000/Tensorizer/OptimizeNKIKernels]: Running OptimizeNKIKernels +2025-09-05T19:19:07Z INFO 1971 [sg0000/Tensorizer/OptimizeNKIKernels]: Finished (changed=False) +2025-09-05T19:19:07Z INFO 1971 [sg0000/Tensorizer/OptimizeNKIKernels]: OptimizeNKIKernels finished after 0.015 seconds +2025-09-05T19:19:07Z INFO 1971 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-09-05T19:19:07Z INFO 1971 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-09-05T19:19:07Z INFO 1971 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.304 seconds +2025-09-05T19:19:07Z INFO 1971 [sg0000/Tensorizer/StaticProfiler]: Running StaticProfiler +2025-09-05T19:19:07Z WARNING 1971 [sg0000/Tensorizer/StaticProfiler]: matmul-based transposes inserted by penguin takes up 100.00 percent of all matmul computation +2025-09-05T19:19:07Z INFO 1971 [sg0000/Tensorizer/StaticProfiler]: Finished (changed=False) +2025-09-05T19:19:07Z INFO 1971 [sg0000/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.035 seconds +2025-09-05T19:19:07Z INFO 1971 [sg0000/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2025-09-05T19:19:07Z INFO 1971 [sg0000/Tensorizer/SplitAPUnionSets]: Finished (changed=True) +2025-09-05T19:19:07Z INFO 1971 [sg0000/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.130 seconds +2025-09-05T19:19:07Z INFO 1971 [sg0000/Tensorizer/LateLegalizePostSplit]: Running LateLegalizePostSplit +2025-09-05T19:19:07Z INFO 1971 [sg0000/Tensorizer/LateLegalizePostSplit]: Finished (changed=False) +2025-09-05T19:19:07Z INFO 1971 [sg0000/Tensorizer/LateLegalizePostSplit]: LateLegalizePostSplit finished after 0.032 seconds +2025-09-05T19:19:07Z INFO 1971 [sg0000/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2025-09-05T19:19:07Z INFO 1971 [sg0000/Tensorizer/DumpGraphAndMetadata]: Finished (changed=False) +2025-09-05T19:19:07Z INFO 1971 [sg0000/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.038 seconds +2025-09-05T19:19:07Z INFO 1971 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Running ZeroSizeTensorElimination +2025-09-05T19:19:07Z INFO 1971 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Finished (changed=False) +2025-09-05T19:19:07Z INFO 1971 [sg0000/Tensorizer/ZeroSizeTensorElimination]: ZeroSizeTensorElimination finished after 0.000 seconds +2025-09-05T19:19:07Z INFO 1971 [sg0000/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-09-05T19:19:08Z INFO 1971 [sg0000/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-09-05T19:19:08Z INFO 1971 [sg0000/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.618 seconds +2025-09-05T19:19:09Z INFO 1971 [Tensorizer]: BirCodeGen estimate #instances=285991 in sg0000 +2025-09-05T19:19:09Z INFO 1971 [Tensorizer]: IR signature: 868d010c2395728dde8cb4de874491d6109b3553bfa434107fadd7604cffd131 for nc00/sg0000/TensorizerBIR +2025-09-05T19:19:09Z INFO 1971 [Tensorizer]: Weights total number of bytes: 131072 +2025-09-05T19:19:09Z INFO 1971 [Tensorizer]: Successfully built model. +2025-09-05T19:19:09Z USER 1971 [root/Tensorizer/Tensorizer]: Tensorizer finished after 27.551 seconds +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: End tensorization +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input0 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input1 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input2 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input3 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input4 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input5 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input6 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input7 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input8 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input9 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input10 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input11 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input12 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input13 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input14 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input15 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input16 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input17 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input18 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input19 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input20 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input21 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input22 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input23 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input24 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input25 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input26 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input27 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input28 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input29 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input30 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input31 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input32 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input33 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input34 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input35 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input36 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input37 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input38 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input39 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input40 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input41 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input42 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input43 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input44 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input45 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input46 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input47 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input48 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input49 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input50 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input51 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input52 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input53 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input54 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input55 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input56 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input57 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input58 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input59 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input60 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input61 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input62 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input63 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input64 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input65 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input66 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input67 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input68 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input69 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input70 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input71 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input72 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input73 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input74 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input75 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input76 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input77 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input78 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input79 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input80 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input81 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input82 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input83 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input84 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input85 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input86 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input87 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input88 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input89 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input90 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input91 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input92 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input93 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input94 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input95 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input96 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input97 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input98 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input99 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input100 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input101 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input102 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input103 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input104 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input105 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input106 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input107 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input108 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input109 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input110 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input111 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input112 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input113 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input114 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input115 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input116 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input117 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input118 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input119 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input120 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input121 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input122 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input123 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input124 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input125 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input126 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input127 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input128 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input129 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input130 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input131 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input132 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input133 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input134 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input135 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input136 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input137 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input138 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input139 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input140 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input141 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input142 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input143 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input144 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input145 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input146 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input147 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input148 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input149 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input150 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input151 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input152 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input153 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input154 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input155 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input156 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input157 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input158 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input159 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input160 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input161 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input162 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input163 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input164 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input165 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input166 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input167 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input168 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input169 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input170 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input171 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input172 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input173 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input174 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input175 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input176 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input177 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input178 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input179 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input180 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input181 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input182 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input183 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input184 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input185 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input186 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input187 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input188 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input189 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input190 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input191 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input192 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input193 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input194 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input195 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input196 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input197 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input198 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input199 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input200 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input201 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input202 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input203 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input204 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input205 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input206 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input207 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input208 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input209 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input210 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input211 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input212 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input213 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input214 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input215 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input216 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input217 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input218 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input219 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input220 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input221 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input222 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input223 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input224 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input225 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input226 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input227 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input228 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input229 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input230 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input231 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input232 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input233 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input234 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input235 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input236 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input237 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input238 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input239 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input240 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input241 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input242 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input243 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input244 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input245 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input246 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input247 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input248 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input249 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input250 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input251 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input252 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input253 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input254 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input255 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input256 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input257 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input258 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input259 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input260 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input261 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input262 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input263 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input264 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input265 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input266 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input267 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input268 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input269 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input270 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input271 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input272 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input273 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input274 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input275 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input276 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input277 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input278 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input279 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input280 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input281 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input282 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input283 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input284 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input285 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input286 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input287 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input288 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input289 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Network input: input290 +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: wrote bir.json +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: wrote tensor_map.json +2025-09-05T19:19:09Z INFO 1971 [job.Frontend.0]: Job #0 finished +2025-09-05T19:19:09Z INFO 1971 [pipeline.Pipeline.0]: Finished job job.Frontend.0 +2025-09-05T19:19:09Z INFO 1971 [pipeline.Pipeline.0]: Starting job job.StaticIOTranspose.0 +2025-09-05T19:19:09Z INFO 1971 [pipeline.Pipeline.0]: Finished job job.StaticIOTranspose.0 +2025-09-05T19:19:09Z INFO 1971 [pipeline.Pipeline.0]: Starting job job.WalrusDriver.0 +2025-09-05T19:19:09Z INFO 1971 [job.WalrusDriver.0]: BackendDriver has 1 states with 1 core LNC +2025-09-05T19:19:09Z INFO 1971 [job.WalrusDriver.0]: BackendDriver: no partitions found. Switching to flat flow. +2025-09-05T19:19:09Z INFO 1971 [job.WalrusDriver.0]: Job WalrusDriver len(in_states) 1 +2025-09-05T19:19:09Z INFO 1971 [job.WalrusDriver.0]: Processing input #0 +2025-09-05T19:19:09Z INFO 1971 [job.WalrusDriver.0]: BackendDriver in_state.num_states 1 with 1 core LNC +2025-09-05T19:19:09Z INFO 1971 [job.WalrusDriver.0]: Executing /opt/conda/lib/python3.10/site-packages/neuronxcc/starfish/bin/walrus_driver --optlevel 2 --allocator coloring --verbose 35 --logfile-verbose 20 --logfile /models/mistral-7b-v0.3-instruct-neuronx/layout_opt/log-neuron-cc.txt --execute-repetition 1 -i bir.json --min_split_size 10240 --skip_split_vns '' --no_split_dram --split_huge_dram_tensor 1.0 --preprocessing_only --max_tensorizer_distance 64 --pack_same_shape_only --instruction_fetch_latency 511 --max-partitions 1 --policy 3 --auxflag 0 --interleave none --schedule-delayed-latency 1 --postsched-mm-accum-reorder=false --max-load-color-rotation --max-load-lower-bound 0.14 --mm-reorder-opt --force-prefetch-follow-incoming-order -1 --allreduce-buffer-size 500 --dram-page-size 512 --dram-rotation-size -1 --allreduce-rotation-dis 8 --repeat-load-thres 4 --enable-mm-transpose-remat-optimization=true --save-len-thres 512 --save-dma-cnt-thres 32 --relaxed-order=true --enable-anti-dependence-reduction=false --num-semaphores-per-queue 16 --numcores 1 --act-root-json /opt/conda/lib/python3.10/site-packages/neuronxcc/pwp/pwp_bin_trainium/act_info.json --dve-root-json /opt/conda/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen2/dve_info.json --unified-backend-and-legacy-codegen --tensor-map tensor_map.json --enable-verifier=true --enable-birsim=false --enable-birsim-sync-only=false --enable-data-race-checker=false --enable-new-backend=true --inject-error=NONE --dge-levels vector_dynamic_offsets,scalar_dynamic_offset,io --dynamic-dma-scratch-size-per-partition=16384 --neff-output-filename /models/mistral-7b-v0.3-instruct-neuronx/layout_opt/graph.neff +2025-09-05T19:19:09Z INFO 1971 [job.WalrusDriver.0]: Working directory is /opt/vllm/neuronxcc-gss0wi_w/sg00 +2025-09-05T19:19:09Z INFO 1971 [job.WalrusDriver.0]: propagate_exit=True +2025-09-05T19:19:09Z INFO 1971 [job.WalrusDriver.0]: use_logger=False +2025-09-05T19:19:09Z INFO 1971 [job.WalrusDriver.0]: expose_stderr=True +2025-09-05T19:19:09Z INFO 2007 [Logging]: Logging to ../../../../models/mistral-7b-v0.3-instruct-neuronx/layout_opt/log-neuron-cc.txt at level 'INFO' +2025-09-05T19:19:09Z INFO 2007 [BackendDriver]: max_allowed_parallelism=32 +2025-09-05T19:19:09Z INFO 2007 [BackendDriver]: Backend driver mtBackend: false numModules: 1 Cwd: "/opt/vllm/neuronxcc-gss0wi_w/sg00" +2025-09-05T19:19:09Z INFO 2007 [BackendDriver]: DynamicDMA is enabled +2025-09-05T19:19:09Z INFO 2007 [BackendDriver]: DynamicDMA levels being enabled: io, scalar_dynamic_offset, vector_dynamic_offsets, +2025-09-05T19:19:09Z USER 2007 [BackendPassManager]: Running mod_parallel_pass +2025-09-05T19:19:09Z INFO 2007 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=1 allocs=1455 blocks=1 instructions=775 Max writers: 1 Max Readers: 290 +2025-09-05T19:19:09Z USER 2007 [ModuleForkPass]: Running do_nothing +2025-09-05T19:19:09Z INFO 2007 [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=1455 blocks=1 instructions=775 Max writers: 1 Max Readers: 290 +2025-09-05T19:19:09Z USER 2007 [ModuleForkPass]: do_nothing finished after 0.002 seconds +2025-09-05T19:19:09Z INFO 2007 [ModuleForkPass]: curr_vmrss: 175mb, ru_maxrss: 419mb (delta=0mb) +2025-09-05T19:19:09Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1455 memory location(s), 1 block(s), and 775 instruction(s). Max writers: 1 Max Readers: 290 +2025-09-05T19:19:09Z USER 2007 [ModuleForkPass]: Running birverifier +2025-09-05T19:19:09Z INFO 2007 [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=1455 blocks=1 instructions=775 Max writers: 1 Max Readers: 290 +2025-09-05T19:19:10Z USER 2007 [ModuleForkPass]: birverifier finished after 0.275 seconds +2025-09-05T19:19:10Z INFO 2007 [ModuleForkPass]: curr_vmrss: 961mb, ru_maxrss: 961mb (delta=542mb) +2025-09-05T19:19:10Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1455 memory location(s), 1 block(s), and 775 instruction(s). Max writers: 1 Max Readers: 290 +2025-09-05T19:19:10Z USER 2007 [BackendPassManager]: mod_parallel_pass finished after 0.282 seconds +2025-09-05T19:19:10Z INFO 2007 [BackendPassManager]: curr_vmrss: 961mb, ru_maxrss: 961mb (delta=542mb) +2025-09-05T19:19:10Z INFO 2007 [BackendPassManager]: Output has 1 module(s), 1 function(s), 1455 memory location(s), 1 block(s), and 775 instruction(s). Max writers: 1 Max Readers: 290 +2025-09-05T19:19:10Z USER 2007 [BackendPassManager]: Running subgraph_parallel_pass +2025-09-05T19:19:10Z INFO 2007 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=1 functions=1 allocs=1455 blocks=1 instructions=775 Max writers: 1 Max Readers: 290 +2025-09-05T19:19:10Z USER 2007 [SubgraphForkPass]: Running lnc_verifier +2025-09-05T19:19:10Z INFO 2007 [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=1455 blocks=1 instructions=775 Max writers: 1 Max Readers: 290 +2025-09-05T19:19:10Z USER 2007 [SubgraphForkPass]: lnc_verifier finished after 0.000 seconds +2025-09-05T19:19:10Z INFO 2007 [SubgraphForkPass]: curr_vmrss: 961mb, ru_maxrss: 961mb (delta=0mb) +2025-09-05T19:19:10Z INFO 2007 [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 1455 memory location(s), 1 block(s), and 775 instruction(s). Max writers: 1 Max Readers: 290 +2025-09-05T19:19:10Z USER 2007 [BackendPassManager]: subgraph_parallel_pass finished after 0.001 seconds +2025-09-05T19:19:10Z INFO 2007 [BackendPassManager]: curr_vmrss: 961mb, ru_maxrss: 961mb (delta=0mb) +2025-09-05T19:19:10Z INFO 2007 [BackendPassManager]: Output has 1 module(s), 1 function(s), 1455 memory location(s), 1 block(s), and 775 instruction(s). Max writers: 1 Max Readers: 290 +2025-09-05T19:19:10Z USER 2007 [BackendPassManager]: Running mod_parallel_pass +2025-09-05T19:19:10Z INFO 2007 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=1 allocs=1455 blocks=1 instructions=775 Max writers: 1 Max Readers: 290 +2025-09-05T19:19:10Z USER 2007 [ModuleForkPass]: Running expand_replication +2025-09-05T19:19:10Z INFO 2007 [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=1455 blocks=1 instructions=775 Max writers: 1 Max Readers: 290 +2025-09-05T19:19:10Z INFO 2007 [ExpandReplication]: Found 0 replicated matmults +2025-09-05T19:19:10Z USER 2007 [ModuleForkPass]: expand_replication finished after 0.001 seconds +2025-09-05T19:19:10Z INFO 2007 [ModuleForkPass]: curr_vmrss: 961mb, ru_maxrss: 961mb (delta=0mb) +2025-09-05T19:19:10Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1455 memory location(s), 1 block(s), and 775 instruction(s). Max writers: 1 Max Readers: 290 +2025-09-05T19:19:10Z USER 2007 [ModuleForkPass]: Running unroll +2025-09-05T19:19:10Z INFO 2007 [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=1455 blocks=1 instructions=775 Max writers: 1 Max Readers: 290 +2025-09-05T19:19:10Z INFO 2007 [Unroll]: INFO (Unroll) Start unrolling at Fri Sep 5 19:19:10 2025 +2025-09-05T19:19:12Z INFO 2007 [Unroll]: INFO (Unroll) DONE unrolling Fri Sep 5 19:19:10 2025 + +2025-09-05T19:19:12Z INFO 2007 [Unroll]: sg0000 Instruction count after Unroll: +2025-09-05T19:19:12Z INFO 2007 [Unroll]: Total count: 285701 +2025-09-05T19:19:12Z INFO 2007 [Unroll]: Matmult: 217153 +2025-09-05T19:19:12Z INFO 2007 [Unroll]: GenericCopy: 54337 +2025-09-05T19:19:12Z INFO 2007 [Unroll]: Load: 7106 +2025-09-05T19:19:12Z INFO 2007 [Unroll]: Save: 7105 +2025-09-05T19:19:12Z INFO 2007 [Unroll]: Unrolled DGE count with Dynamic AP: 0 +2025-09-05T19:19:12Z USER 2007 [ModuleForkPass]: unroll finished after 2.471 seconds +2025-09-05T19:19:12Z INFO 2007 [ModuleForkPass]: curr_vmrss: 2540mb, ru_maxrss: 2541mb (delta=1580mb) +2025-09-05T19:19:12Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69807 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:12Z USER 2007 [BackendPassManager]: mod_parallel_pass finished after 2.543 seconds +2025-09-05T19:19:12Z INFO 2007 [BackendPassManager]: curr_vmrss: 1679mb, ru_maxrss: 2541mb (delta=1580mb) +2025-09-05T19:19:12Z INFO 2007 [BackendPassManager]: Output has 1 module(s), 1 function(s), 69807 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:12Z USER 2007 [BackendPassManager]: Running subgraph_parallel_pass +2025-09-05T19:19:12Z INFO 2007 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=1 functions=1 allocs=69807 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:12Z USER 2007 [SubgraphForkPass]: Running dead_code_elim +2025-09-05T19:19:12Z INFO 2007 [SubgraphForkPass]: Inputs to dead_code_elim: modules=1 functions=1 allocs=69807 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:12Z INFO 2007 [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-09-05T19:19:12Z INFO 2007 [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-09-05T19:19:12Z INFO 2007 [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-09-05T19:19:12Z INFO 2007 [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-09-05T19:19:12Z USER 2007 [SubgraphForkPass]: dead_code_elim finished after 0.257 seconds +2025-09-05T19:19:12Z INFO 2007 [SubgraphForkPass]: curr_vmrss: 1699mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:12Z INFO 2007 [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:12Z USER 2007 [BackendPassManager]: subgraph_parallel_pass finished after 0.266 seconds +2025-09-05T19:19:12Z INFO 2007 [BackendPassManager]: curr_vmrss: 1699mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:12Z INFO 2007 [BackendPassManager]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:12Z USER 2007 [BackendPassManager]: Running mod_parallel_pass +2025-09-05T19:19:12Z INFO 2007 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:12Z USER 2007 [ModuleForkPass]: Running birverifier +2025-09-05T19:19:12Z INFO 2007 [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:13Z USER 2007 [ModuleForkPass]: birverifier finished after 0.232 seconds +2025-09-05T19:19:13Z INFO 2007 [ModuleForkPass]: curr_vmrss: 1753mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:13Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:13Z USER 2007 [BackendPassManager]: mod_parallel_pass finished after 0.239 seconds +2025-09-05T19:19:13Z INFO 2007 [BackendPassManager]: curr_vmrss: 1753mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:13Z INFO 2007 [BackendPassManager]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:13Z USER 2007 [BackendPassManager]: Running subgraph_parallel_pass +2025-09-05T19:19:13Z INFO 2007 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:13Z USER 2007 [SubgraphForkPass]: Running lnc_verifier +2025-09-05T19:19:13Z INFO 2007 [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:13Z USER 2007 [SubgraphForkPass]: lnc_verifier finished after 0.003 seconds +2025-09-05T19:19:13Z INFO 2007 [SubgraphForkPass]: curr_vmrss: 1753mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:13Z INFO 2007 [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:13Z USER 2007 [BackendPassManager]: subgraph_parallel_pass finished after 0.010 seconds +2025-09-05T19:19:13Z INFO 2007 [BackendPassManager]: curr_vmrss: 1753mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:13Z INFO 2007 [BackendPassManager]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:13Z USER 2007 [BackendPassManager]: Running mod_parallel_pass +2025-09-05T19:19:13Z INFO 2007 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:13Z USER 2007 [ModuleForkPass]: Running instruction_reorder +2025-09-05T19:19:13Z INFO 2007 [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:13Z USER 2007 [ModuleForkPass]: instruction_reorder finished after 0.027 seconds +2025-09-05T19:19:13Z INFO 2007 [ModuleForkPass]: curr_vmrss: 1753mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:13Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:13Z USER 2007 [ModuleForkPass]: Running psum_legalization +2025-09-05T19:19:13Z INFO 2007 [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:13Z USER 2007 [ModuleForkPass]: psum_legalization finished after 0.031 seconds +2025-09-05T19:19:13Z INFO 2007 [ModuleForkPass]: curr_vmrss: 1753mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:13Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:13Z USER 2007 [ModuleForkPass]: Running legalize_cce_dma +2025-09-05T19:19:13Z INFO 2007 [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:13Z USER 2007 [ModuleForkPass]: legalize_cce_dma finished after 0.014 seconds +2025-09-05T19:19:13Z INFO 2007 [ModuleForkPass]: curr_vmrss: 1753mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:13Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:13Z USER 2007 [ModuleForkPass]: Running error_injector +2025-09-05T19:19:13Z INFO 2007 [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:13Z WARNING 2007 [ErrorInjector]: Unrecognized injected error value "0" +2025-09-05T19:19:13Z USER 2007 [ModuleForkPass]: error_injector finished after 0.003 seconds +2025-09-05T19:19:13Z INFO 2007 [ModuleForkPass]: curr_vmrss: 1753mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:13Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:13Z USER 2007 [ModuleForkPass]: Running vn_splitter +2025-09-05T19:19:13Z INFO 2007 [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:13Z INFO 2007 [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 0 +2025-09-05T19:19:13Z INFO 2007 [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-09-05T19:19:13Z INFO 2007 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-09-05T19:19:13Z INFO 2007 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-09-05T19:19:13Z INFO 2007 [VNSplitterPass]: INFO (VNSplitter) Time: 0.004 seconds +2025-09-05T19:19:13Z INFO 2007 [VNSplitterPass]: INFO (VerticalFusion) Time: 0.073 seconds +2025-09-05T19:19:13Z INFO 2007 [VNSplitterPass]: INFO (ShrinkDN) Time: 0.075 seconds +2025-09-05T19:19:13Z USER 2007 [ModuleForkPass]: vn_splitter finished after 0.186 seconds +2025-09-05T19:19:13Z INFO 2007 [ModuleForkPass]: curr_vmrss: 1764mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:13Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:13Z USER 2007 [ModuleForkPass]: Running constant_propagate +2025-09-05T19:19:13Z INFO 2007 [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:13Z INFO 2007 [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-09-05T19:19:13Z INFO 2007 [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-09-05T19:19:13Z INFO 2007 [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-09-05T19:19:13Z INFO 2007 [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-09-05T19:19:13Z INFO 2007 [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-09-05T19:19:13Z INFO 2007 [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-09-05T19:19:13Z INFO 2007 [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-09-05T19:19:13Z INFO 2007 [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-09-05T19:19:13Z INFO 2007 [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-09-05T19:19:13Z INFO 2007 [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-09-05T19:19:13Z USER 2007 [ModuleForkPass]: constant_propagate finished after 0.482 seconds +2025-09-05T19:19:13Z INFO 2007 [ModuleForkPass]: curr_vmrss: 1768mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:13Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:13Z USER 2007 [ModuleForkPass]: Running lower_ac +2025-09-05T19:19:13Z INFO 2007 [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:14Z INFO 2007 [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-09-05T19:19:14Z USER 2007 [ModuleForkPass]: lower_ac finished after 0.026 seconds +2025-09-05T19:19:14Z INFO 2007 [ModuleForkPass]: curr_vmrss: 1768mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:14Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:14Z USER 2007 [ModuleForkPass]: Running input_dma_coalescing +2025-09-05T19:19:14Z INFO 2007 [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:14Z INFO 2007 [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-09-05T19:19:14Z USER 2007 [ModuleForkPass]: input_dma_coalescing finished after 0.078 seconds +2025-09-05T19:19:14Z INFO 2007 [ModuleForkPass]: curr_vmrss: 1768mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:14Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:14Z USER 2007 [ModuleForkPass]: Running remat_optimization +2025-09-05T19:19:14Z INFO 2007 [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:14Z INFO 2007 [RematOpt]: Removed 0 remat instructions +2025-09-05T19:19:14Z USER 2007 [ModuleForkPass]: remat_optimization finished after 0.174 seconds +2025-09-05T19:19:14Z INFO 2007 [ModuleForkPass]: curr_vmrss: 1772mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:14Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:14Z USER 2007 [ModuleForkPass]: Running early_peephole_opts +2025-09-05T19:19:14Z INFO 2007 [ModuleForkPass]: Inputs to early_peephole_opts: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:14Z INFO 2007 [EarlyPeepholeOpts]: PeepholeOpts enabled? ActivationAccumulate: true +2025-09-05T19:19:14Z INFO 2007 [EarlyPeepholeOpts]: Activation Accumulate: 0 +2025-09-05T19:19:14Z USER 2007 [ModuleForkPass]: early_peephole_opts finished after 0.048 seconds +2025-09-05T19:19:14Z INFO 2007 [ModuleForkPass]: curr_vmrss: 1772mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:14Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:14Z USER 2007 [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-09-05T19:19:14Z INFO 2007 [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:14Z USER 2007 [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.015 seconds +2025-09-05T19:19:14Z INFO 2007 [ModuleForkPass]: curr_vmrss: 1772mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:14Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:14Z USER 2007 [ModuleForkPass]: Running infer_stream_ids +2025-09-05T19:19:14Z INFO 2007 [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:14Z USER 2007 [ModuleForkPass]: infer_stream_ids finished after 0.015 seconds +2025-09-05T19:19:14Z INFO 2007 [ModuleForkPass]: curr_vmrss: 1772mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:14Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:14Z USER 2007 [ModuleForkPass]: Running pre_sched +2025-09-05T19:19:14Z INFO 2007 [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:14Z INFO 2007 [PreSched]: Start PRE scheduling 2 cores: 1 at: Fri Sep 5 19:19:14 2025 +2025-09-05T19:19:14Z INFO 2007 [LayerSpiller]: LayerSpill: Start... +2025-09-05T19:19:14Z INFO 2007 [LayerSpiller]: LayerSpill: Found 0 Splits CCs +2025-09-05T19:19:14Z INFO 2007 [LayerSpiller]: Grouped CCs to 0 clusters. +2025-09-05T19:19:14Z INFO 2007 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-09-05T19:19:14Z INFO 2007 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-09-05T19:19:14Z INFO 2007 [LayerSpiller]: LayerSpill: Done. +2025-09-05T19:19:14Z INFO 2007 [PreSched]: Start split live ranges Fri Sep 5 19:19:14 2025 +2025-09-05T19:19:14Z INFO 2007 [PreSched]: Num_Splits: 0 +2025-09-05T19:19:14Z INFO 2007 [PreSched]: End split live ranges Fri Sep 5 19:19:14 2025 +2025-09-05T19:19:14Z INFO 2007 [PreSched]: Strt remove redundncies Fri Sep 5 19:19:14 2025 +2025-09-05T19:19:14Z INFO 2007 [PreSched]: remove_redundant_memsets +2025-09-05T19:19:14Z INFO 2007 [PreSched]: remove_redundant_memsets: 0 +2025-09-05T19:19:14Z INFO 2007 [PreSched]: remove_redundant_loads +2025-09-05T19:19:14Z INFO 2007 [PreSched]: remove_redundant_loads: 0 +2025-09-05T19:19:14Z INFO 2007 [PreSched]: End remove redundncies Fri Sep 5 19:19:14 2025 +2025-09-05T19:19:14Z INFO 2007 [PreSched]: Start DCE Fri Sep 5 19:19:14 2025 +2025-09-05T19:19:14Z INFO 2007 [PreSched]: eliminateDeadStore removed 0 instructions +2025-09-05T19:19:14Z INFO 2007 [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-09-05T19:19:14Z INFO 2007 [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-09-05T19:19:14Z INFO 2007 [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-09-05T19:19:14Z INFO 2007 [PreSched]: End DCE Fri Sep 5 19:19:14 2025 +2025-09-05T19:19:14Z INFO 2007 [PreSched]: Start build flow dependencies Fri Sep 5 19:19:14 2025 +2025-09-05T19:19:14Z INFO 2007 [build_flow_deps]: Start build fdeps. Invocation: 1Fri Sep 5 19:19:14 2025 +2025-09-05T19:19:14Z INFO 2007 [build_flow_deps]: Allocs: 69132 instructions: 285701 +2025-09-05T19:19:15Z INFO 2007 [build_flow_deps]: Build fdeps inserted 714949 edges +2025-09-05T19:19:15Z INFO 2007 [build_flow_deps]: Done build fdeps 714949 Fri Sep 5 19:19:15 2025 +2025-09-05T19:19:15Z INFO 2007 [PreSched]: End build flow dependencies Fri Sep 5 19:19:15 2025 +2025-09-05T19:19:15Z INFO 2007 [PreSched]: Start remove useless insts Fri Sep 5 19:19:15 2025 +2025-09-05T19:19:15Z INFO 2007 [PreSched]: remove_useless_insts +2025-09-05T19:19:16Z INFO 2007 [PreSched]: remove Useless Instructions: 0 +2025-09-05T19:19:16Z INFO 2007 [PreSched]: End remove useless insts Fri Sep 5 19:19:16 2025 +2025-09-05T19:19:16Z INFO 2007 [PreSched]: Start scratchpad optimization Fri Sep 5 19:19:16 2025 +2025-09-05T19:19:16Z INFO 2007 [PreSched]: End scratchpad optimization Fri Sep 5 19:19:16 2025 +2025-09-05T19:19:16Z INFO 2007 [PreSched]: DONE PRE scheduling Fri Sep 5 19:19:16 2025 +2025-09-05T19:19:16Z USER 2007 [ModuleForkPass]: pre_sched finished after 1.914 seconds +2025-09-05T19:19:16Z INFO 2007 [ModuleForkPass]: curr_vmrss: 1884mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:16Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:16Z USER 2007 [ModuleForkPass]: Running tensor_copy_elim +2025-09-05T19:19:16Z INFO 2007 [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:16Z INFO 2007 [TensorCopyElim]: Tensor CP elimination: 0 +2025-09-05T19:19:16Z INFO 2007 [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-09-05T19:19:16Z INFO 2007 [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-09-05T19:19:16Z INFO 2007 [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-09-05T19:19:16Z INFO 2007 [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-09-05T19:19:16Z USER 2007 [ModuleForkPass]: tensor_copy_elim finished after 0.341 seconds +2025-09-05T19:19:16Z INFO 2007 [ModuleForkPass]: curr_vmrss: 1884mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:16Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:16Z USER 2007 [ModuleForkPass]: Running dynamic_dma_setup +2025-09-05T19:19:16Z INFO 2007 [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:16Z USER 2007 [ModuleForkPass]: dynamic_dma_setup finished after 0.004 seconds +2025-09-05T19:19:16Z INFO 2007 [ModuleForkPass]: curr_vmrss: 1884mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:16Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69133 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:16Z USER 2007 [ModuleForkPass]: Running runtime_memory_reservation +2025-09-05T19:19:16Z INFO 2007 [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=69133 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:16Z USER 2007 [ModuleForkPass]: runtime_memory_reservation finished after 0.004 seconds +2025-09-05T19:19:16Z INFO 2007 [ModuleForkPass]: curr_vmrss: 1884mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:16Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69133 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:16Z USER 2007 [ModuleForkPass]: Running coloring_allocator_psum +2025-09-05T19:19:16Z INFO 2007 [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=69133 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:16Z INFO 2007 [ColoringAllocator::Rep]: Allocating functions +2025-09-05T19:19:16Z INFO 2007 [ColoringAllocator::Rep]: linearize and check +2025-09-05T19:19:16Z INFO 2007 [PSUM_Allocator]: allocating PSUM +2025-09-05T19:19:16Z INFO 2007 [PSUM_Allocator]: main loop +2025-09-05T19:19:16Z INFO 2007 [PSUM_Allocator]: renumber locations +2025-09-05T19:19:16Z INFO 2007 [PSUM_Allocator]: size = 54337 +2025-09-05T19:19:16Z INFO 2007 [PSUM_Allocator]: build_no_bitmap start +2025-09-05T19:19:17Z INFO 2007 [PSUM_Allocator]: 100% PSUM demand before spilling +2025-09-05T19:19:17Z INFO 2007 [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-09-05T19:19:17Z INFO 2007 [PSUM_Allocator]: found 182784 edges +2025-09-05T19:19:17Z INFO 2007 [PSUM_Allocator]: mean: 6.72779 +2025-09-05T19:19:17Z INFO 2007 [PSUM_Allocator]: median: 6.99997 +2025-09-05T19:19:17Z INFO 2007 [PSUM_Allocator]: adjacency vectors require 1462272 bytes +2025-09-05T19:19:17Z INFO 2007 [PSUM_Allocator]: build_no_bitmap done +2025-09-05T19:19:17Z INFO 2007 [PSUM_Allocator]: find costs +2025-09-05T19:19:17Z INFO 2007 [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-09-05T19:19:17Z INFO 2007 [PSUM_Allocator]: simplify interference graph +2025-09-05T19:19:17Z INFO 2007 [PSUM_Allocator]: initialize low and high +2025-09-05T19:19:17Z INFO 2007 [PSUM_Allocator]: lo = 54337 +2025-09-05T19:19:17Z INFO 2007 [PSUM_Allocator]: hi = 0 +2025-09-05T19:19:17Z INFO 2007 [PSUM_Allocator]: inf = 0 +2025-09-05T19:19:17Z INFO 2007 [PSUM_Allocator]: total = 54337 +2025-09-05T19:19:17Z INFO 2007 [PSUM_Allocator]: simplify +2025-09-05T19:19:17Z INFO 2007 [PSUM_Allocator]: new candidates = 0 +2025-09-05T19:19:17Z INFO 2007 [PSUM_Allocator]: select ranges +2025-09-05T19:19:17Z INFO 2007 [PSUM_Allocator]: no more spills +2025-09-05T19:19:17Z INFO 2007 [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-09-05T19:19:17Z INFO 2007 [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-09-05T19:19:17Z INFO 2007 [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-09-05T19:19:17Z USER 2007 [ModuleForkPass]: coloring_allocator_psum finished after 0.597 seconds +2025-09-05T19:19:17Z INFO 2007 [ModuleForkPass]: curr_vmrss: 1901mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:17Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69133 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:17Z USER 2007 [ModuleForkPass]: Running dma_optimization_psum +2025-09-05T19:19:17Z INFO 2007 [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=69133 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:17Z INFO 2007 [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-09-05T19:19:17Z INFO 2007 [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-09-05T19:19:17Z USER 2007 [ModuleForkPass]: dma_optimization_psum finished after 0.203 seconds +2025-09-05T19:19:17Z INFO 2007 [ModuleForkPass]: curr_vmrss: 1901mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:17Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69133 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:17Z USER 2007 [ModuleForkPass]: Running address_rotation_psum +2025-09-05T19:19:17Z INFO 2007 [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=69133 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:18Z INFO 2007 [DMAOptimizationBase]: PSUM Rotation rotated 0 PSUM Banks +2025-09-05T19:19:18Z INFO 2007 [DMAOptimizationBase]: PSUM Rotation rotated 0 PSUM Banks +2025-09-05T19:19:19Z INFO 2007 [DMAOptimizationBase]: PSUM Rotation rotated 0 PSUM Banks +2025-09-05T19:19:19Z USER 2007 [ModuleForkPass]: address_rotation_psum finished after 1.984 seconds +2025-09-05T19:19:19Z INFO 2007 [ModuleForkPass]: curr_vmrss: 1907mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:19Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69133 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:19Z USER 2007 [ModuleForkPass]: Running coloring_allocator_sb +2025-09-05T19:19:19Z INFO 2007 [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=69133 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:19Z INFO 2007 [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 7114104832 +2025-09-05T19:19:19Z INFO 2007 [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 7875 bytes +2025-09-05T19:19:19Z INFO 2007 [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 7114072064 +2025-09-05T19:19:19Z INFO 2007 [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 7822 bytes +2025-09-05T19:19:19Z INFO 2007 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 0 +2025-09-05T19:19:19Z INFO 2007 [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 0 bytes +2025-09-05T19:19:19Z INFO 2007 [ColoringAllocator::Rep]: Allocating functions +2025-09-05T19:19:19Z INFO 2007 [ColoringAllocator::Rep]: linearize and check +2025-09-05T19:19:19Z INFO 2007 [SB_Allocator]: allocating SB +2025-09-05T19:19:19Z INFO 2007 [SB_Allocator]: main loop +2025-09-05T19:19:19Z INFO 2007 [SB_Allocator]: renumber locations +2025-09-05T19:19:19Z INFO 2007 [SB_Allocator]: size = 14212 +2025-09-05T19:19:19Z INFO 2007 [SB_Allocator]: find partners +2025-09-05T19:19:19Z INFO 2007 [SB_Allocator]: found 54337 accumulation groups +2025-09-05T19:19:19Z INFO 2007 [SB_Allocator]: largest = 19629.23884_i1022 +2025-09-05T19:19:19Z INFO 2007 [SB_Allocator]: tensors = 2 +2025-09-05T19:19:19Z INFO 2007 [SB_Allocator]: requires 8448 bytes/partition +2025-09-05T19:19:19Z INFO 2007 [SB_Allocator]: expanding partners +2025-09-05T19:19:19Z INFO 2007 []: find first defs for local +2025-09-05T19:19:19Z INFO 2007 []: find first defs for global +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: find loads +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: 1 pin count +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: 6081 remat count +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: 1 pinned tensors will require about 16384 bytes/partition +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: build interference graph +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: pass 1 int-tree +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: Num intervals 14212 Num locations 14212 +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: IntervalTree Build Done +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: info.neighbors init Done +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: info.neighbors partners Done +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: IntervalTree readback Done +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: edge: 31236 +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: mean: 4.39572 +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: median: 2.00087 +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: find costs +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: simplify interference graph +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: initialize safe & unsafe +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: safe = 14210 +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: unsafe = 1 +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: inf = 0 +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: total = 14211 +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: simplify +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: simplify_step3_sorted2 #Unsafe 0 #Pinned 0 #Safe 0 minCost 1.79769e+308 maxCost 2.22507e-308 locations 14212 +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: new candidates = 0 +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: select ranges +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: Total: 14211 +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: Spilled: 0.000 (0) +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: Allocated: 1.000 (14211) +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: Rover zone: 0.989 (14051) +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: Pre-rover zone: 0.009 (128) +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: Post-rover zone: 0.002 (32) +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: Slice zone: 0.000 (0) +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: Blocks nothing: 0.000 (0) +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: Blocks medium: 0.000 (0) +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: Blocks tall: 1.000 (14211) +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: Visited until tall blocking (mean): 0.997 +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: Success +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: SB spills = 0 tensors +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: size = 0 bytes/partition +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: remats = 0 tensors +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: unpinned = 0 tensors +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: size = 0 bytes/partition +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: SB score = 0 +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: spilling from SB cost about 0 cycles +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: 16384 bytes/partition (100%) successfully pinned +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: pinning saved approximately 9010 cycles +2025-09-05T19:19:20Z INFO 2007 [SB_Allocator]: 0% SB utilization after allocation +2025-09-05T19:19:20Z INFO 2007 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 7114104832 +2025-09-05T19:19:20Z INFO 2007 [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 7875 bytes +2025-09-05T19:19:20Z INFO 2007 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 7114072064 +2025-09-05T19:19:20Z INFO 2007 [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 7822 bytes +2025-09-05T19:19:20Z INFO 2007 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 0 +2025-09-05T19:19:20Z INFO 2007 [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 0 bytes +2025-09-05T19:19:20Z USER 2007 [ModuleForkPass]: coloring_allocator_sb finished after 1.103 seconds +2025-09-05T19:19:20Z INFO 2007 [ModuleForkPass]: curr_vmrss: 1911mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:20Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69133 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:20Z USER 2007 [ModuleForkPass]: Running address_rotation_sb +2025-09-05T19:19:20Z INFO 2007 [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=69133 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:20Z INFO 2007 [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-09-05T19:19:20Z USER 2007 [ModuleForkPass]: address_rotation_sb finished after 0.290 seconds +2025-09-05T19:19:20Z INFO 2007 [ModuleForkPass]: curr_vmrss: 1912mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:20Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69133 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:20Z USER 2007 [ModuleForkPass]: Running dma_optimization_sb +2025-09-05T19:19:20Z INFO 2007 [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=69133 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:20Z INFO 2007 [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 14228176896, 50.0001% input load, 49.9999% output write, 0% spill/reload [sg0000] +2025-09-05T19:19:20Z INFO 2007 [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-09-05T19:19:21Z INFO 2007 [DMAOptimizationBase]: removed 0 identical load +2025-09-05T19:19:21Z INFO 2007 [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-09-05T19:19:21Z INFO 2007 [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-09-05T19:19:21Z INFO 2007 [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-09-05T19:19:21Z INFO 2007 [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-09-05T19:19:21Z INFO 2007 [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-09-05T19:19:21Z INFO 2007 [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 0, 0% out of total dma traffic(7.1141e+09) +2025-09-05T19:19:21Z INFO 2007 [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-09-05T19:19:21Z INFO 2007 [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-09-05T19:19:21Z INFO 2007 [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, -nan% out of total spill/reload dma traffic +2025-09-05T19:19:21Z INFO 2007 [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-09-05T19:19:21Z INFO 2007 [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-09-05T19:19:21Z INFO 2007 [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, -nan% out of total spill/reload dma traffic +2025-09-05T19:19:21Z INFO 2007 [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-09-05T19:19:21Z INFO 2007 [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-09-05T19:19:21Z INFO 2007 [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, -nan% out of total spill/reload dma traffic +2025-09-05T19:19:21Z INFO 2007 [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-09-05T19:19:21Z INFO 2007 [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-09-05T19:19:21Z INFO 2007 [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-09-05T19:19:21Z INFO 2007 [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-09-05T19:19:22Z INFO 2007 [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2025-09-05T19:19:22Z INFO 2007 [DMAOptimizationBase]: average loaded DMA size 7875 bytes +2025-09-05T19:19:22Z INFO 2007 [DMAOptimizationBase]: average saved DMA size 7822 bytes +2025-09-05T19:19:22Z INFO 2007 [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 7114104832 +2025-09-05T19:19:22Z INFO 2007 [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 7875 bytes +2025-09-05T19:19:22Z INFO 2007 [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 7114072064 +2025-09-05T19:19:22Z INFO 2007 [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 7822 bytes +2025-09-05T19:19:22Z INFO 2007 [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-09-05T19:19:22Z INFO 2007 [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, -nan% out of total spill/reload dma traffic +2025-09-05T19:19:22Z INFO 2007 [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 0, 0% out of total dma traffic +2025-09-05T19:19:22Z INFO 2007 [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 14228176896, 50.0001% input load, 49.9999% output write, 0% spill/reload [sg0000] +2025-09-05T19:19:22Z INFO 2007 [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 7114104832 +2025-09-05T19:19:22Z INFO 2007 [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 7875 bytes +2025-09-05T19:19:22Z INFO 2007 [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 7114072064 +2025-09-05T19:19:22Z INFO 2007 [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 7822 bytes +2025-09-05T19:19:22Z INFO 2007 [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 0 +2025-09-05T19:19:22Z INFO 2007 [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 0 bytes +2025-09-05T19:19:22Z INFO 2007 [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 7848 bytes +2025-09-05T19:19:22Z INFO 2007 [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-09-05T19:19:22Z INFO 2007 [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-09-05T19:19:22Z USER 2007 [ModuleForkPass]: dma_optimization_sb finished after 1.621 seconds +2025-09-05T19:19:22Z INFO 2007 [ModuleForkPass]: curr_vmrss: 1932mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:22Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:22Z USER 2007 [ModuleForkPass]: Running address_rotation_sb +2025-09-05T19:19:22Z INFO 2007 [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:22Z INFO 2007 [DMAOptimizationBase]: SB Rotation rotated 5919 Sb address +2025-09-05T19:19:23Z INFO 2007 [DMAOptimizationBase]: SB Rotation rotated 4699 Sb address +2025-09-05T19:19:23Z INFO 2007 [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-09-05T19:19:23Z INFO 2007 [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-09-05T19:19:23Z INFO 2007 [DMAOptimizationBase]: SB Rotation rotated 1769 Sb address +2025-09-05T19:19:24Z INFO 2007 [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-09-05T19:19:24Z USER 2007 [ModuleForkPass]: address_rotation_sb finished after 1.665 seconds +2025-09-05T19:19:24Z INFO 2007 [ModuleForkPass]: curr_vmrss: 1932mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:24Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:24Z USER 2007 [ModuleForkPass]: Running coloring_allocator_dram +2025-09-05T19:19:24Z INFO 2007 [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:24Z INFO 2007 [ColoringAllocator::Rep]: Allocating functions +2025-09-05T19:19:24Z INFO 2007 [ColoringAllocator::Rep]: linearize and check +2025-09-05T19:19:24Z INFO 2007 [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-09-05T19:19:24Z INFO 2007 [DRAM_Allocator]: reserved space = 14496612352 bytes +2025-09-05T19:19:24Z INFO 2007 [DRAM_Allocator]: spill space = 0 bytes +2025-09-05T19:19:24Z INFO 2007 [DRAM_Allocator]: aligned spill space = 0 bytes +2025-09-05T19:19:24Z INFO 2007 [DRAM_Allocator]: dram space = 107374182400 bytes +2025-09-05T19:19:24Z INFO 2007 [DRAM_Allocator]: renumber locations +2025-09-05T19:19:24Z INFO 2007 [DRAM_Allocator]: size = 0 +2025-09-05T19:19:24Z INFO 2007 []: find first defs for local +2025-09-05T19:19:24Z INFO 2007 []: find first defs for global +2025-09-05T19:19:24Z INFO 2007 [DRAM_Allocator]: Num intervals 0 Num locations 0 +2025-09-05T19:19:24Z INFO 2007 [DRAM_Allocator]: IntervalTree Build Done +2025-09-05T19:19:24Z INFO 2007 [DRAM_Allocator]: info.neighbors init Done +2025-09-05T19:19:24Z INFO 2007 [DRAM_Allocator]: IntervalTree readback Done +2025-09-05T19:19:24Z INFO 2007 [DRAM_Allocator]: simplify interference graph +2025-09-05T19:19:24Z INFO 2007 [DRAM_Allocator]: initialize low and high +2025-09-05T19:19:24Z INFO 2007 [DRAM_Allocator]: lo = 0 +2025-09-05T19:19:24Z INFO 2007 [DRAM_Allocator]: hi = 0 +2025-09-05T19:19:24Z INFO 2007 [DRAM_Allocator]: total = 0 +2025-09-05T19:19:24Z INFO 2007 [DRAM_Allocator]: simplify +2025-09-05T19:19:24Z INFO 2007 [DRAM_Allocator]: new candidates = 0 +2025-09-05T19:19:24Z INFO 2007 [DRAM_Allocator]: select ranges +2025-09-05T19:19:24Z INFO 2007 [DRAM_Allocator]: CC buffer size limit 524288000 +2025-09-05T19:19:24Z INFO 2007 [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-09-05T19:19:24Z INFO 2007 [DRAM_Allocator]: Real CC buffer size 0 +2025-09-05T19:19:24Z INFO 2007 [DRAM_Allocator]: DRAM hwm after allocation: 0 +2025-09-05T19:19:24Z INFO 2007 [DRAM_Allocator]: DRAM allocation successful +2025-09-05T19:19:24Z USER 2007 [ModuleForkPass]: coloring_allocator_dram finished after 0.469 seconds +2025-09-05T19:19:24Z INFO 2007 [ModuleForkPass]: curr_vmrss: 1937mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:24Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:24Z USER 2007 [ModuleForkPass]: Running address_rotation_dram +2025-09-05T19:19:24Z INFO 2007 [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:24Z INFO 2007 [DMAOptimizationBase]: Runtime page size at 512MB +2025-09-05T19:19:24Z INFO 2007 [DMAOptimizationBase]: DRAM hwm before rotation 0 +2025-09-05T19:19:24Z INFO 2007 [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-09-05T19:19:24Z INFO 2007 [DMAOptimizationBase]: allreduce hwm 0 +2025-09-05T19:19:24Z INFO 2007 [DMAOptimizationBase]: Real CC buffer size 0 +2025-09-05T19:19:24Z INFO 2007 [DMAOptimizationBase]: DRAM hwm after rotation 0 +2025-09-05T19:19:24Z INFO 2007 [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-09-05T19:19:24Z USER 2007 [ModuleForkPass]: address_rotation_dram finished after 0.171 seconds +2025-09-05T19:19:24Z INFO 2007 [ModuleForkPass]: curr_vmrss: 1939mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:24Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:24Z USER 2007 [ModuleForkPass]: Running tensorcopy_accel +2025-09-05T19:19:24Z INFO 2007 [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:24Z INFO 2007 [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-09-05T19:19:24Z INFO 2007 [TensorCopyAccel::Impl]: Accelerated 0 out of 54337 tensorcopy in Function: sg0000 average acceleration factor: -nan +2025-09-05T19:19:24Z USER 2007 [ModuleForkPass]: tensorcopy_accel finished after 0.025 seconds +2025-09-05T19:19:24Z INFO 2007 [ModuleForkPass]: curr_vmrss: 1939mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:24Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:24Z USER 2007 [ModuleForkPass]: Running peephole_opts +2025-09-05T19:19:24Z INFO 2007 [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:24Z INFO 2007 [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-09-05T19:19:24Z USER 2007 [ModuleForkPass]: peephole_opts finished after 0.055 seconds +2025-09-05T19:19:24Z INFO 2007 [ModuleForkPass]: curr_vmrss: 1939mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:24Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:24Z USER 2007 [ModuleForkPass]: Running lower_kernel +2025-09-05T19:19:24Z INFO 2007 [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:24Z INFO 2007 [LowerKernel]: Started running LowerKernel +2025-09-05T19:19:24Z INFO 2007 [LowerKernel]: Start of kernel lowering pass, number of insts: 285701, number of allocs: 69132 +2025-09-05T19:19:24Z INFO 2007 [LowerKernel]: Scan BKs time (s): 0.013202 +2025-09-05T19:19:24Z INFO 2007 [LowerKernel]: Lower BKs time (s): 3e-06 +2025-09-05T19:19:24Z USER 2007 [ModuleForkPass]: lower_kernel finished after 0.019 seconds +2025-09-05T19:19:24Z INFO 2007 [ModuleForkPass]: curr_vmrss: 1939mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:24Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:24Z USER 2007 [ModuleForkPass]: Running lower_nki_kernel +2025-09-05T19:19:24Z INFO 2007 [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:24Z USER 2007 [ModuleForkPass]: lower_nki_kernel finished after 0.017 seconds +2025-09-05T19:19:24Z INFO 2007 [ModuleForkPass]: curr_vmrss: 1939mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:24Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:24Z USER 2007 [ModuleForkPass]: Running dynamic_dma_cleanup +2025-09-05T19:19:24Z INFO 2007 [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:24Z USER 2007 [ModuleForkPass]: dynamic_dma_cleanup finished after 0.029 seconds +2025-09-05T19:19:24Z INFO 2007 [ModuleForkPass]: curr_vmrss: 1941mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:24Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:24Z USER 2007 [ModuleForkPass]: Running birverifier +2025-09-05T19:19:24Z INFO 2007 [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:25Z USER 2007 [ModuleForkPass]: birverifier finished after 0.195 seconds +2025-09-05T19:19:25Z INFO 2007 [ModuleForkPass]: curr_vmrss: 1941mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:25Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:25Z USER 2007 [ModuleForkPass]: Running dynamic_dma_scan +2025-09-05T19:19:25Z INFO 2007 [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:25Z USER 2007 [ModuleForkPass]: dynamic_dma_scan finished after 0.027 seconds +2025-09-05T19:19:25Z INFO 2007 [ModuleForkPass]: curr_vmrss: 1941mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:25Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:25Z USER 2007 [ModuleForkPass]: Running build_fdeps +2025-09-05T19:19:25Z INFO 2007 [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:25Z INFO 2007 [build_flow_deps]: Start build fdeps. Invocation: 2Fri Sep 5 19:19:25 2025 +2025-09-05T19:19:25Z INFO 2007 [build_flow_deps]: Allocs: 69132 instructions: 285701 +2025-09-05T19:19:26Z INFO 2007 [build_flow_deps]: Build fdeps inserted 714949 edges +2025-09-05T19:19:26Z INFO 2007 [build_flow_deps]: Done build fdeps 714949 Fri Sep 5 19:19:26 2025 +2025-09-05T19:19:26Z USER 2007 [ModuleForkPass]: build_fdeps finished after 1.075 seconds +2025-09-05T19:19:26Z INFO 2007 [ModuleForkPass]: curr_vmrss: 1963mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:26Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:26Z USER 2007 [ModuleForkPass]: Running remove_redundancies +2025-09-05T19:19:26Z INFO 2007 [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:26Z INFO 2007 [RemoveRedundancies]: remove_clobbered_writes +2025-09-05T19:19:26Z INFO 2007 [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-09-05T19:19:26Z INFO 2007 [RemoveRedundancies]: remove_useless_insts +2025-09-05T19:19:26Z INFO 2007 [RemoveRedundancies]: remove Useless Instructions: 0 +2025-09-05T19:19:26Z USER 2007 [ModuleForkPass]: remove_redundancies finished after 0.090 seconds +2025-09-05T19:19:26Z INFO 2007 [ModuleForkPass]: curr_vmrss: 1963mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:26Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:26Z USER 2007 [ModuleForkPass]: Running anti_dependency_analyzer +2025-09-05T19:19:26Z INFO 2007 [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:26Z INFO 2007 [AntiDependencyAnalyzer]: Batch size: 1000 +2025-09-05T19:19:26Z INFO 2007 [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-09-05T19:19:26Z INFO 2007 [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-09-05T19:19:27Z USER 2007 [ModuleForkPass]: anti_dependency_analyzer finished after 0.989 seconds +2025-09-05T19:19:27Z INFO 2007 [ModuleForkPass]: curr_vmrss: 2058mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:27Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:27Z USER 2007 [ModuleForkPass]: Running tensor_copy_elim +2025-09-05T19:19:27Z INFO 2007 [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:27Z INFO 2007 [TensorCopyElim]: Tensor CP elimination: 0 +2025-09-05T19:19:27Z INFO 2007 [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-09-05T19:19:27Z USER 2007 [ModuleForkPass]: tensor_copy_elim finished after 0.240 seconds +2025-09-05T19:19:27Z INFO 2007 [ModuleForkPass]: curr_vmrss: 2071mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:27Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:27Z USER 2007 [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-09-05T19:19:27Z INFO 2007 [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:27Z USER 2007 [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.003 seconds +2025-09-05T19:19:27Z INFO 2007 [ModuleForkPass]: curr_vmrss: 2071mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:27Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:27Z USER 2007 [ModuleForkPass]: Running post_sched +2025-09-05T19:19:27Z INFO 2007 [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:27Z INFO 2007 [post_scheduler]: Start PosT ScheD 3 sunda Fri Sep 5 19:19:27 2025 +2025-09-05T19:19:29Z INFO 2007 [post_scheduler]: Time-aware hwm post-sched +2025-09-05T19:19:31Z INFO 2007 [post_scheduler]: Time-aware simulation time: 59414448 +2025-09-05T19:19:32Z INFO 2007 [post_scheduler]: Done PosT ScheD Fri Sep 5 19:19:32 2025 +2025-09-05T19:19:32Z USER 2007 [ModuleForkPass]: post_sched finished after 4.715 seconds +2025-09-05T19:19:32Z INFO 2007 [ModuleForkPass]: curr_vmrss: 2468mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:32Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:32Z USER 2007 [ModuleForkPass]: Running expand_scheduling_units +2025-09-05T19:19:32Z INFO 2007 [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:32Z USER 2007 [ModuleForkPass]: expand_scheduling_units finished after 0.027 seconds +2025-09-05T19:19:32Z INFO 2007 [ModuleForkPass]: curr_vmrss: 2218mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:32Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:32Z USER 2007 [ModuleForkPass]: Running address_rotation_sb +2025-09-05T19:19:32Z INFO 2007 [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:34Z INFO 2007 [DMAOptimizationBase]: PSUM Rotation rotated 9197 PSUM Banks +2025-09-05T19:19:34Z INFO 2007 [DMAOptimizationBase]: PSUM Rotation rotated 5583 PSUM Banks +2025-09-05T19:19:35Z INFO 2007 [DMAOptimizationBase]: PSUM Rotation rotated 0 PSUM Banks +2025-09-05T19:19:35Z INFO 2007 [DMAOptimizationBase]: SB Rotation rotated 2554 Sb address +2025-09-05T19:19:36Z INFO 2007 [DMAOptimizationBase]: SB Rotation rotated 2329 Sb address +2025-09-05T19:19:36Z INFO 2007 [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-09-05T19:19:36Z INFO 2007 [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-09-05T19:19:37Z INFO 2007 [DMAOptimizationBase]: SB Rotation rotated 59 Sb address +2025-09-05T19:19:37Z INFO 2007 [DMAOptimizationBase]: moved 0 MM forward +2025-09-05T19:19:37Z INFO 2007 [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-09-05T19:19:38Z INFO 2007 [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-09-05T19:19:38Z USER 2007 [ModuleForkPass]: address_rotation_sb finished after 5.695 seconds +2025-09-05T19:19:38Z INFO 2007 [ModuleForkPass]: curr_vmrss: 2254mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:38Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:38Z USER 2007 [ModuleForkPass]: Running anti_dependency_analyzer +2025-09-05T19:19:38Z INFO 2007 [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:38Z INFO 2007 [AntiDependencyAnalyzer]: Batch size: 1000 +2025-09-05T19:19:38Z INFO 2007 [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-09-05T19:19:38Z INFO 2007 [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-09-05T19:19:39Z USER 2007 [ModuleForkPass]: anti_dependency_analyzer finished after 1.349 seconds +2025-09-05T19:19:39Z INFO 2007 [ModuleForkPass]: curr_vmrss: 2285mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:39Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:39Z USER 2007 [ModuleForkPass]: Running anti_dependency_analyzer +2025-09-05T19:19:39Z INFO 2007 [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:39Z INFO 2007 [AntiDependencyAnalyzer]: Batch size: 1000 +2025-09-05T19:19:39Z INFO 2007 [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-09-05T19:19:39Z INFO 2007 [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-09-05T19:19:39Z USER 2007 [ModuleForkPass]: anti_dependency_analyzer finished after 0.188 seconds +2025-09-05T19:19:39Z INFO 2007 [ModuleForkPass]: curr_vmrss: 2287mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:39Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:39Z USER 2007 [ModuleForkPass]: Running dep_opt +2025-09-05T19:19:39Z INFO 2007 [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:39Z INFO 2007 [build_flow_deps]: Start build fdeps. Invocation: 3Fri Sep 5 19:19:39 2025 +2025-09-05T19:19:39Z INFO 2007 [build_flow_deps]: Allocs: 69132 instructions: 285701 +2025-09-05T19:19:40Z INFO 2007 [build_flow_deps]: Build fdeps inserted 701433 edges +2025-09-05T19:19:40Z INFO 2007 [build_flow_deps]: Done build fdeps 701433 Fri Sep 5 19:19:40 2025 +2025-09-05T19:19:41Z USER 2007 [ModuleForkPass]: dep_opt finished after 1.572 seconds +2025-09-05T19:19:41Z INFO 2007 [ModuleForkPass]: curr_vmrss: 2248mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:41Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:41Z USER 2007 [ModuleForkPass]: Running report_stats +2025-09-05T19:19:41Z INFO 2007 [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:41Z INFO 2007 [ReportStats]: Data Movement Statistics: sg0000 +┌─────────────┬────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼────────────────────────────┼───────┼────────────┤ +│ Load │ Const -> Internal │ 1 │ 32768 │ +│ Load │ ExternalInput -> Internal │ 7105 │ 7114072064 │ +│ Save │ Internal -> ExternalOutput │ 7105 │ 7114072064 │ +└─────────────┴────────────────────────────┴───────┴────────────┘ + +2025-09-05T19:19:41Z INFO 2007 [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 64 │ 65 │ +│ 256 │ 66 │ +│ 7168 │ 4096 │ +│ 8192 │ 9984 │ +└─────────────────────┴───────┘ + +2025-09-05T19:19:41Z INFO 2007 [ReportStats]: MM Stats: #MatMults 217153 #MatMult-Transposes 217153 +2025-09-05T19:19:41Z INFO 2007 [ReportStats]: IO Tensor size combined: 14496579584 +2025-09-05T19:19:41Z INFO 2007 [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ output289 │ ExternalOutput │ bfloat16 │ 134217728 │ +│ input0 │ ExternalInput │ bfloat16 │ 134217728 │ +│ output0 │ ExternalOutput │ bfloat16 │ 134217728 │ +│ input289 │ ExternalInput │ bfloat16 │ 134217728 │ +│ input7 │ ExternalInput │ bfloat16 │ 58720256 │ +│ input9 │ ExternalInput │ bfloat16 │ 58720256 │ +│ input24 │ ExternalInput │ bfloat16 │ 58720256 │ +│ input18 │ ExternalInput │ bfloat16 │ 58720256 │ +│ input15 │ ExternalInput │ bfloat16 │ 58720256 │ +│ input27 │ ExternalInput │ bfloat16 │ 58720256 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-09-05T19:19:41Z INFO 2007 [ReportStats]: Large (Internal) Tensor Statistics: +┌────────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────────────┼──────────┼──────────┼──────────────┤ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ t2035_pftranspose_18320_i5 │ Internal │ bfloat16 │ 1048576 │ +│ t2035_pftranspose_18320_i2 │ Internal │ bfloat16 │ 1048576 │ +│ t2035_pftranspose_18320_i1 │ Internal │ bfloat16 │ 1048576 │ +│ t2035_pftranspose_18320_i3 │ Internal │ bfloat16 │ 1048576 │ +│ t2035_pftranspose_18320_i4 │ Internal │ bfloat16 │ 1048576 │ +│ t2035_pftranspose_18320_i6 │ Internal │ bfloat16 │ 1048576 │ +│ t2035_pftranspose_18320_i9 │ Internal │ bfloat16 │ 1048576 │ +│ t2035_pftranspose_18320_i8 │ Internal │ bfloat16 │ 1048576 │ +│ t2035_pftranspose_18320_i7 │ Internal │ bfloat16 │ 1048576 │ +└────────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-09-05T19:19:41Z USER 2007 [ModuleForkPass]: report_stats finished after 0.059 seconds +2025-09-05T19:19:41Z INFO 2007 [ModuleForkPass]: curr_vmrss: 2248mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:41Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:41Z USER 2007 [BackendPassManager]: mod_parallel_pass finished after 28.058 seconds +2025-09-05T19:19:41Z INFO 2007 [BackendPassManager]: curr_vmrss: 2248mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:41Z INFO 2007 [BackendPassManager]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:41Z USER 2007 [BackendPassManager]: Running assign_trigger_engine +2025-09-05T19:19:41Z INFO 2007 [BackendPassManager]: Inputs to assign_trigger_engine: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:41Z INFO 2007 [AssignTriggerEngine]: Assigned trigger engine for 0 DMA instructions. Moved 0 DMA instructions to CC's engines. +2025-09-05T19:19:41Z USER 2007 [BackendPassManager]: assign_trigger_engine finished after 0.097 seconds +2025-09-05T19:19:41Z INFO 2007 [BackendPassManager]: curr_vmrss: 2248mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:41Z INFO 2007 [BackendPassManager]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:41Z USER 2007 [BackendPassManager]: Running subgraph_parallel_pass +2025-09-05T19:19:41Z INFO 2007 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:41Z USER 2007 [SubgraphForkPass]: Running lower_local_collectives +2025-09-05T19:19:41Z INFO 2007 [SubgraphForkPass]: Inputs to lower_local_collectives: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:41Z USER 2007 [SubgraphForkPass]: lower_local_collectives finished after 0.004 seconds +2025-09-05T19:19:41Z INFO 2007 [SubgraphForkPass]: curr_vmrss: 2248mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:41Z INFO 2007 [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:41Z USER 2007 [SubgraphForkPass]: Running extend_shared_lifetimes +2025-09-05T19:19:41Z INFO 2007 [SubgraphForkPass]: Inputs to extend_shared_lifetimes: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:41Z USER 2007 [SubgraphForkPass]: extend_shared_lifetimes finished after 0.003 seconds +2025-09-05T19:19:41Z INFO 2007 [SubgraphForkPass]: curr_vmrss: 2248mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:41Z INFO 2007 [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:41Z USER 2007 [SubgraphForkPass]: Running dead_code_elim +2025-09-05T19:19:41Z INFO 2007 [SubgraphForkPass]: Inputs to dead_code_elim: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:41Z INFO 2007 [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-09-05T19:19:41Z USER 2007 [SubgraphForkPass]: dead_code_elim finished after 0.245 seconds +2025-09-05T19:19:41Z INFO 2007 [SubgraphForkPass]: curr_vmrss: 2249mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:41Z INFO 2007 [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:41Z USER 2007 [BackendPassManager]: subgraph_parallel_pass finished after 0.266 seconds +2025-09-05T19:19:41Z INFO 2007 [BackendPassManager]: curr_vmrss: 2249mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:41Z INFO 2007 [BackendPassManager]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:41Z USER 2007 [BackendPassManager]: Running assign_hwdge_engine +2025-09-05T19:19:41Z INFO 2007 [BackendPassManager]: Inputs to assign_hwdge_engine: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:41Z USER 2007 [BackendPassManager]: assign_hwdge_engine finished after 0.027 seconds +2025-09-05T19:19:41Z INFO 2007 [BackendPassManager]: curr_vmrss: 2249mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:41Z INFO 2007 [BackendPassManager]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:41Z USER 2007 [BackendPassManager]: Running mod_parallel_pass +2025-09-05T19:19:41Z INFO 2007 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:41Z USER 2007 [ModuleForkPass]: Running alloc_queues +2025-09-05T19:19:41Z INFO 2007 [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:41Z INFO 2007 [AllocQueues]: DMACopy transpose will be triggered from multiple engines +2025-09-05T19:19:41Z INFO 2007 [AllocQueues]: Alloc Queue info: +┌─────────────────┬────────────────┬────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├─────────────────┼────────────────┼────────┼────────────┼──────────────────┤ +│ qSPSpillReload0 │ data │ SP │ 16 │ 1 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 14210 │ +└─────────────────┴────────────────┴────────┴────────────┴──────────────────┘ + +2025-09-05T19:19:41Z USER 2007 [ModuleForkPass]: alloc_queues finished after 0.030 seconds +2025-09-05T19:19:41Z INFO 2007 [ModuleForkPass]: curr_vmrss: 2249mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:41Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:41Z USER 2007 [ModuleForkPass]: Running chain_dma_transposes +2025-09-05T19:19:41Z INFO 2007 [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:41Z USER 2007 [ModuleForkPass]: chain_dma_transposes finished after 0.003 seconds +2025-09-05T19:19:41Z INFO 2007 [ModuleForkPass]: curr_vmrss: 2249mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:41Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:41Z USER 2007 [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-09-05T19:19:41Z INFO 2007 [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:41Z USER 2007 [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.003 seconds +2025-09-05T19:19:41Z INFO 2007 [ModuleForkPass]: curr_vmrss: 2249mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:41Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:41Z USER 2007 [ModuleForkPass]: Running lower_control +2025-09-05T19:19:41Z INFO 2007 [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:41Z INFO 2007 [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-09-05T19:19:41Z USER 2007 [ModuleForkPass]: lower_control finished after 0.197 seconds +2025-09-05T19:19:41Z INFO 2007 [ModuleForkPass]: curr_vmrss: 2249mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:41Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:41Z USER 2007 [BackendPassManager]: mod_parallel_pass finished after 0.252 seconds +2025-09-05T19:19:41Z INFO 2007 [BackendPassManager]: curr_vmrss: 2249mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:41Z INFO 2007 [BackendPassManager]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:41Z USER 2007 [BackendPassManager]: Running nc_parallel_pass +2025-09-05T19:19:41Z INFO 2007 [BackendPassManager]: Inputs to nc_parallel_pass: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:41Z USER 2007 [CoreForkPass]: Running dep_reduction +2025-09-05T19:19:41Z INFO 2007 [CoreForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:41Z INFO 2007 [DepReduction]: Start Dependency Reduction +2025-09-05T19:19:42Z INFO 2007 [DepReduction]: Processing async instrs... +2025-09-05T19:19:42Z INFO 2007 [DepReduction]: Processing secondary edges per engine... +2025-09-05T19:19:42Z INFO 2007 [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 484672 +2025-09-05T19:19:42Z INFO 2007 [DepReduction]: Processing redundant descendants, Done. Num edges removed 497220 +2025-09-05T19:19:42Z INFO 2007 [DepReduction]: Processing async instrs, Done. Num edges removed 497220 +2025-09-05T19:19:43Z INFO 2007 [DepReduction]: Num Async removed: 0 +2025-09-05T19:19:43Z INFO 2007 [DepReduction]: Finished dependency reduction: 1173793 removed, new total 115343 +2025-09-05T19:19:43Z INFO 2007 [DepReduction]: Finished Dependency Reduction +2025-09-05T19:19:43Z USER 2007 [CoreForkPass]: dep_reduction finished after 1.714 seconds +2025-09-05T19:19:43Z INFO 2007 [CoreForkPass]: curr_vmrss: 2263mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:43Z INFO 2007 [CoreForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:43Z USER 2007 [CoreForkPass]: Running lower_dynamic_dma +2025-09-05T19:19:43Z INFO 2007 [CoreForkPass]: Inputs to lower_dynamic_dma: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:43Z USER 2007 [CoreForkPass]: lower_dynamic_dma finished after 0.048 seconds +2025-09-05T19:19:43Z INFO 2007 [CoreForkPass]: curr_vmrss: 2263mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:43Z INFO 2007 [CoreForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:43Z USER 2007 [CoreForkPass]: Running legalize_dynamic_dma +2025-09-05T19:19:43Z INFO 2007 [CoreForkPass]: Inputs to legalize_dynamic_dma: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:43Z INFO 2007 [LegalizeDynamicDMA]: Legalize Dynamic DMA scanned 0 DGE instructions +2025-09-05T19:19:43Z INFO 2007 [LegalizeDynamicDMA]: After Legalize Dynamic DMA, 0 DGE instructions were scanned +2025-09-05T19:19:43Z INFO 2007 [LegalizeDynamicDMA]: +┌───────────┬───────────────────────────────┬────────────────────────────┐ +│ Sub-Pass │ Illegal Instructions Detected │ New Instructions Generated │ +├───────────┼───────────────────────────────┼────────────────────────────┤ +│ Peeling │ 0 │ 0 │ +│ Unrolling │ 0 │ 0 │ +│ Splitting │ 0 │ 0 │ +└───────────┴───────────────────────────────┴────────────────────────────┘ + +2025-09-05T19:19:43Z USER 2007 [CoreForkPass]: legalize_dynamic_dma finished after 0.100 seconds +2025-09-05T19:19:43Z INFO 2007 [CoreForkPass]: curr_vmrss: 2263mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:43Z INFO 2007 [CoreForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285701 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:43Z USER 2007 [CoreForkPass]: Running lower_dma +2025-09-05T19:19:43Z INFO 2007 [CoreForkPass]: Inputs to lower_dma: modules=1 functions=1 allocs=69132 blocks=1 instructions=285701 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:43Z INFO 2007 [LowerDMA]: lower_dma metrics start + IO + Copy (DGE/DMA) + 128 partition : 14145/14145 (100% DGE) + power-of-2 partition : 14210/14210 (100% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 14210/14210 (100% DGE) + Cast (DGE/DMA) + 128 partition : 0/0 + power-of-2 partition : 0/0 + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/0 + Spill/Reload + Copy (DGE/DMA) + 128 partition : 0/1 (0% DGE) + power-of-2 partition : 0/1 (0% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/1 (0% DGE) + Cast (DGE/DMA) + 128 partition : 0/0 + power-of-2 partition : 0/0 + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/0 + CopyMode + CCE : 0 + Transpose : 0 + Replicate : 0 + Dynamic (DGE/DMA) + scalar : 0/0 + vector : 0/0 + Opcode + ReadVarAddr : 0 + IndirectLoad : 0 + IndirectSave : 0 + IndirectSaveAccumulate : 0 + DstReduceDGE : 0 +lower_dma metrics end +2025-09-05T19:19:43Z USER 2007 [CoreForkPass]: lower_dma finished after 0.145 seconds +2025-09-05T19:19:43Z INFO 2007 [CoreForkPass]: curr_vmrss: 2263mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:43Z INFO 2007 [CoreForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285710 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:43Z USER 2007 [CoreForkPass]: Running coalesce_dma_blocks +2025-09-05T19:19:43Z INFO 2007 [CoreForkPass]: Inputs to coalesce_dma_blocks: modules=1 functions=1 allocs=69132 blocks=1 instructions=285710 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:44Z INFO 2007 [CoalesceDmaBlocks]: Coaleseced 0 DMA triggers +2025-09-05T19:19:44Z USER 2007 [CoreForkPass]: coalesce_dma_blocks finished after 0.133 seconds +2025-09-05T19:19:44Z INFO 2007 [CoreForkPass]: curr_vmrss: 2264mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:44Z INFO 2007 [CoreForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285710 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:44Z USER 2007 [CoreForkPass]: Running expand_all_engine +2025-09-05T19:19:44Z INFO 2007 [CoreForkPass]: Inputs to expand_all_engine: modules=1 functions=1 allocs=69132 blocks=1 instructions=285710 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:44Z USER 2007 [CoreForkPass]: expand_all_engine finished after 0.050 seconds +2025-09-05T19:19:44Z INFO 2007 [CoreForkPass]: curr_vmrss: 2264mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:44Z INFO 2007 [CoreForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285710 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:44Z USER 2007 [CoreForkPass]: Running alloc_semaphores +2025-09-05T19:19:44Z INFO 2007 [CoreForkPass]: Inputs to alloc_semaphores: modules=1 functions=1 allocs=69132 blocks=1 instructions=285710 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:44Z USER 2007 [CoreForkPass]: alloc_semaphores finished after 0.242 seconds +2025-09-05T19:19:44Z INFO 2007 [CoreForkPass]: curr_vmrss: 2264mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:44Z INFO 2007 [CoreForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285710 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:44Z USER 2007 [CoreForkPass]: Running expand_inst_late +2025-09-05T19:19:44Z INFO 2007 [CoreForkPass]: Inputs to expand_inst_late: modules=1 functions=1 allocs=69132 blocks=1 instructions=285710 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:44Z USER 2007 [CoreForkPass]: expand_inst_late finished after 0.262 seconds +2025-09-05T19:19:44Z INFO 2007 [CoreForkPass]: curr_vmrss: 2264mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:44Z INFO 2007 [CoreForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285710 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:44Z USER 2007 [CoreForkPass]: Running seq_inst_opt +2025-09-05T19:19:44Z INFO 2007 [CoreForkPass]: Inputs to seq_inst_opt: modules=1 functions=1 allocs=69132 blocks=1 instructions=285710 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:44Z INFO 2007 [SeqInstOpt]: Removing 0 unnecessary InstRegisterMove instruction(s) from Block1 +2025-09-05T19:19:44Z USER 2007 [CoreForkPass]: seq_inst_opt finished after 0.027 seconds +2025-09-05T19:19:44Z INFO 2007 [CoreForkPass]: curr_vmrss: 2264mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:44Z INFO 2007 [CoreForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 285710 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:44Z USER 2007 [CoreForkPass]: Running lower_sync +2025-09-05T19:19:44Z INFO 2007 [CoreForkPass]: Inputs to lower_sync: modules=1 functions=1 allocs=69132 blocks=1 instructions=285710 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:44Z USER 2007 [CoreForkPass]: lower_sync finished after 0.102 seconds +2025-09-05T19:19:44Z INFO 2007 [CoreForkPass]: curr_vmrss: 2264mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:44Z INFO 2007 [CoreForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 300967 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:44Z USER 2007 [CoreForkPass]: Running lower_act +2025-09-05T19:19:44Z INFO 2007 [CoreForkPass]: Inputs to lower_act: modules=1 functions=1 allocs=69132 blocks=1 instructions=300967 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:44Z USER 2007 [CoreForkPass]: lower_act finished after 0.036 seconds +2025-09-05T19:19:44Z INFO 2007 [CoreForkPass]: curr_vmrss: 2264mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:44Z INFO 2007 [CoreForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 300968 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:44Z USER 2007 [CoreForkPass]: Running lower_dve +2025-09-05T19:19:44Z INFO 2007 [CoreForkPass]: Inputs to lower_dve: modules=1 functions=1 allocs=69132 blocks=1 instructions=300968 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:44Z INFO 2007 [LowerDVE]: Loading DVE opcodes table dve_info.json from /opt/conda/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen2/dve_info.json +2025-09-05T19:19:45Z USER 2007 [CoreForkPass]: lower_dve finished after 0.303 seconds +2025-09-05T19:19:45Z INFO 2007 [CoreForkPass]: curr_vmrss: 2296mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:45Z INFO 2007 [CoreForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 300968 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:45Z USER 2007 [CoreForkPass]: Running lower_ap +2025-09-05T19:19:45Z INFO 2007 [CoreForkPass]: Inputs to lower_ap: modules=1 functions=1 allocs=69132 blocks=1 instructions=300968 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:45Z USER 2007 [CoreForkPass]: lower_ap finished after 0.066 seconds +2025-09-05T19:19:45Z INFO 2007 [CoreForkPass]: curr_vmrss: 2177mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:45Z INFO 2007 [CoreForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 300968 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:45Z USER 2007 [CoreForkPass]: Running coloring_allocator_reg +2025-09-05T19:19:45Z INFO 2007 [CoreForkPass]: Inputs to coloring_allocator_reg: modules=1 functions=1 allocs=69132 blocks=1 instructions=300968 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:45Z INFO 2007 [ColoringAllocator::Rep]: Allocating functions +2025-09-05T19:19:45Z INFO 2007 [ColoringAllocator::Rep]: linearize and check +2025-09-05T19:19:45Z INFO 2007 [REG_Allocator]: allocating REG +2025-09-05T19:19:45Z INFO 2007 [REG_Allocator]: main loop iteration 1 +2025-09-05T19:19:45Z USER 2007 [CoreForkPass]: coloring_allocator_reg finished after 0.044 seconds +2025-09-05T19:19:45Z INFO 2007 [CoreForkPass]: curr_vmrss: 2190mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:45Z INFO 2007 [CoreForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 300968 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:45Z USER 2007 [BackendPassManager]: nc_parallel_pass finished after 3.389 seconds +2025-09-05T19:19:45Z INFO 2007 [BackendPassManager]: curr_vmrss: 2190mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:45Z INFO 2007 [BackendPassManager]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 300968 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:45Z USER 2007 [BackendPassManager]: Running mod_parallel_pass +2025-09-05T19:19:45Z INFO 2007 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=1 allocs=69132 blocks=1 instructions=300968 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:45Z USER 2007 [ModuleForkPass]: Running birverifier +2025-09-05T19:19:45Z INFO 2007 [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=69132 blocks=1 instructions=300968 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:45Z USER 2007 [ModuleForkPass]: birverifier finished after 0.254 seconds +2025-09-05T19:19:45Z INFO 2007 [ModuleForkPass]: curr_vmrss: 2190mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:45Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 300968 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:45Z USER 2007 [BackendPassManager]: mod_parallel_pass finished after 0.261 seconds +2025-09-05T19:19:45Z INFO 2007 [BackendPassManager]: curr_vmrss: 2190mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:45Z INFO 2007 [BackendPassManager]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 300968 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:45Z USER 2007 [BackendPassManager]: Running subgraph_parallel_pass +2025-09-05T19:19:45Z INFO 2007 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=1 functions=1 allocs=69132 blocks=1 instructions=300968 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:45Z USER 2007 [SubgraphForkPass]: Running lnc_verifier +2025-09-05T19:19:45Z INFO 2007 [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=69132 blocks=1 instructions=300968 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:45Z USER 2007 [SubgraphForkPass]: lnc_verifier finished after 0.003 seconds +2025-09-05T19:19:45Z INFO 2007 [SubgraphForkPass]: curr_vmrss: 2190mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:45Z INFO 2007 [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 300968 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:45Z USER 2007 [BackendPassManager]: subgraph_parallel_pass finished after 0.010 seconds +2025-09-05T19:19:45Z INFO 2007 [BackendPassManager]: curr_vmrss: 2190mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:45Z INFO 2007 [BackendPassManager]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 300968 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:45Z USER 2007 [BackendPassManager]: Running mod_parallel_pass +2025-09-05T19:19:45Z INFO 2007 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=1 allocs=69132 blocks=1 instructions=300968 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:45Z USER 2007 [ModuleForkPass]: Running codegen +2025-09-05T19:19:45Z INFO 2007 [ModuleForkPass]: Inputs to codegen: modules=1 functions=1 allocs=69132 blocks=1 instructions=300968 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:45Z INFO 2007 [Codegen]: Total compiler allocated DRAM tensors: 0 GB +2025-09-05T19:19:45Z INFO 2007 [Codegen]: Total un-allocated DRAM tensors by kind: +2025-09-05T19:19:45Z INFO 2007 [Codegen]: +┌───────────────┬─────────────┐ +│ TensorKind │ Size (GB) │ +├───────────────┼─────────────┤ +│ ExternalInput │ 6.7505 │ +│ Const │ 3.05176e-05 │ +└───────────────┴─────────────┘ + +2025-09-05T19:19:45Z INFO 2007 [Codegen]: Total runtime managed DRAM tensors: 6.75053 GB +2025-09-05T19:19:46Z INFO 2007 [Codegen]: Instruction Stats: +2025-09-05T19:19:46Z INFO 2007 [Codegen]: +┌─────────────────────┬────────┐ +│ Opcode │ Count │ +├─────────────────────┼────────┤ +│ LDWEIGHTS │ 217153 │ +│ MATMUL │ 217153 │ +│ ACTIVATE │ 54337 │ +│ EVENT_SEMAPHORE │ 15257 │ +│ UNKNOWN(0xd4) │ 14210 │ +│ NOP │ 8 │ +│ PSEUDO_BRANCH_LABEL │ 5 │ +│ ACT_TABLE_LOAD │ 1 │ +│ PSEUDO_DMA_TRIGGER │ 1 │ +└─────────────────────┴────────┘ + +2025-09-05T19:19:46Z INFO 2007 [Codegen]: +┌────────────┬────────┐ +│ Engine │ Count │ +├────────────┼────────┤ +│ Unassigned │ 0 │ +│ GPSIMD │ 20924 │ +│ Scalar │ 56947 │ +│ Tensor │ 440254 │ +│ SyncDMA │ 0 │ +│ Vector │ 2 │ +│ Sync │ 3 │ +│ All │ 0 │ +└────────────┴────────┘ + +2025-09-05T19:19:46Z INFO 2007 [Codegen]: Total instructions: 518130 (0.030883 GB) +2025-09-05T19:19:46Z INFO 2007 [Codegen]: Total DynamicDMA instruction count: 14210 +2025-09-05T19:19:46Z USER 2007 [Codegen]: isa_gen finished after 1.105 seconds +2025-09-05T19:19:46Z INFO 2007 [Codegen]: Number of DMA descriptors on each queue instance: +┌─────────────────┬────────────────┐ +│ Queue Instance │ RT Descriptors │ +├─────────────────┼────────────────┤ +│ qSPSpillReload0 │ 256 │ +└─────────────────┴────────────────┘ + +Total descriptors: 256 (3.8147e-06 GB) +2025-09-05T19:19:46Z INFO 2007 [Codegen]: Number of DMA engines used by each queue: +┌─────────────────┬─────────────────────┐ +│ Queue │ DMA Engines │ +├─────────────────┼─────────────────────┤ +│ qSPSpillReload0 │ 16 │ +│ qPoolDynamic │ 16 │ +├─────────────────┼─────────────────────┤ +│ TOTAL │ 32 (must be <= 176) │ +└─────────────────┴─────────────────────┘ + +2025-09-05T19:19:46Z INFO 2007 [Codegen]: Tensors with largest descriptor count: +┌──────────────────────┬──────────┬──────────┬──────────────────┐ +│ Tensor Name │ Kind │ Src Type │ Descriptor Count │ +├──────────────────────┼──────────┼──────────┼──────────────────┤ +│ identity_local_22027 │ Internal │ bfloat16 │ 1 │ +│ identity_22025 │ Const │ bfloat16 │ 1 │ +└──────────────────────┴──────────┴──────────┴──────────────────┘ + +2025-09-05T19:19:46Z USER 2007 [Codegen]: dma_desc_gen finished after 0.004 seconds +2025-09-05T19:19:46Z INFO 2007 [Codegen]: Estimated peak DRAM usage: 6.78141 GB +2025-09-05T19:19:46Z INFO 2007 [Codegen]: Generating debug info +2025-09-05T19:19:47Z USER 2007 [Codegen]: debug_info_gen finished after 0.568 seconds +2025-09-05T19:19:47Z USER 2007 [ModuleForkPass]: codegen finished after 1.742 seconds +2025-09-05T19:19:47Z INFO 2007 [ModuleForkPass]: curr_vmrss: 2385mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:47Z INFO 2007 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 300968 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:47Z USER 2007 [BackendPassManager]: mod_parallel_pass finished after 1.775 seconds +2025-09-05T19:19:47Z INFO 2007 [BackendPassManager]: curr_vmrss: 2204mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:47Z INFO 2007 [BackendPassManager]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 300968 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:47Z USER 2007 [BackendPassManager]: Running neff_packager +2025-09-05T19:19:47Z INFO 2007 [BackendPassManager]: Inputs to neff_packager: modules=1 functions=1 allocs=69132 blocks=1 instructions=300968 Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:47Z WARNING 2007 [NeffFileWriter]: writeKelp missing file /local/p4clients/pkgbuild-const/workspace/build/KaenaCompiler/KaenaCompiler-2.x.169490.0/AL2_x86_64/DEV.STD.PTHREAD/build/private/_skbuild/linux-x86_64-3.10/cmake-build/neuronxcc/walrus/neff_packager/MetricMetadata.json +2025-09-05T19:19:47Z INFO 2007 [NeffFileWriter]: Neff will be written to: /models/mistral-7b-v0.3-instruct-neuronx/layout_opt/graph.neff +2025-09-05T19:19:47Z INFO 2007 [NeffFileWriter]: IR signature: b43fb3a95505587f066b6d2d419d7150 for neff artifacts +2025-09-05T19:19:47Z USER 2007 [BackendPassManager]: neff_packager finished after 0.299 seconds +2025-09-05T19:19:47Z INFO 2007 [BackendPassManager]: curr_vmrss: 2205mb, ru_maxrss: 2541mb (delta=0mb) +2025-09-05T19:19:47Z INFO 2007 [BackendPassManager]: Output has 1 module(s), 1 function(s), 69132 memory location(s), 1 block(s), and 300968 instruction(s). Max writers: 128 Max Readers: 217153 +2025-09-05T19:19:47Z INFO 2007 [BackendDriver]: HBM scratchpad usage summary (post-allocation): +┌──────┬───────────┬────────────────────────────────────────────────────────────┬─────────────┐ +│ Core │ Subgraph │ Description │ Value │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ nc00 │ module │ Peak scratchpad usage: local │ 0.000000 GB │ +│ nc00 │ module │ Total size of allocated tensors: local │ 0.000000 GB │ +│ nc00 │ Max │ Peak scratchpad usage: local │ 0.000000 GB │ +│ nc00 │ Post-link │ Peak scratchpad usage after intermediate tensor allocation │ 0.000000 GB │ +│ nc00 │ Post-link │ Total size of allocated intermediate tensors │ 0.000000 GB │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ Max │ Max │ Peak scratchpad usage │ 0.000000 GB │ +│ Max │ Max │ Peak scratchpad usage (page-aligned) │ 0.000000 GB │ +└──────┴───────────┴────────────────────────────────────────────────────────────┴─────────────┘ + +2025-09-05T19:19:47Z INFO 2007 [BackendDriver]: Backend completed successfully, tearing down. +2025-09-05T19:19:48Z INFO 1971 [job.WalrusDriver.0]: Job #0 finished +2025-09-05T19:19:48Z INFO 1971 [pipeline.Pipeline.0]: Finished job job.WalrusDriver.0 +2025-09-05T19:19:48Z INFO 1971 [pipeline.Pipeline.0]: Starting job job.BIRLinker.0 +2025-09-05T19:19:48Z INFO 1971 [job.BIRLinker.0]: Replay this job by calling: /opt/conda/bin/neuronx-cc compile --framework XLA --state '{"model": ["/models/mistral-7b-v0.3-instruct-neuronx/layout_opt/model/graph.hlo"], "tensormap": "tensor_map.json", "bir": "bir.json", "lorean_sg_key": null, "input_name_map": null, "output_name_map": null, "constant_tensors": null, "state_dir": "/opt/vllm/neuronxcc-gss0wi_w/sg00", "state_id": "sg00"}' --pipeline BIRLinker +2025-09-05T19:19:48Z INFO 1971 [job.BIRLinker.0]: BIRLinker cwd: /opt/vllm/neuronxcc-gss0wi_w +2025-09-05T19:19:48Z INFO 1971 [job.BIRLinker.0]: Linking not needed. Netlist doesnt exist +2025-09-05T19:19:48Z INFO 1971 [pipeline.Pipeline.0]: Finished job job.BIRLinker.0 +2025-09-05T19:19:48Z INFO 1971 [pipeline.Pipeline.0]: Starting job job.Kelper.0 +2025-09-05T19:19:48Z INFO 1971 [job.Kelper.0]: Skipping neff generation which was already performed by neff_packager +2025-09-05T19:19:48Z INFO 1971 [pipeline.Pipeline.0]: Finished job job.Kelper.0 +2025-09-05T19:19:48Z INFO 1971 [pipeline.Pipeline.0]: Starting job job.NeffWrapper.0 +2025-09-05T19:19:48Z INFO 1971 [job.NeffWrapper.0]: Job NeffWrapper len(in_states) 1 +2025-09-05T19:19:48Z INFO 1971 [job.NeffWrapper.0]: Processing input #0 +2025-09-05T19:19:48Z INFO 1971 [job.NeffWrapper.0]: Start NeffWrapper +2025-09-05T19:19:48Z INFO 1971 [job.NeffWrapper.0]: Executing: /opt/conda/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo-neff-wrapper --hlo /models/mistral-7b-v0.3-instruct-neuronx/layout_opt/model/graph.hlo --neff /models/mistral-7b-v0.3-instruct-neuronx/layout_opt/graph.neff --io_transposes /opt/vllm/neuronxcc-gss0wi_w/io_transposes.json --output /models/mistral-7b-v0.3-instruct-neuronx/layout_opt/wrapped_neff.hlo --netlist /opt/vllm/neuronxcc-gss0wi_w/hlo_netlist.json +2025-09-05T19:19:48Z INFO 1971 [job.NeffWrapper.0]: Could not open file: /opt/vllm/neuronxcc-gss0wi_w/hlo_netlist.json +There are no io transposes nor zero-sized parameters. Output will not be produced. +Hlo neff wrapper finished successfully. Have a wonderful day :D + +2025-09-05T19:19:48Z INFO 1971 [job.NeffWrapper.0]: Job #0 finished +2025-09-05T19:19:48Z INFO 1971 [pipeline.Pipeline.0]: Finished job job.NeffWrapper.0 +2025-09-05T19:19:48Z INFO 1971 [pipeline.Pipeline.0]: Finished pipeline Pipeline +2025-09-05T19:19:48Z INFO 1971 [pipeline.Pipeline.0]: Job #0 finished +2025-09-05T19:19:48Z INFO 1939 [root]: Subcommand returned with exitcode=0 diff --git a/layout_opt/metaneff b/layout_opt/metaneff new file mode 100644 index 0000000000000000000000000000000000000000..77136fa6c45af3c1ac5241d311f9dc94b9682dbb --- /dev/null +++ b/layout_opt/metaneff @@ -0,0 +1,874 @@ + +( +input02embed_tokens.weight8 +; +input1 2'layers.0.self_attn.o_proj.o_proj.weight8 += +input2 2)layers.0.self_attn.qkv_proj.v_proj.weight8 +1 +input3 2layers.0.input_layernorm.weight8 += +input4 2)layers.0.self_attn.qkv_proj.k_proj.weight8 += +input5 2)layers.0.self_attn.qkv_proj.q_proj.weight8 +1 +input6 82layers.0.mlp.down_proj.weight8 +/ +input78 2layers.0.mlp.up_proj.weight8 +: +input8 2(layers.0.post_attention_layernorm.weight8 +1 +input98 2layers.0.mlp.gate_proj.weight8 +< +input10 2'layers.1.self_attn.o_proj.o_proj.weight8 +> +input11 2)layers.1.self_attn.qkv_proj.v_proj.weight8 +2 +input12 2layers.1.input_layernorm.weight8 +> +input13 2)layers.1.self_attn.qkv_proj.k_proj.weight8 +> +input14 2)layers.1.self_attn.qkv_proj.q_proj.weight8 +2 +input15 82layers.1.mlp.down_proj.weight8 +0 +input168 2layers.1.mlp.up_proj.weight8 +; +input17 2(layers.1.post_attention_layernorm.weight8 +2 +input188 2layers.1.mlp.gate_proj.weight8 +< +input19 2'layers.2.self_attn.o_proj.o_proj.weight8 +> +input20 2)layers.2.self_attn.qkv_proj.v_proj.weight8 +2 +input21 2layers.2.input_layernorm.weight8 +> +input22 2)layers.2.self_attn.qkv_proj.k_proj.weight8 +> +input23 2)layers.2.self_attn.qkv_proj.q_proj.weight8 +2 +input24 82layers.2.mlp.down_proj.weight8 +0 +input258 2layers.2.mlp.up_proj.weight8 +; +input26 2(layers.2.post_attention_layernorm.weight8 +2 +input278 2layers.2.mlp.gate_proj.weight8 +< +input28 2'layers.3.self_attn.o_proj.o_proj.weight8 +> +input29 2)layers.3.self_attn.qkv_proj.v_proj.weight8 +2 +input30 2layers.3.input_layernorm.weight8 +> +input31 2)layers.3.self_attn.qkv_proj.k_proj.weight8 +> +input32 2)layers.3.self_attn.qkv_proj.q_proj.weight8 +2 +input33 82layers.3.mlp.down_proj.weight8 +0 +input348 2layers.3.mlp.up_proj.weight8 +; +input35 2(layers.3.post_attention_layernorm.weight8 +2 +input368 2layers.3.mlp.gate_proj.weight8 +< +input37 2'layers.4.self_attn.o_proj.o_proj.weight8 +> +input38 2)layers.4.self_attn.qkv_proj.v_proj.weight8 +2 +input39 2layers.4.input_layernorm.weight8 +> +input40 2)layers.4.self_attn.qkv_proj.k_proj.weight8 +> +input41 2)layers.4.self_attn.qkv_proj.q_proj.weight8 +2 +input42 82layers.4.mlp.down_proj.weight8 +0 +input438 2layers.4.mlp.up_proj.weight8 +; +input44 2(layers.4.post_attention_layernorm.weight8 +2 +input458 2layers.4.mlp.gate_proj.weight8 +< +input46 2'layers.5.self_attn.o_proj.o_proj.weight8 +> +input47 2)layers.5.self_attn.qkv_proj.v_proj.weight8 +2 +input48 2layers.5.input_layernorm.weight8 +> +input49 2)layers.5.self_attn.qkv_proj.k_proj.weight8 +> +input50 2)layers.5.self_attn.qkv_proj.q_proj.weight8 +2 +input51 82layers.5.mlp.down_proj.weight8 +0 +input528 2layers.5.mlp.up_proj.weight8 +; +input53 2(layers.5.post_attention_layernorm.weight8 +2 +input548 2layers.5.mlp.gate_proj.weight8 +< +input55 2'layers.6.self_attn.o_proj.o_proj.weight8 +> +input56 2)layers.6.self_attn.qkv_proj.v_proj.weight8 +2 +input57 2layers.6.input_layernorm.weight8 +> +input58 2)layers.6.self_attn.qkv_proj.k_proj.weight8 +> +input59 2)layers.6.self_attn.qkv_proj.q_proj.weight8 +2 +input60 82layers.6.mlp.down_proj.weight8 +0 +input618 2layers.6.mlp.up_proj.weight8 +; +input62 2(layers.6.post_attention_layernorm.weight8 +2 +input638 2layers.6.mlp.gate_proj.weight8 +< +input64 2'layers.7.self_attn.o_proj.o_proj.weight8 +> +input65 2)layers.7.self_attn.qkv_proj.v_proj.weight8 +2 +input66 2layers.7.input_layernorm.weight8 +> +input67 2)layers.7.self_attn.qkv_proj.k_proj.weight8 +> +input68 2)layers.7.self_attn.qkv_proj.q_proj.weight8 +2 +input69 82layers.7.mlp.down_proj.weight8 +0 +input708 2layers.7.mlp.up_proj.weight8 +; +input71 2(layers.7.post_attention_layernorm.weight8 +2 +input728 2layers.7.mlp.gate_proj.weight8 +< +input73 2'layers.8.self_attn.o_proj.o_proj.weight8 +> +input74 2)layers.8.self_attn.qkv_proj.v_proj.weight8 +2 +input75 2layers.8.input_layernorm.weight8 +> +input76 2)layers.8.self_attn.qkv_proj.k_proj.weight8 +> +input77 2)layers.8.self_attn.qkv_proj.q_proj.weight8 +2 +input78 82layers.8.mlp.down_proj.weight8 +0 +input798 2layers.8.mlp.up_proj.weight8 +; +input80 2(layers.8.post_attention_layernorm.weight8 +2 +input818 2layers.8.mlp.gate_proj.weight8 +< +input82 2'layers.9.self_attn.o_proj.o_proj.weight8 +> +input83 2)layers.9.self_attn.qkv_proj.v_proj.weight8 +2 +input84 2layers.9.input_layernorm.weight8 +> +input85 2)layers.9.self_attn.qkv_proj.k_proj.weight8 +> +input86 2)layers.9.self_attn.qkv_proj.q_proj.weight8 +2 +input87 82layers.9.mlp.down_proj.weight8 +0 +input888 2layers.9.mlp.up_proj.weight8 +; +input89 2(layers.9.post_attention_layernorm.weight8 +2 +input908 2layers.9.mlp.gate_proj.weight8 += +input91 2(layers.10.self_attn.o_proj.o_proj.weight8 +? +input92 2*layers.10.self_attn.qkv_proj.v_proj.weight8 +3 +input93 2 layers.10.input_layernorm.weight8 +? +input94 2*layers.10.self_attn.qkv_proj.k_proj.weight8 +? +input95 2*layers.10.self_attn.qkv_proj.q_proj.weight8 +3 +input96 82layers.10.mlp.down_proj.weight8 +1 +input978 2layers.10.mlp.up_proj.weight8 +< +input98 2)layers.10.post_attention_layernorm.weight8 +3 +input998 2layers.10.mlp.gate_proj.weight8 +> +input100 2(layers.11.self_attn.o_proj.o_proj.weight8 +@ +input101 2*layers.11.self_attn.qkv_proj.v_proj.weight8 +4 +input102 2 layers.11.input_layernorm.weight8 +@ +input103 2*layers.11.self_attn.qkv_proj.k_proj.weight8 +@ +input104 2*layers.11.self_attn.qkv_proj.q_proj.weight8 +4 +input105 82layers.11.mlp.down_proj.weight8 +2 +input1068 2layers.11.mlp.up_proj.weight8 += +input107 2)layers.11.post_attention_layernorm.weight8 +4 +input1088 2layers.11.mlp.gate_proj.weight8 +> +input109 2(layers.12.self_attn.o_proj.o_proj.weight8 +@ +input110 2*layers.12.self_attn.qkv_proj.v_proj.weight8 +4 +input111 2 layers.12.input_layernorm.weight8 +@ +input112 2*layers.12.self_attn.qkv_proj.k_proj.weight8 +@ +input113 2*layers.12.self_attn.qkv_proj.q_proj.weight8 +4 +input114 82layers.12.mlp.down_proj.weight8 +2 +input1158 2layers.12.mlp.up_proj.weight8 += +input116 2)layers.12.post_attention_layernorm.weight8 +4 +input1178 2layers.12.mlp.gate_proj.weight8 +> +input118 2(layers.13.self_attn.o_proj.o_proj.weight8 +@ +input119 2*layers.13.self_attn.qkv_proj.v_proj.weight8 +4 +input120 2 layers.13.input_layernorm.weight8 +@ +input121 2*layers.13.self_attn.qkv_proj.k_proj.weight8 +@ +input122 2*layers.13.self_attn.qkv_proj.q_proj.weight8 +4 +input123 82layers.13.mlp.down_proj.weight8 +2 +input1248 2layers.13.mlp.up_proj.weight8 += +input125 2)layers.13.post_attention_layernorm.weight8 +4 +input1268 2layers.13.mlp.gate_proj.weight8 +> +input127 2(layers.14.self_attn.o_proj.o_proj.weight8 +@ +input128 2*layers.14.self_attn.qkv_proj.v_proj.weight8 +4 +input129 2 layers.14.input_layernorm.weight8 +@ +input130 2*layers.14.self_attn.qkv_proj.k_proj.weight8 +@ +input131 2*layers.14.self_attn.qkv_proj.q_proj.weight8 +4 +input132 82layers.14.mlp.down_proj.weight8 +2 +input1338 2layers.14.mlp.up_proj.weight8 += +input134 2)layers.14.post_attention_layernorm.weight8 +4 +input1358 2layers.14.mlp.gate_proj.weight8 +> +input136 2(layers.15.self_attn.o_proj.o_proj.weight8 +@ +input137 2*layers.15.self_attn.qkv_proj.v_proj.weight8 +4 +input138 2 layers.15.input_layernorm.weight8 +@ +input139 2*layers.15.self_attn.qkv_proj.k_proj.weight8 +@ +input140 2*layers.15.self_attn.qkv_proj.q_proj.weight8 +4 +input141 82layers.15.mlp.down_proj.weight8 +2 +input1428 2layers.15.mlp.up_proj.weight8 += +input143 2)layers.15.post_attention_layernorm.weight8 +4 +input1448 2layers.15.mlp.gate_proj.weight8 +> +input145 2(layers.16.self_attn.o_proj.o_proj.weight8 +@ +input146 2*layers.16.self_attn.qkv_proj.v_proj.weight8 +4 +input147 2 layers.16.input_layernorm.weight8 +@ +input148 2*layers.16.self_attn.qkv_proj.k_proj.weight8 +@ +input149 2*layers.16.self_attn.qkv_proj.q_proj.weight8 +4 +input150 82layers.16.mlp.down_proj.weight8 +2 +input1518 2layers.16.mlp.up_proj.weight8 += +input152 2)layers.16.post_attention_layernorm.weight8 +4 +input1538 2layers.16.mlp.gate_proj.weight8 +> +input154 2(layers.17.self_attn.o_proj.o_proj.weight8 +@ +input155 2*layers.17.self_attn.qkv_proj.v_proj.weight8 +4 +input156 2 layers.17.input_layernorm.weight8 +@ +input157 2*layers.17.self_attn.qkv_proj.k_proj.weight8 +@ +input158 2*layers.17.self_attn.qkv_proj.q_proj.weight8 +4 +input159 82layers.17.mlp.down_proj.weight8 +2 +input1608 2layers.17.mlp.up_proj.weight8 += +input161 2)layers.17.post_attention_layernorm.weight8 +4 +input1628 2layers.17.mlp.gate_proj.weight8 +> +input163 2(layers.18.self_attn.o_proj.o_proj.weight8 +@ +input164 2*layers.18.self_attn.qkv_proj.v_proj.weight8 +4 +input165 2 layers.18.input_layernorm.weight8 +@ +input166 2*layers.18.self_attn.qkv_proj.k_proj.weight8 +@ +input167 2*layers.18.self_attn.qkv_proj.q_proj.weight8 +4 +input168 82layers.18.mlp.down_proj.weight8 +2 +input1698 2layers.18.mlp.up_proj.weight8 += +input170 2)layers.18.post_attention_layernorm.weight8 +4 +input1718 2layers.18.mlp.gate_proj.weight8 +> +input172 2(layers.19.self_attn.o_proj.o_proj.weight8 +@ +input173 2*layers.19.self_attn.qkv_proj.v_proj.weight8 +4 +input174 2 layers.19.input_layernorm.weight8 +@ +input175 2*layers.19.self_attn.qkv_proj.k_proj.weight8 +@ +input176 2*layers.19.self_attn.qkv_proj.q_proj.weight8 +4 +input177 82layers.19.mlp.down_proj.weight8 +2 +input1788 2layers.19.mlp.up_proj.weight8 += +input179 2)layers.19.post_attention_layernorm.weight8 +4 +input1808 2layers.19.mlp.gate_proj.weight8 +> +input181 2(layers.20.self_attn.o_proj.o_proj.weight8 +@ +input182 2*layers.20.self_attn.qkv_proj.v_proj.weight8 +4 +input183 2 layers.20.input_layernorm.weight8 +@ +input184 2*layers.20.self_attn.qkv_proj.k_proj.weight8 +@ +input185 2*layers.20.self_attn.qkv_proj.q_proj.weight8 +4 +input186 82layers.20.mlp.down_proj.weight8 +2 +input1878 2layers.20.mlp.up_proj.weight8 += +input188 2)layers.20.post_attention_layernorm.weight8 +4 +input1898 2layers.20.mlp.gate_proj.weight8 +> +input190 2(layers.21.self_attn.o_proj.o_proj.weight8 +@ +input191 2*layers.21.self_attn.qkv_proj.v_proj.weight8 +4 +input192 2 layers.21.input_layernorm.weight8 +@ +input193 2*layers.21.self_attn.qkv_proj.k_proj.weight8 +@ +input194 2*layers.21.self_attn.qkv_proj.q_proj.weight8 +4 +input195 82layers.21.mlp.down_proj.weight8 +2 +input1968 2layers.21.mlp.up_proj.weight8 += +input197 2)layers.21.post_attention_layernorm.weight8 +4 +input1988 2layers.21.mlp.gate_proj.weight8 +> +input199 2(layers.22.self_attn.o_proj.o_proj.weight8 +@ +input200 2*layers.22.self_attn.qkv_proj.v_proj.weight8 +4 +input201 2 layers.22.input_layernorm.weight8 +@ +input202 2*layers.22.self_attn.qkv_proj.k_proj.weight8 +@ +input203 2*layers.22.self_attn.qkv_proj.q_proj.weight8 +4 +input204 82layers.22.mlp.down_proj.weight8 +2 +input2058 2layers.22.mlp.up_proj.weight8 += +input206 2)layers.22.post_attention_layernorm.weight8 +4 +input2078 2layers.22.mlp.gate_proj.weight8 +> +input208 2(layers.23.self_attn.o_proj.o_proj.weight8 +@ +input209 2*layers.23.self_attn.qkv_proj.v_proj.weight8 +4 +input210 2 layers.23.input_layernorm.weight8 +@ +input211 2*layers.23.self_attn.qkv_proj.k_proj.weight8 +@ +input212 2*layers.23.self_attn.qkv_proj.q_proj.weight8 +4 +input213 82layers.23.mlp.down_proj.weight8 +2 +input2148 2layers.23.mlp.up_proj.weight8 += +input215 2)layers.23.post_attention_layernorm.weight8 +4 +input2168 2layers.23.mlp.gate_proj.weight8 +> +input217 2(layers.24.self_attn.o_proj.o_proj.weight8 +@ +input218 2*layers.24.self_attn.qkv_proj.v_proj.weight8 +4 +input219 2 layers.24.input_layernorm.weight8 +@ +input220 2*layers.24.self_attn.qkv_proj.k_proj.weight8 +@ +input221 2*layers.24.self_attn.qkv_proj.q_proj.weight8 +4 +input222 82layers.24.mlp.down_proj.weight8 +2 +input2238 2layers.24.mlp.up_proj.weight8 += +input224 2)layers.24.post_attention_layernorm.weight8 +4 +input2258 2layers.24.mlp.gate_proj.weight8 +> +input226 2(layers.25.self_attn.o_proj.o_proj.weight8 +@ +input227 2*layers.25.self_attn.qkv_proj.v_proj.weight8 +4 +input228 2 layers.25.input_layernorm.weight8 +@ +input229 2*layers.25.self_attn.qkv_proj.k_proj.weight8 +@ +input230 2*layers.25.self_attn.qkv_proj.q_proj.weight8 +4 +input231 82layers.25.mlp.down_proj.weight8 +2 +input2328 2layers.25.mlp.up_proj.weight8 += +input233 2)layers.25.post_attention_layernorm.weight8 +4 +input2348 2layers.25.mlp.gate_proj.weight8 +> +input235 2(layers.26.self_attn.o_proj.o_proj.weight8 +@ +input236 2*layers.26.self_attn.qkv_proj.v_proj.weight8 +4 +input237 2 layers.26.input_layernorm.weight8 +@ +input238 2*layers.26.self_attn.qkv_proj.k_proj.weight8 +@ +input239 2*layers.26.self_attn.qkv_proj.q_proj.weight8 +4 +input240 82layers.26.mlp.down_proj.weight8 +2 +input2418 2layers.26.mlp.up_proj.weight8 += +input242 2)layers.26.post_attention_layernorm.weight8 +4 +input2438 2layers.26.mlp.gate_proj.weight8 +> +input244 2(layers.27.self_attn.o_proj.o_proj.weight8 +@ +input245 2*layers.27.self_attn.qkv_proj.v_proj.weight8 +4 +input246 2 layers.27.input_layernorm.weight8 +@ +input247 2*layers.27.self_attn.qkv_proj.k_proj.weight8 +@ +input248 2*layers.27.self_attn.qkv_proj.q_proj.weight8 +4 +input249 82layers.27.mlp.down_proj.weight8 +2 +input2508 2layers.27.mlp.up_proj.weight8 += +input251 2)layers.27.post_attention_layernorm.weight8 +4 +input2528 2layers.27.mlp.gate_proj.weight8 +> +input253 2(layers.28.self_attn.o_proj.o_proj.weight8 +@ +input254 2*layers.28.self_attn.qkv_proj.v_proj.weight8 +4 +input255 2 layers.28.input_layernorm.weight8 +@ +input256 2*layers.28.self_attn.qkv_proj.k_proj.weight8 +@ +input257 2*layers.28.self_attn.qkv_proj.q_proj.weight8 +4 +input258 82layers.28.mlp.down_proj.weight8 +2 +input2598 2layers.28.mlp.up_proj.weight8 += +input260 2)layers.28.post_attention_layernorm.weight8 +4 +input2618 2layers.28.mlp.gate_proj.weight8 +> +input262 2(layers.29.self_attn.o_proj.o_proj.weight8 +@ +input263 2*layers.29.self_attn.qkv_proj.v_proj.weight8 +4 +input264 2 layers.29.input_layernorm.weight8 +@ +input265 2*layers.29.self_attn.qkv_proj.k_proj.weight8 +@ +input266 2*layers.29.self_attn.qkv_proj.q_proj.weight8 +4 +input267 82layers.29.mlp.down_proj.weight8 +2 +input2688 2layers.29.mlp.up_proj.weight8 += +input269 2)layers.29.post_attention_layernorm.weight8 +4 +input2708 2layers.29.mlp.gate_proj.weight8 +> +input271 2(layers.30.self_attn.o_proj.o_proj.weight8 +@ +input272 2*layers.30.self_attn.qkv_proj.v_proj.weight8 +4 +input273 2 layers.30.input_layernorm.weight8 +@ +input274 2*layers.30.self_attn.qkv_proj.k_proj.weight8 +@ +input275 2*layers.30.self_attn.qkv_proj.q_proj.weight8 +4 +input276 82layers.30.mlp.down_proj.weight8 +2 +input2778 2layers.30.mlp.up_proj.weight8 += +input278 2)layers.30.post_attention_layernorm.weight8 +4 +input2798 2layers.30.mlp.gate_proj.weight8 +> +input280 2(layers.31.self_attn.o_proj.o_proj.weight8 +@ +input281 2*layers.31.self_attn.qkv_proj.v_proj.weight8 +4 +input282 2 layers.31.input_layernorm.weight8 +@ +input283 2*layers.31.self_attn.qkv_proj.k_proj.weight8 +@ +input284 2*layers.31.self_attn.qkv_proj.q_proj.weight8 +4 +input285 82layers.31.mlp.down_proj.weight8 +2 +input2868 2layers.31.mlp.up_proj.weight8 += +input287 2)layers.31.post_attention_layernorm.weight8 +4 +input2888 2layers.31.mlp.gate_proj.weight8 +% +input289 2lm_head.weight8 + +input290 2 norm.weight8' +output02embed_tokens.weight> +output12'layers.0.self_attn.o_proj.o_proj.weight> +output2 2)layers.0.self_attn.qkv_proj.v_proj.weight1 +output3 2layers.0.input_layernorm.weight> +output4 @2)layers.0.self_attn.qkv_proj.k_proj.weight? +output5 @2)layers.0.self_attn.qkv_proj.q_proj.weight3 +output6 2layers.0.mlp.down_proj.weight0 +output78 2layers.0.mlp.up_proj.weight: +output8 2(layers.0.post_attention_layernorm.weight2 +output98 2layers.0.mlp.gate_proj.weight? +output102'layers.1.self_attn.o_proj.o_proj.weight? +output11 2)layers.1.self_attn.qkv_proj.v_proj.weight2 +output12 2layers.1.input_layernorm.weight? +output13 @2)layers.1.self_attn.qkv_proj.k_proj.weight@ +output14 @2)layers.1.self_attn.qkv_proj.q_proj.weight4 +output15 2layers.1.mlp.down_proj.weight1 +output168 2layers.1.mlp.up_proj.weight; +output17 2(layers.1.post_attention_layernorm.weight3 +output188 2layers.1.mlp.gate_proj.weight? +output192'layers.2.self_attn.o_proj.o_proj.weight? +output20 2)layers.2.self_attn.qkv_proj.v_proj.weight2 +output21 2layers.2.input_layernorm.weight? +output22 @2)layers.2.self_attn.qkv_proj.k_proj.weight@ +output23 @2)layers.2.self_attn.qkv_proj.q_proj.weight4 +output24 2layers.2.mlp.down_proj.weight1 +output258 2layers.2.mlp.up_proj.weight; +output26 2(layers.2.post_attention_layernorm.weight3 +output278 2layers.2.mlp.gate_proj.weight? +output282'layers.3.self_attn.o_proj.o_proj.weight? +output29 2)layers.3.self_attn.qkv_proj.v_proj.weight2 +output30 2layers.3.input_layernorm.weight? +output31 @2)layers.3.self_attn.qkv_proj.k_proj.weight@ +output32 @2)layers.3.self_attn.qkv_proj.q_proj.weight4 +output33 2layers.3.mlp.down_proj.weight1 +output348 2layers.3.mlp.up_proj.weight; +output35 2(layers.3.post_attention_layernorm.weight3 +output368 2layers.3.mlp.gate_proj.weight? +output372'layers.4.self_attn.o_proj.o_proj.weight? +output38 2)layers.4.self_attn.qkv_proj.v_proj.weight2 +output39 2layers.4.input_layernorm.weight? +output40 @2)layers.4.self_attn.qkv_proj.k_proj.weight@ +output41 @2)layers.4.self_attn.qkv_proj.q_proj.weight4 +output42 2layers.4.mlp.down_proj.weight1 +output438 2layers.4.mlp.up_proj.weight; +output44 2(layers.4.post_attention_layernorm.weight3 +output458 2layers.4.mlp.gate_proj.weight? +output462'layers.5.self_attn.o_proj.o_proj.weight? +output47 2)layers.5.self_attn.qkv_proj.v_proj.weight2 +output48 2layers.5.input_layernorm.weight? +output49 @2)layers.5.self_attn.qkv_proj.k_proj.weight@ +output50 @2)layers.5.self_attn.qkv_proj.q_proj.weight4 +output51 2layers.5.mlp.down_proj.weight1 +output528 2layers.5.mlp.up_proj.weight; +output53 2(layers.5.post_attention_layernorm.weight3 +output548 2layers.5.mlp.gate_proj.weight? +output552'layers.6.self_attn.o_proj.o_proj.weight? +output56 2)layers.6.self_attn.qkv_proj.v_proj.weight2 +output57 2layers.6.input_layernorm.weight? +output58 @2)layers.6.self_attn.qkv_proj.k_proj.weight@ +output59 @2)layers.6.self_attn.qkv_proj.q_proj.weight4 +output60 2layers.6.mlp.down_proj.weight1 +output618 2layers.6.mlp.up_proj.weight; +output62 2(layers.6.post_attention_layernorm.weight3 +output638 2layers.6.mlp.gate_proj.weight? +output642'layers.7.self_attn.o_proj.o_proj.weight? +output65 2)layers.7.self_attn.qkv_proj.v_proj.weight2 +output66 2layers.7.input_layernorm.weight? +output67 @2)layers.7.self_attn.qkv_proj.k_proj.weight@ +output68 @2)layers.7.self_attn.qkv_proj.q_proj.weight4 +output69 2layers.7.mlp.down_proj.weight1 +output708 2layers.7.mlp.up_proj.weight; +output71 2(layers.7.post_attention_layernorm.weight3 +output728 2layers.7.mlp.gate_proj.weight? +output732'layers.8.self_attn.o_proj.o_proj.weight? +output74 2)layers.8.self_attn.qkv_proj.v_proj.weight2 +output75 2layers.8.input_layernorm.weight? +output76 @2)layers.8.self_attn.qkv_proj.k_proj.weight@ +output77 @2)layers.8.self_attn.qkv_proj.q_proj.weight4 +output78 2layers.8.mlp.down_proj.weight1 +output798 2layers.8.mlp.up_proj.weight; +output80 2(layers.8.post_attention_layernorm.weight3 +output818 2layers.8.mlp.gate_proj.weight? +output822'layers.9.self_attn.o_proj.o_proj.weight? +output83 2)layers.9.self_attn.qkv_proj.v_proj.weight2 +output84 2layers.9.input_layernorm.weight? +output85 @2)layers.9.self_attn.qkv_proj.k_proj.weight@ +output86 @2)layers.9.self_attn.qkv_proj.q_proj.weight4 +output87 2layers.9.mlp.down_proj.weight1 +output888 2layers.9.mlp.up_proj.weight; +output89 2(layers.9.post_attention_layernorm.weight3 +output908 2layers.9.mlp.gate_proj.weight@ +output912(layers.10.self_attn.o_proj.o_proj.weight@ +output92 2*layers.10.self_attn.qkv_proj.v_proj.weight3 +output93 2 layers.10.input_layernorm.weight@ +output94 @2*layers.10.self_attn.qkv_proj.k_proj.weightA +output95 @2*layers.10.self_attn.qkv_proj.q_proj.weight5 +output96 2layers.10.mlp.down_proj.weight2 +output978 2layers.10.mlp.up_proj.weight< +output98 2)layers.10.post_attention_layernorm.weight4 +output998 2layers.10.mlp.gate_proj.weightA + output1002(layers.11.self_attn.o_proj.o_proj.weightA + output101 2*layers.11.self_attn.qkv_proj.v_proj.weight4 + output102 2 layers.11.input_layernorm.weightA + output103 @2*layers.11.self_attn.qkv_proj.k_proj.weightB + output104 @2*layers.11.self_attn.qkv_proj.q_proj.weight6 + output105 2layers.11.mlp.down_proj.weight3 + output1068 2layers.11.mlp.up_proj.weight= + output107 2)layers.11.post_attention_layernorm.weight5 + output1088 2layers.11.mlp.gate_proj.weightA + output1092(layers.12.self_attn.o_proj.o_proj.weightA + output110 2*layers.12.self_attn.qkv_proj.v_proj.weight4 + output111 2 layers.12.input_layernorm.weightA + output112 @2*layers.12.self_attn.qkv_proj.k_proj.weightB + output113 @2*layers.12.self_attn.qkv_proj.q_proj.weight6 + output114 2layers.12.mlp.down_proj.weight3 + output1158 2layers.12.mlp.up_proj.weight= + output116 2)layers.12.post_attention_layernorm.weight5 + output1178 2layers.12.mlp.gate_proj.weightA + output1182(layers.13.self_attn.o_proj.o_proj.weightA + output119 2*layers.13.self_attn.qkv_proj.v_proj.weight4 + output120 2 layers.13.input_layernorm.weightA + output121 @2*layers.13.self_attn.qkv_proj.k_proj.weightB + output122 @2*layers.13.self_attn.qkv_proj.q_proj.weight6 + output123 2layers.13.mlp.down_proj.weight3 + output1248 2layers.13.mlp.up_proj.weight= + output125 2)layers.13.post_attention_layernorm.weight5 + output1268 2layers.13.mlp.gate_proj.weightA + output1272(layers.14.self_attn.o_proj.o_proj.weightA + output128 2*layers.14.self_attn.qkv_proj.v_proj.weight4 + output129 2 layers.14.input_layernorm.weightA + output130 @2*layers.14.self_attn.qkv_proj.k_proj.weightB + output131 @2*layers.14.self_attn.qkv_proj.q_proj.weight6 + output132 2layers.14.mlp.down_proj.weight3 + output1338 2layers.14.mlp.up_proj.weight= + output134 2)layers.14.post_attention_layernorm.weight5 + output1358 2layers.14.mlp.gate_proj.weightA + output1362(layers.15.self_attn.o_proj.o_proj.weightA + output137 2*layers.15.self_attn.qkv_proj.v_proj.weight4 + output138 2 layers.15.input_layernorm.weightA + output139 @2*layers.15.self_attn.qkv_proj.k_proj.weightB + output140 @2*layers.15.self_attn.qkv_proj.q_proj.weight6 + output141 2layers.15.mlp.down_proj.weight3 + output1428 2layers.15.mlp.up_proj.weight= + output143 2)layers.15.post_attention_layernorm.weight5 + output1448 2layers.15.mlp.gate_proj.weightA + output1452(layers.16.self_attn.o_proj.o_proj.weightA + output146 2*layers.16.self_attn.qkv_proj.v_proj.weight4 + output147 2 layers.16.input_layernorm.weightA + output148 @2*layers.16.self_attn.qkv_proj.k_proj.weightB + output149 @2*layers.16.self_attn.qkv_proj.q_proj.weight6 + output150 2layers.16.mlp.down_proj.weight3 + output1518 2layers.16.mlp.up_proj.weight= + output152 2)layers.16.post_attention_layernorm.weight5 + output1538 2layers.16.mlp.gate_proj.weightA + output1542(layers.17.self_attn.o_proj.o_proj.weightA + output155 2*layers.17.self_attn.qkv_proj.v_proj.weight4 + output156 2 layers.17.input_layernorm.weightA + output157 @2*layers.17.self_attn.qkv_proj.k_proj.weightB + output158 @2*layers.17.self_attn.qkv_proj.q_proj.weight6 + output159 2layers.17.mlp.down_proj.weight3 + output1608 2layers.17.mlp.up_proj.weight= + output161 2)layers.17.post_attention_layernorm.weight5 + output1628 2layers.17.mlp.gate_proj.weightA + output1632(layers.18.self_attn.o_proj.o_proj.weightA + output164 2*layers.18.self_attn.qkv_proj.v_proj.weight4 + output165 2 layers.18.input_layernorm.weightA + output166 @2*layers.18.self_attn.qkv_proj.k_proj.weightB + output167 @2*layers.18.self_attn.qkv_proj.q_proj.weight6 + output168 2layers.18.mlp.down_proj.weight3 + output1698 2layers.18.mlp.up_proj.weight= + output170 2)layers.18.post_attention_layernorm.weight5 + output1718 2layers.18.mlp.gate_proj.weightA + output1722(layers.19.self_attn.o_proj.o_proj.weightA + output173 2*layers.19.self_attn.qkv_proj.v_proj.weight4 + output174 2 layers.19.input_layernorm.weightA + output175 @2*layers.19.self_attn.qkv_proj.k_proj.weightB + output176 @2*layers.19.self_attn.qkv_proj.q_proj.weight6 + output177 2layers.19.mlp.down_proj.weight3 + output1788 2layers.19.mlp.up_proj.weight= + output179 2)layers.19.post_attention_layernorm.weight5 + output1808 2layers.19.mlp.gate_proj.weightA + output1812(layers.20.self_attn.o_proj.o_proj.weightA + output182 2*layers.20.self_attn.qkv_proj.v_proj.weight4 + output183 2 layers.20.input_layernorm.weightA + output184 @2*layers.20.self_attn.qkv_proj.k_proj.weightB + output185 @2*layers.20.self_attn.qkv_proj.q_proj.weight6 + output186 2layers.20.mlp.down_proj.weight3 + output1878 2layers.20.mlp.up_proj.weight= + output188 2)layers.20.post_attention_layernorm.weight5 + output1898 2layers.20.mlp.gate_proj.weightA + output1902(layers.21.self_attn.o_proj.o_proj.weightA + output191 2*layers.21.self_attn.qkv_proj.v_proj.weight4 + output192 2 layers.21.input_layernorm.weightA + output193 @2*layers.21.self_attn.qkv_proj.k_proj.weightB + output194 @2*layers.21.self_attn.qkv_proj.q_proj.weight6 + output195 2layers.21.mlp.down_proj.weight3 + output1968 2layers.21.mlp.up_proj.weight= + output197 2)layers.21.post_attention_layernorm.weight5 + output1988 2layers.21.mlp.gate_proj.weightA + output1992(layers.22.self_attn.o_proj.o_proj.weightA + output200 2*layers.22.self_attn.qkv_proj.v_proj.weight4 + output201 2 layers.22.input_layernorm.weightA + output202 @2*layers.22.self_attn.qkv_proj.k_proj.weightB + output203 @2*layers.22.self_attn.qkv_proj.q_proj.weight6 + output204 2layers.22.mlp.down_proj.weight3 + output2058 2layers.22.mlp.up_proj.weight= + output206 2)layers.22.post_attention_layernorm.weight5 + output2078 2layers.22.mlp.gate_proj.weightA + output2082(layers.23.self_attn.o_proj.o_proj.weightA + output209 2*layers.23.self_attn.qkv_proj.v_proj.weight4 + output210 2 layers.23.input_layernorm.weightA + output211 @2*layers.23.self_attn.qkv_proj.k_proj.weightB + output212 @2*layers.23.self_attn.qkv_proj.q_proj.weight6 + output213 2layers.23.mlp.down_proj.weight3 + output2148 2layers.23.mlp.up_proj.weight= + output215 2)layers.23.post_attention_layernorm.weight5 + output2168 2layers.23.mlp.gate_proj.weightA + output2172(layers.24.self_attn.o_proj.o_proj.weightA + output218 2*layers.24.self_attn.qkv_proj.v_proj.weight4 + output219 2 layers.24.input_layernorm.weightA + output220 @2*layers.24.self_attn.qkv_proj.k_proj.weightB + output221 @2*layers.24.self_attn.qkv_proj.q_proj.weight6 + output222 2layers.24.mlp.down_proj.weight3 + output2238 2layers.24.mlp.up_proj.weight= + output224 2)layers.24.post_attention_layernorm.weight5 + output2258 2layers.24.mlp.gate_proj.weightA + output2262(layers.25.self_attn.o_proj.o_proj.weightA + output227 2*layers.25.self_attn.qkv_proj.v_proj.weight4 + output228 2 layers.25.input_layernorm.weightA + output229 @2*layers.25.self_attn.qkv_proj.k_proj.weightB + output230 @2*layers.25.self_attn.qkv_proj.q_proj.weight6 + output231 2layers.25.mlp.down_proj.weight3 + output2328 2layers.25.mlp.up_proj.weight= + output233 2)layers.25.post_attention_layernorm.weight5 + output2348 2layers.25.mlp.gate_proj.weightA + output2352(layers.26.self_attn.o_proj.o_proj.weightA + output236 2*layers.26.self_attn.qkv_proj.v_proj.weight4 + output237 2 layers.26.input_layernorm.weightA + output238 @2*layers.26.self_attn.qkv_proj.k_proj.weightB + output239 @2*layers.26.self_attn.qkv_proj.q_proj.weight6 + output240 2layers.26.mlp.down_proj.weight3 + output2418 2layers.26.mlp.up_proj.weight= + output242 2)layers.26.post_attention_layernorm.weight5 + output2438 2layers.26.mlp.gate_proj.weightA + output2442(layers.27.self_attn.o_proj.o_proj.weightA + output245 2*layers.27.self_attn.qkv_proj.v_proj.weight4 + output246 2 layers.27.input_layernorm.weightA + output247 @2*layers.27.self_attn.qkv_proj.k_proj.weightB + output248 @2*layers.27.self_attn.qkv_proj.q_proj.weight6 + output249 2layers.27.mlp.down_proj.weight3 + output2508 2layers.27.mlp.up_proj.weight= + output251 2)layers.27.post_attention_layernorm.weight5 + output2528 2layers.27.mlp.gate_proj.weightA + output2532(layers.28.self_attn.o_proj.o_proj.weightA + output254 2*layers.28.self_attn.qkv_proj.v_proj.weight4 + output255 2 layers.28.input_layernorm.weightA + output256 @2*layers.28.self_attn.qkv_proj.k_proj.weightB + output257 @2*layers.28.self_attn.qkv_proj.q_proj.weight6 + output258 2layers.28.mlp.down_proj.weight3 + output2598 2layers.28.mlp.up_proj.weight= + output260 2)layers.28.post_attention_layernorm.weight5 + output2618 2layers.28.mlp.gate_proj.weightA + output2622(layers.29.self_attn.o_proj.o_proj.weightA + output263 2*layers.29.self_attn.qkv_proj.v_proj.weight4 + output264 2 layers.29.input_layernorm.weightA + output265 @2*layers.29.self_attn.qkv_proj.k_proj.weightB + output266 @2*layers.29.self_attn.qkv_proj.q_proj.weight6 + output267 2layers.29.mlp.down_proj.weight3 + output2688 2layers.29.mlp.up_proj.weight= + output269 2)layers.29.post_attention_layernorm.weight5 + output2708 2layers.29.mlp.gate_proj.weightA + output2712(layers.30.self_attn.o_proj.o_proj.weightA + output272 2*layers.30.self_attn.qkv_proj.v_proj.weight4 + output273 2 layers.30.input_layernorm.weightA + output274 @2*layers.30.self_attn.qkv_proj.k_proj.weightB + output275 @2*layers.30.self_attn.qkv_proj.q_proj.weight6 + output276 2layers.30.mlp.down_proj.weight3 + output2778 2layers.30.mlp.up_proj.weight= + output278 2)layers.30.post_attention_layernorm.weight5 + output2798 2layers.30.mlp.gate_proj.weightA + output2802(layers.31.self_attn.o_proj.o_proj.weightA + output281 2*layers.31.self_attn.qkv_proj.v_proj.weight4 + output282 2 layers.31.input_layernorm.weightA + output283 @2*layers.31.self_attn.qkv_proj.k_proj.weightB + output284 @2*layers.31.self_attn.qkv_proj.q_proj.weight6 + output285 2layers.31.mlp.down_proj.weight3 + output2868 2layers.31.mlp.up_proj.weight= + output287 2)layers.31.post_attention_layernorm.weight5 + output2888 2layers.31.mlp.gate_proj.weight& + output289 2lm_head.weight + output290 2 norm.weight \ No newline at end of file diff --git a/layout_opt/model/graph.hlo b/layout_opt/model/graph.hlo new file mode 100644 index 0000000000000000000000000000000000000000..bae20b1da2e8ce8f6885f6de2d1b823e63c693be --- /dev/null +++ b/layout_opt/model/graph.hlo @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7a76fc5f3f76d1d69d57e0e784721bafd07e3a61734f6594e8c815123a8a771 +size 176877 diff --git a/model.pt b/model.pt new file mode 100644 index 0000000000000000000000000000000000000000..028ad5230e32d35bf07f5f1aef99060d925f3fa5 --- /dev/null +++ b/model.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db6fd0c1be612908d3c6ede5cdedda302359d5279c7d078eaadcb48d17389030 +size 53720651 diff --git a/token_generation_model/_tp0_bk0/graph.neff b/token_generation_model/_tp0_bk0/graph.neff new file mode 100644 index 0000000000000000000000000000000000000000..26cb0d7f054b6bb0bb1e8aeea284b22055442a2b --- /dev/null +++ b/token_generation_model/_tp0_bk0/graph.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f084665c9d486b682a226970bf7ab5170c50859a2f0cc8e46fee1811b6421349 +size 5612544 diff --git a/token_generation_model/_tp0_bk0/metaneff.pb b/token_generation_model/_tp0_bk0/metaneff.pb new file mode 100644 index 0000000000000000000000000000000000000000..6031aee709da76ecf55f82794ae286cdb591c993 --- /dev/null +++ b/token_generation_model/_tp0_bk0/metaneff.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db597e62ef7dd0b4be31a941ce01bedf8ff4e3e418a571d927191d0fe1ac7749 +size 823209 diff --git a/token_generation_model/_tp0_bk0/model.MODULE_67d3774d5bacfe6ba851+72d461cc.hlo_module.pb b/token_generation_model/_tp0_bk0/model.MODULE_67d3774d5bacfe6ba851+72d461cc.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..af753cd3588c6b3a17b0a37c0f52023f7d7a8784 --- /dev/null +++ b/token_generation_model/_tp0_bk0/model.MODULE_67d3774d5bacfe6ba851+72d461cc.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b572d17843a963e8042dbdafc202058bb0d10fd3f7ce91e3f20bc1db70324d7 +size 802071 diff --git a/token_generation_model/_tp0_bk0/model.MODULE_67d3774d5bacfe6ba851+72d461cc.neff b/token_generation_model/_tp0_bk0/model.MODULE_67d3774d5bacfe6ba851+72d461cc.neff new file mode 100644 index 0000000000000000000000000000000000000000..26cb0d7f054b6bb0bb1e8aeea284b22055442a2b --- /dev/null +++ b/token_generation_model/_tp0_bk0/model.MODULE_67d3774d5bacfe6ba851+72d461cc.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f084665c9d486b682a226970bf7ab5170c50859a2f0cc8e46fee1811b6421349 +size 5612544 diff --git a/token_generation_model/_tp0_bk0/wrapped_neff.hlo b/token_generation_model/_tp0_bk0/wrapped_neff.hlo new file mode 100644 index 0000000000000000000000000000000000000000..5b33da4bc0392c0fc149714760e6c6c5ca7c0b38 --- /dev/null +++ b/token_generation_model/_tp0_bk0/wrapped_neff.hlo @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f156b82b448a0ce64eea8895c40e4e50c9f548dea673deb2d7d240fc9df9fe8 +size 5786483 diff --git a/token_generation_model/_tp0_bk1/graph.neff b/token_generation_model/_tp0_bk1/graph.neff new file mode 100644 index 0000000000000000000000000000000000000000..4fd05157c664a9752921de63bd79ece285d5cc2d --- /dev/null +++ b/token_generation_model/_tp0_bk1/graph.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6be078d7f3a246715b61da99d6878d44882d85a11eaa1685534babf11cde59e9 +size 5684224 diff --git a/token_generation_model/_tp0_bk1/metaneff.pb b/token_generation_model/_tp0_bk1/metaneff.pb new file mode 100644 index 0000000000000000000000000000000000000000..8d4c8a29776f0b3cd108d02668ef836016d71450 --- /dev/null +++ b/token_generation_model/_tp0_bk1/metaneff.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:615f3bac52a53bbac64e539d918447282af0cafb90b1e95367b365bbeead8e67 +size 822474 diff --git a/token_generation_model/_tp0_bk1/model.MODULE_92bbfea7801df2fea75e+4948da29.hlo_module.pb b/token_generation_model/_tp0_bk1/model.MODULE_92bbfea7801df2fea75e+4948da29.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..d6299ac75139142b5a902d281057d66e51c4a712 --- /dev/null +++ b/token_generation_model/_tp0_bk1/model.MODULE_92bbfea7801df2fea75e+4948da29.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb1806c4248848e0be8635c95d728fa881fc2015e31447ec893a0beeb8b9509d +size 889786 diff --git a/token_generation_model/_tp0_bk1/model.MODULE_92bbfea7801df2fea75e+4948da29.neff b/token_generation_model/_tp0_bk1/model.MODULE_92bbfea7801df2fea75e+4948da29.neff new file mode 100644 index 0000000000000000000000000000000000000000..4fd05157c664a9752921de63bd79ece285d5cc2d --- /dev/null +++ b/token_generation_model/_tp0_bk1/model.MODULE_92bbfea7801df2fea75e+4948da29.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6be078d7f3a246715b61da99d6878d44882d85a11eaa1685534babf11cde59e9 +size 5684224 diff --git a/token_generation_model/_tp0_bk2/graph.neff b/token_generation_model/_tp0_bk2/graph.neff new file mode 100644 index 0000000000000000000000000000000000000000..f2c5820caeab49f3bbb355f92d2456f4e919fc05 --- /dev/null +++ b/token_generation_model/_tp0_bk2/graph.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29908d241bdaf407bcb11c682477667cf52ec74ad12cdf8d715dc83bd83a5cbe +size 5766144 diff --git a/token_generation_model/_tp0_bk2/metaneff.pb b/token_generation_model/_tp0_bk2/metaneff.pb new file mode 100644 index 0000000000000000000000000000000000000000..baa6904105e0a37a41dc5c9ea2184360b5130116 --- /dev/null +++ b/token_generation_model/_tp0_bk2/metaneff.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b42bbad04ce54008f13f9aa4f7bc225fbf166e7956990acd2074f725ca8e6c9b +size 822474 diff --git a/token_generation_model/_tp0_bk2/model.MODULE_2f686dc6ba7ef3326a56+6113de8c.hlo_module.pb b/token_generation_model/_tp0_bk2/model.MODULE_2f686dc6ba7ef3326a56+6113de8c.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..68c0b784d16e8787c2121f749354bcbaf8ab6494 --- /dev/null +++ b/token_generation_model/_tp0_bk2/model.MODULE_2f686dc6ba7ef3326a56+6113de8c.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6c389f0365aeb0a927dc422d904d7ad06f1694d48b6db93e81f9c03a07b7cfc +size 889786 diff --git a/token_generation_model/_tp0_bk2/model.MODULE_2f686dc6ba7ef3326a56+6113de8c.neff b/token_generation_model/_tp0_bk2/model.MODULE_2f686dc6ba7ef3326a56+6113de8c.neff new file mode 100644 index 0000000000000000000000000000000000000000..f2c5820caeab49f3bbb355f92d2456f4e919fc05 --- /dev/null +++ b/token_generation_model/_tp0_bk2/model.MODULE_2f686dc6ba7ef3326a56+6113de8c.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29908d241bdaf407bcb11c682477667cf52ec74ad12cdf8d715dc83bd83a5cbe +size 5766144 diff --git a/token_generation_model/_tp0_bk3/graph.neff b/token_generation_model/_tp0_bk3/graph.neff new file mode 100644 index 0000000000000000000000000000000000000000..b6663cc0ed8654d3ed13b28e858f8a760cf62018 --- /dev/null +++ b/token_generation_model/_tp0_bk3/graph.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c34bc7faf28217f485f87fb2c1965b9511b78ff793098e760345b19c84e6079 +size 5970944 diff --git a/token_generation_model/_tp0_bk3/metaneff.pb b/token_generation_model/_tp0_bk3/metaneff.pb new file mode 100644 index 0000000000000000000000000000000000000000..7c8105097a13d03a2791a69f7fa4d3924285427b --- /dev/null +++ b/token_generation_model/_tp0_bk3/metaneff.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22dccc9c15899011ced61b37b01373a7c26af06476601e88b3b3130f496e557b +size 822474 diff --git a/token_generation_model/_tp0_bk3/model.MODULE_668122c92a86c0ce6817+f94fe8ed.hlo_module.pb b/token_generation_model/_tp0_bk3/model.MODULE_668122c92a86c0ce6817+f94fe8ed.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..24a489c35cb95880397b523be2815c981a07fb67 --- /dev/null +++ b/token_generation_model/_tp0_bk3/model.MODULE_668122c92a86c0ce6817+f94fe8ed.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1603ab6b63d0fe2137a74b7ae0dc8b540b1b1b2803dfefb2f395a5056091fcd0 +size 889786 diff --git a/token_generation_model/_tp0_bk3/model.MODULE_668122c92a86c0ce6817+f94fe8ed.neff b/token_generation_model/_tp0_bk3/model.MODULE_668122c92a86c0ce6817+f94fe8ed.neff new file mode 100644 index 0000000000000000000000000000000000000000..b6663cc0ed8654d3ed13b28e858f8a760cf62018 --- /dev/null +++ b/token_generation_model/_tp0_bk3/model.MODULE_668122c92a86c0ce6817+f94fe8ed.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c34bc7faf28217f485f87fb2c1965b9511b78ff793098e760345b19c84e6079 +size 5970944 diff --git a/token_generation_model/_tp0_bk4/graph.neff b/token_generation_model/_tp0_bk4/graph.neff new file mode 100644 index 0000000000000000000000000000000000000000..99d4b7cbbea433c33628672ffa52a007ba52c93a --- /dev/null +++ b/token_generation_model/_tp0_bk4/graph.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc6a4930f6d966ef852a1cb158291f5f95003e6e3fdbb67e3779ce3e37b22d22 +size 6329344 diff --git a/token_generation_model/_tp0_bk4/metaneff.pb b/token_generation_model/_tp0_bk4/metaneff.pb new file mode 100644 index 0000000000000000000000000000000000000000..99812ec38d8a65325cc2c09764b2469ac922fd53 --- /dev/null +++ b/token_generation_model/_tp0_bk4/metaneff.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c33773f7ab7761cf5a551d3d2878e82f146ff68eeeb4be0ec4339b054f8c4696 +size 822474 diff --git a/token_generation_model/_tp0_bk4/model.MODULE_fb6decaa94b1936d08da+1b5847e3.hlo_module.pb b/token_generation_model/_tp0_bk4/model.MODULE_fb6decaa94b1936d08da+1b5847e3.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..37a46a059df48f4b04e7b95f3e36cadcb9c8e2ec --- /dev/null +++ b/token_generation_model/_tp0_bk4/model.MODULE_fb6decaa94b1936d08da+1b5847e3.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80beb21f8d38e94da08a724987591262e76a022e4832e69bde1ad8c9fc6302f9 +size 889786 diff --git a/token_generation_model/_tp0_bk4/model.MODULE_fb6decaa94b1936d08da+1b5847e3.neff b/token_generation_model/_tp0_bk4/model.MODULE_fb6decaa94b1936d08da+1b5847e3.neff new file mode 100644 index 0000000000000000000000000000000000000000..99d4b7cbbea433c33628672ffa52a007ba52c93a --- /dev/null +++ b/token_generation_model/_tp0_bk4/model.MODULE_fb6decaa94b1936d08da+1b5847e3.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc6a4930f6d966ef852a1cb158291f5f95003e6e3fdbb67e3779ce3e37b22d22 +size 6329344 diff --git a/tokenizer.model b/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..f3b30c02e7f12eb7fbc7e89595b97c43067d73c6 --- /dev/null +++ b/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89 +size 587404 diff --git a/tokenizer.model.v3 b/tokenizer.model.v3 new file mode 100644 index 0000000000000000000000000000000000000000..f3b30c02e7f12eb7fbc7e89595b97c43067d73c6 --- /dev/null +++ b/tokenizer.model.v3 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89 +size 587404 diff --git a/weights/tp0_sharded_checkpoint.safetensors b/weights/tp0_sharded_checkpoint.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8201110c0152244ab8d0add13f3ec2a76095ad8c --- /dev/null +++ b/weights/tp0_sharded_checkpoint.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6794a3d7f2b1d071399a899a42bcd5652e83ebdd140f02f562d90b292ae750aa +size 7248325764 diff --git a/weights/tp1_sharded_checkpoint.safetensors b/weights/tp1_sharded_checkpoint.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b7535252ca9062a02232f04bdbf3ead3c445e293 --- /dev/null +++ b/weights/tp1_sharded_checkpoint.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14c5bd3b07c4f4b752a65ee99fe9c79ae0110c7e61df0d83ef4993c1ee63a749 +size 7248325764