ImportError: cannot import name 'intel' from 'triton._C.libtriton' (/home/fkurushin/venv/pqr/lib/python3.11/site-packages/triton/_C/libtriton.so)

by Fedor99 - opened Aug 25, 2025

Discussion

Fedor99

Aug 25, 2025

•

edited Aug 25, 2025

Hello! I have some problems running your model on a GPU:

The minimal code to reproduce the error:

from transformers import AutoTokenizer, AutoModel
MODEL_NAME = "deepvk/USER2-small"
TOKENIZER_NAME = "deepvk/USER2-small"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(TOKENIZER_NAME)
model.to('cuda:7')
output = model(
    **tokenizer(
        ["iphone 15", "iphone 16"], padding=True, truncation=True, return_tensors="pt"
    )
)

The error itself:

TorchRuntimeError: Dynamo failed to run FX node with fake tensors: call_function <function embedding at 0x7f2364d46ac0>(*(FakeTensor(..., size=(s0, s1), dtype=torch.int64), Parameter(FakeTensor(..., device='cuda:7', size=(50368, 384), requires_grad=True)), 50283, None, 2.0, False, False), **{}): got RuntimeError('Unhandled FakeTensor Device Propagation for aten.embedding.default, found two different devices cuda:7, cpu')

from user code:
   File "/home/fkurushin/venv/s2p3_11/lib/python3.11/site-packages/transformers/models/modernbert/modeling_modernbert.py", line 207, in compiled_embeddings
    return self.drop(self.norm(self.tok_embeddings(input_ids)))
  File "/home/fkurushin/venv/s2p3_11/lib/python3.11/site-packages/torch/nn/modules/sparse.py", line 190, in forward
    return F.embedding(

Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo"

This is the python setup:

pytorch-triton==3.4.0
pytorch-triton-xpu==3.3.1
torch==2.7.1+cu128
torchaudio==2.7.1+cu128
torchcodec==0.4.0+cu128
torchvision==0.22.1+cu128
...
pytorch-triton==3.4.0
pytorch-triton-xpu==3.3.1
triton==3.3.1

OS:

Distributor ID: Debian
Description:    Debian GNU/Linux 12 (bookworm)
Release:        12
Codename:       bookworm

Thank you!

SpirinEgor

deep vk org Aug 28, 2025

Hi!

**tokenizer(
        ["iphone 15", "iphone 16"], padding=True, truncation=True, return_tensors="pt"
    )

It looks like the tokenizer returns tensors on the CPU, but the model is on the GPU. Try moving the tensors to the GPU before passing them into the model.

Fedor99

Aug 28, 2025

•

edited Aug 28, 2025

Hi, thank you for the reply. The first example was bad, I'm sorry.

from transformers import AutoTokenizer, AutoModel
MODEL_NAME = "deepvk/USER2-small"
TOKENIZER_NAME = "deepvk/USER2-small"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(TOKENIZER_NAME)
model.to('cuda:6')
tok = tokenizer(
        ["iphone 15", "iphone 16"], padding=True, truncation=True, return_tensors="pt"
)
tok.to('cuda:6')
output = model(
    **tok
)

This is the GPU example and the GPU error as well, although the code runs on the CPU without any problems.

---------------------------------------------------------------------------
InductorError                             Traceback (most recent call last)
Cell In[3], line 1
----> 1 output = model(
      2     **tok
      3 )

File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/nn/modules/module.py:1751, in Module._wrapped_call_impl(self, *args, **kwargs)
   1749     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1750 else:
-> 1751     return self._call_impl(*args, **kwargs)

File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/nn/modules/module.py:1762, in Module._call_impl(self, *args, **kwargs)
   1757 # If we don't have any hooks, we want to skip the rest of the logic in
   1758 # this function, and just call forward.
   1759 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1760         or _global_backward_pre_hooks or _global_backward_hooks
   1761         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1762     return forward_call(*args, **kwargs)
   1764 result = None
   1765 called_always_called_hooks = set()

File ~/venv/s2p3_11/lib/python3.11/site-packages/transformers/models/modernbert/modeling_modernbert.py:850, in ModernBertModel.forward(self, input_ids, attention_mask, sliding_window_mask, position_ids, inputs_embeds, indices, cu_seqlens, max_seqlen, batch_size, seq_len, output_attentions, output_hidden_states, return_dict)
    844         position_ids = torch.arange(seq_len, device=device).unsqueeze(0)
    846     attention_mask, sliding_window_mask = self._update_attention_mask(
    847         attention_mask, output_attentions=output_attentions
    848     )
--> 850 hidden_states = self.embeddings(input_ids=input_ids, inputs_embeds=inputs_embeds)
    852 for encoder_layer in self.layers:
    853     if output_hidden_states:

File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/nn/modules/module.py:1751, in Module._wrapped_call_impl(self, *args, **kwargs)
   1749     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1750 else:
-> 1751     return self._call_impl(*args, **kwargs)

File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/nn/modules/module.py:1762, in Module._call_impl(self, *args, **kwargs)
   1757 # If we don't have any hooks, we want to skip the rest of the logic in
   1758 # this function, and just call forward.
   1759 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1760         or _global_backward_pre_hooks or _global_backward_hooks
   1761         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1762     return forward_call(*args, **kwargs)
   1764 result = None
   1765 called_always_called_hooks = set()

File ~/venv/s2p3_11/lib/python3.11/site-packages/transformers/models/modernbert/modeling_modernbert.py:216, in ModernBertEmbeddings.forward(self, input_ids, inputs_embeds)
    213     hidden_states = self.drop(self.norm(inputs_embeds))
    214 else:
    215     hidden_states = (
--> 216         self.compiled_embeddings(input_ids)
    217         if self.config.reference_compile
    218         else self.drop(self.norm(self.tok_embeddings(input_ids)))
    219     )
    220 return hidden_states

File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py:663, in _TorchDynamoContext.__call__.<locals>._fn(*args, **kwargs)
    659     raise e.with_traceback(None) from None
    660 except ShortenTraceback as e:
    661     # Failures in the backend likely don't have useful
    662     # data in the TorchDynamo frames, so we strip them out.
--> 663     raise e.remove_dynamo_frames() from None  # see TORCHDYNAMO_VERBOSE=1
    664 finally:
    665     # Restore the dynamic layer stack depth if necessary.
    666     set_eval_frame(None)

File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/compile_fx.py:760, in _compile_fx_inner(gm, example_inputs, **graph_kwargs)
    758     raise
    759 except Exception as e:
--> 760     raise InductorError(e, currentframe()).with_traceback(
    761         e.__traceback__
    762     ) from None
    763 finally:
    764     TritonBundler.end_compile()

File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/compile_fx.py:745, in _compile_fx_inner(gm, example_inputs, **graph_kwargs)
    743 TritonBundler.begin_compile()
    744 try:
--> 745     mb_compiled_graph = fx_codegen_and_compile(
    746         gm, example_inputs, inputs_to_check, **graph_kwargs
    747     )
    748     assert mb_compiled_graph is not None
    749     mb_compiled_graph._time_taken_ns = time.time_ns() - start_time

File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/compile_fx.py:1295, in fx_codegen_and_compile(gm, example_inputs, inputs_to_check, **graph_kwargs)
   1291     from .compile_fx_subproc import _SubprocessFxCompile
   1293     scheme = _SubprocessFxCompile()
-> 1295 return scheme.codegen_and_compile(gm, example_inputs, inputs_to_check, graph_kwargs)

File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/compile_fx.py:1197, in _InProcessFxCompile.codegen_and_compile(self, gm, example_inputs, inputs_to_check, graph_kwargs)
   1184             compiled_fn = AotCodeCompiler.compile(
   1185                 graph,
   1186                 wrapper_code.value,
   (...)   1194                 ],
   1195             )
   1196     else:
-> 1197         compiled_fn = graph.compile_to_module().call
   1199 num_bytes, nodes_num_elem, node_runtimes = graph.count_bytes()
   1200 metrics.num_bytes_accessed += num_bytes

File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/graph.py:2083, in GraphLowering.compile_to_module(self)
   2076 def compile_to_module(self) -> ModuleType:
   2077     with dynamo_timed(
   2078         "GraphLowering.compile_to_module",
   2079         phase_name="code_gen",
   2080         log_pt2_compile_event=True,
   2081         dynamo_compile_column_us="inductor_code_gen_cumulative_compile_time_us",
   2082     ):
-> 2083         return self._compile_to_module()

File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/graph.py:2130, in GraphLowering._compile_to_module(self)
   2124     trace_structured(
   2125         "inductor_output_code",
   2126         lambda: {"filename": path},
   2127         payload_fn=lambda: wrapper_code.value,
   2128     )
   2129 with dynamo_timed("PyCodeCache.load_by_key_path", log_pt2_compile_event=True):
-> 2130     mod = PyCodeCache.load_by_key_path(
   2131         key,
   2132         path,
   2133         linemap=linemap,  # type: ignore[arg-type]
   2134         attrs={**self.constants, **self.torchbind_constants},
   2135     )
   2136 self.cache_key = key
   2137 self.cache_path = path

File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/codecache.py:2747, in PyCodeCache.load_by_key_path(cls, key, path, linemap, attrs)
   2744 if linemap is None:
   2745     linemap = []
-> 2747 mod = _reload_python_module(key, path)
   2749 # unzip into separate lines/nodes lists
   2750 cls.linemaps[path] = list(zip(*linemap))

File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/runtime/compile_tasks.py:36, in _reload_python_module(key, path)
     34 mod.__file__ = path
     35 mod.key = key  # type: ignore[attr-defined]
---> 36 exec(code, mod.__dict__, mod.__dict__)
     37 sys.modules[mod.__name__] = mod
     38 return mod

File /tmp/torchinductor_fkurushin/tf/ctf7mar2xkgqznrahfvgzfthle4v7qpb4opxscap5r6ee3rwcbvv.py:119
     36 # kernel path: /tmp/torchinductor_fkurushin/au/caujvggywsoktbv4uj7v54rnzomxiwuv64al5jvxs5gitkhjc4zz.py
     37 # Topologically Sorted Source Nodes: [embedding, layer_norm], Original ATen: [aten.embedding, aten.native_layer_norm]
     38 # Source node to ATen node mapping:
   (...)     47 #   %mul_3 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub_2, %rsqrt), kwargs = {})
     48 #   %mul_4 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mul_3, %primals_5), kwargs = {})
     49 triton_per_fused_embedding_native_layer_norm_0 = async_compile.triton('triton_per_fused_embedding_native_layer_norm_0', '''
     50 import triton
     51 import triton.language as tl
   (...)    115     tl.store(out_ptr1 + (x0), tmp16, None)
    116 ''', device_str='cuda')
--> 119 async_compile.wait(globals())
    120 del async_compile
    122 def call(args):

File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/async_compile.py:424, in AsyncCompile.wait(self, scope)
    417 if get_compile_threads() > 1:
    418     with dynamo_timed(
    419         "async_compile.wait",
    420         log_pt2_compile_event=True,
    421         dynamo_compile_column_us="triton_compile_time_us",
    422         log_waitcounter=True,
    423     ):
--> 424         self._wait_futures(scope)
    426 _compile_end()

File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/async_compile.py:445, in AsyncCompile._wait_futures(self, scope)
    443     pbar.set_postfix_str(key)
    444 try:
--> 445     scope[key] = result.result()
    446 except BrokenProcessPool as e:
    447     raise RuntimeError(
    448         "A compilation subprocess exited unexpectedly. This "
    449         "is likely due to a crash. To facilitate debugging, "
    450         "you can re-run with TORCHINDUCTOR_COMPILE_THREADS=1 "
    451         "to cause compilation to occur in the main process."
    452     ) from e

File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/codecache.py:3224, in LambdaFuture.result(self)
   3223 def result(self) -> Callable[..., Any]:  # type: ignore[override]
-> 3224     return self.result_fn()

File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/async_compile.py:325, in AsyncCompile.triton.<locals>.get_result()
    322 # Now that we've compiled, we should clear the future
    323 # so it can't be used again
    324 CompiledTritonKernels.remove_future(source_code)
--> 325 kernel.precompile(
    326     warm_cache_only=False, reload_kernel=reload_kernel_in_parent
    327 )
    328 get_metrics_context().add_top_n(
    329     "triton_kernel_compile_times_us", kernel_name, elapsed_us
    330 )
    331 return kernel

File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/runtime/triton_heuristics.py:277, in CachingAutotuner.precompile(self, warm_cache_only, reload_kernel)
    275     self._reload_kernel = reload_kernel
    276 self._precompile_worker()
--> 277 self._make_launchers()
    278 self._dynamic_scale_rblock()

File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/runtime/triton_heuristics.py:434, in CachingAutotuner._make_launchers(self)
    432 for result in self.compile_results:
    433     try:
--> 434         launchers.append(result.make_launcher())
    436     except (OutOfResources, PTXASError) as e:
    437         exc = e

File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/runtime/triton_heuristics.py:1153, in TritonCompileResult.make_launcher(self)
   1140     def_args = [
   1141         name
   1142         for name in fn.arg_names
   1143         if name not in cfg_dict and name not in none_args
   1144     ]
   1146 binary_shared = (
   1147     binary.shared if hasattr(binary, "shared") else binary.metadata.shared
   1148 )
   1150 scope = {
   1151     "grid_meta": cfg.kwargs,
   1152     "bin": binary,
-> 1153     "launch_enter_hook": binary.__class__.launch_enter_hook,
   1154     "launch_exit_hook": binary.__class__.launch_exit_hook,
   1155     "metadata": (
   1156         binary.packed_metadata
   1157         if hasattr(binary, "packed_metadata")
   1158         else binary.metadata
   1159     ),
   1160     "shared": binary_shared,
   1161     "num_warps": (
   1162         binary.num_warps
   1163         if hasattr(binary, "num_warps")
   1164         else binary.metadata.num_warps
   1165     ),
   1166     "cta_args": (
   1167         (
   1168             binary.num_ctas,
   1169             *get_first_attr(binary, "cluster_dims", "clusterDims"),
   1170         )
   1171         if hasattr(binary, "num_ctas")
   1172         else (
   1173             (binary.metadata.num_ctas, *binary.metadata.cluster_dims)
   1174             if hasattr(binary, "metadata")
   1175             else ()
   1176         )
   1177     ),
   1178     "function": get_first_attr(binary, "function", "cu_function"),
   1179     "runner": get_first_attr(binary, "run", "c_wrapper"),
   1180 }
   1182 if not hasattr(binary, "launch_metadata"):
   1183     # launch args before CompiledKernel.launch_metadata is added.
   1184     # TODO(jansel): delete this branch in mid-2025
   1185     runner_args = [
   1186         "grid_0",
   1187         "grid_1",
   (...)   1197         *call_args,
   1198     ]

InductorError: AttributeError: type object 'CompiledKernel' has no attribute 'launch_enter_hook'

Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo"

Thank you

Upload images, audio, and videos by dragging in the text input, pasting, or clicking here.

Tap or paste here to upload images

· Sign up or log in to comment