ImportError: cannot import name 'intel' from 'triton._C.libtriton' (/home/fkurushin/venv/pqr/lib/python3.11/site-packages/triton/_C/libtriton.so)
#2
by
Fedor99
- opened
Hello! I'm having problems running your model on a GPU.
The minimal code to reproduce the error:
from transformers import AutoTokenizer, AutoModel
MODEL_NAME = "deepvk/USER2-small"
TOKENIZER_NAME = "deepvk/USER2-small"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(TOKENIZER_NAME)
model.to('cuda:7')
output = model(
**tokenizer(
["iphone 15", "iphone 16"], padding=True, truncation=True, return_tensors="pt"
)
)
The error itself:
TorchRuntimeError: Dynamo failed to run FX node with fake tensors: call_function <function embedding at 0x7f2364d46ac0>(*(FakeTensor(..., size=(s0, s1), dtype=torch.int64), Parameter(FakeTensor(..., device='cuda:7', size=(50368, 384), requires_grad=True)), 50283, None, 2.0, False, False), **{}): got RuntimeError('Unhandled FakeTensor Device Propagation for aten.embedding.default, found two different devices cuda:7, cpu')
from user code:
File "/home/fkurushin/venv/s2p3_11/lib/python3.11/site-packages/transformers/models/modernbert/modeling_modernbert.py", line 207, in compiled_embeddings
return self.drop(self.norm(self.tok_embeddings(input_ids)))
File "/home/fkurushin/venv/s2p3_11/lib/python3.11/site-packages/torch/nn/modules/sparse.py", line 190, in forward
return F.embedding(
Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo"
This is the python setup:
pytorch-triton==3.4.0
pytorch-triton-xpu==3.3.1
torch==2.7.1+cu128
torchaudio==2.7.1+cu128
torchcodec==0.4.0+cu128
torchvision==0.22.1+cu128
...
pytorch-triton==3.4.0
pytorch-triton-xpu==3.3.1
triton==3.3.1
OS:
Distributor ID: Debian
Description: Debian GNU/Linux 12 (bookworm)
Release: 12
Codename: bookworm
Thank you!
Hi!
**tokenizer(
["iphone 15", "iphone 16"], padding=True, truncation=True, return_tensors="pt"
)
It looks like the tokenizer returns tensors on the CPU, but the model is on the GPU. Try moving the tensors to the GPU before passing them into the model.
Hi, thank you for the reply. The example was bad, I'm sorry.
from transformers import AutoTokenizer, AutoModel
MODEL_NAME = "deepvk/USER2-small"
TOKENIZER_NAME = "deepvk/USER2-small"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(TOKENIZER_NAME)
model.to('cuda:6')
tok = tokenizer(
["iphone 15", "iphone 16"], padding=True, truncation=True, return_tensors="pt"
)
tok.to('cuda:6')
output = model(
**tok
)
This is the GPU example and the GPU error as well, although the code runs on the CPU without any problems.
---------------------------------------------------------------------------
InductorError Traceback (most recent call last)
Cell In[3], line 1
----> 1 output = model(
2 **tok
3 )
File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/nn/modules/module.py:1751, in Module._wrapped_call_impl(self, *args, **kwargs)
1749 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1750 else:
-> 1751 return self._call_impl(*args, **kwargs)
File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/nn/modules/module.py:1762, in Module._call_impl(self, *args, **kwargs)
1757 # If we don't have any hooks, we want to skip the rest of the logic in
1758 # this function, and just call forward.
1759 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1760 or _global_backward_pre_hooks or _global_backward_hooks
1761 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1762 return forward_call(*args, **kwargs)
1764 result = None
1765 called_always_called_hooks = set()
File ~/venv/s2p3_11/lib/python3.11/site-packages/transformers/models/modernbert/modeling_modernbert.py:850, in ModernBertModel.forward(self, input_ids, attention_mask, sliding_window_mask, position_ids, inputs_embeds, indices, cu_seqlens, max_seqlen, batch_size, seq_len, output_attentions, output_hidden_states, return_dict)
844 position_ids = torch.arange(seq_len, device=device).unsqueeze(0)
846 attention_mask, sliding_window_mask = self._update_attention_mask(
847 attention_mask, output_attentions=output_attentions
848 )
--> 850 hidden_states = self.embeddings(input_ids=input_ids, inputs_embeds=inputs_embeds)
852 for encoder_layer in self.layers:
853 if output_hidden_states:
File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/nn/modules/module.py:1751, in Module._wrapped_call_impl(self, *args, **kwargs)
1749 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1750 else:
-> 1751 return self._call_impl(*args, **kwargs)
File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/nn/modules/module.py:1762, in Module._call_impl(self, *args, **kwargs)
1757 # If we don't have any hooks, we want to skip the rest of the logic in
1758 # this function, and just call forward.
1759 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1760 or _global_backward_pre_hooks or _global_backward_hooks
1761 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1762 return forward_call(*args, **kwargs)
1764 result = None
1765 called_always_called_hooks = set()
File ~/venv/s2p3_11/lib/python3.11/site-packages/transformers/models/modernbert/modeling_modernbert.py:216, in ModernBertEmbeddings.forward(self, input_ids, inputs_embeds)
213 hidden_states = self.drop(self.norm(inputs_embeds))
214 else:
215 hidden_states = (
--> 216 self.compiled_embeddings(input_ids)
217 if self.config.reference_compile
218 else self.drop(self.norm(self.tok_embeddings(input_ids)))
219 )
220 return hidden_states
File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py:663, in _TorchDynamoContext.__call__.<locals>._fn(*args, **kwargs)
659 raise e.with_traceback(None) from None
660 except ShortenTraceback as e:
661 # Failures in the backend likely don't have useful
662 # data in the TorchDynamo frames, so we strip them out.
--> 663 raise e.remove_dynamo_frames() from None # see TORCHDYNAMO_VERBOSE=1
664 finally:
665 # Restore the dynamic layer stack depth if necessary.
666 set_eval_frame(None)
File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/compile_fx.py:760, in _compile_fx_inner(gm, example_inputs, **graph_kwargs)
758 raise
759 except Exception as e:
--> 760 raise InductorError(e, currentframe()).with_traceback(
761 e.__traceback__
762 ) from None
763 finally:
764 TritonBundler.end_compile()
File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/compile_fx.py:745, in _compile_fx_inner(gm, example_inputs, **graph_kwargs)
743 TritonBundler.begin_compile()
744 try:
--> 745 mb_compiled_graph = fx_codegen_and_compile(
746 gm, example_inputs, inputs_to_check, **graph_kwargs
747 )
748 assert mb_compiled_graph is not None
749 mb_compiled_graph._time_taken_ns = time.time_ns() - start_time
File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/compile_fx.py:1295, in fx_codegen_and_compile(gm, example_inputs, inputs_to_check, **graph_kwargs)
1291 from .compile_fx_subproc import _SubprocessFxCompile
1293 scheme = _SubprocessFxCompile()
-> 1295 return scheme.codegen_and_compile(gm, example_inputs, inputs_to_check, graph_kwargs)
File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/compile_fx.py:1197, in _InProcessFxCompile.codegen_and_compile(self, gm, example_inputs, inputs_to_check, graph_kwargs)
1184 compiled_fn = AotCodeCompiler.compile(
1185 graph,
1186 wrapper_code.value,
(...) 1194 ],
1195 )
1196 else:
-> 1197 compiled_fn = graph.compile_to_module().call
1199 num_bytes, nodes_num_elem, node_runtimes = graph.count_bytes()
1200 metrics.num_bytes_accessed += num_bytes
File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/graph.py:2083, in GraphLowering.compile_to_module(self)
2076 def compile_to_module(self) -> ModuleType:
2077 with dynamo_timed(
2078 "GraphLowering.compile_to_module",
2079 phase_name="code_gen",
2080 log_pt2_compile_event=True,
2081 dynamo_compile_column_us="inductor_code_gen_cumulative_compile_time_us",
2082 ):
-> 2083 return self._compile_to_module()
File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/graph.py:2130, in GraphLowering._compile_to_module(self)
2124 trace_structured(
2125 "inductor_output_code",
2126 lambda: {"filename": path},
2127 payload_fn=lambda: wrapper_code.value,
2128 )
2129 with dynamo_timed("PyCodeCache.load_by_key_path", log_pt2_compile_event=True):
-> 2130 mod = PyCodeCache.load_by_key_path(
2131 key,
2132 path,
2133 linemap=linemap, # type: ignore[arg-type]
2134 attrs={**self.constants, **self.torchbind_constants},
2135 )
2136 self.cache_key = key
2137 self.cache_path = path
File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/codecache.py:2747, in PyCodeCache.load_by_key_path(cls, key, path, linemap, attrs)
2744 if linemap is None:
2745 linemap = []
-> 2747 mod = _reload_python_module(key, path)
2749 # unzip into separate lines/nodes lists
2750 cls.linemaps[path] = list(zip(*linemap))
File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/runtime/compile_tasks.py:36, in _reload_python_module(key, path)
34 mod.__file__ = path
35 mod.key = key # type: ignore[attr-defined]
---> 36 exec(code, mod.__dict__, mod.__dict__)
37 sys.modules[mod.__name__] = mod
38 return mod
File /tmp/torchinductor_fkurushin/tf/ctf7mar2xkgqznrahfvgzfthle4v7qpb4opxscap5r6ee3rwcbvv.py:119
36 # kernel path: /tmp/torchinductor_fkurushin/au/caujvggywsoktbv4uj7v54rnzomxiwuv64al5jvxs5gitkhjc4zz.py
37 # Topologically Sorted Source Nodes: [embedding, layer_norm], Original ATen: [aten.embedding, aten.native_layer_norm]
38 # Source node to ATen node mapping:
(...) 47 # %mul_3 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub_2, %rsqrt), kwargs = {})
48 # %mul_4 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mul_3, %primals_5), kwargs = {})
49 triton_per_fused_embedding_native_layer_norm_0 = async_compile.triton('triton_per_fused_embedding_native_layer_norm_0', '''
50 import triton
51 import triton.language as tl
(...) 115 tl.store(out_ptr1 + (x0), tmp16, None)
116 ''', device_str='cuda')
--> 119 async_compile.wait(globals())
120 del async_compile
122 def call(args):
File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/async_compile.py:424, in AsyncCompile.wait(self, scope)
417 if get_compile_threads() > 1:
418 with dynamo_timed(
419 "async_compile.wait",
420 log_pt2_compile_event=True,
421 dynamo_compile_column_us="triton_compile_time_us",
422 log_waitcounter=True,
423 ):
--> 424 self._wait_futures(scope)
426 _compile_end()
File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/async_compile.py:445, in AsyncCompile._wait_futures(self, scope)
443 pbar.set_postfix_str(key)
444 try:
--> 445 scope[key] = result.result()
446 except BrokenProcessPool as e:
447 raise RuntimeError(
448 "A compilation subprocess exited unexpectedly. This "
449 "is likely due to a crash. To facilitate debugging, "
450 "you can re-run with TORCHINDUCTOR_COMPILE_THREADS=1 "
451 "to cause compilation to occur in the main process."
452 ) from e
File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/codecache.py:3224, in LambdaFuture.result(self)
3223 def result(self) -> Callable[..., Any]: # type: ignore[override]
-> 3224 return self.result_fn()
File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/async_compile.py:325, in AsyncCompile.triton.<locals>.get_result()
322 # Now that we've compiled, we should clear the future
323 # so it can't be used again
324 CompiledTritonKernels.remove_future(source_code)
--> 325 kernel.precompile(
326 warm_cache_only=False, reload_kernel=reload_kernel_in_parent
327 )
328 get_metrics_context().add_top_n(
329 "triton_kernel_compile_times_us", kernel_name, elapsed_us
330 )
331 return kernel
File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/runtime/triton_heuristics.py:277, in CachingAutotuner.precompile(self, warm_cache_only, reload_kernel)
275 self._reload_kernel = reload_kernel
276 self._precompile_worker()
--> 277 self._make_launchers()
278 self._dynamic_scale_rblock()
File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/runtime/triton_heuristics.py:434, in CachingAutotuner._make_launchers(self)
432 for result in self.compile_results:
433 try:
--> 434 launchers.append(result.make_launcher())
436 except (OutOfResources, PTXASError) as e:
437 exc = e
File ~/venv/s2p3_11/lib/python3.11/site-packages/torch/_inductor/runtime/triton_heuristics.py:1153, in TritonCompileResult.make_launcher(self)
1140 def_args = [
1141 name
1142 for name in fn.arg_names
1143 if name not in cfg_dict and name not in none_args
1144 ]
1146 binary_shared = (
1147 binary.shared if hasattr(binary, "shared") else binary.metadata.shared
1148 )
1150 scope = {
1151 "grid_meta": cfg.kwargs,
1152 "bin": binary,
-> 1153 "launch_enter_hook": binary.__class__.launch_enter_hook,
1154 "launch_exit_hook": binary.__class__.launch_exit_hook,
1155 "metadata": (
1156 binary.packed_metadata
1157 if hasattr(binary, "packed_metadata")
1158 else binary.metadata
1159 ),
1160 "shared": binary_shared,
1161 "num_warps": (
1162 binary.num_warps
1163 if hasattr(binary, "num_warps")
1164 else binary.metadata.num_warps
1165 ),
1166 "cta_args": (
1167 (
1168 binary.num_ctas,
1169 *get_first_attr(binary, "cluster_dims", "clusterDims"),
1170 )
1171 if hasattr(binary, "num_ctas")
1172 else (
1173 (binary.metadata.num_ctas, *binary.metadata.cluster_dims)
1174 if hasattr(binary, "metadata")
1175 else ()
1176 )
1177 ),
1178 "function": get_first_attr(binary, "function", "cu_function"),
1179 "runner": get_first_attr(binary, "run", "c_wrapper"),
1180 }
1182 if not hasattr(binary, "launch_metadata"):
1183 # launch args before CompiledKernel.launch_metadata is added.
1184 # TODO(jansel): delete this branch in mid-2025
1185 runner_args = [
1186 "grid_0",
1187 "grid_1",
(...) 1197 *call_args,
1198 ]
InductorError: AttributeError: type object 'CompiledKernel' has no attribute 'launch_enter_hook'
Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo"
Thank you