RuntimeError: "You are attempting to use Flash Attention 2.0 with a model not initialized on GPU."

#35
by dophys - opened

Hello, I encountered an error. Can you help me?

from sentence_transformers import SentenceTransformer

# 1. Load a pretrained Sentence Transformer model
model = SentenceTransformer("/data/models/jina-embeddings-v4", trust_remote_code=True)

# The sentences to encode
sentences = [
    "The weather is lovely today.",
    "It's so sunny outside!",
    "He drove to the stadium.",
]

# 2. Calculate embeddings by calling model.encode()
embeddings = model.encode(sentences)
print(embeddings.shape)
# [3, 384]

# 3. Calculate the embedding similarities
similarities = model.similarity(embeddings, embeddings)
print(similarities)
# tensor([[1.0000, 0.6660, 0.1046],
#         [0.6660, 1.0000, 0.1411],
#         [0.1046, 0.1411, 1.0000]])

When I run the above, it shows the following runtime error:

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
---------------------------------------------------------------------------
Exception                                 Traceback (most recent call last)
Cell In[1], line 4
      1 from sentence_transformers import SentenceTransformer
      3 # 1. Load a pretrained Sentence Transformer model
----> 4 model = SentenceTransformer("/data/models/jina-embeddings-v4", trust_remote_code=True)
      6 # The sentences to encode
      7 sentences = [
      8     "The weather is lovely today.",
      9     "It's so sunny outside!",
     10     "He drove to the stadium.",
     11 ]

File /opt/anaconda3/lib/python3.11/site-packages/sentence_transformers/SentenceTransformer.py:309, in SentenceTransformer.__init__(self, model_name_or_path, modules, device, prompts, default_prompt_name, similarity_fn_name, cache_folder, trust_remote_code, revision, local_files_only, token, use_auth_token, truncate_dim, model_kwargs, tokenizer_kwargs, config_kwargs, model_card_data, backend)
    300         model_name_or_path = __MODEL_HUB_ORGANIZATION__ + "/" + model_name_or_path
    302 if is_sentence_transformer_model(
    303     model_name_or_path,
    304     token,
   (...)
    307     local_files_only=local_files_only,
    308 ):
--> 309     modules, self.module_kwargs = self._load_sbert_model(
    310         model_name_or_path,
    311         token=token,
    312         cache_folder=cache_folder,
    313         revision=revision,
    314         trust_remote_code=trust_remote_code,
    315         local_files_only=local_files_only,
    316         model_kwargs=model_kwargs,
    317         tokenizer_kwargs=tokenizer_kwargs,
    318         config_kwargs=config_kwargs,
    319     )
    320 else:
    321     modules = self._load_auto_model(
    322         model_name_or_path,
    323         token=token,
   (...)
    330         config_kwargs=config_kwargs,
    331     )

File /opt/anaconda3/lib/python3.11/site-packages/sentence_transformers/SentenceTransformer.py:1808, in SentenceTransformer._load_sbert_model(self, model_name_or_path, token, cache_folder, revision, trust_remote_code, local_files_only, model_kwargs, tokenizer_kwargs, config_kwargs)
   1805 # Try to initialize the module with a lot of kwargs, but only if the module supports them
   1806 # Otherwise we fall back to the load method
   1807 try:
-> 1808     module = module_class(model_name_or_path, cache_dir=cache_folder, backend=self.backend, **kwargs)
   1809 except TypeError:
   1810     module = module_class.load(model_name_or_path)

File /data/cache/huggingface/modules/transformers_modules/jina-embeddings-v4/custom_st.py:45, in Transformer.__init__(self, model_name_or_path, max_seq_length, config_args, model_args, tokenizer_args, cache_dir, backend, **kwargs)
     40 if self.default_task and self.default_task not in self.config.task_names:
     41     raise ValueError(
     42         f"Invalid task: {self.default_task}. Must be one of {self.config.task_names}."
     43     )
---> 45 self.model = AutoModel.from_pretrained(
     46     model_name_or_path, config=self.config, cache_dir=cache_dir, **model_kwargs
     47 )
     48 self.processor = AutoProcessor.from_pretrained(
     49     model_name_or_path,
     50     cache_dir=cache_dir,
     51     use_fast=True,
     52     **tokenizer_kwargs,
     53 )
     54 self.max_seq_length = max_seq_length or 8192

File /opt/anaconda3/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py:564, in _BaseAutoModelClass.from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
    562     cls.register(config.__class__, model_class, exist_ok=True)
    563     model_class = add_generation_mixin_to_remote_model(model_class)
--> 564     return model_class.from_pretrained(
    565         pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs
    566     )
    567 elif type(config) in cls._model_mapping.keys():
    568     model_class = _get_model_class(config, cls._model_mapping)

File /data/cache/huggingface/modules/transformers_modules/jina-embeddings-v4/modeling_jina_embeddings_v4.py:565, in JinaEmbeddingsV4Model.from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs)
    562 if not is_flash_attn_2_available():
    563     kwargs["attn_implementation"] = "sdpa"
--> 565 base_model = super().from_pretrained(
    566     pretrained_model_name_or_path, *args, **kwargs
    567 )
    569 # Configure adapter directory
    570 if os.path.isdir(base_model.name_or_path):

File /opt/anaconda3/lib/python3.11/site-packages/transformers/modeling_utils.py:309, in restore_default_torch_dtype.<locals>._wrapper(*args, **kwargs)
    307 old_dtype = torch.get_default_dtype()
    308 try:
--> 309     return func(*args, **kwargs)
    310 finally:
    311     torch.set_default_dtype(old_dtype)

File /opt/anaconda3/lib/python3.11/site-packages/transformers/modeling_utils.py:4508, in PreTrainedModel.from_pretrained(cls, pretrained_model_name_or_path, config, cache_dir, ignore_mismatched_sizes, force_download, local_files_only, token, revision, use_safetensors, weights_only, *model_args, **kwargs)
   4499     config = cls._autoset_attn_implementation(
   4500         config,
   4501         use_flash_attention_2=use_flash_attention_2,
   4502         torch_dtype=torch_dtype,
   4503         device_map=device_map,
   4504     )
   4506 with ContextManagers(model_init_context):
   4507     # Let's make sure we don't run the init function of buffer modules
-> 4508     model = cls(config, *model_args, **model_kwargs)
   4510 # Make sure to tie the weights correctly
   4511 model.tie_weights()

File /data/cache/huggingface/modules/transformers_modules/jina-embeddings-v4/modeling_jina_embeddings_v4.py:145, in JinaEmbeddingsV4Model.__init__(self, config)
    143 self._init_projection_layer(config)
    144 self.post_init()
--> 145 self.processor = JinaEmbeddingsV4Processor.from_pretrained(
    146     self.name_or_path, trust_remote_code=True, use_fast=True
    147 )
    148 self.multi_vector_projector_dim = config.multi_vector_projector_dim
    149 self._task = None

File /opt/anaconda3/lib/python3.11/site-packages/transformers/processing_utils.py:1185, in ProcessorMixin.from_pretrained(cls, pretrained_model_name_or_path, cache_dir, force_download, local_files_only, token, revision, **kwargs)
   1182 if token is not None:
   1183     kwargs["token"] = token
-> 1185 args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
   1186 processor_dict, kwargs = cls.get_processor_dict(pretrained_model_name_or_path, **kwargs)
   1187 return cls.from_args_and_dict(args, processor_dict, **kwargs)

File /opt/anaconda3/lib/python3.11/site-packages/transformers/processing_utils.py:1248, in ProcessorMixin._get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs)
   1245     else:
   1246         attribute_class = cls.get_possibly_dynamic_module(class_name)
-> 1248     args.append(attribute_class.from_pretrained(pretrained_model_name_or_path, **kwargs))
   1249 return args

File /opt/anaconda3/lib/python3.11/site-packages/transformers/tokenization_utils_base.py:2025, in PreTrainedTokenizerBase.from_pretrained(cls, pretrained_model_name_or_path, cache_dir, force_download, local_files_only, token, revision, trust_remote_code, *init_inputs, **kwargs)
   2022     else:
   2023         logger.info(f"loading file {file_path} from cache at {resolved_vocab_files[file_id]}")
-> 2025 return cls._from_pretrained(
   2026     resolved_vocab_files,
   2027     pretrained_model_name_or_path,
   2028     init_configuration,
   2029     *init_inputs,
   2030     token=token,
   2031     cache_dir=cache_dir,
   2032     local_files_only=local_files_only,
   2033     _commit_hash=commit_hash,
   2034     _is_local=is_local,
   2035     trust_remote_code=trust_remote_code,
   2036     **kwargs,
   2037 )

File /opt/anaconda3/lib/python3.11/site-packages/transformers/tokenization_utils_base.py:2278, in PreTrainedTokenizerBase._from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, token, cache_dir, local_files_only, _commit_hash, _is_local, trust_remote_code, *init_inputs, **kwargs)
   2276 # Instantiate the tokenizer.
   2277 try:
-> 2278     tokenizer = cls(*init_inputs, **init_kwargs)
   2279 except import_protobuf_decode_error():
   2280     logger.info(
   2281         "Unable to load tokenizer model from SPM, loading from TikToken will be attempted instead."
   2282         "(Google protobuf error: Tried to load SPM model with non-SPM vocab file).",
   2283     )

File /opt/anaconda3/lib/python3.11/site-packages/transformers/models/qwen2/tokenization_qwen2_fast.py:120, in Qwen2TokenizerFast.__init__(self, vocab_file, merges_file, tokenizer_file, unk_token, bos_token, eos_token, pad_token, **kwargs)
    109 unk_token = (
    110     AddedToken(unk_token, lstrip=False, rstrip=False, special=True, normalized=False)
    111     if isinstance(unk_token, str)
    112     else unk_token
    113 )
    114 pad_token = (
    115     AddedToken(pad_token, lstrip=False, rstrip=False, special=True, normalized=False)
    116     if isinstance(pad_token, str)
    117     else pad_token
    118 )
--> 120 super().__init__(
    121     vocab_file=vocab_file,
    122     merges_file=merges_file,
    123     tokenizer_file=tokenizer_file,
    124     unk_token=unk_token,
    125     bos_token=bos_token,
    126     eos_token=eos_token,
    127     pad_token=pad_token,
    128     **kwargs,
    129 )

File /opt/anaconda3/lib/python3.11/site-packages/transformers/tokenization_utils_fast.py:117, in PreTrainedTokenizerFast.__init__(self, *args, **kwargs)
    114     fast_tokenizer = copy.deepcopy(tokenizer_object)
    115 elif fast_tokenizer_file is not None and not from_slow:
    116     # We have a serialization from tokenizers which let us directly build the backend
--> 117     fast_tokenizer = TokenizerFast.from_file(fast_tokenizer_file)
    118 elif slow_tokenizer:
    119     # We need to convert a slow tokenizer to build the backend
    120     fast_tokenizer = convert_slow_tokenizer(slow_tokenizer)

Exception: expected value at line 1 column 1

This error seems to be related to Flash Attention 2. However, jina-embeddings-v3 works fine, and the same problem also occurs when loading the model directly with transformers. Looking forward to your reply.

Jina AI org

Hi @dophys , my guess is that you haven't pulled the tokenizer.json file from Git LFS. When the loader tries to parse the LFS pointer file as JSON, it throws the Exception: expected value at line 1 column 1 error. Can you try running git lfs pull --include "tokenizer.json"? I think this should fix your issue.
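
If you want to confirm that this is the cause: an un-pulled LFS pointer is a small text file whose first line starts with version https://git-lfs..., which is exactly what makes the JSON parser fail at line 1, column 1. A minimal check (a sketch, assuming the model was cloned to /data/models/jina-embeddings-v4 as in the code above):

from pathlib import Path

# An un-pulled Git LFS pointer begins with this marker instead of JSON ("{").
head = Path("/data/models/jina-embeddings-v4/tokenizer.json").read_text(errors="ignore")[:64]
if head.startswith("version https://git-lfs"):
    print("tokenizer.json is still an LFS pointer - run git lfs pull")
else:
    print("tokenizer.json looks like real JSON:", head)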

Jina AI org

The issue was that tokenizer.json wasn’t automatically pulled when cloning the model. I just fixed this, so next time you clone, you won’t have to pull it yourself.

Thank you, that works. However, there is a new problem: "You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with model.to('cuda')." It seems the model is not being placed on the GPU, and calling model.to("cuda") does not help.

from sentence_transformers import SentenceTransformer

# 1. Load a pretrained Sentence Transformer model
model = SentenceTransformer("/data/models/jina-embeddings-v4", trust_remote_code=True)
model.to("cuda")

# The sentences to encode
sentences = [
    "The weather is lovely today.",
    "It's so sunny outside!",
    "He drove to the stadium.",
]

# 2. Calculate embeddings by calling model.encode()
embeddings = model.encode(sentences, task="retrieval", prompt_name="query")
print(embeddings.shape)
# [3, 384]

# 3. Calculate the embedding similarities
similarities = model.similarity(embeddings, embeddings)
print(similarities)
# tensor([[1.0000, 0.6660, 0.1046],
#         [0.6660, 1.0000, 0.1411],
#         [0.1046, 0.1411, 1.0000]])

Wait, I'm not sure about this problem. Please ignore the above for now.
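
Note: that message is only a warning from transformers. It is emitted inside from_pretrained, while the weights are still being materialized on CPU, so calling model.to("cuda") afterwards cannot prevent it; the check has already run. One way to avoid it, sketched below under the assumption that the model's custom loading code forwards model_kwargs to from_pretrained, is to hand the device over at load time:

from sentence_transformers import SentenceTransformer

# Load directly onto the GPU so the Flash Attention device check passes.
# Path as in the snippets above; adjust to your local clone.
model = SentenceTransformer(
    "/data/models/jina-embeddings-v4",
    trust_remote_code=True,
    device="cuda",                        # device sentence-transformers runs on
    model_kwargs={"device_map": "cuda"},  # forwarded to from_pretrained; a device_map
                                          # satisfies the "not initialized on GPU" check
)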

I seem to be encountering a GPU-related error: Flash Attention 2 is being turned on automatically, even though I never installed it.

ImportError                               Traceback (most recent call last)
Cell In[11], line 4
      1 from sentence_transformers import SentenceTransformer
      3 # 1. Load a pretrained Sentence Transformer model
----> 4 model = SentenceTransformer(str(model_embedding_path), trust_remote_code=True)
      6 # The sentences to encode
      7 sentences = [
      8     "The weather is lovely today.",
      9     "It's so sunny outside!",
     10     "He drove to the stadium.",
     11 ]

File c:\Users\colorverse\miniconda3\envs\llm_qdrant\Lib\site-packages\sentence_transformers\SentenceTransformer.py:309, in SentenceTransformer.__init__(self, model_name_or_path, modules, device, prompts, default_prompt_name, similarity_fn_name, cache_folder, trust_remote_code, revision, local_files_only, token, use_auth_token, truncate_dim, model_kwargs, tokenizer_kwargs, config_kwargs, model_card_data, backend)
    300         model_name_or_path = __MODEL_HUB_ORGANIZATION__ + "/" + model_name_or_path
    302 if is_sentence_transformer_model(
    303     model_name_or_path,
    304     token,
   (...)
    307     local_files_only=local_files_only,
    308 ):
--> 309     modules, self.module_kwargs = self._load_sbert_model(
    310         model_name_or_path,
    311         token=token,
    312         cache_folder=cache_folder,
    313         revision=revision,
    314         trust_remote_code=trust_remote_code,
    315         local_files_only=local_files_only,
    316         model_kwargs=model_kwargs,
    317         tokenizer_kwargs=tokenizer_kwargs,
    318         config_kwargs=config_kwargs,
    319     )
    320 else:
    321     modules = self._load_auto_model(
    322         model_name_or_path,
    323         token=token,
   (...)
    330         config_kwargs=config_kwargs,
    331     )

File c:\Users\colorverse\miniconda3\envs\llm_qdrant\Lib\site-packages\sentence_transformers\SentenceTransformer.py:1808, in SentenceTransformer._load_sbert_model(self, model_name_or_path, token, cache_folder, revision, trust_remote_code, local_files_only, model_kwargs, tokenizer_kwargs, config_kwargs)
   1805 # Try to initialize the module with a lot of kwargs, but only if the module supports them
   1806 # Otherwise we fall back to the load method
   1807 try:
-> 1808     module = module_class(model_name_or_path, cache_dir=cache_folder, backend=self.backend, **kwargs)
   1809 except TypeError:
   1810     module = module_class.load(model_name_or_path)

File ~\.cache\huggingface\modules\transformers_modules\jina-embeddings-v4\custom_st.py:45, in Transformer.__init__(self, model_name_or_path, max_seq_length, config_args, model_args, tokenizer_args, cache_dir, backend, **kwargs)
     40 if self.default_task and self.default_task not in self.config.task_names:
     41     raise ValueError(
     42         f"Invalid task: {self.default_task}. Must be one of {self.config.task_names}."
     43     )
---> 45 self.model = AutoModel.from_pretrained(
     46     model_name_or_path, config=self.config, cache_dir=cache_dir, **model_kwargs
     47 )
     48 self.processor = AutoProcessor.from_pretrained(
     49     model_name_or_path,
     50     cache_dir=cache_dir,
     51     use_fast=True,
     52     **tokenizer_kwargs,
     53 )
     54 self.max_seq_length = max_seq_length or 8192

File c:\Users\colorverse\miniconda3\envs\llm_qdrant\Lib\site-packages\transformers\models\auto\auto_factory.py:593, in _BaseAutoModelClass.from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
    591     model_class.register_for_auto_class(auto_class=cls)
    592     model_class = add_generation_mixin_to_remote_model(model_class)
--> 593     return model_class.from_pretrained(
    594         pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs
    595     )
    596 elif type(config) in cls._model_mapping.keys():
    597     model_class = _get_model_class(config, cls._model_mapping)

File ~\.cache\huggingface\modules\transformers_modules\jina-embeddings-v4\modeling_jina_embeddings_v4.py:565, in JinaEmbeddingsV4Model.from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs)
    562 if not is_flash_attn_2_available():
    563     kwargs["attn_implementation"] = "sdpa"
--> 565 base_model = super().from_pretrained(
    566     pretrained_model_name_or_path, *args, **kwargs
    567 )
    569 # Configure adapter directory
    570 if os.path.isdir(base_model.name_or_path):

File c:\Users\colorverse\miniconda3\envs\llm_qdrant\Lib\site-packages\transformers\modeling_utils.py:311, in restore_default_torch_dtype.<locals>._wrapper(*args, **kwargs)
    309 old_dtype = torch.get_default_dtype()
    310 try:
--> 311     return func(*args, **kwargs)
    312 finally:
    313     torch.set_default_dtype(old_dtype)

File c:\Users\colorverse\miniconda3\envs\llm_qdrant\Lib\site-packages\transformers\modeling_utils.py:4760, in PreTrainedModel.from_pretrained(cls, pretrained_model_name_or_path, config, cache_dir, ignore_mismatched_sizes, force_download, local_files_only, token, revision, use_safetensors, weights_only, *model_args, **kwargs)
   4752     config = cls._autoset_attn_implementation(
   4753         config,
   4754         torch_dtype=torch_dtype,
   4755         device_map=device_map,
   4756     )
   4758 with ContextManagers(model_init_context):
   4759     # Let's make sure we don't run the init function of buffer modules
-> 4760     model = cls(config, *model_args, **model_kwargs)
   4762 # Make sure to tie the weights correctly
   4763 model.tie_weights()

File ~\.cache\huggingface\modules\transformers_modules\jina-embeddings-v4\modeling_jina_embeddings_v4.py:142, in JinaEmbeddingsV4Model.__init__(self, config)
    141 def __init__(self, config: JinaEmbeddingsV4Config):
--> 142     Qwen2_5_VLForConditionalGeneration.__init__(self, config)
    143     self._init_projection_layer(config)
    144     self.post_init()

File ~\.cache\huggingface\modules\transformers_modules\jina-embeddings-v4\qwen2_5_vl.py:2121, in Qwen2_5_VLForConditionalGeneration.__init__(self, config)
   2119 def __init__(self, config):
   2120     super().__init__(config)
-> 2121     self.model = Qwen2_5_VLModel(config)
   2122     self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
   2124     self.post_init()

File ~\.cache\huggingface\modules\transformers_modules\jina-embeddings-v4\qwen2_5_vl.py:1709, in Qwen2_5_VLModel.__init__(self, config)
   1707 super().__init__(config)
   1708 self.visual = Qwen2_5_VisionTransformerPretrainedModel._from_config(config.vision_config)
-> 1709 self.language_model = Qwen2_5_VLTextModel._from_config(config.text_config)
   1710 self.rope_deltas = None  # cache rope_deltas here
   1712 # Initialize weights and apply final processing

File c:\Users\colorverse\miniconda3\envs\llm_qdrant\Lib\site-packages\transformers\modeling_utils.py:311, in restore_default_torch_dtype.<locals>._wrapper(*args, **kwargs)
    309 old_dtype = torch.get_default_dtype()
    310 try:
--> 311     return func(*args, **kwargs)
    312 finally:
    313     torch.set_default_dtype(old_dtype)

File c:\Users\colorverse\miniconda3\envs\llm_qdrant\Lib\site-packages\transformers\modeling_utils.py:2191, in PreTrainedModel._from_config(cls, config, **kwargs)
   2189 config._attn_implementation = kwargs.pop("attn_implementation", attn_implementation)
   2190 if not getattr(config, "_attn_implementation_autoset", False):
-> 2191     config = cls._autoset_attn_implementation(
   2192         config,
   2193         check_device_map=False,
   2194         torch_dtype=torch_dtype,
   2195     )
   2197 if is_deepspeed_zero3_enabled() and not _is_quantized and not _is_ds_init_called:
   2198     logger.info("Detected DeepSpeed ZeRO-3: activating zero.init() for this model")

File c:\Users\colorverse\miniconda3\envs\llm_qdrant\Lib\site-packages\transformers\modeling_utils.py:2315, in PreTrainedModel._autoset_attn_implementation(cls, config, torch_dtype, device_map, check_device_map)
   2307     cls._check_and_enable_flash_attn_3(
   2308         config,
   2309         torch_dtype=torch_dtype,
   (...)
   2312         check_device_map=check_device_map,
   2313     )
   2314 elif config._attn_implementation == "flash_attention_2":
-> 2315     cls._check_and_enable_flash_attn_2(
   2316         config,
   2317         torch_dtype=torch_dtype,
   2318         device_map=device_map,
   2319         hard_check_only=False,
   2320         check_device_map=check_device_map,
   2321     )
   2322 elif requested_attn_implementation == "flex_attention":
   2323     config = cls._check_and_enable_flex_attn(config, hard_check_only=True)

File c:\Users\colorverse\miniconda3\envs\llm_qdrant\Lib\site-packages\transformers\modeling_utils.py:2457, in PreTrainedModel._check_and_enable_flash_attn_2(cls, config, torch_dtype, device_map, check_device_map, hard_check_only)
   2455     return config
   2456 else:
-> 2457     raise ImportError(f"{preface} the package flash_attn seems to be not installed. {install_message}")
   2459 flash_attention_version = version.parse(importlib.metadata.version("flash_attn"))
   2460 if torch.version.cuda:

ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.

I got the same error:
ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.

Jina AI org

@DT47-DE @vikyw89 thanks for reporting this issue. It should be fixed now.
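
For anyone who hits this before re-pulling the updated model code: the traceback shows why the sdpa fallback at the top of JinaEmbeddingsV4Model.from_pretrained did not take effect. The nested Qwen2_5_VLTextModel._from_config(config.text_config) re-checks the attention implementation against the text sub-config, which still requests flash_attention_2. A possible workaround (a sketch; attn_implementation is a standard transformers loading option, but whether the remote code forwards it everywhere is an assumption) is to request SDPA explicitly so the load never requires the flash_attn package:

from sentence_transformers import SentenceTransformer

# Ask transformers for the built-in PyTorch SDPA kernels instead of
# flash_attention_2, so the flash_attn package is not needed at load time.
model = SentenceTransformer(
    "/data/models/jina-embeddings-v4",  # local clone path, as in the reports above
    trust_remote_code=True,
    model_kwargs={"attn_implementation": "sdpa"},
)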
