RuntimeError: "You are attempting to use Flash Attention 2.0 with a model not initialized on GPU."
Hello, I encountered an error. Can you help me?
from sentence_transformers import SentenceTransformer

# 1. Load a pretrained Sentence Transformer model
model = SentenceTransformer("/data/models/jina-embeddings-v4", trust_remote_code=True)

# The sentences to encode
sentences = [
    "The weather is lovely today.",
    "It's so sunny outside!",
    "He drove to the stadium.",
]

# 2. Calculate embeddings by calling model.encode()
embeddings = model.encode(sentences)
print(embeddings.shape)
# [3, 384]

# 3. Calculate the embedding similarities
similarities = model.similarity(embeddings, embeddings)
print(similarities)
# tensor([[1.0000, 0.6660, 0.1046],
#         [0.6660, 1.0000, 0.1411],
#         [0.1046, 0.1411, 1.0000]])
When I run the above, it shows the following runtime error:
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
---------------------------------------------------------------------------
Exception Traceback (most recent call last)
Cell In[1], line 4
1 from sentence_transformers import SentenceTransformer
3 # 1. Load a pretrained Sentence Transformer model
----> 4 model = SentenceTransformer("/data/models/jina-embeddings-v4", trust_remote_code=True)
6 # The sentences to encode
7 sentences = [
8 "The weather is lovely today.",
9 "It's so sunny outside!",
10 "He drove to the stadium.",
11 ]
File /opt/anaconda3/lib/python3.11/site-packages/sentence_transformers/SentenceTransformer.py:309, in SentenceTransformer.__init__(self, model_name_or_path, modules, device, prompts, default_prompt_name, similarity_fn_name, cache_folder, trust_remote_code, revision, local_files_only, token, use_auth_token, truncate_dim, model_kwargs, tokenizer_kwargs, config_kwargs, model_card_data, backend)
300 model_name_or_path = __MODEL_HUB_ORGANIZATION__ + "/" + model_name_or_path
302 if is_sentence_transformer_model(
303 model_name_or_path,
304 token,
(...)
307 local_files_only=local_files_only,
308 ):
--> 309 modules, self.module_kwargs = self._load_sbert_model(
310 model_name_or_path,
311 token=token,
312 cache_folder=cache_folder,
313 revision=revision,
314 trust_remote_code=trust_remote_code,
315 local_files_only=local_files_only,
316 model_kwargs=model_kwargs,
317 tokenizer_kwargs=tokenizer_kwargs,
318 config_kwargs=config_kwargs,
319 )
320 else:
321 modules = self._load_auto_model(
322 model_name_or_path,
323 token=token,
(...)
330 config_kwargs=config_kwargs,
331 )
File /opt/anaconda3/lib/python3.11/site-packages/sentence_transformers/SentenceTransformer.py:1808, in SentenceTransformer._load_sbert_model(self, model_name_or_path, token, cache_folder, revision, trust_remote_code, local_files_only, model_kwargs, tokenizer_kwargs, config_kwargs)
1805 # Try to initialize the module with a lot of kwargs, but only if the module supports them
1806 # Otherwise we fall back to the load method
1807 try:
-> 1808 module = module_class(model_name_or_path, cache_dir=cache_folder, backend=self.backend, **kwargs)
1809 except TypeError:
1810 module = module_class.load(model_name_or_path)
File /data/cache/huggingface/modules/transformers_modules/jina-embeddings-v4/custom_st.py:45, in Transformer.__init__(self, model_name_or_path, max_seq_length, config_args, model_args, tokenizer_args, cache_dir, backend, **kwargs)
40 if self.default_task and self.default_task not in self.config.task_names:
41 raise ValueError(
42 f"Invalid task: {self.default_task}. Must be one of {self.config.task_names}."
43 )
---> 45 self.model = AutoModel.from_pretrained(
46 model_name_or_path, config=self.config, cache_dir=cache_dir, **model_kwargs
47 )
48 self.processor = AutoProcessor.from_pretrained(
49 model_name_or_path,
50 cache_dir=cache_dir,
51 use_fast=True,
52 **tokenizer_kwargs,
53 )
54 self.max_seq_length = max_seq_length or 8192
File /opt/anaconda3/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py:564, in _BaseAutoModelClass.from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
562 cls.register(config.__class__, model_class, exist_ok=True)
563 model_class = add_generation_mixin_to_remote_model(model_class)
--> 564 return model_class.from_pretrained(
565 pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs
566 )
567 elif type(config) in cls._model_mapping.keys():
568 model_class = _get_model_class(config, cls._model_mapping)
File /data/cache/huggingface/modules/transformers_modules/jina-embeddings-v4/modeling_jina_embeddings_v4.py:565, in JinaEmbeddingsV4Model.from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs)
562 if not is_flash_attn_2_available():
563 kwargs["attn_implementation"] = "sdpa"
--> 565 base_model = super().from_pretrained(
566 pretrained_model_name_or_path, *args, **kwargs
567 )
569 # Configure adapter directory
570 if os.path.isdir(base_model.name_or_path):
File /opt/anaconda3/lib/python3.11/site-packages/transformers/modeling_utils.py:309, in restore_default_torch_dtype.<locals>._wrapper(*args, **kwargs)
307 old_dtype = torch.get_default_dtype()
308 try:
--> 309 return func(*args, **kwargs)
310 finally:
311 torch.set_default_dtype(old_dtype)
File /opt/anaconda3/lib/python3.11/site-packages/transformers/modeling_utils.py:4508, in PreTrainedModel.from_pretrained(cls, pretrained_model_name_or_path, config, cache_dir, ignore_mismatched_sizes, force_download, local_files_only, token, revision, use_safetensors, weights_only, *model_args, **kwargs)
4499 config = cls._autoset_attn_implementation(
4500 config,
4501 use_flash_attention_2=use_flash_attention_2,
4502 torch_dtype=torch_dtype,
4503 device_map=device_map,
4504 )
4506 with ContextManagers(model_init_context):
4507 # Let's make sure we don't run the init function of buffer modules
-> 4508 model = cls(config, *model_args, **model_kwargs)
4510 # Make sure to tie the weights correctly
4511 model.tie_weights()
File /data/cache/huggingface/modules/transformers_modules/jina-embeddings-v4/modeling_jina_embeddings_v4.py:145, in JinaEmbeddingsV4Model.__init__(self, config)
143 self._init_projection_layer(config)
144 self.post_init()
--> 145 self.processor = JinaEmbeddingsV4Processor.from_pretrained(
146 self.name_or_path, trust_remote_code=True, use_fast=True
147 )
148 self.multi_vector_projector_dim = config.multi_vector_projector_dim
149 self._task = None
File /opt/anaconda3/lib/python3.11/site-packages/transformers/processing_utils.py:1185, in ProcessorMixin.from_pretrained(cls, pretrained_model_name_or_path, cache_dir, force_download, local_files_only, token, revision, **kwargs)
1182 if token is not None:
1183 kwargs["token"] = token
-> 1185 args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
1186 processor_dict, kwargs = cls.get_processor_dict(pretrained_model_name_or_path, **kwargs)
1187 return cls.from_args_and_dict(args, processor_dict, **kwargs)
File /opt/anaconda3/lib/python3.11/site-packages/transformers/processing_utils.py:1248, in ProcessorMixin._get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs)
1245 else:
1246 attribute_class = cls.get_possibly_dynamic_module(class_name)
-> 1248 args.append(attribute_class.from_pretrained(pretrained_model_name_or_path, **kwargs))
1249 return args
File /opt/anaconda3/lib/python3.11/site-packages/transformers/tokenization_utils_base.py:2025, in PreTrainedTokenizerBase.from_pretrained(cls, pretrained_model_name_or_path, cache_dir, force_download, local_files_only, token, revision, trust_remote_code, *init_inputs, **kwargs)
2022 else:
2023 logger.info(f"loading file {file_path} from cache at {resolved_vocab_files[file_id]}")
-> 2025 return cls._from_pretrained(
2026 resolved_vocab_files,
2027 pretrained_model_name_or_path,
2028 init_configuration,
2029 *init_inputs,
2030 token=token,
2031 cache_dir=cache_dir,
2032 local_files_only=local_files_only,
2033 _commit_hash=commit_hash,
2034 _is_local=is_local,
2035 trust_remote_code=trust_remote_code,
2036 **kwargs,
2037 )
File /opt/anaconda3/lib/python3.11/site-packages/transformers/tokenization_utils_base.py:2278, in PreTrainedTokenizerBase._from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, token, cache_dir, local_files_only, _commit_hash, _is_local, trust_remote_code, *init_inputs, **kwargs)
2276 # Instantiate the tokenizer.
2277 try:
-> 2278 tokenizer = cls(*init_inputs, **init_kwargs)
2279 except import_protobuf_decode_error():
2280 logger.info(
2281 "Unable to load tokenizer model from SPM, loading from TikToken will be attempted instead."
2282 "(Google protobuf error: Tried to load SPM model with non-SPM vocab file).",
2283 )
File /opt/anaconda3/lib/python3.11/site-packages/transformers/models/qwen2/tokenization_qwen2_fast.py:120, in Qwen2TokenizerFast.__init__(self, vocab_file, merges_file, tokenizer_file, unk_token, bos_token, eos_token, pad_token, **kwargs)
109 unk_token = (
110 AddedToken(unk_token, lstrip=False, rstrip=False, special=True, normalized=False)
111 if isinstance(unk_token, str)
112 else unk_token
113 )
114 pad_token = (
115 AddedToken(pad_token, lstrip=False, rstrip=False, special=True, normalized=False)
116 if isinstance(pad_token, str)
117 else pad_token
118 )
--> 120 super().__init__(
121 vocab_file=vocab_file,
122 merges_file=merges_file,
123 tokenizer_file=tokenizer_file,
124 unk_token=unk_token,
125 bos_token=bos_token,
126 eos_token=eos_token,
127 pad_token=pad_token,
128 **kwargs,
129 )
File /opt/anaconda3/lib/python3.11/site-packages/transformers/tokenization_utils_fast.py:117, in PreTrainedTokenizerFast.__init__(self, *args, **kwargs)
114 fast_tokenizer = copy.deepcopy(tokenizer_object)
115 elif fast_tokenizer_file is not None and not from_slow:
116 # We have a serialization from tokenizers which let us directly build the backend
--> 117 fast_tokenizer = TokenizerFast.from_file(fast_tokenizer_file)
118 elif slow_tokenizer:
119 # We need to convert a slow tokenizer to build the backend
120 fast_tokenizer = convert_slow_tokenizer(slow_tokenizer)
Exception: expected value at line 1 column 1
This error seems to be related to Flash Attention 2. However, jina-embeddings-v3 loads fine, and the same problem also happens when loading the model directly with transformers. Looking forward to your reply.
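For reference, the failure should be reproducible without sentence-transformers at all; a minimal sketch, assuming the same local model path as in the script above:

# Minimal reproduction without sentence-transformers, using the same
# local path as above (adjust if your checkout lives elsewhere).
from transformers import AutoModel

model = AutoModel.from_pretrained(
    "/data/models/jina-embeddings-v4",
    trust_remote_code=True,
)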
Hi @dophys, my guess is that you haven't pulled the tokenizer.json file from git lfs. When the tokenizer tries to load it, it throws the Exception: expected value at line 1 column 1 error. Can you try running git lfs pull --include tokenizer.json inside the model repo? I think this should fix your issue.
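If you want to verify this quickly: an un-pulled LFS file is a small text pointer rather than real JSON, which is exactly why the JSON parser fails at line 1, column 1. A minimal check, assuming the model path from the original script:

# Check whether tokenizer.json is real JSON or a git-lfs pointer stub.
from pathlib import Path

head = Path("/data/models/jina-embeddings-v4/tokenizer.json").read_text(
    encoding="utf-8", errors="ignore"
)[:80]
if head.lstrip().startswith("version https://git-lfs"):
    print("tokenizer.json is an LFS pointer stub - run `git lfs pull` in the repo")
else:
    print("tokenizer.json looks like real JSON; it starts with:", head[:40])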
The issue was that tokenizer.json wasn't automatically pulled when cloning the model. I just fixed this, so next time you clone, you won't have to pull it yourself.
Thank you, that works. However, a new problem appeared: You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with model.to('cuda'). It seems the model is not being placed on the GPU, and model.to("cuda") does not work:
from sentence_transformers import SentenceTransformer

# 1. Load a pretrained Sentence Transformer model
model = SentenceTransformer("/data/models/jina-embeddings-v4", trust_remote_code=True)
model.to("cuda")

# The sentences to encode
sentences = [
    "The weather is lovely today.",
    "It's so sunny outside!",
    "He drove to the stadium.",
]

# 2. Calculate embeddings by calling model.encode()
embeddings = model.encode(sentences, task="retrieval", prompt_name="query")
print(embeddings.shape)
# [3, 384]

# 3. Calculate the embedding similarities
similarities = model.similarity(embeddings, embeddings)
print(similarities)
# tensor([[1.0000, 0.6660, 0.1046],
#         [0.6660, 1.0000, 0.1411],
#         [0.1046, 0.1411, 1.0000]])
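One thing that may help, assuming the warning is emitted inside from_pretrained before model.to("cuda") ever runs, is to pass the device at construction time so the weights are placed on the GPU during loading. A sketch, not verified against this exact checkpoint:

import torch
from sentence_transformers import SentenceTransformer

# Load directly onto the GPU so Flash Attention 2 never sees
# CPU-initialized weights; `device` is a standard SentenceTransformer arg.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer(
    "/data/models/jina-embeddings-v4",
    trust_remote_code=True,
    device=device,
)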
Wait, I'm not sure about this problem yet; please set it aside for now.
I seem to be encountering a GPU-related error even though I never installed Flash Attention 2: it is being turned on automatically.
ImportError Traceback (most recent call last)
Cell In[11], line 4
1 from sentence_transformers import SentenceTransformer
3 # 1. Load a pretrained Sentence Transformer model
----> 4 model = SentenceTransformer(str(model_embedding_path), trust_remote_code=True)
6 # The sentences to encode
7 sentences = [
8 "The weather is lovely today.",
9 "It's so sunny outside!",
10 "He drove to the stadium.",
11 ]
File c:\Users\colorverse\miniconda3\envs\llm_qdrant\Lib\site-packages\sentence_transformers\SentenceTransformer.py:309, in SentenceTransformer.__init__(self, model_name_or_path, modules, device, prompts, default_prompt_name, similarity_fn_name, cache_folder, trust_remote_code, revision, local_files_only, token, use_auth_token, truncate_dim, model_kwargs, tokenizer_kwargs, config_kwargs, model_card_data, backend)
300 model_name_or_path = __MODEL_HUB_ORGANIZATION__ + "/" + model_name_or_path
302 if is_sentence_transformer_model(
303 model_name_or_path,
304 token,
(...)
307 local_files_only=local_files_only,
308 ):
--> 309 modules, self.module_kwargs = self._load_sbert_model(
310 model_name_or_path,
311 token=token,
312 cache_folder=cache_folder,
313 revision=revision,
314 trust_remote_code=trust_remote_code,
315 local_files_only=local_files_only,
316 model_kwargs=model_kwargs,
317 tokenizer_kwargs=tokenizer_kwargs,
318 config_kwargs=config_kwargs,
319 )
320 else:
321 modules = self._load_auto_model(
322 model_name_or_path,
323 token=token,
(...)
330 config_kwargs=config_kwargs,
331 )
File c:\Users\colorverse\miniconda3\envs\llm_qdrant\Lib\site-packages\sentence_transformers\SentenceTransformer.py:1808, in SentenceTransformer._load_sbert_model(self, model_name_or_path, token, cache_folder, revision, trust_remote_code, local_files_only, model_kwargs, tokenizer_kwargs, config_kwargs)
1805 # Try to initialize the module with a lot of kwargs, but only if the module supports them
1806 # Otherwise we fall back to the load method
1807 try:
-> 1808 module = module_class(model_name_or_path, cache_dir=cache_folder, backend=self.backend, **kwargs)
1809 except TypeError:
1810 module = module_class.load(model_name_or_path)
File ~\.cache\huggingface\modules\transformers_modules\jina-embeddings-v4\custom_st.py:45, in Transformer.__init__(self, model_name_or_path, max_seq_length, config_args, model_args, tokenizer_args, cache_dir, backend, **kwargs)
40 if self.default_task and self.default_task not in self.config.task_names:
41 raise ValueError(
42 f"Invalid task: {self.default_task}. Must be one of {self.config.task_names}."
43 )
---> 45 self.model = AutoModel.from_pretrained(
46 model_name_or_path, config=self.config, cache_dir=cache_dir, **model_kwargs
47 )
48 self.processor = AutoProcessor.from_pretrained(
49 model_name_or_path,
50 cache_dir=cache_dir,
51 use_fast=True,
52 **tokenizer_kwargs,
53 )
54 self.max_seq_length = max_seq_length or 8192
File c:\Users\colorverse\miniconda3\envs\llm_qdrant\Lib\site-packages\transformers\models\auto\auto_factory.py:593, in _BaseAutoModelClass.from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
591 model_class.register_for_auto_class(auto_class=cls)
592 model_class = add_generation_mixin_to_remote_model(model_class)
--> 593 return model_class.from_pretrained(
594 pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs
595 )
596 elif type(config) in cls._model_mapping.keys():
597 model_class = _get_model_class(config, cls._model_mapping)
File ~\.cache\huggingface\modules\transformers_modules\jina-embeddings-v4\modeling_jina_embeddings_v4.py:565, in JinaEmbeddingsV4Model.from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs)
562 if not is_flash_attn_2_available():
563 kwargs["attn_implementation"] = "sdpa"
--> 565 base_model = super().from_pretrained(
566 pretrained_model_name_or_path, *args, **kwargs
567 )
569 # Configure adapter directory
570 if os.path.isdir(base_model.name_or_path):
File c:\Users\colorverse\miniconda3\envs\llm_qdrant\Lib\site-packages\transformers\modeling_utils.py:311, in restore_default_torch_dtype.<locals>._wrapper(*args, **kwargs)
309 old_dtype = torch.get_default_dtype()
310 try:
--> 311 return func(*args, **kwargs)
312 finally:
313 torch.set_default_dtype(old_dtype)
File c:\Users\colorverse\miniconda3\envs\llm_qdrant\Lib\site-packages\transformers\modeling_utils.py:4760, in PreTrainedModel.from_pretrained(cls, pretrained_model_name_or_path, config, cache_dir, ignore_mismatched_sizes, force_download, local_files_only, token, revision, use_safetensors, weights_only, *model_args, **kwargs)
4752 config = cls._autoset_attn_implementation(
4753 config,
4754 torch_dtype=torch_dtype,
4755 device_map=device_map,
4756 )
4758 with ContextManagers(model_init_context):
4759 # Let's make sure we don't run the init function of buffer modules
-> 4760 model = cls(config, *model_args, **model_kwargs)
4762 # Make sure to tie the weights correctly
4763 model.tie_weights()
File ~\.cache\huggingface\modules\transformers_modules\jina-embeddings-v4\modeling_jina_embeddings_v4.py:142, in JinaEmbeddingsV4Model.__init__(self, config)
141 def __init__(self, config: JinaEmbeddingsV4Config):
--> 142 Qwen2_5_VLForConditionalGeneration.__init__(self, config)
143 self._init_projection_layer(config)
144 self.post_init()
File ~\.cache\huggingface\modules\transformers_modules\jina-embeddings-v4\qwen2_5_vl.py:2121, in Qwen2_5_VLForConditionalGeneration.__init__(self, config)
2119 def __init__(self, config):
2120 super().__init__(config)
-> 2121 self.model = Qwen2_5_VLModel(config)
2122 self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
2124 self.post_init()
File ~\.cache\huggingface\modules\transformers_modules\jina-embeddings-v4\qwen2_5_vl.py:1709, in Qwen2_5_VLModel.__init__(self, config)
1707 super().__init__(config)
1708 self.visual = Qwen2_5_VisionTransformerPretrainedModel._from_config(config.vision_config)
-> 1709 self.language_model = Qwen2_5_VLTextModel._from_config(config.text_config)
1710 self.rope_deltas = None # cache rope_deltas here
1712 # Initialize weights and apply final processing
File c:\Users\colorverse\miniconda3\envs\llm_qdrant\Lib\site-packages\transformers\modeling_utils.py:311, in restore_default_torch_dtype.<locals>._wrapper(*args, **kwargs)
309 old_dtype = torch.get_default_dtype()
310 try:
--> 311 return func(*args, **kwargs)
312 finally:
313 torch.set_default_dtype(old_dtype)
File c:\Users\colorverse\miniconda3\envs\llm_qdrant\Lib\site-packages\transformers\modeling_utils.py:2191, in PreTrainedModel._from_config(cls, config, **kwargs)
2189 config._attn_implementation = kwargs.pop("attn_implementation", attn_implementation)
2190 if not getattr(config, "_attn_implementation_autoset", False):
-> 2191 config = cls._autoset_attn_implementation(
2192 config,
2193 check_device_map=False,
2194 torch_dtype=torch_dtype,
2195 )
2197 if is_deepspeed_zero3_enabled() and not _is_quantized and not _is_ds_init_called:
2198 logger.info("Detected DeepSpeed ZeRO-3: activating zero.init() for this model")
File c:\Users\colorverse\miniconda3\envs\llm_qdrant\Lib\site-packages\transformers\modeling_utils.py:2315, in PreTrainedModel._autoset_attn_implementation(cls, config, torch_dtype, device_map, check_device_map)
2307 cls._check_and_enable_flash_attn_3(
2308 config,
2309 torch_dtype=torch_dtype,
(...) 2312 check_device_map=check_device_map,
2313 )
2314 elif config._attn_implementation == "flash_attention_2":
-> 2315 cls._check_and_enable_flash_attn_2(
2316 config,
2317 torch_dtype=torch_dtype,
2318 device_map=device_map,
2319 hard_check_only=False,
2320 check_device_map=check_device_map,
2321 )
2322 elif requested_attn_implementation == "flex_attention":
2323 config = cls._check_and_enable_flex_attn(config, hard_check_only=True)
File c:\Users\colorverse\miniconda3\envs\llm_qdrant\Lib\site-packages\transformers\modeling_utils.py:2457, in PreTrainedModel._check_and_enable_flash_attn_2(cls, config, torch_dtype, device_map, check_device_map, hard_check_only)
2455 return config
2456 else:
-> 2457 raise ImportError(f"{preface} the package flash_attn seems to be not installed. {install_message}")
2459 flash_attention_version = version.parse(importlib.metadata.version("flash_attn"))
2460 if torch.version.cuda:
ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.
I got the same error: ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.
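In case it helps anyone hitting this: a possible workaround, assuming model_kwargs is forwarded to AutoModel.from_pretrained by the model's custom_st.py (as the traceback above suggests), is to choose the attention backend explicitly instead of letting the checkpoint's config request flash_attention_2:

import torch
from transformers.utils import is_flash_attn_2_available
from sentence_transformers import SentenceTransformer

# Request flash_attention_2 only when the flash_attn package is importable
# and a CUDA device exists; otherwise fall back to PyTorch's SDPA kernels.
attn = (
    "flash_attention_2"
    if is_flash_attn_2_available() and torch.cuda.is_available()
    else "sdpa"
)
model = SentenceTransformer(
    "/data/models/jina-embeddings-v4",  # or your local path
    trust_remote_code=True,
    device="cuda" if torch.cuda.is_available() else "cpu",
    model_kwargs={"attn_implementation": attn},
)

Whether the nested text sub-model picks this up depends on the remote code, so treat it as a starting point rather than a guaranteed fix.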