Unable to use flash_attention_2
#7
by
nam-withpi
- opened
Here is my code:
"""Run masked-LM inference with EuroBERT-610m using the flash_attention_2 backend."""
import os

import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer

from config import HF_TOKEN

# Export the token before any hub call so gated downloads authenticate.
os.environ["HF_TOKEN"] = HF_TOKEN

model_id = "EuroBERT/EuroBERT-610m"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Select flash attention via the public `attn_implementation` kwarg rather than
# mutating the private `config._attn_implementation` attribute; this is the
# supported API and also removes the redundant string-valued
# `config.torch_dtype = "bfloat16"` (the dtype is already given below).
model = AutoModelForMaskedLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
# Flash-attention kernels run only on GPU; move weights after loading on CPU.
model.to("cuda")
print(model.config)

text = "The capital of France is <|mask|>."
# BatchEncoding.to() returns the (in-place moved) encoding; reassigning makes
# the device transfer explicit instead of relying on the in-place side effect.
inputs = tokenizer(text, return_tensors="pt").to("cuda")
outputs = model(**inputs)
Here is the error:
tokenizer_config.json: 100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 55.4k/55.4k [00:00<00:00, 152MB/s]
tokenizer.json: 100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 17.2M/17.2M [00:00<00:00, 39.4MB/s]
special_tokens_map.json: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 590/590 [00:00<00:00, 1.94MB/s]
config.json: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 1.24k/1.24k [00:00<00:00, 13.5MB/s]
configuration_eurobert.py: 100%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 12.1k/12.1k [00:00<00:00, 44.2MB/s]
A new version of the following files was downloaded from https://huggingface.co/EuroBERT/EuroBERT-610m:
- configuration_eurobert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
EuroBertConfig {
"architectures": [
"EuroBertForMaskedLM"
],
"attention_bias": false,
"attention_dropout": 0.0,
"auto_map": {
"AutoConfig": "EuroBERT/EuroBERT-610m--configuration_eurobert.EuroBertConfig",
"AutoModel": "EuroBERT/EuroBERT-610m--modeling_eurobert.EuroBertModel",
"AutoModelForMaskedLM": "EuroBERT/EuroBERT-610m--modeling_eurobert.EuroBertForMaskedLM",
"AutoModelForPreTraining": "EuroBERT/EuroBERT-610m--modeling_eurobert.EuroBertPreTrainedModel",
"AutoModelForSequenceClassification": "EuroBERT/EuroBERT-610m--modeling_eurobert.EuroBertForSequenceClassification"
},
"bos_token": "<|begin_of_text|>",
"bos_token_id": 128000,
"clf_pooling": "late",
"eos_token": "<|end_of_text|>",
"eos_token_id": 128001,
"head_dim": 64,
"hidden_act": "silu",
"hidden_dropout": 0.0,
"hidden_size": 1152,
"initializer_range": 0.02,
"intermediate_size": 4096,
"mask_token": "<|mask|>",
"mask_token_id": 128002,
"max_position_embeddings": 8192,
"mlp_bias": false,
"model_type": "eurobert",
"num_attention_heads": 18,
"num_hidden_layers": 26,
"num_key_value_heads": 6,
"pad_token": "<|end_of_text|>",
"pad_token_id": 128001,
"pretraining_tp": 1,
"rms_norm_eps": 1e-05,
"rope_scaling": null,
"rope_theta": 250000,
"tie_word_embeddings": false,
"torch_dtype": "bfloat16",
"transformers_version": "4.50.0",
"use_cache": false,
"vocab_size": 128256
}
modeling_eurobert.py: 100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 40.3k/40.3k [00:00<00:00, 68.1MB/s]
A new version of the following files was downloaded from https://huggingface.co/EuroBERT/EuroBERT-610m:
- modeling_eurobert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
model.safetensors: 100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 3.02G/3.02G [00:19<00:00, 157MB/s]
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
EuroBertConfig {
"_attn_implementation_autoset": true,
"architectures": [
"EuroBertForMaskedLM"
],
"attention_bias": false,
"attention_dropout": 0.0,
"auto_map": {
"AutoConfig": "EuroBERT/EuroBERT-610m--configuration_eurobert.EuroBertConfig",
"AutoModel": "EuroBERT/EuroBERT-610m--modeling_eurobert.EuroBertModel",
"AutoModelForMaskedLM": "EuroBERT/EuroBERT-610m--modeling_eurobert.EuroBertForMaskedLM",
"AutoModelForPreTraining": "EuroBERT/EuroBERT-610m--modeling_eurobert.EuroBertPreTrainedModel",
"AutoModelForSequenceClassification": "EuroBERT/EuroBERT-610m--modeling_eurobert.EuroBertForSequenceClassification"
},
"bos_token": "<|begin_of_text|>",
"bos_token_id": 128000,
"clf_pooling": "late",
"eos_token": "<|end_of_text|>",
"eos_token_id": 128001,
"head_dim": 64,
"hidden_act": "silu",
"hidden_dropout": 0.0,
"hidden_size": 1152,
"initializer_range": 0.02,
"intermediate_size": 4096,
"mask_token": "<|mask|>",
"mask_token_id": 128002,
"max_position_embeddings": 8192,
"mlp_bias": false,
"model_type": "eurobert",
"num_attention_heads": 18,
"num_hidden_layers": 26,
"num_key_value_heads": 6,
"pad_token": "<|end_of_text|>",
"pad_token_id": 128001,
"pretraining_tp": 1,
"rms_norm_eps": 1e-05,
"rope_scaling": null,
"rope_theta": 250000,
"tie_word_embeddings": false,
"torch_dtype": "bfloat16",
"transformers_version": "4.50.0",
"use_cache": false,
"vocab_size": 128256
}
Traceback (most recent call last):
File "/workspace/test.py", line 29, in <module>
outputs = model(**inputs)
^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1562, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/.cache/huggingface/modules/transformers_modules/EuroBERT/EuroBERT-610m/2474bc87e3b68b052a53be523f91330fb9790d87/modeling_eurobert.py", line 740, in forward
encoder_output = self.model(
^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1562, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/.cache/huggingface/modules/transformers_modules/EuroBERT/EuroBERT-610m/2474bc87e3b68b052a53be523f91330fb9790d87/modeling_eurobert.py", line 559, in forward
layer_outputs = encoder_layer(
^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1562, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/.cache/huggingface/modules/transformers_modules/EuroBERT/EuroBERT-610m/2474bc87e3b68b052a53be523f91330fb9790d87/modeling_eurobert.py", line 361, in forward
hidden_states, self_attn_weights = self.self_attn(
^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1562, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/.cache/huggingface/modules/transformers_modules/EuroBERT/EuroBERT-610m/2474bc87e3b68b052a53be523f91330fb9790d87/modeling_eurobert.py", line 192, in forward
attn_output, attn_weights = attention_interface(
^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/transformers/integrations/flash_attention.py", line 50, in flash_attention_forward
attn_output = _flash_attention_forward(
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/transformers/modeling_flash_attention_utils.py", line 310, in _flash_attention_forward
attn_output_unpad = flash_attn_varlen_func(
^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/flash_attn/flash_attn_interface.py", line 1448, in flash_attn_varlen_func
return FlashAttnVarlenFunc.apply(
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/torch/autograd/function.py", line 574, in apply
return super().apply(*args, **kwargs) # type: ignore[misc]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/flash_attn/flash_attn_interface.py", line 930, in forward
out_padded, softmax_lse, S_dmask, rng_state = _wrapped_flash_attn_varlen_forward(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/torch/_ops.py", line 1061, in __call__
return self_._op(*args, **(kwargs or {}))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/torch/_library/autograd.py", line 98, in autograd_impl
result = Generated.apply(*args, Metadata(keyset, keyword_only_args)) # type: ignore[attr-defined]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/torch/autograd/function.py", line 574, in apply
return super().apply(*args, **kwargs) # type: ignore[misc]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/torch/_library/autograd.py", line 40, in forward
result = op.redispatch(keyset & _C._after_autograd_keyset, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/torch/_ops.py", line 672, in redispatch
return self_._handle.redispatch_boxed(keyset, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/torch/_library/custom_ops.py", line 236, in backend_impl
result = self._backend_fns[device_type](*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/flash_attn/flash_attn_interface.py", line 170, in _flash_attn_varlen_forward
out, softmax_lse, S_dmask, rng_state = flash_attn_gpu.varlen_fwd(
^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: cu_seqlens_q must have shape (batch_size + 1)
Could someone please take a look into this?
Hi
@nam-withpi
,
We're currently working on fixing our flash attention implementation; it should be done shortly!
Sorry for the inconvenience.
Hey @nam-withpi , I just fixed the flash attention implementation. Feel free to give it a try. You might need to clear your Hugging Face cache to re-download the model. Thanks for raising this issue!
Nicolas-BZRD
changed discussion status to
closed