Text Generation
Transformers

Error

#2
by rakmik - opened

!pip install hqq==0.1.8
!pip install bitblas
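
Before loading anything, it is worth confirming that the runtime actually exposes a CUDA GPU, since the 2-bit HQQ model and the bitblas backend below both assume one. A minimal sanity-check sketch using standard torch calls:

import torch

# Confirm a CUDA device is visible and identify it; a T4 reports compute capability (7, 5).
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
    print("Compute capability:", torch.cuda.get_device_capability(0))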

import torch
from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer
from hqq.core.quantize import *
from hqq.utils.patching import *
from hqq.utils.generation_hf import HFGenerator

#Load the model
###################################################
model_id = 'mobiuslabsgmbh/Llama-3-8b-instruct_2bitgs64_hqq'
model = HQQModelForCausalLM.from_quantized(model_id, cache_dir='.', compute_dtype=torch.float16, adapter='adapter_v0.1.lora')
tokenizer = AutoTokenizer.from_pretrained(model_id)

patch_linearlayers(model, patch_add_quant_config,
                   BaseQuantizeConfig(nbits=2, group_size=64, quant_scale=False, quant_zero=False, axis=1))

model.eval();
cleanup()

/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning:
The secret HF_TOKEN does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
warnings.warn(
Fetching 9 files: 100% 9/9 [01:43<00:00, 20.94s/it]
.gitattributes: 100% 1.70k/1.70k [00:00<00:00, 52.7kB/s]
README.md: 100% 4.13k/4.13k [00:00<00:00, 120kB/s]
adapter_v0.1.lora: 100% 83.0M/83.0M [00:02<00:00, 41.2MB/s]
llama3-2bit.gif: 100% 25.8M/25.8M [00:00<00:00, 35.6MB/s]
qmodel.pt: 100% 4.28G/4.28G [01:42<00:00, 47.9MB/s]
tokenizer.json: 100% 9.09M/9.09M [00:00<00:00, 28.4MB/s]
config.json: 100% 728/728 [00:00<00:00, 28.7kB/s]
special_tokens_map.json: 100% 296/296 [00:00<00:00, 6.14kB/s]
tokenizer_config.json: 100% 51.0k/51.0k [00:00<00:00, 1.45MB/s]
/usr/local/lib/python3.11/dist-packages/hqq/models/base.py:237: FutureWarning: You are using torch.load with weights_only=False (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for weights_only will be flipped to True. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via torch.serialization.add_safe_globals. We recommend you start setting weights_only=True for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
return torch.load(cls.get_weight_file(save_dir), map_location=map_location)
0%| | 0/32 [00:00<?, ?it/s]

AttributeError                            Traceback (most recent call last)
in <cell line: 0>()
      8 ###################################################
      9 model_id = 'mobiuslabsgmbh/Llama-3-8b-instruct_2bitgs64_hqq'
---> 10 model = HQQModelForCausalLM.from_quantized(model_id, cache_dir='.', compute_dtype=torch.float16, adapter='adapter_v0.1.lora')
     11 tokenizer = AutoTokenizer.from_pretrained(model_id)
     12

4 frames
/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py in __getattr__(self, name)
   1929         if name in modules:
   1930             return modules[name]
-> 1931         raise AttributeError(
   1932             f"'{type(self).__name__}' object has no attribute '{name}'"
   1933         )

AttributeError: 'LlamaAttention' object has no attribute 'rotary_emb'

!pip list

DEPRECATION: Loading egg at /usr/local/lib/python3.11/dist-packages/hqq_aten-0.0.0-py3.11-linux-x86_64.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330
Package Version


absl-py 1.4.0
accelerate 1.2.1
aiohappyeyeballs 2.4.4
aiohttp 3.11.11
aiohttp-cors 0.7.0
aiosignal 1.3.2
airportsdata 20241001
alabaster 1.0.0
albucore 0.0.19
albumentations 1.4.20
ale-py 0.10.1
altair 5.5.0
annotated-types 0.7.0
anyio 3.7.1
argon2-cffi 23.1.0
argon2-cffi-bindings 21.2.0
array_record 0.6.0
arviz 0.20.0
astor 0.8.1
astropy 6.1.7
astropy-iers-data 0.2025.1.27.0.32.44
astunparse 1.6.3
atpublic 4.1.0
attrs 25.1.0
audioread 3.0.1
auto_gptq 0.7.1
autograd 1.7.0
babel 2.16.0
backcall 0.2.0
beautifulsoup4 4.12.3
bigframes 1.34.0
bigquery-magics 0.5.0
bitblas 0.1.0
blake3 1.0.4
bleach 6.2.0
blinker 1.9.0
blis 0.7.11
blosc2 3.0.0
bokeh 3.6.2
Bottleneck 1.4.2
bqplot 0.12.44
branca 0.8.1
CacheControl 0.14.2
cachetools 5.5.1
catalogue 2.0.10
certifi 2024.12.14
cffi 1.17.1
chardet 5.2.0
charset-normalizer 3.4.1
chex 0.1.88
clarabel 0.9.0
click 8.1.8
cloudpathlib 0.20.0
cloudpickle 3.1.1
cmake 3.31.4
cmdstanpy 1.2.5
colorcet 3.1.0
colorful 0.5.6
colorlover 0.3.0
colour 0.1.5
community 1.0.0b1
compressed-tensors 0.9.0
confection 0.1.5
cons 0.4.6
contourpy 1.3.1
cpplint 2.0.0
cramjam 2.9.1
cryptography 43.0.3
cuda-python 12.6.0
cudf-cu12 24.12.0
cufflinks 0.17.3
cupy-cuda12x 13.3.0
cvxopt 1.3.2
cvxpy 1.6.0
cycler 0.12.1
cyipopt 1.5.0
cymem 2.0.11
Cython 3.0.11
dask 2024.10.0
datascience 0.17.6
datasets 3.2.0
db-dtypes 1.4.0
dbus-python 1.2.18
debugpy 1.8.0
decorator 4.4.2
defusedxml 0.7.1
Deprecated 1.2.18
depyf 0.18.0
diffusers 0.32.2
dill 0.3.8
diskcache 5.6.3
distlib 0.3.9
distro 1.9.0
dlib 19.24.2
dm-tree 0.1.8
docker-pycreds 0.4.0
docstring_parser 0.16
docutils 0.21.2
dopamine_rl 4.1.2
dtlib 0.0.0.dev2
duckdb 1.1.3
earthengine-api 1.4.6
easydict 1.13
editdistance 0.8.1
eerepr 0.1.0
einops 0.8.0
en-core-web-sm 3.7.1
entrypoints 0.4
et_xmlfile 2.0.0
etils 1.11.0
etuples 0.3.9
eval_type_backport 0.2.2
execnet 2.1.1
Farama-Notifications 0.0.4
fastai 2.7.18
fastapi 0.115.8
fastcore 1.7.28
fastdownload 0.0.7
fastjsonschema 2.21.1
fastprogress 1.0.3
fastrlock 0.8.3
filelock 3.17.0
firebase-admin 6.6.0
Flask 3.1.0
flatbuffers 25.1.24
flax 0.10.2
folium 0.19.4
fonttools 4.55.7
frozendict 2.4.6
frozenlist 1.5.0
fsspec 2024.9.0
future 1.0.0
gast 0.6.0
gcsfs 2024.10.0
GDAL 3.6.4
gdown 5.2.0
geemap 0.35.1
gekko 1.2.1
gensim 4.3.3
geocoder 1.38.1
geographiclib 2.0
geopandas 1.0.1
geopy 2.4.1
gguf 0.10.0
gin-config 0.5.0
gitdb 4.0.12
GitPython 3.1.44
glob2 0.7
google 2.0.3
google-ai-generativelanguage 0.6.15
google-api-core 2.19.2
google-api-python-client 2.155.0
google-auth 2.27.0
google-auth-httplib2 0.2.0
google-auth-oauthlib 1.2.1
google-cloud-aiplatform 1.74.0
google-cloud-bigquery 3.25.0
google-cloud-bigquery-connection 1.17.0
google-cloud-bigquery-storage 2.27.0
google-cloud-bigtable 2.28.1
google-cloud-core 2.4.1
google-cloud-datastore 2.20.2
google-cloud-firestore 2.19.0
google-cloud-functions 1.19.0
google-cloud-iam 2.17.0
google-cloud-language 2.16.0
google-cloud-pubsub 2.25.0
google-cloud-resource-manager 1.14.0
google-cloud-spanner 3.51.0
google-cloud-storage 2.19.0
google-cloud-translate 3.19.0
google-colab 1.0.0
google-crc32c 1.6.0
google-genai 0.3.0
google-generativeai 0.8.4
google-pasta 0.2.0
google-resumable-media 2.7.2
googleapis-common-protos 1.66.0
googledrivedownloader 0.4
graphviz 0.20.3
greenlet 3.1.1
grpc-google-iam-v1 0.14.0
grpc-interceptor 0.15.4
grpcio 1.70.0
grpcio-status 1.62.3
gspread 6.1.4
gspread-dataframe 4.0.0
gym 0.25.2
gym-notices 0.0.8
gymnasium 1.0.0
h11 0.14.0
h5netcdf 1.5.0
h5py 3.12.1
highspy 1.9.0
holidays 0.65
holoviews 1.20.0
hqq 0.1.8
hqq_aten 0.0.0
hqq_aten 0.0.0
html5lib 1.1
httpcore 1.0.7
httpimport 1.4.0
httplib2 0.22.0
httptools 0.6.4
httpx 0.28.1
huggingface-hub 0.27.1
humanize 4.11.0
hyperopt 0.2.7
ibis-framework 9.2.0
idna 3.10
imageio 2.36.1
imageio-ffmpeg 0.6.0
imagesize 1.4.1
imbalanced-learn 0.13.0
imgaug 0.4.0
immutabledict 4.2.1
importlib_metadata 8.6.1
importlib_resources 6.5.2
imutils 0.5.4
inflect 7.5.0
iniconfig 2.0.0
intel-cmplr-lib-ur 2025.0.4
intel-openmp 2025.0.4
interegular 0.3.3
ipyevents 2.0.2
ipyfilechooser 0.6.0
ipykernel 5.5.6
ipyleaflet 0.19.2
ipyparallel 8.8.0
ipython 7.34.0
ipython-genutils 0.2.0
ipython-sql 0.5.0
ipytree 0.2.2
ipywidgets 7.7.1
itsdangerous 2.2.0
jax 0.4.33
jax-cuda12-pjrt 0.4.33
jax-cuda12-plugin 0.4.33
jaxlib 0.4.33
jeepney 0.7.1
jellyfish 1.1.0
jieba 0.42.1
Jinja2 3.1.5
jiter 0.8.2
joblib 1.4.2
jsonpatch 1.33
jsonpickle 4.0.1
jsonpointer 3.0.0
jsonschema 4.23.0
jsonschema-specifications 2024.10.1
jupyter-client 6.1.12
jupyter-console 6.1.0
jupyter_core 5.7.2
jupyter-leaflet 0.19.2
jupyter-server 1.24.0
jupyterlab_pygments 0.3.0
jupyterlab_widgets 3.0.13
kaggle 1.6.17
kagglehub 0.3.6
keras 3.8.0
keras-hub 0.18.1
keras-nlp 0.18.1
keyring 23.5.0
kiwisolver 1.4.8
langchain 0.3.16
langchain-core 0.3.32
langchain-text-splitters 0.3.5
langcodes 3.5.0
langsmith 0.3.2
language_data 1.3.0
lark 1.2.2
launchpadlib 1.10.16
lazr.restfulclient 0.14.4
lazr.uri 1.0.6
lazy_loader 0.4
libclang 18.1.1
libcudf-cu12 24.12.0
libkvikio-cu12 24.12.1
librosa 0.10.2.post1
lightgbm 4.5.0
linkify-it-py 2.0.3
llvmlite 0.43.0
lm-format-enforcer 0.10.9
locket 1.0.0
logical-unification 0.4.6
lxml 5.3.0
marisa-trie 1.2.1
Markdown 3.7
markdown-it-py 3.0.0
MarkupSafe 3.0.2
matplotlib 3.10.0
matplotlib-inline 0.1.7
matplotlib-venn 1.1.1
mdit-py-plugins 0.4.2
mdurl 0.1.2
miniKanren 1.0.3
missingno 0.5.2
mistral_common 1.5.2
mistune 3.1.1
mizani 0.13.1
mkl 2025.0.1
ml-dtypes 0.4.1
mlxtend 0.23.4
more-itertools 10.5.0
moviepy 1.0.3
mpmath 1.3.0
msgpack 1.1.0
msgspec 0.19.0
multidict 6.1.0
multipledispatch 1.0.0
multiprocess 0.70.16
multitasking 0.0.11
murmurhash 1.0.12
music21 9.3.0
namex 0.0.8
narwhals 1.24.1
natsort 8.4.0
nbclassic 1.2.0
nbclient 0.10.2
nbconvert 7.16.6
nbformat 5.10.4
ndindex 1.9.2
nest-asyncio 1.6.0
networkx 3.4.2
nibabel 5.3.2
nltk 3.9.1
notebook 6.5.5
notebook_shim 0.2.4
numba 0.60.0
numba-cuda 0.0.17.1
numexpr 2.10.2
numpy 1.26.4
nvidia-cublas-cu12 12.4.5.8
nvidia-cuda-cupti-cu12 12.4.127
nvidia-cuda-nvcc-cu12 12.5.82
nvidia-cuda-nvrtc-cu12 12.4.127
nvidia-cuda-runtime-cu12 12.4.127
nvidia-cudnn-cu12 9.1.0.70
nvidia-cufft-cu12 11.2.1.3
nvidia-curand-cu12 10.3.5.147
nvidia-cusolver-cu12 11.6.1.9
nvidia-cusparse-cu12 12.3.1.170
nvidia-ml-py 12.570.86
nvidia-nccl-cu12 2.21.5
nvidia-nvcomp-cu12 4.1.0.6
nvidia-nvjitlink-cu12 12.4.127
nvidia-nvtx-cu12 12.4.127
nvtx 0.2.10
nx-cugraph-cu12 24.12.0
oauth2client 4.1.3
oauthlib 3.2.2
openai 1.59.9
opencensus 0.11.4
opencensus-context 0.1.3
opencv-contrib-python 4.10.0.84
opencv-python 4.10.0.84
opencv-python-headless 4.11.0.86
openpyxl 3.1.5
opentelemetry-api 1.16.0
opentelemetry-sdk 1.16.0
opentelemetry-semantic-conventions 0.37b0
opt_einsum 3.4.0
optax 0.2.4
optree 0.14.0
orbax-checkpoint 0.6.4
orjson 3.10.15
osqp 0.6.7.post3
outlines 0.1.11
outlines_core 0.1.26
packaging 24.2
pandas 2.2.2
pandas-datareader 0.10.0
pandas-gbq 0.26.1
pandas-stubs 2.2.2.240909
pandocfilters 1.5.1
panel 1.6.0
param 2.2.0
parso 0.8.4
parsy 2.1
partd 1.4.2
partial-json-parser 0.2.1.1.post5
pathlib 1.0.1
patsy 1.0.1
peewee 3.17.8
peft 0.14.0
pexpect 4.9.0
pickleshare 0.7.5
pillow 10.4.0
pip 24.1.2
platformdirs 4.3.6
plotly 5.24.1
plotnine 0.14.5
pluggy 1.5.0
ply 3.11
polars 1.9.0
pooch 1.8.2
portpicker 1.5.2
preshed 3.0.9
prettytable 3.13.0
proglog 0.1.10
progressbar2 4.5.0
prometheus_client 0.21.1
prometheus-fastapi-instrumentator 7.0.2
promise 2.3
prompt_toolkit 3.0.50
propcache 0.2.1
prophet 1.1.6
proto-plus 1.26.0
protobuf 4.25.6
psutil 5.9.5
psycopg2 2.9.10
ptyprocess 0.7.0
py-cpuinfo 9.0.0
py-spy 0.4.0
py4j 0.10.9.7
pyarrow 17.0.0
pyasn1 0.6.1
pyasn1_modules 0.4.1
pybind11 2.13.6
pycocotools 2.0.8
pycountry 24.6.1
pycparser 2.22
pydantic 2.10.6
pydantic_core 2.27.2
pydata-google-auth 1.9.1
pydot 3.0.4
pydotplus 2.0.2
PyDrive 1.3.1
PyDrive2 1.21.3
pyerfa 2.0.1.5
pygame 2.6.1
pygit2 1.16.0
Pygments 2.18.0
PyGObject 3.42.1
PyJWT 2.10.1
pylibcudf-cu12 24.12.0
pylibcugraph-cu12 24.12.0
pylibraft-cu12 24.12.0
pymc 5.19.1
pymystem3 0.2.0
pynvjitlink-cu12 0.5.0
pyogrio 0.10.0
Pyomo 6.8.2
PyOpenGL 3.1.9
pyOpenSSL 24.2.1
pyparsing 3.2.1
pyperclip 1.9.0
pyproj 3.7.0
pyshp 2.3.1
PySocks 1.7.1
pyspark 3.5.4
pytensor 2.26.4
pytest 8.3.4
pytest-xdist 3.6.1
python-apt 0.0.0
python-box 7.3.2
python-dateutil 2.8.2
python-dotenv 1.0.1
python-louvain 0.16
python-slugify 8.0.4
python-snappy 0.7.3
python-utils 3.9.1
pytz 2024.2
pyviz_comms 3.0.4
PyYAML 6.0.2
pyzmq 24.0.1
qdldl 0.1.7.post5
RapidFuzz 3.12.1
ratelim 0.1.6
ray 2.42.0
referencing 0.36.2
regex 2024.11.6
requests 2.32.3
requests-oauthlib 1.3.1
requests-toolbelt 1.0.0
requirements-parser 0.9.0
rich 13.9.4
rmm-cu12 24.12.1
rouge 1.0.1
rpds-py 0.22.3
rpy2 3.4.2
rsa 4.9
safetensors 0.5.2
scikit-image 0.25.1
scikit-learn 1.6.1
scipy 1.13.1
scooby 0.10.0
scs 3.2.7.post2
seaborn 0.13.2
SecretStorage 3.3.1
Send2Trash 1.8.3
sentence-transformers 3.3.1
sentencepiece 0.2.0
sentry-sdk 2.20.0
setproctitle 1.3.4
setuptools 75.1.0
shap 0.46.0
shapely 2.0.6
shellingham 1.5.4
simple-parsing 0.1.7
six 1.17.0
sklearn-compat 0.1.3
sklearn-pandas 2.2.0
slicer 0.0.8
smart-open 7.1.0
smmap 5.0.2
sniffio 1.3.1
snowballstemmer 2.2.0
soundfile 0.13.1
soupsieve 2.6
soxr 0.5.0.post1
spacy 3.7.5
spacy-legacy 3.0.12
spacy-loggers 1.0.5
spanner-graph-notebook 1.0.9
Sphinx 8.1.3
sphinxcontrib-applehelp 2.0.0
sphinxcontrib-devhelp 2.0.0
sphinxcontrib-htmlhelp 2.1.0
sphinxcontrib-jsmath 1.0.1
sphinxcontrib-qthelp 2.0.0
sphinxcontrib-serializinghtml 2.0.0
SQLAlchemy 2.0.37
sqlglot 25.6.1
sqlparse 0.5.3
srsly 2.5.1
stanio 0.5.1
starlette 0.45.3
statsmodels 0.14.4
stringzilla 3.11.3
sympy 1.13.1
tables 3.10.2
tabulate 0.9.0
tbb 2022.0.0
tcmlib 1.2.0
tenacity 9.0.0
tensorboard 2.18.0
tensorboard-data-server 0.7.2
tensorflow 2.18.0
tensorflow-datasets 4.9.7
tensorflow-hub 0.16.1
tensorflow-io-gcs-filesystem 0.37.1
tensorflow-metadata 1.16.1
tensorflow-probability 0.24.0
tensorflow-text 2.18.1
tensorstore 0.1.71
termcolor 2.5.0
terminado 0.18.1
text-unidecode 1.3
textblob 0.17.1
tf_keras 2.18.0
tf-slim 1.1.0
thefuzz 0.22.1
thinc 8.2.5
threadpoolctl 3.5.0
tifffile 2025.1.10
tiktoken 0.7.0
timm 1.0.14
tinycss2 1.4.0
tokenizers 0.21.0
toml 0.10.2
toolz 0.12.1
torch 2.5.1+cu124
torchaudio 2.5.1+cu124
torchsummary 1.5.1
torchvision 0.20.1+cu124
tornado 6.4.2
tqdm 4.67.1
traitlets 5.7.1
traittypes 0.2.1
transformers 4.48.2
triton 3.1.0
tweepy 4.14.0
typeguard 4.4.1
typer 0.15.1
types-pytz 2024.2.0.20241221
types-setuptools 75.8.0.20250110
typing_extensions 4.12.2
tzdata 2025.1
tzlocal 5.2
uc-micro-py 1.0.3
umf 0.9.1
uritemplate 4.1.1
urllib3 2.3.0
uvicorn 0.34.0
uvloop 0.21.0
vega-datasets 0.9.0
virtualenv 20.29.1
vllm 0.7.1
wadllib 1.3.6
wandb 0.19.5
wasabi 1.1.3
watchfiles 1.0.4
wcwidth 0.2.13
weasel 0.4.1
webcolors 24.11.1
webencodings 0.5.1
websocket-client 1.8.0
websockets 14.2
Werkzeug 3.1.3
wheel 0.45.1
widgetsnbextension 3.6.10
wordcloud 1.9.4
wrapt 1.17.2
xarray 2025.1.1
xarray-einstats 0.8.0
xformers 0.0.28.post3
xgboost 2.1.3
xgrammar 0.1.11
xlrd 2.0.1
xxhash 3.5.0
xyzservices 2025.1.0
yarl 1.18.3
yellowbrick 1.5
yfinance 0.2.52
zipp 3.21.0
zstandard 0.23.0

Running on a Colab T4.

import torch
from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer
from hqq.core.quantize import *
from hqq.utils.patching import *
from hqq.utils.generation_hf import HFGenerator

#Load the model

###################################################
model_id = 'mobiuslabsgmbh/Llama-3-8b-instruct_2bitgs64_hqq'
model = HQQModelForCausalLM.from_quantized(model_id, cache_dir='.', compute_dtype=torch.float16, adapter='adapter_v0.1.lora')
tokenizer = AutoTokenizer.from_pretrained(model_id)

patch_linearlayers(model, patch_add_quant_config,
                   BaseQuantizeConfig(nbits=2, group_size=64, quant_scale=False, quant_zero=False, axis=1))

model.eval();
cleanup()

#Use optimized inference kernels

###################################################
HQQLinear.set_backend(HQQBackend.PYTORCH)

prepare_for_inference(model) #default backend

prepare_for_inference(model, backend="bitblas", allow_merge=False) #It takes a while...

#Generate

###################################################

#For longer context, make sure to allocate enough cache via the cache_size= parameter (see the sketch after the generate calls below)

gen = HFGenerator(model, tokenizer, max_new_tokens=1000, do_sample=True, compile=None) #Slower generation but no warm-up

gen = HFGenerator(model, tokenizer, max_new_tokens=1000, do_sample=True, compile="partial").warmup() #Faster generation, but warm-up takes a while

gen.generate("Write an essay about large language models", print_tokens=True)
gen.generate("Tell me a funny joke!", print_tokens=True)
gen.generate("How to make a yummy chocolate cake?", print_tokens=True)

/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning:
The secret HF_TOKEN does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
warnings.warn(
Fetching 9 files: 100% 9/9 [00:00<00:00, 311.93it/s]
/usr/local/lib/python3.11/dist-packages/hqq/models/base.py:237: FutureWarning: You are using torch.load with weights_only=False (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for weights_only will be flipped to True. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via torch.serialization.add_safe_globals. We recommend you start setting weights_only=True for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
return torch.load(cls.get_weight_file(save_dir), map_location=map_location)
0%| | 0/32 [00:00<?, ?it/s]

AttributeError                            Traceback (most recent call last)
in <cell line: 0>()
      8 ###################################################
      9 model_id = 'mobiuslabsgmbh/Llama-3-8b-instruct_2bitgs64_hqq'
---> 10 model = HQQModelForCausalLM.from_quantized(model_id, cache_dir='.', compute_dtype=torch.float16, adapter='adapter_v0.1.lora')
     11 tokenizer = AutoTokenizer.from_pretrained(model_id)
     12

4 frames
/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py in __getattr__(self, name)
   1727         if name in modules:
   1728             return modules[name]
-> 1729         raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
   1730
   1731     def __setattr__(self, name: str, value: Union[Tensor, 'Module']) -> None:

AttributeError: 'LlamaAttention' object has no attribute 'rotary_emb'

Mobius Labs GmbH org

Same comment as before: make sure you use an updated version of transformers and a compatible GPU.
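
For reference, a sketch of the kind of check that advice points to, printing the currently installed versions before retrying (no specific transformers version is pinned here, since the thread does not name one):

import torch, transformers
from importlib.metadata import version

# Report the versions relevant to this error, then upgrade transformers and restart the runtime.
print("transformers:", transformers.__version__)
print("torch:", torch.__version__)
print("hqq:", version("hqq"))
# !pip install -U transformers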
