Fix zipnn patch
README.md CHANGED
@@ -20,9 +20,9 @@ pip install zipnn
 
 Then simply add at the beginning of the file
 ```python
-from zipnn import
+from zipnn import zipnn_hf
 
-
+zipnn_hf()
 ```
 And continue as usual. The patch will take care of decompressing the model correctly and safely.
 
@@ -64,9 +64,9 @@ You can run the model not using the optimized Mamba kernels, but it is **not** r
 
 ```python
 from transformers import AutoModelForCausalLM, AutoTokenizer
-from zipnn import
+from zipnn import zipnn_hf
 
-
+zipnn_hf()
 
 model = AutoModelForCausalLM.from_pretrained("royleibov/Jamba-v0.1-ZipNN-Compressed")
 tokenizer = AutoTokenizer.from_pretrained("royleibov/Jamba-v0.1-ZipNN-Compressed")
@@ -89,9 +89,9 @@ Please note that if you're using `transformers<4.40.0`, `trust_remote_code=True`
 ```python
 from transformers import AutoModelForCausalLM
 import torch
-from zipnn import
+from zipnn import zipnn_hf
 
-
+zipnn_hf()
 
 model = AutoModelForCausalLM.from_pretrained("royleibov/Jamba-v0.1-ZipNN-Compressed",
                                              torch_dtype=torch.bfloat16) # you can also use torch_dtype=torch.float16
@@ -100,9 +100,9 @@ model = AutoModelForCausalLM.from_pretrained("royleibov/Jamba-v0.1-ZipNN-Compres
 When using half precision, you can enable the [FlashAttention2](https://github.com/Dao-AILab/flash-attention) implementation of the Attention blocks. In order to use it, you also need the model on a CUDA device. Since in this precision the model is too big to fit on a single 80GB GPU, you'll also need to parallelize it using [accelerate](https://huggingface.co/docs/accelerate/index):
 ```python
 from transformers import AutoModelForCausalLM
-from zipnn import
+from zipnn import zipnn_hf
 
-
+zipnn_hf()
 
 import torch
 model = AutoModelForCausalLM.from_pretrained("royleibov/Jamba-v0.1-ZipNN-Compressed",
@@ -118,9 +118,9 @@ model = AutoModelForCausalLM.from_pretrained("royleibov/Jamba-v0.1-ZipNN-Compres
 
 ```python
 from transformers import AutoModelForCausalLM, BitsAndBytesConfig
-from zipnn import
+from zipnn import zipnn_hf
 
-
+zipnn_hf()
 
 quantization_config = BitsAndBytesConfig(load_in_8bit=True,
                                          llm_int8_skip_modules=["mamba"])
@@ -140,9 +140,9 @@ from datasets import load_dataset
 from trl import SFTTrainer, SFTConfig
 from peft import LoraConfig
 from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
-from zipnn import
+from zipnn import zipnn_hf
 
-
+zipnn_hf()
 
 tokenizer = AutoTokenizer.from_pretrained("royleibov/Jamba-v0.1-ZipNN-Compressed")
 model = AutoModelForCausalLM.from_pretrained("royleibov/Jamba-v0.1-ZipNN-Compressed",
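
For reference, here is a minimal end-to-end sketch of how the patched snippet is meant to be used after this fix. The `zipnn_hf` import, the `zipnn_hf()` call, and the model ID are taken from the diff above; the prompt and generation parameters are illustrative assumptions, not part of the README.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from zipnn import zipnn_hf

# Apply the ZipNN patch before any from_pretrained call, as the README instructs;
# it takes care of decompressing the ZipNN-compressed weights during loading.
zipnn_hf()

model = AutoModelForCausalLM.from_pretrained("royleibov/Jamba-v0.1-ZipNN-Compressed")
tokenizer = AutoTokenizer.from_pretrained("royleibov/Jamba-v0.1-ZipNN-Compressed")

# Illustrative generation call; prompt and max_new_tokens are arbitrary.
inputs = tokenizer("Hello, my name is", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```

The key point of the fix is that `zipnn_hf()` is called once, at the top of the file, before the model is loaded; everything after that stays as in the original snippets.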