Initial Upload

Browse files

Files changed (9) hide show

README.md +85 -0
config.json +55 -0
generation_config.json +7 -0
model.safetensors +3 -0
quantization_config.json +24 -0
special_tokens_map.json +23 -0
tokenizer.json +0 -0
tokenizer.model +3 -0
tokenizer_config.json +43 -0

README.md ADDED Viewed

	@@ -0,0 +1,85 @@

+---
+tags:
+- pytorch
+- causal-lm
+- OpenLLaMA
+- autoround
+- auto-round
+- intel-autoround
+- gptq
+- woq
+- intel
+- pytorch
+- openlm-research
+license: apache-2.0
+datasets:
+- tiiuae/falcon-refinedweb
+- bigcode/starcoderdata
+- togethercomputer/RedPajama-Data-1T
+model_name: OpenLLaMA 3B v2
+base_model:
+- openlm-research/open_llama_3b_v2
+inference: false
+model_creator: openlm-research
+pipeline_tag: text-generation
+prompt_template: '{prompt}
+  '
+quantized_by: fbaldassarri
+---
+## Model Information
+Quantized version of [openlm-research/open_llama_3b_v2](https://huggingface.co/openlm-research/open_llama_3b_v2) using torch.float32 for quantization tuning.
+- 8 bits (INT8)
+- group size = 128
+- Symmetrical Quantization
+- Method WoQ (AutoRound format)
+Fast and low memory usage, with a 2-3x speedup (slight accuracy drop at W8G128).
+Quantization framework: [Intel AutoRound](https://github.com/intel/auto-round) v0.4.6
+Note: this INT8 version of open_llama_3b_v2 has been quantized to run inference on CPU.
+## Replication Recipe
+### Step 1 Install Requirements
+I suggest installing the requirements into a dedicated Python virtualenv or a conda environment.
+```
+wget https://github.com/intel/auto-round/archive/refs/tags/v0.4.6.tar.gz
+tar -xvzf v0.4.6.tar.gz
+cd auto-round-0.4.6
+pip install -r requirements-cpu.txt --upgrade
+```
+### Step 2 Build Intel AutoRound wheel from sources
+```
+pip install -vvv --no-build-isolation -e .[cpu]
+```
+### Step 3 Script for Quantization
+```
+  from transformers import AutoModelForCausalLM, AutoTokenizer
+  model_name = "openlm-research/open_llama_3b_v2"
+  model = AutoModelForCausalLM.from_pretrained(model_name)
+  tokenizer = AutoTokenizer.from_pretrained(model_name)
+  from auto_round import AutoRound
+  bits, group_size, sym, device, amp = 8, 128, True, 'cpu', False
+  autoround = AutoRound(model, tokenizer, nsamples=128, iters=200, seqlen=512, batch_size=4, bits=bits, group_size=group_size, sym=sym, device=device, amp=amp)
+  autoround.quantize()
+  output_dir = "./AutoRound/openlm-research_open_llama_3b_v2-autoround-int8-gs128-sym"
+  autoround.save_quantized(output_dir, format='auto_round', inplace=True)
+```
+## License
+[Apache 2.0 License](https://choosealicense.com/licenses/apache-2.0/)
+## Disclaimer
+This quantized model comes with no warranty. It has been developed only for research purposes.

config.json ADDED Viewed

	@@ -0,0 +1,55 @@

+{
+  "_name_or_path": "openlm-research/open_llama_3b_v2",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "head_dim": 100,
+  "hidden_act": "silu",
+  "hidden_size": 3200,
+  "initializer_range": 0.02,
+  "intermediate_size": 8640,
+  "max_position_embeddings": 2048,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 26,
+  "num_key_value_heads": 32,
+  "pad_token_id": 0,
+  "pretraining_tp": 1,
+  "quantization_config": {
+    "amp": false,
+    "autoround_version": "0.4.6",
+    "backend": "auto_round:gptq:exllamav2",
+    "batch_size": 4,
+    "bits": 8,
+    "data_type": "int",
+    "dataset": "NeelNanda/pile-10k",
+    "enable_minmax_tuning": true,
+    "enable_norm_bias_tuning": false,
+    "enable_quanted_input": true,
+    "gradient_accumulate_steps": 1,
+    "group_size": 128,
+    "iters": 200,
+    "low_gpu_mem_usage": false,
+    "lr": 0.005,
+    "minmax_lr": 0.005,
+    "nsamples": 128,
+    "quant_method": "intel/auto-round",
+    "scale_dtype": "torch.float16",
+    "seqlen": 512,
+    "sym": true,
+    "to_quant_block_names": null
+  },
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float16",
+  "transformers_version": "4.47.1",
+  "use_cache": true,
+  "vocab_size": 32000
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "pad_token_id": 0,
+  "transformers_version": "4.47.1"
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2f054e8ec99a3e4cf292b1b01b32e4d28f60e4d1d59db9a128830597db637ad7
+size 4119994536

quantization_config.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bits": 8,
+  "group_size": 128,
+  "sym": true,
+  "data_type": "int",
+  "enable_quanted_input": true,
+  "enable_minmax_tuning": true,
+  "seqlen": 512,
+  "batch_size": 4,
+  "scale_dtype": "torch.float16",
+  "lr": 0.005,
+  "minmax_lr": 0.005,
+  "gradient_accumulate_steps": 1,
+  "iters": 200,
+  "amp": false,
+  "nsamples": 128,
+  "low_gpu_mem_usage": false,
+  "to_quant_block_names": null,
+  "enable_norm_bias_tuning": false,
+  "dataset": "NeelNanda/pile-10k",
+  "autoround_version": "0.4.6",
+  "quant_method": "intel/auto-round",
+  "backend": "auto_round:gptq:exllamav2"
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:91b289e85fa20fd375d8b33dc12f77616f18abc6359804471d1fafcb425fecb8
+size 511574

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": true,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "extra_special_tokens": {},
+  "legacy": true,
+  "model_max_length": 2048,
+  "pad_token": null,
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}