vakodiya committed on
Commit
6d5de73
·
verified ·
1 Parent(s): ac1b2f5

Quantization in Model

Browse files
Files changed (1) hide show
  1. app.py +9 -2
app.py CHANGED
@@ -2,7 +2,7 @@ import os
2
  import streamlit as st
3
  import pickle
4
  import time
5
- from transformers import AutoTokenizer, AutoModelForCausalLM
6
  from langchain.llms.base import LLM
7
  from langchain.chains import RetrievalQAWithSourcesChain
8
  from langchain.text_splitter import RecursiveCharacterTextSplitter
@@ -16,7 +16,14 @@ login(os.getenv('HF_llama3chat8b'))
16
 
17
  class CustomHuggingFaceLLM(LLM):
18
  def __init__(self, model_name, temperature=0.7):
19
- self.model = AutoModelForCausalLM.from_pretrained(model_name)
 
 
 
 
 
 
 
20
  self.tokenizer = AutoTokenizer.from_pretrained(model_name)
21
  self.temperature = temperature
22
 
 
2
  import streamlit as st
3
  import pickle
4
  import time
5
+ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
6
  from langchain.llms.base import LLM
7
  from langchain.chains import RetrievalQAWithSourcesChain
8
  from langchain.text_splitter import RecursiveCharacterTextSplitter
 
16
 
17
  class CustomHuggingFaceLLM(LLM):
18
  def __init__(self, model_name, temperature=0.7):
19
+
20
+ # Configure 8-bit quantization using `BitsAndBytesConfig`
21
+ quantization_config = BitsAndBytesConfig(
22
+ load_in_8bit=True, # Enable 8-bit quantization
23
+ llm_int8_enable_fp32_cpu_offload=True # Offload FP32 operations to CPU for further memory savings
24
+ )
25
+
26
+ self.model = AutoModelForCausalLM.from_pretrained(model_name,quantization_config=quantization_config)
27
  self.tokenizer = AutoTokenizer.from_pretrained(model_name)
28
  self.temperature = temperature
29