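# Inference helpers for a LLaMA-2-7B chat model fine-tuned for mental-health
# therapy conversations, loaded in 4-bit (NF4) via bitsandbytes and exposed
# both as a blocking call and as a token-by-token stream.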
from threading import Thread
from typing import Dict, Iterator, List

import torch
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          BitsAndBytesConfig, TextIteratorStreamer)
MAX_INPUT_TOKEN_LENGTH = 4096
model_name = "vibhorag101/llama-2-7b-chat-hf-phr_mental_therapy_v2"
use_4bit = True
device_map = {"": 0}  # place the whole model on GPU 0

# 4-bit NF4 quantization so the 7B model fits on a single consumer GPU
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
def get_input_token_length(messages: List[Dict[str, str]]) -> int:
    # apply_chat_template with tokenize=True (the default) returns token ids
    return len(tokenizer.apply_chat_template(messages))
def get_LLAMA_response_stream(
        messages: List[Dict[str, str]],
        max_new_tokens: int = 1024,
        temperature: float = 0.8,
        top_p: float = 0.95,
        top_k: int = 50) -> Iterator[str]:
    prompt = tokenizer.apply_chat_template(messages, tokenize=False)
    # add_special_tokens=False: the chat template already includes the BOS token
    inputs = tokenizer(prompt, return_tensors='pt', add_special_tokens=False).to('cuda')
    # input_ids has shape (batch, seq_len); check the sequence length, not len()
    if inputs["input_ids"].shape[1] > MAX_INPUT_TOKEN_LENGTH:
        raise ValueError(f"Input token length is {inputs['input_ids'].shape[1]}, which exceeds the maximum of {MAX_INPUT_TOKEN_LENGTH}.")
    # Stream decoded tokens back as they are generated, skipping the echoed prompt
    streamer = TextIteratorStreamer(tokenizer,
                                    timeout=10.,
                                    skip_prompt=True,
                                    skip_special_tokens=True)
    generate_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=top_p,
        top_k=top_k,
        temperature=temperature,
        num_beams=1,
    )
    # Run generation in a background thread so we can iterate over the streamer here
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()
    outputs = []
    for text in streamer:
        outputs.append(text)
        yield ''.join(outputs)  # yield the accumulated response so far
def get_LLAMA_response(
        messages: List[Dict[str, str]],
        max_new_tokens: int = 1024,
        temperature: float = 0.8,
        top_p: float = 0.95,
        top_k: int = 50) -> str:
    prompt = tokenizer.apply_chat_template(messages, tokenize=False)
    # add_special_tokens=False: the chat template already includes the BOS token
    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to("cuda")
    input_ids = inputs["input_ids"]
    # input_ids has shape (batch, seq_len); check the sequence length, not len()
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        raise ValueError(f"Input token length is {input_ids.shape[1]}, which exceeds the maximum of {MAX_INPUT_TOKEN_LENGTH}.")
    output_ids = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,  # caps generated tokens; max_length is redundant and would be overridden
        do_sample=True,
        top_p=top_p,
        top_k=top_k,
        temperature=temperature)
    # Decode only the newly generated tokens, not the echoed prompt
    output_text = tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True)
    return output_text
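

# Minimal usage sketch (illustrative, not part of the original Space): messages
# follow the usual chat-template convention of {"role": ..., "content": ...}
# dicts; the system prompt below is a placeholder, not the one used to
# fine-tune the model.
if __name__ == "__main__":
    example_messages = [
        {"role": "system", "content": "You are a supportive therapy assistant."},
        {"role": "user", "content": "I have been feeling anxious lately."},
    ]
    print("prompt tokens:", get_input_token_length(example_messages))

    # Blocking call: returns the full reply as a single string.
    print(get_LLAMA_response(example_messages, max_new_tokens=256))

    # Streaming call: each item is the accumulated reply so far, so keeping
    # only the last item gives the complete response.
    reply = ""
    for partial in get_LLAMA_response_stream(example_messages, max_new_tokens=256):
        reply = partial
    print(reply)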