from threading import Thread
from typing import Dict, Iterator, List

import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextIteratorStreamer,
)
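# Inference helpers for a 4-bit (NF4) quantized Llama-2-7b chat model
# fine-tuned for mental-therapy conversations, exposing a streaming and a
# non-streaming generation entry point.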
# Refuse prompts longer than this many tokens.
MAX_INPUT_TOKEN_LENGTH = 4096

model_name = "vibhorag101/llama-2-7b-chat-hf-phr_mental_therapy_v2"
use_4bit = True
device_map = {"": 0}  # place the entire model on GPU 0
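# NF4 4-bit quantization with float16 compute keeps the 7B weights small
# enough for a single modest GPU; double quantization is left disabled.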
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
def get_input_token_length(messages: List[Dict[str, str]]) -> int:
    # apply_chat_template tokenizes by default, so this counts prompt tokens.
    return len(tokenizer.apply_chat_template(messages))
def get_LLAMA_response_stream(
        messages: List[Dict[str, str]],
        max_new_tokens: int = 1024,
        temperature: float = 0.8,
        top_p: float = 0.95,
        top_k: int = 50) -> Iterator[str]:
    # The chat template already inserts the special tokens, so skip adding them again.
    prompt = tokenizer.apply_chat_template(messages, tokenize=False)
    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to("cuda")
    # input_ids has shape (batch, seq_len): check the sequence length, not len(),
    # which would return the batch size.
    if inputs["input_ids"].shape[1] > MAX_INPUT_TOKEN_LENGTH:
        raise ValueError(
            f"Input token length is {inputs['input_ids'].shape[1]}, "
            f"which exceeds the maximum of {MAX_INPUT_TOKEN_LENGTH}."
        )
    streamer = TextIteratorStreamer(
        tokenizer,
        timeout=10.0,
        skip_prompt=True,
        skip_special_tokens=True,
    )
    # BatchEncoding is a mapping, so its tensors merge with the sampling kwargs.
    generate_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=top_p,
        top_k=top_k,
        temperature=temperature,
        num_beams=1,
    )
    # Run generation on a background thread so the streamer can be drained here.
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()
    outputs = []
    for text in streamer:
        outputs.append(text)
        yield "".join(outputs)  # yield the response accumulated so far
def get_LLAMA_response(
        messages: List[Dict[str, str]],
        max_new_tokens: int = 1024,
        temperature: float = 0.8,
        top_p: float = 0.95,
        top_k: int = 50) -> str:
    prompt = tokenizer.apply_chat_template(messages, tokenize=False)
    # As above, the template already carries the special tokens.
    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to("cuda")
    input_ids = inputs["input_ids"]
    # Check the sequence length (shape[1]), not len(), which is the batch size.
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        raise ValueError(
            f"Input token length is {input_ids.shape[1]}, "
            f"which exceeds the maximum of {MAX_INPUT_TOKEN_LENGTH}."
        )
    output_ids = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,  # cap on newly generated tokens
        do_sample=True,
        top_p=top_p,
        top_k=top_k,
        temperature=temperature,
    )
    # Decode only the newly generated tokens, skipping the echoed prompt.
    output_text = tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True)
    return output_text
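
# A minimal usage sketch, assuming a CUDA GPU is available and that the
# fine-tune uses the standard Llama-2 chat roles; the example message
# below is hypothetical.
if __name__ == "__main__":
    messages = [
        {"role": "user", "content": "I've been feeling anxious lately."},
    ]
    print(get_input_token_length(messages), "prompt tokens")

    # Non-streaming: one complete response string.
    print(get_LLAMA_response(messages))

    # Streaming: each yielded string is the response accumulated so far.
    response = ""
    for response in get_LLAMA_response_stream(messages):
        pass
    print(response)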