import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

# Base model and the fine-tuned LoRA adapter to layer on top of it
base_model_name = "mistralai/Mistral-7B-Instruct-v0.2"
adapter_model_name = "TymofiiNas/results"

# 4-bit NF4 quantization config so the 7B model fits on a single GPU
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
)

# Load the quantized base model on GPU 0 and attach the PEFT adapter
model = AutoModelForCausalLM.from_pretrained(
    base_model_name, quantization_config=bnb_config, device_map={"": 0}
)
model = PeftModel.from_pretrained(model, adapter_model_name)
tokenizer = AutoTokenizer.from_pretrained(base_model_name)


def generate_response(text):
    # Wrap the user prompt in Mistral's instruction tags
    text = " [INST]" + text + "[/INST]"
    encoded_input = tokenizer(text, return_tensors="pt", add_special_tokens=False)
    model_inputs = encoded_input.to("cuda")
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=400,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Decode the full sequence and strip the echoed prompt from the front
    decoded_output = tokenizer.batch_decode(generated_ids)
    return decoded_output[0][len(text):]


demo = gr.Interface(
    fn=generate_response,
    inputs="text",
    outputs="text",
)

gr.TabbedInterface([demo]).queue().launch()