---
license: apache-2.0
datasets:
- Subh775/formatted-hindi-hinglish-cot
language:
- en
- hi
base_model:
- unsloth/Mistral-Small-Instruct-2409
pipeline_tag: text-generation
library_name: adapter-transformers
tags:
- LoRA
- text-generation-inference
- unsloth
---

## Inference Instructions:

```python
!pip install unsloth
```

```python
from unsloth import FastLanguageModel
from transformers import TextStreamer
import torch

# Load your fine-tuned model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="QuantumInk/Mistral-small-12B-Hinglish-cot",
    max_seq_length=2048,
    load_in_4bit=True
)
FastLanguageModel.for_inference(model)

# Streamer for real-time decoding (skip_prompt=True so the accumulated
# chat history is not re-printed before every response)
text_streamer = TextStreamer(tokenizer, skip_prompt=True)

# Alpaca prompt template
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input_text}

### Response:
{output}"""
```

```python
# Chat loop with memory
def chat():
    print("šŸ’¬ Chat with Mistral-small-12B-Hinglish-cot! Type '\\q' or 'quit' to exit.\n")
    chat_history = ""  # Full chat history with prompts and responses

    while True:
        user_input = input("āž¤ ")
        if user_input.lower() in ["\\q", "quit"]:
            print("\nšŸ‘‹ Exiting chat. Goodbye!")
            break

        # Format the current prompt
        current_prompt = alpaca_prompt.format(
            instruction="Continue the following conversation.",
            input_text=user_input,
            output=""
        )

        # Add to full chat history
        chat_history += current_prompt + "\n"

        # Tokenize the full prompt
        inputs = tokenizer([chat_history], return_tensors="pt").to("cuda")

        print("\nšŸ¤–: ", end="")  # Prepare for streaming output

        # Generate response using streamer
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            no_repeat_ngram_size=2,
            streamer=text_streamer
        )

        # Decode and capture response for chat history
        full_output = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
        response = full_output.split("### Response:")[-1].strip()

        # Add response to chat history
        chat_history += f"{response}\n"

# Run the chat
chat()
```
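For one-off prompts, a minimal single-turn sketch without streaming or history may be simpler. It reuses the `model`, `tokenizer`, and `alpaca_prompt` defined above; the instruction and input text below are illustrative placeholders, not fixed prompts for this model.

```python
# Minimal single-turn inference sketch; assumes model, tokenizer, and
# alpaca_prompt from the blocks above. The question is just an example.
prompt = alpaca_prompt.format(
    instruction="Answer the following question.",
    input_text="Bharat ki rajdhani kya hai?",  # "What is the capital of India?"
    output=""
)
inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
outputs = model.generate(
    **inputs,
    max_new_tokens=256,
    temperature=0.7,
    top_p=0.9,
    do_sample=True
)

# Keep only the text generated after the final "### Response:" marker
full_output = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
print(full_output.split("### Response:")[-1].strip())
```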
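Note that `chat_history` grows without bound, so long conversations will eventually exceed `max_seq_length=2048`. Below is a minimal sketch of one way to keep the prompt within budget; the `trim_to_budget` helper and its token budget are assumptions for illustration, not part of this model card.

```python
# Hypothetical helper: keep only the most recent tokens of the history so the
# prompt plus max_new_tokens stays inside the 2048-token context window.
PROMPT_BUDGET = 2048 - 256  # max_seq_length minus max_new_tokens (assumed split)

def trim_to_budget(history: str, budget: int = PROMPT_BUDGET) -> str:
    ids = tokenizer(history)["input_ids"]
    if len(ids) <= budget:
        return history
    # Drop the oldest tokens; note this can cut a turn in the middle
    return tokenizer.decode(ids[-budget:], skip_special_tokens=True)
```

Calling `chat_history = trim_to_budget(chat_history)` just before tokenizing in the loop above would drop the oldest context instead of overflowing the window.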