---
license: apache-2.0
datasets:
- Subh775/formatted-hindi-hinglish-cot
language:
- en
- hi
base_model:
- unsloth/Mistral-Small-Instruct-2409
pipeline_tag: text-generation
library_name: adapter-transformers
tags:
- LoRA
- text-generation-inference
- unsloth
---

## Inference Instructions:

Install Unsloth first (the leading `!` assumes a notebook environment such as Google Colab):

```python
!pip install unsloth
```
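
The 4-bit load and the `.to("cuda")` calls in the snippets below assume an NVIDIA GPU is available. An optional sanity check:

```python
import torch

# Verify that a CUDA-capable GPU is visible before loading the model in 4-bit.
assert torch.cuda.is_available(), "A CUDA GPU is required for 4-bit loading and generation."
print(torch.cuda.get_device_name(0))
```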
```python
from unsloth import FastLanguageModel
from transformers import TextStreamer
import torch

# Load your fine-tuned model (4-bit to reduce GPU memory usage)
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="QuantumInk/Mistral-small-12B-Hinglish-cot",
    max_seq_length=2048,
    load_in_4bit=True
)
FastLanguageModel.for_inference(model)

# Streamer for real-time decoding; skip_prompt=True avoids re-printing the prompt each turn
text_streamer = TextStreamer(tokenizer, skip_prompt=True)

# Alpaca prompt template
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input_text}

### Response:
{output}"""
```
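
For a one-off completion (without the interactive loop below), you can fill the template and generate directly. This is a minimal sketch reusing the objects defined above; the instruction and input strings are only illustrative:

```python
# One-off generation with the Alpaca template (example strings are illustrative)
prompt = alpaca_prompt.format(
    instruction="Answer the question in Hinglish.",
    input_text="Machine learning kya hota hai?",
    output=""
)

inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

outputs = model.generate(
    **inputs,
    max_new_tokens=256,
    temperature=0.7,
    top_p=0.9,
    do_sample=True,
    streamer=text_streamer
)

# Keep only the text generated after the "### Response:" marker
response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
print(response.split("### Response:")[-1].strip())
```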
```python
# Chat loop with memory
def chat():
    print("💬 Chat with Mistral-small-12B-Hinglish-cot! Type '\\q' or 'quit' to exit.\n")

    chat_history = ""  # Full chat history with prompts and responses

    while True:
        user_input = input("➤ ")

        if user_input.lower() in ["\\q", "quit"]:
            print("\n👋 Exiting chat. Goodbye!")
            break

        # Format the current prompt
        current_prompt = alpaca_prompt.format(
            instruction="Continue the following conversation.",
            input_text=user_input,
            output=""
        )

        # Add to full chat history
        chat_history += current_prompt + "\n"

        # Tokenize the full prompt
        inputs = tokenizer([chat_history], return_tensors="pt").to("cuda")

        print("\n🤖: ", end="")  # Prepare for streaming output

        # Generate response using streamer
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            no_repeat_ngram_size=2,
            streamer=text_streamer
        )

        # Decode and capture the response for chat history
        full_output = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
        response = full_output.split("### Response:")[-1].strip()

        # Add the response to chat history
        chat_history += f"{response}\n"

# Run the chat
chat()
```
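
Note that `chat_history` grows with every turn while the model is loaded with `max_seq_length=2048`, so long conversations will eventually exceed the context window. One way to handle this is to trim the oldest tokens before tokenizing; the snippet below is a minimal sketch, and the helper name and left-truncation strategy are illustrative choices:

```python
# Optional: keep the running chat history within the model's context window.
MAX_CONTEXT_TOKENS = 2048  # matches max_seq_length used in from_pretrained above

def trim_history(history: str, reserve_for_reply: int = 256) -> str:
    """Drop the oldest tokens so that prompt + reply fit in the context window."""
    budget = MAX_CONTEXT_TOKENS - reserve_for_reply
    token_ids = tokenizer(history, add_special_tokens=False)["input_ids"]
    if len(token_ids) <= budget:
        return history
    # Keep only the most recent tokens (older turns are discarded).
    return tokenizer.decode(token_ids[-budget:], skip_special_tokens=True)

# Inside the chat loop, call this just before tokenizing:
# chat_history = trim_history(chat_history)
```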