henriceriocain
/

HenriAI

Text Generation

Model card Files Files and versions

henriceriocain committed on Jan 30

Commit

a0939c3

·

verified ·

1 Parent(s): 2f8418c

Create handler.py

Files changed (1) hide show

handler.py +79 -0

handler.py ADDED Viewed

	@@ -0,0 +1,79 @@

+from typing import Dict, List
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+from peft import PeftModel
+class EndpointHandler:
+    """Question-answering handler: 4-bit GPT-J-6B plus a PEFT adapter.
+
+    The base model is loaded with 4-bit NF4 quantization and the adapter
+    weights found at ``path`` are attached on top of it. Calling the handler
+    with ``{"inputs": <question>}`` returns a one-element list holding the
+    decoded output sequence.
+    """
+
+    # Hub id shared by the base model and its tokenizer.
+    BASE_MODEL_ID = "EleutherAI/gpt-j-6B"
+    # Prompts longer than this many tokens are truncated by the tokenizer.
+    MAX_PROMPT_TOKENS = 512
+    # Generation budget, counted separately from the prompt (see __call__).
+    MAX_NEW_TOKENS = 512
+
+    def __init__(self, path: str):
+        """Load the quantized base model, the adapter and the tokenizer.
+
+        Args:
+            path: Location of the PEFT adapter weights, passed straight to
+                ``PeftModel.from_pretrained``.
+        """
+        print("Loading base model...")
+        # 4-bit NF4 weights with double quantization and an fp16 compute dtype.
+        self.bnb_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_compute_dtype=torch.float16,
+            bnb_4bit_use_double_quant=True,
+        )
+        base_model = AutoModelForCausalLM.from_pretrained(
+            self.BASE_MODEL_ID,
+            quantization_config=self.bnb_config,
+            device_map="auto",
+            torch_dtype=torch.float16,
+        )
+        print("Loading adapter weights...")
+        self.model = PeftModel.from_pretrained(base_model, path)
+        self.tokenizer = AutoTokenizer.from_pretrained(self.BASE_MODEL_ID)
+        # Reuse EOS as the padding token so padding-aware calls have one.
+        self.tokenizer.pad_token = self.tokenizer.eos_token
+
+    @staticmethod
+    def _extract_question(data: Dict):
+        """Return the question held in a request payload without mutating it.
+
+        The value under ``"inputs"`` is used when present, otherwise the
+        payload itself. A list is reduced to its first element.
+
+        Raises:
+            ValueError: If ``inputs`` is an empty list.
+        """
+        # .get() instead of .pop(): the caller's dict must stay intact.
+        question = data.get("inputs", data)
+        if isinstance(question, list):
+            if not question:
+                raise ValueError("'inputs' must not be an empty list")
+            question = question[0]
+        return question
+
+    @staticmethod
+    def _build_prompt(question) -> str:
+        """Wrap a question in the ``Question: ... Answer:`` prompt template."""
+        return f"Question: {question}\nAnswer:"
+
+    def __call__(self, data: Dict) -> List[str]:
+        """Generate an answer for the question carried in ``data``.
+
+        Args:
+            data: Request payload. The question is read from
+                ``data["inputs"]``; when that key is absent the payload
+                itself is used. If the value is a list, only its first
+                element is answered.
+
+        Returns:
+            A single-element list with the decoded output sequence. The
+            whole sequence is decoded, so the text begins with the prompt.
+
+        Raises:
+            ValueError: If ``inputs`` is an empty list.
+        """
+        prompt = self._build_prompt(self._extract_question(data))
+        encoded = self.tokenizer(
+            prompt,
+            return_tensors="pt",
+            truncation=True,
+            max_length=self.MAX_PROMPT_TOKENS,
+        ).to(self.model.device)
+        # torch.autocast("cuda") is the supported spelling of the deprecated
+        # torch.cuda.amp.autocast().
+        with torch.inference_mode(), torch.autocast("cuda"):
+            outputs = self.model.generate(
+                **encoded,
+                # max_length counts prompt tokens as well, so a prompt that
+                # was truncated to the cap left no room for an answer;
+                # max_new_tokens budgets the answer independently.
+                max_new_tokens=self.MAX_NEW_TOKENS,
+                num_return_sequences=1,
+                temperature=0.7,
+                do_sample=True,
+                use_cache=True,
+                # Explicit pad id avoids the per-request fallback warning.
+                pad_token_id=self.tokenizer.pad_token_id,
+            )
+        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+        # Wrapped in a list for API compatibility.
+        return [response]
+
+    def preprocess(self, request):
+        """Return the parsed JSON body of a JSON request, else the request.
+
+        The media type is compared case-insensitively and without its
+        parameters, so ``application/json; charset=utf-8`` is recognised.
+        """
+        # NOTE(review): assumes ``request.json`` is an attribute/property
+        # holding the parsed body (not a method) - confirm against the server.
+        media_type = (request.content_type or "").split(";", 1)[0].strip().lower()
+        if media_type == "application/json":
+            return request.json
+        return request
+
+    def postprocess(self, response):
+        """Return ``response`` unchanged; kept for API compatibility."""
+        return response