VaidikML0508 committed
Commit 582a000 · verified · 1 Parent(s): 1aed14d

Delete handler.py

Files changed (1)
  1. handler.py +0 -102
handler.py DELETED
@@ -1,102 +0,0 @@
- from typing import Dict, List, Any
- import torch
- from unsloth import FastLanguageModel
- import os
-
- class EndpointHandler:
-     def __init__(self, path=""):
-         # Model configuration
-         self.max_seq_length = 8192
-         self.load_in_4bit = True
-         self.dtype = None  # Auto detection
-
-         # Print the CUDA version
-         print(f"CUDA version: {torch.version.cuda}")
-
-         # Load model and tokenizer
-         self.model_id = "VaidikML0508/Access-Me-Instruct-V2"
-         self.model, self.tokenizer = FastLanguageModel.from_pretrained(
-             model_name=self.model_id,
-             max_seq_length=self.max_seq_length,
-             dtype=self.dtype,
-             load_in_4bit=self.load_in_4bit,
-             token=os.environ['HF_KEY']  # Replace with actual token if needed
-         )
-
-         # Prepare model for inference
-         FastLanguageModel.for_inference(self.model)
-
-         # Define prompt template
-         self.prompt_template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
- {}<|eot_id|><|start_header_id|>user<|end_header_id|>
- {}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
- {}<|eot_id|>"""
-
-     def __call__(self, data: Dict[str, Any]) -> Dict[str, str]:
-         """
-         Handle inference request
-         :param data: Dictionary containing 'system_instruction', 'question', and optional parameters
-         :return: Dictionary containing generated response
-         """
-         # Extract inputs
-         system_instruction = data.pop("system_instruction", "You are a helpful AI assistant.")
-         question = data.pop("question", None)
-
-         # Check if question is provided
-         if question is None:
-             return {"error": "Please provide a question."}
-
-         # Extract generation parameters
-         max_new_tokens = data.pop("max_new_tokens", 200)
-         use_cache = data.pop("use_cache", True)
-
-         try:
-             # Prepare input prompt
-             formatted_prompt = self.prompt_template.format(
-                 system_instruction,
-                 question,
-                 ""  # Empty output for generation
-             )
-
-             # Tokenize input
-             inputs = self.tokenizer(
-                 [formatted_prompt],
-                 return_tensors="pt"
-             ).to("cuda")
-
-             # Generate response
-             outputs = self.model.generate(
-                 **inputs,
-                 max_new_tokens=max_new_tokens,
-                 use_cache=use_cache
-             )
-
-             # Decode output
-             generated_text = self.tokenizer.batch_decode(outputs)[0]
-
-             # Extract the assistant's response: take the text after the last
-             # assistant header in the generated sequence
-             assistant_parts = generated_text.split("<|start_header_id|>assistant<|end_header_id|>")
-             if len(assistant_parts) > 1:
-                 response = assistant_parts[-1].replace('<|eot_id|>', "").strip(" \n")
-             else:
-                 response = generated_text
-
-             return {
-                 "generated_text": response,
-                 "full_prompt": formatted_prompt
-             }
-
-         except Exception as e:
-             return {
-                 "error": f"Generation failed: {str(e)}",
-                 "full_prompt": formatted_prompt if 'formatted_prompt' in locals() else None
-             }
-
-     @staticmethod
-     def check_cuda():
-         """
-         Verify CUDA availability
-         """
-         if not torch.cuda.is_available():
-             raise ValueError("CUDA is required for this model")
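
For context, the deleted handler.py implemented a custom Inference Endpoints handler around the Unsloth-loaded VaidikML0508/Access-Me-Instruct-V2 model. Below is a minimal sketch of how it would have been invoked locally before deletion, assuming the file is still available as handler.py, a CUDA GPU is present, unsloth is installed, and HF_KEY is set in the environment; the payload values are illustrative only.

```python
# Hypothetical local smoke test for the (now deleted) handler.
# Assumes: handler.py on the import path, CUDA available, unsloth installed,
# and the HF_KEY environment variable set to a valid Hugging Face token.
from handler import EndpointHandler

# Instantiation loads the 4-bit model and tokenizer and prepares it for inference.
handler = EndpointHandler(path=".")

# The handler reads 'system_instruction' and 'question' from the payload,
# plus optional generation parameters ('max_new_tokens', 'use_cache').
payload = {
    "system_instruction": "You are a helpful AI assistant.",
    "question": "What does 4-bit loading change about memory usage?",
    "max_new_tokens": 200,
}

result = handler(payload)
if "error" in result:
    print("Error:", result["error"])
else:
    print(result["generated_text"])
```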