VaidikML0508 committed
Commit 582a000 · verified · 1 Parent(s): 1aed14d

Delete handler.py

Files changed (1)
  1. handler.py +0 -102
handler.py DELETED
@@ -1,102 +0,0 @@
- from typing import Dict, List, Any
- import torch
- from unsloth import FastLanguageModel
- import os
-
- class EndpointHandler:
-     def __init__(self, path=""):
-         # Model configuration
-         self.max_seq_length = 8192
-         self.load_in_4bit = True
-         self.dtype = None  # Auto detection
-
-         # Print the CUDA version
-         print(f"CUDA version: {torch.version.cuda}")
-
-         # Load model and tokenizer
-         self.model_id = "VaidikML0508/Access-Me-Instruct-V2"
-         self.model, self.tokenizer = FastLanguageModel.from_pretrained(
-             model_name=self.model_id,
-             max_seq_length=self.max_seq_length,
-             dtype=self.dtype,
-             load_in_4bit=self.load_in_4bit,
-             token=os.environ['HF_KEY']  # Replace with actual token if needed
-         )
-
-         # Prepare model for inference
-         FastLanguageModel.for_inference(self.model)
-
-         # Define prompt template
-         self.prompt_template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
- {}<|eot_id|><|start_header_id|>user<|end_header_id|>
- {}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
- {}<|eot_id|>"""
-
-     def __call__(self, data: Dict[str, Any]) -> Dict[str, str]:
-         """
-         Handle inference request
-         :param data: Dictionary containing 'system_instruction', 'question', and optional parameters
-         :return: Dictionary containing generated response
-         """
-         # Extract inputs
-         system_instruction = data.pop("system_instruction", "You are a helpful AI assistant.")
-         question = data.pop("question", None)
-
-         # Check if question is provided
-         if question is None:
-             return {"error": "Please provide a question."}
-
-         # Extract generation parameters
-         max_new_tokens = data.pop("max_new_tokens", 200)
-         use_cache = data.pop("use_cache", True)
-
-         try:
-             # Prepare input prompt
-             formatted_prompt = self.prompt_template.format(
-                 system_instruction,
-                 question,
-                 ""  # Empty output for generation
-             )
-
-             # Tokenize input
-             inputs = self.tokenizer(
-                 [formatted_prompt],
-                 return_tensors="pt"
-             ).to("cuda")
-
-             # Generate response
-             outputs = self.model.generate(
-                 **inputs,
-                 max_new_tokens=max_new_tokens,
-                 use_cache=use_cache
-             )
-
-             # Decode output
-             generated_text = self.tokenizer.batch_decode(outputs)[0]
-
-             # Extract the assistant's response: take the text after the last
-             # assistant header in the generated sequence
-             assistant_parts = generated_text.split("<|start_header_id|>assistant<|end_header_id|>")
-             if len(assistant_parts) > 1:
-                 response = assistant_parts[-1].replace('<|eot_id|>', "").strip(" \n")
-             else:
-                 response = generated_text
-
-             return {
-                 "generated_text": response,
-                 "full_prompt": formatted_prompt
-             }
-
-         except Exception as e:
-             return {
-                 "error": f"Generation failed: {str(e)}",
-                 "full_prompt": formatted_prompt if 'formatted_prompt' in locals() else None
-             }
-
-     @staticmethod
-     def check_cuda():
-         """
-         Verify CUDA availability
-         """
-         if not torch.cuda.is_available():
-             raise ValueError("CUDA is required for this model")
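
For context, the deleted handler.py implemented a custom Inference Endpoints handler around the Unsloth-loaded VaidikML0508/Access-Me-Instruct-V2 model. Below is a minimal sketch of how it would have been invoked locally before deletion, assuming the file is still available as handler.py, a CUDA GPU is present, unsloth is installed, and HF_KEY is set in the environment; the payload values are illustrative only.

```python
# Hypothetical local smoke test for the (now deleted) handler.
# Assumes: handler.py on the import path, CUDA available, unsloth installed,
# and the HF_KEY environment variable set to a valid Hugging Face token.
from handler import EndpointHandler

# Instantiation loads the 4-bit model and tokenizer and prepares it for inference.
handler = EndpointHandler(path=".")

# The handler reads 'system_instruction' and 'question' from the payload,
# plus optional generation parameters ('max_new_tokens', 'use_cache').
payload = {
    "system_instruction": "You are a helpful AI assistant.",
    "question": "What does 4-bit loading change about memory usage?",
    "max_new_tokens": 200,
}

result = handler(payload)
if "error" in result:
    print("Error:", result["error"])
else:
    print(result["generated_text"])
```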