import re
from typing import Any, Dict, List

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification


class EndpointHandler():
    """Inference handler that scores text as Human / Mixed / AI.

    A call classifies the input at three granularities: the whole document,
    each sentence, and each token (using a small window of surrounding
    tokens as context).
    """

    # Maximum number of tokens fed to the model per forward pass.
    MAX_LENGTH = 512
    # Index of the "AI" class in the model's logits (see ``id2label``).
    AI_LABEL_ID = 2
    # A sentence boundary is whitespace that follows '.', '!' or '?'.
    _SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+')

    def __init__(self, path=""):
        """Load the tokenizer and model from *path* and move the model to
        the GPU when one is available, otherwise to the CPU."""
        self.tokenizer = AutoTokenizer.from_pretrained(path)
        self.model = AutoModelForSequenceClassification.from_pretrained(path)
        self.model.eval()
        self.id2label = {0: "Human", 1: "Mixed", 2: "AI"}
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

    def split_into_sentences(self, text: str):
        """Split *text* on whitespace that follows '.', '!' or '?'.

        Returns the stripped, non-empty pieces in their original order.
        """
        pieces = self._SENTENCE_BOUNDARY.split(text)
        return [piece.strip() for piece in pieces if piece.strip()]

    def _predict_probs(self, text: str):
        """Return the softmax class probabilities for *text* as a 1-D tensor."""
        encoded = self.tokenizer(
            text, return_tensors="pt", truncation=True, max_length=self.MAX_LENGTH
        )
        encoded = {name: tensor.to(self.device) for name, tensor in encoded.items()}
        with torch.no_grad():
            logits = self.model(**encoded).logits
        # One text per call, so row 0 is the only row.
        return torch.softmax(logits, dim=1)[0]

    def _classify(self, text: str) -> Dict[str, Any]:
        """Classify *text* and return its label, confidence and per-label
        probabilities."""
        probs = self._predict_probs(text)
        pred = int(torch.argmax(probs).item())
        return {
            "prediction": self.id2label[pred],
            "confidence": probs[pred].item(),
            "probabilities": {self.id2label[i]: float(p) for i, p in enumerate(probs)},
        }

    def get_token_predictions(self, text: str, window: int = 10):
        """Score every token of *text* with the model's "AI" probability.

        Each token is scored on the string rebuilt from the tokens in
        ``[i - window, i + window)``: up to ``window`` tokens before it, the
        token itself, and up to ``window - 1`` tokens after it.

        Args:
            text: The text to tokenize and score.
            window: Context radius in tokens; must be at least 1 so the
                scored token is part of its own context.

        Returns:
            A list of ``{"token": str, "ai_prob": float}`` dicts, one per
            token whose cleaned text is non-empty, in token order.

        Raises:
            ValueError: If *window* is smaller than 1.
        """
        if window < 1:
            raise ValueError("window must be at least 1")

        tokens = self.tokenizer.tokenize(text)
        token_predictions = []
        for i, raw_token in enumerate(tokens):
            # Replace subword-marker characters (e.g. byte-level BPE
            # "Ġ"/"Ċ", SentencePiece "▁") with spaces and trim.
            token = raw_token.replace("Ġ", " ").replace("▁", " ").replace("Ċ", " ").strip()
            if not token:
                # Nothing would be reported for this token, so skip the
                # forward pass entirely.
                continue
            start = max(0, i - window)
            end = min(len(tokens), i + window)
            context = self.tokenizer.convert_tokens_to_string(tokens[start:end])
            ai_prob = self._predict_probs(context)[self.AI_LABEL_ID].item()
            token_predictions.append({"token": token, "ai_prob": ai_prob})
        return token_predictions

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Classify the text found under ``data["inputs"]``.

        Args:
            data: Request payload; the text is read from the ``"inputs"``
                key and defaults to the empty string when the key is absent.

        Returns:
            A single-element list holding a dict with three keys:
            ``"document"`` (classification of the whole text),
            ``"sentences"`` (one classification per sentence, each also
            carrying the ``"sentence"`` text) and ``"tokens"`` (per-token
            "AI" probabilities from ``get_token_predictions``).
        """
        text = data.get("inputs", "")

        doc_result = self._classify(text)
        sent_results = [
            {"sentence": sent, **self._classify(sent)}
            for sent in self.split_into_sentences(text)
        ]
        token_results = self.get_token_predictions(text)

        return [{
            "document": doc_result,
            "sentences": sent_results,
            "tokens": token_results
        }]