|
|
|
|
|
# Source code of the text pre-processing function, kept as a string so it can
# be stored inside the model config. A raw string is used so the regular
# expressions below can be written with single backslashes.
preprocess_code = r"""
def preprocess(text):
    '''Clean raw text before it is handed to the model.

    Steps: coerce to str, strip HTML tags and URLs, drop punctuation,
    lowercase, normalise whitespace characters, remove English stop words
    (spaCy), and collapse repeated whitespace.

    Returns the cleaned text. If any step fails, the error is printed and
    the text is returned in whatever state it had reached (best effort).
    '''
    # Imports are local so the function is self-contained when this source
    # string is exec'd elsewhere.
    import re
    import string
    import spacy

    try:
        # Make sure we are working with a string.
        text = str(text)

        # Remove HTML tags.
        text = re.sub(r"<.*?>", "", text)

        # Remove URLs and bare domains.
        # NOTE(review): the last alternative is intentionally broad (any
        # "x.yy" token); confirm this is acceptable for the target data.
        url_pattern = r"https?://\S+|www\.\S+|\S+\.\S{2,}"
        text = re.sub(url_pattern, "", text)

        # Remove punctuation. str.translate returns a new string, so the
        # result must be assigned back.
        translator = str.maketrans("", "", string.punctuation)
        text = text.translate(translator)

        # Lower case and trim the ends.
        text = text.lower().strip()

        # Non-breaking spaces become plain spaces; deleting them outright
        # would glue the neighbouring words together.
        text = text.replace("\xa0", " ")

        # Replace newline, tab and carriage-return characters with a space.
        text = re.sub(r"[\n\t\r]", " ", text)

        # Remove stop words using spaCy.
        # NOTE(review): the pipeline is loaded on every call, which is slow;
        # consider caching it at the call site.
        spacy.prefer_gpu()  # use the GPU if available; may reduce run time
        nlp = spacy.load("en_core_web_sm")
        doc = nlp(text)
        text = " ".join([token.text for token in doc if not token.is_stop])

        # Collapse runs of whitespace into a single space.
        text = re.sub(r"\s+", " ", text)
    except Exception as exc:
        # Best effort: report the problem and fall through to return the
        # text as processed so far.
        print(f"error occured: {exc}")

    return text
"""
|
|
|
|
|
# Source code of the post-processing function, kept as a string so it can be
# stored inside the model config.
postprocess_code = """
def post_process(output):
    '''Return the class name with the highest score in ``output.logits``.

    ``output`` must expose a ``logits`` tensor that squeezes down to one
    score per entry of ``classes``. On any failure the error is printed and
    None is returned.
    '''
    # Local import keeps the function self-contained when this source string
    # is exec'd elsewhere.
    import torch

    classes = ['ACCOUNTANT', 'ADVOCATE', 'AGRICULTURE', 'APPAREL', 'ARTS', 'AUTOMOBILE', 'AVIATION', 'BANKING', 'BPO', 'BUSINESS-DEVELOPMENT', 'CHEF', 'CONSTRUCTION', 'CONSULTANT', 'DESIGNER', 'DIGITAL-MEDIA', 'ENGINEERING', 'FINANCE', 'FITNESS', 'HEALTHCARE', 'HR', 'INFORMATION-TECHNOLOGY', 'PUBLIC-RELATIONS', 'SALES', 'TEACHER']

    try:
        # Sigmoid is monotonic, so the arg-max of the raw logits is the same
        # class the sigmoid probabilities would pick; no sort is needed.
        scores = output.logits.squeeze()
        best = torch.argmax(scores, dim=-1)
        # .item() raises if more than one sample is present, which lands in
        # the handler below instead of returning a wrong label.
        return classes[best.item()]
    except Exception as exc:
        print(f"Some Error occured: {exc}")
        return None
"""
|
|
|
|
|
|
|
from transformers import PretrainedConfig, AutoModel |
|
|
|
class CustomConfig(PretrainedConfig):
    """Configuration that carries pre- and post-processing source code.

    The two extra values are stored as plain attributes on the config; every
    other keyword argument is forwarded unchanged to ``PretrainedConfig``.
    """

    def __init__(
        self,
        preprocess_function=None,
        postprocess_function=None,
        **kwargs,
    ):
        """Store the two function sources after initialising the base config."""
        super().__init__(**kwargs)
        self.preprocess_function = preprocess_function
        self.postprocess_function = postprocess_function
|
|
|
|
|
# Bundle both source strings into a config object and write it to disk.
config = CustomConfig(
    preprocess_function=preprocess_code,
    postprocess_function=postprocess_code,
)
config.save_pretrained("config with functions")