add debertav3
- lemma_dict.pkl → BiLSTM/lemma_dict.pkl +0 -0
- model_1.h5 → BiLSTM/model_1.h5 +0 -0
- model_2.h5 → BiLSTM/model_2.h5 +0 -0
- model_3.h5 → BiLSTM/model_3.h5 +0 -0
- model_4.h5 → BiLSTM/model_4.h5 +0 -0
- word_dict.pkl → BiLSTM/word_dict.pkl +0 -0
- DeBERTaV3/QIQC-deberta-v3/added_tokens.json +1 -0
- DeBERTaV3/QIQC-deberta-v3/special_tokens_map.json +1 -0
- DeBERTaV3/QIQC-deberta-v3/spm.model +3 -0
- DeBERTaV3/QIQC-deberta-v3/tokenizer_config.json +1 -0
- DeBERTaV3/models/fastai_QIQC-deberta-v3.pkl +3 -0
- app.py +160 -51
- pip +0 -0
- requirements.txt +5 -0
lemma_dict.pkl → BiLSTM/lemma_dict.pkl
RENAMED
File without changes

model_1.h5 → BiLSTM/model_1.h5
RENAMED
File without changes

model_2.h5 → BiLSTM/model_2.h5
RENAMED
File without changes

model_3.h5 → BiLSTM/model_3.h5
RENAMED
File without changes

model_4.h5 → BiLSTM/model_4.h5
RENAMED
File without changes

word_dict.pkl → BiLSTM/word_dict.pkl
RENAMED
File without changes
DeBERTaV3/QIQC-deberta-v3/added_tokens.json
ADDED
@@ -0,0 +1 @@
+{"[MASK]": 128000}
DeBERTaV3/QIQC-deberta-v3/special_tokens_map.json
ADDED
@@ -0,0 +1 @@
+{"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
DeBERTaV3/QIQC-deberta-v3/spm.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c679fbf93643d19aab7ee10c0b99e460bdbc02fedf34b92b05af343b4af586fd
+size 2464616
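`spm.model` is stored as a Git LFS pointer, so the three lines above are metadata, not the model itself. Once the real blob has been fetched (e.g. with `git lfs pull`), the SentencePiece model can be inspected directly — a minimal sketch, assuming the fetched file sits at the committed path:

```python
# Minimal sketch: peek inside the SentencePiece model behind the tokenizer.
# Assumes the real LFS blob (not the pointer file) is at this path.
import sentencepiece as spm

sp = spm.SentencePieceProcessor(model_file="DeBERTaV3/QIQC-deberta-v3/spm.model")
print(sp.get_piece_size())  # piece count; [MASK] is added on top at id 128000
print(sp.encode("How do you train a pigeon?", out_type=str))  # subword pieces
```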
DeBERTaV3/QIQC-deberta-v3/tokenizer_config.json
ADDED
@@ -0,0 +1 @@
+{"do_lower_case": false, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "split_by_punct": false, "vocab_type": "spm", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "microsoft/deberta-v3-base"}
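Together with `added_tokens.json`, `special_tokens_map.json`, and `spm.model`, this config is everything `transformers` needs to rebuild the tokenizer from the repo instead of the Hub. A minimal sketch, assuming `sentencepiece` is installed (the directory ships no `config.json`, so the concrete `DebertaV2Tokenizer` class is used here rather than `AutoTokenizer`):

```python
# Minimal sketch: load the committed tokenizer files locally.
from transformers import DebertaV2Tokenizer

tok = DebertaV2Tokenizer.from_pretrained("DeBERTaV3/QIQC-deberta-v3")
print(tok.mask_token, tok.mask_token_id)  # "[MASK]", 128000 per added_tokens.json
print(tok("Is this question sincere?", truncation=True, max_length=30)["input_ids"])
```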
DeBERTaV3/models/fastai_QIQC-deberta-v3.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3ea49299262eff9de87d248861c80e826f3a710a6a14410c899fc2de8b4ea24a
+size 746435557
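Like `spm.model`, this ~746 MB learner pickle is committed as an LFS pointer: a spec version, a sha256 oid, and a byte size. A minimal sketch for checking that a fetched file matches its pointer, standard library only (`verify_lfs_blob` is a hypothetical helper, not part of this repo):

```python
# Minimal sketch: verify a checked-out Git LFS file against its pointer metadata.
import hashlib
import os

def verify_lfs_blob(path: str, oid: str, size: int) -> bool:
    """Hypothetical helper: compare on-disk size and sha256 to the LFS pointer."""
    if os.path.getsize(path) != size:
        return False
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest() == oid

print(verify_lfs_blob(
    "DeBERTaV3/models/fastai_QIQC-deberta-v3.pkl",
    "3ea49299262eff9de87d248861c80e826f3a710a6a14410c899fc2de8b4ea24a",
    746435557,
))
```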
app.py
CHANGED
@@ -7,27 +7,33 @@ import spacy
 from tqdm import tqdm
 import gc
 import os
+import torch
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from fastai.vision.all import *
+from fastai.text.all import *
+from torch.utils.data import Dataset

-
+model_lst = ["DeBERTaV3", "BiLSTM"]
+
+# BiLSTM Model
+## Download the SpaCy model
 os.system("python -m spacy download en_core_web_lg")

-
-model_1 = tf.keras.models.load_model("model_1.h5")
-model_2 = tf.keras.models.load_model("model_2.h5")
-model_3 = tf.keras.models.load_model("model_3.h5")
-model_4 = tf.keras.models.load_model("model_4.h5")
+## Load models
+model_1 = tf.keras.models.load_model("BiLSTM/model_1.h5")
+model_2 = tf.keras.models.load_model("BiLSTM/model_2.h5")
+model_3 = tf.keras.models.load_model("BiLSTM/model_3.h5")
+model_4 = tf.keras.models.load_model("BiLSTM/model_4.h5")

-
-with open('word_dict.pkl', 'rb') as f:
+## Load dictionaries
+with open('BiLSTM/word_dict.pkl', 'rb') as f:
     word_dict = pickle.load(f)

-
-    lemma_dict = pickle.load(f)
-
-# Load SpaCy NLP model
+## Load SpaCy NLP model
 nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner', 'tagger'])
 nlp.vocab.add_flag(lambda s: s.lower() in spacy.lang.en.stop_words.STOP_WORDS, spacy.attrs.IS_STOP)

+## tokenizer
 def preprocess_text(text):
     """Preprocess the input text using SpaCy and return word indices."""
     docs = nlp.pipe([text], n_process=1)
@@ -40,30 +46,124 @@ def preprocess_text(text):
             word_seq.append(word_dict[token.text])
     return word_seq

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+# DeBERTaV3 Model
+## Load tokenizer and model
+tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")
+
+class QuestionDataset(Dataset):
+    def __init__(self, X, y, tokenizer):
+        self.text = X
+        self.targets = y
+        self.tok = tokenizer
+
+    def __len__(self):
+        return len(self.text)
+
+    def __getitem__(self, idx):
+
+        text = self.text[idx]
+        targ = self.targets[idx]
+
+        return self.tok(text, padding='max_length',
+                        truncation=True,
+                        max_length=30,
+                        return_tensors="pt")["input_ids"][0], tensor(targ)
+
+    def new_empty(self):
+        return QuestionDataset([], [], self.tok)
+
+learn_infer = load_learner('DeBERTaV3/models/fastai_QIQC-deberta-v3.pkl', cpu=True)
+print("Learner loaded successfully.")
+
+# ## define the model
+# bert = AutoModelForSequenceClassification.from_pretrained('microsoft/deberta-v3-base').train()
+
+# classifier = nn.Sequential(
+#     nn.Linear(768, 1024),
+#     nn.ReLU(),
+#     nn.Dropout(0.5),
+#     nn.Linear(1024, 2)
+# )
+
+# bert.classifier = classifier
+
+# class BertClassifier(Module):
+#     def __init__(self, bert):
+#         self.bert = bert
+#     def forward(self, x):
+#         x = self.bert(x)
+#         return x.logits
+
+# model = BertClassifier(bert)
+
+## Recreate the DataLoader
+class TestDS:
+    def __init__(self, tensors):
+        self.tensors = tensors
+
+    def __len__(self):
+        return len(self.tensors)
+
+    def __getitem__(self, idx):
+        t = self.tensors[idx]
+        return t, tensor(0)
+
+class DeBERTaV3Model:
+    def __init__(self):
+        pass
+
+    def predict(self, text):
+        # Preprocess the text
+        test_tensor = tokenizer(text, padding="max_length", truncation=True, max_length=55, return_tensors="pt")["input_ids"]
+        test_dl = DataLoader(TestDS(test_tensor), bs=128)

+        # Get predictions
+        preds = learn_infer.get_preds(dl=test_dl)
+        label = "Insincere" if (F.softmax(preds[0], dim=1)[:, 1]>0.4878) else "Sincere"
+        probs = {
+            "Probability": float(F.softmax(preds[0], dim=1)[:, 1]),
+            "Sequence": test_tensor
+        }
+        return label, probs
+
+class BiLSTMModel:
+    def __init__(self):
+        pass
+
+    def predict(self, text):
+        # Preprocess the text
+        seq = preprocess_text(text)
+        padded_seq = tf.keras.preprocessing.sequence.pad_sequences([seq], maxlen=55)
+
+        BATCH_SIZE = 512
+        # Get predictions from each model
+        pred1 = 0.15 * np.squeeze(model_1.predict(padded_seq, batch_size=BATCH_SIZE, verbose=2))
+        pred2 = 0.35 * np.squeeze(model_2.predict(padded_seq, batch_size=BATCH_SIZE, verbose=2))
+        pred3 = 0.15 * np.squeeze(model_3.predict(padded_seq, batch_size=BATCH_SIZE, verbose=2))
+        pred4 = 0.35 * np.squeeze(model_4.predict(padded_seq, batch_size=BATCH_SIZE, verbose=2))
+
+        # Combine predictions
+        avg_pred = pred1 + pred2 + pred3 + pred4
+        label = "Insincere" if avg_pred > 0.35 else "Sincere"
+
+        probs = {
+            "Probability": float(avg_pred),
+            "Model Probabilities": {"Model 1": float(pred1), "Model 2": float(pred2), "Model 3": float(pred3), "Model 4": float(pred4)},
+            "Sequence": seq
+        }
+        return label, probs
+
+class QuestionClassifier:
+    """Main Class to manage the models"""
+    def __init__(self):
+        self.models = {
+            "DeBERTaV3": DeBERTaV3Model(),
+            "BiLSTM": BiLSTMModel()
+        }
+
+    def classify(self, model_name, text):
+        return self.models[model_name].predict(text)
+
 # Example questions
 examples = [
     "How do you train a pigeon to send messages?",
@@ -72,19 +172,28 @@
     "Which person has given the least f**ks and still turned out successful?"
 ]

-
-
-
-
-
-
-
-
-
-
-
-
-
-)
-
-
+def create_gradio_interface():
+    classifier = QuestionClassifier()
+
+    def classify_question(model_name, text):
+        return classifier.classify(model_name, text)
+
+    interface = gr.Interface(
+        fn=classify_question,
+        inputs=[
+            gr.Dropdown(choices=["DeBERTaV3", "BiLSTM"], label="Select Model", value="BiLSTM"),
+            gr.Textbox(lines=2, placeholder="Enter your question here...", label="Input Question")
+        ],
+        outputs=[
+            gr.Textbox(label="Prediction"),
+            gr.JSON(label="Model Probabilities")
+        ],
+        title="Quora Insincere Questions Classifier",
+        examples=examples,
+        description="Enter your question to classify it as sincere or insincere. Select an example question below."
+    )
+    interface.launch()
+
+
+if __name__ == "__main__":
+    create_gradio_interface()
pip
ADDED
File without changes
requirements.txt
CHANGED
@@ -4,3 +4,8 @@ h5py
 spacy
 ml-dtypes
 thinc
+torch
+transformers
+fastai
+tiktoken
+sentencepiece
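The five added lines cover the new DeBERTaV3 path: `torch` and `transformers` for the backbone, `fastai` for the exported learner, and `sentencepiece` (plus `tiktoken`) for tokenizer support. A minimal smoke test, assuming a fresh environment after `pip install -r requirements.txt`:

```python
# Minimal smoke test: the new dependencies import and report their versions.
import fastai
import sentencepiece
import tiktoken
import torch
import transformers

print(torch.__version__, transformers.__version__, fastai.__version__)
```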