non2013 committed on
Commit
964d107
·
1 Parent(s): ca3c933

add debertav3

Browse files
lemma_dict.pkl → BiLSTM/lemma_dict.pkl RENAMED
File without changes
model_1.h5 → BiLSTM/model_1.h5 RENAMED
File without changes
model_2.h5 → BiLSTM/model_2.h5 RENAMED
File without changes
model_3.h5 → BiLSTM/model_3.h5 RENAMED
File without changes
model_4.h5 → BiLSTM/model_4.h5 RENAMED
File without changes
word_dict.pkl → BiLSTM/word_dict.pkl RENAMED
File without changes
DeBERTaV3/QIQC-deberta-v3/added_tokens.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"[MASK]": 128000}
DeBERTaV3/QIQC-deberta-v3/special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
DeBERTaV3/QIQC-deberta-v3/spm.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c679fbf93643d19aab7ee10c0b99e460bdbc02fedf34b92b05af343b4af586fd
3
+ size 2464616
DeBERTaV3/QIQC-deberta-v3/tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"do_lower_case": false, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "split_by_punct": false, "vocab_type": "spm", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "microsoft/deberta-v3-base"}
DeBERTaV3/models/fastai_QIQC-deberta-v3.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ea49299262eff9de87d248861c80e826f3a710a6a14410c899fc2de8b4ea24a
3
+ size 746435557
app.py CHANGED
@@ -7,27 +7,33 @@ import spacy
7
  from tqdm import tqdm
8
  import gc
9
  import os
 
 
 
 
 
10
 
11
- # Download the SpaCy model
 
 
 
12
  os.system("python -m spacy download en_core_web_lg")
13
 
14
- # Load models
15
- model_1 = tf.keras.models.load_model("model_1.h5")
16
- model_2 = tf.keras.models.load_model("model_2.h5")
17
- model_3 = tf.keras.models.load_model("model_3.h5")
18
- model_4 = tf.keras.models.load_model("model_4.h5")
19
 
20
- # Load dictionaries
21
- with open('word_dict.pkl', 'rb') as f:
22
  word_dict = pickle.load(f)
23
 
24
- with open('lemma_dict.pkl', 'rb') as f:
25
- lemma_dict = pickle.load(f)
26
-
27
- # Load SpaCy NLP model
28
  nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner', 'tagger'])
29
  nlp.vocab.add_flag(lambda s: s.lower() in spacy.lang.en.stop_words.STOP_WORDS, spacy.attrs.IS_STOP)
30
 
 
31
  def preprocess_text(text):
32
  """Preprocess the input text using SpaCy and return word indices."""
33
  docs = nlp.pipe([text], n_process=1)
@@ -40,30 +46,124 @@ def preprocess_text(text):
40
  word_seq.append(word_dict[token.text])
41
  return word_seq
42
 
43
- def classify_question(text):
44
- # Preprocess the text
45
- seq = preprocess_text(text)
46
- padded_seq = tf.keras.preprocessing.sequence.pad_sequences([seq], maxlen=55) # Adjust maxlen if needed
47
- BATCH_SIZE = 512
48
- # Get predictions from each model
49
- pred1 = 0.15 * np.squeeze(model_1.predict(padded_seq, batch_size=BATCH_SIZE, verbose=2))
50
- pred2 = 0.35 * np.squeeze(model_2.predict(padded_seq, batch_size=BATCH_SIZE, verbose=2))
51
- pred3 = 0.15 * np.squeeze(model_3.predict(padded_seq, batch_size=BATCH_SIZE, verbose=2))
52
- pred4 = 0.35 * np.squeeze(model_4.predict(padded_seq, batch_size=BATCH_SIZE, verbose=2))
53
-
54
- # Combine predictions
55
- avg_pred = pred1 + pred2 + pred3 + pred4
56
- label = "Insincere" if avg_pred > 0.35 else "Sincere"
57
-
58
- # Create a list of probabilities for each model
59
- probs = {
60
- "Probability": float(avg_pred),
61
- "Model Probabilities": {"Model 1": float(pred1), "Model 2": float(pred2), "Model 3": float(pred3), "Model 4": float(pred4)},
62
- "Sequence": seq
63
- }
64
-
65
- return label, probs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  # Example questions
68
  examples = [
69
  "How do you train a pigeon to send messages?",
@@ -72,19 +172,28 @@ examples = [
72
  "Which person has given the least f**ks and still turned out successful?"
73
  ]
74
 
75
- # Gradio Interface
76
- interface = gr.Interface(
77
- fn=classify_question,
78
- inputs=[
79
- gr.Textbox(lines=2, placeholder="Enter your question here..."),
80
- ],
81
- outputs=[
82
- "text", # Output for label
83
- "json" # Output for probabilities
84
- ],
85
- title="Quora Insincere Questions Classifier",
86
- examples=examples,
87
- description="Enter your question to classify it as sincere or insincere. Select an example question below."
88
- )
89
-
90
- interface.launch()
 
 
 
 
 
 
 
 
 
 
7
  from tqdm import tqdm
8
  import gc
9
  import os
10
+ import torch
11
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
12
+ from fastai.vision.all import *
13
+ from fastai.text.all import *
14
+ from torch.utils.data import Dataset
15
 
16
+ model_lst = ["DeBERTaV3", "BiLSTM"]
17
+
18
+ # BiLSTM Model
19
+ ## Download the SpaCy model
20
  os.system("python -m spacy download en_core_web_lg")
21
 
22
+ ## Load models
23
+ model_1 = tf.keras.models.load_model("BiLSTM/model_1.h5")
24
+ model_2 = tf.keras.models.load_model("BiLSTM/model_2.h5")
25
+ model_3 = tf.keras.models.load_model("BiLSTM/model_3.h5")
26
+ model_4 = tf.keras.models.load_model("BiLSTM/model_4.h5")
27
 
28
+ ## Load dictionaries
29
+ with open('BiLSTM/word_dict.pkl', 'rb') as f:
30
  word_dict = pickle.load(f)
31
 
32
+ ## Load SpaCy NLP model
 
 
 
33
  nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner', 'tagger'])
34
  nlp.vocab.add_flag(lambda s: s.lower() in spacy.lang.en.stop_words.STOP_WORDS, spacy.attrs.IS_STOP)
35
 
36
+ ## tokenizer
37
  def preprocess_text(text):
38
  """Preprocess the input text using SpaCy and return word indices."""
39
  docs = nlp.pipe([text], n_process=1)
 
46
  word_seq.append(word_dict[token.text])
47
  return word_seq
48
 
49
+ # DeBERTaV3 Model
50
+ ## Load tokenizer and model
51
+ tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")
52
+
53
+ class QuestionDataset(Dataset):
54
+ def __init__(self, X, y, tokenizer):
55
+ self.text = X
56
+ self.targets = y
57
+ self.tok = tokenizer
58
+
59
+ def __len__(self):
60
+ return len(self.text)
61
+
62
+ def __getitem__(self, idx):
63
+
64
+ text = self.text[idx]
65
+ targ = self.targets[idx]
66
+
67
+ return self.tok(text, padding='max_length',
68
+ truncation=True,
69
+ max_length=30,
70
+ return_tensors="pt")["input_ids"][0], tensor(targ)
71
+
72
+ def new_empty(self):
73
+ return QuestionDataset([], [], self.tok)
74
+
75
+ learn_infer = load_learner('DeBERTaV3/models/fastai_QIQC-deberta-v3.pkl', cpu=True)
76
+ print("Learner loaded successfully.")
77
+
78
+ # ## define the model
79
+ # bert = AutoModelForSequenceClassification.from_pretrained('microsoft/deberta-v3-base').train()
80
+
81
+ # classifier = nn.Sequential(
82
+ # nn.Linear(768, 1024),
83
+ # nn.ReLU(),
84
+ # nn.Dropout(0.5),
85
+ # nn.Linear(1024, 2)
86
+ # )
87
+
88
+ # bert.classifier = classifier
89
+
90
+ # class BertClassifier(Module):
91
+ # def __init__(self, bert):
92
+ # self.bert = bert
93
+ # def forward(self, x):
94
+ # x = self.bert(x)
95
+ # return x.logits
96
+
97
+ # model = BertClassifier(bert)
98
+
99
+ ## Recreate the DataLoader
100
+ class TestDS:
101
+ def __init__(self, tensors):
102
+ self.tensors = tensors
103
+
104
+ def __len__(self):
105
+ return len(self.tensors)
106
+
107
+ def __getitem__(self, idx):
108
+ t = self.tensors[idx]
109
+ return t, tensor(0)
110
+
111
+ class DeBERTaV3Model:
112
+ def __init__(self):
113
+ pass
114
+
115
+ def predict(self, text):
116
+ # Preprocess the text
117
+ test_tensor = tokenizer(text, padding="max_length", truncation=True, max_length=55, return_tensors="pt")["input_ids"]
118
+ test_dl = DataLoader(TestDS(test_tensor), bs=128)
119
 
120
+ # Get predictions
121
+ preds = learn_infer.get_preds(dl=test_dl)
122
+ label = "Insincere" if (F.softmax(preds[0], dim=1)[:, 1]>0.4878) else "Sincere"
123
+ probs = {
124
+ "Probability": float(F.softmax(preds[0], dim=1)[:, 1]),
125
+ "Sequence": test_tensor
126
+ }
127
+ return label, probs
128
+
129
+ class BiLSTMModel:
130
+ def __init__(self):
131
+ pass
132
+
133
+ def predict(self, text):
134
+ # Preprocess the text
135
+ seq = preprocess_text(text)
136
+ padded_seq = tf.keras.preprocessing.sequence.pad_sequences([seq], maxlen=55)
137
+
138
+ BATCH_SIZE = 512
139
+ # Get predictions from each model
140
+ pred1 = 0.15 * np.squeeze(model_1.predict(padded_seq, batch_size=BATCH_SIZE, verbose=2))
141
+ pred2 = 0.35 * np.squeeze(model_2.predict(padded_seq, batch_size=BATCH_SIZE, verbose=2))
142
+ pred3 = 0.15 * np.squeeze(model_3.predict(padded_seq, batch_size=BATCH_SIZE, verbose=2))
143
+ pred4 = 0.35 * np.squeeze(model_4.predict(padded_seq, batch_size=BATCH_SIZE, verbose=2))
144
+
145
+ # Combine predictions
146
+ avg_pred = pred1 + pred2 + pred3 + pred4
147
+ label = "Insincere" if avg_pred > 0.35 else "Sincere"
148
+
149
+ probs = {
150
+ "Probability": float(avg_pred),
151
+ "Model Probabilities": {"Model 1": float(pred1), "Model 2": float(pred2), "Model 3": float(pred3), "Model 4": float(pred4)},
152
+ "Sequence": seq
153
+ }
154
+ return label, probs
155
+
156
+ class QuestionClassifier:
157
+ """Main Class to manage the models"""
158
+ def __init__(self):
159
+ self.models = {
160
+ "DeBERTaV3": DeBERTaV3Model(),
161
+ "BiLSTM": BiLSTMModel()
162
+ }
163
+
164
+ def classify(self, model_name, text):
165
+ return self.models[model_name].predict(text)
166
+
167
  # Example questions
168
  examples = [
169
  "How do you train a pigeon to send messages?",
 
172
  "Which person has given the least f**ks and still turned out successful?"
173
  ]
174
 
175
+ def create_gradio_interface():
176
+ classifier = QuestionClassifier()
177
+
178
+ def classify_question(model_name, text):
179
+ return classifier.classify(model_name, text)
180
+
181
+ interface = gr.Interface(
182
+ fn=classify_question,
183
+ inputs=[
184
+ gr.Dropdown(choices=["DeBERTaV3", "BiLSTM"], label="Select Model", value="BiLSTM"),
185
+ gr.Textbox(lines=2, placeholder="Enter your question here...", label="Input Question")
186
+ ],
187
+ outputs=[
188
+ gr.Textbox(label="Prediction"),
189
+ gr.JSON(label="Model Probabilities")
190
+ ],
191
+ title="Quora Insincere Questions Classifier",
192
+ examples=examples,
193
+ description="Enter your question to classify it as sincere or insincere. Select an example question below."
194
+ )
195
+ interface.launch()
196
+
197
+
198
+ if __name__ == "__main__":
199
+ create_gradio_interface()
pip ADDED
File without changes
requirements.txt CHANGED
@@ -4,3 +4,8 @@ h5py
4
  spacy
5
  ml-dtypes
6
  thinc
 
 
 
 
 
 
4
  spacy
5
  ml-dtypes
6
  thinc
7
+ torch
8
+ transformers
9
+ fastai
10
+ tiktoken
11
+ sentencepiece