add debertav3
- lemma_dict.pkl → BiLSTM/lemma_dict.pkl +0 -0
- model_1.h5 → BiLSTM/model_1.h5 +0 -0
- model_2.h5 → BiLSTM/model_2.h5 +0 -0
- model_3.h5 → BiLSTM/model_3.h5 +0 -0
- model_4.h5 → BiLSTM/model_4.h5 +0 -0
- word_dict.pkl → BiLSTM/word_dict.pkl +0 -0
- DeBERTaV3/QIQC-deberta-v3/added_tokens.json +1 -0
- DeBERTaV3/QIQC-deberta-v3/special_tokens_map.json +1 -0
- DeBERTaV3/QIQC-deberta-v3/spm.model +3 -0
- DeBERTaV3/QIQC-deberta-v3/tokenizer_config.json +1 -0
- DeBERTaV3/models/fastai_QIQC-deberta-v3.pkl +3 -0
- app.py +160 -51
- pip +0 -0
- requirements.txt +5 -0
lemma_dict.pkl → BiLSTM/lemma_dict.pkl
RENAMED
File without changes

model_1.h5 → BiLSTM/model_1.h5
RENAMED
File without changes

model_2.h5 → BiLSTM/model_2.h5
RENAMED
File without changes

model_3.h5 → BiLSTM/model_3.h5
RENAMED
File without changes

model_4.h5 → BiLSTM/model_4.h5
RENAMED
File without changes

word_dict.pkl → BiLSTM/word_dict.pkl
RENAMED
File without changes
DeBERTaV3/QIQC-deberta-v3/added_tokens.json
ADDED
@@ -0,0 +1 @@
+{"[MASK]": 128000}
DeBERTaV3/QIQC-deberta-v3/special_tokens_map.json
ADDED
@@ -0,0 +1 @@
+{"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
DeBERTaV3/QIQC-deberta-v3/spm.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c679fbf93643d19aab7ee10c0b99e460bdbc02fedf34b92b05af343b4af586fd
+size 2464616
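`spm.model` is stored as a Git LFS pointer, so the three lines above are metadata, not the model itself. Once the real blob has been fetched (e.g. with `git lfs pull`), the SentencePiece model can be inspected directly — a minimal sketch, assuming the fetched file sits at the committed path:

```python
# Minimal sketch: peek inside the SentencePiece model behind the tokenizer.
# Assumes the real LFS blob (not the pointer file) is at this path.
import sentencepiece as spm

sp = spm.SentencePieceProcessor(model_file="DeBERTaV3/QIQC-deberta-v3/spm.model")
print(sp.get_piece_size())  # piece count; [MASK] is added on top at id 128000
print(sp.encode("How do you train a pigeon?", out_type=str))  # subword pieces
```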
DeBERTaV3/QIQC-deberta-v3/tokenizer_config.json
ADDED
@@ -0,0 +1 @@
+{"do_lower_case": false, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "split_by_punct": false, "vocab_type": "spm", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "microsoft/deberta-v3-base"}
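Together with `added_tokens.json`, `special_tokens_map.json`, and `spm.model`, this config is everything `transformers` needs to rebuild the tokenizer from the repo instead of the Hub. A minimal sketch, assuming `sentencepiece` is installed (the directory ships no `config.json`, so the concrete `DebertaV2Tokenizer` class is used here rather than `AutoTokenizer`):

```python
# Minimal sketch: load the committed tokenizer files locally.
from transformers import DebertaV2Tokenizer

tok = DebertaV2Tokenizer.from_pretrained("DeBERTaV3/QIQC-deberta-v3")
print(tok.mask_token, tok.mask_token_id)  # "[MASK]", 128000 per added_tokens.json
print(tok("Is this question sincere?", truncation=True, max_length=30)["input_ids"])
```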
DeBERTaV3/models/fastai_QIQC-deberta-v3.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3ea49299262eff9de87d248861c80e826f3a710a6a14410c899fc2de8b4ea24a
+size 746435557
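Like `spm.model`, this ~746 MB learner pickle is committed as an LFS pointer: a spec version, a sha256 oid, and a byte size. A minimal sketch for checking that a fetched file matches its pointer, standard library only (`verify_lfs_blob` is a hypothetical helper, not part of this repo):

```python
# Minimal sketch: verify a checked-out Git LFS file against its pointer metadata.
import hashlib
import os

def verify_lfs_blob(path: str, oid: str, size: int) -> bool:
    """Hypothetical helper: compare on-disk size and sha256 to the LFS pointer."""
    if os.path.getsize(path) != size:
        return False
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest() == oid

print(verify_lfs_blob(
    "DeBERTaV3/models/fastai_QIQC-deberta-v3.pkl",
    "3ea49299262eff9de87d248861c80e826f3a710a6a14410c899fc2de8b4ea24a",
    746435557,
))
```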
app.py
CHANGED
@@ -7,27 +7,33 @@ import spacy
 from tqdm import tqdm
 import gc
 import os
+import torch
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from fastai.vision.all import *
+from fastai.text.all import *
+from torch.utils.data import Dataset

-
+model_lst = ["DeBERTaV3", "BiLSTM"]
+
+# BiLSTM Model
+## Download the SpaCy model
 os.system("python -m spacy download en_core_web_lg")

-
-model_1 = tf.keras.models.load_model("model_1.h5")
-model_2 = tf.keras.models.load_model("model_2.h5")
-model_3 = tf.keras.models.load_model("model_3.h5")
-model_4 = tf.keras.models.load_model("model_4.h5")
+## Load models
+model_1 = tf.keras.models.load_model("BiLSTM/model_1.h5")
+model_2 = tf.keras.models.load_model("BiLSTM/model_2.h5")
+model_3 = tf.keras.models.load_model("BiLSTM/model_3.h5")
+model_4 = tf.keras.models.load_model("BiLSTM/model_4.h5")

-
-with open('word_dict.pkl', 'rb') as f:
+## Load dictionaries
+with open('BiLSTM/word_dict.pkl', 'rb') as f:
     word_dict = pickle.load(f)

-
-    lemma_dict = pickle.load(f)
-
-# Load SpaCy NLP model
+## Load SpaCy NLP model
 nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner', 'tagger'])
 nlp.vocab.add_flag(lambda s: s.lower() in spacy.lang.en.stop_words.STOP_WORDS, spacy.attrs.IS_STOP)

+## tokenizer
 def preprocess_text(text):
     """Preprocess the input text using SpaCy and return word indices."""
     docs = nlp.pipe([text], n_process=1)
@@ -40,30 +46,124 @@ def preprocess_text(text):
             word_seq.append(word_dict[token.text])
     return word_seq

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+# DeBERTaV3 Model
+## Load tokenizer and model
+tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")
+
+class QuestionDataset(Dataset):
+    def __init__(self, X, y, tokenizer):
+        self.text = X
+        self.targets = y
+        self.tok = tokenizer
+
+    def __len__(self):
+        return len(self.text)
+
+    def __getitem__(self, idx):
+
+        text = self.text[idx]
+        targ = self.targets[idx]
+
+        return self.tok(text, padding='max_length',
+                        truncation=True,
+                        max_length=30,
+                        return_tensors="pt")["input_ids"][0], tensor(targ)
+
+    def new_empty(self):
+        return QuestionDataset([], [], self.tok)
+
+learn_infer = load_learner('DeBERTaV3/models/fastai_QIQC-deberta-v3.pkl', cpu=True)
+print("Learner loaded successfully.")
+
+# ## define the model
+# bert = AutoModelForSequenceClassification.from_pretrained('microsoft/deberta-v3-base').train()
+
+# classifier = nn.Sequential(
+#     nn.Linear(768, 1024),
+#     nn.ReLU(),
+#     nn.Dropout(0.5),
+#     nn.Linear(1024, 2)
+# )
+
+# bert.classifier = classifier
+
+# class BertClassifier(Module):
+#     def __init__(self, bert):
+#         self.bert = bert
+#     def forward(self, x):
+#         x = self.bert(x)
+#         return x.logits
+
+# model = BertClassifier(bert)
+
+## Recreate the DataLoader
+class TestDS:
+    def __init__(self, tensors):
+        self.tensors = tensors
+
+    def __len__(self):
+        return len(self.tensors)
+
+    def __getitem__(self, idx):
+        t = self.tensors[idx]
+        return t, tensor(0)
+
+class DeBERTaV3Model:
+    def __init__(self):
+        pass
+
+    def predict(self, text):
+        # Preprocess the text
+        test_tensor = tokenizer(text, padding="max_length", truncation=True, max_length=55, return_tensors="pt")["input_ids"]
+        test_dl = DataLoader(TestDS(test_tensor), bs=128)

+        # Get predictions
+        preds = learn_infer.get_preds(dl=test_dl)
+        label = "Insincere" if (F.softmax(preds[0], dim=1)[:, 1]>0.4878) else "Sincere"
+        probs = {
+            "Probability": float(F.softmax(preds[0], dim=1)[:, 1]),
+            "Sequence": test_tensor
+        }
+        return label, probs
+
+class BiLSTMModel:
+    def __init__(self):
+        pass
+
+    def predict(self, text):
+        # Preprocess the text
+        seq = preprocess_text(text)
+        padded_seq = tf.keras.preprocessing.sequence.pad_sequences([seq], maxlen=55)
+
+        BATCH_SIZE = 512
+        # Get predictions from each model
+        pred1 = 0.15 * np.squeeze(model_1.predict(padded_seq, batch_size=BATCH_SIZE, verbose=2))
+        pred2 = 0.35 * np.squeeze(model_2.predict(padded_seq, batch_size=BATCH_SIZE, verbose=2))
+        pred3 = 0.15 * np.squeeze(model_3.predict(padded_seq, batch_size=BATCH_SIZE, verbose=2))
+        pred4 = 0.35 * np.squeeze(model_4.predict(padded_seq, batch_size=BATCH_SIZE, verbose=2))
+
+        # Combine predictions
+        avg_pred = pred1 + pred2 + pred3 + pred4
+        label = "Insincere" if avg_pred > 0.35 else "Sincere"
+
+        probs = {
+            "Probability": float(avg_pred),
+            "Model Probabilities": {"Model 1": float(pred1), "Model 2": float(pred2), "Model 3": float(pred3), "Model 4": float(pred4)},
+            "Sequence": seq
+        }
+        return label, probs
+
+class QuestionClassifier:
+    """Main Class to manage the models"""
+    def __init__(self):
+        self.models = {
+            "DeBERTaV3": DeBERTaV3Model(),
+            "BiLSTM": BiLSTMModel()
+        }
+
+    def classify(self, model_name, text):
+        return self.models[model_name].predict(text)
+
 # Example questions
 examples = [
     "How do you train a pigeon to send messages?",
@@ -72,19 +172,28 @@
     "Which person has given the least f**ks and still turned out successful?"
 ]

-
-
-
-
-
-
-
-
-
-
-
-
-
-)
-
-
+def create_gradio_interface():
+    classifier = QuestionClassifier()
+
+    def classify_question(model_name, text):
+        return classifier.classify(model_name, text)
+
+    interface = gr.Interface(
+        fn=classify_question,
+        inputs=[
+            gr.Dropdown(choices=["DeBERTaV3", "BiLSTM"], label="Select Model", value="BiLSTM"),
+            gr.Textbox(lines=2, placeholder="Enter your question here...", label="Input Question")
+        ],
+        outputs=[
+            gr.Textbox(label="Prediction"),
+            gr.JSON(label="Model Probabilities")
+        ],
+        title="Quora Insincere Questions Classifier",
+        examples=examples,
+        description="Enter your question to classify it as sincere or insincere. Select an example question below."
+    )
+    interface.launch()
+
+
+if __name__ == "__main__":
+    create_gradio_interface()
pip
ADDED
File without changes
requirements.txt
CHANGED
@@ -4,3 +4,8 @@ h5py
 spacy
 ml-dtypes
 thinc
+torch
+transformers
+fastai
+tiktoken
+sentencepiece
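The five added lines cover the new DeBERTaV3 path: `torch` and `transformers` for the backbone, `fastai` for the exported learner, and `sentencepiece` (plus `tiktoken`) for tokenizer support. A minimal smoke test, assuming a fresh environment after `pip install -r requirements.txt`:

```python
# Minimal smoke test: the new dependencies import and report their versions.
import fastai
import sentencepiece
import tiktoken
import torch
import transformers

print(torch.__version__, transformers.__version__, fastai.__version__)
```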