rizkims committed
Commit a60f537 · Parent: c2b6622

Initial commit for hoax detector

Files changed (4)
  1. app.py +72 -0
  2. ensemble_model.pkl +3 -0
  3. requirements.txt +5 -0
  4. vectorizer.pkl +3 -0
app.py ADDED
@@ -0,0 +1,72 @@
+ import gradio as gr
+ import pickle
+ from transformers import pipeline
+ import re
+ import unicodedata
+
+ # Load Hugging Face pipelines for QA and NER
+ qa_pipeline = pipeline("question-answering", model="Rifky/Indobert-QA", tokenizer="Rifky/Indobert-QA")
+ ner_pipeline = pipeline("ner", model="cahya/bert-base-indonesian-NER", tokenizer="cahya/bert-base-indonesian-NER", grouped_entities=True)
+
+ # Load the hoax detection model and its TF-IDF vectorizer
+ with open("ensemble_model.pkl", "rb") as f:
+     model = pickle.load(f)
+
+ with open("vectorizer.pkl", "rb") as f:
+     vectorizer = pickle.load(f)
+
+ def clean_text(text):
+     text = re.sub(r'[\n\r]+', ' ', text)
+     text = re.sub(r'\s{2,}', ' ', text)
+     text = text.strip()
+     text = unicodedata.normalize('NFKC', text)
+     text = text.lower()
+     text = re.sub(r'https?://\S+|www\.\S+', ' url ', text)
+     asian_char_pattern = re.compile(
+         r'[\u4e00-\u9FFF\u30A0-\u30FF\u3040-\u309F\uAC00-\uD7AF\u1100-\u11FF\u3130-\u318F]'
+     )
+     text = asian_char_pattern.sub(' ', text)
+     unwanted_scripts_pattern = re.compile(
+         r'[\u2D30-\u2D7F\uA980-\uA9DF\u1E00-\u1EFF\u0250-\u02AF\u1D00-\u1D7F]'
+     )
+     text = ' '.join(word for word in text.split() if not unwanted_scripts_pattern.search(word))
+     text = re.sub(r'[^a-z0-9\s.,!?;:\'\"()-]', ' ', text)
+     return re.sub(r'\s{2,}', ' ', text).strip()
+
+ # === Main functions ===
+ def detect_hoax(text):
+     cleaned = clean_text(text)
+     tfidf = vectorizer.transform([cleaned])
+     prediction = model.predict(tfidf)[0]
+     return "Hoaks" if prediction == 1 else "Bukan Hoaks"
+
+ def run_qa(context, question):
+     if not context or not question:
+         return "Masukkan context dan pertanyaan."
+     result = qa_pipeline(question=question, context=context)
+     return result["answer"]
+
+ def run_ner(text):
+     if not text:
+         return []
+     result = ner_pipeline(text)
+     return [(ent["word"], ent["entity_group"]) for ent in result]
+
+ # === Gradio UI ===
+ hoax_tab = gr.Interface(fn=detect_hoax, inputs="text", outputs="text", title="Deteksi Hoaks")
+
+ qa_tab = gr.Interface(
+     fn=run_qa,
+     inputs=[gr.Textbox(label="Context"), gr.Textbox(label="Pertanyaan")],
+     outputs="text",
+     title="Question Answering"
+ )
+
+ ner_tab = gr.Interface(
+     fn=run_ner,
+     inputs="text",
+     outputs=gr.HighlightedText(label="Hasil NER", combine_adjacent=True),
+     title="Named Entity Recognition"
+ )
+
+ gr.TabbedInterface([hoax_tab, qa_tab, ner_tab], ["Deteksi Hoaks", "QA", "NER"]).launch()
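The inference path above is: clean_text() normalizes the input (URLs become a "url" token, non-Latin scripts are dropped, only basic Latin characters and punctuation remain), the pickled TF-IDF vectorizer turns the cleaned text into features, and the pickled ensemble predicts 1 for "Hoaks". A minimal standalone sketch of that path, assuming ensemble_model.pkl and vectorizer.pkl sit in the working directory (the sample headline is made up for illustration):

import pickle
import re

# Load the same artifacts that app.py loads at startup.
with open("vectorizer.pkl", "rb") as f:
    vectorizer = pickle.load(f)
with open("ensemble_model.pkl", "rb") as f:
    model = pickle.load(f)

# Lightweight stand-in for clean_text(): lowercase and mask URLs.
# app.py additionally strips non-Latin scripts before vectorizing.
sample = "Pemerintah bagikan uang tunai lewat tautan www.contoh-hadiah.com"  # illustrative Indonesian headline
cleaned = re.sub(r'https?://\S+|www\.\S+', ' url ', sample.lower()).strip()

features = vectorizer.transform([cleaned])  # sparse TF-IDF features
label = model.predict(features)[0]          # ensemble prediction: 1 = hoax
print("Hoaks" if label == 1 else "Bukan Hoaks")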
ensemble_model.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:354288cdacff965e08c0de8dff13282f64f5c546b30d709d00611ca10e7d2d39
+ size 599691306
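Both .pkl entries are Git LFS pointer files rather than the serialized objects themselves: each records the pointer spec version, the SHA-256 object id, and the byte size, and Git LFS swaps in the real binary at checkout. A small sketch of reading such a pointer (parse_lfs_pointer is a hypothetical helper, and it only applies while the file on disk is still a pointer, e.g. after cloning with GIT_LFS_SKIP_SMUDGE=1):

def parse_lfs_pointer(path):
    # Each pointer line has the form "<key> <value>".
    fields = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

info = parse_lfs_pointer("ensemble_model.pkl")
print(info["oid"], int(info["size"]))  # sha256:3542... 599691306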
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ gradio
+ scikit-learn
+ transformers
+ torch
+ regex
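The requirements are unpinned package names. Note that app.py imports the standard-library re module rather than the third-party regex package, so the regex entry presumably backs the transformers tokenizer stack. A quick sanity-check sketch that an environment satisfies the list (version printing is only illustrative):

# scikit-learn installs under the sklearn import name.
import gradio, sklearn, transformers, torch, regex

for mod in (gradio, sklearn, transformers, torch, regex):
    print(mod.__name__, getattr(mod, "__version__", "unknown"))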
vectorizer.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cede0d09e18bc5cfb31d36f2b38fe1635f20bb48fcfc34f1c01fe0bea9183c3f
+ size 3180887