Spaces:
Running
Running
Adding nltk_data
#2
by
aaryan24
- opened
- LICENSE +0 -21
- README.md +1 -57
- app.py +6 -16
- nltk_data.zip +0 -3
- requirements.txt +4 -4
LICENSE
DELETED
@@ -1,21 +0,0 @@
|
|
1 |
-
MIT License
|
2 |
-
|
3 |
-
Copyright (c) 2025 Jainil Patel
|
4 |
-
|
5 |
-
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
-
of this software and associated documentation files (the "Software"), to deal
|
7 |
-
in the Software without restriction, including without limitation the rights
|
8 |
-
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
-
copies of the Software, and to permit persons to whom the Software is
|
10 |
-
furnished to do so, subject to the following conditions:
|
11 |
-
|
12 |
-
The above copyright notice and this permission notice shall be included in
|
13 |
-
all copies or substantial portions of the Software.
|
14 |
-
|
15 |
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21 |
-
THE SOFTWARE.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
README.md
CHANGED
@@ -11,60 +11,4 @@ license: mit
|
|
11 |
short_description: 'Detects Fake News using the ensemble of 3 Models '
|
12 |
---
|
13 |
|
14 |
-
|
15 |
-
|
16 |
-
**Detects Fake News using an ensemble of 3 Models (Naive Bayes, Logistic Regression, and GloVe-based embeddings)**
|
17 |
-
|
18 |
-
---
|
19 |
-
|
20 |
-
## 🚨 Important Disclaimer
|
21 |
-
|
22 |
-
> ⚠️ This project is built purely for **educational and experimental purposes** to explore basic Natural Language Processing (NLP) and Machine Learning (ML) techniques.
|
23 |
-
>
|
24 |
-
> ❌ It is **not suitable for real-world fact-checking or decision-making**.
|
25 |
-
>
|
26 |
-
> The models used are simple, non-contextual, and cannot understand language nuances or factual correctness. Misusing this tool for serious analysis may lead to incorrect or harmful conclusions.
|
27 |
-
>
|
28 |
-
> **Please do not trust or rely on the outputs of this demo.** It is meant for **learning only.**
|
29 |
-
|
30 |
-
---
|
31 |
-
|
32 |
-
## 🎯 Purpose
|
33 |
-
|
34 |
-
This project was created as a part of our research internship as a way to:
|
35 |
-
- Practice building an ensemble model using different NLP approaches
|
36 |
-
- Learn to deploy ML apps with Gradio and Hugging Face Spaces
|
37 |
-
- Experiment with basic text classification on news headlines/articles
|
38 |
-
|
39 |
-
It is **not** a robust or reliable system for determining truth or accuracy in media.
|
40 |
-
|
41 |
-
---
|
42 |
-
|
43 |
-
## ⚙️ How It Works
|
44 |
-
|
45 |
-
This Fake News Detector uses an ensemble of 3 models:
|
46 |
-
|
47 |
-
1. **Naive Bayes with TF-IDF** — assigns 55% weight
|
48 |
-
2. **Logistic Regression** — assigns 10% weight
|
49 |
-
3. **GloVe Embedding-Based Classifier** — assigns 35% weight
|
50 |
-
|
51 |
-
Each model contributes a score between 0 and 1 indicating the likelihood of the input text being "Real." The final prediction is based on a weighted average.
|
52 |
-
|
53 |
-
---
|
54 |
-
|
55 |
-
## π License & Attribution
|
56 |
-
|
57 |
-
This project is licensed under the **MIT License**.
|
58 |
-
|
59 |
-
### Libraries and Tools Used:
|
60 |
-
- 🧠 [GloVe Embeddings by Stanford NLP](https://nlp.stanford.edu/projects/glove/)
|
61 |
-
- π [Gradio Interface Library](https://www.gradio.app/)
|
62 |
-
- π [scikit-learn](https://scikit-learn.org/) for model implementation
|
63 |
-
- π [NLTK](https://www.nltk.org/) for basic NLP preprocessing
|
64 |
-
- [Dataset](https://www.kaggle.com/datasets/stevenpeutz/misinformation-fake-news-text-dataset-79k)
|
65 |
-
## 📦 Installation
|
66 |
-
|
67 |
-
```bash
|
68 |
-
pip install -r requirements.txt
|
69 |
-
python app.py
|
70 |
-
|
|
|
11 |
short_description: 'Detects Fake News using the ensemble of 3 Models '
|
12 |
---
|
13 |
|
14 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app.py
CHANGED
@@ -9,18 +9,9 @@ from nltk.corpus import stopwords
|
|
9 |
from nltk.stem import WordNetLemmatizer
|
10 |
from nltk.tokenize import word_tokenize
|
11 |
import nltk
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
# Unzip local nltk_data.zip if not already unzipped
|
16 |
-
nltk_data_path = os.path.join(os.path.dirname(__file__), 'nltk_data')
|
17 |
-
if not os.path.exists(nltk_data_path):
|
18 |
-
with zipfile.ZipFile('nltk_data.zip', 'r') as zip_ref:
|
19 |
-
zip_ref.extractall(nltk_data_path)
|
20 |
-
|
21 |
-
# Tell NLTK to use the local data path
|
22 |
-
nltk.data.path.append(nltk_data_path)
|
23 |
-
|
24 |
|
25 |
# ============ Load Models and Tokenizers ============
|
26 |
with open("logreg_model.pkl", "rb") as f:
|
@@ -63,7 +54,7 @@ def predict_ensemble(text):
|
|
63 |
cleaned = clean_text(text)
|
64 |
|
65 |
# Check if cleaned text is too short
|
66 |
-
if len(cleaned.strip())
|
67 |
return "Input too short to analyze."
|
68 |
|
69 |
# TF-IDF-based predictions
|
@@ -77,8 +68,8 @@ def predict_ensemble(text):
|
|
77 |
prob_glove = model_glove.predict(glove_pad)[0][0]
|
78 |
|
79 |
# Weighted ensemble
|
80 |
-
ensemble_score = 0.
|
81 |
-
label = "β
Real News" if ensemble_score >= 0.
|
82 |
|
83 |
# Optional: Include probabilities
|
84 |
# Naive Bayes:
|
@@ -101,7 +92,6 @@ interface = gr.Interface(
|
|
101 |
outputs=gr.Markdown(label="Prediction"),
|
102 |
title="π° Fake News Detector",
|
103 |
description="This tool uses 3 models (Naive Bayes, Logistic Regression, GloVe-based Deep Learning) to classify news as real or fake using an ensemble method.",
|
104 |
-
article="β οΈ **Disclaimer:** This demo is for educational and experimental purposes only. It is not suitable for real-world fact-checking or decision-making. Please do not rely on this tool.",
|
105 |
allow_flagging="never"
|
106 |
)
|
107 |
|
|
|
9 |
from nltk.stem import WordNetLemmatizer
|
10 |
from nltk.tokenize import word_tokenize
|
11 |
import nltk
|
12 |
+
nltk.download('punkt')
|
13 |
+
nltk.download('stopwords')
|
14 |
+
nltk.download('wordnet')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
|
16 |
# ============ Load Models and Tokenizers ============
|
17 |
with open("logreg_model.pkl", "rb") as f:
|
|
|
54 |
cleaned = clean_text(text)
|
55 |
|
56 |
# Check if cleaned text is too short
|
57 |
+
if len(cleaned.strip()) == 10:
|
58 |
return "Input too short to analyze."
|
59 |
|
60 |
# TF-IDF-based predictions
|
|
|
68 |
prob_glove = model_glove.predict(glove_pad)[0][0]
|
69 |
|
70 |
# Weighted ensemble
|
71 |
+
ensemble_score = 0.55 * prob_nb + 0.1 * prob_logreg + 0.35 * prob_glove
|
72 |
+
label = "β
Real News" if ensemble_score >= 0.45 else "β Fake News"
|
73 |
|
74 |
# Optional: Include probabilities
|
75 |
# Naive Bayes:
|
|
|
92 |
outputs=gr.Markdown(label="Prediction"),
|
93 |
title="π° Fake News Detector",
|
94 |
description="This tool uses 3 models (Naive Bayes, Logistic Regression, GloVe-based Deep Learning) to classify news as real or fake using an ensemble method.",
|
|
|
95 |
allow_flagging="never"
|
96 |
)
|
97 |
|
nltk_data.zip
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:e7ca5b931a531c962d2539b042daf7d37badd4ce59523dfa063083f61a1dae72
|
3 |
-
size 52292335
|
|
|
|
|
|
|
|
requirements.txt
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
-
gradio
|
2 |
-
tensorflow
|
3 |
-
scikit-learn
|
4 |
-
nltk
|
5 |
numpy
|
|
|
1 |
+
gradio
|
2 |
+
tensorflow
|
3 |
+
scikit-learn
|
4 |
+
nltk
|
5 |
numpy
|