Files changed (5) hide show
  1. LICENSE +0 -21
  2. README.md +1 -57
  3. app.py +6 -16
  4. nltk_data.zip +0 -3
  5. requirements.txt +4 -4
LICENSE DELETED
@@ -1,21 +0,0 @@
1
- MIT License
2
-
3
- Copyright (c) 2025 Jainil Patel
4
-
5
- Permission is hereby granted, free of charge, to any person obtaining a copy
6
- of this software and associated documentation files (the "Software"), to deal
7
- in the Software without restriction, including without limitation the rights
8
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
- copies of the Software, and to permit persons to whom the Software is
10
- furnished to do so, subject to the following conditions:
11
-
12
- The above copyright notice and this permission notice shall be included in
13
- all copies or substantial portions of the Software.
14
-
15
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
- THE SOFTWARE.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
README.md CHANGED
@@ -11,60 +11,4 @@ license: mit
11
  short_description: 'Detects Fake News using the ensemble of 3 Models '
12
  ---
13
 
14
- # 📚 Fake News Detector
15
-
16
- **Detects Fake News using an ensemble of 3 Models (Naive Bayes, Logistic Regression, and GloVe-based embeddings)**
17
-
18
- ---
19
-
20
- ## 🚨 Important Disclaimer
21
-
22
- > ⚠️ This project is built purely for **educational and experimental purposes** to explore basic Natural Language Processing (NLP) and Machine Learning (ML) techniques.
23
- >
24
- > ❗ It is **not suitable for real-world fact-checking or decision-making**.
25
- >
26
- > The models used are simple, non-contextual, and cannot understand language nuances or factual correctness. Misusing this tool for serious analysis may lead to incorrect or harmful conclusions.
27
- >
28
- > **Please do not trust or rely on the outputs of this demo.** It is meant for **learning only.**
29
-
30
- ---
31
-
32
- ## 🎯 Purpose
33
-
34
- This project was created as a part of our research internship as a way to:
35
- - Practice building an ensemble model using different NLP approaches
36
- - Learn to deploy ML apps with Gradio and Hugging Face Spaces
37
- - Experiment with basic text classification on news headlines/articles
38
-
39
- It is **not** a robust or reliable system for determining truth or accuracy in media.
40
-
41
- ---
42
-
43
- ## ⚙️ How It Works
44
-
45
- This Fake News Detector uses an ensemble of 3 models:
46
-
47
- 1. **Naive Bayes with TF-IDF** – assigns 55% weight
48
- 2. **Logistic Regression** – assigns 10% weight
49
- 3. **GloVe Embedding-Based Classifier** – assigns 35% weight
50
-
51
- Each model contributes a score between 0 and 1 indicating the likelihood of the input text being "Real." The final prediction is based on a weighted average.
52
-
53
- ---
54
-
55
- ## 📄 License & Attribution
56
-
57
- This project is licensed under the **MIT License**.
58
-
59
- ### Libraries and Tools Used:
60
- - 🧠 [GloVe Embeddings by Stanford NLP](https://nlp.stanford.edu/projects/glove/)
61
- - 🌐 [Gradio Interface Library](https://www.gradio.app/)
62
- - 📚 [scikit-learn](https://scikit-learn.org/) for model implementation
63
- - 🛠 [NLTK](https://www.nltk.org/) for basic NLP preprocessing
64
- - [Dataset](https://www.kaggle.com/datasets/stevenpeutz/misinformation-fake-news-text-dataset-79k)
65
- ## 📦 Installation
66
-
67
- ```bash
68
- pip install -r requirements.txt
69
- python app.py
70
-
 
11
  short_description: 'Detects Fake News using the ensemble of 3 Models '
12
  ---
13
 
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py CHANGED
@@ -9,18 +9,9 @@ from nltk.corpus import stopwords
9
  from nltk.stem import WordNetLemmatizer
10
  from nltk.tokenize import word_tokenize
11
  import nltk
12
- import os
13
- import zipfile
14
-
15
- # Unzip local nltk_data.zip if not already unzipped
16
- nltk_data_path = os.path.join(os.path.dirname(__file__), 'nltk_data')
17
- if not os.path.exists(nltk_data_path):
18
- with zipfile.ZipFile('nltk_data.zip', 'r') as zip_ref:
19
- zip_ref.extractall(nltk_data_path)
20
-
21
- # Tell NLTK to use the local data path
22
- nltk.data.path.append(nltk_data_path)
23
-
24
 
25
  # ============ Load Models and Tokenizers ============
26
  with open("logreg_model.pkl", "rb") as f:
@@ -63,7 +54,7 @@ def predict_ensemble(text):
63
  cleaned = clean_text(text)
64
 
65
  # Check if cleaned text is too short
66
- if len(cleaned.strip()) <= 10:
67
  return "Input too short to analyze."
68
 
69
  # TF-IDF-based predictions
@@ -77,8 +68,8 @@ def predict_ensemble(text):
77
  prob_glove = model_glove.predict(glove_pad)[0][0]
78
 
79
  # Weighted ensemble
80
- ensemble_score = 0.50 * prob_nb + 0.1 * prob_logreg + 0.40 * prob_glove
81
- label = "βœ… Real News" if ensemble_score >= 0.47 else "❌ Fake News"
82
 
83
  # Optional: Include probabilities
84
  # Naive Bayes:
@@ -101,7 +92,6 @@ interface = gr.Interface(
101
  outputs=gr.Markdown(label="Prediction"),
102
  title="πŸ“° Fake News Detector",
103
  description="This tool uses 3 models (Naive Bayes, Logistic Regression, GloVe-based Deep Learning) to classify news as real or fake using an ensemble method.",
104
- article="⚠️ **Disclaimer:** This demo is for educational and experimental purposes only. It is not suitable for real-world fact-checking or decision-making. Please do not rely on this tool.",
105
  allow_flagging="never"
106
  )
107
 
 
9
  from nltk.stem import WordNetLemmatizer
10
  from nltk.tokenize import word_tokenize
11
  import nltk
12
+ nltk.download('punkt')
13
+ nltk.download('stopwords')
14
+ nltk.download('wordnet')
 
 
 
 
 
 
 
 
 
15
 
16
  # ============ Load Models and Tokenizers ============
17
  with open("logreg_model.pkl", "rb") as f:
 
54
  cleaned = clean_text(text)
55
 
56
  # Check if cleaned text is too short
57
+ if len(cleaned.strip()) == 10:
58
  return "Input too short to analyze."
59
 
60
  # TF-IDF-based predictions
 
68
  prob_glove = model_glove.predict(glove_pad)[0][0]
69
 
70
  # Weighted ensemble
71
+ ensemble_score = 0.55 * prob_nb + 0.1 * prob_logreg + 0.35 * prob_glove
72
+ label = "βœ… Real News" if ensemble_score >= 0.45 else "❌ Fake News"
73
 
74
  # Optional: Include probabilities
75
  # Naive Bayes:
 
92
  outputs=gr.Markdown(label="Prediction"),
93
  title="πŸ“° Fake News Detector",
94
  description="This tool uses 3 models (Naive Bayes, Logistic Regression, GloVe-based Deep Learning) to classify news as real or fake using an ensemble method.",
 
95
  allow_flagging="never"
96
  )
97
 
nltk_data.zip DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e7ca5b931a531c962d2539b042daf7d37badd4ce59523dfa063083f61a1dae72
3
- size 52292335
 
 
 
 
requirements.txt CHANGED
@@ -1,5 +1,5 @@
1
- gradio
2
- tensorflow
3
- scikit-learn
4
- nltk==3.7
5
  numpy
 
1
+ gradio
2
+ tensorflow
3
+ scikit-learn
4
+ nltk
5
  numpy