# reddit_scraper / app.py
# sahilmohurale03's picture
# Upload 2 files
# 50e4ef7 verified
# Install necessary libraries
# pip install praw gradio pandas
import praw
import gradio as gr
import os
# Step 1: Set up Reddit API authentication using environment variables
# Shared Reddit client. The credentials are read from the environment
# (REDDIT_CLIENT_ID / REDDIT_CLIENT_SECRET) rather than being hard-coded here.
reddit = praw.Reddit(
    user_agent="my-reddit-app/1.0",
    client_secret=os.environ.get("REDDIT_CLIENT_SECRET"),
    client_id=os.environ.get("REDDIT_CLIENT_ID"),
)
# Step 2: Function to fetch posts from subreddits
def get_posts(subreddits, keywords, time_filter="day"):
    """Search each named subreddit for posts matching any of the keywords.

    Args:
        subreddits: Comma-separated subreddit names, e.g. "technology, python".
            Whitespace around each name is ignored and empty entries (such as
            the one produced by a trailing comma) are skipped.
        keywords: Comma-separated search terms. Each term is trimmed, empty
            entries are dropped, and the remainder are joined with " OR " to
            form the search query.
        time_filter: Forwarded unchanged as the ``time_filter`` argument of the
            subreddit search.

    Returns:
        A list of dicts with the keys "title", "url", "created" and
        "selftext", one per search result, in the order the subreddits were
        given. Returns an empty list when there is no usable subreddit name or
        no usable keyword, without contacting Reddit.
    """
    names = [name.strip() for name in (subreddits or "").split(",") if name.strip()]
    terms = [term.strip() for term in (keywords or "").split(",") if term.strip()]

    # Nothing to search in, or nothing to search for: avoid sending a
    # request for an empty subreddit name or an empty query.
    if not names or not terms:
        return []

    # The query is the same for every subreddit, so build it once.
    query = " OR ".join(terms)

    posts = []
    for name in names:
        for post in reddit.subreddit(name).search(query, time_filter=time_filter):
            posts.append({
                "title": post.title,
                "url": post.url,
                "created": post.created_utc,
                "selftext": post.selftext,
            })
    return posts
# Step 3: Function to filter posts based on keywords
def filter_posts(posts, keywords):
    """Keep the posts whose title or body contains at least one keyword.

    Matching is a case-insensitive substring test.

    Args:
        posts: Iterable of dicts that each have "title" and "selftext" keys.
        keywords: Comma-separated keywords. Each keyword is trimmed and empty
            entries are ignored, so "AI, Python" and "AI,Python," behave the
            same as "AI,Python".

    Returns:
        A new list with the matching posts in their original order. When no
        usable keyword is supplied, every post is returned (no filtering).
    """
    needles = [kw.strip().lower() for kw in (keywords or "").split(",") if kw.strip()]
    if not needles:
        return list(posts)

    matches = []
    for post in posts:
        # Join with a newline so a keyword cannot match across the boundary
        # between the end of the title and the start of the body.
        haystack = "\n".join((post["title"] or "", post["selftext"] or "")).lower()
        if any(needle in haystack for needle in needles):
            matches.append(post)
    return matches
# Step 4: Function to categorize posts into topics
def categorize_posts(posts, categories=None):
    """Group posts into topic categories based on keywords in their titles.

    A post joins a category when its title contains one of that category's
    keywords as a whole word or phrase, ignoring case. Whole-word matching
    keeps "AI" from matching inside words such as "said" or "again". A post
    may appear in several categories; a post that matches none appears in no
    list.

    Args:
        posts: Iterable of dicts that each have a "title" key.
        categories: Optional mapping of category name to a list of keywords.
            Defaults to the built-in "AI", "Programming" and "Tech" topics.

    Returns:
        A dict mapping every category name to the list of matching posts, in
        input order. Categories with no matches map to an empty list.
    """
    import re  # local import: only this function needs it

    if categories is None:
        categories = {
            "AI": ["AI", "machine learning", "deep learning"],
            "Programming": ["Python", "JavaScript", "coding"],
            "Tech": ["gadgets", "technology", "innovation"],
        }

    categorized_posts = {category: [] for category in categories}

    # One compiled pattern per category; lookarounds enforce that the keyword
    # is not embedded inside a longer word.
    patterns = {}
    for category, keywords in categories.items():
        alternatives = "|".join(re.escape(keyword) for keyword in keywords if keyword)
        if alternatives:
            patterns[category] = re.compile(
                rf"(?<!\w)(?:{alternatives})(?!\w)", re.IGNORECASE
            )

    for post in posts:
        title = post["title"]
        for category, pattern in patterns.items():
            if pattern.search(title):
                categorized_posts[category].append(post)
    return categorized_posts
# Step 5: Main function to process Reddit data
def process_reddit(subreddits, keywords, time_filter):
    """Fetch, filter and categorize Reddit posts for display.

    Runs the three pipeline stages in order: get_posts, filter_posts and
    categorize_posts.

    Args:
        subreddits: Comma-separated subreddit names.
        keywords: Comma-separated keywords, used both for the search and for
            the follow-up filter.
        time_filter: Time window forwarded to get_posts.

    Returns:
        A dict mapping each category name to a list of
        {"Title": ..., "URL": ...} dicts for the posts in that category.
    """
    matching = filter_posts(get_posts(subreddits, keywords, time_filter), keywords)
    return {
        topic: [{"Title": entry["title"], "URL": entry["url"]} for entry in members]
        for topic, members in categorize_posts(matching).items()
    }
# Step 6: Create Gradio interface
# Web UI: two text boxes and a dropdown feed process_reddit, and the
# categorized result is rendered as JSON.
interface = gr.Interface(
    title="Reddit Post Filter",
    description="Search specific subreddits for posts containing certain keywords and categorize them into topics.",
    fn=process_reddit,
    inputs=[
        gr.Textbox(placeholder="e.g., technology, python", label="Subreddits (comma-separated)"),
        gr.Textbox(placeholder="e.g., AI, Python, gadgets", label="Keywords (comma-separated)"),
        gr.Dropdown(label="Time Filter", value="day", choices=["hour", "day", "week", "month", "year", "all"]),
    ],
    outputs=gr.JSON(label="Categorized Posts"),
)

# Start the app only when this file is executed directly, not on import.
if __name__ == "__main__":
    interface.launch()