Spaces:

aus10powell
/

TwitterAccounts

Runtime error

App Files Files Community

TwitterAccounts / scripts /twitter_scraper.py

aus10powell

Update scripts/twitter_scraper.py

c8433b9 over 2 years ago

raw

history blame

8.73 kB

	import snscrape.modules.twitter as sntwitter
	import pandas as pd
	import datetime as dt
	from tqdm import tqdm
	import requests
	from scripts import sentiment

	import tweepy
	import configparser
	import os
	import pandas as pd
	from datetime import datetime, date, timedelta


	def get_latest_account_tweets(handle):
	    """
	    Fetches the latest timeline tweets of an account through the Twitter API (tweepy).

	    Credentials are read from the "AUTHENTICATION" section of "tweepy_auth.ini" when
	    that file exists in the working directory, otherwise from environment variables
	    with the same names.

	    Args:
	        handle (str): Screen name of the account whose timeline is fetched.

	    Returns:
	        A pandas DataFrame sorted by "created_at" with one row per tweet (the raw
	        tweet JSON fields plus "clean_text" and "handle"). If anything fails, the
	        error message is printed and returned as a string instead.
	    """
	    try:
	        # Both sources are mappings keyed by the same credential names.
	        if os.path.exists("tweepy_auth.ini"):
	            config = configparser.ConfigParser()
	            config.read("tweepy_auth.ini")
	            credentials = config["AUTHENTICATION"]
	        else:
	            credentials = os.environ
	        consumer_key = credentials["twitter_consumer_key"]
	        consumer_secret = credentials["twitter_consumer_secret"]
	        access_token = credentials["twitter_access_token"]
	        access_token_secret = credentials["twitter_access_token_secret"]

	        auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
	        auth.set_access_token(access_token, access_token_secret)

	        # create the API object
	        api = tweepy.API(auth)

	        # load the tweets from a specific user
	        # NOTE(review): the v1.1 timeline endpoint is documented to cap `count`
	        # far below this value - confirm whether pagination is needed.
	        tweets = api.user_timeline(
	            screen_name=handle, count=10000000, tweet_mode="extended"
	        )

	        df_tweets = pd.DataFrame(data=[t._json for t in tweets])
	        # Attach the cleaned text while rows are still in the same order as
	        # `tweets`. Assigning a plain list is positional, so doing this after
	        # the sort below would pair each cleaned text with the wrong tweet.
	        df_tweets["clean_text"] = [
	            sentiment.tweet_cleaner(tweet.full_text) for tweet in tweets
	        ]
	        df_tweets["created_at"] = pd.to_datetime(df_tweets["created_at"])
	        df_tweets = df_tweets.sort_values("created_at")
	        df_tweets["handle"] = df_tweets.user.iloc[0]["screen_name"]

	        return df_tweets

	    except Exception as e:
	        # The API error class is called TweepError in tweepy 3.x and
	        # TweepyException in tweepy 4.x; look up whichever one is installed so
	        # that handling an error never raises AttributeError itself.
	        api_error = getattr(tweepy, "TweepError", None) or getattr(
	            tweepy, "TweepyException", None
	        )
	        if api_error is not None and isinstance(e, api_error):
	            # tweepy 3.x exposes a single `api_code`; tweepy 4.x HTTP errors
	            # expose a list `api_codes`. Collect both.
	            codes = set(getattr(e, "api_codes", None) or [])
	            codes.add(getattr(e, "api_code", None))
	            if 63 in codes:
	                print("User has been suspended.")
	            elif 88 in codes:
	                print("Rate limit exceeded. Please try again later.")
	            else:
	                print("Error occurred during API call:", str(e))
	        else:
	            print("An error occurred:", str(e))
	        return str(e)


	def get_tweets(
	    handle: "str | None" = None,
	    query: "str | None" = None,
	):
	    """
	    Scrapes tweets with snscrape and returns them as a DataFrame.

	    When only `handle` is given, the search covers the account's own tweets from
	    the last 60 days, excluding replies and retweets. When `query` is given it is
	    used verbatim as the search query and `handle` is ignored.

	    Args:
	        handle (str): Twitter handle whose recent tweets are fetched.
	        query (str): Optional ready-made search query that replaces the default one.

	    Returns:
	        A pandas DataFrame with the fields produced by `extract_tweet_info` plus
	        the columns "full_text", "clean_text", "handle" and "created_at". When the
	        search matches nothing, an empty DataFrame is returned.

	    Raises:
	        TypeError: If neither `handle` nor `query` is provided.
	    """
	    if query is None:
	        if handle is None:
	            raise TypeError("get_tweets() requires either 'handle' or 'query'")

	        # Default window: the 60 days up to the current date
	        today = datetime.today()
	        two_months_ago = today - timedelta(days=2 * 30)

	        start_date = two_months_ago.strftime("%Y-%m-%d")
	        end_date = today.strftime("%Y-%m-%d")

	        query = f"from:{handle} since:{start_date} until:{end_date} -filter:replies -filter:retweets"

	    fetched_tweets = sntwitter.TwitterSearchScraper(query).get_items()
	    tweets = [extract_tweet_info(tweet) for tweet in tqdm(fetched_tweets)]
	    df_tweets = pd.DataFrame(tweets)
	    if df_tweets.empty:
	        # Nothing matched: there are no source columns to derive from, so
	        # return the empty frame instead of failing with a KeyError.
	        return df_tweets

	    # Aliases so the frame has the same column names as the tweepy-based loader
	    df_tweets["full_text"] = df_tweets["content"]
	    df_tweets["clean_text"] = df_tweets["full_text"].apply(sentiment.tweet_cleaner)
	    df_tweets["handle"] = df_tweets["username"]
	    df_tweets["created_at"] = df_tweets["date"]
	    return df_tweets


	def get_replies(username: str, conversation_id: str, max_tweets: int) -> list:
	    """
	    Fetches tweets addressed to a user that are newer than a given tweet and returns their extracted information.

	    Args:
	        username (str): The username of the Twitter user whose replies are to be fetched.
	        conversation_id (str): The tweet ID used as the `since_id` lower bound of the search.
	        max_tweets (int): The maximum number of tweets to collect.

	    Returns:
	        A list with at most `max_tweets` dictionaries of extracted tweet information.
	    """
	    print(
	        f"Fetching replies for username {username} and conversation {conversation_id}"
	    )
	    query = f"to:{username} since_id:{conversation_id} filter:safe"

	    tweets_list = []
	    for i, tweet in tqdm(enumerate(sntwitter.TwitterSearchScraper(query).get_items())):
	        # `i` is zero-based, so stop once `max_tweets` items have been collected
	        if i >= max_tweets:
	            break
	        tweets_list.append(extract_tweet_info(tweet))
	    return tweets_list


	def get_tweet_by_id_and_username(username: str, tweet_id: str):
	    """
	    Builds the status URL of a tweet and runs it through the snscrape search scraper.

	    Args:
	        username (str): The username of the account that posted the tweet.
	        tweet_id (str): The ID of the tweet.

	    Returns:
	        The result of `get_items()` on the search scraper for that URL, i.e. an
	        iterable of scraped items rather than a single tweet object.
	    """
	    scraper = sntwitter.TwitterSearchScraper(
	        f"https://twitter.com/{username}/status/{tweet_id}"
	    )
	    return scraper.get_items()


	def extract_tweet_info(tweet):
	    """
	    Flattens a scraped tweet object into a plain dictionary.

	    Args:
	        tweet: A tweet object exposing the attributes read below.

	    Returns:
	        A dictionary with the keys "date", "username", "content", "retweet_count",
	        "tweet_id", "like_count", "reply_count", "in_reply_to_tweet_id",
	        "conversation_id" and "view_count", in that order.
	    """
	    # The author name is the only value that lives on a nested object
	    info = {"date": tweet.date, "username": tweet.user.username}

	    # Remaining output keys paired with the tweet attribute they are copied from
	    attribute_for_key = (
	        ("content", "rawContent"),
	        ("retweet_count", "retweetCount"),
	        ("tweet_id", "id"),
	        ("like_count", "likeCount"),
	        ("reply_count", "replyCount"),
	        ("in_reply_to_tweet_id", "inReplyToTweetId"),
	        ("conversation_id", "conversationId"),
	        ("view_count", "viewCount"),
	    )
	    for key, attribute in attribute_for_key:
	        info[key] = getattr(tweet, attribute)
	    return info


	def get_follower_ids(username: str, limit: int = 20):
	    """
	    Fetches one recent tweet of an account together with tweets sent to that account since it.

	    NOTE: despite its name, this function does not return follower IDs. It takes
	    the last row returned by `get_tweets` for the account and then collects up
	    to 1000 tweets through `get_replies`, using that tweet's ID as the lower bound.

	    Args:
	        username (str): The Twitter handle to look up.
	        limit (int): Currently unused; kept so existing callers keep working.

	    Returns:
	        A tuple `(one_tweet, replies)`: `one_tweet` is a dictionary with the
	        fields of the selected tweet and `replies` is the list returned by
	        `get_replies`. When no tweets are found the result is `(None, [])`.
	    """
	    # get_tweets returns a DataFrame, one row per tweet
	    tweets = get_tweets(username)
	    if len(tweets) == 0:
	        return None, []

	    # A DataFrame cannot be indexed with [-1]; select the last row by position
	    one_tweet = tweets.iloc[-1].to_dict()
	    one_tweet_id = one_tweet["tweet_id"]

	    replies = get_replies(
	        username=username, conversation_id=one_tweet_id, max_tweets=1000
	    )

	    return one_tweet, replies


	def get_twitter_account_info(twitter_handle: str) -> dict:
	    """
	    Looks up basic profile data of a Twitter account using snscrape.

	    Args:
	        twitter_handle (str): The Twitter username to retrieve information for.

	    Returns:
	        dict: The keys "name", "username", "user_id", "follower_count",
	        "friends_count" and "verified". "verified" is the string "false" when the
	        profile's verified value reads "false" (case-insensitive) as text, and the
	        string "true" for every other value.
	    """
	    # The scraper's `entity` attribute holds the profile of the account
	    profile = sntwitter.TwitterUserScraper(twitter_handle).entity

	    account_info = {
	        "name": profile.displayname,
	        "username": profile.username,
	        "user_id": profile.id,
	        "follower_count": profile.followersCount,
	        "friends_count": profile.friendsCount,
	    }

	    # Normalise the verified flag to a lowercase string
	    if str(profile.verified).lower() == "false":
	        account_info["verified"] = "false"
	    else:
	        account_info["verified"] = "true"
	    return account_info


	if __name__ == "__main__":
	    ## Testing extracting tweets from an account
	    # get_tweets builds its own search query for the handle (recent tweets,
	    # replies and retweets excluded), so only the account name is needed here.
	    account = "taylorlorenz"
	    print(f"account: {account}")
	    tweets = get_tweets(account)

	    df_tweets = pd.DataFrame(data=tweets)
	    df_tweets = df_tweets.sort_values("in_reply_to_tweet_id")
	    # Save output (comment out to skip writing the CSV file)
	    df_tweets.to_csv("df_tweets.csv")

	    print(df_tweets.head(2))
	    print(df_tweets.tail(2))
	    print(f"Total Tweets: {len(tweets)}")

	    ## Testing extracting conversation threads from a conversation ID
	    conversation_id = (
	        1620650202305798144  # A tweet from elon musk about turbulent times
	    )
	    max_tweets = 3000
	    tweets = get_replies(
	        username="elonmusk", conversation_id=conversation_id, max_tweets=max_tweets
	    )
	    df_replies = pd.DataFrame(data=tweets)

	    # Uncomment to save output
	    # df_replies.to_csv("df_replies.csv")
	    print(
	        f"Number of extracted tweets from conversation_id: {conversation_id}, {len(tweets)}"
	    )