Yelp_Review / yelp_overview.py

Upload 4 files

f7ac105 verified 2 months ago

5.35 kB

	import json
	import matplotlib.pyplot as plt

	def count_reviews_by_stars_and_average(file_path, word_count_limit=100000):
	    """Aggregate summary statistics from a JSON-lines review file.

	    Every non-blank line of the file is parsed as one JSON value. A record
	    contributes to the statistics only when it is a JSON object with a
	    numeric "stars" value and a string "text" value; any other record is
	    skipped.

	    Args:
	        file_path: Path of a UTF-8 encoded file holding one JSON record per
	            line.
	        word_count_limit: Word frequencies are gathered only from lines
	            whose zero-based index in the file is below this value.

	    Returns:
	        A 7-tuple of:
	        - dict mapping each star value to its number of reviews,
	        - mean star rating (0 when there are no reviews),
	        - mean text length in characters (0 when there are no reviews),
	        - mean star rating of reviews whose text is shorter than 10
	          characters (0 when there are none),
	        - the most frequent word, or None when no word was counted,
	        - that word's number of occurrences (0 when no word was counted),
	        - dict mapping each star value to its average 'useful', 'funny' and
	          'cool' votes; only star values with at least one record carrying
	          all three vote fields are present.

	    Raises:
	        OSError: If the file cannot be opened.
	        json.JSONDecodeError: If a non-blank line is not valid JSON.
	    """
	    vote_keys = ('useful', 'funny', 'cool')

	    star_counts = {}
	    total_stars = 0
	    total_reviews = 0
	    total_text_length = 0
	    short_text_stars = 0
	    short_text_count = 0
	    word_frequencies = {}
	    star_vote_totals = {
	        stars: dict.fromkeys(vote_keys + ('count',), 0) for stars in range(1, 6)
	    }

	    with open(file_path, 'r', encoding='utf-8') as file:
	        for i, line in enumerate(file):
	            line = line.strip()
	            if not line:
	                # Blank or trailing empty lines are not records.
	                continue

	            record = json.loads(line)
	            if not isinstance(record, dict):
	                continue

	            stars = record.get("stars")
	            text = record.get("text")
	            if not isinstance(stars, (int, float)) or not isinstance(text, str):
	                continue

	            text_length = len(text)
	            star_counts[stars] = star_counts.get(stars, 0) + 1

	            total_stars += stars
	            total_reviews += 1
	            total_text_length += text_length

	            if text_length < 10:
	                short_text_stars += stars
	                short_text_count += 1

	            if i < word_count_limit:
	                for raw_word in text.lower().split():
	                    # Keep only letters and digits so "good!" and "good" match.
	                    word = ''.join(filter(str.isalnum, raw_word))
	                    if word:
	                        word_frequencies[word] = word_frequencies.get(word, 0) + 1

	            if all(key in record for key in vote_keys):
	                # setdefault keeps star values outside 1..5 (e.g. 3.5) from
	                # raising KeyError.
	                totals = star_vote_totals.setdefault(
	                    stars, dict.fromkeys(vote_keys + ('count',), 0)
	                )
	                for key in vote_keys:
	                    totals[key] += record[key]
	                totals['count'] += 1

	    average_rating = total_stars / total_reviews if total_reviews else 0
	    average_text_length = total_text_length / total_reviews if total_reviews else 0
	    average_short_text_rating = (
	        short_text_stars / short_text_count if short_text_count else 0
	    )

	    # max() returns the first maximal item, so ties go to the word seen first.
	    most_common_word, most_common_count = max(
	        word_frequencies.items(), key=lambda item: item[1], default=(None, 0)
	    )

	    average_votes_by_star = {
	        stars: {key: votes[key] / votes['count'] for key in vote_keys}
	        for stars, votes in star_vote_totals.items()
	        if votes['count'] > 0
	    }

	    return star_counts, average_rating, average_text_length, average_short_text_rating, most_common_word, most_common_count, average_votes_by_star

	def plot_reviews_and_votes(star_counts, average_votes_by_star):
	    """Show review counts and average votes per star rating side by side.

	    The left subplot is a bar chart of the number of reviews for each rating
	    from 1 to 5. The right subplot is a grouped bar chart of the average
	    'useful', 'funny' and 'cool' votes for the same ratings.

	    Args:
	        star_counts: Mapping of star rating to number of reviews. Ratings
	            missing from the mapping are drawn as 0.
	        average_votes_by_star: Mapping of star rating to a dict with
	            'useful', 'funny' and 'cool' averages. Ratings or vote kinds
	            missing from the mapping are drawn as 0.
	    """
	    star_ratings = [1, 2, 3, 4, 5]
	    # .get() keeps a rating with no reviews from raising KeyError.
	    review_counts = [star_counts.get(star, 0) for star in star_ratings]

	    plt.figure(figsize=(10, 6))

	    plt.subplot(1, 2, 1)
	    plt.bar(star_ratings, review_counts, color='blue')
	    plt.title('Number of Reviews per Star Rating')
	    plt.xlabel('Star Rating')
	    plt.ylabel('Number of Reviews')

	    plt.subplot(1, 2, 2)
	    width = 0.2
	    positions = range(len(star_ratings))
	    vote_series = (
	        ('useful', 'Useful', 'green'),
	        ('funny', 'Funny', 'red'),
	        ('cool', 'Cool', 'blue'),
	    )
	    # Offsets -width, 0, +width centre each three-bar group on its tick.
	    for offset, (vote_key, label, color) in zip((-width, 0, width), vote_series):
	        heights = [
	            average_votes_by_star.get(star, {}).get(vote_key, 0)
	            for star in star_ratings
	        ]
	        plt.bar([pos + offset for pos in positions], heights, width, label=label, color=color)

	    plt.title('Average Votes per Star Rating')
	    plt.xlabel('Star Rating')
	    plt.ylabel('Average Votes')
	    plt.xticks(positions, star_ratings)
	    plt.legend()

	    plt.tight_layout()
	    plt.show()

	if __name__ == "__main__":
	    # Script entry point: summarise the review file, print the figures,
	    # then show the charts.
	    file_path = "yelp_academic_dataset_review.json"

	    (
	        star_counts,
	        average_rating,
	        average_text_length,
	        average_short_text_rating,
	        most_common_word,
	        most_common_count,
	        average_votes_by_star,
	    ) = count_reviews_by_stars_and_average(file_path)

	    # Review counts, lowest star value first.
	    for stars, review_count in sorted(star_counts.items()):
	        print(f"{stars} stars: {review_count} reviews")

	    print(f"Average rating: {average_rating:.2f}")
	    print(f"Average text length: {average_text_length:.2f} characters")
	    print(f"Average rating for reviews with text length < 10: {average_short_text_rating:.2f}")
	    print(f"Most common word (in first 100,000 reviews): '{most_common_word}' (used {most_common_count} times)")

	    print("Average votes per star rating:")
	    for stars in average_votes_by_star:
	        votes = average_votes_by_star[stars]
	        print(f"{stars} stars - Useful: {votes['useful']:.2f}, Funny: {votes['funny']:.2f}, Cool: {votes['cool']:.2f}")

	    plot_reviews_and_votes(star_counts, average_votes_by_star)