rhdang committed · verified
Commit f7ac105 · Parent(s): fc0c09f

Upload 4 files
Files changed (5)
  1. .gitattributes +1 -0
  2. bert.py +211 -0
  3. gpt2.py +212 -0
  4. yelp_academic_dataset_review.json +3 -0
  5. yelp_overview.py +130 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+yelp_academic_dataset_review.json filter=lfs diff=lfs merge=lfs -text
bert.py ADDED
@@ -0,0 +1,211 @@
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score
import json
from collections import Counter
from torch.nn import CrossEntropyLoss, Dropout
from torch.nn.utils import clip_grad_norm_
import matplotlib.pyplot as plt


class YelpDataset(Dataset):
    """Wraps (text, rating) pairs and tokenizes them on the fly."""

    def __init__(self, texts, ratings, tokenizer, max_length=128):
        self.texts = texts
        self.ratings = ratings
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        rating = self.ratings[idx] - 1  # map 1-5 star ratings to class labels 0-4
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(rating, dtype=torch.long)
        }


def compute_class_weights(labels):
    # Inverse-frequency weights so under-represented star ratings
    # contribute more to the loss.
    class_counts = Counter(labels)
    total_samples = sum(class_counts.values())
    weights = {cls: total_samples / count for cls, count in class_counts.items()}
    return weights


def train_model(model, train_loader, optimizer, epochs, device, val_loader=None, patience=3):
    model.to(device)
    best_loss = float('inf')
    patience_counter = 0

    train_accuracies, val_accuracies = [], []
    val_loss_per_batch, val_accuracy_per_batch = [], []
    avg_tokens_all_batches = []
    total_tokens_across_epochs = 0
    total_batches = 0

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        correct, total = 0, 0
        epoch_tokens = 0

        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            # loss_fn is the class-weighted CrossEntropyLoss defined in __main__.
            loss = loss_fn(outputs.logits, labels)
            loss.backward()
            clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            total_loss += loss.item()
            preds = torch.argmax(outputs.logits, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
            epoch_tokens += input_ids.size(1) * input_ids.size(0)  # tokens in this batch
            total_batches += 1

            if val_loader:
                # Note: this evaluates the full validation set after every
                # training batch, which is expensive.
                val_loss, val_accuracy = evaluate_model(model, val_loader, device, return_loss=True)
                val_loss_per_batch.append(val_loss)
                val_accuracy_per_batch.append(val_accuracy)

        total_tokens_across_epochs += epoch_tokens
        avg_loss = total_loss / len(train_loader)
        train_accuracy = correct / total
        train_accuracies.append(train_accuracy)
        print(f"Epoch {epoch + 1}/{epochs} - Training Loss: {avg_loss:.4f} - Training Accuracy: {train_accuracy:.4f}")

        if val_loader:
            val_loss, val_accuracy = evaluate_model(model, val_loader, device, return_loss=True)
            val_accuracies.append(val_accuracy)

            if val_loss < best_loss:
                best_loss = val_loss
                patience_counter = 0
            else:
                patience_counter += 1

            if patience_counter >= patience:
                print("Early stopping triggered.")
                break

    avg_tokens_all_batches.append(total_tokens_across_epochs / total_batches)
    print(f"Average tokens per batch across all epochs: {avg_tokens_all_batches[-1]:.2f}")

    if val_loader:
        plot_accuracies(train_accuracies, val_accuracies)
        plt.figure()
        plt.title('Average Tokens per Epoch')
        plt.plot(range(1, len(avg_tokens_all_batches) + 1), avg_tokens_all_batches, label='Avg Tokens per Epoch')
        plt.xlabel('Epochs')
        plt.ylabel('Tokens')
        plt.legend()
        plt.figure()
        plt.title('Validation Metrics Over Batches')
        plt.plot(val_loss_per_batch, label='Validation Loss')
        plt.plot(val_accuracy_per_batch, label='Validation Accuracy')
        plt.xlabel('Batches')
        plt.ylabel('Metrics')
        plt.legend()
        plt.show()


def evaluate_model(model, val_loader, device, return_loss=False):
    model.eval()
    all_preds, all_labels = [], []
    total_loss = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

            if return_loss:
                loss = loss_fn(outputs.logits, labels)
                total_loss += loss.item()

    accuracy = accuracy_score(all_labels, all_preds)
    print(f"Validation Accuracy: {accuracy:.4f}")
    if return_loss:
        return total_loss / len(val_loader), accuracy
    return accuracy


def plot_accuracies(train, val):
    plt.plot(train, label="Training Accuracy")
    plt.plot(val, label="Validation Accuracy")
    plt.xlabel("Epoch")
    plt.ylabel("Accuracy")
    plt.legend()
    plt.show()


def stream_file(file_path, chunk_size):
    # Yield lists of (text, stars) tuples, chunk_size records at a time,
    # so the 1 GB review file never has to fit in memory.
    with open(file_path, 'r', encoding='utf-8') as file:
        chunk = []
        for line in file:
            record = json.loads(line.strip())
            if "stars" in record and isinstance(record["stars"], (int, float)):
                chunk.append((record["text"], int(record["stars"])))
                if len(chunk) == chunk_size:
                    yield chunk
                    chunk = []
        if chunk:
            yield chunk


if __name__ == "__main__":
    file_path = "yelp_academic_dataset_review.json"
    chunk_size = 10000  # process 10,000 reviews at a time

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=5)

    # Replace the classification-head dropout (BertForSequenceClassification
    # applies self.dropout before the classifier in its forward pass).
    model.dropout = Dropout(p=0.1)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    optimizer = AdamW(model.parameters(), lr=3e-5, weight_decay=0.01)

    # Estimate class weights from the first five chunks (50,000 reviews).
    all_ratings = []
    for i, chunk in enumerate(stream_file(file_path, chunk_size)):
        _, ratings = zip(*chunk)
        all_ratings.extend(ratings)
        if i + 1 == 5:
            break

    class_weights = compute_class_weights(all_ratings)
    weights_tensor = torch.tensor([class_weights[i] for i in sorted(class_weights)], dtype=torch.float).to(device)
    loss_fn = CrossEntropyLoss(weight=weights_tensor)

    # Hold out 2,000 samples from the first chunk as a validation set.
    # Note: the first chunk is also used for training below, so the
    # validation set is not strictly disjoint from the training data.
    first_chunk = next(stream_file(file_path, chunk_size))
    val_texts, val_ratings = zip(*first_chunk[:2000])
    val_dataset = YelpDataset(val_texts, val_ratings, tokenizer)
    val_loader = DataLoader(val_dataset, batch_size=8)

    # Train on the first ten chunks (100,000 reviews), one epoch per chunk.
    for chunk_idx, chunk in enumerate(stream_file(file_path, chunk_size)):
        if chunk_idx + 1 > 10:
            break
        print(f"Processing chunk #{chunk_idx + 1}")

        texts, ratings = zip(*chunk)
        dataset = YelpDataset(texts, ratings, tokenizer)
        loader = DataLoader(dataset, batch_size=8, shuffle=True)

        train_model(model, loader, optimizer, epochs=1, device=device, val_loader=val_loader)

    evaluate_model(model, val_loader, device)
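
The script never shows how to query the fine-tuned model directly. A minimal inference sketch (the helper is hypothetical, not part of this commit) that reuses the `model`, `tokenizer`, and `device` set up above and maps a raw review string back to a 1-5 star prediction:

# Hypothetical usage sketch (not in the commit): predict a star rating
# for one review with the fine-tuned model and tokenizer from bert.py.
import torch

def predict_stars(model, tokenizer, text, device, max_length=128):
    model.eval()
    encoding = tokenizer(text, truncation=True, padding="max_length",
                         max_length=max_length, return_tensors="pt")
    with torch.no_grad():
        logits = model(input_ids=encoding["input_ids"].to(device),
                       attention_mask=encoding["attention_mask"].to(device)).logits
    return torch.argmax(logits, dim=1).item() + 1  # undo the 0-4 label shift

# Example: print(predict_stars(model, tokenizer, "Great food, slow service.", device))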
gpt2.py ADDED
@@ -0,0 +1,212 @@
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score
import json
from collections import Counter
from torch.nn import CrossEntropyLoss, Dropout
from torch.nn.utils import clip_grad_norm_
import matplotlib.pyplot as plt


class YelpDataset(Dataset):
    """Wraps (text, rating) pairs and tokenizes them on the fly."""

    def __init__(self, texts, ratings, tokenizer, max_length=128):
        self.texts = texts
        self.ratings = ratings
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        rating = self.ratings[idx] - 1  # map 1-5 star ratings to class labels 0-4
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(rating, dtype=torch.long)
        }


def compute_class_weights(labels):
    # Inverse-frequency weights so under-represented star ratings
    # contribute more to the loss.
    class_counts = Counter(labels)
    total_samples = sum(class_counts.values())
    weights = {cls: total_samples / count for cls, count in class_counts.items()}
    return weights


def train_model(model, train_loader, optimizer, epochs, device, val_loader=None, patience=3):
    model.to(device)
    best_loss = float('inf')
    patience_counter = 0
    train_accuracies, val_accuracies = [], []
    val_loss_per_batch, val_accuracy_per_batch = [], []
    avg_tokens_all_batches = []
    total_tokens_across_epochs = 0
    total_batches = 0

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        correct, total = 0, 0
        epoch_tokens = 0

        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            # loss_fn is the class-weighted CrossEntropyLoss defined in __main__.
            loss = loss_fn(outputs.logits, labels)
            loss.backward()
            clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            total_loss += loss.item()
            preds = torch.argmax(outputs.logits, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
            epoch_tokens += input_ids.size(1) * input_ids.size(0)  # tokens in this batch
            total_batches += 1

            if val_loader:
                # Note: this evaluates the full validation set after every
                # training batch, which is expensive.
                val_loss, val_accuracy = evaluate_model(model, val_loader, device, return_loss=True)
                val_loss_per_batch.append(val_loss)
                val_accuracy_per_batch.append(val_accuracy)

        total_tokens_across_epochs += epoch_tokens
        avg_loss = total_loss / len(train_loader)
        train_accuracy = correct / total
        train_accuracies.append(train_accuracy)
        print(f"Epoch {epoch + 1}/{epochs} - Training Loss: {avg_loss:.4f} - Training Accuracy: {train_accuracy:.4f}")

        if val_loader:
            val_loss, val_accuracy = evaluate_model(model, val_loader, device, return_loss=True)
            val_accuracies.append(val_accuracy)

            if val_loss < best_loss:
                best_loss = val_loss
                patience_counter = 0
            else:
                patience_counter += 1

            if patience_counter >= patience:
                print("Early stopping triggered.")
                break

    avg_tokens_all_batches.append(total_tokens_across_epochs / total_batches)
    print(f"Average tokens per batch across all epochs: {avg_tokens_all_batches[-1]:.2f}")

    if val_loader:
        plot_accuracies(train_accuracies, val_accuracies)
        plt.figure()
        plt.title('Average Tokens per Epoch')
        plt.plot(range(1, len(avg_tokens_all_batches) + 1), avg_tokens_all_batches, label='Avg Tokens per Epoch')
        plt.xlabel('Epochs')
        plt.ylabel('Tokens')
        plt.legend()
        plt.figure()
        plt.title('Validation Metrics Over Batches')
        plt.plot(val_loss_per_batch, label='Validation Loss')
        plt.plot(val_accuracy_per_batch, label='Validation Accuracy')
        plt.xlabel('Batches')
        plt.ylabel('Metrics')
        plt.legend()
        plt.show()


def evaluate_model(model, val_loader, device, return_loss=False):
    model.eval()
    all_preds, all_labels = [], []
    total_loss = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

            if return_loss:
                loss = loss_fn(outputs.logits, labels)
                total_loss += loss.item()

    accuracy = accuracy_score(all_labels, all_preds)
    print(f"Validation Accuracy: {accuracy:.4f}")
    if return_loss:
        return total_loss / len(val_loader), accuracy
    return accuracy


def plot_accuracies(train_accuracies, val_accuracies):
    epochs = range(1, len(train_accuracies) + 1)
    plt.plot(epochs, train_accuracies, label='Training Accuracy')
    plt.plot(epochs, val_accuracies, label='Validation Accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()


def stream_file(file_path, chunk_size):
    # Yield lists of (text, stars) tuples, chunk_size records at a time,
    # so the 1 GB review file never has to fit in memory.
    with open(file_path, 'r', encoding='utf-8') as file:
        chunk = []
        for line in file:
            record = json.loads(line.strip())
            if "stars" in record and isinstance(record["stars"], (int, float)):
                chunk.append((record["text"], int(record["stars"])))
                if len(chunk) == chunk_size:
                    yield chunk
                    chunk = []
        if chunk:
            yield chunk


if __name__ == "__main__":
    file_path = "yelp_academic_dataset_review.json"
    chunk_size = 10000  # process 10,000 reviews at a time

    # GPT-2 has no pad token, so reuse the end-of-sequence token and
    # register it on the model config so the classification head can
    # locate the last non-padding token.
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token
    model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=5)
    model.config.pad_token_id = tokenizer.pad_token_id

    # Note: unlike BERT, GPT2ForSequenceClassification does not use a
    # `dropout` attribute in its forward pass, so this attaches an unused module.
    model.dropout = Dropout(p=0.1)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    optimizer = AdamW(model.parameters(), lr=3e-5, weight_decay=0.01)

    # Estimate class weights from the first five chunks (50,000 reviews).
    all_ratings = []
    for i, chunk in enumerate(stream_file(file_path, chunk_size)):
        _, ratings = zip(*chunk)
        all_ratings.extend(ratings)
        if i + 1 == 5:
            break

    class_weights = compute_class_weights(all_ratings)
    weights_tensor = torch.tensor([class_weights[i] for i in sorted(class_weights)], dtype=torch.float).to(device)
    loss_fn = CrossEntropyLoss(weight=weights_tensor)

    # Hold out 2,000 samples from the first chunk as a validation set.
    # Note: the first chunk is also used for training below, so the
    # validation set is not strictly disjoint from the training data.
    first_chunk = next(stream_file(file_path, chunk_size))
    val_texts, val_ratings = zip(*first_chunk[:2000])
    val_dataset = YelpDataset(val_texts, val_ratings, tokenizer)
    val_loader = DataLoader(val_dataset, batch_size=8)

    # Train on the first ten chunks (100,000 reviews), one epoch per chunk.
    for chunk_idx, chunk in enumerate(stream_file(file_path, chunk_size)):
        if chunk_idx + 1 > 10:
            break
        print(f"Processing chunk #{chunk_idx + 1}")

        texts, ratings = zip(*chunk)
        dataset = YelpDataset(texts, ratings, tokenizer)
        loader = DataLoader(dataset, batch_size=8, shuffle=True)

        train_model(model, loader, optimizer, epochs=1, device=device, val_loader=val_loader)

    evaluate_model(model, val_loader, device)
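
gpt2.py mirrors bert.py almost line for line; the substantive difference is the padding setup, since GPT-2 ships without a pad token and GPT2ForSequenceClassification pools its prediction from the last non-padding token, so it must be told which id counts as padding. A small sanity check, as a sketch assuming the setup above has already run:

# Sketch (assumes gpt2.py's tokenizer/model setup has run): verify that
# eos-as-pad is wired up before training starts.
assert tokenizer.pad_token_id == tokenizer.eos_token_id
assert model.config.pad_token_id == tokenizer.pad_token_id

enc = tokenizer(["short review", "a somewhat longer review text"],
                padding=True, return_tensors="pt")
print(enc["attention_mask"])  # zeros mark the eos-padded positions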
yelp_academic_dataset_review.json ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b70da234ab3f62a93cf8fe66c2f75541944526cc9620c8c9673df09a24f5d891
size 1048274318
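
Those three lines are a Git LFS pointer, not the data itself: a clone without `git lfs pull` leaves this stub on disk, and the `json.loads` calls in the scripts above would fail on its first line. A small guard, as a sketch (the helper name is ours, not part of the commit):

# Hypothetical guard (not in the commit): detect an unfetched Git LFS
# pointer before the scripts try to stream the file as JSON Lines.
def is_lfs_pointer(path):
    with open(path, 'r', encoding='utf-8') as f:
        return f.readline().startswith("version https://git-lfs.github.com/spec/v1")

if is_lfs_pointer("yelp_academic_dataset_review.json"):
    raise RuntimeError("Dataset is still an LFS pointer; run `git lfs pull` first.")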
yelp_overview.py ADDED
@@ -0,0 +1,130 @@
import json
import matplotlib.pyplot as plt


def count_reviews_by_stars_and_average(file_path):
    star_counts = {}
    total_stars = 0
    total_reviews = 0
    total_text_length = 0
    short_text_stars = 0
    short_text_count = 0
    word_frequencies = {}
    word_count_limit = 100000  # only tally word frequencies for the first 100,000 reviews
    star_vote_totals = {stars: {'useful': 0, 'funny': 0, 'cool': 0, 'count': 0} for stars in range(1, 6)}

    with open(file_path, 'r', encoding='utf-8') as file:
        for i, line in enumerate(file):
            record = json.loads(line.strip())
            if "stars" in record and isinstance(record["stars"], (int, float)) and "text" in record:
                stars = record["stars"]
                text = record["text"]
                text_length = len(text)

                if stars not in star_counts:
                    star_counts[stars] = 0
                star_counts[stars] += 1

                total_stars += stars
                total_reviews += 1
                total_text_length += text_length

                if text_length < 10:
                    short_text_stars += stars
                    short_text_count += 1

                if i < word_count_limit:
                    words = text.lower().split()
                    for word in words:
                        word = ''.join(char for char in word if char.isalnum())
                        if word:
                            if word not in word_frequencies:
                                word_frequencies[word] = 0
                            word_frequencies[word] += 1

                if "useful" in record and "funny" in record and "cool" in record:
                    star_vote_totals[stars]['useful'] += record["useful"]
                    star_vote_totals[stars]['funny'] += record["funny"]
                    star_vote_totals[stars]['cool'] += record["cool"]
                    star_vote_totals[stars]['count'] += 1

    if total_reviews > 0:
        average_rating = total_stars / total_reviews
        average_text_length = total_text_length / total_reviews
    else:
        average_rating = 0
        average_text_length = 0

    if short_text_count > 0:
        average_short_text_rating = short_text_stars / short_text_count
    else:
        average_short_text_rating = 0

    most_common_word = None
    most_common_count = 0
    for word, count in word_frequencies.items():
        if count > most_common_count:
            most_common_word = word
            most_common_count = count

    average_votes_by_star = {}
    for stars, votes in star_vote_totals.items():
        if votes['count'] > 0:
            average_votes_by_star[stars] = {
                'useful': votes['useful'] / votes['count'],
                'funny': votes['funny'] / votes['count'],
                'cool': votes['cool'] / votes['count']
            }

    return star_counts, average_rating, average_text_length, average_short_text_rating, most_common_word, most_common_count, average_votes_by_star


def plot_reviews_and_votes(star_counts, average_votes_by_star):
    star_ratings = [1, 2, 3, 4, 5]
    review_counts = [star_counts[star] for star in star_ratings]

    plt.figure(figsize=(10, 6))

    plt.subplot(1, 2, 1)
    plt.bar(star_ratings, review_counts, color='blue')
    plt.title('Number of Reviews per Star Rating')
    plt.xlabel('Star Rating')
    plt.ylabel('Number of Reviews')

    useful_votes = [average_votes_by_star[star]['useful'] for star in star_ratings]
    funny_votes = [average_votes_by_star[star]['funny'] for star in star_ratings]
    cool_votes = [average_votes_by_star[star]['cool'] for star in star_ratings]

    plt.subplot(1, 2, 2)
    width = 0.2
    x = range(len(star_ratings))

    # Offset the three series so each star rating shows a centred group of bars.
    plt.bar([i - width for i in x], useful_votes, width, label='Useful', color='green')
    plt.bar(list(x), funny_votes, width, label='Funny', color='red')
    plt.bar([i + width for i in x], cool_votes, width, label='Cool', color='blue')

    plt.title('Average Votes per Star Rating')
    plt.xlabel('Star Rating')
    plt.ylabel('Average Votes')
    plt.xticks(range(len(star_ratings)), star_ratings)
    plt.legend()

    plt.tight_layout()
    plt.show()


if __name__ == "__main__":
    file_path = "yelp_academic_dataset_review.json"

    star_counts, average_rating, average_text_length, average_short_text_rating, most_common_word, most_common_count, average_votes_by_star = count_reviews_by_stars_and_average(file_path)

    for stars in sorted(star_counts):
        print(f"{stars} stars: {star_counts[stars]} reviews")

    print(f"Average rating: {average_rating:.2f}")
    print(f"Average text length: {average_text_length:.2f} characters")
    print(f"Average rating for reviews with text length < 10: {average_short_text_rating:.2f}")
    print(f"Most common word (in first 100,000 reviews): '{most_common_word}' (used {most_common_count} times)")

    print("Average votes per star rating:")
    for stars, votes in average_votes_by_star.items():
        print(f"{stars} stars - Useful: {votes['useful']:.2f}, Funny: {votes['funny']:.2f}, Cool: {votes['cool']:.2f}")

    plot_reviews_and_votes(star_counts, average_votes_by_star)
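
The hand-rolled word-frequency dict duplicates what `collections.Counter` already provides; an equivalent sketch (our variant, not the committed code) covering both the counting and the most-common-word scan:

# Alternative sketch using collections.Counter in place of the manual
# word_frequencies dict in count_reviews_by_stars_and_average.
from collections import Counter

word_frequencies = Counter()

def tally_words(text):
    # Strip non-alphanumerics from each lowercased word, as in the original.
    words = (''.join(ch for ch in w if ch.isalnum()) for w in text.lower().split())
    word_frequencies.update(w for w in words if w)

# After streaming the reviews:
# (most_common_word, most_common_count), = word_frequencies.most_common(1)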