Upload 4 files
- .gitattributes +1 -0
- bert.py +211 -0
- gpt2.py +212 -0
- yelp_academic_dataset_review.json +3 -0
- yelp_overview.py +130 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+yelp_academic_dataset_review.json filter=lfs diff=lfs merge=lfs -text
bert.py
ADDED
@@ -0,0 +1,211 @@
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score
import json
from collections import Counter
from torch.nn import CrossEntropyLoss, Dropout
from torch.nn.utils import clip_grad_norm_
import matplotlib.pyplot as plt

class YelpDataset(Dataset):
    def __init__(self, texts, ratings, tokenizer, max_length=128):
        self.texts = texts
        self.ratings = ratings
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        rating = self.ratings[idx] - 1  # map 1-5 star ratings to class indices 0-4
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(rating, dtype=torch.long)
        }

def compute_class_weights(labels):
    # inverse-frequency weights: rarer classes get larger weights
    class_counts = Counter(labels)
    total_samples = sum(class_counts.values())
    weights = {cls: total_samples / count for cls, count in class_counts.items()}
    return weights

def train_model(model, train_loader, optimizer, epochs, device, val_loader=None, patience=3):
    # relies on the module-level loss_fn defined in the __main__ block
    model.to(device)
    best_loss = float('inf')
    patience_counter = 0

    train_accuracies, val_accuracies = [], []
    val_loss_per_batch, val_accuracy_per_batch = [], []
    avg_tokens_all_batches = []
    total_tokens_across_epochs = 0
    total_batches = 0

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        correct, total = 0, 0
        epoch_tokens = 0

        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = loss_fn(outputs.logits, labels)
            loss.backward()
            clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            total_loss += loss.item()
            preds = torch.argmax(outputs.logits, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
            epoch_tokens += input_ids.size(1) * input_ids.size(0)  # tokens in this batch, padding included
            total_batches += 1

            if val_loader:
                # validate after every training batch (slow, but yields per-batch curves)
                val_loss, val_accuracy = evaluate_model(model, val_loader, device, return_loss=True)
                val_loss_per_batch.append(val_loss)
                val_accuracy_per_batch.append(val_accuracy)

        total_tokens_across_epochs += epoch_tokens
        avg_loss = total_loss / len(train_loader)
        train_accuracy = correct / total
        train_accuracies.append(train_accuracy)
        print(f"Epoch {epoch + 1}/{epochs} - Training Loss: {avg_loss:.4f} - Training Accuracy: {train_accuracy:.4f}")

        if val_loader:
            val_loss, val_accuracy = evaluate_model(model, val_loader, device, return_loss=True)
            val_accuracies.append(val_accuracy)

            # early stopping on validation loss
            if val_loss < best_loss:
                best_loss = val_loss
                patience_counter = 0
            else:
                patience_counter += 1

            if patience_counter >= patience:
                print("Early stopping triggered.")
                break

    avg_tokens_all_batches.append(total_tokens_across_epochs / total_batches)
    print(f"Average tokens per batch across all epochs: {avg_tokens_all_batches[-1]:.2f}")

    if val_loader:
        plot_accuracies(train_accuracies, val_accuracies)
        plt.figure()
        plt.title('Average Tokens per Batch')
        plt.plot(range(1, len(avg_tokens_all_batches) + 1), avg_tokens_all_batches, label='Avg Tokens per Batch')
        plt.xlabel('Epochs')
        plt.ylabel('Tokens')
        plt.legend()
        plt.figure()
        plt.title('Validation Metrics Over Batches')
        plt.plot(val_loss_per_batch, label='Validation Loss')
        plt.plot(val_accuracy_per_batch, label='Validation Accuracy')
        plt.xlabel('Batches')
        plt.ylabel('Metrics')
        plt.legend()
        plt.show()

def evaluate_model(model, val_loader, device, return_loss=False):
    model.eval()
    all_preds, all_labels = [], []
    total_loss = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

            if return_loss:
                loss = loss_fn(outputs.logits, labels)
                total_loss += loss.item()

    accuracy = accuracy_score(all_labels, all_preds)
    print(f"Validation Accuracy: {accuracy:.4f}")
    if return_loss:
        return total_loss / len(val_loader), accuracy
    return accuracy

def plot_accuracies(train, val):
    plt.plot(train, label="Training Accuracy")
    plt.plot(val, label="Validation Accuracy")
    plt.xlabel("Epoch")
    plt.ylabel("Accuracy")
    plt.legend()
    plt.show()

def stream_file(file_path, chunk_size):
    # stream the review file in chunks of (text, stars) pairs to avoid loading the ~1 GB JSON at once
    with open(file_path, 'r', encoding='utf-8') as file:
        chunk = []
        for line in file:
            record = json.loads(line.strip())
            if "stars" in record and isinstance(record["stars"], (int, float)):
                chunk.append((record["text"], int(record["stars"])))
                if len(chunk) == chunk_size:
                    yield chunk
                    chunk = []
        if chunk:
            yield chunk

if __name__ == "__main__":
    file_path = "yelp_academic_dataset_review.json"
    chunk_size = 10000  # process 10,000 lines at a time

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=5)

    model.dropout = Dropout(p=0.1)  # replace the classification-head dropout (BERT's default is also p=0.1)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    optimizer = AdamW(model.parameters(), lr=3e-5, weight_decay=0.01)

    # estimate class weights from the first 5 chunks (50,000 reviews)
    all_ratings = []
    for i, chunk in enumerate(stream_file(file_path, chunk_size)):
        _, ratings = zip(*chunk)
        all_ratings.extend(ratings)
        if i + 1 == 5:
            break

    class_weights = compute_class_weights(all_ratings)
    # weights ordered by star rating (1-5), matching class indices 0-4
    weights_tensor = torch.tensor([class_weights[i] for i in sorted(class_weights)], dtype=torch.float).to(device)
    loss_fn = CrossEntropyLoss(weight=weights_tensor)

    # 2000 samples from the first chunk serve as the validation set
    # (note: the first training chunk below contains these same reviews)
    first_chunk = next(stream_file(file_path, chunk_size))
    val_texts, val_ratings = zip(*first_chunk[:2000])
    val_dataset = YelpDataset(val_texts, val_ratings, tokenizer)
    val_loader = DataLoader(val_dataset, batch_size=8)

    # train on the first 10 chunks, one epoch per chunk
    for chunk_idx, chunk in enumerate(stream_file(file_path, chunk_size)):
        if chunk_idx + 1 > 10:
            break
        print(f"Processing chunk #{chunk_idx + 1}")

        texts, ratings = zip(*chunk)
        dataset = YelpDataset(texts, ratings, tokenizer)
        loader = DataLoader(dataset, batch_size=8, shuffle=True)

        train_model(model, loader, optimizer, epochs=1, device=device, val_loader=val_loader)

    evaluate_model(model, val_loader, device)
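
A minimal inference sketch, assuming the classifier fine-tuned by bert.py was afterwards saved with save_pretrained() to a hypothetical ./bert-yelp-5star directory (bert.py itself does not write a checkpoint). It scores a single new review with the same 128-token preprocessing used during training:

import torch
from transformers import BertTokenizer, BertForSequenceClassification

model_dir = "./bert-yelp-5star"  # hypothetical output directory, not created by bert.py
tokenizer = BertTokenizer.from_pretrained(model_dir)
model = BertForSequenceClassification.from_pretrained(model_dir)
model.eval()

review = "The food was great but the service was painfully slow."
encoding = tokenizer(review, truncation=True, padding="max_length",
                     max_length=128, return_tensors="pt")

with torch.no_grad():
    logits = model(**encoding).logits

predicted_stars = int(torch.argmax(logits, dim=1).item()) + 1  # class indices 0-4 map back to 1-5 stars
print(f"Predicted rating: {predicted_stars} stars")
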
gpt2.py
ADDED
@@ -0,0 +1,212 @@
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score
import json
from collections import Counter
from torch.nn import CrossEntropyLoss, Dropout
from torch.nn.utils import clip_grad_norm_
import matplotlib.pyplot as plt

class YelpDataset(Dataset):
    def __init__(self, texts, ratings, tokenizer, max_length=128):
        self.texts = texts
        self.ratings = ratings
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        rating = self.ratings[idx] - 1  # map 1-5 star ratings to class indices 0-4
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(rating, dtype=torch.long)
        }

def compute_class_weights(labels):
    # inverse-frequency weights: rarer classes get larger weights
    class_counts = Counter(labels)
    total_samples = sum(class_counts.values())
    weights = {cls: total_samples / count for cls, count in class_counts.items()}
    return weights

def train_model(model, train_loader, optimizer, epochs, device, val_loader=None, patience=3):
    # relies on the module-level loss_fn defined in the __main__ block
    model.to(device)
    best_loss = float('inf')
    patience_counter = 0
    train_accuracies, val_accuracies = [], []
    val_loss_per_batch, val_accuracy_per_batch = [], []
    avg_tokens_all_batches = []
    total_tokens_across_epochs = 0
    total_batches = 0

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        correct, total = 0, 0
        epoch_tokens = 0

        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = loss_fn(outputs.logits, labels)
            loss.backward()
            clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            total_loss += loss.item()
            preds = torch.argmax(outputs.logits, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
            epoch_tokens += input_ids.size(1) * input_ids.size(0)  # tokens in this batch, padding included
            total_batches += 1

            if val_loader:
                # validate after every training batch (slow, but yields per-batch curves)
                val_loss, val_accuracy = evaluate_model(model, val_loader, device, return_loss=True)
                val_loss_per_batch.append(val_loss)
                val_accuracy_per_batch.append(val_accuracy)

        total_tokens_across_epochs += epoch_tokens
        avg_loss = total_loss / len(train_loader)
        train_accuracy = correct / total
        train_accuracies.append(train_accuracy)
        print(f"Epoch {epoch + 1}/{epochs} - Training Loss: {avg_loss:.4f} - Training Accuracy: {train_accuracy:.4f}")

        if val_loader:
            val_loss, val_accuracy = evaluate_model(model, val_loader, device, return_loss=True)
            val_accuracies.append(val_accuracy)

            # early stopping on validation loss
            if val_loss < best_loss:
                best_loss = val_loss
                patience_counter = 0
            else:
                patience_counter += 1

            if patience_counter >= patience:
                print("Early stopping triggered.")
                break

    avg_tokens_all_batches.append(total_tokens_across_epochs / total_batches)
    print(f"Average tokens per batch across all epochs: {avg_tokens_all_batches[-1]:.2f}")

    if val_loader:
        plot_accuracies(train_accuracies, val_accuracies)
        plt.figure()
        plt.title('Average Tokens per Batch')
        plt.plot(range(1, len(avg_tokens_all_batches) + 1), avg_tokens_all_batches, label='Avg Tokens per Batch')
        plt.xlabel('Epochs')
        plt.ylabel('Tokens')
        plt.legend()
        plt.figure()
        plt.title('Validation Metrics Over Batches')
        plt.plot(val_loss_per_batch, label='Validation Loss')
        plt.plot(val_accuracy_per_batch, label='Validation Accuracy')
        plt.xlabel('Batches')
        plt.ylabel('Metrics')
        plt.legend()
        plt.show()

def evaluate_model(model, val_loader, device, return_loss=False):
    model.eval()
    all_preds, all_labels = [], []
    total_loss = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

            if return_loss:
                loss = loss_fn(outputs.logits, labels)
                total_loss += loss.item()

    accuracy = accuracy_score(all_labels, all_preds)
    print(f"Validation Accuracy: {accuracy:.4f}")
    if return_loss:
        return total_loss / len(val_loader), accuracy
    return accuracy

def plot_accuracies(train_accuracies, val_accuracies):
    epochs = range(1, len(train_accuracies) + 1)
    plt.plot(epochs, train_accuracies, label='Training Accuracy')
    plt.plot(epochs, val_accuracies, label='Validation Accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()

def stream_file(file_path, chunk_size):
    # stream the review file in chunks of (text, stars) pairs to avoid loading the ~1 GB JSON at once
    with open(file_path, 'r', encoding='utf-8') as file:
        chunk = []
        for line in file:
            record = json.loads(line.strip())
            if "stars" in record and isinstance(record["stars"], (int, float)):
                chunk.append((record["text"], int(record["stars"])))
                if len(chunk) == chunk_size:
                    yield chunk
                    chunk = []
        if chunk:
            yield chunk

if __name__ == "__main__":
    file_path = "yelp_academic_dataset_review.json"
    chunk_size = 10000  # process 10,000 lines at a time

    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token by default; reuse EOS for padding
    model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=5)
    model.config.pad_token_id = tokenizer.pad_token_id  # lets the model find the last non-padding token

    model.dropout = Dropout(p=0.1)  # note: GPT2ForSequenceClassification does not use a `dropout` attribute in its forward pass, so this likely has no effect

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    optimizer = AdamW(model.parameters(), lr=3e-5, weight_decay=0.01)

    # estimate class weights from the first 5 chunks (50,000 reviews)
    all_ratings = []
    for i, chunk in enumerate(stream_file(file_path, chunk_size)):
        _, ratings = zip(*chunk)
        all_ratings.extend(ratings)
        if i + 1 == 5:
            break

    class_weights = compute_class_weights(all_ratings)
    # weights ordered by star rating (1-5), matching class indices 0-4
    weights_tensor = torch.tensor([class_weights[i] for i in sorted(class_weights)], dtype=torch.float).to(device)
    loss_fn = CrossEntropyLoss(weight=weights_tensor)

    # 2000 samples from the first chunk serve as the validation set
    # (note: the first training chunk below contains these same reviews)
    first_chunk = next(stream_file(file_path, chunk_size))
    val_texts, val_ratings = zip(*first_chunk[:2000])
    val_dataset = YelpDataset(val_texts, val_ratings, tokenizer)
    val_loader = DataLoader(val_dataset, batch_size=8)

    # train on the first 10 chunks, one epoch per chunk
    for chunk_idx, chunk in enumerate(stream_file(file_path, chunk_size)):
        if chunk_idx + 1 > 10:
            break
        print(f"Processing chunk #{chunk_idx + 1}")

        texts, ratings = zip(*chunk)
        dataset = YelpDataset(texts, ratings, tokenizer)
        loader = DataLoader(dataset, batch_size=8, shuffle=True)

        train_model(model, loader, optimizer, epochs=1, device=device, val_loader=val_loader)

    evaluate_model(model, val_loader, device)
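
Both training scripts report only a single validation accuracy, which can hide how the rarer star classes behave despite the class-weighted loss. A small sketch of a per-class breakdown, assuming model, val_loader, and device exist as set up in bert.py or gpt2.py (per_class_report is a new helper, not part of either script):

import torch
from sklearn.metrics import classification_report

def per_class_report(model, val_loader, device):
    # per-class precision/recall/F1 on the validation loader (class indices 0-4 correspond to 1-5 stars)
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            outputs = model(input_ids=batch['input_ids'].to(device),
                            attention_mask=batch['attention_mask'].to(device))
            all_preds.extend(torch.argmax(outputs.logits, dim=1).cpu().tolist())
            all_labels.extend(batch['label'].tolist())
    print(classification_report(all_labels, all_preds,
                                labels=list(range(5)),
                                target_names=[f"{s} star(s)" for s in range(1, 6)],
                                zero_division=0))

# example: per_class_report(model, val_loader, device)
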
yelp_academic_dataset_review.json
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b70da234ab3f62a93cf8fe66c2f75541944526cc9620c8c9673df09a24f5d891
size 1048274318
yelp_overview.py
ADDED
@@ -0,0 +1,130 @@
import json
import matplotlib.pyplot as plt

def count_reviews_by_stars_and_average(file_path):
    star_counts = {}
    total_stars = 0
    total_reviews = 0
    total_text_length = 0
    short_text_stars = 0
    short_text_count = 0
    word_frequencies = {}
    word_count_limit = 100000  # only build word frequencies from the first 100,000 reviews
    star_vote_totals = {stars: {'useful': 0, 'funny': 0, 'cool': 0, 'count': 0} for stars in range(1, 6)}

    with open(file_path, 'r', encoding='utf-8') as file:
        for i, line in enumerate(file):
            record = json.loads(line.strip())
            if "stars" in record and isinstance(record["stars"], (int, float)) and "text" in record:
                stars = record["stars"]
                text = record["text"]
                text_length = len(text)

                if stars not in star_counts:
                    star_counts[stars] = 0
                star_counts[stars] += 1

                total_stars += stars
                total_reviews += 1
                total_text_length += text_length

                if text_length < 10:
                    short_text_stars += stars
                    short_text_count += 1

                if i < word_count_limit:
                    words = text.lower().split()
                    for word in words:
                        word = ''.join(char for char in word if char.isalnum())
                        if word:
                            if word not in word_frequencies:
                                word_frequencies[word] = 0
                            word_frequencies[word] += 1

                if "useful" in record and "funny" in record and "cool" in record:
                    star_vote_totals[stars]['useful'] += record["useful"]
                    star_vote_totals[stars]['funny'] += record["funny"]
                    star_vote_totals[stars]['cool'] += record["cool"]
                    star_vote_totals[stars]['count'] += 1

    if total_reviews > 0:
        average_rating = total_stars / total_reviews
        average_text_length = total_text_length / total_reviews
    else:
        average_rating = 0
        average_text_length = 0

    if short_text_count > 0:
        average_short_text_rating = short_text_stars / short_text_count
    else:
        average_short_text_rating = 0

    most_common_word = None
    most_common_count = 0
    for word, count in word_frequencies.items():
        if count > most_common_count:
            most_common_word = word
            most_common_count = count

    average_votes_by_star = {}
    for stars, votes in star_vote_totals.items():
        if votes['count'] > 0:
            average_votes_by_star[stars] = {
                'useful': votes['useful'] / votes['count'],
                'funny': votes['funny'] / votes['count'],
                'cool': votes['cool'] / votes['count']
            }

    return star_counts, average_rating, average_text_length, average_short_text_rating, most_common_word, most_common_count, average_votes_by_star

def plot_reviews_and_votes(star_counts, average_votes_by_star):
    star_ratings = [1, 2, 3, 4, 5]
    review_counts = [star_counts[star] for star in star_ratings]

    plt.figure(figsize=(10, 6))

    plt.subplot(1, 2, 1)
    plt.bar(star_ratings, review_counts, color='blue')
    plt.title('Number of Reviews per Star Rating')
    plt.xlabel('Star Rating')
    plt.ylabel('Number of Reviews')

    useful_votes = [average_votes_by_star[star]['useful'] for star in star_ratings]
    funny_votes = [average_votes_by_star[star]['funny'] for star in star_ratings]
    cool_votes = [average_votes_by_star[star]['cool'] for star in star_ratings]

    plt.subplot(1, 2, 2)
    width = 0.2
    positions = range(len(star_ratings))

    # three grouped bars per star rating, centred on the tick positions
    plt.bar([i - width for i in positions], useful_votes, width, label='Useful', color='green')
    plt.bar(list(positions), funny_votes, width, label='Funny', color='red')
    plt.bar([i + width for i in positions], cool_votes, width, label='Cool', color='blue')

    plt.title('Average Votes per Star Rating')
    plt.xlabel('Star Rating')
    plt.ylabel('Average Votes')
    plt.xticks(list(positions), star_ratings)
    plt.legend()

    plt.tight_layout()
    plt.show()

if __name__ == "__main__":
    file_path = "yelp_academic_dataset_review.json"

    star_counts, average_rating, average_text_length, average_short_text_rating, most_common_word, most_common_count, average_votes_by_star = count_reviews_by_stars_and_average(file_path)

    for stars in sorted(star_counts):
        print(f"{stars} stars: {star_counts[stars]} reviews")

    print(f"Average rating: {average_rating:.2f}")
    print(f"Average text length: {average_text_length:.2f} characters")
    print(f"Average rating for reviews with text length < 10: {average_short_text_rating:.2f}")
    print(f"Most common word (in first 100,000 reviews): '{most_common_word}' (used {most_common_count} times)")

    print("Average votes per star rating:")
    for stars, votes in average_votes_by_star.items():
        print(f"{stars} stars - Useful: {votes['useful']:.2f}, Funny: {votes['funny']:.2f}, Cool: {votes['cool']:.2f}")

    plot_reviews_and_votes(star_counts, average_votes_by_star)
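
The word-frequency bookkeeping in yelp_overview.py can also be written with collections.Counter, which additionally makes it easy to report the top N words rather than only the single most common one. A rough sketch under the same assumptions about the record format (one JSON object per line with a "text" field); top_words is a new helper, not part of the script above:

import json
from collections import Counter

def top_words(file_path, n=10, line_limit=100000):
    # count lowercase, alphanumeric-only words in the first `line_limit` reviews
    counts = Counter()
    with open(file_path, 'r', encoding='utf-8') as file:
        for i, line in enumerate(file):
            if i >= line_limit:
                break
            record = json.loads(line)
            text = record.get("text", "")
            cleaned = (''.join(ch for ch in word if ch.isalnum()) for word in text.lower().split())
            counts.update(word for word in cleaned if word)
    return counts.most_common(n)

# example: print(top_words("yelp_academic_dataset_review.json", n=10))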