Upload 5 files
Browse files- .gitattributes +1 -0
- app.py +288 -0
- spoc-train-eval.tsv +0 -0
- spoc-train-test.tsv +0 -0
- spoc-train-train.tsv +3 -0
- transformer_code.pth +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
spoc-train-train.tsv filter=lfs diff=lfs merge=lfs -text
|
app.py
ADDED
@@ -0,0 +1,288 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""F219273-F219151_A3_Q2.ipynb
|
3 |
+
|
4 |
+
Automatically generated by Colab.
|
5 |
+
|
6 |
+
Original file is located at
|
7 |
+
https://colab.research.google.com/drive/18DYHQ-7maDZJIdgRFN2yLqXw713PdPdb
|
8 |
+
"""
|
9 |
+
|
10 |
+
import math
|
11 |
+
import random
|
12 |
+
import time
|
13 |
+
import torch
|
14 |
+
import torch.nn as nn
|
15 |
+
import torch.optim as optim
|
16 |
+
import torch.nn.functional as F
|
17 |
+
import pandas as pd
|
18 |
+
from collections import Counter
|
19 |
+
from torch.nn.utils.rnn import pad_sequence
|
20 |
+
from torch.utils.data import DataLoader, Dataset
|
21 |
+
from tqdm import tqdm
|
22 |
+
|
23 |
+
# Special vocabulary tokens. build_vocab() seeds every vocab with these,
# so they always map to ids 0, 1 and 2 respectively.
PAD_TOKEN = "<pad>"
SOS_TOKEN = "<sos>"
EOS_TOKEN = "<eos>"
|
26 |
+
def simple_tokenizer(text):
    """Whitespace-tokenize *text* after trimming leading/trailing blanks."""
    trimmed = text.strip()
    return trimmed.split()
|
28 |
+
|
29 |
+
def build_vocab(sentences, min_freq=1):
    """Build a token -> index mapping from tokenized *sentences*.

    The special tokens always occupy ids 0-2; every other token that occurs
    at least *min_freq* times gets the next free id, in first-seen order.
    """
    freq = Counter()
    for sentence in sentences:
        freq.update(sentence)
    vocab = {PAD_TOKEN: 0, SOS_TOKEN: 1, EOS_TOKEN: 2}
    for token, n in freq.items():
        if n >= min_freq and token not in vocab:
            vocab[token] = len(vocab)
    return vocab
|
38 |
+
def numericalize(sentence, vocab):
    """Map a token list to ids, wrapped in <sos>/<eos>; unknown tokens are dropped."""
    ids = [vocab[SOS_TOKEN]]
    for token in sentence:
        if token in vocab:
            ids.append(vocab[token])
    ids.append(vocab[EOS_TOKEN])
    return ids
|
40 |
+
|
41 |
+
class PseudoCodeDataset(Dataset):
    """Dataset of (source-id-list, target-id-list) pairs from a two-column table.

    The first column becomes the source ("text") and the second the target
    ("code"); *reverse_columns* swaps them first. Vocabularies are either
    built from this data (*build_vocabs=True*) or supplied by the caller.
    """

    def __init__(self, data, src_vocab=None, tgt_vocab=None, build_vocabs=False, reverse_columns=False):
        # Accept either a CSV path or an in-memory DataFrame (copied, not mutated).
        frame = pd.read_csv(data) if isinstance(data, str) else data.copy()
        # Keep only the first two columns, optionally swapped.
        frame = frame.iloc[:, [1, 0]] if reverse_columns else frame.iloc[:, :2]
        frame.columns = ["text", "code"]
        # Missing cells become empty strings so tokenization never fails.
        for column in ("text", "code"):
            frame[column] = frame[column].fillna("")
        frame["src_tokens"] = frame["text"].apply(simple_tokenizer)
        frame["tgt_tokens"] = frame["code"].apply(simple_tokenizer)

        if build_vocabs:
            self.src_vocab = build_vocab(frame["src_tokens"].tolist())
            self.tgt_vocab = build_vocab(frame["tgt_tokens"].tolist())
        else:
            self.src_vocab = src_vocab
            self.tgt_vocab = tgt_vocab

        frame["src_indices"] = frame["src_tokens"].apply(lambda toks: numericalize(toks, self.src_vocab))
        frame["tgt_indices"] = frame["tgt_tokens"].apply(lambda toks: numericalize(toks, self.tgt_vocab))
        self.df = frame
        self.data = list(zip(frame["src_indices"].tolist(), frame["tgt_indices"].tolist()))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]
|
77 |
+
|
78 |
+
def collate_fn(batch):
    """Collate (src, tgt) index lists into two padded LongTensors (batch-first).

    Padding value 0 matches the PAD_TOKEN id assigned by build_vocab.
    """
    sources, targets = zip(*batch)

    def _pad(sequences):
        tensors = [torch.tensor(seq, dtype=torch.long) for seq in sequences]
        return pad_sequence(tensors, batch_first=True, padding_value=0)

    return _pad(sources), _pad(targets)
|
85 |
+
|
86 |
+
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings (Vaswani et al., 2017), then dropout.

    Expects batch-first input of shape (batch, seq_len, d_model).
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        positions = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        inv_freq = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        table = torch.zeros(max_len, d_model)
        table[:, 0::2] = torch.sin(positions * inv_freq)  # even dims: sine
        table[:, 1::2] = torch.cos(positions * inv_freq)  # odd dims: cosine
        # Registered as a buffer (not a parameter) with a leading batch dim of 1
        # so it broadcasts over the batch and moves with .to(device).
        self.register_buffer('pe', table.unsqueeze(0))  # (1, max_len, d_model)

    def forward(self, x):
        # Add the first seq_len rows of the table, then apply dropout.
        return self.dropout(x + self.pe[:, :x.size(1)])
|
98 |
+
|
99 |
+
class Transformer(nn.Module):
    """Seq2seq Transformer for code <-> pseudocode translation.

    Wraps nn.Transformer (sequence-first internally) behind a batch-first
    interface: forward takes (batch, seq) LongTensors and returns logits of
    shape (batch, tgt_seq, tgt_vocab_size).
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=512, nhead=8,
                 num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=2048,
                 dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        self.pos_decoder = PositionalEncoding(d_model, dropout)
        self.transformer = nn.Transformer(d_model, nhead, num_encoder_layers, num_decoder_layers,
                                          dim_feedforward, dropout)
        self.fc_out = nn.Linear(d_model, tgt_vocab_size)

    def generate_square_subsequent_mask(self, sz):
        """Causal mask: 0.0 on/below the diagonal, -inf above (future positions)."""
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def forward(self, src, tgt):
        """Run encoder-decoder; src/tgt are (batch, seq) token-id tensors.

        NOTE(review): no src/tgt key-padding masks are passed, so padded
        positions are attended to — preserved as-is to match the trained
        checkpoint's behavior.
        """
        # Scale embeddings by sqrt(d_model) as in "Attention Is All You Need".
        src_emb = self.pos_encoder(self.src_embedding(src) * math.sqrt(self.d_model))
        tgt_emb = self.pos_decoder(self.tgt_embedding(tgt) * math.sqrt(self.d_model))
        # nn.Transformer here is seq-first: (seq, batch, d_model).
        src_emb = src_emb.transpose(0, 1)
        tgt_emb = tgt_emb.transpose(0, 1)
        tgt_mask = self.generate_square_subsequent_mask(tgt_emb.size(0)).to(src.device)
        output = self.transformer(src_emb, tgt_emb, tgt_mask=tgt_mask)
        # Project to vocabulary logits and restore batch-first layout.
        return self.fc_out(output).transpose(0, 1)
|
129 |
+
|
130 |
+
# Load the SPoC train/eval/test splits (tab-separated files).
dft = pd.read_csv("spoc-train-train.tsv", sep="\t")
dfe = pd.read_csv("spoc-train-eval.tsv", sep="\t")
dfts = pd.read_csv("spoc-train-test.tsv", sep="\t")
# Keep only the first two columns of each split.
# NOTE(review): the original comment claimed "Sample 20% of each dataset",
# but no sampling happens here — the full splits are used.
first_two_columns_train = dft.iloc[:, :2]
first_two_columns_eval = dfe.iloc[:, :2]
first_two_columns_test = dfts.iloc[:, :2]
print("Train Data (first two columns):")
print(first_two_columns_train.head())

# reverse_columns=True swaps the columns, so the second file column becomes
# the model's source and the first becomes the target. Eval/test reuse the
# training vocabularies so token ids stay consistent across splits.
train_dataset = PseudoCodeDataset(first_two_columns_train, build_vocabs=True, reverse_columns=True)
eval_dataset = PseudoCodeDataset(first_two_columns_eval, src_vocab=train_dataset.src_vocab,
                                 tgt_vocab=train_dataset.tgt_vocab, build_vocabs=False, reverse_columns=True)
test_dataset = PseudoCodeDataset(first_two_columns_test, src_vocab=train_dataset.src_vocab,
                                 tgt_vocab=train_dataset.tgt_vocab, build_vocabs=False, reverse_columns=True)
BATCH_SIZE = 64
# Shuffle only the training loader; both use collate_fn for padding.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
eval_loader = DataLoader(eval_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
|
148 |
+
|
149 |
+
# ----- Training Functions -----
|
150 |
+
def train_epoch(model, dataloader, criterion, optimizer, device):
    """Run one optimization pass over *dataloader*; return the mean batch loss."""
    model.train()
    running_loss = 0.0
    bar = tqdm(dataloader, desc="Training", leave=False)
    for src, tgt in bar:
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()
        # Teacher forcing: decoder consumes tgt[:-1], is scored against tgt[1:].
        decoder_input = tgt[:, :-1]
        expected = tgt[:, 1:]
        logits = model(src, decoder_input)
        # Flatten (batch, seq, vocab) -> (batch*seq, vocab) for CrossEntropyLoss.
        loss = criterion(logits.reshape(-1, logits.size(-1)), expected.reshape(-1))
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        bar.set_postfix(loss=loss.item())
    return running_loss / len(dataloader)
|
168 |
+
|
169 |
+
def evaluate(model, dataloader, criterion, device):
    """Compute the mean batch loss over *dataloader* without updating weights."""
    model.eval()
    running_loss = 0.0
    with torch.no_grad():
        bar = tqdm(dataloader, desc="Evaluating", leave=False)
        for src, tgt in bar:
            src, tgt = src.to(device), tgt.to(device)
            # Same teacher-forcing split as training: input tgt[:-1], score tgt[1:].
            decoder_input = tgt[:, :-1]
            expected = tgt[:, 1:]
            logits = model(src, decoder_input)
            loss = criterion(logits.reshape(-1, logits.size(-1)), expected.reshape(-1))
            running_loss += loss.item()
            bar.set_postfix(loss=loss.item())
    return running_loss / len(dataloader)
|
185 |
+
|
186 |
+
def generate_output(model, src_sentence, src_vocab, tgt_vocab, device, max_len=50):
    """Greedy-decode a target string for *src_sentence* (at most *max_len* steps)."""
    model.eval()
    source_ids = numericalize(simple_tokenizer(src_sentence), src_vocab)
    source = torch.tensor(source_ids, dtype=torch.long).unsqueeze(0).to(device)
    decoded = [tgt_vocab[SOS_TOKEN]]
    eos_id = tgt_vocab[EOS_TOKEN]
    for _ in range(max_len):
        target = torch.tensor(decoded, dtype=torch.long).unsqueeze(0).to(device)
        with torch.no_grad():
            logits = model(source, target)
        # Pick the highest-scoring token at the last decoder position.
        best = torch.argmax(logits[0, -1, :]).item()
        decoded.append(best)
        if best == eos_id:
            break
    id_to_token = {index: token for token, index in tgt_vocab.items()}
    specials = (tgt_vocab[SOS_TOKEN], eos_id)
    return " ".join(id_to_token[i] for i in decoded if i not in specials)
|
203 |
+
|
204 |
+
# ----- Training Loop -----
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Transformer(src_vocab_size=len(train_dataset.src_vocab),
                    tgt_vocab_size=len(train_dataset.tgt_vocab)).to(DEVICE)
# NOTE(review): ignore_index uses the SOURCE vocab's pad id for a loss over
# TARGET tokens. Both vocabs map <pad> to 0 here, so the result is the same,
# but tgt_vocab[PAD_TOKEN] would be the safer spelling.
criterion = nn.CrossEntropyLoss(ignore_index=train_dataset.src_vocab[PAD_TOKEN])
optimizer = optim.Adam(model.parameters(), lr=1e-4)
NUM_EPOCHS = 2  # Increase for a better-trained model
# NOTE(review): NUM_EPOCHS is never used — no training loop is actually run
# in this script; the model below is rebuilt and loaded from a checkpoint.


# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Instantiate the model (assuming train_dataset is already defined)
# NOTE(review): this shadows the model created above with a fresh instance.
model = Transformer(
    src_vocab_size=len(train_dataset.src_vocab),
    tgt_vocab_size=len(train_dataset.tgt_vocab)
).to(device)

# Load model checkpoint and set to evaluation mode
model.load_state_dict(torch.load("transformer_code.pth", map_location=device))
model.eval()
|
225 |
+
|
226 |
+
# NOTE(review): this redefines generate_output, shadowing the identical
# function defined earlier in the file — one of the two should be removed.
def generate_output(model, src_sentence, src_vocab, tgt_vocab, device, max_len=50):
    """Greedy-decode a target token string for *src_sentence*.

    Tokenizes and numericalizes the source, then repeatedly feeds the
    partial target back into the model, appending the argmax token each
    step, until <eos> is produced or *max_len* steps have run. The
    returned string excludes <sos>/<eos>.
    """
    model.eval()
    tokens = simple_tokenizer(src_sentence)
    src_indices = numericalize(tokens, src_vocab)
    src_tensor = torch.tensor(src_indices, dtype=torch.long).unsqueeze(0).to(device)
    tgt_indices = [tgt_vocab[SOS_TOKEN]]

    for _ in range(max_len):
        tgt_tensor = torch.tensor(tgt_indices, dtype=torch.long).unsqueeze(0).to(device)
        with torch.no_grad():
            output = model(src_tensor, tgt_tensor)
        # Greedy choice: take the highest-scoring token at the last position.
        next_token = torch.argmax(output[0, -1, :]).item()
        tgt_indices.append(next_token)
        if next_token == tgt_vocab[EOS_TOKEN]:
            break

    # Invert the vocab to map ids back to token strings.
    inv_tgt_vocab = {v: k for k, v in tgt_vocab.items()}
    generated_tokens = [
        inv_tgt_vocab[idx] for idx in tgt_indices
        if idx not in (tgt_vocab[SOS_TOKEN], tgt_vocab[EOS_TOKEN])
    ]
    return " ".join(generated_tokens)
|
248 |
+
|
249 |
+
# ----- Inference Example -----
# Quick sanity check: translate one hard-coded C++ statement.
sample_code = "cin >> s;"
pseudo_text = generate_output(model, sample_code, train_dataset.src_vocab, train_dataset.tgt_vocab, device)
print("\nSample C++ Code:")
print(sample_code)
print("\nGenerated Pseudocode:")
print(pseudo_text)
|
256 |
+
|
257 |
+
import gradio as gr
import torch

# Load model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Instantiate the model (assuming train_dataset is already defined)
# NOTE(review): this is the third time the model is constructed and the
# second time the checkpoint is loaded in this script — the instance built
# just above is discarded.
model = Transformer(
    src_vocab_size=len(train_dataset.src_vocab),
    tgt_vocab_size=len(train_dataset.tgt_vocab)
).to(device)

# Load model checkpoint and set to evaluation mode
model.load_state_dict(torch.load("transformer_code.pth", map_location=device))
model.eval()

# Define inference function
def generate_pseudocode(code):
    """Gradio callback: greedy-decode pseudocode for the given C++ snippet."""
    generated_pseudo = generate_output(model, code, train_dataset.src_vocab, train_dataset.tgt_vocab, device)
    return generated_pseudo

# Gradio UI
demo = gr.Interface(
    fn=generate_pseudocode,
    inputs=gr.Textbox(lines=5, placeholder="Enter C++ code here..."),
    outputs=gr.Textbox(label="Generated Pseudocode"),
    title="Code to Pseudocode Generator",
    description="Enter C++ code, and the model will generate pseudocode."
)

demo.launch()
|
288 |
+
|
spoc-train-eval.tsv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
spoc-train-test.tsv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
spoc-train-train.tsv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:06ca8235985484651aa7761af60eadffacb85a201261e01c9af54b1c83870648
|
3 |
+
size 14165947
|
transformer_code.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d4d188a72d9a15f6791a05c1b211b359c119ed80a23f5d466717152664ba00bc
|
3 |
+
size 371538696
|