NoorAfaqi committed
Commit eacad74 · verified · 1 Parent(s): 58f43d3

Upload 5 files

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+spoc-train-train.tsv filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,253 @@
+# -*- coding: utf-8 -*-
+"""F219273-F219151_A3_Q2.ipynb
+
+Automatically generated by Colab.
+
+Original file is located at
+    https://colab.research.google.com/drive/18DYHQ-7maDZJIdgRFN2yLqXw713PdPdb
+"""
+
+import math
+import random
+import time
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import torch.nn.functional as F
+import pandas as pd
+from collections import Counter
+from torch.nn.utils.rnn import pad_sequence
+from torch.utils.data import DataLoader, Dataset
+from tqdm import tqdm
+
+PAD_TOKEN = "<pad>"
+SOS_TOKEN = "<sos>"
+EOS_TOKEN = "<eos>"
+
+def simple_tokenizer(text):
+    return text.strip().split()
+
+def build_vocab(sentences, min_freq=1):
+    counts = Counter(token for sentence in sentences for token in sentence)
+    vocab = {PAD_TOKEN: 0, SOS_TOKEN: 1, EOS_TOKEN: 2}
+    idx = len(vocab)
+    for token, count in counts.items():
+        if count >= min_freq and token not in vocab:
+            vocab[token] = idx
+            idx += 1
+    return vocab
+
+def numericalize(sentence, vocab):
+    # Wrap the sequence in <sos>/<eos>; out-of-vocabulary tokens are dropped
+    return [vocab[SOS_TOKEN]] + [vocab[token] for token in sentence if token in vocab] + [vocab[EOS_TOKEN]]
+
+class PseudoCodeDataset(Dataset):
+    def __init__(self, data, src_vocab=None, tgt_vocab=None, build_vocabs=False, reverse_columns=False):
+        if isinstance(data, str):
+            self.df = pd.read_csv(data)
+        else:
+            self.df = data.copy()
+        # Swap the first two columns if requested
+        if reverse_columns:
+            self.df = self.df.iloc[:, [1, 0]]
+        else:
+            self.df = self.df.iloc[:, :2]
+        # Rename columns: the first becomes "text" (source), the second "code" (target)
+        self.df.columns = ["text", "code"]
+        # Fill missing values in both columns
+        self.df["text"] = self.df["text"].fillna("")
+        self.df["code"] = self.df["code"].fillna("")
+        # Tokenize the source and target strings
+        self.df["src_tokens"] = self.df["text"].apply(simple_tokenizer)
+        self.df["tgt_tokens"] = self.df["code"].apply(simple_tokenizer)
+
+        if build_vocabs:
+            self.src_vocab = build_vocab(self.df["src_tokens"].tolist())
+            self.tgt_vocab = build_vocab(self.df["tgt_tokens"].tolist())
+        else:
+            self.src_vocab = src_vocab
+            self.tgt_vocab = tgt_vocab
+
+        self.df["src_indices"] = self.df["src_tokens"].apply(lambda tokens: numericalize(tokens, self.src_vocab))
+        self.df["tgt_indices"] = self.df["tgt_tokens"].apply(lambda tokens: numericalize(tokens, self.tgt_vocab))
+        self.data = list(zip(self.df["src_indices"].tolist(), self.df["tgt_indices"].tolist()))
+
+    def __len__(self):
+        return len(self.data)
+
+    def __getitem__(self, idx):
+        return self.data[idx]
+
+def collate_fn(batch):
+    # Pad every sequence in the batch to the longest one (PAD index is 0)
+    src_batch, tgt_batch = zip(*batch)
+    src_tensors = [torch.tensor(seq, dtype=torch.long) for seq in src_batch]
+    tgt_tensors = [torch.tensor(seq, dtype=torch.long) for seq in tgt_batch]
+    src_padded = pad_sequence(src_tensors, batch_first=True, padding_value=0)
+    tgt_padded = pad_sequence(tgt_tensors, batch_first=True, padding_value=0)
+    return src_padded, tgt_padded
+
+class PositionalEncoding(nn.Module):
+    def __init__(self, d_model, dropout=0.1, max_len=5000):
+        super().__init__()
+        self.dropout = nn.Dropout(p=dropout)
+        pe = torch.zeros(max_len, d_model)
+        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
+        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
+        pe[:, 0::2] = torch.sin(position * div_term)
+        pe[:, 1::2] = torch.cos(position * div_term)
+        self.register_buffer('pe', pe.unsqueeze(0))  # (1, max_len, d_model)
+
+    def forward(self, x):
+        return self.dropout(x + self.pe[:, :x.size(1)])
+
+class Transformer(nn.Module):
+    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=512, nhead=8,
+                 num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=2048,
+                 dropout=0.1):
+        super().__init__()
+        self.d_model = d_model
+        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
+        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)
+        self.pos_encoder = PositionalEncoding(d_model, dropout)
+        self.pos_decoder = PositionalEncoding(d_model, dropout)
+        self.transformer = nn.Transformer(d_model, nhead, num_encoder_layers, num_decoder_layers,
+                                          dim_feedforward, dropout)
+        self.fc_out = nn.Linear(d_model, tgt_vocab_size)
+
+    def generate_square_subsequent_mask(self, sz):
+        # Causal mask: position i may only attend to positions <= i
+        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
+        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
+        return mask
+
+    def forward(self, src, tgt):
+        src_emb = self.src_embedding(src) * math.sqrt(self.d_model)
+        src_emb = self.pos_encoder(src_emb)
+        tgt_emb = self.tgt_embedding(tgt) * math.sqrt(self.d_model)
+        tgt_emb = self.pos_decoder(tgt_emb)
+        # nn.Transformer expects (seq_len, batch, d_model) by default
+        src_emb = src_emb.transpose(0, 1)
+        tgt_emb = tgt_emb.transpose(0, 1)
+        tgt_mask = self.generate_square_subsequent_mask(tgt_emb.size(0)).to(src.device)
+        output = self.transformer(src_emb, tgt_emb, tgt_mask=tgt_mask)
+        output = self.fc_out(output)
+        return output.transpose(0, 1)  # back to (batch, seq_len, vocab)
+
+dft = pd.read_csv("spoc-train-train.tsv", sep="\t")
+dfe = pd.read_csv("spoc-train-eval.tsv", sep="\t")
+dfts = pd.read_csv("spoc-train-test.tsv", sep="\t")
+# Keep only the first two columns of each split
+first_two_columns_train = dft.iloc[:, :2]
+first_two_columns_eval = dfe.iloc[:, :2]
+first_two_columns_test = dfts.iloc[:, :2]
+print("Train Data (first two columns):")
+print(first_two_columns_train.head())
+
+# reverse_columns=True makes C++ code the source and pseudocode the target
+train_dataset = PseudoCodeDataset(first_two_columns_train, build_vocabs=True, reverse_columns=True)
+eval_dataset = PseudoCodeDataset(first_two_columns_eval, src_vocab=train_dataset.src_vocab,
+                                 tgt_vocab=train_dataset.tgt_vocab, build_vocabs=False, reverse_columns=True)
+test_dataset = PseudoCodeDataset(first_two_columns_test, src_vocab=train_dataset.src_vocab,
+                                 tgt_vocab=train_dataset.tgt_vocab, build_vocabs=False, reverse_columns=True)
+BATCH_SIZE = 64
+train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
+eval_loader = DataLoader(eval_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
+
+# ----- Training Functions -----
+def train_epoch(model, dataloader, criterion, optimizer, device):
+    model.train()
+    total_loss = 0
+    progress_bar = tqdm(dataloader, desc="Training", leave=False)
+    for src_batch, tgt_batch in progress_bar:
+        src_batch, tgt_batch = src_batch.to(device), tgt_batch.to(device)
+        optimizer.zero_grad()
+        # Teacher forcing: feed the target shifted right, predict the next token
+        tgt_input = tgt_batch[:, :-1]
+        tgt_expected = tgt_batch[:, 1:]
+        output = model(src_batch, tgt_input)
+        output = output.reshape(-1, output.size(-1))
+        tgt_expected = tgt_expected.reshape(-1)
+        loss = criterion(output, tgt_expected)
+        loss.backward()
+        optimizer.step()
+        total_loss += loss.item()
+        progress_bar.set_postfix(loss=loss.item())
+    return total_loss / len(dataloader)
+
+def evaluate(model, dataloader, criterion, device):
+    model.eval()
+    total_loss = 0
+    with torch.no_grad():
+        progress_bar = tqdm(dataloader, desc="Evaluating", leave=False)
+        for src_batch, tgt_batch in progress_bar:
+            src_batch, tgt_batch = src_batch.to(device), tgt_batch.to(device)
+            tgt_input = tgt_batch[:, :-1]
+            tgt_expected = tgt_batch[:, 1:]
+            output = model(src_batch, tgt_input)
+            output = output.reshape(-1, output.size(-1))
+            tgt_expected = tgt_expected.reshape(-1)
+            loss = criterion(output, tgt_expected)
+            total_loss += loss.item()
+            progress_bar.set_postfix(loss=loss.item())
+    return total_loss / len(dataloader)
+
+def generate_output(model, src_sentence, src_vocab, tgt_vocab, device, max_len=50):
+    # Greedy decoding: repeatedly feed the prefix and append the argmax token
+    model.eval()
+    tokens = simple_tokenizer(src_sentence)
+    src_indices = numericalize(tokens, src_vocab)
+    src_tensor = torch.tensor(src_indices, dtype=torch.long).unsqueeze(0).to(device)
+    tgt_indices = [tgt_vocab[SOS_TOKEN]]
+    for _ in range(max_len):
+        tgt_tensor = torch.tensor(tgt_indices, dtype=torch.long).unsqueeze(0).to(device)
+        with torch.no_grad():
+            output = model(src_tensor, tgt_tensor)
+        next_token = torch.argmax(output[0, -1, :]).item()
+        tgt_indices.append(next_token)
+        if next_token == tgt_vocab[EOS_TOKEN]:
+            break
+    inv_tgt_vocab = {v: k for k, v in tgt_vocab.items()}
+    generated_tokens = [
+        inv_tgt_vocab[idx] for idx in tgt_indices
+        if idx not in (tgt_vocab[SOS_TOKEN], tgt_vocab[EOS_TOKEN])
+    ]
+    return " ".join(generated_tokens)
+
+# ----- Model Setup -----
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model = Transformer(src_vocab_size=len(train_dataset.src_vocab),
+                    tgt_vocab_size=len(train_dataset.tgt_vocab)).to(device)
+criterion = nn.CrossEntropyLoss(ignore_index=train_dataset.tgt_vocab[PAD_TOKEN])
+optimizer = optim.Adam(model.parameters(), lr=1e-4)
+NUM_EPOCHS = 2  # Increase for a better-trained model
+
+# The training loop is not run in this app; load the pre-trained checkpoint
+# and set the model to evaluation mode instead
+model.load_state_dict(torch.load("transformer_code.pth", map_location=device))
+model.eval()
+
+# ----- Inference Example -----
+sample_code = "cin >> s;"
+generated_pseudo = generate_output(model, sample_code, train_dataset.src_vocab, train_dataset.tgt_vocab, device)
+print("\nSample C++ Code:")
+print(sample_code)
+print("\nGenerated Pseudocode:")
+print(generated_pseudo)
+
+import gradio as gr
+
+# Inference function used by the UI
+def generate_pseudocode(code):
+    return generate_output(model, code, train_dataset.src_vocab, train_dataset.tgt_vocab, device)
+
+# Gradio UI
+demo = gr.Interface(
+    fn=generate_pseudocode,
+    inputs=gr.Textbox(lines=5, placeholder="Enter C++ code here..."),
+    outputs=gr.Textbox(label="Generated Pseudocode"),
+    title="Code to Pseudocode Generator",
+    description="Enter C++ code, and the model will generate pseudocode."
+)

+demo.launch()
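For reference, once app.py is running, the interface can also be exercised programmatically. A minimal sketch, assuming the default local Gradio endpoint and the gradio_client package (neither is part of this commit):

from gradio_client import Client

# Connect to the locally launched demo (Gradio's default address)
client = Client("http://127.0.0.1:7860/")
# A single-function gr.Interface exposes its endpoint as "/predict"
result = client.predict("cin >> s;", api_name="/predict")
print(result)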
spoc-train-eval.tsv ADDED
The diff for this file is too large to render. See raw diff
 
spoc-train-test.tsv ADDED
The diff for this file is too large to render. See raw diff
 
spoc-train-train.tsv ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:06ca8235985484651aa7761af60eadffacb85a201261e01c9af54b1c83870648
+size 14165947
transformer_code.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d4d188a72d9a15f6791a05c1b211b359c119ed80a23f5d466717152664ba00bc
+size 371538696