Aditi committed on
Commit fd13009 · 1 Parent(s): f26c53f
FineTuneAndEvaluationscores_CLEANED.ipynb → FineTuneAndEvaluationscores.ipynb RENAMED
File without changes
finetuneandevaluationscores.py DELETED
@@ -1,710 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- """FineTuneAndEvaluationscores.ipynb
3
-
4
- Automatically generated by Colab.
5
-
6
- Original file is located at
7
- https://colab.research.google.com/drive/122o4g9XIObEOsSOo8-ZcfE0tgGRG-QrV
8
- """
9
-
10
- !pip install torch==2.4.1 transformers==4.44.2 datasets==3.0.1 nltk==3.9.1 pandas==2.2.3 matplotlib==3.8.4 evaluate==0.4.5 "rouge_score>=0.1.2" sentence-transformers==2.7.0 -q
11
-
12
- # Uninstall conflicting packages
13
- !pip uninstall -y torch torchvision torchaudio pandas fsspec gcsfs -q
14
- # Install compatible versions
15
- !pip install torch torchvision torchaudio pandas transformers datasets nltk matplotlib evaluate rouge_score sentence-transformers -q
16
-
17
- !wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json -O train-v1.1.json
18
-
19
- import json
20
-
21
- with open('train-v1.1.json', 'r', encoding='utf-8') as f:
22
- squad_data = json.load(f)
23
-
24
- # Print the first paragraph to inspect
25
- print("Sample data:", squad_data['data'][0]['paragraphs'][0])
26
-
27
- import pandas as pd
28
- from datasets import Dataset, Features, Value
29
-
30
- data = []
31
- for article in squad_data['data']:
32
- for paragraph in article['paragraphs']:
33
- context = paragraph['context'].strip()
34
- for qa in paragraph['qas']:
35
- question = qa['question'].strip()
36
- answer = qa['answers'][0]['text'].strip() if qa['answers'] else ""
37
- if context and question and answer: # Basic cleaning
38
- data.append({"context": context, "question": question, "answer": answer})
39
-
40
- # Limit to 100 samples for quick testing
41
- data = data[:100]
42
-
43
- # Create DataFrame and Dataset
44
- df = pd.DataFrame(data)
45
- features = Features({
46
- "context": Value("string"),
47
- "question": Value("string"),
48
- "answer": Value("string")
49
- })
50
- dataset = Dataset.from_pandas(df, features=features)
51
- train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
52
- train_dataset = train_test_split["train"]
53
- eval_dataset = train_test_split["test"]
54
-
55
- print(f"Train size: {len(train_dataset)} | Eval size: {len(eval_dataset)}")
56
- print("First train example:", train_dataset[0])
57
-
58
-
59
-
60
- # Install dependencies
61
- !pip uninstall -y torch torchvision torchaudio pandas fsspec gcsfs -q
62
- !pip install torch torchvision torchaudio pandas transformers datasets nltk matplotlib evaluate rouge_score sentence-transformers -q
63
- # Restart runtime after installation
64
-
65
- import json
66
- import pandas as pd
67
- from datasets import Dataset, Features, Value
68
- from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer
69
- import evaluate
70
- import matplotlib.pyplot as plt
71
- import torch
72
- import nltk
73
- import numpy as np # Added missing import
74
- nltk.download('punkt')
75
-
76
- # Verify setup
77
- print(f"Torch version: {torch.__version__}")
78
- print(f"GPU available: {torch.cuda.is_available()}")
79
-
80
- # Step 2: Download and load dataset
81
- !wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json -O train-v1.1.json
82
- with open('train-v1.1.json', 'r', encoding='utf-8') as f:
83
- squad_data = json.load(f)
84
- print("Sample data:", squad_data['data'][0]['paragraphs'][0])
85
-
86
- # Step 3: Clean and prepare dataset
87
- data = []
88
- for article in squad_data['data']:
89
- for paragraph in article['paragraphs']:
90
- context = paragraph['context'].strip()
91
- for qa in paragraph['qas']:
92
- question = qa['question'].strip()
93
- answer = qa['answers'][0]['text'].strip() if qa['answers'] else ""
94
- if context and question and answer:
95
- data.append({"context": context, "question": question, "answer": answer})
96
-
97
- data = data[:100]
98
- df = pd.DataFrame(data)
99
- features = Features({
100
- "context": Value("string"),
101
- "question": Value("string"),
102
- "answer": Value("string")
103
- })
104
- dataset = Dataset.from_pandas(df, features=features)
105
- train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
106
- train_dataset = train_test_split["train"]
107
- eval_dataset = train_test_split["test"]
108
- print(f"Train size: {len(train_dataset)} | Eval size: {len(eval_dataset)}")
109
- print("First train example:", train_dataset[0])
110
-
111
- # Step 4: Fine-tune the model
112
- model_name = "valhalla/t5-small-qg-hl"
113
- tokenizer = T5Tokenizer.from_pretrained(model_name)
114
- model = T5ForConditionalGeneration.from_pretrained(model_name)
115
-
116
- def preprocess(examples):
117
- inputs = [f"generate question: {ctx} {ans}" for ctx, ans in zip(examples['context'], examples['answer'])]
118
- targets = examples['question']
119
- model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding="max_length", return_tensors=None)
120
- labels = tokenizer(targets, max_length=32, truncation=True, padding="max_length")["input_ids"]
121
- model_inputs["labels"] = labels
122
- return model_inputs
123
-
124
- tokenized_train_dataset = train_dataset.map(preprocess, remove_columns=train_dataset.column_names, batched=True)
125
- tokenized_eval_dataset = eval_dataset.map(preprocess, remove_columns=eval_dataset.column_names, batched=True)
126
-
127
- tokenized_train_dataset = tokenized_train_dataset.with_format("torch")
128
- tokenized_eval_dataset = tokenized_eval_dataset.with_format("torch")
129
-
130
- training_args = TrainingArguments(
131
- output_dir="./qg-finetuned",
132
- per_device_train_batch_size=2,
133
- per_device_eval_batch_size=2,
134
- num_train_epochs=3, # Increased to 3
135
- eval_strategy="epoch",
136
- learning_rate=2e-5,
137
- logging_dir="./logs",
138
- logging_steps=10,
139
- save_strategy="epoch",
140
- save_total_limit=1,
141
- fp16=True,
142
- report_to="none",
143
- load_best_model_at_end=True,
144
- metric_for_best_model="eval_loss",
145
- greater_is_better=False
146
- )
147
-
148
-
149
- def compute_metrics(eval_pred):
150
- predictions, labels = eval_pred
151
- predictions = predictions[0] if isinstance(predictions, tuple) else predictions
152
- predictions = np.argmax(predictions, axis=-1) if predictions.ndim == 3 else predictions
153
- labels = np.argmax(labels, axis=-1) if labels.ndim == 3 else labels
154
-
155
- def decode_sequences(sequences):
156
- return [tokenizer.decode(seq, skip_special_tokens=True) for seq in sequences]
157
-
158
- decoded_preds = decode_sequences(predictions)
159
- decoded_labels = decode_sequences(labels)
160
-
161
- rouge = evaluate.load("rouge")
162
- rouge_score = rouge.compute(predictions=decoded_preds, references=decoded_labels)
163
-
164
- return {
165
- "rouge1": rouge_score["rouge1"],
166
- "rougeL": rouge_score["rougeL"]
167
- }
168
-
169
- trainer = Trainer(
170
- model=model,
171
- args=training_args,
172
- train_dataset=tokenized_train_dataset,
173
- eval_dataset=tokenized_eval_dataset,
174
- compute_metrics=compute_metrics
175
- )
176
-
177
- print("Fine-tuning started...")
178
- trainer.train()
179
- print("Running final evaluation...")
180
- results = trainer.evaluate()
181
- print("Final Evaluation Results:")
182
- for metric, score in results.items():
183
- print(f" {metric}: {score}")
184
-
185
- # Step 5: Generate and evaluate sample questions
186
- from transformers import GenerationConfig
187
- model.eval()
188
- sample = eval_dataset[0]
189
- inputs = tokenizer(f"generate question: {sample['context']} {sample['answer']}", max_length=256, truncation=True, padding="max_length", return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")
190
-
191
- generation_config = GenerationConfig(early_stopping=True, num_beams=5, max_length=128) # Adjusted
192
- outputs = model.generate(**inputs, generation_config=generation_config)
193
- generated_question = tokenizer.decode(outputs[0], skip_special_tokens=True)
194
-
195
- print(f"Context: {sample['context'][:100]}...")
196
- print(f"Answer: {sample['answer']}")
197
- print(f"Generated Question: {generated_question}")
198
- print(f"Reference Question: {sample['question']}")
199
-
200
- # Step 6: Plot evaluation scores
201
- log_history = trainer.state.log_history
202
- epochs = [entry['epoch'] for entry in log_history if 'eval_rouge1' in entry]
203
- rouge1_scores = [entry['eval_rouge1'] for entry in log_history if 'eval_rouge1' in entry]
204
- rougeL_scores = [entry['eval_rougeL'] for entry in log_history if 'eval_rougeL' in entry]
205
-
206
- plt.figure(figsize=(10, 5))
207
- plt.plot(epochs, rouge1_scores, label='ROUGE-1')
208
- plt.plot(epochs, rougeL_scores, label='ROUGE-L')
209
- plt.xlabel('Epoch')
210
- plt.ylabel('Score')
211
- plt.title('Evaluation Scores Over Epochs')
212
- plt.legend()
213
- plt.grid(True)
214
- plt.show()
215
-
216
- # Step 7: Save the model
217
- model.save_pretrained("./qg-finetuned/final")
218
- tokenizer.save_pretrained("./qg-finetuned/final")
219
- print("Model and tokenizer saved!")
220
-
221
-
222
-
223
- # Install dependencies
224
- !pip uninstall -y torch torchvision torchaudio pandas fsspec gcsfs -q
225
- !pip install torch torchvision torchaudio pandas transformers datasets nltk matplotlib evaluate rouge_score sentence-transformers -q
226
- # Restart runtime after installation
227
-
228
- import json
229
- import pandas as pd
230
- from datasets import Dataset, Features, Value
231
- from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer
232
- import evaluate
233
- import matplotlib.pyplot as plt
234
- import torch
235
- import nltk
236
- import numpy as np # Added missing import
237
- nltk.download('punkt')
238
-
239
- # Verify setup
240
- print(f"Torch version: {torch.__version__}")
241
- print(f"GPU available: {torch.cuda.is_available()}")
242
-
243
- # Step 2: Download and load dataset
244
- !wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json -O train-v1.1.json
245
- with open('train-v1.1.json', 'r', encoding='utf-8') as f:
246
- squad_data = json.load(f)
247
- print("Sample data:", squad_data['data'][0]['paragraphs'][0])
248
-
249
- # Step 3: Clean and prepare dataset
250
- data = []
251
- for article in squad_data['data']:
252
- for paragraph in article['paragraphs']:
253
- context = paragraph['context'].strip()
254
- for qa in paragraph['qas']:
255
- question = qa['question'].strip()
256
- answer = qa['answers'][0]['text'].strip() if qa['answers'] else ""
257
- if context and question and answer:
258
- data.append({"context": context, "question": question, "answer": answer})
259
-
260
- data = data[:800]
261
- df = pd.DataFrame(data)
262
- features = Features({
263
- "context": Value("string"),
264
- "question": Value("string"),
265
- "answer": Value("string")
266
- })
267
- dataset = Dataset.from_pandas(df, features=features)
268
- train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
269
- train_dataset = train_test_split["train"]
270
- eval_dataset = train_test_split["test"]
271
- print(f"Train size: {len(train_dataset)} | Eval size: {len(eval_dataset)}")
272
- print("First train example:", train_dataset[0])
273
-
274
- # Step 4: Fine-tune the model
275
- model_name = "valhalla/t5-small-qg-hl"
276
- tokenizer = T5Tokenizer.from_pretrained(model_name)
277
- model = T5ForConditionalGeneration.from_pretrained(model_name)
278
-
279
- def preprocess(examples):
280
- inputs = [f"generate question: {ctx} {ans}" for ctx, ans in zip(examples['context'], examples['answer'])]
281
- targets = examples['question']
282
- model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding="max_length", return_tensors=None)
283
- labels = tokenizer(targets, max_length=32, truncation=True, padding="max_length")["input_ids"]
284
- model_inputs["labels"] = labels
285
- return model_inputs
286
-
287
- tokenized_train_dataset = train_dataset.map(preprocess, remove_columns=train_dataset.column_names, batched=True)
288
- tokenized_eval_dataset = eval_dataset.map(preprocess, remove_columns=eval_dataset.column_names, batched=True)
289
-
290
- tokenized_train_dataset = tokenized_train_dataset.with_format("torch")
291
- tokenized_eval_dataset = tokenized_eval_dataset.with_format("torch")
292
-
293
- training_args = TrainingArguments(
294
- output_dir="./qg-finetuned",
295
- per_device_train_batch_size=4,
296
- per_device_eval_batch_size=4,
297
- num_train_epochs=2,
298
- eval_strategy="epoch",
299
- learning_rate=2e-5,
300
- logging_dir="./logs",
301
- logging_steps=10,
302
- save_strategy="epoch",
303
- save_total_limit=1,
304
- fp16=True,
305
- report_to="none",
306
- load_best_model_at_end=True,
307
- metric_for_best_model="eval_loss",
308
- greater_is_better=False
309
- )
310
-
311
-
312
- def compute_metrics(eval_pred):
313
- predictions, labels = eval_pred
314
- predictions = predictions[0] if isinstance(predictions, tuple) else predictions
315
- predictions = np.argmax(predictions, axis=-1) if predictions.ndim == 3 else predictions
316
- labels = np.argmax(labels, axis=-1) if labels.ndim == 3 else labels
317
-
318
- def decode_sequences(sequences):
319
- return [tokenizer.decode(seq, skip_special_tokens=True) for seq in sequences]
320
-
321
- decoded_preds = decode_sequences(predictions)
322
- decoded_labels = decode_sequences(labels)
323
-
324
- rouge = evaluate.load("rouge")
325
- rouge_score = rouge.compute(predictions=decoded_preds, references=decoded_labels)
326
-
327
- return {
328
- "rouge1": rouge_score["rouge1"],
329
- "rougeL": rouge_score["rougeL"]
330
- }
331
-
332
- trainer = Trainer(
333
- model=model,
334
- args=training_args,
335
- train_dataset=tokenized_train_dataset,
336
- eval_dataset=tokenized_eval_dataset,
337
- compute_metrics=compute_metrics
338
- )
339
-
340
- print("Fine-tuning started...")
341
- trainer.train()
342
- print("Running final evaluation...")
343
- results = trainer.evaluate()
344
- print("Final Evaluation Results:")
345
- for metric, score in results.items():
346
- print(f" {metric}: {score}")
347
-
348
- # Step 5: Generate and evaluate sample questions
349
- from transformers import GenerationConfig
350
- model.eval()
351
- sample = eval_dataset[0]
352
- inputs = tokenizer(f"generate question: {sample['context']} {sample['answer']}", max_length=256, truncation=True, padding="max_length", return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")
353
-
354
- generation_config = GenerationConfig(early_stopping=True, num_beams=5, max_length=128) # Adjusted
355
- outputs = model.generate(**inputs, generation_config=generation_config)
356
- generated_question = tokenizer.decode(outputs[0], skip_special_tokens=True)
357
-
358
- print(f"Context: {sample['context'][:100]}...")
359
- print(f"Answer: {sample['answer']}")
360
- print(f"Generated Question: {generated_question}")
361
- print(f"Reference Question: {sample['question']}")
362
-
363
- # Step 6: Plot evaluation scores
364
- log_history = trainer.state.log_history
365
- epochs = [entry['epoch'] for entry in log_history if 'eval_rouge1' in entry]
366
- rouge1_scores = [entry['eval_rouge1'] for entry in log_history if 'eval_rouge1' in entry]
367
- rougeL_scores = [entry['eval_rougeL'] for entry in log_history if 'eval_rougeL' in entry]
368
-
369
- plt.figure(figsize=(10, 5))
370
- plt.plot(epochs, rouge1_scores, label='ROUGE-1')
371
- plt.plot(epochs, rougeL_scores, label='ROUGE-L')
372
- plt.xlabel('Epoch')
373
- plt.ylabel('Score')
374
- plt.title('Evaluation Scores Over Epochs')
375
- plt.legend()
376
- plt.grid(True)
377
- plt.show()
378
-
379
- # Step 7: Save the model
380
- model.save_pretrained("./qg-finetuned/final")
381
- tokenizer.save_pretrained("./qg-finetuned/final")
382
- print("Model and tokenizer saved!")
383
-
384
-
385
-
386
- # Install dependencies
387
- !pip uninstall -y torch torchvision torchaudio pandas fsspec gcsfs -q
388
- !pip install torch torchvision torchaudio pandas transformers datasets nltk matplotlib evaluate rouge_score sentence-transformers -q
389
- # Restart runtime after installation
390
-
391
- import json
392
- import pandas as pd
393
- from datasets import Dataset, Features, Value
394
- from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer
395
- import evaluate
396
- import matplotlib.pyplot as plt
397
- import torch
398
- import nltk
399
- import numpy as np # Added missing import
400
- nltk.download('punkt')
401
-
402
- # Verify setup
403
- print(f"Torch version: {torch.__version__}")
404
- print(f"GPU available: {torch.cuda.is_available()}")
405
-
406
- # Step 2: Download and load dataset
407
- !wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json -O train-v1.1.json
408
- with open('train-v1.1.json', 'r', encoding='utf-8') as f:
409
- squad_data = json.load(f)
410
- print("Sample data:", squad_data['data'][0]['paragraphs'][0])
411
-
412
- # Step 3: Clean and prepare dataset
413
- data = []
414
- for article in squad_data['data']:
415
- for paragraph in article['paragraphs']:
416
- context = paragraph['context'].strip()
417
- for qa in paragraph['qas']:
418
- question = qa['question'].strip()
419
- answer = qa['answers'][0]['text'].strip() if qa['answers'] else ""
420
- if context and question and answer:
421
- data.append({"context": context, "question": question, "answer": answer})
422
-
423
- data = data[:800]
424
- df = pd.DataFrame(data)
425
- features = Features({
426
- "context": Value("string"),
427
- "question": Value("string"),
428
- "answer": Value("string")
429
- })
430
- dataset = Dataset.from_pandas(df, features=features)
431
- train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
432
- train_dataset = train_test_split["train"]
433
- eval_dataset = train_test_split["test"]
434
- print(f"Train size: {len(train_dataset)} | Eval size: {len(eval_dataset)}")
435
- print("First train example:", train_dataset[0])
436
-
437
- # Step 4: Fine-tune the model
438
- model_name = "valhalla/t5-small-qg-hl"
439
- tokenizer = T5Tokenizer.from_pretrained(model_name)
440
- model = T5ForConditionalGeneration.from_pretrained(model_name)
441
-
442
- def preprocess(examples):
443
- inputs = []
444
- for ctx, ans in zip(examples['context'], examples['answer']):
445
- if ans in ctx:
446
- highlighted = ctx.replace(ans, f"<hl> {ans} <hl>")
447
- inputs.append(f"generate question: {highlighted}")
448
- else:
449
- inputs.append(f"generate question: {ctx} <hl> {ans} <hl>")
450
- targets = examples['question']
451
- model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding="max_length", return_tensors=None)
452
- labels = tokenizer(targets, max_length=32, truncation=True, padding="max_length")["input_ids"]
453
- model_inputs["labels"] = labels
454
- return model_inputs
455
-
456
- tokenized_train_dataset = train_dataset.map(preprocess, remove_columns=train_dataset.column_names, batched=True)
457
- tokenized_eval_dataset = eval_dataset.map(preprocess, remove_columns=eval_dataset.column_names, batched=True)
458
-
459
- tokenized_train_dataset = tokenized_train_dataset.with_format("torch")
460
- tokenized_eval_dataset = tokenized_eval_dataset.with_format("torch")
461
-
462
- training_args = TrainingArguments(
463
- output_dir="./qg-finetuned",
464
- per_device_train_batch_size=4,
465
- per_device_eval_batch_size=4,
466
- num_train_epochs=2,
467
- eval_strategy="epoch",
468
- learning_rate=2e-5,
469
- logging_dir="./logs",
470
- logging_steps=10,
471
- save_strategy="epoch",
472
- save_total_limit=1,
473
- fp16=True,
474
- report_to="none",
475
- load_best_model_at_end=True,
476
- metric_for_best_model="eval_loss",
477
- greater_is_better=False
478
- )
479
-
480
-
481
- def compute_metrics(eval_pred):
482
- predictions, labels = eval_pred
483
- predictions = predictions[0] if isinstance(predictions, tuple) else predictions
484
- predictions = np.argmax(predictions, axis=-1) if predictions.ndim == 3 else predictions
485
- labels = np.argmax(labels, axis=-1) if labels.ndim == 3 else labels
486
-
487
- def decode_sequences(sequences):
488
- return [tokenizer.decode(seq, skip_special_tokens=True) for seq in sequences]
489
-
490
- decoded_preds = decode_sequences(predictions)
491
- decoded_labels = decode_sequences(labels)
492
-
493
- rouge = evaluate.load("rouge")
494
- rouge_score = rouge.compute(predictions=decoded_preds, references=decoded_labels)
495
-
496
- return {
497
- "rouge1": rouge_score["rouge1"],
498
- "rougeL": rouge_score["rougeL"]
499
- }
500
-
501
- trainer = Trainer(
502
- model=model,
503
- args=training_args,
504
- train_dataset=tokenized_train_dataset,
505
- eval_dataset=tokenized_eval_dataset,
506
- compute_metrics=compute_metrics
507
- )
508
-
509
- print("Fine-tuning started...")
510
- trainer.train()
511
- print("Running final evaluation...")
512
- results = trainer.evaluate()
513
- print("Final Evaluation Results:")
514
- for metric, score in results.items():
515
- print(f" {metric}: {score}")
516
-
517
- # Step 5: Generate and evaluate sample questions
518
- from transformers import GenerationConfig
519
- model.eval()
520
- sample = eval_dataset[0]
521
- inputs = tokenizer(f"generate question: {sample['context']} {sample['answer']}", max_length=256, truncation=True, padding="max_length", return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")
522
-
523
- generation_config = GenerationConfig(early_stopping=True, num_beams=5, max_length=128) # Adjusted
524
- outputs = model.generate(**inputs, generation_config=generation_config)
525
- generated_question = tokenizer.decode(outputs[0], skip_special_tokens=True)
526
-
527
- print(f"Context: {sample['context'][:100]}...")
528
- print(f"Answer: {sample['answer']}")
529
- print(f"Generated Question: {generated_question}")
530
- print(f"Reference Question: {sample['question']}")
531
-
532
- # Step 6: Plot evaluation scores
533
- log_history = trainer.state.log_history
534
- epochs = [entry['epoch'] for entry in log_history if 'eval_rouge1' in entry]
535
- rouge1_scores = [entry['eval_rouge1'] for entry in log_history if 'eval_rouge1' in entry]
536
- rougeL_scores = [entry['eval_rougeL'] for entry in log_history if 'eval_rougeL' in entry]
537
-
538
- plt.figure(figsize=(10, 5))
539
- plt.plot(epochs, rouge1_scores, label='ROUGE-1')
540
- plt.plot(epochs, rougeL_scores, label='ROUGE-L')
541
- plt.xlabel('Epoch')
542
- plt.ylabel('Score')
543
- plt.title('Evaluation Scores Over Epochs')
544
- plt.legend()
545
- plt.grid(True)
546
- plt.show()
547
-
548
- # Step 7: Save the model
549
- model.save_pretrained("./qg-finetuned/final")
550
- tokenizer.save_pretrained("./qg-finetuned/final")
551
- print("Model and tokenizer saved!")
552
-
553
- from tqdm import tqdm
554
-
555
- decoded_preds = []
556
- decoded_refs = []
557
-
558
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
559
- model.to(device)
560
- model.eval()
561
-
562
- for i, sample in enumerate(tqdm(eval_dataset)):
563
- if sample["answer"] in sample["context"]:
564
- highlighted_context = sample["context"].replace(sample["answer"], f"<hl> {sample['answer']} <hl>")
565
- else:
566
- highlighted_context = sample["context"] + f" <hl> {sample['answer']} <hl>"
567
-
568
- input_text = f"generate question: {highlighted_context}"
569
- inputs = tokenizer(
570
- input_text,
571
- return_tensors="pt",
572
- truncation=True,
573
- padding="max_length",
574
- max_length=256
575
- ).to(device)
576
-
577
- output_ids = model.generate(
578
- **inputs,
579
- max_length=64,
580
- num_beams=4,
581
- early_stopping=False,  # relaxed for now so beam search is not cut short while debugging
582
- no_repeat_ngram_size=2
583
- )
584
-
585
- # 🪵 Debug print
586
- print(f"\n--- Sample {i + 1} ---")
587
- print("Raw token IDs:", output_ids[0].tolist())
588
-
589
- decoded_pred = tokenizer.decode(output_ids[0], skip_special_tokens=True)
590
- print("Decoded Prediction:", decoded_pred)
591
-
592
- decoded_preds.append(decoded_pred)
593
- decoded_refs.append(sample["question"])
594
-
595
- from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
596
-
597
- # Use smoothing to avoid zero score for short outputs
598
- smoothie = SmoothingFunction().method1
599
-
600
- bleu_scores = []
601
- print("\nSample Predictions vs References with BLEU-1:")
602
- print("-" * 50)
603
-
604
- for i in range(min(5, len(decoded_preds))):
605
- pred = decoded_preds[i]
606
- ref = decoded_refs[i]
607
- bleu = sentence_bleu([ref.split()], pred.split(), weights=(1, 0, 0, 0), smoothing_function=smoothie)
608
-
609
- print(f"\nSample {i + 1}")
610
- print(f"Prediction : {pred}")
611
- print(f"Reference : {ref}")
612
- print(f"BLEU-1 : {bleu:.4f}")
613
- bleu_scores.append(bleu)
614
-
615
- # Average BLEU-1 over the samples printed above; the full eval set is rescored for the CSV export below
616
- avg_bleu = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0
617
- print(f"\nAverage BLEU-1 Score (printed samples): {avg_bleu:.4f}")
618
-
619
- from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
620
-
621
- for i, (pred, ref) in enumerate(zip(decoded_preds, decoded_refs)):
622
- bleu2 = sentence_bleu([ref.split()], pred.split(), weights=(0.5, 0.5), smoothing_function=smoothie)
623
- bleu4 = sentence_bleu([ref.split()], pred.split(), weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothie)
624
- print(f"Sample {i+1}\nBLEU-2: {bleu2:.4f}, BLEU-4: {bleu4:.4f}")
625
-
626
- print("Length of decoded_preds:", len(decoded_preds))
627
- print("Length of decoded_refs:", len(decoded_refs))
628
- print("Length of bleu_scores:", len(bleu_scores))
629
-
630
- from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
631
-
632
- smoothing = SmoothingFunction().method1
633
- bleu_scores = [
634
- sentence_bleu([ref.split()], pred.split(), weights=(1, 0, 0, 0), smoothing_function=smoothing)
635
- for pred, ref in zip(decoded_preds, decoded_refs)
636
- ]
637
-
638
- df = pd.DataFrame({
639
- "Prediction": decoded_preds,
640
- "Reference": decoded_refs,
641
- "BLEU-1": bleu_scores
642
- })
643
- df.to_csv("question_generation_bleu_scores.csv", index=False)
644
-
645
- # Preview of the saved CSV
646
- import pandas as pd
647
-
648
- df_check = pd.read_csv("question_generation_bleu_scores.csv")
649
- print(df_check.head())
650
-
651
- # Plot ROUGE-1 and ROUGE-L scores over epochs
652
- plt.figure(figsize=(10, 5))
653
- plt.plot(epochs, rouge1_scores, marker='o', label='ROUGE-1')
654
- plt.plot(epochs, rougeL_scores, marker='o', label='ROUGE-L')
655
- plt.xlabel('Epoch')
656
- plt.ylabel('Score')
657
- plt.title('ROUGE Scores over Epochs')
658
- plt.legend()
659
- plt.grid(True)
660
- plt.tight_layout()
661
- plt.show()
662
-
663
- # Note for the report:
664
- # Interpretation: The line plot shows both ROUGE-1 and ROUGE-L rising across the training epochs, indicating that the model's generated questions progressively moved closer to the references. ROUGE-1 measures unigram overlap, while ROUGE-L measures longest-common-subsequence similarity, so the joint upward trend points to better lexical and structural alignment with the reference questions.
665
-
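# A minimal side example (not part of the original pipeline) showing how ROUGE-1 and
# ROUGE-L can diverge: the scrambled candidate keeps every unigram of the reference,
# so ROUGE-1 stays high, but it breaks the word order that ROUGE-L rewards.
import evaluate

rouge_demo = evaluate.load("rouge")
reference = ["what year did the college open"]
same_order = ["what year did the college open its doors"]
scrambled = ["the college did open what year"]

print(rouge_demo.compute(predictions=same_order, references=reference))  # rouge1 and rougeL both high
print(rouge_demo.compute(predictions=scrambled, references=reference))   # rouge1 still high, rougeL drops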
666
- # Histogram: BLEU-1 score distribution
667
- import matplotlib.pyplot as plt
668
-
669
- # BLEU score histogram
670
- plt.figure(figsize=(8, 4))
671
- plt.hist(bleu_scores, bins=10, color='skyblue', edgecolor='black')
672
- plt.title('BLEU-1 Score Distribution')
673
- plt.xlabel('BLEU-1 Score')
674
- plt.ylabel('Frequency')
675
- plt.grid(True)
676
- plt.tight_layout()
677
- plt.show()
678
-
679
- # Interpretation: The BLEU-1 histogram shows that most generated questions have low unigram overlap with their references, with only a few predictions scoring highly. This is expected for generative tasks, since several phrasings of the same question can all be valid while sharing few exact words with the single reference.
680
-
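# A minimal side example (separate from the loop above) of why the histogram is skewed
# toward low values: BLEU-1 only counts exact unigram matches against the single
# reference, so a reasonable paraphrase of the same question still scores low.
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

smoothie_demo = SmoothingFunction().method1
reference_q = "what year did the school open".split()
candidates = [
    "what year did the school open".split(),   # exact match -> BLEU-1 close to 1.0
    "when was the school founded".split(),     # valid paraphrase -> low BLEU-1
]
for cand in candidates:
    score = sentence_bleu([reference_q], cand, weights=(1, 0, 0, 0), smoothing_function=smoothie_demo)
    print(" ".join(cand), "->", round(score, 4))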
681
- print("Length of BLEU-1 scores:", len(bleu_scores))
682
- print("Length of ROUGE-1 scores:", len(rouge1_scores))
683
-
684
- import evaluate
685
- rouge = evaluate.load("rouge")
686
-
687
- rouge1_scores = []
688
- rougeL_scores = []
689
-
690
- for pred, ref in zip(decoded_preds, decoded_refs):
691
- result = rouge.compute(predictions=[pred], references=[ref])
692
- rouge1_scores.append(result["rouge1"])
693
- rougeL_scores.append(result["rougeL"])
694
-
695
- print("Length of BLEU-1 scores:", len(bleu_scores))
696
- print("Length of ROUGE-1 scores:", len(rouge1_scores))
697
-
698
- # Scatter plot: BLEU-1 vs ROUGE-1
699
- import matplotlib.pyplot as plt
700
-
701
- plt.figure(figsize=(8, 6))
702
- plt.scatter(bleu_scores, rouge1_scores, alpha=0.6, color='purple')
703
- plt.title('BLEU-1 vs ROUGE-1 Scores')
704
- plt.xlabel('BLEU-1 Score')
705
- plt.ylabel('ROUGE-1 Score')
706
- plt.grid(True)
707
- plt.show()
708
-
709
- # Interpretation: To assess the quality of the generated questions, we computed BLEU-1, ROUGE-1, and ROUGE-L across the evaluation set. BLEU-1 rewards exact unigram precision against a single reference, while ROUGE also credits recall and, through ROUGE-L, longer in-order matches, so it is somewhat more forgiving of rephrasing. The scatter plot of BLEU-1 against ROUGE-1 shows moderate spread, with some samples scoring well on ROUGE despite low BLEU, suggesting those questions are reasonable rephrasings even when exact word overlap is small. This underlines the limitation of relying on a single metric and motivates multi-metric evaluation for generative tasks.
710
-
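# A possible extension, not something the original script ran: BLEU and ROUGE are both
# surface-overlap metrics, so an embedding-based similarity gives an independent check
# on the "semantically valid but lexically different" cases discussed above.
# sentence-transformers is already installed by the pip cell at the top; the checkpoint
# name below is an assumption, not taken from the notebook.
from sentence_transformers import SentenceTransformer, util

st_model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed general-purpose sentence encoder
pred_emb = st_model.encode(decoded_preds, convert_to_tensor=True)
ref_emb = st_model.encode(decoded_refs, convert_to_tensor=True)

# Cosine similarity between each generated question and its own reference.
semantic_scores = util.cos_sim(pred_emb, ref_emb).diagonal().tolist()
print(f"Average embedding similarity on the eval set: {sum(semantic_scores) / len(semantic_scores):.4f}")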