GurgenGulay committed on
Commit c094434 · verified · 1 Parent(s): 994d057

Update fine_tuning.py

Files changed (1)
  1. fine_tuning.py +141 -49
fine_tuning.py CHANGED
@@ -1,66 +1,158 @@
  from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
  from datasets import Dataset
  from sklearn.model_selection import train_test_split

- # Load model and tokenizer
- model_name = "t5-base"
- tokenizer = T5Tokenizer.from_pretrained(model_name)
- model = T5ForConditionalGeneration.from_pretrained(model_name)

- # Prepare the dataset for training
- def prepare_data(input_texts, target_texts, tokenizer):
-     inputs = tokenizer(input_texts, max_length=512, truncation=True, padding="max_length")
-     targets = tokenizer(target_texts, max_length=512, truncation=True, padding="max_length")
-     return {"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"], "labels": targets["input_ids"]}

- # Paraphrasing function
  def paraphrase_with_model(text, model, tokenizer):
-     prompt = "Teach the following content: " + text
      inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)
      output_ids = model.generate(
          inputs["input_ids"],
-         do_sample=False,  # For deterministic results
-         max_length=150,
-         no_repeat_ngram_size=2,
-         early_stopping=True
      )
-     return tokenizer.decode(output_ids[0], skip_special_tokens=True)

- # Fine-tuning function
- def fine_tune_model(input_texts, target_texts):
-     # Split data into training and validation sets
-     train_texts, val_texts, train_labels, val_labels = train_test_split(input_texts, target_texts, test_size=0.1)

-     # Data augmentation with paraphrasing
-     augmented_input_texts = input_texts + [paraphrase_with_model(text, model, tokenizer) for text in input_texts[:10]]
-     augmented_target_texts = target_texts + [paraphrase_with_model(text, model, tokenizer) for text in target_texts[:10]]

-     train_dataset = Dataset.from_dict(prepare_data(augmented_input_texts, augmented_target_texts, tokenizer))
-     val_dataset = Dataset.from_dict(prepare_data(val_texts, val_labels, tokenizer))

-     # Training arguments
-     training_args = TrainingArguments(
-         output_dir="./results",
-         evaluation_strategy="steps",
-         learning_rate=5e-5,
-         per_device_train_batch_size=4,
-         num_train_epochs=3,
-         save_steps=500,
-         logging_dir="./logs",
-         logging_steps=10
-     )

-     # Trainer setup
-     trainer = Trainer(
-         model=model,
-         args=training_args,
-         train_dataset=train_dataset,
-         eval_dataset=val_dataset
-     )

-     # Training
-     trainer.train()

- # Save the fine-tuned model
- model.save_pretrained("./fine_tuned_model")
- tokenizer.save_pretrained("./fine_tuned_model")

  from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
  from datasets import Dataset
  from sklearn.model_selection import train_test_split
+ import random
+ import re
+ from nltk.corpus import stopwords
+ from nltk.tokenize import word_tokenize
+ from nltk.stem import PorterStemmer
+ from pdfminer.high_level import extract_text  # assumed source of extract_text() used in pdf_to_text; not imported in the original
+ # Note: the NLTK 'stopwords' corpus and 'punkt' tokenizer must be available, e.g. via nltk.download('stopwords') and nltk.download('punkt')

+ def clean_pdf_text(pdf_text):
+     # Strip leading and trailing whitespace
+     pdf_text = pdf_text.strip()
+
+     # Collapse redundant line breaks
+     pdf_text = re.sub(r'\n+', ' ', pdf_text)
+
+     # Remove page-number artifacts such as 'Page 1', 'Page 2'
+     pdf_text = re.sub(r'\bPage \d+\b', '', pdf_text)
+
+     return pdf_text
+
+ def pdf_to_text(pdf_path):
+     """Converts a PDF file to cleaned plain text."""
+     pdf_text = extract_text(pdf_path)
+     return clean_pdf_text(pdf_text)
+
+ # Stop words
+ stop_words = set(stopwords.words('english'))
+
+ # Stemmer
+ ps = PorterStemmer()

+ # Text-cleaning function
+ def clean_text(text):
+     # Remove punctuation
+     text = re.sub(r'[^\w\s]', '', text)
+
+     # Remove digits
+     text = re.sub(r'\d+', '', text)
+
+     # Lowercase
+     text = text.lower()
+
+     # Remove stop words
+     text = " ".join([word for word in text.split() if word not in stop_words])
+
+     # Reduce words to their stems (stemming)
+     text = " ".join([ps.stem(word) for word in word_tokenize(text)])
+
+     return text

+ # Paraphrasing function
  def paraphrase_with_model(text, model, tokenizer):
+     prompt = "paraphrase: " + text  # Tell the T5 model this is a paraphrasing task.
+
+     # Tokenize the prompt
      inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)
+
+     # Generate with sampling
      output_ids = model.generate(
          inputs["input_ids"],
+         do_sample=True,           # Enable sampling mode
+         top_k=50,                 # Top-k sampling
+         top_p=0.95,               # Top-p (nucleus) sampling
+         temperature=1.0,          # Temperature for more diversity
+         max_length=150,           # Maximum output length
+         no_repeat_ngram_size=2,   # Block repeated n-grams
+         early_stopping=True       # Stop generation early when possible
      )
+
+     # Decode the generated ids back to text (output length is controlled in generate(), not in decode())
+     paraphrased_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

+     return paraphrased_text
+
+ # Load tokenizer and model
+ model_name = "google-t5/t5-base"
+ tokenizer = T5Tokenizer.from_pretrained(model_name)
+ model = T5ForConditionalGeneration.from_pretrained(model_name)

+ def prepare_data(input_texts, target_texts):
+     inputs = tokenizer(input_texts, max_length=512, truncation=True, padding=True, return_tensors="pt")
+     targets = tokenizer(target_texts, max_length=512, truncation=True, padding=True, return_tensors="pt")

+     # Use the target token ids as labels; the attention mask is already part of `inputs`
+     inputs["labels"] = targets["input_ids"]
+
+     return inputs

+ # Training data
+ input_texts = [
+     "Site Reliability Engineering is a concept born at Google. Ben Trainor's team of seven people started it in 2003 to keep Google.com running reliably.",
+     "Reliability is critical for any system. Without reliability, even the best features are useless as users can't access them.",
+     "SRE teams at Google handle large-scale systems with efficiency, working closely with developers to ensure scalability, reliability, and cost-effectiveness.",
+     "Site Reliability Engineering treats operations as a software engineering problem, making it distinct from traditional operations teams."]
+ target_texts = [
+     "SRE was introduced at Google in 2003 by Ben Trainor's team to ensure the reliability of Google.com.",
+     "Reliability is essential for a system to be usable; without it, features lose value.",
+     "Google's SRE teams collaborate with developers to manage large-scale systems efficiently and reliably.",
+     "SRE approaches operations as a software engineering task, revolutionizing traditional operational methods."]
+
+ # Clean the data
+ input_texts_cleaned = [clean_text(text) for text in input_texts]
+ target_texts_cleaned = [clean_text(text) for text in target_texts]
+
+ # Split into training and validation sets
+ train_texts, val_texts, train_labels, val_labels = train_test_split(input_texts_cleaned, target_texts_cleaned, test_size=0.1)
+
+ # Prepare the training and validation datasets
+ train_dataset = Dataset.from_dict(prepare_data(train_texts, train_labels))
+ val_dataset = Dataset.from_dict(prepare_data(val_texts, val_labels))
+
+ # Augment the training data (optional); start with a small sample
+ augmented_input_texts = [paraphrase_with_model(text, model, tokenizer) for text in input_texts_cleaned[:10]]
+ augmented_target_texts = [paraphrase_with_model(text, model, tokenizer) for text in target_texts_cleaned[:10]]
+ augmented_dataset = prepare_data(augmented_input_texts, augmented_target_texts)
+ # Note: this overwrites train_dataset with only the paraphrased examples rather than appending them
+ train_dataset = Dataset.from_dict(augmented_dataset)
+
+ # Training arguments
+ training_args = TrainingArguments(
+     output_dir="./results",
+     evaluation_strategy="steps",
+     learning_rate=5e-5,
+     per_device_train_batch_size=4,
+     per_device_eval_batch_size=4,
+     num_train_epochs=3,
+     weight_decay=0.01,
+     save_steps=500,          # Save the model every 500 steps
+     eval_steps=500,          # Evaluate every 500 steps
+     save_total_limit=2,
+     logging_dir="./logs",
+     logging_steps=10,
+     load_best_model_at_end=True
+ )
+
+ # Trainer definition
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=train_dataset,
+     eval_dataset=val_dataset  # The evaluation set must be provided
+ )
+
+ # Start fine-tuning
+ trainer.train()
+
+ # Save the fine-tuned model
+ model.save_pretrained("./fine_tuned_model")
+ tokenizer.save_pretrained("./fine_tuned_model")

+ # Reload the fine-tuned model
+ model = T5ForConditionalGeneration.from_pretrained("./fine_tuned_model")
+ tokenizer = T5Tokenizer.from_pretrained("./fine_tuned_model")
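
For reference, a minimal inference sketch against the saved checkpoint (not part of this commit): it assumes ./fine_tuned_model exists after trainer.train() finishes and simply mirrors the sampling settings used in paraphrase_with_model; the sample sentence is illustrative only.

from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the fine-tuned checkpoint saved by fine_tuning.py (assumed to exist)
model = T5ForConditionalGeneration.from_pretrained("./fine_tuned_model")
tokenizer = T5Tokenizer.from_pretrained("./fine_tuned_model")

# Illustrative input; the prompt prefix matches the one used during training
text = "Site Reliability Engineering treats operations as a software engineering problem."
inputs = tokenizer("paraphrase: " + text, return_tensors="pt", truncation=True, max_length=512)

# Sample a paraphrase with the same decoding settings as paraphrase_with_model
output_ids = model.generate(
    inputs["input_ids"],
    do_sample=True,
    top_k=50,
    top_p=0.95,
    max_length=150,
    no_repeat_ngram_size=2,
)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))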