GurgenGulay committed · verified
Commit 769a5e8 · 1 Parent(s): f942ea9

Update fine_tuning.py

Files changed (1)
  1. fine_tuning.py +44 -101
fine_tuning.py CHANGED
@@ -1,158 +1,101 @@
  from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
  from datasets import Dataset
  from sklearn.model_selection import train_test_split
- import random
  import re
  from nltk.corpus import stopwords
  from nltk.tokenize import word_tokenize
  from nltk.stem import PorterStemmer

- def clean_pdf_text(pdf_text):
-     # Strip leading and trailing whitespace
-     pdf_text = pdf_text.strip()
-
-     # Remove redundant line breaks
-     pdf_text = re.sub(r'\n+', ' ', pdf_text)
-
-     # Remove noise such as page numbers (e.g. 'Page 1', 'Page 2')
-     pdf_text = re.sub(r'\bPage \d+\b', '', pdf_text)
-
-     return pdf_text
-
- def pdf_to_text(pdf_path):
-     """ Converts PDF to text """
-     pdf_text = extract_text(pdf_path)
-     return clean_pdf_text(pdf_text)
-
- # Stop words
+ # Stop words and stemmer
  stop_words = set(stopwords.words('english'))
-
- # Stemmer
  ps = PorterStemmer()

  # Text-cleaning function
  def clean_text(text):
-     # Remove punctuation
      text = re.sub(r'[^\w\s]', '', text)
-
-     # Remove numbers
      text = re.sub(r'\d+', '', text)
-
-     # Convert to lowercase
      text = text.lower()
-
-     # Remove stop words
      text = " ".join([word for word in text.split() if word not in stop_words])
-
-     # Reduce words to their roots (stemming)
      text = " ".join([ps.stem(word) for word in word_tokenize(text)])
-
      return text

+ # Read the prompts
+ def read_prompts(file_path):
+     input_texts = []
+     target_texts = []
+     with open(file_path, "r", encoding="utf-8") as file:
+         lines = file.readlines()
+     for line in lines:
+         if line.startswith("input:"):
+             input_texts.append(line.replace("input:", "").strip())
+         elif line.startswith("target:"):
+             target_texts.append(line.replace("target:", "").strip())
+     return input_texts, target_texts
+
+ # Prepare the dataset
+ def prepare_data(input_texts, target_texts):
+     inputs = tokenizer(input_texts, max_length=512, truncation=True, padding="max_length")
+     targets = tokenizer(target_texts, max_length=512, truncation=True, padding="max_length")
+     return {"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"], "labels": targets["input_ids"]}
+
  # Paraphrasing function
  def paraphrase_with_model(text, model, tokenizer):
-     prompt = "paraphrase: " + text  # tell the T5 model that this is a paraphrasing task
-
-     # Convert the text into tokens with the tokenizer
+     prompt = "paraphrase: " + text
      inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)
-
-     # Run the model with sampling
      output_ids = model.generate(
          inputs["input_ids"],
-         do_sample=True,          # enable sampling mode
-         top_k=50,                # top-k sampling
-         top_p=0.95,              # top-p sampling
-         temperature=1.0,         # temperature for more diversity
-         max_length=150,          # maximum sentence length
-         no_repeat_ngram_size=2,  # block repeated n-grams
-         early_stopping=True      # stop earlier
+         do_sample=True,
+         top_k=50,
+         top_p=0.95,
+         temperature=1.0,
+         max_length=150,
+         no_repeat_ngram_size=2,
+         early_stopping=True
      )
-
-     # Decode the model output, capping the maximum length
-     paraphrased_text = tokenizer.decode(output_ids[0], skip_special_tokens=True, max_length=150)
-
-
-     return paraphrased_text
+     return tokenizer.decode(output_ids[0], skip_special_tokens=True, max_length=150)

- # Loading the tokenizer and the model
- model_name = "google-t5/t5-base"
+ # Load tokenizer and model
+ model_name = "t5-base"
  tokenizer = T5Tokenizer.from_pretrained(model_name)
  model = T5ForConditionalGeneration.from_pretrained(model_name)

- def prepare_data(input_texts, target_texts):
-     inputs = tokenizer(input_texts, max_length=512, truncation=True, padding=True, return_tensors="pt")
-     targets = tokenizer(target_texts, max_length=512, truncation=True, padding=True, return_tensors="pt")
-
-     # Set the target texts as the labels
-     inputs["labels"] = targets["input_ids"]
-
-     # Include the attention masks as well
-     inputs["attention_mask"] = inputs["attention_mask"]
-
-     return inputs
-
-
- # Training data
- input_texts = [
-     "Site Reliability Engineering is a concept born at Google. Ben Trainor's team of seven people started it in 2003 to keep Google.com running reliably.",
-     "Reliability is critical for any system. Without reliability, even the best features are useless as users can't access them.",
-     "SRE teams at Google handle large-scale systems with efficiency, working closely with developers to ensure scalability, reliability, and cost-effectiveness.",
-     "Site Reliability Engineering treats operations as a software engineering problem, making it distinct from traditional operations teams."]
- target_texts = [
-     "SRE was introduced at Google in 2003 by Ben Trainor's team to ensure the reliability of Google.com.",
-     "Reliability is essential for a system to be usable; without it, features lose value.",
-     "Google's SRE teams collaborate with developers to manage large-scale systems efficiently and reliably.",
-     "SRE approaches operations as a software engineering task, revolutionizing traditional operational methods."]
-
- # Clean the data
+ # Read and clean the data
+ input_texts, target_texts = read_prompts("prompts.txt")
  input_texts_cleaned = [clean_text(text) for text in input_texts]
  target_texts_cleaned = [clean_text(text) for text in target_texts]

  # Split into training and validation data
  train_texts, val_texts, train_labels, val_labels = train_test_split(input_texts_cleaned, target_texts_cleaned, test_size=0.1)

- # Prepare the training and validation datasets
- train_dataset = Dataset.from_dict(prepare_data(train_texts, train_labels))
+ # Augmentation and dataset preparation
+ augmented_input_texts = input_texts_cleaned[:10] + [paraphrase_with_model(text, model, tokenizer) for text in input_texts_cleaned[:10]]
+ augmented_target_texts = target_texts_cleaned[:10] + [paraphrase_with_model(text, model, tokenizer) for text in target_texts_cleaned[:10]]
+ train_dataset = Dataset.from_dict(prepare_data(augmented_input_texts, augmented_target_texts))
  val_dataset = Dataset.from_dict(prepare_data(val_texts, val_labels))

- # Augment the training data (optional)
- augmented_input_texts = [paraphrase_with_model(text, model, tokenizer) for text in input_texts_cleaned[:10]]  # start with a smaller sample
- augmented_target_texts = [paraphrase_with_model(text, model, tokenizer) for text in target_texts_cleaned[:10]]  # likewise
- augmented_dataset = prepare_data(augmented_input_texts, augmented_target_texts)
- train_dataset = Dataset.from_dict(augmented_dataset)
-
  # Training arguments
  training_args = TrainingArguments(
      output_dir="./results",
      evaluation_strategy="steps",
      learning_rate=5e-5,
      per_device_train_batch_size=4,
-     per_device_eval_batch_size=4,
      num_train_epochs=3,
-     weight_decay=0.01,
-     save_steps=500,       # save the model every 500 steps
-     eval_steps=500,       # evaluate every 500 steps
-     save_total_limit=2,
+     save_steps=500,
      logging_dir="./logs",
-     logging_steps=10,
-     load_best_model_at_end=True
+     logging_steps=10
  )

- # Trainer definition
+ # Trainer
  trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=train_dataset,
-     eval_dataset=val_dataset  # the evaluation set should be included
+     eval_dataset=val_dataset
  )

- # Start fine-tuning
+ # Training
  trainer.train()

- # Save the fine-tuned model
+ # Save the model
  model.save_pretrained("./fine_tuned_model")
- tokenizer.save_pretrained("./fine_tuned_model")
-
- # Load your fine-tuned model
- model = T5ForConditionalGeneration.from_pretrained("./fine_tuned_model")
- tokenizer = T5Tokenizer.from_pretrained("./fine_tuned_model")
+ tokenizer.save_pretrained("./fine_tuned_model")
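
For reference, the new read_prompts helper reads a plain-text file in which each example is an "input:" line followed by a "target:" line. Below is a minimal sketch of that format; the file name prompts_example.txt is made up for illustration (the real prompts.txt is not part of this commit), and the sentences are borrowed from the in-script example data removed by this change.

# Illustrative sketch only: writes a tiny prompts file and parses it.
# Assumes read_prompts() from fine_tuning.py is in scope.
sample = (
    "input: Site Reliability Engineering treats operations as a software engineering problem.\n"
    "target: SRE approaches operations as a software engineering task.\n"
    "input: Reliability is critical for any system.\n"
    "target: Reliability is essential for a system to be usable.\n"
)

with open("prompts_example.txt", "w", encoding="utf-8") as f:
    f.write(sample)

input_texts, target_texts = read_prompts("prompts_example.txt")
print(input_texts)   # two input sentences
print(target_texts)  # two target sentences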
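
A quick sketch of what the reworked prepare_data returns, assuming the tokenizer and prepare_data from fine_tuning.py are already defined. Because it tokenizes with padding="max_length" and no return_tensors, each field is a plain Python list of 512 token ids per example, which is the shape datasets.Dataset.from_dict expects.

# Sketch only: relies on `tokenizer` and `prepare_data` from the script above.
from datasets import Dataset

features = prepare_data(
    ["paraphrase: reliability is critical for any system"],
    ["reliability is essential for a system to be usable"],
)

print(sorted(features.keys()))        # ['attention_mask', 'input_ids', 'labels']
print(len(features["input_ids"][0]))  # 512, padded to max_length
print(len(features["labels"][0]))     # 512, target ids used as labels by the Trainer

train_like = Dataset.from_dict(features)
print(train_like)                     # 1-row Dataset with the three columns above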
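
And a small usage sketch for paraphrase_with_model, assuming the function defined in the diff is in scope; since generation runs with do_sample=True, the output differs from run to run.

from transformers import T5Tokenizer, T5ForConditionalGeneration

# Same checkpoint the script loads; paraphrase_with_model() is assumed to be in scope.
tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")

text = "Reliability is critical for any system."
print(paraphrase_with_model(text, model, tokenizer))
# Sampling (do_sample=True, top_k=50, top_p=0.95) makes the result non-deterministic.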