GurgenGulay committed
Commit 1b94a22 · verified · 1 Parent(s): 2fd0f16

Update fine_tuning.py

Files changed (1)
  1. fine_tuning.py +37 -70
fine_tuning.py CHANGED
@@ -1,37 +1,11 @@
- import re
- from nltk.corpus import stopwords
- from nltk.tokenize import word_tokenize
- from nltk.stem import PorterStemmer
  from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
  from datasets import Dataset
  from sklearn.model_selection import train_test_split
- import nltk
 
-
- # Stop words and stemmer
- stop_words = set(stopwords.words('english'))
- ps = PorterStemmer()
-
- # Text cleaning function
- def clean_text_for_education(text):
-     text = re.sub(r'[^\w\s]', '', text)  # strip punctuation
-     text = re.sub(r'\d+', '', text)  # strip digits
-     text = text.lower()  # lowercase
-     text = " ".join([word for word in text.split() if word not in stop_words])  # drop stopwords
-     return text
-
- # Read prompts
- def read_prompts(file_path):
-     input_texts = []
-     target_texts = []
-     with open(file_path, "r", encoding="utf-8") as file:
-         lines = file.readlines()
-     for line in lines:
-         if line.startswith("input:"):
-             input_texts.append(line.replace("input:", "").strip())
-         elif line.startswith("target:"):
-             target_texts.append(line.replace("target:", "").strip())
-     return input_texts, target_texts
+ # Load tokenizer and model
+ model_name = "t5-base"
+ tokenizer = T5Tokenizer.from_pretrained(model_name)
+ model = T5ForConditionalGeneration.from_pretrained(model_name)
 
  # Prepare the dataset
  def prepare_data(input_texts, target_texts, tokenizer):
@@ -55,48 +29,41 @@ def paraphrase_with_model(text, model, tokenizer):
      )
      return tokenizer.decode(output_ids[0], skip_special_tokens=True)
 
- # Load tokenizer and model
- model_name = "t5-base"
- tokenizer = T5Tokenizer.from_pretrained(model_name)
- model = T5ForConditionalGeneration.from_pretrained(model_name)
-
- # Read and clean the data
- input_texts, target_texts = read_prompts("prompts.txt")
- input_texts_cleaned = [clean_text_for_education(text) for text in input_texts]
- target_texts_cleaned = [clean_text_for_education(text) for text in target_texts]
-
- # Split into training and validation data
- train_texts, val_texts, train_labels, val_labels = train_test_split(input_texts_cleaned, target_texts_cleaned, test_size=0.1)
-
- # Augmentation and dataset preparation
- augmented_input_texts = input_texts_cleaned + [paraphrase_with_model(text, model, tokenizer) for text in input_texts_cleaned[:10]]
- augmented_target_texts = target_texts_cleaned + [paraphrase_with_model(text, model, tokenizer) for text in target_texts_cleaned[:10]]
- train_dataset = Dataset.from_dict(prepare_data(augmented_input_texts, augmented_target_texts, tokenizer))
- val_dataset = Dataset.from_dict(prepare_data(val_texts, val_labels, tokenizer))
-
- # Training arguments
- training_args = TrainingArguments(
-     output_dir="./results",
-     evaluation_strategy="steps",
-     learning_rate=5e-5,
-     per_device_train_batch_size=4,
-     num_train_epochs=3,
-     save_steps=500,
-     logging_dir="./logs",
-     logging_steps=10
- )
-
- # Trainer
- trainer = Trainer(
-     model=model,
-     args=training_args,
-     train_dataset=train_dataset,
-     eval_dataset=val_dataset
- )
-
- # Train
- trainer.train()
-
- # Save the model
- model.save_pretrained("./fine_tuned_model")
- tokenizer.save_pretrained("./fine_tuned_model")
+ # Fine-tuning function
+ def fine_tune_model(input_texts, target_texts):
+     # Split into training and validation data
+     train_texts, val_texts, train_labels, val_labels = train_test_split(input_texts, target_texts, test_size=0.1)
+
+     # Augmentation and dataset preparation
+     augmented_input_texts = input_texts + [paraphrase_with_model(text, model, tokenizer) for text in input_texts[:10]]
+     augmented_target_texts = target_texts + [paraphrase_with_model(text, model, tokenizer) for text in target_texts[:10]]
+
+     train_dataset = Dataset.from_dict(prepare_data(augmented_input_texts, augmented_target_texts, tokenizer))
+     val_dataset = Dataset.from_dict(prepare_data(val_texts, val_labels, tokenizer))
+
+     # Training arguments
+     training_args = TrainingArguments(
+         output_dir="./results",
+         evaluation_strategy="steps",
+         learning_rate=5e-5,
+         per_device_train_batch_size=4,
+         num_train_epochs=3,
+         save_steps=500,
+         logging_dir="./logs",
+         logging_steps=10
+     )
+
+     # Trainer
+     trainer = Trainer(
+         model=model,
+         args=training_args,
+         train_dataset=train_dataset,
+         eval_dataset=val_dataset
+     )
+
+     # Train
+     trainer.train()
+
+     # Save the model
+     model.save_pretrained("./fine_tuned_model")
+     tokenizer.save_pretrained("./fine_tuned_model")
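
A minimal driver sketch for the refactored interface, assuming the caller now loads the prompt pairs itself. The driver below is hypothetical (not part of this commit); the "prompts.txt" name and the "input:"/"target:" line format are taken from the read_prompts helper that this commit deletes.

# Hypothetical driver script, not in the commit: load prompt pairs and
# call the new fine_tune_model entry point from fine_tuning.py.
# Note that importing fine_tuning also downloads/loads t5-base, since the
# tokenizer and model are created at module level.
from fine_tuning import fine_tune_model

input_texts, target_texts = [], []
with open("prompts.txt", "r", encoding="utf-8") as f:  # format mirrors the deleted read_prompts
    for line in f:
        if line.startswith("input:"):
            input_texts.append(line.replace("input:", "").strip())
        elif line.startswith("target:"):
            target_texts.append(line.replace("target:", "").strip())

fine_tune_model(input_texts, target_texts)

One caveat visible in the committed function: the train_texts and train_labels halves of the split are never used, and the Trainer trains on the full augmented lists, so the validation pairs also appear in the training data.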