GurgenGulay committed
Commit 000e05e · verified · 1 Parent(s): e72eae0

Update fine_tuning.py

Files changed (1)
  1. fine_tuning.py +12 -13
fine_tuning.py CHANGED
@@ -6,11 +6,10 @@ from nltk.corpus import stopwords
 from nltk.tokenize import word_tokenize
 from nltk.stem import PorterStemmer
 
-# Stop words and stemmer
+
 stop_words = set(stopwords.words('english'))
 ps = PorterStemmer()
 
-# Text-cleaning function
 def clean_text(text):
     text = re.sub(r'[^\w\s]', '', text)
     text = re.sub(r'\d+', '', text)
@@ -19,7 +18,7 @@ def clean_text(text):
     text = " ".join([ps.stem(word) for word in word_tokenize(text)])
     return text
 
-# Read prompts
+
 def read_prompts(file_path):
     input_texts = []
     target_texts = []
@@ -32,13 +31,14 @@ def read_prompts(file_path):
             target_texts.append(line.replace("target:", "").strip())
     return input_texts, target_texts
 
-# Prepare the dataset
+
 def prepare_data(input_texts, target_texts):
     inputs = tokenizer(input_texts, max_length=512, truncation=True, padding="max_length")
     targets = tokenizer(target_texts, max_length=512, truncation=True, padding="max_length")
     return {"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"], "labels": targets["input_ids"]}
 
-# Paraphrasing function
+
+
 def paraphrase_with_model(text, model, tokenizer):
     prompt = "paraphrase: " + text
     inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)
@@ -54,26 +54,26 @@ def paraphrase_with_model(text, model, tokenizer):
     )
     return tokenizer.decode(output_ids[0], skip_special_tokens=True, max_length=150)
 
-# Load tokenizer and model
+
 model_name = "t5-base"
 tokenizer = T5Tokenizer.from_pretrained(model_name)
 model = T5ForConditionalGeneration.from_pretrained(model_name)
 
-# Read and clean the data
+
 input_texts, target_texts = read_prompts("prompts.txt")
 input_texts_cleaned = [clean_text(text) for text in input_texts]
 target_texts_cleaned = [clean_text(text) for text in target_texts]
 
-# Split training and validation data
+
 train_texts, val_texts, train_labels, val_labels = train_test_split(input_texts_cleaned, target_texts_cleaned, test_size=0.1)
 
-# Augmentation and dataset preparation
+
 augmented_input_texts = input_texts_cleaned[:10] + [paraphrase_with_model(text, model, tokenizer) for text in input_texts_cleaned[:10]]
 augmented_target_texts = target_texts_cleaned[:10] + [paraphrase_with_model(text, model, tokenizer) for text in target_texts_cleaned[:10]]
 train_dataset = Dataset.from_dict(prepare_data(augmented_input_texts, augmented_target_texts))
 val_dataset = Dataset.from_dict(prepare_data(val_texts, val_labels))
 
-# Training arguments
+
 training_args = TrainingArguments(
     output_dir="./results",
     evaluation_strategy="steps",
@@ -85,7 +85,7 @@ training_args = TrainingArguments(
     logging_steps=10
 )
 
-# Trainer
+
 trainer = Trainer(
     model=model,
     args=training_args,
@@ -93,9 +93,8 @@ trainer = Trainer(
     eval_dataset=val_dataset
 )
 
-# Training
+
 trainer.train()
 
-# Save the model
 model.save_pretrained("./fine_tuned_model")
 tokenizer.save_pretrained("./fine_tuned_model")
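For reference, read_prompts parses a plain-text file whose lines carry an "input:" or "target:" prefix; the "input:" branch is elided by the diff context, so the layout below is an assumption inferred from the visible "target:" handling. Only the prefixes and the prompts.txt file name come from the script, and the example sentences are made up.

# Hypothetical sample of the prompts.txt layout that read_prompts appears to expect.
sample_lines = [
    "input: The meeting was postponed because of the storm.",
    "target: The storm forced the meeting to be postponed.",
    "input: She finished the report before lunch.",
    "target: Before lunch, she finished the report.",
]
with open("prompts.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(sample_lines))

input_texts, target_texts = read_prompts("prompts.txt")  # two parallel lists of strings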
 
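Once training finishes, the weights and tokenizer are written to ./fine_tuned_model. Below is a minimal usage sketch for loading them back and paraphrasing one sentence; the sample sentence and generation settings (beam count, length cap) are illustrative assumptions, while the "paraphrase: " prefix and the save path come from the script.

from transformers import T5Tokenizer, T5ForConditionalGeneration

# Reload the fine-tuned checkpoint saved by fine_tuning.py.
tokenizer = T5Tokenizer.from_pretrained("./fine_tuned_model")
model = T5ForConditionalGeneration.from_pretrained("./fine_tuned_model")

# Build the same "paraphrase: " prompt the training script uses and generate a paraphrase.
inputs = tokenizer("paraphrase: The quick brown fox jumps over the lazy dog.",
                   return_tensors="pt", truncation=True, max_length=512)
output_ids = model.generate(**inputs, max_length=150, num_beams=4, early_stopping=True)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))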