GurgenGulay committed on
Commit 9612100 · verified · 1 Parent(s): c2386f3

Update fine_tuning.py

Files changed (1): fine_tuning.py +31 −55
fine_tuning.py CHANGED
@@ -3,24 +3,27 @@ from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, Train
 from datasets import Dataset
 from sklearn.model_selection import train_test_split
 import re
-from nltk.corpus import stopwords
-from nltk.tokenize import word_tokenize
-from nltk.stem import PorterStemmer
 
 # Logging settings
 logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
 logger = logging.getLogger(__name__)
 
-
-stop_words = set(stopwords.words('english'))
-ps = PorterStemmer()
+stop_words = {"and", "or", "but", "the", "is", "are", "was", "were", "a", "an", "in", "on", "at", "of", "to", "with"}  # Example stop words
+def stem_word(word):
+    """A simple stemming function in place of PorterStemmer."""
+    suffixes = ['ing', 'ed', 'ly', 's', 'es', 'er']
+    for suffix in suffixes:
+        if word.endswith(suffix):
+            return word[:-len(suffix)]
+    return word
 
 def clean_text(text):
-    text = re.sub(r'[^\w\s]', '', text)
-    text = re.sub(r'\d+', '', text)
-    text = text.lower()
-    text = " ".join([word for word in text.split() if word not in stop_words])
-    text = " ".join([ps.stem(word) for word in word_tokenize(text)])
+    """Text cleaning function."""
+    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
+    text = re.sub(r'\d+', '', text)  # Remove digits
+    text = text.lower()  # Lowercase
+    text = " ".join([word for word in text.split() if word not in stop_words])  # Remove stop words
+    text = " ".join([stem_word(word) for word in text.split()])  # Stemming
     return text
 
 def read_prompts(file_path):
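
The commit replaces the NLTK stack with a hand-rolled stop-word set and a one-pass suffix stripper, which keeps preprocessing dependency-free but behaves differently from PorterStemmer. A standalone sanity check of the new pipeline, mirroring the functions added above:

    import re

    stop_words = {"and", "or", "but", "the", "is", "are", "was", "were",
                  "a", "an", "in", "on", "at", "of", "to", "with"}

    def stem_word(word):
        # Strips at most one suffix, in list order: "flies" -> "flie",
        # where PorterStemmer would give "fli".
        for suffix in ['ing', 'ed', 'ly', 's', 'es', 'er']:
            if word.endswith(suffix):
                return word[:-len(suffix)]
        return word

    def clean_text(text):
        text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
        text = re.sub(r'\d+', '', text)      # remove digits
        text = text.lower()
        text = " ".join(w for w in text.split() if w not in stop_words)
        return " ".join(stem_word(w) for w in text.split())

    print(clean_text("The 3 dogs were running quickly!"))  # -> dog runn quick

Because 's' precedes 'es' in the suffix list, words ending in 'es' only lose the final 's' ("boxes" -> "boxe"); whether that is acceptable depends on the corpus.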
@@ -46,59 +49,21 @@ def paraphrase_with_model(text, model, tokenizer):
     output_ids = model.generate(
         inputs["input_ids"],
         do_sample=True,
-        top_k=40,
-        top_p=0.9,
-        temperature=0.8,
-        max_length=200,
-        no_repeat_ngram_size=3,
+        top_k=40,
+        top_p=0.9,
+        temperature=0.8,
+        max_length=200,
+        no_repeat_ngram_size=3,
         early_stopping=True
     )
     return tokenizer.decode(output_ids[0], skip_special_tokens=True)
 
+# Load model and tokenizer
 model_name = "t5-base"
 tokenizer = T5Tokenizer.from_pretrained(model_name)
 model = T5ForConditionalGeneration.from_pretrained(model_name)
 
-input_texts, target_texts = read_prompts("prompts.txt")
-input_texts_cleaned = [clean_text(text) for text in input_texts]
-target_texts_cleaned = [clean_text(text) for text in target_texts]
-
-train_texts, val_texts, train_labels, val_labels = train_test_split(input_texts_cleaned, target_texts_cleaned, test_size=0.1)
-
-augmented_input_texts = input_texts_cleaned[:10] + [paraphrase_with_model(text, model, tokenizer) for text in input_texts_cleaned[:10]]
-augmented_target_texts = target_texts_cleaned[:10] + [paraphrase_with_model(text, model, tokenizer) for text in target_texts_cleaned[:10]]
-train_dataset = Dataset.from_dict(prepare_data(augmented_input_texts, augmented_target_texts))
-val_dataset = Dataset.from_dict(prepare_data(val_texts, val_labels))
-
-training_args = TrainingArguments(
-    output_dir="./results",
-    evaluation_strategy="steps",
-    learning_rate=5e-5,
-    per_device_train_batch_size=4,
-    num_train_epochs=3,
-    save_steps=500,
-    logging_dir="./logs",
-    logging_steps=10
-)
-
-trainer = Trainer(
-    model=model,
-    args=training_args,
-    train_dataset=train_dataset,
-    eval_dataset=val_dataset
-)
-
-trainer.train()
-
-model.save_pretrained("./fine_tuned_model")
-tokenizer.save_pretrained("./fine_tuned_model")
-
 try:
-    logger.info("Loading tokenizer and model.")
-    model_name = "t5-base"
-    tokenizer = T5Tokenizer.from_pretrained(model_name)
-    model = T5ForConditionalGeneration.from_pretrained(model_name)
-
     logger.info("Reading and cleaning prompts.")
     input_texts, target_texts = read_prompts("prompts.txt")
     input_texts_cleaned = [clean_text(text) for text in input_texts]
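
The hunk above removes the duplicated top-level training block and re-adds the sampling parameters inside paraphrase_with_model. For reference, a self-contained sketch of the resulting generate() call; the tokenization step sits outside the hunk, so return_tensors="pt" is an assumption:

    from transformers import T5Tokenizer, T5ForConditionalGeneration

    tokenizer = T5Tokenizer.from_pretrained("t5-base")
    model = T5ForConditionalGeneration.from_pretrained("t5-base")

    # Assumed tokenization; the actual line is outside the diff hunk.
    inputs = tokenizer("paraphrase: students learn best by doing",
                       return_tensors="pt")
    output_ids = model.generate(
        inputs["input_ids"],
        do_sample=True,
        top_k=40,                # sample from the 40 most likely tokens
        top_p=0.9,               # nucleus sampling cutoff
        temperature=0.8,         # <1 sharpens the distribution slightly
        max_length=200,
        no_repeat_ngram_size=3,  # never repeat a 3-gram
        early_stopping=True,
    )
    print(tokenizer.decode(output_ids[0], skip_special_tokens=True))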
@@ -111,6 +76,17 @@ try:
     train_dataset = Dataset.from_dict(prepare_data(train_texts, train_labels))
     val_dataset = Dataset.from_dict(prepare_data(val_texts, val_labels))
 
+    training_args = TrainingArguments(
+        output_dir="./results",
+        evaluation_strategy="steps",
+        learning_rate=5e-5,
+        per_device_train_batch_size=4,
+        num_train_epochs=3,
+        save_steps=500,
+        logging_dir="./logs",
+        logging_steps=10
+    )
+
     logger.info("Starting model training.")
     trainer = Trainer(
         model=model,
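
prepare_data() is not part of this diff; for Dataset.from_dict and Trainer to work as shown above, it has to return equal-length columns including input_ids, attention_mask, and labels. A purely hypothetical sketch of such a helper (the real implementation lives elsewhere in the file and may differ):

    # Hypothetical prepare_data(); the actual helper is not shown in this diff.
    def prepare_data(input_texts, target_texts, max_length=128):
        model_inputs = tokenizer(input_texts, max_length=max_length,
                                 truncation=True, padding="max_length")
        labels = tokenizer(target_texts, max_length=max_length,
                           truncation=True, padding="max_length")["input_ids"]
        # Mask pad tokens so the loss ignores them.
        model_inputs["labels"] = [
            [t if t != tokenizer.pad_token_id else -100 for t in seq]
            for seq in labels
        ]
        return dict(model_inputs)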
 
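One note on the added TrainingArguments: with evaluation_strategy="steps" and no eval_steps, transformers falls back to logging_steps, so this configuration evaluates every 10 steps, which can dominate training time. If that is unintended, a sketch with the cadence made explicit (eval_steps=500 is an illustrative value, not from the commit):

    from transformers import TrainingArguments

    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="steps",
        eval_steps=500,  # illustrative; decouples evaluation from logging_steps
        learning_rate=5e-5,
        per_device_train_batch_size=4,
        num_train_epochs=3,
        save_steps=500,
        logging_dir="./logs",
        logging_steps=10
    )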