GurgenGulay committed (verified)
Commit d745102 · Parent(s): 9612100

Update fine_tuning.py

Files changed (1):
  fine_tuning.py (+8 -10)
fine_tuning.py CHANGED
@@ -4,13 +4,12 @@ from datasets import Dataset
 from sklearn.model_selection import train_test_split
 import re
 
-# Logging configuration
+
 logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
 logger = logging.getLogger(__name__)
 
-stop_words = {"and", "or", "but", "the", "is", "are", "was", "were", "a", "an", "in", "on", "at", "of", "to", "with"}  # example stop words
+stop_words = {"and", "or", "but", "the", "is", "are", "was", "were", "a", "an", "in", "on", "at", "of", "to", "with"}
 def stem_word(word):
-    """A simple stemming function in place of PorterStemmer."""
     suffixes = ['ing', 'ed', 'ly', 's', 'es', 'er']
     for suffix in suffixes:
         if word.endswith(suffix):
@@ -18,12 +17,11 @@ def stem_word(word):
     return word
 
 def clean_text(text):
-    """Text cleaning function."""
-    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
-    text = re.sub(r'\d+', '', text)  # remove digits
-    text = text.lower()  # convert to lowercase
-    text = " ".join([word for word in text.split() if word not in stop_words])  # remove stop words
-    text = " ".join([stem_word(word) for word in text.split()])  # stemming
+    text = re.sub(r'[^\w\s]', '', text)
+    text = re.sub(r'\d+', '', text)
+    text = text.lower()
+    text = " ".join([word for word in text.split() if word not in stop_words])
+    text = " ".join([stem_word(word) for word in text.split()])
     return text
 
 def read_prompts(file_path):
@@ -58,7 +56,7 @@ def paraphrase_with_model(text, model, tokenizer):
     )
     return tokenizer.decode(output_ids[0], skip_special_tokens=True)
 
-# Load the model and tokenizer
+
 model_name = "t5-base"
 tokenizer = T5Tokenizer.from_pretrained(model_name)
 model = T5ForConditionalGeneration.from_pretrained(model_name)
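For reference, the text-cleaning pipeline behaves as below after this change. This is a minimal standalone sketch: the stop-word set and regexes are copied from the diff, but the suffix-stripping line inside stem_word sits outside the shown hunks, so word[:-len(suffix)] is an assumption, and the sample sentence is invented for illustration.

import re

stop_words = {"and", "or", "but", "the", "is", "are", "was", "were",
              "a", "an", "in", "on", "at", "of", "to", "with"}

def stem_word(word):
    # Crude suffix stripping; a rough stand-in for PorterStemmer.
    suffixes = ['ing', 'ed', 'ly', 's', 'es', 'er']
    for suffix in suffixes:
        if word.endswith(suffix):
            return word[:-len(suffix)]  # assumed: this line is not visible in the hunks
    return word

def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)  # strip punctuation
    text = re.sub(r'\d+', '', text)      # strip digits
    text = text.lower()
    text = " ".join([w for w in text.split() if w not in stop_words])  # drop stop words
    return " ".join([stem_word(w) for w in text.split()])              # stem each word

print(clean_text("The 3 models were trained on cleaned text!"))
# -> model train clean text

Note that because 's' precedes 'es' in the suffix list, the 'es' branch can never fire: any word ending in 'es' also ends in 's' and is caught first.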
 
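The final hunk only blanks out the comment above the module-level model setup. Since the body of paraphrase_with_model lies almost entirely outside these hunks, the following is a hedged sketch of how such a helper is typically assembled around t5-base; the "paraphrase: " prefix and the generate() parameters are assumptions, not code from this file.

from transformers import T5Tokenizer, T5ForConditionalGeneration

model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

def paraphrase_with_model(text, model, tokenizer):
    # Assumed body: only the final decode line appears in the diff.
    input_ids = tokenizer("paraphrase: " + text, return_tensors="pt").input_ids
    output_ids = model.generate(
        input_ids,
        max_length=128,      # assumed generation settings
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

print(paraphrase_with_model("The quick brown fox jumps over the lazy dog.", model, tokenizer))

Worth noting: stock t5-base was not pre-trained with a paraphrase task prefix, so a call like this only produces useful output after the fine-tuning this script is named for.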