GurgenGulay committed (verified)
Commit d745102 · Parent(s): 9612100

Update fine_tuning.py

Files changed (1):
  fine_tuning.py (+8 -10)
fine_tuning.py CHANGED
@@ -4,13 +4,12 @@ from datasets import Dataset
 from sklearn.model_selection import train_test_split
 import re
 
-# Logging configuration
+
 logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
 logger = logging.getLogger(__name__)
 
-stop_words = {"and", "or", "but", "the", "is", "are", "was", "were", "a", "an", "in", "on", "at", "of", "to", "with"}  # example stop words
+stop_words = {"and", "or", "but", "the", "is", "are", "was", "were", "a", "an", "in", "on", "at", "of", "to", "with"}
 def stem_word(word):
-    """A simple stemming function in place of PorterStemmer."""
     suffixes = ['ing', 'ed', 'ly', 's', 'es', 'er']
     for suffix in suffixes:
         if word.endswith(suffix):
@@ -18,12 +17,11 @@ def stem_word(word):
     return word
 
 def clean_text(text):
-    """Text cleaning function."""
-    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
-    text = re.sub(r'\d+', '', text)  # remove digits
-    text = text.lower()  # convert to lowercase
-    text = " ".join([word for word in text.split() if word not in stop_words])  # remove stop words
-    text = " ".join([stem_word(word) for word in text.split()])  # stemming
+    text = re.sub(r'[^\w\s]', '', text)
+    text = re.sub(r'\d+', '', text)
+    text = text.lower()
+    text = " ".join([word for word in text.split() if word not in stop_words])
+    text = " ".join([stem_word(word) for word in text.split()])
     return text
 
 def read_prompts(file_path):
@@ -58,7 +56,7 @@ def paraphrase_with_model(text, model, tokenizer):
     )
     return tokenizer.decode(output_ids[0], skip_special_tokens=True)
 
-# Load the model and tokenizer
+
 model_name = "t5-base"
 tokenizer = T5Tokenizer.from_pretrained(model_name)
 model = T5ForConditionalGeneration.from_pretrained(model_name)
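For reference, the text-cleaning pipeline behaves as below after this change. This is a minimal standalone sketch: the stop-word set and regexes are copied from the diff, but the suffix-stripping line inside stem_word sits outside the shown hunks, so word[:-len(suffix)] is an assumption, and the sample sentence is invented for illustration.

import re

stop_words = {"and", "or", "but", "the", "is", "are", "was", "were",
              "a", "an", "in", "on", "at", "of", "to", "with"}

def stem_word(word):
    # Crude suffix stripping; a rough stand-in for PorterStemmer.
    suffixes = ['ing', 'ed', 'ly', 's', 'es', 'er']
    for suffix in suffixes:
        if word.endswith(suffix):
            return word[:-len(suffix)]  # assumed: this line is not visible in the hunks
    return word

def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)  # strip punctuation
    text = re.sub(r'\d+', '', text)      # strip digits
    text = text.lower()
    text = " ".join([w for w in text.split() if w not in stop_words])  # drop stop words
    return " ".join([stem_word(w) for w in text.split()])              # stem each word

print(clean_text("The 3 models were trained on cleaned text!"))
# -> model train clean text

Note that because 's' precedes 'es' in the suffix list, the 'es' branch can never fire: any word ending in 'es' also ends in 's' and is caught first.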
 
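The final hunk only blanks out the comment above the module-level model setup. Since the body of paraphrase_with_model lies almost entirely outside these hunks, the following is a hedged sketch of how such a helper is typically assembled around t5-base; the "paraphrase: " prefix and the generate() parameters are assumptions, not code from this file.

from transformers import T5Tokenizer, T5ForConditionalGeneration

model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

def paraphrase_with_model(text, model, tokenizer):
    # Assumed body: only the final decode line appears in the diff.
    input_ids = tokenizer("paraphrase: " + text, return_tensors="pt").input_ids
    output_ids = model.generate(
        input_ids,
        max_length=128,      # assumed generation settings
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

print(paraphrase_with_model("The quick brown fox jumps over the lazy dog.", model, tokenizer))

Worth noting: stock t5-base was not pre-trained with a paraphrase task prefix, so a call like this only produces useful output after the fine-tuning this script is named for.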