Spaces:
Runtime error
Runtime error
Update fine_tuning.py
Browse files- fine_tuning.py +8 -10
fine_tuning.py
CHANGED
@@ -4,13 +4,12 @@ from datasets import Dataset
|
|
4 |
from sklearn.model_selection import train_test_split
|
5 |
import re
|
6 |
|
7 |
-
|
8 |
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
9 |
logger = logging.getLogger(__name__)
|
10 |
|
11 |
-
stop_words = {"and", "or", "but", "the", "is", "are", "was", "were", "a", "an", "in", "on", "at", "of", "to", "with"}
|
12 |
def stem_word(word):
|
13 |
-
"""PorterStemmer yerine basit bir gövdeleme fonksiyonu."""
|
14 |
suffixes = ['ing', 'ed', 'ly', 's', 'es', 'er']
|
15 |
for suffix in suffixes:
|
16 |
if word.endswith(suffix):
|
@@ -18,12 +17,11 @@ def stem_word(word):
|
|
18 |
return word
|
19 |
|
20 |
def clean_text(text):
|
21 |
-
|
22 |
-
text = re.sub(r'
|
23 |
-
text =
|
24 |
-
text = text.
|
25 |
-
text = " ".join([word for word in text.split()
|
26 |
-
text = " ".join([stem_word(word) for word in text.split()]) # Gövdeleme
|
27 |
return text
|
28 |
|
29 |
def read_prompts(file_path):
|
@@ -58,7 +56,7 @@ def paraphrase_with_model(text, model, tokenizer):
|
|
58 |
)
|
59 |
return tokenizer.decode(output_ids[0], skip_special_tokens=True)
|
60 |
|
61 |
-
|
62 |
model_name = "t5-base"
|
63 |
tokenizer = T5Tokenizer.from_pretrained(model_name)
|
64 |
model = T5ForConditionalGeneration.from_pretrained(model_name)
|
|
|
4 |
from sklearn.model_selection import train_test_split
|
5 |
import re
|
6 |
|
7 |
+
|
8 |
# Configure root logging once at import time; all module messages share
# this timestamp/level/message format.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# Minimal English stop-word list used by clean_text() to drop
# low-information tokens before stemming.
stop_words = {"and", "or", "but", "the", "is", "are", "was", "were", "a", "an", "in", "on", "at", "of", "to", "with"}
|
12 |
def stem_word(word):
|
|
|
13 |
suffixes = ['ing', 'ed', 'ly', 's', 'es', 'er']
|
14 |
for suffix in suffixes:
|
15 |
if word.endswith(suffix):
|
|
|
17 |
return word
|
18 |
|
19 |
def clean_text(text):
    """Normalize raw text for fine-tuning.

    Pipeline: strip punctuation, strip digits, lowercase, drop stop
    words, then stem each remaining token with the simple suffix
    stemmer (``stem_word``).

    Args:
        text: Arbitrary input string.

    Returns:
        The cleaned, space-joined string (may be empty).
    """
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation (keep word chars / whitespace)
    text = re.sub(r'\d+', '', text)      # remove digit runs
    text = text.lower()
    # Fused pass: the original filtered stop words, re-joined, re-split,
    # and then stemmed — tokenizing twice. Filtering (on the unstemmed
    # word) and stemming in one split yields the identical result.
    text = " ".join(stem_word(word) for word in text.split() if word not in stop_words)
    return text
|
26 |
|
27 |
def read_prompts(file_path):
|
|
|
56 |
)
|
57 |
return tokenizer.decode(output_ids[0], skip_special_tokens=True)
|
58 |
|
59 |
+
|
60 |
# Load the pretrained T5 checkpoint at module import time.
# NOTE(review): this downloads/loads weights as a side effect of
# importing the module — confirm that is intended for this script.
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
|