Update app.py
app.py CHANGED
@@ -1,21 +1,18 @@
 import re
-from nltk.corpus import stopwords
 import spacy
 from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
 from datasets import Dataset
 from sklearn.model_selection import train_test_split
 from spacy.cli import download
 
-#
+# Download and load the spaCy model
 download("en_core_web_sm")
-
-# Load the spaCy model
-nlp = spacy.load("en_core_web_sm")  # English model
+nlp = spacy.load("en_core_web_sm")
 
 # Text cleaning function
 def clean_text_for_education_with_spacy(text):
-    doc = nlp(text)
-    tokens = [token.text for token in doc if not token.is_stop and not token.is_punct]
+    doc = nlp(text)
+    tokens = [token.text for token in doc if not token.is_stop and not token.is_punct]
     return " ".join(tokens)
 
 # Read prompts
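The first hunk drops the unused NLTK stopwords import and moves the model download ahead of spacy.load(), so a fresh Space container no longer crashes on a missing en_core_web_sm package. Below is a minimal sketch of an equivalent but slightly more defensive startup, together with a quick check of the cleaning function; the load_spacy_model() helper is an illustration, not part of the commit:

import spacy
from spacy.cli import download

def load_spacy_model(name="en_core_web_sm"):
    # Illustrative variant: only hit the network when the model is missing,
    # so warm restarts of the Space skip the download step.
    try:
        return spacy.load(name)
    except OSError:  # raised when the model package is not installed
        download(name)
        return spacy.load(name)

nlp = load_spacy_model()

def clean_text_for_education_with_spacy(text):
    # Keep every token that is neither a stop word nor punctuation.
    doc = nlp(text)
    tokens = [token.text for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(tokens)

print(clean_text_for_education_with_spacy("The cat is sitting on the mat!"))
# -> cat sitting mat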
@@ -33,8 +30,8 @@ def read_prompts(file_path):
 
 # Prepare the dataset
 def prepare_data(input_texts, target_texts, tokenizer):
-    inputs = tokenizer(input_texts, max_length=
-    targets = tokenizer(target_texts, max_length=
+    inputs = tokenizer(input_texts, max_length=256, truncation=True, padding="max_length")
+    targets = tokenizer(target_texts, max_length=256, truncation=True, padding="max_length")
     return {"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"], "labels": targets["input_ids"]}
 
 # Paraphrasing function
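The second hunk carries the substantive fix: the old tokenizer calls appear cut off mid-argument, which would make app.py fail before anything runs. The new calls pin max_length=256 with truncation and max-length padding for both inputs and targets. As a minimal sketch of how prepare_data() plugs into datasets.Dataset, assuming a t5-small checkpoint and toy texts (neither is shown in this hunk):

from transformers import T5Tokenizer
from datasets import Dataset

tokenizer = T5Tokenizer.from_pretrained("t5-small")

def prepare_data(input_texts, target_texts, tokenizer):
    inputs = tokenizer(input_texts, max_length=256, truncation=True, padding="max_length")
    targets = tokenizer(target_texts, max_length=256, truncation=True, padding="max_length")
    return {"input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"],
            "labels": targets["input_ids"]}

encodings = prepare_data(["paraphrase: the cat sat"], ["the cat was sitting"], tokenizer)
dataset = Dataset.from_dict(encodings)  # one row per example, ready for Trainer
print(dataset)

One caveat with padding="max_length": the pad token ids stay in labels, so Trainer computes loss over the padding too; replacing those positions with -100 is the usual way to mask them out.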
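Both hunks stop just short of the paraphrasing function, whose body lies outside this diff. Purely as a hypothetical sketch of what such a function often looks like with the imported T5 classes (the checkpoint name, the "paraphrase: " prefix, and every generation setting are assumptions, not taken from the commit):

from transformers import T5ForConditionalGeneration, T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

def paraphrase(text):
    # Prefix the task, encode, generate with beam search, decode.
    ids = tokenizer("paraphrase: " + text, return_tensors="pt",
                    max_length=256, truncation=True).input_ids
    out = model.generate(ids, max_length=256, num_beams=4, early_stopping=True)
    return tokenizer.decode(out[0], skip_special_tokens=True)

print(paraphrase("The weather is very nice today."))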