GurgenGulay committed on
Commit 2fd0f16 · verified · 1 Parent(s): ec6340b

Update app.py

Files changed (1)
app.py +6 -9
app.py CHANGED
@@ -1,21 +1,18 @@
 import re
-from nltk.corpus import stopwords
 import spacy
 from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
 from datasets import Dataset
 from sklearn.model_selection import train_test_split
 from spacy.cli import download

-# Download the model
+# Download and load the spaCy model
 download("en_core_web_sm")
-
-# Load the spaCy model
-nlp = spacy.load("en_core_web_sm")  # English model
+nlp = spacy.load("en_core_web_sm")

 # Text cleaning function
 def clean_text_for_education_with_spacy(text):
-    doc = nlp(text)  # Process the text with spaCy
-    tokens = [token.text for token in doc if not token.is_stop and not token.is_punct]  # Remove stop words and punctuation
+    doc = nlp(text)
+    tokens = [token.text for token in doc if not token.is_stop and not token.is_punct]
     return " ".join(tokens)

 # Read prompts
@@ -33,8 +30,8 @@ def read_prompts(file_path):

 # Prepare the dataset
 def prepare_data(input_texts, target_texts, tokenizer):
-    inputs = tokenizer(input_texts, max_length=512, truncation=True, padding="max_length")
-    targets = tokenizer(target_texts, max_length=512, truncation=True, padding="max_length")
+    inputs = tokenizer(input_texts, max_length=256, truncation=True, padding="max_length")
+    targets = tokenizer(target_texts, max_length=256, truncation=True, padding="max_length")
     return {"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"], "labels": targets["input_ids"]}

 # Paraphrasing function
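
Below is a minimal sketch of how the two functions touched by this commit could be exercised end to end. It restates the definitions from the diff above; the "t5-small" checkpoint and the sample sentences are illustrative assumptions, not taken from the repository.

# Sketch: clean a text with spaCy, then tokenize it for T5 at the new max_length=256.
import spacy
from spacy.cli import download
from transformers import T5Tokenizer
from datasets import Dataset

download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")

def clean_text_for_education_with_spacy(text):
    # Drop stop words and punctuation, keep remaining token text
    doc = nlp(text)
    tokens = [token.text for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(tokens)

def prepare_data(input_texts, target_texts, tokenizer):
    # max_length=256 matches the value introduced by this commit (was 512)
    inputs = tokenizer(input_texts, max_length=256, truncation=True, padding="max_length")
    targets = tokenizer(target_texts, max_length=256, truncation=True, padding="max_length")
    return {"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"], "labels": targets["input_ids"]}

# Illustrative usage with made-up example sentences
tokenizer = T5Tokenizer.from_pretrained("t5-small")
inputs = [clean_text_for_education_with_spacy("Paraphrase: The mitochondria is the powerhouse of the cell.")]
targets = ["Mitochondria produce most of the cell's energy."]
dataset = Dataset.from_dict(prepare_data(inputs, targets, tokenizer))
print(dataset[0]["input_ids"][:10])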