GurgenGulay committed on
Commit b30cab0 · verified · 1 Parent(s): 1b94a22

Update app.py

Files changed (1)
  app.py +19 -89
app.py CHANGED
@@ -1,97 +1,27 @@
- import re
  import spacy
- from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
- from datasets import Dataset
- from sklearn.model_selection import train_test_split
- from spacy.cli import download

- # Download and load the spaCy model
- download("en_core_web_sm")
  nlp = spacy.load("en_core_web_sm")

- # Text-cleaning function
- def clean_text_for_education_with_spacy(text):
      doc = nlp(text)
      tokens = [token.text for token in doc if not token.is_stop and not token.is_punct]
      return " ".join(tokens)

- # Read the prompts
- def read_prompts(file_path):
-     input_texts = []
-     target_texts = []
-     with open(file_path, "r", encoding="utf-8") as file:
-         lines = file.readlines()
-     for line in lines:
-         if line.startswith("input:"):
-             input_texts.append(line.replace("input:", "").strip())
-         elif line.startswith("target:"):
-             target_texts.append(line.replace("target:", "").strip())
-     return input_texts, target_texts
-
- # Prepare the dataset
- def prepare_data(input_texts, target_texts, tokenizer):
-     inputs = tokenizer(input_texts, max_length=256, truncation=True, padding="max_length")
-     targets = tokenizer(target_texts, max_length=256, truncation=True, padding="max_length")
-     return {"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"], "labels": targets["input_ids"]}
-
- # Paraphrasing function
- def paraphrase_with_model(text, model, tokenizer):
-     prompt = "Teach the following content: " + text
-     inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)
-     output_ids = model.generate(
-         inputs["input_ids"],
-         do_sample=True,
-         top_k=50,
-         top_p=0.95,
-         temperature=1.0,
-         max_length=150,
-         no_repeat_ngram_size=2,
-         early_stopping=True
-     )
-     return tokenizer.decode(output_ids[0], skip_special_tokens=True)
-
- # Load the tokenizer and model
- model_name = "t5-base"
- tokenizer = T5Tokenizer.from_pretrained(model_name)
- model = T5ForConditionalGeneration.from_pretrained(model_name)
-
- # Read and clean the data
- input_texts, target_texts = read_prompts("prompts.txt")
- input_texts_cleaned = [clean_text_for_education_with_spacy(text) for text in input_texts]
- target_texts_cleaned = [clean_text_for_education_with_spacy(text) for text in target_texts]
-
- # Split the data into training and validation sets
- train_texts, val_texts, train_labels, val_labels = train_test_split(input_texts_cleaned, target_texts_cleaned, test_size=0.1)
-
- # Augmentation and dataset preparation
- augmented_input_texts = input_texts_cleaned + [paraphrase_with_model(text, model, tokenizer) for text in input_texts_cleaned[:10]]
- augmented_target_texts = target_texts_cleaned + [paraphrase_with_model(text, model, tokenizer) for text in target_texts_cleaned[:10]]
- train_dataset = Dataset.from_dict(prepare_data(augmented_input_texts, augmented_target_texts, tokenizer))
- val_dataset = Dataset.from_dict(prepare_data(val_texts, val_labels, tokenizer))
-
- # Training arguments
- training_args = TrainingArguments(
-     output_dir="./results",
-     evaluation_strategy="steps",
-     learning_rate=5e-5,
-     per_device_train_batch_size=4,
-     num_train_epochs=3,
-     save_steps=500,
-     logging_dir="./logs",
-     logging_steps=10
- )
-
- # Trainer
- trainer = Trainer(
-     model=model,
-     args=training_args,
-     train_dataset=train_dataset,
-     eval_dataset=val_dataset
- )
-
- # Training
- trainer.train()
-
- # Save the model
- model.save_pretrained("./fine_tuned_model")
- tokenizer.save_pretrained("./fine_tuned_model")
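
For reference, the read_prompts helper removed above expects prompts.txt to be a plain-text file with alternating input:/target: lines. A hypothetical example of that format (the actual file is not part of this commit):

input: The water cycle moves water between the ocean, the atmosphere, and land.
target: Explain the water cycle in simple terms for a classroom lesson.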
 
 
  import spacy
+ from transformers import T5Tokenizer
+ from fine_tuning import fine_tune_model  # import the function from fine_tuning.py

+ # Load the spaCy model
  nlp = spacy.load("en_core_web_sm")

+ def clean_text_with_spacy(text):
      doc = nlp(text)
      tokens = [token.text for token in doc if not token.is_stop and not token.is_punct]
      return " ".join(tokens)

+ # Function that passes the cleaned text to the model
+ def process_input_for_fine_tuning(input_texts, target_texts):
+     # Clean the texts
+     cleaned_input_texts = [clean_text_with_spacy(text) for text in input_texts]
+     cleaned_target_texts = [clean_text_with_spacy(text) for text in target_texts]
+
+     # Send the cleaned texts on for fine-tuning
+     fine_tune_model(cleaned_input_texts, cleaned_target_texts)
+
+ # Sample texts
+ input_texts = ["This is a sample input text.", "Another input text here."]
+ target_texts = ["This is the target output.", "Target output for second example."]
+
+ # Start the process that sends the cleaned data to fine_tuning.py
+ process_input_for_fine_tuning(input_texts, target_texts)
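
The fine_tuning.py module that app.py now delegates to is not shown in this diff. As a rough sketch, assuming it simply absorbs the training code removed from app.py above, its fine_tune_model function could look like the following (module layout, signature, and defaults are assumptions carried over from the removed code, not the actual file):

# fine_tuning.py -- hypothetical reconstruction from the code removed from app.py
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split


def prepare_data(input_texts, target_texts, tokenizer):
    # Tokenize inputs and targets to fixed-length sequences, as in the removed prepare_data()
    inputs = tokenizer(input_texts, max_length=256, truncation=True, padding="max_length")
    targets = tokenizer(target_texts, max_length=256, truncation=True, padding="max_length")
    return {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "labels": targets["input_ids"],
    }


def fine_tune_model(input_texts, target_texts, model_name="t5-base", output_dir="./fine_tuned_model"):
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)

    # Hold out 10% of the pairs for validation, mirroring the removed train_test_split call
    train_inputs, val_inputs, train_targets, val_targets = train_test_split(
        input_texts, target_texts, test_size=0.1
    )

    train_dataset = Dataset.from_dict(prepare_data(train_inputs, train_targets, tokenizer))
    val_dataset = Dataset.from_dict(prepare_data(val_inputs, val_targets, tokenizer))

    # Same hyperparameters as the removed TrainingArguments block
    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="steps",
        learning_rate=5e-5,
        per_device_train_batch_size=4,
        num_train_epochs=3,
        save_steps=500,
        logging_dir="./logs",
        logging_steps=10,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )

    trainer.train()

    # Persist the fine-tuned weights and tokenizer, as the removed code did
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

Under that assumption, app.py stays a thin preprocessing entry point while the Trainer setup, training loop, and model saving all live in fine_tuning.py.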