from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
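# The stopword list and punkt tokenizer are NLTK data packages; fetching them
# up front avoids a LookupError on first run (a likely cause of the Space's
# runtime error). Newer NLTK releases use punkt_tab instead of punkt.
nltk.download("stopwords", quiet=True)
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)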
# Stop words and stemmer
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()
# Text-cleaning function: strip punctuation and digits, lowercase,
# drop English stop words, then Porter-stem the remaining tokens
def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)   # remove punctuation
    text = re.sub(r'\d+', '', text)       # remove digits
    text = text.lower()
    text = " ".join([word for word in text.split() if word not in stop_words])
    text = " ".join([ps.stem(word) for word in word_tokenize(text)])
    return text
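# A quick illustration of the cleaning pipeline (output assumes the standard
# NLTK English stopword list):
#   clean_text("The 2 cats are running!")  ->  "cat run"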
# Read input/target prompt pairs from a plain-text file
def read_prompts(file_path):
    input_texts = []
    target_texts = []
    with open(file_path, "r", encoding="utf-8") as file:
        lines = file.readlines()
        for line in lines:
            if line.startswith("input:"):
                input_texts.append(line.replace("input:", "").strip())
            elif line.startswith("target:"):
                target_texts.append(line.replace("target:", "").strip())
    return input_texts, target_texts
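# Expected prompts.txt layout, inferred from the parsing above (the example
# sentences are placeholders):
#   input: How do I reset my password?
#   target: What is the procedure for resetting my password?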
# Build tokenized model inputs; pad-token positions in the labels are replaced
# with -100 so the cross-entropy loss ignores the (heavy) label padding
def prepare_data(input_texts, target_texts):
    inputs = tokenizer(input_texts, max_length=512, truncation=True, padding="max_length")
    targets = tokenizer(target_texts, max_length=512, truncation=True, padding="max_length")
    labels = [
        [(token if token != tokenizer.pad_token_id else -100) for token in seq]
        for seq in targets["input_ids"]
    ]
    return {"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"], "labels": labels}
# Paraphrase a single text with the model
def paraphrase_with_model(text, model, tokenizer):
    prompt = "paraphrase: " + text
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)
    output_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=1.0,
        max_length=150,
        no_repeat_ngram_size=2
        # early_stopping dropped: it only applies to beam search, not sampling
    )
    # max_length is a generate() argument, not a decode() one
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
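# Sampling (do_sample=True) makes the paraphrases nondeterministic; for
# reproducible augmentation, seed PyTorch first, e.g.:
#   import torch; torch.manual_seed(0)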
# Load tokenizer and model
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
# Read and clean the data
input_texts, target_texts = read_prompts("prompts.txt")
input_texts_cleaned = [clean_text(text) for text in input_texts]
target_texts_cleaned = [clean_text(text) for text in target_texts]
# Split into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(input_texts_cleaned, target_texts_cleaned, test_size=0.1)
# Augmentation and dataset preparation: the first 10 cleaned pairs plus their
# model-generated paraphrases form the training set. Note that train_texts /
# train_labels from the split above are not used here, so these pairs can
# overlap the validation set.
augmented_input_texts = input_texts_cleaned[:10] + [paraphrase_with_model(text, model, tokenizer) for text in input_texts_cleaned[:10]]
augmented_target_texts = target_texts_cleaned[:10] + [paraphrase_with_model(text, model, tokenizer) for text in target_texts_cleaned[:10]]
train_dataset = Dataset.from_dict(prepare_data(augmented_input_texts, augmented_target_texts))
val_dataset = Dataset.from_dict(prepare_data(val_texts, val_labels))
# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    save_steps=500,
    logging_dir="./logs",
    logging_steps=10
)
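# Note: recent transformers releases renamed evaluation_strategy to
# eval_strategy; switch the keyword above if your installed version rejects it.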
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)
# Train
trainer.train()
# Save the model
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")
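# Optional sanity check: reload the fine-tuned weights and paraphrase a sample
# sentence (a minimal sketch; the sample text is an arbitrary placeholder)
tuned_tokenizer = T5Tokenizer.from_pretrained("./fine_tuned_model")
tuned_model = T5ForConditionalGeneration.from_pretrained("./fine_tuned_model")
print(paraphrase_with_model("the weather is nice today", tuned_model, tuned_tokenizer))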