from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Download the NLTK resources used below (no-ops if already present)
nltk.download("stopwords", quiet=True)
nltk.download("punkt", quiet=True)

# Stop words and stemmer
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

# Text-cleaning function: strip punctuation and digits, lowercase,
# drop stop words, and stem the remaining tokens
def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    text = " ".join([word for word in text.split() if word not in stop_words])
    text = " ".join([ps.stem(word) for word in word_tokenize(text)])
    return text

# Read prompts from a file with alternating "input:" / "target:" lines
def read_prompts(file_path):
    input_texts = []
    target_texts = []
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            if line.startswith("input:"):
                input_texts.append(line.replace("input:", "").strip())
            elif line.startswith("target:"):
                target_texts.append(line.replace("target:", "").strip())
    return input_texts, target_texts

# Dataset preparation: tokenize inputs and targets; pad token ids in the
# labels are replaced with -100 so padding is ignored by the loss
def prepare_data(input_texts, target_texts):
    inputs = tokenizer(input_texts, max_length=512, truncation=True, padding="max_length")
    targets = tokenizer(target_texts, max_length=512, truncation=True, padding="max_length")
    labels = [
        [(token if token != tokenizer.pad_token_id else -100) for token in label]
        for label in targets["input_ids"]
    ]
    return {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "labels": labels,
    }

# Paraphrasing function: prefix the text with "paraphrase: " and sample a rewrite
def paraphrase_with_model(text, model, tokenizer):
    prompt = "paraphrase: " + text
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)
    output_ids = model.generate(
        **inputs,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=1.0,
        max_length=150,
        no_repeat_ngram_size=2
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Load tokenizer and model
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Read and clean the data
input_texts, target_texts = read_prompts("prompts.txt")
input_texts_cleaned = [clean_text(text) for text in input_texts]
target_texts_cleaned = [clean_text(text) for text in target_texts]

# Split into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    input_texts_cleaned, target_texts_cleaned, test_size=0.1
)

# Augmentation and dataset preparation: paraphrase the first 10 training pairs
# with the (not yet fine-tuned) base model and add the rewrites to the originals
augmented_input_texts = train_texts[:10] + [
    paraphrase_with_model(text, model, tokenizer) for text in train_texts[:10]
]
augmented_target_texts = train_labels[:10] + [
    paraphrase_with_model(text, model, tokenizer) for text in train_labels[:10]
]
train_dataset = Dataset.from_dict(prepare_data(augmented_input_texts, augmented_target_texts))
val_dataset = Dataset.from_dict(prepare_data(val_texts, val_labels))

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    save_steps=500,
    logging_dir="./logs",
    logging_steps=10
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")
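
# --- Quick inference sketch (an assumption: run only after trainer.train() finishes) ---
# Reload the saved checkpoint and paraphrase one illustrative sentence, reusing
# paraphrase_with_model, which prepends the "paraphrase: " prefix before generating.
# The example sentence below is hypothetical.
fine_tuned_tokenizer = T5Tokenizer.from_pretrained("./fine_tuned_model")
fine_tuned_model = T5ForConditionalGeneration.from_pretrained("./fine_tuned_model")
example_sentence = "machine learning models can rewrite sentences in new ways"
print(paraphrase_with_model(example_sentence, fine_tuned_model, fine_tuned_tokenizer))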