import logging
import re

from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

stop_words = {"and", "or", "but", "the", "is", "are", "was", "were", "a", "an", "in", "on", "at", "of", "to", "with"}


def stem_word(word):
    """Strip one common suffix as a crude, rule-based stemmer."""
    # Check "es" before "s" so words ending in "es" are not caught by the shorter suffix first.
    suffixes = ["ing", "ed", "ly", "es", "s", "er"]
    for suffix in suffixes:
        if word.endswith(suffix) and len(word) > len(suffix):
            return word[:-len(suffix)]
    return word


def clean_text(text):
    """Lowercase, strip punctuation and digits, drop stop words, then stem each word."""
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"\d+", "", text)
    text = text.lower()
    text = " ".join(word for word in text.split() if word not in stop_words)
    text = " ".join(stem_word(word) for word in text.split())
    return text


def read_prompts(file_path):
    """Read input/target pairs from a plain-text file.

    Expected line format:
        input: <source text>
        target: <target text>
    """
    input_texts = []
    target_texts = []
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            if line.startswith("input:"):
                input_texts.append(line.replace("input:", "", 1).strip())
            elif line.startswith("target:"):
                target_texts.append(line.replace("target:", "", 1).strip())
    return input_texts, target_texts


def prepare_data(input_texts, target_texts, tokenizer):
    """Tokenize inputs and targets into fixed-length sequences for seq2seq training."""
    inputs = tokenizer(input_texts, max_length=512, truncation=True, padding="max_length")
    targets = tokenizer(target_texts, max_length=512, truncation=True, padding="max_length")
    # Note: padding token ids remain in the labels; replacing them with -100
    # would exclude padded positions from the loss.
    return {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "labels": targets["input_ids"],
    }


# Fine-tuning
def fine_tune_model():
    """Fine-tune t5-base on the input/target pairs in prompts.txt."""
    model_name = "t5-base"
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)

    try:
        logger.info("Reading and cleaning prompts.")
        input_texts, target_texts = read_prompts("prompts.txt")
        # The same aggressive cleaning (stop-word removal, stemming) is applied to
        # the targets, so the model learns to produce text in that cleaned form.
        input_texts_cleaned = [clean_text(text) for text in input_texts]
        target_texts_cleaned = [clean_text(text) for text in target_texts]

        logger.info("Splitting dataset into training and validation sets.")
        train_texts, val_texts, train_labels, val_labels = train_test_split(
            input_texts_cleaned, target_texts_cleaned, test_size=0.1
        )

        logger.info("Preparing datasets for training.")
        train_dataset = Dataset.from_dict(prepare_data(train_texts, train_labels, tokenizer))
        val_dataset = Dataset.from_dict(prepare_data(val_texts, val_labels, tokenizer))

        training_args = TrainingArguments(
            output_dir="./results",
            evaluation_strategy="steps",
            learning_rate=5e-5,
            per_device_train_batch_size=4,
            num_train_epochs=3,
            save_steps=500,
            logging_dir="./logs",
            logging_steps=10,
        )

        logger.info("Starting model training.")
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
        )
        trainer.train()

        logger.info("Saving fine-tuned model.")
        model.save_pretrained("./fine_tuned_model")
        tokenizer.save_pretrained("./fine_tuned_model")
    except Exception as e:
        logger.error(f"An error occurred during fine-tuning: {str(e)}")


fine_tune_model()
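
# A minimal inference sketch, assuming the run above completed and wrote the model
# and tokenizer to ./fine_tuned_model. The function name, prompt, and generation
# settings (max_length, num_beams) are illustrative assumptions, not part of the
# training pipeline itself.
def generate_example(prompt):
    tokenizer = T5Tokenizer.from_pretrained("./fine_tuned_model")
    model = T5ForConditionalGeneration.from_pretrained("./fine_tuned_model")
    # Apply the same cleaning used at training time so the input matches what the model saw.
    encoded = tokenizer(clean_text(prompt), return_tensors="pt", truncation=True, max_length=512)
    output_ids = model.generate(encoded.input_ids, max_length=128, num_beams=4)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Example call (hypothetical prompt):
# print(generate_example("summarize the following meeting notes"))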