import logging
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
import re


logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# A small stop-word list and a naive suffix-stripping stemmer used for
# lightweight text normalization before tokenization.
stop_words = {"and", "or", "but", "the", "is", "are", "was", "were", "a", "an", "in", "on", "at", "of", "to", "with"}

def stem_word(word):
    # Strip the first matching suffix; a crude heuristic, not a full stemmer.
    suffixes = ['ing', 'ed', 'ly', 's', 'es', 'er']
    for suffix in suffixes:
        if word.endswith(suffix):
            return word[:-len(suffix)]
    return word

def clean_text(text):
    # Remove punctuation and digits, lowercase, drop stop words, then stem.
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    text = " ".join([word for word in text.split() if word not in stop_words])
    text = " ".join([stem_word(word) for word in text.split()])
    return text
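
# Worked example of the cleaning rules above (illustrative input):
#   clean_text("The cats were running quickly in 2023!") -> "cat runn quick"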

def read_prompts(file_path):
    # Parse a prompts file where each line is prefixed with "input:" or "target:".
    input_texts = []
    target_texts = []
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            if line.startswith("input:"):
                input_texts.append(line.replace("input:", "", 1).strip())
            elif line.startswith("target:"):
                target_texts.append(line.replace("target:", "", 1).strip())
    return input_texts, target_texts
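
# Expected prompts.txt layout (inferred from the parsing above), e.g.:
#   input: <source text>
#   target: <desired output text>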

def prepare_data(input_texts, target_texts, tokenizer):
    # Tokenize inputs and targets; the target token ids serve as labels.
    inputs = tokenizer(input_texts, max_length=512, truncation=True, padding="max_length")
    targets = tokenizer(target_texts, max_length=512, truncation=True, padding="max_length")
    return {"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"], "labels": targets["input_ids"]}

# Fine-tuning
def fine_tune_model():
    model_name = "t5-base"
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)

    try:
        logger.info("Reading and cleaning prompts.")
        input_texts, target_texts = read_prompts("prompts.txt")
        input_texts_cleaned = [clean_text(text) for text in input_texts]
        target_texts_cleaned = [clean_text(text) for text in target_texts]

        logger.info("Splitting dataset into training and validation sets.")
        train_texts, val_texts, train_labels, val_labels = train_test_split(input_texts_cleaned, target_texts_cleaned, test_size=0.1)

        logger.info("Preparing datasets for training.")
        train_dataset = Dataset.from_dict(prepare_data(train_texts, train_labels, tokenizer))
        val_dataset = Dataset.from_dict(prepare_data(val_texts, val_labels, tokenizer))

        training_args = TrainingArguments(
            output_dir="./results",
            evaluation_strategy="steps",
            learning_rate=5e-5,
            per_device_train_batch_size=4,
            num_train_epochs=3,
            save_steps=500,
            logging_dir="./logs",
            logging_steps=10
        )

        logger.info("Starting model training.")
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset
        )
        trainer.train()

        logger.info("Saving fine-tuned model.")
        model.save_pretrained("./fine_tuned_model")
        tokenizer.save_pretrained("./fine_tuned_model")

    except Exception as e:
        logger.error(f"An error occurred during fine-tuning: {str(e)}")

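# Optional: a minimal inference sketch (not called by default) showing how the
# saved model could be used after fine-tuning. The prompt, max_length, and the
# function name are illustrative assumptions; the model directory matches the
# save path used above.
def generate_text(prompt, model_dir="./fine_tuned_model", max_length=128):
    tokenizer = T5Tokenizer.from_pretrained(model_dir)
    model = T5ForConditionalGeneration.from_pretrained(model_dir)
    inputs = tokenizer(prompt, return_tensors="pt")
    output_ids = model.generate(**inputs, max_length=max_length)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
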
if __name__ == "__main__":
    fine_tune_model()