"""Fine-tune a T5 model on input/target prompt pairs read from prompts.txt."""
import logging
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
import re
# Configure root logging once at import time; all output goes through `logger`.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
# Minimal hand-rolled English stop-word list consumed by clean_text().
stop_words = {"and", "or", "but", "the", "is", "are", "was", "were", "a", "an", "in", "on", "at", "of", "to", "with"}
def stem_word(word):
    """Strip one common English suffix from *word*, if present.

    Suffixes are checked longest-first: in the original order ('s' before
    'es') the 'es' branch was unreachable, because any word ending in 'es'
    also ends in 's'. Returns the word unchanged when no suffix matches.
    """
    # Longest suffixes first so shorter ones cannot shadow them.
    suffixes = ['ing', 'ed', 'ly', 'es', 'er', 's']
    for suffix in suffixes:
        if word.endswith(suffix):
            return word[:-len(suffix)]
    return word
def clean_text(text):
    """Normalize *text*: strip punctuation and digits, lowercase,
    drop stop words, then stem each remaining word."""
    without_punct = re.sub(r'[^\w\s]', '', text)
    without_digits = re.sub(r'\d+', '', without_punct)
    lowered = without_digits.lower()
    # Filter stop words and stem in a single pass over the tokens;
    # equivalent to the original join-then-resplit sequence.
    kept = (word for word in lowered.split() if word not in stop_words)
    return " ".join(stem_word(word) for word in kept)
def read_prompts(file_path):
    """Read 'input:'/'target:' tagged lines from *file_path*.

    Returns a pair (input_texts, target_texts); lines without a
    recognized tag are ignored.
    """
    input_texts = []
    target_texts = []
    with open(file_path, "r", encoding="utf-8") as file:
        # Iterate the file lazily instead of materializing readlines().
        for line in file:
            # Slice off only the leading tag: the original str.replace()
            # also deleted the tag if it appeared inside the text itself.
            if line.startswith("input:"):
                input_texts.append(line[len("input:"):].strip())
            elif line.startswith("target:"):
                target_texts.append(line[len("target:"):].strip())
    return input_texts, target_texts
def prepare_data(input_texts, target_texts, tokenizer):
    """Tokenize parallel input/target texts into a feature dict.

    The tokenizer is an explicit parameter: the original body read a
    global `tokenizer` that is never defined at module level, and both
    call sites already pass it as a third argument (which previously
    raised TypeError).
    """
    inputs = tokenizer(input_texts, max_length=512, truncation=True, padding="max_length")
    targets = tokenizer(target_texts, max_length=512, truncation=True, padding="max_length")
    # Seq2seq models such as T5 expect the target token ids under "labels".
    return {"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"], "labels": targets["input_ids"]}
# Fine-tuning
def fine_tune_model():
    """Fine-tune t5-base on prompt pairs from prompts.txt and save the result.

    Reads 'input:'/'target:' lines, cleans them, splits 90/10 into
    train/validation sets, trains with the Hugging Face Trainer, and
    writes the model and tokenizer to ./fine_tuned_model. Errors are
    logged (with traceback) rather than propagated.
    """
    model_name = "t5-base"
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)
    try:
        logger.info("Reading and cleaning prompts.")
        input_texts, target_texts = read_prompts("prompts.txt")
        input_texts_cleaned = [clean_text(text) for text in input_texts]
        target_texts_cleaned = [clean_text(text) for text in target_texts]
        logger.info("Splitting dataset into training and validation sets.")
        train_texts, val_texts, train_labels, val_labels = train_test_split(input_texts_cleaned, target_texts_cleaned, test_size=0.1)
        logger.info("Preparing datasets for training.")
        train_dataset = Dataset.from_dict(prepare_data(train_texts, train_labels, tokenizer))
        val_dataset = Dataset.from_dict(prepare_data(val_texts, val_labels, tokenizer))
        training_args = TrainingArguments(
            output_dir="./results",
            evaluation_strategy="steps",
            learning_rate=5e-5,
            per_device_train_batch_size=4,
            num_train_epochs=3,
            save_steps=500,
            logging_dir="./logs",
            logging_steps=10
        )
        logger.info("Starting model training.")
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset
        )
        trainer.train()
        logger.info("Saving fine-tuned model.")
        model.save_pretrained("./fine_tuned_model")
        tokenizer.save_pretrained("./fine_tuned_model")
    except Exception:
        # logger.exception records the full traceback, not just str(e).
        logger.exception("An error occurred during fine-tuning")


if __name__ == "__main__":
    # Guard the entry point so importing this module has no side effects.
    fine_tune_model()