|
from transformers import BertTokenizer, BertForTokenClassification, Trainer, TrainingArguments |
|
from datasets import Dataset |
|
import json |
|
import torch |
|
|
|
|
|
# Italian uncased BERT checkpoint, shared by the tokenizer and the model so
# the two can never drift apart.
MODEL_NAME = "dbmdz/bert-base-italian-uncased"

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# Pretrained encoder with a fresh 5-way token-classification head on top.
model = BertForTokenClassification.from_pretrained(MODEL_NAME, num_labels=5)
|
|
|
|
|
# Load the labeled queries. The encoding is pinned to UTF-8: without it,
# open() falls back to the platform default locale encoding, which breaks
# on non-ASCII (Italian) text on some systems (e.g. cp1252 on Windows).
with open('entity_dataset.json', 'r', encoding='utf-8') as f:
    dataset = json.load(f)
|
|
|
|
|
def prepare_dataset(dataset):
    """Tokenize the queries and attach per-token labels as a ``Dataset``.

    Each entry is expected to look like
    ``{"query": str, "entities": list[int]}`` where ``entities`` holds one
    label id per token.

    Label rows are padded (and truncated) to the tokenized sequence length
    with ``-100`` — the ignore index of the token-classification loss — so
    padding positions contribute nothing to training.  The original code
    passed the ragged label lists straight to ``torch.tensor``, which raises
    as soon as two examples have different lengths.
    """
    input_texts = [entry["query"] for entry in dataset]
    labels = [entry["entities"] for entry in dataset]

    encodings = tokenizer(input_texts, truncation=True, padding=True, max_length=512)

    # padding=True pads every example to the longest one in the batch, so all
    # rows of input_ids share a single length.
    seq_len = len(encodings["input_ids"][0])

    # NOTE(review): this assumes each "entities" list is already aligned with
    # the tokenizer's sub-word tokens (including [CLS]/[SEP]); if the labels
    # are per *word*, they must be realigned (e.g. via word_ids() on a fast
    # tokenizer) — confirm against how entity_dataset.json was produced.
    padded_labels = [
        (row + [-100] * (seq_len - len(row)))[:seq_len] for row in labels
    ]

    encodings["labels"] = torch.tensor(padded_labels)

    return Dataset.from_dict(encodings)
|
|
|
|
|
train_dataset = prepare_dataset(dataset) |
|
|
|
|
|
# BUG FIX: the original set evaluation_strategy="epoch", but the Trainer
# below receives no eval_dataset, so the first end-of-epoch evaluation
# raises "Trainer: evaluation requires an eval_dataset" and training dies.
# With no held-out split in this script, evaluation is disabled.
training_args = TrainingArguments(
    output_dir="./results",          # checkpoints and logs
    evaluation_strategy="no",        # no eval_dataset -> never evaluate
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)
|
|
|
|
|
# Fine-tune the classification head (and the encoder) on the prepared data.
trainer.train()

# Persist both the model weights and the tokenizer files together so the
# fine-tuned pipeline can be reloaded with from_pretrained("./hotel_model").
model.save_pretrained("./hotel_model")

tokenizer.save_pretrained("./hotel_model")
|
|