from transformers import TrainingArguments, Trainer
from datasets import load_dataset
import evaluate
import numpy as np

from modeling_octagon import OctagonForSequenceClassification, OctagonConfig
from tokenization_octagon import OctagonTokenizer

# Load dataset
dataset = load_dataset("imdb")

# Sample training (for demo purposes, use a smaller subset)
train_dataset = dataset["train"].shuffle(seed=42).select(range(1000))
eval_dataset = dataset["test"].shuffle(seed=42).select(range(200))

# Initialize tokenizer by training it on the demo corpus
tokenizer = OctagonTokenizer.train_tokenizer(
    texts=train_dataset["text"],
    vocab_size=30522,
    save_path="octagon-tokenizer.json"
)

# Tokenize function
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_eval = eval_dataset.map(tokenize_function, batched=True)

# Model config
config = OctagonConfig(
    vocab_size=30522,
    hidden_size=128,  # Smaller for demo
    num_hidden_layers=4,
    num_attention_heads=4,
    intermediate_size=512,
    num_labels=2
)

model = OctagonForSequenceClassification(config)

# Metrics
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Training args
training_args = TrainingArguments(
    output_dir="octagon_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    compute_metrics=compute_metrics,
)

# Train
trainer.train()

# Save model and tokenizer
model.save_pretrained("octagon_model")
tokenizer.save_pretrained("octagon_model")
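
# --- Optional: quick inference check after training (a minimal sketch) ---
# This assumes OctagonForSequenceClassification and OctagonTokenizer follow the
# standard Hugging Face from_pretrained / __call__ conventions; adjust if the
# custom classes in modeling_octagon / tokenization_octagon differ.
import torch

loaded_model = OctagonForSequenceClassification.from_pretrained("octagon_model")
loaded_tokenizer = OctagonTokenizer.from_pretrained("octagon_model")
loaded_model.eval()

sample = "A surprisingly heartfelt film with a strong cast."
inputs = loaded_tokenizer(sample, padding="max_length", truncation=True, return_tensors="pt")

with torch.no_grad():
    logits = loaded_model(**inputs).logits

predicted_label = int(logits.argmax(dim=-1))
print(f"Predicted class: {predicted_label}")  # IMDB: 0 = negative, 1 = positive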