Georg4000 committed on
Commit
2a9c962
·
verified ·
1 Parent(s): d14d713

Create train.py

Browse files
Files changed (1) hide show
  1. train.py +76 -0
train.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import TrainingArguments, Trainer
2
+ from datasets import load_dataset
3
+ import evaluate
4
+ import numpy as np
5
+ from modeling_octagon import OctagonForSequenceClassification, OctagonConfig
6
+ from tokenization_octagon import OctagonTokenizer
7
+
8
# Load the "imdb" dataset, then keep only small shuffled subsets so the demo
# run stays short: 1000 rows from the train split, 200 from the test split.
dataset = load_dataset("imdb")

shuffled_train = dataset["train"].shuffle(seed=42)
train_dataset = shuffled_train.select(range(1000))

shuffled_test = dataset["test"].shuffle(seed=42)
eval_dataset = shuffled_test.select(range(200))
14
+
15
# Build the tokenizer from the raw texts of the training subset.
# NOTE(review): save_path is presumably where train_tokenizer writes the
# trained tokenizer file — confirm in tokenization_octagon.
training_texts = train_dataset["text"]
tokenizer = OctagonTokenizer.train_tokenizer(
    texts=training_texts,
    vocab_size=30522,
    save_path="octagon-tokenizer.json",
)
21
+
22
def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of a batch of examples.

    Args:
        examples: A mapping that holds the raw texts under the ``"text"`` key.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts when
        called with ``padding="max_length"`` and ``truncation=True``.
    """
    texts = examples["text"]
    # NOTE(review): padding="max_length" without an explicit max_length relies
    # on the tokenizer's own configured maximum — confirm OctagonTokenizer
    # defines one, otherwise sequences may not end up equally long.
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded
25
+
26
# Tokenize both subsets in batched mode (training subset first, then eval).
tokenized_train, tokenized_eval = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, eval_dataset)
)
28
+
29
# Model configuration for a two-label classifier. vocab_size is the same
# value (30522) that is passed when the tokenizer is trained.
model_settings = {
    "vocab_size": 30522,
    "hidden_size": 128,  # Smaller for demo
    "num_hidden_layers": 4,
    "num_attention_heads": 4,
    "intermediate_size": 512,
    "num_labels": 2,
}
config = OctagonConfig(**model_settings)

# Fresh (untrained) model built from the configuration above.
model = OctagonForSequenceClassification(config)
40
+
41
# Accuracy metric used to score evaluation predictions.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``. ``logits``
            may be a single array or a tuple of model outputs.

    Returns:
        The result of ``metric.compute`` for the arg-max predictions against
        ``labels``.
    """
    logits, labels = eval_pred
    # A model that returns several outputs hands over a tuple here; by
    # Hugging Face convention the first element is the logits. Without this
    # unwrap, np.argmax would run over the whole tuple.
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)
48
+
49
# Training args
# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases and the old keyword was later removed. Pick whichever keyword the
# installed TrainingArguments exposes so the script runs on both sides of
# the rename.
eval_strategy_key = (
    "eval_strategy"
    if hasattr(TrainingArguments, "eval_strategy")
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="octagon_model",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    # Evaluation and checkpointing both happen once per epoch, which lets
    # the best checkpoint be restored when training finishes.
    load_best_model_at_end=True,
    **{eval_strategy_key: "epoch"},
)
61
+
62
# Wire the model, arguments, tokenized subsets and metric callback together.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_train,
    "eval_dataset": tokenized_eval,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_kwargs)
70
+
71
# Run training.
trainer.train()

# Write the model first, then the tokenizer, into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("octagon_model")