# eyebotIndoBert/train_indobert.py
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import json
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, classification_report
import numpy as np
import matplotlib.pyplot as plt
# Load the training dataset from datasets_new.json
with open('datasets_new.json', 'r') as f:
    datasets = json.load(f)
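# Assumed shape of datasets_new.json, inferred from the fields read below
# (the tag and pattern values are illustrative only):
# {
#   "intents": [
#     {"tag": "greeting", "patterns": ["halo", "selamat pagi"]},
#     ...
#   ]
# }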
texts = []
labels = []
tags = sorted(set(dataset['tag'] for dataset in datasets['intents']))
for dataset in datasets['intents']:
    for pattern in dataset['patterns']:
        texts.append(pattern)
        labels.append(tags.index(dataset['tag']))
# Log the class distribution before splitting the data
label_counts = np.bincount(labels)
print("Class distribution before the split:")
for tag, count in zip(tags, label_counts):
    print(f"Tag: {tag}, sample count: {count}")
# Split the data into train and validation sets
texts_train, texts_val, labels_train, labels_val = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels
)
# Log the class distribution after the split
val_label_counts = np.bincount(labels_val, minlength=len(tags))
print("\nClass distribution in the validation data:")
for tag, count in zip(tags, val_label_counts):
    print(f"Tag: {tag}, sample count: {count}")
# Load the test dataset from test_dataset.json
with open('test_dataset.json', 'r') as f:
    test_data = json.load(f)
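# Assumed shape of test_dataset.json, inferred from the fields read below
# (values are illustrative only):
# [
#   {"text": "halo, apa kabar?", "true_intent": "greeting"},
#   ...
# ]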
# Extract the texts and their ground-truth intents
texts_test_raw = [item['text'] for item in test_data]
true_intents = [item['true_intent'] for item in test_data]
# Warn about intents that are missing from tags
missing_intents = set(true_intents) - set(tags)
if missing_intents:
    print(f"Warning: intents {missing_intents} were not found in tags. Make sure datasets_new.json covers every intent.")
# Convert intents to label indices, dropping text/label pairs together so the two lists stay aligned
texts_test = [text for text, intent in zip(texts_test_raw, true_intents) if intent in tags]
labels_test = [tags.index(intent) for intent in true_intents if intent in tags]
# Initialize the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")
model = AutoModelForSequenceClassification.from_pretrained("indobenchmark/indobert-base-p1", num_labels=len(tags))
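# Optional sketch (not in the original script): passing the label mapping into the
# config stores it inside the saved checkpoint, so inference code does not need to
# rebuild `tags` from the training JSON:
# model = AutoModelForSequenceClassification.from_pretrained(
#     "indobenchmark/indobert-base-p1", num_labels=len(tags),
#     id2label={i: t for i, t in enumerate(tags)},
#     label2id={t: i for i, t in enumerate(tags)},
# )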
class IndoBERTDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __getitem__(self, idx):
        # Tokenize lazily per item; max_length=20 assumes short intent patterns
        encoding = tokenizer(self.texts[idx], padding='max_length', truncation=True, max_length=20, return_tensors='pt')
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

    def __len__(self):
        return len(self.texts)
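# Quick sanity check (illustrative, commented out): each item should yield
# fixed-length tensors of shape (20,) matching max_length above.
# sample = IndoBERTDataset(texts_train[:1], labels_train[:1])[0]
# print(sample['input_ids'].shape, sample['attention_mask'].shape)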
# Build datasets and dataloaders for train, val, and test
train_dataset = IndoBERTDataset(texts_train, labels_train)
val_dataset = IndoBERTDataset(texts_val, labels_val)
test_dataset = IndoBERTDataset(texts_test, labels_test)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)
test_loader = DataLoader(test_dataset, batch_size=8)
# Training setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
num_epochs = 10
best_val_loss = float('inf')
patience = 3
counter = 0
# Lists for tracking training and validation loss
train_losses = []
val_losses = []
for epoch in range(num_epochs):
    model.train()
    train_loss_epoch = 0
    for batch in train_loader:
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(device)
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        train_loss_epoch += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # Average training loss for the epoch
    train_loss_epoch /= len(train_loader)
    train_losses.append(train_loss_epoch)
    # Validation
    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in val_loader:
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            labels = batch['labels'].to(device)
            outputs = model(**inputs, labels=labels)
            val_loss += outputs.loss.item()
            _, predicted = torch.max(outputs.logits, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    val_loss /= len(val_loader)
    val_losses.append(val_loss)
    val_accuracy = 100 * correct / total
    # Precision, recall, and F1-score on the validation set
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average='weighted', zero_division=0
    )
    print(f'Epoch {epoch+1}, Train Loss: {train_loss_epoch:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.2f}%')
    print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}')
    # Per-tag classification report on the validation set
    unique_labels = sorted(set(all_labels))
    filtered_tags = [tags[i] for i in unique_labels]
    print("\nValidation Classification Report:")
    print(classification_report(all_labels, all_preds, labels=unique_labels, target_names=filtered_tags, zero_division=0))
    # Early stopping: keep the best checkpoint, stop after `patience` epochs without improvement
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        counter = 0
        # Save the best model so far
        model.save_pretrained("indobert_model")
        tokenizer.save_pretrained("indobert_model")
    else:
        counter += 1
        if counter >= patience:
            print("Early stopping triggered")
            break
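# The loop above may run past the best epoch before early stopping fires, so the
# in-memory weights are not necessarily the best ones. Reloading the best checkpoint
# saved above (a sketch; assumes at least one epoch improved and wrote the directory):
model = AutoModelForSequenceClassification.from_pretrained("indobert_model")
model.to(device)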
# Evaluation on the test data from test_dataset.json
model.eval()
test_correct = 0
test_total = 0
test_preds = []
test_labels = []
with torch.no_grad():
    for batch in test_loader:
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(device)
        outputs = model(**inputs)  # no labels needed here; only the logits are used
        _, predicted = torch.max(outputs.logits, 1)
        test_total += labels.size(0)
        test_correct += (predicted == labels).sum().item()
        test_preds.extend(predicted.cpu().numpy())
        test_labels.extend(labels.cpu().numpy())
test_accuracy = 100 * test_correct / test_total
test_precision, test_recall, test_f1, _ = precision_recall_fscore_support(
    test_labels, test_preds, average='weighted', zero_division=0
)
# Classification report for the test data
unique_test_labels = sorted(set(test_labels))
filtered_test_tags = [tags[i] for i in unique_test_labels]
print(f'\nTest Accuracy: {test_accuracy:.2f}%')
print(f'Test Precision: {test_precision:.4f}, Recall: {test_recall:.4f}, F1-Score: {test_f1:.4f}')
print("\nTest Classification Report:")
print(classification_report(test_labels, test_preds, labels=unique_test_labels, target_names=filtered_test_tags, zero_division=0))
print("Training Selesai. Best model disimpan di indobert_model/")
# Plot the loss curves
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(train_losses) + 1), train_losses, label='Train Loss', marker='o')
plt.plot(range(1, len(val_losses) + 1), val_losses, label='Validation Loss', marker='s')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss Curves')
plt.legend()
plt.grid(True)
plt.savefig('loss_curve.png')
plt.show()