from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import json
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, classification_report
import numpy as np
import matplotlib.pyplot as plt

# Load the training dataset from datasets_new.json
with open('datasets_new.json', 'r') as f:
    datasets = json.load(f)

texts = []
labels = []
tags = sorted(set(dataset['tag'] for dataset in datasets['intents']))
for dataset in datasets['intents']:
    for pattern in dataset['patterns']:
        texts.append(pattern)
        labels.append(tags.index(dataset['tag']))

# Log the class distribution before splitting the data
label_counts = np.bincount(labels)
print("Class distribution before splitting:")
for tag, count in zip(tags, label_counts):
    print(f"Tag: {tag}, Sample count: {count}")

# Split the data into train and validation sets
texts_train, texts_val, labels_train, labels_val = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels
)

# Log the class distribution after the split
val_label_counts = np.bincount(labels_val, minlength=len(tags))
print("\nClass distribution in the validation data:")
for tag, count in zip(tags, val_label_counts):
    print(f"Tag: {tag}, Sample count: {count}")

# Load the test dataset from test_dataset.json
with open('test_dataset.json', 'r') as f:
    test_data = json.load(f)

# Extract the texts and their ground-truth intents
texts_test = [item['text'] for item in test_data]
true_intents = [item['true_intent'] for item in test_data]

# Warn about any intent that does not appear in tags
missing_intents = set(true_intents) - set(tags)
if missing_intents:
    print(f"Warning: intents {missing_intents} not found in tags. "
          f"Make sure datasets_new.json covers every intent.")

# Convert intents to label indices based on tags. Filter texts and labels
# together so the two lists stay aligned when an unknown intent is dropped
# (filtering only the labels would silently misalign them with the texts).
paired_test = [(text, tags.index(intent))
               for text, intent in zip(texts_test, true_intents)
               if intent in tags]
texts_test = [text for text, _ in paired_test]
labels_test = [label for _, label in paired_test]
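# For reference, the loading code above assumes the common "intents" schema
# for datasets_new.json; the field names ('intents', 'tag', 'patterns',
# 'text', 'true_intent') come from the code, while the example values below
# are illustrative only:
#
#     datasets_new.json:
#     {
#       "intents": [
#         {"tag": "greeting", "patterns": ["halo", "selamat pagi"]},
#         {"tag": "goodbye",  "patterns": ["sampai jumpa"]}
#       ]
#     }
#
#     test_dataset.json (a flat list of labeled utterances):
#     [
#       {"text": "halo kak", "true_intent": "greeting"}
#     ]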
# Initialize the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")
model = AutoModelForSequenceClassification.from_pretrained(
    "indobenchmark/indobert-base-p1", num_labels=len(tags)
)

class IndoBERTDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __getitem__(self, idx):
        encoding = tokenizer(self.texts[idx], padding='max_length',
                             truncation=True, max_length=20, return_tensors='pt')
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

    def __len__(self):
        return len(self.texts)

# Build datasets and dataloaders for train, val, and test
train_dataset = IndoBERTDataset(texts_train, labels_train)
val_dataset = IndoBERTDataset(texts_val, labels_val)
test_dataset = IndoBERTDataset(texts_test, labels_test)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)
test_loader = DataLoader(test_dataset, batch_size=8)

# Training
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
num_epochs = 10
best_val_loss = float('inf')
patience = 3
counter = 0

# Track training and validation loss per epoch
train_losses = []
val_losses = []

for epoch in range(num_epochs):
    model.train()
    train_loss_epoch = 0
    for batch in train_loader:
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(device)
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        train_loss_epoch += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Average training loss for this epoch
    train_loss_epoch /= len(train_loader)
    train_losses.append(train_loss_epoch)

    # Validation
    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in val_loader:
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            labels = batch['labels'].to(device)
            outputs = model(**inputs, labels=labels)
            val_loss += outputs.loss.item()
            _, predicted = torch.max(outputs.logits, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    val_loss /= len(val_loader)
    val_losses.append(val_loss)
    val_accuracy = 100 * correct / total

    # Precision, recall, and F1-score on the validation set
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average='weighted', zero_division=0
    )
    print(f'Epoch {epoch+1}, Train Loss: {train_loss_epoch:.4f}, '
          f'Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.2f}%')
    print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}')

    # Per-tag classification report for validation
    unique_labels = sorted(set(all_labels))
    filtered_tags = [tags[i] for i in unique_labels]
    print("\nValidation Classification Report:")
    print(classification_report(all_labels, all_preds, labels=unique_labels,
                                target_names=filtered_tags, zero_division=0))

    # Early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        counter = 0
        # Save the best model so far
        model.save_pretrained("indobert_model")
        tokenizer.save_pretrained("indobert_model")
    else:
        counter += 1
        if counter >= patience:
            print("Early stopping triggered")
            break
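# Note (an addition, not part of the original script): when early stopping
# fires, the in-memory weights are those of the last epoch, not necessarily
# the best one. Reloading the checkpoint saved during training makes the test
# evaluation below use the best weights. This assumes at least one epoch ran,
# so the "indobert_model" directory exists on disk (the first epoch always
# saves, since best_val_loss starts at infinity).
model = AutoModelForSequenceClassification.from_pretrained("indobert_model")
model.to(device)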
# Evaluate on the test data from test_dataset.json
model.eval()
test_correct = 0
test_total = 0
test_preds = []
test_labels = []
with torch.no_grad():
    for batch in test_loader:
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(device)
        outputs = model(**inputs, labels=labels)
        _, predicted = torch.max(outputs.logits, 1)
        test_total += labels.size(0)
        test_correct += (predicted == labels).sum().item()
        test_preds.extend(predicted.cpu().numpy())
        test_labels.extend(labels.cpu().numpy())

test_accuracy = 100 * test_correct / test_total
test_precision, test_recall, test_f1, _ = precision_recall_fscore_support(
    test_labels, test_preds, average='weighted', zero_division=0
)

# Classification report for the test data
unique_test_labels = sorted(set(test_labels))
filtered_test_tags = [tags[i] for i in unique_test_labels]
print(f'\nTest Accuracy: {test_accuracy:.2f}%')
print(f'Test Precision: {test_precision:.4f}, Recall: {test_recall:.4f}, '
      f'F1-Score: {test_f1:.4f}')
print("\nTest Classification Report:")
print(classification_report(test_labels, test_preds, labels=unique_test_labels,
                            target_names=filtered_test_tags, zero_division=0))
print("Training finished. Best model saved to indobert_model/")

# Plot the loss curves
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(train_losses) + 1), train_losses, label='Train Loss', marker='o')
plt.plot(range(1, len(val_losses) + 1), val_losses, label='Validation Loss', marker='s')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss Curves')
plt.legend()
plt.grid(True)
plt.savefig('loss_curve.png')
plt.show()
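# A minimal inference sketch (an addition to the original pipeline): reload
# the saved checkpoint and classify a single utterance. The helper name
# predict_intent is illustrative, not an existing API.
inference_tokenizer = AutoTokenizer.from_pretrained("indobert_model")
inference_model = AutoModelForSequenceClassification.from_pretrained("indobert_model").to(device)
inference_model.eval()

def predict_intent(text):
    # Tokenize with the same settings used during training (max_length=20)
    encoding = inference_tokenizer(text, padding='max_length', truncation=True,
                                   max_length=20, return_tensors='pt').to(device)
    with torch.no_grad():
        logits = inference_model(**encoding).logits
    # Map the highest-scoring class index back to its tag string
    return tags[logits.argmax(dim=1).item()]

# Example usage (the prediction depends on the trained model):
# print(predict_intent("halo, apa kabar?"))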