from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import json
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, classification_report
import numpy as np
import matplotlib.pyplot as plt
# Load the training dataset from datasets_new.json
with open('datasets_new.json', 'r') as f:
    datasets = json.load(f)
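# The fields read below imply a structure along these lines for
# datasets_new.json (tags and patterns here are illustrative placeholders,
# not the actual contents of the file):
#
# {
#   "intents": [
#     {"tag": "greeting", "patterns": ["halo", "selamat pagi"]},
#     {"tag": "goodbye",  "patterns": ["sampai jumpa"]}
#   ]
# }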
texts = []
labels = []
tags = sorted(set(dataset['tag'] for dataset in datasets['intents']))
for dataset in datasets['intents']:
    for pattern in dataset['patterns']:
        texts.append(pattern)
        labels.append(tags.index(dataset['tag']))
# Log the class distribution before splitting
label_counts = np.bincount(labels)
print("Class distribution before splitting:")
for tag, count in zip(tags, label_counts):
    print(f"Tag: {tag}, samples: {count}")
# Split the data into train and validation sets
texts_train, texts_val, labels_train, labels_val = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels
)
# Log the class distribution after splitting
val_label_counts = np.bincount(labels_val, minlength=len(tags))
print("\nClass distribution in the validation data:")
for tag, count in zip(tags, val_label_counts):
    print(f"Tag: {tag}, samples: {count}")
# Load the test dataset from test_dataset.json
with open('test_dataset.json', 'r') as f:
    test_data = json.load(f)
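# test_dataset.json is assumed to be a flat list of examples with the two
# fields read below (values shown are illustrative placeholders):
#
# [
#   {"text": "selamat pagi", "true_intent": "greeting"},
#   {"text": "sampai jumpa", "true_intent": "goodbye"}
# ]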
# Extract the texts and their ground-truth intents
texts_test = [item['text'] for item in test_data]
true_intents = [item['true_intent'] for item in test_data]
# Warn about any intent that does not appear in the training tags
missing_intents = set(true_intents) - set(tags)
if missing_intents:
    print(f"Warning: intents {missing_intents} were not found in tags. Make sure datasets_new.json covers every intent.")
# Convert intents to label indices, dropping examples with unknown intents
# so that texts_test and labels_test stay aligned
pairs = [(text, tags.index(intent)) for text, intent in zip(texts_test, true_intents) if intent in tags]
texts_test = [text for text, _ in pairs]
labels_test = [label for _, label in pairs]
# Initialize the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")
model = AutoModelForSequenceClassification.from_pretrained("indobenchmark/indobert-base-p1", num_labels=len(tags))
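# Note: indobert-base-p1 ships without a sequence-classification head, so
# Transformers attaches a freshly initialized classifier layer here (expect a
# warning about newly initialized weights); that head is trained from scratch below.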
class IndoBERTDataset(Dataset):
    """Tokenizes each text on the fly, padded/truncated to 20 tokens."""

    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = tokenizer(self.texts[idx], padding='max_length', truncation=True, max_length=20, return_tensors='pt')
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }
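# Quick sanity check (optional): each item is a dict of fixed-length tensors,
# since tokenization pads/truncates to max_length=20, e.g.
#   sample = IndoBERTDataset(["halo"], [0])[0]
#   sample['input_ids'].shape       # torch.Size([20])
#   sample['attention_mask'].shape  # torch.Size([20])
#   sample['labels']                # tensor(0)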
# Build datasets and dataloaders for train, val, and test
train_dataset = IndoBERTDataset(texts_train, labels_train)
val_dataset = IndoBERTDataset(texts_val, labels_val)
test_dataset = IndoBERTDataset(texts_test, labels_test)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)
test_loader = DataLoader(test_dataset, batch_size=8)
# Training setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
num_epochs = 10
best_val_loss = float('inf')
patience = 3
counter = 0
# Track training and validation loss per epoch
train_losses = []
val_losses = []
for epoch in range(num_epochs):
    model.train()
    train_loss_epoch = 0
    for batch in train_loader:
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(device)
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        train_loss_epoch += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # Average the training loss over the epoch
    train_loss_epoch /= len(train_loader)
    train_losses.append(train_loss_epoch)
    # Validation
    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in val_loader:
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            labels = batch['labels'].to(device)
            outputs = model(**inputs, labels=labels)
            val_loss += outputs.loss.item()
            _, predicted = torch.max(outputs.logits, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    val_loss /= len(val_loader)
    val_losses.append(val_loss)
    val_accuracy = 100 * correct / total
    # Weighted precision, recall, and F1-score on the validation set
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average='weighted', zero_division=0
    )
    print(f'Epoch {epoch+1}, Train Loss: {train_loss_epoch:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.2f}%')
    print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}')
    # Per-tag classification report for the validation set
    unique_labels = sorted(set(all_labels))
    filtered_tags = [tags[i] for i in unique_labels]
    print("\nValidation Classification Report:")
    print(classification_report(all_labels, all_preds, labels=unique_labels, target_names=filtered_tags, zero_division=0))
    # Early stopping on validation loss
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        counter = 0
        # Save the best model so far
        model.save_pretrained("indobert_model")
        tokenizer.save_pretrained("indobert_model")
    else:
        counter += 1
        if counter >= patience:
            print("Early stopping triggered")
            break
# Evaluate on the test data from test_dataset.json, reloading the best
# checkpoint saved during training (after early stopping, the in-memory
# weights may come from a later, worse epoch)
model = AutoModelForSequenceClassification.from_pretrained("indobert_model").to(device)
model.eval()
test_correct = 0
test_total = 0
test_preds = []
test_labels = []
with torch.no_grad():
    for batch in test_loader:
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(device)
        outputs = model(**inputs)
        _, predicted = torch.max(outputs.logits, 1)
        test_total += labels.size(0)
        test_correct += (predicted == labels).sum().item()
        test_preds.extend(predicted.cpu().numpy())
        test_labels.extend(labels.cpu().numpy())
test_accuracy = 100 * test_correct / test_total
test_precision, test_recall, test_f1, _ = precision_recall_fscore_support(
    test_labels, test_preds, average='weighted', zero_division=0
)
# Classification report for the test set
unique_test_labels = sorted(set(test_labels))
filtered_test_tags = [tags[i] for i in unique_test_labels]
print(f'\nTest Accuracy: {test_accuracy:.2f}%')
print(f'Test Precision: {test_precision:.4f}, Recall: {test_recall:.4f}, F1-Score: {test_f1:.4f}')
print("\nTest Classification Report:")
print(classification_report(test_labels, test_preds, labels=unique_test_labels, target_names=filtered_test_tags, zero_division=0))
print("Training Selesai. Best model disimpan di indobert_model/") | |
# Plot the loss curves
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(train_losses) + 1), train_losses, label='Train Loss', marker='o')
plt.plot(range(1, len(val_losses) + 1), val_losses, label='Validation Loss', marker='s')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss Curves')
plt.legend()
plt.grid(True)
plt.savefig('loss_curve.png')
plt.show()