Upload models.py with huggingface_hub

e13fa31 verified 2 months ago

8.7 kB

	"""
	Module định nghĩa các mô hình cho spam review detection
	"""

	import torch
	import torch.nn as nn
	from transformers import AutoModel, AutoConfig, AutoModelForSequenceClassification
	from .custom_models import TextCNN, BiLSTM, RoBERTaGRU, SPhoBERT

	class TransformerForSpamDetection(nn.Module):
	    """
	    Transformer encoder with a linear classification head for spam review detection.

	    The encoder is loaded with ``AutoModel``; the hidden state at sequence
	    position 0 is passed through dropout and a linear layer to produce logits.
	    """

	    # Keyword arguments that are dropped instead of being forwarded to the encoder.
	    _IGNORED_FORWARD_KWARGS = frozenset({"num_items_in_batch", "position_ids"})

	    def __init__(self, model_name: str, num_labels: int, dropout: float = 0.1):
	        """
	        Load the pretrained encoder and create the classification head.

	        Args:
	            model_name: Checkpoint name or path passed to ``from_pretrained``.
	            num_labels: Number of output classes.
	            dropout: Dropout probability applied to the pooled vector.
	                Defaults to 0.1, the value that used to be hard-coded.
	        """
	        super().__init__()
	        config = AutoConfig.from_pretrained(model_name, num_labels=num_labels)
	        self.encoder = AutoModel.from_pretrained(model_name, config=config)
	        self.classifier = nn.Linear(config.hidden_size, num_labels)
	        self.dropout = nn.Dropout(dropout)

	    def forward(self, input_ids, attention_mask, labels=None, **kwargs):
	        """
	        Encode a batch and classify it.

	        Args:
	            input_ids: Token ids forwarded to the encoder.
	            attention_mask: Attention mask forwarded to the encoder.
	            labels: Optional targets; when given, a cross-entropy loss is computed.
	            **kwargs: Extra encoder arguments (e.g. ``token_type_ids``). Keys in
	                ``_IGNORED_FORWARD_KWARGS`` are discarded, the rest is forwarded.

	        Returns:
	            Dict with ``"loss"`` (``None`` when ``labels`` is ``None``) and ``"logits"``.
	        """
	        encoder_kwargs = {
	            key: value
	            for key, value in kwargs.items()
	            if key not in self._IGNORED_FORWARD_KWARGS
	        }

	        out = self.encoder(
	            input_ids=input_ids,
	            attention_mask=attention_mask,
	            **encoder_kwargs,
	        )
	        # Pool with the hidden state at position 0 (the CLS-style token for
	        # BERT-like tokenizers).
	        pooled = self.dropout(out.last_hidden_state[:, 0])
	        logits = self.classifier(pooled)

	        loss = None
	        if labels is not None:
	            loss = nn.CrossEntropyLoss()(logits, labels)
	        return {"loss": loss, "logits": logits}

	class ViT5ForSpamDetection(nn.Module):
	    """
	    ViT5 classifier for spam review detection using only the T5 encoder.

	    The hidden state at sequence position 0 of the encoder output is passed
	    through dropout and a linear layer to produce logits.
	    """

	    # Keyword arguments that are dropped instead of being forwarded to the
	    # T5 encoder. ``token_type_ids`` is included because the T5 encoder's
	    # forward() has no such parameter, so a tokenizer that emits it would
	    # otherwise trigger a TypeError.
	    _IGNORED_FORWARD_KWARGS = frozenset(
	        {"num_items_in_batch", "position_ids", "token_type_ids"}
	    )

	    def __init__(self, model_name: str, num_labels: int, dropout: float = 0.1):
	        """
	        Load the pretrained T5 encoder and create the classification head.

	        Args:
	            model_name: Checkpoint name or path passed to ``from_pretrained``.
	            num_labels: Number of output classes.
	            dropout: Dropout probability applied to the pooled vector.
	                Defaults to 0.1, the value that used to be hard-coded.
	        """
	        super().__init__()
	        from transformers import T5EncoderModel, T5Config

	        # Load the encoder half of T5 only.
	        config = T5Config.from_pretrained(model_name)
	        self.t5_encoder = T5EncoderModel.from_pretrained(model_name, config=config)

	        # Classification head.
	        self.classifier = nn.Linear(config.d_model, num_labels)
	        self.dropout = nn.Dropout(dropout)

	    def forward(self, input_ids, attention_mask, labels=None, **kwargs):
	        """
	        Encode a batch with the T5 encoder and classify it.

	        Args:
	            input_ids: Token ids forwarded to the encoder.
	            attention_mask: Attention mask forwarded to the encoder.
	            labels: Optional targets; when given, a cross-entropy loss is computed.
	            **kwargs: Extra encoder arguments. Keys in
	                ``_IGNORED_FORWARD_KWARGS`` are discarded, the rest is forwarded.

	        Returns:
	            Dict with ``"loss"`` (``None`` when ``labels`` is ``None``) and ``"logits"``.
	        """
	        encoder_kwargs = {
	            key: value
	            for key, value in kwargs.items()
	            if key not in self._IGNORED_FORWARD_KWARGS
	        }

	        encoder_outputs = self.t5_encoder(
	            input_ids=input_ids,
	            attention_mask=attention_mask,
	            **encoder_kwargs,
	        )

	        # Pool with the hidden state of the first token.
	        pooled = self.dropout(encoder_outputs.last_hidden_state[:, 0])
	        logits = self.classifier(pooled)

	        loss = None
	        if labels is not None:
	            loss = nn.CrossEntropyLoss()(logits, labels)

	        return {"loss": loss, "logits": logits}

	def get_model(model_name: str, num_labels: int, vocab_size: int = None):
	    """
	    Factory function that builds a model from its registry name.

	    Args:
	        model_name: Registry key, e.g. "phobert-v2", "vit5", "textcnn", "bilstm".
	        num_labels: Number of output classes.
	        vocab_size: Currently unused; accepted only so existing callers keep working.

	    Returns:
	        Model instance.

	    Raises:
	        ValueError: If ``model_name`` is not a known registry key.
	    """
	    import warnings

	    # Registry key -> pretrained checkpoint for the plain transformer models.
	    model_mapping = {
	        "phobert-v1": "vinai/phobert-base",
	        "phobert-v2": "vinai/phobert-base-v2",
	        "bartpho": "vinai/bartpho-syllable",
	        "visobert": "uitnlp/visobert",
	        "xlm-r": "xlm-roberta-large",
	        "mbert": "bert-base-multilingual-cased",
	        "vit5": "VietAI/vit5-base",
	    }
	    # Every custom architecture is constructed with the PhoBERT v2 checkpoint.
	    custom_base_model_name = "vinai/phobert-base-v2"
	    custom_names = ("textcnn", "bilstm", "roberta-gru", "sphobert", "bilstm-crf")

	    if model_name in model_mapping:
	        base_model_name = model_mapping[model_name]
	        if model_name == "vit5":
	            # T5 checkpoints need the encoder-only wrapper.
	            return ViT5ForSpamDetection(base_model_name, num_labels)
	        return TransformerForSpamDetection(base_model_name, num_labels)

	    # Validate before touching any custom class so that an unknown name always
	    # yields the ValueError below.
	    if model_name not in custom_names:
	        raise ValueError(f"Unknown model name: {model_name}. Available models: {list(model_mapping) + list(custom_names)}")

	    if model_name == "bilstm-crf":
	        # Placeholder: no CRF layer is implemented, a plain BiLSTM is built instead.
	        warnings.warn(
	            "'bilstm-crf' is a placeholder without a CRF layer; "
	            "a plain BiLSTM model is returned.",
	            UserWarning,
	            stacklevel=2,
	        )

	    custom_classes = {
	        "textcnn": TextCNN,
	        "bilstm": BiLSTM,
	        "roberta-gru": RoBERTaGRU,
	        "sphobert": SPhoBERT,
	        "bilstm-crf": BiLSTM,
	    }
	    return custom_classes[model_name](custom_base_model_name, num_labels)

	def get_model_config(model_name: str):
	    """
	    Return the configuration for a model.

	    Args:
	        model_name: Registry key of the model (e.g. "phobert-v2", "textcnn").

	    Returns:
	        Dict with "model_name" (pretrained checkpoint), "description",
	        "max_length" and "learning_rate". Entries for custom architectures
	        additionally carry "custom_model": True; the other entries have no
	        "custom_model" key at all.

	    Raises:
	        ValueError: If ``model_name`` is not a known registry key.
	    """
	    # All entries share the same maximum sequence length.
	    max_length = 256

	    # (registry key, checkpoint, description, learning rate, custom architecture?)
	    rows = (
	        ("phobert-v1", "vinai/phobert-base", "PhoBERT v1 - Pre-trained BERT for Vietnamese", 5e-5, False),
	        ("phobert-v2", "vinai/phobert-base-v2", "PhoBERT v2 - Improved PhoBERT for Vietnamese", 5e-5, False),
	        ("bartpho", "vinai/bartpho-syllable", "BART Pho - Vietnamese BART model", 5e-5, False),
	        ("visobert", "uitnlp/visobert", "ViSoBERT - Vietnamese Social BERT", 5e-5, False),
	        ("xlm-r", "xlm-roberta-large", "XLM-RoBERTa Large - Multilingual model", 3e-5, False),
	        ("mbert", "bert-base-multilingual-cased", "mBERT - Multilingual BERT model", 5e-5, False),
	        ("vit5", "VietAI/vit5-base", "ViT5 - Vietnamese T5", 5e-5, False),
	        ("textcnn", "vinai/phobert-base-v2", "TextCNN - Convolutional Neural Network for text", 1e-3, True),
	        ("bilstm", "vinai/phobert-base-v2", "BiLSTM - Bidirectional LSTM for text classification", 1e-3, True),
	        ("roberta-gru", "vinai/phobert-base-v2", "RoBERTa-GRU - Hybrid RoBERTa + GRU model", 5e-5, True),
	        ("sphobert", "vinai/phobert-base-v2", "SPhoBERT - PhoBERT + SentenceBERT embedding fusion", 5e-5, True),
	        ("bilstm-crf", "vinai/phobert-base-v2", "BiLSTM-CRF - Bidirectional LSTM with CRF", 1e-3, True),
	    )

	    configs = {}
	    for key, checkpoint, description, learning_rate, is_custom in rows:
	        entry = {
	            "model_name": checkpoint,
	            "description": description,
	            "max_length": max_length,
	            "learning_rate": learning_rate,
	        }
	        if is_custom:
	            # The flag is only present on custom architectures.
	            entry["custom_model"] = True
	        configs[key] = entry

	    if model_name not in configs:
	        raise ValueError(f"Model {model_name} not found. Available models: {list(configs.keys())}")

	    return configs[model_name]