|
|
""" |
|
|
SentilensAI - Machine Learning Training Pipeline |
|
|
|
|
|
This module provides comprehensive machine learning capabilities for training |
|
|
custom sentiment analysis models specifically optimized for AI chatbot conversations. |
|
|
|
|
|
Features: |
|
|
- Multiple ML algorithms (Random Forest, SVM, Neural Networks, XGBoost, etc.) |
|
|
- Advanced feature engineering for chatbot text |
|
|
- Cross-validation and hyperparameter tuning |
|
|
- Model comparison and evaluation |
|
|
- Production-ready model persistence |
|
|
- Real-time prediction capabilities |
|
|
|
|
|
Author: Pravin Selvamuthu |
|
|
Repository: https://github.com/kernelseed/sentilens-ai |
|
|
""" |
|
|
|
|
|
import os |
|
|
import json |
|
|
import pickle |
|
|
import logging |
|
|
from typing import Dict, List, Tuple, Optional, Any, Union |
|
|
from datetime import datetime |
|
|
from pathlib import Path |
|
|
import warnings |
|
|
warnings.filterwarnings('ignore') |
|
|
|
|
|
import pandas as pd |
|
|
import numpy as np |
|
|
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold |
|
|
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer |
|
|
from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler |
|
|
from sklearn.metrics import ( |
|
|
classification_report, confusion_matrix, accuracy_score, |
|
|
precision_score, recall_score, f1_score, roc_auc_score, |
|
|
balanced_accuracy_score, matthews_corrcoef, cohen_kappa_score |
|
|
) |
|
|
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, AdaBoostClassifier |
|
|
from sklearn.svm import SVC |
|
|
from sklearn.neural_network import MLPClassifier |
|
|
from sklearn.linear_model import LogisticRegression |
|
|
from sklearn.tree import DecisionTreeClassifier |
|
|
from sklearn.naive_bayes import GaussianNB, MultinomialNB |
|
|
from sklearn.pipeline import Pipeline |
|
|
from sklearn.calibration import CalibratedClassifierCV |
|
|
import joblib |
|
|
|
|
|
|
|
|
try: |
|
|
import xgboost as xgb |
|
|
XGBOOST_AVAILABLE = True |
|
|
except ImportError: |
|
|
XGBOOST_AVAILABLE = False |
|
|
|
|
|
try: |
|
|
import lightgbm as lgb |
|
|
LIGHTGBM_AVAILABLE = True |
|
|
except ImportError: |
|
|
LIGHTGBM_AVAILABLE = False |
|
|
|
|
|
try: |
|
|
import catboost as cb |
|
|
CATBOOST_AVAILABLE = True |
|
|
except ImportError: |
|
|
CATBOOST_AVAILABLE = False |
|
|
|
|
|
|
|
|
try: |
|
|
import matplotlib.pyplot as plt |
|
|
import seaborn as sns |
|
|
PLOTTING_AVAILABLE = True |
|
|
except ImportError: |
|
|
PLOTTING_AVAILABLE = False |
|
|
|
|
|
|
|
|
from langchain.schema import BaseMessage |
|
|
from langchain.prompts import PromptTemplate |
|
|
from langchain.chains import LLMChain |
|
|
from langchain.llms import OpenAI |
|
|
|
|
|
|
|
|
from sentiment_analyzer import SentilensAIAnalyzer, SentimentResult |
|
|
|
|
|
|
|
|
logging.basicConfig(level=logging.INFO) |
|
|
logger = logging.getLogger(__name__) |
|
|
|
|
|
|
|
|
class SentilensAITrainer: |
|
|
""" |
|
|
Advanced machine learning trainer for sentiment analysis models |
|
|
specifically designed for AI chatbot conversations |
|
|
""" |
|
|
|
|
|
def __init__(self, model_cache_dir: str = "./model_cache"): |
|
|
""" |
|
|
Initialize the SentimentsAI trainer |
|
|
|
|
|
Args: |
|
|
model_cache_dir: Directory to cache trained models |
|
|
""" |
|
|
self.model_cache_dir = Path(model_cache_dir) |
|
|
self.model_cache_dir.mkdir(exist_ok=True) |
|
|
|
|
|
|
|
|
self.analyzer = SentilensAIAnalyzer() |
|
|
self.label_encoder = LabelEncoder() |
|
|
self.scaler = RobustScaler() |
|
|
self.vectorizer = None |
|
|
self.models = {} |
|
|
self.training_data = None |
|
|
self.feature_names = None |
|
|
|
|
|
|
|
|
self._initialize_models() |
|
|
|
|
|
|
|
|
self.feature_params = { |
|
|
'max_features': 10000, |
|
|
'ngram_range': (1, 3), |
|
|
'min_df': 2, |
|
|
'max_df': 0.95, |
|
|
'stop_words': 'english' |
|
|
} |
|
|
|
|
|
    def _initialize_models(self):
        """Initialize available machine learning models.

        Replaces ``self.models`` with the default scikit-learn roster, then
        registers the optional boosted-tree backends (XGBoost, LightGBM,
        CatBoost) when those packages imported successfully at module load
        time. Fixed seeds (``random_state=42`` / ``random_seed=42``) keep
        runs reproducible.
        """
        self.models = {
            'random_forest': RandomForestClassifier(
                n_estimators=100,
                max_depth=10,
                random_state=42,
                n_jobs=-1  # use all CPU cores
            ),
            'extra_trees': ExtraTreesClassifier(
                n_estimators=100,
                max_depth=10,
                random_state=42,
                n_jobs=-1
            ),
            'gradient_boosting': GradientBoostingClassifier(
                n_estimators=100,
                learning_rate=0.1,
                max_depth=6,
                random_state=42
            ),
            'svm': SVC(
                kernel='rbf',
                C=1.0,
                gamma='scale',
                random_state=42,
                probability=True  # required so predict_proba is available downstream
            ),
            'neural_network': MLPClassifier(
                hidden_layer_sizes=(100, 50),
                activation='relu',
                solver='adam',
                alpha=0.001,
                learning_rate='adaptive',
                max_iter=500,
                random_state=42
            ),
            'logistic_regression': LogisticRegression(
                random_state=42,
                max_iter=1000,
                n_jobs=-1
            ),
            'decision_tree': DecisionTreeClassifier(
                max_depth=10,
                random_state=42
            ),
            # NOTE(review): MultinomialNB requires non-negative inputs, but
            # train_model scales features with RobustScaler, which can emit
            # negative values — this entry may fail to fit. train_all_models
            # catches the failure per-model; confirm whether this is intended.
            'naive_bayes': MultinomialNB(alpha=1.0),
            'ada_boost': AdaBoostClassifier(
                n_estimators=50,
                learning_rate=1.0,
                random_state=42
            )
        }

        # Optional gradient-boosting backends, guarded by the import flags
        # set in the module-level try/except blocks.
        if XGBOOST_AVAILABLE:
            self.models['xgboost'] = xgb.XGBClassifier(
                n_estimators=100,
                max_depth=6,
                learning_rate=0.1,
                random_state=42,
                n_jobs=-1
            )

        if LIGHTGBM_AVAILABLE:
            self.models['lightgbm'] = lgb.LGBMClassifier(
                n_estimators=100,
                max_depth=6,
                learning_rate=0.1,
                random_state=42,
                n_jobs=-1,
                verbose=-1  # silence per-iteration logging
            )

        if CATBOOST_AVAILABLE:
            self.models['catboost'] = cb.CatBoostClassifier(
                iterations=100,
                depth=6,
                learning_rate=0.1,
                random_seed=42,
                verbose=False
            )
|
|
|
|
|
def create_synthetic_training_data(self, num_samples: int = 1000) -> pd.DataFrame: |
|
|
""" |
|
|
Create synthetic training data for sentiment analysis |
|
|
|
|
|
Args: |
|
|
num_samples: Number of samples to generate |
|
|
|
|
|
Returns: |
|
|
DataFrame with text and sentiment labels |
|
|
""" |
|
|
logger.info(f"Creating {num_samples} synthetic training samples...") |
|
|
|
|
|
|
|
|
sentiment_data = { |
|
|
'positive': [ |
|
|
"I love this chatbot! It's amazing and so helpful.", |
|
|
"This is exactly what I needed. Thank you so much!", |
|
|
"Great service! The bot understood me perfectly.", |
|
|
"Excellent! This chatbot is fantastic and very user-friendly.", |
|
|
"Perfect! I'm so happy with this experience.", |
|
|
"Wonderful! The bot provided exactly the right information.", |
|
|
"Outstanding service! I'm impressed with the quality.", |
|
|
"Brilliant! This is the best chatbot I've ever used.", |
|
|
"Fantastic! The response was quick and accurate.", |
|
|
"Superb! I'm delighted with the help I received." |
|
|
], |
|
|
'negative': [ |
|
|
"This chatbot is terrible. It doesn't understand anything.", |
|
|
"Worst experience ever. The bot is completely useless.", |
|
|
"This is awful. I'm frustrated and disappointed.", |
|
|
"Horrible service! The bot keeps giving wrong answers.", |
|
|
"Disgusting! This chatbot is a complete waste of time.", |
|
|
"Terrible! I hate this bot and its responses.", |
|
|
"Awful experience. The bot is stupid and unhelpful.", |
|
|
"Disappointing! This chatbot is broken and useless.", |
|
|
"Frustrating! The bot doesn't know what it's doing.", |
|
|
"Pathetic! This is the worst chatbot I've ever seen." |
|
|
], |
|
|
'neutral': [ |
|
|
"Can you help me with my account information?", |
|
|
"I need to check my order status.", |
|
|
"What are your business hours?", |
|
|
"How do I reset my password?", |
|
|
"I want to update my profile details.", |
|
|
"Can you provide more information about this product?", |
|
|
"I need assistance with my subscription.", |
|
|
"What is your return policy?", |
|
|
"How can I contact customer support?", |
|
|
"I have a question about my recent purchase." |
|
|
] |
|
|
} |
|
|
|
|
|
|
|
|
data = [] |
|
|
samples_per_sentiment = num_samples // 3 |
|
|
|
|
|
for sentiment, texts in sentiment_data.items(): |
|
|
for i in range(samples_per_sentiment): |
|
|
|
|
|
base_text = np.random.choice(texts) |
|
|
|
|
|
|
|
|
variations = [ |
|
|
base_text, |
|
|
base_text + " Please help me.", |
|
|
"Hi, " + base_text.lower(), |
|
|
base_text + " Thanks!", |
|
|
"Hello, " + base_text.lower(), |
|
|
base_text + " I appreciate it.", |
|
|
"Hey, " + base_text.lower(), |
|
|
base_text + " Could you assist?", |
|
|
"Good morning, " + base_text.lower(), |
|
|
base_text + " That would be great." |
|
|
] |
|
|
|
|
|
text = np.random.choice(variations) |
|
|
data.append({ |
|
|
'text': text, |
|
|
'sentiment': sentiment, |
|
|
'confidence': np.random.uniform(0.6, 1.0), |
|
|
'polarity': np.random.uniform(-1, 1) if sentiment == 'neutral' else (1 if sentiment == 'positive' else -1), |
|
|
'subjectivity': np.random.uniform(0.3, 0.8), |
|
|
'message_type': 'user' if i % 2 == 0 else 'bot', |
|
|
'conversation_id': f'conv_{i//2}', |
|
|
'timestamp': datetime.now() |
|
|
}) |
|
|
|
|
|
|
|
|
mixed_examples = [ |
|
|
("I'm not sure if this is good or bad.", "neutral"), |
|
|
("It's okay, I guess.", "neutral"), |
|
|
("This is fine, nothing special.", "neutral"), |
|
|
("I have mixed feelings about this.", "neutral"), |
|
|
("It's decent but could be better.", "neutral") |
|
|
] |
|
|
|
|
|
for text, sentiment in mixed_examples: |
|
|
data.append({ |
|
|
'text': text, |
|
|
'sentiment': sentiment, |
|
|
'confidence': np.random.uniform(0.4, 0.7), |
|
|
'polarity': np.random.uniform(-0.3, 0.3), |
|
|
'subjectivity': np.random.uniform(0.5, 0.9), |
|
|
'message_type': 'user', |
|
|
'conversation_id': f'mixed_{len(data)}', |
|
|
'timestamp': datetime.now() |
|
|
}) |
|
|
|
|
|
df = pd.DataFrame(data) |
|
|
logger.info(f"Created {len(df)} training samples") |
|
|
return df |
|
|
|
|
|
def extract_features(self, texts: List[str]) -> np.ndarray: |
|
|
""" |
|
|
Extract comprehensive features from text data |
|
|
|
|
|
Args: |
|
|
texts: List of text strings |
|
|
|
|
|
Returns: |
|
|
Feature matrix |
|
|
""" |
|
|
logger.info("Extracting features from text data...") |
|
|
|
|
|
|
|
|
if self.vectorizer is None: |
|
|
self.vectorizer = TfidfVectorizer( |
|
|
max_features=self.feature_params['max_features'], |
|
|
ngram_range=self.feature_params['ngram_range'], |
|
|
min_df=self.feature_params['min_df'], |
|
|
max_df=self.feature_params['max_df'], |
|
|
stop_words=self.feature_params['stop_words'] |
|
|
) |
|
|
|
|
|
|
|
|
tfidf_features = self.vectorizer.fit_transform(texts).toarray() |
|
|
|
|
|
|
|
|
text_features = [] |
|
|
for text in texts: |
|
|
features = [] |
|
|
|
|
|
|
|
|
features.append(len(text)) |
|
|
features.append(len(text.split())) |
|
|
features.append(len([c for c in text if c.isupper()])) |
|
|
features.append(len([c for c in text if c.isdigit()])) |
|
|
features.append(len([c for c in text if c in '!?'])) |
|
|
|
|
|
|
|
|
try: |
|
|
sentiment_result = self.analyzer.analyze_sentiment(text, method='ensemble') |
|
|
features.extend([ |
|
|
sentiment_result.polarity, |
|
|
sentiment_result.confidence, |
|
|
sentiment_result.subjectivity |
|
|
]) |
|
|
|
|
|
|
|
|
for emotion in ['joy', 'sadness', 'anger', 'fear', 'surprise', 'disgust']: |
|
|
features.append(sentiment_result.emotions.get(emotion, 0.0)) |
|
|
except: |
|
|
features.extend([0.0] * 9) |
|
|
|
|
|
|
|
|
words = text.split() |
|
|
if words: |
|
|
avg_word_length = np.mean([len(word) for word in words]) |
|
|
features.append(avg_word_length) |
|
|
else: |
|
|
features.append(0.0) |
|
|
|
|
|
text_features.append(features) |
|
|
|
|
|
text_features = np.array(text_features) |
|
|
|
|
|
|
|
|
all_features = np.hstack([tfidf_features, text_features]) |
|
|
|
|
|
logger.info(f"Extracted {all_features.shape[1]} features from {len(texts)} texts") |
|
|
return all_features |
|
|
|
|
|
def train_model(self, model_name: str, X: np.ndarray, y: np.ndarray, |
|
|
optimize_hyperparameters: bool = True) -> Dict[str, Any]: |
|
|
""" |
|
|
Train a specific model |
|
|
|
|
|
Args: |
|
|
model_name: Name of the model to train |
|
|
X: Feature matrix |
|
|
y: Target labels |
|
|
optimize_hyperparameters: Whether to optimize hyperparameters |
|
|
|
|
|
Returns: |
|
|
Training results dictionary |
|
|
""" |
|
|
if model_name not in self.models: |
|
|
raise ValueError(f"Unknown model: {model_name}") |
|
|
|
|
|
logger.info(f"Training {model_name} model...") |
|
|
|
|
|
|
|
|
X_train, X_test, y_train, y_test = train_test_split( |
|
|
X, y, test_size=0.2, random_state=42, stratify=y |
|
|
) |
|
|
|
|
|
|
|
|
X_train_scaled = self.scaler.fit_transform(X_train) |
|
|
X_test_scaled = self.scaler.transform(X_test) |
|
|
|
|
|
|
|
|
model = self.models[model_name] |
|
|
|
|
|
|
|
|
if optimize_hyperparameters: |
|
|
model = self._optimize_hyperparameters(model, model_name, X_train_scaled, y_train) |
|
|
|
|
|
|
|
|
start_time = datetime.now() |
|
|
model.fit(X_train_scaled, y_train) |
|
|
training_time = (datetime.now() - start_time).total_seconds() |
|
|
|
|
|
|
|
|
y_pred = model.predict(X_test_scaled) |
|
|
y_pred_proba = model.predict_proba(X_test_scaled) if hasattr(model, 'predict_proba') else None |
|
|
|
|
|
|
|
|
results = self._evaluate_model(y_test, y_pred, y_pred_proba, model.classes_) |
|
|
results.update({ |
|
|
'model_name': model_name, |
|
|
'training_time': training_time, |
|
|
'model': model, |
|
|
'feature_importance': self._get_feature_importance(model, model_name) |
|
|
}) |
|
|
|
|
|
|
|
|
self.models[model_name] = model |
|
|
|
|
|
logger.info(f"Training completed for {model_name}") |
|
|
return results |
|
|
|
|
|
    def _optimize_hyperparameters(self, model, model_name: str, X: np.ndarray, y: np.ndarray):
        """Optimize hyperparameters using GridSearchCV.

        Looks up a predefined parameter grid for ``model_name`` and runs a
        3-fold grid search scored by macro F1. Models without a grid (and
        the optional backends when their package is unavailable) are
        returned unchanged and unfitted.

        Args:
            model: The estimator to tune.
            model_name: Key selecting the parameter grid.
            X: Training feature matrix.
            y: Training labels.

        Returns:
            The best estimator found by the grid search, or the original
            ``model`` when no grid applies.
        """
        # Grids for the always-available scikit-learn estimators.
        param_grids = {
            'random_forest': {
                'n_estimators': [50, 100, 200],
                'max_depth': [5, 10, 15, None],
                'min_samples_split': [2, 5, 10]
            },
            'extra_trees': {
                'n_estimators': [50, 100, 200],
                'max_depth': [5, 10, 15, None],
                'min_samples_split': [2, 5, 10]
            },
            'gradient_boosting': {
                'n_estimators': [50, 100, 200],
                'learning_rate': [0.01, 0.1, 0.2],
                'max_depth': [3, 6, 10]
            },
            'svm': {
                'C': [0.1, 1, 10, 100],
                'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
                'kernel': ['rbf', 'linear']
            },
            'neural_network': {
                'hidden_layer_sizes': [(50,), (100,), (100, 50), (200, 100)],
                'alpha': [0.0001, 0.001, 0.01],
                'learning_rate': ['constant', 'adaptive']
            },
            'logistic_regression': {
                'C': [0.1, 1, 10, 100],
                'penalty': ['l1', 'l2'],
                'solver': ['liblinear', 'saga']
            },
            'decision_tree': {
                'max_depth': [5, 10, 15, None],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4]
            },
            'naive_bayes': {
                'alpha': [0.1, 0.5, 1.0, 2.0]
            },
            'ada_boost': {
                'n_estimators': [25, 50, 100],
                'learning_rate': [0.5, 1.0, 1.5]
            }
        }

        # Optional backends: only add the grid when the package imported
        # and the caller actually asked for that model.
        if XGBOOST_AVAILABLE and model_name == 'xgboost':
            param_grids['xgboost'] = {
                'n_estimators': [50, 100, 200],
                'max_depth': [3, 6, 10],
                'learning_rate': [0.01, 0.1, 0.2]
            }

        if LIGHTGBM_AVAILABLE and model_name == 'lightgbm':
            param_grids['lightgbm'] = {
                'n_estimators': [50, 100, 200],
                'max_depth': [3, 6, 10],
                'learning_rate': [0.01, 0.1, 0.2]
            }

        if CATBOOST_AVAILABLE and model_name == 'catboost':
            param_grids['catboost'] = {
                'iterations': [50, 100, 200],
                'depth': [3, 6, 10],
                'learning_rate': [0.01, 0.1, 0.2]
            }

        if model_name in param_grids:
            logger.info(f"Optimizing hyperparameters for {model_name}...")
            grid_search = GridSearchCV(
                model, param_grids[model_name],
                cv=3, scoring='f1_macro', n_jobs=-1, verbose=0
            )
            grid_search.fit(X, y)
            return grid_search.best_estimator_

        # No grid registered for this model: return it untouched.
        return model
|
|
|
|
|
def _evaluate_model(self, y_true, y_pred, y_pred_proba, classes) -> Dict[str, Any]: |
|
|
"""Comprehensive model evaluation""" |
|
|
results = { |
|
|
'accuracy': accuracy_score(y_true, y_pred), |
|
|
'balanced_accuracy': balanced_accuracy_score(y_true, y_pred), |
|
|
'precision_macro': precision_score(y_true, y_pred, average='macro', zero_division=0), |
|
|
'precision_micro': precision_score(y_true, y_pred, average='micro', zero_division=0), |
|
|
'precision_weighted': precision_score(y_true, y_pred, average='weighted', zero_division=0), |
|
|
'recall_macro': recall_score(y_true, y_pred, average='macro', zero_division=0), |
|
|
'recall_micro': recall_score(y_true, y_pred, average='micro', zero_division=0), |
|
|
'recall_weighted': recall_score(y_true, y_pred, average='weighted', zero_division=0), |
|
|
'f1_macro': f1_score(y_true, y_pred, average='macro', zero_division=0), |
|
|
'f1_micro': f1_score(y_true, y_pred, average='micro', zero_division=0), |
|
|
'f1_weighted': f1_score(y_true, y_pred, average='weighted', zero_division=0), |
|
|
'matthews_corrcoef': matthews_corrcoef(y_true, y_pred), |
|
|
'cohen_kappa': cohen_kappa_score(y_true, y_pred), |
|
|
'classification_report': classification_report(y_true, y_pred, output_dict=True), |
|
|
'confusion_matrix': confusion_matrix(y_true, y_pred).tolist() |
|
|
} |
|
|
|
|
|
|
|
|
if y_pred_proba is not None and len(classes) > 2: |
|
|
try: |
|
|
results['roc_auc'] = roc_auc_score(y_true, y_pred_proba, multi_class='ovr', average='macro') |
|
|
except: |
|
|
results['roc_auc'] = 0.0 |
|
|
elif y_pred_proba is not None and len(classes) == 2: |
|
|
try: |
|
|
results['roc_auc'] = roc_auc_score(y_true, y_pred_proba[:, 1]) |
|
|
except: |
|
|
results['roc_auc'] = 0.0 |
|
|
else: |
|
|
results['roc_auc'] = 0.0 |
|
|
|
|
|
return results |
|
|
|
|
|
def _get_feature_importance(self, model, model_name: str) -> Optional[Dict[str, float]]: |
|
|
"""Get feature importance if available""" |
|
|
try: |
|
|
if hasattr(model, 'feature_importances_'): |
|
|
importance = model.feature_importances_ |
|
|
if self.feature_names is not None: |
|
|
return dict(zip(self.feature_names, importance)) |
|
|
else: |
|
|
return {f'feature_{i}': imp for i, imp in enumerate(importance)} |
|
|
elif hasattr(model, 'coef_'): |
|
|
|
|
|
coef = np.abs(model.coef_[0]) if len(model.coef_.shape) > 1 else np.abs(model.coef_) |
|
|
if self.feature_names is not None: |
|
|
return dict(zip(self.feature_names, coef)) |
|
|
else: |
|
|
return {f'feature_{i}': imp for i, imp in enumerate(coef)} |
|
|
except: |
|
|
pass |
|
|
return None |
|
|
|
|
|
def compare_models(self, X: np.ndarray, y: np.ndarray, |
|
|
models_to_compare: Optional[List[str]] = None) -> Dict[str, Any]: |
|
|
""" |
|
|
Compare multiple models using cross-validation |
|
|
|
|
|
Args: |
|
|
X: Feature matrix |
|
|
y: Target labels |
|
|
models_to_compare: List of model names to compare (None for all) |
|
|
|
|
|
Returns: |
|
|
Comparison results |
|
|
""" |
|
|
if models_to_compare is None: |
|
|
models_to_compare = list(self.models.keys()) |
|
|
|
|
|
logger.info(f"Comparing {len(models_to_compare)} models...") |
|
|
|
|
|
|
|
|
X_scaled = self.scaler.fit_transform(X) |
|
|
|
|
|
results = {} |
|
|
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) |
|
|
|
|
|
for model_name in models_to_compare: |
|
|
if model_name not in self.models: |
|
|
continue |
|
|
|
|
|
logger.info(f"Evaluating {model_name}...") |
|
|
model = self.models[model_name] |
|
|
|
|
|
|
|
|
cv_scores = cross_val_score(model, X_scaled, y, cv=cv, scoring='f1_macro') |
|
|
|
|
|
|
|
|
model.fit(X_scaled, y) |
|
|
y_pred = model.predict(X_scaled) |
|
|
|
|
|
results[model_name] = { |
|
|
'cv_mean': cv_scores.mean(), |
|
|
'cv_std': cv_scores.std(), |
|
|
'cv_scores': cv_scores.tolist(), |
|
|
'accuracy': accuracy_score(y, y_pred), |
|
|
'f1_macro': f1_score(y, y_pred, average='macro', zero_division=0), |
|
|
'training_time': 0 |
|
|
} |
|
|
|
|
|
|
|
|
sorted_results = dict(sorted(results.items(), key=lambda x: x[1]['f1_macro'], reverse=True)) |
|
|
|
|
|
logger.info("Model comparison completed") |
|
|
return sorted_results |
|
|
|
|
|
    def train_all_models(self, data: pd.DataFrame, optimize_hyperparameters: bool = True) -> Dict[str, Any]:
        """
        Train all available models on a labelled DataFrame.

        Args:
            data: Training data with 'text' and 'sentiment' columns.
            optimize_hyperparameters: Whether to grid-search each model.

        Returns:
            Per-model training results; models that fail map to
            ``{'error': <message>}`` instead of aborting the run.
        """
        logger.info("Training all available models...")

        texts = data['text'].tolist()
        labels = data['sentiment'].tolist()

        # Feature extraction fits the TF-IDF vectorizer as a side effect;
        # the label encoder is (re)fitted on this dataset's labels.
        X = self.extract_features(texts)
        y = self.label_encoder.fit_transform(labels)

        # Record human-readable feature names: TF-IDF vocabulary followed
        # by the hand-crafted features appended by extract_features, in
        # the same order extract_features builds them.
        if self.vectorizer is not None:
            tfidf_features = self.vectorizer.get_feature_names_out()
            additional_features = [
                'text_length', 'word_count', 'uppercase_count', 'digit_count',
                'punctuation_count', 'polarity', 'confidence', 'subjectivity',
                'joy', 'sadness', 'anger', 'fear', 'surprise', 'disgust', 'avg_word_length'
            ]
            self.feature_names = list(tfidf_features) + additional_features

        # Train each model independently; one failure must not stop the rest.
        all_results = {}
        for model_name in self.models.keys():
            try:
                results = self.train_model(model_name, X, y, optimize_hyperparameters)
                all_results[model_name] = results
                logger.info(f"โ {model_name}: F1={results['f1_macro']:.3f}, Accuracy={results['accuracy']:.3f}")
            except Exception as e:
                logger.error(f"โ Failed to train {model_name}: {e}")
                all_results[model_name] = {'error': str(e)}

        # Keep the dataset for later summaries/persistence metadata.
        self.training_data = data

        logger.info("All models training completed")
        return all_results
|
|
|
|
|
def predict_sentiment(self, text: str, model_name: str = 'random_forest') -> Dict[str, Any]: |
|
|
""" |
|
|
Predict sentiment for a single text using trained model |
|
|
|
|
|
Args: |
|
|
text: Text to analyze |
|
|
model_name: Name of the model to use |
|
|
|
|
|
Returns: |
|
|
Prediction results |
|
|
""" |
|
|
if model_name not in self.models: |
|
|
raise ValueError(f"Model {model_name} not found. Available models: {list(self.models.keys())}") |
|
|
|
|
|
if self.vectorizer is None: |
|
|
raise ValueError("No trained model found. Please train a model first.") |
|
|
|
|
|
|
|
|
X = self.extract_features([text]) |
|
|
X_scaled = self.scaler.transform(X) |
|
|
|
|
|
|
|
|
model = self.models[model_name] |
|
|
prediction = model.predict(X_scaled)[0] |
|
|
probabilities = model.predict_proba(X_scaled)[0] if hasattr(model, 'predict_proba') else None |
|
|
|
|
|
|
|
|
sentiment = self.label_encoder.inverse_transform([prediction])[0] |
|
|
|
|
|
result = { |
|
|
'text': text, |
|
|
'sentiment': sentiment, |
|
|
'confidence': float(probabilities[prediction]) if probabilities is not None else 0.0, |
|
|
'probabilities': { |
|
|
label: float(prob) for label, prob in zip(self.label_encoder.classes_, probabilities) |
|
|
} if probabilities is not None else None, |
|
|
'model_used': model_name |
|
|
} |
|
|
|
|
|
return result |
|
|
|
|
|
def save_model(self, model_name: str, filepath: str): |
|
|
"""Save trained model to file""" |
|
|
if model_name not in self.models: |
|
|
raise ValueError(f"Model {model_name} not found") |
|
|
|
|
|
model_data = { |
|
|
'model': self.models[model_name], |
|
|
'label_encoder': self.label_encoder, |
|
|
'scaler': self.scaler, |
|
|
'vectorizer': self.vectorizer, |
|
|
'feature_names': self.feature_names, |
|
|
'feature_params': self.feature_params, |
|
|
'training_data_info': { |
|
|
'num_samples': len(self.training_data) if self.training_data is not None else 0, |
|
|
'features': X.shape[1] if hasattr(self, 'X') else 0 |
|
|
} if self.training_data is not None else None |
|
|
} |
|
|
|
|
|
with open(filepath, 'wb') as f: |
|
|
pickle.dump(model_data, f) |
|
|
|
|
|
logger.info(f"Model {model_name} saved to {filepath}") |
|
|
|
|
|
def load_model(self, filepath: str): |
|
|
"""Load trained model from file""" |
|
|
with open(filepath, 'rb') as f: |
|
|
model_data = pickle.load(f) |
|
|
|
|
|
self.models['loaded'] = model_data['model'] |
|
|
self.label_encoder = model_data['label_encoder'] |
|
|
self.scaler = model_data['scaler'] |
|
|
self.vectorizer = model_data['vectorizer'] |
|
|
self.feature_names = model_data['feature_names'] |
|
|
self.feature_params = model_data['feature_params'] |
|
|
|
|
|
logger.info(f"Model loaded from {filepath}") |
|
|
|
|
|
def get_training_summary(self) -> Dict[str, Any]: |
|
|
"""Get summary of training configuration and available models""" |
|
|
return { |
|
|
'available_models': list(self.models.keys()), |
|
|
'xgboost_available': XGBOOST_AVAILABLE, |
|
|
'lightgbm_available': LIGHTGBM_AVAILABLE, |
|
|
'catboost_available': CATBOOST_AVAILABLE, |
|
|
'plotting_available': PLOTTING_AVAILABLE, |
|
|
'feature_params': self.feature_params, |
|
|
'training_data_samples': len(self.training_data) if self.training_data is not None else 0, |
|
|
'model_cache_dir': str(self.model_cache_dir) |
|
|
} |
|
|
|
|
|
|
|
|
def main():
    """Demo entry point: train, compare, and persist SentilensAI models.

    Builds a trainer, generates a synthetic corpus, trains every
    registered model with hyperparameter search, prints a scoreboard,
    spot-checks predictions, and saves the best model by macro F1.
    """
    print("๐ค SentilensAI - Machine Learning Training Pipeline")
    print("=" * 60)

    # Trainer with the default estimator roster.
    trainer = SentilensAITrainer()

    # Show which optional backends were detected at import time.
    summary = trainer.get_training_summary()
    print(f"\n๐ Training Configuration:")
    print(f"Available Models: {len(summary['available_models'])}")
    print(f"XGBoost Available: {summary['xgboost_available']}")
    print(f"LightGBM Available: {summary['lightgbm_available']}")
    print(f"CatBoost Available: {summary['catboost_available']}")
    print(f"Plotting Available: {summary['plotting_available']}")

    # Small synthetic corpus for the demo run.
    print(f"\n๐ Creating synthetic training data...")
    training_data = trainer.create_synthetic_training_data(num_samples=500)
    print(f"Created {len(training_data)} training samples")
    print(f"Sentiment distribution: {training_data['sentiment'].value_counts().to_dict()}")

    # Train every registered model (grid search enabled).
    print(f"\n๐ Training all models...")
    results = trainer.train_all_models(training_data, optimize_hyperparameters=True)

    # Per-model scoreboard; failed models report their error instead.
    print(f"\n๐ Training Results:")
    print("-" * 60)
    for model_name, result in results.items():
        if 'error' not in result:
            print(f"{model_name:20} | F1: {result['f1_macro']:.3f} | Accuracy: {result['accuracy']:.3f} | Time: {result['training_time']:.1f}s")
        else:
            print(f"{model_name:20} | Error: {result['error']}")

    # Spot-check predictions with the random forest.
    print(f"\n๐ฎ Testing predictions...")
    test_texts = [
        "I love this chatbot! It's amazing!",
        "This is terrible. I hate it.",
        "Can you help me with my account?"
    ]

    for text in test_texts:
        try:
            prediction = trainer.predict_sentiment(text, 'random_forest')
            print(f"Text: '{text}'")
            print(f"Prediction: {prediction['sentiment']} (confidence: {prediction['confidence']:.3f})")
        except Exception as e:
            print(f"Prediction failed: {e}")
        print()

    # Persist the best model by macro F1; errored models score 0 so they
    # can never be selected unless everything failed.
    best_model = max(results.keys(), key=lambda k: results[k].get('f1_macro', 0) if 'error' not in results[k] else 0)
    if 'error' not in results[best_model]:
        model_path = f"sentiments_ai_{best_model}_model.pkl"
        trainer.save_model(best_model, model_path)
        print(f"๐พ Best model ({best_model}) saved to {model_path}")

    print("\nโ SentilensAI ML training demo completed!")
    print("๐ Ready for production sentiment analysis!")
|
|
|