""" SentilensAI - Machine Learning Training Pipeline This module provides comprehensive machine learning capabilities for training custom sentiment analysis models specifically optimized for AI chatbot conversations. Features: - Multiple ML algorithms (Random Forest, SVM, Neural Networks, XGBoost, etc.) - Advanced feature engineering for chatbot text - Cross-validation and hyperparameter tuning - Model comparison and evaluation - Production-ready model persistence - Real-time prediction capabilities Author: Pravin Selvamuthu Repository: https://github.com/kernelseed/sentilens-ai """ import os import json import pickle import logging from typing import Dict, List, Tuple, Optional, Any, Union from datetime import datetime from pathlib import Path import warnings warnings.filterwarnings('ignore') import pandas as pd import numpy as np from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler from sklearn.metrics import ( classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, balanced_accuracy_score, matthews_corrcoef, cohen_kappa_score ) from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, AdaBoostClassifier from sklearn.svm import SVC from sklearn.neural_network import MLPClassifier from sklearn.linear_model import LogisticRegression from sklearn.tree import DecisionTreeClassifier from sklearn.naive_bayes import GaussianNB, MultinomialNB from sklearn.pipeline import Pipeline from sklearn.calibration import CalibratedClassifierCV import joblib # Advanced ML libraries try: import xgboost as xgb XGBOOST_AVAILABLE = True except ImportError: XGBOOST_AVAILABLE = False try: import lightgbm as lgb LIGHTGBM_AVAILABLE = True except ImportError: LIGHTGBM_AVAILABLE = False try: import catboost as cb CATBOOST_AVAILABLE = True except ImportError: CATBOOST_AVAILABLE = False # Visualization try: import matplotlib.pyplot as plt import seaborn as sns PLOTTING_AVAILABLE = True except ImportError: PLOTTING_AVAILABLE = False # LangChain integration from langchain.schema import BaseMessage from langchain.prompts import PromptTemplate from langchain.chains import LLMChain from langchain.llms import OpenAI # Import our sentiment analyzer from sentiment_analyzer import SentilensAIAnalyzer, SentimentResult # Configure logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) class SentilensAITrainer: """ Advanced machine learning trainer for sentiment analysis models specifically designed for AI chatbot conversations """ def __init__(self, model_cache_dir: str = "./model_cache"): """ Initialize the SentimentsAI trainer Args: model_cache_dir: Directory to cache trained models """ self.model_cache_dir = Path(model_cache_dir) self.model_cache_dir.mkdir(exist_ok=True) # Initialize components self.analyzer = SentilensAIAnalyzer() self.label_encoder = LabelEncoder() self.scaler = RobustScaler() self.vectorizer = None self.models = {} self.training_data = None self.feature_names = None # Initialize available models self._initialize_models() # Feature engineering parameters self.feature_params = { 'max_features': 10000, 'ngram_range': (1, 3), 'min_df': 2, 'max_df': 0.95, 'stop_words': 'english' } def _initialize_models(self): """Initialize available machine learning models""" self.models = { 
        self.models = {
            'random_forest': RandomForestClassifier(
                n_estimators=100, max_depth=10, random_state=42, n_jobs=-1
            ),
            'extra_trees': ExtraTreesClassifier(
                n_estimators=100, max_depth=10, random_state=42, n_jobs=-1
            ),
            'gradient_boosting': GradientBoostingClassifier(
                n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42
            ),
            'svm': SVC(
                kernel='rbf', C=1.0, gamma='scale', random_state=42, probability=True
            ),
            'neural_network': MLPClassifier(
                hidden_layer_sizes=(100, 50), activation='relu', solver='adam',
                alpha=0.001, learning_rate='adaptive', max_iter=500, random_state=42
            ),
            'logistic_regression': LogisticRegression(
                random_state=42, max_iter=1000, n_jobs=-1
            ),
            'decision_tree': DecisionTreeClassifier(
                max_depth=10, random_state=42
            ),
            'naive_bayes': MultinomialNB(alpha=1.0),
            'ada_boost': AdaBoostClassifier(
                n_estimators=50, learning_rate=1.0, random_state=42
            )
        }

        # Add advanced models if available
        if XGBOOST_AVAILABLE:
            self.models['xgboost'] = xgb.XGBClassifier(
                n_estimators=100, max_depth=6, learning_rate=0.1,
                random_state=42, n_jobs=-1
            )

        if LIGHTGBM_AVAILABLE:
            self.models['lightgbm'] = lgb.LGBMClassifier(
                n_estimators=100, max_depth=6, learning_rate=0.1,
                random_state=42, n_jobs=-1, verbose=-1
            )

        if CATBOOST_AVAILABLE:
            self.models['catboost'] = cb.CatBoostClassifier(
                iterations=100, depth=6, learning_rate=0.1,
                random_seed=42, verbose=False
            )

    def create_synthetic_training_data(self, num_samples: int = 1000) -> pd.DataFrame:
        """
        Create synthetic training data for sentiment analysis

        Args:
            num_samples: Number of samples to generate

        Returns:
            DataFrame with text and sentiment labels
        """
        logger.info(f"Creating {num_samples} synthetic training samples...")

        # Define sentiment categories and sample texts
        sentiment_data = {
            'positive': [
                "I love this chatbot! It's amazing and so helpful.",
                "This is exactly what I needed. Thank you so much!",
                "Great service! The bot understood me perfectly.",
                "Excellent! This chatbot is fantastic and very user-friendly.",
                "Perfect! I'm so happy with this experience.",
                "Wonderful! The bot provided exactly the right information.",
                "Outstanding service! I'm impressed with the quality.",
                "Brilliant! This is the best chatbot I've ever used.",
                "Fantastic! The response was quick and accurate.",
                "Superb! I'm delighted with the help I received."
            ],
            'negative': [
                "This chatbot is terrible. It doesn't understand anything.",
                "Worst experience ever. The bot is completely useless.",
                "This is awful. I'm frustrated and disappointed.",
                "Horrible service! The bot keeps giving wrong answers.",
                "Disgusting! This chatbot is a complete waste of time.",
                "Terrible! I hate this bot and its responses.",
                "Awful experience. The bot is stupid and unhelpful.",
                "Disappointing! This chatbot is broken and useless.",
                "Frustrating! The bot doesn't know what it's doing.",
                "Pathetic! This is the worst chatbot I've ever seen."
            ],
            'neutral': [
                "Can you help me with my account information?",
                "I need to check my order status.",
                "What are your business hours?",
                "How do I reset my password?",
                "I want to update my profile details.",
                "Can you provide more information about this product?",
                "I need assistance with my subscription.",
                "What is your return policy?",
                "How can I contact customer support?",
                "I have a question about my recent purchase."
            ]
        }
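        # Each sample pairs a randomly chosen base template with a small
        # surface variation (greeting prefix or courtesy suffix), giving the
        # classifiers modest lexical diversity around each sentiment class.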
        # Generate synthetic data
        data = []
        samples_per_sentiment = num_samples // 3

        for sentiment, texts in sentiment_data.items():
            for i in range(samples_per_sentiment):
                # Select base text
                base_text = np.random.choice(texts)

                # Add variations
                variations = [
                    base_text,
                    base_text + " Please help me.",
                    "Hi, " + base_text.lower(),
                    base_text + " Thanks!",
                    "Hello, " + base_text.lower(),
                    base_text + " I appreciate it.",
                    "Hey, " + base_text.lower(),
                    base_text + " Could you assist?",
                    "Good morning, " + base_text.lower(),
                    base_text + " That would be great."
                ]
                text = np.random.choice(variations)

                data.append({
                    'text': text,
                    'sentiment': sentiment,
                    'confidence': np.random.uniform(0.6, 1.0),
                    'polarity': np.random.uniform(-1, 1) if sentiment == 'neutral' else (1 if sentiment == 'positive' else -1),
                    'subjectivity': np.random.uniform(0.3, 0.8),
                    'message_type': 'user' if i % 2 == 0 else 'bot',
                    'conversation_id': f'conv_{i//2}',
                    'timestamp': datetime.now()
                })

        # Add some mixed-sentiment examples
        mixed_examples = [
            ("I'm not sure if this is good or bad.", "neutral"),
            ("It's okay, I guess.", "neutral"),
            ("This is fine, nothing special.", "neutral"),
            ("I have mixed feelings about this.", "neutral"),
            ("It's decent but could be better.", "neutral")
        ]

        for text, sentiment in mixed_examples:
            data.append({
                'text': text,
                'sentiment': sentiment,
                'confidence': np.random.uniform(0.4, 0.7),
                'polarity': np.random.uniform(-0.3, 0.3),
                'subjectivity': np.random.uniform(0.5, 0.9),
                'message_type': 'user',
                'conversation_id': f'mixed_{len(data)}',
                'timestamp': datetime.now()
            })

        df = pd.DataFrame(data)
        logger.info(f"Created {len(df)} training samples")
        return df

    def extract_features(self, texts: List[str]) -> np.ndarray:
        """
        Extract comprehensive features from text data

        Args:
            texts: List of text strings

        Returns:
            Feature matrix
        """
        logger.info("Extracting features from text data...")

        # Fit the TF-IDF vectorizer on the first call; on later calls reuse
        # it (transform only) so prediction-time features keep the same
        # dimensionality as training-time features.
        if self.vectorizer is None:
            self.vectorizer = TfidfVectorizer(
                max_features=self.feature_params['max_features'],
                ngram_range=self.feature_params['ngram_range'],
                min_df=self.feature_params['min_df'],
                max_df=self.feature_params['max_df'],
                stop_words=self.feature_params['stop_words']
            )
            tfidf_features = self.vectorizer.fit_transform(texts).toarray()
        else:
            tfidf_features = self.vectorizer.transform(texts).toarray()

        # Additional text features
        text_features = []
        for text in texts:
            features = []

            # Basic text statistics
            features.append(len(text))                              # Text length
            features.append(len(text.split()))                      # Word count
            features.append(len([c for c in text if c.isupper()]))  # Uppercase count
            features.append(len([c for c in text if c.isdigit()]))  # Digit count
            features.append(len([c for c in text if c in '!?']))    # '!'/'?' count

            # Sentiment features using our analyzer
            try:
                sentiment_result = self.analyzer.analyze_sentiment(text, method='ensemble')
                features.extend([
                    sentiment_result.polarity,
                    sentiment_result.confidence,
                    sentiment_result.subjectivity
                ])

                # Emotion features
                for emotion in ['joy', 'sadness', 'anger', 'fear', 'surprise', 'disgust']:
                    features.append(sentiment_result.emotions.get(emotion, 0.0))
            except Exception:
                features.extend([0.0] * 9)  # Default values if analysis fails

            # Text complexity features
            words = text.split()
            if words:
                avg_word_length = np.mean([len(word) for word in words])
                features.append(avg_word_length)
            else:
                features.append(0.0)

            text_features.append(features)

        text_features = np.array(text_features)

        # Combine all features
        all_features = np.hstack([tfidf_features, text_features])

        logger.info(f"Extracted {all_features.shape[1]} features from {len(texts)} texts")
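        # Final matrix layout: TF-IDF columns first, then 15 handcrafted
        # columns (5 surface statistics, 3 sentiment scores, 6 emotion
        # scores, average word length) in the order appended above.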
        return all_features

    def train_model(self, model_name: str, X: np.ndarray, y: np.ndarray,
                    optimize_hyperparameters: bool = True) -> Dict[str, Any]:
        """
        Train a specific model

        Args:
            model_name: Name of the model to train
            X: Feature matrix
            y: Target labels
            optimize_hyperparameters: Whether to optimize hyperparameters

        Returns:
            Training results dictionary
        """
        if model_name not in self.models:
            raise ValueError(f"Unknown model: {model_name}")

        logger.info(f"Training {model_name} model...")

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # Scale features
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)

        # MultinomialNB rejects negative inputs, and RobustScaler can produce
        # them, so clip that model's features at zero.
        if model_name == 'naive_bayes':
            X_train_scaled = np.clip(X_train_scaled, 0, None)
            X_test_scaled = np.clip(X_test_scaled, 0, None)

        # Get base model
        model = self.models[model_name]

        # Optimize hyperparameters if requested
        if optimize_hyperparameters:
            model = self._optimize_hyperparameters(model, model_name, X_train_scaled, y_train)

        # Train model
        start_time = datetime.now()
        model.fit(X_train_scaled, y_train)
        training_time = (datetime.now() - start_time).total_seconds()

        # Make predictions
        y_pred = model.predict(X_test_scaled)
        y_pred_proba = model.predict_proba(X_test_scaled) if hasattr(model, 'predict_proba') else None

        # Evaluate model
        results = self._evaluate_model(y_test, y_pred, y_pred_proba, model.classes_)
        results.update({
            'model_name': model_name,
            'training_time': training_time,
            'model': model,
            'feature_importance': self._get_feature_importance(model, model_name)
        })

        # Store trained model
        self.models[model_name] = model

        logger.info(f"Training completed for {model_name}")
        return results

    def _optimize_hyperparameters(self, model, model_name: str, X: np.ndarray, y: np.ndarray):
        """Optimize hyperparameters using GridSearchCV"""
        param_grids = {
            'random_forest': {
                'n_estimators': [50, 100, 200],
                'max_depth': [5, 10, 15, None],
                'min_samples_split': [2, 5, 10]
            },
            'extra_trees': {
                'n_estimators': [50, 100, 200],
                'max_depth': [5, 10, 15, None],
                'min_samples_split': [2, 5, 10]
            },
            'gradient_boosting': {
                'n_estimators': [50, 100, 200],
                'learning_rate': [0.01, 0.1, 0.2],
                'max_depth': [3, 6, 10]
            },
            'svm': {
                'C': [0.1, 1, 10, 100],
                'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
                'kernel': ['rbf', 'linear']
            },
            'neural_network': {
                'hidden_layer_sizes': [(50,), (100,), (100, 50), (200, 100)],
                'alpha': [0.0001, 0.001, 0.01],
                'learning_rate': ['constant', 'adaptive']
            },
            'logistic_regression': {
                'C': [0.1, 1, 10, 100],
                'penalty': ['l1', 'l2'],
                'solver': ['liblinear', 'saga']
            },
            'decision_tree': {
                'max_depth': [5, 10, 15, None],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4]
            },
            'naive_bayes': {
                'alpha': [0.1, 0.5, 1.0, 2.0]
            },
            'ada_boost': {
                'n_estimators': [25, 50, 100],
                'learning_rate': [0.5, 1.0, 1.5]
            }
        }

        if XGBOOST_AVAILABLE and model_name == 'xgboost':
            param_grids['xgboost'] = {
                'n_estimators': [50, 100, 200],
                'max_depth': [3, 6, 10],
                'learning_rate': [0.01, 0.1, 0.2]
            }

        if LIGHTGBM_AVAILABLE and model_name == 'lightgbm':
            param_grids['lightgbm'] = {
                'n_estimators': [50, 100, 200],
                'max_depth': [3, 6, 10],
                'learning_rate': [0.01, 0.1, 0.2]
            }

        if CATBOOST_AVAILABLE and model_name == 'catboost':
            param_grids['catboost'] = {
                'iterations': [50, 100, 200],
                'depth': [3, 6, 10],
                'learning_rate': [0.01, 0.1, 0.2]
            }

        if model_name in param_grids:
            logger.info(f"Optimizing hyperparameters for {model_name}...")
            grid_search = GridSearchCV(
                model, param_grids[model_name], cv=3,
                scoring='f1_macro', n_jobs=-1, verbose=0
            )
            grid_search.fit(X, y)
            return grid_search.best_estimator_

        return model
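    # The grid search above scores candidates with macro-F1 over 3-fold CV,
    # so the three sentiment classes carry equal weight during selection.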
    def _evaluate_model(self, y_true, y_pred, y_pred_proba, classes) -> Dict[str, Any]:
        """Comprehensive model evaluation"""
        results = {
            'accuracy': accuracy_score(y_true, y_pred),
            'balanced_accuracy': balanced_accuracy_score(y_true, y_pred),
            'precision_macro': precision_score(y_true, y_pred, average='macro', zero_division=0),
            'precision_micro': precision_score(y_true, y_pred, average='micro', zero_division=0),
            'precision_weighted': precision_score(y_true, y_pred, average='weighted', zero_division=0),
            'recall_macro': recall_score(y_true, y_pred, average='macro', zero_division=0),
            'recall_micro': recall_score(y_true, y_pred, average='micro', zero_division=0),
            'recall_weighted': recall_score(y_true, y_pred, average='weighted', zero_division=0),
            'f1_macro': f1_score(y_true, y_pred, average='macro', zero_division=0),
            'f1_micro': f1_score(y_true, y_pred, average='micro', zero_division=0),
            'f1_weighted': f1_score(y_true, y_pred, average='weighted', zero_division=0),
            'matthews_corrcoef': matthews_corrcoef(y_true, y_pred),
            'cohen_kappa': cohen_kappa_score(y_true, y_pred),
            'classification_report': classification_report(y_true, y_pred, output_dict=True),
            'confusion_matrix': confusion_matrix(y_true, y_pred).tolist()
        }

        # Add ROC AUC if probabilities are available
        if y_pred_proba is not None and len(classes) > 2:
            try:
                results['roc_auc'] = roc_auc_score(y_true, y_pred_proba, multi_class='ovr', average='macro')
            except Exception:
                results['roc_auc'] = 0.0
        elif y_pred_proba is not None and len(classes) == 2:
            try:
                results['roc_auc'] = roc_auc_score(y_true, y_pred_proba[:, 1])
            except Exception:
                results['roc_auc'] = 0.0
        else:
            results['roc_auc'] = 0.0

        return results

    def _get_feature_importance(self, model, model_name: str) -> Optional[Dict[str, float]]:
        """Get feature importance if available"""
        try:
            if hasattr(model, 'feature_importances_'):
                importance = model.feature_importances_
                if self.feature_names is not None:
                    return dict(zip(self.feature_names, importance))
                else:
                    return {f'feature_{i}': imp for i, imp in enumerate(importance)}
            elif hasattr(model, 'coef_'):
                # For linear models, use absolute coefficients
                coef = np.abs(model.coef_[0]) if len(model.coef_.shape) > 1 else np.abs(model.coef_)
                if self.feature_names is not None:
                    return dict(zip(self.feature_names, coef))
                else:
                    return {f'feature_{i}': imp for i, imp in enumerate(coef)}
        except Exception:
            pass

        return None

    def compare_models(self, X: np.ndarray, y: np.ndarray,
                       models_to_compare: Optional[List[str]] = None) -> Dict[str, Any]:
        """
        Compare multiple models using cross-validation

        Args:
            X: Feature matrix
            y: Target labels
            models_to_compare: List of model names to compare (None for all)

        Returns:
            Comparison results
        """
        if models_to_compare is None:
            models_to_compare = list(self.models.keys())

        logger.info(f"Comparing {len(models_to_compare)} models...")

        # Scale features
        X_scaled = self.scaler.fit_transform(X)

        results = {}
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

        for model_name in models_to_compare:
            if model_name not in self.models:
                continue

            logger.info(f"Evaluating {model_name}...")
            model = self.models[model_name]

            # MultinomialNB requires non-negative inputs; clip as in train_model
            X_model = np.clip(X_scaled, 0, None) if model_name == 'naive_bayes' else X_scaled

            # Cross-validation scores
            cv_scores = cross_val_score(model, X_model, y, cv=cv, scoring='f1_macro')

            # Train and evaluate
            model.fit(X_model, y)
            y_pred = model.predict(X_model)

            results[model_name] = {
                'cv_mean': cv_scores.mean(),
                'cv_std': cv_scores.std(),
                'cv_scores': cv_scores.tolist(),
                'accuracy': accuracy_score(y, y_pred),
                'f1_macro': f1_score(y, y_pred, average='macro', zero_division=0),
                'training_time': 0  # Could be measured if needed
            }
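        # Note: 'accuracy' and 'f1_macro' above are computed in-sample (the
        # model is fit and scored on the same data); prefer the
        # cross-validated 'cv_mean' when comparing models.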
        # Sort by F1 score
        sorted_results = dict(sorted(results.items(), key=lambda x: x[1]['f1_macro'], reverse=True))

        logger.info("Model comparison completed")
        return sorted_results

    def train_all_models(self, data: pd.DataFrame,
                         optimize_hyperparameters: bool = True) -> Dict[str, Any]:
        """
        Train all available models

        Args:
            data: Training data DataFrame
            optimize_hyperparameters: Whether to optimize hyperparameters

        Returns:
            Training results for all models
        """
        logger.info("Training all available models...")

        # Prepare data
        texts = data['text'].tolist()
        labels = data['sentiment'].tolist()

        # Extract features
        X = self.extract_features(texts)
        y = self.label_encoder.fit_transform(labels)

        # Store feature names for importance analysis
        if self.vectorizer is not None:
            tfidf_features = self.vectorizer.get_feature_names_out()
            additional_features = [
                'text_length', 'word_count', 'uppercase_count', 'digit_count',
                'punctuation_count', 'polarity', 'confidence', 'subjectivity',
                'joy', 'sadness', 'anger', 'fear', 'surprise', 'disgust',
                'avg_word_length'
            ]
            self.feature_names = list(tfidf_features) + additional_features

        # Train all models
        all_results = {}
        for model_name in self.models.keys():
            try:
                results = self.train_model(model_name, X, y, optimize_hyperparameters)
                all_results[model_name] = results
                logger.info(f"✅ {model_name}: F1={results['f1_macro']:.3f}, Accuracy={results['accuracy']:.3f}")
            except Exception as e:
                logger.error(f"❌ Failed to train {model_name}: {e}")
                all_results[model_name] = {'error': str(e)}

        # Store training data
        self.training_data = data

        logger.info("All models training completed")
        return all_results
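    # Illustrative call (the output values are hypothetical examples):
    #   trainer.predict_sentiment("Thanks, that helped!", 'random_forest')
    #   -> {'sentiment': 'positive', 'confidence': 0.91, ...}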
    def predict_sentiment(self, text: str, model_name: str = 'random_forest') -> Dict[str, Any]:
        """
        Predict sentiment for a single text using a trained model

        Args:
            text: Text to analyze
            model_name: Name of the model to use

        Returns:
            Prediction results
        """
        if model_name not in self.models:
            raise ValueError(f"Model {model_name} not found. Available models: {list(self.models.keys())}")

        if self.vectorizer is None:
            raise ValueError("No trained model found. Please train a model first.")

        # Extract features (the fitted vectorizer is reused, not refit)
        X = self.extract_features([text])
        X_scaled = self.scaler.transform(X)
        if model_name == 'naive_bayes':
            X_scaled = np.clip(X_scaled, 0, None)  # match training-time clipping

        # Make prediction
        model = self.models[model_name]
        prediction = model.predict(X_scaled)[0]
        probabilities = model.predict_proba(X_scaled)[0] if hasattr(model, 'predict_proba') else None

        # Decode prediction
        sentiment = self.label_encoder.inverse_transform([prediction])[0]

        result = {
            'text': text,
            'sentiment': sentiment,
            'confidence': float(probabilities[prediction]) if probabilities is not None else 0.0,
            'probabilities': {
                label: float(prob)
                for label, prob in zip(self.label_encoder.classes_, probabilities)
            } if probabilities is not None else None,
            'model_used': model_name
        }

        return result

    def save_model(self, model_name: str, filepath: str):
        """Save trained model to file"""
        if model_name not in self.models:
            raise ValueError(f"Model {model_name} not found")

        model_data = {
            'model': self.models[model_name],
            'label_encoder': self.label_encoder,
            'scaler': self.scaler,
            'vectorizer': self.vectorizer,
            'feature_names': self.feature_names,
            'feature_params': self.feature_params,
            'training_data_info': {
                'num_samples': len(self.training_data),
                'features': len(self.feature_names) if self.feature_names is not None else 0
            } if self.training_data is not None else None
        }

        with open(filepath, 'wb') as f:
            pickle.dump(model_data, f)

        logger.info(f"Model {model_name} saved to {filepath}")

    def load_model(self, filepath: str):
        """Load trained model from file"""
        with open(filepath, 'rb') as f:
            model_data = pickle.load(f)

        self.models['loaded'] = model_data['model']
        self.label_encoder = model_data['label_encoder']
        self.scaler = model_data['scaler']
        self.vectorizer = model_data['vectorizer']
        self.feature_names = model_data['feature_names']
        self.feature_params = model_data['feature_params']

        logger.info(f"Model loaded from {filepath}")

    def get_training_summary(self) -> Dict[str, Any]:
        """Get summary of training configuration and available models"""
        return {
            'available_models': list(self.models.keys()),
            'xgboost_available': XGBOOST_AVAILABLE,
            'lightgbm_available': LIGHTGBM_AVAILABLE,
            'catboost_available': CATBOOST_AVAILABLE,
            'plotting_available': PLOTTING_AVAILABLE,
            'feature_params': self.feature_params,
            'training_data_samples': len(self.training_data) if self.training_data is not None else 0,
            'model_cache_dir': str(self.model_cache_dir)
        }
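
# Minimal usage sketch (a quick session without hyperparameter search;
# 'random_forest' is one of the registry keys initialized above):
#
#   trainer = SentilensAITrainer()
#   data = trainer.create_synthetic_training_data(num_samples=300)
#   trainer.train_all_models(data, optimize_hyperparameters=False)
#   print(trainer.predict_sentiment("Great, that fixed it!", 'random_forest'))
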
def main():
    """Demo function to showcase SentilensAI ML training capabilities"""
    print("🤖 SentilensAI - Machine Learning Training Pipeline")
    print("=" * 60)

    # Initialize trainer
    trainer = SentilensAITrainer()

    # Get training summary
    summary = trainer.get_training_summary()
    print("\n📊 Training Configuration:")
    print(f"Available Models: {len(summary['available_models'])}")
    print(f"XGBoost Available: {summary['xgboost_available']}")
    print(f"LightGBM Available: {summary['lightgbm_available']}")
    print(f"CatBoost Available: {summary['catboost_available']}")
    print(f"Plotting Available: {summary['plotting_available']}")

    # Create synthetic training data
    print("\n🔄 Creating synthetic training data...")
    training_data = trainer.create_synthetic_training_data(num_samples=500)
    print(f"Created {len(training_data)} training samples")
    print(f"Sentiment distribution: {training_data['sentiment'].value_counts().to_dict()}")

    # Train all models
    print("\n🚀 Training all models...")
    results = trainer.train_all_models(training_data, optimize_hyperparameters=True)

    # Display results
    print("\n📈 Training Results:")
    print("-" * 60)
    for model_name, result in results.items():
        if 'error' not in result:
            print(f"{model_name:20} | F1: {result['f1_macro']:.3f} | Accuracy: {result['accuracy']:.3f} | Time: {result['training_time']:.1f}s")
        else:
            print(f"{model_name:20} | Error: {result['error']}")

    # Test prediction
    print("\n🔮 Testing predictions...")
    test_texts = [
        "I love this chatbot! It's amazing!",
        "This is terrible. I hate it.",
        "Can you help me with my account?"
    ]

    for text in test_texts:
        try:
            prediction = trainer.predict_sentiment(text, 'random_forest')
            print(f"Text: '{text}'")
            print(f"Prediction: {prediction['sentiment']} (confidence: {prediction['confidence']:.3f})")
        except Exception as e:
            print(f"Prediction failed: {e}")
        print()

    # Save best model
    best_model = max(
        results.keys(),
        key=lambda k: results[k].get('f1_macro', 0) if 'error' not in results[k] else 0
    )
    if 'error' not in results[best_model]:
        model_path = f"sentiments_ai_{best_model}_model.pkl"
        trainer.save_model(best_model, model_path)
        print(f"💾 Best model ({best_model}) saved to {model_path}")

    print("\n✅ SentilensAI ML training demo completed!")
    print("🚀 Ready for production sentiment analysis!")


if __name__ == "__main__":
    main()