Ashokdll committed on
Commit 8cdb327 · verified · 1 Parent(s): 8fce1f4

Update app.py

Files changed (1): app.py +590 -233
app.py CHANGED
@@ -5,104 +5,183 @@ import numpy as np
import json
from datetime import datetime
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

- class DistilBERTHateSpeechDetector:
    def __init__(self):
-         self.model = None
-         self.tokenizer = None
-         self.classifier = None
-         self.load_model()

-     def load_model(self):
-         """Load the fine-tuned DistilBERT model"""
        try:
-             logger.info("Loading DistilBERT hate speech detection model...")
-
-             # Try to load from local model directory
            model_path = "./model"

-             # Load tokenizer
-             self.tokenizer = AutoTokenizer.from_pretrained(model_path)
-             logger.info("✅ Tokenizer loaded successfully")
-
-             # Load model
-             self.model = AutoModelForSequenceClassification.from_pretrained(
                model_path,
-                 torch_dtype=torch.float32,
-                 device_map="auto"
            )
-             logger.info("✅ DistilBERT model loaded successfully")

-             # Create pipeline
-             self.classifier = pipeline(
                "text-classification",
-                 model=self.model,
-                 tokenizer=self.tokenizer,
                return_all_scores=True,
                device=0 if torch.cuda.is_available() else -1
            )

-             # Get model info
-             logger.info(f"Model architecture: {self.model.config.architectures[0]}")
-             logger.info(f"Number of labels: {self.model.config.num_labels}")
-             logger.info(f"Max sequence length: {self.model.config.max_position_embeddings}")

        except Exception as e:
-             logger.error(f"❌ Error loading custom model: {e}")
-             logger.info("🔄 Falling back to public model...")

-             # Fallback to a public model
-             try:
-                 self.classifier = pipeline(
-                     "text-classification",
-                     model="martin-ha/toxic-comment-model",
-                     return_all_scores=True
-                 )
-                 logger.info("✅ Fallback model loaded")
-             except Exception as fallback_error:
-                 logger.error(f"❌ Fallback model also failed: {fallback_error}")
-                 raise Exception("Failed to load any model")

-     def preprocess_text(self, text):
-         """Preprocess text for better analysis"""
-         if not text or not text.strip():
-             return ""
-
-         # Basic preprocessing
-         text = text.strip()
-         # Remove excessive whitespace
-         text = " ".join(text.split())
-
-         return text

    def detect_hate_speech(self, text):
-         """Detect hate speech with detailed analysis"""
        if not text or not text.strip():
            return {
                "status": "❌ Please enter some text to analyze.",
                "prediction": "No input",
                "confidence": 0.0,
                "all_scores": {},
-                 "risk_level": "Unknown"
            }

        try:
-             # Preprocess text
-             processed_text = self.preprocess_text(text)

-             # Get predictions
-             results = self.classifier(processed_text)
-
-             # Handle different output formats
            if isinstance(results, list) and len(results) > 0:
                if isinstance(results[0], list):
                    results = results[0]

-             # Parse results
            all_scores = {}
            max_score = 0
            predicted_label = "UNKNOWN"
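A quick illustration of why the nested-list check kept above is needed: with `return_all_scores=True`, the Transformers text-classification pipeline wraps each input's scores in an outer list. A minimal sketch with a public checkpoint, not the Space's fine-tuned model:

```python
from transformers import pipeline

# Illustrative only: with return_all_scores=True, one input yields a list
# containing one list of {label, score} dicts, hence the results[0] unwrap.
clf = pipeline(
    "text-classification",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    return_all_scores=True,
)
results = clf("This is a wonderful day!")
if isinstance(results, list) and isinstance(results[0], list):
    results = results[0]
print({r["label"]: round(r["score"], 3) for r in results})
# e.g. {'NEGATIVE': 0.0, 'POSITIVE': 1.0}
```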
@@ -122,7 +201,6 @@ class DistilBERTHateSpeechDetector:

            # Determine hate speech status
            hate_keywords = ["HATE", "TOXIC", "NEGATIVE", "HARMFUL", "1", "LABEL_1"]
-             clean_keywords = ["CLEAN", "NORMAL", "POSITIVE", "SAFE", "0", "LABEL_0"]

            is_hate_speech = False
            risk_level = "Low"
@@ -153,274 +231,553 @@ class DistilBERTHateSpeechDetector:
            }

        except Exception as e:
-             logger.error(f"Analysis error: {e}")
            return {
-                 "status": f"❌ Error during analysis: {str(e)}",
                "prediction": "Error",
                "confidence": 0.0,
                "all_scores": {},
-                 "risk_level": "Unknown"
            }

-     def generate_counter_narrative(self, text, detection_result):
-         """Generate educational counter-narrative based on detection"""
-         if not detection_result.get("is_hate_speech", False):
-             return "Great! This text promotes positive communication. Keep up the constructive dialogue!"

-         # Counter-narratives based on risk level
-         risk_level = detection_result.get("risk_level", "Low")

-         high_risk_responses = [
-             "🛡️ **Educational Response**: This type of language can cause real harm to individuals and communities. Consider how your words might affect others and try rephrasing with respect and empathy.",
-             "💡 **Constructive Alternative**: Instead of using harmful language, try expressing your concerns in a way that opens dialogue rather than shutting it down. Every person deserves dignity and respect.",
-             "🌍 **Community Impact**: Hate speech can escalate tensions and divide communities. Consider how you can contribute to a more inclusive and understanding environment.",
-             "📚 **Learning Opportunity**: Research shows that exposure to diverse perspectives actually strengthens critical thinking. Consider engaging with different viewpoints constructively."
-         ]

-         medium_risk_responses = [
-             "🤔 **Reflection Point**: This language might be interpreted as harmful by some. Consider rewording to express your point more constructively.",
-             "💬 **Communication Tip**: Try framing your message in a way that invites discussion rather than potentially excluding or hurting others.",
-             "🎯 **Focus Shift**: Instead of focusing on differences that divide, consider highlighting shared values or common ground.",
-             "🔄 **Reframe Opportunity**: How might you express this same sentiment in a way that brings people together rather than apart?"
-         ]

-         if risk_level == "High":
-             responses = high_risk_responses
-         elif risk_level == "Medium":
-             responses = medium_risk_responses
        else:
-             responses = [
-                 "💭 **Gentle Reminder**: While this might not be clearly harmful, consider how your words might be received by others.",
-                 "🌱 **Growth Mindset**: Every interaction is an opportunity to build understanding and connection.",
-                 "🤝 **Bridge Building**: Consider how you can use your voice to bring people together rather than create distance."
-             ]

        import random
-         return random.choice(responses)

-     def get_model_info(self):
-         """Get information about the loaded model"""
-         if self.model:
-             return {
-                 "Model Type": "DistilBERT (Fine-tuned)",
-                 "Architecture": self.model.config.architectures[0],
-                 "Parameters": f"~{66}M parameters",
-                 "Max Length": self.model.config.max_position_embeddings,
-                 "Labels": self.model.config.num_labels,
-                 "Framework": "PyTorch + Transformers"
-             }
-         return {"Model": "Fallback model in use"}

- # Initialize the detector
- logger.info("Initializing DistilBERT Hate Speech Detector...")
- detector = DistilBERTHateSpeechDetector()

- def analyze_text(text):
-     """Main analysis function for Gradio interface"""
-     start_time = datetime.now()

-     # Perform detection
-     detection_result = detector.detect_hate_speech(text)

-     # Generate counter-narrative
-     counter_narrative = detector.generate_counter_narrative(text, detection_result)

-     # Calculate processing time
-     processing_time = (datetime.now() - start_time).total_seconds()
-
-     # Format results for display
-     status = detection_result["status"]
-     all_scores = detection_result["all_scores"]

-     # Add processing info
-     info_text = f"⏱️ Processed in {processing_time:.3f} seconds | Risk Level: {detection_result['risk_level']}"

-     return status, all_scores, counter_narrative, info_text

- def get_model_details():
-     """Get model information for display"""
-     return detector.get_model_info()

# Create the Gradio interface
with gr.Blocks(
-     title="DistilBERT Hate Speech Detection & Counter-Narrative Generator",
    theme=gr.themes.Soft(),
    css="""
    .gradio-container {
        max-width: 1400px !important;
    }
-     .status-box {
        padding: 1rem;
        border-radius: 8px;
-         margin: 0.5rem 0;
    }
    """
) as demo:

    gr.Markdown("""
-     # 🛡️ DistilBERT Hate Speech Detection & Counter-Narrative Generator

-     **Advanced AI Agent System for Content Moderation & Education**

-     🤖 **Powered by Fine-tuned DistilBERT** - Efficient and accurate hate speech detection
-     📚 **Educational Counter-Narratives** - AI-generated constructive responses
-     ⚡ **Real-time Processing** - Fast analysis with detailed confidence scores
-     🎯 **Multi-level Risk Assessment** - Nuanced understanding of content severity
    """)

-     with gr.Tab("🔍 Text Analysis"):
        with gr.Row():
            with gr.Column(scale=2):
                text_input = gr.Textbox(
-                     label="Enter text to analyze",
-                     placeholder="Type or paste text here for hate speech analysis...",
                    lines=5,
                    max_lines=15
                )

                with gr.Row():
-                     analyze_btn = gr.Button("🔍 Analyze Text", variant="primary", size="lg")
                    clear_btn = gr.Button("🗑️ Clear All", variant="secondary")

                gr.Examples(
                    examples=[
-                         ["This is a wonderful day to learn something new!"],
-                         ["I respectfully disagree with that policy, but I understand your perspective."],
-                         ["The team did an excellent job on this project. Well done everyone!"],
-                         ["Thank you for sharing your thoughts. Let's discuss this constructively."],
-                         ["That restaurant has amazing food and great service!"],
-                         ["I appreciate you taking the time to explain your viewpoint."]
                    ],
                    inputs=text_input,
-                     label="📝 Try these positive examples:"
                )

        with gr.Row():
-             with gr.Column(scale=1):
-                 status_output = gr.Textbox(
-                     label="🎯 Detection Result",
                    interactive=False,
                    lines=3
                )

-                 processing_info = gr.Textbox(
-                     label="ℹ️ Processing Info",
                    interactive=False,
-                     lines=1
                )

-             with gr.Column(scale=1):
-                 counter_narrative_output = gr.Textbox(
-                     label="💡 Educational Counter-Narrative",
                    interactive=False,
-                     lines=4
                )

        with gr.Row():
-             scores_output = gr.JSON(
-                 label="📊 Detailed Confidence Scores",
                visible=True
            )

-     with gr.Tab("🔧 Model Information"):
        with gr.Row():
            with gr.Column():
-                 gr.Markdown("## 🤖 Model Details")
-                 model_info = gr.JSON(
-                     label="Model Configuration",
-                     value=get_model_details()
-                 )

            with gr.Column():
                gr.Markdown("""
-                 ## 📈 Performance Characteristics

-                 **DistilBERT Advantages:**
-                 - **Fast Processing**: 60% smaller than BERT
-                 - 🎯 **High Accuracy**: Retains 97% of BERT's performance
-                 - 💾 **Memory Efficient**: Lower computational requirements
-                 - 🔄 **Real-time Ready**: Suitable for production deployment

-                 **Fine-tuning Benefits:**
-                 - 🎯 **Domain-Specific**: Trained on hate speech datasets
-                 - 📊 **Balanced Performance**: Optimized precision-recall balance
-                 - 🔍 **Context-Aware**: Understanding of nuanced language patterns
                """)
-
-     with gr.Tab("📋 About & Usage"):
        gr.Markdown("""
-         ## 🎯 System Overview

-         This demonstration showcases an advanced AI agent system combining:

-         ### 🤖 AI Agent Architecture
-         1. **Detection Agent**: Fine-tuned DistilBERT classifier
-         2. **Analysis Agent**: Risk assessment and confidence scoring
-         3. **Counter-Narrative Agent**: Educational response generation
-         4. **Monitoring Agent**: Performance tracking and logging

-         ### 🛡️ Content Moderation Pipeline
        ```
-         Input Text → Preprocessing → DistilBERT Analysis → Risk Assessment → Counter-Narrative Generation → Results
        ```

-         ### 📊 Risk Level Classification
-         - **🚨 High Risk (>80% confidence)**: Clear hate speech detection
-         - **⚠️ Medium Risk (60-80%)**: Potentially harmful content
-         - **⚡ Low-Medium Risk (40-60%)**: Uncertain classification
-         - **✅ Low Risk (<40%)**: Safe content
-
-         ## 🔧 Technical Implementation
-
-         **Model Architecture:**
-         - Base: DistilBERT (distilbert-base-uncased)
-         - Task: Sequence Classification
-         - Parameters: ~66M (vs. 110M for BERT-base)
-         - Max Sequence Length: 512 tokens
-
-         **Key Features:**
-         - Real-time inference with <1 second response time
-         - Confidence-based risk assessment
-         - Educational counter-narrative generation
-         - Comprehensive error handling and fallbacks
-
-         ## ⚠️ Important Disclaimers
-
-         - 🔬 **Research Demonstration**: Not ready for production without additional safeguards
-         - 👥 **Human Oversight Required**: AI should supplement, not replace, human moderation
-         - ⚖️ **Bias Awareness**: Models can reflect biases present in training data
-         - 🔒 **Privacy Conscious**: No data is stored or logged from this demo
-         - 🌍 **Context Matters**: Cultural and contextual factors affect interpretation
-
-         ## 🚀 Potential Applications
-
-         - **Social Media Platforms**: Automated content moderation
-         - **Educational Tools**: Teaching about respectful communication
-         - **Community Forums**: Maintaining healthy discussions
-         - **Content Creation**: Writing assistance for inclusive language
-         - **Research**: Studying patterns in online discourse
-
-         ## 📞 Feedback & Development
-
-         This demo represents current work in AI-powered content moderation.
-         For production deployment, additional considerations include:
-         - Continuous model updates and retraining
-         - Human review workflows
-         - Appeal and correction mechanisms
-         - Cross-cultural validation
-         - Regulatory compliance
        """)

    # Event handlers
    analyze_btn.click(
-         fn=analyze_text,
        inputs=text_input,
-         outputs=[status_output, scores_output, counter_narrative_output, processing_info]
    )

    clear_btn.click(
        fn=lambda: ("", "", "", "", {}),
-         outputs=[text_input, status_output, counter_narrative_output, processing_info, scores_output]
    )

# Launch configuration
 
@@ -5,104 +5,183 @@ import numpy as np
import json
from datetime import datetime
import logging
+ import os

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

+ class PromptBasedMultiAgentSystem:
    def __init__(self):
+         self.detection_agent = None
+         self.counter_speech_agent = None
+         self.moderation_agent = None
+         self.sentiment_agent = None
+
+         # Load prompt configurations
+         self.counter_speech_prompts = self.load_prompts("counter_speech_prompts.json")
+         self.moderation_prompts = self.load_prompts("moderation_prompts.json")
+
+         self.initialize_agents()

+     def load_prompts(self, filename):
+         """Load prompts from JSON file with fallback"""
        try:
+             if os.path.exists(filename):
+                 with open(filename, 'r', encoding='utf-8') as f:
+                     return json.load(f)
+             else:
+                 logger.warning(f"Prompt file {filename} not found, using built-in prompts")
+                 return self.get_default_prompts(filename)
+         except Exception as e:
+             logger.error(f"Error loading prompts from {filename}: {e}")
+             return self.get_default_prompts(filename)
+
+     def get_default_prompts(self, filename):
+         """Default prompts as fallback"""
+         if "counter_speech" in filename:
+             return {
+                 "counter_speech_prompts": {
+                     "high_risk": {
+                         "system_prompt": "You are an expert educator specializing in counter-speech and conflict de-escalation.",
+                         "user_prompt_template": "Generate a respectful, educational counter-speech response to address harmful content while promoting understanding. Original text (Risk: {risk_level}, Confidence: {confidence}%, Sentiment: {sentiment}): \"{original_text}\"\n\nCounter-speech response:",
+                     },
+                     "general_template": {
+                         "fallback_responses": [
+                             "Thank you for sharing your thoughts. Building strong communities works best when we focus on shared values and constructive dialogue. How might we work together on the concerns you've raised?",
+                             "I appreciate your perspective. Sometimes our strongest feelings can be expressed in ways that bring people together. What specific positive changes would you like to see?",
+                             "Your engagement with this topic is clear. When we channel that energy into inclusive dialogue, we often find solutions that work for everyone."
+                         ]
+                     }
+                 }
+             }
+         else:
+             return {
+                 "moderation_prompts": {
+                     "comprehensive_analysis": {
+                         "system_prompt": "You are an expert content moderation specialist analyzing text for safety and compliance.",
+                         "user_prompt_template": "Analyze this text for potential violations: \"{text}\"\n\nProvide: 1) Safety assessment 2) Violation categories 3) Severity level 4) Confidence score 5) Recommended action\n\nAnalysis:",
+                     }
+                 }
+             }
+
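For context, a minimal sketch of the file contract `load_prompts` expects; the JSON content here is illustrative and mirrors the built-in defaults above:

```python
import json

# Hypothetical round-trip of counter_speech_prompts.json: write a minimal
# file, read it back the way load_prompts does, and fill the template.
prompts = {
    "counter_speech_prompts": {
        "high_risk": {
            "system_prompt": "You are an expert educator...",
            "user_prompt_template": 'Respond to: "{original_text}"'
        }
    }
}
with open("counter_speech_prompts.json", "w", encoding="utf-8") as f:
    json.dump(prompts, f, indent=2)

with open("counter_speech_prompts.json", "r", encoding="utf-8") as f:
    loaded = json.load(f)

template = loaded["counter_speech_prompts"]["high_risk"]["user_prompt_template"]
print(template.format(original_text="example input"))
```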
+     def initialize_agents(self):
+         """Initialize all AI agents"""
+         logger.info("🤖 Initializing Prompt-Based Multi-Agent System...")
+
+         self.setup_detection_agent()
+         self.setup_counter_speech_agent()
+         self.setup_moderation_agent()
+         self.setup_sentiment_agent()
+
+         logger.info("✅ All agents initialized successfully!")
+
+     def setup_detection_agent(self):
+         """Initialize the hate speech detection agent"""
+         try:
+             logger.info("🔍 Loading Detection Agent (Fine-tuned DistilBERT)...")
            model_path = "./model"

+             tokenizer = AutoTokenizer.from_pretrained(model_path)
+             model = AutoModelForSequenceClassification.from_pretrained(
                model_path,
+                 torch_dtype=torch.float32
            )

+             self.detection_agent = pipeline(
                "text-classification",
+                 model=model,
+                 tokenizer=tokenizer,
                return_all_scores=True,
                device=0 if torch.cuda.is_available() else -1
            )
+             logger.info("✅ Detection Agent loaded successfully")

+         except Exception as e:
+             logger.error(f"❌ Detection Agent failed: {e}")
+             logger.info("🔄 Using fallback detection model...")
+             self.detection_agent = pipeline(
+                 "text-classification",
+                 model="unitary/toxic-bert",
+                 return_all_scores=True
+             )
+
+     def setup_counter_speech_agent(self):
+         """Initialize counter-speech generation agent with prompts"""
+         try:
+             logger.info("💬 Loading Counter-Speech Agent with Custom Prompts...")
+
+             # Using FLAN-T5 which is excellent at following instructions
+             self.counter_speech_agent = pipeline(
+                 "text2text-generation",
+                 model="google/flan-t5-base",
+                 max_length=200,
+                 do_sample=True,
+                 temperature=0.7,
+                 top_p=0.9,
+                 device=0 if torch.cuda.is_available() else -1
+             )
+             logger.info("✅ Counter-Speech Agent loaded (FLAN-T5 with custom prompts)")

        except Exception as e:
+             logger.error(f"❌ Counter-Speech Agent failed: {e}")
+             self.counter_speech_agent = None
+
+     def setup_moderation_agent(self):
+         """Initialize content moderation agent with prompts"""
+         try:
+             logger.info("🛡️ Loading Moderation Agent with Custom Prompts...")

+             # Using FLAN-T5 for structured moderation analysis
+             self.moderation_agent = pipeline(
+                 "text2text-generation",
+                 model="google/flan-t5-base",
+                 max_length=300,
+                 do_sample=False,
+                 device=0 if torch.cuda.is_available() else -1
+             )
+             logger.info("✅ Moderation Agent loaded (FLAN-T5 with analysis prompts)")
+
+         except Exception as e:
+             logger.error(f"❌ Moderation Agent failed: {e}")
+             self.moderation_agent = None

+     def setup_sentiment_agent(self):
+         """Initialize sentiment analysis agent"""
+         try:
+             logger.info("📊 Loading Sentiment Agent...")
+
+             self.sentiment_agent = pipeline(
+                 "sentiment-analysis",
+                 model="cardiffnlp/twitter-roberta-base-sentiment-latest",
+                 return_all_scores=True,
+                 device=0 if torch.cuda.is_available() else -1
+             )
+             logger.info("✅ Sentiment Agent loaded (Twitter-RoBERTa)")
+
+         except Exception as e:
+             logger.error(f"❌ Sentiment Agent failed: {e}")
+             self.sentiment_agent = None
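A standalone sketch of how the `text2text-generation` pipeline configured above consumes a composed system-plus-user prompt; the prompt text is made up:

```python
from transformers import pipeline

# Illustrative one-off call mirroring the Counter-Speech Agent setup.
generator = pipeline("text2text-generation", model="google/flan-t5-base")

system_prompt = "You are an expert educator specializing in counter-speech."
user_prompt = 'Generate a respectful counter-speech response to: "example text"'

result = generator(
    f"{system_prompt}\n\n{user_prompt}",
    max_length=150,
    do_sample=True,
    temperature=0.7,
)
print(result[0]["generated_text"])
```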
 
    def detect_hate_speech(self, text):
+         """Detection Agent: Analyze text for hate speech"""
        if not text or not text.strip():
            return {
                "status": "❌ Please enter some text to analyze.",
                "prediction": "No input",
                "confidence": 0.0,
                "all_scores": {},
+                 "risk_level": "Unknown",
+                 "is_hate_speech": False
            }

        try:
+             results = self.detection_agent(text.strip())

            if isinstance(results, list) and len(results) > 0:
                if isinstance(results[0], list):
                    results = results[0]

            all_scores = {}
            max_score = 0
            predicted_label = "UNKNOWN"

@@ -122,7 +201,6 @@ class DistilBERTHateSpeechDetector:

            # Determine hate speech status
            hate_keywords = ["HATE", "TOXIC", "NEGATIVE", "HARMFUL", "1", "LABEL_1"]

            is_hate_speech = False
            risk_level = "Low"

@@ -153,274 +231,553 @@ class DistilBERTHateSpeechDetector:
            }

        except Exception as e:
+             logger.error(f"Detection error: {e}")
            return {
+                 "status": f"❌ Detection error: {str(e)}",
                "prediction": "Error",
                "confidence": 0.0,
                "all_scores": {},
+                 "risk_level": "Unknown",
+                 "is_hate_speech": False
            }
 
+     def analyze_sentiment(self, text):
+         """Sentiment Agent: Analyze emotional tone"""
+         if not self.sentiment_agent or not text.strip():
+             return {"sentiment": "neutral", "confidence": 0.0}

+         try:
+             results = self.sentiment_agent(text.strip())
+             if isinstance(results, list) and len(results) > 0:
+                 if isinstance(results[0], list):
+                     results = results[0]
+
+             best_sentiment = max(results, key=lambda x: x['score'])
+             return {
+                 "sentiment": best_sentiment['label'].lower(),
+                 "confidence": best_sentiment['score'],
+                 "all_sentiments": {r['label']: r['score'] for r in results}
+             }
+         except Exception as e:
+             logger.error(f"Sentiment analysis error: {e}")
+             return {"sentiment": "neutral", "confidence": 0.0}
+
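The per-class output shape `analyze_sentiment` relies on can be reproduced with the same public checkpoint; the printed values are examples only:

```python
from transformers import pipeline

# Illustrative only: the cardiffnlp model returns one {label, score} dict
# per sentiment class when all scores are requested.
sentiment = pipeline(
    "sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment-latest",
    return_all_scores=True,
)
results = sentiment("I love collaborating with this team!")[0]
best = max(results, key=lambda x: x["score"])
print(best["label"].lower(), round(best["score"], 3))
# e.g. positive 0.98
```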
+     def moderate_content_with_prompts(self, text, detection_result, sentiment_result):
+         """Moderation Agent: Structured analysis using prompts"""
+         if not self.moderation_agent or not text.strip():
+             return {"analysis": "Unable to perform moderation analysis", "confidence": 0.0}

+         try:
+             # Get the appropriate moderation prompt
+             moderation_config = self.moderation_prompts.get("moderation_prompts", {})
+             analysis_config = moderation_config.get("comprehensive_analysis", {})
+
+             # Construct the analysis prompt
+             system_prompt = analysis_config.get("system_prompt", "Analyze this text for safety concerns.")
+             user_prompt_template = analysis_config.get("user_prompt_template", "Analyze: {text}")
+
+             # Fill in the template
+             full_prompt = f"{system_prompt}\n\n{user_prompt_template.format(text=text)}"
+
+             # Generate analysis
+             result = self.moderation_agent(full_prompt, max_length=250, do_sample=False)
+
+             if result and len(result) > 0:
+                 analysis_text = result[0]['generated_text']
+
+                 # Parse the analysis for key information
+                 confidence = self.extract_confidence_from_analysis(analysis_text)
+                 safety_level = self.extract_safety_level_from_analysis(analysis_text)
+
+                 return {
+                     "analysis": analysis_text,
+                     "confidence": confidence,
+                     "safety_level": safety_level,
+                     "prompt_used": "comprehensive_analysis"
+                 }
+
+         except Exception as e:
+             logger.error(f"Moderation analysis error: {e}")
+
+         # Fallback analysis
+         return {
+             "analysis": f"Basic assessment: Risk level {detection_result.get('risk_level', 'unknown')}, requires review if confidence > 70%",
+             "confidence": detection_result.get('confidence', 0.0),
+             "safety_level": "review_needed" if detection_result.get('confidence', 0) > 0.7 else "acceptable"
+         }
+
+     def generate_counter_speech_with_prompts(self, text, detection_result, sentiment_result):
+         """Counter-Speech Agent: Generate response using custom prompts"""
+         if not detection_result.get("is_hate_speech", False):
+             return "✨ This text promotes positive communication. Great job maintaining respectful dialogue!"

+         risk_level = detection_result.get("risk_level", "Low").lower()
+         confidence = detection_result.get("confidence", 0.0) * 100
+         sentiment = sentiment_result.get("sentiment", "neutral")

+         # Get appropriate prompts based on risk level
+         counter_speech_config = self.counter_speech_prompts.get("counter_speech_prompts", {})
+
+         # Select prompt based on risk level
+         if risk_level == "high":
+             prompt_config = counter_speech_config.get("high_risk", {})
+         elif risk_level == "medium":
+             prompt_config = counter_speech_config.get("medium_risk", {})
        else:
+             prompt_config = counter_speech_config.get("low_risk", {})
+
+         # If no specific config, use general template
+         if not prompt_config:
+             prompt_config = counter_speech_config.get("general_template", {})
+
+         if self.counter_speech_agent and prompt_config:
+             try:
+                 # Construct the prompt
+                 system_prompt = prompt_config.get("system_prompt", "Generate a respectful counter-speech response.")
+                 user_prompt_template = prompt_config.get("user_prompt_template",
+                     "Generate a counter-speech response for: {original_text}")
+
+                 # Fill in the template
+                 full_prompt = f"{system_prompt}\n\n{user_prompt_template.format(original_text=text, risk_level=risk_level, confidence=confidence, sentiment=sentiment)}"
+
+                 # Generate counter-speech
+                 result = self.counter_speech_agent(full_prompt, max_length=150, do_sample=True, temperature=0.7)
+
+                 if result and len(result) > 0:
+                     generated_text = result[0]['generated_text']
+
+                     # Clean up the response
+                     if "Counter-speech response:" in generated_text:
+                         generated_text = generated_text.split("Counter-speech response:")[-1].strip()
+                     elif "response:" in generated_text.lower():
+                         parts = generated_text.lower().split("response:")
+                         if len(parts) > 1:
+                             generated_text = parts[-1].strip()
+
+                     return f"🤖 **AI-Generated Counter-Speech** ({risk_level.title()} Risk): {generated_text}"
+
+             except Exception as e:
+                 logger.error(f"Counter-speech generation error: {e}")
+
+         # Fallback to template responses
+         fallback_responses = counter_speech_config.get("general_template", {}).get("fallback_responses", [
+             "Thank you for sharing your thoughts. Building strong communities works best when we focus on shared values and constructive dialogue."
+         ])

        import random
+         return f"📝 **Template Response** ({risk_level.title()} Risk): {random.choice(fallback_responses)}"
 
+     def extract_confidence_from_analysis(self, analysis_text):
+         """Extract confidence score from moderation analysis"""
+         import re
+         # Look for confidence patterns like "85%" or "confidence: 0.85"
+         patterns = [
+             r'(\d+)%',
+             r'confidence[:\s]+(\d*\.?\d+)',
+             r'(\d*\.?\d+)\s*confidence'
+         ]
+
+         for pattern in patterns:
+             match = re.search(pattern, analysis_text.lower())
+             if match:
+                 value = float(match.group(1))
+                 return value / 100 if value > 1 else value
+
+         return 0.5  # Default moderate confidence
+
+     def extract_safety_level_from_analysis(self, analysis_text):
+         """Extract safety assessment from moderation analysis"""
+         analysis_lower = analysis_text.lower()
+
+         if any(word in analysis_lower for word in ['harmful', 'high risk', 'remove', 'violation']):
+             return "harmful"
+         elif any(word in analysis_lower for word in ['concerning', 'medium risk', 'review', 'warning']):
+             return "concerning"
+         elif any(word in analysis_lower for word in ['safe', 'low risk', 'acceptable', 'approve']):
+             return "safe"
+         else:
+             return "review_needed"
+
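The confidence-extraction heuristic above can be spot-checked in isolation; these analysis strings are made up:

```python
import re

# Same patterns as extract_confidence_from_analysis, tested standalone.
patterns = [r'(\d+)%', r'confidence[:\s]+(\d*\.?\d+)', r'(\d*\.?\d+)\s*confidence']

def extract_confidence(analysis_text):
    for pattern in patterns:
        match = re.search(pattern, analysis_text.lower())
        if match:
            value = float(match.group(1))
            return value / 100 if value > 1 else value
    return 0.5  # default moderate confidence

print(extract_confidence("Severity: HIGH, confidence: 85%"))   # 0.85
print(extract_confidence("confidence: 0.72, action: review"))  # 0.72
print(extract_confidence("no numeric signal here"))            # 0.5
```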
+     def comprehensive_analysis(self, text):
+         """Run all agents with prompt-based analysis"""
+         start_time = datetime.now()
+
+         # Run core agents
+         detection_result = self.detect_hate_speech(text)
+         sentiment_result = self.analyze_sentiment(text)
+
+         # Run prompt-based agents
+         moderation_result = self.moderate_content_with_prompts(text, detection_result, sentiment_result)
+         counter_speech = self.generate_counter_speech_with_prompts(text, detection_result, sentiment_result)
+
+         processing_time = (datetime.now() - start_time).total_seconds()
+
+         return {
+             "detection": detection_result,
+             "sentiment": sentiment_result,
+             "moderation": moderation_result,
+             "counter_speech": counter_speech,
+             "processing_time": processing_time,
+             "timestamp": datetime.now().isoformat()
+         }
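A hypothetical driver for `comprehensive_analysis`, assuming the class above initializes successfully; the commented values are examples only:

```python
# Illustrative usage of the aggregate result dictionary returned above.
system = PromptBasedMultiAgentSystem()
report = system.comprehensive_analysis("Let's build a more inclusive community.")

print(report["detection"]["risk_level"])          # e.g. "Low"
print(report["sentiment"]["sentiment"])           # e.g. "positive"
print(report["moderation"].get("safety_level"))   # e.g. "safe"
print(f'{report["processing_time"]:.3f}s')        # wall-clock time across agents
```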
 
+ # Initialize the system
+ logger.info("🚀 Starting Prompt-Based Multi-Agent System...")
+ agent_system = PromptBasedMultiAgentSystem()

+ def analyze_text_with_prompts(text):
+     """Main analysis function using prompt-based agents"""
+     if not text or not text.strip():
+         return (
+             "❌ Please enter some text to analyze.",
+             {},
+             "No analysis performed.",
+             "No input provided",
+             {}
+         )

+     # Run comprehensive analysis with prompts
+     results = agent_system.comprehensive_analysis(text)

+     # Extract results for display
+     detection_status = results["detection"]["status"]
+     detection_scores = results["detection"]["all_scores"]
+     counter_speech = results["counter_speech"]

+     # Create detailed agent summary
+     agent_summary = f"""
+ 🔍 **Detection Agent**: {results['detection']['risk_level']} risk ({results['detection']['confidence']:.2%} confidence)
+ 📊 **Sentiment Agent**: {results['sentiment']['sentiment'].title()} ({results['sentiment']['confidence']:.2%} confidence)
+ 🛡️ **Moderation Agent**: {results['moderation'].get('safety_level', 'unknown').title()} safety level ({results['moderation'].get('confidence', 0):.2%} confidence)
+ 💬 **Counter-Speech Agent**: {"Custom prompt-based" if "AI-Generated" in counter_speech else "Template-based"} response
+ ⏱️ **Processing Time**: {results['processing_time']:.3f} seconds
+
+ 📋 **Moderation Analysis**: {results['moderation'].get('analysis', 'No detailed analysis available')[:200]}...
+ """

+     # Compile comprehensive agent data
+     all_agent_data = {
+         "Detection_Analysis": {
+             "scores": detection_scores,
+             "risk_level": results['detection']['risk_level'],
+             "is_hate_speech": results['detection']['is_hate_speech']
+         },
+         "Sentiment_Analysis": {
+             "primary_sentiment": results['sentiment']['sentiment'],
+             "all_sentiments": results["sentiment"].get("all_sentiments", {})
+         },
+         "Moderation_Analysis": {
+             "safety_assessment": results['moderation'].get('safety_level', 'unknown'),
+             "detailed_analysis": results['moderation'].get('analysis', ''),
+             "confidence": results['moderation'].get('confidence', 0),
+             "prompt_used": results['moderation'].get('prompt_used', 'fallback')
+         },
+         "Counter_Speech": {
+             "response": counter_speech,
+             "generation_method": "AI-Generated" if "AI-Generated" in counter_speech else "Template-based"
+         },
+         "System_Info": {
+             "timestamp": results["timestamp"],
+             "processing_time_seconds": results["processing_time"],
+             "prompt_files_loaded": {
+                 "counter_speech": bool(agent_system.counter_speech_prompts),
+                 "moderation": bool(agent_system.moderation_prompts)
+             }
+         }
+     }

+     return detection_status, detection_scores, counter_speech, agent_summary, all_agent_data

+ def reload_prompts():
+     """Reload prompt files for testing"""
+     try:
+         agent_system.counter_speech_prompts = agent_system.load_prompts("counter_speech_prompts.json")
+         agent_system.moderation_prompts = agent_system.load_prompts("moderation_prompts.json")
+         return "✅ Prompts reloaded successfully!"
+     except Exception as e:
+         return f"❌ Error reloading prompts: {e}"
+
+ def get_prompt_info():
+     """Get information about loaded prompts"""
+     counter_prompts = len(agent_system.counter_speech_prompts.get("counter_speech_prompts", {}))
+     moderation_prompts = len(agent_system.moderation_prompts.get("moderation_prompts", {}))
+
+     return {
+         "counter_speech_prompt_categories": counter_prompts,
+         "moderation_prompt_categories": moderation_prompts,
+         "prompt_files_status": {
+             "counter_speech_prompts.json": "✅ Loaded" if counter_prompts > 0 else "❌ Not found",
+             "moderation_prompts.json": "✅ Loaded" if moderation_prompts > 0 else "❌ Not found"
+         }
+     }
 
# Create the Gradio interface
with gr.Blocks(
+     title="Prompt-Based Multi-Agent Hate Speech Detection System",
    theme=gr.themes.Soft(),
    css="""
    .gradio-container {
        max-width: 1400px !important;
    }
+     .prompt-info {
+         background: linear-gradient(90deg, #f0f9ff 0%, #e0f2fe 100%);
        padding: 1rem;
        border-radius: 8px;
+         border-left: 4px solid #0284c7;
+     }
+     .agent-summary {
+         background: linear-gradient(90deg, #fefce8 0%, #fef3c7 100%);
+         padding: 1rem;
+         border-radius: 8px;
+         border-left: 4px solid #f59e0b;
    }
    """
) as demo:

    gr.Markdown("""
+     # 🤖 Prompt-Based Multi-Agent Hate Speech Detection System
+
+     **Advanced AI Agent Collaboration with Custom Prompts**

+     🔍 **Detection Agent** - Your fine-tuned DistilBERT model
+     💬 **Counter-Speech Agent** - FLAN-T5 with custom prompt engineering
+     🛡️ **Moderation Agent** - Structured analysis using specialized prompts
+     📊 **Sentiment Agent** - Twitter-RoBERTa for emotional context

+     *Each agent uses carefully crafted prompts from external JSON files for optimal performance.*
    """)

+     with gr.Tab("🤖 Prompt-Based Analysis"):
        with gr.Row():
            with gr.Column(scale=2):
                text_input = gr.Textbox(
+                     label="Enter text for comprehensive prompt-based analysis",
+                     placeholder="Enter text here to see how prompt-engineered AI agents collaborate...",
                    lines=5,
                    max_lines=15
                )

                with gr.Row():
+                     analyze_btn = gr.Button("🚀 Run Prompt-Based Analysis", variant="primary", size="lg")
                    clear_btn = gr.Button("🗑️ Clear All", variant="secondary")
+                     reload_btn = gr.Button("🔄 Reload Prompts", variant="secondary")

                gr.Examples(
                    examples=[
+                         ["This is a wonderful day to collaborate and learn from each other!"],
+                         ["I appreciate everyone's different perspectives and backgrounds."],
+                         ["Let's work together to build a more inclusive community."],
+                         ["Thank you for sharing your experience. I'd love to understand your viewpoint better."],
+                         ["The diversity in our group makes our discussions much richer and more meaningful."],
+                         ["I respectfully disagree, but I value your right to express your opinion."]
                    ],
                    inputs=text_input,
+                     label="📝 Try these examples with prompt-based agents:"
                )

        with gr.Row():
+             with gr.Column():
+                 detection_output = gr.Textbox(
+                     label="🎯 Primary Detection Result",
                    interactive=False,
                    lines=3
                )

+                 agent_summary = gr.Textbox(
+                     label="🤖 Prompt-Based Agent Summary",
                    interactive=False,
+                     lines=8,
+                     elem_classes=["agent-summary"]
                )

+             with gr.Column():
+                 counter_speech_output = gr.Textbox(
+                     label="💬 Prompt-Generated Counter-Speech",
                    interactive=False,
+                     lines=6
+                 )
+
+                 reload_status = gr.Textbox(
+                     label="🔄 Prompt Reload Status",
+                     interactive=False,
+                     lines=2
                )

        with gr.Row():
+             all_agents_output = gr.JSON(
+                 label="📊 Complete Prompt-Based Multi-Agent Analysis",
                visible=True
            )

+     with gr.Tab("📝 Prompt Management"):
        with gr.Row():
            with gr.Column():
+                 gr.Markdown("""
+                 ## 📝 Counter-Speech Prompts
+
+                 The system uses specialized prompts for different risk levels:
+
+                 ### 🚨 High Risk Prompts
+                 - **Purpose**: Address clear hate speech with educational responses
+                 - **Tone**: Firm but respectful, educational focus
+                 - **Length**: 50-100 words
+                 - **Goal**: De-escalation and education
+
+                 ### ⚠️ Medium Risk Prompts
+                 - **Purpose**: Handle potentially problematic content
+                 - **Tone**: Gentle guidance, supportive
+                 - **Length**: 40-80 words
+                 - **Goal**: Reflection and improvement
+
+                 ### ⚡ Low Risk Prompts
+                 - **Purpose**: Encourage even better communication
+                 - **Tone**: Positive reinforcement
+                 - **Length**: 30-60 words
+                 - **Goal**: Enhancement and encouragement
+                 """)

            with gr.Column():
                gr.Markdown("""
+                 ## 🛡️ Moderation Prompts
+
+                 Structured analysis prompts for comprehensive assessment:

+                 ### 🔍 Comprehensive Analysis
+                 - **Safety Assessment**: SAFE/CONCERNING/HARMFUL
+                 - **Violation Categories**: Specific policy areas
+                 - **Severity Levels**: LOW/MEDIUM/HIGH
+                 - **Confidence Scoring**: 0-100% certainty
+                 - **Contextual Factors**: Cultural and situational

+                 ### 📊 Specialized Analysis Types
+                 - **Hate Speech Focus**: Protected group targeting
+                 - **Toxicity Assessment**: Discourse quality impact
+                 - **Context Analysis**: Cultural and situational factors
+                 - **Action Recommendations**: Specific moderation steps
                """)
+
+         with gr.Row():
+             prompt_info_output = gr.JSON(
+                 label="📋 Current Prompt Configuration",
+                 value=get_prompt_info()
+             )
+
        gr.Markdown("""
+         ## 📁 Prompt File Structure
+
+         To customize the system's behavior, create these JSON files:
+
+         ### `counter_speech_prompts.json`
+         ```json
+         {
+             "counter_speech_prompts": {
+                 "high_risk": {
+                     "system_prompt": "You are an expert educator...",
+                     "user_prompt_template": "Generate response for: {original_text}..."
+                 }
+             }
+         }
+         ```

+         ### `moderation_prompts.json`
+         ```json
+         {
+             "moderation_prompts": {
+                 "comprehensive_analysis": {
+                     "system_prompt": "You are a content moderation expert...",
+                     "user_prompt_template": "Analyze: {text}..."
+                 }
+             }
+         }
+         ```

+         **Benefits of External Prompts:**
+         - 🎯 **Fine-tuned control** over agent behavior
+         - 🔄 **Easy iteration** without code changes
+         - 📊 **A/B testing** of different prompt strategies
+         - 🎨 **Domain-specific customization** for different platforms
+         - 📈 **Performance optimization** through prompt engineering
+         """)
+
+     with gr.Tab("🔧 System Architecture"):
+         gr.Markdown("""
+         ## 🏗️ Prompt-Based Agent Architecture

+         ### 🔄 Agent Collaboration Flow
        ```
+         Input Text
+         ├── Detection Agent → Risk Classification (DistilBERT)
+         ├── Sentiment Agent → Emotional Context (RoBERTa)
+         ├── Moderation Agent → Structured Analysis (FLAN-T5 + Prompts)
+         └── Counter-Speech Agent → Educational Response (FLAN-T5 + Prompts)
+
+         The counter-speech step uses custom prompts and the outputs of all other agents
        ```

+         ### 📝 Prompt Engineering Advantages
+
+         #### 🎯 **Precision Control**
+         - **Task-specific instructions** for each scenario
+         - **Tone and style guidelines** for appropriate responses
+         - **Length and format specifications** for consistency
+         - **Context integration** from multiple agent outputs
+
+         #### 🔄 **Iterative Improvement**
+         - **Hot-swappable prompts** without system restart
+         - **A/B testing capabilities** for prompt effectiveness
+         - **Performance metrics** tracking for optimization
+         - **Domain adaptation** for different use cases
+
+         #### 🛡️ **Quality Assurance**
+         - **Bias mitigation** through careful prompt design
+         - **Safety guardrails** built into prompt structure
+         - **Consistency enforcement** across all responses
+         - **Cultural sensitivity** considerations
+
+         ### 🚀 Production Benefits
+
+         - **🎨 Customizable**: Adapt to different platforms and communities
+         - **📈 Scalable**: Easy to add new prompt categories
+         - **🔧 Maintainable**: Update behavior without code deployment
+         - **📊 Measurable**: Track prompt performance and effectiveness
+         - **🌍 Localizable**: Different prompts for different regions/cultures
+
+         ### ⚠️ Deployment Considerations
+
+         #### 🔒 Security
+         - **Prompt injection protection** for user inputs
+         - **Content filtering** on generated responses
+         - **Rate limiting** to prevent abuse
+         - **Audit logging** for compliance
+
+         #### 📊 Monitoring
+         - **Response quality metrics** tracking
+         - **User feedback integration** for continuous improvement
+         - **Error rate monitoring** across different prompt types
+         - **Performance benchmarking** against baseline models
+
+         #### 👥 Human Oversight
+         - **Expert review processes** for prompt updates
+         - **Community feedback loops** for prompt effectiveness
+         - **Escalation pathways** for edge cases
+         - **Regular bias audits** and prompt refinement
        """)

    # Event handlers
    analyze_btn.click(
+         fn=analyze_text_with_prompts,
        inputs=text_input,
+         outputs=[detection_output, all_agents_output, counter_speech_output, agent_summary, all_agents_output]
    )

    clear_btn.click(
        fn=lambda: ("", "", "", "", {}),
+         outputs=[text_input, detection_output, counter_speech_output, agent_summary, all_agents_output]
+     )
+
+     reload_btn.click(
+         fn=reload_prompts,
+         outputs=reload_status
    )

# Launch configuration
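The diff view ends at the launch configuration. A typical launch block for a Gradio Space looks roughly like the following; this is an assumption, not taken from this commit:

```python
# Hypothetical launch block; queue() and these server settings are common
# Hugging Face Spaces defaults, not values from this commit.
if __name__ == "__main__":
    demo.queue()
    demo.launch(server_name="0.0.0.0", server_port=7860)
```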