Ashokdll committed on
Commit 8cdb327 · verified · 1 Parent(s): 8fce1f4

Update app.py

Files changed (1): app.py +590 -233
app.py CHANGED
@@ -5,104 +5,183 @@ import numpy as np
import json
from datetime import datetime
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

- class DistilBERTHateSpeechDetector:
    def __init__(self):
-         self.model = None
-         self.tokenizer = None
-         self.classifier = None
-         self.load_model()

-     def load_model(self):
-         """Load the fine-tuned DistilBERT model"""
        try:
-             logger.info("Loading DistilBERT hate speech detection model...")
-
-             # Try to load from local model directory
            model_path = "./model"

-             # Load tokenizer
-             self.tokenizer = AutoTokenizer.from_pretrained(model_path)
-             logger.info("✅ Tokenizer loaded successfully")
-
-             # Load model
-             self.model = AutoModelForSequenceClassification.from_pretrained(
                model_path,
-                 torch_dtype=torch.float32,
-                 device_map="auto"
            )
-             logger.info("✅ DistilBERT model loaded successfully")

-             # Create pipeline
-             self.classifier = pipeline(
                "text-classification",
-                 model=self.model,
-                 tokenizer=self.tokenizer,
                return_all_scores=True,
                device=0 if torch.cuda.is_available() else -1
            )

-             # Get model info
-             logger.info(f"Model architecture: {self.model.config.architectures[0]}")
-             logger.info(f"Number of labels: {self.model.config.num_labels}")
-             logger.info(f"Max sequence length: {self.model.config.max_position_embeddings}")

        except Exception as e:
-             logger.error(f"❌ Error loading custom model: {e}")
-             logger.info("🔄 Falling back to public model...")

-             # Fallback to a public model
-             try:
-                 self.classifier = pipeline(
-                     "text-classification",
-                     model="martin-ha/toxic-comment-model",
-                     return_all_scores=True
-                 )
-                 logger.info("✅ Fallback model loaded")
-             except Exception as fallback_error:
-                 logger.error(f"❌ Fallback model also failed: {fallback_error}")
-                 raise Exception("Failed to load any model")

-     def preprocess_text(self, text):
-         """Preprocess text for better analysis"""
-         if not text or not text.strip():
-             return ""
-
-         # Basic preprocessing
-         text = text.strip()
-         # Remove excessive whitespace
-         text = " ".join(text.split())
-
-         return text

    def detect_hate_speech(self, text):
-         """Detect hate speech with detailed analysis"""
        if not text or not text.strip():
            return {
                "status": "❌ Please enter some text to analyze.",
                "prediction": "No input",
                "confidence": 0.0,
                "all_scores": {},
-                 "risk_level": "Unknown"
            }

        try:
-             # Preprocess text
-             processed_text = self.preprocess_text(text)

-             # Get predictions
-             results = self.classifier(processed_text)
-
-             # Handle different output formats
            if isinstance(results, list) and len(results) > 0:
                if isinstance(results[0], list):
                    results = results[0]

-             # Parse results
            all_scores = {}
            max_score = 0
            predicted_label = "UNKNOWN"
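A quick illustration of why the nested-list check kept above is needed: with `return_all_scores=True`, the Transformers text-classification pipeline wraps each input's scores in an outer list. A minimal sketch with a public checkpoint, not the Space's fine-tuned model:

```python
from transformers import pipeline

# Illustrative only: with return_all_scores=True, one input yields a list
# containing one list of {label, score} dicts, hence the results[0] unwrap.
clf = pipeline(
    "text-classification",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    return_all_scores=True,
)
results = clf("This is a wonderful day!")
if isinstance(results, list) and isinstance(results[0], list):
    results = results[0]
print({r["label"]: round(r["score"], 3) for r in results})
# e.g. {'NEGATIVE': 0.0, 'POSITIVE': 1.0}
```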
@@ -122,7 +201,6 @@ class DistilBERTHateSpeechDetector:

            # Determine hate speech status
            hate_keywords = ["HATE", "TOXIC", "NEGATIVE", "HARMFUL", "1", "LABEL_1"]
-             clean_keywords = ["CLEAN", "NORMAL", "POSITIVE", "SAFE", "0", "LABEL_0"]

            is_hate_speech = False
            risk_level = "Low"
@@ -153,274 +231,553 @@ class DistilBERTHateSpeechDetector:
            }

        except Exception as e:
-             logger.error(f"Analysis error: {e}")
            return {
-                 "status": f"❌ Error during analysis: {str(e)}",
                "prediction": "Error",
                "confidence": 0.0,
                "all_scores": {},
-                 "risk_level": "Unknown"
            }

-     def generate_counter_narrative(self, text, detection_result):
-         """Generate educational counter-narrative based on detection"""
-         if not detection_result.get("is_hate_speech", False):
-             return "Great! This text promotes positive communication. Keep up the constructive dialogue!"

-         # Counter-narratives based on risk level
-         risk_level = detection_result.get("risk_level", "Low")

-         high_risk_responses = [
-             "🛡️ **Educational Response**: This type of language can cause real harm to individuals and communities. Consider how your words might affect others and try rephrasing with respect and empathy.",
-             "💡 **Constructive Alternative**: Instead of using harmful language, try expressing your concerns in a way that opens dialogue rather than shutting it down. Every person deserves dignity and respect.",
-             "🌍 **Community Impact**: Hate speech can escalate tensions and divide communities. Consider how you can contribute to a more inclusive and understanding environment.",
-             "📚 **Learning Opportunity**: Research shows that exposure to diverse perspectives actually strengthens critical thinking. Consider engaging with different viewpoints constructively."
-         ]

-         medium_risk_responses = [
-             "🤔 **Reflection Point**: This language might be interpreted as harmful by some. Consider rewording to express your point more constructively.",
-             "💬 **Communication Tip**: Try framing your message in a way that invites discussion rather than potentially excluding or hurting others.",
-             "🎯 **Focus Shift**: Instead of focusing on differences that divide, consider highlighting shared values or common ground.",
-             "🔄 **Reframe Opportunity**: How might you express this same sentiment in a way that brings people together rather than apart?"
-         ]

-         if risk_level == "High":
-             responses = high_risk_responses
-         elif risk_level == "Medium":
-             responses = medium_risk_responses
        else:
-             responses = [
-                 "💭 **Gentle Reminder**: While this might not be clearly harmful, consider how your words might be received by others.",
-                 "🌱 **Growth Mindset**: Every interaction is an opportunity to build understanding and connection.",
-                 "🤝 **Bridge Building**: Consider how you can use your voice to bring people together rather than create distance."
-             ]

        import random
-         return random.choice(responses)

-     def get_model_info(self):
-         """Get information about the loaded model"""
-         if self.model:
-             return {
-                 "Model Type": "DistilBERT (Fine-tuned)",
-                 "Architecture": self.model.config.architectures[0],
-                 "Parameters": f"~{66}M parameters",
-                 "Max Length": self.model.config.max_position_embeddings,
-                 "Labels": self.model.config.num_labels,
-                 "Framework": "PyTorch + Transformers"
-             }
-         return {"Model": "Fallback model in use"}

- # Initialize the detector
- logger.info("Initializing DistilBERT Hate Speech Detector...")
- detector = DistilBERTHateSpeechDetector()

- def analyze_text(text):
-     """Main analysis function for Gradio interface"""
-     start_time = datetime.now()

-     # Perform detection
-     detection_result = detector.detect_hate_speech(text)

-     # Generate counter-narrative
-     counter_narrative = detector.generate_counter_narrative(text, detection_result)

-     # Calculate processing time
-     processing_time = (datetime.now() - start_time).total_seconds()
-
-     # Format results for display
-     status = detection_result["status"]
-     all_scores = detection_result["all_scores"]

-     # Add processing info
-     info_text = f"⏱️ Processed in {processing_time:.3f} seconds | Risk Level: {detection_result['risk_level']}"

-     return status, all_scores, counter_narrative, info_text

- def get_model_details():
-     """Get model information for display"""
-     return detector.get_model_info()

# Create the Gradio interface
with gr.Blocks(
-     title="DistilBERT Hate Speech Detection & Counter-Narrative Generator",
    theme=gr.themes.Soft(),
    css="""
    .gradio-container {
        max-width: 1400px !important;
    }
-     .status-box {
        padding: 1rem;
        border-radius: 8px;
-         margin: 0.5rem 0;
    }
    """
) as demo:

    gr.Markdown("""
-     # 🛡️ DistilBERT Hate Speech Detection & Counter-Narrative Generator

-     **Advanced AI Agent System for Content Moderation & Education**

-     🤖 **Powered by Fine-tuned DistilBERT** - Efficient and accurate hate speech detection
-     📚 **Educational Counter-Narratives** - AI-generated constructive responses
-     ⚡ **Real-time Processing** - Fast analysis with detailed confidence scores
-     🎯 **Multi-level Risk Assessment** - Nuanced understanding of content severity
    """)

-     with gr.Tab("🔍 Text Analysis"):
        with gr.Row():
            with gr.Column(scale=2):
                text_input = gr.Textbox(
-                     label="Enter text to analyze",
-                     placeholder="Type or paste text here for hate speech analysis...",
                    lines=5,
                    max_lines=15
                )

                with gr.Row():
-                     analyze_btn = gr.Button("🔍 Analyze Text", variant="primary", size="lg")
                    clear_btn = gr.Button("🗑️ Clear All", variant="secondary")

                gr.Examples(
                    examples=[
-                         ["This is a wonderful day to learn something new!"],
-                         ["I respectfully disagree with that policy, but I understand your perspective."],
-                         ["The team did an excellent job on this project. Well done everyone!"],
-                         ["Thank you for sharing your thoughts. Let's discuss this constructively."],
-                         ["That restaurant has amazing food and great service!"],
-                         ["I appreciate you taking the time to explain your viewpoint."]
                    ],
                    inputs=text_input,
-                     label="📝 Try these positive examples:"
                )

        with gr.Row():
-             with gr.Column(scale=1):
-                 status_output = gr.Textbox(
-                     label="🎯 Detection Result",
                    interactive=False,
                    lines=3
                )

-                 processing_info = gr.Textbox(
-                     label="ℹ️ Processing Info",
                    interactive=False,
-                     lines=1
                )

-             with gr.Column(scale=1):
-                 counter_narrative_output = gr.Textbox(
-                     label="💡 Educational Counter-Narrative",
                    interactive=False,
-                     lines=4
                )

        with gr.Row():
-             scores_output = gr.JSON(
-                 label="📊 Detailed Confidence Scores",
                visible=True
            )

-     with gr.Tab("🔧 Model Information"):
        with gr.Row():
            with gr.Column():
-                 gr.Markdown("## 🤖 Model Details")
-                 model_info = gr.JSON(
-                     label="Model Configuration",
-                     value=get_model_details()
-                 )

            with gr.Column():
                gr.Markdown("""
-                 ## 📈 Performance Characteristics

-                 **DistilBERT Advantages:**
-                 - **Fast Processing**: 60% smaller than BERT
-                 - 🎯 **High Accuracy**: Retains 97% of BERT's performance
-                 - 💾 **Memory Efficient**: Lower computational requirements
-                 - 🔄 **Real-time Ready**: Suitable for production deployment

-                 **Fine-tuning Benefits:**
-                 - 🎯 **Domain-Specific**: Trained on hate speech datasets
-                 - 📊 **Balanced Performance**: Optimized precision-recall balance
-                 - 🔍 **Context-Aware**: Understanding of nuanced language patterns
                """)
-
-     with gr.Tab("📋 About & Usage"):
        gr.Markdown("""
-         ## 🎯 System Overview

-         This demonstration showcases an advanced AI agent system combining:

-         ### 🤖 AI Agent Architecture
-         1. **Detection Agent**: Fine-tuned DistilBERT classifier
-         2. **Analysis Agent**: Risk assessment and confidence scoring
-         3. **Counter-Narrative Agent**: Educational response generation
-         4. **Monitoring Agent**: Performance tracking and logging

-         ### 🛡️ Content Moderation Pipeline
        ```
-         Input Text → Preprocessing → DistilBERT Analysis → Risk Assessment → Counter-Narrative Generation → Results
        ```

-         ### 📊 Risk Level Classification
-         - **🚨 High Risk (>80% confidence)**: Clear hate speech detection
-         - **⚠️ Medium Risk (60-80%)**: Potentially harmful content
-         - **⚡ Low-Medium Risk (40-60%)**: Uncertain classification
-         - **✅ Low Risk (<40%)**: Safe content
-
-         ## 🔧 Technical Implementation
-
-         **Model Architecture:**
-         - Base: DistilBERT (distilbert-base-uncased)
-         - Task: Sequence Classification
-         - Parameters: ~66M (vs. 110M for BERT-base)
-         - Max Sequence Length: 512 tokens
-
-         **Key Features:**
-         - Real-time inference with <1 second response time
-         - Confidence-based risk assessment
-         - Educational counter-narrative generation
-         - Comprehensive error handling and fallbacks
-
-         ## ⚠️ Important Disclaimers
-
-         - 🔬 **Research Demonstration**: Not ready for production without additional safeguards
-         - 👥 **Human Oversight Required**: AI should supplement, not replace, human moderation
-         - ⚖️ **Bias Awareness**: Models can reflect biases present in training data
-         - 🔒 **Privacy Conscious**: No data is stored or logged from this demo
-         - 🌍 **Context Matters**: Cultural and contextual factors affect interpretation
-
-         ## 🚀 Potential Applications
-
-         - **Social Media Platforms**: Automated content moderation
-         - **Educational Tools**: Teaching about respectful communication
-         - **Community Forums**: Maintaining healthy discussions
-         - **Content Creation**: Writing assistance for inclusive language
-         - **Research**: Studying patterns in online discourse
-
-         ## 📞 Feedback & Development
-
-         This demo represents current work in AI-powered content moderation.
-         For production deployment, additional considerations include:
-         - Continuous model updates and retraining
-         - Human review workflows
-         - Appeal and correction mechanisms
-         - Cross-cultural validation
-         - Regulatory compliance
        """)

    # Event handlers
    analyze_btn.click(
-         fn=analyze_text,
        inputs=text_input,
-         outputs=[status_output, scores_output, counter_narrative_output, processing_info]
    )

    clear_btn.click(
        fn=lambda: ("", "", "", "", {}),
-         outputs=[text_input, status_output, counter_narrative_output, processing_info, scores_output]
    )

# Launch configuration
 
@@ -5,104 +5,183 @@ import numpy as np
import json
from datetime import datetime
import logging
+ import os

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

+ class PromptBasedMultiAgentSystem:
    def __init__(self):
+         self.detection_agent = None
+         self.counter_speech_agent = None
+         self.moderation_agent = None
+         self.sentiment_agent = None
+
+         # Load prompt configurations
+         self.counter_speech_prompts = self.load_prompts("counter_speech_prompts.json")
+         self.moderation_prompts = self.load_prompts("moderation_prompts.json")
+
+         self.initialize_agents()

+     def load_prompts(self, filename):
+         """Load prompts from JSON file with fallback"""
        try:
+             if os.path.exists(filename):
+                 with open(filename, 'r', encoding='utf-8') as f:
+                     return json.load(f)
+             else:
+                 logger.warning(f"Prompt file {filename} not found, using built-in prompts")
+                 return self.get_default_prompts(filename)
+         except Exception as e:
+             logger.error(f"Error loading prompts from {filename}: {e}")
+             return self.get_default_prompts(filename)
+
+     def get_default_prompts(self, filename):
+         """Default prompts as fallback"""
+         if "counter_speech" in filename:
+             return {
+                 "counter_speech_prompts": {
+                     "high_risk": {
+                         "system_prompt": "You are an expert educator specializing in counter-speech and conflict de-escalation.",
+                         "user_prompt_template": "Generate a respectful, educational counter-speech response to address harmful content while promoting understanding. Original text (Risk: {risk_level}, Confidence: {confidence}%, Sentiment: {sentiment}): \"{original_text}\"\n\nCounter-speech response:",
+                     },
+                     "general_template": {
+                         "fallback_responses": [
+                             "Thank you for sharing your thoughts. Building strong communities works best when we focus on shared values and constructive dialogue. How might we work together on the concerns you've raised?",
+                             "I appreciate your perspective. Sometimes our strongest feelings can be expressed in ways that bring people together. What specific positive changes would you like to see?",
+                             "Your engagement with this topic is clear. When we channel that energy into inclusive dialogue, we often find solutions that work for everyone."
+                         ]
+                     }
+                 }
+             }
+         else:
+             return {
+                 "moderation_prompts": {
+                     "comprehensive_analysis": {
+                         "system_prompt": "You are an expert content moderation specialist analyzing text for safety and compliance.",
+                         "user_prompt_template": "Analyze this text for potential violations: \"{text}\"\n\nProvide: 1) Safety assessment 2) Violation categories 3) Severity level 4) Confidence score 5) Recommended action\n\nAnalysis:",
+                     }
+                 }
+             }
+
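For context, a minimal sketch of the file contract `load_prompts` expects; the JSON content here is illustrative and mirrors the built-in defaults above:

```python
import json

# Hypothetical round-trip of counter_speech_prompts.json: write a minimal
# file, read it back the way load_prompts does, and fill the template.
prompts = {
    "counter_speech_prompts": {
        "high_risk": {
            "system_prompt": "You are an expert educator...",
            "user_prompt_template": 'Respond to: "{original_text}"'
        }
    }
}
with open("counter_speech_prompts.json", "w", encoding="utf-8") as f:
    json.dump(prompts, f, indent=2)

with open("counter_speech_prompts.json", "r", encoding="utf-8") as f:
    loaded = json.load(f)

template = loaded["counter_speech_prompts"]["high_risk"]["user_prompt_template"]
print(template.format(original_text="example input"))
```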
+     def initialize_agents(self):
+         """Initialize all AI agents"""
+         logger.info("🤖 Initializing Prompt-Based Multi-Agent System...")
+
+         self.setup_detection_agent()
+         self.setup_counter_speech_agent()
+         self.setup_moderation_agent()
+         self.setup_sentiment_agent()
+
+         logger.info("✅ All agents initialized successfully!")
+
+     def setup_detection_agent(self):
+         """Initialize the hate speech detection agent"""
+         try:
+             logger.info("🔍 Loading Detection Agent (Fine-tuned DistilBERT)...")
            model_path = "./model"

+             tokenizer = AutoTokenizer.from_pretrained(model_path)
+             model = AutoModelForSequenceClassification.from_pretrained(
                model_path,
+                 torch_dtype=torch.float32
            )

+             self.detection_agent = pipeline(
                "text-classification",
+                 model=model,
+                 tokenizer=tokenizer,
                return_all_scores=True,
                device=0 if torch.cuda.is_available() else -1
            )
+             logger.info("✅ Detection Agent loaded successfully")

+         except Exception as e:
+             logger.error(f"❌ Detection Agent failed: {e}")
+             logger.info("🔄 Using fallback detection model...")
+             self.detection_agent = pipeline(
+                 "text-classification",
+                 model="unitary/toxic-bert",
+                 return_all_scores=True
+             )
+
+     def setup_counter_speech_agent(self):
+         """Initialize counter-speech generation agent with prompts"""
+         try:
+             logger.info("💬 Loading Counter-Speech Agent with Custom Prompts...")
+
+             # Using FLAN-T5 which is excellent at following instructions
+             self.counter_speech_agent = pipeline(
+                 "text2text-generation",
+                 model="google/flan-t5-base",
+                 max_length=200,
+                 do_sample=True,
+                 temperature=0.7,
+                 top_p=0.9,
+                 device=0 if torch.cuda.is_available() else -1
+             )
+             logger.info("✅ Counter-Speech Agent loaded (FLAN-T5 with custom prompts)")

        except Exception as e:
+             logger.error(f"❌ Counter-Speech Agent failed: {e}")
+             self.counter_speech_agent = None
+
+     def setup_moderation_agent(self):
+         """Initialize content moderation agent with prompts"""
+         try:
+             logger.info("🛡️ Loading Moderation Agent with Custom Prompts...")

+             # Using FLAN-T5 for structured moderation analysis
+             self.moderation_agent = pipeline(
+                 "text2text-generation",
+                 model="google/flan-t5-base",
+                 max_length=300,
+                 do_sample=False,
+                 device=0 if torch.cuda.is_available() else -1
+             )
+             logger.info("✅ Moderation Agent loaded (FLAN-T5 with analysis prompts)")
+
+         except Exception as e:
+             logger.error(f"❌ Moderation Agent failed: {e}")
+             self.moderation_agent = None

+     def setup_sentiment_agent(self):
+         """Initialize sentiment analysis agent"""
+         try:
+             logger.info("📊 Loading Sentiment Agent...")
+
+             self.sentiment_agent = pipeline(
+                 "sentiment-analysis",
+                 model="cardiffnlp/twitter-roberta-base-sentiment-latest",
+                 return_all_scores=True,
+                 device=0 if torch.cuda.is_available() else -1
+             )
+             logger.info("✅ Sentiment Agent loaded (Twitter-RoBERTa)")
+
+         except Exception as e:
+             logger.error(f"❌ Sentiment Agent failed: {e}")
+             self.sentiment_agent = None
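A standalone sketch of how the `text2text-generation` pipeline configured above consumes a composed system-plus-user prompt; the prompt text is made up:

```python
from transformers import pipeline

# Illustrative one-off call mirroring the Counter-Speech Agent setup.
generator = pipeline("text2text-generation", model="google/flan-t5-base")

system_prompt = "You are an expert educator specializing in counter-speech."
user_prompt = 'Generate a respectful counter-speech response to: "example text"'

result = generator(
    f"{system_prompt}\n\n{user_prompt}",
    max_length=150,
    do_sample=True,
    temperature=0.7,
)
print(result[0]["generated_text"])
```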
 
    def detect_hate_speech(self, text):
+         """Detection Agent: Analyze text for hate speech"""
        if not text or not text.strip():
            return {
                "status": "❌ Please enter some text to analyze.",
                "prediction": "No input",
                "confidence": 0.0,
                "all_scores": {},
+                 "risk_level": "Unknown",
+                 "is_hate_speech": False
            }

        try:
+             results = self.detection_agent(text.strip())

            if isinstance(results, list) and len(results) > 0:
                if isinstance(results[0], list):
                    results = results[0]

            all_scores = {}
            max_score = 0
            predicted_label = "UNKNOWN"

@@ -122,7 +201,6 @@ class DistilBERTHateSpeechDetector:

            # Determine hate speech status
            hate_keywords = ["HATE", "TOXIC", "NEGATIVE", "HARMFUL", "1", "LABEL_1"]

            is_hate_speech = False
            risk_level = "Low"

@@ -153,274 +231,553 @@ class DistilBERTHateSpeechDetector:
            }

        except Exception as e:
+             logger.error(f"Detection error: {e}")
            return {
+                 "status": f"❌ Detection error: {str(e)}",
                "prediction": "Error",
                "confidence": 0.0,
                "all_scores": {},
+                 "risk_level": "Unknown",
+                 "is_hate_speech": False
            }
 
+     def analyze_sentiment(self, text):
+         """Sentiment Agent: Analyze emotional tone"""
+         if not self.sentiment_agent or not text.strip():
+             return {"sentiment": "neutral", "confidence": 0.0}

+         try:
+             results = self.sentiment_agent(text.strip())
+             if isinstance(results, list) and len(results) > 0:
+                 if isinstance(results[0], list):
+                     results = results[0]
+
+             best_sentiment = max(results, key=lambda x: x['score'])
+             return {
+                 "sentiment": best_sentiment['label'].lower(),
+                 "confidence": best_sentiment['score'],
+                 "all_sentiments": {r['label']: r['score'] for r in results}
+             }
+         except Exception as e:
+             logger.error(f"Sentiment analysis error: {e}")
+             return {"sentiment": "neutral", "confidence": 0.0}
+
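The per-class output shape `analyze_sentiment` relies on can be reproduced with the same public checkpoint; the printed values are examples only:

```python
from transformers import pipeline

# Illustrative only: the cardiffnlp model returns one {label, score} dict
# per sentiment class when all scores are requested.
sentiment = pipeline(
    "sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment-latest",
    return_all_scores=True,
)
results = sentiment("I love collaborating with this team!")[0]
best = max(results, key=lambda x: x["score"])
print(best["label"].lower(), round(best["score"], 3))
# e.g. positive 0.98
```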
+     def moderate_content_with_prompts(self, text, detection_result, sentiment_result):
+         """Moderation Agent: Structured analysis using prompts"""
+         if not self.moderation_agent or not text.strip():
+             return {"analysis": "Unable to perform moderation analysis", "confidence": 0.0}

+         try:
+             # Get the appropriate moderation prompt
+             moderation_config = self.moderation_prompts.get("moderation_prompts", {})
+             analysis_config = moderation_config.get("comprehensive_analysis", {})
+
+             # Construct the analysis prompt
+             system_prompt = analysis_config.get("system_prompt", "Analyze this text for safety concerns.")
+             user_prompt_template = analysis_config.get("user_prompt_template", "Analyze: {text}")
+
+             # Fill in the template
+             full_prompt = f"{system_prompt}\n\n{user_prompt_template.format(text=text)}"
+
+             # Generate analysis
+             result = self.moderation_agent(full_prompt, max_length=250, do_sample=False)
+
+             if result and len(result) > 0:
+                 analysis_text = result[0]['generated_text']
+
+                 # Parse the analysis for key information
+                 confidence = self.extract_confidence_from_analysis(analysis_text)
+                 safety_level = self.extract_safety_level_from_analysis(analysis_text)
+
+                 return {
+                     "analysis": analysis_text,
+                     "confidence": confidence,
+                     "safety_level": safety_level,
+                     "prompt_used": "comprehensive_analysis"
+                 }
+
+         except Exception as e:
+             logger.error(f"Moderation analysis error: {e}")
+
+         # Fallback analysis
+         return {
+             "analysis": f"Basic assessment: Risk level {detection_result.get('risk_level', 'unknown')}, requires review if confidence > 70%",
+             "confidence": detection_result.get('confidence', 0.0),
+             "safety_level": "review_needed" if detection_result.get('confidence', 0) > 0.7 else "acceptable"
+         }
+
+     def generate_counter_speech_with_prompts(self, text, detection_result, sentiment_result):
+         """Counter-Speech Agent: Generate response using custom prompts"""
+         if not detection_result.get("is_hate_speech", False):
+             return "✨ This text promotes positive communication. Great job maintaining respectful dialogue!"

+         risk_level = detection_result.get("risk_level", "Low").lower()
+         confidence = detection_result.get("confidence", 0.0) * 100
+         sentiment = sentiment_result.get("sentiment", "neutral")

+         # Get appropriate prompts based on risk level
+         counter_speech_config = self.counter_speech_prompts.get("counter_speech_prompts", {})
+
+         # Select prompt based on risk level
+         if risk_level == "high":
+             prompt_config = counter_speech_config.get("high_risk", {})
+         elif risk_level == "medium":
+             prompt_config = counter_speech_config.get("medium_risk", {})
        else:
+             prompt_config = counter_speech_config.get("low_risk", {})
+
+         # If no specific config, use general template
+         if not prompt_config:
+             prompt_config = counter_speech_config.get("general_template", {})
+
+         if self.counter_speech_agent and prompt_config:
+             try:
+                 # Construct the prompt
+                 system_prompt = prompt_config.get("system_prompt", "Generate a respectful counter-speech response.")
+                 user_prompt_template = prompt_config.get("user_prompt_template",
+                     "Generate a counter-speech response for: {original_text}")
+
+                 # Fill in the template
+                 full_prompt = f"{system_prompt}\n\n{user_prompt_template.format(original_text=text, risk_level=risk_level, confidence=confidence, sentiment=sentiment)}"
+
+                 # Generate counter-speech
+                 result = self.counter_speech_agent(full_prompt, max_length=150, do_sample=True, temperature=0.7)
+
+                 if result and len(result) > 0:
+                     generated_text = result[0]['generated_text']
+
+                     # Clean up the response
+                     if "Counter-speech response:" in generated_text:
+                         generated_text = generated_text.split("Counter-speech response:")[-1].strip()
+                     elif "response:" in generated_text.lower():
+                         parts = generated_text.lower().split("response:")
+                         if len(parts) > 1:
+                             generated_text = parts[-1].strip()
+
+                     return f"🤖 **AI-Generated Counter-Speech** ({risk_level.title()} Risk): {generated_text}"
+
+             except Exception as e:
+                 logger.error(f"Counter-speech generation error: {e}")
+
+         # Fallback to template responses
+         fallback_responses = counter_speech_config.get("general_template", {}).get("fallback_responses", [
+             "Thank you for sharing your thoughts. Building strong communities works best when we focus on shared values and constructive dialogue."
+         ])

        import random
+         return f"📝 **Template Response** ({risk_level.title()} Risk): {random.choice(fallback_responses)}"
 
+     def extract_confidence_from_analysis(self, analysis_text):
+         """Extract confidence score from moderation analysis"""
+         import re
+         # Look for confidence patterns like "85%" or "confidence: 0.85"
+         patterns = [
+             r'(\d+)%',
+             r'confidence[:\s]+(\d*\.?\d+)',
+             r'(\d*\.?\d+)\s*confidence'
+         ]
+
+         for pattern in patterns:
+             match = re.search(pattern, analysis_text.lower())
+             if match:
+                 value = float(match.group(1))
+                 return value / 100 if value > 1 else value
+
+         return 0.5  # Default moderate confidence
+
+     def extract_safety_level_from_analysis(self, analysis_text):
+         """Extract safety assessment from moderation analysis"""
+         analysis_lower = analysis_text.lower()
+
+         if any(word in analysis_lower for word in ['harmful', 'high risk', 'remove', 'violation']):
+             return "harmful"
+         elif any(word in analysis_lower for word in ['concerning', 'medium risk', 'review', 'warning']):
+             return "concerning"
+         elif any(word in analysis_lower for word in ['safe', 'low risk', 'acceptable', 'approve']):
+             return "safe"
+         else:
+             return "review_needed"
+
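The confidence-extraction heuristic above can be spot-checked in isolation; these analysis strings are made up:

```python
import re

# Same patterns as extract_confidence_from_analysis, tested standalone.
patterns = [r'(\d+)%', r'confidence[:\s]+(\d*\.?\d+)', r'(\d*\.?\d+)\s*confidence']

def extract_confidence(analysis_text):
    for pattern in patterns:
        match = re.search(pattern, analysis_text.lower())
        if match:
            value = float(match.group(1))
            return value / 100 if value > 1 else value
    return 0.5  # default moderate confidence

print(extract_confidence("Severity: HIGH, confidence: 85%"))   # 0.85
print(extract_confidence("confidence: 0.72, action: review"))  # 0.72
print(extract_confidence("no numeric signal here"))            # 0.5
```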
+     def comprehensive_analysis(self, text):
+         """Run all agents with prompt-based analysis"""
+         start_time = datetime.now()
+
+         # Run core agents
+         detection_result = self.detect_hate_speech(text)
+         sentiment_result = self.analyze_sentiment(text)
+
+         # Run prompt-based agents
+         moderation_result = self.moderate_content_with_prompts(text, detection_result, sentiment_result)
+         counter_speech = self.generate_counter_speech_with_prompts(text, detection_result, sentiment_result)
+
+         processing_time = (datetime.now() - start_time).total_seconds()
+
+         return {
+             "detection": detection_result,
+             "sentiment": sentiment_result,
+             "moderation": moderation_result,
+             "counter_speech": counter_speech,
+             "processing_time": processing_time,
+             "timestamp": datetime.now().isoformat()
+         }
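A hypothetical driver for `comprehensive_analysis`, assuming the class above initializes successfully; the commented values are examples only:

```python
# Illustrative usage of the aggregate result dictionary returned above.
system = PromptBasedMultiAgentSystem()
report = system.comprehensive_analysis("Let's build a more inclusive community.")

print(report["detection"]["risk_level"])          # e.g. "Low"
print(report["sentiment"]["sentiment"])           # e.g. "positive"
print(report["moderation"].get("safety_level"))   # e.g. "safe"
print(f'{report["processing_time"]:.3f}s')        # wall-clock time across agents
```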
 
+ # Initialize the system
+ logger.info("🚀 Starting Prompt-Based Multi-Agent System...")
+ agent_system = PromptBasedMultiAgentSystem()

+ def analyze_text_with_prompts(text):
+     """Main analysis function using prompt-based agents"""
+     if not text or not text.strip():
+         return (
+             "❌ Please enter some text to analyze.",
+             {},
+             "No analysis performed.",
+             "No input provided",
+             {}
+         )

+     # Run comprehensive analysis with prompts
+     results = agent_system.comprehensive_analysis(text)

+     # Extract results for display
+     detection_status = results["detection"]["status"]
+     detection_scores = results["detection"]["all_scores"]
+     counter_speech = results["counter_speech"]

+     # Create detailed agent summary
+     agent_summary = f"""
+ 🔍 **Detection Agent**: {results['detection']['risk_level']} risk ({results['detection']['confidence']:.2%} confidence)
+ 📊 **Sentiment Agent**: {results['sentiment']['sentiment'].title()} ({results['sentiment']['confidence']:.2%} confidence)
+ 🛡️ **Moderation Agent**: {results['moderation'].get('safety_level', 'unknown').title()} safety level ({results['moderation'].get('confidence', 0):.2%} confidence)
+ 💬 **Counter-Speech Agent**: {"Custom prompt-based" if "AI-Generated" in counter_speech else "Template-based"} response
+ ⏱️ **Processing Time**: {results['processing_time']:.3f} seconds
+
+ 📋 **Moderation Analysis**: {results['moderation'].get('analysis', 'No detailed analysis available')[:200]}...
+ """

+     # Compile comprehensive agent data
+     all_agent_data = {
+         "Detection_Analysis": {
+             "scores": detection_scores,
+             "risk_level": results['detection']['risk_level'],
+             "is_hate_speech": results['detection']['is_hate_speech']
+         },
+         "Sentiment_Analysis": {
+             "primary_sentiment": results['sentiment']['sentiment'],
+             "all_sentiments": results["sentiment"].get("all_sentiments", {})
+         },
+         "Moderation_Analysis": {
+             "safety_assessment": results['moderation'].get('safety_level', 'unknown'),
+             "detailed_analysis": results['moderation'].get('analysis', ''),
+             "confidence": results['moderation'].get('confidence', 0),
+             "prompt_used": results['moderation'].get('prompt_used', 'fallback')
+         },
+         "Counter_Speech": {
+             "response": counter_speech,
+             "generation_method": "AI-Generated" if "AI-Generated" in counter_speech else "Template-based"
+         },
+         "System_Info": {
+             "timestamp": results["timestamp"],
+             "processing_time_seconds": results["processing_time"],
+             "prompt_files_loaded": {
+                 "counter_speech": bool(agent_system.counter_speech_prompts),
+                 "moderation": bool(agent_system.moderation_prompts)
+             }
+         }
+     }

+     return detection_status, detection_scores, counter_speech, agent_summary, all_agent_data

+ def reload_prompts():
+     """Reload prompt files for testing"""
+     try:
+         agent_system.counter_speech_prompts = agent_system.load_prompts("counter_speech_prompts.json")
+         agent_system.moderation_prompts = agent_system.load_prompts("moderation_prompts.json")
+         return "✅ Prompts reloaded successfully!"
+     except Exception as e:
+         return f"❌ Error reloading prompts: {e}"
+
+ def get_prompt_info():
+     """Get information about loaded prompts"""
+     counter_prompts = len(agent_system.counter_speech_prompts.get("counter_speech_prompts", {}))
+     moderation_prompts = len(agent_system.moderation_prompts.get("moderation_prompts", {}))
+
+     return {
+         "counter_speech_prompt_categories": counter_prompts,
+         "moderation_prompt_categories": moderation_prompts,
+         "prompt_files_status": {
+             "counter_speech_prompts.json": "✅ Loaded" if counter_prompts > 0 else "❌ Not found",
+             "moderation_prompts.json": "✅ Loaded" if moderation_prompts > 0 else "❌ Not found"
+         }
+     }
 
# Create the Gradio interface
with gr.Blocks(
+     title="Prompt-Based Multi-Agent Hate Speech Detection System",
    theme=gr.themes.Soft(),
    css="""
    .gradio-container {
        max-width: 1400px !important;
    }
+     .prompt-info {
+         background: linear-gradient(90deg, #f0f9ff 0%, #e0f2fe 100%);
        padding: 1rem;
        border-radius: 8px;
+         border-left: 4px solid #0284c7;
+     }
+     .agent-summary {
+         background: linear-gradient(90deg, #fefce8 0%, #fef3c7 100%);
+         padding: 1rem;
+         border-radius: 8px;
+         border-left: 4px solid #f59e0b;
    }
    """
) as demo:

    gr.Markdown("""
+     # 🤖 Prompt-Based Multi-Agent Hate Speech Detection System
+
+     **Advanced AI Agent Collaboration with Custom Prompts**

+     🔍 **Detection Agent** - Your fine-tuned DistilBERT model
+     💬 **Counter-Speech Agent** - FLAN-T5 with custom prompt engineering
+     🛡️ **Moderation Agent** - Structured analysis using specialized prompts
+     📊 **Sentiment Agent** - Twitter-RoBERTa for emotional context

+     *Each agent uses carefully crafted prompts from external JSON files for optimal performance.*
    """)

+     with gr.Tab("🤖 Prompt-Based Analysis"):
        with gr.Row():
            with gr.Column(scale=2):
                text_input = gr.Textbox(
+                     label="Enter text for comprehensive prompt-based analysis",
+                     placeholder="Enter text here to see how prompt-engineered AI agents collaborate...",
                    lines=5,
                    max_lines=15
                )

                with gr.Row():
+                     analyze_btn = gr.Button("🚀 Run Prompt-Based Analysis", variant="primary", size="lg")
                    clear_btn = gr.Button("🗑️ Clear All", variant="secondary")
+                     reload_btn = gr.Button("🔄 Reload Prompts", variant="secondary")

                gr.Examples(
                    examples=[
+                         ["This is a wonderful day to collaborate and learn from each other!"],
+                         ["I appreciate everyone's different perspectives and backgrounds."],
+                         ["Let's work together to build a more inclusive community."],
+                         ["Thank you for sharing your experience. I'd love to understand your viewpoint better."],
+                         ["The diversity in our group makes our discussions much richer and more meaningful."],
+                         ["I respectfully disagree, but I value your right to express your opinion."]
                    ],
                    inputs=text_input,
+                     label="📝 Try these examples with prompt-based agents:"
                )

        with gr.Row():
+             with gr.Column():
+                 detection_output = gr.Textbox(
+                     label="🎯 Primary Detection Result",
                    interactive=False,
                    lines=3
                )

+                 agent_summary = gr.Textbox(
+                     label="🤖 Prompt-Based Agent Summary",
                    interactive=False,
+                     lines=8,
+                     elem_classes=["agent-summary"]
                )

+             with gr.Column():
+                 counter_speech_output = gr.Textbox(
+                     label="💬 Prompt-Generated Counter-Speech",
                    interactive=False,
+                     lines=6
+                 )
+
+                 reload_status = gr.Textbox(
+                     label="🔄 Prompt Reload Status",
+                     interactive=False,
+                     lines=2
                )

        with gr.Row():
+             all_agents_output = gr.JSON(
+                 label="📊 Complete Prompt-Based Multi-Agent Analysis",
                visible=True
            )

+     with gr.Tab("📝 Prompt Management"):
        with gr.Row():
            with gr.Column():
+                 gr.Markdown("""
+                 ## 📝 Counter-Speech Prompts
+
+                 The system uses specialized prompts for different risk levels:
+
+                 ### 🚨 High Risk Prompts
+                 - **Purpose**: Address clear hate speech with educational responses
+                 - **Tone**: Firm but respectful, educational focus
+                 - **Length**: 50-100 words
+                 - **Goal**: De-escalation and education
+
+                 ### ⚠️ Medium Risk Prompts
+                 - **Purpose**: Handle potentially problematic content
+                 - **Tone**: Gentle guidance, supportive
+                 - **Length**: 40-80 words
+                 - **Goal**: Reflection and improvement
+
+                 ### ⚡ Low Risk Prompts
+                 - **Purpose**: Encourage even better communication
+                 - **Tone**: Positive reinforcement
+                 - **Length**: 30-60 words
+                 - **Goal**: Enhancement and encouragement
+                 """)

            with gr.Column():
                gr.Markdown("""
+                 ## 🛡️ Moderation Prompts
+
+                 Structured analysis prompts for comprehensive assessment:

+                 ### 🔍 Comprehensive Analysis
+                 - **Safety Assessment**: SAFE/CONCERNING/HARMFUL
+                 - **Violation Categories**: Specific policy areas
+                 - **Severity Levels**: LOW/MEDIUM/HIGH
+                 - **Confidence Scoring**: 0-100% certainty
+                 - **Contextual Factors**: Cultural and situational

+                 ### 📊 Specialized Analysis Types
+                 - **Hate Speech Focus**: Protected group targeting
+                 - **Toxicity Assessment**: Discourse quality impact
+                 - **Context Analysis**: Cultural and situational factors
+                 - **Action Recommendations**: Specific moderation steps
                """)
+
+         with gr.Row():
+             prompt_info_output = gr.JSON(
+                 label="📋 Current Prompt Configuration",
+                 value=get_prompt_info()
+             )
+
        gr.Markdown("""
+         ## 📁 Prompt File Structure
+
+         To customize the system's behavior, create these JSON files:
+
+         ### `counter_speech_prompts.json`
+         ```json
+         {
+             "counter_speech_prompts": {
+                 "high_risk": {
+                     "system_prompt": "You are an expert educator...",
+                     "user_prompt_template": "Generate response for: {original_text}..."
+                 }
+             }
+         }
+         ```

+         ### `moderation_prompts.json`
+         ```json
+         {
+             "moderation_prompts": {
+                 "comprehensive_analysis": {
+                     "system_prompt": "You are a content moderation expert...",
+                     "user_prompt_template": "Analyze: {text}..."
+                 }
+             }
+         }
+         ```

+         **Benefits of External Prompts:**
+         - 🎯 **Fine-tuned control** over agent behavior
+         - 🔄 **Easy iteration** without code changes
+         - 📊 **A/B testing** of different prompt strategies
+         - 🎨 **Domain-specific customization** for different platforms
+         - 📈 **Performance optimization** through prompt engineering
+         """)
+
+     with gr.Tab("🔧 System Architecture"):
+         gr.Markdown("""
+         ## 🏗️ Prompt-Based Agent Architecture

+         ### 🔄 Agent Collaboration Flow
        ```
+         Input Text
+         ├── Detection Agent → Risk Classification (DistilBERT)
+         ├── Sentiment Agent → Emotional Context (RoBERTa)
+         ├── Moderation Agent → Structured Analysis (FLAN-T5 + Prompts)
+         └── Counter-Speech Agent → Educational Response (FLAN-T5 + Prompts)
+
+         The counter-speech step uses custom prompts and the outputs of all other agents
        ```

+         ### 📝 Prompt Engineering Advantages
+
+         #### 🎯 **Precision Control**
+         - **Task-specific instructions** for each scenario
+         - **Tone and style guidelines** for appropriate responses
+         - **Length and format specifications** for consistency
+         - **Context integration** from multiple agent outputs
+
+         #### 🔄 **Iterative Improvement**
+         - **Hot-swappable prompts** without system restart
+         - **A/B testing capabilities** for prompt effectiveness
+         - **Performance metrics** tracking for optimization
+         - **Domain adaptation** for different use cases
+
+         #### 🛡️ **Quality Assurance**
+         - **Bias mitigation** through careful prompt design
+         - **Safety guardrails** built into prompt structure
+         - **Consistency enforcement** across all responses
+         - **Cultural sensitivity** considerations
+
+         ### 🚀 Production Benefits
+
+         - **🎨 Customizable**: Adapt to different platforms and communities
+         - **📈 Scalable**: Easy to add new prompt categories
+         - **🔧 Maintainable**: Update behavior without code deployment
+         - **📊 Measurable**: Track prompt performance and effectiveness
+         - **🌍 Localizable**: Different prompts for different regions/cultures
+
+         ### ⚠️ Deployment Considerations
+
+         #### 🔒 Security
+         - **Prompt injection protection** for user inputs
+         - **Content filtering** on generated responses
+         - **Rate limiting** to prevent abuse
+         - **Audit logging** for compliance
+
+         #### 📊 Monitoring
+         - **Response quality metrics** tracking
+         - **User feedback integration** for continuous improvement
+         - **Error rate monitoring** across different prompt types
+         - **Performance benchmarking** against baseline models
+
+         #### 👥 Human Oversight
+         - **Expert review processes** for prompt updates
+         - **Community feedback loops** for prompt effectiveness
+         - **Escalation pathways** for edge cases
+         - **Regular bias audits** and prompt refinement
        """)

    # Event handlers
    analyze_btn.click(
+         fn=analyze_text_with_prompts,
        inputs=text_input,
+         outputs=[detection_output, all_agents_output, counter_speech_output, agent_summary, all_agents_output]
    )

    clear_btn.click(
        fn=lambda: ("", "", "", "", {}),
+         outputs=[text_input, detection_output, counter_speech_output, agent_summary, all_agents_output]
+     )
+
+     reload_btn.click(
+         fn=reload_prompts,
+         outputs=reload_status
    )

# Launch configuration
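The diff view ends at the launch configuration. A typical launch block for a Gradio Space looks roughly like the following; this is an assumption, not taken from this commit:

```python
# Hypothetical launch block; queue() and these server settings are common
# Hugging Face Spaces defaults, not values from this commit.
if __name__ == "__main__":
    demo.queue()
    demo.launch(server_name="0.0.0.0", server_port=7860)
```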