Upload app.py
Browse files
app.py
CHANGED
|
@@ -43,12 +43,17 @@ class HallucinationJudgment(BaseModel):
|
|
| 43 |
class PAS2:
|
| 44 |
"""Paraphrase-based Approach for LLM Systems - Using llm-as-judge methods"""
|
| 45 |
|
| 46 |
-
def __init__(self, mistral_api_key=None, openai_api_key=None, progress_callback=None):
|
| 47 |
"""Initialize the PAS2 with API keys"""
|
| 48 |
# For Hugging Face Spaces, we prioritize getting API keys from HF_* environment variables
|
| 49 |
# which are set from the Secrets tab in the Space settings
|
| 50 |
self.mistral_api_key = mistral_api_key or os.environ.get("HF_MISTRAL_API_KEY") or os.environ.get("MISTRAL_API_KEY")
|
| 51 |
self.openai_api_key = openai_api_key or os.environ.get("HF_OPENAI_API_KEY") or os.environ.get("OPENAI_API_KEY")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
self.progress_callback = progress_callback
|
| 53 |
|
| 54 |
if not self.mistral_api_key:
|
|
@@ -59,12 +64,64 @@ class PAS2:
|
|
| 59 |
|
| 60 |
self.mistral_client = Mistral(api_key=self.mistral_api_key)
|
| 61 |
self.openai_client = OpenAI(api_key=self.openai_api_key)
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
self.mistral_model = "mistral-large-latest"
|
| 64 |
-
self.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
|
| 66 |
-
|
| 67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
|
| 69 |
def generate_paraphrases(self, query: str, n_paraphrases: int = 3) -> List[str]:
|
| 70 |
"""Generate paraphrases of the input query using Mistral API"""
|
|
@@ -141,13 +198,38 @@ class PAS2:
|
|
| 141 |
|
| 142 |
return fallback_paraphrases
|
| 143 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
def _get_single_response(self, query: str, index: int = None) -> str:
|
| 145 |
-
"""Get a single response from
|
| 146 |
try:
|
| 147 |
query_description = f"Query {index}: {query}" if index is not None else f"Query: {query}"
|
| 148 |
-
logger.info("Getting response for %s", query_description)
|
| 149 |
start_time = time.time()
|
| 150 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
messages = [
|
| 152 |
{
|
| 153 |
"role": "system",
|
|
@@ -159,23 +241,32 @@ class PAS2:
|
|
| 159 |
}
|
| 160 |
]
|
| 161 |
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
|
| 167 |
-
result = response.choices[0].message.content
|
| 168 |
elapsed_time = time.time() - start_time
|
| 169 |
|
| 170 |
-
logger.info("Received response for %s (%.2f seconds)",
|
|
|
|
| 171 |
logger.debug("Response content for %s: %s", query_description, result[:100] + "..." if len(result) > 100 else result)
|
| 172 |
|
| 173 |
return result
|
| 174 |
|
| 175 |
except Exception as e:
|
| 176 |
-
error_msg = f"Error getting response for query '{query}': {e}"
|
| 177 |
logger.error(error_msg, exc_info=True)
|
| 178 |
-
return f"Error: Failed to get response for this query."
|
| 179 |
|
| 180 |
def get_responses(self, queries: List[str]) -> List[str]:
|
| 181 |
"""Get responses from Mistral API for each query in parallel"""
|
|
@@ -235,6 +326,10 @@ class PAS2:
|
|
| 235 |
logger.info("Starting hallucination detection for query: %s", query)
|
| 236 |
start_time = time.time()
|
| 237 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 238 |
# Report progress
|
| 239 |
if self.progress_callback:
|
| 240 |
self.progress_callback("starting", query=query)
|
|
@@ -250,9 +345,9 @@ class PAS2:
|
|
| 250 |
self.progress_callback("paraphrases_complete", query=query, count=len(all_queries))
|
| 251 |
|
| 252 |
# Get responses to all queries
|
| 253 |
-
logger.info("Step 2: Getting responses to all %d queries", len(all_queries))
|
| 254 |
if self.progress_callback:
|
| 255 |
-
self.progress_callback("getting_responses", query=query, total=len(all_queries))
|
| 256 |
|
| 257 |
all_responses = []
|
| 258 |
for i, q in enumerate(all_queries):
|
|
@@ -267,9 +362,9 @@ class PAS2:
|
|
| 267 |
self.progress_callback("responses_complete", query=query)
|
| 268 |
|
| 269 |
# Judge the responses for hallucinations
|
| 270 |
-
logger.info("Step 3: Judging for hallucinations")
|
| 271 |
if self.progress_callback:
|
| 272 |
-
self.progress_callback("judging", query=query)
|
| 273 |
|
| 274 |
# The first query is the original, rest are paraphrases
|
| 275 |
original_query = all_queries[0]
|
|
@@ -295,14 +390,17 @@ class PAS2:
|
|
| 295 |
"confidence_score": judgment.confidence_score,
|
| 296 |
"conflicting_facts": judgment.conflicting_facts,
|
| 297 |
"reasoning": judgment.reasoning,
|
| 298 |
-
"summary": judgment.summary
|
|
|
|
|
|
|
| 299 |
}
|
| 300 |
|
| 301 |
# Report completion
|
| 302 |
if self.progress_callback:
|
| 303 |
-
self.progress_callback("complete", query=query)
|
| 304 |
|
| 305 |
-
logger.info("Hallucination detection completed in %.2f seconds
|
|
|
|
| 306 |
return results
|
| 307 |
|
| 308 |
def judge_hallucination(self,
|
|
@@ -311,11 +409,17 @@ class PAS2:
|
|
| 311 |
paraphrased_queries: List[str],
|
| 312 |
paraphrased_responses: List[str]) -> HallucinationJudgment:
|
| 313 |
"""
|
| 314 |
-
Use
|
| 315 |
"""
|
| 316 |
-
logger.info("Judging hallucinations with
|
| 317 |
start_time = time.time()
|
| 318 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 319 |
# Prepare the context for the judge
|
| 320 |
context = f"""
|
| 321 |
Original Question: {original_query}
|
|
@@ -344,18 +448,31 @@ Your response should be a JSON with the following fields:
|
|
| 344 |
"""
|
| 345 |
|
| 346 |
try:
|
| 347 |
-
logger.info("Sending judgment request to
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 356 |
|
| 357 |
-
|
| 358 |
-
logger.debug("Received judgment response: %s", result_json)
|
| 359 |
|
| 360 |
# Create the HallucinationJudgment object from the JSON response
|
| 361 |
judgment = HallucinationJudgment(
|
|
@@ -367,18 +484,18 @@ Your response should be a JSON with the following fields:
|
|
| 367 |
)
|
| 368 |
|
| 369 |
elapsed_time = time.time() - start_time
|
| 370 |
-
logger.info("Judgment completed in %.2f seconds", elapsed_time)
|
| 371 |
|
| 372 |
return judgment
|
| 373 |
|
| 374 |
except Exception as e:
|
| 375 |
-
logger.error("Error in hallucination judgment: %s", str(e), exc_info=True)
|
| 376 |
# Return a fallback judgment
|
| 377 |
return HallucinationJudgment(
|
| 378 |
hallucination_detected=False,
|
| 379 |
confidence_score=0.0,
|
| 380 |
conflicting_facts=[],
|
| 381 |
-
reasoning="Failed to obtain judgment from the model
|
| 382 |
summary="Analysis failed due to API error."
|
| 383 |
)
|
| 384 |
|
|
@@ -495,11 +612,21 @@ class HallucinationDetectorApp:
|
|
| 495 |
"conflicting_facts": results.get('conflicting_facts', []),
|
| 496 |
"reasoning": results.get('reasoning', ''),
|
| 497 |
"summary": results.get('summary', ''),
|
|
|
|
|
|
|
| 498 |
"user_feedback": feedback
|
| 499 |
}
|
| 500 |
|
| 501 |
# Insert document into collection
|
| 502 |
-
self.feedback_collection.insert_one(document)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 503 |
|
| 504 |
logger.info("Feedback saved successfully to MongoDB")
|
| 505 |
return "Feedback saved successfully!"
|
|
@@ -507,6 +634,266 @@ class HallucinationDetectorApp:
|
|
| 507 |
logger.error("Error saving feedback: %s", str(e), exc_info=True)
|
| 508 |
return f"Error saving feedback: {str(e)}"
|
| 509 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 510 |
def get_feedback_stats(self):
|
| 511 |
"""Get statistics about collected feedback from MongoDB"""
|
| 512 |
try:
|
|
@@ -541,6 +928,62 @@ class HallucinationDetectorApp:
|
|
| 541 |
except Exception as e:
|
| 542 |
logger.error("Error getting feedback stats: %s", str(e), exc_info=True)
|
| 543 |
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 544 |
|
| 545 |
def export_data_to_csv(self, filepath=None):
|
| 546 |
"""Export all feedback data to a CSV file for analysis"""
|
|
@@ -657,11 +1100,11 @@ class ProgressTracker:
|
|
| 657 |
"starting": {"status": "Starting process...", "progress": 5, "color": "#2196F3"},
|
| 658 |
"generating_paraphrases": {"status": "Generating paraphrases...", "progress": 15, "color": "#2196F3"},
|
| 659 |
"paraphrases_complete": {"status": "Paraphrases generated", "progress": 30, "color": "#2196F3"},
|
| 660 |
-
"getting_responses": {"status": "Getting responses
|
| 661 |
"responses_progress": {"status": "Getting responses ({completed}/{total})...", "progress": 40, "color": "#2196F3"},
|
| 662 |
"responses_complete": {"status": "All responses received", "progress": 65, "color": "#2196F3"},
|
| 663 |
-
"judging": {"status": "Analyzing responses for hallucinations...", "progress": 70, "color": "#2196F3"},
|
| 664 |
-
"complete": {"status": "Analysis complete!", "progress": 100, "color": "#4CAF50"},
|
| 665 |
"error": {"status": "Error: {error_message}", "progress": 100, "color": "#F44336"}
|
| 666 |
}
|
| 667 |
|
|
@@ -672,6 +1115,9 @@ class ProgressTracker:
|
|
| 672 |
self.completed_responses = 0
|
| 673 |
self.total_responses = 0
|
| 674 |
self.error_message = ""
|
|
|
|
|
|
|
|
|
|
| 675 |
self._lock = threading.Lock()
|
| 676 |
self._status_callback = None
|
| 677 |
self._stop_event = threading.Event()
|
|
@@ -698,6 +1144,12 @@ class ProgressTracker:
|
|
| 698 |
self.total_responses = value
|
| 699 |
elif key == 'error_message':
|
| 700 |
self.error_message = value
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 701 |
|
| 702 |
# Format status message
|
| 703 |
if stage == 'responses_progress':
|
|
@@ -705,6 +1157,19 @@ class ProgressTracker:
|
|
| 705 |
completed=self.completed_responses,
|
| 706 |
total=self.total_responses
|
| 707 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 708 |
elif stage == 'error':
|
| 709 |
self.stage_data['status'] = self.stage_data['status'].format(
|
| 710 |
error_message=self.error_message
|
|
@@ -724,6 +1189,16 @@ class ProgressTracker:
|
|
| 724 |
# Only show status text if not in idle state
|
| 725 |
status_display = f'<div class="progress-status" style="color: {color};">{status_text}</div>' if self.stage != "idle" else ''
|
| 726 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 727 |
html = f"""
|
| 728 |
<div class="progress-container">
|
| 729 |
{query_info}
|
|
@@ -731,6 +1206,7 @@ class ProgressTracker:
|
|
| 731 |
<div class="progress-bar-container">
|
| 732 |
<div class="progress-bar" style="width: {progress_width}; background-color: {color};"></div>
|
| 733 |
</div>
|
|
|
|
| 734 |
</div>
|
| 735 |
"""
|
| 736 |
return html
|
|
@@ -1099,13 +1575,18 @@ def create_interface():
|
|
| 1099 |
combined_progress_callback("starting", query=query)
|
| 1100 |
time.sleep(0.3) # Ensure starting status is visible
|
| 1101 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1102 |
# Step 2: Generate paraphrases (15-30%)
|
| 1103 |
combined_progress_callback("generating_paraphrases", query=query)
|
| 1104 |
all_queries = detector.pas2.generate_paraphrases(query)
|
| 1105 |
combined_progress_callback("paraphrases_complete", query=query, count=len(all_queries))
|
| 1106 |
|
| 1107 |
# Step 3: Get responses (35-65%)
|
| 1108 |
-
combined_progress_callback("getting_responses", query=query, total=len(all_queries))
|
| 1109 |
all_responses = []
|
| 1110 |
for i, q in enumerate(all_queries):
|
| 1111 |
# Show incremental progress for each response
|
|
@@ -1115,7 +1596,7 @@ def create_interface():
|
|
| 1115 |
combined_progress_callback("responses_complete", query=query)
|
| 1116 |
|
| 1117 |
# Step 4: Judge hallucinations (70-100%)
|
| 1118 |
-
combined_progress_callback("judging", query=query)
|
| 1119 |
|
| 1120 |
# The first query is the original, rest are paraphrases
|
| 1121 |
original_query = all_queries[0]
|
|
@@ -1141,11 +1622,13 @@ def create_interface():
|
|
| 1141 |
"confidence_score": judgment.confidence_score,
|
| 1142 |
"conflicting_facts": judgment.conflicting_facts,
|
| 1143 |
"reasoning": judgment.reasoning,
|
| 1144 |
-
"summary": judgment.summary
|
|
|
|
|
|
|
| 1145 |
}
|
| 1146 |
|
| 1147 |
# Show completion
|
| 1148 |
-
combined_progress_callback("complete", query=query)
|
| 1149 |
time.sleep(0.3) # Ensure complete status is visible
|
| 1150 |
|
| 1151 |
return results
|
|
@@ -1201,10 +1684,25 @@ def create_interface():
|
|
| 1201 |
reasoning_safe = reasoning.replace('\\', '\\\\').replace('\n', '<br>')
|
| 1202 |
conflicting_facts_text_safe = conflicting_facts_text.replace('\\', '\\\\').replace('\n', '<br>') if conflicting_facts_text else "<strong>None identified</strong>"
|
| 1203 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1204 |
html_output = f"""
|
| 1205 |
<div class="container">
|
| 1206 |
<h2 class="title">Hallucination Detection Results</h2>
|
| 1207 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1208 |
<div class="stats-section">
|
| 1209 |
<div class="stat-item">
|
| 1210 |
<div class="stat-value">{'Yes' if hallucination_detected else 'No'}</div>
|
|
@@ -1234,7 +1732,7 @@ def create_interface():
|
|
| 1234 |
{original_query}
|
| 1235 |
</div>
|
| 1236 |
|
| 1237 |
-
<div class="section-title">Original Response</div>
|
| 1238 |
<div class="response-box">
|
| 1239 |
{original_response_safe}
|
| 1240 |
</div>
|
|
@@ -1249,14 +1747,14 @@ def create_interface():
|
|
| 1249 |
{q}
|
| 1250 |
</div>
|
| 1251 |
|
| 1252 |
-
<div class="section-title">Response {i}</div>
|
| 1253 |
<div class="response-box">
|
| 1254 |
{r}
|
| 1255 |
</div>
|
| 1256 |
"""
|
| 1257 |
|
| 1258 |
html_output += f"""
|
| 1259 |
-
<div class="section-title">Detailed Analysis</div>
|
| 1260 |
<div class="info-box">
|
| 1261 |
<p><strong>Reasoning:</strong></p>
|
| 1262 |
<p>{reasoning_safe}</p>
|
|
@@ -1264,6 +1762,10 @@ def create_interface():
|
|
| 1264 |
<p><strong>Conflicting Facts:</strong></p>
|
| 1265 |
<p>{conflicting_facts_text_safe}</p>
|
| 1266 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1267 |
</div>
|
| 1268 |
"""
|
| 1269 |
|
|
@@ -1289,8 +1791,11 @@ def create_interface():
|
|
| 1289 |
]
|
| 1290 |
|
| 1291 |
# Helper function to submit feedback
|
| 1292 |
-
def combine_feedback(
|
| 1293 |
-
combined_feedback = f"{
|
|
|
|
|
|
|
|
|
|
| 1294 |
if not results:
|
| 1295 |
return "No results to attach feedback to."
|
| 1296 |
|
|
@@ -1394,8 +1899,8 @@ def create_interface():
|
|
| 1394 |
This tool implements the Paraphrase-based Approach for Scrutinizing Systems (PAS2) with a model-as-judge enhancement:
|
| 1395 |
|
| 1396 |
1. **Paraphrase Generation**: Your question is paraphrased multiple ways while preserving its core meaning
|
| 1397 |
-
2. **Multiple Responses**: All questions (original + paraphrases) are sent to
|
| 1398 |
-
3. **Expert Judgment**:
|
| 1399 |
|
| 1400 |
### Why This Approach?
|
| 1401 |
|
|
@@ -1469,10 +1974,16 @@ def create_interface():
|
|
| 1469 |
gr.Markdown("### Help Improve the System")
|
| 1470 |
gr.Markdown("Your feedback helps us refine the hallucination detection system.")
|
| 1471 |
|
| 1472 |
-
|
| 1473 |
-
label="Was
|
| 1474 |
-
choices=["Yes,
|
| 1475 |
-
value="
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1476 |
)
|
| 1477 |
|
| 1478 |
feedback_text = gr.Textbox(
|
|
@@ -1489,286 +2000,280 @@ def create_interface():
|
|
| 1489 |
gr.Markdown("## Hallucination Detection Scores")
|
| 1490 |
gr.Markdown("Performance comparison of different Generator + Judge model combinations.")
|
| 1491 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1492 |
# Create leaderboard table for model combinations
|
| 1493 |
-
model_leaderboard_html = gr.HTML(
|
| 1494 |
-
|
| 1495 |
-
|
| 1496 |
-
|
| 1497 |
-
|
| 1498 |
-
|
| 1499 |
-
<th>Generator Model</th>
|
| 1500 |
-
<th>Judge Model</th>
|
| 1501 |
-
<th>ELO Score</th>
|
| 1502 |
-
<th>Accuracy</th>
|
| 1503 |
-
<th>Consistency</th>
|
| 1504 |
-
</tr>
|
| 1505 |
-
</thead>
|
| 1506 |
-
<tbody>
|
| 1507 |
-
<tr>
|
| 1508 |
-
<td>1</td>
|
| 1509 |
-
<td>gpt-4o</td>
|
| 1510 |
-
<td>o4-mini</td>
|
| 1511 |
-
<td>1878</td>
|
| 1512 |
-
<td>94.2%</td>
|
| 1513 |
-
<td>91.6%</td>
|
| 1514 |
-
</tr>
|
| 1515 |
-
<tr>
|
| 1516 |
-
<td>2</td>
|
| 1517 |
-
<td>gpt-4o</td>
|
| 1518 |
-
<td>gemini-2.5-pro</td>
|
| 1519 |
-
<td>1835</td>
|
| 1520 |
-
<td>92.8%</td>
|
| 1521 |
-
<td>89.2%</td>
|
| 1522 |
-
</tr>
|
| 1523 |
-
<tr>
|
| 1524 |
-
<td>3</td>
|
| 1525 |
-
<td>mistral-large</td>
|
| 1526 |
-
<td>o4-mini</td>
|
| 1527 |
-
<td>1795</td>
|
| 1528 |
-
<td>91.5%</td>
|
| 1529 |
-
<td>87.5%</td>
|
| 1530 |
-
</tr>
|
| 1531 |
-
<tr>
|
| 1532 |
-
<td>4</td>
|
| 1533 |
-
<td>Qwen3-235B-A22B</td>
|
| 1534 |
-
<td>o4-mini</td>
|
| 1535 |
-
<td>1768</td>
|
| 1536 |
-
<td>90.3%</td>
|
| 1537 |
-
<td>85.1%</td>
|
| 1538 |
-
</tr>
|
| 1539 |
-
<tr>
|
| 1540 |
-
<td>5</td>
|
| 1541 |
-
<td>grok-3</td>
|
| 1542 |
-
<td>o4-mini</td>
|
| 1543 |
-
<td>1742</td>
|
| 1544 |
-
<td>88.7%</td>
|
| 1545 |
-
<td>82.9%</td>
|
| 1546 |
-
</tr>
|
| 1547 |
-
<tr>
|
| 1548 |
-
<td>6</td>
|
| 1549 |
-
<td>mistral-large</td>
|
| 1550 |
-
<td>gemini-2.5-pro</td>
|
| 1551 |
-
<td>1716</td>
|
| 1552 |
-
<td>88.1%</td>
|
| 1553 |
-
<td>81.4%</td>
|
| 1554 |
-
</tr>
|
| 1555 |
-
<tr>
|
| 1556 |
-
<td>7</td>
|
| 1557 |
-
<td>deepseek-r1</td>
|
| 1558 |
-
<td>o4-mini</td>
|
| 1559 |
-
<td>1692</td>
|
| 1560 |
-
<td>87.3%</td>
|
| 1561 |
-
<td>80.3%</td>
|
| 1562 |
-
</tr>
|
| 1563 |
-
</tbody>
|
| 1564 |
-
</table>
|
| 1565 |
-
</div>
|
| 1566 |
|
| 1567 |
-
|
| 1568 |
-
|
| 1569 |
-
|
| 1570 |
-
|
| 1571 |
-
<
|
| 1572 |
-
|
| 1573 |
-
|
| 1574 |
-
|
| 1575 |
-
|
| 1576 |
-
|
| 1577 |
-
|
| 1578 |
-
|
| 1579 |
-
|
| 1580 |
-
|
| 1581 |
-
|
| 1582 |
-
|
| 1583 |
-
|
| 1584 |
-
<
|
| 1585 |
-
|
| 1586 |
-
|
| 1587 |
-
|
| 1588 |
-
|
| 1589 |
-
|
| 1590 |
-
|
| 1591 |
-
|
| 1592 |
-
|
| 1593 |
-
|
| 1594 |
-
|
| 1595 |
-
|
| 1596 |
-
|
| 1597 |
-
|
| 1598 |
-
|
| 1599 |
-
|
| 1600 |
-
|
| 1601 |
-
|
| 1602 |
-
|
| 1603 |
-
|
| 1604 |
-
|
| 1605 |
-
|
| 1606 |
-
|
| 1607 |
-
|
| 1608 |
-
|
| 1609 |
-
|
| 1610 |
-
|
| 1611 |
-
|
| 1612 |
-
</div>
|
| 1613 |
-
</div>
|
| 1614 |
-
</div>
|
| 1615 |
-
</div>
|
| 1616 |
-
<style>
|
| 1617 |
-
.leaderboard-container {
|
| 1618 |
-
margin: 15px 0;
|
| 1619 |
-
overflow-x: auto;
|
| 1620 |
-
}
|
| 1621 |
-
.leaderboard-table {
|
| 1622 |
-
width: 100%;
|
| 1623 |
-
border-collapse: collapse;
|
| 1624 |
-
font-size: 0.95em;
|
| 1625 |
-
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
|
| 1626 |
-
border-radius: 8px;
|
| 1627 |
-
overflow: hidden;
|
| 1628 |
-
}
|
| 1629 |
-
.leaderboard-table thead {
|
| 1630 |
-
background-color: #1565c0;
|
| 1631 |
-
color: white;
|
| 1632 |
-
}
|
| 1633 |
-
.leaderboard-table th, .leaderboard-table td {
|
| 1634 |
-
padding: 12px 15px;
|
| 1635 |
-
text-align: left;
|
| 1636 |
-
border-bottom: 1px solid #ddd;
|
| 1637 |
-
}
|
| 1638 |
-
.leaderboard-table tbody tr {
|
| 1639 |
-
transition: background-color 0.3s;
|
| 1640 |
-
}
|
| 1641 |
-
.leaderboard-table tbody tr:nth-child(even) {
|
| 1642 |
-
background-color: #cfd8dc;
|
| 1643 |
-
}
|
| 1644 |
-
.leaderboard-table tbody tr:hover {
|
| 1645 |
-
background-color: #b0bec5;
|
| 1646 |
-
}
|
| 1647 |
-
.leaderboard-table tbody tr:first-child {
|
| 1648 |
-
background-color: #80cbc4;
|
| 1649 |
-
color: #004d40;
|
| 1650 |
-
}
|
| 1651 |
-
.leaderboard-table tbody tr:nth-child(2) {
|
| 1652 |
-
background-color: #81c784;
|
| 1653 |
-
color: #1b5e20;
|
| 1654 |
-
}
|
| 1655 |
-
.leaderboard-table tbody tr:nth-child(4) {
|
| 1656 |
-
background-color: #aed581;
|
| 1657 |
-
color: #33691e;
|
| 1658 |
-
}
|
| 1659 |
-
.leaderboard-table tbody tr:nth-child(6) {
|
| 1660 |
-
background-color: #d7ccc8;
|
| 1661 |
-
color: #3e2723;
|
| 1662 |
-
}
|
| 1663 |
-
</style>
|
| 1664 |
-
""")
|
| 1665 |
|
| 1666 |
-
# Tab 3:
|
| 1667 |
-
with gr.TabItem("
|
| 1668 |
-
gr.Markdown("## Model
|
| 1669 |
-
gr.Markdown("Performance ranking of
|
| 1670 |
|
| 1671 |
-
#
|
| 1672 |
-
|
| 1673 |
-
|
| 1674 |
-
|
| 1675 |
-
|
| 1676 |
-
|
| 1677 |
-
|
| 1678 |
-
|
| 1679 |
-
<
|
| 1680 |
-
|
| 1681 |
-
<
|
| 1682 |
-
|
| 1683 |
-
|
| 1684 |
-
|
| 1685 |
-
|
| 1686 |
-
|
| 1687 |
-
|
| 1688 |
-
|
| 1689 |
-
|
| 1690 |
-
|
| 1691 |
-
|
| 1692 |
-
|
| 1693 |
-
|
| 1694 |
-
|
| 1695 |
-
|
| 1696 |
-
|
| 1697 |
-
|
| 1698 |
-
|
| 1699 |
-
|
| 1700 |
-
|
| 1701 |
-
|
| 1702 |
-
|
| 1703 |
-
|
| 1704 |
-
|
| 1705 |
-
|
| 1706 |
-
|
| 1707 |
-
|
| 1708 |
-
|
| 1709 |
-
|
| 1710 |
-
|
| 1711 |
-
|
| 1712 |
-
|
| 1713 |
-
|
| 1714 |
-
<td>
|
| 1715 |
-
<td>
|
| 1716 |
-
<td>
|
| 1717 |
-
<td>
|
| 1718 |
-
<td>
|
| 1719 |
-
|
| 1720 |
-
|
| 1721 |
-
<td>
|
| 1722 |
-
|
| 1723 |
-
|
| 1724 |
-
|
| 1725 |
-
|
| 1726 |
-
|
| 1727 |
-
<
|
| 1728 |
-
|
| 1729 |
-
|
| 1730 |
-
|
| 1731 |
-
|
| 1732 |
-
|
| 1733 |
-
</
|
| 1734 |
-
|
| 1735 |
-
|
| 1736 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1737 |
|
| 1738 |
-
|
| 1739 |
-
|
| 1740 |
-
|
| 1741 |
-
|
| 1742 |
-
|
| 1743 |
-
|
| 1744 |
-
|
| 1745 |
-
|
| 1746 |
-
|
| 1747 |
-
|
| 1748 |
-
|
| 1749 |
-
|
| 1750 |
-
|
| 1751 |
-
|
| 1752 |
-
|
| 1753 |
-
|
| 1754 |
-
|
| 1755 |
-
|
| 1756 |
-
<
|
| 1757 |
-
|
| 1758 |
-
|
| 1759 |
-
|
| 1760 |
-
|
| 1761 |
-
|
| 1762 |
-
|
| 1763 |
-
|
| 1764 |
-
|
| 1765 |
-
|
| 1766 |
-
|
| 1767 |
-
|
| 1768 |
-
</
|
| 1769 |
-
|
| 1770 |
-
|
| 1771 |
-
|
|
|
|
|
|
|
|
|
|
| 1772 |
|
| 1773 |
# Function to continuously update stats
|
| 1774 |
def update_stats():
|
|
@@ -1811,30 +2316,30 @@ def create_interface():
|
|
| 1811 |
live_stats = gr.HTML(update_stats())
|
| 1812 |
|
| 1813 |
# Add loading animation style
|
| 1814 |
-
gr.HTML(
|
| 1815 |
-
|
| 1816 |
-
|
| 1817 |
-
0% { opacity: 0.6; }
|
| 1818 |
-
50% { opacity: 1; }
|
| 1819 |
-
100% { opacity: 0.6; }
|
| 1820 |
-
|
| 1821 |
-
|
| 1822 |
-
content: "
|
| 1823 |
-
display: inline-block;
|
| 1824 |
-
margin-left: 8px;
|
| 1825 |
-
animation: pulse 1.5s infinite ease-in-out;
|
| 1826 |
-
color: #2e7d32;
|
| 1827 |
-
|
| 1828 |
-
|
| 1829 |
-
border: 1px solid #b3e5fc;
|
| 1830 |
-
border-radius: 10px;
|
| 1831 |
-
padding: 15px;
|
| 1832 |
-
margin: 10px 0;
|
| 1833 |
-
background-color: #0277bd;
|
| 1834 |
-
|
| 1835 |
-
|
| 1836 |
-
|
| 1837 |
-
|
| 1838 |
|
| 1839 |
# Create a refresh button that will be auto-clicked
|
| 1840 |
refresh_btn = gr.Button("Refresh Stats", visible=False)
|
|
@@ -2018,19 +2523,13 @@ def create_interface():
|
|
| 2018 |
|
| 2019 |
feedback_button.click(
|
| 2020 |
fn=combine_feedback,
|
| 2021 |
-
inputs=[
|
| 2022 |
outputs=[feedback_status]
|
| 2023 |
)
|
| 2024 |
|
| 2025 |
# Footer
|
| 2026 |
gr.HTML(
|
| 2027 |
-
"""
|
| 2028 |
-
<footer>
|
| 2029 |
-
<p>Paraphrase-based Approach for Scrutinizing Systems (PAS2) - Advanced Hallucination Detection</p>
|
| 2030 |
-
<p>Multiple LLM models tested as generators and judges for optimal hallucination detection</p>
|
| 2031 |
-
<p><small>Models in testing: mistral-large, gpt-4o, Qwen3-235B-A22B, grok-3, o4-mini, gemini-2.5-pro, deepseek-r1</small></p>
|
| 2032 |
-
</footer>
|
| 2033 |
-
"""
|
| 2034 |
)
|
| 2035 |
|
| 2036 |
return interface
|
|
@@ -2096,4 +2595,4 @@ if __name__ == "__main__":
|
|
| 2096 |
|
| 2097 |
# Uncomment this line to run the test function instead of the main interface
|
| 2098 |
# if __name__ == "__main__":
|
| 2099 |
-
# test_progress()
|
|
|
|
| 43 |
class PAS2:
|
| 44 |
"""Paraphrase-based Approach for LLM Systems - Using llm-as-judge methods"""
|
| 45 |
|
| 46 |
+
def __init__(self, mistral_api_key=None, openai_api_key=None, xai_api_key=None, qwen_api_key=None, deepseek_api_key=None, gemini_api_key=None, progress_callback=None):
|
| 47 |
"""Initialize the PAS2 with API keys"""
|
| 48 |
# For Hugging Face Spaces, we prioritize getting API keys from HF_* environment variables
|
| 49 |
# which are set from the Secrets tab in the Space settings
|
| 50 |
self.mistral_api_key = mistral_api_key or os.environ.get("HF_MISTRAL_API_KEY") or os.environ.get("MISTRAL_API_KEY")
|
| 51 |
self.openai_api_key = openai_api_key or os.environ.get("HF_OPENAI_API_KEY") or os.environ.get("OPENAI_API_KEY")
|
| 52 |
+
self.xai_api_key = xai_api_key or os.environ.get("HF_XAI_API_KEY") or os.environ.get("XAI_API_KEY")
|
| 53 |
+
self.qwen_api_key = qwen_api_key or os.environ.get("HF_QWEN_API_KEY") or os.environ.get("QWEN_API_KEY")
|
| 54 |
+
self.deepseek_api_key = deepseek_api_key or os.environ.get("HF_DEEPSEEK_API_KEY") or os.environ.get("DEEPSEEK_API_KEY")
|
| 55 |
+
self.gemini_api_key = gemini_api_key or os.environ.get("HF_GEMINI_API_KEY") or os.environ.get("GEMINI_API_KEY")
|
| 56 |
+
|
| 57 |
self.progress_callback = progress_callback
|
| 58 |
|
| 59 |
if not self.mistral_api_key:
|
|
|
|
| 64 |
|
| 65 |
self.mistral_client = Mistral(api_key=self.mistral_api_key)
|
| 66 |
self.openai_client = OpenAI(api_key=self.openai_api_key)
|
| 67 |
+
self.xai_client = OpenAI(api_key=self.xai_api_key, base_url="https://api.x.ai/v1")
|
| 68 |
+
self.qwen_client = OpenAI(api_key=self.qwen_api_key, base_url="https://router.huggingface.co/nebius/v1")
|
| 69 |
+
self.deepseek_client = OpenAI(api_key=self.deepseek_api_key, base_url="https://api.deepseek.com")
|
| 70 |
+
self.gemini_client = OpenAI(api_key=self.gemini_api_key, base_url="https://generativelanguage.googleapis.com/v1beta/openai/")
|
| 71 |
+
|
| 72 |
+
# Define model names
|
| 73 |
self.mistral_model = "mistral-large-latest"
|
| 74 |
+
self.openai_o4mini = "o4-mini"
|
| 75 |
+
self.openai_4o = "gpt-4o"
|
| 76 |
+
self.deepseek_model = "deepseek-reasoner"
|
| 77 |
+
self.grok_model = "grok-3-beta"
|
| 78 |
+
self.qwen_model = "Qwen/Qwen3-235B-A22B"
|
| 79 |
+
self.gemini_model = "gemini-2.5-pro-preview-05-06"
|
| 80 |
|
| 81 |
+
# Create a dictionary mapping model names to their clients and model identifiers
|
| 82 |
+
self.model_configs = {
|
| 83 |
+
"mistral-large": {
|
| 84 |
+
"client": self.mistral_client,
|
| 85 |
+
"model_id": self.mistral_model,
|
| 86 |
+
"type": "mistral"
|
| 87 |
+
},
|
| 88 |
+
"o4-mini": {
|
| 89 |
+
"client": self.openai_client,
|
| 90 |
+
"model_id": self.openai_o4mini,
|
| 91 |
+
"type": "openai"
|
| 92 |
+
},
|
| 93 |
+
"gpt-4o": {
|
| 94 |
+
"client": self.openai_client,
|
| 95 |
+
"model_id": self.openai_4o,
|
| 96 |
+
"type": "openai"
|
| 97 |
+
},
|
| 98 |
+
"deepseek-reasoner": {
|
| 99 |
+
"client": self.deepseek_client,
|
| 100 |
+
"model_id": self.deepseek_model,
|
| 101 |
+
"type": "openai"
|
| 102 |
+
},
|
| 103 |
+
"grok-3": {
|
| 104 |
+
"client": self.xai_client,
|
| 105 |
+
"model_id": self.grok_model,
|
| 106 |
+
"type": "openai"
|
| 107 |
+
},
|
| 108 |
+
"qwen-235b": {
|
| 109 |
+
"client": self.qwen_client,
|
| 110 |
+
"model_id": self.qwen_model,
|
| 111 |
+
"type": "openai"
|
| 112 |
+
},
|
| 113 |
+
"gemini-2.5-pro": {
|
| 114 |
+
"client": self.gemini_client,
|
| 115 |
+
"model_id": self.gemini_model,
|
| 116 |
+
"type": "openai"
|
| 117 |
+
}
|
| 118 |
+
}
|
| 119 |
+
|
| 120 |
+
# Set default models (will be randomized later)
|
| 121 |
+
self.generator_model = "mistral-large"
|
| 122 |
+
self.judge_model = "o4-mini"
|
| 123 |
+
|
| 124 |
+
logger.info("PAS2 initialized with available models: %s", ", ".join(self.model_configs.keys()))
|
| 125 |
|
| 126 |
def generate_paraphrases(self, query: str, n_paraphrases: int = 3) -> List[str]:
|
| 127 |
"""Generate paraphrases of the input query using Mistral API"""
|
|
|
|
| 198 |
|
| 199 |
return fallback_paraphrases
|
| 200 |
|
| 201 |
+
def set_random_model_pair(self):
|
| 202 |
+
"""Randomly select a pair of generator and judge models"""
|
| 203 |
+
import random
|
| 204 |
+
|
| 205 |
+
# Get list of available models
|
| 206 |
+
available_models = list(self.model_configs.keys())
|
| 207 |
+
|
| 208 |
+
# Randomly select generator and judge models
|
| 209 |
+
self.generator_model = random.choice(available_models)
|
| 210 |
+
|
| 211 |
+
# Make sure judge is different from generator
|
| 212 |
+
judge_options = [m for m in available_models if m != self.generator_model]
|
| 213 |
+
self.judge_model = random.choice(judge_options)
|
| 214 |
+
|
| 215 |
+
logger.info("Randomly selected model pair - Generator: %s, Judge: %s",
|
| 216 |
+
self.generator_model, self.judge_model)
|
| 217 |
+
|
| 218 |
+
return self.generator_model, self.judge_model
|
| 219 |
+
|
| 220 |
def _get_single_response(self, query: str, index: int = None) -> str:
|
| 221 |
+
"""Get a single response from the selected generator model for a query"""
|
| 222 |
try:
|
| 223 |
query_description = f"Query {index}: {query}" if index is not None else f"Query: {query}"
|
| 224 |
+
logger.info("Getting response for %s using %s", query_description, self.generator_model)
|
| 225 |
start_time = time.time()
|
| 226 |
|
| 227 |
+
# Get the model configuration
|
| 228 |
+
model_config = self.model_configs[self.generator_model]
|
| 229 |
+
client = model_config["client"]
|
| 230 |
+
model_id = model_config["model_id"]
|
| 231 |
+
model_type = model_config["type"]
|
| 232 |
+
|
| 233 |
messages = [
|
| 234 |
{
|
| 235 |
"role": "system",
|
|
|
|
| 241 |
}
|
| 242 |
]
|
| 243 |
|
| 244 |
+
# Use the appropriate client and model based on the type
|
| 245 |
+
if model_type == "mistral":
|
| 246 |
+
response = client.chat.complete(
|
| 247 |
+
model=model_id,
|
| 248 |
+
messages=messages
|
| 249 |
+
)
|
| 250 |
+
result = response.choices[0].message.content
|
| 251 |
+
else: # openai-compatible API
|
| 252 |
+
response = client.chat.completions.create(
|
| 253 |
+
model=model_id,
|
| 254 |
+
messages=messages
|
| 255 |
+
)
|
| 256 |
+
result = response.choices[0].message.content
|
| 257 |
|
|
|
|
| 258 |
elapsed_time = time.time() - start_time
|
| 259 |
|
| 260 |
+
logger.info("Received response from %s for %s (%.2f seconds)",
|
| 261 |
+
self.generator_model, query_description, elapsed_time)
|
| 262 |
logger.debug("Response content for %s: %s", query_description, result[:100] + "..." if len(result) > 100 else result)
|
| 263 |
|
| 264 |
return result
|
| 265 |
|
| 266 |
except Exception as e:
|
| 267 |
+
error_msg = f"Error getting response for query '{query}' with model {self.generator_model}: {e}"
|
| 268 |
logger.error(error_msg, exc_info=True)
|
| 269 |
+
return f"Error: Failed to get response for this query with model {self.generator_model}."
|
| 270 |
|
| 271 |
def get_responses(self, queries: List[str]) -> List[str]:
|
| 272 |
"""Get responses from Mistral API for each query in parallel"""
|
|
|
|
| 326 |
logger.info("Starting hallucination detection for query: %s", query)
|
| 327 |
start_time = time.time()
|
| 328 |
|
| 329 |
+
# Randomly select a model pair for this detection
|
| 330 |
+
generator_model, judge_model = self.set_random_model_pair()
|
| 331 |
+
logger.info("Using %s as generator and %s as judge for this detection", generator_model, judge_model)
|
| 332 |
+
|
| 333 |
# Report progress
|
| 334 |
if self.progress_callback:
|
| 335 |
self.progress_callback("starting", query=query)
|
|
|
|
| 345 |
self.progress_callback("paraphrases_complete", query=query, count=len(all_queries))
|
| 346 |
|
| 347 |
# Get responses to all queries
|
| 348 |
+
logger.info("Step 2: Getting responses to all %d queries using %s", len(all_queries), generator_model)
|
| 349 |
if self.progress_callback:
|
| 350 |
+
self.progress_callback("getting_responses", query=query, total=len(all_queries), model=generator_model)
|
| 351 |
|
| 352 |
all_responses = []
|
| 353 |
for i, q in enumerate(all_queries):
|
|
|
|
| 362 |
self.progress_callback("responses_complete", query=query)
|
| 363 |
|
| 364 |
# Judge the responses for hallucinations
|
| 365 |
+
logger.info("Step 3: Judging for hallucinations using %s", judge_model)
|
| 366 |
if self.progress_callback:
|
| 367 |
+
self.progress_callback("judging", query=query, model=judge_model)
|
| 368 |
|
| 369 |
# The first query is the original, rest are paraphrases
|
| 370 |
original_query = all_queries[0]
|
|
|
|
| 390 |
"confidence_score": judgment.confidence_score,
|
| 391 |
"conflicting_facts": judgment.conflicting_facts,
|
| 392 |
"reasoning": judgment.reasoning,
|
| 393 |
+
"summary": judgment.summary,
|
| 394 |
+
"generator_model": generator_model,
|
| 395 |
+
"judge_model": judge_model
|
| 396 |
}
|
| 397 |
|
| 398 |
# Report completion
|
| 399 |
if self.progress_callback:
|
| 400 |
+
self.progress_callback("complete", query=query, generator=generator_model, judge=judge_model)
|
| 401 |
|
| 402 |
+
logger.info("Hallucination detection completed in %.2f seconds using %s (generator) and %s (judge)",
|
| 403 |
+
time.time() - start_time, generator_model, judge_model)
|
| 404 |
return results
|
| 405 |
|
| 406 |
def judge_hallucination(self,
|
|
|
|
| 409 |
paraphrased_queries: List[str],
|
| 410 |
paraphrased_responses: List[str]) -> HallucinationJudgment:
|
| 411 |
"""
|
| 412 |
+
Use the selected judge model to detect hallucinations in the responses
|
| 413 |
"""
|
| 414 |
+
logger.info("Judging hallucinations with %s model", self.judge_model)
|
| 415 |
start_time = time.time()
|
| 416 |
|
| 417 |
+
# Get the model configuration for the judge
|
| 418 |
+
model_config = self.model_configs[self.judge_model]
|
| 419 |
+
client = model_config["client"]
|
| 420 |
+
model_id = model_config["model_id"]
|
| 421 |
+
model_type = model_config["type"]
|
| 422 |
+
|
| 423 |
# Prepare the context for the judge
|
| 424 |
context = f"""
|
| 425 |
Original Question: {original_query}
|
|
|
|
| 448 |
"""
|
| 449 |
|
| 450 |
try:
|
| 451 |
+
logger.info("Sending judgment request to %s...", self.judge_model)
|
| 452 |
+
|
| 453 |
+
# Use the appropriate client and model based on the type
|
| 454 |
+
if model_type == "mistral":
|
| 455 |
+
response = client.chat.complete(
|
| 456 |
+
model=model_id,
|
| 457 |
+
messages=[
|
| 458 |
+
{"role": "system", "content": system_prompt},
|
| 459 |
+
{"role": "user", "content": f"Evaluate these responses for hallucinations:\n\n{context}"}
|
| 460 |
+
],
|
| 461 |
+
response_format={"type": "json_object"}
|
| 462 |
+
)
|
| 463 |
+
result_json = json.loads(response.choices[0].message.content)
|
| 464 |
+
else: # openai-compatible API
|
| 465 |
+
response = client.chat.completions.create(
|
| 466 |
+
model=model_id,
|
| 467 |
+
messages=[
|
| 468 |
+
{"role": "system", "content": system_prompt},
|
| 469 |
+
{"role": "user", "content": f"Evaluate these responses for hallucinations:\n\n{context}"}
|
| 470 |
+
],
|
| 471 |
+
response_format={"type": "json_object"}
|
| 472 |
+
)
|
| 473 |
+
result_json = json.loads(response.choices[0].message.content)
|
| 474 |
|
| 475 |
+
logger.debug("Received judgment response from %s: %s", self.judge_model, result_json)
|
|
|
|
| 476 |
|
| 477 |
# Create the HallucinationJudgment object from the JSON response
|
| 478 |
judgment = HallucinationJudgment(
|
|
|
|
| 484 |
)
|
| 485 |
|
| 486 |
elapsed_time = time.time() - start_time
|
| 487 |
+
logger.info("Judgment completed by %s in %.2f seconds", self.judge_model, elapsed_time)
|
| 488 |
|
| 489 |
return judgment
|
| 490 |
|
| 491 |
except Exception as e:
|
| 492 |
+
logger.error("Error in hallucination judgment with %s: %s", self.judge_model, str(e), exc_info=True)
|
| 493 |
# Return a fallback judgment
|
| 494 |
return HallucinationJudgment(
|
| 495 |
hallucination_detected=False,
|
| 496 |
confidence_score=0.0,
|
| 497 |
conflicting_facts=[],
|
| 498 |
+
reasoning=f"Failed to obtain judgment from the {self.judge_model} model: {str(e)}",
|
| 499 |
summary="Analysis failed due to API error."
|
| 500 |
)
|
| 501 |
|
|
|
|
| 612 |
"conflicting_facts": results.get('conflicting_facts', []),
|
| 613 |
"reasoning": results.get('reasoning', ''),
|
| 614 |
"summary": results.get('summary', ''),
|
| 615 |
+
"generator_model": results.get('generator_model', 'unknown'),
|
| 616 |
+
"judge_model": results.get('judge_model', 'unknown'),
|
| 617 |
"user_feedback": feedback
|
| 618 |
}
|
| 619 |
|
| 620 |
# Insert document into collection
|
| 621 |
+
result = self.feedback_collection.insert_one(document)
|
| 622 |
+
|
| 623 |
+
# Update model leaderboard scores
|
| 624 |
+
self._update_model_scores(
|
| 625 |
+
generator=results.get('generator_model', 'unknown'),
|
| 626 |
+
judge=results.get('judge_model', 'unknown'),
|
| 627 |
+
feedback=feedback,
|
| 628 |
+
hallucination_detected=results.get('hallucination_detected', False)
|
| 629 |
+
)
|
| 630 |
|
| 631 |
logger.info("Feedback saved successfully to MongoDB")
|
| 632 |
return "Feedback saved successfully!"
|
|
|
|
| 634 |
logger.error("Error saving feedback: %s", str(e), exc_info=True)
|
| 635 |
return f"Error saving feedback: {str(e)}"
|
| 636 |
|
| 637 |
+
def _update_model_scores(self, generator, judge, feedback, hallucination_detected):
    """Update the ELO scores for the generator and judge models based on feedback.

    Parses the combined free-text feedback string, maps it onto one of the
    scoring scenarios below, updates per-model stats and pair stats, and
    logs the outcome.  Silently returns if MongoDB is unavailable.
    """
    try:
        if self.db is None:
            logger.error("MongoDB connection not available. Cannot update model scores.")
            return

        # Access or create the models collection and make sure the unique
        # index on model_name exists before any writes.
        models_collection = self.db.get_collection("model_scores")
        models_collection.create_index("model_name", unique=True)

        # Interpret the user's answers embedded in the feedback text.
        actual_hallucination = "Yes, there was a hallucination" in feedback
        no_hallucination = "No, there was no hallucination" in feedback
        judge_correct = "Yes, the judge was correct" in feedback
        judge_incorrect = "No, the judge was incorrect" in feedback

        # Scenario -> (judge_score, generator_score, log message):
        #   1. hallucination + judge correct    -> reward judge, penalize generator
        #   2. no hallucination + judge correct -> reward both
        #   3. no hallucination + judge wrong   -> penalize judge, reward generator
        #   4. hallucination + judge wrong      -> penalize both
        #   "not sure" answers fall back to a neutral 0.5 for the unclear side.
        if judge_correct:
            if actual_hallucination:
                judge_score, generator_score = 1, 0
                message = "Judge %s correctly detected hallucination from generator %s"
            elif no_hallucination:
                judge_score, generator_score = 1, 1
                message = "Judge %s correctly determined no hallucination from generator %s"
            else:
                judge_score, generator_score = 1, 0.5
                message = "User confirmed judge %s was correct, but unclear about hallucination from %s"
        elif judge_incorrect:
            if no_hallucination:
                judge_score, generator_score = 0, 1
                message = "Judge %s incorrectly claimed hallucination from generator %s"
            elif actual_hallucination:
                judge_score, generator_score = 0, 0
                message = "Judge %s missed actual hallucination from generator %s"
            else:
                judge_score, generator_score = 0, 0.5
                message = "User confirmed judge %s was incorrect, but unclear about hallucination from %s"
        else:
            # User was unsure about the judge; keep both sides neutral.
            judge_score, generator_score = 0.5, 0.5
            message = "User unsure about judge %s correctness and generator %s hallucination"
        logger.info(message, judge, generator)

        # Per-model statistics for each role.
        self._update_model_stats(models_collection, generator, generator_score, "generator")
        self._update_model_stats(models_collection, judge, judge_score, "judge")

        # Pair-level statistics: detection correctness mirrors the user's
        # judgment of the judge; hallucination presence mirrors the user's
        # answer about the response content.
        self._update_model_pair_stats(generator, judge, judge_correct, actual_hallucination,
                                      generator_score, judge_score)

        logger.info("Updated model scores based on feedback: generator(%s)=%s, judge(%s)=%s",
                    generator, generator_score, judge, judge_score)

    except Exception as e:
        logger.error("Error updating model scores: %s", str(e), exc_info=True)
|
| 721 |
+
|
| 722 |
+
def _update_model_stats(self, collection, model_name, score, role):
|
| 723 |
+
"""Update statistics for a single model"""
|
| 724 |
+
# Simplified ELO calculation
|
| 725 |
+
K_FACTOR = 32 # Standard K-factor for ELO
|
| 726 |
+
|
| 727 |
+
# Get current model data or create if not exists
|
| 728 |
+
model_data = collection.find_one({"model_name": model_name})
|
| 729 |
+
|
| 730 |
+
if model_data is None:
|
| 731 |
+
# Initialize new model with default values
|
| 732 |
+
model_data = {
|
| 733 |
+
"model_name": model_name,
|
| 734 |
+
"elo_score": 1500, # Starting ELO
|
| 735 |
+
"total_samples": 0,
|
| 736 |
+
"correct_predictions": 0,
|
| 737 |
+
"accuracy": 0.0,
|
| 738 |
+
"as_generator": 0,
|
| 739 |
+
"as_judge": 0,
|
| 740 |
+
"as_generator_correct": 0,
|
| 741 |
+
"as_judge_correct": 0,
|
| 742 |
+
"neutral_samples": 0 # Add a counter for neutral samples
|
| 743 |
+
}
|
| 744 |
+
|
| 745 |
+
# Skip counting for neutral feedback (0.5)
|
| 746 |
+
if score == 0.5:
|
| 747 |
+
# Increment neutral samples counter instead
|
| 748 |
+
if "neutral_samples" not in model_data:
|
| 749 |
+
model_data["neutral_samples"] = 0
|
| 750 |
+
model_data["neutral_samples"] += 1
|
| 751 |
+
|
| 752 |
+
# Expected score based on current rating (vs average rating)
|
| 753 |
+
expected_score = 1 / (1 + 10**((1500 - model_data["elo_score"]) / 400))
|
| 754 |
+
|
| 755 |
+
# For neutral score, use a much smaller K factor to slightly adjust the ELO
|
| 756 |
+
# This handles the "unsure" case with minimal impact
|
| 757 |
+
model_data["elo_score"] = model_data["elo_score"] + (K_FACTOR/4) * (0.5 - expected_score)
|
| 758 |
+
|
| 759 |
+
# Update or insert the model data
|
| 760 |
+
collection.replace_one(
|
| 761 |
+
{"model_name": model_name},
|
| 762 |
+
model_data,
|
| 763 |
+
upsert=True
|
| 764 |
+
)
|
| 765 |
+
return
|
| 766 |
+
|
| 767 |
+
# Update sample counts for non-neutral cases
|
| 768 |
+
model_data["total_samples"] += 1
|
| 769 |
+
if role == "generator":
|
| 770 |
+
model_data["as_generator"] += 1
|
| 771 |
+
if score == 1: # Only count as correct if score is 1 (not 0)
|
| 772 |
+
model_data["as_generator_correct"] += 1
|
| 773 |
+
else: # role == "judge"
|
| 774 |
+
model_data["as_judge"] += 1
|
| 775 |
+
if score == 1: # Only count as correct if score is 1 (not 0)
|
| 776 |
+
model_data["as_judge_correct"] += 1
|
| 777 |
+
|
| 778 |
+
# Update correct predictions based on score
|
| 779 |
+
if score == 1:
|
| 780 |
+
model_data["correct_predictions"] += 1
|
| 781 |
+
|
| 782 |
+
# Calculate new accuracy
|
| 783 |
+
model_data["accuracy"] = model_data["correct_predictions"] / model_data["total_samples"]
|
| 784 |
+
|
| 785 |
+
# Update ELO score based on the specific score value (0 or 1)
|
| 786 |
+
# Expected score based on current rating (vs average rating)
|
| 787 |
+
expected_score = 1 / (1 + 10**((1500 - model_data["elo_score"]) / 400))
|
| 788 |
+
|
| 789 |
+
# Use the provided score (0 or 1)
|
| 790 |
+
actual_score = score
|
| 791 |
+
|
| 792 |
+
# New ELO calculation
|
| 793 |
+
model_data["elo_score"] = model_data["elo_score"] + K_FACTOR * (actual_score - expected_score)
|
| 794 |
+
|
| 795 |
+
# Update or insert the model data
|
| 796 |
+
collection.replace_one(
|
| 797 |
+
{"model_name": model_name},
|
| 798 |
+
model_data,
|
| 799 |
+
upsert=True
|
| 800 |
+
)
|
| 801 |
+
|
| 802 |
+
def _update_model_pair_stats(self, generator, judge, detection_correct, hallucination_detected,
                             generator_score, judge_score):
    """Update statistics for a model pair combination.

    Tracks per-pair sample counts, detection accuracy, individual role
    performance rates, a weighted consistency score, and a pair-level ELO
    rating in the ``model_pairs`` collection.
    """
    try:
        # Access or create the model pairs collection and its unique
        # compound index on (generator, judge).
        pairs_collection = self.db.get_collection("model_pairs")
        pairs_collection.create_index([("generator", 1), ("judge", 1)], unique=True)

        key = {"generator": generator, "judge": judge}
        stats = pairs_collection.find_one(key)
        if stats is None:
            # First feedback for this pair — seed with defaults.
            stats = {
                "generator": generator,
                "judge": judge,
                "elo_score": 1500,  # Starting ELO
                "total_samples": 0,
                "correct_predictions": 0,
                "accuracy": 0.0,
                "hallucinations_detected": 0,
                "generator_performance": 0.0,
                "judge_performance": 0.0,
                "consistency_score": 0.0,
            }

        # Sample counters.
        stats["total_samples"] += 1
        if detection_correct:
            stats["correct_predictions"] += 1
        if hallucination_detected:
            stats["hallucinations_detected"] += 1

        # Role-specific correctness counters (may be absent on legacy docs).
        stats.setdefault("generator_correct_count", 0)
        stats.setdefault("judge_correct_count", 0)
        if generator_score == 1:
            stats["generator_correct_count"] += 1
        if judge_score == 1:
            stats["judge_correct_count"] += 1

        # Rates derived from the counters above.
        total = stats["total_samples"]
        stats["generator_performance"] = stats["generator_correct_count"] / total
        stats["judge_performance"] = stats["judge_correct_count"] / total
        stats["accuracy"] = stats["correct_predictions"] / total

        # Consistency score: weighted average of individual performances.
        # When a hallucination was detected the judge's role is weighted more
        # heavily; otherwise both roles count equally.
        if hallucination_detected:
            stats["consistency_score"] = (0.4 * stats["generator_performance"] +
                                          0.6 * stats["judge_performance"])
        else:
            stats["consistency_score"] = (0.5 * stats["generator_performance"] +
                                          0.5 * stats["judge_performance"])

        # Pair-level ELO update (slightly lower K than single models); the
        # "actual" result is the mean of the two model scores (0-1 range).
        K_FACTOR = 24
        expected = 1 / (1 + 10 ** ((1500 - stats["elo_score"]) / 400))
        achieved = (generator_score + judge_score) / 2
        stats["elo_score"] = stats["elo_score"] + K_FACTOR * (achieved - expected)

        # Persist the updated pair document.
        pairs_collection.replace_one(key, stats, upsert=True)

        logger.info("Updated model pair stats for %s (generator) and %s (judge)", generator, judge)

    except Exception as e:
        logger.error("Error updating model pair stats: %s", str(e), exc_info=True)
        return None
|
| 896 |
+
|
| 897 |
def get_feedback_stats(self):
|
| 898 |
"""Get statistics about collected feedback from MongoDB"""
|
| 899 |
try:
|
|
|
|
| 928 |
except Exception as e:
|
| 929 |
logger.error("Error getting feedback stats: %s", str(e), exc_info=True)
|
| 930 |
return None
|
| 931 |
+
|
| 932 |
+
def get_model_leaderboard(self):
    """Get the current model leaderboard data.

    Returns documents from ``model_scores`` sorted by descending ELO, with
    ``_id`` stringified and accuracies expressed as percentages.
    """
    try:
        if self.db is None:
            logger.error("MongoDB connection not available. Cannot get model leaderboard.")
            # NOTE(review): returns None here but [] on other errors — callers
            # must handle both falsy sentinels; verify before unifying.
            return None

        # Fetch all models sorted by ELO score, best first.
        models_collection = self.db.get_collection("model_scores")
        leaderboard = list(models_collection.find().sort("elo_score", pymongo.DESCENDING))

        # Stringify ObjectIds and convert ratios into display percentages.
        for entry in leaderboard:
            entry["_id"] = str(entry["_id"])
            entry["accuracy"] = round(entry["accuracy"] * 100, 1)

            gen_total = entry.get("as_generator", 0)
            if gen_total > 0:
                entry["generator_accuracy"] = round((entry["as_generator_correct"] / gen_total) * 100, 1)
            else:
                entry["generator_accuracy"] = 0.0

            judge_total = entry.get("as_judge", 0)
            if judge_total > 0:
                entry["judge_accuracy"] = round((entry["as_judge_correct"] / judge_total) * 100, 1)
            else:
                entry["judge_accuracy"] = 0.0

        return leaderboard
    except Exception as e:
        logger.error("Error getting model leaderboard: %s", str(e), exc_info=True)
        return []
|
| 963 |
+
|
| 964 |
+
def get_pair_leaderboard(self):
    """Get the current model pair leaderboard data.

    Returns documents from ``model_pairs`` sorted by descending ELO, with
    ``_id`` stringified and ratio fields expressed as percentages.
    """
    try:
        if self.db is None:
            logger.error("MongoDB connection not available. Cannot get pair leaderboard.")
            # NOTE(review): returns None here but [] on other errors — callers
            # must handle both falsy sentinels; verify before unifying.
            return None

        # Fetch all pairs sorted by ELO score, best first.
        pairs_collection = self.db.get_collection("model_pairs")
        leaderboard = list(pairs_collection.find().sort("elo_score", pymongo.DESCENDING))

        # Stringify ObjectIds and convert ratios into display percentages.
        for entry in leaderboard:
            entry["_id"] = str(entry["_id"])
            entry["accuracy"] = round(entry["accuracy"] * 100, 1)
            entry["consistency_score"] = round(entry["consistency_score"] * 100, 1)

        return leaderboard
    except Exception as e:
        logger.error("Error getting pair leaderboard: %s", str(e), exc_info=True)
        return []
|
| 987 |
|
| 988 |
def export_data_to_csv(self, filepath=None):
|
| 989 |
"""Export all feedback data to a CSV file for analysis"""
|
|
|
|
| 1100 |
"starting": {"status": "Starting process...", "progress": 5, "color": "#2196F3"},
|
| 1101 |
"generating_paraphrases": {"status": "Generating paraphrases...", "progress": 15, "color": "#2196F3"},
|
| 1102 |
"paraphrases_complete": {"status": "Paraphrases generated", "progress": 30, "color": "#2196F3"},
|
| 1103 |
+
"getting_responses": {"status": "Getting responses using {model}...", "progress": 35, "color": "#2196F3"},
|
| 1104 |
"responses_progress": {"status": "Getting responses ({completed}/{total})...", "progress": 40, "color": "#2196F3"},
|
| 1105 |
"responses_complete": {"status": "All responses received", "progress": 65, "color": "#2196F3"},
|
| 1106 |
+
"judging": {"status": "Analyzing responses for hallucinations using {model}...", "progress": 70, "color": "#2196F3"},
|
| 1107 |
+
"complete": {"status": "Analysis complete! Using {generator} (generator) and {judge} (judge)", "progress": 100, "color": "#4CAF50"},
|
| 1108 |
"error": {"status": "Error: {error_message}", "progress": 100, "color": "#F44336"}
|
| 1109 |
}
|
| 1110 |
|
|
|
|
| 1115 |
self.completed_responses = 0
|
| 1116 |
self.total_responses = 0
|
| 1117 |
self.error_message = ""
|
| 1118 |
+
self.generator_model = ""
|
| 1119 |
+
self.judge_model = ""
|
| 1120 |
+
self.model = "" # For general model reference in status messages
|
| 1121 |
self._lock = threading.Lock()
|
| 1122 |
self._status_callback = None
|
| 1123 |
self._stop_event = threading.Event()
|
|
|
|
| 1144 |
self.total_responses = value
|
| 1145 |
elif key == 'error_message':
|
| 1146 |
self.error_message = value
|
| 1147 |
+
elif key == 'model':
|
| 1148 |
+
self.model = value
|
| 1149 |
+
elif key == 'generator':
|
| 1150 |
+
self.generator_model = value
|
| 1151 |
+
elif key == 'judge':
|
| 1152 |
+
self.judge_model = value
|
| 1153 |
|
| 1154 |
# Format status message
|
| 1155 |
if stage == 'responses_progress':
|
|
|
|
| 1157 |
completed=self.completed_responses,
|
| 1158 |
total=self.total_responses
|
| 1159 |
)
|
| 1160 |
+
elif stage == 'getting_responses' and 'model' in kwargs:
|
| 1161 |
+
self.stage_data['status'] = self.stage_data['status'].format(
|
| 1162 |
+
model=kwargs.get('model', 'selected model')
|
| 1163 |
+
)
|
| 1164 |
+
elif stage == 'judging' and 'model' in kwargs:
|
| 1165 |
+
self.stage_data['status'] = self.stage_data['status'].format(
|
| 1166 |
+
model=kwargs.get('model', 'selected model')
|
| 1167 |
+
)
|
| 1168 |
+
elif stage == 'complete' and 'generator' in kwargs and 'judge' in kwargs:
|
| 1169 |
+
self.stage_data['status'] = self.stage_data['status'].format(
|
| 1170 |
+
generator=self.generator_model,
|
| 1171 |
+
judge=self.judge_model
|
| 1172 |
+
)
|
| 1173 |
elif stage == 'error':
|
| 1174 |
self.stage_data['status'] = self.stage_data['status'].format(
|
| 1175 |
error_message=self.error_message
|
|
|
|
| 1189 |
# Only show status text if not in idle state
|
| 1190 |
status_display = f'<div class="progress-status" style="color: {color};">{status_text}</div>' if self.stage != "idle" else ''
|
| 1191 |
|
| 1192 |
+
# Add model information if available and we're not in idle or error state
|
| 1193 |
+
model_info = ''
|
| 1194 |
+
if self.stage not in ["idle", "error", "starting"] and (self.generator_model or self.judge_model):
|
| 1195 |
+
model_info = f'<div class="model-info" style="display: flex; justify-content: space-between; margin-top: 8px; font-size: 0.85em; color: #37474f; background-color: #e1f5fe; padding: 5px 10px; border-radius: 4px;">'
|
| 1196 |
+
if self.generator_model:
|
| 1197 |
+
model_info += f'<div><span style="font-weight: bold;">Generator:</span> {self.generator_model}</div>'
|
| 1198 |
+
if self.judge_model:
|
| 1199 |
+
model_info += f'<div><span style="font-weight: bold;">Judge:</span> {self.judge_model}</div>'
|
| 1200 |
+
model_info += '</div>'
|
| 1201 |
+
|
| 1202 |
html = f"""
|
| 1203 |
<div class="progress-container">
|
| 1204 |
{query_info}
|
|
|
|
| 1206 |
<div class="progress-bar-container">
|
| 1207 |
<div class="progress-bar" style="width: {progress_width}; background-color: {color};"></div>
|
| 1208 |
</div>
|
| 1209 |
+
{model_info}
|
| 1210 |
</div>
|
| 1211 |
"""
|
| 1212 |
return html
|
|
|
|
| 1575 |
combined_progress_callback("starting", query=query)
|
| 1576 |
time.sleep(0.3) # Ensure starting status is visible
|
| 1577 |
|
| 1578 |
+
# Step 1.5: Randomly select model pair
|
| 1579 |
+
generator_model, judge_model = detector.pas2.set_random_model_pair()
|
| 1580 |
+
combined_progress_callback("starting", query=query, generator=generator_model, judge=judge_model)
|
| 1581 |
+
time.sleep(0.3) # Ensure model info is visible
|
| 1582 |
+
|
| 1583 |
# Step 2: Generate paraphrases (15-30%)
|
| 1584 |
combined_progress_callback("generating_paraphrases", query=query)
|
| 1585 |
all_queries = detector.pas2.generate_paraphrases(query)
|
| 1586 |
combined_progress_callback("paraphrases_complete", query=query, count=len(all_queries))
|
| 1587 |
|
| 1588 |
# Step 3: Get responses (35-65%)
|
| 1589 |
+
combined_progress_callback("getting_responses", query=query, total=len(all_queries), model=generator_model)
|
| 1590 |
all_responses = []
|
| 1591 |
for i, q in enumerate(all_queries):
|
| 1592 |
# Show incremental progress for each response
|
|
|
|
| 1596 |
combined_progress_callback("responses_complete", query=query)
|
| 1597 |
|
| 1598 |
# Step 4: Judge hallucinations (70-100%)
|
| 1599 |
+
combined_progress_callback("judging", query=query, model=judge_model)
|
| 1600 |
|
| 1601 |
# The first query is the original, rest are paraphrases
|
| 1602 |
original_query = all_queries[0]
|
|
|
|
| 1622 |
"confidence_score": judgment.confidence_score,
|
| 1623 |
"conflicting_facts": judgment.conflicting_facts,
|
| 1624 |
"reasoning": judgment.reasoning,
|
| 1625 |
+
"summary": judgment.summary,
|
| 1626 |
+
"generator_model": generator_model,
|
| 1627 |
+
"judge_model": judge_model
|
| 1628 |
}
|
| 1629 |
|
| 1630 |
# Show completion
|
| 1631 |
+
combined_progress_callback("complete", query=query, generator=generator_model, judge=judge_model)
|
| 1632 |
time.sleep(0.3) # Ensure complete status is visible
|
| 1633 |
|
| 1634 |
return results
|
|
|
|
| 1684 |
reasoning_safe = reasoning.replace('\\', '\\\\').replace('\n', '<br>')
|
| 1685 |
conflicting_facts_text_safe = conflicting_facts_text.replace('\\', '\\\\').replace('\n', '<br>') if conflicting_facts_text else "<strong>None identified</strong>"
|
| 1686 |
|
| 1687 |
+
# Get model info from the results
|
| 1688 |
+
generator_model = results.get("generator_model", "unknown model")
|
| 1689 |
+
judge_model = results.get("judge_model", "unknown model")
|
| 1690 |
+
|
| 1691 |
html_output = f"""
|
| 1692 |
<div class="container">
|
| 1693 |
<h2 class="title">Hallucination Detection Results</h2>
|
| 1694 |
|
| 1695 |
+
<div class="model-info-bar" style="background-color: #e1f5fe; padding: 10px 15px; border-radius: 8px; margin-bottom: 15px; display: flex; justify-content: space-between;">
|
| 1696 |
+
<div style="flex: 1; text-align: center; border-right: 1px solid #b3e5fc; padding-right: 10px;">
|
| 1697 |
+
<div style="font-weight: bold; color: #0277bd;">Generator Model</div>
|
| 1698 |
+
<div style="font-size: 1.2em; color: #01579b;">{generator_model}</div>
|
| 1699 |
+
</div>
|
| 1700 |
+
<div style="flex: 1; text-align: center; padding-left: 10px;">
|
| 1701 |
+
<div style="font-weight: bold; color: #0277bd;">Judge Model</div>
|
| 1702 |
+
<div style="font-size: 1.2em; color: #01579b;">{judge_model}</div>
|
| 1703 |
+
</div>
|
| 1704 |
+
</div>
|
| 1705 |
+
|
| 1706 |
<div class="stats-section">
|
| 1707 |
<div class="stat-item">
|
| 1708 |
<div class="stat-value">{'Yes' if hallucination_detected else 'No'}</div>
|
|
|
|
| 1732 |
{original_query}
|
| 1733 |
</div>
|
| 1734 |
|
| 1735 |
+
<div class="section-title">Original Response <span style="font-size: 0.8em; color: #607d8b;">(generated by {generator_model})</span></div>
|
| 1736 |
<div class="response-box">
|
| 1737 |
{original_response_safe}
|
| 1738 |
</div>
|
|
|
|
| 1747 |
{q}
|
| 1748 |
</div>
|
| 1749 |
|
| 1750 |
+
<div class="section-title">Response {i} <span style="font-size: 0.8em; color: #607d8b;">(generated by {generator_model})</span></div>
|
| 1751 |
<div class="response-box">
|
| 1752 |
{r}
|
| 1753 |
</div>
|
| 1754 |
"""
|
| 1755 |
|
| 1756 |
html_output += f"""
|
| 1757 |
+
<div class="section-title">Detailed Analysis <span style="font-size: 0.8em; color: #607d8b;">(judged by {judge_model})</span></div>
|
| 1758 |
<div class="info-box">
|
| 1759 |
<p><strong>Reasoning:</strong></p>
|
| 1760 |
<p>{reasoning_safe}</p>
|
|
|
|
| 1762 |
<p><strong>Conflicting Facts:</strong></p>
|
| 1763 |
<p>{conflicting_facts_text_safe}</p>
|
| 1764 |
</div>
|
| 1765 |
+
|
| 1766 |
+
<div style="margin-top: 20px; border-top: 1px dashed #ccc; padding-top: 15px; font-size: 0.9em; color: #607d8b; text-align: center;">
|
| 1767 |
+
Models randomly selected for this analysis: <strong>{generator_model}</strong> (Generator) and <strong>{judge_model}</strong> (Judge)
|
| 1768 |
+
</div>
|
| 1769 |
</div>
|
| 1770 |
"""
|
| 1771 |
|
|
|
|
| 1791 |
]
|
| 1792 |
|
| 1793 |
# Helper function to submit feedback
|
| 1794 |
+
def combine_feedback(hallucination_present, judge_correct, fb_text, results):
|
| 1795 |
+
combined_feedback = f"Hallucination: {hallucination_present}, Judge Correct: {judge_correct}"
|
| 1796 |
+
if fb_text:
|
| 1797 |
+
combined_feedback += f", Comments: {fb_text}"
|
| 1798 |
+
|
| 1799 |
if not results:
|
| 1800 |
return "No results to attach feedback to."
|
| 1801 |
|
|
|
|
| 1899 |
This tool implements the Paraphrase-based Approach for Scrutinizing Systems (PAS2) with a model-as-judge enhancement:
|
| 1900 |
|
| 1901 |
1. **Paraphrase Generation**: Your question is paraphrased multiple ways while preserving its core meaning
|
| 1902 |
+
2. **Multiple Responses**: All questions (original + paraphrases) are sent to a randomly selected generator model
|
| 1903 |
+
3. **Expert Judgment**: A randomly selected judge model analyzes all responses to detect factual inconsistencies
|
| 1904 |
|
| 1905 |
### Why This Approach?
|
| 1906 |
|
|
|
|
| 1974 |
gr.Markdown("### Help Improve the System")
|
| 1975 |
gr.Markdown("Your feedback helps us refine the hallucination detection system.")
|
| 1976 |
|
| 1977 |
+
hallucination_present = gr.Radio(
|
| 1978 |
+
label="Was there actually a hallucination in the responses?",
|
| 1979 |
+
choices=["Yes, there was a hallucination", "No, there was no hallucination", "Not sure"],
|
| 1980 |
+
value="Not sure"
|
| 1981 |
+
)
|
| 1982 |
+
|
| 1983 |
+
judge_correct = gr.Radio(
|
| 1984 |
+
label="Did the judge model correctly identify the situation?",
|
| 1985 |
+
choices=["Yes, the judge was correct", "No, the judge was incorrect", "Not sure"],
|
| 1986 |
+
value="Not sure"
|
| 1987 |
)
|
| 1988 |
|
| 1989 |
feedback_text = gr.Textbox(
|
|
|
|
| 2000 |
gr.Markdown("## Hallucination Detection Scores")
|
| 2001 |
gr.Markdown("Performance comparison of different Generator + Judge model combinations.")
|
| 2002 |
|
| 2003 |
+
# Function to generate the HTML for the model pair leaderboard
|
| 2004 |
+
def generate_pair_leaderboard_html():
|
| 2005 |
+
try:
|
| 2006 |
+
# Get leaderboard data
|
| 2007 |
+
pairs = detector.get_pair_leaderboard() or []
|
| 2008 |
+
|
| 2009 |
+
if not pairs:
|
| 2010 |
+
return (
|
| 2011 |
+
"<div class=\"info-message\" style=\"padding: 20px; background-color: #e1f5fe; "
|
| 2012 |
+
"border-radius: 8px; text-align: center; margin: 20px 0;\">"
|
| 2013 |
+
"<h3 style=\"margin-top: 0; color: #0277bd;\">No Data Available Yet</h3>"
|
| 2014 |
+
"<p>Try the detector with more queries to populate the leaderboard!</p>"
|
| 2015 |
+
"</div>"
|
| 2016 |
+
)
|
| 2017 |
+
|
| 2018 |
+
# Generate table rows
|
| 2019 |
+
rows = ""
|
| 2020 |
+
for rank, pair in enumerate(pairs, 1):
|
| 2021 |
+
# Add special styling for top 3
|
| 2022 |
+
row_class = ""
|
| 2023 |
+
if rank == 1:
|
| 2024 |
+
row_class = "class='top-rank-1'"
|
| 2025 |
+
elif rank == 2:
|
| 2026 |
+
row_class = "class='top-rank-2'"
|
| 2027 |
+
elif rank == 3:
|
| 2028 |
+
row_class = "class='top-rank-3'"
|
| 2029 |
+
|
| 2030 |
+
# Format percentages for display
|
| 2031 |
+
generator_perf = f"{pair.get('generator_performance', 0) * 100:.1f}%" if 'generator_performance' in pair else "N/A"
|
| 2032 |
+
judge_perf = f"{pair.get('judge_performance', 0) * 100:.1f}%" if 'judge_performance' in pair else "N/A"
|
| 2033 |
+
consistency = f"{pair.get('consistency_score', 0)}%" if 'consistency_score' in pair else "N/A"
|
| 2034 |
+
|
| 2035 |
+
rows += (
|
| 2036 |
+
f"<tr {row_class}>"
|
| 2037 |
+
f"<td>{rank}</td>"
|
| 2038 |
+
f"<td>{pair.get('generator', 'unknown')}</td>"
|
| 2039 |
+
f"<td>{pair.get('judge', 'unknown')}</td>"
|
| 2040 |
+
f"<td>{round(pair.get('elo_score', 0))}</td>"
|
| 2041 |
+
f"<td>{pair.get('accuracy')}%</td>"
|
| 2042 |
+
f"<td style='color: #80cbc4; font-weight: 500;'>{generator_perf}</td>"
|
| 2043 |
+
f"<td style='color: #90caf9; font-weight: 500;'>{judge_perf}</td>"
|
| 2044 |
+
f"<td style='color: #ce93d8; font-weight: 500;'>{consistency}</td>"
|
| 2045 |
+
f"<td>{pair.get('total_samples', 0)}</td>"
|
| 2046 |
+
f"</tr>"
|
| 2047 |
+
)
|
| 2048 |
+
|
| 2049 |
+
# Build the full table
|
| 2050 |
+
html = (
|
| 2051 |
+
f"<div class=\"leaderboard-container\">"
|
| 2052 |
+
f"<table class=\"leaderboard-table\">"
|
| 2053 |
+
f"<thead>"
|
| 2054 |
+
f"<tr>"
|
| 2055 |
+
f"<th>Rank</th>"
|
| 2056 |
+
f"<th>Generator Model</th>"
|
| 2057 |
+
f"<th>Judge Model</th>"
|
| 2058 |
+
f"<th>ELO Score</th>"
|
| 2059 |
+
f"<th>Accuracy</th>"
|
| 2060 |
+
f"<th>Generator Perf.</th>"
|
| 2061 |
+
f"<th>Judge Perf.</th>"
|
| 2062 |
+
f"<th>Consistency</th>"
|
| 2063 |
+
f"<th>Sample Size</th>"
|
| 2064 |
+
f"</tr>"
|
| 2065 |
+
f"</thead>"
|
| 2066 |
+
f"<tbody>"
|
| 2067 |
+
f"{rows}"
|
| 2068 |
+
f"</tbody>"
|
| 2069 |
+
f"</table>"
|
| 2070 |
+
f"</div>"
|
| 2071 |
+
f"<div style='margin-top: 15px; padding: 12px; background-color: #263238; border-radius: 8px; font-size: 0.95em; color: #e0f7fa; box-shadow: 0 2px 5px rgba(0,0,0,0.2);'>"
|
| 2072 |
+
f"<p style='margin-bottom: 8px; color: #80deea;'><strong>Model Pair Performance Metrics:</strong></p>"
|
| 2073 |
+
f"<ul style='margin-top: 5px; padding-left: 20px; line-height: 1.4;'>"
|
| 2074 |
+
f"<li><strong style='color: #b2dfdb;'>Accuracy</strong>: Percentage of correct hallucination judgments based on user feedback</li>"
|
| 2075 |
+
f"<li><strong style='color: #b2dfdb;'>Generator Performance</strong>: How well the generator model avoids hallucinations</li>"
|
| 2076 |
+
f"<li><strong style='color: #b2dfdb;'>Judge Performance</strong>: How accurately the judge model identifies hallucinations</li>"
|
| 2077 |
+
f"<li><strong style='color: #b2dfdb;'>Consistency</strong>: Weighted measure of how well the pair works together</li>"
|
| 2078 |
+
f"</ul>"
|
| 2079 |
+
f"</div>"
|
| 2080 |
+
)
|
| 2081 |
+
|
| 2082 |
+
return html
|
| 2083 |
+
except Exception as e:
|
| 2084 |
+
logger.error("Error generating leaderboard HTML: %s", str(e), exc_info=True)
|
| 2085 |
+
return (
|
| 2086 |
+
f"<div class=\"error-message\" style=\"padding: 20px; background-color: #ffebee; "
|
| 2087 |
+
f"border-radius: 8px; text-align: center; margin: 20px 0;\">"
|
| 2088 |
+
f"<h3 style=\"margin-top: 0; color: #c62828;\">Error Loading Leaderboard</h3>"
|
| 2089 |
+
f"<p>{str(e)}</p>"
|
| 2090 |
+
f"</div>"
|
| 2091 |
+
)
|
| 2092 |
+
|
| 2093 |
# Create leaderboard table for model combinations
|
| 2094 |
+
model_leaderboard_html = gr.HTML(generate_pair_leaderboard_html())
|
| 2095 |
+
refresh_leaderboard_btn = gr.Button("Refresh Leaderboard", variant="primary")
|
| 2096 |
+
refresh_leaderboard_btn.click(
|
| 2097 |
+
fn=lambda: generate_pair_leaderboard_html(),
|
| 2098 |
+
outputs=[model_leaderboard_html]
|
| 2099 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2100 |
|
| 2101 |
+
# ELO rating explanation
|
| 2102 |
+
with gr.Accordion("ELO Rating System Explanation", open=False):
|
| 2103 |
+
gr.HTML(
|
| 2104 |
+
"<div style='margin-top: 20px; padding: 15px; background-color: #0d47a1; border-radius: 8px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);'>" +
|
| 2105 |
+
"<h3 style='margin-top: 0; color: #ffffff;'>ELO Rating System Explanation</h3>" +
|
| 2106 |
+
"<div style='display: flex; flex-wrap: wrap; gap: 15px; margin-top: 15px;'>" +
|
| 2107 |
+
"<div style='flex: 1; min-width: 280px; padding: 12px; background-color: #455a64; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);'>" +
|
| 2108 |
+
"<h4 style='margin-top: 0; color: #ffffff;'>How ELO Scores Are Calculated</h4>" +
|
| 2109 |
+
"<p style='color: #eceff1;'>Our ELO rating system assigns scores to model pairs based on user feedback, using the following formula:</p>" +
|
| 2110 |
+
"<div style='background-color: #37474f; padding: 12px; border-radius: 5px; color: #eceff1;'>" +
|
| 2111 |
+
"<code style='color: #80deea;'>ELO_new = ELO_old + K * (S - E)</code><br><br>" +
|
| 2112 |
+
"Where:<br>* <strong style='color: #b2dfdb;'>ELO_old</strong>: Previous rating of the model combination<br>" +
|
| 2113 |
+
"* <strong style='color: #b2dfdb;'>K</strong>: Weight factor (24 for model pairs)<br>" +
|
| 2114 |
+
"* <strong style='color: #b2dfdb;'>S</strong>: Actual score from user feedback (1 for correct, 0 for incorrect)<br>" +
|
| 2115 |
+
"* <strong style='color: #b2dfdb;'>E</strong>: Expected score based on current rating<br><br>" +
|
| 2116 |
+
"<em style='color: #80deea;'>E = 1 / (1 + 10<sup>(1500 - ELO_model)/400</sup>)</em></div></div>" +
|
| 2117 |
+
"<div style='flex: 1; min-width: 280px; padding: 12px; background-color: #455a64; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);'>" +
|
| 2118 |
+
"<h4 style='margin-top: 0; color: #ffffff;'>Available Models</h4>" +
|
| 2119 |
+
"<p style='color: #eceff1;'>The system randomly selects from these models for each hallucination detection:</p>" +
|
| 2120 |
+
"<div style='display: flex; flex-wrap: wrap; gap: 10px; margin-top: 10px;'>" +
|
| 2121 |
+
"<div style='flex: 1; min-width: 120px;'>" +
|
| 2122 |
+
"<h5 style='margin-top: 0; margin-bottom: 5px; color: #b2dfdb;'>All Models (Used as both Generator & Judge)</h5>" +
|
| 2123 |
+
"<ul style='margin-bottom: 0; padding-left: 20px; color: #eceff1;'>" +
|
| 2124 |
+
"<li>mistral-large</li><li>gpt-4o</li><li>qwen-235b</li><li>grok-3</li>" +
|
| 2125 |
+
"<li>deepseek-reasoner</li><li>o4-mini</li><li>gemini-2.5-pro</li>" +
|
| 2126 |
+
"</ul></div></div></div></div></div>"
|
| 2127 |
+
)
|
| 2128 |
+
gr.HTML(
|
| 2129 |
+
"<style>" +
|
| 2130 |
+
".leaderboard-container {margin: 15px 0; overflow-x: auto;}" +
|
| 2131 |
+
".leaderboard-table {width: 100%; border-collapse: collapse; font-size: 0.95em; " +
|
| 2132 |
+
"box-shadow: 0 2px 10px rgba(0,0,0,0.2); border-radius: 8px; overflow: hidden;}" +
|
| 2133 |
+
".leaderboard-table thead {background-color: #0d47a1; color: white;}" +
|
| 2134 |
+
".leaderboard-table th, .leaderboard-table td {padding: 12px 15px; text-align: left; border-bottom: 1px solid #37474f; color: #eceff1;}" +
|
| 2135 |
+
".leaderboard-table tbody tr {transition: background-color 0.3s;}" +
|
| 2136 |
+
".leaderboard-table tbody tr:nth-child(even) {background-color: #37474f;}" +
|
| 2137 |
+
".leaderboard-table tbody tr:nth-child(odd) {background-color: #455a64;}" +
|
| 2138 |
+
".leaderboard-table tbody tr:hover {background-color: #263238;}" +
|
| 2139 |
+
".leaderboard-table tbody tr.top-rank-1 {background-color: #004d40; color: #e0f2f1; font-weight: bold;}" +
|
| 2140 |
+
".leaderboard-table tbody tr.top-rank-2 {background-color: #1b5e20; color: #e8f5e9; font-weight: 500;}" +
|
| 2141 |
+
".leaderboard-table tbody tr.top-rank-3 {background-color: #33691e; color: #f1f8e9; font-weight: 500;}" +
|
| 2142 |
+
".leaderboard-table td {position: relative;}" +
|
| 2143 |
+
".leaderboard-table td::after {content: ''; position: absolute; top: 0; left: 0; width: 100%; height: 100%; background: transparent; pointer-events: none;}" +
|
| 2144 |
+
"</style>"
|
| 2145 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2146 |
|
| 2147 |
+
# Tab 3: Individual Models Leaderboard
|
| 2148 |
+
with gr.TabItem("Individual Models", elem_id="user-feedback-tab"):
|
| 2149 |
+
gr.Markdown("## Individual Model Performance")
|
| 2150 |
+
gr.Markdown("Performance ranking of models based on user feedback, showing statistics for both generator and judge roles.")
|
| 2151 |
|
| 2152 |
+
# Function to generate individual model leaderboard HTML
|
| 2153 |
+
def generate_model_leaderboard_html():
|
| 2154 |
+
try:
|
| 2155 |
+
# Get model scores from MongoDB
|
| 2156 |
+
models = detector.get_model_leaderboard() or []
|
| 2157 |
+
|
| 2158 |
+
if not models:
|
| 2159 |
+
return (
|
| 2160 |
+
"<div class=\"info-message\" style=\"padding: 20px; background-color: #e1f5fe; "
|
| 2161 |
+
"border-radius: 8px; text-align: center; margin: 20px 0;\">"
|
| 2162 |
+
"<h3 style=\"margin-top: 0; color: #0277bd;\">No Data Available Yet</h3>"
|
| 2163 |
+
"<p>Try the detector with more queries to populate the model scores!</p>"
|
| 2164 |
+
"</div>"
|
| 2165 |
+
)
|
| 2166 |
+
|
| 2167 |
+
# Generate table rows
|
| 2168 |
+
rows = ""
|
| 2169 |
+
for rank, model in enumerate(models, 1):
|
| 2170 |
+
# Add special styling for top 3
|
| 2171 |
+
row_class = ""
|
| 2172 |
+
if rank == 1:
|
| 2173 |
+
row_class = "class='top-rank-1'"
|
| 2174 |
+
elif rank == 2:
|
| 2175 |
+
row_class = "class='top-rank-2'"
|
| 2176 |
+
elif rank == 3:
|
| 2177 |
+
row_class = "class='top-rank-3'"
|
| 2178 |
+
|
| 2179 |
+
# Calculate role distribution
|
| 2180 |
+
as_generator = model.get('as_generator', 0)
|
| 2181 |
+
as_judge = model.get('as_judge', 0)
|
| 2182 |
+
if as_generator + as_judge > 0:
|
| 2183 |
+
generator_pct = round((as_generator / (as_generator + as_judge)) * 100)
|
| 2184 |
+
judge_pct = 100 - generator_pct
|
| 2185 |
+
role_distribution = f"{generator_pct}% / {judge_pct}%"
|
| 2186 |
+
else:
|
| 2187 |
+
role_distribution = "N/A"
|
| 2188 |
+
|
| 2189 |
+
# Format percentages with better contrast against dark background
|
| 2190 |
+
generator_acc = f"{model.get('generator_accuracy', 0.0)}%"
|
| 2191 |
+
judge_acc = f"{model.get('judge_accuracy', 0.0)}%"
|
| 2192 |
+
|
| 2193 |
+
rows += (
|
| 2194 |
+
f"<tr {row_class}>"
|
| 2195 |
+
f"<td>{rank}</td>"
|
| 2196 |
+
f"<td>{model.get('model_name', 'unknown')}</td>"
|
| 2197 |
+
f"<td>{round(model.get('elo_score', 0))}</td>"
|
| 2198 |
+
f"<td>{model.get('accuracy')}%</td>"
|
| 2199 |
+
f"<td style='color: #80cbc4; font-weight: 500;'>{generator_acc}</td>"
|
| 2200 |
+
f"<td style='color: #90caf9; font-weight: 500;'>{judge_acc}</td>"
|
| 2201 |
+
f"<td>{model.get('total_samples', 0)}</td>"
|
| 2202 |
+
f"<td style='color: #ffcc80; font-weight: 500;'>{role_distribution}</td>"
|
| 2203 |
+
f"</tr>"
|
| 2204 |
+
)
|
| 2205 |
+
|
| 2206 |
+
# Build the full table
|
| 2207 |
+
html = (
|
| 2208 |
+
f"<div class=\"leaderboard-container\">"
|
| 2209 |
+
f"<table class=\"leaderboard-table\">"
|
| 2210 |
+
f"<thead>"
|
| 2211 |
+
f"<tr>"
|
| 2212 |
+
f"<th>Rank</th>"
|
| 2213 |
+
f"<th>Model</th>"
|
| 2214 |
+
f"<th>ELO Score</th>"
|
| 2215 |
+
f"<th>Overall Accuracy</th>"
|
| 2216 |
+
f"<th>Generator Accuracy</th>"
|
| 2217 |
+
f"<th>Judge Accuracy</th>"
|
| 2218 |
+
f"<th>Sample Size</th>"
|
| 2219 |
+
f"<th>Generator/Judge Ratio</th>"
|
| 2220 |
+
f"</tr>"
|
| 2221 |
+
f"</thead>"
|
| 2222 |
+
f"<tbody>"
|
| 2223 |
+
f"{rows}"
|
| 2224 |
+
f"</tbody>"
|
| 2225 |
+
f"</table>"
|
| 2226 |
+
f"</div>"
|
| 2227 |
+
)
|
| 2228 |
+
|
| 2229 |
+
return html
|
| 2230 |
+
except Exception as e:
|
| 2231 |
+
logger.error("Error generating model leaderboard HTML: %s", str(e), exc_info=True)
|
| 2232 |
+
return (
|
| 2233 |
+
f"<div class=\"error-message\" style=\"padding: 20px; background-color: #ffebee; "
|
| 2234 |
+
f"border-radius: 8px; text-align: center; margin: 20px 0;\">"
|
| 2235 |
+
f"<h3 style=\"margin-top: 0; color: #c62828;\">Error Loading Model Leaderboard</h3>"
|
| 2236 |
+
f"<p>{str(e)}</p>"
|
| 2237 |
+
f"</div>"
|
| 2238 |
+
)
|
| 2239 |
|
| 2240 |
+
# Create leaderboard table for individual models
|
| 2241 |
+
model_scores_html = gr.HTML(generate_model_leaderboard_html())
|
| 2242 |
+
refresh_models_btn = gr.Button("Refresh Model Scores", variant="primary")
|
| 2243 |
+
refresh_models_btn.click(
|
| 2244 |
+
fn=lambda: generate_model_leaderboard_html(),
|
| 2245 |
+
outputs=[model_scores_html]
|
| 2246 |
+
)
|
| 2247 |
+
|
| 2248 |
+
# ELO rating explanation for individual models
|
| 2249 |
+
with gr.Accordion("ELO Rating Explanation for Individual Models", open=False):
|
| 2250 |
+
gr.HTML(
|
| 2251 |
+
"<div style='margin-top: 20px; padding: 15px; background-color: #0d47a1; border-radius: 8px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);'>" +
|
| 2252 |
+
"<h3 style='margin-top: 0; color: #ffffff;'>Individual Model ELO Rating System</h3>" +
|
| 2253 |
+
"<div style='display: flex; flex-wrap: wrap; gap: 15px; margin-top: 15px;'>" +
|
| 2254 |
+
"<div style='flex: 1; min-width: 280px; padding: 12px; background-color: #455a64; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);'>" +
|
| 2255 |
+
"<h4 style='margin-top: 0; color: #ffffff;'>How Individual ELO Scores Are Calculated</h4>" +
|
| 2256 |
+
"<p style='color: #eceff1;'>Our ELO rating system assigns scores to individual models based on user feedback, using the following formula:</p>" +
|
| 2257 |
+
"<div style='background-color: #37474f; padding: 12px; border-radius: 5px; color: #eceff1;'>" +
|
| 2258 |
+
"<code style='color: #80deea;'>ELO_new = ELO_old + K * (S - E)</code><br><br>" +
|
| 2259 |
+
"Where:<br>* <strong style='color: #b2dfdb;'>ELO_old</strong>: Previous rating of the model<br>" +
|
| 2260 |
+
"* <strong style='color: #b2dfdb;'>K</strong>: Weight factor (32 for individual models)<br>" +
|
| 2261 |
+
"* <strong style='color: #b2dfdb;'>S</strong>: Actual score (1 for correct judgment, 0 for incorrect)<br>" +
|
| 2262 |
+
"* <strong style='color: #b2dfdb;'>E</strong>: Expected score based on current rating<br><br>" +
|
| 2263 |
+
"<em style='color: #80deea;'>E = 1 / (1 + 10<sup>(1500 - ELO_model)/400</sup>)</em></div>" +
|
| 2264 |
+
"<p style='color: #eceff1; margin-top: 10px;'>All models start with a base ELO of 1500. Scores are updated after each user evaluation.</p></div>" +
|
| 2265 |
+
"<div style='flex: 1; min-width: 280px; padding: 12px; background-color: #455a64; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);'>" +
|
| 2266 |
+
"<h4 style='margin-top: 0; color: #ffffff;'>Interpretation Guidelines</h4>" +
|
| 2267 |
+
"<ul style='margin-bottom: 0; padding-left: 20px; color: #eceff1;'>" +
|
| 2268 |
+
"<li><strong style='color: #b2dfdb;'>1800+</strong>: Exceptional performance, very rare hallucinations</li>" +
|
| 2269 |
+
"<li><strong style='color: #b2dfdb;'>1700-1799</strong>: Superior performance, minimal hallucinations</li>" +
|
| 2270 |
+
"<li><strong style='color: #b2dfdb;'>1600-1699</strong>: Good performance, occasional hallucinations</li>" +
|
| 2271 |
+
"<li><strong style='color: #b2dfdb;'>1500-1599</strong>: Average performance</li>" +
|
| 2272 |
+
"<li><strong style='color: #b2dfdb;'><1500</strong>: Below average, frequent hallucinations</li>" +
|
| 2273 |
+
"</ul><p style='font-style: italic; color: #b3e5fc; margin-top: 10px;'>" +
|
| 2274 |
+
"Note: ELO scores are comparative and reflect relative performance between models in our specific hallucination detection tasks.</p>" +
|
| 2275 |
+
"</div></div></div>"
|
| 2276 |
+
)
|
| 2277 |
|
| 2278 |
# Function to continuously update stats
|
| 2279 |
def update_stats():
|
|
|
|
| 2316 |
live_stats = gr.HTML(update_stats())
|
| 2317 |
|
| 2318 |
# Add loading animation style
|
| 2319 |
+
gr.HTML(
|
| 2320 |
+
"<style>" +
|
| 2321 |
+
"@keyframes pulse {" +
|
| 2322 |
+
"0% { opacity: 0.6; }" +
|
| 2323 |
+
"50% { opacity: 1; }" +
|
| 2324 |
+
"100% { opacity: 0.6; }" +
|
| 2325 |
+
"}" +
|
| 2326 |
+
".refreshing::after {" +
|
| 2327 |
+
"content: \"⟳\";" +
|
| 2328 |
+
"display: inline-block;" +
|
| 2329 |
+
"margin-left: 8px;" +
|
| 2330 |
+
"animation: pulse 1.5s infinite ease-in-out;" +
|
| 2331 |
+
"color: #2e7d32;" +
|
| 2332 |
+
"}" +
|
| 2333 |
+
"#stats-container {" +
|
| 2334 |
+
"border: 1px solid #b3e5fc;" +
|
| 2335 |
+
"border-radius: 10px;" +
|
| 2336 |
+
"padding: 15px;" +
|
| 2337 |
+
"margin: 10px 0;" +
|
| 2338 |
+
"background-color: #0277bd;" +
|
| 2339 |
+
"}" +
|
| 2340 |
+
"</style>" +
|
| 2341 |
+
"<div class=\"refreshing\" style=\"text-align: right; font-size: 0.8em; color: #eceff1;\">Auto-refreshing</div>"
|
| 2342 |
+
)
|
| 2343 |
|
| 2344 |
# Create a refresh button that will be auto-clicked
|
| 2345 |
refresh_btn = gr.Button("Refresh Stats", visible=False)
|
|
|
|
| 2523 |
|
| 2524 |
feedback_button.click(
|
| 2525 |
fn=combine_feedback,
|
| 2526 |
+
inputs=[hallucination_present, judge_correct, feedback_text, hidden_results],
|
| 2527 |
outputs=[feedback_status]
|
| 2528 |
)
|
| 2529 |
|
| 2530 |
# Footer
|
| 2531 |
gr.HTML(
|
| 2532 |
+
"""<footer><p>Paraphrase-based Approach for Scrutinizing Systems (PAS2) - Advanced Hallucination Detection</p><p>Multiple LLM models tested as generators and judges for optimal hallucination detection</p><p><small>Models in testing: mistral-large, gpt-4o, Qwen3-235B-A22B, grok-3, o4-mini, gemini-2.5-pro, deepseek-r1</small></p></footer>"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2533 |
)
|
| 2534 |
|
| 2535 |
return interface
|
|
|
|
| 2595 |
|
| 2596 |
# Uncomment this line to run the test function instead of the main interface
|
| 2597 |
# if __name__ == "__main__":
|
| 2598 |
+
# test_progress()
|