Spaces:

lcipolina
/

game_reasoning_arena

Sleeping

App Files Files Community

game_reasoning_arena / app.py

lcipolina

Fixed long names on Leaderboard

6268f97 verified 8 days ago

raw

history blame contribute delete

46.3 kB

	#!/usr/bin/env python3
	"""
	Game Reasoning Arena — Hugging Face Spaces Gradio App

	This module provides a web interface for playing games between humans and AI agents,
	analyzing LLM performance, and visualizing game statistics.

	Pipeline:
	User clicks "Start Game" in Gradio
	↓
	app.py (play_game)
	↓
	ui/gradio_config_generator.py (run_game_with_existing_infrastructure)
	↓
	src/game_reasoning_arena/ (core game infrastructure)
	↓
	Game results + metrics displayed in Gradio

	Features:
	- Interactive human vs AI gameplay
	- LLM leaderboards and performance metrics
	- Real-time game visualization
	- Database management for results
	"""

	from __future__ import annotations

	# =============================================================================
	# IMPORTS
	# =============================================================================

	# Standard library imports
	import sqlite3
	import sys
	import shutil
	from pathlib import Path
	from typing import List, Dict, Any, Tuple, Generator, TypedDict

	# Third-party imports
	import pandas as pd
	import gradio as gr

	# Logging configuration
	import logging
	logging.basicConfig(level=logging.INFO)
	log = logging.getLogger("arena_space")

	# Optional transformers import
	try:
	from transformers import pipeline # noqa: F401
	except Exception:
	pass

	# =============================================================================
	# PATH SETUP & CORE IMPORTS
	# =============================================================================

	# Make sure src is on PYTHONPATH
	src_path = Path(__file__).parent / "src"
	if str(src_path) not in sys.path:
	sys.path.insert(0, str(src_path))

	# Game arena core imports
	from game_reasoning_arena.arena.games.registry import (
	registry as games_registry
	)
	from game_reasoning_arena.backends.huggingface_backend import (
	HuggingFaceBackend,
	)
	from game_reasoning_arena.backends import (
	initialize_llm_registry, LLM_REGISTRY,
	)

	# UI utilities
	from ui.utils import clean_model_name

	# =============================================================================
	# GLOBAL CONFIGURATION
	# =============================================================================

	# Feature flag consulted by the player-configuration helpers below.
	BACKEND_SYSTEM_AVAILABLE = True

	# Small, CPU-friendly HuggingFace checkpoints offered by the demo.
	# Every entry maps the model identifier to itself.
	HUGGINGFACE_MODELS: Dict[str, str] = {
	    model_id: model_id
	    for model_id in (
	        "gpt2",
	        "distilgpt2",
	        "google/flan-t5-small",
	        "EleutherAI/gpt-neo-125M",
	    )
	}

	# Name -> game entry; starts empty and is filled from the arena registry.
	GAMES_REGISTRY: Dict[str, Any] = {}

	# Directory (next to this file) holding the per-agent SQLite result files.
	db_dir = Path(__file__).resolve().parent / "results"

	# Columns shown on the leaderboard, in display order.
	# "avg_generation_time (sec)" is deliberately left out until it is fixed.
	LEADERBOARD_COLUMNS = [
	    "agent_name",
	    "agent_type",
	    "# game instances",
	    "total rewards",
	    "win-rate",
	    "win vs_random (%)",
	]

	# =============================================================================
	# BACKEND INITIALIZATION
	# =============================================================================

	# Create the HuggingFace backend and register every demo model it reports
	# as available under an "hf_<model>" key in the shared LLM registry.
	huggingface_backend = None
	if BACKEND_SYSTEM_AVAILABLE:
	    try:
	        huggingface_backend = HuggingFaceBackend()
	        initialize_llm_registry()

	        for model_name in HUGGINGFACE_MODELS:
	            if not huggingface_backend.is_model_available(model_name):
	                continue
	            registry_key = f"hf_{model_name}"
	            LLM_REGISTRY[registry_key] = {
	                "backend": huggingface_backend,
	                "model_name": model_name,
	            }
	            log.info("Registered HuggingFace model: %s", registry_key)
	    except Exception as e:
	        # Start-up boundary: keep the app alive, but record the traceback
	        # and lower the availability flag so the UI stops advertising
	        # models whose backend could not be initialised.
	        log.exception("Failed to initialize HuggingFace backend: %s", e)
	        huggingface_backend = None
	        BACKEND_SYSTEM_AVAILABLE = False

	# =============================================================================
	# GAMES REGISTRY SETUP
	# =============================================================================

	# Copy the arena's registered games into GAMES_REGISTRY. Any failure
	# (or a missing registry object) leaves the mapping empty.
	try:
	    if games_registry is None:
	        GAMES_REGISTRY = {}
	    else:
	        GAMES_REGISTRY = dict(games_registry._registry.items())
	        log.info("Successfully imported full arena - games are playable.")
	except Exception as e:
	    log.warning("Failed to load games registry: %s", e)
	    GAMES_REGISTRY = {}


	def _get_game_display_mapping() -> Dict[str, str]:
	    """
	    Map each internal game key to the label shown in the UI.

	    A registry entry that is a dict with a non-empty "display_name" uses
	    that value; every other entry falls back to the key with underscores
	    replaced by spaces and title-cased. An absent registry yields an
	    empty mapping.

	    Returns:
	        Dict mapping internal game keys to display names
	    """
	    if games_registry is None or not hasattr(games_registry, "_registry"):
	        return {}

	    def _label_for(key, info):
	        explicit = info.get("display_name") if isinstance(info, dict) else None
	        return explicit or key.replace("_", " ").title()

	    return {
	        key: _label_for(key, info)
	        for key, info in games_registry._registry.items()
	    }


	# =============================================================================
	# DATABASE HELPER FUNCTIONS
	# =============================================================================

	def ensure_results_dir() -> None:
	    """Make sure the results directory (and any missing parents) exists."""
	    Path.mkdir(db_dir, parents=True, exist_ok=True)


	def iter_agent_databases() -> Generator[Tuple[str, str, str], None, None]:
	    """
	    Walk the result databases, skipping those of the random agent.

	    Yields:
	        Tuple of (database file path, agent type, model name)
	    """
	    for path in find_or_download_db():
	        kind, model = extract_agent_info(path)
	        if kind == "random":
	            continue
	        yield path, kind, model


	def find_or_download_db() -> List[str]:
	    """
	    List the .db files in the results directory.

	    The directory is created when missing, and a placeholder
	    random_None.db containing an empty "games" table is created the
	    first time it is found absent.

	    Returns:
	        List of database file paths
	    """
	    ensure_results_dir()

	    placeholder = db_dir / "random_None.db"
	    if not placeholder.exists():
	        create_games_table = """
	            CREATE TABLE IF NOT EXISTS games (
	                id INTEGER PRIMARY KEY,
	                game_name TEXT,
	                player1 TEXT,
	                player2 TEXT,
	                winner INTEGER,
	                timestamp TEXT
	            )
	            """
	        connection = sqlite3.connect(str(placeholder))
	        try:
	            connection.execute(create_games_table)
	            connection.commit()
	        finally:
	            connection.close()

	    return list(map(str, db_dir.glob("*.db")))


	def extract_agent_info(filename: str) -> Tuple[str, str]:
	    """
	    Split a database filename into agent type and model name.

	    The file stem is cut at its first underscore: the left part is the
	    agent type and the remainder is the model name. A stem without an
	    underscore yields the model name "Unknown".

	    Args:
	        filename: Database filename (e.g., "llm_gpt2.db")

	    Returns:
	        Tuple of (agent_type, model_name)
	    """
	    agent_type, separator, model_name = Path(filename).stem.partition("_")
	    if not separator:
	        return agent_type, "Unknown"
	    return agent_type, model_name


	def get_available_games(include_aggregated: bool = True) -> List[str]:
	    """
	    List the selectable game keys.

	    Uses the sorted keys of GAMES_REGISTRY, or a fixed three-game default
	    list when the registry is empty.

	    Args:
	        include_aggregated: Whether to put "Aggregated Performance" first

	    Returns:
	        List of available game names
	    """
	    fallback = ["tic_tac_toe", "kuhn_poker", "connect_four"]
	    names = sorted(GAMES_REGISTRY) if GAMES_REGISTRY else fallback
	    if include_aggregated:
	        return ["Aggregated Performance", *names]
	    return names


	def extract_illegal_moves_summary() -> pd.DataFrame:
	    """
	    Count the rows of the illegal_moves table in every agent database.

	    A database that cannot be queried (for example because it has no
	    illegal_moves table) is reported with a count of 0 and a warning is
	    logged, so one bad file does not hide the other agents.

	    Returns:
	        DataFrame with columns "agent_name" and "illegal_moves". The
	        columns are present even when there are no agent databases, so
	        consumers can always reference them by name.
	    """
	    columns = ["agent_name", "illegal_moves"]
	    summary = []
	    for db_file, _agent_type, model_name in iter_agent_databases():
	        conn = sqlite3.connect(db_file)
	        try:
	            row = conn.execute(
	                "SELECT COUNT(*) AS illegal_moves FROM illegal_moves"
	            ).fetchone()
	            count = int(row[0]) if row else 0
	        except sqlite3.Error as exc:
	            log.warning(
	                "Could not count illegal moves in %s: %s", db_file, exc
	            )
	            count = 0
	        finally:
	            conn.close()
	        summary.append(
	            {"agent_name": clean_model_name(model_name), "illegal_moves": count}
	        )
	    return pd.DataFrame(summary, columns=columns)


	# =============================================================================
	# PLAYER CONFIGURATION & TYPE DEFINITIONS
	# =============================================================================


	class PlayerConfigData(TypedDict, total=False):
	    """Type definition for player configuration data."""

	    # Internal player-type keys, in dropdown order.
	    player_types: List[str]
	    # Internal player-type key -> label shown in the UI.
	    player_type_display: Dict[str, str]
	    # Model identifiers that can back an LLM player.
	    available_models: List[str]


	class GameArenaConfig(TypedDict, total=False):
	    """Type definition for game arena configuration."""

	    # Game labels shown in the game dropdown.
	    available_games: List[str]
	    # Game label -> internal game key; returned by create_player_config
	    # and read back by the UI handlers, so it is part of the contract.
	    game_display_to_key: Dict[str, str]
	    player_config: PlayerConfigData
	    # Human-readable note about backend/model availability.
	    model_info: str
	    backend_available: bool


	def setup_player_config(
	    player_type: str, player_model: str, player_id: str
	) -> Dict[str, Any]:
	    """
	    Translate a player dropdown selection into a runner agent config.

	    Anything that cannot be resolved to a human player or a known
	    HuggingFace model ends up as a random agent.

	    Args:
	        player_type: Display label (or internal key) of the player type
	        player_model: Model name, consulted only for the generic "llm" type
	        player_id: Player identifier (not used by this function)

	    Returns:
	        Agent configuration dictionary
	    """
	    random_agent = {"type": "random"}

	    # Invert key -> label so the UI label resolves to its internal key.
	    labels = create_player_config()["player_config"]["player_type_display"]
	    label_to_key = {label: key for key, label in labels.items()}
	    internal_key = label_to_key.get(player_type, player_type)

	    if internal_key == "random_bot":
	        return random_agent
	    if internal_key == "human":
	        return {"type": "human"}

	    # Keys such as "hf_gpt2" carry the model id after the first underscore.
	    if internal_key and internal_key.startswith(("llm_", "hf_")):
	        model_id = internal_key.split("_", 1)[1]
	        if BACKEND_SYSTEM_AVAILABLE and model_id in HUGGINGFACE_MODELS:
	            return {"type": "llm", "model": model_id}

	    generic_llm_selected = (
	        internal_key == "llm"
	        and player_model in HUGGINGFACE_MODELS
	        and BACKEND_SYSTEM_AVAILABLE
	    )
	    if generic_llm_selected:
	        return {"type": "llm", "model": player_model}

	    return random_agent


	def create_player_config(include_aggregated: bool = False) -> GameArenaConfig:
	    """
	    Assemble the game and player options used by the UI.

	    Args:
	        include_aggregated: Whether to include aggregated stats option

	    Returns:
	        Complete game arena configuration
	    """
	    game_keys = get_available_games(include_aggregated=include_aggregated)
	    key_to_display = _get_game_display_mapping()

	    # Label -> internal key; the first key wins when two games share a
	    # label, so the dict's key order is also the de-duplicated game list.
	    display_to_key = {}
	    for game_key in game_keys:
	        label = key_to_display.get(
	            game_key, game_key.replace("_", " ").title()
	        )
	        display_to_key.setdefault(label, game_key)
	    available_games = list(display_to_key)

	    # Player types: the two built-ins first, then one per HuggingFace model.
	    player_type_display = {
	        "human": "Human Player",
	        "random_bot": "Random Bot",
	    }
	    if BACKEND_SYSTEM_AVAILABLE:
	        known_labels = {
	            "gpt2": "GPT-2",
	            "distilgpt2": "DistilGPT-2",
	            "flan-t5-small": "FLAN-T5 Small",
	            "gpt-neo-125M": "GPT-Neo 125M",
	        }
	        for model_key in HUGGINGFACE_MODELS:
	            # Drop any "org/" prefix before looking up the label.
	            tag = model_key.split("/")[-1]
	            # Models without a curated label get a title-cased tag.
	            player_type_display[f"hf_{model_key}"] = known_labels.get(
	                tag, tag.replace("-", " ").title()
	            )
	    player_types = list(player_type_display)

	    if BACKEND_SYSTEM_AVAILABLE:
	        model_info = (
	            "HuggingFace transformer models integrated with backend system."
	        )
	    else:
	        model_info = "Backend system not available - limited functionality."

	    return {
	        "available_games": available_games,
	        "game_display_to_key": display_to_key,
	        "player_config": {
	            "player_types": player_types,
	            "player_type_display": player_type_display,
	            "available_models": list(HUGGINGFACE_MODELS.keys()),
	        },
	        "model_info": model_info,
	        "backend_available": BACKEND_SYSTEM_AVAILABLE,
	    }


	# =============================================================================
	# MAIN GAME LOGIC
	# =============================================================================

	def play_game(
	    game_name: str,
	    player1_type: str,
	    player2_type: str,
	    rounds: int = 1,
	    seed: int | None = None,
	) -> str:
	    """
	    Execute a complete game simulation between two players.

	    Display labels for the game and the players are mapped back to their
	    internal keys before the run is delegated to
	    run_game_with_existing_infrastructure.

	    Args:
	        game_name: Name of the game to play (display label or internal key)
	        player1_type: Type of player 1 (display name like "Human Player", "GPT-2")
	        player2_type: Type of player 2 (display name like "Human Player", "GPT-2")
	        rounds: Number of rounds to play
	        seed: Random seed for reproducibility; derived from the clock if None

	    Returns:
	        Game result log as string, or an error message if the run failed
	    """
	    import time

	    if game_name == "No Games Found":
	        return "No games available. Please add game databases."

	    log.info(
	        "Starting game: %s | P1=%s P2=%s rounds=%d",
	        game_name,
	        player1_type,
	        player2_type,
	        rounds,
	    )

	    # Map the human-friendly game name back to its internal key if needed.
	    config = create_player_config()
	    game_name = config.get("game_display_to_key", {}).get(game_name, game_name)

	    # Map display labels for player types back to internal keys.
	    display_to_key = {
	        label: key
	        for key, label in config["player_config"]["player_type_display"].items()
	    }
	    p1_key = display_to_key.get(player1_type, player1_type)
	    p2_key = display_to_key.get(player2_type, player2_type)

	    # "hf_<model>" keys carry the model id after the first underscore.
	    player1_model = p1_key.split("_", 1)[1] if p1_key.startswith("hf_") else None
	    player2_model = p2_key.split("_", 1)[1] if p2_key.startswith("hf_") else None

	    try:
	        from ui.gradio_config_generator import (
	            run_game_with_existing_infrastructure,
	        )

	        # Use a clock-derived seed if none was provided.
	        if seed is None:
	            seed = int(time.time() * 1000) % (2**31 - 1)
	        return run_game_with_existing_infrastructure(
	            game_name=game_name,
	            player1_type=p1_key,
	            player2_type=p2_key,
	            player1_model=player1_model,
	            player2_model=player2_model,
	            rounds=rounds,
	            seed=seed,
	        )
	    except Exception as e:
	        # UI boundary: the user gets a readable message, while the full
	        # traceback goes to the log instead of being discarded.
	        log.exception("Game simulation failed")
	        return f"Error during game simulation: {e}"


	# =============================================================================
	# LEADERBOARD & ANALYTICS
	# =============================================================================

	def extract_leaderboard_stats(game_name: str) -> pd.DataFrame:
	    """
	    Extract leaderboard statistics for a specific game or all games.

	    One row is produced per non-random agent database. A database that
	    cannot be queried (for example an uploaded file without a
	    game_results table) is skipped with a warning instead of aborting
	    the whole leaderboard.

	    Args:
	        game_name: Internal game key, or "Aggregated Performance" for
	            totals across every game in each database

	    Returns:
	        DataFrame with the LEADERBOARD_COLUMNS columns (empty if no
	        database produced a row)
	    """
	    aggregated = game_name == "Aggregated Performance"
	    game_filter = "" if aggregated else " WHERE game_name = ?"
	    params = () if aggregated else (game_name,)

	    # All counters come from a single pass over game_results. Only constant
	    # SQL fragments are concatenated; the game name is a bound parameter.
	    query = (
	        "SELECT COUNT(*), "
	        "SUM(reward), "
	        "SUM(CASE WHEN reward > 0 THEN 1 ELSE 0 END), "
	        "SUM(CASE WHEN opponent = 'random_None' THEN 1 ELSE 0 END), "
	        "SUM(CASE WHEN opponent = 'random_None' AND reward > 0 "
	        "THEN 1 ELSE 0 END) "
	        "FROM game_results" + game_filter
	    )

	    rows = []
	    for db_file, agent_type, model_name in iter_agent_databases():
	        conn = sqlite3.connect(db_file)
	        try:
	            result = conn.execute(query, params).fetchone()
	        except sqlite3.Error as exc:
	            log.warning("Skipping %s in leaderboard: %s", db_file, exc)
	            continue
	        finally:
	            conn.close()

	        # SUM() yields NULL when no row matches; treat that as zero.
	        games_played, reward_sum, wins, total_vs_random, wins_vs_random = (
	            value or 0 for value in result
	        )
	        games_played = int(games_played)

	        # NOTE(review): the halving is kept from the previous
	        # implementation; its rationale is not visible here - confirm.
	        total_rewards = float(reward_sum) / 2.0

	        # Overall win rate: share of all recorded games with a positive
	        # reward. It used to be a copy of the vs-random rate.
	        win_rate = (wins / games_played) * 100.0 if games_played > 0 else 0.0
	        vs_random_rate = (
	            (wins_vs_random / total_vs_random) * 100.0
	            if total_vs_random > 0
	            else 0.0
	        )

	        rows.append(
	            {
	                "agent_name": clean_model_name(model_name),
	                "agent_type": agent_type,
	                "# game instances": games_played,
	                "total rewards": total_rewards,
	                "win-rate": round(win_rate, 2),
	                "win vs_random (%)": round(vs_random_rate, 2),
	            }
	        )

	    # The explicit column list keeps the schema stable even with no rows.
	    return pd.DataFrame(rows, columns=LEADERBOARD_COLUMNS)


	# =============================================================================
	# VISUALIZATION HELPERS
	# =============================================================================

	def create_bar_plot(
	    data: pd.DataFrame,
	    x_col: str,
	    y_col: str,
	    title: str,
	    x_label: str,
	    y_label: str,
	    horizontal: bool = False,
	) -> gr.BarPlot:
	    """
	    Build a Gradio bar plot, optionally with the axes exchanged.

	    Args:
	        data: DataFrame containing the data
	        x_col: Column name for x-axis
	        y_col: Column name for y-axis
	        title: Plot title
	        x_label: X-axis label
	        y_label: Y-axis label
	        horizontal: When True, columns and labels of the two axes are
	            swapped so the metric runs along x and the names along y

	    Returns:
	        Gradio BarPlot component
	    """
	    if horizontal:
	        x_col, y_col = y_col, x_col
	        x_label, y_label = y_label, x_label

	    return gr.BarPlot(
	        value=data,
	        x=x_col,
	        y=y_col,
	        title=title,
	        x_label=x_label,
	        y_label=y_label,
	    )


	# =============================================================================
	# FILE UPLOAD HANDLERS
	# =============================================================================

	def handle_db_upload(files: list[gr.File]) -> str:
	    """
	    Move uploaded database files into the results directory.

	    Args:
	        files: Uploaded files, either as path strings or as objects
	            exposing the temporary path through a ``name`` attribute
	            (which form arrives depends on the Gradio version/config)

	    Returns:
	        Status message about upload success
	    """
	    ensure_results_dir()
	    saved = []
	    for f in files or []:
	        src = Path(getattr(f, "name", f))
	        # The upload widget filters by extension only on the client side;
	        # enforce it here as well since the file comes from the user.
	        if src.suffix.lower() != ".db":
	            log.warning("Ignoring non-.db upload: %s", src.name)
	            continue
	        # Using only the base name keeps the destination inside db_dir.
	        dest = db_dir / src.name
	        # shutil.move also works when the temporary upload directory and
	        # the results directory live on different filesystems, where a
	        # plain rename/replace fails.
	        shutil.move(str(src), str(dest))
	        saved.append(dest.name)
	    return (
	        f"Uploaded: {', '.join(saved)}" if saved else "No files uploaded."
	    )


	# =============================================================================
	# GRADIO USER INTERFACE
	# =============================================================================

	"""
	This section defines the complete Gradio web interface with the following tabs:
	1. Game Arena: Interactive gameplay between humans and AI
	2. Leaderboard: Performance statistics and rankings
	3. Metrics Dashboard: Visual analytics and charts
	4. Analysis of LLM Reasoning: Illegal moves and behavior analysis
	5. About: Documentation and information

	The interface supports:
	- Real-time human vs AI gameplay
	- Automatic AI move processing
	- Dynamic dropdown population
	- State management for interactive games
	- File upload for database results
	- Interactive visualizations
	"""

	with gr.Blocks() as interface:
	# =========================================================================
	# TAB 1: GAME ARENA
	# =========================================================================

	with gr.Tab("Game Arena"):
	config = create_player_config(include_aggregated=False)

	# Header and introduction
	gr.Markdown("# Interactive Game Reasoning Arena")
	gr.Markdown("Play games against LLMs, a random bot or watch LLMs compete!")
	gr.Markdown(
	f"> 🤖 Available AI Players: {config['model_info']}\n"
	"> Local transformer models run with Hugging Face transformers. "
	"No API tokens required!\n\n"
	"> ⚠️ Note on Reasoning Quality: The available models are "
	"relatively basic (GPT-2, DistilGPT-2, etc.) and may produce "
	"limited or nonsensical reasoning. They are suitable for "
	"demonstration purposes but don't expect sophisticated "
	"strategic thinking or coherent explanations."
	)

	# Game selection and configuration
	with gr.Row():
	game_dropdown = gr.Dropdown(
	choices=config["available_games"],
	label="Select a Game",
	value=(
	config["available_games"][0]
	if config["available_games"]
	else "No Games Found"
	),
	)
	rounds_slider = gr.Slider(
	minimum=1,
	maximum=10,
	value=1,
	step=1,
	label="Number of Rounds",
	)

	def player_selector_block(label: str):
	    """Render a heading plus a player-type dropdown and return the dropdown."""
	    gr.Markdown(f"### {label}")

	    player_cfg = config["player_config"]
	    labels_by_key = player_cfg["player_type_display"]
	    # What the user sees: one display label per internal player type.
	    display_choices = [
	        labels_by_key[type_key] for type_key in player_cfg["player_types"]
	    ]

	    # NOTE(review): every selector starts on the first label, so both
	    # players begin with the same selection - confirm this is intended.
	    return gr.Dropdown(
	        choices=display_choices,
	        label=f"{label}",
	        value=display_choices[0] if display_choices else None,
	    )

	# Player configuration
	with gr.Row():
	p1_type = player_selector_block("Player 0")
	p2_type = player_selector_block("Player 1")

	# Validation error message
	validation_error = gr.Markdown(visible=False)

	# Game state management
	game_state = gr.State(value=None)
	human_choices_p0 = gr.State([])
	human_choices_p1 = gr.State([])

	# Interactive game components (initially hidden)
	with gr.Column(visible=False) as interactive_panel:
	gr.Markdown("## Interactive Game")

	with gr.Row():
	with gr.Column(scale=2):
	board_display = gr.Textbox(
	label="Game Board",
	lines=10,
	placeholder="Board state will appear here...",
	interactive=False,
	)

	with gr.Column(scale=1):
	# Human move controls
	gr.Markdown("### Your Move")

	# Player 0 move selection
	human_move_p0 = gr.Dropdown(
	choices=[],
	label="Your move (Player 0)",
	visible=False,
	interactive=True,
	)

	# Player 1 move selection
	human_move_p1 = gr.Dropdown(
	choices=[],
	label="Your move (Player 1)",
	visible=False,
	interactive=True,
	)

	submit_btn = gr.Button(
	"Submit Move",
	variant="primary",
	visible=False
	)

	reset_game_btn = gr.Button(
	"Reset Game",
	visible=False
	)

	# Game control buttons
	play_button = gr.Button("🎮 Start Game", variant="primary")
	start_btn = gr.Button(
	"🎯 Start Interactive Game",
	variant="secondary",
	visible=False
	)

	# Game output display
	game_output = gr.Textbox(
	label="Game Log",
	lines=20,
	placeholder="Game results will appear here...",
	)

	def check_for_human_players(p1_type, p2_type):
	    """Decide which controls are visible for the chosen player types."""
	    labels = config["player_config"]["player_type_display"]
	    key_for_label = {label: key for key, label in labels.items()}
	    selected_keys = (
	        key_for_label.get(p1_type, p1_type),
	        key_for_label.get(p2_type, p2_type),
	    )

	    has_human = "human" in selected_keys
	    # Order: interactive_panel, start_btn, play_button (single-shot).
	    return (
	        gr.update(visible=has_human),
	        gr.update(visible=has_human),
	        gr.update(visible=not has_human),
	    )

	def validate_player_selection(p1_type, p2_type):
	    """Recompute both dropdowns' choices and the validation message."""
	    player_cfg = config["player_config"]
	    labels = player_cfg["player_type_display"]
	    key_for_label = {label: key for key, label in labels.items()}
	    p1_is_human = key_for_label.get(p1_type, p1_type) == "human"
	    p2_is_human = key_for_label.get(p2_type, p2_type) == "human"

	    all_choices = [labels[type_key] for type_key in player_cfg["player_types"]]

	    def _choices(opponent_is_human):
	        # A side may not pick "human" while the opponent already has.
	        options = list(all_choices)
	        if opponent_is_human:
	            human_label = labels["human"]
	            if human_label in options:
	                options.remove(human_label)
	        return options

	    p1_choices = _choices(p2_is_human)
	    p2_choices = _choices(p1_is_human)

	    if p1_is_human and p2_is_human:
	        error_msg = ("⚠️ Cannot have Human vs Human games! "
	                     "Please select an AI player for one side.")
	    else:
	        error_msg = ""

	    # Order: p1_type dropdown, p2_type dropdown, validation message.
	    return (
	        gr.update(choices=p1_choices),
	        gr.update(choices=p2_choices),
	        error_msg,
	    )

	# Update UI when player types change
	def update_validation_and_ui(p1_type, p2_type):
	    """Combine dropdown validation with control visibility updates."""
	    p1_update, p2_update, error_msg = validate_player_selection(
	        p1_type, p2_type
	    )
	    panel_update, start_update, play_update = check_for_human_players(
	        p1_type, p2_type
	    )

	    # The message component is shown only when there is something to say.
	    error_update = gr.update(value=error_msg, visible=bool(error_msg))

	    # Order matches the outputs wired to the dropdown change events:
	    # p1_type, p2_type, validation_error, interactive_panel, start_btn,
	    # play_button.
	    return (
	        p1_update,
	        p2_update,
	        error_update,
	        panel_update,
	        start_update,
	        play_update,
	    )

	# Wire up change handlers for both player dropdowns
	for player_dropdown in [p1_type, p2_type]:
	player_dropdown.change(
	update_validation_and_ui,
	inputs=[p1_type, p2_type],
	outputs=[
	p1_type, p2_type, validation_error,
	interactive_panel, start_btn, play_button
	],
	)

	# Standard single-shot game
	def start_game_with_validation(
	    game_name, p1_type, p2_type, rounds
	):
	    """Run a single-shot game unless both players are human."""
	    labels = config["player_config"]["player_type_display"]
	    key_for_label = {label: key for key, label in labels.items()}
	    p1_key = key_for_label.get(p1_type, p1_type)
	    p2_key = key_for_label.get(p2_type, p2_type)

	    both_human = p1_key == "human" and p2_key == "human"
	    if not both_human:
	        # play_game resolves the display labels itself.
	        return play_game(game_name, p1_type, p2_type, rounds)

	    return ("⚠️ Cannot start Human vs Human game! "
	            "Please select an AI player for one side.")

	play_button.click(
	start_game_with_validation,
	inputs=[
	game_dropdown,
	p1_type,
	p2_type,
	rounds_slider,
	],
	outputs=[game_output],
	)

	# Interactive game functions
	def start_interactive_game(
	    game_name, p1_type, p2_type, rounds
	):
	    """
	    Initialize an interactive game session.

	    Returns an 8-tuple matching the outputs wired to the start button:
	    game_state, human_choices_p0, human_choices_p1, board_display,
	    human_move_p0, human_move_p1, submit_btn, reset_game_btn.
	    """

	    def _not_started(message):
	        # Shared "no game running" result: cleared state, hidden controls.
	        return (
	            None,  # game_state
	            [],  # human_choices_p0
	            [],  # human_choices_p1
	            message,  # board_display
	            gr.update(choices=[], visible=False),  # human_move_p0
	            gr.update(choices=[], visible=False),  # human_move_p1
	            gr.update(visible=False),  # submit_btn
	            gr.update(visible=False),  # reset_game_btn
	        )

	    try:
	        # Map display labels back to internal keys.
	        display_to_key = {
	            v: k for k, v in
	            config["player_config"]["player_type_display"].items()
	        }
	        p1_key = display_to_key.get(p1_type, p1_type)
	        p2_key = display_to_key.get(p2_type, p2_type)

	        if p1_key == "human" and p2_key == "human":
	            return _not_started(
	                "⚠️ Cannot start Human vs Human game! "
	                "Please select an AI player for one side."
	            )

	        from ui.gradio_config_generator import start_game_interactive
	        import time

	        # Map display game name back to internal key if needed.
	        game_display_to_key = config.get("game_display_to_key", {})
	        internal_game = game_display_to_key.get(game_name, game_name)

	        # "hf_<model>" keys carry the model id after the first underscore.
	        p1_model = p1_key.split("_", 1)[1] if p1_key.startswith("hf_") else None
	        p2_model = p2_key.split("_", 1)[1] if p2_key.startswith("hf_") else None

	        # Use timestamp as seed.
	        seed = int(time.time() * 1000) % (2**31 - 1)

	        # Named game_log so the module-level logger `log` is not shadowed.
	        game_log, state, legal_p0, legal_p1 = start_game_interactive(
	            game_name=internal_game,
	            player1_type=p1_key,
	            player2_type=p2_key,
	            player1_model=p1_model,
	            player2_model=p2_model,
	            rounds=rounds,
	            seed=seed,
	        )

	        # legal_p0/legal_p1 are unpacked as (action_id, label) pairs; the
	        # dropdown wants (label, value) so the user sees the label and
	        # the handler receives the action id.
	        p0_dropdown_choices = [
	            (label, action_id) for action_id, label in legal_p0
	        ]
	        p1_dropdown_choices = [
	            (label, action_id) for action_id, label in legal_p1
	        ]

	        return (
	            state,  # game_state
	            legal_p0,  # human_choices_p0
	            legal_p1,  # human_choices_p1
	            game_log,  # board_display
	            gr.update(
	                choices=p0_dropdown_choices,
	                visible=(p1_key == "human"),  # UI player 0 is human
	                value=None,
	            ),  # human_move_p0
	            gr.update(
	                choices=p1_dropdown_choices,
	                visible=(p2_key == "human"),  # UI player 1 is human
	                value=None,
	            ),  # human_move_p1
	            gr.update(visible=True),  # submit_btn
	            gr.update(visible=True),  # reset_game_btn
	        )
	    except Exception as e:
	        # UI boundary: show a short message, keep the traceback in the log.
	        log.exception("Failed to start interactive game")
	        return _not_started(f"Error starting interactive game: {e}")

	def submit_human_move_handler(p0_action, p1_action, state, choices_p0, choices_p1):
	    """
	    Forward the selected move(s) to the game and refresh the controls.

	    Returns an 8-tuple in the order: game_state, human_choices_p0,
	    human_choices_p1, board_display, human_move_p0, human_move_p1,
	    submit_btn, reset_game_btn.
	    """
	    try:
	        from ui.gradio_config_generator import submit_human_move

	        if not state:
	            hidden_dropdown = dict(choices=[], visible=False)
	            return (
	                state, [], [], "No game running.",
	                gr.update(**hidden_dropdown),
	                gr.update(**hidden_dropdown),
	                gr.update(visible=False),
	                gr.update(visible=False),
	            )

	        # Each action is None for an AI-controlled side and the chosen
	        # action id for a human side.
	        log_text, new_state, next_p0, next_p1 = submit_human_move(
	            action_p0=p0_action,
	            action_p1=p1_action,
	            state=state,
	        )

	        # next_p0/next_p1 are unpacked as (action_id, label) pairs and
	        # flipped to the (label, value) form the dropdown expects.
	        p0_options = [(label, action_id) for action_id, label in next_p0]
	        p1_options = [(label, action_id) for action_id, label in next_p1]

	        game_over = (
	            new_state.get("terminated", False)
	            or new_state.get("truncated", False)
	        )
	        still_playing = not game_over

	        # NOTE(review): the returned text replaces the board display
	        # value; whether it already contains the earlier log is decided
	        # by submit_human_move - confirm.
	        return (
	            new_state,
	            next_p0,
	            next_p1,
	            log_text,
	            gr.update(
	                choices=p0_options,
	                visible=len(p0_options) > 0 and still_playing,
	                value=None,
	            ),
	            gr.update(
	                choices=p1_options,
	                visible=len(p1_options) > 0 and still_playing,
	                value=None,
	            ),
	            gr.update(visible=still_playing),
	            gr.update(visible=True),
	        )
	    except Exception as e:
	        # Keep state and controls untouched; only surface the error text.
	        return (
	            state, choices_p0, choices_p1, f"Error processing move: {e}",
	            gr.update(), gr.update(), gr.update(), gr.update(),
	        )

	def reset_interactive_game():
	    """Clear the interactive session and hide its move controls."""
	    message = "Game reset. Click 'Start Interactive Game' to begin a new game."
	    hidden_dropdown = dict(choices=[], visible=False)
	    # Order: game_state, human_choices_p0, human_choices_p1, board_display,
	    # human_move_p0, human_move_p1, submit_btn, reset_game_btn.
	    return (
	        None,
	        [],
	        [],
	        message,
	        gr.update(**hidden_dropdown),
	        gr.update(**hidden_dropdown),
	        gr.update(visible=False),
	        gr.update(visible=False),
	    )

	# Wire up interactive game handlers
	start_btn.click(
	start_interactive_game,
	inputs=[game_dropdown, p1_type, p2_type, rounds_slider],
	outputs=[game_state, human_choices_p0, human_choices_p1, board_display, human_move_p0, human_move_p1, submit_btn, reset_game_btn],
	)

	submit_btn.click(
	submit_human_move_handler,
	inputs=[human_move_p0, human_move_p1, game_state, human_choices_p0, human_choices_p1],
	outputs=[game_state, human_choices_p0, human_choices_p1, board_display, human_move_p0, human_move_p1, submit_btn, reset_game_btn],
	)

	reset_game_btn.click(
	reset_interactive_game,
	outputs=[game_state, human_choices_p0, human_choices_p1, board_display, human_move_p0, human_move_p1, submit_btn, reset_game_btn],
	)

	with gr.Tab("Leaderboard"):
	gr.Markdown(
	"# LLM Model Leaderboard\n"
	"Track performance across different games!"
	)
	# Use the same display logic as Game Arena
	leaderboard_config = create_player_config(include_aggregated=True)
	leaderboard_game_dropdown = gr.Dropdown(
	choices=leaderboard_config["available_games"],
	label="Select Game",
	value=(
	leaderboard_config["available_games"][0]
	if leaderboard_config["available_games"]
	else "No Games Found"
	),
	)
	leaderboard_table = gr.Dataframe(
	value=extract_leaderboard_stats("Aggregated Performance"),
	headers=LEADERBOARD_COLUMNS,
	interactive=False,
	)
	refresh_btn = gr.Button("🔄 Refresh")

	def _update_leaderboard(game: str) -> pd.DataFrame:
	    """Recompute the leaderboard for the game chosen in the dropdown."""
	    # The dropdown holds display labels; unknown labels pass through as-is.
	    label_to_key = leaderboard_config.get("game_display_to_key", {})
	    return extract_leaderboard_stats(label_to_key.get(game, game))

	leaderboard_game_dropdown.change(
	_update_leaderboard,
	inputs=[leaderboard_game_dropdown],
	outputs=[leaderboard_table],
	)
	refresh_btn.click(
	_update_leaderboard,
	inputs=[leaderboard_game_dropdown],
	outputs=[leaderboard_table],
	)

	gr.Markdown("### Upload new `.db` result files")
	db_files = gr.Files(file_count="multiple", file_types=[".db"])
	upload_btn = gr.Button("⬆️ Upload to results/")
	upload_status = gr.Markdown()

	upload_btn.click(
	handle_db_upload, inputs=[db_files], outputs=[upload_status]
	)

	with gr.Tab("Metrics Dashboard"):
	gr.Markdown(
	"# 📊 Metrics Dashboard\n"
	"Visual summaries of LLM performance across games."
	)
	metrics_df = extract_leaderboard_stats("Aggregated Performance")

	with gr.Row():
	create_bar_plot(
	data=metrics_df,
	x_col="agent_name",
	y_col="win vs_random (%)",
	title="Win Rate vs Random Bot",
	x_label="LLM Model",
	y_label="Win Rate (%)",
	horizontal=True,
	)

	with gr.Row():
	# Commented out - avg_generation_time needs fixing
	# create_bar_plot(
	# data=metrics_df,
	# x_col="agent_name",
	# y_col="avg_generation_time (sec)",
	# title="Average Generation Time",
	# x_label="LLM Model",
	# y_label="Time (sec)",
	# )
	pass

	with gr.Row():
	gr.Dataframe(
	value=metrics_df,
	label="Performance Summary",
	interactive=False,
	)

	with gr.Tab("Analysis of LLM Reasoning"):
	gr.Markdown(
	"# 🧠 Analysis of LLM Reasoning\n"
	"Insights into move legality and decision behavior."
	)
	illegal_df = extract_illegal_moves_summary()

	with gr.Row():
	create_bar_plot(
	data=illegal_df,
	x_col="agent_name",
	y_col="illegal_moves",
	title="Illegal Moves by Model",
	x_label="LLM Model",
	y_label="# of Illegal Moves",
	horizontal=True,
	)

	with gr.Row():
	gr.Dataframe(
	value=illegal_df,
	label="Illegal Move Summary",
	interactive=False,
	)

	with gr.Tab("About"):
	gr.Markdown(
	"""
	# About Game Reasoning Arena

	This app analyzes and visualizes LLM performance in games.

	- Game Arena: Play games vs. LLMs or watch LLM vs. LLM
	- Leaderboard: Performance statistics across games
	- Metrics Dashboard: Visual summaries
	- Reasoning Analysis: Illegal moves & behavior

	Data: SQLite databases in `/results/`.
	"""
	)

	# Local run only. On Spaces, the runtime will serve `interface` automatically.
	if __name__ == "__main__":
	interface.launch(server_name="0.0.0.0", server_port=None, show_api=False)