Spaces:

lcipolina
/

game_reasoning_arena

Sleeping

File size: 46,282 Bytes

#!/usr/bin/env python3
"""
Game Reasoning Arena — Hugging Face Spaces Gradio App

This module provides a web interface for playing games between humans and AI agents,
analyzing LLM performance, and visualizing game statistics.

Pipeline:
User clicks "Start Game" in Gradio
    ↓
app.py (play_game)
    ↓
ui/gradio_config_generator.py (run_game_with_existing_infrastructure)
    ↓
src/game_reasoning_arena/ (core game infrastructure)
    ↓
Game results + metrics displayed in Gradio

Features:
- Interactive human vs AI gameplay
- LLM leaderboards and performance metrics
- Real-time game visualization
- Database management for results
"""

from __future__ import annotations

# =============================================================================
# IMPORTS
# =============================================================================

# Standard library imports
import sqlite3
import sys
import shutil
from pathlib import Path
from typing import List, Dict, Any, Tuple, Generator, TypedDict

# Third-party imports
import pandas as pd
import gradio as gr

# Logging configuration
import logging
logging.basicConfig(level=logging.INFO)
log = logging.getLogger("arena_space")

# Optional transformers import
try:
    from transformers import pipeline  # noqa: F401
except Exception:
    pass

# =============================================================================
# PATH SETUP & CORE IMPORTS
# =============================================================================

# Make sure src is on PYTHONPATH
src_path = Path(__file__).parent / "src"
if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))

# Game arena core imports
from game_reasoning_arena.arena.games.registry import (
    registry as games_registry
)
from game_reasoning_arena.backends.huggingface_backend import (
    HuggingFaceBackend,
)
from game_reasoning_arena.backends import (
    initialize_llm_registry, LLM_REGISTRY,
)

# UI utilities
from ui.utils import clean_model_name

# =============================================================================
# GLOBAL CONFIGURATION
# =============================================================================

# Switch consulted before any LLM backend feature is offered.
BACKEND_SYSTEM_AVAILABLE = True

# HuggingFace demo-safe tiny models (CPU friendly).
# Every entry maps a model identifier to itself.
HUGGINGFACE_MODELS: Dict[str, str] = {
    model_id: model_id
    for model_id in (
        "gpt2",
        "distilgpt2",
        "google/flan-t5-small",
        "EleutherAI/gpt-neo-125M",
    )
}

# Filled later from the games registry (internal game key -> registry entry).
GAMES_REGISTRY: Dict[str, Any] = dict()

# Directory holding the per-agent SQLite result files, next to this module.
db_dir = Path(__file__).resolve().parent.joinpath("results")

# Columns shown in the leaderboard table, in display order.
# "avg_generation_time (sec)" is intentionally left out until it is fixed.
LEADERBOARD_COLUMNS = [
    "agent_name",
    "agent_type",
    "# game instances",
    "total rewards",
    "win-rate",
    "win vs_random (%)",
]

# =============================================================================
# BACKEND INITIALIZATION
# =============================================================================

# Bring up the HuggingFace backend and publish its models in LLM_REGISTRY.
huggingface_backend = None
if BACKEND_SYSTEM_AVAILABLE:
    try:
        huggingface_backend = HuggingFaceBackend()
        initialize_llm_registry()

        # Only models the backend reports as available get an "hf_" entry.
        for model_name in HUGGINGFACE_MODELS:
            if not huggingface_backend.is_model_available(model_name):
                continue
            registry_key = "hf_" + model_name
            LLM_REGISTRY[registry_key] = dict(
                backend=huggingface_backend,
                model_name=model_name,
            )
            log.info("Registered HuggingFace model: %s", registry_key)
    except Exception as e:
        # Best-effort start-up: record the failure and run without a backend.
        log.error("Failed to initialize HuggingFace backend: %s", e)
        huggingface_backend = None

# =============================================================================
# GAMES REGISTRY SETUP
# =============================================================================

# Copy the registered games into GAMES_REGISTRY; on any failure (or when no
# registry object exists) the mapping is left empty.
try:
    if games_registry is None:
        GAMES_REGISTRY = {}
    else:
        GAMES_REGISTRY = dict(games_registry._registry.items())
        log.info("Successfully imported full arena - games are playable.")
except Exception as e:
    log.warning("Failed to load games registry: %s", e)
    GAMES_REGISTRY = {}


def _get_game_display_mapping() -> Dict[str, str]:
    """
    Map every internal game key in the registry to a human-friendly label.

    A registry entry that is a dict with a non-empty ``display_name`` uses
    that value; every other entry falls back to the key with underscores
    replaced by spaces and title-cased.

    Returns:
        Dict mapping internal game keys to display names; empty when the
        registry object is missing or has no ``_registry`` attribute
    """
    if games_registry is None or not hasattr(games_registry, "_registry"):
        return {}

    def _label_for(game_key, entry):
        explicit = entry.get("display_name") if isinstance(entry, dict) else None
        return explicit or game_key.replace("_", " ").title()

    return {
        game_key: _label_for(game_key, entry)
        for game_key, entry in games_registry._registry.items()
    }


# =============================================================================
# DATABASE HELPER FUNCTIONS
# =============================================================================

def ensure_results_dir() -> None:
    """
    Make sure the results directory (``db_dir``) is present.

    Missing parent directories are created too; an already existing
    directory is not an error.
    """
    db_dir.mkdir(exist_ok=True, parents=True)


def iter_agent_databases() -> Generator[Tuple[str, str, str], None, None]:
    """
    Walk the result databases, skipping those whose agent type is "random".

    Yields:
        Tuple of (database file path, agent type, model name), where agent
        type and model name are parsed from the file name
    """
    for db_path in find_or_download_db():
        kind, model = extract_agent_info(db_path)
        if kind == "random":
            continue
        yield db_path, kind, model


def find_or_download_db() -> List[str]:
    """
    List the ``*.db`` files found in the results directory.

    Before listing, the results directory is created if needed and a
    ``random_None.db`` file is bootstrapped with an empty ``games`` table
    when it does not exist yet. Nothing is downloaded here.

    Returns:
        List of database file paths (as strings)
    """
    ensure_results_dir()

    placeholder = db_dir / "random_None.db"
    needs_bootstrap = not placeholder.exists()
    if needs_bootstrap:
        connection = sqlite3.connect(str(placeholder))
        try:
            connection.execute(
                """
                CREATE TABLE IF NOT EXISTS games (
                    id INTEGER PRIMARY KEY,
                    game_name TEXT,
                    player1 TEXT,
                    player2 TEXT,
                    winner INTEGER,
                    timestamp TEXT
                )
                """
            )
            connection.commit()
        finally:
            # Release the handle even if schema creation fails.
            connection.close()

    return list(map(str, db_dir.glob("*.db")))


def extract_agent_info(filename: str) -> Tuple[str, str]:
    """
    Split a database file name into agent type and model name.

    The file stem is cut at its first underscore: the part before it is the
    agent type, everything after it is the model name. A stem without any
    underscore yields the model name "Unknown".

    Args:
        filename: Database filename or path (e.g., "llm_gpt2.db")

    Returns:
        Tuple of (agent_type, model_name)
    """
    agent_type, separator, model_name = Path(filename).stem.partition("_")
    if not separator:
        return agent_type, "Unknown"
    return agent_type, model_name


def get_available_games(include_aggregated: bool = True) -> List[str]:
    """
    List the internal game keys that can be offered to the user.

    The keys come from ``GAMES_REGISTRY`` in sorted order; when that mapping
    is empty a fixed three-game fallback list is used instead.

    Args:
        include_aggregated: Whether to put "Aggregated Performance" in front
            of the game names

    Returns:
        List of available game names (a new list on every call)
    """
    names = (
        sorted(GAMES_REGISTRY)
        if GAMES_REGISTRY
        else ["tic_tac_toe", "kuhn_poker", "connect_four"]
    )
    if not include_aggregated:
        return names
    return ["Aggregated Performance", *names]


def extract_illegal_moves_summary() -> pd.DataFrame:
    """
    Extract summary of illegal moves per agent.

    Every non-random agent database is queried for the row count of its
    ``illegal_moves`` table. A database where that query fails (for example
    because the table is missing) is reported with a count of 0; the failure
    is logged at debug level instead of being dropped silently.

    Returns:
        DataFrame with the columns ``agent_name`` and ``illegal_moves``.
        The columns are present even when no agent database exists, so
        callers can rely on them unconditionally.
    """
    summary = []
    for db_file, _agent_type, model_name in iter_agent_databases():
        conn = sqlite3.connect(db_file)
        try:
            df = pd.read_sql_query(
                "SELECT COUNT(*) AS illegal_moves FROM illegal_moves", conn
            )
            count = int(df["illegal_moves"].iloc[0]) if not df.empty else 0
        except Exception as exc:
            # Best-effort: a database without usable illegal-move data still
            # gets a row, with a zero count.
            log.debug("No illegal-move count for %s: %s", db_file, exc)
            count = 0
        finally:
            conn.close()
        summary.append(
            {"agent_name": clean_model_name(model_name), "illegal_moves": count}
        )
    # Explicit columns keep the schema stable for an empty summary.
    return pd.DataFrame(summary, columns=["agent_name", "illegal_moves"])


# =============================================================================
# PLAYER CONFIGURATION & TYPE DEFINITIONS
# =============================================================================


class PlayerConfigData(TypedDict, total=False):
    """Type definition for player configuration data."""
    # Internal player-type keys, e.g. "human", "random_bot", "hf_gpt2".
    player_types: List[str]
    # Internal player-type key -> label shown in the player dropdowns.
    player_type_display: Dict[str, str]
    # Model identifiers that can back an LLM player.
    available_models: List[str]


class GameArenaConfig(TypedDict, total=False):
    """Type definition for game arena configuration."""
    # Display names of the games offered in the game dropdown.
    available_games: List[str]
    # Display name -> internal game key; returned by create_player_config
    # and read by play_game, so it is declared here as well.
    game_display_to_key: Dict[str, str]
    # Player types, their display labels and the selectable models.
    player_config: PlayerConfigData
    # One-line description of the backend status shown in the UI.
    model_info: str
    # Mirrors BACKEND_SYSTEM_AVAILABLE at the time the config was built.
    backend_available: bool


def setup_player_config(
    player_type: str, player_model: str, player_id: str
) -> Dict[str, Any]:
    """
    Translate a player dropdown selection into an agent config.

    The display label is first mapped back to its internal key (an unknown
    label is used as the key itself). "random_bot" and "human" map to their
    fixed configs. Keys prefixed with "llm_" or "hf_" carry the model name
    after the first underscore, while the bare key "llm" takes the model from
    ``player_model``. An LLM config is only produced when the backend is
    available and the model is listed in ``HUGGINGFACE_MODELS``; every other
    case falls back to a random agent.

    Args:
        player_type: Display label for player type
        player_model: Model name if LLM type
        player_id: Player identifier (not used by this function)

    Returns:
        Agent configuration dictionary
    """
    labels = create_player_config()["player_config"]["player_type_display"]
    key_by_label = {label: key for key, label in labels.items()}
    internal_key = key_by_label.get(player_type, player_type)

    if internal_key == "random_bot":
        return {"type": "random"}
    if internal_key == "human":
        return {"type": "human"}

    # Work out which model, if any, the selection refers to.
    requested_model = None
    if internal_key and internal_key.startswith(("llm_", "hf_")):
        requested_model = internal_key.partition("_")[2]
    elif internal_key == "llm":
        requested_model = player_model

    if requested_model in HUGGINGFACE_MODELS and BACKEND_SYSTEM_AVAILABLE:
        return {"type": "llm", "model": requested_model}

    return {"type": "random"}


def create_player_config(include_aggregated: bool = False) -> GameArenaConfig:
    """
    Assemble the game and player options used by the arena UI.

    Args:
        include_aggregated: Whether to include aggregated stats option

    Returns:
        Configuration dict with the game display names, the display-name to
        game-key mapping, the player configuration, a backend status text
        and the backend availability flag
    """
    available_keys = get_available_games(include_aggregated=include_aggregated)
    key_to_display = _get_game_display_mapping()

    # One pass builds the label -> key mapping; the first key wins when two
    # keys share a label, and insertion order gives the de-duplicated list.
    display_to_key: Dict[str, str] = {}
    for game_key in available_keys:
        label = key_to_display.get(
            game_key, game_key.replace("_", " ").title()
        )
        display_to_key.setdefault(label, game_key)
    available_games = list(display_to_key)

    # Built-in player types come first, HuggingFace models follow.
    player_type_display = {
        "human": "Human Player",
        "random_bot": "Random Bot",
    }
    if BACKEND_SYSTEM_AVAILABLE:
        curated_labels = {
            "gpt2": "GPT-2",
            "distilgpt2": "DistilGPT-2",
            "flan-t5-small": "FLAN-T5 Small",
            "gpt-neo-125M": "GPT-Neo 125M",
        }
        for model_key in HUGGINGFACE_MODELS:
            # The label is derived from the part after the last "/".
            tag = model_key.split("/")[-1]
            if tag in curated_labels:
                label = curated_labels[tag]
            else:
                # Fallback for any new models
                label = tag.replace("-", " ").title()
            player_type_display[f"hf_{model_key}"] = label
    player_types = list(player_type_display)

    if BACKEND_SYSTEM_AVAILABLE:
        model_info = (
            "HuggingFace transformer models integrated with backend system."
        )
    else:
        model_info = "Backend system not available - limited functionality."

    return {
        "available_games": available_games,
        "game_display_to_key": display_to_key,
        "player_config": {
            "player_types": player_types,
            "player_type_display": player_type_display,
            "available_models": list(HUGGINGFACE_MODELS),
        },
        "model_info": model_info,
        "backend_available": BACKEND_SYSTEM_AVAILABLE,
    }


# =============================================================================
# MAIN GAME LOGIC
# =============================================================================

def play_game(
    game_name: str,
    player1_type: str,
    player2_type: str,
    rounds: int = 1,
    seed: int | None = None,
) -> str:
    """
    Execute a complete game simulation between two players.

    Display names for the game and both players are mapped back to their
    internal keys (values that are not known display names are passed through
    unchanged) and the simulation is delegated to
    ``run_game_with_existing_infrastructure``.

    Args:
        game_name: Name of the game to play (display name or internal key)
        player1_type: Type of player 1 (display name like "Human Player", "GPT-2")
        player2_type: Type of player 2 (display name like "Human Player", "GPT-2")
        rounds: Number of rounds to play
        seed: Random seed for reproducibility; derived from the current time
            when omitted

    Returns:
        Game result log as string. Failures during the simulation are logged
        with their traceback and reported as an error message string rather
        than raised.
    """
    if game_name == "No Games Found":
        return "No games available. Please add game databases."

    log.info(
        "Starting game: %s | P1=%s P2=%s rounds=%d",
        game_name,
        player1_type,
        player2_type,
        rounds,
    )

    # Map human-friendly game name back to internal key if needed
    config = create_player_config()
    game_name = config.get("game_display_to_key", {}).get(game_name, game_name)

    # Map display labels for player types back to keys
    display_to_key = {
        v: k for k, v in config["player_config"]["player_type_display"].items()
    }
    p1_key = display_to_key.get(player1_type, player1_type)
    p2_key = display_to_key.get(player2_type, player2_type)

    def _hf_model(player_key):
        """Return the model id behind an "hf_" key, or None."""
        # A cleared dropdown can deliver None; treat it as "no model" here
        # instead of failing on .startswith before the error handling below.
        if isinstance(player_key, str) and player_key.startswith("hf_"):
            return player_key.split("_", 1)[1]
        return None

    player1_model = _hf_model(p1_key)
    player2_model = _hf_model(p2_key)

    import time
    try:
        from ui.gradio_config_generator import (
            run_game_with_existing_infrastructure,
        )
        # Use a time-based seed if not provided
        if seed is None:
            seed = int(time.time() * 1000) % (2**31 - 1)
        return run_game_with_existing_infrastructure(
            game_name=game_name,
            player1_type=p1_key,
            player2_type=p2_key,
            player1_model=player1_model,
            player2_model=player2_model,
            rounds=rounds,
            seed=seed,
        )
    except Exception as e:
        # Keep the UI responsive, but leave the traceback in the logs so the
        # failure can be diagnosed.
        log.exception("Game simulation failed")
        return f"Error during game simulation: {e}"


# =============================================================================
# LEADERBOARD & ANALYTICS
# =============================================================================

def _leaderboard_scalar(
    conn: sqlite3.Connection,
    select_expr: str,
    game_name: str,
    condition: str = "",
):
    """Return one aggregate over ``game_results`` (0 when the result is NULL)."""
    clauses = [condition] if condition else []
    params: Tuple[str, ...] = ()
    if game_name != "Aggregated Performance":
        # The game name is user-selected, so it is always bound as a parameter.
        clauses.append("game_name = ?")
        params = (game_name,)
    query = f"SELECT {select_expr} FROM game_results"
    if clauses:
        query += " WHERE " + " AND ".join(clauses)
    return conn.execute(query, params).fetchone()[0] or 0


def _percentage(part: float, whole: float) -> float:
    """Return ``part / whole`` as a percentage, or 0.0 when ``whole`` is 0."""
    return (part / whole) * 100.0 if whole > 0 else 0.0


def extract_leaderboard_stats(game_name: str) -> pd.DataFrame:
    """
    Extract leaderboard statistics for a specific game or all games.

    One row is produced per non-random agent database, computed from its
    ``game_results`` table:

    - "# game instances": number of result rows
    - "total rewards": sum of ``reward`` divided by 2
    - "win-rate": percentage of rows with ``reward > 0``
    - "win vs_random (%)": the same percentage restricted to rows whose
      ``opponent`` is 'random_None'

    A database that cannot be queried (for example one without a
    ``game_results`` table) is skipped with a warning so that a single
    malformed file does not take down the whole leaderboard.

    Args:
        game_name: Name of the game or "Aggregated Performance"

    Returns:
        DataFrame with the ``LEADERBOARD_COLUMNS`` columns; empty (but with
        those columns) when no database contributes a row
    """
    rows = []
    for db_file, agent_type, model_name in iter_agent_databases():
        conn = sqlite3.connect(db_file)
        try:
            games_played = int(
                _leaderboard_scalar(conn, "COUNT(*)", game_name)
            )
            reward_sum = float(
                _leaderboard_scalar(conn, "SUM(reward)", game_name)
            )
            wins = _leaderboard_scalar(
                conn, "COUNT(*)", game_name, "reward > 0"
            )
            total_vs_random = _leaderboard_scalar(
                conn, "COUNT(*)", game_name, "opponent = 'random_None'"
            )
            wins_vs_random = _leaderboard_scalar(
                conn,
                "COUNT(*)",
                game_name,
                "opponent = 'random_None' AND reward > 0",
            )
        except sqlite3.Error as exc:
            log.warning("Skipping unreadable results in %s: %s", db_file, exc)
            continue
        finally:
            conn.close()

        rows.append(
            {
                "agent_name": clean_model_name(model_name),
                "agent_type": agent_type,
                "# game instances": games_played,
                # NOTE(review): the reward sum is halved; presumably each
                # game contributes two reward rows - confirm against the
                # code that writes game_results.
                "total rewards": reward_sum / 2.0,
                # "avg_generation_time (sec)" is left out until it is fixed.
                # Overall win rate: a win is a row with reward > 0, the same
                # definition the vs-random figure uses.
                "win-rate": round(_percentage(wins, games_played), 2),
                "win vs_random (%)": round(
                    _percentage(wins_vs_random, total_vs_random), 2
                ),
            }
        )

    # Explicit columns fix the column order and keep the schema for an
    # empty leaderboard.
    return pd.DataFrame(rows, columns=LEADERBOARD_COLUMNS)


# =============================================================================
# VISUALIZATION HELPERS
# =============================================================================

def create_bar_plot(
    data: pd.DataFrame,
    x_col: str,
    y_col: str,
    title: str,
    x_label: str,
    y_label: str,
    horizontal: bool = False,
) -> gr.BarPlot:
    """
    Build a Gradio bar plot, optionally with horizontal bars.

    For a horizontal plot the two columns and their axis labels trade
    places, so the ``y_col`` values run along the x-axis and the ``x_col``
    values along the y-axis.

    Args:
        data: DataFrame containing the data
        x_col: Column name for x-axis
        y_col: Column name for y-axis
        title: Plot title
        x_label: X-axis label
        y_label: Y-axis label
        horizontal: Whether to create horizontal bars

    Returns:
        Gradio BarPlot component
    """
    if horizontal:
        # Swap both the plotted columns and their labels.
        x_col, y_col = y_col, x_col
        x_label, y_label = y_label, x_label

    return gr.BarPlot(
        value=data,
        x=x_col,
        y=y_col,
        title=title,
        x_label=x_label,
        y_label=y_label,
    )


# =============================================================================
# FILE UPLOAD HANDLERS
# =============================================================================

def handle_db_upload(files: list[gr.File]) -> str:
    """
    Handle upload of database files to the results directory.

    Each uploaded file is moved into ``db_dir`` under its own base name,
    overwriting an existing file of that name.

    Args:
        files: List of uploaded files. Each item may be an object exposing
            the temporary path as ``.name`` or a plain path string.
            NOTE(review): which form Gradio delivers depends on its version
            and the component configuration - confirm for the pinned version.

    Returns:
        Status message about upload success

    Raises:
        OSError: If a file can neither be renamed nor copied into place
    """
    ensure_results_dir()
    saved = []
    for f in files or []:
        src = Path(getattr(f, "name", f))
        dest = db_dir / src.name
        try:
            src.replace(dest)
        except OSError:
            # A rename cannot cross filesystem boundaries, and the upload
            # temp directory may live on a different device than results/.
            # Fall back to copy + delete; a genuine error (e.g. missing
            # source) is raised again by the copy.
            shutil.copy2(src, dest)
            src.unlink()
        saved.append(dest.name)
    return (
        f"Uploaded: {', '.join(saved)}" if saved else "No files uploaded."
    )


# =============================================================================
# GRADIO USER INTERFACE
# =============================================================================

"""
This section defines the complete Gradio web interface with the following tabs:
1. Game Arena: Interactive gameplay between humans and AI
2. Leaderboard: Performance statistics and rankings
3. Metrics Dashboard: Visual analytics and charts
4. Analysis of LLM Reasoning: Illegal moves and behavior analysis
5. About: Documentation and information

The interface supports:
- Real-time human vs AI gameplay
- Automatic AI move processing
- Dynamic dropdown population
- State management for interactive games
- File upload for database results
- Interactive visualizations
"""

with gr.Blocks() as interface:
    # =========================================================================
    # TAB 1: GAME ARENA
    # =========================================================================

    with gr.Tab("Game Arena"):
        config = create_player_config(include_aggregated=False)

        # Header and introduction
        gr.Markdown("# Interactive Game Reasoning Arena")
        gr.Markdown("Play games against LLMs, a random bot or watch LLMs compete!")
        gr.Markdown(
            f"> **🤖 Available AI Players**: {config['model_info']}\n"
            "> Local transformer models run with Hugging Face transformers. "
            "No API tokens required!\n\n"
            "> **⚠️ Note on Reasoning Quality**: The available models are "
            "relatively basic (GPT-2, DistilGPT-2, etc.) and may produce "
            "limited or nonsensical reasoning. They are suitable for "
            "demonstration purposes but don't expect sophisticated "
            "strategic thinking or coherent explanations."
        )

        # Game selection and configuration
        with gr.Row():
            game_dropdown = gr.Dropdown(
                choices=config["available_games"],
                label="Select a Game",
                value=(
                    config["available_games"][0]
                    if config["available_games"]
                    else "No Games Found"
                ),
            )
            rounds_slider = gr.Slider(
                minimum=1,
                maximum=10,
                value=1,
                step=1,
                label="Number of Rounds",
            )

        def player_selector_block(label: str):
            """Render a heading plus a player-type dropdown and return the dropdown."""
            gr.Markdown(f"### {label}")

            # The dropdown offers display labels, in player_types order.
            player_cfg = config["player_config"]
            labels_by_key = player_cfg["player_type_display"]
            display_choices = [
                labels_by_key[type_key]
                for type_key in player_cfg["player_types"]
            ]

            return gr.Dropdown(
                choices=display_choices,
                label=f"{label}",  # Just "Player 0" or "Player 1"
                # Preselect the first entry when there is one.
                value=display_choices[0] if display_choices else None,
            )

        # Player configuration
        with gr.Row():
            p1_type = player_selector_block("Player 0")
            p2_type = player_selector_block("Player 1")

        # Validation error message
        validation_error = gr.Markdown(visible=False)

        # Game state management
        game_state = gr.State(value=None)
        human_choices_p0 = gr.State([])
        human_choices_p1 = gr.State([])

        # Interactive game components (initially hidden)
        with gr.Column(visible=False) as interactive_panel:
            gr.Markdown("## Interactive Game")

            with gr.Row():
                with gr.Column(scale=2):
                    board_display = gr.Textbox(
                        label="Game Board",
                        lines=10,
                        placeholder="Board state will appear here...",
                        interactive=False,
                    )

                with gr.Column(scale=1):
                    # Human move controls
                    gr.Markdown("### Your Move")

                    # Player 0 move selection
                    human_move_p0 = gr.Dropdown(
                        choices=[],
                        label="Your move (Player 0)",
                        visible=False,
                        interactive=True,
                    )

                    # Player 1 move selection
                    human_move_p1 = gr.Dropdown(
                        choices=[],
                        label="Your move (Player 1)",
                        visible=False,
                        interactive=True,
                    )

                    submit_btn = gr.Button(
                        "Submit Move",
                        variant="primary",
                        visible=False
                    )

                    reset_game_btn = gr.Button(
                        "Reset Game",
                        visible=False
                    )

        # Game control buttons
        play_button = gr.Button("🎮 Start Game", variant="primary")
        start_btn = gr.Button(
            "🎯 Start Interactive Game",
            variant="secondary",
            visible=False
        )

        # Game output display
        game_output = gr.Textbox(
            label="Game Log",
            lines=20,
            placeholder="Game results will appear here...",
        )

        def check_for_human_players(p1_type, p2_type):
            """Toggle the interactive controls depending on whether a human plays.

            Returns visibility updates for, in order: the interactive panel,
            the interactive start button and the single-shot play button.
            """
            labels_by_key = config["player_config"]["player_type_display"]
            key_by_label = {label: key for key, label in labels_by_key.items()}
            selected_keys = (
                key_by_label.get(p1_type, p1_type),
                key_by_label.get(p2_type, p2_type),
            )

            has_human = "human" in selected_keys
            panel_update = gr.update(visible=has_human)
            start_update = gr.update(visible=has_human)
            # The single-shot button is only for games without a human.
            play_update = gr.update(visible=not has_human)
            return (panel_update, start_update, play_update)

        def validate_player_selection(p1_type, p2_type):
            """Compute dropdown choices and an error text for the current selection.

            A side whose opponent is human does not get "Human Player" as a
            choice. Returns (Player 0 dropdown update, Player 1 dropdown
            update, error message); the message is empty unless both sides
            are human.
            """
            labels_by_key = config["player_config"]["player_type_display"]
            key_by_label = {label: key for key, label in labels_by_key.items()}
            p1_is_human = key_by_label.get(p1_type, p1_type) == "human"
            p2_is_human = key_by_label.get(p2_type, p2_type) == "human"

            def choices_for(opponent_is_human):
                # Full list of display labels, minus the human entry when the
                # other side is already human.
                options = [
                    labels_by_key[type_key]
                    for type_key in config["player_config"]["player_types"]
                ]
                if opponent_is_human:
                    human_display = labels_by_key["human"]
                    if human_display in options:
                        options.remove(human_display)
                return options

            p1_choices = choices_for(p2_is_human)
            p2_choices = choices_for(p1_is_human)

            if p1_is_human and p2_is_human:
                error_msg = ("⚠️ **Cannot have Human vs Human games!** "
                             "Please select an AI player for one side.")
            else:
                error_msg = ""

            return (
                gr.update(choices=p1_choices),  # p1_type dropdown
                gr.update(choices=p2_choices),  # p2_type dropdown
                error_msg,  # validation error message
            )

        # Update UI when player types change
        def update_validation_and_ui(p1_type, p2_type):
            """Combine selection validation with the visibility toggles.

            Returns six updates, in order: Player 0 choices, Player 1 choices,
            validation message, interactive panel, interactive start button
            and single-shot play button.
            """
            p1_update, p2_update, error_msg = validate_player_selection(
                p1_type, p2_type
            )
            panel_update, start_update, play_update = check_for_human_players(
                p1_type, p2_type
            )

            # The message component is only shown when there is a message.
            error_update = gr.update(value=error_msg, visible=bool(error_msg))

            return (
                p1_update,
                p2_update,
                error_update,
                panel_update,
                start_update,
                play_update,
            )

        # Wire up change handlers for both player dropdowns
        for player_dropdown in [p1_type, p2_type]:
            player_dropdown.change(
                update_validation_and_ui,
                inputs=[p1_type, p2_type],
                outputs=[
                    p1_type, p2_type, validation_error,
                    interactive_panel, start_btn, play_button
                ],
            )

        # Standard single-shot game
        def start_game_with_validation(
            game_name, p1_type, p2_type, rounds
        ):
            """Run a single-shot game unless both selected players are human."""
            labels_by_key = config["player_config"]["player_type_display"]
            key_by_label = {label: key for key, label in labels_by_key.items()}
            selected_keys = (
                key_by_label.get(p1_type, p1_type),
                key_by_label.get(p2_type, p2_type),
            )

            # Human vs Human is rejected with a message instead of a game log.
            if all(key == "human" for key in selected_keys):
                return ("⚠️ **Cannot start Human vs Human game!** "
                        "Please select an AI player for one side.")

            # play_game receives the display labels unchanged.
            return play_game(game_name, p1_type, p2_type, rounds)

        # Single-shot games go through the validating wrapper and write
        # their result into game_output.
        play_button.click(
            fn=start_game_with_validation,
            inputs=[game_dropdown, p1_type, p2_type, rounds_slider],
            outputs=[game_output],
        )

        # Interactive game functions
        def start_interactive_game(
            game_name, p1_type, p2_type, rounds
        ):
            """Initialize an interactive game session.

            Args:
                game_name: Game chosen in the UI; translated through
                    ``config["game_display_to_key"]`` when that mapping
                    exists, otherwise used as-is.
                p1_type: Player 1 type as selected in the dropdown (display
                    label or internal key).
                p2_type: Player 2 type, same conventions as ``p1_type``.
                rounds: Forwarded unchanged to ``start_game_interactive``.

            Returns:
                An 8-tuple ordered as: game_state, human_choices_p0,
                human_choices_p1, board_display, human_move_p0 update,
                human_move_p1 update, submit_btn update, reset_game_btn
                update. For a Human vs Human selection or any exception the
                state is ``None``, both choice lists are empty, the board
                text carries the message and all four controls are hidden.
            """

            def _failed_start(message):
                # Common result for every path that leaves no session running.
                return (
                    None,     # game_state
                    [],       # human_choices_p0
                    [],       # human_choices_p1
                    message,  # board_display
                    gr.update(choices=[], visible=False),  # human_move_p0
                    gr.update(choices=[], visible=False),  # human_move_p1
                    gr.update(visible=False),              # submit_btn
                    gr.update(visible=False),              # reset_game_btn
                )

            try:
                # Map display labels back to internal keys
                display_to_key = {
                    v: k for k, v in
                    config["player_config"]["player_type_display"].items()
                }
                p1_key = display_to_key.get(p1_type, p1_type)
                p2_key = display_to_key.get(p2_type, p2_type)

                # Check if both players are human
                if p1_key == "human" and p2_key == "human":
                    return _failed_start(
                        "⚠️ **Cannot start Human vs Human game!** "
                        "Please select an AI player for one side."
                    )

                from ui.gradio_config_generator import start_game_interactive
                import time

                # Map display game name back to internal key if needed
                game_display_to_key = config.get("game_display_to_key", {})
                internal_game = game_display_to_key.get(game_name, game_name)

                # Keys of the form "hf_<model>" carry the model name after
                # the first underscore; other player types have no model.
                p1_model = None
                p2_model = None
                if p1_key.startswith("hf_"):
                    p1_model = p1_key.split("_", 1)[1]
                if p2_key.startswith("hf_"):
                    p2_model = p2_key.split("_", 1)[1]

                # Millisecond timestamp folded into the range [0, 2**31 - 2]
                seed = int(time.time() * 1000) % (2**31 - 1)

                # NOTE: the board text is deliberately not named ``log`` so
                # the module-level logger stays usable in this function.
                board_text, state, legal_p0, legal_p1 = (
                    start_game_interactive(
                        game_name=internal_game,
                        player1_type=p1_key,
                        player2_type=p2_key,
                        player1_model=p1_model,
                        player2_model=p2_model,
                        rounds=rounds,
                        seed=seed,
                    )
                )

                # legal_p0 / legal_p1 are iterated as (action_id, label)
                # pairs; the dropdowns get (label, action_id) so the label
                # is displayed and the action id is the selected value.
                p0_dropdown_choices = [
                    (label, action_id) for action_id, label in legal_p0
                ]
                p1_dropdown_choices = [
                    (label, action_id) for action_id, label in legal_p1
                ]

                # Show/hide dropdowns based on whether each player is human
                p0_is_human = (p1_key == "human")
                p1_is_human = (p2_key == "human")

                return (
                    state,       # game_state
                    legal_p0,    # human_choices_p0
                    legal_p1,    # human_choices_p1
                    board_text,  # board_display
                    gr.update(
                        choices=p0_dropdown_choices,
                        visible=p0_is_human,
                        value=None
                    ),  # human_move_p0
                    gr.update(
                        choices=p1_dropdown_choices,
                        visible=p1_is_human,
                        value=None
                    ),  # human_move_p1
                    gr.update(visible=True),  # submit_btn
                    gr.update(visible=True),  # reset_game_btn
                )
            except Exception as e:
                # The UI only shows str(e); keep the traceback in the log.
                log.exception("Failed to start interactive game")
                return _failed_start(f"Error starting interactive game: {e}")

        def submit_human_move_handler(
            p0_action, p1_action, state, choices_p0, choices_p1
        ):
            """Process human moves and advance the game.

            Args:
                p0_action: Value currently selected in player 0's move
                    dropdown (original note: ``None`` if P0 is AI).
                p1_action: Value currently selected in player 1's move
                    dropdown (original note: ``None`` if P1 is AI).
                state: Stored session state; a falsy value means no game
                    is running.
                choices_p0: Stored legal choices for player 0, returned
                    unchanged if processing the move fails.
                choices_p1: Stored legal choices for player 1, returned
                    unchanged if processing the move fails.

            Returns:
                An 8-tuple ordered as: game_state, human_choices_p0,
                human_choices_p1, board_display, human_move_p0 update,
                human_move_p1 update, submit_btn update, reset_game_btn
                update.
            """
            try:
                from ui.gradio_config_generator import submit_human_move

                if not state:
                    return (
                        state, [], [], "No game running.",
                        gr.update(choices=[], visible=False),
                        gr.update(choices=[], visible=False),
                        gr.update(visible=False),
                        gr.update(visible=False),
                    )

                # Per the original author's notes, submit_human_move takes
                # the human actions, computes the AI actions, advances the
                # game and returns the next legal moves.
                log_append, new_state, next_p0, next_p1 = submit_human_move(
                    action_p0=p0_action,
                    action_p1=p1_action,
                    state=state,
                )

                # next_p0 / next_p1 are iterated as (action_id, label)
                # pairs; the dropdowns get (label, action_id) so the label
                # is displayed and the action id is the selected value.
                p0_dropdown_choices = [
                    (label, action_id) for action_id, label in next_p0
                ]
                p1_dropdown_choices = [
                    (label, action_id) for action_id, label in next_p1
                ]

                # Check if game is finished
                game_over = (new_state.get("terminated", False) or
                             new_state.get("truncated", False))

                # A move dropdown is shown only while the game is running
                # and that player has at least one legal action listed.
                show_p0 = bool(p0_dropdown_choices) and not game_over
                show_p1 = bool(p1_dropdown_choices) and not game_over

                return (
                    new_state,  # game_state
                    next_p0,    # human_choices_p0
                    next_p1,    # human_choices_p1
                    # NOTE(review): this value is returned as the new
                    # board_display content rather than appended here;
                    # confirm submit_human_move returns the full text.
                    log_append,  # board_display
                    gr.update(
                        choices=p0_dropdown_choices,
                        visible=show_p0,
                        value=None,
                    ),  # human_move_p0
                    gr.update(
                        choices=p1_dropdown_choices,
                        visible=show_p1,
                        value=None,
                    ),  # human_move_p1
                    gr.update(visible=not game_over),  # submit_btn
                    gr.update(visible=True),           # reset_game_btn
                )
            except Exception as e:
                # The UI only shows str(e); keep the traceback in the log.
                log.exception("Failed to process interactive move")
                return (
                    state, choices_p0, choices_p1,
                    f"Error processing move: {e}",
                    # Empty updates leave the four controls as they are.
                    gr.update(), gr.update(), gr.update(), gr.update(),
                )

        def reset_interactive_game():
            """Clear the interactive session and hide its controls.

            Returns:
                An 8-tuple ordered as: game_state, human_choices_p0,
                human_choices_p1, board_display, human_move_p0 update,
                human_move_p1 update, submit_btn update, reset_game_btn
                update.
            """
            notice = (
                "Game reset. Click 'Start Interactive Game' "
                "to begin a new game."
            )
            # One update object per component; none of them is shared.
            cleared_moves_p0 = gr.update(choices=[], visible=False)
            cleared_moves_p1 = gr.update(choices=[], visible=False)
            hidden_submit = gr.update(visible=False)
            hidden_reset = gr.update(visible=False)

            return (
                None,
                [],
                [],
                notice,
                cleared_moves_p0,
                cleared_moves_p1,
                hidden_submit,
                hidden_reset,
            )

        # Wire up interactive game handlers
        # All three handlers write the same eight outputs, in this order:
        # session state, both stored choice lists, the board text, both move
        # dropdowns, then the submit and reset buttons.
        start_btn.click(
            fn=start_interactive_game,
            inputs=[game_dropdown, p1_type, p2_type, rounds_slider],
            outputs=[
                game_state,
                human_choices_p0,
                human_choices_p1,
                board_display,
                human_move_p0,
                human_move_p1,
                submit_btn,
                reset_game_btn,
            ],
        )

        submit_btn.click(
            fn=submit_human_move_handler,
            inputs=[
                human_move_p0,
                human_move_p1,
                game_state,
                human_choices_p0,
                human_choices_p1,
            ],
            outputs=[
                game_state,
                human_choices_p0,
                human_choices_p1,
                board_display,
                human_move_p0,
                human_move_p1,
                submit_btn,
                reset_game_btn,
            ],
        )

        # Reset takes no inputs.
        reset_game_btn.click(
            fn=reset_interactive_game,
            outputs=[
                game_state,
                human_choices_p0,
                human_choices_p1,
                board_display,
                human_move_p0,
                human_move_p1,
                submit_btn,
                reset_game_btn,
            ],
        )

    with gr.Tab("Leaderboard"):
        gr.Markdown(
            value=(
                "# LLM Model Leaderboard\n"
                "Track performance across different games!"
            )
        )
        # Same configuration helper as the Game Arena tab, here called with
        # include_aggregated=True.
        leaderboard_config = create_player_config(include_aggregated=True)
        leaderboard_games = leaderboard_config["available_games"]
        # Preselect the first listed game, or a placeholder when none exist.
        if leaderboard_games:
            leaderboard_default_game = leaderboard_games[0]
        else:
            leaderboard_default_game = "No Games Found"
        leaderboard_game_dropdown = gr.Dropdown(
            label="Select Game",
            choices=leaderboard_games,
            value=leaderboard_default_game,
        )
        # The table initially shows the "Aggregated Performance" statistics.
        leaderboard_table = gr.Dataframe(
            headers=LEADERBOARD_COLUMNS,
            value=extract_leaderboard_stats("Aggregated Performance"),
            interactive=False,
        )
        refresh_btn = gr.Button("🔄 Refresh")

        def _update_leaderboard(game: str) -> pd.DataFrame:
            """Return leaderboard statistics for the selected game.

            ``game`` is translated through the ``game_display_to_key``
            mapping of ``leaderboard_config`` when that mapping has an entry
            for it; otherwise the value is passed on unchanged.
            """
            key_lookup = leaderboard_config.get("game_display_to_key", {})
            return extract_leaderboard_stats(key_lookup.get(game, game))

        # Changing the selected game and pressing refresh both reload the
        # table for the currently selected game.
        leaderboard_game_dropdown.change(
            fn=_update_leaderboard,
            inputs=[leaderboard_game_dropdown],
            outputs=[leaderboard_table],
        )
        refresh_btn.click(
            fn=_update_leaderboard,
            inputs=[leaderboard_game_dropdown],
            outputs=[leaderboard_table],
        )

        # Upload widgets: a multi-file picker restricted to ".db" files, a
        # button, and a Markdown area that receives the handler's output.
        gr.Markdown(value="### Upload new `.db` result files")
        db_files = gr.Files(file_types=[".db"], file_count="multiple")
        upload_btn = gr.Button("⬆️ Upload to results/")
        upload_status = gr.Markdown()

        upload_btn.click(
            fn=handle_db_upload,
            inputs=[db_files],
            outputs=[upload_status],
        )

    with gr.Tab("Metrics Dashboard"):
        gr.Markdown(
            value=(
                "# 📊 Metrics Dashboard\n"
                "Visual summaries of LLM performance across games."
            )
        )
        # Both the plot and the table below use the aggregated statistics.
        metrics_df = extract_leaderboard_stats("Aggregated Performance")

        # Row 1: horizontal bar plot of the win rate against the random bot.
        with gr.Row():
            create_bar_plot(
                title="Win Rate vs Random Bot",
                data=metrics_df,
                x_col="agent_name",
                x_label="LLM Model",
                y_col="win vs_random (%)",
                y_label="Win Rate (%)",
                horizontal=True,
            )

        # Row 2: intentionally empty. The "Average Generation Time" plot
        # (y_col "avg_generation_time (sec)") is disabled until
        # avg_generation_time is fixed.
        with gr.Row():
            pass

        # Row 3: the same data as a read-only table.
        with gr.Row():
            gr.Dataframe(
                label="Performance Summary",
                value=metrics_df,
                interactive=False,
            )

    with gr.Tab("Analysis of LLM Reasoning"):
        gr.Markdown(
            value=(
                "# 🧠 Analysis of LLM Reasoning\n"
                "Insights into move legality and decision behavior."
            )
        )
        # One summary feeds both the plot and the table below.
        illegal_df = extract_illegal_moves_summary()

        # Row 1: horizontal bar plot of illegal moves per model.
        with gr.Row():
            create_bar_plot(
                title="Illegal Moves by Model",
                data=illegal_df,
                x_col="agent_name",
                x_label="LLM Model",
                y_col="illegal_moves",
                y_label="# of Illegal Moves",
                horizontal=True,
            )

        # Row 2: the same data as a read-only table.
        with gr.Row():
            gr.Dataframe(
                label="Illegal Move Summary",
                value=illegal_df,
                interactive=False,
            )

    with gr.Tab("About"):
        # Static description of the app and its tabs; no event handlers.
        gr.Markdown(
            value="""
            # About Game Reasoning Arena

            This app analyzes and visualizes LLM performance in games.

            - **Game Arena**: Play games vs. LLMs or watch LLM vs. LLM
            - **Leaderboard**: Performance statistics across games
            - **Metrics Dashboard**: Visual summaries
            - **Reasoning Analysis**: Illegal moves & behavior

            **Data**: SQLite databases in `/results/`.
            """
        )

# Entry point for local runs only. On Spaces, the runtime serves `interface`
# automatically, so this guard is not what starts the app there.
if __name__ == "__main__":
    interface.launch(
        server_name="0.0.0.0",
        server_port=None,
        show_api=False,
    )