lcipolina's picture
Fixed long names on Leaderboard
6268f97 verified
#!/usr/bin/env python3
"""
Game Reasoning Arena — Hugging Face Spaces Gradio App
This module provides a web interface for playing games between humans and AI agents,
analyzing LLM performance, and visualizing game statistics.
Pipeline:
User clicks "Start Game" in Gradio
app.py (play_game)
ui/gradio_config_generator.py (run_game_with_existing_infrastructure)
src/game_reasoning_arena/ (core game infrastructure)
Game results + metrics displayed in Gradio
Features:
- Interactive human vs AI gameplay
- LLM leaderboards and performance metrics
- Real-time game visualization
- Database management for results
"""
from __future__ import annotations
# =============================================================================
# IMPORTS
# =============================================================================
# Standard library imports
import sqlite3
import sys
import shutil
from pathlib import Path
from typing import List, Dict, Any, Tuple, Generator, TypedDict
# Third-party imports
import pandas as pd
import gradio as gr
# Logging configuration
import logging
logging.basicConfig(level=logging.INFO)
log = logging.getLogger("arena_space")
# Optional transformers import
try:
from transformers import pipeline # noqa: F401
except Exception:
pass
# =============================================================================
# PATH SETUP & CORE IMPORTS
# =============================================================================
# Make sure src is on PYTHONPATH
src_path = Path(__file__).parent / "src"
if str(src_path) not in sys.path:
sys.path.insert(0, str(src_path))
# Game arena core imports
from game_reasoning_arena.arena.games.registry import (
registry as games_registry
)
from game_reasoning_arena.backends.huggingface_backend import (
HuggingFaceBackend,
)
from game_reasoning_arena.backends import (
initialize_llm_registry, LLM_REGISTRY,
)
# UI utilities
from ui.utils import clean_model_name
# =============================================================================
# GLOBAL CONFIGURATION
# =============================================================================
# Backend availability flag
# Gate checked before LLM-backed players are registered or offered in the UI.
BACKEND_SYSTEM_AVAILABLE = True

# Tiny, CPU-friendly Hugging Face checkpoints used for the demo.
# Every entry maps the model id to itself.
HUGGINGFACE_MODELS: Dict[str, str] = {
    model_id: model_id
    for model_id in (
        "gpt2",
        "distilgpt2",
        "google/flan-t5-small",
        "EleutherAI/gpt-neo-125M",
    )
}

# Filled in further down once the arena's game registry has been loaded.
GAMES_REGISTRY: Dict[str, Any] = {}

# Directory (next to this file) holding the per-agent SQLite result files.
db_dir = Path(__file__).resolve().parent / "results"

# Column order of the leaderboard table.
LEADERBOARD_COLUMNS = [
    "agent_name",
    "agent_type",
    "# game instances",
    "total rewards",
    # "avg_generation_time (sec)",  # Commented out - needs fixing
    "win-rate",
    "win vs_random (%)",
]
# =============================================================================
# BACKEND INITIALIZATION
# =============================================================================
# Initialize HuggingFace backend and register models
# Stays None when the backend is disabled or could not be constructed.
huggingface_backend = None
if BACKEND_SYSTEM_AVAILABLE:
    try:
        huggingface_backend = HuggingFaceBackend()
        initialize_llm_registry()
        # Publish each checkpoint the backend reports as available under an
        # "hf_"-prefixed registry key.
        for model_name in HUGGINGFACE_MODELS:
            if not huggingface_backend.is_model_available(model_name):
                continue
            registry_key = f"hf_{model_name}"
            LLM_REGISTRY[registry_key] = {
                "backend": huggingface_backend,
                "model_name": model_name,
            }
            log.info("Registered HuggingFace model: %s", registry_key)
    except Exception as e:
        # Any failure disables the backend handle; models registered before
        # the failure remain in LLM_REGISTRY.
        log.error("Failed to initialize HuggingFace backend: %s", e)
        huggingface_backend = None
# =============================================================================
# GAMES REGISTRY SETUP
# =============================================================================
# Load available games from the registry
try:
    if games_registry is None:
        GAMES_REGISTRY = {}
    else:
        # Shallow copy of the registry's internal name -> entry table.
        GAMES_REGISTRY = dict(games_registry._registry.items())
        log.info("Successfully imported full arena - games are playable.")
except Exception as e:
    # Fall back to an empty registry; get_available_games() then serves its
    # built-in default list.
    log.warning("Failed to load games registry: %s", e)
    GAMES_REGISTRY = {}
def _get_game_display_mapping() -> Dict[str, str]:
    """
    Map every registered game key to the label shown in the UI.

    A registry entry that is a dict with a non-empty ``display_name`` uses
    that value; every other entry falls back to the key with underscores
    replaced by spaces and title-cased. When the registry is missing (or has
    no ``_registry`` attribute) the mapping is empty.

    Returns:
        Dict mapping internal game keys to display names
    """
    if games_registry is None or not hasattr(games_registry, "_registry"):
        return {}

    def _label_for(game_key, entry):
        explicit = entry.get("display_name") if isinstance(entry, dict) else None
        return explicit or game_key.replace("_", " ").title()

    return {
        game_key: _label_for(game_key, entry)
        for game_key, entry in games_registry._registry.items()
    }
# =============================================================================
# DATABASE HELPER FUNCTIONS
# =============================================================================
def ensure_results_dir() -> None:
    """Make sure ``db_dir`` exists, creating missing parent folders too."""
    db_dir.mkdir(exist_ok=True, parents=True)
def iter_agent_databases() -> Generator[Tuple[str, str, str], None, None]:
    """
    Walk the result databases, skipping those of the random agent.

    The agent type and model name are parsed from each file name by
    ``extract_agent_info``.

    Yields:
        Tuple of (database file path, agent type, model name)
    """
    for path in find_or_download_db():
        kind, model = extract_agent_info(path)
        if kind == "random":
            continue
        yield path, kind, model
def find_or_download_db() -> List[str]:
    """
    List the SQLite result files found in the results directory.

    Despite the name, nothing is downloaded here. The function makes sure the
    results directory exists, creates ``random_None.db`` with an empty
    ``games`` table when that file is missing, and then collects every
    ``*.db`` file in the directory.

    Returns:
        List of database file paths
    """
    ensure_results_dir()

    baseline_db = db_dir / "random_None.db"
    if not baseline_db.exists():
        connection = sqlite3.connect(str(baseline_db))
        try:
            connection.execute(
                """
                CREATE TABLE IF NOT EXISTS games (
                    id INTEGER PRIMARY KEY,
                    game_name TEXT,
                    player1 TEXT,
                    player2 TEXT,
                    winner INTEGER,
                    timestamp TEXT
                )
                """
            )
            connection.commit()
        finally:
            connection.close()

    return list(map(str, db_dir.glob("*.db")))
def extract_agent_info(filename: str) -> Tuple[str, str]:
    """
    Split a database file name into agent type and model name.

    The file stem is cut at its first underscore: the text before it is the
    agent type and everything after it is the model name. A stem without any
    underscore yields the model name "Unknown".

    Args:
        filename: Database filename (e.g., "llm_gpt2.db")
    Returns:
        Tuple of (agent_type, model_name)
    """
    agent_type, separator, model_name = Path(filename).stem.partition("_")
    if not separator:
        return agent_type, "Unknown"
    return agent_type, model_name
def get_available_games(include_aggregated: bool = True) -> List[str]:
    """
    List the game keys that can be offered in the UI.

    Registry keys are returned in sorted order; when the registry is empty a
    fixed three-game default list is used instead.

    Args:
        include_aggregated: Whether to put "Aggregated Performance" first
    Returns:
        List of available game names
    """
    if GAMES_REGISTRY:
        names = sorted(GAMES_REGISTRY)
    else:
        names = ["tic_tac_toe", "kuhn_poker", "connect_four"]
    return ["Aggregated Performance", *names] if include_aggregated else names
def extract_illegal_moves_summary() -> pd.DataFrame:
    """
    Count the rows of the ``illegal_moves`` table for every non-random agent.

    A database whose ``illegal_moves`` table is missing or unreadable is
    still listed, with a count of 0 (best effort); the SQLite error is
    logged at debug level instead of being silently discarded.

    Returns:
        DataFrame with columns ``agent_name`` and ``illegal_moves``. The
        columns are present even when there are no agent databases, so
        callers that plot by column name always find them.
    """
    columns = ["agent_name", "illegal_moves"]
    summary = []
    for db_file, _agent_type, model_name in iter_agent_databases():
        conn = sqlite3.connect(db_file)
        try:
            row = conn.execute(
                "SELECT COUNT(*) AS illegal_moves FROM illegal_moves"
            ).fetchone()
            count = int(row[0]) if row else 0
        except sqlite3.Error as exc:
            log.debug("No illegal-move data in %s: %s", db_file, exc)
            count = 0
        finally:
            conn.close()
        summary.append(
            {"agent_name": clean_model_name(model_name), "illegal_moves": count}
        )
    return pd.DataFrame(summary, columns=columns)
# =============================================================================
# PLAYER CONFIGURATION & TYPE DEFINITIONS
# =============================================================================
class PlayerConfigData(TypedDict, total=False):
    """Type definition for player configuration data."""

    # Internal player-type keys, in the order they are offered.
    player_types: List[str]
    # Internal player-type key -> label shown in the dropdowns.
    player_type_display: Dict[str, str]
    # Model identifiers that can back an LLM player.
    available_models: List[str]


class GameArenaConfig(TypedDict, total=False):
    """Type definition for game arena configuration."""

    # Game labels offered in the game dropdown.
    available_games: List[str]
    # Game label -> internal game key, used to translate a dropdown
    # selection back to the key the game runner expects.
    game_display_to_key: Dict[str, str]
    # Player-type information (see PlayerConfigData).
    player_config: PlayerConfigData
    # Short status text about the model backend, shown in the UI.
    model_info: str
    # Whether the LLM backend system is enabled.
    backend_available: bool
def setup_player_config(
    player_type: str, player_model: str, player_id: str
) -> Dict[str, Any]:
    """
    Turn a player dropdown selection into the agent config used by the runner.

    Anything that cannot be resolved to a human, a random bot or a known
    Hugging Face model falls back to the random agent.

    Args:
        player_type: Display label (or internal key) of the player type
        player_model: Model name, only consulted for the generic "llm" type
        player_id: Player identifier (not used by the mapping itself)
    Returns:
        Agent configuration dictionary
    """
    # Invert the key -> label table so a label can be traced back to its key.
    type_labels = create_player_config()["player_config"]["player_type_display"]
    label_to_key = {label: key for key, label in type_labels.items()}
    internal_key = label_to_key.get(player_type, player_type)

    if internal_key == "random_bot":
        return {"type": "random"}
    if internal_key == "human":
        return {"type": "human"}

    # Keys such as "hf_gpt2" / "llm_gpt2" carry the model id after the prefix.
    if internal_key and internal_key.startswith(("llm_", "hf_")):
        _, model_id = internal_key.split("_", 1)
        if BACKEND_SYSTEM_AVAILABLE and model_id in HUGGINGFACE_MODELS:
            return {"type": "llm", "model": model_id}

    # Generic "llm" type: the model comes from the separate model argument.
    model_from_argument = (
        internal_key == "llm"
        and player_model in HUGGINGFACE_MODELS
        and BACKEND_SYSTEM_AVAILABLE
    )
    if model_from_argument:
        return {"type": "llm", "model": player_model}
    return {"type": "random"}
def create_player_config(include_aggregated: bool = False) -> GameArenaConfig:
    """
    Assemble the game list and player-type tables that drive the UI.

    Args:
        include_aggregated: Whether to include aggregated stats option
    Returns:
        Complete game arena configuration
    """
    # Pair every internal game key with the label shown to the user.
    available_keys = get_available_games(include_aggregated=include_aggregated)
    key_to_display = _get_game_display_mapping()
    labelled_games = [
        (key, key_to_display.get(key, key.replace("_", " ").title()))
        for key in available_keys
    ]

    # Distinct labels in first-seen order, and the first key behind each one.
    available_games = list(dict.fromkeys(label for _, label in labelled_games))
    display_to_key = {}
    for key, label in labelled_games:
        display_to_key.setdefault(label, key)

    # Player types that are always available.
    player_types = ["human", "random_bot"]
    player_type_display = {
        "human": "Human Player",
        "random_bot": "Random Bot"
    }

    # One extra player type per Hugging Face model when the backend is on.
    if BACKEND_SYSTEM_AVAILABLE:
        curated_labels = {
            "gpt2": "GPT-2",
            "distilgpt2": "DistilGPT-2",
            "flan-t5-small": "FLAN-T5 Small",
            "gpt-neo-125M": "GPT-Neo 125M",
        }
        for model_key in HUGGINGFACE_MODELS:
            type_key = f"hf_{model_key}"
            player_types.append(type_key)
            # Drop the organisation prefix ("google/...") before labelling.
            tag = model_key.split("/")[-1]
            if tag in curated_labels:
                player_type_display[type_key] = curated_labels[tag]
            else:
                # Fallback for any new models
                player_type_display[type_key] = tag.replace("-", " ").title()

    if BACKEND_SYSTEM_AVAILABLE:
        model_info = (
            "HuggingFace transformer models integrated with backend system."
        )
    else:
        model_info = "Backend system not available - limited functionality."

    return {
        "available_games": available_games,
        "game_display_to_key": display_to_key,
        "player_config": {
            "player_types": player_types,
            "player_type_display": player_type_display,
            "available_models": list(HUGGINGFACE_MODELS),
        },
        "model_info": model_info,
        "backend_available": BACKEND_SYSTEM_AVAILABLE,
    }
# =============================================================================
# MAIN GAME LOGIC
# =============================================================================
def play_game(
    game_name: str,
    player1_type: str,
    player2_type: str,
    rounds: int = 1,
    seed: int | None = None,
) -> str:
    """
    Execute a complete game simulation between two players.

    Display labels for the game and both players are translated back to
    internal keys before the runner in ``ui.gradio_config_generator`` is
    called. Any exception raised by the runner is logged with its traceback
    and reported to the caller as an error string instead of propagating.

    Args:
        game_name: Name of the game to play
        player1_type: Type of player 1 (display name like "Human Player", "GPT-2")
        player2_type: Type of player 2 (display name like "Human Player", "GPT-2")
        rounds: Number of rounds to play
        seed: Random seed for reproducibility; derived from the clock if None
    Returns:
        Game result log as string
    """
    import time

    if game_name == "No Games Found":
        return "No games available. Please add game databases."
    log.info(
        "Starting game: %s | P1=%s P2=%s rounds=%d",
        game_name,
        player1_type,
        player2_type,
        rounds,
    )

    # Map human-friendly game name back to internal key if needed
    config = create_player_config()
    game_name = config.get("game_display_to_key", {}).get(game_name, game_name)

    # Map display labels for player types back to keys
    display_to_key = {
        v: k for k, v in config["player_config"]["player_type_display"].items()
    }
    p1_key = display_to_key.get(player1_type, player1_type)
    p2_key = display_to_key.get(player2_type, player2_type)

    def _hf_model(player_key):
        # "hf_<model id>" keys carry the model id after the prefix. A cleared
        # dropdown delivers None, which must not crash here.
        if isinstance(player_key, str) and player_key.startswith("hf_"):
            return player_key.split("_", 1)[1]
        return None

    try:
        from ui.gradio_config_generator import (
            run_game_with_existing_infrastructure,
        )

        # Use a clock-derived seed if not provided
        if seed is None:
            seed = int(time.time() * 1000) % (2**31 - 1)
        return run_game_with_existing_infrastructure(
            game_name=game_name,
            player1_type=p1_key,
            player2_type=p2_key,
            player1_model=_hf_model(p1_key),
            player2_model=_hf_model(p2_key),
            rounds=rounds,
            seed=seed,
        )
    except Exception as e:
        # Keep the user-facing message, but leave a traceback in the logs too.
        log.exception("Game simulation failed for %s", game_name)
        return f"Error during game simulation: {e}"
# =============================================================================
# LEADERBOARD & ANALYTICS
# =============================================================================
def extract_leaderboard_stats(game_name: str) -> pd.DataFrame:
    """
    Extract leaderboard statistics for a specific game or all games.

    One row is produced per non-random agent database, read from its
    ``game_results`` table. A database that cannot be queried (for example
    because the table is missing) is skipped with a warning, so one bad file
    cannot take the whole leaderboard down.

    Args:
        game_name: Internal game key, or "Aggregated Performance" to
            aggregate over every game in each database
    Returns:
        DataFrame with the columns listed in ``LEADERBOARD_COLUMNS``
        (empty, but with those columns, when no agent data is available)
    """
    # The aggregated view and the per-game view differ only in this filter.
    if game_name == "Aggregated Performance":
        game_clause, params = "", ()
    else:
        game_clause, params = " AND game_name = ?", (game_name,)

    totals_sql = (
        "SELECT COUNT(*), SUM(reward) FROM game_results WHERE 1 = 1"
        + game_clause
    )
    vs_random_sql = (
        "SELECT COUNT(*), SUM(CASE WHEN reward > 0 THEN 1 ELSE 0 END) "
        "FROM game_results WHERE opponent = 'random_None'" + game_clause
    )

    rows = []
    for db_file, agent_type, model_name in iter_agent_databases():
        conn = sqlite3.connect(db_file)
        try:
            games_played, reward_sum = conn.execute(
                totals_sql, params
            ).fetchone()
            total_vs_random, wins_vs_random = conn.execute(
                vs_random_sql, params
            ).fetchone()
        except sqlite3.Error as exc:
            log.warning(
                "Skipping %s in leaderboard: cannot read game_results (%s)",
                db_file,
                exc,
            )
            continue
        finally:
            conn.close()

        # SUM() is NULL (None) when no row matched; treat that as zero.
        games_played = int(games_played or 0)
        wins_vs_random = wins_vs_random or 0
        total_vs_random = total_vs_random or 0
        # NOTE(review): the reward sum is halved exactly as before; presumably
        # each game contributes two rows - confirm against the writer.
        total_rewards = float(reward_sum or 0) / 2.0
        vs_random_rate = (
            (wins_vs_random / total_vs_random) * 100.0
            if total_vs_random > 0
            else 0.0
        )

        rows.append(
            {
                "agent_name": clean_model_name(model_name),
                "agent_type": agent_type,
                "# game instances": games_played,
                "total rewards": total_rewards,
                # Average generation time is disabled until it is fixed.
                # NOTE(review): "win-rate" currently repeats the vs-random
                # rate, as in the previous implementation - confirm intent.
                "win-rate": round(vs_random_rate, 2),
                "win vs_random (%)": round(vs_random_rate, 2),
            }
        )

    return pd.DataFrame(rows, columns=LEADERBOARD_COLUMNS)
# =============================================================================
# VISUALIZATION HELPERS
# =============================================================================
def create_bar_plot(
    data: pd.DataFrame,
    x_col: str,
    y_col: str,
    title: str,
    x_label: str,
    y_label: str,
    horizontal: bool = False,
) -> gr.BarPlot:
    """
    Build a Gradio bar plot, optionally with the axes exchanged.

    Args:
        data: DataFrame containing the data
        x_col: Column name for x-axis
        y_col: Column name for y-axis
        title: Plot title
        x_label: X-axis label
        y_label: Y-axis label
        horizontal: When true, columns and labels of the two axes are swapped
    Returns:
        Gradio BarPlot component
    """
    if horizontal:
        # Horizontal bars: metric goes on x, category on y, labels follow.
        x_col, y_col = y_col, x_col
        x_label, y_label = y_label, x_label
    return gr.BarPlot(
        value=data,
        x=x_col,
        y=y_col,
        title=title,
        x_label=x_label,
        y_label=y_label,
    )
# =============================================================================
# FILE UPLOAD HANDLERS
# =============================================================================
def handle_db_upload(files: list[gr.File]) -> str:
    """
    Move uploaded database files into the results directory.

    Each upload is stored under its base name (any directory part of the
    uploaded path is dropped); an existing file of the same name is
    replaced where the platform's rename semantics allow it.

    Args:
        files: Uploaded files, either objects exposing the temporary path as
            ``.name`` or plain path strings; ``None`` or an empty list means
            nothing was uploaded
    Returns:
        Status message about upload success
    """
    ensure_results_dir()
    saved = []
    for f in files or []:
        # Accept both upload wrappers (path in .name) and bare path strings.
        src = Path(getattr(f, "name", f))
        dest = db_dir / src.name
        # shutil.move falls back to copy-and-delete when the temporary upload
        # directory and the results directory are on different filesystems,
        # where a plain rename would raise OSError.
        shutil.move(str(src), str(dest))
        saved.append(dest.name)
    return (
        f"Uploaded: {', '.join(saved)}" if saved else "No files uploaded."
    )
# =============================================================================
# GRADIO USER INTERFACE
# =============================================================================
"""
This section defines the complete Gradio web interface with the following tabs:
1. Game Arena: Interactive gameplay between humans and AI
2. Leaderboard: Performance statistics and rankings
3. Metrics Dashboard: Visual analytics and charts
4. Analysis of LLM Reasoning: Illegal moves and behavior analysis
5. About: Documentation and information
The interface supports:
- Real-time human vs AI gameplay
- Automatic AI move processing
- Dynamic dropdown population
- State management for interactive games
- File upload for database results
- Interactive visualizations
"""
with gr.Blocks() as interface:
# =========================================================================
# TAB 1: GAME ARENA
# =========================================================================
with gr.Tab("Game Arena"):
config = create_player_config(include_aggregated=False)
# Header and introduction
gr.Markdown("# Interactive Game Reasoning Arena")
gr.Markdown("Play games against LLMs, a random bot or watch LLMs compete!")
gr.Markdown(
f"> **🤖 Available AI Players**: {config['model_info']}\n"
"> Local transformer models run with Hugging Face transformers. "
"No API tokens required!\n\n"
"> **⚠️ Note on Reasoning Quality**: The available models are "
"relatively basic (GPT-2, DistilGPT-2, etc.) and may produce "
"limited or nonsensical reasoning. They are suitable for "
"demonstration purposes but don't expect sophisticated "
"strategic thinking or coherent explanations."
)
# Game selection and configuration
with gr.Row():
game_dropdown = gr.Dropdown(
choices=config["available_games"],
label="Select a Game",
value=(
config["available_games"][0]
if config["available_games"]
else "No Games Found"
),
)
rounds_slider = gr.Slider(
minimum=1,
maximum=10,
value=1,
step=1,
label="Number of Rounds",
)
def player_selector_block(label: str):
    """Render a heading plus a player-type dropdown and return the dropdown."""
    gr.Markdown(f"### {label}")

    # Labels shown to the user, in the configured player-type order.
    player_cfg = config["player_config"]
    type_labels = [
        player_cfg["player_type_display"][type_key]
        for type_key in player_cfg["player_types"]
    ]

    return gr.Dropdown(
        choices=type_labels,
        label=f"{label}",  # Just "Player 0" or "Player 1"
        value=type_labels[0] if type_labels else None,  # first label preselected
    )
# Player configuration
with gr.Row():
p1_type = player_selector_block("Player 0")
p2_type = player_selector_block("Player 1")
# Validation error message
validation_error = gr.Markdown(visible=False)
# Game state management
game_state = gr.State(value=None)
human_choices_p0 = gr.State([])
human_choices_p1 = gr.State([])
# Interactive game components (initially hidden)
with gr.Column(visible=False) as interactive_panel:
gr.Markdown("## Interactive Game")
with gr.Row():
with gr.Column(scale=2):
board_display = gr.Textbox(
label="Game Board",
lines=10,
placeholder="Board state will appear here...",
interactive=False,
)
with gr.Column(scale=1):
# Human move controls
gr.Markdown("### Your Move")
# Player 0 move selection
human_move_p0 = gr.Dropdown(
choices=[],
label="Your move (Player 0)",
visible=False,
interactive=True,
)
# Player 1 move selection
human_move_p1 = gr.Dropdown(
choices=[],
label="Your move (Player 1)",
visible=False,
interactive=True,
)
submit_btn = gr.Button(
"Submit Move",
variant="primary",
visible=False
)
reset_game_btn = gr.Button(
"Reset Game",
visible=False
)
# Game control buttons
play_button = gr.Button("🎮 Start Game", variant="primary")
start_btn = gr.Button(
"🎯 Start Interactive Game",
variant="secondary",
visible=False
)
# Game output display
game_output = gr.Textbox(
label="Game Log",
lines=20,
placeholder="Game results will appear here...",
)
def check_for_human_players(p1_type, p2_type):
    """Decide which game controls are visible for the selected players."""
    # Translate the selected labels back to internal player-type keys.
    label_to_key = {
        label: key for key, label in
        config["player_config"]["player_type_display"].items()
    }
    selected_keys = [
        label_to_key.get(choice, choice) for choice in (p1_type, p2_type)
    ]
    has_human = "human" in selected_keys

    interactive_panel_update = gr.update(visible=has_human)
    start_btn_update = gr.update(visible=has_human)
    # The single-shot button is only for games without a human player.
    play_button_update = gr.update(visible=not has_human)
    return interactive_panel_update, start_btn_update, play_button_update
def validate_player_selection(p1_type, p2_type):
    """Recompute both dropdowns' choices and the human-vs-human warning."""
    type_labels = config["player_config"]["player_type_display"]
    label_to_key = {label: key for key, label in type_labels.items()}
    p1_is_human = label_to_key.get(p1_type, p1_type) == "human"
    p2_is_human = label_to_key.get(p2_type, p2_type) == "human"

    all_labels = [
        type_labels[type_key]
        for type_key in config["player_config"]["player_types"]
    ]

    def _choices_facing(opponent_is_human):
        # A player facing a human opponent must not be offered "human" too.
        remaining = list(all_labels)
        if opponent_is_human:
            human_label = type_labels["human"]
            if human_label in remaining:
                remaining.remove(human_label)
        return remaining

    p1_choices = _choices_facing(p2_is_human)
    p2_choices = _choices_facing(p1_is_human)

    if p1_is_human and p2_is_human:
        error_msg = ("⚠️ **Cannot have Human vs Human games!** "
                     "Please select an AI player for one side.")
    else:
        error_msg = ""

    return (
        gr.update(choices=p1_choices),  # p1_type dropdown
        gr.update(choices=p2_choices),  # p2_type dropdown
        error_msg,                      # validation error message
    )
# Update UI when player types change
def update_validation_and_ui(p1_type, p2_type):
    """Combine choice validation and control visibility into one update."""
    p1_update, p2_update, error_msg = validate_player_selection(
        p1_type, p2_type
    )
    panel_update, start_update, play_update = check_for_human_players(
        p1_type, p2_type
    )
    # The warning is only shown when there is a message to display.
    error_update = gr.update(value=error_msg, visible=bool(error_msg))
    return (
        p1_update,     # p1_type choices
        p2_update,     # p2_type choices
        error_update,  # validation_error
        panel_update,  # interactive_panel
        start_update,  # start_btn
        play_update,   # play_button
    )
# Wire up change handlers for both player dropdowns
# Either dropdown changing re-validates both selections with the same
# inputs and outputs.
_selection_inputs = [p1_type, p2_type]
_selection_outputs = [
    p1_type, p2_type, validation_error,
    interactive_panel, start_btn, play_button
]
for player_dropdown in (p1_type, p2_type):
    player_dropdown.change(
        update_validation_and_ui,
        inputs=_selection_inputs,
        outputs=_selection_outputs,
    )
# Standard single-shot game
def start_game_with_validation(
    game_name, p1_type, p2_type, rounds
):
    """Run a single-shot game unless both selected players are human."""
    label_to_key = {
        label: key for key, label in
        config["player_config"]["player_type_display"].items()
    }
    both_human = (
        label_to_key.get(p1_type, p1_type) == "human"
        and label_to_key.get(p2_type, p2_type) == "human"
    )
    if not both_human:
        # play_game does its own label -> key translation.
        return play_game(game_name, p1_type, p2_type, rounds)
    return ("⚠️ **Cannot start Human vs Human game!** "
            "Please select an AI player for one side.")
# The single-shot game writes its log into the game output box.
play_button.click(
    fn=start_game_with_validation,
    inputs=[game_dropdown, p1_type, p2_type, rounds_slider],
    outputs=[game_output],
)
# Interactive game functions
def start_interactive_game(
    game_name, p1_type, p2_type, rounds
):
    """
    Initialize an interactive game session.

    Returns an 8-tuple matching the handler's outputs: game state, legal
    actions for player 0 and player 1 (as ``(action_id, label)`` pairs),
    board text, updates for both move dropdowns, and updates for the submit
    and reset buttons. On a human-vs-human selection or any error the state
    is cleared, the move controls are hidden and the board text carries the
    message; errors are additionally logged with their traceback.
    """

    def _failure(message):
        # Cleared state plus hidden move controls, with `message` on the board.
        return (
            None,     # game_state
            [],       # human_choices_p0
            [],       # human_choices_p1
            message,  # board_display
            gr.update(choices=[], visible=False),  # human_move_p0
            gr.update(choices=[], visible=False),  # human_move_p1
            gr.update(visible=False),              # submit_btn
            gr.update(visible=False),              # reset_game_btn
        )

    try:
        # Map display labels back to internal keys
        display_to_key = {
            v: k for k, v in
            config["player_config"]["player_type_display"].items()
        }
        p1_key = display_to_key.get(p1_type, p1_type)
        p2_key = display_to_key.get(p2_type, p2_type)

        if p1_key == "human" and p2_key == "human":
            return _failure(
                "⚠️ **Cannot start Human vs Human game!** "
                "Please select an AI player for one side."
            )

        from ui.gradio_config_generator import start_game_interactive
        import time

        # Map display game name back to internal key if needed
        game_display_to_key = config.get("game_display_to_key", {})
        internal_game = game_display_to_key.get(game_name, game_name)

        # "hf_<model id>" player keys carry the model id after the prefix.
        p1_model = p1_key.split("_", 1)[1] if p1_key.startswith("hf_") else None
        p2_model = p2_key.split("_", 1)[1] if p2_key.startswith("hf_") else None

        # Use timestamp as seed
        seed = int(time.time() * 1000) % (2**31 - 1)

        # The returned text is deliberately not bound to the name `log`:
        # that would shadow the module logger inside this function.
        board_text, state, legal_p0, legal_p1 = start_game_interactive(
            game_name=internal_game,
            player1_type=p1_key,
            player2_type=p2_key,
            player1_model=p1_model,
            player2_model=p2_model,
            rounds=rounds,
            seed=seed,
        )

        # Dropdown entries are (label shown, value selected) pairs, i.e. the
        # legal-action pairs with their two fields swapped.
        p0_dropdown_choices = [
            (label, action_id) for action_id, label in legal_p0
        ]
        p1_dropdown_choices = [
            (label, action_id) for action_id, label in legal_p1
        ]

        return (
            state,       # game_state
            legal_p0,    # human_choices_p0
            legal_p1,    # human_choices_p1
            board_text,  # board_display
            gr.update(
                choices=p0_dropdown_choices,
                visible=(p1_key == "human"),  # Player 0 is the first selector
                value=None
            ),  # human_move_p0
            gr.update(
                choices=p1_dropdown_choices,
                visible=(p2_key == "human"),  # Player 1 is the second selector
                value=None
            ),  # human_move_p1
            gr.update(visible=True),  # submit_btn
            gr.update(visible=True),  # reset_game_btn
        )
    except Exception as e:
        log.exception("Failed to start interactive game")
        return _failure(f"Error starting interactive game: {e}")
def submit_human_move_handler(p0_action, p1_action, state, choices_p0, choices_p1):
    """
    Forward the selected move(s) to the game and refresh the controls.

    Returns the same 8-tuple layout as the other interactive handlers: game
    state, legal actions for both players, board text, two dropdown updates,
    and updates for the submit and reset buttons.
    """
    try:
        from ui.gradio_config_generator import submit_human_move

        if not state:
            # Nothing to advance: hide every move control.
            return (
                state, [], [], "No game running.",
                gr.update(choices=[], visible=False),
                gr.update(choices=[], visible=False),
                gr.update(visible=False),
                gr.update(visible=False)
            )

        # A player's action is None when nothing was selected in its dropdown.
        board_text, new_state, next_p0, next_p1 = submit_human_move(
            action_p0=p0_action,
            action_p1=p1_action,
            state=state,
        )

        def _as_dropdown(legal_actions):
            # (action_id, label) pairs -> (label shown, value selected) pairs
            return [(label, action_id) for action_id, label in legal_actions]

        p0_dropdown_choices = _as_dropdown(next_p0)
        p1_dropdown_choices = _as_dropdown(next_p1)

        # Move controls disappear once the game has ended either way.
        game_over = (new_state.get("terminated", False) or
                     new_state.get("truncated", False))
        still_playing = not game_over

        return (
            new_state,   # game_state
            next_p0,     # human_choices_p0
            next_p1,     # human_choices_p1
            board_text,  # board_display
            gr.update(
                choices=p0_dropdown_choices,
                visible=bool(p0_dropdown_choices) and still_playing,
                value=None,
            ),  # human_move_p0
            gr.update(
                choices=p1_dropdown_choices,
                visible=bool(p1_dropdown_choices) and still_playing,
                value=None,
            ),  # human_move_p1
            gr.update(visible=still_playing),  # submit_btn
            gr.update(visible=True),           # reset_game_btn
        )
    except Exception as e:
        # Keep state and controls as they were; only report the error.
        return (
            state, choices_p0, choices_p1, f"Error processing move: {e}",
            gr.update(), gr.update(), gr.update(), gr.update()
        )
def reset_interactive_game():
    """Clear the stored game and hide every interactive move control."""
    board_message = (
        "Game reset. Click 'Start Interactive Game' to begin a new game."
    )
    cleared_state = (None, [], [])  # game_state, human_choices_p0/p1
    hidden_controls = (
        gr.update(choices=[], visible=False),  # human_move_p0
        gr.update(choices=[], visible=False),  # human_move_p1
        gr.update(visible=False),              # submit_btn
        gr.update(visible=False),              # reset_game_btn
    )
    return (*cleared_state, board_message, *hidden_controls)
# Wire up interactive game handlers
# All three interactive handlers return the same 8-tuple, so they share one
# list of output components.
_interactive_outputs = [
    game_state, human_choices_p0, human_choices_p1, board_display,
    human_move_p0, human_move_p1, submit_btn, reset_game_btn,
]
start_btn.click(
    start_interactive_game,
    inputs=[game_dropdown, p1_type, p2_type, rounds_slider],
    outputs=_interactive_outputs,
)
submit_btn.click(
    submit_human_move_handler,
    inputs=[
        human_move_p0, human_move_p1, game_state,
        human_choices_p0, human_choices_p1,
    ],
    outputs=_interactive_outputs,
)
reset_game_btn.click(
    reset_interactive_game,
    outputs=_interactive_outputs,
)
with gr.Tab("Leaderboard"):
gr.Markdown(
"# LLM Model Leaderboard\n"
"Track performance across different games!"
)
# Use the same display logic as Game Arena
leaderboard_config = create_player_config(include_aggregated=True)
leaderboard_game_dropdown = gr.Dropdown(
choices=leaderboard_config["available_games"],
label="Select Game",
value=(
leaderboard_config["available_games"][0]
if leaderboard_config["available_games"]
else "No Games Found"
),
)
leaderboard_table = gr.Dataframe(
value=extract_leaderboard_stats("Aggregated Performance"),
headers=LEADERBOARD_COLUMNS,
interactive=False,
)
refresh_btn = gr.Button("🔄 Refresh")
def _update_leaderboard(game: str) -> pd.DataFrame:
    """Recompute the leaderboard table for the game chosen in the dropdown."""
    # The dropdown shows labels; the statistics query needs the internal key.
    label_to_key = leaderboard_config.get("game_display_to_key", {})
    return extract_leaderboard_stats(label_to_key.get(game, game))
# Picking another game and pressing refresh both rebuild the table.
for _register in (leaderboard_game_dropdown.change, refresh_btn.click):
    _register(
        _update_leaderboard,
        inputs=[leaderboard_game_dropdown],
        outputs=[leaderboard_table],
    )
gr.Markdown("### Upload new `.db` result files")
db_files = gr.Files(file_count="multiple", file_types=[".db"])
upload_btn = gr.Button("⬆️ Upload to results/")
upload_status = gr.Markdown()
# Store the selected files and show the resulting status message.
upload_btn.click(
    fn=handle_db_upload,
    inputs=[db_files],
    outputs=[upload_status],
)
with gr.Tab("Metrics Dashboard"):
gr.Markdown(
"# 📊 Metrics Dashboard\n"
"Visual summaries of LLM performance across games."
)
metrics_df = extract_leaderboard_stats("Aggregated Performance")
with gr.Row():
create_bar_plot(
data=metrics_df,
x_col="agent_name",
y_col="win vs_random (%)",
title="Win Rate vs Random Bot",
x_label="LLM Model",
y_label="Win Rate (%)",
horizontal=True,
)
with gr.Row():
# Commented out - avg_generation_time needs fixing
# create_bar_plot(
# data=metrics_df,
# x_col="agent_name",
# y_col="avg_generation_time (sec)",
# title="Average Generation Time",
# x_label="LLM Model",
# y_label="Time (sec)",
# )
pass
with gr.Row():
gr.Dataframe(
value=metrics_df,
label="Performance Summary",
interactive=False,
)
with gr.Tab("Analysis of LLM Reasoning"):
gr.Markdown(
"# 🧠 Analysis of LLM Reasoning\n"
"Insights into move legality and decision behavior."
)
illegal_df = extract_illegal_moves_summary()
with gr.Row():
create_bar_plot(
data=illegal_df,
x_col="agent_name",
y_col="illegal_moves",
title="Illegal Moves by Model",
x_label="LLM Model",
y_label="# of Illegal Moves",
horizontal=True,
)
with gr.Row():
gr.Dataframe(
value=illegal_df,
label="Illegal Move Summary",
interactive=False,
)
with gr.Tab("About"):
gr.Markdown(
"""
# About Game Reasoning Arena
This app analyzes and visualizes LLM performance in games.
- **Game Arena**: Play games vs. LLMs or watch LLM vs. LLM
- **Leaderboard**: Performance statistics across games
- **Metrics Dashboard**: Visual summaries
- **Reasoning Analysis**: Illegal moves & behavior
**Data**: SQLite databases in `/results/`.
"""
)
# Local run only. On Spaces, the runtime will serve `interface` automatically.
if __name__ == "__main__":
    # Listen on all interfaces, let Gradio pick the port, hide the API page.
    interface.launch(
        server_name="0.0.0.0",
        server_port=None,
        show_api=False,
    )