<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Computer Agent Evaluation Viewer</title>
  <style>
    /* Page chrome */
    body {
      font-family: Arial, sans-serif;
      margin: 0;
      padding: 20px;
      background-color: #f5f5f5;
    }
    .container {
      max-width: 1200px;
      margin: 0 auto;
      background-color: #fff;
      padding: 20px;
      border-radius: 8px;
      box-shadow: 0 2px 10px rgba(0,0,0,0.1);
    }
    h1, h2, h3 {
      color: #333;
    }

    /* Form controls */
    select, input, button {
      padding: 8px 12px;
      margin: 5px 0;
      border: 1px solid #ddd;
      border-radius: 4px;
    }
    button {
      background-color: #4a6cf7;
      color: white;
      cursor: pointer;
      border: none;
    }
    button:hover {
      background-color: #3a5ce5;
    }
    button:disabled {
      background-color: #cccccc;
      cursor: not-allowed;
    }

    /* "Load Evaluation Data" panel */
    .load-panel {
      margin-bottom: 20px;
      padding: 15px;
      background-color: #f0f0f0;
      border-radius: 8px;
    }
    .load-row {
      display: flex;
      align-items: center;
      gap: 10px;
      margin-top: 10px;
    }
    .load-row input {
      flex-grow: 1;
      padding: 8px;
    }
    .load-field {
      margin-top: 10px;
    }
    .eval-select {
      min-width: 300px;
    }
    .load-status {
      margin-top: 10px;
      font-style: italic;
    }

    /* Example / run selectors */
    .row {
      display: flex;
      margin-bottom: 20px;
    }
    .col {
      flex: 1;
      padding: 0 10px;
    }

    /* Screenshot viewer */
    .image-viewer {
      width: 100%;
      max-height: 500px;
      border: 1px solid #ddd;
      border-radius: 4px;
      overflow: hidden;
      margin-bottom: 10px;
      position: relative;
    }
    .image-viewer img {
      max-width: 100%;
      max-height: 450px;
      display: block;
      margin: 0 auto;
    }
    .image-controls {
      display: flex;
      justify-content: space-between;
      align-items: center;
      margin-top: 10px;
    }
    .image-slider {
      width: 100%;
    }
    .nav-buttons {
      display: flex;
      gap: 10px;
    }
    .text-center {
      text-align: center;
    }

    /* Agent trace steps. The header is a <button>, so the generic button
       look (blue background, white text, margins) is reset here. */
    .step {
      border: 1px solid #ddd;
      border-radius: 4px;
      margin-bottom: 10px;
      overflow: hidden;
    }
    .step-header {
      display: flex;
      justify-content: space-between;
      width: 100%;
      margin: 0;
      padding: 10px;
      border: none;
      border-radius: 0;
      background-color: #f0f0f0;
      color: inherit;
      font: inherit;
      font-weight: bold;
      text-align: left;
      cursor: pointer;
    }
    .step-header:hover {
      background-color: #f0f0f0;
    }
    .step-header:focus-visible {
      /* .step clips overflow, so draw the focus ring inside the header. */
      outline-offset: -2px;
    }
    .step-content {
      padding: 15px;
      white-space: pre-wrap;
      font-family: monospace;
      background-color: #f9f9f9;
      max-height: 300px;
      overflow-y: auto;
    }

    .status-success {
      color: #22c55e;
      font-weight: bold;
    }
    .status-failure {
      color: #ef4444;
      font-weight: bold;
    }

    /* Tabs. Each tab is a <button>; reset the generic button look. */
    .tabs {
      display: flex;
      border-bottom: 1px solid #ddd;
      margin-bottom: 20px;
    }
    .tab {
      padding: 10px 20px;
      margin: 0;
      border: none;
      border-bottom: 2px solid transparent;
      border-radius: 0;
      background: none;
      color: inherit;
      font: inherit;
      cursor: pointer;
    }
    .tab:hover {
      background-color: transparent;
    }
    .tab.active {
      border-bottom-color: #4a6cf7;
      font-weight: bold;
    }
    .tab-content {
      display: none;
    }
    .tab-content.active {
      display: block;
    }

    pre {
      background-color: #f0f0f0;
      padding: 10px;
      border-radius: 4px;
      overflow-x: auto;
      white-space: pre-wrap;
    }
    .error-message {
      background-color: #fee2e2;
      color: #b91c1c;
      padding: 10px;
      border-radius: 4px;
      margin: 10px 0;
    }
    .loading {
      display: inline-block;
      width: 20px;
      height: 20px;
      border: 2px solid #f3f3f3;
      border-top: 2px solid #3498db;
      border-radius: 50%;
      animation: spin 1s linear infinite;
      margin-left: 10px;
    }
    @keyframes spin {
      0% { transform: rotate(0deg); }
      100% { transform: rotate(360deg); }
    }

    /* Kept last so it wins over any same-specificity display rule above. */
    .hidden {
      display: none;
    }
  </style>
</head>
<body>
  <main class="container">
    <h1>Computer Agent Evaluation Viewer</h1>

    <!-- Path and evaluation selection -->
    <section class="load-panel">
      <h2>Load Evaluation Data</h2>
      <div class="load-row">
        <label for="base-path">Base directory:</label>
        <input type="text" id="base-path" placeholder="Base directory path (leave empty for default)">
        <button type="button" id="refresh-evals-btn">Refresh</button>
      </div>
      <div class="load-field">
        <label for="eval-select">Select Evaluation:</label>
        <select id="eval-select" class="eval-select"></select>
      </div>
      <!-- Live region: present before any status text is injected -->
      <div id="load-status" class="load-status" role="status"></div>
    </section>

    <!-- Example and run selectors -->
    <div class="row">
      <div class="col">
        <label for="example-select">Select Example:</label>
        <select id="example-select">
          <option value="">-- Select Example --</option>
        </select>
      </div>
      <div class="col">
        <label for="run-select">Select Run:</label>
        <select id="run-select" disabled>
          <option value="">-- Select Run --</option>
        </select>
      </div>
    </div>

    <!-- Task and status display -->
    <div id="run-details" class="hidden">
      <div>
        <h2>Task</h2>
        <pre id="task-text"></pre>
      </div>
      <div>
        <h2>Run Status</h2>
        <div id="status-display"></div>
      </div>

      <!-- Tabs -->
      <div class="tabs">
        <button type="button" class="tab active" data-tab="screenshots" aria-pressed="true">Screenshots</button>
        <button type="button" class="tab" data-tab="agent-trace" aria-pressed="false">Agent Trace</button>
        <button type="button" class="tab" data-tab="raw-json" aria-pressed="false">Raw JSON</button>
      </div>

      <!-- Screenshots tab -->
      <div id="screenshots-tab" class="tab-content active">
        <div id="no-images" class="hidden">
          <p>No screenshots available for this run.</p>
        </div>
        <div id="image-container" class="image-viewer hidden">
          <img id="current-image" src="" alt="Screenshot">
          <p id="image-caption" class="text-center"></p>
        </div>
        <div id="image-controls" class="image-controls hidden">
          <div class="nav-buttons">
            <button type="button" id="prev-image">Previous</button>
            <span id="image-counter">0 / 0</span>
            <button type="button" id="next-image">Next</button>
          </div>
          <input type="range" id="image-slider" class="image-slider" min="0" max="0" value="0" aria-label="Screenshot position">
        </div>
      </div>

      <!-- Agent trace tab -->
      <div id="agent-trace-tab" class="tab-content">
        <div id="agent-steps"></div>
      </div>

      <!-- Raw JSON tab -->
      <div id="raw-json-tab" class="tab-content">
        <div id="json-loading-indicator" class="hidden">
          <p>Loading metadata... <span class="loading"></span></p>
        </div>
        <div id="json-error" class="error-message hidden" role="alert"></div>
        <pre id="raw-json"></pre>
      </div>
    </div>
  </main>

  <script>
    // ---------------------------------------------------------------------
    // Application state
    // ---------------------------------------------------------------------

    // Single mutable state object shared by all handlers below.
    const appState = {
      basePath: '',
      evalId: null,
      currentExampleId: null,
      currentRunId: null,
      currentImages: [],
      currentImageIndex: 0,
      loadedData: {
        examples: {},
        runs: {},
        metadata: {},
        screenshots: {}
      }
    };

    // Request counters, one per dependent loader. A loader captures the
    // counter value when it starts; if the counter has moved on by the time
    // its response arrives, the response belongs to an outdated selection
    // and is dropped instead of overwriting newer UI state.
    const requestSeq = { examples: 0, runs: 0, runData: 0 };

    const EVAL_PLACEHOLDER = '-- Select Evaluation --';
    const EXAMPLE_PLACEHOLDER = '-- Select Example --';
    const RUN_PLACEHOLDER = '-- Select Run --';

    // ---------------------------------------------------------------------
    // DOM elements
    // ---------------------------------------------------------------------

    const basePathInput = document.getElementById('base-path');
    const refreshEvalsBtn = document.getElementById('refresh-evals-btn');
    const evalSelect = document.getElementById('eval-select');
    const loadStatusDisplay = document.getElementById('load-status');
    const exampleSelect = document.getElementById('example-select');
    const runSelect = document.getElementById('run-select');
    const runDetails = document.getElementById('run-details');
    const taskText = document.getElementById('task-text');
    const statusDisplay = document.getElementById('status-display');
    const imageContainer = document.getElementById('image-container');
    const noImages = document.getElementById('no-images');
    const imageControls = document.getElementById('image-controls');
    const currentImage = document.getElementById('current-image');
    const imageCaption = document.getElementById('image-caption');
    const imageCounter = document.getElementById('image-counter');
    const imageSlider = document.getElementById('image-slider');
    const prevImage = document.getElementById('prev-image');
    const nextImage = document.getElementById('next-image');
    const agentSteps = document.getElementById('agent-steps');
    const rawJson = document.getElementById('raw-json');
    const jsonLoadingIndicator = document.getElementById('json-loading-indicator');
    const jsonError = document.getElementById('json-error');
    const tabButtons = document.querySelectorAll('.tab');
    const tabPanels = document.querySelectorAll('.tab-content');

    // ---------------------------------------------------------------------
    // Helpers
    // ---------------------------------------------------------------------

    // Build an API URL from path segments. Every segment is percent-encoded
    // so IDs containing characters such as '#', '?', '%' or '/' cannot
    // change the shape of the request. The current base path is always sent
    // as the `path` query parameter.
    function apiUrl(...segments) {
      const path = segments.map(segment => encodeURIComponent(segment)).join('/');
      return `/api/${path}?path=${encodeURIComponent(appState.basePath)}`;
    }

    // Extract a human-readable message from a failed response. Uses the
    // `error` field of a JSON body when there is one; otherwise (including
    // non-JSON bodies) returns `fallback` plus the HTTP status code.
    async function readErrorMessage(response, fallback) {
      try {
        const body = await response.json();
        if (body && body.error) {
          return String(body.error);
        }
      } catch (parseErr) {
        // Body was not JSON; fall through to the generic message.
      }
      return `${fallback} (HTTP ${response.status})`;
    }

    // Fetch `url` and return the parsed JSON body. Throws an Error carrying
    // the server's message (or `fallbackMessage`) when the response is not OK.
    async function fetchJson(url, fallbackMessage) {
      const response = await fetch(url);
      if (!response.ok) {
        throw new Error(await readErrorMessage(response, fallbackMessage));
      }
      return response.json();
    }

    // Replace all options of `select` with a single empty-valued placeholder.
    function resetSelect(select, placeholder) {
      select.textContent = '';
      select.appendChild(new Option(placeholder, ''));
    }

    // Reset everything that depends on the selected example: invalidate
    // in-flight run requests, empty and disable the run dropdown, and hide
    // the run details.
    function clearRuns() {
      requestSeq.runs++;
      requestSeq.runData++;
      appState.currentRunId = null;
      resetSelect(runSelect, RUN_PLACEHOLDER);
      runSelect.disabled = true;
      runDetails.classList.add('hidden');
    }

    // Reset everything that depends on the selected evaluation.
    function clearExamples() {
      appState.currentExampleId = null;
      resetSelect(exampleSelect, EXAMPLE_PLACEHOLDER);
      exampleSelect.disabled = true;
      clearRuns();
    }

    // Convert an arbitrary metadata value to display text. Strings are used
    // as-is; anything else is pretty-printed as JSON so objects do not show
    // up as "[object Object]".
    function toDisplayText(value) {
      if (typeof value === 'string') {
        return value;
      }
      try {
        const json = JSON.stringify(value, null, 2);
        return json === undefined ? String(value) : json;
      } catch (stringifyErr) {
        return String(value);
      }
    }

    // ---------------------------------------------------------------------
    // Evaluations
    // ---------------------------------------------------------------------

    // Load the list of evaluations for the base path typed by the user,
    // fill the evaluation dropdown and auto-select the latest one
    // (greatest ID in default string sort order).
    async function loadEvaluations() {
      appState.basePath = basePathInput.value.trim();
      loadStatusDisplay.textContent = 'Loading evaluations...';
      refreshEvalsBtn.disabled = true;

      try {
        const evals = await fetchJson(apiUrl('evals'), 'Failed to load evaluations');
        if (!Array.isArray(evals)) {
          throw new Error('Unexpected response while loading evaluations');
        }

        resetSelect(evalSelect, EVAL_PLACEHOLDER);
        evals.forEach(evalId => {
          evalSelect.appendChild(new Option(evalId, evalId));
        });

        loadStatusDisplay.textContent = `Loaded ${evals.length} evaluations`;

        if (evals.length > 0) {
          // Sort a copy so the dropdown keeps the order the server sent.
          evalSelect.value = [...evals].sort().reverse()[0];
        }
        // Always notify the change handler: with an empty list the select
        // now holds the placeholder, and dependants must be cleared.
        evalSelect.dispatchEvent(new Event('change'));
      } catch (err) {
        console.error('Error loading evaluations:', err);
        loadStatusDisplay.textContent = `Error: ${err.message}`;
      } finally {
        refreshEvalsBtn.disabled = false;
      }
    }

    refreshEvalsBtn.addEventListener('click', loadEvaluations);

    // Evaluation selected: load its examples and auto-select the first one.
    evalSelect.addEventListener('change', async () => {
      const evalId = evalSelect.value;
      const seq = ++requestSeq.examples;
      appState.evalId = evalId;
      clearExamples();

      if (!evalId) {
        evalSelect.disabled = false;
        return;
      }

      try {
        loadStatusDisplay.textContent = 'Loading examples...';
        evalSelect.disabled = true;

        const payload = await fetchJson(apiUrl('eval', evalId, 'examples'), 'Failed to load examples');
        if (seq !== requestSeq.examples) {
          return; // A newer evaluation selection superseded this one.
        }

        // The handlers below treat the payload as a map of example ID to
        // task text.
        const examples = payload || {};
        appState.loadedData.examples = examples;

        for (const [exampleId, task] of Object.entries(examples)) {
          const option = new Option(exampleId, exampleId);
          option.title = task; // Show task as tooltip
          exampleSelect.appendChild(option);
        }
        exampleSelect.disabled = false;

        const exampleIds = Object.keys(examples);
        loadStatusDisplay.textContent = `Loaded ${exampleIds.length} examples`;

        if (exampleIds.length > 0) {
          exampleSelect.value = exampleIds[0];
          // Trigger change event to load runs
          exampleSelect.dispatchEvent(new Event('change'));
        }
      } catch (err) {
        if (seq === requestSeq.examples) {
          console.error('Error loading examples:', err);
          loadStatusDisplay.textContent = `Error: ${err.message}`;
        }
      } finally {
        // Only the most recent request may re-enable the control.
        if (seq === requestSeq.examples) {
          evalSelect.disabled = false;
        }
      }
    });

    // ---------------------------------------------------------------------
    // Examples and runs
    // ---------------------------------------------------------------------

    // Example selected: load its runs, sort them by ID (numeric-aware) and
    // auto-select the first one.
    exampleSelect.addEventListener('change', async () => {
      const evalId = appState.evalId;
      const exampleId = exampleSelect.value;
      appState.currentExampleId = exampleId;
      clearRuns();
      const seq = requestSeq.runs;

      if (!exampleId) {
        return;
      }

      try {
        loadStatusDisplay.textContent = 'Loading runs...';
        exampleSelect.disabled = true;

        const runs = await fetchJson(
          apiUrl('eval', evalId, 'example', exampleId, 'runs'),
          'Failed to load runs'
        );
        if (seq !== requestSeq.runs) {
          return; // The example or evaluation changed while loading.
        }
        if (!Array.isArray(runs)) {
          throw new Error('Unexpected response while loading runs');
        }

        // String() guards against numeric IDs, which have no localeCompare.
        runs.sort((a, b) => String(a.id).localeCompare(String(b.id), undefined, { numeric: true }));
        appState.loadedData.runs[exampleId] = runs;

        runs.forEach(run => {
          const option = new Option(`${run.id} (${run.status})`, run.id);
          option.dataset.status = run.status;
          runSelect.appendChild(option);
        });
        runSelect.disabled = false;

        loadStatusDisplay.textContent = `Loaded ${runs.length} runs`;

        if (runs.length > 0) {
          runSelect.value = runs[0].id;
          // Trigger change event to load run data
          runSelect.dispatchEvent(new Event('change'));
        }
      } catch (err) {
        if (seq === requestSeq.runs) {
          console.error('Error loading runs:', err);
          loadStatusDisplay.textContent = `Error: ${err.message}`;
        }
      } finally {
        if (seq === requestSeq.runs) {
          exampleSelect.disabled = false;
        }
      }
    });

    // Run selected: show the details panel and load its data.
    runSelect.addEventListener('change', () => {
      appState.currentRunId = runSelect.value;

      if (appState.currentRunId && appState.currentExampleId) {
        loadRunData(appState.currentExampleId, appState.currentRunId);
        runDetails.classList.remove('hidden');
      } else {
        runDetails.classList.add('hidden');
      }
    });

    // ---------------------------------------------------------------------
    // Run data
    // ---------------------------------------------------------------------

    // Render the run status. Built from DOM nodes and textContent so the
    // server-supplied error message can never be interpreted as HTML.
    function renderStatus(metadata) {
      statusDisplay.textContent = '';

      const addParagraph = () => statusDisplay.appendChild(document.createElement('p'));

      if (!metadata) {
        addParagraph().textContent = 'Status information not available';
        return;
      }

      const completed = metadata.status === 'completed';
      const badge = document.createElement('span');
      badge.className = completed ? 'status-success' : 'status-failure';
      badge.textContent = completed ? '✓ Completed successfully' : '✗ Failed';
      addParagraph().appendChild(badge);

      if (!completed && metadata.error_message) {
        addParagraph().textContent = `Error: ${metadata.error_message}`;
      }
    }

    // Load metadata and screenshots for one run and render every tab.
    //   exampleId - ID of the example the run belongs to
    //   runId     - ID of the run to display
    // A metadata failure is reported in the Raw JSON tab and a screenshot
    // failure in the status line; neither prevents the rest from rendering.
    async function loadRunData(exampleId, runId) {
      const seq = ++requestSeq.runData;
      const isStale = () => seq !== requestSeq.runData;
      const evalId = appState.evalId;
      const runUrl = resource => apiUrl('eval', evalId, 'example', exampleId, 'run', runId, resource);

      loadStatusDisplay.textContent = 'Loading run data...';
      runSelect.disabled = true;
      jsonLoadingIndicator.classList.remove('hidden');
      jsonError.classList.add('hidden');

      try {
        // Metadata
        const metadataResponse = await fetch(runUrl('metadata'));
        let metadata = null;
        let metadataError = '';
        if (metadataResponse.ok) {
          metadata = await metadataResponse.json();
        } else {
          metadataError = await readErrorMessage(metadataResponse, 'Unknown error');
        }
        if (isStale()) {
          return;
        }

        if (metadataError) {
          console.error('Error loading metadata:', metadataError);
          jsonError.textContent = `Error loading metadata: ${metadataError}`;
          jsonError.classList.remove('hidden');
        }

        appState.loadedData.metadata[exampleId] = appState.loadedData.metadata[exampleId] || {};
        appState.loadedData.metadata[exampleId][runId] = metadata;

        // Task and status
        const task = appState.loadedData.examples[exampleId];
        taskText.textContent = task || "No task available";
        renderStatus(metadata);

        // Screenshots. Anything other than an OK response carrying an array
        // is treated as "no screenshots" plus a note in the status line.
        const screenshotsResponse = await fetch(runUrl('screenshots'));
        let screenshots = [];
        let screenshotsError = '';
        if (screenshotsResponse.ok) {
          const payload = await screenshotsResponse.json();
          if (Array.isArray(payload)) {
            screenshots = payload;
          } else {
            screenshotsError = 'unexpected response format';
          }
        } else {
          screenshotsError = await readErrorMessage(screenshotsResponse, 'Failed to load screenshots');
        }
        if (isStale()) {
          return;
        }

        appState.loadedData.screenshots[exampleId] = appState.loadedData.screenshots[exampleId] || {};
        appState.loadedData.screenshots[exampleId][runId] = screenshots;

        loadScreenshots(exampleId, runId);
        renderAgentTrace(metadata);

        // Raw JSON
        rawJson.textContent = metadata
          ? JSON.stringify(metadata, null, 2)
          : "No metadata available";

        // Show screenshots tab by default
        document.querySelector('.tab[data-tab="screenshots"]').click();

        if (screenshotsError) {
          console.error('Error loading screenshots:', screenshotsError);
          loadStatusDisplay.textContent = `Run data loaded (screenshots unavailable: ${screenshotsError})`;
        } else {
          loadStatusDisplay.textContent = 'Run data loaded successfully';
        }
      } catch (err) {
        if (!isStale()) {
          console.error('Error loading run data:', err);
          loadStatusDisplay.textContent = `Error: ${err.message}`;
          jsonError.textContent = `Error loading data: ${err.message}`;
          jsonError.classList.remove('hidden');
        }
      } finally {
        // A superseded request must not touch controls that the newer
        // request (or the reset that invalidated this one) now owns.
        if (!isStale()) {
          jsonLoadingIndicator.classList.add('hidden');
          runSelect.disabled = false;
        }
      }
    }

    // ---------------------------------------------------------------------
    // Screenshot viewer
    // ---------------------------------------------------------------------

    // Point the viewer at the stored screenshots for (exampleId, runId) and
    // show the first image, or the "no screenshots" notice if there are none.
    function loadScreenshots(exampleId, runId) {
      const stored = appState.loadedData.screenshots[exampleId]?.[runId];
      appState.currentImages = Array.isArray(stored) ? stored : [];

      if (appState.currentImages.length === 0) {
        imageContainer.classList.add('hidden');
        imageControls.classList.add('hidden');
        noImages.classList.remove('hidden');
        return;
      }

      noImages.classList.add('hidden');
      imageContainer.classList.remove('hidden');
      imageControls.classList.remove('hidden');

      imageSlider.min = 0;
      imageSlider.max = appState.currentImages.length - 1;
      imageSlider.value = 0;

      appState.currentImageIndex = 0;
      updateImageDisplay();
    }

    // Sync image, caption, counter, slider and button states with
    // appState.currentImageIndex.
    function updateImageDisplay() {
      const total = appState.currentImages.length;
      if (total === 0) return;

      const image = appState.currentImages[appState.currentImageIndex];
      currentImage.src = image.path;
      imageCaption.textContent = image.name || '';
      imageCounter.textContent = `${appState.currentImageIndex + 1} / ${total}`;
      imageSlider.value = appState.currentImageIndex;

      prevImage.disabled = appState.currentImageIndex === 0;
      nextImage.disabled = appState.currentImageIndex === total - 1;
    }

    // Move the current image by `delta` (-1 or +1) if that stays in range.
    function stepImage(delta) {
      const target = appState.currentImageIndex + delta;
      if (target < 0 || target > appState.currentImages.length - 1) return;
      appState.currentImageIndex = target;
      updateImageDisplay();
    }

    prevImage.addEventListener('click', () => stepImage(-1));
    nextImage.addEventListener('click', () => stepImage(1));

    imageSlider.addEventListener('input', () => {
      appState.currentImageIndex = parseInt(imageSlider.value, 10);
      updateImageDisplay();
    });

    // Arrow-key navigation while the screenshots tab is showing.
    document.addEventListener('keydown', (e) => {
      if (!appState.currentImages || appState.currentImages.length === 0) return;

      // Leave browser shortcuts such as Alt+Left (history back) alone.
      if (e.altKey || e.ctrlKey || e.metaKey) return;

      // Form controls handle arrow keys themselves. In particular the range
      // slider already emits 'input', so reacting here too would advance two
      // images per key press.
      const target = e.target;
      if (target instanceof HTMLElement &&
          (target.isContentEditable || ['INPUT', 'SELECT', 'TEXTAREA'].includes(target.tagName))) {
        return;
      }

      const screenshotsTab = document.getElementById('screenshots-tab');
      if (!screenshotsTab.classList.contains('active')) return;

      if (e.key === 'ArrowLeft') {
        stepImage(-1);
      } else if (e.key === 'ArrowRight') {
        stepImage(1);
      }
    });

    // ---------------------------------------------------------------------
    // Tabs
    // ---------------------------------------------------------------------

    // Activate one tab button and show the panel whose id is
    // `<data-tab>-tab`; deactivate all the others.
    tabButtons.forEach(tab => {
      tab.addEventListener('click', () => {
        tabButtons.forEach(other => {
          const isActive = other === tab;
          other.classList.toggle('active', isActive);
          other.setAttribute('aria-pressed', String(isActive));
        });

        const panelId = `${tab.getAttribute('data-tab')}-tab`;
        tabPanels.forEach(panel => {
          panel.classList.toggle('active', panel.id === panelId);
        });
      });
    });

    // ---------------------------------------------------------------------
    // Agent trace
    // ---------------------------------------------------------------------

    // Header label for one trace step.
    function stepLabel(step, index) {
      if (index === 0 && step.task) return 'Task';
      if (step.model_output_message) return 'Planning';
      if (step.tool_calls) return `Action ${index}`;
      if (step.error) return 'Error';
      return `Step ${index}`;
    }

    // Append "<strong>label</strong>" followed by the value as a text node.
    // Block fields put the value on its own lines followed by a blank line;
    // inline fields keep label and value on one line. The container uses
    // `white-space: pre-wrap`, so the newlines are rendered.
    function appendField(container, label, value, inline = false) {
      const labelEl = document.createElement('strong');
      labelEl.textContent = label;
      const text = toDisplayText(value);
      container.append(labelEl, inline ? ` ${text}\n` : `\n${text}\n\n`);
    }

    // Fill the step body from whichever fields the step carries.
    function fillStepContent(content, step, index) {
      // The first step's task is shown without a label; the header already
      // reads "Task".
      if (index === 0 && step.task) {
        content.append(`${toDisplayText(step.task)}\n\n`);
      }

      if (step.model_output_message && step.model_output_message.content) {
        appendField(content, 'Model Output:', step.model_output_message.content);
        if (step.plan) {
          appendField(content, 'Plan:', step.plan);
        }
      }

      if (Array.isArray(step.tool_calls)) {
        step.tool_calls.forEach(toolCall => {
          if (!toolCall || !toolCall.function) return;
          appendField(content, 'Tool Call:', toolCall.function.name, true);
          if (toolCall.function.arguments) {
            appendField(content, 'Arguments:', toolCall.function.arguments);
          }
        });
      }

      if (step.model_output) {
        appendField(content, 'Model Reasoning:', step.model_output);
      }
      if (step.observations) {
        appendField(content, 'Observations:', step.observations);
      }
      if (step.action_output) {
        appendField(content, 'Action Output:', step.action_output);
      }

      if (step.error) {
        appendField(content, 'Error Type:', step.error.type || 'Unknown', true);
        if (step.error.message) {
          appendField(content, 'Error Message:', step.error.message, true);
        }
      }

      if (!content.hasChildNodes()) {
        content.textContent = 'No content available for this step';
      }
    }

    // Render metadata.summary as a list of collapsible steps, all expanded
    // initially. Labels are real <strong> elements and values are text
    // nodes, so the labels render bold while metadata content is never
    // parsed as HTML.
    function renderAgentTrace(metadata) {
      agentSteps.textContent = '';

      const steps = metadata && metadata.summary;
      if (!Array.isArray(steps) || steps.length === 0) {
        const empty = document.createElement('p');
        empty.textContent = 'No agent trace data available';
        agentSteps.appendChild(empty);
        return;
      }

      steps.forEach((rawStep, index) => {
        const step = rawStep || {};

        const stepDiv = document.createElement('div');
        stepDiv.className = 'step';

        // A native button makes the toggle reachable by keyboard.
        const header = document.createElement('button');
        header.type = 'button';
        header.className = 'step-header';
        header.setAttribute('aria-expanded', 'true');

        const title = document.createElement('span');
        title.textContent = stepLabel(step, index);
        const indicator = document.createElement('span');
        indicator.setAttribute('aria-hidden', 'true');
        indicator.textContent = '▲';
        header.append(title, indicator);

        const content = document.createElement('div');
        content.className = 'step-content';
        fillStepContent(content, step, index);

        header.addEventListener('click', () => {
          const collapsed = content.classList.toggle('hidden');
          header.setAttribute('aria-expanded', String(!collapsed));
          indicator.textContent = collapsed ? '▼' : '▲';
        });

        stepDiv.append(header, content);
        agentSteps.appendChild(stepDiv);
      });
    }

    // ---------------------------------------------------------------------
    // Startup
    // ---------------------------------------------------------------------

    // Load evaluations on page load
    document.addEventListener('DOMContentLoaded', loadEvaluations);
  </script>
</body>
</html>