davanstrien's picture
davanstrien HF Staff
Add support for davanstrien/rolm-test dataset with model info display
1e32a60
/**
* Main Alpine.js application for OCR Text Explorer
*/
document.addEventListener('alpine:init', () => {
Alpine.data('ocrExplorer', () => ({
// Dataset state
// datasetId is the Hub dataset loaded by loadDataset(); datasetConfig and
// datasetSplit are overwritten there with the defaults reported by the API.
datasetId: 'davanstrien/exams-ocr',
datasetConfig: 'default',
datasetSplit: 'train',
// Example datasets (id / display name / description)
exampleDatasets: [
{ id: 'davanstrien/exams-ocr', name: 'Exams OCR', description: 'Historical exam papers with VLM corrections' },
{ id: 'davanstrien/rolm-test', name: 'ROLM Test', description: 'Documents processed with RolmOCR model' }
],
// Navigation state
// currentIndex is the 0-based row index (exported as page currentIndex + 1).
currentIndex: 0,
// Row count of the active split; set by loadDataset().
totalSamples: null,
// Row object of the sample currently shown; set by loadSample().
currentSample: null,
// Raw 1-based page number input consumed and cleared by jumpToSample().
jumpToPage: '',
// UI state
loading: false,
error: null,
// Keyboard shortcuts switch this between 'comparison', 'diff' and 'improved'.
activeTab: 'comparison',
// updateDiff() handles 'char', 'word', 'line' and 'markdown'.
diffMode: 'char',
// Persisted in localStorage under the key 'darkMode' (see init()).
darkMode: false,
showAbout: false,
showFlowView: false,
showDock: false,
// When false, renderMarkdownText() returns its input unchanged.
renderMarkdown: false,
// Result of detectMarkdown() on the improved text of the current sample.
hasMarkdown: false,
// Flow view state
// flowItems holds { index, imageSrc, row } entries built by loadFlowItems().
flowItems: [],
flowStartIndex: 0,
flowVisibleCount: 7,
// NOTE(review): flowOffset is not referenced anywhere else in the visible source.
flowOffset: 0,
// Dock state
// dockItems holds { index, imageSrc, row } entries built by loadDockItems().
dockItems: [],
// Handle of the pending setTimeout created by hideDockPreview(), or null.
dockHideTimeout: null,
dockStartIndex: 0,
dockVisibleCount: 10,
// Computed diff HTML
diffHtml: '',
// Statistics
similarity: 0,
// getCharacterDiffStats() replaces this with an object that also has `unchanged`.
charStats: { total: 0, added: 0, removed: 0 },
wordStats: { original: 0, improved: 0 },
// API instance (a DatasetAPI, created in init())
api: null,
// Markdown cache: rendered HTML produced by renderMarkdownText(); cleared by loadDataset().
markdownCache: new Map(),
// Model info
// modelInfo is the formatted object built by extractModelInfo(), or null.
modelInfo: null,
// columnInfo is the `columns` value returned by api.getRow() for the current sample.
columnInfo: null,
async init() {
// Initialize API
this.api = new DatasetAPI();
// Apply dark mode from localStorage
this.darkMode = localStorage.getItem('darkMode') === 'true';
this.$watch('darkMode', value => {
localStorage.setItem('darkMode', value);
document.documentElement.classList.toggle('dark', value);
});
document.documentElement.classList.toggle('dark', this.darkMode);
// Setup keyboard navigation
this.setupKeyboardNavigation();
// Load initial dataset
await this.loadDataset();
},
setupKeyboardNavigation() {
document.addEventListener('keydown', (e) => {
// Ignore if user is typing in input
if (e.target.tagName === 'INPUT') return;
switch(e.key) {
case 'ArrowLeft':
e.preventDefault();
if (e.shiftKey && this.showDock) {
this.scrollDockLeft();
} else {
this.previousSample();
}
break;
case 'ArrowRight':
e.preventDefault();
if (e.shiftKey && this.showDock) {
this.scrollDockRight();
} else {
this.nextSample();
}
break;
case 'k':
case 'K':
e.preventDefault();
this.previousSample();
break;
case 'j':
case 'J':
e.preventDefault();
this.nextSample();
break;
case '1':
this.activeTab = 'comparison';
break;
case '2':
this.activeTab = 'diff';
break;
case '3':
this.activeTab = 'improved';
break;
case 'v':
case 'V':
// Toggle dock with V key
if (this.showDock) {
this.hideDockPreview();
} else {
this.showDockPreview();
}
break;
}
});
},
async loadDataset() {
this.loading = true;
this.error = null;
// Clear markdown cache when loading new dataset
this.markdownCache.clear();
try {
// Validate dataset
await this.api.validateDataset(this.datasetId);
// Get dataset info
const info = await this.api.getDatasetInfo(this.datasetId);
this.datasetConfig = info.defaultConfig;
this.datasetSplit = info.defaultSplit;
// Get total rows
this.totalSamples = await this.api.getTotalRows(
this.datasetId,
this.datasetConfig,
this.datasetSplit
);
// Load first sample
this.currentIndex = 0;
await this.loadSample(0);
} catch (error) {
this.error = error.message;
} finally {
this.loading = false;
}
},
async loadSample(index) {
try {
const data = await this.api.getRow(
this.datasetId,
this.datasetConfig,
this.datasetSplit,
index
);
this.currentSample = data.row;
this.currentIndex = index;
this.columnInfo = data.columns;
// Extract model info if available
this.extractModelInfo();
// Debug: Log column info
console.log('Column info:', this.columnInfo);
console.log('Current sample keys:', Object.keys(this.currentSample));
// Check if improved text contains markdown
const improvedText = this.getImprovedText();
this.hasMarkdown = this.detectMarkdown(improvedText);
// Update diff when sample changes
this.updateDiff();
// Update URL without triggering navigation
const url = new URL(window.location);
url.searchParams.set('dataset', this.datasetId);
url.searchParams.set('index', index);
window.history.replaceState({}, '', url);
} catch (error) {
this.error = `Failed to load sample: ${error.message}`;
}
},
async nextSample() {
if (this.currentIndex < this.totalSamples - 1) {
await this.loadSample(this.currentIndex + 1);
}
},
async previousSample() {
if (this.currentIndex > 0) {
await this.loadSample(this.currentIndex - 1);
}
},
async jumpToSample() {
const pageNum = parseInt(this.jumpToPage);
if (!isNaN(pageNum) && pageNum >= 1 && pageNum <= this.totalSamples) {
// Convert 1-based page number to 0-based index
await this.loadSample(pageNum - 1);
// Clear the input after jumping
this.jumpToPage = '';
} else {
// Show error or just reset
this.jumpToPage = '';
}
},
async selectDataset(datasetId) {
this.datasetId = datasetId;
await this.loadDataset();
},
extractModelInfo() {
this.modelInfo = null;
if (!this.currentSample || !this.columnInfo || !this.columnInfo.inferenceInfo) {
console.log('No inference info column detected');
return;
}
const inferenceData = this.currentSample[this.columnInfo.inferenceInfo];
if (!inferenceData) {
console.log('No inference data in current sample');
return;
}
console.log('Raw inference data:', inferenceData);
const parsed = this.api.parseInferenceInfo(inferenceData);
console.log('Parsed inference data:', parsed);
if (parsed) {
const formattedInfo = this.formatModelInfo(parsed);
// Ensure it's a plain object, not a proxy
this.modelInfo = formattedInfo ? {...formattedInfo} : null;
console.log('Formatted model info:', this.modelInfo);
}
},
formatModelInfo(info) {
if (!info) return null;
return {
modelId: info.model_id || 'Unknown',
modelName: info.model_id ? info.model_id.split('/').pop() : 'Unknown',
processingDate: info.processing_date ? new Date(info.processing_date).toLocaleDateString() : null,
scriptVersion: info.script_version || null,
batchSize: info.batch_size || null,
maxTokens: info.max_tokens || null,
scriptUrl: info.script_url || null,
columnName: info.column_name || null
};
},
getOriginalText() {
if (!this.currentSample) return '';
const columns = this.api.detectColumns(null, this.currentSample);
return this.currentSample[columns.originalText] || 'No original text found';
},
getImprovedText() {
if (!this.currentSample) return '';
const columns = this.api.detectColumns(null, this.currentSample);
return this.currentSample[columns.improvedText] || 'No improved text found';
},
detectMarkdown(text) {
// Check for common markdown patterns
const markdownPatterns = [
/^#{1,6}\s/m, // Headers
/\*\*[^*]+\*\*/, // Bold
/\*[^*]+\*/, // Italic
/\[[^\]]+\]\([^)]+\)/, // Links
/^[-*+]\s/m, // Lists
/^\d+\.\s/m, // Numbered lists
/^>/m, // Blockquotes
/```[\s\S]*?```/, // Code blocks
/`[^`]+`/, // Inline code
/\|.*\|.*\|/m, // Tables (basic detection)
/<table>/i, // HTML tables
/<thead>/i // HTML table headers
];
return markdownPatterns.some(pattern => pattern.test(text));
},
escapeHtml(text) {
const div = document.createElement('div');
div.textContent = text;
return div.innerHTML;
},
renderMarkdownText(text) {
if (!text || !this.renderMarkdown) return text;
// Check cache first
const cacheKey = `${this.currentIndex}_${text.substring(0, 100)}`;
if (this.markdownCache.has(cacheKey)) {
return this.markdownCache.get(cacheKey);
}
try {
// Configure marked options for security
const renderer = new marked.Renderer();
// Override link rendering to open in new tab and sanitize
const self = this;
renderer.link = function(href, title, text) {
// Basic URL sanitization
const safeHref = href.replace(/javascript:/gi, '').replace(/data:/gi, '');
const safeTitle = (title || '').replace(/"/g, '&quot;');
const safeText = self.escapeHtml(text);
return `<a href="${safeHref}" title="${safeTitle}" target="_blank" rel="noopener noreferrer" class="text-blue-600 dark:text-blue-400 underline">${safeText}</a>`;
};
// Override image rendering for safety
renderer.image = function(href, title, text) {
const safeHref = href.replace(/javascript:/gi, '').replace(/data:/gi, '');
const safeTitle = (title || '').replace(/"/g, '&quot;');
const safeAlt = self.escapeHtml(text);
return `<img src="${safeHref}" alt="${safeAlt}" title="${safeTitle}" class="max-w-full h-auto rounded">`;
};
// Override HTML rendering to prevent XSS but allow safe table elements
renderer.html = function(html) {
// Allow specific safe HTML tags for tables
const allowedTags = ['table', 'thead', 'tbody', 'tr', 'th', 'td'];
const tagPattern = new RegExp(`</?(?:${allowedTags.join('|')})(?:\\s[^>]*)?>`, 'gi');
// Check if the HTML contains only allowed tags
const strippedHtml = html.replace(tagPattern, '');
const hasDisallowedTags = /<[^>]+>/.test(strippedHtml);
if (!hasDisallowedTags) {
// Return the HTML if it only contains allowed table tags
return html;
}
// Strip all HTML by default
return '';
};
marked.setOptions({
renderer: renderer,
breaks: true, // Convert \n to <br>
gfm: true, // GitHub Flavored Markdown
pedantic: false,
smartLists: true,
smartypants: true,
headerIds: false, // Disable header IDs for security
mangle: false, // Don't mangle email addresses
sanitize: false, // We handle sanitization ourselves
tables: true // Enable table parsing
});
// Render markdown
let html = marked.parse(text);
// Add Tailwind classes to common elements
html = html.replace(/<h1>/g, '<h1 class="text-2xl font-bold mb-4 text-gray-900 dark:text-gray-100">');
html = html.replace(/<h2>/g, '<h2 class="text-xl font-bold mb-3 text-gray-900 dark:text-gray-100">');
html = html.replace(/<h3>/g, '<h3 class="text-lg font-bold mb-2 text-gray-900 dark:text-gray-100">');
html = html.replace(/<h4>/g, '<h4 class="text-base font-bold mb-2 text-gray-900 dark:text-gray-100">');
html = html.replace(/<p>/g, '<p class="mb-4 text-gray-900 dark:text-gray-100">');
html = html.replace(/<ul>/g, '<ul class="list-disc list-inside mb-4 text-gray-900 dark:text-gray-100">');
html = html.replace(/<ol>/g, '<ol class="list-decimal list-inside mb-4 text-gray-900 dark:text-gray-100">');
html = html.replace(/<li>/g, '<li class="mb-1">');
html = html.replace(/<blockquote>/g, '<blockquote class="border-l-4 border-gray-300 dark:border-gray-600 pl-4 my-4 italic text-gray-700 dark:text-gray-300">');
html = html.replace(/<code>/g, '<code class="bg-gray-100 dark:bg-gray-800 px-1 py-0.5 rounded text-sm font-mono">');
html = html.replace(/<pre>/g, '<pre class="bg-gray-100 dark:bg-gray-800 p-4 rounded-lg overflow-x-auto mb-4">');
html = html.replace(/<table>/g, '<table class="min-w-full divide-y divide-gray-300 dark:divide-gray-600 mb-4 border border-gray-300 dark:border-gray-600">');
html = html.replace(/<thead>/g, '<thead class="bg-gray-50 dark:bg-gray-800">');
html = html.replace(/<tbody>/g, '<tbody class="bg-white dark:bg-gray-900 divide-y divide-gray-200 dark:divide-gray-700">');
html = html.replace(/<tr>/g, '<tr class="hover:bg-gray-50 dark:hover:bg-gray-800">');
html = html.replace(/<th>/g, '<th class="px-3 py-3.5 text-left text-sm font-semibold text-gray-900 dark:text-gray-100 border border-gray-300 dark:border-gray-600">');
html = html.replace(/<td>/g, '<td class="px-3 py-4 text-sm text-gray-900 dark:text-gray-100 border border-gray-300 dark:border-gray-600">');
const result = `<div class="prose prose-sm dark:prose-invert max-w-none">${html}</div>`;
// Cache the result (limit cache size to prevent memory issues)
if (this.markdownCache.size > 50) {
// Remove oldest entries
const firstKey = this.markdownCache.keys().next().value;
this.markdownCache.delete(firstKey);
}
this.markdownCache.set(cacheKey, result);
return result;
} catch (error) {
console.error('Markdown rendering error:', error);
return text;
}
},
getImprovedTextRendered() {
const text = this.getImprovedText();
return this.renderMarkdownText(text);
},
getImageData() {
if (!this.currentSample) return null;
const columns = this.api.detectColumns(null, this.currentSample);
return columns.image ? this.currentSample[columns.image] : null;
},
getImageSrc() {
const imageData = this.getImageData();
return imageData?.src || '';
},
getImageDimensions() {
const imageData = this.getImageData();
if (imageData?.width && imageData?.height) {
return `${imageData.width}×${imageData.height}`;
}
return null;
},
updateDiff() {
const original = this.getOriginalText();
const improved = this.getImprovedText();
// Calculate statistics
this.calculateStatistics(original, improved);
// Use diff utility based on mode
switch(this.diffMode) {
case 'char':
this.diffHtml = createCharacterDiff(original, improved);
break;
case 'word':
this.diffHtml = createWordDiff(original, improved);
break;
case 'line':
this.diffHtml = createLineDiff(original, improved);
break;
case 'markdown':
// Pass the render function bound to this context
this.diffHtml = createMarkdownDiff(original, improved, (text) => this.renderMarkdownText(text));
break;
}
},
calculateStatistics(original, improved) {
// Calculate similarity
this.similarity = calculateSimilarity(original, improved);
// Character statistics
const charDiff = this.getCharacterDiffStats(original, improved);
this.charStats = charDiff;
// Word statistics
const originalWords = original.split(/\s+/).filter(w => w.length > 0);
const improvedWords = improved.split(/\s+/).filter(w => w.length > 0);
this.wordStats = {
original: originalWords.length,
improved: improvedWords.length
};
},
getCharacterDiffStats(original, improved) {
const dp = computeLCS(original, improved);
const diff = buildDiff(original, improved, dp);
let added = 0;
let removed = 0;
let unchanged = 0;
for (const part of diff) {
if (part.type === 'insert') {
added += part.value.length;
} else if (part.type === 'delete') {
removed += part.value.length;
} else {
unchanged += part.value.length;
}
}
return {
total: original.length,
added: added,
removed: removed,
unchanged: unchanged
};
},
async handleImageError(event) {
// Try to refresh the image URL
console.log('Image failed to load, refreshing URL...');
try {
const data = await this.api.refreshImageUrl(
this.datasetId,
this.datasetConfig,
this.datasetSplit,
this.currentIndex
);
// Update the image source
if (data.row && data.row[this.api.detectColumns(null, data.row).image]?.src) {
event.target.src = data.row[this.api.detectColumns(null, data.row).image].src;
}
} catch (error) {
console.error('Failed to refresh image URL:', error);
// Set a placeholder image
event.target.src = '';
}
},
exportComparison() {
const original = this.getOriginalText();
const improved = this.getImprovedText();
const metadata = {
dataset: this.datasetId,
page: this.currentIndex + 1,
totalPages: this.totalSamples,
exportDate: new Date().toISOString(),
similarity: `${this.similarity}%`,
statistics: {
characters: this.charStats,
words: this.wordStats
}
};
// Create export content
let content = `OCR Text Comparison Export\n`;
content += `==========================\n\n`;
content += `Dataset: ${metadata.dataset}\n`;
content += `Page: ${metadata.page} of ${metadata.totalPages}\n`;
content += `Export Date: ${new Date().toLocaleString()}\n`;
content += `Similarity: ${metadata.similarity}\n`;
content += `Characters: ${metadata.statistics.characters.total} total, `;
content += `${metadata.statistics.characters.added} added, `;
content += `${metadata.statistics.characters.removed} removed\n`;
content += `Words: ${metadata.statistics.words.original}${metadata.statistics.words.improved}\n`;
content += `\n${'='.repeat(50)}\n\n`;
content += `ORIGINAL OCR:\n`;
content += `${'='.repeat(50)}\n`;
content += original;
content += `\n\n${'='.repeat(50)}\n\n`;
content += `IMPROVED OCR:\n`;
content += `${'='.repeat(50)}\n`;
content += improved;
// Download file
const blob = new Blob([content], { type: 'text/plain' });
const url = URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = `ocr-comparison-${this.datasetId.replace('/', '-')}-page-${this.currentIndex + 1}.txt`;
document.body.appendChild(a);
a.click();
document.body.removeChild(a);
URL.revokeObjectURL(url);
},
// Flow view methods
async toggleFlowView() {
this.showFlowView = !this.showFlowView;
if (this.showFlowView) {
// Reset to center around current page when opening
this.flowStartIndex = Math.max(0, this.currentIndex - Math.floor(this.flowVisibleCount / 2));
await this.loadFlowItems();
}
},
async loadFlowItems() {
// Load thumbnails from flowStartIndex
const startIdx = this.flowStartIndex;
this.flowItems = [];
// Load visible items
for (let i = 0; i < this.flowVisibleCount && (startIdx + i) < this.totalSamples; i++) {
const idx = startIdx + i;
try {
const data = await this.api.getRow(
this.datasetId,
this.datasetConfig,
this.datasetSplit,
idx
);
const columns = this.api.detectColumns(null, data.row);
const imageData = columns.image ? data.row[columns.image] : null;
this.flowItems.push({
index: idx,
imageSrc: imageData?.src || '',
row: data.row
});
} catch (error) {
console.error(`Failed to load flow item ${idx}:`, error);
}
}
},
scrollFlowLeft() {
if (this.flowStartIndex > 0) {
this.flowStartIndex = Math.max(0, this.flowStartIndex - this.flowVisibleCount);
this.loadFlowItems();
}
},
scrollFlowRight() {
if (this.flowStartIndex < this.totalSamples - this.flowVisibleCount) {
this.flowStartIndex = Math.min(
this.totalSamples - this.flowVisibleCount,
this.flowStartIndex + this.flowVisibleCount
);
this.loadFlowItems();
}
},
async jumpToFlowPage(index) {
this.showFlowView = false;
await this.loadSample(index);
},
async handleFlowImageError(event, index) {
// Try to refresh the image URL for flow item
try {
const data = await this.api.refreshImageUrl(
this.datasetId,
this.datasetConfig,
this.datasetSplit,
index
);
if (data.row) {
const columns = this.api.detectColumns(null, data.row);
const imageData = columns.image ? data.row[columns.image] : null;
if (imageData?.src) {
event.target.src = imageData.src;
// Update the flow item
const flowItem = this.flowItems.find(item => item.index === index);
if (flowItem) {
flowItem.imageSrc = imageData.src;
}
}
}
} catch (error) {
console.error('Failed to refresh flow image URL:', error);
}
},
// Dock methods
async showDockPreview() {
// Clear any hide timeout
if (this.dockHideTimeout) {
clearTimeout(this.dockHideTimeout);
this.dockHideTimeout = null;
}
this.showDock = true;
// Center dock around current page
this.dockStartIndex = Math.max(0,
Math.min(
this.currentIndex - Math.floor(this.dockVisibleCount / 2),
this.totalSamples - this.dockVisibleCount
)
);
// Always reload dock items to show current position
await this.loadDockItems();
},
hideDockPreview() {
// Add a small delay to prevent flickering
this.dockHideTimeout = setTimeout(() => {
this.showDock = false;
}, 300);
},
async loadDockItems() {
// Load thumbnails based on dock start index
const endIdx = Math.min(this.totalSamples, this.dockStartIndex + this.dockVisibleCount);
this.dockItems = [];
for (let i = this.dockStartIndex; i < endIdx; i++) {
try {
const data = await this.api.getRow(
this.datasetId,
this.datasetConfig,
this.datasetSplit,
i
);
const columns = this.api.detectColumns(null, data.row);
const imageData = columns.image ? data.row[columns.image] : null;
this.dockItems.push({
index: i,
imageSrc: imageData?.src || '',
row: data.row
});
} catch (error) {
console.error(`Failed to load dock item ${i}:`, error);
}
}
},
async scrollDockLeft() {
if (this.dockStartIndex > 0) {
this.dockStartIndex = Math.max(0, this.dockStartIndex - Math.floor(this.dockVisibleCount / 2));
await this.loadDockItems();
}
},
async scrollDockRight() {
if (this.dockStartIndex < this.totalSamples - this.dockVisibleCount) {
this.dockStartIndex = Math.min(
this.totalSamples - this.dockVisibleCount,
this.dockStartIndex + Math.floor(this.dockVisibleCount / 2)
);
await this.loadDockItems();
}
},
async jumpToDockPage(index) {
this.showDock = false;
await this.loadSample(index);
},
// Watch for diff mode changes
initWatchers() {
this.$watch('diffMode', () => this.updateDiff());
this.$watch('currentSample', () => this.updateDiff());
}
}));
});
// Initialize watchers after Alpine loads
// NOTE(review): this looks the component up with Alpine.store('ocrExplorer'),
// but this file registers 'ocrExplorer' through Alpine.data only. Unless a
// store with that name is defined elsewhere, the lookup yields nothing and
// this hook is a no-op - confirm.
document.addEventListener('alpine:initialized', () => {
  const explorerStore = Alpine.store('ocrExplorer');
  if (explorerStore == null || explorerStore.initWatchers == null) return;
  explorerStore.initWatchers();
});