Spaces:

davanstrien
/

ocr-time-capsule

Running

davanstrien HF Staff Claude committed about 4 hours ago

Commit

1e32a60

1 Parent(s): dbda7b0

Add support for davanstrien/rolm-test dataset with model info display

- Add inference_info column detection and rolmocr_text as improved text column
- Add example dataset selector dropdown with quick access to both datasets
- Extract and display model metadata (model name, processing date, parameters)
- Move model info panel above metrics for better visibility
- Fix sidebar scrolling to prevent model info from being hidden
- Add debug logging for troubleshooting column detection

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <[email protected]>

Files changed (3) hide show

index.html +61 -2
js/app.js +64 -0
js/dataset-api.js +49 -3

index.html CHANGED Viewed

@@ -63,6 +63,27 @@
                         >
                             Load
                         </button>
                     </div>
                 </div>
@@ -202,8 +223,8 @@
         <!-- Content Area -->
         <div x-show="!loading && !error && currentSample" class="flex-1 flex h-full">
             <!-- Image Panel -->
-            <div class="w-1/3 bg-gray-100 dark:bg-gray-800 p-4 overflow-hidden border-r border-gray-200 dark:border-gray-700">
-                <div class="sticky top-0">
                     <div class="bg-white dark:bg-gray-700 rounded-lg shadow-sm overflow-hidden">
                         <img
                             :src="getImageSrc()"
@@ -221,6 +242,44 @@
                         </div>
                     </div>
                     <!-- Statistics Panel -->
                     <div class="mt-4 bg-white dark:bg-gray-700 rounded-lg shadow-sm p-4">
                         <h3 class="text-sm font-medium text-gray-700 dark:text-gray-300 mb-3">OCR Quality Metrics</h3>

                         >
                             Load
                         </button>
+                        <!-- Example Dataset Selector -->
+                        <div class="relative group">
+                            <button class="px-3 py-1.5 text-sm text-gray-600 hover:text-gray-900 dark:text-gray-400 dark:hover:text-gray-200 border border-gray-300 dark:border-gray-600 rounded-md flex items-center space-x-1">
+                                <svg class="w-4 h-4" fill="none" stroke="currentColor" viewBox="0 0 24 24">
+                                    <path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M9 5H7a2 2 0 00-2 2v12a2 2 0 002 2h10a2 2 0 002-2V7a2 2 0 00-2-2h-2M9 5a2 2 0 002 2h2a2 2 0 002-2M9 5a2 2 0 012-2h2a2 2 0 012 2"></path>
+                                </svg>
+                                <span>Examples</span>
+                            </button>
+                            <div class="absolute left-0 mt-1 w-72 bg-white dark:bg-gray-800 rounded-md shadow-lg border border-gray-200 dark:border-gray-700 hidden group-hover:block z-50">
+                                <template x-for="dataset in exampleDatasets" :key="dataset.id">
+                                    <button
+                                        @click="selectDataset(dataset.id)"
+                                        class="block w-full text-left px-4 py-3 hover:bg-gray-50 dark:hover:bg-gray-700 border-b border-gray-100 dark:border-gray-600 last:border-b-0"
+                                    >
+                                        <div class="font-medium text-sm text-gray-900 dark:text-gray-100" x-text="dataset.name"></div>
+                                        <div class="text-xs text-gray-500 dark:text-gray-400 mt-1" x-text="dataset.description"></div>
+                                    </button>
+                                </template>
+                            </div>
+                        </div>
                     </div>
                 </div>
         <!-- Content Area -->
         <div x-show="!loading && !error && currentSample" class="flex-1 flex h-full">
             <!-- Image Panel -->
+            <div class="w-1/3 bg-gray-100 dark:bg-gray-800 p-4 overflow-y-auto border-r border-gray-200 dark:border-gray-700">
+                <div>
                     <div class="bg-white dark:bg-gray-700 rounded-lg shadow-sm overflow-hidden">
                         <img
                             :src="getImageSrc()"
                         </div>
                     </div>
+                    <!-- Model Info Panel -->
+                    <div x-show="modelInfo" x-transition class="mt-4 bg-white dark:bg-gray-700 rounded-lg shadow-sm p-4">
+                        <h3 class="text-sm font-medium text-gray-700 dark:text-gray-300 mb-3 flex items-center">
+                            <svg class="w-4 h-4 mr-1" fill="none" stroke="currentColor" viewBox="0 0 24 24">
+                                <path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M9.75 17L9 20l-1 1h8l-1-1-.75-3M3 13h18M5 17h14a2 2 0 002-2V5a2 2 0 00-2-2H5a2 2 0 00-2 2v10a2 2 0 002 2z"></path>
+                            </svg>
+                            Model Information
+                        </h3>
+                        <div class="space-y-2 text-xs">
+                            <div class="flex justify-between items-center">
+                                <span class="text-gray-600 dark:text-gray-400">Model</span>
+                                <span class="font-medium text-gray-900 dark:text-gray-100" x-text="modelInfo?.modelName || '-'"></span>
+                            </div>
+                            <div x-show="modelInfo?.processingDate" class="flex justify-between items-center">
+                                <span class="text-gray-600 dark:text-gray-400">Processed</span>
+                                <span class="text-gray-900 dark:text-gray-100" x-text="modelInfo?.processingDate || '-'"></span>
+                            </div>
+                            <div x-show="modelInfo?.batchSize" class="flex justify-between items-center">
+                                <span class="text-gray-600 dark:text-gray-400">Batch Size</span>
+                                <span class="text-gray-900 dark:text-gray-100" x-text="modelInfo?.batchSize || '-'"></span>
+                            </div>
+                            <div x-show="modelInfo?.maxTokens" class="flex justify-between items-center">
+                                <span class="text-gray-600 dark:text-gray-400">Max Tokens</span>
+                                <span class="text-gray-900 dark:text-gray-100" x-text="modelInfo?.maxTokens?.toLocaleString() || '-'"></span>
+                            </div>
+                            <div x-show="modelInfo?.scriptUrl" class="mt-2 pt-2 border-t border-gray-200 dark:border-gray-600">
+                                <a :href="modelInfo?.scriptUrl"
+                                   target="_blank"
+                                   class="text-blue-600 dark:text-blue-400 hover:underline flex items-center">
+                                    <svg class="w-3 h-3 mr-1" fill="none" stroke="currentColor" viewBox="0 0 24 24">
+                                        <path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14"></path>
+                                    </svg>
+                                    View Script
+                                </a>
+                            </div>
+                        </div>
+                    </div>
                     <!-- Statistics Panel -->
                     <div class="mt-4 bg-white dark:bg-gray-700 rounded-lg shadow-sm p-4">
                         <h3 class="text-sm font-medium text-gray-700 dark:text-gray-300 mb-3">OCR Quality Metrics</h3>

js/app.js CHANGED Viewed

@@ -9,6 +9,12 @@ document.addEventListener('alpine:init', () => {
         datasetConfig: 'default',
         datasetSplit: 'train',
         // Navigation state
         currentIndex: 0,
         totalSamples: null,
@@ -53,6 +59,10 @@ document.addEventListener('alpine:init', () => {
         // Markdown cache
         markdownCache: new Map(),
         async init() {
             // Initialize API
             this.api = new DatasetAPI();
@@ -171,6 +181,14 @@ document.addEventListener('alpine:init', () => {
                 this.currentSample = data.row;
                 this.currentIndex = index;
                 // Check if improved text contains markdown
                 const improvedText = this.getImprovedText();
@@ -215,6 +233,52 @@ document.addEventListener('alpine:init', () => {
             }
         },
         getOriginalText() {
             if (!this.currentSample) return '';
             const columns = this.api.detectColumns(null, this.currentSample);

         datasetConfig: 'default',
         datasetSplit: 'train',
+        // Example datasets
+        exampleDatasets: [
+            { id: 'davanstrien/exams-ocr', name: 'Exams OCR', description: 'Historical exam papers with VLM corrections' },
+            { id: 'davanstrien/rolm-test', name: 'ROLM Test', description: 'Documents processed with RolmOCR model' }
+        ],
         // Navigation state
         currentIndex: 0,
         totalSamples: null,
         // Markdown cache
         markdownCache: new Map(),
+        // Model info
+        modelInfo: null,
+        columnInfo: null,
         async init() {
             // Initialize API
             this.api = new DatasetAPI();
                 this.currentSample = data.row;
                 this.currentIndex = index;
+                this.columnInfo = data.columns;
+                // Extract model info if available
+                this.extractModelInfo();
+                // Debug: Log column info
+                console.log('Column info:', this.columnInfo);
+                console.log('Current sample keys:', Object.keys(this.currentSample));
                 // Check if improved text contains markdown
                 const improvedText = this.getImprovedText();
             }
         },
+        async selectDataset(datasetId) {
+            this.datasetId = datasetId;
+            await this.loadDataset();
+        },
+        extractModelInfo() {
+            this.modelInfo = null;
+            if (!this.currentSample || !this.columnInfo || !this.columnInfo.inferenceInfo) {
+                console.log('No inference info column detected');
+                return;
+            }
+            const inferenceData = this.currentSample[this.columnInfo.inferenceInfo];
+            if (!inferenceData) {
+                console.log('No inference data in current sample');
+                return;
+            }
+            console.log('Raw inference data:', inferenceData);
+            const parsed = this.api.parseInferenceInfo(inferenceData);
+            console.log('Parsed inference data:', parsed);
+            if (parsed) {
+                const formattedInfo = this.formatModelInfo(parsed);
+                // Ensure it's a plain object, not a proxy
+                this.modelInfo = formattedInfo ? {...formattedInfo} : null;
+                console.log('Formatted model info:', this.modelInfo);
+            }
+        },
+        formatModelInfo(info) {
+            if (!info) return null;
+            return {
+                modelId: info.model_id || 'Unknown',
+                modelName: info.model_id ? info.model_id.split('/').pop() : 'Unknown',
+                processingDate: info.processing_date ? new Date(info.processing_date).toLocaleDateString() : null,
+                scriptVersion: info.script_version || null,
+                batchSize: info.batch_size || null,
+                maxTokens: info.max_tokens || null,
+                scriptUrl: info.script_url || null,
+                columnName: info.column_name || null
+            };
+        },
         getOriginalText() {
             if (!this.currentSample) return '';
             const columns = this.api.detectColumns(null, this.currentSample);

js/dataset-api.js CHANGED Viewed

@@ -176,6 +176,7 @@ class DatasetAPI {
         let imageColumn = null;
         let originalTextColumn = null;
         let improvedTextColumn = null;
         // Try to detect from features first
         for (const feature of features || []) {
@@ -192,9 +193,14 @@ class DatasetAPI {
                 originalTextColumn = name;
             }
-            if (!improvedTextColumn && ['markdown', 'new_ocr', 'corrected_text', 'improved', 'vlm_ocr', 'corrected'].includes(name)) {
                 improvedTextColumn = name;
             }
         }
         // Fallback: detect from sample row
@@ -217,15 +223,21 @@ class DatasetAPI {
             }
             if (!improvedTextColumn) {
-                const candidates = ['markdown', 'new_ocr', 'corrected_text', 'improved'];
                 improvedTextColumn = keys.find(k => candidates.includes(k)) || null;
             }
         }
         return {
             image: imageColumn,
             originalText: originalTextColumn,
-            improvedText: improvedTextColumn
         };
     }
@@ -267,6 +279,40 @@ class DatasetAPI {
     clearCache() {
         this.cache.clear();
     }
 }
 // Export for use in other scripts

         let imageColumn = null;
         let originalTextColumn = null;
         let improvedTextColumn = null;
+        let inferenceInfoColumn = null;
         // Try to detect from features first
         for (const feature of features || []) {
                 originalTextColumn = name;
             }
+            if (!improvedTextColumn && ['markdown', 'new_ocr', 'corrected_text', 'improved', 'vlm_ocr', 'corrected', 'rolmocr_text'].includes(name)) {
                 improvedTextColumn = name;
             }
+            // Detect inference info column
+            if (name === 'inference_info') {
+                inferenceInfoColumn = name;
+            }
         }
         // Fallback: detect from sample row
             }
             if (!improvedTextColumn) {
+                const candidates = ['markdown', 'new_ocr', 'corrected_text', 'improved', 'rolmocr_text'];
                 improvedTextColumn = keys.find(k => candidates.includes(k)) || null;
             }
+            // Check for inference info in sample row
+            if (!inferenceInfoColumn && keys.includes('inference_info')) {
+                inferenceInfoColumn = 'inference_info';
+            }
         }
         return {
             image: imageColumn,
             originalText: originalTextColumn,
+            improvedText: improvedTextColumn,
+            inferenceInfo: inferenceInfoColumn
         };
     }
     clearCache() {
         this.cache.clear();
     }
+    /**
+     * Parse inference info JSON safely
+     */
+    parseInferenceInfo(inferenceInfoData) {
+        if (!inferenceInfoData) return null;
+        try {
+            // Handle if it's already an object (some datasets might store it as object)
+            if (typeof inferenceInfoData === 'object' && !Array.isArray(inferenceInfoData)) {
+                return inferenceInfoData;
+            }
+            // Handle if it's a JSON string
+            if (typeof inferenceInfoData === 'string') {
+                const parsed = JSON.parse(inferenceInfoData);
+                // If it's an array, take the first item
+                if (Array.isArray(parsed) && parsed.length > 0) {
+                    return parsed[0];
+                }
+                return parsed;
+            }
+            // Handle if it's already an array
+            if (Array.isArray(inferenceInfoData) && inferenceInfoData.length > 0) {
+                return inferenceInfoData[0];
+            }
+            return null;
+        } catch (error) {
+            console.warn('Failed to parse inference info:', error);
+            return null;
+        }
+    }
 }
 // Export for use in other scripts