Spaces:
Running
Running
Commit
·
1e32a60
1
Parent(s):
dbda7b0
Add support for davanstrien/rolm-test dataset with model info display
Browse files
- Add inference_info column detection and rolmocr_text as improved text column
- Add example dataset selector dropdown with quick access to both datasets
- Extract and display model metadata (model name, processing date, parameters)
- Move model info panel above metrics for better visibility
- Fix sidebar scrolling to prevent model info from being hidden
- Add debug logging for troubleshooting column detection
🤖 Generated with [Claude Code](https://claude.ai/code)
Co-Authored-By: Claude <noreply@anthropic.com>
- index.html +61 -2
- js/app.js +64 -0
- js/dataset-api.js +49 -3
index.html
CHANGED
@@ -63,6 +63,27 @@
|
|
63 |
>
|
64 |
Load
|
65 |
</button>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
</div>
|
67 |
</div>
|
68 |
|
@@ -202,8 +223,8 @@
|
|
202 |
<!-- Content Area -->
|
203 |
<div x-show="!loading && !error && currentSample" class="flex-1 flex h-full">
|
204 |
<!-- Image Panel -->
|
205 |
-
<div class="w-1/3 bg-gray-100 dark:bg-gray-800 p-4 overflow-
|
206 |
-
<div
|
207 |
<div class="bg-white dark:bg-gray-700 rounded-lg shadow-sm overflow-hidden">
|
208 |
<img
|
209 |
:src="getImageSrc()"
|
@@ -221,6 +242,44 @@
|
|
221 |
</div>
|
222 |
</div>
|
223 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
224 |
<!-- Statistics Panel -->
|
225 |
<div class="mt-4 bg-white dark:bg-gray-700 rounded-lg shadow-sm p-4">
|
226 |
<h3 class="text-sm font-medium text-gray-700 dark:text-gray-300 mb-3">OCR Quality Metrics</h3>
|
|
|
63 |
>
|
64 |
Load
|
65 |
</button>
|
66 |
+
|
67 |
+
<!-- Example Dataset Selector -->
|
68 |
+
<div class="relative group">
|
69 |
+
<button class="px-3 py-1.5 text-sm text-gray-600 hover:text-gray-900 dark:text-gray-400 dark:hover:text-gray-200 border border-gray-300 dark:border-gray-600 rounded-md flex items-center space-x-1">
|
70 |
+
<svg class="w-4 h-4" fill="none" stroke="currentColor" viewBox="0 0 24 24">
|
71 |
+
<path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M9 5H7a2 2 0 00-2 2v12a2 2 0 002 2h10a2 2 0 002-2V7a2 2 0 00-2-2h-2M9 5a2 2 0 002 2h2a2 2 0 002-2M9 5a2 2 0 012-2h2a2 2 0 012 2"></path>
|
72 |
+
</svg>
|
73 |
+
<span>Examples</span>
|
74 |
+
</button>
|
75 |
+
<div class="absolute left-0 mt-1 w-72 bg-white dark:bg-gray-800 rounded-md shadow-lg border border-gray-200 dark:border-gray-700 hidden group-hover:block z-50">
|
76 |
+
<template x-for="dataset in exampleDatasets" :key="dataset.id">
|
77 |
+
<button
|
78 |
+
@click="selectDataset(dataset.id)"
|
79 |
+
class="block w-full text-left px-4 py-3 hover:bg-gray-50 dark:hover:bg-gray-700 border-b border-gray-100 dark:border-gray-600 last:border-b-0"
|
80 |
+
>
|
81 |
+
<div class="font-medium text-sm text-gray-900 dark:text-gray-100" x-text="dataset.name"></div>
|
82 |
+
<div class="text-xs text-gray-500 dark:text-gray-400 mt-1" x-text="dataset.description"></div>
|
83 |
+
</button>
|
84 |
+
</template>
|
85 |
+
</div>
|
86 |
+
</div>
|
87 |
</div>
|
88 |
</div>
|
89 |
|
|
|
223 |
<!-- Content Area -->
|
224 |
<div x-show="!loading && !error && currentSample" class="flex-1 flex h-full">
|
225 |
<!-- Image Panel -->
|
226 |
+
<div class="w-1/3 bg-gray-100 dark:bg-gray-800 p-4 overflow-y-auto border-r border-gray-200 dark:border-gray-700">
|
227 |
+
<div>
|
228 |
<div class="bg-white dark:bg-gray-700 rounded-lg shadow-sm overflow-hidden">
|
229 |
<img
|
230 |
:src="getImageSrc()"
|
|
|
242 |
</div>
|
243 |
</div>
|
244 |
|
245 |
+
<!-- Model Info Panel -->
|
246 |
+
<div x-show="modelInfo" x-transition class="mt-4 bg-white dark:bg-gray-700 rounded-lg shadow-sm p-4">
|
247 |
+
<h3 class="text-sm font-medium text-gray-700 dark:text-gray-300 mb-3 flex items-center">
|
248 |
+
<svg class="w-4 h-4 mr-1" fill="none" stroke="currentColor" viewBox="0 0 24 24">
|
249 |
+
<path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M9.75 17L9 20l-1 1h8l-1-1-.75-3M3 13h18M5 17h14a2 2 0 002-2V5a2 2 0 00-2-2H5a2 2 0 00-2 2v10a2 2 0 002 2z"></path>
|
250 |
+
</svg>
|
251 |
+
Model Information
|
252 |
+
</h3>
|
253 |
+
<div class="space-y-2 text-xs">
|
254 |
+
<div class="flex justify-between items-center">
|
255 |
+
<span class="text-gray-600 dark:text-gray-400">Model</span>
|
256 |
+
<span class="font-medium text-gray-900 dark:text-gray-100" x-text="modelInfo?.modelName || '-'"></span>
|
257 |
+
</div>
|
258 |
+
<div x-show="modelInfo?.processingDate" class="flex justify-between items-center">
|
259 |
+
<span class="text-gray-600 dark:text-gray-400">Processed</span>
|
260 |
+
<span class="text-gray-900 dark:text-gray-100" x-text="modelInfo?.processingDate || '-'"></span>
|
261 |
+
</div>
|
262 |
+
<div x-show="modelInfo?.batchSize" class="flex justify-between items-center">
|
263 |
+
<span class="text-gray-600 dark:text-gray-400">Batch Size</span>
|
264 |
+
<span class="text-gray-900 dark:text-gray-100" x-text="modelInfo?.batchSize || '-'"></span>
|
265 |
+
</div>
|
266 |
+
<div x-show="modelInfo?.maxTokens" class="flex justify-between items-center">
|
267 |
+
<span class="text-gray-600 dark:text-gray-400">Max Tokens</span>
|
268 |
+
<span class="text-gray-900 dark:text-gray-100" x-text="modelInfo?.maxTokens?.toLocaleString() || '-'"></span>
|
269 |
+
</div>
|
270 |
+
<div x-show="modelInfo?.scriptUrl" class="mt-2 pt-2 border-t border-gray-200 dark:border-gray-600">
|
271 |
+
<!-- External link to the script that produced this sample. The URL comes from
     dataset content and opens in a new tab, so rel="noopener noreferrer" keeps
     the opened page from reaching back through window.opener and from
     receiving a Referer header. -->
<a :href="modelInfo?.scriptUrl"
   target="_blank"
   rel="noopener noreferrer"
   class="text-blue-600 dark:text-blue-400 hover:underline flex items-center">
    <svg class="w-3 h-3 mr-1" fill="none" stroke="currentColor" viewBox="0 0 24 24">
        <path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14"></path>
    </svg>
    View Script
</a>
|
279 |
+
</div>
|
280 |
+
</div>
|
281 |
+
</div>
|
282 |
+
|
283 |
<!-- Statistics Panel -->
|
284 |
<div class="mt-4 bg-white dark:bg-gray-700 rounded-lg shadow-sm p-4">
|
285 |
<h3 class="text-sm font-medium text-gray-700 dark:text-gray-300 mb-3">OCR Quality Metrics</h3>
|
js/app.js
CHANGED
@@ -9,6 +9,12 @@ document.addEventListener('alpine:init', () => {
|
|
9 |
datasetConfig: 'default',
|
10 |
datasetSplit: 'train',
|
11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
// Navigation state
|
13 |
currentIndex: 0,
|
14 |
totalSamples: null,
|
@@ -53,6 +59,10 @@ document.addEventListener('alpine:init', () => {
|
|
53 |
// Markdown cache
|
54 |
markdownCache: new Map(),
|
55 |
|
|
|
|
|
|
|
|
|
56 |
async init() {
|
57 |
// Initialize API
|
58 |
this.api = new DatasetAPI();
|
@@ -171,6 +181,14 @@ document.addEventListener('alpine:init', () => {
|
|
171 |
|
172 |
this.currentSample = data.row;
|
173 |
this.currentIndex = index;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
174 |
|
175 |
// Check if improved text contains markdown
|
176 |
const improvedText = this.getImprovedText();
|
@@ -215,6 +233,52 @@ document.addEventListener('alpine:init', () => {
|
|
215 |
}
|
216 |
},
|
217 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
218 |
getOriginalText() {
|
219 |
if (!this.currentSample) return '';
|
220 |
const columns = this.api.detectColumns(null, this.currentSample);
|
|
|
9 |
datasetConfig: 'default',
|
10 |
datasetSplit: 'train',
|
11 |
|
12 |
+
// Example datasets
|
13 |
+
exampleDatasets: [
|
14 |
+
{ id: 'davanstrien/exams-ocr', name: 'Exams OCR', description: 'Historical exam papers with VLM corrections' },
|
15 |
+
{ id: 'davanstrien/rolm-test', name: 'ROLM Test', description: 'Documents processed with RolmOCR model' }
|
16 |
+
],
|
17 |
+
|
18 |
// Navigation state
|
19 |
currentIndex: 0,
|
20 |
totalSamples: null,
|
|
|
59 |
// Markdown cache
|
60 |
markdownCache: new Map(),
|
61 |
|
62 |
+
// Model info
|
63 |
+
modelInfo: null,
|
64 |
+
columnInfo: null,
|
65 |
+
|
66 |
async init() {
|
67 |
// Initialize API
|
68 |
this.api = new DatasetAPI();
|
|
|
181 |
|
182 |
this.currentSample = data.row;
|
183 |
this.currentIndex = index;
|
184 |
+
this.columnInfo = data.columns;
|
185 |
+
|
186 |
+
// Extract model info if available
|
187 |
+
this.extractModelInfo();
|
188 |
+
|
189 |
+
// Debug: Log column info
|
190 |
+
console.log('Column info:', this.columnInfo);
|
191 |
+
console.log('Current sample keys:', Object.keys(this.currentSample));
|
192 |
|
193 |
// Check if improved text contains markdown
|
194 |
const improvedText = this.getImprovedText();
|
|
|
233 |
}
|
234 |
},
|
235 |
|
236 |
+
async selectDataset(datasetId) {
|
237 |
+
this.datasetId = datasetId;
|
238 |
+
await this.loadDataset();
|
239 |
+
},
|
240 |
+
|
241 |
+
extractModelInfo() {
|
242 |
+
this.modelInfo = null;
|
243 |
+
|
244 |
+
if (!this.currentSample || !this.columnInfo || !this.columnInfo.inferenceInfo) {
|
245 |
+
console.log('No inference info column detected');
|
246 |
+
return;
|
247 |
+
}
|
248 |
+
|
249 |
+
const inferenceData = this.currentSample[this.columnInfo.inferenceInfo];
|
250 |
+
if (!inferenceData) {
|
251 |
+
console.log('No inference data in current sample');
|
252 |
+
return;
|
253 |
+
}
|
254 |
+
|
255 |
+
console.log('Raw inference data:', inferenceData);
|
256 |
+
const parsed = this.api.parseInferenceInfo(inferenceData);
|
257 |
+
console.log('Parsed inference data:', parsed);
|
258 |
+
|
259 |
+
if (parsed) {
|
260 |
+
const formattedInfo = this.formatModelInfo(parsed);
|
261 |
+
// Ensure it's a plain object, not a proxy
|
262 |
+
this.modelInfo = formattedInfo ? {...formattedInfo} : null;
|
263 |
+
console.log('Formatted model info:', this.modelInfo);
|
264 |
+
}
|
265 |
+
},
|
266 |
+
|
267 |
+
formatModelInfo(info) {
|
268 |
+
if (!info) return null;
|
269 |
+
|
270 |
+
return {
|
271 |
+
modelId: info.model_id || 'Unknown',
|
272 |
+
modelName: info.model_id ? info.model_id.split('/').pop() : 'Unknown',
|
273 |
+
processingDate: info.processing_date ? new Date(info.processing_date).toLocaleDateString() : null,
|
274 |
+
scriptVersion: info.script_version || null,
|
275 |
+
batchSize: info.batch_size || null,
|
276 |
+
maxTokens: info.max_tokens || null,
|
277 |
+
scriptUrl: info.script_url || null,
|
278 |
+
columnName: info.column_name || null
|
279 |
+
};
|
280 |
+
},
|
281 |
+
|
282 |
getOriginalText() {
|
283 |
if (!this.currentSample) return '';
|
284 |
const columns = this.api.detectColumns(null, this.currentSample);
|
js/dataset-api.js
CHANGED
@@ -176,6 +176,7 @@ class DatasetAPI {
|
|
176 |
let imageColumn = null;
|
177 |
let originalTextColumn = null;
|
178 |
let improvedTextColumn = null;
|
|
|
179 |
|
180 |
// Try to detect from features first
|
181 |
for (const feature of features || []) {
|
@@ -192,9 +193,14 @@ class DatasetAPI {
|
|
192 |
originalTextColumn = name;
|
193 |
}
|
194 |
|
195 |
-
if (!improvedTextColumn && ['markdown', 'new_ocr', 'corrected_text', 'improved', 'vlm_ocr', 'corrected'].includes(name)) {
|
196 |
improvedTextColumn = name;
|
197 |
}
|
|
|
|
|
|
|
|
|
|
|
198 |
}
|
199 |
|
200 |
// Fallback: detect from sample row
|
@@ -217,15 +223,21 @@ class DatasetAPI {
|
|
217 |
}
|
218 |
|
219 |
if (!improvedTextColumn) {
|
220 |
-
const candidates = ['markdown', 'new_ocr', 'corrected_text', 'improved'];
|
221 |
improvedTextColumn = keys.find(k => candidates.includes(k)) || null;
|
222 |
}
|
|
|
|
|
|
|
|
|
|
|
223 |
}
|
224 |
|
225 |
return {
|
226 |
image: imageColumn,
|
227 |
originalText: originalTextColumn,
|
228 |
-
improvedText: improvedTextColumn
|
|
|
229 |
};
|
230 |
}
|
231 |
|
@@ -267,6 +279,40 @@ class DatasetAPI {
|
|
267 |
clearCache() {
|
268 |
this.cache.clear();
|
269 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
270 |
}
|
271 |
|
272 |
// Export for use in other scripts
|
|
|
176 |
let imageColumn = null;
|
177 |
let originalTextColumn = null;
|
178 |
let improvedTextColumn = null;
|
179 |
+
let inferenceInfoColumn = null;
|
180 |
|
181 |
// Try to detect from features first
|
182 |
for (const feature of features || []) {
|
|
|
193 |
originalTextColumn = name;
|
194 |
}
|
195 |
|
196 |
+
if (!improvedTextColumn && ['markdown', 'new_ocr', 'corrected_text', 'improved', 'vlm_ocr', 'corrected', 'rolmocr_text'].includes(name)) {
|
197 |
improvedTextColumn = name;
|
198 |
}
|
199 |
+
|
200 |
+
// Detect inference info column
|
201 |
+
if (name === 'inference_info') {
|
202 |
+
inferenceInfoColumn = name;
|
203 |
+
}
|
204 |
}
|
205 |
|
206 |
// Fallback: detect from sample row
|
|
|
223 |
}
|
224 |
|
225 |
if (!improvedTextColumn) {
|
226 |
+
const candidates = ['markdown', 'new_ocr', 'corrected_text', 'improved', 'rolmocr_text'];
|
227 |
improvedTextColumn = keys.find(k => candidates.includes(k)) || null;
|
228 |
}
|
229 |
+
|
230 |
+
// Check for inference info in sample row
|
231 |
+
if (!inferenceInfoColumn && keys.includes('inference_info')) {
|
232 |
+
inferenceInfoColumn = 'inference_info';
|
233 |
+
}
|
234 |
}
|
235 |
|
236 |
return {
|
237 |
image: imageColumn,
|
238 |
originalText: originalTextColumn,
|
239 |
+
improvedText: improvedTextColumn,
|
240 |
+
inferenceInfo: inferenceInfoColumn
|
241 |
};
|
242 |
}
|
243 |
|
|
|
279 |
clearCache() {
|
280 |
this.cache.clear();
|
281 |
}
|
282 |
+
|
283 |
+
/**
|
284 |
+
* Parse inference info JSON safely
|
285 |
+
*/
|
286 |
+
parseInferenceInfo(inferenceInfoData) {
|
287 |
+
if (!inferenceInfoData) return null;
|
288 |
+
|
289 |
+
try {
|
290 |
+
// Handle if it's already an object (some datasets might store it as object)
|
291 |
+
if (typeof inferenceInfoData === 'object' && !Array.isArray(inferenceInfoData)) {
|
292 |
+
return inferenceInfoData;
|
293 |
+
}
|
294 |
+
|
295 |
+
// Handle if it's a JSON string
|
296 |
+
if (typeof inferenceInfoData === 'string') {
|
297 |
+
const parsed = JSON.parse(inferenceInfoData);
|
298 |
+
// If it's an array, take the first item
|
299 |
+
if (Array.isArray(parsed) && parsed.length > 0) {
|
300 |
+
return parsed[0];
|
301 |
+
}
|
302 |
+
return parsed;
|
303 |
+
}
|
304 |
+
|
305 |
+
// Handle if it's already an array
|
306 |
+
if (Array.isArray(inferenceInfoData) && inferenceInfoData.length > 0) {
|
307 |
+
return inferenceInfoData[0];
|
308 |
+
}
|
309 |
+
|
310 |
+
return null;
|
311 |
+
} catch (error) {
|
312 |
+
console.warn('Failed to parse inference info:', error);
|
313 |
+
return null;
|
314 |
+
}
|
315 |
+
}
|
316 |
}
|
317 |
|
318 |
// Export for use in other scripts
|