davanstrien HF Staff Claude committed on
Commit
1e32a60
·
1 Parent(s): dbda7b0

Add support for davanstrien/rolm-test dataset with model info display

Browse files

- Add inference_info column detection and rolmocr_text as improved text column
- Add example dataset selector dropdown with quick access to both datasets
- Extract and display model metadata (model name, processing date, parameters)
- Move model info panel above metrics for better visibility
- Fix sidebar scrolling to prevent model info from being hidden
- Add debug logging for troubleshooting column detection

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (3) hide show
  1. index.html +61 -2
  2. js/app.js +64 -0
  3. js/dataset-api.js +49 -3
index.html CHANGED
@@ -63,6 +63,27 @@
63
  >
64
  Load
65
  </button>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  </div>
67
  </div>
68
 
@@ -202,8 +223,8 @@
202
  <!-- Content Area -->
203
  <div x-show="!loading && !error && currentSample" class="flex-1 flex h-full">
204
  <!-- Image Panel -->
205
- <div class="w-1/3 bg-gray-100 dark:bg-gray-800 p-4 overflow-hidden border-r border-gray-200 dark:border-gray-700">
206
- <div class="sticky top-0">
207
  <div class="bg-white dark:bg-gray-700 rounded-lg shadow-sm overflow-hidden">
208
  <img
209
  :src="getImageSrc()"
@@ -221,6 +242,44 @@
221
  </div>
222
  </div>
223
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
  <!-- Statistics Panel -->
225
  <div class="mt-4 bg-white dark:bg-gray-700 rounded-lg shadow-sm p-4">
226
  <h3 class="text-sm font-medium text-gray-700 dark:text-gray-300 mb-3">OCR Quality Metrics</h3>
 
63
  >
64
  Load
65
  </button>
66
+
67
+ <!-- Example Dataset Selector -->
68
+ <div class="relative group">
69
+ <button class="px-3 py-1.5 text-sm text-gray-600 hover:text-gray-900 dark:text-gray-400 dark:hover:text-gray-200 border border-gray-300 dark:border-gray-600 rounded-md flex items-center space-x-1">
70
+ <svg class="w-4 h-4" fill="none" stroke="currentColor" viewBox="0 0 24 24">
71
+ <path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M9 5H7a2 2 0 00-2 2v12a2 2 0 002 2h10a2 2 0 002-2V7a2 2 0 00-2-2h-2M9 5a2 2 0 002 2h2a2 2 0 002-2M9 5a2 2 0 012-2h2a2 2 0 012 2"></path>
72
+ </svg>
73
+ <span>Examples</span>
74
+ </button>
75
+ <div class="absolute left-0 mt-1 w-72 bg-white dark:bg-gray-800 rounded-md shadow-lg border border-gray-200 dark:border-gray-700 hidden group-hover:block z-50">
76
+ <template x-for="dataset in exampleDatasets" :key="dataset.id">
77
+ <button
78
+ @click="selectDataset(dataset.id)"
79
+ class="block w-full text-left px-4 py-3 hover:bg-gray-50 dark:hover:bg-gray-700 border-b border-gray-100 dark:border-gray-600 last:border-b-0"
80
+ >
81
+ <div class="font-medium text-sm text-gray-900 dark:text-gray-100" x-text="dataset.name"></div>
82
+ <div class="text-xs text-gray-500 dark:text-gray-400 mt-1" x-text="dataset.description"></div>
83
+ </button>
84
+ </template>
85
+ </div>
86
+ </div>
87
  </div>
88
  </div>
89
 
 
223
  <!-- Content Area -->
224
  <div x-show="!loading && !error && currentSample" class="flex-1 flex h-full">
225
  <!-- Image Panel -->
226
+ <div class="w-1/3 bg-gray-100 dark:bg-gray-800 p-4 overflow-y-auto border-r border-gray-200 dark:border-gray-700">
227
+ <div>
228
  <div class="bg-white dark:bg-gray-700 rounded-lg shadow-sm overflow-hidden">
229
  <img
230
  :src="getImageSrc()"
 
242
  </div>
243
  </div>
244
 
245
+ <!-- Model Info Panel -->
246
+ <div x-show="modelInfo" x-transition class="mt-4 bg-white dark:bg-gray-700 rounded-lg shadow-sm p-4">
247
+ <h3 class="text-sm font-medium text-gray-700 dark:text-gray-300 mb-3 flex items-center">
248
+ <svg class="w-4 h-4 mr-1" fill="none" stroke="currentColor" viewBox="0 0 24 24">
249
+ <path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M9.75 17L9 20l-1 1h8l-1-1-.75-3M3 13h18M5 17h14a2 2 0 002-2V5a2 2 0 00-2-2H5a2 2 0 00-2 2v10a2 2 0 002 2z"></path>
250
+ </svg>
251
+ Model Information
252
+ </h3>
253
+ <div class="space-y-2 text-xs">
254
+ <div class="flex justify-between items-center">
255
+ <span class="text-gray-600 dark:text-gray-400">Model</span>
256
+ <span class="font-medium text-gray-900 dark:text-gray-100" x-text="modelInfo?.modelName || '-'"></span>
257
+ </div>
258
+ <div x-show="modelInfo?.processingDate" class="flex justify-between items-center">
259
+ <span class="text-gray-600 dark:text-gray-400">Processed</span>
260
+ <span class="text-gray-900 dark:text-gray-100" x-text="modelInfo?.processingDate || '-'"></span>
261
+ </div>
262
+ <div x-show="modelInfo?.batchSize" class="flex justify-between items-center">
263
+ <span class="text-gray-600 dark:text-gray-400">Batch Size</span>
264
+ <span class="text-gray-900 dark:text-gray-100" x-text="modelInfo?.batchSize || '-'"></span>
265
+ </div>
266
+ <div x-show="modelInfo?.maxTokens" class="flex justify-between items-center">
267
+ <span class="text-gray-600 dark:text-gray-400">Max Tokens</span>
268
+ <span class="text-gray-900 dark:text-gray-100" x-text="modelInfo?.maxTokens?.toLocaleString() || '-'"></span>
269
+ </div>
270
+ <div x-show="modelInfo?.scriptUrl" class="mt-2 pt-2 border-t border-gray-200 dark:border-gray-600">
271
+ <a :href="modelInfo?.scriptUrl"
272
+ target="_blank"
273
+ class="text-blue-600 dark:text-blue-400 hover:underline flex items-center">
274
+ <svg class="w-3 h-3 mr-1" fill="none" stroke="currentColor" viewBox="0 0 24 24">
275
+ <path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14"></path>
276
+ </svg>
277
+ View Script
278
+ </a>
279
+ </div>
280
+ </div>
281
+ </div>
282
+
283
  <!-- Statistics Panel -->
284
  <div class="mt-4 bg-white dark:bg-gray-700 rounded-lg shadow-sm p-4">
285
  <h3 class="text-sm font-medium text-gray-700 dark:text-gray-300 mb-3">OCR Quality Metrics</h3>
js/app.js CHANGED
@@ -9,6 +9,12 @@ document.addEventListener('alpine:init', () => {
9
  datasetConfig: 'default',
10
  datasetSplit: 'train',
11
 
 
 
 
 
 
 
12
  // Navigation state
13
  currentIndex: 0,
14
  totalSamples: null,
@@ -53,6 +59,10 @@ document.addEventListener('alpine:init', () => {
53
  // Markdown cache
54
  markdownCache: new Map(),
55
 
 
 
 
 
56
  async init() {
57
  // Initialize API
58
  this.api = new DatasetAPI();
@@ -171,6 +181,14 @@ document.addEventListener('alpine:init', () => {
171
 
172
  this.currentSample = data.row;
173
  this.currentIndex = index;
 
 
 
 
 
 
 
 
174
 
175
  // Check if improved text contains markdown
176
  const improvedText = this.getImprovedText();
@@ -215,6 +233,52 @@ document.addEventListener('alpine:init', () => {
215
  }
216
  },
217
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
  getOriginalText() {
219
  if (!this.currentSample) return '';
220
  const columns = this.api.detectColumns(null, this.currentSample);
 
9
  datasetConfig: 'default',
10
  datasetSplit: 'train',
11
 
12
+ // Example datasets
13
+ exampleDatasets: [
14
+ { id: 'davanstrien/exams-ocr', name: 'Exams OCR', description: 'Historical exam papers with VLM corrections' },
15
+ { id: 'davanstrien/rolm-test', name: 'ROLM Test', description: 'Documents processed with RolmOCR model' }
16
+ ],
17
+
18
  // Navigation state
19
  currentIndex: 0,
20
  totalSamples: null,
 
59
  // Markdown cache
60
  markdownCache: new Map(),
61
 
62
+ // Model info
63
+ modelInfo: null,
64
+ columnInfo: null,
65
+
66
  async init() {
67
  // Initialize API
68
  this.api = new DatasetAPI();
 
181
 
182
  this.currentSample = data.row;
183
  this.currentIndex = index;
184
+ this.columnInfo = data.columns;
185
+
186
+ // Extract model info if available
187
+ this.extractModelInfo();
188
+
189
+ // Debug: Log column info
190
+ console.log('Column info:', this.columnInfo);
191
+ console.log('Current sample keys:', Object.keys(this.currentSample));
192
 
193
  // Check if improved text contains markdown
194
  const improvedText = this.getImprovedText();
 
233
  }
234
  },
235
 
236
+ async selectDataset(datasetId) {
237
+ this.datasetId = datasetId;
238
+ await this.loadDataset();
239
+ },
240
+
241
+ extractModelInfo() {
242
+ this.modelInfo = null;
243
+
244
+ if (!this.currentSample || !this.columnInfo || !this.columnInfo.inferenceInfo) {
245
+ console.log('No inference info column detected');
246
+ return;
247
+ }
248
+
249
+ const inferenceData = this.currentSample[this.columnInfo.inferenceInfo];
250
+ if (!inferenceData) {
251
+ console.log('No inference data in current sample');
252
+ return;
253
+ }
254
+
255
+ console.log('Raw inference data:', inferenceData);
256
+ const parsed = this.api.parseInferenceInfo(inferenceData);
257
+ console.log('Parsed inference data:', parsed);
258
+
259
+ if (parsed) {
260
+ const formattedInfo = this.formatModelInfo(parsed);
261
+ // Ensure it's a plain object, not a proxy
262
+ this.modelInfo = formattedInfo ? {...formattedInfo} : null;
263
+ console.log('Formatted model info:', this.modelInfo);
264
+ }
265
+ },
266
+
267
+ formatModelInfo(info) {
268
+ if (!info) return null;
269
+
270
+ return {
271
+ modelId: info.model_id || 'Unknown',
272
+ modelName: info.model_id ? info.model_id.split('/').pop() : 'Unknown',
273
+ processingDate: info.processing_date ? new Date(info.processing_date).toLocaleDateString() : null,
274
+ scriptVersion: info.script_version || null,
275
+ batchSize: info.batch_size || null,
276
+ maxTokens: info.max_tokens || null,
277
+ scriptUrl: info.script_url || null,
278
+ columnName: info.column_name || null
279
+ };
280
+ },
281
+
282
  getOriginalText() {
283
  if (!this.currentSample) return '';
284
  const columns = this.api.detectColumns(null, this.currentSample);
js/dataset-api.js CHANGED
@@ -176,6 +176,7 @@ class DatasetAPI {
176
  let imageColumn = null;
177
  let originalTextColumn = null;
178
  let improvedTextColumn = null;
 
179
 
180
  // Try to detect from features first
181
  for (const feature of features || []) {
@@ -192,9 +193,14 @@ class DatasetAPI {
192
  originalTextColumn = name;
193
  }
194
 
195
- if (!improvedTextColumn && ['markdown', 'new_ocr', 'corrected_text', 'improved', 'vlm_ocr', 'corrected'].includes(name)) {
196
  improvedTextColumn = name;
197
  }
 
 
 
 
 
198
  }
199
 
200
  // Fallback: detect from sample row
@@ -217,15 +223,21 @@ class DatasetAPI {
217
  }
218
 
219
  if (!improvedTextColumn) {
220
- const candidates = ['markdown', 'new_ocr', 'corrected_text', 'improved'];
221
  improvedTextColumn = keys.find(k => candidates.includes(k)) || null;
222
  }
 
 
 
 
 
223
  }
224
 
225
  return {
226
  image: imageColumn,
227
  originalText: originalTextColumn,
228
- improvedText: improvedTextColumn
 
229
  };
230
  }
231
 
@@ -267,6 +279,40 @@ class DatasetAPI {
267
  clearCache() {
268
  this.cache.clear();
269
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
270
  }
271
 
272
  // Export for use in other scripts
 
176
  let imageColumn = null;
177
  let originalTextColumn = null;
178
  let improvedTextColumn = null;
179
+ let inferenceInfoColumn = null;
180
 
181
  // Try to detect from features first
182
  for (const feature of features || []) {
 
193
  originalTextColumn = name;
194
  }
195
 
196
+ if (!improvedTextColumn && ['markdown', 'new_ocr', 'corrected_text', 'improved', 'vlm_ocr', 'corrected', 'rolmocr_text'].includes(name)) {
197
  improvedTextColumn = name;
198
  }
199
+
200
+ // Detect inference info column
201
+ if (name === 'inference_info') {
202
+ inferenceInfoColumn = name;
203
+ }
204
  }
205
 
206
  // Fallback: detect from sample row
 
223
  }
224
 
225
  if (!improvedTextColumn) {
226
+ const candidates = ['markdown', 'new_ocr', 'corrected_text', 'improved', 'rolmocr_text'];
227
  improvedTextColumn = keys.find(k => candidates.includes(k)) || null;
228
  }
229
+
230
+ // Check for inference info in sample row
231
+ if (!inferenceInfoColumn && keys.includes('inference_info')) {
232
+ inferenceInfoColumn = 'inference_info';
233
+ }
234
  }
235
 
236
  return {
237
  image: imageColumn,
238
  originalText: originalTextColumn,
239
+ improvedText: improvedTextColumn,
240
+ inferenceInfo: inferenceInfoColumn
241
  };
242
  }
243
 
 
279
  clearCache() {
280
  this.cache.clear();
281
  }
282
+
283
+ /**
284
+ * Parse inference info JSON safely
285
+ */
286
+ parseInferenceInfo(inferenceInfoData) {
287
+ if (!inferenceInfoData) return null;
288
+
289
+ try {
290
+ // Handle if it's already an object (some datasets might store it as object)
291
+ if (typeof inferenceInfoData === 'object' && !Array.isArray(inferenceInfoData)) {
292
+ return inferenceInfoData;
293
+ }
294
+
295
+ // Handle if it's a JSON string
296
+ if (typeof inferenceInfoData === 'string') {
297
+ const parsed = JSON.parse(inferenceInfoData);
298
+ // If it's an array, take the first item
299
+ if (Array.isArray(parsed) && parsed.length > 0) {
300
+ return parsed[0];
301
+ }
302
+ return parsed;
303
+ }
304
+
305
+ // Handle if it's already an array
306
+ if (Array.isArray(inferenceInfoData) && inferenceInfoData.length > 0) {
307
+ return inferenceInfoData[0];
308
+ }
309
+
310
+ return null;
311
+ } catch (error) {
312
+ console.warn('Failed to parse inference info:', error);
313
+ return null;
314
+ }
315
+ }
316
  }
317
 
318
  // Export for use in other scripts