// ocr-time-capsule/js/dataset-api.js
// Commit 1e32a60 (davanstrien, HF Staff):
//   Add support for davanstrien/rolm-test dataset with model info display
/**
* HuggingFace Dataset Viewer API wrapper
* Handles fetching data from the datasets-server API with caching and error handling
*/
class DatasetAPI {
    constructor() {
        this.baseURL = 'https://datasets-server.huggingface.co';
        this.cache = new Map(); // key -> { data, timestamp }
        this.cacheExpiry = 45 * 60 * 1000; // 45 minutes (conservative for signed URLs)
        this.rowsPerFetch = 100; // API maximum per /rows request
    }

    /**
     * Check if a dataset is valid and has the viewer enabled.
     * @param {string} datasetId - Dataset repo id, e.g. "user/dataset-name".
     * @returns {Promise<true>} Resolves true when the viewer is available.
     * @throws {Error} When the request fails or the viewer is disabled.
     */
    async validateDataset(datasetId) {
        try {
            const response = await fetch(`${this.baseURL}/is-valid?dataset=${encodeURIComponent(datasetId)}`);
            if (!response.ok) {
                throw new Error(`Failed to validate dataset: ${response.statusText}`);
            }
            const data = await response.json();
            if (!data.viewer) {
                throw new Error('Dataset viewer is not available for this dataset');
            }
            return true;
        } catch (error) {
            // Wrap every failure (network, HTTP, viewer-disabled) in one consistent message.
            throw new Error(`Dataset validation failed: ${error.message}`);
        }
    }

    /**
     * Get dataset info including splits and configs.
     * @param {string} datasetId - Dataset repo id.
     * @returns {Promise<{configs: string[], splits: string[], defaultConfig: string, defaultSplit: string, raw: object}>}
     * @throws {Error} When the /splits request fails.
     */
    async getDatasetInfo(datasetId) {
        const cacheKey = `info_${datasetId}`;
        const cached = this.getFromCache(cacheKey);
        // Explicit null check: a cached falsy value must still count as a hit.
        if (cached !== null) return cached;
        try {
            const response = await fetch(`${this.baseURL}/splits?dataset=${encodeURIComponent(datasetId)}`);
            if (!response.ok) {
                throw new Error(`Failed to get dataset info: ${response.statusText}`);
            }
            const data = await response.json();
            // Extract the default config and split; prefer a split literally named "train".
            const defaultConfig = data.splits[0]?.config || 'default';
            const defaultSplit = data.splits.find(s => s.split === 'train')?.split || data.splits[0]?.split || 'train';
            const info = {
                configs: [...new Set(data.splits.map(s => s.config))],
                splits: [...new Set(data.splits.map(s => s.split))],
                defaultConfig,
                defaultSplit,
                raw: data
            };
            this.setCache(cacheKey, info);
            return info;
        } catch (error) {
            throw new Error(`Failed to get dataset info: ${error.message}`);
        }
    }

    /**
     * Get the total number of rows in a dataset split.
     * Tries the /size endpoint first, then falls back to /first-rows.
     * @param {string} datasetId - Dataset repo id.
     * @param {string} config - Config name.
     * @param {string} split - Split name.
     * @returns {Promise<number|null>} Row count, or null when it cannot be determined.
     */
    async getTotalRows(datasetId, config, split) {
        const cacheKey = `size_${datasetId}_${config}_${split}`;
        const cached = this.getFromCache(cacheKey);
        // Must be a null check, not truthiness: a cached size of 0 is a valid hit
        // (the old `if (cached)` re-fetched on every call for empty datasets).
        if (cached !== null) return cached;
        try {
            // First try to get from the size endpoint
            const sizeResponse = await fetch(
                `${this.baseURL}/size?dataset=${encodeURIComponent(datasetId)}&config=${encodeURIComponent(config)}&split=${encodeURIComponent(split)}`
            );
            if (sizeResponse.ok) {
                const sizeData = await sizeResponse.json();
                // The API returns num_rows in size.config or size.splits[0].
                // Use ?? so a legitimate count of 0 is not treated as "missing".
                const size = sizeData.size?.config?.num_rows ??
                             sizeData.size?.splits?.[0]?.num_rows ??
                             0;
                this.setCache(cacheKey, size);
                return size;
            }
            // Fallback: get first rows and check num_rows_total
            const rowsResponse = await fetch(
                `${this.baseURL}/first-rows?dataset=${encodeURIComponent(datasetId)}&config=${encodeURIComponent(config)}&split=${encodeURIComponent(split)}`
            );
            if (!rowsResponse.ok) {
                throw new Error('Unable to determine dataset size');
            }
            const rowsData = await rowsResponse.json();
            const size = rowsData.num_rows_total ?? rowsData.rows?.length ?? 0;
            this.setCache(cacheKey, size);
            return size;
        } catch (error) {
            // Size is non-essential; degrade gracefully rather than propagate.
            console.warn('Failed to get total rows:', error);
            return null;
        }
    }

    /**
     * Fetch a page of rows from the dataset.
     * @param {string} datasetId - Dataset repo id.
     * @param {string} config - Config name.
     * @param {string} split - Split name.
     * @param {number} offset - Zero-based index of the first row to fetch.
     * @param {number} [length=this.rowsPerFetch] - Number of rows to fetch (API max 100).
     * @returns {Promise<{rows: object[], features: object[], columns: object, numRowsTotal: number, partial: boolean}>}
     * @throws {Error} On HTTP failure; 403 is reported as an access-denied message.
     */
    async fetchRows(datasetId, config, split, offset, length = this.rowsPerFetch) {
        const cacheKey = `rows_${datasetId}_${config}_${split}_${offset}_${length}`;
        const cached = this.getFromCache(cacheKey);
        if (cached !== null) return cached;
        try {
            const response = await fetch(
                `${this.baseURL}/rows?dataset=${encodeURIComponent(datasetId)}&config=${encodeURIComponent(config)}&split=${encodeURIComponent(split)}&offset=${offset}&length=${length}`
            );
            if (!response.ok) {
                if (response.status === 403) {
                    throw new Error('Access denied. This dataset may be private or gated.');
                }
                throw new Error(`Failed to fetch rows: ${response.statusText}`);
            }
            const data = await response.json();
            // Extract column information (features metadata + first row as a sample)
            const columns = this.detectColumns(data.features, data.rows[0]?.row);
            const result = {
                rows: data.rows,
                features: data.features,
                columns,
                numRowsTotal: data.num_rows_total,
                partial: data.partial || false
            };
            this.setCache(cacheKey, result);
            return result;
        } catch (error) {
            throw new Error(`Failed to fetch rows: ${error.message}`);
        }
    }

    /**
     * Get a single row by index with smart batching: rows are fetched in
     * aligned pages of `rowsPerFetch`, so neighboring lookups hit the cache.
     * @param {string} datasetId - Dataset repo id.
     * @param {string} config - Config name.
     * @param {string} split - Split name.
     * @param {number} index - Zero-based global row index.
     * @returns {Promise<{row: object, columns: object, numRowsTotal: number}>}
     * @throws {Error} When the index falls outside the fetched batch.
     */
    async getRow(datasetId, config, split, index) {
        // Calculate which aligned batch this index falls into
        const batchStart = Math.floor(index / this.rowsPerFetch) * this.rowsPerFetch;
        const batchData = await this.fetchRows(datasetId, config, split, batchStart, this.rowsPerFetch);
        const localIndex = index - batchStart;
        if (localIndex >= 0 && localIndex < batchData.rows.length) {
            return {
                row: batchData.rows[localIndex].row,
                columns: batchData.columns,
                numRowsTotal: batchData.numRowsTotal
            };
        }
        throw new Error(`Row ${index} not found`);
    }

    /**
     * Detect column names for image and text data, using the dataset's
     * feature metadata first and a sample row as a fallback.
     * @param {object[]} features - Feature descriptors from the API (may be null).
     * @param {object} [sampleRow] - One decoded row object, used as a fallback.
     * @returns {{image: string|null, originalText: string|null, improvedText: string|null, inferenceInfo: string|null}}
     */
    detectColumns(features, sampleRow) {
        let imageColumn = null;
        let originalTextColumn = null;
        let improvedTextColumn = null;
        let inferenceInfoColumn = null;
        // Try to detect from features first
        for (const feature of features || []) {
            const name = feature.name;
            const type = feature.type;
            // Detect image column
            if (type._type === 'Image' || type.dtype === 'image' || type.feature?._type === 'Image') {
                imageColumn = name;
            }
            // Detect text columns based on common naming patterns
            if (!originalTextColumn && ['text', 'ocr', 'original_text', 'original', 'ground_truth'].includes(name)) {
                originalTextColumn = name;
            }
            if (!improvedTextColumn && ['markdown', 'new_ocr', 'corrected_text', 'improved', 'vlm_ocr', 'corrected', 'rolmocr_text'].includes(name)) {
                improvedTextColumn = name;
            }
            // Detect inference info column
            if (name === 'inference_info') {
                inferenceInfoColumn = name;
            }
        }
        // Fallback: detect from sample row
        if (sampleRow) {
            const keys = Object.keys(sampleRow);
            if (!imageColumn) {
                // Image cells from the API are objects with a signed `src` URL and dimensions.
                for (const key of keys) {
                    if (sampleRow[key]?.src && sampleRow[key]?.height !== undefined) {
                        imageColumn = key;
                        break;
                    }
                }
            }
            // Additional text column detection from row data
            if (!originalTextColumn) {
                const candidates = ['text', 'ocr', 'original_text', 'original'];
                originalTextColumn = keys.find(k => candidates.includes(k)) || null;
            }
            if (!improvedTextColumn) {
                const candidates = ['markdown', 'new_ocr', 'corrected_text', 'improved', 'rolmocr_text'];
                improvedTextColumn = keys.find(k => candidates.includes(k)) || null;
            }
            // Check for inference info in sample row
            if (!inferenceInfoColumn && keys.includes('inference_info')) {
                inferenceInfoColumn = 'inference_info';
            }
        }
        return {
            image: imageColumn,
            originalText: originalTextColumn,
            improvedText: improvedTextColumn,
            inferenceInfo: inferenceInfoColumn
        };
    }

    /**
     * Refresh an expired image URL by evicting the row's batch from the
     * cache and re-fetching it (image `src` URLs are signed and expire).
     * @param {string} datasetId - Dataset repo id.
     * @param {string} config - Config name.
     * @param {string} split - Split name.
     * @param {number} index - Zero-based global row index.
     * @returns {Promise<{row: object, columns: object, numRowsTotal: number}>}
     */
    async refreshImageUrl(datasetId, config, split, index) {
        // Clear cache for this specific row batch
        const batchStart = Math.floor(index / this.rowsPerFetch) * this.rowsPerFetch;
        const cacheKey = `rows_${datasetId}_${config}_${split}_${batchStart}_${this.rowsPerFetch}`;
        this.cache.delete(cacheKey);
        // Re-fetch the row
        return await this.getRow(datasetId, config, split, index);
    }

    /**
     * Read a value from the cache.
     * @param {string} key - Cache key.
     * @returns {*} The cached value, or null on miss/expiry. Callers must
     *   compare against null, since cached values may themselves be falsy.
     */
    getFromCache(key) {
        const cached = this.cache.get(key);
        if (!cached) return null;
        if (Date.now() - cached.timestamp > this.cacheExpiry) {
            this.cache.delete(key);
            return null;
        }
        return cached.data;
    }

    /**
     * Store a value in the cache with the current timestamp.
     * @param {string} key - Cache key.
     * @param {*} data - Value to store.
     */
    setCache(key, data) {
        this.cache.set(key, {
            data,
            timestamp: Date.now()
        });
    }

    /** Drop every cached entry. */
    clearCache() {
        this.cache.clear();
    }

    /**
     * Parse inference info safely. Accepts a JSON string, a plain object,
     * or an array (in which case the first element is returned).
     * @param {string|object|object[]|null} inferenceInfoData
     * @returns {object|null} Parsed info object, or null when absent/unparseable.
     */
    parseInferenceInfo(inferenceInfoData) {
        if (!inferenceInfoData) return null;
        try {
            // Handle if it's already an object (some datasets might store it as object)
            if (typeof inferenceInfoData === 'object' && !Array.isArray(inferenceInfoData)) {
                return inferenceInfoData;
            }
            // Handle if it's a JSON string
            if (typeof inferenceInfoData === 'string') {
                const parsed = JSON.parse(inferenceInfoData);
                // If it's an array, take the first item
                if (Array.isArray(parsed) && parsed.length > 0) {
                    return parsed[0];
                }
                return parsed;
            }
            // Handle if it's already an array
            if (Array.isArray(inferenceInfoData) && inferenceInfoData.length > 0) {
                return inferenceInfoData[0];
            }
            return null;
        } catch (error) {
            console.warn('Failed to parse inference info:', error);
            return null;
        }
    }
}
// Export for use in other scripts. globalThis === window in browsers, but
// unlike `window` it also exists in workers and other non-window contexts.
globalThis.DatasetAPI = DatasetAPI;