Spaces:
Running
Running
/**
 * HuggingFace Dataset Viewer API wrapper.
 * Handles fetching data from the datasets-server API with caching and error handling.
 */
class DatasetAPI {
  constructor() {
    this.baseURL = 'https://datasets-server.huggingface.co';
    this.cache = new Map();
    this.cacheExpiry = 45 * 60 * 1000; // 45 minutes (conservative for signed URLs)
    this.rowsPerFetch = 100; // API maximum
  }

  /**
   * Build a datasets-server endpoint URL with safely encoded query parameters.
   * @param {string} path - Endpoint path, e.g. '/rows'.
   * @param {Object<string, string|number>} params - Query parameters.
   * @returns {string} Absolute URL.
   */
  buildUrl(path, params) {
    const url = new URL(path, this.baseURL);
    for (const [key, value] of Object.entries(params)) {
      url.searchParams.set(key, value);
    }
    return url.toString();
  }

  /**
   * Check if a dataset is valid and has the viewer enabled.
   * @param {string} datasetId - e.g. 'user/dataset'.
   * @returns {Promise<true>} Resolves true when the viewer is available.
   * @throws {Error} When the request fails or the viewer is disabled.
   */
  async validateDataset(datasetId) {
    try {
      const response = await fetch(this.buildUrl('/is-valid', { dataset: datasetId }));
      if (!response.ok) {
        throw new Error(`Failed to validate dataset: ${response.statusText}`);
      }
      const data = await response.json();
      if (!data.viewer) {
        throw new Error('Dataset viewer is not available for this dataset');
      }
      return true;
    } catch (error) {
      throw new Error(`Dataset validation failed: ${error.message}`);
    }
  }

  /**
   * Get dataset info including splits and configs.
   * @param {string} datasetId
   * @returns {Promise<{configs: string[], splits: string[], defaultConfig: string, defaultSplit: string, raw: Object}>}
   * @throws {Error} When the splits endpoint request fails.
   */
  async getDatasetInfo(datasetId) {
    const cacheKey = `info_${datasetId}`;
    const cached = this.getFromCache(cacheKey);
    // Explicit null check: a falsy-but-valid cached value must still count as a hit.
    if (cached !== null) return cached;
    try {
      const response = await fetch(this.buildUrl('/splits', { dataset: datasetId }));
      if (!response.ok) {
        throw new Error(`Failed to get dataset info: ${response.statusText}`);
      }
      const data = await response.json();
      // Extract the default config and split; prefer 'train' when present.
      const defaultConfig = data.splits[0]?.config || 'default';
      const defaultSplit = data.splits.find(s => s.split === 'train')?.split || data.splits[0]?.split || 'train';
      const info = {
        configs: [...new Set(data.splits.map(s => s.config))],
        splits: [...new Set(data.splits.map(s => s.split))],
        defaultConfig,
        defaultSplit,
        raw: data
      };
      this.setCache(cacheKey, info);
      return info;
    } catch (error) {
      throw new Error(`Failed to get dataset info: ${error.message}`);
    }
  }

  /**
   * Get the total number of rows in a dataset split.
   * Tries the /size endpoint first, then falls back to /first-rows.
   * @param {string} datasetId
   * @param {string} config
   * @param {string} split
   * @returns {Promise<number|null>} Row count, or null if it cannot be determined.
   */
  async getTotalRows(datasetId, config, split) {
    const cacheKey = `size_${datasetId}_${config}_${split}`;
    const cached = this.getFromCache(cacheKey);
    // A cached size of 0 is valid — only a null result means "not cached".
    if (cached !== null) return cached;
    try {
      // First try to get from the size endpoint
      const sizeResponse = await fetch(
        this.buildUrl('/size', { dataset: datasetId, config, split })
      );
      if (sizeResponse.ok) {
        const sizeData = await sizeResponse.json();
        // The API returns num_rows in size.config or size.splits[0].
        // Use ?? so a legitimate num_rows of 0 is not skipped.
        const size = sizeData.size?.config?.num_rows ??
          sizeData.size?.splits?.[0]?.num_rows ??
          0;
        this.setCache(cacheKey, size);
        return size;
      }
      // Fallback: get first rows and check num_rows_total
      const rowsResponse = await fetch(
        this.buildUrl('/first-rows', { dataset: datasetId, config, split })
      );
      if (!rowsResponse.ok) {
        throw new Error('Unable to determine dataset size');
      }
      const rowsData = await rowsResponse.json();
      const size = rowsData.num_rows_total ?? rowsData.rows?.length ?? 0;
      this.setCache(cacheKey, size);
      return size;
    } catch (error) {
      // Best-effort: callers treat null as "size unknown" rather than failing hard.
      console.warn('Failed to get total rows:', error);
      return null;
    }
  }

  /**
   * Fetch a page of rows from the dataset.
   * @param {string} datasetId
   * @param {string} config
   * @param {string} split
   * @param {number} offset - Zero-based index of the first row to fetch.
   * @param {number} [length=this.rowsPerFetch] - Number of rows to fetch (API max 100).
   * @returns {Promise<{rows: Array, features: Array, columns: Object, numRowsTotal: number, partial: boolean}>}
   * @throws {Error} On HTTP failure, including a specific message for 403 (private/gated).
   */
  async fetchRows(datasetId, config, split, offset, length = this.rowsPerFetch) {
    const cacheKey = `rows_${datasetId}_${config}_${split}_${offset}_${length}`;
    const cached = this.getFromCache(cacheKey);
    if (cached !== null) return cached;
    try {
      const response = await fetch(
        this.buildUrl('/rows', { dataset: datasetId, config, split, offset, length })
      );
      if (!response.ok) {
        if (response.status === 403) {
          throw new Error('Access denied. This dataset may be private or gated.');
        }
        throw new Error(`Failed to fetch rows: ${response.statusText}`);
      }
      const data = await response.json();
      // Extract column information from features, falling back to the first row.
      const columns = this.detectColumns(data.features, data.rows[0]?.row);
      const result = {
        rows: data.rows,
        features: data.features,
        columns,
        numRowsTotal: data.num_rows_total,
        partial: data.partial || false
      };
      this.setCache(cacheKey, result);
      return result;
    } catch (error) {
      throw new Error(`Failed to fetch rows: ${error.message}`);
    }
  }

  /**
   * Get a single row by index with smart batching: rows are fetched in
   * rowsPerFetch-aligned batches so neighboring lookups hit the cache.
   * @param {string} datasetId
   * @param {string} config
   * @param {string} split
   * @param {number} index - Absolute row index.
   * @returns {Promise<{row: Object, columns: Object, numRowsTotal: number}>}
   * @throws {Error} When the index is outside the fetched batch.
   */
  async getRow(datasetId, config, split, index) {
    // Calculate which batch this index falls into
    const batchStart = Math.floor(index / this.rowsPerFetch) * this.rowsPerFetch;
    const batchData = await this.fetchRows(datasetId, config, split, batchStart, this.rowsPerFetch);
    const localIndex = index - batchStart;
    if (localIndex >= 0 && localIndex < batchData.rows.length) {
      return {
        row: batchData.rows[localIndex].row,
        columns: batchData.columns,
        numRowsTotal: batchData.numRowsTotal
      };
    }
    throw new Error(`Row ${index} not found`);
  }

  /**
   * Detect column names for image and text data.
   * Checks declared features first, then falls back to inspecting a sample row.
   * @param {Array<{name: string, type: Object}>} features - Feature descriptors from the API.
   * @param {Object} [sampleRow] - A raw row object used as a fallback for detection.
   * @returns {{image: string|null, originalText: string|null, improvedText: string|null, inferenceInfo: string|null}}
   */
  detectColumns(features, sampleRow) {
    let imageColumn = null;
    let originalTextColumn = null;
    let improvedTextColumn = null;
    let inferenceInfoColumn = null;
    // Try to detect from features first
    for (const feature of features || []) {
      const name = feature.name;
      const type = feature.type;
      // Detect image column
      if (type._type === 'Image' || type.dtype === 'image' || type.feature?._type === 'Image') {
        imageColumn = name;
      }
      // Detect text columns based on common patterns
      if (!originalTextColumn && ['text', 'ocr', 'original_text', 'original', 'ground_truth'].includes(name)) {
        originalTextColumn = name;
      }
      if (!improvedTextColumn && ['markdown', 'new_ocr', 'corrected_text', 'improved', 'vlm_ocr', 'corrected', 'rolmocr_text'].includes(name)) {
        improvedTextColumn = name;
      }
      // Detect inference info column
      if (name === 'inference_info') {
        inferenceInfoColumn = name;
      }
    }
    // Fallback: detect from sample row
    if (sampleRow) {
      const keys = Object.keys(sampleRow);
      if (!imageColumn) {
        for (const key of keys) {
          // The rows API represents images as objects with src/height/width.
          if (sampleRow[key]?.src && sampleRow[key]?.height !== undefined) {
            imageColumn = key;
            break;
          }
        }
      }
      // Additional text column detection from row data
      if (!originalTextColumn) {
        const candidates = ['text', 'ocr', 'original_text', 'original'];
        originalTextColumn = keys.find(k => candidates.includes(k)) || null;
      }
      if (!improvedTextColumn) {
        const candidates = ['markdown', 'new_ocr', 'corrected_text', 'improved', 'rolmocr_text'];
        improvedTextColumn = keys.find(k => candidates.includes(k)) || null;
      }
      // Check for inference info in sample row
      if (!inferenceInfoColumn && keys.includes('inference_info')) {
        inferenceInfoColumn = 'inference_info';
      }
    }
    return {
      image: imageColumn,
      originalText: originalTextColumn,
      improvedText: improvedTextColumn,
      inferenceInfo: inferenceInfoColumn
    };
  }

  /**
   * Refresh an expired (signed) image URL by evicting the row's batch
   * from the cache and re-fetching it.
   * @param {string} datasetId
   * @param {string} config
   * @param {string} split
   * @param {number} index - Absolute row index.
   * @returns {Promise<{row: Object, columns: Object, numRowsTotal: number}>}
   */
  async refreshImageUrl(datasetId, config, split, index) {
    // Clear cache for this specific row batch
    const batchStart = Math.floor(index / this.rowsPerFetch) * this.rowsPerFetch;
    const cacheKey = `rows_${datasetId}_${config}_${split}_${batchStart}_${this.rowsPerFetch}`;
    this.cache.delete(cacheKey);
    // Re-fetch the row
    return await this.getRow(datasetId, config, split, index);
  }

  /**
   * Read a value from the cache.
   * @param {string} key
   * @returns {*} The cached value, or null when absent or expired.
   *   Note: callers must compare against null, not truthiness, since 0 is a valid value.
   */
  getFromCache(key) {
    const cached = this.cache.get(key);
    if (!cached) return null;
    if (Date.now() - cached.timestamp > this.cacheExpiry) {
      this.cache.delete(key);
      return null;
    }
    return cached.data;
  }

  /**
   * Store a value in the cache with the current timestamp.
   * @param {string} key
   * @param {*} data
   */
  setCache(key, data) {
    this.cache.set(key, {
      data,
      timestamp: Date.now()
    });
  }

  /** Drop all cached entries. */
  clearCache() {
    this.cache.clear();
  }

  /**
   * Parse inference info safely, tolerating the several shapes datasets use
   * (object, JSON string, or array — first element wins for arrays).
   * @param {Object|string|Array|null} inferenceInfoData
   * @returns {Object|null} Parsed object, or null when absent or unparseable.
   */
  parseInferenceInfo(inferenceInfoData) {
    if (!inferenceInfoData) return null;
    try {
      // Handle if it's already an object (some datasets might store it as object)
      if (typeof inferenceInfoData === 'object' && !Array.isArray(inferenceInfoData)) {
        return inferenceInfoData;
      }
      // Handle if it's a JSON string
      if (typeof inferenceInfoData === 'string') {
        const parsed = JSON.parse(inferenceInfoData);
        // If it's an array, take the first item
        if (Array.isArray(parsed) && parsed.length > 0) {
          return parsed[0];
        }
        return parsed;
      }
      // Handle if it's already an array
      if (Array.isArray(inferenceInfoData) && inferenceInfoData.length > 0) {
        return inferenceInfoData[0];
      }
      return null;
    } catch (error) {
      console.warn('Failed to parse inference info:', error);
      return null;
    }
  }
}
// Export for use in other scripts. Guarded so loading this file in a
// non-browser context (worker, test runner) does not throw a ReferenceError.
if (typeof window !== 'undefined') {
  window.DatasetAPI = DatasetAPI;
}