SmolLM - blazingly fast and remarkably powerful
•
262
from datasets import load_dataset
pdfa_dataset = load_dataset('pixparse/pdfa-eng-wds', streaming=True)
IDL_dataset = load_dataset('pixparse/idl-wds', streaming=True)
import chug
task_cfg = chug.DataTaskDocReadCfg(
page_sampling='all',
)
data_cfg = chug.DataCfg(
source='pixparse/pdfa-eng-wds',
split='train',
batch_size=None,
format='hfids',
num_workers=0,
)
data_loader = chug.create_loader(
data_cfg,
task_cfg,
)
sample = next(iter(data_loader))