hf-train-frontend / dataset_config.json
George-API's picture
Upload folder using huggingface_hub
356ee13 verified
raw
history blame
1.14 kB
{
  "dataset": {
    "name": "George-API/cognitive-data",
    "split": "train",
    "column_mapping": {
      "conversations": "text"
    },
    "processing": {
      "sort_by_id": true,
      "maintain_paper_order": true,
      "max_seq_length": 2048
    }
  },
  "data_formatting": {
    "chat_template": "phi",
    "roles": {
      "system": "System: {content}\n\n",
      "human": "Human: {content}\n\n",
      "assistant": "Assistant: {content}\n\n",
      "user": "Human: {content}\n\n"
    },
    "metadata_handling": {
      "include_paper_id": true,
      "include_chunk_number": true,
      "metadata_format": "Paper ID: {paper_id} | Chunk: {chunk_number}"
    }
  },
  "data_loading": {
    "batch_size": 24,
    "shuffle": false,
    "drop_last": false,
    "num_workers": 4,
    "pin_memory": true,
    "prefetch_factor": 4
  },
  "validation": {
    "log_samples": 3,
    "log_interval": 50,
    "metrics": ["processed", "skipped", "avg_tokens", "unique_papers"]
  }
}