File size: 1,144 Bytes
a57357b
 
 
 
 
356ee13
a57357b
 
 
 
 
 
 
 
 
 
 
 
356ee13
 
a57357b
 
 
 
 
 
 
 
3da7418
a57357b
 
356ee13
3da7418
 
a57357b
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
{
    "dataset": {
        "name": "George-API/cognitive-data",
        "split": "train",
        "column_mapping": {
            "conversations": "text"
        },
        "processing": {
            "sort_by_id": true,
            "maintain_paper_order": true,
            "max_seq_length": 2048
        }
    },
    "data_formatting": {
        "chat_template": "phi",
        "roles": {
            "system": "System: {content}\n\n",
            "human": "Human: {content}\n\n",
            "assistant": "Assistant: {content}\n\n",
            "user": "Human: {content}\n\n"
        },
        "metadata_handling": {
            "include_paper_id": true,
            "include_chunk_number": true,
            "metadata_format": "Paper ID: {paper_id} | Chunk: {chunk_number}"
        }
    },
    "data_loading": {
        "batch_size": 24,
        "shuffle": false,
        "drop_last": false,
        "num_workers": 4,
        "pin_memory": true,
        "prefetch_factor": 4
    },
    "validation": {
        "log_samples": 3,
        "log_interval": 50,
        "metrics": ["processed", "skipped", "avg_tokens", "unique_papers"]
    }
}