```yaml
checkpoints: null
data: null
general:
  benchmark_csv_path: null
  consumed_train_samples: null
  ignore_sanity_checks: false
  project: mistralai
  run: Mistral-7B-v0.1
  seed: 42
  step: 0
logging: null
model:
  ddp_bucket_cap_mb: 25
  dtype: bfloat16
  init_method:
    std: 0.025
  make_vocab_size_divisible_by: 1
  model_config:
    attn_pdrop: 0.0
    bos_token_id: 1
    eos_token_id: 2
    hidden_act: silu
    hidden_size: 4096
    initializer_range: 0.02
    intermediate_size: 14336
    is_mistral_config: true
    max_position_embeddings: 32768
    num_attention_heads: 32
    num_hidden_layers: 32
    num_key_value_heads: 8
    pad_token_id: null
    pretraining_tp: 1
    rms_norm_eps: 1.0e-05
    rope_theta: 10000.0
    sliding_window_size: 4096
    tie_word_embeddings: false
    use_cache: true
    vocab_size: 32000
optimizer: null
parallelism:
  dp: 2
  pp: 2
  pp_engine: 1f1b
  recompute_granularity: SELECTIVE
  tp: 2
  tp_linear_async_communication: true
  tp_mode: REDUCE_SCATTER
profiler: null
tokenizer:
  tokenizer_max_length: null
  tokenizer_name_or_path: mistralai/Mistral-7B-v0.1
  tokenizer_revision: null
tokens: null
```
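As a sanity check on the `model_config` values, here is a minimal sketch (an assumption for illustration, not part of the config tooling) that loads the file with PyYAML, assuming it has been saved as `config.yaml`, and re-derives the approximate parameter count from the standard Mistral-style decoder layout (grouped-query attention, SwiGLU MLP, RMSNorm, untied embeddings):

```python
# Minimal sketch: re-derive the approximate parameter count from the
# model_config above. Assumes the config is saved as config.yaml and a
# standard Mistral-style decoder (GQA, SwiGLU MLP, two RMSNorms per layer,
# untied input/output embeddings); this is not a nanotron API.
import yaml

with open("config.yaml") as f:
    cfg = yaml.safe_load(f)

m = cfg["model"]["model_config"]
h, vocab = m["hidden_size"], m["vocab_size"]
head_dim = h // m["num_attention_heads"]       # 4096 / 32 = 128
kv_dim = m["num_key_value_heads"] * head_dim   # 8 * 128 = 1024 (GQA)

attn = 2 * h * h + 2 * h * kv_dim              # q/o projections + shrunken k/v projections
mlp = 3 * h * m["intermediate_size"]           # gate, up, down (SwiGLU)
per_layer = attn + mlp + 2 * h                 # plus two RMSNorm weight vectors

embed = vocab * h
lm_head = 0 if m["tie_word_embeddings"] else vocab * h
total = embed + m["num_hidden_layers"] * per_layer + h + lm_head  # +h: final RMSNorm
print(f"~{total / 1e9:.2f}B parameters")       # prints ~7.24B for the values above
```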
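The `parallelism` block maps this run onto dp × tp × pp = 2 × 2 × 2 = 8 GPUs, combining 2-way data, tensor, and pipeline parallelism. `pp_engine: 1f1b` selects the one-forward-one-backward pipeline schedule, `recompute_granularity: SELECTIVE` enables selective activation recomputation, and `tp_mode: REDUCE_SCATTER` with `tp_linear_async_communication: true` uses reduce-scatter collectives for the tensor-parallel linear layers, overlapping communication with compute.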