Update README.md
Browse files
    	
        README.md
    CHANGED
    
    | @@ -2,112 +2,103 @@ | |
| 2 | 
             
            library_name: transformers
         | 
| 3 | 
             
            tags:
         | 
| 4 | 
             
            - generated_from_trainer
         | 
| 5 | 
            -
             | 
| 6 | 
            -
            -  | 
| 7 | 
            -
             | 
|  | |
|  | |
|  | |
| 8 | 
             
            ---
         | 
| 9 |  | 
| 10 | 
            -
             | 
| 11 | 
            -
            should probably proofread and complete it, then remove this comment. -->
         | 
| 12 | 
            -
             | 
| 13 | 
            -
            [<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
         | 
| 14 | 
            -
            <details><summary>See axolotl config</summary>
         | 
| 15 |  | 
| 16 | 
            -
             | 
| 17 | 
            -
             | 
| 18 | 
            -
             | 
| 19 |  | 
| 20 | 
            -
             | 
| 21 | 
            -
             | 
| 22 | 
            -
             | 
| 23 | 
            -
             | 
| 24 | 
            -
             | 
| 25 |  | 
| 26 | 
            -
            chat_template: qwen_25
         | 
| 27 |  | 
| 28 | 
            -
             | 
| 29 | 
            -
             | 
| 30 |  | 
| 31 | 
            -
             | 
| 32 | 
            -
            sample_packing: true
         | 
| 33 |  | 
| 34 | 
            -
             | 
| 35 | 
            -
            micro_batch_size: 1
         | 
| 36 | 
            -
            num_epochs: 1
         | 
| 37 | 
            -
            auto_resume_from_checkpoints: true
         | 
| 38 |  | 
| 39 | 
            -
             | 
| 40 | 
            -
            lr_scheduler: warmup_stable_decay
         | 
| 41 | 
            -
            learning_rate: 3e-6
         | 
| 42 | 
            -
            lr_scheduler_kwargs:
         | 
| 43 | 
            -
              num_decay_steps: 300
         | 
| 44 | 
            -
              min_lr_ratio: 0.1
         | 
| 45 | 
            -
            warmup_steps: 150
         | 
| 46 |  | 
| 47 | 
            -
             | 
| 48 | 
            -
             | 
|  | |
| 49 |  | 
| 50 | 
            -
             | 
| 51 | 
            -
            logging_steps: 10
         | 
| 52 | 
            -
            flash_attention: true
         | 
| 53 |  | 
| 54 | 
            -
             | 
| 55 | 
            -
             | 
| 56 | 
            -
             | 
| 57 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 58 |  | 
| 59 | 
            -
             | 
| 60 | 
            -
            deepspeed: /lustre/fswork/projects/rech/qwv/udv55np/axolotl/zero3.json
         | 
| 61 | 
            -
            special_tokens:
         | 
| 62 | 
            -
              bos_token: "<|im_start|>"
         | 
| 63 | 
            -
              eos_token: "<|im_end|>"
         | 
| 64 | 
            -
              pad_token: "<|endoftext|>"
         | 
| 65 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 66 | 
             
            ```
         | 
| 67 | 
            -
             | 
| 68 | 
            -
            </details><br>
         | 
| 69 | 
            -
             | 
| 70 | 
            -
            # lustre/fswork/projects/rech/dgo/udv55np/math/Qwen3-235B-A22B/Qwen2.5-14B_reasoning/1
         | 
| 71 | 
            -
             | 
| 72 | 
            -
            This model was trained from scratch on the None dataset.
         | 
| 73 | 
            -
             | 
| 74 | 
            -
            ## Model description
         | 
| 75 | 
            -
             | 
| 76 | 
            -
            More information needed
         | 
| 77 | 
            -
             | 
| 78 | 
            -
            ## Intended uses & limitations
         | 
| 79 | 
            -
             | 
| 80 | 
            -
            More information needed
         | 
| 81 | 
            -
             | 
| 82 | 
            -
            ## Training and evaluation data
         | 
| 83 | 
            -
             | 
| 84 | 
            -
            More information needed
         | 
| 85 | 
            -
             | 
| 86 | 
            -
            ## Training procedure
         | 
| 87 | 
            -
             | 
| 88 | 
            -
            ### Training hyperparameters
         | 
| 89 | 
            -
             | 
| 90 | 
            -
            The following hyperparameters were used during training:
         | 
| 91 | 
            -
            - learning_rate: 3e-06
         | 
| 92 | 
            -
            - train_batch_size: 1
         | 
| 93 | 
            -
            - eval_batch_size: 1
         | 
| 94 | 
            -
            - seed: 42
         | 
| 95 | 
            -
            - distributed_type: multi-GPU
         | 
| 96 | 
            -
            - num_devices: 16
         | 
| 97 | 
            -
            - total_train_batch_size: 16
         | 
| 98 | 
            -
            - total_eval_batch_size: 16
         | 
| 99 | 
            -
            - optimizer: Use OptimizerNames.ADAMW_TORCH_FUSED with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
         | 
| 100 | 
            -
            - lr_scheduler_type: warmup_stable_decay
         | 
| 101 | 
            -
            - lr_scheduler_warmup_steps: 150
         | 
| 102 | 
            -
            - training_steps: 9771
         | 
| 103 | 
            -
             | 
| 104 | 
            -
            ### Training results
         | 
| 105 | 
            -
             | 
| 106 | 
            -
             | 
| 107 | 
            -
             | 
| 108 | 
            -
            ### Framework versions
         | 
| 109 | 
            -
             | 
| 110 | 
            -
            - Transformers 4.55.2
         | 
| 111 | 
            -
            - Pytorch 2.6.0+cu124
         | 
| 112 | 
            -
            - Datasets 4.0.0
         | 
| 113 | 
            -
            - Tokenizers 0.21.1
         | 
|  | |
| 2 | 
             
            library_name: transformers
         | 
| 3 | 
             
            tags:
         | 
| 4 | 
             
            - generated_from_trainer
         | 
| 5 | 
            +
            datasets:
         | 
| 6 | 
            +
            - When-Does-Reasoning-Matter/general-reasoning-ift-pairs
         | 
| 7 | 
            +
            - When-Does-Reasoning-Matter/math-reasoning-ift-pairs
         | 
| 8 | 
            +
            language:
         | 
| 9 | 
            +
            - en
         | 
| 10 | 
            +
            pipeline_tag: text-generation
         | 
| 11 | 
             
            ---
         | 
| 12 |  | 
| 13 | 
            +
            # When Does Reasoning Matter?
         | 
|  | |
|  | |
|  | |
|  | |
| 14 |  | 
| 15 | 
            +
            <p align="left">
         | 
| 16 | 
            +
              <img src="https://cdn-avatars.huggingface.co/v1/production/uploads/62be186a5f59ff2320e6e32b/GjJ15tY7-F4bqR96FN4pd.png" alt="Dataset Icon" width="180"/>
         | 
| 17 | 
            +
            </p>
         | 
| 18 |  | 
| 19 | 
            +
            <p align="left">
         | 
| 20 | 
            +
            <a href="https://arxiv.org/pdf/2509.22193" target="_blank" rel="noopener noreferrer">
         | 
| 21 | 
            +
              <img src="https://img.shields.io/badge/arXiv-2509.22193-b31b1b.svg?style=for-the-badge" alt="arXiv:2509.22193" />
         | 
| 22 | 
            +
              </a>
         | 
| 23 | 
            +
            </p>
         | 
| 24 |  | 
|  | |
| 25 |  | 
| 26 | 
            +
            This model was trained as part of the paper [When Does Reasoning Matter?](https://arxiv.org/pdf/2509.22193)
         | 
| 27 | 
            +
            It belongs to a collection of **General and Math-specific student models** distilled from Instruction-Fine-Tuned (IFT) or Reasoning answers generated by [Qwen/Qwen3-235B-A22B](https://huggingface.co/Qwen/Qwen3-235B-A22B).
         | 
| 28 |  | 
| 29 | 
            +
            <img src="https://huggingface.co/When-Does-Reasoning-Matter/Qwen2.5-0.5B-ift/resolve/main/results.png" alt="results" width="600"/>
         | 
|  | |
| 30 |  | 
| 31 | 
            +
            ---
         | 
|  | |
|  | |
|  | |
| 32 |  | 
| 33 | 
            +
            ## Datasets
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 34 |  | 
| 35 | 
            +
            These models were trained on the **largest set of IFT and Reasoning answer pairs**:
         | 
| 36 | 
            +
            - **General dataset**: [general-reasoning-ift-pairs](https://huggingface.co/datasets/When-Does-Reasoning-Matter/general-reasoning-ift-pairs)  
         | 
| 37 | 
            +
            - **Math dataset**: [math-reasoning-ift-pairs](https://huggingface.co/datasets/When-Does-Reasoning-Matter/math-reasoning-ift-pairs)  
         | 
| 38 |  | 
| 39 | 
            +
            ---
         | 
|  | |
|  | |
| 40 |  | 
| 41 | 
            +
            ## Available Models
         | 
| 42 | 
            +
             | 
| 43 | 
            +
            <table>
         | 
| 44 | 
            +
              <thead>
         | 
| 45 | 
            +
                <tr>
         | 
| 46 | 
            +
                  <th colspan="2">General</th>
         | 
| 47 | 
            +
                  <th colspan="2">Math</th>
         | 
| 48 | 
            +
                </tr>
         | 
| 49 | 
            +
                <tr>
         | 
| 50 | 
            +
                  <th>IFT Models</th>
         | 
| 51 | 
            +
                  <th>Reasoning Models</th>
         | 
| 52 | 
            +
                  <th>IFT Models</th>
         | 
| 53 | 
            +
                  <th>Reasoning Models</th>
         | 
| 54 | 
            +
                </tr>
         | 
| 55 | 
            +
              </thead>
         | 
| 56 | 
            +
              <tbody>
         | 
| 57 | 
            +
                <tr>
         | 
| 58 | 
            +
                  <td><a href="https://huggingface.co/When-Does-Reasoning-Matter/Qwen2.5-0.5B-ift">Qwen2.5-0.5B-ift</a></td>
         | 
| 59 | 
            +
                  <td><a href="https://huggingface.co/When-Does-Reasoning-Matter/Qwen2.5-0.5B-reasoning">Qwen2.5-0.5B-reasoning</a></td>
         | 
| 60 | 
            +
                  <td><a href="https://huggingface.co/When-Does-Reasoning-Matter/Qwen2.5-0.5B-math-ift">Qwen2.5-0.5B-math-ift</a></td>
         | 
| 61 | 
            +
                  <td><a href="https://huggingface.co/When-Does-Reasoning-Matter/Qwen2.5-0.5B-math-reasoning">Qwen2.5-0.5B-math-reasoning</a></td>
         | 
| 62 | 
            +
                </tr>
         | 
| 63 | 
            +
                <tr>
         | 
| 64 | 
            +
                  <td><a href="https://huggingface.co/When-Does-Reasoning-Matter/Qwen2.5-1.5B-ift">Qwen2.5-1.5B-ift</a></td>
         | 
| 65 | 
            +
                  <td><a href="https://huggingface.co/When-Does-Reasoning-Matter/Qwen2.5-1.5B-reasoning">Qwen2.5-1.5B-reasoning</a></td>
         | 
| 66 | 
            +
                  <td><a href="https://huggingface.co/When-Does-Reasoning-Matter/Qwen2.5-1.5B-math-ift">Qwen2.5-1.5B-math-ift</a></td>
         | 
| 67 | 
            +
                  <td><a href="https://huggingface.co/When-Does-Reasoning-Matter/Qwen2.5-1.5B-math-reasoning">Qwen2.5-1.5B-math-reasoning</a></td>
         | 
| 68 | 
            +
                </tr>
         | 
| 69 | 
            +
                <tr>
         | 
| 70 | 
            +
                  <td><a href="https://huggingface.co/When-Does-Reasoning-Matter/Qwen2.5-3B-ift">Qwen2.5-3B-ift</a></td>
         | 
| 71 | 
            +
                  <td><a href="https://huggingface.co/When-Does-Reasoning-Matter/Qwen2.5-3B-reasoning">Qwen2.5-3B-reasoning</a></td>
         | 
| 72 | 
            +
                  <td><a href="https://huggingface.co/When-Does-Reasoning-Matter/Qwen2.5-3B-math-ift">Qwen2.5-3B-math-ift</a></td>
         | 
| 73 | 
            +
                  <td><a href="https://huggingface.co/When-Does-Reasoning-Matter/Qwen2.5-3B-math-reasoning">Qwen2.5-3B-math-reasoning</a></td>
         | 
| 74 | 
            +
                </tr>
         | 
| 75 | 
            +
                <tr>
         | 
| 76 | 
            +
                  <td><a href="https://huggingface.co/When-Does-Reasoning-Matter/Qwen2.5-7B-ift">Qwen2.5-7B-ift</a></td>
         | 
| 77 | 
            +
                  <td><a href="https://huggingface.co/When-Does-Reasoning-Matter/Qwen2.5-7B-reasoning">Qwen2.5-7B-reasoning</a></td>
         | 
| 78 | 
            +
                  <td><a href="https://huggingface.co/When-Does-Reasoning-Matter/Qwen2.5-7B-math-ift">Qwen2.5-7B-math-ift</a></td>
         | 
| 79 | 
            +
                  <td><a href="https://huggingface.co/When-Does-Reasoning-Matter/Qwen2.5-7B-math-reasoning">Qwen2.5-7B-math-reasoning</a></td>
         | 
| 80 | 
            +
                </tr>
         | 
| 81 | 
            +
                <tr>
         | 
| 82 | 
            +
                  <td><a href="https://huggingface.co/When-Does-Reasoning-Matter/Qwen2.5-14B-ift">Qwen2.5-14B-ift</a></td>
         | 
| 83 | 
            +
                  <td><a href="https://huggingface.co/When-Does-Reasoning-Matter/Qwen2.5-14B-reasoning">Qwen2.5-14B-reasoning</a></td>
         | 
| 84 | 
            +
                  <td><a href="https://huggingface.co/When-Does-Reasoning-Matter/Qwen2.5-14B-math-ift">Qwen2.5-14B-math-ift</a></td>
         | 
| 85 | 
            +
                  <td><a href="https://huggingface.co/When-Does-Reasoning-Matter/Qwen2.5-14B-math-reasoning">Qwen2.5-14B-math-reasoning</a></td>
         | 
| 86 | 
            +
                </tr>
         | 
| 87 | 
            +
              </tbody>
         | 
| 88 | 
            +
            </table>
         | 
| 89 |  | 
| 90 | 
            +
            ---
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
| 91 |  | 
| 92 | 
            +
            If you use this model in your work, please cite: **[When Does Reasoning Matter?](https://arxiv.org/pdf/2509.22193)**
         | 
| 93 | 
            +
             | 
| 94 | 
            +
            ```bibtex
         | 
| 95 | 
            +
            @misc{boizard2025doesreasoningmattercontrolled,
         | 
| 96 | 
            +
                  title={When Does Reasoning Matter? A Controlled Study of Reasoning's Contribution to Model Performance}, 
         | 
| 97 | 
            +
                  author={Nicolas Boizard and Hippolyte Gisserot-Boukhlef and Kevin El-Haddad and Céline Hudelot and Pierre Colombo},
         | 
| 98 | 
            +
                  year={2025},
         | 
| 99 | 
            +
                  eprint={2509.22193},
         | 
| 100 | 
            +
                  archivePrefix={arXiv},
         | 
| 101 | 
            +
                  primaryClass={cs.CL},
         | 
| 102 | 
            +
                  url={https://arxiv.org/abs/2509.22193}, 
         | 
| 103 | 
            +
            }
         | 
| 104 | 
             
            ```
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 

