sunblaze-ucb
/

Qwen2.5-3B-GRPO-MATH-1EPOCH

Text Generation

Model card Files Files and versions Community

Xuandong committed 12 days ago

Commit

6cfb9d6

·

verified ·

1 Parent(s): f0c2736

Update Readme

Files changed (1) hide show

README.md +7 -0

README.md CHANGED Viewed

@@ -21,6 +21,13 @@ A GRPO-fine-tuned version of Qwen2.5-3B trained on the MATH dataset.
 ## Citation
 ```bibtex
 @article{sha2024deepseekmath,
   title     = {DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models},
   author    = {Shao, Zhihong and Wang, Peiyi and Zhu, Qihao and Xu, Runxin and Song, Junxiao and Bi, Xiao and … Guo, Daya},

 ## Citation
 ```bibtex
+@misc{zhao2025learning,
+  title         = {Learning to Reason without External Rewards},
+  author        = {Zhao, Xuandong and Kang, Zhewei and Feng, Aosong and Levine, Sergey and Song, Dawn},
+  eprint        = {2505.19590},
+  archiveprefix = {arXiv},
+  year          = {2025}
+}
 @article{sha2024deepseekmath,
   title     = {DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models},
   author    = {Shao, Zhihong and Wang, Peiyi and Zhu, Qihao and Xu, Runxin and Song, Junxiao and Bi, Xiao and … Guo, Daya},