intervitens committed
Commit a7c11fe · verified · 1 Parent(s): 2fb5072

Upload folder using huggingface_hub

Files changed (46)
  1. LICENSE +77 -0
  2. Notice.txt +160 -0
  3. README.md +329 -0
  4. README_CN.md +456 -0
  5. config.json +203 -0
  6. configuration_hunyuan.py +319 -0
  7. generation_config.json +13 -0
  8. hunyuan.py +851 -0
  9. hy.tiktoken +0 -0
  10. model-00001-of-00033.safetensors +3 -0
  11. model-00002-of-00033.safetensors +3 -0
  12. model-00003-of-00033.safetensors +3 -0
  13. model-00004-of-00033.safetensors +3 -0
  14. model-00005-of-00033.safetensors +3 -0
  15. model-00006-of-00033.safetensors +3 -0
  16. model-00007-of-00033.safetensors +3 -0
  17. model-00008-of-00033.safetensors +3 -0
  18. model-00009-of-00033.safetensors +3 -0
  19. model-00010-of-00033.safetensors +3 -0
  20. model-00011-of-00033.safetensors +3 -0
  21. model-00012-of-00033.safetensors +3 -0
  22. model-00013-of-00033.safetensors +3 -0
  23. model-00014-of-00033.safetensors +3 -0
  24. model-00015-of-00033.safetensors +3 -0
  25. model-00016-of-00033.safetensors +3 -0
  26. model-00017-of-00033.safetensors +3 -0
  27. model-00018-of-00033.safetensors +3 -0
  28. model-00019-of-00033.safetensors +3 -0
  29. model-00020-of-00033.safetensors +3 -0
  30. model-00021-of-00033.safetensors +3 -0
  31. model-00022-of-00033.safetensors +3 -0
  32. model-00023-of-00033.safetensors +3 -0
  33. model-00024-of-00033.safetensors +3 -0
  34. model-00025-of-00033.safetensors +3 -0
  35. model-00026-of-00033.safetensors +3 -0
  36. model-00027-of-00033.safetensors +3 -0
  37. model-00028-of-00033.safetensors +3 -0
  38. model-00029-of-00033.safetensors +3 -0
  39. model-00030-of-00033.safetensors +3 -0
  40. model-00031-of-00033.safetensors +3 -0
  41. model-00032-of-00033.safetensors +3 -0
  42. model-00033-of-00033.safetensors +3 -0
  43. model.safetensors.index.json +0 -0
  44. modeling_hunyuan.py +1728 -0
  45. tokenization_hy.py +298 -0
  46. tokenizer_config.json +18 -0
LICENSE ADDED
@@ -0,0 +1,77 @@
1
+ TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT
2
+ Tencent Hunyuan A13B Release Date: June 27, 2025
3
+ THIS LICENSE AGREEMENT DOES NOT APPLY IN THE EUROPEAN UNION, UNITED KINGDOM AND SOUTH KOREA AND IS EXPRESSLY LIMITED TO THE TERRITORY, AS DEFINED BELOW.
4
+ By clicking to agree or by using, reproducing, modifying, distributing, performing or displaying any portion or element of the Tencent Hunyuan Works, including via any Hosted Service, You will be deemed to have recognized and accepted the content of this Agreement, which is effective immediately.
5
+ 1. DEFINITIONS.
6
+ a. “Acceptable Use Policy” shall mean the policy made available by Tencent as set forth in the Exhibit A.
7
+ b. “Agreement” shall mean the terms and conditions for use, reproduction, distribution, modification, performance and displaying of Tencent Hunyuan Works or any portion or element thereof set forth herein.
8
+ c. “Documentation” shall mean the specifications, manuals and documentation for Tencent Hunyuan made publicly available by Tencent.
9
+ d. “Hosted Service” shall mean a hosted service offered via an application programming interface (API), web access, or any other electronic or remote means.
10
+ e. “Licensee,” “You” or “Your” shall mean a natural person or legal entity exercising the rights granted by this Agreement and/or using the Tencent Hunyuan Works for any purpose and in any field of use.
11
+ f. “Materials” shall mean, collectively, Tencent’s proprietary Tencent Hunyuan and Documentation (and any portion thereof) as made available by Tencent under this Agreement.
12
+ g. “Model Derivatives” shall mean all: (i) modifications to Tencent Hunyuan or any Model Derivative of Tencent Hunyuan; (ii) works based on Tencent Hunyuan or any Model Derivative of Tencent Hunyuan; or (iii) any other machine learning model which is created by transfer of patterns of the weights, parameters, operations, or Output of Tencent Hunyuan or any Model Derivative of Tencent Hunyuan, to that model in order to cause that model to perform similarly to Tencent Hunyuan or a Model Derivative of Tencent Hunyuan, including distillation methods, methods that use intermediate data representations, or methods based on the generation of synthetic data Outputs by Tencent Hunyuan or a Model Derivative of Tencent Hunyuan for training that model. For clarity, Outputs by themselves are not deemed Model Derivatives.
13
+ h. “Output” shall mean the information and/or content output of Tencent Hunyuan or a Model Derivative that results from operating or otherwise using Tencent Hunyuan or a Model Derivative, including via a Hosted Service.
14
+ i. “Tencent,” “We” or “Us” shall mean the applicable entity or entities in the Tencent corporate family that own(s) intellectual property or other rights embodied in or utilized by the Materials.
15
+ j. “Tencent Hunyuan” shall mean the large language models, text/image/video/audio/3D generation models, and multimodal large language models and their software and algorithms, including trained model weights, parameters (including optimizer states), machine-learning model code, inference-enabling code, training-enabling code, fine-tuning enabling code and other elements of the foregoing made publicly available by Us, including, without limitation to, Tencent Hunyuan A13B released at [https://github.com/Tencent-Hunyuan/Hunyuan-A13B].
16
+ k. “Tencent Hunyuan Works” shall mean: (i) the Materials; (ii) Model Derivatives; and (iii) all derivative works thereof.
17
+ l. “Territory” shall mean the worldwide territory, excluding the territory of the European Union, United Kingdom and South Korea.
18
+ m. “Third Party” or “Third Parties” shall mean individuals or legal entities that are not under common control with Us or You.
19
+ n. “including” shall mean including but not limited to.
20
+ 2. GRANT OF RIGHTS.
21
+ We grant You, for the Territory only, a non-exclusive, non-transferable and royalty-free limited license under Tencent’s intellectual property or other rights owned by Us embodied in or utilized by the Materials to use, reproduce, distribute, create derivative works of (including Model Derivatives), and make modifications to the Materials, only in accordance with the terms of this Agreement and the Acceptable Use Policy, and You must not violate (or encourage or permit anyone else to violate) any term of this Agreement or the Acceptable Use Policy.
22
+ 3. DISTRIBUTION.
23
+ You may, subject to Your compliance with this Agreement, distribute or make available to Third Parties the Tencent Hunyuan Works, exclusively in the Territory, provided that You meet all of the following conditions:
24
+ a. You must provide all such Third Party recipients of the Tencent Hunyuan Works or products or services using them a copy of this Agreement;
25
+ b. You must cause any modified files to carry prominent notices stating that You changed the files;
26
+ c. You are encouraged to: (i) publish at least one technology introduction blogpost or one public statement expressing Your experience of using the Tencent Hunyuan Works; and (ii) mark the products or services developed by using the Tencent Hunyuan Works to indicate that the product/service is “Powered by Tencent Hunyuan”; and
27
+ d. All distributions to Third Parties (other than through a Hosted Service) must be accompanied by a “Notice” text file that contains the following notice: “Tencent Hunyuan is licensed under the Tencent Hunyuan Community License Agreement, Copyright © 2025 Tencent. All Rights Reserved. The trademark rights of “Tencent Hunyuan” are owned by Tencent or its affiliate.”
28
+ You may add Your own copyright statement to Your modifications and, except as set forth in this Section and in Section 5, may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Model Derivatives as a whole, provided Your use, reproduction, modification, distribution, performance and display of the work otherwise complies with the terms and conditions of this Agreement (including as regards the Territory). If You receive Tencent Hunyuan Works from a Licensee as part of an integrated end user product, then this Section 3 of this Agreement will not apply to You.
29
+ 4. ADDITIONAL COMMERCIAL TERMS.
30
+ If, on the Tencent Hunyuan version release date, the monthly active users of all products or services made available by or for Licensee is greater than 100 million monthly active users in the preceding calendar month, You must request a license from Tencent, which Tencent may grant to You in its sole discretion, and You are not authorized to exercise any of the rights under this Agreement unless or until Tencent otherwise expressly grants You such rights.
31
+ 5. RULES OF USE.
32
+ a. Your use of the Tencent Hunyuan Works must comply with applicable laws and regulations (including trade compliance laws and regulations) and adhere to the Acceptable Use Policy for the Tencent Hunyuan Works, which is hereby incorporated by reference into this Agreement. You must include the use restrictions referenced in these Sections 5(a) and 5(b) as an enforceable provision in any agreement (e.g., license agreement, terms of use, etc.) governing the use and/or distribution of Tencent Hunyuan Works and You must provide notice to subsequent users to whom You distribute that Tencent Hunyuan Works are subject to the use restrictions in these Sections 5(a) and 5(b).
33
+ b. You must not use the Tencent Hunyuan Works or any Output or results of the Tencent Hunyuan Works to improve any other AI model (other than Tencent Hunyuan or Model Derivatives thereof).
34
+ c. You must not use, reproduce, modify, distribute, or display the Tencent Hunyuan Works, Output or results of the Tencent Hunyuan Works outside the Territory. Any such use outside the Territory is unlicensed and unauthorized under this Agreement.
35
+ 6. INTELLECTUAL PROPERTY.
36
+ a. Subject to Tencent’s ownership of Tencent Hunyuan Works made by or for Tencent and intellectual property rights therein, conditioned upon Your compliance with the terms and conditions of this Agreement, as between You and Tencent, You will be the owner of any derivative works and modifications of the Materials and any Model Derivatives that are made by or for You.
37
+ b. No trademark licenses are granted under this Agreement, and in connection with the Tencent Hunyuan Works, Licensee may not use any name or mark owned by or associated with Tencent or any of its affiliates, except as required for reasonable and customary use in describing and distributing the Tencent Hunyuan Works. Tencent hereby grants You a license to use “Tencent Hunyuan” (the “Mark”) in the Territory solely as required to comply with the provisions of Section 3(c), provided that You comply with any applicable laws related to trademark protection. All goodwill arising out of Your use of the Mark will inure to the benefit of Tencent.
38
+ c. If You commence a lawsuit or other proceedings (including a cross-claim or counterclaim in a lawsuit) against Us or any person or entity alleging that the Materials or any Output, or any portion of any of the foregoing, infringe any intellectual property or other right owned or licensable by You, then all licenses granted to You under this Agreement shall terminate as of the date such lawsuit or other proceeding is filed. You will defend, indemnify and hold harmless Us from and against any claim by any Third Party arising out of or related to Your or the Third Party’s use or distribution of the Tencent Hunyuan Works.
39
+ d. Tencent claims no rights in Outputs You generate. You and Your users are solely responsible for Outputs and their subsequent uses.
40
+ 7. DISCLAIMERS OF WARRANTY AND LIMITATIONS OF LIABILITY.
41
+ a. We are not obligated to support, update, provide training for, or develop any further version of the Tencent Hunyuan Works or to grant any license thereto.
42
+ b. UNLESS AND ONLY TO THE EXTENT REQUIRED BY APPLICABLE LAW, THE TENCENT HUNYUAN WORKS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED “AS IS” WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES OF ANY KIND INCLUDING ANY WARRANTIES OF TITLE, MERCHANTABILITY, NONINFRINGEMENT, COURSE OF DEALING, USAGE OF TRADE, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING, REPRODUCING, MODIFYING, PERFORMING, DISPLAYING OR DISTRIBUTING ANY OF THE TENCENT HUNYUAN WORKS OR OUTPUTS AND ASSUME ANY AND ALL RISKS ASSOCIATED WITH YOUR OR A THIRD PARTY’S USE OR DISTRIBUTION OF ANY OF THE TENCENT HUNYUAN WORKS OR OUTPUTS AND YOUR EXERCISE OF RIGHTS AND PERMISSIONS UNDER THIS AGREEMENT.
43
+ c. TO THE FULLEST EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL TENCENT OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, FOR ANY DAMAGES, INCLUDING ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, EXEMPLARY, CONSEQUENTIAL OR PUNITIVE DAMAGES, OR LOST PROFITS OF ANY KIND ARISING FROM THIS AGREEMENT OR RELATED TO ANY OF THE TENCENT HUNYUAN WORKS OR OUTPUTS, EVEN IF TENCENT OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.
44
+ 8. SURVIVAL AND TERMINATION.
45
+ a. The term of this Agreement shall commence upon Your acceptance of this Agreement or access to the Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein.
46
+ b. We may terminate this Agreement if You breach any of the terms or conditions of this Agreement. Upon termination of this Agreement, You must promptly delete and cease use of the Tencent Hunyuan Works. Sections 6(a), 6(c), 7 and 9 shall survive the termination of this Agreement.
47
+ 9. GOVERNING LAW AND JURISDICTION.
48
+ a. This Agreement and any dispute arising out of or relating to it will be governed by the laws of the Hong Kong Special Administrative Region of the People’s Republic of China, without regard to conflict of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement.
49
+ b. Exclusive jurisdiction and venue for any dispute arising out of or relating to this Agreement will be a court of competent jurisdiction in the Hong Kong Special Administrative Region of the People’s Republic of China, and Tencent and Licensee consent to the exclusive jurisdiction of such court with respect to any such dispute.
50
+
51
+ EXHIBIT A
52
+ ACCEPTABLE USE POLICY
53
+
54
+ Tencent reserves the right to update this Acceptable Use Policy from time to time.
55
+ Last modified: November 5, 2024
56
+
57
+ Tencent endeavors to promote safe and fair use of its tools and features, including Tencent Hunyuan. You agree not to use Tencent Hunyuan or Model Derivatives:
58
+ 1. Outside the Territory;
59
+ 2. In any way that violates any applicable national, federal, state, local, international or any other law or regulation;
60
+ 3. To harm Yourself or others;
61
+ 4. To repurpose or distribute output from Tencent Hunyuan or any Model Derivatives to harm Yourself or others;
62
+ 5. To override or circumvent the safety guardrails and safeguards We have put in place;
63
+ 6. For the purpose of exploiting, harming or attempting to exploit or harm minors in any way;
64
+ 7. To generate or disseminate verifiably false information and/or content with the purpose of harming others or influencing elections;
65
+ 8. To generate or facilitate false online engagement, including fake reviews and other means of fake online engagement;
66
+ 9. To intentionally defame, disparage or otherwise harass others;
67
+ 10. To generate and/or disseminate malware (including ransomware) or any other content to be used for the purpose of harming electronic systems;
68
+ 11. To generate or disseminate personal identifiable information with the purpose of harming others;
69
+ 12. To generate or disseminate information (including images, code, posts, articles), and place the information in any public context (including –through the use of bot generated tweets), without expressly and conspicuously identifying that the information and/or content is machine generated;
70
+ 13. To impersonate another individual without consent, authorization, or legal right;
71
+ 14. To make high-stakes automated decisions in domains that affect an individual’s safety, rights or wellbeing (e.g., law enforcement, migration, medicine/health, management of critical infrastructure, safety components of products, essential services, credit, employment, housing, education, social scoring, or insurance);
72
+ 15. In a manner that violates or disrespects the social ethics and moral standards of other countries or regions;
73
+ 16. To perform, facilitate, threaten, incite, plan, promote or encourage violent extremism or terrorism;
74
+ 17. For any use intended to discriminate against or harm individuals or groups based on protected characteristics or categories, online or offline social behavior or known or predicted personal or personality characteristics;
75
+ 18. To intentionally exploit any of the vulnerabilities of a specific group of persons based on their age, social, physical or mental characteristics, in order to materially distort the behavior of a person pertaining to that group in a manner that causes or is likely to cause that person or another person physical or psychological harm;
76
+ 19. For military purposes;
77
+ 20. To engage in the unauthorized or unlicensed practice of any profession including, but not limited to, financial, legal, medical/health, or other professional practices.
Notice.txt ADDED
@@ -0,0 +1,160 @@
1
+ Usage and Legal Notices:
2
+
3
+ Tencent is pleased to support the open source community by making Tencent Hunyuan A13B available.
4
+
5
+ Copyright (C) Tencent. All rights reserved. The below software and/or models in this distribution may have been modified by Tencent ("Tencent Modifications"). All Tencent Modifications are Copyright (C) Tencent.
6
+
7
+ Tencent Hunyuan A13B is licensed under TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT, which can be found in this repository called "LICENSE", except for the third-party components listed below. Tencent Hunyuan A13B does not impose any additional limitations beyond what is outlined in the respective licenses of these third-party components. Users must comply with all terms and conditions of original licenses of these third-party components and must ensure that the usage of the third party components adheres to all relevant laws and regulations.
8
+
9
+ For avoidance of doubts, Tencent Hunyuan A13B refers to the inference code, training code, parameters and the weights of Tencent Hunyuan A13B only, which are made publicly available by Tencent in accordance with the TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
10
+
11
+
12
+ Other dependencies and licenses:
13
+
14
+
15
+ Open Source Software Licensed under the Apache License Version 2.0:
16
+ The below software in this distribution may have been modified by Tencent ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2025 Tencent.
17
+ --------------------------------------------------------------------
18
+ 1. pytorch
19
+ Copyright 2016-2017 TorchAPI
20
+ Copyright 2016-2017 Contributors
21
+
22
+ 2. VLLM
23
+ Copyright (c) vllm original author and authors
24
+ Please note this software has been modified by Tencent in this distribution.
25
+
26
+ 3. transformers
27
+ Copyright 2018- The Hugging Face team. All rights reserved.
28
+
29
+ 4. accelerate
30
+ Copyright (c) accelerate original author and authors
31
+
32
+
33
+ Terms of the Apache License Version 2.0:
34
+ --------------------------------------------------------------------
35
+ Apache License
36
+
37
+ Version 2.0, January 2004
38
+
39
+ http://www.apache.org/licenses/
40
+
41
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
42
+ 1. Definitions.
43
+
44
+ "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document.
45
+
46
+ "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.
47
+
48
+ "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.
49
+
50
+ "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License.
51
+
52
+ "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files.
53
+
54
+ "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.
55
+
56
+ "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below).
57
+
58
+ "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof.
59
+
60
+ "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work.
63
+
64
+ 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.
65
+
66
+ 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed.
67
+
68
+ 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:
69
+
70
+ You must give any other recipients of the Work or Derivative Works a copy of this License; and
71
+
72
+ You must cause any modified files to carry prominent notices stating that You changed the files; and
73
+
74
+ You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and
75
+
76
+ If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License.
77
+
78
+ You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License.
79
+
80
+ 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions.
81
+
82
+ 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file.
83
+
84
+ 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License.
85
+
86
+ 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.
87
+
88
+ 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.
89
+
90
+ END OF TERMS AND CONDITIONS
91
+
92
+
93
+
94
+ Open Source Software Licensed under the BSD 3-Clause License and Other Licenses of the Third-Party Components therein:
95
+ --------------------------------------------------------------------
96
+ 1. pytorch
97
+ Copyright (c) 2016- Facebook, Inc (Adam Paszke)
98
+ Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
99
+ Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
100
+ Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
101
+ Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
102
+ Copyright (c) 2011-2013 NYU (Clement Farabet)
103
+ Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
104
+ Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
105
+ Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
106
+
107
+
108
+ Terms of the BSD 3-Clause:
109
+ --------------------------------------------------------------------
110
+ Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
111
+
112
+ 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
113
+
114
+ 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
115
+
116
+ 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
117
+
118
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
119
+
120
+ For the license of other third party components, please refer to the following URL:
121
+ https://github.com/pytorch/pytorch/blob/v2.1.1/NOTICE
122
+ https://github.com/pytorch/pytorch/tree/v2.1.1/third_party
123
+
124
+
125
+ Open Source Software Licensed under the BSD 3-Clause License:
126
+ --------------------------------------------------------------------
127
+ 1. flash_attn
128
+ Copyright (c) 2022, the respective contributors, as shown by the AUTHORS file.
129
+ All rights reserved.
130
+
131
+
132
+ A copy of the BSD 3-Clause is included in this file.
133
+
134
+
135
+
136
+ Open Source Software Licensed under the Apache License Version 2.0 and Other Licenses of the Third-Party Components therein:
137
+ The below software in this distribution is modified by Tencent ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2025 Tencent.
138
+ --------------------------------------------------------------------
139
+ 1. sglang
140
+ Copyright 2023-2024 SGLang Team
141
+
142
+
143
+ A copy of the Apache 2.0 is included in this file.
144
+
145
+ For the license of other third party components, please refer to the following URL:
146
+ https://github.com/sgl-project/sglang/tree/v0.4.7/3rdparty/amd
147
+
148
+
149
+
150
+ Open Source Software Licensed under the Apache License Version 2.0 and Other Licenses of the Third-Party Components therein:
151
+ The below software in this distribution is modified by Tencent ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2025 Tencent.
152
+ --------------------------------------------------------------------
153
+ 1. TensorRT-LLM
154
+ Copyright (c) TensorRT-LLM original author and authors
155
+
156
+
157
+ A copy of the Apache 2.0 is included in this file.
158
+
159
+ For the license of other third party components, please refer to the following URL:
160
+ https://github.com/NVIDIA/TensorRT-LLM/tree/v0.20.0/3rdparty
README.md ADDED
@@ -0,0 +1,329 @@
1
+ ---
2
+ license: other
3
+ license_name: tencent-hunyuan-a13b
4
+ license_link: https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/LICENSE
5
+ library_name: transformers
6
+ ---
7
+
8
+ <p align="center">
9
+ <img src="https://dscache.tencent-cloud.cn/upload/uploader/hunyuan-64b418fd052c033b228e04bc77bbc4b54fd7f5bc.png" width="400"/> <br>
10
+ </p><p></p>
11
+
12
+
13
+ <p align="center">
14
+ 🤗&nbsp;<a href="https://huggingface.co/tencent/Hunyuan-A13B-Instruct"><b>Hugging Face</b></a>&nbsp;&nbsp;|&nbsp;&nbsp;
15
+ 🖥️&nbsp;<a href="https://hunyuan.tencent.com" style="color: red;"><b>Official Website</b></a>&nbsp;&nbsp;|&nbsp;&nbsp;
16
+ 🕖&nbsp;<a href="https://cloud.tencent.com/product/hunyuan"><b>HunyuanAPI</b></a>&nbsp;&nbsp;|&nbsp;&nbsp;
17
+ 🕹️&nbsp;<a href="https://hunyuan.tencent.com/?model=hunyuan-a13b"><b>Demo</b></a>&nbsp;&nbsp;|&nbsp;&nbsp;
18
+ 🤖&nbsp;<a href="https://modelscope.cn/models/Tencent-Hunyuan/Hunyuan-A13B-Instruct"><b>ModelScope</b></a>
19
+ </p>
20
+
21
+
22
+ <p align="center">
23
+ <a href="https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf"><b>Technical Report</b> </a> |
24
+ <a href="https://github.com/Tencent-Hunyuan/Hunyuan-A13B"><b>GITHUB</b></a> |
25
+ <a href="https://cnb.cool/tencent/hunyuan/Hunyuan-A13B"><b>cnb.cool</b></a> |
26
+ <a href="https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/LICENSE"><b>LICENSE</b></a>
27
+ </p>
28
+
29
+
30
+
31
+ Welcome to the official repository of **Hunyuan-A13B**, an innovative and open-source large language model (LLM) built on a fine-grained Mixture-of-Experts (MoE) architecture. Designed for efficiency and scalability, Hunyuan-A13B delivers cutting-edge performance with minimal computational overhead, making it an ideal choice for advanced reasoning and general-purpose applications, especially in resource-constrained environments.
32
+
33
+ ## Model Introduction
34
+
35
+ With the rapid advancement of artificial intelligence technology, large language models (LLMs) have achieved remarkable progress in natural language processing, computer vision, and scientific tasks. However, as model scales continue to expand, optimizing resource consumption while maintaining high performance has become a critical challenge. To address this, we have explored Mixture of Experts (MoE) architectures. The newly introduced Hunyuan-A13B model features a total of 80 billion parameters with 13 billion active parameters. It not only delivers high-performance results but also achieves optimal resource efficiency, successfully balancing computational power and resource utilization.
36
+
37
+ ### Key Features and Advantages
38
+
39
+ - **Compact yet Powerful**: With only 13 billion active parameters (out of a total of 80 billion), the model delivers competitive performance on a wide range of benchmark tasks, rivaling much larger models.
40
+ - **Hybrid Reasoning Support**: Supports both fast and slow thinking modes, allowing users to flexibly choose according to their needs.
41
+ - **Ultra-Long Context Understanding**: Natively supports a 256K context window, maintaining stable performance on long-text tasks.
42
+ - **Enhanced Agent Capabilities**: Optimized for agent tasks, achieving leading results on benchmarks such as BFCL-v3, τ-Bench and C3-Bench.
43
+ - **Efficient Inference**: Utilizes Grouped Query Attention (GQA) and supports multiple quantization formats, enabling highly efficient inference.
44
+
45
+ ### Why Choose Hunyuan-A13B?
46
+
47
+ As a powerful yet computationally efficient large model, Hunyuan-A13B is an ideal choice for researchers and developers seeking high performance under resource constraints. Whether for academic research, cost-effective AI solution development, or innovative application exploration, this model provides a robust foundation for advancement.
48
+
49
+ &nbsp;
50
+
51
+ ## Related News
52
+ * 2025.6.27 We have open-sourced **Hunyuan-A13B-Pretrain**, **Hunyuan-A13B-Instruct**, **Hunyuan-A13B-Instruct-FP8**, and **Hunyuan-A13B-Instruct-GPTQ-Int4** on Hugging Face. In addition, we have released a <a href="report/Hunyuan_A13B_Technical_Report.pdf">technical report</a> and a training and inference operation manual, which provide detailed information about the model’s capabilities as well as the operations for training and inference.
53
+
54
+ <br>
55
+
56
+
57
+ ## Benchmark
58
+
59
+ Note: The following benchmarks are evaluated with the TRT-LLM backend on several **base models**.
60
+
61
+ | Model | Hunyuan-Large | Qwen2.5-72B | Qwen3-A22B | Hunyuan-A13B |
62
+ |------------------|---------------|--------------|-------------|---------------|
63
+ | MMLU | 88.40 | 86.10 | 87.81 | 88.17 |
64
+ | MMLU-Pro | 60.20 | 58.10 | 68.18 | 67.23 |
65
+ | MMLU-Redux | 87.47 | 83.90 | 87.40 | 87.67 |
66
+ | BBH | 86.30 | 85.80 | 88.87 | 87.56 |
67
+ | SuperGPQA | 38.90 | 36.20 | 44.06 | 41.32 |
68
+ | EvalPlus | 75.69 | 65.93 | 77.60 | 78.64 |
69
+ | MultiPL-E | 59.13 | 60.50 | 65.94 | 69.33 |
70
+ | MBPP | 72.60 | 76.00 | 81.40 | 83.86 |
71
+ | CRUX-I | 57.00 | 57.63 | - | 70.13 |
72
+ | CRUX-O | 60.63 | 66.20 | 79.00 | 77.00 |
73
+ | MATH | 69.80 | 62.12 | 71.84 | 72.35 |
74
+ | CMATH | 91.30 | 84.80 | - | 91.17 |
75
+ | GSM8k | 92.80 | 91.50 | 94.39 | 91.83 |
76
+ | GPQA | 25.18 | 45.90 | 47.47 | 49.12 |
77
+
78
+
79
+ Hunyuan-A13B-Instruct has achieved highly competitive performance across multiple benchmarks, particularly in mathematics, science, agent domains, and more. We compared it with several powerful models, and the results are shown below.
80
+
81
+ | Topic | Bench | OpenAI-o1-1217 | DeepSeek R1 | Qwen3-A22B | Hunyuan-A13B-Instruct |
82
+ |:-------------------:|:----------------------------------------------------:|:-------------:|:------------:|:-----------:|:---------------------:|
83
+ | **Mathematics** | AIME 2024<br>AIME 2025<br>MATH | 74.3<br>79.2<br>96.4 | 79.8<br>70<br>94.9 | 85.7<br>81.5<br>94.0 | 87.3<br>76.8<br>94.3 |
84
+ | **Science** | GPQA-Diamond<br>OlympiadBench | 78<br>83.1 | 71.5<br>82.4 | 71.1<br>85.7 | 71.2<br>82.7 |
85
+ | **Coding** | Livecodebench<br>Fullstackbench<br>ArtifactsBench | 63.9<br>64.6<br>38.6 | 65.9<br>71.6<br>44.6 | 70.7<br>65.6<br>44.6 | 63.9<br>67.8<br>43 |
86
+ | **Reasoning** | BBH<br>DROP<br>ZebraLogic | 80.4<br>90.2<br>81 | 83.7<br>92.2<br>78.7 | 88.9<br>90.3<br>80.3 | 89.1<br>91.1<br>84.7 |
87
+ | **Instruction<br>Following** | IF-Eval<br>SysBench | 91.8<br>82.5 | 88.3<br>77.7 | 83.4<br>74.2 | 84.7<br>76.1 |
88
+ | **Text<br>Creation**| LengthCtrl<br>InsCtrl | 60.1<br>74.8 | 55.9<br>69 | 53.3<br>73.7 | 55.4<br>71.9 |
89
+ | **NLU** | ComplexNLU<br>Word-Task | 64.7<br>67.1 | 64.5<br>76.3 | 59.8<br>56.4 | 61.2<br>62.9 |
90
+ | **Agent** | BFCL v3<br> τ-Bench<br>ComplexFuncBench<br> C3-Bench | 67.8<br>60.4<br>47.6<br>58.8 | 56.9<br>43.8<br>41.1<br>55.3 | 70.8<br>44.6<br>40.6<br>51.7 | 78.3<br>54.7<br>61.2<br>63.5 |
91
+
92
+
93
+ &nbsp;
94
+
95
+ ## Use with transformers
96
+
97
+ Our model defaults to using slow-thinking reasoning, and there are two ways to disable CoT reasoning.
98
+ 1. Pass "enable_thinking=False" when calling apply_chat_template.
99
+ 2. Adding "/no_think" before the prompt will force the model not to perform CoT reasoning. Similarly, adding "/think" before the prompt will force the model to perform CoT reasoning.
100
+
101
+ The following code snippet shows how to use the transformers library to load and apply the model. It also demonstrates how to enable and disable the reasoning mode, and how to parse the reasoning process along with the final output.
104
+
105
+
106
+
107
+ ```python
108
+ from transformers import AutoModelForCausalLM, AutoTokenizer
109
+ import os
110
+ import re
111
+
112
+ model_name_or_path = os.environ['MODEL_PATH']
113
+ # model_name_or_path = "tencent/Hunyuan-A13B-Instruct"
114
+
115
+ tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
116
+ model = AutoModelForCausalLM.from_pretrained(model_name_or_path, device_map="auto", trust_remote_code=True)  # You may want to use bfloat16 and/or move to GPU here
117
+ messages = [
118
+ {"role": "user", "content": "Write a short summary of the benefits of regular exercise"},
119
+ ]
120
+ tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, return_tensors="pt",
121
+ enable_thinking=True # Toggle thinking mode (default: True)
122
+ )
123
+
124
+ outputs = model.generate(tokenized_chat.to(model.device), max_new_tokens=4096)
125
+
126
+ output_text = tokenizer.decode(outputs[0])
127
+
128
+ think_pattern = r'<think>(.*?)</think>'
129
+ think_matches = re.findall(think_pattern, output_text, re.DOTALL)
130
+
131
+ answer_pattern = r'<answer>(.*?)</answer>'
132
+ answer_matches = re.findall(answer_pattern, output_text, re.DOTALL)
133
+
134
+ think_content = [match.strip() for match in think_matches][0]
135
+ answer_content = [match.strip() for match in answer_matches][0]
136
+ print(f"thinking_content:{think_content}\n\n")
137
+ print(f"answer_content:{answer_content}\n\n")
138
+ ```
139
+
140
+ ### Fast and slow thinking switch
141
+
142
+ This model supports two modes of operation:
143
+
144
+ - Slow Thinking Mode (Default): Enables detailed internal reasoning steps before producing the final answer.
145
+ - Fast Thinking Mode: Skips the internal reasoning process for faster inference, going straight to the final answer.
146
+
147
+ **Switching to Fast Thinking Mode:**
148
+
149
+ To disable the reasoning process, set `enable_thinking=False` in the apply_chat_template call:
150
+ ```python
151
+ tokenized_chat = tokenizer.apply_chat_template(
152
+ messages,
153
+ tokenize=True,
154
+ add_generation_prompt=True,
155
+ return_tensors="pt",
156
+ enable_thinking=False # Use fast thinking mode
157
+ )
158
+ ```
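+
+ Alternatively, as noted above, the reasoning mode can also be controlled from the prompt itself with the `/think` and `/no_think` prefixes. The snippet below is a minimal sketch that reuses the `tokenizer` and `model` objects from the earlier example; the message content and generation settings are illustrative assumptions, not part of the official usage guide.
+
+ ```python
+ # Minimal sketch: prompt-prefix control of CoT reasoning (reuses tokenizer/model from above).
+ # "/no_think" asks the model to skip its internal reasoning; "/think" forces it.
+ messages = [
+     {"role": "user", "content": "/no_think Write a short summary of the benefits of regular exercise"},
+ ]
+ tokenized_chat = tokenizer.apply_chat_template(
+     messages,
+     tokenize=True,
+     add_generation_prompt=True,
+     return_tensors="pt",
+ )
+ outputs = model.generate(tokenized_chat.to(model.device), max_new_tokens=1024)
+ print(tokenizer.decode(outputs[0]))
+ ```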
159
+
160
+
161
+
162
+ ## Deployment
163
+
164
+ For deployment, you can use frameworks such as **TensorRT-LLM**, **vLLM**, or **SGLang** to serve the model and create an OpenAI-compatible API endpoint.
165
+
166
+ Docker images: https://hub.docker.com/r/hunyuaninfer/hunyuan-a13b/tags
167
+
168
+
169
+ ### TensorRT-LLM
170
+
171
+ #### Docker Image
172
+
173
+ We provide a pre-built Docker image based on the latest version of TensorRT-LLM.
174
+
175
+ - To get started:
176
+
177
+ https://hub.docker.com/r/hunyuaninfer/hunyuan-large/tags
178
+
179
+ ```
180
+ docker pull hunyuaninfer/hunyuan-a13b:hunyuan-moe-A13B-trtllm
181
+ ```
182
+ ```
183
+ docker run --name hunyuanLLM_infer --rm -it --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --gpus=all hunyuaninfer/hunyuan-a13b:hunyuan-moe-A13B-trtllm
184
+ ```
185
+
186
+ - Prepare Configuration file:
187
+
188
+ ```
189
+ cat >/path/to/extra-llm-api-config.yml <<EOF
190
+ use_cuda_graph: true
191
+ cuda_graph_padding_enabled: true
192
+ cuda_graph_batch_sizes:
193
+ - 1
194
+ - 2
195
+ - 4
196
+ - 8
197
+ - 16
198
+ - 32
199
+ print_iter_log: true
200
+ EOF
201
+ ```
202
+
203
+
204
+ - Start the API server:
205
+
206
+
207
+ ```
208
+ trtllm-serve \
209
+ /path/to/HunYuan-moe-A13B \
210
+ --host localhost \
211
+ --port 8000 \
212
+ --backend pytorch \
213
+ --max_batch_size 32 \
214
+ --max_num_tokens 16384 \
215
+ --tp_size 2 \
216
+ --kv_cache_free_gpu_memory_fraction 0.6 \
217
+ --trust_remote_code \
218
+ --extra_llm_api_options /path/to/extra-llm-api-config.yml
219
+ ```
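+
+ Once the server is running, it exposes an OpenAI-compatible API. Below is a minimal client-side sketch, mirroring the OpenAI chat client example in README_CN.md; the `model` name ("default") and `api_key` value ("tensorrt_llm") are assumptions and may need to be adjusted to match your `trtllm-serve` configuration.
+
+ ```python
+ # Minimal OpenAI-compatible client sketch for the trtllm-serve endpoint started above.
+ from openai import OpenAI
+
+ client = OpenAI(
+     base_url="http://localhost:8000/v1",
+     api_key="tensorrt_llm",  # placeholder key; adjust if your deployment enforces authentication
+ )
+
+ response = client.chat.completions.create(
+     model="default",  # assumed served-model name; check the server's /v1/models endpoint
+     messages=[{"role": "user", "content": "Write a short summary of the benefits of regular exercise"}],
+     max_tokens=4096,
+ )
+ print(response.choices[0].message.content)
+ ```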
220
+
221
+
222
+ ### vLLM
223
+
224
+ #### Docker Image
225
+ We provide a pre-built Docker image containing vLLM 0.8.5 with full support for this model. Support in the official vLLM release is currently under development. **Note: CUDA 12.8 is required for this Docker image.**
226
+
227
+ - To get started:
228
+
229
+ ```
230
+ docker pull docker.cnb.cool/tencent/hunyuan/hunyuan-a13b:hunyuan-moe-A13B-vllm
231
+ or
232
+ docker pull hunyuaninfer/hunyuan-a13b:hunyuan-moe-A13B-vllm
233
+ ```
234
+
235
+ - Download the model files:
236
+ - Hugging Face: downloaded automatically by vLLM.
237
+ - ModelScope: `modelscope download --model Tencent-Hunyuan/Hunyuan-A13B-Instruct`
238
+
239
+
240
+ - Start the API server:
241
+
242
+ Model downloaded from Hugging Face:
243
+ ```
244
+ docker run --rm --ipc=host \
245
+ -v ~/.cache:/root/.cache/ \
246
+ --security-opt seccomp=unconfined \
247
+ --net=host \
248
+ --gpus=all \
249
+ -it \
250
+ -e VLLM_USE_V1=0 \
251
+ --entrypoint python hunyuaninfer/hunyuan-a13b:hunyuan-moe-A13B-vllm \
252
+ -m vllm.entrypoints.openai.api_server \
253
+ --host 0.0.0.0 \
254
+ --tensor-parallel-size 4 \
255
+ --port 8000 \
256
+ --model tencent/Hunyuan-A13B-Instruct \
257
+ --trust_remote_code
258
+ ```
259
+
260
+ Model downloaded via ModelScope:
261
+ ```
262
+ docker run --rm --ipc=host \
263
+ -v ~/.cache/modelscope:/root/.cache/modelscope \
264
+ --security-opt seccomp=unconfined \
265
+ --net=host \
266
+ --gpus=all \
267
+ -it \
268
+ -e VLLM_USE_V1=0 \
269
+ --entrypoint python mirror.ccs.tencentyun.com/hunyuaninfer/hunyuan-large:hunyuan-moe-A13B-vllm \
270
+ -m vllm.entrypoints.openai.api_server \
271
+ --host 0.0.0.0 \
272
+ --tensor-parallel-size 4 \
273
+ --port 8000 \
274
+ --model /root/.cache/modelscope/hub/models/Tencent-Hunyuan/Hunyuan-A13B-Instruct/ \
275
+ --trust_remote_code
276
+ ```
277
+
278
+
279
+ #### Tool Calling with vLLM
280
+
281
+ To support agent-based workflows and function calling capabilities, this model includes specialized parsing mechanisms for handling tool calls and internal reasoning steps.
282
+
283
+ For a complete working example of how to implement and use these features in an agent setting, please refer to our full agent implementation on GitHub:
284
+ 🔗 [Hunyuan A13B Agent Example](https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/agent/)
285
+
286
+ When deploying the model using **vLLM**, the following parameters can be used to configure the tool parsing behavior:
287
+
288
+ | Parameter | Value |
289
+ |--------------------------|-----------------------------------------------------------------------|
290
+ | `--tool-parser-plugin` | [Local Hunyuan A13B Tool Parser File](https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/agent/hunyuan_tool_parser.py) |
291
+ | `--tool-call-parser` | `hunyuan` |
292
+
293
+ These settings enable vLLM to correctly interpret and route tool calls generated by the model according to the expected format.
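+
+ For illustration, the sketch below shows a client-side request that exercises the tool parser. It assumes the vLLM server above was started with `--enable-auto-tool-choice`, `--tool-parser-plugin`, and `--tool-call-parser hunyuan`, and that the served model name matches the `--model` argument; the `get_weather` tool definition is a hypothetical example, not part of the model release.
+
+ ```python
+ # Hypothetical tool-calling request against the vLLM OpenAI-compatible endpoint.
+ from openai import OpenAI
+
+ client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
+
+ tools = [{
+     "type": "function",
+     "function": {
+         "name": "get_weather",  # hypothetical tool for illustration
+         "description": "Get the current weather for a city",
+         "parameters": {
+             "type": "object",
+             "properties": {"city": {"type": "string"}},
+             "required": ["city"],
+         },
+     },
+ }]
+
+ response = client.chat.completions.create(
+     model="tencent/Hunyuan-A13B-Instruct",
+     messages=[{"role": "user", "content": "What is the weather in Shenzhen?"}],
+     tools=tools,
+ )
+ # With the hunyuan tool parser enabled, tool invocations come back as structured tool_calls.
+ print(response.choices[0].message.tool_calls)
+ ```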
294
+
295
+ ### Reasoning parser
296
+
297
+ Reasoning-parser support for the Hunyuan-A13B model in vLLM is under development.
298
+
299
+ ### SGLang
300
+
301
+ #### Docker Image
302
+
303
+ We also provide a pre-built Docker image based on the latest version of SGLang.
304
+
305
+ To get started:
306
+
307
+ - Pull the Docker image
308
+
309
+ ```
310
+ docker pull docker.cnb.cool/tencent/hunyuan/hunyuan-a13b:hunyuan-moe-A13B-sglang
311
+ or
312
+ docker pull hunyuaninfer/hunyuan-a13b:hunyuan-moe-A13B-sglang
313
+ ```
314
+
315
+ - Start the API server:
316
+
317
+ ```
318
+ docker run --gpus all \
319
+ --shm-size 32g \
320
+ -p 30000:30000 \
321
+ --ipc=host \
322
+ docker.cnb.cool/tencent/hunyuan/hunyuan-a13b:hunyuan-moe-A13B-sglang \
323
+ -m sglang.launch_server --model-path hunyuan/huanyuan_A13B --tp 4 --trust-remote-code --host 0.0.0.0 --port 30000
324
+ ```
325
+
326
+
327
+ ## Contact Us
328
+
329
+ If you would like to leave a message for our R&D and product teams, you are welcome to contact our open-source team. You can also contact us via email ([email protected]).
README_CN.md ADDED
@@ -0,0 +1,456 @@
1
+ <p align="center">
2
+ <img src="https://dscache.tencent-cloud.cn/upload/uploader/hunyuan-64b418fd052c033b228e04bc77bbc4b54fd7f5bc.png" width="400"/> <br>
3
+ </p><p></p>
4
+
5
+ <p align="center">
6
+ 🫣&nbsp;<a href="https://huggingface.co/tencent/Hunyuan-A13B-Instruct"><b>Hugging Face</b></a>&nbsp;&nbsp;|&nbsp;&nbsp;
7
+ 🖥️&nbsp;<a href="https://llm.hunyuan.tencent.com/" style="color: red;"><b>Official Website</b></a>&nbsp;&nbsp;|&nbsp;&nbsp;
8
+ 🕖&nbsp;<a href="https://cloud.tencent.com/product/hunyuan"><b>HunyuanAPI</b></a>&nbsp;&nbsp;|&nbsp;&nbsp;
9
+ 🕹️&nbsp;<a href="https://hunyuan.tencent.com/?model=hunyuan-a13b"><b>Demo</b></a>&nbsp;&nbsp;|&nbsp;&nbsp;
10
+ <img src="https://avatars.githubusercontent.com/u/109945100?s=200&v=4" width="16"/>&nbsp;<a href="https://modelscope.cn/models/Tencent-Hunyuan/Hunyuan-A13B-Instruct"><b>ModelScope</b></a>
11
+ </p>
12
+
13
+ <p align="center">
14
+ <a href="https://github.com/Tencent/Hunyuan-A13B"><b>GITHUB</b></a>
15
+ </p>
16
+
17
+
18
+
19
+
20
+ ## 模型介绍
21
+
22
+ 随着人工智能技术的快速发展,大型语言模型(LLMs)在自然语言处理、计算机视觉和科学任务等领域取得了显著进展。然而,随着模型规模的扩大,如何在保持高性能的同时优化资源消耗成为一个关键挑战。为了应对这一挑战,我们研究了混合专家(MoE)模型,当前亮相的 Hunyuan-A13B 模型,拥有800亿总参数和130亿激活参数。不仅在效果上达到了高标准,而且在尺寸上也做到了极致的优化,成功平衡了模型性能与资源占用。
23
+
24
+
25
+ ### 核心特性与优势
26
+ - ​**小参数量,高性能**​:仅激活130亿参数(总参数量800亿),即可在多样化基准任务中媲美更大规模模型的竞争力表现
27
+ - ​**混合推理支持**​:同时支持快思考和慢思考两种模式,支持用户灵活选择
28
+ - ​**超长上下文理解**​:原生支持256K上下文窗口,在长文本任务中保持稳定性能
29
+ - ​**增强Agent能力**​:优化Agent能力,在BFCL-v3、τ-Bench等智能体基准测试中领先
30
+ - ​**高效推理**​:采用分组查询注意力(GQA)策略,支持多量化格式,实现高效推理
31
+
32
+
33
+ ### 为何选择Hunyuan-A13B?
34
+ 作为兼具强大性能与计算效率的大模型,Hunyuan-A13B是研究者与开发者在资源受限条件下追求高性能的理想选择。无论学术研究、高性价比AI解决方案开发,还是创新应用探索,本模型都能提供强大的基础支持。
35
+
36
+
37
+ &nbsp;
38
+
39
+ ## 新闻
40
+ <br>
41
+
42
+ * 2025.6.26 我们在Hugging Face开源了 **Hunyuan-A13B-Instruct**,**Hunyuan-A13B-Pretrain**, **Hunyuan-A13B-Instruct-FP8**, **Hunyuan-A13B-Instruct-GPTQ-Int4**。并发布了技术报告和训练推理操作手册,详细介绍了模型能力和训练与推理的操作。
43
+
44
+ ## 模型结构
45
+
46
+ Hunyuan-A13B采用了细粒度混合专家(Fine-grained Mixture of Experts,Fine-grained MoE)架构,包含800亿参数和130亿激活参数,累计训练了超过 20T tokens。该模型支持 256K 的上下文长度,以下为模型结构细节:
47
+ * 总参数: 80B
48
+ * 激活参数: 13B
49
+ * 层数: 32
50
+ * Attention Heads: 32
51
+ * 共享专家数: 1
52
+ * 非共享专家数: 64
53
+ * 路由策略: Top-8
54
+ * 激活函数: SwiGLU
55
+ * 隐层维度: 4096
56
+ * 专家隐层维度: 3072
57
+
58
+ ## Benchmark评估榜单
59
+
60
+ **Hunyuan-A13B-Pretrain** 在 12/14 个任务上超越了Hunyuan上一代52B激活参数的MoE模型Hunyuan-Large,证实了它在预训练任务上出色的能力。与业界更大参数量的Dense和MoE模型相比, Hunyuan-A13B在多个代码和数学任务上都取得了最高分数。在MMLU, MMLU-PRO等诸多众聚合任务上, Hunyuan-A13B达到了与Qwen3-A22B模型同等的水平,表现出优秀的综合能力。
61
+
62
+ | Model | Hunyuan-Large | Qwen2.5-72B | Qwen3-A22B | Hunyuan-A13B |
63
+ |------------------|---------------|--------------|-------------|---------------|
64
+ | MMLU | 88.40 | 86.10 | 87.81 | 88.17 |
65
+ | MMLU-Pro | 60.20 | 58.10 | 68.18 | 67.23 |
66
+ | MMLU-Redux | 87.47 | 83.90 | 87.40 | 87.67 |
67
+ | BBH | 86.30 | 85.80 | 88.87 | 87.56 |
68
+ | SuperGPQA | 38.90 | 36.20 | 44.06 | 41.32 |
69
+ | EvalPlus | 75.69 | 65.93 | 77.60 | 78.64 |
70
+ | MultiPL-E | 59.13 | 60.50 | 65.94 | 69.33 |
71
+ | MBPP | 72.60 | 76.00 | 81.40 | 83.86 |
72
+ | CRUX-I | 57.00 | 57.63 | - | 70.13 |
73
+ | CRUX-O | 60.63 | 66.20 | 79.00 | 77.00 |
74
+ | MATH | 69.80 | 62.12 | 71.84 | 72.35 |
75
+ | CMATH | 91.30 | 84.80 | - | 91.17 |
76
+ | GSM8k | 92.80 | 91.50 | 94.39 | 91.83 |
77
+ | GPQA | 25.18 | 45.90 | 47.47 | 49.12 |
78
+
79
+ **Hunyuan-A13B-Instruct** 在多项基准测试中取得了极具有竞争力的表现,尤其是在数学、科学、agent等领域。我们与一些强力模型进行了对比,结果如下所示。
80
+
81
+ | Topic | Bench | OpenAI-o1-1217 | DeepSeek R1 | Qwen3-A22B | Hunyuan-A13B-Instruct |
82
+ |:-------------------:|:-----------------------------:|:-------------:|:------------:|:-----------:|:---------------------:|
83
+ | **Mathematics** | AIME 2024<br>AIME 2025<br>MATH | 74.3<br>79.2<br>96.4 | 79.8<br>70<br>94.9 | 85.7<br>81.5<br>94.0 | 87.3<br>76.8<br>94.3 |
84
+ | **Science** | GPQA-Diamond<br>OlympiadBench | 78<br>83.1 | 71.5<br>82.4 | 71.1<br>85.7 | 71.2<br>82.7 |
85
+ | **Coding** | Livecodebench<br>Fullstackbench<br>ArtifactsBench | 63.9<br>64.6<br>38.6 | 65.9<br>71.6<br>44.6 | 70.7<br>65.6<br>44.6 | 63.9<br>67.8<br>43 |
86
+ | **Reasoning** | BBH<br>DROP<br>ZebraLogic | 80.4<br>90.2<br>81 | 83.7<br>92.2<br>78.7 | 88.9<br>90.3<br>80.3 | 89.1<br>91.1<br>84.7 |
87
+ | **Instruction<br>Following** | IF-Eval<br>SysBench | 91.8<br>82.5 | 88.3<br>77.7 | 83.4<br>74.2 | 84.7<br>76.1 |
88
+ | **Text<br>Creation**| LengthCtrl<br>InsCtrl | 60.1<br>74.8 | 55.9<br>69 | 53.3<br>73.7 | 55.4<br>71.9 |
89
+ | **NLU** | ComplexNLU<br>Word-Task | 64.7<br>67.1 | 64.5<br>76.3 | 59.8<br>56.4 | 61.2<br>62.9 |
90
+ | **Agent** | BDCL v3<br> τ-Bench<br>ComplexFuncBench<br> $C^3$-Bench | 67.8<br>60.4<br>47.6<br>58.8 | 56.9<br>43.8<br>41.1<br>55.3 | 70.8<br>44.6<br>40.6<br>51.7 | 78.3<br>54.7<br>61.2<br>63.5 |
91
+
92
+
93
+ ## 推理和部署
94
+
95
+ HunyuanLLM可以采用vLLM,sglang或TensorRT-LLM部署。为了简化部署过程HunyuanLLM提供了预构建docker镜像。
96
+
97
+
98
+ ## 使用TensorRT-LLM推理
99
+
100
+ ### BF16部署
101
+
102
+ #### Step1:执行推理
103
+
104
+ #### 方式1:命令行推理
105
+
106
+ 下面我们展示一个代码片段,采用`TensorRT-LLM`快速请求chat model:
107
+ 修改 examples/pytorch/quickstart_advanced.py 中如下代码:
108
+
109
+
110
+ ```python
111
+ from tensorrt_llm import SamplingParams
112
+ from tensorrt_llm._torch import LLM
113
+ from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
114
+ from tensorrt_llm.llmapi import (EagleDecodingConfig, KvCacheConfig,
115
+ MTPDecodingConfig)
116
+
117
+ prompt = "Write a short summary of the benefits of regular exercise"
118
+
119
+ def main():
120
+ args = parse_arguments()
121
+
122
+ llm, sampling_params = setup_llm(args)
123
+ new_prompts = []
124
+ if args.apply_chat_template:
125
+ messages = [{"role": "user", "content": f"{prompt}"}]
126
+ new_prompts.append(llm.tokenizer.apply_chat_template(
127
+ messages, tokenize=False, add_generation_prompt=True)
128
+ )
129
+
130
+ outputs = llm.generate(new_prompts, sampling_params)
131
+
132
+ for i, output in enumerate(outputs):
133
+ prompt = output.prompt
134
+ generated_text = output.outputs[0].text
135
+ print(f"[{i}] Prompt: {prompt!r}, Generated text: {generated_text!r}")
136
+ ```
137
+
138
+ Run it with:
139
+
140
+ ```shell
141
+ python3 quickstart_advanced.py --model_dir "<path to the HunyuanLLM model>" --tp_size 4 --apply_chat_template
142
+ ```
143
+
144
+ #### Option 2: Serving
145
+
146
+ Below we show how to deploy the model as a service with `TensorRT-LLM` and send requests.
147
+
148
+ ```shell
149
+ model_path="<path to the HunyuanLLM model>"
150
+ trtllm-serve <model_path> [--backend pytorch --tp_size <tp> --ep_size <ep> --host <host> --port <port>]
151
+ ```
152
+
153
+ After the service starts successfully, run the request script:
154
+ ```python
155
+ ### OpenAI Chat Client
156
+
157
+ from openai import OpenAI
158
+
159
+ client = OpenAI(
160
+ base_url="http://localhost:8000/v1",
161
+ api_key="tensorrt_llm",
162
+ )
163
+
164
+ response = client.chat.completions.create(
165
+ model="default",
166
+ messages=[{
167
+ "role": "user",
168
+ "content": "Write a short summary of the benefits of regular exercise"
169
+ }],
170
+ max_tokens=4096,
171
+ )
172
+ print(response)
173
+ ```
174
+
175
+ #### FP8/Int4 quantized model deployment:
176
+ FP8 and Int4 quantized models for TensorRT-LLM are still being worked on; stay tuned.
177
+
178
+
179
+ ## Inference with vLLM
180
+ ### Docker:
181
+
182
+ To simplify deployment, HunyuanLLM provides a pre-built docker image:
183
+
184
+ [hunyuaninfer/hunyuan-large:hunyuan-moe-A13B-vllm](https://hub.docker.com/r/hunyuaninfer/hunyuan-large/tags). Simply download the model files and start the container with the commands below to run inference.
185
+ ```shell
186
+ # Pull the image
187
+ docker pull hunyuaninfer/hunyuan-large:hunyuan-moe-A13B-vllm
188
+ # Start the container
189
+ docker run --name hunyuanLLM_infer -itd --privileged --user root --net=host --ipc=host --gpus=8 hunyuaninfer/hunyuan-large:hunyuan-moe-A13B-vllm
190
+ ```
191
+
192
+ Note on Docker container privileges: starting the container in privileged mode (--privileged), as above, grants it elevated permissions and increases the risk of data leakage and cluster compromise. Avoid privileged mode unless it is strictly necessary; where it is required, perform a thorough security assessment and put appropriate monitoring and hardening measures in place.
193
+
194
+
195
+ ### BF16 deployment
196
+
197
+ BF16 can be deployed on 2 GPUs with more than 80 GB of memory each; for long-context workloads, TP4 is recommended. Follow the steps below:
198
+
199
+ Before running the commands, set the following environment variable:
200
+
201
+ ```shell
202
+ export MODEL_PATH=PATH_TO_MODEL
203
+ ```
204
+
205
+ #### Step 1: Run inference
206
+
207
+ #### Option 1: Command-line inference
208
+
209
+ Below is a code snippet that uses `vLLM` to quickly query the chat model:
210
+
211
+ Note on remote code execution in vLLM: if the `trust-remote-code` option is enabled, vLLM will load and execute code from the remote model repository, which could allow malicious code to run. Unless your use case explicitly requires it, keep this option disabled to reduce potential security risks.
212
+
213
+
214
+ ```python
215
+ import os
216
+ from typing import List, Optional
217
+ from vllm import LLM, SamplingParams
218
+ from vllm.inputs import PromptType
219
+ from transformers import AutoTokenizer
220
+
221
+ model_path=os.environ.get('MODEL_PATH')
222
+ tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
223
+
224
+ llm = LLM(model=model_path,
225
+ tokenizer=model_path,
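+ # trust_remote_code=True loads and runs this repository's custom model code; enable it only for sources you trust (see the note above)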
226
+ trust_remote_code=True,
227
+ dtype='bfloat16',
228
+ tensor_parallel_size=4,
229
+ gpu_memory_utilization=0.9)
230
+
231
+ sampling_params = SamplingParams(
232
+ temperature=0.7, top_p=0.8, max_tokens=4096, top_k=20, repetition_penalty=1.05)
233
+
234
+ messages = [
235
+ {
236
+ "role": "system",
237
+ "content": "You are a helpful assistant.",
238
+ },
239
+ {"role": "user", "content": "Write a short summary of the benefits of regular exercise"},
240
+ ]
241
+
242
+ tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
243
+
244
+ dummy_inputs: List[PromptType] = [{
245
+ "prompt_token_ids": batch
246
+ } for batch in tokenized_chat.numpy().tolist()]
247
+
248
+ outputs = llm.generate(dummy_inputs, sampling_params)
249
+
250
+ # Print the outputs.
251
+ for output in outputs:
252
+ prompt = output.prompt
253
+ generated_text = output.outputs[0].text
254
+ print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
255
+ ```
256
+
257
+ #### Option 2: Serving
258
+
259
+ Below we show how to deploy the model as a service with `vLLM` and send requests.
260
+
261
+ Run the following on the master node:
262
+
263
+ ```shell
264
+ export VLLM_HOST_IP=${LOCAL_IP}
265
+ ```
266
+ Then start the service by running:
267
+ ```shell
268
+ cd inference
269
+ sh run_server.sh
270
+ ```
271
+
272
+ After `run_server.sh` runs successfully, run the request script:
273
+ ```shell
274
+ sh openapi.sh
275
+ ```
276
+
277
+ Remember to replace `${LOCAL_IP}` and `${MODEL_PATH}` in `openapi.sh` with the values used by your deployment; an equivalent Python client is sketched below.
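+ If you prefer not to use `openapi.sh`, the same request can be sent from Python. This is a minimal, illustrative sketch: it assumes the service started by `run_server.sh` exposes vLLM's OpenAI-compatible API, and the host, port (shown here as 8000), and model name must be adjusted to whatever your `run_server.sh` actually configures.
+
+ ```python
+ import os
+ from openai import OpenAI
+
+ # Assumed endpoint; replace host/port with the values configured in run_server.sh
+ base_url = f"http://{os.environ.get('LOCAL_IP', 'localhost')}:8000/v1"
+ client = OpenAI(base_url=base_url, api_key="EMPTY")
+
+ response = client.chat.completions.create(
+     model="default",  # adjust if the server registers a different model name
+     messages=[{"role": "user", "content": "Write a short summary of the benefits of regular exercise"}],
+     temperature=0.7,
+     top_p=0.8,
+     max_tokens=4096,
+ )
+ print(response.choices[0].message.content)
+ ```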
278
+
279
+
280
+ ### Quantized model deployment:
281
+
282
+ This section describes how to deploy quantized models with vLLM.
283
+
284
+ Image: use the same docker image as for BF16 deployment.
285
+
286
+
287
+ #### Int8 quantized model deployment:
288
+ To deploy the Int8-weight-only version of the HunYuan-A13B model, simply set the environment variable in `run_server_int8.sh`:
289
+ ```SHELL
290
+ export MODEL_PATH=PATH_TO_BF16_MODEL
291
+ ```
292
+
293
+ Then start the Int8 service by running:
294
+ ```shell
295
+ sh run_server_int8.sh
296
+ ```
297
+
298
+ After `run_server_int8.sh` runs successfully, run the request script:
299
+ ```shell
300
+ sh openapi.sh
301
+ ```
302
+
303
+ #### Int4 quantized model deployment:
304
+ To deploy the Int4-weight-only (GPTQ) version of the HunYuan-A13B model, simply set the environment variable in `run_server_int4.sh`:
305
+ ```SHELL
306
+ export MODEL_PATH=PATH_TO_INT4_MODEL
307
+ ```
308
+
309
+ Then start the Int4 service by running:
310
+ ```shell
311
+ sh run_server_int4.sh
312
+ ```
313
+
314
+ After `run_server_int4.sh` runs successfully, run the request script:
315
+ ```shell
316
+ sh openapi.sh
317
+ ```
318
+
319
+ #### FP8 quantized model deployment:
320
+ To deploy the W8A8C8 (FP8) version of the HunYuan-A13B model, simply set the environment variable in `run_server_fp8.sh`:
321
+ ```shell
322
+ export MODEL_PATH=PATH_TO_FP8_MODEL
323
+ ```
324
+
325
+ Then start the FP8 service by running:
326
+ ```shell
327
+ sh run_server_fp8.sh
328
+ ```
329
+
330
+ After `run_server_fp8.sh` runs successfully, run the request script:
331
+ ```shell
332
+ sh openapi.sh
333
+ ```
334
+
335
+ ### Performance evaluation:
336
+
337
+ This section reports throughput results for deploying each model (original and quantized) with vLLM, measured as inference speed (tokens/s) at different batch sizes. Test environment: Tencent Cloud, H80 (96G) GPUs x number of GPUs:
338
+
339
+ Benchmark command:
340
+ ```shell
341
+ python3 benchmark_throughput.py --backend vllm \
342
+ --input-len 2048 \
343
+ --output-len 14336 \
344
+ --model $MODEL_PATH \
345
+ --tensor-parallel-size $TP \
346
+ --use-v2-block-manager \
347
+ --async-engine \
348
+ --trust-remote-code \
349
+ --num_prompts $BATCH_SIZE \
350
+ --max-num-seqs $BATCH_SIZE
351
+ ```
352
+
353
+ | Inference Framework | Model | GPUs | input_length | batch=1 | batch=16 | batch=32 |
354
+ |------|-----------------------------|-----------|-------------------------|---------------------|----------------------|----------------------|
355
+ | vLLM | Hunyuan-A13B-Instruct | 8 | 2048 | 190.84 | 1246.54 | 1981.99 |
356
+ | vLLM | Hunyuan-A13B-Instruct | 4 | 2048 | 158.90 | 779.10 | 1301.75 |
357
+ | vLLM | Hunyuan-A13B-Instruct | 2 | 2048 | 111.72 | 327.31 | 346.54 |
358
+ | vLLM | Hunyuan-A13B-Instruct(int8 weight only) | 2 | 2048 | 109.10 | 444.17 | 721.93 |
359
+ | vLLM | Hunyuan-A13B-Instruct(W8A8C8-FP8) | 2 | 2048 | 91.83 | 372.01 | 617.70 |
360
+ | vLLM | Hunyuan-A13B-Instruct(W8A8C8-FP8) | 1 | 2048 | 60.07 | 148.80 | 160.41 |
361
+
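+ For a rough comparison, the hedged sketch below reuses the 2-GPU numbers from the table above (no new measurements) to compute how the Int8-weight-only deployment compares with BF16 at each batch size:
+
+ ```python
+ # Throughput values (tokens/s) copied from the table above; both rows are 2-GPU deployments.
+ bf16_2gpu = {1: 111.72, 16: 327.31, 32: 346.54}
+ int8_2gpu = {1: 109.10, 16: 444.17, 32: 721.93}
+
+ for batch, bf16 in bf16_2gpu.items():
+     ratio = int8_2gpu[batch] / bf16
+     print(f"batch={batch}: int8 weight-only reaches {ratio:.2f}x the BF16 throughput")
+
+ # Approximate output: 0.98x, 1.36x, 2.08x -- in this test the weight-only
+ # quantization pays off mainly at the larger batch sizes.
+ ```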
362
+
363
+ ## Inference with sglang
364
+
365
+ ### BF16 deployment
366
+
367
+ #### Step 1: Run inference
368
+
369
+ #### Option 1: Command-line inference
370
+
371
+ Below is a code snippet that uses `sglang` to quickly query the chat model:
372
+
373
+
374
+ ```python
375
+ import os
+ import sglang as sgl
376
+ from transformers import AutoTokenizer
377
+
378
+ model_path=os.environ.get('MODEL_PATH')
379
+
380
+
381
+ tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
382
+
383
+ messages = [
384
+ {
385
+ "role": "system",
386
+ "content": "You are a helpful assistant.",
387
+ },
388
+ {"role": "user", "content": "Write a short summary of the benefits of regular exercise"},
389
+ ]
390
+ prompts = []
391
+ prompts.append(tokenizer.apply_chat_template(
392
+ messages,
393
+ tokenize=False,
394
+ add_generation_prompt=True
395
+ ))
396
+ print(prompts)
397
+
398
+ llm = sgl.Engine(
399
+ model_path=model_path,
400
+ tp_size=4,
401
+ trust_remote_code=True,
402
+ mem_fraction_static=0.7,
403
+ )
404
+
405
+ sampling_params = {"temperature": 0.7, "top_p": 0.8, "top_k": 20, "max_new_tokens": 4096}
406
+ outputs = llm.generate(prompts, sampling_params)
407
+ for prompt, output in zip(prompts, outputs):
408
+ print(f"Prompt: {prompt}\nGenerated text: {output['text']}")
409
+ ```
410
+
411
+ #### Option 2: Serving
412
+
413
+ Below we show how to deploy the model as a service with `sglang` and send requests.
414
+
415
+ ```shell
416
+ model_path="<path to the HunyuanLLM model>"
417
+ python3 -u -m sglang.launch_server \
418
+ --model-path $model_path \
419
+ --tp 4 \
420
+ --trust-remote-code
421
+ ```
422
+
423
+ After the service starts successfully, run the request script:
424
+ ```python
425
+ import openai
426
+ client = openai.Client(
427
+ base_url="http://localhost:30000/v1", api_key="EMPTY")
428
+
429
+ response = client.chat.completions.create(
430
+ model="default",
431
+ messages= [
432
+ {"role": "user", "content": "Write a short summary of the benefits of regular exercise"},
433
+ ],
434
+ temperature=0.7,
435
+ max_tokens=4096,
436
+ extra_body={"top_p": 0.8, "top_k": 20}
437
+ )
438
+ print(response)
439
+ ```
440
+
441
+ #### FP8/Int4 quantized model deployment:
442
+ FP8 and Int4 quantized models for sglang are still being worked on; stay tuned.
443
+
444
+ ## Interactive Web Demo
445
+ hunyuan-A13B now has a public web demo. Visit https://hunyuan.tencent.com/?model=hunyuan-a13b to try the model.
446
+
447
+ <br>
448
+
449
+ ## Citation
450
+ If you find our work helpful, please cite our <a href="report/Hunyuan_A13B_Technical_Report.pdf">technical report</a>!
451
+
452
+ <br>
453
+
454
+
455
+ ## Contact Us
456
+ If you would like to leave a message for our R&D or product teams, you are welcome to contact the Tencent Hunyuan LLM team by email ([email protected]).
config.json ADDED
@@ -0,0 +1,203 @@
1
+ {
2
+ "add_classification_head": false,
3
+ "anyres_pooling_size": 2,
4
+ "anyres_vit_max_image_size": null,
5
+ "anyres_vit_two_views": false,
6
+ "architectures": [
7
+ "HunYuanMoEV1ForCausalLM"
8
+ ],
9
+ "attention_bias": false,
10
+ "attention_dropout": 0.1,
11
+ "attention_head_dim": 128,
12
+ "auto_map": {
13
+ "AutoConfig": "configuration_hunyuan.HunYuanConfig",
14
+ "AutoModel": "hunyuan.HunYuanModel",
15
+ "AutoModelForCausalLM": "hunyuan.HunYuanMoEV1ForCausalLM"
16
+ },
17
+ "bos_token_id": 1,
18
+ "cla_share_factor": 2,
19
+ "class_num": 0,
20
+ "dense_list": [
21
+ 4096,
22
+ 0
23
+ ],
24
+ "eod_token_id": 127967,
25
+ "eos_token_id": 127960,
26
+ "group_limited_greedy": false,
27
+ "hidden_act": "silu",
28
+ "hidden_size": 4096,
29
+ "im_end_id": 6,
30
+ "im_newline_id": 12,
31
+ "im_start_id": 5,
32
+ "image_token_id": 9,
33
+ "initializer_range": 0.02,
34
+ "intermediate_size": 3072,
35
+ "kv_lora_rank": null,
36
+ "mask_init_id": 13,
37
+ "max_position_embeddings": 32768,
38
+ "mlp_bias": false,
39
+ "model_type": "hunyuan",
40
+ "moe_drop_tokens": false,
41
+ "moe_intermediate_size": [
42
+ 3072,
43
+ 3072,
44
+ 3072,
45
+ 3072,
46
+ 3072,
47
+ 3072,
48
+ 3072,
49
+ 3072,
50
+ 3072,
51
+ 3072,
52
+ 3072,
53
+ 3072,
54
+ 3072,
55
+ 3072,
56
+ 3072,
57
+ 3072,
58
+ 3072,
59
+ 3072,
60
+ 3072,
61
+ 3072,
62
+ 3072,
63
+ 3072,
64
+ 3072,
65
+ 3072,
66
+ 3072,
67
+ 3072,
68
+ 3072,
69
+ 3072,
70
+ 3072,
71
+ 3072,
72
+ 3072,
73
+ 3072
74
+ ],
75
+ "moe_layer_num_skipped": 0,
76
+ "moe_random_routing_dropped_token": false,
77
+ "moe_topk": [
78
+ 8,
79
+ 8,
80
+ 8,
81
+ 8,
82
+ 8,
83
+ 8,
84
+ 8,
85
+ 8,
86
+ 8,
87
+ 8,
88
+ 8,
89
+ 8,
90
+ 8,
91
+ 8,
92
+ 8,
93
+ 8,
94
+ 8,
95
+ 8,
96
+ 8,
97
+ 8,
98
+ 8,
99
+ 8,
100
+ 8,
101
+ 8,
102
+ 8,
103
+ 8,
104
+ 8,
105
+ 8,
106
+ 8,
107
+ 8,
108
+ 8,
109
+ 8
110
+ ],
111
+ "n_group": null,
112
+ "norm_topk_prob": true,
113
+ "norm_type": "rms",
114
+ "num_attention_heads": 32,
115
+ "num_experts": 64,
116
+ "num_hidden_layers": 32,
117
+ "num_key_value_heads": 8,
118
+ "num_media_embeds": 257,
119
+ "num_shared_expert": [
120
+ 1,
121
+ 1,
122
+ 1,
123
+ 1,
124
+ 1,
125
+ 1,
126
+ 1,
127
+ 1,
128
+ 1,
129
+ 1,
130
+ 1,
131
+ 1,
132
+ 1,
133
+ 1,
134
+ 1,
135
+ 1,
136
+ 1,
137
+ 1,
138
+ 1,
139
+ 1,
140
+ 1,
141
+ 1,
142
+ 1,
143
+ 1,
144
+ 1,
145
+ 1,
146
+ 1,
147
+ 1,
148
+ 1,
149
+ 1,
150
+ 1,
151
+ 1
152
+ ],
153
+ "org_vocab_size": 128167,
154
+ "pad_id": 127961,
155
+ "pad_token_id": 127961,
156
+ "pool_type": "last",
157
+ "position_embedding_xdrope": false,
158
+ "pretraining_tp": 1,
159
+ "q_lora_rank": null,
160
+ "qk_nope_head_dim": null,
161
+ "qk_rope_head_dim": null,
162
+ "rms_norm_eps": 1e-05,
163
+ "rope_scaling": {
164
+ "alpha": 1000.0,
165
+ "beta_fast": 32,
166
+ "beta_slow": 1,
167
+ "factor": 1.0,
168
+ "mscale": 1.0,
169
+ "mscale_all_dim": 1.0,
170
+ "type": "dynamic"
171
+ },
172
+ "rope_theta": 10000.0,
173
+ "routed_scaling_factor": 1.0,
174
+ "sep_token_id": 127962,
175
+ "skip_cls_token": false,
176
+ "text_end_id": 8,
177
+ "text_start_id": 7,
178
+ "tie_word_embeddings": true,
179
+ "topk_group": null,
180
+ "torch_dtype": "bfloat16",
181
+ "transformers_version": "4.53.0",
182
+ "use_cache": true,
183
+ "use_cla": false,
184
+ "use_mixed_mlp_moe": true,
185
+ "use_mla": false,
186
+ "use_qk_norm": true,
187
+ "use_rotary_pos_emb": true,
188
+ "v_head_dim": null,
189
+ "video_end_id": 11,
190
+ "video_start_id": 10,
191
+ "vit_add_patchemb_bias": false,
192
+ "vit_input_resolution": 224,
193
+ "vit_mapping_type": "resampler",
194
+ "vit_norm_type": "fused",
195
+ "vit_patch": 1,
196
+ "vit_path": null,
197
+ "vit_remove_prenorm": false,
198
+ "vit_token": 64,
199
+ "vit_type": null,
200
+ "vit_used_rms_norm": false,
201
+ "vocab_size": 128167,
202
+ "xdrope_section": null
203
+ }
configuration_hunyuan.py ADDED
@@ -0,0 +1,319 @@
1
+ # coding=utf-8
2
+ # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
3
+ """ HunYuan model configuration"""
4
+ from torch import nn
5
+ from transformers.configuration_utils import PretrainedConfig
6
+ from transformers.utils import logging
7
+ from typing import List, Union, Optional
8
+
9
+
10
+ logger = logging.get_logger(__name__)
11
+
12
+
13
+ class HunYuanConfig(PretrainedConfig):
14
+ r"""
15
+ This is the configuration class to store the configuration of a [`HunYuanModel`]. It is used to instantiate an
16
+ HunYuan model according to the specified arguments, defining the model architecture. Instantiating a configuration
17
+ with the defaults will yield a similar configuration to that of the HunYuan-7B.
18
+
19
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
20
+ documentation from [`PretrainedConfig`] for more information.
21
+
22
+
23
+ Args:
24
+ vocab_size (`int`, *optional*, defaults to 32000):
25
+ Vocabulary size of the HunYuan model. Defines the number of different tokens that can be represented by the
26
+ `inputs_ids` passed when calling [`HunYuanModel`]
27
+ hidden_size (`int`, *optional*, defaults to 4096):
28
+ Dimension of the hidden representations.
29
+ intermediate_size (`int`, *optional*, defaults to 11008):
30
+ Dimension of the MLP representations or shared MLP representations.
31
+ moe_intermediate_size (`int` or `List`, *optional*, defaults to 11008):
32
+ Dimension of the MLP representations in MoE. Use a list if you want a different size per layer.
33
+ num_hidden_layers (`int`, *optional*, defaults to 32):
34
+ Number of hidden layers in the Transformer decoder.
35
+ num_attention_heads (`int`, *optional*, defaults to 32):
36
+ Number of attention heads for each attention layer in the Transformer decoder.
37
+ num_key_value_heads (`int`, *optional*):
38
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
39
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
40
+ `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
41
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
42
+ by meanpooling all the original heads within that group. For more details checkout [this
43
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
44
+ `num_attention_heads`.
45
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
46
+ The non-linear activation function (function or string) in the decoder.
47
+ max_position_embeddings (`int`, *optional*, defaults to 2048):
48
+ The maximum sequence length that this model might ever be used with.
49
+ initializer_range (`float`, *optional*, defaults to 0.02):
50
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
51
+ rms_norm_eps (`float`, *optional*, defaults to 1e-06):
52
+ The epsilon used by the rms normalization layers.
53
+ use_cache (`bool`, *optional*, defaults to `True`):
54
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
55
+ relevant if `config.is_decoder=True`.
56
+ pad_token_id (`int`, *optional*):
57
+ Padding token id.
58
+ bos_token_id (`int`, *optional*, defaults to 1):
59
+ Beginning of stream token id.
60
+ eos_token_id (`int`, *optional*, defaults to 2):
61
+ End of stream token id.
62
+ pretraining_tp (`int`, *optional*, defaults to 1):
63
+ Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
64
+ document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
65
+ necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
66
+ issue](https://github.com/pytorch/pytorch/issues/76232).
67
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
68
+ Whether to tie weight embeddings
69
+ rope_theta (`float`, *optional*, defaults to 10000.0):
70
+ The base period of the RoPE embeddings.
71
+ rope_scaling (`Dict`, *optional*):
72
+ Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
73
+ strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
74
+ `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
75
+ `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
76
+ these scaling strategies behave:
77
+ https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
78
+ experimental feature, subject to breaking API changes in future versions.
79
+ attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
80
+ Whether to use a bias in the query, key, value and output projection layers during self-attention.
81
+ attention_dropout (`float`, *optional*, defaults to 0.0):
82
+ The dropout ratio for the attention probabilities.
83
+ use_qk_norm (`bool`, *optional*, defaults to `False`):
84
+ Whether query and key in attention use norm
85
+ use_cla (`bool`, *optional*, defaults to `False`):
86
+ Whether to use CLA in attention
87
+ cla_share_factor (`int`, *optional*, defaults to 1):
88
+ The share factor of CLA
89
+ num_experts (`int` or `List`, *optional*, defaults to 1):
90
+ The number of experts for moe. If it is a list, it will be used as the number of experts for each layer.
91
+ num_shared_expert (`int` or `List`, *optional*, defaults to 1):
92
+ The number of shared experts for moe. If it is a list, it will be used as the number of shared experts for each layer.
93
+ moe_topk (`int` or `List`, *optional*, defaults to 1):
94
+ The topk value for moe. If it is a list, it will be used as the topk value for each layer.
95
+ capacity_factor (Not used) (`float` or `List`, *optional*, defaults to 1.0):
96
+ The capacity factor for moe. If it is a list, it will be used as the capacity factor for each layer.
97
+ moe_layer_num_skipped (`int`, *optional*, defaults to 0):
98
+ First moe_layer_num_skipped layers do not use MoE.
99
+ """
100
+
101
+ model_type = "hunyuan"
102
+ keys_to_ignore_at_inference = ["past_key_values"]
103
+
104
+ def __init__(
105
+ self,
106
+ vocab_size=290943,
107
+ org_vocab_size=290943,
108
+ hidden_size=4096,
109
+ intermediate_size: int=11008,
110
+ moe_intermediate_size: Union[int, List]=None,
111
+ num_hidden_layers=32,
112
+ num_attention_heads=32,
113
+ num_key_value_heads=None,
114
+ attention_head_dim=None,
115
+ hidden_act="silu",
116
+ max_position_embeddings=2048,
117
+ initializer_range=0.02,
118
+ rms_norm_eps=1e-5,
119
+ use_cache=True,
120
+ pad_token_id=0,
121
+ bos_token_id=1,
122
+ eos_token_id=2,
123
+ eod_token_id=3,
124
+ sep_token_id=4,
125
+ im_start_id=5,
126
+ im_end_id=6,
127
+ text_start_id=7,
128
+ text_end_id=8,
129
+ image_token_id=9,
130
+ video_start_id=10,
131
+ video_end_id=11,
132
+ im_newline_id=12,
133
+ mask_init_id=13,
134
+ pretraining_tp=1,
135
+ tie_word_embeddings=False,
136
+ rope_theta=10000.0,
137
+ rope_scaling=None,
138
+ attention_bias=False,
139
+ mlp_bias=False,
140
+ attention_dropout=0.0,
141
+ use_qk_norm=False,
142
+ use_rotary_pos_emb=True,
143
+ use_cla=False,
144
+ cla_share_factor=1,
145
+ norm_type="hf_rms",
146
+ num_experts: Union[int, List]=1,
147
+ use_mixed_mlp_moe=False,
148
+ num_shared_expert: Union[int, List]=1,
149
+ moe_topk: Union[int, List]=1,
150
+ # capacity_factor: Union[int, List]=1.0,
151
+ moe_drop_tokens=False,
152
+ moe_random_routing_dropped_token=False,
153
+ use_mla=False,
154
+ kv_lora_rank=512,
155
+ q_lora_rank=1536,
156
+ qk_rope_head_dim=64,
157
+ v_head_dim=128,
158
+ qk_nope_head_dim=128,
159
+ moe_layer_num_skipped=0,
160
+ norm_topk_prob=True,
161
+ routed_scaling_factor=1.0,
162
+ group_limited_greedy=False,
163
+ n_group=None,
164
+ topk_group=None,
165
+ vit_path=None,
166
+ num_media_embeds=257,
167
+ vit_type="AnyResVit",
168
+ vit_input_resolution=224,
169
+ vit_token=64,
170
+ vit_patch=1,
171
+ vit_mapping_type="simple_conv_mlp",
172
+ vit_norm_type="fused",
173
+ vit_used_rms_norm=True,
174
+ vit_remove_prenorm=True,
175
+ vit_add_patchemb_bias=True,
176
+ anyres_vit_max_image_size=2048,
177
+ anyres_pooling_size=2,
178
+ anyres_vit_two_views=False,
179
+ skip_cls_token=False,
180
+ position_embedding_xdrope=False,
181
+ xdrope_section=None,
182
+ add_classification_head=False,
183
+ class_num=0,
184
+ pool_type="last",
185
+ pad_id=-1,
186
+ **kwargs,
187
+ ):
188
+ self.vocab_size = vocab_size
189
+ self.org_vocab_size = org_vocab_size
190
+ self.max_position_embeddings = max_position_embeddings
191
+ self.hidden_size = hidden_size
192
+ self.intermediate_size = intermediate_size
193
+ self.moe_intermediate_size = moe_intermediate_size
194
+ self.num_hidden_layers = num_hidden_layers
195
+ self.num_attention_heads = num_attention_heads
196
+ self.num_experts = num_experts
197
+ self.use_mixed_mlp_moe = use_mixed_mlp_moe
198
+ self.num_shared_expert = num_shared_expert
199
+ self.moe_topk = moe_topk
200
+ # self.capacity_factor = capacity_factor
201
+ self.moe_drop_tokens = moe_drop_tokens
202
+ self.moe_random_routing_dropped_token = moe_random_routing_dropped_token
203
+
204
+ if attention_head_dim is not None:
205
+ self.attention_head_dim = attention_head_dim
206
+ else:
207
+ self.attention_head_dim = self.hidden_size // num_attention_heads
208
+
209
+ # for backward compatibility
210
+ if num_key_value_heads is None:
211
+ num_key_value_heads = num_attention_heads
212
+
213
+ self.num_key_value_heads = num_key_value_heads
214
+ self.hidden_act = hidden_act
215
+ self.initializer_range = initializer_range
216
+ self.rms_norm_eps = rms_norm_eps
217
+ self.pretraining_tp = pretraining_tp
218
+ self.use_cache = use_cache
219
+ self.rope_theta = rope_theta
220
+ self.rope_scaling = rope_scaling
221
+ # self._rope_scaling_validation() # TODO: Need validation?
222
+ self.attention_bias = attention_bias
223
+ self.mlp_bias = mlp_bias
224
+ self.attention_dropout = attention_dropout
225
+ self.use_qk_norm = use_qk_norm
226
+ self.use_rotary_pos_emb = use_rotary_pos_emb
227
+ self.use_cla = use_cla
228
+ self.cla_share_factor = cla_share_factor
229
+ self.norm_type = norm_type
230
+ # MLA args
231
+ self.use_mla = use_mla
232
+ self.kv_lora_rank = kv_lora_rank
233
+ self.q_lora_rank = q_lora_rank
234
+ self.qk_rope_head_dim = qk_rope_head_dim
235
+ self.qk_nope_head_dim = qk_nope_head_dim
236
+ self.v_head_dim = v_head_dim
237
+
238
+ # DeepSeek related args
239
+ self.moe_layer_num_skipped = moe_layer_num_skipped
240
+ self.norm_topk_prob = norm_topk_prob
241
+ self.routed_scaling_factor = routed_scaling_factor
242
+ self.group_limited_greedy = group_limited_greedy
243
+ self.n_group = n_group
244
+ self.topk_group = topk_group
245
+ self.add_classification_head = add_classification_head
246
+ self.class_num = class_num
247
+ self.pool_type = pool_type
248
+ self.pad_id = pad_id
249
+
250
+ if self.class_num is not None:
251
+ self.dense_list = [self.hidden_size, self.class_num]
252
+
253
+ # Vit args
254
+ self.vit_path = vit_path
255
+ self.num_media_embeds = num_media_embeds
256
+ self.vit_type = vit_type
257
+ self.vit_input_resolution = vit_input_resolution
258
+ self.vit_token = vit_token
259
+ self.vit_patch = vit_patch
260
+ self.vit_mapping_type = vit_mapping_type
261
+ self.vit_norm_type = vit_norm_type
262
+ self.vit_used_rms_norm = vit_used_rms_norm
263
+ self.vit_remove_prenorm = vit_remove_prenorm
264
+ self.vit_add_patchemb_bias = vit_add_patchemb_bias
265
+ self.anyres_vit_max_image_size = anyres_vit_max_image_size
266
+ self.anyres_pooling_size = anyres_pooling_size
267
+ self.anyres_vit_two_views = anyres_vit_two_views
268
+ self.skip_cls_token = skip_cls_token
269
+ self.position_embedding_xdrope = position_embedding_xdrope
270
+ self.xdrope_section = xdrope_section
271
+
272
+ # token id
273
+ self.eod_token_id = eod_token_id
274
+ self.im_start_id = im_start_id
275
+ self.im_end_id = im_end_id
276
+ self.text_start_id = text_start_id
277
+ self.text_end_id = text_end_id
278
+ self.image_token_id = image_token_id
279
+ self.video_start_id = video_start_id
280
+ self.video_end_id = video_end_id
281
+ self.im_newline_id = im_newline_id
282
+ self.mask_init_id = mask_init_id
283
+
284
+ super().__init__(
285
+ pad_token_id=pad_token_id,
286
+ bos_token_id=bos_token_id,
287
+ eos_token_id=eos_token_id,
288
+ sep_token_id=sep_token_id,
289
+ tie_word_embeddings=tie_word_embeddings,
290
+ **kwargs,
291
+ )
292
+
293
+ def _rope_scaling_validation(self):
294
+ """
295
+ Validate the `rope_scaling` configuration.
296
+ """
297
+ if self.rope_scaling is None:
298
+ return
299
+
300
+ if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
301
+ raise ValueError(
302
+ "`rope_scaling` must be a dictionary with two fields, `type` and `factor` or `type` and `alpha`, "
303
+ f"got {self.rope_scaling}"
304
+ )
305
+ rope_scaling_type = self.rope_scaling.get("type", None)
306
+ rope_scaling_factor = self.rope_scaling.get("factor", None)
307
+ rope_scaling_alpha = self.rope_scaling.get("alpha", None)
308
+ if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
309
+ raise ValueError(
310
+ f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
311
+ )
312
+ if rope_scaling_factor is None and rope_scaling_alpha is None:
313
+ raise ValueError("`rope_scaling` must have either a `factor` or an `alpha` field; got neither")
314
+ if rope_scaling_factor is not None:
315
+ if not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
316
+ raise ValueError(f"`rope_scaling`'s factor field must be a float > 1.0, got {rope_scaling_factor}")
317
+ if rope_scaling_alpha is not None:
318
+ if not isinstance(rope_scaling_alpha, float) or rope_scaling_alpha <= 1.0:
319
+ raise ValueError(f"`rope_scaling`'s alpha field must be a float > 1.0, got {rope_scaling_alpha}")
generation_config.json ADDED
@@ -0,0 +1,13 @@
1
+ {
2
+ "do_sample": true,
3
+ "eos_token_id": [
4
+ 127960,
5
+ 127967
6
+ ],
7
+ "pad_token_id": 127961,
8
+ "repetition_penalty": 1.05,
9
+ "temperature": 0.7,
10
+ "top_k": 20,
11
+ "top_p": 0.8,
12
+ "transformers_version": "4.53.0"
13
+ }
hunyuan.py ADDED
@@ -0,0 +1,851 @@
1
+ # coding=utf-8
2
+ # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
3
+ #
4
+ """ PyTorch HunYuan model."""
5
+
6
+ import math
7
+ import warnings
8
+ from typing import List, Optional, Tuple, Union
9
+
10
+ import torch
11
+ from torch import Tensor
12
+ import torch.nn.functional as F
13
+ import torch.utils.checkpoint
14
+ from torch import nn
15
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
16
+
17
+ from transformers.activations import ACT2FN
18
+ from transformers.cache_utils import Cache, DynamicCache
19
+ from transformers.modeling_attn_mask_utils import (
20
+ AttentionMaskConverter,
21
+ _prepare_4d_attention_mask,
22
+ _prepare_4d_causal_attention_mask,
23
+ _prepare_4d_causal_attention_mask_for_sdpa,
24
+ )
25
+ from transformers.modeling_outputs import (
26
+ BaseModelOutputWithPast,
27
+ CausalLMOutputWithPast,
28
+ SequenceClassifierOutputWithPast
29
+ )
30
+ from transformers.modeling_utils import PreTrainedModel
31
+ from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS, is_torch_greater_or_equal_than_1_13
32
+ from transformers.utils import (
33
+ add_start_docstrings,
34
+ add_start_docstrings_to_model_forward,
35
+ is_flash_attn_2_available,
36
+ is_flash_attn_greater_or_equal_2_10,
37
+ logging,
38
+ replace_return_docstrings,
39
+ )
40
+ from transformers.utils.import_utils import is_torch_fx_available
41
+ from transformers.generation.utils import GenerateOutput
42
+ from .configuration_hunyuan import HunYuanConfig
43
+ from .modeling_hunyuan import HunYuanDecoderLayer, HunYuanRMSNorm
44
+
45
+
46
+ if is_flash_attn_2_available():
47
+ from flash_attn import flash_attn_func, flash_attn_varlen_func
48
+ from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
49
+
50
+
51
+ # This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph.
52
+ # It means that the function will not be traced through and simply appear as a node in the graph.
53
+ if is_torch_fx_available():
54
+ if not is_torch_greater_or_equal_than_1_13:
55
+ import torch.fx
56
+
57
+ _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)
58
+
59
+ logger = logging.get_logger(__name__)
60
+
61
+ _CONFIG_FOR_DOC = "HunYuanConfig"
62
+
63
+
64
+ HUNYUAN_START_DOCSTRING = r"""
65
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
66
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
67
+ etc.)
68
+
69
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
70
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
71
+ and behavior.
72
+
73
+ Parameters:
74
+ config ([`HunYuanConfig`]):
75
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
76
+ load the weights associated with the model, only the configuration. Check out the
77
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
78
+ """
79
+
80
+
81
+ @add_start_docstrings(
82
+ "The bare HunYuan Model outputting raw hidden-states without any specific head on top.",
83
+ HUNYUAN_START_DOCSTRING,
84
+ )
85
+ class HunYuanPreTrainedModel(PreTrainedModel):
86
+ config_class = HunYuanConfig
87
+ base_model_prefix = "model"
88
+ supports_gradient_checkpointing = True
89
+ _no_split_modules = ["HunYuanDecoderLayer"]
90
+ _skip_keys_device_placement = "past_key_values"
91
+ _supports_flash_attn_2 = True
92
+ _supports_sdpa = True
93
+ _supports_cache_class = True
94
+
95
+ def _init_weights(self, module):
96
+ std = self.config.initializer_range
97
+ if isinstance(module, nn.Linear):
98
+ module.weight.data.normal_(mean=0.0, std=std)
99
+ if module.bias is not None:
100
+ module.bias.data.zero_()
101
+ elif isinstance(module, nn.Embedding):
102
+ module.weight.data.normal_(mean=0.0, std=std)
103
+ if module.padding_idx is not None:
104
+ module.weight.data[module.padding_idx].zero_()
105
+
106
+
107
+ HUNYUAN_INPUTS_DOCSTRING = r"""
108
+ Args:
109
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
110
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
111
+ it.
112
+
113
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
114
+ [`PreTrainedTokenizer.__call__`] for details.
115
+
116
+ [What are input IDs?](../glossary#input-ids)
117
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
118
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
119
+
120
+ - 1 for tokens that are **not masked**,
121
+ - 0 for tokens that are **masked**.
122
+
123
+ [What are attention masks?](../glossary#attention-mask)
124
+
125
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
126
+ [`PreTrainedTokenizer.__call__`] for details.
127
+
128
+ If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
129
+ `past_key_values`).
130
+
131
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
132
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
133
+ information on the default strategy.
134
+
135
+ - 1 indicates the head is **not masked**,
136
+ - 0 indicates the head is **masked**.
137
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
138
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
139
+ config.n_positions - 1]`.
140
+
141
+ [What are position IDs?](../glossary#position-ids)
142
+ past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
143
+ Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
144
+ blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
145
+ returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
146
+
147
+ Two formats are allowed:
148
+ - a [`~cache_utils.Cache`] instance;
149
+ - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
150
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
151
+ cache format.
152
+
153
+ The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
154
+ legacy cache format will be returned.
155
+
156
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
157
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
158
+ of shape `(batch_size, sequence_length)`.
159
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
160
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
161
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
162
+ model's internal embedding lookup matrix.
163
+ use_cache (`bool`, *optional*):
164
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
165
+ `past_key_values`).
166
+ output_attentions (`bool`, *optional*):
167
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
168
+ tensors for more detail.
169
+ output_hidden_states (`bool`, *optional*):
170
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
171
+ more detail.
172
+ return_dict (`bool`, *optional*):
173
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
174
+ """
175
+
176
+
177
+ @add_start_docstrings(
178
+ "The bare HunYuan Model outputting raw hidden-states without any specific head on top.",
179
+ HUNYUAN_START_DOCSTRING,
180
+ )
181
+ class HunYuanModel(HunYuanPreTrainedModel):
182
+ """
183
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`HunYuanDecoderLayer`]
184
+
185
+ Args:
186
+ config: HunYuanConfig
187
+ """
188
+
189
+ def __init__(self, config: HunYuanConfig):
190
+ super().__init__(config)
191
+ self.padding_idx = config.pad_token_id
192
+ self.vocab_size = config.vocab_size
193
+ self.add_classification_head = config.add_classification_head
194
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
195
+ self.layers = nn.ModuleList(
196
+ [HunYuanDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
197
+ )
198
+ self._use_sdpa = config._attn_implementation == "sdpa"
199
+ self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
200
+ if not config.add_classification_head:
201
+ self.norm = HunYuanRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
202
+
203
+ self.cla = config.use_cla
204
+ self.cla_share_factor = config.cla_share_factor
205
+
206
+ self.gradient_checkpointing = False
207
+ # Initialize weights and apply final processing
208
+ self.post_init()
209
+
210
+ def get_input_embeddings(self):
211
+ return self.embed_tokens
212
+
213
+ def set_input_embeddings(self, value):
214
+ self.embed_tokens = value
215
+
216
+ @add_start_docstrings_to_model_forward(HUNYUAN_INPUTS_DOCSTRING)
217
+ def forward(
218
+ self,
219
+ input_ids: torch.LongTensor = None,
220
+ attention_mask: Optional[torch.Tensor] = None,
221
+ position_ids: Optional[torch.LongTensor] = None,
222
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
223
+ inputs_embeds: Optional[torch.FloatTensor] = None,
224
+ use_cache: Optional[bool] = None,
225
+ output_attentions: Optional[bool] = None,
226
+ output_hidden_states: Optional[bool] = None,
227
+ return_dict: Optional[bool] = None,
228
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
229
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
230
+ output_hidden_states = (
231
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
232
+ )
233
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
234
+
235
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
236
+
237
+ # retrieve input_ids and inputs_embeds
238
+ # if input_ids is not None and inputs_embeds is not None:
239
+ # raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
240
+ if input_ids is not None:
241
+ batch_size, seq_length = input_ids.shape[:2]
242
+ elif inputs_embeds is not None:
243
+ batch_size, seq_length = inputs_embeds.shape[:2]
244
+ else:
245
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
246
+
247
+ if self.gradient_checkpointing and self.training:
248
+ if use_cache:
249
+ logger.warning_once(
250
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
251
+ )
252
+ use_cache = False
253
+
254
+ past_key_values_length = 0
255
+ if use_cache:
256
+ use_legacy_cache = not isinstance(past_key_values, Cache)
257
+ if use_legacy_cache:
258
+ past_key_values = DynamicCache.from_legacy_cache(past_key_values)
259
+ past_key_values_length = past_key_values.get_usable_length(seq_length)
260
+
261
+ if position_ids is None:
262
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
263
+ position_ids = torch.arange(
264
+ past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
265
+ )
266
+ position_ids = position_ids.unsqueeze(0)
267
+
268
+ if inputs_embeds is None:
269
+ inputs_embeds = self.embed_tokens(input_ids)
270
+
271
+ # Fix lora with gradient checkpointing training
272
+ if self.training and inputs_embeds.is_leaf:
273
+ inputs_embeds.requires_grad = True
274
+
275
+ if self._use_flash_attention_2:
276
+ # 2d mask is passed through the layers
277
+ attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
278
+ elif self._use_sdpa and not output_attentions:
279
+ # output_attentions=True can not be supported when using SDPA, and we fall back on
280
+ # the manual implementation that requires a 4D causal mask in all cases.
281
+ attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
282
+ attention_mask,
283
+ (batch_size, seq_length),
284
+ inputs_embeds,
285
+ past_key_values_length,
286
+ )
287
+ else:
288
+ # 4d mask is passed through the layers
289
+ attention_mask = _prepare_4d_causal_attention_mask(
290
+ attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
291
+ )
292
+
293
+ # embed positions
294
+ hidden_states = inputs_embeds
295
+
296
+ # decoder layers
297
+ all_hidden_states = () if output_hidden_states else None
298
+ all_self_attns = () if output_attentions else None
299
+ next_decoder_cache = None
300
+
301
+ prev_kv_states = None
302
+ for layer_idx, decoder_layer in enumerate(self.layers):
303
+ if output_hidden_states:
304
+ all_hidden_states += (hidden_states,)
305
+
306
+ if self.gradient_checkpointing and self.training:
307
+ layer_outputs = self._gradient_checkpointing_func(
308
+ decoder_layer.__call__,
309
+ hidden_states,
310
+ attention_mask,
311
+ position_ids,
312
+ past_key_values,
313
+ output_attentions,
314
+ use_cache,
315
+ prev_kv_states,
316
+ )
317
+ else:
318
+ layer_outputs = decoder_layer(
319
+ hidden_states,
320
+ attention_mask=attention_mask,
321
+ position_ids=position_ids,
322
+ past_key_value=past_key_values,
323
+ output_attentions=output_attentions,
324
+ use_cache=use_cache,
325
+ kv_states=prev_kv_states
326
+ )
327
+
328
+ hidden_states = layer_outputs[0]
329
+
330
+ if use_cache:
331
+ next_decoder_cache = layer_outputs[2 if output_attentions else 1]
332
+
333
+ if output_attentions:
334
+ all_self_attns += (layer_outputs[1],)
335
+
336
+ kv_states = layer_outputs[-1]
337
+
338
+ if self.cla and layer_idx % self.cla_share_factor == 0:
339
+ prev_kv_states = kv_states
340
+ if not self.add_classification_head:
341
+ hidden_states = self.norm(hidden_states)
342
+
343
+ # add hidden states from the last decoder layer
344
+ if output_hidden_states:
345
+ all_hidden_states += (hidden_states,)
346
+
347
+ next_cache = None
348
+ if use_cache:
349
+ next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
350
+ if not return_dict:
351
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
352
+ return BaseModelOutputWithPast(
353
+ last_hidden_state=hidden_states,
354
+ past_key_values=next_cache,
355
+ hidden_states=all_hidden_states,
356
+ attentions=all_self_attns,
357
+ )
358
+
359
+
360
+ class HunYuanMoEV1ForCausalLM(HunYuanPreTrainedModel):
361
+ _tied_weights_keys = ["lm_head.weight"]
362
+
363
+ def __init__(self, config: HunYuanConfig):
364
+ super().__init__(config)
365
+
366
+ self.config = config
367
+ self.model = HunYuanModel(config)
368
+ self.add_classification_head = config.add_classification_head
369
+ self.pad_id = config.pad_id
370
+ self.vocab_size = config.vocab_size
371
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
372
+ if config.add_classification_head:
373
+ self.pool_head = nn.Linear(config.hidden_size, config.hidden_size, bias=False)
374
+ self.pool_head2 = nn.Linear(config.hidden_size, config.class_num, bias=False)
375
+ # Initialize weights and apply final processing
376
+ self.post_init()
377
+
378
+ def get_input_embeddings(self):
379
+ return self.model.embed_tokens
380
+
381
+ def set_input_embeddings(self, value):
382
+ self.model.embed_tokens = value
383
+
384
+ def get_output_embeddings(self):
385
+ return self.lm_head
386
+
387
+ def set_output_embeddings(self, new_embeddings):
388
+ self.lm_head = new_embeddings
389
+
390
+ def set_decoder(self, decoder):
391
+ self.model = decoder
392
+
393
+ def get_decoder(self):
394
+ return self.model
395
+
396
+ @add_start_docstrings_to_model_forward(HUNYUAN_INPUTS_DOCSTRING)
397
+ @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
398
+ def forward(
399
+ self,
400
+ input_ids: torch.LongTensor = None,
401
+ attention_mask: Optional[torch.Tensor] = None,
402
+ position_ids: Optional[torch.LongTensor] = None,
403
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
404
+ inputs_embeds: Optional[torch.FloatTensor] = None,
405
+ labels: Optional[torch.LongTensor] = None,
406
+ use_cache: Optional[bool] = None,
407
+ output_attentions: Optional[bool] = None,
408
+ output_hidden_states: Optional[bool] = None,
409
+ return_dict: Optional[bool] = None,
410
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
411
+ r"""
412
+ Args:
413
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
414
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
415
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
416
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
417
+
418
+ Returns:
419
+
420
+ Example:
421
+
422
+ ```python
423
+ >>> from transformers import AutoTokenizer, AutoModelForCausalLM
424
+
425
+ >>> model = AutoModelForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
426
+ >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
427
+
428
+ >>> prompt = "Hey, are you conscious? Can you talk to me?"
429
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
430
+
431
+ >>> # Generate
432
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
433
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
434
+ "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
435
+ ```"""
436
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
437
+ output_hidden_states = (
438
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
439
+ )
440
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
441
+
442
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
443
+ outputs = self.model(
444
+ input_ids=input_ids,
445
+ attention_mask=attention_mask,
446
+ position_ids=position_ids,
447
+ past_key_values=past_key_values,
448
+ inputs_embeds=inputs_embeds,
449
+ use_cache=use_cache,
450
+ output_attentions=output_attentions,
451
+ output_hidden_states=output_hidden_states,
452
+ return_dict=return_dict,
453
+ )
454
+
455
+ hidden_states = outputs[0]
456
+
457
+ if not self.add_classification_head:
458
+ if self.config.pretraining_tp > 1:
459
+ lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
460
+ logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
461
+ logits = torch.cat(logits, dim=-1)
462
+ else:
463
+ logits = self.lm_head(hidden_states)
464
+ logits = logits.float()
465
+ else:
466
+ logits = hidden_states
467
+ logits = logits.float()
468
+ pooled_output = self.pool_head(logits)
469
+ pooled_output = torch.tanh(pooled_output)
470
+ pooled_output = self.pool_head2(pooled_output).contiguous() # bs * class_num
471
+ if len(pooled_output.shape) < 2:
472
+ raise ValueError("pooled_output does not have enough dimensions for transpose")
473
+
474
+ if self.config.pool_type == "mean":
475
+ reward = pooled_output.mean(dim=1).squeeze(-1)
476
+ elif self.config.pool_type == "last":
477
+ # bs * hidden_size
478
+ seq_length = (input_ids != self.pad_id).long().sum(dim=1) - 1
479
+ batch_size = input_ids.size(0)
480
+ reward = pooled_output[torch.arange(batch_size, device=pooled_output.device), seq_length].squeeze(-1)
481
+ else:
482
+ reward = pooled_output[:, 0].squeeze(-1)
483
+
484
+ loss = None
485
+ if labels is not None:
486
+ # Shift so that tokens < n predict n
487
+ shift_logits = logits[..., :-1, :].contiguous()
488
+ shift_labels = labels[..., 1:].contiguous()
489
+ # Flatten the tokens
490
+ loss_fct = CrossEntropyLoss()
491
+ shift_logits = shift_logits.reshape(-1, self.config.vocab_size)
492
+ shift_labels = shift_labels.reshape(-1)
493
+ # Enable model parallelism
494
+ shift_labels = shift_labels.to(shift_logits.device)
495
+ loss = loss_fct(shift_logits, shift_labels)
496
+
497
+ if not return_dict:
498
+ output = (logits,) + outputs[1:]
499
+ return (loss,) + output if loss is not None else output
500
+
501
+ output = CausalLMOutputWithPast(
502
+ loss=loss,
503
+ logits=logits,
504
+ past_key_values=outputs.past_key_values,
505
+ hidden_states=outputs.hidden_states,
506
+ attentions=outputs.attentions,
507
+ )
508
+ if self.add_classification_head:
509
+ output['reward'] = reward
510
+
511
+ return output
512
+
513
+ def prepare_inputs_for_generation(
514
+ self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
515
+ ):
516
+ if past_key_values is not None:
517
+ if isinstance(past_key_values, Cache):
518
+ cache_length = past_key_values.get_seq_length()
519
+ past_length = past_key_values.seen_tokens
520
+ max_cache_length = past_key_values.get_max_cache_shape()
521
+ else:
522
+ cache_length = past_length = past_key_values[0][0].shape[2]
523
+ max_cache_length = None
524
+
525
+ # Keep only the unprocessed tokens:
526
+ # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
527
+ # some of the inputs are exclusivelly passed as part of the cache (e.g. when passing input_embeds as
528
+ # input)
529
+ if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
530
+ input_ids = input_ids[:, -(attention_mask.shape[1] - past_length):]
531
+ # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
532
+ # input_ids based on the past_length.
533
+ elif past_length < input_ids.shape[1]:
534
+ input_ids = input_ids[:, past_length:]
535
+ # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
536
+
537
+ # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
538
+ if (
539
+ max_cache_length is not None
540
+ and attention_mask is not None
541
+ and cache_length + input_ids.shape[1] > max_cache_length
542
+ ):
543
+ attention_mask = attention_mask[:, -max_cache_length:]
544
+
545
+ position_ids = kwargs.get("position_ids", None)
546
+ if attention_mask is not None and position_ids is None:
547
+ # create position_ids on the fly for batch generation
548
+ position_ids = attention_mask.long().cumsum(-1) - 1
549
+ position_ids.masked_fill_(attention_mask == 0, 1)
550
+ if past_key_values:
551
+ position_ids = position_ids[:, -input_ids.shape[1]:]
552
+
553
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
554
+ if inputs_embeds is not None and past_key_values is None:
555
+ model_inputs = {"inputs_embeds": inputs_embeds}
556
+ else:
557
+ model_inputs = {"input_ids": input_ids}
558
+
559
+ model_inputs.update(
560
+ {
561
+ "position_ids": position_ids,
562
+ "past_key_values": past_key_values,
563
+ "use_cache": kwargs.get("use_cache"),
564
+ "attention_mask": attention_mask,
565
+ }
566
+ )
567
+ return model_inputs
568
+
569
+ @staticmethod
570
+ def _reorder_cache(past_key_values, beam_idx):
571
+ reordered_past = ()
572
+ for layer_past in past_key_values:
573
+ reordered_past += (
574
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
575
+ )
576
+ return reordered_past
577
+
578
+
579
+ class MultimodelHunYuanForCausalLM(HunYuanMoEV1ForCausalLM):
580
+ _tied_weights_keys = ["lm_head.weight"]
581
+
582
+ def __init__(self, config: HunYuanConfig):
583
+ super().__init__(config)
584
+
585
+ @add_start_docstrings_to_model_forward(HUNYUAN_INPUTS_DOCSTRING)
586
+ @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
587
+ def forward(
588
+ self,
589
+ input_ids: torch.LongTensor = None,
590
+ attention_mask: Optional[torch.Tensor] = None,
591
+ position_ids: Optional[torch.LongTensor] = None,
592
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
593
+ inputs_embeds: Optional[torch.FloatTensor] = None,
594
+ labels: Optional[torch.LongTensor] = None,
595
+ imgs: Optional[List[torch.FloatTensor]] = None,
596
+ imgs_pos: Optional[List[int]] = None,
597
+ use_cache: Optional[bool] = None,
598
+ output_attentions: Optional[bool] = None,
599
+ output_hidden_states: Optional[bool] = None,
600
+ return_dict: Optional[bool] = None,
601
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
602
+ r"""
603
+ Args:
604
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
605
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
606
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
607
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
608
+
609
+ Returns:
610
+
611
+ Example:
612
+
613
+ ```python
614
+ >>> from transformers import AutoTokenizer, AutoModelForCausalLM
615
+
616
+ >>> model = AutoModelForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
617
+ >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
618
+
619
+ >>> prompt = "Hey, are you conscious? Can you talk to me?"
620
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
621
+
622
+ >>> # Generate
623
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
624
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
625
+ "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
626
+ ```"""
627
+ mask_init_id = self.config.mask_init_id
628
+ pad_id = self.config.pad_token_id
629
+ eod_id = self.config.eod_token_id
630
+ image_token_id = self.config.image_token_id
631
+ im_start_id = self.config.im_start_id
632
+ im_end_id = self.config.im_end_id
633
+ video_start_id = self.config.video_start_id
634
+ video_end_id = self.config.video_end_id
635
+
636
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
637
+ output_hidden_states = (
638
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
639
+ )
640
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
641
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
642
+
643
+ outputs = self.model(
644
+ input_ids=input_ids,
645
+ attention_mask=attention_mask,
646
+ position_ids=position_ids,
647
+ past_key_values=past_key_values,
648
+ inputs_embeds=inputs_embeds,
649
+ use_cache=use_cache,
650
+ output_attentions=output_attentions,
651
+ output_hidden_states=output_hidden_states,
652
+ return_dict=return_dict,
653
+ )
654
+
655
+ hidden_states = outputs[0]
656
+ if self.config.pretraining_tp > 1:
657
+ lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
658
+ logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
659
+ logits = torch.cat(logits, dim=-1)
660
+ else:
661
+ logits = self.lm_head(hidden_states)
662
+ logits = logits.float()
663
+
664
+ loss = None
665
+ if labels is not None:
666
+ labels = labels.to(logits.device)
667
+ # NOTE: no shift is applied here; labels are expected to already be aligned with the logits
668
+ shift_logits = logits
669
+ shift_labels = labels
670
+ # Flatten the tokens
671
+ loss_fct = CrossEntropyLoss()
672
+ shift_logits = shift_logits.reshape(-1, self.config.vocab_size)
673
+ shift_labels = shift_labels.reshape(-1)
674
+ shift_tokens = input_ids.reshape(-1)
675
+ # compute loss
676
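+ # Only positions whose label is an ordinary text token (id below mask_init_id and not a pad / image / video marker) and whose input token is neither padding nor EOD contribute to the loss.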
+ mask = (shift_labels < mask_init_id) & (shift_labels != pad_id) & (shift_labels != image_token_id) & (shift_labels != im_start_id) \
677
+ & (shift_labels != im_end_id) & (shift_labels != video_start_id) & (shift_labels != video_end_id) & (shift_tokens != pad_id) & (shift_tokens != eod_id)
678
+ shift_logits = shift_logits[mask, :]
679
+ shift_labels = shift_labels[mask]
680
+ loss = loss_fct(shift_logits, shift_labels)
681
+
682
+ if not return_dict:
683
+ output = (logits,) + outputs[1:]
684
+ return (loss,) + output if loss is not None else output
685
+
686
+ return CausalLMOutputWithPast(
687
+ loss=loss,
688
+ logits=logits,
689
+ past_key_values=outputs.past_key_values,
690
+ hidden_states=outputs.hidden_states,
691
+ attentions=outputs.attentions,
692
+ )
693
+
694
+ def prepare_inputs_for_generation(
695
+ self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
696
+ ):
697
+ imgs = kwargs.pop("imgs", None)
698
+ imgs_pos = kwargs.pop("imgs_pos", None)
699
+ inputs = super().prepare_inputs_for_generation(
700
+ input_ids, past_key_values=past_key_values, attention_mask=attention_mask, inputs_embeds=inputs_embeds, **kwargs
701
+ )
702
+
703
+ if imgs is not None:
704
+ inputs['imgs'] = imgs
705
+ if imgs_pos is not None:
706
+ inputs['imgs_pos'] = imgs_pos
707
+ return inputs
708
+
709
+ @torch.no_grad()
710
+ def generate(
711
+ self,
712
+ inputs: Optional[torch.Tensor] = None,
713
+ attention_mask: Optional[torch.Tensor] = None,
714
+ position_ids: Optional[torch.LongTensor] = None,
715
+ imgs: Optional[List[torch.FloatTensor]] = None,
716
+ imgs_pos: Optional[List[int]] = None,
717
+ **kwargs,
718
+ ) -> Union[GenerateOutput, torch.LongTensor]:
719
+ if "inputs_embeds" in kwargs:
720
+ raise NotImplementedError("`inputs_embeds` is not supported")
721
+
722
+ return super().generate(
723
+ inputs=inputs,
724
+ position_ids=position_ids,
725
+ attention_mask=attention_mask,
726
+ # `inputs_embeds` is rejected above; forward the vision inputs instead so prepare_inputs_for_generation can pick them up
+ imgs=imgs,
+ imgs_pos=imgs_pos,
727
+ eos_token_id=self.config.eod_token_id,
728
+ **kwargs
729
+ )
730
+
731
+
732
+ @add_start_docstrings(
733
+ """
734
+ The HunYuan Model transformer with a sequence classification head on top (linear layer).
735
+
736
+ [`HunYuanForSequenceClassification`] uses the last token in order to do the classification, as other causal models
737
+ (e.g. GPT-2) do.
738
+
739
+ Since it does classification on the last token, it needs to know the position of the last token. If a
740
+ `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
741
+ no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
742
+ padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
743
+ each row of the batch).
744
+ """,
745
+ HUNYUAN_START_DOCSTRING,
746
+ )
747
+ class HunYuanForSequenceClassification(HunYuanPreTrainedModel):
748
+ def __init__(self, config):
749
+ super().__init__(config)
750
+ self.num_labels = config.num_labels
751
+ self.model = HunYuanModel(config)
752
+ self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
753
+
754
+ # Initialize weights and apply final processing
755
+ self.post_init()
756
+
757
+ def get_input_embeddings(self):
758
+ return self.model.embed_tokens
759
+
760
+ def set_input_embeddings(self, value):
761
+ self.model.embed_tokens = value
762
+
763
+ @add_start_docstrings_to_model_forward(HUNYUAN_INPUTS_DOCSTRING)
764
+ def forward(
765
+ self,
766
+ input_ids: torch.LongTensor = None,
767
+ attention_mask: Optional[torch.Tensor] = None,
768
+ position_ids: Optional[torch.LongTensor] = None,
769
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
770
+ inputs_embeds: Optional[torch.FloatTensor] = None,
771
+ labels: Optional[torch.LongTensor] = None,
772
+ use_cache: Optional[bool] = None,
773
+ output_attentions: Optional[bool] = None,
774
+ output_hidden_states: Optional[bool] = None,
775
+ return_dict: Optional[bool] = None,
776
+ ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
777
+ r"""
778
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
779
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
780
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
781
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
782
+ """
783
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
784
+
785
+ transformer_outputs = self.model(
786
+ input_ids,
787
+ attention_mask=attention_mask,
788
+ position_ids=position_ids,
789
+ past_key_values=past_key_values,
790
+ inputs_embeds=inputs_embeds,
791
+ use_cache=use_cache,
792
+ output_attentions=output_attentions,
793
+ output_hidden_states=output_hidden_states,
794
+ return_dict=return_dict,
795
+ )
796
+ hidden_states = transformer_outputs[0]
797
+ logits = self.score(hidden_states)
798
+
799
+ if input_ids is not None:
800
+ batch_size = input_ids.shape[0]
801
+ else:
802
+ batch_size = inputs_embeds.shape[0]
803
+
804
+ if self.config.pad_token_id is None and batch_size != 1:
805
+ raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
806
+ if self.config.pad_token_id is None:
807
+ sequence_lengths = -1
808
+ else:
809
+ if input_ids is not None:
810
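+ # argmax over (token == pad_token_id) gives the first padding position; subtracting 1 yields the last real token (or -1, i.e. the final position, when no padding is present).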
+ sequence_lengths = (torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1).to(
811
+ logits.device
812
+ )
813
+ else:
814
+ sequence_lengths = -1
815
+
816
+ pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
817
+
818
+ loss = None
819
+ if labels is not None:
820
+ labels = labels.to(logits.device)
821
+ if self.config.problem_type is None:
822
+ if self.num_labels == 1:
823
+ self.config.problem_type = "regression"
824
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
825
+ self.config.problem_type = "single_label_classification"
826
+ else:
827
+ self.config.problem_type = "multi_label_classification"
828
+
829
+ if self.config.problem_type == "regression":
830
+ loss_fct = MSELoss()
831
+ if self.num_labels == 1:
832
+ loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
833
+ else:
834
+ loss = loss_fct(pooled_logits, labels)
835
+ elif self.config.problem_type == "single_label_classification":
836
+ loss_fct = CrossEntropyLoss()
837
+ loss = loss_fct(pooled_logits.reshape(-1, self.num_labels), labels.reshape(-1))
838
+ elif self.config.problem_type == "multi_label_classification":
839
+ loss_fct = BCEWithLogitsLoss()
840
+ loss = loss_fct(pooled_logits, labels)
841
+ if not return_dict:
842
+ output = (pooled_logits,) + transformer_outputs[1:]
843
+ return ((loss,) + output) if loss is not None else output
844
+
845
+ return SequenceClassifierOutputWithPast(
846
+ loss=loss,
847
+ logits=pooled_logits,
848
+ past_key_values=transformer_outputs.past_key_values,
849
+ hidden_states=transformer_outputs.hidden_states,
850
+ attentions=transformer_outputs.attentions,
851
+ )
hy.tiktoken ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3057ab04da86a1081276359ba6a8a57cdaafa4d411c13507b96f6cc333644fbd
3
+ size 4985270248
model-00002-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3018440dddf88261fec7b6df7b6ff504a1f3d8cc8d7caef4f98dd812369331ab
3
+ size 4992312432
model-00003-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3b9f3e4ff67a2f3c3c24bdf8ecac1333b879c86b6dd6b649458cc9d17b21a1d
3
+ size 4992312432
model-00004-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad0065c21c67464110e41ce52b1246c74f8aebd44752009fd66f0b1ab315eceb
3
+ size 4992312432
model-00005-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce245f6cf1e6991be4eaf0353c5204e0c44fc068363135f4476544e914b56c69
3
+ size 4992312432
model-00006-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8bbd1ea7996be3c9a6a0da8b544e402acde048625307b0c5598c838b264ad75
3
+ size 4992312432
model-00007-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9aebaab82552d506cdff90212a1e06c51ca2ceeaab354b8d14301d23e7cb82a6
3
+ size 4992312432
model-00008-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aec67ab9212f8b8c7e0fc724b12f2085015e13747ba13da336d6bad44881c021
3
+ size 4992312432
model-00009-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:399a73e4f8426029d829755dc48edc0992c20aa575139fe3f13d1be9bf4f2934
3
+ size 4992312432
model-00010-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d226a5cf2b8b789b7121ab063bc2db1ad07818889cf776cea7c064ab8bad58a
3
+ size 4992312432
model-00011-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4cc756f423124a3589bc4203bff1db6b8f8e303bf7e3a8ef24d347ab76fb50ae
3
+ size 4992312600
model-00012-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:48f52fab039ab7ae66229b9a7367b2b828f0c87b04d223252dd3bf7dabe085b7
3
+ size 4992312632
model-00013-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b0eb025d5fcec82c706972bd68f34249823f40c1b3effeacb7cc99438e6ee5b6
3
+ size 4992312632
model-00014-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:274e299f31c71a3292fe878ee3cab3485b2a3fab6dec9ef8bc907158ac30a271
3
+ size 4992312632
model-00015-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a99f9737b1914b8c1ce27d50f9d40a645ff33d525b322d3162e88e7ba0a53a1
3
+ size 4992312632
model-00016-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea5ae9a35dc905d8f32388e2d1b0e3417910211dcc8b25f8ab0424f94507a672
3
+ size 4992312632
model-00017-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9be97c260719cd6da52d1e30bc34dd5b1844ac86640bc75d7284dc65f911019c
3
+ size 4992312632
model-00018-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:074594f627910e48d5d86aac81424f0acbc3e354c635afe01a20c3b8525a3647
3
+ size 4992312632
model-00019-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65cfedeeed8f1d0c85f36eb8cca2653151a3c7597d574e96c7747aaf05b5c6eb
3
+ size 4992312632
model-00020-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8aaa1fb52cf9c9f588653222ec0b88ebe13f553a70f96aa8d50d013824d6d9d1
3
+ size 4992312632
model-00021-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b445fe65502467a3085397602f17223a38e71b66692af9cc644cfbe72b920d69
3
+ size 4992312632
model-00022-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:761851085e1732d60d2b1572b38f1ecdab2c8cba3656ee938e5af7133391d999
3
+ size 4992312632
model-00023-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1dadf06c900e764fcdedee6e82a53729c108742c108550f85bdf8e31f72570bd
3
+ size 4992312632
model-00024-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5d2cedd5dfb913d1dcd1d7b577f13f5cb0f1ffc05044adc5e242589c9c35d63
3
+ size 4992312632
model-00025-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59ded8fd9582ce18e20af6aac5b7b6f6e98d1804d6e220fae669836c8ef733b7
3
+ size 4992312632
model-00026-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c50508a23df852b3c5997bad69d0419ec7ed3a5810f16b39857f69d78576131f
3
+ size 4992312632
model-00027-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f8f918076d44128dec6c078e5f2b94b567f6d5b9fcf2fddf7ad81e9a1bc4341
3
+ size 4992312632
model-00028-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e138c2158aa0a4a5087b3b99c31e553e676898acad1040abc42ba8b61035a015
3
+ size 4992312632
model-00029-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1efa53df7fbc9271f5cf99a6654972c50c630710efe45162b27d5cff288e37ee
3
+ size 4992312632
model-00030-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ffa461ecfe206120663525656840b0d5577cfa460fd3e7ff297d62f0e5dfb990
3
+ size 4992312632
model-00031-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be3ab112fc526c7c087f671dfa893e217af729bc3fa55ceed9b3b51dd9470bc6
3
+ size 4992312632
model-00032-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:30ce68700e6981065d55e45c95524e3974028ea95af178885da42eebd3cb22a7
3
+ size 4992312632
model-00033-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb33d5b889131b4b1864fe947b516fbde8e7fb6f60b45a12bc61a2d0bf09f4cb
3
+ size 1056994712
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
modeling_hunyuan.py ADDED
@@ -0,0 +1,1728 @@
1
+ # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
2
+ #
3
+ # Licensed under the TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # https://github.com/Tencent/Tencent-Hunyuan-Large/blob/main/License.docx
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ """ PyTorch HunYuan model."""
16
+
17
+ import math
18
+ import warnings
19
+ from typing import List, Optional, Tuple, Union
20
+
21
+ import torch
22
+ from torch import Tensor
23
+ import torch.nn.functional as F
24
+ import torch.utils.checkpoint
25
+ from torch import nn
26
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
27
+
28
+ from transformers.activations import ACT2FN
29
+ from transformers.cache_utils import Cache, DynamicCache
30
+ from transformers.modeling_attn_mask_utils import (
31
+ AttentionMaskConverter,
32
+ _prepare_4d_attention_mask,
33
+ _prepare_4d_causal_attention_mask,
34
+ _prepare_4d_causal_attention_mask_for_sdpa,
35
+ )
36
+ from transformers.modeling_outputs import (
37
+ BaseModelOutputWithPast,
38
+ CausalLMOutputWithPast,
39
+ SequenceClassifierOutputWithPast
40
+ )
41
+ from transformers.modeling_utils import PreTrainedModel
42
+ from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS, is_torch_greater_or_equal_than_1_13
43
+ from transformers.utils import (
44
+ add_start_docstrings,
45
+ add_start_docstrings_to_model_forward,
46
+ is_flash_attn_2_available,
47
+ is_flash_attn_greater_or_equal_2_10,
48
+ logging,
49
+ replace_return_docstrings,
50
+ )
51
+ from transformers.utils.import_utils import is_torch_fx_available
52
+ from .configuration_hunyuan import HunYuanConfig
53
+
54
+
55
+ if is_flash_attn_2_available():
56
+ from flash_attn import flash_attn_func, flash_attn_varlen_func
57
+ from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
58
+
59
+
60
+ # This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph.
61
+ # It means that the function will not be traced through and simply appear as a node in the graph.
62
+ if is_torch_fx_available():
63
+ if not is_torch_greater_or_equal_than_1_13:
64
+ import torch.fx
65
+
66
+ _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)
67
+
68
+
69
+ logger = logging.get_logger(__name__)
70
+
71
+ _CONFIG_FOR_DOC = "HunYuanConfig"
72
+
73
+
74
+ def topkgating(logits: Tensor, topk: int):
75
+ logits = logits.float()
76
+ gates = F.softmax(logits, dim=1)
77
+ # expert_capacity = topk * gates.shape[0]
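+ # Per-expert buffer size: topk * num_tokens // num_experts slots, but never fewer than topk.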
78
+ expert_capacity = max(topk, topk * gates.shape[0] // gates.shape[1])
79
+ num_experts = int(gates.shape[1])
80
+ # Top-k router probability and corresponding expert indices for each token.
81
+ # Shape: [tokens_per_group, num_selected_experts].
82
+ expert_gate, expert_index = torch.topk(gates, topk)
83
+ expert_mask = F.one_hot(expert_index, num_experts)
84
+ # For a given token, determine if it was routed to a given expert.
85
+ # Shape: [tokens_per_group, num_experts]
86
+ expert_mask_aux = expert_mask.max(dim=-2)[0]
87
+ tokens_per_group_and_expert = torch.mean(expert_mask_aux.float(), dim=-2)
88
+ router_prob_per_group_and_expert = torch.mean(gates.float(), dim=-2)
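+ # Auxiliary load-balancing loss: correlates the fraction of tokens sent to each expert with its mean router probability, pushing the router toward a uniform spread over experts.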
89
+ l_aux = num_experts**2 * torch.mean(tokens_per_group_and_expert * router_prob_per_group_and_expert)
90
+
91
+ gates_s = torch.clamp(
92
+ torch.matmul(expert_mask.float(), gates.unsqueeze(-1)).sum(dim=1), min=torch.finfo(gates.dtype).eps
93
+ )
94
+ router_probs = gates / gates_s
95
+ # Make num_selected_experts the leading axis to ensure that top-1 choices
96
+ # have priority over top-2 choices, which have priority over top-3 choices,
97
+ # etc.
98
+ expert_index = torch.transpose(expert_index, 0, 1)
99
+ # Shape: [num_selected_experts * tokens_per_group]
100
+ expert_index = expert_index.reshape(-1)
101
+
102
+ # Create mask out of indices.
103
+ # Shape: [tokens_per_group * num_selected_experts, num_experts].
104
+ expert_mask = F.one_hot(expert_index, num_experts).to(torch.int32)
105
+ exp_counts = torch.sum(expert_mask, dim=0).detach()
106
+
107
+ # Experts have a fixed capacity that we cannot exceed. A token's priority
108
+ # within the expert's buffer is given by the masked, cumulative capacity of
109
+ # its target expert.
110
+ # Shape: [tokens_per_group * num_selected_experts, num_experts].
111
+ token_priority = torch.cumsum(expert_mask, dim=0) * expert_mask - 1
112
+ # Shape: [num_selected_experts, tokens_per_group, num_experts].
113
+ token_priority = token_priority.reshape((topk, -1, num_experts))
114
+ # Shape: [tokens_per_group, num_selected_experts, num_experts].
115
+ token_priority = torch.transpose(token_priority, 0, 1)
116
+ # For each token, across all selected experts, select the only non-negative
117
+ # (unmasked) priority. Now, for group G routing to expert E, token T has
118
+ # non-negative priority (i.e. token_priority[G,T,E] >= 0) if and only if E
119
+ # is its targeted expert.
120
+ # Shape: [tokens_per_group, num_experts].
121
+ token_priority = torch.max(token_priority, dim=1)[0]
122
+
123
+ # Token T can only be routed to expert E if its priority is positive and
124
+ # less than the expert capacity. One-hot matrix will ignore indices outside
125
+ # the range [0, expert_capacity).
126
+ # Shape: [tokens_per_group, num_experts, expert_capacity].
127
+ valid_mask = torch.logical_and(token_priority >= 0, token_priority < expert_capacity)
128
+ token_priority = torch.masked_fill(token_priority, ~valid_mask, 0)
129
+ dispatch_mask = F.one_hot(token_priority, expert_capacity).to(torch.bool)
130
+ valid_mask = valid_mask.unsqueeze(-1).expand(-1, -1, expert_capacity)
131
+ dispatch_mask = torch.masked_fill(dispatch_mask, ~valid_mask, 0)
132
+
133
+ # The combine array will be used for combining expert outputs, scaled by the
134
+ # router probabilities. Shape: [num_groups, tokens_per_group, num_experts,
135
+ # expert_capacity].
136
+ combine_weights = torch.einsum("...te,...tec->...tec", router_probs, dispatch_mask)
137
+ exp_counts_capacity = torch.sum(dispatch_mask)
138
+ exp_capacity_rate = exp_counts_capacity / (logits.shape[0]*topk)
139
+
140
+ return [l_aux, exp_capacity_rate], combine_weights, dispatch_mask, exp_counts
141
+
142
+
143
+ def top1gating(logits: Tensor, random_routing_dropped_token: bool = False):
144
+ """Implements Top1Gating on logits."""
145
+ # everything is in fp32 in this function
146
+ logits = logits.float()
147
+ gates = F.softmax(logits, dim=1)
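+ # capacity equals the total number of tokens, so the top-k selection below keeps every token; no capacity-based dropping happens in this top-1 gate.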
148
+ capacity = gates.shape[0]
149
+
150
+ # Create a mask for 1st's expert per token
151
+ # noisy gating
152
+ indices1_s = torch.argmax(gates, dim=1)
153
+ num_experts = int(gates.shape[1])
154
+ mask1 = F.one_hot(indices1_s, num_classes=num_experts)
155
+
156
+ # gating decisions
157
+ # exp_counts = torch.sum(mask1, dim=0).detach().to('cpu')
158
+ exp_counts = torch.sum(mask1, dim=0).detach()
159
+
160
+ # Compute l_aux
161
+ me = torch.mean(gates, dim=0)
162
+ ce = torch.mean(mask1.float(), dim=0)
163
+ l_aux = torch.sum(me * ce) * num_experts
164
+ mask1_rand = mask1
165
+
166
+ top_idx = torch.topk(mask1_rand, k=capacity, dim=0)[1]
167
+
168
+ new_mask1 = mask1 * torch.zeros_like(mask1).scatter_(0, top_idx, 1)
169
+ mask1 = new_mask1
170
+ mask1_bk = mask1
171
+ if random_routing_dropped_token:
172
+ not_full = capacity - new_mask1.sum(dim=0)
173
+ sorted_notfull, indices_notfull = torch.sort(not_full, descending=True)
174
+ sorted_notfull = sorted_notfull.to(torch.int64)
175
+ not_full_experts_ids = torch.repeat_interleave(indices_notfull, sorted_notfull)
176
+ shuffle_not_full_ids = torch.randperm(not_full_experts_ids.shape[0])
177
+ not_full_experts_ids = not_full_experts_ids[shuffle_not_full_ids]
178
+ indices1_s_after_drop = torch.argmax(new_mask1, dim=1)
179
+ # get drop idx
180
+ drop_mask = 1 - new_mask1.sum(dim=1)
181
+ drop_mask = drop_mask.bool()
182
+ drop_idx = drop_mask.nonzero().view(-1)
183
+ drop_num = drop_mask.sum().to(torch.int64)
184
+ indices1_s_after_drop.scatter_(0, drop_idx, not_full_experts_ids[:drop_num])
185
+ nodrop_mask1 = F.one_hot(indices1_s_after_drop, num_classes=num_experts)
186
+ mask1 = nodrop_mask1
187
+
188
+ # Compute locations in capacity buffer
189
+ locations1 = torch.cumsum(mask1, dim=0) - 1
190
+
191
+ # Store the capacity location for each token
192
+ locations1_s = torch.sum(locations1 * mask1, dim=1)
193
+
194
+ # Normalize gate probabilities
195
+ mask1_float = mask1.float()
196
+ gates = gates * mask1_float
197
+
198
+ locations1_sc = F.one_hot(locations1_s, num_classes=capacity).float() # one hot to float
199
+ combine_weights = torch.einsum("se,sc->sec", gates, locations1_sc)
200
+
201
+ dispatch_mask = combine_weights.bool()
202
+
203
+ exp_counts_capacity = torch.sum(mask1_bk)
204
+ exp_capacity_rate = exp_counts_capacity / (logits.shape[0])
205
+ return [l_aux, exp_capacity_rate], combine_weights, dispatch_mask, exp_counts
206
+
207
+
208
+ def _get_unpad_data(attention_mask):
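+ # Turn a [batch, seq] padding mask into the flattened token indices, cumulative sequence lengths, and max sequence length expected by flash_attn_varlen_func.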
209
+ seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
210
+ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
211
+ max_seqlen_in_batch = seqlens_in_batch.max().item()
212
+ cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
213
+ return (
214
+ indices,
215
+ cu_seqlens,
216
+ max_seqlen_in_batch,
217
+ )
218
+
219
+
220
+ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
221
+ warnings.warn(
222
+ "Calling `transformers.models.llama.modeling_llama._prepare_4d_attention_mask` is deprecated and will be "
223
+ "removed in v4.37. Use `transformers.modeling_attn_mask_utils._prepare_4d_attention_mask"
224
+ )
225
+ return _prepare_4d_attention_mask(mask=mask, dtype=dtype, tgt_len=tgt_len)
226
+
227
+
228
+ def _make_causal_mask(
229
+ input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
230
+ ):
231
+ warnings.warn(
232
+ "Calling `transformers.models.llama.modeling_llama._make_causal_mask` is deprecated and will be removed in "
233
+ "v4.37. Use `transformers.models.llama.modeling_llama.AttentionMaskConverter._make_causal_mask"
234
+ )
235
+ return AttentionMaskConverter._make_causal_mask(
236
+ input_ids_shape=input_ids_shape, dtype=dtype, device=device, past_key_values_length=past_key_values_length
237
+ )
238
+
239
+
240
+ class HunYuanRMSNorm(nn.Module):
241
+ def __init__(self, hidden_size, eps=1e-6):
242
+ """
243
+ HunYuanRMSNorm is equivalent to T5LayerNorm
244
+ """
245
+ super().__init__()
246
+ self.weight = nn.Parameter(torch.ones(hidden_size))
247
+ self.variance_epsilon = eps
248
+
249
+ def forward(self, hidden_states):
250
+ input_dtype = hidden_states.dtype
251
+ hidden_states = hidden_states.to(torch.float32)
252
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
253
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
254
+ return self.weight * hidden_states.to(input_dtype)
255
+
256
+
257
+ ALL_LAYERNORM_LAYERS.append(HunYuanRMSNorm)
258
+
259
+
260
+ class HunYuanRotaryEmbedding(nn.Module):
261
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
262
+ super().__init__()
263
+
264
+ self.dim = dim
265
+ self.max_position_embeddings = max_position_embeddings
266
+ self.base = base
267
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
268
+ # inv_freq = inv_freq.bfloat16()
269
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
270
+
271
+ # Build here to make `torch.jit.trace` work.
272
+ self._set_cos_sin_cache(
273
+ seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
274
+ )
275
+
276
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
277
+ self.max_seq_len_cached = seq_len
278
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.float32)
279
+
280
+ self.inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
281
+ freqs = torch.outer(t, self.inv_freq)
282
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
283
+ emb = torch.cat((freqs, freqs), dim=-1).float()
284
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
285
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
286
+
287
+ def forward(self, x, seq_len=None):
288
+ # x: [bs, num_attention_heads, seq_len, head_size]
289
+ if seq_len > self.max_seq_len_cached or self.inv_freq.dtype != torch.float32:
290
+ self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
291
+
292
+ return (
293
+ self.cos_cached[:seq_len].to(dtype=x.dtype),
294
+ self.sin_cached[:seq_len].to(dtype=x.dtype),
295
+ )
296
+
297
+
298
+ class HunYuanLinearScalingRotaryEmbedding(HunYuanRotaryEmbedding):
299
+ """HunYuanRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
300
+
301
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
302
+ self.scaling_factor = scaling_factor
303
+ super().__init__(dim, max_position_embeddings, base, device)
304
+
305
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
306
+ self.max_seq_len_cached = seq_len
307
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
308
+ t = t / self.scaling_factor
309
+
310
+ freqs = torch.outer(t, self.inv_freq)
311
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
312
+ emb = torch.cat((freqs, freqs), dim=-1)
313
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
314
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
315
+
316
+
317
+ class HunYuanDynamicNTKScalingRotaryEmbedding(HunYuanRotaryEmbedding):
318
+ """
319
+ HunYuanRotaryEmbedding extended with Dynamic NTK scaling.
320
+ Credits to the Reddit users /u/bloc97 and /u/emozilla
321
+ """
322
+
323
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
324
+ self.scaling_factor = scaling_factor
325
+ super().__init__(dim, max_position_embeddings, base, device)
326
+
327
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
328
+ self.max_seq_len_cached = seq_len
329
+
330
+ if seq_len > self.max_position_embeddings:
331
+ base = self.base * (
332
+ (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
333
+ ) ** (self.dim / (self.dim - 2))
334
+ inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
335
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
336
+
337
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
338
+
339
+ freqs = torch.outer(t, self.inv_freq)
340
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
341
+ emb = torch.cat((freqs, freqs), dim=-1)
342
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
343
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
344
+
345
+
346
+ class HunYuanDynamicNTKAlphaRotaryEmbedding(HunYuanRotaryEmbedding):
347
+ """
348
+ HunYuanRotaryEmbedding extended with NTK-alpha scaling, where the RoPE base is rescaled by a fixed alpha factor.
349
+ Credits to the Reddit users /u/bloc97 and /u/emozilla
350
+ """
351
+
352
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_alpha=1.0):
353
+ self.scaling_alpha = scaling_alpha
354
+ super().__init__(dim, max_position_embeddings, base, device)
355
+
356
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
357
+ self.max_seq_len_cached = seq_len
358
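+ # NTK-alpha scaling: enlarge the RoPE base by alpha ** (dim / (dim - 2)) so the rotary frequencies stretch to cover longer contexts.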
+ base = self.base * self.scaling_alpha ** (self.dim / (self.dim-2))
359
+ inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
360
+
361
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
362
+
363
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
364
+
365
+ freqs = torch.outer(t, self.inv_freq)
366
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
367
+ emb = torch.cat((freqs, freqs), dim=-1)
368
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
369
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
370
+
371
+
372
+ def rotate_half(x):
373
+ """Rotates half the hidden dims of the input."""
374
+ x1 = x[..., : x.shape[-1] // 2]
375
+ x2 = x[..., x.shape[-1] // 2:]
376
+ return torch.cat((-x2, x1), dim=-1)
377
+
378
+
379
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
380
+ """Applies Rotary Position Embedding to the query and key tensors.
381
+
382
+ Args:
383
+ q (`torch.Tensor`): The query tensor.
384
+ k (`torch.Tensor`): The key tensor.
385
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
386
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
387
+ position_ids (`torch.Tensor`):
388
+ The position indices of the tokens corresponding to the query and key tensors. For example, this can be
389
+ used to pass offsetted position ids when working with a KV-cache.
390
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
391
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
392
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
393
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
394
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
395
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
396
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
397
+ Returns:
398
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
399
+ """
400
+ cos = cos[position_ids].unsqueeze(unsqueeze_dim)
401
+ sin = sin[position_ids].unsqueeze(unsqueeze_dim)
402
+ q_embed = (q * cos) + (rotate_half(q) * sin)
403
+ k_embed = (k * cos) + (rotate_half(k) * sin)
404
+ return q_embed, k_embed
405
+
406
+
407
+ class HunYuanMLP(nn.Module):
408
+ def __init__(self, config: HunYuanConfig, layer_idx=None, is_shared_mlp=False):
409
+ super().__init__()
410
+ self.config = config
411
+ self.layer_idx = layer_idx
412
+ self.hidden_size = config.hidden_size
413
+ if is_shared_mlp:
414
+ self.intermediate_size = config.intermediate_size * config.num_shared_expert[0]
415
+ else:
416
+ self.intermediate_size = config.intermediate_size
417
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
418
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
419
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
420
+ self.act_fn = ACT2FN[config.hidden_act]
421
+
422
+ def forward(self, x):
423
+ if self.config.pretraining_tp > 1:
424
+ slice = self.intermediate_size // self.config.pretraining_tp
425
+ gate_proj_slices = self.gate_proj.weight.split(slice, dim=0)
426
+ up_proj_slices = self.up_proj.weight.split(slice, dim=0)
427
+ down_proj_slices = self.down_proj.weight.split(slice, dim=1)
428
+
429
+ gate_proj = torch.cat(
430
+ [F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1
431
+ )
432
+ up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1)
433
+
434
+ intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2)
435
+ down_proj = [
436
+ F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp)
437
+ ]
438
+ down_proj = sum(down_proj)
439
+ else:
440
+ down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
441
+
442
+ return down_proj
443
+
444
+
445
+ class HunYuanTopKGate(nn.Module):
446
+ def __init__(self, config: HunYuanConfig, layer_idx: Optional[int] = None):
447
+ super().__init__()
448
+ self.config = config
449
+ self.layer_idx = layer_idx
450
+ self.moe_topk = config.moe_topk
451
+ self.drop_tokens = config.moe_drop_tokens
452
+ self.min_capacity = 8
453
+ self.random_routing_dropped_token = config.moe_random_routing_dropped_token
454
+ self.wg = nn.Linear(config.hidden_size, config.num_experts, bias=False, dtype=torch.float32)
455
+
456
+ def forward(self, hidden_states):
457
+ bsz, seq_len, hidden_size = hidden_states.shape
458
+ hidden_states = hidden_states.reshape(-1, hidden_size)
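+ # The router weights are kept in float32; cast the token activations up to match so the routing logits and softmax stay numerically stable.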
459
+ if self.wg.weight.dtype == torch.float32:
460
+ hidden_states = hidden_states.float()
461
+ logits = self.wg(hidden_states)
462
+ if self.moe_topk == 1:
463
+ gate_output = top1gating(logits, random_routing_dropped_token=self.random_routing_dropped_token)
464
+ else:
465
+ gate_output = topkgating(logits, self.moe_topk[0])
466
+
467
+ return gate_output
468
+
469
+
470
+ class HunYuanMoE(nn.Module):
471
+ def __init__(self, config: HunYuanConfig, layer_idx: Optional[int] = None):
472
+ super().__init__()
473
+ self.config = config
474
+ self.layer_idx = layer_idx
475
+ self.moe_topk = config.moe_topk
476
+ self.num_experts = config.num_experts
477
+ if config.use_mixed_mlp_moe:
478
+ self.shared_mlp = HunYuanMLP(config, layer_idx=layer_idx, is_shared_mlp=True)
479
+ self.gate = HunYuanTopKGate(config, layer_idx=layer_idx)
480
+ self.experts = nn.ModuleList(
481
+ [HunYuanMLP(config, layer_idx=layer_idx, is_shared_mlp=False) for _ in range(config.num_experts)]
482
+ )
483
+
484
+ def forward(self, hidden_states):
485
+ bsz, seq_len, hidden_size = hidden_states.shape
486
+
487
+ if self.config.use_mixed_mlp_moe:
488
+ hidden_states_mlp = self.shared_mlp(hidden_states)
489
+
490
+ l_moe, combine_weights, dispatch_mask, exp_counts = self.gate(hidden_states)
491
+
492
+ reshaped_input = hidden_states.reshape(-1, hidden_size)
493
+
494
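+ # Dispatch: scatter each token s into its expert e / capacity slot c, giving one [capacity, hidden] buffer per expert.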
+ dispatched_input = torch.einsum("sec,sm->ecm", dispatch_mask.type_as(hidden_states), reshaped_input)
495
+
496
+ chunks = dispatched_input.chunk(self.num_experts, dim=0)
497
+ expert_outputs = []
498
+ for chunk, expert in zip(chunks, self.experts):
499
+ expert_outputs.append(expert(chunk))
500
+
501
+ expert_output = torch.cat(expert_outputs, dim=0)
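+ # Combine: gather the expert outputs back into token order, weighting every routed copy by its router probability.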
502
+ combined_output = torch.einsum("sec,ecm->sm", combine_weights.type_as(hidden_states), expert_output)
503
+ combined_output = combined_output.reshape(bsz, seq_len, hidden_size)
504
+
505
+ if self.config.use_mixed_mlp_moe:
506
+ output = hidden_states_mlp + combined_output
507
+ else:
508
+ output = combined_output
509
+
510
+ return output
511
+
512
+
513
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
514
+ """
515
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
516
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
517
+ """
518
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
519
+ if n_rep == 1:
520
+ return hidden_states
521
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
522
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
523
+
524
+
525
+ class HunYuanAttention(nn.Module):
526
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
527
+
528
+ def __init__(self, config: HunYuanConfig, layer_idx: Optional[int] = None):
529
+ super().__init__()
530
+ self.config = config
531
+ self.layer_idx = layer_idx
532
+ # layer_idx starts from 0
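+ # With cross-layer attention (CLA) enabled, only every cla_share_factor-th layer computes its own key/value projections; the layers in between reuse the key/value states passed in via kv_states.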
533
+ self.attention_type = 'cross' if config.use_cla and layer_idx % config.cla_share_factor != 0 else 'self'
534
+ if layer_idx is None:
535
+ logger.warning_once(
536
+ f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
537
+ "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
538
+ "when creating this class."
539
+ )
540
+
541
+ self.attention_dropout = config.attention_dropout
542
+ self.hidden_size = config.hidden_size
543
+ self.num_heads = config.num_attention_heads
544
+ self.head_dim = self.hidden_size // self.num_heads
545
+ self.num_key_value_heads = config.num_key_value_heads
546
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
547
+ self.max_position_embeddings = config.max_position_embeddings
548
+ self.rope_theta = config.rope_theta
549
+ self.is_causal = True
550
+ self.use_qk_norm = config.use_qk_norm
551
+
552
+ if (self.head_dim * self.num_heads) != self.hidden_size:
553
+ raise ValueError(
554
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
555
+ f" and `num_heads`: {self.num_heads})."
556
+ )
557
+
558
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
559
+ if self.attention_type == 'self':
560
+ self.k_proj = nn.Linear(
561
+ self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias
562
+ )
563
+ self.v_proj = nn.Linear(
564
+ self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias
565
+ )
566
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
567
+ if self.use_qk_norm:
568
+ self.query_layernorm = HunYuanRMSNorm(self.head_dim, eps=config.rms_norm_eps)
569
+ self.key_layernorm = HunYuanRMSNorm(self.head_dim, eps=config.rms_norm_eps)
570
+ self._init_rope()
571
+
572
+ def _init_rope(self):
573
+ if self.config.rope_scaling is None:
574
+ self.rotary_emb = HunYuanRotaryEmbedding(
575
+ self.head_dim,
576
+ max_position_embeddings=self.max_position_embeddings,
577
+ base=self.rope_theta,
578
+ )
579
+ else:
580
+ scaling_type = self.config.rope_scaling["type"]
581
+ scaling_factor = self.config.rope_scaling["factor"]
582
+ scaling_alpha = self.config.rope_scaling["alpha"]
583
+ if scaling_type == "linear":
584
+ self.rotary_emb = HunYuanLinearScalingRotaryEmbedding(
585
+ self.head_dim,
586
+ max_position_embeddings=self.max_position_embeddings,
587
+ scaling_factor=scaling_factor,
588
+ base=self.rope_theta,
589
+ )
590
+ elif scaling_type == "dynamic":
591
+ if scaling_alpha:
592
+ self.rotary_emb = HunYuanDynamicNTKAlphaRotaryEmbedding(
593
+ self.head_dim,
594
+ max_position_embeddings=self.max_position_embeddings,
595
+ scaling_alpha=scaling_alpha,
596
+ base=self.rope_theta,
597
+ )
598
+ else:
599
+ self.rotary_emb = HunYuanDynamicNTKScalingRotaryEmbedding(
600
+ self.head_dim,
601
+ max_position_embeddings=self.max_position_embeddings,
602
+ scaling_factor=scaling_factor,
603
+ base=self.rope_theta,
604
+ )
605
+ else:
606
+ raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
607
+
608
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
609
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
610
+
611
+ def forward(
612
+ self,
613
+ hidden_states: torch.Tensor,
614
+ attention_mask: Optional[torch.Tensor] = None,
615
+ position_ids: Optional[torch.LongTensor] = None,
616
+ past_key_value: Optional[Cache] = None,
617
+ output_attentions: bool = False,
618
+ use_cache: bool = False,
619
+ kv_states: torch.Tensor = None,
620
+ **kwargs,
621
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
622
+ if "padding_mask" in kwargs:
623
+ warnings.warn(
624
+ "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use "
625
+ "`attention_mask` instead.`"
626
+ )
627
+
628
+ bsz, q_len, _ = hidden_states.size()
629
+
630
+ if self.config.pretraining_tp > 1:
631
+ query_slices = self.q_proj.weight.split(
632
+ (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
633
+ )
634
+ query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)]
635
+ query_states = torch.cat(query_states, dim=-1)
636
+
637
+ if self.attention_type == "cross" and kv_states is not None and isinstance(kv_states, tuple):
638
+ orig_key_states, orig_value_states = kv_states
639
+ key_states, value_states = kv_states
640
+ else:
641
+ key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
642
+ key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
643
+ value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
644
+
645
+ key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)]
646
+ key_states = torch.cat(key_states, dim=-1)
647
+
648
+ value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)]
649
+ value_states = torch.cat(value_states, dim=-1)
650
+ orig_key_states, orig_value_states = key_states, value_states
651
+
652
+ else:
653
+ query_states = self.q_proj(hidden_states)
654
+ if self.attention_type == "cross" and kv_states is not None and isinstance(kv_states, tuple):
655
+ orig_key_states, orig_value_states = kv_states
656
+ key_states, value_states = kv_states
657
+ else:
658
+ key_states = self.k_proj(hidden_states)
659
+ value_states = self.v_proj(hidden_states)
660
+ orig_key_states, orig_value_states = key_states, value_states
661
+
662
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
663
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
664
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
665
+
666
+ kv_seq_len = key_states.shape[-2]
667
+ if past_key_value is not None:
668
+ if self.layer_idx is None:
669
+ raise ValueError(
670
+ f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
671
+ "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
672
+ "with a layer index."
673
+ )
674
+ kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
675
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
676
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
677
+
678
+ if self.use_qk_norm:
679
+ query_states = self.query_layernorm(query_states)
680
+ key_states = self.key_layernorm(key_states)
681
+
682
+ if past_key_value is not None:
683
+ cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models
684
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
685
+
686
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
687
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
688
+
689
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
690
+
691
+ if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
692
+ raise ValueError(
693
+ f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
694
+ f" {attn_weights.size()}"
695
+ )
696
+
697
+ if attention_mask is not None:
698
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
699
+ raise ValueError(
700
+ f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
701
+ )
702
+ attn_weights = attn_weights + attention_mask
703
+
704
+ # upcast attention to fp32
705
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
706
+ attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
707
+ attn_output = torch.matmul(attn_weights, value_states)
708
+
709
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
710
+ raise ValueError(
711
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
712
+ f" {attn_output.size()}"
713
+ )
714
+
715
+ attn_output = attn_output.transpose(1, 2).contiguous()
716
+
717
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
718
+
719
+ if self.config.pretraining_tp > 1:
720
+ attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
721
+ o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1)
722
+ attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)])
723
+ else:
724
+ attn_output = self.o_proj(attn_output)
725
+
726
+ if not output_attentions:
727
+ attn_weights = None
728
+
729
+ return attn_output, attn_weights, past_key_value, (orig_key_states, orig_value_states)
730
+
731
+
732
+ class HunYuanFlashAttention2(HunYuanAttention):
733
+ """
734
+ HunYuan flash attention module. This module inherits from `HunYuanAttention`, as the weights of the module stay
735
+ untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
736
+ flash attention and deal with padding tokens in case the input contains any of them.
737
+ """
738
+
739
+ def __init__(self, *args, **kwargs):
740
+ super().__init__(*args, **kwargs)
741
+
742
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
743
+
744
+ def forward(
745
+ self,
746
+ hidden_states: torch.Tensor,
747
+ attention_mask: Optional[torch.LongTensor] = None,
748
+ position_ids: Optional[torch.LongTensor] = None,
749
+ past_key_value: Optional[Cache] = None,
750
+ output_attentions: bool = False,
751
+ use_cache: bool = False,
752
+ kv_states: torch.Tensor = None,
753
+ **kwargs,
754
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
755
+ # HunYuanFlashAttention2 attention does not support output_attentions
756
+ if "padding_mask" in kwargs:
757
+ warnings.warn(
758
+ "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use "
759
+ "`attention_mask` instead.`"
760
+ )
761
+
762
+ # overwrite attention_mask with padding_mask
763
+ attention_mask = kwargs.pop("padding_mask")
764
+
765
+ bsz, q_len, _ = hidden_states.size()
766
+
767
+ query_states = self.q_proj(hidden_states)
768
+ if self.attention_type == "cross" and kv_states is not None and isinstance(kv_states, tuple):
769
+ orig_key_states, orig_value_states = kv_states
770
+ key_states, value_states = kv_states
771
+ else:
772
+ key_states = self.k_proj(hidden_states)
773
+ value_states = self.v_proj(hidden_states)
774
+ orig_key_states, orig_value_states = key_states, value_states
775
+
776
+ # Flash attention requires the input to have the shape
777
+ # batch_size x seq_length x num_heads x head_dim
778
+ # therefore we just need to keep the original shape
779
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
780
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
781
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
782
+
783
+ kv_seq_len = key_states.shape[-2]
784
+ if past_key_value is not None:
785
+ kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
786
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
787
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
788
+
789
+ if self.use_qk_norm:
790
+ query_states = self.query_layernorm(query_states)
791
+ key_states = self.key_layernorm(key_states)
792
+
793
+ if past_key_value is not None:
794
+ cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models
795
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
796
+
797
+ query_states = query_states.transpose(1, 2)
798
+ key_states = key_states.transpose(1, 2)
799
+ value_states = value_states.transpose(1, 2)
800
+
801
+ dropout_rate = self.attention_dropout if self.training else 0.0
802
+
803
+ # In PEFT, usually we cast the layer norms in float32 for training stability reasons
804
+ # therefore the input hidden states gets silently casted in float32. Hence, we need
805
+ # cast them back in the correct dtype just to be sure everything works as expected.
806
+ # This might slow down training & inference, so it is recommended not to cast the LayerNorms
807
+ # in fp32. (HunYuanRMSNorm handles it correctly)
808
+
809
+ input_dtype = query_states.dtype
810
+ if input_dtype == torch.float32:
811
+ # Handle the case where the model is quantized
812
+ if hasattr(self.config, "_pre_quantization_dtype"):
813
+ target_dtype = self.config._pre_quantization_dtype
814
+ else:
815
+ target_dtype = self.q_proj.weight.dtype
816
+
817
+ logger.warning_once(
818
+ f"The input hidden states seems to be silently casted in float32, this might be related to"
819
+ f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
820
+ f" {target_dtype}."
821
+ )
822
+
823
+ query_states = query_states.to(target_dtype)
824
+ key_states = key_states.to(target_dtype)
825
+ value_states = value_states.to(target_dtype)
826
+
827
+ attn_output = self._flash_attention_forward(
828
+ query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate
829
+ )
830
+
831
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
832
+ attn_output = self.o_proj(attn_output)
833
+
834
+ return attn_output, None, past_key_value, (orig_key_states, orig_value_states)
835
+
836
+ def _flash_attention_forward(
837
+ self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
838
+ ):
839
+ """
840
+ Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
841
+ first unpad the input, then computes the attention scores and pad the final attention scores.
842
+
843
+ Args:
844
+ query_states (`torch.Tensor`):
845
+ Input query states to be passed to Flash Attention API
846
+ key_states (`torch.Tensor`):
847
+ Input key states to be passed to Flash Attention API
848
+ value_states (`torch.Tensor`):
849
+ Input value states to be passed to Flash Attention API
850
+ attention_mask (`torch.Tensor`):
851
+ The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
852
+ position of padding tokens and 1 for the position of non-padding tokens.
853
+ dropout (`float`, *optional*):
854
+ Attention dropout
855
+ softmax_scale (`float`, *optional*):
856
+ The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
857
+ """
858
+ if not self._flash_attn_uses_top_left_mask:
859
+ causal = self.is_causal
860
+ else:
861
+ causal = self.is_causal and query_length != 1
862
+
863
+ # Contains at least one padding token in the sequence
864
+ if attention_mask is not None:
865
+ batch_size = query_states.shape[0]
866
+ query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
867
+ query_states, key_states, value_states, attention_mask, query_length
868
+ )
869
+
870
+ cu_seqlens_q, cu_seqlens_k = cu_seq_lens
871
+ max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
872
+
873
+ attn_output_unpad = flash_attn_varlen_func(
874
+ query_states,
875
+ key_states,
876
+ value_states,
877
+ cu_seqlens_q=cu_seqlens_q,
878
+ cu_seqlens_k=cu_seqlens_k,
879
+ max_seqlen_q=max_seqlen_in_batch_q,
880
+ max_seqlen_k=max_seqlen_in_batch_k,
881
+ dropout_p=dropout,
882
+ softmax_scale=softmax_scale,
883
+ causal=causal,
884
+ )
885
+
886
+ attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
887
+ else:
888
+ attn_output = flash_attn_func(
889
+ query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
890
+ )
891
+
892
+ return attn_output
893
+
894
+ def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
895
+ indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
896
+ batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
897
+
898
+ key_layer = index_first_axis(
899
+ key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
900
+ )
901
+ value_layer = index_first_axis(
902
+ value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
903
+ )
904
+ if query_length == kv_seq_len:
905
+ query_layer = index_first_axis(
906
+ query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
907
+ )
908
+ cu_seqlens_q = cu_seqlens_k
909
+ max_seqlen_in_batch_q = max_seqlen_in_batch_k
910
+ indices_q = indices_k
911
+ elif query_length == 1:
912
+ max_seqlen_in_batch_q = 1
913
+ cu_seqlens_q = torch.arange(
914
+ batch_size + 1, dtype=torch.int32, device=query_layer.device
915
+ ) # There is a memcpy here, that is very bad.
916
+ indices_q = cu_seqlens_q[:-1]
917
+ query_layer = query_layer.squeeze(1)
918
+ else:
919
+ # The -q_len: slice assumes left padding.
920
+ attention_mask = attention_mask[:, -query_length:]
921
+ query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
922
+
923
+ return (
924
+ query_layer,
925
+ key_layer,
926
+ value_layer,
927
+ indices_q,
928
+ (cu_seqlens_q, cu_seqlens_k),
929
+ (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
930
+ )
931
+
932
+
933
+ class HunYuanSdpaAttention(HunYuanAttention):
934
+ """
935
+ HunYuan attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
936
+ `HunYuanAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt
937
+ to SDPA API.
938
+ """
939
+
940
+ # Adapted from HunYuanAttention.forward
941
+ def forward(
942
+ self,
943
+ hidden_states: torch.Tensor,
944
+ attention_mask: Optional[torch.Tensor] = None,
945
+ position_ids: Optional[torch.LongTensor] = None,
946
+ past_key_value: Optional[Cache] = None,
947
+ output_attentions: bool = False,
948
+ use_cache: bool = False,
949
+ kv_states: torch.Tensor = None,
950
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
951
+ if output_attentions:
952
+ logger.warning_once(
953
+ 'HunYuanModel is using HunYuanSdpaAttention, '
954
+ 'but `torch.nn.functional.scaled_dot_product_attention` '
955
+ 'does not support `output_attentions=True`. Falling back to the manual attention implementation, '
956
+ 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. '
957
+ 'This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
958
+ )
959
+ return super().forward(
960
+ hidden_states=hidden_states,
961
+ attention_mask=attention_mask,
962
+ position_ids=position_ids,
963
+ past_key_value=past_key_value,
964
+ output_attentions=output_attentions,
965
+ use_cache=use_cache,
966
+ kv_states=kv_states,
+ )
967
+
968
+ bsz, q_len, _ = hidden_states.size()
969
+
970
+ query_states = self.q_proj(hidden_states)
971
+ if self.attention_type == "cross" and kv_states is not None and isinstance(kv_states, tuple):
972
+ orig_key_states, orig_value_states = kv_states
973
+ key_states, value_states = kv_states
974
+ else:
975
+ key_states = self.k_proj(hidden_states)
976
+ value_states = self.v_proj(hidden_states)
977
+ orig_key_states, orig_value_states = key_states, value_states
978
+
979
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
980
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
981
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
982
+
983
+ kv_seq_len = key_states.shape[-2]
984
+ if past_key_value is not None:
985
+ kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
986
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
987
+
988
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
989
+
990
+ if self.use_qk_norm:
991
+ query_states = self.query_layernorm(query_states)
992
+ key_states = self.key_layernorm(key_states)
993
+
994
+ if past_key_value is not None:
995
+ cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models
996
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
997
+
998
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
999
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
1000
+
1001
+ if attention_mask is not None:
1002
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
1003
+ raise ValueError(
1004
+ f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
1005
+ )
1006
+
1007
+ # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with
1008
+ # custom attn_mask,
1009
+ # Reference: https://github.com/pytorch/pytorch/issues/112577.
1010
+ if query_states.device.type == "cuda" and attention_mask is not None:
1011
+ query_states = query_states.contiguous()
1012
+ key_states = key_states.contiguous()
1013
+ value_states = value_states.contiguous()
1014
+
1015
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
1016
+ query_states,
1017
+ key_states,
1018
+ value_states,
1019
+ attn_mask=attention_mask,
1020
+ dropout_p=self.attention_dropout if self.training else 0.0,
1021
+ # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a
1022
+ # causal mask in case q_len == 1.
1023
+ is_causal=self.is_causal and attention_mask is None and q_len > 1,
1024
+ )
1025
+
1026
+ attn_output = attn_output.transpose(1, 2).contiguous()
1027
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
1028
+
1029
+ attn_output = self.o_proj(attn_output)
1030
+
1031
+ return attn_output, None, past_key_value, (orig_key_states, orig_value_states)
1032
+
1033
+
1034
+ HUNYUAN_ATTENTION_CLASSES = {
1035
+ "eager": HunYuanAttention,
1036
+ "flash_attention_2": HunYuanFlashAttention2,
1037
+ "sdpa": HunYuanSdpaAttention,
1038
+ }
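
The dictionary above is what each decoder layer indexes with `config._attn_implementation` to pick its attention backend. As a minimal sketch (not part of this checkpoint), the backend can be selected when loading through the auto classes; the model path below is a placeholder, and "flash_attention_2" additionally requires the flash-attn package to be installed:

# Sketch only: select one of the registered attention backends at load time.
# The model id below is a placeholder for wherever this checkpoint lives.
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "path/to/Hunyuan-A13B",        # placeholder path
    attn_implementation="sdpa",    # "eager", "sdpa", or "flash_attention_2"
    trust_remote_code=True,
)
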
1039
+
1040
+
1041
+ class HunYuanDecoderLayer(nn.Module):
1042
+ def __init__(self, config: HunYuanConfig, layer_idx: int):
1043
+ super().__init__()
1044
+ self.hidden_size = config.hidden_size
1045
+ self.layer_idx = layer_idx
1046
+
1047
+ self.self_attn = HUNYUAN_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
1048
+
1049
+ if config.num_experts > 1:
1050
+ self.mlp = HunYuanMoE(config, layer_idx=layer_idx)
1051
+ else:
1052
+ self.mlp = HunYuanMLP(config, layer_idx=layer_idx, is_shared_mlp=False)
1053
+ self.input_layernorm = HunYuanRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1054
+ self.post_attention_layernorm = HunYuanRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1055
+
1056
+ def forward(
1057
+ self,
1058
+ hidden_states: torch.Tensor,
1059
+ attention_mask: Optional[torch.Tensor] = None,
1060
+ position_ids: Optional[torch.LongTensor] = None,
1061
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
1062
+ output_attentions: Optional[bool] = False,
1063
+ use_cache: Optional[bool] = False,
1064
+ kv_states: Optional[Tuple[torch.Tensor]] = None,
1065
+ **kwargs,
1066
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
1067
+ """
1068
+ Args:
1069
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
1070
+ attention_mask (`torch.FloatTensor`, *optional*):
1071
+ attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
1072
+ query_sequence_length, key_sequence_length)` if default attention is used.
1073
+ output_attentions (`bool`, *optional*):
1074
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
1075
+ returned tensors for more detail.
1076
+ use_cache (`bool`, *optional*):
1077
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
1078
+ (see `past_key_values`).
1079
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
1080
+ kv_states (`Tuple(torch.FloatTensor)`, *optional*): Used when CLA is enabled,
1081
+ key and value states from past attention blocks
1082
+ """
1083
+ if "padding_mask" in kwargs:
1084
+ warnings.warn(
1085
+ "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use "
1086
+ "`attention_mask` instead.`"
1087
+ )
1088
+
1089
+ residual = hidden_states
1090
+
1091
+ hidden_states = self.input_layernorm(hidden_states)
1092
+
1093
+ # Self Attention
1094
+ hidden_states, self_attn_weights, present_key_value, kv_states = self.self_attn(
1095
+ hidden_states=hidden_states,
1096
+ attention_mask=attention_mask,
1097
+ position_ids=position_ids,
1098
+ past_key_value=past_key_value,
1099
+ output_attentions=output_attentions,
1100
+ use_cache=use_cache,
1101
+ kv_states=kv_states,
1102
+ **kwargs,
1103
+ )
1104
+ hidden_states = residual + hidden_states
1105
+
1106
+ # Fully Connected
1107
+ residual = hidden_states
1108
+ hidden_states = self.post_attention_layernorm(hidden_states)
1109
+ hidden_states = self.mlp(hidden_states)
1110
+ hidden_states = residual + hidden_states
1111
+
1112
+ outputs = (hidden_states,)
1113
+
1114
+ if output_attentions:
1115
+ outputs += (self_attn_weights,)
1116
+
1117
+ if use_cache:
1118
+ outputs += (present_key_value,)
1119
+
1120
+ outputs += (kv_states,)
1121
+
1122
+ return outputs
1123
+
1124
+
1125
+ HUNYUAN_START_DOCSTRING = r"""
1126
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
1127
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
1128
+ etc.)
1129
+
1130
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
1131
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
1132
+ and behavior.
1133
+
1134
+ Parameters:
1135
+ config ([`HunYuanConfig`]):
1136
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
1137
+ load the weights associated with the model, only the configuration. Check out the
1138
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
1139
+ """
1140
+
1141
+
1142
+ @add_start_docstrings(
1143
+ "The bare HunYuan Model outputting raw hidden-states without any specific head on top.",
1144
+ HUNYUAN_START_DOCSTRING,
1145
+ )
1146
+ class HunYuanPreTrainedModel(PreTrainedModel):
1147
+ config_class = HunYuanConfig
1148
+ base_model_prefix = "model"
1149
+ supports_gradient_checkpointing = True
1150
+ _no_split_modules = ["HunYuanDecoderLayer"]
1151
+ _skip_keys_device_placement = "past_key_values"
1152
+ _supports_flash_attn_2 = True
1153
+ _supports_sdpa = True
1154
+ _supports_cache_class = True
1155
+
1156
+ def _init_weights(self, module):
1157
+ std = self.config.initializer_range
1158
+ if isinstance(module, nn.Linear):
1159
+ module.weight.data.normal_(mean=0.0, std=std)
1160
+ if module.bias is not None:
1161
+ module.bias.data.zero_()
1162
+ elif isinstance(module, nn.Embedding):
1163
+ module.weight.data.normal_(mean=0.0, std=std)
1164
+ if module.padding_idx is not None:
1165
+ module.weight.data[module.padding_idx].zero_()
1166
+
1167
+
1168
+ HUNYUAN_INPUTS_DOCSTRING = r"""
1169
+ Args:
1170
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
1171
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
1172
+ it.
1173
+
1174
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
1175
+ [`PreTrainedTokenizer.__call__`] for details.
1176
+
1177
+ [What are input IDs?](../glossary#input-ids)
1178
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
1179
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
1180
+
1181
+ - 1 for tokens that are **not masked**,
1182
+ - 0 for tokens that are **masked**.
1183
+
1184
+ [What are attention masks?](../glossary#attention-mask)
1185
+
1186
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
1187
+ [`PreTrainedTokenizer.__call__`] for details.
1188
+
1189
+ If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
1190
+ `past_key_values`).
1191
+
1192
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
1193
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
1194
+ information on the default strategy.
1195
+
1196
+ - 1 indicates the head is **not masked**,
1197
+ - 0 indicates the head is **masked**.
1198
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1199
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
1200
+ config.n_positions - 1]`.
1201
+
1202
+ [What are position IDs?](../glossary#position-ids)
1203
+ past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
1204
+ Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
1205
+ blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
1206
+ returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
1207
+
1208
+ Two formats are allowed:
1209
+ - a [`~cache_utils.Cache`] instance;
1210
+ - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
1211
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
1212
+ cache format.
1213
+
1214
+ The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
1215
+ legacy cache format will be returned.
1216
+
1217
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
1218
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
1219
+ of shape `(batch_size, sequence_length)`.
1220
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
1221
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
1222
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
1223
+ model's internal embedding lookup matrix.
1224
+ use_cache (`bool`, *optional*):
1225
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
1226
+ `past_key_values`).
1227
+ output_attentions (`bool`, *optional*):
1228
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
1229
+ tensors for more detail.
1230
+ output_hidden_states (`bool`, *optional*):
1231
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
1232
+ more detail.
1233
+ return_dict (`bool`, *optional*):
1234
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
1235
+ """
1236
+
1237
+
1238
+ @add_start_docstrings(
1239
+ "The bare HunYuan Model outputting raw hidden-states without any specific head on top.",
1240
+ HUNYUAN_START_DOCSTRING,
1241
+ )
1242
+ class HunYuanModel(HunYuanPreTrainedModel):
1243
+ """
1244
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`HunYuanDecoderLayer`]
1245
+
1246
+ Args:
1247
+ config: HunYuanConfig
1248
+ """
1249
+
1250
+ def __init__(self, config: HunYuanConfig):
1251
+ super().__init__(config)
1252
+ self.padding_idx = config.pad_token_id
1253
+ self.vocab_size = config.vocab_size
1254
+
1255
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
1256
+ self.layers = nn.ModuleList(
1257
+ [HunYuanDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
1258
+ )
1259
+ self._use_sdpa = config._attn_implementation == "sdpa"
1260
+ self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
1261
+ self.norm = HunYuanRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1262
+
1263
+ self.cla = config.use_cla
1264
+ self.cla_share_factor = config.cla_share_factor
1265
+
1266
+ self.gradient_checkpointing = False
1267
+ # Initialize weights and apply final processing
1268
+ self.post_init()
1269
+
1270
+ def get_input_embeddings(self):
1271
+ return self.embed_tokens
1272
+
1273
+ def set_input_embeddings(self, value):
1274
+ self.embed_tokens = value
1275
+
1276
+ @add_start_docstrings_to_model_forward(HUNYUAN_INPUTS_DOCSTRING)
1277
+ def forward(
1278
+ self,
1279
+ input_ids: torch.LongTensor = None,
1280
+ attention_mask: Optional[torch.Tensor] = None,
1281
+ position_ids: Optional[torch.LongTensor] = None,
1282
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1283
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1284
+ use_cache: Optional[bool] = None,
1285
+ output_attentions: Optional[bool] = None,
1286
+ output_hidden_states: Optional[bool] = None,
1287
+ return_dict: Optional[bool] = None,
1288
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
1289
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1290
+ output_hidden_states = (
1291
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1292
+ )
1293
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
1294
+
1295
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1296
+
1297
+ # retrieve input_ids and inputs_embeds
1298
+ if input_ids is not None and inputs_embeds is not None:
1299
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
1300
+ elif input_ids is not None:
1301
+ batch_size, seq_length = input_ids.shape[:2]
1302
+ elif inputs_embeds is not None:
1303
+ batch_size, seq_length = inputs_embeds.shape[:2]
1304
+ else:
1305
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
1306
+
1307
+ if self.gradient_checkpointing and self.training:
1308
+ if use_cache:
1309
+ logger.warning_once(
1310
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
1311
+ )
1312
+ use_cache = False
1313
+
1314
+ past_key_values_length = 0
1315
+ if use_cache:
1316
+ use_legacy_cache = not isinstance(past_key_values, Cache)
1317
+ if use_legacy_cache:
1318
+ past_key_values = DynamicCache.from_legacy_cache(past_key_values)
1319
+ past_key_values_length = past_key_values.get_usable_length(seq_length)
1320
+
1321
+ if position_ids is None:
1322
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
1323
+ position_ids = torch.arange(
1324
+ past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
1325
+ )
1326
+ position_ids = position_ids.unsqueeze(0)
1327
+
1328
+ if inputs_embeds is None:
1329
+ inputs_embeds = self.embed_tokens(input_ids)
1330
+
1331
+ # Fix LoRA training when gradient checkpointing is enabled
1332
+ if self.training and inputs_embeds.is_leaf:
1333
+ inputs_embeds.requires_grad = True
1334
+
1335
+ if self._use_flash_attention_2:
1336
+ # 2d mask is passed through the layers
1337
+ attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
1338
+ elif self._use_sdpa and not output_attentions:
1339
+ # output_attentions=True can not be supported when using SDPA, and we fall back on
1340
+ # the manual implementation that requires a 4D causal mask in all cases.
1341
+ attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
1342
+ attention_mask,
1343
+ (batch_size, seq_length),
1344
+ inputs_embeds,
1345
+ past_key_values_length,
1346
+ )
1347
+ else:
1348
+ # 4d mask is passed through the layers
1349
+ attention_mask = _prepare_4d_causal_attention_mask(
1350
+ attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
1351
+ )
1352
+
1353
+ # embed positions
1354
+ hidden_states = inputs_embeds
1355
+
1356
+ # decoder layers
1357
+ all_hidden_states = () if output_hidden_states else None
1358
+ all_self_attns = () if output_attentions else None
1359
+ next_decoder_cache = None
1360
+
1361
+ prev_kv_states = None
1362
+ for layer_idx, decoder_layer in enumerate(self.layers):
1363
+ if output_hidden_states:
1364
+ all_hidden_states += (hidden_states,)
1365
+
1366
+ if self.gradient_checkpointing and self.training:
1367
+ layer_outputs = self._gradient_checkpointing_func(
1368
+ decoder_layer.__call__,
1369
+ hidden_states,
1370
+ attention_mask,
1371
+ position_ids,
1372
+ past_key_values,
1373
+ output_attentions,
1374
+ use_cache,
1375
+ prev_kv_states,
1376
+ )
1377
+ else:
1378
+ layer_outputs = decoder_layer(
1379
+ hidden_states,
1380
+ attention_mask=attention_mask,
1381
+ position_ids=position_ids,
1382
+ past_key_value=past_key_values,
1383
+ output_attentions=output_attentions,
1384
+ use_cache=use_cache,
1385
+ kv_states=prev_kv_states
1386
+ )
1387
+
1388
+ hidden_states = layer_outputs[0]
1389
+
1390
+ if use_cache:
1391
+ next_decoder_cache = layer_outputs[2 if output_attentions else 1]
1392
+
1393
+ if output_attentions:
1394
+ all_self_attns += (layer_outputs[1],)
1395
+
1396
+ kv_states = layer_outputs[-1]
1397
+
1398
+ if self.cla and layer_idx % self.cla_share_factor == 0:
1399
+ prev_kv_states = kv_states
1400
+
1401
+ hidden_states = self.norm(hidden_states)
1402
+
1403
+ # add hidden states from the last decoder layer
1404
+ if output_hidden_states:
1405
+ all_hidden_states += (hidden_states,)
1406
+
1407
+ next_cache = None
1408
+ if use_cache:
1409
+ next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
1410
+ if not return_dict:
1411
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
1412
+ return BaseModelOutputWithPast(
1413
+ last_hidden_state=hidden_states,
1414
+ past_key_values=next_cache,
1415
+ hidden_states=all_hidden_states,
1416
+ attentions=all_self_attns,
1417
+ )
1418
+
1419
+
1420
+ class HunYuanMoEV1ForCausalLM(HunYuanPreTrainedModel):
1421
+ _tied_weights_keys = ["lm_head.weight"]
1422
+
1423
+ def __init__(self, config: HunYuanConfig):
1424
+ super().__init__(config)
1425
+ self.model = HunYuanModel(config)
1426
+ self.vocab_size = config.vocab_size
1427
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
1428
+
1429
+ # Initialize weights and apply final processing
1430
+ self.post_init()
1431
+
1432
+ def get_input_embeddings(self):
1433
+ return self.model.embed_tokens
1434
+
1435
+ def set_input_embeddings(self, value):
1436
+ self.model.embed_tokens = value
1437
+
1438
+ def get_output_embeddings(self):
1439
+ return self.lm_head
1440
+
1441
+ def set_output_embeddings(self, new_embeddings):
1442
+ self.lm_head = new_embeddings
1443
+
1444
+ def set_decoder(self, decoder):
1445
+ self.model = decoder
1446
+
1447
+ def get_decoder(self):
1448
+ return self.model
1449
+
1450
+ @add_start_docstrings_to_model_forward(HUNYUAN_INPUTS_DOCSTRING)
1451
+ @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
1452
+ def forward(
1453
+ self,
1454
+ input_ids: torch.LongTensor = None,
1455
+ attention_mask: Optional[torch.Tensor] = None,
1456
+ position_ids: Optional[torch.LongTensor] = None,
1457
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1458
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1459
+ labels: Optional[torch.LongTensor] = None,
1460
+ use_cache: Optional[bool] = None,
1461
+ output_attentions: Optional[bool] = None,
1462
+ output_hidden_states: Optional[bool] = None,
1463
+ return_dict: Optional[bool] = None,
1464
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
1465
+ r"""
1466
+ Args:
1467
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1468
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
1469
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
1470
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
1471
+
1472
+ Returns:
1473
+
1474
+ Example:
1475
+
1476
+ ```python
1477
+ >>> from transformers import AutoTokenizer, AutoModelForCausalLM
1478
+
1479
+ >>> model = AutoModelForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
1480
+ >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
1481
+
1482
+ >>> prompt = "Hey, are you conscious? Can you talk to me?"
1483
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
1484
+
1485
+ >>> # Generate
1486
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
1487
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
1488
+ "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
1489
+ ```"""
1490
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1491
+ output_hidden_states = (
1492
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1493
+ )
1494
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1495
+
1496
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
1497
+ outputs = self.model(
1498
+ input_ids=input_ids,
1499
+ attention_mask=attention_mask,
1500
+ position_ids=position_ids,
1501
+ past_key_values=past_key_values,
1502
+ inputs_embeds=inputs_embeds,
1503
+ use_cache=use_cache,
1504
+ output_attentions=output_attentions,
1505
+ output_hidden_states=output_hidden_states,
1506
+ return_dict=return_dict,
1507
+ )
1508
+
1509
+ hidden_states = outputs[0]
1510
+ if self.config.pretraining_tp > 1:
1511
+ lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
1512
+ logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
1513
+ logits = torch.cat(logits, dim=-1)
1514
+ else:
1515
+ logits = self.lm_head(hidden_states)
1516
+ logits = logits.float()
1517
+
1518
+ loss = None
1519
+ if labels is not None:
1520
+ # Shift so that tokens < n predict n
1521
+ shift_logits = logits[..., :-1, :].contiguous()
1522
+ shift_labels = labels[..., 1:].contiguous()
1523
+ # Flatten the tokens
1524
+ loss_fct = CrossEntropyLoss()
1525
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
1526
+ shift_labels = shift_labels.view(-1)
1527
+ # Enable model parallelism
1528
+ shift_labels = shift_labels.to(shift_logits.device)
1529
+ loss = loss_fct(shift_logits, shift_labels)
1530
+
1531
+ if not return_dict:
1532
+ output = (logits,) + outputs[1:]
1533
+ return (loss,) + output if loss is not None else output
1534
+
1535
+ return CausalLMOutputWithPast(
1536
+ loss=loss,
1537
+ logits=logits,
1538
+ past_key_values=outputs.past_key_values,
1539
+ hidden_states=outputs.hidden_states,
1540
+ attentions=outputs.attentions,
1541
+ )
1542
+
1543
+ def prepare_inputs_for_generation(
1544
+ self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
1545
+ ):
1546
+ if past_key_values is not None:
1547
+ if isinstance(past_key_values, Cache):
1548
+ cache_length = past_key_values.get_seq_length()
1549
+ past_length = past_key_values.seen_tokens
1550
+ max_cache_length = past_key_values.get_max_cache_shape()
1551
+ else:
1552
+ cache_length = past_length = past_key_values[0][0].shape[2]
1553
+ max_cache_length = None
1554
+
1555
+ # Keep only the unprocessed tokens:
1556
+ # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
1557
+ # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
1558
+ # input)
1559
+ if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
1560
+ input_ids = input_ids[:, -(attention_mask.shape[1] - past_length):]
1561
+ # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
1562
+ # input_ids based on the past_length.
1563
+ elif past_length < input_ids.shape[1]:
1564
+ input_ids = input_ids[:, past_length:]
1565
+ # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
1566
+
1567
+ # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
1568
+ if (
1569
+ max_cache_length is not None
1570
+ and attention_mask is not None
1571
+ and cache_length + input_ids.shape[1] > max_cache_length
1572
+ ):
1573
+ attention_mask = attention_mask[:, -max_cache_length:]
1574
+
1575
+ position_ids = kwargs.get("position_ids", None)
1576
+ if attention_mask is not None and position_ids is None:
1577
+ # create position_ids on the fly for batch generation
1578
+ position_ids = attention_mask.long().cumsum(-1) - 1
1579
+ position_ids.masked_fill_(attention_mask == 0, 1)
1580
+ if past_key_values:
1581
+ position_ids = position_ids[:, -input_ids.shape[1]:]
1582
+
1583
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
1584
+ if inputs_embeds is not None and past_key_values is None:
1585
+ model_inputs = {"inputs_embeds": inputs_embeds}
1586
+ else:
1587
+ model_inputs = {"input_ids": input_ids}
1588
+
1589
+ model_inputs.update(
1590
+ {
1591
+ "position_ids": position_ids,
1592
+ "past_key_values": past_key_values,
1593
+ "use_cache": kwargs.get("use_cache"),
1594
+ "attention_mask": attention_mask,
1595
+ }
1596
+ )
1597
+ return model_inputs
1598
+
1599
+ @staticmethod
1600
+ def _reorder_cache(past_key_values, beam_idx):
1601
+ reordered_past = ()
1602
+ for layer_past in past_key_values:
1603
+ reordered_past += (
1604
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
1605
+ )
1606
+ return reordered_past
1607
+
1608
+
1609
+ @add_start_docstrings(
1610
+ """
1611
+ The HunYuan Model transformer with a sequence classification head on top (linear layer).
1612
+
1613
+ [`HunYuanForSequenceClassification`] uses the last token in order to do the classification, as other causal models
1614
+ (e.g. GPT-2) do.
1615
+
1616
+ Since it does classification on the last token, it requires to know the position of the last token. If a
1617
+ `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
1618
+ no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
1619
+ padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
1620
+ each row of the batch).
1621
+ """,
1622
+ HUNYUAN_START_DOCSTRING,
1623
+ )
1624
+ class HunYuanForSequenceClassification(HunYuanPreTrainedModel):
1625
+ def __init__(self, config):
1626
+ super().__init__(config)
1627
+ self.num_labels = config.num_labels
1628
+ self.model = HunYuanModel(config)
1629
+ self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
1630
+
1631
+ # Initialize weights and apply final processing
1632
+ self.post_init()
1633
+
1634
+ def get_input_embeddings(self):
1635
+ return self.model.embed_tokens
1636
+
1637
+ def set_input_embeddings(self, value):
1638
+ self.model.embed_tokens = value
1639
+
1640
+ @add_start_docstrings_to_model_forward(HUNYUAN_INPUTS_DOCSTRING)
1641
+ def forward(
1642
+ self,
1643
+ input_ids: torch.LongTensor = None,
1644
+ attention_mask: Optional[torch.Tensor] = None,
1645
+ position_ids: Optional[torch.LongTensor] = None,
1646
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1647
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1648
+ labels: Optional[torch.LongTensor] = None,
1649
+ use_cache: Optional[bool] = None,
1650
+ output_attentions: Optional[bool] = None,
1651
+ output_hidden_states: Optional[bool] = None,
1652
+ return_dict: Optional[bool] = None,
1653
+ ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
1654
+ r"""
1655
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1656
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1657
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1658
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1659
+ """
1660
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1661
+
1662
+ transformer_outputs = self.model(
1663
+ input_ids,
1664
+ attention_mask=attention_mask,
1665
+ position_ids=position_ids,
1666
+ past_key_values=past_key_values,
1667
+ inputs_embeds=inputs_embeds,
1668
+ use_cache=use_cache,
1669
+ output_attentions=output_attentions,
1670
+ output_hidden_states=output_hidden_states,
1671
+ return_dict=return_dict,
1672
+ )
1673
+ hidden_states = transformer_outputs[0]
1674
+ logits = self.score(hidden_states)
1675
+
1676
+ if input_ids is not None:
1677
+ batch_size = input_ids.shape[0]
1678
+ else:
1679
+ batch_size = inputs_embeds.shape[0]
1680
+
1681
+ if self.config.pad_token_id is None and batch_size != 1:
1682
+ raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
1683
+ if self.config.pad_token_id is None:
1684
+ sequence_lengths = -1
1685
+ else:
1686
+ if input_ids is not None:
1687
+ sequence_lengths = (torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1).to(
1688
+ logits.device
1689
+ )
1690
+ else:
1691
+ sequence_lengths = -1
1692
+
1693
+ pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
1694
+
1695
+ loss = None
1696
+ if labels is not None:
1697
+ labels = labels.to(logits.device)
1698
+ if self.config.problem_type is None:
1699
+ if self.num_labels == 1:
1700
+ self.config.problem_type = "regression"
1701
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
1702
+ self.config.problem_type = "single_label_classification"
1703
+ else:
1704
+ self.config.problem_type = "multi_label_classification"
1705
+
1706
+ if self.config.problem_type == "regression":
1707
+ loss_fct = MSELoss()
1708
+ if self.num_labels == 1:
1709
+ loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
1710
+ else:
1711
+ loss = loss_fct(pooled_logits, labels)
1712
+ elif self.config.problem_type == "single_label_classification":
1713
+ loss_fct = CrossEntropyLoss()
1714
+ loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
1715
+ elif self.config.problem_type == "multi_label_classification":
1716
+ loss_fct = BCEWithLogitsLoss()
1717
+ loss = loss_fct(pooled_logits, labels)
1718
+ if not return_dict:
1719
+ output = (pooled_logits,) + transformer_outputs[1:]
1720
+ return ((loss,) + output) if loss is not None else output
1721
+
1722
+ return SequenceClassifierOutputWithPast(
1723
+ loss=loss,
1724
+ logits=pooled_logits,
1725
+ past_key_values=transformer_outputs.past_key_values,
1726
+ hidden_states=transformer_outputs.hidden_states,
1727
+ attentions=transformer_outputs.attentions,
1728
+ )
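
`HunYuanMoEV1ForCausalLM` above is the class that text generation goes through. A minimal end-to-end sketch, assuming this folder is loaded with `trust_remote_code=True`; the local path and the dtype/device settings are illustrative, not prescribed by the code:

# Minimal usage sketch for the causal LM defined above; the path and settings are assumptions.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

path = "./"  # assumption: the directory containing this upload
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    path, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True
)

inputs = tokenizer("Hey, are you conscious? Can you talk to me?", return_tensors="pt").to(model.device)
output_ids = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
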
tokenization_hy.py ADDED
@@ -0,0 +1,298 @@
1
+ import base64
2
+ import logging
3
+ import os
4
+ import unicodedata
5
+ from typing import Collection, Dict, List, Set, Tuple, Union
6
+
7
+ import tiktoken
8
+ from transformers import PreTrainedTokenizer, AddedToken
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
+ VOCAB_FILES_NAMES = {"vocab_file": "hy.tiktoken"}
14
+
15
+ PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
16
+ # PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
17
+ ENDOFTEXT = "<|endoftext|>"
18
+ STARTOFTEXT = "<|startoftext|>"
19
+ BOSTOKEN = "<|bos|>"
20
+ EOSTOKEN = "<|eos|>"
21
+ PADTOKEN = "<|pad|>"
22
+
23
+ # as the default behavior is changed to allow special tokens in
24
+ # regular texts, the surface forms of special tokens need to be
25
+ # as different as possible to minimize the impact
26
+ EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
27
+ # changed to use actual index to avoid misconfiguration with vocabulary expansion
28
+
29
+
30
+ SPECIAL_START_ID = 127957
31
+
32
+ def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
33
+ # with open(tiktoken_bpe_file, "rb", encoding="utf-8") as f:
34
+ # contents = f.read()
35
+ dic = {}
36
+ rank = 0
37
+ for line in open(tiktoken_bpe_file, "rb"):
38
+ if line:
39
+ token, _ = line.split()
40
+ if base64.b64decode(token) in dic:
41
+ continue
42
+ dic[base64.b64decode(token)] = int(rank)
43
+ rank += 1
44
+ global SPECIAL_START_ID
45
+ SPECIAL_START_ID = rank
46
+ return dic
47
+
48
+ # NOTE: Run the commented-out line below to verify that `SPECIAL_START_ID` is correct; it determines where the special-token IDs begin
49
+ # _load_tiktoken_bpe('/apdcephfs/share_1502809/shaneshu/tokenizer_exp/other_tokenizer_vocab/hy/' + VOCAB_FILES_NAMES['vocab_file'])
50
+ # print(SPECIAL_START_ID)
51
+
52
+ SPECIAL_TOKENS = tuple(
53
+ enumerate(
54
+ (
55
+ (
56
+ ENDOFTEXT,
57
+ STARTOFTEXT,
58
+ BOSTOKEN,
59
+ EOSTOKEN,
60
+ PADTOKEN,
61
+ )
62
+ + EXTRAS
63
+ ),
64
+ start=SPECIAL_START_ID,
65
+ )
66
+ )
67
+ # NOTE: Unused Token ID starts from 127962
68
+ SPECIAL_TOKENS_SET = set(t for i, t in SPECIAL_TOKENS)
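
Assuming the default `SPECIAL_START_ID` of 127957 (i.e. the tiktoken file contributes 127957 BPE ranks), the enumeration above yields the id layout sketched below; this is a worked example for reference, not additional code in the file:

# <|endoftext|>   -> 127957
# <|startoftext|> -> 127958
# <|bos|>         -> 127959
# <|eos|>         -> 127960
# <|pad|>         -> 127961
# <|extra_0|> ... <|extra_204|> -> 127962 ... 128166
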
69
+
70
+ class HYTokenizer(PreTrainedTokenizer):
71
+ """hunyuan tokenizer."""
72
+
73
+ vocab_files_names = VOCAB_FILES_NAMES
74
+
75
+ def __init__(
76
+ self,
77
+ vocab_file,
78
+ errors="replace",
79
+ extra_vocab_file=None,
80
+ **kwargs,
81
+ ):
82
+ super().__init__(**kwargs)
83
+
84
+ # how to handle errors in decoding UTF-8 byte sequences
85
+ # use errors="ignore" if you are doing streaming inference
86
+ self.errors = errors
87
+
88
+ self.mergeable_ranks = _load_tiktoken_bpe(vocab_file) # type: Dict[bytes, int]
89
+ self.special_tokens = {
90
+ token: index
91
+ for index, token in SPECIAL_TOKENS
92
+ }
93
+
94
+ # try load extra vocab from file
95
+ if extra_vocab_file is not None:
96
+ used_ids = set(self.mergeable_ranks.values()) | set(self.special_tokens.values())
97
+ extra_mergeable_ranks = _load_tiktoken_bpe(extra_vocab_file)
98
+ for token, index in extra_mergeable_ranks.items():
99
+ if token in self.mergeable_ranks:
100
+ logger.info(f"extra token {token} exists, skipping")
101
+ continue
102
+ if index in used_ids:
103
+ logger.info(f'the index {index} for extra token {token} exists, skipping')
104
+ continue
105
+ self.mergeable_ranks[token] = index
106
+ # the index may be sparse after this, but don't worry tiktoken.Encoding will handle this
107
+
108
+ enc = tiktoken.Encoding(
109
+ "HunYuan",
110
+ pat_str=PAT_STR,
111
+ mergeable_ranks=self.mergeable_ranks,
112
+ special_tokens=self.special_tokens,
113
+ )
114
+ assert (
115
+ len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
116
+ ), f"{len(self.mergeable_ranks)} + {len(self.special_tokens)} != {enc.n_vocab} in encoding"
117
+
118
+ self.decoder = {
119
+ v: k for k, v in self.mergeable_ranks.items()
120
+ } # type: dict[int, bytes|str]
121
+ self.decoder.update({v: k for k, v in self.special_tokens.items()})
122
+
123
+ self.tokenizer = enc # type: tiktoken.Encoding
124
+
125
+ self.eod_id = self.tokenizer.eot_token
126
+ self.bod_id = self.special_tokens[STARTOFTEXT]
127
+ self.bos_id = self.special_tokens[BOSTOKEN]
128
+ self.eos_id = self.special_tokens[EOSTOKEN]
129
+ self.pad_id = self.special_tokens[PADTOKEN]
130
+
131
+ def __getstate__(self):
132
+ # for pickle lovers
133
+ state = self.__dict__.copy()
134
+ del state["tokenizer"]
135
+ return state
136
+
137
+ def __setstate__(self, state):
138
+ # tokenizer is not python native; don't pass it; rebuild it
139
+ self.__dict__.update(state)
140
+ enc = tiktoken.Encoding(
141
+ "HunYuan",
142
+ pat_str=PAT_STR,
143
+ mergeable_ranks=self.mergeable_ranks,
144
+ special_tokens=self.special_tokens,
145
+ )
146
+ self.tokenizer = enc
147
+
148
+ def __len__(self) -> int:
149
+ return self.tokenizer.n_vocab
150
+
151
+ def get_vocab(self) -> Dict[bytes, int]:
152
+ return self.mergeable_ranks
153
+
154
+ def convert_tokens_to_ids(
155
+ self, tokens: Union[bytes, str, List[Union[bytes, str]]]
156
+ ) -> List[int]:
157
+ ids = []
158
+ if isinstance(tokens, (str, bytes)):
159
+ if tokens in self.special_tokens:
160
+ return self.special_tokens[tokens]
161
+ else:
162
+ return self.mergeable_ranks.get(tokens)
163
+ for token in tokens:
164
+ if token in self.special_tokens:
165
+ ids.append(self.special_tokens[token])
166
+ else:
167
+ ids.append(self.mergeable_ranks.get(token))
168
+ return ids
169
+
170
+ def _add_tokens(
171
+ self,
172
+ new_tokens: Union[List[str], List[AddedToken]],
173
+ special_tokens: bool = False,
174
+ ) -> int:
175
+ if not special_tokens and new_tokens:
176
+ raise ValueError("Adding regular tokens is not supported")
177
+ for token in new_tokens:
178
+ surface_form = token.content if isinstance(token, AddedToken) else token
179
+ if surface_form not in SPECIAL_TOKENS_SET:
180
+ raise ValueError("Adding unknown special tokens is not supported")
181
+ return 0
182
+
183
+ def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
184
+ """
185
+ Save only the vocabulary of the tokenizer (the mergeable BPE ranks).
186
+ Returns:
187
+ `Tuple(str)`: Paths to the files saved.
188
+ """
189
+ file_path = os.path.join(save_directory, "hunyuan.tiktoken")
190
+ with open(file_path, "w", encoding="utf-8") as w:
191
+ for k, v in self.mergeable_ranks.items():
192
+ line = base64.b64encode(k).decode("utf-8") + " " + str(v) + "\n"
193
+ w.write(line)
194
+ return (file_path,)
195
+
196
+ def tokenize(
197
+ self,
198
+ text: str,
199
+ allowed_special: Union[Set, str] = "all",
200
+ disallowed_special: Union[Collection, str] = (),
201
+ **kwargs,
202
+ ) -> List[Union[bytes, str]]:
203
+ """
204
+ Converts a string into a sequence of tokens.
205
+ Args:
206
+ text (`str`):
207
+ The sequence to be encoded.
208
+ allowed_special (`Literal["all"]` or `set`):
209
+ The surface forms of the tokens to be encoded as special tokens in regular texts.
210
+ Default to "all".
211
+ disallowed_special (`Literal["all"]` or `Collection`):
212
+ The surface forms of the tokens that should not be in regular texts and trigger errors.
213
+ Default to an empty tuple.
214
+ kwargs (additional keyword arguments, *optional*):
215
+ Will be passed to the underlying model specific encode method.
216
+ Returns:
217
+ `List[bytes|str]`: The list of tokens.
218
+ """
219
+ tokens = []
220
+ text = unicodedata.normalize("NFC", text)
221
+
222
+ # this implementation takes a detour: text -> token id -> token surface forms
223
+ for t in self.tokenizer.encode(
224
+ text, allowed_special=allowed_special, disallowed_special=disallowed_special
225
+ ):
226
+ tokens.append(self.decoder[t])
227
+ return tokens
228
+
229
+ def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
230
+ """
231
+ Converts a sequence of tokens into a single string.
232
+ """
233
+ text = ""
234
+ temp = b""
235
+ for t in tokens:
236
+ if isinstance(t, str):
237
+ if temp:
238
+ text += temp.decode("utf-8", errors=self.errors)
239
+ temp = b""
240
+ text += t
241
+ elif isinstance(t, bytes):
242
+ temp += t
243
+ else:
244
+ raise TypeError("token should only be of type types or str")
245
+ if temp:
246
+ text += temp.decode("utf-8", errors=self.errors)
247
+ return text
248
+
249
+ @property
250
+ def vocab_size(self):
251
+ return self.tokenizer.n_vocab
252
+
253
+ def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
254
+ """Converts an id to a token, special tokens included"""
255
+ if index in self.decoder:
256
+ return self.decoder[index]
257
+ raise ValueError("unknown ids")
258
+
259
+ def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
260
+ """Converts a token to an id using the vocab, special tokens included"""
261
+ if token in self.special_tokens:
262
+ return self.special_tokens[token]
263
+ if token in self.mergeable_ranks:
264
+ return self.mergeable_ranks[token]
265
+ raise ValueError("unknown token")
266
+
267
+ def _tokenize(self, text: str, **kwargs):
268
+ """
269
+ Converts a string into a sequence of tokens (string), using the tokenizer. Splits into words for word-based
270
+ vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
271
+ Do NOT take care of added tokens.
272
+ """
273
+ raise NotImplementedError
274
+
275
+ def _decode(
276
+ self,
277
+ token_ids: Union[int, List[int]],
278
+ skip_special_tokens: bool = False,
279
+ errors: str = None,
280
+ **kwargs,
281
+ ) -> str:
282
+ if isinstance(token_ids, int):
283
+ token_ids = [token_ids]
284
+ if skip_special_tokens:
285
+ token_ids = [i for i in token_ids if i < self.eod_id]
286
+ return self.tokenizer.decode(token_ids, errors=errors or self.errors)
287
+
288
+ # tests
289
+ if __name__ == "__main__":
290
+ tokenizer = HYTokenizer.from_pretrained('./hy')
291
+ text = '你好,世界'
292
+ tokens = tokenizer.tokenize(text)
293
+ print(tokens)
294
+ ids = tokenizer.convert_tokens_to_ids(tokens)
295
+ print(ids)
296
+ text2 = tokenizer.convert_tokens_to_string(tokens)
297
+ print(text2)
298
+ ids2 = tokenizer.convert_tokens_to_ids(tokens)
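
The inline test above exercises the class directly; in normal use the tokenizer is resolved through the `auto_map` entry in tokenizer_config.json. A short sketch, assuming it is run from the repository root:

# Sketch: load via AutoTokenizer (resolved through tokenization_hy.HYTokenizer).
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(".", trust_remote_code=True)  # assumption: repo root
ids = tokenizer.encode("你好,世界")
print(ids)
print(tokenizer.decode(ids))
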
tokenizer_config.json ADDED
@@ -0,0 +1,18 @@
1
+ {
2
+ "architectures": [
3
+ "GPT2LMHeadModel"
4
+ ],
5
+ "model_max_length": 1048576,
6
+ "tokenizer_class": "HYTokenizer",
7
+ "auto_map": {
8
+ "AutoTokenizer": [
9
+ "tokenization_hy.HYTokenizer",
10
+ null
11
+ ]
12
+ },
13
+ "eos_token": "<|eos|>",
14
+ "model_type": "gpt2",
15
+ "additional_special_tokens": ["<|startoftext|>", "<|extra_0|>", "<|extra_4|>", "<|extra_5|>", "<|eos|>"],
16
+ "pad_token": "<|pad|>",
17
+ "chat_template": "{% set loop_messages = messages %}\n{% if tools %}\n {% set weekday_map = {'Monday': '星期一', 'Tuesday': '星期二', 'Wednesday': '星期三', 'Thursday': '星期四', 'Friday': '星期五', 'Saturday': '星期六', 'Sunday': '星期日'} %}\n {% set weekday_cn = weekday_map[strftime_now('%A')] %}\n {% set datetime_str = strftime_now('%Y-%m-%d %H:%M:%S') %}\n {% set datetime_str = datetime_str + ' ' + weekday_cn %}\n {% for message in loop_messages %}\n {% if 'content' in message %}\n {% set content = message['content'] %}\n {% else %}\n {% set content = '' %}\n {% endif %}\n {% if loop.index0 == 0 %}\n {% set content_tmp = '你是一位函数组合专家。你会得到一个问题和一组可能的函数。根据问题,你需要进行一个或多个函数/工具调用以实现目的。\n如果没有一个函数可以使用,请直接使用自然语言回复用户,以助手:开头。\n如果给定的问题缺少函数所需的参数,请使用自然语言进行提问,向用户询问必要信息,以助手:开头。\n如果调用结果已经足够回答用户问题,请对历史结果进行总结,使用自然语言回复用户,以助手:开头。\n你应该只在工具调用部分返回函数调用。如果你决定调用任何函数,你必须将其格式化为<tool_calls>[{\"name\": \"func_name1\", \"arguments\": {\"argument1\": \"value1\", \"argument2\": \"value2\"}},...]</tool_calls>。你不应该在回复中包含任何其他文本。以下是你可以调用的函数列表,格式为JSON。\n' %}\n {% set content_tmp = content_tmp + '\n' + tools | tojson + '\n' %}\n {% if message['role'] == 'system' %}\n {% set content_tmp = content_tmp + '\n额外要求:\n' + content + '\n\n如果你决定返回函数调用,请将其格式化为<tool_calls>[{\"name\": \"func_name1\", \"arguments\": {\"argument1\": \"value1\", \"argument2\": \"value2\"}},...]</tool_calls>,不得包含其他文本。如果额外要求里有格式要求,请忽略,以此处为准。\n否则,请参考开头说的三种情况,以助手:开头进行回复。\n\n如果额外要求里有时间信息,就以额外要求里的时间为准,否则,参考当前时间:' + datetime_str %}\n {% set content = '<|startoftext|>' + content_tmp + '<|extra_4|>' %}\n {% elif message['role'] == 'user' %}\n {% set content_tmp = content_tmp + '\n如果你决定返回函数调用,请将其格式化为<tool_calls>[{\"name\": \"func_name1\", \"arguments\": {\"argument1\": \"value1\", \"argument2\": \"value2\"}},...]</tool_calls>,不得包含其他文本。\n否则,请参考开头说的三种情况,以助手:开头进行回复。\n\n当前时间:' + datetime_str %}\n {% set content_tmp = '<|startoftext|>' + content_tmp + '<|extra_4|>'%}\n {% set content = content_tmp + '用户:' + content + '<|extra_0|>' %}\n {% endif %}\n {% else %}\n {% if message['role'] == 'user' %}\n {% set content = '用户:' + content + '<|extra_0|>' %}\n {% elif message['role'] == 'assistant' %}\n {% if 'tool_calls' in message %}\n {% set tool_calls = message['tool_calls'] %}\n {% set ns = namespace(tool_calls=\"[\") %}\n {% for tool_call in tool_calls %}\n {% set function = tool_call['function'] %}\n {% set name = function['name'] %}\n {% set ns.tool_calls = ns.tool_calls + '{\"name\": \"' + name + '\", '%}\n {% set arguments = function['arguments'] %}\n {% if arguments is not string %}\n {% set arguments = arguments | tojson %}\n {% endif %}\n {% set ns.tool_calls = ns.tool_calls + '\"arguments\": ' + arguments + '}' %}\n {% if not loop.last %}\n {% set ns.tool_calls = ns.tool_calls + ', '%}\n {% endif %}\n {% endfor %}\n {% set ns.tool_calls = ns.tool_calls + ']' %}\n {% set content = content + '<tool_calls>' + ns.tool_calls + '</tool_calls>' %}\n {% else %}\n {% set content = '助手:' + content %}\n {% endif %}\n {% set content = content + '<|eos|>' %}\n {% elif message['role'] == 'tool' %}\n {% if content is not string %}\n {set content = content | tojson }\n {% endif %}\n {% set content = '<tool_response>' + content + '</tool_response>' %}\n {% set content = content + '<|extra_0|>' %}\n {% endif %}\n {% endif %}\n {{- content -}}\n {% endfor %}\n{% else %}\n {% set context = {'has_head': true} %}\n {% for message in loop_messages %}\n {% if 'content' in message %}\n {% set content = message['content'] %}\n {% else %}\n {% set content = '' %}\n {% endif %}\n {% if loop.index0 == 0 %}\n {% if 
content == '' %}\n {% set _ = context.update({'has_head': false}) %}\n {% elif message['role'] == 'system' %}\n {% set content = '<|startoftext|>' + content + '<|extra_4|>' %}\n {% endif %}\n {% endif %}\n {% if message['role'] == 'user' %}\n {% if loop.index0 == 1 and not context.has_head %}\n {% set content = '<|startoftext|>' + content %}\n {% endif %}\n {% if loop.index0 == 1 and context.has_head %}\n {% set content = content + '<|extra_0|>' %}\n {% else %}\n {% set content = '<|startoftext|>' + content + '<|extra_0|>' %}\n {% endif %}\n {% elif message['role'] == 'assistant' %}\n {% set content = content + '<|eos|>' %}\n {% elif message['role'] == 'tool' %}\n {% set content = content + '<|extra_0|>' %}\n {% endif %}\n {{- content -}}\n {% endfor %}\n{% endif %}\n{%- if enable_thinking is defined and enable_thinking is false %}\n {{- '<think>\\n\\n</think>\\n' }}\n{%- endif %}"
18
+ }
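
The `chat_template` field above is what `apply_chat_template` renders. A minimal sketch of building a prompt with thinking disabled; tokenizer loading and the path follow the earlier sketches, and the message contents are illustrative:

# Sketch: render the chat template defined in tokenizer_config.json.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(".", trust_remote_code=True)  # assumption: repo root
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a short poem about spring."},
]
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    enable_thinking=False,  # appends the empty <think> block per the template above
)
print(prompt)
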