Upload configuration_minicpm.py
configuration_minicpm.py  CHANGED  (+19 -18)
@@ -1,10 +1,5 @@
 # coding=utf-8
-# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
-#
-# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
-# and OPT implementations in this library. It has been modified from its
-# original forms to accommodate minor architectural differences compared
-# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+# Copyright 2025 The OpenBMB Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -22,7 +17,6 @@
 from transformers.configuration_utils import PretrainedConfig
 from transformers.utils import logging
 
-
 logger = logging.get_logger(__name__)
 
 MINICPM_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
@@ -111,8 +105,8 @@ class MiniCPMConfig(PretrainedConfig):
     >>> configuration = model.config
     ```"""
 
-    model_type = "minicpm"
-    keys_to_ignore_at_inference = ["past_key_values"]
+    model_type = 'minicpm'
+    keys_to_ignore_at_inference = ['past_key_values']
 
     def __init__(
         self,
@@ -122,7 +116,7 @@ class MiniCPMConfig(PretrainedConfig):
         num_hidden_layers=32,
         num_attention_heads=32,
         num_key_value_heads=None,
-        hidden_act="silu",
+        hidden_act='silu',
         max_position_embeddings=2048,
         initializer_range=0.02,
         rms_norm_eps=1e-6,
@@ -139,8 +133,10 @@ class MiniCPMConfig(PretrainedConfig):
         scale_emb=1,
         dim_model_base=1,
         scale_depth=1,
-        **kwargs,
-    ):
+        mup_denominator=32,
+        sparse_config=None,
+        **kwargs):
+
         self.vocab_size = vocab_size
         self.max_position_embeddings = max_position_embeddings
         self.hidden_size = hidden_size
@@ -166,6 +162,11 @@ class MiniCPMConfig(PretrainedConfig):
         self.scale_emb = scale_emb
         self.dim_model_base = dim_model_base
         self.scale_depth = scale_depth
+        # only used for Eagle Head
+        self.mup_denominator = mup_denominator
+
+        # sparse config
+        self.sparse_config = sparse_config
 
         super().__init__(
             pad_token_id=pad_token_id,
@@ -176,7 +177,7 @@ class MiniCPMConfig(PretrainedConfig):
         )
         try:
             import flash_attn
-            self._attn_implementation = "flash_attention_2"
+            self._attn_implementation = 'flash_attention_2'
         except:
             pass
 
@@ -189,12 +190,12 @@ class MiniCPMConfig(PretrainedConfig):
 
         if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
             raise ValueError(
-                "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, "
-                f"got {self.rope_scaling}"
+                '`rope_scaling` must be a dictionary with two fields, `type` and `factor`, '
+                f'got {self.rope_scaling}'
             )
-        rope_scaling_type = self.rope_scaling.get("type", None)
-        rope_scaling_factor = self.rope_scaling.get("factor", None)
-        if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
+        rope_scaling_type = self.rope_scaling.get('type', None)
+        rope_scaling_factor = self.rope_scaling.get('factor', None)
+        if rope_scaling_type is None or rope_scaling_type not in ['linear', 'dynamic']:
             raise ValueError(
                 f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
            )
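
A minimal usage sketch of the new constructor arguments, for reviewers. It is not part of the commit and assumes the uploaded configuration_minicpm.py is importable, with every other argument left at its default.

```python
# Hypothetical sketch: exercise the new `mup_denominator` and `sparse_config`
# arguments; everything else keeps the defaults defined in the file above.
from configuration_minicpm import MiniCPMConfig

config = MiniCPMConfig(
    mup_denominator=32,  # per the inline comment, only used for the Eagle head
    sparse_config=None,  # stored as-is; interpretation is left to the modeling code
)

print(config.model_type)       # minicpm
print(config.mup_denominator)  # 32
print(config.sparse_config)    # None
```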
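
The try/except around `import flash_attn` silently switches the config to FlashAttention 2 whenever the package is importable; the bare `except:` means any import failure is ignored. A hedged check of that behaviour, assuming a transformers version in which `PretrainedConfig` exposes the `_attn_implementation` attribute:

```python
# Hypothetical sketch: verify the flash_attn auto-detection in __init__.
import importlib.util
from configuration_minicpm import MiniCPMConfig

config = MiniCPMConfig()
if importlib.util.find_spec('flash_attn') is not None:
    # flash_attn is installed, so __init__ opted into FlashAttention 2
    assert config._attn_implementation == 'flash_attention_2'
else:
    print('flash_attn not installed; the default attention implementation is kept')
```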
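
The last hunk only changes the quoting inside `_rope_scaling_validation`, so its behaviour should be unchanged. A hedged illustration, assuming the method is invoked from `__init__` as in the upstream MiniCPM/Llama configurations:

```python
# Hypothetical sketch: a two-field rope_scaling dict with a supported type
# passes the checks shown in the diff; an unsupported type raises ValueError.
from configuration_minicpm import MiniCPMConfig

MiniCPMConfig(rope_scaling={'type': 'dynamic', 'factor': 2.0})  # accepted

try:
    MiniCPMConfig(rope_scaling={'type': 'yarn', 'factor': 2.0})  # unsupported type
except ValueError as err:
    print(err)  # `rope_scaling`'s type field must be one of ['linear', 'dynamic'], ...
```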