EricGLC
/

EEE

@@ -735,6 +735,26 @@ class Rwkv5Model(Rwkv5PreTrainedModel):
             hidden_states=all_hidden_states,  # None
             attentions=all_self_attentions,  # None
         )
     def _rescale_layers(self):
         # Layers should be rescaled for inference only.

             hidden_states=all_hidden_states,  # None
             attentions=all_self_attentions,  # None
         )
+    def _bnb_4bit_dequantize_and_rescale(self, target_layer, block_id):
+        r"""
+        Rescale the bitsandbytes 4-bit weight of `target_layer` and store it back on the layer.
+
+        The weight is dequantized, divided in place by `2 ** int(block_id // self.config.rescale_every)`, wrapped in
+        a new `bnb.nn.Params4bit` and assigned to `target_layer.weight`, replacing the previous parameter.
+
+        Args:
+            target_layer: Object whose `weight` exposes `data` and `quant_state`; its `weight` attribute is replaced.
+            block_id: Numerator of the floor division that selects the power-of-two divisor (presumably the index of
+                the block that owns the layer - confirm against the caller).
+
+        Raises:
+            ImportError: If `is_bitsandbytes_available()` reports that bitsandbytes is missing.
+        """
+        if not is_bitsandbytes_available():
+            raise ImportError("Please install bitsandbytes to use this method.")
+        import bitsandbytes as bnb
+        packed = target_layer.weight
+        unpacked = bnb.functional.dequantize_4bit(packed.data, packed.quant_state)
+        # Same power-of-two divisor for every layer that falls in the same group of `rescale_every` blocks.
+        exponent = int(block_id // self.config.rescale_every)
+        unpacked.div_(2**exponent)
+        # To quantize again, the tensor is first copied to the CPU and the new parameter is then moved back to the
+        # device the dequantized tensor lives on; this round trip costs time.
+        # requires_grad=False: gradients cannot be computed on 4bit parameters anyway, and it sidesteps bnb bugs.
+        # NOTE(review): only the data and requires_grad are passed, so every other Params4bit setting takes its
+        # default - confirm this matches how the layer was quantized originally.
+        cpu_param = bnb.nn.Params4bit(unpacked.to("cpu"), requires_grad=False)
+        target_layer.weight = cpu_param.to(unpacked.device)
     def _rescale_layers(self):
         # Layers should be rescaled for inference only.