---
library_name: transformers
tags: []
---

## Uses

The snippet below subclasses `Qwen2ForCausalLM` with a scalar `ValueHead` and uses it to score a chat conversation: the value predicted at the last (eos) token of each sequence is taken as the reward.

```python
import torch
import torch.nn as nn
from transformers import Qwen2ForCausalLM, AutoTokenizer


class ValueHead(nn.Module):
    r"""
    The ValueHead class implements a head for causal language models that returns a
    scalar value for each output token.
    """

    def __init__(self, config, **kwargs):
        super().__init__()
        if not hasattr(config, "summary_dropout_prob"):
            summary_dropout_prob = kwargs.pop("summary_dropout_prob", 0.1)
        else:
            summary_dropout_prob = config.summary_dropout_prob

        self.dropout = (
            nn.Dropout(summary_dropout_prob) if summary_dropout_prob else nn.Identity()
        )

        # some models such as OPT have a projection layer before the word embeddings - e.g. OPT-350m
        if hasattr(config, "hidden_size"):
            hidden_size = config.hidden_size
        if hasattr(config, "word_embed_proj_dim"):
            hidden_size = config.word_embed_proj_dim
        elif hasattr(config, "is_encoder_decoder"):
            if config.is_encoder_decoder and hasattr(config, "decoder"):
                if hasattr(config.decoder, "hidden_size"):
                    hidden_size = config.decoder.hidden_size

        self.summary = nn.Linear(hidden_size, 1)
        self.flatten = nn.Flatten()

    def forward(self, hidden_states):
        output = self.dropout(hidden_states)

        # For now force upcast in fp32 if needed. Let's keep the
        # output in fp32 for numerical stability.
        if output.dtype != self.summary.weight.dtype:
            output = output.to(self.summary.weight.dtype)

        output = self.summary(output)
        return output


class Qwen2ForCausalRM(Qwen2ForCausalLM):
    def __init__(self, config):
        super().__init__(config)
        self.v_head = ValueHead(config)

    def forward(
        self,
        input_ids=None,
        past_key_values=None,
        attention_mask=None,
        return_past_key_values=False,
        **kwargs,
    ):
        r"""
        Applies a forward pass to the base model and returns the LM logits, the loss,
        and the value-head output for each token.

        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary.
            past_key_values (`tuple(tuple(torch.FloatTensor))`, `optional`):
                Contains pre-computed hidden-states (key and values in the attention blocks)
                as computed by the model (see `past_key_values` input) to speed up sequential decoding.
            attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, `optional`):
                Mask to avoid performing attention on padding token indices. Mask values selected
                in ``[0, 1]``:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
            return_past_key_values (bool):
                A flag indicating if the computed past key values should be returned.
            kwargs (`dict`, `optional`):
                Additional keyword arguments that are passed to the wrapped model.
""" kwargs["output_hidden_states"] = ( True # this had already been set in the LORA / PEFT examples ) kwargs["past_key_values"] = past_key_values # if ( # self.is_peft_model # and # self.pretrained_model.active_peft_config.peft_type == "PREFIX_TUNING" # ): # kwargs.pop("past_key_values") base_model_output = super().forward( input_ids=input_ids, attention_mask=attention_mask, **kwargs, ) last_hidden_state = base_model_output.hidden_states[-1] lm_logits = base_model_output.logits loss = base_model_output.loss if last_hidden_state.device != self.v_head.summary.weight.device: last_hidden_state = last_hidden_state.to(self.v_head.summary.weight.device) value = self.v_head(last_hidden_state).squeeze(-1) # force upcast in fp32 if logits are in half-precision if lm_logits.dtype != torch.float32: lm_logits = lm_logits.float() if return_past_key_values: return (lm_logits, loss, value, base_model_output.past_key_values) else: return (lm_logits, loss, value) model_path = "CodeDPO/qwen_coder_2.5_rm" model = Qwen2ForCausalRM.from_pretrained(model_path, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) input_chat = [ {"role": "user", "content": "Hello, how are you?"}, { "role": "assistant", "content": "I'm doing great. How can I help you today?", }, { "role": "user", "content": "I'd like to show off how chat templating works!", }, ] input_tokens = tokenizer.apply_chat_template( input_chat, tokenize=True, return_dict=True, padding=True, return_tensors="pt", ).to(model.device) _, _, values = model( **input_tokens, output_hidden_states=True, return_dict=True, use_cache=False, ) masks = input_tokens["attention_mask"] chosen_scores = values.gather( dim=-1, index=(masks.sum(dim=-1, keepdim=True) - 1) ) # find the last token (eos) in each sequence, a chosen_scores = chosen_scores.squeeze() print(chosen_scores) ```