zhibinlan
/

LLaVE-2B

@@ -82,8 +82,7 @@ conv_template = "qwen_1_5"  # Make sure you use correct chat template for differ
 question = DEFAULT_IMAGE_TOKEN + " Represent the given image with the following question: What is in the image"
 conv = copy.deepcopy(conv_templates[conv_template])
 conv.append_message(conv.roles[0], question)
-conv.append_message(conv.roles[1], "
-")
 prompt_question = conv.get_prompt()
 input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
 attention_mask=input_ids.ne(tokenizer.pad_token_id)
@@ -93,8 +92,7 @@ query_embed = model.encode_multimodal_embeddings(input_ids, attention_mask=atten
 target_string = "A cat and a dog"
 conv = copy.deepcopy(conv_templates[conv_template])
 conv.append_message(conv.roles[0], target_string)
-conv.append_message(conv.roles[1], "
-")
 target_string = conv.get_prompt()
 target_input_ids = tokenizer(target_string, return_tensors="pt").input_ids.to(device)
 attention_mask=target_input_ids.ne(tokenizer.pad_token_id)
@@ -106,8 +104,7 @@ print("A cat and a dog similarity score: ", query_embed @ target_embed.T)
 neg_string = "A cat and a tiger"
 conv = copy.deepcopy(conv_templates[conv_template])
 conv.append_message(conv.roles[0], neg_string)
-conv.append_message(conv.roles[1], "
-")
 neg_string = conv.get_prompt()
 neg_input_ids = tokenizer(neg_string, return_tensors="pt").input_ids.to(device)
 attention_mask=neg_input_ids.ne(tokenizer.pad_token_id)
@@ -120,8 +117,7 @@ print("A cat and a tiger similarity score: ", query_embed @ neg_embed.T)
 pos_string = "Find me an everyday image that matches the given caption: A cat and a dog."
 conv = copy.deepcopy(conv_templates[conv_template])
 conv.append_message(conv.roles[0], pos_string)
-conv.append_message(conv.roles[1], "
-")
 pos_string = conv.get_prompt()
 pos_input_ids = tokenizer(pos_string, return_tensors="pt").input_ids.to(device)
 attention_mask=pos_input_ids.ne(tokenizer.pad_token_id)
@@ -130,8 +126,7 @@ pos_query_embed = model.encode_multimodal_embeddings(pos_input_ids, attention_ma
 target = DEFAULT_IMAGE_TOKEN + " Represent the given image."
 conv = copy.deepcopy(conv_templates[conv_template])
 conv.append_message(conv.roles[0], target)
-conv.append_message(conv.roles[1], "
-")
 prompt_target = conv.get_prompt()
 target_input_ids = tokenizer_image_token(prompt_target, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
 attention_mask=target_input_ids.ne(tokenizer.pad_token_id)
@@ -144,8 +139,7 @@ print("A cat and a dog image similarity score: ", pos_query_embed @ target_embed
 neg_string = "Find me an everyday image that matches the given caption: A cat and a tiger."
 conv = copy.deepcopy(conv_templates[conv_template])
 conv.append_message(conv.roles[0], neg_string)
-conv.append_message(conv.roles[1], "
-")
 neg_string = conv.get_prompt()
 neg_input_ids = tokenizer(neg_string, return_tensors="pt").input_ids.to(device)
 attention_mask=neg_input_ids.ne(tokenizer.pad_token_id)

 question = DEFAULT_IMAGE_TOKEN + " Represent the given image with the following question: What is in the image"
 conv = copy.deepcopy(conv_templates[conv_template])
 conv.append_message(conv.roles[0], question)
+conv.append_message(conv.roles[1], "\n")
 prompt_question = conv.get_prompt()
 input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
 attention_mask=input_ids.ne(tokenizer.pad_token_id)
 target_string = "A cat and a dog"
 conv = copy.deepcopy(conv_templates[conv_template])
 conv.append_message(conv.roles[0], target_string)
+conv.append_message(conv.roles[1], "\n")
 target_string = conv.get_prompt()
 target_input_ids = tokenizer(target_string, return_tensors="pt").input_ids.to(device)
 attention_mask=target_input_ids.ne(tokenizer.pad_token_id)
 neg_string = "A cat and a tiger"
 conv = copy.deepcopy(conv_templates[conv_template])
 conv.append_message(conv.roles[0], neg_string)
+conv.append_message(conv.roles[1], "\n")
 neg_string = conv.get_prompt()
 neg_input_ids = tokenizer(neg_string, return_tensors="pt").input_ids.to(device)
 attention_mask=neg_input_ids.ne(tokenizer.pad_token_id)
 pos_string = "Find me an everyday image that matches the given caption: A cat and a dog."
 conv = copy.deepcopy(conv_templates[conv_template])
 conv.append_message(conv.roles[0], pos_string)
+conv.append_message(conv.roles[1], "\n")
 pos_string = conv.get_prompt()
 pos_input_ids = tokenizer(pos_string, return_tensors="pt").input_ids.to(device)
 attention_mask=pos_input_ids.ne(tokenizer.pad_token_id)
 target = DEFAULT_IMAGE_TOKEN + " Represent the given image."
 conv = copy.deepcopy(conv_templates[conv_template])
 conv.append_message(conv.roles[0], target)
+conv.append_message(conv.roles[1], "\n")
 prompt_target = conv.get_prompt()
 target_input_ids = tokenizer_image_token(prompt_target, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
 attention_mask=target_input_ids.ne(tokenizer.pad_token_id)
 neg_string = "Find me an everyday image that matches the given caption: A cat and a tiger."
 conv = copy.deepcopy(conv_templates[conv_template])
 conv.append_message(conv.roles[0], neg_string)
+conv.append_message(conv.roles[1], "\n")
 neg_string = conv.get_prompt()
 neg_input_ids = tokenizer(neg_string, return_tensors="pt").input_ids.to(device)
 attention_mask=neg_input_ids.ne(tokenizer.pad_token_id)