This is a version of Qwen2.5-VL-7B-Instruct fine-tuned with the Listener-Rewarded approach (paper) to predict human preferences, trained on the HPDv2 dataset.

This is a research-preview version. The model still often produces unreliable reasoning traces, hallucinates, and judges at first glance.

Example usage

from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
import torch
from PIL import Image
import re
import json

reasoner = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "alexgambashidze/qwen2.5vl_image_preference_reasoner",
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct", max_pixels=720*28*28)

SYSTEM_PROMPT = (
    "The user has two images and a textual prompt. "
    "You need to reason carefully inside <think>...</think> tags and produce an answer in <answer>...</answer> tags where you should choose best image."
)

image1_path = 'Your first image path'
image2_path = 'Your second image path'

# Load and resize both candidate images
image1 = Image.open(image1_path).convert("RGB").resize((512, 512), Image.LANCZOS)
image2 = Image.open(image2_path).convert("RGB").resize((512, 512), Image.LANCZOS)

user_prompt = "A beautiful sunset over mountains"

user_content = [
    {"type": "image"},
    {"type": "image"},
    {
        "type": "text",
        "text": (
            f"User prompt: {user_prompt}\n\n"
            "Which image is better given the prompt?"
            "Provide your reasoning in <think>...</think> tags, "
            'and the final JSON answer in <answer>{"preferred":"second"}</answer> or {"preferred":"first"}.\n'
        ),
    },
]

conversation_prompt = [
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": user_content},
]

# Render the conversation with the chat template, then tokenize text and images together
conversation_text = processor.apply_chat_template(
    conversation_prompt,
    add_generation_prompt=True,
    tokenize=False,
)

inputs = processor(
    text=[conversation_text],
    images=[[image1, image2]],
    return_tensors="pt",
    padding=True,
).to(reasoner.device)

with torch.no_grad():
    output = reasoner.generate(
        **inputs,
        max_new_tokens=1024,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=processor.tokenizer.eos_token_id
    )

# Decode only the newly generated tokens, skipping the prompt portion of the sequence
generated_ids = output[0][inputs["input_ids"].shape[1]:]
assistant_response = processor.decode(generated_ids, skip_special_tokens=True).strip()

print("Reasoner response:")
print(assistant_response)

pattern_answer_tags = r"<answer>(.*?)</answer>"
match = re.search(pattern_answer_tags, assistant_response, flags=re.DOTALL)
if match:
    answer_content = match.group(1)
    try:
        answer_json = json.loads(answer_content)
        preferred = answer_json.get("preferred", "unknown")
        print(f"\n preference: {preferred}")
    except json.JSONDecodeError:
        print(f"\nraw answer: {answer_content}")
else:
    print("\n no answer tags found")