Update README.md
README.md CHANGED
@@ -34,17 +34,64 @@ The model should be prompted in the manner explained in the Qwen2-VL-7B model card
 from transformers import AutoProcessor, AutoModelForImageTextToText
 from qwen_vl_utils import process_vision_info
 from PIL import Image
+import torch
+from transformers import BitsAndBytesConfig

-processor = AutoProcessor.from_pretrained("letxbe/qwen2-7b-BoundingDocs-rephrased")
-model = AutoModelForImageTextToText.from_pretrained("letxbe/qwen2-7b-BoundingDocs-rephrased")

+def generate_text_from_sample(model, processor, sample, max_new_tokens=1024, device="cuda"):
+    # Prepare the text input by applying the chat template
+    text_input = processor.apply_chat_template(
+        sample[0:2], tokenize=False, add_generation_prompt=True
+    )
+
+    # Process the visual input from the sample
+    image_inputs, _ = process_vision_info(sample)
+
+    # Prepare the inputs for the model
+    model_inputs = processor(
+        text=[text_input],
+        images=image_inputs,
+        return_tensors="pt",
+    ).to(
+        device
+    )  # Move inputs to the specified device
+
+    # Generate text with the model
+    generated_ids = model.generate(**model_inputs, max_new_tokens=max_new_tokens)
+
+    # Trim the generated ids to remove the input ids
+    trimmed_generated_ids = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(model_inputs.input_ids, generated_ids)]
+
+    # Decode the output text
+    output_text = processor.batch_decode(
+        trimmed_generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    )
+
+    return output_text[0]  # Return the first decoded output text
+
+
+min_pixels = 256*28*28
+max_pixels = 512*28*28
+processor = AutoProcessor.from_pretrained('Qwen/Qwen2-VL-7B-Instruct', min_pixels=min_pixels, max_pixels=max_pixels, use_fast=True)
+
+
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_compute_dtype=torch.float16,
+)
+
+model = AutoModelForImageTextToText.from_pretrained(
+    "letxbe/qwen2-7b-BoundingDocs-rephrased",
+    device_map="cuda",
+    quantization_config=bnb_config
+)

 system_message = """You are a Vision Language Model specialized in extracting information from document images.
 Your task is to analyze the provided document image and extract relevant information accurately.
 Documents may contain text, tables, forms, and structured or unstructured data.
 Ensure responses are precise and concise, without additional explanations unless required for clarity."""

-
 TEMPLATE_PROMPT = """
 <starttask>
 Answer the following question about the document:
@@ -60,40 +107,29 @@ question = "question about the document"

 prompt = TEMPLATE_PROMPT.format(QUESTION=question)

-
-
-    "role": "user",
-    "content": [
+message = [
+    # system message
     {
-        "
-        "
+        "role": "system",
+        "content": [{"type": "text", "text": system_message}],
     },
-
-
-
+    # question
+    {
+        "role": "user",
+        "content": [
+            {
+                "type": "image",
+                "image": Image.new("RGB", (512, 512), (255, 255, 255)),
+            },
+            {
+                "type": "text",
+                "text": prompt,
+            },
+        ],
+    }
 ]

-
-text = processor.apply_chat_template(
-    messages, tokenize=False, add_generation_prompt=True
-)
-image_inputs, video_inputs = process_vision_info(messages)
-inputs = processor(
-    text=[text],
-    images=image_inputs,
-    videos=video_inputs,
-    padding=True,
-    return_tensors="pt",
-)
-inputs = inputs.to("cuda")
+output = generate_text_from_sample(model, processor, message)

-
-generated_ids = model.generate(**inputs, max_new_tokens=128)
-generated_ids_trimmed = [
-    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-]
-output_text = processor.batch_decode(
-    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-)
-print(output_text)
+print(output)
 ```
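To run the updated example on an actual document rather than the blank placeholder image, the same `message` structure can be reused with a page loaded from disk. The sketch below assumes the setup code from the README above has already been executed (so `processor`, `model`, `generate_text_from_sample`, `system_message`, and `TEMPLATE_PROMPT` are in scope); the file name and the question are illustrative placeholders, not values from the model card.

```python
from PIL import Image

# Assumes the README snippet above has been run: processor, model,
# generate_text_from_sample, system_message and TEMPLATE_PROMPT are defined.
# "invoice_page_1.png" and the question text are hypothetical examples.
page = Image.open("invoice_page_1.png").convert("RGB")

question = "What is the total amount due?"
prompt = TEMPLATE_PROMPT.format(QUESTION=question)

message = [
    {"role": "system", "content": [{"type": "text", "text": system_message}]},
    {
        "role": "user",
        "content": [
            {"type": "image", "image": page},
            {"type": "text", "text": prompt},
        ],
    },
]

answer = generate_text_from_sample(model, processor, message)
print(answer)
```

`generate_text_from_sample` applies the chat template to the first two turns (`sample[0:2]`), trims the prompt tokens from the generated ids, and decodes only the model's reply, so `answer` is a plain string. If bitsandbytes is not available, loading in half precision should also work by dropping `quantization_config` and passing `torch_dtype=torch.float16` to `from_pretrained`, at the cost of more GPU memory.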