Update README.md
README.md CHANGED
@@ -34,17 +34,64 @@ The model should be prompted in the manner explained in the Qwen2-VL-7B model card
 from transformers import AutoProcessor, AutoModelForImageTextToText
 from qwen_vl_utils import process_vision_info
 from PIL import Image
+import torch
+from transformers import BitsAndBytesConfig

-processor = AutoProcessor.from_pretrained("letxbe/qwen2-7b-BoundingDocs-rephrased")
-model = AutoModelForImageTextToText.from_pretrained("letxbe/qwen2-7b-BoundingDocs-rephrased")

+def generate_text_from_sample(model, processor, sample, max_new_tokens=1024, device="cuda"):
+    # Prepare the text input by applying the chat template
+    text_input = processor.apply_chat_template(
+        sample[0:2], tokenize=False, add_generation_prompt=True
+    )
+
+    # Process the visual input from the sample
+    image_inputs, _ = process_vision_info(sample)
+
+    # Prepare the inputs for the model
+    model_inputs = processor(
+        text=[text_input],
+        images=image_inputs,
+        return_tensors="pt",
+    ).to(
+        device
+    )  # Move inputs to the specified device
+
+    # Generate text with the model
+    generated_ids = model.generate(**model_inputs, max_new_tokens=max_new_tokens)
+
+    # Trim the generated ids to remove the input ids
+    trimmed_generated_ids = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(model_inputs.input_ids, generated_ids)]
+
+    # Decode the output text
+    output_text = processor.batch_decode(
+        trimmed_generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    )
+
+    return output_text[0]  # Return the first decoded output text
+
+
+min_pixels = 256*28*28
+max_pixels = 512*28*28
+processor = AutoProcessor.from_pretrained('Qwen/Qwen2-VL-7B-Instruct', min_pixels=min_pixels, max_pixels=max_pixels, use_fast=True)
+
+
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_compute_dtype=torch.float16,
+)
+
+model = AutoModelForImageTextToText.from_pretrained(
+    "letxbe/qwen2-7b-BoundingDocs-rephrased",
+    device_map="cuda",
+    quantization_config=bnb_config
+)

 system_message = """You are a Vision Language Model specialized in extracting information from document images.
 Your task is to analyze the provided document image and extract relevant information accurately.
 Documents may contain text, tables, forms, and structured or unstructured data.
 Ensure responses are precise and concise, without additional explanations unless required for clarity."""

-
 TEMPLATE_PROMPT = """
 <starttask>
 Answer the following question about the document:
@@ -60,40 +107,29 @@ question = "question about the document"

 prompt = TEMPLATE_PROMPT.format(QUESTION=question)

-
-
-    "role": "user",
-    "content": [
+message = [
+    # system message
     {
-        "
-        "
+        "role": "system",
+        "content": [{"type": "text", "text": system_message}],
     },
-
-
-
+    # question
+    {
+        "role": "user",
+        "content": [
+            {
+                "type": "image",
+                "image": Image.new("RGB", (512, 512), (255, 255, 255)),
+            },
+            {
+                "type": "text",
+                "text": prompt,
+            },
+        ],
+    }
 ]

-
-text = processor.apply_chat_template(
-    messages, tokenize=False, add_generation_prompt=True
-)
-image_inputs, video_inputs = process_vision_info(messages)
-inputs = processor(
-    text=[text],
-    images=image_inputs,
-    videos=video_inputs,
-    padding=True,
-    return_tensors="pt",
-)
-inputs = inputs.to("cuda")
+output = generate_text_from_sample(model, processor, message)

-
-generated_ids = model.generate(**inputs, max_new_tokens=128)
-generated_ids_trimmed = [
-    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-]
-output_text = processor.batch_decode(
-    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-)
-print(output_text)
+print(output)
 ```
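To run the updated example on an actual document rather than the blank placeholder image, the same `message` structure can be reused with a page loaded from disk. The sketch below assumes the setup code from the README above has already been executed (so `processor`, `model`, `generate_text_from_sample`, `system_message`, and `TEMPLATE_PROMPT` are in scope); the file name and the question are illustrative placeholders, not values from the model card.

```python
from PIL import Image

# Assumes the README snippet above has been run: processor, model,
# generate_text_from_sample, system_message and TEMPLATE_PROMPT are defined.
# "invoice_page_1.png" and the question text are hypothetical examples.
page = Image.open("invoice_page_1.png").convert("RGB")

question = "What is the total amount due?"
prompt = TEMPLATE_PROMPT.format(QUESTION=question)

message = [
    {"role": "system", "content": [{"type": "text", "text": system_message}]},
    {
        "role": "user",
        "content": [
            {"type": "image", "image": page},
            {"type": "text", "text": prompt},
        ],
    },
]

answer = generate_text_from_sample(model, processor, message)
print(answer)
```

`generate_text_from_sample` applies the chat template to the first two turns (`sample[0:2]`), trims the prompt tokens from the generated ids, and decodes only the model's reply, so `answer` is a plain string. If bitsandbytes is not available, loading in half precision should also work by dropping `quantization_config` and passing `torch_dtype=torch.float16` to `from_pretrained`, at the cost of more GPU memory.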