simogiova committed · Commit 221cddf · verified · Parent: d48d349

Update README.md

Files changed (1): README.md (+70 −34)
README.md CHANGED
````diff
@@ -34,17 +34,64 @@ The model should be prompted in the manner explained in the Qwen2-VL-7B model card
 from transformers import AutoProcessor, AutoModelForImageTextToText
 from qwen_vl_utils import process_vision_info
 from PIL import Image
+import torch
+from transformers import BitsAndBytesConfig
 
-processor = AutoProcessor.from_pretrained("letxbe/qwen2-7b-BoundingDocs-rephrased")
-model = AutoModelForImageTextToText.from_pretrained("letxbe/qwen2-7b-BoundingDocs-rephrased")
 
+def generate_text_from_sample(model, processor, sample, max_new_tokens=1024, device="cuda"):
+    # Prepare the text input by applying the chat template
+    text_input = processor.apply_chat_template(
+        sample[0:2], tokenize=False, add_generation_prompt=True
+    )
+
+    # Process the visual input from the sample
+    image_inputs, _ = process_vision_info(sample)
+
+    # Prepare the inputs for the model
+    model_inputs = processor(
+        text=[text_input],
+        images=image_inputs,
+        return_tensors="pt",
+    ).to(device)  # Move inputs to the specified device
+
+    # Generate text with the model
+    generated_ids = model.generate(**model_inputs, max_new_tokens=max_new_tokens)
+
+    # Trim the generated ids to remove the input ids
+    trimmed_generated_ids = [
+        out_ids[len(in_ids):] for in_ids, out_ids in zip(model_inputs.input_ids, generated_ids)
+    ]
+
+    # Decode the output text
+    output_text = processor.batch_decode(
+        trimmed_generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    )
+
+    return output_text[0]  # Return the first decoded output text
+
+
+min_pixels = 256*28*28
+max_pixels = 512*28*28
+processor = AutoProcessor.from_pretrained('Qwen/Qwen2-VL-7B-Instruct', min_pixels=min_pixels, max_pixels=max_pixels, use_fast=True)
+
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_compute_dtype=torch.float16,
+)
+
+model = AutoModelForImageTextToText.from_pretrained(
+    "letxbe/qwen2-7b-BoundingDocs-rephrased",
+    device_map="cuda",
+    quantization_config=bnb_config
+)
 
 system_message = """You are a Vision Language Model specialized in extracting information from document images.
 Your task is to analyze the provided document image and extract relevant information accurately.
 Documents may contain text, tables, forms, and structured or unstructured data.
 Ensure responses are precise and concise, without additional explanations unless required for clarity."""
 
-
 TEMPLATE_PROMPT = """
 <starttask>
 Answer the following question about the document:
@@ -60,40 +107,29 @@ question = "question about the document"
 
 prompt = TEMPLATE_PROMPT.format(QUESTION=question)
 
-messages = [
+message = [
+    # system message
     {
-        "role": "user",
-        "content": [
-            {
-                "type": "image",
-                "image": Image.new("RGB", (512, 512), (255, 255, 255)),  # the document image
-            },
-            {"type": "text", "text": prompt},
-        ],
-    }
+        "role": "system",
+        "content": [{"type": "text", "text": system_message}],
+    },
+    # question
+    {
+        "role": "user",
+        "content": [
+            {
+                "type": "image",
+                "image": Image.new("RGB", (512, 512), (255, 255, 255)),
+            },
+            {
+                "type": "text",
+                "text": prompt,
+            },
+        ],
+    }
 ]
 
-# Preparation for inference
-text = processor.apply_chat_template(
-    messages, tokenize=False, add_generation_prompt=True
-)
-image_inputs, video_inputs = process_vision_info(messages)
-inputs = processor(
-    text=[text],
-    images=image_inputs,
-    videos=video_inputs,
-    padding=True,
-    return_tensors="pt",
-)
-inputs = inputs.to("cuda")
+output = generate_text_from_sample(model, processor, message)
 
-# Inference: Generation of the output
-generated_ids = model.generate(**inputs, max_new_tokens=128)
-generated_ids_trimmed = [
-    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-]
-output_text = processor.batch_decode(
-    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-)
-print(output_text)
+print(output)
 ```
````
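The switch to `BitsAndBytesConfig` with `load_in_4bit=True` is what lets the 7B checkpoint fit comfortably on a single GPU. The diff itself does not discuss memory, so the numbers below are only a back-of-the-envelope sketch, assuming roughly 7B parameters and counting weights only (activations, the vision tower, and the KV cache add more on top):

```python
# Rough weight-memory estimate for a ~7B-parameter model (weights only).
params = 7e9  # approximate parameter count, an assumption for illustration

fp16_gib = params * 2 / 2**30    # fp16: 2 bytes per parameter
int4_gib = params * 0.5 / 2**30  # 4-bit: 0.5 bytes per parameter

print(f"fp16 weights: ~{fp16_gib:.0f} GiB")   # ~13 GiB
print(f"4-bit weights: ~{int4_gib:.0f} GiB")  # ~3 GiB
```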
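The `min_pixels`/`max_pixels` arguments bound how the processor rescales each page before patching. Per the Qwen2-VL model card, one visual token corresponds to a 28×28-pixel area after patch merging, which is why the bounds are written as multiples of `28*28`; under that reading, the settings in the updated snippet cap a page at roughly 256 to 512 visual tokens. A quick check of the arithmetic:

```python
# One Qwen2-VL visual token covers a 28x28-pixel area after the 2x2 patch merge,
# so the pixel bounds translate directly into visual-token bounds.
PIXELS_PER_VISUAL_TOKEN = 28 * 28

min_pixels = 256 * 28 * 28  # 200_704 pixels
max_pixels = 512 * 28 * 28  # 401_408 pixels

print(min_pixels // PIXELS_PER_VISUAL_TOKEN)  # 256 visual tokens at the low end
print(max_pixels // PIXELS_PER_VISUAL_TOKEN)  # 512 visual tokens at the high end
```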
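Running the updated snippet on a real page only requires swapping the blank placeholder image for a loaded document. A minimal sketch, assuming `model`, `processor`, `system_message`, `TEMPLATE_PROMPT`, and `generate_text_from_sample` from the new version above are already defined; the file name and question are hypothetical:

```python
from PIL import Image

# Hypothetical input file; replace with an actual scanned page.
page = Image.open("sample_invoice.png").convert("RGB")

question = "What is the total amount due?"  # illustrative question
prompt = TEMPLATE_PROMPT.format(QUESTION=question)

message = [
    {"role": "system", "content": [{"type": "text", "text": system_message}]},
    {
        "role": "user",
        "content": [
            {"type": "image", "image": page},
            {"type": "text", "text": prompt},
        ],
    },
]

print(generate_text_from_sample(model, processor, message))
```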