happyme531 committed (verified)
Commit aa4c9e8 · 1 Parent(s): e2fa3aa

Update models and scripts with toolchain version 2.3.2

onnx/convert.py CHANGED
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 # coding: utf-8
 
+import numpy as np
 from rknn.api import RKNN
 from math import exp
 from sys import exit
@@ -67,7 +68,64 @@ def convert_decoder():
                     [batch_size, decoder_seq_len, 768]] for encoder_seq_len in encoder_seq_len_list]
     # pre-process config
     print('--> Config model')
-    rknn.config(quantized_algorithm='normal', quantized_method='channel', target_platform='rk3588', optimization_level=3, single_core_mode=True,
+    rknn.config(quantized_algorithm='normal', quantized_method='channel', target_platform='rk3588', optimization_level=3,
+                dynamic_input=input_shapes)
+    print('done')
+
+    # Load ONNX model
+    print('--> Loading model')
+    ret = rknn.load_onnx(model=ONNX_MODEL,
+                         )
+    if ret != 0:
+        print('Load model failed!')
+        exit(ret)
+    print('done')
+
+    # Build model
+    print('--> Building model')
+    ret = rknn.build(do_quantization=QUANTIZE, dataset=DATASET, rknn_batch_size=None)
+    if ret != 0:
+        print('Build model failed!')
+        exit(ret)
+    print('done')
+
+    # export
+    print('--> Export RKNN model')
+    ret = rknn.export_rknn(RKNN_MODEL)
+    if ret != 0:
+        print('Export RKNN model failed!')
+        exit(ret)
+    print('done')
+
+def convert_decoder_2():
+    import onnx_graphsurgeon as gs
+    ONNX_MODEL="decoder_model_merged.onnx"
+
+    graph = gs.import_onnx(onnx.load(ONNX_MODEL))
+    inp = graph.inputs[27]  # use_cache_branch
+    inp.to_constant(np.array([True], dtype=np.bool_))
+    ONNX_MODEL
+    onnx.save(gs.export_onnx(graph), "new_model.onnx")
+
+    np_true = np.array([True], dtype=np.bool_)
+    np.save("np_true.npy", np_true)
+
+
+    rknn = RKNN(verbose=True)
+
+    RKNN_MODEL=ONNX_MODEL.replace(".onnx",".rknn")
+    DATASET="dataset.txt"
+    QUANTIZE=False
+
+    # [[batch_size, encoder_seq_len],
+    #  [batch_size, encoder_seq_len, 768],
+    #  [batch_size, decoder_seq_len, 768]]
+    input_shapes =[[[batch_size, encoder_seq_len],
+                    [batch_size, encoder_seq_len, 768],
+                    [batch_size, decoder_seq_len, 768]] for encoder_seq_len in encoder_seq_len_list]
+    # pre-process config
+    print('--> Config model')
+    rknn.config(quantized_algorithm='normal', quantized_method='channel', target_platform='rk3588', optimization_level=3,
                 dynamic_input=input_shapes)
     print('done')
 
@@ -108,7 +166,7 @@ def convert_encoder():
     input_shapes = [[[batch_size, encoder_seq_len], [batch_size, encoder_seq_len, 768]] for encoder_seq_len in encoder_seq_len_list]
     # pre-process config
     print('--> Config model')
-    rknn.config(quantized_algorithm='normal', quantized_method='channel', target_platform='rk3588', optimization_level=3, single_core_mode=True, dynamic_input=input_shapes)
+    rknn.config(quantized_algorithm='normal', quantized_method='channel', target_platform='rk3588', optimization_level=3, dynamic_input=input_shapes)
     print('done')
 
     # Load ONNX model
@@ -137,49 +195,43 @@ def convert_encoder():
     print('done')
 
 def convert_vision():
+    ONNX_MODEL="vision_encoder.onnx"
+    DATASET="dataset.txt"
+    QUANTIZE=False
+    global batch_size
+
+    ##### Build stage 1
+    from rknn.api import RKNN
     rknn = RKNN(verbose=True)
-
     ONNX_MODEL="vision_encoder.onnx"
+    RKNN_MODEL=ONNX_MODEL.replace(".onnx",".rknn")
     DATASET="dataset.txt"
     QUANTIZE=False
+    # pre-process config
+    print('--> Config model')
+    rknn.config(quantized_algorithm='normal', quantized_method='channel', target_platform='rk3588', optimization_level=3)
+    print('done')
+
+    # Load ONNX model
+    print('--> Loading model')
+    ret = rknn.load_onnx(model=ONNX_MODEL,
+                         inputs=["pixel_values"],
+                         input_size_list=[[batch_size, 3, 768, 768]],
+                         )
+    if ret != 0:
+        print('Load model failed!')
+        exit(ret)
+    print('done')
+
+    print('--> Building model stage 1')
+    ret = rknn.build(do_quantization=QUANTIZE, dataset=DATASET, rknn_batch_size=None)
+    if ret != 0:
+        print('Build model failed!')
+        exit(ret)
+    print('done')
 
-    # split the first Transformers block into a separate model because it's too large to fit in the rknn
-    onnx.utils.extract_model(ONNX_MODEL, "vision_encoder_part1.onnx", ['pixel_values'], ['/blocks.0/blocks.0.0/channel_block/channel_attn/Add_output_0'])
-
-    ##### Build stage 1, this will crash the python process, so we need to run it in a separate process
-    code = f"""
-from rknn.api import RKNN
-rknn = RKNN(verbose=True)
-ONNX_MODEL="vision_encoder.onnx"
-RKNN_MODEL=ONNX_MODEL.replace(".onnx",".rknn")
-DATASET="dataset.txt"
-QUANTIZE=False
-batch_size = {batch_size}
-# pre-process config
-print('--> Config model')
-rknn.config(quantized_algorithm='normal', quantized_method='channel', target_platform='rk3588', optimization_level=3, single_core_mode=True)
-print('done')
-
-# Load ONNX model
-print('--> Loading model')
-ret = rknn.load_onnx(model=ONNX_MODEL,
-                     inputs=["pixel_values"],
-                     input_size_list=[[batch_size, 3, 768, 768]],
-                     )
-if ret != 0:
-    print('Load model failed!')
-    exit(ret)
-print('done')
-
-print('--> Building model stage 1')
-ret = rknn.build(do_quantization=QUANTIZE, dataset=DATASET, rknn_batch_size=None)
-if ret != 0:
-    print('Build model failed!')
-    exit(ret)
-print('done')
-"""
-    run_python_code(code)
     print("Build stage 1 done")
+    del rknn
 
     intermidiate_model = onnx.load("check3_fuse_ops.onnx")
 
@@ -210,9 +262,9 @@ print('done')
         intermidiate_model,
         pattern_rewrite_rules=rewrite_rule_set
     )
-    onnx.save(fused_model, "vision_encoder_part2.onnx")
-    ONNX_MODEL = "vision_encoder_part2.onnx"
-    RKNN_MODEL=ONNX_MODEL.replace(".onnx",".rknn")
+    onnx.save(fused_model, "vision_encoder_optimized.onnx")
+    ONNX_MODEL = "vision_encoder_optimized.onnx"
+    # RKNN_MODEL=ONNX_MODEL.replace(".onnx",".rknn")
     del intermidiate_model
     del fused_model
 
@@ -221,14 +273,12 @@ print('done')
 
     # pre-process config
    print('--> Config model')
-    rknn.config(quantized_algorithm='normal', quantized_method='channel', target_platform='rk3588', optimization_level=3, single_core_mode=True)
+    rknn.config(quantized_algorithm='normal', quantized_method='channel', target_platform='rk3588', optimization_level=3)
     print('done')
 
     # Load ONNX model
     print('--> Loading model')
-    ret = rknn.load_onnx(model="check3_fuse_ops.onnx",
-                         inputs=["/blocks.0/blocks.0.0/channel_block/channel_attn/Add_output_0-rs"],
-                         input_size_list=[[batch_size, 128, 1, 36864]],)
+    ret = rknn.load_onnx(model=ONNX_MODEL)
     if ret != 0:
         print('Load model failed!')
         exit(ret)
@@ -249,10 +299,7 @@ print('done')
         print('Export RKNN model failed!')
         exit(ret)
     print('done')
-
-
-
-
+    os.remove("vision_encoder_optimized.onnx")
 
 
 
@@ -266,7 +313,7 @@ def check_vision_model():
 
     # pre-process config
     print('--> Config model')
-    rknn.config(quantized_algorithm='normal', quantized_method='channel', target_platform='rk3588', optimization_level=3, single_core_mode=True )
+    rknn.config(quantized_algorithm='normal', quantized_method='channel', target_platform='rk3588', optimization_level=3)
     print('done')
 
     # Load ONNX model
@@ -311,9 +358,6 @@ def check_vision_model():
         print('Precision check failed!')
         exit(ret)
     print('done')
-
-
-
 
 
 import argparse
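Note on the new convert_decoder_2 path: before conversion it pins the merged decoder's boolean use_cache_branch input to a constant True with onnx_graphsurgeon, presumably so the exported graph always takes the cached-KV branch. The snippet below is not part of the commit; it is a minimal sketch of the same trick that looks the input up by name instead of hard-coding graph.inputs[27] (the helper name freeze_bool_input and the cleanup step are assumptions, not code from this repo).

    # Hedged sketch, not from this repo: pin a boolean graph input to a constant by name.
    import numpy as np
    import onnx
    import onnx_graphsurgeon as gs

    def freeze_bool_input(src, dst, name="use_cache_branch", value=True):  # hypothetical helper
        graph = gs.import_onnx(onnx.load(src))
        inp = next(i for i in graph.inputs if i.name == name)   # find by name, not by index
        inp.to_constant(np.array([value], dtype=np.bool_))      # variable becomes a Constant feeding its consumers
        graph.inputs = [i for i in graph.inputs if i.name != name]  # no longer a model input
        graph.cleanup().toposort()                               # drop anything left dangling
        onnx.save(gs.export_onnx(graph), dst)

    freeze_bool_input("decoder_model_merged.onnx", "new_model.onnx")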
onnx/decoder_model.rknn CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:331a6a05a524c72ac7287a494d6cadd425266888be5ff9375649c8760417f611
-size 194821309
+oid sha256:9ccb57a522ab8b0fa73123d654807748fbaf841c6852c775eb293e054b520341
+size 207755060
onnx/encoder_model.rknn CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a36af46c308219399dfe5f1df53c2093c3247c9dc248dc3c3167ab88975cf62c
-size 87231735
+oid sha256:3121d4ff0f5fc79420e6eda1d657eb8ff36355a414fcab3f236c72b2f4e9ddd1
+size 106957934
onnx/rknnrun.py CHANGED
@@ -20,7 +20,7 @@ rknn_encoder = RKNNLite(verbose=False)
 rknn_decoder_prefill = RKNNLite(verbose=False)
 
 # Load RKNN models
-ret = rknn_vision_encoder.load_rknn('./vision_encoder_part2.rknn')
+ret = rknn_vision_encoder.load_rknn('./vision_encoder.rknn')
 ret = rknn_encoder.load_rknn('./encoder_model.rknn')
 ret = rknn_decoder_prefill.load_rknn('./decoder_model.rknn')
 
@@ -31,18 +31,18 @@ ret = rknn_decoder_prefill.init_runtime()
 
 text_embed = ort.InferenceSession("embed_tokens_fp16.onnx", providers=['CPUExecutionProvider'])
 decoder_decode = ort.InferenceSession("decoder_model_merged_q4.onnx", providers=['CPUExecutionProvider'])
-vision_encoder = ort.InferenceSession("vision_encoder_part1.onnx", providers=['CPUExecutionProvider'])
+
 prompt_tokens_list = [15, 17, 21, 25]
 
 # 1. prepare inputs
-processor = AutoProcessor.from_pretrained("/home/firefly/mnt/zt-rk3588-nn/expr/Florence-2-base-ft", trust_remote_code=True)
+processor = AutoProcessor.from_pretrained("..", trust_remote_code=True)
 
 # 2. prepare image
 image = Image.open("./test.jpg")
 original_image = image.copy()
 original_size = image.size
 # resize image to 768x768
-image = image.resize((768, 768))
+# image = image.resize((768, 768))
 # 3. prepare text
 prompt = "<MORE_DETAILED_CAPTION>"
 
@@ -56,16 +56,17 @@ for i in prompt_tokens_list:
         pad_to = i
         break
 print("pad_to: ", pad_to)
-inputs = processor(text=prompt, images=image, return_tensors="np", do_resize=False, padding="max_length", max_length=pad_to + 577, truncation=True)
+
+inputs = processor(text=prompt, images=image, return_tensors="np", do_resize=True, padding="max_length", max_length=pad_to + 577, truncation=True)
 for k, v in inputs.items():
     print(k, v.shape)
 
 # 4. run vision encoder using RKNN
 start_time = time.time()
-image_features0 = vision_encoder.run(None, {
-    "pixel_values": inputs["pixel_values"]
-})[0]
-image_features = rknn_vision_encoder.inference(inputs=[image_features0.reshape(1, 128, 1, 36864)])[0]
+# image_features0 = vision_encoder.run(None, {
+#     "pixel_values": inputs["pixel_values"]
+# })[0]
+image_features = rknn_vision_encoder.inference(inputs=[inputs["pixel_values"]], data_format="nchw")[0]
 
 end_time = time.time()
 vision_encoder_time = (end_time - start_time) * 1000
@@ -90,6 +91,7 @@ batch_size, image_token_length = image_features.shape[:-1]
 image_attention_mask = np.ones((batch_size, image_token_length))
 task_prefix_embeds = inputs_embeds
 task_prefix_attention_mask = np.ones((batch_size, task_prefix_embeds.shape[1]))
+# task_prefix_attention_mask = inputs["attention_mask"]
 if len(task_prefix_attention_mask.shape) == 3:
     task_prefix_attention_mask = task_prefix_attention_mask[:, 0]
 inputs_embeds = np.concatenate([image_features, task_prefix_embeds], axis=1)
@@ -135,7 +137,7 @@ while generated_tokens.__len__() < max_new_tokens:
 
     # pick the next token with argmax (greedy decoding)
     next_token = np.argmax(next_token_logits, axis=-1)[0]
-    print("next_token: ", next_token)
+    print("next_token: ", processor.decode([next_token]))
    # append the newly generated token to the results
     generated_tokens.append(next_token)
 
@@ -220,7 +222,7 @@ def plot_bbox(image, data):
        font = ImageFont.load_default().font_variant(size=20)  # fall back to the enlarged default font if Arial is unavailable
 
    # Plot each bounding box
-    for bbox, label in zip(data['bboxes'], data['labels']):
+    for bbox, label in zip(data['bboxes'], data.get('labels', data.get('bboxes_labels'))):
        # Unpack the bounding box coordinates
        x1, y1, x2, y2 = bbox
        # Draw the rectangle with thicker outline
@@ -312,14 +314,15 @@ def draw_ocr_bboxes(image, prediction, scale=1):
 # display(image)
 image.save("result_image.jpg")
 
-
-# draw_polygons(original_image, parsed_answer['<REFERRING_EXPRESSION_SEGMENTATION>'], fill_mask=True)
-# plot_bbox(original_image, parsed_answer[prompt.split(">")[0].strip() + ">"])
-# draw_ocr_bboxes(original_image, parsed_answer["<OCR_WITH_REGION>"], scale=1)
-
+if parsed_answer.get('<REFERRING_EXPRESSION_SEGMENTATION>'):
+    draw_polygons(original_image, parsed_answer['<REFERRING_EXPRESSION_SEGMENTATION>'], fill_mask=True)
+elif parsed_answer.get("<OCR_WITH_REGION>"):
+    draw_ocr_bboxes(original_image, parsed_answer["<OCR_WITH_REGION>"], scale=1)
+else:
+    plot_bbox(original_image, parsed_answer[prompt.split(">")[0].strip() + ">"])
 
 
 # Release RKNNLite instances
 rknn_vision_encoder.release()
 rknn_encoder.release()
-rknn_decoder_prefill.release()
+rknn_decoder_prefill.release()
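With this commit the vision encoder runs as a single vision_encoder.rknn on the NPU (the earlier part1-on-CPU / part2-on-NPU split is gone), and rknnrun.py now feeds the processor's NCHW pixel_values directly to RKNNLite with data_format="nchw". A standalone smoke test might look like the sketch below; it is not part of the commit, and the naive resize/float32 preprocessing is only a stand-in for the Florence-2 processor's normalization.

    # Hedged sketch, not from the repo: run one frame through the new single-file
    # vision encoder on the board with rknn-toolkit-lite2.
    import numpy as np
    from PIL import Image
    from rknnlite.api import RKNNLite

    rknn = RKNNLite(verbose=False)
    assert rknn.load_rknn("./vision_encoder.rknn") == 0
    assert rknn.init_runtime() == 0

    img = Image.open("./test.jpg").convert("RGB").resize((768, 768))
    x = np.asarray(img, dtype=np.float32).transpose(2, 0, 1)[None]  # 1x3x768x768, NCHW
    # placeholder preprocessing; the real script uses AutoProcessor's pixel_values
    features = rknn.inference(inputs=[x], data_format="nchw")[0]
    print("image features:", features.shape)

    rknn.release()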
onnx/vision_encoder.rknn ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:463a02cf1643c26a3414096f543a5f267ea49f384c1bcff7210cee2168912a4b
+size 261704579