zyliu commited on
Commit
db0dc74
·
1 Parent(s): 436a6bf
Files changed (1) hide show
  1. config.json +889 -0
config.json ADDED
@@ -0,0 +1,889 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_commit_hash": null,
3
+ "_name_or_path": "work_dirs/visionllmv2-7b-gen-edit-det/",
4
+ "architectures": [
5
+ "VisionLLMv2Model"
6
+ ],
7
+ "gdino_config": {
8
+ "_commit_hash": null,
9
+ "_name_or_path": "checkpoints/grounding-dino-tiny",
10
+ "activation_dropout": 0.0,
11
+ "activation_function": "relu",
12
+ "add_cross_attention": false,
13
+ "architectures": [
14
+ "GroundingDinoForObjectDetection"
15
+ ],
16
+ "attention_dropout": 0.0,
17
+ "auxiliary_loss": true,
18
+ "backbone_config": {
19
+ "_name_or_path": "",
20
+ "add_cross_attention": false,
21
+ "architectures": null,
22
+ "attention_probs_dropout_prob": 0.0,
23
+ "bad_words_ids": null,
24
+ "begin_suppress_tokens": null,
25
+ "bos_token_id": null,
26
+ "chunk_size_feed_forward": 0,
27
+ "cross_attention_hidden_size": null,
28
+ "decoder_start_token_id": null,
29
+ "depths": [
30
+ 2,
31
+ 2,
32
+ 6,
33
+ 2
34
+ ],
35
+ "diversity_penalty": 0.0,
36
+ "do_sample": false,
37
+ "drop_path_rate": 0.1,
38
+ "early_stopping": false,
39
+ "embed_dim": 96,
40
+ "encoder_no_repeat_ngram_size": 0,
41
+ "encoder_stride": 32,
42
+ "eos_token_id": null,
43
+ "exponential_decay_length_penalty": null,
44
+ "finetuning_task": null,
45
+ "forced_bos_token_id": null,
46
+ "forced_eos_token_id": null,
47
+ "hidden_act": "gelu",
48
+ "hidden_dropout_prob": 0.0,
49
+ "hidden_size": 768,
50
+ "id2label": {
51
+ "0": "LABEL_0",
52
+ "1": "LABEL_1"
53
+ },
54
+ "image_size": 224,
55
+ "initializer_range": 0.02,
56
+ "is_decoder": false,
57
+ "is_encoder_decoder": false,
58
+ "label2id": {
59
+ "LABEL_0": 0,
60
+ "LABEL_1": 1
61
+ },
62
+ "layer_norm_eps": 1e-05,
63
+ "length_penalty": 1.0,
64
+ "max_length": 20,
65
+ "min_length": 0,
66
+ "mlp_ratio": 4.0,
67
+ "model_type": "swin",
68
+ "no_repeat_ngram_size": 0,
69
+ "num_beam_groups": 1,
70
+ "num_beams": 1,
71
+ "num_channels": 3,
72
+ "num_heads": [
73
+ 3,
74
+ 6,
75
+ 12,
76
+ 24
77
+ ],
78
+ "num_layers": 4,
79
+ "num_return_sequences": 1,
80
+ "out_features": [
81
+ "stage1",
82
+ "stage2",
83
+ "stage3",
84
+ "stage4"
85
+ ],
86
+ "out_indices": [
87
+ 1,
88
+ 2,
89
+ 3,
90
+ 4
91
+ ],
92
+ "output_attentions": false,
93
+ "output_hidden_states": false,
94
+ "output_scores": false,
95
+ "pad_token_id": null,
96
+ "patch_size": 4,
97
+ "prefix": null,
98
+ "problem_type": null,
99
+ "pruned_heads": {},
100
+ "qkv_bias": true,
101
+ "remove_invalid_values": false,
102
+ "repetition_penalty": 1.0,
103
+ "return_dict": true,
104
+ "return_dict_in_generate": false,
105
+ "sep_token_id": null,
106
+ "stage_names": [
107
+ "stem",
108
+ "stage1",
109
+ "stage2",
110
+ "stage3",
111
+ "stage4"
112
+ ],
113
+ "suppress_tokens": null,
114
+ "task_specific_params": null,
115
+ "temperature": 1.0,
116
+ "tf_legacy_loss": false,
117
+ "tie_encoder_decoder": false,
118
+ "tie_word_embeddings": true,
119
+ "tokenizer_class": null,
120
+ "top_k": 50,
121
+ "top_p": 1.0,
122
+ "torch_dtype": null,
123
+ "torchscript": false,
124
+ "transformers_version": "4.34.0",
125
+ "typical_p": 1.0,
126
+ "use_absolute_embeddings": false,
127
+ "use_bfloat16": false,
128
+ "window_size": 7
129
+ },
130
+ "bad_words_ids": null,
131
+ "bbox_cost": 5.0,
132
+ "bbox_loss_coefficient": 5.0,
133
+ "begin_suppress_tokens": null,
134
+ "bos_token_id": null,
135
+ "box_cost": 5.0,
136
+ "box_weight": 5.0,
137
+ "chunk_size_feed_forward": 0,
138
+ "class_cost": 2.0,
139
+ "class_weight": 2.0,
140
+ "cross_attention_hidden_size": null,
141
+ "d_model": 256,
142
+ "decoder_attention_heads": 8,
143
+ "decoder_bbox_embed_share": true,
144
+ "decoder_ffn_dim": 2048,
145
+ "decoder_layers": 6,
146
+ "decoder_n_points": 4,
147
+ "decoder_start_token_id": null,
148
+ "dice_cost": 5.0,
149
+ "dice_weight": 5.0,
150
+ "disable_custom_kernels": false,
151
+ "diversity_penalty": 0.0,
152
+ "do_sample": false,
153
+ "dropout": 0.1,
154
+ "early_stopping": false,
155
+ "embedding_init_target": true,
156
+ "encoder_attention_heads": 8,
157
+ "encoder_ffn_dim": 2048,
158
+ "encoder_layers": 6,
159
+ "encoder_n_points": 4,
160
+ "encoder_no_repeat_ngram_size": 0,
161
+ "eos_token_id": null,
162
+ "exponential_decay_length_penalty": null,
163
+ "finetuning_task": null,
164
+ "focal_alpha": 0.25,
165
+ "forced_bos_token_id": null,
166
+ "forced_eos_token_id": null,
167
+ "fusion_dropout": 0.0,
168
+ "fusion_droppath": 0.1,
169
+ "giou_cost": 2.0,
170
+ "giou_loss_coefficient": 2.0,
171
+ "giou_weight": 2.0,
172
+ "id2label": {
173
+ "0": "LABEL_0",
174
+ "1": "LABEL_1"
175
+ },
176
+ "init_std": 0.02,
177
+ "is_decoder": false,
178
+ "is_encoder_decoder": true,
179
+ "l_hidden_size": 4096,
180
+ "label2id": {
181
+ "LABEL_0": 0,
182
+ "LABEL_1": 1
183
+ },
184
+ "length_penalty": 1.0,
185
+ "mask_cost": 5.0,
186
+ "mask_dim": 256,
187
+ "mask_weight": 5.0,
188
+ "max_length": 20,
189
+ "max_text_len": 256,
190
+ "min_length": 0,
191
+ "model_type": "grounding-dino",
192
+ "no_repeat_ngram_size": 0,
193
+ "norm": "GN",
194
+ "num_beam_groups": 1,
195
+ "num_beams": 1,
196
+ "num_embs": 4,
197
+ "num_feature_levels": 4,
198
+ "num_queries": 900,
199
+ "num_return_sequences": 1,
200
+ "output_attentions": false,
201
+ "output_hidden_states": false,
202
+ "output_scores": false,
203
+ "pad_token_id": null,
204
+ "position_embedding_type": "sine",
205
+ "positional_embedding_temperature": 20,
206
+ "prefix": null,
207
+ "problem_type": null,
208
+ "pruned_heads": {},
209
+ "query_dim": 4,
210
+ "remove_invalid_values": false,
211
+ "repetition_penalty": 1.0,
212
+ "return_dict": true,
213
+ "return_dict_in_generate": false,
214
+ "sep_token_id": null,
215
+ "suppress_tokens": null,
216
+ "task_specific_params": null,
217
+ "temperature": 1.0,
218
+ "text_backbone_config": {
219
+ "_name_or_path": "",
220
+ "add_cross_attention": false,
221
+ "architectures": null,
222
+ "attention_probs_dropout_prob": 0.1,
223
+ "bad_words_ids": null,
224
+ "begin_suppress_tokens": null,
225
+ "bos_token_id": null,
226
+ "chunk_size_feed_forward": 0,
227
+ "cross_attention_hidden_size": null,
228
+ "decoder_start_token_id": null,
229
+ "diversity_penalty": 0.0,
230
+ "do_sample": false,
231
+ "early_stopping": false,
232
+ "encoder_no_repeat_ngram_size": 0,
233
+ "eos_token_id": null,
234
+ "exponential_decay_length_penalty": null,
235
+ "finetuning_task": null,
236
+ "forced_bos_token_id": null,
237
+ "forced_eos_token_id": null,
238
+ "hidden_act": "gelu",
239
+ "hidden_dropout_prob": 0.1,
240
+ "hidden_size": 768,
241
+ "id2label": {
242
+ "0": "LABEL_0",
243
+ "1": "LABEL_1"
244
+ },
245
+ "init_std": 0.02,
246
+ "intermediate_size": 3072,
247
+ "is_decoder": false,
248
+ "is_encoder_decoder": false,
249
+ "label2id": {
250
+ "LABEL_0": 0,
251
+ "LABEL_1": 1
252
+ },
253
+ "layer_norm_eps": 1e-12,
254
+ "length_penalty": 1.0,
255
+ "max_length": 20,
256
+ "max_position_embeddings": 512,
257
+ "min_length": 0,
258
+ "model_type": "grounding-dino-text-prenet",
259
+ "no_repeat_ngram_size": 0,
260
+ "num_attention_heads": 12,
261
+ "num_beam_groups": 1,
262
+ "num_beams": 1,
263
+ "num_hidden_layers": 12,
264
+ "num_return_sequences": 1,
265
+ "output_attentions": false,
266
+ "output_hidden_states": false,
267
+ "output_scores": false,
268
+ "pad_token_id": 0,
269
+ "position_embedding_type": "absolute",
270
+ "prefix": null,
271
+ "problem_type": null,
272
+ "pruned_heads": {},
273
+ "remove_invalid_values": false,
274
+ "repetition_penalty": 1.0,
275
+ "return_dict": true,
276
+ "return_dict_in_generate": false,
277
+ "sep_token_id": null,
278
+ "suppress_tokens": null,
279
+ "task_specific_params": null,
280
+ "temperature": 1.0,
281
+ "tf_legacy_loss": false,
282
+ "tie_encoder_decoder": false,
283
+ "tie_word_embeddings": true,
284
+ "tokenizer_class": null,
285
+ "top_k": 50,
286
+ "top_p": 1.0,
287
+ "torch_dtype": null,
288
+ "torchscript": false,
289
+ "transformers_version": "4.34.0",
290
+ "type_vocab_size": 2,
291
+ "typical_p": 1.0,
292
+ "use_bfloat16": false,
293
+ "vocab_size": 30522
294
+ },
295
+ "text_enhancer_dropout": 0.0,
296
+ "tf_legacy_loss": false,
297
+ "tie_encoder_decoder": false,
298
+ "tie_word_embeddings": true,
299
+ "tokenizer_class": null,
300
+ "top_k": 50,
301
+ "top_p": 1.0,
302
+ "torch_dtype": "float32",
303
+ "torchscript": false,
304
+ "transformers_version": "4.36.0.dev0",
305
+ "two_stage": true,
306
+ "two_stage_bbox_embed_share": false,
307
+ "typical_p": 1.0,
308
+ "use_bfloat16": false
309
+ },
310
+ "ip2p_config": {
311
+ "_name_or_path": "visionllmv2/model/instruct_pix2pix/ip2p.json",
312
+ "add_cross_attention": false,
313
+ "architectures": [
314
+ "InstructPix2PixWithLLMEmbConfig"
315
+ ],
316
+ "bad_words_ids": null,
317
+ "begin_suppress_tokens": null,
318
+ "bos_token_id": null,
319
+ "cfg_drop_rate": 0.05,
320
+ "cfg_scale": 7.5,
321
+ "chunk_size_feed_forward": 0,
322
+ "cross_attention_hidden_size": null,
323
+ "decoder_start_token_id": null,
324
+ "diversity_penalty": 0.0,
325
+ "do_sample": false,
326
+ "early_stopping": false,
327
+ "embed_tokens": {
328
+ "emb": "[EMB]",
329
+ "emb2": "[EMB2]",
330
+ "emb3": "[EMB3]",
331
+ "emb4": "[EMB4]",
332
+ "emb5": "[EMB5]",
333
+ "emb6": "[EMB6]",
334
+ "emb7": "[EMB7]",
335
+ "emb8": "[EMB8]"
336
+ },
337
+ "encoder_no_repeat_ngram_size": 0,
338
+ "eos_token_id": null,
339
+ "exponential_decay_length_penalty": null,
340
+ "finetuning_task": null,
341
+ "forced_bos_token_id": null,
342
+ "forced_eos_token_id": null,
343
+ "id2label": {
344
+ "0": "LABEL_0",
345
+ "1": "LABEL_1"
346
+ },
347
+ "is_decoder": false,
348
+ "is_encoder_decoder": false,
349
+ "label2id": {
350
+ "LABEL_0": 0,
351
+ "LABEL_1": 1
352
+ },
353
+ "length_penalty": 1.0,
354
+ "llm_hidden_size": 4096,
355
+ "max_length": 20,
356
+ "min_length": 0,
357
+ "model_type": "instructpix2pix_with_llm_emb",
358
+ "no_repeat_ngram_size": 0,
359
+ "num_beam_groups": 1,
360
+ "num_beams": 1,
361
+ "num_decoder_layers": 1,
362
+ "num_embed_tokens": 64,
363
+ "num_encoder_layers": 1,
364
+ "num_queries": 77,
365
+ "num_return_sequences": 1,
366
+ "output_attentions": false,
367
+ "output_hidden_states": false,
368
+ "output_scores": false,
369
+ "pad_token_id": null,
370
+ "prefix": null,
371
+ "problem_type": null,
372
+ "pruned_heads": {},
373
+ "remove_invalid_values": false,
374
+ "repetition_penalty": 1.0,
375
+ "return_dict": true,
376
+ "return_dict_in_generate": false,
377
+ "sd_hidden_size": 768,
378
+ "sd_model_id": "checkpoints/instruct-pix2pix",
379
+ "sep_token_id": null,
380
+ "suppress_tokens": null,
381
+ "task_specific_params": null,
382
+ "temperature": 1.0,
383
+ "tf_legacy_loss": false,
384
+ "tie_encoder_decoder": false,
385
+ "tie_word_embeddings": true,
386
+ "tokenizer_class": null,
387
+ "top_k": 50,
388
+ "top_p": 1.0,
389
+ "torch_dtype": null,
390
+ "torchscript": false,
391
+ "transformers_version": "4.34.0",
392
+ "trigger_token": "[EDIT]",
393
+ "trigger_token_id": 32025,
394
+ "typical_p": 1.0,
395
+ "use_bfloat16": false
396
+ },
397
+ "llm_config": {
398
+ "_name_or_path": "checkpoints/vicuna-7b-v1.5",
399
+ "add_cross_attention": false,
400
+ "architectures": [
401
+ "LlamaForCausalLM"
402
+ ],
403
+ "attention_bias": false,
404
+ "bad_words_ids": null,
405
+ "begin_suppress_tokens": null,
406
+ "bos_token_id": 1,
407
+ "chunk_size_feed_forward": 0,
408
+ "cross_attention_hidden_size": null,
409
+ "decoder_start_token_id": null,
410
+ "diversity_penalty": 0.0,
411
+ "do_sample": false,
412
+ "early_stopping": false,
413
+ "encoder_no_repeat_ngram_size": 0,
414
+ "eos_token_id": 2,
415
+ "exponential_decay_length_penalty": null,
416
+ "finetuning_task": null,
417
+ "forced_bos_token_id": null,
418
+ "forced_eos_token_id": null,
419
+ "hidden_act": "silu",
420
+ "hidden_size": 4096,
421
+ "id2label": {
422
+ "0": "LABEL_0",
423
+ "1": "LABEL_1"
424
+ },
425
+ "initializer_range": 0.02,
426
+ "intermediate_size": 11008,
427
+ "is_decoder": false,
428
+ "is_encoder_decoder": false,
429
+ "label2id": {
430
+ "LABEL_0": 0,
431
+ "LABEL_1": 1
432
+ },
433
+ "length_penalty": 1.0,
434
+ "max_length": 20,
435
+ "max_position_embeddings": 4096,
436
+ "min_length": 0,
437
+ "model_type": "llama",
438
+ "no_repeat_ngram_size": 0,
439
+ "num_attention_heads": 32,
440
+ "num_beam_groups": 1,
441
+ "num_beams": 1,
442
+ "num_hidden_layers": 32,
443
+ "num_key_value_heads": 32,
444
+ "num_return_sequences": 1,
445
+ "output_attentions": false,
446
+ "output_hidden_states": false,
447
+ "output_scores": false,
448
+ "pad_token_id": 0,
449
+ "prefix": null,
450
+ "pretraining_tp": 1,
451
+ "problem_type": null,
452
+ "pruned_heads": {},
453
+ "remove_invalid_values": false,
454
+ "repetition_penalty": 1.0,
455
+ "return_dict": true,
456
+ "return_dict_in_generate": false,
457
+ "rms_norm_eps": 1e-05,
458
+ "rope_scaling": null,
459
+ "rope_theta": 10000.0,
460
+ "sep_token_id": null,
461
+ "suppress_tokens": null,
462
+ "task_specific_params": null,
463
+ "temperature": 1.0,
464
+ "tf_legacy_loss": false,
465
+ "tie_encoder_decoder": false,
466
+ "tie_word_embeddings": false,
467
+ "tokenizer_class": null,
468
+ "top_k": 50,
469
+ "top_p": 1.0,
470
+ "torch_dtype": "float16",
471
+ "torchscript": false,
472
+ "transformers_version": "4.34.0",
473
+ "typical_p": 1.0,
474
+ "use_bfloat16": false,
475
+ "use_cache": true,
476
+ "vocab_size": 32026
477
+ },
478
+ "model_type": "visionllmv2",
479
+ "num_embs": 4,
480
+ "num_embs_gen": 64,
481
+ "pretrained_vl_bridge": null,
482
+ "sd_config": {
483
+ "_name_or_path": "visionllmv2/model/stable_diffusion/sd.json",
484
+ "add_cross_attention": false,
485
+ "architectures": [
486
+ "StableDiffusionWithLLMEmbConfig"
487
+ ],
488
+ "bad_words_ids": null,
489
+ "begin_suppress_tokens": null,
490
+ "bos_token_id": null,
491
+ "cfg_drop_rate": 0.1,
492
+ "cfg_scale": 7.5,
493
+ "chunk_size_feed_forward": 0,
494
+ "cross_attention_hidden_size": null,
495
+ "decoder_start_token_id": null,
496
+ "diversity_penalty": 0.0,
497
+ "do_sample": false,
498
+ "early_stopping": false,
499
+ "embed_tokens": {
500
+ "emb": "[EMB]",
501
+ "emb2": "[EMB2]",
502
+ "emb3": "[EMB3]",
503
+ "emb4": "[EMB4]",
504
+ "emb5": "[EMB5]",
505
+ "emb6": "[EMB6]",
506
+ "emb7": "[EMB7]",
507
+ "emb8": "[EMB8]"
508
+ },
509
+ "encoder_no_repeat_ngram_size": 0,
510
+ "eos_token_id": null,
511
+ "exponential_decay_length_penalty": null,
512
+ "finetuning_task": null,
513
+ "forced_bos_token_id": null,
514
+ "forced_eos_token_id": null,
515
+ "id2label": {
516
+ "0": "LABEL_0",
517
+ "1": "LABEL_1"
518
+ },
519
+ "is_decoder": false,
520
+ "is_encoder_decoder": false,
521
+ "label2id": {
522
+ "LABEL_0": 0,
523
+ "LABEL_1": 1
524
+ },
525
+ "length_penalty": 1.0,
526
+ "llm_hidden_size": 4096,
527
+ "max_length": 20,
528
+ "min_length": 0,
529
+ "model_type": "stable_diffusion_with_llm_emb",
530
+ "no_repeat_ngram_size": 0,
531
+ "num_beam_groups": 1,
532
+ "num_beams": 1,
533
+ "num_decoder_layers": 1,
534
+ "num_embed_tokens": 64,
535
+ "num_encoder_layers": 1,
536
+ "num_queries": 77,
537
+ "num_return_sequences": 1,
538
+ "output_attentions": false,
539
+ "output_hidden_states": false,
540
+ "output_scores": false,
541
+ "pad_token_id": null,
542
+ "prefix": null,
543
+ "problem_type": null,
544
+ "pruned_heads": {},
545
+ "remove_invalid_values": false,
546
+ "repetition_penalty": 1.0,
547
+ "return_dict": true,
548
+ "return_dict_in_generate": false,
549
+ "sd_hidden_size": 768,
550
+ "sd_model_id": "checkpoints/stable-diffusion-v1-5",
551
+ "sep_token_id": null,
552
+ "suppress_tokens": null,
553
+ "task_specific_params": null,
554
+ "temperature": 1.0,
555
+ "tf_legacy_loss": false,
556
+ "tie_encoder_decoder": false,
557
+ "tie_word_embeddings": true,
558
+ "tokenizer_class": null,
559
+ "top_k": 50,
560
+ "top_p": 1.0,
561
+ "torch_dtype": null,
562
+ "torchscript": false,
563
+ "transformers_version": "4.34.0",
564
+ "trigger_token": "[GEN]",
565
+ "trigger_token_id": 32024,
566
+ "typical_p": 1.0,
567
+ "use_bfloat16": false
568
+ },
569
+ "torch_dtype": "bfloat16",
570
+ "transformers_version": null,
571
+ "unipose_config": {
572
+ "_commit_hash": null,
573
+ "_name_or_path": "checkpoints/unipose",
574
+ "add_channel_attention": false,
575
+ "add_cross_attention": false,
576
+ "add_pos_value": false,
577
+ "architectures": [
578
+ "UniPose"
579
+ ],
580
+ "aux_loss": true,
581
+ "backbone": "swin_T_224_1k",
582
+ "backbone_freeze_keywords": null,
583
+ "bad_words_ids": null,
584
+ "batch_norm_type": "FrozenBatchNorm2d",
585
+ "batch_size": 2,
586
+ "bbox_loss_coef": 5.0,
587
+ "begin_suppress_tokens": null,
588
+ "binary_query_selection": false,
589
+ "bos_token_id": null,
590
+ "box_attn_type": "roi_align",
591
+ "chunk_size_feed_forward": 0,
592
+ "clip_max_norm": 0.1,
593
+ "cls_loss_coef": 2.0,
594
+ "cross_attention_hidden_size": null,
595
+ "dabdetr_deformable_decoder": false,
596
+ "dabdetr_deformable_encoder": false,
597
+ "dabdetr_yolo_like_anchor_update": false,
598
+ "data_aug_max_size": 1333,
599
+ "data_aug_scale_overlap": null,
600
+ "data_aug_scales": [
601
+ 480,
602
+ 512,
603
+ 544,
604
+ 576,
605
+ 608,
606
+ 640,
607
+ 672,
608
+ 704,
609
+ 736,
610
+ 768,
611
+ 800
612
+ ],
613
+ "data_aug_scales2_crop": [
614
+ 384,
615
+ 600
616
+ ],
617
+ "data_aug_scales2_resize": [
618
+ 400,
619
+ 500,
620
+ 600
621
+ ],
622
+ "ddetr_lr_param": false,
623
+ "dec_layer_number": null,
624
+ "dec_layers": 6,
625
+ "dec_n_points": 4,
626
+ "dec_pred_bbox_embed_share": true,
627
+ "dec_pred_class_embed_share": true,
628
+ "decoder_layer_noise": false,
629
+ "decoder_module_seq": [
630
+ "sa",
631
+ "ca",
632
+ "ffn"
633
+ ],
634
+ "decoder_sa_type": "sa",
635
+ "decoder_start_token_id": null,
636
+ "dice_loss_coef": 1.0,
637
+ "dilation": false,
638
+ "dim_feedforward": 2048,
639
+ "diversity_penalty": 0.0,
640
+ "dln_hw_noise": 0.2,
641
+ "dln_xy_noise": 0.2,
642
+ "dn_bbox_coef": 1.0,
643
+ "dn_box_noise_scale": 1.0,
644
+ "dn_label_coef": 1.0,
645
+ "dn_label_noise_ratio": 0.5,
646
+ "dn_labelbook_size": 2000,
647
+ "dn_number": 100,
648
+ "do_sample": false,
649
+ "dropout": 0.0,
650
+ "early_stopping": false,
651
+ "ema_decay": 0.9997,
652
+ "ema_epoch": 0,
653
+ "embed_init_tgt": true,
654
+ "enc_layers": 6,
655
+ "enc_loss_coef": 1.0,
656
+ "enc_n_points": 4,
657
+ "encoder_no_repeat_ngram_size": 0,
658
+ "eos_token_id": null,
659
+ "epochs": 1,
660
+ "exponential_decay_length_penalty": null,
661
+ "ffn_extra_layernorm": false,
662
+ "finetuning_task": null,
663
+ "fix_refpoints_hw": -1,
664
+ "fix_size": false,
665
+ "focal_alpha": 0.25,
666
+ "forced_bos_token_id": null,
667
+ "forced_eos_token_id": null,
668
+ "frozen_weights": null,
669
+ "fusion_dropout": 0.0,
670
+ "fusion_droppath": 0.1,
671
+ "giou_loss_coef": 2.0,
672
+ "hidden_dim": 256,
673
+ "id2label": {
674
+ "0": "LABEL_0",
675
+ "1": "LABEL_1"
676
+ },
677
+ "interm_loss_coef": 1.0,
678
+ "is_decoder": false,
679
+ "is_encoder_decoder": false,
680
+ "keypoint_loss_coef": 10.0,
681
+ "l_hidden_size": 4096,
682
+ "label2id": {
683
+ "LABEL_0": 0,
684
+ "LABEL_1": 1
685
+ },
686
+ "length_penalty": 1.0,
687
+ "lr": 1e-05,
688
+ "lr_backbone": 1e-06,
689
+ "lr_backbone_names": [
690
+ "backbone.0"
691
+ ],
692
+ "lr_drop": 7,
693
+ "lr_drop_list": [
694
+ 31,
695
+ 45
696
+ ],
697
+ "lr_linear_proj_mult": 0.1,
698
+ "lr_linear_proj_names": [
699
+ "reference_points",
700
+ "sampling_offsets"
701
+ ],
702
+ "mask_loss_coef": 1.0,
703
+ "masks": false,
704
+ "match_unstable_error": true,
705
+ "matcher_type": "HungarianMatcher",
706
+ "max_length": 20,
707
+ "max_text_len": 256,
708
+ "min_length": 0,
709
+ "modelname": "UniPose",
710
+ "multi_step_lr": false,
711
+ "nheads": 8,
712
+ "nms_iou_threshold": -1,
713
+ "no_interm_box_loss": false,
714
+ "no_repeat_ngram_size": 0,
715
+ "num_beam_groups": 1,
716
+ "num_beams": 1,
717
+ "num_body_points": 68,
718
+ "num_box_decoder_layers": 2,
719
+ "num_classes": 2,
720
+ "num_embs": 4,
721
+ "num_feature_levels": 4,
722
+ "num_patterns": 0,
723
+ "num_queries": 900,
724
+ "num_return_sequences": 1,
725
+ "num_select": 50,
726
+ "oks_loss_coef": 4.0,
727
+ "onecyclelr": false,
728
+ "output_attentions": false,
729
+ "output_hidden_states": false,
730
+ "output_scores": false,
731
+ "pad_token_id": null,
732
+ "param_dict_type": "default",
733
+ "pdetr3_bbox_embed_diff_each_layer": false,
734
+ "pdetr3_refHW": -1,
735
+ "pe_temperatureH": 20,
736
+ "pe_temperatureW": 20,
737
+ "position_embedding": "sine",
738
+ "pre_norm": false,
739
+ "prefix": null,
740
+ "problem_type": null,
741
+ "pruned_heads": {},
742
+ "query_dim": 4,
743
+ "random_refpoints_xy": false,
744
+ "remove_invalid_values": false,
745
+ "repetition_penalty": 1.0,
746
+ "return_dict": true,
747
+ "return_dict_in_generate": false,
748
+ "return_interm_indices": [
749
+ 1,
750
+ 2,
751
+ 3
752
+ ],
753
+ "save_checkpoint_interval": 1,
754
+ "sep_token_id": null,
755
+ "set_cost_bbox": 5.0,
756
+ "set_cost_class": 2.0,
757
+ "set_cost_giou": 2.0,
758
+ "set_cost_keypoint": 10.0,
759
+ "set_cost_oks": 4.0,
760
+ "shuffle_type": null,
761
+ "sub_sentence_present": true,
762
+ "suppress_tokens": null,
763
+ "task_specific_params": null,
764
+ "temperature": 1.0,
765
+ "text_dropout": 0.0,
766
+ "text_encoder_type": "bert-base-uncased",
767
+ "tf_legacy_loss": false,
768
+ "tie_encoder_decoder": false,
769
+ "tie_word_embeddings": true,
770
+ "tokenizer_class": null,
771
+ "top_k": 50,
772
+ "top_p": 1.0,
773
+ "torch_dtype": "float32",
774
+ "torchscript": false,
775
+ "train_projection": true,
776
+ "transformer_activation": "relu",
777
+ "transformers_version": null,
778
+ "two_stage_add_query_num": 0,
779
+ "two_stage_bbox_embed_share": false,
780
+ "two_stage_class_embed_share": false,
781
+ "two_stage_default_hw": 0.05,
782
+ "two_stage_keep_all_tokens": false,
783
+ "two_stage_learn_wh": false,
784
+ "two_stage_pat_embed": 0,
785
+ "two_stage_type": "standard",
786
+ "typical_p": 1.0,
787
+ "unic_layers": 0,
788
+ "use_bfloat16": false,
789
+ "use_cdn": true,
790
+ "use_checkpoint": true,
791
+ "use_deformable_box_attn": false,
792
+ "use_detached_boxes_dec_out": false,
793
+ "use_dn": false,
794
+ "use_ema": false,
795
+ "use_fusion_layer": true,
796
+ "use_label_enc": true,
797
+ "use_text_cross_attention": true,
798
+ "use_text_enhancer": true,
799
+ "use_transformer_ckpt": true,
800
+ "weight_decay": 0.0001
801
+ },
802
+ "use_gdino": true,
803
+ "use_ip2p": true,
804
+ "use_llm_lora": false,
805
+ "use_pixelshuffle": false,
806
+ "use_region_encoder": true,
807
+ "use_sd": true,
808
+ "use_unipose": true,
809
+ "vis_encoder_config": {
810
+ "_name_or_path": "checkpoints/clip-vit-large-patch14-336",
811
+ "add_cross_attention": false,
812
+ "architectures": null,
813
+ "attention_dropout": 0.0,
814
+ "bad_words_ids": null,
815
+ "begin_suppress_tokens": null,
816
+ "bos_token_id": null,
817
+ "chunk_size_feed_forward": 0,
818
+ "cross_attention_hidden_size": null,
819
+ "decoder_start_token_id": null,
820
+ "diversity_penalty": 0.0,
821
+ "do_sample": false,
822
+ "dropout": 0.0,
823
+ "early_stopping": false,
824
+ "encoder_no_repeat_ngram_size": 0,
825
+ "eos_token_id": null,
826
+ "exponential_decay_length_penalty": null,
827
+ "finetuning_task": null,
828
+ "forced_bos_token_id": null,
829
+ "forced_eos_token_id": null,
830
+ "hidden_act": "quick_gelu",
831
+ "hidden_size": 1024,
832
+ "id2label": {
833
+ "0": "LABEL_0",
834
+ "1": "LABEL_1"
835
+ },
836
+ "image_size": 336,
837
+ "initializer_factor": 1.0,
838
+ "initializer_range": 0.02,
839
+ "intermediate_size": 4096,
840
+ "is_decoder": false,
841
+ "is_encoder_decoder": false,
842
+ "label2id": {
843
+ "LABEL_0": 0,
844
+ "LABEL_1": 1
845
+ },
846
+ "layer_norm_eps": 1e-05,
847
+ "length_penalty": 1.0,
848
+ "max_length": 20,
849
+ "min_length": 0,
850
+ "model_type": "clip_vision_model",
851
+ "no_repeat_ngram_size": 0,
852
+ "num_attention_heads": 16,
853
+ "num_beam_groups": 1,
854
+ "num_beams": 1,
855
+ "num_channels": 3,
856
+ "num_hidden_layers": 24,
857
+ "num_return_sequences": 1,
858
+ "output_attentions": false,
859
+ "output_hidden_states": false,
860
+ "output_scores": false,
861
+ "pad_token_id": null,
862
+ "patch_size": 14,
863
+ "prefix": null,
864
+ "problem_type": null,
865
+ "projection_dim": 768,
866
+ "pruned_heads": {},
867
+ "remove_invalid_values": false,
868
+ "repetition_penalty": 1.0,
869
+ "return_dict": true,
870
+ "return_dict_in_generate": false,
871
+ "sep_token_id": null,
872
+ "suppress_tokens": null,
873
+ "task_specific_params": null,
874
+ "temperature": 1.0,
875
+ "tf_legacy_loss": false,
876
+ "tie_encoder_decoder": false,
877
+ "tie_word_embeddings": true,
878
+ "tokenizer_class": null,
879
+ "top_k": 50,
880
+ "top_p": 1.0,
881
+ "torch_dtype": null,
882
+ "torchscript": false,
883
+ "transformers_version": "4.34.0",
884
+ "typical_p": 1.0,
885
+ "use_bfloat16": false
886
+ },
887
+ "vis_output_layer": -2,
888
+ "vl_bridge_type": "mlp2x_gelu"
889
+ }