"""Convert RT Detr checkpoints with Timm backbone""" |
|
|
|
import argparse |
|
import json |
|
from pathlib import Path |
|
|
|
import requests |
|
import torch |
|
from huggingface_hub import hf_hub_download |
|
from PIL import Image |
|
from torchvision import transforms |
|
|
|
from transformers import RTDetrConfig, RTDetrForObjectDetection, RTDetrImageProcessor |
|
from transformers.utils import logging |
|
|
|
|
|
logging.set_verbosity_info() |
|
logger = logging.get_logger(__name__) |
|
|
|
|
|
def get_rt_detr_config(model_name: str) -> RTDetrConfig: |
|
config = RTDetrConfig() |
|
|
|
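    # COCO detection label mapping (80 classes)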
config.num_labels = 80 |
|
repo_id = "huggingface/label-files" |
|
filename = "coco-detection-mmdet-id2label.json" |
|
    with open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r") as f:
        id2label = json.load(f)
|
id2label = {int(k): v for k, v in id2label.items()} |
|
config.id2label = id2label |
|
config.label2id = {v: k for k, v in id2label.items()} |
|
|
|
if model_name == "rtdetr_r18vd": |
|
config.backbone_config.hidden_sizes = [64, 128, 256, 512] |
|
config.backbone_config.depths = [2, 2, 2, 2] |
|
config.backbone_config.layer_type = "basic" |
|
config.encoder_in_channels = [128, 256, 512] |
|
config.hidden_expansion = 0.5 |
|
config.decoder_layers = 3 |
|
elif model_name == "rtdetr_r34vd": |
|
config.backbone_config.hidden_sizes = [64, 128, 256, 512] |
|
config.backbone_config.depths = [3, 4, 6, 3] |
|
config.backbone_config.layer_type = "basic" |
|
config.encoder_in_channels = [128, 256, 512] |
|
config.hidden_expansion = 0.5 |
|
config.decoder_layers = 4 |
|
elif model_name == "rtdetr_r50vd_m": |
|
pass |
|
elif model_name == "rtdetr_r50vd": |
|
pass |
|
elif model_name == "rtdetr_r101vd": |
|
config.backbone_config.depths = [3, 4, 23, 3] |
|
config.encoder_ffn_dim = 2048 |
|
config.encoder_hidden_dim = 384 |
|
config.decoder_in_channels = [384, 384, 384] |
|
elif model_name == "rtdetr_r18vd_coco_o365": |
|
config.backbone_config.hidden_sizes = [64, 128, 256, 512] |
|
config.backbone_config.depths = [2, 2, 2, 2] |
|
config.backbone_config.layer_type = "basic" |
|
config.encoder_in_channels = [128, 256, 512] |
|
config.hidden_expansion = 0.5 |
|
config.decoder_layers = 3 |
|
elif model_name == "rtdetr_r50vd_coco_o365": |
|
pass |
|
elif model_name == "rtdetr_r101vd_coco_o365": |
|
config.backbone_config.depths = [3, 4, 23, 3] |
|
config.encoder_ffn_dim = 2048 |
|
config.encoder_hidden_dim = 384 |
|
config.decoder_in_channels = [384, 384, 384] |
|
|
|
return config |
|
|
|
|
|
def create_rename_keys(config): |
|
|
|
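    # here we list all keys to be renamed (original name on the left, HF name on the right)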
rename_keys = [] |
|
|
|
|
|
|
|
last_key = ["weight", "bias", "running_mean", "running_var"] |
|
|
|
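    # stem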
for level in range(3): |
|
rename_keys.append((f"backbone.conv1.conv1_{level+1}.conv.weight", f"model.backbone.model.embedder.embedder.{level}.convolution.weight")) |
|
for last in last_key: |
|
rename_keys.append((f"backbone.conv1.conv1_{level+1}.norm.{last}", f"model.backbone.model.embedder.embedder.{level}.normalization.{last}")) |
|
|
|
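    # stages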
for stage_idx in range(len(config.backbone_config.depths)): |
|
for layer_idx in range(config.backbone_config.depths[stage_idx]): |
|
|
|
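            # shortcut projection, present only in the first block of each stage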
if layer_idx == 0: |
|
if stage_idx == 0: |
|
rename_keys.append( |
|
( |
|
f"backbone.res_layers.{stage_idx}.blocks.0.short.conv.weight", |
|
f"model.backbone.model.encoder.stages.{stage_idx}.layers.0.shortcut.convolution.weight", |
|
) |
|
) |
|
for last in last_key: |
|
rename_keys.append( |
|
( |
|
f"backbone.res_layers.{stage_idx}.blocks.0.short.norm.{last}", |
|
f"model.backbone.model.encoder.stages.{stage_idx}.layers.0.shortcut.normalization.{last}", |
|
) |
|
) |
|
else: |
|
rename_keys.append( |
|
( |
|
f"backbone.res_layers.{stage_idx}.blocks.0.short.conv.conv.weight", |
|
f"model.backbone.model.encoder.stages.{stage_idx}.layers.0.shortcut.1.convolution.weight", |
|
) |
|
) |
|
for last in last_key: |
|
rename_keys.append( |
|
( |
|
f"backbone.res_layers.{stage_idx}.blocks.0.short.conv.norm.{last}", |
|
f"model.backbone.model.encoder.stages.{stage_idx}.layers.0.shortcut.1.normalization.{last}", |
|
) |
|
) |
|
|
|
rename_keys.append( |
|
( |
|
f"backbone.res_layers.{stage_idx}.blocks.{layer_idx}.branch2a.conv.weight", |
|
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.0.convolution.weight", |
|
) |
|
) |
|
for last in last_key: |
|
rename_keys.append(( |
|
f"backbone.res_layers.{stage_idx}.blocks.{layer_idx}.branch2a.norm.{last}", |
|
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.0.normalization.{last}", |
|
)) |
|
|
|
rename_keys.append( |
|
( |
|
f"backbone.res_layers.{stage_idx}.blocks.{layer_idx}.branch2b.conv.weight", |
|
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.1.convolution.weight", |
|
) |
|
) |
|
for last in last_key: |
|
rename_keys.append(( |
|
f"backbone.res_layers.{stage_idx}.blocks.{layer_idx}.branch2b.norm.{last}", |
|
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.1.normalization.{last}", |
|
)) |
|
|
|
|
|
if config.backbone_config.layer_type != "basic": |
|
rename_keys.append( |
|
( |
|
f"backbone.res_layers.{stage_idx}.blocks.{layer_idx}.branch2c.conv.weight", |
|
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.2.convolution.weight", |
|
) |
|
) |
|
for last in last_key: |
|
rename_keys.append(( |
|
f"backbone.res_layers.{stage_idx}.blocks.{layer_idx}.branch2c.norm.{last}", |
|
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.2.normalization.{last}", |
|
)) |
|
|
|
|
|
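    # transformer encoder layers of the hybrid encoder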
for i in range(config.encoder_layers): |
|
|
|
rename_keys.append( |
|
( |
|
f"encoder.encoder.{i}.layers.0.self_attn.out_proj.weight", |
|
f"model.encoder.encoder.{i}.layers.0.self_attn.out_proj.weight", |
|
) |
|
) |
|
rename_keys.append( |
|
( |
|
f"encoder.encoder.{i}.layers.0.self_attn.out_proj.bias", |
|
f"model.encoder.encoder.{i}.layers.0.self_attn.out_proj.bias", |
|
) |
|
) |
|
rename_keys.append( |
|
( |
|
f"encoder.encoder.{i}.layers.0.linear1.weight", |
|
f"model.encoder.encoder.{i}.layers.0.fc1.weight", |
|
) |
|
) |
|
rename_keys.append( |
|
( |
|
f"encoder.encoder.{i}.layers.0.linear1.bias", |
|
f"model.encoder.encoder.{i}.layers.0.fc1.bias", |
|
) |
|
) |
|
rename_keys.append( |
|
( |
|
f"encoder.encoder.{i}.layers.0.linear2.weight", |
|
f"model.encoder.encoder.{i}.layers.0.fc2.weight", |
|
) |
|
) |
|
rename_keys.append( |
|
( |
|
f"encoder.encoder.{i}.layers.0.linear2.bias", |
|
f"model.encoder.encoder.{i}.layers.0.fc2.bias", |
|
) |
|
) |
|
rename_keys.append( |
|
( |
|
f"encoder.encoder.{i}.layers.0.norm1.weight", |
|
f"model.encoder.encoder.{i}.layers.0.self_attn_layer_norm.weight", |
|
) |
|
) |
|
rename_keys.append( |
|
( |
|
f"encoder.encoder.{i}.layers.0.norm1.bias", |
|
f"model.encoder.encoder.{i}.layers.0.self_attn_layer_norm.bias", |
|
) |
|
) |
|
rename_keys.append( |
|
( |
|
f"encoder.encoder.{i}.layers.0.norm2.weight", |
|
f"model.encoder.encoder.{i}.layers.0.final_layer_norm.weight", |
|
) |
|
) |
|
rename_keys.append( |
|
( |
|
f"encoder.encoder.{i}.layers.0.norm2.bias", |
|
f"model.encoder.encoder.{i}.layers.0.final_layer_norm.bias", |
|
) |
|
) |
|
|
|
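    # encoder input projections (conv + batch norm)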
    for j in range(3):
|
rename_keys.append((f"encoder.input_proj.{j}.conv.weight", f"model.encoder_input_proj.{j}.0.weight")) |
|
for last in last_key: |
|
rename_keys.append((f"encoder.input_proj.{j}.norm.{last}", f"model.encoder_input_proj.{j}.1.{last}")) |
|
|
|
block_levels = 3 if config.backbone_config.layer_type != "basic" else 4 |
|
|
|
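    # top-down FPN and bottom-up PAN paths of the hybrid encoder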
for i in range(len(config.encoder_in_channels) - 1): |
|
|
|
for j in range(1, block_levels): |
|
rename_keys.append( |
|
(f"encoder.fpn_blocks.{i}.conv{j}.conv.weight", f"model.encoder.fpn_blocks.{i}.conv{j}.conv.weight") |
|
) |
|
for last in last_key: |
|
rename_keys.append( |
|
( |
|
f"encoder.fpn_blocks.{i}.conv{j}.norm.{last}", |
|
f"model.encoder.fpn_blocks.{i}.conv{j}.norm.{last}", |
|
) |
|
) |
|
|
|
rename_keys.append((f"encoder.lateral_convs.{i}.conv.weight", f"model.encoder.lateral_convs.{i}.conv.weight")) |
|
for last in last_key: |
|
rename_keys.append( |
|
(f"encoder.lateral_convs.{i}.norm.{last}", f"model.encoder.lateral_convs.{i}.norm.{last}") |
|
) |
|
|
|
for j in range(3): |
|
for k in range(1, 3): |
|
rename_keys.append( |
|
( |
|
f"encoder.fpn_blocks.{i}.bottlenecks.{j}.conv{k}.conv.weight", |
|
f"model.encoder.fpn_blocks.{i}.bottlenecks.{j}.conv{k}.conv.weight", |
|
) |
|
) |
|
for last in last_key: |
|
rename_keys.append( |
|
( |
|
f"encoder.fpn_blocks.{i}.bottlenecks.{j}.conv{k}.norm.{last}", |
|
f"model.encoder.fpn_blocks.{i}.bottlenecks.{j}.conv{k}.norm.{last}", |
|
) |
|
) |
|
|
|
for j in range(1, block_levels): |
|
rename_keys.append( |
|
(f"encoder.pan_blocks.{i}.conv{j}.conv.weight", f"model.encoder.pan_blocks.{i}.conv{j}.conv.weight") |
|
) |
|
for last in last_key: |
|
rename_keys.append( |
|
( |
|
f"encoder.pan_blocks.{i}.conv{j}.norm.{last}", |
|
f"model.encoder.pan_blocks.{i}.conv{j}.norm.{last}", |
|
) |
|
) |
|
|
|
for j in range(3): |
|
for k in range(1, 3): |
|
rename_keys.append( |
|
( |
|
f"encoder.pan_blocks.{i}.bottlenecks.{j}.conv{k}.conv.weight", |
|
f"model.encoder.pan_blocks.{i}.bottlenecks.{j}.conv{k}.conv.weight", |
|
) |
|
) |
|
for last in last_key: |
|
rename_keys.append( |
|
( |
|
f"encoder.pan_blocks.{i}.bottlenecks.{j}.conv{k}.norm.{last}", |
|
f"model.encoder.pan_blocks.{i}.bottlenecks.{j}.conv{k}.norm.{last}", |
|
) |
|
) |
|
|
|
rename_keys.append( |
|
(f"encoder.downsample_convs.{i}.conv.weight", f"model.encoder.downsample_convs.{i}.conv.weight") |
|
) |
|
for last in last_key: |
|
rename_keys.append( |
|
(f"encoder.downsample_convs.{i}.norm.{last}", f"model.encoder.downsample_convs.{i}.norm.{last}") |
|
) |
|
|
|
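    # decoder layers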
for i in range(config.decoder_layers): |
|
|
|
rename_keys.append( |
|
( |
|
f"decoder.decoder.layers.{i}.self_attn.out_proj.weight", |
|
f"model.decoder.layers.{i}.self_attn.out_proj.weight", |
|
) |
|
) |
|
rename_keys.append( |
|
( |
|
f"decoder.decoder.layers.{i}.self_attn.out_proj.bias", |
|
f"model.decoder.layers.{i}.self_attn.out_proj.bias", |
|
) |
|
) |
|
rename_keys.append( |
|
( |
|
f"decoder.decoder.layers.{i}.cross_attn.sampling_offsets.weight", |
|
f"model.decoder.layers.{i}.encoder_attn.sampling_offsets.weight", |
|
) |
|
) |
|
rename_keys.append( |
|
( |
|
f"decoder.decoder.layers.{i}.cross_attn.sampling_offsets.bias", |
|
f"model.decoder.layers.{i}.encoder_attn.sampling_offsets.bias", |
|
) |
|
) |
|
rename_keys.append( |
|
( |
|
f"decoder.decoder.layers.{i}.cross_attn.attention_weights.weight", |
|
f"model.decoder.layers.{i}.encoder_attn.attention_weights.weight", |
|
) |
|
) |
|
rename_keys.append( |
|
( |
|
f"decoder.decoder.layers.{i}.cross_attn.attention_weights.bias", |
|
f"model.decoder.layers.{i}.encoder_attn.attention_weights.bias", |
|
) |
|
) |
|
rename_keys.append( |
|
( |
|
f"decoder.decoder.layers.{i}.cross_attn.value_proj.weight", |
|
f"model.decoder.layers.{i}.encoder_attn.value_proj.weight", |
|
) |
|
) |
|
rename_keys.append( |
|
( |
|
f"decoder.decoder.layers.{i}.cross_attn.value_proj.bias", |
|
f"model.decoder.layers.{i}.encoder_attn.value_proj.bias", |
|
) |
|
) |
|
rename_keys.append( |
|
( |
|
f"decoder.decoder.layers.{i}.cross_attn.output_proj.weight", |
|
f"model.decoder.layers.{i}.encoder_attn.output_proj.weight", |
|
) |
|
) |
|
rename_keys.append( |
|
( |
|
f"decoder.decoder.layers.{i}.cross_attn.output_proj.bias", |
|
f"model.decoder.layers.{i}.encoder_attn.output_proj.bias", |
|
) |
|
) |
|
rename_keys.append( |
|
(f"decoder.decoder.layers.{i}.norm1.weight", f"model.decoder.layers.{i}.self_attn_layer_norm.weight") |
|
) |
|
rename_keys.append( |
|
(f"decoder.decoder.layers.{i}.norm1.bias", f"model.decoder.layers.{i}.self_attn_layer_norm.bias") |
|
) |
|
rename_keys.append( |
|
(f"decoder.decoder.layers.{i}.norm2.weight", f"model.decoder.layers.{i}.encoder_attn_layer_norm.weight") |
|
) |
|
rename_keys.append( |
|
(f"decoder.decoder.layers.{i}.norm2.bias", f"model.decoder.layers.{i}.encoder_attn_layer_norm.bias") |
|
) |
|
rename_keys.append((f"decoder.decoder.layers.{i}.linear1.weight", f"model.decoder.layers.{i}.fc1.weight")) |
|
rename_keys.append((f"decoder.decoder.layers.{i}.linear1.bias", f"model.decoder.layers.{i}.fc1.bias")) |
|
rename_keys.append((f"decoder.decoder.layers.{i}.linear2.weight", f"model.decoder.layers.{i}.fc2.weight")) |
|
rename_keys.append((f"decoder.decoder.layers.{i}.linear2.bias", f"model.decoder.layers.{i}.fc2.bias")) |
|
rename_keys.append( |
|
(f"decoder.decoder.layers.{i}.norm3.weight", f"model.decoder.layers.{i}.final_layer_norm.weight") |
|
) |
|
rename_keys.append( |
|
(f"decoder.decoder.layers.{i}.norm3.bias", f"model.decoder.layers.{i}.final_layer_norm.bias") |
|
) |
|
|
|
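    # decoder class and bounding-box heads (one per decoder layer)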
for i in range(config.decoder_layers): |
|
|
|
rename_keys.append( |
|
( |
|
f"decoder.dec_score_head.{i}.weight", |
|
f"model.decoder.class_embed.{i}.weight", |
|
) |
|
) |
|
rename_keys.append( |
|
( |
|
f"decoder.dec_score_head.{i}.bias", |
|
f"model.decoder.class_embed.{i}.bias", |
|
) |
|
) |
|
rename_keys.append( |
|
( |
|
f"decoder.dec_bbox_head.{i}.layers.0.weight", |
|
f"model.decoder.bbox_embed.{i}.layers.0.weight", |
|
) |
|
) |
|
rename_keys.append( |
|
( |
|
f"decoder.dec_bbox_head.{i}.layers.0.bias", |
|
f"model.decoder.bbox_embed.{i}.layers.0.bias", |
|
) |
|
) |
|
rename_keys.append( |
|
( |
|
f"decoder.dec_bbox_head.{i}.layers.1.weight", |
|
f"model.decoder.bbox_embed.{i}.layers.1.weight", |
|
) |
|
) |
|
rename_keys.append( |
|
( |
|
f"decoder.dec_bbox_head.{i}.layers.1.bias", |
|
f"model.decoder.bbox_embed.{i}.layers.1.bias", |
|
) |
|
) |
|
rename_keys.append( |
|
( |
|
f"decoder.dec_bbox_head.{i}.layers.2.weight", |
|
f"model.decoder.bbox_embed.{i}.layers.2.weight", |
|
) |
|
) |
|
rename_keys.append( |
|
( |
|
f"decoder.dec_bbox_head.{i}.layers.2.bias", |
|
f"model.decoder.bbox_embed.{i}.layers.2.bias", |
|
) |
|
) |
|
|
|
|
|
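    # decoder input projections (conv + batch norm)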
for i in range(len(config.decoder_in_channels)): |
|
rename_keys.append( |
|
( |
|
f"decoder.input_proj.{i}.conv.weight", |
|
f"model.decoder_input_proj.{i}.0.weight", |
|
) |
|
) |
|
for last in last_key: |
|
rename_keys.append( |
|
( |
|
f"decoder.input_proj.{i}.norm.{last}", |
|
f"model.decoder_input_proj.{i}.1.{last}", |
|
) |
|
) |
|
|
|
|
|
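    # remaining weights: denoising class embedding, query position head, encoder output heads and buffers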
rename_keys.extend( |
|
[ |
|
("decoder.denoising_class_embed.weight", "model.denoising_class_embed.weight"), |
|
("decoder.query_pos_head.layers.0.weight", "model.decoder.query_pos_head.layers.0.weight"), |
|
("decoder.query_pos_head.layers.0.bias", "model.decoder.query_pos_head.layers.0.bias"), |
|
("decoder.query_pos_head.layers.1.weight", "model.decoder.query_pos_head.layers.1.weight"), |
|
("decoder.query_pos_head.layers.1.bias", "model.decoder.query_pos_head.layers.1.bias"), |
|
("decoder.enc_output.proj.weight", "model.enc_output.0.weight"), |
|
("decoder.enc_output.proj.bias", "model.enc_output.0.bias"), |
|
("decoder.enc_output.norm.weight", "model.enc_output.1.weight"), |
|
("decoder.enc_output.norm.bias", "model.enc_output.1.bias"), |
|
("decoder.enc_score_head.weight", "model.enc_score_head.weight"), |
|
("decoder.enc_score_head.bias", "model.enc_score_head.bias"), |
|
("decoder.enc_bbox_head.layers.0.weight", "model.enc_bbox_head.layers.0.weight"), |
|
("decoder.enc_bbox_head.layers.0.bias", "model.enc_bbox_head.layers.0.bias"), |
|
("decoder.enc_bbox_head.layers.1.weight", "model.enc_bbox_head.layers.1.weight"), |
|
("decoder.enc_bbox_head.layers.1.bias", "model.enc_bbox_head.layers.1.bias"), |
|
("decoder.enc_bbox_head.layers.2.weight", "model.enc_bbox_head.layers.2.weight"), |
|
("decoder.enc_bbox_head.layers.2.bias", "model.enc_bbox_head.layers.2.bias"), |
|
("decoder.decoder.layers.0.cross_attn.num_points_scale", "model.decoder.layers.0.cross_attn.num_points_scale"), |
|
("decoder.decoder.layers.1.cross_attn.num_points_scale", "model.decoder.layers.1.cross_attn.num_points_scale"), |
|
("decoder.decoder.layers.2.cross_attn.num_points_scale", "model.decoder.layers.2.cross_attn.num_points_scale"), |
|
("decoder.valid_mask", "model.decoder.valid_mask"), |
|
("decoder.anchors", "model.decoder.anchors"), |
|
] |
|
) |
|
|
|
return rename_keys |
|
|
|
|
|
def rename_key(state_dict, old, new):
    # not every renamed key exists in every checkpoint variant, so missing keys are skipped
    try:
        val = state_dict.pop(old)
        state_dict[new] = val
    except KeyError:
        pass
|
|
|
|
|
def read_in_q_k_v(state_dict, config): |
|
prefix = "" |
|
encoder_hidden_dim = config.encoder_hidden_dim |
|
|
|
|
|
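    # transformer encoder self-attention: split the fused in_proj weights into separate q/k/v projections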
for i in range(config.encoder_layers): |
|
|
|
in_proj_weight = state_dict.pop(f"{prefix}encoder.encoder.{i}.layers.0.self_attn.in_proj_weight") |
|
in_proj_bias = state_dict.pop(f"{prefix}encoder.encoder.{i}.layers.0.self_attn.in_proj_bias") |
|
|
|
state_dict[f"model.encoder.encoder.{i}.layers.0.self_attn.q_proj.weight"] = in_proj_weight[ |
|
:encoder_hidden_dim, : |
|
] |
|
state_dict[f"model.encoder.encoder.{i}.layers.0.self_attn.q_proj.bias"] = in_proj_bias[:encoder_hidden_dim] |
|
state_dict[f"model.encoder.encoder.{i}.layers.0.self_attn.k_proj.weight"] = in_proj_weight[ |
|
encoder_hidden_dim : 2 * encoder_hidden_dim, : |
|
] |
|
state_dict[f"model.encoder.encoder.{i}.layers.0.self_attn.k_proj.bias"] = in_proj_bias[ |
|
encoder_hidden_dim : 2 * encoder_hidden_dim |
|
] |
|
state_dict[f"model.encoder.encoder.{i}.layers.0.self_attn.v_proj.weight"] = in_proj_weight[ |
|
-encoder_hidden_dim:, : |
|
] |
|
state_dict[f"model.encoder.encoder.{i}.layers.0.self_attn.v_proj.bias"] = in_proj_bias[-encoder_hidden_dim:] |
|
|
|
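    # decoder self-attention: the fused in_proj is split with hidden size 256 (the default config.d_model)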
for i in range(config.decoder_layers): |
|
|
|
in_proj_weight = state_dict.pop(f"{prefix}decoder.decoder.layers.{i}.self_attn.in_proj_weight") |
|
in_proj_bias = state_dict.pop(f"{prefix}decoder.decoder.layers.{i}.self_attn.in_proj_bias") |
|
|
|
state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] |
|
state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] |
|
state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] |
|
state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] |
|
state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] |
|
state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] |
|
|
|
|
|
|
|
def prepare_img():
    # standard COCO verification image (two cats on a couch) used across conversion scripts
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    im = Image.open(requests.get(url, stream=True, timeout=60).raw)
    return im
|
|
|
|
|
@torch.no_grad() |
|
def convert_rt_detr_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub, repo_id): |
|
""" |
|
Copy/paste/tweak model's weights to our RTDETR structure. |
|
""" |
|
|
|
|
|
config = get_rt_detr_config(model_name) |
|
|
|
|
|
    # only the r18vd checkpoint is mapped so far; the URL points to released RT-DETRv2 weights
    model_name_to_checkpoint_url = {
        "rtdetr_r18vd": "https://github.com/lyuwenyu/storage/releases/download/v0.2/rtdetrv2_r18vd_120e_coco_rerun_48.1.pth"
    }
    if model_name not in model_name_to_checkpoint_url:
        raise ValueError(f"No checkpoint URL is registered for model_name={model_name!r}")
    logger.info(f"Converting model {model_name}...")
    # the released checkpoints store the weights under the "ema" -> "module" entry
    state_dict = torch.hub.load_state_dict_from_url(model_name_to_checkpoint_url[model_name], map_location="cpu")[
        "ema"
    ]["module"]
|
|
|
|
|
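    # rename keys from the original checkpoint layout to the HF layout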
for src, dest in create_rename_keys(config): |
|
rename_key(state_dict, src, dest) |
|
|
|
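    # query, key and value matrices need special treatment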
read_in_q_k_v(state_dict, config) |
|
|
|
    for key in state_dict.copy().keys():
        # batch norm statistics counters are not used by the HF model
        if key.endswith("num_batches_tracked"):
            del state_dict[key]
        # for the two-stage setup, the decoder prediction heads are also exposed on the top-level model
        if "bbox_embed" in key or ("class_embed" in key and "denoising_" not in key):
            state_dict[key.split("model.decoder.")[-1]] = state_dict[key]
|
|
|
print("done renaming now loading") |
|
|
|
    model = RTDetrForObjectDetection(config)
    # strict=False: depending on the checkpoint variant, a few renamed keys may be absent
    model.load_state_dict(state_dict, strict=False)
    model.eval()
|
|
|
|
|
image_processor = RTDetrImageProcessor() |
|
|
|
|
|
img = prepare_img() |
|
|
|
|
|
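    # verify that the default RTDetrImageProcessor matches the original torchvision preprocessing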
transformations = transforms.Compose( |
|
[ |
|
transforms.Resize([640, 640], interpolation=transforms.InterpolationMode.BILINEAR), |
|
transforms.ToTensor(), |
|
] |
|
) |
|
original_pixel_values = transformations(img).unsqueeze(0) |
|
|
|
encoding = image_processor(images=img, return_tensors="pt") |
|
pixel_values = encoding["pixel_values"] |
|
|
|
assert torch.allclose(original_pixel_values, pixel_values) |
|
|
|
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") |
|
model.to(device) |
|
pixel_values = pixel_values.to(device) |
|
|
|
|
|
outputs = model(pixel_values) |
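    # Minimal sanity check (assumption: this stands in for the original per-checkpoint
    # expected-logits comparison, whose reference values are not reproduced here). We only
    # verify output shapes and print the top scores for manual inspection against the
    # original implementation.
    assert outputs.logits.shape == (1, config.num_queries, config.num_labels)
    assert outputs.pred_boxes.shape == (1, config.num_queries, 4)
    top_scores = outputs.logits.sigmoid().flatten(1).topk(5).values
    print(f"Top-5 scores: {top_scores}")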
|
|
if pytorch_dump_folder_path is not None: |
|
        Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True)
|
print(f"Saving model {model_name} to {pytorch_dump_folder_path}") |
|
model.save_pretrained(pytorch_dump_folder_path) |
|
print(f"Saving image processor to {pytorch_dump_folder_path}") |
|
image_processor.save_pretrained(pytorch_dump_folder_path) |
|
|
|
if push_to_hub: |
|
|
|
logger.info("Uploading PyTorch model and image processor to the hub...") |
|
config.push_to_hub( |
|
repo_id=repo_id, commit_message="Add config from convert_rt_detr_original_pytorch_checkpoint_to_pytorch.py" |
|
) |
|
model.push_to_hub( |
|
repo_id=repo_id, commit_message="Add model from convert_rt_detr_original_pytorch_checkpoint_to_pytorch.py" |
|
) |
|
image_processor.push_to_hub( |
|
repo_id=repo_id, |
|
commit_message="Add image processor from convert_rt_detr_original_pytorch_checkpoint_to_pytorch.py", |
|
) |
|
|
|
|
|
if __name__ == "__main__": |
|
parser = argparse.ArgumentParser() |
|
parser.add_argument( |
|
"--model_name", |
|
default="rtdetr_r50vd", |
|
type=str, |
|
help="model_name of the checkpoint you'd like to convert.", |
|
) |
|
parser.add_argument( |
|
"--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." |
|
) |
|
parser.add_argument("--push_to_hub", action="store_true", help="Whether to push the model to the hub or not.") |
|
parser.add_argument( |
|
"--repo_id", |
|
type=str, |
|
help="repo_id where the model will be pushed to.", |
|
) |
|
args = parser.parse_args() |
|
convert_rt_detr_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub, args.repo_id) |