Upload 35 files
- FSC_pretrain.py +380 -0
- FSC_tain.py +532 -0
- FSC_test.py +352 -0
- LICENSE +21 -0
- README.md +100 -3
- __pycache__/models_crossvit.cpython-38.pyc +0 -0
- __pycache__/models_mae_cross.cpython-38.pyc +0 -0
- __pycache__/models_mae_noct.cpython-38.pyc +0 -0
- __pycache__/models_mae_noct.cpython-39.pyc +0 -0
- biclassify.py +163 -0
- datasetmake.py +53 -0
- figure.png +0 -0
- grounding_neg.py +188 -0
- grounding_pos.py +141 -0
- models_crossvit.py +155 -0
- models_mae_cross.py +253 -0
- models_mae_noct.py +234 -0
- requirements.txt +15 -0
- util/FSC147.py +524 -0
- util/__pycache__/FSC147.cpython-38.pyc +0 -0
- util/__pycache__/FSC147.cpython-39.pyc +0 -0
- util/__pycache__/FSC147_test.cpython-38.pyc +0 -0
- util/__pycache__/lr_sched.cpython-38.pyc +0 -0
- util/__pycache__/lr_sched.cpython-39.pyc +0 -0
- util/__pycache__/misc.cpython-38.pyc +0 -0
- util/__pycache__/misc.cpython-39.pyc +0 -0
- util/__pycache__/pos_embed.cpython-38.pyc +0 -0
- util/__pycache__/pos_embed.cpython-39.pyc +0 -0
- util/crop.py +42 -0
- util/datasets.py +65 -0
- util/lars.py +47 -0
- util/lr_decay.py +76 -0
- util/lr_sched.py +21 -0
- util/misc.py +624 -0
- util/pos_embed.py +97 -0
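The three FSC_* scripts below resolve all of their inputs relative to --data_path. As a rough sketch of the dataset layout they assume (names taken from the argparse defaults in the scripts; adjust for your local copy of FSC-147):

from pathlib import Path

# Sketch only: mirrors the argparse defaults used in FSC_pretrain.py / FSC_tain.py / FSC_test.py.
data_path = Path('./data/FSC147/')
anno_file = data_path / 'annotation_FSC147_384.json'          # point annotations + exemplar boxes
data_split_file = data_path / 'Train_Test_Val_FSC_147.json'   # train/val/test image id lists
im_dir = data_path / 'images_384_VarV2'                       # resized images
gt_dir = data_path / 'gt_density_map_adaptive_384_VarV2'      # .npy ground-truth density maps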
FSC_pretrain.py
ADDED
@@ -0,0 +1,380 @@
import argparse
import datetime
import json

import PIL.Image
import numpy as np
import os
import time
import random
from pathlib import Path
import math
import sys
from PIL import Image

import torch
import torch.backends.cudnn as cudnn
from torch.utils.tensorboard import SummaryWriter
import torch.nn.functional as F
from torch.utils.data import Dataset
import wandb
import timm

assert "0.4.5" <= timm.__version__ <= "0.4.9"  # version check
import timm.optim.optim_factory as optim_factory

import util.misc as misc
from util.misc import NativeScalerWithGradNormCount as NativeScaler
import util.lr_sched as lr_sched
from util.FSC147 import transform_pre_train
import models_mae_noct


def get_args_parser():
    parser = argparse.ArgumentParser('MAE pre-training', add_help=False)
    parser.add_argument('--batch_size', default=8, type=int,
                        help='Batch size per GPU (effective batch size is batch_size * accum_iter * # gpus)')
    parser.add_argument('--epochs', default=200, type=int)
    parser.add_argument('--accum_iter', default=1, type=int,
                        help='Accumulate gradient iterations (for increasing the effective batch size under memory constraints)')

    # Model parameters
    parser.add_argument('--model', default='mae_vit_base_patch16', type=str, metavar='MODEL',
                        help='Name of model to train')

    parser.add_argument('--mask_ratio', default=0.5, type=float,
                        help='Masking ratio (percentage of removed patches).')

    parser.add_argument('--norm_pix_loss', action='store_true',
                        help='Use (per-patch) normalized pixels as targets for computing loss')
    parser.set_defaults(norm_pix_loss=False)

    # Optimizer parameters
    parser.add_argument('--weight_decay', type=float, default=0.05,
                        help='weight decay (default: 0.05)')
    parser.add_argument('--lr', type=float, default=None, metavar='LR',
                        help='learning rate (absolute lr)')
    parser.add_argument('--blr', type=float, default=1e-3, metavar='LR',
                        help='base learning rate: absolute_lr = base_lr * total_batch_size / 256')
    parser.add_argument('--min_lr', type=float, default=0., metavar='LR',
                        help='lower lr bound for cyclic schedulers that hit 0')
    parser.add_argument('--warmup_epochs', type=int, default=10, metavar='N',
                        help='epochs to warmup LR')

    # Dataset parameters
    parser.add_argument('--data_path', default='./data/FSC147/', type=str,
                        help='dataset path')
    parser.add_argument('--anno_file', default='annotation_FSC147_384.json', type=str,
                        help='annotation json file')
    parser.add_argument('--data_split_file', default='Train_Test_Val_FSC_147.json', type=str,
                        help='data split json file')
    parser.add_argument('--im_dir', default='images_384_VarV2', type=str,
                        help='images directory')
    parser.add_argument('--gt_dir', default='gt_density_map_adaptive_384_VarV2', type=str,
                        help='ground truth directory')
    parser.add_argument('--output_dir', default='./data/out/pre_4_dir',
                        help='path where to save, empty for no saving')
    parser.add_argument('--device', default='cuda:5',
                        help='device to use for training / testing')
    parser.add_argument('--seed', default=0, type=int)
    parser.add_argument('--resume', default='./weights/mae_pretrain_vit_base_full.pth',  # mae_visualize_vit_base
                        help='resume from checkpoint')

    # Training parameters
    parser.add_argument('--start_epoch', default=0, type=int, metavar='N',
                        help='start epoch')
    parser.add_argument('--num_workers', default=10, type=int)
    parser.add_argument('--pin_mem', action='store_true',
                        help='Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.')
    parser.add_argument('--no_pin_mem', action='store_false', dest='pin_mem')
    parser.set_defaults(pin_mem=True)

    # Distributed training parameters
    parser.add_argument('--world_size', default=1, type=int,
                        help='number of distributed processes')
    parser.add_argument('--local_rank', default=-1, type=int)
    parser.add_argument('--dist_on_itp', action='store_true')
    parser.add_argument('--dist_url', default='env://',
                        help='url used to set up distributed training')

    # Logging parameters
    parser.add_argument('--log_dir', default='./logs/pre_4_dir',
                        help='path where to tensorboard log')
    parser.add_argument("--title", default="CounTR_pretraining", type=str)
    parser.add_argument("--wandb", default="counting", type=str)
    parser.add_argument("--team", default="wsense", type=str)
    parser.add_argument("--wandb_id", default=None, type=str)
    parser.add_argument('--anno_file_negative', default='annotation_FSC147_negative1.json', type=str,
                        help='annotation json file')
    return parser


os.environ["CUDA_LAUNCH_BLOCKING"] = '5'


class TrainData(Dataset):
    def __init__(self):
        self.img = data_split['train']
        random.shuffle(self.img)
        self.img_dir = im_dir
        self.TransformPreTrain = transform_pre_train(data_path)

    def __len__(self):
        return len(self.img)

    def __getitem__(self, idx):
        im_id = self.img[idx]
        anno = annotations[im_id]
        bboxes = anno['box_examples_coordinates']
        # box_coordinates = anno.get('box_examples_coordinates', {})  # get the bounding-box coordinates for this image
        # # print(box_coordinates)
        # # get the bounding-box list of the first category
        # first_category = next(iter(box_coordinates), None)
        # # print(first_category)
        # first_category_bboxes = box_coordinates[first_category]
        # if first_category_bboxes:
        #     # print(first_category_bboxes[0])
        #     bboxes = first_category_bboxes[0]
        # else:
        #     bboxes = []
        # # if first_category_bboxes:
        # #     bboxes = first_category_bboxes[0]
        # # else:
        # #     pass

        rects = list()
        for bbox in bboxes:
            x1 = bbox[0][0]
            y1 = bbox[0][1]
            x2 = bbox[2][0]
            y2 = bbox[2][1]
            rects.append([y1, x1, y2, x2])

        image = Image.open('{}/{}'.format(im_dir, im_id))
        image.load()
        density_path = gt_dir / (im_id.split(".jpg")[0] + ".npy")
        density = np.load(density_path).astype('float32')
        sample = {'image': image, 'lines_boxes': rects, 'gt_density': density}
        sample = self.TransformPreTrain(sample)
        return sample['image']


def main(args):
    misc.init_distributed_mode(args)

    print('job dir: {}'.format(os.path.dirname(os.path.realpath(__file__))))
    print("{}".format(args).replace(', ', ',\n'))

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + misc.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)

    cudnn.benchmark = True

    dataset_train = TrainData()
    print(dataset_train)

    if True:  # args.distributed:
        num_tasks = misc.get_world_size()
        global_rank = misc.get_rank()
        sampler_train = torch.utils.data.DistributedSampler(
            dataset_train, num_replicas=num_tasks, rank=global_rank, shuffle=True
        )
        print("Sampler_train = %s" % str(sampler_train))
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)

    if global_rank == 0:
        if args.log_dir is not None:
            os.makedirs(args.log_dir, exist_ok=True)
            log_writer = SummaryWriter(log_dir=args.log_dir)
        else:
            log_writer = None
        if args.wandb is not None:
            wandb_run = wandb.init(
                config=args,
                resume="allow",
                project=args.wandb,
                name=args.title,
                # entity=args.team,
                tags=["CounTR", "pretraining"],
                id=args.wandb_id,
            )
        else:
            wandb_run = None

    data_loader_train = torch.utils.data.DataLoader(
        dataset_train, sampler=sampler_train,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        pin_memory=args.pin_mem,
        drop_last=False,
    )

    # define the model
    model = models_mae_noct.__dict__[args.model](norm_pix_loss=args.norm_pix_loss)

    model.to(device)

    model_without_ddp = model

    print("Model = %s" % str(model_without_ddp))

    eff_batch_size = args.batch_size * args.accum_iter * misc.get_world_size()

    if args.lr is None:  # only base_lr is specified
        args.lr = args.blr * eff_batch_size / 256

    print("base lr: %.2e" % (args.lr * 256 / eff_batch_size))
    print("actual lr: %.2e" % args.lr)

    print("accumulate grad iterations: %d" % args.accum_iter)
    print("effective batch size: %d" % eff_batch_size)

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], find_unused_parameters=True)
        model_without_ddp = model.module

    # following timm: set wd as 0 for bias and norm layers
    param_groups = optim_factory.add_weight_decay(model_without_ddp, args.weight_decay)
    optimizer = torch.optim.AdamW(param_groups, lr=args.lr, betas=(0.9, 0.95))
    print(optimizer)
    loss_scaler = NativeScaler()

    misc.load_model(args=args, model_without_ddp=model_without_ddp, optimizer=optimizer, loss_scaler=loss_scaler)

    print(f"Start training for {args.epochs} epochs")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            data_loader_train.sampler.set_epoch(epoch)

        # train one epoch
        model.train(True)
        metric_logger = misc.MetricLogger(delimiter="  ")
        metric_logger.add_meter('lr', misc.SmoothedValue(window_size=1, fmt='{value:.6f}'))
        header = 'Epoch: [{}]'.format(epoch)
        print_freq = 20
        accum_iter = args.accum_iter

        optimizer.zero_grad()

        if log_writer is not None:
            print('log_dir: {}'.format(log_writer.log_dir))

        model_ = getattr(models_mae_noct, args.model)()

        for data_iter_step, samples in enumerate(metric_logger.log_every(data_loader_train, print_freq, header)):
            epoch_1000x = int((data_iter_step / len(data_loader_train) + epoch) * 1000)

            if data_iter_step % accum_iter == 0:
                lr_sched.adjust_learning_rate(optimizer, data_iter_step / len(data_loader_train) + epoch, args)

            samples = samples.to(device, non_blocking=True)

            with torch.cuda.amp.autocast():
                loss, pred, mask = model(samples, mask_ratio=args.mask_ratio)

            loss_value = loss.item()

            if data_iter_step % 2000 == 0:
                preds = model_.unpatchify(pred)
                preds = preds.float()
                preds = torch.einsum('nchw->nhwc', preds)
                preds = torch.clip(preds, 0, 1)

                if log_writer is not None:
                    log_writer.add_images('reconstruction', preds, int(epoch), dataformats='NHWC')

                if wandb_run is not None:
                    wandb_images = []
                    w_samples = torch.einsum('nchw->nhwc', samples.float()).clip(0, 1)
                    masks = F.interpolate(
                        mask.reshape(shape=(mask.shape[0], 1, int(mask.shape[1] ** .5), int(mask.shape[1] ** .5))),
                        size=(preds.shape[1], preds.shape[2]))
                    masks = torch.einsum('nchw->nhwc', masks.float())
                    combos = (w_samples + masks.repeat(1, 1, 1, 3)).clip(0, 1)
                    w_images = (torch.cat([w_samples, combos, preds], dim=2) * 255).detach().cpu()
                    print("w_images:", w_samples.shape, combos.shape, preds.shape, "-->", w_images.shape)

                    for i in range(w_images.shape[0]):
                        wi = w_images[i, :, :, :]
                        wandb_images += [wandb.Image(wi.numpy().astype(np.uint8),
                                                     caption=f"Prediction {i} at epoch {epoch}")]
                    wandb.log({f"reconstruction": wandb_images}, step=epoch_1000x, commit=False)

            if not math.isfinite(loss_value):
                print("Loss is {}, stopping training".format(loss_value))
                sys.exit(1)

            loss /= accum_iter
            loss_scaler(loss, optimizer, parameters=model.parameters(),
                        update_grad=(data_iter_step + 1) % accum_iter == 0)
            if (data_iter_step + 1) % accum_iter == 0:
                optimizer.zero_grad()

            torch.cuda.synchronize()

            metric_logger.update(loss=loss_value)

            lr = optimizer.param_groups[0]["lr"]
            metric_logger.update(lr=lr)

            loss_value_reduce = misc.all_reduce_mean(loss_value)
            if (data_iter_step + 1) % accum_iter == 0:
                if log_writer is not None:
                    """ We use epoch_1000x as the x-axis in tensorboard.
                    This calibrates different curves when batch size changes.
                    """
                    log_writer.add_scalar('train_loss', loss_value_reduce, epoch_1000x)
                    log_writer.add_scalar('lr', lr, epoch_1000x)
                if wandb_run is not None:
                    log = {"train/loss": loss_value_reduce, "train/lr": lr}
                    wandb.log(log, step=epoch_1000x, commit=True if data_iter_step == 0 else False)

        metric_logger.synchronize_between_processes()
        print("Averaged stats:", metric_logger)
        train_stats = {k: meter.global_avg for k, meter in metric_logger.meters.items()}

        # save train status and model
        if args.output_dir and (epoch % 100 == 0 or epoch + 1 == args.epochs):
            misc.save_model(args=args, model=model, model_without_ddp=model_without_ddp, optimizer=optimizer,
                            loss_scaler=loss_scaler, epoch=epoch, suffix=f"pretraining_{epoch}")

        log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
                     'epoch': epoch, }

        if args.output_dir and misc.is_main_process():
            if log_writer is not None:
                log_writer.flush()
            with open(os.path.join(args.output_dir, "log.txt"), mode="a", encoding="utf-8") as f:
                f.write(json.dumps(log_stats) + "\n")

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
    wandb.run.finish()


if __name__ == '__main__':
    args = get_args_parser()
    args = args.parse_args()

    # load data
    data_path = Path(args.data_path)
    anno_file = data_path / args.anno_file
    data_split_file = data_path / args.data_split_file
    im_dir = data_path / args.im_dir
    gt_dir = data_path / args.gt_dir
    with open(anno_file) as f:
        annotations = json.load(f)
    with open(data_split_file) as f:
        data_split = json.load(f)

    if args.output_dir:
        Path(args.output_dir).mkdir(parents=True, exist_ok=True)
    main(args)
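FSC_pretrain.py only derives the absolute learning rate when --lr is left unset, using the linear scaling rule from the code above. A minimal sketch of that computation, plugging in the script's default values (these are defaults, not tuned settings):

# Sketch: linear lr scaling as done in FSC_pretrain.py when args.lr is None.
blr = 1e-3          # --blr default
batch_size = 8      # --batch_size default (per GPU)
accum_iter = 1      # --accum_iter default
world_size = 1      # single process
eff_batch_size = batch_size * accum_iter * world_size
lr = blr * eff_batch_size / 256   # -> 3.125e-05 with the defaults above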
FSC_tain.py
ADDED
@@ -0,0 +1,532 @@
import argparse
import datetime
import json
import numpy as np
import os
import time
import random
from pathlib import Path
import sys
from PIL import Image
import torch.nn.functional as F
import torch
import torch.backends.cudnn as cudnn
from torch.utils.data import Dataset
import torchvision
import wandb
import timm
from tqdm import tqdm

assert "0.4.5" <= timm.__version__ <= "0.4.9"  # version check
import timm.optim.optim_factory as optim_factory

import util.misc as misc
from util.misc import NativeScalerWithGradNormCount as NativeScaler
import util.lr_sched as lr_sched
from util.FSC147 import transform_train, transform_val
import models_mae_cross


def get_args_parser():
    parser = argparse.ArgumentParser('MAE pre-training', add_help=True)
    parser.add_argument('--batch_size', default=26, type=int,
                        help='Batch size per GPU (effective batch size is batch_size * accum_iter * # gpus)')
    parser.add_argument('--epochs', default=200, type=int)
    parser.add_argument('--accum_iter', default=1, type=int,
                        help='Accumulate gradient iterations (for increasing the effective batch size under memory constraints)')

    # Model parameters
    parser.add_argument('--model', default='mae_vit_base_patch16', type=str, metavar='MODEL',
                        help='Name of model to train')
    parser.add_argument('--mask_ratio', default=0.5, type=float,
                        help='Masking ratio (percentage of removed patches).')
    parser.add_argument('--norm_pix_loss', action='store_true',
                        help='Use (per-patch) normalized pixels as targets for computing loss')
    parser.set_defaults(norm_pix_loss=False)

    # Optimizer parameters
    parser.add_argument('--weight_decay', type=float, default=0.05,
                        help='weight decay (default: 0.05)')
    parser.add_argument('--lr', type=float, default=None, metavar='LR',
                        help='learning rate (absolute lr)')
    parser.add_argument('--blr', type=float, default=1e-3, metavar='LR',
                        help='base learning rate: absolute_lr = base_lr * total_batch_size / 256')
    parser.add_argument('--min_lr', type=float, default=0., metavar='LR',
                        help='lower lr bound for cyclic schedulers that hit 0')
    parser.add_argument('--warmup_epochs', type=int, default=10, metavar='N',
                        help='epochs to warmup LR')

    # Dataset parameters
    parser.add_argument('--data_path', default='./data/FSC147/', type=str,
                        help='dataset path')
    parser.add_argument('--anno_file', default='annotation_FSC147_pos.json', type=str,
                        help='annotation json file for positive samples')
    parser.add_argument('--anno_file_negative', default='./data/FSC147/annotation_FSC147_neg.json', type=str,
                        help='annotation json file for negative samples')
    parser.add_argument('--data_split_file', default='Train_Test_Val_FSC_147.json', type=str,
                        help='data split json file')
    parser.add_argument('--class_file', default='ImageClasses_FSC147.txt', type=str,
                        help='class json file')
    parser.add_argument('--im_dir', default='images_384_VarV2', type=str,
                        help='images directory')
    parser.add_argument('--output_dir', default='./data/out/fim6_dir',
                        help='path where to save, empty for no saving')
    parser.add_argument('--device', default='cuda',
                        help='device to use for training / testing')
    parser.add_argument('--seed', default=0, type=int)
    parser.add_argument('--resume', default='./data/checkpoint.pth',
                        help='resume from checkpoint')
    parser.add_argument('--do_resume', action='store_true',
                        help='Resume training (e.g. if crashed).')

    # Training parameters
    parser.add_argument('--start_epoch', default=0, type=int, metavar='N',
                        help='start epoch')
    parser.add_argument('--num_workers', default=10, type=int)
    parser.add_argument('--pin_mem', action='store_true',
                        help='Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.')
    parser.add_argument('--no_pin_mem', action='store_false', dest='pin_mem')
    parser.set_defaults(pin_mem=True)
    parser.add_argument('--do_aug', action='store_true',
                        help='Perform data augmentation.')
    parser.add_argument('--no_do_aug', action='store_false', dest='do_aug')
    parser.set_defaults(do_aug=True)

    # Distributed training parameters
    parser.add_argument('--world_size', default=1, type=int,
                        help='number of distributed processes')
    parser.add_argument('--local_rank', default=-1, type=int)
    parser.add_argument('--dist_on_itp', action='store_true')
    parser.add_argument('--dist_url', default='env://',
                        help='url used to set up distributed training')

    # Logging parameters
    parser.add_argument("--title", default="count", type=str)
    parser.add_argument("--wandb", default="240227", type=str)
    parser.add_argument("--team", default="wsense", type=str)
    parser.add_argument("--wandb_id", default=None, type=str)

    return parser


os.environ["CUDA_LAUNCH_BLOCKING"] = '0'

class TrainData(Dataset):
    def __init__(self, args, split='train', do_aug=True):
        with open(args.anno_file) as f:
            annotations = json.load(f)
        # Load negative annotations
        with open(args.anno_file_negative) as f:
            neg_annotations = json.load(f)
        with open(args.data_split_file) as f:
            data_split = json.load(f)

        self.img = data_split[split]
        random.shuffle(self.img)
        self.split = split
        self.img_dir = im_dir
        self.TransformTrain = transform_train(args, do_aug=do_aug)
        self.TransformVal = transform_val(args)
        self.annotations = annotations
        self.neg_annotations = neg_annotations
        self.im_dir = im_dir

    def __len__(self):
        return len(self.img)

    def __getitem__(self, idx):
        im_id = self.img[idx]
        anno = self.annotations[im_id]
        bboxes = anno['box_examples_coordinates']
        dots = np.array(anno['points'])

        # Load the negative exemplar boxes
        neg_anno = self.neg_annotations[im_id]  # assumes every image ID has an entry in the negative annotations
        neg_bboxes = neg_anno['box_examples_coordinates']

        rects = list()
        for bbox in bboxes:
            x1 = bbox[0][0]
            y1 = bbox[0][1]
            x2 = bbox[2][0]
            y2 = bbox[2][1]
            if x1 < 0:
                x1 = 0
            if x2 < 0:
                x2 = 0
            if y1 < 0:
                y1 = 0
            if y2 < 0:
                y2 = 0

            rects.append([y1, x1, y2, x2])
        neg_rects = list()
        for neg_bbox in neg_bboxes:
            x1 = neg_bbox[0][0]
            y1 = neg_bbox[0][1]
            x2 = neg_bbox[2][0]
            y2 = neg_bbox[2][1]
            if x1 < 0:
                x1 = 0
            if x2 < 0:
                x2 = 0
            if y1 < 0:
                y1 = 0
            if y2 < 0:
                y2 = 0

            neg_rects.append([y1, x1, y2, x2])

        image = Image.open('{}/{}'.format(self.im_dir, im_id))
        if image.mode == "RGBA":
            image = image.convert("RGB")
        image.load()
        m_flag = 0

        sample = {'image': image, 'lines_boxes': rects, 'neg_lines_boxes': neg_rects, 'dots': dots, 'id': im_id, 'm_flag': m_flag}
        sample = self.TransformTrain(sample) if self.split == "train" else self.TransformVal(sample)
        return sample['image'], sample['gt_density'], len(dots), sample['boxes'], sample['neg_boxes'], sample['pos'], sample['m_flag'], im_id


def main(args):
    wandb_run = None
    try:
        misc.init_distributed_mode(args)

        print('job dir: {}'.format(os.path.dirname(os.path.realpath(__file__))))
        print("{}".format(args).replace(', ', ',\n'))

        device = torch.device(args.device)
        # if torch.cuda.is_available():
        #     device = torch.device("cuda:5")

        # fix the seed for reproducibility
        seed = args.seed + misc.get_rank()
        torch.manual_seed(seed)
        np.random.seed(seed)
        cudnn.benchmark = True

        dataset_train = TrainData(args, do_aug=args.do_aug)
        dataset_val = TrainData(args, split='val')

        num_tasks = misc.get_world_size()
        global_rank = misc.get_rank()
        sampler_train = torch.utils.data.DistributedSampler(
            dataset_train, num_replicas=num_tasks, rank=global_rank, shuffle=True
        )
        sampler_val = torch.utils.data.DistributedSampler(
            dataset_val, num_replicas=num_tasks, rank=global_rank, shuffle=True
        )

        if global_rank == 0:
            if args.wandb is not None:
                wandb_run = wandb.init(
                    config=args,
                    resume="allow",
                    project=args.wandb,
                    name=args.title,
                    # entity=args.team,
                    tags=["count", "finetuning"],
                    id=args.wandb_id,
                )

        data_loader_train = torch.utils.data.DataLoader(
            dataset_train, sampler=sampler_train,
            batch_size=args.batch_size,
            num_workers=args.num_workers,
            pin_memory=args.pin_mem,
            drop_last=False,
        )
        data_loader_val = torch.utils.data.DataLoader(
            dataset_val, sampler=sampler_val,
            batch_size=args.batch_size,
            num_workers=args.num_workers,
            pin_memory=args.pin_mem,
            drop_last=False,
        )

        # define the model
        model = models_mae_cross.__dict__[args.model](norm_pix_loss=args.norm_pix_loss)
        model.to(device)
        model_without_ddp = model
        # print("Model = %s" % str(model_without_ddp))

        eff_batch_size = args.batch_size * args.accum_iter * misc.get_world_size()

        if args.lr is None:  # only base_lr is specified
            args.lr = args.blr * eff_batch_size / 256

        print("base lr: %.2e" % (args.lr * 256 / eff_batch_size))
        print("actual lr: %.2e" % args.lr)

        print("accumulate grad iterations: %d" % args.accum_iter)
        print("effective batch size: %d" % eff_batch_size)

        if args.distributed:
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], find_unused_parameters=True)
            model_without_ddp = model.module

        # following timm: set wd as 0 for bias and norm layers
        param_groups = optim_factory.add_weight_decay(model_without_ddp, args.weight_decay)
        optimizer = torch.optim.AdamW(param_groups, lr=args.lr, betas=(0.9, 0.95))
        print(optimizer)

        loss_scaler = NativeScaler()

        min_MAE = 99999
        print_freq = 50
        save_freq = 50

        misc.load_model_FSC_full(args=args, model_without_ddp=model_without_ddp, optimizer=optimizer, loss_scaler=loss_scaler)

        print(f"Start training for {args.epochs - args.start_epoch} epochs - rank {global_rank}")
        start_time = time.time()
        for epoch in range(args.start_epoch, args.epochs):
            if args.distributed:
                data_loader_train.sampler.set_epoch(epoch)

            # train one epoch
            model.train(True)
            accum_iter = args.accum_iter

            # some parameters in training
            train_mae = torch.tensor([0], dtype=torch.float64, device=device)
            train_mse = torch.tensor([0], dtype=torch.float64, device=device)
            val_mae = torch.tensor([0], dtype=torch.float64, device=device)
            val_mse = torch.tensor([0], dtype=torch.float64, device=device)
            val_nae = torch.tensor([0], dtype=torch.float64, device=device)

            optimizer.zero_grad()

            for data_iter_step, (samples, gt_density, _, pos_boxes, neg_boxes, pos, m_flag, im_names) in enumerate(
                    tqdm(data_loader_train, total=len(data_loader_train), desc=f"Train [e. {epoch} - r. {global_rank}]")):
                idx = data_iter_step + (epoch * len(data_loader_train))

                if data_iter_step % accum_iter == 0:
                    lr_sched.adjust_learning_rate(optimizer, data_iter_step / len(data_loader_train) + epoch, args)

                samples = samples.to(device, non_blocking=True, dtype=torch.half)
                gt_density = gt_density.to(device, non_blocking=True, dtype=torch.half)
                pos_boxes = pos_boxes.to(device, non_blocking=True, dtype=torch.half)
                neg_boxes = neg_boxes.to(device, non_blocking=True, dtype=torch.half)

                # If at least one image in the batch uses Type 2 Mosaic, 0-shot is disabled.
                flag = 0
                for i in range(m_flag.shape[0]):
                    flag += m_flag[i].item()
                if flag == 0:
                    shot_num = random.randint(0, 3)
                else:
                    shot_num = random.randint(1, 3)

                with torch.cuda.amp.autocast():
                    pos_output = model(samples, pos_boxes, shot_num)  # positive-exemplar output

                # compute the positive-sample loss
                mask = np.random.binomial(n=1, p=0.8, size=[384, 384])
                masks = np.tile(mask, (pos_output.shape[0], 1))
                masks = masks.reshape(pos_output.shape[0], 384, 384)
                masks = torch.from_numpy(masks).to(device)
                pos_loss = ((pos_output - gt_density) ** 2)
                pos_loss = (pos_loss * masks / (384 * 384)).sum() / pos_output.shape[0]
                # negative-exemplar output

                with torch.cuda.amp.autocast():
                    neg_output = model(samples, neg_boxes, 1)  # negative-exemplar output

                cnt1 = 1-torch.exp(-(torch.abs(pos_output.sum()/60 - gt_density.sum()/60).mean()))
                if neg_output.shape[0] == 0:
                    cnt2 = 0
                else:
                    # cnt2 = torch.log(torch.abs((neg_output.sum() / neg_output.shape[0]) - 1).mean()+1)
                    cnt2 = 1-torch.exp(-(torch.abs((neg_output.sum() / (neg_output.shape[0]*60)) - 1).mean()))
                cnt = cnt1+cnt2

                # compute the negative-sample loss
                mask = np.random.binomial(n=1, p=0.8, size=[384, 384])
                masks = np.tile(mask, (neg_output.shape[0], 1))
                masks = masks.reshape(neg_output.shape[0], 384, 384)
                masks = torch.from_numpy(masks).to(device)
                neg_loss = ((neg_output - gt_density) ** 2)
                if neg_output.shape[0] == 0:
                    neg_loss = 1
                else:
                    neg_loss = (neg_loss * masks / (384 * 384)).sum() / neg_output.shape[0]
                margin = 0.5
                contrastive_loss = torch.relu(pos_loss - neg_loss + margin)
                total_loss = contrastive_loss + pos_loss

                # update MAE and RMSE
                with torch.no_grad():
                    pred_cnt = (pos_output.view(len(samples), -1)).sum(1) / 60
                    gt_cnt = (gt_density.view(len(samples), -1)).sum(1) / 60
                    cnt_err = torch.abs(pred_cnt - gt_cnt).float()
                    batch_mae = cnt_err.double().mean()
                    batch_mse = (cnt_err ** 2).double().mean()

                    train_mae += batch_mae
                    train_mse += batch_mse

                if not torch.isfinite(total_loss):
                    print("Loss is {}, stopping training".format(total_loss))
                    sys.exit(1)

                total_loss /= accum_iter
                loss_scaler(total_loss, optimizer, parameters=model.parameters(),
                            update_grad=(data_iter_step + 1) % accum_iter == 0)
                if (data_iter_step + 1) % accum_iter == 0:
                    optimizer.zero_grad()

                lr = optimizer.param_groups[0]["lr"]
                loss_value_reduce = misc.all_reduce_mean(total_loss)

                if (data_iter_step + 1) % (print_freq * accum_iter) == 0 and (data_iter_step + 1) != len(data_loader_train) and data_iter_step != 0:
                    if wandb_run is not None:
                        log = {"train/loss": loss_value_reduce,
                               "train/lr": lr,
                               "train/MAE": batch_mae,
                               "train/RMSE": batch_mse ** 0.5}
                        wandb.log(log, step=idx)

            # evaluation on Validation split
            for val_samples, val_gt_density, val_n_ppl, val_boxes, _, val_pos, _, val_im_names in \
                    tqdm(data_loader_val, total=len(data_loader_val),
                         desc=f"Val [e. {epoch} - r. {global_rank}]"):

                val_samples = val_samples.to(device, non_blocking=True, dtype=torch.half)
                val_gt_density = val_gt_density.to(device, non_blocking=True, dtype=torch.half)
                val_boxes = val_boxes.to(device, non_blocking=True, dtype=torch.half)
                val_n_ppl = val_n_ppl.to(device, non_blocking=True)
                shot_num = random.randint(0, 3)

                with torch.no_grad():
                    with torch.cuda.amp.autocast():
                        val_output = model(val_samples, val_boxes, shot_num)

                    val_pred_cnt = (val_output.view(len(val_samples), -1)).sum(1) / 60
                    val_gt_cnt = (val_gt_density.view(len(val_samples), -1)).sum(1) / 60
                    # print('val_pred_cnt', val_pred_cnt)
                    # print('val_gt_cnt', val_gt_cnt)
                    val_cnt_err = torch.abs(val_pred_cnt - val_gt_cnt).float()
                    # print('val_cnt_err', val_cnt_err.mean())
                    val_cnt_err[val_cnt_err == float('inf')] = 0
                    val_mae += val_cnt_err.double().mean()

                    # val_mae += val_cnt_err
                    # print('val_mae', val_mae.mean())
                    val_cnt_err[val_cnt_err == float('inf')] = 0
                    val_mse += (val_cnt_err ** 2).double().mean()

                    # val_mse += (val_cnt_err ** 2)
                    _val_nae = val_cnt_err / val_gt_cnt
                    _val_nae[_val_nae == float('inf')] = 0
                    val_nae += _val_nae.double().mean()
            # val_mae = val_mae/len(data_loader_val)
            # val_mse = val_mse/len(data_loader_val)
            # print('val_mae', val_mae)
            # print('val_mse', val_mse)
            # Output visualisation information to W&B
            if wandb_run is not None:
                train_wandb_densities = []
                train_wandb_bboxes = []
                val_wandb_densities = []
                val_wandb_bboxes = []
                black = torch.zeros([384, 384], device=device)

                for i in range(pos_output.shape[0]):
                    # gt and predicted density
                    w_d_map = torch.stack([pos_output[i], black, black])
                    gt_map = torch.stack([gt_density[i], black, black])
                    box_map = misc.get_box_map(samples[i], pos[i], device)
                    w_gt_density = samples[i] / 2 + gt_map + box_map
                    w_d_map_overlay = samples[i] / 2 + w_d_map
                    w_densities = torch.cat([w_gt_density, w_d_map, w_d_map_overlay], dim=2)
                    w_densities = torch.clamp(w_densities, 0, 1)
                    train_wandb_densities += [wandb.Image(torchvision.transforms.ToPILImage()(w_densities),
                                                          caption=f"[E#{epoch}] {im_names[i]} ({torch.sum(gt_density[i]).item()}, {torch.sum(pos_output[i]).item()})")]

                    # exemplars
                    w_boxes = torch.cat([pos_boxes[i][x, :, :, :] for x in range(pos_boxes[i].shape[0])], 2)
                    train_wandb_bboxes += [wandb.Image(torchvision.transforms.ToPILImage()(w_boxes),
                                                       caption=f"[E#{epoch}] {im_names[i]}")]

                for i in range(val_output.shape[0]):
                    # gt and predicted density
                    w_d_map = torch.stack([val_output[i], black, black])
                    gt_map = torch.stack([val_gt_density[i], black, black])
                    box_map = misc.get_box_map(val_samples[i], val_pos[i], device)
                    w_gt_density = val_samples[i] / 2 + gt_map + box_map
                    w_d_map_overlay = val_samples[i] / 2 + w_d_map
                    w_densities = torch.cat([w_gt_density, w_d_map, w_d_map_overlay], dim=2)
                    w_densities = torch.clamp(w_densities, 0, 1)
                    val_wandb_densities += [wandb.Image(torchvision.transforms.ToPILImage()(w_densities),
                                                        caption=f"[E#{epoch}] {val_im_names[i]} ({torch.sum(val_gt_density[i]).item()}, {torch.sum(val_output[i]).item()})")]

                    # exemplars
                    w_boxes = torch.cat([val_boxes[i][x, :, :, :] for x in range(val_boxes[i].shape[0])], 2)
                    val_wandb_bboxes += [wandb.Image(torchvision.transforms.ToPILImage()(w_boxes),
                                                     caption=f"[E#{epoch}] {val_im_names[i]}")]

                log = {"train/loss": loss_value_reduce,
                       "train/lr": lr,
                       "train/MAE": batch_mae,
                       "train/RMSE": batch_mse ** 0.5,
                       "val/MAE": val_mae / len(data_loader_val),
                       "val/RMSE": (val_mse / len(data_loader_val)) ** 0.5,
                       "val/NAE": val_nae / len(data_loader_val),
                       "train_densitss": train_wandb_densities,
                       "val_densites": val_wandb_densities,
                       "train_boxes": train_wandb_bboxes,
                       "val_boxes": val_wandb_bboxes}
                wandb.log(log, step=idx)

            # save train status and model
            if args.output_dir and (epoch % save_freq == 0 or epoch + 1 == args.epochs) and epoch != 0:
                misc.save_model(
                    args=args, model=model, model_without_ddp=model_without_ddp, optimizer=optimizer,
                    loss_scaler=loss_scaler, epoch=epoch, suffix=f"finetuning_{epoch}", upload=epoch % 100 == 0)
            elif True:
                misc.save_model(
                    args=args, model=model, model_without_ddp=model_without_ddp, optimizer=optimizer,
                    loss_scaler=loss_scaler, epoch=epoch, suffix=f"finetuning_last", upload=False)
            if args.output_dir and val_mae / len(data_loader_val) < min_MAE:
                min_MAE = val_mae / len(data_loader_val)
                misc.save_model(
                    args=args, model=model, model_without_ddp=model_without_ddp, optimizer=optimizer,
                    loss_scaler=loss_scaler, epoch=epoch, suffix="finetuning_minMAE")

            print(f'[Train Epoch #{epoch}] - MAE: {train_mae.item() / len(data_loader_train):5.2f}, RMSE: {(train_mse.item() / len(data_loader_train)) ** 0.5:5.2f}', flush=True)
            print(f'[Val Epoch #{epoch}] - MAE: {val_mae.item() / len(data_loader_val):5.2f}, RMSE: {(val_mse.item() / len(data_loader_val)) ** 0.5:5.2f}, NAE: {val_nae.item() / len(data_loader_val):5.2f}', flush=True)

        total_time = time.time() - start_time
        total_time_str = str(datetime.timedelta(seconds=int(total_time)))
        print('Training time {}'.format(total_time_str))

    finally:
        if wandb_run is not None:
            wandb.run.finish()


if __name__ == '__main__':
    args = get_args_parser()
    args = args.parse_args()

    data_path = Path(args.data_path)
    anno_file = data_path / args.anno_file
    data_split_file = data_path / args.data_split_file
    im_dir = data_path / args.im_dir

    if args.do_aug:
        class_file = data_path / args.class_file
    else:
        class_file = None

    args.anno_file = anno_file
    args.data_split_file = data_split_file
    args.im_dir = im_dir
    args.class_file = class_file

    if args.output_dir:
        Path(args.output_dir).mkdir(parents=True, exist_ok=True)

    main(args)
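The training loss above combines a masked MSE on the positive-exemplar density prediction with a hinge-style term that pushes the negative-exemplar loss above the positive one. A condensed, self-contained sketch of that combination (names mirror the script; the single shared mask and the 0.5 margin are simplifications of the values hard-coded above):

import torch

def counting_loss(pos_output, neg_output, gt_density, mask, margin=0.5):
    # Masked MSE between the positive-exemplar prediction and the GT density map.
    pos_loss = (((pos_output - gt_density) ** 2) * mask / (384 * 384)).sum() / pos_output.shape[0]
    # Same masked MSE for the negative-exemplar prediction, which should stay larger than pos_loss.
    neg_loss = (((neg_output - gt_density) ** 2) * mask / (384 * 384)).sum() / neg_output.shape[0]
    # Hinge term: penalise cases where pos_loss is not at least `margin` below neg_loss.
    contrastive = torch.relu(pos_loss - neg_loss + margin)
    return contrastive + pos_loss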
FSC_test.py
ADDED
@@ -0,0 +1,352 @@
1 |
+
import argparse
|
2 |
+
import json
|
3 |
+
import numpy as np
|
4 |
+
import os
|
5 |
+
from pathlib import Path
|
6 |
+
from PIL import Image, ImageDraw
|
7 |
+
import matplotlib.pyplot as plt
|
8 |
+
import scipy.ndimage as ndimage
|
9 |
+
import pandas as pd
|
10 |
+
import random
|
11 |
+
import torch
|
12 |
+
import torch.nn as nn
|
13 |
+
import torch.backends.cudnn as cudnn
|
14 |
+
from torch.utils.data import Dataset
|
15 |
+
import torchvision
|
16 |
+
from torchvision import transforms
|
17 |
+
import torchvision.transforms.functional as TF
|
18 |
+
import timm
|
19 |
+
from util.FSC147 import transform_train, transform_val
|
20 |
+
from tqdm import tqdm
|
21 |
+
assert "0.4.5" <= timm.__version__ <= "0.4.9" # version check
|
22 |
+
|
23 |
+
import util.misc as misc
|
24 |
+
import models_mae_cross
|
25 |
+
|
26 |
+
|
27 |
+
def get_args_parser():
|
28 |
+
parser = argparse.ArgumentParser('MAE pre-training', add_help=False)
|
29 |
+
|
30 |
+
# Model parameters
|
31 |
+
parser.add_argument('--model', default='mae_vit_base_patch16', type=str, metavar='MODEL',
|
32 |
+
help='Name of model to train')
|
33 |
+
parser.add_argument('--mask_ratio', default=0.5, type=float,
|
34 |
+
help='Masking ratio (percentage of removed patches).')
|
35 |
+
parser.add_argument('--norm_pix_loss', action='store_true',
|
36 |
+
help='Use (per-patch) normalized pixels as targets for computing loss')
|
37 |
+
parser.set_defaults(norm_pix_loss=False)
|
38 |
+
|
39 |
+
# Dataset parameters
|
40 |
+
parser.add_argument('--data_path', default='./data/FSC147/', type=str,
|
41 |
+
help='dataset path')
|
42 |
+
parser.add_argument('--anno_file', default='annotation_FSC147_positive.json', type=str,
|
43 |
+
help='annotation json file')
|
44 |
+
parser.add_argument('--anno_file_negative', default='./data/FSC147/annotation_FSC147_neg2.json', type=str,
|
45 |
+
help='annotation json file')
|
46 |
+
parser.add_argument('--data_split_file', default='Train_Test_Val_FSC_147.json', type=str,
|
47 |
+
help='data split json file')
|
48 |
+
parser.add_argument('--im_dir', default='images_384_VarV2', type=str,
|
49 |
+
help='images directory')
|
50 |
+
parser.add_argument('--output_dir', default='./Image',
|
51 |
+
help='path where to save, empty for no saving')
|
52 |
+
parser.add_argument('--device', default='cuda',
|
53 |
+
help='device to use for training / testing')
|
54 |
+
parser.add_argument('--seed', default=0, type=int)
|
55 |
+
parser.add_argument('--resume', default='./output_fim6_dir/checkpoint-0.pth',
|
56 |
+
help='resume from checkpoint')
|
57 |
+
parser.add_argument('--external', action='store_true',
|
58 |
+
help='Set this param for using external exemplars')
|
59 |
+
parser.add_argument('--box_bound', default=-1, type=int,
|
60 |
+
help='The max number of exemplars to be considered')
|
61 |
+
parser.add_argument('--split', default="test", type=str)
|
62 |
+
|
63 |
+
# Training parameters
|
64 |
+
parser.add_argument('--num_workers', default=0, type=int)
|
65 |
+
parser.add_argument('--pin_mem', action='store_true',
|
66 |
+
help='Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.')
|
67 |
+
parser.add_argument('--no_pin_mem', action='store_false', dest='pin_mem')
|
68 |
+
parser.set_defaults(pin_mem=True)
|
69 |
+
parser.add_argument('--normalization', default=True, help='Set to False to disable test-time normalization')
|
70 |
+
|
71 |
+
# Distributed training parameters
|
72 |
+
parser.add_argument('--world_size', default=1, type=int,
|
73 |
+
help='number of distributed processes')
|
74 |
+
parser.add_argument('--local_rank', default=-1, type=int)
|
75 |
+
parser.add_argument('--dist_on_itp', action='store_true')
|
76 |
+
parser.add_argument('--dist_url', default='env://',
|
77 |
+
help='url used to set up distributed training')
|
78 |
+
|
79 |
+
return parser
|
80 |
+
|
81 |
+
os.environ["CUDA_LAUNCH_BLOCKING"] = '5'
|
82 |
+
|
83 |
+
class TestData(Dataset):
|
84 |
+
def __init__(self, args, split='val', do_aug=True):
|
85 |
+
with open(data_path/args.anno_file) as f:
|
86 |
+
annotations = json.load(f)
|
87 |
+
# Load negative annotations
|
88 |
+
with open(args.anno_file_negative) as f:
|
89 |
+
neg_annotations = json.load(f)
|
90 |
+
with open(data_path/args.data_split_file) as f:
|
91 |
+
data_split = json.load(f)
|
92 |
+
|
93 |
+
self.img = data_split[split]
|
94 |
+
random.shuffle(self.img)
|
95 |
+
self.split = split
|
96 |
+
self.img_dir = im_dir
|
97 |
+
# self.TransformTrain = transform_train(args, do_aug=do_aug)
|
98 |
+
self.TransformVal = transform_val(args)
|
99 |
+
self.annotations = annotations
|
100 |
+
self.neg_annotations = neg_annotations
|
101 |
+
self.im_dir = im_dir
|
102 |
+
|
103 |
+
def __len__(self):
|
104 |
+
return len(self.img)
|
105 |
+
|
106 |
+
def __getitem__(self, idx):
|
107 |
+
im_id = self.img[idx]
|
108 |
+
anno = self.annotations[im_id]
|
109 |
+
bboxes = anno['box_examples_coordinates']
|
110 |
+
dots = np.array(anno['points'])
|
111 |
+
|
112 |
+
# 加载负样本的框
|
113 |
+
neg_anno = self.neg_annotations[im_id] # 假设每个图像ID在负样本注释中都有对应的条目
|
114 |
+
neg_bboxes = neg_anno['box_examples_coordinates']
|
115 |
+
|
116 |
+
rects = list()
|
117 |
+
for bbox in bboxes:
|
118 |
+
x1 = bbox[0][0]
|
119 |
+
y1 = bbox[0][1]
|
120 |
+
x2 = bbox[2][0]
|
121 |
+
y2 = bbox[2][1]
|
122 |
+
if x1 < 0:
|
123 |
+
x1 = 0
|
124 |
+
if x2 < 0:
|
125 |
+
x2 = 0
|
126 |
+
if y1 < 0:
|
127 |
+
y1 = 0
|
128 |
+
if y2 < 0:
|
129 |
+
y2 = 0
|
130 |
+
|
131 |
+
rects.append([y1, x1, y2, x2])
|
132 |
+
neg_rects = list()
|
133 |
+
for neg_bbox in neg_bboxes:
|
134 |
+
x1 = neg_bbox[0][0]
|
135 |
+
y1 = neg_bbox[0][1]
|
136 |
+
x2 = neg_bbox[2][0]
|
137 |
+
y2 = neg_bbox[2][1]
|
138 |
+
if x1 < 0:
|
139 |
+
x1 = 0
|
140 |
+
if x2 < 0:
|
141 |
+
x2 = 0
|
142 |
+
if y1 < 0:
|
143 |
+
y1 = 0
|
144 |
+
if y2 < 0:
|
145 |
+
y2 = 0
|
146 |
+
|
147 |
+
neg_rects.append([y1, x1, y2, x2])
|
148 |
+
|
149 |
+
image = Image.open('{}/{}'.format(self.im_dir, im_id))
|
150 |
+
if image.mode == "RGBA":
|
151 |
+
image = image.convert("RGB")
|
152 |
+
image.load()
|
153 |
+
m_flag = 0
|
154 |
+
|
155 |
+
sample = {'image': image, 'lines_boxes': rects,'neg_lines_boxes': neg_rects, 'dots': dots, 'id': im_id, 'm_flag': m_flag}
|
156 |
+
sample = self.TransformTrain(sample) if self.split == "train" else self.TransformVal(sample)
|
157 |
+
# if self.split == "train":
|
158 |
+
# sample = self.TransformTrain(sample)
|
159 |
+
# # print(sample.keys())
|
160 |
+
return sample['image'], sample['gt_density'], len(dots), sample['boxes'], sample['neg_boxes'], sample['pos'],sample['m_flag'], im_id
|
161 |
+
|
162 |
+
def batched_rmse(predictions, targets, batch_size=100):
|
163 |
+
"""
|
164 |
+
分批计算RMSE
|
165 |
+
:param predictions: 模型预测的值,一个PyTorch张量
|
166 |
+
:param targets: 真实的值,一个PyTorch张量,与predictions形状相同
|
167 |
+
:param batch_size: 每个批次的大小
|
168 |
+
:return: RMSE值
|
169 |
+
"""
|
170 |
+
total_mse = 0.0
|
171 |
+
total_count = 0
|
172 |
+
|
173 |
+
# 分批处理
|
174 |
+
    for i in range(0, len(predictions), batch_size):
        batch_predictions = predictions[i:i+batch_size]
        batch_targets = targets[i:i+batch_size]

        # Use float64 for the computation to improve precision
        batch_predictions = batch_predictions.double()
        batch_targets = batch_targets.double()

        # Compute the MSE for this batch
        difference = batch_predictions - batch_targets
        mse = torch.mean(difference ** 2)

        # Accumulate the MSE and the sample count
        total_mse += mse * len(batch_predictions)
        total_count += len(batch_predictions)

    # Compute the average MSE
    avg_mse = total_mse / total_count

    # Compute the RMSE
    rmse_val = torch.sqrt(avg_mse)

    return rmse_val

def batched_mae(predictions, targets, batch_size=100):
    """
    Compute the MAE in batches.
    :param predictions: model predictions, a PyTorch tensor
    :param targets: ground-truth values, a PyTorch tensor with the same shape as predictions
    :param batch_size: size of each batch
    :return: the MAE value
    """
    total_mae = 0.0
    total_count = 0

    # Process in batches
    for i in range(0, len(predictions), batch_size):
        batch_predictions = predictions[i:i+batch_size]
        batch_targets = targets[i:i+batch_size]

        # Compute the absolute errors for this batch
        absolute_errors = torch.abs(batch_predictions - batch_targets)

        # Accumulate the absolute errors and the sample count
        total_mae += torch.sum(absolute_errors)
        total_count += len(batch_predictions)

    # Compute the mean absolute error
    avg_mae = total_mae / total_count

    return avg_mae

def main(args):
    misc.init_distributed_mode(args)

    print('job dir: {}'.format(os.path.dirname(os.path.realpath(__file__))))
    print("{}".format(args).replace(', ', ',\n'))

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + misc.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)

    cudnn.benchmark = True

    # dataset_test = TestData(external=args.external, box_bound=args.box_bound, split=args.split)
    dataset_test = TestData(args, split='test')
    num_tasks = misc.get_world_size()
    global_rank = misc.get_rank()
    sampler_test = torch.utils.data.DistributedSampler(
        dataset_test, num_replicas=num_tasks, rank=global_rank, shuffle=True
    )

    data_loader_test = torch.utils.data.DataLoader(
        dataset_test, sampler=sampler_test,
        batch_size=1,
        num_workers=args.num_workers,
        pin_memory=args.pin_mem,
        drop_last=False,
    )

    # define the model
    model = models_mae_cross.__dict__[args.model](norm_pix_loss=args.norm_pix_loss)
    model.to(device)
    model_without_ddp = model

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], find_unused_parameters=True)
        model_without_ddp = model.module

    misc.load_model_FSC(args=args, model_without_ddp=model_without_ddp)

    print("Start testing.")

    # test
    model.eval()

    # counters carried over from the training script
    train_mae = 0
    train_rmse = 0
    train_nae = 0
    tot_load_time = 0
    tot_infer_time = 0

    loss_array = []
    gt_array = []
    pred_arr = []
    name_arr = []
    empties = []

    total_mae = 0.0
    total_mse = 0.0
    total_nae = 0.0
    total_count = 0
    sub_batch_size = 50
    for val_samples, val_gt_density, val_n_ppl, val_boxes, neg_val_boxes, val_pos, _, val_im_names in tqdm(data_loader_test, total=len(data_loader_test), desc="Validation"):
        val_samples = val_samples.to(device, non_blocking=True, dtype=torch.float)  # use higher precision
        val_gt_density = val_gt_density.to(device, non_blocking=True, dtype=torch.float)
        val_boxes = val_boxes.to(device, non_blocking=True, dtype=torch.float)
        neg_val_boxes = neg_val_boxes.to(device, non_blocking=True, dtype=torch.float)
        num_samples = val_samples.size(0)
        total_count += num_samples

        for i in range(0, num_samples, sub_batch_size):
            sub_val_samples = val_samples[i:i+sub_batch_size]
            sub_val_gt_density = val_gt_density[i:i+sub_batch_size]

            with torch.no_grad():
                with torch.cuda.amp.autocast():
                    sub_val_output = model(sub_val_samples, val_boxes[i:i+sub_batch_size], 3)
            with torch.no_grad():
                with torch.cuda.amp.autocast():
                    neg_sub_val_output = model(sub_val_samples, neg_val_boxes[i:i+sub_batch_size], 3)
            # output = torch.clamp((sub_val_output - neg_sub_val_output), min=0)
            sub_val_pred_cnt = torch.abs(sub_val_output.sum()) / 60
            # sub_val_pred_cnt = torch.abs(output.sum()) / 60
            # neg_sub_val_pred_cnt = torch.abs(neg_sub_val_output.sum()) / 60
            sub_val_gt_cnt = sub_val_gt_density.sum() / 60

            sub_val_cnt_err = torch.abs(sub_val_pred_cnt - sub_val_gt_cnt)

            # Accumulate item by item, skipping invalid values
            if not torch.isinf(sub_val_cnt_err) and not torch.isnan(sub_val_cnt_err):
                batch_mae = sub_val_cnt_err.item()
                batch_mse = sub_val_cnt_err.item() ** 2
                batch_nae = sub_val_cnt_err.item() / sub_val_gt_cnt.item() if sub_val_gt_cnt.item() != 0 else 0

                total_mae += batch_mae * sub_val_samples.size(0)
                total_mse += batch_mse * sub_val_samples.size(0)
                total_nae += batch_nae * sub_val_samples.size(0)
            sub_val_pred_cnt = sub_val_pred_cnt.int()
    final_mae = total_mae / total_count
    final_rmse = (total_mse / total_count) ** 0.5
    final_nae = total_nae / total_count

    print(f'MAE: {final_mae}, RMSE: {final_rmse}, NAE: {final_nae}')


if __name__ == '__main__':
    args = get_args_parser()
    args = args.parse_args()

    # load data
    data_path = Path(args.data_path)
    anno_file = data_path / args.anno_file
    data_split_file = data_path / args.data_split_file
    im_dir = data_path / args.im_dir

    with open(anno_file) as f:
        annotations = json.load(f)

    with open(data_split_file) as f:
        data_split = json.load(f)

    if args.output_dir:
        Path(args.output_dir).mkdir(parents=True, exist_ok=True)
    main(args)
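To make the two metric helpers above easier to verify, here is a minimal usage sketch; it is not part of the uploaded files and assumes `batched_rmse` and `batched_mae` are available in the current session (for example, pasted alongside the definitions above).

```python
# Minimal sketch (not repository code): sanity-check the batched metrics on random tensors.
import torch

preds = torch.rand(1000) * 100      # fake predicted counts
gts = preds + torch.randn(1000)     # fake ground-truth counts with small noise

rmse = batched_rmse(preds, gts, batch_size=100)
mae = batched_mae(preds, gts, batch_size=100)
print(f"RMSE: {rmse.item():.4f}, MAE: {mae.item():.4f}")  # both should be close to 1.0 here
```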
LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2022 Chang Liu

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
README.md
CHANGED
@@ -1,3 +1,100 @@
# VA-Count
[ECCV 2024] Zero-shot Object Counting with Good Exemplars
[[paper](https://arxiv.org/abs/2407.04948)]

# Zero-shot Object Counting with Good Exemplars
## News🚀
* **2024.09.27**: Our code is released.
* **2024.09.26**: Our inference code has been updated; the exemplar-selection and training code will follow soon.
* **2024.07.02**: VA-Count is accepted by ECCV 2024.
## Overview
The proposed method has two main components: the Exemplar Enhancement Module (EEM), which improves exemplar quality through patch selection integrated with Grounding DINO, and the Noise Suppression Module (NSM), which separates positive from negative class samples using density maps. A contrastive loss refines the ability to distinguish target-class objects from other objects in an image.
## Environment
```
pip install torch==1.10.0+cu111 torchvision==0.11.0+cu111 torchaudio==0.10.0 -f https://download.pytorch.org/whl/torch_stable.html
pip install timm==0.3.2
pip install numpy
pip install matplotlib tqdm
pip install tensorboard
pip install scipy
pip install imgaug
pip install opencv-python
pip3 install hub
```
### For more information on Grounding DINO, please refer to the following link:
[GroundingDINO](https://github.com/IDEA-Research/GroundingDINO)
We are very grateful for the Grounding DINO approach, which has been instrumental in our work!

## Datasets

* [FSC147](https://github.com/cvlab-stonybrook/LearningToCountEverything)

* [CARPK](https://lafi.github.io/LPN/)

Prepare the datasets as follows:

```
./data/
|--FSC147
|  |--images_384_VarV2
|  |  |--2.jpg
|  |  |--3.jpg
|  |--gt_density_map_adaptive_384_VarV2
|  |  |--2.npy
|  |  |--3.npy
|  |--annotation_FSC147_384.json
|  |--Train_Test_Val_FSC_147.json
|  |--ImageClasses_FSC147.txt
|  |--train.txt
|  |--test.txt
|  |--val.txt
|--CARPK/
|  |--Annotations/
|  |--Images/
|  |--ImageSets/
```
## Inference
+ For inference, you can download the model from [Baidu-Disk](https://pan.baidu.com/s/11sbdDYLDfTOIPx5pZvBpmw?pwd=paeh), password: paeh
```
python FSC_test.py --output_dir ./data/out/results_base --resume ./data/checkpoint_FSC.pth
```
## Single and Multiple Object Classifier Training
```
python datasetmake.py
python biclassify.py
```
+ You can also directly download the classifier from [Baidu-Disk](https://pan.baidu.com/s/1fOF0giI3yQpvGTiNFUI7cQ?pwd=psum), password: psum. Save it in ./data/out/classify/
## Generate exemplars
```
python grounding_pos.py --root_path ./data/FSC147/
python grounding_neg.py --root_path ./data/FSC147/
```

## Train

```
CUDA_VISIBLE_DEVICES=0 python FSC_pretrain.py \
    --epochs 500 \
    --warmup_epochs 10 \
    --blr 1.5e-4 --weight_decay 0.05
```
+ You can also directly download the pre-trained model from [Baidu-Disk](https://pan.baidu.com/s/1_-w_9I4bPA66pMZkHTrdrg?pwd=xynw), password: xynw. Save it in ./data/
```
CUDA_VISIBLE_DEVICES=0 python FSC_train.py --epochs 1000 --batch_size 8 --lr 1e-5 --output_dir ./data/out/
```

## Citation

```
@inproceedings{zhu2024zero,
  title={Zero-shot Object Counting with Good Exemplars},
  author={Zhu, Huilin and Yuan, Jingling and Yang, Zhengwei and Guo, Yu and Wang, Zheng and Zhong, Xian and He, Shengfeng},
  booktitle={Proceedings of the European Conference on Computer Vision},
  year={2024}
}
```

## Acknowledgement
This project is based on the implementation from [CounTR](https://github.com/Verg-Avesta/CounTR); we are very grateful for that work and for [GroundingDINO](https://github.com/IDEA-Research/GroundingDINO).

#### If you have any questions, please get in touch with me ([email protected]).
__pycache__/models_crossvit.cpython-38.pyc
ADDED
Binary file (6.28 kB).
__pycache__/models_mae_cross.cpython-38.pyc
ADDED
Binary file (6.69 kB).
__pycache__/models_mae_noct.cpython-38.pyc
ADDED
Binary file (7.03 kB).
__pycache__/models_mae_noct.cpython-39.pyc
ADDED
Binary file (6.96 kB).
biclassify.py
ADDED
@@ -0,0 +1,163 @@
import pandas as pd
import os
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import Compose, Resize, Normalize, ToTensor
from PIL import Image
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
import clip
import re
import torchvision.models as models

# 1. Read the data and preprocess
def read_label_file(file_path):
    data = []
    with open(file_path, 'r') as f:
        for line in f.readlines():
            image_name, label = line.strip().split(',')
            data.append([image_name, 1 if label == 'one' else 0])
    return pd.DataFrame(data, columns=['image', 'label'])

# Read the image names listed in train.txt
with open('./data/FSC147/train.txt', 'r') as file:
    a_txt_images = file.read().splitlines()

# Extract the number before ".jpg"
a_txt_numbers = set([name.split('.')[0] for name in a_txt_images])

# Read image names and labels from labels.txt
with open('./data/FSC147/one/labels.txt', 'r') as file:
    label_txt_lines = file.read().splitlines()

# Keep only the images that also appear in train.txt
filtered_images = []
for line in label_txt_lines:
    image_name, label = line.strip().split(',')
    # Match the leading digits with a regular expression
    match = re.match(r'(\d+)', image_name)
    if match:
        image_number = match.group(1)
        if image_number in a_txt_numbers:
            # Convert the 'label' value
            label_value = 1 if label == 'one' else 0
            filtered_images.append([image_name, label_value])  # a list, to match the output of read_label_file

# Convert the filtered images and labels to a DataFrame with the same columns as read_label_file
df_filtered = pd.DataFrame(filtered_images, columns=['image', 'label'])

# Custom Dataset class
class CustomDataset(Dataset):
    def __init__(self, dataframe, root_dir, transform=None):
        self.dataframe = dataframe
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        img_name = os.path.join(self.root_dir, self.dataframe.iloc[idx, 0])
        image = Image.open(img_name).convert('RGB')
        label = self.dataframe.iloc[idx, 1]
        if self.transform:
            image = self.transform(image)
        return image, label

# 2. Dataset split
data_folder = './data/FSC147/one'
label_file = os.path.join(data_folder, 'labels.txt')
# df = read_label_file(label_file)
df = df_filtered
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# 3. Data loading
transform = Compose([
    Resize((224, 224)),
    ToTensor(),
    Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
])

train_dataset = CustomDataset(train_df, data_folder, transform=transform)
test_dataset = CustomDataset(test_df, data_folder, transform=transform)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# 4. Model definition
class ClipClassifier(nn.Module):
    def __init__(self, clip_model, embed_dim=512):
        super(ClipClassifier, self).__init__()
        self.clip_model = clip_model
        # Freeze the CLIP model parameters
        for param in self.clip_model.parameters():
            param.requires_grad = False
        self.fc = nn.Linear(clip_model.visual.output_dim, embed_dim)
        self.classifier = nn.Linear(embed_dim, 2)  # binary classification

    def forward(self, images):
        with torch.no_grad():
            image_features = self.clip_model.encode_image(images).float()
        x = self.fc(image_features)
        x = F.relu(x)
        logits = self.classifier(x)
        return logits

class ResNetClassifier(nn.Module):
    def __init__(self, num_classes=2):
        super(ResNetClassifier, self).__init__()
        # Load a pretrained ResNet50
        self.resnet50 = models.resnet50(pretrained=True)
        # Freeze all pretrained layers
        for param in self.resnet50.parameters():
            param.requires_grad = False
        # Replace the final fully connected layer for the binary classification task
        num_ftrs = self.resnet50.fc.in_features
        self.resnet50.fc = nn.Linear(num_ftrs, num_classes)

    def forward(self, images):
        return self.resnet50(images)

# 5. Training and testing
device = torch.device("cuda:5" if torch.cuda.is_available() else "cpu")
clip_model, _ = clip.load("ViT-B/32", device=device)
# model = ClipClassifier(clip_model).to(device)
model = ResNetClassifier().to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

def train(model, device, train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % 10 == 0:
            print(f'Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} ({100. * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss.item():.6f}')

def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += criterion(output, target).item()
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
    test_loss /= len(test_loader.dataset)
    print(f'\nTest set: Average loss: {test_loss:.4f}, Accuracy: {correct}/{len(test_loader.dataset)} ({100. * correct / len(test_loader.dataset):.0f}%)\n')
    return 100. * correct / len(test_loader.dataset)

best_accuracy = 0.0
for epoch in range(1, 11):
    train(model, device, train_loader, optimizer, epoch)
    accuracy = test(model, device, test_loader)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        torch.save(model.state_dict(), './data/out/classify/best_model.pth')
        print(f'Best model saved with accuracy: {best_accuracy:.2f}%')
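As a minimal sketch of how the saved checkpoint could be reused later (not part of the uploaded files), the snippet below reloads `best_model.pth` into the `ResNetClassifier` defined above and scores a single crop; it assumes the `ResNetClassifier` class and `transform` from biclassify.py are available in the session, and the image path is a placeholder.

```python
# Minimal sketch (assumption, not repository code): reload the saved binary
# classifier and score one image crop. Class index 1 corresponds to the
# 'one' (single-object) label used when building labels.txt.
import torch
from PIL import Image

clf = ResNetClassifier()
clf.load_state_dict(torch.load('./data/out/classify/best_model.pth', map_location='cpu'))
clf.eval()

crop = Image.open('./data/FSC147/one/2.jpg').convert('RGB')  # placeholder path
with torch.no_grad():
    probs = torch.softmax(clf(transform(crop).unsqueeze(0)), dim=1)
print('P(single object) =', probs[0, 1].item())
```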
datasetmake.py
ADDED
@@ -0,0 +1,53 @@
from PIL import Image
import os
import random

def is_image_file(filename):
    """Check whether a file is an image file."""
    image_extensions = ['.jpg', '.jpeg', '.png', '.bmp', '.gif']  # supported image file extensions
    return any(filename.lower().endswith(ext) for ext in image_extensions)

def random_crop(img, size=(256, 256)):
    """Randomly crop a region of the given size from the image."""
    width, height = img.size
    crop_width, crop_height = size

    if width < crop_width or height < crop_height:
        return None  # return None if the image is smaller than the crop size

    x_left = random.randint(0, width - crop_width)
    y_upper = random.randint(0, height - crop_height)

    return img.crop((x_left, y_upper, x_left + crop_width, y_upper + crop_height))

# Folder paths (adjust to your setup)
single_object_folder = './data/FSC147/box'
multiple_objects_folder = './data/FSC147/images_384_VarV2'
output_folder = './data/FSC147/one'

# Make sure the output folder exists
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

output_txt_path = os.path.join(output_folder, 'labels.txt')
with open(output_txt_path, 'w') as f:
    for folder, label in [(single_object_folder, 'one'), (multiple_objects_folder, 'more')]:
        for filename in os.listdir(folder):
            if is_image_file(filename):  # only process image files
                img_path = os.path.join(folder, filename)
                img = Image.open(img_path)

                # Save the original image and record it in the txt file
                original_img_output_path = os.path.join(output_folder, filename)
                img.save(original_img_output_path)
                f.write(f"{filename},{label}\n")

                # Randomly crop the original image and save the crops
                for size in [(256, 384), (256, 256), (384, 384), (128, 256), (256, 128)]:
                    img_cropped = random_crop(img, size=size)
                    if img_cropped:
                        cropped_img_output_path = os.path.join(output_folder, f"{filename[:-4]}_random_{size[0]}x{size[1]}.jpg")
                        img_cropped.save(cropped_img_output_path)
                        f.write(f"{filename[:-4]}_random_{size[0]}x{size[1]}.jpg,{label}\n")

print("Dataset preparation complete.")
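A minimal sketch of `random_crop` in isolation (not part of the uploaded files), assuming the function defined above is available in the session; it uses a synthetic blank image so it runs without the dataset.

```python
# Minimal sketch (assumption, not repository code): exercise random_crop on a blank image.
from PIL import Image

img = Image.new('RGB', (512, 384), color=(128, 128, 128))
crop = random_crop(img, size=(256, 256))
print(crop.size)                            # (256, 256)

too_big = random_crop(img, size=(1024, 1024))
print(too_big)                              # None: requested crop larger than the image
```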
figure.png
ADDED
grounding_neg.py
ADDED
@@ -0,0 +1,188 @@
import torch
import os
import inflect
import argparse
from GroundingDINO.groundingdino.util.inference import load_model, load_image, predict
from PIL import Image
import numpy as np
from torchvision.ops import box_convert
import json
import torch.nn as nn
import torch.nn.functional as F
import clip

# Global settings
device = "cuda" if torch.cuda.is_available() else "cpu"

# Thresholds
BOX_THRESHOLD = 0.02
TEXT_THRESHOLD = 0.02
BOX_THRESHOLD_class = 0.01
TEXT_THRESHOLD_class = 0.01

# Initialize the inflect engine
p = inflect.engine()

# Convert a word to its singular form
def to_singular(word):
    singular_word = p.singular_noun(word)
    return singular_word if singular_word else word

# ClipClassifier definition
class ClipClassifier(nn.Module):
    def __init__(self, clip_model, embed_dim=512):
        super(ClipClassifier, self).__init__()
        self.clip_model = clip_model.to(device)
        for param in self.clip_model.parameters():
            param.requires_grad = False
        self.fc = nn.Linear(clip_model.visual.output_dim, embed_dim)
        self.classifier = nn.Linear(embed_dim, 2)  # binary classification

    def forward(self, images):
        with torch.no_grad():
            image_features = self.clip_model.encode_image(images).float().to(device)
        x = self.fc(image_features)
        x = F.relu(x)
        logits = self.classifier(x)
        return logits

# Initialize and load the binary classifier
clip_model, preprocess = clip.load("ViT-B/32", device)
binary_classifier = ClipClassifier(clip_model).to(device)

# Load the saved weights
model_weights_path = './data/out/classify/best_model.pth'
binary_classifier.load_state_dict(torch.load(model_weights_path, map_location=device))

# Make sure the model is in evaluation mode
binary_classifier.eval()

# Compute the IoU of two bounding boxes
def calculate_iou(box1, box2):
    x1, y1, w1, h1 = box1
    x2, y2, w2, h2 = box2

    intersection_x1 = max(x1, x2)
    intersection_y1 = max(y1, y2)
    intersection_x2 = min(x1 + w1, x2 + w2)
    intersection_y2 = min(y1 + h1, y2 + h2)

    intersection_area = max(intersection_x2 - intersection_x1, 0) * max(intersection_y2 - intersection_y1, 0)
    box1_area = w1 * h1
    box2_area = w2 * h2
    union_area = box1_area + box2_area - intersection_area
    iou = intersection_area / union_area if union_area > 0 else 0

    return iou

# Check whether a patch is valid
def is_valid_patch(patch, binary_classifier, preprocess, device):
    if patch.size[0] <= 0 or patch.size[1] <= 0:
        return False

    patch_tensor = preprocess(patch).unsqueeze(0).to(device)
    with torch.no_grad():
        logits = binary_classifier(patch_tensor)
        probabilities = torch.softmax(logits, dim=1)
        prob_label_1 = probabilities[0, 1]
    return prob_label_1.item() > 0.8

# Main image-processing function
def process_images(text_file_path, dataset_path, model, preprocess, binary_classifier, output_folder, device='cpu'):
    boxes_dict = {}

    with open(text_file_path, 'r') as f:
        for line in f:
            image_name, class_name = line.strip().split('\t')
            print(f"Processing image: {image_name}")
            text_prompt = class_name + ' .'
            object_prompt = "object ."
            image_path = os.path.join(dataset_path, image_name)
            img = Image.open(image_path).convert("RGB")
            image_source, image = load_image(image_path)
            h, w, _ = image_source.shape
            boxes_object, logits_object, _ = predict(model, image, object_prompt, BOX_THRESHOLD, TEXT_THRESHOLD)
            boxes_class, logits_class, _ = predict(model, image, text_prompt, BOX_THRESHOLD_class, TEXT_THRESHOLD_class)

            patches_object = box_convert(boxes_object, in_fmt="cxcywh", out_fmt="xyxy")
            patches_class = box_convert(boxes_class, in_fmt="cxcywh", out_fmt="xyxy")

            top_patches = []
            iou_matrix = np.zeros((len(boxes_object), len(boxes_class)))

            for j, box_class in enumerate(patches_class):
                box_object_class = box_class.cpu().numpy() * np.array([w, h, w, h], dtype=np.float32)
                x1_, y1_, x2_, y2_ = box_object_class.astype(int)
                x1_, y1_, x2_, y2_ = max(x1_, 0), max(y1_, 0), min(x2_, w), min(y2_, h)
                patch_ = img.crop((x1_, y1_, x2_, y2_))
                if x2_ - x1_ > w / 2 or y2_ - y1_ > h / 2 or not is_valid_patch(patch_, binary_classifier, preprocess, device):
                    print(f"Skipping patch at box {box_class}")
                    continue
                for i, box_object in enumerate(patches_object):
                    iou_matrix[i][j] = calculate_iou(box_object.cpu().numpy(), box_class.cpu().numpy())

            for i, box_object in enumerate(patches_object):
                max_iou = np.max(iou_matrix[i])
                if max_iou < 0.5:
                    box_object = box_object.cpu().numpy() * np.array([w, h, w, h], dtype=np.float32)
                    x1, y1, x2, y2 = box_object.astype(int)
                    x1, y1, x2, y2 = max(x1, 0), max(y1, 0), min(x2, w), min(y2, h)
                    patch = img.crop((x1, y1, x2, y2))
                    if patch.size == (0, 0) or not is_valid_patch(patch, binary_classifier, preprocess, device) or x2 - x1 > w / 2 or y2 - y1 > h / 2 or y2 - y1 < 5 or x2 - x1 < 5:
                        print(f"Skipping patch at box {box_object}")
                        continue
                    patch_logits = logits_object[i]
                    top_patches.append((i, patch_logits.item()))

            top_patches.sort(key=lambda x: x[1], reverse=True)
            top_3_indices = [patch[0] for patch in top_patches[:3]]

            while len(top_3_indices) < 3:
                if len(top_3_indices) > 0:
                    top_3_indices.append(top_3_indices[-1])
                else:
                    default_box = torch.tensor([0, 0, 20 / w, 20 / h]).unsqueeze(0)
                    patches_object = torch.cat((patches_object, default_box.to(boxes_object.device)), dim=0)
                    top_3_indices.append(len(patches_object) - 1)

            boxes_dict[image_name] = [patches_object[idx].cpu().numpy().tolist() * np.array([w, h, w, h], dtype=np.float32) for idx in top_3_indices]

    return boxes_dict

def main(args):
    # Fixed default paths
    model_config = "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py"
    model_weights = "GroundingDINO/weights/groundingdino_swint_ogc.pth"

    # Paths derived from root_path
    text_file_path = os.path.join(args.root_path, "ImageClasses_FSC147.txt")
    dataset_path = os.path.join(args.root_path, "images_384_VarV2")
    input_json_path = os.path.join(args.root_path, "annotation_FSC147_384.json")
    output_json_path = os.path.join(args.root_path, "annotation_FSC147_neg.json")
    output_folder = os.path.join(args.root_path, "annotated_images_n")

    os.makedirs(output_folder, exist_ok=True)

    # Load the GroundingDINO model
    model = load_model(model_config, model_weights, device=device)

    # Process the images and generate bounding boxes
    boxes_dict = process_images(text_file_path, dataset_path, model, preprocess, binary_classifier, output_folder, device=device)

    # Update the JSON file
    with open(input_json_path, 'r') as f:
        data = json.load(f)

    for image_name, boxes in boxes_dict.items():
        if image_name in data:
            new_boxes = [[[x1, y1], [x1, y2], [x2, y2], [x2, y1]] for x1, y1, x2, y2 in boxes]
            data[image_name]["box_examples_coordinates"] = new_boxes

    with open(output_json_path, 'w') as f:
        json.dump(data, f, indent=4)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Image Processing Script")
    parser.add_argument("--root_path", type=str, required=True, help="Root path to the dataset and output files")
    args = parser.parse_args()
    main(args)
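A minimal sketch (not part of the uploaded files) that checks `calculate_iou` from the script above on hand-computed boxes; it assumes the function is available in the session and uses the (x, y, w, h) box layout its signature unpacks.

```python
# Minimal sketch (assumption, not repository code): quick check of calculate_iou
# with boxes given as (x, y, w, h).
box_a = (0, 0, 10, 10)    # 10x10 box at the origin
box_b = (5, 5, 10, 10)    # same size, shifted by 5 in x and y

print(calculate_iou(box_a, box_b))  # intersection 25 / union 175 ≈ 0.1429
print(calculate_iou(box_a, box_a))  # identical boxes -> 1.0
```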
grounding_pos.py
ADDED
@@ -0,0 +1,141 @@
import torch
import os
import clip
import inflect
import argparse
from torchvision.ops import box_convert
from GroundingDINO.groundingdino.util.inference import load_model, load_image, predict
from PIL import Image
import numpy as np
import json
import torch.nn as nn
import torch.nn.functional as F

# Global settings
device = "cuda" if torch.cuda.is_available() else "cpu"
BOX_THRESHOLD = 0.05
TEXT_THRESHOLD = 0.05

# Initialize the inflect engine
p = inflect.engine()

# ClipClassifier definition
class ClipClassifier(nn.Module):
    def __init__(self, clip_model, embed_dim=512):
        super(ClipClassifier, self).__init__()
        self.clip_model = clip_model.to(device)
        for param in self.clip_model.parameters():
            param.requires_grad = False
        self.fc = nn.Linear(clip_model.visual.output_dim, embed_dim)
        self.classifier = nn.Linear(embed_dim, 2)  # binary classification

    def forward(self, images):
        with torch.no_grad():
            image_features = self.clip_model.encode_image(images).float().to(device)
        x = self.fc(image_features)
        x = F.relu(x)
        logits = self.classifier(x)
        return logits

# Load the CLIP model
clip_model, preprocess = clip.load("ViT-B/32", device)
clip_model.eval()

# Initialize and load the binary classifier
binary_classifier = ClipClassifier(clip_model).to(device)
model_weights_path = './data/out/classify/best_model.pth'
binary_classifier.load_state_dict(torch.load(model_weights_path, map_location=device))
binary_classifier.eval()

# Check whether a patch is valid
def is_valid_patch(patch, binary_classifier, preprocess, device):
    if patch.size[0] <= 0 or patch.size[1] <= 0:
        return False
    patch_tensor = preprocess(patch).unsqueeze(0).to(device)
    with torch.no_grad():
        logits = binary_classifier(patch_tensor)
        probabilities = torch.softmax(logits, dim=1)
        prob_label_1 = probabilities[0, 1]
    return prob_label_1.item() > 0.8

# Main image-processing function
def process_images(text_file_path, dataset_path, model, preprocess, clip_model, output_folder, device='cpu'):
    boxes_dict = {}
    with open(text_file_path, 'r') as f:
        for line in f:
            image_name, class_name = line.strip().split('\t')
            print(f"Processing image: {image_name}")
            text_prompt = class_name + ' .'
            image_path = os.path.join(dataset_path, image_name)
            img = Image.open(image_path).convert("RGB")
            image_source, image = load_image(image_path)
            h, w, _ = image_source.shape
            boxes, logits, _ = predict(model, image, text_prompt, BOX_THRESHOLD, TEXT_THRESHOLD)
            patches = box_convert(boxes, in_fmt="cxcywh", out_fmt="xyxy")

            top_patches = []
            for i, (box, logit) in enumerate(zip(patches, logits)):
                box = box.cpu().numpy() * np.array([w, h, w, h], dtype=np.float32)
                x1, y1, x2, y2 = box.astype(int)
                x1, y1, x2, y2 = max(x1, 0), max(y1, 0), min(x2, w), min(y2, h)
                patch = img.crop((x1, y1, x2, y2))

                if patch.size == (0, 0) or not is_valid_patch(patch, binary_classifier, preprocess, device) or x2 - x1 > w / 2 or y2 - y1 > h / 2 or y2 - y1 < 5 or x2 - x1 < 5:
                    print(f"Skipping patch due to binary classifier at box {box}")
                    continue
                top_patches.append((i, logit))

            top_patches.sort(key=lambda x: x[1], reverse=True)
            top_3_indices = [patch[0] for patch in top_patches[:3]]

            # Make sure every image has three bounding boxes
            while len(top_3_indices) < 3:
                if len(top_3_indices) > 0:
                    top_3_indices.append(top_3_indices[-1])
                else:
                    default_box = torch.tensor([0, 0, 20 / w, 20 / h]).unsqueeze(0)
                    patches = torch.cat((patches, default_box.to(boxes.device)), dim=0)
                    top_3_indices.append(len(patches) - 1)

            boxes_dict[image_name] = [patches[idx].cpu().numpy().tolist() * np.array([w, h, w, h], dtype=np.float32) for idx in top_3_indices]

    return boxes_dict

# Main entry point
def main(args):
    # Fixed default paths
    model_config = "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py"
    model_weights = "GroundingDINO/weights/groundingdino_swint_ogc.pth"
    output_folder = os.path.join(args.root_path, "annotated_images")

    # Paths derived from root_path
    text_file_path = os.path.join(args.root_path, "ImageClasses_FSC147.txt")
    dataset_path = os.path.join(args.root_path, "images_384_VarV2")
    input_json_path = os.path.join(args.root_path, "annotation_FSC147_384_old.json")
    output_json_path = os.path.join(args.root_path, "annotation_FSC147_pos.json")

    os.makedirs(output_folder, exist_ok=True)

    # Load the GroundingDINO model
    model = load_model(model_config, model_weights, device=device)

    # Process the images and generate bounding boxes
    boxes_dict = process_images(text_file_path, dataset_path, model, preprocess, clip_model, output_folder, device=device)

    # Update the JSON file
    with open(input_json_path, 'r') as f:
        data = json.load(f)

    for image_name, boxes in boxes_dict.items():
        if image_name in data:
            new_boxes = [[[x1, y1], [x1, y2], [x2, y2], [x2, y1]] for x1, y1, x2, y2 in boxes]
            data[image_name]["box_examples_coordinates"] = new_boxes

    with open(output_json_path, 'w') as f:
        json.dump(data, f, indent=4)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Image Processing Script")
    parser.add_argument("--root_path", type=str, required=True, help="Root path to the dataset and output files")
    args = parser.parse_args()
    main(args)
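A minimal sketch (not part of the uploaded files) of reading the JSON written by grounding_pos.py and turning each stored corner quadruple back into an (x1, y1, x2, y2) box; the path assumes the script above was run with --root_path ./data/FSC147/.

```python
# Minimal sketch (assumption, not repository code): inspect the generated exemplar boxes.
import json

with open('./data/FSC147/annotation_FSC147_pos.json') as f:
    anno = json.load(f)

image_name = next(iter(anno))  # any image key
for quad in anno[image_name]["box_examples_coordinates"]:
    # stored as [[x1, y1], [x1, y2], [x2, y2], [x2, y1]]
    (x1, y1), _, (x2, y2), _ = quad
    print(image_name, (x1, y1, x2, y2))
```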
models_crossvit.py
ADDED
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
import torch.nn.functional as F
|
4 |
+
import torch.hub
|
5 |
+
from itertools import repeat
|
6 |
+
import collections.abc
|
7 |
+
|
8 |
+
|
9 |
+
def drop_path(x, drop_prob: float = 0., training: bool = False, scale_by_keep: bool = True):
|
10 |
+
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
|
11 |
+
This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
|
12 |
+
the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
|
13 |
+
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
|
14 |
+
changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
|
15 |
+
'survival rate' as the argument.
|
16 |
+
"""
|
17 |
+
if drop_prob == 0. or not training:
|
18 |
+
return x
|
19 |
+
keep_prob = 1 - drop_prob
|
20 |
+
shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
|
21 |
+
random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
|
22 |
+
if keep_prob > 0.0 and scale_by_keep:
|
23 |
+
random_tensor.div_(keep_prob)
|
24 |
+
return x * random_tensor
|
25 |
+
|
26 |
+
class DropPath(nn.Module):
|
27 |
+
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
|
28 |
+
"""
|
29 |
+
def __init__(self, drop_prob: float = 0., scale_by_keep: bool = True):
|
30 |
+
super(DropPath, self).__init__()
|
31 |
+
self.drop_prob = drop_prob
|
32 |
+
self.scale_by_keep = scale_by_keep
|
33 |
+
|
34 |
+
def forward(self, x):
|
35 |
+
return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)
|
36 |
+
|
37 |
+
def _ntuple(n):
|
38 |
+
def parse(x):
|
39 |
+
if isinstance(x, collections.abc.Iterable):
|
40 |
+
return x
|
41 |
+
return tuple(repeat(x, n))
|
42 |
+
return parse
|
43 |
+
|
44 |
+
to_2tuple = _ntuple(2)
|
45 |
+
|
46 |
+
class Mlp(nn.Module):
|
47 |
+
""" MLP as used in Vision Transformer, MLP-Mixer and related networks
|
48 |
+
"""
|
49 |
+
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
|
50 |
+
super().__init__()
|
51 |
+
out_features = out_features or in_features
|
52 |
+
hidden_features = hidden_features or in_features
|
53 |
+
drop_probs = to_2tuple(drop)
|
54 |
+
|
55 |
+
self.fc1 = nn.Linear(in_features, hidden_features)
|
56 |
+
self.act = act_layer()
|
57 |
+
self.drop1 = nn.Dropout(drop_probs[0])
|
58 |
+
self.fc2 = nn.Linear(hidden_features, out_features)
|
59 |
+
self.drop2 = nn.Dropout(drop_probs[1])
|
60 |
+
|
61 |
+
def forward(self, x):
|
62 |
+
x = self.fc1(x)
|
63 |
+
x = self.act(x)
|
64 |
+
x = self.drop1(x)
|
65 |
+
x = self.fc2(x)
|
66 |
+
x = self.drop2(x)
|
67 |
+
return x
|
68 |
+
|
69 |
+
class Attention(nn.Module):
|
70 |
+
def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
|
71 |
+
super().__init__()
|
72 |
+
self.num_heads = num_heads
|
73 |
+
head_dim = dim // num_heads
|
74 |
+
# NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
|
75 |
+
self.scale = qk_scale or head_dim ** -0.5
|
76 |
+
|
77 |
+
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
|
78 |
+
self.attn_drop = nn.Dropout(attn_drop)
|
79 |
+
self.proj = nn.Linear(dim, dim)
|
80 |
+
self.proj_drop = nn.Dropout(proj_drop)
|
81 |
+
|
82 |
+
def forward(self, x):
|
83 |
+
B, N, C = x.shape
|
84 |
+
qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
|
85 |
+
q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple)
|
86 |
+
|
87 |
+
attn = (q @ k.transpose(-2, -1)) * self.scale
|
88 |
+
attn = attn.softmax(dim=-1)
|
89 |
+
attn = self.attn_drop(attn)
|
90 |
+
|
91 |
+
x = (attn @ v).transpose(1, 2).reshape(B, N, C)
|
92 |
+
x = self.proj(x)
|
93 |
+
x = self.proj_drop(x)
|
94 |
+
return x
|
95 |
+
|
96 |
+
class CrossAttention(nn.Module):
|
97 |
+
def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
|
98 |
+
super().__init__()
|
99 |
+
self.num_heads = num_heads
|
100 |
+
head_dim = dim // num_heads
|
101 |
+
# NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
|
102 |
+
self.scale = qk_scale or head_dim ** -0.5
|
103 |
+
self.wq = nn.Linear(dim, dim, bias=qkv_bias)
|
104 |
+
self.wk = nn.Linear(dim, dim, bias=qkv_bias)
|
105 |
+
self.wv = nn.Linear(dim, dim, bias=qkv_bias)
|
106 |
+
self.attn_drop = nn.Dropout(attn_drop)
|
107 |
+
self.proj = nn.Linear(dim, dim)
|
108 |
+
self.proj_drop = nn.Dropout(proj_drop)
|
109 |
+
|
110 |
+
def forward(self, x, y):
|
111 |
+
B, Nx, C = x.shape
|
112 |
+
Ny = y.shape[1]
|
113 |
+
# BNxC -> BNxH(C/H) -> BHNx(C/H)
|
114 |
+
q = self.wq(x).reshape(B, Nx, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
|
115 |
+
# BNyC -> BNyH(C/H) -> BHNy(C/H)
|
116 |
+
k = self.wk(y).reshape(B, Ny, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
|
117 |
+
# BNyC -> BNyH(C/H) -> BHNy(C/H)
|
118 |
+
v = self.wv(y).reshape(B, Ny, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
|
119 |
+
|
120 |
+
attn = (q @ k.transpose(-2, -1)) * self.scale # BHNx(C/H) @ BH(C/H)Ny -> BHNxNy
|
121 |
+
attn = attn.softmax(dim=-1)
|
122 |
+
attn = self.attn_drop(attn)
|
123 |
+
|
124 |
+
x = (attn @ v).transpose(1, 2).reshape(B, Nx, C) # (BHNxNy @ BHNy(C/H)) -> BHNx(C/H) -> BNxH(C/H) -> BNxC
|
125 |
+
x = self.proj(x)
|
126 |
+
x = self.proj_drop(x)
|
127 |
+
return x
|
128 |
+
|
129 |
+
class CrossAttentionBlock(nn.Module):
|
130 |
+
|
131 |
+
def __init__(
|
132 |
+
self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
|
133 |
+
drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
|
134 |
+
super().__init__()
|
135 |
+
|
136 |
+
self.norm0 = norm_layer(dim)
|
137 |
+
self.selfattn = Attention(
|
138 |
+
dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
|
139 |
+
self.drop_path0 = DropPath(drop_path) if drop_path > 0. else nn.Identity()
|
140 |
+
|
141 |
+
self.norm1 = norm_layer(dim)
|
142 |
+
self.attn = CrossAttention(
|
143 |
+
dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
|
144 |
+
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
|
145 |
+
self.drop_path1 = DropPath(drop_path) if drop_path > 0. else nn.Identity()
|
146 |
+
|
147 |
+
self.norm2 = norm_layer(dim)
|
148 |
+
self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop)
|
149 |
+
self.drop_path2 = DropPath(drop_path) if drop_path > 0. else nn.Identity()
|
150 |
+
|
151 |
+
def forward(self, x, y):
|
152 |
+
x = x + self.drop_path0(self.selfattn(self.norm0(x)))
|
153 |
+
x = x + self.drop_path1(self.attn(self.norm1(x), y))
|
154 |
+
x = x + self.drop_path2(self.mlp(self.norm2(x)))
|
155 |
+
return x
|
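A minimal sketch (not part of the uploaded files) that runs a `CrossAttentionBlock` from models_crossvit.py on random tensors; the dimensions mirror the 512-dim, 16-head decoder configuration used elsewhere in the repository, and the token counts are illustrative.

```python
# Minimal sketch (assumption, not repository code): exercise CrossAttentionBlock
# with random query tokens x and exemplar tokens y.
import torch
from models_crossvit import CrossAttentionBlock

block = CrossAttentionBlock(dim=512, num_heads=16, mlp_ratio=4., qkv_bias=True)
x = torch.randn(2, 576, 512)   # image tokens (e.g. 24x24 patches), dim 512
y = torch.randn(2, 3, 512)     # three exemplar tokens per image
out = block(x, y)
print(out.shape)               # torch.Size([2, 576, 512])
```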
models_mae_cross.py
ADDED
@@ -0,0 +1,253 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import time
|
2 |
+
from functools import partial
|
3 |
+
import math
|
4 |
+
import random
|
5 |
+
|
6 |
+
import numpy as np
|
7 |
+
|
8 |
+
import torch
|
9 |
+
import torch.nn as nn
|
10 |
+
import torch.nn.functional as F
|
11 |
+
import torchvision.utils
|
12 |
+
|
13 |
+
from timm.models.vision_transformer import PatchEmbed, Block
|
14 |
+
from models_crossvit import CrossAttentionBlock
|
15 |
+
|
16 |
+
from util.pos_embed import get_2d_sincos_pos_embed
|
17 |
+
|
18 |
+
class SupervisedMAE(nn.Module):
|
19 |
+
def __init__(self, img_size=384, patch_size=16, in_chans=3,
|
20 |
+
embed_dim=1024, depth=24, num_heads=16,
|
21 |
+
decoder_embed_dim=512, decoder_depth=2, decoder_num_heads=16,
|
22 |
+
mlp_ratio=4., norm_layer=nn.LayerNorm, norm_pix_loss=False):
|
23 |
+
super().__init__()
|
24 |
+
|
25 |
+
# --------------------------------------------------------------------------
|
26 |
+
# MAE encoder specifics
|
27 |
+
self.patch_embed = PatchEmbed(img_size, patch_size, in_chans, embed_dim)
|
28 |
+
num_patches = self.patch_embed.num_patches
|
29 |
+
|
30 |
+
self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim), requires_grad=False) # fixed sin-cos embedding
|
31 |
+
|
32 |
+
self.blocks = nn.ModuleList([
|
33 |
+
Block(embed_dim, num_heads, mlp_ratio, qkv_bias=True, qk_scale=None, norm_layer=norm_layer)
|
34 |
+
for i in range(depth)])
|
35 |
+
self.norm = norm_layer(embed_dim)
|
36 |
+
# --------------------------------------------------------------------------
|
37 |
+
|
38 |
+
# --------------------------------------------------------------------------
|
39 |
+
# MAE decoder specifics
|
40 |
+
self.decoder_embed = nn.Linear(embed_dim, decoder_embed_dim, bias=True)
|
41 |
+
|
42 |
+
self.decoder_pos_embed = nn.Parameter(torch.zeros(1, num_patches, decoder_embed_dim), requires_grad=False) # fixed sin-cos embedding
|
43 |
+
|
44 |
+
self.shot_token = nn.Parameter(torch.zeros(512))
|
45 |
+
|
46 |
+
# Exemplar encoder with CNN
|
47 |
+
self.decoder_proj1 = nn.Sequential(
|
48 |
+
nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1),
|
49 |
+
nn.InstanceNorm2d(64),
|
50 |
+
nn.ReLU(inplace=True),
|
51 |
+
nn.MaxPool2d(2) #[3,64,64]->[64,32,32]
|
52 |
+
)
|
53 |
+
self.decoder_proj2 = nn.Sequential(
|
54 |
+
nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
|
55 |
+
nn.InstanceNorm2d(128),
|
56 |
+
nn.ReLU(inplace=True),
|
57 |
+
nn.MaxPool2d(2) #[64,32,32]->[128,16,16]
|
58 |
+
)
|
59 |
+
self.decoder_proj3 = nn.Sequential(
|
60 |
+
nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
|
61 |
+
nn.InstanceNorm2d(256),
|
62 |
+
nn.ReLU(inplace=True),
|
63 |
+
nn.MaxPool2d(2) # [128,16,16]->[256,8,8]
|
64 |
+
)
|
65 |
+
self.decoder_proj4 = nn.Sequential(
|
66 |
+
nn.Conv2d(256, decoder_embed_dim, kernel_size=3, stride=1, padding=1),
|
67 |
+
nn.InstanceNorm2d(512),
|
68 |
+
nn.ReLU(inplace=True),
|
69 |
+
nn.AdaptiveAvgPool2d((1,1))
|
70 |
+
# [256,8,8]->[512,1,1]
|
71 |
+
)
|
72 |
+
|
73 |
+
|
74 |
+
self.decoder_blocks = nn.ModuleList([
|
75 |
+
CrossAttentionBlock(decoder_embed_dim, decoder_num_heads, mlp_ratio, qkv_bias=True, qk_scale=None, norm_layer=norm_layer)
|
76 |
+
for i in range(decoder_depth)])
|
77 |
+
|
78 |
+
self.decoder_norm = norm_layer(decoder_embed_dim)
|
79 |
+
# Density map regresssion module
|
80 |
+
self.decode_head0 = nn.Sequential(
|
81 |
+
nn.Conv2d(decoder_embed_dim, 256, kernel_size=3, stride=1, padding=1),
|
82 |
+
nn.GroupNorm(8, 256),
|
83 |
+
nn.ReLU(inplace=True)
|
84 |
+
)
|
85 |
+
self.decode_head1 = nn.Sequential(
|
86 |
+
nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
|
87 |
+
nn.GroupNorm(8, 256),
|
88 |
+
nn.ReLU(inplace=True)
|
89 |
+
)
|
90 |
+
self.decode_head2 = nn.Sequential(
|
91 |
+
nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
|
92 |
+
nn.GroupNorm(8, 256),
|
93 |
+
nn.ReLU(inplace=True)
|
94 |
+
)
|
95 |
+
self.decode_head3 = nn.Sequential(
|
96 |
+
nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
|
97 |
+
nn.GroupNorm(8, 256),
|
98 |
+
nn.ReLU(inplace=True),
|
99 |
+
nn.Conv2d(256, 1, kernel_size=1, stride=1)
|
100 |
+
)
|
101 |
+
|
102 |
+
# --------------------------------------------------------------------------
|
103 |
+
|
104 |
+
self.norm_pix_loss = norm_pix_loss
|
105 |
+
|
106 |
+
self.initialize_weights()
|
107 |
+
|
108 |
+
def initialize_weights(self):
|
109 |
+
# initialization
|
110 |
+
# initialize (and freeze) pos_embed by sin-cos embedding
|
111 |
+
pos_embed = get_2d_sincos_pos_embed(self.pos_embed.shape[-1], int(self.patch_embed.num_patches**.5), cls_token=False)
|
112 |
+
self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0))
|
113 |
+
|
114 |
+
decoder_pos_embed = get_2d_sincos_pos_embed(self.decoder_pos_embed.shape[-1], int(self.patch_embed.num_patches**.5), cls_token=False)
|
115 |
+
self.decoder_pos_embed.data.copy_(torch.from_numpy(decoder_pos_embed).float().unsqueeze(0))
|
116 |
+
|
117 |
+
# initialize patch_embed like nn.Linear (instead of nn.Conv2d)
|
118 |
+
w = self.patch_embed.proj.weight.data
|
119 |
+
torch.nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
|
120 |
+
|
121 |
+
torch.nn.init.normal_(self.shot_token, std=.02)
|
122 |
+
|
123 |
+
# initialize nn.Linear and nn.LayerNorm
|
124 |
+
self.apply(self._init_weights)
|
125 |
+
|
126 |
+
def _init_weights(self, m):
|
127 |
+
if isinstance(m, nn.Linear):
|
128 |
+
# we use xavier_uniform following official JAX ViT:
|
129 |
+
torch.nn.init.xavier_uniform_(m.weight)
|
130 |
+
if isinstance(m, nn.Linear) and m.bias is not None:
|
131 |
+
nn.init.constant_(m.bias, 0)
|
132 |
+
elif isinstance(m, nn.LayerNorm):
|
133 |
+
nn.init.constant_(m.bias, 0)
|
134 |
+
nn.init.constant_(m.weight, 1.0)
|
135 |
+
|
136 |
+
def forward_encoder(self, x):
|
137 |
+
# embed patches
|
138 |
+
x = self.patch_embed(x)
|
139 |
+
|
140 |
+
# add pos embed w/o cls token
|
141 |
+
x = x + self.pos_embed
|
142 |
+
|
143 |
+
# apply Transformer blocks
|
144 |
+
for blk in self.blocks:
|
145 |
+
x = blk(x)
|
146 |
+
x = self.norm(x)
|
147 |
+
|
148 |
+
return x
|
149 |
+
|
150 |
+
def forward_decoder(self, x, y_, shot_num=3):
|
151 |
+
# embed tokens
|
152 |
+
x = self.decoder_embed(x)
|
153 |
+
# add pos embed
|
154 |
+
x = x + self.decoder_pos_embed
|
155 |
+
|
156 |
+
# Exemplar encoder
|
157 |
+
y_ = y_.transpose(0,1) # y_ [N,3,3,64,64]->[3,N,3,64,64]
|
158 |
+
y1=[]
|
159 |
+
C=0
|
160 |
+
N=0
|
161 |
+
cnt = 0
|
162 |
+
for yi in y_:
|
163 |
+
cnt+=1
|
164 |
+
if cnt > shot_num:
|
165 |
+
break
|
166 |
+
yi = self.decoder_proj1(yi)
|
167 |
+
yi = self.decoder_proj2(yi)
|
168 |
+
yi = self.decoder_proj3(yi)
|
169 |
+
yi = self.decoder_proj4(yi)
|
170 |
+
N, C,_,_ = yi.shape
|
171 |
+
y1.append(yi.squeeze(-1).squeeze(-1)) # yi [N,C,1,1]->[N,C]
|
172 |
+
|
173 |
+
if shot_num > 0:
|
174 |
+
y = torch.cat(y1,dim=0).reshape(shot_num,N,C).to(x.device)
|
175 |
+
else:
|
176 |
+
y = self.shot_token.repeat(y_.shape[1],1).unsqueeze(0).to(x.device)
|
177 |
+
y = y.transpose(0,1) # y [3,N,C]->[N,3,C]
|
178 |
+
|
179 |
+
# apply Transformer blocks
|
180 |
+
for blk in self.decoder_blocks:
|
181 |
+
x = blk(x, y)
|
182 |
+
x = self.decoder_norm(x)
|
183 |
+
|
184 |
+
# Density map regression
|
185 |
+
n, hw, c = x.shape
|
186 |
+
h = w = int(math.sqrt(hw))
|
187 |
+
x = x.transpose(1, 2).reshape(n, c, h, w)
|
188 |
+
|
189 |
+
x = F.interpolate(
|
190 |
+
self.decode_head0(x), size=x.shape[-1]*2, mode='bilinear', align_corners=False)
|
191 |
+
x = F.interpolate(
|
192 |
+
self.decode_head1(x), size=x.shape[-1]*2, mode='bilinear', align_corners=False)
|
193 |
+
x = F.interpolate(
|
194 |
+
self.decode_head2(x), size=x.shape[-1]*2, mode='bilinear', align_corners=False)
|
195 |
+
x = F.interpolate(
|
196 |
+
self.decode_head3(x), size=x.shape[-1]*2, mode='bilinear', align_corners=False)
|
197 |
+
x = x.squeeze(-3)
|
198 |
+
|
199 |
+
return x
|
200 |
+
|
201 |
+
def forward(self, imgs, boxes, shot_num):
|
202 |
+
# if boxes.nelement() > 0:
|
203 |
+
# torchvision.utils.save_image(boxes[0], f"data/out/crops/box_{time.time()}_{random.randint(0, 99999):>5}.png")
|
204 |
+
with torch.no_grad():
|
205 |
+
latent = self.forward_encoder(imgs)
|
206 |
+
pred = self.forward_decoder(latent, boxes, shot_num) # [N, 384, 384]
|
207 |
+
return pred
|
208 |
+
|
209 |
+
|
210 |
+
def mae_vit_base_patch16_dec512d8b(**kwargs):
|
211 |
+
model = SupervisedMAE(
|
212 |
+
patch_size=16, embed_dim=768, depth=12, num_heads=12,
|
213 |
+
decoder_embed_dim=512, decoder_depth=2, decoder_num_heads=16,
|
214 |
+
mlp_ratio=4, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs)
|
215 |
+
return model
|
216 |
+
|
217 |
+
|
218 |
+
def mae_vit_large_patch16_dec512d8b(**kwargs):
|
219 |
+
model = SupervisedMAE(
|
220 |
+
patch_size=16, embed_dim=1024, depth=24, num_heads=16,
|
221 |
+
decoder_embed_dim=512, decoder_depth=2, decoder_num_heads=16,
|
222 |
+
mlp_ratio=4, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs)
|
223 |
+
return model
|
224 |
+
|
225 |
+
|
226 |
+
def mae_vit_huge_patch14_dec512d8b(**kwargs):
|
227 |
+
model = SupervisedMAE(
|
228 |
+
patch_size=14, embed_dim=1280, depth=32, num_heads=16,
|
229 |
+
decoder_embed_dim=512, decoder_depth=2, decoder_num_heads=16,
|
230 |
+
mlp_ratio=4, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs)
|
231 |
+
return model
|
232 |
+
|
233 |
+
def mae_vit_base_patch16_fim4(**kwargs):
|
234 |
+
model = SupervisedMAE(
|
235 |
+
patch_size=16, embed_dim=768, depth=12, num_heads=12,
|
236 |
+
decoder_embed_dim=512, decoder_depth=4, decoder_num_heads=16,
|
237 |
+
mlp_ratio=4, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs)
|
238 |
+
return model
|
239 |
+
|
240 |
+
def mae_vit_base_patch16_fim6(**kwargs):
|
241 |
+
model = SupervisedMAE(
|
242 |
+
patch_size=16, embed_dim=768, depth=12, num_heads=12,
|
243 |
+
decoder_embed_dim=512, decoder_depth=6, decoder_num_heads=16,
|
244 |
+
mlp_ratio=4, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs)
|
245 |
+
return model
|
246 |
+
|
247 |
+
|
248 |
+
# set recommended archs
|
249 |
+
mae_vit_base_patch16 = mae_vit_base_patch16_dec512d8b
|
250 |
+
mae_vit_base4_patch16 = mae_vit_base_patch16_fim4 # decoder: 4 blocks
|
251 |
+
mae_vit_base6_patch16 = mae_vit_base_patch16_fim6 # decoder: 6 blocks
|
252 |
+
mae_vit_large_patch16 = mae_vit_large_patch16_dec512d8b
|
253 |
+
mae_vit_huge_patch14 = mae_vit_huge_patch14_dec512d8b
|
models_mae_noct.py
ADDED
@@ -0,0 +1,234 @@
from functools import partial

import torch
import torch.nn as nn

from timm.models.vision_transformer import PatchEmbed, Block

from util.pos_embed import get_2d_sincos_pos_embed


class MaskedAutoencoderViTNoCT(nn.Module):
    """ Masked Autoencoder with VisionTransformer backbone
    """
    def __init__(self, img_size=384, patch_size=16, in_chans=3,
                 embed_dim=1024, depth=24, num_heads=16,
                 decoder_embed_dim=512, decoder_depth=8, decoder_num_heads=16,
                 mlp_ratio=4., norm_layer=nn.LayerNorm, norm_pix_loss=False):
        super().__init__()

        # --------------------------------------------------------------------------
        # MAE encoder specifics
        self.patch_embed = PatchEmbed(img_size, patch_size, in_chans, embed_dim)
        num_patches = self.patch_embed.num_patches

        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim), requires_grad=False)  # fixed sin-cos embedding

        self.blocks = nn.ModuleList([
            Block(embed_dim, num_heads, mlp_ratio, qkv_bias=True, qk_scale=None, norm_layer=norm_layer)
            for i in range(depth)])
        self.norm = norm_layer(embed_dim)
        # --------------------------------------------------------------------------

        # --------------------------------------------------------------------------
        # MAE decoder specifics
        self.decoder_embed = nn.Linear(embed_dim, decoder_embed_dim, bias=True)

        self.mask_token = nn.Parameter(torch.zeros(1, 1, decoder_embed_dim))

        self.decoder_pos_embed = nn.Parameter(torch.zeros(1, num_patches, decoder_embed_dim), requires_grad=False)  # fixed sin-cos embedding

        self.decoder_blocks = nn.ModuleList([
            Block(decoder_embed_dim, decoder_num_heads, mlp_ratio, qkv_bias=True, qk_scale=None, norm_layer=norm_layer)
            for i in range(decoder_depth)])

        self.decoder_norm = norm_layer(decoder_embed_dim)
        self.decoder_pred = nn.Linear(decoder_embed_dim, patch_size**2 * in_chans, bias=True)  # decoder to patch
        # --------------------------------------------------------------------------

        self.norm_pix_loss = norm_pix_loss

        self.initialize_weights()

    def initialize_weights(self):
        # initialization
        # initialize (and freeze) pos_embed by sin-cos embedding
        pos_embed = get_2d_sincos_pos_embed(self.pos_embed.shape[-1], int(self.patch_embed.num_patches**.5), cls_token=False)
        self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0))

        decoder_pos_embed = get_2d_sincos_pos_embed(self.decoder_pos_embed.shape[-1], int(self.patch_embed.num_patches**.5), cls_token=False)
        self.decoder_pos_embed.data.copy_(torch.from_numpy(decoder_pos_embed).float().unsqueeze(0))

        # initialize patch_embed like nn.Linear (instead of nn.Conv2d)
        w = self.patch_embed.proj.weight.data
        torch.nn.init.xavier_uniform_(w.view([w.shape[0], -1]))

        # timm's trunc_normal_(std=.02) is effectively normal_(std=0.02) as cutoff is too big (2.)
        torch.nn.init.normal_(self.mask_token, std=.02)

        # initialize nn.Linear and nn.LayerNorm
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            # we use xavier_uniform following official JAX ViT:
            torch.nn.init.xavier_uniform_(m.weight)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def patchify(self, imgs):
        """
        imgs: (N, 3, H, W)
        x: (N, L, patch_size**2 *3)
        """
        p = self.patch_embed.patch_size[0]
        assert imgs.shape[2] == imgs.shape[3] and imgs.shape[2] % p == 0

        h = w = imgs.shape[2] // p
        x = imgs.reshape(shape=(imgs.shape[0], 3, h, p, w, p))
        x = torch.einsum('nchpwq->nhwpqc', x)
        x = x.reshape(shape=(imgs.shape[0], h * w, p**2 * 3))
        return x

    def unpatchify(self, x):
        """
        x: (N, L, patch_size**2 *3)
        imgs: (N, 3, H, W)
        """
        p = self.patch_embed.patch_size[0]
        h = w = int(x.shape[1]**.5)
        assert h * w == x.shape[1]

        x = x.reshape(shape=(x.shape[0], h, w, p, p, 3))
        x = torch.einsum('nhwpqc->nchpwq', x)
        imgs = x.reshape(shape=(x.shape[0], 3, h * p, h * p))
        return imgs

    def random_masking(self, x, mask_ratio):
        """
        Perform per-sample random masking by per-sample shuffling.
        Per-sample shuffling is done by argsort random noise.
        x: [N, L, D], sequence
        """
        N, L, D = x.shape  # batch, length, dim
        len_keep = int(L * (1 - mask_ratio))

        noise = torch.rand(N, L, device=x.device)  # noise in [0, 1]

        # sort noise for each sample
        ids_shuffle = torch.argsort(noise, dim=1)  # ascend: small is keep, large is remove
        ids_restore = torch.argsort(ids_shuffle, dim=1)

        # keep the first subset
        ids_keep = ids_shuffle[:, :len_keep]
        x_masked = torch.gather(x, dim=1, index=ids_keep.unsqueeze(-1).repeat(1, 1, D))

        # generate the binary mask: 0 is keep, 1 is remove
        mask = torch.ones([N, L], device=x.device)
        mask[:, :len_keep] = 0
        # unshuffle to get the binary mask
        mask = torch.gather(mask, dim=1, index=ids_restore)

        return x_masked, mask, ids_restore

    def forward_encoder(self, x, mask_ratio):
        # embed patches
        x = self.patch_embed(x)

        # add pos embed w/o cls token
        x = x + self.pos_embed

        # masking: length -> length * mask_ratio
        x, mask, ids_restore = self.random_masking(x, mask_ratio)

        # apply Transformer blocks
        for blk in self.blocks:
            x = blk(x)
        x = self.norm(x)

        return x, mask, ids_restore

    def forward_decoder(self, x, ids_restore):
        # embed tokens
        x = self.decoder_embed(x)

        # append mask tokens to sequence
        mask_tokens = self.mask_token.repeat(x.shape[0], ids_restore.shape[1] - x.shape[1], 1)
        x_ = torch.cat([x, mask_tokens], dim=1)  # no cls token
        x_ = torch.gather(x_, dim=1, index=ids_restore.unsqueeze(-1).repeat(1, 1, x.shape[2]))  # unshuffle
        x = x_  # no cls token to re-append in this variant

        # add pos embed
        x = x + self.decoder_pos_embed

        # apply Transformer blocks
        for blk in self.decoder_blocks:
            x = blk(x)
        x = self.decoder_norm(x)

        # predictor projection
        x = self.decoder_pred(x)

        return x

    def forward_loss(self, imgs, pred, mask):
        """
        imgs: [N, 3, H, W]
        pred: [N, L, p*p*3]
        mask: [N, L], 0 is keep, 1 is remove,
        """
        target = self.patchify(imgs)
        if self.norm_pix_loss:
            mean = target.mean(dim=-1, keepdim=True)
            var = target.var(dim=-1, keepdim=True)
            target = (target - mean) / (var + 1.e-6)**.5

        loss = (pred - target) ** 2
        loss = loss.mean(dim=-1)  # [N, L], mean loss per patch

        # For mean loss on all patches
        N, L = mask.shape
        mask_s = torch.ones([N, L], device=imgs.device)
        loss = (loss * mask_s).sum() / mask_s.sum()

        # loss = (loss * mask).sum() / mask.sum()  # mean loss on removed patches
        return loss

    def forward(self, imgs, mask_ratio=0.75):
        latent, mask, ids_restore = self.forward_encoder(imgs, mask_ratio)
        pred = self.forward_decoder(latent, ids_restore)  # [N, L, p*p*3]
        loss = self.forward_loss(imgs, pred, mask)
        return loss, pred, mask


def mae_vit_base_patch16_dec512d8b(**kwargs):
    model = MaskedAutoencoderViTNoCT(
        patch_size=16, embed_dim=768, depth=12, num_heads=12,
        decoder_embed_dim=512, decoder_depth=8, decoder_num_heads=16,
        mlp_ratio=4, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs)
    return model


def mae_vit_large_patch16_dec512d8b(**kwargs):
    model = MaskedAutoencoderViTNoCT(
        patch_size=16, embed_dim=1024, depth=24, num_heads=16,
        decoder_embed_dim=512, decoder_depth=8, decoder_num_heads=16,
        mlp_ratio=4, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs)
    return model


def mae_vit_huge_patch14_dec512d8b(**kwargs):
    model = MaskedAutoencoderViTNoCT(
        patch_size=14, embed_dim=1280, depth=32, num_heads=16,
        decoder_embed_dim=512, decoder_depth=8, decoder_num_heads=16,
        mlp_ratio=4, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs)
    return model


# set recommended archs
mae_vit_base_patch16 = mae_vit_base_patch16_dec512d8b  # decoder: 512 dim, 8 blocks
mae_vit_large_patch16 = mae_vit_large_patch16_dec512d8b  # decoder: 512 dim, 8 blocks
mae_vit_huge_patch14 = mae_vit_huge_patch14_dec512d8b  # decoder: 512 dim, 8 blocks
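For orientation, a short sketch (not part of the commit) of how `MaskedAutoencoderViTNoCT` is typically driven during pre-training; the batch is random noise purely to show shapes at the 384x384 default.

```python
import torch
import models_mae_noct

model = models_mae_noct.mae_vit_base_patch16(norm_pix_loss=False)
imgs = torch.randn(2, 3, 384, 384)              # dummy batch
loss, pred, mask = model(imgs, mask_ratio=0.5)  # pred: [2, 576, 16*16*3], mask: [2, 576]
loss.backward()
```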
requirements.txt
ADDED
@@ -0,0 +1,15 @@
--extra-index-url https://download.pytorch.org/whl/cu116

torch==1.13.1+cu116
torchvision==0.14.1+cu116
timm==0.4.9
numpy==1.23.4
scipy==1.10.1
imgaug==0.4.0
pillow==9.3.0
matplotlib==3.6.3
hub==3.0.1
pandas==1.5.2
six==1.16.0
wandb
tqdm
util/FSC147.py
ADDED
@@ -0,0 +1,524 @@
from argparse import Namespace
import json
from pathlib import Path

import numpy as np
import random
from torchvision import transforms
import torch
import cv2
import torchvision.transforms.functional as TF
import scipy.ndimage as ndimage
from PIL import Image
import argparse
import imgaug.augmenters as iaa
from imgaug.augmentables import Keypoint, KeypointsOnImage

MAX_HW = 384
IM_NORM_MEAN = [0.485, 0.456, 0.406]
IM_NORM_STD = [0.229, 0.224, 0.225]


def get_args_parser():
    parser = argparse.ArgumentParser('MAE pre-training', add_help=False)
    parser.add_argument('--batch_size', default=8, type=int,
                        help='Batch size per GPU (effective batch size is batch_size * accum_iter * # gpus)')
    parser.add_argument('--epochs', default=200, type=int)
    parser.add_argument('--accum_iter', default=1, type=int,
                        help='Accumulate gradient iterations (for increasing the effective batch size under memory constraints)')

    # Model parameters
    parser.add_argument('--model', default='mae_vit_base_patch16', type=str, metavar='MODEL',
                        help='Name of model to train')
    parser.add_argument('--mask_ratio', default=0.5, type=float,
                        help='Masking ratio (percentage of removed patches).')
    parser.add_argument('--norm_pix_loss', action='store_true',
                        help='Use (per-patch) normalized pixels as targets for computing loss')
    parser.set_defaults(norm_pix_loss=False)

    # Optimizer parameters
    parser.add_argument('--weight_decay', type=float, default=0.05,
                        help='weight decay (default: 0.05)')
    parser.add_argument('--lr', type=float, default=None, metavar='LR',
                        help='learning rate (absolute lr)')
    parser.add_argument('--blr', type=float, default=1e-3, metavar='LR',
                        help='base learning rate: absolute_lr = base_lr * total_batch_size / 256')
    parser.add_argument('--min_lr', type=float, default=0., metavar='LR',
                        help='lower lr bound for cyclic schedulers that hit 0')
    parser.add_argument('--warmup_epochs', type=int, default=10, metavar='N',
                        help='epochs to warmup LR')

    # Dataset parameters
    parser.add_argument('--data_path', default='./data/FSC147/', type=str,
                        help='dataset path')
    parser.add_argument('--anno_file', default='annotation_FSC147_384.json', type=str,
                        help='annotation json file')
    parser.add_argument('--data_split_file', default='Train_Test_Val_FSC_147.json', type=str,
                        help='data split json file')
    parser.add_argument('--im_dir', default='images_384_VarV2', type=str,
                        help='images directory')
    parser.add_argument('--gt_dir', default='./data/FSC147/gt_density_map_adaptive_384_VarV2', type=str,
                        help='ground truth directory')
    parser.add_argument('--output_dir', default='./data/out/pre_4_dir',
                        help='path where to save, empty for no saving')
    parser.add_argument('--device', default='cuda',
                        help='device to use for training / testing')
    parser.add_argument('--seed', default=0, type=int)
    parser.add_argument('--resume', default='./weights/mae_pretrain_vit_base_full.pth',  # mae_visualize_vit_base
                        help='resume from checkpoint')

    # Training parameters
    parser.add_argument('--start_epoch', default=0, type=int, metavar='N',
                        help='start epoch')
    parser.add_argument('--num_workers', default=10, type=int)
    parser.add_argument('--pin_mem', action='store_true',
                        help='Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.')
    parser.add_argument('--no_pin_mem', action='store_false', dest='pin_mem')
    parser.set_defaults(pin_mem=True)

    # Distributed training parameters
    parser.add_argument('--world_size', default=1, type=int,
                        help='number of distributed processes')
    parser.add_argument('--local_rank', default=-1, type=int)
    parser.add_argument('--dist_on_itp', action='store_true')
    parser.add_argument('--dist_url', default='env://',
                        help='url used to set up distributed training')

    # Logging parameters
    parser.add_argument('--log_dir', default='./logs/pre_4_dir',
                        help='path where to tensorboard log')
    parser.add_argument("--title", default="CounTR_pretraining", type=str)
    parser.add_argument("--wandb", default="counting", type=str)
    parser.add_argument("--team", default="wsense", type=str)
    parser.add_argument("--wandb_id", default=None, type=str)
    parser.add_argument("--do_aug", default=True, type=bool)
    parser.add_argument('--class_file', default='./data/FSC147/ImageClasses_FSC147.txt', type=str,
                        help='class file')
    return parser


args = get_args_parser()
args = args.parse_args()


class ResizeSomeImage(object):
    def __init__(self, args):
        args = get_args_parser()
        args = args.parse_args()
        # print(dir(args.im_dir.as_posix()))
        self.data_path = Path(args.data_path)
        self.im_dir = self.data_path/args.im_dir
        anno_file = self.data_path/args.anno_file
        data_split_file = self.data_path/args.data_split_file

        with open(anno_file) as f:
            self.annotations = json.load(f)

        with open(data_split_file) as f:
            data_split = json.load(f)

        self.train_set = data_split['train']

        self.class_dict = {}
        if args.do_aug:
            with open(args.class_file) as f:
                for line in f:
                    key = line.split()[0]
                    val = line.split()[1:]
                    self.class_dict[key] = val


class ResizePreTrainImage(ResizeSomeImage):
    """
    Resize the image so that:
    1. The image is 384 * 384
    2. The new height and new width are divisible by 16
    3. The aspect ratio is preserved
    Density and box correctness are not preserved (crop and horizontal flip).
    """

    def __init__(self, args, MAX_HW=384):
        super().__init__(args)
        self.max_hw = MAX_HW

    def __call__(self, sample):
        image, lines_boxes, density = sample['image'], sample['lines_boxes'], sample['gt_density']

        W, H = image.size

        new_H = 16 * int(H / 16)
        new_W = 16 * int(W / 16)
        resized_image = transforms.Resize((new_H, new_W))(image)
        resized_density = cv2.resize(density, (new_W, new_H))
        orig_count = np.sum(density)
        new_count = np.sum(resized_density)

        if new_count > 0:
            resized_density = resized_density * (orig_count / new_count)

        boxes = list()
        for box in lines_boxes:
            box2 = [int(k) for k in box]
            y1, x1, y2, x2 = box2[0], box2[1], box2[2], box2[3]
            boxes.append([0, y1, x1, y2, x2])

        boxes = torch.Tensor(boxes).unsqueeze(0)
        resized_image = PreTrainNormalize(resized_image)
        resized_density = torch.from_numpy(resized_density).unsqueeze(0).unsqueeze(0)
        sample = {'image': resized_image, 'boxes': boxes, 'gt_density': resized_density}
        return sample


class ResizeTrainImage(ResizeSomeImage):
    """
    Resize the image so that:
    1. The image is 384 * 384
    2. The new height and new width are divisible by 16
    3. The aspect ratio is possibly preserved
    The density map is cropped to have the same size (and position) as the cropped image.
    Exemplar boxes may be outside the cropped area.
    Augmentation: Gaussian noise, color jitter, Gaussian blur, random affine, random horizontal flip and mosaic (or random crop if no mosaic).
    """

    def __init__(self, args, MAX_HW=384, do_aug=True):
        super().__init__(args)
        self.max_hw = MAX_HW
        self.do_aug = do_aug

    def __call__(self, sample):
        image, lines_boxes, neg_lines_boxes, dots, im_id, m_flag = sample['image'], sample['lines_boxes'], sample['neg_lines_boxes'], \
            sample['dots'], sample['id'], sample['m_flag']

        W, H = image.size

        new_H = 16 * int(H / 16)
        new_W = 16 * int(W / 16)
        scale_factor_h = float(new_H) / H
        scale_factor_w = float(new_W) / W
        resized_image = transforms.Resize((new_H, new_W))(image)
        resized_image = TTensor(resized_image)
        resized_density = np.zeros((new_H, new_W), dtype='float32')

        # Augmentation probability
        aug_flag = self.do_aug
        mosaic_flag = random.random() < 0.25

        if aug_flag:
            # Gaussian noise
            noise = np.random.normal(0, 0.1, resized_image.size())
            noise = torch.from_numpy(noise)
            re_image = resized_image + noise
            re_image = torch.clamp(re_image, 0, 1)

            # Color jitter and Gaussian blur
            re_image = Augmentation(re_image)

            # Random affine
            re1_image = re_image.transpose(0, 1).transpose(1, 2).numpy()
            keypoints = []
            for i in range(dots.shape[0]):
                keypoints.append(Keypoint(x=min(new_W - 1, int(dots[i][0] * scale_factor_w)), y=min(new_H - 1, int(dots[i][1] * scale_factor_h))))
            kps = KeypointsOnImage(keypoints, re1_image.shape)

            seq = iaa.Sequential([
                iaa.Affine(
                    rotate=(-15, 15),
                    scale=(0.8, 1.2),
                    shear=(-10, 10),
                    translate_percent={"x": (-0.2, 0.2), "y": (-0.2, 0.2)}
                )
            ])
            re1_image, kps_aug = seq(image=re1_image, keypoints=kps)

            # Produce dot annotation map
            resized_density = np.zeros((resized_density.shape[0], resized_density.shape[1]), dtype='float32')
            for i in range(len(kps.keypoints)):
                if (int(kps_aug.keypoints[i].y) <= new_H - 1 and int(kps_aug.keypoints[i].x) <= new_W - 1) and not \
                        kps_aug.keypoints[i].is_out_of_image(re1_image):
                    resized_density[int(kps_aug.keypoints[i].y)][int(kps_aug.keypoints[i].x)] = 1
            resized_density = torch.from_numpy(resized_density)

            re_image = TTensor(re1_image)

            # Random horizontal flip
            flip_p = random.random()
            if flip_p > 0.5:
                re_image = TF.hflip(re_image)
                resized_density = TF.hflip(resized_density)

            # Random self mosaic
            if mosaic_flag:
                image_array = []
                map_array = []
                blending_l = random.randint(10, 20)
                resize_l = 192 + 2 * blending_l
                if dots.shape[0] >= 70:
                    for i in range(4):
                        length = random.randint(150, 384)
                        start_W = random.randint(0, new_W - length)
                        start_H = random.randint(0, new_H - length)
                        reresized_image1 = TF.crop(resized_image, start_H, start_W, length, length)
                        reresized_image1 = transforms.Resize((resize_l, resize_l))(reresized_image1)
                        reresized_density1 = np.zeros((resize_l, resize_l), dtype='float32')
                        for i in range(dots.shape[0]):
                            if start_H <= min(new_H - 1, int(dots[i][1] * scale_factor_h)) < start_H + length and start_W <= min(new_W - 1, int(dots[i][0] * scale_factor_w)) < start_W + length:
                                reresized_density1[min(resize_l-1, int((min(new_H-1, int(dots[i][1] * scale_factor_h))-start_H)*resize_l/length))][min(resize_l-1, int((min(new_W-1, int(dots[i][0] * scale_factor_w))-start_W)*resize_l/length))] = 1
                        reresized_density1 = torch.from_numpy(reresized_density1)
                        image_array.append(reresized_image1)
                        map_array.append(reresized_density1)
                else:
                    m_flag = 1
                    prob = random.random()
                    if prob > 0.25:
                        gt_pos = random.randint(0, 3)
                    else:
                        gt_pos = random.randint(0, 4)  # 5% 0 objects
                    for i in range(4):
                        if i == gt_pos:
                            Tim_id = im_id
                            r_image = resized_image
                            Tdots = dots
                            new_TH = new_H
                            new_TW = new_W
                            Tscale_factor_w = scale_factor_w
                            Tscale_factor_h = scale_factor_h
                        else:
                            Tim_id = self.train_set[random.randint(0, len(self.train_set) - 1)]
                            Tdots = np.array(self.annotations[Tim_id]['points'])
                            Timage = Image.open('{}/{}'.format(self.im_dir, Tim_id))
                            Timage.load()
                            new_TW = 16 * int(Timage.size[0] / 16)
                            new_TH = 16 * int(Timage.size[1] / 16)
                            Tscale_factor_w = float(new_TW) / Timage.size[0]
                            Tscale_factor_h = float(new_TH) / Timage.size[1]
                            r_image = TTensor(transforms.Resize((new_TH, new_TW))(Timage))

                        length = random.randint(250, 384)
                        start_W = random.randint(0, new_TW - length)
                        start_H = random.randint(0, new_TH - length)
                        r_image1 = TF.crop(r_image, start_H, start_W, length, length)
                        r_image1 = transforms.Resize((resize_l, resize_l))(r_image1)
                        r_density1 = np.zeros((resize_l, resize_l), dtype='float32')
                        # try:
                        #     class_value = self.class_dict[im_id]
                        #     Tim_value = self.class_dict[Tim_id]
                        # except KeyError:
                        #     # Handle the case when the key doesn't exist
                        #     class_value = None  # Or any appropriate default value
                        #     Tim_value = None  # Or any appropriate default value
                        if self.class_dict[im_id] == self.class_dict[Tim_id]:
                            # if class_value == Tim_value:
                            # if im_id in self.class_dict and Tim_id in self.class_dict:
                            #     class_value = self.class_dict[im_id]
                            #     Tim_value = self.class_dict[Tim_id]
                            #     # Proceed with your comparison and processing here
                            #     if class_value == Tim_value:
                            for i in range(Tdots.shape[0]):
                                if start_H <= min(new_TH - 1, int(Tdots[i][1] * Tscale_factor_h)) < start_H + length and start_W <= min(new_TW - 1, int(Tdots[i][0] * Tscale_factor_w)) < start_W + length:
                                    r_density1[min(resize_l-1, int((min(new_TH-1, int(Tdots[i][1] * Tscale_factor_h))-start_H)*resize_l/length))][min(resize_l-1, int((min(new_TW-1, int(Tdots[i][0] * Tscale_factor_w))-start_W)*resize_l/length))] = 1
                        r_density1 = torch.from_numpy(r_density1)
                        image_array.append(r_image1)
                        map_array.append(r_density1)

                reresized_image5 = torch.cat((image_array[0][:, blending_l:resize_l-blending_l], image_array[1][:, blending_l:resize_l-blending_l]), 1)
                reresized_density5 = torch.cat((map_array[0][blending_l:resize_l-blending_l], map_array[1][blending_l:resize_l-blending_l]), 0)
                for i in range(blending_l):
                    reresized_image5[:, 192+i] = image_array[0][:, resize_l-1-blending_l+i] * (blending_l-i)/(2*blending_l) + reresized_image5[:, 192+i] * (i+blending_l)/(2*blending_l)
                    reresized_image5[:, 191-i] = image_array[1][:, blending_l-i] * (blending_l-i)/(2*blending_l) + reresized_image5[:, 191-i] * (i+blending_l)/(2*blending_l)
                reresized_image5 = torch.clamp(reresized_image5, 0, 1)

                reresized_image6 = torch.cat((image_array[2][:, blending_l:resize_l-blending_l], image_array[3][:, blending_l:resize_l-blending_l]), 1)
                reresized_density6 = torch.cat((map_array[2][blending_l:resize_l-blending_l], map_array[3][blending_l:resize_l-blending_l]), 0)
                for i in range(blending_l):
                    reresized_image6[:, 192+i] = image_array[2][:, resize_l-1-blending_l+i] * (blending_l-i)/(2*blending_l) + reresized_image6[:, 192+i] * (i+blending_l)/(2*blending_l)
                    reresized_image6[:, 191-i] = image_array[3][:, blending_l-i] * (blending_l-i)/(2*blending_l) + reresized_image6[:, 191-i] * (i+blending_l)/(2*blending_l)
                reresized_image6 = torch.clamp(reresized_image6, 0, 1)

                reresized_image = torch.cat((reresized_image5[:, :, blending_l:resize_l-blending_l], reresized_image6[:, :, blending_l:resize_l-blending_l]), 2)
                reresized_density = torch.cat((reresized_density5[:, blending_l:resize_l-blending_l], reresized_density6[:, blending_l:resize_l-blending_l]), 1)
                for i in range(blending_l):
                    reresized_image[:, :, 192+i] = reresized_image5[:, :, resize_l-1-blending_l+i] * (blending_l-i)/(2*blending_l) + reresized_image[:, :, 192+i] * (i+blending_l)/(2*blending_l)
                    reresized_image[:, :, 191-i] = reresized_image6[:, :, blending_l-i] * (blending_l-i)/(2*blending_l) + reresized_image[:, :, 191-i] * (i+blending_l)/(2*blending_l)
                reresized_image = torch.clamp(reresized_image, 0, 1)

            else:
                # Random 384*384 crop in a new_W*384 image and 384*new_W density map
                start = random.randint(0, new_W - 1 - 383)
                reresized_image = TF.crop(re_image, 0, start, 384, 384)
                reresized_density = resized_density[:, start:start + 384]

        else:
            # Random 384*384 crop in a new_W*384 image and 384*new_W density map
            for i in range(dots.shape[0]):
                resized_density[min(new_H - 1, int(dots[i][1] * scale_factor_h))] \
                    [min(new_W - 1, int(dots[i][0] * scale_factor_w))] = 1
            resized_density = torch.from_numpy(resized_density)
            start = random.randint(0, new_W - self.max_hw)
            reresized_image = TF.crop(resized_image, 0, start, self.max_hw, self.max_hw)
            reresized_density = resized_density[0:self.max_hw, start:start + self.max_hw]

        # Gaussian distribution density map
        reresized_density = ndimage.gaussian_filter(reresized_density.numpy(), sigma=(1, 1), order=0)

        # Density map scale up
        reresized_density = reresized_density * 60
        reresized_density = torch.from_numpy(reresized_density)

        # Crop bboxes and resize as 64x64
        boxes = list()
        rects = list()
        cnt = 0
        for box in lines_boxes:
            cnt += 1
            if cnt > 3:
                break
            box2 = [int(k) for k in box]
            y1 = int(box2[0] * scale_factor_h)
            x1 = int(box2[1] * scale_factor_w)
            y2 = int(box2[2] * scale_factor_h)
            x2 = int(box2[3] * scale_factor_w)
            # print(y1,x1,y2,x2)
            if not aug_flag:
                rects.append(torch.tensor([y1, max(0, x1-start), y2, min(self.max_hw, x2-start)]))
            bbox = resized_image[:, y1:y2 + 1, x1:x2 + 1]
            bbox = transforms.Resize((64, 64))(bbox)
            boxes.append(bbox)
        boxes = torch.stack(boxes)
        neg_boxes = list()
        neg_rects = list()
        cnt = 0
        for box in neg_lines_boxes:
            cnt += 1
            if cnt > 3:
                break
            box2 = [int(k) for k in box]
            y1 = int(box2[0] * scale_factor_h)
            x1 = int(box2[1] * scale_factor_w)
            y2 = int(box2[2] * scale_factor_h)
            x2 = int(box2[3] * scale_factor_w)
            # print(y1,x1,y2,x2)
            if not aug_flag:
                neg_rects.append(torch.tensor([y1, max(0, x1-start), y2, min(self.max_hw, x2-start)]))
            neg_bbox = resized_image[:, y1:y2 + 1, x1:x2 + 1]
            neg_bbox = transforms.Resize((64, 64))(neg_bbox)
            neg_boxes.append(neg_bbox)
        neg_boxes = torch.stack(neg_boxes)
        # if len(boxes) > 0:
        #     boxes = torch.stack(boxes)  # if boxes is non-empty, run torch.stack as usual
        #     boxes1 = boxes
        # else:
        #     boxes = boxes1
        #     pass
        #     # if boxes is empty, either skip this sample or provide a default bounding box,
        #     # e.g. a default box covering the whole image
        #     default_box = torch.tensor([[0, 0], [0, 0], 0, 0])  # example default box; exact values depend on the application
        #     boxes = default_box.unsqueeze(0)  # add a dimension to match what torch.stack expects
        #     # pass
        if aug_flag:
            pos = torch.tensor([])
        else:
            pos = torch.stack(rects)

        # boxes shape [3,3,64,64], image shape [3,384,384], density shape [384,384]
        sample = {'image': reresized_image, 'boxes': boxes, 'neg_boxes': neg_boxes, 'pos': pos, 'gt_density': reresized_density, 'm_flag': m_flag}

        return sample


class ResizeValImage(ResizeSomeImage):
    def __init__(self, args, MAX_HW=384):
        super().__init__(args)
        self.max_hw = MAX_HW

    def __call__(self, sample):
        image, dots, m_flag, lines_boxes, neg_lines_boxes = sample['image'], sample['dots'], sample['m_flag'], sample['lines_boxes'], sample['neg_lines_boxes']

        W, H = image.size

        new_H = new_W = self.max_hw
        scale_factor_h = float(new_H) / H
        scale_factor_w = float(new_W) / W
        resized_image = transforms.Resize((new_H, new_W))(image)
        resized_image = TTensor(resized_image)

        # Resize density map
        resized_density = np.zeros((new_H, new_W), dtype='float32')
        for i in range(dots.shape[0]):
            resized_density[min(new_H - 1, int(dots[i][1] * scale_factor_h))] \
                [min(new_W - 1, int(dots[i][0] * scale_factor_w))] = 1
        # resized_density = ndimage.gaussian_filter(resized_density, sigma=4, radius=7, order=0)
        resized_density = ndimage.gaussian_filter(resized_density, sigma=4, order=0)
        resized_density = torch.from_numpy(resized_density) * 60

        # Crop bboxes and resize as 64x64
        boxes = list()
        rects = list()
        cnt = 0
        for box in lines_boxes:
            cnt += 1
            if cnt > 3:
                break
            box2 = [int(k) for k in box]
            y1 = int(box2[0] * scale_factor_h)
            x1 = int(box2[1] * scale_factor_w)
            y2 = int(box2[2] * scale_factor_h)
            x2 = int(box2[3] * scale_factor_w)
            rects.append(torch.tensor([y1, x1, y2, x2]))
            bbox = resized_image[:, y1:y2 + 1, x1:x2 + 1]
            bbox = transforms.Resize((64, 64))(bbox)
            boxes.append(bbox)
        boxes = torch.stack(boxes)
        pos = torch.stack(rects)
        neg_boxes = list()
        neg_rects = list()
        cnt = 0
        for box in neg_lines_boxes:
            cnt += 1
            if cnt > 3:
                break
            box2 = [int(k) for k in box]
            y1 = int(box2[0] * scale_factor_h)
            x1 = int(box2[1] * scale_factor_w)
            y2 = int(box2[2] * scale_factor_h)
            x2 = int(box2[3] * scale_factor_w)
            neg_rects.append(torch.tensor([y1, x1, y2, x2]))
            neg_bbox = resized_image[:, y1:y2 + 1, x1:x2 + 1]
            neg_bbox = transforms.Resize((64, 64))(neg_bbox)
            neg_boxes.append(neg_bbox)
        neg_boxes = torch.stack(neg_boxes)
        # boxes shape [3,3,64,64], image shape [3,384,384], density shape [384,384]
        sample = {'image': resized_image, 'boxes': boxes, 'neg_boxes': neg_boxes, 'pos': pos, 'gt_density': resized_density, 'm_flag': m_flag}
        return sample


PreTrainNormalize = transforms.Compose([
    transforms.RandomResizedCrop(MAX_HW, scale=(0.2, 1.0), interpolation=3),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    # transforms.Normalize(mean=IM_NORM_MEAN, std=IM_NORM_STD)
])

TTensor = transforms.Compose([
    transforms.ToTensor(),
])

Augmentation = transforms.Compose([
    transforms.ColorJitter(brightness=0.25, contrast=0.15, saturation=0.15, hue=0.15),
    transforms.GaussianBlur(kernel_size=(7, 9))
])

Normalize = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=IM_NORM_MEAN, std=IM_NORM_STD)
])


def transform_train(args: Namespace, do_aug=True):
    return transforms.Compose([ResizeTrainImage(args, MAX_HW, do_aug)])


def transform_val(args: Namespace):
    return transforms.Compose([ResizeValImage(args, MAX_HW)])


def transform_pre_train(args: Namespace):
    return transforms.Compose([ResizePreTrainImage(args, MAX_HW)])
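A hedged sketch of wiring these transforms into a training script; it assumes the FSC147 annotation, split and class files exist under the default `--data_path`, since `ResizeSomeImage.__init__` reads them eagerly, and it passes no CLI flags.

```python
from util.FSC147 import get_args_parser, transform_train, transform_val

args = get_args_parser().parse_args([])        # defaults only; real scripts pass CLI flags
train_tf = transform_train(args, do_aug=True)  # noise / jitter / affine / flip / mosaic pipeline
val_tf = transform_val(args)                   # deterministic 384x384 resize + Gaussian density map
```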
util/__pycache__/FSC147.cpython-38.pyc
ADDED
Binary file (15.3 kB).
util/__pycache__/FSC147.cpython-39.pyc
ADDED
Binary file (14.4 kB).
util/__pycache__/FSC147_test.cpython-38.pyc
ADDED
Binary file (16.6 kB).
util/__pycache__/lr_sched.cpython-38.pyc
ADDED
Binary file (628 Bytes).
util/__pycache__/lr_sched.cpython-39.pyc
ADDED
Binary file (628 Bytes).
util/__pycache__/misc.cpython-38.pyc
ADDED
Binary file (19.5 kB).
util/__pycache__/misc.cpython-39.pyc
ADDED
Binary file (19.4 kB).
util/__pycache__/pos_embed.cpython-38.pyc
ADDED
Binary file (2.41 kB).
util/__pycache__/pos_embed.cpython-39.pyc
ADDED
Binary file (2.39 kB).
util/crop.py
ADDED
@@ -0,0 +1,42 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import math

import torch

from torchvision import transforms
from torchvision.transforms import functional as F


class RandomResizedCrop(transforms.RandomResizedCrop):
    """
    RandomResizedCrop for matching TF/TPU implementation: no for-loop is used.
    This may lead to results different with torchvision's version.
    Following BYOL's TF code:
    https://github.com/deepmind/deepmind-research/blob/master/byol/utils/dataset.py#L206
    """
    @staticmethod
    def get_params(img, scale, ratio):
        width, height = F._get_image_size(img)
        area = height * width

        target_area = area * torch.empty(1).uniform_(scale[0], scale[1]).item()
        log_ratio = torch.log(torch.tensor(ratio))
        aspect_ratio = torch.exp(
            torch.empty(1).uniform_(log_ratio[0], log_ratio[1])
        ).item()

        w = int(round(math.sqrt(target_area * aspect_ratio)))
        h = int(round(math.sqrt(target_area / aspect_ratio)))

        w = min(w, width)
        h = min(h, height)

        i = torch.randint(0, height - h + 1, size=(1,)).item()
        j = torch.randint(0, width - w + 1, size=(1,)).item()

        return i, j, h, w
util/datasets.py
ADDED
@@ -0,0 +1,65 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# References:
# DeiT: https://github.com/facebookresearch/deit
# --------------------------------------------------------

import os
import PIL

from torchvision import datasets, transforms

from timm.data import create_transform
from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD


def build_dataset(is_train, args):
    transform = build_transform(is_train, args)

    root = os.path.join(args.data_path, 'train' if is_train else 'val')
    dataset = datasets.ImageFolder(root, transform=transform)

    print(dataset)

    return dataset


def build_transform(is_train, args):
    mean = IMAGENET_DEFAULT_MEAN
    std = IMAGENET_DEFAULT_STD
    # train transform
    if is_train:
        # this should always dispatch to transforms_imagenet_train
        transform = create_transform(
            input_size=args.input_size,
            is_training=True,
            color_jitter=args.color_jitter,
            auto_augment=args.aa,
            interpolation='bicubic',
            re_prob=args.reprob,
            re_mode=args.remode,
            re_count=args.recount,
            mean=mean,
            std=std,
        )
        return transform

    # eval transform
    t = []
    if args.input_size <= 224:
        crop_pct = 224 / 256
    else:
        crop_pct = 1.0
    size = int(args.input_size / crop_pct)
    t.append(
        transforms.Resize(size, interpolation=PIL.Image.BICUBIC),  # to maintain same ratio w.r.t. 224 images
    )
    t.append(transforms.CenterCrop(args.input_size))

    t.append(transforms.ToTensor())
    t.append(transforms.Normalize(mean, std))
    return transforms.Compose(t)
util/lars.py
ADDED
@@ -0,0 +1,47 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# LARS optimizer, implementation from MoCo v3:
# https://github.com/facebookresearch/moco-v3
# --------------------------------------------------------

import torch


class LARS(torch.optim.Optimizer):
    """
    LARS optimizer, no rate scaling or weight decay for parameters <= 1D.
    """
    def __init__(self, params, lr=0, weight_decay=0, momentum=0.9, trust_coefficient=0.001):
        defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, trust_coefficient=trust_coefficient)
        super().__init__(params, defaults)

    @torch.no_grad()
    def step(self):
        for g in self.param_groups:
            for p in g['params']:
                dp = p.grad

                if dp is None:
                    continue

                if p.ndim > 1:  # if not normalization gamma/beta or bias
                    dp = dp.add(p, alpha=g['weight_decay'])
                    param_norm = torch.norm(p)
                    update_norm = torch.norm(dp)
                    one = torch.ones_like(param_norm)
                    q = torch.where(param_norm > 0.,
                                    torch.where(update_norm > 0,
                                                (g['trust_coefficient'] * param_norm / update_norm), one),
                                    one)
                    dp = dp.mul(q)

                param_state = self.state[p]
                if 'mu' not in param_state:
                    param_state['mu'] = torch.zeros_like(p)
                mu = param_state['mu']
                mu.mul_(g['momentum']).add_(dp)
                p.add_(mu, alpha=-g['lr'])
util/lr_decay.py
ADDED
@@ -0,0 +1,76 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# References:
# ELECTRA https://github.com/google-research/electra
# BEiT: https://github.com/microsoft/unilm/tree/master/beit
# --------------------------------------------------------

import json


def param_groups_lrd(model, weight_decay=0.05, no_weight_decay_list=[], layer_decay=.75):
    """
    Parameter groups for layer-wise lr decay
    Following BEiT: https://github.com/microsoft/unilm/blob/master/beit/optim_factory.py#L58
    """
    param_group_names = {}
    param_groups = {}

    num_layers = len(model.blocks) + 1

    layer_scales = list(layer_decay ** (num_layers - i) for i in range(num_layers + 1))

    for n, p in model.named_parameters():
        if not p.requires_grad:
            continue

        # no decay: all 1D parameters and model specific ones
        if p.ndim == 1 or n in no_weight_decay_list:
            g_decay = "no_decay"
            this_decay = 0.
        else:
            g_decay = "decay"
            this_decay = weight_decay

        layer_id = get_layer_id_for_vit(n, num_layers)
        group_name = "layer_%d_%s" % (layer_id, g_decay)

        if group_name not in param_group_names:
            this_scale = layer_scales[layer_id]

            param_group_names[group_name] = {
                "lr_scale": this_scale,
                "weight_decay": this_decay,
                "params": [],
            }
            param_groups[group_name] = {
                "lr_scale": this_scale,
                "weight_decay": this_decay,
                "params": [],
            }

        param_group_names[group_name]["params"].append(n)
        param_groups[group_name]["params"].append(p)

    # print("parameter groups: \n%s" % json.dumps(param_group_names, indent=2))

    return list(param_groups.values())


def get_layer_id_for_vit(name, num_layers):
    """
    Assign a parameter with its layer id
    Following BEiT: https://github.com/microsoft/unilm/blob/master/beit/optim_factory.py#L33
    """
    if name in ['cls_token', 'pos_embed']:
        return 0
    elif name.startswith('patch_embed'):
        return 0
    elif name.startswith('blocks'):
        return int(name.split('.')[1]) + 1
    else:
        return num_layers
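A small sketch (not from this commit) of feeding these groups to AdamW; the only assumption is a timm-style ViT exposing `.blocks`. The extra `lr_scale` key in each group is ignored by the optimizer itself and is consumed later by `lr_sched.adjust_learning_rate`.

```python
import torch
import timm
from util.lr_decay import param_groups_lrd

vit = timm.create_model('vit_base_patch16_224', pretrained=False)
groups = param_groups_lrd(vit, weight_decay=0.05,
                          no_weight_decay_list=['cls_token', 'pos_embed'],
                          layer_decay=0.75)
optimizer = torch.optim.AdamW(groups, lr=1e-3)  # per-group lr_scale rides along in each dict
```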
util/lr_sched.py
ADDED
@@ -0,0 +1,21 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import math


def adjust_learning_rate(optimizer, epoch, args):
    """Decay the learning rate with half-cycle cosine after warmup"""
    if epoch < args.warmup_epochs:
        lr = args.lr * epoch / args.warmup_epochs
    else:
        lr = args.min_lr + (args.lr - args.min_lr) * 0.5 * \
            (1. + math.cos(math.pi * (epoch - args.warmup_epochs) / (args.epochs - args.warmup_epochs)))
    for param_group in optimizer.param_groups:
        if "lr_scale" in param_group:
            param_group["lr"] = lr * param_group["lr_scale"]
        else:
            param_group["lr"] = lr
    return lr
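A minimal, self-contained sketch of the warmup-then-cosine schedule above; the hyperparameters mirror the FSC147 parser defaults, and the single-parameter SGD optimizer exists only to show the per-group update.

```python
from argparse import Namespace
import torch
from util.lr_sched import adjust_learning_rate

args = Namespace(lr=1e-3, min_lr=0.0, warmup_epochs=10, epochs=200)
optimizer = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=args.lr)
for epoch in (0, 5, 10, 100, 199):
    print(epoch, adjust_learning_rate(optimizer, epoch, args))  # linear warmup, then half-cycle cosine
```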
util/misc.py
ADDED
@@ -0,0 +1,624 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2 |
+
# All rights reserved.
|
3 |
+
|
4 |
+
# This source code is licensed under the license found in the
|
5 |
+
# LICENSE file in the root directory of this source tree.
|
6 |
+
# --------------------------------------------------------
|
7 |
+
# References:
|
8 |
+
# DeiT: https://github.com/facebookresearch/deit
|
9 |
+
# BEiT: https://github.com/microsoft/unilm/tree/master/beit
|
10 |
+
# --------------------------------------------------------
|
11 |
+
|
12 |
+
import builtins
|
13 |
+
import datetime
|
14 |
+
import os
|
15 |
+
import time
|
16 |
+
import json
|
17 |
+
from collections import defaultdict, deque
|
18 |
+
from pathlib import Path
|
19 |
+
# from typing import Union
|
20 |
+
|
21 |
+
import pandas as pd
|
22 |
+
import torch
|
23 |
+
import torch.distributed as dist
|
24 |
+
import wandb
|
25 |
+
# from torch._six import inf
|
26 |
+
from torch import inf
|
27 |
+
import matplotlib.pyplot as plt
|
28 |
+
from torchvision import transforms
|
29 |
+
import cv2
|
30 |
+
from tqdm import tqdm
|
31 |
+
from typing import Union, List
|
32 |
+
|
33 |
+
class SmoothedValue(object):
|
34 |
+
"""Track a series of values and provide access to smoothed values over a
|
35 |
+
window or the global series average.
|
36 |
+
"""
|
37 |
+
|
38 |
+
def __init__(self, window_size=20, fmt=None):
|
39 |
+
if fmt is None:
|
40 |
+
fmt = "{median:.4f} ({global_avg:.4f})"
|
41 |
+
self.deque = deque(maxlen=window_size)
|
42 |
+
self.total = 0.0
|
43 |
+
self.count = 0
|
44 |
+
self.fmt = fmt
|
45 |
+
|
46 |
+
def update(self, value, n=1):
|
47 |
+
self.deque.append(value)
|
48 |
+
self.count += n
|
49 |
+
self.total += value * n
|
50 |
+
|
51 |
+
def synchronize_between_processes(self):
|
52 |
+
"""
|
53 |
+
Warning: does not synchronize the deque!
|
54 |
+
"""
|
55 |
+
if not is_dist_avail_and_initialized():
|
56 |
+
return
|
57 |
+
t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda')
|
58 |
+
dist.barrier()
|
59 |
+
dist.all_reduce(t)
|
60 |
+
t = t.tolist()
|
61 |
+
self.count = int(t[0])
|
62 |
+
self.total = t[1]
|
63 |
+
|
64 |
+
@property
|
65 |
+
def median(self):
|
66 |
+
d = torch.tensor(list(self.deque))
|
67 |
+
return d.median().item()
|
68 |
+
|
69 |
+
@property
|
70 |
+
def avg(self):
|
71 |
+
d = torch.tensor(list(self.deque), dtype=torch.float32)
|
72 |
+
return d.mean().item()
|
73 |
+
|
74 |
+
@property
|
75 |
+
def global_avg(self):
|
76 |
+
if self.count == 0:
|
77 |
+
# Return a default value or handle the zero count scenario
|
78 |
+
return 0 # Or any other default value or handling mechanism
|
79 |
+
else:
|
80 |
+
return self.total / self.count
|
81 |
+
# return self.total / self.count
|
82 |
+
|
83 |
+
@property
|
84 |
+
def max(self):
|
85 |
+
return max(self.deque)
|
86 |
+
|
87 |
+
@property
|
88 |
+
def value(self):
|
89 |
+
return self.deque[-1]
|
90 |
+
|
91 |
+
def __str__(self):
|
92 |
+
return self.fmt.format(
|
93 |
+
median=self.median,
|
94 |
+
avg=self.avg,
|
95 |
+
global_avg=self.global_avg,
|
96 |
+
max=self.max,
|
97 |
+
value=self.value)
|
98 |
+
|
99 |
+
|
100 |
+
class MetricLogger(object):
|
101 |
+
def __init__(self, delimiter="\t"):
|
102 |
+
self.meters = defaultdict(SmoothedValue)
|
103 |
+
self.delimiter = delimiter
|
104 |
+
|
105 |
+
def update(self, **kwargs):
|
106 |
+
for k, v in kwargs.items():
|
107 |
+
if v is None:
|
108 |
+
continue
|
109 |
+
if isinstance(v, torch.Tensor):
|
110 |
+
v = v.item()
|
111 |
+
assert isinstance(v, (float, int))
|
112 |
+
self.meters[k].update(v)
|
113 |
+
|
114 |
+
def __getattr__(self, attr):
|
115 |
+
if attr in self.meters:
|
116 |
+
return self.meters[attr]
|
117 |
+
if attr in self.__dict__:
|
118 |
+
return self.__dict__[attr]
|
119 |
+
raise AttributeError("'{}' object has no attribute '{}'".format(
|
120 |
+
type(self).__name__, attr))
|
121 |
+
|
122 |
+
def __str__(self):
|
123 |
+
loss_str = []
|
124 |
+
for name, meter in self.meters.items():
|
125 |
+
loss_str.append(
|
126 |
+
"{}: {}".format(name, str(meter))
|
127 |
+
)
|
128 |
+
return self.delimiter.join(loss_str)
|
129 |
+
|
130 |
+
def synchronize_between_processes(self):
|
131 |
+
for meter in self.meters.values():
|
132 |
+
meter.synchronize_between_processes()
|
133 |
+
|
134 |
+
def add_meter(self, name, meter):
|
135 |
+
self.meters[name] = meter
|
136 |
+
|
137 |
+
def log_every(self, iterable, print_freq, header=None):
|
138 |
+
i = 0
|
139 |
+
if not header:
|
140 |
+
header = ''
|
141 |
+
start_time = time.time()
|
142 |
+
end = time.time()
|
143 |
+
iter_time = SmoothedValue(fmt='{avg:.4f}')
|
144 |
+
data_time = SmoothedValue(fmt='{avg:.4f}')
|
145 |
+
space_fmt = ':' + str(len(str(len(iterable)))) + 'd'
|
146 |
+
log_msg = [
|
147 |
+
header,
|
148 |
+
'[{0' + space_fmt + '}/{1}]',
|
149 |
+
'eta: {eta}',
|
150 |
+
'{meters}',
|
151 |
+
'time: {time}',
|
152 |
+
'data: {data}'
|
153 |
+
]
|
154 |
+
if torch.cuda.is_available():
|
155 |
+
log_msg.append('max mem: {memory:.0f}')
|
156 |
+
log_msg = self.delimiter.join(log_msg)
|
157 |
+
MB = 1024.0 * 1024.0
|
158 |
+
for obj in iterable:
|
159 |
+
data_time.update(time.time() - end)
|
160 |
+
yield obj
|
161 |
+
iter_time.update(time.time() - end)
|
162 |
+
if i % print_freq == 0 or i == len(iterable) - 1:
|
163 |
+
eta_seconds = iter_time.global_avg * (len(iterable) - i)
|
164 |
+
eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
|
165 |
+
if torch.cuda.is_available():
|
166 |
+
print(log_msg.format(
|
167 |
+
i, len(iterable), eta=eta_string,
|
168 |
+
meters=str(self),
|
169 |
+
time=str(iter_time), data=str(data_time),
|
170 |
+
memory=torch.cuda.max_memory_allocated() / MB))
|
171 |
+
else:
|
172 |
+
print(log_msg.format(
|
173 |
+
i, len(iterable), eta=eta_string,
|
174 |
+
meters=str(self),
|
175 |
+
time=str(iter_time), data=str(data_time)))
|
176 |
+
i += 1
|
177 |
+
end = time.time()
|
178 |
+
total_time = time.time() - start_time
|
179 |
+
total_time_str = str(datetime.timedelta(seconds=int(total_time)))
|
180 |
+
print('{} Total time: {} ({:.4f} s / it)'.format(
|
181 |
+
header, total_time_str, total_time / len(iterable)))
|
182 |
+
|
183 |
+
|
184 |
+
def setup_for_distributed(is_master):
    """
    Disable printing when not in the master process.
    """
    builtin_print = builtins.print

    def print(*args, **kwargs):
        force = kwargs.pop('force', False)
        force = force or (get_world_size() > 8)
        if is_master or force:
            now = datetime.datetime.now().time()
            builtin_print('[{}] '.format(now), end='')  # print with time stamp
            builtin_print(*args, **kwargs)

    builtins.print = print


def is_dist_avail_and_initialized():
    if not dist.is_available():
        return False
    if not dist.is_initialized():
        return False
    return True


def get_world_size():
    if not is_dist_avail_and_initialized():
        return 1
    return dist.get_world_size()


def get_rank():
    if not is_dist_avail_and_initialized():
        return 0
    return dist.get_rank()


def is_main_process():
    return get_rank() == 0


def save_on_master(*args, **kwargs):
    if is_main_process():
        torch.save(*args, **kwargs)

def init_distributed_mode(args):
    if args.dist_on_itp:
        args.rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
        args.world_size = int(os.environ['OMPI_COMM_WORLD_SIZE'])
        args.gpu = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
        args.dist_url = "tcp://%s:%s" % (os.environ['MASTER_ADDR'], os.environ['MASTER_PORT'])
        os.environ['LOCAL_RANK'] = str(args.gpu)
        os.environ['RANK'] = str(args.rank)
        os.environ['WORLD_SIZE'] = str(args.world_size)
        # ["RANK", "WORLD_SIZE", "MASTER_ADDR", "MASTER_PORT", "LOCAL_RANK"]
    elif 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
        args.rank = int(os.environ["RANK"])
        args.world_size = int(os.environ['WORLD_SIZE'])
        args.gpu = int(os.environ['LOCAL_RANK'])
    elif 'SLURM_PROCID' in os.environ:
        args.rank = int(os.environ['SLURM_PROCID'])
        args.gpu = args.rank % torch.cuda.device_count()
    else:
        print('Not using distributed mode')
        setup_for_distributed(is_master=True)  # hack
        args.distributed = False
        return

    args.distributed = True

    torch.cuda.set_device(args.gpu)
    args.dist_backend = 'nccl'
    print('| distributed init (rank {}): {}, gpu {}'.format(
        args.rank, args.dist_url, args.gpu), flush=True)
    torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                         world_size=args.world_size, rank=args.rank)
    torch.distributed.barrier()
    setup_for_distributed(args.rank == 0)

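A minimal launch sketch for the helper above; the argparse flags mirror the attributes it reads (dist_on_itp, dist_url) and the script name is a placeholder. Under a launcher such as torchrun the RANK/WORLD_SIZE/LOCAL_RANK branch is taken; without those variables it falls back to single-process mode:

import argparse
from util.misc import init_distributed_mode

parser = argparse.ArgumentParser()
parser.add_argument('--dist_on_itp', action='store_true')
parser.add_argument('--dist_url', default='env://')
args = parser.parse_args()

# e.g. `torchrun --nproc_per_node=4 train.py` sets RANK/WORLD_SIZE/LOCAL_RANK;
# without them the function prints "Not using distributed mode" and returns.
init_distributed_mode(args)
print("distributed:", args.distributed)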
class NativeScalerWithGradNormCount:
    state_dict_key = "amp_scaler"

    def __init__(self):
        self._scaler = torch.cuda.amp.GradScaler()

    def __call__(self, loss, optimizer, clip_grad=None, parameters=None, create_graph=False, update_grad=True):
        self._scaler.scale(loss).backward(create_graph=create_graph)
        if update_grad:
            if clip_grad is not None:
                assert parameters is not None
                self._scaler.unscale_(optimizer)  # unscale the gradients of optimizer's assigned params in-place
                norm = torch.nn.utils.clip_grad_norm_(parameters, clip_grad)
            else:
                self._scaler.unscale_(optimizer)
                norm = get_grad_norm_(parameters)
            self._scaler.step(optimizer)
            self._scaler.update()
        else:
            norm = None
        return norm

    def state_dict(self):
        return self._scaler.state_dict()

    def load_state_dict(self, state_dict):
        self._scaler.load_state_dict(state_dict)


def get_grad_norm_(parameters, norm_type: float = 2.0) -> torch.Tensor:
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    parameters = [p for p in parameters if p.grad is not None]
    norm_type = float(norm_type)
    if len(parameters) == 0:
        return torch.tensor(0.)
    device = parameters[0].grad.device
    if norm_type == inf:
        total_norm = max(p.grad.detach().abs().max().to(device) for p in parameters)
    else:
        total_norm = torch.norm(torch.stack([torch.norm(p.grad.detach(), norm_type).to(device) for p in parameters]), norm_type)
    return total_norm

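A sketch of a single AMP optimisation step using the scaler above; model, samples, and optimizer are placeholders, and the model is assumed to return a scalar loss:

import torch
from util.misc import NativeScalerWithGradNormCount

loss_scaler = NativeScalerWithGradNormCount()

def amp_step(model, samples, optimizer, max_norm=None):
    with torch.cuda.amp.autocast():
        loss = model(samples)  # assumed to be a scalar loss
    # backward + optional clipping + optimizer step; returns the gradient norm
    grad_norm = loss_scaler(loss, optimizer, clip_grad=max_norm,
                            parameters=model.parameters(), update_grad=True)
    optimizer.zero_grad()
    return loss.item(), grad_norm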
def save_model(args, epoch, model, model_without_ddp, optimizer, loss_scaler, suffix="", upload=True):
    if suffix:
        suffix = f"__{suffix}"
    output_dir = Path(args.output_dir)
    ckpt_name = f"checkpoint{suffix}.pth"
    if loss_scaler is not None:
        checkpoint_paths = [output_dir / ckpt_name]
        for checkpoint_path in checkpoint_paths:
            to_save = {
                'model': model_without_ddp.state_dict(),
                'optimizer': optimizer.state_dict(),
                'epoch': epoch,
                'scaler': loss_scaler.state_dict(),
                'args': args,
            }
            save_on_master(to_save, checkpoint_path)
            if upload and is_main_process():
                log_wandb_model(f"checkpoint{suffix}", checkpoint_path, epoch)
                print("checkpoint sent to W&B (if)")
    else:
        client_state = {'epoch': epoch}
        model.save_checkpoint(save_dir=args.output_dir, tag=ckpt_name, client_state=client_state)
        if upload and is_main_process():
            log_wandb_model(f"checkpoint{suffix}", output_dir / ckpt_name, epoch)
            print("checkpoint sent to W&B (else)")


def log_wandb_model(title, path, epoch):
    artifact = wandb.Artifact(title, type="model")
    artifact.add_file(path)
    artifact.metadata["epoch"] = epoch
    wandb.log_artifact(artifact_or_path=artifact, name=title)

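A minimal checkpointing sketch for save_model; the tiny model and output directory are placeholders, and upload=False skips the W&B artifact step so no wandb run is required:

import argparse
from pathlib import Path
import torch
from util.misc import save_model, NativeScalerWithGradNormCount

args = argparse.Namespace(output_dir="./output_dir")
Path(args.output_dir).mkdir(parents=True, exist_ok=True)

model = torch.nn.Linear(4, 1)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
loss_scaler = NativeScalerWithGradNormCount()

# writes ./output_dir/checkpoint__demo.pth on the main process only
save_model(args, epoch=0, model=model, model_without_ddp=model,
           optimizer=optimizer, loss_scaler=loss_scaler, suffix="demo", upload=False)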
def load_model(args, model_without_ddp, optimizer, loss_scaler):
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')

        if 'pos_embed' in checkpoint['model'] and checkpoint['model']['pos_embed'].shape != model_without_ddp.state_dict()['pos_embed'].shape:
            print("Removing key pos_embed from pretrained checkpoint")
            del checkpoint['model']['pos_embed']

        if 'decoder_pos_embed' in checkpoint['model'] and checkpoint['model']['decoder_pos_embed'].shape != model_without_ddp.state_dict()['decoder_pos_embed'].shape:
            print("Removing key decoder_pos_embed from pretrained checkpoint")
            del checkpoint['model']['decoder_pos_embed']

        model_without_ddp.load_state_dict(checkpoint['model'], strict=False)
        print("Resume checkpoint %s" % args.resume)
        if 'optimizer' in checkpoint and 'epoch' in checkpoint and not (hasattr(args, 'eval') and args.eval):
            optimizer.load_state_dict(checkpoint['optimizer'])
            args.start_epoch = checkpoint['epoch'] + 1
            if 'scaler' in checkpoint:
                loss_scaler.load_state_dict(checkpoint['scaler'])
            print("With optim & sched!")

def load_model_FSC(args, model_without_ddp):
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')

        if 'pos_embed' in checkpoint['model'] and checkpoint['model']['pos_embed'].shape != model_without_ddp.state_dict()['pos_embed'].shape:
            print("Removing key pos_embed from pretrained checkpoint")
            del checkpoint['model']['pos_embed']

        model_without_ddp.load_state_dict(checkpoint['model'], strict=False)
        print(f"Resume checkpoint {args.resume} ({checkpoint['epoch']})")


def load_model_FSC1(args, model_without_ddp):
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        # model = timm.create_model('vit_base_patch16_224', pretrained=True)
        # torch.save(model.state_dict(), './output_abnopre_dir/checkpoint-6657.pth')
        checkpoint1 = torch.load('./output_abnopre_dir/checkpoint-6657.pth', map_location='cpu')

        if 'pos_embed' in checkpoint['model'] and checkpoint['model']['pos_embed'].shape != model_without_ddp.state_dict()['pos_embed'].shape:
            print("Removing key pos_embed from pretrained checkpoint")
            del checkpoint['model']['pos_embed']

        del checkpoint1['cls_token'], checkpoint1['pos_embed']

        model_without_ddp.load_state_dict(checkpoint['model'], strict=False)
        model_without_ddp.load_state_dict(checkpoint1, strict=False)
        print("Resume checkpoint %s" % args.resume)


def load_model_FSC_full(args, model_without_ddp, optimizer, loss_scaler):
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')

        if 'pos_embed' in checkpoint['model'] and checkpoint['model']['pos_embed'].shape != \
                model_without_ddp.state_dict()['pos_embed'].shape:
            print("Removing key pos_embed from pretrained checkpoint")
            del checkpoint['model']['pos_embed']

        model_without_ddp.load_state_dict(checkpoint['model'], strict=False)
        print("Resume checkpoint %s" % args.resume)

        if 'optimizer' in checkpoint and 'epoch' in checkpoint and args.do_resume:
            optimizer.load_state_dict(checkpoint['optimizer'])
            args.start_epoch = checkpoint['epoch'] + 1
            if 'scaler' in checkpoint:
                loss_scaler.load_state_dict(checkpoint['scaler'])
            print("With optim & scheduler!")

def all_reduce_mean(x):
    world_size = get_world_size()
    if world_size > 1:
        x_reduce = torch.tensor(x).cuda()
        dist.all_reduce(x_reduce)
        x_reduce /= world_size
        return x_reduce.item()
    else:
        return x

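A small sketch of all_reduce_mean: each worker passes its local value and gets back the mean over all processes; a single-process run returns the value unchanged:

from util.misc import all_reduce_mean

loss_value = 0.42  # placeholder per-process metric
print(all_reduce_mean(loss_value))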
def plot_counts(res_csv: Union[str, List[str]], output_dir: str, suffix: str = "", smooth: bool = False):
    if suffix:
        suffix = f"_{suffix}"
    if smooth:
        suffix = f"_smooth{suffix}"
    if isinstance(res_csv, str):
        res_csv = [res_csv]

    plt.figure(figsize=(15, 5))

    for res in res_csv:
        name = Path(res).parent.name
        df = pd.read_csv(res)
        print(df)

        df.sort_values(by="name", inplace=True)
        df.reset_index(drop=True, inplace=True)
        df.index += 1
        print(df)

        if smooth:
            time_arr = df.index[5:-5]
            smooth_pred_mean = df['prediction'].iloc[5:-5].rolling(25).mean()
            smooth_pred_std = df['prediction'].iloc[5:-5].rolling(25).std()
            plt.plot(time_arr, smooth_pred_mean, label=name)
            plt.fill_between(time_arr, smooth_pred_mean + smooth_pred_std, smooth_pred_mean - smooth_pred_std, alpha=.2)
            plt.xlabel('Frame')
            plt.ylabel('Count')
        else:
            plt.plot(df.index, df['prediction'], label=name)

    plt.legend()
    plt.savefig(os.path.join(output_dir, f'counts{suffix}.png'), dpi=300)

def write_zeroshot_annotations(p: Path):
    with open(p / 'annotations.json', 'a') as split:
        split.write('{\n')
        for img in p.iterdir():
            if img.is_file():
                split.write(f' "{img.name}": {{\n' \
                            ' "H": 960,\n' \
                            ' "W": 1280,\n' \
                            ' "box_examples_coordinates": [],\n' \
                            ' "points": []\n' \
                            ' },\n')
        split.write("}")

    with open(p / 'split.json', 'a') as split:
        split.write('{\n "test":\n [\n')
        for img in p.iterdir():
            if img.is_file():
                split.write(f' "{img.name}",\n')
        split.write(" ]\n}")

def make_grid(imgs, h, w):
    assert len(imgs) == 9
    rows = []
    for i in range(0, 9, 3):
        row = torch.cat((imgs[i], imgs[i + 1], imgs[i + 2]), -1)
        rows += [row]
    grid = torch.cat((rows[0], rows[1], rows[2]), 0)
    grid = transforms.Resize((h, w))(grid.unsqueeze(0))
    return grid.squeeze(0)


def min_max(t):
    t_shape = t.shape
    t = t.view(t_shape[0], -1)
    t -= t.min(1, keepdim=True)[0]
    t /= t.max(1, keepdim=True)[0]
    t = t.view(*t_shape)
    return t


def min_max_np(v, new_min=0, new_max=1):
    v_min, v_max = v.min(), v.max()
    return (v - v_min) / (v_max - v_min) * (new_max - new_min) + new_min

def get_box_map(sample, pos, device, external=False):
    box_map = torch.zeros([sample.shape[1], sample.shape[2]], device=device)
    if external is False:
        for rect in pos:
            for i in range(rect[2] - rect[0]):
                box_map[min(rect[0] + i, sample.shape[1] - 1), min(rect[1], sample.shape[2] - 1)] = 10
                box_map[min(rect[0] + i, sample.shape[1] - 1), min(rect[3], sample.shape[2] - 1)] = 10
            for i in range(rect[3] - rect[1]):
                box_map[min(rect[0], sample.shape[1] - 1), min(rect[1] + i, sample.shape[2] - 1)] = 10
                box_map[min(rect[2], sample.shape[1] - 1), min(rect[1] + i, sample.shape[2] - 1)] = 10
        box_map = box_map.unsqueeze(0).repeat(3, 1, 1)
    return box_map


timerfunc = time.perf_counter

class measure_time(object):
    def __enter__(self):
        self.start = timerfunc()
        return self

    def __exit__(self, typ, value, traceback):
        self.duration = timerfunc() - self.start

    def __add__(self, other):
        return self.duration + other.duration

    def __sub__(self, other):
        return self.duration - other.duration

    def __str__(self):
        return str(self.duration)

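A sketch of the timing context manager above; the elapsed wall-clock time ends up in .duration, and two timers can be added or subtracted:

import torch
from util.misc import measure_time

with measure_time() as t:
    torch.randn(512, 512) @ torch.randn(512, 512)
print("matmul took", t, "s")  # __str__ prints t.duration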
def log_test_results(test_dir):
    test_dir = Path(test_dir)
    logs = []
    for d in test_dir.iterdir():
        if d.is_dir() and (d / "log.txt").exists():
            print(d.name)
            with open(d / "log.txt") as f:
                last = f.readlines()[-1]
            j = json.loads(last)
            j['name'] = d.name
            logs.append(j)
    df = pd.DataFrame(logs)

    df.sort_values('name', inplace=True, ignore_index=True)
    cols = list(df.columns)
    cols = cols[-1:] + cols[:-1]
    df = df[cols]

    df.to_csv(test_dir / "logs.csv", index=False)

COLORS = {
    'muted blue': '#1f77b4',
    'safety orange': '#ff7f0e',
    'cooked asparagus green': '#2ca02c',
    'brick red': '#d62728',
    'muted purple': '#9467bd',
    'chestnut brown': '#8c564b',
    'raspberry yogurt pink': '#e377c2',
    'middle gray': '#7f7f7f',
    'curry yellow-green': '#bcbd22',
    'blue-teal': '#17becf',
    'muted blue light': '#419ede',
    'safety orange light': '#ffa85b',
    'cooked asparagus green light': '#4bce4b',
    'brick red light': '#e36667'
}

def plot_test_results(test_dir):
    import plotly.graph_objects as go

    test_dir = Path(test_dir)
    df = pd.read_csv(test_dir / "logs.csv")
    df.sort_values('name', inplace=True)

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=df['name'], y=df['MAE'], line_color=COLORS['muted blue'],
                             mode='lines', name='MAE'))
    fig.add_trace(go.Scatter(x=df['name'], y=df['RMSE'], line_color=COLORS['safety orange'],
                             mode='lines', name='RMSE'))
    fig.add_trace(go.Scatter(x=df['name'], y=df['NAE'], line_color=COLORS['cooked asparagus green'],
                             mode='lines', name='NAE'))

    fig.update_yaxes(type="log")
    fig.write_image(test_dir / "plot.jpeg", scale=4)
    fig.write_html(test_dir / "plot.html", auto_open=False)

def frames2vid(input_dir: str, output_file: str, pattern: str, fps: int, h=720, w=1280):
    input_dir = Path(input_dir)
    files = sorted(input_dir.glob(pattern))
    video_file = cv2.VideoWriter(output_file, cv2.VideoWriter_fourcc(*'mp4v'), fps, (w, h))
    for img in tqdm(files, total=len(files)):
        frame = cv2.imread(str(img))
        frame = cv2.resize(frame, (w, h))
        video_file.write(frame)

    video_file.release()
util/pos_embed.py
ADDED
@@ -0,0 +1,97 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# Position embedding utils
# --------------------------------------------------------

import numpy as np

import torch

# --------------------------------------------------------
# 2D sine-cosine position embedding
# References:
# Transformer: https://github.com/tensorflow/models/blob/master/official/nlp/transformer/model_utils.py
# MoCo v3: https://github.com/facebookresearch/moco-v3
# --------------------------------------------------------
def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
    """
    grid_size: int of the grid height and width
    return:
    pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
    """
    grid_h = np.arange(grid_size, dtype=np.float32)
    grid_w = np.arange(grid_size, dtype=np.float32)
    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
    grid = np.stack(grid, axis=0)

    grid = grid.reshape([2, 1, grid_size, grid_size])
    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
    if cls_token:
        pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
    return pos_embed


def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    assert embed_dim % 2 == 0

    # use half of dimensions to encode grid_h
    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)

    emb = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
    return emb


def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """
    embed_dim: output dimension for each position
    pos: a list of positions to be encoded: size (M,)
    out: (M, D)
    """
    assert embed_dim % 2 == 0
    # omega = np.arange(embed_dim // 2, dtype=np.float)
    omega = np.arange(embed_dim // 2, dtype=np.float32)
    omega /= embed_dim / 2.
    omega = 1. / 10000**omega  # (D/2,)

    pos = pos.reshape(-1)  # (M,)
    out = np.einsum('m,d->md', pos, omega)  # (M, D/2), outer product

    emb_sin = np.sin(out)  # (M, D/2)
    emb_cos = np.cos(out)  # (M, D/2)

    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
    return emb

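A sketch of building the fixed sin-cos table for a 14x14 patch grid (a ViT-B/16 at 224 px, used here only as an illustrative size) and turning it into a tensor:

import torch
from util.pos_embed import get_2d_sincos_pos_embed

embed_dim, grid_size = 768, 14
pos_embed = get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=True)
print(pos_embed.shape)  # (1 + 14 * 14, 768)

# typical use: copy into a frozen positional-embedding parameter of shape (1, N + 1, D)
pos_embed_t = torch.from_numpy(pos_embed).float().unsqueeze(0)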
# --------------------------------------------------------
# Interpolate position embeddings for high-resolution
# References:
# DeiT: https://github.com/facebookresearch/deit
# --------------------------------------------------------
def interpolate_pos_embed(model, checkpoint_model):
    if 'pos_embed' in checkpoint_model:
        pos_embed_checkpoint = checkpoint_model['pos_embed']
        embedding_size = pos_embed_checkpoint.shape[-1]
        num_patches = model.patch_embed.num_patches
        num_extra_tokens = model.pos_embed.shape[-2] - num_patches
        # height (== width) for the checkpoint position embedding
        orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
        # height (== width) for the new position embedding
        new_size = int(num_patches ** 0.5)
        # class_token and dist_token are kept unchanged
        if orig_size != new_size:
            print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size))
            extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
            # only the position tokens are interpolated
            pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
            pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2)
            pos_tokens = torch.nn.functional.interpolate(
                pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False)
            pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
            new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
            checkpoint_model['pos_embed'] = new_pos_embed
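A sketch of resizing a checkpoint's position embedding before loading it into a model built for a different input resolution; the timm models below are stand-ins for any ViT exposing patch_embed.num_patches and pos_embed:

import timm
import torch
from util.pos_embed import interpolate_pos_embed

src = timm.create_model('vit_base_patch16_224', pretrained=False)  # 14x14 grid
dst = timm.create_model('vit_base_patch16_384', pretrained=False)  # 24x24 grid

checkpoint_model = src.state_dict()
interpolate_pos_embed(dst, checkpoint_model)  # rewrites checkpoint_model['pos_embed'] in place
msg = dst.load_state_dict(checkpoint_model, strict=False)
print(dst.pos_embed.shape)  # torch.Size([1, 577, 768])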