-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #26 from okotaku/feat/dino
[Feature] Support DINO
- Loading branch information
Showing
17 changed files
with
459 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
# dataset settings | ||
dataset_type = 'CocoDataset' | ||
data_root = 'data/coco/' | ||
|
||
file_client_args = dict(backend='disk') | ||
|
||
train_pipeline = [ | ||
dict(type='LoadImageFromFile', file_client_args=file_client_args), | ||
dict(type='LoadAnnotations', with_bbox=True), | ||
dict(type='RandomFlip', prob=0.5), | ||
dict( | ||
type='RandomChoice', | ||
transforms=[ | ||
[ | ||
dict( | ||
type='RandomChoiceResize', | ||
scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), | ||
(608, 1333), (640, 1333), (672, 1333), (704, 1333), | ||
(736, 1333), (768, 1333), (800, 1333)], | ||
keep_ratio=True) | ||
], | ||
[ | ||
dict( | ||
type='RandomChoiceResize', | ||
# The radio of all image in train dataset < 7 | ||
# follow the original implement | ||
scales=[(400, 4200), (500, 4200), (600, 4200)], | ||
keep_ratio=True), | ||
dict( | ||
type='RandomCrop', | ||
crop_type='absolute_range', | ||
crop_size=(384, 600), | ||
allow_negative_crop=True), | ||
dict( | ||
type='RandomChoiceResize', | ||
scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), | ||
(608, 1333), (640, 1333), (672, 1333), (704, 1333), | ||
(736, 1333), (768, 1333), (800, 1333)], | ||
keep_ratio=True) | ||
] | ||
]), | ||
dict(type='PackDetInputs') | ||
] | ||
test_pipeline = [ | ||
dict(type='LoadImageFromFile', file_client_args=file_client_args), | ||
dict(type='Resize', scale=(1333, 800), keep_ratio=True), | ||
# If you don't have a gt annotation, delete the pipeline | ||
dict(type='LoadAnnotations', with_bbox=True), | ||
dict( | ||
type='PackDetInputs', | ||
meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', | ||
'scale_factor')) | ||
] | ||
train_dataloader = dict( | ||
batch_size=2, | ||
num_workers=2, | ||
persistent_workers=True, | ||
sampler=dict(type='DefaultSampler', shuffle=True), | ||
batch_sampler=dict(type='AspectRatioBatchSampler'), | ||
dataset=dict( | ||
type=dataset_type, | ||
data_root=data_root, | ||
ann_file='annotations/instances_train2017.json', | ||
data_prefix=dict(img='train2017/'), | ||
filter_cfg=dict(filter_empty_gt=True, min_size=32), | ||
pipeline=train_pipeline)) | ||
val_dataloader = dict( | ||
batch_size=1, | ||
num_workers=2, | ||
persistent_workers=True, | ||
drop_last=False, | ||
sampler=dict(type='DefaultSampler', shuffle=False), | ||
dataset=dict( | ||
type=dataset_type, | ||
data_root=data_root, | ||
ann_file='annotations/instances_val2017.json', | ||
data_prefix=dict(img='val2017/'), | ||
test_mode=True, | ||
pipeline=test_pipeline)) | ||
test_dataloader = val_dataloader | ||
|
||
val_evaluator = dict( | ||
type='CocoMetric', | ||
ann_file=data_root + 'annotations/instances_val2017.json', | ||
metric='bbox', | ||
format_only=False) | ||
test_evaluator = val_evaluator |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
# dataset settings | ||
dataset_type = 'LVISV1Dataset' | ||
data_root = 'data/lvis_v1/' | ||
|
||
file_client_args = dict(backend='disk') | ||
|
||
train_pipeline = [ | ||
dict(type='LoadImageFromFile', file_client_args=file_client_args), | ||
dict(type='LoadAnnotations', with_bbox=True), | ||
dict(type='RandomFlip', prob=0.5), | ||
dict( | ||
type='RandomChoice', | ||
transforms=[ | ||
[ | ||
dict( | ||
type='RandomChoiceResize', | ||
scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), | ||
(608, 1333), (640, 1333), (672, 1333), (704, 1333), | ||
(736, 1333), (768, 1333), (800, 1333)], | ||
keep_ratio=True) | ||
], | ||
[ | ||
dict( | ||
type='RandomChoiceResize', | ||
# The radio of all image in train dataset < 7 | ||
# follow the original implement | ||
scales=[(400, 4200), (500, 4200), (600, 4200)], | ||
keep_ratio=True), | ||
dict( | ||
type='RandomCrop', | ||
crop_type='absolute_range', | ||
crop_size=(384, 600), | ||
allow_negative_crop=True), | ||
dict( | ||
type='RandomChoiceResize', | ||
scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), | ||
(608, 1333), (640, 1333), (672, 1333), (704, 1333), | ||
(736, 1333), (768, 1333), (800, 1333)], | ||
keep_ratio=True) | ||
] | ||
]), | ||
dict(type='PackDetInputs') | ||
] | ||
test_pipeline = [ | ||
dict(type='LoadImageFromFile', file_client_args=file_client_args), | ||
dict(type='Resize', scale=(1333, 800), keep_ratio=True), | ||
dict(type='LoadAnnotations', with_bbox=True), | ||
dict( | ||
type='PackDetInputs', | ||
meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', | ||
'scale_factor')) | ||
] | ||
|
||
train_dataloader = dict( | ||
batch_size=4, | ||
num_workers=4, | ||
persistent_workers=True, | ||
sampler=dict(type='DefaultSampler', shuffle=True), | ||
batch_sampler=dict(type='AspectRatioBatchSampler'), | ||
dataset=dict( | ||
type='ClassBalancedDataset', | ||
oversample_thr=1e-3, | ||
dataset=dict( | ||
type=dataset_type, | ||
data_root=data_root, | ||
ann_file='annotations/lvis_v1_train.json', | ||
data_prefix=dict(img=''), | ||
filter_cfg=dict(filter_empty_gt=True, min_size=32), | ||
pipeline=train_pipeline))) | ||
val_dataloader = dict( | ||
batch_size=1, | ||
num_workers=2, | ||
persistent_workers=True, | ||
drop_last=False, | ||
sampler=dict(type='DefaultSampler', shuffle=False), | ||
dataset=dict( | ||
type=dataset_type, | ||
data_root=data_root, | ||
ann_file='annotations/lvis_v1_val.json', | ||
data_prefix=dict(img=''), | ||
test_mode=True, | ||
pipeline=test_pipeline)) | ||
test_dataloader = val_dataloader | ||
|
||
val_evaluator = dict( | ||
type='LVISMetric', | ||
ann_file=data_root + 'annotations/lvis_v1_val.json', | ||
metric=['bbox']) | ||
test_evaluator = val_evaluator |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
# dataset settings | ||
dataset_type = 'LVISV1Dataset' | ||
data_root = 'data/lvis_v1/' | ||
|
||
file_client_args = dict(backend='disk') | ||
|
||
train_pipeline = [ | ||
dict(type='LoadImageFromFile', file_client_args=file_client_args), | ||
dict(type='LoadAnnotations', with_bbox=True), | ||
dict(type='RandomFlip', prob=0.5), | ||
dict( | ||
type='RandomChoice', | ||
transforms=[ | ||
[ | ||
dict( | ||
type='RandomChoiceResize', | ||
scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), | ||
(608, 1333), (640, 1333), (672, 1333), (704, 1333), | ||
(736, 1333), (768, 1333), (800, 1333)], | ||
keep_ratio=True) | ||
], | ||
[ | ||
dict( | ||
type='RandomChoiceResize', | ||
# The radio of all image in train dataset < 7 | ||
# follow the original implement | ||
scales=[(400, 4200), (500, 4200), (600, 4200)], | ||
keep_ratio=True), | ||
dict( | ||
type='RandomCrop', | ||
crop_type='absolute_range', | ||
crop_size=(384, 600), | ||
allow_negative_crop=True), | ||
dict( | ||
type='RandomChoiceResize', | ||
scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), | ||
(608, 1333), (640, 1333), (672, 1333), (704, 1333), | ||
(736, 1333), (768, 1333), (800, 1333)], | ||
keep_ratio=True) | ||
] | ||
]), | ||
dict(type='PackDetInputs') | ||
] | ||
test_pipeline = [ | ||
dict(type='LoadImageFromFile', file_client_args=file_client_args), | ||
dict(type='Resize', scale=(1333, 800), keep_ratio=True), | ||
dict(type='LoadAnnotations', with_bbox=True), | ||
dict( | ||
type='PackDetInputs', | ||
meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', | ||
'scale_factor')) | ||
] | ||
|
||
train_dataloader = dict( | ||
batch_size=4, | ||
num_workers=4, | ||
persistent_workers=True, | ||
sampler=dict(type='DefaultSampler', shuffle=True), | ||
batch_sampler=dict(type='AspectRatioBatchSampler'), | ||
dataset=dict( | ||
type='ClassBalancedDataset', | ||
oversample_thr=1e-3, | ||
dataset=dict( | ||
type=dataset_type, | ||
data_root=data_root, | ||
ann_file='annotations/paco_lvis_v1_train.json', | ||
data_prefix=dict(img=''), | ||
filter_cfg=dict(filter_empty_gt=True, min_size=32), | ||
pipeline=train_pipeline))) | ||
val_dataloader = dict( | ||
batch_size=1, | ||
num_workers=2, | ||
persistent_workers=True, | ||
drop_last=False, | ||
sampler=dict(type='DefaultSampler', shuffle=False), | ||
dataset=dict( | ||
type=dataset_type, | ||
data_root=data_root, | ||
ann_file='annotations/paco_lvis_v1_val.json', | ||
data_prefix=dict(img=''), | ||
test_mode=True, | ||
pipeline=test_pipeline)) | ||
test_dataloader = val_dataloader | ||
|
||
val_evaluator = dict( | ||
type='LVISMetric', | ||
ann_file=data_root + 'annotations/paco_lvis_v1_val.json', | ||
metric=['bbox']) | ||
test_evaluator = val_evaluator |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
model = dict( | ||
type='DINO', | ||
num_queries=900, # num_matching_queries | ||
with_box_refine=True, | ||
as_two_stage=True, | ||
data_preprocessor=dict( | ||
type='DetDataPreprocessor', | ||
mean=[123.675, 116.28, 103.53], | ||
std=[58.395, 57.12, 57.375], | ||
bgr_to_rgb=True, | ||
pad_size_divisor=1), | ||
backbone=dict( | ||
type='ResNet', | ||
depth=50, | ||
num_stages=4, | ||
out_indices=(1, 2, 3), | ||
frozen_stages=1, | ||
norm_cfg=dict(type='BN', requires_grad=False), | ||
norm_eval=True, | ||
style='pytorch', | ||
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), | ||
neck=dict( | ||
type='ChannelMapper', | ||
in_channels=[512, 1024, 2048], | ||
kernel_size=1, | ||
out_channels=256, | ||
act_cfg=None, | ||
norm_cfg=dict(type='GN', num_groups=32), | ||
num_outs=4), | ||
encoder=dict( | ||
num_layers=6, | ||
layer_cfg=dict( | ||
self_attn_cfg=dict(embed_dims=256, num_levels=4, | ||
dropout=0.0), # 0.1 for DeformDETR | ||
ffn_cfg=dict( | ||
embed_dims=256, | ||
feedforward_channels=2048, # 1024 for DeformDETR | ||
ffn_drop=0.0))), # 0.1 for DeformDETR | ||
decoder=dict( | ||
num_layers=6, | ||
return_intermediate=True, | ||
layer_cfg=dict( | ||
self_attn_cfg=dict(embed_dims=256, num_heads=8, | ||
dropout=0.0), # 0.1 for DeformDETR | ||
cross_attn_cfg=dict(embed_dims=256, num_levels=4, | ||
dropout=0.0), # 0.1 for DeformDETR | ||
ffn_cfg=dict( | ||
embed_dims=256, | ||
feedforward_channels=2048, # 1024 for DeformDETR | ||
ffn_drop=0.0)), # 0.1 for DeformDETR | ||
post_norm_cfg=None), | ||
positional_encoding=dict( | ||
num_feats=128, | ||
normalize=True, | ||
offset=0.0, # -0.5 for DeformDETR | ||
temperature=20), # 10000 for DeformDETR | ||
bbox_head=dict( | ||
type='DINOHead', | ||
num_classes=80, | ||
sync_cls_avg_factor=True, | ||
loss_cls=dict( | ||
type='FocalLoss', | ||
use_sigmoid=True, | ||
gamma=2.0, | ||
alpha=0.25, | ||
loss_weight=1.0), # 2.0 in DeformDETR | ||
loss_bbox=dict(type='L1Loss', loss_weight=5.0), | ||
loss_iou=dict(type='GIoULoss', loss_weight=2.0)), | ||
dn_cfg=dict( # TODO: Move to model.train_cfg ? | ||
label_noise_scale=0.5, | ||
box_noise_scale=1.0, # 0.4 for DN-DETR | ||
group_cfg=dict(dynamic=True, num_groups=None, | ||
num_dn_queries=100)), # TODO: half num_dn_queries | ||
# training and testing settings | ||
train_cfg=dict( | ||
assigner=dict( | ||
type='HungarianAssigner', | ||
match_costs=[ | ||
dict(type='FocalLossCost', weight=2.0), | ||
dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'), | ||
dict(type='IoUCost', iou_mode='giou', weight=2.0) | ||
])), | ||
test_cfg=dict(max_per_img=300)) # 100 for DeformDETR |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
# optimizer | ||
optim_wrapper = dict( | ||
type='OptimWrapper', | ||
optimizer=dict( | ||
type='AdamW', | ||
lr=0.0001, # 0.0002 for DeformDETR | ||
weight_decay=0.0001), | ||
clip_grad=dict(max_norm=0.1, norm_type=2), | ||
paramwise_cfg=dict(custom_keys={'backbone': dict(lr_mult=0.1)}) | ||
) # custom_keys contains sampling_offsets and reference_points in DeformDETR # noqa | ||
|
||
# learning policy | ||
max_epochs = 12 | ||
train_cfg = dict( | ||
type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1) | ||
|
||
val_cfg = dict(type='ValLoop') | ||
test_cfg = dict(type='TestLoop') | ||
|
||
param_scheduler = [ | ||
dict( | ||
type='MultiStepLR', | ||
begin=0, | ||
end=max_epochs, | ||
by_epoch=True, | ||
milestones=[11], | ||
gamma=0.1) | ||
] | ||
|
||
# NOTE: `auto_scale_lr` is for automatically scaling LR, | ||
# USER SHOULD NOT CHANGE ITS VALUES. | ||
# base_batch_size = (8 GPUs) x (2 samples per GPU) | ||
auto_scale_lr = dict(enable=True, base_batch_size=16) |
Oops, something went wrong.