18 changed files with 3892 additions and 7 deletions
@@ -0,0 +1,140 @@
aic_info = dict(
    dataset_name='aic',
    paper_info=dict(
        author='Wu, Jiahong and Zheng, He and Zhao, Bo and '
        'Li, Yixin and Yan, Baoming and Liang, Rui and '
        'Wang, Wenjia and Zhou, Shipei and Lin, Guosen and '
        'Fu, Yanwei and others',
        title='AI Challenger: A large-scale dataset for going '
        'deeper in image understanding',
        container='arXiv',
        year='2017',
        homepage='https://github.com/AIChallenger/AI_Challenger_2017',
    ),
    keypoint_info={
        0: dict(name='right_shoulder', id=0, color=[255, 128, 0],
                type='upper', swap='left_shoulder'),
        1: dict(name='right_elbow', id=1, color=[255, 128, 0],
                type='upper', swap='left_elbow'),
        2: dict(name='right_wrist', id=2, color=[255, 128, 0],
                type='upper', swap='left_wrist'),
        3: dict(name='left_shoulder', id=3, color=[0, 255, 0],
                type='upper', swap='right_shoulder'),
        4: dict(name='left_elbow', id=4, color=[0, 255, 0],
                type='upper', swap='right_elbow'),
        5: dict(name='left_wrist', id=5, color=[0, 255, 0],
                type='upper', swap='right_wrist'),
        6: dict(name='right_hip', id=6, color=[255, 128, 0],
                type='lower', swap='left_hip'),
        7: dict(name='right_knee', id=7, color=[255, 128, 0],
                type='lower', swap='left_knee'),
        8: dict(name='right_ankle', id=8, color=[255, 128, 0],
                type='lower', swap='left_ankle'),
        9: dict(name='left_hip', id=9, color=[0, 255, 0],
                type='lower', swap='right_hip'),
        10: dict(name='left_knee', id=10, color=[0, 255, 0],
                 type='lower', swap='right_knee'),
        11: dict(name='left_ankle', id=11, color=[0, 255, 0],
                 type='lower', swap='right_ankle'),
        12: dict(name='head_top', id=12, color=[51, 153, 255],
                 type='upper', swap=''),
        13: dict(name='neck', id=13, color=[51, 153, 255],
                 type='upper', swap='')
    },
    skeleton_info={
        0: dict(link=('right_wrist', 'right_elbow'), id=0,
                color=[255, 128, 0]),
        1: dict(link=('right_elbow', 'right_shoulder'), id=1,
                color=[255, 128, 0]),
        2: dict(link=('right_shoulder', 'neck'), id=2, color=[51, 153, 255]),
        3: dict(link=('neck', 'left_shoulder'), id=3, color=[51, 153, 255]),
        4: dict(link=('left_shoulder', 'left_elbow'), id=4,
                color=[0, 255, 0]),
        5: dict(link=('left_elbow', 'left_wrist'), id=5, color=[0, 255, 0]),
        6: dict(link=('right_ankle', 'right_knee'), id=6,
                color=[255, 128, 0]),
        7: dict(link=('right_knee', 'right_hip'), id=7, color=[255, 128, 0]),
        8: dict(link=('right_hip', 'left_hip'), id=8, color=[51, 153, 255]),
        9: dict(link=('left_hip', 'left_knee'), id=9, color=[0, 255, 0]),
        10: dict(link=('left_knee', 'left_ankle'), id=10, color=[0, 255, 0]),
        11: dict(link=('head_top', 'neck'), id=11, color=[51, 153, 255]),
        12: dict(link=('right_shoulder', 'right_hip'), id=12,
                 color=[51, 153, 255]),
        13: dict(link=('left_shoulder', 'left_hip'), id=13,
                 color=[51, 153, 255])
    },
    joint_weights=[
        1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1.
    ],
    # Sigmas follow the official AI Challenger evaluation script:
    # https://github.com/AIChallenger/AI_Challenger_2017/blob/master/
    # Evaluation/keypoint_eval/keypoint_eval.py#L50
    # delta = 2 x sigma
    sigmas=[
        0.01388152, 0.01515228, 0.01057665, 0.01417709, 0.01497891,
        0.01402144, 0.03909642, 0.03686941, 0.01981803, 0.03843971,
        0.03412318, 0.02415081, 0.01291456, 0.01236173
    ])
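The `# delta = 2 x sigma` note means each sigma above is simply half of the corresponding delta constant in the official AI Challenger evaluation script. A minimal sketch of that relationship (the delta values here are just the first two sigmas doubled, shown for illustration):

import numpy as np

# Per-keypoint deltas implied by the first two sigmas (delta = 2 * sigma).
deltas = np.array([0.02776304, 0.03030456])
sigmas = deltas / 2  # -> [0.01388152, 0.01515228], as in aic_info above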
@@ -0,0 +1,142 @@
ap10k_info = dict(
    dataset_name='ap10k',
    paper_info=dict(
        author='Yu, Hang and Xu, Yufei and Zhang, Jing and '
        'Zhao, Wei and Guan, Ziyu and Tao, Dacheng',
        title='AP-10K: A Benchmark for Animal Pose Estimation in the Wild',
        container='35th Conference on Neural Information Processing Systems '
        '(NeurIPS 2021) Track on Datasets and Benchmarks.',
        year='2021',
        homepage='https://github.com/AlexTheBad/AP-10K',
    ),
    keypoint_info={
        0: dict(name='L_Eye', id=0, color=[0, 255, 0], type='upper',
                swap='R_Eye'),
        1: dict(name='R_Eye', id=1, color=[255, 128, 0], type='upper',
                swap='L_Eye'),
        2: dict(name='Nose', id=2, color=[51, 153, 255], type='upper',
                swap=''),
        3: dict(name='Neck', id=3, color=[51, 153, 255], type='upper',
                swap=''),
        4: dict(name='Root of tail', id=4, color=[51, 153, 255],
                type='lower', swap=''),
        5: dict(name='L_Shoulder', id=5, color=[51, 153, 255], type='upper',
                swap='R_Shoulder'),
        6: dict(name='L_Elbow', id=6, color=[51, 153, 255], type='upper',
                swap='R_Elbow'),
        7: dict(name='L_F_Paw', id=7, color=[0, 255, 0], type='upper',
                swap='R_F_Paw'),
        8: dict(name='R_Shoulder', id=8, color=[0, 255, 0], type='upper',
                swap='L_Shoulder'),
        9: dict(name='R_Elbow', id=9, color=[255, 128, 0], type='upper',
                swap='L_Elbow'),
        10: dict(name='R_F_Paw', id=10, color=[0, 255, 0], type='lower',
                 swap='L_F_Paw'),
        11: dict(name='L_Hip', id=11, color=[255, 128, 0], type='lower',
                 swap='R_Hip'),
        12: dict(name='L_Knee', id=12, color=[255, 128, 0], type='lower',
                 swap='R_Knee'),
        13: dict(name='L_B_Paw', id=13, color=[0, 255, 0], type='lower',
                 swap='R_B_Paw'),
        14: dict(name='R_Hip', id=14, color=[0, 255, 0], type='lower',
                 swap='L_Hip'),
        15: dict(name='R_Knee', id=15, color=[0, 255, 0], type='lower',
                 swap='L_Knee'),
        16: dict(name='R_B_Paw', id=16, color=[0, 255, 0], type='lower',
                 swap='L_B_Paw'),
    },
    skeleton_info={
        0: dict(link=('L_Eye', 'R_Eye'), id=0, color=[0, 0, 255]),
        1: dict(link=('L_Eye', 'Nose'), id=1, color=[0, 0, 255]),
        2: dict(link=('R_Eye', 'Nose'), id=2, color=[0, 0, 255]),
        3: dict(link=('Nose', 'Neck'), id=3, color=[0, 255, 0]),
        4: dict(link=('Neck', 'Root of tail'), id=4, color=[0, 255, 0]),
        5: dict(link=('Neck', 'L_Shoulder'), id=5, color=[0, 255, 255]),
        6: dict(link=('L_Shoulder', 'L_Elbow'), id=6, color=[0, 255, 255]),
        7: dict(link=('L_Elbow', 'L_F_Paw'), id=7, color=[0, 255, 255]),
        8: dict(link=('Neck', 'R_Shoulder'), id=8, color=[6, 156, 250]),
        9: dict(link=('R_Shoulder', 'R_Elbow'), id=9, color=[6, 156, 250]),
        10: dict(link=('R_Elbow', 'R_F_Paw'), id=10, color=[6, 156, 250]),
        11: dict(link=('Root of tail', 'L_Hip'), id=11, color=[0, 255, 255]),
        12: dict(link=('L_Hip', 'L_Knee'), id=12, color=[0, 255, 255]),
        13: dict(link=('L_Knee', 'L_B_Paw'), id=13, color=[0, 255, 255]),
        14: dict(link=('Root of tail', 'R_Hip'), id=14, color=[6, 156, 250]),
        15: dict(link=('R_Hip', 'R_Knee'), id=15, color=[6, 156, 250]),
        16: dict(link=('R_Knee', 'R_B_Paw'), id=16, color=[6, 156, 250]),
    },
    joint_weights=[
        1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2,
        1.5, 1.5
    ],
    sigmas=[
        0.025, 0.025, 0.026, 0.035, 0.035, 0.079, 0.072, 0.062, 0.079,
        0.072, 0.062, 0.107, 0.087, 0.089, 0.107, 0.087, 0.089
    ])
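Keypoint ids, swap targets, skeleton ids and the per-joint weight/sigma lists in these info dicts must stay mutually consistent, and the repetition makes slips (duplicated or off-by-one ids) easy to miss. A minimal sanity check, assuming only the dict layout used in these files:

def check_dataset_info(info):
    """Verify key/id agreement, known swap targets and matching lengths."""
    kpts = info['keypoint_info']
    names = {v['name'] for v in kpts.values()}
    for key, kpt in kpts.items():
        assert key == kpt['id'], f"keypoint key {key} != id {kpt['id']}"
        assert kpt['swap'] in names | {''}, f"unknown swap '{kpt['swap']}'"
    for key, bone in info['skeleton_info'].items():
        assert key == bone['id'], f"skeleton key {key} != id {bone['id']}"
        assert set(bone['link']) <= names, f"unknown link {bone['link']}"
    assert len(info['joint_weights']) == len(kpts)
    assert len(info['sigmas']) == len(kpts)

check_dataset_info(ap10k_info)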
File diff suppressed because it is too large
@@ -0,0 +1,155 @@
mpii_info = dict(
    dataset_name='mpii',
    paper_info=dict(
        author='Mykhaylo Andriluka and Leonid Pishchulin and '
        'Peter Gehler and Bernt Schiele',
        title='2D Human Pose Estimation: New Benchmark and '
        'State of the Art Analysis',
        container='IEEE Conference on Computer Vision and '
        'Pattern Recognition (CVPR)',
        year='2014',
        homepage='http://human-pose.mpi-inf.mpg.de/',
    ),
    keypoint_info={
        0: dict(name='right_ankle', id=0, color=[255, 128, 0],
                type='lower', swap='left_ankle'),
        1: dict(name='right_knee', id=1, color=[255, 128, 0],
                type='lower', swap='left_knee'),
        2: dict(name='right_hip', id=2, color=[255, 128, 0],
                type='lower', swap='left_hip'),
        3: dict(name='left_hip', id=3, color=[0, 255, 0],
                type='lower', swap='right_hip'),
        4: dict(name='left_knee', id=4, color=[0, 255, 0],
                type='lower', swap='right_knee'),
        5: dict(name='left_ankle', id=5, color=[0, 255, 0],
                type='lower', swap='right_ankle'),
        6: dict(name='pelvis', id=6, color=[51, 153, 255],
                type='lower', swap=''),
        7: dict(name='thorax', id=7, color=[51, 153, 255],
                type='upper', swap=''),
        8: dict(name='upper_neck', id=8, color=[51, 153, 255],
                type='upper', swap=''),
        9: dict(name='head_top', id=9, color=[51, 153, 255],
                type='upper', swap=''),
        10: dict(name='right_wrist', id=10, color=[255, 128, 0],
                 type='upper', swap='left_wrist'),
        11: dict(name='right_elbow', id=11, color=[255, 128, 0],
                 type='upper', swap='left_elbow'),
        12: dict(name='right_shoulder', id=12, color=[255, 128, 0],
                 type='upper', swap='left_shoulder'),
        13: dict(name='left_shoulder', id=13, color=[0, 255, 0],
                 type='upper', swap='right_shoulder'),
        14: dict(name='left_elbow', id=14, color=[0, 255, 0],
                 type='upper', swap='right_elbow'),
        15: dict(name='left_wrist', id=15, color=[0, 255, 0],
                 type='upper', swap='right_wrist')
    },
    skeleton_info={
        0: dict(link=('right_ankle', 'right_knee'), id=0,
                color=[255, 128, 0]),
        1: dict(link=('right_knee', 'right_hip'), id=1, color=[255, 128, 0]),
        2: dict(link=('right_hip', 'pelvis'), id=2, color=[255, 128, 0]),
        3: dict(link=('pelvis', 'left_hip'), id=3, color=[0, 255, 0]),
        4: dict(link=('left_hip', 'left_knee'), id=4, color=[0, 255, 0]),
        5: dict(link=('left_knee', 'left_ankle'), id=5, color=[0, 255, 0]),
        6: dict(link=('pelvis', 'thorax'), id=6, color=[51, 153, 255]),
        7: dict(link=('thorax', 'upper_neck'), id=7, color=[51, 153, 255]),
        8: dict(link=('upper_neck', 'head_top'), id=8, color=[51, 153, 255]),
        9: dict(link=('upper_neck', 'right_shoulder'), id=9,
                color=[255, 128, 0]),
        10: dict(link=('right_shoulder', 'right_elbow'), id=10,
                 color=[255, 128, 0]),
        11: dict(link=('right_elbow', 'right_wrist'), id=11,
                 color=[255, 128, 0]),
        12: dict(link=('upper_neck', 'left_shoulder'), id=12,
                 color=[0, 255, 0]),
        13: dict(link=('left_shoulder', 'left_elbow'), id=13,
                 color=[0, 255, 0]),
        14: dict(link=('left_elbow', 'left_wrist'), id=14, color=[0, 255, 0])
    },
    joint_weights=[
        1.5, 1.2, 1., 1., 1.2, 1.5, 1., 1., 1., 1., 1.5, 1.2, 1., 1., 1.2,
        1.5
    ],
    # Adapted from the COCO dataset sigmas.
    sigmas=[
        0.089, 0.083, 0.107, 0.107, 0.083, 0.089, 0.026, 0.026, 0.026,
        0.026, 0.062, 0.072, 0.179, 0.179, 0.072, 0.062
    ])
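During TopDownRandomFlip, left/right channels are exchanged according to the `swap` fields above. A sketch of how flip pairs can be derived from such an info dict (mmpose builds an equivalent mapping internally; this helper is illustrative):

def build_flip_pairs(keypoint_info):
    """Collect each (id, swapped_id) pair once from the swap fields."""
    name_to_id = {v['name']: v['id'] for v in keypoint_info.values()}
    pairs = []
    for kpt in keypoint_info.values():
        partner = kpt['swap']
        if partner and kpt['id'] < name_to_id[partner]:
            pairs.append((kpt['id'], name_to_id[partner]))
    return pairs

# mpii_info -> [(0, 5), (1, 4), (2, 3), (10, 15), (11, 14), (12, 13)]
print(build_flip_pairs(mpii_info['keypoint_info']))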
@@ -0,0 +1,500 @@
_base_ = [
    '../../../../_base_/default_runtime.py',
    '../../../../_base_/datasets/coco.py',
    '../../../../_base_/datasets/aic_info.py',
    '../../../../_base_/datasets/mpii_info.py',
    '../../../../_base_/datasets/ap10k_info.py',
    '../../../../_base_/datasets/coco_wholebody_info.py'
]
evaluation = dict(interval=10, metric='mAP', save_best='AP')

optimizer = dict(
    type='AdamW',
    lr=1e-3,
    betas=(0.9, 0.999),
    weight_decay=0.1,
    constructor='LayerDecayOptimizerConstructor',
    paramwise_cfg=dict(
        num_layers=12,
        layer_decay_rate=0.75,
        custom_keys={
            'bias': dict(decay_mult=0.),
            'pos_embed': dict(decay_mult=0.),
            'relative_position_bias_table': dict(decay_mult=0.),
            'norm': dict(decay_mult=0.)
        }))
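# With LayerDecayOptimizerConstructor every transformer block gets its own
# learning-rate multiplier, shrinking geometrically toward the input. A
# minimal sketch of the usual layer-decay scheme (illustrative only; the
# constructor's exact layer indexing may differ):
#
#   num_layers, rate, base_lr = 12, 0.75, 1e-3
#   for layer_id in range(num_layers + 2):  # 0 = patch embed, 1..12 = blocks
#       scale = rate**(num_layers + 1 - layer_id)  # head (id 13) keeps 1.0
#       print(layer_id, base_lr * scale)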
optimizer_config = dict(grad_clip=dict(max_norm=1., norm_type=2))

# learning policy
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=0.001,
    step=[170, 200])
total_epochs = 210
target_type = 'GaussianHeatmap'
channel_cfg = dict(
    num_output_channels=17,
    dataset_joints=17,
    dataset_channel=[list(range(17))],
    inference_channel=list(range(17)))
aic_channel_cfg = dict(
    num_output_channels=14,
    dataset_joints=14,
    dataset_channel=[list(range(14))],
    inference_channel=list(range(14)))
mpii_channel_cfg = dict(
    num_output_channels=16,
    dataset_joints=16,
    dataset_channel=list(range(16)),
    inference_channel=list(range(16)))
crowdpose_channel_cfg = dict(
    num_output_channels=14,
    dataset_joints=14,
    dataset_channel=[list(range(14))],
    inference_channel=list(range(14)))
ap10k_channel_cfg = dict(
    num_output_channels=17,
    dataset_joints=17,
    dataset_channel=[list(range(17))],
    inference_channel=list(range(17)))
cocowholebody_channel_cfg = dict(
    num_output_channels=133,
    dataset_joints=133,
    dataset_channel=[list(range(133))],
    inference_channel=list(range(133)))

# model settings
model = dict(
    type='TopDownMoE',
    pretrained=None,
    backbone=dict(
        type='ViTMoE',
        img_size=(256, 192),
        patch_size=16,
        embed_dim=768,
        depth=12,
        num_heads=12,
        ratio=1,
        use_checkpoint=False,
        mlp_ratio=4,
        qkv_bias=True,
        drop_path_rate=0.3,
        num_expert=6,
        part_features=192),
    keypoint_head=dict(
        type='TopdownHeatmapSimpleHead',
        in_channels=768,
        num_deconv_layers=2,
        num_deconv_filters=(256, 256),
        num_deconv_kernels=(4, 4),
        extra=dict(final_conv_kernel=1),
        out_channels=channel_cfg['num_output_channels'],
        loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
    # One auxiliary head per extra dataset; the heads are identical except
    # for the number of output channels.
    associate_keypoint_head=[
        dict(
            type='TopdownHeatmapSimpleHead',
            in_channels=768,
            num_deconv_layers=2,
            num_deconv_filters=(256, 256),
            num_deconv_kernels=(4, 4),
            extra=dict(final_conv_kernel=1),
            out_channels=head_channel_cfg['num_output_channels'],
            loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True))
        for head_channel_cfg in [
            aic_channel_cfg, mpii_channel_cfg, crowdpose_channel_cfg,
            ap10k_channel_cfg, ap10k_channel_cfg, cocowholebody_channel_cfg
        ]
    ],
    train_cfg=dict(),
    test_cfg=dict(
        flip_test=True,
        post_process='default',
        shift_heatmap=False,
        target_type=target_type,
        modulate_kernel=11,
        use_udp=True))
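# Sanity check on the head geometry: a 256x192 input with patch_size=16 gives
# a 16x12 token grid, and the two (4, 4) deconv layers each upsample by 2,
# landing exactly on the 64x48 heatmaps configured as heatmap_size=[48, 64]
# below (mmpose stores sizes as [w, h]).
assert (256 // 16) * 2 * 2 == 64 and (192 // 16) * 2 * 2 == 48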

data_cfg = dict(
    image_size=[192, 256],
    heatmap_size=[48, 64],
    num_output_channels=channel_cfg['num_output_channels'],
    num_joints=channel_cfg['dataset_joints'],
    dataset_channel=channel_cfg['dataset_channel'],
    inference_channel=channel_cfg['inference_channel'],
    soft_nms=False,
    nms_thr=1.0,
    oks_thr=0.9,
    vis_thr=0.2,
    use_gt_bbox=False,
    det_bbox_thr=0.0,
    bbox_file='data/coco/person_detection_results/'
    'COCO_val2017_detections_AP_H_56_person.json',
    max_num_joints=133,
    dataset_idx=0,
)

aic_data_cfg = dict(
    image_size=[192, 256],
    heatmap_size=[48, 64],
    num_output_channels=aic_channel_cfg['num_output_channels'],
    num_joints=aic_channel_cfg['dataset_joints'],
    dataset_channel=aic_channel_cfg['dataset_channel'],
    inference_channel=aic_channel_cfg['inference_channel'],
    soft_nms=False,
    nms_thr=1.0,
    oks_thr=0.9,
    vis_thr=0.2,
    use_gt_bbox=True,
    det_bbox_thr=0.0,
    bbox_file='data/coco/person_detection_results/'
    'COCO_val2017_detections_AP_H_56_person.json',
    max_num_joints=133,
    dataset_idx=1,
)

mpii_data_cfg = dict(
    image_size=[192, 256],
    heatmap_size=[48, 64],
    num_output_channels=mpii_channel_cfg['num_output_channels'],
    num_joints=mpii_channel_cfg['dataset_joints'],
    dataset_channel=mpii_channel_cfg['dataset_channel'],
    inference_channel=mpii_channel_cfg['inference_channel'],
    max_num_joints=133,
    dataset_idx=2,
    use_gt_bbox=True,
    bbox_file=None,
)

ap10k_data_cfg = dict(
    image_size=[192, 256],
    heatmap_size=[48, 64],
    num_output_channels=channel_cfg['num_output_channels'],
    num_joints=channel_cfg['dataset_joints'],
    dataset_channel=channel_cfg['dataset_channel'],
    inference_channel=channel_cfg['inference_channel'],
    soft_nms=False,
    nms_thr=1.0,
    oks_thr=0.9,
    vis_thr=0.2,
    use_gt_bbox=True,
    det_bbox_thr=0.0,
    bbox_file='',
    max_num_joints=133,
    dataset_idx=3,
)

ap36k_data_cfg = dict(
    image_size=[192, 256],
    heatmap_size=[48, 64],
    num_output_channels=channel_cfg['num_output_channels'],
    num_joints=channel_cfg['dataset_joints'],
    dataset_channel=channel_cfg['dataset_channel'],
    inference_channel=channel_cfg['inference_channel'],
    soft_nms=False,
    nms_thr=1.0,
    oks_thr=0.9,
    vis_thr=0.2,
    use_gt_bbox=True,
    det_bbox_thr=0.0,
    bbox_file='',
    max_num_joints=133,
    dataset_idx=4,
)

cocowholebody_data_cfg = dict(
    image_size=[192, 256],
    heatmap_size=[48, 64],
    num_output_channels=cocowholebody_channel_cfg['num_output_channels'],
    num_joints=cocowholebody_channel_cfg['dataset_joints'],
    dataset_channel=cocowholebody_channel_cfg['dataset_channel'],
    inference_channel=cocowholebody_channel_cfg['inference_channel'],
    soft_nms=False,
    nms_thr=1.0,
    oks_thr=0.9,
    vis_thr=0.2,
    use_gt_bbox=False,
    det_bbox_thr=0.0,
    bbox_file='data/coco/person_detection_results/'
    'COCO_val2017_detections_AP_H_56_person.json',
    dataset_idx=5,
    max_num_joints=133,
)

cocowholebody_train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownRandomFlip', flip_prob=0.5),
    dict(
        type='TopDownHalfBodyTransform',
        num_joints_half_body=8,
        prob_half_body=0.3),
    dict(
        type='TopDownGetRandomScaleRotation', rot_factor=40,
        scale_factor=0.5),
    dict(type='TopDownAffine'),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(type='TopDownGenerateTarget', sigma=2),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center',
            'scale', 'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx'
        ]),
]
# AP-10K/AP-36K and AIC use the same (non-UDP) training pipeline as
# COCO-WholeBody.
ap10k_train_pipeline = cocowholebody_train_pipeline
aic_train_pipeline = cocowholebody_train_pipeline

mpii_train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownRandomFlip', flip_prob=0.5),
    dict(
        type='TopDownGetRandomScaleRotation', rot_factor=40,
        scale_factor=0.5),
    dict(type='TopDownAffine', use_udp=True),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(
        type='TopDownGenerateTarget',
        sigma=2,
        encoding='UDP',
        target_type=target_type),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center',
            'scale', 'rotation', 'flip_pairs', 'dataset_idx'
        ]),
]

train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownRandomFlip', flip_prob=0.5),
    dict(
        type='TopDownHalfBodyTransform',
        num_joints_half_body=8,
        prob_half_body=0.3),
    dict(
        type='TopDownGetRandomScaleRotation', rot_factor=40,
        scale_factor=0.5),
    dict(type='TopDownAffine', use_udp=True),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(
        type='TopDownGenerateTarget',
        sigma=2,
        encoding='UDP',
        target_type=target_type),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center',
            'scale', 'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx'
        ]),
]
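# TopDownGenerateTarget with sigma=2 renders each visible keypoint as a
# unit-height Gaussian on the 48x64 heatmap; the 'UDP' encoding differs only
# in how continuous coordinates are aligned to pixel centers. A minimal
# sketch of the plain Gaussian variant (illustrative, not the mmpose code):
#
#   import numpy as np
#
#   def gaussian_target(x, y, w=48, h=64, sigma=2):
#       xs, ys = np.arange(w)[None, :], np.arange(h)[:, None]
#       return np.exp(-((xs - x)**2 + (ys - y)**2) / (2 * sigma**2))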

val_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownAffine', use_udp=True),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(
        type='Collect',
        keys=['img'],
        meta_keys=[
            'image_file', 'center', 'scale', 'rotation', 'bbox_score',
            'flip_pairs', 'dataset_idx'
        ]),
]

test_pipeline = val_pipeline

data_root = 'data/coco'
aic_data_root = 'data/aic'
mpii_data_root = 'data/mpii'
ap10k_data_root = 'data/ap10k'
ap36k_data_root = 'data/ap36k'

data = dict(
    samples_per_gpu=128,
    workers_per_gpu=8,
    val_dataloader=dict(samples_per_gpu=64),
    test_dataloader=dict(samples_per_gpu=64),
    train=[
        dict(
            type='TopDownCocoDataset',
            ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
            img_prefix=f'{data_root}/train2017/',
            data_cfg=data_cfg,
            pipeline=train_pipeline,
            dataset_info={{_base_.dataset_info}}),
        dict(
            type='TopDownAicDataset',
            ann_file=f'{aic_data_root}/annotations/person_keypoints_train2017.json',
            img_prefix=f'{aic_data_root}/ai_challenger_keypoint_train_20170909/'
            'keypoint_train_images_20170902/',
            data_cfg=aic_data_cfg,
            pipeline=aic_train_pipeline,
            dataset_info={{_base_.aic_info}}),
        dict(
            type='TopDownMpiiDataset',
            ann_file=f'{mpii_data_root}/annotations/mpii_train.json',
            img_prefix=f'{mpii_data_root}/images/',
            data_cfg=mpii_data_cfg,
            pipeline=mpii_train_pipeline,
            dataset_info={{_base_.mpii_info}}),
        dict(
            type='AnimalAP10KDataset',
            ann_file=f'{ap10k_data_root}/annotations/ap10k-train-split1.json',
            img_prefix=f'{ap10k_data_root}/data/',
            data_cfg=ap10k_data_cfg,
            pipeline=ap10k_train_pipeline,
            dataset_info={{_base_.ap10k_info}}),
        dict(
            type='AnimalAP10KDataset',
            ann_file=f'{ap36k_data_root}/annotations/train_annotations_1.json',
            img_prefix=f'{ap36k_data_root}/',
            data_cfg=ap36k_data_cfg,
            pipeline=ap10k_train_pipeline,
            dataset_info={{_base_.ap10k_info}}),
        dict(
            type='TopDownCocoWholeBodyDataset',
            ann_file=f'{data_root}/annotations/coco_wholebody_train_v1.0.json',
            img_prefix=f'{data_root}/train2017/',
            data_cfg=cocowholebody_data_cfg,
            pipeline=cocowholebody_train_pipeline,
            dataset_info={{_base_.cocowholebody_info}}),
    ],
    val=dict(
        type='TopDownCocoDataset',
        ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
        img_prefix=f'{data_root}/val2017/',
        data_cfg=data_cfg,
        pipeline=val_pipeline,
        dataset_info={{_base_.dataset_info}}),
    test=dict(
        type='TopDownCocoDataset',
        ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
        img_prefix=f'{data_root}/val2017/',
        data_cfg=data_cfg,
        pipeline=test_pipeline,
        dataset_info={{_base_.dataset_info}}),
)

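Passing a list under `train` makes the dataset builder concatenate all six sources, so one dataloader mixes COCO, AIC, MPII, AP-10K, AP-36K and COCO-WholeBody samples while the `dataset_idx` in each data_cfg tags where a sample came from. Roughly what the builder does with such a list (a simplified sketch of mmpose's build_dataset, not the verbatim implementation):

from torch.utils.data import ConcatDataset

def build_dataset(cfg, build_one):
    """A list of dataset configs becomes one concatenated dataset."""
    if isinstance(cfg, (list, tuple)):
        return ConcatDataset([build_dataset(c, build_one) for c in cfg])
    return build_one(cfg)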
@@ -0,0 +1,500 @@
_base_ = [
    '../../../../_base_/default_runtime.py',
    '../../../../_base_/datasets/coco.py',
    '../../../../_base_/datasets/aic_info.py',
    '../../../../_base_/datasets/mpii_info.py',
    '../../../../_base_/datasets/ap10k_info.py',
    '../../../../_base_/datasets/coco_wholebody_info.py'
]
evaluation = dict(interval=10, metric='mAP', save_best='AP')

optimizer = dict(
    type='AdamW',
    lr=1e-3,
    betas=(0.9, 0.999),
    weight_decay=0.1,
    constructor='LayerDecayOptimizerConstructor',
    paramwise_cfg=dict(
        num_layers=32,
        layer_decay_rate=0.8,
        custom_keys={
            'bias': dict(decay_mult=0.),
            'pos_embed': dict(decay_mult=0.),
            'relative_position_bias_table': dict(decay_mult=0.),
            'norm': dict(decay_mult=0.)
        }))

optimizer_config = dict(grad_clip=dict(max_norm=1., norm_type=2))

# learning policy
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=0.001,
    step=[170, 200])
total_epochs = 210
target_type = 'GaussianHeatmap'
channel_cfg = dict(
    num_output_channels=17,
    dataset_joints=17,
    dataset_channel=[list(range(17))],
    inference_channel=list(range(17)))
aic_channel_cfg = dict(
    num_output_channels=14,
    dataset_joints=14,
    dataset_channel=[list(range(14))],
    inference_channel=list(range(14)))
mpii_channel_cfg = dict(
    num_output_channels=16,
    dataset_joints=16,
    dataset_channel=list(range(16)),
    inference_channel=list(range(16)))
crowdpose_channel_cfg = dict(
    num_output_channels=14,
    dataset_joints=14,
    dataset_channel=[list(range(14))],
    inference_channel=list(range(14)))
ap10k_channel_cfg = dict(
    num_output_channels=17,
    dataset_joints=17,
    dataset_channel=[list(range(17))],
    inference_channel=list(range(17)))
cocowholebody_channel_cfg = dict(
    num_output_channels=133,
    dataset_joints=133,
    dataset_channel=[list(range(133))],
    inference_channel=list(range(133)))

# model settings
model = dict(
    type='TopDownMoE',
    pretrained=None,
    backbone=dict(
        type='ViTMoE',
        img_size=(256, 192),
        patch_size=16,
        embed_dim=1280,
        depth=32,
        num_heads=16,
        ratio=1,
        use_checkpoint=False,
        mlp_ratio=4,
        qkv_bias=True,
        drop_path_rate=0.55,
        num_expert=6,
        part_features=320),
    keypoint_head=dict(
        type='TopdownHeatmapSimpleHead',
        in_channels=1280,
        num_deconv_layers=2,
        num_deconv_filters=(256, 256),
        num_deconv_kernels=(4, 4),
        extra=dict(final_conv_kernel=1),
        out_channels=channel_cfg['num_output_channels'],
        loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
    # One auxiliary head per extra dataset; the heads are identical except
    # for the number of output channels.
    associate_keypoint_head=[
        dict(
            type='TopdownHeatmapSimpleHead',
            in_channels=1280,
            num_deconv_layers=2,
            num_deconv_filters=(256, 256),
            num_deconv_kernels=(4, 4),
            extra=dict(final_conv_kernel=1),
            out_channels=head_channel_cfg['num_output_channels'],
            loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True))
        for head_channel_cfg in [
            aic_channel_cfg, mpii_channel_cfg, crowdpose_channel_cfg,
            ap10k_channel_cfg, ap10k_channel_cfg, cocowholebody_channel_cfg
        ]
    ],
    train_cfg=dict(),
    test_cfg=dict(
        flip_test=True,
        post_process='default',
        shift_heatmap=False,
        target_type=target_type,
        modulate_kernel=11,
        use_udp=True))

data_cfg = dict(
    image_size=[192, 256],
    heatmap_size=[48, 64],
    num_output_channels=channel_cfg['num_output_channels'],
    num_joints=channel_cfg['dataset_joints'],
    dataset_channel=channel_cfg['dataset_channel'],
    inference_channel=channel_cfg['inference_channel'],
    soft_nms=False,
    nms_thr=1.0,
    oks_thr=0.9,
    vis_thr=0.2,
    use_gt_bbox=False,
    det_bbox_thr=0.0,
    bbox_file='data/coco/person_detection_results/'
    'COCO_val2017_detections_AP_H_56_person.json',
    max_num_joints=133,
    dataset_idx=0,
)

aic_data_cfg = dict(
    image_size=[192, 256],
    heatmap_size=[48, 64],
    num_output_channels=aic_channel_cfg['num_output_channels'],
    num_joints=aic_channel_cfg['dataset_joints'],
    dataset_channel=aic_channel_cfg['dataset_channel'],
    inference_channel=aic_channel_cfg['inference_channel'],
    soft_nms=False,
    nms_thr=1.0,
    oks_thr=0.9,
    vis_thr=0.2,
    use_gt_bbox=True,
    det_bbox_thr=0.0,
    bbox_file='data/coco/person_detection_results/'
    'COCO_val2017_detections_AP_H_56_person.json',
    max_num_joints=133,
    dataset_idx=1,
)

mpii_data_cfg = dict(
    image_size=[192, 256],
    heatmap_size=[48, 64],
    num_output_channels=mpii_channel_cfg['num_output_channels'],
    num_joints=mpii_channel_cfg['dataset_joints'],
    dataset_channel=mpii_channel_cfg['dataset_channel'],
    inference_channel=mpii_channel_cfg['inference_channel'],
    max_num_joints=133,
    dataset_idx=2,
    use_gt_bbox=True,
    bbox_file=None,
)

ap10k_data_cfg = dict(
    image_size=[192, 256],
    heatmap_size=[48, 64],
    num_output_channels=channel_cfg['num_output_channels'],
    num_joints=channel_cfg['dataset_joints'],
    dataset_channel=channel_cfg['dataset_channel'],
    inference_channel=channel_cfg['inference_channel'],
    soft_nms=False,
    nms_thr=1.0,
    oks_thr=0.9,
    vis_thr=0.2,
    use_gt_bbox=True,
    det_bbox_thr=0.0,
    bbox_file='',
    max_num_joints=133,
    dataset_idx=3,
)

ap36k_data_cfg = dict(
    image_size=[192, 256],
    heatmap_size=[48, 64],
    num_output_channels=channel_cfg['num_output_channels'],
    num_joints=channel_cfg['dataset_joints'],
    dataset_channel=channel_cfg['dataset_channel'],
    inference_channel=channel_cfg['inference_channel'],
    soft_nms=False,
    nms_thr=1.0,
    oks_thr=0.9,
    vis_thr=0.2,
    use_gt_bbox=True,
    det_bbox_thr=0.0,
    bbox_file='',
    max_num_joints=133,
    dataset_idx=4,
)

cocowholebody_data_cfg = dict(
    image_size=[192, 256],
    heatmap_size=[48, 64],
    num_output_channels=cocowholebody_channel_cfg['num_output_channels'],
    num_joints=cocowholebody_channel_cfg['dataset_joints'],
    dataset_channel=cocowholebody_channel_cfg['dataset_channel'],
    inference_channel=cocowholebody_channel_cfg['inference_channel'],
    soft_nms=False,
    nms_thr=1.0,
    oks_thr=0.9,
    vis_thr=0.2,
    use_gt_bbox=False,
    det_bbox_thr=0.0,
    bbox_file='data/coco/person_detection_results/'
    'COCO_val2017_detections_AP_H_56_person.json',
    dataset_idx=5,
    max_num_joints=133,
)

cocowholebody_train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownRandomFlip', flip_prob=0.5),
    dict(
        type='TopDownHalfBodyTransform',
        num_joints_half_body=8,
        prob_half_body=0.3),
    dict(
        type='TopDownGetRandomScaleRotation', rot_factor=40,
        scale_factor=0.5),
    dict(type='TopDownAffine'),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(type='TopDownGenerateTarget', sigma=2),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center',
            'scale', 'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx'
        ]),
]
# AP-10K/AP-36K and AIC use the same (non-UDP) training pipeline as
# COCO-WholeBody.
ap10k_train_pipeline = cocowholebody_train_pipeline
aic_train_pipeline = cocowholebody_train_pipeline

mpii_train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownRandomFlip', flip_prob=0.5),
    dict(
        type='TopDownGetRandomScaleRotation', rot_factor=40,
        scale_factor=0.5),
    dict(type='TopDownAffine', use_udp=True),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(
        type='TopDownGenerateTarget',
        sigma=2,
        encoding='UDP',
        target_type=target_type),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center',
            'scale', 'rotation', 'flip_pairs', 'dataset_idx'
        ]),
]

train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownRandomFlip', flip_prob=0.5),
    dict(
        type='TopDownHalfBodyTransform',
        num_joints_half_body=8,
        prob_half_body=0.3),
    dict(
        type='TopDownGetRandomScaleRotation', rot_factor=40,
        scale_factor=0.5),
    dict(type='TopDownAffine', use_udp=True),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(
        type='TopDownGenerateTarget',
        sigma=2,
        encoding='UDP',
        target_type=target_type),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center',
            'scale', 'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx'
        ]),
]

val_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownAffine', use_udp=True),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(
        type='Collect',
        keys=['img'],
        meta_keys=[
            'image_file', 'center', 'scale', 'rotation', 'bbox_score',
            'flip_pairs', 'dataset_idx'
        ]),
]

test_pipeline = val_pipeline

data_root = 'data/coco'
aic_data_root = 'data/aic'
mpii_data_root = 'data/mpii'
ap10k_data_root = 'data/ap10k'
ap36k_data_root = 'data/ap36k'

data = dict(
    samples_per_gpu=128,
    workers_per_gpu=8,
    val_dataloader=dict(samples_per_gpu=64),
    test_dataloader=dict(samples_per_gpu=64),
    train=[
        dict(
            type='TopDownCocoDataset',
            ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
            img_prefix=f'{data_root}/train2017/',
            data_cfg=data_cfg,
            pipeline=train_pipeline,
            dataset_info={{_base_.dataset_info}}),
        dict(
            type='TopDownAicDataset',
            ann_file=f'{aic_data_root}/annotations/person_keypoints_train2017.json',
            img_prefix=f'{aic_data_root}/ai_challenger_keypoint_train_20170909/'
            'keypoint_train_images_20170902/',
            data_cfg=aic_data_cfg,
            pipeline=aic_train_pipeline,
            dataset_info={{_base_.aic_info}}),
        dict(
            type='TopDownMpiiDataset',
            ann_file=f'{mpii_data_root}/annotations/mpii_train.json',
            img_prefix=f'{mpii_data_root}/images/',
            data_cfg=mpii_data_cfg,
            pipeline=mpii_train_pipeline,
            dataset_info={{_base_.mpii_info}}),
        dict(
            type='AnimalAP10KDataset',
            ann_file=f'{ap10k_data_root}/annotations/ap10k-train-split1.json',
            img_prefix=f'{ap10k_data_root}/data/',
            data_cfg=ap10k_data_cfg,
            pipeline=ap10k_train_pipeline,
            dataset_info={{_base_.ap10k_info}}),
        dict(
            type='AnimalAP10KDataset',
            ann_file=f'{ap36k_data_root}/annotations/train_annotations_1.json',
            img_prefix=f'{ap36k_data_root}/',
            data_cfg=ap36k_data_cfg,
            pipeline=ap10k_train_pipeline,
            dataset_info={{_base_.ap10k_info}}),
        dict(
            type='TopDownCocoWholeBodyDataset',
            ann_file=f'{data_root}/annotations/coco_wholebody_train_v1.0.json',
            img_prefix=f'{data_root}/train2017/',
            data_cfg=cocowholebody_data_cfg,
            pipeline=cocowholebody_train_pipeline,
            dataset_info={{_base_.cocowholebody_info}}),
    ],
    val=dict(
        type='TopDownCocoDataset',
        ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
        img_prefix=f'{data_root}/val2017/',
        data_cfg=data_cfg,
        pipeline=val_pipeline,
        dataset_info={{_base_.dataset_info}}),
    test=dict(
        type='TopDownCocoDataset',
        ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
        img_prefix=f'{data_root}/val2017/',
        data_cfg=data_cfg,
        pipeline=test_pipeline,
        dataset_info={{_base_.dataset_info}}),
)

@ -0,0 +1,500 @@ |
|||
_base_ = [ |
|||
'../../../../_base_/default_runtime.py', |
|||
'../../../../_base_/datasets/coco.py', |
|||
'../../../../_base_/datasets/aic_info.py', |
|||
'../../../../_base_/datasets/mpii_info.py', |
|||
'../../../../_base_/datasets/ap10k_info.py', |
|||
'../../../../_base_/datasets/coco_wholebody_info.py' |
|||
] |
|||
evaluation = dict(interval=10, metric='mAP', save_best='AP') |
|||
|
|||
optimizer = dict(type='AdamW', lr=1e-3, betas=(0.9, 0.999), weight_decay=0.1, |
|||
constructor='LayerDecayOptimizerConstructor', |
|||
paramwise_cfg=dict( |
|||
num_layers=24, |
|||
layer_decay_rate=0.8, |
|||
custom_keys={ |
|||
'bias': dict(decay_multi=0.), |
|||
'pos_embed': dict(decay_mult=0.), |
|||
'relative_position_bias_table': dict(decay_mult=0.), |
|||
'norm': dict(decay_mult=0.) |
|||
} |
|||
) |
|||
) |
|||
|
|||
optimizer_config = dict(grad_clip=dict(max_norm=1., norm_type=2)) |
|||
|
|||
# learning policy |
|||
lr_config = dict( |
|||
policy='step', |
|||
warmup='linear', |
|||
warmup_iters=500, |
|||
warmup_ratio=0.001, |
|||
step=[170, 200]) |
|||
total_epochs = 210 |
|||
target_type = 'GaussianHeatmap' |
|||
channel_cfg = dict( |
|||
num_output_channels=17, |
|||
dataset_joints=17, |
|||
dataset_channel=[ |
|||
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], |
|||
], |
|||
inference_channel=[ |
|||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
|||
]) |
|||
aic_channel_cfg = dict( |
|||
num_output_channels=14, |
|||
dataset_joints=14, |
|||
dataset_channel=[ |
|||
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], |
|||
], |
|||
inference_channel=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]) |
|||
mpii_channel_cfg = dict( |
|||
num_output_channels=16, |
|||
dataset_joints=16, |
|||
dataset_channel=list(range(16)), |
|||
inference_channel=list(range(16))) |
|||
crowdpose_channel_cfg = dict( |
|||
num_output_channels=14, |
|||
dataset_joints=14, |
|||
dataset_channel=[ |
|||
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], |
|||
], |
|||
inference_channel=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]) |
|||
ap10k_channel_cfg = dict( |
|||
num_output_channels=17, |
|||
dataset_joints=17, |
|||
dataset_channel=[ |
|||
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], |
|||
], |
|||
inference_channel=[ |
|||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
|||
]) |
|||
cocowholebody_channel_cfg = dict( |
|||
num_output_channels=133, |
|||
dataset_joints=133, |
|||
dataset_channel=[ |
|||
list(range(133)), |
|||
], |
|||
inference_channel=list(range(133))) |
|||
|
|||
|
|||
# model settings |
|||
model = dict( |
|||
type='TopDownMoE', |
|||
pretrained=None, |
|||
backbone=dict( |
|||
type='ViTMoE', |
|||
img_size=(256, 192), |
|||
patch_size=16, |
|||
embed_dim=1024, |
|||
depth=24, |
|||
num_heads=16, |
|||
ratio=1, |
|||
use_checkpoint=False, |
|||
mlp_ratio=4, |
|||
qkv_bias=True, |
|||
drop_path_rate=0.5, |
|||
num_expert=6, |
|||
part_features=256 |
|||
), |
|||
keypoint_head=dict( |
|||
type='TopdownHeatmapSimpleHead', |
|||
in_channels=1024, |
|||
num_deconv_layers=2, |
|||
num_deconv_filters=(256, 256), |
|||
num_deconv_kernels=(4, 4), |
|||
extra=dict(final_conv_kernel=1, ), |
|||
out_channels=channel_cfg['num_output_channels'], |
|||
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), |
|||
associate_keypoint_head=[ |
|||
dict( |
|||
type='TopdownHeatmapSimpleHead', |
|||
in_channels=1024, |
|||
num_deconv_layers=2, |
|||
num_deconv_filters=(256, 256), |
|||
num_deconv_kernels=(4, 4), |
|||
extra=dict(final_conv_kernel=1, ), |
|||
out_channels=aic_channel_cfg['num_output_channels'], |
|||
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), |
|||
dict( |
|||
type='TopdownHeatmapSimpleHead', |
|||
in_channels=1024, |
|||
num_deconv_layers=2, |
|||
num_deconv_filters=(256, 256), |
|||
num_deconv_kernels=(4, 4), |
|||
extra=dict(final_conv_kernel=1, ), |
|||
out_channels=mpii_channel_cfg['num_output_channels'], |
|||
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), |
|||
dict( |
|||
type='TopdownHeatmapSimpleHead', |
|||
in_channels=1024, |
|||
num_deconv_layers=2, |
|||
num_deconv_filters=(256, 256), |
|||
num_deconv_kernels=(4, 4), |
|||
extra=dict(final_conv_kernel=1, ), |
|||
out_channels=crowdpose_channel_cfg['num_output_channels'], |
|||
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), |
|||
dict( |
|||
type='TopdownHeatmapSimpleHead', |
|||
in_channels=1024, |
|||
num_deconv_layers=2, |
|||
num_deconv_filters=(256, 256), |
|||
num_deconv_kernels=(4, 4), |
|||
extra=dict(final_conv_kernel=1, ), |
|||
out_channels=ap10k_channel_cfg['num_output_channels'], |
|||
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), |
|||
dict( |
|||
type='TopdownHeatmapSimpleHead', |
|||
in_channels=1024, |
|||
num_deconv_layers=2, |
|||
num_deconv_filters=(256, 256), |
|||
num_deconv_kernels=(4, 4), |
|||
extra=dict(final_conv_kernel=1, ), |
|||
out_channels=ap10k_channel_cfg['num_output_channels'], |
|||
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), |
|||
dict( |
|||
type='TopdownHeatmapSimpleHead', |
|||
in_channels=1024, |
|||
num_deconv_layers=2, |
|||
num_deconv_filters=(256, 256), |
|||
num_deconv_kernels=(4, 4), |
|||
extra=dict(final_conv_kernel=1, ), |
|||
out_channels=cocowholebody_channel_cfg['num_output_channels'], |
|||
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), |
|||
], |
|||
train_cfg=dict(), |
|||
test_cfg=dict( |
|||
flip_test=True, |
|||
post_process='default', |
|||
shift_heatmap=False, |
|||
target_type=target_type, |
|||
modulate_kernel=11, |
|||
use_udp=True)) |
|||
|
|||
data_cfg = dict( |
|||
image_size=[192, 256], |
|||
heatmap_size=[48, 64], |
|||
num_output_channels=channel_cfg['num_output_channels'], |
|||
num_joints=channel_cfg['dataset_joints'], |
|||
dataset_channel=channel_cfg['dataset_channel'], |
|||
inference_channel=channel_cfg['inference_channel'], |
|||
soft_nms=False, |
|||
nms_thr=1.0, |
|||
oks_thr=0.9, |
|||
vis_thr=0.2, |
|||
use_gt_bbox=False, |
|||
det_bbox_thr=0.0, |
|||
bbox_file='data/coco/person_detection_results/' |
|||
'COCO_val2017_detections_AP_H_56_person.json', |
|||
max_num_joints=133, |
|||
dataset_idx=0, |
|||
) |
|||
|
|||
aic_data_cfg = dict( |
|||
image_size=[192, 256], |
|||
heatmap_size=[48, 64], |
|||
num_output_channels=aic_channel_cfg['num_output_channels'], |
|||
num_joints=aic_channel_cfg['dataset_joints'], |
|||
dataset_channel=aic_channel_cfg['dataset_channel'], |
|||
inference_channel=aic_channel_cfg['inference_channel'], |
|||
soft_nms=False, |
|||
nms_thr=1.0, |
|||
oks_thr=0.9, |
|||
vis_thr=0.2, |
|||
use_gt_bbox=True, |
|||
det_bbox_thr=0.0, |
|||
bbox_file='data/coco/person_detection_results/' |
|||
'COCO_val2017_detections_AP_H_56_person.json', |
|||
max_num_joints=133, |
|||
dataset_idx=1, |
|||
) |
|||
|
|||
mpii_data_cfg = dict( |
|||
image_size=[192, 256], |
|||
heatmap_size=[48, 64], |
|||
num_output_channels=mpii_channel_cfg['num_output_channels'], |
|||
num_joints=mpii_channel_cfg['dataset_joints'], |
|||
dataset_channel=mpii_channel_cfg['dataset_channel'], |
|||
inference_channel=mpii_channel_cfg['inference_channel'], |
|||
max_num_joints=133, |
|||
dataset_idx=2, |
|||
use_gt_bbox=True, |
|||
bbox_file=None, |
|||
) |
|||
|
|||
ap10k_data_cfg = dict( |
|||
image_size=[192, 256], |
|||
heatmap_size=[48, 64], |
|||
num_output_channels=channel_cfg['num_output_channels'], |
|||
num_joints=channel_cfg['dataset_joints'], |
|||
dataset_channel=channel_cfg['dataset_channel'], |
|||
inference_channel=channel_cfg['inference_channel'], |
|||
soft_nms=False, |
|||
nms_thr=1.0, |
|||
oks_thr=0.9, |
|||
vis_thr=0.2, |
|||
use_gt_bbox=True, |
|||
det_bbox_thr=0.0, |
|||
bbox_file='', |
|||
max_num_joints=133, |
|||
dataset_idx=3, |
|||
) |
|||
|
|||
ap36k_data_cfg = dict( |
|||
image_size=[192, 256], |
|||
heatmap_size=[48, 64], |
|||
num_output_channels=channel_cfg['num_output_channels'], |
|||
num_joints=channel_cfg['dataset_joints'], |
|||
dataset_channel=channel_cfg['dataset_channel'], |
|||
inference_channel=channel_cfg['inference_channel'], |
|||
soft_nms=False, |
|||
nms_thr=1.0, |
|||
oks_thr=0.9, |
|||
vis_thr=0.2, |
|||
use_gt_bbox=True, |
|||
det_bbox_thr=0.0, |
|||
bbox_file='', |
|||
max_num_joints=133, |
|||
dataset_idx=4, |
|||
) |
|||
|
|||
cocowholebody_data_cfg = dict( |
|||
image_size=[192, 256], |
|||
heatmap_size=[48, 64], |
|||
num_output_channels=cocowholebody_channel_cfg['num_output_channels'], |
|||
num_joints=cocowholebody_channel_cfg['dataset_joints'], |
|||
dataset_channel=cocowholebody_channel_cfg['dataset_channel'], |
|||
inference_channel=cocowholebody_channel_cfg['inference_channel'], |
|||
soft_nms=False, |
|||
nms_thr=1.0, |
|||
oks_thr=0.9, |
|||
vis_thr=0.2, |
|||
use_gt_bbox=False, |
|||
det_bbox_thr=0.0, |
|||
bbox_file='data/coco/person_detection_results/' |
|||
'COCO_val2017_detections_AP_H_56_person.json', |
|||
dataset_idx=5, |
|||
max_num_joints=133, |
|||
) |
|||
|
|||
cocowholebody_train_pipeline = [ |
|||
dict(type='LoadImageFromFile'), |
|||
dict(type='TopDownRandomFlip', flip_prob=0.5), |
|||
dict( |
|||
type='TopDownHalfBodyTransform', |
|||
num_joints_half_body=8, |
|||
prob_half_body=0.3), |
|||
dict( |
|||
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), |
|||
dict(type='TopDownAffine'), |
|||
dict(type='ToTensor'), |
|||
dict( |
|||
type='NormalizeTensor', |
|||
mean=[0.485, 0.456, 0.406], |
|||
std=[0.229, 0.224, 0.225]), |
|||
dict(type='TopDownGenerateTarget', sigma=2), |
|||
dict( |
|||
type='Collect', |
|||
keys=['img', 'target', 'target_weight'], |
|||
meta_keys=[ |
|||
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', |
|||
'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx' |
|||
]), |
|||
] |
|||
|
|||
ap10k_train_pipeline = [ |
|||
dict(type='LoadImageFromFile'), |
|||
dict(type='TopDownRandomFlip', flip_prob=0.5), |
|||
dict( |
|||
type='TopDownHalfBodyTransform', |
|||
num_joints_half_body=8, |
|||
prob_half_body=0.3), |
|||
dict( |
|||
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), |
|||
dict(type='TopDownAffine'), |
|||
dict(type='ToTensor'), |
|||
dict( |
|||
type='NormalizeTensor', |
|||
mean=[0.485, 0.456, 0.406], |
|||
std=[0.229, 0.224, 0.225]), |
|||
dict(type='TopDownGenerateTarget', sigma=2), |
|||
dict( |
|||
type='Collect', |
|||
keys=['img', 'target', 'target_weight'], |
|||
meta_keys=[ |
|||
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', |
|||
'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx' |
|||
]), |
|||
] |
|||
|
|||
aic_train_pipeline = [ |
|||
dict(type='LoadImageFromFile'), |
|||
dict(type='TopDownRandomFlip', flip_prob=0.5), |
|||
dict( |
|||
type='TopDownHalfBodyTransform', |
|||
num_joints_half_body=8, |
|||
prob_half_body=0.3), |
|||
dict( |
|||
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), |
|||
dict(type='TopDownAffine'), |
|||
dict(type='ToTensor'), |
|||
dict( |
|||
type='NormalizeTensor', |
|||
mean=[0.485, 0.456, 0.406], |
|||
std=[0.229, 0.224, 0.225]), |
|||
dict(type='TopDownGenerateTarget', sigma=2), |
|||
dict( |
|||
type='Collect', |
|||
keys=['img', 'target', 'target_weight'], |
|||
meta_keys=[ |
|||
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', |
|||
'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx' |
|||
]), |
|||
] |
|||
|
|||
mpii_train_pipeline = [ |
|||
dict(type='LoadImageFromFile'), |
|||
dict(type='TopDownRandomFlip', flip_prob=0.5), |
|||
dict( |
|||
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), |
|||
dict(type='TopDownAffine', use_udp=True), |
|||
dict(type='ToTensor'), |
|||
dict( |
|||
type='NormalizeTensor', |
|||
mean=[0.485, 0.456, 0.406], |
|||
std=[0.229, 0.224, 0.225]), |
|||
dict( |
|||
type='TopDownGenerateTarget', |
|||
sigma=2, |
|||
encoding='UDP', |
|||
target_type=target_type), |
|||
dict( |
|||
type='Collect', |
|||
keys=['img', 'target', 'target_weight'], |
|||
meta_keys=[ |
|||
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', |
|||
'rotation', 'flip_pairs', 'dataset_idx' |
|||
]), |
|||
] |
|||
|
|||
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownRandomFlip', flip_prob=0.5),
    dict(
        type='TopDownHalfBodyTransform',
        num_joints_half_body=8,
        prob_half_body=0.3),
    dict(
        type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
    dict(type='TopDownAffine', use_udp=True),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(
        type='TopDownGenerateTarget',
        sigma=2,
        encoding='UDP',
        target_type=target_type),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
            'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx'
        ]),
]

val_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownAffine', use_udp=True),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(
        type='Collect',
        keys=['img'],
        meta_keys=[
            'image_file', 'center', 'scale', 'rotation', 'bbox_score',
            'flip_pairs', 'dataset_idx'
        ]),
]

test_pipeline = val_pipeline

data_root = 'data/coco'
aic_data_root = 'data/aic'
mpii_data_root = 'data/mpii'
ap10k_data_root = 'data/ap10k'
ap36k_data_root = 'data/ap36k'

data = dict(
    samples_per_gpu=128,
    workers_per_gpu=8,
    val_dataloader=dict(samples_per_gpu=64),
    test_dataloader=dict(samples_per_gpu=64),
    train=[
        dict(
            type='TopDownCocoDataset',
            ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
            img_prefix=f'{data_root}/train2017/',
            data_cfg=data_cfg,
            pipeline=train_pipeline,
            dataset_info={{_base_.dataset_info}}),
        dict(
            type='TopDownAicDataset',
            ann_file=f'{aic_data_root}/annotations/person_keypoints_train2017.json',
            img_prefix=f'{aic_data_root}/ai_challenger_keypoint_train_20170909/'
            'keypoint_train_images_20170902/',
            data_cfg=aic_data_cfg,
            pipeline=aic_train_pipeline,
            dataset_info={{_base_.aic_info}}),
        dict(
            type='TopDownMpiiDataset',
            ann_file=f'{mpii_data_root}/annotations/mpii_train.json',
            img_prefix=f'{mpii_data_root}/images/',
            data_cfg=mpii_data_cfg,
            pipeline=mpii_train_pipeline,
            dataset_info={{_base_.mpii_info}}),
        dict(
            type='AnimalAP10KDataset',
            ann_file=f'{ap10k_data_root}/annotations/ap10k-train-split1.json',
            img_prefix=f'{ap10k_data_root}/data/',
            data_cfg=ap10k_data_cfg,
            pipeline=ap10k_train_pipeline,
            dataset_info={{_base_.ap10k_info}}),
        dict(
            type='AnimalAP10KDataset',
            ann_file=f'{ap36k_data_root}/annotations/train_annotations_1.json',
            img_prefix=f'{ap36k_data_root}/',
            data_cfg=ap36k_data_cfg,
            pipeline=ap10k_train_pipeline,
            dataset_info={{_base_.ap10k_info}}),
        dict(
            type='TopDownCocoWholeBodyDataset',
            ann_file=f'{data_root}/annotations/coco_wholebody_train_v1.0.json',
            img_prefix=f'{data_root}/train2017/',
            data_cfg=cocowholebody_data_cfg,
            pipeline=cocowholebody_train_pipeline,
            dataset_info={{_base_.cocowholebody_info}}),
    ],
    val=dict(
        type='TopDownCocoDataset',
        ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
        img_prefix=f'{data_root}/val2017/',
        data_cfg=data_cfg,
        pipeline=val_pipeline,
        dataset_info={{_base_.dataset_info}}),
    test=dict(
        type='TopDownCocoDataset',
        ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
        img_prefix=f'{data_root}/val2017/',
        data_cfg=data_cfg,
        pipeline=test_pipeline,
        dataset_info={{_base_.dataset_info}}),
)
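
# The list-valued `train` field above is concatenated into a single loader;
# each entry's data_cfg carries the dataset_idx that the `Collect` step
# copies into every sample's meta (the whole-body data_cfg uses 5 above),
# and that index later routes both the MoE FFN experts and the matching
# keypoint head. A quick sanity check, assuming mmcv is installed
# (hypothetical config path, not part of this file):
#   >>> from mmcv import Config
#   >>> cfg = Config.fromfile('configs/vitpose_moe_config.py')
#   >>> [d['data_cfg'].get('dataset_idx') for d in cfg.data.train]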

@ -0,0 +1,384 @@
# Copyright (c) OpenMMLab. All rights reserved.
import math
from functools import partial

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from timm.models.layers import drop_path, to_2tuple, trunc_normal_

from ..builder import BACKBONES
from .base_backbone import BaseBackbone

def get_abs_pos(abs_pos, h, w, ori_h, ori_w, has_cls_token=True):
    """Calculate absolute positional embeddings.

    If needed, resize the embeddings, removing the cls_token entry before
    interpolation and re-attaching it afterwards.

    Args:
        abs_pos (Tensor): Absolute positional embeddings with shape
            (1, num_position, C).
        h (int): Height of the target token grid.
        w (int): Width of the target token grid.
        ori_h (int): Height of the token grid the embeddings were built for.
        ori_w (int): Width of the token grid the embeddings were built for.
        has_cls_token (bool): If True, abs_pos holds one extra embedding for
            the cls token, which is kept unchanged.

    Returns:
        Tensor: Positional embeddings with shape (1, h * w (+ 1), C).
    """
    cls_token = None
    B, L, C = abs_pos.shape
    if has_cls_token:
        cls_token = abs_pos[:, 0:1]
        abs_pos = abs_pos[:, 1:]

    if ori_h != h or ori_w != w:
        new_abs_pos = F.interpolate(
            abs_pos.reshape(1, ori_h, ori_w, -1).permute(0, 3, 1, 2),
            size=(h, w),
            mode="bicubic",
            align_corners=False,
        ).permute(0, 2, 3, 1).reshape(B, -1, C)
    else:
        new_abs_pos = abs_pos

    if cls_token is not None:
        new_abs_pos = torch.cat([cls_token, new_abs_pos], dim=1)
    return new_abs_pos
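
# Illustrative shape check (values assumed, not part of the module): a model
# trained with a 16x12 token grid plus cls token can be deployed on a 24x18
# grid; the grid part is resized bicubically, the cls entry passes through.
#   >>> pos_embed = torch.zeros(1, 16 * 12 + 1, 768)
#   >>> get_abs_pos(pos_embed, 24, 18, 16, 12, has_cls_token=True).shape
#   torch.Size([1, 433, 768])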

class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in the main
    path of residual blocks)."""

    def __init__(self, drop_prob=None):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        return drop_path(x, self.drop_prob, self.training)

    def extra_repr(self):
        return 'p={}'.format(self.drop_prob)
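
# Illustrative behaviour (not part of the module): during training,
# timm's drop_path zeroes whole samples with probability p and rescales the
# survivors by 1 / (1 - p), so the residual branch keeps its expected value.
#   >>> m = DropPath(drop_prob=0.5)
#   >>> _ = m.train()
#   >>> y = m(torch.ones(8, 4, 16))   # per sample: all zeros or all 2.0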

class Mlp(nn.Module):

    def __init__(self, in_features, hidden_features=None, out_features=None,
                 act_layer=nn.GELU, drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x

class MoEMlp(nn.Module):
    """FFN whose last `part_features` output channels come from one of
    `num_expert` dataset-specific experts; the remaining channels are
    shared across all datasets."""

    def __init__(self, num_expert=1, in_features=1024, hidden_features=None,
                 out_features=None, act_layer=nn.GELU, drop=0.,
                 part_features=256):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.part_features = part_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        # the shared projection covers all but the expert-owned channels
        self.fc2 = nn.Linear(hidden_features, out_features - part_features)
        self.drop = nn.Dropout(drop)

        self.num_expert = num_expert
        experts = []
        for i in range(num_expert):
            experts.append(nn.Linear(hidden_features, part_features))
        self.experts = nn.ModuleList(experts)

    def forward(self, x, indices):
        expert_x = torch.zeros_like(
            x[:, :, -self.part_features:], device=x.device, dtype=x.dtype)

        x = self.fc1(x)
        x = self.act(x)
        shared_x = self.fc2(x)
        indices = indices.view(-1, 1, 1)

        # To support ddp training: run every expert on the whole batch and
        # mask out non-matching samples, so all parameters receive gradients
        # on every step and DDP buckets stay consistent across ranks.
        for i in range(self.num_expert):
            selectedIndex = (indices == i)
            current_x = self.experts[i](x) * selectedIndex
            expert_x = expert_x + current_x

        x = torch.cat([shared_x, expert_x], dim=-1)

        return x
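
# Illustrative routing (values assumed, not part of the module): each token
# keeps the shared fc2 output in its first out_features - part_features
# channels, while the last part_features channels come from the expert
# selected by that sample's dataset index.
#   >>> mlp = MoEMlp(num_expert=6, in_features=32, hidden_features=64,
#   ...              out_features=32, part_features=8)
#   >>> x = torch.randn(4, 10, 32)          # (batch, tokens, channels)
#   >>> idx = torch.tensor([0, 2, 2, 5])    # dataset index per sample
#   >>> mlp(x, idx).shape
#   torch.Size([4, 10, 32])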

class Attention(nn.Module):

    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None,
                 attn_drop=0., proj_drop=0., attn_head_dim=None):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.dim = dim

        if attn_head_dim is not None:
            head_dim = attn_head_dim
        all_head_dim = head_dim * self.num_heads

        self.scale = qk_scale or head_dim ** -0.5

        self.qkv = nn.Linear(dim, all_head_dim * 3, bias=qkv_bias)

        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(all_head_dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):
        B, N, C = x.shape
        qkv = self.qkv(x)
        qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)

        q = q * self.scale
        attn = (q @ k.transpose(-2, -1))

        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
        x = self.proj(x)
        x = self.proj_drop(x)

        return x

class Block(nn.Module):

    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False,
                 qk_scale=None, drop=0., attn_drop=0., drop_path=0.,
                 act_layer=nn.GELU, norm_layer=nn.LayerNorm,
                 attn_head_dim=None, num_expert=1, part_features=None):
        super().__init__()

        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
            attn_drop=attn_drop, proj_drop=drop, attn_head_dim=attn_head_dim)

        # NOTE: drop path for stochastic depth, we shall see if this is
        # better than dropout here
        self.drop_path = DropPath(
            drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = MoEMlp(
            num_expert=num_expert,
            in_features=dim,
            hidden_features=mlp_hidden_dim,
            act_layer=act_layer,
            drop=drop,
            part_features=part_features)

    def forward(self, x, indices=None):
        x = x + self.drop_path(self.attn(self.norm1(x)))
        x = x + self.drop_path(self.mlp(self.norm2(x), indices))
        return x
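
# Illustrative forward (values assumed, not part of the module): a pre-norm
# transformer block whose FFN is the MoEMlp above, so `indices` must
# accompany every call.
#   >>> blk = Block(dim=64, num_heads=4, num_expert=6, part_features=16)
#   >>> blk(torch.randn(2, 10, 64), indices=torch.tensor([0, 3])).shape
#   torch.Size([2, 10, 64])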


class PatchEmbed(nn.Module):
    """Image to Patch Embedding."""

    def __init__(self, img_size=224, patch_size=16, in_chans=3,
                 embed_dim=768, ratio=1):
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)
        num_patches = (img_size[1] // patch_size[1]) * \
            (img_size[0] // patch_size[0]) * (ratio ** 2)
        self.patch_shape = (int(img_size[0] // patch_size[0] * ratio),
                            int(img_size[1] // patch_size[1] * ratio))
        self.origin_patch_shape = (int(img_size[0] // patch_size[0]),
                                   int(img_size[1] // patch_size[1]))
        self.img_size = img_size
        self.patch_size = patch_size
        self.num_patches = num_patches

        self.proj = nn.Conv2d(
            in_chans, embed_dim, kernel_size=patch_size,
            stride=(patch_size[0] // ratio), padding=4 + 2 * (ratio // 2 - 1))

    def forward(self, x, **kwargs):
        B, C, H, W = x.shape
        x = self.proj(x)
        Hp, Wp = x.shape[2], x.shape[3]

        x = x.flatten(2).transpose(1, 2)
        return x, (Hp, Wp)
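
# Illustrative shapes (values assumed, not part of the module): with the
# default ratio=1 the conv stride equals the patch size, giving the usual
# non-overlapping tokenisation; ratio > 1 shrinks the stride to
# patch_size // ratio, producing overlapping patches and a denser token grid
# at the same input size.
#   >>> pe = PatchEmbed(img_size=(256, 192), patch_size=16, embed_dim=768)
#   >>> tokens, (Hp, Wp) = pe(torch.randn(1, 3, 256, 192))
#   >>> tokens.shape, (Hp, Wp)
#   (torch.Size([1, 192, 768]), (16, 12))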


class HybridEmbed(nn.Module):
    """CNN Feature Map Embedding.

    Extract feature map from CNN, flatten, project to embedding dim.
    """

    def __init__(self, backbone, img_size=224, feature_size=None, in_chans=3,
                 embed_dim=768):
        super().__init__()
        assert isinstance(backbone, nn.Module)
        img_size = to_2tuple(img_size)
        self.img_size = img_size
        self.backbone = backbone
        if feature_size is None:
            with torch.no_grad():
                # probe the backbone once to discover its output geometry
                training = backbone.training
                if training:
                    backbone.eval()
                o = self.backbone(
                    torch.zeros(1, in_chans, img_size[0], img_size[1]))[-1]
                feature_size = o.shape[-2:]
                feature_dim = o.shape[1]
                backbone.train(training)
        else:
            feature_size = to_2tuple(feature_size)
            feature_dim = self.backbone.feature_info.channels()[-1]
        self.num_patches = feature_size[0] * feature_size[1]
        self.proj = nn.Linear(feature_dim, embed_dim)

    def forward(self, x):
        x = self.backbone(x)[-1]
        x = x.flatten(2).transpose(1, 2)
        x = self.proj(x)
        return x


@BACKBONES.register_module()
class ViTMoE(BaseBackbone):

    def __init__(self,
                 img_size=224,
                 patch_size=16,
                 in_chans=3,
                 num_classes=80,
                 embed_dim=768,
                 depth=12,
                 num_heads=12,
                 mlp_ratio=4.,
                 qkv_bias=False,
                 qk_scale=None,
                 drop_rate=0.,
                 attn_drop_rate=0.,
                 drop_path_rate=0.,
                 hybrid_backbone=None,
                 norm_layer=None,
                 use_checkpoint=False,
                 frozen_stages=-1,
                 ratio=1,
                 last_norm=True,
                 patch_padding='pad',
                 freeze_attn=False,
                 freeze_ffn=False,
                 num_expert=1,
                 part_features=None):
        # Protect mutable default arguments
        super(ViTMoE, self).__init__()
        norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
        self.num_classes = num_classes
        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
        self.frozen_stages = frozen_stages
        self.use_checkpoint = use_checkpoint
        self.patch_padding = patch_padding
        self.freeze_attn = freeze_attn
        self.freeze_ffn = freeze_ffn
        self.depth = depth

        if hybrid_backbone is not None:
            self.patch_embed = HybridEmbed(
                hybrid_backbone, img_size=img_size, in_chans=in_chans,
                embed_dim=embed_dim)
        else:
            self.patch_embed = PatchEmbed(
                img_size=img_size, patch_size=patch_size, in_chans=in_chans,
                embed_dim=embed_dim, ratio=ratio)
        num_patches = self.patch_embed.num_patches

        self.part_features = part_features

        self.pos_embed = nn.Parameter(
            torch.zeros(1, num_patches + 1, embed_dim))

        # stochastic depth decay rule
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]

        self.blocks = nn.ModuleList([
            Block(
                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate,
                attn_drop=attn_drop_rate, drop_path=dpr[i],
                norm_layer=norm_layer, num_expert=num_expert,
                part_features=part_features) for i in range(depth)
        ])

        self.last_norm = norm_layer(embed_dim) if last_norm else nn.Identity()

        if self.pos_embed is not None:
            trunc_normal_(self.pos_embed, std=.02)

        self._freeze_stages()

    def _freeze_stages(self):
        """Freeze parameters."""
        if self.frozen_stages >= 0:
            self.patch_embed.eval()
            for param in self.patch_embed.parameters():
                param.requires_grad = False

        for i in range(1, self.frozen_stages + 1):
            m = self.blocks[i]
            m.eval()
            for param in m.parameters():
                param.requires_grad = False

        if self.freeze_attn:
            for i in range(0, self.depth):
                m = self.blocks[i]
                m.attn.eval()
                m.norm1.eval()
                for param in m.attn.parameters():
                    param.requires_grad = False
                for param in m.norm1.parameters():
                    param.requires_grad = False

        if self.freeze_ffn:
            self.pos_embed.requires_grad = False
            self.patch_embed.eval()
            for param in self.patch_embed.parameters():
                param.requires_grad = False
            for i in range(0, self.depth):
                m = self.blocks[i]
                m.mlp.eval()
                m.norm2.eval()
                for param in m.mlp.parameters():
                    param.requires_grad = False
                for param in m.norm2.parameters():
                    param.requires_grad = False

    def init_weights(self, pretrained=None):
        """Initialize the weights in backbone.

        Args:
            pretrained (str, optional): Path to pre-trained weights.
                Defaults to None.
        """
        super().init_weights(
            pretrained,
            patch_padding=self.patch_padding,
            part_features=self.part_features)

        if pretrained is None:

            def _init_weights(m):
                if isinstance(m, nn.Linear):
                    trunc_normal_(m.weight, std=.02)
                if isinstance(m, nn.Linear) and m.bias is not None:
                    nn.init.constant_(m.bias, 0)
                elif isinstance(m, nn.LayerNorm):
                    nn.init.constant_(m.bias, 0)
                    nn.init.constant_(m.weight, 1.0)

            self.apply(_init_weights)

    def get_num_layers(self):
        return len(self.blocks)

    @torch.jit.ignore
    def no_weight_decay(self):
        return {'pos_embed', 'cls_token'}

    def forward_features(self, x, dataset_source=None):
        B, C, H, W = x.shape
        x, (Hp, Wp) = self.patch_embed(x)

        if self.pos_embed is not None:
            # fit for multi-GPU training: adding the cls-token entry keeps it
            # in the graph; since that first element is zero under the
            # sin-cos scheme, it causes no numerical difference
            x = x + self.pos_embed[:, 1:] + self.pos_embed[:, :1]

        for blk in self.blocks:
            if self.use_checkpoint:
                x = checkpoint.checkpoint(blk, x, dataset_source)
            else:
                x = blk(x, dataset_source)

        x = self.last_norm(x)

        xp = x.permute(0, 2, 1).reshape(B, -1, Hp, Wp).contiguous()

        return xp

    def forward(self, x, dataset_source=None):
        x = self.forward_features(x, dataset_source)
        return x

    def train(self, mode=True):
        """Convert the model into training mode."""
        super().train(mode)
        self._freeze_stages()
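
# Illustrative usage (hypothetical sizes, not part of the module): build a
# small ViTMoE and run one forward pass with per-sample dataset indices; the
# output is a spatial feature map ready for a top-down heatmap head.
#   >>> model = ViTMoE(img_size=(256, 192), patch_size=16, embed_dim=384,
#   ...                depth=12, num_heads=6, num_expert=6, part_features=96)
#   >>> imgs = torch.randn(2, 3, 256, 192)
#   >>> model(imgs, dataset_source=torch.tensor([0, 5])).shape
#   torch.Size([2, 384, 16, 12])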
@ -0,0 +1,351 @@
# Copyright (c) OpenMMLab. All rights reserved.
import warnings

import torch
import torch.nn as nn

import mmcv
import numpy as np
from mmcv.image import imwrite
from mmcv.utils.misc import deprecated_api_warning
from mmcv.visualization.image import imshow

from mmpose.core import imshow_bboxes, imshow_keypoints
from .. import builder
from ..builder import POSENETS
from .base import BasePose

try:
    from mmcv.runner import auto_fp16
except ImportError:
    warnings.warn('auto_fp16 from mmpose will be deprecated from v0.15.0. '
                  'Please install mmcv>=1.1.4')
    from mmpose.core import auto_fp16


@POSENETS.register_module()
class TopDownMoE(BasePose):
    """Top-down pose detector with associate heads for auxiliary datasets.

    Args:
        backbone (dict): Backbone modules to extract feature.
        keypoint_head (dict): Keypoint head to process feature.
        train_cfg (dict): Config for training. Default: None.
        test_cfg (dict): Config for testing. Default: None.
        pretrained (str): Path to the pretrained models.
        loss_pose (None): Deprecated argument. Please use
            `loss_keypoint` for heads instead.
    """

    def __init__(self,
                 backbone,
                 neck=None,
                 keypoint_head=None,
                 associate_keypoint_head=None,
                 train_cfg=None,
                 test_cfg=None,
                 pretrained=None,
                 loss_pose=None):
        super().__init__()
        self.fp16_enabled = False

        self.backbone = builder.build_backbone(backbone)

        self.train_cfg = train_cfg
        self.test_cfg = test_cfg

        if neck is not None:
            self.neck = builder.build_neck(neck)

        if keypoint_head is not None:
            keypoint_head['train_cfg'] = train_cfg
            keypoint_head['test_cfg'] = test_cfg

            if 'loss_keypoint' not in keypoint_head and loss_pose is not None:
                warnings.warn(
                    '`loss_pose` for TopDown is deprecated, '
                    'use `loss_keypoint` for heads instead. See '
                    'https://github.com/open-mmlab/mmpose/pull/382'
                    ' for more information.', DeprecationWarning)
                keypoint_head['loss_keypoint'] = loss_pose

            self.keypoint_head = builder.build_head(keypoint_head)

        # heads for the auxiliary datasets; associate head `idx - 1` serves
        # samples whose meta carries dataset_idx == idx
        associate_keypoint_heads = []
        keypoint_heads_cnt = 1

        if associate_keypoint_head is not None:
            if not isinstance(associate_keypoint_head, list):
                associate_keypoint_head = [associate_keypoint_head]
            for single_keypoint_head in associate_keypoint_head:
                single_keypoint_head['train_cfg'] = train_cfg
                single_keypoint_head['test_cfg'] = test_cfg
                associate_keypoint_heads.append(
                    builder.build_head(single_keypoint_head))
                keypoint_heads_cnt += 1

        self.associate_keypoint_heads = nn.ModuleList(
            associate_keypoint_heads)

        self.keypoint_heads_cnt = keypoint_heads_cnt

        self.init_weights(pretrained=pretrained)

    @property
    def with_neck(self):
        """Check if has neck."""
        return hasattr(self, 'neck')

    @property
    def with_keypoint(self):
        """Check if has keypoint_head."""
        return hasattr(self, 'keypoint_head')

    def init_weights(self, pretrained=None):
        """Weight initialization for model."""
        self.backbone.init_weights(pretrained)
        if self.with_neck:
            self.neck.init_weights()
        if self.with_keypoint:
            self.keypoint_head.init_weights()
        for item in self.associate_keypoint_heads:
            item.init_weights()

    @auto_fp16(apply_to=('img', ))
    def forward(self,
                img,
                target=None,
                target_weight=None,
                img_metas=None,
                return_loss=True,
                return_heatmap=False,
                **kwargs):
        """Call either forward_train or forward_test depending on
        `return_loss`. Note that this setting changes the expected inputs:
        when `return_loss=True`, img and img_meta are single-nested (i.e.
        Tensor and List[dict]), and when `return_loss=False`, img and
        img_meta should be double-nested (i.e. List[Tensor],
        List[List[dict]]), with the outer list indicating test-time
        augmentations.

        Note:
            - batch_size: N
            - num_keypoints: K
            - num_img_channel: C (Default: 3)
            - img height: imgH
            - img width: imgW
            - heatmaps height: H
            - heatmaps width: W

        Args:
            img (torch.Tensor[NxCximgHximgW]): Input images.
            target (torch.Tensor[NxKxHxW]): Target heatmaps.
            target_weight (torch.Tensor[NxKx1]): Weights across
                different joint types.
            img_metas (list(dict)): Information about data augmentation.
                By default this includes:

                - "image_file": path to the image file
                - "center": center of the bbox
                - "scale": scale of the bbox
                - "rotation": rotation of the bbox
                - "bbox_score": score of bbox
            return_loss (bool): Option to `return loss`. `return loss=True`
                for training, `return loss=False` for validation & test.
            return_heatmap (bool): Option to return heatmap.

        Returns:
            dict|tuple: if `return loss` is true, then return losses. \
                Otherwise, return predicted poses, boxes, image paths \
                and heatmaps.
        """
        if return_loss:
            return self.forward_train(img, target, target_weight, img_metas,
                                      **kwargs)
        return self.forward_test(
            img, img_metas, return_heatmap=return_heatmap, **kwargs)

    def forward_train(self, img, target, target_weight, img_metas, **kwargs):
        """Defines the computation performed at every call when training."""
        img_sources = torch.from_numpy(
            np.array([ele['dataset_idx']
                      for ele in img_metas])).to(img.device)

        output = self.backbone(img, img_sources)
        if self.with_neck:
            output = self.neck(output)

        losses = dict()

        # the main head (dataset_idx == 0) sees the full batch; targets and
        # weights of samples from other datasets are masked to zero
        main_stream_select = (img_sources == 0)
        output_select = self.keypoint_head(output)

        target_select = target * main_stream_select.view(-1, 1, 1, 1)
        target_weight_select = target_weight * main_stream_select.view(
            -1, 1, 1)

        keypoint_losses = self.keypoint_head.get_loss(
            output_select, target_select, target_weight_select)
        losses['main_stream_loss'] = keypoint_losses['heatmap_loss']
        keypoint_accuracy = self.keypoint_head.get_accuracy(
            output_select, target_select, target_weight_select)
        losses['main_stream_acc'] = keypoint_accuracy['acc_pose']

        # each associate head is trained the same way on its own dataset
        for idx in range(1, self.keypoint_heads_cnt):
            idx_select = (img_sources == idx)
            target_select = target * idx_select.view(-1, 1, 1, 1)
            target_weight_select = target_weight * idx_select.view(-1, 1, 1)
            output_select = self.associate_keypoint_heads[idx - 1](output)
            keypoint_losses = self.associate_keypoint_heads[idx - 1].get_loss(
                output_select, target_select, target_weight_select)
            losses[f'{idx}_loss'] = keypoint_losses['heatmap_loss']
            keypoint_accuracy = self.associate_keypoint_heads[
                idx - 1].get_accuracy(
                    output_select, target_select, target_weight_select)
            losses[f'{idx}_acc'] = keypoint_accuracy['acc_pose']

        return losses
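
# The per-dataset selection above is a masking trick: instead of slicing the
# batch, targets and target weights of non-matching samples are multiplied by
# zero, so every head processes the full batch but only accrues loss on its
# own samples. Minimal sketch of the same idea (not part of the module):
#   >>> img_sources = torch.tensor([0, 1, 0, 2])
#   >>> mask = (img_sources == 0).view(-1, 1, 1, 1)   # keep only dataset 0
#   >>> # target * mask zeroes every heatmap belonging to another dataset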

    def forward_test(self, img, img_metas, return_heatmap=False, **kwargs):
        """Defines the computation performed at every call when testing."""
        assert img.size(0) == len(img_metas)
        batch_size, _, img_height, img_width = img.shape
        if batch_size > 1:
            assert 'bbox_id' in img_metas[0]

        result = {}
        img_sources = torch.from_numpy(
            np.array([ele['dataset_idx']
                      for ele in img_metas])).to(img.device)

        features = self.backbone(img, img_sources)

        if self.with_neck:
            features = self.neck(features)
        if self.with_keypoint:
            output_heatmap = self.keypoint_head.inference_model(
                features, flip_pairs=None)

        if self.test_cfg.get('flip_test', True):
            img_flipped = img.flip(3)
            features_flipped = self.backbone(img_flipped, img_sources)
            if self.with_neck:
                features_flipped = self.neck(features_flipped)
            if self.with_keypoint:
                output_flipped_heatmap = self.keypoint_head.inference_model(
                    features_flipped, img_metas[0]['flip_pairs'])
                output_heatmap = (output_heatmap +
                                  output_flipped_heatmap) * 0.5

        if self.with_keypoint:
            keypoint_result = self.keypoint_head.decode(
                img_metas, output_heatmap, img_size=[img_width, img_height])
            result.update(keypoint_result)

            if not return_heatmap:
                output_heatmap = None

            result['output_heatmap'] = output_heatmap

        return result

    def forward_dummy(self, img):
        """Used for computing network FLOPs.

        See ``tools/get_flops.py``.

        Args:
            img (torch.Tensor): Input image.

        Returns:
            Tensor: Output heatmaps.
        """
        output = self.backbone(img)
        if self.with_neck:
            output = self.neck(output)
        if self.with_keypoint:
            output = self.keypoint_head(output)
        return output

    @deprecated_api_warning({'pose_limb_color': 'pose_link_color'},
                            cls_name='TopDown')
    def show_result(self,
                    img,
                    result,
                    skeleton=None,
                    kpt_score_thr=0.3,
                    bbox_color='green',
                    pose_kpt_color=None,
                    pose_link_color=None,
                    text_color='white',
                    radius=4,
                    thickness=1,
                    font_scale=0.5,
                    bbox_thickness=1,
                    win_name='',
                    show=False,
                    show_keypoint_weight=False,
                    wait_time=0,
                    out_file=None):
        """Draw `result` over `img`.

        Args:
            img (str or Tensor): The image to be displayed.
            result (list[dict]): The results to draw over `img`
                (bbox_result, pose_result).
            skeleton (list[list]): The connection of keypoints.
                skeleton is 0-based indexing.
            kpt_score_thr (float, optional): Minimum score of keypoints
                to be shown. Default: 0.3.
            bbox_color (str or tuple or :obj:`Color`): Color of bbox lines.
            pose_kpt_color (np.array[Nx3]): Color of N keypoints.
                If None, do not draw keypoints.
            pose_link_color (np.array[Mx3]): Color of M links.
                If None, do not draw links.
            text_color (str or tuple or :obj:`Color`): Color of texts.
            radius (int): Radius of circles.
            thickness (int): Thickness of lines.
            font_scale (float): Font scales of texts.
            win_name (str): The window name.
            show (bool): Whether to show the image. Default: False.
            show_keypoint_weight (bool): Whether to change the transparency
                using the predicted confidence scores of keypoints.
            wait_time (int): Value of waitKey param.
                Default: 0.
            out_file (str or None): The filename to write the image.
                Default: None.

        Returns:
            Tensor: Visualized img, only if not `show` or `out_file`.
        """
        img = mmcv.imread(img)
        img = img.copy()

        bbox_result = []
        bbox_labels = []
        pose_result = []
        for res in result:
            if 'bbox' in res:
                bbox_result.append(res['bbox'])
                bbox_labels.append(res.get('label', None))
            pose_result.append(res['keypoints'])

        if bbox_result:
            bboxes = np.vstack(bbox_result)
            # draw bounding boxes
            imshow_bboxes(
                img,
                bboxes,
                labels=bbox_labels,
                colors=bbox_color,
                text_color=text_color,
                thickness=bbox_thickness,
                font_scale=font_scale,
                show=False)

        if pose_result:
            imshow_keypoints(img, pose_result, skeleton, kpt_score_thr,
                             pose_kpt_color, pose_link_color, radius,
                             thickness)

        if show:
            imshow(img, win_name, wait_time)

        if out_file is not None:
            imwrite(img, out_file)

        return img
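
# Illustrative call (hypothetical paths and values, not part of the module):
#   >>> results = [dict(bbox=np.array([10, 10, 200, 300, 0.9]),
#   ...                 keypoints=np.random.rand(17, 3))]
#   >>> model.show_result('demo.jpg', results, radius=4, thickness=1,
#   ...                   out_file='vis_demo.jpg')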