18 changed files with 3892 additions and 7 deletions
@@ -0,0 +1,140 @@
aic_info = dict(
    dataset_name='aic',
    paper_info=dict(
        author='Wu, Jiahong and Zheng, He and Zhao, Bo and '
        'Li, Yixin and Yan, Baoming and Liang, Rui and '
        'Wang, Wenjia and Zhou, Shipei and Lin, Guosen and '
        'Fu, Yanwei and others',
        title='Ai challenger: A large-scale dataset for going '
        'deeper in image understanding',
        container='arXiv',
        year='2017',
        homepage='https://github.com/AIChallenger/AI_Challenger_2017',
    ),
    keypoint_info={
        0: dict(
            name='right_shoulder',
            id=0,
            color=[255, 128, 0],
            type='upper',
            swap='left_shoulder'),
        1: dict(
            name='right_elbow',
            id=1,
            color=[255, 128, 0],
            type='upper',
            swap='left_elbow'),
        2: dict(
            name='right_wrist',
            id=2,
            color=[255, 128, 0],
            type='upper',
            swap='left_wrist'),
        3: dict(
            name='left_shoulder',
            id=3,
            color=[0, 255, 0],
            type='upper',
            swap='right_shoulder'),
        4: dict(
            name='left_elbow',
            id=4,
            color=[0, 255, 0],
            type='upper',
            swap='right_elbow'),
        5: dict(
            name='left_wrist',
            id=5,
            color=[0, 255, 0],
            type='upper',
            swap='right_wrist'),
        6: dict(
            name='right_hip',
            id=6,
            color=[255, 128, 0],
            type='lower',
            swap='left_hip'),
        7: dict(
            name='right_knee',
            id=7,
            color=[255, 128, 0],
            type='lower',
            swap='left_knee'),
        8: dict(
            name='right_ankle',
            id=8,
            color=[255, 128, 0],
            type='lower',
            swap='left_ankle'),
        9: dict(
            name='left_hip',
            id=9,
            color=[0, 255, 0],
            type='lower',
            swap='right_hip'),
        10: dict(
            name='left_knee',
            id=10,
            color=[0, 255, 0],
            type='lower',
            swap='right_knee'),
        11: dict(
            name='left_ankle',
            id=11,
            color=[0, 255, 0],
            type='lower',
            swap='right_ankle'),
        12: dict(
            name='head_top',
            id=12,
            color=[51, 153, 255],
            type='upper',
            swap=''),
        13: dict(
            name='neck', id=13, color=[51, 153, 255], type='upper', swap='')
    },
    skeleton_info={
        0: dict(
            link=('right_wrist', 'right_elbow'), id=0, color=[255, 128, 0]),
        1: dict(
            link=('right_elbow', 'right_shoulder'), id=1,
            color=[255, 128, 0]),
        2: dict(link=('right_shoulder', 'neck'), id=2, color=[51, 153, 255]),
        3: dict(link=('neck', 'left_shoulder'), id=3, color=[51, 153, 255]),
        4: dict(link=('left_shoulder', 'left_elbow'), id=4, color=[0, 255, 0]),
        5: dict(link=('left_elbow', 'left_wrist'), id=5, color=[0, 255, 0]),
        6: dict(link=('right_ankle', 'right_knee'), id=6, color=[255, 128, 0]),
        7: dict(link=('right_knee', 'right_hip'), id=7, color=[255, 128, 0]),
        8: dict(link=('right_hip', 'left_hip'), id=8, color=[51, 153, 255]),
        9: dict(link=('left_hip', 'left_knee'), id=9, color=[0, 255, 0]),
        10: dict(link=('left_knee', 'left_ankle'), id=10, color=[0, 255, 0]),
        11: dict(link=('head_top', 'neck'), id=11, color=[51, 153, 255]),
        12: dict(
            link=('right_shoulder', 'right_hip'), id=12,
            color=[51, 153, 255]),
        13: dict(
            link=('left_shoulder', 'left_hip'), id=13, color=[51, 153, 255])
    },
    joint_weights=[
        1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1.
    ],

    # 'https://github.com/AIChallenger/AI_Challenger_2017/blob/master/'
    # 'Evaluation/keypoint_eval/keypoint_eval.py#L50'
    # delta = 2 x sigma
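    # Illustrative arithmetic derived from the values below: for
    # right_shoulder (id 0), sigma = 0.01388152, which corresponds to
    # delta = 2 * 0.01388152 = 0.02776304 in the AIC evaluation script.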
    sigmas=[
        0.01388152, 0.01515228, 0.01057665, 0.01417709, 0.01497891, 0.01402144,
        0.03909642, 0.03686941, 0.01981803, 0.03843971, 0.03412318, 0.02415081,
        0.01291456, 0.01236173
    ])
@@ -0,0 +1,142 @@
ap10k_info = dict(
    dataset_name='ap10k',
    paper_info=dict(
        author='Yu, Hang and Xu, Yufei and Zhang, Jing and '
        'Zhao, Wei and Guan, Ziyu and Tao, Dacheng',
        title='AP-10K: A Benchmark for Animal Pose Estimation in the Wild',
        container='35th Conference on Neural Information Processing Systems '
        '(NeurIPS 2021) Track on Datasets and Benchmarks.',
        year='2021',
        homepage='https://github.com/AlexTheBad/AP-10K',
    ),
    keypoint_info={
        0: dict(
            name='L_Eye', id=0, color=[0, 255, 0], type='upper',
            swap='R_Eye'),
        1: dict(
            name='R_Eye',
            id=1,
            color=[255, 128, 0],
            type='upper',
            swap='L_Eye'),
        2: dict(name='Nose', id=2, color=[51, 153, 255], type='upper', swap=''),
        3: dict(name='Neck', id=3, color=[51, 153, 255], type='upper', swap=''),
        4: dict(
            name='Root of tail',
            id=4,
            color=[51, 153, 255],
            type='lower',
            swap=''),
        5: dict(
            name='L_Shoulder',
            id=5,
            color=[51, 153, 255],
            type='upper',
            swap='R_Shoulder'),
        6: dict(
            name='L_Elbow',
            id=6,
            color=[51, 153, 255],
            type='upper',
            swap='R_Elbow'),
        7: dict(
            name='L_F_Paw',
            id=7,
            color=[0, 255, 0],
            type='upper',
            swap='R_F_Paw'),
        8: dict(
            name='R_Shoulder',
            id=8,
            color=[0, 255, 0],
            type='upper',
            swap='L_Shoulder'),
        9: dict(
            name='R_Elbow',
            id=9,
            color=[255, 128, 0],
            type='upper',
            swap='L_Elbow'),
        10: dict(
            name='R_F_Paw',
            id=10,
            color=[0, 255, 0],
            type='lower',
            swap='L_F_Paw'),
        11: dict(
            name='L_Hip',
            id=11,
            color=[255, 128, 0],
            type='lower',
            swap='R_Hip'),
        12: dict(
            name='L_Knee',
            id=12,
            color=[255, 128, 0],
            type='lower',
            swap='R_Knee'),
        13: dict(
            name='L_B_Paw',
            id=13,
            color=[0, 255, 0],
            type='lower',
            swap='R_B_Paw'),
        14: dict(
            name='R_Hip', id=14, color=[0, 255, 0], type='lower',
            swap='L_Hip'),
        15: dict(
            name='R_Knee',
            id=15,
            color=[0, 255, 0],
            type='lower',
            swap='L_Knee'),
        16: dict(
            name='R_B_Paw',
            id=16,
            color=[0, 255, 0],
            type='lower',
            swap='L_B_Paw'),
    },
    skeleton_info={
        0: dict(link=('L_Eye', 'R_Eye'), id=0, color=[0, 0, 255]),
        1: dict(link=('L_Eye', 'Nose'), id=1, color=[0, 0, 255]),
        2: dict(link=('R_Eye', 'Nose'), id=2, color=[0, 0, 255]),
        3: dict(link=('Nose', 'Neck'), id=3, color=[0, 255, 0]),
        4: dict(link=('Neck', 'Root of tail'), id=4, color=[0, 255, 0]),
        5: dict(link=('Neck', 'L_Shoulder'), id=5, color=[0, 255, 255]),
        6: dict(link=('L_Shoulder', 'L_Elbow'), id=6, color=[0, 255, 255]),
        7: dict(link=('L_Elbow', 'L_F_Paw'), id=7, color=[0, 255, 255]),
        8: dict(link=('Neck', 'R_Shoulder'), id=8, color=[6, 156, 250]),
        9: dict(link=('R_Shoulder', 'R_Elbow'), id=9, color=[6, 156, 250]),
        10: dict(link=('R_Elbow', 'R_F_Paw'), id=10, color=[6, 156, 250]),
        11: dict(link=('Root of tail', 'L_Hip'), id=11, color=[0, 255, 255]),
        12: dict(link=('L_Hip', 'L_Knee'), id=12, color=[0, 255, 255]),
        13: dict(link=('L_Knee', 'L_B_Paw'), id=13, color=[0, 255, 255]),
        14: dict(link=('Root of tail', 'R_Hip'), id=14, color=[6, 156, 250]),
        15: dict(link=('R_Hip', 'R_Knee'), id=15, color=[6, 156, 250]),
        16: dict(link=('R_Knee', 'R_B_Paw'), id=16, color=[6, 156, 250]),
    },
    joint_weights=[
        1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5,
        1.5
    ],
    sigmas=[
        0.025, 0.025, 0.026, 0.035, 0.035, 0.079, 0.072, 0.062, 0.079, 0.072,
        0.062, 0.107, 0.087, 0.089, 0.107, 0.087, 0.089
    ])
File diff suppressed because it is too large
@@ -0,0 +1,155 @@
mpii_info = dict(
    dataset_name='mpii',
    paper_info=dict(
        author='Mykhaylo Andriluka and Leonid Pishchulin and '
        'Peter Gehler and Schiele, Bernt',
        title='2D Human Pose Estimation: New Benchmark and '
        'State of the Art Analysis',
        container='IEEE Conference on Computer Vision and '
        'Pattern Recognition (CVPR)',
        year='2014',
        homepage='http://human-pose.mpi-inf.mpg.de/',
    ),
    keypoint_info={
        0: dict(
            name='right_ankle',
            id=0,
            color=[255, 128, 0],
            type='lower',
            swap='left_ankle'),
        1: dict(
            name='right_knee',
            id=1,
            color=[255, 128, 0],
            type='lower',
            swap='left_knee'),
        2: dict(
            name='right_hip',
            id=2,
            color=[255, 128, 0],
            type='lower',
            swap='left_hip'),
        3: dict(
            name='left_hip',
            id=3,
            color=[0, 255, 0],
            type='lower',
            swap='right_hip'),
        4: dict(
            name='left_knee',
            id=4,
            color=[0, 255, 0],
            type='lower',
            swap='right_knee'),
        5: dict(
            name='left_ankle',
            id=5,
            color=[0, 255, 0],
            type='lower',
            swap='right_ankle'),
        6: dict(
            name='pelvis', id=6, color=[51, 153, 255], type='lower', swap=''),
        7: dict(
            name='thorax', id=7, color=[51, 153, 255], type='upper', swap=''),
        8: dict(
            name='upper_neck',
            id=8,
            color=[51, 153, 255],
            type='upper',
            swap=''),
        9: dict(
            name='head_top', id=9, color=[51, 153, 255], type='upper',
            swap=''),
        10: dict(
            name='right_wrist',
            id=10,
            color=[255, 128, 0],
            type='upper',
            swap='left_wrist'),
        11: dict(
            name='right_elbow',
            id=11,
            color=[255, 128, 0],
            type='upper',
            swap='left_elbow'),
        12: dict(
            name='right_shoulder',
            id=12,
            color=[255, 128, 0],
            type='upper',
            swap='left_shoulder'),
        13: dict(
            name='left_shoulder',
            id=13,
            color=[0, 255, 0],
            type='upper',
            swap='right_shoulder'),
        14: dict(
            name='left_elbow',
            id=14,
            color=[0, 255, 0],
            type='upper',
            swap='right_elbow'),
        15: dict(
            name='left_wrist',
            id=15,
            color=[0, 255, 0],
            type='upper',
            swap='right_wrist')
    },
    skeleton_info={
        0: dict(
            link=('right_ankle', 'right_knee'), id=0, color=[255, 128, 0]),
        1: dict(link=('right_knee', 'right_hip'), id=1, color=[255, 128, 0]),
        2: dict(link=('right_hip', 'pelvis'), id=2, color=[255, 128, 0]),
        3: dict(link=('pelvis', 'left_hip'), id=3, color=[0, 255, 0]),
        4: dict(link=('left_hip', 'left_knee'), id=4, color=[0, 255, 0]),
        5: dict(link=('left_knee', 'left_ankle'), id=5, color=[0, 255, 0]),
        6: dict(link=('pelvis', 'thorax'), id=6, color=[51, 153, 255]),
        7: dict(link=('thorax', 'upper_neck'), id=7, color=[51, 153, 255]),
        8: dict(link=('upper_neck', 'head_top'), id=8, color=[51, 153, 255]),
        9: dict(
            link=('upper_neck', 'right_shoulder'), id=9, color=[255, 128, 0]),
        10: dict(
            link=('right_shoulder', 'right_elbow'), id=10,
            color=[255, 128, 0]),
        11: dict(
            link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]),
        12: dict(
            link=('upper_neck', 'left_shoulder'), id=12, color=[0, 255, 0]),
        13: dict(
            link=('left_shoulder', 'left_elbow'), id=13, color=[0, 255, 0]),
        14: dict(link=('left_elbow', 'left_wrist'), id=14, color=[0, 255, 0])
    },
    joint_weights=[
        1.5, 1.2, 1., 1., 1.2, 1.5, 1., 1., 1., 1., 1.5, 1.2, 1., 1., 1.2, 1.5
    ],
    # Adapted from COCO dataset.
    sigmas=[
        0.089, 0.083, 0.107, 0.107, 0.083, 0.089, 0.026, 0.026, 0.026, 0.026,
        0.062, 0.072, 0.179, 0.179, 0.072, 0.062
    ])
@@ -0,0 +1,500 @@
_base_ = [
    '../../../../_base_/default_runtime.py',
    '../../../../_base_/datasets/coco.py',
    '../../../../_base_/datasets/aic_info.py',
    '../../../../_base_/datasets/mpii_info.py',
    '../../../../_base_/datasets/ap10k_info.py',
    '../../../../_base_/datasets/coco_wholebody_info.py'
]
evaluation = dict(interval=10, metric='mAP', save_best='AP')

optimizer = dict(
    type='AdamW',
    lr=1e-3,
    betas=(0.9, 0.999),
    weight_decay=0.1,
    constructor='LayerDecayOptimizerConstructor',
    paramwise_cfg=dict(
        num_layers=12,
        layer_decay_rate=0.75,
        custom_keys={
            'bias': dict(decay_mult=0.),
            'pos_embed': dict(decay_mult=0.),
            'relative_position_bias_table': dict(decay_mult=0.),
            'norm': dict(decay_mult=0.)
        }))
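
# Note: with LayerDecayOptimizerConstructor, the learning rate of each
# transformer block is scaled by roughly layer_decay_rate ** (num_layers + 1 -
# block_index), so with num_layers=12 and a rate of 0.75 the earliest
# parameters train at about 0.75 ** 13 ~= 0.024 of the base lr (the exact
# exponent depends on the constructor implementation).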

optimizer_config = dict(grad_clip=dict(max_norm=1., norm_type=2))

# learning policy
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=0.001,
    step=[170, 200])
total_epochs = 210
target_type = 'GaussianHeatmap'
channel_cfg = dict(
    num_output_channels=17,
    dataset_joints=17,
    dataset_channel=[
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
    ],
    inference_channel=[
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
    ])
aic_channel_cfg = dict(
    num_output_channels=14,
    dataset_joints=14,
    dataset_channel=[
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
    ],
    inference_channel=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
mpii_channel_cfg = dict(
    num_output_channels=16,
    dataset_joints=16,
    dataset_channel=list(range(16)),
    inference_channel=list(range(16)))
crowdpose_channel_cfg = dict(
    num_output_channels=14,
    dataset_joints=14,
    dataset_channel=[
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
    ],
    inference_channel=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
ap10k_channel_cfg = dict(
    num_output_channels=17,
    dataset_joints=17,
    dataset_channel=[
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
    ],
    inference_channel=[
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
    ])
cocowholebody_channel_cfg = dict(
    num_output_channels=133,
    dataset_joints=133,
    dataset_channel=[
        list(range(133)),
    ],
    inference_channel=list(range(133)))


# model settings
model = dict(
    type='TopDownMoE',
    pretrained=None,
    backbone=dict(
        type='ViTMoE',
        img_size=(256, 192),
        patch_size=16,
        embed_dim=768,
        depth=12,
        num_heads=12,
        ratio=1,
        use_checkpoint=False,
        mlp_ratio=4,
        qkv_bias=True,
        drop_path_rate=0.3,
        num_expert=6,
        part_features=192
    ),
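    # The backbone is a mixture-of-experts ViT: num_expert=6 presumably
    # provides one FFN expert per training dataset below, and part_features=192
    # would be the dataset-specific slice of each 768-dim FFN output (an
    # assumption based on the ViTPose+ design, not verified here).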
    keypoint_head=dict(
        type='TopdownHeatmapSimpleHead',
        in_channels=768,
        num_deconv_layers=2,
        num_deconv_filters=(256, 256),
        num_deconv_kernels=(4, 4),
        extra=dict(final_conv_kernel=1, ),
        out_channels=channel_cfg['num_output_channels'],
        loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
    associate_keypoint_head=[
        dict(
            type='TopdownHeatmapSimpleHead',
            in_channels=768,
            num_deconv_layers=2,
            num_deconv_filters=(256, 256),
            num_deconv_kernels=(4, 4),
            extra=dict(final_conv_kernel=1, ),
            out_channels=aic_channel_cfg['num_output_channels'],
            loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
        dict(
            type='TopdownHeatmapSimpleHead',
            in_channels=768,
            num_deconv_layers=2,
            num_deconv_filters=(256, 256),
            num_deconv_kernels=(4, 4),
            extra=dict(final_conv_kernel=1, ),
            out_channels=mpii_channel_cfg['num_output_channels'],
            loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
        dict(
            type='TopdownHeatmapSimpleHead',
            in_channels=768,
            num_deconv_layers=2,
            num_deconv_filters=(256, 256),
            num_deconv_kernels=(4, 4),
            extra=dict(final_conv_kernel=1, ),
            out_channels=crowdpose_channel_cfg['num_output_channels'],
            loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
        dict(
            type='TopdownHeatmapSimpleHead',
            in_channels=768,
            num_deconv_layers=2,
            num_deconv_filters=(256, 256),
            num_deconv_kernels=(4, 4),
            extra=dict(final_conv_kernel=1, ),
            out_channels=ap10k_channel_cfg['num_output_channels'],
            loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
        dict(
            type='TopdownHeatmapSimpleHead',
            in_channels=768,
            num_deconv_layers=2,
            num_deconv_filters=(256, 256),
            num_deconv_kernels=(4, 4),
            extra=dict(final_conv_kernel=1, ),
            out_channels=ap10k_channel_cfg['num_output_channels'],
            loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
        dict(
            type='TopdownHeatmapSimpleHead',
            in_channels=768,
            num_deconv_layers=2,
            num_deconv_filters=(256, 256),
            num_deconv_kernels=(4, 4),
            extra=dict(final_conv_kernel=1, ),
            out_channels=cocowholebody_channel_cfg['num_output_channels'],
            loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
    ],
    train_cfg=dict(),
    test_cfg=dict(
        flip_test=True,
        post_process='default',
        shift_heatmap=False,
        target_type=target_type,
        modulate_kernel=11,
        use_udp=True))

data_cfg = dict(
    image_size=[192, 256],
    heatmap_size=[48, 64],
    num_output_channels=channel_cfg['num_output_channels'],
    num_joints=channel_cfg['dataset_joints'],
    dataset_channel=channel_cfg['dataset_channel'],
    inference_channel=channel_cfg['inference_channel'],
    soft_nms=False,
    nms_thr=1.0,
    oks_thr=0.9,
    vis_thr=0.2,
    use_gt_bbox=False,
    det_bbox_thr=0.0,
    bbox_file='data/coco/person_detection_results/'
    'COCO_val2017_detections_AP_H_56_person.json',
    max_num_joints=133,
    dataset_idx=0,
)

aic_data_cfg = dict(
    image_size=[192, 256],
    heatmap_size=[48, 64],
    num_output_channels=aic_channel_cfg['num_output_channels'],
    num_joints=aic_channel_cfg['dataset_joints'],
    dataset_channel=aic_channel_cfg['dataset_channel'],
    inference_channel=aic_channel_cfg['inference_channel'],
    soft_nms=False,
    nms_thr=1.0,
    oks_thr=0.9,
    vis_thr=0.2,
    use_gt_bbox=True,
    det_bbox_thr=0.0,
    bbox_file='data/coco/person_detection_results/'
    'COCO_val2017_detections_AP_H_56_person.json',
    max_num_joints=133,
    dataset_idx=1,
)

mpii_data_cfg = dict(
    image_size=[192, 256],
    heatmap_size=[48, 64],
    num_output_channels=mpii_channel_cfg['num_output_channels'],
    num_joints=mpii_channel_cfg['dataset_joints'],
    dataset_channel=mpii_channel_cfg['dataset_channel'],
    inference_channel=mpii_channel_cfg['inference_channel'],
    max_num_joints=133,
    dataset_idx=2,
    use_gt_bbox=True,
    bbox_file=None,
)

ap10k_data_cfg = dict(
    image_size=[192, 256],
    heatmap_size=[48, 64],
    num_output_channels=channel_cfg['num_output_channels'],
    num_joints=channel_cfg['dataset_joints'],
    dataset_channel=channel_cfg['dataset_channel'],
    inference_channel=channel_cfg['inference_channel'],
    soft_nms=False,
    nms_thr=1.0,
    oks_thr=0.9,
    vis_thr=0.2,
    use_gt_bbox=True,
    det_bbox_thr=0.0,
    bbox_file='',
    max_num_joints=133,
    dataset_idx=3,
)

ap36k_data_cfg = dict(
    image_size=[192, 256],
    heatmap_size=[48, 64],
    num_output_channels=channel_cfg['num_output_channels'],
    num_joints=channel_cfg['dataset_joints'],
    dataset_channel=channel_cfg['dataset_channel'],
    inference_channel=channel_cfg['inference_channel'],
    soft_nms=False,
    nms_thr=1.0,
    oks_thr=0.9,
    vis_thr=0.2,
    use_gt_bbox=True,
    det_bbox_thr=0.0,
    bbox_file='',
    max_num_joints=133,
    dataset_idx=4,
)

cocowholebody_data_cfg = dict(
    image_size=[192, 256],
    heatmap_size=[48, 64],
    num_output_channels=cocowholebody_channel_cfg['num_output_channels'],
    num_joints=cocowholebody_channel_cfg['dataset_joints'],
    dataset_channel=cocowholebody_channel_cfg['dataset_channel'],
    inference_channel=cocowholebody_channel_cfg['inference_channel'],
    soft_nms=False,
    nms_thr=1.0,
    oks_thr=0.9,
    vis_thr=0.2,
    use_gt_bbox=False,
    det_bbox_thr=0.0,
    bbox_file='data/coco/person_detection_results/'
    'COCO_val2017_detections_AP_H_56_person.json',
    dataset_idx=5,
    max_num_joints=133,
)
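
# The dataset_idx values 0-5 above follow the order of the datasets in
# data['train'] below (COCO, AIC, MPII, AP-10K, AP-36K, COCO-WholeBody) and are
# carried through each pipeline's meta_keys, presumably so the model can route
# every sample to the matching head/expert.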

cocowholebody_train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownRandomFlip', flip_prob=0.5),
    dict(
        type='TopDownHalfBodyTransform',
        num_joints_half_body=8,
        prob_half_body=0.3),
    dict(
        type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
    dict(type='TopDownAffine'),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(type='TopDownGenerateTarget', sigma=2),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
            'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx'
        ]),
]

ap10k_train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownRandomFlip', flip_prob=0.5),
    dict(
        type='TopDownHalfBodyTransform',
        num_joints_half_body=8,
        prob_half_body=0.3),
    dict(
        type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
    dict(type='TopDownAffine'),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(type='TopDownGenerateTarget', sigma=2),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
            'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx'
        ]),
]

aic_train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownRandomFlip', flip_prob=0.5),
    dict(
        type='TopDownHalfBodyTransform',
        num_joints_half_body=8,
        prob_half_body=0.3),
    dict(
        type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
    dict(type='TopDownAffine'),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(type='TopDownGenerateTarget', sigma=2),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
            'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx'
        ]),
]

mpii_train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownRandomFlip', flip_prob=0.5),
    dict(
        type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
    dict(type='TopDownAffine', use_udp=True),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(
        type='TopDownGenerateTarget',
        sigma=2,
        encoding='UDP',
        target_type=target_type),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
            'rotation', 'flip_pairs', 'dataset_idx'
        ]),
]

train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownRandomFlip', flip_prob=0.5),
    dict(
        type='TopDownHalfBodyTransform',
        num_joints_half_body=8,
        prob_half_body=0.3),
    dict(
        type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
    dict(type='TopDownAffine', use_udp=True),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(
        type='TopDownGenerateTarget',
        sigma=2,
        encoding='UDP',
        target_type=target_type),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
            'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx'
        ]),
]
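
# Only this COCO pipeline and the MPII pipeline above encode targets with UDP
# ('TopDownAffine' with use_udp=True plus encoding='UDP'); the COCO-WholeBody,
# AP-10K and AIC pipelines use the default affine and target encoding.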

val_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownAffine', use_udp=True),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(
        type='Collect',
        keys=['img'],
        meta_keys=[
            'image_file', 'center', 'scale', 'rotation', 'bbox_score',
            'flip_pairs', 'dataset_idx'
        ]),
]

test_pipeline = val_pipeline

data_root = 'data/coco'
aic_data_root = 'data/aic'
mpii_data_root = 'data/mpii'
ap10k_data_root = 'data/ap10k'
ap36k_data_root = 'data/ap36k'

data = dict(
    samples_per_gpu=128,
    workers_per_gpu=8,
    val_dataloader=dict(samples_per_gpu=64),
    test_dataloader=dict(samples_per_gpu=64),
    train=[
        dict(
            type='TopDownCocoDataset',
            ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
            img_prefix=f'{data_root}/train2017/',
            data_cfg=data_cfg,
            pipeline=train_pipeline,
            dataset_info={{_base_.dataset_info}}),
        dict(
            type='TopDownAicDataset',
            ann_file=f'{aic_data_root}/annotations/person_keypoints_train2017.json',
            img_prefix=f'{aic_data_root}/ai_challenger_keypoint_train_20170909/'
            'keypoint_train_images_20170902/',
            data_cfg=aic_data_cfg,
            pipeline=aic_train_pipeline,
            dataset_info={{_base_.aic_info}}),
        dict(
            type='TopDownMpiiDataset',
            ann_file=f'{mpii_data_root}/annotations/mpii_train.json',
            img_prefix=f'{mpii_data_root}/images/',
            data_cfg=mpii_data_cfg,
            pipeline=mpii_train_pipeline,
            dataset_info={{_base_.mpii_info}}),
        dict(
            type='AnimalAP10KDataset',
            ann_file=f'{ap10k_data_root}/annotations/ap10k-train-split1.json',
            img_prefix=f'{ap10k_data_root}/data/',
            data_cfg=ap10k_data_cfg,
            pipeline=ap10k_train_pipeline,
            dataset_info={{_base_.ap10k_info}}),
        dict(
            type='AnimalAP10KDataset',
            ann_file=f'{ap36k_data_root}/annotations/train_annotations_1.json',
            img_prefix=f'{ap36k_data_root}/',
            data_cfg=ap36k_data_cfg,
            pipeline=ap10k_train_pipeline,
            dataset_info={{_base_.ap10k_info}}),
        dict(
            type='TopDownCocoWholeBodyDataset',
            ann_file=f'{data_root}/annotations/coco_wholebody_train_v1.0.json',
            img_prefix=f'{data_root}/train2017/',
            data_cfg=cocowholebody_data_cfg,
            pipeline=cocowholebody_train_pipeline,
            dataset_info={{_base_.cocowholebody_info}}),
    ],
    val=dict(
        type='TopDownCocoDataset',
        ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
        img_prefix=f'{data_root}/val2017/',
        data_cfg=data_cfg,
        pipeline=val_pipeline,
        dataset_info={{_base_.dataset_info}}),
    test=dict(
        type='TopDownCocoDataset',
        ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
        img_prefix=f'{data_root}/val2017/',
        data_cfg=data_cfg,
        pipeline=test_pipeline,
        dataset_info={{_base_.dataset_info}}),
)
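
# Assuming the usual mm-series tooling is present in this repo, a config like
# this would be launched with something like
# `bash tools/dist_train.sh <path/to/this/config> <num_gpus>`.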

@@ -0,0 +1,500 @@
_base_ = [
    '../../../../_base_/default_runtime.py',
    '../../../../_base_/datasets/coco.py',
    '../../../../_base_/datasets/aic_info.py',
    '../../../../_base_/datasets/mpii_info.py',
    '../../../../_base_/datasets/ap10k_info.py',
    '../../../../_base_/datasets/coco_wholebody_info.py'
]
evaluation = dict(interval=10, metric='mAP', save_best='AP')

optimizer = dict(
    type='AdamW',
    lr=1e-3,
    betas=(0.9, 0.999),
    weight_decay=0.1,
    constructor='LayerDecayOptimizerConstructor',
    paramwise_cfg=dict(
        num_layers=32,
        layer_decay_rate=0.8,
        custom_keys={
            'bias': dict(decay_mult=0.),
            'pos_embed': dict(decay_mult=0.),
            'relative_position_bias_table': dict(decay_mult=0.),
            'norm': dict(decay_mult=0.)
        }))

optimizer_config = dict(grad_clip=dict(max_norm=1., norm_type=2))

# learning policy
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=0.001,
    step=[170, 200])
total_epochs = 210
target_type = 'GaussianHeatmap'
channel_cfg = dict(
    num_output_channels=17,
    dataset_joints=17,
    dataset_channel=[
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
    ],
    inference_channel=[
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
    ])
aic_channel_cfg = dict(
    num_output_channels=14,
    dataset_joints=14,
    dataset_channel=[
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
    ],
    inference_channel=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
mpii_channel_cfg = dict(
    num_output_channels=16,
    dataset_joints=16,
    dataset_channel=list(range(16)),
    inference_channel=list(range(16)))
crowdpose_channel_cfg = dict(
    num_output_channels=14,
    dataset_joints=14,
    dataset_channel=[
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
    ],
    inference_channel=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
ap10k_channel_cfg = dict(
    num_output_channels=17,
    dataset_joints=17,
    dataset_channel=[
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
    ],
    inference_channel=[
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
    ])
cocowholebody_channel_cfg = dict(
    num_output_channels=133,
    dataset_joints=133,
    dataset_channel=[
        list(range(133)),
    ],
    inference_channel=list(range(133)))


# model settings
model = dict(
    type='TopDownMoE',
    pretrained=None,
    backbone=dict(
        type='ViTMoE',
        img_size=(256, 192),
        patch_size=16,
        embed_dim=1280,
        depth=32,
        num_heads=16,
        ratio=1,
        use_checkpoint=False,
        mlp_ratio=4,
        qkv_bias=True,
        drop_path_rate=0.55,
        num_expert=6,
        part_features=320
    ),
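    # ViT-Huge variant of the backbone: compared with the base config this
    # changes only the scale knobs (embed_dim, depth, num_heads,
    # drop_path_rate, part_features) and the matching layer-decay settings in
    # the optimizer above.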
    keypoint_head=dict(
        type='TopdownHeatmapSimpleHead',
        in_channels=1280,
        num_deconv_layers=2,
        num_deconv_filters=(256, 256),
        num_deconv_kernels=(4, 4),
        extra=dict(final_conv_kernel=1, ),
        out_channels=channel_cfg['num_output_channels'],
        loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
    associate_keypoint_head=[
        dict(
            type='TopdownHeatmapSimpleHead',
            in_channels=1280,
            num_deconv_layers=2,
            num_deconv_filters=(256, 256),
            num_deconv_kernels=(4, 4),
            extra=dict(final_conv_kernel=1, ),
            out_channels=aic_channel_cfg['num_output_channels'],
            loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
        dict(
            type='TopdownHeatmapSimpleHead',
            in_channels=1280,
            num_deconv_layers=2,
            num_deconv_filters=(256, 256),
            num_deconv_kernels=(4, 4),
            extra=dict(final_conv_kernel=1, ),
            out_channels=mpii_channel_cfg['num_output_channels'],
            loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
        dict(
            type='TopdownHeatmapSimpleHead',
            in_channels=1280,
            num_deconv_layers=2,
            num_deconv_filters=(256, 256),
            num_deconv_kernels=(4, 4),
            extra=dict(final_conv_kernel=1, ),
            out_channels=crowdpose_channel_cfg['num_output_channels'],
            loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
        dict(
            type='TopdownHeatmapSimpleHead',
            in_channels=1280,
            num_deconv_layers=2,
            num_deconv_filters=(256, 256),
            num_deconv_kernels=(4, 4),
            extra=dict(final_conv_kernel=1, ),
            out_channels=ap10k_channel_cfg['num_output_channels'],
            loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
        dict(
            type='TopdownHeatmapSimpleHead',
            in_channels=1280,
            num_deconv_layers=2,
            num_deconv_filters=(256, 256),
            num_deconv_kernels=(4, 4),
            extra=dict(final_conv_kernel=1, ),
            out_channels=ap10k_channel_cfg['num_output_channels'],
            loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
        dict(
            type='TopdownHeatmapSimpleHead',
            in_channels=1280,
            num_deconv_layers=2,
            num_deconv_filters=(256, 256),
            num_deconv_kernels=(4, 4),
            extra=dict(final_conv_kernel=1, ),
            out_channels=cocowholebody_channel_cfg['num_output_channels'],
            loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
    ],
    train_cfg=dict(),
    test_cfg=dict(
        flip_test=True,
        post_process='default',
        shift_heatmap=False,
        target_type=target_type,
        modulate_kernel=11,
        use_udp=True))

data_cfg = dict(
    image_size=[192, 256],
    heatmap_size=[48, 64],
    num_output_channels=channel_cfg['num_output_channels'],
    num_joints=channel_cfg['dataset_joints'],
    dataset_channel=channel_cfg['dataset_channel'],
    inference_channel=channel_cfg['inference_channel'],
    soft_nms=False,
    nms_thr=1.0,
    oks_thr=0.9,
    vis_thr=0.2,
    use_gt_bbox=False,
    det_bbox_thr=0.0,
    bbox_file='data/coco/person_detection_results/'
    'COCO_val2017_detections_AP_H_56_person.json',
    max_num_joints=133,
    dataset_idx=0,
)

aic_data_cfg = dict(
    image_size=[192, 256],
    heatmap_size=[48, 64],
    num_output_channels=aic_channel_cfg['num_output_channels'],
    num_joints=aic_channel_cfg['dataset_joints'],
    dataset_channel=aic_channel_cfg['dataset_channel'],
    inference_channel=aic_channel_cfg['inference_channel'],
    soft_nms=False,
    nms_thr=1.0,
    oks_thr=0.9,
    vis_thr=0.2,
    use_gt_bbox=True,
    det_bbox_thr=0.0,
    bbox_file='data/coco/person_detection_results/'
    'COCO_val2017_detections_AP_H_56_person.json',
    max_num_joints=133,
    dataset_idx=1,
)

mpii_data_cfg = dict(
    image_size=[192, 256],
    heatmap_size=[48, 64],
    num_output_channels=mpii_channel_cfg['num_output_channels'],
    num_joints=mpii_channel_cfg['dataset_joints'],
    dataset_channel=mpii_channel_cfg['dataset_channel'],
    inference_channel=mpii_channel_cfg['inference_channel'],
    max_num_joints=133,
    dataset_idx=2,
    use_gt_bbox=True,
    bbox_file=None,
)

ap10k_data_cfg = dict(
    image_size=[192, 256],
    heatmap_size=[48, 64],
    num_output_channels=channel_cfg['num_output_channels'],
    num_joints=channel_cfg['dataset_joints'],
    dataset_channel=channel_cfg['dataset_channel'],
    inference_channel=channel_cfg['inference_channel'],
    soft_nms=False,
    nms_thr=1.0,
    oks_thr=0.9,
    vis_thr=0.2,
    use_gt_bbox=True,
    det_bbox_thr=0.0,
    bbox_file='',
    max_num_joints=133,
    dataset_idx=3,
)

ap36k_data_cfg = dict(
    image_size=[192, 256],
    heatmap_size=[48, 64],
    num_output_channels=channel_cfg['num_output_channels'],
    num_joints=channel_cfg['dataset_joints'],
    dataset_channel=channel_cfg['dataset_channel'],
    inference_channel=channel_cfg['inference_channel'],
    soft_nms=False,
    nms_thr=1.0,
    oks_thr=0.9,
    vis_thr=0.2,
    use_gt_bbox=True,
    det_bbox_thr=0.0,
    bbox_file='',
    max_num_joints=133,
    dataset_idx=4,
)

cocowholebody_data_cfg = dict(
    image_size=[192, 256],
    heatmap_size=[48, 64],
    num_output_channels=cocowholebody_channel_cfg['num_output_channels'],
    num_joints=cocowholebody_channel_cfg['dataset_joints'],
    dataset_channel=cocowholebody_channel_cfg['dataset_channel'],
    inference_channel=cocowholebody_channel_cfg['inference_channel'],
    soft_nms=False,
    nms_thr=1.0,
    oks_thr=0.9,
    vis_thr=0.2,
    use_gt_bbox=False,
    det_bbox_thr=0.0,
    bbox_file='data/coco/person_detection_results/'
    'COCO_val2017_detections_AP_H_56_person.json',
    dataset_idx=5,
    max_num_joints=133,
)

cocowholebody_train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownRandomFlip', flip_prob=0.5),
    dict(
        type='TopDownHalfBodyTransform',
        num_joints_half_body=8,
        prob_half_body=0.3),
    dict(
        type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
    dict(type='TopDownAffine'),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(type='TopDownGenerateTarget', sigma=2),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
            'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx'
        ]),
]

ap10k_train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownRandomFlip', flip_prob=0.5),
    dict(
        type='TopDownHalfBodyTransform',
        num_joints_half_body=8,
        prob_half_body=0.3),
    dict(
        type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
    dict(type='TopDownAffine'),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(type='TopDownGenerateTarget', sigma=2),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
            'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx'
        ]),
]

aic_train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownRandomFlip', flip_prob=0.5),
    dict(
        type='TopDownHalfBodyTransform',
        num_joints_half_body=8,
        prob_half_body=0.3),
    dict(
        type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
    dict(type='TopDownAffine'),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(type='TopDownGenerateTarget', sigma=2),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
            'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx'
        ]),
]

mpii_train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownRandomFlip', flip_prob=0.5),
    dict(
        type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
    dict(type='TopDownAffine', use_udp=True),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(
        type='TopDownGenerateTarget',
        sigma=2,
        encoding='UDP',
        target_type=target_type),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
            'rotation', 'flip_pairs', 'dataset_idx'
        ]),
]

train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownRandomFlip', flip_prob=0.5),
    dict(
        type='TopDownHalfBodyTransform',
        num_joints_half_body=8,
        prob_half_body=0.3),
    dict(
        type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
    dict(type='TopDownAffine', use_udp=True),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(
        type='TopDownGenerateTarget',
        sigma=2,
        encoding='UDP',
        target_type=target_type),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
            'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx'
        ]),
]

val_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownAffine', use_udp=True),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(
        type='Collect',
        keys=['img'],
        meta_keys=[
            'image_file', 'center', 'scale', 'rotation', 'bbox_score',
            'flip_pairs', 'dataset_idx'
        ]),
]

test_pipeline = val_pipeline

data_root = 'data/coco'
aic_data_root = 'data/aic'
mpii_data_root = 'data/mpii'
ap10k_data_root = 'data/ap10k'
ap36k_data_root = 'data/ap36k'

data = dict(
    samples_per_gpu=128,
    workers_per_gpu=8,
    val_dataloader=dict(samples_per_gpu=64),
    test_dataloader=dict(samples_per_gpu=64),
    train=[
        dict(
            type='TopDownCocoDataset',
            ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
            img_prefix=f'{data_root}/train2017/',
            data_cfg=data_cfg,
            pipeline=train_pipeline,
            dataset_info={{_base_.dataset_info}}),
        dict(
            type='TopDownAicDataset',
            ann_file=f'{aic_data_root}/annotations/person_keypoints_train2017.json',
            img_prefix=f'{aic_data_root}/ai_challenger_keypoint_train_20170909/'
            'keypoint_train_images_20170902/',
            data_cfg=aic_data_cfg,
            pipeline=aic_train_pipeline,
            dataset_info={{_base_.aic_info}}),
        dict(
            type='TopDownMpiiDataset',
            ann_file=f'{mpii_data_root}/annotations/mpii_train.json',
            img_prefix=f'{mpii_data_root}/images/',
            data_cfg=mpii_data_cfg,
            pipeline=mpii_train_pipeline,
            dataset_info={{_base_.mpii_info}}),
        dict(
            type='AnimalAP10KDataset',
            ann_file=f'{ap10k_data_root}/annotations/ap10k-train-split1.json',
            img_prefix=f'{ap10k_data_root}/data/',
            data_cfg=ap10k_data_cfg,
            pipeline=ap10k_train_pipeline,
            dataset_info={{_base_.ap10k_info}}),
        dict(
            type='AnimalAP10KDataset',
            ann_file=f'{ap36k_data_root}/annotations/train_annotations_1.json',
            img_prefix=f'{ap36k_data_root}/',
            data_cfg=ap36k_data_cfg,
            pipeline=ap10k_train_pipeline,
            dataset_info={{_base_.ap10k_info}}),
        dict(
            type='TopDownCocoWholeBodyDataset',
            ann_file=f'{data_root}/annotations/coco_wholebody_train_v1.0.json',
            img_prefix=f'{data_root}/train2017/',
            data_cfg=cocowholebody_data_cfg,
            pipeline=cocowholebody_train_pipeline,
            dataset_info={{_base_.cocowholebody_info}}),
    ],
    val=dict(
        type='TopDownCocoDataset',
        ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
        img_prefix=f'{data_root}/val2017/',
        data_cfg=data_cfg,
        pipeline=val_pipeline,
        dataset_info={{_base_.dataset_info}}),
    test=dict(
        type='TopDownCocoDataset',
        ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
        img_prefix=f'{data_root}/val2017/',
        data_cfg=data_cfg,
        pipeline=test_pipeline,
        dataset_info={{_base_.dataset_info}}),
)

@@ -0,0 +1,500 @@
_base_ = [
    '../../../../_base_/default_runtime.py',
    '../../../../_base_/datasets/coco.py',
    '../../../../_base_/datasets/aic_info.py',
    '../../../../_base_/datasets/mpii_info.py',
    '../../../../_base_/datasets/ap10k_info.py',
    '../../../../_base_/datasets/coco_wholebody_info.py'
]
evaluation = dict(interval=10, metric='mAP', save_best='AP')

optimizer = dict(
    type='AdamW',
    lr=1e-3,
    betas=(0.9, 0.999),
    weight_decay=0.1,
    constructor='LayerDecayOptimizerConstructor',
    paramwise_cfg=dict(
        num_layers=24,
        layer_decay_rate=0.8,
        custom_keys={
            'bias': dict(decay_mult=0.),
            'pos_embed': dict(decay_mult=0.),
            'relative_position_bias_table': dict(decay_mult=0.),
            'norm': dict(decay_mult=0.)
        }))

optimizer_config = dict(grad_clip=dict(max_norm=1., norm_type=2))

# learning policy
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=0.001,
    step=[170, 200])
total_epochs = 210
target_type = 'GaussianHeatmap'
channel_cfg = dict(
    num_output_channels=17,
    dataset_joints=17,
    dataset_channel=[
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
    ],
    inference_channel=[
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
    ])
aic_channel_cfg = dict(
    num_output_channels=14,
    dataset_joints=14,
    dataset_channel=[
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
    ],
    inference_channel=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
mpii_channel_cfg = dict(
    num_output_channels=16,
    dataset_joints=16,
    dataset_channel=list(range(16)),
    inference_channel=list(range(16)))
crowdpose_channel_cfg = dict(
    num_output_channels=14,
    dataset_joints=14,
    dataset_channel=[
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
    ],
    inference_channel=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
ap10k_channel_cfg = dict(
    num_output_channels=17,
    dataset_joints=17,
    dataset_channel=[
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
    ],
    inference_channel=[
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
    ])
cocowholebody_channel_cfg = dict(
    num_output_channels=133,
    dataset_joints=133,
    dataset_channel=[
        list(range(133)),
    ],
    inference_channel=list(range(133)))


# model settings
model = dict(
    type='TopDownMoE',
    pretrained=None,
    backbone=dict(
        type='ViTMoE',
        img_size=(256, 192),
        patch_size=16,
        embed_dim=1024,
        depth=24,
        num_heads=16,
        ratio=1,
        use_checkpoint=False,
        mlp_ratio=4,
        qkv_bias=True,
        drop_path_rate=0.5,
        num_expert=6,
        part_features=256
    ),
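    # ViT-Large variant of the backbone (embed_dim=1024, depth=24); otherwise
    # the same multi-dataset MoE setup as the base and huge configs above.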
||||
|
keypoint_head=dict( |
||||
|
type='TopdownHeatmapSimpleHead', |
||||
|
in_channels=1024, |
||||
|
num_deconv_layers=2, |
||||
|
num_deconv_filters=(256, 256), |
||||
|
num_deconv_kernels=(4, 4), |
||||
|
extra=dict(final_conv_kernel=1, ), |
||||
|
out_channels=channel_cfg['num_output_channels'], |
||||
|
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), |
||||
|
associate_keypoint_head=[ |
||||
|
dict( |
||||
|
type='TopdownHeatmapSimpleHead', |
||||
|
in_channels=1024, |
||||
|
num_deconv_layers=2, |
||||
|
num_deconv_filters=(256, 256), |
||||
|
num_deconv_kernels=(4, 4), |
||||
|
extra=dict(final_conv_kernel=1, ), |
||||
|
out_channels=aic_channel_cfg['num_output_channels'], |
||||
|
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), |
||||
|
dict( |
||||
|
type='TopdownHeatmapSimpleHead', |
||||
|
in_channels=1024, |
||||
|
num_deconv_layers=2, |
||||
|
num_deconv_filters=(256, 256), |
||||
|
num_deconv_kernels=(4, 4), |
||||
|
extra=dict(final_conv_kernel=1, ), |
||||
|
out_channels=mpii_channel_cfg['num_output_channels'], |
||||
|
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), |
||||
|
dict( |
||||
|
type='TopdownHeatmapSimpleHead', |
||||
|
in_channels=1024, |
||||
|
num_deconv_layers=2, |
||||
|
num_deconv_filters=(256, 256), |
||||
|
num_deconv_kernels=(4, 4), |
||||
|
extra=dict(final_conv_kernel=1, ), |
||||
|
out_channels=crowdpose_channel_cfg['num_output_channels'], |
||||
|
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), |
||||
|
dict( |
||||
|
type='TopdownHeatmapSimpleHead', |
||||
|
in_channels=1024, |
||||
|
num_deconv_layers=2, |
||||
|
num_deconv_filters=(256, 256), |
||||
|
num_deconv_kernels=(4, 4), |
||||
|
extra=dict(final_conv_kernel=1, ), |
||||
|
out_channels=ap10k_channel_cfg['num_output_channels'], |
||||
|
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), |
||||
|
dict( |
||||
|
type='TopdownHeatmapSimpleHead', |
||||
|
in_channels=1024, |
||||
|
num_deconv_layers=2, |
||||
|
num_deconv_filters=(256, 256), |
||||
|
num_deconv_kernels=(4, 4), |
||||
|
extra=dict(final_conv_kernel=1, ), |
||||
|
out_channels=ap10k_channel_cfg['num_output_channels'], |
||||
|
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), |
||||
|
dict( |
||||
|
type='TopdownHeatmapSimpleHead', |
||||
|
in_channels=1024, |
||||
|
num_deconv_layers=2, |
||||
|
num_deconv_filters=(256, 256), |
||||
|
num_deconv_kernels=(4, 4), |
||||
|
extra=dict(final_conv_kernel=1, ), |
||||
|
out_channels=cocowholebody_channel_cfg['num_output_channels'], |
||||
|
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), |
||||
|
], |
||||
|
train_cfg=dict(), |
||||
|
test_cfg=dict( |
||||
|
flip_test=True, |
||||
|
post_process='default', |
||||
|
shift_heatmap=False, |
||||
|
target_type=target_type, |
||||
|
modulate_kernel=11, |
||||
|
use_udp=True)) |
||||
|
|
||||
|
data_cfg = dict(
    image_size=[192, 256],
    heatmap_size=[48, 64],
    num_output_channels=channel_cfg['num_output_channels'],
    num_joints=channel_cfg['dataset_joints'],
    dataset_channel=channel_cfg['dataset_channel'],
    inference_channel=channel_cfg['inference_channel'],
    soft_nms=False,
    nms_thr=1.0,
    oks_thr=0.9,
    vis_thr=0.2,
    use_gt_bbox=False,
    det_bbox_thr=0.0,
    bbox_file='data/coco/person_detection_results/'
    'COCO_val2017_detections_AP_H_56_person.json',
    max_num_joints=133,
    dataset_idx=0,
)

aic_data_cfg = dict(
    image_size=[192, 256],
    heatmap_size=[48, 64],
    num_output_channels=aic_channel_cfg['num_output_channels'],
    num_joints=aic_channel_cfg['dataset_joints'],
    dataset_channel=aic_channel_cfg['dataset_channel'],
    inference_channel=aic_channel_cfg['inference_channel'],
    soft_nms=False,
    nms_thr=1.0,
    oks_thr=0.9,
    vis_thr=0.2,
    use_gt_bbox=True,
    det_bbox_thr=0.0,
    bbox_file='data/coco/person_detection_results/'
    'COCO_val2017_detections_AP_H_56_person.json',
    max_num_joints=133,
    dataset_idx=1,
)

mpii_data_cfg = dict(
    image_size=[192, 256],
    heatmap_size=[48, 64],
    num_output_channels=mpii_channel_cfg['num_output_channels'],
    num_joints=mpii_channel_cfg['dataset_joints'],
    dataset_channel=mpii_channel_cfg['dataset_channel'],
    inference_channel=mpii_channel_cfg['inference_channel'],
    max_num_joints=133,
    dataset_idx=2,
    use_gt_bbox=True,
    bbox_file=None,
)

ap10k_data_cfg = dict(
    image_size=[192, 256],
    heatmap_size=[48, 64],
    num_output_channels=channel_cfg['num_output_channels'],
    num_joints=channel_cfg['dataset_joints'],
    dataset_channel=channel_cfg['dataset_channel'],
    inference_channel=channel_cfg['inference_channel'],
    soft_nms=False,
    nms_thr=1.0,
    oks_thr=0.9,
    vis_thr=0.2,
    use_gt_bbox=True,
    det_bbox_thr=0.0,
    bbox_file='',
    max_num_joints=133,
    dataset_idx=3,
)

ap36k_data_cfg = dict(
    image_size=[192, 256],
    heatmap_size=[48, 64],
    num_output_channels=channel_cfg['num_output_channels'],
    num_joints=channel_cfg['dataset_joints'],
    dataset_channel=channel_cfg['dataset_channel'],
    inference_channel=channel_cfg['inference_channel'],
    soft_nms=False,
    nms_thr=1.0,
    oks_thr=0.9,
    vis_thr=0.2,
    use_gt_bbox=True,
    det_bbox_thr=0.0,
    bbox_file='',
    max_num_joints=133,
    dataset_idx=4,
)

cocowholebody_data_cfg = dict(
    image_size=[192, 256],
    heatmap_size=[48, 64],
    num_output_channels=cocowholebody_channel_cfg['num_output_channels'],
    num_joints=cocowholebody_channel_cfg['dataset_joints'],
    dataset_channel=cocowholebody_channel_cfg['dataset_channel'],
    inference_channel=cocowholebody_channel_cfg['inference_channel'],
    soft_nms=False,
    nms_thr=1.0,
    oks_thr=0.9,
    vis_thr=0.2,
    use_gt_bbox=False,
    det_bbox_thr=0.0,
    bbox_file='data/coco/person_detection_results/'
    'COCO_val2017_detections_AP_H_56_person.json',
    dataset_idx=5,
    max_num_joints=133,
)

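# Each data_cfg above carries a unique dataset_idx (0-5); it is propagated
# through the pipelines' meta_keys and used at run time to select both the
# matching FFN expert in the backbone and the matching keypoint head.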
cocowholebody_train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownRandomFlip', flip_prob=0.5),
    dict(
        type='TopDownHalfBodyTransform',
        num_joints_half_body=8,
        prob_half_body=0.3),
    dict(
        type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
    dict(type='TopDownAffine'),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(type='TopDownGenerateTarget', sigma=2),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
            'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx'
        ]),
]

ap10k_train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownRandomFlip', flip_prob=0.5),
    dict(
        type='TopDownHalfBodyTransform',
        num_joints_half_body=8,
        prob_half_body=0.3),
    dict(
        type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
    dict(type='TopDownAffine'),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(type='TopDownGenerateTarget', sigma=2),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
            'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx'
        ]),
]

aic_train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownRandomFlip', flip_prob=0.5),
    dict(
        type='TopDownHalfBodyTransform',
        num_joints_half_body=8,
        prob_half_body=0.3),
    dict(
        type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
    dict(type='TopDownAffine'),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(type='TopDownGenerateTarget', sigma=2),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
            'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx'
        ]),
]

mpii_train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownRandomFlip', flip_prob=0.5),
    dict(
        type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
    dict(type='TopDownAffine', use_udp=True),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(
        type='TopDownGenerateTarget',
        sigma=2,
        encoding='UDP',
        target_type=target_type),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
            'rotation', 'flip_pairs', 'dataset_idx'
        ]),
]

train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownRandomFlip', flip_prob=0.5),
    dict(
        type='TopDownHalfBodyTransform',
        num_joints_half_body=8,
        prob_half_body=0.3),
    dict(
        type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
    dict(type='TopDownAffine', use_udp=True),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(
        type='TopDownGenerateTarget',
        sigma=2,
        encoding='UDP',
        target_type=target_type),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
            'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx'
        ]),
]

val_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownAffine', use_udp=True),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(
        type='Collect',
        keys=['img'],
        meta_keys=[
            'image_file', 'center', 'scale', 'rotation', 'bbox_score',
            'flip_pairs', 'dataset_idx'
        ]),
]

test_pipeline = val_pipeline

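# Note: the COCO train_pipeline, the MPII pipeline and the shared val/test
# pipeline use UDP (TopDownAffine with use_udp=True, encoding='UDP'), while
# the AIC, AP-10K/AP-36K and COCO-WholeBody training pipelines keep the
# default heatmap encoding.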
data_root = 'data/coco' |
||||
|
aic_data_root = 'data/aic' |
||||
|
mpii_data_root = 'data/mpii' |
||||
|
ap10k_data_root = 'data/ap10k' |
||||
|
ap36k_data_root = 'data/ap36k' |
||||
|
|
||||
|
data = dict( |
||||
|
samples_per_gpu=128, |
||||
|
workers_per_gpu=8, |
||||
|
val_dataloader=dict(samples_per_gpu=64), |
||||
|
test_dataloader=dict(samples_per_gpu=64), |
||||
|
train=[ |
||||
|
dict( |
||||
|
type='TopDownCocoDataset', |
||||
|
ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', |
||||
|
img_prefix=f'{data_root}/train2017/', |
||||
|
data_cfg=data_cfg, |
||||
|
pipeline=train_pipeline, |
||||
|
dataset_info={{_base_.dataset_info}}), |
||||
|
dict( |
||||
|
type='TopDownAicDataset', |
||||
|
ann_file=f'{aic_data_root}/annotations/person_keypoints_train2017.json', |
||||
|
img_prefix=f'{aic_data_root}/ai_challenger_keypoint_train_20170909/' |
||||
|
'keypoint_train_images_20170902/', |
||||
|
data_cfg=aic_data_cfg, |
||||
|
pipeline=aic_train_pipeline, |
||||
|
dataset_info={{_base_.aic_info}}), |
||||
|
dict( |
||||
|
type='TopDownMpiiDataset', |
||||
|
ann_file=f'{mpii_data_root}/annotations/mpii_train.json', |
||||
|
img_prefix=f'{mpii_data_root}/images/', |
||||
|
data_cfg=mpii_data_cfg, |
||||
|
pipeline=mpii_train_pipeline, |
||||
|
dataset_info={{_base_.mpii_info}}), |
||||
|
dict( |
||||
|
type='AnimalAP10KDataset', |
||||
|
ann_file=f'{ap10k_data_root}/annotations/ap10k-train-split1.json', |
||||
|
img_prefix=f'{ap10k_data_root}/data/', |
||||
|
data_cfg=ap10k_data_cfg, |
||||
|
pipeline=ap10k_train_pipeline, |
||||
|
dataset_info={{_base_.ap10k_info}}), |
||||
|
dict( |
||||
|
type='AnimalAP10KDataset', |
||||
|
ann_file=f'{ap36k_data_root}/annotations/train_annotations_1.json', |
||||
|
img_prefix=f'{ap36k_data_root}/', |
||||
|
data_cfg=ap36k_data_cfg, |
||||
|
pipeline=ap10k_train_pipeline, |
||||
|
dataset_info={{_base_.ap10k_info}}), |
||||
|
dict( |
||||
|
type='TopDownCocoWholeBodyDataset', |
||||
|
ann_file=f'{data_root}/annotations/coco_wholebody_train_v1.0.json', |
||||
|
img_prefix=f'{data_root}/train2017/', |
||||
|
data_cfg=cocowholebody_data_cfg, |
||||
|
pipeline=cocowholebody_train_pipeline, |
||||
|
dataset_info={{_base_.cocowholebody_info}}), |
||||
|
], |
||||
|
val=dict( |
||||
|
type='TopDownCocoDataset', |
||||
|
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', |
||||
|
img_prefix=f'{data_root}/val2017/', |
||||
|
data_cfg=data_cfg, |
||||
|
pipeline=val_pipeline, |
||||
|
dataset_info={{_base_.dataset_info}}), |
||||
|
test=dict( |
||||
|
type='TopDownCocoDataset', |
||||
|
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', |
||||
|
img_prefix=f'{data_root}/val2017/', |
||||
|
data_cfg=data_cfg, |
||||
|
pipeline=test_pipeline, |
||||
|
dataset_info={{_base_.dataset_info}}), |
||||
|
) |
||||
|
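# Launch sketch (standard MMPose entry points; the config filename is a
# placeholder, not the file's actual name):
#   python tools/train.py configs/<this_config>.py
#   bash tools/dist_train.sh configs/<this_config>.py 8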

@ -0,0 +1,384 @@
# Copyright (c) OpenMMLab. All rights reserved.
import math

import torch
from functools import partial
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint

from timm.models.layers import drop_path, to_2tuple, trunc_normal_

from ..builder import BACKBONES
from .base_backbone import BaseBackbone


def get_abs_pos(abs_pos, h, w, ori_h, ori_w, has_cls_token=True):
    """
    Calculate absolute positional embeddings. If needed, resize embeddings and
    remove the cls_token dimension from the original embeddings.

    Args:
        abs_pos (Tensor): absolute positional embeddings with (1, num_position, C).
        h, w (int): target size of the token grid.
        ori_h, ori_w (int): original size of the token grid.
        has_cls_token (bool): If true, has 1 embedding in abs_pos for cls token.

    Returns:
        Absolute positional embeddings after processing with shape
        (1, h * w (+ 1 if has_cls_token), C).
    """
    cls_token = None
    B, L, C = abs_pos.shape
    if has_cls_token:
        cls_token = abs_pos[:, 0:1]
        abs_pos = abs_pos[:, 1:]

    if ori_h != h or ori_w != w:
        new_abs_pos = F.interpolate(
            abs_pos.reshape(1, ori_h, ori_w, -1).permute(0, 3, 1, 2),
            size=(h, w),
            mode="bicubic",
            align_corners=False,
        ).permute(0, 2, 3, 1).reshape(B, -1, C)

    else:
        new_abs_pos = abs_pos

    if cls_token is not None:
        new_abs_pos = torch.cat([cls_token, new_abs_pos], dim=1)
    return new_abs_pos


class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of
    residual blocks)."""

    def __init__(self, drop_prob=None):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        return drop_path(x, self.drop_prob, self.training)

    def extra_repr(self):
        return 'p={}'.format(self.drop_prob)


class Mlp(nn.Module):

    def __init__(self, in_features, hidden_features=None, out_features=None,
                 act_layer=nn.GELU, drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x


class MoEMlp(nn.Module):

    def __init__(self, num_expert=1, in_features=1024, hidden_features=None,
                 out_features=None, act_layer=nn.GELU, drop=0.,
                 part_features=256):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.part_features = part_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features - part_features)
        self.drop = nn.Dropout(drop)

        self.num_expert = num_expert
        experts = []

        for i in range(num_expert):
            experts.append(
                nn.Linear(hidden_features, part_features)
            )
        self.experts = nn.ModuleList(experts)

    def forward(self, x, indices):

        expert_x = torch.zeros_like(x[:, :, -self.part_features:],
                                    device=x.device, dtype=x.dtype)

        x = self.fc1(x)
        x = self.act(x)
        shared_x = self.fc2(x)
        indices = indices.view(-1, 1, 1)

        # to support ddp training
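        # Every expert processes all tokens and its output is masked by the
        # per-sample dataset index before summation; unselected experts thus
        # contribute zeros but still join the graph, so DDP finds no unused
        # parameters.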
        for i in range(self.num_expert):
            selectedIndex = (indices == i)
            current_x = self.experts[i](x) * selectedIndex
            expert_x = expert_x + current_x

        x = torch.cat([shared_x, expert_x], dim=-1)

        return x

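# MoEMlp output layout: the first (out_features - part_features) channels come
# from the shared fc2 branch, the last part_features channels from the
# dataset-specific expert selected by `indices`.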
class Attention(nn.Module):

    def __init__(
            self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0.,
            proj_drop=0., attn_head_dim=None,):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.dim = dim

        if attn_head_dim is not None:
            head_dim = attn_head_dim
        all_head_dim = head_dim * self.num_heads

        self.scale = qk_scale or head_dim ** -0.5

        self.qkv = nn.Linear(dim, all_head_dim * 3, bias=qkv_bias)

        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(all_head_dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):
        B, N, C = x.shape
        qkv = self.qkv(x)
        qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)

        q = q * self.scale
        attn = (q @ k.transpose(-2, -1))

        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
        x = self.proj(x)
        x = self.proj_drop(x)

        return x


class Block(nn.Module):

    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None,
                 drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU,
                 norm_layer=nn.LayerNorm, attn_head_dim=None, num_expert=1,
                 part_features=None
                 ):
        super().__init__()

        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
            attn_drop=attn_drop, proj_drop=drop, attn_head_dim=attn_head_dim
        )

        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        # Fixed: the original line passed an undefined `mode=mode`; MoEMlp
        # takes part_features, which Block already receives.
        self.mlp = MoEMlp(num_expert=num_expert, in_features=dim,
                          hidden_features=mlp_hidden_dim, act_layer=act_layer,
                          drop=drop, part_features=part_features)

    def forward(self, x, indices=None):

        x = x + self.drop_path(self.attn(self.norm1(x)))
        x = x + self.drop_path(self.mlp(self.norm2(x), indices))
        return x


class PatchEmbed(nn.Module):
    """Image to Patch Embedding."""

    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, ratio=1):
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)
        num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) * (ratio ** 2)
        self.patch_shape = (int(img_size[0] // patch_size[0] * ratio), int(img_size[1] // patch_size[1] * ratio))
        self.origin_patch_shape = (int(img_size[0] // patch_size[0]), int(img_size[1] // patch_size[1]))
        self.img_size = img_size
        self.patch_size = patch_size
        self.num_patches = num_patches

        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size,
                              stride=(patch_size[0] // ratio), padding=4 + 2 * (ratio // 2 - 1))

    def forward(self, x, **kwargs):
        B, C, H, W = x.shape
        x = self.proj(x)
        Hp, Wp = x.shape[2], x.shape[3]

        x = x.flatten(2).transpose(1, 2)
        return x, (Hp, Wp)
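
# Note: with ratio > 1 the projection stride (patch_size // ratio) is smaller
# than the kernel, so patches overlap and the token grid is ratio times denser
# than a standard non-overlapping patch embedding.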


class HybridEmbed(nn.Module):
    """CNN Feature Map Embedding.

    Extract feature map from CNN, flatten, project to embedding dim.
    """

    def __init__(self, backbone, img_size=224, feature_size=None, in_chans=3, embed_dim=768):
        super().__init__()
        assert isinstance(backbone, nn.Module)
        img_size = to_2tuple(img_size)
        self.img_size = img_size
        self.backbone = backbone
        if feature_size is None:
            with torch.no_grad():
                training = backbone.training
                if training:
                    backbone.eval()
                o = self.backbone(torch.zeros(1, in_chans, img_size[0], img_size[1]))[-1]
                feature_size = o.shape[-2:]
                feature_dim = o.shape[1]
                backbone.train(training)
        else:
            feature_size = to_2tuple(feature_size)
            feature_dim = self.backbone.feature_info.channels()[-1]
        self.num_patches = feature_size[0] * feature_size[1]
        self.proj = nn.Linear(feature_dim, embed_dim)

    def forward(self, x):
        x = self.backbone(x)[-1]
        x = x.flatten(2).transpose(1, 2)
        x = self.proj(x)
        return x


@BACKBONES.register_module()
class ViTMoE(BaseBackbone):

    def __init__(self,
                 img_size=224, patch_size=16, in_chans=3, num_classes=80, embed_dim=768, depth=12,
                 num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
                 drop_path_rate=0., hybrid_backbone=None, norm_layer=None, use_checkpoint=False,
                 frozen_stages=-1, ratio=1, last_norm=True,
                 patch_padding='pad', freeze_attn=False, freeze_ffn=False,
                 num_expert=1, part_features=None
                 ):
        # Protect mutable default arguments
        super(ViTMoE, self).__init__()
        norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
        self.num_classes = num_classes
        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
        self.frozen_stages = frozen_stages
        self.use_checkpoint = use_checkpoint
        self.patch_padding = patch_padding
        self.freeze_attn = freeze_attn
        self.freeze_ffn = freeze_ffn
        self.depth = depth

        if hybrid_backbone is not None:
            self.patch_embed = HybridEmbed(
                hybrid_backbone, img_size=img_size, in_chans=in_chans, embed_dim=embed_dim)
        else:
            self.patch_embed = PatchEmbed(
                img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, ratio=ratio)
        num_patches = self.patch_embed.num_patches

        self.part_features = part_features

        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))

        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule

        self.blocks = nn.ModuleList([
            Block(
                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer,
                num_expert=num_expert, part_features=part_features
            )
            for i in range(depth)])

        self.last_norm = norm_layer(embed_dim) if last_norm else nn.Identity()

        if self.pos_embed is not None:
            trunc_normal_(self.pos_embed, std=.02)

        self._freeze_stages()

    def _freeze_stages(self):
        """Freeze parameters."""
        if self.frozen_stages >= 0:
            self.patch_embed.eval()
            for param in self.patch_embed.parameters():
                param.requires_grad = False

            for i in range(1, self.frozen_stages + 1):
                m = self.blocks[i]
                m.eval()
                for param in m.parameters():
                    param.requires_grad = False

        if self.freeze_attn:
            for i in range(0, self.depth):
                m = self.blocks[i]
                m.attn.eval()
                m.norm1.eval()
                for param in m.attn.parameters():
                    param.requires_grad = False
                for param in m.norm1.parameters():
                    param.requires_grad = False

        if self.freeze_ffn:
            self.pos_embed.requires_grad = False
            self.patch_embed.eval()
            for param in self.patch_embed.parameters():
                param.requires_grad = False
            for i in range(0, self.depth):
                m = self.blocks[i]
                m.mlp.eval()
                m.norm2.eval()
                for param in m.mlp.parameters():
                    param.requires_grad = False
                for param in m.norm2.parameters():
                    param.requires_grad = False

    def init_weights(self, pretrained=None):
        """Initialize the weights in backbone.

        Args:
            pretrained (str, optional): Path to pre-trained weights.
                Defaults to None.
        """
        super().init_weights(pretrained, patch_padding=self.patch_padding, part_features=self.part_features)

        if pretrained is None:
            def _init_weights(m):
                if isinstance(m, nn.Linear):
                    trunc_normal_(m.weight, std=.02)
                    if isinstance(m, nn.Linear) and m.bias is not None:
                        nn.init.constant_(m.bias, 0)
                elif isinstance(m, nn.LayerNorm):
                    nn.init.constant_(m.bias, 0)
                    nn.init.constant_(m.weight, 1.0)

            self.apply(_init_weights)

    def get_num_layers(self):
        return len(self.blocks)

    @torch.jit.ignore
    def no_weight_decay(self):
        return {'pos_embed', 'cls_token'}

    def forward_features(self, x, dataset_source=None):
        B, C, H, W = x.shape
        x, (Hp, Wp) = self.patch_embed(x)

        if self.pos_embed is not None:
            # fit for multiple GPU training
            # since the first element for pos embed (sin-cos manner) is zero, it will cause no difference
            x = x + self.pos_embed[:, 1:] + self.pos_embed[:, :1]

        for blk in self.blocks:
            if self.use_checkpoint:
                x = checkpoint.checkpoint(blk, x, dataset_source)
            else:
                x = blk(x, dataset_source)

        x = self.last_norm(x)

        xp = x.permute(0, 2, 1).reshape(B, -1, Hp, Wp).contiguous()

        return xp

    def forward(self, x, dataset_source=None):
        x = self.forward_features(x, dataset_source)
        return x

    def train(self, mode=True):
        """Convert the model into training mode."""
        super().train(mode)
        self._freeze_stages()
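
# A minimal construction sketch (hypothetical values for illustration; the
# actual sizes come from the model config, e.g. the in_channels=1024 heads):
#   backbone = ViTMoE(img_size=(256, 192), patch_size=16, embed_dim=1024,
#                     depth=24, num_heads=16, num_expert=6, part_features=256)
#   feats = backbone(imgs, dataset_source)  # (B, embed_dim, H/16, W/16)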

@ -0,0 +1,351 @@
# Copyright (c) OpenMMLab. All rights reserved.
import warnings

import torch
import torch.nn as nn

import mmcv
import numpy as np
from mmcv.image import imwrite
from mmcv.utils.misc import deprecated_api_warning
from mmcv.visualization.image import imshow

from mmpose.core import imshow_bboxes, imshow_keypoints
from .. import builder
from ..builder import POSENETS
from .base import BasePose

try:
    from mmcv.runner import auto_fp16
except ImportError:
    warnings.warn('auto_fp16 from mmpose will be deprecated from v0.15.0. '
                  'Please install mmcv>=1.1.4')
    from mmpose.core import auto_fp16


@POSENETS.register_module()
class TopDownMoE(BasePose):
    """Top-down pose detectors.

    Args:
        backbone (dict): Backbone modules to extract feature.
        keypoint_head (dict): Keypoint head to process feature.
        train_cfg (dict): Config for training. Default: None.
        test_cfg (dict): Config for testing. Default: None.
        pretrained (str): Path to the pretrained models.
        loss_pose (None): Deprecated arguments. Please use
            `loss_keypoint` for heads instead.
    """

    def __init__(self,
                 backbone,
                 neck=None,
                 keypoint_head=None,
                 associate_keypoint_head=None,
                 train_cfg=None,
                 test_cfg=None,
                 pretrained=None,
                 loss_pose=None):
        super().__init__()
        self.fp16_enabled = False

        self.backbone = builder.build_backbone(backbone)

        self.train_cfg = train_cfg
        self.test_cfg = test_cfg

        if neck is not None:
            self.neck = builder.build_neck(neck)

        if keypoint_head is not None:
            keypoint_head['train_cfg'] = train_cfg
            keypoint_head['test_cfg'] = test_cfg

            if 'loss_keypoint' not in keypoint_head and loss_pose is not None:
                warnings.warn(
                    '`loss_pose` for TopDown is deprecated, '
                    'use `loss_keypoint` for heads instead. See '
                    'https://github.com/open-mmlab/mmpose/pull/382'
                    ' for more information.', DeprecationWarning)
                keypoint_head['loss_keypoint'] = loss_pose

            self.keypoint_head = builder.build_head(keypoint_head)

        associate_keypoint_heads = []
        keypoint_heads_cnt = 1

        if associate_keypoint_head is not None:
            if not isinstance(associate_keypoint_head, list):
                associate_keypoint_head = [associate_keypoint_head]
            for single_keypoint_head in associate_keypoint_head:
                single_keypoint_head['train_cfg'] = train_cfg
                single_keypoint_head['test_cfg'] = test_cfg
                associate_keypoint_heads.append(
                    builder.build_head(single_keypoint_head))
                keypoint_heads_cnt += 1

        self.associate_keypoint_heads = nn.ModuleList(associate_keypoint_heads)

        self.keypoint_heads_cnt = keypoint_heads_cnt

        self.init_weights(pretrained=pretrained)

    @property
    def with_neck(self):
        """Check if has neck."""
        return hasattr(self, 'neck')

    @property
    def with_keypoint(self):
        """Check if has keypoint_head."""
        return hasattr(self, 'keypoint_head')

    def init_weights(self, pretrained=None):
        """Weight initialization for model."""
        self.backbone.init_weights(pretrained)
        if self.with_neck:
            self.neck.init_weights()
        if self.with_keypoint:
            self.keypoint_head.init_weights()
        for item in self.associate_keypoint_heads:
            item.init_weights()

    @auto_fp16(apply_to=('img', ))
    def forward(self,
                img,
                target=None,
                target_weight=None,
                img_metas=None,
                return_loss=True,
                return_heatmap=False,
                **kwargs):
"""Calls either forward_train or forward_test depending on whether |
||||
|
return_loss=True. Note this setting will change the expected inputs. |
||||
|
When `return_loss=True`, img and img_meta are single-nested (i.e. |
||||
|
Tensor and List[dict]), and when `resturn_loss=False`, img and img_meta |
||||
|
should be double nested (i.e. List[Tensor], List[List[dict]]), with |
||||
|
the outer list indicating test time augmentations. |
||||
|
|
||||
|
Note: |
||||
|
- batch_size: N |
||||
|
- num_keypoints: K |
||||
|
- num_img_channel: C (Default: 3) |
||||
|
- img height: imgH |
||||
|
- img width: imgW |
||||
|
- heatmaps height: H |
||||
|
- heatmaps weight: W |
||||
|
|
||||
|
Args: |
||||
|
img (torch.Tensor[NxCximgHximgW]): Input images. |
||||
|
target (torch.Tensor[NxKxHxW]): Target heatmaps. |
||||
|
target_weight (torch.Tensor[NxKx1]): Weights across |
||||
|
different joint types. |
||||
|
img_metas (list(dict)): Information about data augmentation |
||||
|
By default this includes: |
||||
|
|
||||
|
- "image_file: path to the image file |
||||
|
- "center": center of the bbox |
||||
|
- "scale": scale of the bbox |
||||
|
- "rotation": rotation of the bbox |
||||
|
- "bbox_score": score of bbox |
||||
|
return_loss (bool): Option to `return loss`. `return loss=True` |
||||
|
for training, `return loss=False` for validation & test. |
||||
|
return_heatmap (bool) : Option to return heatmap. |
||||
|
|
||||
|
Returns: |
||||
|
dict|tuple: if `return loss` is true, then return losses. \ |
||||
|
Otherwise, return predicted poses, boxes, image paths \ |
||||
|
and heatmaps. |
||||
|
""" |
||||
|
        if return_loss:
            return self.forward_train(img, target, target_weight, img_metas,
                                      **kwargs)
        return self.forward_test(
            img, img_metas, return_heatmap=return_heatmap, **kwargs)

    def forward_train(self, img, target, target_weight, img_metas, **kwargs):
        """Defines the computation performed at every call when training."""

        img_sources = torch.from_numpy(
            np.array([ele['dataset_idx'] for ele in img_metas])).to(img.device)

        output = self.backbone(img, img_sources)
        if self.with_neck:
            output = self.neck(output)
        # if return loss
        losses = dict()

        main_stream_select = (img_sources == 0)
        # if torch.sum(main_stream_select) > 0:
        output_select = self.keypoint_head(output)

        target_select = target * main_stream_select.view(-1, 1, 1, 1)
        target_weight_select = target_weight * main_stream_select.view(-1, 1, 1)

        keypoint_losses = self.keypoint_head.get_loss(
            output_select, target_select, target_weight_select)
        losses['main_stream_loss'] = keypoint_losses['heatmap_loss']
        keypoint_accuracy = self.keypoint_head.get_accuracy(
            output_select, target_select, target_weight_select)
        losses['main_stream_acc'] = keypoint_accuracy['acc_pose']

        for idx in range(1, self.keypoint_heads_cnt):
            idx_select = (img_sources == idx)
            target_select = target * idx_select.view(-1, 1, 1, 1)
            target_weight_select = target_weight * idx_select.view(-1, 1, 1)
            output_select = self.associate_keypoint_heads[idx - 1](output)
            keypoint_losses = self.associate_keypoint_heads[idx - 1].get_loss(
                output_select, target_select, target_weight_select)
            losses[f'{idx}_loss'] = keypoint_losses['heatmap_loss']
            keypoint_accuracy = self.associate_keypoint_heads[idx - 1].get_accuracy(
                output_select, target_select, target_weight_select)
            losses[f'{idx}_acc'] = keypoint_accuracy['acc_pose']

        return losses
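
    # Note: targets are zero-masked by dataset_idx before each head's loss, so
    # every head runs on the full batch (keeping DDP gradients defined) but
    # only accumulates error on samples from its own dataset.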

    def forward_test(self, img, img_metas, return_heatmap=False, **kwargs):
        """Defines the computation performed at every call when testing."""
        assert img.size(0) == len(img_metas)
        batch_size, _, img_height, img_width = img.shape
        if batch_size > 1:
            assert 'bbox_id' in img_metas[0]

        result = {}
        img_sources = torch.from_numpy(
            np.array([ele['dataset_idx'] for ele in img_metas])).to(img.device)

        features = self.backbone(img, img_sources)

        if self.with_neck:
            features = self.neck(features)
        if self.with_keypoint:
            output_heatmap = self.keypoint_head.inference_model(
                features, flip_pairs=None)

        if self.test_cfg.get('flip_test', True):
            img_flipped = img.flip(3)
            features_flipped = self.backbone(img_flipped, img_sources)
            if self.with_neck:
                features_flipped = self.neck(features_flipped)
            if self.with_keypoint:
                output_flipped_heatmap = self.keypoint_head.inference_model(
                    features_flipped, img_metas[0]['flip_pairs'])
                output_heatmap = (output_heatmap +
                                  output_flipped_heatmap) * 0.5

        if self.with_keypoint:
            keypoint_result = self.keypoint_head.decode(
                img_metas, output_heatmap, img_size=[img_width, img_height])
            result.update(keypoint_result)

            if not return_heatmap:
                output_heatmap = None

            result['output_heatmap'] = output_heatmap

        return result

    def forward_dummy(self, img):
        """Used for computing network FLOPs.

        See ``tools/get_flops.py``.

        Args:
            img (torch.Tensor): Input image.

        Returns:
            Tensor: Output heatmaps.
        """
        output = self.backbone(img)
        if self.with_neck:
            output = self.neck(output)
        if self.with_keypoint:
            output = self.keypoint_head(output)
        return output

    @deprecated_api_warning({'pose_limb_color': 'pose_link_color'},
                            cls_name='TopDown')
    def show_result(self,
                    img,
                    result,
                    skeleton=None,
                    kpt_score_thr=0.3,
                    bbox_color='green',
                    pose_kpt_color=None,
                    pose_link_color=None,
                    text_color='white',
                    radius=4,
                    thickness=1,
                    font_scale=0.5,
                    bbox_thickness=1,
                    win_name='',
                    show=False,
                    show_keypoint_weight=False,
                    wait_time=0,
                    out_file=None):
        """Draw `result` over `img`.

        Args:
            img (str or Tensor): The image to be displayed.
            result (list[dict]): The results to draw over `img`
                (bbox_result, pose_result).
            skeleton (list[list]): The connection of keypoints.
                skeleton is 0-based indexing.
            kpt_score_thr (float, optional): Minimum score of keypoints
                to be shown. Default: 0.3.
            bbox_color (str or tuple or :obj:`Color`): Color of bbox lines.
            pose_kpt_color (np.array[Nx3]): Color of N keypoints.
                If None, do not draw keypoints.
            pose_link_color (np.array[Mx3]): Color of M links.
                If None, do not draw links.
            text_color (str or tuple or :obj:`Color`): Color of texts.
            radius (int): Radius of circles.
            thickness (int): Thickness of lines.
            font_scale (float): Font scales of texts.
            win_name (str): The window name.
            show (bool): Whether to show the image. Default: False.
            show_keypoint_weight (bool): Whether to change the transparency
                using the predicted confidence scores of keypoints.
            wait_time (int): Value of waitKey param.
                Default: 0.
            out_file (str or None): The filename to write the image.
                Default: None.

        Returns:
            Tensor: Visualized img, only if not `show` or `out_file`.
        """
        img = mmcv.imread(img)
        img = img.copy()

        bbox_result = []
        bbox_labels = []
        pose_result = []
        for res in result:
            if 'bbox' in res:
                bbox_result.append(res['bbox'])
                bbox_labels.append(res.get('label', None))
            pose_result.append(res['keypoints'])

        if bbox_result:
            bboxes = np.vstack(bbox_result)
            # draw bounding boxes
            imshow_bboxes(
                img,
                bboxes,
                labels=bbox_labels,
                colors=bbox_color,
                text_color=text_color,
                thickness=bbox_thickness,
                font_scale=font_scale,
                show=False)

        if pose_result:
            imshow_keypoints(img, pose_result, skeleton, kpt_score_thr,
                             pose_kpt_color, pose_link_color, radius,
                             thickness)

        if show:
            imshow(img, win_name, wait_time)

        if out_file is not None:
            imwrite(img, out_file)

        return img