
update MoE code for vitpose+

Yufei · 3 years ago · commit 98fb7f859f
  1. configs/_base_/datasets/aic_info.py (140 lines changed)
  2. configs/_base_/datasets/ap10k_info.py (142 lines changed)
  3. configs/_base_/datasets/coco_wholebody_info.py (1154 lines changed)
  4. configs/_base_/datasets/mpii_info.py (155 lines changed)
  5. configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/ViTPose_large_coco_256x192.py (2 lines changed)
  6. configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/vitPose+_base_coco+aic+mpii+ap10k+apt36k+wholebody_256x192_udp.py (500 lines changed)
  7. configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/vitPose+_huge_coco+aic+mpii+ap10k+apt36k+wholebody_256x192_udp.py (500 lines changed)
  8. configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/vitPose+_large_coco+aic+mpii+ap10k+apt36k+wholebody_256x192_udp.py (500 lines changed)
  9. mmcv_custom/checkpoint.py (15 lines changed)
  10. mmpose/datasets/datasets/base/kpt_2d_sview_rgb_img_top_down_dataset.py (3 lines changed)
  11. mmpose/datasets/pipelines/top_down_transform.py (8 lines changed)
  12. mmpose/models/backbones/__init__.py (3 lines changed)
  13. mmpose/models/backbones/base_backbone.py (4 lines changed)
  14. mmpose/models/backbones/vit.py (33 lines changed)
  15. mmpose/models/backbones/vit_moe.py (384 lines changed)
  16. mmpose/models/detectors/__init__.py (3 lines changed)
  17. mmpose/models/detectors/top_down_moe.py (351 lines changed)
  18. tools/train.py (2 lines changed)

configs/_base_/datasets/aic_info.py (140 lines changed)

@@ -0,0 +1,140 @@
aic_info = dict(
dataset_name='aic',
paper_info=dict(
author='Wu, Jiahong and Zheng, He and Zhao, Bo and '
'Li, Yixin and Yan, Baoming and Liang, Rui and '
'Wang, Wenjia and Zhou, Shipei and Lin, Guosen and '
'Fu, Yanwei and others',
title='Ai challenger: A large-scale dataset for going '
'deeper in image understanding',
container='arXiv',
year='2017',
homepage='https://github.com/AIChallenger/AI_Challenger_2017',
),
keypoint_info={
0:
dict(
name='right_shoulder',
id=0,
color=[255, 128, 0],
type='upper',
swap='left_shoulder'),
1:
dict(
name='right_elbow',
id=1,
color=[255, 128, 0],
type='upper',
swap='left_elbow'),
2:
dict(
name='right_wrist',
id=2,
color=[255, 128, 0],
type='upper',
swap='left_wrist'),
3:
dict(
name='left_shoulder',
id=3,
color=[0, 255, 0],
type='upper',
swap='right_shoulder'),
4:
dict(
name='left_elbow',
id=4,
color=[0, 255, 0],
type='upper',
swap='right_elbow'),
5:
dict(
name='left_wrist',
id=5,
color=[0, 255, 0],
type='upper',
swap='right_wrist'),
6:
dict(
name='right_hip',
id=6,
color=[255, 128, 0],
type='lower',
swap='left_hip'),
7:
dict(
name='right_knee',
id=7,
color=[255, 128, 0],
type='lower',
swap='left_knee'),
8:
dict(
name='right_ankle',
id=8,
color=[255, 128, 0],
type='lower',
swap='left_ankle'),
9:
dict(
name='left_hip',
id=9,
color=[0, 255, 0],
type='lower',
swap='right_hip'),
10:
dict(
name='left_knee',
id=10,
color=[0, 255, 0],
type='lower',
swap='right_knee'),
11:
dict(
name='left_ankle',
id=11,
color=[0, 255, 0],
type='lower',
swap='right_ankle'),
12:
dict(
name='head_top',
id=12,
color=[51, 153, 255],
type='upper',
swap=''),
13:
dict(name='neck', id=13, color=[51, 153, 255], type='upper', swap='')
},
skeleton_info={
0:
dict(link=('right_wrist', 'right_elbow'), id=0, color=[255, 128, 0]),
1: dict(
link=('right_elbow', 'right_shoulder'), id=1, color=[255, 128, 0]),
2: dict(link=('right_shoulder', 'neck'), id=2, color=[51, 153, 255]),
3: dict(link=('neck', 'left_shoulder'), id=3, color=[51, 153, 255]),
4: dict(link=('left_shoulder', 'left_elbow'), id=4, color=[0, 255, 0]),
5: dict(link=('left_elbow', 'left_wrist'), id=5, color=[0, 255, 0]),
6: dict(link=('right_ankle', 'right_knee'), id=6, color=[255, 128, 0]),
7: dict(link=('right_knee', 'right_hip'), id=7, color=[255, 128, 0]),
8: dict(link=('right_hip', 'left_hip'), id=8, color=[51, 153, 255]),
9: dict(link=('left_hip', 'left_knee'), id=9, color=[0, 255, 0]),
10: dict(link=('left_knee', 'left_ankle'), id=10, color=[0, 255, 0]),
11: dict(link=('head_top', 'neck'), id=11, color=[51, 153, 255]),
12: dict(
link=('right_shoulder', 'right_hip'), id=12, color=[51, 153, 255]),
13:
dict(link=('left_shoulder', 'left_hip'), id=13, color=[51, 153, 255])
},
joint_weights=[
1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1.
],
# 'https://github.com/AIChallenger/AI_Challenger_2017/blob/master/'
# 'Evaluation/keypoint_eval/keypoint_eval.py#L50'
# delta = 2 x sigma
sigmas=[
0.01388152, 0.01515228, 0.01057665, 0.01417709, 0.01497891, 0.01402144,
0.03909642, 0.03686941, 0.01981803, 0.03843971, 0.03412318, 0.02415081,
0.01291456, 0.01236173
])
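The comment above follows the COCO keypoint evaluation convention, where each keypoint's tolerance is delta = 2 x sigma. As a side note (not part of the commit), here is a minimal sketch of how such per-keypoint sigmas typically enter the OKS score; the helper and its toy values are illustrative only, not code from this repository.

```python
import numpy as np


def oks(pred, gt, visible, area, sigmas):
    """Object Keypoint Similarity of one predicted pose against one GT pose.

    pred, gt: (K, 2) keypoint coordinates; visible: (K,) bool mask;
    area: object area used as the scale term; sigmas: (K,) per-keypoint constants.
    """
    d2 = np.sum((np.asarray(pred) - np.asarray(gt)) ** 2, axis=1)   # squared distances
    k2 = (2 * np.asarray(sigmas)) ** 2                              # delta = 2 x sigma
    e = d2 / (2 * area * k2 + np.spacing(1))                        # normalized error
    return float(np.exp(-e)[visible].mean()) if visible.any() else 0.0


# Toy check: a perfect prediction scores OKS = 1.
gt = np.random.rand(14, 2) * 100
vis = np.ones(14, dtype=bool)
print(oks(gt, gt, vis, area=100.0 * 100.0, sigmas=[0.014] * 14))  # -> 1.0
```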

configs/_base_/datasets/ap10k_info.py (142 lines changed)

@@ -0,0 +1,142 @@
ap10k_info = dict(
dataset_name='ap10k',
paper_info=dict(
author='Yu, Hang and Xu, Yufei and Zhang, Jing and '
'Zhao, Wei and Guan, Ziyu and Tao, Dacheng',
title='AP-10K: A Benchmark for Animal Pose Estimation in the Wild',
container='35th Conference on Neural Information Processing Systems '
'(NeurIPS 2021) Track on Datasets and Bench-marks.',
year='2021',
homepage='https://github.com/AlexTheBad/AP-10K',
),
keypoint_info={
0:
dict(
name='L_Eye', id=0, color=[0, 255, 0], type='upper', swap='R_Eye'),
1:
dict(
name='R_Eye',
id=1,
color=[255, 128, 0],
type='upper',
swap='L_Eye'),
2:
dict(name='Nose', id=2, color=[51, 153, 255], type='upper', swap=''),
3:
dict(name='Neck', id=3, color=[51, 153, 255], type='upper', swap=''),
4:
dict(
name='Root of tail',
id=4,
color=[51, 153, 255],
type='lower',
swap=''),
5:
dict(
name='L_Shoulder',
id=5,
color=[51, 153, 255],
type='upper',
swap='R_Shoulder'),
6:
dict(
name='L_Elbow',
id=6,
color=[51, 153, 255],
type='upper',
swap='R_Elbow'),
7:
dict(
name='L_F_Paw',
id=7,
color=[0, 255, 0],
type='upper',
swap='R_F_Paw'),
8:
dict(
name='R_Shoulder',
id=8,
color=[0, 255, 0],
type='upper',
swap='L_Shoulder'),
9:
dict(
name='R_Elbow',
id=9,
color=[255, 128, 0],
type='upper',
swap='L_Elbow'),
10:
dict(
name='R_F_Paw',
id=10,
color=[0, 255, 0],
type='lower',
swap='L_F_Paw'),
11:
dict(
name='L_Hip',
id=11,
color=[255, 128, 0],
type='lower',
swap='R_Hip'),
12:
dict(
name='L_Knee',
id=12,
color=[255, 128, 0],
type='lower',
swap='R_Knee'),
13:
dict(
name='L_B_Paw',
id=13,
color=[0, 255, 0],
type='lower',
swap='R_B_Paw'),
14:
dict(
name='R_Hip', id=14, color=[0, 255, 0], type='lower',
swap='L_Hip'),
15:
dict(
name='R_Knee',
id=15,
color=[0, 255, 0],
type='lower',
swap='L_Knee'),
16:
dict(
name='R_B_Paw',
id=16,
color=[0, 255, 0],
type='lower',
swap='L_B_Paw'),
},
skeleton_info={
0: dict(link=('L_Eye', 'R_Eye'), id=0, color=[0, 0, 255]),
1: dict(link=('L_Eye', 'Nose'), id=1, color=[0, 0, 255]),
2: dict(link=('R_Eye', 'Nose'), id=2, color=[0, 0, 255]),
3: dict(link=('Nose', 'Neck'), id=3, color=[0, 255, 0]),
4: dict(link=('Neck', 'Root of tail'), id=4, color=[0, 255, 0]),
5: dict(link=('Neck', 'L_Shoulder'), id=5, color=[0, 255, 255]),
6: dict(link=('L_Shoulder', 'L_Elbow'), id=6, color=[0, 255, 255]),
7: dict(link=('L_Elbow', 'L_F_Paw'), id=6, color=[0, 255, 255]),
8: dict(link=('Neck', 'R_Shoulder'), id=7, color=[6, 156, 250]),
9: dict(link=('R_Shoulder', 'R_Elbow'), id=8, color=[6, 156, 250]),
10: dict(link=('R_Elbow', 'R_F_Paw'), id=9, color=[6, 156, 250]),
11: dict(link=('Root of tail', 'L_Hip'), id=10, color=[0, 255, 255]),
12: dict(link=('L_Hip', 'L_Knee'), id=11, color=[0, 255, 255]),
13: dict(link=('L_Knee', 'L_B_Paw'), id=12, color=[0, 255, 255]),
14: dict(link=('Root of tail', 'R_Hip'), id=13, color=[6, 156, 250]),
15: dict(link=('R_Hip', 'R_Knee'), id=14, color=[6, 156, 250]),
16: dict(link=('R_Knee', 'R_B_Paw'), id=15, color=[6, 156, 250]),
},
joint_weights=[
1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5,
1.5
],
sigmas=[
0.025, 0.025, 0.026, 0.035, 0.035, 0.079, 0.072, 0.062, 0.079, 0.072,
0.062, 0.107, 0.087, 0.089, 0.107, 0.087, 0.089
])

configs/_base_/datasets/coco_wholebody_info.py (1154 lines changed)

File diff suppressed because it is too large

configs/_base_/datasets/mpii_info.py (155 lines changed)

@@ -0,0 +1,155 @@
mpii_info = dict(
dataset_name='mpii',
paper_info=dict(
author='Mykhaylo Andriluka and Leonid Pishchulin and '
'Peter Gehler and Schiele, Bernt',
title='2D Human Pose Estimation: New Benchmark and '
'State of the Art Analysis',
container='IEEE Conference on Computer Vision and '
'Pattern Recognition (CVPR)',
year='2014',
homepage='http://human-pose.mpi-inf.mpg.de/',
),
keypoint_info={
0:
dict(
name='right_ankle',
id=0,
color=[255, 128, 0],
type='lower',
swap='left_ankle'),
1:
dict(
name='right_knee',
id=1,
color=[255, 128, 0],
type='lower',
swap='left_knee'),
2:
dict(
name='right_hip',
id=2,
color=[255, 128, 0],
type='lower',
swap='left_hip'),
3:
dict(
name='left_hip',
id=3,
color=[0, 255, 0],
type='lower',
swap='right_hip'),
4:
dict(
name='left_knee',
id=4,
color=[0, 255, 0],
type='lower',
swap='right_knee'),
5:
dict(
name='left_ankle',
id=5,
color=[0, 255, 0],
type='lower',
swap='right_ankle'),
6:
dict(name='pelvis', id=6, color=[51, 153, 255], type='lower', swap=''),
7:
dict(name='thorax', id=7, color=[51, 153, 255], type='upper', swap=''),
8:
dict(
name='upper_neck',
id=8,
color=[51, 153, 255],
type='upper',
swap=''),
9:
dict(
name='head_top', id=9, color=[51, 153, 255], type='upper',
swap=''),
10:
dict(
name='right_wrist',
id=10,
color=[255, 128, 0],
type='upper',
swap='left_wrist'),
11:
dict(
name='right_elbow',
id=11,
color=[255, 128, 0],
type='upper',
swap='left_elbow'),
12:
dict(
name='right_shoulder',
id=12,
color=[255, 128, 0],
type='upper',
swap='left_shoulder'),
13:
dict(
name='left_shoulder',
id=13,
color=[0, 255, 0],
type='upper',
swap='right_shoulder'),
14:
dict(
name='left_elbow',
id=14,
color=[0, 255, 0],
type='upper',
swap='right_elbow'),
15:
dict(
name='left_wrist',
id=15,
color=[0, 255, 0],
type='upper',
swap='right_wrist')
},
skeleton_info={
0:
dict(link=('right_ankle', 'right_knee'), id=0, color=[255, 128, 0]),
1:
dict(link=('right_knee', 'right_hip'), id=1, color=[255, 128, 0]),
2:
dict(link=('right_hip', 'pelvis'), id=2, color=[255, 128, 0]),
3:
dict(link=('pelvis', 'left_hip'), id=3, color=[0, 255, 0]),
4:
dict(link=('left_hip', 'left_knee'), id=4, color=[0, 255, 0]),
5:
dict(link=('left_knee', 'left_ankle'), id=5, color=[0, 255, 0]),
6:
dict(link=('pelvis', 'thorax'), id=6, color=[51, 153, 255]),
7:
dict(link=('thorax', 'upper_neck'), id=7, color=[51, 153, 255]),
8:
dict(link=('upper_neck', 'head_top'), id=8, color=[51, 153, 255]),
9:
dict(link=('upper_neck', 'right_shoulder'), id=9, color=[255, 128, 0]),
10:
dict(
link=('right_shoulder', 'right_elbow'), id=10, color=[255, 128,
0]),
11:
dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]),
12:
dict(link=('upper_neck', 'left_shoulder'), id=12, color=[0, 255, 0]),
13:
dict(link=('left_shoulder', 'left_elbow'), id=13, color=[0, 255, 0]),
14:
dict(link=('left_elbow', 'left_wrist'), id=14, color=[0, 255, 0])
},
joint_weights=[
1.5, 1.2, 1., 1., 1.2, 1.5, 1., 1., 1., 1., 1.5, 1.2, 1., 1., 1.2, 1.5
],
# Adapted from COCO dataset.
sigmas=[
0.089, 0.083, 0.107, 0.107, 0.083, 0.089, 0.026, 0.026, 0.026, 0.026,
0.062, 0.072, 0.179, 0.179, 0.072, 0.062
])

configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/ViTPose_large_coco_256x192.py (2 lines changed)

@@ -7,7 +7,7 @@ evaluation = dict(interval=10, metric='mAP', save_best='AP')
 optimizer = dict(type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1,
                  constructor='LayerDecayOptimizerConstructor',
                  paramwise_cfg=dict(
-                    num_layers=16,
+                    num_layers=24,
                     layer_decay_rate=0.8,
                     custom_keys={
                         'bias': dict(decay_multi=0.),
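The only change here corrects `num_layers` from 16 to 24 so that it matches the actual depth of ViT-Large (24 transformer blocks). This matters because the layer-wise decay constructor scales each block's learning rate by `layer_decay_rate` per layer counted from the top. A rough sketch of that rule follows; it is my own illustration, assuming scale = rate ** (num_layers + 1 - layer_id), not the constructor's exact code.

```python
# Hypothetical illustration of layer-wise LR decay: layer_id 0 is the patch
# embedding and layer_id num_layers + 1 covers the head / remaining params.
def layerwise_lr_scales(num_layers, rate):
    return [rate ** (num_layers + 1 - i) for i in range(num_layers + 2)]

old = layerwise_lr_scales(16, 0.8)   # 18 scales: too few for a 24-block ViT-Large
new = layerwise_lr_scales(24, 0.8)   # 26 scales: one per block plus embed and head
print(len(old), len(new), round(new[0], 4), new[-1])   # 18 26 0.0038 1.0
```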

configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/vitPose+_base_coco+aic+mpii+ap10k+apt36k+wholebody_256x192_udp.py (500 lines changed)

@@ -0,0 +1,500 @@
_base_ = [
'../../../../_base_/default_runtime.py',
'../../../../_base_/datasets/coco.py',
'../../../../_base_/datasets/aic_info.py',
'../../../../_base_/datasets/mpii_info.py',
'../../../../_base_/datasets/ap10k_info.py',
'../../../../_base_/datasets/coco_wholebody_info.py'
]
evaluation = dict(interval=10, metric='mAP', save_best='AP')
optimizer = dict(type='AdamW', lr=1e-3, betas=(0.9, 0.999), weight_decay=0.1,
constructor='LayerDecayOptimizerConstructor',
paramwise_cfg=dict(
num_layers=12,
layer_decay_rate=0.75,
custom_keys={
'bias': dict(decay_multi=0.),
'pos_embed': dict(decay_mult=0.),
'relative_position_bias_table': dict(decay_mult=0.),
'norm': dict(decay_mult=0.)
}
)
)
optimizer_config = dict(grad_clip=dict(max_norm=1., norm_type=2))
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=0.001,
step=[170, 200])
total_epochs = 210
target_type = 'GaussianHeatmap'
channel_cfg = dict(
num_output_channels=17,
dataset_joints=17,
dataset_channel=[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
],
inference_channel=[
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
])
aic_channel_cfg = dict(
num_output_channels=14,
dataset_joints=14,
dataset_channel=[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
],
inference_channel=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
mpii_channel_cfg = dict(
num_output_channels=16,
dataset_joints=16,
dataset_channel=list(range(16)),
inference_channel=list(range(16)))
crowdpose_channel_cfg = dict(
num_output_channels=14,
dataset_joints=14,
dataset_channel=[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
],
inference_channel=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
ap10k_channel_cfg = dict(
num_output_channels=17,
dataset_joints=17,
dataset_channel=[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
],
inference_channel=[
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
])
cocowholebody_channel_cfg = dict(
num_output_channels=133,
dataset_joints=133,
dataset_channel=[
list(range(133)),
],
inference_channel=list(range(133)))
# model settings
model = dict(
type='TopDownMoE',
pretrained=None,
backbone=dict(
type='ViTMoE',
img_size=(256, 192),
patch_size=16,
embed_dim=768,
depth=12,
num_heads=12,
ratio=1,
use_checkpoint=False,
mlp_ratio=4,
qkv_bias=True,
drop_path_rate=0.3,
num_expert=6,
part_features=192
),
keypoint_head=dict(
type='TopdownHeatmapSimpleHead',
in_channels=768,
num_deconv_layers=2,
num_deconv_filters=(256, 256),
num_deconv_kernels=(4, 4),
extra=dict(final_conv_kernel=1, ),
out_channels=channel_cfg['num_output_channels'],
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
associate_keypoint_head=[
dict(
type='TopdownHeatmapSimpleHead',
in_channels=768,
num_deconv_layers=2,
num_deconv_filters=(256, 256),
num_deconv_kernels=(4, 4),
extra=dict(final_conv_kernel=1, ),
out_channels=aic_channel_cfg['num_output_channels'],
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
dict(
type='TopdownHeatmapSimpleHead',
in_channels=768,
num_deconv_layers=2,
num_deconv_filters=(256, 256),
num_deconv_kernels=(4, 4),
extra=dict(final_conv_kernel=1, ),
out_channels=mpii_channel_cfg['num_output_channels'],
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
dict(
type='TopdownHeatmapSimpleHead',
in_channels=768,
num_deconv_layers=2,
num_deconv_filters=(256, 256),
num_deconv_kernels=(4, 4),
extra=dict(final_conv_kernel=1, ),
out_channels=crowdpose_channel_cfg['num_output_channels'],
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
dict(
type='TopdownHeatmapSimpleHead',
in_channels=768,
num_deconv_layers=2,
num_deconv_filters=(256, 256),
num_deconv_kernels=(4, 4),
extra=dict(final_conv_kernel=1, ),
out_channels=ap10k_channel_cfg['num_output_channels'],
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
dict(
type='TopdownHeatmapSimpleHead',
in_channels=768,
num_deconv_layers=2,
num_deconv_filters=(256, 256),
num_deconv_kernels=(4, 4),
extra=dict(final_conv_kernel=1, ),
out_channels=ap10k_channel_cfg['num_output_channels'],
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
dict(
type='TopdownHeatmapSimpleHead',
in_channels=768,
num_deconv_layers=2,
num_deconv_filters=(256, 256),
num_deconv_kernels=(4, 4),
extra=dict(final_conv_kernel=1, ),
out_channels=cocowholebody_channel_cfg['num_output_channels'],
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
],
train_cfg=dict(),
test_cfg=dict(
flip_test=True,
post_process='default',
shift_heatmap=False,
target_type=target_type,
modulate_kernel=11,
use_udp=True))
data_cfg = dict(
image_size=[192, 256],
heatmap_size=[48, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=False,
det_bbox_thr=0.0,
bbox_file='data/coco/person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
max_num_joints=133,
dataset_idx=0,
)
aic_data_cfg = dict(
image_size=[192, 256],
heatmap_size=[48, 64],
num_output_channels=aic_channel_cfg['num_output_channels'],
num_joints=aic_channel_cfg['dataset_joints'],
dataset_channel=aic_channel_cfg['dataset_channel'],
inference_channel=aic_channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=True,
det_bbox_thr=0.0,
bbox_file='data/coco/person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
max_num_joints=133,
dataset_idx=1,
)
mpii_data_cfg = dict(
image_size=[192, 256],
heatmap_size=[48, 64],
num_output_channels=mpii_channel_cfg['num_output_channels'],
num_joints=mpii_channel_cfg['dataset_joints'],
dataset_channel=mpii_channel_cfg['dataset_channel'],
inference_channel=mpii_channel_cfg['inference_channel'],
max_num_joints=133,
dataset_idx=2,
use_gt_bbox=True,
bbox_file=None,
)
ap10k_data_cfg = dict(
image_size=[192, 256],
heatmap_size=[48, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=True,
det_bbox_thr=0.0,
bbox_file='',
max_num_joints=133,
dataset_idx=3,
)
ap36k_data_cfg = dict(
image_size=[192, 256],
heatmap_size=[48, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=True,
det_bbox_thr=0.0,
bbox_file='',
max_num_joints=133,
dataset_idx=4,
)
cocowholebody_data_cfg = dict(
image_size=[192, 256],
heatmap_size=[48, 64],
num_output_channels=cocowholebody_channel_cfg['num_output_channels'],
num_joints=cocowholebody_channel_cfg['dataset_joints'],
dataset_channel=cocowholebody_channel_cfg['dataset_channel'],
inference_channel=cocowholebody_channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=False,
det_bbox_thr=0.0,
bbox_file='data/coco/person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
dataset_idx=5,
max_num_joints=133,
)
cocowholebody_train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownHalfBodyTransform',
num_joints_half_body=8,
prob_half_body=0.3),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTarget', sigma=2),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx'
]),
]
ap10k_train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownHalfBodyTransform',
num_joints_half_body=8,
prob_half_body=0.3),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTarget', sigma=2),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx'
]),
]
aic_train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownHalfBodyTransform',
num_joints_half_body=8,
prob_half_body=0.3),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTarget', sigma=2),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx'
]),
]
mpii_train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine', use_udp=True),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(
type='TopDownGenerateTarget',
sigma=2,
encoding='UDP',
target_type=target_type),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'flip_pairs', 'dataset_idx'
]),
]
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownHalfBodyTransform',
num_joints_half_body=8,
prob_half_body=0.3),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine', use_udp=True),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(
type='TopDownGenerateTarget',
sigma=2,
encoding='UDP',
target_type=target_type),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx'
]),
]
val_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownAffine', use_udp=True),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(
type='Collect',
keys=['img'],
meta_keys=[
'image_file', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs', 'dataset_idx'
]),
]
test_pipeline = val_pipeline
data_root = 'data/coco'
aic_data_root = 'data/aic'
mpii_data_root = 'data/mpii'
ap10k_data_root = 'data/ap10k'
ap36k_data_root = 'data/ap36k'
data = dict(
samples_per_gpu=128,
workers_per_gpu=8,
val_dataloader=dict(samples_per_gpu=64),
test_dataloader=dict(samples_per_gpu=64),
train=[
dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
img_prefix=f'{data_root}/train2017/',
data_cfg=data_cfg,
pipeline=train_pipeline,
dataset_info={{_base_.dataset_info}}),
dict(
type='TopDownAicDataset',
ann_file=f'{aic_data_root}/annotations/person_keypoints_train2017.json',
img_prefix=f'{aic_data_root}/ai_challenger_keypoint_train_20170909/'
'keypoint_train_images_20170902/',
data_cfg=aic_data_cfg,
pipeline=aic_train_pipeline,
dataset_info={{_base_.aic_info}}),
dict(
type='TopDownMpiiDataset',
ann_file=f'{mpii_data_root}/annotations/mpii_train.json',
img_prefix=f'{mpii_data_root}/images/',
data_cfg=mpii_data_cfg,
pipeline=mpii_train_pipeline,
dataset_info={{_base_.mpii_info}}),
dict(
type='AnimalAP10KDataset',
ann_file=f'{ap10k_data_root}/annotations/ap10k-train-split1.json',
img_prefix=f'{ap10k_data_root}/data/',
data_cfg=ap10k_data_cfg,
pipeline=ap10k_train_pipeline,
dataset_info={{_base_.ap10k_info}}),
dict(
type='AnimalAP10KDataset',
ann_file=f'{ap36k_data_root}/annotations/train_annotations_1.json',
img_prefix=f'{ap36k_data_root}/',
data_cfg=ap36k_data_cfg,
pipeline=ap10k_train_pipeline,
dataset_info={{_base_.ap10k_info}}),
dict(
type='TopDownCocoWholeBodyDataset',
ann_file=f'{data_root}/annotations/coco_wholebody_train_v1.0.json',
img_prefix=f'{data_root}/train2017/',
data_cfg=cocowholebody_data_cfg,
pipeline=cocowholebody_train_pipeline,
dataset_info={{_base_.cocowholebody_info}}),
],
val=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=val_pipeline,
dataset_info={{_base_.dataset_info}}),
test=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=test_pipeline,
dataset_info={{_base_.dataset_info}}),
)
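For orientation (not part of the commit), here is a minimal sketch of loading this config and instantiating the MoE detector with the mmcv 1.x / mmpose 0.x APIs this repository uses; it assumes the `TopDownMoE` detector and `ViTMoE` backbone added in this commit are registered on import.

```python
# Sketch: build the ViTPose+ base model from the config above.
from mmcv import Config
from mmpose.models import build_posenet

cfg = Config.fromfile(
    'configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/'
    'vitPose+_base_coco+aic+mpii+ap10k+apt36k+wholebody_256x192_udp.py')

# One shared ViTMoE backbone with the COCO head and the associate heads
# defined above; each training dataset routes to its own head via dataset_idx.
model = build_posenet(cfg.model)
print(sum(p.numel() for p in model.parameters()) / 1e6, 'M parameters')
```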

configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/vitPose+_huge_coco+aic+mpii+ap10k+apt36k+wholebody_256x192_udp.py (500 lines changed)

@@ -0,0 +1,500 @@
_base_ = [
'../../../../_base_/default_runtime.py',
'../../../../_base_/datasets/coco.py',
'../../../../_base_/datasets/aic_info.py',
'../../../../_base_/datasets/mpii_info.py',
'../../../../_base_/datasets/ap10k_info.py',
'../../../../_base_/datasets/coco_wholebody_info.py'
]
evaluation = dict(interval=10, metric='mAP', save_best='AP')
optimizer = dict(type='AdamW', lr=1e-3, betas=(0.9, 0.999), weight_decay=0.1,
constructor='LayerDecayOptimizerConstructor',
paramwise_cfg=dict(
num_layers=32,
layer_decay_rate=0.8,
custom_keys={
'bias': dict(decay_multi=0.),
'pos_embed': dict(decay_mult=0.),
'relative_position_bias_table': dict(decay_mult=0.),
'norm': dict(decay_mult=0.)
}
)
)
optimizer_config = dict(grad_clip=dict(max_norm=1., norm_type=2))
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=0.001,
step=[170, 200])
total_epochs = 210
target_type = 'GaussianHeatmap'
channel_cfg = dict(
num_output_channels=17,
dataset_joints=17,
dataset_channel=[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
],
inference_channel=[
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
])
aic_channel_cfg = dict(
num_output_channels=14,
dataset_joints=14,
dataset_channel=[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
],
inference_channel=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
mpii_channel_cfg = dict(
num_output_channels=16,
dataset_joints=16,
dataset_channel=list(range(16)),
inference_channel=list(range(16)))
crowdpose_channel_cfg = dict(
num_output_channels=14,
dataset_joints=14,
dataset_channel=[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
],
inference_channel=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
ap10k_channel_cfg = dict(
num_output_channels=17,
dataset_joints=17,
dataset_channel=[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
],
inference_channel=[
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
])
cocowholebody_channel_cfg = dict(
num_output_channels=133,
dataset_joints=133,
dataset_channel=[
list(range(133)),
],
inference_channel=list(range(133)))
# model settings
model = dict(
type='TopDownMoE',
pretrained=None,
backbone=dict(
type='ViTMoE',
img_size=(256, 192),
patch_size=16,
embed_dim=1280,
depth=32,
num_heads=16,
ratio=1,
use_checkpoint=False,
mlp_ratio=4,
qkv_bias=True,
drop_path_rate=0.55,
num_expert=6,
part_features=320
),
keypoint_head=dict(
type='TopdownHeatmapSimpleHead',
in_channels=1280,
num_deconv_layers=2,
num_deconv_filters=(256, 256),
num_deconv_kernels=(4, 4),
extra=dict(final_conv_kernel=1, ),
out_channels=channel_cfg['num_output_channels'],
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
associate_keypoint_head=[
dict(
type='TopdownHeatmapSimpleHead',
in_channels=1280,
num_deconv_layers=2,
num_deconv_filters=(256, 256),
num_deconv_kernels=(4, 4),
extra=dict(final_conv_kernel=1, ),
out_channels=aic_channel_cfg['num_output_channels'],
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
dict(
type='TopdownHeatmapSimpleHead',
in_channels=1280,
num_deconv_layers=2,
num_deconv_filters=(256, 256),
num_deconv_kernels=(4, 4),
extra=dict(final_conv_kernel=1, ),
out_channels=mpii_channel_cfg['num_output_channels'],
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
dict(
type='TopdownHeatmapSimpleHead',
in_channels=1280,
num_deconv_layers=2,
num_deconv_filters=(256, 256),
num_deconv_kernels=(4, 4),
extra=dict(final_conv_kernel=1, ),
out_channels=crowdpose_channel_cfg['num_output_channels'],
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
dict(
type='TopdownHeatmapSimpleHead',
in_channels=1280,
num_deconv_layers=2,
num_deconv_filters=(256, 256),
num_deconv_kernels=(4, 4),
extra=dict(final_conv_kernel=1, ),
out_channels=ap10k_channel_cfg['num_output_channels'],
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
dict(
type='TopdownHeatmapSimpleHead',
in_channels=1280,
num_deconv_layers=2,
num_deconv_filters=(256, 256),
num_deconv_kernels=(4, 4),
extra=dict(final_conv_kernel=1, ),
out_channels=ap10k_channel_cfg['num_output_channels'],
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
dict(
type='TopdownHeatmapSimpleHead',
in_channels=1280,
num_deconv_layers=2,
num_deconv_filters=(256, 256),
num_deconv_kernels=(4, 4),
extra=dict(final_conv_kernel=1, ),
out_channels=cocowholebody_channel_cfg['num_output_channels'],
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
],
train_cfg=dict(),
test_cfg=dict(
flip_test=True,
post_process='default',
shift_heatmap=False,
target_type=target_type,
modulate_kernel=11,
use_udp=True))
data_cfg = dict(
image_size=[192, 256],
heatmap_size=[48, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=False,
det_bbox_thr=0.0,
bbox_file='data/coco/person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
max_num_joints=133,
dataset_idx=0,
)
aic_data_cfg = dict(
image_size=[192, 256],
heatmap_size=[48, 64],
num_output_channels=aic_channel_cfg['num_output_channels'],
num_joints=aic_channel_cfg['dataset_joints'],
dataset_channel=aic_channel_cfg['dataset_channel'],
inference_channel=aic_channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=True,
det_bbox_thr=0.0,
bbox_file='data/coco/person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
max_num_joints=133,
dataset_idx=1,
)
mpii_data_cfg = dict(
image_size=[192, 256],
heatmap_size=[48, 64],
num_output_channels=mpii_channel_cfg['num_output_channels'],
num_joints=mpii_channel_cfg['dataset_joints'],
dataset_channel=mpii_channel_cfg['dataset_channel'],
inference_channel=mpii_channel_cfg['inference_channel'],
max_num_joints=133,
dataset_idx=2,
use_gt_bbox=True,
bbox_file=None,
)
ap10k_data_cfg = dict(
image_size=[192, 256],
heatmap_size=[48, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=True,
det_bbox_thr=0.0,
bbox_file='',
max_num_joints=133,
dataset_idx=3,
)
ap36k_data_cfg = dict(
image_size=[192, 256],
heatmap_size=[48, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=True,
det_bbox_thr=0.0,
bbox_file='',
max_num_joints=133,
dataset_idx=4,
)
cocowholebody_data_cfg = dict(
image_size=[192, 256],
heatmap_size=[48, 64],
num_output_channels=cocowholebody_channel_cfg['num_output_channels'],
num_joints=cocowholebody_channel_cfg['dataset_joints'],
dataset_channel=cocowholebody_channel_cfg['dataset_channel'],
inference_channel=cocowholebody_channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=False,
det_bbox_thr=0.0,
bbox_file='data/coco/person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
dataset_idx=5,
max_num_joints=133,
)
cocowholebody_train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownHalfBodyTransform',
num_joints_half_body=8,
prob_half_body=0.3),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTarget', sigma=2),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx'
]),
]
ap10k_train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownHalfBodyTransform',
num_joints_half_body=8,
prob_half_body=0.3),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTarget', sigma=2),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx'
]),
]
aic_train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownHalfBodyTransform',
num_joints_half_body=8,
prob_half_body=0.3),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTarget', sigma=2),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx'
]),
]
mpii_train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine', use_udp=True),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(
type='TopDownGenerateTarget',
sigma=2,
encoding='UDP',
target_type=target_type),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'flip_pairs', 'dataset_idx'
]),
]
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownHalfBodyTransform',
num_joints_half_body=8,
prob_half_body=0.3),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine', use_udp=True),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(
type='TopDownGenerateTarget',
sigma=2,
encoding='UDP',
target_type=target_type),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx'
]),
]
val_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownAffine', use_udp=True),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(
type='Collect',
keys=['img'],
meta_keys=[
'image_file', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs', 'dataset_idx'
]),
]
test_pipeline = val_pipeline
data_root = 'data/coco'
aic_data_root = 'data/aic'
mpii_data_root = 'data/mpii'
ap10k_data_root = 'data/ap10k'
ap36k_data_root = 'data/ap36k'
data = dict(
samples_per_gpu=128,
workers_per_gpu=8,
val_dataloader=dict(samples_per_gpu=64),
test_dataloader=dict(samples_per_gpu=64),
train=[
dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
img_prefix=f'{data_root}/train2017/',
data_cfg=data_cfg,
pipeline=train_pipeline,
dataset_info={{_base_.dataset_info}}),
dict(
type='TopDownAicDataset',
ann_file=f'{aic_data_root}/annotations/person_keypoints_train2017.json',
img_prefix=f'{aic_data_root}/ai_challenger_keypoint_train_20170909/'
'keypoint_train_images_20170902/',
data_cfg=aic_data_cfg,
pipeline=aic_train_pipeline,
dataset_info={{_base_.aic_info}}),
dict(
type='TopDownMpiiDataset',
ann_file=f'{mpii_data_root}/annotations/mpii_train.json',
img_prefix=f'{mpii_data_root}/images/',
data_cfg=mpii_data_cfg,
pipeline=mpii_train_pipeline,
dataset_info={{_base_.mpii_info}}),
dict(
type='AnimalAP10KDataset',
ann_file=f'{ap10k_data_root}/annotations/ap10k-train-split1.json',
img_prefix=f'{ap10k_data_root}/data/',
data_cfg=ap10k_data_cfg,
pipeline=ap10k_train_pipeline,
dataset_info={{_base_.ap10k_info}}),
dict(
type='AnimalAP10KDataset',
ann_file=f'{ap36k_data_root}/annotations/train_annotations_1.json',
img_prefix=f'{ap36k_data_root}/',
data_cfg=ap36k_data_cfg,
pipeline=ap10k_train_pipeline,
dataset_info={{_base_.ap10k_info}}),
dict(
type='TopDownCocoWholeBodyDataset',
ann_file=f'{data_root}/annotations/coco_wholebody_train_v1.0.json',
img_prefix=f'{data_root}/train2017/',
data_cfg=cocowholebody_data_cfg,
pipeline=cocowholebody_train_pipeline,
dataset_info={{_base_.cocowholebody_info}}),
],
val=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=val_pipeline,
dataset_info={{_base_.dataset_info}}),
test=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=test_pipeline,
dataset_info={{_base_.dataset_info}}),
)

configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/vitPose+_large_coco+aic+mpii+ap10k+apt36k+wholebody_256x192_udp.py (500 lines changed)

@@ -0,0 +1,500 @@
_base_ = [
'../../../../_base_/default_runtime.py',
'../../../../_base_/datasets/coco.py',
'../../../../_base_/datasets/aic_info.py',
'../../../../_base_/datasets/mpii_info.py',
'../../../../_base_/datasets/ap10k_info.py',
'../../../../_base_/datasets/coco_wholebody_info.py'
]
evaluation = dict(interval=10, metric='mAP', save_best='AP')
optimizer = dict(type='AdamW', lr=1e-3, betas=(0.9, 0.999), weight_decay=0.1,
constructor='LayerDecayOptimizerConstructor',
paramwise_cfg=dict(
num_layers=24,
layer_decay_rate=0.8,
custom_keys={
'bias': dict(decay_multi=0.),
'pos_embed': dict(decay_mult=0.),
'relative_position_bias_table': dict(decay_mult=0.),
'norm': dict(decay_mult=0.)
}
)
)
optimizer_config = dict(grad_clip=dict(max_norm=1., norm_type=2))
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=0.001,
step=[170, 200])
total_epochs = 210
target_type = 'GaussianHeatmap'
channel_cfg = dict(
num_output_channels=17,
dataset_joints=17,
dataset_channel=[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
],
inference_channel=[
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
])
aic_channel_cfg = dict(
num_output_channels=14,
dataset_joints=14,
dataset_channel=[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
],
inference_channel=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
mpii_channel_cfg = dict(
num_output_channels=16,
dataset_joints=16,
dataset_channel=list(range(16)),
inference_channel=list(range(16)))
crowdpose_channel_cfg = dict(
num_output_channels=14,
dataset_joints=14,
dataset_channel=[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
],
inference_channel=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
ap10k_channel_cfg = dict(
num_output_channels=17,
dataset_joints=17,
dataset_channel=[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
],
inference_channel=[
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
])
cocowholebody_channel_cfg = dict(
num_output_channels=133,
dataset_joints=133,
dataset_channel=[
list(range(133)),
],
inference_channel=list(range(133)))
# model settings
model = dict(
type='TopDownMoE',
pretrained=None,
backbone=dict(
type='ViTMoE',
img_size=(256, 192),
patch_size=16,
embed_dim=1024,
depth=24,
num_heads=16,
ratio=1,
use_checkpoint=False,
mlp_ratio=4,
qkv_bias=True,
drop_path_rate=0.5,
num_expert=6,
part_features=256
),
keypoint_head=dict(
type='TopdownHeatmapSimpleHead',
in_channels=1024,
num_deconv_layers=2,
num_deconv_filters=(256, 256),
num_deconv_kernels=(4, 4),
extra=dict(final_conv_kernel=1, ),
out_channels=channel_cfg['num_output_channels'],
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
associate_keypoint_head=[
dict(
type='TopdownHeatmapSimpleHead',
in_channels=1024,
num_deconv_layers=2,
num_deconv_filters=(256, 256),
num_deconv_kernels=(4, 4),
extra=dict(final_conv_kernel=1, ),
out_channels=aic_channel_cfg['num_output_channels'],
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
dict(
type='TopdownHeatmapSimpleHead',
in_channels=1024,
num_deconv_layers=2,
num_deconv_filters=(256, 256),
num_deconv_kernels=(4, 4),
extra=dict(final_conv_kernel=1, ),
out_channels=mpii_channel_cfg['num_output_channels'],
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
dict(
type='TopdownHeatmapSimpleHead',
in_channels=1024,
num_deconv_layers=2,
num_deconv_filters=(256, 256),
num_deconv_kernels=(4, 4),
extra=dict(final_conv_kernel=1, ),
out_channels=crowdpose_channel_cfg['num_output_channels'],
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
dict(
type='TopdownHeatmapSimpleHead',
in_channels=1024,
num_deconv_layers=2,
num_deconv_filters=(256, 256),
num_deconv_kernels=(4, 4),
extra=dict(final_conv_kernel=1, ),
out_channels=ap10k_channel_cfg['num_output_channels'],
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
dict(
type='TopdownHeatmapSimpleHead',
in_channels=1024,
num_deconv_layers=2,
num_deconv_filters=(256, 256),
num_deconv_kernels=(4, 4),
extra=dict(final_conv_kernel=1, ),
out_channels=ap10k_channel_cfg['num_output_channels'],
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
dict(
type='TopdownHeatmapSimpleHead',
in_channels=1024,
num_deconv_layers=2,
num_deconv_filters=(256, 256),
num_deconv_kernels=(4, 4),
extra=dict(final_conv_kernel=1, ),
out_channels=cocowholebody_channel_cfg['num_output_channels'],
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
],
train_cfg=dict(),
test_cfg=dict(
flip_test=True,
post_process='default',
shift_heatmap=False,
target_type=target_type,
modulate_kernel=11,
use_udp=True))
data_cfg = dict(
image_size=[192, 256],
heatmap_size=[48, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=False,
det_bbox_thr=0.0,
bbox_file='data/coco/person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
max_num_joints=133,
dataset_idx=0,
)
aic_data_cfg = dict(
image_size=[192, 256],
heatmap_size=[48, 64],
num_output_channels=aic_channel_cfg['num_output_channels'],
num_joints=aic_channel_cfg['dataset_joints'],
dataset_channel=aic_channel_cfg['dataset_channel'],
inference_channel=aic_channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=True,
det_bbox_thr=0.0,
bbox_file='data/coco/person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
max_num_joints=133,
dataset_idx=1,
)
mpii_data_cfg = dict(
image_size=[192, 256],
heatmap_size=[48, 64],
num_output_channels=mpii_channel_cfg['num_output_channels'],
num_joints=mpii_channel_cfg['dataset_joints'],
dataset_channel=mpii_channel_cfg['dataset_channel'],
inference_channel=mpii_channel_cfg['inference_channel'],
max_num_joints=133,
dataset_idx=2,
use_gt_bbox=True,
bbox_file=None,
)
ap10k_data_cfg = dict(
image_size=[192, 256],
heatmap_size=[48, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=True,
det_bbox_thr=0.0,
bbox_file='',
max_num_joints=133,
dataset_idx=3,
)
ap36k_data_cfg = dict(
image_size=[192, 256],
heatmap_size=[48, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=True,
det_bbox_thr=0.0,
bbox_file='',
max_num_joints=133,
dataset_idx=4,
)
cocowholebody_data_cfg = dict(
image_size=[192, 256],
heatmap_size=[48, 64],
num_output_channels=cocowholebody_channel_cfg['num_output_channels'],
num_joints=cocowholebody_channel_cfg['dataset_joints'],
dataset_channel=cocowholebody_channel_cfg['dataset_channel'],
inference_channel=cocowholebody_channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=False,
det_bbox_thr=0.0,
bbox_file='data/coco/person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
dataset_idx=5,
max_num_joints=133,
)
cocowholebody_train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownHalfBodyTransform',
num_joints_half_body=8,
prob_half_body=0.3),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTarget', sigma=2),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx'
]),
]
ap10k_train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownHalfBodyTransform',
num_joints_half_body=8,
prob_half_body=0.3),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTarget', sigma=2),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx'
]),
]
aic_train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownHalfBodyTransform',
num_joints_half_body=8,
prob_half_body=0.3),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTarget', sigma=2),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx'
]),
]
mpii_train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine', use_udp=True),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(
type='TopDownGenerateTarget',
sigma=2,
encoding='UDP',
target_type=target_type),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'flip_pairs', 'dataset_idx'
]),
]
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownHalfBodyTransform',
num_joints_half_body=8,
prob_half_body=0.3),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine', use_udp=True),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(
type='TopDownGenerateTarget',
sigma=2,
encoding='UDP',
target_type=target_type),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx'
]),
]
val_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownAffine', use_udp=True),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(
type='Collect',
keys=['img'],
meta_keys=[
'image_file', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs', 'dataset_idx'
]),
]
test_pipeline = val_pipeline
data_root = 'data/coco'
aic_data_root = 'data/aic'
mpii_data_root = 'data/mpii'
ap10k_data_root = 'data/ap10k'
ap36k_data_root = 'data/ap36k'
data = dict(
samples_per_gpu=128,
workers_per_gpu=8,
val_dataloader=dict(samples_per_gpu=64),
test_dataloader=dict(samples_per_gpu=64),
train=[
dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
img_prefix=f'{data_root}/train2017/',
data_cfg=data_cfg,
pipeline=train_pipeline,
dataset_info={{_base_.dataset_info}}),
dict(
type='TopDownAicDataset',
ann_file=f'{aic_data_root}/annotations/person_keypoints_train2017.json',
img_prefix=f'{aic_data_root}/ai_challenger_keypoint_train_20170909/'
'keypoint_train_images_20170902/',
data_cfg=aic_data_cfg,
pipeline=aic_train_pipeline,
dataset_info={{_base_.aic_info}}),
dict(
type='TopDownMpiiDataset',
ann_file=f'{mpii_data_root}/annotations/mpii_train.json',
img_prefix=f'{mpii_data_root}/images/',
data_cfg=mpii_data_cfg,
pipeline=mpii_train_pipeline,
dataset_info={{_base_.mpii_info}}),
dict(
type='AnimalAP10KDataset',
ann_file=f'{ap10k_data_root}/annotations/ap10k-train-split1.json',
img_prefix=f'{ap10k_data_root}/data/',
data_cfg=ap10k_data_cfg,
pipeline=ap10k_train_pipeline,
dataset_info={{_base_.ap10k_info}}),
dict(
type='AnimalAP10KDataset',
ann_file=f'{ap36k_data_root}/annotations/train_annotations_1.json',
img_prefix=f'{ap36k_data_root}/',
data_cfg=ap36k_data_cfg,
pipeline=ap10k_train_pipeline,
dataset_info={{_base_.ap10k_info}}),
dict(
type='TopDownCocoWholeBodyDataset',
ann_file=f'{data_root}/annotations/coco_wholebody_train_v1.0.json',
img_prefix=f'{data_root}/train2017/',
data_cfg=cocowholebody_data_cfg,
pipeline=cocowholebody_train_pipeline,
dataset_info={{_base_.cocowholebody_info}}),
],
val=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=val_pipeline,
dataset_info={{_base_.dataset_info}}),
test=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=test_pipeline,
dataset_info={{_base_.dataset_info}}),
)

mmcv_custom/checkpoint.py (15 lines changed)

@@ -25,6 +25,8 @@ from mmcv.runner import get_dist_info
 from scipy import interpolate
 import numpy as np
 import math
+import re
+import copy
 
 ENV_MMCV_HOME = 'MMCV_HOME'
 ENV_XDG_CACHE_HOME = 'XDG_CACHE_HOME'
@ -313,6 +315,7 @@ def load_checkpoint(model,
strict=False, strict=False,
logger=None, logger=None,
patch_padding='pad', patch_padding='pad',
part_features=None
): ):
"""Load checkpoint from a file or URI. """Load checkpoint from a file or URI.
@@ -390,8 +393,18 @@ def load_checkpoint(model,
             new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
             state_dict['pos_embed'] = new_pos_embed
 
+    new_state_dict = copy.deepcopy(state_dict)
+    if part_features is not None:
+        current_keys = list(model.state_dict().keys())
+        for key in current_keys:
+            if "mlp.experts" in key:
+                source_key = re.sub(r'experts.\d+.', 'fc2.', key)
+                new_state_dict[key] = state_dict[source_key][-part_features:]
+            elif 'fc2' in key:
+                new_state_dict[key] = state_dict[key][:-part_features]
+
     # load state_dict
-    load_state_dict(model, state_dict, strict, logger)
+    load_state_dict(model, new_state_dict, strict, logger)
     return checkpoint
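The new block above adapts a plain (single-FFN) ViT checkpoint to the MoE backbone: each `mlp.experts.<i>` parameter is initialised from the last `part_features` output channels of the matching pretrained `mlp.fc2` weight, while the shared `fc2` keeps the remaining channels. A standalone sketch of that slicing with made-up shapes (the key names and sizes below are illustrative):

```python
import re

import torch

embed_dim, hidden, part_features = 768, 3072, 192

# Pretrained single-FFN weight: fc2 maps hidden -> embed_dim.
state_dict = {'blocks.0.mlp.fc2.weight': torch.randn(embed_dim, hidden)}

# Keys the MoE model expects: a (smaller) shared fc2 plus per-expert slices.
moe_keys = ['blocks.0.mlp.fc2.weight'] + [
    f'blocks.0.mlp.experts.{i}.weight' for i in range(6)]

new_state_dict = {}
for key in moe_keys:
    if 'mlp.experts' in key:
        # each expert starts from the last part_features output channels
        source_key = re.sub(r'experts.\d+.', 'fc2.', key)
        new_state_dict[key] = state_dict[source_key][-part_features:]
    elif 'fc2' in key:
        # the shared fc2 keeps the remaining channels
        new_state_dict[key] = state_dict[key][:-part_features]

print(new_state_dict['blocks.0.mlp.fc2.weight'].shape)        # torch.Size([576, 3072])
print(new_state_dict['blocks.0.mlp.experts.0.weight'].shape)  # torch.Size([192, 3072])
```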

mmpose/datasets/datasets/base/kpt_2d_sview_rgb_img_top_down_dataset.py (3 lines changed)

@@ -60,6 +60,9 @@ class Kpt2dSviewRgbImgTopDownDataset(Dataset, metaclass=ABCMeta):
         self.ann_info['num_output_channels'] = data_cfg['num_output_channels']
         self.ann_info['dataset_channel'] = data_cfg['dataset_channel']
 
+        self.ann_info['max_num_joints'] = data_cfg.get('max_num_joints', None)
+        self.ann_info['dataset_idx'] = data_cfg.get('dataset_idx', 0)
+
         self.ann_info['use_different_joint_weights'] = data_cfg.get(
             'use_different_joint_weights', False)

mmpose/datasets/pipelines/top_down_transform.py (8 lines changed)

@@ -633,9 +633,17 @@ class TopDownGenerateTarget:
             raise ValueError(
                 f'Encoding approach {self.encoding} is not supported!')
 
+        if results['ann_info'].get('max_num_joints', None) is not None:
+            W, H = results['ann_info']['heatmap_size']
+            padded_length = int(results['ann_info'].get('max_num_joints') - results['ann_info'].get('num_joints'))
+            target_weight = np.concatenate([target_weight, np.zeros((padded_length, 1), dtype=np.float32)], 0)
+            target = np.concatenate([target, np.zeros((padded_length, H, W), dtype=np.float32)], 0)
+
         results['target'] = target
         results['target_weight'] = target_weight
+        results['dataset_idx'] = results['ann_info'].get('dataset_idx', 0)
 
         return results
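These additions zero-pad each sample's target heatmaps and weights from the dataset's own joint count up to `max_num_joints` (133 in the configs above), so samples from different datasets collate into one batch, and attach `dataset_idx` so the detector can route the sample to the correct head. A small standalone sketch of the padding step, with shapes taken from this config:

```python
import numpy as np

num_joints, max_num_joints = 17, 133   # e.g. COCO joints padded to the union size
H, W = 64, 48                          # heatmap_size is [W, H] = [48, 64] in the config

target = np.random.rand(num_joints, H, W).astype(np.float32)
target_weight = np.ones((num_joints, 1), dtype=np.float32)

padded = max_num_joints - num_joints
target = np.concatenate(
    [target, np.zeros((padded, H, W), dtype=np.float32)], 0)
target_weight = np.concatenate(
    [target_weight, np.zeros((padded, 1), dtype=np.float32)], 0)

print(target.shape, target_weight.shape)   # (133, 64, 48) (133, 1)
```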

mmpose/models/backbones/__init__.py (3 lines changed)

@@ -25,11 +25,12 @@ from .vgg import VGG
 from .vipnas_mbv3 import ViPNAS_MobileNetV3
 from .vipnas_resnet import ViPNAS_ResNet
 from .vit import ViT
+from .vit_moe import ViTMoE
 
 __all__ = [
     'AlexNet', 'HourglassNet', 'HourglassAENet', 'HRNet', 'MobileNetV2',
     'MobileNetV3', 'RegNet', 'ResNet', 'ResNetV1d', 'ResNeXt', 'SCNet',
     'SEResNet', 'SEResNeXt', 'ShuffleNetV1', 'ShuffleNetV2', 'CPM', 'RSN',
     'MSPN', 'ResNeSt', 'VGG', 'TCN', 'ViPNAS_ResNet', 'ViPNAS_MobileNetV3',
-    'LiteHRNet', 'V2VNet', 'HRFormer', 'ViT'
+    'LiteHRNet', 'V2VNet', 'HRFormer', 'ViT', 'ViTMoE'
 ]

mmpose/models/backbones/base_backbone.py (4 lines changed)

@@ -14,7 +14,7 @@ class BaseBackbone(nn.Module, metaclass=ABCMeta):
     inherits this class should at least define its own `forward` function.
     """
 
-    def init_weights(self, pretrained=None, patch_padding='pad'):
+    def init_weights(self, pretrained=None, patch_padding='pad', part_features=None):
         """Init backbone weights.
 
         Args:
@@ -25,7 +25,7 @@ class BaseBackbone(nn.Module, metaclass=ABCMeta):
         """
         if isinstance(pretrained, str):
             logger = logging.getLogger()
-            load_checkpoint(self, pretrained, strict=False, logger=logger, patch_padding=patch_padding)
+            load_checkpoint(self, pretrained, strict=False, logger=logger, patch_padding=patch_padding, part_features=part_features)
         elif pretrained is None:
             # use default initializer or customized initializer in subclasses
             pass

33
mmpose/models/backbones/vit.py

@ -12,6 +12,39 @@ from timm.models.layers import drop_path, to_2tuple, trunc_normal_
from ..builder import BACKBONES from ..builder import BACKBONES
from .base_backbone import BaseBackbone from .base_backbone import BaseBackbone
def get_abs_pos(abs_pos, h, w, ori_h, ori_w, has_cls_token=True):
"""
Calculate absolute positional embeddings. If needed, resize embeddings and remove cls_token
dimension for the original embeddings.
Args:
abs_pos (Tensor): absolute positional embeddings with (1, num_position, C).
h, w (int): size of the current input token grid.
ori_h, ori_w (int): size of the token grid the embeddings were trained on.
has_cls_token (bool): If true, has 1 embedding in abs_pos for cls token.
Returns:
Absolute positional embeddings after processing with shape (1, num_position, C).
"""
cls_token = None
B, L, C = abs_pos.shape
if has_cls_token:
cls_token = abs_pos[:, 0:1]
abs_pos = abs_pos[:, 1:]
if ori_h != h or ori_w != w:
new_abs_pos = F.interpolate(
abs_pos.reshape(1, ori_h, ori_w, -1).permute(0, 3, 1, 2),
size=(h, w),
mode="bicubic",
align_corners=False,
).permute(0, 2, 3, 1).reshape(B, -1, C)
else:
new_abs_pos = abs_pos
if cls_token is not None:
new_abs_pos = torch.cat([cls_token, new_abs_pos], dim=1)
return new_abs_pos
class DropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
"""

384
mmpose/models/backbones/vit_moe.py

@ -0,0 +1,384 @@
# Copyright (c) OpenMMLab. All rights reserved.
import math
import torch
from functools import partial
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from timm.models.layers import drop_path, to_2tuple, trunc_normal_
from ..builder import BACKBONES
from .base_backbone import BaseBackbone
def get_abs_pos(abs_pos, h, w, ori_h, ori_w, has_cls_token=True):
"""
Calculate absolute positional embeddings. If needed, resize embeddings and remove cls_token
dimension for the original embeddings.
Args:
abs_pos (Tensor): absolute positional embeddings with (1, num_position, C).
h, w (int): size of the current input token grid.
ori_h, ori_w (int): size of the token grid the embeddings were trained on.
has_cls_token (bool): If true, has 1 embedding in abs_pos for cls token.
Returns:
Absolute positional embeddings after processing with shape (1, num_position, C).
"""
cls_token = None
B, L, C = abs_pos.shape
if has_cls_token:
cls_token = abs_pos[:, 0:1]
abs_pos = abs_pos[:, 1:]
if ori_h != h or ori_w != w:
new_abs_pos = F.interpolate(
abs_pos.reshape(1, ori_h, ori_w, -1).permute(0, 3, 1, 2),
size=(h, w),
mode="bicubic",
align_corners=False,
).permute(0, 2, 3, 1).reshape(B, -1, C)
else:
new_abs_pos = abs_pos
if cls_token is not None:
new_abs_pos = torch.cat([cls_token, new_abs_pos], dim=1)
return new_abs_pos
class DropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
"""
def __init__(self, drop_prob=None):
super(DropPath, self).__init__()
self.drop_prob = drop_prob
def forward(self, x):
return drop_path(x, self.drop_prob, self.training)
def extra_repr(self):
return 'p={}'.format(self.drop_prob)
class Mlp(nn.Module):
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features)
self.drop = nn.Dropout(drop)
def forward(self, x):
x = self.fc1(x)
x = self.act(x)
x = self.fc2(x)
x = self.drop(x)
return x
class MoEMlp(nn.Module):
def __init__(self, num_expert=1, in_features=1024, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0., part_features=256):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.part_features = part_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features - part_features)
self.drop = nn.Dropout(drop)
self.num_expert = num_expert
experts = []
for i in range(num_expert):
experts.append(
nn.Linear(hidden_features, part_features)
)
self.experts = nn.ModuleList(experts)
def forward(self, x, indices):
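# x: (B, N, C) tokens; indices: per-sample dataset id selecting one expert per sample.
# The shared fc2 branch produces the first (out_features - part_features) channels;
# each expert produces the last part_features channels, masked to zero for samples
# whose dataset id does not match (all experts run on all samples, which keeps DDP happy).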
expert_x = torch.zeros_like(x[:, :, -self.part_features:], device=x.device, dtype=x.dtype)
x = self.fc1(x)
x = self.act(x)
shared_x = self.fc2(x)
indices = indices.view(-1, 1, 1)
# to support ddp training
for i in range(self.num_expert):
selectedIndex = (indices == i)
current_x = self.experts[i](x) * selectedIndex
expert_x = expert_x + current_x
x = torch.cat([shared_x, expert_x], dim=-1)
return x
class Attention(nn.Module):
def __init__(
self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0.,
proj_drop=0., attn_head_dim=None,):
super().__init__()
self.num_heads = num_heads
head_dim = dim // num_heads
self.dim = dim
if attn_head_dim is not None:
head_dim = attn_head_dim
all_head_dim = head_dim * self.num_heads
self.scale = qk_scale or head_dim ** -0.5
self.qkv = nn.Linear(dim, all_head_dim * 3, bias=qkv_bias)
self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(all_head_dim, dim)
self.proj_drop = nn.Dropout(proj_drop)
def forward(self, x):
B, N, C = x.shape
qkv = self.qkv(x)
qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple)
q = q * self.scale
attn = (q @ k.transpose(-2, -1))
attn = attn.softmax(dim=-1)
attn = self.attn_drop(attn)
x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
x = self.proj(x)
x = self.proj_drop(x)
return x
class Block(nn.Module):
def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None,
drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU,
norm_layer=nn.LayerNorm, attn_head_dim=None, num_expert=1, part_features=None
):
super().__init__()
self.norm1 = norm_layer(dim)
self.attn = Attention(
dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
attn_drop=attn_drop, proj_drop=drop, attn_head_dim=attn_head_dim
)
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
self.norm2 = norm_layer(dim)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = MoEMlp(num_expert=num_expert, in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop, part_features=part_features)
def forward(self, x, indices=None):
x = x + self.drop_path(self.attn(self.norm1(x)))
x = x + self.drop_path(self.mlp(self.norm2(x), indices))
return x
class PatchEmbed(nn.Module):
""" Image to Patch Embedding
"""
def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, ratio=1):
super().__init__()
img_size = to_2tuple(img_size)
patch_size = to_2tuple(patch_size)
num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) * (ratio ** 2)
self.patch_shape = (int(img_size[0] // patch_size[0] * ratio), int(img_size[1] // patch_size[1] * ratio))
self.origin_patch_shape = (int(img_size[0] // patch_size[0]), int(img_size[1] // patch_size[1]))
self.img_size = img_size
self.patch_size = patch_size
self.num_patches = num_patches
self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=(patch_size[0] // ratio), padding=4 + 2 * (ratio//2-1))
def forward(self, x, **kwargs):
B, C, H, W = x.shape
x = self.proj(x)
Hp, Wp = x.shape[2], x.shape[3]
x = x.flatten(2).transpose(1, 2)
return x, (Hp, Wp)
class HybridEmbed(nn.Module):
""" CNN Feature Map Embedding
Extract feature map from CNN, flatten, project to embedding dim.
"""
def __init__(self, backbone, img_size=224, feature_size=None, in_chans=3, embed_dim=768):
super().__init__()
assert isinstance(backbone, nn.Module)
img_size = to_2tuple(img_size)
self.img_size = img_size
self.backbone = backbone
if feature_size is None:
with torch.no_grad():
training = backbone.training
if training:
backbone.eval()
o = self.backbone(torch.zeros(1, in_chans, img_size[0], img_size[1]))[-1]
feature_size = o.shape[-2:]
feature_dim = o.shape[1]
backbone.train(training)
else:
feature_size = to_2tuple(feature_size)
feature_dim = self.backbone.feature_info.channels()[-1]
self.num_patches = feature_size[0] * feature_size[1]
self.proj = nn.Linear(feature_dim, embed_dim)
def forward(self, x):
x = self.backbone(x)[-1]
x = x.flatten(2).transpose(1, 2)
x = self.proj(x)
return x
@BACKBONES.register_module()
class ViTMoE(BaseBackbone):
def __init__(self,
img_size=224, patch_size=16, in_chans=3, num_classes=80, embed_dim=768, depth=12,
num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
drop_path_rate=0., hybrid_backbone=None, norm_layer=None, use_checkpoint=False,
frozen_stages=-1, ratio=1, last_norm=True,
patch_padding='pad', freeze_attn=False, freeze_ffn=False,
num_expert=1, part_features=None
):
# Protect mutable default arguments
super(ViTMoE, self).__init__()
norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
self.num_classes = num_classes
self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
self.frozen_stages = frozen_stages
self.use_checkpoint = use_checkpoint
self.patch_padding = patch_padding
self.freeze_attn = freeze_attn
self.freeze_ffn = freeze_ffn
self.depth = depth
if hybrid_backbone is not None:
self.patch_embed = HybridEmbed(
hybrid_backbone, img_size=img_size, in_chans=in_chans, embed_dim=embed_dim)
else:
self.patch_embed = PatchEmbed(
img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, ratio=ratio)
num_patches = self.patch_embed.num_patches
self.part_features = part_features
self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
self.blocks = nn.ModuleList([
Block(
dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer,
num_expert=num_expert, part_features=part_features
)
for i in range(depth)])
self.last_norm = norm_layer(embed_dim) if last_norm else nn.Identity()
if self.pos_embed is not None:
trunc_normal_(self.pos_embed, std=.02)
self._freeze_stages()
def _freeze_stages(self):
"""Freeze parameters."""
if self.frozen_stages >= 0:
self.patch_embed.eval()
for param in self.patch_embed.parameters():
param.requires_grad = False
for i in range(1, self.frozen_stages + 1):
m = self.blocks[i]
m.eval()
for param in m.parameters():
param.requires_grad = False
if self.freeze_attn:
for i in range(0, self.depth):
m = self.blocks[i]
m.attn.eval()
m.norm1.eval()
for param in m.attn.parameters():
param.requires_grad = False
for param in m.norm1.parameters():
param.requires_grad = False
if self.freeze_ffn:
self.pos_embed.requires_grad = False
self.patch_embed.eval()
for param in self.patch_embed.parameters():
param.requires_grad = False
for i in range(0, self.depth):
m = self.blocks[i]
m.mlp.eval()
m.norm2.eval()
for param in m.mlp.parameters():
param.requires_grad = False
for param in m.norm2.parameters():
param.requires_grad = False
def init_weights(self, pretrained=None):
"""Initialize the weights in backbone.
Args:
pretrained (str, optional): Path to pre-trained weights.
Defaults to None.
"""
super().init_weights(pretrained, patch_padding=self.patch_padding, part_features=self.part_features)
if pretrained is None:
def _init_weights(m):
if isinstance(m, nn.Linear):
trunc_normal_(m.weight, std=.02)
if isinstance(m, nn.Linear) and m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.LayerNorm):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)
self.apply(_init_weights)
def get_num_layers(self):
return len(self.blocks)
@torch.jit.ignore
def no_weight_decay(self):
return {'pos_embed', 'cls_token'}
def forward_features(self, x, dataset_source=None):
B, C, H, W = x.shape
x, (Hp, Wp) = self.patch_embed(x)
if self.pos_embed is not None:
# fit for multiple GPU training
# since the first element for pos embed (sin-cos manner) is zero, it will cause no difference
x = x + self.pos_embed[:, 1:] + self.pos_embed[:, :1]
for blk in self.blocks:
if self.use_checkpoint:
x = checkpoint.checkpoint(blk, x, dataset_source)
else:
x = blk(x, dataset_source)
x = self.last_norm(x)
xp = x.permute(0, 2, 1).reshape(B, -1, Hp, Wp).contiguous()
return xp
def forward(self, x, dataset_source=None):
x = self.forward_features(x, dataset_source)
return x
def train(self, mode=True):
"""Convert the model into training mode."""
super().train(mode)
self._freeze_stages()
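A rough sketch of running the backbone on its own; the expert count and `part_features` below are illustrative values, not necessarily the ones used by the ViTPose+ configs in this commit:

import torch
from mmpose.models.backbones.vit_moe import ViTMoE

backbone = ViTMoE(
    img_size=(256, 192), patch_size=16, embed_dim=768, depth=12, num_heads=12,
    mlp_ratio=4., qkv_bias=True, num_expert=6, part_features=192)
backbone.init_weights()                  # random init here; pass a checkpoint path to load weights
imgs = torch.randn(2, 3, 256, 192)
dataset_idx = torch.tensor([0, 3])       # per-sample dataset id, one expert per dataset
feats = backbone(imgs, dataset_idx)      # (2, 768, 16, 12) feature map for the keypoint head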

3
mmpose/models/detectors/__init__.py

@ -8,9 +8,10 @@ from .multiview_pose import (DetectAndRegress, VoxelCenterDetector,
from .pose_lifter import PoseLifter
from .posewarper import PoseWarper
from .top_down import TopDown
from .top_down_moe import TopDownMoE
__all__ = [
'TopDown', 'AssociativeEmbedding', 'ParametricMesh', 'MultiTask',
'PoseLifter', 'Interhand3D', 'PoseWarper', 'DetectAndRegress',
- 'VoxelCenterDetector', 'VoxelSinglePose'
+ 'VoxelCenterDetector', 'VoxelSinglePose', 'TopDownMoE'
]

351
mmpose/models/detectors/top_down_moe.py

@ -0,0 +1,351 @@
# Copyright (c) OpenMMLab. All rights reserved.
import warnings
import torch
import torch.nn as nn
import mmcv
import numpy as np
from mmcv.image import imwrite
from mmcv.utils.misc import deprecated_api_warning
from mmcv.visualization.image import imshow
from mmpose.core import imshow_bboxes, imshow_keypoints
from .. import builder
from ..builder import POSENETS
from .base import BasePose
try:
from mmcv.runner import auto_fp16
except ImportError:
warnings.warn('auto_fp16 from mmpose will be deprecated from v0.15.0'
'Please install mmcv>=1.1.4')
from mmpose.core import auto_fp16
@POSENETS.register_module()
class TopDownMoE(BasePose):
"""Top-down pose detectors.
Args:
backbone (dict): Backbone modules to extract feature.
keypoint_head (dict): Keypoint head to process feature.
train_cfg (dict): Config for training. Default: None.
test_cfg (dict): Config for testing. Default: None.
pretrained (str): Path to the pretrained models.
loss_pose (None): Deprecated arguments. Please use
`loss_keypoint` for heads instead.
"""
def __init__(self,
backbone,
neck=None,
keypoint_head=None,
associate_keypoint_head=None,
train_cfg=None,
test_cfg=None,
pretrained=None,
loss_pose=None):
super().__init__()
self.fp16_enabled = False
self.backbone = builder.build_backbone(backbone)
self.train_cfg = train_cfg
self.test_cfg = test_cfg
if neck is not None:
self.neck = builder.build_neck(neck)
if keypoint_head is not None:
keypoint_head['train_cfg'] = train_cfg
keypoint_head['test_cfg'] = test_cfg
if 'loss_keypoint' not in keypoint_head and loss_pose is not None:
warnings.warn(
'`loss_pose` for TopDown is deprecated, '
'use `loss_keypoint` for heads instead. See '
'https://github.com/open-mmlab/mmpose/pull/382'
' for more information.', DeprecationWarning)
keypoint_head['loss_keypoint'] = loss_pose
self.keypoint_head = builder.build_head(keypoint_head)
associate_keypoint_heads = []
keypoint_heads_cnt = 1
if associate_keypoint_head is not None:
if not isinstance(associate_keypoint_head, list):
associate_keypoint_head = [associate_keypoint_head]
for single_keypoint_head in associate_keypoint_head:
single_keypoint_head['train_cfg'] = train_cfg
single_keypoint_head['test_cfg'] = test_cfg
associate_keypoint_heads.append(builder.build_head(single_keypoint_head))
keypoint_heads_cnt += 1
self.associate_keypoint_heads = nn.ModuleList(associate_keypoint_heads)
self.keypoint_heads_cnt = keypoint_heads_cnt
self.init_weights(pretrained=pretrained)
@property
def with_neck(self):
"""Check if has neck."""
return hasattr(self, 'neck')
@property
def with_keypoint(self):
"""Check if has keypoint_head."""
return hasattr(self, 'keypoint_head')
def init_weights(self, pretrained=None):
"""Weight initialization for model."""
self.backbone.init_weights(pretrained)
if self.with_neck:
self.neck.init_weights()
if self.with_keypoint:
self.keypoint_head.init_weights()
for item in self.associate_keypoint_heads:
item.init_weights()
@auto_fp16(apply_to=('img', ))
def forward(self,
img,
target=None,
target_weight=None,
img_metas=None,
return_loss=True,
return_heatmap=False,
**kwargs):
"""Calls either forward_train or forward_test depending on whether
return_loss=True. Note this setting will change the expected inputs.
When `return_loss=True`, img and img_meta are single-nested (i.e.
Tensor and List[dict]), and when `return_loss=False`, img and img_meta
should be double nested (i.e. List[Tensor], List[List[dict]]), with
the outer list indicating test time augmentations.
Note:
- batch_size: N
- num_keypoints: K
- num_img_channel: C (Default: 3)
- img height: imgH
- img width: imgW
- heatmaps height: H
- heatmaps width: W
Args:
img (torch.Tensor[NxCximgHximgW]): Input images.
target (torch.Tensor[NxKxHxW]): Target heatmaps.
target_weight (torch.Tensor[NxKx1]): Weights across
different joint types.
img_metas (list(dict)): Information about data augmentation
By default this includes:
- "image_file: path to the image file
- "center": center of the bbox
- "scale": scale of the bbox
- "rotation": rotation of the bbox
- "bbox_score": score of bbox
return_loss (bool): Option to `return loss`. `return loss=True`
for training, `return loss=False` for validation & test.
return_heatmap (bool) : Option to return heatmap.
Returns:
dict|tuple: if `return loss` is true, then return losses. \
Otherwise, return predicted poses, boxes, image paths \
and heatmaps.
"""
if return_loss:
return self.forward_train(img, target, target_weight, img_metas,
**kwargs)
return self.forward_test(
img, img_metas, return_heatmap=return_heatmap, **kwargs)
def forward_train(self, img, target, target_weight, img_metas, **kwargs):
"""Defines the computation performed at every call when training."""
img_sources = torch.from_numpy(np.array([ele['dataset_idx'] for ele in img_metas])).to(img.device)
output = self.backbone(img, img_sources)
if self.with_neck:
output = self.neck(output)
# if return loss
losses = dict()
main_stream_select = (img_sources == 0)
# if torch.sum(main_stream_select) > 0:
output_select = self.keypoint_head(output)
target_select = target * main_stream_select.view(-1, 1, 1, 1)
target_weight_select = target_weight * main_stream_select.view(-1, 1, 1)
keypoint_losses = self.keypoint_head.get_loss(
output_select, target_select, target_weight_select)
losses['main_stream_loss'] = keypoint_losses['heatmap_loss']
keypoint_accuracy = self.keypoint_head.get_accuracy(
output_select, target_select, target_weight_select)
losses['main_stream_acc'] = keypoint_accuracy['acc_pose']
for idx in range(1, self.keypoint_heads_cnt):
idx_select = (img_sources == idx)
target_select = target * idx_select.view(-1, 1, 1, 1)
target_weight_select = target_weight * idx_select.view(-1, 1, 1)
output_select = self.associate_keypoint_heads[idx - 1](output)
keypoint_losses = self.associate_keypoint_heads[idx - 1].get_loss(
output_select, target_select, target_weight_select)
losses[f'{idx}_loss'] = keypoint_losses['heatmap_loss']
keypoint_accuracy = self.associate_keypoint_heads[idx - 1].get_accuracy(
output_select, target_select, target_weight_select)
losses[f'{idx}_acc'] = keypoint_accuracy['acc_pose']
return losses
def forward_test(self, img, img_metas, return_heatmap=False, **kwargs):
"""Defines the computation performed at every call when testing."""
assert img.size(0) == len(img_metas)
batch_size, _, img_height, img_width = img.shape
if batch_size > 1:
assert 'bbox_id' in img_metas[0]
result = {}
img_sources = torch.from_numpy(np.array([ele['dataset_idx'] for ele in img_metas])).to(img.device)
features = self.backbone(img, img_sources)
if self.with_neck:
features = self.neck(features)
if self.with_keypoint:
output_heatmap = self.keypoint_head.inference_model(
features, flip_pairs=None)
if self.test_cfg.get('flip_test', True):
img_flipped = img.flip(3)
features_flipped = self.backbone(img_flipped, img_sources)
if self.with_neck:
features_flipped = self.neck(features_flipped)
if self.with_keypoint:
output_flipped_heatmap = self.keypoint_head.inference_model(
features_flipped, img_metas[0]['flip_pairs'])
output_heatmap = (output_heatmap +
output_flipped_heatmap) * 0.5
if self.with_keypoint:
keypoint_result = self.keypoint_head.decode(
img_metas, output_heatmap, img_size=[img_width, img_height])
result.update(keypoint_result)
if not return_heatmap:
output_heatmap = None
result['output_heatmap'] = output_heatmap
return result
def forward_dummy(self, img):
"""Used for computing network FLOPs.
See ``tools/get_flops.py``.
Args:
img (torch.Tensor): Input image.
Returns:
Tensor: Output heatmaps.
"""
output = self.backbone(img)
if self.with_neck:
output = self.neck(output)
if self.with_keypoint:
output = self.keypoint_head(output)
return output
@deprecated_api_warning({'pose_limb_color': 'pose_link_color'},
cls_name='TopDown')
def show_result(self,
img,
result,
skeleton=None,
kpt_score_thr=0.3,
bbox_color='green',
pose_kpt_color=None,
pose_link_color=None,
text_color='white',
radius=4,
thickness=1,
font_scale=0.5,
bbox_thickness=1,
win_name='',
show=False,
show_keypoint_weight=False,
wait_time=0,
out_file=None):
"""Draw `result` over `img`.
Args:
img (str or Tensor): The image to be displayed.
result (list[dict]): The results to draw over `img`
(bbox_result, pose_result).
skeleton (list[list]): The connection of keypoints.
skeleton is 0-based indexing.
kpt_score_thr (float, optional): Minimum score of keypoints
to be shown. Default: 0.3.
bbox_color (str or tuple or :obj:`Color`): Color of bbox lines.
pose_kpt_color (np.array[Nx3]`): Color of N keypoints.
If None, do not draw keypoints.
pose_link_color (np.array[Mx3]): Color of M links.
If None, do not draw links.
text_color (str or tuple or :obj:`Color`): Color of texts.
radius (int): Radius of circles.
thickness (int): Thickness of lines.
font_scale (float): Font scales of texts.
win_name (str): The window name.
show (bool): Whether to show the image. Default: False.
show_keypoint_weight (bool): Whether to change the transparency
using the predicted confidence scores of keypoints.
wait_time (int): Value of waitKey param.
Default: 0.
out_file (str or None): The filename to write the image.
Default: None.
Returns:
Tensor: Visualized img, only if not `show` or `out_file`.
"""
img = mmcv.imread(img)
img = img.copy()
bbox_result = []
bbox_labels = []
pose_result = []
for res in result:
if 'bbox' in res:
bbox_result.append(res['bbox'])
bbox_labels.append(res.get('label', None))
pose_result.append(res['keypoints'])
if bbox_result:
bboxes = np.vstack(bbox_result)
# draw bounding boxes
imshow_bboxes(
img,
bboxes,
labels=bbox_labels,
colors=bbox_color,
text_color=text_color,
thickness=bbox_thickness,
font_scale=font_scale,
show=False)
if pose_result:
imshow_keypoints(img, pose_result, skeleton, kpt_score_thr,
pose_kpt_color, pose_link_color, radius,
thickness)
if show:
imshow(img, win_name, wait_time)
if out_file is not None:
imwrite(img, out_file)
return img
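For orientation, a hypothetical model dict wiring these pieces together might look like the following; it is not taken from the configs in this commit, and the head types, joint counts, and expert settings are illustrative:

model = dict(
    type='TopDownMoE',
    pretrained=None,
    backbone=dict(
        type='ViTMoE', img_size=(256, 192), patch_size=16, embed_dim=768,
        depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True,
        num_expert=6, part_features=192),
    keypoint_head=dict(
        type='TopdownHeatmapSimpleHead', in_channels=768, out_channels=17,
        loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
    associate_keypoint_head=[
        dict(type='TopdownHeatmapSimpleHead', in_channels=768, out_channels=14,
             loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
    ],
    train_cfg=dict(),
    test_cfg=dict(flip_test=True))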

2
tools/train.py

@ -17,7 +17,7 @@ from mmpose.apis import init_random_seed, train_model
from mmpose.datasets import build_dataset
from mmpose.models import build_posenet
from mmpose.utils import collect_env, get_root_logger, setup_multi_processes
import mmcv_custom
def parse_args():
parser = argparse.ArgumentParser(description='Train a pose model')
