
update MoE code for vitpose+

Yufei, 3 years ago
parent commit 98fb7f859f
18 changed files (number of changed lines in parentheses):

  1. configs/_base_/datasets/aic_info.py (140)
  2. configs/_base_/datasets/ap10k_info.py (142)
  3. configs/_base_/datasets/coco_wholebody_info.py (1154)
  4. configs/_base_/datasets/mpii_info.py (155)
  5. configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/ViTPose_large_coco_256x192.py (2)
  6. configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/vitPose+_base_coco+aic+mpii+ap10k+apt36k+wholebody_256x192_udp.py (500)
  7. configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/vitPose+_huge_coco+aic+mpii+ap10k+apt36k+wholebody_256x192_udp.py (500)
  8. configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/vitPose+_large_coco+aic+mpii+ap10k+apt36k+wholebody_256x192_udp.py (500)
  9. mmcv_custom/checkpoint.py (15)
  10. mmpose/datasets/datasets/base/kpt_2d_sview_rgb_img_top_down_dataset.py (3)
  11. mmpose/datasets/pipelines/top_down_transform.py (8)
  12. mmpose/models/backbones/__init__.py (3)
  13. mmpose/models/backbones/base_backbone.py (4)
  14. mmpose/models/backbones/vit.py (33)
  15. mmpose/models/backbones/vit_moe.py (384)
  16. mmpose/models/detectors/__init__.py (3)
  17. mmpose/models/detectors/top_down_moe.py (351)
  18. tools/train.py (2)

140
configs/_base_/datasets/aic_info.py

@@ -0,0 +1,140 @@
aic_info = dict(
dataset_name='aic',
paper_info=dict(
author='Wu, Jiahong and Zheng, He and Zhao, Bo and '
'Li, Yixin and Yan, Baoming and Liang, Rui and '
'Wang, Wenjia and Zhou, Shipei and Lin, Guosen and '
'Fu, Yanwei and others',
title='Ai challenger: A large-scale dataset for going '
'deeper in image understanding',
container='arXiv',
year='2017',
homepage='https://github.com/AIChallenger/AI_Challenger_2017',
),
keypoint_info={
0:
dict(
name='right_shoulder',
id=0,
color=[255, 128, 0],
type='upper',
swap='left_shoulder'),
1:
dict(
name='right_elbow',
id=1,
color=[255, 128, 0],
type='upper',
swap='left_elbow'),
2:
dict(
name='right_wrist',
id=2,
color=[255, 128, 0],
type='upper',
swap='left_wrist'),
3:
dict(
name='left_shoulder',
id=3,
color=[0, 255, 0],
type='upper',
swap='right_shoulder'),
4:
dict(
name='left_elbow',
id=4,
color=[0, 255, 0],
type='upper',
swap='right_elbow'),
5:
dict(
name='left_wrist',
id=5,
color=[0, 255, 0],
type='upper',
swap='right_wrist'),
6:
dict(
name='right_hip',
id=6,
color=[255, 128, 0],
type='lower',
swap='left_hip'),
7:
dict(
name='right_knee',
id=7,
color=[255, 128, 0],
type='lower',
swap='left_knee'),
8:
dict(
name='right_ankle',
id=8,
color=[255, 128, 0],
type='lower',
swap='left_ankle'),
9:
dict(
name='left_hip',
id=9,
color=[0, 255, 0],
type='lower',
swap='right_hip'),
10:
dict(
name='left_knee',
id=10,
color=[0, 255, 0],
type='lower',
swap='right_knee'),
11:
dict(
name='left_ankle',
id=11,
color=[0, 255, 0],
type='lower',
swap='right_ankle'),
12:
dict(
name='head_top',
id=12,
color=[51, 153, 255],
type='upper',
swap=''),
13:
dict(name='neck', id=13, color=[51, 153, 255], type='upper', swap='')
},
skeleton_info={
0:
dict(link=('right_wrist', 'right_elbow'), id=0, color=[255, 128, 0]),
1: dict(
link=('right_elbow', 'right_shoulder'), id=1, color=[255, 128, 0]),
2: dict(link=('right_shoulder', 'neck'), id=2, color=[51, 153, 255]),
3: dict(link=('neck', 'left_shoulder'), id=3, color=[51, 153, 255]),
4: dict(link=('left_shoulder', 'left_elbow'), id=4, color=[0, 255, 0]),
5: dict(link=('left_elbow', 'left_wrist'), id=5, color=[0, 255, 0]),
6: dict(link=('right_ankle', 'right_knee'), id=6, color=[255, 128, 0]),
7: dict(link=('right_knee', 'right_hip'), id=7, color=[255, 128, 0]),
8: dict(link=('right_hip', 'left_hip'), id=8, color=[51, 153, 255]),
9: dict(link=('left_hip', 'left_knee'), id=9, color=[0, 255, 0]),
10: dict(link=('left_knee', 'left_ankle'), id=10, color=[0, 255, 0]),
11: dict(link=('head_top', 'neck'), id=11, color=[51, 153, 255]),
12: dict(
link=('right_shoulder', 'right_hip'), id=12, color=[51, 153, 255]),
13:
dict(link=('left_shoulder', 'left_hip'), id=13, color=[51, 153, 255])
},
joint_weights=[
1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1.
],
# 'https://github.com/AIChallenger/AI_Challenger_2017/blob/master/'
# 'Evaluation/keypoint_eval/keypoint_eval.py#L50'
# delta = 2 x sigma
sigmas=[
0.01388152, 0.01515228, 0.01057665, 0.01417709, 0.01497891, 0.01402144,
0.03909642, 0.03686941, 0.01981803, 0.03843971, 0.03412318, 0.02415081,
0.01291456, 0.01236173
])
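
The sigmas above are half of the per-keypoint delta values hard-coded in the AI Challenger evaluation script referenced in the comment (delta = 2 x sigma). For orientation only, a minimal sketch of how such per-keypoint sigmas enter a COCO-style OKS score; this is an illustrative standalone function, not the AIC evaluation code and not part of this commit:

import numpy as np

def oks_similarity(pred, gt, visible, area, sigmas):
    # COCO convention: per-keypoint tolerance k_i = 2 * sigma_i.
    vars_ = (2.0 * np.asarray(sigmas)) ** 2
    d2 = np.sum((pred - gt) ** 2, axis=-1)            # squared keypoint distances, shape (K,)
    e = d2 / (2.0 * vars_ * (area + np.spacing(1)))   # normalised per-keypoint error
    vis = np.asarray(visible) > 0
    return float(np.mean(np.exp(-e[vis]))) if vis.any() else 0.0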

142
configs/_base_/datasets/ap10k_info.py

@@ -0,0 +1,142 @@
ap10k_info = dict(
dataset_name='ap10k',
paper_info=dict(
author='Yu, Hang and Xu, Yufei and Zhang, Jing and '
'Zhao, Wei and Guan, Ziyu and Tao, Dacheng',
title='AP-10K: A Benchmark for Animal Pose Estimation in the Wild',
container='35th Conference on Neural Information Processing Systems '
'(NeurIPS 2021) Track on Datasets and Benchmarks.',
year='2021',
homepage='https://github.com/AlexTheBad/AP-10K',
),
keypoint_info={
0:
dict(
name='L_Eye', id=0, color=[0, 255, 0], type='upper', swap='R_Eye'),
1:
dict(
name='R_Eye',
id=1,
color=[255, 128, 0],
type='upper',
swap='L_Eye'),
2:
dict(name='Nose', id=2, color=[51, 153, 255], type='upper', swap=''),
3:
dict(name='Neck', id=3, color=[51, 153, 255], type='upper', swap=''),
4:
dict(
name='Root of tail',
id=4,
color=[51, 153, 255],
type='lower',
swap=''),
5:
dict(
name='L_Shoulder',
id=5,
color=[51, 153, 255],
type='upper',
swap='R_Shoulder'),
6:
dict(
name='L_Elbow',
id=6,
color=[51, 153, 255],
type='upper',
swap='R_Elbow'),
7:
dict(
name='L_F_Paw',
id=7,
color=[0, 255, 0],
type='upper',
swap='R_F_Paw'),
8:
dict(
name='R_Shoulder',
id=8,
color=[0, 255, 0],
type='upper',
swap='L_Shoulder'),
9:
dict(
name='R_Elbow',
id=9,
color=[255, 128, 0],
type='upper',
swap='L_Elbow'),
10:
dict(
name='R_F_Paw',
id=10,
color=[0, 255, 0],
type='lower',
swap='L_F_Paw'),
11:
dict(
name='L_Hip',
id=11,
color=[255, 128, 0],
type='lower',
swap='R_Hip'),
12:
dict(
name='L_Knee',
id=12,
color=[255, 128, 0],
type='lower',
swap='R_Knee'),
13:
dict(
name='L_B_Paw',
id=13,
color=[0, 255, 0],
type='lower',
swap='R_B_Paw'),
14:
dict(
name='R_Hip', id=14, color=[0, 255, 0], type='lower',
swap='L_Hip'),
15:
dict(
name='R_Knee',
id=15,
color=[0, 255, 0],
type='lower',
swap='L_Knee'),
16:
dict(
name='R_B_Paw',
id=16,
color=[0, 255, 0],
type='lower',
swap='L_B_Paw'),
},
skeleton_info={
0: dict(link=('L_Eye', 'R_Eye'), id=0, color=[0, 0, 255]),
1: dict(link=('L_Eye', 'Nose'), id=1, color=[0, 0, 255]),
2: dict(link=('R_Eye', 'Nose'), id=2, color=[0, 0, 255]),
3: dict(link=('Nose', 'Neck'), id=3, color=[0, 255, 0]),
4: dict(link=('Neck', 'Root of tail'), id=4, color=[0, 255, 0]),
5: dict(link=('Neck', 'L_Shoulder'), id=5, color=[0, 255, 255]),
6: dict(link=('L_Shoulder', 'L_Elbow'), id=6, color=[0, 255, 255]),
7: dict(link=('L_Elbow', 'L_F_Paw'), id=7, color=[0, 255, 255]),
8: dict(link=('Neck', 'R_Shoulder'), id=8, color=[6, 156, 250]),
9: dict(link=('R_Shoulder', 'R_Elbow'), id=9, color=[6, 156, 250]),
10: dict(link=('R_Elbow', 'R_F_Paw'), id=10, color=[6, 156, 250]),
11: dict(link=('Root of tail', 'L_Hip'), id=11, color=[0, 255, 255]),
12: dict(link=('L_Hip', 'L_Knee'), id=12, color=[0, 255, 255]),
13: dict(link=('L_Knee', 'L_B_Paw'), id=13, color=[0, 255, 255]),
14: dict(link=('Root of tail', 'R_Hip'), id=14, color=[6, 156, 250]),
15: dict(link=('R_Hip', 'R_Knee'), id=15, color=[6, 156, 250]),
16: dict(link=('R_Knee', 'R_B_Paw'), id=16, color=[6, 156, 250]),
},
joint_weights=[
1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5,
1.5
],
sigmas=[
0.025, 0.025, 0.026, 0.035, 0.035, 0.079, 0.072, 0.062, 0.079, 0.072,
0.062, 0.107, 0.087, 0.089, 0.107, 0.087, 0.089
])

1154
configs/_base_/datasets/coco_wholebody_info.py

File diff suppressed because it is too large

155
configs/_base_/datasets/mpii_info.py

@@ -0,0 +1,155 @@
mpii_info = dict(
dataset_name='mpii',
paper_info=dict(
author='Mykhaylo Andriluka and Leonid Pishchulin and '
'Peter Gehler and Schiele, Bernt',
title='2D Human Pose Estimation: New Benchmark and '
'State of the Art Analysis',
container='IEEE Conference on Computer Vision and '
'Pattern Recognition (CVPR)',
year='2014',
homepage='http://human-pose.mpi-inf.mpg.de/',
),
keypoint_info={
0:
dict(
name='right_ankle',
id=0,
color=[255, 128, 0],
type='lower',
swap='left_ankle'),
1:
dict(
name='right_knee',
id=1,
color=[255, 128, 0],
type='lower',
swap='left_knee'),
2:
dict(
name='right_hip',
id=2,
color=[255, 128, 0],
type='lower',
swap='left_hip'),
3:
dict(
name='left_hip',
id=3,
color=[0, 255, 0],
type='lower',
swap='right_hip'),
4:
dict(
name='left_knee',
id=4,
color=[0, 255, 0],
type='lower',
swap='right_knee'),
5:
dict(
name='left_ankle',
id=5,
color=[0, 255, 0],
type='lower',
swap='right_ankle'),
6:
dict(name='pelvis', id=6, color=[51, 153, 255], type='lower', swap=''),
7:
dict(name='thorax', id=7, color=[51, 153, 255], type='upper', swap=''),
8:
dict(
name='upper_neck',
id=8,
color=[51, 153, 255],
type='upper',
swap=''),
9:
dict(
name='head_top', id=9, color=[51, 153, 255], type='upper',
swap=''),
10:
dict(
name='right_wrist',
id=10,
color=[255, 128, 0],
type='upper',
swap='left_wrist'),
11:
dict(
name='right_elbow',
id=11,
color=[255, 128, 0],
type='upper',
swap='left_elbow'),
12:
dict(
name='right_shoulder',
id=12,
color=[255, 128, 0],
type='upper',
swap='left_shoulder'),
13:
dict(
name='left_shoulder',
id=13,
color=[0, 255, 0],
type='upper',
swap='right_shoulder'),
14:
dict(
name='left_elbow',
id=14,
color=[0, 255, 0],
type='upper',
swap='right_elbow'),
15:
dict(
name='left_wrist',
id=15,
color=[0, 255, 0],
type='upper',
swap='right_wrist')
},
skeleton_info={
0:
dict(link=('right_ankle', 'right_knee'), id=0, color=[255, 128, 0]),
1:
dict(link=('right_knee', 'right_hip'), id=1, color=[255, 128, 0]),
2:
dict(link=('right_hip', 'pelvis'), id=2, color=[255, 128, 0]),
3:
dict(link=('pelvis', 'left_hip'), id=3, color=[0, 255, 0]),
4:
dict(link=('left_hip', 'left_knee'), id=4, color=[0, 255, 0]),
5:
dict(link=('left_knee', 'left_ankle'), id=5, color=[0, 255, 0]),
6:
dict(link=('pelvis', 'thorax'), id=6, color=[51, 153, 255]),
7:
dict(link=('thorax', 'upper_neck'), id=7, color=[51, 153, 255]),
8:
dict(link=('upper_neck', 'head_top'), id=8, color=[51, 153, 255]),
9:
dict(link=('upper_neck', 'right_shoulder'), id=9, color=[255, 128, 0]),
10:
dict(
link=('right_shoulder', 'right_elbow'), id=10, color=[255, 128,
0]),
11:
dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]),
12:
dict(link=('upper_neck', 'left_shoulder'), id=12, color=[0, 255, 0]),
13:
dict(link=('left_shoulder', 'left_elbow'), id=13, color=[0, 255, 0]),
14:
dict(link=('left_elbow', 'left_wrist'), id=14, color=[0, 255, 0])
},
joint_weights=[
1.5, 1.2, 1., 1., 1.2, 1.5, 1., 1., 1., 1., 1.5, 1.2, 1., 1., 1.2, 1.5
],
# Adapted from COCO dataset.
sigmas=[
0.089, 0.083, 0.107, 0.107, 0.083, 0.089, 0.026, 0.026, 0.026, 0.026,
0.062, 0.072, 0.179, 0.179, 0.072, 0.062
])

2
configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/ViTPose_large_coco_256x192.py

@@ -7,7 +7,7 @@ evaluation = dict(interval=10, metric='mAP', save_best='AP')
optimizer = dict(type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1,
constructor='LayerDecayOptimizerConstructor',
paramwise_cfg=dict(
num_layers=16,
num_layers=24,
layer_decay_rate=0.8,
custom_keys={
'bias': dict(decay_mult=0.),

500
configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/vitPose+_base_coco+aic+mpii+ap10k+apt36k+wholebody_256x192_udp.py

@@ -0,0 +1,500 @@
_base_ = [
'../../../../_base_/default_runtime.py',
'../../../../_base_/datasets/coco.py',
'../../../../_base_/datasets/aic_info.py',
'../../../../_base_/datasets/mpii_info.py',
'../../../../_base_/datasets/ap10k_info.py',
'../../../../_base_/datasets/coco_wholebody_info.py'
]
evaluation = dict(interval=10, metric='mAP', save_best='AP')
optimizer = dict(type='AdamW', lr=1e-3, betas=(0.9, 0.999), weight_decay=0.1,
constructor='LayerDecayOptimizerConstructor',
paramwise_cfg=dict(
num_layers=12,
layer_decay_rate=0.75,
custom_keys={
'bias': dict(decay_mult=0.),
'pos_embed': dict(decay_mult=0.),
'relative_position_bias_table': dict(decay_mult=0.),
'norm': dict(decay_mult=0.)
}
)
)
optimizer_config = dict(grad_clip=dict(max_norm=1., norm_type=2))
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=0.001,
step=[170, 200])
total_epochs = 210
target_type = 'GaussianHeatmap'
channel_cfg = dict(
num_output_channels=17,
dataset_joints=17,
dataset_channel=[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
],
inference_channel=[
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
])
aic_channel_cfg = dict(
num_output_channels=14,
dataset_joints=14,
dataset_channel=[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
],
inference_channel=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
mpii_channel_cfg = dict(
num_output_channels=16,
dataset_joints=16,
dataset_channel=list(range(16)),
inference_channel=list(range(16)))
crowdpose_channel_cfg = dict(
num_output_channels=14,
dataset_joints=14,
dataset_channel=[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
],
inference_channel=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
ap10k_channel_cfg = dict(
num_output_channels=17,
dataset_joints=17,
dataset_channel=[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
],
inference_channel=[
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
])
cocowholebody_channel_cfg = dict(
num_output_channels=133,
dataset_joints=133,
dataset_channel=[
list(range(133)),
],
inference_channel=list(range(133)))
# model settings
model = dict(
type='TopDownMoE',
pretrained=None,
backbone=dict(
type='ViTMoE',
img_size=(256, 192),
patch_size=16,
embed_dim=768,
depth=12,
num_heads=12,
ratio=1,
use_checkpoint=False,
mlp_ratio=4,
qkv_bias=True,
drop_path_rate=0.3,
num_expert=6,
part_features=192
),
keypoint_head=dict(
type='TopdownHeatmapSimpleHead',
in_channels=768,
num_deconv_layers=2,
num_deconv_filters=(256, 256),
num_deconv_kernels=(4, 4),
extra=dict(final_conv_kernel=1, ),
out_channels=channel_cfg['num_output_channels'],
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
associate_keypoint_head=[
dict(
type='TopdownHeatmapSimpleHead',
in_channels=768,
num_deconv_layers=2,
num_deconv_filters=(256, 256),
num_deconv_kernels=(4, 4),
extra=dict(final_conv_kernel=1, ),
out_channels=aic_channel_cfg['num_output_channels'],
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
dict(
type='TopdownHeatmapSimpleHead',
in_channels=768,
num_deconv_layers=2,
num_deconv_filters=(256, 256),
num_deconv_kernels=(4, 4),
extra=dict(final_conv_kernel=1, ),
out_channels=mpii_channel_cfg['num_output_channels'],
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
dict(
type='TopdownHeatmapSimpleHead',
in_channels=768,
num_deconv_layers=2,
num_deconv_filters=(256, 256),
num_deconv_kernels=(4, 4),
extra=dict(final_conv_kernel=1, ),
out_channels=crowdpose_channel_cfg['num_output_channels'],
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
dict(
type='TopdownHeatmapSimpleHead',
in_channels=768,
num_deconv_layers=2,
num_deconv_filters=(256, 256),
num_deconv_kernels=(4, 4),
extra=dict(final_conv_kernel=1, ),
out_channels=ap10k_channel_cfg['num_output_channels'],
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
dict(
type='TopdownHeatmapSimpleHead',
in_channels=768,
num_deconv_layers=2,
num_deconv_filters=(256, 256),
num_deconv_kernels=(4, 4),
extra=dict(final_conv_kernel=1, ),
out_channels=ap10k_channel_cfg['num_output_channels'],
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
dict(
type='TopdownHeatmapSimpleHead',
in_channels=768,
num_deconv_layers=2,
num_deconv_filters=(256, 256),
num_deconv_kernels=(4, 4),
extra=dict(final_conv_kernel=1, ),
out_channels=cocowholebody_channel_cfg['num_output_channels'],
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
],
train_cfg=dict(),
test_cfg=dict(
flip_test=True,
post_process='default',
shift_heatmap=False,
target_type=target_type,
modulate_kernel=11,
use_udp=True))
data_cfg = dict(
image_size=[192, 256],
heatmap_size=[48, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=False,
det_bbox_thr=0.0,
bbox_file='data/coco/person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
max_num_joints=133,
dataset_idx=0,
)
aic_data_cfg = dict(
image_size=[192, 256],
heatmap_size=[48, 64],
num_output_channels=aic_channel_cfg['num_output_channels'],
num_joints=aic_channel_cfg['dataset_joints'],
dataset_channel=aic_channel_cfg['dataset_channel'],
inference_channel=aic_channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=True,
det_bbox_thr=0.0,
bbox_file='data/coco/person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
max_num_joints=133,
dataset_idx=1,
)
mpii_data_cfg = dict(
image_size=[192, 256],
heatmap_size=[48, 64],
num_output_channels=mpii_channel_cfg['num_output_channels'],
num_joints=mpii_channel_cfg['dataset_joints'],
dataset_channel=mpii_channel_cfg['dataset_channel'],
inference_channel=mpii_channel_cfg['inference_channel'],
max_num_joints=133,
dataset_idx=2,
use_gt_bbox=True,
bbox_file=None,
)
ap10k_data_cfg = dict(
image_size=[192, 256],
heatmap_size=[48, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=True,
det_bbox_thr=0.0,
bbox_file='',
max_num_joints=133,
dataset_idx=3,
)
ap36k_data_cfg = dict(
image_size=[192, 256],
heatmap_size=[48, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=True,
det_bbox_thr=0.0,
bbox_file='',
max_num_joints=133,
dataset_idx=4,
)
cocowholebody_data_cfg = dict(
image_size=[192, 256],
heatmap_size=[48, 64],
num_output_channels=cocowholebody_channel_cfg['num_output_channels'],
num_joints=cocowholebody_channel_cfg['dataset_joints'],
dataset_channel=cocowholebody_channel_cfg['dataset_channel'],
inference_channel=cocowholebody_channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=False,
det_bbox_thr=0.0,
bbox_file='data/coco/person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
dataset_idx=5,
max_num_joints=133,
)
cocowholebody_train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownHalfBodyTransform',
num_joints_half_body=8,
prob_half_body=0.3),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTarget', sigma=2),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx'
]),
]
ap10k_train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownHalfBodyTransform',
num_joints_half_body=8,
prob_half_body=0.3),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTarget', sigma=2),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx'
]),
]
aic_train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownHalfBodyTransform',
num_joints_half_body=8,
prob_half_body=0.3),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTarget', sigma=2),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx'
]),
]
mpii_train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine', use_udp=True),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(
type='TopDownGenerateTarget',
sigma=2,
encoding='UDP',
target_type=target_type),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'flip_pairs', 'dataset_idx'
]),
]
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownHalfBodyTransform',
num_joints_half_body=8,
prob_half_body=0.3),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine', use_udp=True),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(
type='TopDownGenerateTarget',
sigma=2,
encoding='UDP',
target_type=target_type),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx'
]),
]
val_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownAffine', use_udp=True),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(
type='Collect',
keys=['img'],
meta_keys=[
'image_file', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs', 'dataset_idx'
]),
]
test_pipeline = val_pipeline
data_root = 'data/coco'
aic_data_root = 'data/aic'
mpii_data_root = 'data/mpii'
ap10k_data_root = 'data/ap10k'
ap36k_data_root = 'data/ap36k'
data = dict(
samples_per_gpu=128,
workers_per_gpu=8,
val_dataloader=dict(samples_per_gpu=64),
test_dataloader=dict(samples_per_gpu=64),
train=[
dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
img_prefix=f'{data_root}/train2017/',
data_cfg=data_cfg,
pipeline=train_pipeline,
dataset_info={{_base_.dataset_info}}),
dict(
type='TopDownAicDataset',
ann_file=f'{aic_data_root}/annotations/person_keypoints_train2017.json',
img_prefix=f'{aic_data_root}/ai_challenger_keypoint_train_20170909/'
'keypoint_train_images_20170902/',
data_cfg=aic_data_cfg,
pipeline=aic_train_pipeline,
dataset_info={{_base_.aic_info}}),
dict(
type='TopDownMpiiDataset',
ann_file=f'{mpii_data_root}/annotations/mpii_train.json',
img_prefix=f'{mpii_data_root}/images/',
data_cfg=mpii_data_cfg,
pipeline=mpii_train_pipeline,
dataset_info={{_base_.mpii_info}}),
dict(
type='AnimalAP10KDataset',
ann_file=f'{ap10k_data_root}/annotations/ap10k-train-split1.json',
img_prefix=f'{ap10k_data_root}/data/',
data_cfg=ap10k_data_cfg,
pipeline=ap10k_train_pipeline,
dataset_info={{_base_.ap10k_info}}),
dict(
type='AnimalAP10KDataset',
ann_file=f'{ap36k_data_root}/annotations/train_annotations_1.json',
img_prefix=f'{ap36k_data_root}/',
data_cfg=ap36k_data_cfg,
pipeline=ap10k_train_pipeline,
dataset_info={{_base_.ap10k_info}}),
dict(
type='TopDownCocoWholeBodyDataset',
ann_file=f'{data_root}/annotations/coco_wholebody_train_v1.0.json',
img_prefix=f'{data_root}/train2017/',
data_cfg=cocowholebody_data_cfg,
pipeline=cocowholebody_train_pipeline,
dataset_info={{_base_.cocowholebody_info}}),
],
val=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=val_pipeline,
dataset_info={{_base_.dataset_info}}),
test=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=test_pipeline,
dataset_info={{_base_.dataset_info}}),
)
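
Under the backbone settings above (embed_dim=768, mlp_ratio=4, num_expert=6, part_features=192), every FFN in the ViTMoE backbone keeps a shared fc2 producing 768 - 192 = 576 channels plus six expert projections of 192 channels each, and a sample's dataset_idx (set in each data_cfg and carried through the Collect meta_keys) selects which expert fills the last 192 channels; see mmpose/models/backbones/vit_moe.py further down. A shape-only sketch of that split, standalone and not the repo code:

import torch
import torch.nn as nn
import torch.nn.functional as F

embed_dim, mlp_ratio, part, n_expert = 768, 4, 192, 6    # ViT-Base values from this config
hidden = embed_dim * mlp_ratio                            # 3072

fc1 = nn.Linear(embed_dim, hidden)
fc2 = nn.Linear(hidden, embed_dim - part)                 # shared projection, 576 channels
experts = nn.ModuleList(nn.Linear(hidden, part) for _ in range(n_expert))

x = torch.randn(2, 16 * 12, embed_dim)                    # two crops, 256x192 -> 16x12 patch tokens
dataset_idx = [0, 5]                                      # e.g. one COCO and one WholeBody sample
h = F.gelu(fc1(x))
expert_out = torch.stack([experts[i](h[b]) for b, i in enumerate(dataset_idx)])
out = torch.cat([fc2(h), expert_out], dim=-1)             # back to 768 channels per token
assert out.shape == (2, 16 * 12, embed_dim)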

500
configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/vitPose+_huge_coco+aic+mpii+ap10k+apt36k+wholebody_256x192_udp.py

@@ -0,0 +1,500 @@
_base_ = [
'../../../../_base_/default_runtime.py',
'../../../../_base_/datasets/coco.py',
'../../../../_base_/datasets/aic_info.py',
'../../../../_base_/datasets/mpii_info.py',
'../../../../_base_/datasets/ap10k_info.py',
'../../../../_base_/datasets/coco_wholebody_info.py'
]
evaluation = dict(interval=10, metric='mAP', save_best='AP')
optimizer = dict(type='AdamW', lr=1e-3, betas=(0.9, 0.999), weight_decay=0.1,
constructor='LayerDecayOptimizerConstructor',
paramwise_cfg=dict(
num_layers=32,
layer_decay_rate=0.8,
custom_keys={
'bias': dict(decay_mult=0.),
'pos_embed': dict(decay_mult=0.),
'relative_position_bias_table': dict(decay_mult=0.),
'norm': dict(decay_mult=0.)
}
)
)
optimizer_config = dict(grad_clip=dict(max_norm=1., norm_type=2))
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=0.001,
step=[170, 200])
total_epochs = 210
target_type = 'GaussianHeatmap'
channel_cfg = dict(
num_output_channels=17,
dataset_joints=17,
dataset_channel=[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
],
inference_channel=[
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
])
aic_channel_cfg = dict(
num_output_channels=14,
dataset_joints=14,
dataset_channel=[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
],
inference_channel=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
mpii_channel_cfg = dict(
num_output_channels=16,
dataset_joints=16,
dataset_channel=list(range(16)),
inference_channel=list(range(16)))
crowdpose_channel_cfg = dict(
num_output_channels=14,
dataset_joints=14,
dataset_channel=[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
],
inference_channel=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
ap10k_channel_cfg = dict(
num_output_channels=17,
dataset_joints=17,
dataset_channel=[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
],
inference_channel=[
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
])
cocowholebody_channel_cfg = dict(
num_output_channels=133,
dataset_joints=133,
dataset_channel=[
list(range(133)),
],
inference_channel=list(range(133)))
# model settings
model = dict(
type='TopDownMoE',
pretrained=None,
backbone=dict(
type='ViTMoE',
img_size=(256, 192),
patch_size=16,
embed_dim=1280,
depth=32,
num_heads=16,
ratio=1,
use_checkpoint=False,
mlp_ratio=4,
qkv_bias=True,
drop_path_rate=0.55,
num_expert=6,
part_features=320
),
keypoint_head=dict(
type='TopdownHeatmapSimpleHead',
in_channels=1280,
num_deconv_layers=2,
num_deconv_filters=(256, 256),
num_deconv_kernels=(4, 4),
extra=dict(final_conv_kernel=1, ),
out_channels=channel_cfg['num_output_channels'],
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
associate_keypoint_head=[
dict(
type='TopdownHeatmapSimpleHead',
in_channels=1280,
num_deconv_layers=2,
num_deconv_filters=(256, 256),
num_deconv_kernels=(4, 4),
extra=dict(final_conv_kernel=1, ),
out_channels=aic_channel_cfg['num_output_channels'],
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
dict(
type='TopdownHeatmapSimpleHead',
in_channels=1280,
num_deconv_layers=2,
num_deconv_filters=(256, 256),
num_deconv_kernels=(4, 4),
extra=dict(final_conv_kernel=1, ),
out_channels=mpii_channel_cfg['num_output_channels'],
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
dict(
type='TopdownHeatmapSimpleHead',
in_channels=1280,
num_deconv_layers=2,
num_deconv_filters=(256, 256),
num_deconv_kernels=(4, 4),
extra=dict(final_conv_kernel=1, ),
out_channels=crowdpose_channel_cfg['num_output_channels'],
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
dict(
type='TopdownHeatmapSimpleHead',
in_channels=1280,
num_deconv_layers=2,
num_deconv_filters=(256, 256),
num_deconv_kernels=(4, 4),
extra=dict(final_conv_kernel=1, ),
out_channels=ap10k_channel_cfg['num_output_channels'],
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
dict(
type='TopdownHeatmapSimpleHead',
in_channels=1280,
num_deconv_layers=2,
num_deconv_filters=(256, 256),
num_deconv_kernels=(4, 4),
extra=dict(final_conv_kernel=1, ),
out_channels=ap10k_channel_cfg['num_output_channels'],
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
dict(
type='TopdownHeatmapSimpleHead',
in_channels=1280,
num_deconv_layers=2,
num_deconv_filters=(256, 256),
num_deconv_kernels=(4, 4),
extra=dict(final_conv_kernel=1, ),
out_channels=cocowholebody_channel_cfg['num_output_channels'],
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
],
train_cfg=dict(),
test_cfg=dict(
flip_test=True,
post_process='default',
shift_heatmap=False,
target_type=target_type,
modulate_kernel=11,
use_udp=True))
data_cfg = dict(
image_size=[192, 256],
heatmap_size=[48, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=False,
det_bbox_thr=0.0,
bbox_file='data/coco/person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
max_num_joints=133,
dataset_idx=0,
)
aic_data_cfg = dict(
image_size=[192, 256],
heatmap_size=[48, 64],
num_output_channels=aic_channel_cfg['num_output_channels'],
num_joints=aic_channel_cfg['dataset_joints'],
dataset_channel=aic_channel_cfg['dataset_channel'],
inference_channel=aic_channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=True,
det_bbox_thr=0.0,
bbox_file='data/coco/person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
max_num_joints=133,
dataset_idx=1,
)
mpii_data_cfg = dict(
image_size=[192, 256],
heatmap_size=[48, 64],
num_output_channels=mpii_channel_cfg['num_output_channels'],
num_joints=mpii_channel_cfg['dataset_joints'],
dataset_channel=mpii_channel_cfg['dataset_channel'],
inference_channel=mpii_channel_cfg['inference_channel'],
max_num_joints=133,
dataset_idx=2,
use_gt_bbox=True,
bbox_file=None,
)
ap10k_data_cfg = dict(
image_size=[192, 256],
heatmap_size=[48, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=True,
det_bbox_thr=0.0,
bbox_file='',
max_num_joints=133,
dataset_idx=3,
)
ap36k_data_cfg = dict(
image_size=[192, 256],
heatmap_size=[48, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=True,
det_bbox_thr=0.0,
bbox_file='',
max_num_joints=133,
dataset_idx=4,
)
cocowholebody_data_cfg = dict(
image_size=[192, 256],
heatmap_size=[48, 64],
num_output_channels=cocowholebody_channel_cfg['num_output_channels'],
num_joints=cocowholebody_channel_cfg['dataset_joints'],
dataset_channel=cocowholebody_channel_cfg['dataset_channel'],
inference_channel=cocowholebody_channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=False,
det_bbox_thr=0.0,
bbox_file='data/coco/person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
dataset_idx=5,
max_num_joints=133,
)
cocowholebody_train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownHalfBodyTransform',
num_joints_half_body=8,
prob_half_body=0.3),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTarget', sigma=2),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx'
]),
]
ap10k_train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownHalfBodyTransform',
num_joints_half_body=8,
prob_half_body=0.3),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTarget', sigma=2),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx'
]),
]
aic_train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownHalfBodyTransform',
num_joints_half_body=8,
prob_half_body=0.3),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTarget', sigma=2),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx'
]),
]
mpii_train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine', use_udp=True),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(
type='TopDownGenerateTarget',
sigma=2,
encoding='UDP',
target_type=target_type),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'flip_pairs', 'dataset_idx'
]),
]
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownHalfBodyTransform',
num_joints_half_body=8,
prob_half_body=0.3),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine', use_udp=True),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(
type='TopDownGenerateTarget',
sigma=2,
encoding='UDP',
target_type=target_type),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx'
]),
]
val_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownAffine', use_udp=True),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(
type='Collect',
keys=['img'],
meta_keys=[
'image_file', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs', 'dataset_idx'
]),
]
test_pipeline = val_pipeline
data_root = 'data/coco'
aic_data_root = 'data/aic'
mpii_data_root = 'data/mpii'
ap10k_data_root = 'data/ap10k'
ap36k_data_root = 'data/ap36k'
data = dict(
samples_per_gpu=128,
workers_per_gpu=8,
val_dataloader=dict(samples_per_gpu=64),
test_dataloader=dict(samples_per_gpu=64),
train=[
dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
img_prefix=f'{data_root}/train2017/',
data_cfg=data_cfg,
pipeline=train_pipeline,
dataset_info={{_base_.dataset_info}}),
dict(
type='TopDownAicDataset',
ann_file=f'{aic_data_root}/annotations/person_keypoints_train2017.json',
img_prefix=f'{aic_data_root}/ai_challenger_keypoint_train_20170909/'
'keypoint_train_images_20170902/',
data_cfg=aic_data_cfg,
pipeline=aic_train_pipeline,
dataset_info={{_base_.aic_info}}),
dict(
type='TopDownMpiiDataset',
ann_file=f'{mpii_data_root}/annotations/mpii_train.json',
img_prefix=f'{mpii_data_root}/images/',
data_cfg=mpii_data_cfg,
pipeline=mpii_train_pipeline,
dataset_info={{_base_.mpii_info}}),
dict(
type='AnimalAP10KDataset',
ann_file=f'{ap10k_data_root}/annotations/ap10k-train-split1.json',
img_prefix=f'{ap10k_data_root}/data/',
data_cfg=ap10k_data_cfg,
pipeline=ap10k_train_pipeline,
dataset_info={{_base_.ap10k_info}}),
dict(
type='AnimalAP10KDataset',
ann_file=f'{ap36k_data_root}/annotations/train_annotations_1.json',
img_prefix=f'{ap36k_data_root}/',
data_cfg=ap36k_data_cfg,
pipeline=ap10k_train_pipeline,
dataset_info={{_base_.ap10k_info}}),
dict(
type='TopDownCocoWholeBodyDataset',
ann_file=f'{data_root}/annotations/coco_wholebody_train_v1.0.json',
img_prefix=f'{data_root}/train2017/',
data_cfg=cocowholebody_data_cfg,
pipeline=cocowholebody_train_pipeline,
dataset_info={{_base_.cocowholebody_info}}),
],
val=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=val_pipeline,
dataset_info={{_base_.dataset_info}}),
test=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=test_pipeline,
dataset_info={{_base_.dataset_info}}),
)

500
configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/vitPose+_large_coco+aic+mpii+ap10k+apt36k+wholebody_256x192_udp.py

@@ -0,0 +1,500 @@
_base_ = [
'../../../../_base_/default_runtime.py',
'../../../../_base_/datasets/coco.py',
'../../../../_base_/datasets/aic_info.py',
'../../../../_base_/datasets/mpii_info.py',
'../../../../_base_/datasets/ap10k_info.py',
'../../../../_base_/datasets/coco_wholebody_info.py'
]
evaluation = dict(interval=10, metric='mAP', save_best='AP')
optimizer = dict(type='AdamW', lr=1e-3, betas=(0.9, 0.999), weight_decay=0.1,
constructor='LayerDecayOptimizerConstructor',
paramwise_cfg=dict(
num_layers=24,
layer_decay_rate=0.8,
custom_keys={
'bias': dict(decay_mult=0.),
'pos_embed': dict(decay_mult=0.),
'relative_position_bias_table': dict(decay_mult=0.),
'norm': dict(decay_mult=0.)
}
)
)
optimizer_config = dict(grad_clip=dict(max_norm=1., norm_type=2))
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=0.001,
step=[170, 200])
total_epochs = 210
target_type = 'GaussianHeatmap'
channel_cfg = dict(
num_output_channels=17,
dataset_joints=17,
dataset_channel=[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
],
inference_channel=[
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
])
aic_channel_cfg = dict(
num_output_channels=14,
dataset_joints=14,
dataset_channel=[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
],
inference_channel=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
mpii_channel_cfg = dict(
num_output_channels=16,
dataset_joints=16,
dataset_channel=list(range(16)),
inference_channel=list(range(16)))
crowdpose_channel_cfg = dict(
num_output_channels=14,
dataset_joints=14,
dataset_channel=[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
],
inference_channel=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
ap10k_channel_cfg = dict(
num_output_channels=17,
dataset_joints=17,
dataset_channel=[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
],
inference_channel=[
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
])
cocowholebody_channel_cfg = dict(
num_output_channels=133,
dataset_joints=133,
dataset_channel=[
list(range(133)),
],
inference_channel=list(range(133)))
# model settings
model = dict(
type='TopDownMoE',
pretrained=None,
backbone=dict(
type='ViTMoE',
img_size=(256, 192),
patch_size=16,
embed_dim=1024,
depth=24,
num_heads=16,
ratio=1,
use_checkpoint=False,
mlp_ratio=4,
qkv_bias=True,
drop_path_rate=0.5,
num_expert=6,
part_features=256
),
keypoint_head=dict(
type='TopdownHeatmapSimpleHead',
in_channels=1024,
num_deconv_layers=2,
num_deconv_filters=(256, 256),
num_deconv_kernels=(4, 4),
extra=dict(final_conv_kernel=1, ),
out_channels=channel_cfg['num_output_channels'],
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
associate_keypoint_head=[
dict(
type='TopdownHeatmapSimpleHead',
in_channels=1024,
num_deconv_layers=2,
num_deconv_filters=(256, 256),
num_deconv_kernels=(4, 4),
extra=dict(final_conv_kernel=1, ),
out_channels=aic_channel_cfg['num_output_channels'],
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
dict(
type='TopdownHeatmapSimpleHead',
in_channels=1024,
num_deconv_layers=2,
num_deconv_filters=(256, 256),
num_deconv_kernels=(4, 4),
extra=dict(final_conv_kernel=1, ),
out_channels=mpii_channel_cfg['num_output_channels'],
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
dict(
type='TopdownHeatmapSimpleHead',
in_channels=1024,
num_deconv_layers=2,
num_deconv_filters=(256, 256),
num_deconv_kernels=(4, 4),
extra=dict(final_conv_kernel=1, ),
out_channels=crowdpose_channel_cfg['num_output_channels'],
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
dict(
type='TopdownHeatmapSimpleHead',
in_channels=1024,
num_deconv_layers=2,
num_deconv_filters=(256, 256),
num_deconv_kernels=(4, 4),
extra=dict(final_conv_kernel=1, ),
out_channels=ap10k_channel_cfg['num_output_channels'],
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
dict(
type='TopdownHeatmapSimpleHead',
in_channels=1024,
num_deconv_layers=2,
num_deconv_filters=(256, 256),
num_deconv_kernels=(4, 4),
extra=dict(final_conv_kernel=1, ),
out_channels=ap10k_channel_cfg['num_output_channels'],
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
dict(
type='TopdownHeatmapSimpleHead',
in_channels=1024,
num_deconv_layers=2,
num_deconv_filters=(256, 256),
num_deconv_kernels=(4, 4),
extra=dict(final_conv_kernel=1, ),
out_channels=cocowholebody_channel_cfg['num_output_channels'],
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
],
train_cfg=dict(),
test_cfg=dict(
flip_test=True,
post_process='default',
shift_heatmap=False,
target_type=target_type,
modulate_kernel=11,
use_udp=True))
data_cfg = dict(
image_size=[192, 256],
heatmap_size=[48, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=False,
det_bbox_thr=0.0,
bbox_file='data/coco/person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
max_num_joints=133,
dataset_idx=0,
)
aic_data_cfg = dict(
image_size=[192, 256],
heatmap_size=[48, 64],
num_output_channels=aic_channel_cfg['num_output_channels'],
num_joints=aic_channel_cfg['dataset_joints'],
dataset_channel=aic_channel_cfg['dataset_channel'],
inference_channel=aic_channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=True,
det_bbox_thr=0.0,
bbox_file='data/coco/person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
max_num_joints=133,
dataset_idx=1,
)
mpii_data_cfg = dict(
image_size=[192, 256],
heatmap_size=[48, 64],
num_output_channels=mpii_channel_cfg['num_output_channels'],
num_joints=mpii_channel_cfg['dataset_joints'],
dataset_channel=mpii_channel_cfg['dataset_channel'],
inference_channel=mpii_channel_cfg['inference_channel'],
max_num_joints=133,
dataset_idx=2,
use_gt_bbox=True,
bbox_file=None,
)
ap10k_data_cfg = dict(
image_size=[192, 256],
heatmap_size=[48, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=True,
det_bbox_thr=0.0,
bbox_file='',
max_num_joints=133,
dataset_idx=3,
)
ap36k_data_cfg = dict(
image_size=[192, 256],
heatmap_size=[48, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=True,
det_bbox_thr=0.0,
bbox_file='',
max_num_joints=133,
dataset_idx=4,
)
cocowholebody_data_cfg = dict(
image_size=[192, 256],
heatmap_size=[48, 64],
num_output_channels=cocowholebody_channel_cfg['num_output_channels'],
num_joints=cocowholebody_channel_cfg['dataset_joints'],
dataset_channel=cocowholebody_channel_cfg['dataset_channel'],
inference_channel=cocowholebody_channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=False,
det_bbox_thr=0.0,
bbox_file='data/coco/person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
dataset_idx=5,
max_num_joints=133,
)
cocowholebody_train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownHalfBodyTransform',
num_joints_half_body=8,
prob_half_body=0.3),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTarget', sigma=2),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx'
]),
]
ap10k_train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownHalfBodyTransform',
num_joints_half_body=8,
prob_half_body=0.3),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTarget', sigma=2),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx'
]),
]
aic_train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownHalfBodyTransform',
num_joints_half_body=8,
prob_half_body=0.3),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTarget', sigma=2),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx'
]),
]
mpii_train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine', use_udp=True),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(
type='TopDownGenerateTarget',
sigma=2,
encoding='UDP',
target_type=target_type),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'flip_pairs', 'dataset_idx'
]),
]
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownHalfBodyTransform',
num_joints_half_body=8,
prob_half_body=0.3),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine', use_udp=True),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(
type='TopDownGenerateTarget',
sigma=2,
encoding='UDP',
target_type=target_type),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx'
]),
]
val_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownAffine', use_udp=True),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(
type='Collect',
keys=['img'],
meta_keys=[
'image_file', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs', 'dataset_idx'
]),
]
test_pipeline = val_pipeline
data_root = 'data/coco'
aic_data_root = 'data/aic'
mpii_data_root = 'data/mpii'
ap10k_data_root = 'data/ap10k'
ap36k_data_root = 'data/ap36k'
data = dict(
samples_per_gpu=128,
workers_per_gpu=8,
val_dataloader=dict(samples_per_gpu=64),
test_dataloader=dict(samples_per_gpu=64),
train=[
dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
img_prefix=f'{data_root}/train2017/',
data_cfg=data_cfg,
pipeline=train_pipeline,
dataset_info={{_base_.dataset_info}}),
dict(
type='TopDownAicDataset',
ann_file=f'{aic_data_root}/annotations/person_keypoints_train2017.json',
img_prefix=f'{aic_data_root}/ai_challenger_keypoint_train_20170909/'
'keypoint_train_images_20170902/',
data_cfg=aic_data_cfg,
pipeline=aic_train_pipeline,
dataset_info={{_base_.aic_info}}),
dict(
type='TopDownMpiiDataset',
ann_file=f'{mpii_data_root}/annotations/mpii_train.json',
img_prefix=f'{mpii_data_root}/images/',
data_cfg=mpii_data_cfg,
pipeline=mpii_train_pipeline,
dataset_info={{_base_.mpii_info}}),
dict(
type='AnimalAP10KDataset',
ann_file=f'{ap10k_data_root}/annotations/ap10k-train-split1.json',
img_prefix=f'{ap10k_data_root}/data/',
data_cfg=ap10k_data_cfg,
pipeline=ap10k_train_pipeline,
dataset_info={{_base_.ap10k_info}}),
dict(
type='AnimalAP10KDataset',
ann_file=f'{ap36k_data_root}/annotations/train_annotations_1.json',
img_prefix=f'{ap36k_data_root}/',
data_cfg=ap36k_data_cfg,
pipeline=ap10k_train_pipeline,
dataset_info={{_base_.ap10k_info}}),
dict(
type='TopDownCocoWholeBodyDataset',
ann_file=f'{data_root}/annotations/coco_wholebody_train_v1.0.json',
img_prefix=f'{data_root}/train2017/',
data_cfg=cocowholebody_data_cfg,
pipeline=cocowholebody_train_pipeline,
dataset_info={{_base_.cocowholebody_info}}),
],
val=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=val_pipeline,
dataset_info={{_base_.dataset_info}}),
test=dict(
type='TopDownCocoDataset',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=test_pipeline,
dataset_info={{_base_.dataset_info}}),
)

15
mmcv_custom/checkpoint.py

@@ -25,6 +25,8 @@ from mmcv.runner import get_dist_info
from scipy import interpolate
import numpy as np
import math
import re
import copy
ENV_MMCV_HOME = 'MMCV_HOME'
ENV_XDG_CACHE_HOME = 'XDG_CACHE_HOME'
@ -313,6 +315,7 @@ def load_checkpoint(model,
strict=False,
logger=None,
patch_padding='pad',
part_features=None
):
"""Load checkpoint from a file or URI.
@ -389,9 +392,19 @@ def load_checkpoint(model,
pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
state_dict['pos_embed'] = new_pos_embed
new_state_dict = copy.deepcopy(state_dict)
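# Split each pretrained MLP fc2 between the shared projection and the expert
# branches: every expert starts from the last `part_features` output channels of
# the pretrained fc2 weight/bias, while the shared fc2 keeps the remaining ones.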
if part_features is not None:
current_keys = list(model.state_dict().keys())
for key in current_keys:
if "mlp.experts" in key:
source_key = re.sub(r'experts.\d+.', 'fc2.', key)
new_state_dict[key] = state_dict[source_key][-part_features:]
elif 'fc2' in key:
new_state_dict[key] = state_dict[key][:-part_features]
# load state_dict
load_state_dict(model, state_dict, strict, logger)
load_state_dict(model, new_state_dict, strict, logger)
return checkpoint

3
mmpose/datasets/datasets/base/kpt_2d_sview_rgb_img_top_down_dataset.py

@@ -60,6 +60,9 @@ class Kpt2dSviewRgbImgTopDownDataset(Dataset, metaclass=ABCMeta):
self.ann_info['num_output_channels'] = data_cfg['num_output_channels']
self.ann_info['dataset_channel'] = data_cfg['dataset_channel']
self.ann_info['max_num_joints'] = data_cfg.get('max_num_joints', None)
self.ann_info['dataset_idx'] = data_cfg.get('dataset_idx', 0)
self.ann_info['use_different_joint_weights'] = data_cfg.get(
'use_different_joint_weights', False)

8
mmpose/datasets/pipelines/top_down_transform.py

@@ -633,9 +633,17 @@ class TopDownGenerateTarget:
raise ValueError(
f'Encoding approach {self.encoding} is not supported!')
if results['ann_info'].get('max_num_joints', None) is not None:
W, H = results['ann_info']['heatmap_size']
padded_length = int(results['ann_info'].get('max_num_joints') - results['ann_info'].get('num_joints'))
target_weight = np.concatenate([target_weight, np.zeros((padded_length, 1), dtype=np.float32)], 0)
target = np.concatenate([target, np.zeros((padded_length, H, W), dtype=np.float32)], 0)
results['target'] = target
results['target_weight'] = target_weight
results['dataset_idx'] = results['ann_info'].get('dataset_idx', 0)
return results
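
With this change a sample from a smaller dataset leaves the pipeline with its heatmaps zero-padded up to max_num_joints, and the padded channels get zero target_weight, so JointsMSELoss(use_target_weight=True) ignores them. A shape-only illustration using the MPII settings from the configs above (16 joints, heatmap_size=[48, 64], max_num_joints=133); not part of the commit:

import numpy as np

W, H = 48, 64                                              # heatmap_size from the configs
target = np.zeros((16, H, W), dtype=np.float32)            # MPII heatmaps
target_weight = np.ones((16, 1), dtype=np.float32)
pad = 133 - 16                                             # max_num_joints - num_joints
target = np.concatenate([target, np.zeros((pad, H, W), np.float32)], 0)
target_weight = np.concatenate([target_weight, np.zeros((pad, 1), np.float32)], 0)
assert target.shape == (133, 64, 48) and target_weight.shape == (133, 1)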

3
mmpose/models/backbones/__init__.py

@@ -25,11 +25,12 @@ from .vgg import VGG
from .vipnas_mbv3 import ViPNAS_MobileNetV3
from .vipnas_resnet import ViPNAS_ResNet
from .vit import ViT
from .vit_moe import ViTMoE
__all__ = [
'AlexNet', 'HourglassNet', 'HourglassAENet', 'HRNet', 'MobileNetV2',
'MobileNetV3', 'RegNet', 'ResNet', 'ResNetV1d', 'ResNeXt', 'SCNet',
'SEResNet', 'SEResNeXt', 'ShuffleNetV1', 'ShuffleNetV2', 'CPM', 'RSN',
'MSPN', 'ResNeSt', 'VGG', 'TCN', 'ViPNAS_ResNet', 'ViPNAS_MobileNetV3',
'LiteHRNet', 'V2VNet', 'HRFormer', 'ViT'
'LiteHRNet', 'V2VNet', 'HRFormer', 'ViT', 'ViTMoE'
]

4
mmpose/models/backbones/base_backbone.py

@@ -14,7 +14,7 @@ class BaseBackbone(nn.Module, metaclass=ABCMeta):
inherits this class should at least define its own `forward` function.
"""
def init_weights(self, pretrained=None, patch_padding='pad'):
def init_weights(self, pretrained=None, patch_padding='pad', part_features=None):
"""Init backbone weights.
Args:
@ -25,7 +25,7 @@ class BaseBackbone(nn.Module, metaclass=ABCMeta):
"""
if isinstance(pretrained, str):
logger = logging.getLogger()
load_checkpoint(self, pretrained, strict=False, logger=logger, patch_padding=patch_padding)
load_checkpoint(self, pretrained, strict=False, logger=logger, patch_padding=patch_padding, part_features=part_features)
elif pretrained is None:
# use default initializer or customized initializer in subclasses
pass
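
Downstream, a ViTMoE backbone built from one of the configs above can be initialised from a plain ViT checkpoint through this signature. A hedged usage sketch (the checkpoint path is a placeholder, and it assumes ViTMoE does not override init_weights with a narrower signature):

from mmpose.models import build_backbone

# Backbone dict copied from the ViTPose+ base config above.
backbone = build_backbone(dict(
    type='ViTMoE', img_size=(256, 192), patch_size=16, embed_dim=768,
    depth=12, num_heads=12, ratio=1, use_checkpoint=False, mlp_ratio=4,
    qkv_bias=True, drop_path_rate=0.3, num_expert=6, part_features=192))
backbone.init_weights(pretrained='path/to/vit_base_pretrained.pth',  # placeholder path
                      patch_padding='pad', part_features=192)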

33
mmpose/models/backbones/vit.py

@@ -12,6 +12,39 @@ from timm.models.layers import drop_path, to_2tuple, trunc_normal_
from ..builder import BACKBONES
from .base_backbone import BaseBackbone
def get_abs_pos(abs_pos, h, w, ori_h, ori_w, has_cls_token=True):
"""
Calculate absolute positional embeddings. If needed, resize embeddings and remove cls_token
dimension for the original embeddings.
Args:
abs_pos (Tensor): absolute positional embeddings with (1, num_position, C).
has_cls_token (bool): If true, has 1 embedding in abs_pos for cls token.
h, w (int): size of the token grid to resize to.
ori_h, ori_w (int): size of the token grid the embeddings were trained with.
Returns:
Absolute positional embeddings after processing, with shape (1, num_position, C).
"""
cls_token = None
B, L, C = abs_pos.shape
if has_cls_token:
cls_token = abs_pos[:, 0:1]
abs_pos = abs_pos[:, 1:]
if ori_h != h or ori_w != w:
new_abs_pos = F.interpolate(
abs_pos.reshape(1, ori_h, ori_w, -1).permute(0, 3, 1, 2),
size=(h, w),
mode="bicubic",
align_corners=False,
).permute(0, 2, 3, 1).reshape(B, -1, C)
else:
new_abs_pos = abs_pos
if cls_token is not None:
new_abs_pos = torch.cat([cls_token, new_abs_pos], dim=1)
return new_abs_pos
class DropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
"""

384
mmpose/models/backbones/vit_moe.py

@@ -0,0 +1,384 @@
# Copyright (c) OpenMMLab. All rights reserved.
import math
import torch
from functools import partial
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from timm.models.layers import drop_path, to_2tuple, trunc_normal_
from ..builder import BACKBONES
from .base_backbone import BaseBackbone
def get_abs_pos(abs_pos, h, w, ori_h, ori_w, has_cls_token=True):
"""
Calculate absolute positional embeddings. If needed, resize embeddings and remove cls_token
dimension for the original embeddings.
Args:
abs_pos (Tensor): absolute positional embeddings with shape (1, num_position, C).
has_cls_token (bool): If true, abs_pos has one extra embedding for the cls token.
h, w (int): token grid size of the current input.
ori_h, ori_w (int): token grid size the embeddings were pretrained on.
Returns:
Absolute positional embeddings resized to the new grid, with shape (1, h * w (+1 for the cls token), C)
"""
cls_token = None
B, L, C = abs_pos.shape
if has_cls_token:
cls_token = abs_pos[:, 0:1]
abs_pos = abs_pos[:, 1:]
if ori_h != h or ori_w != w:
new_abs_pos = F.interpolate(
abs_pos.reshape(1, ori_h, ori_w, -1).permute(0, 3, 1, 2),
size=(h, w),
mode="bicubic",
align_corners=False,
).permute(0, 2, 3, 1).reshape(B, -1, C)
else:
new_abs_pos = abs_pos
if cls_token is not None:
new_abs_pos = torch.cat([cls_token, new_abs_pos], dim=1)
return new_abs_pos
class DropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
"""
def __init__(self, drop_prob=None):
super(DropPath, self).__init__()
self.drop_prob = drop_prob
def forward(self, x):
return drop_path(x, self.drop_prob, self.training)
def extra_repr(self):
return 'p={}'.format(self.drop_prob)
class Mlp(nn.Module):
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features)
self.drop = nn.Dropout(drop)
def forward(self, x):
x = self.fc1(x)
x = self.act(x)
x = self.fc2(x)
x = self.drop(x)
return x
class MoEMlp(nn.Module):
def __init__(self, num_expert=1, in_features=1024, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0., part_features=256):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.part_features = part_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features - part_features)
self.drop = nn.Dropout(drop)
self.num_expert = num_expert
experts = []
for i in range(num_expert):
experts.append(
nn.Linear(hidden_features, part_features)
)
self.experts = nn.ModuleList(experts)
def forward(self, x, indices):
expert_x = torch.zeros_like(x[:, :, -self.part_features:], device=x.device, dtype=x.dtype)
x = self.fc1(x)
x = self.act(x)
shared_x = self.fc2(x)
indices = indices.view(-1, 1, 1)
# compute every expert and mask by indices (instead of gathering) so all expert parameters stay in the graph, as required for ddp training
for i in range(self.num_expert):
selectedIndex = (indices == i)
current_x = self.experts[i](x) * selectedIndex
expert_x = expert_x + current_x
x = torch.cat([shared_x, expert_x], dim=-1)
return x
class Attention(nn.Module):
def __init__(
self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0.,
proj_drop=0., attn_head_dim=None,):
super().__init__()
self.num_heads = num_heads
head_dim = dim // num_heads
self.dim = dim
if attn_head_dim is not None:
head_dim = attn_head_dim
all_head_dim = head_dim * self.num_heads
self.scale = qk_scale or head_dim ** -0.5
self.qkv = nn.Linear(dim, all_head_dim * 3, bias=qkv_bias)
self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(all_head_dim, dim)
self.proj_drop = nn.Dropout(proj_drop)
def forward(self, x):
B, N, C = x.shape
qkv = self.qkv(x)
qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple)
q = q * self.scale
attn = (q @ k.transpose(-2, -1))
attn = attn.softmax(dim=-1)
attn = self.attn_drop(attn)
x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
x = self.proj(x)
x = self.proj_drop(x)
return x
class Block(nn.Module):
def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None,
drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU,
norm_layer=nn.LayerNorm, attn_head_dim=None, num_expert=1, part_features=None
):
super().__init__()
self.norm1 = norm_layer(dim)
self.attn = Attention(
dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
attn_drop=attn_drop, proj_drop=drop, attn_head_dim=attn_head_dim
)
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
self.norm2 = norm_layer(dim)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = MoEMlp(num_expert=num_expert, in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop, part_features=part_features)
def forward(self, x, indices=None):
x = x + self.drop_path(self.attn(self.norm1(x)))
x = x + self.drop_path(self.mlp(self.norm2(x), indices))
return x
class PatchEmbed(nn.Module):
""" Image to Patch Embedding
"""
def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, ratio=1):
super().__init__()
img_size = to_2tuple(img_size)
patch_size = to_2tuple(patch_size)
num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) * (ratio ** 2)
self.patch_shape = (int(img_size[0] // patch_size[0] * ratio), int(img_size[1] // patch_size[1] * ratio))
self.origin_patch_shape = (int(img_size[0] // patch_size[0]), int(img_size[1] // patch_size[1]))
self.img_size = img_size
self.patch_size = patch_size
self.num_patches = num_patches
self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=(patch_size[0] // ratio), padding=4 + 2 * (ratio//2-1))
def forward(self, x, **kwargs):
B, C, H, W = x.shape
x = self.proj(x)
Hp, Wp = x.shape[2], x.shape[3]
x = x.flatten(2).transpose(1, 2)
return x, (Hp, Wp)
class HybridEmbed(nn.Module):
""" CNN Feature Map Embedding
Extract feature map from CNN, flatten, project to embedding dim.
"""
def __init__(self, backbone, img_size=224, feature_size=None, in_chans=3, embed_dim=768):
super().__init__()
assert isinstance(backbone, nn.Module)
img_size = to_2tuple(img_size)
self.img_size = img_size
self.backbone = backbone
if feature_size is None:
with torch.no_grad():
training = backbone.training
if training:
backbone.eval()
o = self.backbone(torch.zeros(1, in_chans, img_size[0], img_size[1]))[-1]
feature_size = o.shape[-2:]
feature_dim = o.shape[1]
backbone.train(training)
else:
feature_size = to_2tuple(feature_size)
feature_dim = self.backbone.feature_info.channels()[-1]
self.num_patches = feature_size[0] * feature_size[1]
self.proj = nn.Linear(feature_dim, embed_dim)
def forward(self, x):
x = self.backbone(x)[-1]
x = x.flatten(2).transpose(1, 2)
x = self.proj(x)
return x
@BACKBONES.register_module()
class ViTMoE(BaseBackbone):
def __init__(self,
img_size=224, patch_size=16, in_chans=3, num_classes=80, embed_dim=768, depth=12,
num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
drop_path_rate=0., hybrid_backbone=None, norm_layer=None, use_checkpoint=False,
frozen_stages=-1, ratio=1, last_norm=True,
patch_padding='pad', freeze_attn=False, freeze_ffn=False,
num_expert=1, part_features=None
):
# Protect mutable default arguments
super(ViTMoE, self).__init__()
norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
self.num_classes = num_classes
self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
self.frozen_stages = frozen_stages
self.use_checkpoint = use_checkpoint
self.patch_padding = patch_padding
self.freeze_attn = freeze_attn
self.freeze_ffn = freeze_ffn
self.depth = depth
if hybrid_backbone is not None:
self.patch_embed = HybridEmbed(
hybrid_backbone, img_size=img_size, in_chans=in_chans, embed_dim=embed_dim)
else:
self.patch_embed = PatchEmbed(
img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, ratio=ratio)
num_patches = self.patch_embed.num_patches
self.part_features = part_features
self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
self.blocks = nn.ModuleList([
Block(
dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer,
num_expert=num_expert, part_features=part_features
)
for i in range(depth)])
self.last_norm = norm_layer(embed_dim) if last_norm else nn.Identity()
if self.pos_embed is not None:
trunc_normal_(self.pos_embed, std=.02)
self._freeze_stages()
def _freeze_stages(self):
"""Freeze parameters."""
if self.frozen_stages >= 0:
self.patch_embed.eval()
for param in self.patch_embed.parameters():
param.requires_grad = False
for i in range(1, self.frozen_stages + 1):
m = self.blocks[i]
m.eval()
for param in m.parameters():
param.requires_grad = False
if self.freeze_attn:
for i in range(0, self.depth):
m = self.blocks[i]
m.attn.eval()
m.norm1.eval()
for param in m.attn.parameters():
param.requires_grad = False
for param in m.norm1.parameters():
param.requires_grad = False
if self.freeze_ffn:
self.pos_embed.requires_grad = False
self.patch_embed.eval()
for param in self.patch_embed.parameters():
param.requires_grad = False
for i in range(0, self.depth):
m = self.blocks[i]
m.mlp.eval()
m.norm2.eval()
for param in m.mlp.parameters():
param.requires_grad = False
for param in m.norm2.parameters():
param.requires_grad = False
def init_weights(self, pretrained=None):
"""Initialize the weights in backbone.
Args:
pretrained (str, optional): Path to pre-trained weights.
Defaults to None.
"""
super().init_weights(pretrained, patch_padding=self.patch_padding, part_features=self.part_features)
if pretrained is None:
def _init_weights(m):
if isinstance(m, nn.Linear):
trunc_normal_(m.weight, std=.02)
if isinstance(m, nn.Linear) and m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.LayerNorm):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)
self.apply(_init_weights)
def get_num_layers(self):
return len(self.blocks)
@torch.jit.ignore
def no_weight_decay(self):
return {'pos_embed', 'cls_token'}
def forward_features(self, x, dataset_source=None):
B, C, H, W = x.shape
x, (Hp, Wp) = self.patch_embed(x)
if self.pos_embed is not None:
# fit for multiple GPU training: also add the cls-token slot so the whole pos_embed stays in the graph;
# its first element is zero in the sin-cos scheme, so the result is numerically unchanged
x = x + self.pos_embed[:, 1:] + self.pos_embed[:, :1]
for blk in self.blocks:
if self.use_checkpoint:
x = checkpoint.checkpoint(blk, x, dataset_source)
else:
x = blk(x, dataset_source)
x = self.last_norm(x)
xp = x.permute(0, 2, 1).reshape(B, -1, Hp, Wp).contiguous()
return xp
def forward(self, x, dataset_source=None):
x = self.forward_features(x, dataset_source)
return x
def train(self, mode=True):
"""Convert the model into training mode."""
super().train(mode)
self._freeze_stages()
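Taken together, each Block keeps a shared FFN branch (`fc2`, `embed_dim - part_features` channels) and a bank of per-dataset experts (`part_features` channels each); the `dataset_source` indices passed through `forward` select which expert fills the last channels for every sample. A construction/forward sketch for a ViT-Base-sized backbone; the expert count and `part_features` value below are illustrative (the released vitPose+ configs are the reference):

import torch
from mmpose.models.backbones import ViTMoE

backbone = ViTMoE(
    img_size=(256, 192), patch_size=16, embed_dim=768, depth=12,
    num_heads=12, mlp_ratio=4, qkv_bias=True,
    num_expert=6,        # e.g. one expert per training dataset
    part_features=192)   # expert-specific slice of each FFN output

imgs = torch.randn(2, 3, 256, 192)
dataset_source = torch.tensor([0, 3])   # per-sample dataset indices
feats = backbone(imgs, dataset_source)  # (2, 768, 16, 12) feature map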

3
mmpose/models/detectors/__init__.py

@ -8,9 +8,10 @@ from .multiview_pose import (DetectAndRegress, VoxelCenterDetector,
from .pose_lifter import PoseLifter
from .posewarper import PoseWarper
from .top_down import TopDown
from .top_down_moe import TopDownMoE
__all__ = [
'TopDown', 'AssociativeEmbedding', 'ParametricMesh', 'MultiTask',
'PoseLifter', 'Interhand3D', 'PoseWarper', 'DetectAndRegress',
'VoxelCenterDetector', 'VoxelSinglePose'
'VoxelCenterDetector', 'VoxelSinglePose', 'TopDownMoE'
]

351
mmpose/models/detectors/top_down_moe.py

@ -0,0 +1,351 @@
# Copyright (c) OpenMMLab. All rights reserved.
import warnings
import torch
import torch.nn as nn
import mmcv
import numpy as np
from mmcv.image import imwrite
from mmcv.utils.misc import deprecated_api_warning
from mmcv.visualization.image import imshow
from mmpose.core import imshow_bboxes, imshow_keypoints
from .. import builder
from ..builder import POSENETS
from .base import BasePose
try:
from mmcv.runner import auto_fp16
except ImportError:
warnings.warn('auto_fp16 from mmpose will be deprecated from v0.15.0. '
'Please install mmcv>=1.1.4')
from mmpose.core import auto_fp16
@POSENETS.register_module()
class TopDownMoE(BasePose):
"""Top-down pose detectors.
Args:
backbone (dict): Backbone modules to extract feature.
keypoint_head (dict): Keypoint head to process feature.
train_cfg (dict): Config for training. Default: None.
test_cfg (dict): Config for testing. Default: None.
pretrained (str): Path to the pretrained models.
loss_pose (None): Deprecated arguments. Please use
`loss_keypoint` for heads instead.
"""
def __init__(self,
backbone,
neck=None,
keypoint_head=None,
associate_keypoint_head=None,
train_cfg=None,
test_cfg=None,
pretrained=None,
loss_pose=None):
super().__init__()
self.fp16_enabled = False
self.backbone = builder.build_backbone(backbone)
self.train_cfg = train_cfg
self.test_cfg = test_cfg
if neck is not None:
self.neck = builder.build_neck(neck)
if keypoint_head is not None:
keypoint_head['train_cfg'] = train_cfg
keypoint_head['test_cfg'] = test_cfg
if 'loss_keypoint' not in keypoint_head and loss_pose is not None:
warnings.warn(
'`loss_pose` for TopDown is deprecated, '
'use `loss_keypoint` for heads instead. See '
'https://github.com/open-mmlab/mmpose/pull/382'
' for more information.', DeprecationWarning)
keypoint_head['loss_keypoint'] = loss_pose
self.keypoint_head = builder.build_head(keypoint_head)
associate_keypoint_heads = []
keypoint_heads_cnt = 1
if associate_keypoint_head is not None:
if not isinstance(associate_keypoint_head, list):
associate_keypoint_head = [associate_keypoint_head]
for single_keypoint_head in associate_keypoint_head:
single_keypoint_head['train_cfg'] = train_cfg
single_keypoint_head['test_cfg'] = test_cfg
associate_keypoint_heads.append(builder.build_head(single_keypoint_head))
keypoint_heads_cnt += 1
self.associate_keypoint_heads = nn.ModuleList(associate_keypoint_heads)
self.keypoint_heads_cnt = keypoint_heads_cnt
self.init_weights(pretrained=pretrained)
@property
def with_neck(self):
"""Check if has neck."""
return hasattr(self, 'neck')
@property
def with_keypoint(self):
"""Check if has keypoint_head."""
return hasattr(self, 'keypoint_head')
def init_weights(self, pretrained=None):
"""Weight initialization for model."""
self.backbone.init_weights(pretrained)
if self.with_neck:
self.neck.init_weights()
if self.with_keypoint:
self.keypoint_head.init_weights()
for item in self.associate_keypoint_heads:
item.init_weights()
@auto_fp16(apply_to=('img', ))
def forward(self,
img,
target=None,
target_weight=None,
img_metas=None,
return_loss=True,
return_heatmap=False,
**kwargs):
"""Calls either forward_train or forward_test depending on whether
return_loss=True. Note this setting will change the expected inputs.
When `return_loss=True`, img and img_meta are single-nested (i.e.
Tensor and List[dict]), and when `return_loss=False`, img and img_meta
should be double nested (i.e. List[Tensor], List[List[dict]]), with
the outer list indicating test time augmentations.
Note:
- batch_size: N
- num_keypoints: K
- num_img_channel: C (Default: 3)
- img height: imgH
- img width: imgW
- heatmaps height: H
- heatmaps width: W
Args:
img (torch.Tensor[NxCximgHximgW]): Input images.
target (torch.Tensor[NxKxHxW]): Target heatmaps.
target_weight (torch.Tensor[NxKx1]): Weights across
different joint types.
img_metas (list(dict)): Information about data augmentation
By default this includes:
- "image_file: path to the image file
- "center": center of the bbox
- "scale": scale of the bbox
- "rotation": rotation of the bbox
- "bbox_score": score of bbox
return_loss (bool): Option to return loss. `return_loss=True`
for training, `return_loss=False` for validation & test.
return_heatmap (bool): Option to return heatmap.
Returns:
dict|tuple: if `return_loss` is true, then return losses. \
Otherwise, return predicted poses, boxes, image paths \
and heatmaps.
"""
if return_loss:
return self.forward_train(img, target, target_weight, img_metas,
**kwargs)
return self.forward_test(
img, img_metas, return_heatmap=return_heatmap, **kwargs)
def forward_train(self, img, target, target_weight, img_metas, **kwargs):
"""Defines the computation performed at every call when training."""
img_sources = torch.from_numpy(np.array([ele['dataset_idx'] for ele in img_metas])).to(img.device)
output = self.backbone(img, img_sources)
if self.with_neck:
output = self.neck(output)
# if return loss
losses = dict()
main_stream_select = (img_sources == 0)
# if torch.sum(main_stream_select) > 0:
output_select = self.keypoint_head(output)
target_select = target * main_stream_select.view(-1, 1, 1, 1)
target_weight_select = target_weight * main_stream_select.view(-1, 1, 1)
keypoint_losses = self.keypoint_head.get_loss(
output_select, target_select, target_weight_select)
losses['main_stream_loss'] = keypoint_losses['heatmap_loss']
keypoint_accuracy = self.keypoint_head.get_accuracy(
output_select, target_select, target_weight_select)
losses['main_stream_acc'] = keypoint_accuracy['acc_pose']
for idx in range(1, self.keypoint_heads_cnt):
idx_select = (img_sources == idx)
target_select = target * idx_select.view(-1, 1, 1, 1)
target_weight_select = target_weight * idx_select.view(-1, 1, 1)
output_select = self.associate_keypoint_heads[idx - 1](output)
keypoint_losses = self.associate_keypoint_heads[idx - 1].get_loss(
output_select, target_select, target_weight_select)
losses[f'{idx}_loss'] = keypoint_losses['heatmap_loss']
keypoint_accuracy = self.associate_keypoint_heads[idx - 1].get_accuracy(
output_select, target_select, target_weight_select)
losses[f'{idx}_acc'] = keypoint_accuracy['acc_pose']
return losses
def forward_test(self, img, img_metas, return_heatmap=False, **kwargs):
"""Defines the computation performed at every call when testing."""
assert img.size(0) == len(img_metas)
batch_size, _, img_height, img_width = img.shape
if batch_size > 1:
assert 'bbox_id' in img_metas[0]
result = {}
img_sources = torch.from_numpy(np.array([ele['dataset_idx'] for ele in img_metas])).to(img.device)
features = self.backbone(img, img_sources)
if self.with_neck:
features = self.neck(features)
if self.with_keypoint:
output_heatmap = self.keypoint_head.inference_model(
features, flip_pairs=None)
if self.test_cfg.get('flip_test', True):
img_flipped = img.flip(3)
features_flipped = self.backbone(img_flipped, img_sources)
if self.with_neck:
features_flipped = self.neck(features_flipped)
if self.with_keypoint:
output_flipped_heatmap = self.keypoint_head.inference_model(
features_flipped, img_metas[0]['flip_pairs'])
output_heatmap = (output_heatmap +
output_flipped_heatmap) * 0.5
if self.with_keypoint:
keypoint_result = self.keypoint_head.decode(
img_metas, output_heatmap, img_size=[img_width, img_height])
result.update(keypoint_result)
if not return_heatmap:
output_heatmap = None
result['output_heatmap'] = output_heatmap
return result
def forward_dummy(self, img):
"""Used for computing network FLOPs.
See ``tools/get_flops.py``.
Args:
img (torch.Tensor): Input image.
Returns:
Tensor: Output heatmaps.
"""
output = self.backbone(img)
if self.with_neck:
output = self.neck(output)
if self.with_keypoint:
output = self.keypoint_head(output)
return output
@deprecated_api_warning({'pose_limb_color': 'pose_link_color'},
cls_name='TopDown')
def show_result(self,
img,
result,
skeleton=None,
kpt_score_thr=0.3,
bbox_color='green',
pose_kpt_color=None,
pose_link_color=None,
text_color='white',
radius=4,
thickness=1,
font_scale=0.5,
bbox_thickness=1,
win_name='',
show=False,
show_keypoint_weight=False,
wait_time=0,
out_file=None):
"""Draw `result` over `img`.
Args:
img (str or Tensor): The image to be displayed.
result (list[dict]): The results to draw over `img`
(bbox_result, pose_result).
skeleton (list[list]): The connection of keypoints.
skeleton is 0-based indexing.
kpt_score_thr (float, optional): Minimum score of keypoints
to be shown. Default: 0.3.
bbox_color (str or tuple or :obj:`Color`): Color of bbox lines.
pose_kpt_color (np.array[Nx3]`): Color of N keypoints.
If None, do not draw keypoints.
pose_link_color (np.array[Mx3]): Color of M links.
If None, do not draw links.
text_color (str or tuple or :obj:`Color`): Color of texts.
radius (int): Radius of circles.
thickness (int): Thickness of lines.
font_scale (float): Font scales of texts.
win_name (str): The window name.
show (bool): Whether to show the image. Default: False.
show_keypoint_weight (bool): Whether to change the transparency
using the predicted confidence scores of keypoints.
wait_time (int): Value of waitKey param.
Default: 0.
out_file (str or None): The filename to write the image.
Default: None.
Returns:
Tensor: Visualized img, only if not `show` or `out_file`.
"""
img = mmcv.imread(img)
img = img.copy()
bbox_result = []
bbox_labels = []
pose_result = []
for res in result:
if 'bbox' in res:
bbox_result.append(res['bbox'])
bbox_labels.append(res.get('label', None))
pose_result.append(res['keypoints'])
if bbox_result:
bboxes = np.vstack(bbox_result)
# draw bounding boxes
imshow_bboxes(
img,
bboxes,
labels=bbox_labels,
colors=bbox_color,
text_color=text_color,
thickness=bbox_thickness,
font_scale=font_scale,
show=False)
if pose_result:
imshow_keypoints(img, pose_result, skeleton, kpt_score_thr,
pose_kpt_color, pose_link_color, radius,
thickness)
if show:
imshow(img, win_name, wait_time)
if out_file is not None:
imwrite(img, out_file)
return img
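In a config, `TopDownMoE` pairs the shared `ViTMoE` backbone with one main `keypoint_head` (dataset_idx 0) and one `associate_keypoint_head` entry per additional dataset, in the same order as the dataset_idx values produced by the data pipeline; `forward_train` above then masks each loss by its dataset. A trimmed, hypothetical config sketch (head settings and dataset order are illustrative; see the vitPose+_*_udp configs in this commit for the real values):

model = dict(
    type='TopDownMoE',
    pretrained=None,
    backbone=dict(
        type='ViTMoE', img_size=(256, 192), patch_size=16,
        embed_dim=768, depth=12, num_heads=12, mlp_ratio=4,
        qkv_bias=True, num_expert=6, part_features=192),
    keypoint_head=dict(                 # main stream, e.g. COCO (17 joints)
        type='TopdownHeatmapSimpleHead', in_channels=768,
        num_deconv_layers=2, num_deconv_filters=(256, 256),
        num_deconv_kernels=(4, 4), out_channels=17,
        loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
    associate_keypoint_head=[           # one extra head per auxiliary dataset
        dict(type='TopdownHeatmapSimpleHead', in_channels=768,
             num_deconv_layers=2, num_deconv_filters=(256, 256),
             num_deconv_kernels=(4, 4), out_channels=14,   # AIC
             loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
        dict(type='TopdownHeatmapSimpleHead', in_channels=768,
             num_deconv_layers=2, num_deconv_filters=(256, 256),
             num_deconv_kernels=(4, 4), out_channels=16,   # MPII
             loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
        # remaining datasets (AP-10K, APT-36K, WholeBody) omitted for brevity
    ],
    train_cfg=dict(),
    test_cfg=dict(flip_test=True, use_udp=True))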

2
tools/train.py

@ -17,7 +17,7 @@ from mmpose.apis import init_random_seed, train_model
from mmpose.datasets import build_dataset
from mmpose.models import build_posenet
from mmpose.utils import collect_env, get_root_logger, setup_multi_processes
import mmcv_custom
def parse_args():
parser = argparse.ArgumentParser(description='Train a pose model')
