From 98fb7f859f85004f9b7d447b34fff8d058127ab9 Mon Sep 17 00:00:00 2001 From: Yufei Date: Sun, 25 Dec 2022 21:17:52 +0800 Subject: [PATCH] update MoE code for vitpose+ --- configs/_base_/datasets/aic_info.py | 140 ++ configs/_base_/datasets/ap10k_info.py | 142 ++ .../_base_/datasets/coco_wholebody_info.py | 1154 +++++++++++++++++ configs/_base_/datasets/mpii_info.py | 155 +++ .../coco/ViTPose_large_coco_256x192.py | 2 +- ...mpii+ap10k+apt36k+wholebody_256x192_udp.py | 500 +++++++ ...mpii+ap10k+apt36k+wholebody_256x192_udp.py | 500 +++++++ ...mpii+ap10k+apt36k+wholebody_256x192_udp.py | 500 +++++++ mmcv_custom/checkpoint.py | 15 +- .../kpt_2d_sview_rgb_img_top_down_dataset.py | 3 + .../datasets/pipelines/top_down_transform.py | 8 + mmpose/models/backbones/__init__.py | 3 +- mmpose/models/backbones/base_backbone.py | 4 +- mmpose/models/backbones/vit.py | 33 + mmpose/models/backbones/vit_moe.py | 384 ++++++ mmpose/models/detectors/__init__.py | 3 +- mmpose/models/detectors/top_down_moe.py | 351 +++++ tools/train.py | 2 +- 18 files changed, 3892 insertions(+), 7 deletions(-) create mode 100644 configs/_base_/datasets/aic_info.py create mode 100644 configs/_base_/datasets/ap10k_info.py create mode 100644 configs/_base_/datasets/coco_wholebody_info.py create mode 100644 configs/_base_/datasets/mpii_info.py create mode 100644 configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/vitPose+_base_coco+aic+mpii+ap10k+apt36k+wholebody_256x192_udp.py create mode 100644 configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/vitPose+_huge_coco+aic+mpii+ap10k+apt36k+wholebody_256x192_udp.py create mode 100644 configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/vitPose+_large_coco+aic+mpii+ap10k+apt36k+wholebody_256x192_udp.py create mode 100644 mmpose/models/backbones/vit_moe.py create mode 100644 mmpose/models/detectors/top_down_moe.py diff --git a/configs/_base_/datasets/aic_info.py b/configs/_base_/datasets/aic_info.py new file mode 100644 index 0000000..f143fd8 --- /dev/null +++ b/configs/_base_/datasets/aic_info.py @@ -0,0 +1,140 @@ +aic_info = dict( + dataset_name='aic', + paper_info=dict( + author='Wu, Jiahong and Zheng, He and Zhao, Bo and ' + 'Li, Yixin and Yan, Baoming and Liang, Rui and ' + 'Wang, Wenjia and Zhou, Shipei and Lin, Guosen and ' + 'Fu, Yanwei and others', + title='Ai challenger: A large-scale dataset for going ' + 'deeper in image understanding', + container='arXiv', + year='2017', + homepage='https://github.com/AIChallenger/AI_Challenger_2017', + ), + keypoint_info={ + 0: + dict( + name='right_shoulder', + id=0, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 1: + dict( + name='right_elbow', + id=1, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 2: + dict( + name='right_wrist', + id=2, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 3: + dict( + name='left_shoulder', + id=3, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 4: + dict( + name='left_elbow', + id=4, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 5: + dict( + name='left_wrist', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 6: + dict( + name='right_hip', + id=6, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 7: + dict( + name='right_knee', + id=7, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 8: + dict( + name='right_ankle', + id=8, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 9: + dict( + name='left_hip', + id=9, + color=[0, 255, 0], + type='lower', + 
swap='right_hip'), + 10: + dict( + name='left_knee', + id=10, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 11: + dict( + name='left_ankle', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 12: + dict( + name='head_top', + id=12, + color=[51, 153, 255], + type='upper', + swap=''), + 13: + dict(name='neck', id=13, color=[51, 153, 255], type='upper', swap='') + }, + skeleton_info={ + 0: + dict(link=('right_wrist', 'right_elbow'), id=0, color=[255, 128, 0]), + 1: dict( + link=('right_elbow', 'right_shoulder'), id=1, color=[255, 128, 0]), + 2: dict(link=('right_shoulder', 'neck'), id=2, color=[51, 153, 255]), + 3: dict(link=('neck', 'left_shoulder'), id=3, color=[51, 153, 255]), + 4: dict(link=('left_shoulder', 'left_elbow'), id=4, color=[0, 255, 0]), + 5: dict(link=('left_elbow', 'left_wrist'), id=5, color=[0, 255, 0]), + 6: dict(link=('right_ankle', 'right_knee'), id=6, color=[255, 128, 0]), + 7: dict(link=('right_knee', 'right_hip'), id=7, color=[255, 128, 0]), + 8: dict(link=('right_hip', 'left_hip'), id=8, color=[51, 153, 255]), + 9: dict(link=('left_hip', 'left_knee'), id=9, color=[0, 255, 0]), + 10: dict(link=('left_knee', 'left_ankle'), id=10, color=[0, 255, 0]), + 11: dict(link=('head_top', 'neck'), id=11, color=[51, 153, 255]), + 12: dict( + link=('right_shoulder', 'right_hip'), id=12, color=[51, 153, 255]), + 13: + dict(link=('left_shoulder', 'left_hip'), id=13, color=[51, 153, 255]) + }, + joint_weights=[ + 1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1. + ], + + # 'https://github.com/AIChallenger/AI_Challenger_2017/blob/master/' + # 'Evaluation/keypoint_eval/keypoint_eval.py#L50' + # delta = 2 x sigma + sigmas=[ + 0.01388152, 0.01515228, 0.01057665, 0.01417709, 0.01497891, 0.01402144, + 0.03909642, 0.03686941, 0.01981803, 0.03843971, 0.03412318, 0.02415081, + 0.01291456, 0.01236173 + ]) diff --git a/configs/_base_/datasets/ap10k_info.py b/configs/_base_/datasets/ap10k_info.py new file mode 100644 index 0000000..af2461c --- /dev/null +++ b/configs/_base_/datasets/ap10k_info.py @@ -0,0 +1,142 @@ +ap10k_info = dict( + dataset_name='ap10k', + paper_info=dict( + author='Yu, Hang and Xu, Yufei and Zhang, Jing and ' + 'Zhao, Wei and Guan, Ziyu and Tao, Dacheng', + title='AP-10K: A Benchmark for Animal Pose Estimation in the Wild', + container='35th Conference on Neural Information Processing Systems ' + '(NeurIPS 2021) Track on Datasets and Bench-marks.', + year='2021', + homepage='https://github.com/AlexTheBad/AP-10K', + ), + keypoint_info={ + 0: + dict( + name='L_Eye', id=0, color=[0, 255, 0], type='upper', swap='R_Eye'), + 1: + dict( + name='R_Eye', + id=1, + color=[255, 128, 0], + type='upper', + swap='L_Eye'), + 2: + dict(name='Nose', id=2, color=[51, 153, 255], type='upper', swap=''), + 3: + dict(name='Neck', id=3, color=[51, 153, 255], type='upper', swap=''), + 4: + dict( + name='Root of tail', + id=4, + color=[51, 153, 255], + type='lower', + swap=''), + 5: + dict( + name='L_Shoulder', + id=5, + color=[51, 153, 255], + type='upper', + swap='R_Shoulder'), + 6: + dict( + name='L_Elbow', + id=6, + color=[51, 153, 255], + type='upper', + swap='R_Elbow'), + 7: + dict( + name='L_F_Paw', + id=7, + color=[0, 255, 0], + type='upper', + swap='R_F_Paw'), + 8: + dict( + name='R_Shoulder', + id=8, + color=[0, 255, 0], + type='upper', + swap='L_Shoulder'), + 9: + dict( + name='R_Elbow', + id=9, + color=[255, 128, 0], + type='upper', + swap='L_Elbow'), + 10: + dict( + name='R_F_Paw', + id=10, + color=[0, 255, 0], + type='lower', + 
swap='L_F_Paw'), + 11: + dict( + name='L_Hip', + id=11, + color=[255, 128, 0], + type='lower', + swap='R_Hip'), + 12: + dict( + name='L_Knee', + id=12, + color=[255, 128, 0], + type='lower', + swap='R_Knee'), + 13: + dict( + name='L_B_Paw', + id=13, + color=[0, 255, 0], + type='lower', + swap='R_B_Paw'), + 14: + dict( + name='R_Hip', id=14, color=[0, 255, 0], type='lower', + swap='L_Hip'), + 15: + dict( + name='R_Knee', + id=15, + color=[0, 255, 0], + type='lower', + swap='L_Knee'), + 16: + dict( + name='R_B_Paw', + id=16, + color=[0, 255, 0], + type='lower', + swap='L_B_Paw'), + }, + skeleton_info={ + 0: dict(link=('L_Eye', 'R_Eye'), id=0, color=[0, 0, 255]), + 1: dict(link=('L_Eye', 'Nose'), id=1, color=[0, 0, 255]), + 2: dict(link=('R_Eye', 'Nose'), id=2, color=[0, 0, 255]), + 3: dict(link=('Nose', 'Neck'), id=3, color=[0, 255, 0]), + 4: dict(link=('Neck', 'Root of tail'), id=4, color=[0, 255, 0]), + 5: dict(link=('Neck', 'L_Shoulder'), id=5, color=[0, 255, 255]), + 6: dict(link=('L_Shoulder', 'L_Elbow'), id=6, color=[0, 255, 255]), + 7: dict(link=('L_Elbow', 'L_F_Paw'), id=6, color=[0, 255, 255]), + 8: dict(link=('Neck', 'R_Shoulder'), id=7, color=[6, 156, 250]), + 9: dict(link=('R_Shoulder', 'R_Elbow'), id=8, color=[6, 156, 250]), + 10: dict(link=('R_Elbow', 'R_F_Paw'), id=9, color=[6, 156, 250]), + 11: dict(link=('Root of tail', 'L_Hip'), id=10, color=[0, 255, 255]), + 12: dict(link=('L_Hip', 'L_Knee'), id=11, color=[0, 255, 255]), + 13: dict(link=('L_Knee', 'L_B_Paw'), id=12, color=[0, 255, 255]), + 14: dict(link=('Root of tail', 'R_Hip'), id=13, color=[6, 156, 250]), + 15: dict(link=('R_Hip', 'R_Knee'), id=14, color=[6, 156, 250]), + 16: dict(link=('R_Knee', 'R_B_Paw'), id=15, color=[6, 156, 250]), + }, + joint_weights=[ + 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5, + 1.5 + ], + sigmas=[ + 0.025, 0.025, 0.026, 0.035, 0.035, 0.079, 0.072, 0.062, 0.079, 0.072, + 0.062, 0.107, 0.087, 0.089, 0.107, 0.087, 0.089 + ]) diff --git a/configs/_base_/datasets/coco_wholebody_info.py b/configs/_base_/datasets/coco_wholebody_info.py new file mode 100644 index 0000000..50ac8fe --- /dev/null +++ b/configs/_base_/datasets/coco_wholebody_info.py @@ -0,0 +1,1154 @@ +cocowholebody_info = dict( + dataset_name='coco_wholebody', + paper_info=dict( + author='Jin, Sheng and Xu, Lumin and Xu, Jin and ' + 'Wang, Can and Liu, Wentao and ' + 'Qian, Chen and Ouyang, Wanli and Luo, Ping', + title='Whole-Body Human Pose Estimation in the Wild', + container='Proceedings of the European ' + 'Conference on Computer Vision (ECCV)', + year='2020', + homepage='https://github.com/jin-s13/COCO-WholeBody/', + ), + keypoint_info={ + 0: + dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''), + 1: + dict( + name='left_eye', + id=1, + color=[51, 153, 255], + type='upper', + swap='right_eye'), + 2: + dict( + name='right_eye', + id=2, + color=[51, 153, 255], + type='upper', + swap='left_eye'), + 3: + dict( + name='left_ear', + id=3, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 4: + dict( + name='right_ear', + id=4, + color=[51, 153, 255], + type='upper', + swap='left_ear'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='right_shoulder', + id=6, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 7: + dict( + name='left_elbow', + id=7, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 8: + dict( + name='right_elbow', + id=8, + color=[255, 128, 0], + type='upper', 
+ swap='left_elbow'), + 9: + dict( + name='left_wrist', + id=9, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='left_hip', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 12: + dict( + name='right_hip', + id=12, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 13: + dict( + name='left_knee', + id=13, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 14: + dict( + name='right_knee', + id=14, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 15: + dict( + name='left_ankle', + id=15, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 16: + dict( + name='right_ankle', + id=16, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 17: + dict( + name='left_big_toe', + id=17, + color=[255, 128, 0], + type='lower', + swap='right_big_toe'), + 18: + dict( + name='left_small_toe', + id=18, + color=[255, 128, 0], + type='lower', + swap='right_small_toe'), + 19: + dict( + name='left_heel', + id=19, + color=[255, 128, 0], + type='lower', + swap='right_heel'), + 20: + dict( + name='right_big_toe', + id=20, + color=[255, 128, 0], + type='lower', + swap='left_big_toe'), + 21: + dict( + name='right_small_toe', + id=21, + color=[255, 128, 0], + type='lower', + swap='left_small_toe'), + 22: + dict( + name='right_heel', + id=22, + color=[255, 128, 0], + type='lower', + swap='left_heel'), + 23: + dict( + name='face-0', + id=23, + color=[255, 255, 255], + type='', + swap='face-16'), + 24: + dict( + name='face-1', + id=24, + color=[255, 255, 255], + type='', + swap='face-15'), + 25: + dict( + name='face-2', + id=25, + color=[255, 255, 255], + type='', + swap='face-14'), + 26: + dict( + name='face-3', + id=26, + color=[255, 255, 255], + type='', + swap='face-13'), + 27: + dict( + name='face-4', + id=27, + color=[255, 255, 255], + type='', + swap='face-12'), + 28: + dict( + name='face-5', + id=28, + color=[255, 255, 255], + type='', + swap='face-11'), + 29: + dict( + name='face-6', + id=29, + color=[255, 255, 255], + type='', + swap='face-10'), + 30: + dict( + name='face-7', + id=30, + color=[255, 255, 255], + type='', + swap='face-9'), + 31: + dict(name='face-8', id=31, color=[255, 255, 255], type='', swap=''), + 32: + dict( + name='face-9', + id=32, + color=[255, 255, 255], + type='', + swap='face-7'), + 33: + dict( + name='face-10', + id=33, + color=[255, 255, 255], + type='', + swap='face-6'), + 34: + dict( + name='face-11', + id=34, + color=[255, 255, 255], + type='', + swap='face-5'), + 35: + dict( + name='face-12', + id=35, + color=[255, 255, 255], + type='', + swap='face-4'), + 36: + dict( + name='face-13', + id=36, + color=[255, 255, 255], + type='', + swap='face-3'), + 37: + dict( + name='face-14', + id=37, + color=[255, 255, 255], + type='', + swap='face-2'), + 38: + dict( + name='face-15', + id=38, + color=[255, 255, 255], + type='', + swap='face-1'), + 39: + dict( + name='face-16', + id=39, + color=[255, 255, 255], + type='', + swap='face-0'), + 40: + dict( + name='face-17', + id=40, + color=[255, 255, 255], + type='', + swap='face-26'), + 41: + dict( + name='face-18', + id=41, + color=[255, 255, 255], + type='', + swap='face-25'), + 42: + dict( + name='face-19', + id=42, + color=[255, 255, 255], + type='', + swap='face-24'), + 43: + dict( + name='face-20', + id=43, + color=[255, 255, 255], + type='', + swap='face-23'), + 44: + dict( + name='face-21', + id=44, + color=[255, 255, 255], + 
type='', + swap='face-22'), + 45: + dict( + name='face-22', + id=45, + color=[255, 255, 255], + type='', + swap='face-21'), + 46: + dict( + name='face-23', + id=46, + color=[255, 255, 255], + type='', + swap='face-20'), + 47: + dict( + name='face-24', + id=47, + color=[255, 255, 255], + type='', + swap='face-19'), + 48: + dict( + name='face-25', + id=48, + color=[255, 255, 255], + type='', + swap='face-18'), + 49: + dict( + name='face-26', + id=49, + color=[255, 255, 255], + type='', + swap='face-17'), + 50: + dict(name='face-27', id=50, color=[255, 255, 255], type='', swap=''), + 51: + dict(name='face-28', id=51, color=[255, 255, 255], type='', swap=''), + 52: + dict(name='face-29', id=52, color=[255, 255, 255], type='', swap=''), + 53: + dict(name='face-30', id=53, color=[255, 255, 255], type='', swap=''), + 54: + dict( + name='face-31', + id=54, + color=[255, 255, 255], + type='', + swap='face-35'), + 55: + dict( + name='face-32', + id=55, + color=[255, 255, 255], + type='', + swap='face-34'), + 56: + dict(name='face-33', id=56, color=[255, 255, 255], type='', swap=''), + 57: + dict( + name='face-34', + id=57, + color=[255, 255, 255], + type='', + swap='face-32'), + 58: + dict( + name='face-35', + id=58, + color=[255, 255, 255], + type='', + swap='face-31'), + 59: + dict( + name='face-36', + id=59, + color=[255, 255, 255], + type='', + swap='face-45'), + 60: + dict( + name='face-37', + id=60, + color=[255, 255, 255], + type='', + swap='face-44'), + 61: + dict( + name='face-38', + id=61, + color=[255, 255, 255], + type='', + swap='face-43'), + 62: + dict( + name='face-39', + id=62, + color=[255, 255, 255], + type='', + swap='face-42'), + 63: + dict( + name='face-40', + id=63, + color=[255, 255, 255], + type='', + swap='face-47'), + 64: + dict( + name='face-41', + id=64, + color=[255, 255, 255], + type='', + swap='face-46'), + 65: + dict( + name='face-42', + id=65, + color=[255, 255, 255], + type='', + swap='face-39'), + 66: + dict( + name='face-43', + id=66, + color=[255, 255, 255], + type='', + swap='face-38'), + 67: + dict( + name='face-44', + id=67, + color=[255, 255, 255], + type='', + swap='face-37'), + 68: + dict( + name='face-45', + id=68, + color=[255, 255, 255], + type='', + swap='face-36'), + 69: + dict( + name='face-46', + id=69, + color=[255, 255, 255], + type='', + swap='face-41'), + 70: + dict( + name='face-47', + id=70, + color=[255, 255, 255], + type='', + swap='face-40'), + 71: + dict( + name='face-48', + id=71, + color=[255, 255, 255], + type='', + swap='face-54'), + 72: + dict( + name='face-49', + id=72, + color=[255, 255, 255], + type='', + swap='face-53'), + 73: + dict( + name='face-50', + id=73, + color=[255, 255, 255], + type='', + swap='face-52'), + 74: + dict(name='face-51', id=74, color=[255, 255, 255], type='', swap=''), + 75: + dict( + name='face-52', + id=75, + color=[255, 255, 255], + type='', + swap='face-50'), + 76: + dict( + name='face-53', + id=76, + color=[255, 255, 255], + type='', + swap='face-49'), + 77: + dict( + name='face-54', + id=77, + color=[255, 255, 255], + type='', + swap='face-48'), + 78: + dict( + name='face-55', + id=78, + color=[255, 255, 255], + type='', + swap='face-59'), + 79: + dict( + name='face-56', + id=79, + color=[255, 255, 255], + type='', + swap='face-58'), + 80: + dict(name='face-57', id=80, color=[255, 255, 255], type='', swap=''), + 81: + dict( + name='face-58', + id=81, + color=[255, 255, 255], + type='', + swap='face-56'), + 82: + dict( + name='face-59', + id=82, + color=[255, 255, 255], + type='', + swap='face-55'), + 
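+        # The face points follow the standard 68-landmark layout
+        # (jaw 0-16, brows 17-26, nose 27-35, eyes 36-47, mouth 48-67);
+        # each 'swap' entry is the landmark mirrored across the face
+        # midline, e.g. face-55 <-> face-59 on the lower lip above.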
83: + dict( + name='face-60', + id=83, + color=[255, 255, 255], + type='', + swap='face-64'), + 84: + dict( + name='face-61', + id=84, + color=[255, 255, 255], + type='', + swap='face-63'), + 85: + dict(name='face-62', id=85, color=[255, 255, 255], type='', swap=''), + 86: + dict( + name='face-63', + id=86, + color=[255, 255, 255], + type='', + swap='face-61'), + 87: + dict( + name='face-64', + id=87, + color=[255, 255, 255], + type='', + swap='face-60'), + 88: + dict( + name='face-65', + id=88, + color=[255, 255, 255], + type='', + swap='face-67'), + 89: + dict(name='face-66', id=89, color=[255, 255, 255], type='', swap=''), + 90: + dict( + name='face-67', + id=90, + color=[255, 255, 255], + type='', + swap='face-65'), + 91: + dict( + name='left_hand_root', + id=91, + color=[255, 255, 255], + type='', + swap='right_hand_root'), + 92: + dict( + name='left_thumb1', + id=92, + color=[255, 128, 0], + type='', + swap='right_thumb1'), + 93: + dict( + name='left_thumb2', + id=93, + color=[255, 128, 0], + type='', + swap='right_thumb2'), + 94: + dict( + name='left_thumb3', + id=94, + color=[255, 128, 0], + type='', + swap='right_thumb3'), + 95: + dict( + name='left_thumb4', + id=95, + color=[255, 128, 0], + type='', + swap='right_thumb4'), + 96: + dict( + name='left_forefinger1', + id=96, + color=[255, 153, 255], + type='', + swap='right_forefinger1'), + 97: + dict( + name='left_forefinger2', + id=97, + color=[255, 153, 255], + type='', + swap='right_forefinger2'), + 98: + dict( + name='left_forefinger3', + id=98, + color=[255, 153, 255], + type='', + swap='right_forefinger3'), + 99: + dict( + name='left_forefinger4', + id=99, + color=[255, 153, 255], + type='', + swap='right_forefinger4'), + 100: + dict( + name='left_middle_finger1', + id=100, + color=[102, 178, 255], + type='', + swap='right_middle_finger1'), + 101: + dict( + name='left_middle_finger2', + id=101, + color=[102, 178, 255], + type='', + swap='right_middle_finger2'), + 102: + dict( + name='left_middle_finger3', + id=102, + color=[102, 178, 255], + type='', + swap='right_middle_finger3'), + 103: + dict( + name='left_middle_finger4', + id=103, + color=[102, 178, 255], + type='', + swap='right_middle_finger4'), + 104: + dict( + name='left_ring_finger1', + id=104, + color=[255, 51, 51], + type='', + swap='right_ring_finger1'), + 105: + dict( + name='left_ring_finger2', + id=105, + color=[255, 51, 51], + type='', + swap='right_ring_finger2'), + 106: + dict( + name='left_ring_finger3', + id=106, + color=[255, 51, 51], + type='', + swap='right_ring_finger3'), + 107: + dict( + name='left_ring_finger4', + id=107, + color=[255, 51, 51], + type='', + swap='right_ring_finger4'), + 108: + dict( + name='left_pinky_finger1', + id=108, + color=[0, 255, 0], + type='', + swap='right_pinky_finger1'), + 109: + dict( + name='left_pinky_finger2', + id=109, + color=[0, 255, 0], + type='', + swap='right_pinky_finger2'), + 110: + dict( + name='left_pinky_finger3', + id=110, + color=[0, 255, 0], + type='', + swap='right_pinky_finger3'), + 111: + dict( + name='left_pinky_finger4', + id=111, + color=[0, 255, 0], + type='', + swap='right_pinky_finger4'), + 112: + dict( + name='right_hand_root', + id=112, + color=[255, 255, 255], + type='', + swap='left_hand_root'), + 113: + dict( + name='right_thumb1', + id=113, + color=[255, 128, 0], + type='', + swap='left_thumb1'), + 114: + dict( + name='right_thumb2', + id=114, + color=[255, 128, 0], + type='', + swap='left_thumb2'), + 115: + dict( + name='right_thumb3', + id=115, + color=[255, 128, 0], + type='', + 
swap='left_thumb3'), + 116: + dict( + name='right_thumb4', + id=116, + color=[255, 128, 0], + type='', + swap='left_thumb4'), + 117: + dict( + name='right_forefinger1', + id=117, + color=[255, 153, 255], + type='', + swap='left_forefinger1'), + 118: + dict( + name='right_forefinger2', + id=118, + color=[255, 153, 255], + type='', + swap='left_forefinger2'), + 119: + dict( + name='right_forefinger3', + id=119, + color=[255, 153, 255], + type='', + swap='left_forefinger3'), + 120: + dict( + name='right_forefinger4', + id=120, + color=[255, 153, 255], + type='', + swap='left_forefinger4'), + 121: + dict( + name='right_middle_finger1', + id=121, + color=[102, 178, 255], + type='', + swap='left_middle_finger1'), + 122: + dict( + name='right_middle_finger2', + id=122, + color=[102, 178, 255], + type='', + swap='left_middle_finger2'), + 123: + dict( + name='right_middle_finger3', + id=123, + color=[102, 178, 255], + type='', + swap='left_middle_finger3'), + 124: + dict( + name='right_middle_finger4', + id=124, + color=[102, 178, 255], + type='', + swap='left_middle_finger4'), + 125: + dict( + name='right_ring_finger1', + id=125, + color=[255, 51, 51], + type='', + swap='left_ring_finger1'), + 126: + dict( + name='right_ring_finger2', + id=126, + color=[255, 51, 51], + type='', + swap='left_ring_finger2'), + 127: + dict( + name='right_ring_finger3', + id=127, + color=[255, 51, 51], + type='', + swap='left_ring_finger3'), + 128: + dict( + name='right_ring_finger4', + id=128, + color=[255, 51, 51], + type='', + swap='left_ring_finger4'), + 129: + dict( + name='right_pinky_finger1', + id=129, + color=[0, 255, 0], + type='', + swap='left_pinky_finger1'), + 130: + dict( + name='right_pinky_finger2', + id=130, + color=[0, 255, 0], + type='', + swap='left_pinky_finger2'), + 131: + dict( + name='right_pinky_finger3', + id=131, + color=[0, 255, 0], + type='', + swap='left_pinky_finger3'), + 132: + dict( + name='right_pinky_finger4', + id=132, + color=[0, 255, 0], + type='', + swap='left_pinky_finger4') + }, + skeleton_info={ + 0: + dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]), + 2: + dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]), + 3: + dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]), + 4: + dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]), + 5: + dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]), + 6: + dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]), + 7: + dict( + link=('left_shoulder', 'right_shoulder'), + id=7, + color=[51, 153, 255]), + 8: + dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]), + 9: + dict( + link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]), + 10: + dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('left_eye', 'right_eye'), id=12, color=[51, 153, 255]), + 13: + dict(link=('nose', 'left_eye'), id=13, color=[51, 153, 255]), + 14: + dict(link=('nose', 'right_eye'), id=14, color=[51, 153, 255]), + 15: + dict(link=('left_eye', 'left_ear'), id=15, color=[51, 153, 255]), + 16: + dict(link=('right_eye', 'right_ear'), id=16, color=[51, 153, 255]), + 17: + dict(link=('left_ear', 'left_shoulder'), id=17, color=[51, 153, 255]), + 18: + dict( + link=('right_ear', 'right_shoulder'), id=18, color=[51, 153, 255]), + 19: + dict(link=('left_ankle', 'left_big_toe'), id=19, 
color=[0, 255, 0]), + 20: + dict(link=('left_ankle', 'left_small_toe'), id=20, color=[0, 255, 0]), + 21: + dict(link=('left_ankle', 'left_heel'), id=21, color=[0, 255, 0]), + 22: + dict( + link=('right_ankle', 'right_big_toe'), id=22, color=[255, 128, 0]), + 23: + dict( + link=('right_ankle', 'right_small_toe'), + id=23, + color=[255, 128, 0]), + 24: + dict(link=('right_ankle', 'right_heel'), id=24, color=[255, 128, 0]), + 25: + dict( + link=('left_hand_root', 'left_thumb1'), id=25, color=[255, 128, + 0]), + 26: + dict(link=('left_thumb1', 'left_thumb2'), id=26, color=[255, 128, 0]), + 27: + dict(link=('left_thumb2', 'left_thumb3'), id=27, color=[255, 128, 0]), + 28: + dict(link=('left_thumb3', 'left_thumb4'), id=28, color=[255, 128, 0]), + 29: + dict( + link=('left_hand_root', 'left_forefinger1'), + id=29, + color=[255, 153, 255]), + 30: + dict( + link=('left_forefinger1', 'left_forefinger2'), + id=30, + color=[255, 153, 255]), + 31: + dict( + link=('left_forefinger2', 'left_forefinger3'), + id=31, + color=[255, 153, 255]), + 32: + dict( + link=('left_forefinger3', 'left_forefinger4'), + id=32, + color=[255, 153, 255]), + 33: + dict( + link=('left_hand_root', 'left_middle_finger1'), + id=33, + color=[102, 178, 255]), + 34: + dict( + link=('left_middle_finger1', 'left_middle_finger2'), + id=34, + color=[102, 178, 255]), + 35: + dict( + link=('left_middle_finger2', 'left_middle_finger3'), + id=35, + color=[102, 178, 255]), + 36: + dict( + link=('left_middle_finger3', 'left_middle_finger4'), + id=36, + color=[102, 178, 255]), + 37: + dict( + link=('left_hand_root', 'left_ring_finger1'), + id=37, + color=[255, 51, 51]), + 38: + dict( + link=('left_ring_finger1', 'left_ring_finger2'), + id=38, + color=[255, 51, 51]), + 39: + dict( + link=('left_ring_finger2', 'left_ring_finger3'), + id=39, + color=[255, 51, 51]), + 40: + dict( + link=('left_ring_finger3', 'left_ring_finger4'), + id=40, + color=[255, 51, 51]), + 41: + dict( + link=('left_hand_root', 'left_pinky_finger1'), + id=41, + color=[0, 255, 0]), + 42: + dict( + link=('left_pinky_finger1', 'left_pinky_finger2'), + id=42, + color=[0, 255, 0]), + 43: + dict( + link=('left_pinky_finger2', 'left_pinky_finger3'), + id=43, + color=[0, 255, 0]), + 44: + dict( + link=('left_pinky_finger3', 'left_pinky_finger4'), + id=44, + color=[0, 255, 0]), + 45: + dict( + link=('right_hand_root', 'right_thumb1'), + id=45, + color=[255, 128, 0]), + 46: + dict( + link=('right_thumb1', 'right_thumb2'), id=46, color=[255, 128, 0]), + 47: + dict( + link=('right_thumb2', 'right_thumb3'), id=47, color=[255, 128, 0]), + 48: + dict( + link=('right_thumb3', 'right_thumb4'), id=48, color=[255, 128, 0]), + 49: + dict( + link=('right_hand_root', 'right_forefinger1'), + id=49, + color=[255, 153, 255]), + 50: + dict( + link=('right_forefinger1', 'right_forefinger2'), + id=50, + color=[255, 153, 255]), + 51: + dict( + link=('right_forefinger2', 'right_forefinger3'), + id=51, + color=[255, 153, 255]), + 52: + dict( + link=('right_forefinger3', 'right_forefinger4'), + id=52, + color=[255, 153, 255]), + 53: + dict( + link=('right_hand_root', 'right_middle_finger1'), + id=53, + color=[102, 178, 255]), + 54: + dict( + link=('right_middle_finger1', 'right_middle_finger2'), + id=54, + color=[102, 178, 255]), + 55: + dict( + link=('right_middle_finger2', 'right_middle_finger3'), + id=55, + color=[102, 178, 255]), + 56: + dict( + link=('right_middle_finger3', 'right_middle_finger4'), + id=56, + color=[102, 178, 255]), + 57: + dict( + link=('right_hand_root', 'right_ring_finger1'), + 
id=57, + color=[255, 51, 51]), + 58: + dict( + link=('right_ring_finger1', 'right_ring_finger2'), + id=58, + color=[255, 51, 51]), + 59: + dict( + link=('right_ring_finger2', 'right_ring_finger3'), + id=59, + color=[255, 51, 51]), + 60: + dict( + link=('right_ring_finger3', 'right_ring_finger4'), + id=60, + color=[255, 51, 51]), + 61: + dict( + link=('right_hand_root', 'right_pinky_finger1'), + id=61, + color=[0, 255, 0]), + 62: + dict( + link=('right_pinky_finger1', 'right_pinky_finger2'), + id=62, + color=[0, 255, 0]), + 63: + dict( + link=('right_pinky_finger2', 'right_pinky_finger3'), + id=63, + color=[0, 255, 0]), + 64: + dict( + link=('right_pinky_finger3', 'right_pinky_finger4'), + id=64, + color=[0, 255, 0]) + }, + joint_weights=[1.] * 133, + # 'https://github.com/jin-s13/COCO-WholeBody/blob/master/' + # 'evaluation/myeval_wholebody.py#L175' + sigmas=[ + 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062, + 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089, 0.068, 0.066, 0.066, + 0.092, 0.094, 0.094, 0.042, 0.043, 0.044, 0.043, 0.040, 0.035, 0.031, + 0.025, 0.020, 0.023, 0.029, 0.032, 0.037, 0.038, 0.043, 0.041, 0.045, + 0.013, 0.012, 0.011, 0.011, 0.012, 0.012, 0.011, 0.011, 0.013, 0.015, + 0.009, 0.007, 0.007, 0.007, 0.012, 0.009, 0.008, 0.016, 0.010, 0.017, + 0.011, 0.009, 0.011, 0.009, 0.007, 0.013, 0.008, 0.011, 0.012, 0.010, + 0.034, 0.008, 0.008, 0.009, 0.008, 0.008, 0.007, 0.010, 0.008, 0.009, + 0.009, 0.009, 0.007, 0.007, 0.008, 0.011, 0.008, 0.008, 0.008, 0.01, + 0.008, 0.029, 0.022, 0.035, 0.037, 0.047, 0.026, 0.025, 0.024, 0.035, + 0.018, 0.024, 0.022, 0.026, 0.017, 0.021, 0.021, 0.032, 0.02, 0.019, + 0.022, 0.031, 0.029, 0.022, 0.035, 0.037, 0.047, 0.026, 0.025, 0.024, + 0.035, 0.018, 0.024, 0.022, 0.026, 0.017, 0.021, 0.021, 0.032, 0.02, + 0.019, 0.022, 0.031 + ]) diff --git a/configs/_base_/datasets/mpii_info.py b/configs/_base_/datasets/mpii_info.py new file mode 100644 index 0000000..8090992 --- /dev/null +++ b/configs/_base_/datasets/mpii_info.py @@ -0,0 +1,155 @@ +mpii_info = dict( + dataset_name='mpii', + paper_info=dict( + author='Mykhaylo Andriluka and Leonid Pishchulin and ' + 'Peter Gehler and Schiele, Bernt', + title='2D Human Pose Estimation: New Benchmark and ' + 'State of the Art Analysis', + container='IEEE Conference on Computer Vision and ' + 'Pattern Recognition (CVPR)', + year='2014', + homepage='http://human-pose.mpi-inf.mpg.de/', + ), + keypoint_info={ + 0: + dict( + name='right_ankle', + id=0, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 1: + dict( + name='right_knee', + id=1, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 2: + dict( + name='right_hip', + id=2, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 3: + dict( + name='left_hip', + id=3, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 4: + dict( + name='left_knee', + id=4, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 5: + dict( + name='left_ankle', + id=5, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 6: + dict(name='pelvis', id=6, color=[51, 153, 255], type='lower', swap=''), + 7: + dict(name='thorax', id=7, color=[51, 153, 255], type='upper', swap=''), + 8: + dict( + name='upper_neck', + id=8, + color=[51, 153, 255], + type='upper', + swap=''), + 9: + dict( + name='head_top', id=9, color=[51, 153, 255], type='upper', + swap=''), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='right_elbow', + id=11, + 
color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 12: + dict( + name='right_shoulder', + id=12, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 13: + dict( + name='left_shoulder', + id=13, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 14: + dict( + name='left_elbow', + id=14, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 15: + dict( + name='left_wrist', + id=15, + color=[0, 255, 0], + type='upper', + swap='right_wrist') + }, + skeleton_info={ + 0: + dict(link=('right_ankle', 'right_knee'), id=0, color=[255, 128, 0]), + 1: + dict(link=('right_knee', 'right_hip'), id=1, color=[255, 128, 0]), + 2: + dict(link=('right_hip', 'pelvis'), id=2, color=[255, 128, 0]), + 3: + dict(link=('pelvis', 'left_hip'), id=3, color=[0, 255, 0]), + 4: + dict(link=('left_hip', 'left_knee'), id=4, color=[0, 255, 0]), + 5: + dict(link=('left_knee', 'left_ankle'), id=5, color=[0, 255, 0]), + 6: + dict(link=('pelvis', 'thorax'), id=6, color=[51, 153, 255]), + 7: + dict(link=('thorax', 'upper_neck'), id=7, color=[51, 153, 255]), + 8: + dict(link=('upper_neck', 'head_top'), id=8, color=[51, 153, 255]), + 9: + dict(link=('upper_neck', 'right_shoulder'), id=9, color=[255, 128, 0]), + 10: + dict( + link=('right_shoulder', 'right_elbow'), id=10, color=[255, 128, + 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('upper_neck', 'left_shoulder'), id=12, color=[0, 255, 0]), + 13: + dict(link=('left_shoulder', 'left_elbow'), id=13, color=[0, 255, 0]), + 14: + dict(link=('left_elbow', 'left_wrist'), id=14, color=[0, 255, 0]) + }, + joint_weights=[ + 1.5, 1.2, 1., 1., 1.2, 1.5, 1., 1., 1., 1., 1.5, 1.2, 1., 1., 1.2, 1.5 + ], + # Adapted from COCO dataset. + sigmas=[ + 0.089, 0.083, 0.107, 0.107, 0.083, 0.089, 0.026, 0.026, 0.026, 0.026, + 0.062, 0.072, 0.179, 0.179, 0.072, 0.062 + ]) diff --git a/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/ViTPose_large_coco_256x192.py b/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/ViTPose_large_coco_256x192.py index 0753a3c..7f92e06 100644 --- a/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/ViTPose_large_coco_256x192.py +++ b/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/ViTPose_large_coco_256x192.py @@ -7,7 +7,7 @@ evaluation = dict(interval=10, metric='mAP', save_best='AP') optimizer = dict(type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1, constructor='LayerDecayOptimizerConstructor', paramwise_cfg=dict( - num_layers=16, + num_layers=24, layer_decay_rate=0.8, custom_keys={ 'bias': dict(decay_multi=0.), diff --git a/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/vitPose+_base_coco+aic+mpii+ap10k+apt36k+wholebody_256x192_udp.py b/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/vitPose+_base_coco+aic+mpii+ap10k+apt36k+wholebody_256x192_udp.py new file mode 100644 index 0000000..01677a4 --- /dev/null +++ b/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/vitPose+_base_coco+aic+mpii+ap10k+apt36k+wholebody_256x192_udp.py @@ -0,0 +1,500 @@ +_base_ = [ + '../../../../_base_/default_runtime.py', + '../../../../_base_/datasets/coco.py', + '../../../../_base_/datasets/aic_info.py', + '../../../../_base_/datasets/mpii_info.py', + '../../../../_base_/datasets/ap10k_info.py', + '../../../../_base_/datasets/coco_wholebody_info.py' +] +evaluation = dict(interval=10, metric='mAP', save_best='AP') + +optimizer = dict(type='AdamW', lr=1e-3, betas=(0.9, 0.999), weight_decay=0.1, + constructor='LayerDecayOptimizerConstructor', + 
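+                 # LayerDecayOptimizerConstructor gives each transformer block
+                 # its own learning-rate multiplier, roughly
+                 #   lr_i = lr * layer_decay_rate ** (num_layers + 1 - i),
+                 # so shallow blocks of the pretrained ViT update more
+                 # conservatively. num_layers must match the backbone depth
+                 # (12 for the ViT-B backbone below; the ViTPose_large hunk
+                 # above corrects it to 24 for the 24-block ViT-L). The
+                 # per-key overrides below are read from 'decay_mult'.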
paramwise_cfg=dict( + num_layers=12, + layer_decay_rate=0.75, + custom_keys={ + 'bias': dict(decay_multi=0.), + 'pos_embed': dict(decay_mult=0.), + 'relative_position_bias_table': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.) + } + ) + ) + +optimizer_config = dict(grad_clip=dict(max_norm=1., norm_type=2)) + +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[170, 200]) +total_epochs = 210 +target_type = 'GaussianHeatmap' +channel_cfg = dict( + num_output_channels=17, + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) +aic_channel_cfg = dict( + num_output_channels=14, + dataset_joints=14, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], + ], + inference_channel=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]) +mpii_channel_cfg = dict( + num_output_channels=16, + dataset_joints=16, + dataset_channel=list(range(16)), + inference_channel=list(range(16))) +crowdpose_channel_cfg = dict( + num_output_channels=14, + dataset_joints=14, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], + ], + inference_channel=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]) +ap10k_channel_cfg = dict( + num_output_channels=17, + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) +cocowholebody_channel_cfg = dict( + num_output_channels=133, + dataset_joints=133, + dataset_channel=[ + list(range(133)), + ], + inference_channel=list(range(133))) + + +# model settings +model = dict( + type='TopDownMoE', + pretrained=None, + backbone=dict( + type='ViTMoE', + img_size=(256, 192), + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + ratio=1, + use_checkpoint=False, + mlp_ratio=4, + qkv_bias=True, + drop_path_rate=0.3, + num_expert=6, + part_features=192 + ), + keypoint_head=dict( + type='TopdownHeatmapSimpleHead', + in_channels=768, + num_deconv_layers=2, + num_deconv_filters=(256, 256), + num_deconv_kernels=(4, 4), + extra=dict(final_conv_kernel=1, ), + out_channels=channel_cfg['num_output_channels'], + loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), + associate_keypoint_head=[ + dict( + type='TopdownHeatmapSimpleHead', + in_channels=768, + num_deconv_layers=2, + num_deconv_filters=(256, 256), + num_deconv_kernels=(4, 4), + extra=dict(final_conv_kernel=1, ), + out_channels=aic_channel_cfg['num_output_channels'], + loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), + dict( + type='TopdownHeatmapSimpleHead', + in_channels=768, + num_deconv_layers=2, + num_deconv_filters=(256, 256), + num_deconv_kernels=(4, 4), + extra=dict(final_conv_kernel=1, ), + out_channels=mpii_channel_cfg['num_output_channels'], + loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), + dict( + type='TopdownHeatmapSimpleHead', + in_channels=768, + num_deconv_layers=2, + num_deconv_filters=(256, 256), + num_deconv_kernels=(4, 4), + extra=dict(final_conv_kernel=1, ), + out_channels=crowdpose_channel_cfg['num_output_channels'], + loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), + dict( + type='TopdownHeatmapSimpleHead', + in_channels=768, + num_deconv_layers=2, + num_deconv_filters=(256, 256), + num_deconv_kernels=(4, 4), + extra=dict(final_conv_kernel=1, ), + 
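+            # One lightweight decoder head per auxiliary training set (AIC,
+            # MPII, CrowdPose, AP-10K, APT-36K, COCO-WholeBody); TopDownMoE
+            # is assumed to dispatch each sample to the head whose position
+            # matches the 'dataset_idx' set in the corresponding data_cfg.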
out_channels=ap10k_channel_cfg['num_output_channels'], + loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), + dict( + type='TopdownHeatmapSimpleHead', + in_channels=768, + num_deconv_layers=2, + num_deconv_filters=(256, 256), + num_deconv_kernels=(4, 4), + extra=dict(final_conv_kernel=1, ), + out_channels=ap10k_channel_cfg['num_output_channels'], + loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), + dict( + type='TopdownHeatmapSimpleHead', + in_channels=768, + num_deconv_layers=2, + num_deconv_filters=(256, 256), + num_deconv_kernels=(4, 4), + extra=dict(final_conv_kernel=1, ), + out_channels=cocowholebody_channel_cfg['num_output_channels'], + loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), + ], + train_cfg=dict(), + test_cfg=dict( + flip_test=True, + post_process='default', + shift_heatmap=False, + target_type=target_type, + modulate_kernel=11, + use_udp=True)) + +data_cfg = dict( + image_size=[192, 256], + heatmap_size=[48, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + use_gt_bbox=False, + det_bbox_thr=0.0, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + max_num_joints=133, + dataset_idx=0, +) + +aic_data_cfg = dict( + image_size=[192, 256], + heatmap_size=[48, 64], + num_output_channels=aic_channel_cfg['num_output_channels'], + num_joints=aic_channel_cfg['dataset_joints'], + dataset_channel=aic_channel_cfg['dataset_channel'], + inference_channel=aic_channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + use_gt_bbox=True, + det_bbox_thr=0.0, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + max_num_joints=133, + dataset_idx=1, +) + +mpii_data_cfg = dict( + image_size=[192, 256], + heatmap_size=[48, 64], + num_output_channels=mpii_channel_cfg['num_output_channels'], + num_joints=mpii_channel_cfg['dataset_joints'], + dataset_channel=mpii_channel_cfg['dataset_channel'], + inference_channel=mpii_channel_cfg['inference_channel'], + max_num_joints=133, + dataset_idx=2, + use_gt_bbox=True, + bbox_file=None, +) + +ap10k_data_cfg = dict( + image_size=[192, 256], + heatmap_size=[48, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + use_gt_bbox=True, + det_bbox_thr=0.0, + bbox_file='', + max_num_joints=133, + dataset_idx=3, +) + +ap36k_data_cfg = dict( + image_size=[192, 256], + heatmap_size=[48, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + use_gt_bbox=True, + det_bbox_thr=0.0, + bbox_file='', + max_num_joints=133, + dataset_idx=4, +) + +cocowholebody_data_cfg = dict( + image_size=[192, 256], + heatmap_size=[48, 64], + num_output_channels=cocowholebody_channel_cfg['num_output_channels'], + num_joints=cocowholebody_channel_cfg['dataset_joints'], + dataset_channel=cocowholebody_channel_cfg['dataset_channel'], + 
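+    # Every data_cfg pads targets to max_num_joints=133 (the COCO-WholeBody
+    # joint count) so the heterogeneous datasets can share one batch layout,
+    # and tags its samples with a unique dataset_idx that is used to pick
+    # the matching keypoint head and FFN expert.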
inference_channel=cocowholebody_channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + use_gt_bbox=False, + det_bbox_thr=0.0, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + dataset_idx=5, + max_num_joints=133, +) + +cocowholebody_train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTarget', sigma=2), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx' + ]), +] + +ap10k_train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTarget', sigma=2), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx' + ]), +] + +aic_train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTarget', sigma=2), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx' + ]), +] + +mpii_train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine', use_udp=True), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='TopDownGenerateTarget', + sigma=2, + encoding='UDP', + target_type=target_type), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'flip_pairs', 'dataset_idx' + ]), +] + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine', use_udp=True), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='TopDownGenerateTarget', + 
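+        # encoding='UDP' (Unbiased Data Processing, Huang et al., CVPR 2020)
+        # generates targets in the unbiased coordinate system; it is paired
+        # with use_udp=True in TopDownAffine above and in test_cfg.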
sigma=2, + encoding='UDP', + target_type=target_type), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx' + ]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine', use_udp=True), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=['img'], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs', 'dataset_idx' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +aic_data_root = 'data/aic' +mpii_data_root = 'data/mpii' +ap10k_data_root = 'data/ap10k' +ap36k_data_root = 'data/ap36k' + +data = dict( + samples_per_gpu=128, + workers_per_gpu=8, + val_dataloader=dict(samples_per_gpu=64), + test_dataloader=dict(samples_per_gpu=64), + train=[ + dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline, + dataset_info={{_base_.dataset_info}}), + dict( + type='TopDownAicDataset', + ann_file=f'{aic_data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{aic_data_root}/ai_challenger_keypoint_train_20170909/' + 'keypoint_train_images_20170902/', + data_cfg=aic_data_cfg, + pipeline=aic_train_pipeline, + dataset_info={{_base_.aic_info}}), + dict( + type='TopDownMpiiDataset', + ann_file=f'{mpii_data_root}/annotations/mpii_train.json', + img_prefix=f'{mpii_data_root}/images/', + data_cfg=mpii_data_cfg, + pipeline=mpii_train_pipeline, + dataset_info={{_base_.mpii_info}}), + dict( + type='AnimalAP10KDataset', + ann_file=f'{ap10k_data_root}/annotations/ap10k-train-split1.json', + img_prefix=f'{ap10k_data_root}/data/', + data_cfg=ap10k_data_cfg, + pipeline=ap10k_train_pipeline, + dataset_info={{_base_.ap10k_info}}), + dict( + type='AnimalAP10KDataset', + ann_file=f'{ap36k_data_root}/annotations/train_annotations_1.json', + img_prefix=f'{ap36k_data_root}/', + data_cfg=ap36k_data_cfg, + pipeline=ap10k_train_pipeline, + dataset_info={{_base_.ap10k_info}}), + dict( + type='TopDownCocoWholeBodyDataset', + ann_file=f'{data_root}/annotations/coco_wholebody_train_v1.0.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=cocowholebody_data_cfg, + pipeline=cocowholebody_train_pipeline, + dataset_info={{_base_.cocowholebody_info}}), + ], + val=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline, + dataset_info={{_base_.dataset_info}}), + test=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=test_pipeline, + dataset_info={{_base_.dataset_info}}), +) + diff --git a/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/vitPose+_huge_coco+aic+mpii+ap10k+apt36k+wholebody_256x192_udp.py b/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/vitPose+_huge_coco+aic+mpii+ap10k+apt36k+wholebody_256x192_udp.py new file mode 100644 index 0000000..2b9dbee --- /dev/null +++ b/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/vitPose+_huge_coco+aic+mpii+ap10k+apt36k+wholebody_256x192_udp.py @@ -0,0 +1,500 @@ +_base_ = [ + '../../../../_base_/default_runtime.py', + 
'../../../../_base_/datasets/coco.py', + '../../../../_base_/datasets/aic_info.py', + '../../../../_base_/datasets/mpii_info.py', + '../../../../_base_/datasets/ap10k_info.py', + '../../../../_base_/datasets/coco_wholebody_info.py' +] +evaluation = dict(interval=10, metric='mAP', save_best='AP') + +optimizer = dict(type='AdamW', lr=1e-3, betas=(0.9, 0.999), weight_decay=0.1, + constructor='LayerDecayOptimizerConstructor', + paramwise_cfg=dict( + num_layers=32, + layer_decay_rate=0.8, + custom_keys={ + 'bias': dict(decay_multi=0.), + 'pos_embed': dict(decay_mult=0.), + 'relative_position_bias_table': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.) + } + ) + ) + +optimizer_config = dict(grad_clip=dict(max_norm=1., norm_type=2)) + +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[170, 200]) +total_epochs = 210 +target_type = 'GaussianHeatmap' +channel_cfg = dict( + num_output_channels=17, + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) +aic_channel_cfg = dict( + num_output_channels=14, + dataset_joints=14, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], + ], + inference_channel=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]) +mpii_channel_cfg = dict( + num_output_channels=16, + dataset_joints=16, + dataset_channel=list(range(16)), + inference_channel=list(range(16))) +crowdpose_channel_cfg = dict( + num_output_channels=14, + dataset_joints=14, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], + ], + inference_channel=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]) +ap10k_channel_cfg = dict( + num_output_channels=17, + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) +cocowholebody_channel_cfg = dict( + num_output_channels=133, + dataset_joints=133, + dataset_channel=[ + list(range(133)), + ], + inference_channel=list(range(133))) + + +# model settings +model = dict( + type='TopDownMoE', + pretrained=None, + backbone=dict( + type='ViTMoE', + img_size=(256, 192), + patch_size=16, + embed_dim=1280, + depth=32, + num_heads=16, + ratio=1, + use_checkpoint=False, + mlp_ratio=4, + qkv_bias=True, + drop_path_rate=0.55, + num_expert=6, + part_features=320 + ), + keypoint_head=dict( + type='TopdownHeatmapSimpleHead', + in_channels=1280, + num_deconv_layers=2, + num_deconv_filters=(256, 256), + num_deconv_kernels=(4, 4), + extra=dict(final_conv_kernel=1, ), + out_channels=channel_cfg['num_output_channels'], + loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), + associate_keypoint_head=[ + dict( + type='TopdownHeatmapSimpleHead', + in_channels=1280, + num_deconv_layers=2, + num_deconv_filters=(256, 256), + num_deconv_kernels=(4, 4), + extra=dict(final_conv_kernel=1, ), + out_channels=aic_channel_cfg['num_output_channels'], + loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), + dict( + type='TopdownHeatmapSimpleHead', + in_channels=1280, + num_deconv_layers=2, + num_deconv_filters=(256, 256), + num_deconv_kernels=(4, 4), + extra=dict(final_conv_kernel=1, ), + out_channels=mpii_channel_cfg['num_output_channels'], + loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), + dict( + type='TopdownHeatmapSimpleHead', + in_channels=1280, + num_deconv_layers=2, + 
num_deconv_filters=(256, 256), + num_deconv_kernels=(4, 4), + extra=dict(final_conv_kernel=1, ), + out_channels=crowdpose_channel_cfg['num_output_channels'], + loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), + dict( + type='TopdownHeatmapSimpleHead', + in_channels=1280, + num_deconv_layers=2, + num_deconv_filters=(256, 256), + num_deconv_kernels=(4, 4), + extra=dict(final_conv_kernel=1, ), + out_channels=ap10k_channel_cfg['num_output_channels'], + loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), + dict( + type='TopdownHeatmapSimpleHead', + in_channels=1280, + num_deconv_layers=2, + num_deconv_filters=(256, 256), + num_deconv_kernels=(4, 4), + extra=dict(final_conv_kernel=1, ), + out_channels=ap10k_channel_cfg['num_output_channels'], + loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), + dict( + type='TopdownHeatmapSimpleHead', + in_channels=1280, + num_deconv_layers=2, + num_deconv_filters=(256, 256), + num_deconv_kernels=(4, 4), + extra=dict(final_conv_kernel=1, ), + out_channels=cocowholebody_channel_cfg['num_output_channels'], + loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), + ], + train_cfg=dict(), + test_cfg=dict( + flip_test=True, + post_process='default', + shift_heatmap=False, + target_type=target_type, + modulate_kernel=11, + use_udp=True)) + +data_cfg = dict( + image_size=[192, 256], + heatmap_size=[48, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + use_gt_bbox=False, + det_bbox_thr=0.0, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + max_num_joints=133, + dataset_idx=0, +) + +aic_data_cfg = dict( + image_size=[192, 256], + heatmap_size=[48, 64], + num_output_channels=aic_channel_cfg['num_output_channels'], + num_joints=aic_channel_cfg['dataset_joints'], + dataset_channel=aic_channel_cfg['dataset_channel'], + inference_channel=aic_channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + use_gt_bbox=True, + det_bbox_thr=0.0, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + max_num_joints=133, + dataset_idx=1, +) + +mpii_data_cfg = dict( + image_size=[192, 256], + heatmap_size=[48, 64], + num_output_channels=mpii_channel_cfg['num_output_channels'], + num_joints=mpii_channel_cfg['dataset_joints'], + dataset_channel=mpii_channel_cfg['dataset_channel'], + inference_channel=mpii_channel_cfg['inference_channel'], + max_num_joints=133, + dataset_idx=2, + use_gt_bbox=True, + bbox_file=None, +) + +ap10k_data_cfg = dict( + image_size=[192, 256], + heatmap_size=[48, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + use_gt_bbox=True, + det_bbox_thr=0.0, + bbox_file='', + max_num_joints=133, + dataset_idx=3, +) + +ap36k_data_cfg = dict( + image_size=[192, 256], + heatmap_size=[48, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + 
oks_thr=0.9, + vis_thr=0.2, + use_gt_bbox=True, + det_bbox_thr=0.0, + bbox_file='', + max_num_joints=133, + dataset_idx=4, +) + +cocowholebody_data_cfg = dict( + image_size=[192, 256], + heatmap_size=[48, 64], + num_output_channels=cocowholebody_channel_cfg['num_output_channels'], + num_joints=cocowholebody_channel_cfg['dataset_joints'], + dataset_channel=cocowholebody_channel_cfg['dataset_channel'], + inference_channel=cocowholebody_channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + use_gt_bbox=False, + det_bbox_thr=0.0, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + dataset_idx=5, + max_num_joints=133, +) + +cocowholebody_train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTarget', sigma=2), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx' + ]), +] + +ap10k_train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTarget', sigma=2), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx' + ]), +] + +aic_train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTarget', sigma=2), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx' + ]), +] + +mpii_train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine', use_udp=True), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='TopDownGenerateTarget', + sigma=2, + encoding='UDP', + target_type=target_type), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'flip_pairs', 'dataset_idx' + ]), +] + +train_pipeline = [ + dict(type='LoadImageFromFile'), + 
dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine', use_udp=True), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='TopDownGenerateTarget', + sigma=2, + encoding='UDP', + target_type=target_type), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx' + ]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine', use_udp=True), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=['img'], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs', 'dataset_idx' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +aic_data_root = 'data/aic' +mpii_data_root = 'data/mpii' +ap10k_data_root = 'data/ap10k' +ap36k_data_root = 'data/ap36k' + +data = dict( + samples_per_gpu=128, + workers_per_gpu=8, + val_dataloader=dict(samples_per_gpu=64), + test_dataloader=dict(samples_per_gpu=64), + train=[ + dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline, + dataset_info={{_base_.dataset_info}}), + dict( + type='TopDownAicDataset', + ann_file=f'{aic_data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{aic_data_root}/ai_challenger_keypoint_train_20170909/' + 'keypoint_train_images_20170902/', + data_cfg=aic_data_cfg, + pipeline=aic_train_pipeline, + dataset_info={{_base_.aic_info}}), + dict( + type='TopDownMpiiDataset', + ann_file=f'{mpii_data_root}/annotations/mpii_train.json', + img_prefix=f'{mpii_data_root}/images/', + data_cfg=mpii_data_cfg, + pipeline=mpii_train_pipeline, + dataset_info={{_base_.mpii_info}}), + dict( + type='AnimalAP10KDataset', + ann_file=f'{ap10k_data_root}/annotations/ap10k-train-split1.json', + img_prefix=f'{ap10k_data_root}/data/', + data_cfg=ap10k_data_cfg, + pipeline=ap10k_train_pipeline, + dataset_info={{_base_.ap10k_info}}), + dict( + type='AnimalAP10KDataset', + ann_file=f'{ap36k_data_root}/annotations/train_annotations_1.json', + img_prefix=f'{ap36k_data_root}/', + data_cfg=ap36k_data_cfg, + pipeline=ap10k_train_pipeline, + dataset_info={{_base_.ap10k_info}}), + dict( + type='TopDownCocoWholeBodyDataset', + ann_file=f'{data_root}/annotations/coco_wholebody_train_v1.0.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=cocowholebody_data_cfg, + pipeline=cocowholebody_train_pipeline, + dataset_info={{_base_.cocowholebody_info}}), + ], + val=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline, + dataset_info={{_base_.dataset_info}}), + test=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=test_pipeline, + dataset_info={{_base_.dataset_info}}), +) + diff --git 
a/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/vitPose+_large_coco+aic+mpii+ap10k+apt36k+wholebody_256x192_udp.py b/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/vitPose+_large_coco+aic+mpii+ap10k+apt36k+wholebody_256x192_udp.py
new file mode 100644
index 0000000..98b3d4d
--- /dev/null
+++ b/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/vitPose+_large_coco+aic+mpii+ap10k+apt36k+wholebody_256x192_udp.py
@@ -0,0 +1,500 @@
+_base_ = [
+    '../../../../_base_/default_runtime.py',
+    '../../../../_base_/datasets/coco.py',
+    '../../../../_base_/datasets/aic_info.py',
+    '../../../../_base_/datasets/mpii_info.py',
+    '../../../../_base_/datasets/ap10k_info.py',
+    '../../../../_base_/datasets/coco_wholebody_info.py'
+]
+evaluation = dict(interval=10, metric='mAP', save_best='AP')
+
+optimizer = dict(type='AdamW', lr=1e-3, betas=(0.9, 0.999), weight_decay=0.1,
+                 constructor='LayerDecayOptimizerConstructor',
+                 paramwise_cfg=dict(
+                     num_layers=24,
+                     layer_decay_rate=0.8,
+                     custom_keys={
+                         'bias': dict(decay_mult=0.),
+                         'pos_embed': dict(decay_mult=0.),
+                         'relative_position_bias_table': dict(decay_mult=0.),
+                         'norm': dict(decay_mult=0.)
+                     }
+                 )
+                 )
+
+optimizer_config = dict(grad_clip=dict(max_norm=1., norm_type=2))
+
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=0.001,
+    step=[170, 200])
+total_epochs = 210
+target_type = 'GaussianHeatmap'
+channel_cfg = dict(
+    num_output_channels=17,
+    dataset_joints=17,
+    dataset_channel=[
+        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
+    ],
+    inference_channel=[
+        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+    ])
+aic_channel_cfg = dict(
+    num_output_channels=14,
+    dataset_joints=14,
+    dataset_channel=[
+        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
+    ],
+    inference_channel=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
+mpii_channel_cfg = dict(
+    num_output_channels=16,
+    dataset_joints=16,
+    dataset_channel=list(range(16)),
+    inference_channel=list(range(16)))
+crowdpose_channel_cfg = dict(
+    num_output_channels=14,
+    dataset_joints=14,
+    dataset_channel=[
+        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
+    ],
+    inference_channel=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
+ap10k_channel_cfg = dict(
+    num_output_channels=17,
+    dataset_joints=17,
+    dataset_channel=[
+        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
+    ],
+    inference_channel=[
+        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+    ])
+cocowholebody_channel_cfg = dict(
+    num_output_channels=133,
+    dataset_joints=133,
+    dataset_channel=[
+        list(range(133)),
+    ],
+    inference_channel=list(range(133)))
+
+
+# model settings
+model = dict(
+    type='TopDownMoE',
+    pretrained=None,
+    backbone=dict(
+        type='ViTMoE',
+        img_size=(256, 192),
+        patch_size=16,
+        embed_dim=1024,
+        depth=24,
+        num_heads=16,
+        ratio=1,
+        use_checkpoint=False,
+        mlp_ratio=4,
+        qkv_bias=True,
+        drop_path_rate=0.5,
+        num_expert=6,
+        part_features=256
+    ),
+    keypoint_head=dict(
+        type='TopdownHeatmapSimpleHead',
+        in_channels=1024,
+        num_deconv_layers=2,
+        num_deconv_filters=(256, 256),
+        num_deconv_kernels=(4, 4),
+        extra=dict(final_conv_kernel=1, ),
+        out_channels=channel_cfg['num_output_channels'],
+        loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
+    associate_keypoint_head=[
+        dict(
+            type='TopdownHeatmapSimpleHead',
+            in_channels=1024,
+            num_deconv_layers=2,
+            num_deconv_filters=(256, 256),
+            num_deconv_kernels=(4, 4),
+            extra=dict(final_conv_kernel=1, ),
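+            # The order of these associate heads must mirror dataset_idx 1..N
+            # of the train datasets below: TopDownMoE.forward_train routes
+            # samples with img_sources == idx to associate_keypoint_heads[idx - 1].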
out_channels=aic_channel_cfg['num_output_channels'], + loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), + dict( + type='TopdownHeatmapSimpleHead', + in_channels=1024, + num_deconv_layers=2, + num_deconv_filters=(256, 256), + num_deconv_kernels=(4, 4), + extra=dict(final_conv_kernel=1, ), + out_channels=mpii_channel_cfg['num_output_channels'], + loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), + dict( + type='TopdownHeatmapSimpleHead', + in_channels=1024, + num_deconv_layers=2, + num_deconv_filters=(256, 256), + num_deconv_kernels=(4, 4), + extra=dict(final_conv_kernel=1, ), + out_channels=crowdpose_channel_cfg['num_output_channels'], + loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), + dict( + type='TopdownHeatmapSimpleHead', + in_channels=1024, + num_deconv_layers=2, + num_deconv_filters=(256, 256), + num_deconv_kernels=(4, 4), + extra=dict(final_conv_kernel=1, ), + out_channels=ap10k_channel_cfg['num_output_channels'], + loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), + dict( + type='TopdownHeatmapSimpleHead', + in_channels=1024, + num_deconv_layers=2, + num_deconv_filters=(256, 256), + num_deconv_kernels=(4, 4), + extra=dict(final_conv_kernel=1, ), + out_channels=ap10k_channel_cfg['num_output_channels'], + loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), + dict( + type='TopdownHeatmapSimpleHead', + in_channels=1024, + num_deconv_layers=2, + num_deconv_filters=(256, 256), + num_deconv_kernels=(4, 4), + extra=dict(final_conv_kernel=1, ), + out_channels=cocowholebody_channel_cfg['num_output_channels'], + loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), + ], + train_cfg=dict(), + test_cfg=dict( + flip_test=True, + post_process='default', + shift_heatmap=False, + target_type=target_type, + modulate_kernel=11, + use_udp=True)) + +data_cfg = dict( + image_size=[192, 256], + heatmap_size=[48, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + use_gt_bbox=False, + det_bbox_thr=0.0, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + max_num_joints=133, + dataset_idx=0, +) + +aic_data_cfg = dict( + image_size=[192, 256], + heatmap_size=[48, 64], + num_output_channels=aic_channel_cfg['num_output_channels'], + num_joints=aic_channel_cfg['dataset_joints'], + dataset_channel=aic_channel_cfg['dataset_channel'], + inference_channel=aic_channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + use_gt_bbox=True, + det_bbox_thr=0.0, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + max_num_joints=133, + dataset_idx=1, +) + +mpii_data_cfg = dict( + image_size=[192, 256], + heatmap_size=[48, 64], + num_output_channels=mpii_channel_cfg['num_output_channels'], + num_joints=mpii_channel_cfg['dataset_joints'], + dataset_channel=mpii_channel_cfg['dataset_channel'], + inference_channel=mpii_channel_cfg['inference_channel'], + max_num_joints=133, + dataset_idx=2, + use_gt_bbox=True, + bbox_file=None, +) + +ap10k_data_cfg = dict( + image_size=[192, 256], + heatmap_size=[48, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + 
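+    # AP-10K (and APT-36K below) use a 17-keypoint layout of the same size as
+    # COCO, so the COCO channel_cfg is reused here.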
inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + use_gt_bbox=True, + det_bbox_thr=0.0, + bbox_file='', + max_num_joints=133, + dataset_idx=3, +) + +ap36k_data_cfg = dict( + image_size=[192, 256], + heatmap_size=[48, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + use_gt_bbox=True, + det_bbox_thr=0.0, + bbox_file='', + max_num_joints=133, + dataset_idx=4, +) + +cocowholebody_data_cfg = dict( + image_size=[192, 256], + heatmap_size=[48, 64], + num_output_channels=cocowholebody_channel_cfg['num_output_channels'], + num_joints=cocowholebody_channel_cfg['dataset_joints'], + dataset_channel=cocowholebody_channel_cfg['dataset_channel'], + inference_channel=cocowholebody_channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + use_gt_bbox=False, + det_bbox_thr=0.0, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + dataset_idx=5, + max_num_joints=133, +) + +cocowholebody_train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTarget', sigma=2), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx' + ]), +] + +ap10k_train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTarget', sigma=2), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx' + ]), +] + +aic_train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTarget', sigma=2), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx' + ]), +] + +mpii_train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + 
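+    # The MPII pipeline skips TopDownHalfBodyTransform and, since MPII comes
+    # without detection scores, omits 'bbox_score' from the collected meta keys.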
dict(type='TopDownAffine', use_udp=True), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='TopDownGenerateTarget', + sigma=2, + encoding='UDP', + target_type=target_type), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'flip_pairs', 'dataset_idx' + ]), +] + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine', use_udp=True), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='TopDownGenerateTarget', + sigma=2, + encoding='UDP', + target_type=target_type), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'dataset_idx' + ]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine', use_udp=True), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=['img'], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs', 'dataset_idx' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +aic_data_root = 'data/aic' +mpii_data_root = 'data/mpii' +ap10k_data_root = 'data/ap10k' +ap36k_data_root = 'data/ap36k' + +data = dict( + samples_per_gpu=128, + workers_per_gpu=8, + val_dataloader=dict(samples_per_gpu=64), + test_dataloader=dict(samples_per_gpu=64), + train=[ + dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline, + dataset_info={{_base_.dataset_info}}), + dict( + type='TopDownAicDataset', + ann_file=f'{aic_data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{aic_data_root}/ai_challenger_keypoint_train_20170909/' + 'keypoint_train_images_20170902/', + data_cfg=aic_data_cfg, + pipeline=aic_train_pipeline, + dataset_info={{_base_.aic_info}}), + dict( + type='TopDownMpiiDataset', + ann_file=f'{mpii_data_root}/annotations/mpii_train.json', + img_prefix=f'{mpii_data_root}/images/', + data_cfg=mpii_data_cfg, + pipeline=mpii_train_pipeline, + dataset_info={{_base_.mpii_info}}), + dict( + type='AnimalAP10KDataset', + ann_file=f'{ap10k_data_root}/annotations/ap10k-train-split1.json', + img_prefix=f'{ap10k_data_root}/data/', + data_cfg=ap10k_data_cfg, + pipeline=ap10k_train_pipeline, + dataset_info={{_base_.ap10k_info}}), + dict( + type='AnimalAP10KDataset', + ann_file=f'{ap36k_data_root}/annotations/train_annotations_1.json', + img_prefix=f'{ap36k_data_root}/', + data_cfg=ap36k_data_cfg, + pipeline=ap10k_train_pipeline, + dataset_info={{_base_.ap10k_info}}), + dict( + type='TopDownCocoWholeBodyDataset', + ann_file=f'{data_root}/annotations/coco_wholebody_train_v1.0.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=cocowholebody_data_cfg, + pipeline=cocowholebody_train_pipeline, + dataset_info={{_base_.cocowholebody_info}}), + ], + val=dict( + type='TopDownCocoDataset', + 
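+        # Validation and testing run on COCO val2017 only (dataset_idx=0); the
+        # associate heads and the other experts are exercised only in training.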
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline, + dataset_info={{_base_.dataset_info}}), + test=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=test_pipeline, + dataset_info={{_base_.dataset_info}}), +) + diff --git a/mmcv_custom/checkpoint.py b/mmcv_custom/checkpoint.py index 1649160..52c9bac 100644 --- a/mmcv_custom/checkpoint.py +++ b/mmcv_custom/checkpoint.py @@ -25,6 +25,8 @@ from mmcv.runner import get_dist_info from scipy import interpolate import numpy as np import math +import re +import copy ENV_MMCV_HOME = 'MMCV_HOME' ENV_XDG_CACHE_HOME = 'XDG_CACHE_HOME' @@ -313,6 +315,7 @@ def load_checkpoint(model, strict=False, logger=None, patch_padding='pad', + part_features=None ): """Load checkpoint from a file or URI. @@ -389,9 +392,19 @@ def load_checkpoint(model, pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2) new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1) state_dict['pos_embed'] = new_pos_embed + + new_state_dict = copy.deepcopy(state_dict) + if part_features is not None: + current_keys = list(model.state_dict().keys()) + for key in current_keys: + if "mlp.experts" in key: + source_key = re.sub(r'experts.\d+.', 'fc2.', key) + new_state_dict[key] = state_dict[source_key][-part_features:] + elif 'fc2' in key: + new_state_dict[key] = state_dict[key][:-part_features] # load state_dict - load_state_dict(model, state_dict, strict, logger) + load_state_dict(model, new_state_dict, strict, logger) return checkpoint diff --git a/mmpose/datasets/datasets/base/kpt_2d_sview_rgb_img_top_down_dataset.py b/mmpose/datasets/datasets/base/kpt_2d_sview_rgb_img_top_down_dataset.py index 45a8ffe..fb281f1 100644 --- a/mmpose/datasets/datasets/base/kpt_2d_sview_rgb_img_top_down_dataset.py +++ b/mmpose/datasets/datasets/base/kpt_2d_sview_rgb_img_top_down_dataset.py @@ -60,6 +60,9 @@ class Kpt2dSviewRgbImgTopDownDataset(Dataset, metaclass=ABCMeta): self.ann_info['num_output_channels'] = data_cfg['num_output_channels'] self.ann_info['dataset_channel'] = data_cfg['dataset_channel'] + self.ann_info['max_num_joints'] = data_cfg.get('max_num_joints', None) + self.ann_info['dataset_idx'] = data_cfg.get('dataset_idx', 0) + self.ann_info['use_different_joint_weights'] = data_cfg.get( 'use_different_joint_weights', False) diff --git a/mmpose/datasets/pipelines/top_down_transform.py b/mmpose/datasets/pipelines/top_down_transform.py index 6fb95e6..1af1ea9 100644 --- a/mmpose/datasets/pipelines/top_down_transform.py +++ b/mmpose/datasets/pipelines/top_down_transform.py @@ -633,9 +633,17 @@ class TopDownGenerateTarget: raise ValueError( f'Encoding approach {self.encoding} is not supported!') + if results['ann_info'].get('max_num_joints', None) is not None: + W, H = results['ann_info']['heatmap_size'] + padded_length = int(results['ann_info'].get('max_num_joints') - results['ann_info'].get('num_joints')) + target_weight = np.concatenate([target_weight, np.zeros((padded_length, 1), dtype=np.float32)], 0) + target = np.concatenate([target, np.zeros((padded_length, H, W), dtype=np.float32)], 0) + results['target'] = target results['target_weight'] = target_weight + results['dataset_idx'] = results['ann_info'].get('dataset_idx', 0) + return results diff --git a/mmpose/models/backbones/__init__.py b/mmpose/models/backbones/__init__.py index f9bf9c7..2b8efcf 100644 --- 
a/mmpose/models/backbones/__init__.py +++ b/mmpose/models/backbones/__init__.py @@ -25,11 +25,12 @@ from .vgg import VGG from .vipnas_mbv3 import ViPNAS_MobileNetV3 from .vipnas_resnet import ViPNAS_ResNet from .vit import ViT +from .vit_moe import ViTMoE __all__ = [ 'AlexNet', 'HourglassNet', 'HourglassAENet', 'HRNet', 'MobileNetV2', 'MobileNetV3', 'RegNet', 'ResNet', 'ResNetV1d', 'ResNeXt', 'SCNet', 'SEResNet', 'SEResNeXt', 'ShuffleNetV1', 'ShuffleNetV2', 'CPM', 'RSN', 'MSPN', 'ResNeSt', 'VGG', 'TCN', 'ViPNAS_ResNet', 'ViPNAS_MobileNetV3', - 'LiteHRNet', 'V2VNet', 'HRFormer', 'ViT' + 'LiteHRNet', 'V2VNet', 'HRFormer', 'ViT', 'ViTMoE' ] diff --git a/mmpose/models/backbones/base_backbone.py b/mmpose/models/backbones/base_backbone.py index a11d5b1..d64dca1 100644 --- a/mmpose/models/backbones/base_backbone.py +++ b/mmpose/models/backbones/base_backbone.py @@ -14,7 +14,7 @@ class BaseBackbone(nn.Module, metaclass=ABCMeta): inherits this class should at least define its own `forward` function. """ - def init_weights(self, pretrained=None, patch_padding='pad'): + def init_weights(self, pretrained=None, patch_padding='pad', part_features=None): """Init backbone weights. Args: @@ -25,7 +25,7 @@ class BaseBackbone(nn.Module, metaclass=ABCMeta): """ if isinstance(pretrained, str): logger = logging.getLogger() - load_checkpoint(self, pretrained, strict=False, logger=logger, patch_padding=patch_padding) + load_checkpoint(self, pretrained, strict=False, logger=logger, patch_padding=patch_padding, part_features=part_features) elif pretrained is None: # use default initializer or customized initializer in subclasses pass diff --git a/mmpose/models/backbones/vit.py b/mmpose/models/backbones/vit.py index 828e0dc..2719d1a 100644 --- a/mmpose/models/backbones/vit.py +++ b/mmpose/models/backbones/vit.py @@ -12,6 +12,39 @@ from timm.models.layers import drop_path, to_2tuple, trunc_normal_ from ..builder import BACKBONES from .base_backbone import BaseBackbone +def get_abs_pos(abs_pos, h, w, ori_h, ori_w, has_cls_token=True): + """ + Calculate absolute positional embeddings. If needed, resize embeddings and remove cls_token + dimension for the original embeddings. + Args: + abs_pos (Tensor): absolute positional embeddings with (1, num_position, C). + has_cls_token (bool): If true, has 1 embedding in abs_pos for cls token. + hw (Tuple): size of input image tokens. + + Returns: + Absolute positional embeddings after processing with shape (1, H, W, C) + """ + cls_token = None + B, L, C = abs_pos.shape + if has_cls_token: + cls_token = abs_pos[:, 0:1] + abs_pos = abs_pos[:, 1:] + + if ori_h != h or ori_w != w: + new_abs_pos = F.interpolate( + abs_pos.reshape(1, ori_h, ori_w, -1).permute(0, 3, 1, 2), + size=(h, w), + mode="bicubic", + align_corners=False, + ).permute(0, 2, 3, 1).reshape(B, -1, C) + + else: + new_abs_pos = abs_pos + + if cls_token is not None: + new_abs_pos = torch.cat([cls_token, new_abs_pos], dim=1) + return new_abs_pos + class DropPath(nn.Module): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). """ diff --git a/mmpose/models/backbones/vit_moe.py b/mmpose/models/backbones/vit_moe.py new file mode 100644 index 0000000..2daa270 --- /dev/null +++ b/mmpose/models/backbones/vit_moe.py @@ -0,0 +1,384 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
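+# ViTMoE is a plain ViT whose FFN is replaced by MoEMlp: a shared fc2 branch
+# produces the first (embed_dim - part_features) output channels for every
+# dataset, while one nn.Linear expert per dataset produces the remaining
+# part_features channels, selected by the per-sample dataset index.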
+import math
+
+import torch
+from functools import partial
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as checkpoint
+
+from timm.models.layers import drop_path, to_2tuple, trunc_normal_
+
+from ..builder import BACKBONES
+from .base_backbone import BaseBackbone
+
+def get_abs_pos(abs_pos, h, w, ori_h, ori_w, has_cls_token=True):
+    """
+    Calculate absolute positional embeddings. If needed, resize embeddings and remove cls_token
+    dimension for the original embeddings.
+    Args:
+        abs_pos (Tensor): absolute positional embeddings with (1, num_position, C).
+        h, w (int): target size of the token grid.
+        ori_h, ori_w (int): token grid size the embeddings were trained at.
+        has_cls_token (bool): If true, has 1 embedding in abs_pos for cls token.
+
+    Returns:
+        Absolute positional embeddings after processing with shape
+        (1, h * w (+1 with cls token), C).
+    """
+    cls_token = None
+    B, L, C = abs_pos.shape
+    if has_cls_token:
+        cls_token = abs_pos[:, 0:1]
+        abs_pos = abs_pos[:, 1:]
+
+    if ori_h != h or ori_w != w:
+        new_abs_pos = F.interpolate(
+            abs_pos.reshape(1, ori_h, ori_w, -1).permute(0, 3, 1, 2),
+            size=(h, w),
+            mode="bicubic",
+            align_corners=False,
+        ).permute(0, 2, 3, 1).reshape(B, -1, C)
+
+    else:
+        new_abs_pos = abs_pos
+
+    if cls_token is not None:
+        new_abs_pos = torch.cat([cls_token, new_abs_pos], dim=1)
+    return new_abs_pos
+
+class DropPath(nn.Module):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+    """
+    def __init__(self, drop_prob=None):
+        super(DropPath, self).__init__()
+        self.drop_prob = drop_prob
+
+    def forward(self, x):
+        return drop_path(x, self.drop_prob, self.training)
+
+    def extra_repr(self):
+        return 'p={}'.format(self.drop_prob)
+
+class Mlp(nn.Module):
+    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop = nn.Dropout(drop)
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.fc2(x)
+        x = self.drop(x)
+        return x
+
+class MoEMlp(nn.Module):
+    def __init__(self, num_expert=1, in_features=1024, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0., part_features=256):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.part_features = part_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = act_layer()
+        # shared branch covers all but the last part_features channels
+        self.fc2 = nn.Linear(hidden_features, out_features - part_features)
+        self.drop = nn.Dropout(drop)
+
+        self.num_expert = num_expert
+        experts = []
+
+        # one expert per dataset, each producing the last part_features channels
+        for i in range(num_expert):
+            experts.append(
+                nn.Linear(hidden_features, part_features)
+            )
+        self.experts = nn.ModuleList(experts)
+
+    def forward(self, x, indices):
+
+        expert_x = torch.zeros_like(x[:, :, -self.part_features:], device=x.device, dtype=x.dtype)
+
+        if indices is None:
+            # No dataset index given (e.g. forward_dummy for FLOPs counting):
+            # fall back to expert 0 for the whole batch.
+            indices = torch.zeros(x.shape[0], dtype=torch.long, device=x.device)
+
+        x = self.fc1(x)
+        x = self.act(x)
+        shared_x = self.fc2(x)
+        indices = indices.view(-1, 1, 1)
+
+        # to support ddp training: run every expert on the batch and mask the
+        # outputs by index, so each expert stays in the graph and DDP does not
+        # complain about unused parameters
+        for i in range(self.num_expert):
+            selectedIndex = (indices == i)
+            current_x = self.experts[i](x) * selectedIndex
+            expert_x = expert_x + current_x
+
+        x = torch.cat([shared_x, expert_x], dim=-1)
+
+        return x
+
+class Attention(nn.Module):
+    def __init__(
+            self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0.,
+            proj_drop=0., attn_head_dim=None,):
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.dim = dim
+
+        if attn_head_dim is not None:
+            head_dim = attn_head_dim
+        all_head_dim = head_dim * self.num_heads
+
+        self.scale = qk_scale or head_dim ** -0.5
+
+        self.qkv = nn.Linear(dim, all_head_dim * 3, bias=qkv_bias)
+
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(all_head_dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+    def forward(self, x):
+        B, N, C = x.shape
+        qkv = self.qkv(x)
+        qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)
+
+        q = q * self.scale
+        attn = (q @ k.transpose(-2, -1))
+
+        attn = attn.softmax(dim=-1)
+        attn = self.attn_drop(attn)
+
+        x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+
+        return x
+
+class Block(nn.Module):
+
+    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None,
+                 drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU,
+                 norm_layer=nn.LayerNorm, attn_head_dim=None, num_expert=1, part_features=None
+                 ):
+        super().__init__()
+
+        self.norm1 = norm_layer(dim)
+        self.attn = Attention(
+            dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
+            attn_drop=attn_drop, proj_drop=drop, attn_head_dim=attn_head_dim
+        )
+
+        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = MoEMlp(num_expert=num_expert, in_features=dim,
+                          hidden_features=mlp_hidden_dim, act_layer=act_layer,
+                          drop=drop, part_features=part_features)
+
+    def forward(self, x, indices=None):
+
+        x = x + self.drop_path(self.attn(self.norm1(x)))
+        x = x + self.drop_path(self.mlp(self.norm2(x), indices))
+        return x
+
+
+class PatchEmbed(nn.Module):
+    """ Image to Patch Embedding
+    """
+    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, ratio=1):
+        super().__init__()
+        img_size = to_2tuple(img_size)
+        patch_size = to_2tuple(patch_size)
+        num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) * (ratio ** 2)
+        self.patch_shape = (int(img_size[0] // patch_size[0] * ratio), int(img_size[1] // patch_size[1] * ratio))
+        self.origin_patch_shape = (int(img_size[0] // patch_size[0]), int(img_size[1] // patch_size[1]))
+        self.img_size = img_size
+        self.patch_size = patch_size
+        self.num_patches = num_patches
+
+        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=(patch_size[0] // ratio), padding=4 + 2 * (ratio//2-1))
+
+    def forward(self, x, **kwargs):
+        B, C, H, W = x.shape
+        x = self.proj(x)
+        Hp, Wp = x.shape[2], x.shape[3]
+
+        x = x.flatten(2).transpose(1, 2)
+        return x, (Hp, Wp)
+
+
+class HybridEmbed(nn.Module):
+    """ CNN Feature Map Embedding
+        Extract feature map from CNN, flatten, project to embedding dim.
+ """ + def __init__(self, backbone, img_size=224, feature_size=None, in_chans=3, embed_dim=768): + super().__init__() + assert isinstance(backbone, nn.Module) + img_size = to_2tuple(img_size) + self.img_size = img_size + self.backbone = backbone + if feature_size is None: + with torch.no_grad(): + training = backbone.training + if training: + backbone.eval() + o = self.backbone(torch.zeros(1, in_chans, img_size[0], img_size[1]))[-1] + feature_size = o.shape[-2:] + feature_dim = o.shape[1] + backbone.train(training) + else: + feature_size = to_2tuple(feature_size) + feature_dim = self.backbone.feature_info.channels()[-1] + self.num_patches = feature_size[0] * feature_size[1] + self.proj = nn.Linear(feature_dim, embed_dim) + + def forward(self, x): + x = self.backbone(x)[-1] + x = x.flatten(2).transpose(1, 2) + x = self.proj(x) + return x + + +@BACKBONES.register_module() +class ViTMoE(BaseBackbone): + + def __init__(self, + img_size=224, patch_size=16, in_chans=3, num_classes=80, embed_dim=768, depth=12, + num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0., + drop_path_rate=0., hybrid_backbone=None, norm_layer=None, use_checkpoint=False, + frozen_stages=-1, ratio=1, last_norm=True, + patch_padding='pad', freeze_attn=False, freeze_ffn=False, + num_expert=1, part_features=None + ): + # Protect mutable default arguments + super(ViTMoE, self).__init__() + norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6) + self.num_classes = num_classes + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + self.frozen_stages = frozen_stages + self.use_checkpoint = use_checkpoint + self.patch_padding = patch_padding + self.freeze_attn = freeze_attn + self.freeze_ffn = freeze_ffn + self.depth = depth + + if hybrid_backbone is not None: + self.patch_embed = HybridEmbed( + hybrid_backbone, img_size=img_size, in_chans=in_chans, embed_dim=embed_dim) + else: + self.patch_embed = PatchEmbed( + img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, ratio=ratio) + num_patches = self.patch_embed.num_patches + + self.part_features = part_features + + self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim)) + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule + + self.blocks = nn.ModuleList([ + Block( + dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, + num_expert=num_expert, part_features=part_features + ) + for i in range(depth)]) + + self.last_norm = norm_layer(embed_dim) if last_norm else nn.Identity() + + if self.pos_embed is not None: + trunc_normal_(self.pos_embed, std=.02) + + self._freeze_stages() + + def _freeze_stages(self): + """Freeze parameters.""" + if self.frozen_stages >= 0: + self.patch_embed.eval() + for param in self.patch_embed.parameters(): + param.requires_grad = False + + for i in range(1, self.frozen_stages + 1): + m = self.blocks[i] + m.eval() + for param in m.parameters(): + param.requires_grad = False + + if self.freeze_attn: + for i in range(0, self.depth): + m = self.blocks[i] + m.attn.eval() + m.norm1.eval() + for param in m.attn.parameters(): + param.requires_grad = False + for param in m.norm1.parameters(): + param.requires_grad = False + + if self.freeze_ffn: + self.pos_embed.requires_grad = False + self.patch_embed.eval() + for param in self.patch_embed.parameters(): + 
param.requires_grad = False + for i in range(0, self.depth): + m = self.blocks[i] + m.mlp.eval() + m.norm2.eval() + for param in m.mlp.parameters(): + param.requires_grad = False + for param in m.norm2.parameters(): + param.requires_grad = False + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. + """ + super().init_weights(pretrained, patch_padding=self.patch_padding, part_features=self.part_features) + + if pretrained is None: + def _init_weights(m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + self.apply(_init_weights) + + def get_num_layers(self): + return len(self.blocks) + + @torch.jit.ignore + def no_weight_decay(self): + return {'pos_embed', 'cls_token'} + + def forward_features(self, x, dataset_source=None): + B, C, H, W = x.shape + x, (Hp, Wp) = self.patch_embed(x) + + if self.pos_embed is not None: + # fit for multiple GPU training + # since the first element for pos embed (sin-cos manner) is zero, it will cause no difference + x = x + self.pos_embed[:, 1:] + self.pos_embed[:, :1] + + for blk in self.blocks: + if self.use_checkpoint: + x = checkpoint.checkpoint(blk, x, dataset_source) + else: + x = blk(x, dataset_source) + + x = self.last_norm(x) + + xp = x.permute(0, 2, 1).reshape(B, -1, Hp, Wp).contiguous() + + return xp + + def forward(self, x, dataset_source=None): + x = self.forward_features(x, dataset_source) + return x + + def train(self, mode=True): + """Convert the model into training mode.""" + super().train(mode) + self._freeze_stages() diff --git a/mmpose/models/detectors/__init__.py b/mmpose/models/detectors/__init__.py index 66e575e..e098209 100644 --- a/mmpose/models/detectors/__init__.py +++ b/mmpose/models/detectors/__init__.py @@ -8,9 +8,10 @@ from .multiview_pose import (DetectAndRegress, VoxelCenterDetector, from .pose_lifter import PoseLifter from .posewarper import PoseWarper from .top_down import TopDown +from .top_down_moe import TopDownMoE __all__ = [ 'TopDown', 'AssociativeEmbedding', 'ParametricMesh', 'MultiTask', 'PoseLifter', 'Interhand3D', 'PoseWarper', 'DetectAndRegress', - 'VoxelCenterDetector', 'VoxelSinglePose' + 'VoxelCenterDetector', 'VoxelSinglePose', 'TopDownMoE' ] diff --git a/mmpose/models/detectors/top_down_moe.py b/mmpose/models/detectors/top_down_moe.py new file mode 100644 index 0000000..7d499b7 --- /dev/null +++ b/mmpose/models/detectors/top_down_moe.py @@ -0,0 +1,351 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import torch +import torch.nn as nn + +import mmcv +import numpy as np +from mmcv.image import imwrite +from mmcv.utils.misc import deprecated_api_warning +from mmcv.visualization.image import imshow + +from mmpose.core import imshow_bboxes, imshow_keypoints +from .. import builder +from ..builder import POSENETS +from .base import BasePose + +try: + from mmcv.runner import auto_fp16 +except ImportError: + warnings.warn('auto_fp16 from mmpose will be deprecated from v0.15.0' + 'Please install mmcv>=1.1.4') + from mmpose.core import auto_fp16 + + +@POSENETS.register_module() +class TopDownMoE(BasePose): + """Top-down pose detectors. + + Args: + backbone (dict): Backbone modules to extract feature. + keypoint_head (dict): Keypoint head to process feature. 
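+        neck (dict): Optional neck to process the backbone features.
+        associate_keypoint_head (list[dict]): Auxiliary keypoint heads for the
+            extra datasets; the i-th entry serves samples whose
+            ``dataset_idx`` equals ``i + 1``.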
+        train_cfg (dict): Config for training. Default: None.
+        test_cfg (dict): Config for testing. Default: None.
+        pretrained (str): Path to the pretrained models.
+        loss_pose (None): Deprecated arguments. Please use
+            `loss_keypoint` for heads instead.
+    """
+
+    def __init__(self,
+                 backbone,
+                 neck=None,
+                 keypoint_head=None,
+                 associate_keypoint_head=None,
+                 train_cfg=None,
+                 test_cfg=None,
+                 pretrained=None,
+                 loss_pose=None):
+        super().__init__()
+        self.fp16_enabled = False
+
+        self.backbone = builder.build_backbone(backbone)
+
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+
+        if neck is not None:
+            self.neck = builder.build_neck(neck)
+
+        if keypoint_head is not None:
+            keypoint_head['train_cfg'] = train_cfg
+            keypoint_head['test_cfg'] = test_cfg
+
+            if 'loss_keypoint' not in keypoint_head and loss_pose is not None:
+                warnings.warn(
+                    '`loss_pose` for TopDown is deprecated, '
+                    'use `loss_keypoint` for heads instead. See '
+                    'https://github.com/open-mmlab/mmpose/pull/382'
+                    ' for more information.', DeprecationWarning)
+                keypoint_head['loss_keypoint'] = loss_pose
+
+            self.keypoint_head = builder.build_head(keypoint_head)
+
+        associate_keypoint_heads = []
+        keypoint_heads_cnt = 1
+
+        if associate_keypoint_head is not None:
+            if not isinstance(associate_keypoint_head, list):
+                associate_keypoint_head = [associate_keypoint_head]
+            for single_keypoint_head in associate_keypoint_head:
+                single_keypoint_head['train_cfg'] = train_cfg
+                single_keypoint_head['test_cfg'] = test_cfg
+                associate_keypoint_heads.append(
+                    builder.build_head(single_keypoint_head))
+                keypoint_heads_cnt += 1
+
+        self.associate_keypoint_heads = nn.ModuleList(associate_keypoint_heads)
+        self.keypoint_heads_cnt = keypoint_heads_cnt
+
+        self.init_weights(pretrained=pretrained)
+
+    @property
+    def with_neck(self):
+        """Check if has neck."""
+        return hasattr(self, 'neck')
+
+    @property
+    def with_keypoint(self):
+        """Check if has keypoint_head."""
+        return hasattr(self, 'keypoint_head')
+
+    def init_weights(self, pretrained=None):
+        """Weight initialization for model."""
+        self.backbone.init_weights(pretrained)
+        if self.with_neck:
+            self.neck.init_weights()
+        if self.with_keypoint:
+            self.keypoint_head.init_weights()
+            for item in self.associate_keypoint_heads:
+                item.init_weights()
+
+    @auto_fp16(apply_to=('img', ))
+    def forward(self,
+                img,
+                target=None,
+                target_weight=None,
+                img_metas=None,
+                return_loss=True,
+                return_heatmap=False,
+                **kwargs):
+        """Calls either forward_train or forward_test depending on whether
+        return_loss=True. Note this setting will change the expected inputs.
+        When `return_loss=True`, img and img_meta are single-nested (i.e.
+        Tensor and List[dict]), and when `return_loss=False`, img and img_meta
+        should be double nested (i.e. List[Tensor], List[List[dict]]), with
+        the outer list indicating test time augmentations.
+
+        Note:
+            - batch_size: N
+            - num_keypoints: K
+            - num_img_channel: C (Default: 3)
+            - img height: imgH
+            - img width: imgW
+            - heatmaps height: H
+            - heatmaps width: W
+
+        Args:
+            img (torch.Tensor[NxCximgHximgW]): Input images.
+            target (torch.Tensor[NxKxHxW]): Target heatmaps.
+            target_weight (torch.Tensor[NxKx1]): Weights across
+                different joint types.
+            img_metas (list(dict)): Information about data augmentation.
+                By default this includes:
+
+                - "image_file": path to the image file
+                - "center": center of the bbox
+                - "scale": scale of the bbox
+                - "rotation": rotation of the bbox
+                - "bbox_score": score of bbox
+            return_loss (bool): Option to return loss. `return_loss=True`
+                for training, `return_loss=False` for validation & test.
+            return_heatmap (bool): Option to return heatmap.
+
+        Returns:
+            dict|tuple: if `return_loss` is true, then return losses. \
+                Otherwise, return predicted poses, boxes, image paths \
+                and heatmaps.
+        """
+        if return_loss:
+            return self.forward_train(img, target, target_weight, img_metas,
+                                      **kwargs)
+        return self.forward_test(
+            img, img_metas, return_heatmap=return_heatmap, **kwargs)
+
+    def forward_train(self, img, target, target_weight, img_metas, **kwargs):
+        """Defines the computation performed at every call when training."""
+        img_sources = torch.from_numpy(
+            np.array([ele['dataset_idx'] for ele in img_metas])).to(img.device)
+
+        output = self.backbone(img, img_sources)
+        if self.with_neck:
+            output = self.neck(output)
+
+        losses = dict()
+
+        # main stream: COCO samples (dataset_idx == 0) supervise the main head;
+        # targets of samples from other datasets are masked out below
+        main_stream_select = (img_sources == 0)
+        output_select = self.keypoint_head(output)
+
+        target_select = target * main_stream_select.view(-1, 1, 1, 1)
+        target_weight_select = target_weight * main_stream_select.view(-1, 1, 1)
+
+        keypoint_losses = self.keypoint_head.get_loss(
+            output_select, target_select, target_weight_select)
+        losses['main_stream_loss'] = keypoint_losses['heatmap_loss']
+        keypoint_accuracy = self.keypoint_head.get_accuracy(
+            output_select, target_select, target_weight_select)
+        losses['main_stream_acc'] = keypoint_accuracy['acc_pose']
+
+        # auxiliary streams: samples with dataset_idx == idx supervise
+        # associate_keypoint_heads[idx - 1]
+        for idx in range(1, self.keypoint_heads_cnt):
+            idx_select = (img_sources == idx)
+            target_select = target * idx_select.view(-1, 1, 1, 1)
+            target_weight_select = target_weight * idx_select.view(-1, 1, 1)
+            output_select = self.associate_keypoint_heads[idx - 1](output)
+            keypoint_losses = self.associate_keypoint_heads[idx - 1].get_loss(
+                output_select, target_select, target_weight_select)
+            losses[f'{idx}_loss'] = keypoint_losses['heatmap_loss']
+            keypoint_accuracy = self.associate_keypoint_heads[idx - 1].get_accuracy(
+                output_select, target_select, target_weight_select)
+            losses[f'{idx}_acc'] = keypoint_accuracy['acc_pose']
+
+        return losses
+
+    def forward_test(self, img, img_metas, return_heatmap=False, **kwargs):
+        """Defines the computation performed at every call when testing."""
+        assert img.size(0) == len(img_metas)
+        batch_size, _, img_height, img_width = img.shape
+        if batch_size > 1:
+            assert 'bbox_id' in img_metas[0]
+
+        result = {}
+        img_sources = torch.from_numpy(
+            np.array([ele['dataset_idx'] for ele in img_metas])).to(img.device)
+
+        features = self.backbone(img, img_sources)
+
+        if self.with_neck:
+            features = self.neck(features)
+        if self.with_keypoint:
+            output_heatmap = self.keypoint_head.inference_model(
+                features, flip_pairs=None)
+
+        if self.test_cfg.get('flip_test', True):
+            img_flipped = img.flip(3)
+            features_flipped = self.backbone(img_flipped, img_sources)
+            if self.with_neck:
+                features_flipped = self.neck(features_flipped)
+            if self.with_keypoint:
+                output_flipped_heatmap = self.keypoint_head.inference_model(
+                    features_flipped, img_metas[0]['flip_pairs'])
+                output_heatmap = (output_heatmap +
+                                  output_flipped_heatmap) * 0.5
+
+        if self.with_keypoint:
+            keypoint_result = self.keypoint_head.decode(
+                img_metas, output_heatmap, img_size=[img_width, img_height])
+            result.update(keypoint_result)
+
+            if not return_heatmap:
+                output_heatmap = None
+
+            result['output_heatmap'] = output_heatmap
+
+        return result
+
+    def forward_dummy(self, img):
+        """Used for computing network FLOPs.
+
+        See ``tools/get_flops.py``.
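+
+        Note: the backbone is called without ``dataset_source``, so the
+        ``indices is None`` fallback in ``MoEMlp`` sends the whole dummy
+        batch through expert 0.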
+ + Args: + img (torch.Tensor): Input image. + + Returns: + Tensor: Output heatmaps. + """ + output = self.backbone(img) + if self.with_neck: + output = self.neck(output) + if self.with_keypoint: + output = self.keypoint_head(output) + return output + + @deprecated_api_warning({'pose_limb_color': 'pose_link_color'}, + cls_name='TopDown') + def show_result(self, + img, + result, + skeleton=None, + kpt_score_thr=0.3, + bbox_color='green', + pose_kpt_color=None, + pose_link_color=None, + text_color='white', + radius=4, + thickness=1, + font_scale=0.5, + bbox_thickness=1, + win_name='', + show=False, + show_keypoint_weight=False, + wait_time=0, + out_file=None): + """Draw `result` over `img`. + + Args: + img (str or Tensor): The image to be displayed. + result (list[dict]): The results to draw over `img` + (bbox_result, pose_result). + skeleton (list[list]): The connection of keypoints. + skeleton is 0-based indexing. + kpt_score_thr (float, optional): Minimum score of keypoints + to be shown. Default: 0.3. + bbox_color (str or tuple or :obj:`Color`): Color of bbox lines. + pose_kpt_color (np.array[Nx3]`): Color of N keypoints. + If None, do not draw keypoints. + pose_link_color (np.array[Mx3]): Color of M links. + If None, do not draw links. + text_color (str or tuple or :obj:`Color`): Color of texts. + radius (int): Radius of circles. + thickness (int): Thickness of lines. + font_scale (float): Font scales of texts. + win_name (str): The window name. + show (bool): Whether to show the image. Default: False. + show_keypoint_weight (bool): Whether to change the transparency + using the predicted confidence scores of keypoints. + wait_time (int): Value of waitKey param. + Default: 0. + out_file (str or None): The filename to write the image. + Default: None. + + Returns: + Tensor: Visualized img, only if not `show` or `out_file`. + """ + img = mmcv.imread(img) + img = img.copy() + + bbox_result = [] + bbox_labels = [] + pose_result = [] + for res in result: + if 'bbox' in res: + bbox_result.append(res['bbox']) + bbox_labels.append(res.get('label', None)) + pose_result.append(res['keypoints']) + + if bbox_result: + bboxes = np.vstack(bbox_result) + # draw bounding boxes + imshow_bboxes( + img, + bboxes, + labels=bbox_labels, + colors=bbox_color, + text_color=text_color, + thickness=bbox_thickness, + font_scale=font_scale, + show=False) + + if pose_result: + imshow_keypoints(img, pose_result, skeleton, kpt_score_thr, + pose_kpt_color, pose_link_color, radius, + thickness) + + if show: + imshow(img, win_name, wait_time) + + if out_file is not None: + imwrite(img, out_file) + + return img diff --git a/tools/train.py b/tools/train.py index 2477c1d..2e1f707 100644 --- a/tools/train.py +++ b/tools/train.py @@ -17,7 +17,7 @@ from mmpose.apis import init_random_seed, train_model from mmpose.datasets import build_dataset from mmpose.models import build_posenet from mmpose.utils import collect_env, get_root_logger, setup_multi_processes - +import mmcv_custom def parse_args(): parser = argparse.ArgumentParser(description='Train a pose model')
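For reviewers, a minimal sketch of how the MoE FFN routes a mixed batch, assuming this patch is applied; the shapes follow the large config above (embed_dim=1024, mlp_ratio=4, part_features=256, num_expert=6):

import torch

from mmpose.models.backbones.vit_moe import MoEMlp

# Two samples from different datasets share one batch: every token keeps the
# 768 shared fc2 channels and gets 256 channels from its own dataset's expert.
mlp = MoEMlp(num_expert=6, in_features=1024, hidden_features=4096,
             part_features=256)
x = torch.randn(2, 192, 1024)    # (batch, tokens, embed_dim)
indices = torch.tensor([0, 5])   # dataset_idx of each sample
out = mlp(x, indices)
assert out.shape == (2, 192, 1024)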