# Copyright (c) OpenMMLab. All rights reserved.
import warnings

import numpy as np
import torch

from ..builder import POSENETS
from .top_down import TopDown

try:
    from mmcv.runner import auto_fp16
except ImportError:
    warnings.warn('auto_fp16 from mmpose will be deprecated from v0.15.0. '
                  'Please install mmcv>=1.1.4')
    from mmpose.core import auto_fp16


@POSENETS.register_module()
class PoseWarper(TopDown):
    """Top-down pose detectors for multi-frame settings on video inputs.

    `"Learning temporal pose estimation from sparsely-labeled videos"
    <https://arxiv.org/abs/1906.04016>`_.

    A child class of TopDown detector. The main difference between PoseWarper
    and TopDown is that the former takes a list of tensors as the input image
    while the latter takes a single tensor as the input image in the forward
    method.

    Args:
        backbone (dict): Backbone modules to extract features.
        neck (dict): Intermediate modules to transform features.
        keypoint_head (dict): Keypoint head to process features.
        train_cfg (dict): Config for training. Default: None.
        test_cfg (dict): Config for testing. Default: None.
        pretrained (str): Path to the pretrained models.
        loss_pose (None): Deprecated argument. Please use
            `loss_keypoint` for heads instead.
        concat_tensors (bool): Whether to concatenate the input frames along
            the batch dimension, which lets the backbone run once and thus
            speeds up the forward pass. Default: True.
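
    Example:
        A minimal construction sketch. The config values below are
        illustrative placeholders (ResNet-50 backbone, 17 COCO keypoints);
        a real PoseWarper config specifies the full backbone, neck and
        head settings:

        >>> from mmpose.models import build_posenet
        >>> model = build_posenet(
        ...     dict(
        ...         type='PoseWarper',
        ...         backbone=dict(type='ResNet', depth=50),
        ...         neck=dict(
        ...             type='PoseWarperNeck',
        ...             in_channels=2048,
        ...             out_channels=17),
        ...         keypoint_head=dict(
        ...             type='TopdownHeatmapSimpleHead',
        ...             in_channels=17,
        ...             out_channels=17,
        ...             loss_keypoint=dict(type='JointsMSELoss')),
        ...         concat_tensors=True))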
    """

    def __init__(self,
                 backbone,
                 neck=None,
                 keypoint_head=None,
                 train_cfg=None,
                 test_cfg=None,
                 pretrained=None,
                 loss_pose=None,
                 concat_tensors=True):
        super().__init__(
            backbone=backbone,
            neck=neck,
            keypoint_head=keypoint_head,
            train_cfg=train_cfg,
            test_cfg=test_cfg,
            pretrained=pretrained,
            loss_pose=loss_pose)
        self.concat_tensors = concat_tensors

    @auto_fp16(apply_to=('img', ))
    def forward(self,
                img,
                target=None,
                target_weight=None,
                img_metas=None,
                return_loss=True,
                return_heatmap=False,
                **kwargs):
        """Calls either forward_train or forward_test depending on whether
        `return_loss` is True. Note this setting will change the expected
        inputs. When `return_loss=True`, img and img_metas are single-nested
        (i.e. Tensor and List[dict]), and when `return_loss=False`, img and
        img_metas should be double nested (i.e. List[Tensor],
        List[List[dict]]), with the outer list indicating test-time
        augmentations.

        Note:
            - number of frames: F
            - batch_size: N
            - num_keypoints: K
            - num_img_channel: C (Default: 3)
            - img height: imgH
            - img width: imgW
            - heatmaps height: H
            - heatmaps width: W

        Args:
            img (list[F,torch.Tensor[N,C,imgH,imgW]]): multiple input frames.
            target (torch.Tensor[N,K,H,W]): Target heatmaps for one frame.
            target_weight (torch.Tensor[N,K,1]): Weights across
                different joint types.
            img_metas (list(dict)): Information about data augmentation.
                By default this includes:

                - "image_file": paths to multiple video frames
                - "center": center of the bbox
                - "scale": scale of the bbox
                - "rotation": rotation of the bbox
                - "bbox_score": score of bbox
            return_loss (bool): Option to return loss. `return_loss=True`
                for training, `return_loss=False` for validation & test.
            return_heatmap (bool): Option to return heatmap.

        Returns:
            dict|tuple: if `return_loss` is true, then return losses. \
                Otherwise, return predicted poses, boxes, image paths \
                and heatmaps.
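
        Example:
            A minimal shape sketch for the training call, assuming F=2
            frames, N=1, K=17 keypoints, a 256x192 input, a 64x48 heatmap
            and a `frame_weight` entry in `img_metas`, which
            `forward_train` reads:

            >>> import torch
            >>> F, N, K = 2, 1, 17
            >>> img = [torch.randn(N, 3, 256, 192) for _ in range(F)]
            >>> target = torch.randn(N, K, 64, 48)
            >>> target_weight = torch.ones(N, K, 1)
            >>> img_metas = [dict(frame_weight=(0.0, 1.0))]
            >>> # with a built model:
            >>> # losses = model(img, target, target_weight, img_metas,
            >>> #                return_loss=True)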
        """
        if return_loss:
            return self.forward_train(img, target, target_weight, img_metas,
                                      **kwargs)
        return self.forward_test(
            img, img_metas, return_heatmap=return_heatmap, **kwargs)

    def forward_train(self, imgs, target, target_weight, img_metas, **kwargs):
        """Defines the computation performed at every call when training."""
        # imgs (list[F, torch.Tensor[N,C,imgH,imgW]]): multiple input frames
        assert imgs[0].size(0) == len(img_metas)
        num_frames = len(imgs)
        frame_weight = img_metas[0]['frame_weight']

        assert num_frames == len(frame_weight), f'The number of frames ' \
            f'({num_frames}) and the length of weights for each frame ' \
            f'({len(frame_weight)}) must match'

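        # With concat_tensors=True the F frames are stacked along the batch
        # dimension, so the backbone runs once on a (F*N, C, imgH, imgW)
        # tensor instead of doing F separate forward passes.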
        if self.concat_tensors:
            features = [self.backbone(torch.cat(imgs, 0))]
        else:
            features = [self.backbone(img) for img in imgs]

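        # The neck fuses the multi-frame features; `frame_weight` (read from
        # `img_metas` above) provides the per-frame weights.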
        if self.with_neck:
            features = self.neck(features, frame_weight=frame_weight)

        if self.with_keypoint:
            output = self.keypoint_head(features)

        # if return loss
        losses = dict()
        if self.with_keypoint:
            keypoint_losses = self.keypoint_head.get_loss(
                output, target, target_weight)
            losses.update(keypoint_losses)
            keypoint_accuracy = self.keypoint_head.get_accuracy(
                output, target, target_weight)
            losses.update(keypoint_accuracy)

        return losses

    def forward_test(self, imgs, img_metas, return_heatmap=False, **kwargs):
        """Defines the computation performed at every call when testing."""
        # imgs (list[F, torch.Tensor[N,C,imgH,imgW]]): multiple input frames
        assert imgs[0].size(0) == len(img_metas)
        num_frames = len(imgs)
        frame_weight = img_metas[0]['frame_weight']

        assert num_frames == len(frame_weight), f'The number of frames ' \
            f'({num_frames}) and the length of weights for each frame ' \
            f'({len(frame_weight)}) must match'

        batch_size, _, img_height, img_width = imgs[0].shape

        if batch_size > 1:
            assert 'bbox_id' in img_metas[0]

        result = {}

        if self.concat_tensors:
            features = [self.backbone(torch.cat(imgs, 0))]
        else:
            features = [self.backbone(img) for img in imgs]

        if self.with_neck:
            features = self.neck(features, frame_weight=frame_weight)

        if self.with_keypoint:
            output_heatmap = self.keypoint_head.inference_model(
                features, flip_pairs=None)

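        # Flip test: run backbone/neck/head again on horizontally flipped
        # frames, map the flipped heatmaps back via `flip_pairs`, and
        # average the two predictions.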
        if self.test_cfg.get('flip_test', True):
            imgs_flipped = [img.flip(3) for img in imgs]

            if self.concat_tensors:
                features_flipped = [self.backbone(torch.cat(imgs_flipped, 0))]
            else:
                features_flipped = [
                    self.backbone(img_flipped) for img_flipped in imgs_flipped
                ]

            if self.with_neck:
                features_flipped = self.neck(
                    features_flipped, frame_weight=frame_weight)

            if self.with_keypoint:
                output_flipped_heatmap = self.keypoint_head.inference_model(
                    features_flipped, img_metas[0]['flip_pairs'])
                output_heatmap = (output_heatmap +
                                  output_flipped_heatmap) * 0.5

        if self.with_keypoint:
            keypoint_result = self.keypoint_head.decode(
                img_metas, output_heatmap, img_size=[img_width, img_height])
            result.update(keypoint_result)

            if not return_heatmap:
                output_heatmap = None

            result['output_heatmap'] = output_heatmap

        return result

    def forward_dummy(self, img):
        """Used for computing network FLOPs.

        See ``tools/get_flops.py``.

        Args:
            img (torch.Tensor[N,C,imgH,imgW], or list|tuple of tensors):
                multiple input frames, N >= 2.

        Returns:
            Tensor: Output heatmaps.
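
        Example:
            An illustrative call, assuming a built `model` and 5 input
            frames of size 256x192 stacked along the batch dimension:

            >>> inp = torch.randn(5, 3, 256, 192)
            >>> heatmaps = model.forward_dummy(inp)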
"""
|
|
# concat tensors if they are in a list
|
|
if isinstance(img, (list, tuple)):
|
|
img = torch.cat(img, 0)
|
|
|
|
batch_size = img.size(0)
|
|
assert batch_size > 1, 'Input batch size to PoseWarper ' \
|
|
'should be larger than 1.'
|
|
if batch_size == 2:
|
|
warnings.warn('Current batch size: 2, for pytorch2onnx and '
|
|
'getting flops both.')
|
|
else:
|
|
warnings.warn(
|
|
f'Current batch size: {batch_size}, for getting flops only.')
|
|
|
|
frame_weight = np.random.uniform(0, 1, batch_size)
|
|
output = [self.backbone(img)]
|
|
|
|
if self.with_neck:
|
|
output = self.neck(output, frame_weight=frame_weight)
|
|
if self.with_keypoint:
|
|
output = self.keypoint_head(output)
|
|
return output
|
|
|