ViTPose/mmpose/models/backbones/rsn.py


								# Copyright (c) OpenMMLab. All rights reserved.

								import copy as cp


								import torch

								import torch.nn as nn

								import torch.nn.functional as F

								from mmcv.cnn import (ConvModule, MaxPool2d, constant_init, kaiming_init,

								                      normal_init)


								from ..builder import BACKBONES

								from .base_backbone import BaseBackbone


								class RSB(nn.Module):

								    """Residual Steps block for RSN. Paper ref: Cai et al. "Learning Delicate

								    Local Representations for Multi-Person Pose Estimation" (ECCV 2020).


								    Args:

								        in_channels (int): Input channels of this block.

								        out_channels (int): Output channels of this block.

								        num_steps (int): Numbers of steps in RSB

								        stride (int): stride of the block. Default: 1

								        downsample (nn.Module): downsample operation on identity branch.

								            Default: None.

								        norm_cfg (dict): dictionary to construct and config norm layer.

								            Default: dict(type='BN')

								        expand_times (int): Times by which the in_channels are expanded.

								            Default:26.

								        res_top_channels (int): Number of channels of feature output by

								            ResNet_top. Default:64.

								    """


								    expansion = 1


								    def __init__(self,

								                 in_channels,

								                 out_channels,

								                 num_steps=4,

								                 stride=1,

								                 downsample=None,

								                 with_cp=False,

								                 norm_cfg=dict(type='BN'),

								                 expand_times=26,

								                 res_top_channels=64):

								        # Protect mutable default arguments

								        norm_cfg = cp.deepcopy(norm_cfg)

								        super().__init__()

								        assert num_steps > 1

								        self.in_channels = in_channels

								        self.branch_channels = self.in_channels * expand_times

								        self.branch_channels //= res_top_channels

								        self.out_channels = out_channels

								        self.stride = stride

								        self.downsample = downsample

								        self.with_cp = with_cp

								        self.norm_cfg = norm_cfg

								        self.num_steps = num_steps

								        self.conv_bn_relu1 = ConvModule(

								            self.in_channels,

								            self.num_steps * self.branch_channels,

								            kernel_size=1,

								            stride=self.stride,

								            padding=0,

								            norm_cfg=self.norm_cfg,

								            inplace=False)

								        for i in range(self.num_steps):

								            for j in range(i + 1):

								                module_name = f'conv_bn_relu2_{i + 1}_{j + 1}'

								                self.add_module(

								                    module_name,

								                    ConvModule(

								                        self.branch_channels,

								                        self.branch_channels,

								                        kernel_size=3,

								                        stride=1,

								                        padding=1,

								                        norm_cfg=self.norm_cfg,

								                        inplace=False))

								        self.conv_bn3 = ConvModule(

								            self.num_steps * self.branch_channels,

								            self.out_channels * self.expansion,

								            kernel_size=1,

								            stride=1,

								            padding=0,

								            act_cfg=None,

								            norm_cfg=self.norm_cfg,

								            inplace=False)

								        self.relu = nn.ReLU(inplace=False)


								    def forward(self, x):

								        """Forward function."""


								        identity = x

								        x = self.conv_bn_relu1(x)

								        spx = torch.split(x, self.branch_channels, 1)

								        outputs = list()

								        outs = list()

								        for i in range(self.num_steps):

								            outputs_i = list()

								            outputs.append(outputs_i)

								            for j in range(i + 1):

								                if j == 0:

								                    inputs = spx[i]

								                else:

								                    inputs = outputs[i][j - 1]

								                if i > j:

								                    inputs = inputs + outputs[i - 1][j]

								                module_name = f'conv_bn_relu2_{i + 1}_{j + 1}'

								                module_i_j = getattr(self, module_name)

								                outputs[i].append(module_i_j(inputs))


								            outs.append(outputs[i][i])

								        out = torch.cat(tuple(outs), 1)

								        out = self.conv_bn3(out)


								        if self.downsample is not None:

								            identity = self.downsample(identity)

								        out = out + identity


								        out = self.relu(out)


								        return out


								class Downsample_module(nn.Module):

								    """Downsample module for RSN.


								    Args:

								        block (nn.Module): Downsample block.

								        num_blocks (list): Number of blocks in each downsample unit.

								        num_units (int): Numbers of downsample units. Default: 4

								        has_skip (bool): Have skip connections from prior upsample

								            module or not. Default:False

								        num_steps (int): Number of steps in a block. Default:4

								        norm_cfg (dict): dictionary to construct and config norm layer.

								            Default: dict(type='BN')

								        in_channels (int): Number of channels of the input feature to

								            downsample module. Default: 64

								        expand_times (int): Times by which the in_channels are expanded.

								            Default:26.

								    """


								    def __init__(self,

								                 block,

								                 num_blocks,

								                 num_steps=4,

								                 num_units=4,

								                 has_skip=False,

								                 norm_cfg=dict(type='BN'),

								                 in_channels=64,

								                 expand_times=26):

								        # Protect mutable default arguments

								        norm_cfg = cp.deepcopy(norm_cfg)

								        super().__init__()

								        self.has_skip = has_skip

								        self.in_channels = in_channels

								        assert len(num_blocks) == num_units

								        self.num_blocks = num_blocks

								        self.num_units = num_units

								        self.num_steps = num_steps

								        self.norm_cfg = norm_cfg

								        self.layer1 = self._make_layer(

								            block,

								            in_channels,

								            num_blocks[0],

								            expand_times=expand_times,

								            res_top_channels=in_channels)

								        for i in range(1, num_units):

								            module_name = f'layer{i + 1}'

								            self.add_module(

								                module_name,

								                self._make_layer(

								                    block,

								                    in_channels * pow(2, i),

								                    num_blocks[i],

								                    stride=2,

								                    expand_times=expand_times,

								                    res_top_channels=in_channels))


								    def _make_layer(self,

								                    block,

								                    out_channels,

								                    blocks,

								                    stride=1,

								                    expand_times=26,

								                    res_top_channels=64):

								        downsample = None

								        if stride != 1 or self.in_channels != out_channels * block.expansion:

								            downsample = ConvModule(

								                self.in_channels,

								                out_channels * block.expansion,

								                kernel_size=1,

								                stride=stride,

								                padding=0,

								                norm_cfg=self.norm_cfg,

								                act_cfg=None,

								                inplace=True)


								        units = list()

								        units.append(

								            block(

								                self.in_channels,

								                out_channels,

								                num_steps=self.num_steps,

								                stride=stride,

								                downsample=downsample,

								                norm_cfg=self.norm_cfg,

								                expand_times=expand_times,

								                res_top_channels=res_top_channels))

								        self.in_channels = out_channels * block.expansion

								        for _ in range(1, blocks):

								            units.append(

								                block(

								                    self.in_channels,

								                    out_channels,

								                    num_steps=self.num_steps,

								                    expand_times=expand_times,

								                    res_top_channels=res_top_channels))


								        return nn.Sequential(*units)


								    def forward(self, x, skip1, skip2):

								        out = list()

								        for i in range(self.num_units):

								            module_name = f'layer{i + 1}'

								            module_i = getattr(self, module_name)

								            x = module_i(x)

								            if self.has_skip:

								                x = x + skip1[i] + skip2[i]

								            out.append(x)

								        out.reverse()


								        return tuple(out)


								class Upsample_unit(nn.Module):

								    """Upsample unit for upsample module.


								    Args:

								        ind (int): Indicates whether to interpolate (>0) and whether to

								           generate feature map for the next hourglass-like module.

								        num_units (int): Number of units that form a upsample module. Along

								            with ind and gen_cross_conv, nm_units is used to decide whether

								            to generate feature map for the next hourglass-like module.

								        in_channels (int): Channel number of the skip-in feature maps from

								            the corresponding downsample unit.

								        unit_channels (int): Channel number in this unit. Default:256.

								        gen_skip: (bool): Whether or not to generate skips for the posterior

								            downsample module. Default:False

								        gen_cross_conv (bool): Whether to generate feature map for the next

								            hourglass-like module. Default:False

								        norm_cfg (dict): dictionary to construct and config norm layer.

								            Default: dict(type='BN')

								        out_channels (in): Number of channels of feature output by upsample

								            module. Must equal to in_channels of downsample module. Default:64

								    """


								    def __init__(self,

								                 ind,

								                 num_units,

								                 in_channels,

								                 unit_channels=256,

								                 gen_skip=False,

								                 gen_cross_conv=False,

								                 norm_cfg=dict(type='BN'),

								                 out_channels=64):

								        # Protect mutable default arguments

								        norm_cfg = cp.deepcopy(norm_cfg)

								        super().__init__()

								        self.num_units = num_units

								        self.norm_cfg = norm_cfg

								        self.in_skip = ConvModule(

								            in_channels,

								            unit_channels,

								            kernel_size=1,

								            stride=1,

								            padding=0,

								            norm_cfg=self.norm_cfg,

								            act_cfg=None,

								            inplace=True)

								        self.relu = nn.ReLU(inplace=True)


								        self.ind = ind

								        if self.ind > 0:

								            self.up_conv = ConvModule(

								                unit_channels,

								                unit_channels,

								                kernel_size=1,

								                stride=1,

								                padding=0,

								                norm_cfg=self.norm_cfg,

								                act_cfg=None,

								                inplace=True)


								        self.gen_skip = gen_skip

								        if self.gen_skip:

								            self.out_skip1 = ConvModule(

								                in_channels,

								                in_channels,

								                kernel_size=1,

								                stride=1,

								                padding=0,

								                norm_cfg=self.norm_cfg,

								                inplace=True)


								            self.out_skip2 = ConvModule(

								                unit_channels,

								                in_channels,

								                kernel_size=1,

								                stride=1,

								                padding=0,

								                norm_cfg=self.norm_cfg,

								                inplace=True)


								        self.gen_cross_conv = gen_cross_conv

								        if self.ind == num_units - 1 and self.gen_cross_conv:

								            self.cross_conv = ConvModule(

								                unit_channels,

								                out_channels,

								                kernel_size=1,

								                stride=1,

								                padding=0,

								                norm_cfg=self.norm_cfg,

								                inplace=True)


								    def forward(self, x, up_x):

								        out = self.in_skip(x)


								        if self.ind > 0:

								            up_x = F.interpolate(

								                up_x,

								                size=(x.size(2), x.size(3)),

								                mode='bilinear',

								                align_corners=True)

								            up_x = self.up_conv(up_x)

								            out = out + up_x

								        out = self.relu(out)


								        skip1 = None

								        skip2 = None

								        if self.gen_skip:

								            skip1 = self.out_skip1(x)

								            skip2 = self.out_skip2(out)


								        cross_conv = None

								        if self.ind == self.num_units - 1 and self.gen_cross_conv:

								            cross_conv = self.cross_conv(out)


								        return out, skip1, skip2, cross_conv


								class Upsample_module(nn.Module):

								    """Upsample module for RSN.


								    Args:

								        unit_channels (int): Channel number in the upsample units.

								            Default:256.

								        num_units (int): Numbers of upsample units. Default: 4

								        gen_skip (bool): Whether to generate skip for posterior downsample

								            module or not. Default:False

								        gen_cross_conv (bool): Whether to generate feature map for the next

								            hourglass-like module. Default:False

								        norm_cfg (dict): dictionary to construct and config norm layer.

								            Default: dict(type='BN')

								        out_channels (int): Number of channels of feature output by upsample

								            module. Must equal to in_channels of downsample module. Default:64

								    """


								    def __init__(self,

								                 unit_channels=256,

								                 num_units=4,

								                 gen_skip=False,

								                 gen_cross_conv=False,

								                 norm_cfg=dict(type='BN'),

								                 out_channels=64):

								        # Protect mutable default arguments

								        norm_cfg = cp.deepcopy(norm_cfg)

								        super().__init__()

								        self.in_channels = list()

								        for i in range(num_units):

								            self.in_channels.append(RSB.expansion * out_channels * pow(2, i))

								        self.in_channels.reverse()

								        self.num_units = num_units

								        self.gen_skip = gen_skip

								        self.gen_cross_conv = gen_cross_conv

								        self.norm_cfg = norm_cfg

								        for i in range(num_units):

								            module_name = f'up{i + 1}'

								            self.add_module(

								                module_name,

								                Upsample_unit(

								                    i,

								                    self.num_units,

								                    self.in_channels[i],

								                    unit_channels,

								                    self.gen_skip,

								                    self.gen_cross_conv,

								                    norm_cfg=self.norm_cfg,

								                    out_channels=64))


								    def forward(self, x):

								        out = list()

								        skip1 = list()

								        skip2 = list()

								        cross_conv = None

								        for i in range(self.num_units):

								            module_i = getattr(self, f'up{i + 1}')

								            if i == 0:

								                outi, skip1_i, skip2_i, _ = module_i(x[i], None)

								            elif i == self.num_units - 1:

								                outi, skip1_i, skip2_i, cross_conv = module_i(x[i], out[i - 1])

								            else:

								                outi, skip1_i, skip2_i, _ = module_i(x[i], out[i - 1])

								            out.append(outi)

								            skip1.append(skip1_i)

								            skip2.append(skip2_i)

								        skip1.reverse()

								        skip2.reverse()


								        return out, skip1, skip2, cross_conv


								class Single_stage_RSN(nn.Module):

								    """Single_stage Residual Steps Network.


								    Args:

								        unit_channels (int): Channel number in the upsample units. Default:256.

								        num_units (int): Numbers of downsample/upsample units. Default: 4

								        gen_skip (bool): Whether to generate skip for posterior downsample

								            module or not. Default:False

								        gen_cross_conv (bool): Whether to generate feature map for the next

								            hourglass-like module. Default:False

								        has_skip (bool): Have skip connections from prior upsample

								            module or not. Default:False

								        num_steps (int): Number of steps in RSB. Default: 4

								        num_blocks (list): Number of blocks in each downsample unit.

								            Default: [2, 2, 2, 2] Note: Make sure num_units==len(num_blocks)

								        norm_cfg (dict): dictionary to construct and config norm layer.

								            Default: dict(type='BN')

								        in_channels (int): Number of channels of the feature from ResNet_Top.

								            Default: 64.

								        expand_times (int): Times by which the in_channels are expanded in RSB.

								            Default:26.

								    """


								    def __init__(self,

								                 has_skip=False,

								                 gen_skip=False,

								                 gen_cross_conv=False,

								                 unit_channels=256,

								                 num_units=4,

								                 num_steps=4,

								                 num_blocks=[2, 2, 2, 2],

								                 norm_cfg=dict(type='BN'),

								                 in_channels=64,

								                 expand_times=26):

								        # Protect mutable default arguments

								        norm_cfg = cp.deepcopy(norm_cfg)

								        num_blocks = cp.deepcopy(num_blocks)

								        super().__init__()

								        assert len(num_blocks) == num_units

								        self.has_skip = has_skip

								        self.gen_skip = gen_skip

								        self.gen_cross_conv = gen_cross_conv

								        self.num_units = num_units

								        self.num_steps = num_steps

								        self.unit_channels = unit_channels

								        self.num_blocks = num_blocks

								        self.norm_cfg = norm_cfg


								        self.downsample = Downsample_module(RSB, num_blocks, num_steps,

								                                            num_units, has_skip, norm_cfg,

								                                            in_channels, expand_times)

								        self.upsample = Upsample_module(unit_channels, num_units, gen_skip,

								                                        gen_cross_conv, norm_cfg, in_channels)


								    def forward(self, x, skip1, skip2):

								        mid = self.downsample(x, skip1, skip2)

								        out, skip1, skip2, cross_conv = self.upsample(mid)


								        return out, skip1, skip2, cross_conv


								class ResNet_top(nn.Module):

								    """ResNet top for RSN.


								    Args:

								        norm_cfg (dict): dictionary to construct and config norm layer.

								            Default: dict(type='BN')

								        channels (int): Number of channels of the feature output by ResNet_top.

								    """


								    def __init__(self, norm_cfg=dict(type='BN'), channels=64):

								        # Protect mutable default arguments

								        norm_cfg = cp.deepcopy(norm_cfg)

								        super().__init__()

								        self.top = nn.Sequential(

								            ConvModule(

								                3,

								                channels,

								                kernel_size=7,

								                stride=2,

								                padding=3,

								                norm_cfg=norm_cfg,

								                inplace=True), MaxPool2d(kernel_size=3, stride=2, padding=1))


								    def forward(self, img):

								        return self.top(img)


								@BACKBONES.register_module()

								class RSN(BaseBackbone):

								    """Residual Steps Network backbone. Paper ref: Cai et al. "Learning

								    Delicate Local Representations for Multi-Person Pose Estimation" (ECCV

								    2020).


								    Args:

								        unit_channels (int): Number of Channels in an upsample unit.

								            Default: 256

								        num_stages (int): Number of stages in a multi-stage RSN. Default: 4

								        num_units (int): NUmber of downsample/upsample units in a single-stage

								            RSN. Default: 4 Note: Make sure num_units == len(self.num_blocks)

								        num_blocks (list): Number of RSBs (Residual Steps Block) in each

								            downsample unit. Default: [2, 2, 2, 2]

								        num_steps (int): Number of steps in a RSB. Default:4

								        norm_cfg (dict): dictionary to construct and config norm layer.

								            Default: dict(type='BN')

								        res_top_channels (int): Number of channels of feature from ResNet_top.

								            Default: 64.

								        expand_times (int): Times by which the in_channels are expanded in RSB.

								            Default:26.

								    Example:

								        >>> from mmpose.models import RSN

								        >>> import torch

								        >>> self = RSN(num_stages=2,num_units=2,num_blocks=[2,2])

								        >>> self.eval()

								        >>> inputs = torch.rand(1, 3, 511, 511)

								        >>> level_outputs = self.forward(inputs)

								        >>> for level_output in level_outputs:

								        ...     for feature in level_output:

								        ...         print(tuple(feature.shape))

								        ...

								        (1, 256, 64, 64)

								        (1, 256, 128, 128)

								        (1, 256, 64, 64)

								        (1, 256, 128, 128)

								    """


								    def __init__(self,

								                 unit_channels=256,

								                 num_stages=4,

								                 num_units=4,

								                 num_blocks=[2, 2, 2, 2],

								                 num_steps=4,

								                 norm_cfg=dict(type='BN'),

								                 res_top_channels=64,

								                 expand_times=26):

								        # Protect mutable default arguments

								        norm_cfg = cp.deepcopy(norm_cfg)

								        num_blocks = cp.deepcopy(num_blocks)

								        super().__init__()

								        self.unit_channels = unit_channels

								        self.num_stages = num_stages

								        self.num_units = num_units

								        self.num_blocks = num_blocks

								        self.num_steps = num_steps

								        self.norm_cfg = norm_cfg


								        assert self.num_stages > 0

								        assert self.num_steps > 1

								        assert self.num_units > 1

								        assert self.num_units == len(self.num_blocks)

								        self.top = ResNet_top(norm_cfg=norm_cfg)

								        self.multi_stage_rsn = nn.ModuleList([])

								        for i in range(self.num_stages):

								            if i == 0:

								                has_skip = False

								            else:

								                has_skip = True

								            if i != self.num_stages - 1:

								                gen_skip = True

								                gen_cross_conv = True

								            else:

								                gen_skip = False

								                gen_cross_conv = False

								            self.multi_stage_rsn.append(

								                Single_stage_RSN(has_skip, gen_skip, gen_cross_conv,

								                                 unit_channels, num_units, num_steps,

								                                 num_blocks, norm_cfg, res_top_channels,

								                                 expand_times))


								    def forward(self, x):

								        """Model forward function."""

								        out_feats = []

								        skip1 = None

								        skip2 = None

								        x = self.top(x)

								        for i in range(self.num_stages):

								            out, skip1, skip2, x = self.multi_stage_rsn[i](x, skip1, skip2)

								            out_feats.append(out)


								        return out_feats


								    def init_weights(self, pretrained=None):

								        """Initialize model weights."""

								        for m in self.multi_stage_rsn.modules():

								            if isinstance(m, nn.Conv2d):

								                kaiming_init(m)

								            elif isinstance(m, nn.BatchNorm2d):

								                constant_init(m, 1)

								            elif isinstance(m, nn.Linear):

								                normal_init(m, std=0.01)


								        for m in self.top.modules():

								            if isinstance(m, nn.Conv2d):

								                kaiming_init(m)