# Copyright (c) OpenMMLab. All rights reserved.
import math

import torch
import torch.nn as nn
# from timm.models.layers import to_2tuple, trunc_normal_
from mmcv.cnn import (build_activation_layer, build_conv_layer,
                      build_norm_layer, trunc_normal_init)
from mmcv.cnn.bricks.transformer import build_dropout
from mmcv.runner import BaseModule
from torch.nn.functional import pad

from ..builder import BACKBONES
from .hrnet import Bottleneck, HRModule, HRNet


def nlc_to_nchw(x, hw_shape):
    """Convert [N, L, C] shape tensor to [N, C, H, W] shape tensor.

    Args:
        x (Tensor): The input tensor of shape [N, L, C] before conversion.
        hw_shape (Sequence[int]): The height and width of output feature map.

    Returns:
        Tensor: The output tensor of shape [N, C, H, W] after conversion.
    """
    H, W = hw_shape
    assert len(x.shape) == 3
    B, L, C = x.shape
    assert L == H * W, 'The seq_len doesn\'t match H, W'
    return x.transpose(1, 2).reshape(B, C, H, W)


def nchw_to_nlc(x):
    """Flatten [N, C, H, W] shape tensor to [N, L, C] shape tensor.

    Args:
        x (Tensor): The input tensor of shape [N, C, H, W] before conversion.

    Returns:
        Tensor: The output tensor of shape [N, L, C] after conversion.
    """
    assert len(x.shape) == 4
    return x.flatten(2).transpose(1, 2).contiguous()


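# Quick usage sketch (illustrative only, not executed on import): the two
# helpers above are exact inverses for a feature map whose spatial size
# matches ``hw_shape``; the 2x8x4x4 tensor below is an arbitrary example.
#
#   >>> feat = torch.rand(2, 8, 4, 4)            # [N, C, H, W]
#   >>> seq = nchw_to_nlc(feat)                  # [N, L, C] == (2, 16, 8)
#   >>> torch.equal(nlc_to_nchw(seq, (4, 4)), feat)
#   True

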
def build_drop_path(drop_path_rate):
    """Build drop path layer."""
    return build_dropout(dict(type='DropPath', drop_prob=drop_path_rate))


class WindowMSA(BaseModule):
    """Window based multi-head self-attention (W-MSA) module with relative
    position bias.

    Args:
        embed_dims (int): Number of input channels.
        num_heads (int): Number of attention heads.
        window_size (tuple[int]): The height and width of the window.
        qkv_bias (bool, optional): If True, add a learnable bias to q, k, v.
            Default: True.
        qk_scale (float | None, optional): Override default qk scale of
            head_dim ** -0.5 if set. Default: None.
        attn_drop_rate (float, optional): Dropout ratio of attention weight.
            Default: 0.0
        proj_drop_rate (float, optional): Dropout ratio of output. Default: 0.
        with_rpe (bool, optional): If True, use relative position bias.
            Default: True.
        init_cfg (dict | None, optional): The Config for initialization.
            Default: None.
    """

    def __init__(self,
                 embed_dims,
                 num_heads,
                 window_size,
                 qkv_bias=True,
                 qk_scale=None,
                 attn_drop_rate=0.,
                 proj_drop_rate=0.,
                 with_rpe=True,
                 init_cfg=None):

        super().__init__(init_cfg=init_cfg)
        self.embed_dims = embed_dims
        self.window_size = window_size  # Wh, Ww
        self.num_heads = num_heads
        head_embed_dims = embed_dims // num_heads
        self.scale = qk_scale or head_embed_dims**-0.5

        self.with_rpe = with_rpe
        if self.with_rpe:
            # define a parameter table of relative position bias
            self.relative_position_bias_table = nn.Parameter(
                torch.zeros(
                    (2 * window_size[0] - 1) * (2 * window_size[1] - 1),
                    num_heads))  # 2*Wh-1 * 2*Ww-1, nH

            Wh, Ww = self.window_size
            rel_index_coords = self.double_step_seq(2 * Ww - 1, Wh, 1, Ww)
            rel_position_index = rel_index_coords + rel_index_coords.T
            rel_position_index = rel_position_index.flip(1).contiguous()
            self.register_buffer('relative_position_index',
                                 rel_position_index)

        self.qkv = nn.Linear(embed_dims, embed_dims * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop_rate)
        self.proj = nn.Linear(embed_dims, embed_dims)
        self.proj_drop = nn.Dropout(proj_drop_rate)

        self.softmax = nn.Softmax(dim=-1)

    def init_weights(self):
        trunc_normal_init(self.relative_position_bias_table, std=0.02)

    def forward(self, x, mask=None):
        """
        Args:
            x (tensor): input features with shape of (B*num_windows, N, C)
            mask (tensor | None, Optional): mask with shape of (num_windows,
                Wh*Ww, Wh*Ww), value should be between (-inf, 0].
        """
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads,
                                  C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]

        q = q * self.scale
        attn = (q @ k.transpose(-2, -1))

        if self.with_rpe:
            relative_position_bias = self.relative_position_bias_table[
                self.relative_position_index.view(-1)].view(
                    self.window_size[0] * self.window_size[1],
                    self.window_size[0] * self.window_size[1],
                    -1)  # Wh*Ww,Wh*Ww,nH
            relative_position_bias = relative_position_bias.permute(
                2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
            attn = attn + relative_position_bias.unsqueeze(0)

        if mask is not None:
            nW = mask.shape[0]
            attn = attn.view(B // nW, nW, self.num_heads, N,
                             N) + mask.unsqueeze(1).unsqueeze(0)
            attn = attn.view(-1, self.num_heads, N, N)
        attn = self.softmax(attn)

        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x

    @staticmethod
    def double_step_seq(step1, len1, step2, len2):
        seq1 = torch.arange(0, step1 * len1, step1)
        seq2 = torch.arange(0, step2 * len2, step2)
        return (seq1[:, None] + seq2[None, :]).reshape(1, -1)


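# Worked example (assumed numbers, not part of the original API docs): for a
# 2x2 window, double_step_seq(2 * Ww - 1, Wh, 1, Ww) == double_step_seq(3, 2,
# 1, 2) yields tensor([[0, 1, 3, 4]]). Adding its transpose and flipping each
# row produces the 4x4 relative_position_index with values in [0, 8], which
# index the (2*Wh - 1) * (2*Ww - 1) == 9 rows of relative_position_bias_table.

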
class LocalWindowSelfAttention(BaseModule):
    r"""Local-window Self Attention (LSA) module with relative position bias.

    This module is the short-range self-attention module in the
    `Interlaced Sparse Self-Attention <https://arxiv.org/abs/1907.12273>`_.

    Args:
        embed_dims (int): Number of input channels.
        num_heads (int): Number of attention heads.
        window_size (tuple[int] | int): The height and width of the window.
        qkv_bias (bool, optional): If True, add a learnable bias to q, k, v.
            Default: True.
        qk_scale (float | None, optional): Override default qk scale of
            head_dim ** -0.5 if set. Default: None.
        attn_drop_rate (float, optional): Dropout ratio of attention weight.
            Default: 0.0
        proj_drop_rate (float, optional): Dropout ratio of output. Default: 0.
        with_rpe (bool, optional): If True, use relative position bias.
            Default: True.
        with_pad_mask (bool, optional): If True, mask out the padded tokens in
            the attention process. Default: False.
        init_cfg (dict | None, optional): The Config for initialization.
            Default: None.
    """

    def __init__(self,
                 embed_dims,
                 num_heads,
                 window_size,
                 qkv_bias=True,
                 qk_scale=None,
                 attn_drop_rate=0.,
                 proj_drop_rate=0.,
                 with_rpe=True,
                 with_pad_mask=False,
                 init_cfg=None):
        super().__init__(init_cfg=init_cfg)
        if isinstance(window_size, int):
            window_size = (window_size, window_size)
        self.window_size = window_size
        self.with_pad_mask = with_pad_mask
        self.attn = WindowMSA(
            embed_dims=embed_dims,
            num_heads=num_heads,
            window_size=window_size,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop_rate=attn_drop_rate,
            proj_drop_rate=proj_drop_rate,
            with_rpe=with_rpe,
            init_cfg=init_cfg)

    def forward(self, x, H, W, **kwargs):
        """Forward function."""
        B, N, C = x.shape
        x = x.view(B, H, W, C)
        Wh, Ww = self.window_size

        # center-pad the feature on H and W axes
        pad_h = math.ceil(H / Wh) * Wh - H
        pad_w = math.ceil(W / Ww) * Ww - W
        x = pad(x, (0, 0, pad_w // 2, pad_w - pad_w // 2, pad_h // 2,
                    pad_h - pad_h // 2))

        # permute
        x = x.view(B, math.ceil(H / Wh), Wh, math.ceil(W / Ww), Ww, C)
        x = x.permute(0, 1, 3, 2, 4, 5)
        x = x.reshape(-1, Wh * Ww, C)  # (B*num_window, Wh*Ww, C)

        # attention
        if self.with_pad_mask and pad_h > 0 and pad_w > 0:
            pad_mask = x.new_zeros(1, H, W, 1)
            pad_mask = pad(
                pad_mask, [
                    0, 0, pad_w // 2, pad_w - pad_w // 2, pad_h // 2,
                    pad_h - pad_h // 2
                ],
                value=-float('inf'))
            pad_mask = pad_mask.view(1, math.ceil(H / Wh), Wh,
                                     math.ceil(W / Ww), Ww, 1)
            pad_mask = pad_mask.permute(1, 3, 0, 2, 4, 5)
            pad_mask = pad_mask.reshape(-1, Wh * Ww)
            pad_mask = pad_mask[:, None, :].expand([-1, Wh * Ww, -1])
            out = self.attn(x, pad_mask, **kwargs)
        else:
            out = self.attn(x, **kwargs)

        # reverse permutation
        out = out.reshape(B, math.ceil(H / Wh), math.ceil(W / Ww), Wh, Ww, C)
        out = out.permute(0, 1, 3, 2, 4, 5)
        out = out.reshape(B, H + pad_h, W + pad_w, C)

        # de-pad
        out = out[:, pad_h // 2:H + pad_h // 2, pad_w // 2:W + pad_w // 2]
        return out.reshape(B, N, C)


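# Shape walk-through (assumed example, not from the original docs): with
# window_size=(7, 7) and a 10x10 feature map, pad_h = pad_w =
# ceil(10/7) * 7 - 10 = 4, so the map is center-padded to 14x14 and split into
# 2 * 2 = 4 windows of 7 * 7 = 49 tokens each; WindowMSA then attends within
# each (B*4, 49, C) window before the permutation is reversed and the padding
# is cropped away.

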
class CrossFFN(BaseModule):
    r"""FFN with Depthwise Conv of HRFormer.

    Args:
        in_features (int): The feature dimension.
        hidden_features (int, optional): The hidden dimension of FFNs.
            Default: The same as in_features.
        out_features (int, optional): The output feature dimension.
            Default: The same as in_features.
        act_cfg (dict, optional): Config of activation layer.
            Default: dict(type='GELU').
        dw_act_cfg (dict, optional): Config of activation layer appended
            right after DW Conv. Default: dict(type='GELU').
        norm_cfg (dict, optional): Config of norm layer.
            Default: dict(type='SyncBN').
        init_cfg (dict | list | None, optional): The init config.
            Default: None.
    """

    def __init__(self,
                 in_features,
                 hidden_features=None,
                 out_features=None,
                 act_cfg=dict(type='GELU'),
                 dw_act_cfg=dict(type='GELU'),
                 norm_cfg=dict(type='SyncBN'),
                 init_cfg=None):
        super().__init__(init_cfg=init_cfg)
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Conv2d(in_features, hidden_features, kernel_size=1)
        self.act1 = build_activation_layer(act_cfg)
        self.norm1 = build_norm_layer(norm_cfg, hidden_features)[1]
        self.dw3x3 = nn.Conv2d(
            hidden_features,
            hidden_features,
            kernel_size=3,
            stride=1,
            groups=hidden_features,
            padding=1)
        self.act2 = build_activation_layer(dw_act_cfg)
        self.norm2 = build_norm_layer(norm_cfg, hidden_features)[1]
        self.fc2 = nn.Conv2d(hidden_features, out_features, kernel_size=1)
        self.act3 = build_activation_layer(act_cfg)
        self.norm3 = build_norm_layer(norm_cfg, out_features)[1]

        # put the modules together
        self.layers = [
            self.fc1, self.norm1, self.act1, self.dw3x3, self.norm2, self.act2,
            self.fc2, self.norm3, self.act3
        ]

    def forward(self, x, H, W):
        """Forward function."""
        x = nlc_to_nchw(x, (H, W))
        for layer in self.layers:
            x = layer(x)
        x = nchw_to_nlc(x)
        return x


class HRFormerBlock(BaseModule):
    """High-Resolution Block for HRFormer.

    Args:
        in_features (int): The input dimension.
        out_features (int): The output dimension.
        num_heads (int): The number of attention heads within each LSA.
        window_size (int, optional): The window size for the LSA.
            Default: 7
        mlp_ratio (int, optional): The expansion ratio of FFN.
            Default: 4
        drop_path (float, optional): The drop path rate of the block.
            Default: 0.0
        act_cfg (dict, optional): Config of activation layer.
            Default: dict(type='GELU').
        norm_cfg (dict, optional): Config of norm layer.
            Default: dict(type='SyncBN').
        transformer_norm_cfg (dict, optional): Config of transformer norm
            layer. Default: dict(type='LN', eps=1e-6).
        init_cfg (dict | list | None, optional): The init config.
            Default: None.
    """

    expansion = 1

    def __init__(self,
                 in_features,
                 out_features,
                 num_heads,
                 window_size=7,
                 mlp_ratio=4.0,
                 drop_path=0.0,
                 act_cfg=dict(type='GELU'),
                 norm_cfg=dict(type='SyncBN'),
                 transformer_norm_cfg=dict(type='LN', eps=1e-6),
                 init_cfg=None,
                 **kwargs):
        super(HRFormerBlock, self).__init__(init_cfg=init_cfg)
        self.num_heads = num_heads
        self.window_size = window_size
        self.mlp_ratio = mlp_ratio

        self.norm1 = build_norm_layer(transformer_norm_cfg, in_features)[1]
        self.attn = LocalWindowSelfAttention(
            in_features,
            num_heads=num_heads,
            window_size=window_size,
            init_cfg=None,
            **kwargs)

        self.norm2 = build_norm_layer(transformer_norm_cfg, out_features)[1]
        self.ffn = CrossFFN(
            in_features=in_features,
            hidden_features=int(in_features * mlp_ratio),
            out_features=out_features,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg,
            dw_act_cfg=act_cfg,
            init_cfg=None)

        self.drop_path = build_drop_path(
            drop_path) if drop_path > 0.0 else nn.Identity()

    def forward(self, x):
        """Forward function."""
        B, C, H, W = x.size()
        # Attention
        x = x.view(B, C, -1).permute(0, 2, 1)
        x = x + self.drop_path(self.attn(self.norm1(x), H, W))
        # FFN
        x = x + self.drop_path(self.ffn(self.norm2(x), H, W))
        x = x.permute(0, 2, 1).view(B, C, H, W)
        return x

    def extra_repr(self):
        """(Optional) Set the extra information about this module."""
        return 'num_heads={}, window_size={}, mlp_ratio={}'.format(
            self.num_heads, self.window_size, self.mlp_ratio)


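# Usage sketch (illustrative, not executed on import). Plain BN is substituted
# for the default SyncBN so the block can run in a single, non-distributed
# process; the shapes below are an arbitrary example:
#
#   >>> block = HRFormerBlock(32, 32, num_heads=1, window_size=7,
#   ...                       norm_cfg=dict(type='BN'))
#   >>> out = block(torch.rand(1, 32, 14, 14))
#   >>> tuple(out.shape)
#   (1, 32, 14, 14)

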
class HRFomerModule(HRModule):
    """High-Resolution Module for HRFormer.

    Args:
        num_branches (int): The number of branches in the HRFormerModule.
        block (nn.Module): The building block of HRFormer.
            The block should be the HRFormerBlock.
        num_blocks (tuple): The number of blocks in each branch.
            The length must be equal to num_branches.
        num_inchannels (tuple): The number of input channels in each branch.
            The length must be equal to num_branches.
        num_channels (tuple): The number of channels in each branch.
            The length must be equal to num_branches.
        num_heads (tuple): The number of heads within the LSAs.
        num_window_sizes (tuple): The window size for the LSAs.
        num_mlp_ratios (tuple): The expansion ratio for the FFNs.
        multiscale_output (bool, optional): Whether to output multi-level
            features produced by multiple branches. If False, only the first
            level feature will be output. Default: True.
        drop_paths (list[float] | float, optional): The drop path rates of
            the HRFormer blocks. Default: 0.0
        with_rpe (bool, optional): Whether to use relative position bias in
            the LSAs. Default: True.
        with_pad_mask (bool, optional): Whether to mask out the padded tokens
            in the LSAs. Default: False.
        conv_cfg (dict, optional): Config of the conv layers.
            Default: None.
        norm_cfg (dict, optional): Config of the norm layers appended
            right after conv. Default: dict(type='SyncBN', requires_grad=True)
        transformer_norm_cfg (dict, optional): Config of the norm layers.
            Default: dict(type='LN', eps=1e-6)
        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
            memory while slowing down the training speed. Default: False
        upsample_cfg (dict, optional): The config of upsample layers in fuse
            layers. Default: dict(mode='bilinear', align_corners=False)
    """

    def __init__(self,
                 num_branches,
                 block,
                 num_blocks,
                 num_inchannels,
                 num_channels,
                 num_heads,
                 num_window_sizes,
                 num_mlp_ratios,
                 multiscale_output=True,
                 drop_paths=0.0,
                 with_rpe=True,
                 with_pad_mask=False,
                 conv_cfg=None,
                 norm_cfg=dict(type='SyncBN', requires_grad=True),
                 transformer_norm_cfg=dict(type='LN', eps=1e-6),
                 with_cp=False,
                 upsample_cfg=dict(mode='bilinear', align_corners=False)):

        self.transformer_norm_cfg = transformer_norm_cfg
        self.drop_paths = drop_paths
        self.num_heads = num_heads
        self.num_window_sizes = num_window_sizes
        self.num_mlp_ratios = num_mlp_ratios
        self.with_rpe = with_rpe
        self.with_pad_mask = with_pad_mask

        super().__init__(num_branches, block, num_blocks, num_inchannels,
                         num_channels, multiscale_output, with_cp, conv_cfg,
                         norm_cfg, upsample_cfg)

    def _make_one_branch(self,
                         branch_index,
                         block,
                         num_blocks,
                         num_channels,
                         stride=1):
        """Build one branch."""
        # HRFormerBlock does not support down sample layer yet.
        assert stride == 1 and self.in_channels[branch_index] == num_channels[
            branch_index]
        layers = []
        layers.append(
            block(
                self.in_channels[branch_index],
                num_channels[branch_index],
                num_heads=self.num_heads[branch_index],
                window_size=self.num_window_sizes[branch_index],
                mlp_ratio=self.num_mlp_ratios[branch_index],
                drop_path=self.drop_paths[0],
                norm_cfg=self.norm_cfg,
                transformer_norm_cfg=self.transformer_norm_cfg,
                init_cfg=None,
                with_rpe=self.with_rpe,
                with_pad_mask=self.with_pad_mask))

        self.in_channels[
            branch_index] = self.in_channels[branch_index] * block.expansion
        for i in range(1, num_blocks[branch_index]):
            layers.append(
                block(
                    self.in_channels[branch_index],
                    num_channels[branch_index],
                    num_heads=self.num_heads[branch_index],
                    window_size=self.num_window_sizes[branch_index],
                    mlp_ratio=self.num_mlp_ratios[branch_index],
                    drop_path=self.drop_paths[i],
                    norm_cfg=self.norm_cfg,
                    transformer_norm_cfg=self.transformer_norm_cfg,
                    init_cfg=None,
                    with_rpe=self.with_rpe,
                    with_pad_mask=self.with_pad_mask))
        return nn.Sequential(*layers)

    def _make_fuse_layers(self):
        """Build fuse layers."""
        if self.num_branches == 1:
            return None
        num_branches = self.num_branches
        num_inchannels = self.in_channels
        fuse_layers = []
        for i in range(num_branches if self.multiscale_output else 1):
            fuse_layer = []
            for j in range(num_branches):
                if j > i:
                    fuse_layer.append(
                        nn.Sequential(
                            build_conv_layer(
                                self.conv_cfg,
                                num_inchannels[j],
                                num_inchannels[i],
                                kernel_size=1,
                                stride=1,
                                bias=False),
                            build_norm_layer(self.norm_cfg,
                                             num_inchannels[i])[1],
                            nn.Upsample(
                                scale_factor=2**(j - i),
                                mode=self.upsample_cfg['mode'],
                                align_corners=self.
                                upsample_cfg['align_corners'])))
                elif j == i:
                    fuse_layer.append(None)
                else:
                    conv3x3s = []
                    for k in range(i - j):
                        if k == i - j - 1:
                            num_outchannels_conv3x3 = num_inchannels[i]
                            with_out_act = False
                        else:
                            num_outchannels_conv3x3 = num_inchannels[j]
                            with_out_act = True
                        sub_modules = [
                            build_conv_layer(
                                self.conv_cfg,
                                num_inchannels[j],
                                num_inchannels[j],
                                kernel_size=3,
                                stride=2,
                                padding=1,
                                groups=num_inchannels[j],
                                bias=False,
                            ),
                            build_norm_layer(self.norm_cfg,
                                             num_inchannels[j])[1],
                            build_conv_layer(
                                self.conv_cfg,
                                num_inchannels[j],
                                num_outchannels_conv3x3,
                                kernel_size=1,
                                stride=1,
                                bias=False,
                            ),
                            build_norm_layer(self.norm_cfg,
                                             num_outchannels_conv3x3)[1]
                        ]
                        if with_out_act:
                            sub_modules.append(nn.ReLU(False))
                        conv3x3s.append(nn.Sequential(*sub_modules))
                    fuse_layer.append(nn.Sequential(*conv3x3s))
            fuse_layers.append(nn.ModuleList(fuse_layer))

        return nn.ModuleList(fuse_layers)

    def get_num_inchannels(self):
        """Return the number of input channels."""
        return self.in_channels


@BACKBONES.register_module()
class HRFormer(HRNet):
    """HRFormer backbone.

    This backbone is the implementation of `HRFormer: High-Resolution
    Transformer for Dense Prediction <https://arxiv.org/abs/2110.09408>`_.

    Args:
        extra (dict): Detailed configuration for each stage of HRNet.
            There must be 4 stages, the configuration for each stage must have
            5 keys:

            - num_modules (int): The number of HRModule in this stage.
            - num_branches (int): The number of branches in the HRModule.
            - block (str): The type of block.
            - num_blocks (tuple): The number of blocks in each branch.
                The length must be equal to num_branches.
            - num_channels (tuple): The number of channels in each branch.
                The length must be equal to num_branches.
        in_channels (int): Number of input image channels. Normally 3.
        conv_cfg (dict): Dictionary to construct and config conv layer.
            Default: None.
        norm_cfg (dict): Config of norm layer.
            Use `SyncBN` by default.
        transformer_norm_cfg (dict): Config of transformer norm layer.
            Use `LN` by default.
        norm_eval (bool): Whether to set norm layers to eval mode, namely,
            freeze running stats (mean and var). Note: Effect on Batch Norm
            and its variants only. Default: False.
        zero_init_residual (bool): Whether to use zero init for last norm layer
            in resblocks to let them behave as identity. Default: False.
        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
            -1 means not freezing any parameters. Default: -1.
    Example:
        >>> from mmpose.models import HRFormer
        >>> import torch
        >>> extra = dict(
        >>>     drop_path_rate=0.,
        >>>     stage1=dict(
        >>>         num_modules=1,
        >>>         num_branches=1,
        >>>         block='BOTTLENECK',
        >>>         num_blocks=(2, ),
        >>>         num_channels=(64, )),
        >>>     stage2=dict(
        >>>         num_modules=1,
        >>>         num_branches=2,
        >>>         block='HRFORMERBLOCK',
        >>>         window_sizes=(7, 7),
        >>>         num_heads=(1, 2),
        >>>         mlp_ratios=(4, 4),
        >>>         num_blocks=(2, 2),
        >>>         num_channels=(32, 64)),
        >>>     stage3=dict(
        >>>         num_modules=4,
        >>>         num_branches=3,
        >>>         block='HRFORMERBLOCK',
        >>>         window_sizes=(7, 7, 7),
        >>>         num_heads=(1, 2, 4),
        >>>         mlp_ratios=(4, 4, 4),
        >>>         num_blocks=(2, 2, 2),
        >>>         num_channels=(32, 64, 128)),
        >>>     stage4=dict(
        >>>         num_modules=2,
        >>>         num_branches=4,
        >>>         block='HRFORMERBLOCK',
        >>>         window_sizes=(7, 7, 7, 7),
        >>>         num_heads=(1, 2, 4, 8),
        >>>         mlp_ratios=(4, 4, 4, 4),
        >>>         num_blocks=(2, 2, 2, 2),
        >>>         num_channels=(32, 64, 128, 256)))
        >>> self = HRFormer(extra, in_channels=1)
        >>> self.eval()
        >>> inputs = torch.rand(1, 1, 32, 32)
        >>> level_outputs = self.forward(inputs)
        >>> for level_out in level_outputs:
        ...     print(tuple(level_out.shape))
        (1, 32, 8, 8)
        (1, 64, 4, 4)
        (1, 128, 2, 2)
        (1, 256, 1, 1)
    """

    blocks_dict = {'BOTTLENECK': Bottleneck, 'HRFORMERBLOCK': HRFormerBlock}

    def __init__(self,
                 extra,
                 in_channels=3,
                 conv_cfg=None,
                 norm_cfg=dict(type='BN', requires_grad=True),
                 transformer_norm_cfg=dict(type='LN', eps=1e-6),
                 norm_eval=False,
                 with_cp=False,
                 zero_init_residual=False,
                 frozen_stages=-1):

        # stochastic depth
        depths = [
            extra[stage]['num_blocks'][0] * extra[stage]['num_modules']
            for stage in ['stage2', 'stage3', 'stage4']
        ]
        depth_s2, depth_s3, _ = depths
        drop_path_rate = extra['drop_path_rate']
        dpr = [
            x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
        ]
        extra['stage2']['drop_path_rates'] = dpr[0:depth_s2]
        extra['stage3']['drop_path_rates'] = dpr[depth_s2:depth_s2 + depth_s3]
        extra['stage4']['drop_path_rates'] = dpr[depth_s2 + depth_s3:]

        # HRFormer uses bilinear upsample by default
        upsample_cfg = extra.get('upsample', {
            'mode': 'bilinear',
            'align_corners': False
        })
        extra['upsample'] = upsample_cfg
        self.transformer_norm_cfg = transformer_norm_cfg
        self.with_rpe = extra.get('with_rpe', True)
        self.with_pad_mask = extra.get('with_pad_mask', False)

        super().__init__(extra, in_channels, conv_cfg, norm_cfg, norm_eval,
                         with_cp, zero_init_residual, frozen_stages)

    def _make_stage(self,
                    layer_config,
                    num_inchannels,
                    multiscale_output=True):
        """Make each stage."""
        num_modules = layer_config['num_modules']
        num_branches = layer_config['num_branches']
        num_blocks = layer_config['num_blocks']
        num_channels = layer_config['num_channels']
        block = self.blocks_dict[layer_config['block']]
        num_heads = layer_config['num_heads']
        num_window_sizes = layer_config['window_sizes']
        num_mlp_ratios = layer_config['mlp_ratios']
        drop_path_rates = layer_config['drop_path_rates']

        modules = []
        for i in range(num_modules):
            # multiscale_output is only used at the last module
            if not multiscale_output and i == num_modules - 1:
                reset_multiscale_output = False
            else:
                reset_multiscale_output = True

            modules.append(
                HRFomerModule(
                    num_branches,
                    block,
                    num_blocks,
                    num_inchannels,
                    num_channels,
                    num_heads,
                    num_window_sizes,
                    num_mlp_ratios,
                    reset_multiscale_output,
                    drop_paths=drop_path_rates[num_blocks[0] *
                                               i:num_blocks[0] * (i + 1)],
                    with_rpe=self.with_rpe,
                    with_pad_mask=self.with_pad_mask,
                    conv_cfg=self.conv_cfg,
                    norm_cfg=self.norm_cfg,
                    transformer_norm_cfg=self.transformer_norm_cfg,
                    with_cp=self.with_cp,
                    upsample_cfg=self.upsample_cfg))
            num_inchannels = modules[-1].get_num_inchannels()

        return nn.Sequential(*modules), num_inchannels
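# Note on the stochastic-depth schedule in HRFormer.__init__ (illustrative
# numbers, not from the original docs): with the example config in the class
# docstring and a hypothetical drop_path_rate=0.2, depths = [2*1, 2*4, 2*2]
# = [2, 8, 4], so 14 rates are drawn from torch.linspace(0, 0.2, 14) and split
# 2/8/4 between stage2, stage3 and stage4; each HRFomerModule then passes its
# slice to the HRFormerBlocks it builds.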