@@ -259,7 +259,7 @@ class CustomAttentionFFN(nn.Module):
             nn.Linear(dim, dim * 4),
             nn.GELU(),
             nn.Linear(dim * 4, dim),
-            nn.DropPath(proj_drop)
+            nn.Dropout(proj_drop)
         )
         self.norm1 = nn.LayerNorm(dim)
         self.norm2 = nn.LayerNorm(dim)
@@ -330,8 +330,8 @@ class ViTSam(BaseBackbone):
                 param.requires_grad = False

         # Cross-attention
-        # self.cross_attn = Cross_Attention(embed_dim, num_heads=num_heads, qkv_bias=qkv_bias, \
-        #     qk_scale=qk_scale, attn_drop=attn_drop_rate, proj_drop=drop_rate)
+        self.cross_attn = Cross_Attention(embed_dim, num_heads=num_heads, qkv_bias=qkv_bias, \
+            qk_scale=qk_scale, attn_drop=attn_drop_rate, proj_drop=drop_rate)

         # vit_token goes through self-attention, then cross-attention with sam_token; the result then passes through the FFN
         # self.custom_attn_ffn = CustomAttentionFFN(embed_dim, num_heads=num_heads, qkv_bias=qkv_bias, \
@@ -342,7 +342,7 @@ class ViTSam(BaseBackbone):
             nn.Linear(embed_dim, embed_dim * 4),
             nn.GELU(),
             nn.Linear(embed_dim * 4, embed_dim),
-            nn.DropPath(drop_rate)
+            nn.Dropout(drop_rate)
         )
         self.sam_norm = norm_layer(embed_dim)

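Note on the two DropPath -> Dropout swaps above: torch.nn does not provide a DropPath module (stochastic depth ships with timm and is meant to scale a whole residual branch), so nn.DropPath(...) inside these Sequential FFNs would fail when the module is constructed; nn.Dropout drops individual activations and is the drop-in fix. For orientation, here is a minimal sketch of the data flow described by the comment in the second hunk, with nn.MultiheadAttention standing in for the repository's Cross_Attention / attention modules; the class name, signatures, and shapes below are assumptions, not the codebase's exact code.

# Minimal sketch of the assumed CustomAttentionFFN data flow (stand-in modules:
# nn.MultiheadAttention replaces the repository's attention / Cross_Attention classes).
import torch
import torch.nn as nn

class CustomAttentionFFNSketch(nn.Module):
    def __init__(self, dim, num_heads=8, proj_drop=0.0):
        super().__init__()
        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)
        self.self_attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)
        self.cross_attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)
        self.ffn = nn.Sequential(
            nn.Linear(dim, dim * 4),
            nn.GELU(),
            nn.Linear(dim * 4, dim),
            nn.Dropout(proj_drop),  # element-wise dropout; DropPath would instead zero whole residual branches
        )

    def forward(self, vit_tokens, sam_tokens):
        # 1) self-attention over the ViT tokens
        x = vit_tokens + self.self_attn(vit_tokens, vit_tokens, vit_tokens)[0]
        # 2) cross-attention: ViT tokens query the SAM token features
        x = x + self.cross_attn(self.norm1(x), sam_tokens, sam_tokens)[0]
        # 3) feed-forward block with plain dropout
        return x + self.ffn(self.norm2(x))

# quick shape check (assumed token counts / embedding width)
vit = torch.randn(2, 196, 768)
sam = torch.randn(2, 196, 768)
out = CustomAttentionFFNSketch(768)(vit, sam)  # -> (2, 196, 768)

If stochastic depth is still wanted later, the usual pattern is to keep Dropout inside the MLP and apply timm's DropPath to the whole residual branch, e.g. x = x + drop_path(ffn(norm(x))).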