Sequencer: LSTM在CV领域重生!
CV全新范式!LSTM在CV领域杀出一条血路!Sequencer:超越Swin、ConvNeXt等网络
论文题目:Sequencer: Deep LSTM for Image Classification
本文基于Paddleclas进行复现。论文地址:Sequencer
不得不说,现在的CV太卷了,连LSTM都进军CV了,那咱们就先简单介绍一下LSTM吧
一、LSTM原理介绍
LSTM的整体结构如下图所示,接下来我们对其中关键部分进行讲述
1、首先是LSTM中的顶部直线,即穿过图表上方的水平直线, 直接在整个链上运行,只有一些小的线性相互作用, 让信息在流动的同时保持不变性。
2、阀门。LSTM有能力向细胞状态中添加或移除信息, 这通过精细调整叫阀门的结构来实现.
阀门可以让信息有选择地通过. 它们由一个S形网络层和一个逐点乘法操作组成.
S形网络层输出[0,1]的一个数, 描述应该让每个组件通过多少信息. 0就是什么也不让通过, 1就是每个信息都可以通过.
一个LSTM含有三个这样的阀门, 来保护和控制每个单元状态。
忘记阀门层。用于决定从细胞状态中扔掉哪些信息, 这由一个叫做忘记阀门层的S形网络层实现。
输入阀门层。在丢掉一些信息之后, 下一步要决定把哪些信息存储在细胞状态中.由两部分组成。输入阀门层决定要更新的值以及创造新的候选值向量。
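计算单元状态。将旧的单元状态 $C_{t-1}$ 更新为新的状态 $C_t$:$C_t = f_t \odot C_{t-1} + i_t \odot \tilde{C}_t$,其中 $f_t$ 为忘记阀门的输出,$i_t$ 为输入阀门的输出,$\tilde{C}_t$ 为新的候选值向量。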
输出阀门层。首先, 网络会运行一个S形网络层, 用于决定单元状态的哪些部分会被输出. 然后将单元状态作tanh运算, 得到[-1,1]的值, 再与S形网络层的输出逐点相乘, 得到最终的输出。
二、Sequencer介绍
简单介绍完LSTM,接下来介绍这篇文章核心创新点
1. 探索一种新的、能够建模远程依赖的模块;
2. 论述自注意力并不是Transformer取得成功的关键;
3. 探索LSTM在CV领域应用的可能性;
介绍完核心创新点,再来看一下方法的核心结构,如下图所示
其中,最终方法应用的是Sequencer2D block,从图中可以看出,Sequencer2D block的核心就是BiLSTM2D
BiLSTM2D的做法其实很简单,类似MLP-Mixer中的Token-Mixing MLP,就是分别沿Token网格的高和宽两个方向运用BiLSTM,然后将处理后的特征沿通道方向进行拼接
是不是有点懵,来看看代码
三、代码复现
class BiLSTM2D(nn.Layer):
    """Bidirectional 2D LSTM token mixer (the core of the Sequencer block).

    One bidirectional LSTM scans the token grid along the height axis and a
    second one scans it along the width axis. Both outputs are concatenated
    along the channel axis and projected back to ``C`` channels.

    Args:
        C: Number of channels of the input (and output) tokens.
        D: Hidden size of each LSTM direction; every BiLSTM therefore emits
            ``2 * D`` features per token.
        name: Accepted for interface compatibility; not used by this layer.
    """

    def __init__(self, C, D, name=None):
        super(BiLSTM2D, self).__init__()
        # LSTM applied along the height (vertical) direction.
        self.rnn_v = nn.LSTM(C, D, num_layers=1, direction='bidirect',
                             bias_ih_attr=True,
                             bias_hh_attr=True)
        # LSTM applied along the width (horizontal) direction.
        self.rnn_h = nn.LSTM(C, D, num_layers=1, direction='bidirect',
                             bias_ih_attr=True,
                             bias_hh_attr=True)
        # Vertical and horizontal outputs are 2*D wide each -> 4*D after concat.
        self.fc = nn.Linear(4 * D, C)

    def forward(self, x, H, W):
        """Mix tokens along both spatial axes.

        Args:
            x: Token tensor that is unpacked as ``(B, N, C)`` and reshaped to
                ``(B, H, W, C)``, so ``N`` must equal ``H * W``.
            H: Height of the token grid.
            W: Width of the token grid.

        Returns:
            Tensor of shape ``(B, H * W, C)`` (channel-last).
        """
        B, _, C = x.shape
        # Keep the grid channel-last. The reshapes below rely on the layout
        # (B, H, W, C); moving channels to axis 1 first would make the LSTM
        # sequences mix channel and spatial positions.
        x = x.reshape((B, H, W, C))

        # Vertical pass: each of the B*W columns is a sequence of length H.
        v, _ = self.rnn_v(x.transpose((0, 2, 1, 3)).reshape((-1, H, C)))
        v = v.reshape((B, W, H, -1)).transpose((0, 2, 1, 3))  # (B, H, W, 2D)

        # Horizontal pass: each of the B*H rows is a sequence of length W.
        h, _ = self.rnn_h(x.reshape((-1, W, C)))
        h = h.reshape((B, H, W, -1))  # (B, H, W, 2D)

        # Fuse both directions along the channel axis and project back to C.
        x = paddle.concat([v, h], axis=-1)
        x = self.fc(x)

        # Flatten the grid back into a token sequence.
        return x.reshape((B, H * W, C))
代码位置:其余的部分和ViT类似,本项目复现的Sequencer2D-S模型,代码放在PaddleClas/ppcls/arch/backbone/modelzoo/Sequencer.py路径下,大家可以去查阅,下图展示Sequencer2D结构
需要注意的是,在stage3和stage4这两个阶段,模型是没有进行下采样的。在stage1,模型将图片下采样7倍,在stage2,模型将输入特征下采样2倍。代码如下。
# Patch embedding: splits the input into patches.
# Stages listed in self.path_used get a PatchEmbed (patch size 7 for stage 0,
# 2 for later stages); every other stage gets a 1x1 Conv2D with stride 1,
# which leaves the spatial resolution unchanged.
patch_embed = PatchEmbed(
img_size=img_size if i == 0 else img_size // ps[i-1],
patch_size=7 if i == 0 else 2,
in_chans=in_chans if i == 0 else embed_dims[i - 1],
embed_dim=embed_dims[i]) if (i in self.path_used) else nn.Conv2D(embed_dims[i - 1],embed_dims[i - 1],1,1,0)
Sequencer block的代码如下所示。
class Block(nn.Layer):
    """Sequencer block: pre-norm BiLSTM2D mixing followed by a pre-norm MLP.

    Both sub-layers are wrapped in a residual connection with optional
    stochastic depth.

    Args:
        dim: Channel width of the tokens.
        mlp_ratio: Hidden width of the MLP relative to ``dim``.
        drop: Dropout probability used inside the MLP.
        attn_drop: Accepted for interface compatibility; not used here.
        drop_path: Stochastic-depth probability; ``Identity`` is used when it
            is not greater than zero.
        act_layer: Activation layer class used by the MLP.
        norm_layer: Either a string that evaluates to a layer class (built
            with ``epsilon``) or a callable taking ``dim``.
        epsilon: Epsilon forwarded to the norm layer when ``norm_layer`` is a
            string.
        name: Forwarded to ``BiLSTM2D``.

    Raises:
        TypeError: If ``norm_layer`` is neither a string nor a callable.
    """

    def __init__(self,
                 dim,
                 mlp_ratio=4.,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 act_layer=nn.GELU,
                 norm_layer='nn.LayerNorm',
                 epsilon=1e-5,
                 name=None):
        super().__init__()

        # Work out once how a normalisation layer has to be constructed.
        if isinstance(norm_layer, str):
            # NOTE(review): eval() executes the string; only pass trusted
            # values for ``norm_layer``.
            norm_factory = eval(norm_layer)
            norm_kwargs = {"epsilon": epsilon}
        elif isinstance(norm_layer, Callable):
            norm_factory = norm_layer
            norm_kwargs = {}
        else:
            raise TypeError(
                "The norm_layer must be str or paddle.nn.layer.Layer class")

        self.norm1 = norm_factory(dim, **norm_kwargs)

        # The "attention" slot is where Sequencer plugs in its BiLSTM2D mixer.
        self.attn = BiLSTM2D(dim, dim, name=name)

        # Stochastic depth on the residual branches.
        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = Identity()

        self.norm2 = norm_factory(dim, **norm_kwargs)

        self.mlp = Mlp(in_features=dim,
                       hidden_features=int(dim * mlp_ratio),
                       act_layer=act_layer,
                       drop=drop)

    def forward(self, x, H, W):
        """Apply the token-mixing and channel-mixing residual branches.

        Args:
            x: Token tensor passed through ``norm1`` and then ``BiLSTM2D``.
            H: Height of the token grid, forwarded to ``BiLSTM2D``.
            W: Width of the token grid, forwarded to ``BiLSTM2D``.

        Returns:
            The input with both residual updates added.
        """
        mixed = self.attn(self.norm1(x), H, W)
        x = x + self.drop_path(mixed)
        projected = self.mlp(self.norm2(x))
        x = x + self.drop_path(projected)
        return x
定义一些其他必要的模块,这里大部分与ViT中的实现类似
from collections.abc import Callable
from functools import partial
import numpy as np
import paddle
import paddle.nn as nn
from paddle.nn.initializer import TruncatedNormal, Constant, Normal
from paddle import ParamAttr
# Public API of this module: only the model factory is exported.
__all__ = ["Sequence_2d"]
# Shared parameter initializers.
# Truncated-normal initializer with std 0.02.
trunc_normal_ = TruncatedNormal(std=.02)
# Alias of the Normal initializer class itself (not an instance).
normal_ = Normal
# Constant initializers filling a parameter with 0 and 1 respectively.
zeros_ = Constant(value=0.)
ones_ = Constant(value=1.)
def to_2tuple(x):
    """Return a 2-tuple holding ``x`` twice (the same object, not a copy)."""
    return (x, x)
def drop_path(x, drop_prob=0., training=False):
    """Apply stochastic depth (drop path) per sample.

    Intended for the main path of residual blocks. The name 'Drop Connect'
    used elsewhere is misleading because that is a different form of dropout
    from a separate paper.
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ...

    Args:
        x: Input tensor; the first axis is treated as the sample axis.
        drop_prob: Probability of dropping a sample's path.
        training: Whether the caller is in training mode.

    Returns:
        ``x`` unchanged when ``drop_prob`` is 0 or ``training`` is false;
        otherwise ``x`` divided by the keep probability and multiplied by a
        per-sample 0/1 mask.
    """
    if drop_prob == 0. or not training:
        return x
    keep = paddle.to_tensor(1 - drop_prob)
    # One mask value per sample, broadcast over all remaining axes.
    mask_shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1)
    # floor(keep + U[0, 1)) is 1 with probability ``keep`` and 0 otherwise.
    mask = paddle.floor(keep + paddle.rand(mask_shape, dtype=x.dtype))
    return x.divide(keep) * mask
class DropPath(nn.Layer):
    """Layer wrapper around ``drop_path`` (stochastic depth per sample).

    Args:
        drop_prob: Drop probability handed to ``drop_path`` on every call.
    """

    def __init__(self, drop_prob=None):
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        """Apply ``drop_path`` using this layer's current training flag."""
        return drop_path(x, drop_prob=self.drop_prob, training=self.training)
class Identity(nn.Layer):
    """No-op layer that returns its input unchanged."""

    def __init__(self):
        super().__init__()

    def forward(self, input):
        """Return ``input`` as is."""
        return input
class Mlp(nn.Layer):
    """Two-layer feed-forward network: Linear -> act -> drop -> Linear -> drop.

    Args:
        in_features: Width of the input features.
        hidden_features: Width of the hidden layer; falls back to
            ``in_features`` when falsy.
        out_features: Width of the output; falls back to ``in_features`` when
            falsy.
        act_layer: Activation layer class, instantiated without arguments.
        drop: Dropout probability; the same dropout layer is applied after
            the activation and after the second linear layer.
    """

    def __init__(self,
                 in_features,
                 hidden_features=None,
                 out_features=None,
                 act_layer=nn.GELU,
                 drop=0.):
        super().__init__()
        hidden_width = hidden_features if hidden_features else in_features
        output_width = out_features if out_features else in_features
        self.fc1 = nn.Linear(in_features, hidden_width)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_width, output_width)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        """Run the feed-forward network on ``x``."""
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))
class PatchEmbed(nn.Layer):
    """Image to patch embedding via a strided convolution.

    Args:
        img_size: Expected input size; a scalar is used for both height and
            width.
        patch_size: Patch size; used as both kernel size and stride of the
            projection convolution.
        in_chans: Number of input channels.
        embed_dim: Number of output channels of the projection.
    """

    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
        super().__init__()
        self.img_size = to_2tuple(img_size)
        self.patch_size = to_2tuple(patch_size)
        grid_h = self.img_size[0] // self.patch_size[0]
        grid_w = self.img_size[1] // self.patch_size[1]
        self.num_patches = grid_w * grid_h
        self.proj = nn.Conv2D(in_chans,
                              embed_dim,
                              kernel_size=self.patch_size,
                              stride=self.patch_size)

    def forward(self, x):
        """Project an image batch to a token sequence.

        Args:
            x: 4-D tensor unpacked as ``(B, C, H, W)``; ``H`` and ``W`` must
                match the configured ``img_size``.

        Returns:
            A tuple ``(tokens, H, W)`` where ``tokens`` has the spatial axes of
            the projected map flattened and moved before the channel axis, and
            ``H``/``W`` are the height and width of the projected map.
        """
        _, _, in_h, in_w = x.shape
        exp_h, exp_w = self.img_size[0], self.img_size[1]
        assert in_h == exp_h and in_w == exp_w, \
            f"Input image size ({in_h}*{in_w}) doesn't match model ({exp_h}*{exp_w})."
        feat = self.proj(x)
        _, _, out_h, out_w = feat.shape
        tokens = feat.flatten(2).transpose((0, 2, 1))
        return tokens, out_h, out_w
最后组成整个网络,并整合成Paddleclas代码格式。
class Sequencer(nn.Layer):
    """Multi-stage Sequencer image classifier built from BiLSTM2D blocks.

    Every stage consists of an embedding layer, a list of ``Block`` layers and
    a norm layer. Stages listed in ``patch_used`` downsample with a
    ``PatchEmbed`` (patch size 7 for stage 0, 2 for later stages); all other
    stages use a 1x1 convolution that keeps the spatial resolution.

    Args:
        img_size: Input image size (an integer, used for height and width).
        patch_size: Accepted for interface compatibility; not used (the
            per-stage patch sizes are fixed to 7 and 2).
        in_chans: Number of input image channels.
        class_num: Number of classes; ``Identity`` replaces the head when it
            is not greater than zero.
        embed_dims: Channel width of each stage.
        num_heads: Accepted for interface compatibility; not used.
        mlp_ratios: MLP expansion ratio of each stage.
        patch_used: Indices of the stages that downsample with ``PatchEmbed``.
        drop_rate: Dropout probability inside the MLPs.
        attn_drop_rate: Forwarded to ``Block`` as ``attn_drop``.
        drop_path_rate: Maximum stochastic-depth rate; rates increase linearly
            from 0 over all blocks.
        norm_layer: Norm layer factory taking the channel width.
        depths: Number of blocks in each stage.
        num_stages: Number of stages.
        linear: Accepted for interface compatibility; not used.
    """

    def __init__(self,
                 img_size=224,
                 patch_size=16,
                 in_chans=3,
                 class_num=1000,
                 embed_dims=[64, 128, 128, 128],
                 num_heads=[1, 2, 4, 8],
                 mlp_ratios=[4, 4, 4, 4],
                 patch_used=[0, 1],
                 drop_rate=0.,
                 attn_drop_rate=0.,
                 drop_path_rate=0.,
                 norm_layer=nn.LayerNorm,
                 depths=[3, 4, 6, 3],
                 num_stages=4,
                 linear=False):
        super().__init__()
        self.class_num = class_num
        # Store copies so the shared default lists are never aliased.
        self.depths = list(depths)
        self.num_stages = num_stages
        self.path_used = list(patch_used)

        # Stochastic depth decay rule: one rate per block, linearly increasing.
        dpr = [x for x in paddle.linspace(0, drop_path_rate, sum(depths))]
        cur = 0

        # Spatial size seen by the next PatchEmbed; updated after every
        # downsampling stage instead of relying on a hard-coded table.
        feat_size = img_size
        for i in range(num_stages):
            if i in self.path_used:
                stage_patch = 7 if i == 0 else 2
                patch_embed = PatchEmbed(
                    img_size=feat_size,
                    patch_size=stage_patch,
                    in_chans=in_chans if i == 0 else embed_dims[i - 1],
                    embed_dim=embed_dims[i])
                feat_size = feat_size // stage_patch
            else:
                # No downsampling: a 1x1 convolution that maps the previous
                # stage's width to this stage's width, so the blocks below
                # always receive embed_dims[i] channels.
                patch_embed = nn.Conv2D(embed_dims[i - 1], embed_dims[i],
                                        1, 1, 0)

            block = nn.LayerList([
                Block(
                    dim=embed_dims[i],
                    mlp_ratio=mlp_ratios[i],
                    drop=drop_rate,
                    attn_drop=attn_drop_rate,
                    drop_path=dpr[cur + j],
                    norm_layer=norm_layer,
                    name=str(i) + "_" + str(j)) for j in range(depths[i])
            ])
            norm = norm_layer(embed_dims[i])
            cur += depths[i]

            setattr(self, f"patch_embed{i + 1}", patch_embed)
            setattr(self, f"block{i + 1}", block)
            setattr(self, f"norm{i + 1}", norm)

        # Classification head on top of the last stage's features.
        self.head = nn.Linear(embed_dims[num_stages - 1],
                              class_num) if class_num > 0 else Identity()

        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialise Linear and LayerNorm sub-layers."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight)
            if m.bias is not None:
                zeros_(m.bias)
        elif isinstance(m, nn.LayerNorm):
            zeros_(m.bias)
            ones_(m.weight)

    def forward_features(self, x):
        """Run all stages and average the final tokens.

        Args:
            x: Input image batch; axis 0 is the batch axis.

        Returns:
            The last stage's tokens averaged over the token axis.
        """
        B = x.shape[0]
        for i in range(self.num_stages):
            patch_embed = getattr(self, f"patch_embed{i + 1}")
            block = getattr(self, f"block{i + 1}")
            norm = getattr(self, f"norm{i + 1}")

            # Dispatch on the layer type rather than catching exceptions:
            # unpacking a Conv2D output into three names silently "works"
            # for a batch of size 3, and a bare except hides real errors.
            if isinstance(patch_embed, PatchEmbed):
                x, H, W = patch_embed(x)
            else:
                x = patch_embed(x)
                H, W = x.shape[2], x.shape[3]
                x = x.flatten(2).transpose((0, 2, 1))

            for blk in block:
                x = blk(x, H, W)
            x = norm(x)
            if i != self.num_stages - 1:
                # Back to a channel-first feature map for the next stage.
                x = x.reshape([B, H, W, x.shape[2]]).transpose([0, 3, 1, 2])
        return x.mean(axis=1)

    def forward(self, x):
        """Return the classification head's output for the image batch ``x``."""
        x = self.forward_features(x)
        x = self.head(x)
        return x
def Sequence_2d(pretrained=False, use_ssld=False, **kwargs):
    """Build the Sequencer model configuration used by this project.

    Args:
        pretrained: Accepted for interface compatibility; no weights are
            loaded by this function.
        use_ssld: Accepted for interface compatibility; not used.
        **kwargs: Extra ``Sequencer`` constructor options (for example
            ``class_num``). They override the preset values below instead of
            being silently discarded.

    Returns:
        A ``Sequencer`` instance.
    """
    model_cfg = dict(
        patch_size=7,
        embed_dims=[64, 128, 128, 128],
        mlp_ratios=[3, 3, 3, 3],
        norm_layer=partial(nn.LayerNorm, epsilon=1e-6),
        depths=[4, 3, 8, 3],
    )
    # Caller-supplied options take precedence over the preset.
    model_cfg.update(kwargs)
    model = Sequencer(**model_cfg)
    return model
下面就是Paddleclas训练代码,数据集使用的是imagenet-mini,由于官方并未公开Sequencer权重,故这里训练过程暂时不展示。
%cd PaddleClas/
/home/aistudio/PaddleClas
!python -m paddle.distributed.launch --gpus 0 tools/train.py -c ./ppcls/configs/Sequencer.yaml -o Arch.pretrained=False -o Global.device=gpu
下面贴一下论文里面在ImageNet上分类的准确率。
四、总结
个人一些感受
1、现在模型结构的创新更多是对注意力机制进行改进或替换,总体架构仍然沿用ViT的结构,这表明模型的整体架构或许更为重要。
2、Sequencer模型将LSTM引入到CV中,性能能达到甚至超越CNN和Transformer模型。
3、Sequencer模型计算量较大、推理速度较慢,未来可以研究更加高效的LSTM的CV模型。
此文章为搬运
原项目链接
更多推荐
所有评论(0)