# Implementation of SlowFast networks in PyTorch. Paper: https://arxiv.org/pdf/1812.03982.pdf
"""SlowFast network model for PyTorch.
# Reference:
- [SlowFast Networks for Video Recognition](https://arxiv.org/abs/1812.03982)
Adapted code from:
@inproceedings{hara3dcnns,
author={Kensho Hara and Hirokatsu Kataoka and Yutaka Satoh},
title={Can Spatiotemporal 3D CNNs Retrace the History of 2D CNNs and ImageNet?},
booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
pages={6546--6555},
year={2018},
}.
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from functools import partial
# Names exported by a star-import of this module: only the four model factory
# functions. The SlowFast class, Bottleneck and the helper functions are not
# listed and must be imported by name.
__all__ = ['resnet50', 'resnet101','resnet152', 'resnet200']
def conv3x3x3(in_planes, out_planes, stride=1):
    """Build a bias-free 3x3x3 ``nn.Conv3d`` with padding 1.

    Args:
        in_planes: Number of input channels.
        out_planes: Number of output channels.
        stride: Convolution stride, forwarded unchanged to ``nn.Conv3d``.

    Returns:
        The configured ``nn.Conv3d`` module.
    """
    conv_options = dict(kernel_size=3, stride=stride, padding=1, bias=False)
    return nn.Conv3d(in_planes, out_planes, **conv_options)
def downsample_basic_block(x, planes, stride):
    """Parameter-free shortcut: subsample ``x`` and zero-pad its channels.

    The input is subsampled with a kernel-size-1 average pool (i.e. plain
    strided sampling) and then extended along dim 1 with zeros until it has
    ``planes`` channels.

    Args:
        x: 5-D tensor; dim 1 is the channel dimension that gets padded.
        planes: Number of channels of the returned tensor. Must be at least
            the channel count of ``x``.
        stride: Stride forwarded to ``F.avg_pool3d``.

    Returns:
        A tensor with ``planes`` channels on the same device and with the
        same dtype as ``x``. It stays attached to the autograd graph of
        ``x`` so gradients flow through the shortcut.
    """
    out = F.avg_pool3d(x, kernel_size=1, stride=stride)
    # new_zeros inherits dtype and device from ``out``, so the padding matches
    # for CPU/GPU and float/half/double inputs alike.
    zero_pads = out.new_zeros(
        out.size(0), planes - out.size(1), out.size(2), out.size(3),
        out.size(4))
    # Concatenate the live tensor (not ``out.data``) so the shortcut is not
    # cut out of the autograd graph.
    return torch.cat([out, zero_pads], dim=1)
class Bottleneck(nn.Module):
    """Residual bottleneck block made of three 3-D convolutions.

    Main path: ``conv1 -> bn1 -> relu -> conv2 -> bn2 -> relu -> conv3 -> bn3``.
    The result is added to the shortcut and passed through a final ReLU.
    ``conv2`` is a 1x3x3 convolution carrying the block's stride, and
    ``conv3`` is a 1x1x1 convolution that expands to ``planes * 4`` channels.
    """

    # Ratio between the block's output channels and ``planes``.
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None, head_conv=1):
        """Create the block's layers.

        Args:
            inplanes: Number of input channels.
            planes: Channel width of the two inner convolutions.
            stride: Stride of ``conv2``.
            downsample: Optional callable applied to the block input to
                produce the shortcut; ``None`` uses the input itself.
            head_conv: Selects ``conv1``: ``1`` gives a 1x1x1 kernel, ``3``
                gives a 3x1x1 kernel with padding ``(1, 0, 0)``.

        Raises:
            ValueError: If ``head_conv`` is neither 1 nor 3.
        """
        super(Bottleneck, self).__init__()

        # Resolve the head convolution geometry first so an unsupported
        # value fails before any layer is created.
        if head_conv == 1:
            head_kernel, head_padding = 1, 0
        elif head_conv == 3:
            head_kernel, head_padding = (3, 1, 1), (1, 0, 0)
        else:
            raise ValueError("Unsupported head_conv!")

        expanded_planes = planes * 4

        self.conv1 = nn.Conv3d(
            inplanes, planes, kernel_size=head_kernel, bias=False,
            padding=head_padding)
        self.bn1 = nn.BatchNorm3d(planes)
        self.conv2 = nn.Conv3d(
            planes, planes, kernel_size=(1, 3, 3), stride=stride,
            padding=(0, 1, 1), bias=False)
        self.bn2 = nn.BatchNorm3d(planes)
        self.conv3 = nn.Conv3d(planes, expanded_planes, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm3d(expanded_planes)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Run the main path, add the shortcut, and apply the final ReLU."""
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        shortcut = x if self.downsample is None else self.downsample(x)
        out += shortcut
        return self.relu(out)
def get_fine_tuning_parameters(model, ft_begin_index):
    """Build optimizer parameter groups that freeze the early stages.

    Parameters whose name contains one of the fine-tuned module names keep
    the optimizer's default learning rate; every other parameter is put in
    a group with ``lr`` set to 0.0.

    The fine-tuned names are the residual stages numbered
    ``ft_begin_index`` .. 4 plus the classifier ``fc``. A stage is matched
    as ``layer{i}`` as well as ``fast_res{i}`` / ``slow_res{i}``, the
    attribute names the SlowFast model in this file uses for its stages;
    matching only ``layer{i}`` would leave every SlowFast stage frozen.

    Args:
        model: Object exposing ``parameters()`` and ``named_parameters()``.
        ft_begin_index: Index of the first stage to fine-tune. ``0`` means
            fine-tune everything.

    Returns:
        ``model.parameters()`` when ``ft_begin_index`` is 0, otherwise a
        list of parameter-group dicts, one per parameter, in
        ``named_parameters()`` order.
    """
    if ft_begin_index == 0:
        return model.parameters()

    stage_prefixes = ('layer', 'fast_res', 'slow_res')
    ft_module_names = [
        '{}{}'.format(prefix, i)
        for i in range(ft_begin_index, 5)
        for prefix in stage_prefixes
    ]
    ft_module_names.append('fc')

    parameters = []
    for name, param in model.named_parameters():
        # Substring match, so nested parameter names such as
        # "slow_res4.0.conv1.weight" are recognised.
        if any(ft_module in name for ft_module in ft_module_names):
            parameters.append({'params': param})
        else:
            parameters.append({'params': param, 'lr': 0.0})
    return parameters
class SlowFast(nn.Module):
    """Two-pathway video classifier with a slow and a fast ResNet-style trunk.

    Both pathways receive the same clip, subsampled along dim 2 with a
    different step (``slow_stride`` / ``fast_stride``). Each pathway is a
    stem (conv, batch norm, ReLU, max pool) followed by four residual
    stages and a global average pool. The pooled feature vectors are
    concatenated (slow first), passed through dropout and classified by a
    bias-free linear layer. The pathways interact only through that final
    concatenation.
    """

    # Class-level defaults so instances created before these attributes
    # existed (e.g. unpickled whole-model checkpoints) keep working.
    slow_stride = 16
    fast_stride = 2

    def __init__(self, block=Bottleneck, layers=(3, 4, 6, 3), class_num=10,
                 shortcut_type='B', dropout=0.5, slow_stride=16, fast_stride=2):
        """Create both pathways and the classifier head.

        Args:
            block: Residual block class. It must expose ``expansion`` and
                accept ``(inplanes, planes, stride, downsample, head_conv=)``.
            layers: Number of blocks in each of the four stages.
            class_num: Number of output classes.
            shortcut_type: ``'A'`` uses the parameter-free
                ``downsample_basic_block`` shortcut; any other value uses a
                1x1x1 convolution followed by batch norm.
            dropout: Dropout probability applied before the classifier.
            slow_stride: Step used to subsample dim 2 of the input for the
                slow pathway.
            fast_stride: Step used to subsample dim 2 of the input for the
                fast pathway.
        """
        super(SlowFast, self).__init__()
        self.slow_stride = slow_stride
        self.fast_stride = fast_stride

        # Fast pathway: narrow channels, temporal kernels (5x7x7 stem and
        # head_conv=3 in every stage).
        self.fast_inplanes = 8
        self.fast_conv1 = nn.Conv3d(3, 8, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=(2, 3, 3), bias=False)
        self.fast_bn1 = nn.BatchNorm3d(8)
        self.fast_relu = nn.ReLU(inplace=True)
        self.fast_maxpool = nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1))
        self.fast_res1 = self._make_layer_fast(block, 8, layers[0], shortcut_type, head_conv=3)
        self.fast_res2 = self._make_layer_fast(
            block, 16, layers[1], shortcut_type, stride=2, head_conv=3)
        self.fast_res3 = self._make_layer_fast(
            block, 32, layers[2], shortcut_type, stride=2, head_conv=3)
        self.fast_res4 = self._make_layer_fast(
            block, 64, layers[3], shortcut_type, stride=2, head_conv=3)

        # Slow pathway: wide channels, purely spatial kernels (1x7x7 stem
        # and head_conv=1 in every stage).
        self.slow_inplanes = 64
        self.slow_conv1 = nn.Conv3d(3, 64, kernel_size=(1, 7, 7), stride=(1, 2, 2), padding=(0, 3, 3), bias=False)
        self.slow_bn1 = nn.BatchNorm3d(64)
        self.slow_relu = nn.ReLU(inplace=True)
        self.slow_maxpool = nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1))
        self.slow_res1 = self._make_layer_slow(block, 64, layers[0], shortcut_type, head_conv=1)
        self.slow_res2 = self._make_layer_slow(
            block, 128, layers[1], shortcut_type, stride=2, head_conv=1)
        self.slow_res3 = self._make_layer_slow(
            block, 256, layers[2], shortcut_type, stride=2, head_conv=1)
        self.slow_res4 = self._make_layer_slow(
            block, 512, layers[3], shortcut_type, stride=2, head_conv=1)

        self.dp = nn.Dropout(dropout)
        # After the stages, *_inplanes hold each pathway's final channel
        # count, so their sum is the width of the concatenated feature.
        self.fc = nn.Linear(self.fast_inplanes+self.slow_inplanes, class_num, bias=False)

    def forward(self, input):
        """Classify a clip.

        Args:
            input: 5-D tensor with 3 channels at dim 1; dim 2 is the axis
                that is subsampled for the two pathways.

        Returns:
            Tensor of shape ``(batch, class_num)`` with the classifier
            outputs.
        """
        slow = self.SlowPath(input[:, :, ::self.slow_stride, :, :])
        fast = self.FastPath(input[:, :, ::self.fast_stride, :, :])
        x = torch.cat([slow, fast], dim=1)
        x = self.dp(x)
        return self.fc(x)

    def SlowPath(self, input):
        """Run the slow pathway and return its pooled ``(batch, C)`` feature."""
        x = self.slow_conv1(input)
        x = self.slow_bn1(x)
        x = self.slow_relu(x)
        x = self.slow_maxpool(x)
        x = self.slow_res1(x)
        x = self.slow_res2(x)
        x = self.slow_res3(x)
        x = self.slow_res4(x)
        return self._pool_and_flatten(x)

    def FastPath(self, input):
        """Run the fast pathway and return its pooled ``(batch, C)`` feature."""
        x = self.fast_conv1(input)
        x = self.fast_bn1(x)
        x = self.fast_relu(x)
        x = self.fast_maxpool(x)
        x = self.fast_res1(x)
        x = self.fast_res2(x)
        x = self.fast_res3(x)
        x = self.fast_res4(x)
        return self._pool_and_flatten(x)

    @staticmethod
    def _pool_and_flatten(x):
        """Global-average-pool a 5-D feature map and flatten to ``(batch, C)``."""
        # Functional form: avoids constructing a new pooling module on
        # every forward call.
        x = F.adaptive_avg_pool3d(x, 1)
        return x.view(-1, x.size(1))

    @staticmethod
    def _make_layer(block, inplanes, planes, blocks, shortcut_type, stride, head_conv):
        """Build one residual stage; shared by both pathways.

        Returns:
            Tuple ``(stage, out_planes)``: the ``nn.Sequential`` stage and
            its output channel count.
        """
        out_planes = planes * block.expansion
        downsample = None
        # The first block needs a projected shortcut whenever it changes
        # the resolution or the channel count.
        if stride != 1 or inplanes != out_planes:
            if shortcut_type == 'A':
                downsample = partial(
                    downsample_basic_block,
                    planes=out_planes,
                    stride=stride)
            else:
                downsample = nn.Sequential(
                    nn.Conv3d(
                        inplanes,
                        out_planes,
                        kernel_size=1,
                        stride=stride,
                        bias=False), nn.BatchNorm3d(out_planes))
        stage = [block(inplanes, planes, stride, downsample, head_conv=head_conv)]
        # Remaining blocks keep resolution and width, so they use the
        # identity shortcut.
        for _ in range(1, blocks):
            stage.append(block(out_planes, planes, head_conv=head_conv))
        return nn.Sequential(*stage), out_planes

    def _make_layer_fast(self, block, planes, blocks, shortcut_type, stride=1, head_conv=1):
        """Build a fast-pathway stage and advance ``self.fast_inplanes``."""
        stage, self.fast_inplanes = self._make_layer(
            block, self.fast_inplanes, planes, blocks, shortcut_type, stride, head_conv)
        return stage

    def _make_layer_slow(self, block, planes, blocks, shortcut_type, stride=1, head_conv=1):
        """Build a slow-pathway stage and advance ``self.slow_inplanes``."""
        stage, self.slow_inplanes = self._make_layer(
            block, self.slow_inplanes, planes, blocks, shortcut_type, stride, head_conv)
        return stage
def resnet50(**kwargs):
    """Build a SlowFast model with ResNet-50 stage depths (3, 4, 6, 3).

    All keyword arguments are forwarded unchanged to ``SlowFast``.
    """
    return SlowFast(Bottleneck, [3, 4, 6, 3], **kwargs)
def resnet101(**kwargs):
    """Build a SlowFast model with ResNet-101 stage depths (3, 4, 23, 3).

    All keyword arguments are forwarded unchanged to ``SlowFast``.
    """
    return SlowFast(Bottleneck, [3, 4, 23, 3], **kwargs)
def resnet152(**kwargs):
    """Build a SlowFast model with ResNet-152 stage depths (3, 8, 36, 3).

    All keyword arguments are forwarded unchanged to ``SlowFast``.
    """
    return SlowFast(Bottleneck, [3, 8, 36, 3], **kwargs)
def resnet200(**kwargs):
    """Build a SlowFast model with ResNet-200 stage depths (3, 24, 36, 3).

    All keyword arguments are forwarded unchanged to ``SlowFast``.
    """
    return SlowFast(Bottleneck, [3, 24, 36, 3], **kwargs)
if __name__ == "__main__":
    # Smoke test: push one random clip through the deepest-but-one model
    # and print the shape of the classifier output.
    num_classes = 174
    # A plain tensor is enough; the Variable wrapper is deprecated and only
    # returns the tensor itself.
    input_tensor = torch.rand(1, 3, 64, 224, 224)
    model = resnet152(class_num=num_classes)
    # No backward pass follows, so skip recording the autograd graph.
    with torch.no_grad():
        output = model(input_tensor)
    print(output.size())