Source code for gluoncv.model_zoo.action_recognition.i3d_resnet

# pylint: disable=line-too-long,too-many-lines,missing-docstring,arguments-differ,unused-argument
# Code adapted from https://github.com/open-mmlab/mmaction.

__all__ = ['I3D_ResNetV1', 'i3d_resnet50_v1_kinetics400', 'i3d_resnet101_v1_kinetics400',
           'i3d_nl5_resnet50_v1_kinetics400', 'i3d_nl10_resnet50_v1_kinetics400',
           'i3d_nl5_resnet101_v1_kinetics400', 'i3d_nl10_resnet101_v1_kinetics400',
           'i3d_resnet50_v1_sthsthv2', 'i3d_resnet50_v1_hmdb51', 'i3d_resnet50_v1_ucf101',
           'i3d_resnet50_v1_custom']

from mxnet import nd
from mxnet import init
from mxnet.context import cpu
from mxnet.gluon.block import HybridBlock
from mxnet.gluon import nn
from mxnet.gluon.nn import BatchNorm
from ..resnetv1b import resnet50_v1b, resnet101_v1b
from .non_local import build_nonlocal_block

def conv3x3x3(in_planes, out_planes, spatial_stride=1, temporal_stride=1, dilation=1):
    """3x3x3 convolution with padding"""
    return nn.Conv3D(in_channels=in_planes,
                     channels=out_planes,
                     kernel_size=3,
                     strides=(temporal_stride, spatial_stride, spatial_stride),
                     dilation=dilation,
                     use_bias=False)


def conv1x3x3(in_planes, out_planes, spatial_stride=1, temporal_stride=1, dilation=1):
    """1x3x3 convolution with padding"""
    return nn.Conv3D(in_channels=in_planes,
                     channels=out_planes,
                     kernel_size=(1, 3, 3),
                     strides=(temporal_stride, spatial_stride, spatial_stride),
                     padding=(0, dilation, dilation),
                     dilation=dilation,
                     use_bias=False)

class BasicBlock(HybridBlock):
    """
    Basic building block for ResNet18 and ResNet34.
    Not supported for I3D at this moment.
    """
    expansion = 1

    def __init__(self,
                 inplanes,
                 planes,
                 spatial_stride=1,
                 temporal_stride=1,
                 dilation=1,
                 downsample=None,
                 if_inflate=True,
                 inflate_style=None,
                 norm_layer=BatchNorm,
                 norm_kwargs=None,
                 layer_name='',
                 **kwargs):
        super(BasicBlock, self).__init__()

        self.basicblock = nn.HybridSequential(prefix=layer_name)
        with self.basicblock.name_scope():
            if if_inflate:
                self.conv1 = conv3x3x3(inplanes, planes, spatial_stride, temporal_stride, dilation)
            else:
                self.conv1 = conv1x3x3(inplanes, planes, spatial_stride, temporal_stride, dilation)
            self.bn1 = norm_layer(**({} if norm_kwargs is None else norm_kwargs))
            self.relu = nn.Activation('relu')
            if if_inflate:
                self.conv2 = conv3x3x3(planes, planes)
            else:
                self.conv2 = conv1x3x3(planes, planes)
            self.bn2 = norm_layer(**({} if norm_kwargs is None else norm_kwargs))

            self.basicblock.add(self.conv1)
            self.basicblock.add(self.bn1)
            self.basicblock.add(self.relu)
            self.basicblock.add(self.conv2)
            self.basicblock.add(self.bn2)

            self.downsample = downsample
            self.spatial_stride = spatial_stride
            self.temporal_stride = temporal_stride
            self.dilation = dilation

    def hybrid_forward(self, F, x):
        """Hybrid forward of a ResNet basic block"""
        identity = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        if self.downsample is not None:
            identity = self.downsample(x)
        out = F.Activation(out + identity, act_type='relu')
        return out

class Bottleneck(HybridBlock):
    r"""
    Bottleneck building block for ResNet50, ResNet101 and ResNet152.

    Parameters
    ----------
    inplanes : int.
        Input channels of each block.
    planes : int.
        Output channels of each block.
    spatial_stride : int, default is 1.
        Stride in spatial dimension of convolutional layers in a block.
    temporal_stride : int, default is 1.
        Stride in temporal dimension of convolutional layers in a block.
    dilation : int, default is 1.
        Dilation of convolutional layers in a block.
    downsample : bool.
        Whether to contain a downsampling layer in the block.
    if_inflate : bool.
        Whether enable inflation of 3D convolutional layers in this block.
    inflate_style : str, default is '3x1x1'.
        How to inflate a 2D kernel, either '3x1x1' or '1x3x3'.
    if_nonlocal : bool.
        Whether to insert a non-local block after this ResNet block.
    nonlocal_cfg : dict.
        Additional `non-local` arguments, for example `nonlocal_type='gaussian'`.
    norm_layer : object
        Normalization layer used (default: :class:`mxnet.gluon.nn.BatchNorm`)
        Can be :class:`mxnet.gluon.nn.BatchNorm` or :class:`mxnet.gluon.contrib.nn.SyncBatchNorm`.
    norm_kwargs : dict
        Additional `norm_layer` arguments, for example `num_devices=4`
        for :class:`mxnet.gluon.contrib.nn.SyncBatchNorm`.
    layer_name : str, default is ''.
        Give a name to current block.
    """
    expansion = 4

    def __init__(self,
                 inplanes,
                 planes,
                 spatial_stride=1,
                 temporal_stride=1,
                 dilation=1,
                 downsample=None,
                 if_inflate=True,
                 inflate_style='3x1x1',
                 if_nonlocal=True,
                 nonlocal_cfg=None,
                 norm_layer=BatchNorm,
                 norm_kwargs=None,
                 layer_name='',
                 **kwargs):
        super(Bottleneck, self).__init__()
        assert inflate_style in ['3x1x1', '3x3x3']
        self.inplanes = inplanes
        self.planes = planes

        self.bottleneck = nn.HybridSequential(prefix=layer_name)
        with self.bottleneck.name_scope():
            self.conv1_stride = 1
            self.conv2_stride = spatial_stride
            self.conv1_stride_t = 1
            self.conv2_stride_t = temporal_stride

            if if_inflate:
                if inflate_style == '3x1x1':
                    self.conv1 = nn.Conv3D(in_channels=inplanes,
                                           channels=planes,
                                           kernel_size=(3, 1, 1),
                                           strides=(self.conv1_stride_t, self.conv1_stride, self.conv1_stride),
                                           padding=(1, 0, 0),
                                           use_bias=False)
                    self.conv2 = nn.Conv3D(in_channels=planes,
                                           channels=planes,
                                           kernel_size=(1, 3, 3),
                                           strides=(self.conv2_stride_t, self.conv2_stride, self.conv2_stride),
                                           padding=(0, dilation, dilation),
                                           dilation=(1, dilation, dilation),
                                           use_bias=False)
                else:
                    self.conv1 = nn.Conv3D(in_channels=inplanes,
                                           channels=planes,
                                           kernel_size=1,
                                           strides=(self.conv1_stride_t, self.conv1_stride, self.conv1_stride),
                                           use_bias=False)
                    self.conv2 = nn.Conv3D(in_channels=planes,
                                           channels=planes,
                                           kernel_size=3,
                                           strides=(self.conv2_stride_t, self.conv2_stride, self.conv2_stride),
                                           padding=(1, dilation, dilation),
                                           dilation=(1, dilation, dilation),
                                           use_bias=False)
            else:
                self.conv1 = nn.Conv3D(in_channels=inplanes,
                                       channels=planes,
                                       kernel_size=1,
                                       strides=(1, self.conv1_stride, self.conv1_stride),
                                       use_bias=False)
                self.conv2 = nn.Conv3D(in_channels=planes,
                                       channels=planes,
                                       kernel_size=(1, 3, 3),
                                       strides=(1, self.conv2_stride, self.conv2_stride),
                                       padding=(0, dilation, dilation),
                                       dilation=(1, dilation, dilation),
                                       use_bias=False)

            self.bn1 = norm_layer(in_channels=planes, **({} if norm_kwargs is None else norm_kwargs))
            self.bn2 = norm_layer(in_channels=planes, **({} if norm_kwargs is None else norm_kwargs))
            self.conv3 = nn.Conv3D(in_channels=planes,
                                   channels=planes * self.expansion,
                                   kernel_size=1,
                                   use_bias=False)
            self.bn3 = norm_layer(in_channels=planes * self.expansion, **({} if norm_kwargs is None else norm_kwargs))
            self.relu = nn.Activation('relu')
            self.downsample = downsample

            self.bottleneck.add(self.conv1)
            self.bottleneck.add(self.bn1)
            self.bottleneck.add(self.relu)
            self.bottleneck.add(self.conv2)
            self.bottleneck.add(self.bn2)
            self.bottleneck.add(self.relu)
            self.bottleneck.add(self.conv3)
            self.bottleneck.add(self.bn3)

            self.spatial_tride = spatial_stride
            self.temporal_tride = temporal_stride
            self.dilation = dilation

            if if_nonlocal and nonlocal_cfg is not None:
                nonlocal_cfg_ = nonlocal_cfg.copy()
                nonlocal_cfg_['in_channels'] = planes * self.expansion
                self.nonlocal_block = build_nonlocal_block(nonlocal_cfg_)
                self.bottleneck.add(self.nonlocal_block)
            else:
                self.nonlocal_block = None

    def hybrid_forward(self, F, x):
        """Hybrid forward of a ResNet bottleneck block"""
        identity = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)
        out = self.conv3(out)
        out = self.bn3(out)
        if self.downsample is not None:
            identity = self.downsample(x)
        out = F.Activation(out + identity, act_type='relu')

        if self.nonlocal_block is not None:
            out = self.nonlocal_block(out)
        return out


def make_res_layer(block,
                   inplanes,
                   planes,
                   blocks,
                   spatial_stride=1,
                   temporal_stride=1,
                   dilation=1,
                   inflate_freq=1,
                   inflate_style='3x1x1',
                   nonlocal_freq=1,
                   nonlocal_cfg=None,
                   norm_layer=BatchNorm,
                   norm_kwargs=None,
                   layer_name=''):
    """Build each stage of a ResNet"""
    inflate_freq = inflate_freq if not isinstance(inflate_freq, int) else (inflate_freq, ) * blocks
    nonlocal_freq = nonlocal_freq if not isinstance(nonlocal_freq, int) else (nonlocal_freq, ) * blocks
    assert len(inflate_freq) == blocks
    assert len(nonlocal_freq) == blocks

    downsample = None
    if spatial_stride != 1 or inplanes != planes * block.expansion:
        downsample = nn.HybridSequential(prefix=layer_name+'downsample_')
        with downsample.name_scope():
            downsample.add(nn.Conv3D(in_channels=inplanes,
                                     channels=planes * block.expansion,
                                     kernel_size=1,
                                     strides=(temporal_stride, spatial_stride, spatial_stride),
                                     use_bias=False))
            downsample.add(norm_layer(in_channels=planes * block.expansion, **({} if norm_kwargs is None else norm_kwargs)))


    layers = nn.HybridSequential(prefix=layer_name)
    cnt = 0
    with layers.name_scope():
        layers.add(block(inplanes=inplanes,
                         planes=planes,
                         spatial_stride=spatial_stride,
                         temporal_stride=temporal_stride,
                         dilation=dilation,
                         downsample=downsample,
                         if_inflate=(inflate_freq[0] == 1),
                         inflate_style=inflate_style,
                         if_nonlocal=(nonlocal_freq[0] == 1),
                         nonlocal_cfg=nonlocal_cfg,
                         layer_name='%d_' % cnt))

        cnt += 1
        inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.add(block(inplanes=inplanes,
                             planes=planes,
                             spatial_stride=1,
                             temporal_stride=1,
                             dilation=dilation,
                             if_inflate=(inflate_freq[i] == 1),
                             inflate_style=inflate_style,
                             if_nonlocal=(nonlocal_freq[i] == 1),
                             nonlocal_cfg=nonlocal_cfg,
                             layer_name='%d_' % cnt))
            cnt += 1
    return layers


[docs]class I3D_ResNetV1(HybridBlock):
    r"""ResNet_I3D backbone.
    Inflated 3D model (I3D) from
    `"Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset"
    <https://arxiv.org/abs/1705.07750>`_ paper.

    Parameters
    ----------
    nclass : int.
        Number of categories in the dataset.
    depth : int, default is 50.
        Depth of ResNet, from {18, 34, 50, 101, 152}.
    num_stages : int, default is 4.
        Number of stages in a ResNet.
    pretrained : bool or str.
        Boolean value controls whether to load the default pretrained weights for model.
        String value represents the hashtag for a certain version of pretrained weights.
    pretrained_base : bool or str, optional, default is True.
        Load pretrained base network, the extra layers are randomized. Note that
        if pretrained is `True`, this has no effect.
    feat_ext : bool.
        Whether to extract features before dense classification layer or
        do a complete forward pass.
    num_segments : int, default is 1.
        Number of segments used to evenly divide a video.
    num_crop : int, default is 1.
        Number of crops used during evaluation, choices are 1, 3 or 10.
    spatial_strides : tuple of int.
        Strides in the spatial dimension of the first block of each stage.
    temporal_strides : tuple of int.
        Strides in the temporal dimension of the first block of each stage.
    dilations : tuple of int.
        Dilation ratio of each stage.
    out_indices : tuple of int.
        Collect features from the selected stages of ResNet,
        usually used for feature extraction or auxililary loss.
    conv1_kernel_t : int, default is 5.
        The kernel size of first convolutional layer in a ResNet.
    conv1_stride_t : int, default is 2.
        The stride of first convolutional layer in a ResNet.
    pool1_kernel_t : int, default is 1.
        The kernel size of first pooling layer in a ResNet.
    pool1_stride_t : int, default is 2.
        The stride of first pooling layer in a ResNet.
    inflate_freq : tuple of int.
        Select which 2D convolutional layers to be inflated to 3D convolutional layers in each stage.
    inflate_stride : tuple of int.
        The stride for inflated layers in each stage.
    inflate_style : str, default is '3x1x1'.
        How to inflate a 2D kernel, either '3x1x1' or '1x3x3'.
    nonlocal_stages : tuple of int.
        Select which stage we need non-local blocks.
    nonlocal_freq : tuple of int.
        Select where to insert non-local blocks in each stage.
    nonlocal_cfg : dict.
        Additional `non-local` arguments, for example `nonlocal_type='gaussian'`.
    bn_eval : bool.
        Whether to set BN layers to eval mode, namely, freeze
        running stats (mean and var).
    bn_frozen : bool.
        Whether to freeze weight and bias of BN layers.
    partial_bn : bool, default False.
        Freeze all batch normalization layers during training except the first layer.
    frozen_stages : int.
        Stages to be frozen (all param fixed). -1 means not freezing any parameters.
    dropout_ratio : float, default is 0.5.
        The dropout rate of a dropout layer.
        The larger the value, the more strength to prevent overfitting.
    init_std : float, default is 0.001.
        Standard deviation value when initialize the dense layers.
    norm_layer : object
        Normalization layer used (default: :class:`mxnet.gluon.nn.BatchNorm`)
        Can be :class:`mxnet.gluon.nn.BatchNorm` or :class:`mxnet.gluon.contrib.nn.SyncBatchNorm`.
    norm_kwargs : dict
        Additional `norm_layer` arguments, for example `num_devices=4`
        for :class:`mxnet.gluon.contrib.nn.SyncBatchNorm`.
    ctx : Context, default CPU.
        The context in which to load the pretrained weights.
    """

    arch_settings = {
        18: (BasicBlock, (2, 2, 2, 2)),
        34: (BasicBlock, (3, 4, 6, 3)),
        50: (Bottleneck, (3, 4, 6, 3)),
        101: (Bottleneck, (3, 4, 23, 3)),
        152: (Bottleneck, (3, 8, 36, 3))
    }

    def __init__(self,
                 nclass,
                 depth,
                 num_stages=4,
                 pretrained=False,
                 pretrained_base=True,
                 feat_ext=False,
                 num_segments=1,
                 num_crop=1,
                 spatial_strides=(1, 2, 2, 2),
                 temporal_strides=(1, 1, 1, 1),
                 dilations=(1, 1, 1, 1),
                 out_indices=(0, 1, 2, 3),
                 conv1_kernel_t=5,
                 conv1_stride_t=2,
                 pool1_kernel_t=1,
                 pool1_stride_t=2,
                 inflate_freq=(1, 1, 1, 1),
                 inflate_stride=(1, 1, 1, 1),
                 inflate_style='3x1x1',
                 nonlocal_stages=(-1, ),
                 nonlocal_freq=(0, 1, 1, 0),
                 nonlocal_cfg=None,
                 bn_eval=True,
                 bn_frozen=False,
                 partial_bn=False,
                 frozen_stages=-1,
                 dropout_ratio=0.5,
                 init_std=0.01,
                 norm_layer=BatchNorm,
                 norm_kwargs=None,
                 ctx=None,
                 **kwargs):
        super(I3D_ResNetV1, self).__init__()

        if depth not in self.arch_settings:
            raise KeyError('invalid depth {} for resnet'.format(depth))

        self.nclass = nclass
        self.depth = depth
        self.num_stages = num_stages
        self.pretrained = pretrained
        self.pretrained_base = pretrained_base
        self.feat_ext = feat_ext
        self.num_segments = num_segments
        self.num_crop = num_crop
        self.spatial_strides = spatial_strides
        self.temporal_strides = temporal_strides
        self.dilations = dilations
        assert len(spatial_strides) == len(temporal_strides) == len(dilations) == num_stages
        self.out_indices = out_indices
        assert max(out_indices) < num_stages
        self.inflate_freqs = inflate_freq if not isinstance(inflate_freq, int) else (inflate_freq, ) * num_stages
        self.inflate_style = inflate_style
        self.nonlocal_stages = nonlocal_stages
        self.nonlocal_freqs = nonlocal_freq if not isinstance(nonlocal_freq, int) else (nonlocal_freq, ) * num_stages
        self.nonlocal_cfg = nonlocal_cfg
        self.bn_eval = bn_eval
        self.bn_frozen = bn_frozen
        self.partial_bn = partial_bn
        self.frozen_stages = frozen_stages
        self.dropout_ratio = dropout_ratio
        self.init_std = init_std

        self.block, stage_blocks = self.arch_settings[depth]
        self.stage_blocks = stage_blocks[:num_stages]
        self.inplanes = 64

        if self.bn_frozen:
            if norm_kwargs is not None:
                norm_kwargs['use_global_stats'] = True
            else:
                norm_kwargs = {}
                norm_kwargs['use_global_stats'] = True

        self.first_stage = nn.HybridSequential(prefix='')
        self.first_stage.add(nn.Conv3D(in_channels=3, channels=64, kernel_size=(conv1_kernel_t, 7, 7),
                                       strides=(conv1_stride_t, 2, 2), padding=((conv1_kernel_t - 1)//2, 3, 3), use_bias=False))
        self.first_stage.add(norm_layer(in_channels=64, **({} if norm_kwargs is None else norm_kwargs)))
        self.first_stage.add(nn.Activation('relu'))
        self.first_stage.add(nn.MaxPool3D(pool_size=(pool1_kernel_t, 3, 3), strides=(pool1_stride_t, 2, 2), padding=(pool1_kernel_t//2, 1, 1)))

        self.pool2 = nn.MaxPool3D(pool_size=(2, 1, 1), strides=(2, 1, 1), padding=(0, 0, 0))

        if self.partial_bn:
            if norm_kwargs is not None:
                norm_kwargs['use_global_stats'] = True
            else:
                norm_kwargs = {}
                norm_kwargs['use_global_stats'] = True

        self.res_layers = nn.HybridSequential(prefix='')
        for i, num_blocks in enumerate(self.stage_blocks):
            spatial_stride = spatial_strides[i]
            temporal_stride = temporal_strides[i]
            dilation = dilations[i]
            planes = 64 * 2**i
            layer_name = 'layer{}_'.format(i + 1)

            res_layer = make_res_layer(self.block,
                                       self.inplanes,
                                       planes,
                                       num_blocks,
                                       spatial_stride=spatial_stride,
                                       temporal_stride=temporal_stride,
                                       dilation=dilation,
                                       inflate_freq=self.inflate_freqs[i],
                                       inflate_style=self.inflate_style,
                                       nonlocal_freq=self.nonlocal_freqs[i],
                                       nonlocal_cfg=self.nonlocal_cfg if i in self.nonlocal_stages else None,
                                       norm_layer=norm_layer,
                                       norm_kwargs=norm_kwargs,
                                       layer_name=layer_name)
            self.inplanes = planes * self.block.expansion
            self.res_layers.add(res_layer)

        self.feat_dim = self.block.expansion * 64 * 2**(len(self.stage_blocks) - 1)

        # We use ``GlobalAvgPool3D`` here for simplicity. Otherwise the input size must be fixed.
        # You can also use ``AvgPool3D`` and specify the arguments on your own, e.g.
        # self.st_avg = nn.AvgPool3D(pool_size=(4, 7, 7), strides=1, padding=0)
        # ``AvgPool3D`` is 10% faster, but ``GlobalAvgPool3D`` makes the code cleaner.
        self.st_avg = nn.GlobalAvgPool3D()

        self.head = nn.HybridSequential(prefix='')
        self.head.add(nn.Dropout(rate=self.dropout_ratio))
        self.fc = nn.Dense(in_units=self.feat_dim, units=nclass, weight_initializer=init.Normal(sigma=self.init_std))
        self.head.add(self.fc)

        self.init_weights(ctx)

[docs]    def init_weights(self, ctx):
        """Initial I3D network with its 2D pretrained weights."""

        self.first_stage.initialize(ctx=ctx)
        self.res_layers.initialize(ctx=ctx)
        self.head.initialize(ctx=ctx)

        if self.pretrained_base and not self.pretrained:
            if self.depth == 50:
                resnet2d = resnet50_v1b(pretrained=True)
            elif self.depth == 101:
                resnet2d = resnet101_v1b(pretrained=True)
            else:
                print('No such 2D pre-trained network of depth %d.' % (self.depth))

            weights2d = resnet2d.collect_params()
            if self.nonlocal_cfg is None:
                weights3d = self.collect_params()
            else:
                train_params_list = []
                raw_params = self.collect_params()
                for raw_name in raw_params.keys():
                    if 'nonlocal' in raw_name:
                        continue
                    train_params_list.append(raw_name)
                init_patterns = '|'.join(train_params_list)
                weights3d = self.collect_params(init_patterns)
            assert len(weights2d.keys()) == len(weights3d.keys()), 'Number of parameters should be same.'

            dict2d = {}
            for key_id, key_name in enumerate(weights2d.keys()):
                dict2d[key_id] = key_name

            dict3d = {}
            for key_id, key_name in enumerate(weights3d.keys()):
                dict3d[key_id] = key_name

            dict_transform = {}
            for key_id, key_name in dict3d.items():
                dict_transform[dict2d[key_id]] = key_name

            cnt = 0
            for key2d, key3d in dict_transform.items():
                if 'conv' in key3d:
                    temporal_dim = weights3d[key3d].shape[2]
                    temporal_2d = nd.expand_dims(weights2d[key2d].data(), axis=2)
                    inflated_2d = nd.broadcast_to(temporal_2d, shape=[0, 0, temporal_dim, 0, 0]) / temporal_dim
                    assert inflated_2d.shape == weights3d[key3d].shape, 'the shape of %s and %s does not match. ' % (key2d, key3d)
                    weights3d[key3d].set_data(inflated_2d)
                    cnt += 1
                    print('%s is done with shape: ' % (key3d), weights3d[key3d].shape)
                if 'batchnorm' in key3d:
                    assert weights2d[key2d].shape == weights3d[key3d].shape, 'the shape of %s and %s does not match. ' % (key2d, key3d)
                    weights3d[key3d].set_data(weights2d[key2d].data())
                    cnt += 1
                    print('%s is done with shape: ' % (key3d), weights3d[key3d].shape)
                if 'dense' in key3d:
                    cnt += 1
                    print('%s is skipped with shape: ' % (key3d), weights3d[key3d].shape)

            assert cnt == len(weights2d.keys()), 'Not all parameters have been ported, check the initialization.'

[docs]    def hybrid_forward(self, F, x):
        """Hybrid forward of I3D network"""
        x = self.first_stage(x)
        outs = []
        for i, res_layer in enumerate(self.res_layers):
            x = res_layer(x)
            if i in self.out_indices:
                outs.append(x)
            if i == 0:
                x = self.pool2(x)

        feat = outs[0]

        # spatial temporal average
        pooled_feat = self.st_avg(feat)
        x = F.squeeze(pooled_feat, axis=(2, 3, 4))

        # segmental consensus
        x = F.reshape(x, shape=(-1, self.num_segments * self.num_crop, self.feat_dim))
        x = F.mean(x, axis=1)

        if self.feat_ext:
            return x

        x = self.head(x)
        return x

[docs]def i3d_resnet50_v1_kinetics400(nclass=400, pretrained=False, pretrained_base=True, ctx=cpu(),
                                root='~/.mxnet/models', use_tsn=False, num_segments=1, num_crop=1,
                                partial_bn=False, bn_frozen=False, feat_ext=False, **kwargs):
    r"""Inflated 3D model (I3D) with ResNet50 backbone trained on Kinetics400 dataset.

    Parameters
    ----------
    nclass : int.
        Number of categories in the dataset.
    pretrained : bool or str.
        Boolean value controls whether to load the default pretrained weights for model.
        String value represents the hashtag for a certain version of pretrained weights.
    pretrained_base : bool or str, optional, default is True.
        Load pretrained base network, the extra layers are randomized. Note that
        if pretrained is `True`, this has no effect.
    ctx : Context, default CPU.
        The context in which to load the pretrained weights.
    root : str, default $MXNET_HOME/models
        Location for keeping the model parameters.
    num_segments : int, default is 1.
        Number of segments used to evenly divide a video.
    num_crop : int, default is 1.
        Number of crops used during evaluation, choices are 1, 3 or 10.
    partial_bn : bool, default False.
        Freeze all batch normalization layers during training except the first layer.
    bn_frozen : bool.
        Whether to freeze weight and bias of BN layers.
    feat_ext : bool.
        Whether to extract features before dense classification layer or
        do a complete forward pass.
    """

    model = I3D_ResNetV1(nclass=nclass,
                         depth=50,
                         pretrained=pretrained,
                         pretrained_base=pretrained_base,
                         feat_ext=feat_ext,
                         num_segments=num_segments,
                         num_crop=num_crop,
                         out_indices=[3],
                         inflate_freq=((1, 1, 1), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 1, 0)),
                         bn_eval=False,
                         partial_bn=partial_bn,
                         bn_frozen=bn_frozen,
                         ctx=ctx,
                         **kwargs)

    if pretrained:
        from ..model_store import get_model_file
        model.load_parameters(get_model_file('i3d_resnet50_v1_kinetics400',
                                             tag=pretrained, root=root), ctx=ctx)
        from ...data import Kinetics400Attr
        attrib = Kinetics400Attr()
        model.classes = attrib.classes
    model.collect_params().reset_ctx(ctx)

    return model

[docs]def i3d_resnet101_v1_kinetics400(nclass=400, pretrained=False, pretrained_base=True, ctx=cpu(),
                                 root='~/.mxnet/models', use_tsn=False, num_segments=1, num_crop=1,
                                 partial_bn=False, feat_ext=False, **kwargs):
    r"""Inflated 3D model (I3D) with ResNet101 backbone trained on Kinetics400 dataset.

    Parameters
    ----------
    nclass : int.
        Number of categories in the dataset.
    pretrained : bool or str.
        Boolean value controls whether to load the default pretrained weights for model.
        String value represents the hashtag for a certain version of pretrained weights.
    pretrained_base : bool or str, optional, default is True.
        Load pretrained base network, the extra layers are randomized. Note that
        if pretrained is `True`, this has no effect.
    ctx : Context, default CPU.
        The context in which to load the pretrained weights.
    root : str, default $MXNET_HOME/models
        Location for keeping the model parameters.
    num_segments : int, default is 1.
        Number of segments used to evenly divide a video.
    num_crop : int, default is 1.
        Number of crops used during evaluation, choices are 1, 3 or 10.
    partial_bn : bool, default False.
        Freeze all batch normalization layers during training except the first layer.
    bn_frozen : bool.
        Whether to freeze weight and bias of BN layers.
    feat_ext : bool.
        Whether to extract features before dense classification layer or
        do a complete forward pass.
    """

    model = I3D_ResNetV1(nclass=nclass,
                         depth=101,
                         pretrained=pretrained,
                         pretrained_base=pretrained_base,
                         feat_ext=feat_ext,
                         num_segments=num_segments,
                         num_crop=num_crop,
                         out_indices=[3],
                         inflate_freq=((1, 1, 1), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1), (0, 1, 0)),
                         bn_eval=False,
                         partial_bn=partial_bn,
                         ctx=ctx,
                         **kwargs)

    if pretrained:
        from ..model_store import get_model_file
        model.load_parameters(get_model_file('i3d_resnet101_v1_kinetics400',
                                             tag=pretrained, root=root), ctx=ctx)
        from ...data import Kinetics400Attr
        attrib = Kinetics400Attr()
        model.classes = attrib.classes
    model.collect_params().reset_ctx(ctx)

    return model

[docs]def i3d_nl5_resnet50_v1_kinetics400(nclass=400, pretrained=False, pretrained_base=True, ctx=cpu(),
                                    root='~/.mxnet/models', use_tsn=False, num_segments=1, num_crop=1,
                                    partial_bn=False, feat_ext=False, **kwargs):
    r"""Inflated 3D model (I3D) with ResNet50 backbone and 5 non-local blocks
    trained on Kinetics400 dataset.

    Parameters
    ----------
    nclass : int.
        Number of categories in the dataset.
    pretrained : bool or str.
        Boolean value controls whether to load the default pretrained weights for model.
        String value represents the hashtag for a certain version of pretrained weights.
    pretrained_base : bool or str, optional, default is True.
        Load pretrained base network, the extra layers are randomized. Note that
        if pretrained is `True`, this has no effect.
    ctx : Context, default CPU.
        The context in which to load the pretrained weights.
    root : str, default $MXNET_HOME/models
        Location for keeping the model parameters.
    num_segments : int, default is 1.
        Number of segments used to evenly divide a video.
    num_crop : int, default is 1.
        Number of crops used during evaluation, choices are 1, 3 or 10.
    partial_bn : bool, default False.
        Freeze all batch normalization layers during training except the first layer.
    bn_frozen : bool.
        Whether to freeze weight and bias of BN layers.
    feat_ext : bool.
        Whether to extract features before dense classification layer or
        do a complete forward pass.
    """

    model = I3D_ResNetV1(nclass=nclass,
                         depth=50,
                         pretrained=pretrained,
                         pretrained_base=pretrained_base,
                         feat_ext=feat_ext,
                         num_segments=num_segments,
                         num_crop=num_crop,
                         out_indices=[3],
                         inflate_freq=((1, 1, 1), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 1, 0)),
                         nonlocal_stages=(1, 2),
                         nonlocal_cfg=dict(nonlocal_type="gaussian"),
                         nonlocal_freq=((0, 0, 0), (0, 1, 0, 1), (0, 1, 0, 1, 0, 1), (0, 0, 0)),
                         bn_eval=False,
                         partial_bn=partial_bn,
                         ctx=ctx,
                         **kwargs)

    if pretrained:
        from ..model_store import get_model_file
        model.load_parameters(get_model_file('i3d_nl5_resnet50_v1_kinetics400',
                                             tag=pretrained, root=root), ctx=ctx)
        from ...data import Kinetics400Attr
        attrib = Kinetics400Attr()
        model.classes = attrib.classes
    model.collect_params().reset_ctx(ctx)

    return model

[docs]def i3d_nl10_resnet50_v1_kinetics400(nclass=400, pretrained=False, pretrained_base=True, ctx=cpu(),
                                     root='~/.mxnet/models', use_tsn=False, num_segments=1, num_crop=1,
                                     partial_bn=False, feat_ext=False, **kwargs):
    r"""Inflated 3D model (I3D) with ResNet50 backbone and 10 non-local blocks
    trained on Kinetics400 dataset.

    Parameters
    ----------
    nclass : int.
        Number of categories in the dataset.
    pretrained : bool or str.
        Boolean value controls whether to load the default pretrained weights for model.
        String value represents the hashtag for a certain version of pretrained weights.
    pretrained_base : bool or str, optional, default is True.
        Load pretrained base network, the extra layers are randomized. Note that
        if pretrained is `True`, this has no effect.
    ctx : Context, default CPU.
        The context in which to load the pretrained weights.
    root : str, default $MXNET_HOME/models
        Location for keeping the model parameters.
    num_segments : int, default is 1.
        Number of segments used to evenly divide a video.
    num_crop : int, default is 1.
        Number of crops used during evaluation, choices are 1, 3 or 10.
    partial_bn : bool, default False.
        Freeze all batch normalization layers during training except the first layer.
    bn_frozen : bool.
        Whether to freeze weight and bias of BN layers.
    feat_ext : bool.
        Whether to extract features before dense classification layer or
        do a complete forward pass.
    """

    model = I3D_ResNetV1(nclass=nclass,
                         depth=50,
                         pretrained=pretrained,
                         pretrained_base=pretrained_base,
                         feat_ext=feat_ext,
                         num_segments=num_segments,
                         num_crop=num_crop,
                         out_indices=[3],
                         inflate_freq=((1, 1, 1), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 1, 0)),
                         nonlocal_stages=(1, 2),
                         nonlocal_cfg=dict(nonlocal_type="gaussian"),
                         nonlocal_freq=((0, 0, 0), (1, 1, 1, 1), (1, 1, 1, 1, 1, 1), (0, 0, 0)),
                         bn_eval=False,
                         partial_bn=partial_bn,
                         ctx=ctx,
                         **kwargs)

    if pretrained:
        from ..model_store import get_model_file
        model.load_parameters(get_model_file('i3d_nl10_resnet50_v1_kinetics400',
                                             tag=pretrained, root=root), ctx=ctx)
        from ...data import Kinetics400Attr
        attrib = Kinetics400Attr()
        model.classes = attrib.classes
    model.collect_params().reset_ctx(ctx)

    return model

[docs]def i3d_nl5_resnet101_v1_kinetics400(nclass=400, pretrained=False, pretrained_base=True, ctx=cpu(),
                                     root='~/.mxnet/models', use_tsn=False, num_segments=1, num_crop=1,
                                     partial_bn=False, feat_ext=False, **kwargs):
    r"""Inflated 3D model (I3D) with ResNet101 backbone and 5 non-local blocks
    trained on Kinetics400 dataset.

    Parameters
    ----------
    nclass : int.
        Number of categories in the dataset.
    pretrained : bool or str.
        Boolean value controls whether to load the default pretrained weights for model.
        String value represents the hashtag for a certain version of pretrained weights.
    pretrained_base : bool or str, optional, default is True.
        Load pretrained base network, the extra layers are randomized. Note that
        if pretrained is `True`, this has no effect.
    ctx : Context, default CPU.
        The context in which to load the pretrained weights.
    root : str, default $MXNET_HOME/models
        Location for keeping the model parameters.
    num_segments : int, default is 1.
        Number of segments used to evenly divide a video.
    num_crop : int, default is 1.
        Number of crops used during evaluation, choices are 1, 3 or 10.
    partial_bn : bool, default False.
        Freeze all batch normalization layers during training except the first layer.
    bn_frozen : bool.
        Whether to freeze weight and bias of BN layers.
    feat_ext : bool.
        Whether to extract features before dense classification layer or
        do a complete forward pass.
    """

    model = I3D_ResNetV1(nclass=nclass,
                         depth=101,
                         pretrained=pretrained,
                         pretrained_base=pretrained_base,
                         feat_ext=feat_ext,
                         num_segments=num_segments,
                         num_crop=num_crop,
                         out_indices=[3],
                         inflate_freq=((1, 1, 1), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1), (0, 1, 0)),
                         nonlocal_stages=(1, 2),
                         nonlocal_cfg=dict(nonlocal_type="gaussian"),
                         nonlocal_freq=((0, 0, 0), (0, 1, 0, 1), (0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0), (0, 0, 0)),
                         bn_eval=False,
                         partial_bn=partial_bn,
                         ctx=ctx,
                         **kwargs)

    if pretrained:
        from ..model_store import get_model_file
        model.load_parameters(get_model_file('i3d_nl5_resnet101_v1_kinetics400',
                                             tag=pretrained, root=root), ctx=ctx)
        from ...data import Kinetics400Attr
        attrib = Kinetics400Attr()
        model.classes = attrib.classes
    model.collect_params().reset_ctx(ctx)

    return model

[docs]def i3d_nl10_resnet101_v1_kinetics400(nclass=400, pretrained=False, pretrained_base=True, ctx=cpu(),
                                      root='~/.mxnet/models', use_tsn=False, num_segments=1, num_crop=1,
                                      partial_bn=False, feat_ext=False, **kwargs):
    r"""Inflated 3D model (I3D) with ResNet101 backbone and 10 non-local blocks
    trained on Kinetics400 dataset.

    Parameters
    ----------
    nclass : int.
        Number of categories in the dataset.
    pretrained : bool or str.
        Boolean value controls whether to load the default pretrained weights for model.
        String value represents the hashtag for a certain version of pretrained weights.
    pretrained_base : bool or str, optional, default is True.
        Load pretrained base network, the extra layers are randomized. Note that
        if pretrained is `True`, this has no effect.
    ctx : Context, default CPU.
        The context in which to load the pretrained weights.
    root : str, default $MXNET_HOME/models
        Location for keeping the model parameters.
    num_segments : int, default is 1.
        Number of segments used to evenly divide a video.
    num_crop : int, default is 1.
        Number of crops used during evaluation, choices are 1, 3 or 10.
    partial_bn : bool, default False.
        Freeze all batch normalization layers during training except the first layer.
    bn_frozen : bool.
        Whether to freeze weight and bias of BN layers.
    feat_ext : bool.
        Whether to extract features before dense classification layer or
        do a complete forward pass.
    """

    model = I3D_ResNetV1(nclass=nclass,
                         depth=101,
                         pretrained=pretrained,
                         pretrained_base=pretrained_base,
                         feat_ext=feat_ext,
                         num_segments=num_segments,
                         num_crop=num_crop,
                         out_indices=[3],
                         inflate_freq=((1, 1, 1), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1), (0, 1, 0)),
                         nonlocal_stages=(1, 2),
                         nonlocal_cfg=dict(nonlocal_type="gaussian"),
                         nonlocal_freq=((0, 0, 0), (1, 1, 1, 1), (0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1), (0, 0, 0)),
                         bn_eval=False,
                         partial_bn=partial_bn,
                         ctx=ctx,
                         **kwargs)

    if pretrained:
        from ..model_store import get_model_file
        model.load_parameters(get_model_file('i3d_nl10_resnet101_v1_kinetics400',
                                             tag=pretrained, root=root), ctx=ctx)
        from ...data import Kinetics400Attr
        attrib = Kinetics400Attr()
        model.classes = attrib.classes
    model.collect_params().reset_ctx(ctx)

    return model

[docs]def i3d_resnet50_v1_sthsthv2(nclass=174, pretrained=False, pretrained_base=True, ctx=cpu(),
                             root='~/.mxnet/models', use_tsn=False, num_segments=1, num_crop=1,
                             partial_bn=False, feat_ext=False, **kwargs):
    r"""Inflated 3D model (I3D) with ResNet50 backbone trained on Something-Something-V2 dataset.

    Parameters
    ----------
    nclass : int.
        Number of categories in the dataset.
    pretrained : bool or str.
        Boolean value controls whether to load the default pretrained weights for model.
        String value represents the hashtag for a certain version of pretrained weights.
    pretrained_base : bool or str, optional, default is True.
        Load pretrained base network, the extra layers are randomized. Note that
        if pretrained is `True`, this has no effect.
    ctx : Context, default CPU.
        The context in which to load the pretrained weights.
    root : str, default $MXNET_HOME/models
        Location for keeping the model parameters.
    num_segments : int, default is 1.
        Number of segments used to evenly divide a video.
    num_crop : int, default is 1.
        Number of crops used during evaluation, choices are 1, 3 or 10.
    partial_bn : bool, default False.
        Freeze all batch normalization layers during training except the first layer.
    bn_frozen : bool.
        Whether to freeze weight and bias of BN layers.
    feat_ext : bool.
        Whether to extract features before dense classification layer or
        do a complete forward pass.
    """

    model = I3D_ResNetV1(nclass=nclass,
                         depth=50,
                         pretrained=pretrained,
                         pretrained_base=pretrained_base,
                         feat_ext=feat_ext,
                         num_segments=num_segments,
                         num_crop=num_crop,
                         out_indices=[3],
                         inflate_freq=((1, 1, 1), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 1, 0)),
                         bn_eval=False,
                         partial_bn=partial_bn,
                         ctx=ctx,
                         **kwargs)

    if pretrained:
        from ..model_store import get_model_file
        model.load_parameters(get_model_file('i3d_resnet50_v1_sthsthv2',
                                             tag=pretrained, root=root), ctx=ctx)
        from ...data import SomethingSomethingV2Attr
        attrib = SomethingSomethingV2Attr()
        model.classes = attrib.classes
    model.collect_params().reset_ctx(ctx)

    return model

[docs]def i3d_resnet50_v1_hmdb51(nclass=51, pretrained=False, pretrained_base=True, ctx=cpu(),
                           root='~/.mxnet/models', use_tsn=False, num_segments=1, num_crop=1,
                           partial_bn=False, use_kinetics_pretrain=True, feat_ext=False, **kwargs):
    r"""Inflated 3D model (I3D) with ResNet50 backbone trained on HMDB51 dataset.

    Parameters
    ----------
    nclass : int.
        Number of categories in the dataset.
    pretrained : bool or str.
        Boolean value controls whether to load the default pretrained weights for model.
        String value represents the hashtag for a certain version of pretrained weights.
    pretrained_base : bool or str, optional, default is True.
        Load pretrained base network, the extra layers are randomized. Note that
        if pretrained is `True`, this has no effect.
    ctx : Context, default CPU.
        The context in which to load the pretrained weights.
    root : str, default $MXNET_HOME/models
        Location for keeping the model parameters.
    num_segments : int, default is 1.
        Number of segments used to evenly divide a video.
    num_crop : int, default is 1.
        Number of crops used during evaluation, choices are 1, 3 or 10.
    partial_bn : bool, default False.
        Freeze all batch normalization layers during training except the first layer.
    bn_frozen : bool.
        Whether to freeze weight and bias of BN layers.
    feat_ext : bool.
        Whether to extract features before dense classification layer or
        do a complete forward pass.
    """

    model = I3D_ResNetV1(nclass=nclass,
                         depth=50,
                         pretrained=pretrained,
                         pretrained_base=pretrained_base,
                         feat_ext=feat_ext,
                         num_segments=num_segments,
                         num_crop=num_crop,
                         out_indices=[3],
                         inflate_freq=((1, 1, 1), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 1, 0)),
                         bn_eval=False,
                         partial_bn=False,
                         ctx=ctx,
                         dropout_ratio=0.8,
                         init_std=0.001,
                         **kwargs)

    if use_kinetics_pretrain and not pretrained:
        from gluoncv.model_zoo import get_model
        kinetics_model = get_model('i3d_resnet50_v1_kinetics400', nclass=400, pretrained=True)
        source_params = kinetics_model.collect_params()
        target_params = model.collect_params()
        assert len(source_params.keys()) == len(target_params.keys())

        pretrained_weights = []
        for layer_name in source_params.keys():
            pretrained_weights.append(source_params[layer_name].data())

        for i, layer_name in enumerate(target_params.keys()):
            if i + 2 == len(source_params.keys()):
                # skip the last dense layer
                break
            target_params[layer_name].set_data(pretrained_weights[i])

    if pretrained:
        from ..model_store import get_model_file
        model.load_parameters(get_model_file('i3d_resnet50_v1_hmdb51',
                                             tag=pretrained, root=root), ctx=ctx)
        from ...data import HMDB51Attr
        attrib = HMDB51Attr()
        model.classes = attrib.classes
    model.collect_params().reset_ctx(ctx)

    return model

[docs]def i3d_resnet50_v1_ucf101(nclass=101, pretrained=False, pretrained_base=True, ctx=cpu(),
                           root='~/.mxnet/models', use_tsn=False, num_segments=1, num_crop=1,
                           partial_bn=False, use_kinetics_pretrain=True, feat_ext=False, **kwargs):
    r"""Inflated 3D model (I3D) with ResNet50 backbone trained on UCF101 dataset.

    Parameters
    ----------
    nclass : int.
        Number of categories in the dataset.
    pretrained : bool or str.
        Boolean value controls whether to load the default pretrained weights for model.
        String value represents the hashtag for a certain version of pretrained weights.
    pretrained_base : bool or str, optional, default is True.
        Load pretrained base network, the extra layers are randomized. Note that
        if pretrained is `True`, this has no effect.
    ctx : Context, default CPU.
        The context in which to load the pretrained weights.
    root : str, default $MXNET_HOME/models
        Location for keeping the model parameters.
    num_segments : int, default is 1.
        Number of segments used to evenly divide a video.
    num_crop : int, default is 1.
        Number of crops used during evaluation, choices are 1, 3 or 10.
    partial_bn : bool, default False.
        Freeze all batch normalization layers during training except the first layer.
    bn_frozen : bool.
        Whether to freeze weight and bias of BN layers.
    feat_ext : bool.
        Whether to extract features before dense classification layer or
        do a complete forward pass.
    """

    model = I3D_ResNetV1(nclass=nclass,
                         depth=50,
                         pretrained=pretrained,
                         pretrained_base=pretrained_base,
                         feat_ext=feat_ext,
                         num_segments=num_segments,
                         num_crop=num_crop,
                         out_indices=[3],
                         inflate_freq=((1, 1, 1), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 1, 0)),
                         bn_eval=False,
                         partial_bn=True,
                         ctx=ctx,
                         dropout_ratio=0.8,
                         init_std=0.001,
                         **kwargs)

    if use_kinetics_pretrain and not pretrained:
        from gluoncv.model_zoo import get_model
        kinetics_model = get_model('i3d_resnet50_v1_kinetics400', nclass=400, pretrained=True)
        source_params = kinetics_model.collect_params()
        target_params = model.collect_params()
        assert len(source_params.keys()) == len(target_params.keys())

        pretrained_weights = []
        for layer_name in source_params.keys():
            pretrained_weights.append(source_params[layer_name].data())

        for i, layer_name in enumerate(target_params.keys()):
            if i + 2 == len(source_params.keys()):
                # skip the last dense layer
                break
            target_params[layer_name].set_data(pretrained_weights[i])

    if pretrained:
        from ..model_store import get_model_file
        model.load_parameters(get_model_file('i3d_resnet50_v1_ucf101',
                                             tag=pretrained, root=root), ctx=ctx)
        from ...data import UCF101Attr
        attrib = UCF101Attr()
        model.classes = attrib.classes
    model.collect_params().reset_ctx(ctx)

    return model

[docs]def i3d_resnet50_v1_custom(nclass=400, pretrained=False, pretrained_base=True, ctx=cpu(),
                           root='~/.mxnet/models', use_tsn=False, num_segments=1, num_crop=1,
                           partial_bn=False, use_kinetics_pretrain=True, feat_ext=False, **kwargs):
    r"""Inflated 3D model (I3D) with ResNet50 backbone. Customized for users's own dataset.

    Parameters
    ----------
    nclass : int.
        Number of categories in the dataset.
    pretrained : bool or str.
        Boolean value controls whether to load the default pretrained weights for model.
        String value represents the hashtag for a certain version of pretrained weights.
    pretrained_base : bool or str, optional, default is True.
        Load pretrained base network, the extra layers are randomized. Note that
        if pretrained is `True`, this has no effect.
    ctx : Context, default CPU.
        The context in which to load the pretrained weights.
    root : str, default $MXNET_HOME/models
        Location for keeping the model parameters.
    num_segments : int, default is 1.
        Number of segments used to evenly divide a video.
    num_crop : int, default is 1.
        Number of crops used during evaluation, choices are 1, 3 or 10.
    partial_bn : bool, default False.
        Freeze all batch normalization layers during training except the first layer.
    bn_frozen : bool.
        Whether to freeze weight and bias of BN layers.
    feat_ext : bool.
        Whether to extract features before dense classification layer or
        do a complete forward pass.
    use_kinetics_pretrain : bool.
        Whether to load Kinetics-400 pre-trained model weights.
    """

    model = I3D_ResNetV1(nclass=nclass,
                         depth=50,
                         pretrained=pretrained,
                         pretrained_base=pretrained_base,
                         feat_ext=feat_ext,
                         num_segments=num_segments,
                         num_crop=num_crop,
                         out_indices=[3],
                         inflate_freq=((1, 1, 1), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 1, 0)),
                         bn_eval=False,
                         partial_bn=True,
                         ctx=ctx,
                         dropout_ratio=0.8,
                         init_std=0.001,
                         **kwargs)

    if use_kinetics_pretrain and not pretrained:
        from gluoncv.model_zoo import get_model
        kinetics_model = get_model('i3d_resnet50_v1_kinetics400', nclass=400, pretrained=True)
        source_params = kinetics_model.collect_params()
        target_params = model.collect_params()
        assert len(source_params.keys()) == len(target_params.keys())

        pretrained_weights = []
        for layer_name in source_params.keys():
            pretrained_weights.append(source_params[layer_name].data())

        for i, layer_name in enumerate(target_params.keys()):
            if i + 2 == len(source_params.keys()):
                # skip the last dense layer
                break
            target_params[layer_name].set_data(pretrained_weights[i])

    model.collect_params().reset_ctx(ctx)

    return model