Source code for gluoncv.nn.feature

# pylint: disable=abstract-method
"""Feature extraction blocks.
Feature or Multi-Feature extraction is a key component in object detection.
Class predictor/Box predictor are usually applied on feature layer(s).
A good feature extraction mechanism is critical to performance.
"""
from __future__ import absolute_import

import mxnet as mx
from mxnet.base import string_types
from mxnet.gluon import HybridBlock, SymbolBlock
from mxnet.symbol import Symbol
from mxnet.symbol.contrib import SyncBatchNorm


def _parse_network(network, outputs, inputs, pretrained, ctx, **kwargs):
    """Parse network with specified outputs and other arguments.

    Parameters
    ----------
    network : str or HybridBlock or Symbol
        Logic chain: load from gluoncv.model_zoo if network is string.
        Convert to Symbol if network is HybridBlock
    outputs : str or iterable of str
        The name of layers to be extracted as features.
    inputs : iterable of str
        The name of input datas.
    pretrained : bool
        Use pretrained parameters as in gluon.model_zoo
    ctx : Context
        The context, e.g. mxnet.cpu(), mxnet.gpu(0).

    Returns
    -------
    inputs : list of Symbol
        Network input Symbols, usually ['data']
    outputs : list of Symbol
        Network output Symbols, usually as features
    params : ParameterDict
        Network parameters.
    """
    inputs = list(inputs) if isinstance(inputs, tuple) else inputs
    for i, inp in enumerate(inputs):
        if isinstance(inp, string_types):
            inputs[i] = mx.sym.var(inp)
        assert isinstance(inputs[i], Symbol), "Network expects inputs are Symbols."
    if len(inputs) == 1:
        inputs = inputs[0]
    else:
        inputs = mx.sym.Group(inputs)
    params = None
    prefix = ''
    if isinstance(network, string_types):
        from ..model_zoo import get_model
        network = get_model(network, pretrained=pretrained, ctx=ctx, **kwargs)
    if isinstance(network, HybridBlock):
        params = network.collect_params()
        prefix = network._prefix
        network = network(inputs)
    assert isinstance(network, Symbol), \
        "FeatureExtractor requires the network argument to be either " \
        "str, HybridBlock or Symbol, but got %s" % type(network)

    if isinstance(outputs, string_types):
        outputs = [outputs]
    assert len(outputs) > 0, "At least one outputs must be specified."
    outputs = [out if out.endswith('_output') else out + '_output' for out in outputs]
    outputs = [network.get_internals()[prefix + out] for out in outputs]
    return inputs, outputs, params


[docs]class FeatureExtractor(SymbolBlock):
    """Feature extractor.

    Parameters
    ----------
    network : str or HybridBlock or Symbol
        Logic chain: load from gluoncv.model_zoo if network is string.
        Convert to Symbol if network is HybridBlock
    outputs : str or list of str
        The name of layers to be extracted as features
    inputs : list of str or list of Symbol
        The inputs of network.
    pretrained : bool
        Use pretrained parameters as in gluon.model_zoo
    ctx : Context
        The context, e.g. mxnet.cpu(), mxnet.gpu(0).
    """

    def __init__(self, network, outputs, inputs=('data',),
                 pretrained=False, ctx=mx.cpu(), **kwargs):
        inputs, outputs, params = _parse_network(
            network, outputs, inputs, pretrained, ctx, **kwargs)
        super(FeatureExtractor, self).__init__(outputs, inputs, params=params)


[docs]class FeatureExpander(SymbolBlock):
    """Feature extractor with additional layers to append.
    This is very common in vision networks where extra branches are attached to
    backbone network.

    Parameters
    ----------
    network : str or HybridBlock or Symbol
        Logic chain: load from gluoncv.model_zoo if network is string.
        Convert to Symbol if network is HybridBlock.
    outputs : str or list of str
        The name of layers to be extracted as features
    num_filters : list of int
        Number of filters to be appended.
    use_1x1_transition : bool
        Whether to use 1x1 convolution between attached layers. It is effective
        reducing network size.
    use_bn : bool
        Whether to use BatchNorm between attached layers.
    reduce_ratio : float
        Channel reduction ratio of the transition layers.
    min_depth : int
        Minimum channel number of transition layers.
    global_pool : bool
        Whether to use global pooling as the last layer.
    pretrained : bool
        Use pretrained parameters as in gluon.model_zoo if `True`.
    ctx : Context
        The context, e.g. mxnet.cpu(), mxnet.gpu(0).
    inputs : list of str
        Name of input variables to the network.

    """

    def __init__(self, network, outputs, num_filters, use_1x1_transition=True,
                 use_bn=True, reduce_ratio=1.0, min_depth=128, global_pool=False,
                 pretrained=False, ctx=mx.cpu(), inputs=('data',), **kwargs):
        inputs, outputs, params = _parse_network(
            network, outputs, inputs, pretrained, ctx, **kwargs)
        # append more layers
        y = outputs[-1]
        weight_init = mx.init.Xavier(rnd_type='gaussian', factor_type='out', magnitude=2)
        for i, f in enumerate(num_filters):
            if use_1x1_transition:
                num_trans = max(min_depth, int(round(f * reduce_ratio)))
                y = mx.sym.Convolution(
                    y, num_filter=num_trans, kernel=(1, 1), no_bias=use_bn,
                    name='expand_trans_conv{}'.format(i), attr={'__init__': weight_init})
                if use_bn:
                    y = mx.sym.BatchNorm(y, name='expand_trans_bn{}'.format(i))
                y = mx.sym.Activation(y, act_type='relu', name='expand_trans_relu{}'.format(i))
            y = mx.sym.Convolution(
                y, num_filter=f, kernel=(3, 3), pad=(1, 1), stride=(2, 2),
                no_bias=use_bn, name='expand_conv{}'.format(i), attr={'__init__': weight_init})
            if use_bn:
                y = mx.sym.BatchNorm(y, name='expand_bn{}'.format(i))
            y = mx.sym.Activation(y, act_type='relu', name='expand_relu{}'.format(i))
            outputs.append(y)
        if global_pool:
            outputs.append(mx.sym.Pooling(y, pool_type='avg', global_pool=True, kernel=(1, 1)))
        super(FeatureExpander, self).__init__(outputs, inputs, params)


[docs]class FPNFeatureExpander(SymbolBlock):
    """Feature extractor with additional layers to append.
    This is specified for ``Feature Pyramid Network for Object Detection``
    which implement ``Top-down pathway and lateral connections``.

    Parameters
    ----------
    network : str or HybridBlock or Symbol
        Logic chain: load from gluon.model_zoo.vision if network is string.
        Convert to Symbol if network is HybridBlock.
    outputs : str or list of str
        The name of layers to be extracted as features
    num_filters : list of int e.g. [256, 256, 256, 256]
        Number of filters to be appended.
    use_1x1 : bool
        Whether to use 1x1 convolution
    use_upsample : bool
        Whether to use upsample
    use_elewadd : float
        Whether to use element-wise add operation
    use_p6 : bool
        Whether use P6 stage, this is used for RPN experiments in ori paper
    p6_conv : bool
        Whether to use convolution for P6 stage, if it is enabled, or just max pooling.
    no_bias : bool
        Whether use bias for Convolution operation.
    norm_layer : HybridBlock or SymbolBlock
        Type of normalization layer.
    norm_kwargs : dict
        Arguments for normalization layer.
    pretrained : bool
        Use pretrained parameters as in gluon.model_zoo if `True`.
    ctx : Context
        The context, e.g. mxnet.cpu(), mxnet.gpu(0).
    inputs : list of str
        Name of input variables to the network.

    """

    def __init__(self, network, outputs, num_filters, use_1x1=True, use_upsample=True,
                 use_elewadd=True, use_p6=False, p6_conv=True, no_bias=True, pretrained=False,
                 norm_layer=None, norm_kwargs=None, ctx=mx.cpu(), inputs=('data',)):
        inputs, outputs, params = _parse_network(network, outputs, inputs, pretrained, ctx)
        if norm_kwargs is None:
            norm_kwargs = {}
        # e.g. For ResNet50, the feature is :
        # outputs = ['stage1_activation2', 'stage2_activation3',
        #            'stage3_activation5', 'stage4_activation2']
        # with regard to [conv2, conv3, conv4, conv5] -> [C2, C3, C4, C5]
        # append more layers with reversed order : [P5, P4, P3, P2]
        y = outputs[-1]
        base_features = outputs[::-1]
        num_stages = len(num_filters) + 1  # usually 5
        weight_init = mx.init.Xavier(rnd_type='uniform', factor_type='in', magnitude=1.)
        tmp_outputs = []
        # num_filter is 256 in ori paper
        for i, (bf, f) in enumerate(zip(base_features, num_filters)):
            if i == 0:
                if use_1x1:
                    y = mx.sym.Convolution(y, num_filter=f, kernel=(1, 1), pad=(0, 0),
                                           stride=(1, 1), no_bias=no_bias,
                                           name="P{}_conv_lat".format(num_stages - i),
                                           attr={'__init__': weight_init})
                    if norm_layer is not None:
                        if norm_layer is SyncBatchNorm:
                            norm_kwargs['key'] = "P{}_lat_bn".format(num_stages - i)
                            norm_kwargs['name'] = "P{}_lat_bn".format(num_stages - i)
                        y = norm_layer(y, **norm_kwargs)
                if use_p6 and p6_conv:
                    # method 2 : use conv (Deformable use this)
                    y_p6 = mx.sym.Convolution(y, num_filter=f, kernel=(3, 3), pad=(1, 1),
                                              stride=(2, 2), no_bias=no_bias,
                                              name='P{}_conv1'.format(num_stages + 1),
                                              attr={'__init__': weight_init})
                    if norm_layer is not None:
                        if norm_layer is SyncBatchNorm:
                            norm_kwargs['key'] = "P{}_pre_bn".format(num_stages + 1)
                            norm_kwargs['name'] = "P{}_pre_bn".format(num_stages + 1)
                        y_p6 = norm_layer(y_p6, **norm_kwargs)
            else:
                if use_1x1:
                    bf = mx.sym.Convolution(bf, num_filter=f, kernel=(1, 1), pad=(0, 0),
                                            stride=(1, 1), no_bias=no_bias,
                                            name="P{}_conv_lat".format(num_stages - i),
                                            attr={'__init__': weight_init})
                    if norm_layer is not None:
                        if norm_layer is SyncBatchNorm:
                            norm_kwargs['key'] = "P{}_conv1_bn".format(num_stages - i)
                            norm_kwargs['name'] = "P{}_conv1_bn".format(num_stages - i)
                        bf = norm_layer(bf, **norm_kwargs)
                if use_upsample:
                    y = mx.sym.UpSampling(y, scale=2, sample_type='nearest',
                                          name="P{}_upsp".format(num_stages - i))

                if use_elewadd:
                    # make two symbol alignment
                    # method 1 : mx.sym.Crop
                    # y = mx.sym.Crop(*[y, bf], name="P{}_clip".format(num_stages-i))
                    # method 2 : mx.sym.slice_like
                    y = mx.sym.slice_like(y, bf, axes=(2, 3),
                                          name="P{}_clip".format(num_stages - i))
                    y = mx.sym.ElementWiseSum(bf, y, name="P{}_sum".format(num_stages - i))
            # Reduce the aliasing effect of upsampling described in ori paper
            out = mx.sym.Convolution(y, num_filter=f, kernel=(3, 3), pad=(1, 1), stride=(1, 1),
                                     no_bias=no_bias, name='P{}_conv1'.format(num_stages - i),
                                     attr={'__init__': weight_init})
            if i == 0 and use_p6 and not p6_conv:
                # method 2 : use max pool (Detectron use this)
                y_p6 = mx.sym.Pooling(out, pool_type='max', kernel=(1, 1), pad=(0, 0),
                                      stride=(2, 2), name="P{}_pre".format(num_stages + 1))
            if norm_layer is not None:
                if norm_layer is SyncBatchNorm:
                    norm_kwargs['key'] = "P{}_bn".format(num_stages - i)
                    norm_kwargs['name'] = "P{}_bn".format(num_stages - i)
                out = norm_layer(out, **norm_kwargs)
            tmp_outputs.append(out)
        if use_p6:
            outputs = tmp_outputs[::-1] + [y_p6]  # [P2, P3, P4, P5] + [P6]
        else:
            outputs = tmp_outputs[::-1]  # [P2, P3, P4, P5]

        super(FPNFeatureExpander, self).__init__(outputs, inputs, params)