# Source code for openspeech.encoders.jasper

# MIT License
#
# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Tuple
from omegaconf import DictConfig

from openspeech.modules import JasperSubBlock, JasperBlock, MaskConv1d


class Jasper(nn.Module):
    r"""
    Jasper (Just Another Speech Recognizer), an ASR model comprised of 54 layers proposed by NVIDIA.
    Jasper achieved sub 3 percent word error rate (WER) on the LibriSpeech dataset.

    The encoder is built from one leading ``JasperSubBlock``, ``configs.num_blocks`` ``JasperBlock`` modules
    that receive a dense residual computed from all previous block outputs, and three trailing
    ``JasperSubBlock`` post-processing layers.

    Args:
        configs (DictConfig): model configuration. The fields ``in_channels``, ``out_channels``,
            ``kernel_size``, ``stride``, ``dilation`` and ``dropout_p`` are strings holding Python sequence
            literals with one entry per layer; ``num_blocks`` and ``num_sub_blocks`` are integers.
        input_dim (int): number of input channels of the first layer
        num_classes (int): number of classification, used as the output channel count wherever
            ``out_channels`` holds ``None`` for a post-processing layer

    Inputs: inputs, input_lengths
        - **inputs**: tensor contains input sequence vector
        - **input_lengths**: tensor contains sequence lengths

    Returns:
            (Tensor, Tensor):

            * outputs (torch.FloatTensor): Log probability of model predictions.  ``(batch, seq_length, num_classes)``
            * output_lengths (torch.LongTensor): The length of output tensor ``(batch)``

    Reference:
        Jason Li. et al.: Jasper: An End-to-End Convolutional Neural Acoustic Model
        https://arxiv.org/pdf/1904.03288.pdf
    """

    def __init__(self, configs: "DictConfig", input_dim: int, num_classes: int) -> None:
        super(Jasper, self).__init__()
        self.configs = configs
        self.layers = nn.ModuleList()

        # NOTE(security): these config fields are strings that are parsed with ``eval``; only load
        # configurations from trusted sources. ``ast.literal_eval`` would be a safer parser if the
        # values are always plain literals - confirm before switching.
        in_channels = eval(self.configs.in_channels)
        out_channels = eval(self.configs.out_channels)
        kernel_size = eval(self.configs.kernel_size)
        stride = eval(self.configs.stride)
        dilation = eval(self.configs.dilation)
        dropout_p = eval(self.configs.dropout_p)

        # Index 0: the pre-processing sub-block, the only layer that is given a stride.
        self.layers.append(
            JasperSubBlock(
                in_channels=input_dim,
                out_channels=out_channels[0],
                kernel_size=kernel_size[0],
                stride=stride[0],
                dilation=dilation[0],
                dropout_p=dropout_p[0],
                activation='relu',
                bias=False,
            )
        )
        # Indices 1 .. num_blocks: the Jasper blocks that consume the dense residual.
        self.layers.extend([
            JasperBlock(
                num_sub_blocks=self.configs.num_sub_blocks,
                in_channels=in_channels[i],
                out_channels=out_channels[i],
                kernel_size=kernel_size[i],
                dilation=dilation[i],
                dropout_p=dropout_p[i],
                activation='relu',
                bias=False,
            ) for i in range(1, self.configs.num_blocks + 1)
        ])
        # Indices num_blocks + 1 .. num_blocks + 3: post-processing sub-blocks. A ``None`` entry in
        # ``out_channels`` is replaced by ``num_classes``.
        # NOTE(review): ``i`` runs over num_blocks + 1 .. num_blocks + 3, so ``i == 2`` can only hold
        # when num_blocks <= 1; otherwise every post-processing layer is built without bias. The
        # position inside this list may have been intended - confirm before changing, because it
        # alters the parameter set of the model.
        self.postprocess_layers = nn.ModuleList([
            JasperSubBlock(
                in_channels=in_channels[i],
                out_channels=num_classes if out_channels[i] is None else out_channels[i],
                kernel_size=kernel_size[i],
                dilation=dilation[i],
                dropout_p=dropout_p[i],
                activation='relu',
                bias=(i == 2),
            ) for i in range(self.configs.num_blocks + 1, self.configs.num_blocks + 4)
        ])

        # Hand over the parsed tuples: the raw config fields are strings and must not be indexed.
        self.residual_connections = self._create_jasper_dense_residual_connections(in_channels, out_channels)

    def count_parameters(self) -> int:
        r""" Count parameters of model """
        # ``numel`` is a method; it has to be called to obtain the element count.
        return sum(p.numel() for p in self.parameters())

    def update_dropout(self, dropout_p: float) -> None:
        r""" Update dropout probability of model """
        # The direct children of this module are ModuleList containers, so the dropout layers can
        # only be reached by walking the whole module tree.
        for module in self.modules():
            if isinstance(module, nn.Dropout):
                module.p = dropout_p

    def forward(
            self,
            inputs: torch.Tensor,
            input_lengths: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        r"""
        Forward propagate a `inputs` for  encoder_only training.

        Args:
            inputs (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be a padded
                `FloatTensor` of size ``(batch, seq_length, dimension)``.
            input_lengths (torch.LongTensor): The length of input tensor. ``(batch)``

        Returns:
            (Tensor, Tensor):

            * outputs (torch.FloatTensor): Log probability of model predictions.  ``(batch, seq_length, num_classes)``
            * output_lengths (torch.LongTensor): The length of output tensor ``(batch)``
        """
        residual, prev_outputs, prev_output_lengths = None, list(), list()
        # Swap dimensions 1 and 2 for the convolutional layers; swapped back before the softmax.
        inputs = inputs.transpose(1, 2)

        # Every layer but the last records its output so that the next layer can receive a residual
        # built from all outputs seen so far. The first layer is called with ``residual=None``.
        for i, layer in enumerate(self.layers[:-1]):
            inputs, input_lengths = layer(inputs, input_lengths, residual)
            prev_outputs.append(inputs)
            prev_output_lengths.append(input_lengths)
            residual = self._get_jasper_dencse_residual(prev_outputs, prev_output_lengths, i)

        outputs, output_lengths = self.layers[-1](inputs, input_lengths, residual)

        for layer in self.postprocess_layers:
            outputs, output_lengths = layer(outputs, output_lengths)

        outputs = F.log_softmax(outputs.transpose(1, 2), dim=-1)

        return outputs, output_lengths

    def _get_jasper_dencse_residual(self, prev_outputs: list, prev_output_lengths: list, index: int):
        r"""
        Combine the recorded layer outputs into the residual for the layer at position ``index + 1``.

        Args:
            prev_outputs (list): outputs of ``self.layers[0 .. index]``
            prev_output_lengths (list): output lengths matching ``prev_outputs``
            index (int): position of the most recently executed layer

        Returns:
            The accumulated residual tensor, or ``None`` when there is nothing to combine.
        """
        residual = None

        for item in zip(prev_outputs, prev_output_lengths, self.residual_connections[index]):
            prev_output, prev_output_length, residual_modules = item
            conv1x1, batch_norm = residual_modules

            # The 1x1 convolution returns a tuple; element 0 is the projected tensor.
            projected = conv1x1(prev_output, prev_output_length)[0]
            # Out-of-place accumulation yields the same values as ``+=`` without mutating a tensor
            # produced by an earlier module.
            residual = projected if residual is None else residual + projected

            # Batch norm is applied to the running sum after every addition.
            residual = batch_norm(residual)

        return residual

    def _create_jasper_dense_residual_connections(self, in_channels=None, out_channels=None) -> nn.ModuleList:
        r"""
        Build the projection modules of the dense residual connections.

        Entry ``i`` of the result holds one ``[MaskConv1d, BatchNorm1d]`` pair for each of the outputs of
        ``self.layers[0 .. i]``; ``forward`` passes the combined result to ``self.layers[i + 1]``.

        Args:
            in_channels (sequence, optional): parsed per-layer input channel counts. Parsed from
                ``configs.in_channels`` when omitted.
            out_channels (sequence, optional): parsed per-layer output channel counts. Parsed from
                ``configs.out_channels`` when omitted.

        Returns:
            nn.ModuleList: ``configs.num_blocks`` entries, entry ``i`` containing ``i + 1`` module pairs.
        """
        # The config fields are strings (see ``__init__``); indexing them directly would return
        # single characters instead of channel counts.
        if in_channels is None:
            in_channels = eval(self.configs.in_channels)
        if out_channels is None:
            out_channels = eval(self.configs.out_channels)

        residual_connections = nn.ModuleList()

        for i in range(self.configs.num_blocks):
            # All paths of one entry are summed and then passed to layer ``i + 1``, so they must
            # share that layer's output channel count (assumes the consuming block adds the residual
            # to its own output - TODO confirm against JasperBlock).
            target_channels = out_channels[i + 1]
            residual_modules = nn.ModuleList()
            for j in range(1, i + 2):
                # ``in_channels[j]`` is the input width of layer ``j``, i.e. the width of the output
                # of layer ``j - 1`` that this pair projects.
                residual_modules.append(nn.ModuleList([
                    MaskConv1d(in_channels[j], target_channels, kernel_size=1),
                    nn.BatchNorm1d(target_channels, eps=1e-03, momentum=0.1),
                ]))
            residual_connections.append(residual_modules)

        return residual_connections