Source code for openspeech.datasets.ksponspeech.lit_data_module

# MIT License
#
# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import os
import logging
import pytorch_lightning as pl
from typing import Optional
from omegaconf import DictConfig
from openspeech.data.audio.dataset import SpeechToTextDataset

from openspeech.datasets import register_data_module
from openspeech.data.sampler import BucketingSampler
from openspeech.data.audio.data_loader import AudioDataLoader
from openspeech.datasets.ksponspeech.preprocess.preprocess import preprocess, preprocess_test_data
from openspeech.datasets.ksponspeech.preprocess.character import generate_character_script, generate_character_labels
from openspeech.datasets.ksponspeech.preprocess.grapheme import sentence_to_grapheme
from openspeech.datasets.ksponspeech.preprocess.subword import train_sentencepiece, sentence_to_subwords
from openspeech.tokenizers import TOKENIZER_REGISTRY
from openspeech.tokenizers.tokenizer import Tokenizer


@register_data_module('ksponspeech')
class LightningKsponSpeechDataModule(pl.LightningDataModule):
    r"""
    Lightning data module for KsponSpeech.

    The KsponSpeech corpus contains 969 hours of general open-domain dialog utterances, spoken by about 2,000
    native Korean speakers in a clean environment. All data were constructed by recording the dialogue of two
    people freely conversing on a variety of topics and manually transcribing the utterances. The transcription
    provides a dual transcription consisting of orthography and pronunciation, and disfluency tags for
    spontaneity of speech, such as filler words, repeated words, and word fragments.

    Attributes:
        KSPONSPEECH_TRAIN_NUM (int): the number of KsponSpeech train examples.
        KSPONSPEECH_VALID_NUM (int): the number of KsponSpeech validation examples.
        KSPONSPEECH_TEST_NUM (int): the number of KsponSpeech test examples.

    Args:
        configs (DictConfig): configuration set.
    """
    KSPONSPEECH_TRAIN_NUM = 620000
    KSPONSPEECH_VALID_NUM = 2545
    KSPONSPEECH_TEST_NUM = 6000

    def __init__(self, configs: DictConfig) -> None:
        super(LightningKsponSpeechDataModule, self).__init__()
        self.configs = configs
        self.dataset = dict()
        self.logger = logging.getLogger(__name__)
        # The manifest is read as UTF-8 for subword units and as cp949 otherwise.
        self.encoding = 'utf-8' if self.configs.tokenizer.unit == 'kspon_subword' else 'cp949'

    def _generate_manifest_files(self, manifest_file_path: str) -> None:
        r"""
        Generate the KsponSpeech manifest file.

        Format: AUDIO_PATH [TAB] TEXT_TRANSCRIPT [TAB] NUMBER_TRANSCRIPT
        """
        train_valid_audio_paths, train_valid_transcripts = preprocess(
            self.configs.dataset.dataset_path, self.configs.dataset.preprocess_mode
        )
        test_audio_paths, test_transcripts = preprocess_test_data(
            self.configs.dataset.test_manifest_dir, self.configs.dataset.preprocess_mode
        )

        audio_paths = train_valid_audio_paths + test_audio_paths
        transcripts = train_valid_transcripts + test_transcripts

        if self.configs.tokenizer.unit == 'kspon_character':
            generate_character_labels(transcripts, self.configs.tokenizer.vocab_path)
            generate_character_script(audio_paths, transcripts, manifest_file_path, self.configs.tokenizer.vocab_path)
        elif self.configs.tokenizer.unit == 'kspon_subword':
            train_sentencepiece(transcripts, self.configs.tokenizer.vocab_size, self.configs.tokenizer.blank_token)
            sentence_to_subwords(
                audio_paths, transcripts, manifest_file_path, sp_model_path=self.configs.tokenizer.sp_model_path
            )
        elif self.configs.tokenizer.unit == 'kspon_grapheme':
            sentence_to_grapheme(audio_paths, transcripts, manifest_file_path, self.configs.tokenizer.vocab_path)
        else:
            raise ValueError(f"Unsupported vocab: {self.configs.tokenizer.unit}")

    def _parse_manifest_file(self):
        r"""
        Parse the manifest file.

        Returns:
            audio_paths (list): list of audio paths
            transcripts (list): list of transcripts for the audio
        """
        audio_paths = list()
        transcripts = list()

        with open(self.configs.dataset.manifest_file_path, encoding=self.encoding) as f:
            for idx, line in enumerate(f.readlines()):
                audio_path, korean_transcript, transcript = line.split('\t')
                transcript = transcript.replace('\n', '')

                audio_paths.append(audio_path)
                transcripts.append(transcript)

        return audio_paths, transcripts
    def prepare_data(self):
        r"""
        Prepare the KsponSpeech manifest file. If the manifest file does not exist, it is generated.

        Returns:
            tokenizer (Tokenizer): the tokenizer in charge of preparing the inputs for a model.
        """
        if not os.path.exists(self.configs.dataset.manifest_file_path):
            self.logger.info("Manifest file does not exist!\n"
                             "Generating manifest files..")
            if not os.path.exists(self.configs.dataset.dataset_path):
                raise FileNotFoundError
            self._generate_manifest_files(self.configs.dataset.manifest_file_path)
        return TOKENIZER_REGISTRY[self.configs.tokenizer.unit](self.configs)
    def setup(self, stage: Optional[str] = None, tokenizer: Tokenizer = None):
        r"""
        Split the parsed manifest into `train`, `valid`, and `test` datasets.

        Args:
            stage (str): stage of training. `train` or `valid`
            tokenizer (Tokenizer): the tokenizer in charge of preparing the inputs for a model.

        Returns:
            None
        """
        valid_end_idx = self.KSPONSPEECH_TRAIN_NUM + self.KSPONSPEECH_VALID_NUM
        audio_paths, transcripts = self._parse_manifest_file()

        audio_paths = {
            "train": audio_paths[:self.KSPONSPEECH_TRAIN_NUM],
            "valid": audio_paths[self.KSPONSPEECH_TRAIN_NUM:valid_end_idx],
            "test": audio_paths[valid_end_idx:],
        }
        transcripts = {
            "train": transcripts[:self.KSPONSPEECH_TRAIN_NUM],
            "valid": transcripts[self.KSPONSPEECH_TRAIN_NUM:valid_end_idx],
            "test": transcripts[valid_end_idx:],
        }

        for stage in audio_paths.keys():
            if stage == 'test':
                dataset_path = self.configs.dataset.test_dataset_path
            else:
                dataset_path = self.configs.dataset.dataset_path

            # SpecAugment and silence removal are applied to the training split only.
            self.dataset[stage] = SpeechToTextDataset(
                configs=self.configs,
                dataset_path=dataset_path,
                audio_paths=audio_paths[stage],
                transcripts=transcripts[stage],
                sos_id=tokenizer.sos_id,
                eos_id=tokenizer.eos_id,
                apply_spec_augment=self.configs.audio.apply_spec_augment if stage == 'train' else False,
                del_silence=self.configs.audio.del_silence if stage == 'train' else False,
            )
    def train_dataloader(self) -> AudioDataLoader:
        r""" Return the data loader for training. """
        train_sampler = BucketingSampler(self.dataset['train'], batch_size=self.configs.trainer.batch_size)
        return AudioDataLoader(
            dataset=self.dataset['train'],
            num_workers=self.configs.trainer.num_workers,
            batch_sampler=train_sampler,
        )
    def val_dataloader(self) -> AudioDataLoader:
        r""" Return the data loader for validation. """
        valid_sampler = BucketingSampler(self.dataset['valid'], batch_size=self.configs.trainer.batch_size)
        return AudioDataLoader(
            dataset=self.dataset['valid'],
            num_workers=self.configs.trainer.num_workers,
            batch_sampler=valid_sampler,
        )
    def test_dataloader(self) -> AudioDataLoader:
        r""" Return the data loader for testing. """
        test_sampler = BucketingSampler(self.dataset['test'], batch_size=self.configs.trainer.batch_size)
        return AudioDataLoader(
            dataset=self.dataset['test'],
            num_workers=self.configs.trainer.num_workers,
            batch_sampler=test_sampler,
        )
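

# ---------------------------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module). It shows how this data module is
# typically driven, assuming an OmegaConf configuration that provides the `dataset`,
# `tokenizer`, `audio`, and `trainer` groups referenced above; a real Openspeech config
# contains additional fields (e.g. feature-extraction settings), and every path and value
# below is a placeholder, not a library default.
# ---------------------------------------------------------------------------------------------
if __name__ == "__main__":
    from omegaconf import OmegaConf

    example_configs = OmegaConf.create({
        "dataset": {
            "dataset_path": "/path/to/KsponSpeech",            # placeholder: train/valid audio root
            "test_dataset_path": "/path/to/KsponSpeech_eval",  # placeholder: evaluation audio root
            "test_manifest_dir": "/path/to/KsponSpeech_scripts",
            "manifest_file_path": "/path/to/transcripts.txt",
            "preprocess_mode": "phonetic",
        },
        "tokenizer": {
            "unit": "kspon_character",
            "vocab_path": "/path/to/labels.csv",               # placeholder vocabulary file
        },
        "audio": {"apply_spec_augment": True, "del_silence": False},
        "trainer": {"batch_size": 32, "num_workers": 4},
    })

    data_module = LightningKsponSpeechDataModule(example_configs)
    tokenizer = data_module.prepare_data()    # generates the manifest on first run, returns a tokenizer
    data_module.setup(tokenizer=tokenizer)    # splits the manifest into train/valid/test datasets

    train_loader = data_module.train_dataloader()
    for batch in train_loader:
        ...  # feed batches to a model
        break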