Commit b7b1bda3 authored by Hui Zhang

test refactor collator

Parent f628e218
......@@ -31,7 +31,6 @@ from yacs.config import CfgNode
from deepspeech.io.collator import SpeechCollator
from deepspeech.io.collator import TripletSpeechCollator
from deepspeech.io.dataset import ManifestDataset
from deepspeech.io.dataset import TripletManifestDataset
from deepspeech.io.sampler import SortagradBatchSampler
from deepspeech.io.sampler import SortagradDistributedBatchSampler
from deepspeech.models.u2_st import U2STModel
......@@ -249,12 +248,11 @@ class U2STTrainer(Trainer):
config.collator.keep_transcription_text = False
# train/valid dataset, return token ids
Dataset = TripletManifestDataset if config.model.model_conf.asr_weight > 0. else ManifestDataset
config.data.manifest = config.data.train_manifest
train_dataset = Dataset.from_config(config)
train_dataset = ManifestDataset.from_config(config)
config.data.manifest = config.data.dev_manifest
dev_dataset = Dataset.from_config(config)
dev_dataset = ManifestDataset.from_config(config)
if config.model.model_conf.asr_weight > 0.:
Collator = TripletSpeechCollator
......
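With `TripletManifestDataset` removed, train and dev always build a plain `ManifestDataset`; the ST-vs-joint-ASR distinction now lives entirely in the collator choice. A minimal sketch of the resulting selection logic, assuming the usual yacs `CfgNode` config and the `from_config` constructors already used in this trainer (simplified, not the verbatim method body):

```python
from deepspeech.io.collator import SpeechCollator, TripletSpeechCollator
from deepspeech.io.dataset import ManifestDataset

def build_train_parts(config):
    # The dataset no longer varies with the task: it always yields raw
    # manifest dicts, and the collator picks the fields it needs.
    config.data.manifest = config.data.train_manifest
    train_dataset = ManifestDataset.from_config(config)

    # Joint ST+ASR training (asr_weight > 0) needs both 'text' (translation)
    # and 'text1' (transcript), so it takes the triplet collator.
    if config.model.model_conf.asr_weight > 0.:
        collator = TripletSpeechCollator.from_config(config)
    else:
        collator = SpeechCollator.from_config(config)
    return train_dataset, collator
```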
......@@ -24,15 +24,15 @@ class AudioFeaturizer():
Currently, it supports feature types of linear spectrogram and mfcc.
:param specgram_type: Specgram feature type. Options: 'linear'.
:type specgram_type: str
:param spectrum_type: Specgram feature type. Options: 'linear'.
:type spectrum_type: str
:param stride_ms: Striding size (in milliseconds) for generating frames.
:type stride_ms: float
:param window_ms: Window size (in milliseconds) for generating frames.
:type window_ms: float
:param max_freq: When specgram_type is 'linear', only FFT bins
:param max_freq: When spectrum_type is 'linear', only FFT bins
corresponding to frequencies between [0, max_freq] are
returned; when specgram_type is 'mfcc', max_feq is the
returned; when spectrum_type is 'mfcc', max_freq is the
highest band edge of mel filters.
:type max_freq: None|float
:param target_sample_rate: Audio are resampled (if upsampling or
......@@ -47,7 +47,7 @@ class AudioFeaturizer():
"""
def __init__(self,
specgram_type: str='linear',
spectrum_type: str='linear',
feat_dim: int=None,
delta_delta: bool=False,
stride_ms=10.0,
......@@ -58,7 +58,7 @@ class AudioFeaturizer():
use_dB_normalization=True,
target_dB=-20,
dither=1.0):
self._specgram_type = specgram_type
self._spectrum_type = spectrum_type
# mfcc and fbank using `feat_dim`
self._feat_dim = feat_dim
# mfcc and fbank using `delta-delta`
......@@ -113,27 +113,27 @@ class AudioFeaturizer():
def feature_size(self):
"""audio feature size"""
feat_dim = 0
if self._specgram_type == 'linear':
if self._spectrum_type == 'linear':
fft_point = self._window_ms if self._fft_point is None else self._fft_point
feat_dim = int(fft_point * (self._target_sample_rate / 1000) / 2 +
1)
elif self._specgram_type == 'mfcc':
elif self._spectrum_type == 'mfcc':
# mfcc, delta, delta-delta
feat_dim = int(self._feat_dim *
3) if self._delta_delta else int(self._feat_dim)
elif self._specgram_type == 'fbank':
elif self._spectrum_type == 'fbank':
# fbank, delta, delta-delta
feat_dim = int(self._feat_dim *
3) if self._delta_delta else int(self._feat_dim)
else:
raise ValueError("Unknown specgram_type %s. "
"Supported values: linear." % self._specgram_type)
raise ValueError("Unknown spectrum_type %s. "
"Supported values: linear." % self._spectrum_type)
return feat_dim
def _compute_specgram(self, audio_segment):
"""Extract various audio features."""
sample_rate = audio_segment.sample_rate
if self._specgram_type == 'linear':
if self._spectrum_type == 'linear':
samples = audio_segment.samples
return self._compute_linear_specgram(
samples,
......@@ -141,7 +141,7 @@ class AudioFeaturizer():
stride_ms=self._stride_ms,
window_ms=self._window_ms,
max_freq=self._max_freq)
elif self._specgram_type == 'mfcc':
elif self._spectrum_type == 'mfcc':
samples = audio_segment.to('int16')
return self._compute_mfcc(
samples,
......@@ -152,7 +152,7 @@ class AudioFeaturizer():
max_freq=self._max_freq,
dither=self._dither,
delta_delta=self._delta_delta)
elif self._specgram_type == 'fbank':
elif self._spectrum_type == 'fbank':
samples = audio_segment.to('int16')
return self._compute_fbank(
samples,
......@@ -164,8 +164,8 @@ class AudioFeaturizer():
dither=self._dither,
delta_delta=self._delta_delta)
else:
raise ValueError("Unknown specgram_type %s. "
"Supported values: linear." % self._specgram_type)
raise ValueError("Unknown spectrum_type %s. "
"Supported values: linear." % self._spectrum_type)
def _specgram_real(self, samples, window_size, stride_size, sample_rate):
"""Compute the spectrogram for samples from a real signal."""
......
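The `specgram_type` → `spectrum_type` rename is mechanical but constructor-visible, so every call site must follow. A hedged usage sketch based on the signature above (the import path is assumed from the repo layout):

```python
from deepspeech.frontend.featurizer.audio_featurizer import AudioFeaturizer

featurizer = AudioFeaturizer(
    spectrum_type='fbank',  # was specgram_type; options: linear, mfcc, fbank
    feat_dim=80,            # required for 'mfcc' and 'fbank'
    delta_delta=False,
    stride_ms=10.0,
    window_ms=20.0,
)
# Per feature_size() above, 'fbank' without delta-delta yields feat_dim dims.
assert featurizer.feature_size() == 80
```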
......@@ -17,44 +17,14 @@ from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer
class SpeechFeaturizer():
"""Speech featurizer, for extracting features from both audio and transcript
contents of SpeechSegment.
Currently, for audio parts, it supports feature types of linear
spectrogram and mfcc; for transcript parts, it only supports char-level
tokenizing and conversion into a list of token indices. Note that the
token indexing order follows the given vocabulary file.
:param vocab_filepath: Filepath to load vocabulary for token indices
conversion.
:type specgram_type: str
:param specgram_type: Specgram feature type. Options: 'linear', 'mfcc'.
:type specgram_type: str
:param stride_ms: Striding size (in milliseconds) for generating frames.
:type stride_ms: float
:param window_ms: Window size (in milliseconds) for generating frames.
:type window_ms: float
:param max_freq: When specgram_type is 'linear', only FFT bins
corresponding to frequencies between [0, max_freq] are
returned; when specgram_type is 'mfcc', max_freq is the
highest band edge of mel filters.
:types max_freq: None|float
:param target_sample_rate: Speech are resampled (if upsampling or
downsampling is allowed) to this before
extracting spectrogram features.
:type target_sample_rate: float
:param use_dB_normalization: Whether to normalize the audio to a certain
decibels before extracting the features.
:type use_dB_normalization: bool
:param target_dB: Target audio decibels for normalization.
:type target_dB: float
"""Speech and Text feature extraction.
"""
def __init__(self,
unit_type,
vocab_filepath,
spm_model_prefix=None,
specgram_type='linear',
spectrum_type='linear',
feat_dim=None,
delta_delta=False,
stride_ms=10.0,
......@@ -70,7 +40,7 @@ class SpeechFeaturizer():
self.window_ms = window_ms
self.audio_feature = AudioFeaturizer(
specgram_type=specgram_type,
spectrum_type=spectrum_type,
feat_dim=feat_dim,
delta_delta=delta_delta,
stride_ms=stride_ms,
......
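`SpeechFeaturizer` simply forwards the renamed keyword to its inner `AudioFeaturizer`. An illustrative construction (the vocab path is hypothetical):

```python
from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer

featurizer = SpeechFeaturizer(
    unit_type='char',
    vocab_filepath='data/vocab.txt',  # hypothetical path
    spectrum_type='linear',           # forwarded to AudioFeaturizer
    stride_ms=10.0,
    window_ms=20.0,
)
```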
......@@ -15,6 +15,7 @@
import json
import math
import tarfile
from collections import namedtuple
from typing import List
from typing import Optional
from typing import Text
......
......@@ -32,6 +32,19 @@ __all__ = ["SpeechCollator", "TripletSpeechCollator"]
logger = Log(__name__).getlog()
def tokenids(text, keep_transcription_text):
# for training text is token ids
tokens = text # token ids
if keep_transcription_text:
# text is string, convert to unicode ord
assert isinstance(text, str), (type(text), text)
tokens = [ord(t) for t in text]
tokens = np.array(tokens, dtype=np.int64)
return tokens
class SpeechCollatorBase():
def __init__(
self,
......@@ -150,7 +163,6 @@ class SpeechCollatorBase():
# extract speech feature
spectrum, transcript_part = self._speech_featurizer.featurize(
speech_segment, self.keep_transcription_text)
# CMVN spectrum
if self._normalizer:
spectrum = self._normalizer.apply(spectrum)
......@@ -163,38 +175,35 @@ class SpeechCollatorBase():
"""batch examples
Args:
batch ([List]): batch is (audio, text)
batch (List[Dict]): batch is [dict(audio, text, ...)]
audio (np.ndarray) shape (T, D)
text (List[int] or str): shape (U,)
Returns:
tuple(audio, text, audio_lens, text_lens): batched data.
audio : (B, Tmax, D)
audio_lens: (B)
text : (B, Umax)
text_lens: (B)
tuple(utts, xs_pad, ilens, ys_pad, olens): batched data.
utts: (B,)
xs_pad : (B, Tmax, D)
ilens: (B,)
ys_pad : (B, Umax)
olens: (B,)
"""
audios = []
audio_lens = []
texts = []
text_lens = []
utts = []
for utt, audio, text in batch:
for idx, item in enumerate(batch):
utts.append(item['utt'])
audio = item['feat']
text = item['text']
audio, text = self.process_utterance(audio, text)
#utt
utts.append(utt)
# audio
audios.append(audio) # [T, D]
audio_lens.append(audio.shape[0])
# text
# for training, text is token ids, else text is string, convert to unicode ord
tokens = []
if self.keep_transcription_text:
assert isinstance(text, str), (type(text), text)
tokens = [ord(t) for t in text]
else:
tokens = text # token ids
tokens = np.array(tokens, dtype=np.int64)
tokens = tokenids(text, self.keep_transcription_text)
texts.append(tokens)
text_lens.append(tokens.shape[0])
......@@ -308,17 +317,19 @@ class TripletSpeechCollator(SpeechCollator):
"""batch examples
Args:
batch ([List]): batch is (audio, text)
batch (List[Dict]): batch is [dict(audio, text, ...)]
audio (np.ndarray) shape (T, D)
text (List[int] or str): shape (U,)
Returns:
tuple(audio, text, audio_lens, text_lens): batched data.
audio : (B, Tmax, D)
audio_lens: (B)
text : (B, Umax)
text_lens: (B)
tuple(utts, xs_pad, ilens, ys_pad, olens): batched data.
utts: (B,)
xs_pad : (B, Tmax, D)
ilens: (B,)
ys_pad : [(B, Umax), (B, Umax)]
olens: [(B,), (B,)]
"""
utts = []
audios = []
audio_lens = []
translation_text = []
......@@ -326,41 +337,38 @@ class TripletSpeechCollator(SpeechCollator):
transcription_text = []
transcription_text_lens = []
utts = []
for utt, audio, translation, transcription in batch:
for idx, item in enumerate(batch):
utts.append(item['utt'])
audio = item['feat']
translation = item['text']
transcription = item['text1']
audio, translation, transcription = self.process_utterance(
audio, translation, transcription)
#utt
utts.append(utt)
# audio
audios.append(audio) # [T, D]
audio_lens.append(audio.shape[0])
# text
# for training, text is token ids
# else text is string, convert to unicode ord
tokens = [[], []]
for idx, text in enumerate([translation, transcription]):
if self.keep_transcription_text:
assert isinstance(text, str), (type(text), text)
tokens[idx] = [ord(t) for t in text]
else:
tokens[idx] = text # token ids
tokens[idx] = np.array(tokens[idx], dtype=np.int64)
tokens[idx] = tokenids(text, self.keep_transcription_text)
translation_text.append(tokens[0])
translation_text_lens.append(tokens[0].shape[0])
transcription_text.append(tokens[1])
transcription_text_lens.append(tokens[1].shape[0])
padded_audios = pad_sequence(
audios, padding_value=0.0).astype(np.float32) #[B, T, D]
audio_lens = np.array(audio_lens).astype(np.int64)
padded_translation = pad_sequence(
translation_text, padding_value=IGNORE_ID).astype(np.int64)
xs_pad = pad_list(audios, 0.0).astype(np.float32) #[B, T, D]
ilens = np.array(audio_lens).astype(np.int64)
padded_translation = pad_list(translation_text,
IGNORE_ID).astype(np.int64)
translation_lens = np.array(translation_text_lens).astype(np.int64)
padded_transcription = pad_sequence(
transcription_text, padding_value=IGNORE_ID).astype(np.int64)
padded_transcription = pad_list(transcription_text,
IGNORE_ID).astype(np.int64)
transcription_lens = np.array(transcription_text_lens).astype(np.int64)
return utts, padded_audios, audio_lens, (
padded_translation, padded_transcription), (translation_lens,
transcription_lens)
ys_pad = (padded_translation, padded_transcription)
olens = (translation_lens, transcription_lens)
return utts, xs_pad, ilens, ys_pad, olens
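Two contract changes meet in these collators: batch items are manifest dicts rather than tuples, and text handling is centralized in the new `tokenids` helper. A self-contained sketch of the helper's two modes, mirroring the code added above (example values are made up):

```python
import numpy as np

def tokenids(text, keep_transcription_text):
    # Training: text is already a list of token ids.
    # With keep_transcription_text=True (e.g. during evaluation), text is a
    # raw string, encoded as unicode code points to fit an int64 array.
    tokens = text
    if keep_transcription_text:
        assert isinstance(text, str), (type(text), text)
        tokens = [ord(t) for t in text]
    return np.array(tokens, dtype=np.int64)

print(tokenids([4, 7, 9], keep_transcription_text=False))  # [4 7 9]
print(tokenids("hi", keep_transcription_text=True))        # [104 105]
```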
......@@ -19,7 +19,7 @@ from yacs.config import CfgNode
from deepspeech.frontend.utility import read_manifest
from deepspeech.utils.log import Log
__all__ = ["ManifestDataset", "TripletManifestDataset", "TransformDataset"]
__all__ = ["ManifestDataset", "TransformDataset"]
logger = Log(__name__).getlog()
......@@ -107,21 +107,7 @@ class ManifestDataset(Dataset):
return len(self._manifest)
def __getitem__(self, idx):
instance = self._manifest[idx]
return instance["utt"], instance["feat"], instance["text"]
class TripletManifestDataset(ManifestDataset):
"""
For Joint Training of Speech Translation and ASR.
text: translation,
text1: transcript.
"""
def __getitem__(self, idx):
instance = self._manifest[idx]
return instance["utt"], instance["feat"], instance["text"], instance[
"text1"]
return self._manifest[idx]
class TransformDataset(Dataset):
......@@ -273,5 +259,4 @@ class AudioDataset(Dataset):
return len(self.minibatch)
def __getitem__(self, idx):
instance = self.minibatch[idx]
return instance["utt"], instance["feat"], instance["text"]
return self.minibatch[idx]
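Returning the whole manifest entry is what lets one `ManifestDataset` replace the triplet subclass: any extra field (here `text1`) reaches the collator without a dedicated Dataset class. A stand-in example of the dict `__getitem__` now yields (illustrative values, not real manifest contents):

```python
item = {
    "utt": "utt-001",
    "feat": "feats/utt-001.wav",   # hypothetical path
    "text": [12, 34, 56],          # translation token ids
    "text1": [7, 8, 9],            # transcript ids; triplet manifests only
}
utt, feat, text = item["utt"], item["feat"], item["text"]
transcript = item.get("text1")     # None for plain ASR manifests
```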
......@@ -322,7 +322,7 @@ class LoadInputsAndTargets():
"Not supported: loader_type={}".format(filetype))
def file_type(self, filepath):
suffix = filepath.split(":")[0].split('.')[1]
suffix = filepath.split(":")[0].split('.')[-1]
if suffix == 'ark':
return 'mat'
elif suffix == 'scp':
......
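The one-character `file_type` fix matters whenever the basename itself contains dots, as Kaldi-style shard names routinely do. A quick demonstration on an illustrative `path:offset` string:

```python
def suffix_old(filepath):
    return filepath.split(":")[0].split('.')[1]   # first dot-token: wrong

def suffix_new(filepath):
    return filepath.split(":")[0].split('.')[-1]  # actual extension

fp = "data/raw.1.ark:12345"       # illustrative Kaldi 'path:offset'
print(suffix_old(fp))  # '1'   -> matches neither 'ark' nor 'scp'
print(suffix_new(fp))  # 'ark' -> file_type() maps it to 'mat'
```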
......@@ -21,7 +21,7 @@ To perform z-score normalization (zero-mean, unit stddev) upon audio features, w
```bash
python3 utils/compute_mean_std.py \
--num_samples 2000 \
--specgram_type linear \
--spectrum_type linear \
--manifest_path examples/librispeech/data/manifest.train \
--output_path examples/librispeech/data/mean_std.npz
```
......
......@@ -44,7 +44,7 @@ For CMVN, a subset of the training set (or the full set) is chosen and used to compute
cd examples/aishell/s0
python3 ../../../utils/compute_mean_std.py \
--manifest_path="data/manifest.train.raw" \
--specgram_type="linear" \
--spectrum_type="linear" \
--delta_delta=false \
--stride_ms=10.0 \
--window_ms=20.0 \
......
......@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
random_seed: 0
spm_model_prefix:
specgram_type: linear
spectrum_type: linear
feat_dim:
delta_delta: False
stride_ms: 10.0
......
......@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
random_seed: 0
spm_model_prefix:
specgram_type: linear
spectrum_type: linear
feat_dim:
delta_delta: False
stride_ms: 10.0
......
......@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
random_seed: 0
spm_model_prefix:
specgram_type: linear
spectrum_type: linear
feat_dim:
delta_delta: False
stride_ms: 10.0
......
......@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
random_seed: 0
spm_model_prefix:
specgram_type: linear
spectrum_type: linear
feat_dim:
delta_delta: False
stride_ms: 10.0
......
......@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
random_seed: 0
spm_model_prefix:
specgram_type: linear #linear, mfcc, fbank
spectrum_type: linear #linear, mfcc, fbank
feat_dim:
delta_delta: False
stride_ms: 10.0
......
......@@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
num_workers=$(nproc)
python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
--manifest_path="data/manifest.train.raw" \
--specgram_type="linear" \
--spectrum_type="linear" \
--delta_delta=false \
--stride_ms=10.0 \
--window_ms=20.0 \
......
......@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
batch_size: 32
raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0
......
......@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
batch_size: 64
raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0
......
......@@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
num_workers=$(nproc)
python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
--manifest_path="data/manifest.train.raw" \
--specgram_type="fbank" \
--spectrum_type="fbank" \
--feat_dim=80 \
--delta_delta=false \
--stride_ms=10.0 \
......
......@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
batch_size: 32
raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0
......
......@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
batch_size: 32
raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0
......
......@@ -34,7 +34,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
num_workers=$(nproc)
python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
--manifest_path="data/manifest.train.raw" \
--specgram_type="fbank" \
--spectrum_type="fbank" \
--feat_dim=80 \
--delta_delta=false \
--stride_ms=10.0 \
......
......@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
random_seed: 0
spm_model_prefix:
specgram_type: linear
spectrum_type: linear
target_sample_rate: 16000
max_freq: None
n_fft: None
......
......@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
random_seed: 0
spm_model_prefix:
specgram_type: linear
spectrum_type: linear
target_sample_rate: 16000
max_freq: None
n_fft: None
......
......@@ -62,7 +62,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
--manifest_path="data/manifest.train.raw" \
--num_samples=2000 \
--specgram_type="linear" \
--spectrum_type="linear" \
--delta_delta=false \
--sample_rate=16000 \
--stride_ms=10.0 \
......
......@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
batch_size: 16
raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0
......
......@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
batch_size: 64
raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0
......
......@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
batch_size: 32
raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0
......
......@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
batch_size: 32
raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0
......
......@@ -68,7 +68,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
--manifest_path="data/manifest.train.raw" \
--num_samples=-1 \
--specgram_type="fbank" \
--spectrum_type="fbank" \
--feat_dim=80 \
--delta_delta=false \
--sample_rate=16000 \
......
......@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
batch_size: 16
raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0
......
......@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
batch_size: 64
raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0
......
......@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
batch_size: 16
raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0
......
......@@ -68,7 +68,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
--manifest_path="data/manifest.train.raw" \
--num_samples=-1 \
--specgram_type="fbank" \
--spectrum_type="fbank" \
--feat_dim=80 \
--delta_delta=false \
--sample_rate=16000 \
......
......@@ -18,7 +18,7 @@ collator:
# augmentation_config: conf/augmentation.json
batch_size: 10
raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0
......
......@@ -18,7 +18,7 @@ collator:
# augmentation_config: conf/augmentation.json
batch_size: 10
raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0
......
......@@ -68,7 +68,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
--manifest_path="data/manifest.train.raw" \
--num_samples=-1 \
--specgram_type="fbank" \
--spectrum_type="fbank" \
--feat_dim=80 \
--delta_delta=false \
--sample_rate=16000 \
......
......@@ -17,7 +17,7 @@ collator:
augmentation_config: ""
batch_size: 64
raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0
......
......@@ -45,7 +45,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
--manifest_path="data/manifest.train.raw" \
--num_samples=-1 \
--specgram_type="fbank" \
--spectrum_type="fbank" \
--feat_dim=80 \
--delta_delta=false \
--sample_rate=16000 \
......
......@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
random_seed: 0
spm_model_prefix:
specgram_type: linear
spectrum_type: linear
feat_dim:
delta_delta: False
stride_ms: 10.0
......
......@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
random_seed: 0
spm_model_prefix:
specgram_type: linear
spectrum_type: linear
feat_dim:
delta_delta: False
stride_ms: 10.0
......
......@@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
--manifest_path="data/manifest.tiny.raw" \
--num_samples=64 \
--specgram_type="linear" \
--spectrum_type="linear" \
--delta_delta=false \
--sample_rate=16000 \
--stride_ms=10.0 \
......
......@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
batch_size: 4
raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0
......
......@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
batch_size: 4
raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0
......
......@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
batch_size: 4
raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0
......
......@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
batch_size: 4
raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0
......
......@@ -51,7 +51,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
--manifest_path="data/manifest.tiny.raw" \
--num_samples=64 \
--specgram_type="fbank" \
--spectrum_type="fbank" \
--feat_dim=80 \
--delta_delta=false \
--sample_rate=16000 \
......
......@@ -27,7 +27,7 @@ add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('num_samples', int, 2000, "# of samples to use for statistics.")
add_arg('specgram_type', str,
add_arg('spectrum_type', str,
'linear',
"Audio feature type. Options: linear, mfcc, fbank.",
choices=['linear', 'mfcc', 'fbank'])
......@@ -58,7 +58,7 @@ def main():
augmentation_pipeline = AugmentationPipeline('{}')
audio_featurizer = AudioFeaturizer(
specgram_type=args.specgram_type,
spectrum_type=args.spectrum_type,
feat_dim=args.feat_dim,
delta_delta=args.delta_delta,
stride_ms=args.stride_ms,
......