add part ecapa-tdnn note, test=doc

9b5f7f71 · xiongxinlei · 83310b63 · 9b5f7f71 · 9b5f7f71 · 9b5f7f71
3 changed file
--- a/demos/speaker_verification/README.md
+++ b/demos/speaker_verification/README.md
@@ -117,6 +117,8 @@ wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav
      audio_file='./123456789.wav',
      device=paddle.get_device())
  print('Test embedding Result: \n{}'.format(test_emb))
+
+  # score range [0, 1]
  score = vector_executor.get_embeddings_score(audio_emb, test_emb)
  print(f"Eembeddings Score: {score}")
  ```

--- a/demos/speaker_verification/README_cn.md
+++ b/demos/speaker_verification/README_cn.md
@@ -115,6 +115,8 @@ wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav
      audio_file='./123456789.wav',
      device=paddle.get_device())
  print('Test embedding Result: \n{}'.format(test_emb))
+
+  # score range [0, 1]
  score = vector_executor.get_embeddings_score(audio_emb, test_emb)
  print(f"Eembeddings Score: {score}")
  ```

--- a/paddlespeech/vector/models/ecapa_tdnn.py
+++ b/paddlespeech/vector/models/ecapa_tdnn.py
@@ -79,6 +79,20 @@ class Conv1d(nn.Layer):
            bias_attr=bias, )

    def forward(self, x):
+        """Do conv1d forward
+
+        Args:
+            x (paddle.Tensor): [N, C, L] input data, 
+                                N is the batch,
+                                C is the data dimension, 
+                                L is the time
+
+        Raises:
+            ValueError: only the "same" padding type is supported
+
+        Returns:
+            paddle.Tensor: the value of conv1d
+        """
        if self.padding == "same":
            x = self._manage_padding(x, self.kernel_size, self.dilation,
                                     self.stride)
@@ -88,6 +102,20 @@ class Conv1d(nn.Layer):
        return self.conv(x)

    def _manage_padding(self, x, kernel_size: int, dilation: int, stride: int):
+        """Pad the input data
+
+        Args:
+            x (paddle.Tensor): [N, C, L] input data
+                                N is the batch,
+                                C is the data dimension, 
+                                L is the time
+            kernel_size (int): 1-d convolution kernel size
+            dilation (int): 1-d convolution dilation
+            stride (int): 1-d convolution stride
+
+        Returns:
+            paddle.Tensor: the padded input data
+        """
        L_in = x.shape[-1]  # Detecting input shape
        padding = self._get_padding_elem(L_in, stride, kernel_size,
                                         dilation)  # Time padding
@@ -101,6 +129,17 @@ class Conv1d(nn.Layer):
                          stride: int,
                          kernel_size: int,
                          dilation: int):
+        """Calculate the padding value in same mode
+
+        Args:
+            L_in (int): the length of the input data along the time axis
+            stride (int): 1-d convolution stride
+            kernel_size (int): 1-d convolution kernel size
+            dilation (int): 1-d convolution dilation
+
+        Returns:
+            int: return the padding value in same mode
+        """
        if stride > 1:
            n_steps = math.ceil(((L_in - kernel_size * dilation) / stride) + 1)
            L_out = stride * (n_steps - 1) + kernel_size * dilation
@@ -245,6 +284,13 @@ class SEBlock(nn.Layer):

 class AttentiveStatisticsPooling(nn.Layer):
    def __init__(self, channels, attention_channels=128, global_context=True):
+        """Compute the speaker verification statistics
+           For details, see section 3.1 of https://arxiv.org/pdf/2005.07143.pdf
+        Args:
+            channels (int): input data channel or data dimension
+            attention_channels (int, optional): attention dimension. Defaults to 128.
+            global_context (bool, optional): Whether to use the global context information. Defaults to True.
+        """
        super().__init__()

        self.eps = 1e-12