Commit a86f344a authored by: S ShusenTang

fix bug(#64), update mask, fix typo

Parent c5d0f74a
......@@ -16,7 +16,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"1.0.0 cpu\n"
"1.2.0 cpu\n"
]
}
],
......@@ -52,9 +52,7 @@
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"# 将一个序列中所有的词记录在all_tokens中以便之后构造词典,然后在该序列后面添加PAD直到序列\n",
......@@ -75,9 +73,7 @@
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"def read_data(max_seq_len):\n",
......@@ -130,9 +126,7 @@
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"class Encoder(nn.Module):\n",
......@@ -183,9 +177,7 @@
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"def attention_model(input_size, attention_size):\n",
......@@ -198,9 +190,7 @@
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"def attention_forward(model, enc_states, dec_state):\n",
......@@ -250,9 +240,7 @@
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"class Decoder(nn.Module):\n",
......@@ -261,8 +249,9 @@
" super(Decoder, self).__init__()\n",
" self.embedding = nn.Embedding(vocab_size, embed_size)\n",
" self.attention = attention_model(2*num_hiddens, attention_size)\n",
" # GRU的输入包含attention输出的c和实际输入, 所以尺寸是 2*embed_size\n",
" self.rnn = nn.GRU(2*embed_size, num_hiddens, num_layers, dropout=drop_prob)\n",
" # GRU的输入包含attention输出的c和实际输入, 所以尺寸是 num_hiddens+embed_size\n",
" self.rnn = nn.GRU(num_hiddens + embed_size, num_hiddens, \n",
" num_layers, dropout=drop_prob)\n",
" self.out = nn.Linear(num_hiddens, vocab_size)\n",
"\n",
" def forward(self, cur_input, state, enc_states):\n",
......@@ -272,8 +261,8 @@
" \"\"\"\n",
" # 使用注意力机制计算背景向量\n",
" c = attention_forward(self.attention, enc_states, state[-1])\n",
" # 将嵌入后的输入和背景向量在特征维连结\n",
" input_and_c = torch.cat((self.embedding(cur_input), c), dim=1) # (批量大小, 2*embed_size)\n",
" # 将嵌入后的输入和背景向量在特征维连结, (批量大小, num_hiddens+embed_size)\n",
" input_and_c = torch.cat((self.embedding(cur_input), c), dim=1) \n",
" # 为输入和背景向量的连结增加时间步维,时间步个数为1\n",
" output, state = self.rnn(input_and_c.unsqueeze(0), state)\n",
" # 移除时间步维,输出形状为(批量大小, 输出词典大小)\n",
......@@ -295,9 +284,7 @@
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"def batch_loss(encoder, decoder, X, Y, loss):\n",
......@@ -308,7 +295,7 @@
" dec_state = decoder.begin_state(enc_state)\n",
" # 解码器在最初时间步的输入是BOS\n",
" dec_input = torch.tensor([out_vocab.stoi[BOS]] * batch_size)\n",
" # 我们将使用掩码变量mask来忽略掉标签为填充项PAD的损失\n",
" # 我们将使用掩码变量mask来忽略掉标签为填充项PAD的损失, 初始全1\n",
" mask, num_not_pad_tokens = torch.ones(batch_size,), 0\n",
" l = torch.tensor([0.0])\n",
" for y in Y.permute(1,0): # Y shape: (batch, seq_len)\n",
......@@ -316,17 +303,15 @@
" l = l + (mask * loss(dec_output, y)).sum()\n",
" dec_input = y # 使用强制教学\n",
" num_not_pad_tokens += mask.sum().item()\n",
" # 将PAD对应位置的掩码设成0, 原文这里是 y != out_vocab.stoi[EOS], 感觉有误\n",
" mask = mask * (y != out_vocab.stoi[PAD]).float()\n",
" # EOS后面全是PAD. 下面一行保证一旦遇到EOS接下来的循环中mask就一直是0\n",
" mask = mask * (y != out_vocab.stoi[EOS]).float()\n",
" return l / num_not_pad_tokens"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"def train(encoder, decoder, dataset, lr, batch_size, num_epochs):\n",
......@@ -358,11 +343,11 @@
"name": "stdout",
"output_type": "stream",
"text": [
"epoch 10, loss 0.441\n",
"epoch 20, loss 0.183\n",
"epoch 30, loss 0.100\n",
"epoch 40, loss 0.046\n",
"epoch 50, loss 0.025\n"
"epoch 10, loss 0.475\n",
"epoch 20, loss 0.245\n",
"epoch 30, loss 0.157\n",
"epoch 40, loss 0.052\n",
"epoch 50, loss 0.039\n"
]
}
],
......@@ -386,9 +371,7 @@
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"def translate(encoder, decoder, input_seq, max_seq_len):\n",
......@@ -443,9 +426,7 @@
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"def bleu(pred_tokens, label_tokens, k):\n",
......@@ -466,9 +447,7 @@
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"def score(input_seq, label_seq, k):\n",
......@@ -504,29 +483,27 @@
"name": "stdout",
"output_type": "stream",
"text": [
"bleu 0.658, predict: they are russian .\n"
"bleu 0.658, predict: they are exhausted .\n"
]
}
],
"source": [
"score('ils sont canadiens .', 'they are canadian .', k=2)"
"score('ils sont canadienne .', 'they are canadian .', k=2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:anaconda3]",
"display_name": "Python [conda env:py36]",
"language": "python",
"name": "conda-env-anaconda3-py"
"name": "conda-env-py36-py"
},
"language_info": {
"codemirror_mode": {
......@@ -538,7 +515,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
"version": "3.6.2"
}
},
"nbformat": 4,
......
......@@ -165,8 +165,9 @@ class Decoder(nn.Module):
super(Decoder, self).__init__()
self.embedding = nn.Embedding(vocab_size, embed_size)
self.attention = attention_model(2*num_hiddens, attention_size)
# The GRU input contains the attention output c and the actual input, so its size is 2*embed_size
self.rnn = nn.GRU(2*embed_size, num_hiddens, num_layers, dropout=drop_prob)
# The GRU input contains the attention output c and the actual input, so its size is num_hiddens+embed_size
self.rnn = nn.GRU(num_hiddens + embed_size, num_hiddens,
num_layers, dropout=drop_prob)
self.out = nn.Linear(num_hiddens, vocab_size)
def forward(self, cur_input, state, enc_states):
......@@ -176,8 +177,8 @@ class Decoder(nn.Module):
"""
# Compute the context vector using the attention mechanism
c = attention_forward(self.attention, enc_states, state[-1])
# Concatenate the embedded input and the context vector along the feature dimension
input_and_c = torch.cat((self.embedding(cur_input), c), dim=1) # (batch size, 2*embed_size)
# Concatenate the embedded input and the context vector along the feature dimension, (batch size, num_hiddens+embed_size)
input_and_c = torch.cat((self.embedding(cur_input), c), dim=1)
# Add a time-step dimension to the concatenation of the input and the context vector; the number of time steps is 1
output, state = self.rnn(input_and_c.unsqueeze(0), state)
# Remove the time-step dimension; the output shape is (batch size, output vocabulary size)
......@@ -210,8 +211,8 @@ def batch_loss(encoder, decoder, X, Y, loss):
l = l + (mask * loss(dec_output, y)).sum()
dec_input = y # Use teacher forcing
num_not_pad_tokens += mask.sum().item()
# Set the mask to 0 at positions corresponding to PAD; the original text uses y != out_vocab.stoi[EOS] here, which seems wrong
mask = mask * (y != out_vocab.stoi[PAD]).float()
# Everything after EOS is PAD. The line below ensures that once EOS is encountered, the mask stays 0 in all subsequent iterations
mask = mask * (y != out_vocab.stoi[EOS]).float()
return l / num_not_pad_tokens
```
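The mask update above is the core of this commit's fix: since everything after EOS is PAD, zeroing the mask as soon as the label equals EOS keeps the EOS position in the loss while excluding all trailing PAD positions, including the first PAD, which a PAD-based check would still count. Below is a minimal, self-contained sketch of that behavior; the token ids `EOS_ID` and `PAD_ID` are made up for illustration and are not the notebook's `out_vocab` indices, and the loss term is omitted so only the mask and the non-PAD counter are traced.

``` python
import torch

EOS_ID, PAD_ID = 1, 0  # hypothetical ids, only for this sketch
# One label sequence "w1 w2 w3 w4 EOS PAD PAD", shape (seq_len, batch) as in batch_loss
Y = torch.tensor([[5], [6], [7], [8], [EOS_ID], [PAD_ID], [PAD_ID]])

mask, num_not_pad_tokens = torch.ones(1,), 0
for y in Y:
    num_not_pad_tokens += mask.sum().item()
    # Same update as in batch_loss: once y equals EOS the mask drops to 0 and stays 0
    mask = mask * (y != EOS_ID).float()
print(num_not_pad_tokens)  # 5.0: the four words plus EOS; the two PAD positions are ignored
```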
......@@ -299,7 +300,7 @@ translate(encoder, decoder, input_seq, max_seq_len)
Machine translation output is commonly evaluated with BLEU (Bilingual Evaluation Understudy) [1]. For any subsequence of the model's predicted sequence, BLEU checks whether that subsequence appears in the label sequence.
Specifically, let $p_n$ denote the precision of subsequences with $n$ words: the ratio of the number of $n$-word subsequences in the predicted sequence that match the label sequence to the total number of $n$-word subsequences in the predicted sequence. For example, if the label sequence is $A$, $B$, $C$, $D$, $E$, $F$ and the predicted sequence is $A$, $B$, $B$, $C$, $D$, then $p_1 = 4/5,\ p_2 = 3/4,\ p_3 = 1/3,\ p_4 = 0$. Let $len_{\text{label}}$ and $len_{\text{pred}}$ be the numbers of words in the label sequence and the predicted sequence, respectively. Then BLEU is defined as
Specifically, let $p_n$ denote the precision of subsequences with $n$ words: the ratio of the number of $n$-word subsequences in the predicted sequence that match the label sequence to the total number of $n$-word subsequences in the predicted sequence. For example, if the label sequence is $A$, $B$, $C$, $D$, $E$, $F$ and the predicted sequence is $A$, $B$, $B$, $C$, $D$, then $p_1 = 4/5, p_2 = 3/4, p_3 = 1/3, p_4 = 0$. Let $len_{\text{label}}$ and $len_{\text{pred}}$ be the numbers of words in the label sequence and the predicted sequence, respectively. Then BLEU is defined as
$$ \exp\left(\min\left(0, 1 - \frac{len_{\text{label}}}{len_{\text{pred}}}\right)\right) \prod_{n=1}^k p_n^{1/2^n},$$
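As a sanity check on this definition, the following standalone sketch (an illustration only, not the notebook's `bleu` function, whose implementation may differ) reproduces the precisions $p_1 = 4/5$ and $p_2 = 3/4$ for the example above and evaluates the BLEU expression with $k = 2$:

``` python
import collections
import math

label, pred = 'A B C D E F'.split(), 'A B B C D'.split()

def precision(pred, label, n):
    # Count n-gram matches, clipping repeated n-grams by their count in the label
    label_counts = collections.Counter(
        tuple(label[i:i + n]) for i in range(len(label) - n + 1))
    num_matches = 0
    for i in range(len(pred) - n + 1):
        ngram = tuple(pred[i:i + n])
        if label_counts[ngram] > 0:
            num_matches += 1
            label_counts[ngram] -= 1
    return num_matches / (len(pred) - n + 1)

k = 2
score = math.exp(min(0, 1 - len(label) / len(pred)))  # length penalty exp(min(0, 1 - 6/5))
for n in range(1, k + 1):
    score *= precision(pred, label, n) ** (0.5 ** n)   # p_n^{1/2^n}
print(score)  # ≈ 0.68, from p_1 = 4/5 and p_2 = 3/4
```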
......@@ -348,7 +349,7 @@ bleu 1.000, predict: they are watching .
Test a sample that is not in the training set.
``` python
score('ils sont canadiens .', 'they are canadian .', k=2)
score('ils sont canadienne .', 'they are canadian .', k=2)
```
Output:
```
......