diff --git a/code/chapter10_natural-language-processing/10.12_machine-translation.ipynb b/code/chapter10_natural-language-processing/10.12_machine-translation.ipynb
index e0761c092f20f295d71eab8539c2478b4c28bfa7..b52f686e1dca00b70398c2b394abedeea33265a1 100644
--- a/code/chapter10_natural-language-processing/10.12_machine-translation.ipynb
+++ b/code/chapter10_natural-language-processing/10.12_machine-translation.ipynb
@@ -16,7 +16,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "1.0.0 cpu\n"
+      "1.2.0 cpu\n"
      ]
     }
    ],
@@ -52,9 +52,7 @@
   {
    "cell_type": "code",
    "execution_count": 2,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# Record all the tokens of a sequence in all_tokens so that a vocabulary can be built later, then append PAD after the sequence until the sequence\n",
@@ -75,9 +73,7 @@
   {
    "cell_type": "code",
    "execution_count": 3,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "def read_data(max_seq_len):\n",
@@ -130,9 +126,7 @@
   {
    "cell_type": "code",
    "execution_count": 5,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "class Encoder(nn.Module):\n",
@@ -183,9 +177,7 @@
   {
    "cell_type": "code",
    "execution_count": 7,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "def attention_model(input_size, attention_size):\n",
@@ -198,9 +190,7 @@
   {
    "cell_type": "code",
    "execution_count": 8,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "def attention_forward(model, enc_states, dec_state):\n",
@@ -250,9 +240,7 @@
   {
    "cell_type": "code",
    "execution_count": 10,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "class Decoder(nn.Module):\n",
@@ -261,8 +249,9 @@
     "        super(Decoder, self).__init__()\n",
     "        self.embedding = nn.Embedding(vocab_size, embed_size)\n",
     "        self.attention = attention_model(2*num_hiddens, attention_size)\n",
-    "        # The GRU input contains the attention output c and the actual input, so its size is 2*embed_size\n",
-    "        self.rnn = nn.GRU(2*embed_size, num_hiddens, num_layers, dropout=drop_prob)\n",
+    "        # The GRU input contains the attention output c and the actual input, so its size is num_hiddens+embed_size\n",
+    "        self.rnn = nn.GRU(num_hiddens + embed_size, num_hiddens, \n",
+    "                          num_layers, dropout=drop_prob)\n",
     "        self.out = nn.Linear(num_hiddens, vocab_size)\n",
     "\n",
     "    def forward(self, cur_input, state, enc_states):\n",
@@ -272,8 +261,8 @@
     "        \"\"\"\n",
     "        # Compute the context vector using the attention mechanism\n",
     "        c = attention_forward(self.attention, enc_states, state[-1])\n",
-    "        # Concatenate the embedded input and the context vector on the feature dimension\n",
-    "        input_and_c = torch.cat((self.embedding(cur_input), c), dim=1) # (batch size, 2*embed_size)\n",
+    "        # Concatenate the embedded input and the context vector on the feature dimension, (batch size, num_hiddens+embed_size)\n",
+    "        input_and_c = torch.cat((self.embedding(cur_input), c), dim=1) \n",
     "        # Add a time step dimension of size 1 to the concatenation of the input and the context vector\n",
     "        output, state = self.rnn(input_and_c.unsqueeze(0), state)\n",
     "        # Remove the time step dimension; the output shape is (batch size, output vocabulary size)\n",
@@ -295,9 +284,7 @@
   {
    "cell_type": "code",
    "execution_count": 11,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "def batch_loss(encoder, decoder, X, Y, loss):\n",
@@ -308,7 +295,7 @@
     "    dec_state = decoder.begin_state(enc_state)\n",
     "    # The decoder's input at the initial time step is BOS\n",
     "    dec_input = torch.tensor([out_vocab.stoi[BOS]] * batch_size)\n",
-    "    # We will use the mask variable to ignore losses where the label is the padding token PAD\n",
+    "    # We will use the mask variable to ignore losses where the label is the padding token PAD; all ones initially\n",
     "    mask, num_not_pad_tokens = torch.ones(batch_size,), 0\n",
     "    l = torch.tensor([0.0])\n",
     "    for y in Y.permute(1,0): # Y shape: (batch, seq_len)\n",
@@ -316,17 +303,15 @@
     "        l = l + (mask * loss(dec_output, y)).sum()\n",
     "        dec_input = y # Use teacher forcing\n",
     "        num_not_pad_tokens += mask.sum().item()\n",
-    "        # Set the mask to 0 at PAD positions; the original text uses y != out_vocab.stoi[EOS] here, which seems wrong\n",
-    "        mask = mask * (y != out_vocab.stoi[PAD]).float()\n",
+    "        # Everything after EOS is PAD. The next line ensures that once EOS is encountered, mask stays 0 in all later iterations\n",
+    "        mask = mask * (y != out_vocab.stoi[EOS]).float()\n",
     "    return l / num_not_pad_tokens"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 12,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "def train(encoder, decoder, dataset, lr, batch_size, num_epochs):\n",
@@ -358,11 +343,11 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "epoch 10, loss 0.441\n",
-      "epoch 20, loss 0.183\n",
-      "epoch 30, loss 0.100\n",
-      "epoch 40, loss 0.046\n",
-      "epoch 50, loss 0.025\n"
+      "epoch 10, loss 0.475\n",
+      "epoch 20, loss 0.245\n",
+      "epoch 30, loss 0.157\n",
+      "epoch 40, loss 0.052\n",
+      "epoch 50, loss 0.039\n"
      ]
     }
    ],
@@ -386,9 +371,7 @@
   {
    "cell_type": "code",
    "execution_count": 14,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "def translate(encoder, decoder, input_seq, max_seq_len):\n",
@@ -443,9 +426,7 @@
   {
    "cell_type": "code",
    "execution_count": 16,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "def bleu(pred_tokens, label_tokens, k):\n",
@@ -466,9 +447,7 @@
   {
    "cell_type": "code",
    "execution_count": 17,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "def score(input_seq, label_seq, k):\n",
@@ -504,29 +483,27 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "bleu 0.658, predict: they are russian .\n"
+      "bleu 0.658, predict: they are exhausted .\n"
      ]
     }
    ],
    "source": [
-    "score('ils sont canadiens .', 'they are canadian .', k=2)"
+    "score('ils sont canadienne .', 'they are canadian .', k=2)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": []
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python [conda env:anaconda3]",
+   "display_name": "Python [conda env:py36]",
    "language": "python",
-   "name": "conda-env-anaconda3-py"
+   "name": "conda-env-py36-py"
   },
   "language_info": {
    "codemirror_mode": {
@@ -538,7 +515,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.8"
+   "version": "3.6.2"
   }
  },
  "nbformat": 4,
diff --git a/docs/chapter10_natural-language-processing/10.12_machine-translation.md b/docs/chapter10_natural-language-processing/10.12_machine-translation.md
index d8644b13570fff340254c396d975426eeaa9b368..019b177890c07ffd78a26a9cde2f942b46c6e749 100644
--- a/docs/chapter10_natural-language-processing/10.12_machine-translation.md
+++ b/docs/chapter10_natural-language-processing/10.12_machine-translation.md
@@ -165,8 +165,9 @@ class Decoder(nn.Module):
         super(Decoder, self).__init__()
         self.embedding = nn.Embedding(vocab_size, embed_size)
         self.attention = attention_model(2*num_hiddens, attention_size)
-        # The GRU input contains the attention output c and the actual input, so its size is 2*embed_size
-        self.rnn = nn.GRU(2*embed_size, num_hiddens, num_layers, dropout=drop_prob)
+        # The GRU input contains the attention output c and the actual input, so its size is num_hiddens+embed_size
+        self.rnn = nn.GRU(num_hiddens + embed_size, num_hiddens, 
+                          num_layers, dropout=drop_prob)
         self.out = nn.Linear(num_hiddens, vocab_size)
 
     def forward(self, cur_input, state, enc_states):
@@ -176,8 +177,8 @@ class Decoder(nn.Module):
         """
         # Compute the context vector using the attention mechanism
         c = attention_forward(self.attention, enc_states, state[-1])
-        # Concatenate the embedded input and the context vector on the feature dimension
-        input_and_c = torch.cat((self.embedding(cur_input), c), dim=1) # (batch size, 2*embed_size)
+        # Concatenate the embedded input and the context vector on the feature dimension, (batch size, num_hiddens+embed_size)
+        input_and_c = torch.cat((self.embedding(cur_input), c), dim=1) 
         # Add a time step dimension of size 1 to the concatenation of the input and the context vector
         output, state = self.rnn(input_and_c.unsqueeze(0), state)
         # Remove the time step dimension; the output shape is (batch size, output vocabulary size)
@@ -210,8 +211,8 @@ def batch_loss(encoder, decoder, X, Y, loss):
         l = l + (mask * loss(dec_output, y)).sum()
         dec_input = y # Use teacher forcing
         num_not_pad_tokens += mask.sum().item()
-        # Set the mask to 0 at PAD positions; the original text uses y != out_vocab.stoi[EOS] here, which seems wrong
-        mask = mask * (y != out_vocab.stoi[PAD]).float()
+        # Everything after EOS is PAD. The next line ensures that once EOS is encountered, mask stays 0 in all later iterations
+        mask = mask * (y != out_vocab.stoi[EOS]).float()
     return l / num_not_pad_tokens
 ```
 
@@ -299,7 +300,7 @@ translate(encoder, decoder, input_seq, max_seq_len)
 
 Machine translation results are usually evaluated with BLEU (Bilingual Evaluation Understudy) [1]. For any subsequence of the model's predicted sequence, BLEU checks whether the subsequence appears in the label sequence.
 
-Specifically, let the precision of $n$-word subsequences be $p_n$: the ratio of the number of $n$-word subsequences in the predicted sequence that match the label sequence to the number of $n$-word subsequences in the predicted sequence. For example, suppose the label sequence is $A$, $B$, $C$, $D$, $E$, $F$ and the predicted sequence is $A$, $B$, $B$, $C$, $D$; then $p_1 = 4/5,\ p_2 = 3/4,\ p_3 = 1/3,\ p_4 = 0$. Let $len_{\text{label}}$ and $len_{\text{pred}}$ be the numbers of words in the label sequence and the predicted sequence, respectively. Then BLEU is defined as
+Specifically, let the precision of $n$-word subsequences be $p_n$: the ratio of the number of $n$-word subsequences in the predicted sequence that match the label sequence to the number of $n$-word subsequences in the predicted sequence. For example, suppose the label sequence is $A$, $B$, $C$, $D$, $E$, $F$ and the predicted sequence is $A$, $B$, $B$, $C$, $D$; then $p_1 = 4/5, p_2 = 3/4, p_3 = 1/3, p_4 = 0$. Let $len_{\text{label}}$ and $len_{\text{pred}}$ be the numbers of words in the label sequence and the predicted sequence, respectively. Then BLEU is defined as
 
 $$ \exp\left(\min\left(0, 1 - \frac{len_{\text{label}}}{len_{\text{pred}}}\right)\right) \prod_{n=1}^k p_n^{1/2^n},$$
 
@@ -348,7 +349,7 @@ bleu 1.000, predict: they are watching .
 Test a sample that is not in the training set.
 
 ``` python
-score('ils sont canadiens .', 'they are canadian .', k=2)
+score('ils sont canadienne .', 'they are canadian .', k=2)
 ```
 Output:
 ```
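
Two of the hunks above are behavior fixes rather than cleanups, so a few standalone sketches may help reviewers check them. First, the decoder's GRU input size: the attention context c has num_hiddens features and the embedded input has embed_size features, so their concatenation is num_hiddens + embed_size wide. Below is a minimal shape check; all dimensions are arbitrary stand-ins, not the notebook's hyperparameters. When embed_size equals num_hiddens, the old 2*embed_size happens to give the same number, which is presumably why the original code still ran.

``` python
import torch

batch_size, embed_size, num_hiddens = 4, 32, 64  # arbitrary stand-in sizes
embedded = torch.randn(batch_size, embed_size)   # embedded decoder input
c = torch.randn(batch_size, num_hiddens)         # attention context vector
input_and_c = torch.cat((embedded, c), dim=1)
print(input_and_c.shape)  # torch.Size([4, 96]), i.e. num_hiddens + embed_size
```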
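Second, the masking fix in batch_loss. Because the mask is updated with y != EOS only after the step loss has been accumulated, the loss on the EOS token itself is still counted, while every later step (which is all PAD) contributes zero; the old y != PAD update would also have counted the loss at the first PAD position. A toy trace of the adopted update, with made-up vocabulary indices:

``` python
import torch

EOS_IDX, PAD_IDX = 1, 0  # made-up indices, for illustration only

# Two padded label sequences, shape (batch, seq_len):
# row 0: w w EOS PAD PAD ; row 1: w EOS PAD PAD PAD
Y = torch.tensor([[5, 6, EOS_IDX, PAD_IDX, PAD_IDX],
                  [7, EOS_IDX, PAD_IDX, PAD_IDX, PAD_IDX]])

mask = torch.ones(Y.shape[0])
for y in Y.permute(1, 0):  # iterate over time steps
    # in batch_loss, the step loss is weighted by the current mask here
    print(y.tolist(), mask.tolist())
    # once y == EOS, mask is zeroed for every later step, so the EOS loss
    # itself is counted but all PAD losses afterwards are ignored
    mask = mask * (y != EOS_IDX).float()
```

The trace shows the mask going [1, 1] -> [1, 1] -> [1, 0] -> [0, 0] -> [0, 0], so each sequence contributes exactly its real tokens plus EOS to the averaged loss.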
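Finally, the BLEU formula quoted in the markdown hunk can be sanity-checked against the worked example ($p_1 = 4/5$, $p_2 = 3/4$). The sketch below is an illustrative re-implementation of that formula, not the notebook's bleu(); the name bleu_example and its argument layout are invented for this note.

``` python
import collections
import math

def bleu_example(pred_tokens, label_tokens, k):
    len_pred, len_label = len(pred_tokens), len(label_tokens)
    # brevity penalty: exp(min(0, 1 - len_label / len_pred))
    score = math.exp(min(0, 1 - len_label / len_pred))
    for n in range(1, k + 1):
        num_matches = 0
        label_subs = collections.Counter(
            ' '.join(label_tokens[i: i + n]) for i in range(len_label - n + 1))
        for i in range(len_pred - n + 1):
            ngram = ' '.join(pred_tokens[i: i + n])
            if label_subs[ngram] > 0:
                num_matches += 1
                label_subs[ngram] -= 1  # match each label n-gram at most once
        # p_n weighted by the exponent 1 / 2^n from the formula above
        score *= (num_matches / (len_pred - n + 1)) ** (0.5 ** n)
    return score

print(bleu_example(list('ABBCD'), list('ABCDEF'), k=2))
# = exp(1 - 6/5) * (4/5)^(1/2) * (3/4)^(1/4) ≈ 0.68
```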