diff --git a/code/chapter10_natural-language-processing/10.12_machine-translation.ipynb b/code/chapter10_natural-language-processing/10.12_machine-translation.ipynb
index e0761c092f20f295d71eab8539c2478b4c28bfa7..b52f686e1dca00b70398c2b394abedeea33265a1 100644
--- a/code/chapter10_natural-language-processing/10.12_machine-translation.ipynb
+++ b/code/chapter10_natural-language-processing/10.12_machine-translation.ipynb
@@ -16,7 +16,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "1.0.0 cpu\n"
+      "1.2.0 cpu\n"
      ]
     }
    ],
@@ -52,9 +52,7 @@
   {
    "cell_type": "code",
    "execution_count": 2,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# Record all the tokens of a sequence in all_tokens so that a vocabulary can be built later, then append PAD after the sequence until the sequence\n",
@@ -75,9 +73,7 @@
   {
    "cell_type": "code",
    "execution_count": 3,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "def read_data(max_seq_len):\n",
@@ -130,9 +126,7 @@
   {
    "cell_type": "code",
    "execution_count": 5,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "class Encoder(nn.Module):\n",
@@ -183,9 +177,7 @@
   {
    "cell_type": "code",
    "execution_count": 7,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "def attention_model(input_size, attention_size):\n",
@@ -198,9 +190,7 @@
   {
    "cell_type": "code",
    "execution_count": 8,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "def attention_forward(model, enc_states, dec_state):\n",
@@ -250,9 +240,7 @@
   {
    "cell_type": "code",
    "execution_count": 10,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "class Decoder(nn.Module):\n",
@@ -261,8 +249,9 @@
     "        super(Decoder, self).__init__()\n",
     "        self.embedding = nn.Embedding(vocab_size, embed_size)\n",
     "        self.attention = attention_model(2*num_hiddens, attention_size)\n",
-    "        # The GRU input contains the attention output c and the actual input, so its size is 2*embed_size\n",
-    "        self.rnn = nn.GRU(2*embed_size, num_hiddens, num_layers, dropout=drop_prob)\n",
+    "        # The GRU input contains the attention output c and the actual input, so its size is num_hiddens+embed_size\n",
+    "        self.rnn = nn.GRU(num_hiddens + embed_size, num_hiddens, \n",
+    "                          num_layers, dropout=drop_prob)\n",
     "        self.out = nn.Linear(num_hiddens, vocab_size)\n",
     "\n",
     "    def forward(self, cur_input, state, enc_states):\n",
@@ -272,8 +261,8 @@
     "        \"\"\"\n",
     "        # Compute the context vector using the attention mechanism\n",
     "        c = attention_forward(self.attention, enc_states, state[-1])\n",
-    "        # Concatenate the embedded input and the context vector on the feature dimension\n",
-    "        input_and_c = torch.cat((self.embedding(cur_input), c), dim=1) # (batch size, 2*embed_size)\n",
+    "        # Concatenate the embedded input and the context vector on the feature dimension, (batch size, num_hiddens+embed_size)\n",
+    "        input_and_c = torch.cat((self.embedding(cur_input), c), dim=1) \n",
     "        # Add a time step dimension of size 1 to the concatenation of the input and the context vector\n",
     "        output, state = self.rnn(input_and_c.unsqueeze(0), state)\n",
     "        # Remove the time step dimension; the output shape is (batch size, output vocabulary size)\n",
@@ -295,9 +284,7 @@
   {
    "cell_type": "code",
    "execution_count": 11,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "def batch_loss(encoder, decoder, X, Y, loss):\n",
@@ -308,7 +295,7 @@
     "    dec_state = decoder.begin_state(enc_state)\n",
     "    # The decoder's input at the initial time step is BOS\n",
     "    dec_input = torch.tensor([out_vocab.stoi[BOS]] * batch_size)\n",
-    "    # We will use the mask variable to ignore losses where the label is the padding token PAD\n",
+    "    # We will use the mask variable to ignore losses where the label is the padding token PAD; all ones initially\n",
     "    mask, num_not_pad_tokens = torch.ones(batch_size,), 0\n",
     "    l = torch.tensor([0.0])\n",
     "    for y in Y.permute(1,0): # Y shape: (batch, seq_len)\n",
@@ -316,17 +303,15 @@
     "        l = l + (mask * loss(dec_output, y)).sum()\n",
     "        dec_input = y # Use teacher forcing\n",
     "        num_not_pad_tokens += mask.sum().item()\n",
-    "        # Set the mask to 0 at PAD positions; the original text uses y != out_vocab.stoi[EOS] here, which seems wrong\n",
-    "        mask = mask * (y != out_vocab.stoi[PAD]).float()\n",
+    "        # Everything after EOS is PAD. The next line ensures that once EOS is encountered, mask stays 0 in all later iterations\n",
+    "        mask = mask * (y != out_vocab.stoi[EOS]).float()\n",
     "    return l / num_not_pad_tokens"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 12,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "def train(encoder, decoder, dataset, lr, batch_size, num_epochs):\n",
@@ -358,11 +343,11 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "epoch 10, loss 0.441\n",
-      "epoch 20, loss 0.183\n",
-      "epoch 30, loss 0.100\n",
-      "epoch 40, loss 0.046\n",
-      "epoch 50, loss 0.025\n"
+      "epoch 10, loss 0.475\n",
+      "epoch 20, loss 0.245\n",
+      "epoch 30, loss 0.157\n",
+      "epoch 40, loss 0.052\n",
+      "epoch 50, loss 0.039\n"
      ]
     }
    ],
@@ -386,9 +371,7 @@
   {
    "cell_type": "code",
    "execution_count": 14,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "def translate(encoder, decoder, input_seq, max_seq_len):\n",
@@ -443,9 +426,7 @@
   {
    "cell_type": "code",
    "execution_count": 16,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "def bleu(pred_tokens, label_tokens, k):\n",
@@ -466,9 +447,7 @@
   {
    "cell_type": "code",
    "execution_count": 17,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "def score(input_seq, label_seq, k):\n",
@@ -504,29 +483,27 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "bleu 0.658, predict: they are russian .\n"
+      "bleu 0.658, predict: they are exhausted .\n"
      ]
     }
    ],
    "source": [
-    "score('ils sont canadiens .', 'they are canadian .', k=2)"
+    "score('ils sont canadienne .', 'they are canadian .', k=2)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": []
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python [conda env:anaconda3]",
+   "display_name": "Python [conda env:py36]",
    "language": "python",
-   "name": "conda-env-anaconda3-py"
+   "name": "conda-env-py36-py"
   },
   "language_info": {
    "codemirror_mode": {
@@ -538,7 +515,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.8"
+   "version": "3.6.2"
   }
  },
  "nbformat": 4,
diff --git a/docs/chapter10_natural-language-processing/10.12_machine-translation.md b/docs/chapter10_natural-language-processing/10.12_machine-translation.md
index d8644b13570fff340254c396d975426eeaa9b368..019b177890c07ffd78a26a9cde2f942b46c6e749 100644
--- a/docs/chapter10_natural-language-processing/10.12_machine-translation.md
+++ b/docs/chapter10_natural-language-processing/10.12_machine-translation.md
@@ -165,8 +165,9 @@ class Decoder(nn.Module):
         super(Decoder, self).__init__()
         self.embedding = nn.Embedding(vocab_size, embed_size)
         self.attention = attention_model(2*num_hiddens, attention_size)
-        # The GRU input contains the attention output c and the actual input, so its size is 2*embed_size
-        self.rnn = nn.GRU(2*embed_size, num_hiddens, num_layers, dropout=drop_prob)
+        # The GRU input contains the attention output c and the actual input, so its size is num_hiddens+embed_size
+        self.rnn = nn.GRU(num_hiddens + embed_size, num_hiddens, 
+                          num_layers, dropout=drop_prob)
         self.out = nn.Linear(num_hiddens, vocab_size)
 
     def forward(self, cur_input, state, enc_states):
@@ -176,8 +177,8 @@ class Decoder(nn.Module):
         """
         # Compute the context vector using the attention mechanism
         c = attention_forward(self.attention, enc_states, state[-1])
-        # Concatenate the embedded input and the context vector on the feature dimension
-        input_and_c = torch.cat((self.embedding(cur_input), c), dim=1) # (batch size, 2*embed_size)
+        # Concatenate the embedded input and the context vector on the feature dimension, (batch size, num_hiddens+embed_size)
+        input_and_c = torch.cat((self.embedding(cur_input), c), dim=1) 
         # Add a time step dimension of size 1 to the concatenation of the input and the context vector
         output, state = self.rnn(input_and_c.unsqueeze(0), state)
         # Remove the time step dimension; the output shape is (batch size, output vocabulary size)
@@ -210,8 +211,8 @@ def batch_loss(encoder, decoder, X, Y, loss):
         l = l + (mask * loss(dec_output, y)).sum()
         dec_input = y # Use teacher forcing
         num_not_pad_tokens += mask.sum().item()
-        # Set the mask to 0 at PAD positions; the original text uses y != out_vocab.stoi[EOS] here, which seems wrong
-        mask = mask * (y != out_vocab.stoi[PAD]).float()
+        # Everything after EOS is PAD. The next line ensures that once EOS is encountered, mask stays 0 in all later iterations
+        mask = mask * (y != out_vocab.stoi[EOS]).float()
     return l / num_not_pad_tokens
 ```
 
@@ -299,7 +300,7 @@ translate(encoder, decoder, input_seq, max_seq_len)
 
 Machine translation results are usually evaluated with BLEU (Bilingual Evaluation Understudy) [1]. For any subsequence of the model's predicted sequence, BLEU checks whether the subsequence appears in the label sequence.
 
-Specifically, let the precision of $n$-word subsequences be $p_n$: the ratio of the number of $n$-word subsequences in the predicted sequence that match the label sequence to the number of $n$-word subsequences in the predicted sequence. For example, suppose the label sequence is $A$, $B$, $C$, $D$, $E$, $F$ and the predicted sequence is $A$, $B$, $B$, $C$, $D$; then $p_1 = 4/5,\ p_2 = 3/4,\ p_3 = 1/3,\ p_4 = 0$. Let $len_{\text{label}}$ and $len_{\text{pred}}$ be the numbers of words in the label sequence and the predicted sequence, respectively. Then BLEU is defined as
+Specifically, let the precision of $n$-word subsequences be $p_n$: the ratio of the number of $n$-word subsequences in the predicted sequence that match the label sequence to the number of $n$-word subsequences in the predicted sequence. For example, suppose the label sequence is $A$, $B$, $C$, $D$, $E$, $F$ and the predicted sequence is $A$, $B$, $B$, $C$, $D$; then $p_1 = 4/5, p_2 = 3/4, p_3 = 1/3, p_4 = 0$. Let $len_{\text{label}}$ and $len_{\text{pred}}$ be the numbers of words in the label sequence and the predicted sequence, respectively. Then BLEU is defined as
 
 $$ \exp\left(\min\left(0, 1 - \frac{len_{\text{label}}}{len_{\text{pred}}}\right)\right) \prod_{n=1}^k p_n^{1/2^n},$$
 
@@ -348,7 +349,7 @@ bleu 1.000, predict: they are watching .
 Test a sample that is not in the training set.
 
 ``` python
-score('ils sont canadiens .', 'they are canadian .', k=2)
+score('ils sont canadienne .', 'they are canadian .', k=2)
 ```
 Output:
 ```
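
Two of the hunks above are behavior fixes rather than cleanups, so a few standalone sketches may help reviewers check them. First, the decoder's GRU input size: the attention context c has num_hiddens features and the embedded input has embed_size features, so their concatenation is num_hiddens + embed_size wide. Below is a minimal shape check; all dimensions are arbitrary stand-ins, not the notebook's hyperparameters. When embed_size equals num_hiddens, the old 2*embed_size happens to give the same number, which is presumably why the original code still ran.

``` python
import torch

batch_size, embed_size, num_hiddens = 4, 32, 64  # arbitrary stand-in sizes
embedded = torch.randn(batch_size, embed_size)   # embedded decoder input
c = torch.randn(batch_size, num_hiddens)         # attention context vector
input_and_c = torch.cat((embedded, c), dim=1)
print(input_and_c.shape)  # torch.Size([4, 96]), i.e. num_hiddens + embed_size
```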
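Second, the masking fix in batch_loss. Because the mask is updated with y != EOS only after the step loss has been accumulated, the loss on the EOS token itself is still counted, while every later step (which is all PAD) contributes zero; the old y != PAD update would also have counted the loss at the first PAD position. A toy trace of the adopted update, with made-up vocabulary indices:

``` python
import torch

EOS_IDX, PAD_IDX = 1, 0  # made-up indices, for illustration only

# Two padded label sequences, shape (batch, seq_len):
# row 0: w w EOS PAD PAD ; row 1: w EOS PAD PAD PAD
Y = torch.tensor([[5, 6, EOS_IDX, PAD_IDX, PAD_IDX],
                  [7, EOS_IDX, PAD_IDX, PAD_IDX, PAD_IDX]])

mask = torch.ones(Y.shape[0])
for y in Y.permute(1, 0):  # iterate over time steps
    # in batch_loss, the step loss is weighted by the current mask here
    print(y.tolist(), mask.tolist())
    # once y == EOS, mask is zeroed for every later step, so the EOS loss
    # itself is counted but all PAD losses afterwards are ignored
    mask = mask * (y != EOS_IDX).float()
```

The trace shows the mask going [1, 1] -> [1, 1] -> [1, 0] -> [0, 0] -> [0, 0], so each sequence contributes exactly its real tokens plus EOS to the averaged loss.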
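Finally, the BLEU formula quoted in the markdown hunk can be sanity-checked against the worked example ($p_1 = 4/5$, $p_2 = 3/4$). The sketch below is an illustrative re-implementation of that formula, not the notebook's bleu(); the name bleu_example and its argument layout are invented for this note.

``` python
import collections
import math

def bleu_example(pred_tokens, label_tokens, k):
    len_pred, len_label = len(pred_tokens), len(label_tokens)
    # brevity penalty: exp(min(0, 1 - len_label / len_pred))
    score = math.exp(min(0, 1 - len_label / len_pred))
    for n in range(1, k + 1):
        num_matches = 0
        label_subs = collections.Counter(
            ' '.join(label_tokens[i: i + n]) for i in range(len_label - n + 1))
        for i in range(len_pred - n + 1):
            ngram = ' '.join(pred_tokens[i: i + n])
            if label_subs[ngram] > 0:
                num_matches += 1
                label_subs[ngram] -= 1  # match each label n-gram at most once
        # p_n weighted by the exponent 1 / 2^n from the formula above
        score *= (num_matches / (len_pred - n + 1)) ** (0.5 ** n)
    return score

print(bleu_example(list('ABBCD'), list('ABCDEF'), k=2))
# = exp(1 - 6/5) * (4/5)^(1/2) * (3/4)^(1/4) ≈ 0.68
```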