Commit a86f344a authored by: S ShusenTang

fix bug(#64), update mask, fix typo

Parent c5d0f74a
......@@ -16,7 +16,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"1.0.0 cpu\n"
"1.2.0 cpu\n"
]
}
],
......@@ -52,9 +52,7 @@
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"# 将一个序列中所有的词记录在all_tokens中以便之后构造词典,然后在该序列后面添加PAD直到序列\n",
......@@ -75,9 +73,7 @@
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"def read_data(max_seq_len):\n",
......@@ -130,9 +126,7 @@
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"class Encoder(nn.Module):\n",
......@@ -183,9 +177,7 @@
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"def attention_model(input_size, attention_size):\n",
......@@ -198,9 +190,7 @@
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"def attention_forward(model, enc_states, dec_state):\n",
......@@ -250,9 +240,7 @@
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"class Decoder(nn.Module):\n",
......@@ -261,8 +249,9 @@
" super(Decoder, self).__init__()\n",
" self.embedding = nn.Embedding(vocab_size, embed_size)\n",
" self.attention = attention_model(2*num_hiddens, attention_size)\n",
" # GRU的输入包含attention输出的c和实际输入, 所以尺寸是 2*embed_size\n",
" self.rnn = nn.GRU(2*embed_size, num_hiddens, num_layers, dropout=drop_prob)\n",
" # GRU的输入包含attention输出的c和实际输入, 所以尺寸是 num_hiddens+embed_size\n",
" self.rnn = nn.GRU(num_hiddens + embed_size, num_hiddens, \n",
" num_layers, dropout=drop_prob)\n",
" self.out = nn.Linear(num_hiddens, vocab_size)\n",
"\n",
" def forward(self, cur_input, state, enc_states):\n",
......@@ -272,8 +261,8 @@
" \"\"\"\n",
" # 使用注意力机制计算背景向量\n",
" c = attention_forward(self.attention, enc_states, state[-1])\n",
" # 将嵌入后的输入和背景向量在特征维连结\n",
" input_and_c = torch.cat((self.embedding(cur_input), c), dim=1) # (批量大小, 2*embed_size)\n",
" # 将嵌入后的输入和背景向量在特征维连结, (批量大小, num_hiddens+embed_size)\n",
" input_and_c = torch.cat((self.embedding(cur_input), c), dim=1) \n",
" # 为输入和背景向量的连结增加时间步维,时间步个数为1\n",
" output, state = self.rnn(input_and_c.unsqueeze(0), state)\n",
" # 移除时间步维,输出形状为(批量大小, 输出词典大小)\n",
......@@ -295,9 +284,7 @@
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"def batch_loss(encoder, decoder, X, Y, loss):\n",
......@@ -308,7 +295,7 @@
" dec_state = decoder.begin_state(enc_state)\n",
" # 解码器在最初时间步的输入是BOS\n",
" dec_input = torch.tensor([out_vocab.stoi[BOS]] * batch_size)\n",
" # 我们将使用掩码变量mask来忽略掉标签为填充项PAD的损失\n",
" # 我们将使用掩码变量mask来忽略掉标签为填充项PAD的损失, 初始全1\n",
" mask, num_not_pad_tokens = torch.ones(batch_size,), 0\n",
" l = torch.tensor([0.0])\n",
" for y in Y.permute(1,0): # Y shape: (batch, seq_len)\n",
......@@ -316,17 +303,15 @@
" l = l + (mask * loss(dec_output, y)).sum()\n",
" dec_input = y # 使用强制教学\n",
" num_not_pad_tokens += mask.sum().item()\n",
" # 将PAD对应位置的掩码设成0, 原文这里是 y != out_vocab.stoi[EOS], 感觉有误\n",
" mask = mask * (y != out_vocab.stoi[PAD]).float()\n",
" # EOS后面全是PAD. 下面一行保证一旦遇到EOS接下来的循环中mask就一直是0\n",
" mask = mask * (y != out_vocab.stoi[EOS]).float()\n",
" return l / num_not_pad_tokens"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"def train(encoder, decoder, dataset, lr, batch_size, num_epochs):\n",
......@@ -358,11 +343,11 @@
"name": "stdout",
"output_type": "stream",
"text": [
"epoch 10, loss 0.441\n",
"epoch 20, loss 0.183\n",
"epoch 30, loss 0.100\n",
"epoch 40, loss 0.046\n",
"epoch 50, loss 0.025\n"
"epoch 10, loss 0.475\n",
"epoch 20, loss 0.245\n",
"epoch 30, loss 0.157\n",
"epoch 40, loss 0.052\n",
"epoch 50, loss 0.039\n"
]
}
],
......@@ -386,9 +371,7 @@
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"def translate(encoder, decoder, input_seq, max_seq_len):\n",
......@@ -443,9 +426,7 @@
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"def bleu(pred_tokens, label_tokens, k):\n",
......@@ -466,9 +447,7 @@
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"def score(input_seq, label_seq, k):\n",
......@@ -504,29 +483,27 @@
"name": "stdout",
"output_type": "stream",
"text": [
"bleu 0.658, predict: they are russian .\n"
"bleu 0.658, predict: they are exhausted .\n"
]
}
],
"source": [
"score('ils sont canadiens .', 'they are canadian .', k=2)"
"score('ils sont canadienne .', 'they are canadian .', k=2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:anaconda3]",
"display_name": "Python [conda env:py36]",
"language": "python",
"name": "conda-env-anaconda3-py"
"name": "conda-env-py36-py"
},
"language_info": {
"codemirror_mode": {
......@@ -538,7 +515,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
"version": "3.6.2"
}
},
"nbformat": 4,
......
......@@ -165,8 +165,9 @@ class Decoder(nn.Module):
super(Decoder, self).__init__()
self.embedding = nn.Embedding(vocab_size, embed_size)
self.attention = attention_model(2*num_hiddens, attention_size)
# The GRU input contains the attention output c and the actual input, so its size is 2*embed_size
self.rnn = nn.GRU(2*embed_size, num_hiddens, num_layers, dropout=drop_prob)
# The GRU input contains the attention output c and the actual input, so its size is num_hiddens+embed_size
self.rnn = nn.GRU(num_hiddens + embed_size, num_hiddens,
num_layers, dropout=drop_prob)
self.out = nn.Linear(num_hiddens, vocab_size)
def forward(self, cur_input, state, enc_states):
......@@ -176,8 +177,8 @@ class Decoder(nn.Module):
"""
# Compute the context vector using the attention mechanism
c = attention_forward(self.attention, enc_states, state[-1])
# Concatenate the embedded input and the context vector along the feature dimension
input_and_c = torch.cat((self.embedding(cur_input), c), dim=1) # (batch size, 2*embed_size)
# Concatenate the embedded input and the context vector along the feature dimension, (batch size, num_hiddens+embed_size)
input_and_c = torch.cat((self.embedding(cur_input), c), dim=1)
# Add a time-step dimension to the concatenation of the input and the context vector; the number of time steps is 1
output, state = self.rnn(input_and_c.unsqueeze(0), state)
# Remove the time-step dimension; the output shape is (batch size, output vocabulary size)
......@@ -210,8 +211,8 @@ def batch_loss(encoder, decoder, X, Y, loss):
l = l + (mask * loss(dec_output, y)).sum()
dec_input = y # Use teacher forcing
num_not_pad_tokens += mask.sum().item()
# Set the mask to 0 at positions corresponding to PAD; the original text uses y != out_vocab.stoi[EOS] here, which seems wrong
mask = mask * (y != out_vocab.stoi[PAD]).float()
# Everything after EOS is PAD. The line below ensures that once EOS is encountered, the mask stays 0 in all subsequent iterations
mask = mask * (y != out_vocab.stoi[EOS]).float()
return l / num_not_pad_tokens
```
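The mask update above is the core of this commit's fix: since everything after EOS is PAD, zeroing the mask as soon as the label equals EOS keeps the EOS position in the loss while excluding all trailing PAD positions, including the first PAD, which a PAD-based check would still count. Below is a minimal, self-contained sketch of that behavior; the token ids `EOS_ID` and `PAD_ID` are made up for illustration and are not the notebook's `out_vocab` indices, and the loss term is omitted so only the mask and the non-PAD counter are traced.

``` python
import torch

EOS_ID, PAD_ID = 1, 0  # hypothetical ids, only for this sketch
# One label sequence "w1 w2 w3 w4 EOS PAD PAD", shape (seq_len, batch) as in batch_loss
Y = torch.tensor([[5], [6], [7], [8], [EOS_ID], [PAD_ID], [PAD_ID]])

mask, num_not_pad_tokens = torch.ones(1,), 0
for y in Y:
    num_not_pad_tokens += mask.sum().item()
    # Same update as in batch_loss: once y equals EOS the mask drops to 0 and stays 0
    mask = mask * (y != EOS_ID).float()
print(num_not_pad_tokens)  # 5.0: the four words plus EOS; the two PAD positions are ignored
```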
......@@ -299,7 +300,7 @@ translate(encoder, decoder, input_seq, max_seq_len)
Machine translation output is commonly evaluated with BLEU (Bilingual Evaluation Understudy) [1]. For any subsequence of the model's predicted sequence, BLEU checks whether that subsequence appears in the label sequence.
Specifically, let $p_n$ denote the precision of subsequences with $n$ words: the ratio of the number of $n$-word subsequences in the predicted sequence that match the label sequence to the total number of $n$-word subsequences in the predicted sequence. For example, if the label sequence is $A$, $B$, $C$, $D$, $E$, $F$ and the predicted sequence is $A$, $B$, $B$, $C$, $D$, then $p_1 = 4/5,\ p_2 = 3/4,\ p_3 = 1/3,\ p_4 = 0$. Let $len_{\text{label}}$ and $len_{\text{pred}}$ be the numbers of words in the label sequence and the predicted sequence, respectively. Then BLEU is defined as
Specifically, let $p_n$ denote the precision of subsequences with $n$ words: the ratio of the number of $n$-word subsequences in the predicted sequence that match the label sequence to the total number of $n$-word subsequences in the predicted sequence. For example, if the label sequence is $A$, $B$, $C$, $D$, $E$, $F$ and the predicted sequence is $A$, $B$, $B$, $C$, $D$, then $p_1 = 4/5, p_2 = 3/4, p_3 = 1/3, p_4 = 0$. Let $len_{\text{label}}$ and $len_{\text{pred}}$ be the numbers of words in the label sequence and the predicted sequence, respectively. Then BLEU is defined as
$$ \exp\left(\min\left(0, 1 - \frac{len_{\text{label}}}{len_{\text{pred}}}\right)\right) \prod_{n=1}^k p_n^{1/2^n},$$
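As a sanity check on this definition, the following standalone sketch (an illustration only, not the notebook's `bleu` function, whose implementation may differ) reproduces the precisions $p_1 = 4/5$ and $p_2 = 3/4$ for the example above and evaluates the BLEU expression with $k = 2$:

``` python
import collections
import math

label, pred = 'A B C D E F'.split(), 'A B B C D'.split()

def precision(pred, label, n):
    # Count n-gram matches, clipping repeated n-grams by their count in the label
    label_counts = collections.Counter(
        tuple(label[i:i + n]) for i in range(len(label) - n + 1))
    num_matches = 0
    for i in range(len(pred) - n + 1):
        ngram = tuple(pred[i:i + n])
        if label_counts[ngram] > 0:
            num_matches += 1
            label_counts[ngram] -= 1
    return num_matches / (len(pred) - n + 1)

k = 2
score = math.exp(min(0, 1 - len(label) / len(pred)))  # length penalty exp(min(0, 1 - 6/5))
for n in range(1, k + 1):
    score *= precision(pred, label, n) ** (0.5 ** n)   # p_n^{1/2^n}
print(score)  # ≈ 0.68, from p_1 = 4/5 and p_2 = 3/4
```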
......@@ -348,7 +349,7 @@ bleu 1.000, predict: they are watching .
Test a sample that is not in the training set.
``` python
score('ils sont canadiens .', 'they are canadian .', k=2)
score('ils sont canadienne .', 'they are canadian .', k=2)
```
Output:
```
......