Commit 7d9dbc42 authored by xiaoting, committed by ceci3

Synchronize to develop (#845)

* -1->None (#823)

* unify paddle 1.6 api in understand_sentiment (#824)


* unify paddle 1.6 api in understand_sentiment

* Upgrade w2v & srl's api (#828)

* Upgrade w2v & srl's api

* Upgrade label semantic roles api

* Rewrite 08.machine_translation using Paddle-1.6 apis. (#826)

* Rewrite 08.machine_translation using Paddle-1.6 apis.

* Delete the old train.py in 08.machine_translation

* Update train.py to seq2seq.py in README_cn 08.machine_translation.

* Fix the print content of seq2seq.py.

* Update code format in README_cn of 08.machine_translation.

* add 1.6 requirement (#830)

* fix bugs in Readme.md of 04 and 07 (#831) (#832)

* Polish optimizer name in 04.word2vec (#841) (#842)

* change opt name in word2vec

* update index.cn.html

* Cherry pick for paddle 1.6 (#843)
Co-authored-by: ceci3 <ceci3@users.noreply.github.com>
Co-authored-by: Li Fuchen <lfchener@outlook.com>
Co-authored-by: Yibing Liu <liuyibing01@baidu.com>
Co-authored-by: Guo Sheng <whucsgs@163.com>
Co-authored-by: ruri <shipeng1108@163.com>
Co-authored-by: Chen Weihang <sunny_cwh@163.com>
Parent a0510ad9
......@@ -194,8 +194,8 @@ test_reader = paddle.batch(
The aim of the training program is to define the network structure of the model. For linear regression, it is a simple fully connected layer from input to output. More complex structures, such as convolutional neural networks and recurrent neural networks, will be introduced in later chapters. The training program must return the `average loss` as its first return value, because it is used later by the backpropagation algorithm.
```python
x = fluid.layers.data(name='x', shape=[13], dtype='float32') # 定义输入的形状和数据类型
y = fluid.layers.data(name='y', shape=[1], dtype='float32') # 定义输出的形状和数据类型
x = fluid.data(name='x', shape=[None, 13], dtype='float32') # 定义输入的形状和数据类型
y = fluid.data(name='y', shape=[None, 1], dtype='float32') # 定义输出的形状和数据类型
y_predict = fluid.layers.fc(input=x, size=1, act=None) # 连接输入和输出的全连接层
main_program = fluid.default_main_program() # 获取默认/全局主函数
......
......@@ -196,8 +196,8 @@ test_reader = paddle.batch(
The aim of the training program is to define the network structure of the model. For linear regression, it is a simple fully connected layer from input to output. More complex structures, such as convolutional neural networks and recurrent neural networks, will be introduced in later chapters. The training program must return the `mean error` as its first return value, because the `mean error` is used by backpropagation.
```python
x = fluid.layers.data(name='x', shape=[13], dtype='float32') # define shape and data type of input
y = fluid.layers.data(name='y', shape=[1], dtype='float32') # define shape and data type of output
x = fluid.data(name='x', shape=[None, 13], dtype='float32') # define shape and data type of input
y = fluid.data(name='y', shape=[None, 1], dtype='float32') # define shape and data type of output
y_predict = fluid.layers.fc(input=x, size=1, act=None) # fully connected layer connecting input and output
main_program = fluid.default_main_program() # get default/global main function
......
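The hunks above stop at the fully connected layer. For context, the rest of the tutorial's training program computes a squared-error loss whose mean is the value consumed by backpropagation. The sketch below reconstructs that step from the layers shown here; the learning rate of 0.001 is an illustrative choice, not taken from this commit.

```python
import paddle.fluid as fluid

x = fluid.data(name='x', shape=[None, 13], dtype='float32')
y = fluid.data(name='y', shape=[None, 1], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1, act=None)

# squared error per sample; its mean is the `average loss` returned first,
# which the backpropagation pass uses
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_loss = fluid.layers.mean(cost)

sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)  # illustrative value
sgd_optimizer.minimize(avg_loss)
```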
......@@ -236,8 +236,8 @@ test_reader = paddle.batch(
The aim of the training program is to define the network structure of the model. For linear regression, it is a simple fully connected layer from input to output. More complex structures, such as convolutional neural networks and recurrent neural networks, will be introduced in later chapters. The training program must return the `average loss` as its first return value, because it is used later by the backpropagation algorithm.
```python
x = fluid.layers.data(name='x', shape=[13], dtype='float32') # 定义输入的形状和数据类型
y = fluid.layers.data(name='y', shape=[1], dtype='float32') # 定义输出的形状和数据类型
x = fluid.data(name='x', shape=[None, 13], dtype='float32') # 定义输入的形状和数据类型
y = fluid.data(name='y', shape=[None, 1], dtype='float32') # 定义输出的形状和数据类型
y_predict = fluid.layers.fc(input=x, size=1, act=None) # 连接输入和输出的全连接层
main_program = fluid.default_main_program() # 获取默认/全局主函数
......
......@@ -238,8 +238,8 @@ test_reader = paddle.batch(
The aim of the training program is to define the network structure of the model. For linear regression, it is a simple fully connected layer from input to output. More complex structures, such as convolutional neural networks and recurrent neural networks, will be introduced in later chapters. The training program must return the `mean error` as its first return value, because the `mean error` is used by backpropagation.
```python
x = fluid.layers.data(name='x', shape=[13], dtype='float32') # define shape and data type of input
y = fluid.layers.data(name='y', shape=[1], dtype='float32') # define shape and data type of output
x = fluid.data(name='x', shape=[None, 13], dtype='float32') # define shape and data type of input
y = fluid.data(name='y', shape=[None, 1], dtype='float32') # define shape and data type of output
y_predict = fluid.layers.fc(input=x, size=1, act=None) # fully connected layer connecting input and output
main_program = fluid.default_main_program() # get default/global main function
......
......@@ -87,8 +87,8 @@ def main():
batch_size=batch_size)
# feature vector of length 13
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
x = fluid.data(name='x', shape=[None, 13], dtype='float32')
y = fluid.data(name='y', shape=[None, 1], dtype='float32')
main_program = fluid.default_main_program()
startup_program = fluid.default_startup_program()
......
......@@ -209,7 +209,7 @@ def softmax_regression():
predict_image -- 分类的结果
"""
# 输入的原始图像数据,大小为28*28*1
img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
img = fluid.data(name='img', shape=[None, 1, 28, 28], dtype='float32')
# 以softmax为激活函数的全连接层,输出层的大小必须为数字的个数10
predict = fluid.layers.fc(
input=img, size=10, act='softmax')
......@@ -229,7 +229,7 @@ def multilayer_perceptron():
predict_image -- 分类的结果
"""
# 输入的原始图像数据,大小为28*28*1
img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
img = fluid.data(name='img', shape=[None, 1, 28, 28], dtype='float32')
# 第一个全连接层,激活函数为ReLU
hidden = fluid.layers.fc(input=img, size=200, act='relu')
# 第二个全连接层,激活函数为ReLU
......@@ -282,7 +282,7 @@ def convolutional_neural_network():
predict -- 分类的结果
"""
# 输入的原始图像数据,大小为28*28*1
img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
img = fluid.data(name='img', shape=[None, 1, 28, 28], dtype='float32')
# 第一个卷积-池化层
# 使用20个5*5的滤波器,池化大小为2,池化步长为2,激活函数为Relu
conv_pool_1 = conv_pool(
......@@ -327,7 +327,7 @@ def train_program():
"""
# 标签层,名称为label,对应输入图片的类别标签
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
label = fluid.data(name='label', shape=[None, 1], dtype='int64')
# predict = softmax_regression() # 取消注释将使用 Softmax回归
# predict = multilayer_perceptron() # 取消注释将使用 多层感知器
......
......@@ -188,7 +188,7 @@ def softmax_regression():
predict_image -- result of classification
"""
# input original image data in size of 28*28*1
img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
img = fluid.data(name='img', shape=[None, 1, 28, 28], dtype='float32')
# A fully connected layer with softmax as the activation function; the size of the output layer must be 10, the number of digit classes
predict = fluid.layers.fc(
input=img, size=10, act='softmax')
......@@ -208,7 +208,7 @@ def multilayer_perceptron():
predict_image -- result of classification
"""
# input raw image data in size of 28*28*1
img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
img = fluid.data(name='img', shape=[None, 1, 28, 28], dtype='float32')
# the first fully connected layer, whose activation function is ReLU
hidden = fluid.layers.fc(input=img, size=200, act='relu')
# the second fully connected layer, whose activation function is ReLU
......@@ -260,7 +260,7 @@ def convolutional_neural_network():
predict -- result of classification
"""
# input raw image data in size of 28*28*1
img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
img = fluid.data(name='img', shape=[None, 1, 28, 28], dtype='float32')
# the first convolution-pooling layer
# Use 20 5*5 filters, the pooling size is 2, the pooling step is 2, and the activation function is Relu.
conv_pool_1 = conv_pool(
......@@ -305,7 +305,7 @@ def train_program():
"""
# label layer, named label, corresponding to the category label of the input image
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
label = fluid.data(name='label', shape=[None, 1], dtype='int64')
# predict = softmax_regression() # uncomment to use Softmax regression
# predict = multilayer_perceptron() # uncomment to use the multilayer perceptron
......
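The train_program hunk above ends right after the label layer. Below is a minimal sketch of how the prediction and the label are typically combined into a loss and an accuracy metric; the choice of the softmax-regression network here is an assumption for illustration.

```python
import paddle.fluid as fluid

img = fluid.data(name='img', shape=[None, 1, 28, 28], dtype='float32')
label = fluid.data(name='label', shape=[None, 1], dtype='int64')
predict = fluid.layers.fc(input=img, size=10, act='softmax')  # softmax regression

# cross-entropy against the integer labels; avg_cost is what training minimizes
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(cost)
acc = fluid.layers.accuracy(input=predict, label=label)
```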
......@@ -251,7 +251,7 @@ def softmax_regression():
predict_image -- 分类的结果
"""
# 输入的原始图像数据,大小为28*28*1
img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
img = fluid.data(name='img', shape=[None, 1, 28, 28], dtype='float32')
# 以softmax为激活函数的全连接层,输出层的大小必须为数字的个数10
predict = fluid.layers.fc(
input=img, size=10, act='softmax')
......@@ -271,7 +271,7 @@ def multilayer_perceptron():
predict_image -- 分类的结果
"""
# 输入的原始图像数据,大小为28*28*1
img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
img = fluid.data(name='img', shape=[None, 1, 28, 28], dtype='float32')
# 第一个全连接层,激活函数为ReLU
hidden = fluid.layers.fc(input=img, size=200, act='relu')
# 第二个全连接层,激活函数为ReLU
......@@ -324,7 +324,7 @@ def convolutional_neural_network():
predict -- 分类的结果
"""
# 输入的原始图像数据,大小为28*28*1
img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
img = fluid.data(name='img', shape=[None, 1, 28, 28], dtype='float32')
# 第一个卷积-池化层
# 使用20个5*5的滤波器,池化大小为2,池化步长为2,激活函数为Relu
conv_pool_1 = conv_pool(
......@@ -369,7 +369,7 @@ def train_program():
"""
# 标签层,名称为label,对应输入图片的类别标签
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
label = fluid.data(name='label', shape=[None, 1], dtype='int64')
# predict = softmax_regression() # 取消注释将使用 Softmax回归
# predict = multilayer_perceptron() # 取消注释将使用 多层感知器
......
......@@ -230,7 +230,7 @@ def softmax_regression():
predict_image -- result of classification
"""
# input original image data in size of 28*28*1
img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
img = fluid.data(name='img', shape=[None, 1, 28, 28], dtype='float32')
# A fully connected layer with softmax as the activation function; the size of the output layer must be 10, the number of digit classes
predict = fluid.layers.fc(
input=img, size=10, act='softmax')
......@@ -250,7 +250,7 @@ def multilayer_perceptron():
predict_image -- result of classification
"""
# input raw image data in size of 28*28*1
img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
img = fluid.data(name='img', shape=[None, 1, 28, 28], dtype='float32')
# the first fully connected layer, whose activation function is ReLU
hidden = fluid.layers.fc(input=img, size=200, act='relu')
# the second fully connected layer, whose activation function is ReLU
......@@ -302,7 +302,7 @@ def convolutional_neural_network():
predict -- result of classification
"""
# input raw image data in size of 28*28*1
img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
img = fluid.data(name='img', shape=[None, 1, 28, 28], dtype='float32')
# the first convolution-pooling layer
# Use 20 5*5 filters, the pooling size is 2, the pooling step is 2, and the activation function is Relu.
conv_pool_1 = conv_pool(
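The `conv_pool` helper called above is defined elsewhere in the tutorial and is not part of this diff. A plausible minimal version, assuming it simply wraps `fluid.nets.simple_img_conv_pool`:

```python
import paddle.fluid as fluid

def conv_pool(input, num_filters, filter_size, pool_size, pool_stride, act="relu"):
    # convolution followed by max pooling, matching the comment above:
    # e.g. conv_pool(img, 20, 5, 2, 2) -> 20 filters of size 5*5, pool size 2, pool stride 2
    return fluid.nets.simple_img_conv_pool(
        input=input,
        num_filters=num_filters,
        filter_size=filter_size,
        pool_size=pool_size,
        pool_stride=pool_stride,
        act=act)
```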
......@@ -347,7 +347,7 @@ def train_program():
"""
# label layer, named label, corresponding to the category label of the input image
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
label = fluid.data(name='label', shape=[None, 1], dtype='int64')
# predict = softmax_regression() # uncomment to use Softmax regression
# predict = multilayer_perceptron() # uncomment to use the multilayer perceptron
......
......@@ -101,8 +101,8 @@ def train(nn_type,
test_reader = paddle.batch(
paddle.dataset.mnist.test(), batch_size=BATCH_SIZE)
img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
img = fluid.data(name='img', shape=[None, 1, 28, 28], dtype='float32')
label = fluid.data(name='label', shape=[None, 1], dtype='int64')
if nn_type == 'softmax_regression':
net_conf = softmax_regression
......
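With `fluid.data` the shapes now carry an explicit batch dimension, but feeding works the same way as before. Below is a minimal end-to-end sketch of how these layers are fed during training; the Adam optimizer, batch size 64, and CPU place are illustrative choices, not taken from this commit.

```python
import paddle
import paddle.fluid as fluid

img = fluid.data(name='img', shape=[None, 1, 28, 28], dtype='float32')
label = fluid.data(name='label', shape=[None, 1], dtype='int64')
predict = fluid.layers.fc(input=img, size=10, act='softmax')
avg_cost = fluid.layers.mean(
    fluid.layers.cross_entropy(input=predict, label=label))
fluid.optimizer.Adam(learning_rate=0.001).minimize(avg_cost)  # illustrative optimizer

place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())

# DataFeeder converts each batch of (image, label) samples into the feed dict,
# reshaping to the shapes declared by the fluid.data layers above
feeder = fluid.DataFeeder(feed_list=[img, label], place=place)
train_reader = paddle.batch(paddle.dataset.mnist.train(), batch_size=64)
for batch in train_reader():
    loss_val, = exe.run(fluid.default_main_program(),
                        feed=feeder.feed(batch),
                        fetch_list=[avg_cost])
```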
......@@ -14,6 +14,7 @@
3. Consistency between the code in this document and in the script:
Please note: to make this document easier to read and use, we have split and rearranged the code of train.py and embedded it in this document. The code here produces the same results as train.py, and you can run [train.py](https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/train.py) directly to verify.
4. PaddlePaddle version: PaddlePaddle 1.6 or above, or a suitable develop version.
## Background
......
......@@ -15,6 +15,10 @@ With Deep learning, image classification can be framed as a supervised or unsupe
In this chapter, we introduce deep-learning-based image classification methods and explain how to train a CNN model using PaddlePaddle.
## Requirement
1. PaddlePaddle version 1.6 or higher, or suitable develop version.
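A quick way to confirm the requirement is met before running the examples (a simple sanity check; develop builds usually report version `0.0.0`):

```python
import paddle

# simple check for the 1.6 requirement; develop builds report "0.0.0"
print(paddle.__version__)
assert paddle.__version__ == "0.0.0" or paddle.__version__ >= "1.6.0", \
    "PaddlePaddle 1.6 or a suitable develop build is required"
```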
## Result Demo
Image Classification can be divided into general image classification and fine-grained image classification.
......
......@@ -56,6 +56,7 @@
3. Consistency between the code in this document and in the script:
Please note: to make this document easier to read and use, we have split and rearranged the code of train.py and embedded it in this document. The code here produces the same results as train.py, and you can run [train.py](https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/train.py) directly to verify.
4. PaddlePaddle version: PaddlePaddle 1.6 or above, or a suitable develop version.
## Background
......
......@@ -57,6 +57,10 @@ With Deep learning, image classification can be framed as a supervised or unsupe
In this chapter, we introduce deep-learning-based image classification methods and explain how to train a CNN model using PaddlePaddle.
## Requirement
1. PaddlePaddle version 1.6 or higher, or suitable develop version.
## Result Demo
Image Classification can be divided into general image classification and fine-grained image classification.
......
......@@ -262,32 +262,32 @@ dict_size = len(word_dict)
```
A larger `BATCH_SIZE` makes training converge faster, but it also consumes more memory. Since computing the word embeddings is expensive, enable GPU training if your environment allows it, to get results faster.
Unlike the earlier PaddlePaddle v2, in the new Fluid version we no longer need to compute word embeddings by hand. PaddlePaddle provides the built-in method `fluid.layers.embedding`, which we can use directly to build the N-gram neural network.
Unlike the earlier PaddlePaddle v2, in the new Fluid version we no longer need to compute word embeddings by hand. PaddlePaddle provides the built-in method `fluid.embedding`, which we can use directly to build the N-gram neural network.
- Let's define the N-gram network structure, which is used in both training and prediction. Because the word vectors are sparse, we pass `is_sparse == True` to speed up updates of the sparse parameters.
```python
def inference_program(words, is_sparse):
embed_first = fluid.layers.embedding(
embed_first = fluid.embedding(
input=words[0],
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=is_sparse,
param_attr='shared_w')
embed_second = fluid.layers.embedding(
embed_second = fluid.embedding(
input=words[1],
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=is_sparse,
param_attr='shared_w')
embed_third = fluid.layers.embedding(
embed_third = fluid.embedding(
input=words[2],
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=is_sparse,
param_attr='shared_w')
embed_fourth = fluid.layers.embedding(
embed_fourth = fluid.embedding(
input=words[3],
size=[dict_size, EMBED_SIZE],
dtype='float32',
......@@ -310,7 +310,7 @@ def train_program(predict_word):
# 'next_word'的定义必须要在inference_program的声明之后,
# 否则train program输入数据的顺序就变成了[next_word, firstw, secondw,
# thirdw, fourthw], 这是不正确的.
next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
next_word = fluid.data(name='nextw', shape=[None, 1], dtype='int64')
cost = fluid.layers.cross_entropy(input=predict_word, label=next_word)
avg_cost = fluid.layers.mean(cost)
return avg_cost
......@@ -335,11 +335,11 @@ def train(if_use_cuda, params_dirname, is_sparse=True):
test_reader = paddle.batch(
paddle.dataset.imikolov.test(word_dict, N), BATCH_SIZE)
first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64')
second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64')
third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64')
forth_word = fluid.layers.data(name='fourthw', shape=[1], dtype='int64')
next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
first_word = fluid.data(name='firstw', shape=[None, 1], dtype='int64')
second_word = fluid.data(name='secondw', shape=[None, 1], dtype='int64')
third_word = fluid.data(name='thirdw', shape=[None, 1], dtype='int64')
forth_word = fluid.data(name='fourthw', shape=[None, 1], dtype='int64')
next_word = fluid.data(name='nextw', shape=[None, 1], dtype='int64')
word_list = [first_word, second_word, third_word, forth_word, next_word]
feed_order = ['firstw', 'secondw', 'thirdw', 'fourthw', 'nextw']
......
......@@ -227,32 +227,32 @@ dict_size = len(word_dict)
```
A larger `BATCH_SIZE` makes training converge faster, but it also consumes more memory. Since computing the word embeddings is expensive, enable GPU training if your environment allows it, to get results faster.
Unlike the previous PaddlePaddle v2 version, in the new Fluid version we don't have to compute the word embeddings manually. PaddlePaddle provides a built-in method, `fluid.layers.embedding`, which we can use directly to construct an N-gram neural network.
Unlike the previous PaddlePaddle v2 version, in the new Fluid version we don't have to compute the word embeddings manually. PaddlePaddle provides a built-in method, `fluid.embedding`, which we can use directly to construct an N-gram neural network.
- Let's define our N-gram network structure, which is used in both training and prediction. Because the word vectors are sparse, we pass `is_sparse == True` to speed up updates of the sparse parameters.
```python
def inference_program(words, is_sparse):
embed_first = fluid.layers.embedding(
embed_first = fluid.embedding(
input=words[0],
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=is_sparse,
param_attr='shared_w')
embed_second = fluid.layers.embedding(
embed_second = fluid.embedding(
input=words[1],
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=is_sparse,
param_attr='shared_w')
embed_third = fluid.layers.embedding(
embed_third = fluid.embedding(
input=words[2],
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=is_sparse,
param_attr='shared_w')
embed_fourth = fluid.layers.embedding(
embed_fourth = fluid.embedding(
input=words[3],
size=[dict_size, EMBED_SIZE],
dtype='float32',
......@@ -275,7 +275,7 @@ def train_program(predict_word):
# The definition of 'next_word' must come after the declaration of inference_program;
# otherwise the order of the train program's input data would become [next_word, firstw, secondw,
# thirdw, fourthw], which is incorrect.
next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
next_word = fluid.data(name='nextw', shape=[None, 1], dtype='int64')
cost = fluid.layers.cross_entropy(input=predict_word, label=next_word)
avg_cost = fluid.layers.mean(cost)
return avg_cost
......@@ -300,11 +300,11 @@ def train(if_use_cuda, params_dirname, is_sparse=True):
test_reader = paddle.batch(
paddle.dataset.imikolov.test(word_dict, N), BATCH_SIZE)
first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64')
second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64')
third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64')
forth_word = fluid.layers.data(name='fourthw', shape=[1], dtype='int64')
next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
first_word = fluid.data(name='firstw', shape=[None, 1], dtype='int64')
second_word = fluid.data(name='secondw', shape=[None, 1], dtype='int64')
third_word = fluid.data(name='thirdw', shape=[None, 1], dtype='int64')
forth_word = fluid.data(name='fourthw', shape=[None, 1], dtype='int64')
next_word = fluid.data(name='nextw', shape=[None, 1], dtype='int64')
word_list = [first_word, second_word, third_word, forth_word, next_word]
feed_order = ['firstw', 'secondw', 'thirdw', 'fourthw', 'nextw']
......
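The inference_program hunk is truncated after the fourth embedding. The elided tail of the function typically concatenates the four context embeddings, applies a hidden layer, and predicts the next word; the sketch below is a reconstruction under that assumption, with `EMBED_SIZE`, `HIDDEN_SIZE`, and `dict_size` as placeholders for the tutorial's own constants.

```python
import paddle.fluid as fluid

EMBED_SIZE = 32     # placeholder values; the tutorial defines its own constants
HIDDEN_SIZE = 256
dict_size = 10000   # placeholder for len(word_dict)

def ngram_tail(embed_first, embed_second, embed_third, embed_fourth):
    # concatenate the four context-word embeddings, pass them through a hidden
    # layer, and emit a softmax distribution over the vocabulary for the next word
    concat_embed = fluid.layers.concat(
        input=[embed_first, embed_second, embed_third, embed_fourth], axis=1)
    hidden1 = fluid.layers.fc(input=concat_embed, size=HIDDEN_SIZE, act='sigmoid')
    predict_word = fluid.layers.fc(input=hidden1, size=dict_size, act='softmax')
    return predict_word
```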
......@@ -304,32 +304,32 @@ dict_size = len(word_dict)
```
A larger `BATCH_SIZE` makes training converge faster, but it also consumes more memory. Since computing the word embeddings is expensive, enable GPU training if your environment allows it, to get results faster.
Unlike the earlier PaddlePaddle v2, in the new Fluid version we no longer need to compute word embeddings by hand. PaddlePaddle provides the built-in method `fluid.layers.embedding`, which we can use directly to build the N-gram neural network.
Unlike the earlier PaddlePaddle v2, in the new Fluid version we no longer need to compute word embeddings by hand. PaddlePaddle provides the built-in method `fluid.embedding`, which we can use directly to build the N-gram neural network.
- Let's define the N-gram network structure, which is used in both training and prediction. Because the word vectors are sparse, we pass `is_sparse == True` to speed up updates of the sparse parameters.
```python
def inference_program(words, is_sparse):
embed_first = fluid.layers.embedding(
embed_first = fluid.embedding(
input=words[0],
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=is_sparse,
param_attr='shared_w')
embed_second = fluid.layers.embedding(
embed_second = fluid.embedding(
input=words[1],
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=is_sparse,
param_attr='shared_w')
embed_third = fluid.layers.embedding(
embed_third = fluid.embedding(
input=words[2],
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=is_sparse,
param_attr='shared_w')
embed_fourth = fluid.layers.embedding(
embed_fourth = fluid.embedding(
input=words[3],
size=[dict_size, EMBED_SIZE],
dtype='float32',
......@@ -352,7 +352,7 @@ def train_program(predict_word):
# 'next_word'的定义必须要在inference_program的声明之后,
# 否则train program输入数据的顺序就变成了[next_word, firstw, secondw,
# thirdw, fourthw], 这是不正确的.
next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
next_word = fluid.data(name='nextw', shape=[None, 1], dtype='int64')
cost = fluid.layers.cross_entropy(input=predict_word, label=next_word)
avg_cost = fluid.layers.mean(cost)
return avg_cost
......@@ -377,11 +377,11 @@ def train(if_use_cuda, params_dirname, is_sparse=True):
test_reader = paddle.batch(
paddle.dataset.imikolov.test(word_dict, N), BATCH_SIZE)
first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64')
second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64')
third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64')
forth_word = fluid.layers.data(name='fourthw', shape=[1], dtype='int64')
next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
first_word = fluid.data(name='firstw', shape=[None, 1], dtype='int64')
second_word = fluid.data(name='secondw', shape=[None, 1], dtype='int64')
third_word = fluid.data(name='thirdw', shape=[None, 1], dtype='int64')
forth_word = fluid.data(name='fourthw', shape=[None, 1], dtype='int64')
next_word = fluid.data(name='nextw', shape=[None, 1], dtype='int64')
word_list = [first_word, second_word, third_word, forth_word, next_word]
feed_order = ['firstw', 'secondw', 'thirdw', 'fourthw', 'nextw']
......
......@@ -269,32 +269,32 @@ dict_size = len(word_dict)
```
A larger `BATCH_SIZE` makes training converge faster, but it also consumes more memory. Since computing the word embeddings is expensive, enable GPU training if your environment allows it, to get results faster.
Unlike the previous PaddlePaddle v2 version, in the new Fluid version we don't have to compute the word embeddings manually. PaddlePaddle provides a built-in method, `fluid.layers.embedding`, which we can use directly to construct an N-gram neural network.
Unlike the previous PaddlePaddle v2 version, in the new Fluid version we don't have to compute the word embeddings manually. PaddlePaddle provides a built-in method, `fluid.embedding`, which we can use directly to construct an N-gram neural network.
- Let's define our N-gram network structure, which is used in both training and prediction. Because the word vectors are sparse, we pass `is_sparse == True` to speed up updates of the sparse parameters.
```python
def inference_program(words, is_sparse):
embed_first = fluid.layers.embedding(
embed_first = fluid.embedding(
input=words[0],
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=is_sparse,
param_attr='shared_w')
embed_second = fluid.layers.embedding(
embed_second = fluid.embedding(
input=words[1],
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=is_sparse,
param_attr='shared_w')
embed_third = fluid.layers.embedding(
embed_third = fluid.embedding(
input=words[2],
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=is_sparse,
param_attr='shared_w')
embed_fourth = fluid.layers.embedding(
embed_fourth = fluid.embedding(
input=words[3],
size=[dict_size, EMBED_SIZE],
dtype='float32',
......@@ -317,7 +317,7 @@ def train_program(predict_word):
# The definition of 'next_word' must come after the declaration of inference_program;
# otherwise the order of the train program's input data would become [next_word, firstw, secondw,
# thirdw, fourthw], which is incorrect.
next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
next_word = fluid.data(name='nextw', shape=[None, 1], dtype='int64')
cost = fluid.layers.cross_entropy(input=predict_word, label=next_word)
avg_cost = fluid.layers.mean(cost)
return avg_cost
......@@ -342,11 +342,11 @@ def train(if_use_cuda, params_dirname, is_sparse=True):
test_reader = paddle.batch(
paddle.dataset.imikolov.test(word_dict, N), BATCH_SIZE)
first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64')
second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64')
third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64')
forth_word = fluid.layers.data(name='fourthw', shape=[1], dtype='int64')
next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
first_word = fluid.data(name='firstw', shape=[None, 1], dtype='int64')
second_word = fluid.data(name='secondw', shape=[None, 1], dtype='int64')
third_word = fluid.data(name='thirdw', shape=[None, 1], dtype='int64')
forth_word = fluid.data(name='fourthw', shape=[None, 1], dtype='int64')
next_word = fluid.data(name='nextw', shape=[None, 1], dtype='int64')
word_list = [first_word, second_word, third_word, forth_word, next_word]
feed_order = ['firstw', 'secondw', 'thirdw', 'fourthw', 'nextw']
......
......@@ -45,25 +45,25 @@ def parse_args():
def inference_program(words, is_sparse):
embed_first = fluid.layers.embedding(
embed_first = fluid.embedding(
input=words[0],
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=is_sparse,
param_attr='shared_w')
embed_second = fluid.layers.embedding(
embed_second = fluid.embedding(
input=words[1],
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=is_sparse,
param_attr='shared_w')
embed_third = fluid.layers.embedding(
embed_third = fluid.embedding(
input=words[2],
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=is_sparse,
param_attr='shared_w')
embed_fourth = fluid.layers.embedding(
embed_fourth = fluid.embedding(
input=words[3],
size=[dict_size, EMBED_SIZE],
dtype='float32',
......@@ -82,7 +82,7 @@ def train_program(predict_word):
# The declaration of 'next_word' must come after the invocation of inference_program,
# or the input data order of the train program would become [next_word, firstw, secondw,
# thirdw, fourthw], which is not correct.
next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
next_word = fluid.data(name='nextw', shape=[None, 1], dtype='int64')
cost = fluid.layers.cross_entropy(input=predict_word, label=next_word)
avg_cost = fluid.layers.mean(cost)
return avg_cost
......@@ -102,11 +102,11 @@ def train(if_use_cuda, params_dirname, is_sparse=True):
test_reader = paddle.batch(
paddle.dataset.imikolov.test(word_dict, N), BATCH_SIZE)
first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64')
second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64')
third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64')
forth_word = fluid.layers.data(name='fourthw', shape=[1], dtype='int64')
next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
first_word = fluid.data(name='firstw', shape=[None, 1], dtype='int64')
second_word = fluid.data(name='secondw', shape=[None, 1], dtype='int64')
third_word = fluid.data(name='thirdw', shape=[None, 1], dtype='int64')
forth_word = fluid.data(name='fourthw', shape=[None, 1], dtype='int64')
next_word = fluid.data(name='nextw', shape=[None, 1], dtype='int64')
word_list = [first_word, second_word, third_word, forth_word, next_word]
feed_order = ['firstw', 'secondw', 'thirdw', 'fourthw', 'nextw']
......
......@@ -218,9 +218,8 @@ def stacked_lstm_net(data, input_dim, class_dim, emb_dim, hid_dim, stacked_num):
```python
def inference_program(word_dict):
data = fluid.layers.data(
name="words", shape=[1], dtype="int64", lod_level=1)
data = fluid.data(
name="words", shape=[None], dtype="int64", lod_level=1)
dict_dim = len(word_dict)
net = convolution_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM)
# net = stacked_lstm_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM, STACKED_NUM)
......@@ -235,7 +234,7 @@ def inference_program(word_dict):
```python
def train_program(prediction):
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
label = fluid.data(name="label", shape=[None, 1], dtype="int64")
cost = fluid.layers.cross_entropy(input=prediction, label=label)
avg_cost = fluid.layers.mean(cost)
accuracy = fluid.layers.accuracy(input=prediction, label=label)
......
......@@ -207,9 +207,8 @@ Next we define the prediction program (`inference_program`). We use `convolution
```python
def inference_program(word_dict):
data = fluid.layers.data(
name="words", shape=[1], dtype="int64", lod_level=1)
data = fluid.data(
name="words", shape=[None], dtype="int64", lod_level=1)
dict_dim = len(word_dict)
net = convolution_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM)
# net = stacked_lstm_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM, STACKED_NUM)
......@@ -224,7 +223,7 @@ During the testing, the classifier calculates the probability of each output. Th
```python
def train_program(prediction):
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
label = fluid.data(name="label", shape=[None, 1], dtype="int64")
cost = fluid.layers.cross_entropy(input=prediction, label=label)
avg_cost = fluid.layers.mean(cost)
accuracy = fluid.layers.accuracy(input=prediction, label=label)
......
......@@ -260,9 +260,8 @@ def stacked_lstm_net(data, input_dim, class_dim, emb_dim, hid_dim, stacked_num):
```python
def inference_program(word_dict):
data = fluid.layers.data(
name="words", shape=[1], dtype="int64", lod_level=1)
data = fluid.data(
name="words", shape=[None], dtype="int64", lod_level=1)
dict_dim = len(word_dict)
net = convolution_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM)
# net = stacked_lstm_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM, STACKED_NUM)
......@@ -277,7 +276,7 @@ def inference_program(word_dict):
```python
def train_program(prediction):
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
label = fluid.data(name="label", shape=[None, 1], dtype="int64")
cost = fluid.layers.cross_entropy(input=prediction, label=label)
avg_cost = fluid.layers.mean(cost)
accuracy = fluid.layers.accuracy(input=prediction, label=label)
......
......@@ -249,9 +249,8 @@ Next we define the prediction program (`inference_program`). We use `convolution
```python
def inference_program(word_dict):
data = fluid.layers.data(
name="words", shape=[1], dtype="int64", lod_level=1)
data = fluid.data(
name="words", shape=[None], dtype="int64", lod_level=1)
dict_dim = len(word_dict)
net = convolution_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM)
# net = stacked_lstm_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM, STACKED_NUM)
......@@ -266,7 +265,7 @@ During the testing, the classifier calculates the probability of each output. Th
```python
def train_program(prediction):
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
label = fluid.data(name="label", shape=[None, 1], dtype="int64")
cost = fluid.layers.cross_entropy(input=prediction, label=label)
avg_cost = fluid.layers.mean(cost)
accuracy = fluid.layers.accuracy(input=prediction, label=label)
......
......@@ -62,16 +62,14 @@ def convolution_net(data, input_dim, class_dim, emb_dim, hid_dim):
def inference_program(word_dict):
data = fluid.layers.data(
name="words", shape=[1], dtype="int64", lod_level=1)
dict_dim = len(word_dict)
data = fluid.data(name="words", shape=[None], dtype="int64", lod_level=1)
net = convolution_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM)
return net
def train_program(prediction):
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
label = fluid.data(name="label", shape=[None, 1], dtype="int64")
cost = fluid.layers.cross_entropy(input=prediction, label=label)
avg_cost = fluid.layers.mean(cost)
accuracy = fluid.layers.accuracy(input=prediction, label=label)
......
......@@ -54,16 +54,14 @@ def dynamic_rnn_lstm(data, input_dim, class_dim, emb_dim, lstm_size):
def inference_program(word_dict):
data = fluid.layers.data(
name="words", shape=[1], dtype="int64", lod_level=1)
data = fluid.data(name="words", shape=[None], dtype="int64", lod_level=1)
dict_dim = len(word_dict)
pred = dynamic_rnn_lstm(data, dict_dim, CLASS_DIM, EMB_DIM, LSTM_SIZE)
return pred
def train_program(prediction):
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
label = fluid.data(name="label", shape=[None, 1], dtype="int64")
cost = fluid.layers.cross_entropy(input=prediction, label=label)
avg_cost = fluid.layers.mean(cost)
accuracy = fluid.layers.accuracy(input=prediction, label=label)
......
......@@ -69,9 +69,7 @@ def stacked_lstm_net(data, input_dim, class_dim, emb_dim, hid_dim, stacked_num):
def inference_program(word_dict):
data = fluid.layers.data(
name="words", shape=[1], dtype="int64", lod_level=1)
data = fluid.data(name="words", shape=[None], dtype="int64", lod_level=1)
dict_dim = len(word_dict)
net = stacked_lstm_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM,
STACKED_NUM)
......@@ -80,7 +78,7 @@ def inference_program(word_dict):
def train_program(prediction):
# prediction = inference_program(word_dict)
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
label = fluid.data(name="label", shape=[None, 1], dtype="int64")
cost = fluid.layers.cross_entropy(input=prediction, label=label)
avg_cost = fluid.layers.mean(cost)
accuracy = fluid.layers.accuracy(input=prediction, label=label)
......
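The sentiment hunks above call `convolution_net`, but its body is not part of this diff. A minimal sketch of a text-CNN classifier in the same style, assuming it is built from `fluid.embedding` and `fluid.nets.sequence_conv_pool` (the filter sizes and pooling type are assumptions, not taken from this commit):

```python
import paddle.fluid as fluid

def convolution_net(data, input_dim, class_dim, emb_dim, hid_dim):
    # look up word embeddings for the variable-length word-id sequence `data`
    emb = fluid.embedding(input=data, size=[input_dim, emb_dim], is_sparse=True)
    # two convolution + sequence-pooling branches with different filter sizes
    conv_3 = fluid.nets.sequence_conv_pool(
        input=emb, num_filters=hid_dim, filter_size=3, act="tanh", pool_type="sqrt")
    conv_4 = fluid.nets.sequence_conv_pool(
        input=emb, num_filters=hid_dim, filter_size=4, act="tanh", pool_type="sqrt")
    # softmax over the sentiment classes
    return fluid.layers.fc(input=[conv_3, conv_4], size=class_dim, act="softmax")
```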
......@@ -270,42 +270,42 @@ is_local = True
```python
# 句子序列
word = fluid.layers.data(
name='word_data', shape=[1], dtype='int64', lod_level=1)
word = fluid.data(
name='word_data', shape=[None, 1], dtype='int64', lod_level=1)
# 谓词
predicate = fluid.layers.data(
name='verb_data', shape=[1], dtype='int64', lod_level=1)
predicate = fluid.data(
name='verb_data', shape=[None, 1], dtype='int64', lod_level=1)
# 谓词上下文5个特征
ctx_n2 = fluid.layers.data(
name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
ctx_n1 = fluid.layers.data(
name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
ctx_0 = fluid.layers.data(
name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
ctx_p1 = fluid.layers.data(
name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
ctx_p2 = fluid.layers.data(
name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
ctx_n2 = fluid.data(
name='ctx_n2_data', shape=[None, 1], dtype='int64', lod_level=1)
ctx_n1 = fluid.data(
name='ctx_n1_data', shape=[None, 1], dtype='int64', lod_level=1)
ctx_0 = fluid.data(
name='ctx_0_data', shape=[None, 1], dtype='int64', lod_level=1)
ctx_p1 = fluid.data(
name='ctx_p1_data', shape=[None, 1], dtype='int64', lod_level=1)
ctx_p2 = fluid.data(
name='ctx_p2_data', shape=[None, 1], dtype='int64', lod_level=1)
# 谓词上下区域标志
mark = fluid.layers.data(
name='mark_data', shape=[1], dtype='int64', lod_level=1)
mark = fluid.data(
name='mark_data', shape=[None, 1], dtype='int64', lod_level=1)
```
### 定义网络结构
首先预训练并定义模型输入层
```python
#预训练谓词和谓词上下区域标志
predicate_embedding = fluid.layers.embedding(
predicate_embedding = fluid.embedding(
input=predicate,
size=[pred_dict_len, word_dim],
dtype='float32',
is_sparse=IS_SPARSE,
param_attr='vemb')
mark_embedding = fluid.layers.embedding(
mark_embedding = fluid.embedding(
input=mark,
size=[mark_dict_len, mark_dim],
dtype='float32',
......@@ -316,7 +316,7 @@ word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
# 因词向量是预训练好的,这里不再训练embedding表,
# 参数属性trainable设置成False阻止了embedding表在训练过程中被更新
emb_layers = [
fluid.layers.embedding(
fluid.embedding(
size=[word_dict_len, word_dim],
input=x,
param_attr=fluid.ParamAttr(
......@@ -374,8 +374,8 @@ feature_out = fluid.layers.sums(input=[
])
# 标注序列
target = fluid.layers.data(
name='target', shape=[1], dtype='int64', lod_level=1)
target = fluid.data(
name='target', shape=[None, 1], dtype='int64', lod_level=1)
# 学习 CRF 的转移特征
crf_cost = fluid.layers.linear_chain_crf(
......
......@@ -252,42 +252,42 @@ Defines the format of the model input features, including the sentence sequence,
```python
# Sentence sequences
word = fluid.layers.data(
name='word_data', shape=[1], dtype='int64', lod_level=1)
word = fluid.data(
name='word_data', shape=[None, 1], dtype='int64', lod_level=1)
# predicate
predicate = fluid.layers.data(
name='verb_data', shape=[1], dtype='int64', lod_level=1)
predicate = fluid.data(
name='verb_data', shape=[None, 1], dtype='int64', lod_level=1)
# predicate context's 5 features
ctx_n2 = fluid.layers.data(
name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
ctx_n1 = fluid.layers.data(
name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
ctx_0 = fluid.layers.data(
name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
ctx_p1 = fluid.layers.data(
name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
ctx_p2 = fluid.layers.data(
name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
ctx_n2 = fluid.data(
name='ctx_n2_data', shape=[None, 1], dtype='int64', lod_level=1)
ctx_n1 = fluid.data(
name='ctx_n1_data', shape=[None, 1], dtype='int64', lod_level=1)
ctx_0 = fluid.data(
name='ctx_0_data', shape=[None, 1], dtype='int64', lod_level=1)
ctx_p1 = fluid.data(
name='ctx_p1_data', shape=[None, 1], dtype='int64', lod_level=1)
ctx_p2 = fluid.data(
name='ctx_p2_data', shape=[None, 1], dtype='int64', lod_level=1)
# Predicate context area flag
mark = fluid.layers.data(
name='mark_data', shape=[1], dtype='int64', lod_level=1)
mark = fluid.data(
name='mark_data', shape=[None, 1], dtype='int64', lod_level=1)
```
### Defining the network structure
First pre-train and define the model input layer
```python
#pre-training predicate and predicate context area flags
predicate_embedding = fluid.layers.embedding(
predicate_embedding = fluid.embedding(
input=predicate,
size=[pred_dict_len, word_dim],
dtype='float32',
is_sparse=IS_SPARSE,
param_attr='vemb')
mark_embedding = fluid.layers.embedding(
mark_embedding = fluid.embedding(
input=mark,
size=[mark_dict_len, mark_dim],
dtype='float32',
......@@ -298,7 +298,7 @@ word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
# Because the word vectors are pre-trained, the embedding table is not trained again here;
# setting the parameter attribute trainable to False prevents the embedding table from being updated during training
emb_layers = [
fluid.layers.embedding(
fluid.embedding(
size=[word_dict_len, word_dim],
input=x,
param_attr=fluid.ParamAttr(
......@@ -356,7 +356,7 @@ feature_out = fluid.layers.sums(input=[
])
# tag/label sequence
target = fluid.layers.data(
target = fluid.data(
    name='target', shape=[None, 1], dtype='int64', lod_level=1)
# Learning CRF transfer features
......
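The hunk above ends at the CRF cost. For context, this step of the tutorial builds the CRF training objective and the corresponding decoder; the sketch below assumes `feature_out` and `target` from the code above, and `mix_hidden_lr` is an illustrative learning rate for the transition parameters, not taken from this commit.

```python
import paddle.fluid as fluid

mix_hidden_lr = 1e-3  # illustrative value

# CRF negative log-likelihood cost over the emission scores and the gold tag sequence
crf_cost = fluid.layers.linear_chain_crf(
    input=feature_out,
    label=target,
    param_attr=fluid.ParamAttr(name='crfw', learning_rate=mix_hidden_lr))
avg_cost = fluid.layers.mean(crf_cost)

# at inference time, decode the most likely tag sequence with the same weights
crf_decode = fluid.layers.crf_decoding(
    input=feature_out, param_attr=fluid.ParamAttr(name='crfw'))
```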
......@@ -312,42 +312,42 @@ is_local = True
```python
# 句子序列
word = fluid.layers.data(
name='word_data', shape=[1], dtype='int64', lod_level=1)
word = fluid.data(
name='word_data', shape=[None, 1], dtype='int64', lod_level=1)
# 谓词
predicate = fluid.layers.data(
name='verb_data', shape=[1], dtype='int64', lod_level=1)
predicate = fluid.data(
name='verb_data', shape=[None, 1], dtype='int64', lod_level=1)
# 谓词上下文5个特征
ctx_n2 = fluid.layers.data(
name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
ctx_n1 = fluid.layers.data(
name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
ctx_0 = fluid.layers.data(
name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
ctx_p1 = fluid.layers.data(
name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
ctx_p2 = fluid.layers.data(
name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
ctx_n2 = fluid.data(
name='ctx_n2_data', shape=[None, 1], dtype='int64', lod_level=1)
ctx_n1 = fluid.data(
name='ctx_n1_data', shape=[None, 1], dtype='int64', lod_level=1)
ctx_0 = fluid.data(
name='ctx_0_data', shape=[None, 1], dtype='int64', lod_level=1)
ctx_p1 = fluid.data(
name='ctx_p1_data', shape=[None, 1], dtype='int64', lod_level=1)
ctx_p2 = fluid.data(
name='ctx_p2_data', shape=[None, 1], dtype='int64', lod_level=1)
# 谓词上下区域标志
mark = fluid.layers.data(
name='mark_data', shape=[1], dtype='int64', lod_level=1)
mark = fluid.data(
name='mark_data', shape=[None, 1], dtype='int64', lod_level=1)
```
### 定义网络结构
首先预训练并定义模型输入层
```python
#预训练谓词和谓词上下区域标志
predicate_embedding = fluid.layers.embedding(
predicate_embedding = fluid.embedding(
input=predicate,
size=[pred_dict_len, word_dim],
dtype='float32',
is_sparse=IS_SPARSE,
param_attr='vemb')
mark_embedding = fluid.layers.embedding(
mark_embedding = fluid.embedding(
input=mark,
size=[mark_dict_len, mark_dim],
dtype='float32',
......@@ -358,7 +358,7 @@ word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
# 因词向量是预训练好的,这里不再训练embedding表,
# 参数属性trainable设置成False阻止了embedding表在训练过程中被更新
emb_layers = [
fluid.layers.embedding(
fluid.embedding(
size=[word_dict_len, word_dim],
input=x,
param_attr=fluid.ParamAttr(
......@@ -416,8 +416,8 @@ feature_out = fluid.layers.sums(input=[
])
# 标注序列
target = fluid.layers.data(
name='target', shape=[1], dtype='int64', lod_level=1)
target = fluid.data(
name='target', shape=[None, 1], dtype='int64', lod_level=1)
# 学习 CRF 的转移特征
crf_cost = fluid.layers.linear_chain_crf(
......
......@@ -294,42 +294,42 @@ Defines the format of the model input features, including the sentence sequence,
```python
# Sentence sequences
word = fluid.layers.data(
name='word_data', shape=[1], dtype='int64', lod_level=1)
word = fluid.data(
name='word_data', shape=[None, 1], dtype='int64', lod_level=1)
# predicate
predicate = fluid.layers.data(
name='verb_data', shape=[1], dtype='int64', lod_level=1)
predicate = fluid.data(
name='verb_data', shape=[None, 1], dtype='int64', lod_level=1)
# predicate context's 5 features
ctx_n2 = fluid.layers.data(
name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
ctx_n1 = fluid.layers.data(
name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
ctx_0 = fluid.layers.data(
name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
ctx_p1 = fluid.layers.data(
name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
ctx_p2 = fluid.layers.data(
name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
ctx_n2 = fluid.data(
name='ctx_n2_data', shape=[None, 1], dtype='int64', lod_level=1)
ctx_n1 = fluid.data(
name='ctx_n1_data', shape=[None, 1], dtype='int64', lod_level=1)
ctx_0 = fluid.data(
name='ctx_0_data', shape=[None, 1], dtype='int64', lod_level=1)
ctx_p1 = fluid.data(
name='ctx_p1_data', shape=[None, 1], dtype='int64', lod_level=1)
ctx_p2 = fluid.data(
name='ctx_p2_data', shape=[None, 1], dtype='int64', lod_level=1)
# Predicate context area flag
mark = fluid.layers.data(
name='mark_data', shape=[1], dtype='int64', lod_level=1)
mark = fluid.data(
name='mark_data', shape=[None, 1], dtype='int64', lod_level=1)
```
### Defining the network structure
First pre-train and define the model input layer
```python
#pre-training predicate and predicate context area flags
predicate_embedding = fluid.layers.embedding(
predicate_embedding = fluid.embedding(
input=predicate,
size=[pred_dict_len, word_dim],
dtype='float32',
is_sparse=IS_SPARSE,
param_attr='vemb')
mark_embedding = fluid.layers.embedding(
mark_embedding = fluid.embedding(
input=mark,
size=[mark_dict_len, mark_dim],
dtype='float32',
......@@ -340,7 +340,7 @@ word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
# Because the word vectors are pre-trained, the embedding table is not trained again here;
# setting the parameter attribute trainable to False prevents the embedding table from being updated during training
emb_layers = [
fluid.layers.embedding(
fluid.embedding(
size=[word_dict_len, word_dim],
input=x,
param_attr=fluid.ParamAttr(
......@@ -398,7 +398,7 @@ feature_out = fluid.layers.sums(input=[
])
# tag/label sequence
target = fluid.layers.data(
target = fluid.data(
    name='target', shape=[None, 1], dtype='int64', lod_level=1)
# Learning CRF transfer features
......
......@@ -53,14 +53,14 @@ def load_parameter(file_name, h, w):
def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
**ignored):
# 8 features
predicate_embedding = fluid.layers.embedding(
predicate_embedding = fluid.embedding(
input=predicate,
size=[pred_dict_len, word_dim],
dtype='float32',
is_sparse=IS_SPARSE,
param_attr='vemb')
mark_embedding = fluid.layers.embedding(
mark_embedding = fluid.embedding(
input=mark,
size=[mark_dict_len, mark_dim],
dtype='float32',
......@@ -68,7 +68,7 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
emb_layers = [
fluid.layers.embedding(
fluid.embedding(
size=[word_dict_len, word_dim],
input=x,
param_attr=fluid.ParamAttr(name=embedding_name, trainable=False))
......@@ -120,22 +120,22 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
def train(use_cuda, save_dirname=None, is_local=True):
# define data layers
word = fluid.layers.data(
name='word_data', shape=[1], dtype='int64', lod_level=1)
predicate = fluid.layers.data(
name='verb_data', shape=[1], dtype='int64', lod_level=1)
ctx_n2 = fluid.layers.data(
name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
ctx_n1 = fluid.layers.data(
name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
ctx_0 = fluid.layers.data(
name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
ctx_p1 = fluid.layers.data(
name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
ctx_p2 = fluid.layers.data(
name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
mark = fluid.layers.data(
name='mark_data', shape=[1], dtype='int64', lod_level=1)
word = fluid.data(
name='word_data', shape=[None, 1], dtype='int64', lod_level=1)
predicate = fluid.data(
name='verb_data', shape=[None, 1], dtype='int64', lod_level=1)
ctx_n2 = fluid.data(
name='ctx_n2_data', shape=[None, 1], dtype='int64', lod_level=1)
ctx_n1 = fluid.data(
name='ctx_n1_data', shape=[None, 1], dtype='int64', lod_level=1)
ctx_0 = fluid.data(
name='ctx_0_data', shape=[None, 1], dtype='int64', lod_level=1)
ctx_p1 = fluid.data(
name='ctx_p1_data', shape=[None, 1], dtype='int64', lod_level=1)
ctx_p2 = fluid.data(
name='ctx_p2_data', shape=[None, 1], dtype='int64', lod_level=1)
mark = fluid.data(
name='mark_data', shape=[None, 1], dtype='int64', lod_level=1)
if args.enable_ce:
fluid.default_startup_program().random_seed = 90
......
This diff is collapsed.
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import six
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
dict_size = 30000
source_dict_size = target_dict_size = dict_size
bos_id = 0
eos_id = 1
word_dim = 512
hidden_dim = 512
decoder_size = hidden_dim
max_length = 256
beam_size = 4
batch_size = 64
model_save_dir = "machine_translation.inference.model"
class DecoderCell(layers.RNNCell):
"""Additive Attention followed by GRU"""
def __init__(self, hidden_size):
self.hidden_size = hidden_size
self.gru_cell = layers.GRUCell(hidden_size)
def attention(self, hidden, encoder_output, encoder_output_proj,
encoder_padding_mask):
decoder_state_proj = layers.unsqueeze(
layers.fc(hidden, size=self.hidden_size, bias_attr=False), [1])
mixed_state = fluid.layers.elementwise_add(
encoder_output_proj,
layers.expand(decoder_state_proj,
[1, layers.shape(decoder_state_proj)[1], 1]))
# attn_scores: [batch_size, src_seq_len]
attn_scores = layers.squeeze(
layers.fc(
input=mixed_state, size=1, num_flatten_dims=2, bias_attr=False),
[2])
if encoder_padding_mask is not None:
attn_scores = layers.elementwise_add(attn_scores,
encoder_padding_mask)
attn_scores = layers.softmax(attn_scores)
context = layers.reduce_sum(
layers.elementwise_mul(encoder_output, attn_scores, axis=0), dim=1)
return context
def call(self,
step_input,
hidden,
encoder_output,
encoder_output_proj,
encoder_padding_mask=None):
context = self.attention(hidden, encoder_output, encoder_output_proj,
encoder_padding_mask)
step_input = layers.concat([step_input, context], axis=1)
output, new_hidden = self.gru_cell(step_input, hidden)
return output, new_hidden
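# data_func (below) declares the padded feed variables: source ids and source
# lengths, plus target ids, target lengths and labels when training. It wraps
# them in a DataLoader fed by the batch generator defined further down.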
def data_func(is_train=True):
"""data inputs and data loader"""
src = fluid.data(name="src", shape=[None, None], dtype="int64")
src_sequence_length = fluid.data(
name="src_sequence_length", shape=[None], dtype="int64")
inputs = [src, src_sequence_length]
if is_train:
trg = fluid.data(name="trg", shape=[None, None], dtype="int64")
trg_sequence_length = fluid.data(
name="trg_sequence_length", shape=[None], dtype="int64")
label = fluid.data(name="label", shape=[None, None], dtype="int64")
inputs += [trg, trg_sequence_length, label]
loader = fluid.io.DataLoader.from_generator(
feed_list=inputs, capacity=10, iterable=True, use_double_buffer=True)
return inputs, loader
def encoder(src_embedding, src_sequence_length):
"""Encoder: Bidirectional GRU"""
encoder_fwd_cell = layers.GRUCell(hidden_size=hidden_dim)
encoder_fwd_output, fwd_state = layers.rnn(
cell=encoder_fwd_cell,
inputs=src_embedding,
sequence_length=src_sequence_length,
time_major=False,
is_reverse=False)
encoder_bwd_cell = layers.GRUCell(hidden_size=hidden_dim)
encoder_bwd_output, bwd_state = layers.rnn(
cell=encoder_bwd_cell,
inputs=src_embedding,
sequence_length=src_sequence_length,
time_major=False,
is_reverse=True)
encoder_output = layers.concat(
input=[encoder_fwd_output, encoder_bwd_output], axis=2)
encoder_state = layers.concat(input=[fwd_state, bwd_state], axis=1)
return encoder_output, encoder_state
def decoder(encoder_output,
encoder_output_proj,
encoder_state,
encoder_padding_mask,
trg=None,
is_train=True):
"""Decoder: GRU with Attention"""
decoder_cell = DecoderCell(hidden_size=decoder_size)
decoder_initial_states = layers.fc(
encoder_state, size=decoder_size, act="tanh")
trg_embeder = lambda x: fluid.embedding(input=x,
size=[target_dict_size, hidden_dim],
dtype="float32",
param_attr=fluid.ParamAttr(
name="trg_emb_table"))
output_layer = lambda x: layers.fc(x,
size=target_dict_size,
num_flatten_dims=len(x.shape) - 1,
param_attr=fluid.ParamAttr(name=
"output_w"))
if is_train:
decoder_output, _ = layers.rnn(
cell=decoder_cell,
inputs=trg_embeder(trg),
initial_states=decoder_initial_states,
time_major=False,
encoder_output=encoder_output,
encoder_output_proj=encoder_output_proj,
encoder_padding_mask=encoder_padding_mask)
decoder_output = output_layer(decoder_output)
else:
encoder_output = layers.BeamSearchDecoder.tile_beam_merge_with_batch(
encoder_output, beam_size)
encoder_output_proj = layers.BeamSearchDecoder.tile_beam_merge_with_batch(
encoder_output_proj, beam_size)
encoder_padding_mask = layers.BeamSearchDecoder.tile_beam_merge_with_batch(
encoder_padding_mask, beam_size)
beam_search_decoder = layers.BeamSearchDecoder(
cell=decoder_cell,
start_token=bos_id,
end_token=eos_id,
beam_size=beam_size,
embedding_fn=trg_embeder,
output_fn=output_layer)
decoder_output, _ = layers.dynamic_decode(
decoder=beam_search_decoder,
inits=decoder_initial_states,
max_step_num=max_length,
output_time_major=False,
encoder_output=encoder_output,
encoder_output_proj=encoder_output_proj,
encoder_padding_mask=encoder_padding_mask)
return decoder_output
def model_func(inputs, is_train=True):
src = inputs[0]
src_sequence_length = inputs[1]
# source embedding
src_embeder = lambda x: fluid.embedding(
input=x,
size=[source_dict_size, hidden_dim],
dtype="float32",
param_attr=fluid.ParamAttr(name="src_emb_table"))
src_embedding = src_embeder(src)
# encoder
encoder_output, encoder_state = encoder(src_embedding, src_sequence_length)
encoder_output_proj = layers.fc(
input=encoder_output,
size=decoder_size,
num_flatten_dims=2,
bias_attr=False)
src_mask = layers.sequence_mask(
src_sequence_length, maxlen=layers.shape(src)[1], dtype="float32")
encoder_padding_mask = (src_mask - 1.0) * 1e9
trg = inputs[2] if is_train else None
# decoder
output = decoder(
encoder_output=encoder_output,
encoder_output_proj=encoder_output_proj,
encoder_state=encoder_state,
encoder_padding_mask=encoder_padding_mask,
trg=trg,
is_train=is_train)
return output
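# loss_func (below) computes token-level cross entropy on the decoder logits and
# masks out padded positions with sequence_mask, so the cost is averaged over
# real target tokens only.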
def loss_func(logits, label, trg_sequence_length):
probs = layers.softmax(logits)
loss = layers.cross_entropy(input=probs, label=label)
trg_mask = layers.sequence_mask(
trg_sequence_length, maxlen=layers.shape(logits)[1], dtype="float32")
avg_cost = layers.reduce_sum(loss * trg_mask) / layers.reduce_sum(trg_mask)
return avg_cost
def optimizer_func():
fluid.clip.set_gradient_clip(clip=fluid.clip.GradientClipByGlobalNorm(
clip_norm=5.0))
lr_decay = fluid.layers.learning_rate_scheduler.noam_decay(hidden_dim, 1000)
return fluid.optimizer.Adam(
learning_rate=lr_decay,
regularization=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4))
def inputs_generator(batch_size, pad_id, is_train=True):
data_generator = fluid.io.shuffle(
paddle.dataset.wmt16.train(source_dict_size, target_dict_size),
buf_size=10000) if is_train else paddle.dataset.wmt16.test(
source_dict_size, target_dict_size)
batch_generator = fluid.io.batch(data_generator, batch_size=batch_size)
def _pad_batch_data(insts, pad_id):
seq_lengths = np.array(list(map(len, insts)), dtype="int64")
max_len = max(seq_lengths)
pad_data = np.array(
[inst + [pad_id] * (max_len - len(inst)) for inst in insts],
dtype="int64")
return pad_data, seq_lengths
def _generator():
for batch in batch_generator():
batch_src = [ins[0] for ins in batch]
src_data, src_lengths = _pad_batch_data(batch_src, pad_id)
inputs = [src_data, src_lengths]
if is_train:
batch_trg = [ins[1] for ins in batch]
trg_data, trg_lengths = _pad_batch_data(batch_trg, pad_id)
batch_lbl = [ins[2] for ins in batch]
lbl_data, _ = _pad_batch_data(batch_lbl, pad_id)
inputs += [trg_data, trg_lengths, lbl_data]
yield inputs
return _generator
def train(use_cuda):
# define program
train_prog = fluid.Program()
startup_prog = fluid.Program()
with fluid.program_guard(train_prog, startup_prog):
with fluid.unique_name.guard():
# For training:
# inputs = [src, src_sequence_length, trg, trg_sequence_length, label]
inputs, loader = data_func(is_train=True)
logits = model_func(inputs, is_train=True)
loss = loss_func(logits, inputs[-1], inputs[-2])
optimizer = optimizer_func()
optimizer.minimize(loss)
# define data source
places = fluid.cuda_places() if use_cuda else fluid.cpu_places()
loader.set_batch_generator(
inputs_generator(batch_size, eos_id, is_train=True), places=places)
exe = fluid.Executor(places[0])
exe.run(startup_prog)
prog = fluid.CompiledProgram(train_prog).with_data_parallel(
loss_name=loss.name)
EPOCH_NUM = 20
for pass_id in six.moves.xrange(EPOCH_NUM):
batch_id = 0
for data in loader():
loss_val = exe.run(prog, feed=data, fetch_list=[loss])[0]
print('pass_id: %d, batch_id: %d, loss: %f' %
(pass_id, batch_id, loss_val))
batch_id += 1
fluid.io.save_params(exe, model_save_dir, main_program=train_prog)
def infer(use_cuda):
# define program
infer_prog = fluid.Program()
startup_prog = fluid.Program()
with fluid.program_guard(infer_prog, startup_prog):
with fluid.unique_name.guard():
inputs, loader = data_func(is_train=False)
predict_seqs = model_func(inputs, is_train=False)
# define data source
places = fluid.cuda_places() if use_cuda else fluid.cpu_places()
loader.set_batch_generator(
inputs_generator(batch_size, eos_id, is_train=False), places=places)
src_idx2word = paddle.dataset.wmt16.get_dict(
"en", source_dict_size, reverse=True)
trg_idx2word = paddle.dataset.wmt16.get_dict(
"de", target_dict_size, reverse=True)
exe = fluid.Executor(places[0])
exe.run(startup_prog)
fluid.io.load_params(exe, model_save_dir, main_program=infer_prog)
prog = fluid.CompiledProgram(infer_prog).with_data_parallel()
for data in loader():
seq_ids = exe.run(prog, feed=data, fetch_list=[predict_seqs])[0]
for ins_idx in range(seq_ids.shape[0]):
print("Original sentence:")
src_seqs = np.array(data[0]["src"])
print(" ".join([
src_idx2word[idx] for idx in src_seqs[ins_idx][1:]
if idx != eos_id
]))
print("Translated sentence:")
for beam_idx in range(beam_size):
seq = [
trg_idx2word[idx] for idx in seq_ids[ins_idx, :, beam_idx]
if idx != eos_id
]
print(" ".join(seq).encode("utf8"))
def main(use_cuda):
train(use_cuda)
infer(use_cuda)
if __name__ == '__main__':
use_cuda = False # set to True if training with GPU
main(use_cuda)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import six
import numpy as np
import paddle
import paddle.fluid as fluid
dict_size = 30000
source_dict_size = target_dict_size = dict_size
word_dim = 512
hidden_dim = 512
decoder_size = hidden_dim
max_length = 256
beam_size = 4
batch_size = 64
is_sparse = True
model_save_dir = "machine_translation.inference.model"
def encoder():
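    # Bidirectional GRU encoder: the forward and backward hidden states are
    # concatenated, so each source token is encoded as a 2 * hidden_dim vector.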
src_word_id = fluid.layers.data(
name="src_word_id", shape=[1], dtype='int64', lod_level=1)
src_embedding = fluid.layers.embedding(
input=src_word_id,
size=[source_dict_size, word_dim],
dtype='float32',
is_sparse=is_sparse)
fc_forward = fluid.layers.fc(
input=src_embedding, size=hidden_dim * 3, bias_attr=False)
src_forward = fluid.layers.dynamic_gru(input=fc_forward, size=hidden_dim)
fc_backward = fluid.layers.fc(
input=src_embedding, size=hidden_dim * 3, bias_attr=False)
src_backward = fluid.layers.dynamic_gru(
input=fc_backward, size=hidden_dim, is_reverse=True)
encoded_vector = fluid.layers.concat(
input=[src_forward, src_backward], axis=1)
return encoded_vector
def cell(x, hidden, encoder_out, encoder_out_proj):
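    # One decoder step: build an attention context over the encoder outputs,
    # then update the GRU state from the current input and that context.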
def simple_attention(encoder_vec, encoder_proj, decoder_state):
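        # Additive-style attention: project the decoder state, add it to the
        # pre-projected encoder outputs, score each position with a softmax over
        # the sequence, and sum-pool the weighted encoder states into a context.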
decoder_state_proj = fluid.layers.fc(
input=decoder_state, size=decoder_size, bias_attr=False)
decoder_state_expand = fluid.layers.sequence_expand(
x=decoder_state_proj, y=encoder_proj)
mixed_state = fluid.layers.elementwise_add(encoder_proj,
decoder_state_expand)
attention_weights = fluid.layers.fc(
input=mixed_state, size=1, bias_attr=False)
attention_weights = fluid.layers.sequence_softmax(
input=attention_weights)
        weights_reshape = fluid.layers.reshape(x=attention_weights, shape=[-1])
        scaled = fluid.layers.elementwise_mul(
            x=encoder_vec, y=weights_reshape, axis=0)
context = fluid.layers.sequence_pool(input=scaled, pool_type='sum')
return context
context = simple_attention(encoder_out, encoder_out_proj, hidden)
out = fluid.layers.fc(
input=[x, context], size=decoder_size * 3, bias_attr=False)
out = fluid.layers.gru_unit(
input=out, hidden=hidden, size=decoder_size * 3)[0]
return out, out
def train_decoder(encoder_out):
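    # Training decoder (teacher forcing): a DynamicRNN steps over the ground-truth
    # target words and emits per-step softmax probabilities over the target vocab.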
encoder_last = fluid.layers.sequence_last_step(input=encoder_out)
encoder_last_proj = fluid.layers.fc(
input=encoder_last, size=decoder_size, act='tanh')
# cache the encoder_out's computed result in attention
encoder_out_proj = fluid.layers.fc(
input=encoder_out, size=decoder_size, bias_attr=False)
trg_language_word = fluid.layers.data(
name="target_language_word", shape=[1], dtype='int64', lod_level=1)
trg_embedding = fluid.layers.embedding(
input=trg_language_word,
size=[target_dict_size, word_dim],
dtype='float32',
is_sparse=is_sparse)
rnn = fluid.layers.DynamicRNN()
with rnn.block():
x = rnn.step_input(trg_embedding)
pre_state = rnn.memory(init=encoder_last_proj, need_reorder=True)
encoder_out = rnn.static_input(encoder_out)
encoder_out_proj = rnn.static_input(encoder_out_proj)
out, current_state = cell(x, pre_state, encoder_out, encoder_out_proj)
prob = fluid.layers.fc(input=out, size=target_dict_size, act='softmax')
rnn.update_memory(pre_state, current_state)
rnn.output(prob)
return rnn()
def train_model():
encoder_out = encoder()
rnn_out = train_decoder(encoder_out)
label = fluid.layers.data(
name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
cost = fluid.layers.cross_entropy(input=rnn_out, label=label)
avg_cost = fluid.layers.mean(cost)
return avg_cost
def optimizer_func():
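    # Adam with Noam learning-rate decay (1000 warmup steps), L2 weight decay,
    # and gradient clipping by global norm (5.0).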
fluid.clip.set_gradient_clip(
clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0))
lr_decay = fluid.layers.learning_rate_scheduler.noam_decay(hidden_dim, 1000)
return fluid.optimizer.Adam(
learning_rate=lr_decay,
regularization=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4))
def train(use_cuda):
train_prog = fluid.Program()
startup_prog = fluid.Program()
with fluid.program_guard(train_prog, startup_prog):
with fluid.unique_name.guard():
avg_cost = train_model()
optimizer = optimizer_func()
optimizer.minimize(avg_cost)
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
train_data = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.wmt16.train(source_dict_size, target_dict_size),
buf_size=10000),
batch_size=batch_size)
feeder = fluid.DataFeeder(
feed_list=[
'src_word_id', 'target_language_word', 'target_language_next_word'
],
place=place,
program=train_prog)
exe.run(startup_prog)
EPOCH_NUM = 20
for pass_id in six.moves.xrange(EPOCH_NUM):
batch_id = 0
for data in train_data():
cost = exe.run(
train_prog, feed=feeder.feed(data), fetch_list=[avg_cost])[0]
print('pass_id: %d, batch_id: %d, loss: %f' % (pass_id, batch_id,
cost))
batch_id += 1
fluid.io.save_params(exe, model_save_dir, main_program=train_prog)
def infer_decoder(encoder_out):
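    # Beam-search decoder: a While loop stores the selected ids, scores and RNN
    # states of every step in arrays, expanding and pruning candidates with
    # fluid.layers.beam_search until all beams finish or max_length is reached.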
encoder_last = fluid.layers.sequence_last_step(input=encoder_out)
encoder_last_proj = fluid.layers.fc(
input=encoder_last, size=decoder_size, act='tanh')
encoder_out_proj = fluid.layers.fc(
input=encoder_out, size=decoder_size, bias_attr=False)
max_len = fluid.layers.fill_constant(
shape=[1], dtype='int64', value=max_length)
counter = fluid.layers.zeros(shape=[1], dtype='int64', force_cpu=True)
init_ids = fluid.layers.data(
name="init_ids", shape=[1], dtype="int64", lod_level=2)
init_scores = fluid.layers.data(
name="init_scores", shape=[1], dtype="float32", lod_level=2)
# create and init arrays to save selected ids, scores and states for each step
ids_array = fluid.layers.array_write(init_ids, i=counter)
scores_array = fluid.layers.array_write(init_scores, i=counter)
state_array = fluid.layers.array_write(encoder_last_proj, i=counter)
cond = fluid.layers.less_than(x=counter, y=max_len)
while_op = fluid.layers.While(cond=cond)
with while_op.block():
pre_ids = fluid.layers.array_read(array=ids_array, i=counter)
pre_score = fluid.layers.array_read(array=scores_array, i=counter)
pre_state = fluid.layers.array_read(array=state_array, i=counter)
pre_ids_emb = fluid.layers.embedding(
input=pre_ids,
size=[target_dict_size, word_dim],
dtype='float32',
is_sparse=is_sparse)
out, current_state = cell(pre_ids_emb, pre_state, encoder_out,
encoder_out_proj)
prob = fluid.layers.fc(
input=current_state, size=target_dict_size, act='softmax')
# beam search
topk_scores, topk_indices = fluid.layers.topk(prob, k=beam_size)
accu_scores = fluid.layers.elementwise_add(
x=fluid.layers.log(topk_scores),
y=fluid.layers.reshape(pre_score, shape=[-1]),
axis=0)
accu_scores = fluid.layers.lod_reset(x=accu_scores, y=pre_ids)
selected_ids, selected_scores = fluid.layers.beam_search(
pre_ids, pre_score, topk_indices, accu_scores, beam_size, end_id=1)
fluid.layers.increment(x=counter, value=1, in_place=True)
# save selected ids and corresponding scores of each step
fluid.layers.array_write(selected_ids, array=ids_array, i=counter)
fluid.layers.array_write(selected_scores, array=scores_array, i=counter)
# update rnn state by sequence_expand acting as gather
current_state = fluid.layers.sequence_expand(current_state,
selected_ids)
fluid.layers.array_write(current_state, array=state_array, i=counter)
current_enc_out = fluid.layers.sequence_expand(encoder_out,
selected_ids)
fluid.layers.assign(current_enc_out, encoder_out)
current_enc_out_proj = fluid.layers.sequence_expand(encoder_out_proj,
selected_ids)
fluid.layers.assign(current_enc_out_proj, encoder_out_proj)
# update conditional variable
length_cond = fluid.layers.less_than(x=counter, y=max_len)
finish_cond = fluid.layers.logical_not(
fluid.layers.is_empty(x=selected_ids))
fluid.layers.logical_and(x=length_cond, y=finish_cond, out=cond)
translation_ids, translation_scores = fluid.layers.beam_search_decode(
ids=ids_array, scores=scores_array, beam_size=beam_size, end_id=1)
return translation_ids, translation_scores
def infer_model():
encoder_out = encoder()
translation_ids, translation_scores = infer_decoder(encoder_out)
return translation_ids, translation_scores
def infer(use_cuda):
infer_prog = fluid.Program()
startup_prog = fluid.Program()
with fluid.program_guard(infer_prog, startup_prog):
with fluid.unique_name.guard():
translation_ids, translation_scores = infer_model()
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
test_data = paddle.batch(
paddle.dataset.wmt16.test(source_dict_size, target_dict_size),
batch_size=batch_size)
src_idx2word = paddle.dataset.wmt16.get_dict(
"en", source_dict_size, reverse=True)
trg_idx2word = paddle.dataset.wmt16.get_dict(
"de", target_dict_size, reverse=True)
fluid.io.load_params(exe, model_save_dir, main_program=infer_prog)
for data in test_data():
src_word_id = fluid.create_lod_tensor(
data=[x[0] for x in data],
recursive_seq_lens=[[len(x[0]) for x in data]],
place=place)
init_ids = fluid.create_lod_tensor(
data=np.array([[0]] * len(data), dtype='int64'),
recursive_seq_lens=[[1] * len(data)] * 2,
place=place)
init_scores = fluid.create_lod_tensor(
data=np.array([[0.]] * len(data), dtype='float32'),
recursive_seq_lens=[[1] * len(data)] * 2,
place=place)
seq_ids, seq_scores = exe.run(
infer_prog,
feed={
'src_word_id': src_word_id,
'init_ids': init_ids,
'init_scores': init_scores
},
fetch_list=[translation_ids, translation_scores],
return_numpy=False)
# How to parse the results:
# Suppose the lod of seq_ids is:
# [[0, 3, 6], [0, 12, 24, 40, 54, 67, 82]]
# then from lod[0]:
# there are 2 source sentences, beam width is 3.
# from lod[1]:
# the first source sentence has 3 hyps; the lengths are 12, 12, 16
# the second source sentence has 3 hyps; the lengths are 14, 13, 15
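        # e.g. with the lod above, the 2nd hypothesis (j=1) of the 1st source
        # sentence (i=0) spans np.array(seq_ids)[12:24] (start/end tokens are
        # trimmed below) and its score is np.array(seq_scores)[23].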
hyps = [[] for i in range(len(seq_ids.lod()[0]) - 1)]
scores = [[] for i in range(len(seq_scores.lod()[0]) - 1)]
for i in range(len(seq_ids.lod()[0]) - 1): # for each source sentence
start = seq_ids.lod()[0][i]
end = seq_ids.lod()[0][i + 1]
print("Original sentence:")
print(" ".join([src_idx2word[idx] for idx in data[i][0][1:-1]]))
print("Translated score and sentence:")
for j in range(end - start): # for each candidate
sub_start = seq_ids.lod()[1][start + j]
sub_end = seq_ids.lod()[1][start + j + 1]
hyps[i].append(" ".join([
trg_idx2word[idx]
for idx in np.array(seq_ids)[sub_start:sub_end][1:-1]
]))
scores[i].append(np.array(seq_scores)[sub_end - 1])
print(scores[i][-1], hyps[i][-1].encode('utf8'))
def main(use_cuda):
train(use_cuda)
infer(use_cuda)
if __name__ == '__main__':
use_cuda = False # set to True if training with GPU
main(use_cuda)
......@@ -265,16 +265,16 @@ dg_program = fluid.Program()
# Define the program that discriminates real images
with fluid.program_guard(d_program):
    # The input image size is 28*28=784
img = fluid.layers.data(name='img', shape=[784], dtype='float32')
img = fluid.data(name='img', shape=[None, 784], dtype='float32')
    # Label shape=1
label = fluid.layers.data(name='label', shape=[1], dtype='float32')
label = fluid.data(name='label', shape=[None, 1], dtype='float32')
d_logit = D(img)
d_loss = loss(d_logit, label)
# Define the program that discriminates generated images
with fluid.program_guard(dg_program):
noise = fluid.layers.data(
name='noise', shape=[NOISE_SIZE], dtype='float32')
noise = fluid.data(
name='noise', shape=[None, NOISE_SIZE], dtype='float32')
    # Generate an image from the noise input
g_img = G(x=noise)
......
......@@ -60,14 +60,14 @@ def train(args):
dg_program = fluid.Program()
with fluid.program_guard(d_program):
img = fluid.layers.data(name='img', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='float32')
img = fluid.data(name='img', shape=[None, 784], dtype='float32')
label = fluid.data(name='label', shape=[None, 1], dtype='float32')
d_logit = D(img)
d_loss = loss(d_logit, label)
with fluid.program_guard(dg_program):
noise = fluid.layers.data(
name='noise', shape=[NOISE_SIZE], dtype='float32')
noise = fluid.data(
name='noise', shape=[None, NOISE_SIZE], dtype='float32')
g_img = G(x=noise)
g_program = dg_program.clone()
......
......@@ -307,16 +307,16 @@ dg_program = fluid.Program()
# Define the program that discriminates real images
with fluid.program_guard(d_program):
    # The input image size is 28*28=784
img = fluid.layers.data(name='img', shape=[784], dtype='float32')
img = fluid.data(name='img', shape=[None, 784], dtype='float32')
    # Label shape=1
label = fluid.layers.data(name='label', shape=[1], dtype='float32')
label = fluid.data(name='label', shape=[None, 1], dtype='float32')
d_logit = D(img)
d_loss = loss(d_logit, label)
# Define the program that discriminates generated images
with fluid.program_guard(dg_program):
noise = fluid.layers.data(
name='noise', shape=[NOISE_SIZE], dtype='float32')
noise = fluid.data(
name='noise', shape=[None, NOISE_SIZE], dtype='float32')
    # Generate an image from the noise input
g_img = G(x=noise)
......