Add OCR attention model

ee2054da · qingqing01 · ed14907e · ee2054da · ee2054da · ee2054da
14 changed files
--- a/examples/ocr/README.md
+++ b/examples/ocr/README.md
+简介
+--------
+本OCR任务用于识别图片中的单行字母信息，模型采用基于attention的seq2seq结构。运行本目录下的示例程序需要使用PaddlePaddle develop分支的最新版本。
+
+## 代码结构
+```
+.
+|-- data.py          # 数据读取
+|-- eval.py          # 评估脚本
+|-- images           # 测试图片
+|-- predict.py       # 预测脚本
+|-- seq2seq_attn.py  # 模型
+|-- train.py         # 训练脚本
+`-- utility.py       # 公共模块
+```
+
+## 训练/评估/预测流程
+
+- 设置GPU环境:
+
+```
+export CUDA_VISIBLE_DEVICES=0
+```
+
+- 训练
+
+```
+python train.py
+```
+
+更多参数可以通过`--help`查看。
+
+
+- 动态图/静态图切换（通过`--dynamic`参数指定是否使用动态图）
+
+
+```
+python train.py --dynamic=True
+```
+
+
+- 评估
+
+```
+python eval.py --init_model=checkpoint/final
+```
+
+
+- 预测
+
+目前不支持动态图预测，因此预测时需要指定`--dynamic=False`。
+
+```
+python predict.py --init_model=checkpoint/final --image_path=images/ --dynamic=False --beam_size=3
+```
+
+预测结果如下:
+
+```
+Image 1: images/112_chubbiness_13557.jpg
+0: chubbines
+1: chubbiness
+2: chubbinesS
+Image 2: images/177_Interfiled_40185.jpg
+0: Interflied
+1: Interfiled
+2: InterfIled
+Image 3: images/325_dame_19109.jpg
+0: da
+1: damo
+2: dame
+Image 4: images/368_fixtures_29232.jpg
+0: firtures
+1: Firtures
+2: fixtures
+```
--- a/examples/ocr/data.py
+++ b/examples/ocr/data.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from os import path
+import random
+import traceback
+import copy
+import math
+import tarfile
+from PIL import Image
+
+import logging
+logger = logging.getLogger(__name__)
+
+import paddle
+from paddle import fluid
+from paddle.fluid.dygraph.parallel import ParallelEnv
+
+# MD5 checksum and URL of the dataset archive; both are handed to the
+# downloader in download_data().
+DATA_MD5 = "7256b1d5420d8c3e74815196e58cdad5"
+DATA_URL = "http://paddle-ocr-data.bj.bcebos.com/data.tar.gz"
+# Cache directory name and local archive file name given to the downloader.
+CACHE_DIR_NAME = "attention_data"
+SAVED_FILE_NAME = "data.tar.gz"
+# Directory looked up next to the downloaded archive; the archive is
+# extracted when this directory does not exist yet.
+DATA_DIR_NAME = "data"
+# Default image directories and annotation list files, joined onto the data
+# root by train() and test().
+TRAIN_DATA_DIR_NAME = "train_images"
+TEST_DATA_DIR_NAME = "test_images"
+TRAIN_LIST_FILE_NAME = "train.list"
+TEST_LIST_FILE_NAME = "test.list"
+
+
+class BatchCompose(object):
+    def __init__(self, transforms=[]):
+        self.transforms = transforms
+
+    def __call__(self, data):
+        for f in self.transforms:
+            try:
+                data = f(data)
+            except Exception as e:
+                stack_info = traceback.format_exc()
+                logger.info("fail to perform batch transform [{}] with error: "
+                            "{} and stack:\n{}".format(f, e, str(stack_info)))
+                raise e
+        # sample list to batch data
+        batch = list(zip(*data))
+        return batch
+
+
+class Compose(object):
+    def __init__(self, transforms=[]):
+        self.transforms = transforms
+
+    def __call__(self, *data):
+        for f in self.transforms:
+            try:
+                data = f(*data)
+            except Exception as e:
+                stack_info = traceback.format_exc()
+                logger.info("fail to perform transform [{}] with error: "
+                            "{} and stack:\n{}".format(f, e, str(stack_info)))
+                raise e
+        return data
+
+
+class Resize(object):
+    def __init__(self, height=48):
+        self.interp = Image.NEAREST  # Image.ANTIALIAS
+        self.height = height
+
+    def __call__(self, samples):
+        shape = samples[0][0].size
+        for i in range(len(samples)):
+            im = samples[i][0]
+            im = im.resize((shape[0], self.height), self.interp)
+            samples[i][0] = im
+        return samples
+
+
+class Normalize(object):
+    def __init__(self,
+                 mean=[127.5],
+                 std=[1.0],
+                 scale=False,
+                 channel_first=True):
+        self.mean = mean
+        self.std = std
+        self.scale = scale
+        self.channel_first = channel_first
+        if not (isinstance(self.mean, list) and isinstance(self.std, list) and
+                isinstance(self.scale, bool)):
+            raise TypeError("{}: input type is invalid.".format(self))
+
+    def __call__(self, samples):
+        for i in range(len(samples)):
+            im = samples[i][0]
+            im = np.array(im).astype(np.float32, copy=False)
+            im = im[np.newaxis, ...]
+            mean = np.array(self.mean)[np.newaxis, np.newaxis, :]
+            std = np.array(self.std)[np.newaxis, np.newaxis, :]
+            if self.scale:
+                im = im / 255.0
+            #im -= mean
+            im -= 127.5
+            #im /= std
+            samples[i][0] = im
+        return samples
+
+
+class PadTarget(object):
+    def __init__(self, SOS=0, EOS=1):
+        self.SOS = SOS
+        self.EOS = EOS
+
+    def __call__(self, samples):
+        lens = np.array([len(s[1]) for s in samples], dtype="int64")
+        max_len = np.max(lens)
+        for i in range(len(samples)):
+            label = samples[i][1]
+            if max_len > len(label):
+                pad_label = label + [self.EOS] * (max_len - len(label))
+            else:
+                pad_label = label
+            samples[i][1] = np.array([self.SOS] + pad_label, dtype='int64')
+            # label_out
+            samples[i].append(np.array(pad_label + [self.EOS], dtype='int64'))
+            mask = np.zeros((max_len + 1)).astype('float32')
+            mask[:len(label) + 1] = 1.0
+            # mask
+            samples[i].append(np.array(mask, dtype='float32'))
+        return samples
+
+
+class MyBatchSampler(fluid.io.BatchSampler):
+    def __init__(self,
+                 dataset,
+                 batch_size,
+                 shuffle=False,
+                 drop_last=True,
+                 seed=None):
+        self._dataset = dataset
+        self._batch_size = batch_size
+        self._shuffle = shuffle
+        self._drop_last = drop_last
+        self._random = np.random
+        self._random.seed(seed)
+        self._nranks = ParallelEnv().nranks
+        self._local_rank = ParallelEnv().local_rank
+        self._device_id = ParallelEnv().dev_id
+        self._num_samples = int(
+            math.ceil(len(self._dataset) * 1.0 / self._nranks))
+        self._total_size = self._num_samples * self._nranks
+        self._epoch = 0
+
+    def __iter__(self):
+        infos = copy.copy(self._dataset._sample_infos)
+        skip_num = 0
+        if self._shuffle:
+            if self._batch_size == 1:
+                self._random.RandomState(self._epoch).shuffle(infos)
+            else:  # partial shuffle
+                infos = sorted(infos, key=lambda x: x.w)
+                skip_num = random.randint(1, 100)
+
+        infos = infos[skip_num:] + infos[:skip_num]
+        infos += infos[:(self._total_size - len(infos))]
+        last_size = self._total_size % (self._batch_size * self._nranks)
+        batches = []
+        for i in range(self._local_rank * self._batch_size,
+                       len(infos) - last_size,
+                       self._batch_size * self._nranks):
+            batches.append(infos[i:i + self._batch_size])
+
+        if (not self._drop_last) and last_size != 0:
+            last_local_size = last_size // self._nranks
+            last_infos = infos[len(infos) - last_size:]
+            start = self._local_rank * last_local_size
+            batches.append(last_infos[start:start + last_local_size])
+
+        if self._shuffle:
+            self._random.RandomState(self._epoch).shuffle(batches)
+            self._epoch += 1
+
+        for batch in batches:
+            batch_indices = [info.idx for info in batch]
+            yield batch_indices
+
+    def __len__(self):
+        if self._drop_last:
+            return self._total_size // self._batch_size
+        else:
+            return math.ceil(self._total_size / float(self._batch_size))
+
+
+class SampleInfo(object):
+    def __init__(self, idx, h, w, im_name, labels):
+        self.idx = idx
+        self.h = h
+        self.w = w
+        self.im_name = im_name
+        self.labels = labels
+
+
+class OCRDataset(paddle.io.Dataset):
+    def __init__(self, image_dir, anno_file):
+        self.image_dir = image_dir
+        self.anno_file = anno_file
+        self._sample_infos = []
+        with open(anno_file, 'r') as f:
+            for i, line in enumerate(f):
+                w, h, im_name, labels = line.strip().split(' ')
+                h, w = int(h), int(w)
+                labels = [int(c) for c in labels.split(',')]
+                self._sample_infos.append(SampleInfo(i, h, w, im_name, labels))
+        #self._sample_infos = sorted(self._sample_infos,
+        #    key=lambda x: x.w)
+
+    def __getitem__(self, idx):
+        info = self._sample_infos[idx]
+        im_name, labels = info.im_name, info.labels
+        image = Image.open(path.join(self.image_dir, im_name)).convert('L')
+        return [image, labels]
+
+    def __len__(self):
+        return len(self._sample_infos)
+
+
+def train(
+        root_dir=None,
+        images_dir=None,
+        anno_file=None,
+        shuffle=True, ):
+    if root_dir is None:
+        root_dir = download_data()
+    if images_dir is None:
+        images_dir = TRAIN_DATA_DIR_NAME
+    images_dir = path.join(root_dir, TRAIN_DATA_DIR_NAME)
+    if anno_file is None:
+        anno_file = TRAIN_LIST_FILE_NAME
+    anno_file = path.join(root_dir, TRAIN_LIST_FILE_NAME)
+    return OCRDataset(images_dir, anno_file)
+
+
+def test(
+        root_dir=None,
+        images_dir=None,
+        anno_file=None,
+        shuffle=True, ):
+    if root_dir is None:
+        root_dir = download_data()
+    if images_dir is None:
+        images_dir = TEST_DATA_DIR_NAME
+    images_dir = path.join(root_dir, TEST_DATA_DIR_NAME)
+    if anno_file is None:
+        anno_file = TEST_LIST_FILE_NAME
+    anno_file = path.join(root_dir, TEST_LIST_FILE_NAME)
+    return OCRDataset(images_dir, anno_file)
+
+
+def download_data():
+    '''Download train and test data.
+    '''
+    tar_file = paddle.dataset.common.download(
+        DATA_URL, CACHE_DIR_NAME, DATA_MD5, save_name=SAVED_FILE_NAME)
+    data_dir = path.join(path.dirname(tar_file), DATA_DIR_NAME)
+    if not path.isdir(data_dir):
+        t = tarfile.open(tar_file, "r:gz")
+        t.extractall(path=path.dirname(tar_file))
+        t.close()
+    return data_dir
--- a/examples/ocr/eval.py
+++ b/examples/ocr/eval.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+
+import argparse
+import functools
+
+import paddle.fluid.profiler as profiler
+import paddle.fluid as fluid
+
+from hapi.model import Input, set_device
+
+from utility import add_arguments, print_arguments
+from utility import SeqAccuracy, MyProgBarLogger, SeqBeamAccuracy
+from utility import postprocess
+from seq2seq_attn import Seq2SeqAttModel, Seq2SeqAttInferModel, WeightCrossEntropy
+import data
+
+parser = argparse.ArgumentParser(description=__doc__)
+add_arg = functools.partial(add_arguments, argparser=parser)
+# yapf: disable
+add_arg('batch_size',        int,   32,                 "Minibatch size.")
+add_arg('test_images',       str,   None,               "The directory of images to be used for test.")
+add_arg('test_list',         str,   None,               "The list file of images to be used for training.")
+add_arg('init_model',        str,   'checkpoint/final', "The init model file of directory.")
+add_arg('use_gpu',           bool,  True,               "Whether use GPU to train.")
+add_arg('encoder_size',      int,   200,                "Encoder size.")
+add_arg('decoder_size',      int,   128,                "Decoder size.")
+add_arg('embedding_dim',     int,   128,                "Word vector dim.")
+add_arg('num_classes',       int,   95,                 "Number classes.")
+add_arg('beam_size',         int,   0,                  "If set beam size, will use beam search.")
+add_arg('dynamic',           bool,  False,              "Whether to use dygraph.")
+# yapf: enable
+
+
+def main(FLAGS):
+    device = set_device("gpu" if FLAGS.use_gpu else "cpu")
+    fluid.enable_dygraph(device) if FLAGS.dynamic else None
+    model = Seq2SeqAttModel(
+        encoder_size=FLAGS.encoder_size,
+        decoder_size=FLAGS.decoder_size,
+        emb_dim=FLAGS.embedding_dim,
+        num_classes=FLAGS.num_classes)
+
+    # yapf: disable
+    inputs = [
+        Input([None, 1, 48, 384], "float32", name="pixel"),
+        Input([None, None], "int64", name="label_in")
+    ]
+    labels = [
+        Input([None, None], "int64", name="label_out"),
+        Input([None, None], "float32", name="mask")
+    ]
+    # yapf: enable
+
+    model.prepare(
+        loss_function=WeightCrossEntropy(),
+        metrics=SeqAccuracy(),
+        inputs=inputs,
+        labels=labels,
+        device=device)
+    model.load(FLAGS.init_model)
+
+    test_dataset = data.test()
+    test_collate_fn = data.BatchCompose(
+        [data.Resize(), data.Normalize(), data.PadTarget()])
+    test_sampler = data.MyBatchSampler(
+        test_dataset,
+        batch_size=FLAGS.batch_size,
+        drop_last=False,
+        shuffle=False)
+    test_loader = fluid.io.DataLoader(
+        test_dataset,
+        batch_sampler=test_sampler,
+        places=device,
+        num_workers=0,
+        return_list=True,
+        collate_fn=test_collate_fn)
+
+    model.evaluate(
+        eval_data=test_loader,
+        callbacks=[MyProgBarLogger(10, 2, FLAGS.batch_size)])
+
+
+def beam_search(FLAGS):
+    device = set_device("gpu" if FLAGS.use_gpu else "cpu")
+    fluid.enable_dygraph(device) if FLAGS.dynamic else None
+    model = Seq2SeqAttInferModel(
+        encoder_size=FLAGS.encoder_size,
+        decoder_size=FLAGS.decoder_size,
+        emb_dim=FLAGS.embedding_dim,
+        num_classes=FLAGS.num_classes,
+        beam_size=FLAGS.beam_size)
+
+    inputs = [
+        Input(
+            [None, 1, 48, 384], "float32", name="pixel"), Input(
+                [None, None], "int64", name="label_in")
+    ]
+    labels = [
+        Input(
+            [None, None], "int64", name="label_out"), Input(
+                [None, None], "float32", name="mask")
+    ]
+    model.prepare(
+        loss_function=None,
+        metrics=SeqBeamAccuracy(),
+        inputs=inputs,
+        labels=labels,
+        device=device)
+    model.load(FLAGS.init_model)
+
+    test_dataset = data.test()
+    test_collate_fn = data.BatchCompose(
+        [data.Resize(), data.Normalize(), data.PadTarget()])
+    test_sampler = data.MyBatchSampler(
+        test_dataset,
+        batch_size=FLAGS.batch_size,
+        drop_last=False,
+        shuffle=False)
+    test_loader = fluid.io.DataLoader(
+        test_dataset,
+        batch_sampler=test_sampler,
+        places=device,
+        num_workers=0,
+        return_list=True,
+        collate_fn=test_collate_fn)
+
+    model.evaluate(
+        eval_data=test_loader,
+        callbacks=[MyProgBarLogger(10, 2, FLAGS.batch_size)])
+
+
+if __name__ == '__main__':
+    FLAGS = parser.parse_args()
+    print_arguments(FLAGS)
+    if FLAGS.beam_size:
+        beam_search(FLAGS)
+    else:
+        main(FLAGS)
--- a/examples/ocr/images/112_chubbiness_13557.jpg
+++ b/examples/ocr/images/112_chubbiness_13557.jpg
--- a/examples/ocr/images/177_Interfiled_40185.jpg
+++ b/examples/ocr/images/177_Interfiled_40185.jpg
--- a/examples/ocr/images/325_dame_19109.jpg
+++ b/examples/ocr/images/325_dame_19109.jpg
--- a/examples/ocr/images/368_fixtures_29232.jpg
+++ b/examples/ocr/images/368_fixtures_29232.jpg
--- a/examples/ocr/predict.py
+++ b/examples/ocr/predict.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+
+import os
+import sys
+import random
+import numpy as np
+
+import argparse
+import functools
+from PIL import Image
+
+import paddle.fluid.profiler as profiler
+import paddle.fluid as fluid
+
+from hapi.model import Input, set_device
+from hapi.datasets.folder import ImageFolder
+
+from utility import add_arguments, print_arguments
+from utility import postprocess, index2word
+from seq2seq_attn import Seq2SeqAttInferModel, WeightCrossEntropy
+import data
+
+parser = argparse.ArgumentParser(description=__doc__)
+add_arg = functools.partial(add_arguments, argparser=parser)
+# yapf: disable
+add_arg('batch_size',        int,   1,       "Minibatch size.")
+add_arg('image_path',        str,   None,    "The directory of images to be used for test.")
+add_arg('init_model',        str,   None,    "The init model file of directory.")
+add_arg('use_gpu',           bool,  True,    "Whether use GPU to train.")
+# model hyper paramters
+add_arg('encoder_size',      int,   200,     "Encoder size.")
+add_arg('decoder_size',      int,   128,     "Decoder size.")
+add_arg('embedding_dim',     int,   128,     "Word vector dim.")
+add_arg('num_classes',       int,   95,      "Number classes.")
+add_arg('beam_size',         int,   3,       "Beam size for beam search.")
+add_arg('dynamic',           bool,  False,   "Whether to use dygraph.")
+# yapf: enable
+
+
+def main(FLAGS):
+    device = set_device("gpu" if FLAGS.use_gpu else "cpu")
+    fluid.enable_dygraph(device) if FLAGS.dynamic else None
+    model = Seq2SeqAttInferModel(
+        encoder_size=FLAGS.encoder_size,
+        decoder_size=FLAGS.decoder_size,
+        emb_dim=FLAGS.embedding_dim,
+        num_classes=FLAGS.num_classes,
+        beam_size=FLAGS.beam_size)
+
+    inputs = [Input([None, 1, 48, 384], "float32", name="pixel"), ]
+
+    model.prepare(inputs=inputs, device=device)
+    model.load(FLAGS.init_model)
+
+    fn = lambda p: Image.open(p).convert('L')
+    test_dataset = ImageFolder(FLAGS.image_path, loader=fn)
+    test_collate_fn = data.BatchCompose([data.Resize(), data.Normalize()])
+    test_loader = fluid.io.DataLoader(
+        test_dataset,
+        places=device,
+        num_workers=0,
+        return_list=True,
+        collate_fn=test_collate_fn)
+
+    samples = test_dataset.samples
+    #outputs = model.predict(test_loader)
+    ins_id = 0
+    for image, in test_loader:
+        image = image if FLAGS.dynamic else image[0]
+        pred = model.test_batch([image])[0]
+        pred = pred[:, :, np.newaxis] if len(pred.shape) == 2 else pred
+        pred = np.transpose(pred, [0, 2, 1])
+        for ins in pred:
+            impath = samples[ins_id]
+            ins_id += 1
+            print('Image {}: {}'.format(ins_id, impath))
+            for beam_idx, beam in enumerate(ins):
+                id_list = postprocess(beam)
+                word_list = index2word(id_list)
+                sequence = "".join(word_list)
+                print('{}: {}'.format(beam_idx, sequence))
+
+
+# Script entry point: parse the options, echo them and run prediction.
+if __name__ == '__main__':
+    FLAGS = parser.parse_args()
+    print_arguments(FLAGS)
+    main(FLAGS)
--- a/examples/ocr/seq2seq_attn.py
+++ b/examples/ocr/seq2seq_attn.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+
+import numpy as np
+
+import paddle.fluid as fluid
+import paddle.fluid.layers as layers
+from paddle.fluid.layers import BeamSearchDecoder
+
+from hapi.text import RNNCell, RNN, DynamicDecode
+from hapi.model import Model, Loss
+
+
+class ConvBNPool(fluid.dygraph.Layer):
+    """Two 3x3 convolution + batch-norm stages with optional max pooling.
+
+    Args:
+        in_ch (int): number of input channels of the first convolution.
+        out_ch (int): number of output channels of both convolutions.
+        act (str): activation passed to both batch-norm layers.
+        is_test (bool): accepted but not used by this class.
+        pool (bool): whether a 2x2 max pooling with stride 2 is appended.
+        use_cudnn (bool): forwarded to the convolution and pooling layers.
+    """
+
+    def __init__(self,
+                 in_ch,
+                 out_ch,
+                 act="relu",
+                 is_test=False,
+                 pool=True,
+                 use_cudnn=True):
+        super(ConvBNPool, self).__init__()
+        self.pool = pool
+
+        # Normal initializers with std = sqrt(2 / (k * k * channels)), using
+        # the input channels of each convolution.
+        filter_size = 3
+        std = (2.0 / (filter_size**2 * in_ch))**0.5
+        param_0 = fluid.ParamAttr(
+            initializer=fluid.initializer.Normal(0.0, std))
+
+        std = (2.0 / (filter_size**2 * out_ch))**0.5
+        param_1 = fluid.ParamAttr(
+            initializer=fluid.initializer.Normal(0.0, std))
+
+        self.conv0 = fluid.dygraph.Conv2D(
+            in_ch,
+            out_ch,
+            3,
+            padding=1,
+            param_attr=param_0,
+            bias_attr=False,
+            act=None,
+            use_cudnn=use_cudnn)
+        self.bn0 = fluid.dygraph.BatchNorm(out_ch, act=act)
+        self.conv1 = fluid.dygraph.Conv2D(
+            out_ch,
+            out_ch,
+            filter_size=3,
+            padding=1,
+            param_attr=param_1,
+            bias_attr=False,
+            act=None,
+            use_cudnn=use_cudnn)
+        self.bn1 = fluid.dygraph.BatchNorm(out_ch, act=act)
+
+        # self.pool is either the bool False or the pooling layer itself.
+        if self.pool:
+            self.pool = fluid.dygraph.Pool2D(
+                pool_size=2,
+                pool_type='max',
+                pool_stride=2,
+                use_cudnn=use_cudnn,
+                ceil_mode=True)
+
+    def forward(self, inputs):
+        """Apply conv0, bn0, conv1, bn1 and, if configured, the pooling."""
+        out = self.conv0(inputs)
+        out = self.bn0(out)
+        out = self.conv1(out)
+        out = self.bn1(out)
+        # NOTE(review): relies on the pooling layer object being truthy.
+        if self.pool:
+            out = self.pool(out)
+        return out
+
+
+class CNN(fluid.dygraph.Layer):
+    def __init__(self, in_ch=1, is_test=False):
+        super(CNN, self).__init__()
+        self.conv_bn1 = ConvBNPool(in_ch, 16)
+        self.conv_bn2 = ConvBNPool(16, 32)
+        self.conv_bn3 = ConvBNPool(32, 64)
+        self.conv_bn4 = ConvBNPool(64, 128, pool=False)
+
+    def forward(self, inputs):
+        conv = self.conv_bn1(inputs)
+        conv = self.conv_bn2(conv)
+        conv = self.conv_bn3(conv)
+        conv = self.conv_bn4(conv)
+        return conv
+
+
+class GRUCell(RNNCell):
+    """GRU step cell: a bias-free linear projection followed by a GRUUnit.
+
+    Args:
+        input_size (int): size of the step input.
+        hidden_size (int): size of the hidden state.
+        param_attr: parameter attribute used by both the projection and the
+            GRU unit.
+        bias_attr: bias attribute of the GRU unit.
+        gate_activation (str): gate activation of the GRU unit.
+        candidate_activation (str): passed as ``activation`` to the GRU unit.
+        origin_mode (bool): forwarded to the GRU unit.
+    """
+
+    def __init__(self,
+                 input_size,
+                 hidden_size,
+                 param_attr=None,
+                 bias_attr=None,
+                 gate_activation='sigmoid',
+                 candidate_activation='tanh',
+                 origin_mode=False):
+        super(GRUCell, self).__init__()
+        self.hidden_size = hidden_size
+        # The GRU unit is built with size hidden_size * 3, so the input is
+        # projected to that width first.
+        self.fc_layer = fluid.dygraph.Linear(
+            input_size,
+            hidden_size * 3,
+            param_attr=param_attr,
+            bias_attr=False)
+
+        self.gru_unit = fluid.dygraph.GRUUnit(
+            hidden_size * 3,
+            param_attr=param_attr,
+            bias_attr=bias_attr,
+            activation=candidate_activation,
+            gate_activation=gate_activation,
+            origin_mode=origin_mode)
+
+    def forward(self, inputs, states):
+        """Run one step and return ``(hidden, hidden)``."""
+        # step_outputs, new_states = cell(step_inputs, states)
+        # for GRUCell, `step_outputs` and `new_states` both are hidden
+        x = self.fc_layer(inputs)
+        hidden, _, _ = self.gru_unit(x, states)
+        return hidden, hidden
+
+    @property
+    def state_shape(self):
+        """Shape of one state, without the batch dimension."""
+        return [self.hidden_size]
+
+
+class Encoder(fluid.dygraph.Layer):
+    """CNN backbone followed by a forward and a reverse GRU.
+
+    Args:
+        in_channel (int): number of channels of the input image.
+        rnn_hidden_size (int): hidden size of each GRU direction.
+        decoder_size (int): output size of the projection of the
+            concatenated GRU outputs.
+        is_test (bool): forwarded to the backbone.
+    """
+
+    def __init__(
+            self,
+            in_channel=1,
+            rnn_hidden_size=200,
+            decoder_size=128,
+            is_test=False, ):
+        super(Encoder, self).__init__()
+        self.rnn_hidden_size = rnn_hidden_size
+
+        self.backbone = CNN(in_ch=in_channel, is_test=is_test)
+
+        para_attr = fluid.ParamAttr(
+            initializer=fluid.initializer.Normal(0.0, 0.02))
+        bias_attr = fluid.ParamAttr(
+            initializer=fluid.initializer.Normal(0.0, 0.02), learning_rate=2.0)
+        # NOTE(review): input_size 128 * 6 presumably is the 128 backbone
+        # channels times a feature height of 6 for 48 pixel high inputs -
+        # TODO confirm.
+        self.gru_fwd = RNN(cell=GRUCell(
+            input_size=128 * 6,
+            hidden_size=rnn_hidden_size,
+            param_attr=para_attr,
+            bias_attr=bias_attr,
+            candidate_activation='relu'),
+                           is_reverse=False,
+                           time_major=False)
+        self.gru_bwd = RNN(cell=GRUCell(
+            input_size=128 * 6,
+            hidden_size=rnn_hidden_size,
+            param_attr=para_attr,
+            bias_attr=bias_attr,
+            candidate_activation='relu'),
+                           is_reverse=True,
+                           time_major=False)
+        self.encoded_proj_fc = fluid.dygraph.Linear(
+            rnn_hidden_size * 2, decoder_size, bias_attr=False)
+
+    def forward(self, inputs):
+        """Encode a batch of images.
+
+        Returns:
+            tuple: the reverse GRU output, the concatenation of both GRU
+            outputs along axis 2, and the projection of that concatenation.
+        """
+        conv_features = self.backbone(inputs)
+        # Move the last axis of the feature map to position 1 so it becomes
+        # the sequence axis after the reshape below.
+        conv_features = fluid.layers.transpose(
+            conv_features, perm=[0, 3, 1, 2])
+
+        n, w, c, h = conv_features.shape
+        # Flatten the two trailing axes into one feature vector per step.
+        seq_feature = fluid.layers.reshape(conv_features, [0, -1, c * h])
+
+        gru_fwd, _ = self.gru_fwd(seq_feature)
+        gru_bwd, _ = self.gru_bwd(seq_feature)
+
+        encoded_vector = fluid.layers.concat(input=[gru_fwd, gru_bwd], axis=2)
+        encoded_proj = self.encoded_proj_fc(encoded_vector)
+        return gru_bwd, encoded_vector, encoded_proj
+
+
+class Attention(fluid.dygraph.Layer):
+    """
+    Neural Machine Translation by Jointly Learning to Align and Translate.
+    https://arxiv.org/abs/1409.0473
+    """
+
+    def __init__(self, decoder_size):
+        super(Attention, self).__init__()
+        self.fc1 = fluid.dygraph.Linear(
+            decoder_size, decoder_size, bias_attr=False)
+        self.fc2 = fluid.dygraph.Linear(decoder_size, 1, bias_attr=False)
+
+    def forward(self, encoder_vec, encoder_proj, decoder_state):
+        # alignment model, single-layer multilayer perceptron
+        decoder_state = self.fc1(decoder_state)
+        decoder_state = fluid.layers.unsqueeze(decoder_state, [1])
+
+        e = fluid.layers.elementwise_add(encoder_proj, decoder_state)
+        e = fluid.layers.tanh(e)
+
+        att_scores = self.fc2(e)
+        att_scores = fluid.layers.squeeze(att_scores, [2])
+        att_scores = fluid.layers.softmax(att_scores)
+
+        context = fluid.layers.elementwise_mul(
+            x=encoder_vec, y=att_scores, axis=0)
+        context = fluid.layers.reduce_sum(context, dim=1)
+        return context
+
+
+class DecoderCell(RNNCell):
+    def __init__(self, encoder_size=200, decoder_size=128):
+        super(DecoderCell, self).__init__()
+        self.attention = Attention(decoder_size)
+        self.gru_cell = GRUCell(
+            input_size=encoder_size * 2 + decoder_size,
+            hidden_size=decoder_size)
+
+    def forward(self, current_word, states, encoder_vec, encoder_proj):
+        context = self.attention(encoder_vec, encoder_proj, states)
+        decoder_inputs = fluid.layers.concat([current_word, context], axis=1)
+        hidden, _ = self.gru_cell(decoder_inputs, states)
+        return hidden, hidden
+
+
+class Decoder(fluid.dygraph.Layer):
+    def __init__(self, num_classes, emb_dim, encoder_size, decoder_size):
+        super(Decoder, self).__init__()
+        self.decoder_attention = RNN(DecoderCell(encoder_size, decoder_size))
+        self.fc = fluid.dygraph.Linear(
+            decoder_size, num_classes + 2, act='softmax')
+
+    def forward(self, target, initial_states, encoder_vec, encoder_proj):
+        out, _ = self.decoder_attention(
+            target,
+            initial_states=initial_states,
+            encoder_vec=encoder_vec,
+            encoder_proj=encoder_proj)
+        pred = self.fc(out)
+        return pred
+
+
+class Seq2SeqAttModel(Model):
+    """Attention seq2seq model that decodes along a given target sequence.
+
+    Args:
+        in_channle (int): number of input image channels (parameter name
+            kept as spelled).
+        encoder_size (int): hidden size of each encoder GRU direction.
+        decoder_size (int): size of the decoder state.
+        emb_dim (int): embedding size of the target tokens.
+        num_classes (int): must be given; ``num_classes + 2`` is used as
+            embedding and output size.
+    """
+
+    def __init__(
+            self,
+            in_channle=1,
+            encoder_size=200,
+            decoder_size=128,
+            emb_dim=128,
+            num_classes=None, ):
+        super(Seq2SeqAttModel, self).__init__()
+        self.encoder = Encoder(in_channle, encoder_size, decoder_size)
+        # Maps a reverse GRU output to the initial decoder state.
+        self.fc = fluid.dygraph.Linear(
+            input_dim=encoder_size,
+            output_dim=decoder_size,
+            bias_attr=False,
+            act='relu')
+        self.embedding = fluid.dygraph.Embedding(
+            [num_classes + 2, emb_dim], dtype='float32')
+        self.decoder = Decoder(num_classes, emb_dim, encoder_size,
+                               decoder_size)
+
+    def forward(self, inputs, target):
+        """Return the decoder prediction for ``inputs`` given ``target``."""
+        gru_backward, encoded_vector, encoded_proj = self.encoder(inputs)
+        # The decoder is booted from index 0 along axis 1 of the reverse GRU
+        # output.
+        decoder_boot = self.fc(gru_backward[:, 0])
+        trg_embedding = self.embedding(target)
+        prediction = self.decoder(trg_embedding, decoder_boot, encoded_vector,
+                                  encoded_proj)
+        return prediction
+
+
+class Seq2SeqAttInferModel(Seq2SeqAttModel):
+    """Inference variant that decodes with a BeamSearchDecoder.
+
+    It reuses the decoder cell, embedding and output layer created by
+    Seq2SeqAttModel.
+
+    Args:
+        in_channle, encoder_size, decoder_size, emb_dim, num_classes:
+            forwarded to Seq2SeqAttModel.
+        beam_size (int): beam size of the decoder; a non-zero value also
+            makes forward() tile the encoder outputs.
+        bos_id (int): start token of the decoder.
+        eos_id (int): end token of the decoder.
+        max_out_len (int): passed as ``max_step_num`` to DynamicDecode.
+    """
+
+    def __init__(
+            self,
+            in_channle=1,
+            encoder_size=200,
+            decoder_size=128,
+            emb_dim=128,
+            num_classes=None,
+            beam_size=0,
+            bos_id=0,
+            eos_id=1,
+            max_out_len=20, ):
+        super(Seq2SeqAttInferModel, self).__init__(
+            in_channle, encoder_size, decoder_size, emb_dim, num_classes)
+        self.beam_size = beam_size
+        # dynamic decoder for inference
+        decoder = BeamSearchDecoder(
+            self.decoder.decoder_attention.cell,
+            start_token=bos_id,
+            end_token=eos_id,
+            beam_size=beam_size,
+            embedding_fn=self.embedding,
+            output_fn=self.decoder.fc)
+        self.infer_decoder = DynamicDecode(
+            decoder, max_step_num=max_out_len, is_test=True)
+
+    def forward(self, inputs, *args):
+        """Decode ``inputs``; extra positional arguments are ignored."""
+        gru_backward, encoded_vector, encoded_proj = self.encoder(inputs)
+        decoder_boot = self.fc(gru_backward[:, 0])
+
+        if self.beam_size:
+            # Tile the batch dimension with beam_size
+            encoded_vector = BeamSearchDecoder.tile_beam_merge_with_batch(
+                encoded_vector, self.beam_size)
+            encoded_proj = BeamSearchDecoder.tile_beam_merge_with_batch(
+                encoded_proj, self.beam_size)
+        # dynamic decoding with beam search
+        rs, _ = self.infer_decoder(
+            inits=decoder_boot,
+            encoder_vec=encoded_vector,
+            encoder_proj=encoded_proj)
+        return rs
+
+
+class WeightCrossEntropy(Loss):
+    def __init__(self):
+        super(WeightCrossEntropy, self).__init__(average=False)
+
+    def forward(self, outputs, labels):
+        predict, (label, mask) = outputs[0], labels
+        loss = layers.cross_entropy(predict, label=label)
+        loss = layers.elementwise_mul(loss, mask, axis=0)
+        loss = layers.reduce_sum(loss)
+        return loss
--- a/examples/ocr/train.py
+++ b/examples/ocr/train.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+
+import os
+import sys
+import random
+import numpy as np
+
+import argparse
+import functools
+
+import paddle.fluid.profiler as profiler
+import paddle.fluid as fluid
+
+from hapi.model import Input, set_device
+
+from utility import add_arguments, print_arguments
+from utility import SeqAccuracy, MyProgBarLogger
+from seq2seq_attn import Seq2SeqAttModel, WeightCrossEntropy
+import data
+
+# Command-line options of the training script. Every option is registered
+# through add_arguments, which maps bool options onto strtobool parsing.
+# NOTE: train_images, train_list, test_images, test_list and resume_path are
+# parsed here but are not read by main() below.
+parser = argparse.ArgumentParser(description=__doc__)
+add_arg = functools.partial(add_arguments, argparser=parser)
+# yapf: disable
+add_arg('batch_size',        int,   32,           "Minibatch size.")
+add_arg('epoch',             int,   30,           "Epoch number.")
+add_arg('num_workers',       int,   0,            "workers number.")
+add_arg('lr',                float, 0.001,        "Learning rate.")
+add_arg('lr_decay_strategy', str,   "",           "Learning rate decay strategy.")
+add_arg('checkpoint_path',   str,   "checkpoint", "The directory the model to be saved to.")
+add_arg('train_images',      str,   None,         "The directory of images to be used for training.")
+add_arg('train_list',        str,   None,         "The list file of images to be used for training.")
+add_arg('test_images',       str,   None,         "The directory of images to be used for test.")
+add_arg('test_list',         str,   None,         "The list file of images to be used for test.")
+add_arg('resume_path',       str,   None,         "The init model file or directory.")
+add_arg('use_gpu',           bool,  True,         "Whether use GPU to train.")
+# model hyper parameters
+add_arg('encoder_size',      int,   200,     "Encoder size.")
+add_arg('decoder_size',      int,   128,     "Decoder size.")
+add_arg('embedding_dim',     int,   128,     "Word vector dim.")
+add_arg('num_classes',       int,   95,     "Number classes.")
+add_arg('gradient_clip',     float, 5.0,     "Gradient clip value.")
+add_arg('dynamic',           bool,  False,      "Whether to use dygraph.")
+# yapf: enable
+
+
+def main(FLAGS):
+    """Build the attention OCR model, then train and evaluate it.
+
+    Args:
+        FLAGS: Parsed command-line namespace. This function reads use_gpu,
+            dynamic, encoder_size, decoder_size, embedding_dim, num_classes,
+            lr, lr_decay_strategy, gradient_clip, batch_size, num_workers,
+            epoch and checkpoint_path.
+    """
+    device = set_device("gpu" if FLAGS.use_gpu else "cpu")
+    if FLAGS.dynamic:
+        fluid.enable_dygraph(device)
+
+    model = Seq2SeqAttModel(
+        encoder_size=FLAGS.encoder_size,
+        decoder_size=FLAGS.decoder_size,
+        emb_dim=FLAGS.embedding_dim,
+        num_classes=FLAGS.num_classes)
+
+    # Constant learning rate unless piecewise decay was requested.
+    base_lr = FLAGS.lr
+    learning_rate = base_lr
+    if FLAGS.lr_decay_strategy == "piecewise_decay":
+        boundaries = [200000, 250000]
+        values = [base_lr, base_lr * 0.1, base_lr * 0.01]
+        learning_rate = fluid.layers.piecewise_decay(boundaries, values)
+
+    grad_clip = fluid.clip.GradientClipByGlobalNorm(FLAGS.gradient_clip)
+    optimizer = fluid.optimizer.Adam(
+        learning_rate=learning_rate,
+        parameter_list=model.parameters(),
+        grad_clip=grad_clip)
+
+    # Network inputs (image, decoder input ids) and loss/metric targets
+    # (expected ids, mask).
+    pixel = Input([None, 1, 48, 384], "float32", name="pixel")
+    label_in = Input([None, None], "int64", name="label_in")
+    label_out = Input([None, None], "int64", name="label_out")
+    mask = Input([None, None], "float32", name="mask")
+
+    model.prepare(
+        optimizer,
+        WeightCrossEntropy(),
+        SeqAccuracy(),
+        inputs=[pixel, label_in],
+        labels=[label_out, mask])
+
+    def make_collate():
+        # Each loader gets its own instance of the same batch pipeline.
+        return data.BatchCompose(
+            [data.Resize(), data.Normalize(), data.PadTarget()])
+
+    # Training data: shuffled batches, loaded with the requested workers.
+    train_dataset = data.train()
+    train_collate_fn = make_collate()
+    train_sampler = data.MyBatchSampler(
+        train_dataset, batch_size=FLAGS.batch_size, shuffle=True)
+    train_loader = fluid.io.DataLoader(
+        train_dataset,
+        batch_sampler=train_sampler,
+        places=device,
+        num_workers=FLAGS.num_workers,
+        return_list=True,
+        collate_fn=train_collate_fn)
+
+    # Evaluation data: fixed order, no dropped batch, loaded in-process.
+    test_dataset = data.test()
+    test_collate_fn = make_collate()
+    test_sampler = data.MyBatchSampler(
+        test_dataset,
+        batch_size=FLAGS.batch_size,
+        drop_last=False,
+        shuffle=False)
+    test_loader = fluid.io.DataLoader(
+        test_dataset,
+        batch_sampler=test_sampler,
+        places=device,
+        num_workers=0,
+        return_list=True,
+        collate_fn=test_collate_fn)
+
+    progress = MyProgBarLogger(10, 2, FLAGS.batch_size)
+    model.fit(train_data=train_loader,
+              eval_data=test_loader,
+              epochs=FLAGS.epoch,
+              save_dir=FLAGS.checkpoint_path,
+              callbacks=[progress])
+
+
+# Script entry point: parse the command line, print the resulting
+# configuration, then run training.
+if __name__ == '__main__':
+    FLAGS = parser.parse_args()
+    print_arguments(FLAGS)
+    main(FLAGS)
--- a/examples/ocr/utility.py
+++ b/examples/ocr/utility.py
+"""Contains common utility functions."""
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import distutils.util
+import numpy as np
+import paddle.fluid as fluid
+import six
+
+from hapi.metrics import Metric
+from hapi.callbacks import ProgBarLogger
+
+
+def print_arguments(args):
+    """Print every attribute of an argparse namespace, sorted by name.
+
+    Usage:
+
+    .. code-block:: python
+
+        parser = argparse.ArgumentParser()
+        parser.add_argument("--name", default="John", type=str, help="User name.")
+        args = parser.parse_args()
+        print_arguments(args)
+
+    :param args: Namespace whose attributes are printed as ``name: value``.
+    :type args: argparse.Namespace
+    """
+    header = "-----------  Configuration Arguments -----------"
+    footer = "------------------------------------------------"
+    entries = sorted(six.iteritems(vars(args)))
+    print(header)
+    for key, val in entries:
+        print("%s: %s" % (key, val))
+    print(footer)
+
+
+def add_arguments(argname, type, default, help, argparser, **kwargs):
+    """Register the option ``--<argname>`` on an argparse parser.
+
+    Options declared with type ``bool`` are parsed with
+    ``distutils.util.strtobool``. The help text gets the default value
+    appended to it.
+
+    Usage:
+
+    .. code-block:: python
+
+        parser = argparse.ArgumentParser()
+        add_arguments("name", str, "John", "User name.", parser)
+        args = parser.parse_args()
+
+    :param argname: Option name without the leading dashes.
+    :param type: Callable used to convert the command-line string.
+    :param default: Value used when the option is absent.
+    :param help: Help text shown by ``--help``.
+    :param argparser: Parser the option is added to.
+    :param kwargs: Extra keyword arguments passed to ``add_argument``.
+    """
+    converter = type
+    if type == bool:
+        converter = distutils.util.strtobool
+    flag = "--" + argname
+    help_text = help + ' Default: %(default)s.'
+    argparser.add_argument(
+        flag, default=default, type=converter, help=help_text, **kwargs)
+
+
+class SeqAccuracy(Metric):
+    """Whole-sequence accuracy of the top-1 prediction.
+
+    Args:
+        name (str, optional): Name reported by ``name()``. Defaults to
+            'seq_acc'.
+    """
+
+    def __init__(self, name=None, *args, **kwargs):
+        super(SeqAccuracy, self).__init__(*args, **kwargs)
+        # Honour a caller-supplied name and keep the historical default.
+        self._name = name or 'seq_acc'
+        self.reset()
+
+    def add_metric_op(self, output, label, mask, *args, **kwargs):
+        """Reduce the network output to top-1 ids for ``update``."""
+        pred = fluid.layers.flatten(output, axis=2)
+        _, topk = fluid.layers.topk(pred, 1)
+        return topk, label, mask
+
+    def update(self, topk, label, mask, *args, **kwargs):
+        """Accumulate one batch and return that batch's accuracy.
+
+        Args:
+            topk: Top-1 ids, reshaped here to one row per sample.
+            label: Reference ids, one row per sample.
+            mask: Per-position weights; the row sum is used as the length.
+
+        Returns:
+            float: Fraction of sequences in this batch that match exactly.
+        """
+        topk = topk.reshape(label.shape[0], -1)
+        seq_len = np.sum(mask, -1)
+        acc = 0
+        for i in range(label.shape[0]):
+            # Only the first (mask sum - 2) positions are compared.
+            # NOTE(review): presumably this drops the end token and the
+            # padding; confirm against the label/mask layout of the data.
+            l = int(seq_len[i] - 1)
+            pred = topk[i][:l - 1]
+            ref = label[i][:l - 1]
+            if np.array_equal(pred, ref):
+                self.total += 1
+                acc += 1
+            self.count += 1
+        return float(acc) / label.shape[0]
+
+    def reset(self):
+        """Clear the accumulated counters."""
+        self.total = 0.
+        self.count = 0.
+
+    def accumulate(self):
+        """Return the accuracy over everything seen since ``reset``."""
+        # Nothing accumulated yet: report 0 instead of dividing by zero.
+        if not self.count:
+            return 0.
+        return float(self.total) / self.count
+
+    def name(self):
+        """Return the metric name."""
+        return self._name
+
+
+class MyProgBarLogger(ProgBarLogger):
+    """Progress-bar logger that reports the loss divided by the batch size.
+
+    Args:
+        log_freq (int): Forwarded to ProgBarLogger.
+        verbose (int): Forwarded to ProgBarLogger.
+        train_bs (int, optional): Divisor applied to the loss in the
+            training hooks. The loss is left unscaled when it is not set.
+        eval_bs (int, optional): Divisor applied in the evaluation hooks;
+            falls back to ``train_bs``.
+    """
+
+    def __init__(self, log_freq=1, verbose=2, train_bs=None, eval_bs=None):
+        super(MyProgBarLogger, self).__init__(log_freq, verbose)
+        self.train_bs = train_bs
+        self.eval_bs = eval_bs if eval_bs else train_bs
+
+    @staticmethod
+    def _with_scaled_loss(logs, batch_size):
+        """Return a copy of ``logs`` with its 'loss' values scaled down.
+
+        The caller's dict is left untouched, so a dict that is reused
+        across several hooks is never scaled more than once. Logs without
+        a 'loss' entry, or a missing batch size, pass through unscaled.
+        """
+        scaled = dict(logs or {})
+        if batch_size and 'loss' in scaled:
+            scaled['loss'] = [l / batch_size for l in scaled['loss']]
+        return scaled
+
+    def on_train_batch_end(self, step, logs=None):
+        logs = self._with_scaled_loss(logs, self.train_bs)
+        super(MyProgBarLogger, self).on_train_batch_end(step, logs)
+
+    def on_epoch_end(self, epoch, logs=None):
+        logs = self._with_scaled_loss(logs, self.train_bs)
+        super(MyProgBarLogger, self).on_epoch_end(epoch, logs)
+
+    def on_eval_batch_end(self, step, logs=None):
+        logs = self._with_scaled_loss(logs, self.eval_bs)
+        super(MyProgBarLogger, self).on_eval_batch_end(step, logs)
+
+    def on_eval_end(self, logs=None):
+        logs = self._with_scaled_loss(logs, self.eval_bs)
+        super(MyProgBarLogger, self).on_eval_end(logs)
+
+
+def index2word(ids):
+    """Map ids to characters by adding 33 to each id and taking ``chr``."""
+    chars = []
+    for token_id in ids:
+        code_point = int(token_id + 33)
+        chars.append(chr(code_point))
+    return chars
+
+
+def postprocess(seq, bos_idx=0, eos_idx=1):
+    """Cut a sequence at its first end token and drop the special ids.
+
+    Args:
+        seq: Sequence of ids; a numpy array (or subclass) is converted to
+            a plain list first.
+        bos_idx: Id of the start token, removed from the result.
+        eos_idx: Id of the end token; everything after its first
+            occurrence is discarded and the token itself is removed.
+
+    Returns:
+        list: The remaining ids in their original order.
+    """
+    # isinstance (rather than an exact type match) also converts ndarray
+    # subclasses, so the result always holds plain Python values.
+    if isinstance(seq, np.ndarray):
+        seq = seq.tolist()
+    # Without an end token the whole sequence is kept.
+    eos_pos = next(
+        (pos for pos, idx in enumerate(seq) if idx == eos_idx), len(seq) - 1)
+    return [
+        idx for idx in seq[:eos_pos + 1] if idx != bos_idx and idx != eos_idx
+    ]
+
+
+class SeqBeamAccuracy(Metric):
+    """Sequence accuracy where any beam matching the reference counts.
+
+    Args:
+        name (str, optional): Name reported by ``name()``. Defaults to
+            'seq_acc'.
+    """
+
+    def __init__(self, name=None, *args, **kwargs):
+        super(SeqBeamAccuracy, self).__init__(*args, **kwargs)
+        # Honour a caller-supplied name and keep the historical default.
+        self._name = name or 'seq_acc'
+        self.reset()
+
+    def add_metric_op(self, output, label, mask, *args, **kwargs):
+        """Pass the raw outputs through to ``update`` unchanged."""
+        return output, label, mask
+
+    def update(self, preds, labels, masks, *args, **kwargs):
+        """Accumulate one batch and return that batch's accuracy.
+
+        Args:
+            preds: Predicted ids; a 2-D array gets a trailing axis of
+                size 1 added before axes 1 and 2 are swapped.
+            labels: Reference ids, one row per sample.
+            masks: Unused; kept for interface compatibility.
+
+        Returns:
+            float: Fraction of samples with at least one matching beam.
+        """
+        preds = preds[:, :, np.newaxis] if len(preds.shape) == 2 else preds
+        # After the transpose, preds[i] iterates over the beams of sample i.
+        preds = np.transpose(preds, [0, 2, 1])
+        acc = 0
+        for i in range(labels.shape[0]):
+            ref = np.array(postprocess(labels[i]))
+            hit = any(
+                np.array_equal(np.array(postprocess(beam)), ref)
+                for beam in preds[i])
+            if hit:
+                self.total += 1
+                acc += 1
+            self.count += 1
+        return float(acc) / labels.shape[0]
+
+    def reset(self):
+        """Clear the accumulated counters."""
+        self.total = 0.
+        self.count = 0.
+
+    def accumulate(self):
+        """Return the accuracy over everything seen since ``reset``."""
+        # Nothing accumulated yet: report 0 instead of dividing by zero.
+        if not self.count:
+            return 0.
+        return float(self.total) / self.count
+
+    def name(self):
+        """Return the metric name."""
+        return self._name
--- a/hapi/callbacks.py
+++ b/hapi/callbacks.py
@@ -218,8 +218,6 @@ class ProgBarLogger(Callback):
            # if steps is not None, last step will update in on_epoch_end
            if self.steps and self.train_step < self.steps:
                self._updates(logs, 'train')
-            else:
-                self._updates(logs, 'train')

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
@@ -238,7 +236,7 @@ class ProgBarLogger(Callback):

    def on_eval_batch_end(self, step, logs=None):
        logs = logs or {}
-        self.eval_step = step
+        self.eval_step += 1
        samples = logs.get('batch_size', 1)
        self.evaled_samples += samples


--- a/hapi/datasets/folder.py
+++ b/hapi/datasets/folder.py
@@ -18,7 +18,7 @@ import cv2

 from paddle.io import Dataset

-__all__ = ["DatasetFolder"]
+__all__ = ["DatasetFolder", "ImageFolder"]


 def has_valid_extension(filename, extensions):
@@ -164,3 +164,80 @@ IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif',

 def cv2_loader(path):
    return cv2.imread(path)
+
+
+class ImageFolder(Dataset):
+    """A generic data loader where the samples are arranged in this way:
+
+        root/1.ext
+        root/2.ext
+        root/sub_dir/3.ext
+
+    Args:
+        root (string): Root directory path.
+        loader (callable, optional): A function to load a sample given its path.
+        extensions (tuple[string], optional): Allowed file extensions.
+            Defaults to IMG_EXTENSIONS when is_valid_file is not given;
+            extensions and is_valid_file must not both be passed.
+        transform (callable, optional): A function/transform that takes in
+            a sample and returns a transformed version.
+        is_valid_file (callable, optional): A function that takes the path
+            of a file and returns whether to keep it;
+            extensions and is_valid_file must not both be passed.
+
+    Raises:
+        ValueError: If both extensions and is_valid_file are passed.
+        RuntimeError: If no valid file is found under root.
+
+    Attributes:
+        samples (list): List of sample path
+    """
+
+    def __init__(self,
+                 root,
+                 loader=None,
+                 extensions=None,
+                 transform=None,
+                 is_valid_file=None):
+        self.root = root
+        if extensions is not None and is_valid_file is not None:
+            raise ValueError(
+                "extensions and is_valid_file cannot both be specified")
+        # Default to the image extensions only when no filter was supplied;
+        # defaulting unconditionally made every custom is_valid_file fail.
+        if extensions is None and is_valid_file is None:
+            extensions = IMG_EXTENSIONS
+        if extensions is not None:
+
+            def is_valid_file(x):
+                return has_valid_extension(x, extensions)
+
+        samples = []
+        path = os.path.expanduser(root)
+        for dirpath, _, fnames in sorted(os.walk(path, followlinks=True)):
+            for fname in sorted(fnames):
+                f = os.path.join(dirpath, fname)
+                if is_valid_file(f):
+                    samples.append(f)
+
+        if not samples:
+            msg = "Found 0 files in subfolders of: " + self.root
+            if extensions is not None:
+                msg += "\nSupported extensions are: " + ",".join(extensions)
+            raise RuntimeError(msg)
+
+        self.loader = cv2_loader if loader is None else loader
+        self.extensions = extensions
+        self.samples = samples
+        self.transform = transform
+
+    def __getitem__(self, index):
+        """Return a one-element list holding the loaded, transformed sample."""
+        path = self.samples[index]
+        sample = self.loader(path)
+        if self.transform is not None:
+            sample = self.transform(sample)
+        return [sample]
+
+    def __len__(self):
+        return len(self.samples)
--- a/hapi/model.py
+++ b/hapi/model.py
@@ -1161,7 +1161,7 @@ class Model(fluid.dygraph.Layer):
        if fluid.in_dygraph_mode():
            feed_list = None
        else:
-            feed_list = [x.forward() for x in self._inputs + self._labels]
+            feed_list = [x.forward() for x in self._inputs]

        if test_data is not None and isinstance(test_data, Dataset):
            test_sampler = DistributedBatchSampler(
@@ -1236,10 +1236,10 @@ class Model(fluid.dygraph.Layer):
            callbacks.on_batch_begin(mode, step, logs)
            if mode == 'train':
                outs = self.train_batch(data[:len(self._inputs)],
-                                           data[len(self._inputs):])
+                                        data[len(self._inputs):])
            else:
                outs = self.eval_batch(data[:len(self._inputs)],
-                                          data[len(self._inputs):])
+                                       data[len(self._inputs):])

            # losses
            loss = outs[0] if self._metrics else outs