diff --git a/docs/advanced_tutorials/READER.md b/docs/advanced_tutorials/READER.md
index 4d87fdbbb1291bf49960940e6c94e5daa0e89e39..836ef0a5c2054e8d33b31a2293d3e1bacb929446 100644
--- a/docs/advanced_tutorials/READER.md
+++ b/docs/advanced_tutorials/READER.md
@@ -34,8 +34,6 @@ PaddleDetection的数据处理模块是一个Python模块，所有代码逻辑
   ├── tests  # 单元测试模块
   │   ├── test_dataset.py # 对数据集解析、加载等进行单元测试
   │   │   ...
-  ├── tools  # 一些有用的工具
-  │   ├── x2coco.py       # 将其他数据集转换为COCO数据集格式
   ├── transform  # 数据预处理模块
   │   ├── batch_operators.py  # 定义各类基于批量数据的预处理算子
   │   ├── op_helper.py    # 预处理算子的辅助函数
diff --git a/docs/advanced_tutorials/TRANSFER_LEARNING.md b/docs/advanced_tutorials/TRANSFER_LEARNING.md
index f88873b49f0e0d128a2ce44cf450571e20ccfe8a..407379a4cd1c72b469ad3381d52622edfb3a5fb6 100644
--- a/docs/advanced_tutorials/TRANSFER_LEARNING.md
+++ b/docs/advanced_tutorials/TRANSFER_LEARNING.md
@@ -8,7 +8,7 @@ In transfer learning, if different dataset and the number of classes is used, th
 
 ### Use custom dataset
 
-Transfer learning needs custom dataset and annotation in COCO-format and VOC-format is supported now. The script converts the annotation from labelme or cityscape to COCO is provided in ```ppdet/data/tools/x2coco.py```. More details please refer to [READER](READER.md). After data preparation, update the data parameters in configuration file.
+Transfer learning requires a custom dataset; annotations in COCO format and VOC format are currently supported. A script that converts annotations from VOC, labelme or cityscape format to COCO format is provided in ```tools/x2coco.py```. For more details, please refer to [READER](READER.md). After preparing the data, update the data parameters in the configuration file.
 
 
 1. COCO-format dataset, take [yolov3\_darknet.yml](https://github.com/PaddlePaddle/PaddleDetection/blob/master/configs/yolov3_darknet.yml#L66) for example, modify the COCODataSet in yolov3\_reader:
diff --git a/docs/advanced_tutorials/TRANSFER_LEARNING_cn.md b/docs/advanced_tutorials/TRANSFER_LEARNING_cn.md
index 5cd5d7fe78b283f2ad6d7a0c7cf20ab01c3dc9f5..022ef0c96c52ac5764e26835715f459b5a61b0a6 100644
--- a/docs/advanced_tutorials/TRANSFER_LEARNING_cn.md
+++ b/docs/advanced_tutorials/TRANSFER_LEARNING_cn.md
@@ -7,7 +7,7 @@
 
 ### 选择数据
 
-迁移学习需要使用自己的数据集，目前已支持COCO和VOC的数据标注格式，在```ppdet/data/tools/x2coco.py```中给出了labelme和cityscape标注格式转换为COCO格式的脚本，具体使用方式可以参考[自定义数据源](READER.md)。数据准备完成后，在配置文件中配置数据路径，对应修改reader中的路径参数即可。
+迁移学习需要使用自己的数据集，目前已支持COCO和VOC的数据标注格式，在```tools/x2coco.py```中给出了voc、labelme和cityscape标注格式转换为COCO格式的脚本，具体使用方式可以参考[自定义数据源](READER.md)。数据准备完成后，在配置文件中配置数据路径，对应修改reader中的路径参数即可。
 
 1. COCO数据集需要修改COCODataSet中的参数，以[yolov3\_darknet.yml](https://github.com/PaddlePaddle/PaddleDetection/blob/master/configs/yolov3_darknet.yml#L66)为例，修改yolov3\_reader中的配置：
 
diff --git a/docs/tutorials/Custom_DataSet.md b/docs/tutorials/Custom_DataSet.md
index 3734d799476a1e286c208174bafc4fc6e5396c45..aab35436a8ea6a44e84f57152514fd201a1a0932 100644
--- a/docs/tutorials/Custom_DataSet.md
+++ b/docs/tutorials/Custom_DataSet.md
@@ -15,26 +15,42 @@
 
 ### 方式一：将数据集转换为COCO格式
 
-在`./tools/`中提供了`x2coco.py`用于将labelme标注的数据集或cityscape数据集转换为COCO数据集:
+在`./tools/`中提供了`x2coco.py`用于将voc格式数据集、labelme标注的数据集或cityscape数据集转换为COCO数据集，例如:
+
+（1）labelme数据转换为COCO格式：
 ```bash
-python ./ppdet/data/tools/x2coco.py \
+python tools/x2coco.py \
                 --dataset_type labelme \
                 --json_input_dir ./labelme_annos/ \
                 --image_input_dir ./labelme_imgs/ \
                 --output_dir ./cocome/ \
                 --train_proportion 0.8 \
                 --val_proportion 0.2 \
-                --test_proportion 0.0 \
+                --test_proportion 0.0
+```
+（2）voc数据转换为COCO格式：
+```bash
+python tools/x2coco.py \
+        --dataset_type voc \
+        --voc_anno_dir path/to/VOCdevkit/VOC2007/Annotations/ \
+        --voc_anno_list path/to/VOCdevkit/VOC2007/ImageSets/Main/trainval.txt \
+        --voc_label_list dataset/voc/label_list.txt \
+        --voc_out_name voc_train.json
 ```
+
 **参数说明：**
 
-- `--dataset_type`：需要转换的数据格式，目前支持：’labelme‘和’cityscape‘
+- `--dataset_type`：需要转换的数据格式，目前支持：‘voc’、‘labelme’和‘cityscape’
 - `--json_input_dir`：使用labelme标注的json文件所在文件夹
 - `--image_input_dir`：图像文件所在文件夹
 - `--output_dir`：转换后的COCO格式数据集存放位置
 - `--train_proportion`：标注数据中用于train的比例
 - `--val_proportion`：标注数据中用于validation的比例
 - `--test_proportion`：标注数据中用于infer的比例
+- `--voc_anno_dir`：VOC数据转换为COCO数据集时的voc数据集标注文件路径
+- `--voc_anno_list`：VOC数据转换为COCO数据集时的标注列表文件，一般是`ImageSets/Main`下trainval.txt和test.txt文件
+- `--voc_label_list`：VOC数据转换为COCO数据集时的类别列表文件，文件中每一行表示一种物体类别
+- `--voc_out_name`：VOC数据转换为COCO数据集时输出的COCO格式json文件名
 
 ### 方式二：将数据集转换为VOC格式
 
diff --git a/ppdet/data/source/coco.py b/ppdet/data/source/coco.py
index 6b31ccb3547f73284c5213433301237ad7a47432..67c561786446722e2b538d78cf8cf0488a168691 100644
--- a/ppdet/data/source/coco.py
+++ b/ppdet/data/source/coco.py
@@ -137,14 +137,14 @@ class COCODataSet(DataSet):
                     y1 = max(0, y)
                     x2 = min(im_w - 1, x1 + max(0, box_w - 1))
                     y2 = min(im_h - 1, y1 + max(0, box_h - 1))
-                    if inst['area'] > 0 and x2 >= x1 and y2 >= y1:
+                    if x2 >= x1 and y2 >= y1:
                         inst['clean_bbox'] = [x1, y1, x2, y2]
                         bboxes.append(inst)
                     else:
                         logger.warn(
                             'Found an invalid bbox in annotations: im_id: {}, '
-                            'area: {} x1: {}, y1: {}, x2: {}, y2: {}.'.format(
-                                img_id, float(inst['area']), x1, y1, x2, y2))
+                            'x1: {}, y1: {}, x2: {}, y2: {}.'.format(
+                                img_id, x1, y1, x2, y2))
                 num_bbox = len(bboxes)
 
                 gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32)
diff --git a/ppdet/data/tools/x2coco.py b/ppdet/data/tools/x2coco.py
deleted file mode 100644
index 53faa3f5c6c48ea3b81d65df1b18aa7aa1a1f5e1..0000000000000000000000000000000000000000
--- a/ppdet/data/tools/x2coco.py
+++ /dev/null
@@ -1,306 +0,0 @@
-#!/usr/bin/env python
-# coding: utf-8
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import glob
-import json
-import os
-import os.path as osp
-import sys
-import shutil
-
-import numpy as np
-import PIL.ImageDraw
-
-label_to_num = {}
-categories_list = []
-labels_list = []
-
-
-class MyEncoder(json.JSONEncoder):
-    def default(self, obj):
-        if isinstance(obj, np.integer):
-            return int(obj)
-        elif isinstance(obj, np.floating):
-            return float(obj)
-        elif isinstance(obj, np.ndarray):
-            return obj.tolist()
-        else:
-            return super(MyEncoder, self).default(obj)
-
-
-def getbbox(self, points):
-    polygons = points
-    mask = self.polygons_to_mask([self.height, self.width], polygons)
-    return self.mask2box(mask)
-
-
-def images_labelme(data, num):
-    image = {}
-    image['height'] = data['imageHeight']
-    image['width'] = data['imageWidth']
-    image['id'] = num + 1
-    image['file_name'] = data['imagePath'].split('/')[-1]
-    return image
-
-
-def images_cityscape(data, num, img_file):
-    image = {}
-    image['height'] = data['imgHeight']
-    image['width'] = data['imgWidth']
-    image['id'] = num + 1
-    image['file_name'] = img_file
-    return image
-
-
-def categories(label, labels_list):
-    category = {}
-    category['supercategory'] = 'component'
-    category['id'] = len(labels_list) + 1
-    category['name'] = label
-    return category
-
-
-def annotations_rectangle(points, label, image_num, object_num, label_to_num):
-    annotation = {}
-    seg_points = np.asarray(points).copy()
-    seg_points[1, :] = np.asarray(points)[2, :]
-    seg_points[2, :] = np.asarray(points)[1, :]
-    annotation['segmentation'] = [list(seg_points.flatten())]
-    annotation['iscrowd'] = 0
-    annotation['image_id'] = image_num + 1
-    annotation['bbox'] = list(
-        map(float, [
-            points[0][0], points[0][1], points[1][0] - points[0][0], points[1][
-                1] - points[0][1]
-        ]))
-    annotation['area'] = annotation['bbox'][2] * annotation['bbox'][3]
-    annotation['category_id'] = label_to_num[label]
-    annotation['id'] = object_num + 1
-    return annotation
-
-
-def annotations_polygon(height, width, points, label, image_num, object_num,
-                        label_to_num):
-    annotation = {}
-    annotation['segmentation'] = [list(np.asarray(points).flatten())]
-    annotation['iscrowd'] = 0
-    annotation['image_id'] = image_num + 1
-    annotation['bbox'] = list(map(float, get_bbox(height, width, points)))
-    annotation['area'] = annotation['bbox'][2] * annotation['bbox'][3]
-    annotation['category_id'] = label_to_num[label]
-    annotation['id'] = object_num + 1
-    return annotation
-
-
-def get_bbox(height, width, points):
-    polygons = points
-    mask = np.zeros([height, width], dtype=np.uint8)
-    mask = PIL.Image.fromarray(mask)
-    xy = list(map(tuple, polygons))
-    PIL.ImageDraw.Draw(mask).polygon(xy=xy, outline=1, fill=1)
-    mask = np.array(mask, dtype=bool)
-    index = np.argwhere(mask == 1)
-    rows = index[:, 0]
-    clos = index[:, 1]
-    left_top_r = np.min(rows)
-    left_top_c = np.min(clos)
-    right_bottom_r = np.max(rows)
-    right_bottom_c = np.max(clos)
-    return [
-        left_top_c, left_top_r, right_bottom_c - left_top_c,
-        right_bottom_r - left_top_r
-    ]
-
-
-def deal_json(ds_type, img_path, json_path):
-    data_coco = {}
-    images_list = []
-    annotations_list = []
-    image_num = -1
-    object_num = -1
-    for img_file in os.listdir(img_path):
-        img_label = os.path.splitext(img_file)[0]
-        if img_file.split('.')[
-                -1] not in ['bmp', 'jpg', 'jpeg', 'png', 'JPEG', 'JPG', 'PNG']:
-            continue
-        label_file = osp.join(json_path, img_label + '.json')
-        print('Generating dataset from:', label_file)
-        image_num = image_num + 1
-        with open(label_file) as f:
-            data = json.load(f)
-            if ds_type == 'labelme':
-                images_list.append(images_labelme(data, image_num))
-            elif ds_type == 'cityscape':
-                images_list.append(images_cityscape(data, image_num, img_file))
-            if ds_type == 'labelme':
-                for shapes in data['shapes']:
-                    object_num = object_num + 1
-                    label = shapes['label']
-                    if label not in labels_list:
-                        categories_list.append(categories(label, labels_list))
-                        labels_list.append(label)
-                        label_to_num[label] = len(labels_list)
-                    points = shapes['points']
-                    p_type = shapes['shape_type']
-                    if p_type == 'polygon':
-                        annotations_list.append(
-                            annotations_polygon(data['imageHeight'], data[
-                                'imageWidth'], points, label, image_num,
-                                                object_num, label_to_num))
-
-                    if p_type == 'rectangle':
-                        points.append([points[0][0], points[1][1]])
-                        points.append([points[1][0], points[0][1]])
-                        annotations_list.append(
-                            annotations_rectangle(points, label, image_num,
-                                                  object_num, label_to_num))
-            elif ds_type == 'cityscape':
-                for shapes in data['objects']:
-                    object_num = object_num + 1
-                    label = shapes['label']
-                    if label not in labels_list:
-                        categories_list.append(categories(label, labels_list))
-                        labels_list.append(label)
-                        label_to_num[label] = len(labels_list)
-                    points = shapes['polygon']
-                    annotations_list.append(
-                        annotations_polygon(data['imgHeight'], data[
-                            'imgWidth'], points, label, image_num, object_num,
-                                            label_to_num))
-    data_coco['images'] = images_list
-    data_coco['categories'] = categories_list
-    data_coco['annotations'] = annotations_list
-    return data_coco
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument('--dataset_type', help='the type of dataset')
-    parser.add_argument('--json_input_dir', help='input annotated directory')
-    parser.add_argument('--image_input_dir', help='image directory')
-    parser.add_argument(
-        '--output_dir', help='output dataset directory', default='../../../')
-    parser.add_argument(
-        '--train_proportion',
-        help='the proportion of train dataset',
-        type=float,
-        default=1.0)
-    parser.add_argument(
-        '--val_proportion',
-        help='the proportion of validation dataset',
-        type=float,
-        default=0.0)
-    parser.add_argument(
-        '--test_proportion',
-        help='the proportion of test dataset',
-        type=float,
-        default=0.0)
-    args = parser.parse_args()
-    try:
-        assert args.dataset_type in ['labelme', 'cityscape']
-    except AssertionError as e:
-        print('Now only support the cityscape dataset and labelme dataset!!')
-        os._exit(0)
-    try:
-        assert os.path.exists(args.json_input_dir)
-    except AssertionError as e:
-        print('The json folder does not exist!')
-        os._exit(0)
-    try:
-        assert os.path.exists(args.image_input_dir)
-    except AssertionError as e:
-        print('The image folder does not exist!')
-        os._exit(0)
-    try:
-        assert abs(args.train_proportion + args.val_proportion \
-                   + args.test_proportion - 1.0) < 1e-5
-    except AssertionError as e:
-        print(
-            'The sum of pqoportion of training, validation and test datase must be 1!'
-        )
-        os._exit(0)
-
-    # Allocate the dataset.
-    total_num = len(glob.glob(osp.join(args.json_input_dir, '*.json')))
-    if args.train_proportion != 0:
-        train_num = int(total_num * args.train_proportion)
-        os.makedirs(args.output_dir + '/train')
-    else:
-        train_num = 0
-    if args.val_proportion == 0.0:
-        val_num = 0
-        test_num = total_num - train_num
-        if args.test_proportion != 0.0:
-            os.makedirs(args.output_dir + '/test')
-    else:
-        val_num = int(total_num * args.val_proportion)
-        test_num = total_num - train_num - val_num
-        os.makedirs(args.output_dir + '/val')
-        if args.test_proportion != 0.0:
-            os.makedirs(args.output_dir + '/test')
-    count = 1
-    for img_name in os.listdir(args.image_input_dir):
-        if count <= train_num:
-            if osp.exists(args.output_dir + '/train/'):
-                shutil.copyfile(
-                    osp.join(args.image_input_dir, img_name),
-                    osp.join(args.output_dir + '/train/', img_name))
-        else:
-            if count <= train_num + val_num:
-                if osp.exists(args.output_dir + '/val/'):
-                    shutil.copyfile(
-                        osp.join(args.image_input_dir, img_name),
-                        osp.join(args.output_dir + '/val/', img_name))
-            else:
-                if osp.exists(args.output_dir + '/test/'):
-                    shutil.copyfile(
-                        osp.join(args.image_input_dir, img_name),
-                        osp.join(args.output_dir + '/test/', img_name))
-        count = count + 1
-
-    # Deal with the json files.
-    if not os.path.exists(args.output_dir + '/annotations'):
-        os.makedirs(args.output_dir + '/annotations')
-    if args.train_proportion != 0:
-        train_data_coco = deal_json(
-            args.dataset_type, args.output_dir + '/train', args.json_input_dir)
-        train_json_path = osp.join(args.output_dir + '/annotations',
-                                   'instance_train.json')
-        json.dump(
-            train_data_coco,
-            open(train_json_path, 'w'),
-            indent=4,
-            cls=MyEncoder)
-    if args.val_proportion != 0:
-        val_data_coco = deal_json(args.dataset_type, args.output_dir + '/val',
-                                  args.json_input_dir)
-        val_json_path = osp.join(args.output_dir + '/annotations',
-                                 'instance_val.json')
-        json.dump(
-            val_data_coco, open(val_json_path, 'w'), indent=4, cls=MyEncoder)
-    if args.test_proportion != 0:
-        test_data_coco = deal_json(args.dataset_type, args.output_dir + '/test',
-                                   args.json_input_dir)
-        test_json_path = osp.join(args.output_dir + '/annotations',
-                                  'instance_test.json')
-        json.dump(
-            test_data_coco, open(test_json_path, 'w'), indent=4, cls=MyEncoder)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/tools/cpp_infer.py b/tools/cpp_infer.py
deleted file mode 100644
index 1165ed5378e34f76f41c18d06deb7ce6da30f742..0000000000000000000000000000000000000000
--- a/tools/cpp_infer.py
+++ /dev/null
@@ -1,630 +0,0 @@
-import os
-import time
-
-import numpy as np
-from PIL import Image, ImageDraw
-
-import paddle.fluid as fluid
-
-import argparse
-import cv2
-import yaml
-import copy
-
-import logging
-FORMAT = '%(asctime)s-%(levelname)s: %(message)s'
-logging.basicConfig(level=logging.INFO, format=FORMAT)
-logger = logging.getLogger(__name__)
-
-precision_map = {
-    'trt_int8': fluid.core.AnalysisConfig.Precision.Int8,
-    'trt_fp32': fluid.core.AnalysisConfig.Precision.Float32,
-    'trt_fp16': fluid.core.AnalysisConfig.Precision.Half
-}
-
-
-def create_config(model_path, mode='fluid', batch_size=1, min_subgraph_size=3):
-    model_file = os.path.join(model_path, '__model__')
-    params_file = os.path.join(model_path, '__params__')
-    config = fluid.core.AnalysisConfig(model_file, params_file)
-    config.enable_use_gpu(100, 0)
-    config.switch_use_feed_fetch_ops(False)
-    config.switch_specify_input_names(True)
-    logger.info('min_subgraph_size = %d.' % (min_subgraph_size))
-
-    if mode in precision_map.keys():
-        config.enable_tensorrt_engine(
-            workspace_size=1 << 30,
-            max_batch_size=batch_size,
-            min_subgraph_size=min_subgraph_size,
-            precision_mode=precision_map[mode],
-            use_static=False,
-            use_calib_mode=mode == 'trt_int8')
-        logger.info('Run inference by {}.'.format(mode))
-    elif mode == 'fluid':
-        logger.info('Run inference by Fluid FP32.')
-    else:
-        logger.fatal(
-            'Wrong mode, only support trt_int8, trt_fp32, trt_fp16, fluid.')
-    return config
-
-
-def offset_to_lengths(lod):
-    offset = lod[0]
-    lengths = [offset[i + 1] - offset[i] for i in range(len(offset) - 1)]
-    return [lengths]
-
-
-def DecodeImage(im_path):
-    assert os.path.exists(im_path), "Image path {} can not be found".format(
-        im_path)
-    with open(im_path, 'rb') as f:
-        im = f.read()
-    data = np.frombuffer(im, dtype='uint8')
-    im = cv2.imdecode(data, 1)  # BGR mode, but need RGB mode
-    im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
-    return im
-
-
-def get_extra_info(im, arch, shape, scale):
-    info = []
-    input_shape = []
-    im_shape = []
-    logger.info('The architecture is {}'.format(arch))
-    if 'YOLO' in arch:
-        im_size = np.array([shape[:2]]).astype('int32')
-        logger.info('Extra info: im_size')
-        info.append(im_size)
-    elif arch in ['SSD', 'Face']:
-        im_shape = np.array([shape[:2]]).astype('int32')
-        logger.info('Extra info: im_shape')
-        info.append([im_shape])
-    elif 'RetinaNet' in arch:
-        input_shape.extend(im.shape[2:])
-        im_info = np.array([input_shape + [scale]]).astype('float32')
-        logger.info('Extra info: im_info')
-        info.append(im_info)
-    elif 'RCNN' in arch:
-        input_shape.extend(im.shape[2:])
-        im_shape.extend(shape[:2])
-        im_info = np.array([input_shape + [scale]]).astype('float32')
-        im_shape = np.array([im_shape + [1.]]).astype('float32')
-        logger.info('Extra info: im_info, im_shape')
-        info.append(im_info)
-        info.append(im_shape)
-    else:
-        logger.error(
-            "Unsupported arch: {}, expect YOLO, SSD, RetinaNet, RCNN and Face".
-            format(arch))
-    return info
-
-
-class Resize(object):
-    def __init__(self,
-                 target_size,
-                 max_size=0,
-                 interp=cv2.INTER_LINEAR,
-                 use_cv2=True,
-                 image_shape=None):
-        super(Resize, self).__init__()
-        self.target_size = target_size
-        self.max_size = max_size
-        self.interp = interp
-        self.use_cv2 = use_cv2
-        self.image_shape = image_shape
-
-    def __call__(self, im):
-        origin_shape = im.shape[:2]
-        im_c = im.shape[2]
-        if self.max_size != 0:
-            im_size_min = np.min(origin_shape[0:2])
-            im_size_max = np.max(origin_shape[0:2])
-            im_scale = float(self.target_size) / float(im_size_min)
-            if np.round(im_scale * im_size_max) > self.max_size:
-                im_scale = float(self.max_size) / float(im_size_max)
-            im_scale_x = im_scale
-            im_scale_y = im_scale
-            resize_w = int(im_scale_x * float(origin_shape[1]))
-            resize_h = int(im_scale_y * float(origin_shape[0]))
-        else:
-            im_scale_x = float(self.target_size) / float(origin_shape[1])
-            im_scale_y = float(self.target_size) / float(origin_shape[0])
-            resize_w = self.target_size
-            resize_h = self.target_size
-        if self.use_cv2:
-            im = cv2.resize(
-                im,
-                None,
-                None,
-                fx=im_scale_x,
-                fy=im_scale_y,
-                interpolation=self.interp)
-        else:
-            if self.max_size != 0:
-                raise TypeError(
-                    'If you set max_size to cap the maximum size of image,'
-                    'please set use_cv2 to True to resize the image.')
-            im = im.astype('uint8')
-            im = Image.fromarray(im)
-            im = im.resize((int(resize_w), int(resize_h)), self.interp)
-            im = np.array(im)
-        # padding im
-        if self.max_size != 0 and self.image_shape is not None:
-            padding_im = np.zeros(
-                (self.max_size, self.max_size, im_c), dtype=np.float32)
-            im_h, im_w = im.shape[:2]
-            padding_im[:im_h, :im_w, :] = im
-            im = padding_im
-        return im, im_scale_x
-
-
-class Normalize(object):
-    def __init__(self, mean, std, is_scale=True, is_channel_first=False):
-        super(Normalize, self).__init__()
-        self.mean = mean
-        self.std = std
-        self.is_scale = is_scale
-        self.is_channel_first = is_channel_first
-
-    def __call__(self, im):
-        im = im.astype(np.float32, copy=False)
-        if self.is_channel_first:
-            mean = np.array(self.mean)[:, np.newaxis, np.newaxis]
-            std = np.array(self.std)[:, np.newaxis, np.newaxis]
-        else:
-            mean = np.array(self.mean)[np.newaxis, np.newaxis, :]
-            std = np.array(self.std)[np.newaxis, np.newaxis, :]
-        if self.is_scale:
-            im = im / 255.0
-        im -= mean
-        im /= std
-        return im
-
-
-class Permute(object):
-    def __init__(self, to_bgr=False, channel_first=True):
-        self.to_bgr = to_bgr
-        self.channel_first = channel_first
-
-    def __call__(self, im):
-        if self.channel_first:
-            im = im.transpose((2, 0, 1))
-        if self.to_bgr:
-            im = im[[2, 1, 0], :, :]
-        return im.copy()
-
-
-class PadStride(object):
-    def __init__(self, stride=0):
-        assert stride >= 0, "Unsupported stride: {},"
-        " the stride in PadStride must be greater "
-        "or equal to 0".format(stride)
-        self.coarsest_stride = stride
-
-    def __call__(self, im):
-        coarsest_stride = self.coarsest_stride
-        if coarsest_stride == 0:
-            return im
-        im_c, im_h, im_w = im.shape
-        pad_h = int(np.ceil(float(im_h) / coarsest_stride) * coarsest_stride)
-        pad_w = int(np.ceil(float(im_w) / coarsest_stride) * coarsest_stride)
-        padding_im = np.zeros((im_c, pad_h, pad_w), dtype=np.float32)
-        padding_im[:, :im_h, :im_w] = im
-        return padding_im
-
-
-def Preprocess(img_path, arch, config):
-    img = DecodeImage(img_path)
-    orig_shape = img.shape
-    scale = 1.
-    data = []
-    data_config = copy.deepcopy(config)
-    for data_aug_conf in data_config:
-        obj = data_aug_conf.pop('type')
-        preprocess = eval(obj)(**data_aug_conf)
-        if obj == 'Resize':
-            img, scale = preprocess(img)
-        else:
-            img = preprocess(img)
-
-    img = img[np.newaxis, :]  # N, C, H, W
-    data.append(img)
-    extra_info = get_extra_info(img, arch, orig_shape, scale)
-    data += extra_info
-    return data
-
-
-def get_category_info(with_background, label_list):
-    if label_list[0] != 'background' and with_background:
-        label_list.insert(0, 'background')
-    if label_list[0] == 'background' and not with_background:
-        label_list = label_list[1:]
-    clsid2catid = {i: i for i in range(len(label_list))}
-    catid2name = {i: name for i, name in enumerate(label_list)}
-    return clsid2catid, catid2name
-
-
-def clip_bbox(bbox):
-    xmin = max(min(bbox[0], 1.), 0.)
-    ymin = max(min(bbox[1], 1.), 0.)
-    xmax = max(min(bbox[2], 1.), 0.)
-    ymax = max(min(bbox[3], 1.), 0.)
-    return xmin, ymin, xmax, ymax
-
-
-def bbox2out(results, clsid2catid, is_bbox_normalized=False):
-    """
-    Args:
-        results: request a dict, should include: `bbox`, `im_id`,
-                 if is_bbox_normalized=True, also need `im_shape`.
-        clsid2catid: class id to category id map of COCO2017 dataset.
-        is_bbox_normalized: whether or not bbox is normalized.
-    """
-    xywh_res = []
-    for t in results:
-        bboxes = t['bbox'][0]
-        lengths = t['bbox'][1][0]
-        if bboxes.shape == (1, 1) or bboxes is None:
-            continue
-
-        k = 0
-        for i in range(len(lengths)):
-            num = lengths[i]
-            for j in range(num):
-                dt = bboxes[k]
-                clsid, score, xmin, ymin, xmax, ymax = dt.tolist()
-                catid = (clsid2catid[int(clsid)])
-
-                if is_bbox_normalized:
-                    xmin, ymin, xmax, ymax = \
-                            clip_bbox([xmin, ymin, xmax, ymax])
-                    w = xmax - xmin
-                    h = ymax - ymin
-                    im_shape = t['im_shape'][0][i].tolist()
-                    im_height, im_width = int(im_shape[0]), int(im_shape[1])
-                    xmin *= im_width
-                    ymin *= im_height
-                    w *= im_width
-                    h *= im_height
-                else:
-                    w = xmax - xmin + 1
-                    h = ymax - ymin + 1
-
-                bbox = [xmin, ymin, w, h]
-                coco_res = {'category_id': catid, 'bbox': bbox, 'score': score}
-                xywh_res.append(coco_res)
-                k += 1
-    return xywh_res
-
-
-def expand_boxes(boxes, scale):
-    """
-    Expand an array of boxes by a given scale.
-    """
-    w_half = (boxes[:, 2] - boxes[:, 0]) * .5
-    h_half = (boxes[:, 3] - boxes[:, 1]) * .5
-    x_c = (boxes[:, 2] + boxes[:, 0]) * .5
-    y_c = (boxes[:, 3] + boxes[:, 1]) * .5
-
-    w_half *= scale
-    h_half *= scale
-
-    boxes_exp = np.zeros(boxes.shape)
-    boxes_exp[:, 0] = x_c - w_half
-    boxes_exp[:, 2] = x_c + w_half
-    boxes_exp[:, 1] = y_c - h_half
-    boxes_exp[:, 3] = y_c + h_half
-
-    return boxes_exp
-
-
-def mask2out(results, clsid2catid, resolution, thresh_binarize=0.5):
-    import pycocotools.mask as mask_util
-    scale = (resolution + 2.0) / resolution
-
-    segm_res = []
-
-    for t in results:
-        bboxes = t['bbox'][0]
-        lengths = t['bbox'][1][0]
-        if bboxes.shape == (1, 1) or bboxes is None:
-            continue
-        if len(bboxes.tolist()) == 0:
-            continue
-        masks = t['mask'][0]
-
-        s = 0
-        # for each sample
-        for i in range(len(lengths)):
-            num = lengths[i]
-            im_shape = t['im_shape'][i]
-
-            bbox = bboxes[s:s + num][:, 2:]
-            clsid_scores = bboxes[s:s + num][:, 0:2]
-            mask = masks[s:s + num]
-            s += num
-
-            im_h = int(im_shape[0])
-            im_w = int(im_shape[1])
-
-            expand_bbox = expand_boxes(bbox, scale)
-            expand_bbox = expand_bbox.astype(np.int32)
-
-            padded_mask = np.zeros(
-                (resolution + 2, resolution + 2), dtype=np.float32)
-
-            for j in range(num):
-                xmin, ymin, xmax, ymax = expand_bbox[j].tolist()
-                clsid, score = clsid_scores[j].tolist()
-                clsid = int(clsid)
-                padded_mask[1:-1, 1:-1] = mask[j, clsid, :, :]
-
-                catid = clsid2catid[clsid]
-
-                w = xmax - xmin + 1
-                h = ymax - ymin + 1
-                w = np.maximum(w, 1)
-                h = np.maximum(h, 1)
-
-                resized_mask = cv2.resize(padded_mask, (w, h))
-                resized_mask = np.array(
-                    resized_mask > thresh_binarize, dtype=np.uint8)
-                im_mask = np.zeros((im_h, im_w), dtype=np.uint8)
-
-                x0 = min(max(xmin, 0), im_w)
-                x1 = min(max(xmax + 1, 0), im_w)
-                y0 = min(max(ymin, 0), im_h)
-                y1 = min(max(ymax + 1, 0), im_h)
-
-                im_mask[y0:y1, x0:x1] = resized_mask[(y0 - ymin):(y1 - ymin), (
-                    x0 - xmin):(x1 - xmin)]
-                segm = mask_util.encode(
-                    np.array(
-                        im_mask[:, :, np.newaxis], order='F'))[0]
-                catid = clsid2catid[clsid]
-                segm['counts'] = segm['counts'].decode('utf8')
-                coco_res = {
-                    'category_id': catid,
-                    'segmentation': segm,
-                    'score': score
-                }
-                segm_res.append(coco_res)
-    return segm_res
-
-
-def color_map(num_classes):
-    color_map = num_classes * [0, 0, 0]
-    for i in range(0, num_classes):
-        j = 0
-        lab = i
-        while lab:
-            color_map[i * 3] |= (((lab >> 0) & 1) << (7 - j))
-            color_map[i * 3 + 1] |= (((lab >> 1) & 1) << (7 - j))
-            color_map[i * 3 + 2] |= (((lab >> 2) & 1) << (7 - j))
-            j += 1
-            lab >>= 3
-    color_map = np.array(color_map).reshape(-1, 3)
-    return color_map
-
-
-def draw_bbox(image, catid2name, bboxes, threshold, color_list):
-    """
-    draw bbox on image
-    """
-    draw = ImageDraw.Draw(image)
-
-    for dt in np.array(bboxes):
-        catid, bbox, score = dt['category_id'], dt['bbox'], dt['score']
-        if score < threshold:
-            continue
-
-        xmin, ymin, w, h = bbox
-        xmax = xmin + w
-        ymax = ymin + h
-
-        color = tuple(color_list[catid])
-
-        # draw bbox
-        draw.line(
-            [(xmin, ymin), (xmin, ymax), (xmax, ymax), (xmax, ymin),
-             (xmin, ymin)],
-            width=2,
-            fill=color)
-
-        # draw label
-        text = "{} {:.2f}".format(catid2name[catid], score)
-        tw, th = draw.textsize(text)
-        draw.rectangle(
-            [(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill=color)
-        draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255))
-
-    return image
-
-
-def draw_mask(image, masks, threshold, color_list, alpha=0.7):
-    """
-    Draw mask on image
-    """
-    mask_color_id = 0
-    w_ratio = .4
-    img_array = np.array(image).astype('float32')
-    for dt in np.array(masks):
-        segm, score = dt['segmentation'], dt['score']
-        if score < threshold:
-            continue
-        import pycocotools.mask as mask_util
-        mask = mask_util.decode(segm) * 255
-        color_mask = color_list[mask_color_id % len(color_list), 0:3]
-        mask_color_id += 1
-        for c in range(3):
-            color_mask[c] = color_mask[c] * (1 - w_ratio) + w_ratio * 255
-        idx = np.nonzero(mask)
-        img_array[idx[0], idx[1], :] *= 1.0 - alpha
-        img_array[idx[0], idx[1], :] += alpha * color_mask
-    return Image.fromarray(img_array.astype('uint8'))
-
-
-def get_bbox_result(output, result, conf, clsid2catid):
-    is_bbox_normalized = True if conf['arch'] in ['SSD', 'Face'] else False
-    lengths = offset_to_lengths(output.lod())
-    np_data = np.array(output) if conf[
-        'use_python_inference'] else output.copy_to_cpu()
-    result['bbox'] = (np_data, lengths)
-    result['im_id'] = np.array([[0]])
-
-    bbox_results = bbox2out([result], clsid2catid, is_bbox_normalized)
-    return bbox_results
-
-
-def get_mask_result(output, result, conf, clsid2catid):
-    resolution = conf['mask_resolution']
-    bbox_out, mask_out = output
-    lengths = offset_to_lengths(bbox_out.lod())
-    bbox = np.array(bbox_out) if conf[
-        'use_python_inference'] else bbox_out.copy_to_cpu()
-    mask = np.array(mask_out) if conf[
-        'use_python_inference'] else mask_out.copy_to_cpu()
-    result['bbox'] = (bbox, lengths)
-    result['mask'] = (mask, lengths)
-    mask_results = mask2out([result], clsid2catid, conf['mask_resolution'])
-    return mask_results
-
-
-def visualize(bbox_results, catid2name, num_classes, mask_results=None):
-    image = Image.open(FLAGS.infer_img).convert('RGB')
-    color_list = color_map(num_classes)
-    image = draw_bbox(image, catid2name, bbox_results, 0.5, color_list)
-    if mask_results is not None:
-        image = draw_mask(image, mask_results, 0.5, color_list)
-    image_path = os.path.split(FLAGS.infer_img)[-1]
-    if not os.path.exists(FLAGS.output_dir):
-        os.makedirs(FLAGS.output_dir)
-    out_path = os.path.join(FLAGS.output_dir, image_path)
-    image.save(out_path, quality=95)
-    logger.info('Save visualize result to {}'.format(out_path))
-
-
-def infer():
-    logger.info("cpp_infer.py is deprecated since release/0.3. Please use"
-                "deploy/python for your python deployment")
-    model_path = FLAGS.model_path
-    config_path = FLAGS.config_path
-    res = {}
-    assert model_path is not None, "Model path: {} does not exist!".format(
-        model_path)
-    assert config_path is not None, "Config path: {} does not exist!".format(
-        config_path)
-    with open(config_path) as f:
-        conf = yaml.safe_load(f)
-
-    use_trt = not conf['use_python_inference'] and 'trt' in conf['mode']
-    if use_trt:
-        logger.warning(
-            "Due to the limitation of tensorRT, the image shape needs to set in export_model"
-        )
-    img_data = Preprocess(FLAGS.infer_img, conf['arch'], conf['Preprocess'])
-    if conf['arch'] in ['SSD', 'Face']:
-        img_data, res['im_shape'] = img_data
-        img_data = [img_data]
-
-    if conf['use_python_inference']:
-        place = fluid.CUDAPlace(0)
-        exe = fluid.Executor(place)
-        infer_prog, feed_var_names, fetch_targets = fluid.io.load_inference_model(
-            dirname=model_path,
-            executor=exe,
-            model_filename='__model__',
-            params_filename='__params__')
-        data_dict = {k: v for k, v in zip(feed_var_names, img_data)}
-    else:
-        config = create_config(
-            model_path,
-            mode=conf['mode'],
-            min_subgraph_size=conf['min_subgraph_size'])
-        predict = fluid.core.create_paddle_predictor(config)
-        input_names = predict.get_input_names()
-        for ind, d in enumerate(img_data):
-            input_tensor = predict.get_input_tensor(input_names[ind])
-            input_tensor.copy_from_cpu(d.copy())
-
-    logger.info('warmup...')
-    for i in range(10):
-        if conf['use_python_inference']:
-            outs = exe.run(infer_prog,
-                           feed=data_dict,
-                           fetch_list=fetch_targets,
-                           return_numpy=False)
-        else:
-            predict.zero_copy_run()
-
-    cnt = 100
-    logger.info('run benchmark...')
-    t1 = time.time()
-    for i in range(cnt):
-        if conf['use_python_inference']:
-            outs = exe.run(infer_prog,
-                           feed=data_dict,
-                           fetch_list=fetch_targets,
-                           return_numpy=False)
-        else:
-            outs = []
-            predict.zero_copy_run()
-            output_names = predict.get_output_names()
-            for o_name in output_names:
-                outs.append(predict.get_output_tensor(o_name))
-    t2 = time.time()
-
-    ms = (t2 - t1) * 1000.0 / float(cnt)
-
-    print("Inference: {} ms per batch image".format(ms))
-
-    clsid2catid, catid2name = get_category_info(conf['with_background'],
-                                                conf['label_list'])
-    bbox_result = get_bbox_result(outs[0], res, conf, clsid2catid)
-
-    mask_result = None
-    if 'mask_resolution' in conf:
-        res['im_shape'] = img_data[-1]
-        mask_result = get_mask_result(outs, res, conf, clsid2catid)
-
-    if FLAGS.visualize:
-        visualize(bbox_result, catid2name, len(conf['label_list']), mask_result)
-
-    if FLAGS.dump_result:
-        import json
-        bbox_file = os.path.join(FLAGS.output_dir, 'bbox.json')
-        logger.info('dump bbox to {}'.format(bbox_file))
-        with open(bbox_file, 'w') as f:
-            json.dump(bbox_result, f)
-        if mask_result is not None:
-            mask_file = os.path.join(FLAGS.output_dir, 'mask.json')
-            logger.info('dump mask to {}'.format(mask_file))
-            with open(mask_file, 'w') as f:
-                json.dump(mask_result, f)
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description=__doc__)
-    parser.add_argument(
-        "--model_path", type=str, default=None, help="model path.")
-    parser.add_argument(
-        "--config_path", type=str, default=None, help="preprocess config path.")
-    parser.add_argument(
-        "--infer_img", type=str, default=None, help="Image path")
-    parser.add_argument(
-        "--visualize",
-        action='store_true',
-        default=False,
-        help="Whether to visualize detection output")
-    parser.add_argument(
-        "--output_dir",
-        type=str,
-        default="output",
-        help="Directory for storing the output visualization files.")
-    parser.add_argument(
-        "--dump_result",
-        action='store_true',
-        default=False,
-        help="Whether to dump result")
-    FLAGS = parser.parse_args()
-    infer()
diff --git a/tools/x2coco.py b/tools/x2coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4549746c2f1fdc3752105d44792e4316d817b28
--- /dev/null
+++ b/tools/x2coco.py
@@ -0,0 +1,446 @@
+#!/usr/bin/env python
+# coding: utf-8
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import glob
+import json
+import os
+import os.path as osp
+import sys
+import shutil
+import xml.etree.ElementTree as ET
+from tqdm import tqdm
+import re
+
+import numpy as np
+import PIL.ImageDraw
+
+label_to_num = {}  # label name -> category id; filled by deal_json()
+categories_list = []  # COCO category dicts; appended to by deal_json()
+labels_list = []  # label names in first-seen order; appended to by deal_json()
+
+
+class MyEncoder(json.JSONEncoder):
+    """JSON encoder that converts numpy scalars and arrays to plain Python."""
+    def default(self, obj):
+        if isinstance(obj, np.ndarray):
+            return obj.tolist()
+        if isinstance(obj, np.floating):
+            return float(obj)
+        if isinstance(obj, np.integer):
+            return int(obj)
+        return super(MyEncoder, self).default(obj)
+
+
+def getbbox(self, points):  # NOTE(review): module-level function that takes `self`; nothing in this file calls it
+    polygons = points
+    mask = self.polygons_to_mask([self.height, self.width], polygons)  # `self` must provide polygons_to_mask, height and width
+    return self.mask2box(mask)  # ...and mask2box; the result is returned unchanged
+
+
+def images_labelme(data, num):
+    """Build the COCO image entry for a labelme record; the id is num + 1."""
+    return {
+        'height': data['imageHeight'],
+        'width': data['imageWidth'],
+        'id': num + 1,
+        'file_name': data['imagePath'].split('/')[-1]}
+
+
+def images_cityscape(data, num, img_file):
+    """Build the COCO image entry for a cityscape record; the id is num + 1."""
+    return {
+        'height': data['imgHeight'],
+        'width': data['imgWidth'],
+        'id': num + 1,
+        'file_name': img_file}
+
+
+def categories(label, labels_list):
+    """Create the COCO category for `label`; its id is len(labels_list) + 1."""
+    return {
+        'supercategory': 'component',
+        'id': len(labels_list) + 1,
+        'name': label}
+
+
+def annotations_rectangle(points, label, image_num, object_num, label_to_num):
+    """Build a COCO annotation from rectangle corner points."""
+    # Swap rows 1 and 2 so the segmentation ring uses the reordered corners.
+    ring = np.asarray(points).copy()
+    ring[[1, 2], :] = ring[[2, 1], :]
+    first, second = points[0], points[1]
+    # bbox is [x, y, w, h] measured from points[0] to points[1].
+    box = [float(v) for v in (first[0], first[1], second[0] - first[0],
+                              second[1] - first[1])]
+    return {
+        'segmentation': [list(ring.flatten())],
+        'iscrowd': 0,
+        'image_id': image_num + 1,
+        'bbox': box,
+        'area': box[2] * box[3],
+        'category_id': label_to_num[label],
+        'id': object_num + 1}
+
+
+def annotations_polygon(height, width, points, label, image_num, object_num,
+                        label_to_num):
+    """Build a COCO annotation for a polygon; bbox comes from get_bbox()."""
+    outline = [list(np.asarray(points).flatten())]
+    box = [float(v) for v in get_bbox(height, width, points)]
+    return {
+        'segmentation': outline,
+        'iscrowd': 0, 'image_id': image_num + 1,
+        'bbox': box, 'area': box[2] * box[3],
+        'category_id': label_to_num[label],
+        'id': object_num + 1}
+
+
+def get_bbox(height, width, points):
+    """Rasterize the polygon on a height x width canvas and return its box.
+
+    Returns [x_min, y_min, x_max - x_min, y_max - y_min] of the drawn pixels.
+    """
+    canvas = PIL.Image.fromarray(np.zeros([height, width], dtype=np.uint8))
+    vertices = [tuple(pt) for pt in points]
+    PIL.ImageDraw.Draw(canvas).polygon(xy=vertices, outline=1, fill=1)
+    filled = np.array(canvas, dtype=bool)
+    # argwhere yields (row, col) pairs, i.e. (y, x).
+    hits = np.argwhere(filled == 1)
+    ys, xs = hits[:, 0], hits[:, 1]
+    top, left = np.min(ys), np.min(xs)
+    bottom, right = np.max(ys), np.max(xs)
+    return [
+        left, top,
+        right - left, bottom - top
+    ]
+
+
+def deal_json(ds_type, img_path, json_path):
+    """Convert labelme/cityscape annotations for the images in img_path.
+
+    Each image `<stem>.<ext>` needs `<stem>.json` in json_path. Labels are
+    registered in the module-level labels_list / categories_list /
+    label_to_num, so category ids are shared by successive calls.
+    Returns a dict with the COCO 'images', 'categories', 'annotations'.
+    """
+    image_exts = ('bmp', 'jpg', 'jpeg', 'png')
+    images_list, annotations_list = [], []
+    image_num = object_num = -1
+    for img_file in os.listdir(img_path):
+        # Compare case-insensitively so 'BMP', 'Jpg', ... are not skipped.
+        if img_file.split('.')[-1].lower() not in image_exts:
+            continue
+        img_label = os.path.splitext(img_file)[0]
+        label_file = osp.join(json_path, img_label + '.json')
+        print('Generating dataset from:', label_file)
+        image_num = image_num + 1
+        with open(label_file) as f:
+            data = json.load(f)
+        if ds_type == 'labelme':
+            images_list.append(images_labelme(data, image_num))
+            for shapes in data['shapes']:
+                object_num = object_num + 1
+                label = shapes['label']
+                if label not in labels_list:
+                    categories_list.append(categories(label, labels_list))
+                    labels_list.append(label)
+                    label_to_num[label] = len(labels_list)
+                p_type = shapes['shape_type']
+                if p_type == 'polygon':
+                    annotations_list.append(
+                        annotations_polygon(
+                            data['imageHeight'], data['imageWidth'],
+                            shapes['points'], label, image_num, object_num,
+                            label_to_num))
+                elif p_type == 'rectangle':
+                    (x1, y1), (x2, y2) = shapes['points']
+                    x1, x2 = sorted([x1, x2])
+                    y1, y2 = sorted([y1, y2])
+                    points = [[x1, y1], [x2, y2], [x1, y2], [x2, y1]]
+                    annotations_list.append(
+                        annotations_rectangle(points, label, image_num,
+                                              object_num, label_to_num))
+        elif ds_type == 'cityscape':
+            images_list.append(images_cityscape(data, image_num, img_file))
+            for shapes in data['objects']:
+                object_num = object_num + 1
+                label = shapes['label']
+                if label not in labels_list:
+                    categories_list.append(categories(label, labels_list))
+                    labels_list.append(label)
+                    label_to_num[label] = len(labels_list)
+                annotations_list.append(
+                    annotations_polygon(
+                        data['imgHeight'], data['imgWidth'],
+                        shapes['polygon'], label, image_num, object_num,
+                        label_to_num))
+    return {'images': images_list, 'categories': categories_list,
+            'annotations': annotations_list}
+
+
+def voc_get_label_anno(ann_dir_path, ann_ids_path, labels_path):
+    """Load the VOC label map and the annotation xml paths.
+
+    The label file holds one category per line (names may contain spaces);
+    ids start at 1 in file order. Whitespace-separated annotation ids get
+    '.xml' appended unless they already end with 'xml'.
+    """
+    with open(labels_path, 'r') as f:
+        labels_str = [line.strip() for line in f if line.strip()]
+    label2id = dict(zip(labels_str, range(1, len(labels_str) + 1)))
+
+    with open(ann_ids_path, 'r') as f:
+        ann_ids = f.read().split()
+    ann_paths = [os.path.join(ann_dir_path, aid if aid.endswith('xml')
+                              else aid + '.xml') for aid in ann_ids]
+    return label2id, ann_paths
+
+
+def voc_get_image_info(annotation_root, im_id):
+    """Return the COCO image record for one parsed VOC annotation.
+
+    'file_name' is the <filename> text exactly as written in the xml.
+    """
+    filename = annotation_root.findtext('filename')
+    assert filename is not None
+
+    size = annotation_root.find('size')
+    width = int(size.findtext('width'))
+    height = int(size.findtext('height'))
+    return {
+        'file_name': filename,
+        'height': height,
+        'width': width,
+        'id': im_id}
+
+
+def voc_get_coco_annotation(obj, label2id):
+    """Convert one VOC <object> element into a COCO annotation dict.
+
+    xmin/ymin are shifted by -1; float-formatted coordinates are truncated.
+    """
+    label = obj.findtext('name')
+    assert label in label2id, "label is not in label2id."
+    bndbox = obj.find('bndbox')
+    xmin, ymin, xmax, ymax = (int(float(bndbox.findtext(tag)))
+                              for tag in ('xmin', 'ymin', 'xmax', 'ymax'))
+    xmin, ymin = xmin - 1, ymin - 1
+    assert xmax > xmin and ymax > ymin, "Box size error."
+    o_width, o_height = xmax - xmin, ymax - ymin
+    return {
+        'area': o_width * o_height,
+        'iscrowd': 0,
+        'bbox': [xmin, ymin, o_width, o_height],
+        'category_id': label2id[label],
+        'ignore': 0,
+        'segmentation': []  # This script is not for segmentation
+    }
+
+
+def voc_xmls_to_cocojson(annotation_paths, label2id, output_dir, output_file):
+    """Convert VOC xml annotations into a single COCO-style json file.
+
+    Image ids start at 0 and bounding-box ids at 1, both following the order
+    of `annotation_paths`. The result is written to output_dir/output_file.
+    """
+    images, annotations = [], []
+    next_box_id = 1  # bounding box start id
+    print('Start converting !')
+    for image_index, xml_path in enumerate(tqdm(annotation_paths)):
+        # Read annotation xml
+        root = ET.parse(xml_path).getroot()
+        info = voc_get_image_info(root, image_index)
+        images.append(info)
+
+        for obj in root.findall('object'):
+            record = voc_get_coco_annotation(obj=obj, label2id=label2id)
+            record.update({'image_id': info['id'], 'id': next_box_id})
+            annotations.append(record)
+            next_box_id += 1
+
+    coco = {
+        "images": images,
+        "type": "instances",
+        "annotations": annotations,
+        "categories": [{
+            'supercategory': 'none',
+            'id': label_id,
+            'name': label
+        } for label, label_id in label2id.items()]
+    }
+    with open(os.path.join(output_dir, output_file), 'w') as f:
+        f.write(json.dumps(coco))
+
+
+def main():
+    """Command-line entry point: convert a dataset to COCO json.
+
+    --dataset_type selects the mode:
+
+    * 'voc': the xml files named in --voc_anno_list (looked up in
+      --voc_anno_dir) are converted with the labels of --voc_label_list
+      and written to <output_dir>/<voc_out_name>.
+    * 'labelme' / 'cityscape': the files of --image_input_dir are copied
+      into <output_dir>/train, /val and /test according to the three
+      proportions (which must sum to 1), then the annotations found in
+      --json_input_dir are written to
+      <output_dir>/annotations/instance_<split>.json for every split
+      whose proportion is non-zero.
+
+    Split sizes are derived from the number of '*.json' files in
+    --json_input_dir, while the copy loop walks os.listdir() of the image
+    folder in directory order, so the split is not shuffled.
+
+    Exit status:
+        0 on success; 1 (after printing a message) when an argument is
+        invalid.
+    """
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--dataset_type', help='the type of dataset')
+    parser.add_argument('--json_input_dir', help='input annotated directory')
+    parser.add_argument('--image_input_dir', help='image directory')
+    parser.add_argument(
+        '--output_dir', help='output dataset directory', default='./')
+    parser.add_argument(
+        '--train_proportion',
+        help='the proportion of train dataset',
+        type=float,
+        default=1.0)
+    parser.add_argument(
+        '--val_proportion',
+        help='the proportion of validation dataset',
+        type=float,
+        default=0.0)
+    parser.add_argument(
+        '--test_proportion',
+        help='the proportion of test dataset',
+        type=float,
+        default=0.0)
+    parser.add_argument(
+        '--voc_anno_dir',
+        help='In Voc format dataset, path to annotation files directory.',
+        type=str,
+        default=None)
+    parser.add_argument(
+        '--voc_anno_list',
+        help='In Voc format dataset, path to annotation files ids list.',
+        type=str,
+        default=None)
+    parser.add_argument(
+        '--voc_label_list',
+        help='In Voc format dataset, path to label list. The content of each line is a category.',
+        type=str,
+        default=None)
+    parser.add_argument(
+        '--voc_out_name',
+        type=str,
+        default='voc.json',
+        help='In Voc format dataset, path to output json file')
+    args = parser.parse_args()
+    if args.dataset_type not in ['voc', 'labelme', 'cityscape']:
+        print(
+            'Now only support the voc, cityscape dataset and labelme dataset!!')
+        # sys.exit flushes stdout, so the message is not lost when piped.
+        sys.exit(1)
+
+    if args.dataset_type == 'voc':
+        if not (args.voc_anno_dir and args.voc_anno_list and
+                args.voc_label_list):
+            print('--voc_anno_dir, --voc_anno_list and --voc_label_list '
+                  'are required for the voc dataset!')
+            sys.exit(1)
+        label2id, ann_paths = voc_get_label_anno(
+            args.voc_anno_dir, args.voc_anno_list, args.voc_label_list)
+        voc_xmls_to_cocojson(
+            annotation_paths=ann_paths,
+            label2id=label2id,
+            output_dir=args.output_dir,
+            output_file=args.voc_out_name)
+        return
+
+    # Explicit checks instead of `assert`, which `python -O` strips. An
+    # omitted option is None and is reported like a missing folder.
+    if not (args.json_input_dir and os.path.exists(args.json_input_dir)):
+        print('The json folder does not exist!')
+        sys.exit(1)
+    if not (args.image_input_dir and os.path.exists(args.image_input_dir)):
+        print('The image folder does not exist!')
+        sys.exit(1)
+    proportion_sum = (args.train_proportion + args.val_proportion +
+                      args.test_proportion)
+    # `not ... <` (rather than `>=`) also rejects a NaN sum.
+    if not abs(proportion_sum - 1.0) < 1e-5:
+        print(
+            'The sum of proportion of training, validation and test dataset must be 1!'
+        )
+        sys.exit(1)
+
+    # Allocate the dataset.
+    out_dir = args.output_dir
+    splits = [
+        ('train', args.train_proportion),
+        ('val', args.val_proportion),
+        ('test', args.test_proportion),
+    ]
+    # Split sizes come from the json count, not from the image count.
+    total_num = len(glob.glob(osp.join(args.json_input_dir, '*.json')))
+    train_num = int(total_num * args.train_proportion)
+    val_num = int(total_num * args.val_proportion)
+
+    for split, proportion in splits:
+        split_dir = out_dir + '/' + split
+        # Reuse a folder left by an earlier run; os.makedirs would raise.
+        if proportion != 0 and not osp.exists(split_dir):
+            os.makedirs(split_dir)
+
+    # The first train_num files go to train, the next val_num to val and
+    # the rest to test; a file is skipped when its split folder is absent.
+    for count, img_name in enumerate(os.listdir(args.image_input_dir), 1):
+        if count <= train_num:
+            split = 'train'
+        elif count <= train_num + val_num:
+            split = 'val'
+        else:
+            split = 'test'
+        split_dir = out_dir + '/' + split + '/'
+        if osp.exists(split_dir):
+            shutil.copyfile(
+                osp.join(args.image_input_dir, img_name),
+                osp.join(split_dir, img_name))
+
+    # Deal with the json files.
+    anno_dir = out_dir + '/annotations'
+    if not os.path.exists(anno_dir):
+        os.makedirs(anno_dir)
+
+    # Keep the train, val, test order: deal_json() hands out category ids
+    # in first-seen order through module-level tables.
+    for split, proportion in splits:
+        if proportion == 0:
+            continue
+        data_coco = deal_json(args.dataset_type,
+                              out_dir + '/' + split,
+                              args.json_input_dir)
+        json_path = osp.join(anno_dir, 'instance_' + split + '.json')
+        # `with` closes the file, so the json is flushed before returning.
+        with open(json_path, 'w') as f:
+            json.dump(data_coco, f, indent=4, cls=MyEncoder)
+
+
+if __name__ == '__main__':  # run the converter only when executed as a script
+    main()