Commit 06545fcf authored by: W Wojciech Uss, committed by: Tao Luo

Fp32 vs int8 qat C++ performance (#21244) (#21432)

Parent 072eb5b6
......@@ -16,6 +16,12 @@ function(download_int8_data install_dir data_file)
endif()
endfunction()
function(download_qat_data install_dir data_file)
if (NOT EXISTS ${install_dir}/${data_file})
inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8/QAT_models ${data_file})
endif()
endfunction()
function(download_model_and_data install_dir model_name data_name)
download_data(${install_dir} ${model_name})
download_data(${install_dir} ${data_name})
......@@ -27,7 +33,7 @@ function(inference_analysis_api_test target install_dir filename)
ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt)
endfunction()
function(inference_analysis_api_int8_test_build TARGET_NAME filename)
function(inference_analysis_api_test_build TARGET_NAME filename)
inference_analysis_test_build(${TARGET_NAME} SRCS ${filename}
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} benchmark)
endfunction()
......@@ -73,6 +79,18 @@ function(inference_analysis_api_test_with_refer_result target install_dir filena
--refer_result=${install_dir}/result.txt)
endfunction()
function(inference_analysis_api_qat_test_run TARGET_NAME test_binary fp32_model_dir int8_model_dir data_path)
inference_analysis_test_run(${TARGET_NAME}
COMMAND ${test_binary}
ARGS --fp32_model=${fp32_model_dir}
--int8_model=${int8_model_dir}
--infer_data=${data_path}
--batch_size=50
--paddle_num_threads=${CPU_NUM_THREADS_ON_CI}
--with_accuracy_layer=false
--iterations=2)
endfunction()
if(NOT APPLE AND WITH_MKLML)
# RNN1
set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1")
......@@ -186,9 +204,10 @@ download_data(${MOBILENET_MODEL_DIR} "mobilenet_model.tar.gz")
inference_analysis_api_test_with_fake_data_run(test_analyzer_mobilenet_depthwise_conv ${IMG_CLASS_TEST_APP}
${MOBILENET_MODEL_DIR} false)
### INT8 tests
if(WITH_MKLDNN)
### INT8 tests
set(INT8_DATA_DIR "${INFERENCE_DEMO_INSTALL_DIR}/int8v2")
### Image classification tests
......@@ -200,7 +219,7 @@ if(WITH_MKLDNN)
download_int8_data(${INT8_DATA_DIR} "imagenet_val_100_tail.tar.gz")
# build test binary to be used in subsequent tests
inference_analysis_api_int8_test_build(${INT8_IMG_CLASS_TEST_APP} ${INT8_IMG_CLASS_TEST_APP_SRC})
inference_analysis_api_test_build(${INT8_IMG_CLASS_TEST_APP} ${INT8_IMG_CLASS_TEST_APP_SRC})
# resnet50 int8
set(INT8_RESNET50_MODEL_DIR "${INT8_DATA_DIR}/resnet50")
......@@ -249,13 +268,29 @@ if(WITH_MKLDNN)
download_int8_data(${INT8_DATA_DIR} "pascalvoc_small.tar.gz")
# build test binary to be used in subsequent tests
inference_analysis_api_int8_test_build(${INT8_OBJ_DETECT_TEST_APP} ${INT8_OBJ_DETECT_TEST_APP_SRC})
inference_analysis_api_test_build(${INT8_OBJ_DETECT_TEST_APP} ${INT8_OBJ_DETECT_TEST_APP_SRC})
# mobilenet-ssd int8
set(INT8_MOBILENET_SSD_MODEL_DIR "${INT8_DATA_DIR}/mobilenet-ssd")
download_int8_data(${INT8_MOBILENET_SSD_MODEL_DIR} "mobilenet_ssd_int8_model.tar.gz" )
inference_analysis_api_object_dection_int8_test_run(test_analyzer_int8_mobilenet_ssd ${INT8_OBJ_DETECT_TEST_APP} ${INT8_MOBILENET_SSD_MODEL_DIR} ${PASCALVOC_DATA_PATH})
### optimized FP32 vs. QAT INT8 tests
set(QAT_DATA_DIR "${INFERENCE_DEMO_INSTALL_DIR}/int8v2")
set(QAT_IMG_CLASS_TEST_APP "test_analyzer_qat_image_classification")
set(QAT_IMG_CLASS_TEST_APP_SRC "analyzer_qat_image_classification_tester.cc")
# build test binary to be used in subsequent tests
inference_analysis_api_test_build(${QAT_IMG_CLASS_TEST_APP} ${QAT_IMG_CLASS_TEST_APP_SRC})
# ResNet50 FP32 vs. QAT INT8
set(QAT2_RESNET50_MODEL_DIR "${QAT_DATA_DIR}/ResNet50_qat_perf")
download_qat_data(${QAT2_RESNET50_MODEL_DIR} "ResNet50_qat_perf.tar.gz")
set(QAT2_INT8_RESNET50_MODEL_DIR "${QAT_DATA_DIR}/ResNet50_qat_perf_int8")
download_qat_data(${QAT2_INT8_RESNET50_MODEL_DIR} "ResNet50_qat_perf_int8.tar.gz")
inference_analysis_api_qat_test_run(test_analyzer_qat_performance_benchmark ${QAT_IMG_CLASS_TEST_APP} ${QAT2_RESNET50_MODEL_DIR}/ResNet50_qat_perf/float ${QAT2_INT8_RESNET50_MODEL_DIR}/ResNet50_qat_perf_int8 ${IMAGENET_DATA_PATH})
endif()
# bert, max_len=20, embedding_dim=128
......
......@@ -35,9 +35,9 @@ class TensorReader {
public:
TensorReader(std::ifstream &file, size_t beginning_offset,
std::vector<int> shape, std::string name)
: file_(file), position(beginning_offset), shape_(shape), name_(name) {
numel = std::accumulate(shape_.begin(), shape_.end(), size_t{1},
std::multiplies<size_t>());
: file_(file), position_(beginning_offset), shape_(shape), name_(name) {
numel_ = std::accumulate(shape_.begin(), shape_.end(), size_t{1},
std::multiplies<size_t>());
}
PaddleTensor NextBatch() {
......@@ -45,11 +45,11 @@ class TensorReader {
tensor.name = name_;
tensor.shape = shape_;
tensor.dtype = GetPaddleDType<T>();
tensor.data.Resize(numel * sizeof(T));
tensor.data.Resize(numel_ * sizeof(T));
file_.seekg(position);
file_.read(static_cast<char *>(tensor.data.data()), numel * sizeof(T));
position = file_.tellg();
file_.seekg(position_);
file_.read(static_cast<char *>(tensor.data.data()), numel_ * sizeof(T));
position_ = file_.tellg();
if (file_.eof()) LOG(ERROR) << name_ << ": reached end of stream";
if (file_.fail())
......@@ -60,10 +60,10 @@ class TensorReader {
protected:
std::ifstream &file_;
size_t position;
size_t position_;
std::vector<int> shape_;
std::string name_;
size_t numel;
size_t numel_;
};
std::shared_ptr<std::vector<PaddleTensor>> GetWarmupData(
......@@ -71,10 +71,13 @@ std::shared_ptr<std::vector<PaddleTensor>> GetWarmupData(
int num_images = FLAGS_warmup_batch_size) {
int test_data_batch_size = test_data[0][0].shape[0];
auto iterations = test_data.size();
PADDLE_ENFORCE(
static_cast<size_t>(num_images) <= iterations * test_data_batch_size,
"The requested quantization warmup data size " +
std::to_string(num_images) + " is bigger than all test data size.");
auto all_test_data_size = iterations * test_data_batch_size;
PADDLE_ENFORCE_LE(static_cast<size_t>(num_images), all_test_data_size,
platform::errors::InvalidArgument(
"The requested quantization warmup data size must be "
"smaller than the test data size. But received warmup "
"size is %d and test data size is %d",
num_images, all_test_data_size));
PaddleTensor images;
images.name = "image";
......
......@@ -50,7 +50,7 @@ template <typename T>
class TensorReader {
public:
TensorReader(std::ifstream &file, size_t beginning_offset, std::string name)
: file_(file), position(beginning_offset), name_(name) {}
: file_(file), position_(beginning_offset), name_(name) {}
PaddleTensor NextBatch(std::vector<int> shape, std::vector<size_t> lod) {
int numel =
......@@ -64,9 +64,9 @@ class TensorReader {
tensor.lod.clear();
tensor.lod.push_back(lod);
}
file_.seekg(position);
file_.seekg(position_);
file_.read(reinterpret_cast<char *>(tensor.data.data()), numel * sizeof(T));
position = file_.tellg();
position_ = file_.tellg();
if (file_.eof()) LOG(ERROR) << name_ << ": reached end of stream";
if (file_.fail())
throw std::runtime_error(name_ + ": failed reading file.");
......@@ -75,7 +75,7 @@ class TensorReader {
protected:
std::ifstream &file_;
size_t position;
size_t position_;
std::string name_;
};
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <fstream>
#include <iostream>
#include "paddle/fluid/inference/api/paddle_analysis_config.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
namespace analysis {
void SetConfig(AnalysisConfig *cfg, std::string model_path) {
cfg->SetModel(model_path);
cfg->DisableGpu();
cfg->SwitchIrOptim(false);
cfg->SwitchSpecifyInputNames();
cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
cfg->EnableMKLDNN();
}
template <typename T>
class TensorReader {
public:
TensorReader(std::ifstream &file, size_t beginning_offset,
std::vector<int> shape, std::string name)
: file_(file), position_(beginning_offset), shape_(shape), name_(name) {
numel_ = std::accumulate(shape_.begin(), shape_.end(), size_t{1},
std::multiplies<size_t>());
}
PaddleTensor NextBatch() {
PaddleTensor tensor;
tensor.name = name_;
tensor.shape = shape_;
tensor.dtype = GetPaddleDType<T>();
tensor.data.Resize(numel_ * sizeof(T));
file_.seekg(position_);
file_.read(static_cast<char *>(tensor.data.data()), numel_ * sizeof(T));
position_ = file_.tellg();
if (file_.eof()) LOG(ERROR) << name_ << ": reached end of stream";
if (file_.fail())
throw std::runtime_error(name_ + ": failed reading file.");
return tensor;
}
protected:
std::ifstream &file_;
size_t position_;
std::vector<int> shape_;
std::string name_;
size_t numel_;
};
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs,
bool with_accuracy_layer = FLAGS_with_accuracy_layer,
int32_t batch_size = FLAGS_batch_size) {
std::ifstream file(FLAGS_infer_data, std::ios::binary);
if (!file) {
FAIL() << "Couldn't open file: " << FLAGS_infer_data;
}
int64_t total_images{0};
file.read(reinterpret_cast<char *>(&total_images), sizeof(total_images));
LOG(INFO) << "Total images in file: " << total_images;
std::vector<int> image_batch_shape{batch_size, 3, 224, 224};
std::vector<int> label_batch_shape{batch_size, 1};
auto images_offset_in_file = static_cast<size_t>(file.tellg());
TensorReader<float> image_reader(file, images_offset_in_file,
image_batch_shape, "image");
auto iterations_max = total_images / batch_size;
auto iterations = iterations_max;
if (FLAGS_iterations > 0 && FLAGS_iterations < iterations_max) {
iterations = FLAGS_iterations;
}
auto labels_offset_in_file =
images_offset_in_file + sizeof(float) * total_images * 3 * 224 * 224;
TensorReader<int64_t> label_reader(file, labels_offset_in_file,
label_batch_shape, "label");
for (auto i = 0; i < iterations; i++) {
auto images = image_reader.NextBatch();
std::vector<PaddleTensor> tmp_vec;
tmp_vec.push_back(std::move(images));
if (with_accuracy_layer) {
auto labels = label_reader.NextBatch();
tmp_vec.push_back(std::move(labels));
}
inputs->push_back(std::move(tmp_vec));
}
}
TEST(Analyzer_qat_image_classification, quantization) {
AnalysisConfig fp32_cfg;
SetConfig(&fp32_cfg, FLAGS_fp32_model);
AnalysisConfig int8_cfg;
SetConfig(&int8_cfg, FLAGS_int8_model);
// read data from file and prepare batches with test data
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
// 0 is avg_cost, 1 is top1_accuracy, 2 is top5_accuracy or mAP
CompareAnalysisAndAnalysis(&fp32_cfg, &int8_cfg, input_slots_all,
FLAGS_with_accuracy_layer, 1);
}
} // namespace analysis
} // namespace inference
} // namespace paddle
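Note on the data layout: SetInput above reads FLAGS_infer_data as a flat binary file consisting of an int64 image count, followed by all float32 images in 3x224x224 (NCHW) order, and then one int64 label per image. A minimal sketch for producing such a file, using a hypothetical helper name and shapes inferred from the reader code above:

# Hypothetical generator for the binary layout read by SetInput:
# int64 total_images | float32 images [N, 3, 224, 224] | int64 labels [N, 1]
import numpy as np

def write_qat_test_data(path, images, labels):
    # images: float32 array of shape [N, 3, 224, 224]; labels: int64 array of shape [N]
    assert images.dtype == np.float32 and labels.dtype == np.int64
    with open(path, 'wb') as f:
        f.write(np.int64(images.shape[0]).tobytes())  # image count header
        f.write(images.tobytes())                      # all images, contiguous
        f.write(labels.tobytes())                      # all labels, contiguous

# Example: two random images with labels 0 and 1
write_qat_test_data('data.bin',
                    np.random.rand(2, 3, 224, 224).astype(np.float32),
                    np.array([0, 1], dtype=np.int64))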
......@@ -39,9 +39,13 @@
DEFINE_string(model_name, "", "model name");
DEFINE_string(infer_model, "", "model path");
DEFINE_string(fp32_model, "", "FP32 model path");
DEFINE_string(int8_model, "", "INT8 model path");
DEFINE_string(infer_data, "", "data file");
DEFINE_string(refer_result, "", "reference result for comparison");
DEFINE_int32(batch_size, 1, "batch size");
DEFINE_bool(with_accuracy_layer, true,
"Calculate the accuracy while label is in the input");
DEFINE_bool(enable_fp32, true, "Enable FP32 type prediction");
DEFINE_bool(enable_int8, true, "Enable INT8 type prediction");
DEFINE_int32(warmup_batch_size, 100, "batch size for quantization warmup");
......@@ -238,7 +242,11 @@ void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs,
const std::vector<std::string> *feed_names = nullptr,
const int continuous_inuput_index = 0) {
// Set fake_image_data
PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only have single batch of data.");
PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0,
platform::errors::InvalidArgument(
"In SetFakeImageInput, expected test_all_data = false, "
"but now test_all_data=",
FLAGS_test_all_data));
std::vector<std::vector<int64_t>> feed_target_shapes = GetFeedTargetShapes(
dirname, is_combined, model_filename, params_filename);
std::ostringstream os;
......@@ -251,7 +259,13 @@ void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs,
}
LOG(INFO) << os.str();
if (feed_names) {
PADDLE_ENFORCE_EQ(feed_names->size(), feed_target_shapes.size());
PADDLE_ENFORCE_EQ(
feed_names->size(), feed_target_shapes.size(),
platform::errors::InvalidArgument(
"The size of feeds_names and size of "
"feed_target_shapes must be equal, but now feeds_names "
"size is %d and feed_target_shapes size is %d",
feed_names->size(), feed_target_shapes.size()));
}
std::vector<PaddleTensor> input_slots(feed_target_shapes.size());
for (size_t i = 0; i < feed_target_shapes.size(); ++i) {
......@@ -466,12 +480,20 @@ void TestPrediction(const PaddlePredictor::Config *config,
void SummarizeAccuracy(float avg_acc_fp32, float avg_acc_int8,
int compared_idx) {
PADDLE_ENFORCE_LE(compared_idx, 2,
"Compare either top1 accuracy or mAP (top5), the "
"compared_idx is out of range");
PADDLE_ENFORCE_GE(compared_idx, 1,
"Compare either top1 accuracy or mAP (top5), the "
"compared_idx is out of range");
PADDLE_ENFORCE_LE(
compared_idx, 2,
platform::errors::InvalidArgument(
"The compared_idx should be <= 2. But received compared_idx = %d. "
"For top1 accuracy, set compared_idx = 1; For top5 accuracy or mean "
"Average Precision (mAP), set compared_idx = 2.",
compared_idx));
PADDLE_ENFORCE_GE(
compared_idx, 1,
platform::errors::InvalidArgument(
"The compared_idx should be >= 1. But received compared_idx = %d. "
"For top1 accuracy, set compared_idx = 1; For top5 accuracy or mean "
"Average Precision (mAP), set compared_idx = 2.",
compared_idx));
std::string prefix = (compared_idx == 1) ? "top1_accuracy " : "mAP ";
LOG(INFO) << "--- Accuracy summary --- ";
LOG(INFO) << "Accepted " << prefix
......@@ -501,9 +523,10 @@ void SummarizePerformance(float sample_latency_fp32,
float CompareAccuracyOne(
const std::vector<std::vector<PaddleTensor>> &output_slots,
int compared_idx) {
if (output_slots.size() == 0)
throw std::invalid_argument(
"CompareAccuracy: output_slots vector is empty.");
PADDLE_ENFORCE_GT(output_slots.size(), 0,
platform::errors::InvalidArgument(
"The accuracy vector is empty. The accuracy vector "
"size should be bigger than 0"));
float total_accs{0};
......@@ -512,12 +535,19 @@ float CompareAccuracyOne(
case 1:
PADDLE_ENFORCE_GE(
output_slots[i].size(), 2UL,
"To achieve top 1 accuracy, output_slots_quant[i].size()>=2");
platform::errors::InvalidArgument(
"To achieve top 1 accuracy, output_slots size "
"must be bigger than or equal to 2, but now the size is %d",
output_slots[i].size()));
break;
case 2:
PADDLE_ENFORCE_GE(
output_slots[i].size(), 2UL,
"To achieve top 1 accuracy, output_slots_ref[i].size()>=2");
output_slots[i].size(), 3UL,
platform::errors::InvalidArgument(
"To achieve top 5 accuracy or mean Average "
"Precision (mAP), output_slots size must be "
"bigger than or equal to 3, but now the size is %d",
output_slots[i].size()));
break;
default:
throw std::invalid_argument(
......@@ -535,8 +565,6 @@ float CompareAccuracyOne(
*static_cast<float *>(output_slots[i][compared_idx].data.data());
}
CHECK_GT(output_slots.size(), 0);
return total_accs / output_slots.size();
}
......@@ -594,8 +622,14 @@ void CompareNativeAndAnalysis(
std::vector<std::vector<PaddleTensor>> native_outputs, analysis_outputs;
TestOneThreadPrediction(config, inputs, &native_outputs, false);
TestOneThreadPrediction(config, inputs, &analysis_outputs, true);
PADDLE_ENFORCE_GT(native_outputs.size(), 0, "Native output is empty.");
PADDLE_ENFORCE_GT(analysis_outputs.size(), 0, "Analysis output is empty.");
PADDLE_ENFORCE_GT(native_outputs.size(), 0,
platform::errors::InvalidArgument(
"The native outputs is empty. The native outputs "
"vector size must be bigger than 0"));
PADDLE_ENFORCE_GT(analysis_outputs.size(), 0,
platform::errors::InvalidArgument(
"The analysis outputs is empty. The analysis outputs "
"vector size must be bigger than 0"));
CompareResult(analysis_outputs.back(), native_outputs.back());
}
......@@ -603,8 +637,12 @@ void CompareQuantizedAndAnalysis(
const AnalysisConfig *config, const AnalysisConfig *qconfig,
const std::vector<std::vector<PaddleTensor>> &inputs,
const int compared_idx = 1) {
PADDLE_ENFORCE_EQ(inputs[0][0].shape[0], FLAGS_batch_size,
"Input data has to be packed batch by batch.");
PADDLE_ENFORCE_EQ(
inputs[0][0].shape[0], FLAGS_batch_size,
platform::errors::InvalidArgument(
"Input data has to be packed batch by batch. The batchsize is set to "
"%d, but the real input is packed with batchsize = %d",
FLAGS_batch_size, inputs[0][0].shape[0]));
LOG(INFO) << "FP32 & INT8 prediction run: batch_size " << FLAGS_batch_size
<< ", warmup batch size " << FLAGS_warmup_batch_size << ".";
......@@ -634,6 +672,48 @@ void CompareQuantizedAndAnalysis(
CompareAccuracy(quantized_outputs, analysis_outputs, compared_idx);
}
void CompareAnalysisAndAnalysis(
const AnalysisConfig *config1, const AnalysisConfig *config2,
const std::vector<std::vector<PaddleTensor>> &inputs,
const bool with_accuracy_layer = FLAGS_with_accuracy_layer,
const int compared_idx = 1) {
PADDLE_ENFORCE_EQ(
inputs[0][0].shape[0], FLAGS_batch_size,
platform::errors::InvalidArgument(
"Input data has to be packed batch by batch. The batchsize is set to "
"%d, but the real input is packed with batchsize = %d",
FLAGS_batch_size, inputs[0][0].shape[0]));
LOG(INFO) << "FP32 & INT8 prediction run: batch_size " << FLAGS_batch_size
<< ", warmup batch size " << FLAGS_warmup_batch_size << ".";
LOG(INFO) << "--- FP32 prediction start ---";
auto *cfg1 = reinterpret_cast<const PaddlePredictor::Config *>(config1);
PrintConfig(cfg1, true);
std::vector<std::vector<PaddleTensor>> analysis_outputs;
float sample_latency_fp32{-1};
if (FLAGS_enable_fp32) {
TestOneThreadPrediction(cfg1, inputs, &analysis_outputs, true,
VarType::FP32, &sample_latency_fp32);
}
LOG(INFO) << "--- INT8 prediction start ---";
auto *cfg2 = reinterpret_cast<const PaddlePredictor::Config *>(config2);
PrintConfig(cfg2, true);
std::vector<std::vector<PaddleTensor>> int8_outputs;
float sample_latency_int8{-1};
if (FLAGS_enable_int8) {
TestOneThreadPrediction(cfg2, inputs, &int8_outputs, true, VarType::INT8,
&sample_latency_int8);
}
SummarizePerformance(sample_latency_fp32, sample_latency_int8);
if (with_accuracy_layer) {
CompareAccuracy(int8_outputs, analysis_outputs, compared_idx);
}
}
void CompareNativeAndAnalysis(
PaddlePredictor *native_pred, PaddlePredictor *analysis_pred,
const std::vector<std::vector<PaddleTensor>> &inputs) {
......
......@@ -330,6 +330,17 @@ class FakeQAT2MkldnnINT8PerfPass(object):
graph = self._remove_unused_var_nodes(graph)
return graph
def apply_fp32(self, graph):
assert isinstance(graph,
IrGraph), 'graph must be the instance of IrGraph.'
graph = self._gather_scales(graph)
graph = self._remove_fake_ops(graph)
graph = self._dequantize_weights(graph)
graph = self._optimize_fp32_graph(graph)
graph = self._remove_unused_var_nodes(graph)
return graph
def _convert_scale2tensor(self, scale):
tensor = core.LoDTensor()
tensor.set(scale, core.CPUPlace())
......
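For reference, a minimal sketch of how the new apply_fp32 entry point is meant to be used (this mirrors the save_qat_model.py script added later in this change; the model path is a placeholder):

import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.framework import IrGraph
from paddle.fluid.contrib.slim.quantization import FakeQAT2MkldnnINT8PerfPass

place = fluid.CPUPlace()
exe = fluid.Executor(place)
scope = fluid.executor.global_scope()
with fluid.scope_guard(scope):
    [program, feed_names, fetch_targets] = fluid.io.load_inference_model(
        '/path/to/qat_model', exe)  # placeholder path
    qat_pass = FakeQAT2MkldnnINT8PerfPass(_scope=scope, _place=place, _core=core)
    graph = IrGraph(core.Graph(program.desc), for_test=True)
    # apply_fp32 yields an optimized FP32 graph; apply() would yield the INT8 graph
    fp32_program = qat_pass.apply_fp32(graph).to_program()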
......@@ -45,6 +45,12 @@ function(inference_qat2_int8_test target model_dir data_dir test_script use_mkld
--qat2)
endfunction()
function(save_qat_model_test target qat_model_dir fp32_model_save_path int8_model_save_path test_script)
py_test(${target} SRCS ${test_script}
ARGS --qat_model_path ${qat_model_dir}
--fp32_model_save_path ${fp32_model_save_path}
--int8_model_save_path ${int8_model_save_path})
endfunction()
if(WIN32)
list(REMOVE_ITEM TEST_OPS test_light_nas)
......@@ -169,6 +175,13 @@ if(LINUX AND WITH_MKLDNN)
endif()
inference_qat2_int8_test(test_qat2_int8_mobilenetv1_mkldnn ${QAT2_MOBILENETV1_MODEL_DIR}/MobileNet_qat_perf ${DATASET_DIR} ${MKLDNN_QAT_TEST_FILE_PATH} true)
# Save qat2 fp32 model or qat2 int8 model
set(QAT2_INT8_SAVE_PATH "${QAT_DATA_DIR}/ResNet50_qat2_int8")
set(QAT2_FP32_SAVE_PATH "${QAT_DATA_DIR}/ResNet50_qat2_fp32")
set(SAVE_QAT2_MODEL_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/save_qat_model.py")
save_qat_model_test(save_qat2_model_resnet50 ${QAT2_RESNET50_MODEL_DIR}/ResNet50_qat_perf/float ${QAT2_FP32_SAVE_PATH} ${QAT2_INT8_SAVE_PATH} ${SAVE_QAT2_MODEL_SCRIPT} true)
endif()
# Since the test for QAT FP32 & INT8 comparison supports only testing on Linux
......
# copyright (c) 2019 paddlepaddle authors. all rights reserved.
#
# licensed under the apache license, version 2.0 (the "license");
# you may not use this file except in compliance with the license.
# you may obtain a copy of the license at
#
# http://www.apache.org/licenses/license-2.0
#
# unless required by applicable law or agreed to in writing, software
# distributed under the license is distributed on an "as is" basis,
# without warranties or conditions of any kind, either express or implied.
# see the license for the specific language governing permissions and
# limitations under the license.
import unittest
import os
import sys
import argparse
import logging
import struct
import six
import numpy as np
import time
import paddle
import paddle.fluid as fluid
from paddle.fluid.framework import IrGraph
from paddle.fluid.contrib.slim.quantization import FakeQAT2MkldnnINT8PerfPass
from paddle.fluid import core
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument(
'--qat_model_path', type=str, default='', help='A path to a QAT model.')
parser.add_argument(
'--fp32_model_save_path',
type=str,
default='',
help='Saved optimized fp32 model')
parser.add_argument(
'--int8_model_save_path',
type=str,
default='',
help='Saved optimized and quantized INT8 model')
test_args, args = parser.parse_known_args(namespace=unittest)
return test_args, sys.argv[:1] + args
def transform_and_save_model(original_path, save_path, save_type):
place = fluid.CPUPlace()
exe = fluid.Executor(place)
inference_scope = fluid.executor.global_scope()
with fluid.scope_guard(inference_scope):
if os.path.exists(os.path.join(original_path, '__model__')):
[inference_program, feed_target_names,
fetch_targets] = fluid.io.load_inference_model(original_path, exe)
else:
[inference_program, feed_target_names,
fetch_targets] = fluid.io.load_inference_model(original_path, exe,
'model', 'params')
transform_to_mkldnn_int8_pass = FakeQAT2MkldnnINT8PerfPass(
_scope=inference_scope, _place=place, _core=core)
graph = IrGraph(core.Graph(inference_program.desc), for_test=True)
if save_type == 'FP32':
graph = transform_to_mkldnn_int8_pass.apply_fp32(graph)
elif save_type == 'INT8':
graph = transform_to_mkldnn_int8_pass.apply(graph)
inference_program = graph.to_program()
with fluid.scope_guard(inference_scope):
fluid.io.save_inference_model(save_path, feed_target_names,
fetch_targets, exe, inference_program)
print("Success! Transformed QAT_{0} model can be found at {1}\n".format(
save_type, save_path))
if __name__ == '__main__':
global test_args
test_args, remaining_args = parse_args()
if test_args.fp32_model_save_path:
transform_and_save_model(test_args.qat_model_path,
test_args.fp32_model_save_path, 'FP32')
if test_args.int8_model_save_path:
transform_and_save_model(test_args.qat_model_path,
test_args.int8_model_save_path, 'INT8')