Unverified commit c37e0b55 authored by barry-ai, committed by GitHub

[APU] Add model cache (#4456)

Parent cde383dc
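For context: this commit teaches the APU subgraph bridge to serialize the compiled DLA model (plus the precisions and shapes of the origin output tensors) into subgraph_model_cache_dir, and to restore it on later runs instead of rebuilding online. A minimal usage sketch, modeled on the test at the end of this diff; the include path and model directory are assumptions:

#include "lite/api/paddle_api.h"  // assumed public API header of Paddle-Lite

using namespace paddle;

void BuildPredictorWithApuModelCache() {
  lite_api::CxxConfig config;
  config.set_model_dir("./mobilenet_v1_int8");  // hypothetical model path
  config.set_valid_places({lite_api::Place{TARGET(kARM), PRECISION(kFloat)},
                           lite_api::Place{TARGET(kARM), PRECISION(kInt8)},
                           lite_api::Place{TARGET(kAPU), PRECISION(kInt8)}});
  // New behavior in this commit: the APU bridge stores <md5>.dla and
  // <md5>.cfg files in this directory and reloads them on later runs.
  config.set_subgraph_model_cache_dir("./apu_model_cache");
  auto predictor = lite_api::CreatePaddlePredictor(config);
}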
......@@ -89,6 +89,14 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
Context<TargetType::kNPU>::SetSubgraphModelCacheDir(
raw_predictor_->scope(), config.subgraph_model_cache_dir());
#endif
#ifdef LITE_WITH_APU
// Store the model-level configuration into the scope for kernels, and use
// exe_scope to store the execution-level configuration
Context<TargetType::kAPU>::SetSubgraphModelCacheDir(
raw_predictor_->scope(), config.subgraph_model_cache_dir());
#endif
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
Context<TargetType::kHuaweiAscendNPU>::SetHuaweiAscendDeviceID(
config.get_device_id());
......
......@@ -47,6 +47,14 @@ void LightPredictorImpl::Init(const lite_api::MobileConfig& config) {
Context<TargetType::kNPU>::SetSubgraphModelCacheDir(
raw_predictor_->scope(), config.subgraph_model_cache_dir());
#endif
#ifdef LITE_WITH_APU
// Store the model-level configuration into the scope for kernels, and use
// exe_scope to store the execution-level configuration
Context<TargetType::kAPU>::SetSubgraphModelCacheDir(
raw_predictor_->scope(), config.subgraph_model_cache_dir());
#endif
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
Context<TargetType::kHuaweiAscendNPU>::SetHuaweiAscendDeviceID(
config.get_device_id());
......
......@@ -84,10 +84,14 @@ void NeuronAdapter::InitFunctions() {
PADDLE_DLSYM(NeuronModel_addOperation);
PADDLE_DLSYM(NeuronModel_addOperationExtension);
PADDLE_DLSYM(NeuronModel_identifyInputsAndOutputs);
PADDLE_DLSYM(NeuronModel_restoreFromCompiledNetwork);
PADDLE_DLSYM(NeuronCompilation_create);
PADDLE_DLSYM(NeuronCompilation_free);
PADDLE_DLSYM(NeuronCompilation_finish);
PADDLE_DLSYM(NeuronCompilation_setCaching);
PADDLE_DLSYM(NeuronCompilation_storeCompiledNetwork);
PADDLE_DLSYM(NeuronCompilation_createForDevices);
PADDLE_DLSYM(NeuronCompilation_getCompiledNetworkSize);
PADDLE_DLSYM(NeuronExecution_create);
PADDLE_DLSYM(NeuronExecution_free);
PADDLE_DLSYM(NeuronExecution_setInput);
......@@ -179,6 +183,15 @@ int NeuronModel_identifyInputsAndOutputs(NeuronModel* model,
model, inputCount, inputs, outputCount, outputs);
}
int NeuronModel_restoreFromCompiledNetwork(NeuronModel** model,
NeuronCompilation** compilation,
const void* buffer,
const size_t size) {
return paddle::lite::NeuronAdapter::Global()
->NeuronModel_restoreFromCompiledNetwork()(
model, compilation, buffer, size);
}
int NeuronCompilation_create(NeuronModel* model,
NeuronCompilation** compilation) {
return paddle::lite::NeuronAdapter::Global()->NeuronCompilation_create()(
......@@ -195,6 +208,26 @@ int NeuronCompilation_finish(NeuronCompilation* compilation) {
compilation);
}
int NeuronCompilation_setCaching(NeuronCompilation* compilation,
const char* cacheDir,
const uint8_t* token) {
return paddle::lite::NeuronAdapter::Global()->NeuronCompilation_setCaching()(
compilation, cacheDir, token);
}
int NeuronCompilation_storeCompiledNetwork(NeuronCompilation* compilation,
void* buffer,
const size_t size) {
return paddle::lite::NeuronAdapter::Global()
->NeuronCompilation_storeCompiledNetwork()(compilation, buffer, size);
}
int NeuronCompilation_getCompiledNetworkSize(NeuronCompilation* compilation,
size_t* size) {
return paddle::lite::NeuronAdapter::Global()
->NeuronCompilation_getCompiledNetworkSize()(compilation, size);
}
int NeuronCompilation_createForDevices(NeuronModel* model,
const NeuronDevice* const* devices,
uint32_t numDevices,
......
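For reference, each PADDLE_DLSYM invocation above binds one Neuron API symbol into the corresponding <name>_ member of the adapter. The real macro is defined elsewhere in neuron_adapter.cc and is not part of this diff; a plausible sketch, assuming the adapter keeps a dlopen handle in a handle_ member:

#include <dlfcn.h>

// Assumed sketch, not the verbatim macro: resolve the symbol `name` from the
// already-opened Neuron shared library and store it into the member
// `name##_`, which has the matching `name##_Type` function-pointer type.
#define PADDLE_DLSYM(name)                                           \
  do {                                                               \
    name##_ = reinterpret_cast<name##_Type>(dlsym(handle_, #name));  \
    if (name##_ == nullptr) {                                        \
      LOG(WARNING) << "Failed to load the symbol " << #name;         \
    }                                                                \
  } while (false)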
......@@ -52,15 +52,24 @@ class NeuronAdapter final {
const uint32_t *);
using NeuronModel_identifyInputsAndOutputs_Type = int (*)(
NeuronModel *, uint32_t, const uint32_t *, uint32_t, const uint32_t *);
using NeuronModel_restoreFromCompiledNetwork_Type =
int (*)(NeuronModel **, NeuronCompilation **, const void *, const size_t);
using NeuronCompilation_create_Type = int (*)(NeuronModel *,
NeuronCompilation **);
using NeuronCompilation_free_Type = void (*)(NeuronCompilation *);
using NeuronCompilation_finish_Type = int (*)(NeuronCompilation *);
using NeuronCompilation_setCaching_Type = int (*)(NeuronCompilation *,
const char *,
const uint8_t *);
using NeuronCompilation_createForDevices_Type =
int (*)(NeuronModel *,
const NeuronDevice *const *,
uint32_t,
NeuronCompilation **);
using NeuronCompilation_storeCompiledNetwork_Type =
int (*)(NeuronCompilation *, void *, const size_t);
using NeuronCompilation_getCompiledNetworkSize_Type =
int (*)(NeuronCompilation *, size_t *);
using NeuronExecution_create_Type = int (*)(NeuronCompilation *,
NeuronExecution **);
using NeuronExecution_free_Type = void (*)(NeuronExecution *);
......@@ -78,131 +87,202 @@ class NeuronAdapter final {
const char **);
Neuron_getVersion_Type Neuron_getVersion() {
CHECK(Neuron_getVersion_ != nullptr) << "Cannot load Neuron_getVersion!";
CHECK(Neuron_getVersion_ != nullptr) << "Cannot load "
"Neuron_"
"getVersion!";
return Neuron_getVersion_;
}
NeuronModel_restoreFromCompiledNetwork_Type
NeuronModel_restoreFromCompiledNetwork() {
CHECK(NeuronModel_restoreFromCompiledNetwork_ != nullptr)
<< "Cannot load "
"NeuronModel_"
"restoreFromCompil"
"edNetwork!";
return NeuronModel_restoreFromCompiledNetwork_;
}
NeuronModel_create_Type NeuronModel_create() {
CHECK(NeuronModel_create_ != nullptr) << "Cannot load NeuronModel_create!";
CHECK(NeuronModel_create_ != nullptr) << "Cannot load "
"NeuronModel_"
"create!";
return NeuronModel_create_;
}
NeuronModel_free_Type NeuronModel_free() {
CHECK(NeuronModel_free_ != nullptr) << "Cannot load NeuronModel_free!";
CHECK(NeuronModel_free_ != nullptr) << "Cannot load "
"NeuronModel_"
"free!";
return NeuronModel_free_;
}
NeuronModel_finish_Type NeuronModel_finish() {
CHECK(NeuronModel_finish_ != nullptr) << "Cannot load NeuronModel_finish!";
CHECK(NeuronModel_finish_ != nullptr) << "Cannot load "
"NeuronModel_"
"finish!";
return NeuronModel_finish_;
}
NeuronModel_addOperand_Type NeuronModel_addOperand() {
CHECK(NeuronModel_addOperand_ != nullptr)
<< "Cannot load NeuronModel_addOperand!";
CHECK(NeuronModel_addOperand_ != nullptr) << "Cannot load "
"NeuronModel_"
"addOperand!";
return NeuronModel_addOperand_;
}
NeuronModel_setOperandValue_Type NeuronModel_setOperandValue() {
CHECK(NeuronModel_setOperandValue_ != nullptr)
<< "Cannot load NeuronModel_setOperandValue!";
CHECK(NeuronModel_setOperandValue_ != nullptr) << "Cannot load "
"NeuronModel_"
"setOperandValue!";
return NeuronModel_setOperandValue_;
}
NeuronModel_setOperandSymmPerChannelQuantParams_Type
NeuronModel_setOperandSymmPerChannelQuantParams() {
CHECK(NeuronModel_setOperandSymmPerChannelQuantParams_ != nullptr)
<< "Cannot load NeuronModel_setOperandSymmPerChannelQuantParams!";
<< "Cannot load "
"NeuronModel_"
"setOperandSymmPer"
"ChannelQuantParam"
"s!";
return NeuronModel_setOperandSymmPerChannelQuantParams_;
}
NeuronModel_addOperation_Type NeuronModel_addOperation() {
CHECK(NeuronModel_addOperation_ != nullptr)
<< "Cannot load NeuronModel_addOperation!";
CHECK(NeuronModel_addOperation_ != nullptr) << "Cannot load "
"NeuronModel_"
"addOperation!";
return NeuronModel_addOperation_;
}
NeuronModel_addOperationExtension_Type NeuronModel_addOperationExtension() {
CHECK(NeuronModel_addOperationExtension_ != nullptr)
<< "Cannot load NeuronModel_addOperationExtension!";
CHECK(NeuronModel_addOperationExtension_ != nullptr) << "Cannot load "
"NeuronModel_"
"addOperationExten"
"sion!";
return NeuronModel_addOperationExtension_;
}
NeuronModel_identifyInputsAndOutputs_Type
NeuronModel_identifyInputsAndOutputs() {
CHECK(NeuronModel_identifyInputsAndOutputs_ != nullptr)
<< "Cannot load NeuronModel_identifyInputsAndOutputs!";
<< "Cannot load "
"NeuronModel_"
"identifyInputsAnd"
"Outputs!";
return NeuronModel_identifyInputsAndOutputs_;
}
NeuronCompilation_create_Type NeuronCompilation_create() {
CHECK(NeuronCompilation_create_ != nullptr)
<< "Cannot load NeuronCompilation_create!";
CHECK(NeuronCompilation_create_ != nullptr) << "Cannot load "
"NeuronCompilation"
"_create!";
return NeuronCompilation_create_;
}
NeuronCompilation_free_Type NeuronCompilation_free() {
CHECK(NeuronCompilation_free_ != nullptr)
<< "Cannot load NeuronCompilation_free!";
CHECK(NeuronCompilation_free_ != nullptr) << "Cannot load "
"NeuronCompilation"
"_free!";
return NeuronCompilation_free_;
}
NeuronCompilation_finish_Type NeuronCompilation_finish() {
CHECK(NeuronCompilation_finish_ != nullptr)
<< "Cannot load NeuronCompilation_finish!";
CHECK(NeuronCompilation_finish_ != nullptr) << "Cannot load "
"NeuronCompilation"
"_finish!";
return NeuronCompilation_finish_;
}
NeuronCompilation_setCaching_Type NeuronCompilation_setCaching() {
CHECK(NeuronCompilation_setCaching_ != nullptr) << "Cannot load "
"NeuronCompilation"
"_setCaching!";
return NeuronCompilation_setCaching_;
}
NeuronCompilation_createForDevices_Type NeuronCompilation_createForDevices() {
CHECK(NeuronCompilation_createForDevices_ != nullptr)
<< "Cannot load NeuronCompilation_createForDevices!";
CHECK(NeuronCompilation_createForDevices_ != nullptr) << "Cannot load "
"NeuronCompilation"
"_createForDevices"
"!";
return NeuronCompilation_createForDevices_;
}
NeuronCompilation_storeCompiledNetwork_Type
NeuronCompilation_storeCompiledNetwork() {
CHECK(NeuronCompilation_storeCompiledNetwork_ != nullptr)
<< "Cannot load "
"NeuronCompilation"
"_storeCompiledNet"
"work!";
return NeuronCompilation_storeCompiledNetwork_;
}
NeuronCompilation_getCompiledNetworkSize_Type
NeuronCompilation_getCompiledNetworkSize() {
CHECK(NeuronCompilation_getCompiledNetworkSize_ != nullptr)
<< "Cannot load "
"NeuronCompilation"
"_getCompiledNetwo"
"rkSize!";
return NeuronCompilation_getCompiledNetworkSize_;
}
NeuronExecution_create_Type NeuronExecution_create() {
CHECK(NeuronExecution_create_ != nullptr)
<< "Cannot load NeuronExecution_create!";
CHECK(NeuronExecution_create_ != nullptr) << "Cannot load "
"NeuronExecution_"
"create!";
return NeuronExecution_create_;
}
NeuronExecution_free_Type NeuronExecution_free() {
CHECK(NeuronExecution_free_ != nullptr)
<< "Cannot load NeuronExecution_free!";
CHECK(NeuronExecution_free_ != nullptr) << "Cannot load "
"NeuronExecution_"
"free!";
return NeuronExecution_free_;
}
NeuronExecution_setInput_Type NeuronExecution_setInput() {
CHECK(NeuronExecution_setInput_ != nullptr)
<< "Cannot loadcl NeuronExecution_setInput!";
CHECK(NeuronExecution_setInput_ != nullptr) << "Cannot loadcl "
"NeuronExecution_"
"setInput!";
return NeuronExecution_setInput_;
}
NeuronExecution_setOutput_Type NeuronExecution_setOutput() {
CHECK(NeuronExecution_setOutput_ != nullptr)
<< "Cannot load NeuronExecution_setOutput!";
CHECK(NeuronExecution_setOutput_ != nullptr) << "Cannot load "
"NeuronExecution_"
"setOutput!";
return NeuronExecution_setOutput_;
}
NeuronExecution_compute_Type NeuronExecution_compute() {
CHECK(NeuronExecution_compute_ != nullptr)
<< "Cannot load NeuronExecution_compute!";
CHECK(NeuronExecution_compute_ != nullptr) << "Cannot load "
"NeuronExecution_"
"compute!";
return NeuronExecution_compute_;
}
Neuron_getDeviceCount_Type Neuron_getDeviceCount() {
CHECK(Neuron_getDeviceCount_ != nullptr)
<< "Cannot load Neuron_getDeviceCount!";
CHECK(Neuron_getDeviceCount_ != nullptr) << "Cannot load "
"Neuron_"
"getDeviceCount!";
return Neuron_getDeviceCount_;
}
Neuron_getDevice_Type Neuron_getDevice() {
CHECK(Neuron_getDevice_ != nullptr) << "Cannot load Neuron_getDevice!";
CHECK(Neuron_getDevice_ != nullptr) << "Cannot load "
"Neuron_"
"getDevice!";
return Neuron_getDevice_;
}
NeuronDevice_getName_Type NeuronDevice_getName() {
CHECK(NeuronDevice_getName_ != nullptr)
<< "Cannot load NeuronDevice_getName!";
CHECK(NeuronDevice_getName_ != nullptr) << "Cannot load "
"NeuronDevice_"
"getName!";
return NeuronDevice_getName_;
}
......@@ -226,11 +306,18 @@ class NeuronAdapter final {
nullptr};
NeuronModel_identifyInputsAndOutputs_Type
NeuronModel_identifyInputsAndOutputs_{nullptr};
NeuronModel_restoreFromCompiledNetwork_Type
NeuronModel_restoreFromCompiledNetwork_{nullptr};
NeuronCompilation_create_Type NeuronCompilation_create_{nullptr};
NeuronCompilation_free_Type NeuronCompilation_free_{nullptr};
NeuronCompilation_finish_Type NeuronCompilation_finish_{nullptr};
NeuronCompilation_setCaching_Type NeuronCompilation_setCaching_{nullptr};
NeuronCompilation_createForDevices_Type NeuronCompilation_createForDevices_{
nullptr};
NeuronCompilation_storeCompiledNetwork_Type
NeuronCompilation_storeCompiledNetwork_{nullptr};
NeuronCompilation_getCompiledNetworkSize_Type
NeuronCompilation_getCompiledNetworkSize_{nullptr};
NeuronExecution_create_Type NeuronExecution_create_{nullptr};
NeuronExecution_free_Type NeuronExecution_free_{nullptr};
NeuronExecution_setInput_Type NeuronExecution_setInput_{nullptr};
......
......@@ -144,6 +144,21 @@ class Context<TargetType::kAPU> {
APUContext& operator=(const APUContext& ctx) {}
std::string name() const { return "APUContext"; }
static void SetSubgraphModelCacheDir(Scope* scope,
std::string subgraph_model_cache_dir) {
auto var = scope->Var("SUBGRAPH_MODEL_CACHE_DIR");
CHECK(var);
auto data = var->GetMutable<std::string>();
CHECK(data);
*data = subgraph_model_cache_dir;
}
static std::string SubgraphModelCacheDir(Scope* scope) {
auto var = scope->FindVar("SUBGRAPH_MODEL_CACHE_DIR");
if (!var) return "";
return var->Get<std::string>();
}
};
#endif
......
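The scope variable SUBGRAPH_MODEL_CACHE_DIR is the hand-off point between the API layer and the subgraph kernel: the Init() methods in the first two hunks store the directory, and SubgraphEngine::BuildDeviceProgram() later in this diff reads it back (the kernel passes its exec_scope_, which resolves the variable through the root scope). A minimal round-trip sketch using only the two methods added above:

// API layer, at predictor initialization (the path is a placeholder):
Context<TargetType::kAPU>::SetSubgraphModelCacheDir(scope, "/data/apu_cache");

// Subgraph kernel, when building the device program; returns an empty
// string when no cache directory was configured:
std::string dir = Context<TargetType::kAPU>::SubgraphModelCacheDir(scope);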
File mode changed from 100755 to 100644
......@@ -53,7 +53,7 @@ int ConvTransposeConverter(void *ctx, OpLite *op, KernelBase *kernel) {
auto strides = op_info->GetAttr<std::vector<int>>("strides");
CHECK_EQ(strides.size(), 2L);
auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
std::vector<int> paddings = op_info->GetAttr<std::vector<int>>("paddings");
auto groups = op_info->GetAttr<int>("groups");
if (groups > 1) {
LOG(WARNING) << "[NPU] only support groups == 1";
......@@ -70,7 +70,7 @@ int ConvTransposeConverter(void *ctx, OpLite *op, KernelBase *kernel) {
auto fuse_relu =
op_info->HasAttr("fuse_relu") && op_info->GetAttr<bool>("fuse_relu");
auto dilations = op_info->GetAttr<std::vector<int>>("dilations");
std::vector<int> dilations = op_info->GetAttr<std::vector<int>>("dilations");
CHECK_EQ(dilations.size(), 2L);
std::string padding_algorithm =
op_info->HasAttr("padding_algorithm")
......
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
......@@ -22,17 +22,102 @@
#include "lite/kernels/apu/bridges/graph.h"
#include "lite/kernels/apu/bridges/paddle_use_bridges.h"
#include "lite/kernels/apu/bridges/utility.h"
#include "lite/utils/io.h"
#include "lite/utils/md5.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace apu {
bool SubgraphEngine::BuildDeviceProgram() {
if (!origin_program_) {
BuildOriginProgram();
// Generate the model name using an MD5 hash based on:
// 1. the sorted variable input names
// 2. the shapes of the origin input tensors
// 3. the sorted variable output names
std::string DeviceProgram::GenerateModelName(
const std::vector<std::string>& input_names,
const std::vector<std::string>& output_names,
const std::vector<std::vector<int64_t>>& origin_idims) {
std::ostringstream os;
CHECK_EQ(input_names.size(), origin_idims.size());
for (int i = 0; i < input_names.size(); i++) {
os << input_names[i];
for (auto dim : origin_idims[i]) {
os << dim;
}
}
for (auto output_name : output_names) {
os << output_name;
}
return MD5(os.str());
}
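A worked example with hypothetical names and shapes: a single input "x" of shape {1, 3, 224, 224} and a single output "y" concatenate (dims are appended without separators) into the key string "x13224224y", so the cache files become MD5("x13224224y").dla and MD5("x13224224y").cfg:

DeviceProgram program;
// name == MD5("x13224224y"); the same names and shapes always map to the
// same cache files, and any shape change forces an online rebuild.
std::string name =
    program.GenerateModelName({"x"}, {"y"}, {{1, 3, 224, 224}});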
// Deserialize the generated model
bool DeviceProgram::LoadFromCacheFile(
const std::vector<std::string>& input_names,
const std::vector<std::string>& output_names,
const std::vector<std::vector<int64_t>>& origin_idims,
const std::string& model_cache_dir) {
int status;
// Generate the model name if not initialized
if (model_name_.empty()) {
model_name_ = GenerateModelName(input_names, output_names, origin_idims);
}
// Load from the cached model file
auto model_path = model_cache_dir + "/" + model_name_ + ".dla";
VLOG(3) << "[APU] Load model from " << model_path;
std::vector<char> compilationBuffer;
if (!ReadFile(model_path, &compilationBuffer)) {
LOG(WARNING) << "[NPU] Open " << model_path << " for reading failed!";
return false;
}
model_ = nullptr;
compilation_ = nullptr;
status = NeuronModel_restoreFromCompiledNetwork(
&model_, &compilation_, &compilationBuffer[0], compilationBuffer.size());
if (status != NEURON_NO_ERROR) {
LOG(WARNING) << "[APU] Load model failed!" << compilationBuffer.size();
return false;
}
VLOG(3) << "[APU] Complete Load model!";
// Deserialize the precisions and shapes of the origin output tensors from
// the cached configuration file
auto config_path = model_cache_dir + "/" + model_name_ + ".cfg";
VLOG(3) << "[APU] Load configuration from " << config_path;
std::vector<char> config_buffer;
if (!ReadFile(config_path, &config_buffer)) {
LOG(WARNING) << "[APU] read from " << config_path << " failed!";
return false;
}
std::string str(config_buffer.begin(), config_buffer.end());
// Parse the precisions and shapes of the output tensors
auto output_options = Split<std::string>(str, ";");
CHECK_EQ(output_options.size(), output_names.size());
origin_otypes_.resize(output_names.size());
origin_odims_.resize(output_names.size());
for (int i = 0; i < output_names.size(); i++) {
auto items = Split<std::string>(output_options[i], ":");
CHECK_EQ(items.size(), 2); // precision and shapes
origin_otypes_[i] = static_cast<PrecisionType>(std::stoi(items[0]));
origin_odims_[i] = Split<int64_t>(items[1], ",");
}
return true;
}
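The .cfg companion file is a flat string holding one <precision>:<dim0>,<dim1>,...,; record per output tensor, matching the serialization code near the end of BuildGraphAndCacheToFile below. For example, a single output of shape {1, 1000} whose PrecisionType enum value is 2 (the numeric value is an assumption for illustration) round-trips as:

// Cached configuration string for one output with shape {1, 1000}:
std::string record = "2:1,1000,;";
auto items = Split<std::string>(Split<std::string>(record, ";")[0], ":");
PrecisionType type = static_cast<PrecisionType>(std::stoi(items[0]));  // 2
std::vector<int64_t> dims = Split<int64_t>(items[1], ",");  // {1, 1000}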
bool DeviceProgram::BuildGraphAndCacheToFile(
RuntimeProgram* origin_program,
const std::vector<std::string>& input_names,
const std::vector<std::string>& output_names,
const std::vector<std::vector<int64_t>>& origin_idims,
const std::vector<Tensor*>& origin_itensors,
const std::vector<Tensor*>& origin_otensors,
const std::string& model_cache_dir) {
auto GetCurrentUS = []() -> double {
struct timeval time;
gettimeofday(&time, NULL);
......@@ -53,13 +138,14 @@ bool SubgraphEngine::BuildDeviceProgram() {
return false;
}
graph.set_model(model_);
graph.set_input_names(input_names_);
graph.set_output_names(output_names_);
graph.set_input_names(input_names);
graph.set_output_names(output_names);
// Convert all of the ops with their input vars and weights, and add them
// into the APU NIR graph
const auto& bridges = subgraph::SubgraphBridgeRegistry::Instance();
const auto& insts = origin_program_->instructions(kRootBlockIdx);
const auto& insts = origin_program->instructions(kRootBlockIdx);
for (auto& inst : insts) {
auto op = const_cast<OpLite*>(inst.op());
CHECK(op);
......@@ -82,25 +168,25 @@ bool SubgraphEngine::BuildDeviceProgram() {
// Get the index of input tensors
std::vector<uint32_t> input_indices;
for (int i = 0; i < input_names_.size(); i++) {
CHECK(graph.Has(input_names_[i])) << "[APU] Failed to find input node "
<< input_names_[i];
auto index = graph.Get(input_names_[i])->index();
for (int i = 0; i < input_names.size(); i++) {
CHECK(graph.Has(input_names[i])) << "[APU] Failed to find input node "
<< input_names[i];
auto index = graph.Get(input_names[i])->index();
input_indices.push_back(index);
VLOG(3) << "[APU] Input[" << i << "] name " << input_names_[i] << " dims "
<< origin_itensors_[i]->dims() << " index " << index;
VLOG(3) << "[APU] Input[" << i << "] name " << input_names[i] << " dims "
<< origin_itensors[i]->dims() << " index " << index;
}
// Get the index of output tensors
std::vector<uint32_t> output_indices;
for (int i = 0; i < output_names_.size(); i++) {
CHECK(graph.Has(output_names_[i])) << "[APU] Failed to find output node "
<< output_names_[i];
origin_otensors_[i]->mutable_data<int8_t>();
auto index = graph.Get(output_names_[i])->index();
for (int i = 0; i < output_names.size(); i++) {
CHECK(graph.Has(output_names[i])) << "[APU] Failed to find output node "
<< output_names[i];
origin_otensors[i]->mutable_data<int8_t>();
auto index = graph.Get(output_names[i])->index();
output_indices.push_back(index);
VLOG(3) << "[APU] Output[" << i << "] name " << output_names_[i] << " dims "
<< origin_otensors_[i]->dims() << " index " << index;
VLOG(3) << "[APU] Output[" << i << "] name " << output_names[i] << " dims "
<< origin_otensors[i]->dims() << " index " << index;
}
// Identify the input and output tensors of the neuron model
......@@ -114,7 +200,6 @@ bool SubgraphEngine::BuildDeviceProgram() {
LOG(WARNING) << "[APU] Fail to create NIR model:" << neuron_errCode;
return false;
}
VLOG(3) << "[APU] APU NIR model created!";
VLOG(1) << "[APU] APU NIR model created, Create cost "
<< GetCurrentUS() - start_time << " us";
......@@ -127,9 +212,109 @@ bool SubgraphEngine::BuildDeviceProgram() {
}
VLOG(1) << "[APU] APU DLA model created, Build cost "
<< GetCurrentUS() - start_time << " us";
CHECK_EQ(origin_otensors.size(), output_names.size());
origin_otypes_.resize(output_names.size());
origin_odims_.resize(output_names.size());
for (size_t i = 0; i < output_names.size(); i++) {
origin_otypes_[i] = origin_otensors[i]->precision();
origin_odims_[i] = origin_otensors[i]->dims().Vectorize();
}
if (!model_cache_dir.empty()) {
// Save the generated model to file
auto model_path = model_cache_dir + "/" + model_name_ + ".dla";
VLOG(3) << "[APU] Save model to " << model_path;
size_t compilationSize;
status = NeuronCompilation_getCompiledNetworkSize(compilation_,
&compilationSize);
if (status == NEURON_NO_ERROR) {
// Serialize the DLA
std::vector<char> model_buffer;
model_buffer.resize(compilationSize);
status = NeuronCompilation_storeCompiledNetwork(
compilation_, &model_buffer[0], compilationSize);
if (status != NEURON_NO_ERROR) {
LOG(WARNING) << "[APU] Serialization DLA failed!";
}
VLOG(3) << "[APU] Export the model to " << model_path;
if (!WriteFile(model_path, model_buffer)) {
LOG(WARNING) << "[APU] Open " << model_path << " for writting failed!";
}
}
// Serialize the precisions and shapes of the origin output tensors into the
// configuration file
std::ostringstream os;
for (int i = 0; i < output_names.size(); i++) {
os << static_cast<int32_t>(origin_otypes_[i]) << ":";
for (auto dim : origin_odims_[i]) {
os << dim << ",";
}
os << ";";
}
auto str = os.str();
std::vector<char> config_buffer(str.begin(), str.end());
auto config_path = model_cache_dir + "/" + model_name_ + ".cfg";
VLOG(3) << "[APU] Save configuration to " << config_path;
if (!WriteFile(config_path, config_buffer)) {
LOG(WARNING) << "[APU] Open " << config_path << " for writting failed!";
}
}
return true;
}
bool SubgraphEngine::BuildDeviceProgram() {
// Check if the cached device program exists
if (!device_programs_.count(origin_idims_)) {
auto device_program = std::make_shared<DeviceProgram>();
// Obtain the model cache dir from the APU context of the subgraph op
auto model_cache_dir =
ctx_->As<APUContext>().SubgraphModelCacheDir(exec_scope_);
VLOG(3) << "[APU] Getting subgraph_model_cache_dir: " << model_cache_dir;
// Load the cached model and configuration file if they exist
if (model_cache_dir.empty() ||
!device_program->LoadFromCacheFile(
input_names_, output_names_, origin_idims_, model_cache_dir)) {
// Build the model online: convert the Paddle ops to NIR nodes, build the
// MTK NIR graph, and compile the MTK NIR graph to a DLA
if (!origin_program_) {
BuildOriginProgram();
}
CHECK(origin_program_) << "[APU] The origin program is not initialized!";
CHECK_GT(origin_program_->instructions().size(), 0)
<< "[APU] No instructions found in the origin program!";
if (!device_program->BuildGraphAndCacheToFile(origin_program_.get(),
input_names_,
output_names_,
origin_idims_,
origin_itensors_,
origin_otensors_,
model_cache_dir)) {
return false;
}
}
if (device_program->model_ == nullptr) {
LOG(WARNING) << "dla create fail!";
return false;
}
device_programs_[origin_idims_] = device_program;
}
// Restore the shapes of the origin output tensors and reallocate their buffers
auto device_program = device_programs_[origin_idims_];
CHECK(device_program && device_program->model_);
for (int i = 0; i < output_names_.size(); i++) {
origin_otensors_[i]->Resize(device_program->origin_odims_[i]);
origin_otensors_[i]->mutable_data<int8_t>();
VLOG(3) << "[APU] Output[" << i << "] name " << output_names_[i] << " dims "
<< origin_otensors_[i]->dims() << " memory_size "
<< origin_otensors_[i]->memory_size();
}
return true;
}
bool SubgraphEngine::LaunchDeviceProgram() {
auto GetCurrentUS = []() -> double {
struct timeval time;
......@@ -137,9 +322,17 @@ bool SubgraphEngine::LaunchDeviceProgram() {
return 1e+6 * time.tv_sec + time.tv_usec;
};
if (device_programs_.count(origin_idims_) == 0 ||
device_programs_[origin_idims_]->model_ == nullptr) {
return LaunchOriginProgram();
}
auto device_program = device_programs_[origin_idims_];
auto start_time = GetCurrentUS();
NeuronExecution* run = NULL;
int neuron_errCode = NeuronExecution_create(compilation_, &run);
int neuron_errCode =
NeuronExecution_create(device_program->compilation_, &run);
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "[APU] Build APU runtime failed!";
return false;
......@@ -187,11 +380,13 @@ bool SubgraphEngine::LaunchDeviceProgram() {
}
SubgraphEngine::~SubgraphEngine() {
if (compilation_) {
NeuronCompilation_free(compilation_);
}
if (model_) {
NeuronModel_free(model_);
for (auto& device_program : device_programs_) {
if (device_program.second->compilation_) {
NeuronCompilation_free(device_program.second->compilation_);
}
if (device_program.second->model_) {
NeuronModel_free(device_program.second->model_);
}
}
}
......
......@@ -14,6 +14,7 @@
#pragma once
#include <map>
#include <memory>
#include <string>
#include <vector>
......@@ -27,14 +28,43 @@ namespace lite {
namespace kernels {
namespace apu {
class DeviceProgram {
public:
DeviceProgram() {}
~DeviceProgram() {}
std::string GenerateModelName(
const std::vector<std::string>& input_names,
const std::vector<std::string>& output_names,
const std::vector<std::vector<int64_t>>& origin_idims);
bool LoadFromCacheFile(const std::vector<std::string>& input_names,
const std::vector<std::string>& output_names,
const std::vector<std::vector<int64_t>>& origin_idims,
const std::string& model_cache_dir);
bool BuildGraphAndCacheToFile(
RuntimeProgram* origin_program,
const std::vector<std::string>& input_names,
const std::vector<std::string>& output_names,
const std::vector<std::vector<int64_t>>& origin_idims,
const std::vector<Tensor*>& origin_itensors,
const std::vector<Tensor*>& origin_otensors,
const std::string& model_cache_dir);
public:
std::string model_name_{""};
std::vector<std::vector<int64_t>> origin_odims_;
std::vector<PrecisionType> origin_otypes_;
NeuronModel* model_;
NeuronCompilation* compilation_;
};
class SubgraphEngine : public subgraph::SubgraphEngineBase {
public:
SubgraphEngine(KernelContext *ctx,
SubgraphEngine(KernelContext* ctx,
int block_idx,
const std::shared_ptr<const cpp::ProgramDesc> &program_desc,
Scope *exec_scope,
const std::vector<std::string> &input_names,
const std::vector<std::string> &output_names)
const std::shared_ptr<const cpp::ProgramDesc>& program_desc,
Scope* exec_scope,
const std::vector<std::string>& input_names,
const std::vector<std::string>& output_names)
: subgraph::SubgraphEngineBase(ctx,
block_idx,
program_desc,
......@@ -48,8 +78,8 @@ class SubgraphEngine : public subgraph::SubgraphEngineBase {
bool BuildDeviceProgram() override;
bool LaunchDeviceProgram() override;
NeuronModel *model_;
NeuronCompilation *compilation_;
std::map<std::vector<std::vector<int64_t>>, std::shared_ptr<DeviceProgram>>
device_programs_;
};
class SubgraphCompute
......
......@@ -33,11 +33,13 @@ namespace paddle {
namespace lite {
TEST(MobileNetV1, test_mobilenetv1_int8_mediatek_apu) {
std::string subgraph_model_cache_dir = FLAGS_model_dir;
lite_api::CxxConfig config;
config.set_model_dir(FLAGS_model_dir);
config.set_valid_places({lite_api::Place{TARGET(kARM), PRECISION(kFloat)},
lite_api::Place{TARGET(kARM), PRECISION(kInt8)},
lite_api::Place{TARGET(kAPU), PRECISION(kInt8)}});
config.set_subgraph_model_cache_dir(subgraph_model_cache_dir);
auto predictor = lite_api::CreatePaddlePredictor(config);
std::string raw_data_dir = FLAGS_data_dir + std::string("/raw_data");
......