diff --git a/lite/api/cxx_api_impl.cc b/lite/api/cxx_api_impl.cc
index 0b5b9ad94c47a3d97492cd5b91618b184c9ef122..e147fc7ebc18d8fa213503d75a4fe68be06f3293 100644
--- a/lite/api/cxx_api_impl.cc
+++ b/lite/api/cxx_api_impl.cc
@@ -89,6 +89,14 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
   Context<TargetType::kNPU>::SetSubgraphModelCacheDir(
       raw_predictor_->scope(), config.subgraph_model_cache_dir());
 #endif
+
+#ifdef LITE_WITH_APU
+  // Store the model-level configuration into scope for kernels, and use
+  // exe_scope to store the execution-level configuration
+  Context<TargetType::kAPU>::SetSubgraphModelCacheDir(
+      raw_predictor_->scope(), config.subgraph_model_cache_dir());
+#endif
+
 #ifdef LITE_WITH_HUAWEI_ASCEND_NPU
   Context<TargetType::kHuaweiAscendNPU>::SetHuaweiAscendDeviceID(
       config.get_device_id());
diff --git a/lite/api/light_api_impl.cc b/lite/api/light_api_impl.cc
index 3c5be7b9cdd340fe0fe82c589706c77875de0030..7f90a069544f0a8b4a7623ce35b34fccd46bb886 100644
--- a/lite/api/light_api_impl.cc
+++ b/lite/api/light_api_impl.cc
@@ -47,6 +47,14 @@ void LightPredictorImpl::Init(const lite_api::MobileConfig& config) {
   Context<TargetType::kNPU>::SetSubgraphModelCacheDir(
       raw_predictor_->scope(), config.subgraph_model_cache_dir());
 #endif
+
+#ifdef LITE_WITH_APU
+  // Store the model-level configuration into scope for kernels, and use
+  // exe_scope to store the execution-level configuration
+  Context<TargetType::kAPU>::SetSubgraphModelCacheDir(
+      raw_predictor_->scope(), config.subgraph_model_cache_dir());
+#endif
+
 #ifdef LITE_WITH_HUAWEI_ASCEND_NPU
   Context<TargetType::kHuaweiAscendNPU>::SetHuaweiAscendDeviceID(
       config.get_device_id());
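With both predictor entry points forwarding the cache directory into the scope, enabling APU model caching is a one-line configuration change. A minimal usage sketch follows; the model and cache paths are illustrative placeholders, not part of this patch:

```cpp
#include "paddle_api.h"  // Paddle-Lite C++ API

// Minimal sketch: point the subgraph model cache at a writable directory.
void CreatePredictorWithApuCache() {
  paddle::lite_api::CxxConfig config;
  config.set_model_dir("/data/local/tmp/mobilenet_v1_int8");
  config.set_valid_places(
      {paddle::lite_api::Place{TARGET(kARM), PRECISION(kInt8)},
       paddle::lite_api::Place{TARGET(kAPU), PRECISION(kInt8)}});
  // Init() above stores this directory into the scope; the APU subgraph
  // kernel reads it back when it builds or restores the compiled network.
  config.set_subgraph_model_cache_dir("/data/local/tmp/apu_cache");
  auto predictor = paddle::lite_api::CreatePaddlePredictor(config);
}
```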
diff --git a/lite/backends/apu/neuron_adapter.cc b/lite/backends/apu/neuron_adapter.cc
index ff08507504b8bd7e5342c5705afb17550f37469e..bd6e41aeb19f0f71122e37e5e695f8b12467eebd 100644
--- a/lite/backends/apu/neuron_adapter.cc
+++ b/lite/backends/apu/neuron_adapter.cc
@@ -84,10 +84,14 @@ void NeuronAdapter::InitFunctions() {
   PADDLE_DLSYM(NeuronModel_addOperation);
   PADDLE_DLSYM(NeuronModel_addOperationExtension);
   PADDLE_DLSYM(NeuronModel_identifyInputsAndOutputs);
+  PADDLE_DLSYM(NeuronModel_restoreFromCompiledNetwork);
   PADDLE_DLSYM(NeuronCompilation_create);
   PADDLE_DLSYM(NeuronCompilation_free);
   PADDLE_DLSYM(NeuronCompilation_finish);
+  PADDLE_DLSYM(NeuronCompilation_setCaching);
+  PADDLE_DLSYM(NeuronCompilation_storeCompiledNetwork);
   PADDLE_DLSYM(NeuronCompilation_createForDevices);
+  PADDLE_DLSYM(NeuronCompilation_getCompiledNetworkSize);
   PADDLE_DLSYM(NeuronExecution_create);
   PADDLE_DLSYM(NeuronExecution_free);
   PADDLE_DLSYM(NeuronExecution_setInput);
@@ -179,6 +183,15 @@ int NeuronModel_identifyInputsAndOutputs(NeuronModel* model,
       model, inputCount, inputs, outputCount, outputs);
 }
 
+int NeuronModel_restoreFromCompiledNetwork(NeuronModel** model,
+                                           NeuronCompilation** compilation,
+                                           const void* buffer,
+                                           const size_t size) {
+  return paddle::lite::NeuronAdapter::Global()
+      ->NeuronModel_restoreFromCompiledNetwork()(
+          model, compilation, buffer, size);
+}
+
 int NeuronCompilation_create(NeuronModel* model,
                              NeuronCompilation** compilation) {
   return paddle::lite::NeuronAdapter::Global()->NeuronCompilation_create()(
@@ -195,6 +208,26 @@ int NeuronCompilation_finish(NeuronCompilation* compilation) {
       compilation);
 }
 
+int NeuronCompilation_setCaching(NeuronCompilation* compilation,
+                                 const char* cacheDir,
+                                 const uint8_t* token) {
+  return paddle::lite::NeuronAdapter::Global()->NeuronCompilation_setCaching()(
+      compilation, cacheDir, token);
+}
+
+int NeuronCompilation_storeCompiledNetwork(NeuronCompilation* compilation,
+                                           void* buffer,
+                                           const size_t size) {
+  return paddle::lite::NeuronAdapter::Global()
+      ->NeuronCompilation_storeCompiledNetwork()(compilation, buffer, size);
+}
+
+int NeuronCompilation_getCompiledNetworkSize(NeuronCompilation* compilation,
+                                             size_t* size) {
+  return paddle::lite::NeuronAdapter::Global()
+      ->NeuronCompilation_getCompiledNetworkSize()(compilation, size);
+}
+
 int NeuronCompilation_createForDevices(NeuronModel* model,
                                        const NeuronDevice* const* devices,
                                        uint32_t numDevices,
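The three new cache entry points are resolved the same way as the existing ones: PADDLE_DLSYM looks each symbol up once in the Neuron runtime library and caches it in a typed member pointer, and the free functions above dispatch through the typed accessor. The macro body is not part of this diff; a plausible shape, shown only for orientation (the real definition lives earlier in neuron_adapter.cc and may differ in detail, and `handle_` is the assumed dlopen() handle member):

```cpp
// Assumed sketch of PADDLE_DLSYM: resolve the symbol from the already
// dlopen()'d Neuron library and stash it in the matching member pointer.
#define PADDLE_DLSYM(func)                                          \
  do {                                                              \
    func##_ = reinterpret_cast<func##_Type>(dlsym(handle_, #func)); \
    if (func##_ == nullptr) {                                       \
      LOG(WARNING) << "Cannot find " << #func;                      \
    }                                                               \
  } while (false)
```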
diff --git a/lite/backends/apu/neuron_adapter.h b/lite/backends/apu/neuron_adapter.h
index c1b9669a98626699b126913dcc840906de4de8e0..8d57075d6c1f5e9d865b2367d119b665f4182b6e 100644
--- a/lite/backends/apu/neuron_adapter.h
+++ b/lite/backends/apu/neuron_adapter.h
@@ -52,15 +52,24 @@ class NeuronAdapter final {
                                                 const uint32_t *);
   using NeuronModel_identifyInputsAndOutputs_Type = int (*)(
       NeuronModel *, uint32_t, const uint32_t *, uint32_t, const uint32_t *);
+  using NeuronModel_restoreFromCompiledNetwork_Type =
+      int (*)(NeuronModel **, NeuronCompilation **, const void *, const size_t);
   using NeuronCompilation_create_Type = int (*)(NeuronModel *,
                                                 NeuronCompilation **);
   using NeuronCompilation_free_Type = void (*)(NeuronCompilation *);
   using NeuronCompilation_finish_Type = int (*)(NeuronCompilation *);
+  using NeuronCompilation_setCaching_Type = int (*)(NeuronCompilation *,
+                                                    const char *,
+                                                    const uint8_t *);
   using NeuronCompilation_createForDevices_Type =
       int (*)(NeuronModel *,
               const NeuronDevice *const *,
               uint32_t,
               NeuronCompilation **);
+  using NeuronCompilation_storeCompiledNetwork_Type =
+      int (*)(NeuronCompilation *, void *, const size_t);
+  using NeuronCompilation_getCompiledNetworkSize_Type =
+      int (*)(NeuronCompilation *, size_t *);
   using NeuronExecution_create_Type = int (*)(NeuronCompilation *,
                                               NeuronExecution **);
   using NeuronExecution_free_Type = void (*)(NeuronExecution *);
@@ -78,131 +87,202 @@ class NeuronAdapter final {
                                            const char **);
 
   Neuron_getVersion_Type Neuron_getVersion() {
-    CHECK(Neuron_getVersion_ != nullptr) << "Cannot load Neuron_getVersion!";
+    CHECK(Neuron_getVersion_ != nullptr) << "Cannot load "
+                                            "Neuron_"
+                                            "getVersion!";
     return Neuron_getVersion_;
   }
 
+  NeuronModel_restoreFromCompiledNetwork_Type
+  NeuronModel_restoreFromCompiledNetwork() {
+    CHECK(NeuronModel_restoreFromCompiledNetwork_ != nullptr)
+        << "Cannot load "
+           "NeuronModel_"
+           "restoreFromCompil"
+           "edNetwork!";
+    return NeuronModel_restoreFromCompiledNetwork_;
+  }
+
   NeuronModel_create_Type NeuronModel_create() {
-    CHECK(NeuronModel_create_ != nullptr) << "Cannot load NeuronModel_create!";
+    CHECK(NeuronModel_create_ != nullptr) << "Cannot load "
+                                             "NeuronModel_"
+                                             "create!";
     return NeuronModel_create_;
   }
 
   NeuronModel_free_Type NeuronModel_free() {
-    CHECK(NeuronModel_free_ != nullptr) << "Cannot load NeuronModel_free!";
+    CHECK(NeuronModel_free_ != nullptr) << "Cannot load "
+                                           "NeuronModel_"
+                                           "free!";
     return NeuronModel_free_;
   }
 
   NeuronModel_finish_Type NeuronModel_finish() {
-    CHECK(NeuronModel_finish_ != nullptr) << "Cannot load NeuronModel_finish!";
+    CHECK(NeuronModel_finish_ != nullptr) << "Cannot load "
+                                             "NeuronModel_"
+                                             "finish!";
     return NeuronModel_finish_;
   }
 
   NeuronModel_addOperand_Type NeuronModel_addOperand() {
-    CHECK(NeuronModel_addOperand_ != nullptr)
-        << "Cannot load NeuronModel_addOperand!";
+    CHECK(NeuronModel_addOperand_ != nullptr) << "Cannot load "
+                                                 "NeuronModel_"
+                                                 "addOperand!";
     return NeuronModel_addOperand_;
   }
 
   NeuronModel_setOperandValue_Type NeuronModel_setOperandValue() {
-    CHECK(NeuronModel_setOperandValue_ != nullptr)
-        << "Cannot load NeuronModel_setOperandValue!";
+    CHECK(NeuronModel_setOperandValue_ != nullptr) << "Cannot load "
+                                                      "NeuronModel_"
+                                                      "setOperandValue!";
     return NeuronModel_setOperandValue_;
   }
 
   NeuronModel_setOperandSymmPerChannelQuantParams_Type
   NeuronModel_setOperandSymmPerChannelQuantParams() {
     CHECK(NeuronModel_setOperandSymmPerChannelQuantParams_ != nullptr)
-        << "Cannot load NeuronModel_setOperandSymmPerChannelQuantParams!";
+        << "Cannot load "
+           "NeuronModel_"
+           "setOperandSymmPer"
+           "ChannelQuantParam"
+           "s!";
     return NeuronModel_setOperandSymmPerChannelQuantParams_;
   }
 
   NeuronModel_addOperation_Type NeuronModel_addOperation() {
-    CHECK(NeuronModel_addOperation_ != nullptr)
-        << "Cannot load NeuronModel_addOperation!";
+    CHECK(NeuronModel_addOperation_ != nullptr) << "Cannot load "
+                                                   "NeuronModel_"
+                                                   "addOperation!";
     return NeuronModel_addOperation_;
   }
 
   NeuronModel_addOperationExtension_Type NeuronModel_addOperationExtension() {
-    CHECK(NeuronModel_addOperationExtension_ != nullptr)
-        << "Cannot load NeuronModel_addOperationExtension!";
+    CHECK(NeuronModel_addOperationExtension_ != nullptr) << "Cannot load "
+                                                            "NeuronModel_"
+                                                            "addOperationExten"
+                                                            "sion!";
     return NeuronModel_addOperationExtension_;
   }
 
   NeuronModel_identifyInputsAndOutputs_Type
   NeuronModel_identifyInputsAndOutputs() {
     CHECK(NeuronModel_identifyInputsAndOutputs_ != nullptr)
-        << "Cannot load NeuronModel_identifyInputsAndOutputs!";
+        << "Cannot load "
+           "NeuronModel_"
+           "identifyInputsAnd"
+           "Outputs!";
     return NeuronModel_identifyInputsAndOutputs_;
   }
 
   NeuronCompilation_create_Type NeuronCompilation_create() {
-    CHECK(NeuronCompilation_create_ != nullptr)
-        << "Cannot load NeuronCompilation_create!";
+    CHECK(NeuronCompilation_create_ != nullptr) << "Cannot load "
+                                                   "NeuronCompilation"
+                                                   "_create!";
     return NeuronCompilation_create_;
   }
 
   NeuronCompilation_free_Type NeuronCompilation_free() {
-    CHECK(NeuronCompilation_free_ != nullptr)
-        << "Cannot load NeuronCompilation_free!";
+    CHECK(NeuronCompilation_free_ != nullptr) << "Cannot load "
+                                                 "NeuronCompilation"
+                                                 "_free!";
     return NeuronCompilation_free_;
   }
 
   NeuronCompilation_finish_Type NeuronCompilation_finish() {
-    CHECK(NeuronCompilation_finish_ != nullptr)
-        << "Cannot load NeuronCompilation_finish!";
+    CHECK(NeuronCompilation_finish_ != nullptr) << "Cannot load "
+                                                   "NeuronCompilation"
+                                                   "_finish!";
     return NeuronCompilation_finish_;
   }
 
+  NeuronCompilation_setCaching_Type NeuronCompilation_setCaching() {
+    CHECK(NeuronCompilation_setCaching_ != nullptr) << "Cannot load "
+                                                       "NeuronCompilation"
+                                                       "_setCaching!";
+    return NeuronCompilation_setCaching_;
+  }
+
   NeuronCompilation_createForDevices_Type NeuronCompilation_createForDevices() {
-    CHECK(NeuronCompilation_createForDevices_ != nullptr)
-        << "Cannot load NeuronCompilation_createForDevices!";
+    CHECK(NeuronCompilation_createForDevices_ != nullptr) << "Cannot load "
+                                                             "NeuronCompilation"
+                                                             "_createForDevices"
+                                                             "!";
     return NeuronCompilation_createForDevices_;
   }
 
+  NeuronCompilation_storeCompiledNetwork_Type
+  NeuronCompilation_storeCompiledNetwork() {
+    CHECK(NeuronCompilation_storeCompiledNetwork_ != nullptr)
+        << "Cannot load "
+           "NeuronCompilation"
+           "_storeCompiledNet"
+           "work!";
+    return NeuronCompilation_storeCompiledNetwork_;
+  }
+
+  NeuronCompilation_getCompiledNetworkSize_Type
+  NeuronCompilation_getCompiledNetworkSize() {
+    CHECK(NeuronCompilation_getCompiledNetworkSize_ != nullptr)
+        << "Cannot load "
+           "NeuronCompilation"
+           "_getCompiledNetwo"
+           "rkSize!";
+    return NeuronCompilation_getCompiledNetworkSize_;
+  }
+
   NeuronExecution_create_Type NeuronExecution_create() {
-    CHECK(NeuronExecution_create_ != nullptr)
-        << "Cannot load NeuronExecution_create!";
+    CHECK(NeuronExecution_create_ != nullptr) << "Cannot load "
+                                                 "NeuronExecution_"
+                                                 "create!";
     return NeuronExecution_create_;
   }
 
   NeuronExecution_free_Type NeuronExecution_free() {
-    CHECK(NeuronExecution_free_ != nullptr)
-        << "Cannot load NeuronExecution_free!";
+    CHECK(NeuronExecution_free_ != nullptr) << "Cannot load "
+                                               "NeuronExecution_"
+                                               "free!";
     return NeuronExecution_free_;
   }
 
   NeuronExecution_setInput_Type NeuronExecution_setInput() {
-    CHECK(NeuronExecution_setInput_ != nullptr)
-        << "Cannot loadcl NeuronExecution_setInput!";
+    CHECK(NeuronExecution_setInput_ != nullptr) << "Cannot load "
+                                                   "NeuronExecution_"
+                                                   "setInput!";
     return NeuronExecution_setInput_;
   }
 
   NeuronExecution_setOutput_Type NeuronExecution_setOutput() {
-    CHECK(NeuronExecution_setOutput_ != nullptr)
-        << "Cannot load NeuronExecution_setOutput!";
+    CHECK(NeuronExecution_setOutput_ != nullptr) << "Cannot load "
+                                                    "NeuronExecution_"
+                                                    "setOutput!";
     return NeuronExecution_setOutput_;
   }
 
   NeuronExecution_compute_Type NeuronExecution_compute() {
-    CHECK(NeuronExecution_compute_ != nullptr)
-        << "Cannot load NeuronExecution_compute!";
+    CHECK(NeuronExecution_compute_ != nullptr) << "Cannot load "
+                                                  "NeuronExecution_"
+                                                  "compute!";
     return NeuronExecution_compute_;
   }
 
   Neuron_getDeviceCount_Type Neuron_getDeviceCount() {
-    CHECK(Neuron_getDeviceCount_ != nullptr)
-        << "Cannot load Neuron_getDeviceCount!";
+    CHECK(Neuron_getDeviceCount_ != nullptr) << "Cannot load "
+                                                "Neuron_"
+                                                "getDeviceCount!";
     return Neuron_getDeviceCount_;
   }
 
   Neuron_getDevice_Type Neuron_getDevice() {
-    CHECK(Neuron_getDevice_ != nullptr) << "Cannot load Neuron_getDevice!";
+    CHECK(Neuron_getDevice_ != nullptr) << "Cannot load "
+                                           "Neuron_"
+                                           "getDevice!";
     return Neuron_getDevice_;
   }
 
   NeuronDevice_getName_Type NeuronDevice_getName() {
-    CHECK(NeuronDevice_getName_ != nullptr)
-        << "Cannot load NeuronDevice_getName!";
+    CHECK(NeuronDevice_getName_ != nullptr) << "Cannot load "
+                                               "NeuronDevice_"
+                                               "getName!";
     return NeuronDevice_getName_;
   }
 
@@ -226,11 +306,18 @@ class NeuronAdapter final {
       nullptr};
   NeuronModel_identifyInputsAndOutputs_Type
       NeuronModel_identifyInputsAndOutputs_{nullptr};
+  NeuronModel_restoreFromCompiledNetwork_Type
+      NeuronModel_restoreFromCompiledNetwork_{nullptr};
   NeuronCompilation_create_Type NeuronCompilation_create_{nullptr};
   NeuronCompilation_free_Type NeuronCompilation_free_{nullptr};
   NeuronCompilation_finish_Type NeuronCompilation_finish_{nullptr};
+  NeuronCompilation_setCaching_Type NeuronCompilation_setCaching_{nullptr};
   NeuronCompilation_createForDevices_Type NeuronCompilation_createForDevices_{
       nullptr};
+  NeuronCompilation_storeCompiledNetwork_Type
+      NeuronCompilation_storeCompiledNetwork_{nullptr};
+  NeuronCompilation_getCompiledNetworkSize_Type
+      NeuronCompilation_getCompiledNetworkSize_{nullptr};
   NeuronExecution_create_Type NeuronExecution_create_{nullptr};
   NeuronExecution_free_Type NeuronExecution_free_{nullptr};
   NeuronExecution_setInput_Type NeuronExecution_setInput_{nullptr};
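Taken together, the new functions split caching into three primitives: query the compiled network's size, copy it into a caller-owned buffer, and rebuild a model/compilation pair from such a buffer. A minimal round-trip sketch that mirrors how subgraph_compute.cc uses them below (error handling trimmed):

```cpp
#include <vector>

// Serialize a finished compilation into a byte buffer.
bool SaveCompiledNetwork(NeuronCompilation* compilation,
                         std::vector<char>* buffer) {
  size_t size = 0;
  if (NeuronCompilation_getCompiledNetworkSize(compilation, &size) !=
      NEURON_NO_ERROR) {
    return false;
  }
  buffer->resize(size);
  return NeuronCompilation_storeCompiledNetwork(
             compilation, buffer->data(), size) == NEURON_NO_ERROR;
}

// Rebuild a model/compilation pair from a previously stored buffer.
bool RestoreCompiledNetwork(const std::vector<char>& buffer,
                            NeuronModel** model,
                            NeuronCompilation** compilation) {
  return NeuronModel_restoreFromCompiledNetwork(
             model, compilation, buffer.data(), buffer.size()) ==
         NEURON_NO_ERROR;
}
```

Note that NeuronCompilation_setCaching is also hooked up here but is not called anywhere in this patch; the file-based .dla/.cfg path below is the only cache mechanism exercised.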
diff --git a/lite/core/context.h b/lite/core/context.h
index f140e7575b82b264e27cec00ac8eb05fcd33eb2d..6db0faffe4843d2b2dedb977605268de7be09ac1 100644
--- a/lite/core/context.h
+++ b/lite/core/context.h
@@ -144,6 +144,21 @@ class Context<TargetType::kAPU> {
   APUContext& operator=(const APUContext& ctx) {}
   std::string name() const { return "APUContext"; }
+
+  static void SetSubgraphModelCacheDir(Scope* scope,
+                                       std::string subgraph_model_cache_dir) {
+    auto var = scope->Var("SUBGRAPH_MODEL_CACHE_DIR");
+    CHECK(var);
+    auto data = var->GetMutable<std::string>();
+    CHECK(data);
+    *data = subgraph_model_cache_dir;
+  }
+
+  static std::string SubgraphModelCacheDir(Scope* scope) {
+    auto var = scope->FindVar("SUBGRAPH_MODEL_CACHE_DIR");
+    if (!var) return "";
+    return var->Get<std::string>();
+  }
 };
 #endif
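The cache directory travels from the user-facing config to the APU kernel through a named scope variable rather than a new kernel parameter. A round-trip sketch, given the predictor's root scope (namespaces abbreviated; the path is a placeholder):

```cpp
// Written once at predictor Init(), read back at subgraph build time.
// A scope that never received the variable yields an empty string, which
// the kernel treats as "caching disabled".
void Demo(paddle::lite::Scope* scope) {
  using paddle::lite::Context;
  using paddle::lite::TargetType;
  Context<TargetType::kAPU>::SetSubgraphModelCacheDir(
      scope, "/data/local/tmp/apu_cache");
  std::string dir = Context<TargetType::kAPU>::SubgraphModelCacheDir(scope);
  CHECK_EQ(dir, "/data/local/tmp/apu_cache");
}
```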
diff --git a/lite/kernels/apu/bridges/CMakeLists.txt b/lite/kernels/apu/bridges/CMakeLists.txt
old mode 100755
new mode 100644
diff --git a/lite/kernels/apu/bridges/conv_transpose_op.cc b/lite/kernels/apu/bridges/conv_transpose_op.cc
index 386c89c128e476611ebde4b337823775b5ae01a9..ecc6677ac1e8f80f473caefc44d51248e8609476 100644
--- a/lite/kernels/apu/bridges/conv_transpose_op.cc
+++ b/lite/kernels/apu/bridges/conv_transpose_op.cc
@@ -53,7 +53,7 @@ int ConvTransposeConverter(void *ctx, OpLite *op, KernelBase *kernel) {
   auto strides = op_info->GetAttr<std::vector<int>>("strides");
   CHECK_EQ(strides.size(), 2L);
-  auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
+  std::vector<int> paddings = op_info->GetAttr<std::vector<int>>("paddings");
   auto groups = op_info->GetAttr<int>("groups");
   if (groups > 1) {
     LOG(WARNING) << "[NPU] only support groups == 1";
@@ -70,7 +70,7 @@ int ConvTransposeConverter(void *ctx, OpLite *op, KernelBase *kernel) {
   auto fuse_relu =
       op_info->HasAttr("fuse_relu") && op_info->GetAttr<bool>("fuse_relu");
-  auto dilations = op_info->GetAttr<std::vector<int>>("dilations");
+  std::vector<int> dilations = op_info->GetAttr<std::vector<int>>("dilations");
   CHECK_EQ(dilations.size(), 2L);
   std::string padding_algorithm =
       op_info->HasAttr("padding_algorithm")
diff --git a/lite/kernels/apu/bridges/graph.cc b/lite/kernels/apu/bridges/graph.cc
old mode 100755
new mode 100644
diff --git a/lite/kernels/apu/bridges/paddle_use_bridges.h b/lite/kernels/apu/bridges/paddle_use_bridges.h
old mode 100755
new mode 100644
diff --git a/lite/kernels/apu/bridges/utility.h b/lite/kernels/apu/bridges/utility.h
old mode 100755
new mode 100644
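The cache key used by the DeviceProgram introduced in the next file is an MD5 hash over the input names, the input shapes, and the output names, so every distinct input-shape combination maps to its own .dla/.cfg pair on disk. A small sketch of the consequence (namespaces elided; MD5() is the helper from lite/utils/md5.h that the new code itself uses):

```cpp
DeviceProgram program;
// Same graph, two batch sizes -> two different cache entries.
auto name_bs1 = program.GenerateModelName({"x"}, {"y"}, {{1, 3, 224, 224}});
auto name_bs2 = program.GenerateModelName({"x"}, {"y"}, {{2, 3, 224, 224}});
CHECK_NE(name_bs1, name_bs2);
```

Note that the key covers names and shapes only: if the weights under an unchanged model directory are updated, an old cache entry would be silently reused, so the cache directory should be cleared whenever the model itself changes.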
diff --git a/lite/kernels/apu/subgraph_compute.cc b/lite/kernels/apu/subgraph_compute.cc
old mode 100755
new mode 100644
index 5e86514478f421ece6642afdd0bfaab4025420bb..825e735a2f0709ba979fa7a2a7fc203539e8f483
--- a/lite/kernels/apu/subgraph_compute.cc
+++ b/lite/kernels/apu/subgraph_compute.cc
@@ -22,17 +22,102 @@
 #include "lite/kernels/apu/bridges/graph.h"
 #include "lite/kernels/apu/bridges/paddle_use_bridges.h"
 #include "lite/kernels/apu/bridges/utility.h"
+#include "lite/utils/io.h"
+#include "lite/utils/md5.h"
 
 namespace paddle {
 namespace lite {
 namespace kernels {
 namespace apu {
 
-bool SubgraphEngine::BuildDeviceProgram() {
-  if (!origin_program_) {
-    BuildOriginProgram();
+// Generate the model name by using md5 hashes based on:
+// 1. the sorted variable input names
+// 2. the shapes of the origin input tensors
+// 3. the sorted variable output names
+std::string DeviceProgram::GenerateModelName(
+    const std::vector<std::string>& input_names,
+    const std::vector<std::string>& output_names,
+    const std::vector<std::vector<int64_t>>& origin_idims) {
+  std::ostringstream os;
+  CHECK_EQ(input_names.size(), origin_idims.size());
+  for (int i = 0; i < input_names.size(); i++) {
+    os << input_names[i];
+    for (auto dim : origin_idims[i]) {
+      os << dim;
+    }
+  }
+  for (auto output_name : output_names) {
+    os << output_name;
+  }
+  return MD5(os.str());
+}
+
+// Deserialize the generated model
+bool DeviceProgram::LoadFromCacheFile(
+    const std::vector<std::string>& input_names,
+    const std::vector<std::string>& output_names,
+    const std::vector<std::vector<int64_t>>& origin_idims,
+    const std::string& model_cache_dir) {
+  int status;
+
+  // Generate the model name if not initialized
+  if (model_name_.empty()) {
+    model_name_ = GenerateModelName(input_names, output_names, origin_idims);
   }
+  // Load from the cached model file
+  auto model_path = model_cache_dir + "/" + model_name_ + ".dla";
+  VLOG(3) << "[APU] Load model from " << model_path;
+  std::vector<char> compilationBuffer;
+  if (!ReadFile(model_path, &compilationBuffer)) {
+    LOG(WARNING) << "[APU] Open " << model_path << " for reading failed!";
+    return false;
+  }
+  model_ = nullptr;
+  compilation_ = nullptr;
+  status = NeuronModel_restoreFromCompiledNetwork(
+      &model_, &compilation_, &compilationBuffer[0], compilationBuffer.size());
+  if (status != NEURON_NO_ERROR) {
+    LOG(WARNING) << "[APU] Load model failed! Size: "
+                 << compilationBuffer.size();
+    return false;
+  }
+
+  VLOG(3) << "[APU] Completed loading model!";
+
+  // Deserialize the precisions and shapes of the origin output tensors
+  // from the cached configuration file, so the outputs can be resized
+  // before execution
+  auto config_path = model_cache_dir + "/" + model_name_ + ".cfg";
+  VLOG(3) << "[APU] Load configuration from " << config_path;
+  std::vector<char> config_buffer;
+  if (!ReadFile(config_path, &config_buffer)) {
+    LOG(WARNING) << "[APU] read from " << config_path << " failed!";
+    return false;
+  }
+
+  std::string str(config_buffer.begin(), config_buffer.end());
+  // Parse the precision and shapes of the output tensors
+  auto output_options = Split<std::string>(str, ";");
+  CHECK_EQ(output_options.size(), output_names.size());
+  origin_otypes_.resize(output_names.size());
+  origin_odims_.resize(output_names.size());
+  for (int i = 0; i < output_names.size(); i++) {
+    auto items = Split<std::string>(output_options[i], ":");
+    CHECK_EQ(items.size(), 2);  // precision and shapes
+    origin_otypes_[i] = static_cast<PrecisionType>(std::stoi(items[0]));
+    origin_odims_[i] = Split<int64_t>(items[1], ",");
+  }
+  return true;
+}
+
+bool DeviceProgram::BuildGraphAndCacheToFile(
+    RuntimeProgram* origin_program,
+    const std::vector<std::string>& input_names,
+    const std::vector<std::string>& output_names,
+    const std::vector<std::vector<int64_t>>& origin_idims,
+    const std::vector<Tensor*>& origin_itensors,
+    const std::vector<Tensor*>& origin_otensors,
+    const std::string& model_cache_dir) {
   auto GetCurrentUS = []() -> double {
     struct timeval time;
     gettimeofday(&time, NULL);
@@ -53,13 +138,14 @@ bool SubgraphEngine::BuildDeviceProgram() {
     return false;
   }
   graph.set_model(model_);
-  graph.set_input_names(input_names_);
-  graph.set_output_names(output_names_);
+  graph.set_input_names(input_names);
+  graph.set_output_names(output_names);
 
   // Convert all of ops and their input vars and weights and added into the APU
   // NIR graph
   const auto& bridges = subgraph::SubgraphBridgeRegistry::Instance();
-  const auto& insts = origin_program_->instructions(kRootBlockIdx);
+  const auto& insts = origin_program->instructions(kRootBlockIdx);
+
   for (auto& inst : insts) {
     auto op = const_cast<OpLite*>(inst.op());
     CHECK(op);
@@ -82,25 +168,25 @@ bool SubgraphEngine::BuildDeviceProgram() {
 
   // Get the index of input tensors
   std::vector<uint32_t> input_indices;
-  for (int i = 0; i < input_names_.size(); i++) {
-    CHECK(graph.Has(input_names_[i])) << "[APU] Failed to find input node "
-                                      << input_names_[i];
-    auto index = graph.Get(input_names_[i])->index();
+  for (int i = 0; i < input_names.size(); i++) {
+    CHECK(graph.Has(input_names[i])) << "[APU] Failed to find input node "
+                                     << input_names[i];
+    auto index = graph.Get(input_names[i])->index();
     input_indices.push_back(index);
-    VLOG(3) << "[APU] Input[" << i << "] name " << input_names_[i] << " dims "
-            << origin_itensors_[i]->dims() << " index " << index;
+    VLOG(3) << "[APU] Input[" << i << "] name " << input_names[i] << " dims "
+            << origin_itensors[i]->dims() << " index " << index;
   }
 
   // Get the index of output tensors
   std::vector<uint32_t> output_indices;
-  for (int i = 0; i < output_names_.size(); i++) {
-    CHECK(graph.Has(output_names_[i])) << "[APU] Failed to find output node "
-                                       << output_names_[i];
-    origin_otensors_[i]->mutable_data<int8_t>();
-    auto index = graph.Get(output_names_[i])->index();
+  for (int i = 0; i < output_names.size(); i++) {
+    CHECK(graph.Has(output_names[i])) << "[APU] Failed to find output node "
+                                      << output_names[i];
+    origin_otensors[i]->mutable_data<int8_t>();
+    auto index = graph.Get(output_names[i])->index();
     output_indices.push_back(index);
-    VLOG(3) << "[APU] Output[" << i << "] name " << output_names_[i] << " dims "
-            << origin_otensors_[i]->dims() << " index " << index;
+    VLOG(3) << "[APU] Output[" << i << "] name " << output_names[i] << " dims "
+            << origin_otensors[i]->dims() << " index " << index;
   }
 
   // Indentify the input and output tensors of the neuron model
@@ -114,7 +200,6 @@ bool SubgraphEngine::BuildDeviceProgram() {
     LOG(WARNING) << "[APU] Fail to create NIR model:" << neuron_errCode;
     return false;
   }
-  VLOG(3) << "[APU] APU NIR model created!";
   VLOG(1) << "[APU] APU NIR model created, Create cost "
           << GetCurrentUS() - start_time << " us";
 
@@ -127,9 +212,110 @@ bool SubgraphEngine::BuildDeviceProgram() {
   }
   VLOG(1) << "[APU] APU DLA model created, Build cost "
           << GetCurrentUS() - start_time << " us";
+
+  CHECK_EQ(origin_otensors.size(), output_names.size());
+  origin_otypes_.resize(output_names.size());
+  origin_odims_.resize(output_names.size());
+  for (size_t i = 0; i < output_names.size(); i++) {
+    origin_otypes_[i] = origin_otensors[i]->precision();
+    origin_odims_[i] = origin_otensors[i]->dims().Vectorize();
+  }
+  if (!model_cache_dir.empty()) {
+    // Save the generated model to file
+    auto model_path = model_cache_dir + "/" + model_name_ + ".dla";
+    VLOG(3) << "[APU] Save model to " << model_path;
+
+    size_t compilationSize;
+    status = NeuronCompilation_getCompiledNetworkSize(compilation_,
+                                                      &compilationSize);
+    if (status == NEURON_NO_ERROR) {
+      // Serialize the DLA
+      std::vector<char> model_buffer;
+      model_buffer.resize(compilationSize);
+      status = NeuronCompilation_storeCompiledNetwork(
+          compilation_, &model_buffer[0], compilationSize);
+      if (status != NEURON_NO_ERROR) {
+        LOG(WARNING) << "[APU] Serializing the DLA failed!";
+      }
+
+      VLOG(3) << "[APU] Export the model to " << model_path;
+      if (!WriteFile(model_path, model_buffer)) {
+        LOG(WARNING) << "[APU] Open " << model_path << " for writing failed!";
+      }
+    }
+
+    // Serialize the precisions and shapes of the origin output tensors into
+    // the configuration file
+    std::ostringstream os;
+    for (int i = 0; i < output_names.size(); i++) {
+      os << static_cast<int32_t>(origin_otypes_[i]) << ":";
+      for (auto dim : origin_odims_[i]) {
+        os << dim << ",";
+      }
+      os << ";";
+    }
+    auto str = os.str();
+    std::vector<char> config_buffer(str.begin(), str.end());
+    auto config_path = model_cache_dir + "/" + model_name_ + ".cfg";
+    VLOG(3) << "[APU] Save configuration to " << config_path;
+    if (!WriteFile(config_path, config_buffer)) {
+      LOG(WARNING) << "[APU] Open " << config_path << " for writing failed!";
+    }
+  }
+
   return true;
 }
 
+bool SubgraphEngine::BuildDeviceProgram() {
+  // Check if the cached device program exists
+  if (!device_programs_.count(origin_idims_)) {
+    auto device_program = std::make_shared<DeviceProgram>();
+    // Obtain the model cache dir from the APU context of the subgraph op
+    auto model_cache_dir =
+        ctx_->As<APUContext>().SubgraphModelCacheDir(exec_scope_);
+    VLOG(3) << "[APU] Getting subgraph_model_cache_dir: " << model_cache_dir;
+    // Try to load the cached model and configuration file if they exist
+    if (model_cache_dir.empty() ||
+        !device_program->LoadFromCacheFile(
+            input_names_, output_names_, origin_idims_, model_cache_dir)) {
+      // Build the model online: convert the Paddle ops to NIR nodes, build
+      // the MTK NIR graph, and compile the NIR graph into a DLA
+      if (!origin_program_) {
+        BuildOriginProgram();
+      }
+      CHECK(origin_program_) << "[APU] The origin program is not initialized!";
+      CHECK_GT(origin_program_->instructions().size(), 0)
+          << "[APU] No instructions found in the origin program!";
+      if (!device_program->BuildGraphAndCacheToFile(origin_program_.get(),
+                                                    input_names_,
+                                                    output_names_,
+                                                    origin_idims_,
+                                                    origin_itensors_,
+                                                    origin_otensors_,
+                                                    model_cache_dir)) {
+        return false;
+      }
+    }
+    if (device_program->model_ == nullptr) {
+      LOG(WARNING) << "[APU] Failed to create the DLA model!";
+      return false;
+    }
+    device_programs_[origin_idims_] = device_program;
+  }
+
+  // Resize the origin output tensors according to the cached configuration
+  auto device_program = device_programs_[origin_idims_];
+  CHECK(device_program && device_program->model_);
+  for (int i = 0; i < output_names_.size(); i++) {
+    origin_otensors_[i]->Resize(device_program->origin_odims_[i]);
+    origin_otensors_[i]->mutable_data<int8_t>();
+    VLOG(3) << "[APU] Output[" << i << "] name " << output_names_[i]
+            << " dims " << origin_otensors_[i]->dims() << " memory_size "
+            << origin_otensors_[i]->memory_size();
+  }
+  return true;
+}
+
 bool SubgraphEngine::LaunchDeviceProgram() {
   auto GetCurrentUS = []() -> double {
     struct timeval time;
@@ -137,9 +323,17 @@ bool SubgraphEngine::LaunchDeviceProgram() {
     return 1e+6 * time.tv_sec + time.tv_usec;
   };
 
+  if (device_programs_.count(origin_idims_) == 0 ||
+      device_programs_[origin_idims_]->model_ == nullptr) {
+    return LaunchOriginProgram();
+  }
+
+  auto device_program = device_programs_[origin_idims_];
+
   auto start_time = GetCurrentUS();
   NeuronExecution* run = NULL;
-  int neuron_errCode = NeuronExecution_create(compilation_, &run);
+  int neuron_errCode =
+      NeuronExecution_create(device_program->compilation_, &run);
   if (NEURON_NO_ERROR != neuron_errCode) {
     LOG(WARNING) << "[APU] Build APU runtime failed!";
     return false;
@@ -187,11 +381,13 @@
 }
 
 SubgraphEngine::~SubgraphEngine() {
-  if (compilation_) {
-    NeuronCompilation_free(compilation_);
-  }
-  if (model_) {
-    NeuronModel_free(model_);
+  for (auto& device_program : device_programs_) {
+    if (device_program.second->compilation_) {
+      NeuronCompilation_free(device_program.second->compilation_);
+    }
+    if (device_program.second->model_) {
+      NeuronModel_free(device_program.second->model_);
+    }
   }
 }
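The .cfg file written by BuildGraphAndCacheToFile and parsed by LoadFromCacheFile is a flat ASCII encoding, one `<precision>:<dim>,<dim>,...,;` record per output. A sketch of a parse, matching the Split calls above (the precision integer is whatever the PrecisionType enumerator encodes to; 2 is assumed here for kInt8, and Split is assumed to skip the empty tokens the writer's trailing separators produce):

```cpp
// Two outputs, both (assumed) kInt8, with shapes {1, 1000} and {1, 10}.
std::string str = "2:1,1000,;2:1,10,;";
auto output_options = Split<std::string>(str, ";");  // one record per output
for (auto& option : output_options) {
  auto items = Split<std::string>(option, ":");
  auto precision = static_cast<PrecisionType>(std::stoi(items[0]));
  auto dims = Split<int64_t>(items[1], ",");
  // ... resize the corresponding output tensor with `dims` ...
}
```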
diff --git a/lite/kernels/apu/subgraph_compute.h b/lite/kernels/apu/subgraph_compute.h
index f85f315220e0b52baf1cb1388ce6edd5bc004d1a..f2c593d58da9bbf9716cf4d89a6c8844a4c004b9 100644
--- a/lite/kernels/apu/subgraph_compute.h
+++ b/lite/kernels/apu/subgraph_compute.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include <map>
 #include <memory>
 #include <string>
 #include <vector>
@@ -27,14 +28,43 @@ namespace lite {
 namespace kernels {
 namespace apu {
 
+class DeviceProgram {
+ public:
+  DeviceProgram() {}
+  ~DeviceProgram() {}
+  std::string GenerateModelName(
+      const std::vector<std::string>& input_names,
+      const std::vector<std::string>& output_names,
+      const std::vector<std::vector<int64_t>>& origin_idims);
+  bool LoadFromCacheFile(const std::vector<std::string>& input_names,
+                         const std::vector<std::string>& output_names,
+                         const std::vector<std::vector<int64_t>>& origin_idims,
+                         const std::string& model_cache_dir);
+  bool BuildGraphAndCacheToFile(
+      RuntimeProgram* origin_program,
+      const std::vector<std::string>& input_names,
+      const std::vector<std::string>& output_names,
+      const std::vector<std::vector<int64_t>>& origin_idims,
+      const std::vector<Tensor*>& origin_itensors,
+      const std::vector<Tensor*>& origin_otensors,
+      const std::string& model_cache_dir);
+
+ public:
+  std::string model_name_{""};
+  std::vector<std::vector<int64_t>> origin_odims_;
+  std::vector<PrecisionType> origin_otypes_;
+  NeuronModel* model_{nullptr};
+  NeuronCompilation* compilation_{nullptr};
+};
+
 class SubgraphEngine : public subgraph::SubgraphEngineBase {
  public:
-  SubgraphEngine(KernelContext *ctx,
+  SubgraphEngine(KernelContext* ctx,
                  int block_idx,
-                 const std::shared_ptr<const cpp::ProgramDesc> &program_desc,
-                 Scope *exec_scope,
-                 const std::vector<std::string> &input_names,
-                 const std::vector<std::string> &output_names)
+                 const std::shared_ptr<const cpp::ProgramDesc>& program_desc,
+                 Scope* exec_scope,
+                 const std::vector<std::string>& input_names,
+                 const std::vector<std::string>& output_names)
       : subgraph::SubgraphEngineBase(ctx,
                                      block_idx,
                                      program_desc,
@@ -48,8 +78,8 @@ class SubgraphEngine : public subgraph::SubgraphEngineBase {
   bool BuildDeviceProgram() override;
   bool LaunchDeviceProgram() override;
 
-  NeuronModel *model_;
-  NeuronCompilation *compilation_;
+  std::map<std::vector<std::vector<int64_t>>, std::shared_ptr<DeviceProgram>>
+      device_programs_;
 };
 
 class SubgraphCompute
diff --git a/lite/tests/api/test_mobilenetv1_int8_mediatek_apu.cc b/lite/tests/api/test_mobilenetv1_int8_mediatek_apu.cc
index 76b3722d2d6d4d15fb57a00b055d714ad8d2e1c5..e61b20b54a2ed0c453e28bd5e0fa19a54a7b2455 100644
--- a/lite/tests/api/test_mobilenetv1_int8_mediatek_apu.cc
+++ b/lite/tests/api/test_mobilenetv1_int8_mediatek_apu.cc
@@ -33,11 +33,13 @@ namespace paddle {
 namespace lite {
 
 TEST(MobileNetV1, test_mobilenetv1_int8_mediatek_apu) {
+  std::string subgraph_model_cache_dir = FLAGS_model_dir;
   lite_api::CxxConfig config;
   config.set_model_dir(FLAGS_model_dir);
   config.set_valid_places({lite_api::Place{TARGET(kARM), PRECISION(kFloat)},
                            lite_api::Place{TARGET(kARM), PRECISION(kInt8)},
                            lite_api::Place{TARGET(kAPU), PRECISION(kInt8)}});
+  config.set_subgraph_model_cache_dir(subgraph_model_cache_dir);
   auto predictor = lite_api::CreatePaddlePredictor(config);
 
   std::string raw_data_dir = FLAGS_data_dir + std::string("/raw_data");
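With the cache directory pointed at FLAGS_model_dir, the first execution of this test compiles the MobileNetV1 subgraph online and leaves `<md5>.dla` and `<md5>.cfg` next to the model files; later executions, and later inferences in the same run that reuse an already-seen input shape, restore the compiled network via NeuronModel_restoreFromCompiledNetwork instead of recompiling. If either cache file is missing or unreadable, the engine falls back to the online build, and LaunchDeviceProgram falls back to LaunchOriginProgram when no DLA is available, so an empty or unreadable cache costs compilation time but not correctness.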