Unverified commit 7e20bfc6, authored by shitouren1994, committed by GitHub

Add opencl cache (#1333)

* opencl add generate_cl_binary

* remove chinese cl

* add opencl cache, fix winograd limit to support yolov3

* apply code-format changes

* add opencl cache, fix winograd limit to support yolov3

* apply code-format changes
Co-authored-by: shitouren1994 <shitouren1994@users.noreply.github.com>
Parent 009319ed
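For quick reference, here is a minimal usage sketch of the cache option added by this PR. The helper name create_ocl_graph_with_cache is illustrative and error handling is trimmed; the calls simply mirror the tm_yolov3_opencl example changed below.

// Minimal sketch: enable the OpenCL auto-tune cache for a Tengine graph.
// First run: store_cache = true writes ./test.cache during prerun.
// Later runs: load_cache = true reuses the file, so prerun is much faster.
#include "tengine/c_api.h"
#include "../source/device/opencl/ocl_define.h"

static graph_t create_ocl_graph_with_cache(const char* model_file, bool first_run)
{
    context_t opencl_context = create_context("ocl", 1);

    struct ocl_option option;
    option.cache_path = (char*)"./test.cache";
    option.store_cache = first_run;  // generate the cache on the first run
    option.load_cache = !first_run;  // reuse it on every later run

    if (set_context_device(opencl_context, "OCL", (void*)&option, sizeof(option)) < 0)
        return NULL;

    // prerun_graph/run_graph follow as usual; the cache is written or read during prerun
    return create_graph(opencl_context, "tengine", model_file);
}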
@@ -134,6 +134,7 @@ IF (OpenCV_FOUND)
TENGINE_EXAMPLE_CV (tm_alphapose tm_alphapose.cpp)
TENGINE_EXAMPLE_CV (tm_yolov3 tm_yolov3.cpp)
TENGINE_EXAMPLE_CV (tm_yolov3_uint8 tm_yolov3_uint8.cpp)
TENGINE_EXAMPLE_CV (tm_yolov3_opencl tm_yolov3_opencl.cpp)
TENGINE_EXAMPLE_CV (tm_yolov3_tiny tm_yolov3_tiny.cpp)
TENGINE_EXAMPLE_CV (tm_yolov3_tiny_opendla tm_yolov3_tiny_opendla.cpp)
TENGINE_EXAMPLE_CV (tm_yolov3_tiny_int8 tm_yolov3_tiny_int8.cpp)
......
@@ -35,6 +35,7 @@
#include "common.h"
#include "tengine/c_api.h"
#include "tengine_operations.h"
#include "../source/device/opencl/ocl_define.h"
struct Object
{
@@ -374,7 +375,17 @@ int main(int argc, char* argv[])
// context_t for opencl
context_t opencl_context = create_context("ocl", 1);
struct ocl_option option;
// first run: set store_cache to generate the opencl auto-tune cache file
option.cache_path = "./test.cache";
option.store_cache = true;
// later runs: load the cache from the file instead, which makes pre_run much faster
// option.cache_path = "./test.cache";
// option.load_cache = true;
int rtt = set_context_device(opencl_context, "OCL", (void*)&option, sizeof(option));
if (0 > rtt)
{
fprintf(stderr, " add_context_device opencl failed.\n");
@@ -436,6 +447,9 @@ int main(int argc, char* argv[])
}
double end = get_current_time();
double cur = end - start;
fprintf(stderr, "Repeat %d times, thread %d, cur time %.2f ms\n", repeat_count, num_thread,
cur);
total_time += cur;
min_time = std::min(min_time, cur);
max_time = std::max(max_time, cur);
......
@@ -9,7 +9,19 @@ UNSET (_DEV_OCL_LINKER_OPTIONS)
UNSET (_DEV_OCL_LINK_LIBRARIES)
# add link options
OPTION(TENGINE_OPENCL_PROFILE_TIME "enable opencl profile time" OFF)
IF (TENGINE_OPENCL_PROFILE_TIME)
LIST(APPEND _DEV_OCL_COMPILER_DEFINES OPENCL_PROFILE_TIME)
ENDIF ()
OPTION(TENGINE_OPENCL_MODEL_CACHE "enable opencl cache" ON)
OPTION(TENGINE_OPENCL_DEBUG_DATA "enable node tensor debug data" OFF)
IF (TENGINE_OPENCL_DEBUG_DATA)
LIST(APPEND _DEV_OCL_COMPILER_DEFINES OPENCL_DEBUG_DATA)
ENDIF()
# set source root path
SET(_OCL_ROOT ${CMAKE_SOURCE_DIR}/source/device/opencl)
@@ -17,39 +29,34 @@ SET(_OCL_ROOT ${CMAKE_SOURCE_DIR}/source/device/opencl)
LIST (APPEND _DEV_OCL_HEADER_PATH ${_OCL_ROOT})
LIST (APPEND _DEV_OCL_HEADER_PATH ${_OCL_ROOT}/include)
# add source files
AUX_SOURCE_DIRECTORY("${_OCL_ROOT}" _OCL_BASE_SOURCE)
AUX_SOURCE_DIRECTORY("${_OCL_ROOT}/oppack4" _OCL_OPS_SOURCE_PACK4)
LIST(APPEND _DEV_OCL_DEVICE_SOURCE "${_OCL_ROOT}/cl4/ocl_program_hex.cc")
LIST(APPEND _DEV_OCL_DEVICE_SOURCE ${_OCL_BASE_SOURCE})
LIST(APPEND _DEV_OCL_DEVICE_SOURCE ${_OCL_OPS_SOURCE})
LIST(APPEND _DEV_OCL_DEVICE_SOURCE ${_OCL_OPS_SOURCE_PACK4})
IF (TENGINE_OPENCL_MODEL_CACHE)
AUX_SOURCE_DIRECTORY("${_OCL_ROOT}/cache" _OCL_CACHE)
LIST(APPEND _DEV_OCL_DEVICE_SOURCE ${_OCL_CACHE})
ENDIF ()
# add build options for cpu device
# is a gcc or clang like compiler
IF (TENGINE_COMPILER_GCC OR TENGINE_COMPILER_CLANG)
IF (TENGINE_COMPILER_GCC AND (${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER_EQUAL "6.1"))
LIST(APPEND _DEV_OCL_COMPILER_OPTIONS -Wno-ignored-attributes)
ENDIF ()
ENDIF()
# 5.2 is Microsoft Visual C++
IF (TENGINE_COMPILER_MSVC)
ENDIF()
# 6. add link options
OPTION(TENGINE_OPENCL_PROFILE_TIME "enable opencl profile time" OFF)
IF (TENGINE_OPENCL_PROFILE_TIME)
LIST (APPEND _DEV_OCL_COMPILER_DEFINES OPENCL_PROFILE_TIME)
ENDIF ()
# 7. add link libs
FIND_PACKAGE(OpenCL)
IF (NOT OpenCL_FOUND)
message(WARNING "please set the OpenCL library path manually")
ENDIF ()
LIST(APPEND _DEV_OCL_LINK_LIBRARIES ${OpenCL_LIBRARY})
# 8. set all to cmake cache
......
#pragma once
#include "cache.hpp"
void cl_cache::de_serializer(const std::string& cache_path)
{
struct stat stat;
int fd = open(cache_path.c_str(), O_RDONLY);
if (fd < 0)
{
TLOG_ERR("cannot open file %s\n", cache_path.c_str());
return;
}
fstat(fd, &stat);
int file_len = stat.st_size;
void* mem_base = (void*)sys_malloc(file_len);
int ret = read(fd, mem_base, file_len);
char* read_current = (char*)mem_base;
uint16_t version = read<uint16_t>(&read_current);
TLOG_ERR("current cache version is: %d \n", version);
int auto_tune_size = read<int>(&read_current);
if (auto_tune_size > 0)
{
std::vector<char> temp_key;
auto_tune_vector.resize(auto_tune_size);
for (int i = 0; i < auto_tune_size; ++i)
{
auto_tune temp_auto_tune{};
int key_size = read<int>(&read_current);
temp_key.resize(key_size);
memcpy(temp_key.data(), read_current, key_size);
std::string key(temp_key.begin(), temp_key.end());
read_current += key_size;
temp_auto_tune.key = key;
temp_auto_tune.global_size[0] = read<int>(&read_current);
temp_auto_tune.global_size[1] = read<int>(&read_current);
temp_auto_tune.global_size[2] = read<int>(&read_current);
temp_auto_tune.local_size[0] = read<int>(&read_current);
temp_auto_tune.local_size[1] = read<int>(&read_current);
temp_auto_tune.local_size[2] = read<int>(&read_current);
auto_tune_vector[i] = temp_auto_tune;
TLOG_ERR("decode cache: %s %d,%d,%d %d,%d,%d \n",
key.c_str(),
temp_auto_tune.global_size[0],
temp_auto_tune.global_size[1],
temp_auto_tune.global_size[2],
temp_auto_tune.local_size[0],
temp_auto_tune.local_size[1],
temp_auto_tune.local_size[2]);
}
}
sys_free(mem_base);
close(fd);
}
void cl_cache::serializer(const std::string& cache_path)
{
int fd = open(cache_path.c_str(), O_RDWR | O_CREAT | O_TRUNC, 0666);
if (fd == -1)
{
TLOG_ERR("Could not open %s\n", cache_path.c_str());
return;
}
auto base = (char*)sys_malloc(get_auto_tune_size());
auto out_ptr = base;
write<uint16_t>(&out_ptr, CACHE_VERSION);
write<int>(&out_ptr, auto_tune_vector.size());
for (int i = 0; i < auto_tune_vector.size(); ++i)
{
write<int>(&out_ptr, auto_tune_vector[i].key.size());
memcpy(out_ptr, auto_tune_vector[i].key.c_str(), auto_tune_vector[i].key.size());
out_ptr += auto_tune_vector[i].key.size();
write<int>(&out_ptr, auto_tune_vector[i].global_size[0]);
write<int>(&out_ptr, auto_tune_vector[i].global_size[1]);
write<int>(&out_ptr, auto_tune_vector[i].global_size[2]);
write<int>(&out_ptr, auto_tune_vector[i].local_size[0]);
write<int>(&out_ptr, auto_tune_vector[i].local_size[1]);
write<int>(&out_ptr, auto_tune_vector[i].local_size[2]);
}
write(fd, base, get_auto_tune_size());
sys_free(base);
close(fd);
}
int cl_cache::get_auto_tune_size()
{
int size = 2 + 4; // uint16 version + int32 entry count, see serializer()
for (int i = 0; i < auto_tune_vector.size(); ++i)
{
size += 4 + 4 * 3 + 4 * 3;
size += auto_tune_vector[i].key.size();
}
return size;
}
void cl_cache::test()
{
int size = 10;
for (int i = 0; i < size; ++i)
{
struct auto_tune temp
{
};
temp.key = "ohoho" + std::to_string(i);
temp.global_size[0] = i;
temp.local_size[0] = i;
auto_tune_vector.push_back(temp);
}
serializer("./cl.cache");
de_serializer("./cl.cache");
}
int cl_cache::get_cache_tune(const std::string& key, auto_tune* tune)
{
auto res = std::find_if(auto_tune_vector.begin(), auto_tune_vector.end(), [key](const auto_tune& left) {
return left.key == key;
});
if (res != auto_tune_vector.end())
{
auto temp_auto_tune = *res;
TLOG_ERR("find cache: %s %d,%d,%d %d,%d,%d \n",
key.c_str(),
temp_auto_tune.global_size[0],
temp_auto_tune.global_size[1],
temp_auto_tune.global_size[2],
temp_auto_tune.local_size[0],
temp_auto_tune.local_size[1],
temp_auto_tune.local_size[2]);
*tune = *res;
return 0;
}
return -1;
}
void cl_cache::set_auto_tune(const auto_tune& tune)
{
TLOG_ERR("add cache: %s %d,%d,%d %d,%d,%d \n",
tune.key.c_str(),
tune.global_size[0],
tune.global_size[1],
tune.global_size[2],
tune.local_size[0],
tune.local_size[1],
tune.local_size[2]);
auto_tune_vector.push_back(tune);
TLOG_ERR("cache size: %d \n", auto_tune_vector.size());
}
#pragma once
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <sys/stat.h>
#include <sys/file.h>
#include <unistd.h>
#include "ocl_cpp_helper.hpp"
#include "utility/sys_port.h"
struct auto_tune
{
std::string key;
std::vector<int> global_size = {0, 0, 0};
std::vector<int> local_size = {0, 0, 0};
};
#define CACHE_VERSION 1
class cl_cache
{
public:
cl_cache()
{
auto_tune_vector.clear();
};
~cl_cache() = default;
void test();
void de_serializer(const std::string& path);
void serializer(const std::string& path);
int get_cache_tune(const std::string& key, auto_tune* tune);
void set_auto_tune(const auto_tune& tune);
private:
std::vector<auto_tune> auto_tune_vector;
int get_auto_tune_size();
};
template<typename Tp>
static inline Tp read(char** current)
{
auto tpr = (Tp*)*current;
*current += sizeof(Tp);
return *tpr;
}
template<typename Tp>
static inline void write(char** current, Tp value)
{
auto tpr = (Tp*)*current;
tpr[0] = value;
*current += sizeof(Tp);
}
\ No newline at end of file
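For reference, the on-disk layout produced by cl_cache::serializer above (a restatement of the code in this diff, not a separate spec; integers are written in host byte order):

// cache file layout
//   uint16_t version                // CACHE_VERSION
//   int32_t  entry_count
//   entry_count x {
//       int32_t  key_size
//       char     key[key_size]      // kernel/node name used as the lookup key
//       int32_t  global_size[3]
//       int32_t  local_size[3]
//   }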
@@ -38,8 +38,6 @@ __constant sampler_t SAMPLER =
return; \
}
#define UNIT 4
__kernel
#if SET_ATTRIBUTE
__attribute__((work_group_size_hint(16, 16, 1)))
@@ -76,7 +74,7 @@ __kernel
FLOAT4 out3 = out0;
int in_width0 =
mad24(out_width_block_idx, stride_shape.y << 2, -padding_shape.y);
int in_width1 = in_width0 + stride_shape.y;
int in_width2 = in_width0 + stride_shape.y * 2;
int in_width3 = in_width0 + stride_shape.y * 3;
@@ -113,9 +111,6 @@ __kernel
READ_INPUT_IMAGE(1, input_width_base);
READ_INPUT_IMAGE(2, input_width_base);
READ_INPUT_IMAGE(3, input_width_base);
weights0 =
RI_F(weights, SAMPLER, (int2)(weights_x_idx + 0, weights_y_idx));
weights1 =
@@ -125,9 +120,6 @@ __kernel
weights3 =
RI_F(weights, SAMPLER, (int2)(weights_x_idx + 3, weights_y_idx++));
CALCULATE_OUTPUT(0);
CALCULATE_OUTPUT(1);
CALCULATE_OUTPUT(2);
......
@@ -51,6 +51,7 @@ extern "C" {
#include <cstdio>
#include <fstream>
#include <memory>
#include "cache/cache.hpp"
#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
#define ROUND_UP(x, y) (((x) + (y) - (1)) / (y) * (y))
......
@@ -30,4 +30,7 @@ typedef struct ocl_option
{
char* dev_name;
int precision; //!< precision of calculation
char* cache_path;
bool load_cache = false;
bool store_cache = false;
} ocl_opt_t;
@@ -26,12 +26,14 @@
#include "ocl_node.hpp"
#include "ocl_convertor.hpp"
#include "ocl_helper.hpp"
#include "cache/cache.hpp"
#include <../examples/common/common.h>
#include <string>
extern "C" {
#include "operator/op.h"
#include "convolution_param.h"
#include "ocl_define.h"
}
void register_all_ocl_creator();
@@ -59,6 +61,7 @@ bool OCLEngine::init()
engine_context = std::make_shared<cl::Context>(*engine_device, nullptr, nullptr, nullptr, &res);
engine_command_queue = std::make_shared<cl::CommandQueue>(*engine_context, *engine_device, 0, &res);
engine_convertor = std::make_shared<ocl_convertor>(this);
gpu_cache = std::make_shared<cl_cache>();
const std::string device_name = engine_device->getInfo<CL_DEVICE_NAME>();
const std::string vendor_name = engine_device->getInfo<CL_DEVICE_VENDOR>();
@@ -116,48 +119,6 @@ int OCLEngine::OCLEngineRun(struct subgraph* subgraph)
int ir_tensor_idx = subgraph->input_tensor_list[i];
struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_tensor_idx);
upload_input_nc4hw(input_tensor, ir_tensor_idx);
#if 0
// print input
printf("input ------------------ \n");
uint32_t input_w = input_tensor->dims[3] * UP_DIV(input_tensor->dims[1], 4);
uint32_t input_h = input_tensor->dims[2];
std::vector<float> input_debug(input_w * input_h * 4);
uint64_t mem = gpu_mem_map.find(ir_tensor_idx)->second;
get_command_queue().enqueueReadImage(*(cl::Image*)mem, CL_TRUE, {0, 0, 0}, {input_w, input_h, 1}, input_w * sizeof(float) * 4, 0, input_debug.data());
int idx_debug_input = 0;
for (int i = 0; i < 3; ++i)
{
for (int j = 0; j < input_w; ++j)
{
for (int k = 0; k < 4; ++k)
{
printf("%f ", input_debug[idx_debug_input]);
idx_debug_input++;
}
printf(" ");
}
printf("\n");
}
// uint32_t output_w = width * UP_DIV(output_channel, 4);
// uint32_t output_h = height;
// std::vector<float> output_debug(output_w * output_h * 4);
// engine->get_command_queue().enqueueReadImage(*(cl::Image*)handle_output, CL_TRUE, {0, 0, 0}, {output_w, output_h, 0}, output_w * sizeof(float) * 4, 0, output_debug.data());
// int idx_debug_output = 0;
// for (int i = 0; i < output_h; ++i)
// {
// for (int j = 0; j < input_w; ++j)
// {
// for (int k = 0; k < 4; ++k)
// {
// printf("%f ", output_debug[idx_debug_output]);
// idx_debug_output++;
// }
// printf(" ");
// }
// printf("\n");
// }
#endif
}
for (auto& _ocl_node : exe_ocl_node_list)
@@ -254,11 +215,20 @@ void OCLEngine::allocate_gpu_mem(struct tensor* ir_tensor, int tensor_index, cl_
size_t image_width = UP_DIV(C, 4) * W;
size_t image_height = N * H;
cl_channel_type data_type = CL_FLOAT;
auto image = new cl::Image2D(*this->engine_context,
flags,
cl::ImageFormat(CL_RGBA, data_type),
image_width,
image_height,
0,
nullptr,
nullptr);
gpu_mem_map.insert(std::make_pair(tensor_index, (gpu_mem_handle)image));
}
cl::Kernel OCLEngine::build_kernel(const std::string& program_name,
const std::string& kernel_name,
const std::set<std::string>& options)
{
std::string build_option_str;
build_option_str = "-DFLOAT=float -DFLOAT4=float4 -DFLOAT8=float8 -DRI_F=read_imagef -DFLOAT16=float16 -DWI_F=write_imagef -DCONVERT_FLOAT4=convert_float4";
@@ -270,7 +240,7 @@ cl::Kernel OCLEngine::build_kernel(const std::string& program_name, const std::s
auto it_source = opencl_program_map.find(program_name);
if (it_source == opencl_program_map.end())
{
TLOG_ERR("build %s fail cannot find source \n", kernel_name.c_str());
}
cl::Program::Sources sources;
std::string source_binary(it_source->second.begin(), it_source->second.end());
@@ -323,8 +293,6 @@ void OCLEngine::upload_input_nc4hw(tensor* ir_tensor, int ir_tensor_idx)
}
auto input_gpu_mem = gpu_mem_map.find(ir_tensor_idx)->second;
get_converter().nchw_buffer_to_image(ir_tensor, temp_buffer_up_down.second.get(), (cl::Image*)input_gpu_mem, false);
//
//TLOG_INFO("upload_input_nc4hw : %lld \n", input_gpu_mem);
}
void OCLEngine::download_output(struct tensor* ir_tensor, int ir_tensor_idx)
@@ -344,23 +312,18 @@ void OCLEngine::download_output(struct tensor* ir_tensor, int ir_tensor_idx)
W = ir_tensor->dims[3];
int image_width = UP_DIV(C, 4) * W;
int image_height = N * H;
get_converter().image_to_buffer(ir_tensor,
(cl::Image*)input_gpu_mem,
temp_buffer_up_down.second.get(),
image_width,
image_height);
engine_command_queue->enqueueReadBuffer(*temp_buffer_up_down.second,
CL_TRUE,
0,
need_size,
ir_tensor->data,
nullptr,
nullptr);
}
const cl::Context& OCLEngine::get_context() const
@@ -382,6 +345,23 @@ uint64_t OCLEngine::get_gpu_mem_by_idx(int idx)
}
return gpu_mem_map.find(idx)->second;
}
std::vector<uint32_t> OCLEngine::get_max_image_size()
{
size_t height, width;
cl_int res = engine_device->getInfo(CL_DEVICE_IMAGE2D_MAX_HEIGHT, &height);
if (res != CL_SUCCESS)
{
TLOG_ERR("getInfo(CL_DEVICE_IMAGE2D_MAX_HEIGHT error height %d\n", res);
}
res = engine_device->getInfo(CL_DEVICE_IMAGE2D_MAX_WIDTH, &width);
if (res != CL_SUCCESS)
{
TLOG_ERR("getInfo(CL_DEVICE_IMAGE2D_MAX_HEIGHT error width %d\n", res);
}
return {(uint32_t)width, (uint32_t)height};
}
std::vector<uint32_t> OCLEngine::get_max_work_item_sizes()
{
int dims = 3;
@@ -493,3 +473,36 @@ OCLEngine::~OCLEngine()
gpu_mem_map.clear();
exe_ocl_node_list.clear();
}
int OCLEngine::get_cache_auto_tune(auto_tune* tune)
{
if (gpu_cache->get_cache_tune(tune->key, tune) == 0)
{
return 0;
}
else
{
return -1;
}
}
int OCLEngine::add_cache_auto_tune(const auto_tune& tune)
{
gpu_cache->set_auto_tune(tune);
return 0;
}
int OCLEngine::load_cache(const std::string& path)
{
TLOG_ERR("load cache from path: %s \n", path.c_str());
gpu_cache->de_serializer(path);
return 0;
}
int OCLEngine::store_cache(const std::string& path)
{
TLOG_ERR("store cache to path: %s \n", path.c_str());
gpu_cache->serializer(path);
return 0;
}
\ No newline at end of file
@@ -43,6 +43,7 @@ public:
public:
uint64_t get_max_work_group_size(const cl::Kernel& kernel);
std::vector<uint32_t> get_max_work_item_sizes();
std::vector<uint32_t> get_max_image_size();
cl::Kernel build_kernel(const std::string& program_name, const std::string& kernel_name, const std::set<std::string>& options);
private:
@@ -68,6 +69,8 @@ private:
std::map<int, uint64_t> gpu_mem_map;
std::pair<int, std::shared_ptr<cl::Buffer> > temp_buffer_up_down;
std::shared_ptr<cl_cache> gpu_cache;
public:
std::vector<std::shared_ptr<ocl_node> > exe_ocl_node_list;
@@ -82,6 +85,11 @@ public:
void open_command_queue_profile();
void close_command_queue_profile();
void alloc_temp_buffer(int len);
int add_cache_auto_tune(const auto_tune& tune);
int get_cache_auto_tune(auto_tune* tune);
int load_cache(const std::string& path);
int store_cache(const std::string& path);
};
class ocl_node_creator
......
@@ -24,10 +24,9 @@
#include "ocl_graph.hpp"
#include "ocl_executor.hpp"
#include "ocl_define.h"
extern "C" {
#include "graph/tensor.h"
#include "graph/node.h"
#include "graph/graph.h" #include "graph/graph.h"
#include "graph/subgraph.h" #include "graph/subgraph.h"
} }
...@@ -35,20 +34,71 @@ extern "C" { ...@@ -35,20 +34,71 @@ extern "C" {
int ocl_dev_init(struct device* dev) int ocl_dev_init(struct device* dev)
{ {
(void)dev; (void)dev;
auto engine = new OCLEngine;
dev->privacy = engine;
return 0;
}
static bool ocl_graph_index_first(struct subgraph* subgraph)
{
struct graph* ir_graph = subgraph->graph;
int subgraph_num = get_vector_num(ir_graph->subgraph_list);
for (int i = 0; i < subgraph_num; i++)
{
struct subgraph* _subgraph = get_ir_graph_subgraph(ir_graph, i);
ir_device_t* device = _subgraph->device;
char* ocl_name = "OCL";
if (0 == strcmp(device->name, ocl_name))
{
return i == subgraph->index;
}
}
return false;
}
static bool ocl_graph_index_last(struct subgraph* subgraph)
{
struct graph* ir_graph = subgraph->graph;
int subgraph_num = get_vector_num(ir_graph->subgraph_list);
int last_ocl_index = -1;
for (int i = 0; i < subgraph_num; i++)
{
struct subgraph* _subgraph = get_ir_graph_subgraph(ir_graph, i);
ir_device_t* device = _subgraph->device;
char* ocl_name = "OCL";
if (0 == strcmp(device->name, ocl_name))
{
last_ocl_index = i;
}
}
return last_ocl_index == subgraph->index;
}
int ocl_dev_prerun(struct device* dev, struct subgraph* subgraph, void* options)
{
auto engine = (OCLEngine*)dev->privacy;
auto opt = (ocl_option*)options;
// guard: options may be NULL when the caller never passed an ocl_option to set_context_device
bool has_cache = (opt != nullptr && opt->cache_path != nullptr);
std::string cache_path = has_cache ? opt->cache_path : "";
if (has_cache && opt->load_cache && ocl_graph_index_first(subgraph))
{
engine->load_cache(cache_path);
}
auto ret = engine->OCLEnginePreRun(subgraph);
if (has_cache && opt->store_cache && ocl_graph_index_last(subgraph))
{
engine->store_cache(cache_path);
}
return ret;
}
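Taken together with the two helpers above: when a graph is partitioned into several OCL subgraphs, the cache file is read once, before the first OCL subgraph is prerun, and written once, after the last OCL subgraph is prerun, so the auto-tune results for all OCL nodes end up in a single file.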
int ocl_dev_run(struct device* dev, struct subgraph* subgraph)
{
auto engine = (OCLEngine*)dev->privacy;
return engine->OCLEngineRun(subgraph);
}
@@ -57,12 +107,14 @@ int ocl_dev_postrun(struct device* dev, struct subgraph* subgraph)
auto engine = (OCLEngine*)subgraph->device_graph;
engine->OCLEnginePostRun();
delete engine;
return 0;
}
int ocl_dev_release(struct device* dev)
{
auto engine = (OCLEngine*)dev->privacy;
engine->OCLEnginePostRun();
delete engine;
return 0;
}
@@ -37,6 +37,7 @@ const int ocl_supported_ops[] = {
OP_FC,
OP_FLATTEN,
OP_INPUT,
OP_INTERP,
OP_POOL,
OP_PRELU,
OP_RELU,
......
@@ -86,6 +86,15 @@ const std::vector<uint32_t> find_local_group_2d(std::vector<uint32_t> global_wor
auto max_work_item_size = engine->get_max_work_item_sizes();
uint32_t min_cost = UINT32_MAX;
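// check the auto-tune cache first: a hit reuses the stored local work size and skips the
// exhaustive search below; a miss is tuned and saved at the end of this function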
auto_tune tune;
tune.key = kernel_name;
if (engine->get_cache_auto_tune(&tune) == 0)
{
lws_prefer[0] = tune.local_size[0];
lws_prefer[1] = tune.local_size[1];
return lws_prefer;
}
if (false)
{
while (lws[1] <= global_work_size[1] || lws[1] <= 6)
@@ -151,10 +160,6 @@ const std::vector<uint32_t> find_local_group_2d(std::vector<uint32_t> global_wor
{
TLOG_ERR("lws tune res %s\n", kernel_name.c_str());
}
// else
// {
// TLOG_ERR("%s lws tune res:cost:%d %d,%d\n", kernel_name.c_str(), cost_time, lws[0], lws[1]);
// }
if (cost_time < min_cost)
{
@@ -193,6 +198,16 @@ const std::vector<uint32_t> find_local_group_2d(std::vector<uint32_t> global_wor
min_cost = cost_time;
}
auto_tune tune_save;
tune_save.key = kernel_name;
tune_save.global_size[0] = global_work_size[0];
tune_save.global_size[1] = global_work_size[1];
tune_save.global_size[2] = 1;
tune_save.local_size[0] = lws_prefer[0];
tune_save.local_size[1] = lws_prefer[1];
tune_save.local_size[2] = 1;
engine->add_cache_auto_tune(tune_save);
return lws_prefer;
}
@@ -203,6 +218,16 @@ const std::vector<uint32_t> find_local_group_3d(std::vector<uint32_t> global_wor
uint32_t min_cost = UINT32_MAX;
auto max_work_item_size = engine->get_max_work_item_sizes();
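// same cache-first pattern as the 2D tuner: a hit short-circuits the local-work-size search below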
auto_tune tune;
tune.key = kernel_name;
if (engine->get_cache_auto_tune(&tune) == 0)
{
lws_prefer[0] = tune.local_size[0];
lws_prefer[1] = tune.local_size[1];
lws_prefer[2] = tune.local_size[2];
return lws_prefer;
}
while (lws[2] <= global_work_size[2] || lws[2] <= 6)
{
lws[1] = 1;
@@ -265,8 +290,6 @@ const std::vector<uint32_t> find_local_group_3d(std::vector<uint32_t> global_wor
TLOG_ERR("3D lws null res %s\n", kernel_name.c_str());
}
// TLOG_ERR("final %s lws tune cost:%d %d,%d,%d\n", kernel_name.c_str(), min_cost, lws_prefer[0], lws_prefer[1], lws_prefer[2]);
int cost_time = (int)engine->get_cost_time(&event);
if (cost_time < min_cost)
{
@@ -276,15 +299,23 @@ const std::vector<uint32_t> find_local_group_3d(std::vector<uint32_t> global_wor
min_cost = cost_time;
}
auto_tune tune_save;
tune_save.key = kernel_name;
tune_save.global_size[0] = global_work_size[0];
tune_save.global_size[1] = global_work_size[1];
tune_save.global_size[2] = global_work_size[2];
tune_save.local_size[0] = lws_prefer[0];
tune_save.local_size[1] = lws_prefer[1];
tune_save.local_size[2] = lws_prefer[2];
engine->add_cache_auto_tune(tune_save);
return lws_prefer;
}
void print_data_file(struct tensor* tensor, std::string name, float* tensor_data)
{
mkdir("/Users/hebingshi/stnn/tenginetest/Tengine/cmake-build-debuggcc/examples/cl_output", S_IRWXU | S_IRGRP | S_IWGRP | S_IROTH);
std::string filename = std::string("/Users/hebingshi/stnn/tenginetest/Tengine/cmake-build-debuggcc/examples/cl_output") + "/" + name + ".txt";
FILE* file = fopen(filename.c_str(), "w");
if (NULL == file)
{
......
@@ -11,6 +11,7 @@ extern void ocl_OP_RELU6_creator();
extern void ocl_OP_FLATTEN_creator();
extern void ocl_OP_FC_creator();
extern void ocl_OP_ELTWISE_creator();
extern void ocl_OP_INTERP_creator();
//
//
void register_all_ocl_creator(void)
@@ -26,4 +27,5 @@ void register_all_ocl_creator(void)
ocl_OP_RELU1_creator();
ocl_OP_RELU6_creator();
ocl_OP_ELTWISE_creator();
ocl_OP_INTERP_creator();
}
@@ -139,6 +139,10 @@ void ocl_concat::run(struct subgraph* subgraph)
{
run_type_concat_2();
}
#ifdef OPENCL_DEBUG_DATA
debug_data();
#endif
}
void ocl_concat::pre_run_type_concat_0()
......
@@ -136,6 +136,9 @@ void ocl_eltwise::run(struct subgraph* subgraph)
#else
run_node_2d(global_work_size, local_work_size, elt_kernel);
#endif
#ifdef OPENCL_DEBUG_DATA
debug_data();
#endif
}
class ocl_elewise_creator : public ocl_node_creator
......
@@ -40,7 +40,8 @@ void ocl_relu::run(struct subgraph* subgraph)
#else
run_node_3d(global_work_size, local_work_size, leaky_relu_kernel);
#endif
#ifdef OPENCL_DEBUG_DATA
debug_data();
#endif
}
ocl_relu::ocl_relu(OCLEngine* engine, struct node* ir_node)
......
@@ -75,7 +75,7 @@ void ocl_upsample::pre_run()
ocl_upsample_kernel.setArg(idx++, out_height);
ocl_upsample_kernel.setArg(idx++, out_height);
local_work_size = find_local_group_2d(global_work_size, max_work_group_size, engine, ocl_upsample_kernel, ir_node->name);
}
void ocl_upsample::run(struct subgraph* subgraph)
@@ -101,3 +101,4 @@ public:
};
REGISTER_OCL_OP(OP_UPSAMPLE, ocl_upsample_creator);
REGISTER_OCL_OP(OP_INTERP, ocl_upsample_creator)
@@ -48,10 +48,6 @@ ocl_winograd::ocl_winograd(OCLEngine* engine, struct node* ir_node)
int weight_wino_size = ALIGN_UP4(weight_tensor->dims[0]) * ALIGN_UP4(weight_tensor->dims[1]) * 16;
auto weight_wino = new float[weight_wino_size];
weight_transform(weight_tensor, weight_wino);
// for (int i = 0; i < weight_wino_size; ++i)
// {
// printf("%.4f,", weight_wino[i]);
// }
cl::Buffer weight_buffer(engine->get_context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, weight_wino_size * sizeof(float));
cl_int error;
@@ -98,8 +94,6 @@ void ocl_winograd::pre_run()
strides = {conv_2d_param->stride_h, conv_2d_param->stride_w};
dilations = {conv_2d_param->dilation_h, conv_2d_param->dilation_w};
paddings = {conv_2d_param->pad_h0, conv_2d_param->pad_w0};
int kernel_width = conv_2d_param->kernel_w;
int kernel_height = conv_2d_param->kernel_h;
struct graph* ir_graph = ir_node->graph;
@@ -119,9 +113,7 @@ void ocl_winograd::pre_run()
int input_height = input_tensor->dims[2];
int input_width = input_tensor->dims[3];
int input_channel = input_tensor->dims[1];
int input_channel_block = UP_DIV(input_channel, 4);
int output_channel = output_tensor->dims[1];
int output_height = output_tensor->dims[2];
auto w_unit = UP_DIV(width, 2);
auto h_unit = UP_DIV(height, 2);
@@ -130,6 +122,7 @@ void ocl_winograd::pre_run()
cl::ImageFormat(CL_RGBA, CL_FLOAT), UP_DIV(input_channel, 4) * 4, 16 * UP_DIV(w_unit * h_unit, 4));
gpu_dest = std::make_shared<cl::Image2D>(engine->get_context(), CL_MEM_READ_WRITE,
cl::ImageFormat(CL_RGBA, CL_FLOAT), 16 * UP_DIV(w_unit * h_unit, 4), 4 * UP_DIV(output_channel, 4));
int ic_block = UP_DIV(input_channel, 4);
int oc_block = UP_DIV(output_channel, 4);
@@ -228,50 +221,6 @@ void ocl_winograd::run(struct subgraph* subgraph)
run_node_2d(global_work_size_dest, local_work_size_dest, dest_transform);
#endif
}
// int ir_tensor_idx_input = ir_node->input_tensors[0];
// int ir_tensor_idx_output = ir_node->output_tensors[0];
// struct tensor* input_tensor = get_ir_graph_tensor(subgraph->graph, ir_tensor_idx_input);
// struct tensor* output_tensor = get_ir_graph_tensor(subgraph->graph, ir_tensor_idx_output);
// uint32_t w = global_work_size_source[1] * 4;
// uint32_t h = UP_DIV(global_work_size_source[0], 4) * 16;
// std::vector<float> debug_source(w * h * 4);
// engine->get_command_queue().enqueueReadImage(*gpu_source, CL_TRUE, {0, 0, 0}, {w, h, 1}, w * sizeof(float) * 4, 0, debug_source.data());
// int index = 0;
// printf("wino input -> source \n");
// for (int i = 0; i < h; ++i)
// {
// for (int j = 0; j < w; ++j)
// {
// for (int k = 0; k < 4; ++k)
// {
// printf("%.4f,", debug_source[index++]);
// }
// printf(" ");
// }
// printf("\n");
// }
//
//
// w = global_work_size_source[1] * 4;
// h = global_work_size_dest[1] * 16;
// debug_source.resize(w * h * 4);
// engine->get_command_queue().enqueueReadImage(*gpu_weight, CL_TRUE, {0, 0, 0}, {w, h, 1}, w * sizeof(float) * 4, 0, debug_source.data());
// index = 0;
// printf("wino weight \n");
// for (int i = 0; i < h; ++i)
// {
// for (int j = 0; j < w; ++j)
// {
// for (int k = 0; k < 4; ++k)
// {
// printf("%.4f,", debug_source[index++]);
// }
// printf(" ");
// }
// printf("\n");
// }
//debug_data();
}
void ocl_winograd::weight_transform(struct tensor* weight_tensor, float* weight_dst)
@@ -304,17 +253,6 @@ void ocl_winograd::weight_transform(struct tensor* weight_tensor, float* weight_
{
weight_dst[start_pos + k * stride_0] = kernel_trans[k];
}
int index = 0;
// for (int k = 0; k < 4; ++k)
// {
// for (int l = 0; l < 4; ++l)
// {
// printf("%.4f,", kernel_trans[index++]);
// }
// printf("\n");
// }
// printf("\n");
}
}
......