Unverified commit 7e20bfc6, authored by shitouren1994, committed by GitHub

Add opencl cache (#1333)

* opencl add generate_cl_binary

* remove chinese cl

* add opencl cache, fix winograd limit to support yolov3

* apply code-format changes

* add opencl cache, fix winograd limit to support yolov3

* apply code-format changes
Co-authored-by: shitouren1994 <shitouren1994@users.noreply.github.com>
Parent 009319ed
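For quick reference, here is a minimal usage sketch of the cache option added by this PR. The helper name create_ocl_graph_with_cache is illustrative and error handling is trimmed; the calls simply mirror the tm_yolov3_opencl example changed below.

// Minimal sketch: enable the OpenCL auto-tune cache for a Tengine graph.
// First run: store_cache = true writes ./test.cache during prerun.
// Later runs: load_cache = true reuses the file, so prerun is much faster.
#include "tengine/c_api.h"
#include "../source/device/opencl/ocl_define.h"

static graph_t create_ocl_graph_with_cache(const char* model_file, bool first_run)
{
    context_t opencl_context = create_context("ocl", 1);

    struct ocl_option option;
    option.cache_path = (char*)"./test.cache";
    option.store_cache = first_run;  // generate the cache on the first run
    option.load_cache = !first_run;  // reuse it on every later run

    if (set_context_device(opencl_context, "OCL", (void*)&option, sizeof(option)) < 0)
        return NULL;

    // prerun_graph/run_graph follow as usual; the cache is written or read during prerun
    return create_graph(opencl_context, "tengine", model_file);
}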
@@ -134,6 +134,7 @@ IF (OpenCV_FOUND)
TENGINE_EXAMPLE_CV (tm_alphapose tm_alphapose.cpp)
TENGINE_EXAMPLE_CV (tm_yolov3 tm_yolov3.cpp)
TENGINE_EXAMPLE_CV (tm_yolov3_uint8 tm_yolov3_uint8.cpp)
TENGINE_EXAMPLE_CV (tm_yolov3_opencl tm_yolov3_opencl.cpp)
TENGINE_EXAMPLE_CV (tm_yolov3_tiny tm_yolov3_tiny.cpp)
TENGINE_EXAMPLE_CV (tm_yolov3_tiny_opendla tm_yolov3_tiny_opendla.cpp)
TENGINE_EXAMPLE_CV (tm_yolov3_tiny_int8 tm_yolov3_tiny_int8.cpp)
......
@@ -35,6 +35,7 @@
#include "common.h"
#include "tengine/c_api.h"
#include "tengine_operations.h"
#include "../source/device/opencl/ocl_define.h"
struct Object
{
@@ -374,7 +375,17 @@ int main(int argc, char* argv[])
// context_t for opencl
context_t opencl_context = create_context("ocl", 1);
struct ocl_option option;
// first run: set store_cache to generate the opencl auto-tune cache file
option.cache_path = "./test.cache";
option.store_cache = true;
// later runs: load the cache from the file instead, which makes pre_run much faster
// option.cache_path = "./test.cache";
// option.load_cache = true;
int rtt = set_context_device(opencl_context, "OCL", (void*)&option, sizeof(option));
if (0 > rtt)
{
fprintf(stderr, " add_context_device opencl failed.\n");
@@ -436,6 +447,9 @@ int main(int argc, char* argv[])
}
double end = get_current_time();
double cur = end - start;
fprintf(stderr, "Repeat %d times, thread %d, cur time %.2f ms\n", repeat_count, num_thread,
cur);
total_time += cur;
min_time = std::min(min_time, cur);
max_time = std::max(max_time, cur);
......
@@ -9,7 +9,19 @@ UNSET (_DEV_OCL_LINKER_OPTIONS)
UNSET (_DEV_OCL_LINK_LIBRARIES)
# add link options
OPTION(TENGINE_OPENCL_PROFILE_TIME "enable opencl profile time" OFF)
IF (TENGINE_OPENCL_PROFILE_TIME)
LIST(APPEND _DEV_OCL_COMPILER_DEFINES OPENCL_PROFILE_TIME)
ENDIF ()
OPTION(TENGINE_OPENCL_MODEL_CACHE "enable opencl cache" ON)
OPTION(TENGINE_OPENCL_DEBUG_DATA "enable node tensor debug data" OFF)
IF (TENGINE_OPENCL_DEBUG_DATA)
LIST(APPEND _DEV_OCL_COMPILER_DEFINES OPENCL_DEBUG_DATA)
ENDIF()
# set source root path
SET(_OCL_ROOT ${CMAKE_SOURCE_DIR}/source/device/opencl)
@@ -17,39 +29,34 @@ SET(_OCL_ROOT ${CMAKE_SOURCE_DIR}/source/device/opencl)
LIST (APPEND _DEV_OCL_HEADER_PATH ${_OCL_ROOT})
LIST (APPEND _DEV_OCL_HEADER_PATH ${_OCL_ROOT}/include)
# add source files
AUX_SOURCE_DIRECTORY("${_OCL_ROOT}" _OCL_BASE_SOURCE)
AUX_SOURCE_DIRECTORY("${_OCL_ROOT}/oppack4" _OCL_OPS_SOURCE_PACK4)
LIST(APPEND _DEV_OCL_DEVICE_SOURCE "${_OCL_ROOT}/cl4/ocl_program_hex.cc")
LIST(APPEND _DEV_OCL_DEVICE_SOURCE ${_OCL_BASE_SOURCE})
LIST(APPEND _DEV_OCL_DEVICE_SOURCE ${_OCL_OPS_SOURCE})
LIST(APPEND _DEV_OCL_DEVICE_SOURCE ${_OCL_OPS_SOURCE_PACK4})
IF (TENGINE_OPENCL_MODEL_CACHE)
AUX_SOURCE_DIRECTORY("${_OCL_ROOT}/cache" _OCL_CACHE)
LIST(APPEND _DEV_OCL_DEVICE_SOURCE ${_OCL_CACHE})
ENDIF ()
# add build options for cpu device
# is a gcc or clang like compiler
IF (TENGINE_COMPILER_GCC OR TENGINE_COMPILER_CLANG)
IF (TENGINE_COMPILER_GCC AND (${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER_EQUAL "6.1"))
LIST(APPEND _DEV_OCL_COMPILER_OPTIONS -Wno-ignored-attributes)
ENDIF ()
ENDIF()
# 5.2 is Microsoft Visual C++
IF (TENGINE_COMPILER_MSVC)
ENDIF()
# 6. add link options
OPTION(TENGINE_OPENCL_PROFILE_TIME "enable opencl profile time" OFF)
IF (TENGINE_OPENCL_PROFILE_TIME)
LIST (APPEND _DEV_OCL_COMPILER_DEFINES OPENCL_PROFILE_TIME)
ENDIF ()
# 7. add link libs
FIND_PACKAGE(OpenCL)
IF (NOT OpenCL_FOUND)
message(WARNING "please set the OpenCL library path manually")
ENDIF ()
LIST(APPEND _DEV_OCL_LINK_LIBRARIES ${OpenCL_LIBRARY})
# 8. set all to cmake cache
......
#pragma once
#include "cache.hpp"
void cl_cache::de_serializer(const std::string& cache_path)
{
struct stat stat;
int fd = open(cache_path.c_str(), O_RDONLY);
if (fd < 0)
{
TLOG_ERR("cannot open file %s\n", cache_path.c_str());
return;
}
fstat(fd, &stat);
int file_len = stat.st_size;
void* mem_base = (void*)sys_malloc(file_len);
int ret = read(fd, mem_base, file_len);
char* read_current = (char*)mem_base;
uint16_t version = read<uint16_t>(&read_current);
TLOG_ERR("current cache version is: %d \n", version);
int auto_tune_size = read<int>(&read_current);
if (auto_tune_size > 0)
{
std::vector<char> temp_key;
auto_tune_vector.resize(auto_tune_size);
for (int i = 0; i < auto_tune_size; ++i)
{
auto_tune temp_auto_tune{};
int key_size = read<int>(&read_current);
temp_key.resize(key_size);
memcpy(temp_key.data(), read_current, key_size);
std::string key(temp_key.begin(), temp_key.end());
read_current += key_size;
temp_auto_tune.key = key;
temp_auto_tune.global_size[0] = read<int>(&read_current);
temp_auto_tune.global_size[1] = read<int>(&read_current);
temp_auto_tune.global_size[2] = read<int>(&read_current);
temp_auto_tune.local_size[0] = read<int>(&read_current);
temp_auto_tune.local_size[1] = read<int>(&read_current);
temp_auto_tune.local_size[2] = read<int>(&read_current);
auto_tune_vector[i] = temp_auto_tune;
TLOG_ERR("decode cache: %s %d,%d,%d %d,%d,%d \n",
key.c_str(),
temp_auto_tune.global_size[0],
temp_auto_tune.global_size[1],
temp_auto_tune.global_size[2],
temp_auto_tune.local_size[0],
temp_auto_tune.local_size[1],
temp_auto_tune.local_size[2]);
}
}
sys_free(mem_base);
close(fd);
}
void cl_cache::serializer(const std::string& cache_path)
{
int fd = open(cache_path.c_str(), O_RDWR | O_CREAT | O_TRUNC, 0666);
if (fd == -1)
{
TLOG_ERR("Could not open %s\n", cache_path.c_str());
return;
}
auto base = (char*)sys_malloc(get_auto_tune_size());
auto out_ptr = base;
write<uint16_t>(&out_ptr, CACHE_VERSION);
write<int>(&out_ptr, auto_tune_vector.size());
for (int i = 0; i < auto_tune_vector.size(); ++i)
{
write<int>(&out_ptr, auto_tune_vector[i].key.size());
memcpy(out_ptr, auto_tune_vector[i].key.c_str(), auto_tune_vector[i].key.size());
out_ptr += auto_tune_vector[i].key.size();
write<int>(&out_ptr, auto_tune_vector[i].global_size[0]);
write<int>(&out_ptr, auto_tune_vector[i].global_size[1]);
write<int>(&out_ptr, auto_tune_vector[i].global_size[2]);
write<int>(&out_ptr, auto_tune_vector[i].local_size[0]);
write<int>(&out_ptr, auto_tune_vector[i].local_size[1]);
write<int>(&out_ptr, auto_tune_vector[i].local_size[2]);
}
write(fd, base, get_auto_tune_size());
sys_free(base);
close(fd);
}
int cl_cache::get_auto_tune_size()
{
int size = 2 + 4; // uint16 version + int32 entry count, see serializer()
for (int i = 0; i < auto_tune_vector.size(); ++i)
{
size += 4 + 4 * 3 + 4 * 3;
size += auto_tune_vector[i].key.size();
}
return size;
}
void cl_cache::test()
{
int size = 10;
for (int i = 0; i < size; ++i)
{
struct auto_tune temp
{
};
temp.key = "ohoho" + std::to_string(i);
temp.global_size[0] = i;
temp.local_size[0] = i;
auto_tune_vector.push_back(temp);
}
serializer("./cl.cache");
de_serializer("./cl.cache");
}
int cl_cache::get_cache_tune(const std::string& key, auto_tune* tune)
{
auto res = std::find_if(auto_tune_vector.begin(), auto_tune_vector.end(), [key](const auto_tune& left) {
return left.key == key;
});
if (res != auto_tune_vector.end())
{
auto temp_auto_tune = *res;
TLOG_ERR("find cache: %s %d,%d,%d %d,%d,%d \n",
key.c_str(),
temp_auto_tune.global_size[0],
temp_auto_tune.global_size[1],
temp_auto_tune.global_size[2],
temp_auto_tune.local_size[0],
temp_auto_tune.local_size[1],
temp_auto_tune.local_size[2]);
*tune = *res;
return 0;
}
return -1;
}
void cl_cache::set_auto_tune(const auto_tune& tune)
{
TLOG_ERR("add cache: %s %d,%d,%d %d,%d,%d \n",
tune.key.c_str(),
tune.global_size[0],
tune.global_size[1],
tune.global_size[2],
tune.local_size[0],
tune.local_size[1],
tune.local_size[2]);
auto_tune_vector.push_back(tune);
TLOG_ERR("cache size: %d \n", auto_tune_vector.size());
}
#pragma once
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <sys/stat.h>
#include <sys/file.h>
#include <unistd.h>
#include "ocl_cpp_helper.hpp"
#include "utility/sys_port.h"
struct auto_tune
{
std::string key;
std::vector<int> global_size = {0, 0, 0};
std::vector<int> local_size = {0, 0, 0};
};
#define CACHE_VERSION 1
class cl_cache
{
public:
cl_cache()
{
auto_tune_vector.clear();
};
~cl_cache() = default;
void test();
void de_serializer(const std::string& path);
void serializer(const std::string& path);
int get_cache_tune(const std::string& key, auto_tune* tune);
void set_auto_tune(const auto_tune& tune);
private:
std::vector<auto_tune> auto_tune_vector;
int get_auto_tune_size();
};
template<typename Tp>
static inline Tp read(char** current)
{
auto tpr = (Tp*)*current;
*current += sizeof(Tp);
return *tpr;
}
template<typename Tp>
static inline void write(char** current, Tp value)
{
auto tpr = (Tp*)*current;
tpr[0] = value;
*current += sizeof(Tp);
}
\ No newline at end of file
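For reference, the on-disk layout produced by cl_cache::serializer above (a restatement of the code in this diff, not a separate spec; integers are written in host byte order):

// cache file layout
//   uint16_t version                // CACHE_VERSION
//   int32_t  entry_count
//   entry_count x {
//       int32_t  key_size
//       char     key[key_size]      // kernel/node name used as the lookup key
//       int32_t  global_size[3]
//       int32_t  local_size[3]
//   }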
@@ -38,8 +38,6 @@ __constant sampler_t SAMPLER =
return; \
}
#define UNIT 4
__kernel
#if SET_ATTRIBUTE
__attribute__((work_group_size_hint(16, 16, 1)))
@@ -76,7 +74,7 @@ __kernel
FLOAT4 out3 = out0;
int in_width0 =
mad24(out_width_block_idx, stride_shape.y << 2, -padding_shape.y);
int in_width1 = in_width0 + stride_shape.y;
int in_width2 = in_width0 + stride_shape.y * 2;
int in_width3 = in_width0 + stride_shape.y * 3;
@@ -113,9 +111,6 @@ __kernel
READ_INPUT_IMAGE(1, input_width_base);
READ_INPUT_IMAGE(2, input_width_base);
READ_INPUT_IMAGE(3, input_width_base);
weights0 =
RI_F(weights, SAMPLER, (int2)(weights_x_idx + 0, weights_y_idx));
weights1 =
@@ -125,9 +120,6 @@ __kernel
weights3 =
RI_F(weights, SAMPLER, (int2)(weights_x_idx + 3, weights_y_idx++));
CALCULATE_OUTPUT(0);
CALCULATE_OUTPUT(1);
CALCULATE_OUTPUT(2);
......
@@ -51,6 +51,7 @@ extern "C" {
#include <cstdio>
#include <fstream>
#include <memory>
#include "cache/cache.hpp"
#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
#define ROUND_UP(x, y) (((x) + (y) - (1)) / (y) * (y))
......
@@ -30,4 +30,7 @@ typedef struct ocl_option
{
char* dev_name;
int precision; //!< precision of calculation
char* cache_path;
bool load_cache = false;
bool store_cache = false;
} ocl_opt_t;
@@ -26,12 +26,14 @@
#include "ocl_node.hpp"
#include "ocl_convertor.hpp"
#include "ocl_helper.hpp"
#include "cache/cache.hpp"
#include <../examples/common/common.h>
#include <string>
extern "C" {
#include "operator/op.h"
#include "convolution_param.h"
#include "ocl_define.h"
}
void register_all_ocl_creator();
@@ -59,6 +61,7 @@ bool OCLEngine::init()
engine_context = std::make_shared<cl::Context>(*engine_device, nullptr, nullptr, nullptr, &res);
engine_command_queue = std::make_shared<cl::CommandQueue>(*engine_context, *engine_device, 0, &res);
engine_convertor = std::make_shared<ocl_convertor>(this);
gpu_cache = std::make_shared<cl_cache>();
const std::string device_name = engine_device->getInfo<CL_DEVICE_NAME>();
const std::string vendor_name = engine_device->getInfo<CL_DEVICE_VENDOR>();
@@ -116,48 +119,6 @@ int OCLEngine::OCLEngineRun(struct subgraph* subgraph)
int ir_tensor_idx = subgraph->input_tensor_list[i];
struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_tensor_idx);
upload_input_nc4hw(input_tensor, ir_tensor_idx);
#if 0
// print input
printf("input ------------------ \n");
uint32_t input_w = input_tensor->dims[3] * UP_DIV(input_tensor->dims[1], 4);
uint32_t input_h = input_tensor->dims[2];
std::vector<float> input_debug(input_w * input_h * 4);
uint64_t mem = gpu_mem_map.find(ir_tensor_idx)->second;
get_command_queue().enqueueReadImage(*(cl::Image*)mem, CL_TRUE, {0, 0, 0}, {input_w, input_h, 1}, input_w * sizeof(float) * 4, 0, input_debug.data());
int idx_debug_input = 0;
for (int i = 0; i < 3; ++i)
{
for (int j = 0; j < input_w; ++j)
{
for (int k = 0; k < 4; ++k)
{
printf("%f ", input_debug[idx_debug_input]);
idx_debug_input++;
}
printf(" ");
}
printf("\n");
}
// uint32_t output_w = width * UP_DIV(output_channel, 4);
// uint32_t output_h = height;
// std::vector<float> output_debug(output_w * output_h * 4);
// engine->get_command_queue().enqueueReadImage(*(cl::Image*)handle_output, CL_TRUE, {0, 0, 0}, {output_w, output_h, 0}, output_w * sizeof(float) * 4, 0, output_debug.data());
// int idx_debug_output = 0;
// for (int i = 0; i < output_h; ++i)
// {
// for (int j = 0; j < input_w; ++j)
// {
// for (int k = 0; k < 4; ++k)
// {
// printf("%f ", output_debug[idx_debug_output]);
// idx_debug_output++;
// }
// printf(" ");
// }
// printf("\n");
// }
#endif
}
for (auto& _ocl_node : exe_ocl_node_list)
@@ -254,11 +215,20 @@ void OCLEngine::allocate_gpu_mem(struct tensor* ir_tensor, int tensor_index, cl_
size_t image_width = UP_DIV(C, 4) * W;
size_t image_height = N * H;
cl_channel_type data_type = CL_FLOAT;
auto image = new cl::Image2D(*this->engine_context,
flags,
cl::ImageFormat(CL_RGBA, data_type),
image_width,
image_height,
0,
nullptr,
nullptr);
gpu_mem_map.insert(std::make_pair(tensor_index, (gpu_mem_handle)image));
}
cl::Kernel OCLEngine::build_kernel(const std::string& program_name,
const std::string& kernel_name,
const std::set<std::string>& options)
{
std::string build_option_str;
build_option_str = "-DFLOAT=float -DFLOAT4=float4 -DFLOAT8=float8 -DRI_F=read_imagef -DFLOAT16=float16 -DWI_F=write_imagef -DCONVERT_FLOAT4=convert_float4";
@@ -270,7 +240,7 @@ cl::Kernel OCLEngine::build_kernel(const std::string& program_name, const std::s
auto it_source = opencl_program_map.find(program_name);
if (it_source == opencl_program_map.end())
{
TLOG_ERR("build %s fail cannot find source \n", kernel_name.c_str());
}
cl::Program::Sources sources;
std::string source_binary(it_source->second.begin(), it_source->second.end());
@@ -323,8 +293,6 @@ void OCLEngine::upload_input_nc4hw(tensor* ir_tensor, int ir_tensor_idx)
}
auto input_gpu_mem = gpu_mem_map.find(ir_tensor_idx)->second;
get_converter().nchw_buffer_to_image(ir_tensor, temp_buffer_up_down.second.get(), (cl::Image*)input_gpu_mem, false);
//
//TLOG_INFO("upload_input_nc4hw : %lld \n", input_gpu_mem);
}
void OCLEngine::download_output(struct tensor* ir_tensor, int ir_tensor_idx)
@@ -344,23 +312,18 @@ void OCLEngine::download_output(struct tensor* ir_tensor, int ir_tensor_idx)
W = ir_tensor->dims[3];
int image_width = UP_DIV(C, 4) * W;
int image_height = N * H;
get_converter().image_to_buffer(ir_tensor,
(cl::Image*)input_gpu_mem,
temp_buffer_up_down.second.get(),
image_width,
image_height);
engine_command_queue->enqueueReadBuffer(*temp_buffer_up_down.second,
CL_TRUE,
0,
need_size,
ir_tensor->data,
nullptr,
nullptr);
}
const cl::Context& OCLEngine::get_context() const
@@ -382,6 +345,23 @@ uint64_t OCLEngine::get_gpu_mem_by_idx(int idx)
}
return gpu_mem_map.find(idx)->second;
}
std::vector<uint32_t> OCLEngine::get_max_image_size()
{
size_t height, width;
cl_int res = engine_device->getInfo(CL_DEVICE_IMAGE2D_MAX_HEIGHT, &height);
if (res != CL_SUCCESS)
{
TLOG_ERR("getInfo(CL_DEVICE_IMAGE2D_MAX_HEIGHT error height %d\n", res);
}
res = engine_device->getInfo(CL_DEVICE_IMAGE2D_MAX_WIDTH, &width);
if (res != CL_SUCCESS)
{
TLOG_ERR("getInfo(CL_DEVICE_IMAGE2D_MAX_HEIGHT error width %d\n", res);
}
return {(uint32_t)width, (uint32_t)height};
}
std::vector<uint32_t> OCLEngine::get_max_work_item_sizes()
{
int dims = 3;
@@ -493,3 +473,36 @@ OCLEngine::~OCLEngine()
gpu_mem_map.clear();
exe_ocl_node_list.clear();
}
int OCLEngine::get_cache_auto_tune(auto_tune* tune)
{
if (gpu_cache->get_cache_tune(tune->key, tune) == 0)
{
return 0;
}
else
{
return -1;
}
}
int OCLEngine::add_cache_auto_tune(const auto_tune& tune)
{
gpu_cache->set_auto_tune(tune);
return 0;
}
int OCLEngine::load_cache(const std::string& path)
{
TLOG_ERR("load cache from path: %s \n", path.c_str());
gpu_cache->de_serializer(path);
return 0;
}
int OCLEngine::store_cache(const std::string& path)
{
TLOG_ERR("store cache to path: %s \n", path.c_str());
gpu_cache->serializer(path);
return 0;
}
\ No newline at end of file
@@ -43,6 +43,7 @@ public:
public:
uint64_t get_max_work_group_size(const cl::Kernel& kernel);
std::vector<uint32_t> get_max_work_item_sizes();
std::vector<uint32_t> get_max_image_size();
cl::Kernel build_kernel(const std::string& program_name, const std::string& kernel_name, const std::set<std::string>& options);
private:
@@ -68,6 +69,8 @@ private:
std::map<int, uint64_t> gpu_mem_map;
std::pair<int, std::shared_ptr<cl::Buffer> > temp_buffer_up_down;
std::shared_ptr<cl_cache> gpu_cache;
public:
std::vector<std::shared_ptr<ocl_node> > exe_ocl_node_list;
@@ -82,6 +85,11 @@ public:
void open_command_queue_profile();
void close_command_queue_profile();
void alloc_temp_buffer(int len);
int add_cache_auto_tune(const auto_tune& tune);
int get_cache_auto_tune(auto_tune* tune);
int load_cache(const std::string& path);
int store_cache(const std::string& path);
};
class ocl_node_creator
......
@@ -24,10 +24,9 @@
#include "ocl_graph.hpp"
#include "ocl_executor.hpp"
#include "ocl_define.h"
extern "C" {
#include "graph/tensor.h"
#include "graph/node.h"
#include "graph/graph.h" #include "graph/graph.h"
#include "graph/subgraph.h" #include "graph/subgraph.h"
} }
...@@ -35,20 +34,71 @@ extern "C" { ...@@ -35,20 +34,71 @@ extern "C" {
int ocl_dev_init(struct device* dev) int ocl_dev_init(struct device* dev)
{ {
(void)dev; (void)dev;
auto engine = new OCLEngine;
dev->privacy = engine;
return 0;
}
static bool ocl_graph_index_first(struct subgraph* subgraph)
{
struct graph* ir_graph = subgraph->graph;
int subgraph_num = get_vector_num(ir_graph->subgraph_list);
for (int i = 0; i < subgraph_num; i++)
{
struct subgraph* _subgraph = get_ir_graph_subgraph(ir_graph, i);
ir_device_t* device = _subgraph->device;
char* ocl_name = "OCL";
if (0 == strcmp(device->name, ocl_name))
{
return i == subgraph->index;
}
}
return false;
}
static bool ocl_graph_index_last(struct subgraph* subgraph)
{
struct graph* ir_graph = subgraph->graph;
int subgraph_num = get_vector_num(ir_graph->subgraph_list);
int last_ocl_index = -1;
for (int i = 0; i < subgraph_num; i++)
{
struct subgraph* _subgraph = get_ir_graph_subgraph(ir_graph, i);
ir_device_t* device = _subgraph->device;
char* ocl_name = "OCL";
if (0 == strcmp(device->name, ocl_name))
{
last_ocl_index = i;
}
}
return last_ocl_index == subgraph->index;
}
int ocl_dev_prerun(struct device* dev, struct subgraph* subgraph, void* options)
{
auto engine = (OCLEngine*)dev->privacy;
auto opt = (ocl_option*)options;
// guard: options may be NULL when the caller never passed an ocl_option to set_context_device
bool has_cache = (opt != nullptr && opt->cache_path != nullptr);
std::string cache_path = has_cache ? opt->cache_path : "";
if (has_cache && opt->load_cache && ocl_graph_index_first(subgraph))
{
engine->load_cache(cache_path);
}
auto ret = engine->OCLEnginePreRun(subgraph);
if (has_cache && opt->store_cache && ocl_graph_index_last(subgraph))
{
engine->store_cache(cache_path);
}
return ret;
}
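Taken together with the two helpers above: when a graph is partitioned into several OCL subgraphs, the cache file is read once, before the first OCL subgraph is prerun, and written once, after the last OCL subgraph is prerun, so the auto-tune results for all OCL nodes end up in a single file.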
int ocl_dev_run(struct device* dev, struct subgraph* subgraph)
{
auto engine = (OCLEngine*)dev->privacy;
return engine->OCLEngineRun(subgraph);
}
@@ -57,12 +107,14 @@ int ocl_dev_postrun(struct device* dev, struct subgraph* subgraph)
auto engine = (OCLEngine*)subgraph->device_graph;
engine->OCLEnginePostRun();
delete engine;
return 0;
}
int ocl_dev_release(struct device* dev)
{
auto engine = (OCLEngine*)dev->privacy;
engine->OCLEnginePostRun();
delete engine;
return 0;
}
@@ -37,6 +37,7 @@ const int ocl_supported_ops[] = {
OP_FC,
OP_FLATTEN,
OP_INPUT,
OP_INTERP,
OP_POOL,
OP_PRELU,
OP_RELU,
......
@@ -86,6 +86,15 @@ const std::vector<uint32_t> find_local_group_2d(std::vector<uint32_t> global_wor
auto max_work_item_size = engine->get_max_work_item_sizes();
uint32_t min_cost = UINT32_MAX;
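// check the auto-tune cache first: a hit reuses the stored local work size and skips the
// exhaustive search below; a miss is tuned and saved at the end of this function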
auto_tune tune;
tune.key = kernel_name;
if (engine->get_cache_auto_tune(&tune) == 0)
{
lws_prefer[0] = tune.local_size[0];
lws_prefer[1] = tune.local_size[1];
return lws_prefer;
}
if (false)
{
while (lws[1] <= global_work_size[1] || lws[1] <= 6)
@@ -151,10 +160,6 @@ const std::vector<uint32_t> find_local_group_2d(std::vector<uint32_t> global_wor
{
TLOG_ERR("lws tune res %s\n", kernel_name.c_str());
}
// else
// {
// TLOG_ERR("%s lws tune res:cost:%d %d,%d\n", kernel_name.c_str(), cost_time, lws[0], lws[1]);
// }
if (cost_time < min_cost)
{
@@ -193,6 +198,16 @@ const std::vector<uint32_t> find_local_group_2d(std::vector<uint32_t> global_wor
min_cost = cost_time;
}
auto_tune tune_save;
tune_save.key = kernel_name;
tune_save.global_size[0] = global_work_size[0];
tune_save.global_size[1] = global_work_size[1];
tune_save.global_size[2] = 1;
tune_save.local_size[0] = lws_prefer[0];
tune_save.local_size[1] = lws_prefer[1];
tune_save.local_size[2] = 1;
engine->add_cache_auto_tune(tune_save);
return lws_prefer;
}
@@ -203,6 +218,16 @@ const std::vector<uint32_t> find_local_group_3d(std::vector<uint32_t> global_wor
uint32_t min_cost = UINT32_MAX;
auto max_work_item_size = engine->get_max_work_item_sizes();
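// same cache-first pattern as the 2D tuner: a hit short-circuits the local-work-size search below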
auto_tune tune;
tune.key = kernel_name;
if (engine->get_cache_auto_tune(&tune) == 0)
{
lws_prefer[0] = tune.local_size[0];
lws_prefer[1] = tune.local_size[1];
lws_prefer[2] = tune.local_size[2];
return lws_prefer;
}
while (lws[2] <= global_work_size[2] || lws[2] <= 6)
{
lws[1] = 1;
@@ -265,8 +290,6 @@ const std::vector<uint32_t> find_local_group_3d(std::vector<uint32_t> global_wor
TLOG_ERR("3D lws null res %s\n", kernel_name.c_str());
}
// TLOG_ERR("final %s lws tune cost:%d %d,%d,%d\n", kernel_name.c_str(), min_cost, lws_prefer[0], lws_prefer[1], lws_prefer[2]);
int cost_time = (int)engine->get_cost_time(&event);
if (cost_time < min_cost)
{
@@ -276,15 +299,23 @@ const std::vector<uint32_t> find_local_group_3d(std::vector<uint32_t> global_wor
min_cost = cost_time;
}
auto_tune tune_save;
tune_save.key = kernel_name;
tune_save.global_size[0] = global_work_size[0];
tune_save.global_size[1] = global_work_size[1];
tune_save.global_size[2] = global_work_size[2];
tune_save.local_size[0] = lws_prefer[0];
tune_save.local_size[1] = lws_prefer[1];
tune_save.local_size[2] = lws_prefer[2];
engine->add_cache_auto_tune(tune_save);
return lws_prefer;
}
void print_data_file(struct tensor* tensor, std::string name, float* tensor_data)
{
mkdir("/Users/hebingshi/stnn/tenginetest/Tengine/cmake-build-debuggcc/examples/cl_output", S_IRWXU | S_IRGRP | S_IWGRP | S_IROTH);
std::string filename = std::string("/Users/hebingshi/stnn/tenginetest/Tengine/cmake-build-debuggcc/examples/cl_output") + "/" + name + ".txt";
FILE* file = fopen(filename.c_str(), "w");
if (NULL == file)
{
......
@@ -11,6 +11,7 @@ extern void ocl_OP_RELU6_creator();
extern void ocl_OP_FLATTEN_creator();
extern void ocl_OP_FC_creator();
extern void ocl_OP_ELTWISE_creator();
extern void ocl_OP_INTERP_creator();
//
//
void register_all_ocl_creator(void)
@@ -26,4 +27,5 @@ void register_all_ocl_creator(void)
ocl_OP_RELU1_creator();
ocl_OP_RELU6_creator();
ocl_OP_ELTWISE_creator();
ocl_OP_INTERP_creator();
}
@@ -139,6 +139,10 @@ void ocl_concat::run(struct subgraph* subgraph)
{
run_type_concat_2();
}
#ifdef OPENCL_DEBUG_DATA
debug_data();
#endif
}
void ocl_concat::pre_run_type_concat_0()
......
@@ -136,6 +136,9 @@ void ocl_eltwise::run(struct subgraph* subgraph)
#else
run_node_2d(global_work_size, local_work_size, elt_kernel);
#endif
#ifdef OPENCL_DEBUG_DATA
debug_data();
#endif
}
class ocl_elewise_creator : public ocl_node_creator
......
@@ -40,7 +40,8 @@ void ocl_relu::run(struct subgraph* subgraph)
#else
run_node_3d(global_work_size, local_work_size, leaky_relu_kernel);
#endif
#ifdef OPENCL_DEBUG_DATA
debug_data();
#endif
}
ocl_relu::ocl_relu(OCLEngine* engine, struct node* ir_node)
......
@@ -75,7 +75,7 @@ void ocl_upsample::pre_run()
ocl_upsample_kernel.setArg(idx++, out_height);
ocl_upsample_kernel.setArg(idx++, out_height);
local_work_size = find_local_group_2d(global_work_size, max_work_group_size, engine, ocl_upsample_kernel, ir_node->name);
}
void ocl_upsample::run(struct subgraph* subgraph)
@@ -101,3 +101,4 @@ public:
};
REGISTER_OCL_OP(OP_UPSAMPLE, ocl_upsample_creator);
REGISTER_OCL_OP(OP_INTERP, ocl_upsample_creator)
@@ -48,10 +48,6 @@ ocl_winograd::ocl_winograd(OCLEngine* engine, struct node* ir_node)
int weight_wino_size = ALIGN_UP4(weight_tensor->dims[0]) * ALIGN_UP4(weight_tensor->dims[1]) * 16;
auto weight_wino = new float[weight_wino_size];
weight_transform(weight_tensor, weight_wino);
// for (int i = 0; i < weight_wino_size; ++i)
// {
// printf("%.4f,", weight_wino[i]);
// }
cl::Buffer weight_buffer(engine->get_context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, weight_wino_size * sizeof(float));
cl_int error;
@@ -98,8 +94,6 @@ void ocl_winograd::pre_run()
strides = {conv_2d_param->stride_h, conv_2d_param->stride_w};
dilations = {conv_2d_param->dilation_h, conv_2d_param->dilation_w};
paddings = {conv_2d_param->pad_h0, conv_2d_param->pad_w0};
int kernel_width = conv_2d_param->kernel_w;
int kernel_height = conv_2d_param->kernel_h;
struct graph* ir_graph = ir_node->graph;
@@ -119,9 +113,7 @@ void ocl_winograd::pre_run()
int input_height = input_tensor->dims[2];
int input_width = input_tensor->dims[3];
int input_channel = input_tensor->dims[1];
int input_channel_block = UP_DIV(input_channel, 4);
int output_channel = output_tensor->dims[1];
int output_height = output_tensor->dims[2];
auto w_unit = UP_DIV(width, 2);
auto h_unit = UP_DIV(height, 2);
@@ -130,6 +122,7 @@ void ocl_winograd::pre_run()
cl::ImageFormat(CL_RGBA, CL_FLOAT), UP_DIV(input_channel, 4) * 4, 16 * UP_DIV(w_unit * h_unit, 4));
gpu_dest = std::make_shared<cl::Image2D>(engine->get_context(), CL_MEM_READ_WRITE,
cl::ImageFormat(CL_RGBA, CL_FLOAT), 16 * UP_DIV(w_unit * h_unit, 4), 4 * UP_DIV(output_channel, 4));
int ic_block = UP_DIV(input_channel, 4);
int oc_block = UP_DIV(output_channel, 4);
@@ -228,50 +221,6 @@ void ocl_winograd::run(struct subgraph* subgraph)
run_node_2d(global_work_size_dest, local_work_size_dest, dest_transform);
#endif
}
// int ir_tensor_idx_input = ir_node->input_tensors[0];
// int ir_tensor_idx_output = ir_node->output_tensors[0];
// struct tensor* input_tensor = get_ir_graph_tensor(subgraph->graph, ir_tensor_idx_input);
// struct tensor* output_tensor = get_ir_graph_tensor(subgraph->graph, ir_tensor_idx_output);
// uint32_t w = global_work_size_source[1] * 4;
// uint32_t h = UP_DIV(global_work_size_source[0], 4) * 16;
// std::vector<float> debug_source(w * h * 4);
// engine->get_command_queue().enqueueReadImage(*gpu_source, CL_TRUE, {0, 0, 0}, {w, h, 1}, w * sizeof(float) * 4, 0, debug_source.data());
// int index = 0;
// printf("wino input -> source \n");
// for (int i = 0; i < h; ++i)
// {
// for (int j = 0; j < w; ++j)
// {
// for (int k = 0; k < 4; ++k)
// {
// printf("%.4f,", debug_source[index++]);
// }
// printf(" ");
// }
// printf("\n");
// }
//
//
// w = global_work_size_source[1] * 4;
// h = global_work_size_dest[1] * 16;
// debug_source.resize(w * h * 4);
// engine->get_command_queue().enqueueReadImage(*gpu_weight, CL_TRUE, {0, 0, 0}, {w, h, 1}, w * sizeof(float) * 4, 0, debug_source.data());
// index = 0;
// printf("wino weight \n");
// for (int i = 0; i < h; ++i)
// {
// for (int j = 0; j < w; ++j)
// {
// for (int k = 0; k < 4; ++k)
// {
// printf("%.4f,", debug_source[index++]);
// }
// printf(" ");
// }
// printf("\n");
// }
//debug_data();
}
void ocl_winograd::weight_transform(struct tensor* weight_tensor, float* weight_dst)
@@ -304,17 +253,6 @@ void ocl_winograd::weight_transform(struct tensor* weight_tensor, float* weight_
{
weight_dst[start_pos + k * stride_0] = kernel_trans[k];
}
int index = 0;
// for (int k = 0; k < 4; ++k)
// {
// for (int l = 0; l < 4; ++l)
// {
// printf("%.4f,", kernel_trans[index++]);
// }
// printf("\n");
// }
// printf("\n");
}
}
......