提交 afc02d57 编写于 作者: N nihuini

runtime detect armv8.2 dotprod

上级 5f62fdec
...@@ -131,7 +131,9 @@ if((IOS AND CMAKE_OSX_ARCHITECTURES MATCHES "arm") ...@@ -131,7 +131,9 @@ if((IOS AND CMAKE_OSX_ARCHITECTURES MATCHES "arm")
if(NCNN_COMPILER_SUPPORT_ARM82_FP16) if(NCNN_COMPILER_SUPPORT_ARM82_FP16)
option(NCNN_ARM82 "optimize aarch64 platform with armv8.2" ON) option(NCNN_ARM82 "optimize aarch64 platform with armv8.2" ON)
if(NOT NCNN_COMPILER_SUPPORT_ARM82_FP16_DOTPROD) if(NCNN_COMPILER_SUPPORT_ARM82_FP16_DOTPROD)
option(NCNN_ARM82DOT "optimize aarch64 platform with armv8.2 dotprod." ON)
else()
message(WARNING "The compiler does not support armv8.2 dotprod. Upgrading your toolchain is strongly recommended.") message(WARNING "The compiler does not support armv8.2 dotprod. Upgrading your toolchain is strongly recommended.")
endif() endif()
else() else()
......
...@@ -182,11 +182,7 @@ macro(ncnn_add_layer class) ...@@ -182,11 +182,7 @@ macro(ncnn_add_layer class)
) )
set_source_files_properties(${NCNN_ARM82_SOURCE} PROPERTIES GENERATED TRUE) set_source_files_properties(${NCNN_ARM82_SOURCE} PROPERTIES GENERATED TRUE)
if(NCNN_COMPILER_SUPPORT_ARM82_FP16_DOTPROD) set_source_files_properties(${NCNN_ARM82_SOURCE} PROPERTIES COMPILE_FLAGS "-march=armv8.2-a+fp16")
set_source_files_properties(${NCNN_ARM82_SOURCE} PROPERTIES COMPILE_FLAGS "-march=armv8.2-a+fp16+dotprod")
elseif(NCNN_COMPILER_SUPPORT_ARM82_FP16)
set_source_files_properties(${NCNN_ARM82_SOURCE} PROPERTIES COMPILE_FLAGS "-march=armv8.2-a+fp16")
endif()
list(APPEND ncnn_SRCS ${NCNN_ARM82_HEADER} ${NCNN_ARM82_SOURCE}) list(APPEND ncnn_SRCS ${NCNN_ARM82_HEADER} ${NCNN_ARM82_SOURCE})
...@@ -226,6 +222,74 @@ macro(ncnn_add_layer class) ...@@ -226,6 +222,74 @@ macro(ncnn_add_layer class)
endif() endif()
endif() endif()
# Runtime-dispatch variant for armv8.2 fp16+dotprod.
# Only active when runtime CPU dispatch (NCNN_RUNTIME_CPU) and NCNN_ARM82DOT are
# both enabled and the target is a 64-bit arm platform.
if(NCNN_RUNTIME_CPU AND NCNN_ARM82DOT AND ((IOS AND CMAKE_OSX_ARCHITECTURES MATCHES "arm64") OR (APPLE AND CMAKE_OSX_ARCHITECTURES MATCHES "arm64") OR (CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm64|aarch64)")))
# enable armv8.2a+fp16+dot
# Hand-written arch-specific layer files in the source tree; the arm82dot
# variant is derived from them.
set(NCNN_ARM_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}.h)
set(NCNN_ARM_SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}.cpp)
if(WITH_LAYER_${name} AND EXISTS ${NCNN_ARM_HEADER} AND EXISTS ${NCNN_ARM_SOURCE})
# Derived copies live in the binary dir with an _arm82dot suffix.
set(NCNN_ARM82DOT_HEADER ${CMAKE_CURRENT_BINARY_DIR}/layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}_arm82dot.h)
set(NCNN_ARM82DOT_SOURCE ${CMAKE_CURRENT_BINARY_DIR}/layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}_arm82dot.cpp)
# Build-time rule: run the generator script on the header; rerun whenever the
# original header changes.
add_custom_command(
OUTPUT ${NCNN_ARM82DOT_HEADER}
COMMAND ${CMAKE_COMMAND} -DSRC=${NCNN_ARM_HEADER} -DDST=${NCNN_ARM82DOT_HEADER} -DCLASS=${class} -P "${CMAKE_CURRENT_SOURCE_DIR}/../cmake/ncnn_generate_arm82dot_source.cmake"
DEPENDS ${NCNN_ARM_HEADER}
COMMENT "Generating source ${name}_${NCNN_TARGET_ARCH}_arm82dot.h"
VERBATIM
)
set_source_files_properties(${NCNN_ARM82DOT_HEADER} PROPERTIES GENERATED TRUE)
# Same rule for the implementation file.
add_custom_command(
OUTPUT ${NCNN_ARM82DOT_SOURCE}
COMMAND ${CMAKE_COMMAND} -DSRC=${NCNN_ARM_SOURCE} -DDST=${NCNN_ARM82DOT_SOURCE} -DCLASS=${class} -P "${CMAKE_CURRENT_SOURCE_DIR}/../cmake/ncnn_generate_arm82dot_source.cmake"
DEPENDS ${NCNN_ARM_SOURCE}
COMMENT "Generating source ${name}_${NCNN_TARGET_ARCH}_arm82dot.cpp"
VERBATIM
)
set_source_files_properties(${NCNN_ARM82DOT_SOURCE} PROPERTIES GENERATED TRUE)
# The dotprod -march flag is attached to the generated source only, not to the
# whole target.
set_source_files_properties(${NCNN_ARM82DOT_SOURCE} PROPERTIES COMPILE_FLAGS "-march=armv8.2-a+fp16+dotprod")
list(APPEND ncnn_SRCS ${NCNN_ARM82DOT_HEADER} ${NCNN_ARM82DOT_SOURCE})
# generate layer_declaration and layer_registry_arm82dot file
# The emitted class ${class}_final_arm82dot virtually inherits from the generic
# layer, the arm82dot layer and (optionally) the vulkan layer.
# create_pipeline calls are appended (generic -> arm82dot -> vulkan) while
# destroy_pipeline calls are prepended, so teardown runs in the reverse order.
set(layer_declaration "${layer_declaration}#include \"layer/${name}.h\"\n")
set(layer_declaration_class "class ${class}_final_arm82dot : virtual public ${class}")
set(create_pipeline_content " { int ret = ${class}::create_pipeline(opt); if (ret) return ret; }\n")
set(destroy_pipeline_content " { int ret = ${class}::destroy_pipeline(opt); if (ret) return ret; }\n")
set(layer_declaration "${layer_declaration}#include \"layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}_arm82dot.h\"\n")
set(layer_declaration_class "${layer_declaration_class}, virtual public ${class}_${NCNN_TARGET_ARCH}_arm82dot")
set(create_pipeline_content "${create_pipeline_content} { int ret = ${class}_${NCNN_TARGET_ARCH}_arm82dot::create_pipeline(opt); if (ret) return ret; }\n")
set(destroy_pipeline_content " { int ret = ${class}_${NCNN_TARGET_ARCH}_arm82dot::destroy_pipeline(opt); if (ret) return ret; }\n${destroy_pipeline_content}")
if(WITH_LAYER_${name}_vulkan)
# The vulkan pipeline calls are emitted behind an `if (vkdev)` check in the
# generated C++.
set(layer_declaration "${layer_declaration}#include \"layer/vulkan/${name}_vulkan.h\"\n")
set(layer_declaration_class "${layer_declaration_class}, virtual public ${class}_vulkan")
set(create_pipeline_content "${create_pipeline_content} if (vkdev) { int ret = ${class}_vulkan::create_pipeline(opt); if (ret) return ret; }\n")
set(destroy_pipeline_content " if (vkdev) { int ret = ${class}_vulkan::destroy_pipeline(opt); if (ret) return ret; }\n${destroy_pipeline_content}")
endif()
# Assemble the class body and its creator, then add the matching registry entry.
set(layer_declaration "${layer_declaration}namespace ncnn {\n${layer_declaration_class}\n{\n")
set(layer_declaration "${layer_declaration}public:\n")
set(layer_declaration "${layer_declaration} virtual int create_pipeline(const Option& opt) {\n${create_pipeline_content} return 0;\n }\n")
set(layer_declaration "${layer_declaration} virtual int destroy_pipeline(const Option& opt) {\n${destroy_pipeline_content} return 0;\n }\n")
set(layer_declaration "${layer_declaration}};\n")
set(layer_declaration "${layer_declaration}DEFINE_LAYER_CREATOR(${class}_final_arm82dot)\n} // namespace ncnn\n\n")
set(layer_registry_arm82dot "${layer_registry_arm82dot}#if NCNN_STRING\n{\"${class}\", ${class}_final_arm82dot_layer_creator},\n#else\n{${class}_final_arm82dot_layer_creator},\n#endif\n")
else()
# no arm optimized version
# An entry is still emitted for every layer: the regular ${class}_final creator
# when the layer is enabled, a null creator (0) when it is disabled.
if(WITH_LAYER_${name})
set(layer_registry_arm82dot "${layer_registry_arm82dot}#if NCNN_STRING\n{\"${class}\", ${class}_final_layer_creator},\n#else\n{${class}_final_layer_creator},\n#endif\n")
else()
set(layer_registry_arm82dot "${layer_registry_arm82dot}#if NCNN_STRING\n{\"${class}\", 0},\n#else\n{0},\n#endif\n")
endif()
endif()
endif()
if(NCNN_RUNTIME_CPU AND NCNN_RVV AND NCNN_TARGET_ARCH STREQUAL "riscv") if(NCNN_RUNTIME_CPU AND NCNN_RVV AND NCNN_TARGET_ARCH STREQUAL "riscv")
# enable rvv+fp16 # enable rvv+fp16
set(NCNN_RISCV_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}.h) set(NCNN_RISCV_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}.h)
......
# Script-mode generator for the armv8.2 dotprod variant of an arm layer file.
#
# Usage:
#   cmake -DSRC=<input file> -DDST=<output file> -DCLASS=<layer class> -P <this script>
#
# Reads SRC, renames the include guard, the <CLASS>_arm identifiers and the
# "<class>_arm.h" include to their *_arm_arm82dot counterparts, and writes the
# result to DST.

# SRC, DST and CLASS are mandatory; fail early with a clear message instead of
# letting file(READ) / string() error out on an empty argument.
foreach(required_var SRC DST CLASS)
    if("${${required_var}}" STREQUAL "")
        message(FATAL_ERROR "ncnn_generate_arm82dot_source: ${required_var} must be defined (pass -D${required_var}=...)")
    endif()
endforeach()

file(READ "${SRC}" source_data)

# replace
string(TOUPPER "${CLASS}" CLASS_UPPER)
string(TOLOWER "${CLASS}" CLASS_LOWER)

# All three substitutions are literal text, so plain REPLACE is used rather than
# REGEX REPLACE (where the '.' in "_arm.h" would match any character).
string(REPLACE "LAYER_${CLASS_UPPER}_ARM_H" "LAYER_${CLASS_UPPER}_ARM_ARM82DOT_H" source_data "${source_data}")
string(REPLACE "${CLASS}_arm" "${CLASS}_arm_arm82dot" source_data "${source_data}")
string(REPLACE "#include \"${CLASS_LOWER}_arm.h\"" "#include \"${CLASS_LOWER}_arm_arm82dot.h\"" source_data "${source_data}")

file(WRITE "${DST}" "${source_data}")
...@@ -154,6 +154,7 @@ configure_file(layer_declaration.h.in ${CMAKE_CURRENT_BINARY_DIR}/layer_declarat ...@@ -154,6 +154,7 @@ configure_file(layer_declaration.h.in ${CMAKE_CURRENT_BINARY_DIR}/layer_declarat
configure_file(layer_registry.h.in ${CMAKE_CURRENT_BINARY_DIR}/layer_registry.h) configure_file(layer_registry.h.in ${CMAKE_CURRENT_BINARY_DIR}/layer_registry.h)
configure_file(layer_registry_avx2.h.in ${CMAKE_CURRENT_BINARY_DIR}/layer_registry_avx2.h) configure_file(layer_registry_avx2.h.in ${CMAKE_CURRENT_BINARY_DIR}/layer_registry_avx2.h)
configure_file(layer_registry_arm82.h.in ${CMAKE_CURRENT_BINARY_DIR}/layer_registry_arm82.h) configure_file(layer_registry_arm82.h.in ${CMAKE_CURRENT_BINARY_DIR}/layer_registry_arm82.h)
configure_file(layer_registry_arm82dot.h.in ${CMAKE_CURRENT_BINARY_DIR}/layer_registry_arm82dot.h)
configure_file(layer_registry_rvv.h.in ${CMAKE_CURRENT_BINARY_DIR}/layer_registry_rvv.h) configure_file(layer_registry_rvv.h.in ${CMAKE_CURRENT_BINARY_DIR}/layer_registry_rvv.h)
configure_file(layer_type_enum.h.in ${CMAKE_CURRENT_BINARY_DIR}/layer_type_enum.h) configure_file(layer_type_enum.h.in ${CMAKE_CURRENT_BINARY_DIR}/layer_type_enum.h)
configure_file(layer_shader_registry.h.in ${CMAKE_CURRENT_BINARY_DIR}/layer_shader_registry.h) configure_file(layer_shader_registry.h.in ${CMAKE_CURRENT_BINARY_DIR}/layer_shader_registry.h)
...@@ -346,9 +347,9 @@ endif() ...@@ -346,9 +347,9 @@ endif()
if(((IOS AND CMAKE_OSX_ARCHITECTURES MATCHES "arm64") OR (APPLE AND CMAKE_OSX_ARCHITECTURES MATCHES "arm64") OR (CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm64|aarch64)"))) if(((IOS AND CMAKE_OSX_ARCHITECTURES MATCHES "arm64") OR (APPLE AND CMAKE_OSX_ARCHITECTURES MATCHES "arm64") OR (CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm64|aarch64)")))
if(NOT NCNN_RUNTIME_CPU AND NCNN_ARM82) if(NOT NCNN_RUNTIME_CPU AND NCNN_ARM82)
if(NCNN_COMPILER_SUPPORT_ARM82_FP16_DOTPROD) if(NCNN_ARM82DOT)
target_compile_options(ncnn PRIVATE -march=armv8.2-a+fp16+dotprod) target_compile_options(ncnn PRIVATE -march=armv8.2-a+fp16+dotprod)
elseif(NCNN_COMPILER_SUPPORT_ARM82_FP16) else()
target_compile_options(ncnn PRIVATE -march=armv8.2-a+fp16) target_compile_options(ncnn PRIVATE -march=armv8.2-a+fp16)
endif() endif()
endif() endif()
......
...@@ -133,6 +133,7 @@ static unsigned int g_hwcaps = get_elf_hwcap_from_proc_self_auxv(); ...@@ -133,6 +133,7 @@ static unsigned int g_hwcaps = get_elf_hwcap_from_proc_self_auxv();
// from arch/arm64/include/uapi/asm/hwcap.h // from arch/arm64/include/uapi/asm/hwcap.h
#define HWCAP_ASIMD (1 << 1) #define HWCAP_ASIMD (1 << 1)
#define HWCAP_ASIMDHP (1 << 10) #define HWCAP_ASIMDHP (1 << 10)
#define HWCAP_ASIMDDP (1 << 20)
#else #else
// from arch/arm/include/uapi/asm/hwcap.h // from arch/arm/include/uapi/asm/hwcap.h
#define HWCAP_NEON (1 << 12) #define HWCAP_NEON (1 << 12)
...@@ -336,6 +337,25 @@ int cpu_support_arm_asimdhp() ...@@ -336,6 +337,25 @@ int cpu_support_arm_asimdhp()
#endif #endif
} }
// Report whether the running CPU supports the aarch64 ASIMD dot product
// (armv8.2 dotprod) extension.
//
// Returns nonzero when supported and 0 otherwise. On Android/Linux aarch64 the
// value is the raw HWCAP_ASIMDDP bit taken from g_hwcaps; on Apple aarch64 it
// is derived from the detected cpu family. Every other platform/arch returns 0.
int cpu_support_arm_asimddp()
{
#if defined __ANDROID__ || defined __linux__
#if __aarch64__
    return g_hwcaps & HWCAP_ASIMDDP;
#else
    return 0;
#endif
#elif __APPLE__
#if __aarch64__
    // Only Lightning/Thunder (A13) and Firestorm/Icestorm (A14/M1) are listed.
    // Monsoon/Mistral (A11) and Vortex/Tempest (A12) provide fp16 arithmetic but
    // not the dot product instructions, so reporting them here would route
    // execution into sdot/udot code and fault at runtime.
    return g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM;
#else
    return 0;
#endif
#else
    return 0;
#endif
}
int cpu_support_x86_avx2() int cpu_support_x86_avx2()
{ {
#if (_M_AMD64 || __x86_64__) || (_M_IX86 || __i386__) #if (_M_AMD64 || __x86_64__) || (_M_IX86 || __i386__)
......
...@@ -51,6 +51,8 @@ NCNN_EXPORT int cpu_support_arm_neon(); ...@@ -51,6 +51,8 @@ NCNN_EXPORT int cpu_support_arm_neon();
NCNN_EXPORT int cpu_support_arm_vfpv4(); NCNN_EXPORT int cpu_support_arm_vfpv4();
// asimdhp = aarch64 asimd half precision // asimdhp = aarch64 asimd half precision
NCNN_EXPORT int cpu_support_arm_asimdhp(); NCNN_EXPORT int cpu_support_arm_asimdhp();
// asimddp = aarch64 asimd dot product
// returns nonzero when the running cpu is detected as supporting it, 0 otherwise
NCNN_EXPORT int cpu_support_arm_asimddp();
// avx2 = x86_64 avx2 + fma + f16c // avx2 = x86_64 avx2 + fma + f16c
NCNN_EXPORT int cpu_support_x86_avx2(); NCNN_EXPORT int cpu_support_x86_avx2();
......
...@@ -214,6 +214,12 @@ static const layer_registry_entry layer_registry_arm82[] = { ...@@ -214,6 +214,12 @@ static const layer_registry_entry layer_registry_arm82[] = {
}; };
#endif // NCNN_RUNTIME_CPU && NCNN_ARM82 #endif // NCNN_RUNTIME_CPU && NCNN_ARM82
// Layer creator table for the armv8.2 fp16+dotprod variants.
// The entries come from the cmake-generated layer_registry_arm82dot.h and the
// table is only compiled in when both runtime cpu dispatch and NCNN_ARM82DOT
// are enabled.
#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT
static const layer_registry_entry layer_registry_arm82dot[] = {
#include "layer_registry_arm82dot.h"
};
#endif // NCNN_RUNTIME_CPU && NCNN_ARM82DOT
#if NCNN_RUNTIME_CPU && NCNN_RVV #if NCNN_RUNTIME_CPU && NCNN_RVV
static const layer_registry_entry layer_registry_rvv[] = { static const layer_registry_entry layer_registry_rvv[] = {
#include "layer_registry_rvv.h" #include "layer_registry_rvv.h"
...@@ -259,6 +265,13 @@ Layer* create_layer(int index) ...@@ -259,6 +265,13 @@ Layer* create_layer(int index)
} }
else else
#endif // NCNN_RUNTIME_CPU && NCNN_AVX2 #endif // NCNN_RUNTIME_CPU && NCNN_AVX2
#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT
if (ncnn::cpu_support_arm_asimdhp() && ncnn::cpu_support_arm_asimddp())
{
layer_creator = layer_registry_arm82dot[index].creator;
}
else
#endif // NCNN_RUNTIME_CPU && NCNN_ARM82DOT
#if NCNN_RUNTIME_CPU && NCNN_ARM82 #if NCNN_RUNTIME_CPU && NCNN_ARM82
if (ncnn::cpu_support_arm_asimdhp()) if (ncnn::cpu_support_arm_asimdhp())
{ {
......
// Layer Registry header
//
// This file is auto-generated by cmake, don't edit it.
@layer_registry_arm82dot@
...@@ -30,6 +30,7 @@ ...@@ -30,6 +30,7 @@
#cmakedefine01 NCNN_RUNTIME_CPU #cmakedefine01 NCNN_RUNTIME_CPU
#cmakedefine01 NCNN_AVX2 #cmakedefine01 NCNN_AVX2
#cmakedefine01 NCNN_ARM82 #cmakedefine01 NCNN_ARM82
#cmakedefine01 NCNN_ARM82DOT
#cmakedefine01 NCNN_RVV #cmakedefine01 NCNN_RVV
#cmakedefine01 NCNN_INT8 #cmakedefine01 NCNN_INT8
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册