Commit 8764a6c8 authored by Megvii Engine Team

feat(dnn/cuda): add volta dp4a int8 sass kernel

GitOrigin-RevId: 9fefd39678729ec185c1b09c5b4abd88ebbde3a0
Parent e296a684
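For context, "dp4a" in the commit title refers to the 4-way int8 dot-product-and-accumulate operation exposed in CUDA C++ as the `__dp4a` intrinsic, which the hand-written SASS kernels added by this commit are built around. The snippet below is an illustrative sketch only, not code from this commit: it shows the basic operation, assuming a GPU of compute capability 6.1 or higher (the kernels here target Volta, sm_70).

```cpp
// Illustrative sketch (not part of this commit): __dp4a computes a dot product
// of the four packed signed 8-bit lanes of each operand and adds the result to
// a 32-bit accumulator. Compile with e.g. nvcc -arch=sm_70.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void dp4a_demo(const int* a, const int* b, int* out) {
    // *a and *b each hold four packed int8 values; start from accumulator 0.
    *out = __dp4a(*a, *b, 0);
}

int main() {
    int ha = 0x01020304;  // int8 lanes 4, 3, 2, 1
    int hb = 0x01010101;  // int8 lanes 1, 1, 1, 1
    int *da, *db, *dout, hout = 0;
    cudaMalloc(&da, sizeof(int));
    cudaMalloc(&db, sizeof(int));
    cudaMalloc(&dout, sizeof(int));
    cudaMemcpy(da, &ha, sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(db, &hb, sizeof(int), cudaMemcpyHostToDevice);
    dp4a_demo<<<1, 1>>>(da, db, dout);
    cudaMemcpy(&hout, dout, sizeof(int), cudaMemcpyDeviceToHost);
    std::printf("dp4a result: %d\n", hout);  // 4 + 3 + 2 + 1 = 10
    return 0;
}
```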
@@ -6,7 +6,8 @@
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#include "src/cuda/utils.cuh"
#include "src/cuda/utils.h"
@@ -30,49 +31,48 @@ struct DevicePropRec {
constexpr int MAX_NR_DEVICE = 32;
DevicePropRec device_prop_rec[MAX_NR_DEVICE];

const char* cublasGetErrorString(cublasStatus_t error) {
    switch (error) {
        case CUBLAS_STATUS_SUCCESS:
            return "CUBLAS_STATUS_SUCCESS";
        case CUBLAS_STATUS_NOT_INITIALIZED:
            return "CUBLAS_STATUS_NOT_INITIALIZED";
        case CUBLAS_STATUS_ALLOC_FAILED:
            return "CUBLAS_STATUS_ALLOC_FAILED";
        case CUBLAS_STATUS_INVALID_VALUE:
            return "CUBLAS_STATUS_INVALID_VALUE";
        case CUBLAS_STATUS_ARCH_MISMATCH:
            return "CUBLAS_STATUS_ARCH_MISMATCH";
        case CUBLAS_STATUS_MAPPING_ERROR:
            return "CUBLAS_STATUS_MAPPING_ERROR";
        case CUBLAS_STATUS_EXECUTION_FAILED:
            return "CUBLAS_STATUS_EXECUTION_FAILED";
        case CUBLAS_STATUS_INTERNAL_ERROR:
            return "CUBLAS_STATUS_INTERNAL_ERROR";
        case CUBLAS_STATUS_LICENSE_ERROR:
            return "CUBLAS_STATUS_LICENSE_ERROR";
        case CUBLAS_STATUS_NOT_SUPPORTED:
            return "CUBLAS_STATUS_NOT_SUPPORTED";
    }
    return "Unknown CUBLAS error";
}

} // anonymous namespace

void cuda::__throw_cuda_error__(cudaError_t err, const char* msg) {
    auto s = ssprintf("cuda error %s(%d) occurred; expr: %s",
                      cudaGetErrorString(err), int(err), msg);
    megdnn_throw(s.c_str());
}

void cuda::__throw_cudnn_error__(cudnnStatus_t err, const char* msg) {
    auto s = ssprintf("cudnn error %s(%d) occurred; expr: %s",
                      cudnnGetErrorString(err), int(err), msg);
    megdnn_throw(s.c_str());
}

void cuda::__throw_cublas_error__(cublasStatus_t err, const char* msg) {
    auto s = ssprintf("cublas error %s(%d) occurred; expr: %s",
                      cublasGetErrorString(err), int(err), msg);
    megdnn_throw(s.c_str());
}
@@ -92,17 +92,17 @@ void cuda::__throw_cutlass_error__(cutlass::Status err, const char* msg) {
    megdnn_throw(s.c_str());
}

void cuda::report_error(const char* msg) {
    megdnn_throw(msg);
    MEGDNN_MARK_USED_VAR(msg);
}

uint32_t cuda::safe_size_in_kern(size_t size) {
    if (!size || size > Uint32Fastdiv::MAX_DIVIDEND) {
        megdnn_throw(
                ssprintf("invalid size for element-wise kernel: %zu; "
                         "max supported size is %u",
                         size, Uint32Fastdiv::MAX_DIVIDEND));
    }
    return size;
}

@@ -111,7 +111,7 @@ cudaDeviceProp cuda::current_device_prop() {
    int dev;
    cuda_check(cudaGetDevice(&dev));
    megdnn_assert(dev < MAX_NR_DEVICE, "device number too large: %d", dev);
    auto&& rec = device_prop_rec[dev];
    if (!rec.init) {
        std::lock_guard<std::mutex> lock(rec.mtx);
        if (!rec.init) {
@@ -137,6 +137,19 @@ size_t cuda::max_batch_x_channel_size() {
    return current_device_prop().maxGridSize[2];
}

uint32_t cuda::param_buffer_start_address() {
    auto&& device_prop = current_device_prop();
    int cap = 10 * device_prop.major + device_prop.minor;
    // maxwell and pascal: 0x140
    if (cap >= 50 && cap < 70)
        return 0x140;
    // volta ~ ampere: 0x160
    else if (cap >= 70)
        return 0x160;
    megdnn_throw(
            ssprintf("unsupported cuda compute capability %d", cap).c_str());
}
const char* cuda::current_device_arch_name() {
    auto&& device_prop = current_device_prop();
    int cap = 10 * device_prop.major + device_prop.minor;
@@ -155,4 +168,3 @@ const char* cuda::current_device_arch_name() {
}

// vim: syntax=cpp.doxygen
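The `param_buffer_start_address()` helper added above reports where the driver places kernel parameters in constant bank 0 (cmem[0]), which hand-assembled SASS kernels need in order to read their arguments: 0x140 on Maxwell/Pascal and 0x160 from Volta through Ampere. Below is a minimal usage sketch; the `param_offset` helper and the three-pointer argument layout are illustrative assumptions, not code from this commit.

```cpp
// Hypothetical sketch: compute the cmem[0] byte offset of each kernel argument
// for a hand-written SASS kernel. Only the base offsets come from the commit;
// the rest is assumed for illustration.
#include <cstdint>
#include <cstdio>

// Stand-in for megdnn::cuda::param_buffer_start_address(); assumes a
// Volta-or-newer device, so the parameter buffer starts at 0x160.
static uint32_t param_buffer_start_address() {
    return 0x160;
}

// Byte offset inside cmem[0] of the i-th 8-byte (pointer-sized) argument.
static uint32_t param_offset(uint32_t i) {
    return param_buffer_start_address() + i * 8;
}

int main() {
    // E.g. a kernel taking (src, filter, dst) pointers would read them from
    // c[0x0][0x160], c[0x0][0x168] and c[0x0][0x170] on Volta.
    const char* names[] = {"src", "filter", "dst"};
    for (uint32_t i = 0; i < 3; ++i)
        std::printf("%s at c[0x0][0x%x]\n", names[i], param_offset(i));
    return 0;
}
```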
@@ -6,7 +6,8 @@
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#pragma once
@@ -24,19 +25,19 @@
namespace megdnn {
namespace cuda {

static inline HandleImpl* concrete_handle(Handle* handle) {
    return static_cast<cuda::HandleImpl*>(handle);
}

static inline cudnnHandle_t cudnn_handle(Handle* handle) {
    return concrete_handle(handle)->cudnn_handle();
}

static inline cublasHandle_t cublas_handle(Handle* handle) {
    return concrete_handle(handle)->cublas_handle();
}

static inline cudaStream_t cuda_stream(Handle* handle) {
    return concrete_handle(handle)->stream();
}

@@ -44,9 +45,8 @@ static inline megcore::AsyncErrorInfo* async_error_info(Handle* handle) {
    return concrete_handle(handle)->megcore_context().error_info;
}

static inline void CUDART_CB callback_free(cudaStream_t /* stream */,
                                           cudaError_t status, void* userData) {
    cuda_check(status);
    free(userData);
}
@@ -64,9 +64,12 @@ bool is_compute_capability_equalto(int major, int minor);
//! third dimension
size_t max_batch_x_channel_size();

//! get param buffer start address at cmem[0]
uint32_t param_buffer_start_address();

const char* current_device_arch_name();

} // namespace cuda
} // namespace megdnn

// vim: syntax=cpp.doxygen
@@ -6,7 +6,8 @@
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#include "test/common/conv_bias.h"
#include "megdnn/opr_param_defs.h"
@@ -413,7 +414,7 @@ std::vector<TestArg> get_int8_nchw44_args(size_t kernel_size, size_t pack_size,
    megdnn_assert(kernel_size > 0, "not support kernel_size");
    using NLMode = param::ConvBias::NonlineMode;

    // clang-format off
    for (auto nlmode : {NLMode::IDENTITY, NLMode::RELU}) {
    for (auto mode : {param::ConvBias::Mode::CROSS_CORRELATION}) {
    for (size_t b : {1,2}) {
@@ -795,7 +796,7 @@ void check_conv_bias(DType src_dtype, DType filter_dtype, DType bias_dtype,
        return z;
    };
    megdnn_assert(rng != nullptr && bias_rng != nullptr);
    checker.set_rng(0, rng.get())
            .set_rng(1, rng.get())
            .set_rng(2, rng.get())
            .set_rng(3, rng.get());
@@ -1152,8 +1153,7 @@ void winograd_algo_extra_impl(const TensorNDArray& tensors, uint32_t m,
            handle->create_operator<WinogradFilterPreprocess>();
    winograd_preprocess_opr->param().output_block_size = m;
    winograd_preprocess_opr->param().format = format;
    winograd_preprocess_opr->param().compute_mode = param.compute_mode;
    TensorLayout filter_transform_layout;
    winograd_preprocess_opr->deduce_layout(tensors[1].layout,
                                           filter_transform_layout);
...