...
 
Commits (6)

- 0baf6b0d Merge pull request #175 from tpoisonooo:fix-spell-error
  2021-07-23T18:14:07+08:00, liuke <liuke@megvii.com>
  GitOrigin-RevId: fe649676ac1aa844a469d1cbf19dd7612cc0f29f
  https://gitcode.net/megvii/megengine/-/commit/0baf6b0d633fb89a55c840d574b7f1f93fb0f767
- 869a0327 perf(mgb): disable FoldingConvBiasDimshufflePass in cuda10 for performance
  2021-07-23T18:14:15+08:00, Megvii Engine Team <megengine@megvii.com>
  GitOrigin-RevId: d1b95a6f01ba73f98c0094e00fee3e61e9139628
  https://gitcode.net/megvii/megengine/-/commit/869a03271b463aa1501c36b2d7c42ec3f444e748
- 207a3463 chore(mge): run get_device_count("gpu") in subprocess
  2021-07-23T18:14:20+08:00, Megvii Engine Team <megengine@megvii.com>
  GitOrigin-RevId: 0f0dc001cfc45fc0d04de1a86c27f8bba8185d6b
  https://gitcode.net/megvii/megengine/-/commit/207a346351d1d7daa74435f43b1992f86331d650
- c2daea3c chore(release): bump version
  2021-07-23T18:14:26+08:00, Megvii Engine Team <megengine@megvii.com>
  GitOrigin-RevId: a016ea9d564a32adecaf6a80bc01a8ddb95d88db
  https://gitcode.net/megvii/megengine/-/commit/c2daea3cbadee0f334fdae8c0daa2d191783d19c
- 093f7ae7 feat(mge/third_party): update cutlass version
  2021-07-23T18:19:28+08:00, huangxinda <megengine@megvii.com>
  https://gitcode.net/megvii/megengine/-/commit/093f7ae7746c766e6c4023c2a4d99ae25dd02c07
- 6af4a32e feat(mge/third_party): update MegRay version
  2021-07-23T18:20:07+08:00, huangxinda <megengine@megvii.com>
  https://gitcode.net/megvii/megengine/-/commit/6af4a32e1781e213477ea0f9e866576f36afc123
@@ -1060,6 +1060,46 @@ TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL) {
            param::ConvBias::Format::CHWN4);
}

+TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_NCHW4_NCHW) {
+    CUBenchmarker<ConvBiasForward> benchmarker(handle_cuda());
+    size_t RUNS = 1000;
+    benchmarker.set_display(false).set_times(RUNS);
+    using namespace conv_bias;
+    UniformIntRNG int_rng{-3, 3};
+    UniformIntRNG bias_rng{-50, 50};
+    ConvBias::Param param;
+    param.format = ConvBias::Param::Format::NCHW4_NCHW;
+    param.nonlineMode = ConvBias::Param::NonlineMode::IDENTITY;
+    benchmarker.set_before_exec_callback(
+            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
+                    "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM"));
+    benchmarker.set_dtype(0, dtype::QuantizedS8(1.9980618f))
+            .set_dtype(1, dtype::QuantizedS8(1.9980927f))
+            .set_dtype(2, dtype::Float32())
+            .set_dtype(3, dtype::Float32())
+            .set_dtype(4, dtype::Float32())
+            .set_rng(0, &int_rng)
+            .set_rng(1, &int_rng)
+            .set_param(param);
+    auto run = [&](const TensorShapeArray& shapes) {
+        auto time_in_ms =
+                benchmarker.execs({shapes[0], shapes[1], shapes[2], {}, {}}) /
+                RUNS;
+        printf("src=%s, filter=%s, dst=%s, time=%.2f\n",
+               shapes[0].to_string().c_str(), shapes[1].to_string().c_str(),
+               shapes[2].to_string().c_str(), time_in_ms);
+    };
+    run({{16, 16, 224, 224, 4}, {32, 16, 3, 3, 4}, {1, 32, 1, 1}});
+    run({{16, 16, 92, 160, 4}, {32, 16, 3, 3, 4}, {1, 32, 1, 1}});
+    run({{16, 16, 46, 80, 4}, {32, 16, 3, 3, 4}, {1, 32, 1, 1}});
+}

#if CUDA_VERSION >= 10020
TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW32) {
......
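The added benchmark follows the usual megdnn pattern: `set_times(RUNS)` makes each `execs` call run the op `RUNS` times and, judging by the division above, return the accumulated time, so `time_in_ms` is the per-run average after warm-up. A minimal Python sketch of the same averaged-timing idea; `benchmark` and `workload` are hypothetical stand-ins, not part of this diff:

```python
import time

def benchmark(workload, runs=1000):
    """Average wall-clock time per run, in milliseconds."""
    workload()  # warm-up: keep one-time setup (allocation, autotuning) out of the average
    start = time.perf_counter()
    for _ in range(runs):
        workload()
    total = time.perf_counter() - start
    # mirror the C++ benchmarker: accumulated time divided by run count
    return total * 1e3 / runs

time_in_ms = benchmark(lambda: sum(range(100000)))
print("time=%.2f" % time_in_ms)
```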
@@ -181,11 +181,6 @@ def synchronized(func: Callable):
        return wrapper

-def _get_device_count_worker(queue, device_type):
-    num = get_device_count(device_type)
-    queue.put(num)

def _check_device_initialized(device_type: str, rank: int):
    try:
        test = Tensor(1, device=(device_type + str(rank)))
@@ -198,19 +193,6 @@ def _check_device_initialized(device_type: str, rank: int):
        raise RuntimeError(errmsg)

-def get_device_count_by_fork(device_type: str):
-    """
-    Get device count in fork thread.
-    See https://stackoverflow.com/questions/22950047/cuda-initialization-error-after-fork
-    for more information.
-    """
-    q = mp.Queue()
-    p = mp.Process(target=_get_device_count_worker, args=(q, device_type))
-    p.start()
-    p.join()
-    return q.get()

def bcast_list_(inps: list, group: Group = WORLD):
    """
    Broadcast tensors between given group.
......
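Context for the deletion: once a parent process has initialized CUDA, a fork()ed child cannot re-initialize it (the StackOverflow link in the removed docstring), so device counting used to be delegated to a throwaway subprocess. That workaround now lives in the C++ CompNode layer (see the comp_node.cpp hunk further down), so callers use `megengine.device.get_device_count` directly. For reference, a sketch of the removed pattern; the function names here are adapted from the deleted helper, not a current API:

```python
import multiprocessing as mp

def _worker(queue, device_type):
    # import inside the child so CUDA is only ever initialized there
    from megengine.device import get_device_count
    queue.put(get_device_count(device_type))

def get_device_count_in_subprocess(device_type: str) -> int:
    """Sketch of the removed helper: query in a short-lived child process
    so the parent never initializes CUDA before a later fork()."""
    q = mp.Queue()
    p = mp.Process(target=_worker, args=(q, device_type))
    p.start()
    p.join()
    return q.get()
```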
@@ -13,9 +13,10 @@ import queue
from .. import _exit
from ..core._imperative_rt.core2 import full_sync
+from ..device import get_device_count
from ..logger import get_logger
from .group import _set_machine_ranks, group_barrier, init_process_group
-from .helper import _check_device_initialized, get_device_count_by_fork
+from .helper import _check_device_initialized
from .server import Client, Server

WARN_SUBPROCESS_EXIT_WITHOUT_RETURN = (
@@ -91,9 +92,7 @@ class launcher:
        backend="auto",
    ):
        self.func = func
-        self.n_gpus = (
-            n_gpus if n_gpus is not None else get_device_count_by_fork(device_type)
-        )
+        self.n_gpus = n_gpus if n_gpus is not None else get_device_count(device_type)
        self.world_size = world_size if world_size is not None else self.n_gpus
        self.rank_start = rank_start
        self.master_ip = master_ip
......
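With this change, omitting `n_gpus` makes `launcher` autodetect the device count via `get_device_count(device_type)`, which is now safe to call in the parent because the fork-safe query happens in the C++ layer. A hedged usage sketch (assumes a machine with visible GPUs; decorator form as commonly used with `megengine.distributed`):

```python
import megengine.distributed as dist

@dist.launcher  # n_gpus omitted: falls back to get_device_count(device_type)
def worker():
    # each spawned process sees its own rank within the world
    print("rank", dist.get_rank(), "of", dist.get_world_size())

worker()
```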
@@ -1188,11 +1188,11 @@ def copy(inp, device=None):
        import numpy as np
        import platform
        from megengine import tensor
-        from megengine.distributed.helper import get_device_count_by_fork
+        from megengine.device import get_device_count
        import megengine.functional as F

        x = tensor([1, 2, 3], np.int32)
-        if 1 == get_device_count_by_fork("gpu"):
+        if 1 == get_device_count("gpu"):
            y = F.copy(x, "cpu1")
            print(y.numpy())
        else:
......
@@ -61,14 +61,14 @@ class _FakeQuantize(Module):
    def fake_quant_forward(self, inp, qparams: QParams = None):
        raise NotImplementedError

-    def normal_foward(self, inp, qparams: QParams = None):
+    def normal_forward(self, inp, qparams: QParams = None):
        return inp

    def forward(self, inp, qparams: QParams = None):
        if self.enabled:
            return self.fake_quant_forward(inp, qparams=qparams)
        else:
-            return self.normal_foward(inp, qparams=qparams)
+            return self.normal_forward(inp, qparams=qparams)

class TQT(_FakeQuantize, QParamsModuleMixin):
......
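This spelling fix is behavior-preserving: both the definition and its only call site were misspelled consistently (`normal_foward`), so nothing was broken at runtime. For clarity, a stripped-down sketch of the dispatch the renamed method participates in; the stub class below is illustrative only, not the real module:

```python
class FakeQuantizeStub:
    """Stripped-down mirror of _FakeQuantize's enable/disable dispatch."""

    def __init__(self):
        self.enabled = True

    def fake_quant_forward(self, inp, qparams=None):
        raise NotImplementedError  # subclasses supply the quantized path

    def normal_forward(self, inp, qparams=None):
        return inp  # identity: pass through when fake quantization is disabled

    def forward(self, inp, qparams=None):
        if self.enabled:
            return self.fake_quant_forward(inp, qparams=qparams)
        return self.normal_forward(inp, qparams=qparams)
```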
@@ -15,7 +15,7 @@ import megengine.functional
import megengine.module
from megengine import Parameter
from megengine.core._imperative_rt.core2 import sync
-from megengine.distributed.helper import get_device_count_by_fork
+from megengine.device import get_device_count
from megengine.experimental.autograd import (
    disable_higher_order_directive,
    enable_higher_order_directive,
@@ -25,7 +25,7 @@ from megengine.module import Linear, Module
sys.path.append(os.path.join(os.path.dirname(__file__), "helpers"))

-_ngpu = get_device_count_by_fork("gpu")
+_ngpu = get_device_count("gpu")

@pytest.fixture(autouse=True)
@@ -16,7 +16,6 @@ import megengine.autodiff as ad
import megengine.distributed as dist
import megengine.optimizer as optimizer
from megengine import Parameter, tensor
-from megengine.distributed.helper import get_device_count_by_fork
from megengine.module import Module
from megengine.optimizer import SGD
......
@@ -18,7 +18,6 @@ import megengine.functional as F
import megengine.module as M
import megengine.optimizer as optim
from megengine.autodiff import GradManager
-from megengine.distributed.helper import get_device_count_by_fork
from megengine.jit import trace
......
@@ -20,7 +20,6 @@ from megengine.core._imperative_rt import CompNode, TensorAttr, imperative
from megengine.core._imperative_rt.core2 import TensorWeakRef, apply, sync
from megengine.core.autodiff.grad import Grad
from megengine.core.ops.builtin import Elemwise, Identity
-from megengine.distributed.helper import get_device_count_by_fork
from megengine.functional.distributed import remote_recv, remote_send
......
@@ -31,7 +31,7 @@ from megengine.core.tensor.dtype import (
    quint4,
    quint8,
)
-from megengine.distributed.helper import get_device_count_by_fork
+from megengine.device import get_device_count
from megengine.tensor import Tensor
@@ -184,8 +184,7 @@ def test_dtype_int4_ffi_handle():
@pytest.mark.skipif(
-    get_device_count_by_fork("gpu") != 0,
-    reason="TypeCvt to quint4 is not supported on GPU",
+    get_device_count("gpu") != 0, reason="TypeCvt to quint4 is not supported on GPU",
)
def test_quint4_typecvt():
    device = "xpux"
......
@@ -17,11 +17,7 @@ import megengine as mge
import megengine.distributed as dist
from megengine.core.ops.builtin import CollectiveComm, ParamPackConcat, ParamPackSplit
from megengine.device import get_default_device
-from megengine.distributed.helper import (
-    get_device_count_by_fork,
-    param_pack_concat,
-    param_pack_split,
-)
+from megengine.distributed.helper import param_pack_concat, param_pack_split

def _assert_q_empty(q):
......
@@ -22,8 +22,7 @@ from megengine import Parameter, Tensor, is_cuda_available, tensor
from megengine.core._trace_option import use_symbolic_shape
from megengine.core.autodiff.grad import Grad
from megengine.core.tensor.utils import make_shape_tuple
-from megengine.distributed.helper import get_device_count_by_fork
-from megengine.jit import trace
+from megengine.device import get_device_count

def test_where():
@@ -613,7 +612,7 @@ def test_nms():
@pytest.mark.skipif(
-    get_device_count_by_fork("gpu") > 0, reason="cuda does not support nchw int8"
+    get_device_count("gpu") > 0, reason="cuda does not support nchw int8"
)
def test_conv_bias():
    inp_scale = 1.5
@@ -715,9 +714,7 @@ def test_conv_bias():
    run(10, 36, 8, 46, 26, 2, 2, 2, 1, 1, 2, True, "relu")

-@pytest.mark.skipif(
-    get_device_count_by_fork("gpu") > 0, reason="no int8 algorithm on cuda"
-)
+@pytest.mark.skipif(get_device_count("gpu") > 0, reason="no int8 algorithm on cuda")
def test_batch_conv_bias():
    inp_scale = 1.5
    w_scale = 2.5
......
@@ -16,7 +16,6 @@ import megengine.distributed as dist
from megengine import Parameter, tensor
from megengine.core._imperative_rt.core2 import sync
from megengine.device import get_default_device, set_default_device
-from megengine.distributed.helper import get_device_count_by_fork
from megengine.functional.distributed import (
    all_gather,
    all_reduce_max,
......
@@ -18,7 +18,6 @@ from megengine import tensor
from megengine.core._trace_option import use_symbolic_shape
from megengine.core.tensor import megbrain_graph as G
from megengine.core.tensor.utils import astensor1d
-from megengine.distributed.helper import get_device_count_by_fork
from megengine.jit import trace
from megengine.utils.network import Network, set_symbolic_shape
from megengine.utils.network_node import VarNode
......
@@ -16,7 +16,6 @@ import megengine as mge
import megengine.distributed as dist
from megengine import Tensor
from megengine.core._trace_option import use_symbolic_shape
-from megengine.distributed.helper import get_device_count_by_fork
from megengine.module import BatchNorm1d, BatchNorm2d, SyncBatchNorm

_assert_allclose = functools.partial(np.testing.assert_allclose, atol=5e-6, rtol=5e-6)
......
@@ -6,7 +6,7 @@ import pytest
import megengine.utils.comp_graph_tools as cgtools
from megengine import jit, tensor
-from megengine.distributed.helper import get_device_count_by_fork
+from megengine.device import get_device_count
from megengine.functional import expand_dims
from megengine.module import (
    BatchMatMulActivation,
@@ -101,9 +101,7 @@ def test_qat_conv():
    np.testing.assert_allclose(normal_outputs.numpy(), qat_outputs.numpy())

-@pytest.mark.skipif(
-    get_device_count_by_fork("gpu") > 0, reason="no int8 algorithm on cuda"
-)
+@pytest.mark.skipif(get_device_count("gpu") > 0, reason="no int8 algorithm on cuda")
def test_qat_batchmatmul_activation():
    batch = 4
    in_features = 8
......
@@ -13,7 +13,7 @@ import pytest
import megengine as mge
import megengine.distributed as dist
-from megengine.distributed.helper import get_device_count_by_fork
+from megengine.device import get_device_count
from megengine.quantization import QuantMode, create_qparams
from megengine.quantization.observer import (
    ExponentialMovingAverageObserver,
@@ -78,7 +78,7 @@ def test_passive_observer():
@pytest.mark.require_ngpu(2)
@pytest.mark.isolated_distributed
def test_sync_min_max_observer():
-    word_size = get_device_count_by_fork("gpu")
+    word_size = get_device_count("gpu")
    x = np.random.rand(3 * word_size, 3, 3, 3).astype("float32")
    np_min, np_max = x.min(), x.max()
@@ -96,7 +96,7 @@ def test_sync_min_max_observer():
@pytest.mark.require_ngpu(2)
@pytest.mark.isolated_distributed
def test_sync_exponential_moving_average_observer():
-    word_size = get_device_count_by_fork("gpu")
+    word_size = get_device_count("gpu")
    t = np.random.rand()
    x1 = np.random.rand(3 * word_size, 3, 3, 3).astype("float32")
    x2 = np.random.rand(3 * word_size, 3, 3, 3).astype("float32")
......
@@ -12,7 +12,7 @@ import pytest
import megengine as mge
import megengine.functional as F
from megengine.core.tensor import dtype
-from megengine.distributed.helper import get_device_count_by_fork
+from megengine.device import get_device_count
from megengine.functional.elemwise import _elemwise_multi_type, _elwise
from megengine.quantization import QuantMode, create_qparams
@@ -68,7 +68,7 @@ def test_elemwise(kind):
@pytest.mark.skipif(
-    get_device_count_by_fork("gpu") > 0, reason="cuda does not support nchw int8"
+    get_device_count("gpu") > 0, reason="cuda does not support nchw int8"
)
def test_conv_bias():
    inp_scale = np.float32(np.random.rand() + 1)
......
@@ -26,12 +26,12 @@ from megengine.core.ops.builtin import (
    PoissonRNG,
    UniformRNG,
)
-from megengine.distributed.helper import get_device_count_by_fork
+from megengine.device import get_device_count
from megengine.random import RNG, seed, uniform

@pytest.mark.skipif(
-    get_device_count_by_fork("xpu") <= 2, reason="xpu counts need > 2",
+    get_device_count("xpu") <= 2, reason="xpu counts need > 2",
)
def test_gaussian_op():
    shape = (
@@ -61,7 +61,7 @@ def test_gaussian_op():
@pytest.mark.skipif(
-    get_device_count_by_fork("xpu") <= 2, reason="xpu counts need > 2",
+    get_device_count("xpu") <= 2, reason="xpu counts need > 2",
)
def test_uniform_op():
    shape = (
@@ -89,7 +89,7 @@ def test_uniform_op():
@pytest.mark.skipif(
-    get_device_count_by_fork("xpu") <= 2, reason="xpu counts need > 2",
+    get_device_count("xpu") <= 2, reason="xpu counts need > 2",
)
def test_gamma_op():
    _shape, _scale = 2, 0.8
@@ -117,7 +117,7 @@ def test_gamma_op():
@pytest.mark.skipif(
-    get_device_count_by_fork("xpu") <= 2, reason="xpu counts need > 2",
+    get_device_count("xpu") <= 2, reason="xpu counts need > 2",
)
def test_beta_op():
    _alpha, _beta = 2, 0.8
@@ -148,7 +148,7 @@ def test_beta_op():
@pytest.mark.skipif(
-    get_device_count_by_fork("xpu") <= 2, reason="xpu counts need > 2",
+    get_device_count("xpu") <= 2, reason="xpu counts need > 2",
)
def test_poisson_op():
    lam = F.full([8, 9, 11, 12], value=2, dtype="float32")
@@ -171,7 +171,7 @@ def test_poisson_op():
@pytest.mark.skipif(
-    get_device_count_by_fork("xpu") <= 2, reason="xpu counts need > 2",
+    get_device_count("xpu") <= 2, reason="xpu counts need > 2",
)
def test_permutation_op():
    n = 1000
@@ -205,7 +205,7 @@ def test_permutation_op():
@pytest.mark.skipif(
-    get_device_count_by_fork("xpu") <= 1, reason="xpu counts need > 1",
+    get_device_count("xpu") <= 1, reason="xpu counts need > 1",
)
def test_UniformRNG():
    m1 = RNG(seed=111, device="xpu0")
@@ -233,7 +233,7 @@ def test_UniformRNG():
@pytest.mark.skipif(
-    get_device_count_by_fork("xpu") <= 1, reason="xpu counts need > 1",
+    get_device_count("xpu") <= 1, reason="xpu counts need > 1",
)
def test_NormalRNG():
    m1 = RNG(seed=111, device="xpu0")
@@ -262,7 +262,7 @@ def test_NormalRNG():
@pytest.mark.skipif(
-    get_device_count_by_fork("xpu") <= 1, reason="xpu counts need > 1",
+    get_device_count("xpu") <= 1, reason="xpu counts need > 1",
)
def test_GammaRNG():
    m1 = RNG(seed=111, device="xpu0")
@@ -295,7 +295,7 @@ def test_GammaRNG():
@pytest.mark.skipif(
-    get_device_count_by_fork("xpu") <= 1, reason="xpu counts need > 1",
+    get_device_count("xpu") <= 1, reason="xpu counts need > 1",
)
def test_BetaRNG():
    m1 = RNG(seed=111, device="xpu0")
@@ -330,7 +330,7 @@ def test_BetaRNG():
@pytest.mark.skipif(
-    get_device_count_by_fork("xpu") <= 1, reason="xpu counts need > 1",
+    get_device_count("xpu") <= 1, reason="xpu counts need > 1",
)
def test_PoissonRNG():
    m1 = RNG(seed=111, device="xpu0")
@@ -359,7 +359,7 @@ def test_PoissonRNG():
@pytest.mark.skipif(
-    get_device_count_by_fork("xpu") <= 1, reason="xpu counts need > 1",
+    get_device_count("xpu") <= 1, reason="xpu counts need > 1",
)
def test_PermutationRNG():
    m1 = RNG(seed=111, device="xpu0")
......
@@ -13,8 +13,7 @@ import megengine.random as rand
from megengine.core._imperative_rt.core2 import apply
from megengine.core._wrap import Device
from megengine.core.ops import builtin
-from megengine.device import is_cuda_available
-from megengine.distributed.helper import get_device_count_by_fork
+from megengine.device import get_device_count, is_cuda_available
from megengine.functional.external import tensorrt_runtime_opr
from megengine.jit.tracing import trace
from megengine.tensor import Tensor
@@ -273,7 +272,7 @@ def test_deformable_ps_roi_pooling():
@pytest.mark.skipif(
-    get_device_count_by_fork("gpu") > 0,
+    get_device_count("gpu") > 0,
    reason="does not support int8 when gpu compute capability less than 6.1",
)
def test_convbias():
......
@@ -6,5 +6,5 @@
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-__version__ = "1.5.0.dev"
+__version__ = "1.6.0.dev"
@@ -27,8 +27,14 @@ using namespace mgb;
#include <thread>

+#include <cuda.h>
#include <cuda_runtime.h>

+#ifdef __unix__
+#include <unistd.h>
+#include <sys/wait.h>
+#endif

using CudaCompNodeImpl = CudaCompNode::CompNodeImpl;

namespace {
@@ -700,19 +706,90 @@ void CudaCompNode::EventImpl::do_device_wait_by(Impl* cn_impl) {
/* ===================== CudaCompNode static methods ===================== */

+namespace {
+
+#ifndef __unix__
+CUresult get_device_count_forksafe(int* pcnt) {
+    cuInit(0);
+    return cuDeviceGetCount(pcnt);
+}
+#else
+struct RAIICloseFD : NonCopyableObj {
+    int m_fd = -1;
+
+    RAIICloseFD(int fd) : m_fd(fd) {}
+    ~RAIICloseFD() { close(); }
+    void close() {
+        if (m_fd != -1) {
+            ::close(m_fd);
+            m_fd = -1;
+        }
+    }
+};
+
+// an implementation that does not call cuInit
+CUresult get_device_count_forksafe(int* pcnt) {
+    auto err = cuDeviceGetCount(pcnt);
+    if (err != CUDA_ERROR_NOT_INITIALIZED) return err;
+    // cuInit not called, call it in child process
+    int fd[2];
+    mgb_assert(pipe(fd) == 0, "pipe() failed");
+    int fdr = fd[0], fdw = fd[1];
+    RAIICloseFD fdr_guard(fdr);
+    RAIICloseFD fdw_guard(fdw);
+    auto cpid = fork();
+    mgb_assert(cpid != -1, "fork() failed");
+    if (cpid == 0) {
+        fdr_guard.close();
+        do {
+            err = cuInit(0);
+            if (err != CUDA_SUCCESS) break;
+            err = cuDeviceGetCount(pcnt);
+        } while (0);
+        auto sz = write(fdw, &err, sizeof(err));
+        if (sz == sizeof(err) && err == CUDA_SUCCESS) {
+            sz = write(fdw, pcnt, sizeof(*pcnt));
+        }
+        fdw_guard.close();
+        std::quick_exit(0);
+    }
+    fdw_guard.close();
+    auto sz = read(fdr, &err, sizeof(err));
+    mgb_assert(sz == sizeof(err), "failed to read error code from child");
+    if (err == CUDA_SUCCESS) {
+        sz = read(fdr, pcnt, sizeof(*pcnt));
+        mgb_assert(sz == sizeof(*pcnt), "failed to read device count from child");
+        return err;
+    }
+    // try again, maybe another thread called cuInit while we fork
+    auto err2 = cuDeviceGetCount(pcnt);
+    if (err2 == CUDA_SUCCESS) return err2;
+    if (err2 == CUDA_ERROR_NOT_INITIALIZED) return err;
+    return err2;
+}
+#endif
+
+const char* cu_get_error_string(CUresult err) {
+    const char* ret = nullptr;
+    cuGetErrorString(err, &ret);
+    if (!ret) ret = "unknown cuda error";
+    return ret;
+}
+
+}  // namespace

bool CudaCompNode::available() {
    static int result = -1;
    static Spinlock mtx;
    MGB_LOCK_GUARD(mtx);
    if (result == -1) {
        int ndev = -1;
-        auto err = cudaGetDeviceCount(&ndev);
-        result = err == cudaSuccess && ndev > 0;
+        auto err = get_device_count_forksafe(&ndev);
+        result = err == CUDA_SUCCESS && ndev > 0;
        if (!result) {
            mgb_log_warn("cuda unavailable: %s(%d) ndev=%d",
-                         cudaGetErrorString(err), static_cast<int>(err), ndev);
+                         cu_get_error_string(err), static_cast<int>(err), ndev);
        }
-        if (err == cudaErrorInitializationError) {
+        if (err == CUDA_ERROR_NOT_INITIALIZED) {
            mgb_throw(std::runtime_error, "cuda initialization error.");
        }
    }
@@ -857,11 +934,11 @@ size_t CudaCompNode::get_device_count(bool warn) {
    static Spinlock mtx;
    MGB_LOCK_GUARD(mtx);
    if (cnt == -1) {
-        auto err = cudaGetDeviceCount(&cnt);
-        if (err != cudaSuccess) {
+        auto err = get_device_count_forksafe(&cnt);
+        if (err != CUDA_SUCCESS) {
            if (warn)
                mgb_log_error("cudaGetDeviceCount failed: %s (err %d)",
-                              cudaGetErrorString(err), int(err));
+                              cu_get_error_string(err), int(err));
            cnt = 0;
        }
        mgb_assert(cnt >= 0);
......
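The fork-safe query above works in three steps: first try `cuDeviceGetCount` directly, which is cheap if something already initialized the driver; on `CUDA_ERROR_NOT_INITIALIZED`, fork a child that runs `cuInit` + `cuDeviceGetCount` and ships the error code and device count back through a pipe, exiting via `std::quick_exit` so no CUDA state survives into the parent; finally retry in the parent in case another thread initialized CUDA during the fork. A Python mirror of the pipe protocol (Unix only; `get_count_forksafe` and `probe` are placeholders standing in for the cuInit + cuDeviceGetCount pair, not real APIs):

```python
import os
import struct

def get_count_forksafe(probe):
    """Run probe() -> (err, count) in a forked child, Unix only."""
    rfd, wfd = os.pipe()
    pid = os.fork()
    if pid == 0:        # child: query, write the (err, count) record, exit
        os.close(rfd)
        try:
            err, cnt = probe()
        except Exception:
            err, cnt = 1, 0
        os.write(wfd, struct.pack("ii", err, cnt))
        os.close(wfd)
        os._exit(0)     # like std::quick_exit: skip atexit handlers
    os.close(wfd)       # parent: read the 8-byte (err, count) record
    data = os.read(rfd, 8)
    os.close(rfd)
    os.waitpid(pid, 0)
    return struct.unpack("ii", data)
```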
@@ -495,7 +495,7 @@ TEST_F(TestGraphEagerReeval, MemoryAlloc) {
    // |      +--> 8 -> 8
    // |-----> x1(fwd) -> 8 -|
    // total usage : 63 + (16 after the first iteration)
-    // x might has iteration i's memory, but x0/x1 foward i-1's memory
+    // x might has iteration i's memory, but x0/x1 forward i-1's memory
    size_t length = reserve / (sizeof(dt_int32) * 5 * 16);
    auto host_x = std::make_shared<HostTensorND>(cn, dtype::Int32());
    HostTensorND host_val;
......
@@ -772,7 +772,9 @@ const GraphOptimizer& GraphOptimizer::add_passes_for_optimize_options(
        add_pass<RemoveRedundantTypeCvtPass>();
        add_pass(FuseNCHW4Int8Preprocess::make());
        add_pass<FuseWarpPerspectiveDimshufflePass>();
+#if CUDA_VERSION >= 10020
        add_pass<FoldingConvBiasDimshufflePass>();
+#endif
    });
    cb(chwn4, {
        add_pass<FuseConvBiasNonlinPass>();
@@ -791,7 +793,9 @@ const GraphOptimizer& GraphOptimizer::add_passes_for_optimize_options(
        add_pass<RemoveRedundantTypeCvtPass>();
        add_pass(FuseNCHW4Int8Preprocess::make());
        add_pass<FuseWarpPerspectiveDimshufflePass>();
+#if CUDA_VERSION >= 10020
        add_pass<FoldingConvBiasDimshufflePass>();
+#endif
    });
    cb(fuse_conv_bias_nonlinearity, { add_pass<FuseConvBiasNonlinPass>(); });
......
@@ -3638,6 +3638,7 @@ void ShuffleShuffleRemovePass::apply(OptState& opt) const {
    MIDOUT_E
}

+#if CUDA_VERSION >= 10020
/* ==================== FoldingConvBiasDimshufflePass ================= */
const char* FoldingConvBiasDimshufflePass::name() const {
    return mgb_cstr_log("folding conv bias dimshuffle pass");
@@ -4068,20 +4069,17 @@ void FoldingConvBiasDimshufflePass::apply(OptState& opt) const {
        return true;
    };
    MGB_MARK_USED_VAR(try_conv_reformat_nchw322nchw4);
+    MGB_MARK_USED_VAR(try_conv_reformat_nchw42nchw32);
    auto on_opr = [&try_conv_dimshuffle_reshape_typecvt,
                   &try_conv_reformat_nchw42nchw32,
                   &try_conv_reformat_nchw42nhwc,
-#if CUDA_VERSION >= 10020
                   &try_conv_reformat_nchw322nchw4,
-#endif
                   &rewriter](OperatorNodeBase* opr) {
        if (!try_conv_dimshuffle_reshape_typecvt(opr) &&
            !try_conv_reformat_nchw42nchw32(opr) &&
            !try_conv_reformat_nchw42nhwc(opr)
-#if CUDA_VERSION >= 10020
            && !try_conv_reformat_nchw322nchw4(opr)
-#endif
        ) {
            rewriter.auto_replace_outputs(opr);
        }
@@ -4091,6 +4089,7 @@ void FoldingConvBiasDimshufflePass::apply(OptState& opt) const {
    MIDOUT_E
}
+#endif

/* ==================== PaddingChannelPass ================= */
const char* PaddingChannelPass::name() const {
......
@@ -16,6 +16,10 @@
#include "megbrain/opr/dnn/convolution.h"
#include "megbrain/opr/search_policy/algo_chooser_helper.h"

+#if MGB_CUDA
+#include <cuda.h>
+#endif

namespace mgb {
namespace gopt {
@@ -427,11 +431,13 @@ namespace gopt {
        void apply(OptState& opt) const override;
    };

+#if CUDA_VERSION >= 10020
    class FoldingConvBiasDimshufflePass final : public Pass {
    public:
        const char* name() const override;
        void apply(OptState& opt) const override;
    };
+#endif

    /*!
     * \brief padding channel to enable fast int8/int4 support
......
@@ -4155,6 +4155,7 @@ TEST(TestGoptInference, WarpAndPreProcessCase1) {
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-5);
}

+#if CUDA_VERSION >= 10020
TEST(TestGoptInference, FoldingConvDimshuffle) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
@@ -4307,7 +4308,6 @@ TEST(TestGoptInference, FoldingConvDimshuffleNCHW4NCHW32) {
    MGB_ASSERT_TENSOR_EQ(host_y_fuse, host_y_non_fuse);
}

-#if CUDA_VERSION >= 10020
TEST(TestGoptInference, FoldingConvDimshuffleNCHW32NCHW4) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
......
-Subproject commit eb8365c1015624348dbbb0a3ed97eecb40643e9e
+Subproject commit 5f0e18e73ced4689f82f1e3b90924bbcc35bd56a

-Subproject commit eafd7f8d33d114ef2569a1fc2b851b2325edee46
+Subproject commit baee1355d997b7bae8f93d25ba4bf41a61030599