From 5d350fc8433182ec206ffe8a7e50eb58db563809 Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Fri, 5 Mar 2021 17:40:25 +0800 Subject: [PATCH] feat(dnn/cuda): add deconv int8 and fix cutlass conv wrapper base on modify cutlass 2.4 GitOrigin-RevId: 49e0565e8a882a455387c95a80a8037c29166a76 --- .gitattributes | 1 + dnn/src/common/convolution.cpp | 37 ++- .../conv_bias/cutlass_convolution_wrapper.cu | 103 ++++--- ...s_int8_implicit_gemm_cutlass_wrapper.cuinl | 30 +- ...4a_ncdiv4hw4_128x128x32_64x32x32_hswish.cu | Bin 1915 -> 1848 bytes ...m_dp4a_ncdiv4hw4_128x128x32_64x32x32_id.cu | Bin 1909 -> 1842 bytes ...dp4a_ncdiv4hw4_128x128x32_64x32x32_relu.cu | Bin 1913 -> 1846 bytes ...p4a_ncdiv4hw4_128x32x32_64x32x32_hswish.cu | Bin 1914 -> 1847 bytes ...mm_dp4a_ncdiv4hw4_128x32x32_64x32x32_id.cu | Bin 1908 -> 1841 bytes ..._dp4a_ncdiv4hw4_128x32x32_64x32x32_relu.cu | Bin 1912 -> 1845 bytes ...p4a_ncdiv4hw4_128x64x32_64x32x32_hswish.cu | Bin 1914 -> 1847 bytes ...mm_dp4a_ncdiv4hw4_128x64x32_64x32x32_id.cu | Bin 1908 -> 1841 bytes ..._dp4a_ncdiv4hw4_128x64x32_64x32x32_relu.cu | Bin 1912 -> 1845 bytes ...4a_ncdiv4hw4_16x128x16_16x128x16_hswish.cu | Bin 1914 -> 1847 bytes ...m_dp4a_ncdiv4hw4_16x128x16_16x128x16_id.cu | Bin 1908 -> 1841 bytes ...dp4a_ncdiv4hw4_16x128x16_16x128x16_relu.cu | Bin 1912 -> 1845 bytes ...m_dp4a_ncdiv4hw4_16x64x8_16x64x8_hswish.cu | Bin 1910 -> 1843 bytes ..._gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_id.cu | Bin 1904 -> 1837 bytes ...emm_dp4a_ncdiv4hw4_16x64x8_16x64x8_relu.cu | Bin 1908 -> 1841 bytes ...cdiv4hw4_1x1_128x128x32_64x32x32_hswish.cu | Bin 1916 -> 1849 bytes ...4a_ncdiv4hw4_1x1_128x128x32_64x32x32_id.cu | Bin 1910 -> 1843 bytes ..._ncdiv4hw4_1x1_128x128x32_64x32x32_relu.cu | Bin 1914 -> 1847 bytes ...ncdiv4hw4_1x1_128x32x32_64x32x32_hswish.cu | Bin 1915 -> 1848 bytes ...p4a_ncdiv4hw4_1x1_128x32x32_64x32x32_id.cu | Bin 1909 -> 1842 bytes ...a_ncdiv4hw4_1x1_128x32x32_64x32x32_relu.cu | Bin 1913 -> 1846 bytes 
...ncdiv4hw4_1x1_128x64x32_64x32x32_hswish.cu | Bin 1915 -> 1848 bytes ...p4a_ncdiv4hw4_1x1_128x64x32_64x32x32_id.cu | Bin 1909 -> 1842 bytes ...a_ncdiv4hw4_1x1_128x64x32_64x32x32_relu.cu | Bin 1913 -> 1846 bytes ...cdiv4hw4_1x1_16x128x16_16x128x16_hswish.cu | Bin 1915 -> 1848 bytes ...4a_ncdiv4hw4_1x1_16x128x16_16x128x16_id.cu | Bin 1909 -> 1842 bytes ..._ncdiv4hw4_1x1_16x128x16_16x128x16_relu.cu | Bin 1913 -> 1846 bytes ...4a_ncdiv4hw4_1x1_16x64x8_16x64x8_hswish.cu | Bin 1911 -> 1844 bytes ...m_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_id.cu | Bin 1905 -> 1838 bytes ...dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_relu.cu | Bin 1909 -> 1842 bytes ...ncdiv4hw4_1x1_32x128x32_32x64x32_hswish.cu | Bin 1915 -> 1848 bytes ...p4a_ncdiv4hw4_1x1_32x128x32_32x64x32_id.cu | Bin 1909 -> 1842 bytes ...a_ncdiv4hw4_1x1_32x128x32_32x64x32_relu.cu | Bin 1913 -> 1846 bytes ..._ncdiv4hw4_1x1_32x32x32_32x32x32_hswish.cu | Bin 1914 -> 1847 bytes ...dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_id.cu | Bin 1908 -> 1841 bytes ...4a_ncdiv4hw4_1x1_32x32x32_32x32x32_relu.cu | Bin 1912 -> 1845 bytes ..._ncdiv4hw4_1x1_32x64x32_32x64x32_hswish.cu | Bin 1914 -> 1847 bytes ...dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_id.cu | Bin 1908 -> 1841 bytes ...4a_ncdiv4hw4_1x1_32x64x32_32x64x32_relu.cu | Bin 1912 -> 1845 bytes ...ncdiv4hw4_1x1_64x128x32_64x32x32_hswish.cu | Bin 1915 -> 1848 bytes ...p4a_ncdiv4hw4_1x1_64x128x32_64x32x32_id.cu | Bin 1909 -> 1842 bytes ...a_ncdiv4hw4_1x1_64x128x32_64x32x32_relu.cu | Bin 1913 -> 1846 bytes ..._ncdiv4hw4_1x1_64x32x32_64x32x32_hswish.cu | Bin 1914 -> 1847 bytes ...dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_id.cu | Bin 1908 -> 1841 bytes ...4a_ncdiv4hw4_1x1_64x32x32_64x32x32_relu.cu | Bin 1912 -> 1845 bytes ..._ncdiv4hw4_1x1_64x64x32_64x32x32_hswish.cu | Bin 1914 -> 1847 bytes ...dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_id.cu | Bin 1908 -> 1841 bytes ...4a_ncdiv4hw4_1x1_64x64x32_64x32x32_relu.cu | Bin 1912 -> 1845 bytes ...p4a_ncdiv4hw4_32x128x32_32x64x32_hswish.cu | Bin 1914 -> 1847 bytes 
...mm_dp4a_ncdiv4hw4_32x128x32_32x64x32_id.cu | Bin 1908 -> 1841 bytes ..._dp4a_ncdiv4hw4_32x128x32_32x64x32_relu.cu | Bin 1912 -> 1845 bytes ...dp4a_ncdiv4hw4_32x32x32_32x32x32_hswish.cu | Bin 1913 -> 1846 bytes ...emm_dp4a_ncdiv4hw4_32x32x32_32x32x32_id.cu | Bin 1907 -> 1840 bytes ...m_dp4a_ncdiv4hw4_32x32x32_32x32x32_relu.cu | Bin 1911 -> 1844 bytes ...dp4a_ncdiv4hw4_32x64x32_32x64x32_hswish.cu | Bin 1913 -> 1846 bytes ...emm_dp4a_ncdiv4hw4_32x64x32_32x64x32_id.cu | Bin 1907 -> 1840 bytes ...m_dp4a_ncdiv4hw4_32x64x32_32x64x32_relu.cu | Bin 1911 -> 1844 bytes ...p4a_ncdiv4hw4_64x128x32_64x32x32_hswish.cu | Bin 1914 -> 1847 bytes ...mm_dp4a_ncdiv4hw4_64x128x32_64x32x32_id.cu | Bin 1908 -> 1841 bytes ..._dp4a_ncdiv4hw4_64x128x32_64x32x32_relu.cu | Bin 1912 -> 1845 bytes ...dp4a_ncdiv4hw4_64x32x32_64x32x32_hswish.cu | Bin 1913 -> 1846 bytes ...emm_dp4a_ncdiv4hw4_64x32x32_64x32x32_id.cu | Bin 1907 -> 1840 bytes ...m_dp4a_ncdiv4hw4_64x32x32_64x32x32_relu.cu | Bin 1911 -> 1844 bytes ...dp4a_ncdiv4hw4_64x64x32_64x32x32_hswish.cu | Bin 1913 -> 1846 bytes ...emm_dp4a_ncdiv4hw4_64x64x32_64x32x32_id.cu | Bin 1907 -> 1840 bytes ...m_dp4a_ncdiv4hw4_64x64x32_64x32x32_relu.cu | Bin 1911 -> 1844 bytes ..._ncdiv32hw32_128x128x32_64x32x32_hswish.cu | Bin 1916 -> 1849 bytes ...4hw4_ncdiv32hw32_128x128x32_64x32x32_id.cu | Bin 1910 -> 1843 bytes ...w4_ncdiv32hw32_128x128x32_64x32x32_relu.cu | Bin 1914 -> 1847 bytes ...4_ncdiv32hw32_128x32x32_64x32x32_hswish.cu | Bin 1915 -> 1848 bytes ...v4hw4_ncdiv32hw32_128x32x32_64x32x32_id.cu | Bin 1909 -> 1842 bytes ...hw4_ncdiv32hw32_128x32x32_64x32x32_relu.cu | Bin 1913 -> 1846 bytes ...4_ncdiv32hw32_128x64x32_64x32x32_hswish.cu | Bin 1915 -> 1848 bytes ...v4hw4_ncdiv32hw32_128x64x32_64x32x32_id.cu | Bin 1909 -> 1842 bytes ...hw4_ncdiv32hw32_128x64x32_64x32x32_relu.cu | Bin 1913 -> 1846 bytes ...iv32hw32_1x1_128x128x32_64x32x32_hswish.cu | Bin 1917 -> 1850 bytes ..._ncdiv32hw32_1x1_128x128x32_64x32x32_id.cu | Bin 1911 -> 1844 bytes 
...cdiv32hw32_1x1_128x128x32_64x32x32_relu.cu | Bin 1915 -> 1848 bytes ...div32hw32_1x1_128x32x32_64x32x32_hswish.cu | Bin 1916 -> 1849 bytes ...4_ncdiv32hw32_1x1_128x32x32_64x32x32_id.cu | Bin 1910 -> 1843 bytes ...ncdiv32hw32_1x1_128x32x32_64x32x32_relu.cu | Bin 1914 -> 1847 bytes ...div32hw32_1x1_128x64x32_64x32x32_hswish.cu | Bin 1916 -> 1849 bytes ...4_ncdiv32hw32_1x1_128x64x32_64x32x32_id.cu | Bin 1910 -> 1843 bytes ...ncdiv32hw32_1x1_128x64x32_64x32x32_relu.cu | Bin 1914 -> 1847 bytes ...div32hw32_1x1_32x128x32_32x64x32_hswish.cu | Bin 1916 -> 1849 bytes ...4_ncdiv32hw32_1x1_32x128x32_32x64x32_id.cu | Bin 1910 -> 1843 bytes ...ncdiv32hw32_1x1_32x128x32_32x64x32_relu.cu | Bin 1914 -> 1847 bytes ...cdiv32hw32_1x1_32x32x32_32x32x32_hswish.cu | Bin 1915 -> 1848 bytes ...w4_ncdiv32hw32_1x1_32x32x32_32x32x32_id.cu | Bin 1909 -> 1842 bytes ..._ncdiv32hw32_1x1_32x32x32_32x32x32_relu.cu | Bin 1913 -> 1846 bytes ...cdiv32hw32_1x1_32x64x32_32x64x32_hswish.cu | Bin 1915 -> 1848 bytes ...w4_ncdiv32hw32_1x1_32x64x32_32x64x32_id.cu | Bin 1909 -> 1842 bytes ..._ncdiv32hw32_1x1_32x64x32_32x64x32_relu.cu | Bin 1913 -> 1846 bytes ...div32hw32_1x1_64x128x32_64x32x32_hswish.cu | Bin 1916 -> 1849 bytes ...4_ncdiv32hw32_1x1_64x128x32_64x32x32_id.cu | Bin 1910 -> 1843 bytes ...ncdiv32hw32_1x1_64x128x32_64x32x32_relu.cu | Bin 1914 -> 1847 bytes ...cdiv32hw32_1x1_64x32x32_64x32x32_hswish.cu | Bin 1915 -> 1848 bytes ...w4_ncdiv32hw32_1x1_64x32x32_64x32x32_id.cu | Bin 1909 -> 1842 bytes ..._ncdiv32hw32_1x1_64x32x32_64x32x32_relu.cu | Bin 1913 -> 1846 bytes ...cdiv32hw32_1x1_64x64x32_64x32x32_hswish.cu | Bin 1915 -> 1848 bytes ...w4_ncdiv32hw32_1x1_64x64x32_64x32x32_id.cu | Bin 1909 -> 1842 bytes ..._ncdiv32hw32_1x1_64x64x32_64x32x32_relu.cu | Bin 1913 -> 1846 bytes ...4_ncdiv32hw32_32x128x32_32x64x32_hswish.cu | Bin 1915 -> 1848 bytes ...v4hw4_ncdiv32hw32_32x128x32_32x64x32_id.cu | Bin 1909 -> 1842 bytes ...hw4_ncdiv32hw32_32x128x32_32x64x32_relu.cu | Bin 1913 -> 1846 bytes 
...w4_ncdiv32hw32_32x32x32_32x32x32_hswish.cu | Bin 1914 -> 1847 bytes ...iv4hw4_ncdiv32hw32_32x32x32_32x32x32_id.cu | Bin 1908 -> 1841 bytes ...4hw4_ncdiv32hw32_32x32x32_32x32x32_relu.cu | Bin 1912 -> 1845 bytes ...w4_ncdiv32hw32_32x64x32_32x64x32_hswish.cu | Bin 1914 -> 1847 bytes ...iv4hw4_ncdiv32hw32_32x64x32_32x64x32_id.cu | Bin 1908 -> 1841 bytes ...4hw4_ncdiv32hw32_32x64x32_32x64x32_relu.cu | Bin 1912 -> 1845 bytes ...4_ncdiv32hw32_64x128x32_64x32x32_hswish.cu | Bin 1915 -> 1848 bytes ...v4hw4_ncdiv32hw32_64x128x32_64x32x32_id.cu | Bin 1909 -> 1842 bytes ...hw4_ncdiv32hw32_64x128x32_64x32x32_relu.cu | Bin 1913 -> 1846 bytes ...w4_ncdiv32hw32_64x32x32_64x32x32_hswish.cu | Bin 1914 -> 1847 bytes ...iv4hw4_ncdiv32hw32_64x32x32_64x32x32_id.cu | Bin 1908 -> 1841 bytes ...4hw4_ncdiv32hw32_64x32x32_64x32x32_relu.cu | Bin 1912 -> 1845 bytes ...w4_ncdiv32hw32_64x64x32_64x32x32_hswish.cu | Bin 1914 -> 1847 bytes ...iv4hw4_ncdiv32hw32_64x64x32_64x32x32_id.cu | Bin 1908 -> 1841 bytes ...4hw4_ncdiv32hw32_64x64x32_64x32x32_relu.cu | Bin 1912 -> 1845 bytes ...div4hw4_nchw_128x128x32_64x32x32_hswish.cu | Bin 1891 -> 1824 bytes ...a_ncdiv4hw4_nchw_128x128x32_64x32x32_id.cu | Bin 1885 -> 1818 bytes ...ncdiv4hw4_nchw_128x128x32_64x32x32_relu.cu | Bin 1889 -> 1822 bytes ...cdiv4hw4_nchw_128x32x32_64x32x32_hswish.cu | Bin 1890 -> 1823 bytes ...4a_ncdiv4hw4_nchw_128x32x32_64x32x32_id.cu | Bin 1884 -> 1817 bytes ..._ncdiv4hw4_nchw_128x32x32_64x32x32_relu.cu | Bin 1888 -> 1821 bytes ...cdiv4hw4_nchw_128x64x32_64x32x32_hswish.cu | Bin 1890 -> 1823 bytes ...4a_ncdiv4hw4_nchw_128x64x32_64x32x32_id.cu | Bin 1884 -> 1817 bytes ..._ncdiv4hw4_nchw_128x64x32_64x32x32_relu.cu | Bin 1888 -> 1821 bytes ...div4hw4_nchw_16x128x16_16x128x16_hswish.cu | Bin 1890 -> 1823 bytes ...a_ncdiv4hw4_nchw_16x128x16_16x128x16_id.cu | Bin 1884 -> 1817 bytes ...ncdiv4hw4_nchw_16x128x16_16x128x16_relu.cu | Bin 1888 -> 1821 bytes ...a_ncdiv4hw4_nchw_16x64x8_16x64x8_hswish.cu | Bin 1886 -> 1819 bytes 
..._dp4a_ncdiv4hw4_nchw_16x64x8_16x64x8_id.cu | Bin 1880 -> 1813 bytes ...p4a_ncdiv4hw4_nchw_16x64x8_16x64x8_relu.cu | Bin 1884 -> 1817 bytes ...hw4_nchw_1x1_128x128x32_64x32x32_hswish.cu | Bin 1892 -> 1825 bytes ...div4hw4_nchw_1x1_128x128x32_64x32x32_id.cu | Bin 1886 -> 1819 bytes ...v4hw4_nchw_1x1_128x128x32_64x32x32_relu.cu | Bin 1890 -> 1823 bytes ...4hw4_nchw_1x1_128x32x32_64x32x32_hswish.cu | Bin 1891 -> 1824 bytes ...cdiv4hw4_nchw_1x1_128x32x32_64x32x32_id.cu | Bin 1885 -> 1818 bytes ...iv4hw4_nchw_1x1_128x32x32_64x32x32_relu.cu | Bin 1889 -> 1822 bytes ...4hw4_nchw_1x1_128x64x32_64x32x32_hswish.cu | Bin 1891 -> 1824 bytes ...cdiv4hw4_nchw_1x1_128x64x32_64x32x32_id.cu | Bin 1885 -> 1818 bytes ...iv4hw4_nchw_1x1_128x64x32_64x32x32_relu.cu | Bin 1889 -> 1822 bytes ...hw4_nchw_1x1_16x128x16_16x128x16_hswish.cu | Bin 1891 -> 1824 bytes ...div4hw4_nchw_1x1_16x128x16_16x128x16_id.cu | Bin 1885 -> 1818 bytes ...v4hw4_nchw_1x1_16x128x16_16x128x16_relu.cu | Bin 1889 -> 1822 bytes ...div4hw4_nchw_1x1_16x64x8_16x64x8_hswish.cu | Bin 1887 -> 1820 bytes ...a_ncdiv4hw4_nchw_1x1_16x64x8_16x64x8_id.cu | Bin 1881 -> 1814 bytes ...ncdiv4hw4_nchw_1x1_16x64x8_16x64x8_relu.cu | Bin 1885 -> 1818 bytes ...4hw4_nchw_1x1_32x128x32_32x64x32_hswish.cu | Bin 1891 -> 1824 bytes ...cdiv4hw4_nchw_1x1_32x128x32_32x64x32_id.cu | Bin 1885 -> 1818 bytes ...iv4hw4_nchw_1x1_32x128x32_32x64x32_relu.cu | Bin 1889 -> 1822 bytes ...v4hw4_nchw_1x1_32x32x32_32x32x32_hswish.cu | Bin 1890 -> 1823 bytes ...ncdiv4hw4_nchw_1x1_32x32x32_32x32x32_id.cu | Bin 1884 -> 1817 bytes ...div4hw4_nchw_1x1_32x32x32_32x32x32_relu.cu | Bin 1888 -> 1821 bytes ...v4hw4_nchw_1x1_32x64x32_32x64x32_hswish.cu | Bin 1890 -> 1823 bytes ...ncdiv4hw4_nchw_1x1_32x64x32_32x64x32_id.cu | Bin 1884 -> 1817 bytes ...div4hw4_nchw_1x1_32x64x32_32x64x32_relu.cu | Bin 1888 -> 1821 bytes ...4hw4_nchw_1x1_64x128x32_64x32x32_hswish.cu | Bin 1891 -> 1824 bytes ...cdiv4hw4_nchw_1x1_64x128x32_64x32x32_id.cu | Bin 1885 -> 1818 bytes 
...iv4hw4_nchw_1x1_64x128x32_64x32x32_relu.cu | Bin 1889 -> 1822 bytes ...v4hw4_nchw_1x1_64x32x32_64x32x32_hswish.cu | Bin 1890 -> 1823 bytes ...ncdiv4hw4_nchw_1x1_64x32x32_64x32x32_id.cu | Bin 1884 -> 1817 bytes ...div4hw4_nchw_1x1_64x32x32_64x32x32_relu.cu | Bin 1888 -> 1821 bytes ...v4hw4_nchw_1x1_64x64x32_64x32x32_hswish.cu | Bin 1890 -> 1823 bytes ...ncdiv4hw4_nchw_1x1_64x64x32_64x32x32_id.cu | Bin 1884 -> 1817 bytes ...div4hw4_nchw_1x1_64x64x32_64x32x32_relu.cu | Bin 1888 -> 1821 bytes ...cdiv4hw4_nchw_32x128x32_32x64x32_hswish.cu | Bin 1890 -> 1823 bytes ...4a_ncdiv4hw4_nchw_32x128x32_32x64x32_id.cu | Bin 1884 -> 1817 bytes ..._ncdiv4hw4_nchw_32x128x32_32x64x32_relu.cu | Bin 1888 -> 1821 bytes ...ncdiv4hw4_nchw_32x32x32_32x32x32_hswish.cu | Bin 1889 -> 1822 bytes ...p4a_ncdiv4hw4_nchw_32x32x32_32x32x32_id.cu | Bin 1883 -> 1816 bytes ...a_ncdiv4hw4_nchw_32x32x32_32x32x32_relu.cu | Bin 1887 -> 1820 bytes ...ncdiv4hw4_nchw_32x64x32_32x64x32_hswish.cu | Bin 1889 -> 1822 bytes ...p4a_ncdiv4hw4_nchw_32x64x32_32x64x32_id.cu | Bin 1883 -> 1816 bytes ...a_ncdiv4hw4_nchw_32x64x32_32x64x32_relu.cu | Bin 1887 -> 1820 bytes ...cdiv4hw4_nchw_64x128x32_64x32x32_hswish.cu | Bin 1890 -> 1823 bytes ...4a_ncdiv4hw4_nchw_64x128x32_64x32x32_id.cu | Bin 1884 -> 1817 bytes ..._ncdiv4hw4_nchw_64x128x32_64x32x32_relu.cu | Bin 1888 -> 1821 bytes ...ncdiv4hw4_nchw_64x32x32_64x32x32_hswish.cu | Bin 1889 -> 1822 bytes ...p4a_ncdiv4hw4_nchw_64x32x32_64x32x32_id.cu | Bin 1883 -> 1816 bytes ...a_ncdiv4hw4_nchw_64x32x32_64x32x32_relu.cu | Bin 1887 -> 1820 bytes ...ncdiv4hw4_nchw_64x64x32_64x32x32_hswish.cu | Bin 1889 -> 1822 bytes ...p4a_ncdiv4hw4_nchw_64x64x32_64x32x32_id.cu | Bin 1883 -> 1816 bytes ...a_ncdiv4hw4_nchw_64x64x32_64x32x32_relu.cu | Bin 1887 -> 1820 bytes ..._ncdiv32hw32_128x128x64_64x64x64_hswish.cu | Bin 1924 -> 1857 bytes ...imma_ncdiv32hw32_128x128x64_64x64x64_id.cu | Bin 1918 -> 1851 bytes ...ma_ncdiv32hw32_128x128x64_64x64x64_relu.cu | Bin 1922 -> 1855 bytes 
..._ncdiv32hw32_128x256x64_64x64x64_hswish.cu | Bin 1924 -> 1857 bytes ...imma_ncdiv32hw32_128x256x64_64x64x64_id.cu | Bin 1918 -> 1851 bytes ...ma_ncdiv32hw32_128x256x64_64x64x64_relu.cu | Bin 1922 -> 1855 bytes ...a_ncdiv32hw32_128x64x64_64x32x64_hswish.cu | Bin 1923 -> 1856 bytes ..._imma_ncdiv32hw32_128x64x64_64x32x64_id.cu | Bin 1917 -> 1850 bytes ...mma_ncdiv32hw32_128x64x64_64x32x64_relu.cu | Bin 1921 -> 1854 bytes ...iv32hw32_1x1_128x128x64_64x64x64_hswish.cu | Bin 1925 -> 1858 bytes ..._ncdiv32hw32_1x1_128x128x64_64x64x64_id.cu | Bin 1919 -> 1852 bytes ...cdiv32hw32_1x1_128x128x64_64x64x64_relu.cu | Bin 1923 -> 1856 bytes ...iv32hw32_1x1_128x256x64_64x64x64_hswish.cu | Bin 1925 -> 1858 bytes ..._ncdiv32hw32_1x1_128x256x64_64x64x64_id.cu | Bin 1919 -> 1852 bytes ...cdiv32hw32_1x1_128x256x64_64x64x64_relu.cu | Bin 1923 -> 1856 bytes ...div32hw32_1x1_128x64x64_64x32x64_hswish.cu | Bin 1924 -> 1857 bytes ...a_ncdiv32hw32_1x1_128x64x64_64x32x64_id.cu | Bin 1918 -> 1851 bytes ...ncdiv32hw32_1x1_128x64x64_64x32x64_relu.cu | Bin 1922 -> 1855 bytes ...iv32hw32_1x1_256x128x64_64x64x64_hswish.cu | Bin 1925 -> 1858 bytes ..._ncdiv32hw32_1x1_256x128x64_64x64x64_id.cu | Bin 1919 -> 1852 bytes ...cdiv32hw32_1x1_256x128x64_64x64x64_relu.cu | Bin 1923 -> 1856 bytes ...cdiv32hw32_1x1_32x64x64_32x16x64_hswish.cu | Bin 1923 -> 1856 bytes ...ma_ncdiv32hw32_1x1_32x64x64_32x16x64_id.cu | Bin 1917 -> 1850 bytes ..._ncdiv32hw32_1x1_32x64x64_32x16x64_relu.cu | Bin 1921 -> 1854 bytes ...div32hw32_1x1_64x128x64_32x64x64_hswish.cu | Bin 1924 -> 1857 bytes ...a_ncdiv32hw32_1x1_64x128x64_32x64x64_id.cu | Bin 1918 -> 1851 bytes ...ncdiv32hw32_1x1_64x128x64_32x64x64_relu.cu | Bin 1922 -> 1855 bytes ...cdiv32hw32_1x1_64x64x64_32x32x64_hswish.cu | Bin 1923 -> 1856 bytes ...ma_ncdiv32hw32_1x1_64x64x64_32x32x64_id.cu | Bin 1917 -> 1850 bytes ..._ncdiv32hw32_1x1_64x64x64_32x32x64_relu.cu | Bin 1921 -> 1854 bytes ..._ncdiv32hw32_256x128x64_64x64x64_hswish.cu | Bin 1924 -> 1857 bytes 
...imma_ncdiv32hw32_256x128x64_64x64x64_id.cu | Bin 1918 -> 1851 bytes ...ma_ncdiv32hw32_256x128x64_64x64x64_relu.cu | Bin 1922 -> 1855 bytes ...ma_ncdiv32hw32_32x64x64_32x16x64_hswish.cu | Bin 1922 -> 1855 bytes ...m_imma_ncdiv32hw32_32x64x64_32x16x64_id.cu | Bin 1916 -> 1849 bytes ...imma_ncdiv32hw32_32x64x64_32x16x64_relu.cu | Bin 1920 -> 1853 bytes ...a_ncdiv32hw32_64x128x64_32x64x64_hswish.cu | Bin 1923 -> 1856 bytes ..._imma_ncdiv32hw32_64x128x64_32x64x64_id.cu | Bin 1917 -> 1850 bytes ...mma_ncdiv32hw32_64x128x64_32x64x64_relu.cu | Bin 1921 -> 1854 bytes ...ma_ncdiv32hw32_64x64x64_32x32x64_hswish.cu | Bin 1922 -> 1855 bytes ...m_imma_ncdiv32hw32_64x64x64_32x32x64_id.cu | Bin 1916 -> 1849 bytes ...imma_ncdiv32hw32_64x64x64_32x32x64_relu.cu | Bin 1920 -> 1853 bytes ...32_ncdiv4hw4_128x128x64_64x64x64_hswish.cu | Bin 1923 -> 1856 bytes ...32hw32_ncdiv4hw4_128x128x64_64x64x64_id.cu | Bin 1917 -> 1850 bytes ...hw32_ncdiv4hw4_128x128x64_64x64x64_relu.cu | Bin 1921 -> 1854 bytes ...32_ncdiv4hw4_128x256x64_64x64x64_hswish.cu | Bin 1923 -> 1856 bytes ...32hw32_ncdiv4hw4_128x256x64_64x64x64_id.cu | Bin 1917 -> 1850 bytes ...hw32_ncdiv4hw4_128x256x64_64x64x64_relu.cu | Bin 1921 -> 1854 bytes ...w32_ncdiv4hw4_128x64x64_64x32x64_hswish.cu | Bin 1922 -> 1855 bytes ...v32hw32_ncdiv4hw4_128x64x64_64x32x64_id.cu | Bin 1916 -> 1849 bytes ...2hw32_ncdiv4hw4_128x64x64_64x32x64_relu.cu | Bin 1920 -> 1853 bytes ...cdiv4hw4_1x1_128x128x64_64x64x64_hswish.cu | Bin 1924 -> 1857 bytes ...32_ncdiv4hw4_1x1_128x128x64_64x64x64_id.cu | Bin 1918 -> 1851 bytes ..._ncdiv4hw4_1x1_128x128x64_64x64x64_relu.cu | Bin 1922 -> 1855 bytes ...cdiv4hw4_1x1_128x256x64_64x64x64_hswish.cu | Bin 1924 -> 1857 bytes ...32_ncdiv4hw4_1x1_128x256x64_64x64x64_id.cu | Bin 1918 -> 1851 bytes ..._ncdiv4hw4_1x1_128x256x64_64x64x64_relu.cu | Bin 1922 -> 1855 bytes ...ncdiv4hw4_1x1_128x64x64_64x32x64_hswish.cu | Bin 1923 -> 1856 bytes ...w32_ncdiv4hw4_1x1_128x64x64_64x32x64_id.cu | Bin 1917 -> 1850 bytes 
...2_ncdiv4hw4_1x1_128x64x64_64x32x64_relu.cu | Bin 1921 -> 1854 bytes ...cdiv4hw4_1x1_256x128x64_64x64x64_hswish.cu | Bin 1924 -> 1857 bytes ...32_ncdiv4hw4_1x1_256x128x64_64x64x64_id.cu | Bin 1918 -> 1851 bytes ..._ncdiv4hw4_1x1_256x128x64_64x64x64_relu.cu | Bin 1922 -> 1855 bytes ..._ncdiv4hw4_1x1_32x64x64_16x32x64_hswish.cu | Bin 1922 -> 1855 bytes ...hw32_ncdiv4hw4_1x1_32x64x64_16x32x64_id.cu | Bin 1916 -> 1849 bytes ...32_ncdiv4hw4_1x1_32x64x64_16x32x64_relu.cu | Bin 1920 -> 1853 bytes ...ncdiv4hw4_1x1_64x128x64_32x64x64_hswish.cu | Bin 1923 -> 1856 bytes ...w32_ncdiv4hw4_1x1_64x128x64_32x64x64_id.cu | Bin 1917 -> 1850 bytes ...2_ncdiv4hw4_1x1_64x128x64_32x64x64_relu.cu | Bin 1921 -> 1854 bytes ..._ncdiv4hw4_1x1_64x64x64_32x32x64_hswish.cu | Bin 1922 -> 1855 bytes ...hw32_ncdiv4hw4_1x1_64x64x64_32x32x64_id.cu | Bin 1916 -> 1849 bytes ...32_ncdiv4hw4_1x1_64x64x64_32x32x64_relu.cu | Bin 1920 -> 1853 bytes ...32_ncdiv4hw4_256x128x64_64x64x64_hswish.cu | Bin 1923 -> 1856 bytes ...32hw32_ncdiv4hw4_256x128x64_64x64x64_id.cu | Bin 1917 -> 1850 bytes ...hw32_ncdiv4hw4_256x128x64_64x64x64_relu.cu | Bin 1921 -> 1854 bytes ...hw32_ncdiv4hw4_32x64x64_16x32x64_hswish.cu | Bin 1921 -> 1854 bytes ...iv32hw32_ncdiv4hw4_32x64x64_16x32x64_id.cu | Bin 1915 -> 1848 bytes ...32hw32_ncdiv4hw4_32x64x64_16x32x64_relu.cu | Bin 1919 -> 1852 bytes ...w32_ncdiv4hw4_64x128x64_32x64x64_hswish.cu | Bin 1922 -> 1855 bytes ...v32hw32_ncdiv4hw4_64x128x64_32x64x64_id.cu | Bin 1916 -> 1849 bytes ...2hw32_ncdiv4hw4_64x128x64_32x64x64_relu.cu | Bin 1920 -> 1853 bytes ...hw32_ncdiv4hw4_64x64x64_32x32x64_hswish.cu | Bin 1921 -> 1854 bytes ...iv32hw32_ncdiv4hw4_64x64x64_32x32x64_id.cu | Bin 1915 -> 1848 bytes ...32hw32_ncdiv4hw4_64x64x64_32x32x64_relu.cu | Bin 1919 -> 1852 bytes .../cuda/convolution/backward_data/algo.cpp | 97 ++++--- dnn/src/cuda/convolution/backward_data/algo.h | 51 +++- .../convolution/backward_data/chanwise.cpp | 10 +- .../backward_data/chanwise_small.cpp | 33 +-- 
.../cutlass_deconvolution_wrapper.cu | 100 +++++++ .../cutlass_deconvolution_wrapper.cuh | 44 +++ .../backward_data/deconv_int8_helper.cu | 76 +++++ .../backward_data/deconv_int8_helper.cuh | 27 ++ .../convolution/backward_data/group_conv.cpp | 46 +-- .../implicit_gemm_int8_nchw4_dp4a.cpp | 127 +++++++++ ...v_int8_implicit_gemm_cutlass_wrapper.cuinl | 62 ++++ ...m_dp4a_ncdiv4hw4_16x128x16_16x128x16_id.cu | Bin 0 -> 1837 bytes ...mm_dp4a_ncdiv4hw4_16x128x16_16x64x16_id.cu | Bin 0 -> 1836 bytes ..._gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_id.cu | Bin 0 -> 1833 bytes ...mm_dp4a_ncdiv4hw4_32x128x32_32x64x32_id.cu | Bin 0 -> 1837 bytes ...mm_dp4a_ncdiv4hw4_64x128x32_64x32x32_id.cu | Bin 0 -> 1837 bytes dnn/src/cuda/convolution/opr_impl.cpp | 86 +++--- dnn/src/cuda/convolution/opr_impl.h | 1 + dnn/test/common/convolution.cpp | 172 +++++------ dnn/test/common/convolution.h | 4 +- dnn/test/cuda/convolution.cpp | 267 +++++++++++------- src/opr/test/dnn/convolution.cpp | 45 +++ 296 files changed, 1024 insertions(+), 395 deletions(-) create mode 100644 dnn/src/cuda/convolution/backward_data/cutlass_deconvolution_wrapper.cu create mode 100644 dnn/src/cuda/convolution/backward_data/cutlass_deconvolution_wrapper.cuh create mode 100644 dnn/src/cuda/convolution/backward_data/deconv_int8_helper.cu create mode 100644 dnn/src/cuda/convolution/backward_data/deconv_int8_helper.cuh create mode 100644 dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw4_dp4a.cpp create mode 100644 dnn/src/cuda/convolution/backward_data/int8/deconv_int8_implicit_gemm_cutlass_wrapper.cuinl create mode 100644 dnn/src/cuda/convolution/backward_data/int8/kimpl/deconv_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_id.cu create mode 100644 dnn/src/cuda/convolution/backward_data/int8/kimpl/deconv_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x64x16_id.cu create mode 100644 dnn/src/cuda/convolution/backward_data/int8/kimpl/deconv_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_id.cu create mode 
100644 dnn/src/cuda/convolution/backward_data/int8/kimpl/deconv_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_id.cu create mode 100644 dnn/src/cuda/convolution/backward_data/int8/kimpl/deconv_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_id.cu diff --git a/.gitattributes b/.gitattributes index be56319f3..07945c657 100644 --- a/.gitattributes +++ b/.gitattributes @@ -5,6 +5,7 @@ dnn/src/cuda/conv_bias/int8_imma/kimpl/* binary dnn/src/cuda/batch_conv_bias/int8/kimpl/* binary dnn/src/cuda/matrix_mul/fp32_simt/kimpl/* binary dnn/src/cuda/sass/prebuilt/map_defs.cpp binary +dnn/src/cuda/convolution/backward_data/int8/kimpl/* binary tools/mlir/mlir-tblgen filter=lfs diff=lfs merge=lfs -text *.caffemodel filter=lfs diff=lfs merge=lfs -text imperative/python/test/integration/data/*.mge filter=lfs diff=lfs merge=lfs -text diff --git a/dnn/src/common/convolution.cpp b/dnn/src/common/convolution.cpp index e2c7aa916..b23f4f83e 100644 --- a/dnn/src/common/convolution.cpp +++ b/dnn/src/common/convolution.cpp @@ -46,7 +46,7 @@ void make_canonized_filter_meta_nchw_nhwc( size_t src_ndim, const TensorLayout& filter, const Param& param, typename ConvolutionBase::CanonizedFilterMeta& ret) { megdnn_assert(param.format == Param::Format::NCHW || - param.format == Param::Format::NHWC ); + param.format == Param::Format::NHWC); auto img_ndim = src_ndim - 2; size_t flt_start, flt_spatial_start, ocpg_pos, icpg_pos; if (param.sparse == Param::Sparse::DENSE) { @@ -320,8 +320,8 @@ void make_canonized_filter_meta_nchwxx( img_ndim, filter.ndim); megdnn_assert((filter[filter.ndim - 1] == pack_size && filter[filter.ndim - 2] == pack_size) || - (filter[filter.ndim - 1] == 2 * pack_size && - filter[filter.ndim - 2] == 2 * pack_size), + (filter[filter.ndim - 1] == 2 * pack_size && + filter[filter.ndim - 2] == 2 * pack_size), "last 2 dim of filter must be %zu, but got %zu, %zu", pack_size, filter[filter.ndim - 2], filter[filter.ndim - 1]); @@ -684,7 +684,8 @@ 
ConvolutionBase::deduce_layout_fwd(const TensorLayout& src, } if (param().format == Param::Format::NCHW44 || param().format == Param::Format::NCHW44_DOT) { - //!support nchw44 filter change to 88 for int8 winogradf23_88 using MK8 mamtul + //! support nchw44 filter change to 88 for int8 winogradf23_88 using + //! MK8 mamtul megdnn_assert((src.ndim == 4 && filter.ndim == 5 && filter[filter.ndim - 1] == 4) || (src.ndim == 5 && @@ -716,7 +717,7 @@ ConvolutionBase::deduce_layout_fwd(const TensorLayout& src, "currently only convolution on 2D image is supported"); auto cflt = make_canonized_filter_meta(src.ndim, filter); if (param().format == Param::Format::NCHW || - param().format == Param::Format::NHWC ) { + param().format == Param::Format::NHWC) { size_t src_or_dst_c_pos = 0; size_t src_or_dst_spatial_start = 0; if (param().format == Param::Format::NCHW) { @@ -790,7 +791,7 @@ ConvolutionBase::deduce_layout_fwd(const TensorLayout& src, dst[3] = infer_conv_shape(src[3], cflt.dilated_spatial[1], cflt.stride[1], cflt.padding[1]); dst[4] = 32; - } else if (param().format == Param::Format::NCHW88 ) { + } else if (param().format == Param::Format::NCHW88) { megdnn_assert(src.ndim == 5 || (src.ndim == 4 && src[1] <= 8), "invalid src ndim for NCHW88, expected=5 or 4, got=%zu", src.ndim); @@ -1042,10 +1043,10 @@ void ConvolutionBackwardData::deduce_dtype(DType filter, DType diff, } megdnn_assert(param().compute_mode != Param::ComputeMode::FLOAT32 #if !MEGDNN_DISABLE_FLOAT16 - || filter.enumv() == DTypeEnum::Float16 - || filter.enumv() == DTypeEnum::BFloat16 + || filter.enumv() == DTypeEnum::Float16 || + filter.enumv() == DTypeEnum::BFloat16 #endif - , + , "ComputeMode::FLOAT32 is only available for Float16/BFloat16 " "input / output."); } @@ -1096,6 +1097,24 @@ void ConvolutionBackwardData::deduce_layout(const TensorLayout& filter, diff[i + src_or_dst_spatial_start], cflt.dilated_spatial[i], cflt.stride[i], cflt.padding[i]); } + } else if (param().format == Param::Format::NCHW4) 
{ + megdnn_assert(diff.ndim == 5, + "valid diff ndim for NCHW4, expected=5, got=%zu", + diff.ndim); + megdnn_assert(cflt.group == 1, "%s", errmsg().c_str()); + megdnn_assert(cflt.ocpg * cflt.group == diff[1] * 4, "%s", + errmsg().c_str()); + grad.ndim = diff.ndim; + grad[0] = diff[0]; + auto ic = cflt.icpg * cflt.group; + megdnn_assert(ic % 4 == 0); + grad[1] = ic / 4; + grad[2] = deduce(diff[2], cflt.dilated_spatial[0], cflt.stride[0], + cflt.padding[0]); + grad[3] = deduce(diff[3], cflt.dilated_spatial[1], cflt.stride[1], + cflt.padding[1]); + megdnn_assert(diff[4] == 4); + grad[4] = 4; } else { megdnn_assert(param().format == Param::Format::NHWCD4); megdnn_assert(diff.ndim == 5, diff --git a/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu b/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu index 667b5771d..61921a219 100644 --- a/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu +++ b/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu @@ -62,22 +62,21 @@ void megdnn::cuda::cutlass_wrapper:: threadblock_k_>; \ using WarpShape = cutlass::gemm::GemmShape; \ using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; \ - using Convolution = cutlass::convolution::device::Convolution< \ + using Convolution = cutlass::conv::device::Convolution< \ int8_t, cutlass::layout::TensorNCxHWx<32>, int8_t, \ cutlass::layout::TensorCxRSKx<32>, ElementOutput, \ cutlass::layout::TensorNCxHWx<32>, int32_t, \ cutlass::layout::TensorNCxHWx<32>, int32_t, \ - cutlass::convolution::ConvType::kConvolution, \ + cutlass::conv::ConvType::kConvolution, \ cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, \ ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ - cutlass::convolution::threadblock:: \ - ConvolutionNCxHWxThreadblockSwizzle< \ - cutlass::convolution::ConvType::kConvolution>, \ + cutlass::conv::threadblock:: \ + ConvolutionFpropNCxHWxThreadblockSwizzle, \ 2, 16, 16, NeedLoadFromConstMem>; \ - typename Convolution::ConvolutionParameter conv_param{ \ - 
param.n, param.ci, param.co, param.hi, param.wi, \ - param.fh, param.fw, param.ho, param.wo, param.sh, \ - param.sw, param.ph, param.pw, 1, 1}; \ + typename Convolution::ConvolutionParameter conv_param( \ + param.n, param.hi, param.wi, param.ci, param.co, param.fh, \ + param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \ + param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation); \ return cutlass_convolution_wrapper( \ d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \ epilogue, stream); \ @@ -186,22 +185,21 @@ void megdnn::cuda::cutlass_wrapper:: threadblock_k_>; \ using WarpShape = cutlass::gemm::GemmShape; \ using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; \ - using Convolution = cutlass::convolution::device::Convolution< \ + using Convolution = cutlass::conv::device::Convolution< \ int8_t, cutlass::layout::TensorNCxHWx<32>, int8_t, \ cutlass::layout::TensorCxRSKx<32>, ElementOutput, \ cutlass::layout::TensorNCxHWx<4>, int32_t, \ cutlass::layout::TensorNCxHWx<4>, int32_t, \ - cutlass::convolution::ConvType::kConvolution, \ + cutlass::conv::ConvType::kConvolution, \ cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, \ ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ - cutlass::convolution::threadblock:: \ - ConvolutionNCxHWxThreadblockSwizzle< \ - cutlass::convolution::ConvType::kConvolution>, \ + cutlass::conv::threadblock:: \ + ConvolutionFpropNCxHWxThreadblockSwizzle, \ 2, 16, 16, NeedLoadFromConstMem>; \ - typename Convolution::ConvolutionParameter conv_param{ \ - param.n, param.ci, param.co, param.hi, param.wi, \ - param.fh, param.fw, param.ho, param.wo, param.sh, \ - param.sw, param.ph, param.pw, 1, 1}; \ + typename Convolution::ConvolutionParameter conv_param( \ + param.n, param.hi, param.wi, param.ci, param.co, param.fh, \ + param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \ + param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation); \ return cutlass_convolution_wrapper( \ d_src, d_filter, d_bias, 
d_z, d_dst, workspace, conv_param, \ epilogue, stream); \ @@ -311,22 +309,21 @@ void megdnn::cuda::cutlass_wrapper:: threadblock_k_>; \ using WarpShape = cutlass::gemm::GemmShape; \ using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; \ - using Convolution = cutlass::convolution::device::Convolution< \ + using Convolution = cutlass::conv::device::Convolution< \ int8_t, cutlass::layout::TensorNCxHWx<4>, int8_t, \ cutlass::layout::TensorCxRSKx<4>, ElementOutput, \ cutlass::layout::TensorNCxHWx<4>, int32_t, \ cutlass::layout::TensorNCxHWx<4>, int32_t, \ - cutlass::convolution::ConvType::kConvolution, \ + cutlass::conv::ConvType::kConvolution, \ cutlass::arch::OpClassSimt, cutlass::arch::Sm61, \ ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ - cutlass::convolution::threadblock:: \ - ConvolutionNCxHWxThreadblockSwizzle< \ - cutlass::convolution::ConvType::kConvolution>, \ + cutlass::conv::threadblock:: \ + ConvolutionFpropNCxHWxThreadblockSwizzle, \ stage_, 4, aligned_, NeedLoadFromConstMem>; \ - typename Convolution::ConvolutionParameter conv_param{ \ - param.n, param.ci, param.co, param.hi, param.wi, \ - param.fh, param.fw, param.ho, param.wo, param.sh, \ - param.sw, param.ph, param.pw, 1, 1}; \ + typename Convolution::ConvolutionParameter conv_param( \ + param.n, param.hi, param.wi, param.ci, param.co, param.fh, \ + param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \ + param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation); \ return cutlass_convolution_wrapper( \ d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \ epilogue, stream); \ @@ -441,23 +438,22 @@ void megdnn::cuda::cutlass_wrapper:: threadblock_k_>; \ using WarpShape = cutlass::gemm::GemmShape; \ using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; \ - using Convolution = cutlass::convolution::device::Convolution< \ + using Convolution = cutlass::conv::device::Convolution< \ int8_t, cutlass::layout::TensorNCxHWx<4>, int8_t, \ cutlass::layout::TensorCxRSKx<4>, 
ElementOutput, \ cutlass::layout::TensorNCHW, float, \ cutlass::layout::TensorNCHW, int32_t, \ - cutlass::convolution::ConvType::kConvolution, \ + cutlass::conv::ConvType::kConvolution, \ cutlass::arch::OpClassSimt, cutlass::arch::Sm61, \ ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ - cutlass::convolution::threadblock:: \ - ConvolutionNCxHWxThreadblockSwizzle< \ - cutlass::convolution::ConvType::kConvolution>, \ + cutlass::conv::threadblock:: \ + ConvolutionFpropNCxHWxThreadblockSwizzle, \ stages_, 4, aligned_, NeedLoadFromConstMem, \ cutlass::arch::OpMultiplyAdd>; \ - typename Convolution::ConvolutionParameter conv_param{ \ - param.n, param.ci, param.co, param.hi, param.wi, \ - param.fh, param.fw, param.ho, param.wo, param.sh, \ - param.sw, param.ph, param.pw, 1, 1}; \ + typename Convolution::ConvolutionParameter conv_param( \ + param.n, param.hi, param.wi, param.ci, param.co, param.fh, \ + param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \ + param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation); \ return cutlass_convolution_wrapper( \ d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \ epilogue, stream); \ @@ -572,36 +568,35 @@ void megdnn::cuda::cutlass_wrapper:: threadblock_k_>; \ using WarpShape = cutlass::gemm::GemmShape; \ using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; \ - using Convolution = cutlass::convolution::device::Convolution< \ + using Convolution = cutlass::conv::device::Convolution< \ int8_t, cutlass::layout::TensorNCxHWx<4>, int8_t, \ cutlass::layout::TensorCxRSKx<4>, ElementOutput, \ cutlass::layout::TensorNCxHWx<32>, int32_t, \ cutlass::layout::TensorNCxHWx<32>, int32_t, \ - cutlass::convolution::ConvType::kConvolution, \ + cutlass::conv::ConvType::kConvolution, \ cutlass::arch::OpClassSimt, cutlass::arch::Sm61, \ ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ - cutlass::convolution::threadblock:: \ - ConvolutionNCxHWxThreadblockSwizzle< \ - 
cutlass::convolution::ConvType::kConvolution>, \ + cutlass::conv::threadblock:: \ + ConvolutionFpropNCxHWxThreadblockSwizzle, \ stages_, 4, aligned_, NeedLoadFromConstMem>; \ - typename Convolution::ConvolutionParameter conv_param{ \ - param.n, param.ci, param.co, param.hi, param.wi, \ - param.fh, param.fw, param.ho, param.wo, param.sh, \ - param.sw, param.ph, param.pw, 1, 1}; \ + typename Convolution::ConvolutionParameter conv_param( \ + param.n, param.hi, param.wi, param.ci, param.co, param.fh, \ + param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \ + param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation); \ return cutlass_convolution_wrapper( \ d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \ epilogue, stream); \ } #define DISPATCH_KERNEL \ - DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 128, 32, 64, 32, 32, 2, 16); \ - DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 32, 64, 32, 32, 2, 16); \ - DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 32, 64, 32, 32, 2, 16); \ - DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 32, 64, 32, 32, 2, 16); \ - DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 128, 32, 32, 64, 32, 2, 16); \ - DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 64, 32, 64, 32, 32, 2, 16); \ - DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 64, 32, 32, 64, 32, 2, 16); \ - DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 32, 32, 64, 32, 32, 2, 16); \ - DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 32, 32, 32, 32, 32, 2, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 128, 32, 64, 32, 32, 2, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 32, 64, 32, 32, 2, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 32, 64, 32, 32, 2, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 32, 64, 32, 32, 2, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 128, 32, 32, 64, 32, 2, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 64, 32, 64, 32, 32, 2, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 64, 32, 32, 64, 32, 2, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 32, 32, 64, 32, 32, 2, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 32, 32, 32, 32, 
32, 2, 16); \ megdnn_assert(false, \ "unsupported threadblock shape (%dx%dx%d) and warp shape " \ "(%dx%dx%d)", \ diff --git a/dnn/src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl b/dnn/src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl index 31b9c40b9..cf20f616d 100644 --- a/dnn/src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl +++ b/dnn/src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl @@ -29,28 +29,30 @@ void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( cudaStream_t stream) { typename Convolution::TensorRefSrc tensor_src{ const_cast(d_src), - Convolution::LayoutSrc::packed({conv_param.n(), conv_param.hi(), - conv_param.wi(), conv_param.ci()})}; + Convolution::LayoutSrc::packed( + {conv_param.N, conv_param.H, conv_param.W, conv_param.C})}; typename Convolution::TensorRefFilter tensor_filter{ const_cast(d_filter), - Convolution::LayoutFilter::packed({conv_param.co(), conv_param.fh(), - conv_param.fw(), - conv_param.ci()})}; + Convolution::LayoutFilter::packed( + {conv_param.K, conv_param.R, conv_param.S, conv_param.C})}; typename Convolution::TensorRefBias tensor_bias{ const_cast(d_bias), - Convolution::LayoutBias::packed({1, 1, 1, conv_param.co()})}; + Convolution::LayoutBias::packed({1, 1, 1, conv_param.K})}; typename Convolution::TensorRefDst tensor_z{ const_cast(d_z), - Convolution::LayoutDst::packed({conv_param.n(), conv_param.ho(), - conv_param.wo(), conv_param.co()})}; + Convolution::LayoutDst::packed( + {conv_param.N, conv_param.P, conv_param.Q, conv_param.K})}; typename Convolution::TensorRefDst tensor_dst{ d_dst, - Convolution::LayoutDst::packed({conv_param.n(), conv_param.ho(), - conv_param.wo(), conv_param.co()})}; - typename Convolution::Arguments arguments{ - conv_param, tensor_src, tensor_filter, - tensor_bias, tensor_z, tensor_dst.non_const_ref(), - epilogue}; + Convolution::LayoutDst::packed( + {conv_param.N, conv_param.P, 
conv_param.Q, conv_param.K})}; + typename Convolution::Arguments arguments{conv_param, + tensor_src.non_const_ref(), + tensor_filter.non_const_ref(), + tensor_bias.non_const_ref(), + tensor_z.non_const_ref(), + tensor_dst.non_const_ref(), + epilogue}; Convolution conv_op; cutlass_check(conv_op.initialize(arguments, workspace)); cutlass_check(conv_op(stream)); diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_hswish.cu index 4f0eebb74b659460ee9929c44833da05abb82de9..aede9980adb36d3caa3e4acf8383c11af39132d9 100644 GIT binary patch delta 54 zcmey(w}Wqk9rI*Y=I+VYndeTfW|5M0D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-TZ)MHWL6oj}VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD3la!-oVnq1OP1$I}HE; diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_id.cu index b92f7681e340f99238674782187829367a5dcbaf..57f38b13b383bcb954ea374c6bba84d36996fe6e 100644 GIT binary patch delta 56 zcmey$w~23q1@mNk=I+TCndiz|S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWV)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! 
RC>WaQD3la!UdPhG1OO)mI{^Ry diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_hswish.cu index 30eec1369454a5e7c09e70ece0ab6a1801f8019d..4121d19ad3adb33749522c1beba9fd274206eb25 100644 GIT binary patch delta 54 zcmeyxx1Dc;E%Rg-=I+VYnCDKeVv&+{D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-F%;AHWL6nauq26 delta 164 zcmdna_ls|XEi-$5PH9PIe%@qT=59Fq1oPZ_sJN9?Nk&m>VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD3la!UeD6O1OO@uI|l#& diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_id.cu index 1307f93751e6255a7c323df7416f8a6fa5914179..ebc98a406aea46f81ea813f9ec5276a82a52c20b 100644 GIT binary patch delta 56 zcmeyuw~=pyIrC&Y=I+TCnCHq{S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRW@~ delta 104 zcmdnU_l0kRIWv2HPH9PIe%@qr=59Fq5c6C(JDx>qavh6=u??335G0qD$HS=UA=I+T?nCF^XS(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD3la!Udz(K1OOxeI{N?s diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_hswish.cu index dbf1e900ff5a8301db7e1857f1ee03646ed0e757..9fbad0d37c0b533e6ce61591cab60f17ce2b17e1 100644 GIT binary patch delta 54 zcmeyxx1Dc;E%Rg-=I+VYnCDKeVv&+{D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-F%;AHWL6nauq26 delta 164 zcmdna_ls|XEi-$5PH9PIe%@qT=59Fq1oPZ_sJN9?Nk&m>VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! 
RC>WaQD3la!UeD6O1OO@uI|l#& diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_id.cu index 1ef223f9670095bb94b07cf08483f3534dbe4395..9fd33cba3fb2c1f5d1cdf2a2ae6595ea400bea05 100644 GIT binary patch delta 56 zcmeyuw~=pyIrC&Y=I+TCnCHq{S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRW@~ delta 104 zcmdnU_l0kRIWv2HPH9PIe%@qr=59Fq5c6C(JDx>qavh6=u??335G0qD$HS=UA=I+T?nCF^XS(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD3la!Udz(K1OOxeI{N?s diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_hswish.cu index 0c779fe15d9856d56b82073d75e6c284fdf267af..820c97570893a0729537a37742e4c802a5cb793f 100644 GIT binary patch delta 54 zcmeyxx1Dc;9rI*Y=I+VYndeTfW|5M0D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-TZ)M783wIBNZ(G delta 164 zcmdna_ls|X9W#4=PH9PIe%@p|=59FqB=g*QsJN9?Nk&m>VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C>L{4# RC|KwyloXY2-oVn%1OP49J5K-r diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_id.cu index 85fdecd79fda6911989011249fdb9b5c2f050410..f3d1a46305aafd269b66388742a92fe20600b16e 100644 GIT binary patch delta 56 zcmeyuw~=py1@mNk=I+TCndiz|S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRW$4fAAY=I+T?ndh2YS(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C>L{4# RC|KwyloXY2UdPhT1OO+^J3{~f diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_hswish.cu 
b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_hswish.cu index 931f3716f967cee115a607de08e4be4fd838a63c..a952578d9a877f0c35c54d2c17f2999c40e53efd 100644 GIT binary patch delta 56 zcmeyyx0!E)74u|A=I+Utndiz|S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8B9kb pi-fTamjVzZmzLxt78hGtB_pIEYC|duQmw4Ak+j)up2;$c2>`m7FAo3! diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_id.cu index 70949c6586f2b6197e02ffe55433a5c1c71dd010..da9f49855f4cb2c261e11ffe9f95f25e9b66893b 100644 GIT binary patch delta 54 zcmeysx0Y{%3G-xY=I+U7ndeR}WRa3}D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-F%H@783w70Tkx| delta 105 zcmZ3>_knMN2{U_sPH9PIe%@pg=59E9Kl5BTJBCHd$j`aLBfKIcqbM~oB`GIAIXk#K nv#Kg5)rLy}2$D-nauSP+t*nys^U7ex?Pr!?wA(zHWfl_v4xuJs diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_relu.cu index 0aaa3f7a6f536dcf73c202bb3883f0fb90978701..065b70429f09de343662103a570412b5d34afaa2 100644 GIT binary patch delta 56 zcmeyuw~=py1@mNk=I+TCndiz|S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD5NEB-oVny1OP3MI`IGi diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_id.cu index e20f26dc91a6755403d95deb83b3693f319c1070..187f7b96c1ea0f66929b21c6dd922f9f366318ef 100644 GIT binary patch delta 56 zcmeyyx0!E)1@mNk=I+TCndiz|S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWV)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! 
RC>WaQD5NEBUdPhO1OO+6I^_TW diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_hswish.cu index ade39f7a1bfc9744876dcd36e25d04f061081734..4d7d9337ba98f5abba28d5e0aa940abc89b39940 100644 GIT binary patch delta 54 zcmey(w}WqkE%Rg-=I+VYnCDKeVv&+{D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-F%;A4if-9-W4nW delta 164 zcmdnN_nU8nEi-$5PH9PIe%@qT=59Fq1oPZ_sJN9?Nk&m>VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD5NEBUeD6W1OO_EI_m%c diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_id.cu index e07c6dcf2f49ddfe1cd0d46b81cc6b06d63e3974..aeafaf1593745fdb37dea67a22e8757861dc6659 100644 GIT binary patch delta 56 zcmey$w~23qIrC&Y=I+TCnCHq{S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWqavh6=u??335G0qDV)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD5NEBUdz(S1OOy}I^O^Q diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_hswish.cu index 3e38529ba083a97153b3c79534bda2693148f22b..66a31a7f2d8f55de987260bb6cc9983aa3643812 100644 GIT binary patch delta 54 zcmey(w}WqkE%Rg-=I+VYnCDKeVv&+{D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-F%;A4if-9-W4nW delta 164 zcmdnN_nU8nEi-$5PH9PIe%@qT=59Fq1oPZ_sJN9?Nk&m>VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! 
RC>WaQD5NEBUeD6W1OO_EI_m%c diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_id.cu index 1b3145c4e6255e3096048c20ebf6adaf953124ed..ca0cf04c47b412b769b3a6e722af19e068dd36c9 100644 GIT binary patch delta 56 zcmey$w~23qIrC&Y=I+TCnCHq{S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWqavh6=u??335G0qDV)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD5NEBUdz(S1OOy}I^O^Q diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_hswish.cu index 14226ecd3b901e9aac04d5444f25e7ac6114efdb..725fed7fb736ae1901dfb7a24144c5961c34cc4a 100644 GIT binary patch delta 54 zcmey(w}Wqk9rI*Y=I+VYndeTfW|5M0D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-TZ)MHWL6oj}VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C>L{4# RC|Kwyq$TET-oVnq1OP4qJ1PJG diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_id.cu index 449dee42ef976533a07d19f702196fbe9a19ec54..e7688245da55156e422aaedfb50952d420b8e8d9 100644 GIT binary patch delta 56 zcmey$w~23q1@mNk=I+TCndiz|S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWV)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C>L{4# RC|Kwyq$TETUdPhG1OO-aJ01W4 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_hswish.cu index 
4d3feb12372ce71dfdfda70aa3eb904cdff55ca2..aa5ecf003a0f165b7e44cdfb67e4f5484bb2432f 100644 GIT binary patch delta 56 zcmey)w}o$m74u|A=I+Utndiz|S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8B9kb pi-fTamjVzZmzLxt78hGtB_pIEYC|duQmw4Ak+j)up2;$s2>`p4FA@L% diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_id.cu index 1dbfbf60739a530a23363f3698e3baf110e52ffb..0a1c1d3a54d987b53c70a6a54f6b527dcc52336f 100644 GIT binary patch delta 54 zcmey!w~lXv3G-xY=I+U7ndeR}WRa3}D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-F%H@HWL6dZ4~MN delta 105 zcmZ3-_mOXd2{U_sPH9PIe%@pg=59E9Kl5BTJBCHd$j`aLBfKIcqbM~oB`GIAIXk#K nv#Kg5)rLy}2$D-nauSP+t*nys^U7ex?Pr!?wA(zHWi}H44|67C diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_relu.cu index d4fcfb061335b0cdcee7a2f03f504dc0d775fb4c..a61a466ae3d904d36cef24d4c1e79581e8a87e20 100644 GIT binary patch delta 56 zcmey$w~23q1@mNk=I+TCndiz|S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD5NEBUeD6W1OO_EI_m%c diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_id.cu index 4d9abe294e0187ce1a4e4b147c797a7c0ad862ca..7237f2de387d49308dcb46bf6216258dbe39b04e 100644 GIT binary patch delta 56 zcmey$w~23qIrC&Y=I+TCnCHq{S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWqavh6=u??335G0qDV)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! 
RC>WaQD5NEBUdz(S1OOy}I^O^Q diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_hswish.cu index fb6b71da008280971ad7e89c8b6f30bd341704dc..27ad631e576d40591409cfc0818e2bca4791f970 100644 GIT binary patch delta 76 zcmeyxx1Dc;4fAAY=I+T?ndh2YS(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD5NEBUdPhO1OO+6I^_TW diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_id.cu index c38538c325ddd0810767ae25c67d36ffaf3dc83f..ddc61a512ebb176a428938093384cbf45d70d02e 100644 GIT binary patch delta 56 zcmeyuw~=py8S`XY=I+Vondiz|S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRW<9%PnawA(zDWeyVnC)^$n diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_relu.cu index b91efded3aa1b20c3ee78cedc181d72516c11c22..c5aaec6f49ca9f34ce39c2b8f9330fe894d6a980 100644 GIT binary patch delta 56 zcmeytx0P>$74u|A=I+Utndiz|S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8B9kb pi-fTamjVzZmzLxt78hGtB_pIEYC|duQmw4Ak+j)up2;$Y2>`s1FBJd) diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_hswish.cu index c877492e5be9a96a815876f51ee53353b2744090..a5d437a23ef8b72c79d250c928789cd383b6d45f 100644 GIT binary patch delta 76 zcmeyxx1Dc;4fAAY=I+T?ndh2YS(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! 
RC>WaQD5NEBUdPhO1OO+6I^_TW diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_id.cu index 40e6cb5d360682e04b42b29f83385e5500fa0805..d16c179b229532ef3272773b63b12bd06ffe9dad 100644 GIT binary patch delta 56 zcmeyuw~=py8S`XY=I+Vondiz|S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRW<9%PnawA(zDWeyVnC)^$n diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_relu.cu index 19f337a174bbaca1445e471cce1360e7255d40f9..e5968cf52aac02e3ffb410b2ec61534de5bf7ec9 100644 GIT binary patch delta 56 zcmeytx0P>$74u|A=I+Utndiz|S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8B9kb pi-fTamjVzZmzLxt78hGtB_pIEYC|duQmw4Ak+j)up2;$Y2>`s1FBJd) diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_hswish.cu index 27f7d5b869a4aeb61e56eedc69709b6960ffe8ce..61faa3e522870de1b9d88cd06cb52d28e957b182 100644 GIT binary patch delta 54 zcmey(w}WqkE%Rg-=I+VYnCDKeVv&+{D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-F%;A4if-9-W4nW delta 164 zcmdnN_nU8nEi-$5PH9PIe%@qT=59Fq1oPZ_sJN9?Nk&m>VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! 
RC>WaQD5NEBUeD6W1OO_EI_m%c diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_id.cu index 1a8e17fd6c533b32b14cc3f08b48993a79a50f83..b5177e0106608398e3096328cee966c9234dd578 100644 GIT binary patch delta 56 zcmey$w~23qIrC&Y=I+TCnCHq{S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWqavh6=u??335G0qDV)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD5NEBUdz(S1OOy}I^O^Q diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_hswish.cu index 252a8f779310bdc004f1c3095a0853b8d67b4fd0..79d9c0f8efe456cd55466f6af1360f22c06b8213 100644 GIT binary patch delta 76 zcmeyxx1Dc;4fAAY=I+T?ndh2YS(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! 
RC>WaQD5NEBUdPhO1OO+6I^_TW diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_id.cu index f5d01d75ad737eb3fcf870a47f8b7b59ce3f8df4..cc5c002637aadf653eebbd5ed9abcae67eefbcdc 100644 GIT binary patch delta 56 zcmeyuw~=py8S`XY=I+Vondiz|S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRW<9%PnawA(zDWeyVnC)^$n diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_relu.cu index a29be07e1d171b1d9084f9cfcb570612c32daf55..3a71ccf0f44f0682269252d878ad3b5e49d6e395 100644 GIT binary patch delta 56 zcmeytx0P>$74u|A=I+Utndiz|S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8B9kb pi-fTamjVzZmzLxt78hGtB_pIEYC|duQmw4Ak+j)up2;$Y2>`s1FBJd) diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_hswish.cu index 07333503fbc32c9874ace0c5a14b6d84a72d1822..de8806d6b4ef0ccecabc863eee761817ee4a9c84 100644 GIT binary patch delta 76 zcmeyxx1Dc;4fAAY=I+T?ndh2YS(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! 
RC>WaQD5NEBUdPhO1OO+6I^_TW diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_id.cu index 41f6db050537c7733e5eb259ac9a698b24cbddff..b5e3a342357167bd407ea6b9ebd58850e8672905 100644 GIT binary patch delta 56 zcmeyuw~=py8S`XY=I+Vondiz|S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRW<9%PnawA(zDWeyVnC)^$n diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_relu.cu index 34896baf1e5431ae8432e785b9f2ab05a6ded26b..2243305e9e1ecedae9d9444425172f5e1c6dce6b 100644 GIT binary patch delta 56 zcmeytx0P>$74u|A=I+Utndiz|S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8B9kb pi-fTamjVzZmzLxt78hGtB_pIEYC|duQmw4Ak+j)up2;$Y2>`s1FBJd) diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_hswish.cu index 5f247f0ccff0ff84ae3568364c643aaf427ba20b..6110868bf0e723daa9b163b8eb617c2ed7b5c60e 100644 GIT binary patch delta 54 zcmeyxx1Dc;E%Rg-=I+VYnCDKeVv&+{D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-F%;AHWL6nauq26 delta 164 zcmdna_ls|XEi-$5PH9PIe%@qT=59Fq1oPZ_sJN9?Nk&m>VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! 
RC>WaQD3la!UeD6O1OO@uI|l#& diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_id.cu index 81cdd18cde0f493047267827589d9f7f00d4164a..0e5c018e8cd57ee3cfeed4a3fafc59ea3ca5be1e 100644 GIT binary patch delta 56 zcmeyuw~=pyIrC&Y=I+TCnCHq{S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRW@~ delta 104 zcmdnU_l0kRIWv2HPH9PIe%@qr=59Fq5c6C(JDx>qavh6=u??335G0qD$HS=UA=I+T?nCF^XS(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD3la!Udz(K1OOxeI{N?s diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_hswish.cu index 1ffdb3c4872a2d8d7e3658362ffd560d3ae8313d..012b70b064cb8dfac7a73dd73efa3bd1e5ac81c5 100644 GIT binary patch delta 76 zcmey#w~cRu4fAAY=I+T?ndh2YS(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! 
RC>WaQD3la!UdPhG1OO)mI{^Ry diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_id.cu index bf0fdcff5ca0517944a7ea3e032b4e31b8cfce5a..aff240c47cdeeca03641591b8290b82c8af1cb91 100644 GIT binary patch delta 56 zcmey&w}Eei8S`XY=I+Vondiz|S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRW<9%PnawA(zDWi}H4Co&!k diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_relu.cu index bb3a17ba68ac931d5f3256443598c30e5b49c5be..c101cd36ce39e2709ad49d82b79c41f9ecb8b253 100644 GIT binary patch delta 56 zcmey)w}o$m74u|A=I+Utndiz|S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8B9kb pi-fTamjVzZmzLxt78hGtB_pIEYC|duQmw4Ak+j)up2;$s2>`p4FA@L% diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_hswish.cu index 9df0b4f40ce5b18810253283b0238825a21b3f00..487a06fb97a0ca8024fb8b670d69ef2a24b96b96 100644 GIT binary patch delta 76 zcmey#w~cRu4fAAY=I+T?ndh2YS(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! 
RC>WaQD3la!UdPhG1OO)mI{^Ry diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_id.cu index c0da1d27367253dca4f67cc6aced74da2b8f7874..450fd723d8955ef91bb10a5cd3184517db4e4b22 100644 GIT binary patch delta 56 zcmey&w}Eei8S`XY=I+Vondiz|S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRW<9%PnawA(zDWi}H4Co&!k diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_relu.cu index 309aa23941b716f9b538f189e1182e4ca3e4f532..efc249c2d471ecb0c46851b4860f199c56519fcb 100644 GIT binary patch delta 56 zcmey)w}o$m74u|A=I+Utndiz|S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8B9kb pi-fTamjVzZmzLxt78hGtB_pIEYC|duQmw4Ak+j)up2;$s2>`p4FA@L% diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_hswish.cu index 210c52e07b76f0c45b10369fdf7fcdf6a0de6785..8503592dfbd119060b78ad5dadbecf6c5145881d 100644 GIT binary patch delta 54 zcmeyxx1Dc;E%Rg-=I+VYnCDKeVv&+{D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-F%;AHWL6nauq26 delta 164 zcmdna_ls|XEi-$5PH9PIe%@qT=59Fq1oPZ_sJN9?Nk&m>VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! 
RC>WaQD3la!UeD6O1OO@uI|l#& diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_id.cu index 4f4f35f8410cff630615d9bd99b9affcf1c6329a..2d9f42b02acf5bca792b89a544b8477a3e9d0b94 100644 GIT binary patch delta 56 zcmeyuw~=pyIrC&Y=I+TCnCHq{S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRW@~ delta 104 zcmdnU_l0kRIWv2HPH9PIe%@qr=59Fq5c6C(JDx>qavh6=u??335G0qD$HS=UA=I+T?nCF^XS(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD3la!Udz(K1OOxeI{N?s diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_hswish.cu index 224e8a63670d2bdc4e298efb6ce5ffd8bcc8ed58..5c473b18408de1b367d724d2b20e4c07019a636a 100644 GIT binary patch delta 76 zcmey#w~cRu4fAAY=I+T?ndh2YS(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! 
RC>WaQD3la!UdPhG1OO)mI{^Ry diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_id.cu index 557610101842ffb4c909c3513141d3cef1dd125e..31a15cc73ad7e93278a0448c8a53b576199d21fb 100644 GIT binary patch delta 56 zcmey&w}Eei8S`XY=I+Vondiz|S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRW<9%PnawA(zDWi}H4Co&!k diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_relu.cu index 170f197105f9a19d953030df5143e806ef18b2e3..fc86fa43585df6bb7a54364bae7c93684b20ebe7 100644 GIT binary patch delta 56 zcmey)w}o$m74u|A=I+Utndiz|S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8B9kb pi-fTamjVzZmzLxt78hGtB_pIEYC|duQmw4Ak+j)up2;$s2>`p4FA@L% diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_hswish.cu index 03cbfbc5921f85b9674b867c2890b785fd8d104b..2f66a3b3dbc0b30bd4bca69291b11577b66885ee 100644 GIT binary patch delta 76 zcmey#w~cRu4fAAY=I+T?ndh2YS(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! 
RC>WaQD3la!UdPhG1OO)mI{^Ry diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_id.cu index 4698d99f6739fd8927e49591ad6ef8cfa3c1d4cd..7da1dd3ffc0ee98e78abe64a418463fd91989a28 100644 GIT binary patch delta 56 zcmey&w}Eei8S`XY=I+Vondiz|S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRW<9%PnawA(zDWi}H4Co&!k diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_relu.cu index 38358f7ead2b030d7be39a69b27c36490d64ccf8..199c133d590acc9429bcddd3d835abaff3ab5fa0 100644 GIT binary patch delta 56 zcmey)w}o$m74u|A=I+Utndiz|S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8B9kb pi-fTamjVzZmzLxt78hGtB_pIEYC|duQmw4Ak+j)up2;$s2>`p4FA@L% diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_hswish.cu index b2c9c4628f4f1084d10517b099b89824b4beb5c9..b17a642029f69358ddc6da430132cf6e03f85e8d 100644 GIT binary patch delta 54 zcmeyvx07#!J@aHY=I+TinCDKeVUdz`D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-TaVcHWL6ptQ9u^ delta 145 zcmdnV_lIwTJu`cLPH9PIe%@qz=59Fq6!Y91sJN9?Nk&m>VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG92Bp2sqq2>|{S BHV)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! 
RC>WaQD3la!UeD6O1OO@uI|l#& diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_hswish.cu index d1c44bec4e93537c1c512eec4b703cbd8decf0ad..c854ef867270e133745a3c585ad251f1e1def372 100644 GIT binary patch delta 54 zcmey(w}Wqk9rI*Y=I+VYndeTfW|5M0D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-TZ)MHWL6oj}VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD3la!-oVnq1OP1$I}HE; diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_id.cu index 5158b527d76fc935e73de1de686870785afc514e..1eb4a3aacb3957b7fbba5076fb12c31f15e43f86 100644 GIT binary patch delta 56 zcmey$w~23q1@mNk=I+TCndiz|S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWV)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD3la!UdPhG1OO)mI{^Ry diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_hswish.cu index b28f68247219e07bca34c94fcb166aacd470fab4..d07b72e9f4a2028acd951bd94239acff9ff305ba 100644 GIT binary patch delta 54 zcmey(w}Wqk9rI*Y=I+VYndeTfW|5M0D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-TZ)MHWL6oj}VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! 
RC>WaQD3la!-oVnq1OP1$I}HE; diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_id.cu index f106af213e1889d6744bb87bfc84f268b48ee5bf..c9e2b4494452709d53d52433237c3be69b2f148b 100644 GIT binary patch delta 56 zcmey$w~23q1@mNk=I+TCndiz|S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWV)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD3la!UdPhG1OO)mI{^Ry diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_hswish.cu index 130b04b863e3fa754e9fb4b24127f227e3c05b27..8f0745635ea3dd38ec6f5f122bcf7838c6d879cf 100644 GIT binary patch delta 54 zcmey%w~KFsJ@aHY=I+TinCDKeVUdz`D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-TaVc4if-C7!^7I delta 145 zcmdnR_m^*jJu`cLPH9PIe%@qz=59Fq6!Y91sJN9?Nk&m>VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG92Bp2sqW2>|~# BHV)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD5NEBUeD6W1OO_EI_m%c diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_hswish.cu index 716a540b711bef86ac13934b1de12ba137433287..670b6403db995d43b3ef3adf60c28718c086edef 100644 GIT binary patch delta 54 zcmeyvx07#!9rI*Y=I+VYndeTfW|5M0D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-TZ)M4if-A`xP?) delta 164 zcmdnV_lIwT9W#4=PH9PIe%@p|=59FqB=g*QsJN9?Nk&m>VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! 
RC>WaQD5NEB-oVny1OP3MI`IGi diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_id.cu index 35b97bcda599aed2b2404d4968942c8f36d5f7b4..5a85818a9e8acffe680c433a7e49999aa77b882f 100644 GIT binary patch delta 56 zcmeyyx0!E)1@mNk=I+TCndiz|S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWV)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD5NEBUdPhO1OO+6I^_TW diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_hswish.cu index 3681959837ff5df0c0583c4b36c43592099f1aab..0d5fdc72cabf6bcbb6ac96ed3b2b7a9615a3ba84 100644 GIT binary patch delta 54 zcmeyvx07#!9rI*Y=I+VYndeTfW|5M0D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-TZ)M4if-A`xP?) delta 164 zcmdnV_lIwT9W#4=PH9PIe%@p|=59FqB=g*QsJN9?Nk&m>VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD5NEB-oVny1OP3MI`IGi diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_id.cu index ebeb82911b4e51fbc55541f23a10a9ab2fb6a9e5..b0f4bd6f7544fcdf756244446adbdd01e557f410 100644 GIT binary patch delta 56 zcmeyyx0!E)1@mNk=I+TCndiz|S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWV)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! 
RC>WaQD5NEBUdPhO1OO+6I^_TW diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_hswish.cu index 0cd9b194f6a68d2ccd1b833fc7b3925d87b2161a..157cd07b49bb6a3b0e19742df15f3834e18a29b9 100644 GIT binary patch delta 54 zcmeyvx07#!9rI*Y=I+VYndeTfW|5M0D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-TZ)M4if-A`xP?) delta 164 zcmdnV_lIwT9W#4=PH9PIe%@p|=59FqB=g*QsJN9?Nk&m>VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD5NEB-oVny1OP3MI`IGi diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_id.cu index 4a89381a8c71bc9e076d8028880be7f2f0788428..eb93fa6cd7d753ab18df0d0526005ebacefc8eb3 100644 GIT binary patch delta 56 zcmeyyx0!E)1@mNk=I+TCndiz|S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWV)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD5NEBUdPhO1OO+6I^_TW diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_hswish.cu index a60e3b181f2bc9f11f21596075924161e45cb4cf..99f335be139b5dffdd310147aa2137248528efac 100644 GIT binary patch delta 54 zcmey(w}WqkE%Rg-=I+VYnCDKeVv&+{D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-F%;A4if-9-W4nW delta 164 zcmdnN_nU8nEi-$5PH9PIe%@qT=59Fq1oPZ_sJN9?Nk&m>VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! 
RC>WaQD5NEBUeD6W1OO_EI_m%c diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_id.cu index 4bee34e863810b9e00e6e2eabbe6f433f11396a8..643f43caacae89bc1fd1565e500dd37a6c23e5f1 100644 GIT binary patch delta 56 zcmey$w~23qIrC&Y=I+TCnCHq{S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWqavh6=u??335G0qDV)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD5NEBUdz(S1OOy}I^O^Q diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_hswish.cu index eb4d164e9f163ad737e9d2aba221c5ca5afa2c67..b84d3f0bd87771e44975863c2644782d5d59130c 100644 GIT binary patch delta 54 zcmey(w}WqkE%Rg-=I+VYnCDKeVv&+{D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-F%;A4if-9-W4nW delta 164 zcmdnN_nU8nEi-$5PH9PIe%@qT=59Fq1oPZ_sJN9?Nk&m>VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD5NEBUeD6W1OO_EI_m%c diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_id.cu index 576e237832cdb914ddea1b42c64d27ac69be9e68..83482b72c363f7fbe53d4345996a3140ac6410d4 100644 GIT binary patch delta 56 zcmey$w~23qIrC&Y=I+TCnCHq{S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWqavh6=u??335G0qDV)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! 
RC>WaQD5NEBUdz(S1OOy}I^O^Q diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_hswish.cu index 367f6012d731606ffcd95b2905785dbdf4b7c483..6ba333db3b1351e0beca332bbbcdd32a267d75de 100644 GIT binary patch delta 54 zcmeyvx07#!9rI*Y=I+VYndeTfW|5M0D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-TZ)M4if-A`xP?) delta 164 zcmdnV_lIwT9W#4=PH9PIe%@p|=59FqB=g*QsJN9?Nk&m>VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD5NEB-oVny1OP3MI`IGi diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_id.cu index e5ba977b18faf2a326b89835afe139284cb16cd9..9fd0f3e94b15f34fbcf08f3326100e023d13b9e8 100644 GIT binary patch delta 56 zcmeyyx0!E)1@mNk=I+TCndiz|S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWV)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD5NEBUdPhO1OO+6I^_TW diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x32x32_64x32x32_hswish.cu index 8a81f58713a332d785f5e84de8641d6a5c09a08d..f3f4977aa88ab56a0396040108e6fdd2f443058f 100644 GIT binary patch delta 54 zcmey(w}WqkE%Rg-=I+VYnCDKeVv&+{D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-F%;A4if-9-W4nW delta 164 zcmdnN_nU8nEi-$5PH9PIe%@qT=59Fq1oPZ_sJN9?Nk&m>VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! 
RC>WaQD5NEBUeD6W1OO_EI_m%c diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x32x32_64x32x32_id.cu index e391dd646f6040f53bb180d2bf83e4c005adc911..dfe25c5735625432cbb7d2c6e747eb56e565a1ad 100644 GIT binary patch delta 56 zcmey$w~23qIrC&Y=I+TCnCHq{S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWqavh6=u??335G0qDV)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD5NEBUdz(S1OOy}I^O^Q diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x64x32_64x32x32_hswish.cu index 026080c88204d95408fa592b3091cc96dc6ab669..ca51165edb82bd20c2dc98dc6241bbee3ed1ce25 100644 GIT binary patch delta 54 zcmey(w}WqkE%Rg-=I+VYnCDKeVv&+{D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-F%;A4if-9-W4nW delta 164 zcmdnN_nU8nEi-$5PH9PIe%@qT=59Fq1oPZ_sJN9?Nk&m>VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD5NEBUeD6W1OO_EI_m%c diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x64x32_64x32x32_id.cu index 4deac474aac6e0e6de47e28f64325e71250240c0..eb0fcc2c3e423f10138e346c2036141f99f9cc03 100644 GIT binary patch delta 56 zcmey$w~23qIrC&Y=I+TCnCHq{S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWqavh6=u??335G0qDV)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! 
RC>WaQD5NEBUdz(S1OOy}I^O^Q diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x128x32_32x64x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x128x32_32x64x32_hswish.cu index 48036be9e0b7f96f188f1afa782f508b00914e18..c10268e670f8dec35d470e2edc488cfe3d9ec47b 100644 GIT binary patch delta 54 zcmey(w}Wqk9rI*Y=I+VYndeTfW|5M0D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-TZ)MHWL6oj}VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD3la!-oVnq1OP1$I}HE; diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x128x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x128x32_32x64x32_id.cu index 6d1a69d287cb30995e7139d4d6080ac0f842f18e..258c39ba946e20519cbdd60e3a7a066d009b51d4 100644 GIT binary patch delta 56 zcmey$w~23q1@mNk=I+TCndiz|S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWV)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD3la!UdPhG1OO)mI{^Ry diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x32x32_32x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x32x32_32x32x32_hswish.cu index 6d4a698f7f9e66e88afa75556a565dcec5b86437..e178f8b208a506604de755e44e15d58b5aa7ecb2 100644 GIT binary patch delta 54 zcmeyxx1Dc;E%Rg-=I+VYnCDKeVv&+{D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-F%;AHWL6nauq26 delta 164 zcmdna_ls|XEi-$5PH9PIe%@qT=59Fq1oPZ_sJN9?Nk&m>VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! 
RC>WaQD3la!UeD6O1OO@uI|l#& diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x32x32_32x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x32x32_32x32x32_id.cu index aeee09ac7e4bf9e66e7f8319788848958d48dd86..33ab33785e3550738d7c8ec4db87fb357b6e3e4b 100644 GIT binary patch delta 56 zcmeyuw~=pyIrC&Y=I+TCnCHq{S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRW@~ delta 104 zcmdnU_l0kRIWv2HPH9PIe%@qr=59Fq5c6C(JDx>qavh6=u??335G0qD$HS=UA=I+T?nCF^XS(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD3la!Udz(K1OOxeI{N?s diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x64x32_32x64x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x64x32_32x64x32_hswish.cu index fd235e0aab8e34cbb3112af60c674eaff31b41d5..447d3c9ed3155c4395623bd3dd079e048d2437f9 100644 GIT binary patch delta 54 zcmeyxx1Dc;E%Rg-=I+VYnCDKeVv&+{D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-F%;AHWL6nauq26 delta 164 zcmdna_ls|XEi-$5PH9PIe%@qT=59Fq1oPZ_sJN9?Nk&m>VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD3la!UeD6O1OO@uI|l#& diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x64x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x64x32_32x64x32_id.cu index ac180bd42d3ae8a8f9c97de1caae4d54199d02a2..aad75d8a98ce3c3fa6a8a838bf9a61fdfdb7d0b2 100644 GIT binary patch delta 56 zcmeyuw~=pyIrC&Y=I+TCnCHq{S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRW@~ delta 104 zcmdnU_l0kRIWv2HPH9PIe%@qr=59Fq5c6C(JDx>qavh6=u??335G0qD$HS=UA=I+T?nCF^XS(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! 
RC>WaQD3la!Udz(K1OOxeI{N?s diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x128x32_64x32x32_hswish.cu index ccaca6e8ea2fc21d02487839ff0076b2b603ea2c..88d1ec1bc1bff88e6b6697783ac6b3658bdfb5b5 100644 GIT binary patch delta 54 zcmey(w}Wqk9rI*Y=I+VYndeTfW|5M0D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-TZ)MHWL6oj}VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD3la!-oVnq1OP1$I}HE; diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x128x32_64x32x32_id.cu index 35915b26724c1ec1b1f622f34c6359e9b8f1516d..b414833fb8ad26501e044ac0deec9ba6cc974b8f 100644 GIT binary patch delta 56 zcmey$w~23q1@mNk=I+TCndiz|S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWV)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD3la!UdPhG1OO)mI{^Ry diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x32x32_64x32x32_hswish.cu index c97b5e918f1362e796388aafc53503ca016b112e..b8a5f97345a1cda9c83e7e0c0a823f020f896150 100644 GIT binary patch delta 54 zcmeyxx1Dc;E%Rg-=I+VYnCDKeVv&+{D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-F%;AHWL6nauq26 delta 164 zcmdna_ls|XEi-$5PH9PIe%@qT=59Fq1oPZ_sJN9?Nk&m>VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! 
RC>WaQD3la!UeD6O1OO@uI|l#& diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x32x32_64x32x32_id.cu index 8327475ad764906b99f5c5499abb507f208ef675..45d4a9c13a083b99c864fac5f47315b8d7225717 100644 GIT binary patch delta 56 zcmeyuw~=pyIrC&Y=I+TCnCHq{S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRW@~ delta 104 zcmdnU_l0kRIWv2HPH9PIe%@qr=59Fq5c6C(JDx>qavh6=u??335G0qD$HS=UA=I+T?nCF^XS(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD3la!Udz(K1OOxeI{N?s diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x64x32_64x32x32_hswish.cu index cabcf1ae65705db3bbe6701f06a589efed9ddbba..afb78f94fab35f6c3300588d42c3ff39e1cecd72 100644 GIT binary patch delta 54 zcmeyxx1Dc;E%Rg-=I+VYnCDKeVv&+{D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-F%;AHWL6nauq26 delta 164 zcmdna_ls|XEi-$5PH9PIe%@qT=59Fq1oPZ_sJN9?Nk&m>VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD3la!UeD6O1OO@uI|l#& diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x64x32_64x32x32_id.cu index 2f111fd96779010b8e6353ed1d8c33679a68cb46..20cd85303ec85187244f6373a0945d4f22cbaed0 100644 GIT binary patch delta 56 zcmeyuw~=pyIrC&Y=I+TCnCHq{S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRW@~ delta 104 zcmdnU_l0kRIWv2HPH9PIe%@qr=59Fq5c6C(JDx>qavh6=u??335G0qD$HS=UA=I+T?nCF^XS(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! 
RC>WaQD3la!Udz(K1OOxeI{N?s diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x128x32_64x32x32_hswish.cu index 357366755eee27f49c787f1e952fe6cd70d805d8..889ded1a86dff674d17c3cfbd447ee2807e94ac7 100644 GIT binary patch delta 54 zcmaFNw}5YhA@gJl=8nn7ndeTkAv1e^PH9PIe%@q5<_VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD3la!Ucl181OW6?I+y?e diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x128x32_64x32x32_id.cu index c447d319401b553ec8317177424a026ec544b9ee..c7e9abbad953f1f575a1924dc3878d317b8ecbd2 100644 GIT binary patch delta 56 zcmcc1H;ZqBHuGd7=8nk+ndiz|S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWV)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD3la!p2yO_1OVVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD3la!p3l<21OV|)I+6eY diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x32x32_64x32x32_id.cu index 5dc680a7104601b625c5b7e5d604e438a327484f..5ba1e043c05741f0dded11a01637be0e42b6c787 100644 GIT binary patch delta 56 zcmcb^HV)4TUj~h=auE>l$K=X=eZRWu??335G0qDV)4TUj~h=auE>l$K=X=eZRW09;rdq5uE@ delta 164 zcmbQs_keGM9y5D>PH9PIe%@p~<_VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! 
RC>WaQD3la!p3Bm}1OV$qI)(rM diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x64x32_64x32x32_hswish.cu index faa8ea1883358e766fb291f3597b24546e68c882..bec858a1a4c9d7b7d31b934c1d505b2db1f7c78a 100644 GIT binary patch delta 54 zcmaFFH=l2V0rO;Y=8nn7nCDK;Vv&+{D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-F%*9G7|tO+Z3b# delta 164 zcmbQw_lR$U0W*7kPH9PIe%@pQ<_VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD3la!p3l<21OV|)I+6eY diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x64x32_64x32x32_id.cu index 7e4273a7ab9825f6075a546ad6027b83545356f7..cd029a0c1e2e42d9e1d8dee24c7af8a0c58d20f1 100644 GIT binary patch delta 56 zcmcb^HV)4TUj~h=auE>l$K=X=eZRWu??335G0qDV)4TUj~h=auE>l$K=X=eZRW09;rdq5uE@ delta 164 zcmbQs_keGM9y5D>PH9PIe%@p~<_VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! 
RC>WaQD3la!p3Bm}1OV$qI)(rM diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_hswish.cu index 8865080e356c1386e72b302a6160bf6c88d97ebe..7c3d55322ab5080ab498433166524dec23a299cb 100644 GIT binary patch delta 54 zcmaFFH=l2VA@gJl=8nn7ndeTVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C>L{4# RC|KwyloXY2Ucge%1OW9LI@$mL diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_id.cu index 6a93ab6171cf7b4163c863df6d4bc59a588bcab7..af59cd30a84a09d6e122d2dc5be1dde2984e4f83 100644 GIT binary patch delta 56 zcmcb^HV)4TUj~h=auE>l$K=X=eZRWV)4TUj~h=auE>l$K=X=eZRW=r2qf` delta 164 zcmbQs_keGMJ~MlMPH9PIe%@q#<_VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C>L{4# RC|KwyloXY2p2t$p1OV?5I?ez9 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x64x8_16x64x8_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x64x8_16x64x8_hswish.cu index ec78da97a6961366625a51984ef9af3d5f844b3d..d8832567fce7b006537b39c02f3d2d51765ba284 100644 GIT binary patch delta 56 zcmcb|H=A#RF7sp)=8nmSndiz|S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWhGh~H09L;hHUIzs delta 126 zcmbQucaLv_E;D<6PH9PIe%@qV<_VoFj@esZ>zm2-Yx8B9kZ pi-fTamjVzZmzLxt78hGtB_pIEYC|duQmw4Ak+j)u?qr$71OSWqE{XsE diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x64x8_16x64x8_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x64x8_16x64x8_id.cu index 27df4ce87072239f0b236d7559e98958152071ea..9c5f1ec04920b8dd128718b2b2d6a615fd71aaa6 100644 GIT binary patch delta 54 
zcmcb?Hb=8nmGndeSUWRa3}D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-F%E?5)%L(Y7}As delta 105 zcmbQrcY|+(Ix~BIPH9PIe%@qt<_V)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD5NEBUcl1G1OW8YI(z^C diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x128x32_64x32x32_id.cu index 021eee4646893c907859dd2eabcb918dd8559be6..5a83822c1ae74828503f04703c4a8381945875c9 100644 GIT binary patch delta 56 zcmcb|H=A#RHuGd7=8nk+ndiz|S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWV)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD5NEBp2yP21OV>II&c60 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x32x32_64x32x32_hswish.cu index d9fa27c5b76ad06afcf38290012703e3acb01e95..3bed635e63223000f2b6bc3613454f12482b8dba 100644 GIT binary patch delta 54 zcmaFNw}5Yh0rO;Y=8nn7nCDK;Vv&+{D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-F%*93KIY+M--<3 delta 164 zcmZ3$_n2>k0W*7kPH9PIe%@pQ<_VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD5NEBp3lV)4TUj~h=auE>l$K=X=eZRWu??335G0qDV)4TUj~h=auE>l$K=X=eZRWPH9PIe%@p~<_VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! 
RC>WaQD5NEBp3Bn61OV&AI%)s_ diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x64x32_64x32x32_hswish.cu index 2a13f48cec1f12ef1bd99191a3e769e8478a05eb..1070d213b771e4ef8151e45fd958dc837c6ccf9e 100644 GIT binary patch delta 54 zcmaFNw}5Yh0rO;Y=8nn7nCDK;Vv&+{D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-F%*93KIY+M--<3 delta 164 zcmZ3$_n2>k0W*7kPH9PIe%@pQ<_VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD5NEBp3lV)4TUj~h=auE>l$K=X=eZRWu??335G0qDV)4TUj~h=auE>l$K=X=eZRWPH9PIe%@p~<_VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD5NEBp3Bn61OV&AI%)s_ diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_hswish.cu index ddea6211e503043d806664aafd7d4bec23bfe154..080c322296e623dd8fff50771a54e1e564fd4b34 100644 GIT binary patch delta 54 zcmaFNw}5YhA@gJl=8nn7ndeTkAv1e^PH9PIe%@q5<_VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C>L{4# RC|Kwyq$TETUcl181OW9$I<)`* diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_id.cu index 0aafd7fe2ea4d678f887ead8980b8490b7d4ec97..8b82f9a8d397c3541260c27f4e96d5855c065c3f 100644 GIT binary patch delta 56 zcmcc1H;ZqBHuGd7=8nk+ndiz|S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWV)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C>L{4# RC|Kwyq$TETp2yO_1OV?mI;j8v diff --git 
a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x64x8_16x64x8_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x64x8_16x64x8_hswish.cu index 56c22fc6cd1723306d249cd07ab7b40021474cb9..5c8ee5495beec2484f0171ff6b415a97701a9c1b 100644 GIT binary patch delta 56 zcmcc5H-~S7F7sp)=8nmSndiz|S(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWhGjAn09P*+IRF3v delta 126 zcmbQkcb{*AE;D<6PH9PIe%@qV<_VoFj@esZ>zm2-Yx8B9kZ pi-fTamjVzZmzLxt78hGtB_pIEYC|duQmw4Ak+j)u?qr$F1OSZnE{y;H diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x64x8_16x64x8_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x64x8_16x64x8_id.cu index 68a8a13bbeaf52764aff28e76e7d90f0a8d1462a..10bb5915ec1f1380b725249e1ba3eebe5c3bd209 100644 GIT binary patch delta 54 zcmcb~H;r$DI`d>b=8nmGndeSUWRa3}D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-F%E?G7|tE))Zv` delta 105 zcmbQncav{}Ix~BIPH9PIe%@qt<_V)4TUj~h=auE>l$K=X=eZRWk0W*7kPH9PIe%@pQ<_VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD5NEBp3lV)4TUj~h=auE>l$K=X=eZRWu??335G0qDV)4TUj~h=auE>l$K=X=eZRWPH9PIe%@p~<_VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD5NEBp3Bn61OV&AI%)s_ diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x32x32_32x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x32x32_32x32x32_hswish.cu index b8c1e2789777ffabaeb72505fb9e4e42a3a68ea1..6b394e11c3aae570534d4d99ff89cf70ce39d09c 100644 GIT binary patch delta 76 zcmaFFH=l2VKJ#QV=8nlnndh2YS(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! 
RC>WaQD5NEBp2yP21OV>II&c60 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x32x32_32x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x32x32_32x32x32_id.cu index 393b5a6cb0b4fd2e8043b3e0db6b54c218a37ada..652cea8833ded5dc4bcacad07c2d830bbfca0601 100644 GIT binary patch delta 56 zcmcb^HV)4TUj~h=auE>l$K=X=eZRWzlQ diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x32x32_32x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x32x32_32x32x32_relu.cu index c1ae6410e2759c092448213b16dac1026b5e29e1..5fc015576518cedcf963926db5ad7fdae603f4d7 100644 GIT binary patch delta 56 zcmaFBHV)4TUj~h=auE>l$K=X=eZRWhGhy909T(CJOBUy delta 126 zcmbQs_keGME;D<6PH9PIe%@qV<_VoFj@esZ>zm2-Yx8B9kZ pi-fTamjVzZmzLxt78hGtB_pIEYC|duQmw4Ak+j)u?qr$51OSckE|35K diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x64x32_32x64x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x64x32_32x64x32_hswish.cu index 2d327f97b792edbe47f677b0679a243d4b243238..9d743b45237a7202e57611f453de8ec734547fd2 100644 GIT binary patch delta 76 zcmaFFH=l2VKJ#QV=8nlnndh2YS(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! 
RC>WaQD5NEBp2yP21OV>II&c60 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x64x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x64x32_32x64x32_id.cu index 33332c776f52be397055bfa28a50fff5bcbb093a..0921c3e9d7b0617d53287a068fe2e8459cacc53e 100644 GIT binary patch delta 56 zcmcb^HV)4TUj~h=auE>l$K=X=eZRWzlQ diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x64x32_32x64x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x64x32_32x64x32_relu.cu index 37369d3e85fd0620e08286d455a84fd97ac53878..a9c8dd404f0a772e503a9ed0af71f20f6ef0ffef 100644 GIT binary patch delta 56 zcmaFBHV)4TUj~h=auE>l$K=X=eZRWhGhy909T(CJOBUy delta 126 zcmbQs_keGME;D<6PH9PIe%@qV<_VoFj@esZ>zm2-Yx8B9kZ pi-fTamjVzZmzLxt78hGtB_pIEYC|duQmw4Ak+j)u?qr$51OSckE|35K diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x128x32_64x32x32_hswish.cu index 5ffd599ed326bee539aacadfd5cc040f63ade96f..23443ee28b2aca363b3a2cb677f5931ecddd4049 100644 GIT binary patch delta 54 zcmaFNw}5Yh0rO;Y=8nn7nCDK;Vv&+{D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-F%*93KIY+M--<3 delta 164 zcmZ3$_n2>k0W*7kPH9PIe%@pQ<_VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD5NEBp3lV)4TUj~h=auE>l$K=X=eZRWu??335G0qDV)4TUj~h=auE>l$K=X=eZRWPH9PIe%@p~<_VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! 
RC>WaQD5NEBp3Bn61OV&AI%)s_ diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x32x32_64x32x32_hswish.cu index 3cfe915fd6b12860cd3002d8de5258d8e01d87dd..1d2b47d20deabb03984dd2cf321c68511abb2452 100644 GIT binary patch delta 76 zcmaFFH=l2VKJ#QV=8nlnndh2YS(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD5NEBp2yP21OV>II&c60 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x32x32_64x32x32_id.cu index 6b711dc10e92661add3e0da67329bc5b73a544bb..d38622ac29bf2b62a9e81eb4e89c3ef3e7f75ce7 100644 GIT binary patch delta 56 zcmcb^HV)4TUj~h=auE>l$K=X=eZRWzlQ diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x32x32_64x32x32_relu.cu index c63406461b48c156fdcde77b06f66692ddcf2f2a..3845f7faf1ac6d6061e9269afbdc88ceb2afbf5c 100644 GIT binary patch delta 56 zcmaFBHV)4TUj~h=auE>l$K=X=eZRWhGhy909T(CJOBUy delta 126 zcmbQs_keGME;D<6PH9PIe%@qV<_VoFj@esZ>zm2-Yx8B9kZ pi-fTamjVzZmzLxt78hGtB_pIEYC|duQmw4Ak+j)u?qr$51OSckE|35K diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x64x32_64x32x32_hswish.cu index e1a4fac14886a77e8b3f3a79e213e63b5423a90f..2948cbbc43f2fc6a355d78e3ba9c5001b6f4852d 100644 GIT binary patch delta 76 zcmaFFH=l2VKJ#QV=8nlnndh2YS(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 
zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD5NEBp2yP21OV>II&c60 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x64x32_64x32x32_id.cu index 8b2c0d5fdaa053272df6d35e97c63d824a30b33e..0f75f986896adee4307127b049b7e05dec306361 100644 GIT binary patch delta 56 zcmcb^HV)4TUj~h=auE>l$K=X=eZRWzlQ diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x64x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x64x32_64x32x32_relu.cu index eeb3c5c2510bae23df61ae56f3854146b548027f..4aa41a48f0fd98809c25864e2a452ddce8e5ae5e 100644 GIT binary patch delta 56 zcmaFBHV)4TUj~h=auE>l$K=X=eZRWhGhy909T(CJOBUy delta 126 zcmbQs_keGME;D<6PH9PIe%@qV<_VoFj@esZ>zm2-Yx8B9kZ pi-fTamjVzZmzLxt78hGtB_pIEYC|duQmw4Ak+j)u?qr$51OSckE|35K diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x128x32_32x64x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x128x32_32x64x32_hswish.cu index b04889b88617c7be2ab85ea843c26a6d02ab5c48..3a21872acb2ed3e758151676446575c484c3d2fc 100644 GIT binary patch delta 54 zcmaFFH=l2V0rO;Y=8nn7nCDK;Vv&+{D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-F%*9G7|tO+Z3b# delta 164 zcmbQw_lR$U0W*7kPH9PIe%@pQ<_VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! 
RC>WaQD3la!p3l<21OV|)I+6eY diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x128x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x128x32_32x64x32_id.cu index 03d817af706a7be29991a33e27d49c846824237f..86e0bfb5cafa82e199e4328192925a63b99ae23a 100644 GIT binary patch delta 56 zcmcb^HV)4TUj~h=auE>l$K=X=eZRWu??335G0qDV)4TUj~h=auE>l$K=X=eZRW09;rdq5uE@ delta 164 zcmbQs_keGM9y5D>PH9PIe%@p~<_VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD3la!p3Bm}1OV$qI)(rM diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x32x32_32x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x32x32_32x32x32_hswish.cu index 22a9bfb1ff672040957668ab385c8acc9a79cb5d..da540ae8f550ce2fb03bdb667ce12c3b066bbcdd 100644 GIT binary patch delta 76 zcmaFJH;-?FKJ#QV=8nlnndh2YS(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD3la!p2yO_1OVV)4TUj~h=auE>l$K=X=eZRWV)4TUj~h=auE>l$K=X=eZRWhGjAn09P*+IRF3v delta 126 zcmbQkcb{*AE;D<6PH9PIe%@qV<_VoFj@esZ>zm2-Yx8B9kZ pi-fTamjVzZmzLxt78hGtB_pIEYC|duQmw4Ak+j)u?qr$F1OSZnE{y;H diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x64x32_32x64x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x64x32_32x64x32_hswish.cu index 4ad86a3f8c88e7712ab9c06b036545c1f79b2975..0dca550deaf741a6a7cb666d1327fcca124f953b 100644 GIT binary patch delta 76 zcmaFJH;-?FKJ#QV=8nlnndh2YS(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! 
RC>WaQD3la!p2yO_1OVV)4TUj~h=auE>l$K=X=eZRWV)4TUj~h=auE>l$K=X=eZRWhGjAn09P*+IRF3v delta 126 zcmbQkcb{*AE;D<6PH9PIe%@qV<_VoFj@esZ>zm2-Yx8B9kZ pi-fTamjVzZmzLxt78hGtB_pIEYC|duQmw4Ak+j)u?qr$F1OSZnE{y;H diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x128x32_64x32x32_hswish.cu index 497591db5b63aadf94eb1a9ac915682b9d774fb6..0d47d1cd480d7a2e78f524bbf7df2770493fa94d 100644 GIT binary patch delta 54 zcmaFFH=l2V0rO;Y=8nn7nCDK;Vv&+{D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-F%*9G7|tO+Z3b# delta 164 zcmbQw_lR$U0W*7kPH9PIe%@pQ<_VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD3la!p3l<21OV|)I+6eY diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x128x32_64x32x32_id.cu index b06de5af17231728ee878556a4f15ee8cc4b240d..d8927fbbc2b0285986787dec124e29f49f11f6ad 100644 GIT binary patch delta 56 zcmcb^HV)4TUj~h=auE>l$K=X=eZRWu??335G0qDV)4TUj~h=auE>l$K=X=eZRW09;rdq5uE@ delta 164 zcmbQs_keGM9y5D>PH9PIe%@p~<_VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD3la!p3Bm}1OV$qI)(rM diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x32x32_64x32x32_hswish.cu index 87e5aae00b259c8f4d3020ceb0d58e89b638a449..d448a3e6a7effee301451ac74969181217d4a501 100644 GIT binary patch delta 76 zcmaFJH;-?FKJ#QV=8nlnndh2YS(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! 
RC>WaQD3la!p2yO_1OVV)4TUj~h=auE>l$K=X=eZRWV)4TUj~h=auE>l$K=X=eZRWhGjAn09P*+IRF3v delta 126 zcmbQkcb{*AE;D<6PH9PIe%@qV<_VoFj@esZ>zm2-Yx8B9kZ pi-fTamjVzZmzLxt78hGtB_pIEYC|duQmw4Ak+j)u?qr$F1OSZnE{y;H diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x64x32_64x32x32_hswish.cu index 6f19370ee918a046116e252562f90a5e81b9ced7..7aad8acb93a4fb23d9f715dd0ad62ba962dd2eae 100644 GIT binary patch delta 76 zcmaFJH;-?FKJ#QV=8nlnndh2YS(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRWVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_r`! RC>WaQD3la!p2yO_1OVV)4TUj~h=auE>l$K=X=eZRWV)4TUj~h=auE>l$K=X=eZRWhGjAn09P*+IRF3v delta 126 zcmbQkcb{*AE;D<6PH9PIe%@qV<_VoFj@esZ>zm2-Yx8B9kZ pi-fTamjVzZmzLxt78hGtB_pIEYC|duQmw4Ak+j)u?qr$F1OSZnE{y;H diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x128x64_64x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x128x64_64x64x64_hswish.cu index 42029b8aa35ec24d06f41307a886e3615d1a3e5c..b58174157fb3e7ac712eaee2cbabb5af6b32ecc0 100644 GIT binary patch delta 60 zcmZqSKghSiiFvXobNA%)EE1DHGk3~cS(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRW QJ0I^mV3;+NC delta 85 zcmX@e*TTQSiJ3h=r?ez9KX0-Vb2prQmU$tZoyQ_IxtB#k%7#k;2$D-nauSP+t*nys T^U7da&oWCe+HGFSGKUEO8Ok0M diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x128x64_64x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x128x64_64x64x64_id.cu index 9de29a4ae162050ba5779033d11a5890187fd739..7a0612f839a74299146e75f1d4595fe4e8be856c 100644 GIT binary patch delta 54 zcmeyzx0`Q+4fAAY=I+T?nHNs3Ws#C~D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-Ta7U4if-Bu@yN0 delta 145 zcmdnZ_m6Lb4KsUwPH9PIe%@po=59FqIP=0BsJN9?Nk&m>VoFj@esZ>zm2-Yx8C1s4 
zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG92Bp3gFe2>|_| BHvK?dM$K5nd6JQIwjPl9ZF5oE==A ySyh#jYQv=f1j(f(If=!^R#wURd1X-JtgM`YoRG?bR4c1&5ECkHw|Ozk93}uO5H2ME diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x256x64_64x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x256x64_64x64x64_hswish.cu index a156009f986b64346cea5af43ed9d5c0a037d224..d68997d0c397a0585230ccc969758bf9c5e528bd 100644 GIT binary patch delta 60 zcmZqSKghSiiFvXobNA%)EE1DHGk3~cS(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRW QJ0I^mV3;+NC delta 85 zcmX@e*TTQSiJ3h=r?ez9KX0-Vb2prQmU$tZoyQ_IxtB#k%7#k;2$D-nauSP+t*nys T^U7da&oWCe+HGFSGKUEO8Ok0M diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x256x64_64x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x256x64_64x64x64_id.cu index 645fb0f6e2992c4eaa575aed551dea21b174d1e4..d707e5f7f32a0d2d962a38b3f2ea3c1c579e02da 100644 GIT binary patch delta 54 zcmeyzx0`Q+4fAAY=I+T?nHNs3Ws#C~D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-Ta7U4if-Bu@yN0 delta 145 zcmdnZ_m6Lb4KsUwPH9PIe%@po=59FqIP=0BsJN9?Nk&m>VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG92Bp3gFe2>|_| BHvK?dM$K5nd6JQIwjPl9ZF5oE==A ySyh#jYQv=f1j(f(If=!^R#wURd1X-JtgM`YoRG?bR4c1&5ECkHw|Ozk93}uO5H2ME diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x64x64_64x32x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x64x64_64x32x64_hswish.cu index 0d5dc8c483833d735e88914be72604d60ef3beef..3dcf049746066b4fa6ddde9b8f4d143d15e6e23c 100644 GIT binary patch delta 60 zcmZqXKft%ak$JKQbNA$PEE1DHF?Y&aS(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRW QWyIm;X-0I&@g1ONa4 delta 85 zcmX@W*UZ1ck(oU|r?ez9KX0-lb2prQhIt{Joy#IMxraqU%7#k;2$D-nauSP+t*nys T^U7da&oE0c+HGFKGKUEO7w{en diff --git 
a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x64x64_64x32x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x64x64_64x32x64_id.cu index dd1e9c2eadba5541d2c9d858f450efa3b7c9bf53..e9dc07cc839982d8f9ead4464f1e60d345222162 100644 GIT binary patch delta 54 zcmey%w~KFsHS=UA=I+T?m={j2VUdz`D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-TaVc4if-Aloc`n delta 145 zcmdnR_m^*jH8XpDPH9PIe%@qj=59Fq81up$sJN9?Nk&m>VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG92Bp2sqW2>|-~ BH;w=R diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x64x64_64x32x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x64x64_64x32x64_relu.cu index 96f54e5dda5c78a5d273f4a75036f543849272ad..c5653626b65ac71e758e95daa6e443686b60a0cd 100644 GIT binary patch delta 58 zcmZqV-^aJXo_VqxbNA#kEE1DHFn3OFV3Cq_D=5k@@N=&42(JjqC`wICNy^Dj&JHfm Otg6aM-Tahg4if--z7|RV delta 124 zcmdnT*T}!Yo|!#Ar?ez9KX0-V)4TUj~h=auE>l$K=X=eZRW QVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG92Bp3gFu2>|}W BHvK?dM$K5nd6JQIwjPl9ZF5oE==A ySyh#jYQv=f1j(f(If=!^R#wURd1X-JtgM`YoRG?bR4c1&5ECkHw|OzkTqXc42rei9 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x256x64_64x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x256x64_64x64x64_hswish.cu index b504399aa390894531aaaf32ea4af8d806c00176..bc75f547787bbdd022490a7a9c14a0862c1a59e3 100644 GIT binary patch delta 60 zcmZqWKg74eiFvXobNA%)EE1DHGk3~cS(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRW QVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG92Bp3gFu2>|}W BHvK?dM$K5nd6JQIwjPl9ZF5oE==A ySyh#jYQv=f1j(f(If=!^R#wURd1X-JtgM`YoRG?bR4c1&5ECkHw|OzkTqXc42rei9 diff --git 
a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x64x64_64x32x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x64x64_64x32x64_hswish.cu index 006f790a8868ae36f142ab78f1b25a9631950d77..32294cd5ae4b52c358d04bb58aa2a0fbcc40f68c 100644 GIT binary patch delta 60 zcmZqSKghSik$JKQbNA$PEE1DHF?Y&aS(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRW QWyIm=uo0I-D@2LJ#7 delta 85 zcmX@e*TTQSk(oU|r?ez9KX0-lb2prQhIt{Joy#IMxraqU%7#k;2$D-nauSP+t*nys T^U7da&oE0c+HGFKGM5Pe7@8gq diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x64x64_64x32x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x64x64_64x32x64_id.cu index 0906068e26f8fbecc26350bae57d32e5bfe7b59b..34f00363d3efd00dfba02b64420ece4462f8dd0a 100644 GIT binary patch delta 54 zcmeyzx0`Q+HS=UA=I+T?m={j2VUdz`D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-TaVcE)xJh02MU= delta 145 zcmdnZ_m6LbH8XpDPH9PIe%@qj=59Fq81up$sJN9?Nk&m>VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG92Bp2sqm2>|>Y BH<17U diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x64x64_64x32x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x64x64_64x32x64_relu.cu index adcbfbeaf89988dc7ea7a100dfb5fc5715ebdc95..eea42928ebdc61619cfe57c61b1becd3632b4fda 100644 GIT binary patch delta 58 zcmZqT-_N(fo_VqxbNA#kEE1DHFn3OFV3Cq_D=5k@@N=&42(JjqC`wICNy^Dj&JHfm Otg6aM-TahgE)xKJG8Rn$ delta 124 zcmdnb*TlcUo|!#Ar?ez9KX0-V)4TUj~h=auE>l$K=X=eZRW QVoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG92Bp3gFu2>|}W BHvK?dM$K5nd6JQIwjPl9ZF5oE==A ySyh#jYQv=f1j(f(If=!^R#wURd1X-JtgM`YoRG?bR4c1&5ECkHw|OzkTqXc42rei9 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_32x64x64_32x16x64_hswish.cu 
b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_32x64x64_32x16x64_hswish.cu index 7315cecb3d71f45cef14843279a035f30a9674df..dee9e38c334b279dc82d34f9e205c80f7c4b4b13 100644 GIT binary patch delta 58 zcmZqXKft%afqAk!bNA%4EE1DHGIvgHWRa3}D=5k@@N=&42(JjqC`wICNy^Dj&JHfm Otg6aM-TaJYE)xKKjTTh^ delta 124 zcmX@W*UZ1cftfu&r?ez9KX0-Fb2prQnt36dox>vK?dM$K5nd6JQIwjPl9ZF5oE==A ySyh#jYQv=f1j(f(If=!^R#wURd1X-JtgM`YoRG?bR4c1&5ECkHw|OzkTqXc42rei9 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_32x64x64_32x16x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_32x64x64_32x16x64_id.cu index 9d4b8466913c723a7bb5d5acc983666d78410fb8..33939b7bbf45d9db56aed932157a1ad2a1041200 100644 GIT binary patch delta 54 zcmey%w~KFs74u|A=I+UtnHNs3W|5M0D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-TZ)ME)xJf;}tFd delta 164 zcmdnR_m^*j6*GH&PH9PIe%@p&=59FqDD%R4sJN9?Nk&m>VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_nYQ ML2%mU4J=(u04VG_z5oCK diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_32x64x64_32x16x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_32x64x64_32x16x64_relu.cu index 6a87dc00d8be6f116c273c56c72b224c08fc4aba..8926a9d6a5e51c34a9a0c731246e020d644ee085 100644 GIT binary patch delta 54 zcmZqV-^aJXj(M^xbNA%y%nK*ivq;Ih6%^$c_&Ha2gja-Q6s0DnB<18MX9t&OR#oMs KZhpctmk9tTj1V)4TUj~h=auE>l$K=X=eZRW QWyIm=uo0I-D@2LJ#7 delta 85 zcmX@e*TTQSk(oU|r?ez9KX0-lb2prQhIt{Joy#IMxraqU%7#k;2$D-nauSP+t*nys T^U7da&oE0c+HGFKGM5Pe7@8gq diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x128x64_32x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x128x64_32x64x64_id.cu index 
8846c82f408c075ca63914b01910440626c5d5f8..7e9fc7a451bde2772000b113393e3b7550a613d2 100644 GIT binary patch delta 54 zcmeyzx0`Q+HS=UA=I+T?m={j2VUdz`D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-TaVcE)xJh02MU= delta 145 zcmdnZ_m6LbH8XpDPH9PIe%@qj=59Fq81up$sJN9?Nk&m>VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG92Bp2sqm2>|>Y BH<17U diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x128x64_32x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x128x64_32x64x64_relu.cu index 54a3079223f9258a56dd14805909553c2b5d8264..00ed690b3e81d74bfa6609a4ee7ad0d1efe42611 100644 GIT binary patch delta 58 zcmZqT-_N(fo_VqxbNA#kEE1DHFn3OFV3Cq_D=5k@@N=&42(JjqC`wICNy^Dj&JHfm Otg6aM-TahgE)xKJG8Rn$ delta 124 zcmdnb*TlcUo|!#Ar?ez9KX0-vK?dM$K5nd6JQIwjPl9ZF5oE==A ySyh#jYQv=f1j(f(If=!^R#wURd1X-JtgM`YoRG?bR4c1&5ECkHw|OzkTqXc42rei9 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x64x64_32x32x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x64x64_32x32x64_id.cu index deba71bbab2ea1184566f223df2a10c331b39ec4..f870aed546abc00659c04e8f57ee2612b866ff2b 100644 GIT binary patch delta 54 zcmey%w~KFs74u|A=I+UtnHNs3W|5M0D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-TZ)ME)xJf;}tFd delta 164 zcmdnR_m^*j6*GH&PH9PIe%@p&=59FqDD%R4sJN9?Nk&m>VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_nYQ ML2%mU4J=(u04VG_z5oCK diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x64x64_32x32x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x64x64_32x32x64_relu.cu index 2392dc7bc24bdc1bf70dcbc5fa39e6a1c867d3ca..d6bebaf2ca1651c9f671eeb63962c0d019f03fb3 100644 GIT binary patch delta 54 zcmZqV-^aJXj(M^xbNA%y%nK*ivq;Ih6%^$c_&Ha2gja-Q6s0DnB<18MX9t&OR#oMs 
KZhpctmk9tTj1V)4TUj~h=auE>l$K=X=eZRW QJ0I^mV3;+NC delta 85 zcmX@e*TTQSiJ3h=r?ez9KX0-Vb2prQmU$tZoyQ_IxtB#k%7#k;2$D-nauSP+t*nys T^U7da&oWCe+HGFSGKUEO8Ok0M diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_256x128x64_64x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_256x128x64_64x64x64_id.cu index 61ac1b497198efb99ce2771d88ba595df2447304..adca5a47d93382a872400fd07e881e064fdebb08 100644 GIT binary patch delta 54 zcmeyzx0`Q+4fAAY=I+T?nHNs3Ws#C~D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-Ta7U4if-Bu@yN0 delta 145 zcmdnZ_m6Lb4KsUwPH9PIe%@po=59FqIP=0BsJN9?Nk&m>VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG92Bp3gFe2>|_| BHvK?dM$K5nd6JQIwjPl9ZF5oE==A ySyh#jYQv=f1j(f(If=!^R#wURd1X-JtgM`YoRG?bR4c1&5ECkHw|Ozk93}uO5H2ME diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_32x64x64_32x16x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_32x64x64_32x16x64_hswish.cu index ffef0da2f7f107049c119d92598ace9ca2540a38..89b8055296b51c7884d243f3b3b082ed7ba5f485 100644 GIT binary patch delta 58 zcmZqT-_N(ffqAk!bNA%4EE1DHGIvgHWRa3}D=5k@@N=&42(JjqC`wICNy^Dj&JHfm Otg6aM-TaJY4if-<85U9i delta 124 zcmdnb*TlcUftfu&r?ez9KX0-Fb2prQnt36dox>vK?dM$K5nd6JQIwjPl9ZF5oE==A ySyh#jYQv=f1j(f(If=!^R#wURd1X-JtgM`YoRG?bR4c1&5ECkHw|Ozk93}uO5H2ME diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_32x64x64_32x16x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_32x64x64_32x16x64_id.cu index aa544e2d48749eddaea471fd7d0c8418f23035f0..30e53fc0927533476b3a78114a1ca266fa44160c 100644 GIT binary patch delta 54 zcmeyvx07#!74u|A=I+UtnHNs3W|5M0D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-TZ)M4if-9cNHrD delta 164 zcmdnV_lIwT6*GH&PH9PIe%@p&=59FqDD%R4sJN9?Nk&m>VoFj@esZ>zm2-Yx8C1s4 
zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_nYQ ML2$|D4J@5Z04Ml5$p8QV diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_32x64x64_32x16x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_32x64x64_32x16x64_relu.cu index 762bf3e9f3fd79b5d45dfd223f55656808b6194b..465fe2653ca6b5d97fdfda8734e5d1a1219de4c7 100644 GIT binary patch delta 54 zcmZqR-^;hbj(M^xbNA%y%nK*ivq;Ih6%^$c_&Ha2gja-Q6s0DnB<18MX9t&OR#oMs KZhpcthY0{DAQZ>| delta 124 zcmdnX*TBEQj+s3_r?ez9KX0-fb2prQl6fJVoy8*M?dM$K5nd6JQIwjPl9ZF5oE==A ySyh#jYQv=f1j(f(If=!^R#wURd1X-JtgM`YoRG?bR4c1&5ECkHw|ODU93}uI?k*Pq diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x128x64_32x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x128x64_32x64x64_hswish.cu index bad9a5ce96622a41c2cf80bb9fb7619714bc78c1..00fb0d97a57068e8274d698968b3131a1d61894d 100644 GIT binary patch delta 60 zcmZqXKft%ak$JKQbNA$PEE1DHF?Y&aS(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRW QWyIm;X-0I&@g1ONa4 delta 85 zcmX@W*UZ1ck(oU|r?ez9KX0-lb2prQhIt{Joy#IMxraqU%7#k;2$D-nauSP+t*nys T^U7da&oE0c+HGFKGKUEO7w{en diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x128x64_32x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x128x64_32x64x64_id.cu index b08d7a953791729443132aedaa48999e3bd3be82..1b2402dee3697eaffccebc5028de925b41a37656 100644 GIT binary patch delta 54 zcmey%w~KFsHS=UA=I+T?m={j2VUdz`D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-TaVc4if-Aloc`n delta 145 zcmdnR_m^*jH8XpDPH9PIe%@qj=59Fq81up$sJN9?Nk&m>VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG92Bp2sqW2>|-~ BH;w=R diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x128x64_32x64x64_relu.cu 
b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x128x64_32x64x64_relu.cu index 9173c90410bac96ef10a9ca19831ea473ced3cf1..647694ce46875c8dbc20fa000522d92fd20f0609 100644 GIT binary patch delta 58 zcmZqV-^aJXo_VqxbNA#kEE1DHFn3OFV3Cq_D=5k@@N=&42(JjqC`wICNy^Dj&JHfm Otg6aM-Tahg4if--z7|RV delta 124 zcmdnT*T}!Yo|!#Ar?ez9KX0-vK?dM$K5nd6JQIwjPl9ZF5oE==A ySyh#jYQv=f1j(f(If=!^R#wURd1X-JtgM`YoRG?bR4c1&5ECkHw|Ozk93}uO5H2ME diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x64x64_32x32x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x64x64_32x32x64_id.cu index 7f1b55c0f98d4734a3eaf0ee1552e0f4ce11ebc6..48c0ed8f0c909cf0b44936077e3deebda38be882 100644 GIT binary patch delta 54 zcmeyvx07#!74u|A=I+UtnHNs3W|5M0D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-TZ)M4if-9cNHrD delta 164 zcmdnV_lIwT6*GH&PH9PIe%@p&=59FqDD%R4sJN9?Nk&m>VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_nYQ ML2$|D4J@5Z04Ml5$p8QV diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x64x64_32x32x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x64x64_32x32x64_relu.cu index c599f7acde577bdf6c1f80afebdbe40838f30682..bbfd2311dc9a29e8254e96f4e66eb3edaff6aad8 100644 GIT binary patch delta 54 zcmZqR-^;hbj(M^xbNA%y%nK*ivq;Ih6%^$c_&Ha2gja-Q6s0DnB<18MX9t&OR#oMs KZhpcthY0{DAQZ>| delta 124 zcmdnX*TBEQj+s3_r?ez9KX0-fb2prQl6fJVoy8*M?dM$K5nd6JQIwjPl9ZF5oE==A ySyh#jYQv=f1j(f(If=!^R#wURd1X-JtgM`YoRG?bR4c1&5ECkHw|ODU93}uI?k*Pq diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x128x64_64x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x128x64_64x64x64_hswish.cu index 
e11f92ff8df3965d38121d3f17051e112d851d40..b9333b454200f5148045de5121bf3bba916bbb19 100644 GIT binary patch delta 60 zcmZqXKft%ak$JKQbNA$PEE1DHF?Y&aS(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRW QWyIm;X-0I&@g1ONa4 delta 85 zcmX@W*UZ1ck(oU|r?ez9KX0-lb2prQhIt{Joy#IMxraqU%7#k;2$D-nauSP+t*nys T^U7da&oE0c+HGFKGKUEO7w{en diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x128x64_64x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x128x64_64x64x64_id.cu index bf638233d3e95f19637a082b05fc27dcdb385e96..0ed1e56ac59af1da4be7a89bf09dd9bb7c8cd120 100644 GIT binary patch delta 54 zcmey%w~KFsHS=UA=I+T?m={j2VUdz`D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-TaVc4if-Aloc`n delta 145 zcmdnR_m^*jH8XpDPH9PIe%@qj=59Fq81up$sJN9?Nk&m>VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG92Bp2sqW2>|-~ BH;w=R diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x128x64_64x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x128x64_64x64x64_relu.cu index 5ef27dfd260d4f04676311b442ca52b132f3a29d..9ffd151c2f830452b1a3746d7872d574da734805 100644 GIT binary patch delta 58 zcmZqV-^aJXo_VqxbNA#kEE1DHFn3OFV3Cq_D=5k@@N=&42(JjqC`wICNy^Dj&JHfm Otg6aM-Tahg4if--z7|RV delta 124 zcmdnT*T}!Yo|!#Ar?ez9KX0-V)4TUj~h=auE>l$K=X=eZRW QWyIm;X-0I&@g1ONa4 delta 85 zcmX@W*UZ1ck(oU|r?ez9KX0-lb2prQhIt{Joy#IMxraqU%7#k;2$D-nauSP+t*nys T^U7da&oE0c+HGFKGKUEO7w{en diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x256x64_64x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x256x64_64x64x64_id.cu index f9b97cd47327d336721a342f29e89a98b8869f4d..c152bd5e4ebd4ad6f77b4acfd6e45fa649a46a56 100644 GIT binary patch delta 54 
zcmey%w~KFsHS=UA=I+T?m={j2VUdz`D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-TaVc4if-Aloc`n delta 145 zcmdnR_m^*jH8XpDPH9PIe%@qj=59Fq81up$sJN9?Nk&m>VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG92Bp2sqW2>|-~ BH;w=R diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x256x64_64x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x256x64_64x64x64_relu.cu index 3ac0bd364f159da54e205f74bd38156d57adc41e..dfe97c7f59e81e37bc8dd4ac2d402fad4a46d3aa 100644 GIT binary patch delta 58 zcmZqV-^aJXo_VqxbNA#kEE1DHFn3OFV3Cq_D=5k@@N=&42(JjqC`wICNy^Dj&JHfm Otg6aM-Tahg4if--z7|RV delta 124 zcmdnT*T}!Yo|!#Ar?ez9KX0-vK?dM$K5nd6JQIwjPl9ZF5oE==A ySyh#jYQv=f1j(f(If=!^R#wURd1X-JtgM`YoRG?bR4c1&5ECkHw|Ozk93}uO5H2ME diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x64x64_64x32x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x64x64_64x32x64_id.cu index 9fe447f3bc5cdc66ae5068f5948b67f71386064b..4f0e90ad14d0fd176fdde27dc0f65458041773c7 100644 GIT binary patch delta 54 zcmeyvx07#!74u|A=I+UtnHNs3W|5M0D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-TZ)M4if-9cNHrD delta 164 zcmdnV_lIwT6*GH&PH9PIe%@p&=59FqDD%R4sJN9?Nk&m>VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_nYQ ML2$|D4J@5Z04Ml5$p8QV diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x64x64_64x32x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x64x64_64x32x64_relu.cu index 198293e2e5f9c383d75bd9efb258a50edf7d25ae..c664ccd86647945b89566d4881db9e6d3b04578a 100644 GIT binary patch delta 54 zcmZqR-^;hbj(M^xbNA%y%nK*ivq;Ih6%^$c_&Ha2gja-Q6s0DnB<18MX9t&OR#oMs KZhpcthY0{DAQZ>| delta 124 
zcmdnX*TBEQj+s3_r?ez9KX0-fb2prQl6fJVoy8*M?dM$K5nd6JQIwjPl9ZF5oE==A ySyh#jYQv=f1j(f(If=!^R#wURd1X-JtgM`YoRG?bR4c1&5ECkHw|ODU93}uI?k*Pq diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x128x64_64x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x128x64_64x64x64_hswish.cu index 1d4a974f24ef62589264e938ebf6209341f57c17..ea9cedcbeab4f5bab02abe016c3a6ec3936a7981 100644 GIT binary patch delta 60 zcmZqSKghSik$JKQbNA$PEE1DHF?Y&aS(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRW QWyIm=uo0I-D@2LJ#7 delta 85 zcmX@e*TTQSk(oU|r?ez9KX0-lb2prQhIt{Joy#IMxraqU%7#k;2$D-nauSP+t*nys T^U7da&oE0c+HGFKGM5Pe7@8gq diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x128x64_64x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x128x64_64x64x64_id.cu index 069e96db4230c0ee9b4b0ac6cb33f58b603d7775..9652ea56abe4ced9b3f459ac955dc2e2685776dc 100644 GIT binary patch delta 54 zcmeyzx0`Q+HS=UA=I+T?m={j2VUdz`D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-TaVcE)xJh02MU= delta 145 zcmdnZ_m6LbH8XpDPH9PIe%@qj=59Fq81up$sJN9?Nk&m>VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG92Bp2sqm2>|>Y BH<17U diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x128x64_64x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x128x64_64x64x64_relu.cu index 80a947c22da4cd66991b3ece5122ecb7077089a4..1beff87fff88b55b5597e5513bc8e2bfbe3f5845 100644 GIT binary patch delta 58 zcmZqT-_N(fo_VqxbNA#kEE1DHFn3OFV3Cq_D=5k@@N=&42(JjqC`wICNy^Dj&JHfm Otg6aM-TahgE)xKJG8Rn$ delta 124 zcmdnb*TlcUo|!#Ar?ez9KX0-V)4TUj~h=auE>l$K=X=eZRW QWyIm=uo0I-D@2LJ#7 delta 85 zcmX@e*TTQSk(oU|r?ez9KX0-lb2prQhIt{Joy#IMxraqU%7#k;2$D-nauSP+t*nys 
T^U7da&oE0c+HGFKGM5Pe7@8gq diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x256x64_64x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x256x64_64x64x64_id.cu index ca80dda287c9f82ab8714da8f4c50493a4e27953..e86c52382450c53f3580d1fbae9c0b7d55520b06 100644 GIT binary patch delta 54 zcmeyzx0`Q+HS=UA=I+T?m={j2VUdz`D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-TaVcE)xJh02MU= delta 145 zcmdnZ_m6LbH8XpDPH9PIe%@qj=59Fq81up$sJN9?Nk&m>VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG92Bp2sqm2>|>Y BH<17U diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x256x64_64x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x256x64_64x64x64_relu.cu index 812cdc8124b4ae6df3ade810bdd81473d1cfd6ce..36b3b3c888506817a045c41c4bfd37a2548ca522 100644 GIT binary patch delta 58 zcmZqT-_N(fo_VqxbNA#kEE1DHFn3OFV3Cq_D=5k@@N=&42(JjqC`wICNy^Dj&JHfm Otg6aM-TahgE)xKJG8Rn$ delta 124 zcmdnb*TlcUo|!#Ar?ez9KX0-vK?dM$K5nd6JQIwjPl9ZF5oE==A ySyh#jYQv=f1j(f(If=!^R#wURd1X-JtgM`YoRG?bR4c1&5ECkHw|OzkTqXc42rei9 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x64x64_64x32x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x64x64_64x32x64_id.cu index 89608b5bb926679024612b54664113a32342319d..a7829ddcbd2528dbd0e5f3fca9bc57b00546798b 100644 GIT binary patch delta 54 zcmey%w~KFs74u|A=I+UtnHNs3W|5M0D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-TZ)ME)xJf;}tFd delta 164 zcmdnR_m^*j6*GH&PH9PIe%@p&=59FqDD%R4sJN9?Nk&m>VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_nYQ ML2%mU4J=(u04VG_z5oCK diff --git 
a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x64x64_64x32x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x64x64_64x32x64_relu.cu index 80aac1fcaccd6edd6950694a98200d426f993b2b..e9cd6b18679c6b59db6e29511a01e5dc174ffc65 100644 GIT binary patch delta 54 zcmZqV-^aJXj(M^xbNA%y%nK*ivq;Ih6%^$c_&Ha2gja-Q6s0DnB<18MX9t&OR#oMs KZhpctmk9tTj1V)4TUj~h=auE>l$K=X=eZRW QWyIm=uo0I-D@2LJ#7 delta 85 zcmX@e*TTQSk(oU|r?ez9KX0-lb2prQhIt{Joy#IMxraqU%7#k;2$D-nauSP+t*nys T^U7da&oE0c+HGFKGM5Pe7@8gq diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_256x128x64_64x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_256x128x64_64x64x64_id.cu index bde7fed6b1ff0937607d55bcdbd8728171d48cc4..f8ca88770f28f272da6900f7ade7c4c2820f8a6b 100644 GIT binary patch delta 54 zcmeyzx0`Q+HS=UA=I+T?m={j2VUdz`D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-TaVcE)xJh02MU= delta 145 zcmdnZ_m6LbH8XpDPH9PIe%@qj=59Fq81up$sJN9?Nk&m>VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG92Bp2sqm2>|>Y BH<17U diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_256x128x64_64x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_256x128x64_64x64x64_relu.cu index 5cc092b65c3a41ba1f4f1f3fb97eb88b397fd44e..88420f6da7f0695f243518c094b1b4108294d0c8 100644 GIT binary patch delta 58 zcmZqT-_N(fo_VqxbNA#kEE1DHFn3OFV3Cq_D=5k@@N=&42(JjqC`wICNy^Dj&JHfm Otg6aM-TahgE)xKJG8Rn$ delta 124 zcmdnb*TlcUo|!#Ar?ez9KX0-VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_nYQ ML2%mU^(%bF(oM{KRMgV$~ix;3@YR2 zT;UO35rU8nF3+s0%1O21QUHSF(vqCS;$kbSWP}cg_K?bgR4c1&B$MnmFJPI=1OU2r BHy!{0 diff --git 
a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x128x64_32x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x128x64_32x64x64_hswish.cu index 99ff27cd0c4e2602dee58c53d6a8ec8ce0139248..8a9c6cdc3c8cc75a4465e230c336507e93895bac 100644 GIT binary patch delta 58 zcmZqXKft%afqAk!bNA%4EE1DHGIvgHWRa3}D=5k@@N=&42(JjqC`wICNy^Dj&JHfm Otg6aM-TaJYE)xKKjTTh^ delta 124 zcmX@W*UZ1cftfu&r?ez9KX0-Fb2prQnt36dox>vK?dM$K5nd6JQIwjPl9ZF5oE==A ySyh#jYQv=f1j(f(If=!^R#wURd1X-JtgM`YoRG?bR4c1&5ECkHw|OzkTqXc42rei9 diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x128x64_32x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x128x64_32x64x64_id.cu index c913c159c6ef972b80ade9bf46476ec6028c08b0..1d081f4c9ec7fd7968ab50022799734d6e75bf6f 100644 GIT binary patch delta 54 zcmey%w~KFs74u|A=I+UtnHNs3W|5M0D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-TZ)ME)xJf;}tFd delta 164 zcmdnR_m^*j6*GH&PH9PIe%@p&=59FqDD%R4sJN9?Nk&m>VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_nYQ ML2%mU4J=(u04VG_z5oCK diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x128x64_32x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x128x64_32x64x64_relu.cu index 5144b3bcbcbaa55a6d08d60acb92f61bffcda14c..997cda702712ff8c03d93ecb245577a2fa64e712 100644 GIT binary patch delta 54 zcmZqV-^aJXj(M^xbNA%y%nK*ivq;Ih6%^$c_&Ha2gja-Q6s0DnB<18MX9t&OR#oMs KZhpctmk9tTj1VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_nYQ ML2%mU^(%bF(oM{KRMgV$~ix;3@YR2 zT;UO35rU8nF3+s0%1O21QUHSF(vqCS;$kbSWP}cg_K?bgR4c1&B$MnmFJPI=1OU2r BHy!{0 diff --git 
a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_256x128x64_64x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_256x128x64_64x64x64_hswish.cu index a0a2a8bee1689b7b418bdc6234b429dac2db6975..c27f507cd539c175ab6f4545b23724bad87bd79c 100644 GIT binary patch delta 60 zcmZqXKft%ak$JKQbNA$PEE1DHF?Y&aS(RiIr6#5%<>V)4TUj~h=auE>l$K=X=eZRW QWyIm;X-0I&@g1ONa4 delta 85 zcmX@W*UZ1ck(oU|r?ez9KX0-lb2prQhIt{Joy#IMxraqU%7#k;2$D-nauSP+t*nys T^U7da&oE0c+HGFKGKUEO7w{en diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_256x128x64_64x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_256x128x64_64x64x64_id.cu index 89645723cd8f980f20255bda375f4e4e5540674c..44c2437d36be664bf784a40d1848be97c4a7808d 100644 GIT binary patch delta 54 zcmey%w~KFsHS=UA=I+T?m={j2VUdz`D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-TaVc4if-Aloc`n delta 145 zcmdnR_m^*jH8XpDPH9PIe%@qj=59Fq81up$sJN9?Nk&m>VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG92Bp2sqW2>|-~ BH;w=R diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_256x128x64_64x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_256x128x64_64x64x64_relu.cu index 513ff2695ac3b18f7c6cb0880eee43999971493f..bd2802ee022c9a31c94267e9901daab3dd69e522 100644 GIT binary patch delta 58 zcmZqV-^aJXo_VqxbNA#kEE1DHFn3OFV3Cq_D=5k@@N=&42(JjqC`wICNy^Dj&JHfm Otg6aM-Tahg4if--z7|RV delta 124 zcmdnT*T}!Yo|!#Ar?ez9KX0-VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_nYQ ML2$|D^(>uC03`D|!vFvP diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_32x64x64_16x32x64_relu.cu 
b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_32x64x64_16x32x64_relu.cu index 41cd9ae5656ec339616b4163c7cf0f4a69f36b37..a0786d6c80027e4cc820c1d97466a65148ffae7a 100644 GIT binary patch delta 54 zcmey*w})?oE%Rg-=I+VYm={j2W08_|D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-Tatk4if-C&J{oa delta 145 zcmdnP_n&WrEi-$5PH9PIe%@qT=59Fq1oOfisJN9?Nk&m>VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG92BUcfSk2><|9 BH<|zd diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x128x64_32x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x128x64_32x64x64_hswish.cu index 3151e3dc38a1dc57985f575c35008ab43c303bb3..964f42196a6ad67aebb6eb78262a17dd43de33ff 100644 GIT binary patch delta 58 zcmZqT-_N(ffqAk!bNA%4EE1DHGIvgHWRa3}D=5k@@N=&42(JjqC`wICNy^Dj&JHfm Otg6aM-TaJY4if-<85U9i delta 124 zcmdnb*TlcUftfu&r?ez9KX0-Fb2prQnt36dox>vK?dM$K5nd6JQIwjPl9ZF5oE==A ySyh#jYQv=f1j(f(If=!^R#wURd1X-JtgM`YoRG?bR4c1&5ECkHw|Ozk93}uO5H2ME diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x128x64_32x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x128x64_32x64x64_id.cu index c20fe3d06c6a28e861f168a35f98de11e7f743b0..c6dc75cc639d8876bdfee6007ae20d036bda5741 100644 GIT binary patch delta 54 zcmeyvx07#!74u|A=I+UtnHNs3W|5M0D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-TZ)M4if-9cNHrD delta 164 zcmdnV_lIwT6*GH&PH9PIe%@p&=59FqDD%R4sJN9?Nk&m>VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_nYQ ML2$|D4J@5Z04Ml5$p8QV diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x128x64_32x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x128x64_32x64x64_relu.cu index 
dddbed50f612d50841807e11adcaa5f2a21a4b8d..e8c765cae8c3a43949fceeeeddb0593f7e0188aa 100644 GIT binary patch delta 54 zcmZqR-^;hbj(M^xbNA%y%nK*ivq;Ih6%^$c_&Ha2gja-Q6s0DnB<18MX9t&OR#oMs KZhpcthY0{DAQZ>| delta 124 zcmdnX*TBEQj+s3_r?ez9KX0-fb2prQl6fJVoy8*M?dM$K5nd6JQIwjPl9ZF5oE==A ySyh#jYQv=f1j(f(If=!^R#wURd1X-JtgM`YoRG?bR4c1&5ECkHw|ODU93}uI?k*Pq diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x64x64_32x32x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x64x64_32x32x64_hswish.cu index 6028fba165cc043a786b248a69ad88ba773c9ddd..e0cbba9b09ab6fd111d747636da26fa5f11a9cb4 100644 GIT binary patch delta 58 zcmZqV-^aJXo_VqxbNA#kEE1DHFn3OFV3Cq_D=5k@@N=&42(JjqC`wICNy^Dj&JHfm Otg6aM-Tahg4if--z7|RV delta 124 zcmdnT*T}!Yo|!#Ar?ez9KX0-VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG93oD1h}C=_nYQ ML2$|D^(>uC03`D|!vFvP diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x64x64_32x32x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x64x64_32x32x64_relu.cu index 688064da3aee8198af4f42d9d05da5e753446aba..822b04790310f355042ea30ce25496956e94eba4 100644 GIT binary patch delta 54 zcmey*w})?oE%Rg-=I+VYm={j2W08_|D=5k@@N=&42(JjqC`wICNy^Dj&JHfmtg6aM K-Tatk4if-C&J{oa delta 145 zcmdnP_n&WrEi-$5PH9PIe%@qT=59Fq1oOfisJN9?Nk&m>VoFj@esZ>zm2-Yx8C1s4 zxxyp7A_O5DT%K7~m6K}2r2quUr6oCu#l==u$p{?~?ID!~sa96mNG92BUcfSk2><|9 BH<|zd diff --git a/dnn/src/cuda/convolution/backward_data/algo.cpp b/dnn/src/cuda/convolution/backward_data/algo.cpp index dcdcd5896..c643db1d2 100644 --- a/dnn/src/cuda/convolution/backward_data/algo.cpp +++ b/dnn/src/cuda/convolution/backward_data/algo.cpp @@ -6,7 +6,8 @@ * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT 
ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. */ #include "./algo.h" @@ -20,32 +21,38 @@ ConvolutionBackwardDataImpl::AlgoPack::AlgoPack() { non_cudnn_algos.push_back(&chanwise_small); non_cudnn_algos.push_back(&matmul); - all_algos.push_back(&chanwise); // prefer chanwise - all_algos.push_back(&chanwise_small); // prefer small chanwise + all_algos.push_back(&chanwise); // prefer chanwise + all_algos.push_back(&chanwise_small); // prefer small chanwise fill_cudnn_algos(); - for (auto &&i: cudnn) { + for (auto&& i : cudnn) { all_algos.push_back(&i); } all_algos.push_back(&matmul); + fill_int8_dp4a_algos(); + for (auto&& algo : int8_nchw4_dotprod) { + all_algos.push_back(&algo); + int8_algos.push_back(&algo); + } + all_algos.reserve(all_algos.size() * 2); // add gconv algos by AlgoGroupConvGeneral auto all_algos_data = all_algos.data(); size_t group_algo_start = 2; - for (size_t i = group_algo_start; i < all_algos.size(); ++ i) { + for (size_t i = group_algo_start; i < all_algos.size(); ++i) { gconv.push_back({all_algos[i]}); } - for (size_t i = group_algo_start; i < all_algos.size(); ++ i) { + for (size_t i = group_algo_start; i < all_algos.size(); ++i) { algo2gconv[all_algos[i]] = &gconv[i - group_algo_start]; } - for (auto &&i: gconv) { + for (auto&& i : gconv) { all_algos.push_back(&i); } megdnn_assert(all_algos_data == all_algos.data()); - non_cudnn_algos.push_back(all_algos.rbegin()[0]); // group matmul + non_cudnn_algos.push_back(all_algos.rbegin()[0]); // group matmul all_algos.push_back(&bfloat16); bfloat16_algos.push_back(&bfloat16); @@ -59,63 +66,55 @@ MEGDNN_DEF_GET_ALGO_FROM_DESC(ConvolutionBackwardDataImpl) ConvolutionBackwardDataImpl::AlgoCUDNN* ConvolutionBackwardDataImpl::AlgoPack::cudnn_from_enum( cudnnConvolutionBwdDataAlgo_t algo) { - for (auto &&i: cudnn) { + for (auto&& i : cudnn) { if (i.cudnn_enum() == algo) return &i; } - 
megdnn_throw(megdnn_mangle(ssprintf( - "can not find cudnn bwd_data algorithm %d", - static_cast(algo)))); + megdnn_throw( + megdnn_mangle(ssprintf("can not find cudnn bwd_data algorithm %d", + static_cast(algo)))); } ConvolutionBackwardDataImpl::AlgoPack ConvolutionBackwardDataImpl::sm_algo_pack; ConvolutionBackwardDataImpl::AlgoBase::SizeArgs::SizeArgs( - ConvolutionBackwardDataImpl *o, - const TensorLayout &filter, const TensorLayout &diff, - const TensorLayout &grad): - SizeArgs(o, filter, o->check_layout_fwd(grad, filter, diff), diff, grad) -{ -} + ConvolutionBackwardDataImpl* o, const TensorLayout& filter, + const TensorLayout& diff, const TensorLayout& grad) + : SizeArgs(o, filter, o->check_layout_fwd(grad, filter, diff), diff, + grad) {} ConvolutionBackwardDataImpl::AlgoBase::SizeArgs::SizeArgs( - ConvolutionBackwardDataImpl *o, const TensorLayout& filter, - const CanonizedFilterMeta &filter_meta, const TensorLayout &diff, - const TensorLayout &grad): - handle{concrete_handle(o->handle())}, - filter_meta{filter_meta}, - diff_layout{&diff}, - grad_layout{&grad}, - filter_layout{&filter}, - opr{o} -{ -} + ConvolutionBackwardDataImpl* o, const TensorLayout& filter, + const CanonizedFilterMeta& filter_meta, const TensorLayout& diff, + const TensorLayout& grad) + : handle{concrete_handle(o->handle())}, + filter_meta{filter_meta}, + diff_layout{&diff}, + grad_layout{&grad}, + filter_layout{&filter}, + opr{o} {} ConvolutionBackwardDataImpl::AlgoBase::ExecArgs::ExecArgs( - ConvolutionBackwardDataImpl *opr, - _megdnn_tensor_in filter, - _megdnn_tensor_in diff, - _megdnn_tensor_out grad, - _megdnn_workspace workspace): - SizeArgs(opr, filter.layout, diff.layout, grad.layout), - filter_tensor{&filter}, diff_tensor{&diff}, grad_tensor{&grad}, - workspace{workspace} -{ -} + ConvolutionBackwardDataImpl* opr, _megdnn_tensor_in filter, + _megdnn_tensor_in diff, _megdnn_tensor_out grad, + _megdnn_workspace workspace) + : SizeArgs(opr, filter.layout, diff.layout, 
grad.layout), + filter_tensor{&filter}, + diff_tensor{&diff}, + grad_tensor{&grad}, + workspace{workspace} {} std::string ConvolutionBackwardDataImpl::AlgoBase::SizeArgs::to_string() const { - auto &&fm = filter_meta; + auto&& fm = filter_meta; MEGDNN_MARK_USED_VAR(fm); return megdnn_mangle(ssprintf( - "filter=%u{%u,%u,%u,%u}, diff=%s, grad=%s, " - "pad=%ux%u, stride=%ux%u, dilate=%ux%u, xcorr=%d, dtype=%s,%s", - fm.group, fm.ocpg, fm.icpg, fm.spatial[0], fm.spatial[1], - diff_layout->to_string().c_str(), - grad_layout->to_string().c_str(), - fm.padding[0], fm.padding[1], fm.stride[0], fm.stride[1], - fm.dilation[0], fm.dilation[1], - !fm.should_flip, - diff_layout->dtype.name(), grad_layout->dtype.name())); + "filter=%u{%u,%u,%u,%u}, diff=%s, grad=%s, " + "pad=%ux%u, stride=%ux%u, dilate=%ux%u, xcorr=%d, dtype=%s,%s", + fm.group, fm.ocpg, fm.icpg, fm.spatial[0], fm.spatial[1], + diff_layout->to_string().c_str(), grad_layout->to_string().c_str(), + fm.padding[0], fm.padding[1], fm.stride[0], fm.stride[1], + fm.dilation[0], fm.dilation[1], !fm.should_flip, + diff_layout->dtype.name(), grad_layout->dtype.name())); } // vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convolution/backward_data/algo.h b/dnn/src/cuda/convolution/backward_data/algo.h index e3cd9f94d..0f2a1e47d 100644 --- a/dnn/src/cuda/convolution/backward_data/algo.h +++ b/dnn/src/cuda/convolution/backward_data/algo.h @@ -6,7 +6,8 @@ * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
*/ #pragma once @@ -38,6 +39,7 @@ public: CUDA_CHANWISE_SMALL, CUDA_BFLOAT16, CUDA_GROUP_CONV_GENERAL, + CUDA_IMPLICIT_GEMM_NCHW4_DOTPROD_INT8 }; using Mapper = std::unordered_map; @@ -240,9 +242,53 @@ public: } }; +class ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm final + : public AlgoBase { +public: + struct AlgoParam { + int threadblock_m; + int threadblock_n; + int threadblock_k; + int warp_m; + int warp_n; + int warp_k; + int stage; + std::string to_string() { + /// default algorithm + if (threadblock_m == 128 && threadblock_n == 128 && + threadblock_k == 32 && warp_m == 32 && warp_n == 64 && + warp_k == 32 && stage == 2) { + return ""; + } + return ssprintf("_%dX%dX%d_%dX%dX%d_%dstage", threadblock_m, + threadblock_n, threadblock_k, warp_m, warp_n, + warp_k, stage); + } + }; + AlgoInt8NCHW4DotProdImplicitGemm(AlgoParam algo_param) + : m_algo_param{algo_param}, + m_name{ssprintf("INT8_NCHW4_DOTPROD_IMPLICIT_GEMM%s", + m_algo_param.to_string().c_str())} {} + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + void exec(const ExecArgs& args) const override; + const char* name() const override { return m_name.c_str(); } + AlgoAttribute attribute() const override { + return AlgoAttribute::REPRODUCIBLE; + } + MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_NCHW4_DOTPROD_INT8) +private: + WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr, + const SizeArgs& args) const; + AlgoParam m_algo_param; + std::string m_name; +}; + class ConvolutionBackwardDataImpl::AlgoPack : NonCopyableObj { // defined in cudnn.cpp void fill_cudnn_algos(); + // defined in implicit_gemm_int8_nchw4_dp4a.cpp + void fill_int8_dp4a_algos(); AlgoBase::Mapper m_all_algos_map; @@ -256,12 +302,13 @@ public: std::vector gconv; std::unordered_map algo2gconv; AlgoBFloat16 bfloat16; + std::vector int8_nchw4_dotprod; std::vector //! all algorithms all_algos, //! 
non-cudnn algos, used for heuristic if cudnn is not supported - non_cudnn_algos, bfloat16_algos; + non_cudnn_algos, bfloat16_algos, int8_algos; AlgoCUDNN* cudnn_from_enum(cudnnConvolutionBwdDataAlgo_t algo); diff --git a/dnn/src/cuda/convolution/backward_data/chanwise.cpp b/dnn/src/cuda/convolution/backward_data/chanwise.cpp index f6272fca7..89562f4d0 100644 --- a/dnn/src/cuda/convolution/backward_data/chanwise.cpp +++ b/dnn/src/cuda/convolution/backward_data/chanwise.cpp @@ -6,7 +6,8 @@ * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. */ #include "./algo.h" @@ -19,8 +20,10 @@ using namespace convolution; bool ConvolutionBackwardDataImpl::AlgoChanwise::is_available( const SizeArgs& args) const { - if (args.diff_layout->dtype == args.filter_layout->dtype && - args.diff_layout->dtype == dtype::BFloat16()) { + if ((args.diff_layout->dtype == args.filter_layout->dtype && + args.diff_layout->dtype == dtype::BFloat16()) || + (args.diff_layout->dtype == args.filter_layout->dtype && + args.diff_layout->dtype == dtype::QuantizedS8())) { return false; } auto&& fm = args.filter_meta; @@ -74,4 +77,3 @@ void ConvolutionBackwardDataImpl::AlgoChanwise::exec( } // vim: syntax=cpp.doxygen - diff --git a/dnn/src/cuda/convolution/backward_data/chanwise_small.cpp b/dnn/src/cuda/convolution/backward_data/chanwise_small.cpp index cc5b12e20..bb8ba182d 100644 --- a/dnn/src/cuda/convolution/backward_data/chanwise_small.cpp +++ b/dnn/src/cuda/convolution/backward_data/chanwise_small.cpp @@ -6,7 +6,8 @@ * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. */ #include "src/cuda/convolution/backward_data/algo.h" @@ -28,9 +29,11 @@ inline bool is_available_small(const chanwise::Param& param) { } // anonymous namespace bool ConvolutionBackwardDataImpl::AlgoChanwiseSmall::is_available( - const SizeArgs &args) const { - if (args.diff_layout->dtype == args.filter_layout->dtype && - args.diff_layout->dtype == dtype::BFloat16()) { + const SizeArgs& args) const { + if ((args.diff_layout->dtype == args.filter_layout->dtype && + args.diff_layout->dtype == dtype::BFloat16()) || + (args.diff_layout->dtype == args.filter_layout->dtype && + args.diff_layout->dtype == dtype::QuantizedS8())) { return false; } #if CUDA_VERSION < 9000 @@ -38,30 +41,29 @@ bool ConvolutionBackwardDataImpl::AlgoChanwiseSmall::is_available( return false; #endif auto kparam = chanwise::Param::from_fwd_args(args.as_fwd_args()); - auto &&fm = args.filter_meta; + auto&& fm = args.filter_meta; return args.filter_meta.format == Param::Format::NCHW && - args.diff_layout->dtype.category() == DTypeCategory::FLOAT && + args.diff_layout->dtype.category() == DTypeCategory::FLOAT && args.opr->param().compute_mode == Param::ComputeMode::DEFAULT && - fm.spatial_ndim == 2 && fm.icpg == 1 && - fm.dilation[0] == 1 && fm.dilation[1] == 1 && - !fm.should_flip && is_available_small(kparam); + fm.spatial_ndim == 2 && fm.icpg == 1 && fm.dilation[0] == 1 && + fm.dilation[1] == 1 && !fm.should_flip && is_available_small(kparam); } size_t ConvolutionBackwardDataImpl::AlgoChanwiseSmall::get_workspace_in_bytes( - const SizeArgs &) const { + const SizeArgs&) const { return 0; } void ConvolutionBackwardDataImpl::AlgoChanwiseSmall::exec( - const ExecArgs &args) const { + const ExecArgs& args) const { auto kparam = chanwise::Param::from_fwd_args(args.as_fwd_args()); auto stream = cuda_stream(args.handle); switch (args.grad_layout->dtype.enumv()) { case DTypeEnum::Float32: - return 
chanwise::run_bwd_data_small(args.grad_tensor->ptr(), - args.diff_tensor->ptr(), - args.filter_tensor->ptr(), kparam, - stream); + return chanwise::run_bwd_data_small( + args.grad_tensor->ptr(), + args.diff_tensor->ptr(), + args.filter_tensor->ptr(), kparam, stream); #if CUDA_VERSION >= 9000 case DTypeEnum::Float16: return chanwise::run_bwd_data_small( @@ -77,4 +79,3 @@ void ConvolutionBackwardDataImpl::AlgoChanwiseSmall::exec( } // vim: syntax=cpp.doxygen - diff --git a/dnn/src/cuda/convolution/backward_data/cutlass_deconvolution_wrapper.cu b/dnn/src/cuda/convolution/backward_data/cutlass_deconvolution_wrapper.cu new file mode 100644 index 000000000..1d054ae4f --- /dev/null +++ b/dnn/src/cuda/convolution/backward_data/cutlass_deconvolution_wrapper.cu @@ -0,0 +1,100 @@ +/** + * \file src/cuda/convolution/backward_data/cutlass_deconvolution_wrapper.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
+ */ +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" + +#if !MEGDNN_TEGRA_X1 +#include "cutlass/convolution/device/convolution.h" +#endif +#include "src/common/opr_param_defs_enumv.cuh" +#include "src/cuda/convolution/backward_data/cutlass_deconvolution_wrapper.cuh" +#pragma GCC diagnostic pop + +using namespace megdnn; +using namespace cuda; +using namespace cutlass_wrapper; + +/* ================ cutlass kernel wrapper for nchw4 layout ================= */ +#if MEGDNN_TEGRA_X1 +void megdnn::cuda::cutlass_wrapper::do_deconv_int8_implicit_gemm_dp4a_ncdiv4hw4( + const int8_t* /* d_src */, const int8_t* /* d_filter */, + int8_t* /* d_dst */, int* /* workspace */, + const convolution::ConvParam& /* param */, float /* alpha */, + const GemmCoord& /* threadblock_shape */, + const GemmCoord& /* warp_shape */, int /* stages */, + cudaStream_t /* stream */) {} +#else +void megdnn::cuda::cutlass_wrapper::do_deconv_int8_implicit_gemm_dp4a_ncdiv4hw4( + const int8_t* d_src, const int8_t* d_filter, int8_t* d_dst, + int* workspace, const convolution::ConvParam& param, float alpha, + const GemmCoord& threadblock_shape, const GemmCoord& warp_shape, + int stages, cudaStream_t stream) { +#define DISPATCH_KERNEL_WITH_TILE_SHAPE(threadblock_m_, threadblock_n_, \ + threadblock_k_, warp_m_, warp_n_, \ + warp_k_, stage_, aligned_) \ + if (threadblock_shape.m() == threadblock_m_ && \ + threadblock_shape.n() == threadblock_n_ && \ + threadblock_shape.k() == threadblock_k_ && \ + warp_shape.m() == warp_m_ && warp_shape.n() == warp_n_ && \ + warp_shape.k() == warp_k_ && stages == stage_) { \ + using ThreadBlockShape = \ + cutlass::gemm::GemmShape; \ + using WarpShape = cutlass::gemm::GemmShape; \ + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; \ + using Deconvolution = cutlass::conv::device::Deconvolution< \ + int8_t, cutlass::layout::TensorNCxHWx<4>, int8_t, \ + 
cutlass::layout::TensorKxRSCx<4>, ElementOutput, \ + cutlass::layout::TensorNCxHWx<4>, int32_t, \ + cutlass::layout::TensorNCxHWx<4>, int32_t, \ + cutlass::arch::OpClassSimt, cutlass::arch::Sm61, \ + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ + cutlass::conv::threadblock:: \ + ConvolutionDgradNCxHWxThreadblockSwizzle, \ + stage_, 4, aligned_>; \ + typename Deconvolution::ConvolutionParameter conv_param( \ + param.n, param.hi, param.wi, param.ci, param.co, param.fh, \ + param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \ + param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation); \ + return cutlass_deconvolution_wrapper( \ + d_src, d_filter, nullptr, nullptr, d_dst, workspace, \ + conv_param, epilogue, stream); \ + } +#define DISPATCH_KERNEL \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(16, 64, 8, 16, 64, 8, 2, 4); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(16, 128, 16, 16, 64, 16, 2, 4); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(16, 128, 16, 16, 128, 16, 1, 8); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 128, 32, 32, 64, 32, 2, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 32, 64, 32, 32, 2, 16); \ + megdnn_assert(false, \ + "unsupported threadblock shape (%dx%dx%d) and warp shape " \ + "(%dx%dx%d)", \ + threadblock_shape.m(), threadblock_shape.n(), \ + threadblock_shape.k(), warp_shape.m(), warp_shape.n(), \ + warp_shape.k()); + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementBias = int32_t; + using ElementCompute = float; + using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + ElementOutput, 4, ElementAccumulator, ElementBias, ElementCompute>; + typename EpilogueOp::Params epilogue{alpha, 0, 0}; + DISPATCH_KERNEL; + +#undef DISPATCH_KERNEL_WITH_TILE_SHAPE +#undef DISPATCH_KERNEL +} +#endif + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/convolution/backward_data/cutlass_deconvolution_wrapper.cuh b/dnn/src/cuda/convolution/backward_data/cutlass_deconvolution_wrapper.cuh new file mode 100644 
index 000000000..35961673d --- /dev/null +++ b/dnn/src/cuda/convolution/backward_data/cutlass_deconvolution_wrapper.cuh @@ -0,0 +1,44 @@ +/** + * \file src/cuda/convolution/backward_data/cutlass_deconvolution_wrapper.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#pragma once +#include "cutlass/gemm/gemm.h" +#include "src/cuda/convolution_helper/parameter.cuh" +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { +namespace cutlass_wrapper { + +using GemmCoord = cutlass::gemm::GemmCoord; + +template +void cutlass_deconvolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); + +void do_deconv_int8_implicit_gemm_dp4a_ncdiv4hw4( + const int8_t* d_src, const int8_t* d_filter, int8_t* d_dst, + int* workspace, const convolution::ConvParam& param, float alpha, + const GemmCoord& threadblock_shape, const GemmCoord& warp_shape, + int stages, cudaStream_t stream); + +} // namespace cutlass_wrapper +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/convolution/backward_data/deconv_int8_helper.cu b/dnn/src/cuda/convolution/backward_data/deconv_int8_helper.cu new file mode 100644 index 000000000..f3d284c39 --- /dev/null +++ b/dnn/src/cuda/convolution/backward_data/deconv_int8_helper.cu @@ -0,0 +1,76 @@ +/** + * 
\file src/cuda/convolution/backward_data/deconv_int8_helper.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ + +#include "src/cuda/convolution/backward_data/deconv_int8_helper.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace deconv; + +#define BLOCKSIZE_X 16 +#define BLOCKSIZE_Y 16 + +namespace { + +// +__global__ void reorder_filter_nc4hw4_to_n4hwc4_kernel( + int8_t* __restrict__ dst, const int8_t* __restrict__ src, uint32_t OC, + uint32_t IC, uint32_t FHFW) { + const int32_t ocb = blockIdx.z; + const int32_t icb = blockIdx.y * BLOCKSIZE_X + threadIdx.y; + const int32_t fhfw = blockIdx.x * BLOCKSIZE_Y + threadIdx.x; + + if (fhfw < FHFW && icb < IC / 4) { + int src0 = *reinterpret_cast( + src + (ocb * 4 + 0) * IC * FHFW + (icb * FHFW + fhfw) * 4); + int src1 = *reinterpret_cast( + src + (ocb * 4 + 1) * IC * FHFW + (icb * FHFW + fhfw) * 4); + int src2 = *reinterpret_cast( + src + (ocb * 4 + 2) * IC * FHFW + (icb * FHFW + fhfw) * 4); + int src3 = *reinterpret_cast( + src + (ocb * 4 + 3) * IC * FHFW + (icb * FHFW + fhfw) * 4); + // transpose 4x4 + int dst01_lo = __byte_perm(src0, src1, 0x5140); + int dst01_hi = __byte_perm(src0, src1, 0x7362); + int dst23_lo = __byte_perm(src2, src3, 0x5140); + int dst23_hi = __byte_perm(src2, src3, 0x7362); + int dst0 = __byte_perm(dst01_lo, dst23_lo, 0x5410); + int dst1 = __byte_perm(dst01_lo, dst23_lo, 0x7632); + int dst2 = __byte_perm(dst01_hi, dst23_hi, 0x5410); + int dst3 = __byte_perm(dst01_hi, dst23_hi, 0x7632); + + *reinterpret_cast( + dst + (ocb * FHFW * IC + fhfw * IC + icb * 4 + 0) * 4) = dst0; + *reinterpret_cast( + dst + (ocb * FHFW * IC + fhfw * IC + icb * 4 + 1) * 4) = 
dst1; + *reinterpret_cast( + dst + (ocb * FHFW * IC + fhfw * IC + icb * 4 + 2) * 4) = dst2; + *reinterpret_cast( + dst + (ocb * FHFW * IC + fhfw * IC + icb * 4 + 3) * 4) = dst3; + } +} + +} // namespace + +void megdnn::cuda::deconv::reorder_filter_nc4hw4_to_n4hwc4( + int8_t* dst, const int8_t* src, uint32_t OC, uint32_t IC, uint32_t FH, + uint32_t FW, cudaStream_t stream) { + dim3 threads(BLOCKSIZE_X, BLOCKSIZE_Y, 1); + dim3 blocks(DIVUP(FH * FW, BLOCKSIZE_X), DIVUP(IC / 4, BLOCKSIZE_Y), + OC / 4); + + reorder_filter_nc4hw4_to_n4hwc4_kernel<<>>( + dst, src, OC, IC, FH * FW); + after_kernel_launch(); +} + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/convolution/backward_data/deconv_int8_helper.cuh b/dnn/src/cuda/convolution/backward_data/deconv_int8_helper.cuh new file mode 100644 index 000000000..f50b3c36d --- /dev/null +++ b/dnn/src/cuda/convolution/backward_data/deconv_int8_helper.cuh @@ -0,0 +1,27 @@ +/** + * \file src/cuda/convolution/backward_data/deconv_int8_helper.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
+ */ +#pragma once +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { +namespace deconv { + +void reorder_filter_nc4hw4_to_n4hwc4(int8_t* dst, const int8_t* src, + uint32_t OC, uint32_t IC, uint32_t FH, + uint32_t FW, cudaStream_t stream); + +} // namespace deconv +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/convolution/backward_data/group_conv.cpp b/dnn/src/cuda/convolution/backward_data/group_conv.cpp index e4df3ac8d..c2e3b9f45 100644 --- a/dnn/src/cuda/convolution/backward_data/group_conv.cpp +++ b/dnn/src/cuda/convolution/backward_data/group_conv.cpp @@ -6,7 +6,8 @@ * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. */ #include "./algo.h" @@ -16,8 +17,8 @@ using namespace cuda; using namespace convolution; void ConvolutionBackwardDataImpl::AlgoGroupConvGeneral::modify_size_args( - ConvolutionBackwardDataImpl::AlgoBase::SizeArgs &args, - TensorLayout &diff_pg, TensorLayout &grad_pg) { + ConvolutionBackwardDataImpl::AlgoBase::SizeArgs& args, + TensorLayout& diff_pg, TensorLayout& grad_pg) { diff_pg = *args.diff_layout; grad_pg = *args.grad_layout; auto nr_grp = args.filter_meta.group; @@ -29,17 +30,18 @@ void ConvolutionBackwardDataImpl::AlgoGroupConvGeneral::modify_size_args( } ConvolutionBackwardDataImpl::AlgoGroupConvGeneral::AlgoGroupConvGeneral( - AlgoBase *impl): - m_impl{impl} -{ + AlgoBase* impl) + : m_impl{impl} { m_name = "group_conv:"; m_name += impl->name(); } bool ConvolutionBackwardDataImpl::AlgoGroupConvGeneral::is_available( - const SizeArgs &args) const { - if (args.diff_layout->dtype == args.filter_layout->dtype && - args.diff_layout->dtype == dtype::BFloat16()) { + const SizeArgs& args) const { + if 
((args.diff_layout->dtype == args.filter_layout->dtype && + args.diff_layout->dtype == dtype::BFloat16()) || + (args.diff_layout->dtype == args.filter_layout->dtype && + args.diff_layout->dtype == dtype::QuantizedS8())) { return false; } auto sub_args = args; @@ -48,8 +50,9 @@ bool ConvolutionBackwardDataImpl::AlgoGroupConvGeneral::is_available( return m_impl->is_available(sub_args); } -size_t ConvolutionBackwardDataImpl::AlgoGroupConvGeneral:: -get_workspace_in_bytes(const SizeArgs &args) const { +size_t +ConvolutionBackwardDataImpl::AlgoGroupConvGeneral::get_workspace_in_bytes( + const SizeArgs& args) const { auto sub_args = args; TensorLayout diff_pg, grad_pg; modify_size_args(sub_args, diff_pg, grad_pg); @@ -57,24 +60,24 @@ get_workspace_in_bytes(const SizeArgs &args) const { } void ConvolutionBackwardDataImpl::AlgoGroupConvGeneral::exec( - const ExecArgs &args) const { + const ExecArgs& args) const { auto sub_args = args; TensorND tflt{*args.filter_tensor}, tdiff{*args.diff_tensor}, - tgrad{*args.grad_tensor}; + tgrad{*args.grad_tensor}; modify_size_args(sub_args, tdiff.layout, tgrad.layout); sub_args.filter_tensor = &tflt; sub_args.diff_tensor = &tdiff; sub_args.grad_tensor = &tgrad; auto grp = args.filter_meta.group; - auto &&fm = args.filter_meta; - auto strd_flt = (fm.icpg * fm.ocpg * - fm.spatial[0] * fm.spatial[1] * tflt.layout.dtype.size()), - strd_diff = ( - tdiff.layout.stride[1] * fm.ocpg * tdiff.layout.dtype.size()), - strd_grad = ( - tgrad.layout.stride[1] * fm.icpg * tgrad.layout.dtype.size()); - for (uint32_t g = 0; g < grp; ++ g) { + auto&& fm = args.filter_meta; + auto strd_flt = (fm.icpg * fm.ocpg * fm.spatial[0] * fm.spatial[1] * + tflt.layout.dtype.size()), + strd_diff = + (tdiff.layout.stride[1] * fm.ocpg * tdiff.layout.dtype.size()), + strd_grad = + (tgrad.layout.stride[1] * fm.icpg * tgrad.layout.dtype.size()); + for (uint32_t g = 0; g < grp; ++g) { m_impl->exec(sub_args); incr_voidp(tflt.raw_ptr, strd_flt); incr_voidp(tdiff.raw_ptr, 
strd_diff); @@ -83,4 +86,3 @@ void ConvolutionBackwardDataImpl::AlgoGroupConvGeneral::exec( } // vim: syntax=cpp.doxygen - diff --git a/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw4_dp4a.cpp b/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw4_dp4a.cpp new file mode 100644 index 000000000..9cb8e647e --- /dev/null +++ b/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw4_dp4a.cpp @@ -0,0 +1,127 @@ +/** + * \file dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ + +#include "./algo.h" +#include "src/cuda/utils.h" +#include "src/cuda/convolution_helper/parameter.cuh" +#include "src/cuda/convolution/backward_data/cutlass_deconvolution_wrapper.cuh" +#include "src/cuda/convolution/backward_data/deconv_int8_helper.cuh" + +using namespace megdnn; +using namespace cuda; + +bool ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm:: + is_available(const SizeArgs& args) const { + auto&& fm = args.filter_meta; + if (fm.format != Param::Format::NCHW4) + return false; + + bool available = true; + + auto src_dtype = args.diff_layout->dtype, + filter_dtype = args.filter_layout->dtype, + dst_dtype = args.grad_layout->dtype; + + available &= (src_dtype.enumv() == DTypeEnum::QuantizedS8 && + filter_dtype.enumv() == DTypeEnum::QuantizedS8 && + dst_dtype.enumv() == DTypeEnum::QuantizedS8); + // TODO support group deconv int8 + available &= (fm.group == 1); + // mode must be cross correlation + available &= !fm.should_flip; + // mode must be 2D + available &= fm.spatial_ndim == 2; + // TODO: support dialtion + available &= (fm.dilation[0] == 1 && 
fm.dilation[1] == 1); + // FIXME: too large filter size is not supported now + available &= fm.spatial[0] * fm.spatial[1] <= 64; + // only support sm_61 or later, platform should have fast native int8 + // support + available &= is_compute_capability_required(6, 1); + + return available; +} + +WorkspaceBundle ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm:: + get_workspace_bundle(dt_byte* raw_ptr, const SizeArgs& args) const { + size_t ws_filter = args.filter_layout->span().dist_byte(); + return WorkspaceBundle{raw_ptr, {ws_filter}}; +} + +size_t ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm:: + get_workspace_in_bytes(const SizeArgs& args) const { + return get_workspace_bundle(nullptr, args).total_size_in_bytes(); +} + +void ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec( + const ExecArgs& args) const { + auto&& fm = args.filter_meta; + size_t n = args.diff_layout->operator[](0), + co = args.diff_layout->operator[](1) * 4, + ho = args.diff_layout->operator[](2), + wo = args.diff_layout->operator[](3); + size_t ci = args.grad_layout->operator[](1) * 4, + hi = args.grad_layout->operator[](2), + wi = args.grad_layout->operator[](3); + size_t fh = fm.spatial[0], fw = fm.spatial[1]; + size_t sh = fm.stride[0], sw = fm.stride[1]; + size_t ph = fm.padding[0], pw = fm.padding[1]; + + auto&& stream = cuda_stream(args.opr->handle()); + + int8_t* filter_ptr = nullptr; + // TODO: weight preprocess + { + filter_ptr = reinterpret_cast(args.workspace.raw_ptr); + // reformat filter from nc4hw4 to n4hwc4 + megdnn::cuda::deconv::reorder_filter_nc4hw4_to_n4hwc4( + filter_ptr, args.filter_tensor->compatible_ptr(), co, + ci, fh, fw, stream); + } + convolution::ConvParam kern_param; + kern_param.n = n, kern_param.co = co, kern_param.ci = ci, + kern_param.hi = hi, kern_param.wi = wi, kern_param.ho = ho, + kern_param.wo = wo, kern_param.ph = ph, kern_param.pw = pw, + kern_param.sh = sh, kern_param.sw = sw, kern_param.fh = fh, + 
kern_param.fw = fw; + + float diff_scale = + args.diff_layout->dtype.param().scale, + filter_scale = + args.filter_layout->dtype.param().scale, + grad_scale = + args.grad_layout->dtype.param().scale; + float alpha = diff_scale * filter_scale / grad_scale; + cutlass_wrapper::do_deconv_int8_implicit_gemm_dp4a_ncdiv4hw4( + args.diff_tensor->compatible_ptr(), filter_ptr, + args.grad_tensor->compatible_ptr(), nullptr, kern_param, + alpha, + cutlass_wrapper::GemmCoord{m_algo_param.threadblock_m, + m_algo_param.threadblock_n, + m_algo_param.threadblock_k}, + cutlass_wrapper::GemmCoord{m_algo_param.warp_m, m_algo_param.warp_n, + m_algo_param.warp_k}, + m_algo_param.stage, stream); + + after_kernel_launch(); +} + +void ConvolutionBackwardDataImpl::AlgoPack::fill_int8_dp4a_algos() { + using AlgoParam = AlgoInt8NCHW4DotProdImplicitGemm::AlgoParam; + int8_nchw4_dotprod.emplace_back(AlgoParam{16, 64, 8, 16, 64, 8, 2}); + int8_nchw4_dotprod.emplace_back(AlgoParam{16, 128, 16, 16, 64, 16, 2}); + int8_nchw4_dotprod.emplace_back(AlgoParam{16, 128, 16, 16, 128, 16, 1}); + int8_nchw4_dotprod.emplace_back(AlgoParam{32, 128, 32, 32, 64, 32, 2}); + int8_nchw4_dotprod.emplace_back(AlgoParam{64, 128, 32, 64, 32, 32, 2}); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convolution/backward_data/int8/deconv_int8_implicit_gemm_cutlass_wrapper.cuinl b/dnn/src/cuda/convolution/backward_data/int8/deconv_int8_implicit_gemm_cutlass_wrapper.cuinl new file mode 100644 index 000000000..f5382f919 --- /dev/null +++ b/dnn/src/cuda/convolution/backward_data/int8/deconv_int8_implicit_gemm_cutlass_wrapper.cuinl @@ -0,0 +1,62 @@ +/** + * \file + * dnn/src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "cutlass/convolution/device/convolution.h" +#include "src/cuda/convolution/backward_data/cutlass_deconvolution_wrapper.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace cutlass_wrapper; + +template +void megdnn::cuda::cutlass_wrapper::cutlass_deconvolution_wrapper( + const typename Deconvolution::ElementSrc* d_src, + const typename Deconvolution::ElementFilter* d_filter, + const typename Deconvolution::ElementBias* d_bias, + const typename Deconvolution::ElementDst* d_z, + typename Deconvolution::ElementDst* d_dst, int* workspace, + typename Deconvolution::ConvolutionParameter const& conv_param, + typename Deconvolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream) { + typename Deconvolution::TensorRefSrc tensor_src{ + const_cast(d_src), + Deconvolution::LayoutSrc::packed( + {conv_param.N, conv_param.P, conv_param.Q, conv_param.K})}; + typename Deconvolution::TensorRefFilter tensor_filter{ + const_cast(d_filter), + Deconvolution::LayoutFilter::packed( + {conv_param.K, conv_param.R, conv_param.S, conv_param.C})}; + typename Deconvolution::TensorRefBias tensor_bias{ + const_cast(d_bias), + Deconvolution::LayoutBias::packed({1, 1, 1, conv_param.K})}; + typename Deconvolution::TensorRefDst tensor_z{ + const_cast(d_z), + Deconvolution::LayoutDst::packed( + {conv_param.N, conv_param.H, conv_param.W, conv_param.C})}; + typename Deconvolution::TensorRefDst tensor_dst{ + d_dst, + Deconvolution::LayoutDst::packed( + {conv_param.N, conv_param.H, conv_param.W, conv_param.C})}; + typename Deconvolution::Arguments arguments{conv_param, + tensor_src.non_const_ref(), + tensor_filter.non_const_ref(), + tensor_bias.non_const_ref(), + tensor_z.non_const_ref(), + tensor_dst.non_const_ref(), + epilogue}; + 
Deconvolution deconv_op; + cutlass_check(deconv_op.initialize(arguments, workspace)); + cutlass_check(deconv_op(stream)); + after_kernel_launch(); +} + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/convolution/backward_data/int8/kimpl/deconv_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_id.cu b/dnn/src/cuda/convolution/backward_data/int8/kimpl/deconv_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_id.cu new file mode 100644 index 0000000000000000000000000000000000000000..296c397a9ee3dae4705d104ee50e0e22a1f7757f GIT binary patch literal 1837 zcmbVN*-jfV6n)QExIwB~6o_amgiMMANt#w=sTd^sHgardV#VXJyd;FL@3m*i1c-p< z!EtP#yPb1y_QVW*xEfDJ*VpOYcyfE5{ypmS`zYgz8{#-a({=cfQqM?A3$;wAg4lGy zjY>tXr9IT^01%lf3`Z+slu#KeW{7$xiM5?QZAg|AG#L&N69nE_Cn(b1&Uf9F#u(}z z+^ZsG*?d)_5^Fv5TZ#V4s!)-Er4jWmw8}rQ7-P~}(E@VL$$((53hKB4g zp1>Nf21my|boAu}mg6SWdt&r|0e>kV#nVvk*YV)8hc@*YYaxrw^Bdg`msh74yO9XjW(i>li;g7DyiR`#!$;Eptlv#OK0(MB}$Bmp>vFHG~{e;B62`K zxl8h6sBT0(RYMOo{PxgO#(HSOLxpcFmnA59m3d;)qLhcN_Ov@8TeInS6<7>6bg>sGUF4$A(n z*sObbutnb4r$#c6AGDcUYp!4lp{-{ak0sB!3QfO{SPB!S$|>mWO~{h}0G`Dm@Lqfn zen5xG{9Y^wEI=PyxL5M55G*LOsSs#&fN#<|e(LT;5%z|BgESKFYY_hB(g9bRB-A)H9OOLM_v&AU0ia zqf(J;X%F=}07Rw=!_kTuB~*rr8KT}vVr^$n8#fr4ef_i`XQA+YS5kc$bV`!p7F^#=LNKKR2APAF`l}WKJ}1Lqm2L zU&0!%hDU=QIyyduWzdBBK#YDDaB#AP`>lWzPeZleh6A#PHszUUA&boOTiuS9SEm@e zm=nf7fb9#$u7u*mjEa0J6bTMTlH_{WL9cuwy$+o95Ny94r?77}Gg%PVWIm39s1(x< zB@4p@$lJd+0f1J+$GW!Z()75|lhlgSIdpZQ5&`8Nx~>mLB7iWF?-S zB(F>zM-EPU2vky^HRkK2uDx_Z%QcXk5a+{q0(u;WDFj=zECfS2&zMpehaL~>RysQ#E^_zZo2Ql6xIl84~T?JAsK)OMWtgLbqhm#zLcIXC6~^MT=gb|&LMG@ zFk(qhD0|jpJk;vXp-e%AO;Vm^(1=i0vNS|%ZVssvN_IOP=R=`uvd*224MpGU24F0M z5#tB0irm}|RzDwC{gaFHEzA!gp}gI&->dt0@DesG>~749P!BW13A>UxoyRkxc^m4h z!*~qO@Tz}w8llsZ2%R;FJ`$t<1NKV+A&&Z1zm1O2QG_%7|wK8ZOq|rr)apq%{rjquJHKba;0%}_sy=DuSD?wsR44q+or=jMeh(!kI zXYb04 zVzZv({uX%`UmD3keK2Bnt+|2?gto3>Jd!-)$`AZLVhL=QswS_ucOgsuJ$M=xz(?_3 z_#W-I@<*{CFb{od;ohjTLNKq)rbD3deYDKYywHT!1o{T-a@NwvpL(yC-TLgm7D?Fl 
ztq8juHL7%0JA0=w4*rVTS-mG(uDmv43tKQtEP}lRhp*?m{!?_WJ9}I)G41>V9N}_A literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/convolution/backward_data/int8/kimpl/deconv_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_id.cu b/dnn/src/cuda/convolution/backward_data/int8/kimpl/deconv_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_id.cu new file mode 100644 index 0000000000000000000000000000000000000000..400b5db240a6868af8ef39dcf6faa217eef16692 GIT binary patch literal 1837 zcmb7FTTk0C6n^io@QO5TC{R%lLZ(B4w(F*GnFtbli(K1HtvZh7*e&JP@7TFC1uCEq zse9t@+`jLe?ui-tax#FH@AkX!BM2LzO?$>#$UJrYPPen= zl@((fYQp#m*uG@!S}0D;Fwdq!5g%|UNu~!K^u{O9Yav*K;CS3m;M^Q$GAFjFd=vyx zDW;u>*9pdqF9qcoZ(S>c)CJ$#&;TO zE{a%WfPV2V&Q77b0rgT05o+j-&`ZW5wBez`H=fHDlsrtmwlE%U#%q)s!b&BU5yO>u zC7z!puUvuDPJs5)2!Tv$W8=TB>c&Ycv|IzrF|p3?C*a3&SVM3`%Um#&@sufru^)2( z+#1%6p`7oE&U%&yTkKtYYcvA|!kF2$<_fkD+Pa4ESn`Z3KlJ;EC9q+tqP*Zfge(Q% z!LzUgK8yFt_h`SFKZ^y0dFXQs_fDM^f_Z5+9RiW>qg8Geg(kEn&^KV0vz9>q*1Nsz z)>r?vOv1)*McC!2k)^ZR**lGK@K@B%>P69V<+Tx8*o0YP5iBM+d_UjyzoK*9+2e|d GS?50}6mtFm literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/convolution/backward_data/int8/kimpl/deconv_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_id.cu b/dnn/src/cuda/convolution/backward_data/int8/kimpl/deconv_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_id.cu new file mode 100644 index 0000000000000000000000000000000000000000..c149a8e1b1bd7db1f72f2d06afee60c913ef14b8 GIT binary patch literal 1837 zcmb7FTTk0C6n^io@QO5TC{R%lLZ(B4w(F*GnFtbli(K1HtvZh7*e&JP@7TFC1uCEq zse9t@+`jLe?ui-taxuBtl31vj`pcn^Yf((H{aH_qTAr6>#FH@AkX!BM2LzO?$>#$UJrYPPen= zl@((fYQp#m*uG@!S}0D;Fwdq!5g%|UNu~!K^u{O9Yav*K;CS3m;M^Q$GAFjFd=vyx zDW;u>*9pdqF9qcoZ(S>c)CJ$#&;TO zE{a%WfPV2V&Q77b0rgT05o+j-&`ZW5wBez`H=fHDlsrtmwlE%U#%q)s!b&BU5yO>u zC7z!puUvszPJs5)2!Tv$W8=TB>c&Ycv|IzrF|p3?C*a3&SVM3`%Um#&@sufru^)2( z+#1%6p`7oE&U%&yTkKtYYcvA|!kF2$<_fkD+Pa4ESn`Z3KlJ;EC9q+tqP*Zfge(Q% z!LzUgK8yFt_h`SFKZ^y0dFXQs_fDM^f_Z5+9RiW>qg8Geg(kEn&^KV0vz9>q*1Nsz 
z)>r?vOv1)*McC!2k)^ZR**lGK@K@B%>P69V<+Tx8*o0YP5iBM+d_UjyzoK*9+2e|d GS?51NfO7)? literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/convolution/opr_impl.cpp b/dnn/src/cuda/convolution/opr_impl.cpp index badbde22d..a72d9c8f5 100644 --- a/dnn/src/cuda/convolution/opr_impl.cpp +++ b/dnn/src/cuda/convolution/opr_impl.cpp @@ -6,7 +6,8 @@ * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. */ #include "src/cuda/convolution/opr_impl.h" @@ -25,8 +26,9 @@ using namespace convolution; #define TO_STRING2(v) #v #define TO_STRING(v) TO_STRING2(v) -#define CUDNN_VERSION_STR TO_STRING(CUDNN_MAJOR) "." \ - TO_STRING(CUDNN_MINOR) "." TO_STRING(CUDNN_PATCHLEVEL) +#define CUDNN_VERSION_STR \ + TO_STRING(CUDNN_MAJOR) \ + "." TO_STRING(CUDNN_MINOR) "." TO_STRING(CUDNN_PATCHLEVEL) /* ============== ConvolutionForwardImpl ============== */ ConvolutionForwardImpl::Algorithm* @@ -72,25 +74,24 @@ void ConvolutionForwardImpl::exec(_megdnn_tensor_in src, } const char* ConvolutionForwardImpl::get_algorithm_set_name() const { - return "CUDA CONVOLUTION_FORWARD" ; + return "CUDA CONVOLUTION_FORWARD"; } /* ============== ConvolutionBackwardDataImpl ============== */ void ConvolutionBackwardDataImpl::exec(_megdnn_tensor_in filter, - _megdnn_tensor_in diff, - _megdnn_tensor_out grad, - _megdnn_workspace workspace) { + _megdnn_tensor_in diff, + _megdnn_tensor_out grad, + _megdnn_workspace workspace) { AlgoBase::ExecArgs args(this, filter, diff, grad, workspace); - auto algo = get_algorithm(this, filter.layout, - diff.layout, grad.layout); + auto algo = get_algorithm(this, filter.layout, diff.layout, grad.layout); algo->check_workspace(args, workspace).exec(args); } -std::vector -ConvolutionBackwardDataImpl::get_all_algorithms(const TensorLayout 
&filter, - const TensorLayout &diff, - const TensorLayout &grad) { +std::vector +ConvolutionBackwardDataImpl::get_all_algorithms(const TensorLayout& filter, + const TensorLayout& diff, + const TensorLayout& grad) { return megdnn::get_all_algorithms( {this, filter, diff, grad}); } @@ -106,10 +107,10 @@ ConvolutionBackwardDataImpl::get_algorithm_heuristic( } ConvolutionBackwardDataImpl::Algorithm* -ConvolutionBackwardDataImpl::get_algorithm_heuristic(const TensorLayout& filter, - const CanonizedFilterMeta& filter_meta, const TensorLayout& diff, - const TensorLayout& grad, size_t workspace_limit_in_bytes, - bool reproducible) { +ConvolutionBackwardDataImpl::get_algorithm_heuristic( + const TensorLayout& filter, const CanonizedFilterMeta& filter_meta, + const TensorLayout& diff, const TensorLayout& grad, + size_t workspace_limit_in_bytes, bool reproducible) { AlgoBase::SizeArgs args(this, filter, filter_meta, diff, grad); if (args.filter_meta.group > 1 && @@ -119,6 +120,19 @@ ConvolutionBackwardDataImpl::get_algorithm_heuristic(const TensorLayout& filter, return &sm_algo_pack.chanwise; } + if (args.filter_layout->dtype.enumv() == + DTypeTrait::enumv) { + if (reproducible) { + return megdnn::get_reproducible_algo( + sm_algo_pack.int8_algos, args, workspace_limit_in_bytes, + "cuda conv bwd_data"); + } else { + return megdnn::get_usable_algo( + sm_algo_pack.int8_algos, args, workspace_limit_in_bytes, + "cuda conv bwd_data"); + } + } + auto get_cudnn_algo = [this, &args, workspace_limit_in_bytes, reproducible]() -> ConvolutionBackwardDataImpl::AlgoBase* { @@ -206,12 +220,11 @@ ConvolutionBackwardDataImpl::get_algorithm_heuristic(const TensorLayout& filter, } size_t ConvolutionBackwardDataImpl::get_workspace_in_bytes( - const TensorLayout &filter, - const TensorLayout &diff, - const TensorLayout &grad) { + const TensorLayout& filter, const TensorLayout& diff, + const TensorLayout& grad) { AlgoBase::SizeArgs args(this, filter, diff, grad); - return get_algorithm(this, 
filter, args.filter_meta, diff, grad)-> - get_workspace_in_bytes(args); + return get_algorithm(this, filter, args.filter_meta, diff, grad) + ->get_workspace_in_bytes(args); } const char* ConvolutionBackwardDataImpl::get_algorithm_set_name() const { @@ -221,19 +234,19 @@ const char* ConvolutionBackwardDataImpl::get_algorithm_set_name() const { /* ============== ConvolutionBackwardFilterImpl ============== */ void ConvolutionBackwardFilterImpl::exec(_megdnn_tensor_in src, - _megdnn_tensor_in diff, - _megdnn_tensor_out grad, - _megdnn_workspace workspace) { + _megdnn_tensor_in diff, + _megdnn_tensor_out grad, + _megdnn_workspace workspace) { AlgoBase::ExecArgs args(this, src, diff, grad, workspace); - auto algo = get_algorithm(this, src.layout, diff.layout, - grad.layout, args.grad_filter_meta); + auto algo = get_algorithm(this, src.layout, diff.layout, grad.layout, + args.grad_filter_meta); algo->check_workspace(args, workspace).exec(args); } -std::vector -ConvolutionBackwardFilterImpl::get_all_algorithms(const TensorLayout &src, - const TensorLayout &diff, - const TensorLayout &grad) { +std::vector +ConvolutionBackwardFilterImpl::get_all_algorithms(const TensorLayout& src, + const TensorLayout& diff, + const TensorLayout& grad) { return megdnn::get_all_algorithms( {this, src, diff, grad}); } @@ -269,7 +282,7 @@ ConvolutionBackwardFilterImpl::get_algorithm_heuristic( CUDNNBwdFilterDescs desc; args.init_desc(desc); - //disable, segfault in megbrain, need further investigate. + // disable, segfault in megbrain, need further investigate. 
#if 0 auto is_heuristic_success = convolution::PerformanceModelBackwardFilter:: @@ -358,12 +371,11 @@ ConvolutionBackwardFilterImpl::get_algorithm_heuristic( } size_t ConvolutionBackwardFilterImpl::get_workspace_in_bytes( - const TensorLayout &src, - const TensorLayout &diff, - const TensorLayout &grad) { + const TensorLayout& src, const TensorLayout& diff, + const TensorLayout& grad) { AlgoBase::SizeArgs args(this, src, diff, grad); - return get_algorithm(this, src, diff, grad, args.grad_filter_meta)-> - get_workspace_in_bytes(args); + return get_algorithm(this, src, diff, grad, args.grad_filter_meta) + ->get_workspace_in_bytes(args); } const char* ConvolutionBackwardFilterImpl::get_algorithm_set_name() const { diff --git a/dnn/src/cuda/convolution/opr_impl.h b/dnn/src/cuda/convolution/opr_impl.h index 1ca8db092..7f9dbeaa7 100644 --- a/dnn/src/cuda/convolution/opr_impl.h +++ b/dnn/src/cuda/convolution/opr_impl.h @@ -105,6 +105,7 @@ public: class AlgoChanwiseSmall; class AlgoGroupConvGeneral; class AlgoBFloat16; + class AlgoInt8NCHW4DotProdImplicitGemm; class AlgoPack; diff --git a/dnn/test/common/convolution.cpp b/dnn/test/common/convolution.cpp index 5caebcf86..e3aea119e 100644 --- a/dnn/test/common/convolution.cpp +++ b/dnn/test/common/convolution.cpp @@ -6,7 +6,8 @@ * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
*/ #include "test/common/checker.h" @@ -44,14 +45,12 @@ std::vector convolution::get_args_common() { param::Convolution param; param.mode = param::Convolution::Mode::CONVOLUTION; - args.emplace_back(param, - TensorShape{5, 2, i, i+1}, - TensorShape{3, 2, 3, 4}); + args.emplace_back(param, TensorShape{5, 2, i, i + 1}, + TensorShape{3, 2, 3, 4}); param.mode = param::Convolution::Mode::CROSS_CORRELATION; - args.emplace_back(param, - TensorShape{5, 2, i, i+1}, - TensorShape{3, 2, 3, 4}); + args.emplace_back(param, TensorShape{5, 2, i, i + 1}, + TensorShape{3, 2, 3, 4}); } return args; @@ -65,14 +64,12 @@ std::vector convolution::get_args_padding() { param.pad_w = 2; param.mode = param::Convolution::Mode::CONVOLUTION; - args.emplace_back(param, - TensorShape{5, 2, i, i+1}, - TensorShape{3, 2, 3, 4}); + args.emplace_back(param, TensorShape{5, 2, i, i + 1}, + TensorShape{3, 2, 3, 4}); param.mode = param::Convolution::Mode::CROSS_CORRELATION; - args.emplace_back(param, - TensorShape{5, 2, i, i+1}, - TensorShape{3, 2, 3, 4}); + args.emplace_back(param, TensorShape{5, 2, i, i + 1}, + TensorShape{3, 2, 3, 4}); } return args; @@ -84,14 +81,12 @@ std::vector convolution::get_args_large_channel() { param::Convolution param; param.mode = param::Convolution::Mode::CONVOLUTION; - args.emplace_back(param, - TensorShape{2, 20, i, i+1}, - TensorShape{30, 20, 3, 4}); + args.emplace_back(param, TensorShape{2, 20, i, i + 1}, + TensorShape{30, 20, 3, 4}); param.mode = param::Convolution::Mode::CROSS_CORRELATION; - args.emplace_back(param, - TensorShape{2, 20, i, i+1}, - TensorShape{30, 20, 3, 4}); + args.emplace_back(param, TensorShape{2, 20, i, i + 1}, + TensorShape{30, 20, 3, 4}); } for (size_t i = 16; i < 24; ++i) { param::Convolution param; @@ -99,14 +94,12 @@ std::vector convolution::get_args_large_channel() { param.pad_w = 2; param.mode = param::Convolution::Mode::CONVOLUTION; - args.emplace_back(param, - TensorShape{2, 20, i, i+1}, - TensorShape{30, 20, 3, 4}); + 
args.emplace_back(param, TensorShape{2, 20, i, i + 1}, + TensorShape{30, 20, 3, 4}); param.mode = param::Convolution::Mode::CROSS_CORRELATION; - args.emplace_back(param, - TensorShape{2, 20, i, i+1}, - TensorShape{30, 20, 3, 4}); + args.emplace_back(param, TensorShape{2, 20, i, i + 1}, + TensorShape{30, 20, 3, 4}); } return args; @@ -118,14 +111,12 @@ std::vector convolution::get_args_1x1() { param::Convolution param; param.mode = param::Convolution::Mode::CONVOLUTION; - args.emplace_back(param, - TensorShape{2, 20, i, i+1}, - TensorShape{30, 20, 1, 1}); + args.emplace_back(param, TensorShape{2, 20, i, i + 1}, + TensorShape{30, 20, 1, 1}); param.mode = param::Convolution::Mode::CROSS_CORRELATION; - args.emplace_back(param, - TensorShape{2, 20, i, i+1}, - TensorShape{30, 20, 1, 1}); + args.emplace_back(param, TensorShape{2, 20, i, i + 1}, + TensorShape{30, 20, 1, 1}); } return args; @@ -137,14 +128,12 @@ std::vector convolution::get_args_large_filter() { param::Convolution param; param.mode = param::Convolution::Mode::CONVOLUTION; - args.emplace_back(param, - TensorShape{2, 2, i, i+1}, - TensorShape{3, 2, 7, 8}); + args.emplace_back(param, TensorShape{2, 2, i, i + 1}, + TensorShape{3, 2, 7, 8}); param.mode = param::Convolution::Mode::CROSS_CORRELATION; - args.emplace_back(param, - TensorShape{2, 2, i, i+1}, - TensorShape{3, 2, 7, 8}); + args.emplace_back(param, TensorShape{2, 2, i, i + 1}, + TensorShape{3, 2, 7, 8}); } return args; @@ -181,9 +170,8 @@ std::vector convolution::get_args_4x4() { for (size_t oh = 1; oh < 20; ++oh) { param::Convolution param; param.mode = param::Convolution::Mode::CROSS_CORRELATION; - args.emplace_back(param, - TensorShape{4, 3, oh+3, oh+4}, - TensorShape{2, 3, 4, 4}); + args.emplace_back(param, TensorShape{4, 3, oh + 3, oh + 4}, + TensorShape{2, 3, 4, 4}); } return args; @@ -289,26 +277,22 @@ std::vector convolution::get_args_fallback_non_templated_impl() { std::vector convolution::get_args_cudnn_5_1_failures() { std::vector args; 
args.emplace_back( - param::Convolution{ - param::Convolution::Mode::CROSS_CORRELATION, 0, 4, 1, 2}, - TensorShape{5, 3, 25, 20}, - TensorShape{10, 3, 7, 4} - ); + param::Convolution{param::Convolution::Mode::CROSS_CORRELATION, 0, + 4, 1, 2}, + TensorShape{5, 3, 25, 20}, TensorShape{10, 3, 7, 4}); return args; } std::vector convolution::get_args_x86_winograd_algorithm() { std::vector args; - for (size_t ic_size: {8, 16}) - { + for (size_t ic_size : {8, 16}) { param::Convolution param; param.mode = param::Convolution::Mode::CROSS_CORRELATION; param.stride_h = param.stride_w = 1; param.pad_h = param.pad_w = 0; - args.emplace_back(param, - TensorShape{2, ic_size, 102, 102}, - TensorShape{8, ic_size, 3, 3}); + args.emplace_back(param, TensorShape{2, ic_size, 102, 102}, + TensorShape{8, ic_size, 3, 3}); } return args; @@ -317,18 +301,15 @@ std::vector convolution::get_args_x86_winograd_algorithm() { std::vector convolution::get_args_BRAIN_481() { std::vector args; { - param::Convolution param{param::Convolution::Mode::CROSS_CORRELATION, - 0, 1, 1, 2}; - args.emplace_back(param, - TensorShape{4, 4, 14, 13}, - TensorShape{3, 4, 8, 13}); - for (size_t margin = 0; margin < 5; ++margin) - { - param::Convolution param{param::Convolution::Mode::CROSS_CORRELATION, - 1, 1, 2, 2}; - args.emplace_back(param, - TensorShape{4, 4, 14, 13}, - TensorShape{3, 4, 16-margin, 15-margin}); + param::Convolution param{param::Convolution::Mode::CROSS_CORRELATION, 0, + 1, 1, 2}; + args.emplace_back(param, TensorShape{4, 4, 14, 13}, + TensorShape{3, 4, 8, 13}); + for (size_t margin = 0; margin < 5; ++margin) { + param::Convolution param{ + param::Convolution::Mode::CROSS_CORRELATION, 1, 1, 2, 2}; + args.emplace_back(param, TensorShape{4, 4, 14, 13}, + TensorShape{3, 4, 16 - margin, 15 - margin}); } } @@ -337,7 +318,7 @@ std::vector convolution::get_args_BRAIN_481() { std::vector convolution::get_args() { std::vector all_args, args; -#define ADD_ARGS(NAME) \ +#define ADD_ARGS(NAME) \ args = 
get_args_##NAME(); \ all_args.insert(all_args.end(), args.begin(), args.end()); ADD_ARGS(common) @@ -356,12 +337,12 @@ std::vector convolution::get_args() { ADD_ARGS(BRAIN_481) #undef ADD_ARGS - return all_args; + return all_args; } std::vector convolution::get_args_cuda_conv_bwd_data() { std::vector all_args, args; -#define ADD_ARGS(NAME) \ +#define ADD_ARGS(NAME) \ args = get_args_##NAME(); \ all_args.insert(all_args.end(), args.begin(), args.end()); ADD_ARGS(common) @@ -378,19 +359,19 @@ std::vector convolution::get_args_cuda_conv_bwd_data() { ADD_ARGS(x86_winograd_algorithm) #undef ADD_ARGS - return all_args; + return all_args; } std::vector convolution::get_args_cudnn_7_5_failures() { std::vector all_args, args; -#define ADD_ARGS(NAME) \ +#define ADD_ARGS(NAME) \ args = get_args_##NAME(); \ all_args.insert(all_args.end(), args.begin(), args.end()); ADD_ARGS(cudnn_5_1_failures) ADD_ARGS(BRAIN_481) #undef ADD_ARGS - return all_args; + return all_args; } std::vector convolution::get_chanwise_args() { std::vector args; @@ -421,12 +402,9 @@ std::vector convolution::get_dilated_args() { param::Convolution param; param.pad_h = param.pad_w = 2; param.dilate_h = param.dilate_w = 2; - size_t n = 1, ic = 15, ih = 128, iw = 128, - fh = 3, fw = 3, - oc = 17; - args.emplace_back(param, - TensorShape{n, ic, ih, iw}, - TensorShape{oc, ic, fh, fw}); + size_t n = 1, ic = 15, ih = 128, iw = 128, fh = 3, fw = 3, oc = 17; + args.emplace_back(param, TensorShape{n, ic, ih, iw}, + TensorShape{oc, ic, fh, fw}); // exhaustive search // clang-format off for (size_t n: {2}) @@ -451,13 +429,44 @@ std::vector convolution::get_dilated_args() { return args; } -void convolution::test_conv_config_combinations(int k_size, - Handle* handle, bool test_int8, - bool test_backward, - bool is_cuda, - ConvEPSGetter eps_getter, - bool use_io16xc32) { - Checker checker(handle); +std::vector convolution::get_args_int8_nchw4_conv_bwd_data() { + std::vector args; + param::Convolution cur_param; + + // 
clang-format off + for (auto mode : {param::ConvBias::Mode::CROSS_CORRELATION}) { + for (size_t b : {64, 16}) { + for (size_t ic : {16, 32}) { + for (size_t oc : {16, 32}) { + for (size_t h : {8}) { + for (size_t w : {8, 11}) { + for (size_t kernel_size : {3, 4, 5, 7}) { + for (int p : {0, static_cast(kernel_size / 2)}) { + for (size_t s : {2}) { + if (kernel_size >= 7) { + b = std::min(b, 32_z); + } + size_t f = kernel_size; + cur_param.mode = mode; + + cur_param.format = param::ConvBias::Format::NCHW4; + cur_param.sparse = param::ConvBias::Sparse::DENSE; + cur_param.pad_h = cur_param.pad_w = p; + cur_param.stride_h = cur_param.stride_w = s; + + //! bias channel + args.emplace_back(cur_param, TensorShape{b, ic / 4, h, w, 4}, + TensorShape{oc, ic / 4, f, f, 4}); + } } } } } } } } } + // clang-format on + + return args; +} + +void convolution::test_conv_config_combinations( + int k_size, Handle* handle, bool test_int8, bool test_backward, + bool is_cuda, ConvEPSGetter eps_getter, bool use_io16xc32) { +Checker checker(handle); std::unique_ptr> checker_bwd_data_ptr; std::unique_ptr> checker_bwd_filter_ptr; if (test_backward) { @@ -657,7 +666,6 @@ void convolution::test_conv_config_combinations(int k_size, } } } - } // vim: syntax=cpp.doxygen diff --git a/dnn/test/common/convolution.h b/dnn/test/common/convolution.h index 2d9a338d6..99b3a8f97 100644 --- a/dnn/test/common/convolution.h +++ b/dnn/test/common/convolution.h @@ -6,7 +6,8 @@ * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. */ #pragma once @@ -47,6 +48,7 @@ std::vector get_args_cudnn_7_5_failures(); std::vector get_1x1_args(); std::vector get_dilated_args(); std::vector get_chanwise_args(); +std::vector get_args_int8_nchw4_conv_bwd_data(); //! 
\param stage 0 for fwd, 1 for bwd data, 2 for bwd filter using ConvEPSGetter = diff --git a/dnn/test/cuda/convolution.cpp b/dnn/test/cuda/convolution.cpp index 8dd7f4b8c..5d6a2a6f9 100644 --- a/dnn/test/cuda/convolution.cpp +++ b/dnn/test/cuda/convolution.cpp @@ -6,7 +6,8 @@ * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. */ #include "megdnn/dtype.h" #include "megdnn/oprs.h" @@ -18,7 +19,6 @@ #include "test/common/convolution.h" #include "test/common/rng.h" #include "test/cuda/benchmark.h" - #include "src/cuda/utils.h" #define V1(x) #x @@ -29,8 +29,7 @@ namespace megdnn { namespace test { -TEST_F(CUDA, CONVOLUTION_8X8X32) -{ +TEST_F(CUDA, CONVOLUTION_8X8X32) { if (!cuda::is_compute_capability_required(6, 1)) { printf("Skip CUDA.CONVOLUTION_8X8X32 test as current device" "doesn't support\n"); @@ -41,66 +40,63 @@ TEST_F(CUDA, CONVOLUTION_8X8X32) std::vector args; { auto v = get_args(); - for (auto &&a: v) { + for (auto&& a : v) { args.push_back(std::move(a)); } } { auto v = get_dilated_args(); - for (auto &&a: v) { + for (auto&& a : v) { args.push_back(std::move(a)); } } { auto v = get_chanwise_args(); - for (auto &&a: v) { + for (auto&& a : v) { args.push_back(std::move(a)); } } Checker checker(handle_cuda()); UniformIntRNG rng(-4, 4); - for (auto arg: args) { + for (auto arg : args) { arg.param.format = param::Convolution::Format::NHWC; arg.src = cvt_src_or_dst_nchw2nhwc(arg.src); arg.filter = cvt_filter_nchw2nhwc(arg.filter); - checker.set_dtype(0, dtype::Int8()). - set_dtype(1, dtype::Int8()). - set_dtype(2, dtype::Int32()). - set_param(arg.param). - set_rng(0, &rng). - set_rng(1, &rng). 
- execs({arg.src, arg.filter, {}}); + checker.set_dtype(0, dtype::Int8()) + .set_dtype(1, dtype::Int8()) + .set_dtype(2, dtype::Int32()) + .set_param(arg.param) + .set_rng(0, &rng) + .set_rng(1, &rng) + .execs({arg.src, arg.filter, {}}); } } -TEST_F(CUDA, CONVOLUTION_FORWARD) -{ +TEST_F(CUDA, CONVOLUTION_FORWARD) { using namespace convolution; std::vector args = get_args(); Checker checker(handle_cuda()); NormalRNG default_rng; - for (auto &&arg: args) { + for (auto&& arg : args) { float scale = 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]); UniformFloatRNG rng(scale, 2 * scale); - checker. - set_dtype(0, dtype::Float32()). - set_dtype(1, dtype::Float32()). - set_dtype(2, dtype::Float32()). - set_rng(0, &default_rng). - set_rng(1, &default_rng). - set_epsilon(1e-3). - set_param(arg.param). - execs({arg.src, arg.filter, {}}); - checker. - set_dtype(0, dtype::Float16()). - set_dtype(1, dtype::Float16()). - set_dtype(2, dtype::Float16()). - set_rng(0, &rng). - set_rng(1, &rng). - set_epsilon(1e-1). - set_param(arg.param). 
- execs({arg.src, arg.filter, {}}); + checker.set_dtype(0, dtype::Float32()) + .set_dtype(1, dtype::Float32()) + .set_dtype(2, dtype::Float32()) + .set_rng(0, &default_rng) + .set_rng(1, &default_rng) + .set_epsilon(1e-3) + .set_param(arg.param) + .execs({arg.src, arg.filter, {}}); + checker.set_dtype(0, dtype::Float16()) + .set_dtype(1, dtype::Float16()) + .set_dtype(2, dtype::Float16()) + .set_rng(0, &rng) + .set_rng(1, &rng) + .set_epsilon(1e-1) + .set_param(arg.param) + .execs({arg.src, arg.filter, {}}); arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32; checker.set_dtype(0, dtype::Float16()) .set_dtype(1, dtype::Float16()) @@ -152,51 +148,49 @@ TEST_F(CUDA, CONV_FORWARD_MATMUL_NCHW4) { checker.exec({{8, 64, 12, 12, 4}, {256, 64, 3, 3, 4}, {}}); } -TEST_F(CUDA, CONVOLUTION_1X1_FORWARD) -{ +TEST_F(CUDA, CONVOLUTION_1X1_FORWARD) { using namespace convolution; std::vector args = get_1x1_args(); Checker checker(handle_cuda()); NormalRNG default_rng; - for (auto &&arg: args) { - float scale = 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]); + for (auto&& arg : args) { + float scale = + 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]); UniformFloatRNG rng(scale, 2 * scale); - checker. - set_dtype(0, dtype::Float32()). - set_dtype(1, dtype::Float32()). - set_rng(0, &default_rng). - set_rng(1, &default_rng). - set_epsilon(1e-3). - set_param(arg.param). 
- execs({arg.src, arg.filter, {}}); + checker.set_dtype(0, dtype::Float32()) + .set_dtype(1, dtype::Float32()) + .set_rng(0, &default_rng) + .set_rng(1, &default_rng) + .set_epsilon(1e-3) + .set_param(arg.param) + .execs({arg.src, arg.filter, {}}); } } -TEST_F(CUDA, BENCHMARK_CONVOLUTION_1X1_FORWARD) -{ +TEST_F(CUDA, BENCHMARK_CONVOLUTION_1X1_FORWARD) { using namespace convolution; std::vector args = get_1x1_args(); Benchmarker marker(handle_cuda()); NormalRNG default_rng; - for (auto &&arg: args) { - float scale = 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]); + for (auto&& arg : args) { + float scale = + 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]); UniformFloatRNG rng(scale, 2 * scale); - marker.set_dtype(0, dtype::Float32()). - set_dtype(1, dtype::Float32()). - set_rng(0, &default_rng). - set_rng(1, &default_rng). - set_param(arg.param). - execs({arg.src, arg.filter, {}}); + marker.set_dtype(0, dtype::Float32()) + .set_dtype(1, dtype::Float32()) + .set_rng(0, &default_rng) + .set_rng(1, &default_rng) + .set_param(arg.param) + .execs({arg.src, arg.filter, {}}); } } -TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA) -{ +TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA) { using namespace convolution; std::vector args = get_args_cuda_conv_bwd_data(); Checker checker(handle_cuda()); NormalRNG default_rng; - for (auto &&arg: args) { + for (auto&& arg : args) { float scale = 64.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]); UniformFloatRNG rng(scale, 2 * scale); @@ -243,8 +237,7 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA) } } -TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_MATMUL) -{ +TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_MATMUL) { using namespace convolution; std::vector args = get_args_cuda_conv_bwd_data(); Checker checker(handle_cuda()); @@ -252,7 +245,7 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_MATMUL) checker.set_before_exec_callback(AlgoChecker( ExecutionPolicyAlgoName{"MATMUL", {{"CUBLAS", {}}}})); NormalRNG default_rng; - for (auto &&arg: args) { + 
for (auto&& arg : args) { float scale = 64.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]); UniformFloatRNG rng(scale, 2 * scale); @@ -273,9 +266,39 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_MATMUL) } } +TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_INT8_DP4A) { + if (!cuda::is_compute_capability_required(6, 1)) { + printf("Skip CUDA.CONVOLUTION_BACKWARD_DATA_INT8_DP4A test as current " + "device doesn't support\n"); + return; + } + + using namespace convolution; + std::vector args = get_args_int8_nchw4_conv_bwd_data(); + Checker checker(handle_cuda()); + + checker.set_before_exec_callback(AlgoChecker( + "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM")); + + checker.set_epsilon(1 + 1e-3).set_max_avg_error(1e-1); -TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_FAILED_CUDNN7_5) -{ + for (auto&& arg : args) { + UniformIntRNG rng(-3, 3); + auto src = TensorLayout(arg.src, dtype::QuantizedS8{1.2f}); + auto filter = TensorLayout(arg.filter, dtype::QuantizedS8{1.3f}); + TensorLayout dst; + dst.dtype = dtype::QuantizedS8{1.2f}; + { + auto opr = handle_cuda()->create_operator(); + opr->param() = arg.param; + opr->deduce_layout(src, filter, dst); + } + checker.set_rng(0, &rng).set_rng(1, &rng).set_param(arg.param).exec( + TensorLayoutArray{filter, dst, src}); + } +} + +TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_FAILED_CUDNN7_5) { // BRAIN-481 failed on architectures 7.0, remove the following if statement, // when cudnn fixed the problem. 
if (cuda::is_compute_capability_required(7, 0)) @@ -284,8 +307,9 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_FAILED_CUDNN7_5) std::vector args = get_args_cudnn_7_5_failures(); Checker checker(handle_cuda()); NormalRNG default_rng; - for (auto &&arg: args) { - float scale = 128.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]); + for (auto&& arg : args) { + float scale = + 128.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]); scale = std::max(scale, 1.f); UniformFloatRNG rng(scale, 2 * scale); auto src = TensorLayout(arg.src, dtype::Float32()); @@ -297,19 +321,17 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_FAILED_CUDNN7_5) opr->deduce_layout(src, filter, dst); } src.dtype = dst.dtype = filter.dtype = dtype::Float32(); - checker. - set_rng(0, &default_rng). - set_rng(1, &default_rng). - set_epsilon(1e-3). - set_param(arg.param). - exec(TensorLayoutArray{filter, dst, src}); + checker.set_rng(0, &default_rng) + .set_rng(1, &default_rng) + .set_epsilon(1e-3) + .set_param(arg.param) + .exec(TensorLayoutArray{filter, dst, src}); src.dtype = dst.dtype = filter.dtype = dtype::Float16(); - checker. - set_rng(0, &rng). - set_rng(1, &rng). - set_epsilon(1e-1). - set_param(arg.param). 
- exec(TensorLayoutArray{filter, dst, src}); + checker.set_rng(0, &rng) + .set_rng(1, &rng) + .set_epsilon(1e-1) + .set_param(arg.param) + .exec(TensorLayoutArray{filter, dst, src}); arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32; checker.set_rng(0, &rng) .set_rng(1, &rng) @@ -319,13 +341,12 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_FAILED_CUDNN7_5) } } -TEST_F(CUDA, CONVOLUTION_BACKWARD_FILTER) -{ +TEST_F(CUDA, CONVOLUTION_BACKWARD_FILTER) { using namespace convolution; std::vector args = get_args(); Checker checker(handle_cuda()); bool f16_checked = false; - for (auto &&arg: args) { + for (auto&& arg : args) { auto src = TensorLayout(arg.src, dtype::Float32()); auto filter = TensorLayout(arg.filter, dtype::Float32()); TensorLayout dst; @@ -337,12 +358,11 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_FILTER) float scale = 1.0f / sqrt(dst[2] * dst[3]); UniformFloatRNG rng(scale, 2 * scale); src.dtype = dst.dtype = filter.dtype = dtype::Float32(); - checker. - set_rng(0, &rng). - set_rng(1, &rng). - set_epsilon(1e-3). - set_param(arg.param). - exec(TensorLayoutArray{src, dst, filter}); + checker.set_rng(0, &rng) + .set_rng(1, &rng) + .set_epsilon(1e-3) + .set_param(arg.param) + .exec(TensorLayoutArray{src, dst, filter}); // reduce on large f16 array may introduce significant error if (dst.total_nr_elems() >= 1000 && f16_checked) @@ -350,12 +370,11 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_FILTER) f16_checked = true; src.dtype = dst.dtype = filter.dtype = dtype::Float16(); - checker. - set_rng(0, &rng). - set_rng(1, &rng). - set_epsilon(1e-1). - set_param(arg.param). 
- exec(TensorLayoutArray{src, dst, filter}); + checker.set_rng(0, &rng) + .set_rng(1, &rng) + .set_epsilon(1e-1) + .set_param(arg.param) + .exec(TensorLayoutArray{src, dst, filter}); arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32; checker.set_rng(0, &rng) .set_rng(1, &rng) @@ -377,14 +396,13 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_FILTER) } } -TEST_F(CUDA, CONVOLUTION_BACKWARD_FILTER_MATMUL) -{ +TEST_F(CUDA, CONVOLUTION_BACKWARD_FILTER_MATMUL) { using namespace convolution; std::vector args = get_args(); Checker checker(handle_cuda()); checker.set_before_exec_callback(AlgoChecker( ExecutionPolicyAlgoName{"MATMUL", {{"CUBLAS", {}}}})); - for (auto &&arg: args) { + for (auto&& arg : args) { auto src = TensorLayout(arg.src, dtype::Float32()); auto filter = TensorLayout(arg.filter, dtype::Float32()); TensorLayout dst; @@ -396,17 +414,16 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_FILTER_MATMUL) float scale = 1.0f / sqrt(dst[2] * dst[3]); UniformFloatRNG rng(scale, 2 * scale); src.dtype = dst.dtype = filter.dtype = dtype::Float32(); - checker. - set_rng(0, &rng). - set_rng(1, &rng). - set_epsilon(1e-3). - set_param(arg.param). - exec(TensorLayoutArray{src, dst, filter}); + checker.set_rng(0, &rng) + .set_rng(1, &rng) + .set_epsilon(1e-3) + .set_param(arg.param) + .exec(TensorLayoutArray{src, dst, filter}); } } TEST_F(CUDA, CONV_CONFIG_COMBINATIONS) { - auto eps_getter = [](bool f16, int stage, const char *name) -> float { + auto eps_getter = [](bool f16, int stage, const char* name) -> float { if (f16) { return stage == 2 ? 
0.5 : 0.2; } @@ -687,6 +704,46 @@ TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_BF16) { run(32, 64, 64, 56, 56, 1, 1, 0); } +TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_INT8_DP4A) { /* Benchmarks NCHW4 QuantizedS8 backward-data convolution on four shapes; prints execl time / RUNS and the throughput derived from it. */ + CUBenchmarker bench{handle_cuda()}; + std::unique_ptr> proxy{ + new OprProxy{true}}; + size_t RUNS = 10; + bench.set_proxy(proxy).set_times(RUNS); + + auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, + size_t FH, size_t SH, size_t PH) { + bench.set_dtype(0, dtype::QuantizedS8{1.0f}) + .set_dtype(1, dtype::QuantizedS8{1.0f}) + .set_dtype(2, dtype::QuantizedS8{1.0f}); + param::Convolution param; + param.format = param::Convolution::Format::NCHW4; + param.stride_h = param.stride_w = SH; + param.pad_h = param.pad_w = PH; + param.compute_mode = param::Convolution::ComputeMode::DEFAULT; + bench.set_param(param); + bench.proxy()->target_execution_policy = {}; + TensorLayout src{{N, IC / 4, IH, IW, 4}, dtype::QuantizedS8{1.0f}}, + filter{{OC, IC / 4, FH, FH, 4}, dtype::QuantizedS8{1.0f}}; + TensorLayout dst; + dst.dtype = dtype::QuantizedS8{1.0f}; + { + auto&& opr = handle_cuda()->create_operator(); + opr->param() = param; + opr->deduce_layout(src, filter, dst); + } + auto used = bench.execl({filter, dst, src}) / RUNS; + float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH; + printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(), + filter.to_string().c_str(), dst.to_string().c_str()); + printf("time_int8=%.2fms, flops=%.3fTFLOPS\n", used, + (flo / (used * 1e9))); + }; + run(64, 32, 32, 92, 180, 4, 2, 2); + run(64, 32, 32, 46, 80, 4, 2, 2); + run(16, 16, 16, 92, 180, 4, 2, 2); + run(16, 16, 16, 46, 80, 4, 2, 2); +} TEST_F(CUDA, CONVOLUTION_BWD_FILTER_BENCHMARK) { CUBenchmarker bench{handle_cuda()}; diff --git a/src/opr/test/dnn/convolution.cpp b/src/opr/test/dnn/convolution.cpp index 9e779f271..899726049 100644 --- a/src/opr/test/dnn/convolution.cpp +++ b/src/opr/test/dnn/convolution.cpp @@ -598,6 +598,51 @@ TEST(TestOprDNN, Deconvolution) { run({TensorShape{4, 6, 
7, 2}, {2, 3, 4, 8, 13}}, opt); } +TEST(TestOprDNN, DeconvolutionExePolicy_QuantizedS8) { /* Smoke test: builds and runs an NCHW4 QuantizedS8 deconvolution once per execution-policy strategy; the result is copied to host but not checked. */ + REQUIRE_GPU(1); + auto cn = CompNode::load("gpu0"); + cn.activate(); + REQUIRE_CUDA_COMPUTE_CAPABILITY(6, 1); + + Param param; + using Policy = opr::ConvolutionBackwardData::ExecutionPolicy; + using S = Policy::Strategy; + +#if MGB_ENABLE_FASTRUN + for (auto strategy : {S::PROFILE, S::HEURISTIC, S::PROFILE_REPRODUCIBLE, + S::PROFILE_HEURISTIC}) { +#else + for (auto strategy : {S::HEURISTIC, S::PROFILE_HEURISTIC}) { +#endif + auto graph = ComputingGraph::make(); + HostTensorGenerator<> gen; + + auto mkvar = [&](const char* name, const TensorShape& shp, + const DType& dtype) { + return opr::TypeCvt::make( + opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name), + dtype); + }; + + auto x = mkvar("x", {16, 4, 50, 50, 4}, dtype::QuantizedS8(1.2f)); + auto w = mkvar("w", {16, 4, 4, 4, 4}, dtype::QuantizedS8(1.3f)); + + param.format = Param::Format::NCHW4; + param.pad_h = param.pad_w = 2; + param.stride_h = param.stride_w = 2; + + Policy policy; + policy.strategy = strategy; + + auto deconv = opr::ConvolutionBackwardData::make_deconv( + x, w, param, policy, + OperatorNodeConfig{dtype::QuantizedS8(1.2f)}); + HostTensorND host_y; + auto func = graph->compile({make_callback_copy(deconv, host_y)}); + func->execute(); + } +} + TEST(TestOprDNN, ConvolutionBackwardFilter) { using Checker = AutoOprChecker<3, 1>; -- GitLab