diff --git a/CMakeLists.txt b/CMakeLists.txt index a2cb1e3433..2712bd148c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -219,6 +219,12 @@ if( DEFINED MIOPEN_OVERRIDE_HIP_VERSION_PATCH ) message(STATUS "MIOPEN_hip_VERSION_PATCH overriden with ${MIOPEN_OVERRIDE_HIP_VERSION_PATCH}") endif() +# Depend on Composable Kernels +option(MIOPEN_USE_COMPOSABLEKERNEL "Enable MIOpen to use composable kernels for various operations" Off) +if(MIOPEN_USE_COMPOSABLEKERNEL) +find_package(composable_kernel 1.0.0 COMPONENTS device_operations host_tensor) +endif() + set_var_to_condition(MIOPEN_USE_COMGR_DEFAULT (NOT DEFINED MIOPEN_BACKEND_OPENCL) AND (NOT (MIOPEN_BACKEND STREQUAL "HIPNOGPU"))) option(MIOPEN_USE_COMGR "Use comgr to build kernels instead of offline tools" ${MIOPEN_USE_COMGR_DEFAULT}) diff --git a/Dockerfile b/Dockerfile index cedc92d7dc..057d0e5ac3 100755 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM ubuntu:18.04 +FROM ubuntu:18.04 as miopen ARG USE_MLIR="OFF" @@ -8,7 +8,8 @@ RUN dpkg --add-architecture i386 # Add rocm repository # Note: The ROCm version with $USE_MLIR should keep in sync with default ROCm version # unless MLIR library is incompatible with current ROCm. 
- +RUN apt-get update +RUN apt-get install -y wget gnupg RUN if [ "$USE_MLIR" = "ON" ] ; \ then export ROCM_APT_VER=.apt_5.1;\ else \ @@ -17,6 +18,8 @@ RUN if [ "$USE_MLIR" = "ON" ] ; \ echo $ROCM_APT_VER &&\ sh -c 'echo deb [arch=amd64 trusted=yes] http://repo.radeon.com/rocm/apt/$ROCM_APT_VER/ ubuntu main > /etc/apt/sources.list.d/rocm.list' RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu bionic main universe | tee -a /etc/apt/sources.list" +RUN wget --no-check-certificate -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add - +RUN sh -c "echo deb https://apt.kitware.com/ubuntu/ bionic main | tee -a /etc/apt/sources.list" #Add gpg keys # Install dependencies @@ -33,7 +36,8 @@ wget -q -O - https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \ apt-get update && \ DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ build-essential \ - cmake \ + cmake-data=3.15.1-0kitware1 \ + cmake=3.15.1-0kitware1 \ comgr \ clang-format-10 \ doxygen \ @@ -128,4 +132,11 @@ RUN if [ "$USE_TARGETID" = "ON" ] ; then export HIPCC_LINK_FLAGS_APPEND='-O3 -pa ARG MIOTENSILE_VER="default" RUN if [ "$USE_TARGETID" = "OFF" ] ; then echo "MIOpenTensile is not installed."; elif [ "$MIOTENSILE_VER" = "latest" ] ; then cget -p $PREFIX install ROCmSoftwarePlatform/MIOpenTensile@94a9047741d16a8eccd290131b78fb1aa69cdcdf; else cget -p $PREFIX install ROCmSoftwarePlatform/MIOpenTensile@94a9047741d16a8eccd290131b78fb1aa69cdcdf; fi +ARG CK_COMMIT=91d8b7d67ae9dbf8a6e691ea3e17c0b9705c6ba7 +RUN wget -O ck.tar.gz https://www.github.com/rocmsoftwareplatform/composable_kernel/archive/${CK_COMMIT}.tar.gz && \ + tar zxvf ck.tar.gz &&\ + cd composable_kernel-${CK_COMMIT} && \ + mkdir build && cd build && \ + CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_CXX_COMPILER_LAUNCHER="${COMPILER_LAUNCHER}" -DCMAKE_PREFIX_PATH=/opt/rocm -D CMAKE_CXX_FLAGS=" --offload-arch=gfx900 --offload-arch=gfx906 --offload-arch=gfx908 --offload-arch=gfx90a 
--offload-arch=gfx1030 -O3 " .. && \ + make -j $(nproc) install RUN groupadd -f render diff --git a/Jenkinsfile b/Jenkinsfile index 33c7359e84..c601e3207a 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -301,6 +301,10 @@ def buildDocker(install_prefix) echo "Checking for image: ${image_name}" sh "docker manifest inspect --insecure ${image_name}" echo "Image: ${image_name} found!! Skipping building image" + if(params.DEBUG_FORCE_DOCKER_BUILD) + { + throw new Exception("Docker build override via DEBUG_FORCE_DOCKER_BUILD") + } } catch(Exception ex) { @@ -323,7 +327,7 @@ def buildDocker(install_prefix) /// * The default compiler is usually not specified. /// BuildType := { Release* | Debug | Install } [ BuildTypeModifier ] /// * BuildTypeModifier := { NOCOMGR | Embedded | Static | Normal-Find | Fast-Find -/// MLIR | Tensile | Tensile-Latest | Package | ... } +/// CK | MLIR | Tensile | Tensile-Latest | Package | ... } /// TestSet := { All | Smoke* } [ Codecov ] /// * "All" corresponds to "cmake -DMIOPEN_TEST_ALL=On". /// * "Smoke" (-DMIOPEN_TEST_ALL=Off) is the default and usually not specified. 
@@ -757,6 +761,20 @@ pipeline { buildHipClangJobAndReboot(compiler: 'g++', setup_flags: Int8_flags, config_targets: Smoke_targets) } } + stage('Int8 Hip Debug CK gfx908') { + when { + beforeAgent true + expression { params.TARGET_GFX908} + } + agent{ label rocmnode("gfx908") } + // This stage should be removed when CK is enabled by default in MIOpen + environment{ + Enable_CK = "-DMIOPEN_USE_COMPOSABLEKERNEL=On" + } + steps{ + buildHipClangJobAndReboot( build_type: 'debug', setup_flags: Enable_CK + Int8_flags , build_env: extra_log_env, test_flags: ' --verbose ') + } + } stage('Fp16 Hip Vega20') { when { beforeAgent true diff --git a/include/miopen/config.h.in b/include/miopen/config.h.in index 249748e940..21e7584906 100644 --- a/include/miopen/config.h.in +++ b/include/miopen/config.h.in @@ -48,6 +48,7 @@ #cmakedefine01 MIOPEN_LOG_FUNC_TIME_ENABLE #cmakedefine01 MIOPEN_ENABLE_SQLITE_BACKOFF #cmakedefine01 MIOPEN_USE_MLIR +#cmakedefine01 MIOPEN_USE_COMPOSABLEKERNEL // "_PACKAGE_" to avoid name contentions: the macros like // HIP_VERSION_MAJOR are defined in hip_version.h. 
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 748021d6a4..cfdcc92221 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -182,6 +182,7 @@ set( MIOpen_Source solver/conv_hip_implicit_gemm_fwd_v4r4_xdlops.cpp solver/conv_hip_implicit_gemm_fwd_v4r4_xdlops_padded_gemm.cpp solver/conv_hip_implicit_gemm_fwd_v4r5_xdlops.cpp + solver/conv_hip_implicit_gemm_fwd_xdlops.cpp solver/conv_hip_implicit_gemm_nonxdlops_common.cpp solver/conv_hip_implicit_gemm_wrw_v4r4.cpp solver/conv_hip_implicit_gemm_wrw_v4r4_xdlops.cpp @@ -633,9 +634,14 @@ target_include_directories(MIOpen PUBLIC $ ) +set(MIOPEN_CK_LINK_FLAGS) +if(MIOPEN_USE_COMPOSABLEKERNEL) +set(MIOPEN_CK_LINK_FLAGS composable_kernel::device_operations composable_kernel::host_tensor hip::host) +endif() + target_include_directories(MIOpen SYSTEM PUBLIC $) target_include_directories(MIOpen SYSTEM PRIVATE ${BZIP2_INCLUDE_DIR}) -target_link_libraries(MIOpen PRIVATE ${CMAKE_THREAD_LIBS_INIT} ${BZIP2_LIBRARIES}) +target_link_libraries(MIOpen PRIVATE ${CMAKE_THREAD_LIBS_INIT} ${BZIP2_LIBRARIES} ${MIOPEN_CK_LINK_FLAGS}) generate_export_header(MIOpen EXPORT_FILE_NAME ${PROJECT_BINARY_DIR}/include/miopen/export.h ) diff --git a/src/convolution.cpp b/src/convolution.cpp index 7c9b94df1c..50c1a685a4 100644 --- a/src/convolution.cpp +++ b/src/convolution.cpp @@ -265,8 +265,9 @@ ConvolutionDescriptor::GetForwardOutputTensorWithLayout(const TensorDescriptor& tensor_layout_to_strides( out_lens, default_layout, yLayout, xDesc.GetVectorLength(), out_strides); return {(xDesc.GetType() == miopenInt8 || xDesc.GetType() == miopenInt8x4 - ? (yType == miopenInt32 ? yType : miopenFloat) - : xDesc.GetType()), + ? (yType) + : xDesc.GetType()), // TODO: This function overrides the output type with + // essentially the input which is incorrect. 
xDesc.GetLayout_t(), out_lens, out_strides}; diff --git a/src/include/miopen/solver.hpp b/src/include/miopen/solver.hpp index 683d8670a9..975bf1bafe 100644 --- a/src/include/miopen/solver.hpp +++ b/src/include/miopen/solver.hpp @@ -4108,6 +4108,69 @@ struct ConvAsmImplicitGemmGTCDynamicFwdDlopsNCHWC final const PerformanceConfigAsmImplicitGemmGTCFwdDlopsNCHWC& config) const override; }; +struct PerformanceConfigHipImplicitGemmFwdXdlops + : PerfConfigBase +{ + int index; + std::string kernel_id; + int total_size; + PerformanceConfigHipImplicitGemmFwdXdlops(int idx, std::string kernl_id) + : index(idx), kernel_id(kernl_id), total_size(-1) + { + } + PerformanceConfigHipImplicitGemmFwdXdlops() : PerformanceConfigHipImplicitGemmFwdXdlops(0, "") + { + } + PerformanceConfigHipImplicitGemmFwdXdlops(bool) + : PerformanceConfigHipImplicitGemmFwdXdlops(0, "") + { + } + void HeuristicInit(const ConvolutionContext& ctx); + bool SetNextValue(const ConvolutionContext& ctx); + bool IsValidValue() const; + bool IsValid(const ConvolutionContext& ctx) const; + template + static void Visit(Self&& s, F f) + { + f(s.kernel_id, "kernel_id"); + } + bool operator==(const PerformanceConfigHipImplicitGemmFwdXdlops& other) const; +}; + +struct ConvHipImplicitGemmFwdXdlops final + : ConvTunableSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + PerformanceConfigHipImplicitGemmFwdXdlops + GetDefaultPerformanceConfig(const ConvolutionContext&) const override; + bool IsValidPerformanceConfig(const ConvolutionContext&, + const PerformanceConfigHipImplicitGemmFwdXdlops&) const override; + PerformanceConfigHipImplicitGemmFwdXdlops + Search(const ConvolutionContext&, const AnyInvokeParams& invoke_ctx) const override; + size_t GetWorkspaceSize(const ConvolutionContext& ctx) const override; + bool MayNeedWorkspace() const override { return false; } + bool IsApplicable(const ConvolutionContext& ctx) const override; + bool IsDynamic() const override { return 
true; } + ConvSolution + GetSolution(const ConvolutionContext& ctx, + const PerformanceConfigHipImplicitGemmFwdXdlops& config) const override; + // Magic Number Alert: + // Naive convolutions have GetWti() that returns a very small value (0.01f). + // This allows MIOpen to use Naive Solvers if no other applicable Solvers + // have known WTIs. Right now this means that in case of find-db miss, + // the library will try to use Winograd or GEMM (whatever is faster according + // to their GetWti's), but if both are not applicable, the library will + // use the Naive Solver. + // Since we would like to use CK before naive, and use it instead (because + // we do expect that CK is faster than Naive), therefore we use a + // value bigger than 0.01f, e.g. 0.02f. + float GetWti(const ConvolutionContext&) const override { return 0.02f; }; +}; + struct AnySolver; } // namespace solver diff --git a/src/mlo_dir_conv.cpp b/src/mlo_dir_conv.cpp index 053d258f12..8a960d647a 100644 --- a/src/mlo_dir_conv.cpp +++ b/src/mlo_dir_conv.cpp @@ -146,6 +146,7 @@ static auto GetImplicitGemmSolvers() miopen::solver::ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC, miopen::solver::ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC, miopen::solver::ConvCkIgemmFwdV6r1DlopsNchw, + miopen::solver::ConvHipImplicitGemmFwdXdlops, miopen::solver::ConvAsmImplicitGemmGTCDynamicFwdDlopsNCHWC>{}; } diff --git a/src/ocl/convolutionocl.cpp b/src/ocl/convolutionocl.cpp index e8c2fbc6df..37db876bcd 100644 --- a/src/ocl/convolutionocl.cpp +++ b/src/ocl/convolutionocl.cpp @@ -566,10 +566,6 @@ void ValidateConvTensors(const ConvTensors& tensors) const auto trivial_tensor_types_not_matched = tensors.xDesc.GetType() != tensors.yDesc.GetType() && tensors.xDesc.GetType() != miopenInt8 && tensors.xDesc.GetType() != miopenInt8x4; - const auto int8_in8x4_tensor_not_matched = - (tensors.xDesc.GetType() == miopenInt8 && tensors.yDesc.GetType() != miopenInt32 && - tensors.yDesc.GetType() != miopenFloat) || - (tensors.xDesc.GetType() == 
miopenInt8x4 && tensors.yDesc.GetType() != miopenInt32); // if(xDesc.GetLengths()[1] != wDesc.GetLengths()[1]) { // MIOPEN_THROW(miopenStatusBadParm); @@ -578,8 +574,7 @@ void ValidateConvTensors(const ConvTensors& tensors) const auto x_tensor_invalid = tensors.xDesc.GetSize() < 3; const auto bad_parameters = invalid_buffers || tensor_sizes_not_matched || - trivial_tensor_types_not_matched || int8_in8x4_tensor_not_matched || - x_tensor_invalid; + trivial_tensor_types_not_matched || x_tensor_invalid; if(bad_parameters) MIOPEN_THROW(miopenStatusBadParm); diff --git a/src/solver.cpp b/src/solver.cpp index 468c22607c..19282d48c9 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -508,6 +508,8 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) ++id, ConvAsmImplicitGemmGTCDynamicFwdDlopsNCHWC{}, miopenConvolutionAlgoImplicitGEMM); + RegisterWithSolver( + registry, ++id, ConvHipImplicitGemmFwdXdlops{}, miopenConvolutionAlgoImplicitGEMM); // IMPORTANT: New solvers should be added to the end of the function! } diff --git a/src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp new file mode 100644 index 0000000000..1ea00f635a --- /dev/null +++ b/src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp @@ -0,0 +1,292 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2022 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include + +#include +#include +#include +#include +#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL +#include +#endif + +MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS) + +namespace miopen { +namespace solver { + +struct CKArgs +{ + CKArgs(const ConvolutionContext& ctx) + { + N = ConvolutionContextInterpreter::GetBatchN(ctx); + K = ConvolutionContextInterpreter::GetOutputChannelK(ctx); + C = ConvolutionContextInterpreter::GetInputChannelC(ctx); + input = {ConvolutionContextInterpreter::GetInputHeightHi(ctx), + ConvolutionContextInterpreter::GetInputWidthWi(ctx)}; + output = {ConvolutionContextInterpreter::GetOutputHeightHo(ctx), + ConvolutionContextInterpreter::GetOutputWidthWo(ctx)}; + filter = {ConvolutionContextInterpreter::GetFilterHeightY(ctx), + ConvolutionContextInterpreter::GetFilterWidthX(ctx)}; + strides = {ConvolutionContextInterpreter::GetAdjustedConvolutionStrideH(ctx), + ConvolutionContextInterpreter::GetAdjustedConvolutionStrideW(ctx)}; + dilation = {ConvolutionContextInterpreter::GetAdjustedConvolutionDilationH(ctx), + ConvolutionContextInterpreter::GetAdjustedConvolutionDilationW(ctx)}; + lPadding = {ConvolutionContextInterpreter::GetInputLeftPadH(ctx), + ConvolutionContextInterpreter::GetInputLeftPadW(ctx)}; + rPadding = {ConvolutionContextInterpreter::GetAdjustedInputRightPadH(ctx), + ConvolutionContextInterpreter::GetAdjustedInputRightPadW(ctx)}; + } + int N; + int K; + int C; + std::vector input; + std::vector output; + std::vector filter; + std::vector strides; + std::vector dilation; + std::vector lPadding; + std::vector rPadding; +}; + +void PerformanceConfigHipImplicitGemmFwdXdlops::HeuristicInit(const ConvolutionContext& ctx) +{ + this->index = 0; +#if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL + std::ignore = ctx; +#else + this->index = 0; + std::vector conv_ptrs; + 
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances_t(conv_ptrs); + assert(!conv_ptrs.empty()); + this->total_size = conv_ptrs.size(); + const auto args = CKArgs{ctx}; + for(auto& conv_ptr : conv_ptrs) + { + auto argument_ptr = conv_ptr.MakeArgumentPointer(nullptr, + nullptr, + nullptr, + args.N, + args.K, + args.C, + args.input, + args.filter, + args.output, + args.strides, + args.dilation, + args.lPadding, + args.rPadding); + if(conv_ptr.IsSupportedArgument(argument_ptr.get())) + { + this->kernel_id = conv_ptr.GetTypeString(); + break; + } + ++this->index; + } +#endif +} + +bool PerformanceConfigHipImplicitGemmFwdXdlops::SetNextValue(const ConvolutionContext& ctx) +{ + if(total_size == -1) + this->HeuristicInit(ctx); + assert(total_size != -1); + if((index + 1) < total_size) + { + ++index; + return true; + } + else + return false; +} + +bool PerformanceConfigHipImplicitGemmFwdXdlops::IsValidValue() const { return index < total_size; } + +bool PerformanceConfigHipImplicitGemmFwdXdlops::IsValid(const ConvolutionContext& ctx) const +{ +#if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL + std::ignore = ctx; + return false; +#else + std::vector conv_ptrs; + add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances_t(conv_ptrs); + const auto args = CKArgs{ctx}; + auto argument_ptr = conv_ptrs[this->index].MakeArgumentPointer(nullptr, + nullptr, + nullptr, + args.N, + args.K, + args.C, + args.input, + args.filter, + args.input, + args.strides, + args.dilation, + args.lPadding, + args.rPadding); + return conv_ptrs[this->index].IsSupportedArgument(argument_ptr.get()); +#endif +} + +bool PerformanceConfigHipImplicitGemmFwdXdlops::operator==( + const PerformanceConfigHipImplicitGemmFwdXdlops& other) const +{ + return this->index == other.index; +} + +PerformanceConfigHipImplicitGemmFwdXdlops +ConvHipImplicitGemmFwdXdlops::GetDefaultPerformanceConfig(const ConvolutionContext& ctx) const +{ + PerformanceConfigHipImplicitGemmFwdXdlops pp; + pp.HeuristicInit(ctx); + 
return pp; +} + +bool ConvHipImplicitGemmFwdXdlops::IsValidPerformanceConfig( + const ConvolutionContext& ctx, const PerformanceConfigHipImplicitGemmFwdXdlops& config) const +{ + return config.IsValid(ctx); +} + +PerformanceConfigHipImplicitGemmFwdXdlops +ConvHipImplicitGemmFwdXdlops::Search(const ConvolutionContext& ctx, + const AnyInvokeParams& invoke_ctx) const +{ + return GenericSearch(*this, ctx, invoke_ctx); +} + +size_t ConvHipImplicitGemmFwdXdlops::GetWorkspaceSize(const ConvolutionContext& ctx) const +{ + std::ignore = ctx; + return 0; +} + +bool ConvHipImplicitGemmFwdXdlops::IsApplicable(const ConvolutionContext& ctx) const +{ +#if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL + std::ignore = ctx; + return false; +#else + if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS{})) + return false; + if(miopen::IsEnabled(MIOPEN_DEBUG_CONVOLUTION_DETERMINISTIC{})) + return false; + if(!(ctx.conv_problem.GetInDataType() == miopenInt8 && + ctx.conv_problem.GetWeightsDataType() == miopenInt8 && + ctx.conv_problem.GetOutDataType() == miopenInt8)) + return false; + if(!ctx.direction.IsForward()) + return false; + if(!ctx.Is2d()) + return false; + if(ctx.GetStream().GetDeviceName() != "gfx908") + return false; + if(!ctx.IsLayoutNHWC()) + return false; + + const auto args = CKArgs{ctx}; + if(!std::all_of(args.strides.begin(), args.strides.end(), [&](auto x) { return x == 1; })) + return false; + + std::vector conv_ptrs; + add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances_t(conv_ptrs); + assert(!conv_ptrs.empty()); + + for(auto& conv_ptr : conv_ptrs) + { + auto argument_ptr = conv_ptr.MakeArgumentPointer(nullptr, + nullptr, + nullptr, + args.N, + args.K, + args.C, + args.input, + args.filter, + args.input, + args.strides, + args.dilation, + args.lPadding, + args.rPadding); + if(conv_ptr.IsSupportedArgument(argument_ptr.get())) + return true; + } + return false; +#endif +} + +ConvSolution ConvHipImplicitGemmFwdXdlops::GetSolution( + const 
ConvolutionContext& ctx, const PerformanceConfigHipImplicitGemmFwdXdlops& config) const +{ +#if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL + std::ignore = ctx; + std::ignore = config; + return {}; +#else + ConvSolution result; + const auto args = CKArgs{ctx}; + result.invoker_factory = [=](const std::vector& kernels) { + std::ignore = kernels; + return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) { + std::vector conv_ptrs; + add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances_t(conv_ptrs); + auto& conv_ptr = conv_ptrs.at(config.index); + const auto& data_ctx = primitive_parameters.CastTo(); + const auto& tensors = data_ctx.tensors; + auto argument_ptr = conv_ptr.MakeArgumentPointer( + const_cast( // NOLINT (cppcoreguidelines-pro-type-const-cast) + static_cast(tensors.in)), + const_cast( // NOLINT (cppcoreguidelines-pro-type-const-cast) + static_cast(tensors.w)), + static_cast(tensors.out), + args.N, + args.K, + args.C, + args.input, + args.filter, + args.input, + args.strides, + args.dilation, + args.lPadding, + args.rPadding); + auto invoker_ptr = conv_ptr.MakeInvokerPointer(); + const auto enable_profiling = handle.IsProfilingEnabled(); + + float elapsed_time = + invoker_ptr->Run(argument_ptr.get(), {handle.GetStream(), enable_profiling}); + if(enable_profiling) + { + handle.ResetKernelTime(); + handle.AccumKernelTime(elapsed_time); + } + }; + }; + return result; +#endif +} + +} // namespace solver +} // namespace miopen diff --git a/src/solver/gemm.cpp b/src/solver/gemm.cpp index 9baa9e2614..e23c05729e 100644 --- a/src/solver/gemm.cpp +++ b/src/solver/gemm.cpp @@ -87,6 +87,13 @@ bool GemmFwdBase::IsApplicable(const ExecutionContext& ctx, const auto& xDesc = problem.GetIn(); const auto& wDesc = problem.GetWeights(); const auto& yDesc = problem.GetOut(); + if(xDesc.GetType() == miopenInt8x4 || xDesc.GetType() == miopenInt8) + { + // rocBlas needs the output to be int32 always + if(yDesc.GetType() != miopenFloat && 
yDesc.GetType() != miopenInt32 && + yDesc.GetType() != miopenInt8x4) + return false; + } return problem.GetDirection() == conv::Direction::Forward && problem.IsLayoutDefault() && !(IsAnyBufferBF16(xDesc, yDesc, wDesc) && !IsBf16Supported) && !(IsAnyBufferFp16(xDesc, yDesc, wDesc) && !IsFp16Supported); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index d0a22cd25d..f1984e2af5 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -618,7 +618,6 @@ function(add_custom_test NAME) set_tests_properties(${NAME} PROPERTIES DISABLED On) endif() endfunction() - if(${CODECOV_TEST}) add_custom_test(test_conv3d_codecov COMMAND $ ${MIOPEN_TEST_FLOAT_ARG} --input 2 4 4 4 4 --weights 2 4 1 1 1 --pads_strides_dilations 0 0 0 1 1 1 1 1 1 ${MIOPEN_TEST_FLAGS_ARGS} @@ -775,6 +774,15 @@ add_custom_test(test_conv_igemm_mlir_xdlops_fwd SKIP_UNLESS_ALL HALF_ENABLED INT COMMAND ${IMPLICITGEMM_MLIR_ENV_F_XDLOPS} $ ${IMPLICITGEMM_MLIR_ARGS_F} --input 256 256 56 56 --weights 256 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --group-count 4 ) +if(MIOPEN_USE_COMPOSABLEKERNEL) +add_custom_test(test_conv_hip_igemm_xdlops SKIP_UNLESS_ALL OCL_DISABLED HALF_DISABLED FLOAT_DISABLED INT8_ENABLED GFX900_DISABLED GFX906_DISABLED GFX90A_DISABLED + COMMAND $ ${MIOPEN_TEST_FLOAT_ARG} --disable-backward-data --disable-backward-weights --verbose --input 256 128 28 28 --weights 128 128 3 3 --output_type int8 --in_layout NHWC --fil_layout NHWC --out_layout NHWC --pads_strides_dilations 1 1 1 1 1 1 + COMMAND $ ${MIOPEN_TEST_FLOAT_ARG} --disable-backward-data --disable-backward-weights --verbose --input 128 512 7 7 --weights 512 512 3 3 --output_type int8 --in_layout NHWC --fil_layout NHWC --out_layout NHWC --pads_strides_dilations 1 1 1 1 1 1 + COMMAND $ ${MIOPEN_TEST_FLOAT_ARG} --disable-backward-data --disable-backward-weights --verbose --input 128 64 56 56 --weights 64 64 1 1 --output_type int8 --in_layout NHWC --fil_layout NHWC --out_layout NHWC --pads_strides_dilations 0 0 1 1 1 1 + COMMAND $ 
${MIOPEN_TEST_FLOAT_ARG} --disable-backward-data --disable-backward-weights --verbose --input 256 256 56 56 --weights 256 64 1 1 --output_type int8 --in_layout NHWC --fil_layout NHWC --out_layout NHWC --pads_strides_dilations 0 0 1 1 1 1 +) +endif() + add_custom_test(test_conv_igemm_mlir_xdlops_bwd_wrw SKIP_UNLESS_ALL HALF_ENABLED SKIP_UNLESS_MLIR GFX900_DISABLED GFX906_DISABLED COMMAND ${IMPLICITGEMM_MLIR_ENV_B_XDLOPS} $ ${IMPLICITGEMM_MLIR_ARGS_B} --input 256 1024 14 14 --weights 2048 1024 1 1 --pads_strides_dilations 0 0 2 2 1 1 COMMAND ${IMPLICITGEMM_MLIR_ENV_B_XDLOPS} $ ${IMPLICITGEMM_MLIR_ARGS_B} --input 256 1024 14 14 --weights 2048 1024 1 1 --pads_strides_dilations 0 0 2 2 1 1 --in_layout NHWC --fil_layout NHWC --out_layout NHWC diff --git a/test/conv2d.cpp b/test/conv2d.cpp index 6a6fb4f6db..f781f1c653 100644 --- a/test/conv2d.cpp +++ b/test/conv2d.cpp @@ -59,6 +59,9 @@ struct conv2d_driver : conv_driver this->add(this->deterministic, "deterministic", this->generate_data({false})); this->add(this->tensor_vect, "tensor_vect", this->generate_data({0})); this->add(this->vector_length, "vector_length", this->generate_data({1})); + // Only valid for int8 input and weights + this->add(this->output_type, "output_type", this->generate_data({"int32"})); + this->add(this->int8_vectorize, "int8_vectorize", this->generate_data({false})); } }; diff --git a/test/conv_common.hpp b/test/conv_common.hpp index 210817d027..6854f61914 100644 --- a/test/conv_common.hpp +++ b/test/conv_common.hpp @@ -239,9 +239,9 @@ tensor get_output_tensor(const miopen::ConvolutionDescriptor& filter, input.desc, weights.desc, yLayout, - weights.desc.GetType() == miopenInt8 || weights.desc.GetType() == miopenInt8x4 + weights.desc.GetType() == miopenInt8x4 ? (std::is_same{} ? 
miopenInt32 : miopenFloat) - : weights.desc.GetType())}; + : miopen_type{})}; } // Convolution test base class @@ -1652,7 +1652,7 @@ struct verify_forward_conv_int8 : conv_base } }; -template +template struct conv_driver : test_driver { tensor input; @@ -1683,6 +1683,8 @@ struct conv_driver : test_driver bool gen_float = false; bool immed = immed_mode; bool enable_fdb = true; + std::string output_type = ""; + bool int8_vectorize = false; bool deterministic = false; std::unordered_map cmode_lookup = { @@ -1828,6 +1830,7 @@ struct conv_driver : test_driver filter.spatialDim = get_spatial_dim(); else filter.spatialDim = filter_dims.size(); + bool is_int8 = (input.desc.GetType() == miopenInt8 || input.desc.GetType() == miopenInt8x4); filter.mode = cmode_lookup[miopen::ToUpper(conv_mode)]; filter.paddingMode = pmode_lookup[miopen::ToUpper(pad_mode)]; @@ -2007,8 +2010,6 @@ struct conv_driver : test_driver wei_k_len = weights.desc.GetLengths()[3]; } - bool is_int8 = (input.desc.GetType() == miopenInt8 || input.desc.GetType() == miopenInt8x4); - // lack of transposeConv or groupConv for int8 type if(is_int8 && filter.mode == miopenTranspose) { @@ -2118,7 +2119,8 @@ struct conv_driver : test_driver (filter.group_count > 1 && (input.desc.GetLengths().at(1) % weights.desc.GetLengths().at(0) == 0))))) { - auto output = get_output_tensor(filter, input, weights, out_layout); + auto output = get_output_tensor(filter, input, weights, out_layout); + auto gen_positive_value = [=](auto...) { auto data_type = input.desc.GetType(); std::size_t v_max = is_int8 ? 16 : (data_type == miopenHalf) ? 
4 : 16; @@ -2143,7 +2145,8 @@ struct conv_driver : test_driver miopen::conv::Direction::Forward); ctx.SetStream(&get_handle()); - bool skip_forward = (input.desc.GetType() == miopenInt8x4 && !IsGemmAplicable(ctx)); + bool skip_forward = false; + bool skip_backward_data = is_int8; bool skip_backward_weights = is_int8; @@ -2194,8 +2197,9 @@ struct conv_driver : test_driver size_t total_mem; if(is_int8) { + // TODO: Tout here was float which should have been int32 auto output_int8 = - get_output_tensor(filter, input, weights, out_layout); + get_output_tensor(filter, input, weights, out_layout); size_t workspace_size = filter.ForwardGetWorkSpaceSize( handle, weights.desc, input.desc, output_int8.desc); @@ -2258,26 +2262,45 @@ struct conv_driver : test_driver { if(is_int8) { - verify(verify_forward_conv{ - input, - weights, - get_output_tensor(filter, input, weights, out_layout), - filter, - stats, - 0, - search, - false, - immed}); - verify(verify_forward_conv{ - input, - weights, - get_output_tensor(filter, input, weights, out_layout), - filter, - stats, - 0, - search, - false, - immed}); + if(output_type == "float") + { + verify(verify_forward_conv{ + input, + weights, + get_output_tensor(filter, input, weights, out_layout), + filter, + stats, + 0, + search, + int8_vectorize, + immed}); + } + else if(output_type == "int32") + { + verify(verify_forward_conv{ + input, + weights, + get_output_tensor(filter, input, weights, out_layout), + filter, + stats, + 0, + search, + int8_vectorize, + immed}); + } + else if(output_type == "int8") + { + verify(verify_forward_conv{ + input, + weights, + get_output_tensor(filter, input, weights, out_layout), + filter, + stats, + 0, + search, + int8_vectorize, + immed}); + } } else { diff --git a/test/tensor_cast.cpp b/test/tensor_cast.cpp index 44f6c3f315..3fe11281cf 100644 --- a/test/tensor_cast.cpp +++ b/test/tensor_cast.cpp @@ -170,8 +170,9 @@ struct tensor_cast_driver : test_driver void run() { - unsigned long max_value = 
miopen_type{} == miopenHalf ? 5 : 32767; - max_val = miopen_type{} == miopenHalf + unsigned long max_value = + miopen_type{} == miopenHalf ? 5 : (miopen_type{} == miopenInt8 ? 126 : 32767); + max_val = miopen_type{} == miopenHalf ? 65504.0 : miopen_type{} == miopenInt8 ? 127.0 diff --git a/test/tensor_holder.hpp b/test/tensor_holder.hpp index 68edf9483e..79e3972e0f 100644 --- a/test/tensor_holder.hpp +++ b/test/tensor_holder.hpp @@ -165,7 +165,7 @@ struct tensor { assert(rhs.GetType() == miopen_type{} || ((miopen_type{} == miopenInt8 || miopen_type{} == miopenInt8x4) && - rhs.GetType() == miopenFloat)); + (rhs.GetType() == miopenFloat || rhs.GetType() == miopenInt32))); data.resize(desc.GetElementSpace()); }