diff --git a/CMakeLists.txt b/CMakeLists.txt index a2cb1e3433..2712bd148c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -219,6 +219,12 @@ if( DEFINED MIOPEN_OVERRIDE_HIP_VERSION_PATCH ) message(STATUS "MIOPEN_hip_VERSION_PATCH overriden with ${MIOPEN_OVERRIDE_HIP_VERSION_PATCH}") endif() +# Depend on Composable Kernels +option(MIOPEN_USE_COMPOSABLEKERNEL "Enable MIOpen to use composable kernels for various operations" Off) +if(MIOPEN_USE_COMPOSABLEKERNEL) +find_package(composable_kernel 1.0.0 COMPONENTS device_operations host_tensor) +endif() + set_var_to_condition(MIOPEN_USE_COMGR_DEFAULT (NOT DEFINED MIOPEN_BACKEND_OPENCL) AND (NOT (MIOPEN_BACKEND STREQUAL "HIPNOGPU"))) option(MIOPEN_USE_COMGR "Use comgr to build kernels instead of offline tools" ${MIOPEN_USE_COMGR_DEFAULT}) diff --git a/Dockerfile b/Dockerfile index cedc92d7dc..057d0e5ac3 100755 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM ubuntu:18.04 +FROM ubuntu:18.04 as miopen ARG USE_MLIR="OFF" @@ -8,7 +8,8 @@ RUN dpkg --add-architecture i386 # Add rocm repository # Note: The ROCm version with $USE_MLIR should keep in sync with default ROCm version # unless MLIR library is incompatible with current ROCm. 
- +RUN apt-get update +RUN apt-get install -y wget gnupg RUN if [ "$USE_MLIR" = "ON" ] ; \ then export ROCM_APT_VER=.apt_5.1;\ else \ @@ -17,6 +18,8 @@ RUN if [ "$USE_MLIR" = "ON" ] ; \ echo $ROCM_APT_VER &&\ sh -c 'echo deb [arch=amd64 trusted=yes] http://repo.radeon.com/rocm/apt/$ROCM_APT_VER/ ubuntu main > /etc/apt/sources.list.d/rocm.list' RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu bionic main universe | tee -a /etc/apt/sources.list" +RUN wget --no-check-certificate -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add - +RUN sh -c "echo deb https://apt.kitware.com/ubuntu/ bionic main | tee -a /etc/apt/sources.list" #Add gpg keys # Install dependencies @@ -33,7 +36,8 @@ wget -q -O - https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \ apt-get update && \ DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ build-essential \ - cmake \ + cmake-data=3.15.1-0kitware1 \ + cmake=3.15.1-0kitware1 \ comgr \ clang-format-10 \ doxygen \ @@ -128,4 +132,11 @@ RUN if [ "$USE_TARGETID" = "ON" ] ; then export HIPCC_LINK_FLAGS_APPEND='-O3 -pa ARG MIOTENSILE_VER="default" RUN if [ "$USE_TARGETID" = "OFF" ] ; then echo "MIOpenTensile is not installed."; elif [ "$MIOTENSILE_VER" = "latest" ] ; then cget -p $PREFIX install ROCmSoftwarePlatform/MIOpenTensile@94a9047741d16a8eccd290131b78fb1aa69cdcdf; else cget -p $PREFIX install ROCmSoftwarePlatform/MIOpenTensile@94a9047741d16a8eccd290131b78fb1aa69cdcdf; fi +ARG CK_COMMIT=91d8b7d67ae9dbf8a6e691ea3e17c0b9705c6ba7 +RUN wget -O ck.tar.gz https://www.github.com/rocmsoftwareplatform/composable_kernel/archive/${CK_COMMIT}.tar.gz && \ + tar zxvf ck.tar.gz &&\ + cd composable_kernel-${CK_COMMIT} && \ + mkdir build && cd build && \ + CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_CXX_COMPILER_LAUNCHER="${COMPILER_LAUNCHER}" -DCMAKE_PREFIX_PATH=/opt/rocm -D CMAKE_CXX_FLAGS=" --offload-arch=gfx900 --offload-arch=gfx906 --offload-arch=gfx908 --offload-arch=gfx90a 
--offload-arch=gfx1030 -O3 " .. && \ + make -j $(nproc) install RUN groupadd -f render diff --git a/Jenkinsfile b/Jenkinsfile index 33c7359e84..c601e3207a 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -301,6 +301,10 @@ def buildDocker(install_prefix) echo "Checking for image: ${image_name}" sh "docker manifest inspect --insecure ${image_name}" echo "Image: ${image_name} found!! Skipping building image" + if(params.DEBUG_FORCE_DOCKER_BUILD) + { + throw new Exception("Docker build override via DEBUG_FORCE_DOCKER_BUILD") + } } catch(Exception ex) { @@ -323,7 +327,7 @@ def buildDocker(install_prefix) /// * The default compiler is usually not specified. /// BuildType := { Release* | Debug | Install } [ BuildTypeModifier ] /// * BuildTypeModifier := { NOCOMGR | Embedded | Static | Normal-Find | Fast-Find -/// MLIR | Tensile | Tensile-Latest | Package | ... } +/// CK | MLIR | Tensile | Tensile-Latest | Package | ... } /// TestSet := { All | Smoke* } [ Codecov ] /// * "All" corresponds to "cmake -DMIOPEN_TEST_ALL=On". /// * "Smoke" (-DMIOPEN_TEST_ALL=Off) is the default and usually not specified. 
@@ -757,6 +761,20 @@ pipeline { buildHipClangJobAndReboot(compiler: 'g++', setup_flags: Int8_flags, config_targets: Smoke_targets) } } + stage('Int8 Hip Debug CK gfx908') { + when { + beforeAgent true + expression { params.TARGET_GFX908} + } + agent{ label rocmnode("gfx908") } + // This stage should be removed when CK is enabled by default in MIOpen + environment{ + Enable_CK = "-DMIOPEN_USE_COMPOSABLEKERNEL=On" + } + steps{ + buildHipClangJobAndReboot( build_type: 'debug', setup_flags: Enable_CK + Int8_flags , build_env: extra_log_env, test_flags: ' --verbose ') + } + } stage('Fp16 Hip Vega20') { when { beforeAgent true diff --git a/include/miopen/config.h.in b/include/miopen/config.h.in index 249748e940..21e7584906 100644 --- a/include/miopen/config.h.in +++ b/include/miopen/config.h.in @@ -48,6 +48,7 @@ #cmakedefine01 MIOPEN_LOG_FUNC_TIME_ENABLE #cmakedefine01 MIOPEN_ENABLE_SQLITE_BACKOFF #cmakedefine01 MIOPEN_USE_MLIR +#cmakedefine01 MIOPEN_USE_COMPOSABLEKERNEL // "_PACKAGE_" to avoid name contentions: the macros like // HIP_VERSION_MAJOR are defined in hip_version.h. 
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 748021d6a4..cfdcc92221 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -182,6 +182,7 @@ set( MIOpen_Source solver/conv_hip_implicit_gemm_fwd_v4r4_xdlops.cpp solver/conv_hip_implicit_gemm_fwd_v4r4_xdlops_padded_gemm.cpp solver/conv_hip_implicit_gemm_fwd_v4r5_xdlops.cpp + solver/conv_hip_implicit_gemm_fwd_xdlops.cpp solver/conv_hip_implicit_gemm_nonxdlops_common.cpp solver/conv_hip_implicit_gemm_wrw_v4r4.cpp solver/conv_hip_implicit_gemm_wrw_v4r4_xdlops.cpp @@ -633,9 +634,14 @@ target_include_directories(MIOpen PUBLIC $ ) +set(MIOPEN_CK_LINK_FLAGS) +if(MIOPEN_USE_COMPOSABLEKERNEL) +set(MIOPEN_CK_LINK_FLAGS composable_kernel::device_operations composable_kernel::host_tensor hip::host) +endif() + target_include_directories(MIOpen SYSTEM PUBLIC $) target_include_directories(MIOpen SYSTEM PRIVATE ${BZIP2_INCLUDE_DIR}) -target_link_libraries(MIOpen PRIVATE ${CMAKE_THREAD_LIBS_INIT} ${BZIP2_LIBRARIES}) +target_link_libraries(MIOpen PRIVATE ${CMAKE_THREAD_LIBS_INIT} ${BZIP2_LIBRARIES} ${MIOPEN_CK_LINK_FLAGS}) generate_export_header(MIOpen EXPORT_FILE_NAME ${PROJECT_BINARY_DIR}/include/miopen/export.h ) diff --git a/src/convolution.cpp b/src/convolution.cpp index 7c9b94df1c..50c1a685a4 100644 --- a/src/convolution.cpp +++ b/src/convolution.cpp @@ -265,8 +265,9 @@ ConvolutionDescriptor::GetForwardOutputTensorWithLayout(const TensorDescriptor& tensor_layout_to_strides( out_lens, default_layout, yLayout, xDesc.GetVectorLength(), out_strides); return {(xDesc.GetType() == miopenInt8 || xDesc.GetType() == miopenInt8x4 - ? (yType == miopenInt32 ? yType : miopenFloat) - : xDesc.GetType()), + ? (yType) + : xDesc.GetType()), // TODO: This function overrides the output type with + // essentially the input which is incorrect. 
xDesc.GetLayout_t(), out_lens, out_strides}; diff --git a/src/include/miopen/solver.hpp b/src/include/miopen/solver.hpp index 683d8670a9..975bf1bafe 100644 --- a/src/include/miopen/solver.hpp +++ b/src/include/miopen/solver.hpp @@ -4108,6 +4108,69 @@ struct ConvAsmImplicitGemmGTCDynamicFwdDlopsNCHWC final const PerformanceConfigAsmImplicitGemmGTCFwdDlopsNCHWC& config) const override; }; +struct PerformanceConfigHipImplicitGemmFwdXdlops + : PerfConfigBase +{ + int index; + std::string kernel_id; + int total_size; + PerformanceConfigHipImplicitGemmFwdXdlops(int idx, std::string kernl_id) + : index(idx), kernel_id(kernl_id), total_size(-1) + { + } + PerformanceConfigHipImplicitGemmFwdXdlops() : PerformanceConfigHipImplicitGemmFwdXdlops(0, "") + { + } + PerformanceConfigHipImplicitGemmFwdXdlops(bool) + : PerformanceConfigHipImplicitGemmFwdXdlops(0, "") + { + } + void HeuristicInit(const ConvolutionContext& ctx); + bool SetNextValue(const ConvolutionContext& ctx); + bool IsValidValue() const; + bool IsValid(const ConvolutionContext& ctx) const; + template + static void Visit(Self&& s, F f) + { + f(s.kernel_id, "kernel_id"); + } + bool operator==(const PerformanceConfigHipImplicitGemmFwdXdlops& other) const; +}; + +struct ConvHipImplicitGemmFwdXdlops final + : ConvTunableSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + PerformanceConfigHipImplicitGemmFwdXdlops + GetDefaultPerformanceConfig(const ConvolutionContext&) const override; + bool IsValidPerformanceConfig(const ConvolutionContext&, + const PerformanceConfigHipImplicitGemmFwdXdlops&) const override; + PerformanceConfigHipImplicitGemmFwdXdlops + Search(const ConvolutionContext&, const AnyInvokeParams& invoke_ctx) const override; + size_t GetWorkspaceSize(const ConvolutionContext& ctx) const override; + bool MayNeedWorkspace() const override { return false; } + bool IsApplicable(const ConvolutionContext& ctx) const override; + bool IsDynamic() const override { return 
true; } + ConvSolution + GetSolution(const ConvolutionContext& ctx, + const PerformanceConfigHipImplicitGemmFwdXdlops& config) const override; + // Magic Number Alert: + // Naive convolutions have GetWti() that returns a very small value (0.01f). + // This allows MIOpen to use Naive Solvers if no other applicable Solvers + // have known WTIs. Right now this means that in case of find-db miss, + // the library will try to use Winograd or GEMM (whatever is faster according + // to their GetWti's), but if both are not applicable, the library will + // use the Naive Solver. + // Since we would like to use CK before naive, and use it instead (because + // we do expect that CK is faster than Naive), therefore we use a + // value bigger than 0.01f, e.g. 0.02f. + float GetWti(const ConvolutionContext&) const override { return 0.02f; }; +}; + struct AnySolver; } // namespace solver diff --git a/src/mlo_dir_conv.cpp b/src/mlo_dir_conv.cpp index 053d258f12..8a960d647a 100644 --- a/src/mlo_dir_conv.cpp +++ b/src/mlo_dir_conv.cpp @@ -146,6 +146,7 @@ static auto GetImplicitGemmSolvers() miopen::solver::ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC, miopen::solver::ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC, miopen::solver::ConvCkIgemmFwdV6r1DlopsNchw, + miopen::solver::ConvHipImplicitGemmFwdXdlops, miopen::solver::ConvAsmImplicitGemmGTCDynamicFwdDlopsNCHWC>{}; } diff --git a/src/ocl/convolutionocl.cpp b/src/ocl/convolutionocl.cpp index e8c2fbc6df..37db876bcd 100644 --- a/src/ocl/convolutionocl.cpp +++ b/src/ocl/convolutionocl.cpp @@ -566,10 +566,6 @@ void ValidateConvTensors(const ConvTensors& tensors) const auto trivial_tensor_types_not_matched = tensors.xDesc.GetType() != tensors.yDesc.GetType() && tensors.xDesc.GetType() != miopenInt8 && tensors.xDesc.GetType() != miopenInt8x4; - const auto int8_in8x4_tensor_not_matched = - (tensors.xDesc.GetType() == miopenInt8 && tensors.yDesc.GetType() != miopenInt32 && - tensors.yDesc.GetType() != miopenFloat) || - (tensors.xDesc.GetType() == 
miopenInt8x4 && tensors.yDesc.GetType() != miopenInt32); // if(xDesc.GetLengths()[1] != wDesc.GetLengths()[1]) { // MIOPEN_THROW(miopenStatusBadParm); @@ -578,8 +574,7 @@ void ValidateConvTensors(const ConvTensors& tensors) const auto x_tensor_invalid = tensors.xDesc.GetSize() < 3; const auto bad_parameters = invalid_buffers || tensor_sizes_not_matched || - trivial_tensor_types_not_matched || int8_in8x4_tensor_not_matched || - x_tensor_invalid; + trivial_tensor_types_not_matched || x_tensor_invalid; if(bad_parameters) MIOPEN_THROW(miopenStatusBadParm); diff --git a/src/solver.cpp b/src/solver.cpp index 468c22607c..19282d48c9 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -508,6 +508,8 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) ++id, ConvAsmImplicitGemmGTCDynamicFwdDlopsNCHWC{}, miopenConvolutionAlgoImplicitGEMM); + RegisterWithSolver( + registry, ++id, ConvHipImplicitGemmFwdXdlops{}, miopenConvolutionAlgoImplicitGEMM); // IMPORTANT: New solvers should be added to the end of the function! } diff --git a/src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp new file mode 100644 index 0000000000..1ea00f635a --- /dev/null +++ b/src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp @@ -0,0 +1,292 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2022 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include + +#include +#include +#include +#include +#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL +#include +#endif + +MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS) + +namespace miopen { +namespace solver { + +struct CKArgs +{ + CKArgs(const ConvolutionContext& ctx) + { + N = ConvolutionContextInterpreter::GetBatchN(ctx); + K = ConvolutionContextInterpreter::GetOutputChannelK(ctx); + C = ConvolutionContextInterpreter::GetInputChannelC(ctx); + input = {ConvolutionContextInterpreter::GetInputHeightHi(ctx), + ConvolutionContextInterpreter::GetInputWidthWi(ctx)}; + output = {ConvolutionContextInterpreter::GetOutputHeightHo(ctx), + ConvolutionContextInterpreter::GetOutputWidthWo(ctx)}; + filter = {ConvolutionContextInterpreter::GetFilterHeightY(ctx), + ConvolutionContextInterpreter::GetFilterWidthX(ctx)}; + strides = {ConvolutionContextInterpreter::GetAdjustedConvolutionStrideH(ctx), + ConvolutionContextInterpreter::GetAdjustedConvolutionStrideW(ctx)}; + dilation = {ConvolutionContextInterpreter::GetAdjustedConvolutionDilationH(ctx), + ConvolutionContextInterpreter::GetAdjustedConvolutionDilationW(ctx)}; + lPadding = {ConvolutionContextInterpreter::GetInputLeftPadH(ctx), + ConvolutionContextInterpreter::GetInputLeftPadW(ctx)}; + rPadding = {ConvolutionContextInterpreter::GetAdjustedInputRightPadH(ctx), + ConvolutionContextInterpreter::GetAdjustedInputRightPadW(ctx)}; + } + int N; + int K; + int C; + std::vector input; + std::vector output; + std::vector filter; + std::vector strides; + std::vector dilation; + std::vector lPadding; + std::vector rPadding; +}; + +void PerformanceConfigHipImplicitGemmFwdXdlops::HeuristicInit(const ConvolutionContext& ctx) +{ + this->index = 0; +#if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL + std::ignore = ctx; +#else + this->index = 0; + std::vector conv_ptrs; + 
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances_t(conv_ptrs); + assert(!conv_ptrs.empty()); + this->total_size = conv_ptrs.size(); + const auto args = CKArgs{ctx}; + for(auto& conv_ptr : conv_ptrs) + { + auto argument_ptr = conv_ptr.MakeArgumentPointer(nullptr, + nullptr, + nullptr, + args.N, + args.K, + args.C, + args.input, + args.filter, + args.output, + args.strides, + args.dilation, + args.lPadding, + args.rPadding); + if(conv_ptr.IsSupportedArgument(argument_ptr.get())) + { + this->kernel_id = conv_ptr.GetTypeString(); + break; + } + ++this->index; + } +#endif +} + +bool PerformanceConfigHipImplicitGemmFwdXdlops::SetNextValue(const ConvolutionContext& ctx) +{ + if(total_size == -1) + this->HeuristicInit(ctx); + assert(total_size != -1); + if((index + 1) < total_size) + { + ++index; + return true; + } + else + return false; +} + +bool PerformanceConfigHipImplicitGemmFwdXdlops::IsValidValue() const { return index < total_size; } + +bool PerformanceConfigHipImplicitGemmFwdXdlops::IsValid(const ConvolutionContext& ctx) const +{ +#if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL + std::ignore = ctx; + return false; +#else + std::vector conv_ptrs; + add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances_t(conv_ptrs); + const auto args = CKArgs{ctx}; + auto argument_ptr = conv_ptrs[this->index].MakeArgumentPointer(nullptr, + nullptr, + nullptr, + args.N, + args.K, + args.C, + args.input, + args.filter, + args.input, + args.strides, + args.dilation, + args.lPadding, + args.rPadding); + return conv_ptrs[this->index].IsSupportedArgument(argument_ptr.get()); +#endif +} + +bool PerformanceConfigHipImplicitGemmFwdXdlops::operator==( + const PerformanceConfigHipImplicitGemmFwdXdlops& other) const +{ + return this->index == other.index; +} + +PerformanceConfigHipImplicitGemmFwdXdlops +ConvHipImplicitGemmFwdXdlops::GetDefaultPerformanceConfig(const ConvolutionContext& ctx) const +{ + PerformanceConfigHipImplicitGemmFwdXdlops pp; + pp.HeuristicInit(ctx); + 
return pp; +} + +bool ConvHipImplicitGemmFwdXdlops::IsValidPerformanceConfig( + const ConvolutionContext& ctx, const PerformanceConfigHipImplicitGemmFwdXdlops& config) const +{ + return config.IsValid(ctx); +} + +PerformanceConfigHipImplicitGemmFwdXdlops +ConvHipImplicitGemmFwdXdlops::Search(const ConvolutionContext& ctx, + const AnyInvokeParams& invoke_ctx) const +{ + return GenericSearch(*this, ctx, invoke_ctx); +} + +size_t ConvHipImplicitGemmFwdXdlops::GetWorkspaceSize(const ConvolutionContext& ctx) const +{ + std::ignore = ctx; + return 0; +} + +bool ConvHipImplicitGemmFwdXdlops::IsApplicable(const ConvolutionContext& ctx) const +{ +#if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL + std::ignore = ctx; + return false; +#else + if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS{})) + return false; + if(miopen::IsEnabled(MIOPEN_DEBUG_CONVOLUTION_DETERMINISTIC{})) + return false; + if(!(ctx.conv_problem.GetInDataType() == miopenInt8 && + ctx.conv_problem.GetWeightsDataType() == miopenInt8 && + ctx.conv_problem.GetOutDataType() == miopenInt8)) + return false; + if(!ctx.direction.IsForward()) + return false; + if(!ctx.Is2d()) + return false; + if(ctx.GetStream().GetDeviceName() != "gfx908") + return false; + if(!ctx.IsLayoutNHWC()) + return false; + + const auto args = CKArgs{ctx}; + if(!std::all_of(args.strides.begin(), args.strides.end(), [&](auto x) { return x == 1; })) + return false; + + std::vector conv_ptrs; + add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances_t(conv_ptrs); + assert(!conv_ptrs.empty()); + + for(auto& conv_ptr : conv_ptrs) + { + auto argument_ptr = conv_ptr.MakeArgumentPointer(nullptr, + nullptr, + nullptr, + args.N, + args.K, + args.C, + args.input, + args.filter, + args.input, + args.strides, + args.dilation, + args.lPadding, + args.rPadding); + if(conv_ptr.IsSupportedArgument(argument_ptr.get())) + return true; + } + return false; +#endif +} + +ConvSolution ConvHipImplicitGemmFwdXdlops::GetSolution( + const 
ConvolutionContext& ctx, const PerformanceConfigHipImplicitGemmFwdXdlops& config) const +{ +#if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL + std::ignore = ctx; + std::ignore = config; + return {}; +#else + ConvSolution result; + const auto args = CKArgs{ctx}; + result.invoker_factory = [=](const std::vector& kernels) { + std::ignore = kernels; + return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) { + std::vector conv_ptrs; + add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances_t(conv_ptrs); + auto& conv_ptr = conv_ptrs.at(config.index); + const auto& data_ctx = primitive_parameters.CastTo(); + const auto& tensors = data_ctx.tensors; + auto argument_ptr = conv_ptr.MakeArgumentPointer( + const_cast( // NOLINT (cppcoreguidelines-pro-type-const-cast) + static_cast(tensors.in)), + const_cast( // NOLINT (cppcoreguidelines-pro-type-const-cast) + static_cast(tensors.w)), + static_cast(tensors.out), + args.N, + args.K, + args.C, + args.input, + args.filter, + args.input, + args.strides, + args.dilation, + args.lPadding, + args.rPadding); + auto invoker_ptr = conv_ptr.MakeInvokerPointer(); + const auto enable_profiling = handle.IsProfilingEnabled(); + + float elapsed_time = + invoker_ptr->Run(argument_ptr.get(), {handle.GetStream(), enable_profiling}); + if(enable_profiling) + { + handle.ResetKernelTime(); + handle.AccumKernelTime(elapsed_time); + } + }; + }; + return result; +#endif +} + +} // namespace solver +} // namespace miopen diff --git a/src/solver/gemm.cpp b/src/solver/gemm.cpp index 9baa9e2614..e23c05729e 100644 --- a/src/solver/gemm.cpp +++ b/src/solver/gemm.cpp @@ -87,6 +87,13 @@ bool GemmFwdBase::IsApplicable(const ExecutionContext& ctx, const auto& xDesc = problem.GetIn(); const auto& wDesc = problem.GetWeights(); const auto& yDesc = problem.GetOut(); + if(xDesc.GetType() == miopenInt8x4 || xDesc.GetType() == miopenInt8) + { + // rocBlas needs the output to be int32 always + if(yDesc.GetType() != miopenFloat && 
yDesc.GetType() != miopenInt32 && + yDesc.GetType() != miopenInt8x4) + return false; + } return problem.GetDirection() == conv::Direction::Forward && problem.IsLayoutDefault() && !(IsAnyBufferBF16(xDesc, yDesc, wDesc) && !IsBf16Supported) && !(IsAnyBufferFp16(xDesc, yDesc, wDesc) && !IsFp16Supported); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index d0a22cd25d..f1984e2af5 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -618,7 +618,6 @@ function(add_custom_test NAME) set_tests_properties(${NAME} PROPERTIES DISABLED On) endif() endfunction() - if(${CODECOV_TEST}) add_custom_test(test_conv3d_codecov COMMAND $ ${MIOPEN_TEST_FLOAT_ARG} --input 2 4 4 4 4 --weights 2 4 1 1 1 --pads_strides_dilations 0 0 0 1 1 1 1 1 1 ${MIOPEN_TEST_FLAGS_ARGS} @@ -775,6 +774,15 @@ add_custom_test(test_conv_igemm_mlir_xdlops_fwd SKIP_UNLESS_ALL HALF_ENABLED INT COMMAND ${IMPLICITGEMM_MLIR_ENV_F_XDLOPS} $ ${IMPLICITGEMM_MLIR_ARGS_F} --input 256 256 56 56 --weights 256 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --group-count 4 ) +if(MIOPEN_USE_COMPOSABLEKERNEL) +add_custom_test(test_conv_hip_igemm_xdlops SKIP_UNLESS_ALL OCL_DISABLED HALF_DISABLED FLOAT_DISABLED INT8_ENABLED GFX900_DISABLED GFX906_DISABLED GFX90A_DISABLED + COMMAND $ ${MIOPEN_TEST_FLOAT_ARG} --disable-backward-data --disable-backward-weights --verbose --input 256 128 28 28 --weights 128 128 3 3 --output_type int8 --in_layout NHWC --fil_layout NHWC --out_layout NHWC --pads_strides_dilations 1 1 1 1 1 1 + COMMAND $ ${MIOPEN_TEST_FLOAT_ARG} --disable-backward-data --disable-backward-weights --verbose --input 128 512 7 7 --weights 512 512 3 3 --output_type int8 --in_layout NHWC --fil_layout NHWC --out_layout NHWC --pads_strides_dilations 1 1 1 1 1 1 + COMMAND $ ${MIOPEN_TEST_FLOAT_ARG} --disable-backward-data --disable-backward-weights --verbose --input 128 64 56 56 --weights 64 64 1 1 --output_type int8 --in_layout NHWC --fil_layout NHWC --out_layout NHWC --pads_strides_dilations 0 0 1 1 1 1 + COMMAND $ 
${MIOPEN_TEST_FLOAT_ARG} --disable-backward-data --disable-backward-weights --verbose --input 256 256 56 56 --weights 256 64 1 1 --output_type int8 --in_layout NHWC --fil_layout NHWC --out_layout NHWC --pads_strides_dilations 0 0 1 1 1 1 +) +endif() + add_custom_test(test_conv_igemm_mlir_xdlops_bwd_wrw SKIP_UNLESS_ALL HALF_ENABLED SKIP_UNLESS_MLIR GFX900_DISABLED GFX906_DISABLED COMMAND ${IMPLICITGEMM_MLIR_ENV_B_XDLOPS} $ ${IMPLICITGEMM_MLIR_ARGS_B} --input 256 1024 14 14 --weights 2048 1024 1 1 --pads_strides_dilations 0 0 2 2 1 1 COMMAND ${IMPLICITGEMM_MLIR_ENV_B_XDLOPS} $ ${IMPLICITGEMM_MLIR_ARGS_B} --input 256 1024 14 14 --weights 2048 1024 1 1 --pads_strides_dilations 0 0 2 2 1 1 --in_layout NHWC --fil_layout NHWC --out_layout NHWC diff --git a/test/conv2d.cpp b/test/conv2d.cpp index 6a6fb4f6db..f781f1c653 100644 --- a/test/conv2d.cpp +++ b/test/conv2d.cpp @@ -59,6 +59,9 @@ struct conv2d_driver : conv_driver this->add(this->deterministic, "deterministic", this->generate_data({false})); this->add(this->tensor_vect, "tensor_vect", this->generate_data({0})); this->add(this->vector_length, "vector_length", this->generate_data({1})); + // Only valid for int8 input and weights + this->add(this->output_type, "output_type", this->generate_data({"int32"})); + this->add(this->int8_vectorize, "int8_vectorize", this->generate_data({false})); } }; diff --git a/test/conv_common.hpp b/test/conv_common.hpp index 210817d027..6854f61914 100644 --- a/test/conv_common.hpp +++ b/test/conv_common.hpp @@ -239,9 +239,9 @@ tensor get_output_tensor(const miopen::ConvolutionDescriptor& filter, input.desc, weights.desc, yLayout, - weights.desc.GetType() == miopenInt8 || weights.desc.GetType() == miopenInt8x4 + weights.desc.GetType() == miopenInt8x4 ? (std::is_same{} ? 
miopenInt32 : miopenFloat) - : weights.desc.GetType())}; + : miopen_type{})}; } // Convolution test base class @@ -1652,7 +1652,7 @@ struct verify_forward_conv_int8 : conv_base } }; -template +template struct conv_driver : test_driver { tensor input; @@ -1683,6 +1683,8 @@ struct conv_driver : test_driver bool gen_float = false; bool immed = immed_mode; bool enable_fdb = true; + std::string output_type = ""; + bool int8_vectorize = false; bool deterministic = false; std::unordered_map cmode_lookup = { @@ -1828,6 +1830,7 @@ struct conv_driver : test_driver filter.spatialDim = get_spatial_dim(); else filter.spatialDim = filter_dims.size(); + bool is_int8 = (input.desc.GetType() == miopenInt8 || input.desc.GetType() == miopenInt8x4); filter.mode = cmode_lookup[miopen::ToUpper(conv_mode)]; filter.paddingMode = pmode_lookup[miopen::ToUpper(pad_mode)]; @@ -2007,8 +2010,6 @@ struct conv_driver : test_driver wei_k_len = weights.desc.GetLengths()[3]; } - bool is_int8 = (input.desc.GetType() == miopenInt8 || input.desc.GetType() == miopenInt8x4); - // lack of transposeConv or groupConv for int8 type if(is_int8 && filter.mode == miopenTranspose) { @@ -2118,7 +2119,8 @@ struct conv_driver : test_driver (filter.group_count > 1 && (input.desc.GetLengths().at(1) % weights.desc.GetLengths().at(0) == 0))))) { - auto output = get_output_tensor(filter, input, weights, out_layout); + auto output = get_output_tensor(filter, input, weights, out_layout); + auto gen_positive_value = [=](auto...) { auto data_type = input.desc.GetType(); std::size_t v_max = is_int8 ? 16 : (data_type == miopenHalf) ? 
4 : 16; @@ -2143,7 +2145,8 @@ struct conv_driver : test_driver miopen::conv::Direction::Forward); ctx.SetStream(&get_handle()); - bool skip_forward = (input.desc.GetType() == miopenInt8x4 && !IsGemmAplicable(ctx)); + bool skip_forward = false; + bool skip_backward_data = is_int8; bool skip_backward_weights = is_int8; @@ -2194,8 +2197,9 @@ struct conv_driver : test_driver size_t total_mem; if(is_int8) { + // TODO: Tout here was float which should have been int32 auto output_int8 = - get_output_tensor(filter, input, weights, out_layout); + get_output_tensor(filter, input, weights, out_layout); size_t workspace_size = filter.ForwardGetWorkSpaceSize( handle, weights.desc, input.desc, output_int8.desc); @@ -2258,26 +2262,45 @@ struct conv_driver : test_driver { if(is_int8) { - verify(verify_forward_conv{ - input, - weights, - get_output_tensor(filter, input, weights, out_layout), - filter, - stats, - 0, - search, - false, - immed}); - verify(verify_forward_conv{ - input, - weights, - get_output_tensor(filter, input, weights, out_layout), - filter, - stats, - 0, - search, - false, - immed}); + if(output_type == "float") + { + verify(verify_forward_conv{ + input, + weights, + get_output_tensor(filter, input, weights, out_layout), + filter, + stats, + 0, + search, + int8_vectorize, + immed}); + } + else if(output_type == "int32") + { + verify(verify_forward_conv{ + input, + weights, + get_output_tensor(filter, input, weights, out_layout), + filter, + stats, + 0, + search, + int8_vectorize, + immed}); + } + else if(output_type == "int8") + { + verify(verify_forward_conv{ + input, + weights, + get_output_tensor(filter, input, weights, out_layout), + filter, + stats, + 0, + search, + int8_vectorize, + immed}); + } } else { diff --git a/test/tensor_cast.cpp b/test/tensor_cast.cpp index 44f6c3f315..3fe11281cf 100644 --- a/test/tensor_cast.cpp +++ b/test/tensor_cast.cpp @@ -170,8 +170,9 @@ struct tensor_cast_driver : test_driver void run() { - unsigned long max_value = 
miopen_type{} == miopenHalf ? 5 : 32767; - max_val = miopen_type{} == miopenHalf + unsigned long max_value = + miopen_type{} == miopenHalf ? 5 : (miopen_type{} == miopenInt8 ? 126 : 32767); + max_val = miopen_type{} == miopenHalf ? 65504.0 : miopen_type{} == miopenInt8 ? 127.0 diff --git a/test/tensor_holder.hpp b/test/tensor_holder.hpp index 68edf9483e..79e3972e0f 100644 --- a/test/tensor_holder.hpp +++ b/test/tensor_holder.hpp @@ -165,7 +165,7 @@ struct tensor { assert(rhs.GetType() == miopen_type{} || ((miopen_type{} == miopenInt8 || miopen_type{} == miopenInt8x4) && - rhs.GetType() == miopenFloat)); + (rhs.GetType() == miopenFloat || rhs.GetType() == miopenInt32))); data.resize(desc.GetElementSpace()); }