
Commit 78f637e

Merge pull request #58 from ROCm/navi4x_conv_fwd
Navi4x Conv and MHA enablement
2 parents: 7e147c6 + 5cb59d3

15 files changed: +66 -78 lines

example/20_grouped_conv_bwd_weight/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 list(APPEND gpu_list_xdl gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
-list(APPEND gpu_list_wmma gfx1100 gfx1101 gfx1102)
+list(APPEND gpu_list_wmma gfx1100 gfx1101 gfx1102 gfx1200)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
     if(gpu IN_LIST gpu_list_xdl AND target EQUAL 0)

example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 list(APPEND gpu_list1 gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
-list(APPEND gpu_list2 gfx1100 gfx1101 gfx1102 gfx1103)
+list(APPEND gpu_list2 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200)

 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)

example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-if(GPU_TARGETS MATCHES "gfx11")
+if(GPU_TARGETS MATCHES "gfx11" OR GPU_TARGETS MATCHES "gfx12")
     add_example_executable(example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16 batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16.cpp)
     add_example_executable(example_batched_gemm_scale_softmax_gemm_permute_wmma_fp16 batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp)
     add_example_executable(example_self_attention_forward_wmma_fp16 self_attention_forward_wmma_fp16.cpp)

example/38_grouped_conv_bwd_data_multiple_d/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 list(APPEND gpu_list_xdl gfx908 gfx90a gfx940 gfx941 gfx942 gfx950)
-list(APPEND gpu_list_wmma gfx1100 gfx1101 gfx1102)
+list(APPEND gpu_list_wmma gfx1100 gfx1101 gfx1102 gfx1200)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
     if(gpu IN_LIST gpu_list_xdl AND target EQUAL 0)

include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp

Lines changed: 34 additions & 48 deletions
@@ -70,6 +70,9 @@ struct BlockwiseGemmWMMA
     static constexpr index_t A_KRow = 2;
     static constexpr index_t B_KRow = 2;

+    static constexpr index_t A_KRow_ = AEnableLds ? 1 : 2;
+    static constexpr index_t B_KRow_ = BEnableLds ? 1 : 2;
+
     static constexpr index_t A_K1 = ABlockDesc{}.GetLength(I5);
     static constexpr index_t B_K1 = BBlockDesc{}.GetLength(I5);

@@ -191,9 +194,6 @@ struct BlockwiseGemmWMMA
         static_assert(MPerBlock % (MPerWMMA * MRepeat) == 0 &&
                           NPerBlock % (NPerWMMA * NRepeat) == 0,
                       "wrong!");
-
-        static_assert(AEnableLds == true, "only support EnableLds");
-        static_assert(BEnableLds == true, "only support EnableLds");
     }

     // transposed WMMA output C' = B' * A'
@@ -316,7 +316,7 @@ struct BlockwiseGemmWMMA
                     // read A
                     a_thread_copy_.Run(
                         a_block_desc_k0_m0_m1_m2_k1,
-                        make_tuple(Number<k * KPack / A_K1>{}, m0, I0, I0, I0, I0),
+                        make_tuple(Number<k * KPack / A_K1 / A_KRow_>{}, m0, I0, I0, I0, I0),
                         a_block_buf,
                         a_thread_desc_,
                         make_tuple(I0, m0, I0, I0, I0, I0),
@@ -326,7 +326,8 @@ struct BlockwiseGemmWMMA
                     // read B
                     b_thread_copy_.Run(
                         b_block_desc_k0_n0_n1_n2_k1,
-                        make_tuple(Number<k * KPack / B_K1>{}, n0, I0, I0, I0, I0),
+                        make_tuple(
+                            Number<k * KPack / B_K1 / B_KRow_>{}, n0, I0, I0, I0, I0),
                         b_block_buf,
                         b_thread_desc_,
                         make_tuple(I0, n0, I0, I0, I0, I0),
@@ -372,15 +373,15 @@ struct BlockwiseGemmWMMA
                 // read B
                 b_thread_copy_.Run(
                     b_block_desc_k0_n0_n1_n2_k1,
-                    make_tuple(Number<k * KPack / B_K1>{}, n0, I0, I0, I0, I0),
+                    make_tuple(Number<k * KPack / B_K1 / B_KRow_>{}, n0, I0, I0, I0, I0),
                     b_block_buf,
                     b_thread_desc_,
                     make_tuple(I0, n0, I0, I0, I0, I0),
                     b_thread_buf);
                 // read A
                 a_thread_copy_.Run(
                     a_block_desc_k0_m0_m1_m2_k1,
-                    make_tuple(Number<k * KPack / A_K1>{}, m0, I0, I0, I0, I0),
+                    make_tuple(Number<k * KPack / A_K1 / A_KRow_>{}, m0, I0, I0, I0, I0),
                     a_block_buf,
                     a_thread_desc_,
                     make_tuple(I0, m0, I0, I0, I0, I0),
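The divided K0 offset in the three hunks above is the functional core of this file's change: when an operand bypasses LDS (AEnableLds or BEnableLds == false), its block descriptor already splits K across A_KRow_ / B_KRow_ = 2 rows, so the per-iteration K0 coordinate must shrink by the same factor. A standalone sketch of that index arithmetic, using made-up constants (KPack = 8, A_K1 = 4) rather than CK's real tuning parameters:

// Illustration only, not CK code: how the new A_KRow_ divisor changes the
// K0 source coordinate per k-iteration. All constants are hypothetical.
#include <cstdio>
#include <initializer_list>

int main()
{
    constexpr int KPack = 8;
    constexpr int A_K1  = 4;

    for(bool a_enable_lds : {true, false})
    {
        // Mirrors the diff: A_KRow_ = AEnableLds ? 1 : 2.
        const int a_krow_ = a_enable_lds ? 1 : 2;
        for(int k = 0; k < 4; ++k)
        {
            // The K0 coordinate handed to a_thread_copy_.Run(...).
            const int k0 = k * KPack / A_K1 / a_krow_;
            std::printf("AEnableLds=%d k=%d -> K0=%d\n", a_enable_lds, k, k0);
        }
    }
    return 0;
}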
@@ -442,44 +443,30 @@ struct BlockwiseGemmWMMA
     static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed(
         make_tuple(Number<MRepeat>{}, Number<NRepeat>{}, wmma_gemm.GetRegSizePerWmma()));

-    template <bool EnableLds>
-    struct AThreadCopySelector;
-
-    template <>
-    struct AThreadCopySelector<true>
-    {
-        using type =
-            ThreadwiseTensorSliceTransfer_v4<FloatA,
-                                             FloatA,
-                                             decltype(a_block_desc_k0_m0_m1_m2_k1),
-                                             decltype(a_thread_desc_),
-                                             Sequence<KPack / A_K1 / A_KRow, 1, 1, 1, 1, A_K1>,
-                                             Sequence<0, 1, 2, 3, 4, 5>,
-                                             5,
-                                             A_K1,
-                                             A_K1>;
-    };
-
-    template <bool EnableLds>
-    struct BThreadCopySelector;
-
-    template <>
-    struct BThreadCopySelector<true>
-    {
-        using type =
-            ThreadwiseTensorSliceTransfer_v4<FloatB,
-                                             FloatB,
-                                             decltype(b_block_desc_k0_n0_n1_n2_k1),
-                                             decltype(b_thread_desc_),
-                                             Sequence<KPack / B_K1 / B_KRow, 1, 1, 1, 1, B_K1>,
-                                             Sequence<0, 1, 2, 3, 4, 5>,
-                                             5,
-                                             B_K1,
-                                             B_K1>;
-    };
-
-    typename AThreadCopySelector<AEnableLds>::type a_thread_copy_;
-    typename BThreadCopySelector<BEnableLds>::type b_thread_copy_;
+    using AThreadCopyType =
+        ThreadwiseTensorSliceTransfer_v4<FloatA,
+                                         FloatA,
+                                         decltype(a_block_desc_k0_m0_m1_m2_k1),
+                                         decltype(a_thread_desc_),
+                                         Sequence<KPack / A_K1 / A_KRow, 1, 1, 1, 1, A_K1>,
+                                         Sequence<0, 1, 2, 3, 4, 5>,
+                                         5,
+                                         A_K1,
+                                         A_K1>;
+
+    using BThreadCopyType =
+        ThreadwiseTensorSliceTransfer_v4<FloatB,
+                                         FloatB,
+                                         decltype(b_block_desc_k0_n0_n1_n2_k1),
+                                         decltype(b_thread_desc_),
+                                         Sequence<KPack / B_K1 / B_KRow, 1, 1, 1, 1, B_K1>,
+                                         Sequence<0, 1, 2, 3, 4, 5>,
+                                         5,
+                                         B_K1,
+                                         B_K1>;
+
+    AThreadCopyType a_thread_copy_;
+    BThreadCopyType b_thread_copy_;
 };
 #else
 template <index_t BlockSize,
@@ -537,9 +524,8 @@ struct BlockwiseGemmWMMA
     // permutation
     static constexpr index_t A_KRow = AEnableLds ? 1 : 2;
    static constexpr index_t B_KRow = BEnableLds ? 1 : 2;
-
-    static constexpr index_t A_K1 = ABlockDesc{}.GetLength(I5);
-    static constexpr index_t B_K1 = BBlockDesc{}.GetLength(I5);
+    static constexpr index_t A_K1 = ABlockDesc{}.GetLength(I5);
+    static constexpr index_t B_K1 = BBlockDesc{}.GetLength(I5);

     static constexpr auto wmma_gemm =
         WmmaGemm<FloatA, FloatB, FloatAcc, MPerWMMA, NPerWMMA, KPack, TransposeC>{};
include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp

Lines changed: 1 addition & 1 deletion
@@ -829,7 +829,7 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle

     static bool IsSupportedArgument(const Argument& arg)
     {
-        if(ck::is_navi3_supported())
+        if(ck::is_navi3_supported() || ck::is_navi4_supported())
         {
             if constexpr(!(is_same_v<AccDataType, float> || is_same_v<AccDataType, int32_t>))
             {
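This IsSupportedArgument widening, repeated in the MHA and conv device ops below, is the runtime half of the enablement: the same WMMA device op now also accepts Navi4x GPUs. For reference, a hedged sketch of what such a family check can look like; this is not CK's implementation of ck::is_navi4_supported(), and the helper name is invented:

// Hypothetical sketch, not CK's implementation: match the gfx12 (Navi4x /
// RDNA4) architecture prefix reported by the HIP runtime.
#include <hip/hip_runtime.h>
#include <cstring>

bool is_navi4_device_sketch()
{
    int device = 0;
    if(hipGetDevice(&device) != hipSuccess)
        return false;

    hipDeviceProp_t props{};
    if(hipGetDeviceProperties(&props, device) != hipSuccess)
        return false;

    // gcnArchName looks like "gfx1200:sramecc-:xnack-".
    return std::strncmp(props.gcnArchName, "gfx12", 5) == 0;
}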

include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp

Lines changed: 4 additions & 3 deletions
@@ -56,7 +56,7 @@ __global__ void
         bool input_permute,
         bool output_permute)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__) || defined(__gfx12__))

     // clang-format off
     // ***************************************************
@@ -159,6 +159,7 @@ __global__ void
     ignore = O;
     ignore = G0;
     ignore = G1;
+    ignore = alpha;
     ignore = input_permute;
     ignore = output_permute;
 #endif // end of if (defined(__gfx11__))
@@ -187,7 +188,7 @@ __global__ void
     index_t head_size,
     float alpha)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__) || defined(__gfx12__))

     // clang-format off
     // ***************************************************
@@ -858,7 +859,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle

     static bool IsSupportedArgument(const RawArg& arg)
     {
-        if(ck::is_navi3_supported())
+        if(ck::is_navi3_supported() || ck::is_navi4_supported())
         {
             if constexpr(!(is_same_v<Acc0DataType, float> || is_same_v<Acc0DataType, int32_t>))
             {
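The two #if changes above extend the device-compile guard that brackets each kernel body: the body is compiled on the host pass and on gfx11/gfx12 device passes, while on any other target the arguments are consumed via ignore = ... so the build stays warning-free (the added ignore = alpha plugs exactly such a gap). A compilable HIP sketch of the same pattern, with an invented kernel name and body:

// Hedged sketch of the guard pattern; kernel and body are invented.
#include <hip/hip_runtime.h>

__global__ void mha_kernel_sketch(const float* q, float* out, float alpha)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__) || defined(__gfx12__))
    // Real WMMA attention code would run here on RDNA3/RDNA4 targets.
    out[threadIdx.x] = q[threadIdx.x] * alpha;
#else
    // On unsupported device targets, mark every argument as used.
    (void)q;
    (void)out;
    (void)alpha;
#endif
}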

include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp

Lines changed: 2 additions & 2 deletions
@@ -94,8 +94,8 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm<ALayout,
         (MWaves == 1 && is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value) ? false : true;

     // If true, LDS is used unconditionally
-    static constexpr auto AEnableLds_manu = true;
-    static constexpr auto BEnableLds_manu = true;
+    static constexpr auto AEnableLds_manu = false;
+    static constexpr auto BEnableLds_manu = false;

     static constexpr auto AEnableLds = AEnableLds_auto || AEnableLds_manu || (NumPrefetch > 1);
     static constexpr auto BEnableLds = BEnableLds_auto || BEnableLds_manu || (NumPrefetch > 1);
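This flip is what makes the blockwise-GEMM changes above reachable: with the manual override off, LDS usage is decided by the automatic heuristic and the prefetch depth alone. A tiny standalone sketch of that selection logic, with hypothetical flag values:

// Sketch of the EnableLds derivation in the diff; inputs are hypothetical.
#include <cstdio>

constexpr bool enable_lds(bool auto_flag, bool manu_flag, int num_prefetch)
{
    return auto_flag || manu_flag || (num_prefetch > 1);
}

int main()
{
    // Before this commit: manu = true forced LDS on unconditionally.
    std::printf("before: %d\n", enable_lds(false, true, 1));  // always 1
    // After: manu = false, so the auto heuristic and prefetch depth decide.
    std::printf("after:  %d\n", enable_lds(false, false, 1)); // can be 0
    return 0;
}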

include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp

Lines changed: 1 addition & 1 deletion
@@ -629,7 +629,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
     static bool IsSupportedArgument(const Argument& arg)
     {
         // check device
-        if(ck::is_navi3_supported())
+        if(ck::is_navi3_supported() || ck::is_navi4_supported())
         {
             if constexpr(!(is_same_v<AccDataType, float> || is_same_v<AccDataType, int32_t>))
             {

include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_wmma_cshuffle.hpp

Lines changed: 1 addition & 1 deletion
@@ -702,7 +702,7 @@ struct DeviceGroupedConvBwdWeight_Wmma_CShuffle
     static bool IsSupportedArgument(const Argument& arg)
     {
         // check device
-        if(ck::is_navi3_supported())
+        if(ck::is_navi3_supported() || ck::is_navi4_supported())
         {
             if constexpr(!(is_same_v<AccDataType, float> || is_same_v<AccDataType, int32_t>))
             {
