Skip to content

Conversation

@CongMa13
Copy link
Collaborator

@CongMa13 CongMa13 commented Nov 4, 2025

Proposed changes

We copied the GEMM tile engine code and revised it to work with StreamK.

Note: The entire tile engine will be refactored to extract common functionality.

  • Use the new StreamK implementation as the C++ template.
  • Add reduction_strategy to default_config.json so that there are instances for atomic and reduction
  • Add persistent==true to default_config.json

The benchmark instances are:

... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_False_atomic_256x256x32_1x4x1_16x16x16
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_False_atomic_256x256x32_1x4x1_16x16x32
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_False_atomic_256x256x32_1x4x1_32x32x16
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_False_atomic_256x256x32_1x4x1_32x32x8
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_False_atomic_256x256x32_1x4x1_4x64x16
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_False_atomic_256x256x32_2x2x1_16x16x16
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_False_atomic_256x256x32_2x2x1_16x16x32
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_False_atomic_256x256x32_2x2x1_32x32x16
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_False_atomic_256x256x32_2x2x1_32x32x8
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_False_atomic_256x256x32_2x2x1_4x64x16
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_False_atomic_256x256x32_4x1x1_16x16x16
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_False_atomic_256x256x32_4x1x1_16x16x32
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_False_atomic_256x256x32_4x1x1_32x32x16
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_False_atomic_256x256x32_4x1x1_32x32x8
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_False_atomic_256x256x32_4x1x1_4x64x16
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_False_reduction_256x256x32_1x4x1_16x16x16
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_False_reduction_256x256x32_1x4x1_16x16x32
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_False_reduction_256x256x32_1x4x1_32x32x16
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_False_reduction_256x256x32_1x4x1_32x32x8
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_False_reduction_256x256x32_1x4x1_4x64x16
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_False_reduction_256x256x32_2x2x1_16x16x16
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_False_reduction_256x256x32_2x2x1_16x16x32
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_False_reduction_256x256x32_2x2x1_32x32x16
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_False_reduction_256x256x32_2x2x1_32x32x8
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_False_reduction_256x256x32_2x2x1_4x64x16
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_False_reduction_256x256x32_4x1x1_16x16x16
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_False_reduction_256x256x32_4x1x1_16x16x32
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_False_reduction_256x256x32_4x1x1_32x32x16
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_False_reduction_256x256x32_4x1x1_32x32x8
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_False_reduction_256x256x32_4x1x1_4x64x16
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_True_atomic_256x256x32_1x4x1_16x16x16
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_True_atomic_256x256x32_1x4x1_16x16x32
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_True_atomic_256x256x32_1x4x1_32x32x16
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_True_atomic_256x256x32_1x4x1_32x32x8
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_True_atomic_256x256x32_1x4x1_4x64x16
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_True_atomic_256x256x32_2x2x1_16x16x16
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_True_atomic_256x256x32_2x2x1_16x16x32
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_True_atomic_256x256x32_2x2x1_32x32x16
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_True_atomic_256x256x32_2x2x1_32x32x8
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_True_atomic_256x256x32_2x2x1_4x64x16
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_True_atomic_256x256x32_4x1x1_16x16x16
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_True_atomic_256x256x32_4x1x1_16x16x32
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_True_atomic_256x256x32_4x1x1_32x32x16
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_True_atomic_256x256x32_4x1x1_32x32x8
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_True_atomic_256x256x32_4x1x1_4x64x16
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_True_reduction_256x256x32_1x4x1_16x16x16
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_True_reduction_256x256x32_1x4x1_16x16x32
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_True_reduction_256x256x32_1x4x1_32x32x16
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_True_reduction_256x256x32_1x4x1_32x32x8
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_True_reduction_256x256x32_1x4x1_4x64x16
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_True_reduction_256x256x32_2x2x1_16x16x16
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_True_reduction_256x256x32_2x2x1_16x16x32
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_True_reduction_256x256x32_2x2x1_32x32x16
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_True_reduction_256x256x32_2x2x1_32x32x8
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_True_reduction_256x256x32_2x2x1_4x64x16
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_True_reduction_256x256x32_4x1x1_16x16x16
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_True_reduction_256x256x32_4x1x1_16x16x32
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_True_reduction_256x256x32_4x1x1_32x32x16
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_True_reduction_256x256x32_4x1x1_32x32x8
... benchmark_gemm_streamk_fp16_rcr_compv3_cshuffle_intrawave_False_False_False_True_reduction_256x256x32_4x1x1_4x64x16
... benchmark_gemm_streamk_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_False_atomic_256x256x32_1x4x1_16x16x32
... benchmark_gemm_streamk_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_False_atomic_256x256x32_1x4x1_32x32x16
... benchmark_gemm_streamk_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_False_atomic_256x256x32_1x4x1_32x32x32
... benchmark_gemm_streamk_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_False_atomic_256x256x32_2x2x1_16x16x32
... benchmark_gemm_streamk_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_False_atomic_256x256x32_2x2x1_32x32x16
... benchmark_gemm_streamk_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_False_atomic_256x256x32_2x2x1_32x32x32
... benchmark_gemm_streamk_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_False_atomic_256x256x32_4x1x1_16x16x32
... benchmark_gemm_streamk_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_False_atomic_256x256x32_4x1x1_32x32x16
... benchmark_gemm_streamk_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_False_atomic_256x256x32_4x1x1_32x32x32
... benchmark_gemm_streamk_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_False_reduction_256x256x32_1x4x1_16x16x32
... benchmark_gemm_streamk_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_False_reduction_256x256x32_1x4x1_32x32x16
... benchmark_gemm_streamk_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_False_reduction_256x256x32_1x4x1_32x32x32
... benchmark_gemm_streamk_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_False_reduction_256x256x32_2x2x1_16x16x32
... benchmark_gemm_streamk_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_False_reduction_256x256x32_2x2x1_32x32x16
... benchmark_gemm_streamk_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_False_reduction_256x256x32_2x2x1_32x32x32
... benchmark_gemm_streamk_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_False_reduction_256x256x32_4x1x1_16x16x32
... benchmark_gemm_streamk_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_False_reduction_256x256x32_4x1x1_32x32x16
... benchmark_gemm_streamk_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_False_reduction_256x256x32_4x1x1_32x32x32
... benchmark_gemm_streamk_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_True_atomic_256x256x32_1x4x1_16x16x32
... benchmark_gemm_streamk_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_True_atomic_256x256x32_1x4x1_32x32x16
... benchmark_gemm_streamk_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_True_atomic_256x256x32_1x4x1_32x32x32
... benchmark_gemm_streamk_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_True_atomic_256x256x32_2x2x1_16x16x32
... benchmark_gemm_streamk_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_True_atomic_256x256x32_2x2x1_32x32x16
... benchmark_gemm_streamk_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_True_atomic_256x256x32_2x2x1_32x32x32
... benchmark_gemm_streamk_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_True_atomic_256x256x32_4x1x1_16x16x32
... benchmark_gemm_streamk_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_True_atomic_256x256x32_4x1x1_32x32x16
... benchmark_gemm_streamk_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_True_atomic_256x256x32_4x1x1_32x32x32
... benchmark_gemm_streamk_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_True_reduction_256x256x32_1x4x1_16x16x32
... benchmark_gemm_streamk_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_True_reduction_256x256x32_1x4x1_32x32x16
... benchmark_gemm_streamk_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_True_reduction_256x256x32_1x4x1_32x32x32
... benchmark_gemm_streamk_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_True_reduction_256x256x32_2x2x1_16x16x32
... benchmark_gemm_streamk_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_True_reduction_256x256x32_2x2x1_32x32x16
... benchmark_gemm_streamk_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_True_reduction_256x256x32_2x2x1_32x32x32
... benchmark_gemm_streamk_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_True_reduction_256x256x32_4x1x1_16x16x32
... benchmark_gemm_streamk_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_True_reduction_256x256x32_4x1x1_32x32x16
... benchmark_gemm_streamk_fp8_rcr_compv3_cshuffle_intrawave_False_False_False_True_reduction_256x256x32_4x1x1_32x32x32

Checklist

Please put an x into the boxes that apply. You can also fill these out after creating the PR. If you're not sure, please don't hesitate to ask.

  • I have added tests relevant to the introduced functionality, and the unit tests are passing locally
  • I have added the test to REGRESSION_TESTS list defined at the top of CMakeLists.txt in tests/CMakeLists.txt, IF the test takes more than 30 seconds to run.
  • I have added inline documentation which helps the maintainers understand the motivation
  • I have removed the stale documentation which is no longer relevant after this pull request
  • (If this change is user-facing) I have added release notes which provide the end users with a brief summary of the improvement from this pull request
  • I have run clang-format on all changed files
  • Any dependent changes have been merged

Discussion

If this is a relatively large or complex change, feel free to start a discussion by explaining why you chose the solution you did and what alternatives you considered

Copy link
Contributor

Copilot AI left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pull Request Overview

This PR adds GEMM StreamK support to the tile engine, introducing a new highly configurable GEMM kernel framework with stream-k partitioning support for AMD GPUs (gfx90a, gfx942, gfx950).

Key changes:

  • Adds Python-based kernel instance builder with parallel generation support and comprehensive validation
  • Implements C++ profiler and benchmark infrastructure for single-kernel and multi-kernel testing
  • Removes reduction_strategy field from StreamKHostArgs to make it a compile-time template parameter

Reviewed Changes

Copilot reviewed 15 out of 15 changed files in this pull request and generated 8 comments.

Show a summary per file
File Description
tile_engine/ops/gemm_streamk/gemm_streamk_validation_utils.py Validation utilities for tile configurations, GPU detection, and constraint checking
tile_engine/ops/gemm_streamk/gemm_streamk_instance_builder.py Python script to generate individual GEMM kernel instances with parallel processing
tile_engine/ops/gemm_streamk/gemm_streamk_profiler.hpp C++ profiler class for benchmarking kernel performance
tile_engine/ops/gemm_streamk/gemm_streamk_benchmark_single.cpp Single-kernel benchmark executable
tile_engine/ops/gemm_streamk/gemm_streamk_benchmark.hpp Benchmark infrastructure with problem/result structures
tile_engine/ops/gemm_streamk/gemm_streamk_common.hpp Common types, traits, and utility functions
tile_engine/ops/gemm_streamk/CMakeLists.txt Build configuration with individual target generation
tile_engine/ops/gemm_streamk/configs/default_config.json Default configuration for kernel generation
include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp Refactored to use compile-time reduction strategy
example/ck_tile/40_streamk_gemm/*.cpp Updated examples to use compile-time reduction strategy
Jenkinsfile Added CI pipeline integration
Comments suppressed due to low confidence (1)

tile_engine/ops/gemm_streamk/gemm_streamk_instance_builder.py:758

  • Normal methods should have 'self', rather than 'work_item', as their first parameter.
    def _generate_single_kernel_individual(work_item):

💡 Add Copilot custom instructions for smarter, more guided reviews. Learn how to get started.

# Filter out unsupported trait combinations
combinations = []
for combo in all_combinations:
pipeline, epilogue, scheduler, reduction_strategy = combo[:4]
Copy link

Copilot AI Nov 5, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The unpacking order is incorrect. combo is created from itertools.product(pipelines, epilogues, schedulers, pad_m_values, pad_n_values, pad_k_values, persistent_values, reduction_strategy_value) (lines 256-266), so the first 4 elements are (pipeline, epilogue, scheduler, pad_m), not (pipeline, epilogue, scheduler, reduction_strategy). This causes validation to check wrong parameters. Should be: pipeline, epilogue, scheduler, pad_m, pad_n, pad_k, persistent, reduction_strategy = combo

Suggested change
pipeline, epilogue, scheduler, reduction_strategy = combo[:4]
pipeline, epilogue, scheduler, pad_m, pad_n, pad_k, persistent, reduction_strategy = combo

Copilot uses AI. Check for mistakes.
)
else:
# Fallback to minimal default
combinations = [("compv3", "cshuffle", "intrawave", "reduction_strategy", False, False, False, False)]
Copy link

Copilot AI Nov 5, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The fallback default combination has incorrect tuple structure. Based on the combo structure from lines 256-266, the 4th element should be pad_m (boolean), not the string "reduction_strategy". Should be: combinations = [("compv3", "cshuffle", "intrawave", False, False, False, False, "reduction")] to match the order (pipeline, epilogue, scheduler, pad_m, pad_n, pad_k, persistent, reduction_strategy).

Suggested change
combinations = [("compv3", "cshuffle", "intrawave", "reduction_strategy", False, False, False, False)]
combinations = [("compv3", "cshuffle", "intrawave", False, False, False, False, "reduction")]

Copilot uses AI. Check for mistakes.
Comment on lines 758 to 784
def _generate_single_kernel_individual(work_item):
"""Worker function to generate a single individual kernel file"""
tile_config, trait_combo, working_path, datatype, layout = work_item

# Create a temporary builder instance for this worker
builder = GemmKernelBuilder(working_path, datatype, layout)

try:
kernel_name, instance_code = builder._generate_kernel_instance(
tile_config, trait_combo
)

# Create simplified filename without the "gemm_" prefix
# Remove "gemm_" from the beginning of kernel_name for the filename
simplified_name = kernel_name
if simplified_name.startswith("gemm_"):
simplified_name = simplified_name[5:] # Remove "gemm_" prefix

# Write individual header file
header_file = working_path / f"gemm_streamk_single_{simplified_name}.hpp"
with open(header_file, "w") as f:
f.write(instance_code)

return (kernel_name, trait_combo, tile_config)
except Exception as e:
print(f"Error generating individual kernel: {e}")
return None
Copy link

Copilot AI Nov 5, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This function is defined inside the class but without self parameter and with incorrect indentation. It should be defined at module level (unindented) or as a static method. This will cause the function to not be accessible when called from executor.submit(_generate_single_kernel_individual, item) at line 634.

Suggested change
def _generate_single_kernel_individual(work_item):
"""Worker function to generate a single individual kernel file"""
tile_config, trait_combo, working_path, datatype, layout = work_item
# Create a temporary builder instance for this worker
builder = GemmKernelBuilder(working_path, datatype, layout)
try:
kernel_name, instance_code = builder._generate_kernel_instance(
tile_config, trait_combo
)
# Create simplified filename without the "gemm_" prefix
# Remove "gemm_" from the beginning of kernel_name for the filename
simplified_name = kernel_name
if simplified_name.startswith("gemm_"):
simplified_name = simplified_name[5:] # Remove "gemm_" prefix
# Write individual header file
header_file = working_path / f"gemm_streamk_single_{simplified_name}.hpp"
with open(header_file, "w") as f:
f.write(instance_code)
return (kernel_name, trait_combo, tile_config)
except Exception as e:
print(f"Error generating individual kernel: {e}")
return None
def _generate_single_kernel_individual(work_item):
"""Worker function to generate a single individual kernel file"""
tile_config, trait_combo, working_path, datatype, layout = work_item
# Create a temporary builder instance for this worker
builder = GemmKernelBuilder(working_path, datatype, layout)
try:
kernel_name, instance_code = builder._generate_kernel_instance(
tile_config, trait_combo
)
# Create simplified filename without the "gemm_" prefix
# Remove "gemm_" from the beginning of kernel_name for the filename
simplified_name = kernel_name
if simplified_name.startswith("gemm_"):
simplified_name = simplified_name[5:] # Remove "gemm_" prefix
# Write individual header file
header_file = working_path / f"gemm_streamk_single_{simplified_name}.hpp"
with open(header_file, "w") as f:
f.write(instance_code)
return (kernel_name, trait_combo, tile_config)
except Exception as e:
print(f"Error generating individual kernel: {e}")
return None

Copilot uses AI. Check for mistakes.
trait_parts[3] == "false", # pad_m
trait_parts[4] == "false", # pad_n
trait_parts[5] == "false", # pad_k
trait_parts[6] , # persistent
Copy link

Copilot AI Nov 5, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing boolean conversion for persistent field. Lines 868-870 convert pad_m/pad_n/pad_k using == "false" (which produces True when the string is "false"), but line 871 passes the raw string. This inconsistency will cause type mismatches. Should be: trait_parts[6] == "true", # persistent to convert to boolean.

Suggested change
trait_parts[6] , # persistent
trait_parts[6] == "true", # persistent

Copilot uses AI. Check for mistakes.
Jenkinsfile Outdated
--warmup 5 --repeat 5 --verbose --json results.json && \
ninja -j64 benchmark_gemm_multi_d_all && \
python3 ../tile_engine/ops/gemm_multi_d/gemm_multi_d_benchmark.py . --problem-sizes "1024,1024,1024" \
--warmup 5 --repeat 5 --verbose --json results.json """
Copy link

Copilot AI Nov 5, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing line continuation backslash and logical separator. Line 1647 ends with closing triple quotes, but line 1648 continues with more commands. This breaks the shell command syntax. Should add && \ at the end of line 1647 before the closing quotes, or start a new command string on line 1648.

Suggested change
--warmup 5 --repeat 5 --verbose --json results.json """
--warmup 5 --repeat 5 --verbose --json results.json && \"

Copilot uses AI. Check for mistakes.
Jenkinsfile Outdated
--warmup 5 --repeat 5 --verbose --json results.json && \
ninja -j64 benchmark_gemm_multi_d_all && \
python3 ../tile_engine/ops/gemm_multi_d/gemm_multi_d_benchmark.py . --problem-sizes "1024,1024,1024" \
--warmup 5 --repeat 5 --verbose --json results.json """
Copy link

Copilot AI Nov 5, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing line continuation backslash and logical separator (duplicate of the earlier issue, in a different location). Line 1692 ends with closing triple quotes, but line 1693 continues with more commands. This breaks the shell command syntax. Should add && \ at the end of line 1692 before the closing quotes, or start a new command string on line 1693.

Suggested change
--warmup 5 --repeat 5 --verbose --json results.json """
--warmup 5 --repeat 5 --verbose --json results.json && \

Copilot uses AI. Check for mistakes.
Comment on lines 358 to 362
scheduler_type_map = {
"intrawave": "ck_tile::GemmPipelineScheduler::Intrawave",
"interwave": "ck_tile::GemmPipelineScheduler::Interwave",
"default": "ck_tile::GemmPipelineScheduler::Default",
}
Copy link

Copilot AI Nov 5, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Variable scheduler_type_map is not used.

Suggested change
scheduler_type_map = {
"intrawave": "ck_tile::GemmPipelineScheduler::Intrawave",
"interwave": "ck_tile::GemmPipelineScheduler::Interwave",
"default": "ck_tile::GemmPipelineScheduler::Default",
}

Copilot uses AI. Check for mistakes.
Comment on lines 383 to 387
base_pipeline_map = {
"mem": "ck_tile::BaseGemmPipelineAgBgCrMem",
"compv3": "ck_tile::BaseGemmPipelineAgBgCrCompV3",
"compv4": "ck_tile::BaseGemmPipelineAgBgCrCompV4",
}
Copy link

Copilot AI Nov 5, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Variable base_pipeline_map is not used.

Suggested change
base_pipeline_map = {
"mem": "ck_tile::BaseGemmPipelineAgBgCrMem",
"compv3": "ck_tile::BaseGemmPipelineAgBgCrCompV3",
"compv4": "ck_tile::BaseGemmPipelineAgBgCrCompV4",
}

Copilot uses AI. Check for mistakes.
Comment on lines 244 to 265
file << "rocm_version,device_name,"
<< "split_k,m,n,k,stride_a,stride_b,stride_c,"
<< "dtype_a,dtype_b,dtype_acc,dtype_c," << "layout_a,layout_b,layout_c,"
<< "structured_sparsity," << "name,"
<< "latency(ms),tflops(TFlops),bandwidth(GB/s),metric\n";
}

const auto& problem = kernel_instance.problem_;
const auto& name = kernel_instance.name_;
const auto& perf = kernel_instance.perf_result_;

file << get_rocm_version() << "," << ck_tile::get_device_name() << ","
<< problem.split_k_ << "," << problem.m_ << "," << problem.n_ << ","
<< problem.k_ << "," << problem.stride_a_ << "," << problem.stride_b_ << ","
<< problem.stride_c_ << "," << problem.dtype_a_ << "," << problem.dtype_b_
<< "," << problem.dtype_acc_ << "," << problem.dtype_c_ << ","
<< problem.layout_a_ << "," << problem.layout_b_ << "," << problem.layout_c_
<< "," << problem.structured_sparsity_ << "," << name << "," << std::fixed
<< std::setprecision(4) << perf.latency_ << "," << std::fixed
<< std::setprecision(4) << perf.tflops_ << "," << std::fixed
<< std::setprecision(4) << perf.bandwidth_ << "," << get_metric_name(metric)
<< "\n";
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it possible to add fields for reduction strategy type and for persistent?

…k GEMM.

- This commit lays the groundwork for integrating the tile engine into streamk GEMM.
  It focuses on creating benchmark executables for streamk GEMM.
- Additional scripts like test_benchmark.sh and gemm_benchmark.py will be added once
  the streamk implementation reaches stability.
@CongMa13 CongMa13 force-pushed the congma/ck_tile/tile_engine_streamk branch from 70ab446 to 7363096 Compare November 8, 2025 01:33
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants