ROCm · asroy · Mar 16, 2023 · May 20, 2022 · May 20, 2022 · May 20, 2022
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,53 @@
+# Compiled Object files
+*.slo
+*.lo
+*.o
+*.obj
+
+# Precompiled Headers
+*.gch
+*.pch
+*.ipch
+
+# Compiled Dynamic libraries
+*.so
+*.dylib
+*.dll
+
+# Fortran module files
+*.mod
+
+# Compiled Static libraries
+*.lai
+*.la
+*.a
+*.lib
+
+# Executables
+*.exe
+*.out
+*.app
+
+# vim tags
+tags
+.tags
+.*.swp
+
+# Editors
+.vscode
+
+# build-in-source directory
+build*
+
+# emacs temporary/backup files
+.\#*
+\#*\#
+*~
+
+# GDB temporary files
+.gdb_history
+install.dir*
+
+# directories containing generated documentation
+docs/source/_build/
+docs/docBin/
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -0,0 +1,24 @@
+# Change Log for Composable Kernel
+
+Full documentation for Composable Kernel is not yet available.
+
+## CK 0.1.1 for ROCm 5.5.0
+
+### Fixed
+- Fixed a bug in 6-dimensional kernels (#555).
+- Fixed grouped ConvBwdWeight test case failure (#524).
+
+### Optimizations
+- Improved performance of normalization kernel.
+
+### Added
+- Added user tutorial (#563).
+- Added more instances for irregular GEMM sizes (#560).
+- Added inter-wave consumer-producer programming model for GEMM kernels (#310).
+- Added multi-D GEMM client APIs (#534).
+- Added multi-embeddings support (#542).
+- Added Navi3x blockwise GEMM and real GEMM support (#541).
+- Added Navi grouped ConvBwdWeight support (#505).
+
+### Changed
+- Changed ...
diff --git a/CITATION.cff b/CITATION.cff
@@ -0,0 +1,67 @@
+cff-version: 1.2.0
+title: Composable Kernel
+message: If you use this software, please cite using the following metadata.
+type: software
+authors:
+  - given-names: Chao
+    family-names: Liu
+    email: [email protected]
+    affiliation: AMD
+  - given-names: Jing
+    family-names: Zhang
+    email: [email protected]
+    affiliation: AMD
+  - given-names: Letao
+    family-names: Qin
+    email: [email protected]
+    affiliation: AMD
+  - given-names: Qianfeng
+    family-names: Zhang
+    email: [email protected]
+    affiliation: AMD
+  - given-names: Liang
+    family-names: Huang
+    email: [email protected]
+    affiliation: AMD
+  - given-names: Shaojie
+    family-names: Wang
+    email: [email protected]
+    affiliation: AMD
+  - given-names: Anthony
+    family-names: Chang
+    email: [email protected]
+    affiliation: AMD
+  - given-names: Chunyu
+    family-names: Lai
+    email: [email protected]
+    affiliation: AMD
+  - given-names: Illia
+    family-names: Silin
+    email: [email protected]
+    affiliation: AMD
+  - given-names: Adam
+    family-names: Osewski
+    email: [email protected]
+    affiliation: AMD
+  - given-names: Poyen
+    family-names: Chen
+    email: [email protected]
+    affiliation: AMD
+  - given-names: Rosty
+    family-names: Geyyer
+    email: [email protected]
+    affiliation: AMD
+  - given-names: Hanwen
+    family-names: Chen
+  - given-names: Tejash
+    family-names: Shah
+  - given-names: Xiaoyan
+    family-names: Zhou
+  - given-names: Jianfeng
+    family-names: Yan
+repository-code: 'https://github.com/ROCmSoftwarePlatform/composable_kernel'
+abstract: Composable Kernel (CK) library aims to provide a programming model for writing performance-critical kernels for Machine Learning workloads across multiple architectures including GPUs, CPUs, etc., through general-purpose kernel programming languages, like HIP C++.
+keywords:
+  - 'CK, Composable Kernel, Tensor Coordinate Transformation'
+license: MIT
+license-url: https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/7fc3ed761aa35709d87c8fbbe41dd368648b3541/LICENSE
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,10 +1,39 @@
-cmake_minimum_required(VERSION 3.5)
+cmake_minimum_required(VERSION 3.14)
+
+# Check support for CUDA/HIP in CMake
 project(composable_kernel)
 
 list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
 
+enable_testing()
+
+set(ROCM_SYMLINK_LIBS OFF)
+find_package(ROCM REQUIRED PATHS /opt/rocm)
+
+include(ROCMInstallTargets)
+include(ROCMPackageConfigHelpers)
+include(ROCMSetupVersion)
+include(ROCMInstallSymlinks)
+include(ROCMCreatePackage)
 include(CheckCXXCompilerFlag)
 
+rocm_setup_version(VERSION 0.2.0)
+include(TargetFlags)
+list(APPEND CMAKE_PREFIX_PATH ${CMAKE_INSTALL_PREFIX} ${CMAKE_INSTALL_PREFIX}/llvm ${CMAKE_INSTALL_PREFIX}/hip /opt/rocm /opt/rocm/llvm /opt/rocm/hip)
+
+# Opt-in (default OFF) switch for clang's BitInt extension, which provides the int4 data type.
+option(USE_BITINT_EXTENSION_INT4 "Whether to enable clang's BitInt extension to provide int4 data type." OFF)
+if(USE_BITINT_EXTENSION_INT4)
+    add_compile_definitions(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4)  # directory-scoped preprocessor define
+    add_compile_options(-Wno-bit-int-extension)                      # turn off the bit-int-extension warning
+    message("CK compiled with USE_BITINT_EXTENSION_INT4 set to ${USE_BITINT_EXTENSION_INT4}")
+endif()
+
+## Threads
+set(THREADS_PREFER_PTHREAD_FLAG ON)
+find_package(Threads REQUIRED)
+link_libraries(Threads::Threads)
+
 ## C++
 enable_language(CXX)
 set(CMAKE_CXX_STANDARD 17)
@@ -30,35 +59,44 @@ message("OpenMP_gomp_LIBRARY: ${OpenMP_gomp_LIBRARY}")
 message("OpenMP_pthread_LIBRARY: ${OpenMP_pthread_LIBRARY}")
 message("OpenMP_CXX_FLAGS: ${OpenMP_CXX_FLAGS}")
 
-set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
 link_libraries(${OpenMP_gomp_LIBRARY})
 link_libraries(${OpenMP_pthread_LIBRARY})
 
 ## HIP
 find_package(HIP REQUIRED)
-message(STATUS "Build with HIP ${hip_VERSION}")
-
-## half
-#find_path(HALF_INCLUDE_DIR half.hpp)
-message("HALF_INCLUDE_DIR: ${HALF_INCLUDE_DIR}")
-
-# CMAKE_CXX_FLAGS
-SET(BUILD_DEV ON CACHE BOOL "BUILD_DEV")
-if(BUILD_DEV)
-    string(APPEND CMAKE_CXX_FLAGS " -Werror -Weverything")
+# Override HIP version in config.h, if necessary.
+# The variables set by find_package() can't be overwritten,
+# therefore let's use intermediate variables.
+set(CK_HIP_VERSION_MAJOR "${HIP_VERSION_MAJOR}")
+set(CK_HIP_VERSION_MINOR "${HIP_VERSION_MINOR}")
+set(CK_HIP_VERSION_PATCH "${HIP_VERSION_PATCH}")
+if( DEFINED CK_OVERRIDE_HIP_VERSION_MAJOR )
+    set(CK_HIP_VERSION_MAJOR "${CK_OVERRIDE_HIP_VERSION_MAJOR}")
+    message(STATUS "CK_HIP_VERSION_MAJOR overridden with ${CK_OVERRIDE_HIP_VERSION_MAJOR}")
 endif()
-message("CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
+if( DEFINED CK_OVERRIDE_HIP_VERSION_MINOR )
+    set(CK_HIP_VERSION_MINOR "${CK_OVERRIDE_HIP_VERSION_MINOR}")
+    message(STATUS "CK_HIP_VERSION_MINOR overridden with ${CK_OVERRIDE_HIP_VERSION_MINOR}")
+endif()
+if( DEFINED CK_OVERRIDE_HIP_VERSION_PATCH )
+    set(CK_HIP_VERSION_PATCH "${CK_OVERRIDE_HIP_VERSION_PATCH}")
+    message(STATUS "CK_HIP_VERSION_PATCH overridden with ${CK_OVERRIDE_HIP_VERSION_PATCH}")
+endif()
+message(STATUS "Build with HIP ${HIP_VERSION}")
+link_libraries(hip::device)
+add_compile_definitions(__HIP_PLATFORM_HCC__=1)
 
 ## tidy
 include(EnableCompilerWarnings)
-set(MIOPEN_TIDY_ERRORS ERRORS * -readability-inconsistent-declaration-parameter-name)
+set(CK_TIDY_ERRORS ERRORS * -readability-inconsistent-declaration-parameter-name)
 if(CMAKE_CXX_COMPILER MATCHES ".*hcc" OR CMAKE_CXX_COMPILER MATCHES ".*clang\\+\\+")
-    set(MIOPEN_TIDY_CHECKS -modernize-use-override -readability-non-const-parameter)
+    set(CK_TIDY_CHECKS -modernize-use-override -readability-non-const-parameter)
 # Enable tidy on hip
-elseif(MIOPEN_BACKEND STREQUAL "HIP" OR MIOPEN_BACKEND STREQUAL "HIPNOGPU")
-    set(MIOPEN_TIDY_ERRORS ALL)
+elseif(CK_BACKEND STREQUAL "HIP" OR CK_BACKEND STREQUAL "HIPNOGPU")
+    set(CK_TIDY_ERRORS ALL)
 endif()
 
+
 include(ClangTidy)
 enable_clang_tidy(
     CHECKS
@@ -150,13 +188,12 @@ enable_clang_tidy(
         -cppcoreguidelines-narrowing-conversions
         -altera-struct-pack-align
         -cppcoreguidelines-prefer-member-initializer
-
-        ${MIOPEN_TIDY_CHECKS}
-        ${MIOPEN_TIDY_ERRORS}
+        ${CK_TIDY_CHECKS}
+        ${CK_TIDY_ERRORS}
     HEADER_FILTER
         "\.hpp$"
     EXTRA_ARGS
-        -DMIOPEN_USE_CLANG_TIDY
+        -DCK_USE_CLANG_TIDY
 )
 
 include(CppCheck)
@@ -180,19 +217,95 @@ enable_cppcheck(
         unmatchedSuppression
     FORCE
     SOURCES
-        host/host_tensor/src
-        host/driver_offline/src
-        composable_kernel/src/kernel_wrapper
+        library/src
     INCLUDE
-        host/host_tensor/include
-        host/solver/include
-        host/driver_offline/include
-        composable_kernel/include/*
         ${CMAKE_CURRENT_SOURCE_DIR}/include
         ${CMAKE_CURRENT_BINARY_DIR}/include
+        ${CMAKE_CURRENT_SOURCE_DIR}/library/include
     DEFINE
         CPPCHECK=1
         __linux__=1
 )
 
-add_subdirectory(host)
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)  # library targets are written to <build>/lib
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)  # archive (static library) targets share <build>/lib
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin)  # runtime (executable) targets go to <build>/bin
+
+include_directories(BEFORE  # BEFORE prepends these paths to the directory's include list
+    ${PROJECT_SOURCE_DIR}/include
+    ${PROJECT_SOURCE_DIR}/library/include
+    ${HIP_INCLUDE_DIRS}
+)
+
+
+# BUILD_DEV (cached BOOL, ON by default) adds -Werror and -Weverything to the compile options.
+set(BUILD_DEV ON CACHE BOOL "BUILD_DEV")
+if(BUILD_DEV)
+    add_compile_options(-Werror -Weverything)
+endif()
+message("CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
+
+add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -C ${CMAKE_CFG_INTDIR})
+
+file(GLOB_RECURSE INSTANCE_FILES "${PROJECT_SOURCE_DIR}/*/device_*_instance.cpp")  # recursively collect device_*_instance.cpp files
+file(GLOB dir_list RELATIVE ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/*)
+set(CK_DEVICE_INSTANCES)
+foreach(subdir_path IN LISTS dir_list)  # one device_<subdir>_instance name per gpu/ subdirectory
+    if(IS_DIRECTORY "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/${subdir_path}")
+        list(APPEND CK_DEVICE_INSTANCES device_${subdir_path}_instance)
+    endif()
+endforeach()
+add_custom_target(instances DEPENDS utility ${CK_DEVICE_INSTANCES} SOURCES ${INSTANCE_FILES})
+
+rocm_package_setup_component(tests
+        LIBRARY_NAME composablekernel
+        PACKAGE_NAME tests # Prevent -static suffix on package name
+)
+
+rocm_package_setup_component(examples
+        LIBRARY_NAME composablekernel
+        PACKAGE_NAME examples
+)
+
+rocm_package_setup_component(profiler
+        LIBRARY_NAME composablekernel
+        PACKAGE_NAME ckProfiler
+)
+
+add_subdirectory(library)
+add_subdirectory(example)
+add_subdirectory(test)
+add_subdirectory(profiler)
+
+#Create an interface target for the include only files and call it "composablekernels"
+include(CMakePackageConfigHelpers)
+
+set(version 1.0.0)
+write_basic_package_version_file(
+    "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfigVersion.cmake"
+    VERSION "${version}"
+    COMPATIBILITY AnyNewerVersion
+)
+
+configure_package_config_file(${CMAKE_CURRENT_SOURCE_DIR}/Config.cmake.in
+        "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfig.cmake"
+        INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
+        NO_CHECK_REQUIRED_COMPONENTS_MACRO
+)
+
+rocm_install(FILES
+    "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfig.cmake"
+    "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfigVersion.cmake"
+    DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
+)
+
+set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE")
+set(CPACK_RPM_PACKAGE_LICENSE "MIT")
+
+rocm_create_package(
+    NAME composablekernel
+    DESCRIPTION "High Performance Composable Kernel for AMD GPUs"
+    MAINTAINER "MIOpen Kernels Dev Team <[email protected]>"
+    LDCONFIG
+    HEADER_ONLY
+)