Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
362 commits
Select commit Hold shift + click to select a range
a054f7d
Refactor block to C tile map (#235)
rosenrodt May 20, 2022
44943e0
remove options.hpp.in (#240)
May 20, 2022
ac54331
example of conv bwd weight 1d/2d/3d fp32/fp16/bf16 xdl (#244)
shaojiewang May 20, 2022
ba58a93
fix build (#246)
May 23, 2022
0d08cf1
add GetWorkSpaceSize to base arg (#253)
shaojiewang May 24, 2022
1085794
Add performance tests as a stage of CI. (#247)
illsilin May 24, 2022
63eee2d
Overhaul to Reducton and its dependants (#237)
qianfengz May 24, 2022
40b59a6
Navi21 gemm (#197)
j4yan May 24, 2022
61851ae
minor fix for recent PR (#255)
May 25, 2022
e579c9e
Tensile-style block to C tile map (#239)
rosenrodt May 25, 2022
82d7d99
Hotfix binary elementwise (for broadcast on fastest axis) (#254)
rocking5566 May 25, 2022
97c4d48
Add pooling example (#257)
qianfengz May 26, 2022
3e6c261
Add FP64 XDL GEMM built-in function (#199)
ltqin May 26, 2022
91d8b7d
Fixing conv bug (#258)
May 27, 2022
d32a67a
gemm + layernorm (#261)
rocking5566 May 30, 2022
85fc91c
Minor fix for recent PR (#260)
May 31, 2022
7b1e2c3
Multi-kernel CGEMM (#230)
myamlak May 31, 2022
b6eaf3e
Pass gemm_descs for grouped gemm via __constant__ buff (#232)
zjing14 May 31, 2022
86185bd
Unify the naming of the math functions used by the host and kernel (#…
qianfengz Jun 2, 2022
1c5d06f
use old ctile to avoid conv2d fwd bias relu add compute error (#271)
shaojiewang Jun 2, 2022
1677cf7
Adding Resnet50 test to Performance tests (#268)
illsilin Jun 2, 2022
1ced00a
Add performance tests on MI200 in CI, reporting number of CUs, add st…
illsilin Jun 10, 2022
fb9b6b1
Use new github credentials (#278)
illsilin Jun 16, 2022
561ec12
example for convnd bwd weight bf16 splitk (#265)
shaojiewang Jun 16, 2022
6eb5549
Gemm + bias + relu + add + layernorm (#272)
rocking5566 Jun 17, 2022
c7a96ed
add p_workspace to baseargument (#275)
ltqin Jun 17, 2022
63cdd92
use universal workspace pointer in bwd-weight (#286)
shaojiewang Jun 17, 2022
1f543bf
Regulate reduction accumulator operations and Element-wise operations…
qianfengz Jun 17, 2022
e4584d9
Don't look up the /sys/module/amdgpu/version file. (#287)
illsilin Jun 17, 2022
56adf7e
GEMM with Multiple Source, GEMM+Bias+Add+FastGeLU example and ckProfi…
Jun 19, 2022
ccbd8d9
update readme and script (#290)
Jun 21, 2022
1ae2410
bring up to date with the usage of __builtin_amdgcn_sched_barrier (#293)
rosenrodt Jun 21, 2022
be60d60
Create MIT LICENSE (#229)
Jun 21, 2022
15c89e8
Standalone softmax kernel (#284)
rosenrodt Jun 21, 2022
4634b12
fix Issue 291 (#294)
shaojiewang Jun 21, 2022
a2edd7d
Testing all fwd convolution specializations. (#259)
aosewski Jun 23, 2022
a49115b
update license (#297)
Jun 23, 2022
d1db6a0
Absolute include path (#281)
Jun 25, 2022
d3051d7
add license in file (#303)
Jun 25, 2022
b653c5e
Switch to standard ROCm packaging (#301)
lawruble13 Jun 25, 2022
aebd211
External Interface (#304)
Jun 27, 2022
1223511
external api for gemm + layernorm (#285)
rocking5566 Jun 27, 2022
eccf877
Remove incorrect old packaging statement (#308)
lawruble13 Jun 30, 2022
93c99f3
Standalone sweep once softmax kernel w/ ckProfiler (#295)
rosenrodt Jun 30, 2022
ab6c82c
Grouped Gemm ckProfiler hotfix (#313)
zjing14 Jun 30, 2022
fa9a0a5
Gemm + bias + c_permute (#312)
zjing14 Jul 1, 2022
0dcb349
Improve external interface for GEMM and GEMM+add+add+fastgelu (#311)
Jul 1, 2022
1c8126a
add batch_stride into batched gemm (#314)
zjing14 Jul 1, 2022
63fd5da
Single-kernel GEMM + layernorm (#263)
rosenrodt Jul 1, 2022
8e37478
modified grouped gemm addressing method (#307)
guangzlu Jul 1, 2022
9e4429f
Gemm+Bilinear (#316)
Jul 2, 2022
334361c
Batched Gemm with C Permute (#305)
zjing14 Jul 6, 2022
4fe9c39
N-D Tensor Contraction example, instance, and client example (#270)
Jul 7, 2022
763ca61
add conv1d/3d bwd weight instances (#318)
shaojiewang Jul 8, 2022
6391474
GEMM pipeline v2 (#317)
poyenc Jul 8, 2022
39acaea
Add switch between compilers, make 9110 compiler default, add full QA…
illsilin Jul 13, 2022
c5620ed
minor fix in gemm client example (#328)
iq136boy Jul 13, 2022
7f21662
Standalone layernorm (#315)
rocking5566 Jul 13, 2022
a11680c
fix standalone softmax race condition around blockwise reduction (#323)
rosenrodt Jul 15, 2022
7959dad
Grouped Gemm device with multiD grid (#319)
zjing14 Jul 21, 2022
d8415a9
Add full QA with verification option, few other changes. (#331)
illsilin Jul 21, 2022
d7d7829
Batched Gemm with multiD (#329)
zjing14 Jul 22, 2022
85978e0
comment out cron trigger (#334)
illsilin Jul 22, 2022
500fa99
Clean up conv example, Instances, profiler and test (#324)
Jul 29, 2022
984b372
Run CI on MI100 nodes only, run daily QA on MI200 nodes. (#339)
illsilin Aug 2, 2022
fb0dc35
CGEMM examples bf16, fp32, int8 (#332)
aosewski Aug 2, 2022
75ab874
Update Group convolution (#341)
Aug 3, 2022
146972f
fix bug in gemm profiler (#344)
Aug 7, 2022
aba7fef
Fix QA, allow switching compiler versions, fix google test compilatio…
illsilin Aug 8, 2022
e08d68d
Add batched/grouped_gemm contraction deviceOps (#349)
zjing14 Aug 10, 2022
fdfd7eb
ckProfiler for layernorm (#330)
rocking5566 Aug 11, 2022
68b6150
Add examples for GEMM + AddAddFastGelu (data type: int8, bf16, fp32) …
poyenc Aug 11, 2022
de60d29
Build docker only once in CI, fix conv_bwd logfile names. (#353)
illsilin Aug 12, 2022
35e49f2
add g; fixed strides (#355)
zjing14 Aug 12, 2022
0c6ef7c
Add example of conv_fwd_bias_relu_add for int4, int8, bfp16, fp16, an…
geyyer Aug 12, 2022
a670a5a
Move literal ""_uz & ""_zu into namespace 'ck::literals' (#354)
poyenc Aug 12, 2022
cac014f
Fused attention (#345)
rosenrodt Aug 13, 2022
6c3c06b
Gemm multiple d multiple r (#335)
rocking5566 Aug 13, 2022
14932e8
Add examples for reduction fp16/fp32/bp16/int8/fp64 for 3d/4d/5d …
qianfengz Aug 13, 2022
10b3278
Skip lds of b matrix (#326)
ltqin Aug 13, 2022
c20a75b
Fused GEMM+GEMM (#351)
rosenrodt Aug 13, 2022
0bd6b84
Layernorm welford (#346)
rocking5566 Aug 13, 2022
fb1cbf0
Change all device operations to use add_instance_library (#338)
cloudhan Aug 13, 2022
5ee3045
fix build issue (#357)
Aug 13, 2022
53ea471
Batchnorm-forward and Batchnorm-infer Implemented using generic kerne…
qianfengz Aug 15, 2022
c961ce9
Hotfix LDS data hazard in fused attention (#360)
rosenrodt Aug 15, 2022
bac7df8
use scale (#363)
Aug 17, 2022
e00149a
int4 data type (#364)
aosewski Aug 18, 2022
9efd033
restart the stages on MI200 in case of failures (#366)
illsilin Aug 18, 2022
c366de5
[What] Fix bug of verification fail on E Matrix (#371)
rocking5566 Aug 22, 2022
f4047c9
Implement padding and sanity checks for fused GEMM+GEMM (#376)
rosenrodt Aug 23, 2022
2327f1a
Add example of Gemm + AddAddFastGelu (data type: int4) (#369)
poyenc Aug 23, 2022
6091458
Add examples of batched/grouped/SplitK Gemm for int8/bfp16/fp16/fp32 …
zjing14 Aug 23, 2022
e0d8806
Attention with output permutation (#370)
rosenrodt Aug 23, 2022
fa2d894
Add examples of Gemm (data type: int4) (#367)
poyenc Aug 23, 2022
88e4374
Refactor the design of DeviceGemmMultipleDMultipleR_Xdl_CShuffle (#378)
poyenc Aug 24, 2022
e1a3fff
layernorm external api (#379)
rocking5566 Aug 24, 2022
f246fd2
add scripts (#382)
zjing14 Aug 25, 2022
d520d0c
Add int4 reduction examples (#372)
qianfengz Aug 25, 2022
b73ae24
Add int4 example for convnd_fwd_bias_relu_add (#375)
geyyer Aug 25, 2022
3ab20fd
GEMM batched/splitK/cgemm/grouped int4 examples (#383)
aosewski Aug 25, 2022
57fadf6
More int4 tests. (#374)
aosewski Aug 25, 2022
9881625
Fixed splitk gemm fp32 (#384)
zjing14 Aug 26, 2022
1e5b59d
Add an option to build CK with clang directly (#387)
illsilin Aug 26, 2022
9061d39
Fix the slow cpu reference batched gemm kernels. (#388)
illsilin Aug 29, 2022
138faf3
Try to workaround flaky GemmSoftmaxGemm tests (#386)
rosenrodt Aug 29, 2022
45adb73
Padding for attention: bmm+scale+softmax+bmm kernel (#385)
shaojiewang Aug 30, 2022
d00e611
Gemm reduce examples int4/int8/fp32/bf16 (#368)
aosewski Aug 30, 2022
4df6d93
conv+conv (1x1 only) example using gemm+gemm (#393)
Aug 31, 2022
46a675a
Add examples of Conv + reduction (data type: int4, int8, bf16, fp16, …
poyenc Aug 31, 2022
204ef97
add more datatype to gemm+gemm and conv+conv example (#397)
Sep 1, 2022
7589116
[Hotfix] SplitK Gemm fp32 (#401)
zjing14 Sep 2, 2022
3da5c19
Softmax client example (#396)
aosewski Sep 6, 2022
fe52c94
GemmGemm TNNT instances (#399)
rosenrodt Sep 6, 2022
868e5c5
Fused attention instances & padding tests (#395)
rosenrodt Sep 6, 2022
ce74cea
Add stderr to QA logfiles, process splitK and ONNX gemm kernels (#402)
illsilin Sep 7, 2022
d6709dc
Fix gemm-softmax-gemm-permute padding cases (#409)
rosenrodt Sep 8, 2022
efd1d25
embedding fuse layernorm (#405)
carlushuang Sep 9, 2022
b22ebd4
Upgrade the OS and ROCM versions. (#411)
illsilin Sep 13, 2022
370efa6
batched_gemm + multiple_d + gemm + multiple_d (#394)
ltqin Sep 14, 2022
43c898f
disable print for group conv multiple D (#421)
Sep 16, 2022
2785837
Conv bwd data multiple d (#404)
shaojiewang Sep 19, 2022
9287b7c
Grouped batched attention + permute (#412)
rosenrodt Sep 19, 2022
c6b8b47
work around inline asm potential hazard using intrinsic (#416)
rosenrodt Sep 19, 2022
7c788e1
Add batched attention special kernel instances (#424)
rosenrodt Sep 20, 2022
f584ab0
Add 'Permute' device op & example (#408)
poyenc Sep 20, 2022
4eba345
Group norm (#417)
rocking5566 Sep 20, 2022
9f7c193
use rocm5.2 compiler as default, use same flags for amd-stg-open as f…
illsilin Sep 20, 2022
ebab84b
MNKO padding support on bmm+masking+scale+softmax+bmm+premute (#425)
shaojiewang Sep 20, 2022
567f70f
fix build (#427)
Sep 20, 2022
01876af
fixed G offset calc for long_index (#428)
zjing14 Sep 21, 2022
85b0920
Build the CK targets only once. (#433)
illsilin Sep 21, 2022
7acbf10
Updated the supported components (#435)
zhanglx13 Sep 21, 2022
aa0b051
Replace the obsolete offload-arch flags with GPU_TARGETS and fix a bu…
illsilin Sep 22, 2022
e9d4e89
fix build (#434)
Sep 22, 2022
2c6d63d
Fix device instance libarary to include all instances (#418)
JehandadKhan Sep 23, 2022
b882554
Fix build issues, set new compiler default, etc. (#451)
illsilin Sep 27, 2022
7fc3ed7
Allow setting ROCM version, activate cchache, etc. (#462)
illsilin Oct 1, 2022
473ba5b
update document: Readme, contributors, citation, (#463)
Oct 3, 2022
6de749e
Update doc (#464)
Oct 3, 2022
9d8f834
Update readme (#465)
Oct 3, 2022
40942b9
Optimization for gridwise group norm (#453)
shaojiewang Oct 7, 2022
39abb47
Fix build issue and schedule daily tests with latest staging compiler…
illsilin Oct 11, 2022
d8b41e1
Example contraction splitk (#430)
ltqin Oct 11, 2022
a8236c1
Conv2dFwd example. (#467)
aosewski Oct 13, 2022
1b62bfa
Fix bug of layernorm ckProfiler and refine code (#448)
rocking5566 Oct 13, 2022
3048028
Refactor device op implementations into `impl` subdirectory. (#420)
aosewski Oct 13, 2022
cee440f
adding tensor_permutation example folder (#389)
arai713 Oct 17, 2022
685860c
Tensor permutation (#479)
arai713 Oct 19, 2022
efbcc6e
Fused elementwise layernorm (#468)
guangzlu Oct 25, 2022
6ea9257
Revert "Fused elementwise layernorm (#468)" (#491)
guangzlu Oct 25, 2022
dda3a0a
Update to the Reduction API and instances (#476)
qianfengz Oct 25, 2022
0ee3aea
fix the script parsing the QA results (#495)
illsilin Oct 26, 2022
5710604
Gemm standalone bench executable (#480)
rosenrodt Oct 27, 2022
cd51732
Fix Batched Gemm op for int8 data (#482)
geyyer Oct 27, 2022
de37550
Input/output permutation for fused attention (#460)
rosenrodt Oct 27, 2022
24fd4a0
Fused attention client example (#494)
rosenrodt Oct 27, 2022
a5059f8
reduce the number of default targets (#489)
illsilin Oct 27, 2022
d8f5f71
fix missing -fPIC flag for conv3d_fwd instance lib (#473)
rosenrodt Oct 27, 2022
337642a
Add quotes for string option values (#472)
poyenc Oct 27, 2022
7fa892e
Batchnorm-forward implemented using welford method to calculate varia…
qianfengz Oct 28, 2022
3727d00
Add fp32 and bf16 tests (#487)
geyyer Oct 28, 2022
87fd115
Only need one test case here (#483)
rocking5566 Oct 28, 2022
8ee3611
Add Conv Forward on Navi21 for ResNet50 (#490)
ltqin Oct 31, 2022
226bc02
Conv perlayer int8 quantization (#471)
rocking5566 Nov 2, 2022
6d8614e
Softmax unit-test reduction across all and non innermost dims cases. …
aosewski Nov 2, 2022
1a0b0e7
Add pipeline v1/v2 selector, add more instances (#381)
geyyer Nov 2, 2022
9e57a29
Add client example of grouped conv2d backward data (data type: fp16) …
poyenc Nov 2, 2022
451f1e3
remove atten kernel workarounds as we move over to rocm 5.3 (#496)
rosenrodt Nov 2, 2022
d4d1147
Refine layernorm naming and test code (#497)
rocking5566 Nov 2, 2022
79aa3fb
Disable gtest discovery to run tests per-program not per-case (#432)
rosenrodt Nov 2, 2022
8a4253b
Fused elementwise normalization (#492)
guangzlu Nov 3, 2022
67423a2
Remove interface 'DeviceGroupedConvBwdData' (#500)
poyenc Nov 10, 2022
38470e0
Add client example of grouped conv2d backward weight (data type: fp16…
poyenc Nov 10, 2022
f498031
Add client example of grouped conv2d forward (data type: fp16) (#488)
poyenc Nov 10, 2022
7045632
add client example for elementwise_normalization (#501)
guangzlu Nov 10, 2022
6f0564f
Rangify FillUniformDistributionIntegerValue<> (#443)
poyenc Nov 10, 2022
37f2e91
Add packages for examples and profiler (#502)
lawruble13 Nov 10, 2022
4a2a56c
Rangify constructor of HostTensorDescriptor & Tensor<> (#445)
poyenc Nov 11, 2022
4382b41
Fix build errors on CI server (#506)
poyenc Nov 11, 2022
b79bbbc
Rangify check_err() (#444)
poyenc Nov 11, 2022
dc663fa
Rangify STL algorithms (#438)
poyenc Nov 14, 2022
730204e
Introduce ck::accumulate_n() (#439)
poyenc Nov 15, 2022
7038723
Avoid reporting unused member function error (#507)
poyenc Nov 15, 2022
db0eb1e
Add Conv Backward Data on Navi21 for ResNet50 (#499)
ltqin Nov 15, 2022
4c4c732
Add BF16 tests for batched_gemm_softmax_gemm_permute (#504)
guangzlu Nov 15, 2022
892a8d7
Work around develop validation failure (#513)
rosenrodt Nov 17, 2022
43a889b
Client examples AddFastGelu and FastGelu + instances. (#509)
aosewski Nov 20, 2022
4e6a557
BatchNorm forward instance/external api/profiler/tests/client example…
qianfengz Nov 25, 2022
5bf0475
Remove int8 from batchnorm-forward instances since it is not needed f…
qianfengz Nov 28, 2022
44789d9
BatchNorm backward implementation (#461)
qianfengz Nov 29, 2022
0e9c88c
fix GetTypeString
fsx950223 Nov 16, 2022
236bd14
Fix split-k gemm test (#231)
rosenrodt Nov 29, 2022
63af525
BatchNorm backward instance/external API/profiler/tests (#519)
qianfengz Nov 30, 2022
ad541ad
gemm, conv perchannel quantization (#503)
rocking5566 Nov 30, 2022
8784a72
Modularize ckProfiler operations (#514)
poyenc Dec 1, 2022
abf9cc6
[Navi3x-LWPCK-449] wmma_op + unit test (#484)
aska-0096 Dec 2, 2022
23ecf0f
Add multiple d gridwise gemm on Navi21 for ResNet50 (#517)
ltqin Dec 2, 2022
d156709
Fix bug where scaling may not be applied in some code path (#526)
rosenrodt Dec 2, 2022
d072790
Fix CI error. (#530)
illsilin Dec 6, 2022
ce87b4f
modified half function in math_v2.hpp (#528)
guangzlu Dec 7, 2022
c7a4d36
Add padding device_gemm_xdl instances (#529)
geyyer Dec 7, 2022
614a7b1
Fix Grouped ConvBwdWeight test case failure (#524)
poyenc Dec 7, 2022
d58b7f5
Make sure that GEMM sizes in K dimension are supported. (#527)
illsilin Dec 8, 2022
0e5c264
Gridwise elementwise 2d (#466)
arai713 Dec 12, 2022
74744ca
Add a docker hub doc file (#538)
geyyer Dec 14, 2022
9a1f247
Add padding device_gemm_add_add_fastgelu_xdl_c_shuffle instances to e…
geyyer Dec 15, 2022
10c72ac
Add interface GetTypeIdName() and GetTypeIdHashCode() for Device Op (…
qianfengz Dec 15, 2022
1115117
disable the attention test that fails on MI100 (#540)
illsilin Dec 15, 2022
0345963
Add MNK padding, M = 0 support into grouped_gemm (#539)
zjing14 Dec 15, 2022
a17b041
Remove including of cmath (#551)
qianfengz Jan 12, 2023
715e8dd
Add a flag to enable/disable debug output in many kernels. (#549)
illsilin Jan 12, 2023
919aeb1
[Navi3x-LWPCK-545] Block-wise GEMM + Real GEMM_WMMA_FP16 (#541)
aska-0096 Jan 17, 2023
7829d72
Gemm layernorm welford (#413)
rocking5566 Jan 17, 2023
80e0526
Reduction external API and client examples (#493)
qianfengz Jan 17, 2023
5523670
Add client API/examples for 3xGemm+Bias+Add+Permute{0, 2, 3, 1} (#550)
ltqin Jan 18, 2023
147b7db
add multi embeddings support (#542)
fsx950223 Jan 18, 2023
00ff30a
fix a bug for 6-dim kernels (#555)
illsilin Jan 18, 2023
d66421f
Add multiD Gemm client APIs (#534)
ltqin Jan 18, 2023
1cfa876
Wavelet (inter-wave consumer-producer) GEMM (#310)
ramjana Jan 18, 2023
52abc2f
Use double for all scaling values and float-point constant values at …
qianfengz Jan 18, 2023
a1b2441
Batchnorm inference instances, external API, client examples and gtes…
qianfengz Jan 25, 2023
7494c1c
Add more instances for irregular GEMM sizes. (#560)
aosewski Jan 26, 2023
274108d
Use defined seed for deterministic test runs. (#562)
aosewski Jan 30, 2023
ba40c2c
remove unused variable (#564)
fsx950223 Jan 31, 2023
afdfef7
Add the markdown tutorial hello world (#563)
geyyer Feb 1, 2023
f73574f
Fix CI issues. (#572)
illsilin Feb 6, 2023
bb3d954
Fix a couple more CI issues. (#578)
illsilin Feb 8, 2023
332ccc3
Add GemmAddSoftmaxGemm support for MSFT ORT (instances and client API…
ltqin Feb 8, 2023
b63acce
adding the first draft of changelog (#571)
illsilin Feb 8, 2023
76d144f
Add instance for elementwise normlization (#573)
guangzlu Feb 9, 2023
f7d28f3
Gemm+layernorm instance, ckProfiler, client example (#568)
rocking5566 Feb 9, 2023
0ac0f51
enable batched_gemm_softmax_bf16 tests (#582)
illsilin Feb 10, 2023
8f42780
GroupedGEMM more bigger tiles. (#577)
aosewski Feb 13, 2023
06f1fc8
Remove the workaround for bf16 attention tests. (#586)
illsilin Feb 15, 2023
e9fd122
Conv3D FWD BWD WRW fp16 fp32 client examples (#559)
aosewski Feb 15, 2023
0cfda84
[Navi3x] Add Device Operations (#567)
aska-0096 Feb 15, 2023
6a6163a
Improve normalization (#580)
rocking5566 Feb 15, 2023
24c9ee1
Add contraction_fp64 example (#570)
zjing14 Feb 15, 2023
19490ac
Clean up kernel launch output (#569)
illsilin Feb 15, 2023
cb3fac4
Sphinx doc (#581)
pmaybank Feb 15, 2023
584d233
Build and archive deb packages. (#590)
illsilin Feb 16, 2023
bef0cb2
fix a bug when building for gfx1030 target. (#591)
illsilin Feb 16, 2023
830d37a
Grouped conv1d client example (#589)
ltqin Feb 22, 2023
246ceee
Add Grouped Conv Backward Weight on Navi21 for ResNet50. (#505)
geyyer Feb 22, 2023
209baee
disable tensor contraction f64 on MI100 (#602)
zjing14 Feb 24, 2023
8f45561
Fast GeLU using built-in function (#587)
Feb 27, 2023
68dbf40
[Navi3x Bug Fix] fix typo to accept MNKPadding flag correctly. (#597)
aska-0096 Mar 1, 2023
59cbb20
Suppress reserved-identifier warning and catch all warnings. (#608)
illsilin Mar 1, 2023
8933366
Merge branch 'master' into staging
zjing14 Mar 9, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
53 changes: 53 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Compiled Object files
*.slo
*.lo
*.o
*.obj

# Precompiled Headers
*.gch
*.pch
*.ipch

# Compiled Dynamic libraries
*.so
*.dylib
*.dll

# Fortran module files
*.mod

# Compiled Static libraries
*.lai
*.la
*.a
*.lib

# Executables
*.exe
*.out
*.app

# vim tags
tags
.tags
.*.swp

# Editors
.vscode

# build-in-source directory
build*

# emacs temporary/backup files
.\#*
\#*\#
*~

# GDB temporary files
.gdb_history
install.dir*

# directories containing generated documentation
docs/source/_build/
docs/docBin/
24 changes: 24 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Change Log for Composable Kernel

Full documentation for Composable Kernel is not yet available.

## CK 0.1.1 for ROCm 5.5.0

### Fixed
- Fixed a bug in 6-dimensional kernels (#555).
- Fixed grouped ConvBwdWeight test case failure (#524).

### Optimizations
- Improved performance of the normalization kernel.

### Added
- Added user tutorial (#563).
- Added more instances for irregular GEMM sizes (#560).
- Added inter-wave consumer-producer programming model for GEMM kernels (#310).
- Added multi-D GEMM client APIs (#534).
- Added multi-embeddings support (#542).
- Added Navi3x blockwise GEMM and real GEMM support (#541).
- Added Navi grouped ConvBwdWeight support (#505).

### Changed
- Changed ...
67 changes: 67 additions & 0 deletions CITATION.cff
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
cff-version: 1.2.0
title: Composable Kernel
message: If you use this software, please cite using the following metadata.
type: software
authors:
- given-names: Chao
family-names: Liu
email: [email protected]
affiliation: AMD
- given-names: Jing
family-names: Zhang
email: [email protected]
affiliation: AMD
- given-names: Letao
family-names: Qin
email: [email protected]
affiliation: AMD
- given-names: Qianfeng
family-names: Zhang
email: [email protected]
affiliation: AMD
- given-names: Liang
family-names: Huang
email: [email protected]
affiliation: AMD
- given-names: Shaojie
family-names: Wang
email: [email protected]
affiliation: AMD
- given-names: Anthony
family-names: Chang
email: [email protected]
affiliation: AMD
- given-names: Chunyu
family-names: Lai
email: [email protected]
affiliation: AMD
- given-names: Illia
family-names: Silin
email: [email protected]
affiliation: AMD
- given-names: Adam
family-names: Osewski
email: [email protected]
affiliation: AMD
- given-names: Poyen
family-names: Chen
email: [email protected]
affiliation: AMD
- given-names: Rosty
family-names: Geyyer
email: [email protected]
affiliation: AMD
- given-names: Hanwen
family-names: Chen
- given-names: Tejash
family-names: Shah
- given-names: Xiaoyan
family-names: Zhou
- given-names: Jianfeng
family-names: Yan
repository-code: 'https://github.com/ROCmSoftwarePlatform/composable_kernel'
abstract: Composable Kernel (CK) library aims to provide a programming model for writing performance critical kernels for Machine Learning workloads across multiple architectures including GPUs, CPUs, etc, through general purpose kernel programming languages, like HIP C++.
keywords:
- 'CK, Composable Kernel, Tensor Coordinate Transformation'
license: MIT
license-url: https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/7fc3ed761aa35709d87c8fbbe41dd368648b3541/LICENSE
171 changes: 142 additions & 29 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,10 +1,39 @@
cmake_minimum_required(VERSION 3.5)
cmake_minimum_required(VERSION 3.14)

# Check support for CUDA/HIP in Cmake
project(composable_kernel)

list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")

enable_testing()

set(ROCM_SYMLINK_LIBS OFF)
find_package(ROCM REQUIRED PATHS /opt/rocm)

include(ROCMInstallTargets)
include(ROCMPackageConfigHelpers)
include(ROCMSetupVersion)
include(ROCMInstallSymlinks)
include(ROCMCreatePackage)
include(CheckCXXCompilerFlag)

rocm_setup_version(VERSION 0.2.0)
include(TargetFlags)
list(APPEND CMAKE_PREFIX_PATH ${CMAKE_INSTALL_PREFIX} ${CMAKE_INSTALL_PREFIX}/llvm ${CMAKE_INSTALL_PREFIX}/hip /opt/rocm /opt/rocm/llvm /opt/rocm/hip)

# Opt-in switch (OFF by default) for the int4 data type provided through clang's BitInt extension.
# CMake separates arguments by whitespace only, so the option name must not be followed by a comma;
# a trailing comma would become part of the cache variable's name.
option(USE_BITINT_EXTENSION_INT4 "Whether to enable clang's BitInt extension to provide int4 data type." OFF)

if(USE_BITINT_EXTENSION_INT4)
  # Expose the feature to the sources through a preprocessor definition.
  add_compile_definitions(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4)
  # Silence the compiler warning about using the BitInt extension.
  add_compile_options(-Wno-bit-int-extension)
  message("CK compiled with USE_BITINT_EXTENSION_INT4 set to ${USE_BITINT_EXTENSION_INT4}")
endif()

## Threads
set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)
link_libraries(Threads::Threads)

## C++
enable_language(CXX)
set(CMAKE_CXX_STANDARD 17)
Expand All @@ -30,35 +59,44 @@ message("OpenMP_gomp_LIBRARY: ${OpenMP_gomp_LIBRARY}")
message("OpenMP_pthread_LIBRARY: ${OpenMP_pthread_LIBRARY}")
message("OpenMP_CXX_FLAGS: ${OpenMP_CXX_FLAGS}")

set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
link_libraries(${OpenMP_gomp_LIBRARY})
link_libraries(${OpenMP_pthread_LIBRARY})

## HIP
find_package(HIP REQUIRED)
message(STATUS "Build with HIP ${hip_VERSION}")

## half
#find_path(HALF_INCLUDE_DIR half.hpp)
message("HALF_INCLUDE_DIR: ${HALF_INCLUDE_DIR}")

# CMAKE_CXX_FLAGS
SET(BUILD_DEV ON CACHE BOOL "BUILD_DEV")
if(BUILD_DEV)
string(APPEND CMAKE_CXX_FLAGS " -Werror -Weverything")
# Override HIP version in config.h, if necessary.
# The variables set by find_package() can't be overwritten,
# therefore let's use intermediate variables.
set(CK_HIP_VERSION_MAJOR "${HIP_VERSION_MAJOR}")
set(CK_HIP_VERSION_MINOR "${HIP_VERSION_MINOR}")
set(CK_HIP_VERSION_PATCH "${HIP_VERSION_PATCH}")
if( DEFINED CK_OVERRIDE_HIP_VERSION_MAJOR )
set(CK_HIP_VERSION_MAJOR "${CK_OVERRIDE_HIP_VERSION_MAJOR}")
message(STATUS "CK_HIP_VERSION_MAJOR overriden with ${CK_OVERRIDE_HIP_VERSION_MAJOR}")
endif()
message("CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
if( DEFINED CK_OVERRIDE_HIP_VERSION_MINOR )
set(CK_HIP_VERSION_MINOR "${CK_OVERRIDE_HIP_VERSION_MINOR}")
message(STATUS "CK_HIP_VERSION_MINOR overriden with ${CK_OVERRIDE_HIP_VERSION_MINOR}")
endif()
if( DEFINED CK_OVERRIDE_HIP_VERSION_PATCH )
set(CK_HIP_VERSION_PATCH "${CK_OVERRIDE_HIP_VERSION_PATCH}")
message(STATUS "CK_HIP_VERSION_PATCH overriden with ${CK_OVERRIDE_HIP_VERSION_PATCH}")
endif()
message(STATUS "Build with HIP ${HIP_VERSION}")
link_libraries(hip::device)
add_compile_definitions(__HIP_PLATFORM_HCC__=1)

## tidy
include(EnableCompilerWarnings)
set(MIOPEN_TIDY_ERRORS ERRORS * -readability-inconsistent-declaration-parameter-name)
set(CK_TIDY_ERRORS ERRORS * -readability-inconsistent-declaration-parameter-name)
if(CMAKE_CXX_COMPILER MATCHES ".*hcc" OR CMAKE_CXX_COMPILER MATCHES ".*clang\\+\\+")
set(MIOPEN_TIDY_CHECKS -modernize-use-override -readability-non-const-parameter)
set(CK_TIDY_CHECKS -modernize-use-override -readability-non-const-parameter)
# Enable tidy on hip
elseif(MIOPEN_BACKEND STREQUAL "HIP" OR MIOPEN_BACKEND STREQUAL "HIPNOGPU")
set(MIOPEN_TIDY_ERRORS ALL)
elseif(CK_BACKEND STREQUAL "HIP" OR CK_BACKEND STREQUAL "HIPNOGPU")
set(CK_TIDY_ERRORS ALL)
endif()


include(ClangTidy)
enable_clang_tidy(
CHECKS
Expand Down Expand Up @@ -150,13 +188,12 @@ enable_clang_tidy(
-cppcoreguidelines-narrowing-conversions
-altera-struct-pack-align
-cppcoreguidelines-prefer-member-initializer

${MIOPEN_TIDY_CHECKS}
${MIOPEN_TIDY_ERRORS}
${CK_TIDY_CHECKS}
${CK_TIDY_ERRORS}
HEADER_FILTER
"\.hpp$"
EXTRA_ARGS
-DMIOPEN_USE_CLANG_TIDY
-DCK_USE_CLANG_TIDY
)

include(CppCheck)
Expand All @@ -180,19 +217,95 @@ enable_cppcheck(
unmatchedSuppression
FORCE
SOURCES
host/host_tensor/src
host/driver_offline/src
composable_kernel/src/kernel_wrapper
library/src
INCLUDE
host/host_tensor/include
host/solver/include
host/driver_offline/include
composable_kernel/include/*
${CMAKE_CURRENT_SOURCE_DIR}/include
${CMAKE_CURRENT_BINARY_DIR}/include
${CMAKE_CURRENT_SOURCE_DIR}/library/include
DEFINE
CPPCHECK=1
__linux__=1
)

add_subdirectory(host)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin)

include_directories(BEFORE
${PROJECT_SOURCE_DIR}/include
${PROJECT_SOURCE_DIR}/library/include
${HIP_INCLUDE_DIRS}
)


# BUILD_DEV (cached, ON by default) selects the strict developer build:
# warnings are treated as errors and every clang warning is enabled.
set(BUILD_DEV ON CACHE BOOL "BUILD_DEV")
if(BUILD_DEV)
  add_compile_options(-Werror -Weverything)
endif()
message("CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")

add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -C ${CMAKE_CFG_INTDIR})

# Gather the device instance sources so they can be attached to the "instances" target.
file(GLOB_RECURSE INSTANCE_FILES "${PROJECT_SOURCE_DIR}/*/device_*_instance.cpp")

# List the entries of the GPU instance directory, relative to that directory.
file(GLOB dir_list
  RELATIVE "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu"
  "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/*"
)

# Every sub-directory <name> contributes a dependency named device_<name>_instance.
set(CK_DEVICE_INSTANCES)
foreach(subdir_path IN LISTS dir_list)
  if(IS_DIRECTORY "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/${subdir_path}")
    list(APPEND CK_DEVICE_INSTANCES "device_${subdir_path}_instance")
  endif()
endforeach()

# Umbrella target: depends on "utility" plus all discovered device instance dependencies.
add_custom_target(instances
  DEPENDS utility ${CK_DEVICE_INSTANCES}
  SOURCES ${INSTANCE_FILES}
)

# Set up the packaging components "tests", "examples" and "profiler".
# All three use the library name "composablekernel"; each gets an explicit package name.
# NOTE(review): rocm_package_setup_component() comes from the ROCm CMake modules included
# earlier in this file — confirm its exact semantics against the rocm-cmake documentation.
rocm_package_setup_component(tests
LIBRARY_NAME composablekernel
PACKAGE_NAME tests # Prevent -static suffix on package name
)

# Component for the example programs.
rocm_package_setup_component(examples
LIBRARY_NAME composablekernel
PACKAGE_NAME examples
)

# Component for the profiler; its package name is "ckProfiler" rather than the component name.
rocm_package_setup_component(profiler
LIBRARY_NAME composablekernel
PACKAGE_NAME ckProfiler
)

add_subdirectory(library)
add_subdirectory(example)
add_subdirectory(test)
add_subdirectory(profiler)

#Create an interface target for the include only files and call it "composablekernels"
# NOTE(review): no interface target is created in this section; the commands below only
# generate and install the package config and version files — confirm whether the line
# above is stale.
include(CMakePackageConfigHelpers)

# Version written into composable_kernelConfigVersion.cmake.
# NOTE(review): this value (1.0.0) differs from the version passed to rocm_setup_version()
# near the top of this file — confirm which one consumers are expected to see.
set(version 1.0.0)
# Generate the version file; AnyNewerVersion accepts any request not newer than ${version}.
write_basic_package_version_file(
"${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfigVersion.cmake"
VERSION "${version}"
COMPATIBILITY AnyNewerVersion
)

# Generate composable_kernelConfig.cmake in the build tree from the Config.cmake.in template.
configure_package_config_file(${CMAKE_CURRENT_SOURCE_DIR}/Config.cmake.in
"${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfig.cmake"
INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
NO_CHECK_REQUIRED_COMPONENTS_MACRO
)

# Install both generated files into the same directory named as INSTALL_DESTINATION above.
rocm_install(FILES
"${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfig.cmake"
"${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfigVersion.cmake"
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
)

set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE")
set(CPACK_RPM_PACKAGE_LICENSE "MIT")

rocm_create_package(
NAME composablekernel
DESCRIPTION "High Performance Composable Kernel for AMD GPUs"
MAINTAINER "MIOpen Kernels Dev Team <[email protected]>"
LDCONFIG
HEADER_ONLY
)
Loading