This repository was archived by the owner on Mar 12, 2021. It is now read-only.
Merged
7 changes: 5 additions & 2 deletions Manifest.toml
@@ -77,7 +77,9 @@ version = "0.1.1"
 
 [[GPUArrays]]
 deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization"]
-git-tree-sha1 = "c63cb01e3b6f48ab39f1e35c31ba870650814a18"
+git-tree-sha1 = "bf9f724da10a403a9e85c394d5005789147e77b7"
+repo-rev = "6e7560a"
+repo-url = "https://github.com/JuliaGPU/GPUArrays.jl.git"
 uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
 version = "3.2.0"
 
@@ -92,6 +94,7 @@ uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
 version = "1.3.4"
 
 [[LibGit2]]
+deps = ["Printf"]
 uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
 
 [[Libdl]]
@@ -127,7 +130,7 @@ uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
 version = "1.1.0"
 
 [[Pkg]]
-deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Test", "UUIDs"]
+deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"]
 uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 
 [[Printf]]
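The hunks above pin GPUArrays.jl to an unreleased revision. A minimal sketch (not part of this PR) of how such a repo-rev/repo-url pin is typically produced from the Pkg API:

```julia
# Hypothetical REPL session; the revision below is the one pinned above.
using Pkg
Pkg.add(PackageSpec(name="GPUArrays", rev="6e7560a"))
```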
24 changes: 14 additions & 10 deletions src/mapreduce.jl
@@ -133,13 +133,17 @@ end
 
 ## COV_EXCL_STOP
 
-NVTX.@range function GPUArrays.mapreducedim!(f, op, R::CuArray{T}, As::AbstractArray...; init=nothing) where T
-    # TODO: Broadcast-semantics after JuliaLang-julia#31020
-    A = first(As)
-    all(B -> size(A) == size(B), As) || throw(DimensionMismatch("dimensions of containers must be identical"))
+if VERSION < v"1.5.0-DEV.748"
+    Base.axes(bc::Base.Broadcast.Broadcasted{<:CuArrayStyle, <:NTuple{N}},
+              d::Integer) where N =
+        d <= N ? axes(bc)[d] : Base.OneTo(1)
+end
+
+NVTX.@range function GPUArrays.mapreducedim!(f, op, R::CuArray{T},
+                                             A::Union{AbstractArray,Broadcast.Broadcasted};
+                                             init=nothing) where T
     Base.check_reducedims(R, A)
-    isempty(A) && return R
+    length(A) == 0 && return R # isempty(::Broadcasted) iterates
 
     f = cufunc(f)
     op = cufunc(op)
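Two details in this hunk deserve a note. The shim defines out-of-bounds `axes` lookups for `Broadcasted` on older Julia, and the emptiness check uses `length` because the generic `isempty` fallback iterates, which would start evaluating a lazy broadcast. A plain-Julia sketch of that second point (no GPU needed; the `bc` value here is illustrative):

```julia
# `length` on an instantiated Broadcasted is computed from its axes;
# the generic `isempty` fallback instead calls `iterate(bc)`.
bc = Broadcast.instantiate(Broadcast.broadcasted(+, zeros(0, 3), 1))
length(bc) == 0   # true, derived from axes without evaluating anything
isempty(bc)       # also true, but reached by iterating the lazy expression
```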
@@ -156,8 +160,8 @@ NVTX.@range function GPUArrays.mapreducedim!(f, op, R::CuArray{T}, As::AbstractA
 
     # iteration domain, split in two: one part covers the dimensions that should
     # be reduced, and the other covers the rest. combining both covers all values.
-    Rall = CartesianIndices(A)
-    Rother = CartesianIndices(R)
+    Rall = CartesianIndices(axes(A))
+    Rother = CartesianIndices(axes(R))
     Rreduce = CartesianIndices(ifelse.(axes(A) .== axes(R), Ref(Base.OneTo(1)), axes(A)))
     # NOTE: we hard-code `OneTo` (`first.(axes(A))` would work too) or we get a
     # CartesianIndices object with UnitRanges that behave badly on the GPU.
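A CPU-side sketch of this domain split (concrete sizes invented for illustration). `CartesianIndices(axes(A))` is needed because a lazy `Broadcasted` is not an `AbstractArray`, so `CartesianIndices(A)` has no method for it; the split itself guarantees that `Rother` and `Rreduce` jointly cover every element:

```julia
A = rand(4, 3)
R = zeros(1, 3)   # reducing along dimension 1
Rother  = CartesianIndices(axes(R))   # (1:1, 1:3): the kept dimensions
Rreduce = CartesianIndices(ifelse.(axes(A) .== axes(R), Ref(Base.OneTo(1)), axes(A)))
# Rreduce is (1:4, 1:1): the reduced dimension, singleton elsewhere.
length(Rother) * length(Rreduce) == length(A)   # true
```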
@@ -187,7 +191,7 @@ NVTX.@range function GPUArrays.mapreducedim!(f, op, R::CuArray{T}, As::AbstractA
     # we might not be able to launch all those threads to reduce each slice in one go.
     # that's why each thread also loops across its inputs, processing multiple values
     # so that we can span the entire reduction dimension using a single thread block.
-    args = (f, op, init, Rreduce, Rother, Val(shuffle), R′, As...)
+    args = (f, op, init, Rreduce, Rother, Val(shuffle), R′, A)
     kernel_args = cudaconvert.(args)
     kernel_tt = Tuple{Core.Typeof.(kernel_args)...}
     kernel = cufunction(partial_mapreduce_grid, kernel_tt)
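A hedged, CPU-only sketch of the looping strategy that comment describes (`strided_partial` and its arguments are invented for illustration): worker `t` of `nthreads` strides through the data, so a fixed number of workers can span an arbitrarily long reduction dimension:

```julia
# Worker t (1-based) accumulates elements t, t + nthreads, t + 2nthreads, ...
function strided_partial(op, xs, nthreads, t)
    acc = xs[t]                # assumes length(xs) >= nthreads
    i = t + nthreads
    while i <= lastindex(xs)
        acc = op(acc, xs[i])
        i += nthreads
    end
    return acc
end

# Folding the per-worker partials reproduces the full reduction:
xs = collect(1:10)
reduce(+, [strided_partial(+, xs, 4, t) for t in 1:4]) == sum(xs)   # true
```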
@@ -218,7 +222,7 @@ NVTX.@range function GPUArrays.mapreducedim!(f, op, R::CuArray{T}, As::AbstractA
     if reduce_blocks == 1
         # we can cover the dimensions to reduce using a single block
         @cuda threads=threads blocks=blocks shmem=shmem partial_mapreduce_grid(
-            f, op, init, Rreduce, Rother, Val(shuffle), R′, As...)
+            f, op, init, Rreduce, Rother, Val(shuffle), R′, A)
     else
         # we need multiple steps to cover all values to reduce
         partial = similar(R, (size(R)..., reduce_blocks))
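When a single block per slice is not enough, the branch above (continued in the next hunk) reduces in two steps: each block writes a partial result along an extra trailing dimension of `partial`, and a second pass folds those partials with `identity` as the mapping function. A plain-Julia sketch of the scheme (sizes invented):

```julia
A = rand(8)
nblocks = 4
partial = [sum(A[b:nblocks:end]) for b in 1:nblocks]   # step 1: per-block partials
sum(identity, partial) ≈ sum(A)                        # step 2: fold the partials, true
```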
@@ -232,7 +236,7 @@ NVTX.@range function GPUArrays.mapreducedim!(f, op, R::CuArray{T}, As::AbstractA
         end
     end
     @cuda threads=threads blocks=blocks shmem=shmem partial_mapreduce_grid(
-        f, op, init, Rreduce, Rother, Val(shuffle), partial, As...)
+        f, op, init, Rreduce, Rother, Val(shuffle), partial, A)
 
     GPUArrays.mapreducedim!(identity, op, R′, partial; init=init)
 end
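For context, a hedged end-to-end sketch of what the new signature accepts (API names as of this PR's era; sizes and values are illustrative): a lazy `Broadcasted` source is reduced straight into `R`, without materializing a temporary array for the broadcast:

```julia
using CuArrays
import GPUArrays

a  = CuArrays.rand(Float32, 4, 3)
R  = CuArrays.zeros(Float32, 1, 3)   # reduce along dimension 1
bc = Broadcast.instantiate(Broadcast.broadcasted(+, a, 1f0))   # lazy a .+ 1f0
GPUArrays.mapreducedim!(identity, +, R, bc; init=0f0)
```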