This repository was archived by the owner on Mar 12, 2021. It is now read-only.
Merged
7 changes: 5 additions & 2 deletions Manifest.toml
@@ -77,7 +77,9 @@ version = "0.1.1"
 
 [[GPUArrays]]
 deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization"]
-git-tree-sha1 = "c63cb01e3b6f48ab39f1e35c31ba870650814a18"
+git-tree-sha1 = "bf9f724da10a403a9e85c394d5005789147e77b7"
+repo-rev = "6e7560a"
+repo-url = "https://github.com/JuliaGPU/GPUArrays.jl.git"
 uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
 version = "3.2.0"
 
@@ -92,6 +94,7 @@ uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
 version = "1.3.4"
 
 [[LibGit2]]
+deps = ["Printf"]
 uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
 
 [[Libdl]]
@@ -127,7 +130,7 @@ uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
 version = "1.1.0"
 
 [[Pkg]]
-deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Test", "UUIDs"]
+deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"]
 uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 
 [[Printf]]
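The hunks above pin GPUArrays.jl to an unreleased revision. A minimal sketch (not part of this PR) of how such a repo-rev/repo-url pin is typically produced from the Pkg API:

```julia
# Hypothetical REPL session; the revision below is the one pinned above.
using Pkg
Pkg.add(PackageSpec(name="GPUArrays", rev="6e7560a"))
```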
24 changes: 14 additions & 10 deletions src/mapreduce.jl
@@ -133,13 +133,17 @@ end
 
 ## COV_EXCL_STOP
 
-NVTX.@range function GPUArrays.mapreducedim!(f, op, R::CuArray{T}, As::AbstractArray...; init=nothing) where T
-    # TODO: Broadcast-semantics after JuliaLang-julia#31020
-    A = first(As)
-    all(B -> size(A) == size(B), As) || throw(DimensionMismatch("dimensions of containers must be identical"))
+if VERSION < v"1.5.0-DEV.748"
+    Base.axes(bc::Base.Broadcast.Broadcasted{<:CuArrayStyle, <:NTuple{N}},
+              d::Integer) where N =
+        d <= N ? axes(bc)[d] : Base.OneTo(1)
+end
+
+NVTX.@range function GPUArrays.mapreducedim!(f, op, R::CuArray{T},
+                                             A::Union{AbstractArray,Broadcast.Broadcasted};
+                                             init=nothing) where T
     Base.check_reducedims(R, A)
-    isempty(A) && return R
+    length(A) == 0 && return R # isempty(::Broadcasted) iterates
 
     f = cufunc(f)
     op = cufunc(op)
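Two details in this hunk deserve a note. The shim defines out-of-bounds `axes` lookups for `Broadcasted` on older Julia, and the emptiness check uses `length` because the generic `isempty` fallback iterates, which would start evaluating a lazy broadcast. A plain-Julia sketch of that second point (no GPU needed; the `bc` value here is illustrative):

```julia
# `length` on an instantiated Broadcasted is computed from its axes;
# the generic `isempty` fallback instead calls `iterate(bc)`.
bc = Broadcast.instantiate(Broadcast.broadcasted(+, zeros(0, 3), 1))
length(bc) == 0   # true, derived from axes without evaluating anything
isempty(bc)       # also true, but reached by iterating the lazy expression
```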
@@ -156,8 +160,8 @@ NVTX.@range function GPUArrays.mapreducedim!(f, op, R::CuArray{T}, As::AbstractA
 
     # iteration domain, split in two: one part covers the dimensions that should
     # be reduced, and the other covers the rest. combining both covers all values.
-    Rall = CartesianIndices(A)
-    Rother = CartesianIndices(R)
+    Rall = CartesianIndices(axes(A))
+    Rother = CartesianIndices(axes(R))
     Rreduce = CartesianIndices(ifelse.(axes(A) .== axes(R), Ref(Base.OneTo(1)), axes(A)))
     # NOTE: we hard-code `OneTo` (`first.(axes(A))` would work too) or we get a
     # CartesianIndices object with UnitRanges that behave badly on the GPU.
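A CPU-side sketch of this domain split (concrete sizes invented for illustration). `CartesianIndices(axes(A))` is needed because a lazy `Broadcasted` is not an `AbstractArray`, so `CartesianIndices(A)` has no method for it; the split itself guarantees that `Rother` and `Rreduce` jointly cover every element:

```julia
A = rand(4, 3)
R = zeros(1, 3)   # reducing along dimension 1
Rother  = CartesianIndices(axes(R))   # (1:1, 1:3): the kept dimensions
Rreduce = CartesianIndices(ifelse.(axes(A) .== axes(R), Ref(Base.OneTo(1)), axes(A)))
# Rreduce is (1:4, 1:1): the reduced dimension, singleton elsewhere.
length(Rother) * length(Rreduce) == length(A)   # true
```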
@@ -187,7 +191,7 @@ NVTX.@range function GPUArrays.mapreducedim!(f, op, R::CuArray{T}, As::AbstractA
     # we might not be able to launch all those threads to reduce each slice in one go.
     # that's why each thread also loops across its inputs, processing multiple values
     # so that we can span the entire reduction dimension using a single thread block.
-    args = (f, op, init, Rreduce, Rother, Val(shuffle), R′, As...)
+    args = (f, op, init, Rreduce, Rother, Val(shuffle), R′, A)
     kernel_args = cudaconvert.(args)
     kernel_tt = Tuple{Core.Typeof.(kernel_args)...}
     kernel = cufunction(partial_mapreduce_grid, kernel_tt)
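A hedged, CPU-only sketch of the looping strategy that comment describes (`strided_partial` and its arguments are invented for illustration): worker `t` of `nthreads` strides through the data, so a fixed number of workers can span an arbitrarily long reduction dimension:

```julia
# Worker t (1-based) accumulates elements t, t + nthreads, t + 2nthreads, ...
function strided_partial(op, xs, nthreads, t)
    acc = xs[t]                # assumes length(xs) >= nthreads
    i = t + nthreads
    while i <= lastindex(xs)
        acc = op(acc, xs[i])
        i += nthreads
    end
    return acc
end

# Folding the per-worker partials reproduces the full reduction:
xs = collect(1:10)
reduce(+, [strided_partial(+, xs, 4, t) for t in 1:4]) == sum(xs)   # true
```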
@@ -218,7 +222,7 @@ NVTX.@range function GPUArrays.mapreducedim!(f, op, R::CuArray{T}, As::AbstractA
     if reduce_blocks == 1
         # we can cover the dimensions to reduce using a single block
         @cuda threads=threads blocks=blocks shmem=shmem partial_mapreduce_grid(
-            f, op, init, Rreduce, Rother, Val(shuffle), R′, As...)
+            f, op, init, Rreduce, Rother, Val(shuffle), R′, A)
     else
         # we need multiple steps to cover all values to reduce
         partial = similar(R, (size(R)..., reduce_blocks))
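When a single block per slice is not enough, the branch above (continued in the next hunk) reduces in two steps: each block writes a partial result along an extra trailing dimension of `partial`, and a second pass folds those partials with `identity` as the mapping function. A plain-Julia sketch of the scheme (sizes invented):

```julia
A = rand(8)
nblocks = 4
partial = [sum(A[b:nblocks:end]) for b in 1:nblocks]   # step 1: per-block partials
sum(identity, partial) ≈ sum(A)                        # step 2: fold the partials, true
```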
@@ -232,7 +236,7 @@ NVTX.@range function GPUArrays.mapreducedim!(f, op, R::CuArray{T}, As::AbstractA
         end
     end
     @cuda threads=threads blocks=blocks shmem=shmem partial_mapreduce_grid(
-        f, op, init, Rreduce, Rother, Val(shuffle), partial, As...)
+        f, op, init, Rreduce, Rother, Val(shuffle), partial, A)
 
     GPUArrays.mapreducedim!(identity, op, R′, partial; init=init)
 end
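For context, a hedged end-to-end sketch of what the new signature accepts (API names as of this PR's era; sizes and values are illustrative): a lazy `Broadcasted` source is reduced straight into `R`, without materializing a temporary array for the broadcast:

```julia
using CuArrays
import GPUArrays

a  = CuArrays.rand(Float32, 4, 3)
R  = CuArrays.zeros(Float32, 1, 3)   # reduce along dimension 1
bc = Broadcast.instantiate(Broadcast.broadcasted(+, a, 1f0))   # lazy a .+ 1f0
GPUArrays.mapreducedim!(identity, +, R, bc; init=0f0)
```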