Skip to content

Commit 5efb69b

Browse files
committed
Support mapreduce over dimensions with SkipMissing
Allows calling mapreduce and specialized functinos with the dims argument on SkipMissing objects. The implementation is copied on the generic methods, but missing values need to be handled directly inside functions for efficiency and because mapreduce_impl returns a Some object which needs special handling.
1 parent b6a60dd commit 5efb69b

File tree

4 files changed

+400
-12
lines changed

4 files changed

+400
-12
lines changed

base/missing.jl

Lines changed: 195 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,9 @@ float(A::AbstractArray{Missing}) = A
146146
skipmissing(itr)
147147
148148
Return an iterator over the elements in `itr` skipping [`missing`](@ref) values.
149+
In addition to supporting any function taking iterators, the resulting object
150+
implements reductions over dimensions (i.e. the `dims` argument to
151+
[`mapreduce`](@ref), [`reduce`](@ref) and special functions like [`sum`](@ref)).
149152
150153
Use [`collect`](@ref) to obtain an `Array` containing the non-`missing` values in
151154
`itr`. Note that even if `itr` is a multidimensional array, the result will always
@@ -154,9 +157,6 @@ of the input.
154157
155158
# Examples
156159
```jldoctest
157-
julia> sum(skipmissing([1, missing, 2]))
158-
3
159-
160160
julia> collect(skipmissing([1, missing, 2]))
161161
2-element Array{Int64,1}:
162162
1
@@ -166,6 +166,31 @@ julia> collect(skipmissing([1 missing; 2 missing]))
166166
2-element Array{Int64,1}:
167167
1
168168
2
169+
170+
julia> sum(skipmissing([1, missing, 2]))
171+
3
172+
173+
julia> B = [1 missing; 3 4]
174+
2×2 Array{Union{Missing, Int64},2}:
175+
1 missing
176+
3 4
177+
178+
julia> sum(skipmissing(B), dims=1)
179+
1×2 Array{Int64,2}:
180+
4 4
181+
182+
julia> sum(skipmissing(B), dims=2)
183+
2×1 Array{Int64,2}:
184+
1
185+
7
186+
187+
julia> reduce(*, skipmissing(B), dims=1)
188+
1×2 Array{Int64,2}:
189+
3 4
190+
191+
julia> mapreduce(cos, +, skipmissing(B), dims=1)
192+
1×2 Array{Float64,2}:
193+
-0.44969 -0.653644
169194
```
170195
"""
171196
skipmissing(itr) = SkipMissing(itr)
@@ -192,8 +217,8 @@ end
192217
# Optimized mapreduce implementation
193218
# The generic method is faster when !(eltype(A) >: Missing) since it does not need
194219
# additional loops to identify the two first non-missing values of each block
195-
mapreduce(f, op, itr::SkipMissing{<:AbstractArray}) =
196-
_mapreduce(f, op, IndexStyle(itr.x), eltype(itr.x) >: Missing ? itr : itr.x)
220+
mapreduce(f, op, itr::SkipMissing{<:AbstractArray}; dims=:, kw...) =
221+
_mapreduce_dim(f, op, kw.data, eltype(itr.x) >: Missing ? itr : itr.x, dims)
197222

198223
function _mapreduce(f, op, ::IndexLinear, itr::SkipMissing{<:AbstractArray})
199224
A = itr.x
@@ -280,6 +305,171 @@ mapreduce_impl(f, op, A::SkipMissing, ifirst::Integer, ilast::Integer) =
280305
end
281306
end
282307

308+
# mapreduce over dimensions implementation
309+
310+
_mapreduce_dim(f, op, nt::NamedTuple{(:init,)}, itr::SkipMissing{<:AbstractArray}, ::Colon) =
311+
mapfoldl(f, op, itr; nt...)
312+
313+
_mapreduce_dim(f, op, ::NamedTuple{()}, itr::SkipMissing{<:AbstractArray}, ::Colon) =
314+
_mapreduce(f, op, IndexStyle(itr.x), itr)
315+
316+
_mapreduce_dim(f, op, nt::NamedTuple{(:init,)}, itr::SkipMissing{<:AbstractArray}, dims) =
317+
mapreducedim!(f, op, reducedim_initarray(itr, dims, nt.init), itr)
318+
319+
_mapreduce_dim(f, op, ::NamedTuple{()}, itr::SkipMissing{<:AbstractArray}, dims) =
320+
mapreducedim!(f, op, reducedim_init(f, op, itr, dims), itr)
321+
322+
reducedim_initarray(itr::SkipMissing{<:AbstractArray}, region, init, ::Type{R}) where {R} =
323+
reducedim_initarray(itr.x, region, init, R)
324+
reducedim_initarray(itr::SkipMissing{<:AbstractArray}, region, init::T) where {T} =
325+
reducedim_initarray(itr.x, region, init, T)
326+
327+
# initialization when computing minima and maxima requires a little care
328+
for (f1, f2, initval) in ((:min, :max, :Inf), (:max, :min, :(-Inf)))
329+
@eval function reducedim_init(f, op::typeof($f1), itr::SkipMissing{<:AbstractArray}, region)
330+
A = itr.x
331+
T = eltype(itr)
332+
333+
# First compute the reduce indices. This will throw an ArgumentError
334+
# if any region is invalid
335+
ri = reduced_indices(A, region)
336+
337+
# Next, throw if reduction is over a region with length zero
338+
any(i -> isempty(axes(A, i)), region) && _empty_reduce_error()
339+
340+
# Make a view of the first slice of the region
341+
A1 = view(A, ri...)
342+
343+
if isempty(A1)
344+
# If the slice is empty just return non-view, non-missing version as the initial array
345+
return similar(A1, eltype(itr))
346+
else
347+
# otherwise use the min/max of the first slice as initial value
348+
v0 = mapreduce(f, $f2, A1)
349+
350+
R = similar(A1, typeof(v0))
351+
352+
# if any value is missing in first slice, look for first
353+
# non-missing value in each slice
354+
if v0 === missing
355+
v0 = nonmissingval(f, $f2, itr, R)
356+
R = similar(A1, typeof(v0))
357+
end
358+
359+
# but NaNs need to be avoided as initial values
360+
v0 = v0 != v0 ? typeof(v0)($initval) : v0
361+
362+
# equivalent to reducedim_initarray, but we need R in advance
363+
return fill!(R, v0)
364+
end
365+
end
366+
end
367+
368+
# Iterate until we've encountered at least one non-missing value in each slice,
369+
# and return the min/max non-missing value of all clices
370+
function nonmissingval(f, op::Union{typeof(min), typeof(max)},
371+
itr::SkipMissing{<:AbstractArray}, R::AbstractArray)
372+
A = itr.x
373+
lsiz = check_reducedims(R,A)
374+
indsAt, indsRt = safe_tail(axes(A)), safe_tail(axes(R)) # handle d=1 manually
375+
keep, Idefault = Broadcast.shapeindexer(indsRt)
376+
i = findfirst(!ismissing, A)
377+
i === nothing && throw(ArgumentError("cannot reduce over array with only missing values"))
378+
@inbounds v = A[i]
379+
if reducedim1(R, A)
380+
# keep track of state using a single variable when reducing along the first dimension
381+
i1 = first(axes1(R))
382+
@inbounds for IA in CartesianIndices(indsAt)
383+
IR = Broadcast.newindex(IA, keep, Idefault)
384+
filled = false
385+
for i in axes(A, 1)
386+
x = A[i, IA]
387+
if x !== missing
388+
v = op(v, f(x))
389+
filled = true
390+
break
391+
end
392+
end
393+
if !filled
394+
throw(ArgumentError("cannot reduce over slices with only missing values"))
395+
end
396+
end
397+
else
398+
filled = fill!(similar(R, Bool), false)
399+
allfilled = false
400+
@inbounds for IA in CartesianIndices(indsAt)
401+
IR = Broadcast.newindex(IA, keep, Idefault)
402+
for i in axes(A, 1)
403+
x = A[i, IA]
404+
if x !== missing
405+
v = op(v, f(x))
406+
filled[i,IR] = true
407+
end
408+
end
409+
(allfilled = all(filled)) && break
410+
end
411+
if !allfilled
412+
throw(ArgumentError("cannot reduce over slices with only missing values"))
413+
end
414+
end
415+
v
416+
end
417+
418+
function _mapreducedim!(f, op, R::AbstractArray, itr::SkipMissing{<:AbstractArray})
419+
A = itr.x
420+
lsiz = check_reducedims(R,A)
421+
isempty(A) && return R
422+
423+
if has_fast_linear_indexing(A) && lsiz > 16
424+
# use mapreduce_impl, which is probably better tuned to achieve higher performance
425+
nslices = div(length(A), lsiz)
426+
ibase = first(LinearIndices(A))-1
427+
for i = 1:nslices
428+
x = mapreduce_impl(f, op, itr, ibase+1, ibase+lsiz)
429+
if x !== nothing
430+
@inbounds R[i] = op(R[i], something(x))
431+
end
432+
ibase += lsiz
433+
end
434+
return R
435+
end
436+
indsAt, indsRt = safe_tail(axes(A)), safe_tail(axes(R)) # handle d=1 manually
437+
keep, Idefault = Broadcast.shapeindexer(indsRt)
438+
if reducedim1(R, A)
439+
# keep the accumulator as a local variable when reducing along the first dimension
440+
i1 = first(axes1(R))
441+
@inbounds for IA in CartesianIndices(indsAt)
442+
IR = Broadcast.newindex(IA, keep, Idefault)
443+
r = R[i1,IR]
444+
@simd for i in axes(A, 1)
445+
x = A[i, IA]
446+
if x !== missing
447+
r = op(r, f(x))
448+
end
449+
end
450+
451+
R[i1,IR] = r
452+
end
453+
else
454+
@inbounds for IA in CartesianIndices(indsAt)
455+
IR = Broadcast.newindex(IA, keep, Idefault)
456+
@simd for i in axes(A, 1)
457+
x = A[i, IA]
458+
if x !== missing
459+
R[i,IR] = op(R[i,IR], f(x))
460+
end
461+
end
462+
end
463+
end
464+
return R
465+
end
466+
467+
mapreducedim!(f, op, R::AbstractArray, A::SkipMissing{<:AbstractArray}) =
468+
(_mapreducedim!(f, op, R, A); R)
469+
470+
reducedim!(op, R::AbstractArray{RT}, A::SkipMissing{<:AbstractArray}) where {RT} =
471+
mapreducedim!(identity, op, R, A)
472+
283473
"""
284474
coalesce(x, y...)
285475

0 commit comments

Comments
 (0)