Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,20 +1,22 @@
name = "KernelDensity"
uuid = "5ab0869b-81aa-558d-bb23-cbf5423bbe9b"
version = "0.6.8"
authors = ["Simon Byrne and various contributors"]
version = "0.6.10"

[deps]
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59"
IrrationalConstants = "92d709cd-6900-40b7-9082-c6be49f344b6"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"

[compat]
Distributions = "0.23, 0.24, 0.25"
DocStringExtensions = "0.8, 0.9"
FFTW = "1"
Interpolations = "0.9, 0.10, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16"
Interpolations = "0.9, 0.10, 0.11, 0.12, 0.13, 0.14, 0.15"
IrrationalConstants = "0.2.4"
StatsBase = "0.33, 0.34"
julia = "1"

Expand Down
17 changes: 14 additions & 3 deletions src/KernelDensity.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,29 @@ using StatsBase
using Distributions
using Interpolations

import Distributions: twoπ, pdf
import IrrationalConstants: twoπ
import Base: getproperty, propertynames

import Distributions: pdf
import Distributions: ncomponents, component, probs

import FFTW: rfft, irfft

export DiscretisedPDF, KernelEstimate, BandwidthMethod, Silverman, LSCV
export kernel_estimate, precompute!

export kde, kde_lscv, UnivariateKDE, BivariateKDE, InterpKDE, pdf

abstract type AbstractKDE end

# define broadcasting behavior
Base.Broadcast.broadcastable(x::AbstractKDE) = Ref(x)

include("univariate.jl")
include("bivariate.jl")
include("interp.jl")

end # module
include("initialisation.jl")
include("bandwidth_selection.jl")
include("feature_computation.jl")

end
83 changes: 83 additions & 0 deletions src/bandwidth_selection.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@


# Silverman's rule of thumb for KDE bandwidth selection
function get_bandwidth(::Silverman, data::AbstractVector{<:Real}, kernelType = Normal, prior = DiscreteUniform(1,length(data)), alpha::Float64 = 0.9)
# Determine length of data
ndata = length(data)
ndata <= 1 && return alpha

# Calculate width using variance and IQR
var_width = std(data)
q25, q75 = quantile(data, (0.25, 0.75))
quantile_width = (q75 - q25) / 1.34

# Deal with edge cases with 0 IQR or variance
width = min(var_width, quantile_width)
if width == 0.0
if var_width == 0.0
width = 1.0
else
width = var_width
end
end

# Set bandwidth using Silverman's rule of thumb
return alpha * width * ndata^(-0.2), nothing
end

@kwdef struct LSCV <: BandwidthMethod
nPoints::Int = 2048
initBandwidth::Float64 = NaN
boundary::Tuple{Float64,Float64} = (NaN,NaN)
end

# Select bandwidth using least-squares cross validation, from:
# Density Estimation for Statistics and Data Analysis
# B. W. Silverman (1986)
# sections 3.4.3 (pp. 48-52) and 3.5 (pp. 61-66)
function get_bandwidth(lscv::LSCV, data::AbstractVector{<:Real}, kernelType = Normal, prior = DiscreteUniform(1,length(data)))
K = lscv.nPoints
initBandwidth = isnan(lscv.initBandwidth) ? get_bandwidth(Silverman(), data)[1] : lscv.initBandwidth
boundaryLow, boundaryHigh = lscv.boundary
if isnan(boundaryLow) || isnan(boundaryHigh)
lo, hi = extrema(data)
boundaryLow = isnan(boundaryLow) ? lo - 4.0*initBandwidth : boundaryLow
boundaryHigh = isnan(boundaryHigh) ? hi + 4.0*initBandwidth : boundaryHigh
end

ndata = length(data)

midpoints = range(boundaryLow, boundaryHigh, K)

initDen = tabulate(data, midpoints, prior).values

# the ft here is K/ba*sqrt(2pi) * u(s), it is K times the Yl in Silverman's book
ft = rfft(initDen)

ft2 = abs2.(ft)
c = -twoπ/(step(midpoints)*K)
hlb, hub = 0.25*initBandwidth, 1.5*initBandwidth

optimalBandwidth = optimize(hlb, hub) do h
dist = kernel_dist(kernelType, h)
ψ = 0.0
for j in 1:length(ft2)-1
ks = real(cf(dist, j*c))
ψ += ft2[j+1]*(ks-2.0)*ks
end
ψ*step(midpoints)/K + pdf(dist,0.0)/ndata
end

dist = kernel_dist(kernelType, optimalBandwidth)
for j in 0:length(ft)-1
ft[j+1] *= cf(dist, j*c)
end

convolved = irfft(ft, K)

# fix rounding error.
convolved .= max.(0., convolved)

# Invert the Fourier transform to get the KDE
optimalBandwidth, DiscretisedPDF(convolved, midpoints)
end
48 changes: 48 additions & 0 deletions src/exampleUse.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@

using Plots

using Distributions
using .KernelDensity

##

points = [1. 2 3 4 5]
cat = Categorical([1/5,1/5,1/5,1/5,1/5])

ds = DiscretisedPDF(fill(1/20,20),LinRange(0,1,20))

k = KernelEstimate(points,cat, Normal(0,1),1.0, nothing)
ncomponents(k1)
component(k1,3)
probs(k1)

##

X = randn(10^3)

k1 = kernel_estimate(X, 0.2, Normal)

k2 = kernel_estimate(X, Silverman(), Epanechnikov)

k3 = kernel_estimate(X, LSCV(), Epanechnikov)

precompute!(k2,2048,(-5,5))

pdf(k2, 1)
pdf.(k2, [1, 2, 3])
pdf(k2, 1, :precomputed)
pdf.(k2, [1, 2, 3], :precomputed)

cdf(k2, 1)
mean(k2), var(k2)
quantile(k2, 0.9)

rand(k2)

kde = k2.precomputedPDF
plot(kde.xs,kde.values,label = "precomputed")

xs = LinRange(-4,4,100)
plot!(xs,pdf.(k2, xs),label = "naive")


97 changes: 97 additions & 0 deletions src/feature_computation.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@

# 1D pdf precomputation

function precompute!(ke::UnivariateKernelEstimate, nPoints::Integer = 2048,
boundary::Tuple{<:Real,<:Real} =((lo, hi) = extrema(ke.data); (lo - 4.0*ke.bandwidth, hi + 4.0*ke.bandwidth)))

# find the element type of range in ke.precomputedPDF
T = eltype(typeof(ke).parameters[5].parameters[2])
midpoints = range(T(boundary[1]), T(boundary[2]), Int(nPoints))
ke.precomputedPDF = conv(tabulate(vec(ke.data), midpoints, ke.prior), ke.kernel)
end

function tabulate(data::AbstractVector{<:Real}, midpoints::AbstractRange, prior::UnivariateDistribution{Discrete})
npoints = length(midpoints)
s = step(midpoints)

# Set up a grid for discretized data
grid = zeros(Float64, npoints)
ainc = 1.0 / (s*s)

# weighted discretization (cf. Jones and Lotwick)
for (i,x) in enumerate(data)
k = searchsortedfirst(midpoints,x)
j = k-1
if 1 <= j <= npoints-1
grid[j] += (midpoints[k]-x)*ainc*pdf(prior,i)
grid[k] += (x-midpoints[j])*ainc*pdf(prior,i)
end
end

return DiscretisedPDF(grid, midpoints)
end

function conv(den::DiscretisedPDF{1, R, T}, kernel::UnivariateDistribution) where {T<:Real,R<:AbstractRange}
# Transform to Fourier basis
K = length(den.values)
ft = rfft(den.values)

# Convolve fft with characteristic function of kernel
# empirical cf
# = \sum_{n=1}^N e^{i*t*X_n} / N
# = \sum_{k=0}^K e^{i*t*(a+k*s)} N_k / N
# = e^{i*t*a} \sum_{k=0}^K e^{-2pi*i*k*(-t*s*K/2pi)/K} N_k / N
# = A * fft(N_k/N)[-t*s*K/2pi + 1]
c = -twoπ/(step(den.xs)*K)
for j in 0:length(ft)-1
ft[j+1] *= cf(kernel,j*c)
end

# Invert the Fourier transform to get the KDE
convolved = irfft(ft, K)

# fix rounding error.
convolved .= max.(0., convolved)

DiscretisedPDF(convolved, den.xs)
end

# pdf methods

# pdf(ke, x) is implemented in Distributions.jl

function pdf(ke::UnivariateKernelEstimate, x::Real, method::Symbol)
if method == :precomputed
den = ke.precomputedPDF
den === nothing && error("PDF must be first precomputed.")
itp_u = interpolate(den.values, BSpline(Quadratic(Line(OnGrid()))))
itp = scale(itp_u, den.xs)
etp = extrapolate(itp, 0.)
return etp.itp(x)
elseif method == :naive
return pdf(ke, x)
else
error("Method not available.")
end
end

# custom broadcast prepares for interpolation only once for all xs
function Base.Broadcast.broadcasted(::typeof(pdf), ke::UnivariateKernelEstimate, xs, method::Symbol)
if method == :precomputed
den = ke.precomputedPDF
den === nothing && error("PDF must be first precomputed.")
itp_u = interpolate(den.values, BSpline(Quadratic(Line(OnGrid()))))
itp = scale(itp_u, den.xs)
etp = extrapolate(itp, 0.)
return etp.itp.(xs)
elseif method == :naive
return pdf.(ke, x)
else
error("Method not available.")
end
end


# it is possible to add cdf(ke, :precomputed)

# it is possibble to add cf(ke, t)
95 changes: 95 additions & 0 deletions src/initialisation.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@


struct DiscretisedPDF{N,R,T}
values::Array{T,N}
labels::NTuple{N,R}
DiscretisedPDF(values::Array{T,N}, labels::NTuple{N,R}) where {N,T<:Real,R<:AbstractRange} = new{N,R,T}(values, labels)
end

DiscretisedPDF(values::Vector, label::AbstractRange) = DiscretisedPDF(values, (label,))

# provides d.xs, d.ys, d.zs for convenience
function Base.getproperty(d::DiscretisedPDF, prop::Symbol)
if prop == :xs
return d.labels[1]
elseif prop == :ys
return d.labels[2]
elseif prop == :zs
return d.labels[3]
else getfield(d, prop)
end
end
Base.propertynames(::DiscretisedPDF) = (:values, :labels, :xs, :ys, :zs)


mutable struct KernelEstimate{VF<:VariateForm, VS<:ValueSupport, KernelType<:Distribution, PriorDist<:UnivariateDistribution{Discrete}, PDF<:DiscretisedPDF} <: AbstractMixtureModel{VF, VS, KernelType}
const data::Matrix{Float64}
const prior::PriorDist
const kernel::KernelType
const bandwidth::Float64
precomputedPDF::Union{Nothing, PDF}

# these constructors guarantee type agreement
function KernelEstimate(data::Matrix{Float64}, prior::PriorDist, kernel::KernelType, bandwidth::Float64, precomputedPDF::PDF) where {
PriorDist<:UnivariateDistribution{Discrete},
KernelType <: Distribution,
PDF <: DiscretisedPDF}
VF, VS = supertype(KernelType).parameters
new{VF,VS,KernelType,PriorDist,PDF}(data, prior, kernel, bandwidth, precomputedPDF)
end
function KernelEstimate(data::Matrix{Float64}, prior::PriorDist, kernel::KernelType, bandwidth::Float64, ::Nothing) where {
PriorDist<:UnivariateDistribution{Discrete},
KernelType <: Distribution}
VF, VS = supertype(KernelType).parameters

# the default PDF type is based on Float64 and Int
R = Base.return_types(range,(Float64,Float64,Int))[1]
PDF = DiscretisedPDF{size(data)[1],R,Float64}
new{VF,VS,KernelType,PriorDist,PDF}(data, prior, kernel, bandwidth, nothing)
end

end

UnivariateKernelEstimate{VS, K, P, PDF} = KernelEstimate{Univariate, VS, K, P, PDF}
MultivariateKernelEstimate{VS, K, P, PDF} = KernelEstimate{Multivariate, VS, K, P, PDF}

# KernelEstimate is a scalar
Base.broadcastable(ke::KernelEstimate) = Ref(ke)

# It is possible to add linear transformations a*ke + b

abstract type BandwidthMethod end

# default algorithm
struct Silverman<:BandwidthMethod end

# implementing common interface of AbstractMixtureModel
ncomponents(ke::KernelEstimate) = size(ke.data)[2]
component(ke::UnivariateKernelEstimate, k) = ke.kernel + ke.data[1,k]
component(ke::MultivariateKernelEstimate, k) = ke.kernel + ke.data[:,k]
probs(ke::KernelEstimate) = probs(ke.prior)


# creating KernelEstimate instance

# make kernel density given bandwidth
function kernel_estimate(data::Vector{<:Real}, bandwidth::Real, kernelType = Normal, prior::UnivariateDistribution{Discrete} = DiscreteUniform(1,length(data)))
kernel = kernel_dist(kernelType, bandwidth)
KernelEstimate(reshape(data, 1, length(data)), prior, kernel, Float64(bandwidth), nothing)
end

# find bandwidth, then make kernel density
function kernel_estimate(data::Vector{<:Real}, method::BandwidthMethod = Silverman(), kernelType = Normal, prior::UnivariateDistribution{Discrete} = DiscreteUniform(1,length(data)))
bandwidth, kde = get_bandwidth(method, data, kernelType, prior)
kernel = kernel_dist(kernelType, bandwidth)
KernelEstimate(reshape(data,1,length(data)), prior, kernel, bandwidth, kde)
end

# Can add kernel_estimate which takes prior as a vector.

# construct kernel from bandwidth
kernel_dist(::Type{Normal}, bandwidth::Real) = Normal(0.0, bandwidth)
kernel_dist(::Type{Uniform}, bandwidth::Real) = Uniform(-√3*bandwidth, √3*bandwidth)

const LocationScale = Union{Laplace, Logistic, SymTriangularDist, Cosine, Epanechnikov}
kernel_dist(::Type{D},w::Real) where {D<:LocationScale} = D(0.0, w/std(D(0.0,1.0)))
Loading
Loading