diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..934f97b
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,29 @@
+language: julia
+
+os:
+  - linux
+  - osx
+
+julia:
+  - 1.0
+  - nightly
+
+branches:
+  only:
+  - master
+  - TravisTesting
+
+before_script:
+  # Every 30 seconds, look for the build log file. If it exists, then
+  # start watching its contents and printing them to stdout as they
+  # change. This has two effects:
+  #   1. it avoids Travis timing out because the build outputs nothing
+  #   2. it makes it more obvious what part of the build, if any, gets stuck
+  # - while sleep 30; do tail ./deps/build.log -f ; done &
+  - julia --project --color=yes --check-bounds=yes -e 'using Pkg;  Pkg.add(PackageSpec(url="https://github.com/JuliaComputing/MKL.jl"));'
+
+script:
+  # 
+  - export JL_PKG=VML
+  - julia --color=yes -e "if VERSION < v\"0.7.0-DEV.5183\"; Pkg.clone(pwd()); Pkg.build(\"VML\"); else using Pkg; if VERSION >= v\"1.1.0-rc1\"; Pkg.build(\"VML\"; verbose=true); else Pkg.build(\"VML\"); end; end"
+  - julia --check-bounds=yes --color=yes -e "if VERSION < v\"0.7.0-DEV.5183\"; Pkg.test(\"VML\", coverage=true); else using Pkg; Pkg.test(coverage=true); end"
\ No newline at end of file
diff --git a/Project.toml b/Project.toml
new file mode 100644
index 0000000..377f5bd
--- /dev/null
+++ b/Project.toml
@@ -0,0 +1,17 @@
+name = "VML"
+uuid = "c8ce9da6-5d36-5c03-b118-5a70151be7bc"
+
+[deps]
+CpuId = "adafc99b-e345-5852-983c-f28acb93d879"
+Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
+SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
+
+[compat]
+julia = "≥ 0.7 1.0"
+
+[extras]
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+MKL = "33e6dc65-8f57-5167-99aa-e5a354878fb2"
+
+[targets]
+test = ["Test", "MKL"]
diff --git a/README.md b/README.md
index a3d3c1d..876554a 100644
--- a/README.md
+++ b/README.md
@@ -1,25 +1,60 @@
-# VML
+# VML 
+[![Build Status](https://travis-ci.com/Crown421/VML.jl.svg?branch=master)](https://travis-ci.com/Crown421/VML.jl)
+[![Build status](https://ci.appveyor.com/api/projects/status/btdduqfsxux8fhsr?svg=true)](https://ci.appveyor.com/project/Crown421/vml-jl)
 
 This package provides bindings to the Intel Vector Math Library for
-arithmetic and transcendental functions. It is often substantially
-faster than using Julia's built-in functions.
-
-## Using VML.jl
-
-To use VML.jl, you must have the Intel Vector Math Library installed.
-This is included in [MKL](http://software.intel.com/en-us/intel-mkl),
-which is free for non-commercial use. You must also copy/symlink the
-appropriate shared library to a directory known to the linker (e.g.
-`/usr/local/lib`) or you must modify the path to `lib` in `src/VML.jl`.
-
-Currently, VML.jl is configured to use `libmkl_vml_avx`, which requires
-AVX support. If your system does not have AVX (e.g., most pre-Sandy
-Bridge systems), you will need to modify the `const lib` declaration at
-the top of `src/VML.jl`. Future versions of VML.jl may automatically
-detect CPU architecture.
-
-After loading VML.jl, vector calls to functions listed below will
-automatically use VML instead of openlibm when possible.
+arithmetic and transcendental functions. Especially for large vectors it is often substantially faster than broadcasting Julia's built-in functions.
+
+## Basic install
+
+To use VML.jl, you must have the shared libraries of the Intel Vector Math Library avilable on your system.
+The easiest option is to use [MKL.jl](https://github.com/JuliaComputing/MKL.jl) via 
+```
+julia> ] add https://github.com/JuliaComputing/MKL.jl.git
+```
+Alternatively you can install MKL directly [from intel](https://software.intel.com/en-us/mkl/choose-download).
+
+Note that intel MKL has a separate license, which you may want to check for commercial projects (see [FAQ]( https://software.intel.com/en-us/mkl/license-faq)).
+
+To install VML.jl run 
+```
+julia> ] add https://github.com/Crown421/VML.jl
+```
+
+## Using VML
+After loading `VML`, you have the supported function listed below available to call, i.e. `VML.sin(rand(100))`. This should provide a significant speed-up over broadcasting the Base functions.
+```
+julia> using VML
+julia> a = rand(10000);
+julia>@time  sin.(a);                 
+0.159878 seconds (583.25 k allocations: 30.720 MiB, 2.78% gc time)
+julia> @time VML.sin(a);                                                                                
+0.000465 seconds (6 allocations: 781.484 KiB) 
+```
+
+Most function do currently (julia 1.x) not have a vectorized form, meaning that i.e. `sin(rand(10))` will not work.  If you would like to extend the Base function with this functionality you can overload them with the `@overload` macro:
+```
+julia> @overload sin
+julia> @time sin(a);                                                                                
+0.000485 seconds (6 allocations: 781.484 KiB) 
+```
+Note the lack of the broadcasting dot`.` Now calling i.e. `sin` with an array as input will call the VML functions. 
+
+#### Note:
+Some functions like `exp` and `log` do operate on matrices from Base and refer to the [matrix exponential](https://en.wikipedia.org/wiki/Matrix_exponential) and logarithm. Using `@overload exp` will overwrite this behaviour with element-wise exponentiation/ logarithm. 
+```
+julia> exp([1 1; 1 1.0])
+2×2 Array{Float64,2}:
+ 4.19453  3.19453
+ 3.19453  4.19453
+
+julia> VML.exp([1 1; 1 1.0])
+2×2 Array{Float64,2}:
+ 2.71828  2.71828
+ 2.71828  2.71828
+```
+
+### Accuracy
 
 By default, VML uses `VML_HA` mode, which corresponds to an accuracy of
 <1 ulp, matching the accuracy of Julia's built-in openlibm
@@ -30,7 +65,7 @@ regarding these options is available on
 [Intel's website](http://software.intel.com/sites/products/documentation/hpc/mkl/vml/vmldata.htm).
 
 ## Performance
-
+(These results are currently outdated and will be updated in due course)
 ![VML Performance Comparison](/benchmark/performance.png)
 
 ![VML Complex Performance Comparison](/benchmark/performance_complex.png)
@@ -43,14 +78,13 @@ VML use only a single core when performing these benchmarks.
 
 ## Supported functions
 
-VML.jl supports the following functions, currently for Float32 and
-Float64 only. While VML also offers operators for complex numbers,
-these are not yet implemented in VML.jl.
+VML.jl supports the following functions, most for Float32 and
+Float64, while some also take complex numbers.
 
 ### Unary functions
 
 Allocating forms have signature `f(A)`. Mutating forms have signatures
-`f!(A)` (in place) and `f!(out, A)` (out of place).
+`f!(A)` (in place) and `f!(out, A)` (out of place). The last 9 functions have been moved from Base to `SpecialFunctions.jl` or have no Base equivalent. 
 
 Allocating | Mutating
 -----------|---------
@@ -82,6 +116,8 @@ Allocating | Mutating
 `erfc`     | `erfc!`
 `erfinv`   | `erfinv!`
 `efcinv`   | `efcinv!`
+`gamma`    | `gamma!`
+`lgamma`   | `lgamma!`
 `inv_cbrt` | `inv_cbrt!`
 `inv_sqrt` | `inv_sqrt!`
 `pow2o3`   | `pow2o3!`
@@ -90,12 +126,32 @@ Allocating | Mutating
 ### Binary functions
 
 Allocating forms have signature `f(A, B)`. Mutating forms have
-signature `f!(out, A, B)`. These functions fall back on broadcasting
-when
+signature `f!(out, A, B)`. 
 
 Allocating | Mutating
 -----------|---------
-`atan2`    | `atan2!`
+`atan`    | `atan!`
 `hypot`    | `hypot!`
-`.^`       | `pow!`
-`./`       | `divide!`
+`pow`       | `pow!`
+`divide`       | `divide!`
+
+
+## Next steps
+Next steps for this package 
+* [x] Windows support
+* [x] Basic Testing
+* [x] Avoiding overloading base and optional overload function
+* [ ] Updating Benchmarks
+* [x] Travis and AppVeyor testing
+* [x] Adding CIS function
+* [ ] Add tests for mutating functions
+
+
+
+## Advanced 
+VML.jl works via Libdl which loads the relevant shared libraries. Libdl automatically finds the relevant libraries if the location of the binaries has been added to the system search paths. 
+This already taken care of if you use MKL.jl, but the stand-alone may require you to source `mklvars.sh`. The default command on Mac and Ubuntu is `source /opt/intel/mkl/bin/mklvars.sh intel64`. You may want to add this to your `.bashrc`. 
+Adding a new `*.conf` file in `/etc/ld.so.conf.d` also works, as the `intel-mkl-slim` package in the AUR does automatically. 
+
+Further, VML.jl uses [CpuId.jl](https://github.com/m-j-w/CpuId.jl) to detect if your processor supports the newer `avx2` instructions, and if not defaults to `libmkl_vml_avx`. If your system does not have AVX this package will currently not work for you.
+If the CPU feature detection does not work for you, please open an issue. 
diff --git a/REQUIRE b/REQUIRE
deleted file mode 100644
index d5d6467..0000000
--- a/REQUIRE
+++ /dev/null
@@ -1 +0,0 @@
-julia 0.4
diff --git a/appveyor.yml b/appveyor.yml
new file mode 100644
index 0000000..d8a05fe
--- /dev/null
+++ b/appveyor.yml
@@ -0,0 +1,45 @@
+
+environment:
+  matrix:
+  - julia_version: 1
+  - julia_version: nightly
+
+platform:
+  # - x86 # 32-bit
+  - x64 # 64-bit
+
+# # Uncomment the following lines to allow failures on nightly julia
+# # (tests will run but not make your overall status red)
+matrix:
+  allow_failures:
+  - julia_version: nightly
+
+branches:
+  only:
+    - master
+    # - AppVeyor
+
+notifications:
+  - provider: Email
+    on_build_success: false
+    on_build_failure: false
+    on_build_status_changed: false
+
+install:
+  - ps: iex ((new-object net.webclient).DownloadString("https://raw.githubusercontent.com/JuliaCI/Appveyor.jl/version-1/bin/install.ps1"))
+
+build_script:
+  - C:\julia\bin\julia -e "using Pkg;  Pkg.add(PackageSpec(url=\"https://github.com/JuliaComputing/MKL.jl\"));"
+  - echo "%JL_BUILD_SCRIPT%"
+  - C:\julia\bin\julia -e "%JL_BUILD_SCRIPT%"
+
+test_script:
+  - echo "%JL_TEST_SCRIPT%"
+  - C:\julia\bin\julia -e "%JL_TEST_SCRIPT%"
+
+# # Uncomment to support code coverage upload. Should only be enabled for packages
+# # which would have coverage gaps without running on Windows
+# on_success:
+#   - echo "%JL_CODECOV_SCRIPT%"
+#   - C:\julia\bin\julia -e "%JL_CODECOV_SCRIPT%"
+
diff --git a/benchmark/.gitignore b/benchmark/.gitignore
new file mode 100644
index 0000000..d390084
--- /dev/null
+++ b/benchmark/.gitignore
@@ -0,0 +1 @@
+*.jld2
diff --git a/benchmark/benchmark.jl b/benchmark/benchmark.jl
index 49a2410..c91e7da 100644
--- a/benchmark/benchmark.jl
+++ b/benchmark/benchmark.jl
@@ -1,87 +1,150 @@
-using Distributions, PyCall, PyPlot
-@pyimport matplotlib.gridspec as gridspec
+using VML
+using Distributions, Statistics, BenchmarkTools # for benchmark
+using Plots # for plotting
+using JLD2, FileIO # to save file
 
+cd(dirname(@__FILE__))
 include(joinpath(dirname(dirname(@__FILE__)), "test", "common.jl"))
+
+################################################################
 complex = !isempty(ARGS) && ARGS[1] == "complex"
+complex = false
+
+# First generate some random data and test functions in Base on it
+const NVALS = 10_000
+base_unary = complex ? base_unary_complex : base_unary_real
+base_binary = complex ? base_binary_complex : base_binary_real
+types = complex ? (Complex64, Complex128) : (Float32, Float64)
+
+# arrays of inputs are stored in a Tuple. So later for calling use inp... to get the content of the Tuple
+input = Dict( t =>
+[
+ [(randindomain(t, NVALS, domain),) for (_, _, domain) in base_unary];
+ [(randindomain(t, NVALS, domain1), randindomain(t, NVALS, domain2)) for (_, _, domain1, domain2) in base_binary];
+ # (randindomain(t, NVALS, (0, 100)), randindomain(t, 1, (-1, 20))[1])
+]
+    for t in types)
+
+fns = [[x[1:2] for x in base_unary_real];
+       [x[1:2] for x in base_binary_real]]
+################################################################
 
+"""
+    bench(fns, input)
+
+benchmark function for VML.jl. Calls both Base and VML functions and stores the benchmarks in two nested Dict. First layer specifies type, and second layer specifies the function name. The result is a Tuple, 1st element being benchmark for Base/SpecialFunctions and 2nd element being for VML.
+
+# Examples
+```julia
+fns = [(:Base, :acos); (:Base, :atan); (:SpecialFunctions, :ref)] # array of tuples
+input = Dict( Float64 => [(rand(1000)); (rand(1000), rand(1000)); (rand(1000))]) # Dict of array of tuples
+times = bench(fns, input)
+
+times[Float64][:acos][1] # Base.acos benchmark for Float64
+times[Float64][:acos][2] # VML.acos benchmark for Float64
+```
+"""
 function bench(fns, input)
-    [t=>begin
-        times = Array(Vector{Float64}, length(fns))
-        for ifn = 1:length(fns)
-            fn = fns[ifn]
-            inp = input[t][ifn]
-            fn(inp...)
-            gc()
-            nrep = max(iceil(2/(@elapsed (gc_disable(); fn(inp...); gc_enable(); gc()))), 3)
-            println("Running $nrep reps of $fn($t)")
-            @time times[ifn] = [begin
-                gc()
-                gc_disable()
-                time = @elapsed fn(inp...)
-                gc_enable()
-                time
-            end for i = 1:nrep]
-            # println((mean(times[ifn]), std(times[ifn])))
-        end
-        times
-    end for t in types]
+    Dict(t => begin
+        Dict( fn[2] => begin
+            base_fn = eval(:($(fn[1]).$(fn[2])))
+            vml_fn = eval(:(VML.$(fn[2])))
+            println("benchmarking $vml_fn for type $t")
+            timesBase = @benchmark $base_fn.($inp...)
+            timesVML = @benchmark $vml_fn($inp...)
+
+            [timesBase, timesVML]
+        end for (fn, inp) in zip(fns, input[t]) )
+    end for t in types)
 end
+################################################################
+
+# do benchmark
+benches = bench(fns, input)
+
+@save "benchmarkData.jld" benches
+# @save "benchmarkData-complex.jld" benches
+
+# benches = load("benchmarkData.jld2", "benches")
+
 
-function ratioci(y, x, alpha=0.05)
+# something is wrong with these
+deleteat!(fns, [18, 31])
+delete!(benches[Float32], :lgamma)
+delete!(benches[Float64], :lgamma)
+delete!(benches[Float32], :log10)
+delete!(benches[Float64], :log10)
+
+################################################################
+
+"""
+benchmark analysis function
+"""
+function ratioci(y, x, alpha = 0.05)
     tq² = abs2(quantile(TDist(length(x) + length(y) - 2), alpha))
     μx = mean(x)
     σx² = varm(x, μx)
     μy = mean(y)
     σy² = varm(y, μy)
-    a = sqrt((μx*μy)^2 - (μx^2 - tq²*σx²)*(μy^2 - tq²*σy²))
-    b = μx^2 - tq²*σx²
-    (((μx*μy) - a)/b, ((μx*μy) + a)/b)
+    a = sqrt((μx * μy)^2 - (μx^2 - tq² * σx²) * (μy^2 - tq² * σy²))
+    b = μx^2 - tq² * σx²
+    return (((μx * μy) - a) / b, ((μx * μy) + a) / b)
 end
 
-# First generate some random data and test functions in Base on it
-const NVALS = 1_000_000
-base_unary = complex ? base_unary_complex : base_unary_real
-base_binary = complex ? base_binary_complex : base_binary_real
-types = complex ? (Complex64, Complex128) : (Float32, Float64)
-input = [t=>[[(randindomain(t, NVALS, domain),) for (fn, domain) in base_unary];
-             [(randindomain(t, NVALS, domain1), randindomain(t, NVALS, domain2))
-              for (fn, domain1, domain2) in base_binary];
-             (randindomain(t, NVALS, (0, 100)), randindomain(t, 1, (-1, 20))[1])]
-            for t in types]
-fns = [[x[1] for x in base_unary]; [x[1] for x in base_binary]; (complex ? [] : .^)]
+################################################################
 
-builtin = bench(fns, input)
+"""
+Does analysis of the benchmark data and plots them as bars.
+"""
+function plotBench()
 
-# Now with VML
-using VML
+# Print ratio
+    colors = [:blue, :red]
+    for itype = 1:length(types)
 
-vml = bench(fns, input)
+        # creating arrays of times from benchmarks
+        benchVals = collect(values(benches[types[itype]]))
+        builtint = [x[1].times for x in benchVals]
+        vmlt = [x[2].times for x in benchVals]
 
-# Print ratio
-clf()
-colors = ["r", "y"]
-for itype = 1:length(types)
-    builtint = builtin[types[itype]]
-    vmlt = vml[types[itype]]
-    μ = vec(map(mean, builtint)./map(mean, vmlt))
-    ci = zeros(Float64, 2, length(fns))
-    for ifn = 1:length(builtint)
-        lower, upper = ratioci(builtint[ifn], vmlt[ifn])
-        ci[1, ifn] = μ[ifn] - lower
-        ci[2, ifn] = upper - μ[ifn]
+        # calculating mean of run times
+        μ = vec(map(mean, builtint) ./ map(mean, vmlt))
+
+        # error bar disabled because benchmark tools takes care of errors
+
+        # # calculating error bars
+        # ci = zeros(Float64, 2, length(fns))
+        # for ifn = 1:length(builtint)
+        #     lower, upper = ratioci(builtint[ifn], vmlt[ifn])
+        #     ci[1, ifn] = μ[ifn] - lower
+        #     ci[2, ifn] = upper - μ[ifn]
+        # end
+
+        # adding bar
+        plt = bar!(
+            0.2+(0.4*itype):length(fns),
+            μ,
+            # yerror = ci,
+            fillcolor = colors[itype],
+            labels = [string(x) for x in types],
+            dpi = 600
+        )
     end
-    bar(0.2+(0.4*itype):length(fns), μ, 0.4, yerr=ci, color=colors[itype], ecolor="k")
-end
-ax = gca()
-ax[:set_xlim](0, length(fns)+1)
-fname = [string(fn.env.name) for fn in fns]
-if !complex
-    fname[end-1] = "A.^B"
-    fname[end] = "A.^b"
+    fname = [string(fn[2]) for fn in fns]
+    # if !complex
+    #     fname[end-1] = "A.^B"
+    #     fname[end] = "A.^b"
+    # end
+    xlims!(0, length(fns) + 1)
+    xticks!(1:length(fns)+1, fname, rotation = 70, fontsize = 10)
+    title!("VML Performance for array of size $NVALS")
+    ylabel!("Relative Speed (VML/Base)")
+    hline!([1], line=(4, :dash, 0.6, [:green]), labels = 1)
+    savefig("performance$(complex ? "_complex" : "").png")
+
 end
-xticks(1:length(fns)+1, fname, rotation=70, fontsize=10)
-title("VML Performance")
-ylabel("Relative Speed (Base/VML)")
-legend([string(x) for x in types])
-ax[:axhline](1; color="black", linestyle="--")
-savefig("performance$(complex ? "_complex" : "").png")
+
+################################################################
+
+# do plotting
+plotBench()
diff --git a/benchmark/performance.png b/benchmark/performance.png
index edf8a56..12985d6 100644
Binary files a/benchmark/performance.png and b/benchmark/performance.png differ
diff --git a/deps/.gitignore b/deps/.gitignore
new file mode 100644
index 0000000..9243a81
--- /dev/null
+++ b/deps/.gitignore
@@ -0,0 +1,2 @@
+deps.jl
+build.log
\ No newline at end of file
diff --git a/deps/build.jl b/deps/build.jl
new file mode 100644
index 0000000..1d48fdd
--- /dev/null
+++ b/deps/build.jl
@@ -0,0 +1,37 @@
+
+using CpuId
+using Libdl
+
+## this lets us load CpuId only once 
+
+if cpufeature(:AVX2)
+    libsuffix = :avx2
+    println("AVX2 support detected, vml_avx2 selected")
+else
+    libsuffix = :avx
+    println("AVX2 support missing, vml_avx selected")
+end
+
+if Sys.iswindows()
+    rtlib = :mkl_rt
+    corelib = :mkl_core
+    lib = Symbol(:mkl_vml_, libsuffix)
+else
+    rtlib = :libmkl_rt
+    corelib = :libmkl_core
+    lib = Symbol(:libmkl_vml_, libsuffix)
+end
+
+
+depsjl_path = joinpath(@__DIR__, "deps.jl")
+open(depsjl_path, "w") do depsjl_file
+    println(depsjl_file, strip("""
+        ## This file was autogenerated by build.jl.
+        ## Do not edit.
+        import Libdl
+
+        const lib = :$lib
+        const rtlib = :$rtlib
+        const corelib = :$corelib
+        """))
+end
diff --git a/src/VML.jl b/src/VML.jl
index 9a854d0..1200ee9 100644
--- a/src/VML.jl
+++ b/src/VML.jl
@@ -1,141 +1,48 @@
-module VML
-
-import Base: .^, ./
-
-# TODO detect CPU architecture
-const lib = :libmkl_vml_avx
-Libdl.dlopen(:libmkl_rt)
+__precompile__()
 
-immutable VMLAccuracy
-    mode::UInt
-end
-const VML_LA = VMLAccuracy(0x00000001)
-const VML_HA = VMLAccuracy(0x00000002)
-const VML_EP = VMLAccuracy(0x00000003)
-Base.show(io::IO, m::VMLAccuracy) = print(io, m == VML_LA ? "VML_LA" :
-                                              m == VML_HA ? "VML_HA" : "VML_EP")
-vml_get_mode() = ccall((:_vmlGetMode, lib), Cuint, ())
-vml_set_mode(mode::Integer) = (ccall((:_vmlSetMode, lib), Cuint, (UInt,), mode); nothing)
-
-vml_set_accuracy(m::VMLAccuracy) = vml_set_mode((vml_get_mode() & ~0x03) | m.mode)
-vml_get_accuracy() = VMLAccuracy(vml_get_mode() & 0x3)
-
-vml_set_mode((vml_get_mode() & ~0x0000FF00))
-function vml_check_error()
-    vml_error = ccall((:_vmlClearErrStatus, lib), Cint, ())
-    if vml_error != 0
-        if vml_error == 1
-            throw(DomainError())
-        elseif vml_error == 2 || vml_error == 3 || vml_error == 4
-            # Singularity, overflow, or underflow
-            # I don't think Base throws on these
-        elseif vml_error == 1000
-            warn("VML does not support $(vml_get_accuracy); lower accuracy used instead")
-        else
-            error("an unexpected error occurred in VML ($vml_error)")
-        end
-    end
-end
-
-function vml_prefix(t::DataType)
-    if t == Float32
-        return "_vmls"
-    elseif t == Float64
-        return "_vmld"
-    elseif t == Complex{Float32}
-        return "_vmlc"
-    elseif t == Complex{Float64}
-        return "_vmlz"
-    end
-    error("unknown type $t")
-end
+module VML
 
-function def_unary_op(tin, tout, jlname, jlname!, mklname)
-    mklfn = Base.Meta.quot(Symbol("$(vml_prefix(tin))$mklname"))
-    exports = Symbol[]
-    isa(jlname, Expr) || push!(exports, jlname)
-    isa(jlname!, Expr) || push!(exports, jlname!)
-    @eval begin
-        $(isempty(exports) ? nothing : Expr(:export, exports...))
-        function $(jlname!){N}(out::Array{$tout,N}, A::Array{$tin,N})
-            size(out) == size(A) || throw(DimensionMismatch())
-            ccall(($mklfn, lib), Void, (Int, Ptr{$tin}, Ptr{$tout}), length(A), A, out)
-            vml_check_error()
-            out
-        end
-        $(if tin == tout
-            quote
-                function $(jlname!)(A::Array{$tin})
-                    ccall(($mklfn, lib), Void, (Int, Ptr{$tin}, Ptr{$tout}), length(A), A, A)
-                    vml_check_error()
-                    A
-                end
-            end
-        end)
-        function $(jlname)(A::Array{$tin})
-            out = similar(A, $tout)
-            ccall(($mklfn, lib), Void, (Int, Ptr{$tin}, Ptr{$tout}), length(A), A, out)
-            vml_check_error()
-            out
-        end
-    end
-end
+# import Base: .^, ./
+using SpecialFunctions
+using Libdl
+include(joinpath(dirname(@__DIR__), "deps/deps.jl"))
 
-function def_binary_op(tin, tout, jlname, jlname!, mklname, broadcast)
-    mklfn = Base.Meta.quot(Symbol("$(vml_prefix(tin))$mklname"))
-    exports = Symbol[]
-    isa(jlname, Expr) || push!(exports, jlname)
-    isa(jlname!, Expr) || push!(exports, jlname!)
-    @eval begin
-        $(isempty(exports) ? nothing : Expr(:export, exports...))
-        function $(jlname!){N}(out::Array{$tout,N}, A::Array{$tin,N}, B::Array{$tin,N})
-            size(out) == size(A) == size(B) || $(broadcast ? :(return broadcast!($jlname, out, A, B)) : :(throw(DimensionMismatch())))
-            ccall(($mklfn, lib), Void, (Int, Ptr{$tin}, Ptr{$tin}, Ptr{$tout}), length(A), A, B, out)
-            vml_check_error()
-            out
-        end
-        function $(jlname){N}(A::Array{$tout,N}, B::Array{$tin,N})
-            size(A) == size(B) || $(broadcast ? :(return broadcast($jlname, A, B)) : :(throw(DimensionMismatch())))
-            out = similar(A)
-            ccall(($mklfn, lib), Void, (Int, Ptr{$tin}, Ptr{$tin}, Ptr{$tout}), length(A), A, B, out)
-            vml_check_error()
-            out
-        end
-    end
-end
+include("setup.jl")
 
-for t in (Float32, Float64, Complex64, Complex128)
+for t in (Float32, Float64, ComplexF32, ComplexF64)
     # Unary, real or complex
-    def_unary_op(t, t, :(Base.acos), :acos!, :Acos)
-    def_unary_op(t, t, :(Base.asin), :asin!, :Asin)
-    def_unary_op(t, t, :(Base.acosh), :acosh!, :Acosh)
-    def_unary_op(t, t, :(Base.asinh), :asinh!, :Asinh)
-    def_unary_op(t, t, :(Base.sqrt), :sqrt!, :Sqrt)
-    def_unary_op(t, t, :(Base.exp), :exp!, :Exp)
-    def_unary_op(t, t, :(Base.log), :log!, :Ln)
-
-    # Binary, real or complex
-    def_binary_op(t, t, :(.^), :pow!, :Pow, true)
-    def_binary_op(t, t, :(./), :divide!, :Div, true)
+    def_unary_op(t, t, :acos, :acos!, :Acos)
+    def_unary_op(t, t, :asin, :asin!, :Asin)
+    def_unary_op(t, t, :acosh, :acosh!, :Acosh)
+    def_unary_op(t, t, :asinh, :asinh!, :Asinh)
+    def_unary_op(t, t, :sqrt, :sqrt!, :Sqrt)
+    def_unary_op(t, t, :exp, :exp!, :Exp)
+    def_unary_op(t, t, :log, :log!, :Ln)
+
+    # # Binary, real or complex
+    def_binary_op(t, t, :pow, :pow!, :Pow, true)
+    def_binary_op(t, t, :divide, :divide!, :Div, true)
 end
 
 for t in (Float32, Float64)
     # Unary, real-only
-    def_unary_op(t, t, :(Base.cbrt), :cbrt!, :Cbrt)
-    def_unary_op(t, t, :(Base.expm1), :expm1!, :Expm1)
-    def_unary_op(t, t, :(Base.log1p), :log1p, :Log1p)
-    def_unary_op(t, t, :(Base.abs), :abs!, :Abs)
-    def_unary_op(t, t, :(Base.abs2), :abs2!, :Sqr)
-    def_unary_op(t, t, :(Base.ceil), :ceil!, :Ceil)
-    def_unary_op(t, t, :(Base.floor), :floor!, :Floor)
-    def_unary_op(t, t, :(Base.round), :round!, :Round)
-    def_unary_op(t, t, :(Base.trunc), :trunc!, :Trunc)
-    def_unary_op(t, t, :(Base.erf), :erf!, :Erf)
-    def_unary_op(t, t, :(Base.erfc), :erfc!, :Erfc)
-    def_unary_op(t, t, :(Base.erfinv), :erfinv!, :ErfInv)
-    def_unary_op(t, t, :(Base.erfcinv), :erfcinv!, :ErfcInv)
-    def_unary_op(t, t, :(Base.lgamma), :lgamma!, :LGamma)
-    def_unary_op(t, t, :(Base.gamma), :gamma!, :TGamma)
+    def_unary_op(t, t, :cbrt, :cbrt!, :Cbrt)
+    def_unary_op(t, t, :expm1, :expm1!, :Expm1)
+    def_unary_op(t, t, :log1p, :log1p!, :Log1p)
+    def_unary_op(t, t, :abs, :abs!, :Abs)
+    def_unary_op(t, t, :abs2, :abs2!, :Sqr)
+    def_unary_op(t, t, :ceil, :ceil!, :Ceil)
+    def_unary_op(t, t, :floor, :floor!, :Floor)
+    def_unary_op(t, t, :round, :round!, :Round)
+    def_unary_op(t, t, :trunc, :trunc!, :Trunc)
+
+    # now in SpecialFunctions (make smart, maybe?)
+    def_unary_op(t, t, :erf, :erf!, :Erf)
+    def_unary_op(t, t, :erfc, :erfc!, :Erfc)
+    def_unary_op(t, t, :erfinv, :erfinv!, :ErfInv)
+    def_unary_op(t, t, :erfcinv, :erfcinv!, :ErfcInv)
+    def_unary_op(t, t, :lgamma, :lgamma!, :LGamma)
+    def_unary_op(t, t, :gamma, :gamma!, :TGamma)
     # Not in Base
     def_unary_op(t, t, :inv_cbrt, :inv_cbrt!, :InvCbrt)
     def_unary_op(t, t, :inv_sqrt, :inv_sqrt!, :InvSqrt)
@@ -144,53 +51,97 @@ for t in (Float32, Float64)
 
     # Enabled only for Real. MKL guarantees higher accuracy, but at a
     # substantial performance cost.
-    def_unary_op(t, t, :(Base.atan), :atan!, :Atan)
-    def_unary_op(t, t, :(Base.cos), :cos!, :Cos)
-    def_unary_op(t, t, :(Base.sin), :sin!, :Sin)
-    def_unary_op(t, t, :(Base.tan), :tan!, :Tan)
-    def_unary_op(t, t, :(Base.atanh), :atanh!, :Atanh)
-    def_unary_op(t, t, :(Base.cosh), :cosh!, :Cosh)
-    def_unary_op(t, t, :(Base.sinh), :sinh!, :Sinh)
-    def_unary_op(t, t, :(Base.tanh), :tanh!, :Tanh)
-    def_unary_op(t, t, :(Base.log10), :log10!, :Log10)
-
-    # .^ to scalar power
-    mklfn = Base.Meta.quot(Symbol("$(vml_prefix(t))Powx"))
-    @eval begin
-        export pow!
-        function pow!{N}(out::Array{$t,N}, A::Array{$t,N}, b::$t)
-            size(out) == size(A) || throw(DimensionMismatch())
-            ccall(($mklfn, lib), Void, (Int, Ptr{$t}, $t, Ptr{$t}), length(A), A, b, out)
-            vml_check_error()
-            out
-        end
-        function (.^){N}(A::Array{$t,N}, b::$t)
-            out = similar(A)
-            ccall(($mklfn, lib), Void, (Int, Ptr{$t}, $t, Ptr{$t}), length(A), A, b, out)
-            vml_check_error()
-            out
-        end
-    end
-
-    # Binary, real-only
-    def_binary_op(t, t, :(Base.atan2), :atan2!, :Atan2, false)
-    def_binary_op(t, t, :(Base.hypot), :hypot!, :Hypot, false)
+    def_unary_op(t, t, :atan, :atan!, :Atan)
+    def_unary_op(t, t, :cos, :cos!, :Cos)
+    def_unary_op(t, t, :sin, :sin!, :Sin)
+    def_unary_op(t, t, :tan, :tan!, :Tan)
+    def_unary_op(t, t, :atanh, :atanh!, :Atanh)
+    def_unary_op(t, t, :cosh, :cosh!, :Cosh)
+    def_unary_op(t, t, :sinh, :sinh!, :Sinh)
+    def_unary_op(t, t, :tanh, :tanh!, :Tanh)
+    def_unary_op(t, t, :log10, :log10!, :Log10)
+
+    # # .^ to scalar power
+    # mklfn = Base.Meta.quot(Symbol("$(vml_prefix(t))Powx"))
+    # @eval begin
+    #     export pow!
+    #     function pow!{N}(out::Array{$t,N}, A::Array{$t,N}, b::$t)
+    #         size(out) == size(A) || throw(DimensionMismatch())
+    #         ccall(($mklfn, lib), Nothing, (Int, Ptr{$t}, $t, Ptr{$t}), length(A), A, b, out)
+    #         vml_check_error()
+    #         out
+    #     end
+    #     function (.^){N}(A::Array{$t,N}, b::$t)
+    #         out = similar(A)
+    #         ccall(($mklfn, lib), Nothing, (Int, Ptr{$t}, $t, Ptr{$t}), length(A), A, b, out)
+    #         vml_check_error()
+    #         out
+    #     end
+    # end
+
+    # # Binary, real-only
+    def_binary_op(t, t, :atan, :atan!, :Atan2, false)
+    def_binary_op(t, t, :hypot, :hypot!, :Hypot, false)
 
     # Unary, complex-only
-    def_unary_op(t, Complex{t}, :(Base.cis), :cis!, :CIS)
-    # def_unary_op(Complex{t}, Complex{t}, :(Base.conj), :conj!, :Conj)
-    def_unary_op(Complex{t}, t, :(Base.abs), :abs!, :Abs)
-    def_unary_op(Complex{t}, t, :(Base.angle), :angle!, :Arg)
+    def_unary_op(Complex{t}, Complex{t}, :conj, :conj!, :Conj)
+    def_unary_op(Complex{t}, t, :abs, :abs!, :Abs)
+    def_unary_op(Complex{t}, t, :angle, :angle!, :Arg)
+
+    ### cis is special, vml function is based on output
+    def_unary_op(t, Complex{t}, :cis, :cis!, :CIS; vmltype = Complex{t})
 
     # Binary, complex-only. These are more accurate but performance is
     # either equivalent to Base or slower.
-    # def_binary_op(Complex{t}, Complex{t}, :(Base.(:+)), :add!, :Add, false)
-    # def_binary_op(Complex{t}, Complex{t}, :(Base.(:.+)), :add!, :Add, true)
-    # def_binary_op(Complex{t}, Complex{t}, :(Base.(:.*)), :multiply!, :Mul, true)
-    # def_binary_op(Complex{t}, Complex{t}, :(Base.(:-)), :subtract!, :Sub, false)
-    # def_binary_op(Complex{t}, Complex{t}, :(Base.(:.-)), :subtract!, :Sub, true)
+    # def_binary_op(Complex{t}, Complex{t}, (:+), :add!, :Add, false)
+    # def_binary_op(Complex{t}, Complex{t}, (:.+), :add!, :Add, true)
+    # def_binary_op(Complex{t}, Complex{t}, (:.*), :multiply!, :Mul, true)
+    # def_binary_op(Complex{t}, Complex{t}, (:-), :subtract!, :Sub, false)
+    # def_binary_op(Complex{t}, Complex{t}, (:.-), :subtract!, :Sub, true)
     # def_binary_op(Complex{t}, Complex{t}, :multiply_conj, :multiply_conj!, :Mul, false)
 end
 
-export VML_LA, VML_HA, VML_EP, vml_set_accuracy, vml_get_accuracy
+"""
+    @overload exp log sin
+
+This macro adds a method to each function in `Base` (or perhaps in `SpecialFunctions`), 
+so that when acting on an array (or two arrays) it calls the `VML` function of the same name.
+
+The existing action on scalars is unaffected. However, `exp(M::Matrix)` will now mean
+element-wise `VML.exp(M) == exp.(M)`, rather than matrix exponentiation.
+"""
+macro overload(funs...)
+    out = quote end
+    say = []
+    for f in funs
+        if f in _UNARY
+            if isdefined(Base, f)
+                push!(out.args, :( Base.$f(A::Array) = VML.$f(A) ))
+                push!(say, "Base.$f(A)")
+            elseif isdefined(SpecialFunctions, f)
+                push!(out.args, :( VML.SpecialFunctions.$f(A::Array) = VML.$f(A) ))
+                push!(say, "SpecialFunctions.$f(A)")
+            else
+                @error "function VML.$f is not defined in Base or SpecialFunctions, so there is nothing to overload"
+            end
+        end
+        if f in _BINARY
+            if isdefined(Base, f)
+                push!(out.args, :( Base.$f(A::Array, B::Array) = VML.$f(A, B) ))
+                push!(say, "Base.$f(A, B)")
+            else
+                @error "function VML.$f is not defined in Base, so there is nothing to overload"
+            end
+        end
+        if !(f in _UNARY) && !(f in _BINARY)
+            error("there is no function $f defined by VML.jl")
+        end
+    end
+    str = string("Overloaded these functions: \n  ", join(say, " \n  "))
+    push!(out.args, str)
+    esc(out)
+end
+
+export VML_LA, VML_HA, VML_EP, vml_set_accuracy, vml_get_accuracy, @overload
+
 end
diff --git a/src/setup.jl b/src/setup.jl
new file mode 100644
index 0000000..c201423
--- /dev/null
+++ b/src/setup.jl
@@ -0,0 +1,124 @@
+
+function __init__()
+    MKLpkgid = Base.PkgId(Base.UUID("33e6dc65-8f57-5167-99aa-e5a354878fb2"), "MKL")
+    mklpath = Base.locate_package(MKLpkgid)
+    if !isempty(mklpath)
+        libpath = normpath(joinpath(dirname(mklpath), "../deps/usr/lib"))
+        push!(Libdl.DL_LOAD_PATH, libpath)
+    elseif isempty(Libdl.find_library(rtlib))
+        error("Could not find MKL shared libraries. Please add MKL.jl or install MKL via the intel website. See the github repository for more details.)")
+    end
+
+    Libdl.dlopen(rtlib, RTLD_GLOBAL)
+    Libdl.dlopen(corelib, RTLD_GLOBAL) # maybe only needed on mac
+    Libdl.dlopen(lib, RTLD_GLOBAL)
+
+end
+
+__init__()
+
+struct VMLAccuracy
+    mode::UInt
+end
+
+const VML_LA = VMLAccuracy(0x00000001)
+const VML_HA = VMLAccuracy(0x00000002)
+const VML_EP = VMLAccuracy(0x00000003)
+
+const _UNARY = [] # for @overload to check
+const _BINARY = []
+
+Base.show(io::IO, m::VMLAccuracy) = print(io, m == VML_LA ? "VML_LA" :
+                                              m == VML_HA ? "VML_HA" : "VML_EP")
+vml_get_mode() = ccall((:_vmlGetMode, lib), Cuint, ())
+vml_set_mode(mode::Integer) = (ccall((:_vmlSetMode, lib), Cuint, (UInt,), mode); nothing)
+
+vml_set_accuracy(m::VMLAccuracy) = vml_set_mode((vml_get_mode() & ~0x03) | m.mode)
+vml_get_accuracy() = VMLAccuracy(vml_get_mode() & 0x3)
+
+vml_set_mode((vml_get_mode() & ~0x0000FF00))
+function vml_check_error()
+    vml_error = ccall((:_vmlClearErrStatus, lib), Cint, ())
+    if vml_error != 0
+        if vml_error == 1
+            throw(DomainError(-1, "This function does not support arguments outside its domain"))
+        elseif vml_error == 2 || vml_error == 3 || vml_error == 4
+            # Singularity, overflow, or underflow
+            # I don't think Base throws on these
+        elseif vml_error == 1000
+            warn("VML does not support $(vml_get_accuracy); lower accuracy used instead")
+        else
+            error("an unexpected error occurred in VML ($vml_error)")
+        end
+    end
+end
+
+function vml_prefix(t::DataType)
+    if t == Float32
+        return "_vmls"
+    elseif t == Float64
+        return "_vmld"
+    elseif t == Complex{Float32}
+        return "_vmlc"
+    elseif t == Complex{Float64}
+        return "_vmlz"
+    end
+    error("unknown type $t")
+end
+
+function def_unary_op(tin, tout, jlname, jlname!, mklname; 
+        vmltype = tin)
+    mklfn = Base.Meta.quot(Symbol("$(vml_prefix(vmltype))$mklname"))
+    exports = Symbol[]
+    (@isdefined jlname) || push!(exports, jlname)
+    (@isdefined jlname!) || push!(exports, jlname!)
+    push!(_UNARY, jlname)
+    @eval begin
+        function ($jlname!)(out::Array{$tout,N}, A::Array{$tin,N}) where {N}
+            size(out) == size(A) || throw(DimensionMismatch())
+            ccall(($mklfn, lib), Nothing, (Int, Ptr{$tin}, Ptr{$tout}), length(A), A, out)
+            vml_check_error()
+            return out
+        end
+        $(if tin == tout
+            quote
+                function $(jlname!)(A::Array{$tin})
+                    ccall(($mklfn, lib), Nothing, (Int, Ptr{$tin}, Ptr{$tout}), length(A), A, A)
+                    vml_check_error()
+                    return A
+                end
+            end
+        end)
+        function ($jlname)(A::Array{$tin})
+            out = similar(A, $tout)
+            ccall(($mklfn, lib), Nothing, (Int, Ptr{$tin}, Ptr{$tout}), length(A), A, out)
+            vml_check_error()
+            return out
+        end
+        $(isempty(exports) ? nothing : Expr(:export, exports...))
+    end
+end
+
+function def_binary_op(tin, tout, jlname, jlname!, mklname, broadcast)
+    mklfn = Base.Meta.quot(Symbol("$(vml_prefix(tin))$mklname"))
+    exports = Symbol[]
+    (@isdefined jlname) || push!(exports, jlname)
+    (@isdefined jlname!) || push!(exports, jlname!)
+    push!(_BINARY, jlname)
+    @eval begin
+        $(isempty(exports) ? nothing : Expr(:export, exports...))
+        function ($jlname!)(out::Array{$tout,N}, A::Array{$tin,N}, B::Array{$tin,N}) where {N}
+            size(out) == size(A) == size(B) || $(broadcast ? :(return broadcast!($jlname, out, A, B)) : :(throw(DimensionMismatch())))
+            ccall(($mklfn, lib), Nothing, (Int, Ptr{$tin}, Ptr{$tin}, Ptr{$tout}), length(A), A, B, out)
+            vml_check_error()
+            return out
+        end
+        function ($jlname)(A::Array{$tout,N}, B::Array{$tin,N}) where {N}
+            size(A) == size(B) || $(broadcast ? :(return broadcast($jlname, A, B)) : :(throw(DimensionMismatch())))
+            out = similar(A)
+            ccall(($mklfn, lib), Nothing, (Int, Ptr{$tin}, Ptr{$tin}, Ptr{$tout}), length(A), A, B, out)
+            vml_check_error()
+            return out
+        end
+    end
+end
diff --git a/test/Manifest.toml b/test/Manifest.toml
new file mode 100644
index 0000000..636077b
--- /dev/null
+++ b/test/Manifest.toml
@@ -0,0 +1,174 @@
+# This file is machine-generated - editing it directly is not advised
+
+[[Base64]]
+uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
+
+[[BinDeps]]
+deps = ["Compat", "Libdl", "SHA", "URIParser"]
+git-tree-sha1 = "12093ca6cdd0ee547c39b1870e0c9c3f154d9ca9"
+uuid = "9e28174c-4ba2-5203-b857-d8d62c4213ee"
+version = "0.8.10"
+
+[[BinaryProvider]]
+deps = ["Libdl", "SHA"]
+git-tree-sha1 = "5b08ed6036d9d3f0ee6369410b830f8873d4024c"
+uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232"
+version = "0.5.8"
+
+[[BufferedStreams]]
+deps = ["Compat", "Test"]
+git-tree-sha1 = "5d55b9486590fdda5905c275bb21ce1f0754020f"
+uuid = "e1450e63-4bb3-523b-b2a4-4ffa8c0fd77d"
+version = "1.0.0"
+
+[[Compat]]
+deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"]
+git-tree-sha1 = "ed2c4abadf84c53d9e58510b5fc48912c2336fbb"
+uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
+version = "2.2.0"
+
+[[CpuId]]
+deps = ["Markdown", "Test"]
+git-tree-sha1 = "f0464e499ab9973b43c20f8216d088b61fda80c6"
+uuid = "adafc99b-e345-5852-983c-f28acb93d879"
+version = "0.2.2"
+
+[[Dates]]
+deps = ["Printf"]
+uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
+
+[[DelimitedFiles]]
+deps = ["Mmap"]
+uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab"
+
+[[Distributed]]
+deps = ["Random", "Serialization", "Sockets"]
+uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
+
+[[HTTPClient]]
+deps = ["Compat", "LibCURL"]
+git-tree-sha1 = "161d5776ae8e585ac0b8c20fb81f17ab755b3671"
+uuid = "0862f596-cf2d-50af-8ef4-f2be67dfa83f"
+version = "0.2.1"
+
+[[InteractiveUtils]]
+deps = ["Markdown"]
+uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
+
+[[LibCURL]]
+deps = ["BinaryProvider", "Libdl"]
+git-tree-sha1 = "fd5fc15f2a04608fe1435a769dbbfc7959ff1daa"
+uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21"
+version = "0.5.2"
+
+[[LibExpat]]
+deps = ["Compat"]
+git-tree-sha1 = "fde352ec13479e2f90e57939da2440fb78c5e388"
+uuid = "522f3ed2-3f36-55e3-b6df-e94fee9b0c07"
+version = "0.5.0"
+
+[[LibGit2]]
+uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
+
+[[Libdl]]
+uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
+
+[[Libz]]
+deps = ["BufferedStreams", "Random", "Test"]
+git-tree-sha1 = "d405194ffc0293c3519d4f7251ce51baac9cc871"
+uuid = "2ec943e9-cfe8-584d-b93d-64dcb6d567b7"
+version = "1.0.0"
+
+[[LinearAlgebra]]
+deps = ["Libdl"]
+uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+
+[[Logging]]
+uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
+
+[[MKL]]
+deps = ["BinaryProvider", "Libdl", "LinearAlgebra", "PackageCompiler", "Test"]
+git-tree-sha1 = "22e1063dfa595b24956d7b364c33a4f45e70e247"
+repo-rev = "master"
+repo-url = "https://github.com/JuliaComputing/MKL.jl.git"
+uuid = "33e6dc65-8f57-5167-99aa-e5a354878fb2"
+version = "0.0.0"
+
+[[Markdown]]
+deps = ["Base64"]
+uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
+
+[[Mmap]]
+uuid = "a63ad114-7e13-5084-954f-fe012c677804"
+
+[[PackageCompiler]]
+deps = ["Libdl", "Pkg", "REPL", "Serialization", "UUIDs", "WinRPM"]
+git-tree-sha1 = "0b5dccd6520dd7072d1d2a7fb22501a98f42e691"
+uuid = "9b87118b-4619-50d2-8e1e-99f35a4d4d9d"
+version = "0.6.4"
+
+[[Pkg]]
+deps = ["Dates", "LibGit2", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"]
+uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
+
+[[Printf]]
+deps = ["Unicode"]
+uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
+
+[[REPL]]
+deps = ["InteractiveUtils", "Markdown", "Sockets"]
+uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
+
+[[Random]]
+deps = ["Serialization"]
+uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+
+[[SHA]]
+uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
+
+[[Serialization]]
+uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
+
+[[SharedArrays]]
+deps = ["Distributed", "Mmap", "Random", "Serialization"]
+uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383"
+
+[[Sockets]]
+uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
+
+[[SparseArrays]]
+deps = ["LinearAlgebra", "Random"]
+uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
+
+[[SpecialFunctions]]
+deps = ["BinDeps", "BinaryProvider", "Libdl"]
+git-tree-sha1 = "3bdd374b6fd78faf0119b8c5d538788dbf910c6e"
+uuid = "276daf66-3868-5448-9aa4-cd146d93841b"
+version = "0.8.0"
+
+[[Statistics]]
+deps = ["LinearAlgebra", "SparseArrays"]
+uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
+
+[[Test]]
+deps = ["Distributed", "InteractiveUtils", "Logging", "Random"]
+uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+
+[[URIParser]]
+deps = ["Test", "Unicode"]
+git-tree-sha1 = "6ddf8244220dfda2f17539fa8c9de20d6c575b69"
+uuid = "30578b45-9adc-5946-b283-645ec420af67"
+version = "0.4.0"
+
+[[UUIDs]]
+deps = ["Random", "SHA"]
+uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
+
+[[Unicode]]
+uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
+
+[[WinRPM]]
+deps = ["BinDeps", "Compat", "HTTPClient", "LibExpat", "Libdl", "Libz", "URIParser"]
+git-tree-sha1 = "2a889d320f3b77d17c037f295859fe570133cfbf"
+uuid = "c17dfb99-b4f7-5aad-8812-456da1ad7187"
+version = "0.4.2"
diff --git a/test/Project.toml b/test/Project.toml
new file mode 100644
index 0000000..f0407c7
--- /dev/null
+++ b/test/Project.toml
@@ -0,0 +1,6 @@
+[deps]
+CpuId = "adafc99b-e345-5852-983c-f28acb93d879"
+Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
+MKL = "33e6dc65-8f57-5167-99aa-e5a354878fb2"
+SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
diff --git a/test/common.jl b/test/common.jl
index c437e43..de54dab 100644
--- a/test/common.jl
+++ b/test/common.jl
@@ -1,72 +1,75 @@
+using SpecialFunctions
 const base_unary_real = (
-    (Base.acos, (-1, 1)),
-    (Base.asin, (-1, 1)),
-    (Base.atan, (-50, 50)),
-    (Base.cos, (-1000, 1000)),
-    (Base.sin, (-1000, 1000)),
-    (Base.tan, (-1000, 1000)),
-    (Base.acosh, (1, 1000)),
-    (Base.asinh, (-1000, 1000)),
-    (Base.atanh, (-1, 1)),
-    (Base.cosh, (0, 89.415985f0)),
-    (Base.sinh, (-89.415985f0, 89.415985f0)),
-    (Base.tanh, (-8.66434f0, 8.66434f0)),
-    (Base.cbrt, (-1000, 1000)),
-    (Base.sqrt, (0, 1000)),
-    (Base.exp, (-88.72284f0, 88.72284f0)),
-    (Base.expm1, (-88.72284f0, 88.72284f0)),
-    (Base.log, (0, 1000)),
-    (Base.log10, (0, 1000)),
-    (Base.log1p, (-1, 1000)),
-    (Base.abs, (-1000, 1000)),
-    (Base.abs2, (-1000, 1000)),
-    (Base.ceil, (-1000, 1000)),
-    (Base.floor, (-1000, 1000)),
-    (Base.round, (-1000, 1000)),
-    (Base.trunc, (-1000, 1000)),
-    (Base.erf, (-3.8325067f0, 3.8325067f0)),
-    (Base.erfc, (-3.7439213f0, 10.019834f0)),
-    (Base.erfinv, (-1, 1)),
-    (Base.erfcinv, (0, 2)),
-    (Base.lgamma, (0, 1000)),
-    (Base.gamma, (0, 36))
+    (:Base, :acos, (-1, 1)),
+    (:Base, :asin, (-1, 1)),
+    (:Base, :atan, (-50, 50)),
+    (:Base, :cos, (-1000, 1000)),
+    (:Base, :sin, (-1000, 1000)),
+    (:Base, :tan, (-1000, 1000)),
+    (:Base, :acosh, (1, 1000)),
+    (:Base, :asinh, (-1000, 1000)),
+    (:Base, :atanh, (-1, 1)),
+    (:Base, :cosh, (0, 89.415985f0)),
+    (:Base, :sinh, (-89.415985f0, 89.415985f0)),
+    (:Base, :tanh, (-8.66434f0, 8.66434f0)),
+    (:Base, :cbrt, (-1000, 1000)),
+    (:Base, :sqrt, (0, 1000)),
+    (:Base, :exp, (-88.72284f0, 88.72284f0)),
+    (:Base, :expm1, (-88.72284f0, 88.72284f0)),
+    (:Base, :log, (0, 1000)),
+    (:Base, :log10, (0, 1000)),
+    (:Base, :log1p, (-1, 1000)),
+    (:Base, :abs, (-1000, 1000)),
+    (:Base, :abs2, (-1000, 1000)),
+    (:Base, :ceil, (-1000, 1000)),
+    (:Base, :floor, (-1000, 1000)),
+    (:Base, :round, (-1000, 1000)),
+    (:Base, :trunc, (-1000, 1000)),
+    (:Base, :cis, (-1000, 1000)),
+    (:SpecialFunctions, :erf, (-3.8325067f0, 3.8325067f0)),
+    (:SpecialFunctions, :erfc, (-3.7439213f0, 10.019834f0)),
+    (:SpecialFunctions, :erfinv, (-1, 1)),
+    (:SpecialFunctions, :erfcinv, (0, 2)),
+    (:SpecialFunctions, :lgamma, (0, 1000)),
+    (:SpecialFunctions, :gamma, (0, 36))
 )
 
 const base_binary_real = (
-    (Base.atan2, (-1, 1), (-1, 1)),
-    (Base.hypot, (-1000, 1000), (-1000, 1000)),
-    (getfield(Base, :./), (-1000, 1000), (-1000, 1000)),
-    (getfield(Base, :.^), (0, 100), (-5, 20))
+    (:Base, :atan, (-1, 1), (-1, 1)),
+    (:Base, :hypot, (-1000, 1000), (-1000, 1000)),
+    # (getfield(Base, :./), (-1000, 1000), (-1000, 1000)),
+    # (getfield(Base, :.^), (0, 100), (-5, 20))
 )
 
 const base_unary_complex = (
-    (Base.acos, (-1, 1)),
-    (Base.asin, (-1, 1)),
-    # (Base.atan, (-50, 50)),
-    # (Base.cos, (-10, 10)),
-    # (Base.sin, (-10, 10)),
-    # (Base.tan, (-10, 10)),
-    (Base.acosh, (1, 1000)),
-    (Base.asinh, (-1000, 1000)),
-    # (Base.atanh, (-1, 1)),
-    # (Base.cosh, (0, 89.415985f0)),
-    # (Base.sinh, (-89.415985f0, 89.415985f0)),
-    # (Base.tanh, (-8.66434f0, 8.66434f0)),
-    (Base.sqrt, (0, 1000)),
-    (Base.exp, (-88.72284f0, 88.72284f0)),
-    (Base.log, (0, 1000)),
-    # (Base.log10, (0, 1000)),
-    (Base.abs, (-1000, 1000)),
-    (Base.angle, (-1000, 1000))
-    # (Base.conj, (-1000, 1000))
+    (:Base, :acos, (-1, 1)),
+    (:Base, :asin, (-1, 1)),
+    (:Base, :acosh, (1, 1000)),
+    (:Base, :asinh, (-1000, 1000)),
+    (:Base, :sqrt, (0, 1000)),
+    (:Base, :exp, (-88.72284f0, 88.72284f0)),
+    (:Base, :log, (0, 1000)),
+    (:Base, :abs, (-1000, 1000)),
+    (:Base, :angle, (-1000, 1000)),
+    (:Base, :conj, (-1000, 1000)),
+    # (atan, (-50, 50)),
+    # (cos, (-10, 10)),
+    # (sin, (-10, 10)),
+    # (tan, (-10, 10)),
+    # (atanh, (-1, 1)),
+    # (cosh, (0, 89.415985f0)),
+    # (sinh, (-89.415985f0, 89.415985f0)),
+    # (tanh, (-8.66434f0, 8.66434f0)),
+    # (log10, (0, 1000)),
+    # (cis, (-1000, 1000))
 )
 
-const base_binary_complex = (
-    (getfield(Base, :./), (-1000, 1000), (-1000, 1000)),
-    # (Base.(:.^), (0, 100), (-2, 10))
-)
+# const base_binary_complex = (
+#     # (getfield(Base, :./), (-1000, 1000), (-1000, 1000)),
+#     # ((:.^), (0, 100), (-2, 10))
+# )
 
-function randindomain{T<:Real}(t::Type{T}, n, domain)
+function randindomain(t::Type{T}, n, domain) where {T<:Real}
     d1 = convert(t, domain[1])
     d2 = convert(t, domain[2])
     ddiff = d2 - d1
@@ -78,7 +81,7 @@ function randindomain{T<:Real}(t::Type{T}, n, domain)
     v
 end
 
-function randindomain{T<:Complex}(t::Type{T}, n, domain)
+function randindomain(t::Type{T}, n, domain) where {T<:Complex}
     d1 = convert(t, domain[1])
     d2 = convert(t, domain[2])
     ddiff = d2 - d1
@@ -87,5 +90,6 @@ function randindomain{T<:Complex}(t::Type{T}, n, domain)
     for i = 1:length(v)
         v[i] = v[i]*ddiff+d1
     end
-    reinterpret(t, v)
+    v
+    # reinterpret(t, v)
 end
diff --git a/test/complex.jl b/test/complex.jl
index b7560b8..b5b1183 100644
--- a/test/complex.jl
+++ b/test/complex.jl
@@ -1,16 +1,30 @@
 # First generate some random data and test functions in Base on it
 const NVALS = 1000
-input = Dict(t=>[[(randindomain(t, NVALS, domain),) for (fn, domain) in base_unary_complex];
-             [(randindomain(t, NVALS, domain1), randindomain(t, NVALS, domain2))
-              for (fn, domain1, domain2) in base_binary_complex];
-             (randindomain(t, NVALS, (0, 100)), randindomain(t, 1, (-2, 10))[1])]
-            for t in (Complex64, Complex128))
-fns = [[x[1] for x in base_unary_complex]; [x[1] for x in base_binary_complex]; .^]
-output = Dict(t=>[fns[i](input[t][i]...) for i = 1:length(fns)] for t in (Complex64, Complex128))
-
-# Now test the same data with VML
-using VML
-for t in (Complex64, Complex128), i = 1:length(fns)
-    fn = fns[i]
-    Base.Test.test_approx_eq(output[t][i], fn(input[t][i]...), "Base $t $fn", "VML $t $fn")
+
+input = Dict(
+    t=>[ (randindomain(t, NVALS, domain),) for (_, _, domain) in base_unary_complex ]
+        for t in (ComplexF32, ComplexF64)
+)
+
+fns = [x[1:2] for x in base_unary_complex]
+
+# output = Dict(
+#     t=>[ fns[i](input[t][i]...) for i = 1:length(fns) ]
+#         for t in (ComplexF32, ComplexF64)
+# )
+
+@testset "Definitions and Comparison with Base for Complex" begin
+
+  for t in (ComplexF32, ComplexF64), i = 1:length(fns)
+
+    base_fn = eval(:($(fns[i][1]).$(fns[i][2]))) 
+    vml_fn = eval(:(VML.$(fns[i][2])))
+
+    Test.@test which(vml_fn, typeof(input[t][i])).module == VML
+
+    # Test.test_approx_eq(output[t][i], fn(input[t][i]...), "Base $t $fn", "VML $t $fn")
+    Test.@test vml_fn(input[t][i]...) ≈ base_fn.(input[t][i]...)
+
+  end
+
 end
diff --git a/test/real.jl b/test/real.jl
index d17bbc3..20980a7 100644
--- a/test/real.jl
+++ b/test/real.jl
@@ -1,25 +1,60 @@
 # First generate some random data and test functions in Base on it
 const NVALS = 1000
-input = Dict(t=>[[(randindomain(t, NVALS, domain),) for (fn, domain) in base_unary_real];
-             [(randindomain(t, NVALS, domain1), randindomain(t, NVALS, domain2))
-              for (fn, domain1, domain2) in base_binary_real];
-             (randindomain(t, NVALS, (0, 100)), randindomain(t, 1, (-5, 20))[1])]
-            for t in (Float32, Float64))
-fns = [[x[1] for x in base_unary_real]; [x[1] for x in base_binary_real]; .^]
-output = Dict(t=>[fns[i](input[t][i]...) for i = 1:length(fns)] for t in (Float32, Float64))
-
-# Now test the same data with VML
-using VML
-for t in (Float32, Float64), i = 1:length(fns)
-    fn = fns[i]
-    Base.Test.test_approx_eq(output[t][i], fn(input[t][i]...), "Base $t $fn", "VML $t $fn")
+
+input = Dict(
+    t=>[
+        [ (randindomain(t, NVALS, domain),) for (_, _, domain) in base_unary_real ];
+        [ (randindomain(t, NVALS, domain1), randindomain(t, NVALS, domain2))
+            for (_, _, domain1, domain2) in base_binary_real ]
+    ]
+    for t in (Float32, Float64)
+)
+
+fns = [[x[1:2] for x in base_unary_real]; [x[1:2] for x in base_binary_real]]
+
+# output = Dict(t=>[fns[i](input[t][i]...) for i = 1:length(fns)] for t in (Float32, Float64))
+
+@testset "Definitions and Comparison with Base for Reals" begin
+
+  for t in (Float32, Float64), i = 1:length(fns)
+
+    base_fn = eval(:($(fns[i][1]).$(fns[i][2]))) 
+    vml_fn = eval(:(VML.$(fns[i][2])))
+    # vml_fn! = eval(:(VML.$(fns[i][2])!))
+
+    Test.@test which(vml_fn, typeof(input[t][i])).module == VML
+
+    # Test.test_approx_eq(output[t][i], fn(input[t][i]...), "Base $t $fn", "VML $t $fn")
+    Test.@test vml_fn(input[t][i]...) ≈ base_fn.(input[t][i]...)
+
+  end
+
 end
 
-# Verify that we still throw DomainErrors
-Base.Test.@test_throws DomainError sqrt([-1.0])
+@testset "Error Handling and Settings" begin
 
-# Setting accuracy
-vml_set_accuracy(VML_LA)
-@assert vml_get_accuracy() == VML_LA
-vml_set_accuracy(VML_EP)
-@assert vml_get_accuracy() == VML_EP
+  # Verify that we still throw DomainErrors
+  Test.@test_throws DomainError VML.sqrt([-1.0])
+  Test.@test_throws DomainError VML.log([-1.0])
+
+  # Setting accuracy
+  vml_set_accuracy(VML_LA)
+  Test.@test vml_get_accuracy() == VML_LA
+
+  vml_set_accuracy(VML_EP)
+  Test.@test vml_get_accuracy() == VML_EP
+
+end
+
+@testset "@overload macro" begin
+
+    @test VML.exp([1.0]) ≈ exp.([1.0])
+    @test_throws MethodError Base.exp([1.0])
+    @test (@overload log exp) isa String
+    @test Base.exp([1.0]) ≈ exp.([1.0])
+
+    @test_throws MethodError Base.atan([1.0], [2.0])
+    @test (@overload atan) isa String
+    @test Base.atan([1.0], [2.0]) ≈ atan.([1.0], [2.0])
+
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 41cde16..fa7a2ad 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,3 +1,6 @@
+using Test
+using VML
+
 include("common.jl")
 include("real.jl")
 include("complex.jl")