Skip to content

Call xxxI instead of xxx to support strided layout. (2nd take) #58

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Jan 3, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name = "IntelVectorMath"
uuid = "c8ce9da6-5d36-5c03-b118-5a70151be7bc"
version = "0.4.2"
version = "0.4.3"

[deps]
MKL_jll = "856f044c-d86e-5d09-b602-aeab76dc8ba7"
Expand Down
7 changes: 6 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ julia> b = similar(a);

julia> @btime IVM.sin!(b, a); # in-place version
20.008 μs (0 allocations: 0 bytes)

julia> @views IVM.sin(a[1:2:end]) == b[1:2:end] # all IVM functions support 1d strided input
true
```

### Accuracy
Expand Down Expand Up @@ -247,6 +250,8 @@ Next steps for this package
IntelVectorMath.jl uses [CpuId.jl](https://github.com/m-j-w/CpuId.jl) to detect if your processor supports the newer `avx2` instructions, and if not defaults to `libmkl_vml_avx`. If your system does not have AVX this package will currently not work for you.
If the CPU feature detection does not work for you, please open an issue. -->

As a quick help to convert benchmark timings into operations-per-cycle, IntelVectorMath.jl
1. As a quick help to convert benchmark timings into operations-per-cycle, IntelVectorMath.jl
provides `vml_get_cpu_frequency()` which will return the *actual* current frequency of the
CPU in GHz.

2. Now all IVM functions accept inputs that could be reshaped to an 1d [strided array](https://docs.julialang.org/en/v1/manual/interfaces/#man-interface-strided-arrays).
39 changes: 27 additions & 12 deletions src/IntelVectorMath.jl
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,39 @@ for t in (Float32, Float64)
def_unary_op(t, t, :cbrt, :cbrt!, :Cbrt)
def_unary_op(t, t, :expm1, :expm1!, :Expm1)
def_unary_op(t, t, :log1p, :log1p!, :Log1p)
def_unary_op(t, t, :log2, :log2!, :Log2)
def_unary_op(t, t, :abs, :abs!, :Abs)
def_unary_op(t, t, :abs2, :abs2!, :Sqr)
def_unary_op(t, t, :ceil, :ceil!, :Ceil)
def_unary_op(t, t, :floor, :floor!, :Floor)
def_unary_op(t, t, :round, :round!, :Round)
def_unary_op(t, t, :trunc, :trunc!, :Trunc)

# Enabled only for Real. MKL guarantees higher accuracy, but at a
# substantial performance cost.
def_unary_op(t, t, :atan, :atan!, :Atan)
def_unary_op(t, t, :cos, :cos!, :Cos)
def_unary_op(t, t, :sin, :sin!, :Sin)
def_unary_op(t, t, :tan, :tan!, :Tan)
def_unary_op(t, t, :atanh, :atanh!, :Atanh)
def_unary_op(t, t, :cosh, :cosh!, :Cosh)
def_unary_op(t, t, :sinh, :sinh!, :Sinh)
def_unary_op(t, t, :tanh, :tanh!, :Tanh)
def_unary_op(t, t, :log10, :log10!, :Log10)

# Unary, real-only
def_unary_op(t, t, :cospi, :cospi!, :Cospi)
def_unary_op(t, t, :sinpi, :sinpi!, :Sinpi)
def_unary_op(t, t, :tanpi, :tanpi!, :Tanpi)
def_unary_op(t, t, :acospi, :acospi!, :Acospi)
def_unary_op(t, t, :asinpi, :asinpi!, :Asinpi)
def_unary_op(t, t, :atanpi, :atanpi!, :Atanpi)
def_unary_op(t, t, :cosd, :cosd!, :Cosd)
def_unary_op(t, t, :sind, :sind!, :Sind)
def_unary_op(t, t, :tand, :tand!, :Tand)

def_one2two_op(t, t, :sincos, :sincos!, :SinCos)

# now in SpecialFunctions (make smart, maybe?)
def_unary_op(t, t, :erf, :erf!, :Erf)
def_unary_op(t, t, :erfc, :erfc!, :Erfc)
Expand All @@ -55,18 +81,6 @@ for t in (Float32, Float64)
def_unary_op(t, t, :pow2o3, :pow2o3!, :Pow2o3)
def_unary_op(t, t, :pow3o2, :pow3o2!, :Pow3o2)

# Enabled only for Real. MKL guarantees higher accuracy, but at a
# substantial performance cost.
def_unary_op(t, t, :atan, :atan!, :Atan)
def_unary_op(t, t, :cos, :cos!, :Cos)
def_unary_op(t, t, :sin, :sin!, :Sin)
def_unary_op(t, t, :tan, :tan!, :Tan)
def_unary_op(t, t, :atanh, :atanh!, :Atanh)
def_unary_op(t, t, :cosh, :cosh!, :Cosh)
def_unary_op(t, t, :sinh, :sinh!, :Sinh)
def_unary_op(t, t, :tanh, :tanh!, :Tanh)
def_unary_op(t, t, :log10, :log10!, :Log10)

# # .^ to scalar power
# mklfn = Base.Meta.quot(Symbol("$(vml_prefix(t))Powx"))
# @eval begin
Expand All @@ -87,6 +101,7 @@ for t in (Float32, Float64)

# # Binary, real-only
def_binary_op(t, t, :atan, :atan!, :Atan2, false)
def_binary_op(t, t, :atanpi, :atanpi!, :Atan2pi, false)
def_binary_op(t, t, :hypot, :hypot!, :Hypot, false)

# Unary, complex-only
Expand Down
114 changes: 94 additions & 20 deletions src/setup.jl
Original file line number Diff line number Diff line change
Expand Up @@ -264,57 +264,131 @@ function vml_prefix(t::DataType)
error("unknown type $t")
end

if isdefined(Base, :_checkcontiguous)
alldense(@nospecialize(x)) = Base._checkcontiguous(Bool, x)
else
alldense(x) = x isa DenseArray
alldense(x::Base.ReshapedArray) = alldense(parent(x))
alldense(x::Base.FastContiguousSubArray) = alldense(parent(x))
alldense(x::Base.ReinterpretArray) = alldense(parent(x))
end
alldense(x, y, z...) = alldense(x) && alldense(y, z...)

if isdefined(Base, :merge_adjacent_dim)
const merge_adjacent_dim = Base.merge_adjacent_dim
else
merge_adjacent_dim(::Dims{0}, ::Dims{0}) = 1, 1, 0
merge_adjacent_dim(apsz::Dims{1}, apst::Dims{1}) = apsz[1], apst[1], 1
function merge_adjacent_dim(apsz::Dims{N}, apst::Dims{N}, n::Int = 1) where {N}
sz, st = apsz[n], apst[n]
while n < N
szₙ, stₙ = apsz[n+1], apst[n+1]
if sz == 1
sz, st = szₙ, stₙ
elseif stₙ == st * sz || szₙ == 1
sz *= szₙ
else
break
end
n += 1
end
return sz, st, n
end
end

getstrides(x...) = map(stride1, x)
function stride1(x::AbstractArray)
alldense(x) && return 1
ndims(x) == 1 && return stride(x, 1)
szs::Dims = size(x)
sts::Dims = strides(x)
_, st, n = merge_adjacent_dim(szs, sts)
n === ndims(x) && return st
throw(ArgumentError("only support vector like inputs"))
end

function def_unary_op(tin, tout, jlname, jlname!, mklname;
vmltype = tin)
mklfn = Base.Meta.quot(Symbol("$(vml_prefix(vmltype))$mklname"))
mklfn = Base.Meta.quot(Symbol("$(vml_prefix(vmltype))$(mklname)I"))
mklfndense = Base.Meta.quot(Symbol("$(vml_prefix(vmltype))$mklname"))
exports = Symbol[]
(@isdefined jlname) || push!(exports, jlname)
(@isdefined jlname!) || push!(exports, jlname!)
@eval begin
function ($jlname!)(out::Array{$tout}, A::Array{$tin})
function ($jlname!)(out::AbstractArray{$tout}, A::AbstractArray{$tin})
size(out) == size(A) || throw(DimensionMismatch())
ccall(($mklfn, MKL_jll.libmkl_rt), Nothing, (Int, Ptr{$tin}, Ptr{$tout}), length(A), A, out)
if alldense(out, A) || ((sts = getstrides(out, A)) == (1, 1))
ccall(($mklfndense, MKL_jll.libmkl_rt), Nothing, (Int, Ptr{$tin}, Ptr{$tout}), length(A), A, out)
else
stᵒ, stᴬ = sts
ccall(($mklfn, MKL_jll.libmkl_rt), Nothing, (Int, Ptr{$tin}, Int, Ptr{$tout}, Int), length(A), A, stᴬ, out, stᵒ)
end
vml_check_error()
return out
end
$(if tin == tout
quote
function $(jlname!)(A::Array{$tin})
ccall(($mklfn, MKL_jll.libmkl_rt), Nothing, (Int, Ptr{$tin}, Ptr{$tout}), length(A), A, A)
function $(jlname!)(A::AbstractArray{$tin})
if alldense(A) || ((sts = getstrides(A)) == (1,))
ccall(($mklfndense, MKL_jll.libmkl_rt), Nothing, (Int, Ptr{$tin}, Ptr{$tout}), length(A), A, A)
else
(stᴬ,) = sts
ccall(($mklfn, MKL_jll.libmkl_rt), Nothing, (Int, Ptr{$tin}, Int, Ptr{$tout}, Int), length(A), A, stᴬ, A, stᴬ)
end
vml_check_error()
return A
end
end
end)
function ($jlname)(A::Array{$tin})
out = similar(A, $tout)
ccall(($mklfn, MKL_jll.libmkl_rt), Nothing, (Int, Ptr{$tin}, Ptr{$tout}), length(A), A, out)
vml_check_error()
return out
end
($jlname)(A::AbstractArray{$tin}) = $(jlname!)(similar(A, $tout), A)
$(isempty(exports) ? nothing : Expr(:export, exports...))
end
end

function def_binary_op(tin, tout, jlname, jlname!, mklname, broadcast)
mklfn = Base.Meta.quot(Symbol("$(vml_prefix(tin))$mklname"))
mklfndense = Base.Meta.quot(Symbol("$(vml_prefix(tin))$mklname"))
mklfn = Base.Meta.quot(Symbol("$(vml_prefix(tin))$(mklname)I"))
exports = Symbol[]
(@isdefined jlname) || push!(exports, jlname)
(@isdefined jlname!) || push!(exports, jlname!)
@eval begin
$(isempty(exports) ? nothing : Expr(:export, exports...))
function ($jlname!)(out::Array{$tout}, A::Array{$tin}, B::Array{$tin})
size(out) == size(A) == size(B) || throw(DimensionMismatch("Input arrays and output array need to have the same size"))
ccall(($mklfn, MKL_jll.libmkl_rt), Nothing, (Int, Ptr{$tin}, Ptr{$tin}, Ptr{$tout}), length(A), A, B, out)
function ($jlname!)(out::AbstractArray{$tout}, A::AbstractArray{$tin}, B::AbstractArray{$tin})
size(A) == size(B) || throw(DimensionMismatch("Input arrays need to have the same size"))
size(out) == size(A) || throw(DimensionMismatch("Output array need to have the same size with input"))
if alldense(out, A, B) || ((sts = getstrides(out, A, B)) == (1, 1, 1))
ccall(($mklfndense, MKL_jll.libmkl_rt), Nothing, (Int, Ptr{$tin}, Ptr{$tin}, Ptr{$tout}), length(A), A, B, out)
else
stᵒ, stᴬ, stᴮ = sts
ccall(($mklfn, MKL_jll.libmkl_rt), Nothing, (Int, Ptr{$tin}, Int, Ptr{$tin}, Int, Ptr{$tout}, Int), length(A), A, stᴬ, B, stᴮ, out, stᵒ)
end
vml_check_error()
return out
end
function ($jlname)(A::Array{$tout}, B::Array{$tin})
size(A) == size(B) || throw(DimensionMismatch("Input arrays need to have the same size"))
out = similar(A)
ccall(($mklfn, MKL_jll.libmkl_rt), Nothing, (Int, Ptr{$tin}, Ptr{$tin}, Ptr{$tout}), length(A), A, B, out)
($jlname)(A::AbstractArray{$tin}, B::AbstractArray{$tin}) = ($jlname!)(similar(A, $tout), A, B)
end
end

function def_one2two_op(tin, tout, jlname, jlname!, mklname)
mklfndense = Base.Meta.quot(Symbol("$(vml_prefix(tin))$mklname"))
mklfn = Base.Meta.quot(Symbol("$(vml_prefix(tin))$(mklname)I"))
exports = Symbol[]
(@isdefined jlname) || push!(exports, jlname)
(@isdefined jlname!) || push!(exports, jlname!)
@eval begin
$(isempty(exports) ? nothing : Expr(:export, exports...))
function ($jlname!)(out1::AbstractArray{$tout}, out2::AbstractArray{$tout}, A::AbstractArray{$tin})
size(out1) == size(out2) || throw(DimensionMismatch("Output arrays need to have the same size"))
size(A) == size(out2) || throw(DimensionMismatch("Output array need to have the same size with input"))
if alldense(out1, out2, A) || ((sts = getstrides(out1, out2, A)) == (1, 1, 1))
ccall(($mklfndense, MKL_jll.libmkl_rt), Nothing, (Int, Ptr{$tin}, Ptr{$tin}, Ptr{$tout}), length(A), A, out1, out2)
else
st¹, st², stᴬ = sts
ccall(($mklfn, MKL_jll.libmkl_rt), Nothing, (Int, Ptr{$tin}, Int, Ptr{$tin}, Int, Ptr{$tout}, Int), length(A), A, stᴬ, out1, st¹, out2, st²)
end
vml_check_error()
return out
return out1, out2
end
($jlname)(A::AbstractArray{$tin}) = ($jlname!)(similar(A, $tout), similar(A, $tout), A)
end
end
10 changes: 10 additions & 0 deletions test/common.jl
Original file line number Diff line number Diff line change
@@ -1,11 +1,20 @@
using SpecialFunctions
const base_unary_real = (
(Base, :acos, (-1, 1)),
(Base, :acospi, (-1, 1)),
(Base, :asin, (-1, 1)),
(Base, :asinpi, (-1, 1)),
(Base, :atan, (-50, 50)),
(Base, :atanpi, (-50, 50)),
(Base, :cos, (-1000, 1000)),
(Base, :cosd, (-10000, 10000)),
(Base, :cospi, (-300, 300)),
(Base, :sin, (-1000, 1000)),
(Base, :sind, (-10000, 10000)),
(Base, :sinpi, (-300, 300)),
(Base, :tan, (-1000, 1000)),
(Base, :tand, (-10000, 10000)),
(Base, :tanpi, (-300, 300)),
(Base, :acosh, (1, 1000)),
(Base, :asinh, (-1000, 1000)),
(Base, :atanh, (-1, 1)),
Expand All @@ -17,6 +26,7 @@ const base_unary_real = (
(Base, :exp, (-88.72284f0, 88.72284f0)),
(Base, :expm1, (-88.72284f0, 88.72284f0)),
(Base, :log, (0, 1000)),
(Base, :log2, (0, 1000)),
(Base, :log10, (0, 1000)),
(Base, :log1p, (-1, 1000)),
(Base, :abs, (-1000, 1000)),
Expand Down
38 changes: 32 additions & 6 deletions test/real.jl
Original file line number Diff line number Diff line change
Expand Up @@ -19,32 +19,58 @@ const fns = [[x[1:2] for x in base_unary_real]; [x[1:2] for x in base_binary_rea
for t in (Float32, Float64), i = 1:length(fns)
inp = input[t][i]
mod, fn = fns[i]
base_fn = getproperty(mod, fn)
if fn === :acospi || fn === :asinpi || fn === :atanpi
fn′ = getproperty(mod, Symbol(string(fn)[1:end-2]))
base_fn = x -> oftype(x, fn′(widen(x))/pi)
elseif fn === :tanpi
base_fn = x -> oftype(x, Base.tan(widen(x)*pi))
else
base_fn = getproperty(mod, fn)
end
vml_fn = getproperty(IntelVectorMath, fn)
vml_fn! = getproperty(IntelVectorMath, Symbol(fn, "!"))

Test.@test parentmodule(vml_fn) == IntelVectorMath

# Test.test_approx_eq(output[t][i], fn(input[t][i]...), "Base $t $fn", "IntelVectorMath $t $fn")
baseres = base_fn.(inp...)
Test.@test vml_fn(inp...) ≈ base_fn.(inp...)
Test.@test vml_fn(inp...) ≈ baseres

# cis changes type (float to complex, does not have mutating function)
if length(inp) == 1
if fn != :cis
vml_fn!(inp[1])
Test.@test inp[1] ≈ baseres
temp = similar(inp[1], 2NVALS)
inp1′ = @views copyto!(temp[1:2:end], inp[1])
inp1″ = @views copyto!(temp[end:-2:1], inp[1])
for x in (inp[1], inp1′, inp1″)
vml_fn!(x)
Test.@test x ≈ baseres
end
end
elseif length(inp) == 2
out = similar(inp[1])
vml_fn!(out, inp...)
Test.@test out ≈ baseres
temp = similar(inp[1], 2NVALS)
x′ = @views copyto!(temp[1:2:end], inp[1])
y′ = @views copyto!(temp[end:-2:1], inp[2])
for (x, y) in (inp, (x′, y′))
vml_fn!(out, x, y)
Test.@test out ≈ baseres
end
end

end

end

@testset "sincos" begin
for t in (Float32, Float64)
a = randindomain(t, NVALS, (-1000, 1000))
s, c = IVM.sincos(a)
@test s ≈ IVM.sin(a)
@test c ≈ IVM.cos(a)
end
end

@testset "Error Handling and Settings" begin

# Verify that we still throw DomainErrors
Expand Down