JuliaMath · Crown421 · Jan 3, 2023 · Jan 1, 2023 · Jan 1, 2023 · Jan 1, 2023
diff --git a/Project.toml b/Project.toml
@@ -1,6 +1,6 @@
 name = "IntelVectorMath"
 uuid = "c8ce9da6-5d36-5c03-b118-5a70151be7bc"
-version = "0.4.2"
+version = "0.4.3"
 
 [deps]
 MKL_jll = "856f044c-d86e-5d09-b602-aeab76dc8ba7"

diff --git a/README.md b/README.md
@@ -55,6 +55,9 @@ julia> b = similar(a);
 
 julia> @btime IVM.sin!(b, a);  # in-place version
   20.008 μs (0 allocations: 0 bytes)
+
+julia> @views IVM.sin(a[1:2:end]) == b[1:2:end] # all IVM functions support 1d strided input
+true
 ```
 
 ### Accuracy
@@ -247,6 +250,8 @@ Next steps for this package
 IntelVectorMath.jl uses [CpuId.jl](https://github.com/m-j-w/CpuId.jl) to detect if your processor supports the newer `avx2` instructions, and if not defaults to `libmkl_vml_avx`. If your system does not have AVX this package will currently not work for you.
 If the CPU feature detection does not work for you, please open an issue. -->
 
-As a quick help to convert benchmark timings into operations-per-cycle, IntelVectorMath.jl
+1. As a quick help to convert benchmark timings into operations-per-cycle, IntelVectorMath.jl
 provides `vml_get_cpu_frequency()` which will return the *actual* current frequency of the
 CPU in GHz.
+
+2. Now all IVM functions accept inputs that could be reshaped to an 1d [strided array](https://docs.julialang.org/en/v1/manual/interfaces/#man-interface-strided-arrays).
diff --git a/src/IntelVectorMath.jl b/src/IntelVectorMath.jl
@@ -35,13 +35,39 @@ for t in (Float32, Float64)
     def_unary_op(t, t, :cbrt, :cbrt!, :Cbrt)
     def_unary_op(t, t, :expm1, :expm1!, :Expm1)
     def_unary_op(t, t, :log1p, :log1p!, :Log1p)
+    def_unary_op(t, t, :log2, :log2!, :Log2)
     def_unary_op(t, t, :abs, :abs!, :Abs)
     def_unary_op(t, t, :abs2, :abs2!, :Sqr)
     def_unary_op(t, t, :ceil, :ceil!, :Ceil)
     def_unary_op(t, t, :floor, :floor!, :Floor)
     def_unary_op(t, t, :round, :round!, :Round)
     def_unary_op(t, t, :trunc, :trunc!, :Trunc)
 
+    # Enabled only for Real. MKL guarantees higher accuracy, but at a
+    # substantial performance cost.
+    def_unary_op(t, t, :atan, :atan!, :Atan)
+    def_unary_op(t, t, :cos, :cos!, :Cos)
+    def_unary_op(t, t, :sin, :sin!, :Sin)
+    def_unary_op(t, t, :tan, :tan!, :Tan)
+    def_unary_op(t, t, :atanh, :atanh!, :Atanh)
+    def_unary_op(t, t, :cosh, :cosh!, :Cosh)
+    def_unary_op(t, t, :sinh, :sinh!, :Sinh)
+    def_unary_op(t, t, :tanh, :tanh!, :Tanh)
+    def_unary_op(t, t, :log10, :log10!, :Log10)
+
+    # Unary, real-only
+    def_unary_op(t, t, :cospi, :cospi!, :Cospi)
+    def_unary_op(t, t, :sinpi, :sinpi!, :Sinpi)
+    def_unary_op(t, t, :tanpi, :tanpi!, :Tanpi)
+    def_unary_op(t, t, :acospi, :acospi!, :Acospi)
+    def_unary_op(t, t, :asinpi, :asinpi!, :Asinpi)
+    def_unary_op(t, t, :atanpi, :atanpi!, :Atanpi)
+    def_unary_op(t, t, :cosd, :cosd!, :Cosd)
+    def_unary_op(t, t, :sind, :sind!, :Sind)
+    def_unary_op(t, t, :tand, :tand!, :Tand)
+
+    def_one2two_op(t, t, :sincos, :sincos!, :SinCos)
+
     # now in SpecialFunctions (make smart, maybe?)
     def_unary_op(t, t, :erf, :erf!, :Erf)
     def_unary_op(t, t, :erfc, :erfc!, :Erfc)
@@ -55,18 +81,6 @@ for t in (Float32, Float64)
     def_unary_op(t, t, :pow2o3, :pow2o3!, :Pow2o3)
     def_unary_op(t, t, :pow3o2, :pow3o2!, :Pow3o2)
 
-    # Enabled only for Real. MKL guarantees higher accuracy, but at a
-    # substantial performance cost.
-    def_unary_op(t, t, :atan, :atan!, :Atan)
-    def_unary_op(t, t, :cos, :cos!, :Cos)
-    def_unary_op(t, t, :sin, :sin!, :Sin)
-    def_unary_op(t, t, :tan, :tan!, :Tan)
-    def_unary_op(t, t, :atanh, :atanh!, :Atanh)
-    def_unary_op(t, t, :cosh, :cosh!, :Cosh)
-    def_unary_op(t, t, :sinh, :sinh!, :Sinh)
-    def_unary_op(t, t, :tanh, :tanh!, :Tanh)
-    def_unary_op(t, t, :log10, :log10!, :Log10)
-
     # # .^ to scalar power
     # mklfn = Base.Meta.quot(Symbol("$(vml_prefix(t))Powx"))
     # @eval begin
@@ -87,6 +101,7 @@ for t in (Float32, Float64)
 
     # # Binary, real-only
     def_binary_op(t, t, :atan, :atan!, :Atan2, false)
+    def_binary_op(t, t, :atanpi, :atanpi!, :Atan2pi, false)
     def_binary_op(t, t, :hypot, :hypot!, :Hypot, false)
 
     # Unary, complex-only

diff --git a/src/setup.jl b/src/setup.jl
@@ -264,57 +264,131 @@ function vml_prefix(t::DataType)
     error("unknown type $t")
 end
 
+if isdefined(Base, :_checkcontiguous)
+    alldense(@nospecialize(x)) = Base._checkcontiguous(Bool, x)
+else
+    alldense(x) = x isa DenseArray
+    alldense(x::Base.ReshapedArray) = alldense(parent(x))
+    alldense(x::Base.FastContiguousSubArray) = alldense(parent(x))
+    alldense(x::Base.ReinterpretArray) = alldense(parent(x))
+end
+alldense(x, y, z...) = alldense(x) && alldense(y, z...)
+
+if isdefined(Base, :merge_adjacent_dim)
+    const merge_adjacent_dim = Base.merge_adjacent_dim
+else
+    merge_adjacent_dim(::Dims{0}, ::Dims{0}) = 1, 1, 0
+    merge_adjacent_dim(apsz::Dims{1}, apst::Dims{1}) = apsz[1], apst[1], 1
+    function merge_adjacent_dim(apsz::Dims{N}, apst::Dims{N}, n::Int = 1) where {N}
+        sz, st = apsz[n], apst[n]
+        while n < N
+            szₙ, stₙ = apsz[n+1], apst[n+1]
+            if sz == 1
+                sz, st = szₙ, stₙ
+            elseif stₙ == st * sz || szₙ == 1
+                sz *= szₙ
+            else
+                break
+            end
+            n += 1
+        end
+        return sz, st, n
+    end
+end
+
+getstrides(x...) = map(stride1, x)
+function stride1(x::AbstractArray)
+    alldense(x) && return 1
+    ndims(x) == 1 && return stride(x, 1)
+    szs::Dims = size(x)
+    sts::Dims = strides(x)
+    _, st, n = merge_adjacent_dim(szs, sts)
+    n === ndims(x) && return st
+    throw(ArgumentError("only support vector like inputs"))
+end
+
 function def_unary_op(tin, tout, jlname, jlname!, mklname;
         vmltype = tin)
-    mklfn = Base.Meta.quot(Symbol("$(vml_prefix(vmltype))$mklname"))
+    mklfn = Base.Meta.quot(Symbol("$(vml_prefix(vmltype))$(mklname)I"))
+    mklfndense = Base.Meta.quot(Symbol("$(vml_prefix(vmltype))$mklname"))
     exports = Symbol[]
     (@isdefined jlname) || push!(exports, jlname)
     (@isdefined jlname!) || push!(exports, jlname!)
     @eval begin
-        function ($jlname!)(out::Array{$tout}, A::Array{$tin})
+        function ($jlname!)(out::AbstractArray{$tout}, A::AbstractArray{$tin})
             size(out) == size(A) || throw(DimensionMismatch())
-            ccall(($mklfn, MKL_jll.libmkl_rt), Nothing, (Int, Ptr{$tin}, Ptr{$tout}), length(A), A, out)
+            if alldense(out, A) || ((sts = getstrides(out, A)) == (1, 1))
+                ccall(($mklfndense, MKL_jll.libmkl_rt), Nothing, (Int, Ptr{$tin}, Ptr{$tout}), length(A), A, out)
+            else
+                stᵒ, stᴬ = sts
+                ccall(($mklfn, MKL_jll.libmkl_rt), Nothing, (Int, Ptr{$tin}, Int, Ptr{$tout}, Int), length(A), A, stᴬ, out, stᵒ)
+            end
             vml_check_error()
             return out
         end
         $(if tin == tout
             quote
-                function $(jlname!)(A::Array{$tin})
-                    ccall(($mklfn, MKL_jll.libmkl_rt), Nothing, (Int, Ptr{$tin}, Ptr{$tout}), length(A), A, A)
+                function $(jlname!)(A::AbstractArray{$tin})
+                    if alldense(A) || ((sts = getstrides(A)) == (1,))
+                        ccall(($mklfndense, MKL_jll.libmkl_rt), Nothing, (Int, Ptr{$tin}, Ptr{$tout}), length(A), A, A)
+                    else
+                        (stᴬ,) = sts
+                        ccall(($mklfn, MKL_jll.libmkl_rt), Nothing, (Int, Ptr{$tin}, Int, Ptr{$tout}, Int), length(A), A, stᴬ, A, stᴬ)
+                    end
                     vml_check_error()
                     return A
                 end
             end
         end)
-        function ($jlname)(A::Array{$tin})
-            out = similar(A, $tout)
-            ccall(($mklfn, MKL_jll.libmkl_rt), Nothing, (Int, Ptr{$tin}, Ptr{$tout}), length(A), A, out)
-            vml_check_error()
-            return out
-        end
+        ($jlname)(A::AbstractArray{$tin}) = $(jlname!)(similar(A, $tout), A)
         $(isempty(exports) ? nothing : Expr(:export, exports...))
     end
 end
 
 function def_binary_op(tin, tout, jlname, jlname!, mklname, broadcast)
-    mklfn = Base.Meta.quot(Symbol("$(vml_prefix(tin))$mklname"))
+    mklfndense = Base.Meta.quot(Symbol("$(vml_prefix(tin))$mklname"))
+    mklfn = Base.Meta.quot(Symbol("$(vml_prefix(tin))$(mklname)I"))
     exports = Symbol[]
     (@isdefined jlname) || push!(exports, jlname)
     (@isdefined jlname!) || push!(exports, jlname!)
     @eval begin
         $(isempty(exports) ? nothing : Expr(:export, exports...))
-        function ($jlname!)(out::Array{$tout}, A::Array{$tin}, B::Array{$tin}) 
-            size(out) == size(A) == size(B) || throw(DimensionMismatch("Input arrays and output array need to have the same size"))
-            ccall(($mklfn, MKL_jll.libmkl_rt), Nothing, (Int, Ptr{$tin}, Ptr{$tin}, Ptr{$tout}), length(A), A, B, out)
+        function ($jlname!)(out::AbstractArray{$tout}, A::AbstractArray{$tin}, B::AbstractArray{$tin})
+            size(A) == size(B) || throw(DimensionMismatch("Input arrays need to have the same size"))
+            size(out) == size(A) || throw(DimensionMismatch("Output array need to have the same size with input"))
+            if alldense(out, A, B) || ((sts = getstrides(out, A, B)) == (1, 1, 1))
+                ccall(($mklfndense, MKL_jll.libmkl_rt), Nothing, (Int, Ptr{$tin}, Ptr{$tin}, Ptr{$tout}), length(A), A, B, out)
+            else
+                stᵒ, stᴬ, stᴮ = sts
+                ccall(($mklfn, MKL_jll.libmkl_rt), Nothing, (Int, Ptr{$tin}, Int, Ptr{$tin}, Int, Ptr{$tout}, Int), length(A), A, stᴬ, B, stᴮ, out, stᵒ)
+            end
             vml_check_error()
             return out
         end
-        function ($jlname)(A::Array{$tout}, B::Array{$tin}) 
-            size(A) == size(B) || throw(DimensionMismatch("Input arrays need to have the same size"))
-            out = similar(A)
-            ccall(($mklfn, MKL_jll.libmkl_rt), Nothing, (Int, Ptr{$tin}, Ptr{$tin}, Ptr{$tout}), length(A), A, B, out)
+        ($jlname)(A::AbstractArray{$tin}, B::AbstractArray{$tin}) = ($jlname!)(similar(A, $tout), A, B)
+    end
+end
+
+function def_one2two_op(tin, tout, jlname, jlname!, mklname)
+    mklfndense = Base.Meta.quot(Symbol("$(vml_prefix(tin))$mklname"))
+    mklfn = Base.Meta.quot(Symbol("$(vml_prefix(tin))$(mklname)I"))
+    exports = Symbol[]
+    (@isdefined jlname) || push!(exports, jlname)
+    (@isdefined jlname!) || push!(exports, jlname!)
+    @eval begin
+        $(isempty(exports) ? nothing : Expr(:export, exports...))
+        function ($jlname!)(out1::AbstractArray{$tout}, out2::AbstractArray{$tout}, A::AbstractArray{$tin})
+            size(out1) == size(out2) || throw(DimensionMismatch("Output arrays need to have the same size"))
+            size(A) == size(out2) || throw(DimensionMismatch("Output array need to have the same size with input"))
+            if alldense(out1, out2, A) || ((sts = getstrides(out1, out2, A)) == (1, 1, 1))
+                ccall(($mklfndense, MKL_jll.libmkl_rt), Nothing, (Int, Ptr{$tin}, Ptr{$tin}, Ptr{$tout}), length(A), A, out1, out2)
+            else
+                st¹, st², stᴬ = sts
+                ccall(($mklfn, MKL_jll.libmkl_rt), Nothing, (Int, Ptr{$tin}, Int, Ptr{$tin}, Int, Ptr{$tout}, Int), length(A), A, stᴬ, out1, st¹, out2, st²)
+            end
             vml_check_error()
-            return out
+            return out1, out2
         end
+        ($jlname)(A::AbstractArray{$tin}) = ($jlname!)(similar(A, $tout), similar(A, $tout), A)
     end
 end
diff --git a/test/common.jl b/test/common.jl
@@ -1,11 +1,20 @@
 using SpecialFunctions
 const base_unary_real = (
     (Base, :acos, (-1, 1)),
+    (Base, :acospi, (-1, 1)),
     (Base, :asin, (-1, 1)),
+    (Base, :asinpi, (-1, 1)),
     (Base, :atan, (-50, 50)),
+    (Base, :atanpi, (-50, 50)),
     (Base, :cos, (-1000, 1000)),
+    (Base, :cosd, (-10000, 10000)),
+    (Base, :cospi, (-300, 300)),
     (Base, :sin, (-1000, 1000)),
+    (Base, :sind, (-10000, 10000)),
+    (Base, :sinpi, (-300, 300)),
     (Base, :tan, (-1000, 1000)),
+    (Base, :tand, (-10000, 10000)),
+    (Base, :tanpi, (-300, 300)),
     (Base, :acosh, (1, 1000)),
     (Base, :asinh, (-1000, 1000)),
     (Base, :atanh, (-1, 1)),
@@ -17,6 +26,7 @@ const base_unary_real = (
     (Base, :exp, (-88.72284f0, 88.72284f0)),
     (Base, :expm1, (-88.72284f0, 88.72284f0)),
     (Base, :log, (0, 1000)),
+    (Base, :log2, (0, 1000)),
     (Base, :log10, (0, 1000)),
     (Base, :log1p, (-1, 1000)),
     (Base, :abs, (-1000, 1000)),

diff --git a/test/real.jl b/test/real.jl
@@ -19,32 +19,58 @@ const fns = [[x[1:2] for x in base_unary_real]; [x[1:2] for x in base_binary_rea
   for t in (Float32, Float64), i = 1:length(fns)
     inp = input[t][i]
     mod, fn = fns[i]
-    base_fn = getproperty(mod, fn)
+    if fn === :acospi || fn === :asinpi || fn === :atanpi
+      fn′ = getproperty(mod, Symbol(string(fn)[1:end-2]))
+      base_fn = x -> oftype(x, fn′(widen(x))/pi)
+    elseif fn === :tanpi
+      base_fn = x -> oftype(x, Base.tan(widen(x)*pi))
+    else
+      base_fn = getproperty(mod, fn)
+    end
     vml_fn = getproperty(IntelVectorMath, fn)
     vml_fn! = getproperty(IntelVectorMath, Symbol(fn, "!"))
 
     Test.@test parentmodule(vml_fn) == IntelVectorMath
 
     # Test.test_approx_eq(output[t][i], fn(input[t][i]...), "Base $t $fn", "IntelVectorMath $t $fn")
     baseres = base_fn.(inp...)
-    Test.@test vml_fn(inp...) ≈ base_fn.(inp...)
+    Test.@test vml_fn(inp...) ≈ baseres
 
     # cis changes type (float to complex, does not have mutating function)
     if length(inp) == 1
       if fn != :cis
-        vml_fn!(inp[1])
-        Test.@test inp[1] ≈ baseres
+        temp = similar(inp[1], 2NVALS)
+        inp1′ = @views copyto!(temp[1:2:end], inp[1])
+        inp1″ = @views copyto!(temp[end:-2:1], inp[1])
+        for x in (inp[1], inp1′, inp1″)
+          vml_fn!(x)
+          Test.@test x ≈ baseres
+        end
       end
     elseif length(inp) == 2
       out = similar(inp[1])
-      vml_fn!(out, inp...)
-      Test.@test out ≈ baseres
+      temp = similar(inp[1], 2NVALS)
+      x′ = @views copyto!(temp[1:2:end], inp[1])
+      y′ = @views copyto!(temp[end:-2:1], inp[2])
+      for (x, y) in (inp, (x′, y′))
+        vml_fn!(out, x, y)
+        Test.@test out ≈ baseres
+      end
     end
 
   end
 
 end
 
+@testset "sincos" begin
+  for t in (Float32, Float64)
+    a = randindomain(t, NVALS, (-1000, 1000))
+    s, c = IVM.sincos(a)
+    @test s ≈ IVM.sin(a)
+    @test c ≈ IVM.cos(a)
+  end
+end
+
 @testset "Error Handling and Settings" begin
 
   # Verify that we still throw DomainErrors