JuliaMath · Crown421 · Jan 1, 2023 · Jun 8, 2022 · Jun 8, 2022 · Jun 8, 2022
diff --git a/Project.toml b/Project.toml
@@ -1,13 +1,13 @@
 name = "IntelVectorMath"
 uuid = "c8ce9da6-5d36-5c03-b118-5a70151be7bc"
-version = "0.4.1"
+version = "0.4.2"
 
 [deps]
 MKL_jll = "856f044c-d86e-5d09-b602-aeab76dc8ba7"
 
 [compat]
 julia = "1.3"
-MKL_jll = "2020, 2021"
+MKL_jll = "2021, 2022"
 
 [extras]
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

diff --git a/README.md b/README.md
@@ -7,7 +7,7 @@
 ![](https://github.com/JuliaMath/VML.jl/workflows/julia%201.6/badge.svg)
 ![](https://github.com/JuliaMath/VML.jl/workflows/julia%20nightly/badge.svg)
 
-This package provides bindings to the Intel MKL [Vector Mathematics Functions](https://software.intel.com/en-us/node/521751).
+This package provides bindings to the Intel MKL [Vector Mathematics Functions](https://www.intel.com/content/www/us/en/develop/documentation/onemkl-developer-reference-c/top/vector-mathematical-functions.html).
 This is often substantially faster than broadcasting Julia's built-in functions, especially when applying a transcendental function over a large array.
 Until Julia 0.6 the package was registered as `VML.jl`.
 
@@ -65,7 +65,19 @@ implementation, although the exact results may be different. To specify
 low accuracy, use `vml_set_accuracy(VML_LA)`. To specify enhanced
 performance, use `vml_set_accuracy(VML_EP)`. More documentation
 regarding these options is available on
-[Intel's website](http://software.intel.com/sites/products/documentation/hpc/mkl/IntelVectorMath/vmldata.htm).
+[Intel's website](https://www.intel.com/content/www/us/en/develop/documentation/onemkl-developer-reference-c/top/vector-mathematical-functions.html).
+
+### Denormalized numbers 
+
+On some CPUs, operations on denormalized numbers are extremely slow. You can use `vml_set_denormalmode(VML_DENORMAL_FAST)`
+to treat denormalized numbers as zero. See `?VML_DENORMAL_FAST` for more information. You can get the
+current mode with `vml_get_denormalmode()`. The default is `VML_DENORMAL_ACCURATE`.
+
+### Threads
+
+By default, IntelVectorMath uses multithreading. The maximum number of threads that a call may use
+is given by `vml_get_max_threads()`. In most environments this defaults to the number of physical
+cores available to IntelVectorMath. This behavior can be changed using `vml_set_num_threads(numthreads)`.
 
 ## Performance
 Summary of Results:
@@ -229,5 +241,12 @@ Next steps for this package
 
 
 ## Advanced
+
+<!-- This does not seem to be true anymore? There is no reference to CpuId.jl in the Manifest.
+
 IntelVectorMath.jl uses [CpuId.jl](https://github.com/m-j-w/CpuId.jl) to detect if your processor supports the newer `avx2` instructions, and if not defaults to `libmkl_vml_avx`. If your system does not have AVX this package will currently not work for you.
-If the CPU feature detection does not work for you, please open an issue.
+If the CPU feature detection does not work for you, please open an issue. -->
+
+As a quick help to convert benchmark timings into operations-per-cycle, IntelVectorMath.jl
+provides `vml_get_cpu_frequency()` which will return the *actual* current frequency of the
+CPU in GHz.
diff --git a/src/IntelVectorMath.jl b/src/IntelVectorMath.jl
@@ -108,5 +108,12 @@ for t in (Float32, Float64)
 end
 
 export VML_LA, VML_HA, VML_EP, vml_set_accuracy, vml_get_accuracy
+export VML_DENORMAL_FAST, VML_DENORMAL_ACCURATE, vml_set_denormalmode, vml_get_denormalmode
+export vml_get_max_threads, vml_set_num_threads
+export vml_get_cpu_frequency, vml_get_max_cpu_frequency
+
+# do not export, seems to be no-op in 2022
+# export VML_FPU_DEFAULT, VML_FPU_FLOAT32, VML_FPU_FLOAT64, VML_FPU_RESTORE, vml_set_fpumode, vml_get_fpumode
+
 
 end
diff --git a/src/setup.jl b/src/setup.jl
@@ -1,33 +1,251 @@
 import MKL_jll
 
+"""
+    struct VMLAccuracy
+
+See [`VML_LA`](@ref), [`VML_HA`](@ref), [`VML_EP`](@ref).
+"""
 struct VMLAccuracy
     mode::UInt
 end
+Base.show(io::IO, m::VMLAccuracy) = print(io, m == VML_LA ? "VML_LA" :
+                                              m == VML_HA ? "VML_HA" : "VML_EP")
+# mkl\include\mkl_vml_defines.h
+# VML_HA - when VML_HA is set, high accuracy VML functions are called
+# VML_LA - when VML_LA is set, low accuracy VML functions are called
+# VML_EP - when VML_EP is set, enhanced performance VML functions are called
+# NOTE: VML_HA, VML_LA and VML_EP must not be used in combination
+"""
+    VML_LA :: VMLAccuracy
 
+Low Accuracy (LA), which improves performance by reducing accuracy of the two least significant bits.
+"""
 const VML_LA = VMLAccuracy(0x00000001)
+"""
+    VML_HA :: VMLAccuracy
+
+High Accuracy (HA), the default mode. Precision to 1 ulp.
+"""
 const VML_HA = VMLAccuracy(0x00000002)
+"""
+    VML_EP :: VMLAccuracy
+
+Enhanced Performance (EP), which provides better performance at the cost of significantly reduced accuracy.
+Approximately half of the bits in the mantissa are correct.
+"""
 const VML_EP = VMLAccuracy(0x00000003)
 
-Base.show(io::IO, m::VMLAccuracy) = print(io, m == VML_LA ? "VML_LA" :
-                                              m == VML_HA ? "VML_HA" : "VML_EP")
-
+
+"""
+    struct VMLFastDenormal
+
+Denormal-handling mode passed to `vml_set_denormalmode`. See [`VML_DENORMAL_FAST`](@ref), [`VML_DENORMAL_ACCURATE`](@ref).
+"""
+struct VMLFastDenormal
+    mode::UInt
+end
+Base.show(io::IO, m::VMLFastDenormal) = print(io, m == VML_DENORMAL_FAST ? "VML_DENORMAL_FAST" : "VML_DENORMAL_ACCURATE")
+# mkl\include\mkl_vml_defines.h
+#  FTZ & DAZ mode macros
+#  VML_FTZDAZ_ON   - FTZ & DAZ MXCSR mode enabled
+#                    for faster (sub)denormal values processing
+#  VML_FTZDAZ_OFF  - FTZ & DAZ MXCSR mode disabled
+#                    for accurate (sub)denormal values processing
+"""
+    VML_DENORMAL_FAST :: VMLFastDenormal
+
+Designed to improve the performance of computations that involve denormalized numbers at the cost of reasonable accuracy loss.
+This mode changes the numeric behavior of the functions: denormalized input values are treated as zeros and denormalized results
+are flushed to zero. Accuracy loss may occur if input and/or output values are close to denormal range.
+"""
+const VML_DENORMAL_FAST      = VMLFastDenormal(0x00280000)
+"""
+    VML_DENORMAL_ACCURATE :: VMLFastDenormal
+
+Standard handling of computations that involve denormalized numbers.
+"""
+const VML_DENORMAL_ACCURATE  = VMLFastDenormal(0x00140000)
+
+
+struct VMLFpuMode
+    mode::UInt
+end
+Base.show(io::IO, m::VMLFpuMode) = print(io,  m == VML_FPU_DEFAULT ? "VML_FPU_DEFAULT" :
+                                              m == VML_FPU_FLOAT32 ? "VML_FPU_FLOAT32" : 
+                                              m == VML_FPU_FLOAT64 ? "VML_FPU_FLOAT64" : "VML_FPU_RESTORE")
+# mkl\include\mkl_vml_defines.h
+#  SETTING OPTIMAL FLOATING-POINT PRECISION AND ROUNDING MODE
+#  Definitions below are to set optimal floating-point control word
+#  (precision and rounding mode).
+#
+#  For their correct work, VML functions change floating-point precision and
+#  rounding mode (if necessary). Since control word changing is typically
+#  expensive operation, it is recommended to set precision and rounding mode
+#  to optimal values before VML function calls.
+#
+#  VML_FLOAT_CONSISTENT  - use this value if the calls are typically to single
+#                          precision VML functions
+#  VML_DOUBLE_CONSISTENT - use this value if the calls are typically to double
+#                          precision VML functions
+#  VML_RESTORE           - restore original floating-point precision and
+#                          rounding mode
+#  VML_DEFAULT_PRECISION - use default (current) floating-point precision and
+#                          rounding mode
+#  NOTE: VML_FLOAT_CONSISTENT, VML_DOUBLE_CONSISTENT, VML_RESTORE and
+#        VML_DEFAULT_PRECISION must not be used in combination
+const VML_FPU_DEFAULT = VMLFpuMode(0x00000000) # VML_DEFAULT_PRECISION
+const VML_FPU_FLOAT32 = VMLFpuMode(0x00000010) # VML_FLOAT_CONSISTENT
+const VML_FPU_FLOAT64 = VMLFpuMode(0x00000020) # VML_DOUBLE_CONSISTENT
+const VML_FPU_RESTORE = VMLFpuMode(0x00000030) # VML_RESTORE
+
+# mkl\include\mkl_vml_defines.h
+#  ACCURACY, FLOATING-POINT CONTROL, FTZDAZ AND ERROR HANDLING MASKS
+#  Accuracy, floating-point and error handling control are packed in
+#  the VML mode variable. Macros below are useful to extract accuracy and/or
+#  floating-point control and/or error handling control settings.
+#
+#  VML_ACCURACY_MASK           - extract accuracy bits
+#  VML_FPUMODE_MASK            - extract floating-point control bits
+#  VML_ERRMODE_MASK            - extract error handling control bits
+#                                (including error callback bits)
+#  VML_ERRMODE_STDHANDLER_MASK - extract error handling control bits
+#                                (not including error callback bits)
+#  VML_ERRMODE_CALLBACK_MASK   - extract error callback bits
+#  VML_NUM_THREADS_OMP_MASK    - extract OpenMP(R) number of threads mode bits
+#  VML_FTZDAZ_MASK             - extract FTZ & DAZ bits
+#  VML_TRAP_EXCEPTIONS_MASK    - extract exception trap bits
+const VML_ACCURACY_MASK           = 0x0000000F
+const VML_FPUMODE_MASK            = 0x000000F0
+const VML_ERRMODE_MASK            = 0x0000FF00
+const VML_ERRMODE_STDHANDLER_MASK = 0x00002F00
+const VML_ERRMODE_CALLBACK_MASK   = 0x00001000
+const VML_NUM_THREADS_OMP_MASK    = 0x00030000
+const VML_FTZDAZ_MASK             = 0x003C0000
+const VML_TRAP_EXCEPTIONS_MASK    = 0x0F000000
+
+# https://www.intel.com/content/www/us/en/develop/documentation/onemkl-developer-reference-c/top/vector-mathematical-functions/vm-service-functions.html
 vml_get_mode() = ccall((:vmlGetMode, MKL_jll.libmkl_rt), Cuint, ())
 vml_set_mode(mode::Integer) = (ccall((:vmlSetMode, MKL_jll.libmkl_rt), Cuint, (UInt,), mode); nothing)
 
-vml_set_accuracy(m::VMLAccuracy) = vml_set_mode((vml_get_mode() & ~0x03) | m.mode)
-vml_get_accuracy() = VMLAccuracy(vml_get_mode() & 0x3)
+"""
+    vml_set_accuracy(m::VMLAccuracy)
+
+Set the current accuracy mode to `m`, one of [`VML_LA`](@ref), [`VML_HA`](@ref), [`VML_EP`](@ref).
+"""
+vml_set_accuracy(m::VMLAccuracy) = vml_set_mode((vml_get_mode() & ~VML_ACCURACY_MASK) | m.mode)
+"""
+    vml_get_accuracy() :: VMLAccuracy
+
+Get the current accuracy mode. See [`VML_LA`](@ref), [`VML_HA`](@ref), [`VML_EP`](@ref).
+"""
+vml_get_accuracy() = VMLAccuracy(vml_get_mode() & VML_ACCURACY_MASK)
+
+"""
+    vml_set_denormalmode(m::VMLFastDenormal)
+
+Set the denormal-handling mode to `m`, either [`VML_DENORMAL_FAST`](@ref) or [`VML_DENORMAL_ACCURATE`](@ref).
+"""
+vml_set_denormalmode(m::VMLFastDenormal) = vml_set_mode((vml_get_mode() & ~VML_FTZDAZ_MASK) | m.mode)
+"""
+    vml_get_denormalmode() :: VMLFastDenormal
+
+Get the current mode of denormal handling. See [`VML_DENORMAL_FAST`](@ref), [`VML_DENORMAL_ACCURATE`](@ref).
+"""
+vml_get_denormalmode() = VMLFastDenormal(vml_get_mode() & VML_FTZDAZ_MASK)
+
+# Ignored with MKL 2022 on i7-5930k, was useful once upon a time.
+vml_set_fpumode(m::VMLFpuMode) = vml_set_mode((vml_get_mode() & ~VML_FPUMODE_MASK) | m.mode)
+vml_get_fpumode() = VMLFpuMode(vml_get_mode() & VML_FPUMODE_MASK)
+
+# -----------------------------------------------------------------------------------------------
+
+# https://www.intel.com/content/www/us/en/develop/documentation/onemkl-developer-reference-c/top/support-functions/threading-control.html
+#
+# See: mkl\include\mkl_service.h
+# _Mkl_Api(int,MKL_Domain_Set_Num_Threads,(int nth, int MKL_DOMAIN))
+# _Mkl_Api(int,MKL_Domain_Get_Max_Threads,(int MKL_DOMAIN))
+#  #define mkl_domain_set_num_threads  MKL_Domain_Set_Num_Threads
+#  #define mkl_domain_get_max_threads  MKL_Domain_Get_Max_Threads
+#
+# See: mkl\include\mkl_types.h
+#  define MKL_DOMAIN_ALL      0
+#  define MKL_DOMAIN_BLAS     1
+#  define MKL_DOMAIN_FFT      2
+const     MKL_DOMAIN_VML  = 0x3
+#  define MKL_DOMAIN_PARDISO  4
+
+"""
+    vml_get_max_threads() :: Int
+
+Maximum number of threads that VML may use. By default, or after a call to `vml_set_num_threads(0)`,
+should return the number of cores available to VML.
+"""
+vml_get_max_threads() = Int(ccall((:MKL_Domain_Get_Max_Threads, MKL_jll.libmkl_rt), Cint, (Cint,), MKL_DOMAIN_VML))
+"""
+    vml_set_num_threads(numthreads::Integer) :: Bool
+
+Set the maximum number of threads that VML may use. Use `numthreads=0` to restore the default.
+Return `true` if the operation completed successfully (the C call returns `int`; nonzero means success).
+"""
+vml_set_num_threads(numthreads::Integer) = ccall((:MKL_Domain_Set_Num_Threads, MKL_jll.libmkl_rt), Cint, (Cint,Cint), numthreads, MKL_DOMAIN_VML) != 0
+
+# See: mkl\include\mkl_service.h
+# _Mkl_Api(double,MKL_Get_Cpu_Frequency,(void))            /* Gets CPU frequency in GHz */
+# _Mkl_Api(double,MKL_Get_Max_Cpu_Frequency,(void))        /* Gets max CPU frequency in GHz */
+# #define mkl_get_cpu_frequency       MKL_Get_Cpu_Frequency
+# #define mkl_get_max_cpu_frequency   MKL_Get_Max_Cpu_Frequency
+#
+# _Mkl_Api(void,MKL_Get_Cpu_Clocks,(unsigned MKL_INT64 *)) /* Gets CPU clocks */
+# _Mkl_Api(double,MKL_Get_Clocks_Frequency,(void))         /* Gets clocks frequency in GHz */
+# #define mkl_get_cpu_clocks          MKL_Get_Cpu_Clocks
+# #define mkl_get_clocks_frequency    MKL_Get_Clocks_Frequency
+
+"""
+    vml_get_cpu_frequency() :: Float64
+
+Current CPU frequency in GHz; may be lower or higher than [`vml_get_max_cpu_frequency`](@ref).
+"""
+vml_get_cpu_frequency()     = ccall((:MKL_Get_Cpu_Frequency,     MKL_jll.libmkl_rt), Cdouble, ())
+"""
+    vml_get_max_cpu_frequency() :: Float64
+
+Official CPU frequency in GHz, as per package specification. See also [`vml_get_cpu_frequency`](@ref).
+"""
+vml_get_max_cpu_frequency() = ccall((:MKL_Get_Max_Cpu_Frequency, MKL_jll.libmkl_rt), Cdouble, ())
+
+# -----------------------------------------------------------------------------------------------
+
+# mkl\include\mkl_vml_defines.h
+#  ERROR STATUS MACROS
+#  VML_STATUS_OK        - no errors
+#  VML_STATUS_BADSIZE   - array dimension is not positive
+#  VML_STATUS_BADMEM    - invalid pointer passed
+#  VML_STATUS_ERRDOM    - at least one of arguments is out of function domain
+#  VML_STATUS_SING      - at least one of arguments caused singularity
+#  VML_STATUS_OVERFLOW  - at least one of arguments caused overflow
+#  VML_STATUS_UNDERFLOW - at least one of arguments caused underflow
+#  VML_STATUS_ACCURACYWARNING - function doesn't support set accuracy mode,
+#                               lower accuracy mode was used instead
+const VML_STATUS_OK                  =  0
+const VML_STATUS_BADSIZE             = -1
+const VML_STATUS_BADMEM              = -2
+const VML_STATUS_ERRDOM              =  1
+const VML_STATUS_SING                =  2
+const VML_STATUS_OVERFLOW            =  3
+const VML_STATUS_UNDERFLOW           =  4
+const VML_STATUS_ACCURACYWARNING     =  1000
 
 function vml_check_error()
     vml_error = ccall((:vmlClearErrStatus, MKL_jll.libmkl_rt), Cint, ())
-    if vml_error != 0
-        if vml_error == 1
+    if vml_error != VML_STATUS_OK
+        if vml_error == VML_STATUS_ERRDOM
             throw(DomainError(-1, "This function does not support arguments outside its domain"))
-        elseif vml_error == 2 || vml_error == 3 || vml_error == 4
+        elseif vml_error == VML_STATUS_SING || vml_error == VML_STATUS_OVERFLOW || vml_error == VML_STATUS_UNDERFLOW
             # Singularity, overflow, or underflow
             # I don't think Base throws on these
-        elseif vml_error == 1000
+        elseif vml_error == VML_STATUS_ACCURACYWARNING
             warn("IntelVectorMath does not support $(vml_get_accuracy); lower accuracy used instead")
-        else
+        else # VML_STATUS_BADSIZE or VML_STATUS_BADMEM
             error("an unexpected error occurred in IntelVectorMath ($vml_error)")
         end
     end

diff --git a/test/real.jl b/test/real.jl
@@ -58,4 +58,19 @@ end
   vml_set_accuracy(VML_EP)
   Test.@test vml_get_accuracy() == VML_EP
 
+  # Setting denormal
+  vml_set_denormalmode(VML_DENORMAL_FAST)
+  Test.@test vml_get_denormalmode() == VML_DENORMAL_FAST
+
+  vml_set_denormalmode(VML_DENORMAL_ACCURATE)
+  Test.@test vml_get_denormalmode() == VML_DENORMAL_ACCURATE
+
+  # Setting number of threads (there should be at least one)
+  Test.@test vml_set_num_threads(1)
+  Test.@test !vml_set_num_threads(-1)
+  Test.@test vml_get_max_threads() == 1
+
+  Test.@test vml_set_num_threads(0)
+  Test.@test vml_get_max_threads() >= 1
+
 end