  * such small operations that this file covers.
  */
 #define NPY_SIMD_FORCE_128
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
 #include <float.h>
 #include "numpy/npy_math.h"
 #include "simd/simd.h"
@@ -104,7 +105,7 @@ npyv_isnan_@sfx@(npyv_@sfx@ v)
 {
     // (v != v) >> (size - 1)
     npyv_@sfx@ r = npyv_cvt_@sfx@_b@ssfx@(npyv_cmpneq_@sfx@(v, v));
-    return npyv_shri_u@ssfx@(r, (sizeof(npyv_lanetype_@sfx@)*8)-1);
+    return npyv_shri_u@ssfx@(npyv_reinterpret_u@ssfx@_@sfx@(r), (sizeof(npyv_lanetype_@sfx@)*8)-1);
 }

 static NPY_INLINE npyv_u@ssfx@
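
The NaN test above leans on the IEEE rule that NaN is the only value that compares unequal to itself: npyv_cmpneq produces an all-ones lane exactly for NaN inputs, and a logical right shift by (lane bits - 1) reduces that mask to 1 or 0. The new npyv_reinterpret_u@ssfx@_@sfx@ call makes the float-to-unsigned bit view explicit before the shift. A minimal scalar sketch of the same idea (illustrative only, not code from this patch; the helper name is made up):

    #include <stdint.h>

    /* Illustrative scalar analogue of npyv_isnan_f32: the compare yields an
     * all-ones/all-zeros mask, the logical shift keeps just bit 31. */
    static uint32_t isnan_bit_f32(float v)
    {
        uint32_t mask = (v != v) ? 0xFFFFFFFFu : 0u;  /* true only for NaN */
        return mask >> 31;                            /* (sizeof(float)*8)-1 */
    }
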
@@ -113,7 +114,7 @@ npyv_isinf_@sfx@(npyv_@sfx@ v)
     // (abs(v) > fltmax) >> (size - 1)
     const npyv_@sfx@ fltmax = npyv_setall_@sfx@(@FDMAX@);
 #if defined(NPY_HAVE_NEON)
-    npyv_@sfx@ r = vcagtq_@sfx@(v, fltmax);
+    npyv_u@ssfx@ r = vcagtq_@sfx@(v, fltmax);
 #else
     // fabs via masking of sign bit
     const npyv_@sfx@ signmask = npyv_setall_@sfx@(-0.@fd@);
@@ -129,7 +130,7 @@ npyv_isfinite_@sfx@(npyv_@sfx@ v)
     // ((v & signmask) <= fltmax) >> (size-1)
     const npyv_@sfx@ fltmax = npyv_setall_@sfx@(@FDMAX@);
 #if defined(NPY_HAVE_NEON)
-    npyv_@sfx@ r = vcaleq_@sfx@(v, fltmax);
+    npyv_u@ssfx@ r = vcaleq_@sfx@(v, fltmax);
 #else
     // fabs via masking of sign bit
     const npyv_@sfx@ signmask = npyv_setall_@sfx@(-0.@fd@);
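
On the non-NEON branch of both npyv_isinf and npyv_isfinite, the absolute value is formed by clearing the sign bit with the bit pattern of -0.0 (only the sign bit set), and the result is compared against the largest finite value @FDMAX@. A scalar sketch of that masking step for single precision, assuming IEEE-754 binary32 (illustrative only; the helper is hypothetical):

    #include <float.h>
    #include <stdint.h>
    #include <string.h>

    /* Illustrative scalar analogue: fabs via masking of the sign bit, then
     * compare with FLT_MAX. Only +/-inf exceeds FLT_MAX; NaN compares false. */
    static int isinf_bit_f32(float v)
    {
        const float negzero = -0.0f;
        uint32_t bits, signmask;
        memcpy(&bits, &v, sizeof bits);
        memcpy(&signmask, &negzero, sizeof signmask);
        bits &= ~signmask;                /* clear the sign bit -> |v| */
        float a;
        memcpy(&a, &bits, sizeof a);
        return a > FLT_MAX;
    }
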
@@ -142,7 +143,7 @@ npyv_isfinite_@sfx@(npyv_@sfx@ v)
 static NPY_INLINE npyv_u@ssfx@
 npyv_signbit_@sfx@(npyv_@sfx@ v)
 {
-    return npyv_shri_u@ssfx@(v, (sizeof(npyv_lanetype_@sfx@)*8)-1);
+    return npyv_shri_u@ssfx@(npyv_reinterpret_u@ssfx@_@sfx@(v), (sizeof(npyv_lanetype_@sfx@)*8)-1);
 }

 #endif // @VCHK@
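
npyv_signbit is the same shift trick applied directly to the input: viewed as an unsigned integer of the lane width, the sign occupies the top bit, so a logical right shift by (lane bits - 1) yields 0 or 1. The added reinterpret matters because on targets with strictly typed vectors (NEON in particular) npyv_f32/npyv_f64 and their unsigned counterparts are distinct types that do not convert implicitly. A scalar sketch (illustrative only; the helper name is made up):

    #include <stdint.h>
    #include <string.h>

    /* Illustrative scalar analogue of npyv_signbit_f32: reinterpret the
     * float's bits as u32, then shift the sign bit down to position 0. */
    static uint32_t signbit_bit_f32(float v)
    {
        uint32_t bits;
        memcpy(&bits, &v, sizeof bits);   /* the "reinterpret" step */
        return bits >> 31;
    }
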
@@ -162,10 +163,10 @@ npyv_signbit_@sfx@(npyv_@sfx@ v)
     // with only exponent in high byte. If not all bits are set,
     // then we've got a finite number.
     uint8x16x4_t tbl;
-    tbl.val[0] = npyv_shli_u32(v0, 1);
-    tbl.val[1] = npyv_shli_u32(v1, 1);
-    tbl.val[2] = npyv_shli_u32(v2, 1);
-    tbl.val[3] = npyv_shli_u32(v3, 1);
+    tbl.val[0] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v0), 1));
+    tbl.val[1] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v1), 1));
+    tbl.val[2] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v2), 1));
+    tbl.val[3] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v3), 1));

     const npyv_u8 permute = {3,7,11,15, 19,23,27,31, 35,39,43,47, 51,55,59,63};
     npyv_u8 r = vqtbl4q_u8(tbl, permute);
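
This NEON path packs four f32 vectors' worth of results into one byte vector: shifting each 32-bit lane left by one drops the sign bit and leaves the 8 exponent bits in the most significant byte, and the vqtbl4q_u8 lookup (indices 3, 7, 11, ... pick that byte from every lane of the four table vectors) gathers those bytes together. An element is non-finite exactly when its exponent byte is all ones. A scalar sketch of the per-element test, assuming IEEE-754 binary32 (illustrative only; the helper is hypothetical):

    #include <stdint.h>
    #include <string.h>

    /* Illustrative scalar analogue: after (bits << 1) the top byte holds the
     * 8-bit exponent; 0xFF there means inf or NaN, anything else is finite. */
    static int isfinite_byte_f32(float v)
    {
        uint32_t bits;
        memcpy(&bits, &v, sizeof bits);
        uint8_t expbyte = (uint8_t)((bits << 1) >> 24);
        return expbyte != 0xFF;
    }
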
@@ -182,10 +183,10 @@ npyv_signbit_@sfx@(npyv_@sfx@ v)
     // We only need high byte for signbit, which means we can pack
     // multiple inputs into a single vector.
     uint8x16x4_t tbl;
-    tbl.val[0] = v0;
-    tbl.val[1] = v1;
-    tbl.val[2] = v2;
-    tbl.val[3] = v3;
+    tbl.val[0] = npyv_reinterpret_u8_f32(v0);
+    tbl.val[1] = npyv_reinterpret_u8_f32(v1);
+    tbl.val[2] = npyv_reinterpret_u8_f32(v2);
+    tbl.val[3] = npyv_reinterpret_u8_f32(v3);

     const npyv_u8 permute = {3,7,11,15, 19,23,27,31, 35,39,43,47, 51,55,59,63};
     npyv_u8 r = vqtbl4q_u8(tbl, permute);
@@ -205,18 +206,18 @@ npyv_signbit_@sfx@(npyv_@sfx@ v)
     // a single vector. We'll need to use u16 to fit all exponent
     // bits. If not all bits are set, then we've got a finite number.
     uint8x16x4_t t0123, t4567;
-    t0123.val[0] = v0;
-    t0123.val[1] = v1;
-    t0123.val[2] = v2;
-    t0123.val[3] = v3;
-    t4567.val[0] = v4;
-    t4567.val[1] = v5;
-    t4567.val[2] = v6;
-    t4567.val[3] = v7;
+    t0123.val[0] = npyv_reinterpret_u8_f64(v0);
+    t0123.val[1] = npyv_reinterpret_u8_f64(v1);
+    t0123.val[2] = npyv_reinterpret_u8_f64(v2);
+    t0123.val[3] = npyv_reinterpret_u8_f64(v3);
+    t4567.val[0] = npyv_reinterpret_u8_f64(v4);
+    t4567.val[1] = npyv_reinterpret_u8_f64(v5);
+    t4567.val[2] = npyv_reinterpret_u8_f64(v6);
+    t4567.val[3] = npyv_reinterpret_u8_f64(v7);

     const npyv_u8 permute = {6,7,14,15, 22,23,30,31, 38,39,46,47, 54,55,62,63};
-    npyv_u16 r0 = vqtbl4q_u8(t0123, permute);
-    npyv_u16 r1 = vqtbl4q_u8(t4567, permute);
+    npyv_u16 r0 = npyv_reinterpret_u16_u8(vqtbl4q_u8(t0123, permute));
+    npyv_u16 r1 = npyv_reinterpret_u16_u8(vqtbl4q_u8(t4567, permute));

     const npyv_u16 expmask = npyv_setall_u16(0x7ff0);
     r0 = npyv_and_u16(r0, expmask);
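
For f64 the exponent field is 11 bits, too wide for a single byte, so the permute (indices 6, 7, 14, 15, ... select the top two bytes of each 64-bit lane) packs one u16 per element and the 0x7ff0 mask isolates the exponent bits; a value is non-finite exactly when all of them are set. A scalar sketch of the same test, assuming IEEE-754 binary64 (illustrative only; the helper is hypothetical):

    #include <stdint.h>
    #include <string.h>

    /* Illustrative scalar analogue: the top 16 bits of a binary64 value are
     * [sign | 11 exponent bits | 4 mantissa bits]; 0x7ff0 keeps the exponent. */
    static int isfinite_word_f64(double v)
    {
        uint64_t bits;
        memcpy(&bits, &v, sizeof bits);
        uint16_t top = (uint16_t)(bits >> 48);
        return (top & 0x7ff0) != 0x7ff0;
    }
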
@@ -238,15 +239,15 @@ npyv_signbit_@sfx@(npyv_@sfx@ v)
     // multiple inputs into a single vector.

     // vuzp2 faster than vtbl for f64
-    npyv_u32 v01 = vuzp2q_u32(v0, v1);
-    npyv_u32 v23 = vuzp2q_u32(v2, v3);
-    npyv_u32 v45 = vuzp2q_u32(v4, v5);
-    npyv_u32 v67 = vuzp2q_u32(v6, v7);
+    npyv_u32 v01 = vuzp2q_u32(npyv_reinterpret_u32_f64(v0), npyv_reinterpret_u32_f64(v1));
+    npyv_u32 v23 = vuzp2q_u32(npyv_reinterpret_u32_f64(v2), npyv_reinterpret_u32_f64(v3));
+    npyv_u32 v45 = vuzp2q_u32(npyv_reinterpret_u32_f64(v4), npyv_reinterpret_u32_f64(v5));
+    npyv_u32 v67 = vuzp2q_u32(npyv_reinterpret_u32_f64(v6), npyv_reinterpret_u32_f64(v7));

-    npyv_u16 v0123 = vuzp2q_u16(v01, v23);
-    npyv_u16 v4567 = vuzp2q_u16(v45, v67);
+    npyv_u16 v0123 = vuzp2q_u16(npyv_reinterpret_u16_u32(v01), npyv_reinterpret_u16_u32(v23));
+    npyv_u16 v4567 = vuzp2q_u16(npyv_reinterpret_u16_u32(v45), npyv_reinterpret_u16_u32(v67));

-    npyv_u8 r = vuzp2q_u8(v0123, v4567);
+    npyv_u8 r = vuzp2q_u8(npyv_reinterpret_u8_u16(v0123), npyv_reinterpret_u8_u16(v4567));
     r = vshrq_n_u8(r, 7);
     return r;
 }
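
vuzp2 keeps the odd-indexed lanes of its two operands, so the chain above (u32, then u16, then u8) repeatedly keeps the upper half of every element until only the most significant byte of each double remains, and vshrq_n_u8(r, 7) then exposes its sign bit as 0 or 1. A scalar sketch of what survives the narrowing (illustrative only; the helper is hypothetical):

    #include <stdint.h>
    #include <string.h>

    /* Illustrative scalar analogue: the unzip chain reduces each binary64
     * value to its top byte, whose bit 7 is the sign. */
    static uint8_t signbit_byte_f64(double v)
    {
        uint64_t bits;
        memcpy(&bits, &v, sizeof bits);
        uint8_t topbyte = (uint8_t)(bits >> 56);
        return topbyte >> 7;
    }
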
@@ -540,7 +541,7 @@ static void simd_unary_@kind@_@TYPE@_@STYPE@_@DTYPE@
     // Results are packed, so we can just loop over them
     npy_uint8 lane_@N@[npyv_nlanes_u8];
     npyv_store_u8(lane_@N@, r_@N@);
-    for (int ln=0; ln < npyv_nlanes_u8; ++ln){
+    for (int ln=0; (ln * sizeof(npyv_lanetype_@sfx@)) < npyv_nlanes_u8; ++ln){
         op[(ln + @N@ * PACK_FACTOR * vstep) * ostride] = lane_@N@[ln * sizeof(npyv_lanetype_@sfx@)];
     }
 #else
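
The new loop bound matches the packed layout of the u8 result buffer: each logical element contributes one meaningful byte but occupies sizeof(npyv_lanetype_@sfx@) bytes of lane_@N@, so it is the byte index ln * sizeof(lane type), not ln itself, that has to stay below npyv_nlanes_u8. In effect the loop now runs once per element of the @sfx@ type rather than once per byte of the register. A small sketch of the indexing, assuming 128-bit registers and f32 lanes (4 elements, 16 bytes; illustrative only, not the template code):

    #include <stddef.h>
    #include <stdint.h>

    /* Illustrative: with 16 u8 lanes and 4-byte f32 lanes, only bytes
     * 0, 4, 8 and 12 of the packed buffer are meaningful; the corrected
     * bound visits exactly those four. */
    static void scatter_flags(uint8_t *op, ptrdiff_t ostride,
                              const uint8_t lane[16])
    {
        const size_t lane_size = sizeof(float);   /* 4 */
        const size_t nlanes_u8 = 16;              /* bytes per 128-bit register */
        for (size_t ln = 0; (ln * lane_size) < nlanes_u8; ++ln) {
            op[ln * ostride] = lane[ln * lane_size];
        }
    }
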
@@ -550,7 +551,7 @@ static void simd_unary_@kind@_@TYPE@_@STYPE@_@DTYPE@
     */
 #if @R@ < PACK_FACTOR
     npy_uint8 lane@R@_@N@[npyv_nlanes_u8];
-    npyv_store_u8(lane@R@_@N@, r@R@_@N@);
+    npyv_store_u8(lane@R@_@N@, npyv_reinterpret_u8_u@ssfx@(r@R@_@N@));
     op[(0 + (@R@ + @N@ * PACK_FACTOR) * vstep) * ostride] = lane@R@_@N@[0 * sizeof(npyv_lanetype_@sfx@)];
     op[(1 + (@R@ + @N@ * PACK_FACTOR) * vstep) * ostride] = lane@R@_@N@[1 * sizeof(npyv_lanetype_@sfx@)];
 #if npyv_nlanes_@sfx@ == 4
@@ -576,7 +577,7 @@ static void simd_unary_@kind@_@TYPE@_@STYPE@_@DTYPE@
     npyv_u@ssfx@ r = npyv_@kind@_@sfx@(v);

     npy_uint8 lane[npyv_nlanes_u8];
-    npyv_store_u8(lane, r);
+    npyv_store_u8(lane, npyv_reinterpret_u8_u@ssfx@(r));

     op[0*ostride] = lane[0 * sizeof(npyv_lanetype_@sfx@)];
     op[1*ostride] = lane[1 * sizeof(npyv_lanetype_@sfx@)];