Commit 44b6bd2

Fix gcc failures
Use reinterpret to support casting across many compiler generations. Resolve deprecation warnings.
1 parent f094768 commit 44b6bd2
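
The failing pattern throughout the file is an implicit cast between distinct vector types. As a minimal sketch, outside this commit's code: under GCC's AArch64 NEON intrinsics, float32x4_t and uint32x4_t are unrelated types, so moving a bit pattern between them requires an explicit vreinterpretq_* call. The sign_bits_f32 helper below is illustrative only, not part of the commit.

    #include <arm_neon.h>

    /* Illustrative only (not from this commit): GCC treats float32x4_t and
     * uint32x4_t as distinct types, so the bit pattern must cross via an
     * explicit vreinterpretq_*; a plain assignment fails to compile. */
    static inline uint32x4_t sign_bits_f32(float32x4_t v)
    {
        uint32x4_t bits = vreinterpretq_u32_f32(v); /* bitwise view, no value conversion */
        return vshrq_n_u32(bits, 31);               /* move the sign bit down to bit 0 */
    }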

1 file changed: 33 additions, 32 deletions

numpy/core/src/umath/loops_unary_fp.dispatch.c.src

@@ -12,6 +12,7 @@
  * such small operations that this file covers.
  */
 #define NPY_SIMD_FORCE_128
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
 #include <float.h>
 #include "numpy/npy_math.h"
 #include "simd/simd.h"
@@ -104,7 +105,7 @@ npyv_isnan_@sfx@(npyv_@sfx@ v)
 {
     // (v != v) >> (size - 1)
     npyv_@sfx@ r = npyv_cvt_@sfx@_b@ssfx@(npyv_cmpneq_@sfx@(v, v));
-    return npyv_shri_u@ssfx@(r, (sizeof(npyv_lanetype_@sfx@)*8)-1);
+    return npyv_shri_u@ssfx@(npyv_reinterpret_u@ssfx@_@sfx@(r), (sizeof(npyv_lanetype_@sfx@)*8)-1);
 }

 static NPY_INLINE npyv_u@ssfx@
@@ -113,7 +114,7 @@ npyv_isinf_@sfx@(npyv_@sfx@ v)
     // (abs(v) > fltmax) >> (size - 1)
     const npyv_@sfx@ fltmax = npyv_setall_@sfx@(@FDMAX@);
 #if defined(NPY_HAVE_NEON)
-    npyv_@sfx@ r = vcagtq_@sfx@(v, fltmax);
+    npyv_u@ssfx@ r = vcagtq_@sfx@(v, fltmax);
 #else
     // fabs via masking of sign bit
     const npyv_@sfx@ signmask = npyv_setall_@sfx@(-0.@fd@);
@@ -129,7 +130,7 @@ npyv_isfinite_@sfx@(npyv_@sfx@ v)
     // ((v & signmask) <= fltmax) >> (size-1)
     const npyv_@sfx@ fltmax = npyv_setall_@sfx@(@FDMAX@);
 #if defined(NPY_HAVE_NEON)
-    npyv_@sfx@ r = vcaleq_@sfx@(v, fltmax);
+    npyv_u@ssfx@ r = vcaleq_@sfx@(v, fltmax);
 #else
     // fabs via masking of sign bit
     const npyv_@sfx@ signmask = npyv_setall_@sfx@(-0.@fd@);
@@ -142,7 +143,7 @@ npyv_isfinite_@sfx@(npyv_@sfx@ v)
 static NPY_INLINE npyv_u@ssfx@
 npyv_signbit_@sfx@(npyv_@sfx@ v)
 {
-    return npyv_shri_u@ssfx@(v, (sizeof(npyv_lanetype_@sfx@)*8)-1);
+    return npyv_shri_u@ssfx@(npyv_reinterpret_u@ssfx@_@sfx@(v), (sizeof(npyv_lanetype_@sfx@)*8)-1);
 }

 #endif // @VCHK@
@@ -162,10 +163,10 @@ npyv_signbit_@sfx@(npyv_@sfx@ v)
     // with only exponent in high byte. If not all bits are set,
     // then we've got a finite number.
     uint8x16x4_t tbl;
-    tbl.val[0] = npyv_shli_u32(v0, 1);
-    tbl.val[1] = npyv_shli_u32(v1, 1);
-    tbl.val[2] = npyv_shli_u32(v2, 1);
-    tbl.val[3] = npyv_shli_u32(v3, 1);
+    tbl.val[0] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v0), 1));
+    tbl.val[1] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v1), 1));
+    tbl.val[2] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v2), 1));
+    tbl.val[3] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v3), 1));

     const npyv_u8 permute = {3,7,11,15, 19,23,27,31, 35,39,43,47, 51,55,59,63};
     npyv_u8 r = vqtbl4q_u8(tbl, permute);
@@ -182,10 +183,10 @@ npyv_signbit_@sfx@(npyv_@sfx@ v)
     // We only need high byte for signbit, which means we can pack
     // multiple inputs into a single vector.
     uint8x16x4_t tbl;
-    tbl.val[0] = v0;
-    tbl.val[1] = v1;
-    tbl.val[2] = v2;
-    tbl.val[3] = v3;
+    tbl.val[0] = npyv_reinterpret_u8_f32(v0);
+    tbl.val[1] = npyv_reinterpret_u8_f32(v1);
+    tbl.val[2] = npyv_reinterpret_u8_f32(v2);
+    tbl.val[3] = npyv_reinterpret_u8_f32(v3);

     const npyv_u8 permute = {3,7,11,15, 19,23,27,31, 35,39,43,47, 51,55,59,63};
     npyv_u8 r = vqtbl4q_u8(tbl, permute);
@@ -205,18 +206,18 @@ npyv_signbit_@sfx@(npyv_@sfx@ v)
     // a single vector. We'll need to use u16 to fit all exponent
     // bits. If not all bits are set, then we've got a finite number.
     uint8x16x4_t t0123, t4567;
-    t0123.val[0] = v0;
-    t0123.val[1] = v1;
-    t0123.val[2] = v2;
-    t0123.val[3] = v3;
-    t4567.val[0] = v4;
-    t4567.val[1] = v5;
-    t4567.val[2] = v6;
-    t4567.val[3] = v7;
+    t0123.val[0] = npyv_reinterpret_u8_f64(v0);
+    t0123.val[1] = npyv_reinterpret_u8_f64(v1);
+    t0123.val[2] = npyv_reinterpret_u8_f64(v2);
+    t0123.val[3] = npyv_reinterpret_u8_f64(v3);
+    t4567.val[0] = npyv_reinterpret_u8_f64(v4);
+    t4567.val[1] = npyv_reinterpret_u8_f64(v5);
+    t4567.val[2] = npyv_reinterpret_u8_f64(v6);
+    t4567.val[3] = npyv_reinterpret_u8_f64(v7);

     const npyv_u8 permute = {6,7,14,15, 22,23,30,31, 38,39,46,47, 54,55,62,63};
-    npyv_u16 r0 = vqtbl4q_u8(t0123, permute);
-    npyv_u16 r1 = vqtbl4q_u8(t4567, permute);
+    npyv_u16 r0 = npyv_reinterpret_u16_u8(vqtbl4q_u8(t0123, permute));
+    npyv_u16 r1 = npyv_reinterpret_u16_u8(vqtbl4q_u8(t4567, permute));

     const npyv_u16 expmask = npyv_setall_u16(0x7ff0);
     r0 = npyv_and_u16(r0, expmask);
@@ -238,15 +239,15 @@ npyv_signbit_@sfx@(npyv_@sfx@ v)
     // multiple inputs into a single vector.

     // vuzp2 faster than vtbl for f64
-    npyv_u32 v01 = vuzp2q_u32(v0, v1);
-    npyv_u32 v23 = vuzp2q_u32(v2, v3);
-    npyv_u32 v45 = vuzp2q_u32(v4, v5);
-    npyv_u32 v67 = vuzp2q_u32(v6, v7);
+    npyv_u32 v01 = vuzp2q_u32(npyv_reinterpret_u32_f64(v0), npyv_reinterpret_u32_f64(v1));
+    npyv_u32 v23 = vuzp2q_u32(npyv_reinterpret_u32_f64(v2), npyv_reinterpret_u32_f64(v3));
+    npyv_u32 v45 = vuzp2q_u32(npyv_reinterpret_u32_f64(v4), npyv_reinterpret_u32_f64(v5));
+    npyv_u32 v67 = vuzp2q_u32(npyv_reinterpret_u32_f64(v6), npyv_reinterpret_u32_f64(v7));

-    npyv_u16 v0123 = vuzp2q_u16(v01, v23);
-    npyv_u16 v4567 = vuzp2q_u16(v45, v67);
+    npyv_u16 v0123 = vuzp2q_u16(npyv_reinterpret_u16_u32(v01), npyv_reinterpret_u16_u32(v23));
+    npyv_u16 v4567 = vuzp2q_u16(npyv_reinterpret_u16_u32(v45), npyv_reinterpret_u16_u32(v67));

-    npyv_u8 r = vuzp2q_u8(v0123, v4567);
+    npyv_u8 r = vuzp2q_u8(npyv_reinterpret_u8_u16(v0123), npyv_reinterpret_u8_u16(v4567));
     r = vshrq_n_u8(r, 7);
     return r;
 }
@@ -540,7 +541,7 @@ static void simd_unary_@kind@_@TYPE@_@STYPE@_@DTYPE@
     // Results are packed, so we can just loop over them
     npy_uint8 lane_@N@[npyv_nlanes_u8];
    npyv_store_u8(lane_@N@, r_@N@);
-    for (int ln=0; ln<npyv_nlanes_u8; ++ln){
+    for (int ln=0; (ln * sizeof(npyv_lanetype_@sfx@)) < npyv_nlanes_u8; ++ln){
         op[(ln + @N@ * PACK_FACTOR * vstep) * ostride] = lane_@N@[ln * sizeof(npyv_lanetype_@sfx@)];
     }
 #else
@@ -550,7 +551,7 @@ static void simd_unary_@kind@_@TYPE@_@STYPE@_@DTYPE@
     */
 #if @R@ < PACK_FACTOR
     npy_uint8 lane@R@_@N@[npyv_nlanes_u8];
-    npyv_store_u8(lane@R@_@N@, r@R@_@N@);
+    npyv_store_u8(lane@R@_@N@, npyv_reinterpret_u8_u@ssfx@(r@R@_@N@));
     op[(0 + (@R@ + @N@ * PACK_FACTOR) * vstep) * ostride] = lane@R@_@N@[0 * sizeof(npyv_lanetype_@sfx@)];
     op[(1 + (@R@ + @N@ * PACK_FACTOR) * vstep) * ostride] = lane@R@_@N@[1 * sizeof(npyv_lanetype_@sfx@)];
 #if npyv_nlanes_@sfx@ == 4
@@ -576,7 +577,7 @@ static void simd_unary_@kind@_@TYPE@_@STYPE@_@DTYPE@
     npyv_u@ssfx@ r = npyv_@kind@_@sfx@(v);

     npy_uint8 lane[npyv_nlanes_u8];
-    npyv_store_u8(lane, r);
+    npyv_store_u8(lane, npyv_reinterpret_u8_u@ssfx@(r));

     op[0*ostride] = lane[0 * sizeof(npyv_lanetype_@sfx@)];
     op[1*ostride] = lane[1 * sizeof(npyv_lanetype_@sfx@)];
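
The store-side hunks follow the same rule: the comparison result keeps the input's lane width (u32 for f32, u64 for f64), while npyv_store_u8 takes a u8 vector, so the result is reinterpreted byte-wise before the store and then sampled every sizeof(lanetype) bytes. A hedged NEON sketch of that shape, with illustrative names and a little-endian target assumed:

    #include <arm_neon.h>
    #include <stdint.h>

    /* Illustrative only: store a u32 lane mask through a byte view.
     * On a little-endian target, lane[0], lane[4], lane[8], lane[12]
     * then hold the low byte of lanes 0..3, which is why the diff
     * indexes lane[ln * sizeof(npyv_lanetype_@sfx@)]. */
    static inline void store_mask_bytes(uint8_t lane[16], uint32x4_t r)
    {
        vst1q_u8(lane, vreinterpretq_u8_u32(r));
    }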
