
Commit 3b73c9f

Sebastian Pop authored and dstogov committed
neon vectorization for base64
A similar algorithm is used to vectorize on x86_64, with a good description in https://arxiv.org/abs/1704.00605. On AArch64 the implementation differs in that instead of using multiplies to shift bits around, it uses the vld3+vst4 and vld4+vst3 combinations to load and store interleaved data. This patch is based on the NEON implementation of Wojciech Mula:

https://github.com/WojciechMula/base64simd/blob/master/encode/encode.neon.cpp
https://github.com/WojciechMula/base64simd/blob/master/encode/lookup.neon.cpp
https://github.com/WojciechMula/base64simd/blob/master/decode/decode.neon.cpp
https://github.com/WojciechMula/base64simd/blob/master/decode/lookup.neon.cpp

adapted to php/ext/standard/base64.c and vectorized with factor 16 instead of 8.

On a Graviton A1 instance and on the synthetic benchmarks in https://github.com/lemire/fastbase64 I see a 175% speedup on base64 encoding and a 60% speedup on base64 decoding compared to the scalar implementation.

The patch passes `make test` regression testing on aarch64-linux.
1 parent 2a535a9
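For reference, the scalar loop this patch vectorizes pulls four 6-bit indices out of every 3-byte group and maps each through base64_table; the NEON path below does the same for 16 groups per iteration. A minimal standalone sketch of the per-group split (the local base64_table copy and the encode_group helper are illustrative, not part of the patch):

#include <stdio.h>

/* The standard base64 alphabet, as in ext/standard/base64.c. */
static const char base64_table[] =
	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

/* Split one 3-byte group into the four 6-bit fields the NEON code
   names field_a..field_d, then map each through the alphabet. */
static void encode_group(const unsigned char in[3], char out[4])
{
	out[0] = base64_table[in[0] >> 2];                           /* aaaaaa */
	out[1] = base64_table[((in[0] & 0x03) << 4) | (in[1] >> 4)]; /* bbbbbb */
	out[2] = base64_table[((in[1] & 0x0f) << 2) | (in[2] >> 6)]; /* cccccc */
	out[3] = base64_table[in[2] & 0x3f];                         /* dddddd */
}

int main(void)
{
	const unsigned char in[3] = {'M', 'a', 'n'};
	char out[5] = {0};
	encode_group(in, out);
	printf("%s\n", out); /* prints "TWFu" */
	return 0;
}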

2 files changed: +174, -0 lines

ext/standard/base64.c

Lines changed: 169 additions & 0 deletions
@@ -53,8 +53,85 @@ static const short base64_reverse_table[256] = {
 };
 /* }}} */
 
+#ifdef __aarch64__
+#include <arm_neon.h>
+
+static zend_always_inline uint8x16_t encode_toascii(const uint8x16_t input, const uint8x16x2_t shift_LUT)
+{
+	/* reduce  0..51 -> 0
+	          52..61 -> 1 .. 10
+	              62 -> 11
+	              63 -> 12 */
+	uint8x16_t result = vqsubq_u8(input, vdupq_n_u8(51));
+	/* distinguish between ranges 0..25 and 26..51:
+	    0 .. 25 -> remains 0
+	   26 .. 51 -> becomes 13 */
+	const uint8x16_t less = vcgtq_u8(vdupq_n_u8(26), input);
+	result = vorrq_u8(result, vandq_u8(less, vdupq_n_u8(13)));
+	/* read shift */
+	result = vqtbl2q_u8(shift_LUT, result);
+	return vaddq_u8(result, input);
+}
+
+static zend_always_inline unsigned char *neon_base64_encode(const unsigned char *in, size_t inl, unsigned char *out, size_t *left)
+{
+	const uint8_t shift_LUT_[32] = {'a' - 26, '0' - 52, '0' - 52, '0' - 52,
+					'0' - 52, '0' - 52, '0' - 52, '0' - 52,
+					'0' - 52, '0' - 52, '0' - 52, '+' - 62,
+					'/' - 63, 'A', 0, 0,
+					'a' - 26, '0' - 52, '0' - 52, '0' - 52,
+					'0' - 52, '0' - 52, '0' - 52, '0' - 52,
+					'0' - 52, '0' - 52, '0' - 52, '+' - 62,
+					'/' - 63, 'A', 0, 0};
+	const uint8x16x2_t shift_LUT = *((const uint8x16x2_t *)shift_LUT_);
+	do {
+		/* [ccdddddd | bbbbcccc | aaaaaabb]
+		    x.val[2] | x.val[1] | x.val[0] */
+		const uint8x16x3_t x = vld3q_u8((const uint8_t *)(in));
+
+		/* [00aa_aaaa] */
+		const uint8x16_t field_a = vshrq_n_u8(x.val[0], 2);
+
+		const uint8x16_t field_b =                 /* [00bb_bbbb] */
+			vbslq_u8(vdupq_n_u8(0x30),         /* [0011_0000] */
+				 vshlq_n_u8(x.val[0], 4),  /* [aabb_0000] */
+				 vshrq_n_u8(x.val[1], 4)); /* [0000_bbbb] */
+
+		const uint8x16_t field_c =                 /* [00cc_cccc] */
+			vbslq_u8(vdupq_n_u8(0x3c),         /* [0011_1100] */
+				 vshlq_n_u8(x.val[1], 2),  /* [bbcc_cc00] */
+				 vshrq_n_u8(x.val[2], 6)); /* [0000_00cc] */
+
+		/* [00dd_dddd] */
+		const uint8x16_t field_d = vandq_u8(x.val[2], vdupq_n_u8(0x3f));
+
+		uint8x16x4_t result;
+		result.val[0] = encode_toascii(field_a, shift_LUT);
+		result.val[1] = encode_toascii(field_b, shift_LUT);
+		result.val[2] = encode_toascii(field_c, shift_LUT);
+		result.val[3] = encode_toascii(field_d, shift_LUT);
+
+		vst4q_u8((uint8_t *)out, result);
+		out += 64;
+		in += 16 * 3;
+		inl -= 16 * 3;
+	} while (inl >= 16 * 3);
+
+	*left = inl;
+	return out;
+}
+#endif /* __aarch64__ */
+
 static zend_always_inline unsigned char *php_base64_encode_impl(const unsigned char *in, size_t inl, unsigned char *out) /* {{{ */
 {
+#ifdef __aarch64__
+	if (inl >= 16 * 3) {
+		size_t left = 0;
+		out = neon_base64_encode(in, inl, out, &left);
+		in += inl - left;
+		inl = left;
+	}
+#endif
 
 	while (inl > 2) { /* keep going until we have less than 24 bits */
 		*out++ = base64_table[in[0] >> 2];
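To make the offset trick in encode_toascii above easier to follow, here is a scalar rendering: the 6-bit value is reduced to a class index 0..13 exactly as the vqsubq_u8/vcgtq_u8/vorrq_u8 sequence does, and the class selects an additive offset from the same 14 meaningful shift_LUT_ entries (encode_toascii_scalar is an illustrative helper, not in the patch):

#include <stdint.h>
#include <stdio.h>

/* Scalar rendering of encode_toascii: compress the 6-bit value into a
   class index 0..13, then add a per-class offset to the value itself. */
static uint8_t encode_toascii_scalar(uint8_t v) /* v in 0..63 */
{
	static const int8_t shift_LUT[14] = {
		'a' - 26,                  /* class  0: v in 26..51       */
		'0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
		'0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
		                           /* classes 1..10: v in 52..61  */
		'+' - 62,                  /* class 11: v == 62           */
		'/' - 63,                  /* class 12: v == 63           */
		'A'                        /* class 13: v in 0..25        */
	};
	uint8_t cls = (v < 52) ? 0 : (uint8_t)(v - 51); /* vqsubq_u8(v, 51) */
	if (v < 26)
		cls = 13;                  /* the vcgtq_u8 + vorrq_u8 step */
	return (uint8_t)(v + shift_LUT[cls]);
}

int main(void)
{
	for (int v = 0; v < 64; v += 13)
		printf("%2d -> %c\n", v, encode_toascii_scalar((uint8_t)v));
	return 0; /* prints 0->A, 13->N, 26->a, 39->n, 52->0 */
}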
@@ -86,11 +163,103 @@ static zend_always_inline unsigned char *php_base64_encode_impl(const unsigned c
 }
 /* }}} */
 
+#ifdef __aarch64__
+static zend_always_inline uint8x16_t decode_fromascii(const uint8x16_t input, uint8x16_t *error, const uint8x16x2_t shiftLUT, const uint8x16x2_t maskLUT, const uint8x16x2_t bitposLUT) {
+	const uint8x16_t higher_nibble = vshrq_n_u8(input, 4);
+	const uint8x16_t lower_nibble = vandq_u8(input, vdupq_n_u8(0x0f));
+	const uint8x16_t sh = vqtbl2q_u8(shiftLUT, higher_nibble);
+	const uint8x16_t eq_2f = vceqq_u8(input, vdupq_n_u8(0x2f));
+	const uint8x16_t shift = vbslq_u8(eq_2f, vdupq_n_u8(16), sh);
+	const uint8x16_t M = vqtbl2q_u8(maskLUT, lower_nibble);
+	const uint8x16_t bit = vqtbl2q_u8(bitposLUT, higher_nibble);
+	*error = vceqq_u8(vandq_u8(M, bit), vdupq_n_u8(0));
+	return vaddq_u8(input, shift);
+}
+
+static zend_always_inline size_t neon_base64_decode(const unsigned char *in, size_t inl, unsigned char *out, size_t *left) {
+	unsigned char *out_orig = out;
+	const uint8_t shiftLUT_[32] = {
+		0, 0, 19, 4, (uint8_t)-65, (uint8_t)-65, (uint8_t)-71, (uint8_t)-71,
+		0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 19, 4, (uint8_t)-65, (uint8_t)-65, (uint8_t)-71, (uint8_t)-71,
+		0, 0, 0, 0, 0, 0, 0, 0};
+	const uint8_t maskLUT_[32] = {
+		/* 0        : 0b1010_1000*/ 0xa8,
+		/* 1 .. 9   : 0b1111_1000*/ 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8,
+		/* 10       : 0b1111_0000*/ 0xf0,
+		/* 11       : 0b0101_0100*/ 0x54,
+		/* 12 .. 14 : 0b0101_0000*/ 0x50, 0x50, 0x50,
+		/* 15       : 0b0101_0100*/ 0x54,
+
+		/* 0        : 0b1010_1000*/ 0xa8,
+		/* 1 .. 9   : 0b1111_1000*/ 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8,
+		/* 10       : 0b1111_0000*/ 0xf0,
+		/* 11       : 0b0101_0100*/ 0x54,
+		/* 12 .. 14 : 0b0101_0000*/ 0x50, 0x50, 0x50,
+		/* 15       : 0b0101_0100*/ 0x54
+	};
+	const uint8_t bitposLUT_[32] = {
+		0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+
+		0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+	};
+	const uint8x16x2_t shiftLUT = *((const uint8x16x2_t *)shiftLUT_);
+	const uint8x16x2_t maskLUT = *((const uint8x16x2_t *)maskLUT_);
+	const uint8x16x2_t bitposLUT = *((const uint8x16x2_t *)bitposLUT_);
+
+	do {
+		const uint8x16x4_t x = vld4q_u8((const unsigned char *)in);
+		uint8x16_t error_a;
+		uint8x16_t error_b;
+		uint8x16_t error_c;
+		uint8x16_t error_d;
+		uint8x16_t field_a = decode_fromascii(x.val[0], &error_a, shiftLUT, maskLUT, bitposLUT);
+		uint8x16_t field_b = decode_fromascii(x.val[1], &error_b, shiftLUT, maskLUT, bitposLUT);
+		uint8x16_t field_c = decode_fromascii(x.val[2], &error_c, shiftLUT, maskLUT, bitposLUT);
+		uint8x16_t field_d = decode_fromascii(x.val[3], &error_d, shiftLUT, maskLUT, bitposLUT);
+
+		const uint8x16_t err = vorrq_u8(vorrq_u8(error_a, error_b), vorrq_u8(error_c, error_d));
+		union {uint8_t mem[16]; uint64_t dw[2]; } error;
+		vst1q_u8(error.mem, err);
+
+		/* Check that the input only contains bytes belonging to the alphabet of
+		   Base64. If there are errors, decode the rest of the string with the
+		   scalar decoder. */
+		if (error.dw[0] | error.dw[1])
+			break;
+
+		uint8x16x3_t result;
+		result.val[0] = vorrq_u8(vshrq_n_u8(field_b, 4), vshlq_n_u8(field_a, 2));
+		result.val[1] = vorrq_u8(vshrq_n_u8(field_c, 2), vshlq_n_u8(field_b, 4));
+		result.val[2] = vorrq_u8(field_d, vshlq_n_u8(field_c, 6));
+
+		vst3q_u8((unsigned char *)out, result);
+		out += 16 * 3;
+		in += 16 * 4;
+		inl -= 16 * 4;
+	} while (inl >= 16 * 4);
+	*left = inl;
+	return out - out_orig;
+}
+#endif /* __aarch64__ */
+
 static zend_always_inline int php_base64_decode_impl(const unsigned char *in, size_t inl, unsigned char *out, size_t *outl, zend_bool strict) /* {{{ */
 {
 	int ch;
 	size_t i = 0, padding = 0, j = *outl;
 
+#ifdef __aarch64__
+	if (inl >= 16 * 4) {
+		size_t left = 0;
+		j += neon_base64_decode(in, inl, out, &left);
+		i = inl - left;
+		in += i;
+		inl = left;
+	}
+#endif
+
 	/* run through the whole string, converting as we go */
 	while (inl-- > 0) {
 		ch = *in++;
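The least obvious piece of the decoder is the alphabet check: maskLUT_ is indexed by a byte's low nibble and yields a bitmask of the high nibbles that complete a valid base64 character, bitposLUT_ turns the high nibble into the single bit 1 << hi, and a byte is in the alphabet iff the two lookups share a set bit. A scalar model of that test (is_base64 is an illustrative helper mirroring the patch's tables, not part of the patch):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* One 16-entry half of the patch's maskLUT_: for each low nibble, the
   set bits mark the high nibbles that yield a valid base64 byte. */
static const uint8_t maskLUT[16] = {
	0xa8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8,
	0xf8, 0xf8, 0xf0, 0x54, 0x50, 0x50, 0x50, 0x54
};

static bool is_base64(uint8_t b)
{
	uint8_t hi = b >> 4;
	uint8_t lo = b & 0x0f;
	uint8_t bit = (hi < 8) ? (uint8_t)(1u << hi) : 0; /* bitposLUT_[hi] */
	return (maskLUT[lo] & bit) != 0;                  /* error iff zero */
}

int main(void)
{
	printf("%d %d %d %d\n",
	       is_base64('A'), is_base64('+'), is_base64('/'), is_base64('@'));
	return 0; /* prints "1 1 1 0": '@' (0x40) is not in the alphabet */
}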

ext/standard/base64.h

Lines changed: 5 additions & 0 deletions
@@ -21,6 +21,11 @@
 #define BASE64_H
 
 /*
+ * NEON implementation is based on https://github.com/WojciechMula/base64simd
+ * which is copyrighted to:
+ * Copyright (c) 2015-2018, Wojciech Mula
+ * All rights reserved.
+ *
  * SSSE3 and AVX2 implementation are based on https://github.com/aklomp/base64
  * which is copyrighted to:
  *
