@@ -53,8 +53,85 @@ static const short base64_reverse_table[256] = {
 };
 /* }}} */
 
+#ifdef __aarch64__
+#include <arm_neon.h>
+
+static zend_always_inline uint8x16_t encode_toascii(const uint8x16_t input, const uint8x16x2_t shift_LUT)
+{
+    /* reduce  0..51 -> 0
+              52..61 -> 1 .. 10
+                  62 -> 11
+                  63 -> 12 */
+    uint8x16_t result = vqsubq_u8(input, vdupq_n_u8(51));
+    /* distinguish between ranges 0..25 and 26..51:
+        0 .. 25 -> becomes 13
+       26 .. 51 -> remains 0 */
+    const uint8x16_t less = vcgtq_u8(vdupq_n_u8(26), input);
+    result = vorrq_u8(result, vandq_u8(less, vdupq_n_u8(13)));
+    /* read shift */
+    result = vqtbl2q_u8(shift_LUT, result);
+    return vaddq_u8(result, input);
+}
+
+static zend_always_inline unsigned char *neon_base64_encode(const unsigned char *in, size_t inl, unsigned char *out, size_t *left)
+{
+    const uint8_t shift_LUT_[32] = {'a' - 26, '0' - 52, '0' - 52, '0' - 52,
+                                    '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+                                    '0' - 52, '0' - 52, '0' - 52, '+' - 62,
+                                    '/' - 63, 'A', 0, 0,
+                                    'a' - 26, '0' - 52, '0' - 52, '0' - 52,
+                                    '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+                                    '0' - 52, '0' - 52, '0' - 52, '+' - 62,
+                                    '/' - 63, 'A', 0, 0};
+    const uint8x16x2_t shift_LUT = *((const uint8x16x2_t *)shift_LUT_);
+    do {
+        /* [ccdddddd | bbbbcccc | aaaaaabb]
+            x.val[2] | x.val[1] | x.val[0] */
+        const uint8x16x3_t x = vld3q_u8((const uint8_t *)(in));
+
+        /* [00aa_aaaa] */
+        const uint8x16_t field_a = vshrq_n_u8(x.val[0], 2);
+
+        const uint8x16_t field_b =                /* [00bb_bbbb] */
+            vbslq_u8(vdupq_n_u8(0x30),            /* [0011_0000] */
+                     vshlq_n_u8(x.val[0], 4),     /* [aabb_0000] */
+                     vshrq_n_u8(x.val[1], 4));    /* [0000_bbbb] */
+
+        const uint8x16_t field_c =                /* [00cc_cccc] */
+            vbslq_u8(vdupq_n_u8(0x3c),            /* [0011_1100] */
+                     vshlq_n_u8(x.val[1], 2),     /* [bbcc_cc00] */
+                     vshrq_n_u8(x.val[2], 6));    /* [0000_00cc] */
+
+        /* [00dd_dddd] */
+        const uint8x16_t field_d = vandq_u8(x.val[2], vdupq_n_u8(0x3f));
+
+        uint8x16x4_t result;
+        result.val[0] = encode_toascii(field_a, shift_LUT);
+        result.val[1] = encode_toascii(field_b, shift_LUT);
+        result.val[2] = encode_toascii(field_c, shift_LUT);
+        result.val[3] = encode_toascii(field_d, shift_LUT);
+
+        vst4q_u8((uint8_t *)out, result);
+        out += 64;
+        in += 16 * 3;
+        inl -= 16 * 3;
+    } while (inl >= 16 * 3);
+
+    *left = inl;
+    return out;
+}
+#endif /* __aarch64__ */
+
 static zend_always_inline unsigned char *php_base64_encode_impl(const unsigned char *in, size_t inl, unsigned char *out) /* {{{ */
 {
+#ifdef __aarch64__
+    if (inl >= 16 * 3) {
+        size_t left = 0;
+        out = neon_base64_encode(in, inl, out, &left);
+        in += inl - left;
+        inl = left;
+    }
+#endif
 
     while (inl > 2) { /* keep going until we have less than 24 bits */
         *out++ = base64_table[in[0] >> 2];
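
For reference, the per-lane arithmetic in encode_toascii() can be written out in scalar C. The sketch below is an illustration only, not part of the patch; the helper name scalar_encode_toascii and the flattened 14-entry table are mine (the NEON version duplicates the table across both halves of a uint8x16x2_t so that vqtbl2q_u8 can index it). The saturating subtract plus the v < 26 test select a table index, the table stores the distance from each 6-bit value to its ASCII character, and the final add wraps modulo 256 exactly like vaddq_u8.

/* Scalar model of encode_toascii(); illustration only. */
static uint8_t scalar_encode_toascii(uint8_t v) /* v = 6-bit value, 0..63 */
{
    /* vqsubq_u8(input, 51): 0..51 -> 0, 52..63 -> 1..12 (saturating) */
    uint8_t idx = (v > 51) ? (uint8_t)(v - 51) : 0;
    /* vcgtq_u8/vandq_u8/vorrq_u8: lanes with v < 26 take index 13 instead */
    if (v < 26)
        idx = 13;
    /* one half of shift_LUT_: offset from 6-bit value to its ASCII code */
    static const uint8_t offset[14] = {
        'a' - 26,                               /* 0     : 26..51 -> 'a'..'z' */
        '0' - 52, '0' - 52, '0' - 52, '0' - 52, /* 1..10 : 52..61 -> '0'..'9' */
        '0' - 52, '0' - 52, '0' - 52, '0' - 52,
        '0' - 52, '0' - 52,
        '+' - 62,                               /* 11    : 62 -> '+' */
        '/' - 63,                               /* 12    : 63 -> '/' */
        'A'                                     /* 13    : 0..25 -> 'A'..'Z' */
    };
    return (uint8_t)(v + offset[idx]);          /* wraps mod 256, like vaddq_u8 */
}

With the translation reduced to one table lookup and one add per lane, the main loop needs only vld3q_u8 to deinterleave 48 input bytes into three planes, a few shifts and bit-selects to carve out the four 6-bit fields, and vst4q_u8 to interleave the 64 output characters.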
@@ -86,11 +163,103 @@ static zend_always_inline unsigned char *php_base64_encode_impl(const unsigned c
 }
 /* }}} */
 
+#ifdef __aarch64__
+static zend_always_inline uint8x16_t decode_fromascii(const uint8x16_t input, uint8x16_t *error, const uint8x16x2_t shiftLUT, const uint8x16x2_t maskLUT, const uint8x16x2_t bitposLUT) {
+    const uint8x16_t higher_nibble = vshrq_n_u8(input, 4);
+    const uint8x16_t lower_nibble = vandq_u8(input, vdupq_n_u8(0x0f));
+    const uint8x16_t sh = vqtbl2q_u8(shiftLUT, higher_nibble);
+    const uint8x16_t eq_2f = vceqq_u8(input, vdupq_n_u8(0x2f));
+    const uint8x16_t shift = vbslq_u8(eq_2f, vdupq_n_u8(16), sh);
+    const uint8x16_t M = vqtbl2q_u8(maskLUT, lower_nibble);
+    const uint8x16_t bit = vqtbl2q_u8(bitposLUT, higher_nibble);
+    *error = vceqq_u8(vandq_u8(M, bit), vdupq_n_u8(0));
+    return vaddq_u8(input, shift);
+}
+
+static zend_always_inline size_t neon_base64_decode(const unsigned char *in, size_t inl, unsigned char *out, size_t *left) {
+    unsigned char *out_orig = out;
+    const uint8_t shiftLUT_[32] = {
+        0, 0, 19, 4, (uint8_t)-65, (uint8_t)-65, (uint8_t)-71, (uint8_t)-71,
+        0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 19, 4, (uint8_t)-65, (uint8_t)-65, (uint8_t)-71, (uint8_t)-71,
+        0, 0, 0, 0, 0, 0, 0, 0};
+    const uint8_t maskLUT_[32] = {
+        /*  0       : 0b1010_1000 */ 0xa8,
+        /*  1 .. 9  : 0b1111_1000 */ 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8,
+        /* 10       : 0b1111_0000 */ 0xf0,
+        /* 11       : 0b0101_0100 */ 0x54,
+        /* 12 .. 14 : 0b0101_0000 */ 0x50, 0x50, 0x50,
+        /* 15       : 0b0101_0100 */ 0x54,
+
+        /*  0       : 0b1010_1000 */ 0xa8,
+        /*  1 .. 9  : 0b1111_1000 */ 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8,
+        /* 10       : 0b1111_0000 */ 0xf0,
+        /* 11       : 0b0101_0100 */ 0x54,
+        /* 12 .. 14 : 0b0101_0000 */ 0x50, 0x50, 0x50,
+        /* 15       : 0b0101_0100 */ 0x54
+    };
+    const uint8_t bitposLUT_[32] = {
+        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+
+        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+    };
+    const uint8x16x2_t shiftLUT = *((const uint8x16x2_t *)shiftLUT_);
+    const uint8x16x2_t maskLUT = *((const uint8x16x2_t *)maskLUT_);
+    const uint8x16x2_t bitposLUT = *((const uint8x16x2_t *)bitposLUT_);
+
+    do {
+        const uint8x16x4_t x = vld4q_u8((const unsigned char *)in);
+        uint8x16_t error_a;
+        uint8x16_t error_b;
+        uint8x16_t error_c;
+        uint8x16_t error_d;
+        uint8x16_t field_a = decode_fromascii(x.val[0], &error_a, shiftLUT, maskLUT, bitposLUT);
+        uint8x16_t field_b = decode_fromascii(x.val[1], &error_b, shiftLUT, maskLUT, bitposLUT);
+        uint8x16_t field_c = decode_fromascii(x.val[2], &error_c, shiftLUT, maskLUT, bitposLUT);
+        uint8x16_t field_d = decode_fromascii(x.val[3], &error_d, shiftLUT, maskLUT, bitposLUT);
+
+        const uint8x16_t err = vorrq_u8(vorrq_u8(error_a, error_b), vorrq_u8(error_c, error_d));
+        union { uint8_t mem[16]; uint64_t dw[2]; } error;
+        vst1q_u8(error.mem, err);
+
+        /* Check that the input only contains bytes belonging to the alphabet of
+           Base64. If there are errors, decode the rest of the string with the
+           scalar decoder. */
+        if (error.dw[0] | error.dw[1])
+            break;
+
+        uint8x16x3_t result;
+        result.val[0] = vorrq_u8(vshrq_n_u8(field_b, 4), vshlq_n_u8(field_a, 2));
+        result.val[1] = vorrq_u8(vshrq_n_u8(field_c, 2), vshlq_n_u8(field_b, 4));
+        result.val[2] = vorrq_u8(field_d, vshlq_n_u8(field_c, 6));
+
+        vst3q_u8((unsigned char *)out, result);
+        out += 16 * 3;
+        in += 16 * 4;
+        inl -= 16 * 4;
+    } while (inl >= 16 * 4);
+    *left = inl;
+    return out - out_orig;
+}
+#endif /* __aarch64__ */
+
 static zend_always_inline int php_base64_decode_impl(const unsigned char *in, size_t inl, unsigned char *out, size_t *outl, zend_bool strict) /* {{{ */
 {
     int ch;
     size_t i = 0, padding = 0, j = *outl;
 
+#ifdef __aarch64__
+    if (inl >= 16 * 4) {
+        size_t left = 0;
+        j += neon_base64_decode(in, inl, out, &left);
+        i = inl - left;
+        in += i;
+        inl = left;
+    }
+#endif
+
     /* run through the whole string, converting as we go */
     while (inl-- > 0) {
         ch = *in++;