Skip to content

Implement an SSE2 accelerated version of zend_adler32 to speedup startup time for the file cache #10507

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 5, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 66 additions & 14 deletions ext/opcache/zend_accelerator_util_funcs.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@
#include "zend_shared_alloc.h"
#include "zend_observer.h"

#ifdef __SSE2__
/* For SSE2 adler32 */
#include <immintrin.h>
#endif

typedef int (*id_function_t)(void *, void *);
typedef void (*unique_copy_ctor_func_t)(void *pElement);

Expand Down Expand Up @@ -451,11 +456,62 @@ zend_op_array* zend_accel_load_script(zend_persistent_script *persistent_script,
#define ADLER32_NMAX 5552
/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */

#define ADLER32_DO1(buf) {s1 += *(buf); s2 += s1;}
#define ADLER32_DO2(buf, i) ADLER32_DO1(buf + i); ADLER32_DO1(buf + i + 1);
#define ADLER32_DO4(buf, i) ADLER32_DO2(buf, i); ADLER32_DO2(buf, i + 2);
#define ADLER32_DO8(buf, i) ADLER32_DO4(buf, i); ADLER32_DO4(buf, i + 4);
#define ADLER32_DO16(buf) ADLER32_DO8(buf, 0); ADLER32_DO8(buf, 8);
/* Unrolled scalar Adler-32 step macros. Each DO1 folds one input byte into
 * the low sum `s1` and the updated `s1` into the high sum `s2`; DO16 chains
 * these to consume one 16-byte block. The variables `s1` and `s2` must be in
 * scope at the expansion site, and the caller must reduce them modulo
 * ADLER32_BASE often enough (at most every ADLER32_NMAX bytes) to avoid
 * 32-bit overflow. */
#define ADLER32_SCALAR_DO1(buf) {s1 += *(buf); s2 += s1;}
#define ADLER32_SCALAR_DO2(buf, i) ADLER32_SCALAR_DO1(buf + i); ADLER32_SCALAR_DO1(buf + i + 1);
#define ADLER32_SCALAR_DO4(buf, i) ADLER32_SCALAR_DO2(buf, i); ADLER32_SCALAR_DO2(buf, i + 2);
#define ADLER32_SCALAR_DO8(buf, i) ADLER32_SCALAR_DO4(buf, i); ADLER32_SCALAR_DO4(buf, i + 4);
#define ADLER32_SCALAR_DO16(buf) ADLER32_SCALAR_DO8(buf, 0); ADLER32_SCALAR_DO8(buf, 8);

/* Advance the Adler-32 running sums (*s1_out, *s2_out) over the byte range
 * [buf, end). The range must be non-empty and a multiple of 16 bytes long
 * (the loop is do/while and steps by 16, so `end - buf` must be a positive
 * multiple of 16). No modulo reduction is performed here; the caller must
 * keep the span at or below ADLER32_NMAX bytes so the unsigned 32-bit
 * accumulators cannot overflow. Uses SSE2 when available, with an unrolled
 * scalar fallback otherwise. */
static zend_always_inline void adler32_do16_loop(unsigned char *buf, unsigned char *end, unsigned int *s1_out, unsigned int *s2_out)
{
	unsigned int s1 = *s1_out;
	unsigned int s2 = *s2_out;

#ifdef __SSE2__
	const __m128i zero = _mm_setzero_si128();

	/* Per-block weighted byte sums destined for s2 (four 32-bit lanes,
	 * combined by a horizontal add after the loop). */
	__m128i accumulate_s2 = zero;
	/* Sum of the s1 value as it stood at the START of each 16-byte block.
	 * In the scalar algorithm s2 absorbs s1 once per byte, so each of these
	 * snapshots contributes 16 times to s2 (hence the `* 16` below). */
	unsigned int accumulate_s1 = 0;

	do {
		__m128i read = _mm_loadu_si128((__m128i *) buf); /* [A:P] */

		/* Split the 8-bit-element vector into two 16-bit-element vectors where each element gets zero-extended from 8-bits to 16-bits */
		__m128i lower = _mm_unpacklo_epi8(read, zero); /* [A:H] zero-extended to 16-bits */
		__m128i higher = _mm_unpackhi_epi8(read, zero); /* [I:P] zero-extended to 16-bits */
		/* Weight byte i (0-based within the block) by (16 - i): that is how
		 * many times it gets folded into s2 by the scalar algorithm over the
		 * remainder of the block. _mm_madd_epi16 multiplies 16-bit pairs and
		 * sums adjacent products into 32-bit lanes. */
		lower = _mm_madd_epi16(lower, _mm_set_epi16(9, 10, 11, 12, 13, 14, 15, 16)); /* [A * 16:H * 9] */
		higher = _mm_madd_epi16(higher, _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8)); /* [I * 8:P * 1] */

		/* We'll cheat here: it's difficult to add 16-bit elementwise, but we can do 32-bit additions.
		 * The highest value the sum of two elements of the vectors can take is 0xff * 16 + 0xff * 8 < 0xffff.
		 * That means there is no carry possible from 16->17 bits so the 32-bit addition is safe. */
		__m128i sum = _mm_add_epi32(lower, higher); /* [A * 16 + I * 8:H * 9 + P * 1] */
		accumulate_s2 = _mm_add_epi32(accumulate_s2, sum);
		accumulate_s1 += s1; /* snapshot of s1 before this block's bytes are added */

		/* Computes 8-bit element-wise abs(buf - zero) and then sums the elements into two 16 bit parts */
		sum = _mm_sad_epu8(read, zero);
		/* The two 16-bit partial sums land in lanes 0 and 4 of the result. */
		s1 += _mm_cvtsi128_si32(sum) + _mm_extract_epi16(sum, 4);

		buf += 16;
	} while (buf != end);

	/* Horizontal add: fold the four 32-bit lanes of accumulate_s2 into one
	 * scalar (X + Y + Z + W), retrieved from the low lane below. */
	/* For convenience, let's do a rename of variables and let accumulate_s2 = [X, Y, Z, W] */
	__m128i shuffled = _mm_shuffle_epi32(accumulate_s2, _MM_SHUFFLE(1, 0, 0, 2)); /* [Y, X, X, Z] */
	accumulate_s2 = _mm_add_epi32(accumulate_s2, shuffled); /* [X + Y, Y + X, Z + X, W + Z] */
	shuffled = _mm_shuffle_epi32(accumulate_s2, _MM_SHUFFLE(3, 3, 3, 3)); /* [X + Y, X + Y, X + Y, X + Y] */
	accumulate_s2 = _mm_add_epi32(accumulate_s2, shuffled); /* [/, /, /, W + Z + X + Y] */
	s2 += accumulate_s1 * 16 + _mm_cvtsi128_si32(accumulate_s2);
#else
	/* Portable fallback: unrolled scalar Adler-32, one 16-byte block per
	 * iteration, updating s1/s2 via the ADLER32_SCALAR_* macros. */
	do {
		ADLER32_SCALAR_DO16(buf);
		buf += 16;
	} while (buf != end);
#endif

	*s1_out = s1;
	*s2_out = s2;
}

unsigned int zend_adler32(unsigned int checksum, unsigned char *buf, uint32_t len)
{
Expand All @@ -466,10 +522,8 @@ unsigned int zend_adler32(unsigned int checksum, unsigned char *buf, uint32_t le
while (len >= ADLER32_NMAX) {
len -= ADLER32_NMAX;
end = buf + ADLER32_NMAX;
do {
ADLER32_DO16(buf);
buf += 16;
} while (buf != end);
adler32_do16_loop(buf, end, &s1, &s2);
buf = end;
s1 %= ADLER32_BASE;
s2 %= ADLER32_BASE;
}
Expand All @@ -478,15 +532,13 @@ unsigned int zend_adler32(unsigned int checksum, unsigned char *buf, uint32_t le
if (len >= 16) {
end = buf + (len & 0xfff0);
len &= 0xf;
do {
ADLER32_DO16(buf);
buf += 16;
} while (buf != end);
adler32_do16_loop(buf, end, &s1, &s2);
buf = end;
}
if (len) {
end = buf + len;
do {
ADLER32_DO1(buf);
ADLER32_SCALAR_DO1(buf);
buf++;
} while (buf != end);
}
Expand Down