Skip to content

Bug #85819 Add AArch64 optimized crc32c implementation #136

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions cmake/build_configurations/compiler_options.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,11 @@ IF(UNIX)

# Default GCC flags
IF(CMAKE_COMPILER_IS_GNUCC)
SET(COMMON_C_FLAGS "-g -fabi-version=2 -fno-omit-frame-pointer -fno-strict-aliasing")
IF(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64")
SET(COMMON_C_FLAGS "-g -fabi-version=2 -fno-omit-frame-pointer -fno-strict-aliasing -march=armv8-a+crypto+crc")
ELSE()
SET(COMMON_C_FLAGS "-g -fabi-version=2 -fno-omit-frame-pointer -fno-strict-aliasing")
ENDIF()
# Disable inline optimizations for valgrind testing to avoid false positives
IF(WITH_VALGRIND)
SET(COMMON_C_FLAGS "-fno-inline ${COMMON_C_FLAGS}")
Expand All @@ -54,7 +58,11 @@ IF(UNIX)
SET(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 ${COMMON_C_FLAGS}")
ENDIF()
IF(CMAKE_COMPILER_IS_GNUCXX)
SET(COMMON_CXX_FLAGS "-g -fabi-version=2 -fno-omit-frame-pointer -fno-strict-aliasing")
IF(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64")
SET(COMMON_CXX_FLAGS "-g -fabi-version=2 -fno-omit-frame-pointer -fno-strict-aliasing -march=armv8-a+crypto+crc")
ELSE()
SET(COMMON_CXX_FLAGS "-g -fabi-version=2 -fno-omit-frame-pointer -fno-strict-aliasing")
ENDIF()
# GCC 6 has C++14 as default, set it explicitly to the old default.
EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} -dumpversion
OUTPUT_VARIABLE GXX_VERSION)
Expand Down
3 changes: 3 additions & 0 deletions config.h.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -448,4 +448,7 @@
/* For default value of --early_plugin_load */
#cmakedefine DEFAULT_EARLY_PLUGIN_LOAD @DEFAULT_EARLY_PLUGIN_LOAD@

/* Support ARMv8 CRC instructions */
#cmakedefine ENABLE_ARMV8_CRC32

#endif
6 changes: 6 additions & 0 deletions configure.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -929,3 +929,9 @@ CHECK_TYPE_SIZE("socklen_t" SIZEOF_SOCKLEN_T)
IF(SIZEOF_SOCKLEN_T)
SET(HAVE_SOCKLEN_T 1)
ENDIF()

# Enable crc32 on AArch64 Platform
IF(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64")
MESSAGE(STATUS "ARMv8 crc32 enabled.")
SET(ENABLE_ARMV8_CRC32 1)
ENDIF()
163 changes: 162 additions & 1 deletion storage/innobase/ut/ut0crc32.cc
Original file line number Diff line number Diff line change
Expand Up @@ -114,11 +114,62 @@ ut_crc32_swap_byteorder(
| i >> 56);
}


/* CRC32 hardware implementation. */

/*For AArch64*/
#ifdef ENABLE_ARMV8_CRC32
#include <arm_acle.h>
#include <arm_neon.h>

#define ARM_CRC32_INTRINSIC

#define CRC32C3X8(buffer,ITR) \
crc1 = __crc32cd(crc1, *((const uint64_t *)buffer + 42*1 + (ITR)));\
crc2 = __crc32cd(crc2, *((const uint64_t *)buffer + 42*2 + (ITR)));\
crc0 = __crc32cd(crc0, *((const uint64_t *)buffer + 42*0 + (ITR)));

#define CRC32C7X3X8(buffer,ITR) do {\
CRC32C3X8(buffer,(ITR)*7+0) \
CRC32C3X8(buffer,(ITR)*7+1) \
CRC32C3X8(buffer,(ITR)*7+2) \
CRC32C3X8(buffer,(ITR)*7+3) \
CRC32C3X8(buffer,(ITR)*7+4) \
CRC32C3X8(buffer,(ITR)*7+5) \
CRC32C3X8(buffer,(ITR)*7+6) \
} while(0)

#define PREF4X64L1(buffer,PREF_OFFSET, ITR) \
__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\
__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\
__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\
__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64));

#define PREF1KL1(buffer,PREF_OFFSET) \
PREF4X64L1(buffer,(PREF_OFFSET), 0) \
PREF4X64L1(buffer,(PREF_OFFSET), 4) \
PREF4X64L1(buffer,(PREF_OFFSET), 8) \
PREF4X64L1(buffer,(PREF_OFFSET), 12)

#define PREF4X64L2(buffer,PREF_OFFSET, ITR) \
__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\
__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\
__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\
__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64));

#define PREF1KL2(buffer,PREF_OFFSET) \
PREF4X64L2(buffer,(PREF_OFFSET), 0) \
PREF4X64L2(buffer,(PREF_OFFSET), 4) \
PREF4X64L2(buffer,(PREF_OFFSET), 8) \
PREF4X64L2(buffer,(PREF_OFFSET), 12)
#else
#undef ARM_CRC32_INTRINSIC
#endif

/* Flag that tells whether the CPU supports CRC32 or not */
bool ut_crc32_sse2_enabled = false;


#if defined(__GNUC__) && defined(__x86_64__)
/********************************************************************//**
Fetches CPU info */
Expand Down Expand Up @@ -421,6 +472,107 @@ ut_crc32_byte_by_byte_hw(
}
#endif /* defined(__GNUC__) && defined(__x86_64__) */


#ifdef ARM_CRC32_INTRINSIC
uint32_t
ut_crc32_byte_by_byte_aarch64(
const uint8_t* buf,
uint64_t len)
{
uint32_t crc = 0xFFFFFFFFU;

ut_a(ut_crc32_sse2_enabled);

while (len > 0) {
crc = __crc32cb(crc, *buf++);
len--;
}

return(~crc);
}

uint32_t
ut_crc32_aarch64(
const uint8_t* buf,
uint64_t len)
{
register uint32_t crc = 0xFFFFFFFFU;
register const uint16_t *buf2;
register const uint32_t *buf4;
register const uint64_t *buf8;

ut_a(ut_crc32_sse2_enabled);

uint32_t crc0, crc1, crc2;
int64_t length = (int64_t)len;
buf8 = (const uint64_t *)(const void *)buf;

/* Calculate reflected crc with PMULL Instruction */
const poly64_t k1 = 0xe417f38a, k2 = 0x8f158014;
uint64_t t0, t1;

/* crc done "by 3" for fixed input block size of 1024 bytes */
while ((length -= 1024) >= 0) {
/* Prefetch data for following block to avoid cache miss */
PREF1KL2(buf,1024*3);
/* Do first 8 bytes here for better pipelining */
crc0 = __crc32cd(crc, *buf8++);
crc1 = 0;
crc2 = 0;

/* Process block inline
Process crc0 last to avoid dependency with above */
CRC32C7X3X8(buf8,0);
CRC32C7X3X8(buf8,1);
CRC32C7X3X8(buf8,2);
CRC32C7X3X8(buf8,3);
CRC32C7X3X8(buf8,4);
CRC32C7X3X8(buf8,5);

buf8 += 42*3;
/* Prefetch data for following block to avoid cache miss */
PREF1KL1((uint8_t *)buf8,1024);

/* Merge crc0 and crc1 into crc2
crc1 multiply by K2
crc0 multiply by K1 */

t1 = (uint64_t)vmull_p64(crc1, k2);
t0 = (uint64_t)vmull_p64(crc0, k1);
crc = __crc32cd(crc2, *buf8++);
crc1 = __crc32cd(0, t1);
crc ^= crc1;
crc0 = __crc32cd(0, t0);
crc ^= crc0;
}

if(!(length += 1024))
return (~crc);

while ((length -= sizeof(uint64_t)) >= 0) {
crc = __crc32cd(crc, *buf8++);
}

/* The following is more efficient than the straight loop */
buf4 = (const uint32_t *)(const void *)buf8;
if (length & sizeof(uint32_t)) {
crc = __crc32cw(crc, *buf4++);
}

buf2 = (const uint16_t *)(const void *)buf4;
if (length & sizeof(uint16_t)) {
crc = __crc32ch(crc, *buf2++);
}

buf = (const uint8_t *)(const void *)buf2;
if (length & sizeof(uint8_t))
crc = __crc32cb(crc, *buf);

return(~crc);
}
#endif


/* CRC32 software implementation. */

/* Precalculated table used to generate the CRC32 if the CPU does not
Expand All @@ -431,7 +583,7 @@ static bool ut_crc32_slice8_table_initialized = false;
/********************************************************************//**
Initializes the table that is used to generate the CRC32 if the CPU does
not have support for it. */
static
//static
void
ut_crc32_slice8_table_init()
/*========================*/
Expand Down Expand Up @@ -727,6 +879,15 @@ ut_crc32_init()

#endif /* defined(__GNUC__) && defined(__x86_64__) */

#ifdef ARM_CRC32_INTRINSIC
ut_crc32_sse2_enabled = 0x1;
if (ut_crc32_sse2_enabled) {
ut_crc32 = ut_crc32_aarch64;
ut_crc32_legacy_big_endian = NULL;
ut_crc32_byte_by_byte = ut_crc32_byte_by_byte_aarch64;
}
#endif

if (!ut_crc32_sse2_enabled) {
ut_crc32_slice8_table_init();
ut_crc32 = ut_crc32_sw;
Expand Down