[ARM] Fix lane ordering for AdvSIMD intrinsics on big-endian targets #127068

Merged · 1 commit · Mar 4, 2025
115 changes: 115 additions & 0 deletions clang/test/CodeGen/arm-neon-endianness.c
@@ -0,0 +1,115 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5

// REQUIRES: arm-registered-target

// RUN: %clang_cc1 -triple armv8a-arm-none-eabihf -target-cpu generic -emit-llvm -o - %s -disable-O0-optnone | \
// RUN: opt -S -passes=instcombine -o - | FileCheck %s --check-prefix=LE
// RUN: %clang_cc1 -triple armebv8a-arm-none-eabihf -target-cpu generic -emit-llvm -o - %s -disable-O0-optnone | \
// RUN: opt -S -passes=instcombine -o - | FileCheck %s --check-prefix=BE

#include <arm_neon.h>

// LE-LABEL: define dso_local i32 @int32x4_t_lane_0(
// LE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
// LE-NEXT: [[ENTRY:.*:]]
// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 0
// LE-NEXT: ret i32 [[VGET_LANE]]
//
// BE-LABEL: define dso_local i32 @int32x4_t_lane_0(
// BE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
// BE-NEXT: [[ENTRY:.*:]]
// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 1
// BE-NEXT: ret i32 [[VGET_LANE]]
//
int int32x4_t_lane_0(int32x4_t a) { return vgetq_lane_s32(a, 0); }
// LE-LABEL: define dso_local i32 @int32x4_t_lane_1(
// LE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// LE-NEXT: [[ENTRY:.*:]]
// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 1
// LE-NEXT: ret i32 [[VGET_LANE]]
//
// BE-LABEL: define dso_local i32 @int32x4_t_lane_1(
// BE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// BE-NEXT: [[ENTRY:.*:]]
// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 0
// BE-NEXT: ret i32 [[VGET_LANE]]
//
int int32x4_t_lane_1(int32x4_t a) { return vgetq_lane_s32(a, 1); }
// LE-LABEL: define dso_local i32 @int32x4_t_lane_2(
// LE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// LE-NEXT: [[ENTRY:.*:]]
// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 2
// LE-NEXT: ret i32 [[VGET_LANE]]
//
// BE-LABEL: define dso_local i32 @int32x4_t_lane_2(
// BE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// BE-NEXT: [[ENTRY:.*:]]
// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 3
// BE-NEXT: ret i32 [[VGET_LANE]]
//
int int32x4_t_lane_2(int32x4_t a) { return vgetq_lane_s32(a, 2); }
// LE-LABEL: define dso_local i32 @int32x4_t_lane_3(
// LE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// LE-NEXT: [[ENTRY:.*:]]
// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 3
// LE-NEXT: ret i32 [[VGET_LANE]]
//
// BE-LABEL: define dso_local i32 @int32x4_t_lane_3(
// BE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// BE-NEXT: [[ENTRY:.*:]]
// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 2
// BE-NEXT: ret i32 [[VGET_LANE]]
//
int int32x4_t_lane_3(int32x4_t a) { return vgetq_lane_s32(a, 3); }
// LE-LABEL: define dso_local i32 @int32x2_t_lane_0(
// LE-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// LE-NEXT: [[ENTRY:.*:]]
// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[A]], i64 0
// LE-NEXT: ret i32 [[VGET_LANE]]
//
// BE-LABEL: define dso_local i32 @int32x2_t_lane_0(
// BE-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// BE-NEXT: [[ENTRY:.*:]]
// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[A]], i64 1
// BE-NEXT: ret i32 [[VGET_LANE]]
//
int int32x2_t_lane_0(int32x2_t a) { return vget_lane_s32(a, 0); }
// LE-LABEL: define dso_local i32 @int32x2_t_lane_1(
// LE-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// LE-NEXT: [[ENTRY:.*:]]
// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[A]], i64 1
// LE-NEXT: ret i32 [[VGET_LANE]]
//
// BE-LABEL: define dso_local i32 @int32x2_t_lane_1(
// BE-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// BE-NEXT: [[ENTRY:.*:]]
// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[A]], i64 0
// BE-NEXT: ret i32 [[VGET_LANE]]
//
int int32x2_t_lane_1(int32x2_t a) { return vget_lane_s32(a, 1); }
// LE-LABEL: define dso_local i64 @int64x2_t_lane_0(
// LE-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// LE-NEXT: [[ENTRY:.*:]]
// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i64> [[A]], i64 0
// LE-NEXT: ret i64 [[VGET_LANE]]
//
// BE-LABEL: define dso_local i64 @int64x2_t_lane_0(
// BE-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// BE-NEXT: [[ENTRY:.*:]]
// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i64> [[A]], i64 0
// BE-NEXT: ret i64 [[VGET_LANE]]
//
int64_t int64x2_t_lane_0(int64x2_t a) { return vgetq_lane_s64(a, 0); }
// LE-LABEL: define dso_local i64 @int64x2_t_lane_1(
// LE-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// LE-NEXT: [[ENTRY:.*:]]
// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i64> [[A]], i64 1
// LE-NEXT: ret i64 [[VGET_LANE]]
//
// BE-LABEL: define dso_local i64 @int64x2_t_lane_1(
// BE-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// BE-NEXT: [[ENTRY:.*:]]
// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i64> [[A]], i64 1
// BE-NEXT: ret i64 [[VGET_LANE]]
//
int64_t int64x2_t_lane_1(int64x2_t a) { return vgetq_lane_s64(a, 1); }
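
The BE lane numbers checked above follow the AArch32 rule: lanes are reversed within each 64-bit chunk, so for <4 x i32> lane 0 maps to element 1 and lane 2 to element 3, while each <2 x i64> lane maps to itself. A minimal sketch of the fold instcombine performs (my illustration, not part of the patch; lane0_after_chunk_reverse is a hypothetical name):

#include <stdint.h>

typedef int32_t v4si __attribute__((vector_size(16))); /* Clang vector ext */

/* Reversing lanes within each 64-bit chunk (the AArch32
 * __lane_reverse_128_32 index list, i.e. 1,0,3,2) and then reading
 * lane 0 is the same as reading element 1 directly; instcombine folds
 * the shuffle+extract pair into the single remapped extractelement
 * that the BE check lines expect. */
static int32_t lane0_after_chunk_reverse(v4si a) {
  v4si rev = __builtin_shufflevector(a, a, 1, 0, 3, 2);
  return rev[0]; /* folds to a[1] */
}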
53 changes: 38 additions & 15 deletions clang/utils/TableGen/NeonEmitter.cpp
@@ -1263,20 +1263,17 @@ void Intrinsic::emitReverseVariable(Variable &Dest, Variable &Src) {
 
     for (unsigned K = 0; K < Dest.getType().getNumVectors(); ++K) {
       OS << " " << Dest.getName() << ".val[" << K << "] = "
-         << "__builtin_shufflevector("
-         << Src.getName() << ".val[" << K << "], "
-         << Src.getName() << ".val[" << K << "]";
-      for (int J = Dest.getType().getNumElements() - 1; J >= 0; --J)
-        OS << ", " << J;
-      OS << ");";
+         << "__builtin_shufflevector(" << Src.getName() << ".val[" << K << "], "
+         << Src.getName() << ".val[" << K << "], __lane_reverse_"
+         << Dest.getType().getSizeInBits() << "_"
+         << Dest.getType().getElementSizeInBits() << ");";
       emitNewLine();
     }
   } else {
-    OS << " " << Dest.getName()
-       << " = __builtin_shufflevector(" << Src.getName() << ", " << Src.getName();
-    for (int J = Dest.getType().getNumElements() - 1; J >= 0; --J)
-      OS << ", " << J;
-    OS << ");";
+    OS << " " << Dest.getName() << " = __builtin_shufflevector("
+       << Src.getName() << ", " << Src.getName() << ", __lane_reverse_"
+       << Dest.getType().getSizeInBits() << "_"
+       << Dest.getType().getElementSizeInBits() << ");";
     emitNewLine();
   }
 }
@@ -1877,10 +1874,11 @@ std::string Intrinsic::generate() {
 
   OS << "#else\n";
 
-  // Big endian intrinsics are more complex. The user intended these
-  // intrinsics to operate on a vector "as-if" loaded by (V)LDR,
-  // but we load as-if (V)LD1. So we should swap all arguments and
-  // swap the return value too.
+  // Big endian intrinsics are more complex. The user intended these intrinsics
+  // to operate on a vector "as-if" loaded by LDR (for AArch64), VLDR (for
+  // 64-bit vectors on AArch32), or VLDM (for 128-bit vectors on AArch32) but
+  // we load as-if LD1 (for AArch64) or VLD1 (for AArch32). So we should swap
+  // all arguments and swap the return value too.
   //
   // If we call sub-intrinsics, we should call a version that does
   // not re-swap the arguments!
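
For context, a sketch (my own illustration, not from the patch) of why the two load styles disagree on big-endian targets, using four int32 values in memory:

#include <stdint.h>

int32_t mem[4] = {0, 1, 2, 3};

/* Assumed big-endian lane contents (lane 0 = least significant bits):
 *
 * AArch64:
 *   LD1 {v0.4s}, [x0]   -> lanes 0..3 = 0, 1, 2, 3
 *   LDR q0, [x0]        -> lanes 0..3 = 3, 2, 1, 0   (whole-vector reversal)
 *
 * AArch32 treats a Q register as two 64-bit D registers, so VLDM/VLDR
 * reverse only within each 64-bit half:
 *   VLD1.32 {d0-d1}, [r0] -> lanes 0..3 = 0, 1, 2, 3
 *   VLDM r0, {d0,d1}      -> lanes 0..3 = 1, 0, 3, 2
 */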
@@ -2434,6 +2432,31 @@ void NeonEmitter::run(raw_ostream &OS) {
   OS << "#define __ai static __inline__ __attribute__((__always_inline__, "
         "__nodebug__))\n\n";
 
+  // Shufflevector argument lists for endian-swapping vectors on big-endian
+  // targets. For AArch64, we need to reverse every lane in the vector, but for
+  // AArch32 we need to reverse the lanes within each 64-bit chunk of the
+  // vector. The naming convention here is __lane_reverse_<n>_<m>, where <n> is
+  // the length of the vector in bits, and <m> is the length of each lane in
+  // bits.
+  OS << "#if !defined(__LITTLE_ENDIAN__)\n";
+  OS << "#if defined(__aarch64__) || defined(__arm64ec__)\n";
+  OS << "#define __lane_reverse_64_32 1,0\n";
+  OS << "#define __lane_reverse_64_16 3,2,1,0\n";
+  OS << "#define __lane_reverse_64_8 7,6,5,4,3,2,1,0\n";
+  OS << "#define __lane_reverse_128_64 1,0\n";
+  OS << "#define __lane_reverse_128_32 3,2,1,0\n";
+  OS << "#define __lane_reverse_128_16 7,6,5,4,3,2,1,0\n";
+  OS << "#define __lane_reverse_128_8 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0\n";
+  OS << "#else\n";
+  OS << "#define __lane_reverse_64_32 1,0\n";
+  OS << "#define __lane_reverse_64_16 3,2,1,0\n";
+  OS << "#define __lane_reverse_64_8 7,6,5,4,3,2,1,0\n";
+  OS << "#define __lane_reverse_128_64 0,1\n";
+  OS << "#define __lane_reverse_128_32 1,0,3,2\n";
+  OS << "#define __lane_reverse_128_16 3,2,1,0,7,6,5,4\n";
+  OS << "#define __lane_reverse_128_8 7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8\n";
+  OS << "#endif\n";
+  OS << "#endif\n";
+
   SmallVector<Intrinsic *, 128> Defs;
   for (const Record *R : Records.getAllDerivedDefinitions("Inst"))
     createIntrinsic(R, Defs);
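Roughly, the generated arm_neon.h consumes these macros as shufflevector index lists. A sketch of the emitted pattern (approximate, with illustrative __p0/__rev0 names, not the literal generated header):

/* Previously the emitter printed the reversed indices inline:
 *   __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
 * It now prints the macro name and lets the preprocessor pick the
 * per-target index list:
 *   __rev0 = __builtin_shufflevector(__p0, __p0, __lane_reverse_128_32);
 * which expands to 3,2,1,0 on big-endian AArch64 but 1,0,3,2 on
 * big-endian AArch32.
 */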