[ARM][Codegen] Fix vector data miscompilation in arm32be #105519

Merged
merged 4 commits into llvm:main on Sep 7, 2024

Conversation

Zhenhang1213
Contributor

@Zhenhang1213 Zhenhang1213 commented Aug 21, 2024

Fixes #102418: resolves the issue of generating an incorrect vrev during vectorization in big-endian scenarios.
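
For reference, a minimal IR reproducer is the xor_int64_ff0000ff0000ffff test discussed later in this review:

define arm_aapcs_vfpcc <2 x i64> @xor_int64_ff0000ff0000ffff(<2 x i64> %a) {
entry:
  %b = xor <2 x i64> %a, <i64 -72056498821201921, i64 -72056498821201921>
  ret <2 x i64> %b
}

When compiled for a big-endian ARM target (for example with something like llc -mtriple=armebv7-eabi -mattr=+neon; the exact flags here are illustrative), the constant was being materialised as vmov.i64 q8, #0xffffff0000ff, i.e. with the top/bottom half of each i64 swapped (see the discussion below).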

Thank you for submitting a Pull Request (PR) to the LLVM Project!

This PR will be automatically labeled and the relevant teams will be notified.

If you wish to, you can add reviewers by using the "Reviewers" section on this page.

If this is not working for you, it is probably because you do not have write permissions for the repository. In which case you can instead tag reviewers by name in a comment by using @ followed by their GitHub username.

If you have received no comments on your PR for a week, you can request a review by "ping"ing the PR with a comment “Ping”. The common courtesy "ping" rate is once a week. Please remember that you are asking for valuable time from other developers.

If you have further questions, they may be answered by the LLVM GitHub User Guide.

You can also ask questions in a comment on this PR, on the LLVM Discord or on the forums.

@llvmbot
Member

llvmbot commented Aug 21, 2024

@llvm/pr-subscribers-backend-arm

Author: Austin (Zhenhang1213)

Changes

Fix #102418


Full diff: https://github.com/llvm/llvm-project/pull/105519.diff

8 Files Affected:

  • (modified) llvm/lib/Target/ARM/ARMISelLowering.cpp (+1-15)
  • (modified) llvm/test/CodeGen/ARM/big-endian-neon-fp16-bitconv.ll (+1-2)
  • (modified) llvm/test/CodeGen/ARM/big-endian-vmov.ll (+11-10)
  • (modified) llvm/test/CodeGen/ARM/sdiv_shl.ll (+9-8)
  • (modified) llvm/test/CodeGen/ARM/vmov.ll (+2-1)
  • (modified) llvm/test/CodeGen/Thumb2/mve-be.ll (-2)
  • (modified) llvm/test/CodeGen/Thumb2/mve-intrinsics/vmovl.ll (-2)
  • (modified) llvm/test/CodeGen/Thumb2/mve-masked-load.ll (+14-11)
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 1e8bb8a495e68b..05e3fc6f505268 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -7121,19 +7121,6 @@ static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
       ImmMask <<= 1;
     }
 
-    if (DAG.getDataLayout().isBigEndian()) {
-      // Reverse the order of elements within the vector.
-      unsigned BytesPerElem = VectorVT.getScalarSizeInBits() / 8;
-      unsigned Mask = (1 << BytesPerElem) - 1;
-      unsigned NumElems = 8 / BytesPerElem;
-      unsigned NewImm = 0;
-      for (unsigned ElemNum = 0; ElemNum < NumElems; ++ElemNum) {
-        unsigned Elem = ((Imm >> ElemNum * BytesPerElem) & Mask);
-        NewImm |= Elem << (NumElems - ElemNum - 1) * BytesPerElem;
-      }
-      Imm = NewImm;
-    }
-
     // Op=1, Cmode=1110.
     OpCmode = 0x1e;
     VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
@@ -7966,7 +7953,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
 
       if (Val.getNode()) {
         SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
-        return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
+        return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vmov);
       }
 
       // Try an immediate VMVN.
@@ -18601,7 +18588,6 @@ static SDValue PerformBITCASTCombine(SDNode *N,
   if ((Src.getOpcode() == ARMISD::VMOVIMM ||
        Src.getOpcode() == ARMISD::VMVNIMM ||
        Src.getOpcode() == ARMISD::VMOVFPIMM) &&
-      SrcVT.getScalarSizeInBits() <= DstVT.getScalarSizeInBits() &&
       DAG.getDataLayout().isBigEndian())
     return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(N), DstVT, Src);
 
diff --git a/llvm/test/CodeGen/ARM/big-endian-neon-fp16-bitconv.ll b/llvm/test/CodeGen/ARM/big-endian-neon-fp16-bitconv.ll
index 4026495a0f2b41..a4f5d1c61eae73 100644
--- a/llvm/test/CodeGen/ARM/big-endian-neon-fp16-bitconv.ll
+++ b/llvm/test/CodeGen/ARM/big-endian-neon-fp16-bitconv.ll
@@ -101,9 +101,8 @@ define void @conv_v4i16_to_v4f16( <4 x i16> %a, ptr %store ) {
 ; CHECK-NEXT:    vmov.i64 d16, #0xffff00000000ffff
 ; CHECK-NEXT:    vldr d17, [r0]
 ; CHECK-NEXT:    vrev64.16 d18, d0
-; CHECK-NEXT:    vrev64.16 d17, d17
-; CHECK-NEXT:    vrev64.16 d16, d16
 ; CHECK-NEXT:    vadd.i16 d16, d18, d16
+; CHECK-NEXT:    vrev64.16 d17, d17
 ; CHECK-NEXT:    vadd.f16 d16, d16, d17
 ; CHECK-NEXT:    vrev64.16 d16, d16
 ; CHECK-NEXT:    vstr d16, [r0]
diff --git a/llvm/test/CodeGen/ARM/big-endian-vmov.ll b/llvm/test/CodeGen/ARM/big-endian-vmov.ll
index 2cb22b4d5fbc26..a29a8eb733219f 100644
--- a/llvm/test/CodeGen/ARM/big-endian-vmov.ll
+++ b/llvm/test/CodeGen/ARM/big-endian-vmov.ll
@@ -10,7 +10,7 @@ define arm_aapcs_vfpcc <8 x i8> @vmov_i8() {
 ;
 ; CHECK-BE-LABEL: vmov_i8:
 ; CHECK-BE:       @ %bb.0:
-; CHECK-BE-NEXT:    vmov.i64 d0, #0xff
+; CHECK-BE-NEXT:    vmov.i64 d0, #0xff00000000000000
 ; CHECK-BE-NEXT:    bx lr
   ret <8 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 -1>
 }
@@ -23,7 +23,7 @@ define arm_aapcs_vfpcc <4 x i16> @vmov_i16_a() {
 ;
 ; CHECK-BE-LABEL: vmov_i16_a:
 ; CHECK-BE:       @ %bb.0:
-; CHECK-BE-NEXT:    vmov.i64 d0, #0xffff
+; CHECK-BE-NEXT:    vmov.i64 d0, #0xffff000000000000
 ; CHECK-BE-NEXT:    bx lr
   ret <4 x i16> <i16 0, i16 0, i16 0, i16 -1>
 }
@@ -36,7 +36,7 @@ define arm_aapcs_vfpcc <4 x i16> @vmov_i16_b() {
 ;
 ; CHECK-BE-LABEL: vmov_i16_b:
 ; CHECK-BE:       @ %bb.0:
-; CHECK-BE-NEXT:    vmov.i64 d0, #0xff
+; CHECK-BE-NEXT:    vmov.i64 d0, #0xff000000000000
 ; CHECK-BE-NEXT:    bx lr
   ret <4 x i16> <i16 0, i16 0, i16 0, i16 255>
 }
@@ -49,7 +49,7 @@ define arm_aapcs_vfpcc <4 x i16> @vmov_i16_c() {
 ;
 ; CHECK-BE-LABEL: vmov_i16_c:
 ; CHECK-BE:       @ %bb.0:
-; CHECK-BE-NEXT:    vmov.i64 d0, #0xff00
+; CHECK-BE-NEXT:    vmov.i64 d0, #0xff00000000000000
 ; CHECK-BE-NEXT:    bx lr
   ret <4 x i16> <i16 0, i16 0, i16 0, i16 65280>
 }
@@ -62,7 +62,7 @@ define arm_aapcs_vfpcc <2 x i32> @vmov_i32_a() {
 ;
 ; CHECK-BE-LABEL: vmov_i32_a:
 ; CHECK-BE:       @ %bb.0:
-; CHECK-BE-NEXT:    vmov.i64 d0, #0xffffffff
+; CHECK-BE-NEXT:    vmov.i64 d0, #0xffffffff00000000
 ; CHECK-BE-NEXT:    bx lr
   ret <2 x i32> <i32 0, i32 -1>
 }
@@ -75,7 +75,7 @@ define arm_aapcs_vfpcc <2 x i32> @vmov_i32_b() {
 ;
 ; CHECK-BE-LABEL: vmov_i32_b:
 ; CHECK-BE:       @ %bb.0:
-; CHECK-BE-NEXT:    vmov.i64 d0, #0xff
+; CHECK-BE-NEXT:    vmov.i64 d0, #0xff00000000
 ; CHECK-BE-NEXT:    bx lr
   ret <2 x i32> <i32 0, i32 255>
 }
@@ -88,7 +88,7 @@ define arm_aapcs_vfpcc <2 x i32> @vmov_i32_c() {
 ;
 ; CHECK-BE-LABEL: vmov_i32_c:
 ; CHECK-BE:       @ %bb.0:
-; CHECK-BE-NEXT:    vmov.i64 d0, #0xff00
+; CHECK-BE-NEXT:    vmov.i64 d0, #0xff0000000000
 ; CHECK-BE-NEXT:    bx lr
   ret <2 x i32> <i32 0, i32 65280>
 }
@@ -101,7 +101,7 @@ define arm_aapcs_vfpcc <2 x i32> @vmov_i32_d() {
 ;
 ; CHECK-BE-LABEL: vmov_i32_d:
 ; CHECK-BE:       @ %bb.0:
-; CHECK-BE-NEXT:    vmov.i64 d0, #0xff0000
+; CHECK-BE-NEXT:    vmov.i64 d0, #0xff000000000000
 ; CHECK-BE-NEXT:    bx lr
   ret <2 x i32> <i32 0, i32 16711680>
 }
@@ -114,7 +114,7 @@ define arm_aapcs_vfpcc <2 x i32> @vmov_i32_e() {
 ;
 ; CHECK-BE-LABEL: vmov_i32_e:
 ; CHECK-BE:       @ %bb.0:
-; CHECK-BE-NEXT:    vmov.i64 d0, #0xff000000
+; CHECK-BE-NEXT:    vmov.i64 d0, #0xff00000000000000
 ; CHECK-BE-NEXT:    bx lr
   ret <2 x i32> <i32 0, i32 4278190080>
 }
@@ -130,7 +130,8 @@ define arm_aapcs_vfpcc <1 x i64> @vmov_i64_a() {
 define arm_aapcs_vfpcc <1 x i64> @vmov_i64_b() {
 ; CHECK-LABEL: vmov_i64_b:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    vmov.i64 d0, #0xffff00ff0000ff
+; CHECK-LE: vmov.i64 d0, #0xffff00ff0000ff{{$}}
+; CHECK-BE: vmov.i64 d0, #0xff0000ff00ffff00{{$}}
 ; CHECK-NEXT:    bx lr
   ret <1 x i64> <i64 72056498804490495>
 }
diff --git a/llvm/test/CodeGen/ARM/sdiv_shl.ll b/llvm/test/CodeGen/ARM/sdiv_shl.ll
index 01615ce2c46af5..f105cefc3cbb0e 100644
--- a/llvm/test/CodeGen/ARM/sdiv_shl.ll
+++ b/llvm/test/CodeGen/ARM/sdiv_shl.ll
@@ -53,20 +53,21 @@ define void @sdiv_shl(ptr %x, ptr %y) nounwind {
 ; BE:       @ %bb.0: @ %entry
 ; BE-NEXT:    adr r2, .LCPI0_0
 ; BE-NEXT:    vld1.64 {d18, d19}, [r1]
-; BE-NEXT:    adr r1, .LCPI0_1
+; BE-NEXT:    vmov.i32        q10, #0x0
+; BE-NEXT:    adr     r1, .LCPI0_1
 ; BE-NEXT:    vld1.64 {d16, d17}, [r2:128]
 ; BE-NEXT:    vrev64.16 q8, q8
 ; BE-NEXT:    vrev64.16 q9, q9
-; BE-NEXT:    vneg.s16 q8, q8
-; BE-NEXT:    vld1.64 {d20, d21}, [r1:128]
+; BE-NEXT:    vsub.i16        q8, q10, q8
+; BE-NEXT:    vld1.64 {d22, d23}, [r1:128]
 ; BE-NEXT:    adr r1, .LCPI0_2
-; BE-NEXT:    vshr.s16 q11, q9, #15
-; BE-NEXT:    vrev64.16 q10, q10
-; BE-NEXT:    vshl.u16 q8, q11, q8
+; BE-NEXT:    vshr.s16 q12, q9, #15
+; BE-NEXT:    vrev64.16 q11, q11
+; BE-NEXT:    vshl.u16 q8, q12, q8
+; BE-NEXT:    vsub.i16 q10, q10, q11
 ; BE-NEXT:    vld1.64 {d22, d23}, [r1:128]
-; BE-NEXT:    vneg.s16 q10, q10
+; BE-NEXT:    vadd.i16  q8, q9, q8
 ; BE-NEXT:    vrev64.16 q11, q11
-; BE-NEXT:    vadd.i16 q8, q9, q8
 ; BE-NEXT:    vshl.s16 q8, q8, q10
 ; BE-NEXT:    vbit q8, q9, q11
 ; BE-NEXT:    vrev64.16 q8, q8
diff --git a/llvm/test/CodeGen/ARM/vmov.ll b/llvm/test/CodeGen/ARM/vmov.ll
index 8835497669b324..df7f968854e72d 100644
--- a/llvm/test/CodeGen/ARM/vmov.ll
+++ b/llvm/test/CodeGen/ARM/vmov.ll
@@ -141,7 +141,8 @@ define arm_aapcs_vfpcc <2 x i32> @v_mvni32f() nounwind {
 define arm_aapcs_vfpcc <1 x i64> @v_movi64() nounwind {
 ; CHECK-LABEL: v_movi64:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    vmov.i64 d0, #0xff0000ff0000ffff
+; CHECK-LE:      vmov.i64 d0, #0xff0000ff0000ffff
+; CHECK-BE:      vmov.i64 d0, #0xffffff0000ff
 ; CHECK-NEXT:    mov pc, lr
 	ret <1 x i64> < i64 18374687574888349695 >
 }
diff --git a/llvm/test/CodeGen/Thumb2/mve-be.ll b/llvm/test/CodeGen/Thumb2/mve-be.ll
index 522d6f8704b6af..c7066ee34dc902 100644
--- a/llvm/test/CodeGen/Thumb2/mve-be.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-be.ll
@@ -232,7 +232,6 @@ define arm_aapcs_vfpcc <16 x i8> @and_v16i8_le(<4 x i32> %src) {
 ; CHECK-BE:       @ %bb.0: @ %entry
 ; CHECK-BE-NEXT:    vrev64.8 q1, q0
 ; CHECK-BE-NEXT:    vmov.i32 q0, #0x1
-; CHECK-BE-NEXT:    vrev32.8 q0, q0
 ; CHECK-BE-NEXT:    vand q1, q1, q0
 ; CHECK-BE-NEXT:    vrev64.8 q0, q1
 ; CHECK-BE-NEXT:    bx lr
@@ -254,7 +253,6 @@ define arm_aapcs_vfpcc <16 x i8> @and_v16i8_be(<4 x i32> %src) {
 ; CHECK-BE:       @ %bb.0: @ %entry
 ; CHECK-BE-NEXT:    vrev64.8 q1, q0
 ; CHECK-BE-NEXT:    vmov.i32 q0, #0x1000000
-; CHECK-BE-NEXT:    vrev32.8 q0, q0
 ; CHECK-BE-NEXT:    vand q1, q1, q0
 ; CHECK-BE-NEXT:    vrev64.8 q0, q1
 ; CHECK-BE-NEXT:    bx lr
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmovl.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmovl.ll
index fd33dddc685e5e..6d99dc4bd0b2f8 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmovl.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmovl.ll
@@ -65,7 +65,6 @@ define arm_aapcs_vfpcc <4 x i32> @test_vmovlbq_u16(<8 x i16> %a) {
 ; BE-LABEL: test_vmovlbq_u16:
 ; BE:       @ %bb.0: @ %entry
 ; BE-NEXT:    vrev64.16 q1, q0
-; BE-NEXT:    vmovlb.u16 q1, q1
 ; BE-NEXT:    vrev64.32 q0, q1
 ; BE-NEXT:    bx lr
 entry:
@@ -138,7 +137,6 @@ define arm_aapcs_vfpcc <4 x i32> @test_vmovltq_u16(<8 x i16> %a) {
 ; BE:       @ %bb.0: @ %entry
 ; BE-NEXT:    vrev64.16 q1, q0
 ; BE-NEXT:    vmovlt.u16 q1, q1
-; BE-NEXT:    vrev64.32 q0, q1
 ; BE-NEXT:    bx lr
 entry:
   %0 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-load.ll b/llvm/test/CodeGen/Thumb2/mve-masked-load.ll
index b0a3a6354daa70..255b003f11ff3a 100644
--- a/llvm/test/CodeGen/Thumb2/mve-masked-load.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-masked-load.ll
@@ -293,13 +293,14 @@ define arm_aapcs_vfpcc <4 x i32> @zext16_masked_v4i32_align2_other(ptr %dest, <4
 ;
 ; CHECK-BE-LABEL: zext16_masked_v4i32_align2_other:
 ; CHECK-BE:       @ %bb.0: @ %entry
-; CHECK-BE-NEXT:    vrev64.32 q1, q0
-; CHECK-BE-NEXT:    vmovlb.u16 q0, q1
-; CHECK-BE-NEXT:    vmovlb.s16 q1, q1
+; CHECK-BE-NEXT:    vmov.i32        q1, #0xffff
+; CHECK-BE-NEXT:    vrev64.32       q2, q0
+; CHECK-BE-NEXT:    vand    q0, q2, q1
+; CHECK-BE-NEXT:    vmovlb.s16      q1, q2
 ; CHECK-BE-NEXT:    vpt.s32 gt, q1, zr
-; CHECK-BE-NEXT:    vldrht.u32 q1, [r0]
-; CHECK-BE-NEXT:    vpsel q1, q1, q0
-; CHECK-BE-NEXT:    vrev64.32 q0, q1
+; CHECK-BE-NEXT:    vldrht.u32      q1, [r0]
+; CHECK-BE-NEXT:    vpsel   q1, q1, q0
+; CHECK-BE-NEXT:    vrev64.32       q0, q1
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %c = icmp sgt <4 x i16> %a, zeroinitializer
@@ -2071,12 +2072,14 @@ define arm_aapcs_vfpcc <4 x i32> @multi_user_zext(ptr %dest, <4 x i32> %a) {
 ; CHECK-BE-NEXT:    vpush {d8, d9}
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
 ; CHECK-BE-NEXT:    vpt.s32 gt, q1, zr
-; CHECK-BE-NEXT:    vldrht.u32 q0, [r0]
-; CHECK-BE-NEXT:    vrev64.32 q4, q0
-; CHECK-BE-NEXT:    vmov r1, r0, d8
-; CHECK-BE-NEXT:    vmov r3, r2, d9
+; CHECK-BE-NEXT:    vldrht.u32 q4, [r0]
+; CHECK-BE-NEXT:    vrev64.32 q0, q4
+; CHECK-BE-NEXT:    vmov r1, r0, d0
+; CHECK-BE-NEXT:    vmov r3, r2, d1
 ; CHECK-BE-NEXT:    bl foo
-; CHECK-BE-NEXT:    vmov q0, q4
+; CHECK-BE-NEXT:    vmov.i32 q0, #0xffff
+; CHECK-BE-NEXT:    vand    q1, q4, q0
+; CHECK-BE-NEXT:    vrev64.32 q0, q1
 ; CHECK-BE-NEXT:    vpop {d8, d9}
 ; CHECK-BE-NEXT:    pop {r7, pc}
 entry:

@CoTinker changed the title from "[ARM][Codegen] Fix vector data misscompilation in arm32be" to "[ARM][Codegen] Fix vector data miscompilation in arm32be" on Aug 21, 2024
Collaborator

@davemgreen davemgreen left a comment

Thanks for putting this up.

@Zhenhang1213
Contributor Author

Zhenhang1213 commented Aug 22, 2024

@davemgreen, could you review it again?

@hstk30-hw
Contributor

Split this into two commits, then we can see which code impacts which testcases.

@Zhenhang1213
Contributor Author

Split this into two commits, then we can see which code impacts which testcases.

OK, I have split it; you can see it in this PR: Link

@@ -65,7 +65,8 @@ define arm_aapcs_vfpcc <4 x i32> @test_vmovlbq_u16(<8 x i16> %a) {
; BE-LABEL: test_vmovlbq_u16:
; BE: @ %bb.0: @ %entry
; BE-NEXT: vrev64.16 q1, q0
; BE-NEXT: vmovlb.u16 q1, q1
; BE-NEXT: vmov.i32 q0, #0xffff
Collaborator

Please run update_llc_test_checks.py to update the tests; it should make updating them nice and easy, and keeps them all consistent.
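
For example (the build directory and file list here are illustrative):

  llvm/utils/update_llc_test_checks.py --llc-binary build/bin/llc llvm/test/CodeGen/ARM/big-endian-vmov.ll llvm/test/CodeGen/ARM/vmov.ll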

With the change in b9a0276 I'm hoping this difference goes away.

@davemgreen
Collaborator

Did you manage to look into fixing PerformBITCASTCombine yet?

@Zhenhang1213
Contributor Author

Did you manage to look into fixing PerformBITCASTCombine yet?

As I understand it, this change does not involve the PerformBITCASTCombine function, right?

@Zhenhang1213
Contributor Author

Did you manage to look into fixing PerformBITCASTCombine yet?

And if we don't fix this by changing ISD::BITCAST to ARMISD::VECTOR_REG_CAST, could I identify the failing scenario for this test case and call ARMISD::VECTOR_REG_CAST in the PerformBITCASTCombine function instead?

@Zhenhang1213
Contributor Author

Did you manage to look into fixing PerformBITCASTCombine yet?

And if we don't fix this by changing ISD::BITCAST to ARMISD::VECTOR_REG_CAST, could I identify the failing scenario for this test case and call ARMISD::VECTOR_REG_CAST in the PerformBITCASTCombine function instead?

I looked at this question again: would there be a chance to flip it when handling concat_vectors in PerformBITCASTCombine?

@davemgreen
Collaborator

Hi - I think all the changes you have so far are good, but some of the tests are still wrong and it would be good to fix PerformBITCASTCombine too.

If you consider the xor_int64_ff0000ff0000ffff case:

define arm_aapcs_vfpcc <2 x i64> @xor_int64_ff0000ff0000ffff(<2 x i64> %a) {
entry:
  %b = xor <2 x i64> %a, <i64 -72056498821201921, i64 -72056498821201921>
  ret <2 x i64> %b
}

A series of transforms, which I believe are all OK, lead to:

    t3: v2i64 = bitcast t2
        t16: v2i64 = ARMISD::VMOVIMM TargetConstant:i32<7737>
      t17: v4i32 = ARMISD::VECTOR_REG_CAST t16
    t14: v2i64 = bitcast t17
  t6: v2i64 = xor t3, t14
t7: v2f64 = bitcast t6

The bitcast ( VECTOR_REG_CAST ( VMOVIMM )) is converted to VECTOR_REG_CAST ( VMOVIMM ) because it is believed that the bitcast does not matter, as each lane of the input will be identical. It only looks at the bitcast dst type and the vmovimm type, ignoring that the VECTOR_REG_CAST makes the bitcast important.

It ends up with this, which has the top/bottom half of each i64 in the wrong order.

   t16: v2i64 = ARMISD::VMOVIMM TargetConstant:i32<7737>
 t6: v2i64 = xor t3, t16

  vmov.i64        q8, #0xffffff0000ff
  veor    q0, q0, q8

@Zhenhang1213
Contributor Author

Hi - I think all the changes you have so far are good, but some of the tests are still wrong and it would be good to fix PerformBITCASTCombine too.

If you consider the xor_int64_ff0000ff0000ffff case:

define arm_aapcs_vfpcc <2 x i64> @xor_int64_ff0000ff0000ffff(<2 x i64> %a) {
entry:
  %b = xor <2 x i64> %a, <i64 -72056498821201921, i64 -72056498821201921>
  ret <2 x i64> %b
}

A series of transforms, which I believe are all OK, lead to:

    t3: v2i64 = bitcast t2
        t16: v2i64 = ARMISD::VMOVIMM TargetConstant:i32<7737>
      t17: v4i32 = ARMISD::VECTOR_REG_CAST t16
    t14: v2i64 = bitcast t17
  t6: v2i64 = xor t3, t14
t7: v2f64 = bitcast t6

The bitcast ( VECTOR_REG_CAST ( VMOVIMM )) is converted to VECTOR_REG_CAST ( VMOVIMM ) because it is believed that the bitcast does not matter, as each lane of the input will be identical. It only looks at the bitcast dst type and the vmovimm type, ignoring that the VECTOR_REG_CAST makes the bitcast important.

It ends up with this, which has the top/bottom half of each i64 in the wrong order.

   t16: v2i64 = ARMISD::VMOVIMM TargetConstant:i32<7737>
 t6: v2i64 = xor t3, t16

  vmov.i64        q8, #0xffffff0000ff
  veor    q0, q0, q8

I have modified the tests, OK?

@davemgreen
Collaborator

Sorry, I haven't had a chance to look into this in detail yet. When I last looked I wasn't really sure what the best fix for that part was - I was thinking it might be worth just removing the while (Src.getOpcode() == ARMISD::VECTOR_REG_CAST) Src = Src.getOperand(0); if there isn't a better idea. As far as I understand, the part for bitcasts is always correct (as long as the vmov element size is smaller, swapping around the lanes at the larger size doesn't alter the result).

If we want to keep that optimization, then IMO it might be good to have a combine that works a little differently - one that looks through bitcast(vector_reg_cast(vmovimm)), reconstructs the constant as it should be, and generates a new vmovimm with the correct value. It should handle more cases. In the meantime, so long as everyone is happy enough with the code being a bit slower, we can just drop the while loop.
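
For context, the code being discussed looks roughly like this (a paraphrase of the pre-PR PerformBITCASTCombine in ARMISelLowering.cpp, assembled from the hunks above and the while loop quoted here; the signature is approximate and the surrounding code is elided):

static SDValue PerformBITCASTCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const ARMSubtarget *ST) {
  SelectionDAG &DAG = DCI.DAG;
  SDValue Src = N->getOperand(0);
  EVT DstVT = N->getValueType(0);

  // The while loop in question: it skips past every VECTOR_REG_CAST, so the
  // fold below only compares the bitcast destination type against the
  // VMOVIMM type and loses whatever the skipped casts implied.
  while (Src.getOpcode() == ARMISD::VECTOR_REG_CAST)
    Src = Src.getOperand(0);

  EVT SrcVT = Src.getValueType();
  // Under big-endian, fold bitcast(vmovimm) into vector_reg_cast(vmovimm).
  if ((Src.getOpcode() == ARMISD::VMOVIMM ||
       Src.getOpcode() == ARMISD::VMVNIMM ||
       Src.getOpcode() == ARMISD::VMOVFPIMM) &&
      SrcVT.getScalarSizeInBits() <= DstVT.getScalarSizeInBits() &&
      DAG.getDataLayout().isBigEndian())
    return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(N), DstVT, Src);

  // ... rest of the combine ...
  return SDValue();
}

Of the two options above, the first simply deletes the while loop; the second keeps the fold but reconstructs the immediate for the destination type rather than reusing the VMOVIMM node unchanged.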

@Zhenhang1213
Contributor Author

Zhenhang1213 commented Sep 5, 2024

Sorry, I haven't had a chance to look into this in detail yet. When I last looked I wasn't really sure what the best fix for that part was - I was thinking it might be worth just removing the while (Src.getOpcode() == ARMISD::VECTOR_REG_CAST) Src = Src.getOperand(0); if there isn't a better idea. As far as I understand, the part for bitcasts is always correct (as long as the vmov element size is smaller, swapping around the lanes at the larger size doesn't alter the result).

If we want to keep that optimization, then IMO it might be good to have a combine that works a little differently - one that looks through bitcast(vector_reg_cast(vmovimm)), reconstructs the constant as it should be, and generates a new vmovimm with the correct value. It should handle more cases. In the meantime, so long as everyone is happy enough with the code being a bit slower, we can just drop the while loop.

Hi, yesterday I tried to make it more restrictive, changing

SrcVT.getScalarSizeInBits() <= DstVT.getScalarSizeInBits()

to

SrcVT.getScalarSizeInBits() < DstVT.getScalarSizeInBits()

and the output for the xor_int64_ff0000ff0000ffff case is now correct, right? Thanks

; CHECKBE-LABEL: xor_int64_ff0000ff0000ffff:
; CHECKBE:       @ %bb.0: @ %entry
; CHECKBE-NEXT:    vmov.i64 q1, #0xffffff0000ff
; CHECKBE-NEXT:    vrev64.32 q2, q1
; CHECKBE-NEXT:    veor q0, q0, q2
; CHECKBE-NEXT:    bx lr

@davemgreen
Collaborator

I do believe xor_int64_ff0000ff0000ffff is correct now. My worry is that it might mean cases that are legal to fold no longer get folded, while possibly still not fixing cases that should be blocked. Something like (v4i32 bitcast (v16i8 regcast (v8i16 vmovimm))).

Can you either remove that while loop, or maybe change it to something like:

  // We may have a bitcast of something that has already had this bitcast
  // combine performed on it, so skip past any VECTOR_REG_CASTs.
  if (Src.getOpcode() == ARMISD::VECTOR_REG_CAST &&
      Src.getOperand(0).getValueType().getScalarSizeInBits() <=
          Src.getValueType().getScalarSizeInBits())
    Src = Src.getOperand(0);

Collaborator

@davemgreen davemgreen left a comment

Thanks, this is looking good to me now with a couple of adjustments.

github-actions bot commented Sep 6, 2024

✅ With the latest revision this PR passed the C/C++ code formatter.

Collaborator

@davemgreen davemgreen left a comment

Thanks. LGTM

@CoTinker CoTinker merged commit 3242e77 into llvm:main Sep 7, 2024
8 checks passed

github-actions bot commented Sep 7, 2024

@Zhenhang1213 Congratulations on having your first Pull Request (PR) merged into the LLVM Project!

Your changes will be combined with recent changes from other authors, then tested by our build bots. If there is a problem with a build, you may receive a report in an email or a comment on this PR.

Please check whether problems have been caused by your change specifically, as the builds can include changes from many authors. It is not uncommon for your change to be included in a build that fails due to someone else's changes, or infrastructure issues.

How to do this, and the rest of the post-merge process, is covered in detail here.

If your change does cause a problem, it may be reverted, or you can revert it yourself. This is a normal part of LLVM development. You can fix your changes and open a new PR to merge them again.

If you don't get any reports, no action is required from you. Your changes are working as expected, well done!

@llvm-ci
Collaborator

llvm-ci commented Sep 7, 2024

LLVM Buildbot has detected a new failure on builder sanitizer-aarch64-linux running on sanitizer-buildbot8 while building llvm at step 2 "annotate".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/51/builds/3494

Here is the relevant piece of the build log for reference:
Step 2 (annotate) failure: 'python ../sanitizer_buildbot/sanitizers/zorg/buildbot/builders/sanitizers/buildbot_selector.py' (failure)
...
[172/176] Generating MSAN_INST_TEST_OBJECTS.msan_test.cpp.aarch64-with-call.o
[173/176] Generating Msan-aarch64-with-call-Test
[174/176] Generating MSAN_INST_TEST_OBJECTS.msan_test.cpp.aarch64.o
[175/176] Generating Msan-aarch64-Test
[175/176] Running compiler_rt regression tests
llvm-lit: /home/b/sanitizer-aarch64-linux/build/llvm-project/llvm/utils/lit/lit/discovery.py:276: warning: input '/home/b/sanitizer-aarch64-linux/build/build_default/runtimes/runtimes-bins/compiler-rt/test/interception/Unit' contained no tests
llvm-lit: /home/b/sanitizer-aarch64-linux/build/llvm-project/llvm/utils/lit/lit/discovery.py:276: warning: input '/home/b/sanitizer-aarch64-linux/build/build_default/runtimes/runtimes-bins/compiler-rt/test/sanitizer_common/Unit' contained no tests
llvm-lit: /home/b/sanitizer-aarch64-linux/build/llvm-project/llvm/utils/lit/lit/main.py:72: note: The test suite configuration requested an individual test timeout of 0 seconds but a timeout of 900 seconds was requested on the command line. Forcing timeout to be 900 seconds.
-- Testing: 5639 tests, 48 workers --
Testing:  0.. 10.. 20.. 
FAIL: LeakSanitizer-AddressSanitizer-aarch64 :: TestCases/use_registers.cpp (1799 of 5639)
******************** TEST 'LeakSanitizer-AddressSanitizer-aarch64 :: TestCases/use_registers.cpp' FAILED ********************
Exit Code: 1

Command Output (stdout):
--
Test alloc: 0xe78bd8c20080 

=================================================================
==2088527==ERROR: LeakSanitizer: detected memory leaks

Direct leak of 1337 byte(s) in 1 object(s) allocated from:
    #0 0xb4e94c64d9dc in malloc /home/b/sanitizer-aarch64-linux/build/llvm-project/compiler-rt/lib/asan/asan_malloc_linux.cpp:68:3
    #1 0xb4e94c778c78 in registers_thread_func /home/b/sanitizer-aarch64-linux/build/llvm-project/compiler-rt/test/lsan/TestCases/use_registers.cpp:16:13
    #2 0xb4e94c64b2c0 in asan_thread_start(void*) /home/b/sanitizer-aarch64-linux/build/llvm-project/compiler-rt/lib/asan/asan_interceptors.cpp:239:28
    #3 0xe9ebd9aeba48 in thread_start misc/../sysdeps/unix/sysv/linux/aarch64/clone3.S:76

Objects leaked above:
0xe78bd8c20080 (1337 bytes)

SUMMARY: AddressSanitizer: 1337 byte(s) leaked in 1 allocation(s).

--
Command Output (stderr):
--
RUN: at line 2: /home/b/sanitizer-aarch64-linux/build/build_default/./bin/clang  --driver-mode=g++ -O0   -Wthread-safety -Wthread-safety-reference -Wthread-safety-beta   -gline-tables-only -fsanitize=address -I/home/b/sanitizer-aarch64-linux/build/llvm-project/compiler-rt/test/lsan/../ -pthread /home/b/sanitizer-aarch64-linux/build/llvm-project/compiler-rt/test/lsan/TestCases/use_registers.cpp -o /home/b/sanitizer-aarch64-linux/build/build_default/runtimes/runtimes-bins/compiler-rt/test/lsan/AARCH64AsanConfig/TestCases/Output/use_registers.cpp.tmp
+ /home/b/sanitizer-aarch64-linux/build/build_default/./bin/clang --driver-mode=g++ -O0 -Wthread-safety -Wthread-safety-reference -Wthread-safety-beta -gline-tables-only -fsanitize=address -I/home/b/sanitizer-aarch64-linux/build/llvm-project/compiler-rt/test/lsan/../ -pthread /home/b/sanitizer-aarch64-linux/build/llvm-project/compiler-rt/test/lsan/TestCases/use_registers.cpp -o /home/b/sanitizer-aarch64-linux/build/build_default/runtimes/runtimes-bins/compiler-rt/test/lsan/AARCH64AsanConfig/TestCases/Output/use_registers.cpp.tmp
RUN: at line 3: env LSAN_OPTIONS=:detect_leaks=1:"report_objects=1:use_stacks=0:use_registers=0" not  /home/b/sanitizer-aarch64-linux/build/build_default/runtimes/runtimes-bins/compiler-rt/test/lsan/AARCH64AsanConfig/TestCases/Output/use_registers.cpp.tmp 2>&1 | FileCheck /home/b/sanitizer-aarch64-linux/build/llvm-project/compiler-rt/test/lsan/TestCases/use_registers.cpp
+ env LSAN_OPTIONS=:detect_leaks=1:report_objects=1:use_stacks=0:use_registers=0 not /home/b/sanitizer-aarch64-linux/build/build_default/runtimes/runtimes-bins/compiler-rt/test/lsan/AARCH64AsanConfig/TestCases/Output/use_registers.cpp.tmp
+ FileCheck /home/b/sanitizer-aarch64-linux/build/llvm-project/compiler-rt/test/lsan/TestCases/use_registers.cpp
RUN: at line 4: env LSAN_OPTIONS=:detect_leaks=1:"report_objects=1:use_stacks=0:use_registers=1"  /home/b/sanitizer-aarch64-linux/build/build_default/runtimes/runtimes-bins/compiler-rt/test/lsan/AARCH64AsanConfig/TestCases/Output/use_registers.cpp.tmp 2>&1
+ env LSAN_OPTIONS=:detect_leaks=1:report_objects=1:use_stacks=0:use_registers=1 /home/b/sanitizer-aarch64-linux/build/build_default/runtimes/runtimes-bins/compiler-rt/test/lsan/AARCH64AsanConfig/TestCases/Output/use_registers.cpp.tmp

--

********************
Testing:  0.. 10.. 20.. 30.. 40.. 50.. 60.. 70.. 80.. 90.. 

2 warning(s) in tests
Step 9 (test compiler-rt symbolizer) failure: test compiler-rt symbolizer (failure)
...
[172/176] Generating MSAN_INST_TEST_OBJECTS.msan_test.cpp.aarch64-with-call.o
[173/176] Generating Msan-aarch64-with-call-Test
[174/176] Generating MSAN_INST_TEST_OBJECTS.msan_test.cpp.aarch64.o
[175/176] Generating Msan-aarch64-Test
[175/176] Running compiler_rt regression tests
llvm-lit: /home/b/sanitizer-aarch64-linux/build/llvm-project/llvm/utils/lit/lit/discovery.py:276: warning: input '/home/b/sanitizer-aarch64-linux/build/build_default/runtimes/runtimes-bins/compiler-rt/test/interception/Unit' contained no tests
llvm-lit: /home/b/sanitizer-aarch64-linux/build/llvm-project/llvm/utils/lit/lit/discovery.py:276: warning: input '/home/b/sanitizer-aarch64-linux/build/build_default/runtimes/runtimes-bins/compiler-rt/test/sanitizer_common/Unit' contained no tests
llvm-lit: /home/b/sanitizer-aarch64-linux/build/llvm-project/llvm/utils/lit/lit/main.py:72: note: The test suite configuration requested an individual test timeout of 0 seconds but a timeout of 900 seconds was requested on the command line. Forcing timeout to be 900 seconds.
-- Testing: 5639 tests, 48 workers --
Testing:  0.. 10.. 20.. 
FAIL: LeakSanitizer-AddressSanitizer-aarch64 :: TestCases/use_registers.cpp (1799 of 5639)
******************** TEST 'LeakSanitizer-AddressSanitizer-aarch64 :: TestCases/use_registers.cpp' FAILED ********************
Exit Code: 1

Command Output (stdout):
--
Test alloc: 0xe78bd8c20080 

=================================================================
==2088527==ERROR: LeakSanitizer: detected memory leaks

Direct leak of 1337 byte(s) in 1 object(s) allocated from:
    #0 0xb4e94c64d9dc in malloc /home/b/sanitizer-aarch64-linux/build/llvm-project/compiler-rt/lib/asan/asan_malloc_linux.cpp:68:3
    #1 0xb4e94c778c78 in registers_thread_func /home/b/sanitizer-aarch64-linux/build/llvm-project/compiler-rt/test/lsan/TestCases/use_registers.cpp:16:13
    #2 0xb4e94c64b2c0 in asan_thread_start(void*) /home/b/sanitizer-aarch64-linux/build/llvm-project/compiler-rt/lib/asan/asan_interceptors.cpp:239:28
    #3 0xe9ebd9aeba48 in thread_start misc/../sysdeps/unix/sysv/linux/aarch64/clone3.S:76

Objects leaked above:
0xe78bd8c20080 (1337 bytes)

SUMMARY: AddressSanitizer: 1337 byte(s) leaked in 1 allocation(s).

--
Command Output (stderr):
--
RUN: at line 2: /home/b/sanitizer-aarch64-linux/build/build_default/./bin/clang  --driver-mode=g++ -O0   -Wthread-safety -Wthread-safety-reference -Wthread-safety-beta   -gline-tables-only -fsanitize=address -I/home/b/sanitizer-aarch64-linux/build/llvm-project/compiler-rt/test/lsan/../ -pthread /home/b/sanitizer-aarch64-linux/build/llvm-project/compiler-rt/test/lsan/TestCases/use_registers.cpp -o /home/b/sanitizer-aarch64-linux/build/build_default/runtimes/runtimes-bins/compiler-rt/test/lsan/AARCH64AsanConfig/TestCases/Output/use_registers.cpp.tmp
+ /home/b/sanitizer-aarch64-linux/build/build_default/./bin/clang --driver-mode=g++ -O0 -Wthread-safety -Wthread-safety-reference -Wthread-safety-beta -gline-tables-only -fsanitize=address -I/home/b/sanitizer-aarch64-linux/build/llvm-project/compiler-rt/test/lsan/../ -pthread /home/b/sanitizer-aarch64-linux/build/llvm-project/compiler-rt/test/lsan/TestCases/use_registers.cpp -o /home/b/sanitizer-aarch64-linux/build/build_default/runtimes/runtimes-bins/compiler-rt/test/lsan/AARCH64AsanConfig/TestCases/Output/use_registers.cpp.tmp
RUN: at line 3: env LSAN_OPTIONS=:detect_leaks=1:"report_objects=1:use_stacks=0:use_registers=0" not  /home/b/sanitizer-aarch64-linux/build/build_default/runtimes/runtimes-bins/compiler-rt/test/lsan/AARCH64AsanConfig/TestCases/Output/use_registers.cpp.tmp 2>&1 | FileCheck /home/b/sanitizer-aarch64-linux/build/llvm-project/compiler-rt/test/lsan/TestCases/use_registers.cpp
+ env LSAN_OPTIONS=:detect_leaks=1:report_objects=1:use_stacks=0:use_registers=0 not /home/b/sanitizer-aarch64-linux/build/build_default/runtimes/runtimes-bins/compiler-rt/test/lsan/AARCH64AsanConfig/TestCases/Output/use_registers.cpp.tmp
+ FileCheck /home/b/sanitizer-aarch64-linux/build/llvm-project/compiler-rt/test/lsan/TestCases/use_registers.cpp
RUN: at line 4: env LSAN_OPTIONS=:detect_leaks=1:"report_objects=1:use_stacks=0:use_registers=1"  /home/b/sanitizer-aarch64-linux/build/build_default/runtimes/runtimes-bins/compiler-rt/test/lsan/AARCH64AsanConfig/TestCases/Output/use_registers.cpp.tmp 2>&1
+ env LSAN_OPTIONS=:detect_leaks=1:report_objects=1:use_stacks=0:use_registers=1 /home/b/sanitizer-aarch64-linux/build/build_default/runtimes/runtimes-bins/compiler-rt/test/lsan/AARCH64AsanConfig/TestCases/Output/use_registers.cpp.tmp

--

********************
Testing:  0.. 10.. 20.. 30.. 40.. 50.. 60.. 70.. 80.. 90..

2 warning(s) in tests

davemgreen added a commit to davemgreen/llvm-project that referenced this pull request Sep 9, 2024
This is a smaller follow on to llvm#105519 that fixes VBICimm and VORRimm too. The
logic behind lowering vector immediates under big endian Neon/MVE is to treat
them in natural lane ordering (same as little endian), and VECTOR_REG_CAST them
to the correct type (as opposed to creating the constants in big endian form
and bitcasting them). This makes sure that is done when creating VORRIMM and
VBICIMM.
davemgreen added a commit that referenced this pull request Sep 13, 2024
This is a smaller follow on to #105519 that fixes VBICimm and VORRimm
too. The logic behind lowering vector immediates under big endian
Neon/MVE is to treat them in natural lane ordering (same as little
endian), and VECTOR_REG_CAST them to the correct type (as opposed to
creating the constants in big endian form and bitcasting them). This
makes sure that is done when creating VORRIMM and VBICIMM.

Successfully merging this pull request may close these issues.

[llvm] Deal vector data failed when using option Os、-mfloat-abi=softfp、--target=armebv7-linux
6 participants