[AArch64][SVE] Support logical operation BIC with DestructiveBinary patterns

vfdff · vfdff · commit ffb109b6852d · 2022-11-11T09:15:50.000+08:00
Logical operation BIC with DestructiveBinary patterns is temporarily removed as causes an assert (commit 3c382ed), so try to fix that. The most significant being that for pseudo instructions that do not have real instructions (including movpfx'd ones) that cover all combinations of register allocation, their expansion will be broken. This is the main reason the zeroing is an experimental feature because it has known bugs. So we add an extra LSL for movprfx expand BIC_ZPZZ_ZERO A, P, A, A when necessary. movprfx z0.s, p0/z, z0.s lsl z0.b, p0/m, z0.b, #0 bic z0.s, p0/m, z0.s, z0.s Depends on D88595
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -447,15 +447,11 @@ bool AArch64ExpandPseudo::expand_DestructiveOp(
   uint64_t DType = TII->get(Opcode).TSFlags & AArch64::DestructiveInstTypeMask;
   uint64_t FalseLanes = MI.getDesc().TSFlags & AArch64::FalseLanesMask;
   bool FalseZero = FalseLanes == AArch64::FalseLanesZero;
-
   Register DstReg = MI.getOperand(0).getReg();
   bool DstIsDead = MI.getOperand(0).isDead();
-
-  if (DType == AArch64::DestructiveBinary)
-    assert(DstReg != MI.getOperand(3).getReg());
-
   bool UseRev = false;
   unsigned PredIdx, DOPIdx, SrcIdx, Src2Idx;
+
   switch (DType) {
   case AArch64::DestructiveBinaryComm:
   case AArch64::DestructiveBinaryCommWithRev:
@@ -489,12 +485,13 @@ bool AArch64ExpandPseudo::expand_DestructiveOp(
     llvm_unreachable("Unsupported Destructive Operand type");
   }
 
-#ifndef NDEBUG
   // MOVPRFX can only be used if the destination operand
   // is the destructive operand, not as any other operand,
   // so the Destructive Operand must be unique.
   bool DOPRegIsUnique = false;
   switch (DType) {
+  case AArch64::DestructiveBinary:
+    DOPRegIsUnique = DstReg != MI.getOperand(SrcIdx).getReg();
   case AArch64::DestructiveBinaryComm:
   case AArch64::DestructiveBinaryCommWithRev:
     DOPRegIsUnique =
@@ -512,7 +509,6 @@ bool AArch64ExpandPseudo::expand_DestructiveOp(
          MI.getOperand(DOPIdx).getReg() != MI.getOperand(Src2Idx).getReg());
     break;
   }
-#endif
 
   // Resolve the reverse opcode
   if (UseRev) {
@@ -527,23 +523,27 @@ bool AArch64ExpandPseudo::expand_DestructiveOp(
 
   // Get the right MOVPRFX
   uint64_t ElementSize = TII->getElementSizeForOpcode(Opcode);
-  unsigned MovPrfx, MovPrfxZero;
+  unsigned MovPrfx, LSLZero, MovPrfxZero;
   switch (ElementSize) {
   case AArch64::ElementSizeNone:
   case AArch64::ElementSizeB:
     MovPrfx = AArch64::MOVPRFX_ZZ;
+    LSLZero = AArch64::LSL_ZPmI_B;
     MovPrfxZero = AArch64::MOVPRFX_ZPzZ_B;
     break;
   case AArch64::ElementSizeH:
     MovPrfx = AArch64::MOVPRFX_ZZ;
+    LSLZero = AArch64::LSL_ZPmI_H;
     MovPrfxZero = AArch64::MOVPRFX_ZPzZ_H;
     break;
   case AArch64::ElementSizeS:
     MovPrfx = AArch64::MOVPRFX_ZZ;
+    LSLZero = AArch64::LSL_ZPmI_S;
     MovPrfxZero = AArch64::MOVPRFX_ZPzZ_S;
     break;
   case AArch64::ElementSizeD:
     MovPrfx = AArch64::MOVPRFX_ZZ;
+    LSLZero = AArch64::LSL_ZPmI_D;
     MovPrfxZero = AArch64::MOVPRFX_ZPzZ_D;
     break;
   default:
@@ -555,9 +555,10 @@ bool AArch64ExpandPseudo::expand_DestructiveOp(
   //
   MachineInstrBuilder PRFX, DOP;
   if (FalseZero) {
-#ifndef NDEBUG
-    assert(DOPRegIsUnique && "The destructive operand should be unique");
-#endif
+    // If we cannot prefix the requested instruction we'll instead emit a
+    // prefixed_zeroing_mov for DestructiveBinary.
+    assert((DOPRegIsUnique || AArch64::DestructiveBinary == DType) &&
+           "The destructive operand should be unique");
     assert(ElementSize != AArch64::ElementSizeNone &&
            "This instruction is unpredicated");
 
@@ -569,10 +570,19 @@ bool AArch64ExpandPseudo::expand_DestructiveOp(
 
     // After the movprfx, the destructive operand is same as Dst
     DOPIdx = 0;
+
+    // Create the additional LSL to zero the lanes when the DstReg is not
+    // unique. Zeros the lanes in z0 that aren't active in p0 with sequence
+    // movprfx z0.b, p0/z, z0.b; lsl z0.b, p0/m, z0.b, #0;
+    if (DType == AArch64::DestructiveBinary && !DOPRegIsUnique) {
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(LSLZero))
+          .addReg(DstReg, RegState::Define)
+          .add(MI.getOperand(PredIdx))
+          .addReg(DstReg)
+          .addImm(0);
+    }
   } else if (DstReg != MI.getOperand(DOPIdx).getReg()) {
-#ifndef NDEBUG
     assert(DOPRegIsUnique && "The destructive operand should be unique");
-#endif
     PRFX = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(MovPrfx))
                .addReg(DstReg, RegState::Define)
                .addReg(MI.getOperand(DOPIdx).getReg());
@@ -591,6 +601,7 @@ bool AArch64ExpandPseudo::expand_DestructiveOp(
         .add(MI.getOperand(PredIdx))
         .add(MI.getOperand(SrcIdx));
     break;
+  case AArch64::DestructiveBinary:
   case AArch64::DestructiveBinaryImm:
   case AArch64::DestructiveBinaryComm:
   case AArch64::DestructiveBinaryCommWithRev:
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -433,7 +433,7 @@ let Predicates = [HasSVEorSME, UseExperimentalZeroingPseudos] in {
   defm ORR_ZPZZ  : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_orr>;
   defm EOR_ZPZZ  : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_eor>;
   defm AND_ZPZZ  : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_and>;
-  defm BIC_ZPZZ  : sve_int_bin_pred_zeroing_bhsd<null_frag>;
+  defm BIC_ZPZZ  : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_bic>;
 } // End HasSVEorSME, UseExperimentalZeroingPseudos
 
 let Predicates = [HasSVEorSME] in {
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-merging.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-merging.ll
@@ -344,8 +344,7 @@ define <vscale x 2 x i64> @and_i64_zero(<vscale x 2 x i1> %pg, <vscale x 2 x i64
 define <vscale x 16 x i8> @bic_i8_zero(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
 ; CHECK-LABEL: bic_i8_zero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z2.b, #0 // =0x0
-; CHECK-NEXT:    sel z0.b, p0, z0.b, z2.b
+; CHECK-NEXT:    movprfx z0.b, p0/z, z0.b
 ; CHECK-NEXT:    bic z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    ret
   %a_z = select <vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> zeroinitializer
@@ -358,8 +357,7 @@ define <vscale x 16 x i8> @bic_i8_zero(<vscale x 16 x i1> %pg, <vscale x 16 x i8
 define <vscale x 8 x i16> @bic_i16_zero(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
 ; CHECK-LABEL: bic_i16_zero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z2.h, #0 // =0x0
-; CHECK-NEXT:    sel z0.h, p0, z0.h, z2.h
+; CHECK-NEXT:    movprfx z0.h, p0/z, z0.h
 ; CHECK-NEXT:    bic z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    ret
   %a_z = select <vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> zeroinitializer
@@ -372,8 +370,7 @@ define <vscale x 8 x i16> @bic_i16_zero(<vscale x 8 x i1> %pg, <vscale x 8 x i16
 define <vscale x 4 x i32> @bic_i32_zero(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
 ; CHECK-LABEL: bic_i32_zero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z2.s, #0 // =0x0
-; CHECK-NEXT:    sel z0.s, p0, z0.s, z2.s
+; CHECK-NEXT:    movprfx z0.s, p0/z, z0.s
 ; CHECK-NEXT:    bic z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    ret
   %a_z = select <vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> zeroinitializer
@@ -386,8 +383,7 @@ define <vscale x 4 x i32> @bic_i32_zero(<vscale x 4 x i1> %pg, <vscale x 4 x i32
 define <vscale x 2 x i64> @bic_i64_zero(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
 ; CHECK-LABEL: bic_i64_zero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z2.d, #0 // =0x0
-; CHECK-NEXT:    sel z0.d, p0, z0.d, z2.d
+; CHECK-NEXT:    movprfx z0.d, p0/z, z0.d
 ; CHECK-NEXT:    bic z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    ret
   %a_z = select <vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> zeroinitializer
@@ -397,6 +393,39 @@ define <vscale x 2 x i64> @bic_i64_zero(<vscale x 2 x i1> %pg, <vscale x 2 x i64
   ret <vscale x 2 x i64> %out
 }
 
+; BIC (i.e. A & ~A) is illegal operation with movprfx, so the codegen depend on IR before expand-pseudo
+define <vscale x 2 x i64> @bic_i64_zero_no_unique_reg(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) {
+; CHECK-LABEL: bic_i64_zero_no_unique_reg:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.d, #0 // =0x0
+; CHECK-NEXT:    mov z1.d, p0/m, z0.d
+; CHECK-NEXT:    movprfx z0.d, p0/z, z0.d
+; CHECK-NEXT:    bic z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+  %a_z = select <vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> zeroinitializer
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.bic.nxv2i64(<vscale x 2 x i1> %pg,
+                                                               <vscale x 2 x i64> %a_z,
+                                                               <vscale x 2 x i64> %a_z)
+  ret <vscale x 2 x i64> %out
+}
+
+; BIC (i.e. A & ~B) is not a commutative operation, so disable it when the
+; destination operand is not the destructive operand
+define <vscale x 2 x i64> @bic_i64_zero_no_comm(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: bic_i64_zero_no_comm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z2.d, #0 // =0x0
+; CHECK-NEXT:    sel z0.d, p0, z0.d, z2.d
+; CHECK-NEXT:    bic z1.d, p0/m, z1.d, z0.d
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ret
+  %a_z = select <vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> zeroinitializer
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.bic.nxv2i64(<vscale x 2 x i1> %pg,
+                                                               <vscale x 2 x i64> %b,
+                                                               <vscale x 2 x i64> %a_z)
+  ret <vscale x 2 x i64> %out
+}
+
 declare <vscale x 16 x i8> @llvm.aarch64.sve.add.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
 declare <vscale x 8 x i16> @llvm.aarch64.sve.add.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
 declare <vscale x 4 x i32> @llvm.aarch64.sve.add.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-merging.mir b/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-merging.mir
@@ -0,0 +1,39 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=aarch64 -mattr=+sve -mattr=+use-experimental-zeroing-pseudos -run-pass=aarch64-expand-pseudo %s -o - | FileCheck %s
+
+# Should create an additional LSL to zero the lanes as the DstReg is not unique
+
+--- |
+  define <vscale x 8 x i16> @bic_i16_zero(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a){
+    %a_z = select <vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> zeroinitializer
+    %out = call <vscale x 8 x i16> @llvm.aarch64.sve.bic.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a_z, <vscale x 8 x i16> %a_z)
+    ret <vscale x 8 x i16> %out
+  }
+
+  declare <vscale x 8 x i16> @llvm.aarch64.sve.bic.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+...
+---
+name:            bic_i16_zero
+alignment:       4
+tracksRegLiveness: true
+tracksDebugUserValues: true
+registers:       []
+liveins:
+  - { reg: '$p0', virtual-reg: '' }
+  - { reg: '$z0', virtual-reg: '' }
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: $p0, $z0
+
+    ; CHECK-LABEL: name: bic_i16_zero
+    ; CHECK: liveins: $p0, $z0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: BUNDLE implicit-def $z0, implicit-def $q0, implicit-def $d0, implicit-def $s0, implicit-def $h0, implicit-def $b0, implicit-def $z0_hi, implicit killed $p0, implicit $z0 {
+    ; CHECK-NEXT:   $z0 = MOVPRFX_ZPzZ_H $p0, $z0
+    ; CHECK-NEXT:   $z0 = LSL_ZPmI_H killed renamable $p0, internal $z0, 0
+    ; CHECK-NEXT:   $z0 = BIC_ZPmZ_H killed renamable $p0, internal killed $z0, internal killed renamable $z0
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: RET undef $lr, implicit $z0
+    renamable $z0 = BIC_ZPZZ_ZERO_H killed renamable $p0, killed renamable $z0, killed renamable $z0
+    RET_ReallyLR implicit $z0
+...