[AArch64] Split the neon.addp intrinsic into integer and fp variants.

aemerson · aemerson · commit 398ab5fe3372 · 2019-03-21T22:31:37.000Z
This is the result of discussions on the list about how to deal with intrinsics which require codegen to disambiguate them via only the integer/fp overloads. It causes problems for GlobalISel as some of that information is lost during translation, while with other operations like IR instructions the information is encoded into the instruction opcode. This patch changes clang to emit the new faddp intrinsic if the vector operands to the builtin have FP element types. LLVM IR AutoUpgrade has been taught to upgrade existing calls to aarch64.neon.addp with fp vector arguments, and we remove the workarounds introduced for GlobalISel in r355865. This is a more permanent solution to PR40968. Differential Revision: https://reviews.llvm.org/D59655 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@356722 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/include/llvm/IR/IntrinsicsAArch64.td b/include/llvm/IR/IntrinsicsAArch64.td
@@ -289,6 +289,7 @@ let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in {
 
   // Pairwise Add
   def int_aarch64_neon_addp : AdvSIMD_2VectorArg_Intrinsic;
+  def int_aarch64_neon_faddp : AdvSIMD_2VectorArg_Intrinsic;
 
   // Long Pairwise Add
   // FIXME: In theory, we shouldn't need intrinsics for saddlp or
diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp
@@ -568,6 +568,17 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
       NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::thread_pointer);
       return true;
     }
+    if (Name.startswith("aarch64.neon.addp")) {
+      if (F->arg_size() != 2)
+        break; // Invalid IR.
+      auto fArgs = F->getFunctionType()->params();
+      VectorType *ArgTy = dyn_cast<VectorType>(fArgs[0]);
+      if (ArgTy && ArgTy->getElementType()->isFloatingPointTy()) {
+        NewFn = Intrinsic::getDeclaration(F->getParent(),
+                                          Intrinsic::aarch64_neon_faddp, fArgs);
+        return true;
+      }
+    }
     break;
   }
 
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -3499,7 +3499,7 @@ def : Pat<(fabs (fsub VT:$Rn, VT:$Rm)), (!cast<Instruction>("FABD"#VT) VT:$Rn, V
 }
 defm FACGE   : SIMDThreeSameVectorFPCmp<1,0,0b101,"facge",int_aarch64_neon_facge>;
 defm FACGT   : SIMDThreeSameVectorFPCmp<1,1,0b101,"facgt",int_aarch64_neon_facgt>;
-defm FADDP   : SIMDThreeSameVectorFP<1,0,0b010,"faddp",int_aarch64_neon_addp>;
+defm FADDP   : SIMDThreeSameVectorFP<1,0,0b010,"faddp",int_aarch64_neon_faddp>;
 defm FADD    : SIMDThreeSameVectorFP<0,0,0b010,"fadd", fadd>;
 defm FCMEQ   : SIMDThreeSameVectorFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>;
 defm FCMGE   : SIMDThreeSameVectorFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>;
diff --git a/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/lib/Target/AArch64/AArch64LegalizerInfo.cpp
@@ -64,11 +64,6 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
         return std::make_pair(0, EltTy);
       });
 
-  // HACK: Check that the intrinsic isn't ambiguous.
-  // (See: https://bugs.llvm.org/show_bug.cgi?id=40968)
-  getActionDefinitionsBuilder(G_INTRINSIC)
-    .custom();
-
   getActionDefinitionsBuilder(G_PHI)
       .legalFor({p0, s16, s32, s64})
       .clampScalar(0, s16, s64)
@@ -517,30 +512,11 @@ bool AArch64LegalizerInfo::legalizeCustom(MachineInstr &MI,
     return false;
   case TargetOpcode::G_VAARG:
     return legalizeVaArg(MI, MRI, MIRBuilder);
-  case TargetOpcode::G_INTRINSIC:
-    return legalizeIntrinsic(MI, MRI, MIRBuilder);
   }
 
   llvm_unreachable("expected switch to return");
 }
 
-bool AArch64LegalizerInfo::legalizeIntrinsic(
-    MachineInstr &MI, MachineRegisterInfo &MRI,
-    MachineIRBuilder &MIRBuilder) const {
-  // HACK: Don't allow faddp/addp for now. We don't pass down the type info
-  // necessary to get this right today.
-  //
-  // It looks like addp/faddp is the only intrinsic that's impacted by this.
-  // All other intrinsics fully describe the required types in their names.
-  //
-  // (See: https://bugs.llvm.org/show_bug.cgi?id=40968)
-  const MachineOperand &IntrinOp = MI.getOperand(1);
-  if (IntrinOp.isIntrinsicID() &&
-      IntrinOp.getIntrinsicID() == Intrinsic::aarch64_neon_addp)
-    return false;
-  return true;
-}
-
 bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI,
                                          MachineRegisterInfo &MRI,
                                          MachineIRBuilder &MIRBuilder) const {
diff --git a/lib/Target/AArch64/AArch64LegalizerInfo.h b/lib/Target/AArch64/AArch64LegalizerInfo.h
@@ -34,9 +34,6 @@ class AArch64LegalizerInfo : public LegalizerInfo {
 private:
   bool legalizeVaArg(MachineInstr &MI, MachineRegisterInfo &MRI,
                      MachineIRBuilder &MIRBuilder) const;
-
-  bool legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
-                         MachineIRBuilder &MIRBuilder) const;
 };
 } // End llvm namespace.
 #endif
diff --git a/test/CodeGen/AArch64/GlobalISel/fallback-ambiguous-addp-intrinsic.mir b/test/CodeGen/AArch64/GlobalISel/fallback-ambiguous-addp-intrinsic.mir
diff --git a/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
@@ -151,7 +151,7 @@
 # DEBUG:      .. the first uncovered type index: 1, OK
 #
 # DEBUG-NEXT: G_INTRINSIC (opcode {{[0-9]+}}): 0 type indices
-# DEBUG:      .. type index coverage check SKIPPED: user-defined predicate detected
+# DEBUG:      .. type index coverage check SKIPPED: no rules defined
 #
 # DEBUG-NEXT: G_INTRINSIC_W_SIDE_EFFECTS (opcode {{[0-9]+}}): 0 type indices
 # DEBUG:      .. type index coverage check SKIPPED: no rules defined
diff --git a/test/CodeGen/AArch64/arm64-neon-add-pairwise.ll b/test/CodeGen/AArch64/arm64-neon-add-pairwise.ll
@@ -65,27 +65,27 @@ define <2 x i64> @test_addp_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
         ret <2 x i64> %val
 }
 
-declare <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float>, <2 x float>)
-declare <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float>, <4 x float>)
-declare <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double>, <2 x double>)
+declare <2 x float> @llvm.aarch64.neon.faddp.v2f32(<2 x float>, <2 x float>)
+declare <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.aarch64.neon.faddp.v2f64(<2 x double>, <2 x double>)
 
 define <2 x float> @test_faddp_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
 ; CHECK: test_faddp_v2f32:
-        %val = call <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float> %lhs, <2 x float> %rhs)
+        %val = call <2 x float> @llvm.aarch64.neon.faddp.v2f32(<2 x float> %lhs, <2 x float> %rhs)
 ; CHECK: faddp v0.2s, v0.2s, v1.2s
         ret <2 x float> %val
 }
 
 define <4 x float> @test_faddp_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
 ; CHECK: test_faddp_v4f32:
-        %val = call <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float> %lhs, <4 x float> %rhs)
+        %val = call <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float> %lhs, <4 x float> %rhs)
 ; CHECK: faddp v0.4s, v0.4s, v1.4s
         ret <4 x float> %val
 }
 
 define <2 x double> @test_faddp_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
 ; CHECK: test_faddp_v2f64:
-        %val = call <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double> %lhs, <2 x double> %rhs)
+        %val = call <2 x double> @llvm.aarch64.neon.faddp.v2f64(<2 x double> %lhs, <2 x double> %rhs)
 ; CHECK: faddp v0.2d, v0.2d, v1.2d
         ret <2 x double> %val
 }
diff --git a/test/CodeGen/AArch64/arm64-vadd.ll b/test/CodeGen/AArch64/arm64-vadd.ll
@@ -712,7 +712,7 @@ define <2 x float> @faddp_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
 ;CHECK: faddp.2s
         %tmp1 = load <2 x float>, <2 x float>* %A
         %tmp2 = load <2 x float>, <2 x float>* %B
-        %tmp3 = call <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+        %tmp3 = call <2 x float> @llvm.aarch64.neon.faddp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
         ret <2 x float> %tmp3
 }
 
@@ -721,7 +721,7 @@ define <4 x float> @faddp_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
 ;CHECK: faddp.4s
         %tmp1 = load <4 x float>, <4 x float>* %A
         %tmp2 = load <4 x float>, <4 x float>* %B
-        %tmp3 = call <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+        %tmp3 = call <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
         ret <4 x float> %tmp3
 }
 
@@ -730,13 +730,13 @@ define <2 x double> @faddp_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
 ;CHECK: faddp.2d
         %tmp1 = load <2 x double>, <2 x double>* %A
         %tmp2 = load <2 x double>, <2 x double>* %B
-        %tmp3 = call <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+        %tmp3 = call <2 x double> @llvm.aarch64.neon.faddp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
         ret <2 x double> %tmp3
 }
 
-declare <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float>, <2 x float>) nounwind readnone
-declare <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float>, <4 x float>) nounwind readnone
-declare <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x float> @llvm.aarch64.neon.faddp.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.faddp.v2f64(<2 x double>, <2 x double>) nounwind readnone
 
 define <2 x i64> @uaddl_duprhs(<4 x i32> %lhs, i32 %rhs) {
 ; CHECK-LABEL: uaddl_duprhs
diff --git a/test/CodeGen/AArch64/autoupgrade-aarch64-neon-addp-float.ll b/test/CodeGen/AArch64/autoupgrade-aarch64-neon-addp-float.ll
@@ -0,0 +1,9 @@
+; RUN: opt -S < %s -mtriple=arm64 | FileCheck %s
+declare <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float>, <4 x float>)
+
+; CHECK: call <4 x float> @llvm.aarch64.neon.faddp.v4f32
+define <4 x float> @upgrade_aarch64_neon_addp_float(<4 x float> %a, <4 x float> %b) {
+  %res = call <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float> %a, <4 x float> %b)
+  ret <4 x float> %res
+}
+

Original file line number	Diff line number	Diff line change
`@@ -3499,7 +3499,7 @@ def : Pat<(fabs (fsub VT:$Rn, VT:$Rm)), (!cast<Instruction>("FABD"#VT) VT:$Rn, V`
`3499`	`3499`	`}`
`3500`	`3500`	`defm FACGE : SIMDThreeSameVectorFPCmp<1,0,0b101,"facge",int_aarch64_neon_facge>;`
`3501`	`3501`	`defm FACGT : SIMDThreeSameVectorFPCmp<1,1,0b101,"facgt",int_aarch64_neon_facgt>;`
`3502`		`-defm FADDP : SIMDThreeSameVectorFP<1,0,0b010,"faddp",int_aarch64_neon_addp>;`
	`3502`	`+defm FADDP : SIMDThreeSameVectorFP<1,0,0b010,"faddp",int_aarch64_neon_faddp>;`
`3503`	`3503`	`defm FADD : SIMDThreeSameVectorFP<0,0,0b010,"fadd", fadd>;`
`3504`	`3504`	`defm FCMEQ : SIMDThreeSameVectorFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>;`
`3505`	`3505`	`defm FCMGE : SIMDThreeSameVectorFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>;`
Original file line number	Diff line number	Diff line change
`@@ -151,7 +151,7 @@`
`151`	`151`	`# DEBUG: .. the first uncovered type index: 1, OK`
`152`	`152`	`#`
`153`	`153`	`# DEBUG-NEXT: G_INTRINSIC (opcode {{[0-9]+}}): 0 type indices`
`154`		`-# DEBUG: .. type index coverage check SKIPPED: user-defined predicate detected`
	`154`	`+# DEBUG: .. type index coverage check SKIPPED: no rules defined`
`155`	`155`	`#`
`156`	`156`	`# DEBUG-NEXT: G_INTRINSIC_W_SIDE_EFFECTS (opcode {{[0-9]+}}): 0 type indices`
`157`	`157`	`# DEBUG: .. type index coverage check SKIPPED: no rules defined`