[AArch64] Don't model legal subvector insert/extract as scalarization

huntergr-arm · huntergr-arm · commit 0394c212cb63 · 2024-02-08T12:46:41.000Z
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -568,6 +568,32 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     }
     return Cost;
   }
+  case Intrinsic::vector_extract: {
+    // If the vector argument and the return type are both legal types, this
+    // should be a no-op or simple operation (cost 1). The index is not checked.
+    LLVMContext &C = RetTy->getContext();
+    EVT MRTy = getTLI()->getValueType(DL, RetTy);
+    EVT MPTy = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
+    TargetLoweringBase::LegalizeKind RLK = getTLI()->getTypeConversion(C, MRTy);
+    TargetLoweringBase::LegalizeKind PLK = getTLI()->getTypeConversion(C, MPTy);
+    if (RLK.first == TargetLoweringBase::TypeLegal &&
+        PLK.first == TargetLoweringBase::TypeLegal)
+      return InstructionCost(1);
+    break;
+  }
+  case Intrinsic::vector_insert: {
+    // If the vector and subvector arguments are both legal types, this
+    // should be a no-op or simple operation (cost 1). The index is not checked.
+    LLVMContext &C = RetTy->getContext();
+    EVT MTy0 = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
+    EVT MTy1 = getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
+    TargetLoweringBase::LegalizeKind LK0 = getTLI()->getTypeConversion(C, MTy0);
+    TargetLoweringBase::LegalizeKind LK1 = getTLI()->getTypeConversion(C, MTy1);
+    if (LK0.first == TargetLoweringBase::TypeLegal &&
+        LK1.first == TargetLoweringBase::TypeLegal)
+      return InstructionCost(1);
+    break;
+  }
   case Intrinsic::bitreverse: {
     static const CostTblEntry BitreverseTbl[] = {
         {Intrinsic::bitreverse, MVT::i32, 1},
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/scalable-vec-ins-ext.ll b/llvm/test/Transforms/LoopUnroll/AArch64/scalable-vec-ins-ext.ll
@@ -38,26 +38,30 @@ define void @test_ins_ext_cost(ptr readonly %a, ptr readonly %b, ptr readonly %c
 ; UNROLL-256-LABEL: define void @test_ins_ext_cost(
 ; UNROLL-256-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], ptr readonly [[C:%.*]], ptr noalias [[D:%.*]]) #[[ATTR0:[0-9]+]] {
 ; UNROLL-256-NEXT:  entry:
-; UNROLL-256-NEXT:    br label [[FOR_BODY:%.*]]
-; UNROLL-256:       for.body:
-; UNROLL-256-NEXT:    [[EXIT_COND:%.*]] = phi i1 [ true, [[ENTRY:%.*]] ], [ false, [[FOR_BODY]] ]
-; UNROLL-256-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 1, [[FOR_BODY]] ]
-; UNROLL-256-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds <8 x float>, ptr [[A]], i64 [[IV]]
-; UNROLL-256-NEXT:    [[LOAD_A:%.*]] = load <8 x float>, ptr [[GEP_A]], align 16
-; UNROLL-256-NEXT:    [[GEP_B:%.*]] = getelementptr inbounds <8 x float>, ptr [[B]], i64 [[IV]]
-; UNROLL-256-NEXT:    [[LOAD_B:%.*]] = load <8 x float>, ptr [[GEP_B]], align 16
-; UNROLL-256-NEXT:    [[GEP_C:%.*]] = getelementptr inbounds <8 x float>, ptr [[C]], i64 [[IV]]
-; UNROLL-256-NEXT:    [[LOAD_C:%.*]] = load <8 x float>, ptr [[GEP_C]], align 16
+; UNROLL-256-NEXT:    [[LOAD_A:%.*]] = load <8 x float>, ptr [[A]], align 16
+; UNROLL-256-NEXT:    [[LOAD_B:%.*]] = load <8 x float>, ptr [[B]], align 16
+; UNROLL-256-NEXT:    [[LOAD_C:%.*]] = load <8 x float>, ptr [[C]], align 16
 ; UNROLL-256-NEXT:    [[CAST_SCALABLE_B:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_B]], i64 0)
 ; UNROLL-256-NEXT:    [[CAST_SCALABLE_C:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_C]], i64 0)
 ; UNROLL-256-NEXT:    [[ADD:%.*]] = fadd <vscale x 4 x float> [[CAST_SCALABLE_B]], [[CAST_SCALABLE_C]]
 ; UNROLL-256-NEXT:    [[CAST_SCALABLE_A:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_A]], i64 0)
 ; UNROLL-256-NEXT:    [[MUL:%.*]] = fmul <vscale x 4 x float> [[CAST_SCALABLE_A]], [[ADD]]
 ; UNROLL-256-NEXT:    [[CAST_FIXED_D:%.*]] = tail call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> [[MUL]], i64 0)
-; UNROLL-256-NEXT:    [[GEP_D:%.*]] = getelementptr inbounds <8 x float>, ptr [[D]], i64 0, i64 [[IV]]
-; UNROLL-256-NEXT:    store <8 x float> [[CAST_FIXED_D]], ptr [[GEP_D]], align 16
-; UNROLL-256-NEXT:    br i1 [[EXIT_COND]], label [[FOR_BODY]], label [[EXIT:%.*]]
-; UNROLL-256:       exit:
+; UNROLL-256-NEXT:    store <8 x float> [[CAST_FIXED_D]], ptr [[D]], align 16
+; UNROLL-256-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds <8 x float>, ptr [[A]], i64 1
+; UNROLL-256-NEXT:    [[LOAD_A_1:%.*]] = load <8 x float>, ptr [[GEP_A_1]], align 16
+; UNROLL-256-NEXT:    [[GEP_B_1:%.*]] = getelementptr inbounds <8 x float>, ptr [[B]], i64 1
+; UNROLL-256-NEXT:    [[LOAD_B_1:%.*]] = load <8 x float>, ptr [[GEP_B_1]], align 16
+; UNROLL-256-NEXT:    [[GEP_C_1:%.*]] = getelementptr inbounds <8 x float>, ptr [[C]], i64 1
+; UNROLL-256-NEXT:    [[LOAD_C_1:%.*]] = load <8 x float>, ptr [[GEP_C_1]], align 16
+; UNROLL-256-NEXT:    [[CAST_SCALABLE_B_1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_B_1]], i64 0)
+; UNROLL-256-NEXT:    [[CAST_SCALABLE_C_1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_C_1]], i64 0)
+; UNROLL-256-NEXT:    [[ADD_1:%.*]] = fadd <vscale x 4 x float> [[CAST_SCALABLE_B_1]], [[CAST_SCALABLE_C_1]]
+; UNROLL-256-NEXT:    [[CAST_SCALABLE_A_1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_A_1]], i64 0)
+; UNROLL-256-NEXT:    [[MUL_1:%.*]] = fmul <vscale x 4 x float> [[CAST_SCALABLE_A_1]], [[ADD_1]]
+; UNROLL-256-NEXT:    [[CAST_FIXED_D_1:%.*]] = tail call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> [[MUL_1]], i64 0)
+; UNROLL-256-NEXT:    [[GEP_D_1:%.*]] = getelementptr inbounds <8 x float>, ptr [[D]], i64 0, i64 1
+; UNROLL-256-NEXT:    store <8 x float> [[CAST_FIXED_D_1]], ptr [[GEP_D_1]], align 16
 ; UNROLL-256-NEXT:    ret void
 ;
 entry: