[InstCombine] Canonicalize non-i8 gep of mul to i8 #96606
Conversation
@llvm/pr-subscribers-llvm-transforms

Author: David Green (davemgreen)

Changes

This is a small canonicalization for `gep i32, p, (mul x, C)` -> `gep i8, p, (mul x, C*4)`, so that the mul can combine both of the constant multiplications, and we take a small step towards canonicalizing more geps to i8.

It currently doesn't attempt to check for multiple uses on the mul, but that should be possible if it sounds better. Let me know what you think of the idea in general.

Patch is 152.33 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/96606.diff

16 Files Affected:
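Concretely, this mirrors the `mul5` test added in the patch below; the gep's 4-byte element size is folded into the index multiply:

  %mul = mul i64 %x, 5
  %gep = getelementptr inbounds i32, ptr %p, i64 %mul
  ; becomes
  %gep.idx = mul i64 %x, 20
  %gep = getelementptr inbounds i8, ptr %p, i64 %gep.idx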
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index a9c994f8076b2..5765324ec802f 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -2787,9 +2787,16 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
GEP.getNoWrapFlags()));
}
- // Canonicalize scalable GEPs to an explicit offset using the llvm.vscale
- // intrinsic. This has better support in BasicAA.
- if (GEPEltType->isScalableTy()) {
+ // Canonicalize
+ // - scalable GEPs to an explicit offset using the llvm.vscale intrinsic.
+ // This has better support in BasicAA.
+ // - gep i32 p, mul(O, C) -> gep i8, p, mul(O, C*4) to fold the two
+ // multiplies together.
+ if (GEPEltType->isScalableTy() ||
+ (!GEPEltType->isIntegerTy(8) && GEP.getNumIndices() == 1 &&
+ match(GEP.getOperand(1),
+ m_CombineOr(m_Mul(m_Value(), m_ConstantInt()),
+ m_Shl(m_Value(), m_ConstantInt()))))) {
Value *Offset = EmitGEPOffset(cast<GEPOperator>(&GEP));
return replaceInstUsesWith(
GEP, Builder.CreatePtrAdd(PtrOp, Offset, "", GEP.isInBounds()));
diff --git a/llvm/test/Transforms/InstCombine/canonicalize-gep-mul.ll b/llvm/test/Transforms/InstCombine/canonicalize-gep-mul.ll
new file mode 100644
index 0000000000000..376a849705384
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/canonicalize-gep-mul.ll
@@ -0,0 +1,106 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+define ptr @mul4(ptr %p, i64 %x) {
+; CHECK-LABEL: define ptr @mul4(
+; CHECK-SAME: ptr [[P:%.*]], i64 [[X:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[GEP_IDX:%.*]] = shl i64 [[X]], 4
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[GEP_IDX]]
+; CHECK-NEXT: ret ptr [[GEP]]
+;
+entry:
+ %mul = mul i64 %x, 4
+ %gep = getelementptr inbounds i32, ptr %p, i64 %mul
+ ret ptr %gep
+}
+
+define ptr @mul5(ptr %p, i64 %x) {
+; CHECK-LABEL: define ptr @mul5(
+; CHECK-SAME: ptr [[P:%.*]], i64 [[X:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[GEP_IDX:%.*]] = mul i64 [[X]], 20
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[GEP_IDX]]
+; CHECK-NEXT: ret ptr [[GEP]]
+;
+entry:
+ %mul = mul i64 %x, 5
+ %gep = getelementptr inbounds i32, ptr %p, i64 %mul
+ ret ptr %gep
+}
+
+define ptr @noinbounds(ptr %p, i64 %x) {
+; CHECK-LABEL: define ptr @noinbounds(
+; CHECK-SAME: ptr [[P:%.*]], i64 [[X:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[GEP_IDX:%.*]] = mul i64 [[X]], 20
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[P]], i64 [[GEP_IDX]]
+; CHECK-NEXT: ret ptr [[GEP]]
+;
+entry:
+ %mul = mul i64 %x, 5
+ %gep = getelementptr i32, ptr %p, i64 %mul
+ ret ptr %gep
+}
+
+define ptr @usemul(ptr %p, i64 %x) {
+; CHECK-LABEL: define ptr @usemul(
+; CHECK-SAME: ptr [[P:%.*]], i64 [[X:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[MUL:%.*]] = mul i64 [[X]], 5
+; CHECK-NEXT: [[GEP_IDX:%.*]] = mul i64 [[X]], 20
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[GEP_IDX]]
+; CHECK-NEXT: call void @use(i64 [[MUL]])
+; CHECK-NEXT: ret ptr [[GEP]]
+;
+entry:
+ %mul = mul i64 %x, 5
+ %gep = getelementptr inbounds i32, ptr %p, i64 %mul
+ call void @use(i64 %mul)
+ ret ptr %gep
+}
+
+define void @multiple(ptr %p, i64 %x) {
+; CHECK-LABEL: define void @multiple(
+; CHECK-SAME: ptr [[P:%.*]], i64 [[X:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[MUL21:%.*]] = mul i64 [[X]], 20
+; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[MUL21]]
+; CHECK-NEXT: [[MUL20:%.*]] = mul i64 [[X]], 20
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[MUL20]]
+; CHECK-NEXT: call void @use2(ptr [[GEP3]], ptr [[GEP2]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %mul5 = mul i64 %x, 5
+ %gep1 = getelementptr inbounds i32, ptr %p, i64 %mul5
+ %mul20 = mul i64 %x, 20
+ %gep2 = getelementptr inbounds i8, ptr %p, i64 %mul20
+ call void @use2(ptr %gep1, ptr %gep2)
+ ret void
+}
+
+define void @multiplestore(ptr %p, i64 %x) {
+; CHECK-LABEL: define void @multiplestore(
+; CHECK-SAME: ptr [[P:%.*]], i64 [[X:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[MUL20:%.*]] = mul i64 [[X]], 20
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[MUL20]]
+; CHECK-NEXT: [[MUL21:%.*]] = mul i64 [[X]], 20
+; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[MUL21]]
+; CHECK-NEXT: store i32 0, ptr [[GEP2]], align 4
+; CHECK-NEXT: store i32 1, ptr [[GEP3]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %mul5 = mul i64 %x, 5
+ %gep1 = getelementptr inbounds i32, ptr %p, i64 %mul5
+ %mul20 = mul i64 %x, 20
+ %gep2 = getelementptr inbounds i8, ptr %p, i64 %mul20
+ store i32 0, ptr %gep1
+ store i32 1, ptr %gep2
+ ret void
+}
+
+declare void @use(i64)
+declare void @use2(ptr, ptr)
diff --git a/llvm/test/Transforms/InstCombine/getelementptr.ll b/llvm/test/Transforms/InstCombine/getelementptr.ll
index 722ced1f6abb1..f25abae60904c 100644
--- a/llvm/test/Transforms/InstCombine/getelementptr.ll
+++ b/llvm/test/Transforms/InstCombine/getelementptr.ll
@@ -1014,8 +1014,8 @@ define i8 @test_gep_bitcast_as1(ptr addrspace(1) %arr, i16 %N) {
; The element size of the array matches the element size of the pointer
define i64 @test_gep_bitcast_array_same_size_element(ptr %arr, i64 %N) {
; CHECK-LABEL: @test_gep_bitcast_array_same_size_element(
-; CHECK-NEXT: [[V:%.*]] = shl i64 [[N:%.*]], 3
-; CHECK-NEXT: [[T:%.*]] = getelementptr i64, ptr [[ARR:%.*]], i64 [[V]]
+; CHECK-NEXT: [[T_IDX:%.*]] = shl i64 [[N:%.*]], 6
+; CHECK-NEXT: [[T:%.*]] = getelementptr i8, ptr [[ARR:%.*]], i64 [[T_IDX]]
; CHECK-NEXT: [[X:%.*]] = load i64, ptr [[T]], align 4
; CHECK-NEXT: ret i64 [[X]]
;
@@ -1029,8 +1029,8 @@ define i64 @test_gep_bitcast_array_same_size_element(ptr %arr, i64 %N) {
define i64 @test_gep_bitcast_array_same_size_element_addrspacecast(ptr %arr, i64 %N) {
; CHECK-LABEL: @test_gep_bitcast_array_same_size_element_addrspacecast(
; CHECK-NEXT: [[CAST:%.*]] = addrspacecast ptr [[ARR:%.*]] to ptr addrspace(3)
-; CHECK-NEXT: [[V:%.*]] = shl i64 [[N:%.*]], 3
-; CHECK-NEXT: [[T:%.*]] = getelementptr i64, ptr addrspace(3) [[CAST]], i64 [[V]]
+; CHECK-NEXT: [[T_IDX:%.*]] = shl i64 [[N:%.*]], 6
+; CHECK-NEXT: [[T:%.*]] = getelementptr i8, ptr addrspace(3) [[CAST]], i64 [[T_IDX]]
; CHECK-NEXT: [[X:%.*]] = load i64, ptr addrspace(3) [[T]], align 4
; CHECK-NEXT: ret i64 [[X]]
;
@@ -1057,8 +1057,8 @@ define i8 @test_gep_bitcast_array_different_size_element(ptr %arr, i64 %N) {
define i64 @test_gep_bitcast_array_same_size_element_as1(ptr addrspace(1) %arr, i16 %N) {
; CHECK-LABEL: @test_gep_bitcast_array_same_size_element_as1(
-; CHECK-NEXT: [[V:%.*]] = shl i16 [[N:%.*]], 3
-; CHECK-NEXT: [[T:%.*]] = getelementptr i64, ptr addrspace(1) [[ARR:%.*]], i16 [[V]]
+; CHECK-NEXT: [[T_IDX:%.*]] = shl i16 [[N:%.*]], 6
+; CHECK-NEXT: [[T:%.*]] = getelementptr i8, ptr addrspace(1) [[ARR:%.*]], i16 [[T_IDX]]
; CHECK-NEXT: [[X:%.*]] = load i64, ptr addrspace(1) [[T]], align 4
; CHECK-NEXT: ret i64 [[X]]
;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll
index ce1cfda438170..2a80a7affa4f8 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll
@@ -312,14 +312,14 @@ define void @gather_nxv4i32_ind64_stride2(ptr noalias nocapture %a, ptr noalias
; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> [[TMP11]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> poison)
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP14:%.*]] = shl nuw nsw i64 [[TMP13]], 2
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[TMP14]]
+; CHECK-NEXT: [[DOTIDX:%.*]] = shl nuw nsw i64 [[TMP13]], 4
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i64 [[DOTIDX]]
; CHECK-NEXT: store <vscale x 4 x float> [[WIDE_MASKED_GATHER]], ptr [[TMP12]], align 4
-; CHECK-NEXT: store <vscale x 4 x float> [[WIDE_MASKED_GATHER2]], ptr [[TMP15]], align 4
+; CHECK-NEXT: store <vscale x 4 x float> [[WIDE_MASKED_GATHER2]], ptr [[TMP14]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[STEP_ADD]], [[DOTSPLAT]]
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
@@ -328,11 +328,11 @@ define void @gather_nxv4i32_ind64_stride2(ptr noalias nocapture %a, ptr noalias
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[INDVARS_IV_STRIDE2:%.*]] = shl i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV_STRIDE2]]
-; CHECK-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[ARRAYIDX_IDX:%.*]] = shl i64 [[INDVARS_IV]], 3
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[ARRAYIDX_IDX]]
+; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store float [[TMP17]], ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: store float [[TMP16]], ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
index 1853e551806bc..70f2e99a4e03a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
@@ -480,18 +480,18 @@ define void @even_load_static_tc(ptr noalias nocapture readonly %A, ptr noalias
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = and i64 [[INDEX]], 9223372036854775804
+; CHECK-NEXT: [[DOTIDX:%.*]] = shl i64 [[INDEX]], 3
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[DOTIDX]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP4]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP6:%.*]] = shl nsw <vscale x 4 x i32> [[TMP5]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP7:%.*]] = and i64 [[INDEX]], 9223372036854775804
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP7]]
-; CHECK-NEXT: store <vscale x 4 x i32> [[TMP6]], ptr [[TMP8]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: store <vscale x 4 x i32> [[TMP6]], ptr [[TMP7]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
@@ -566,18 +566,18 @@ define void @even_load_dynamic_tc(ptr noalias nocapture readonly %A, ptr noalias
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = and i64 [[INDEX]], 9223372036854775804
+; CHECK-NEXT: [[DOTIDX:%.*]] = shl i64 [[INDEX]], 3
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[DOTIDX]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP12]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP14:%.*]] = shl nsw <vscale x 4 x i32> [[TMP13]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP15:%.*]] = and i64 [[INDEX]], 9223372036854775804
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP15]]
-; CHECK-NEXT: store <vscale x 4 x i32> [[TMP14]], ptr [[TMP16]], align 4
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: store <vscale x 4 x i32> [[TMP14]], ptr [[TMP15]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
-; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
@@ -715,7 +715,8 @@ define void @mixed_load2_store2(ptr noalias nocapture readonly %A, ptr noalias n
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[DOTIDX:%.*]] = shl i64 [[INDEX]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[DOTIDX]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP2]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
index 01468474dd5f3..3a939952bf818 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
@@ -35,36 +35,36 @@ define void @widen_ptr_phi_unrolled(ptr noalias nocapture %a, ptr noalias nocapt
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 3
; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 5
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[C]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[C]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[C]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i64 [[TMP7]]
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP8]], align 4
-; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <vscale x 8 x i32>, ptr [[TMP10]], align 4
+; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[TMP7]]
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[NEXT_GEP]], align 4
+; CHECK-NEXT: [[WIDE_VEC3:%.*]] = load <vscale x 8 x i32>, ptr [[NEXT_GEP2]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
-; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
-; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
-; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC2]])
-; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC3]], 0
-; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC3]], 1
-; CHECK-NEXT: [[TMP15:%.*]] = add nsw <vscale x 4 x i32> [[TMP11]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP16:%.*]] = add nsw <vscale x 4 x i32> [[TMP13]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP19:%.*]] = shl nuw nsw i64 [[TMP18]], 2
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 [[TMP19]]
-; CHECK-NEXT: store <vscale x 4 x i32> [[TMP15]], ptr [[TMP17]], align 4
-; CHECK-NEXT: store <vscale x 4 x i32> [[TMP16]], ptr [[TMP20]], align 4
-; CHECK-NEXT: [[TMP21:%.*]] = add nsw <vscale x 4 x i32> [[TMP12]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP22:%.*]] = add nsw <vscale x 4 x i32> [[TMP14]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP25:%.*]] = shl nuw nsw i64 [[TMP24]], 2
-; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i64 [[TMP25]]
-; CHECK-NEXT: store <vscale x 4 x i32> [[TMP21]], ptr [[TMP23]], align 4
-; CHECK-NEXT: store <...
[truncated]
Looks reasonable to me. At least to start with, I would limit this to the one-use case, to minimize impact. This kind of change tends to have fallout...
This is a small canonicalization for `gep i32, p, (mul x, C)` -> `gep i8, p, (mul x, C*4)`, so that the mul can combine both of the constant multiplications, and we take a small step towards canonicalizing more geps to i8.
76a00a7 to 8d8f2b1
Yeah, that sounds good. You are right, there can be quite a lot of fallout, even from the limited version of the canonicalization. I haven't seen anything in my results that got worse and didn't turn out to be noise, but no set of benchmarks can capture every case.
      (!GEPEltType->isIntegerTy(8) && GEP.getNumIndices() == 1 &&
       match(GEP.getOperand(1),
             m_OneUse(m_CombineOr(m_Mul(m_Value(), m_ConstantInt()),
                                  m_Shl(m_Value(), m_ConstantInt())))))) {
Not sure if it comes up, but you could also have exact div/shr.
Because it can fold into the mul? Thanks - I can take a look.
But I would prefer to do it in a separate patch if that is OK. If I rebase I can check if it comes up in practice. (Or I could just implement it anyway, let me know if you think that is better).
We already handle the exact div/shr case separately.
We don't seem to: https://godbolt.org/z/1xhh1h5e9 (but we could: https://alive2.llvm.org/ce/z/QvqkHE)
We handle the case where the div and mul cancel. I guess it makes sense to extend it to the case where the div constant is a multiple of the mul constant.
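As a sketch of the cancelling case (a hypothetical example, not taken from this patch), an exact shift folds away against the gep's implicit scaling:

  %d = lshr exact i64 %x, 2
  %gep = getelementptr i32, ptr %p, i64 %d
  ; the implicit *4 of the i32 gep cancels the exact /4, so this is equivalent to
  %gep = getelementptr i8, ptr %p, i64 %x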
LGTM
LLVM Buildbot has detected a new failure on builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/144/builds/926
Here is the relevant piece of the build log for reference:
This is a small canonicalization for `gep i32, p, (mul x, C)` -> `gep i8, p, (mul x, C*4)`, so that the mul can combine both of the constant multiplications, and we take a small step towards canonicalizing more geps to i8. It currently doesn't attempt to check for multiple uses on the mul, but that should be possible if it sounds better. Let me know what you think of the idea in general.
-; CHECK-NEXT: [[V:%.*]] = shl i16 [[N:%.*]], 3
-; CHECK-NEXT: [[T:%.*]] = getelementptr i64, ptr addrspace(1) [[ARR:%.*]], i16 [[V]]
+; CHECK-NEXT: [[T_IDX:%.*]] = shl i16 [[N:%.*]], 6
+; CHECK-NEXT: [[T:%.*]] = getelementptr i8, ptr addrspace(1) [[ARR:%.*]], i16 [[T_IDX]]
 ; CHECK-NEXT: [[X:%.*]] = load i64, ptr addrspace(1) [[T]], align 4
Hi @davemgreen,
Should we check the range of `[[T_IDX:%.*]] = shl i16 [[N:%.*]], 6` so that it does not overflow the i16?
Hi - I'm not sure. I think if the multiply would have overflowed before, then it is still fine for it to overflow after the canonicalization. We only generate a mul, not the shift directly, and let instcombine convert it if necessary.
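For reference, a small sketch of what "generate a mul and let instcombine convert it" means here (types and constants are illustrative, not taken from the patch):

  %off = mul i64 %x, 16
  ; instcombine later canonicalizes the power-of-two multiply to a shift:
  %off = shl i64 %x, 4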