Skip to content

Commit 9239054

Browse files
igogo-x86 authored and fhahn committed
[SelectOpt] Optimise big select groups in the latch of a non-inner loop to branches (llvm#119728)
Loop latches often have a loop-carried dependency, and if a latch has several SelectLike instructions in one select group, it is usually profitable to convert them to branches rather than keep them as selects. (cherry picked from commit 3469996)
1 parent a5eebc8 commit 9239054

File tree

2 files changed

+124
-0
lines changed

2 files changed

+124
-0
lines changed

llvm/lib/CodeGen/SelectOptimize.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1006,6 +1006,18 @@ bool SelectOptimizeImpl::isConvertToBranchProfitableBase(
10061006
return true;
10071007
}
10081008

1009+
// If latch has a select group with several elements, it is usually profitable
1010+
// to convert it to branches. We let `optimizeSelectsInnerLoops` decide if
1011+
// conversion is profitable for innermost loops.
1012+
auto *BB = SI.getI()->getParent();
1013+
auto *L = LI->getLoopFor(BB);
1014+
if (L && !L->isInnermost() && L->getLoopLatch() == BB &&
1015+
ASI.Selects.size() >= 3) {
1016+
OR << "Converted to branch because select group in the latch block is big.";
1017+
EmitAndPrintRemark(ORE, OR);
1018+
return true;
1019+
}
1020+
10091021
ORmiss << "Not profitable to convert to branch (base heuristic).";
10101022
EmitAndPrintRemark(ORE, ORmiss);
10111023
return false;

llvm/test/CodeGen/AArch64/selectopt.ll

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -875,3 +875,115 @@ if.end:
875875
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
876876
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
877877
}
878+
879+
declare i64 @payload(i64, ptr, ptr, i64)
880+
881+
define void @outer_latch_heuristic(ptr %dst, ptr %src, i64 %p, i64 %dim) {
882+
; CHECKOO-LABEL: @outer_latch_heuristic(
883+
; CHECKOO-NEXT: entry:
884+
; CHECKOO-NEXT: br label [[OUTER_LOOP:%.*]]
885+
; CHECKOO: outer.loop:
886+
; CHECKOO-NEXT: [[K_020_US:%.*]] = phi i64 [ [[INC7_US:%.*]], [[SELECT_END:%.*]] ], [ 0, [[ENTRY:%.*]] ]
887+
; CHECKOO-NEXT: [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], [[SELECT_END]] ], [ 0, [[ENTRY]] ]
888+
; CHECKOO-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[SELECT_END]] ], [ 0, [[ENTRY]] ]
889+
; CHECKOO-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds ptr, ptr [[SRC:%.*]], i64 [[I]]
890+
; CHECKOO-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX_US]], align 8
891+
; CHECKOO-NEXT: [[ARRAYIDX1_US:%.*]] = getelementptr inbounds ptr, ptr [[SRC]], i64 [[J]]
892+
; CHECKOO-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARRAYIDX1_US]], align 8
893+
; CHECKOO-NEXT: br label [[INNER_LOOP:%.*]]
894+
; CHECKOO: inner.loop:
895+
; CHECKOO-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[DIM:%.*]], [[OUTER_LOOP]] ], [ [[LSR_IV_NEXT:%.*]], [[INNER_LOOP]] ]
896+
; CHECKOO-NEXT: [[DIFF_04_I_US:%.*]] = phi i64 [ [[CALL_I_US:%.*]], [[INNER_LOOP]] ], [ 0, [[OUTER_LOOP]] ]
897+
; CHECKOO-NEXT: [[CALL_I_US]] = tail call i64 @payload(i64 [[DIFF_04_I_US]], ptr [[TMP0]], ptr [[TMP1]], i64 [[P:%.*]])
898+
; CHECKOO-NEXT: [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], -1
899+
; CHECKOO-NEXT: [[EXITCOND_NOT_I_US:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0
900+
; CHECKOO-NEXT: br i1 [[EXITCOND_NOT_I_US]], label [[SELECT_END]], label [[INNER_LOOP]]
901+
; CHECKOO: latch:
902+
; CHECKOO-NEXT: [[CMP2_US:%.*]] = icmp sgt i64 [[CALL_I_US]], -1
903+
; CHECKOO-NEXT: [[DIFF_0_LCSSA_I_LOBIT_US:%.*]] = lshr i64 [[CALL_I_US]], 63
904+
; CHECKOO-NEXT: [[I_NEXT]] = add nsw i64 [[DIFF_0_LCSSA_I_LOBIT_US]], [[I]]
905+
; CHECKOO-NEXT: [[INC4_US:%.*]] = zext i1 [[CMP2_US]] to i64
906+
; CHECKOO-NEXT: [[J_NEXT]] = add nsw i64 [[J]], [[INC4_US]]
907+
; CHECKOO-NEXT: [[COND_IN_US:%.*]] = select i1 [[CMP2_US]], ptr [[ARRAYIDX1_US]], ptr [[ARRAYIDX_US]]
908+
; CHECKOO-NEXT: [[COND_US:%.*]] = load ptr, ptr [[COND_IN_US]], align 8
909+
; CHECKOO-NEXT: [[ARRAYIDX6_US:%.*]] = getelementptr inbounds ptr, ptr [[DST:%.*]], i64 [[K_020_US]]
910+
; CHECKOO-NEXT: store ptr [[COND_US]], ptr [[ARRAYIDX6_US]], align 8
911+
; CHECKOO-NEXT: [[INC7_US]] = add i64 [[K_020_US]], 1
912+
; CHECKOO-NEXT: [[EXITCOND23_NOT:%.*]] = icmp eq i64 [[K_020_US]], 1000
913+
; CHECKOO-NEXT: br i1 [[EXITCOND23_NOT]], label [[EXIT:%.*]], label [[OUTER_LOOP]]
914+
; CHECKOO: exit:
915+
; CHECKOO-NEXT: ret void
916+
;
917+
; CHECKII-LABEL: @outer_latch_heuristic(
918+
; CHECKII-NEXT: entry:
919+
; CHECKII-NEXT: br label [[OUTER_LOOP:%.*]]
920+
; CHECKII: outer.loop:
921+
; CHECKII-NEXT: [[K_020_US:%.*]] = phi i64 [ [[INC7_US:%.*]], [[LATCH:%.*]] ], [ 0, [[ENTRY:%.*]] ]
922+
; CHECKII-NEXT: [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], [[LATCH]] ], [ 0, [[ENTRY]] ]
923+
; CHECKII-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH]] ], [ 0, [[ENTRY]] ]
924+
; CHECKII-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds ptr, ptr [[SRC:%.*]], i64 [[I]]
925+
; CHECKII-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX_US]], align 8
926+
; CHECKII-NEXT: [[ARRAYIDX1_US:%.*]] = getelementptr inbounds ptr, ptr [[SRC]], i64 [[J]]
927+
; CHECKII-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARRAYIDX1_US]], align 8
928+
; CHECKII-NEXT: br label [[INNER_LOOP:%.*]]
929+
; CHECKII: inner.loop:
930+
; CHECKII-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[DIM:%.*]], [[OUTER_LOOP]] ], [ [[LSR_IV_NEXT:%.*]], [[INNER_LOOP]] ]
931+
; CHECKII-NEXT: [[DIFF_04_I_US:%.*]] = phi i64 [ [[CALL_I_US:%.*]], [[INNER_LOOP]] ], [ 0, [[OUTER_LOOP]] ]
932+
; CHECKII-NEXT: [[CALL_I_US]] = tail call i64 @payload(i64 [[DIFF_04_I_US]], ptr [[TMP0]], ptr [[TMP1]], i64 [[P:%.*]])
933+
; CHECKII-NEXT: [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], -1
934+
; CHECKII-NEXT: [[EXITCOND_NOT_I_US:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0
935+
; CHECKII-NEXT: br i1 [[EXITCOND_NOT_I_US]], label [[LATCH]], label [[INNER_LOOP]]
936+
; CHECKII: latch:
937+
; CHECKII-NEXT: [[CMP2_US:%.*]] = icmp sgt i64 [[CALL_I_US]], -1
938+
; CHECKII-NEXT: [[DIFF_0_LCSSA_I_LOBIT_US:%.*]] = lshr i64 [[CALL_I_US]], 63
939+
; CHECKII-NEXT: [[I_NEXT]] = add nsw i64 [[DIFF_0_LCSSA_I_LOBIT_US]], [[I]]
940+
; CHECKII-NEXT: [[INC4_US:%.*]] = zext i1 [[CMP2_US]] to i64
941+
; CHECKII-NEXT: [[J_NEXT]] = add nsw i64 [[J]], [[INC4_US]]
942+
; CHECKII-NEXT: [[COND_IN_US:%.*]] = select i1 [[CMP2_US]], ptr [[ARRAYIDX1_US]], ptr [[ARRAYIDX_US]]
943+
; CHECKII-NEXT: [[COND_US:%.*]] = load ptr, ptr [[COND_IN_US]], align 8
944+
; CHECKII-NEXT: [[ARRAYIDX6_US:%.*]] = getelementptr inbounds ptr, ptr [[DST:%.*]], i64 [[K_020_US]]
945+
; CHECKII-NEXT: store ptr [[COND_US]], ptr [[ARRAYIDX6_US]], align 8
946+
; CHECKII-NEXT: [[INC7_US]] = add i64 [[K_020_US]], 1
947+
; CHECKII-NEXT: [[EXITCOND23_NOT:%.*]] = icmp eq i64 [[K_020_US]], 1000
948+
; CHECKII-NEXT: br i1 [[EXITCOND23_NOT]], label [[EXIT:%.*]], label [[OUTER_LOOP]]
949+
; CHECKII: exit:
950+
; CHECKII-NEXT: ret void
951+
;
952+
entry:
953+
br label %outer.loop
954+
955+
outer.loop:
956+
%k.020.us = phi i64 [ %inc7.us, %latch ], [ 0, %entry ]
957+
%j = phi i64 [ %j.next, %latch ], [ 0, %entry ]
958+
%i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
959+
%arrayidx.us = getelementptr inbounds ptr, ptr %src, i64 %i
960+
%4 = load ptr, ptr %arrayidx.us, align 8
961+
%arrayidx1.us = getelementptr inbounds ptr, ptr %src, i64 %j
962+
%5 = load ptr, ptr %arrayidx1.us, align 8
963+
br label %inner.loop
964+
965+
inner.loop:
966+
%lsr.iv = phi i64 [ %dim, %outer.loop ], [ %lsr.iv.next, %inner.loop ]
967+
%diff.04.i.us = phi i64 [ %call.i.us, %inner.loop ], [ 0, %outer.loop ]
968+
%call.i.us = tail call i64 @payload(i64 %diff.04.i.us, ptr %4, ptr %5, i64 %p)
969+
%lsr.iv.next = add i64 %lsr.iv, -1
970+
%exitcond.not.i.us = icmp eq i64 %lsr.iv.next, 0
971+
br i1 %exitcond.not.i.us, label %latch, label %inner.loop
972+
973+
latch:
974+
%cmp2.us = icmp sgt i64 %call.i.us, -1
975+
%diff.0.lcssa.i.lobit.us = lshr i64 %call.i.us, 63
976+
%i.next = add nsw i64 %diff.0.lcssa.i.lobit.us, %i
977+
%inc4.us = zext i1 %cmp2.us to i64
978+
%j.next = add nsw i64 %j, %inc4.us
979+
%cond.in.us = select i1 %cmp2.us, ptr %arrayidx1.us, ptr %arrayidx.us
980+
%cond.us = load ptr, ptr %cond.in.us, align 8
981+
%arrayidx6.us = getelementptr inbounds ptr, ptr %dst, i64 %k.020.us
982+
store ptr %cond.us, ptr %arrayidx6.us, align 8
983+
%inc7.us = add i64 %k.020.us, 1
984+
%exitcond23.not = icmp eq i64 %k.020.us, 1000
985+
br i1 %exitcond23.not, label %exit, label %outer.loop
986+
987+
exit:
988+
ret void
989+
}

0 commit comments

Comments
 (0)