-
Notifications
You must be signed in to change notification settings - Fork 14.4k
[VPlan] Simplify BLEND %a, %b, NOT(%m) -> BLEND %b, %a, %m. #128375
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
a69e89a
112c494
6b0f436
c77792e
06de98a
ea4d37f
2e0ea61
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||
---|---|---|---|---|---|---|---|---|
|
@@ -851,8 +851,20 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { | |||||||
return; | ||||||||
} | ||||||||
|
||||||||
if (Blend->isNormalized()) | ||||||||
if (Blend->isNormalized()) { | ||||||||
Review comment: Would this be better placed below, after normalizing the Blend if needed? Reply: Moved, thanks! |
||||||||
/// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask. | ||||||||
VPValue *NewMask; | ||||||||
if (Blend->getNumOperands() == 3 && | ||||||||
match(Blend->getMask(1), m_Not(m_VPValue(NewMask)))) { | ||||||||
VPValue *Inc0 = Blend->getIncomingValue(0); | ||||||||
VPValue *Inc1 = Blend->getIncomingValue(1); | ||||||||
Blend->setOperand(0, Inc1); | ||||||||
Blend->setOperand(1, Inc0); | ||||||||
Blend->setOperand(2, NewMask); | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
Review comment: ...or check if it is unused and delete it, knowing its operand is live? Although it does seem to get collected eventually. Reply: Updated, although there are no further opportunities from the manual cleanup. |
||||||||
} | ||||||||
|
||||||||
return; | ||||||||
} | ||||||||
|
||||||||
// Normalize the blend so its first incoming value is used as the initial | ||||||||
// value with the others blended into it. | ||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -36,16 +36,16 @@ define float @test(ptr nocapture readonly %pA, ptr nocapture readonly %pB, i32 % | |
; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[PB]], i32 [[OFFSET_IDX5]] | ||
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[NEXT_GEP]], align 4 | ||
; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[NEXT_GEP6]], align 4 | ||
; CHECK-NEXT: [[TMP2:%.*]] = fcmp fast oeq <4 x float> [[WIDE_LOAD]], zeroinitializer | ||
; CHECK-NEXT: [[TMP3:%.*]] = fcmp fast oeq <4 x float> [[WIDE_LOAD7]], zeroinitializer | ||
; CHECK-NEXT: [[DOTNOT9:%.*]] = select <4 x i1> [[TMP2]], <4 x i1> [[TMP3]], <4 x i1> zeroinitializer | ||
; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast une <4 x float> [[WIDE_LOAD]], zeroinitializer | ||
; CHECK-NEXT: [[TMP19:%.*]] = fcmp fast une <4 x float> [[WIDE_LOAD7]], zeroinitializer | ||
; CHECK-NEXT: [[TMP20:%.*]] = select <4 x i1> [[TMP18]], <4 x i1> splat (i1 true), <4 x i1> [[TMP19]] | ||
Comment on lines
+39
to
+41
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This NOTing of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It may not be profitable, depending on the cost to materialize After LV, we get
|
||
; CHECK-NEXT: [[TMP4:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[WIDE_LOAD]]) | ||
; CHECK-NEXT: [[TMP5:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[WIDE_LOAD7]]) | ||
; CHECK-NEXT: [[TMP6:%.*]] = fadd fast <4 x float> [[TMP5]], [[TMP4]] | ||
; CHECK-NEXT: [[TMP7:%.*]] = fsub fast <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD7]] | ||
; CHECK-NEXT: [[TMP8:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP7]]) | ||
; CHECK-NEXT: [[TMP9:%.*]] = fdiv fast <4 x float> [[TMP8]], [[TMP6]] | ||
; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[DOTNOT9]], <4 x float> splat (float -0.000000e+00), <4 x float> [[TMP9]] | ||
; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[TMP20]], <4 x float> [[TMP9]], <4 x float> splat (float -0.000000e+00) | ||
; CHECK-NEXT: [[PREDPHI]] = fadd reassoc arcp contract afn <4 x float> [[VEC_PHI]], [[TMP10]] | ||
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 | ||
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -272,9 +272,8 @@ define void @preserve_nuw_nsw_no_addr(ptr %output) local_unnamed_addr #0 { | |
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, {{.*}} ] | ||
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 | ||
; CHECK: [[TMP4:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer | ||
; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP4]], splat (i1 true) | ||
; CHECK-NEXT: [[TMP5:%.*]] = sub nuw nsw <4 x i64> [[VEC_IND]], splat (i64 1) | ||
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[TMP5]], <4 x i64> zeroinitializer | ||
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x i64> zeroinitializer, <4 x i64> [[TMP5]] | ||
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[OUTPUT:%.*]], i64 [[TMP0]] | ||
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0 | ||
; CHECK-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP8]], align 4 | ||
|
@@ -364,8 +363,7 @@ define void @drop_zext_nneg(ptr noalias %p, ptr noalias %p1) #0 { | |
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr double, ptr [[P]], i64 [[TMP2]] | ||
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr double, ptr [[TMP3]], i32 0 | ||
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP4]], i32 8, <4 x i1> [[TMP0]], <4 x double> poison) | ||
; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP0]], splat (i1 true) | ||
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP5]], <4 x double> zeroinitializer, <4 x double> [[WIDE_MASKED_LOAD]] | ||
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x double> [[WIDE_MASKED_LOAD]], <4 x double> zeroinitializer | ||
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x double> [[PREDPHI]], i32 3 | ||
; CHECK-NEXT: store double [[TMP6]], ptr [[P1]], align 8 | ||
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 | ||
|
@@ -479,9 +477,8 @@ define void @preserve_exact_no_addr(ptr %output) local_unnamed_addr #0 { | |
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, {{.*}} ] | ||
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 | ||
; CHECK: [[TMP4:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer | ||
; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP4]], splat (i1 true) | ||
; CHECK-NEXT: [[TMP5:%.*]] = sdiv exact <4 x i64> [[VEC_IND]], splat (i64 2) | ||
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[TMP5]], <4 x i64> zeroinitializer | ||
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x i64> zeroinitializer, <4 x i64> [[TMP5]] | ||
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[OUTPUT:%.*]], i64 [[TMP0]] | ||
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0 | ||
; CHECK-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP8]], align 4 | ||
|
@@ -604,7 +601,7 @@ define void @pr70590_recipe_without_underlying_instr(i64 %n, ptr noalias %dst) { | |
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr [5 x i8], ptr @c, i64 0, i64 [[TMP16]] | ||
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i32 0 | ||
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP18]], align 1 | ||
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x i8> [[WIDE_LOAD]], <4 x i8> zeroinitializer | ||
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP1]], <4 x i8> zeroinitializer, <4 x i8> [[WIDE_LOAD]] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (here the NOTed mask TMP2 has other uses so remains) |
||
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr %dst, i64 [[TMP0]] | ||
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP19]], i32 0 | ||
; CHECK-NEXT: store <4 x i8> [[PREDPHI]], ptr [[TMP20]], align 4 | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -47,10 +47,8 @@ define void @smax_call_uniform(ptr %dst, i64 %x) { | |
; CHECK: [[PRED_UREM_CONTINUE6]]: | ||
; CHECK-NEXT: [[TMP12:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP4]], i64 0) | ||
; CHECK-NEXT: [[TMP13:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP9]], i64 0) | ||
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 | ||
; CHECK-NEXT: [[P:%.*]] = select i1 [[TMP14]], i64 [[TMP12]], i64 1 | ||
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 | ||
; CHECK-NEXT: [[PREDPHI7:%.*]] = select i1 [[TMP15]], i64 [[TMP13]], i64 1 | ||
; CHECK-NEXT: [[P:%.*]] = select i1 [[C]], i64 1, i64 [[TMP12]] | ||
Review comment: (The old mask TMP14 turns out to be NOT C, and the same holds for TMP15.) |
||
; CHECK-NEXT: [[PREDPHI7:%.*]] = select i1 [[C]], i64 1, i64 [[TMP13]] | ||
; CHECK-NEXT: [[ADD:%.*]] = add i64 [[P]], 1 | ||
; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[PREDPHI7]], 1 | ||
; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[DST]], i64 [[ADD]] | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -71,7 +71,7 @@ for.end: | |
|
||
;CHECK-LABEL: @reduction_func( | ||
;CHECK: load <4 x i32> | ||
;CHECK: icmp slt <4 x i32> | ||
;CHECK: icmp sgt <4 x i32> | ||
Review comment: Is this reversing slt into sgt? What about equality? Reply: This is because the operands are not being checked. Previously we had |
||
;CHECK: add <4 x i32> | ||
;CHECK: select <4 x i1> | ||
;CHECK: ret i32 | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Unrelated, while we're here: the optimization above checks for a single unique (non-masked-out) value, so it could hold a pointer rather than a SmallPtrSet. On the other hand, it could filter away all masked-out values, even if more than one remains.