
Commit 06f136f

[instcombine][x86] Converted pdep/pext with shifted mask to simple arithmetic
If the mask of a pdep or pext instruction is a shifted mask (i.e. one contiguous block of ones), we need at most one AND and one shift to represent the operation without the intrinsic. On all platforms I know of, this is faster than the pdep/pext. Cost modelling for multiple contiguous blocks might be worth exploring in a follow-up, but it's not relevant for my current use case. It would almost certainly be a win on AMD, where these instructions are really slow.

Differential Revision: https://reviews.llvm.org/D87861
1 parent 7c10129 commit 06f136f
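
The equivalence the commit message relies on is easy to sanity-check in plain C++. The sketch below is illustrative only and not part of the commit: RefPext and RefPdep are naive bit-by-bit reference implementations of the two instructions, and the asserts confirm that, for one contiguous block of ones, each collapses to a single and plus a single shift.

#include <cassert>
#include <cstdint>

// Naive reference: gather the source bits selected by Mask into the low bits.
static uint32_t RefPext(uint32_t Src, uint32_t Mask) {
  uint32_t Result = 0;
  for (unsigned BitIdx = 0, OutIdx = 0; BitIdx < 32; ++BitIdx)
    if (Mask & (1u << BitIdx)) {
      if (Src & (1u << BitIdx))
        Result |= 1u << OutIdx;
      ++OutIdx;
    }
  return Result;
}

// Naive reference: scatter the low source bits into the positions set in Mask.
static uint32_t RefPdep(uint32_t Src, uint32_t Mask) {
  uint32_t Result = 0;
  for (unsigned BitIdx = 0, InIdx = 0; BitIdx < 32; ++BitIdx)
    if (Mask & (1u << BitIdx)) {
      if (Src & (1u << InIdx))
        Result |= 1u << BitIdx;
      ++InIdx;
    }
  return Result;
}

int main() {
  const uint32_t Mask = 0x00000FF0; // one contiguous block of ones
  const unsigned Shift = 4;         // = countTrailingZeros(Mask)
  for (uint32_t X : {0u, 1u, 0x1234u, 0xDEADBEEFu, ~0u}) {
    assert(RefPext(X, Mask) == ((X & Mask) >> Shift)); // pext -> and + lshr
    assert(RefPdep(X, Mask) == ((X << Shift) & Mask)); // pdep -> shl + and
  }
  return 0;
}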

File tree

2 files changed: +68 −0 lines changed


llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp

Lines changed: 26 additions & 0 deletions
@@ -999,6 +999,20 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
         return IC.replaceInstUsesWith(II, II.getArgOperand(0));
       }
 
+      if (MaskC->getValue().isShiftedMask()) {
+        // any single contingous sequence of 1s anywhere in the mask simply
+        // describes a subset of the input bits shifted to the appropriate
+        // position.  Replace with the straight forward IR.
+        unsigned ShiftAmount = MaskC->getValue().countTrailingZeros();
+        Value *Input = II.getArgOperand(0);
+        Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
+        Value *Shifted = IC.Builder.CreateLShr(Masked,
+                                               ConstantInt::get(II.getType(),
+                                                                ShiftAmount));
+        return IC.replaceInstUsesWith(II, Shifted);
+      }
+
+
       if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
         uint64_t Src = SrcC->getZExtValue();
         uint64_t Mask = MaskC->getZExtValue();
@@ -1030,6 +1044,18 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
       if (MaskC->isAllOnesValue()) {
         return IC.replaceInstUsesWith(II, II.getArgOperand(0));
       }
+      if (MaskC->getValue().isShiftedMask()) {
+        // any single contingous sequence of 1s anywhere in the mask simply
+        // describes a subset of the input bits shifted to the appropriate
+        // position.  Replace with the straight forward IR.
+        unsigned ShiftAmount = MaskC->getValue().countTrailingZeros();
+        Value *Input = II.getArgOperand(0);
+        Value *Shifted = IC.Builder.CreateShl(Input,
+                                              ConstantInt::get(II.getType(),
+                                                               ShiftAmount));
+        Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
+        return IC.replaceInstUsesWith(II, Masked);
+      }
 
       if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
         uint64_t Src = SrcC->getZExtValue();
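
For readers unfamiliar with the APInt helpers this new code path leans on, here is a small standalone sketch (assuming an LLVM development tree so that llvm/ADT/APInt.h is available; it is not part of the patch): isShiftedMask() is what recognizes a single contiguous run of ones, and countTrailingZeros() supplies the shift amount used above.

#include "llvm/ADT/APInt.h"
#include <cassert>

int main() {
  llvm::APInt ShiftedMask(32, 0x00000FF0); // one contiguous run of ones
  llvm::APInt SplitMask(32, 0x00000F0F);   // two separate runs of ones

  assert(ShiftedMask.isShiftedMask());           // eligible for the fold
  assert(ShiftedMask.countTrailingZeros() == 4); // the shift amount
  assert(!SplitMask.isShiftedMask());            // left to a possible follow-up
  return 0;
}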

llvm/test/Transforms/InstCombine/X86/x86-bmi-tbm.ll

Lines changed: 42 additions & 0 deletions
@@ -306,6 +306,27 @@ define i64 @test_x86_pext_64_allones_mask(i64 %x) nounwind readnone {
   ret i64 %1
 }
 
+define i32 @test_x86_pext_32_shifted_mask(i32 %x) nounwind readnone {
+; CHECK-LABEL: @test_x86_pext_32_shifted_mask(
+; CHECK-NEXT: %1 = lshr i32 %x, 1
+; CHECK-NEXT: %2 = and i32 %1, 3
+; CHECK-NEXT: ret i32 %2
+;
+  %1 = tail call i32 @llvm.x86.bmi.pext.32(i32 %x, i32 6)
+  ret i32 %1
+}
+
+define i64 @test_x86_pext_64_shifted_mask(i64 %x) nounwind readnone {
+; CHECK-LABEL: @test_x86_pext_64_shifted_mask(
+; CHECK-NEXT: %1 = lshr i64 %x, 1
+; CHECK-NEXT: %2 = and i64 %1, 3
+; CHECK-NEXT: ret i64 %2
+;
+  %1 = tail call i64 @llvm.x86.bmi.pext.64(i64 %x, i64 6)
+  ret i64 %1
+}
+
+
 define i32 @test_x86_pext_32_constant_fold() nounwind readnone {
 ; CHECK-LABEL: @test_x86_pext_32_constant_fold(
 ; CHECK-NEXT: ret i32 30001
@@ -370,6 +391,27 @@ define i64 @test_x86_pdep_64_allones_mask(i64 %x) nounwind readnone {
   ret i64 %1
 }
 
+define i32 @test_x86_pdep_32_shifted_mask(i32 %x) nounwind readnone {
+; CHECK-LABEL: @test_x86_pdep_32_shifted_mask(
+; CHECK-NEXT: %1 = shl i32 %x, 2
+; CHECK-NEXT: %2 = and i32 %1, 12
+; CHECK-NEXT: ret i32 %2
+;
+  %1 = tail call i32 @llvm.x86.bmi.pdep.32(i32 %x, i32 12)
+  ret i32 %1
+}
+
+define i64 @test_x86_pdep_64_shifted_mask(i64 %x) nounwind readnone {
+; CHECK-LABEL: @test_x86_pdep_64_shifted_mask(
+; CHECK-NEXT: %1 = shl i64 %x, 2
+; CHECK-NEXT: %2 = and i64 %1, 12
+; CHECK-NEXT: ret i64 %2
+;
+  %1 = tail call i64 @llvm.x86.bmi.pdep.64(i64 %x, i64 12)
+  ret i64 %1
+}
+
+
 define i32 @test_x86_pdep_32_constant_fold() nounwind readnone {
 ; CHECK-LABEL: @test_x86_pdep_32_constant_fold(
 ; CHECK-NEXT: ret i32 807407616
