Skip to content

Commit 8e7e6a8

Browse files
committed
[X86] Restore selection of MULX on BMI2 targets.
Looking back over gcc and icc behavior, it looks like icc does use mulx32 on 32-bit targets and mulx64 on 64-bit targets. It's also used when dividing i32 by a constant on 32-bit targets and i64 by a constant on 64-bit targets. gcc uses it for multiplies producing a 64-bit result on 32-bit targets and a 128-bit result on 64-bit targets. gcc does not appear to use it for division by a constant. After this patch clang is closer to the icc behavior. This basically reverts d1c6186, but there were no strong feelings at the time. Fixes PR45518. Differential Revision: https://reviews.llvm.org/D80498
1 parent fa3b587 commit 8e7e6a8

File tree

9 files changed

+400
-246
lines changed

9 files changed

+400
-246
lines changed

llvm/lib/Target/X86/X86ISelDAGToDAG.cpp

Lines changed: 48 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -4758,17 +4758,24 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
47584758
unsigned Opc, MOpc;
47594759
unsigned LoReg, HiReg;
47604760
bool IsSigned = Opcode == ISD::SMUL_LOHI;
4761+
bool UseMULX = !IsSigned && Subtarget->hasBMI2();
47614762
switch (NVT.SimpleTy) {
47624763
default: llvm_unreachable("Unsupported VT!");
47634764
case MVT::i32:
4764-
Opc = IsSigned ? X86::IMUL32r : X86::MUL32r;
4765-
MOpc = IsSigned ? X86::IMUL32m : X86::MUL32m;
4766-
LoReg = X86::EAX; HiReg = X86::EDX;
4765+
Opc = UseMULX ? X86::MULX32rr :
4766+
IsSigned ? X86::IMUL32r : X86::MUL32r;
4767+
MOpc = UseMULX ? X86::MULX32rm :
4768+
IsSigned ? X86::IMUL32m : X86::MUL32m;
4769+
LoReg = UseMULX ? X86::EDX : X86::EAX;
4770+
HiReg = X86::EDX;
47674771
break;
47684772
case MVT::i64:
4769-
Opc = IsSigned ? X86::IMUL64r : X86::MUL64r;
4770-
MOpc = IsSigned ? X86::IMUL64m : X86::MUL64m;
4771-
LoReg = X86::RAX; HiReg = X86::RDX;
4773+
Opc = UseMULX ? X86::MULX64rr :
4774+
IsSigned ? X86::IMUL64r : X86::MUL64r;
4775+
MOpc = UseMULX ? X86::MULX64rm :
4776+
IsSigned ? X86::IMUL64m : X86::MUL64m;
4777+
LoReg = UseMULX ? X86::RDX : X86::RAX;
4778+
HiReg = X86::RDX;
47724779
break;
47734780
}
47744781

@@ -4783,43 +4790,63 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
47834790

47844791
SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
47854792
N0, SDValue()).getValue(1);
4793+
SDValue ResHi, ResLo;
47864794
if (foldedLoad) {
47874795
SDValue Chain;
47884796
MachineSDNode *CNode = nullptr;
47894797
SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
47904798
InFlag };
4791-
SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
4792-
CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4793-
Chain = SDValue(CNode, 0);
4794-
InFlag = SDValue(CNode, 1);
4799+
if (UseMULX) {
4800+
SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other);
4801+
CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4802+
ResHi = SDValue(CNode, 0);
4803+
ResLo = SDValue(CNode, 1);
4804+
Chain = SDValue(CNode, 2);
4805+
} else {
4806+
SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
4807+
CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4808+
Chain = SDValue(CNode, 0);
4809+
InFlag = SDValue(CNode, 1);
4810+
}
47954811

47964812
// Update the chain.
47974813
ReplaceUses(N1.getValue(1), Chain);
47984814
// Record the mem-refs
47994815
CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
48004816
} else {
48014817
SDValue Ops[] = { N1, InFlag };
4802-
SDVTList VTs = CurDAG->getVTList(MVT::Glue);
4803-
SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
4804-
InFlag = SDValue(CNode, 0);
4818+
if (UseMULX) {
4819+
SDVTList VTs = CurDAG->getVTList(NVT, NVT);
4820+
SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
4821+
ResHi = SDValue(CNode, 0);
4822+
ResLo = SDValue(CNode, 1);
4823+
} else {
4824+
SDVTList VTs = CurDAG->getVTList(MVT::Glue);
4825+
SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
4826+
InFlag = SDValue(CNode, 0);
4827+
}
48054828
}
48064829

48074830
// Copy the low half of the result, if it is needed.
48084831
if (!SDValue(Node, 0).use_empty()) {
4809-
assert(LoReg && "Register for low half is not defined!");
4810-
SDValue ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
4811-
NVT, InFlag);
4812-
InFlag = ResLo.getValue(2);
4832+
if (!ResLo) {
4833+
assert(LoReg && "Register for low half is not defined!");
4834+
ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
4835+
NVT, InFlag);
4836+
InFlag = ResLo.getValue(2);
4837+
}
48134838
ReplaceUses(SDValue(Node, 0), ResLo);
48144839
LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
48154840
dbgs() << '\n');
48164841
}
48174842
// Copy the high half of the result, if it is needed.
48184843
if (!SDValue(Node, 1).use_empty()) {
4819-
assert(HiReg && "Register for high half is not defined!");
4820-
SDValue ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
4821-
NVT, InFlag);
4822-
InFlag = ResHi.getValue(2);
4844+
if (!ResHi) {
4845+
assert(HiReg && "Register for high half is not defined!");
4846+
ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
4847+
NVT, InFlag);
4848+
InFlag = ResHi.getValue(2);
4849+
}
48234850
ReplaceUses(SDValue(Node, 1), ResHi);
48244851
LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
48254852
dbgs() << '\n');

llvm/test/CodeGen/X86/atomic-unordered.ll

Lines changed: 40 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -837,18 +837,16 @@ define i64 @load_fold_udiv1(i64* %p) {
837837
;
838838
; CHECK-O3-CUR-LABEL: load_fold_udiv1:
839839
; CHECK-O3-CUR: # %bb.0:
840-
; CHECK-O3-CUR-NEXT: movq (%rdi), %rax
841-
; CHECK-O3-CUR-NEXT: movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889
842-
; CHECK-O3-CUR-NEXT: mulq %rcx
843-
; CHECK-O3-CUR-NEXT: movq %rdx, %rax
840+
; CHECK-O3-CUR-NEXT: movq (%rdi), %rdx
841+
; CHECK-O3-CUR-NEXT: movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
842+
; CHECK-O3-CUR-NEXT: mulxq %rax, %rcx, %rax
844843
; CHECK-O3-CUR-NEXT: shrq $3, %rax
845844
; CHECK-O3-CUR-NEXT: retq
846845
;
847846
; CHECK-O3-EX-LABEL: load_fold_udiv1:
848847
; CHECK-O3-EX: # %bb.0:
849-
; CHECK-O3-EX-NEXT: movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
850-
; CHECK-O3-EX-NEXT: mulq (%rdi)
851-
; CHECK-O3-EX-NEXT: movq %rdx, %rax
848+
; CHECK-O3-EX-NEXT: movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
849+
; CHECK-O3-EX-NEXT: mulxq (%rdi), %rcx, %rax
852850
; CHECK-O3-EX-NEXT: shrq $3, %rax
853851
; CHECK-O3-EX-NEXT: retq
854852
%v = load atomic i64, i64* %p unordered, align 8
@@ -1033,15 +1031,14 @@ define i64 @load_fold_urem1(i64* %p) {
10331031
;
10341032
; CHECK-O3-LABEL: load_fold_urem1:
10351033
; CHECK-O3: # %bb.0:
1036-
; CHECK-O3-NEXT: movq (%rdi), %rcx
1037-
; CHECK-O3-NEXT: movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
1038-
; CHECK-O3-NEXT: movq %rcx, %rax
1039-
; CHECK-O3-NEXT: mulq %rdx
1034+
; CHECK-O3-NEXT: movq (%rdi), %rax
1035+
; CHECK-O3-NEXT: movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889
1036+
; CHECK-O3-NEXT: movq %rax, %rdx
1037+
; CHECK-O3-NEXT: mulxq %rcx, %rcx, %rdx
10401038
; CHECK-O3-NEXT: shrq $3, %rdx
1041-
; CHECK-O3-NEXT: leaq (%rdx,%rdx,4), %rax
1042-
; CHECK-O3-NEXT: leaq (%rax,%rax,2), %rax
1043-
; CHECK-O3-NEXT: subq %rax, %rcx
1044-
; CHECK-O3-NEXT: movq %rcx, %rax
1039+
; CHECK-O3-NEXT: leaq (%rdx,%rdx,4), %rcx
1040+
; CHECK-O3-NEXT: leaq (%rcx,%rcx,2), %rcx
1041+
; CHECK-O3-NEXT: subq %rcx, %rax
10451042
; CHECK-O3-NEXT: retq
10461043
%v = load atomic i64, i64* %p unordered, align 8
10471044
%ret = urem i64 %v, 15
@@ -1694,28 +1691,28 @@ define void @rmw_fold_sdiv2(i64* %p, i64 %v) {
16941691
define void @rmw_fold_udiv1(i64* %p, i64 %v) {
16951692
; CHECK-O0-LABEL: rmw_fold_udiv1:
16961693
; CHECK-O0: # %bb.0:
1697-
; CHECK-O0-NEXT: movq (%rdi), %rax
1698-
; CHECK-O0-NEXT: movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889
1699-
; CHECK-O0-NEXT: mulq %rcx
1700-
; CHECK-O0-NEXT: shrq $3, %rdx
1701-
; CHECK-O0-NEXT: movq %rdx, (%rdi)
1694+
; CHECK-O0-NEXT: movq (%rdi), %rdx
1695+
; CHECK-O0-NEXT: movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
1696+
; CHECK-O0-NEXT: mulxq %rax, %rcx, %rax
1697+
; CHECK-O0-NEXT: shrq $3, %rax
1698+
; CHECK-O0-NEXT: movq %rax, (%rdi)
17021699
; CHECK-O0-NEXT: retq
17031700
;
17041701
; CHECK-O3-CUR-LABEL: rmw_fold_udiv1:
17051702
; CHECK-O3-CUR: # %bb.0:
1706-
; CHECK-O3-CUR-NEXT: movq (%rdi), %rax
1707-
; CHECK-O3-CUR-NEXT: movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889
1708-
; CHECK-O3-CUR-NEXT: mulq %rcx
1709-
; CHECK-O3-CUR-NEXT: shrq $3, %rdx
1710-
; CHECK-O3-CUR-NEXT: movq %rdx, (%rdi)
1703+
; CHECK-O3-CUR-NEXT: movq (%rdi), %rdx
1704+
; CHECK-O3-CUR-NEXT: movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
1705+
; CHECK-O3-CUR-NEXT: mulxq %rax, %rax, %rcx
1706+
; CHECK-O3-CUR-NEXT: shrq $3, %rcx
1707+
; CHECK-O3-CUR-NEXT: movq %rcx, (%rdi)
17111708
; CHECK-O3-CUR-NEXT: retq
17121709
;
17131710
; CHECK-O3-EX-LABEL: rmw_fold_udiv1:
17141711
; CHECK-O3-EX: # %bb.0:
1715-
; CHECK-O3-EX-NEXT: movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
1716-
; CHECK-O3-EX-NEXT: mulq (%rdi)
1717-
; CHECK-O3-EX-NEXT: shrq $3, %rdx
1718-
; CHECK-O3-EX-NEXT: movq %rdx, (%rdi)
1712+
; CHECK-O3-EX-NEXT: movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
1713+
; CHECK-O3-EX-NEXT: mulxq (%rdi), %rax, %rcx
1714+
; CHECK-O3-EX-NEXT: shrq $3, %rcx
1715+
; CHECK-O3-EX-NEXT: movq %rcx, (%rdi)
17191716
; CHECK-O3-EX-NEXT: retq
17201717
%prev = load atomic i64, i64* %p unordered, align 8
17211718
%val = udiv i64 %prev, 15
@@ -1842,27 +1839,25 @@ define void @rmw_fold_urem1(i64* %p, i64 %v) {
18421839
; CHECK-O0: # %bb.0:
18431840
; CHECK-O0-NEXT: movq (%rdi), %rax
18441841
; CHECK-O0-NEXT: movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889
1845-
; CHECK-O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1846-
; CHECK-O0-NEXT: mulq %rcx
1847-
; CHECK-O0-NEXT: shrq $3, %rdx
1848-
; CHECK-O0-NEXT: leaq (%rdx,%rdx,4), %rax
1849-
; CHECK-O0-NEXT: leaq (%rax,%rax,2), %rax
1850-
; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
1851-
; CHECK-O0-NEXT: subq %rax, %rcx
1852-
; CHECK-O0-NEXT: movq %rcx, (%rdi)
1842+
; CHECK-O0-NEXT: movq %rax, %rdx
1843+
; CHECK-O0-NEXT: mulxq %rcx, %rdx, %rcx
1844+
; CHECK-O0-NEXT: shrq $3, %rcx
1845+
; CHECK-O0-NEXT: leaq (%rcx,%rcx,4), %rcx
1846+
; CHECK-O0-NEXT: leaq (%rcx,%rcx,2), %rcx
1847+
; CHECK-O0-NEXT: subq %rcx, %rax
1848+
; CHECK-O0-NEXT: movq %rax, (%rdi)
18531849
; CHECK-O0-NEXT: retq
18541850
;
18551851
; CHECK-O3-LABEL: rmw_fold_urem1:
18561852
; CHECK-O3: # %bb.0:
1857-
; CHECK-O3-NEXT: movq (%rdi), %rcx
1858-
; CHECK-O3-NEXT: movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
1859-
; CHECK-O3-NEXT: movq %rcx, %rax
1860-
; CHECK-O3-NEXT: mulq %rdx
1861-
; CHECK-O3-NEXT: shrq $3, %rdx
1862-
; CHECK-O3-NEXT: leaq (%rdx,%rdx,4), %rax
1853+
; CHECK-O3-NEXT: movq (%rdi), %rdx
1854+
; CHECK-O3-NEXT: movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
1855+
; CHECK-O3-NEXT: mulxq %rax, %rax, %rcx
1856+
; CHECK-O3-NEXT: shrq $3, %rcx
1857+
; CHECK-O3-NEXT: leaq (%rcx,%rcx,4), %rax
18631858
; CHECK-O3-NEXT: leaq (%rax,%rax,2), %rax
1864-
; CHECK-O3-NEXT: subq %rax, %rcx
1865-
; CHECK-O3-NEXT: movq %rcx, (%rdi)
1859+
; CHECK-O3-NEXT: subq %rax, %rdx
1860+
; CHECK-O3-NEXT: movq %rdx, (%rdi)
18661861
; CHECK-O3-NEXT: retq
18671862
%prev = load atomic i64, i64* %p unordered, align 8
18681863
%val = urem i64 %prev, 15

llvm/test/CodeGen/X86/bmi2-x86_64.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -68,8 +68,8 @@ define i64 @mulx64(i64 %x, i64 %y, i64* %p) {
6868
; CHECK-LABEL: mulx64:
6969
; CHECK: # %bb.0:
7070
; CHECK-NEXT: movq %rdx, %rcx
71-
; CHECK-NEXT: movq %rdi, %rax
72-
; CHECK-NEXT: mulq %rsi
71+
; CHECK-NEXT: movq %rdi, %rdx
72+
; CHECK-NEXT: mulxq %rsi, %rax, %rdx
7373
; CHECK-NEXT: movq %rdx, (%rcx)
7474
; CHECK-NEXT: retq
7575
%x1 = zext i64 %x to i128
@@ -86,8 +86,8 @@ define i64 @mulx64_load(i64 %x, i64* %y, i64* %p) {
8686
; CHECK-LABEL: mulx64_load:
8787
; CHECK: # %bb.0:
8888
; CHECK-NEXT: movq %rdx, %rcx
89-
; CHECK-NEXT: movq %rdi, %rax
90-
; CHECK-NEXT: mulq (%rsi)
89+
; CHECK-NEXT: movq %rdi, %rdx
90+
; CHECK-NEXT: mulxq (%rsi), %rax, %rdx
9191
; CHECK-NEXT: movq %rdx, (%rcx)
9292
; CHECK-NEXT: retq
9393
%y1 = load i64, i64* %y

llvm/test/CodeGen/X86/bmi2.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -120,11 +120,11 @@ define i32 @mulx32(i32 %x, i32 %y, i32* %p) {
120120
; X86-LABEL: mulx32:
121121
; X86: # %bb.0:
122122
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
123-
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
124123
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
125-
; X86-NEXT: addl %eax, %eax
124+
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
126125
; X86-NEXT: addl %edx, %edx
127-
; X86-NEXT: mull %edx
126+
; X86-NEXT: addl %eax, %eax
127+
; X86-NEXT: mulxl %eax, %eax, %edx
128128
; X86-NEXT: movl %edx, (%ecx)
129129
; X86-NEXT: retl
130130
;
@@ -156,10 +156,10 @@ define i32 @mulx32_load(i32 %x, i32* %y, i32* %p) {
156156
; X86-LABEL: mulx32_load:
157157
; X86: # %bb.0:
158158
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
159-
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
160159
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
161-
; X86-NEXT: addl %eax, %eax
162-
; X86-NEXT: mull (%edx)
160+
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
161+
; X86-NEXT: addl %edx, %edx
162+
; X86-NEXT: mulxl (%eax), %eax, %edx
163163
; X86-NEXT: movl %edx, (%ecx)
164164
; X86-NEXT: retl
165165
;

llvm/test/CodeGen/X86/hoist-invariant-load.ll

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -215,22 +215,21 @@ declare i8* @objc_msgSend(i8*, i8*, ...) nonlazybind
215215
define void @test_multi_def(i64* dereferenceable(8) %x1,
216216
; CHECK-LABEL: test_multi_def:
217217
; CHECK: ## %bb.0: ## %entry
218-
; CHECK-NEXT: movq %rdx, %r8
219-
; CHECK-NEXT: xorl %r9d, %r9d
220-
; CHECK-NEXT: movq (%rdi), %rdi
221-
; CHECK-NEXT: movq (%rsi), %rsi
218+
; CHECK-NEXT: movq %rdx, %rax
219+
; CHECK-NEXT: xorl %r8d, %r8d
220+
; CHECK-NEXT: movq (%rdi), %rdx
221+
; CHECK-NEXT: movq (%rsi), %r9
222222
; CHECK-NEXT: .p2align 4, 0x90
223223
; CHECK-NEXT: LBB4_2: ## %for.body
224224
; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
225-
; CHECK-NEXT: movq %rdi, %rax
226-
; CHECK-NEXT: mulq %rsi
227-
; CHECK-NEXT: addq %rax, (%r8)
228-
; CHECK-NEXT: adcq %rdx, 8(%r8)
225+
; CHECK-NEXT: mulxq %r9, %rsi, %rdi
226+
; CHECK-NEXT: addq %rsi, (%rax)
227+
; CHECK-NEXT: adcq %rdi, 8(%rax)
229228
; CHECK-NEXT: ## %bb.1: ## %for.check
230229
; CHECK-NEXT: ## in Loop: Header=BB4_2 Depth=1
231-
; CHECK-NEXT: incq %r9
232-
; CHECK-NEXT: addq $16, %r8
233-
; CHECK-NEXT: cmpq %rcx, %r9
230+
; CHECK-NEXT: incq %r8
231+
; CHECK-NEXT: addq $16, %rax
232+
; CHECK-NEXT: cmpq %rcx, %r8
234233
; CHECK-NEXT: jl LBB4_2
235234
; CHECK-NEXT: ## %bb.3: ## %exit
236235
; CHECK-NEXT: retq

0 commit comments

Comments
 (0)