Skip to content

Commit 3b08238

Browse files
committed
AMDGPU: Eliminate half of i64 or if one operand is zero_extend from i32
This helps clean up some of the mess when expanding unaligned 64-bit loads once they are changed to be promoted to v2i32, and fixes situations where or x, 0 was emitted after splitting 64-bit ors during moveToVALU. I think this could be a generic combine, but I'm not sure. llvm-svn: 266104
1 parent 15d1b4e commit 3b08238

File tree

2 files changed

+71
-0
lines changed

2 files changed

+71
-0
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2332,6 +2332,36 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
23322332
SDValue LHS = N->getOperand(0);
23332333
SDValue RHS = N->getOperand(1);
23342334

2335+
EVT VT = N->getValueType(0);
2336+
if (VT == MVT::i64) {
2337+
// TODO: This could be a generic combine with a predicate for extracting the
2338+
// high half of an integer being free.
2339+
2340+
// (or i64:x, (zero_extend i32:y)) ->
2341+
// i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
2342+
if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
2343+
RHS.getOpcode() != ISD::ZERO_EXTEND)
2344+
std::swap(LHS, RHS);
2345+
2346+
if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
2347+
SDValue ExtSrc = RHS.getOperand(0);
2348+
EVT SrcVT = ExtSrc.getValueType();
2349+
if (SrcVT == MVT::i32) {
2350+
SDLoc SL(N);
2351+
SDValue LowLHS, HiBits;
2352+
std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
2353+
SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
2354+
2355+
DCI.AddToWorklist(LowOr.getNode());
2356+
DCI.AddToWorklist(HiBits.getNode());
2357+
2358+
SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
2359+
LowOr, HiBits);
2360+
return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
2361+
}
2362+
}
2363+
}
2364+
23352365
// or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
23362366
if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
23372367
RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
2+
3+
; GCN-LABEL: {{^}}zext_or_operand_i64:
4+
; GCN: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
5+
; GCN: buffer_load_dword v[[LD32:[0-9]+]]
6+
; GCN-NOT: _or_
7+
; GCN-NOT: v[[HI]]
8+
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
9+
; GCN: v_or_b32_e32 v[[LO]], v[[LD32]], v[[LO]]
10+
; GCN-NOT: _or_
11+
; GCN-NOT: v[[HI]]
12+
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
13+
; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
14+
define void @zext_or_operand_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
15+
%ld.64 = load volatile i64, i64 addrspace(1)* %in0
16+
%ld.32 = load volatile i32, i32 addrspace(1)* %in1
17+
%ext = zext i32 %ld.32 to i64
18+
%or = or i64 %ld.64, %ext
19+
store i64 %or, i64 addrspace(1)* %out
20+
ret void
21+
}
22+
23+
; GCN-LABEL: {{^}}zext_or_operand_commute_i64:
24+
; GCN: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
25+
; GCN: buffer_load_dword v[[LD32:[0-9]+]]
26+
; GCN-NOT: _or_
27+
; GCN-NOT: v[[HI]]
28+
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
29+
; GCN: v_or_b32_e32 v[[LO]], v[[LD32]], v[[LO]]
30+
; GCN-NOT: v[[HI]]
31+
; GCN-NOT: _or_
32+
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
33+
; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
34+
define void @zext_or_operand_commute_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
35+
%ld.64 = load volatile i64, i64 addrspace(1)* %in0
36+
%ld.32 = load volatile i32, i32 addrspace(1)* %in1
37+
%ext = zext i32 %ld.32 to i64
38+
%or = or i64 %ext, %ld.64
39+
store i64 %or, i64 addrspace(1)* %out
40+
ret void
41+
}

0 commit comments

Comments
 (0)