Skip to content

Commit 310e798

Browse files
authored
[NVPTX] Avoid introducing unnecessary ProxyRegs and Movs in ISel (#120486)
Avoid introducing `ProxyReg` and `MOV` nodes during ISel when lowering `bitconvert` or similar operations. These nodes are all erased by a later pass, but not introducing them in the first place is simpler and likely saves compile time. Also remove redundant `MOV` instruction definitions.
1 parent a161e73 commit 310e798

31 files changed

+921
-1013
lines changed

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp

Lines changed: 0 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -176,10 +176,6 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
176176
case ISD::ADDRSPACECAST:
177177
SelectAddrSpaceCast(N);
178178
return;
179-
case ISD::ConstantFP:
180-
if (tryConstantFP(N))
181-
return;
182-
break;
183179
case ISD::CopyToReg: {
184180
if (N->getOperand(1).getValueType() == MVT::i128) {
185181
SelectV2I64toI128(N);
@@ -212,21 +208,6 @@ bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
212208
}
213209
}
214210

215-
// There's no way to specify FP16 and BF16 immediates in .(b)f16 ops, so we
216-
// have to load them into an .(b)f16 register first.
217-
bool NVPTXDAGToDAGISel::tryConstantFP(SDNode *N) {
218-
if (N->getValueType(0) != MVT::f16 && N->getValueType(0) != MVT::bf16)
219-
return false;
220-
SDValue Val = CurDAG->getTargetConstantFP(
221-
cast<ConstantFPSDNode>(N)->getValueAPF(), SDLoc(N), N->getValueType(0));
222-
SDNode *LoadConstF16 = CurDAG->getMachineNode(
223-
(N->getValueType(0) == MVT::f16 ? NVPTX::LOAD_CONST_F16
224-
: NVPTX::LOAD_CONST_BF16),
225-
SDLoc(N), N->getValueType(0), Val);
226-
ReplaceNode(N, LoadConstF16);
227-
return true;
228-
}
229-
230211
// Map ISD::CONDCODE value to appropriate CmpMode expected by
231212
// NVPTXInstPrinter::printCmpMode()
232213
static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ) {

llvm/lib/Target/NVPTX/NVPTXInstrInfo.td

Lines changed: 33 additions & 107 deletions
Original file line number | Diff line number | Diff line change
@@ -1177,17 +1177,6 @@ def NegDoubleConst : SDNodeXForm<fpimm, [{
11771177
SDLoc(N), MVT::f64);
11781178
}]>;
11791179

1180-
// Loads FP16 constant into a register.
1181-
//
1182-
// ptxas does not have hex representation for fp16, so we can't use
1183-
// fp16 immediate values in .f16 instructions. Instead we have to load
1184-
// the constant into a register using mov.b16.
1185-
def LOAD_CONST_F16 :
1186-
NVPTXInst<(outs Int16Regs:$dst), (ins f16imm:$a),
1187-
"mov.b16 \t$dst, $a;", []>;
1188-
def LOAD_CONST_BF16 :
1189-
NVPTXInst<(outs Int16Regs:$dst), (ins bf16imm:$a),
1190-
"mov.b16 \t$dst, $a;", []>;
11911180
defm FADD : F3_fma_component<"add", fadd>;
11921181
defm FSUB : F3_fma_component<"sub", fsub>;
11931182
defm FMUL : F3_fma_component<"mul", fmul>;
@@ -1963,7 +1952,7 @@ let hasSideEffects = false in {
19631952

19641953

19651954
// copyPhysReg is hard-coded in NVPTXInstrInfo.cpp
1966-
let IsSimpleMove=1, hasSideEffects=0 in {
1955+
let IsSimpleMove=1, hasSideEffects=0, isAsCheapAsAMove=1 in {
19671956
def IMOV1rr : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$sss),
19681957
"mov.pred \t$dst, $sss;", []>;
19691958
def IMOV16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
@@ -1975,48 +1964,37 @@ let IsSimpleMove=1, hasSideEffects=0 in {
19751964
def IMOV128rr : NVPTXInst<(outs Int128Regs:$dst), (ins Int128Regs:$sss),
19761965
"mov.b128 \t$dst, $sss;", []>;
19771966

1978-
def IMOVB16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
1979-
"mov.b16 \t$dst, $sss;", []>;
1980-
def IMOVB32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss),
1981-
"mov.b32 \t$dst, $sss;", []>;
1982-
def IMOVB64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
1983-
"mov.b64 \t$dst, $sss;", []>;
1984-
1985-
def FMOV16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
1986-
// We have to use .b16 here as there's no mov.f16.
1987-
"mov.b16 \t$dst, $src;", []>;
19881967
def FMOV32rr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
19891968
"mov.f32 \t$dst, $src;", []>;
19901969
def FMOV64rr : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src),
19911970
"mov.f64 \t$dst, $src;", []>;
1992-
}
19931971

1994-
def IMOV1ri : NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src),
1995-
"mov.pred \t$dst, $src;",
1996-
[(set i1:$dst, imm:$src)]>;
1997-
def IMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
1998-
"mov.u16 \t$dst, $src;",
1999-
[(set i16:$dst, imm:$src)]>;
2000-
def IMOV32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
2001-
"mov.u32 \t$dst, $src;",
2002-
[(set i32:$dst, imm:$src)]>;
2003-
def IMOV64ri : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
2004-
"mov.u64 \t$dst, $src;",
2005-
[(set i64:$dst, imm:$src)]>;
2006-
2007-
def IMOVB16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
2008-
"mov.b16 \t$dst, $src;", []>;
2009-
def IMOVB32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
2010-
"mov.b32 \t$dst, $src;", []>;
2011-
def IMOVB64ri : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
2012-
"mov.b64 \t$dst, $src;", []>;
2013-
2014-
def FMOV32ri : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src),
2015-
"mov.f32 \t$dst, $src;",
2016-
[(set f32:$dst, fpimm:$src)]>;
2017-
def FMOV64ri : NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src),
2018-
"mov.f64 \t$dst, $src;",
2019-
[(set f64:$dst, fpimm:$src)]>;
1972+
def IMOV1ri : NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src),
1973+
"mov.pred \t$dst, $src;",
1974+
[(set i1:$dst, imm:$src)]>;
1975+
def IMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
1976+
"mov.b16 \t$dst, $src;",
1977+
[(set i16:$dst, imm:$src)]>;
1978+
def IMOV32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
1979+
"mov.b32 \t$dst, $src;",
1980+
[(set i32:$dst, imm:$src)]>;
1981+
def IMOV64ri : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
1982+
"mov.b64 \t$dst, $src;",
1983+
[(set i64:$dst, imm:$src)]>;
1984+
1985+
def FMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins f16imm:$src),
1986+
"mov.b16 \t$dst, $src;",
1987+
[(set f16:$dst, fpimm:$src)]>;
1988+
def BFMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins bf16imm:$src),
1989+
"mov.b16 \t$dst, $src;",
1990+
[(set bf16:$dst, fpimm:$src)]>;
1991+
def FMOV32ri : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src),
1992+
"mov.f32 \t$dst, $src;",
1993+
[(set f32:$dst, fpimm:$src)]>;
1994+
def FMOV64ri : NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src),
1995+
"mov.f64 \t$dst, $src;",
1996+
[(set f64:$dst, fpimm:$src)]>;
1997+
}
20201998

20211999
def : Pat<(i32 (Wrapper texternalsym:$dst)), (IMOV32ri texternalsym:$dst)>;
20222000
def : Pat<(i64 (Wrapper texternalsym:$dst)), (IMOV64ri texternalsym:$dst)>;
@@ -2215,18 +2193,6 @@ multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
22152193
def : Pat<(i1 (OpNode f16:$a, f16:$b)),
22162194
(SETP_f16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
22172195
Requires<[useFP16Math]>;
2218-
def : Pat<(i1 (OpNode f16:$a, fpimm:$b)),
2219-
(SETP_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
2220-
Requires<[useFP16Math,doF32FTZ]>;
2221-
def : Pat<(i1 (OpNode f16:$a, fpimm:$b)),
2222-
(SETP_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
2223-
Requires<[useFP16Math]>;
2224-
def : Pat<(i1 (OpNode fpimm:$a, f16:$b)),
2225-
(SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, ModeFTZ)>,
2226-
Requires<[useFP16Math,doF32FTZ]>;
2227-
def : Pat<(i1 (OpNode fpimm:$a, f16:$b)),
2228-
(SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, Mode)>,
2229-
Requires<[useFP16Math]>;
22302196

22312197
// bf16 -> pred
22322198
def : Pat<(i1 (OpNode bf16:$a, bf16:$b)),
@@ -2235,18 +2201,6 @@ multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
22352201
def : Pat<(i1 (OpNode bf16:$a, bf16:$b)),
22362202
(SETP_bf16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
22372203
Requires<[hasBF16Math]>;
2238-
def : Pat<(i1 (OpNode bf16:$a, fpimm:$b)),
2239-
(SETP_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), ModeFTZ)>,
2240-
Requires<[hasBF16Math,doF32FTZ]>;
2241-
def : Pat<(i1 (OpNode bf16:$a, fpimm:$b)),
2242-
(SETP_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), Mode)>,
2243-
Requires<[hasBF16Math]>;
2244-
def : Pat<(i1 (OpNode fpimm:$a, bf16:$b)),
2245-
(SETP_bf16rr (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, ModeFTZ)>,
2246-
Requires<[hasBF16Math,doF32FTZ]>;
2247-
def : Pat<(i1 (OpNode fpimm:$a, bf16:$b)),
2248-
(SETP_bf16rr (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, Mode)>,
2249-
Requires<[hasBF16Math]>;
22502204

22512205
// f32 -> pred
22522206
def : Pat<(i1 (OpNode f32:$a, f32:$b)),
@@ -2280,18 +2234,6 @@ multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
22802234
def : Pat<(i32 (OpNode f16:$a, f16:$b)),
22812235
(SET_f16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
22822236
Requires<[useFP16Math]>;
2283-
def : Pat<(i32 (OpNode f16:$a, fpimm:$b)),
2284-
(SET_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
2285-
Requires<[useFP16Math, doF32FTZ]>;
2286-
def : Pat<(i32 (OpNode f16:$a, fpimm:$b)),
2287-
(SET_f16rr Int16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
2288-
Requires<[useFP16Math]>;
2289-
def : Pat<(i32 (OpNode fpimm:$a, f16:$b)),
2290-
(SET_f16ir (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, ModeFTZ)>,
2291-
Requires<[useFP16Math, doF32FTZ]>;
2292-
def : Pat<(i32 (OpNode fpimm:$a, f16:$b)),
2293-
(SET_f16ir (LOAD_CONST_F16 fpimm:$a), Int16Regs:$b, Mode)>,
2294-
Requires<[useFP16Math]>;
22952237

22962238
// bf16 -> i32
22972239
def : Pat<(i32 (OpNode bf16:$a, bf16:$b)),
@@ -2300,18 +2242,6 @@ multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
23002242
def : Pat<(i32 (OpNode bf16:$a, bf16:$b)),
23012243
(SET_bf16rr Int16Regs:$a, Int16Regs:$b, Mode)>,
23022244
Requires<[hasBF16Math]>;
2303-
def : Pat<(i32 (OpNode bf16:$a, fpimm:$b)),
2304-
(SET_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), ModeFTZ)>,
2305-
Requires<[hasBF16Math, doF32FTZ]>;
2306-
def : Pat<(i32 (OpNode bf16:$a, fpimm:$b)),
2307-
(SET_bf16rr Int16Regs:$a, (LOAD_CONST_BF16 fpimm:$b), Mode)>,
2308-
Requires<[hasBF16Math]>;
2309-
def : Pat<(i32 (OpNode fpimm:$a, bf16:$b)),
2310-
(SET_bf16ir (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, ModeFTZ)>,
2311-
Requires<[hasBF16Math, doF32FTZ]>;
2312-
def : Pat<(i32 (OpNode fpimm:$a, bf16:$b)),
2313-
(SET_bf16ir (LOAD_CONST_BF16 fpimm:$a), Int16Regs:$b, Mode)>,
2314-
Requires<[hasBF16Math]>;
23152245

23162246
// f32 -> i32
23172247
def : Pat<(i32 (OpNode f32:$a, f32:$b)),
@@ -3104,21 +3034,17 @@ def: Pat<(f32 (bitconvert vt:$a)),
31043034
(BITCONVERT_32_I2F Int32Regs:$a)>;
31053035
}
31063036
foreach vt = [f16, bf16] in {
3107-
def: Pat<(vt (bitconvert (i16 UInt16Const:$a))),
3108-
(IMOVB16ri UInt16Const:$a)>;
3109-
def: Pat<(vt (bitconvert i16:$a)),
3110-
(ProxyRegI16 Int16Regs:$a)>;
3111-
def: Pat<(i16 (bitconvert vt:$a)),
3112-
(ProxyRegI16 Int16Regs:$a)>;
3037+
def: Pat<(vt (bitconvert i16:$a)),
3038+
(vt Int16Regs:$a)>;
3039+
def: Pat<(i16 (bitconvert vt:$a)),
3040+
(i16 Int16Regs:$a)>;
31133041
}
31143042

31153043
foreach ta = [v2f16, v2bf16, v2i16, v4i8, i32] in {
3116-
def: Pat<(ta (bitconvert (i32 UInt32Const:$a))),
3117-
(IMOVB32ri UInt32Const:$a)>;
31183044
foreach tb = [v2f16, v2bf16, v2i16, v4i8, i32] in {
31193045
if !ne(ta, tb) then {
3120-
def: Pat<(ta (bitconvert (tb Int32Regs:$a))),
3121-
(ProxyRegI32 Int32Regs:$a)>;
3046+
def: Pat<(ta (bitconvert tb:$a)),
3047+
(ta Int32Regs:$a)>;
31223048
}
31233049
}
31243050
}

llvm/lib/Target/NVPTX/NVPTXIntrinsics.td

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2803,10 +2803,10 @@ def : Pat<(int_nvvm_ptr_param_to_gen i64:$src),
28032803

28042804
// nvvm.ptr.gen.to.param
28052805
def : Pat<(int_nvvm_ptr_gen_to_param i32:$src),
2806-
(IMOV32rr Int32Regs:$src)>;
2806+
(i32 Int32Regs:$src)>;
28072807

28082808
def : Pat<(int_nvvm_ptr_gen_to_param i64:$src),
2809-
(IMOV64rr Int64Regs:$src)>;
2809+
(i64 Int64Regs:$src)>;
28102810

28112811
// nvvm.move intrinsic
28122812
def nvvm_move_i16 : NVPTXInst<(outs Int16Regs:$r), (ins Int16Regs:$s),

llvm/test/CodeGen/NVPTX/atomics-sm70.ll

Lines changed: 17 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -46,7 +46,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half %
4646
; CHECKPTX62-LABEL: test(
4747
; CHECKPTX62: {
4848
; CHECKPTX62-NEXT: .reg .pred %p<5>;
49-
; CHECKPTX62-NEXT: .reg .b16 %rs<19>;
49+
; CHECKPTX62-NEXT: .reg .b16 %rs<11>;
5050
; CHECKPTX62-NEXT: .reg .b32 %r<58>;
5151
; CHECKPTX62-EMPTY:
5252
; CHECKPTX62-NEXT: // %bb.0:
@@ -65,8 +65,8 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half %
6565
; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1
6666
; CHECKPTX62-NEXT: shr.u32 %r28, %r54, %r2;
6767
; CHECKPTX62-NEXT: cvt.u16.u32 %rs2, %r28;
68-
; CHECKPTX62-NEXT: add.rn.f16 %rs4, %rs2, %rs1;
69-
; CHECKPTX62-NEXT: cvt.u32.u16 %r29, %rs4;
68+
; CHECKPTX62-NEXT: add.rn.f16 %rs3, %rs2, %rs1;
69+
; CHECKPTX62-NEXT: cvt.u32.u16 %r29, %rs3;
7070
; CHECKPTX62-NEXT: shl.b32 %r30, %r29, %r2;
7171
; CHECKPTX62-NEXT: and.b32 %r31, %r54, %r3;
7272
; CHECKPTX62-NEXT: or.b32 %r32, %r31, %r30;
@@ -79,10 +79,10 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half %
7979
; CHECKPTX62-NEXT: $L__BB0_3: // %atomicrmw.start27
8080
; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1
8181
; CHECKPTX62-NEXT: shr.u32 %r33, %r55, %r2;
82-
; CHECKPTX62-NEXT: cvt.u16.u32 %rs6, %r33;
83-
; CHECKPTX62-NEXT: mov.b16 %rs8, 0x3C00;
84-
; CHECKPTX62-NEXT: add.rn.f16 %rs9, %rs6, %rs8;
85-
; CHECKPTX62-NEXT: cvt.u32.u16 %r34, %rs9;
82+
; CHECKPTX62-NEXT: cvt.u16.u32 %rs4, %r33;
83+
; CHECKPTX62-NEXT: mov.b16 %rs5, 0x3C00;
84+
; CHECKPTX62-NEXT: add.rn.f16 %rs6, %rs4, %rs5;
85+
; CHECKPTX62-NEXT: cvt.u32.u16 %r34, %rs6;
8686
; CHECKPTX62-NEXT: shl.b32 %r35, %r34, %r2;
8787
; CHECKPTX62-NEXT: and.b32 %r36, %r55, %r3;
8888
; CHECKPTX62-NEXT: or.b32 %r37, %r36, %r35;
@@ -94,15 +94,16 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half %
9494
; CHECKPTX62-NEXT: and.b32 %r10, %r22, -4;
9595
; CHECKPTX62-NEXT: shl.b32 %r38, %r22, 3;
9696
; CHECKPTX62-NEXT: and.b32 %r11, %r38, 24;
97-
; CHECKPTX62-NEXT: shl.b32 %r40, %r26, %r11;
97+
; CHECKPTX62-NEXT: mov.b32 %r39, 65535;
98+
; CHECKPTX62-NEXT: shl.b32 %r40, %r39, %r11;
9899
; CHECKPTX62-NEXT: not.b32 %r12, %r40;
99100
; CHECKPTX62-NEXT: ld.global.u32 %r56, [%r10];
100101
; CHECKPTX62-NEXT: $L__BB0_5: // %atomicrmw.start9
101102
; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1
102103
; CHECKPTX62-NEXT: shr.u32 %r41, %r56, %r11;
103-
; CHECKPTX62-NEXT: cvt.u16.u32 %rs11, %r41;
104-
; CHECKPTX62-NEXT: add.rn.f16 %rs13, %rs11, %rs1;
105-
; CHECKPTX62-NEXT: cvt.u32.u16 %r42, %rs13;
104+
; CHECKPTX62-NEXT: cvt.u16.u32 %rs7, %r41;
105+
; CHECKPTX62-NEXT: add.rn.f16 %rs8, %rs7, %rs1;
106+
; CHECKPTX62-NEXT: cvt.u32.u16 %r42, %rs8;
106107
; CHECKPTX62-NEXT: shl.b32 %r43, %r42, %r11;
107108
; CHECKPTX62-NEXT: and.b32 %r44, %r56, %r12;
108109
; CHECKPTX62-NEXT: or.b32 %r45, %r44, %r43;
@@ -114,15 +115,16 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half %
114115
; CHECKPTX62-NEXT: and.b32 %r16, %r23, -4;
115116
; CHECKPTX62-NEXT: shl.b32 %r46, %r23, 3;
116117
; CHECKPTX62-NEXT: and.b32 %r17, %r46, 24;
117-
; CHECKPTX62-NEXT: shl.b32 %r48, %r26, %r17;
118+
; CHECKPTX62-NEXT: mov.b32 %r47, 65535;
119+
; CHECKPTX62-NEXT: shl.b32 %r48, %r47, %r17;
118120
; CHECKPTX62-NEXT: not.b32 %r18, %r48;
119121
; CHECKPTX62-NEXT: ld.shared.u32 %r57, [%r16];
120122
; CHECKPTX62-NEXT: $L__BB0_7: // %atomicrmw.start
121123
; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1
122124
; CHECKPTX62-NEXT: shr.u32 %r49, %r57, %r17;
123-
; CHECKPTX62-NEXT: cvt.u16.u32 %rs15, %r49;
124-
; CHECKPTX62-NEXT: add.rn.f16 %rs17, %rs15, %rs1;
125-
; CHECKPTX62-NEXT: cvt.u32.u16 %r50, %rs17;
125+
; CHECKPTX62-NEXT: cvt.u16.u32 %rs9, %r49;
126+
; CHECKPTX62-NEXT: add.rn.f16 %rs10, %rs9, %rs1;
127+
; CHECKPTX62-NEXT: cvt.u32.u16 %r50, %rs10;
126128
; CHECKPTX62-NEXT: shl.b32 %r51, %r50, %r17;
127129
; CHECKPTX62-NEXT: and.b32 %r52, %r57, %r18;
128130
; CHECKPTX62-NEXT: or.b32 %r53, %r52, %r51;

0 commit comments

Comments
 (0)