Skip to content

Commit 802a2e3

Browse files
authored
[NVPTX] Add intrinsics for the szext instruction (#139126)
This change adds support for `llvm.nvvm.{sext,zext}.{wrap,clamp}` intrinsics.
1 parent 2cc8734 commit 802a2e3

File tree

6 files changed

+260
-132
lines changed

6 files changed

+260
-132
lines changed

llvm/docs/NVPTXUsage.rst

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -568,6 +568,36 @@ to left-shift the found bit into the most-significant bit position, otherwise
568568
the result is the shift amount needed to right-shift the found bit into the
569569
least-significant bit position. 0xffffffff is returned if no 1 bit is found.
570570

571+
'``llvm.nvvm.{zext,sext}.{wrap,clamp}``' Intrinsics
572+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
573+
574+
Syntax:
575+
"""""""
576+
577+
.. code-block:: llvm
578+
579+
declare i32 @llvm.nvvm.zext.wrap(i32 %a, i32 %b)
580+
declare i32 @llvm.nvvm.zext.clamp(i32 %a, i32 %b)
581+
declare i32 @llvm.nvvm.sext.wrap(i32 %a, i32 %b)
582+
declare i32 @llvm.nvvm.sext.clamp(i32 %a, i32 %b)
583+
584+
Overview:
585+
"""""""""
586+
587+
The '``llvm.nvvm.{zext,sext}.{wrap,clamp}``' family of intrinsics extracts the
588+
low bits of the input value, and zero- or sign-extends them back to the original
589+
width.
590+
591+
Semantics:
592+
""""""""""
593+
594+
The '``llvm.nvvm.{zext,sext}.{wrap,clamp}``' family of intrinsics returns
595+
the extension of the N lowest bits of operand %a. For the '``wrap``' variants, N is the
596+
value of operand %b modulo 32. For the '``clamp``' variants, N is the value of
597+
operand %b clamped to the range [0, 32]. The N lowest bits are then
598+
zero-extended in the case of the '``zext``' variants, or sign-extended in the case of
599+
the '``sext``' variants. If N is 0, the result is 0.
600+
571601
TMA family of Intrinsics
572602
------------------------
573603

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1356,6 +1356,17 @@ let TargetPrefix = "nvvm" in {
13561356
[llvm_anyint_ty, llvm_i1_ty],
13571357
[IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<ArgIndex<1>>]>;
13581358

1359+
1360+
//
1361+
// szext
1362+
//
1363+
foreach ext = ["sext", "zext"] in
1364+
foreach mode = ["wrap", "clamp"] in
1365+
def int_nvvm_ # ext # _ # mode :
1366+
DefaultAttrsIntrinsic<[llvm_i32_ty],
1367+
[llvm_i32_ty, llvm_i32_ty],
1368+
[IntrNoMem, IntrSpeculatable]>;
1369+
13591370
//
13601371
// Convert
13611372
//

llvm/lib/Target/NVPTX/NVPTXInstrInfo.td

Lines changed: 51 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,7 @@ class RegTyInfo<ValueType ty, NVPTXRegClass rc, Operand imm, SDNode imm_node,
227227
int Size = ty.Size;
228228
}
229229

230+
def I1RT : RegTyInfo<i1, Int1Regs, i1imm, imm>;
230231
def I16RT : RegTyInfo<i16, Int16Regs, i16imm, imm>;
231232
def I32RT : RegTyInfo<i32, Int32Regs, i32imm, imm>;
232233
def I64RT : RegTyInfo<i64, Int64Regs, i64imm, imm>;
@@ -240,26 +241,33 @@ def F16X2RT : RegTyInfo<v2f16, Int32Regs, ?, ?, supports_imm = 0>;
240241
def BF16X2RT : RegTyInfo<v2bf16, Int32Regs, ?, ?, supports_imm = 0>;
241242

242243

244+
multiclass I3Inst<string op_str, SDPatternOperator op_node, RegTyInfo t,
245+
bit commutative, list<Predicate> requires = []> {
246+
defvar asmstr = op_str # " \t$dst, $a, $b;";
247+
248+
def rr :
249+
NVPTXInst<(outs t.RC:$dst), (ins t.RC:$a, t.RC:$b),
250+
asmstr,
251+
[(set t.Ty:$dst, (op_node t.Ty:$a, t.Ty:$b))]>,
252+
Requires<requires>;
253+
def ri :
254+
NVPTXInst<(outs t.RC:$dst), (ins t.RC:$a, t.Imm:$b),
255+
asmstr,
256+
[(set t.Ty:$dst, (op_node t.Ty:$a, (t.Ty imm:$b)))]>,
257+
Requires<requires>;
258+
if !not(commutative) then
259+
def ir :
260+
NVPTXInst<(outs t.RC:$dst), (ins t.Imm:$a, t.RC:$b),
261+
asmstr,
262+
[(set t.Ty:$dst, (op_node (t.Ty imm:$a), t.Ty:$b))]>,
263+
Requires<requires>;
264+
}
265+
243266
// Template for instructions which take three int64, int32, or int16 args.
244267
// The instructions are named "<OpcStr><Width>" (e.g. "add.s64").
245-
multiclass I3<string OpcStr, SDNode OpNode, bit commutative> {
246-
foreach t = [I16RT, I32RT, I64RT] in {
247-
defvar asmstr = OpcStr # t.Size # " \t$dst, $a, $b;";
248-
249-
def t.Ty # rr :
250-
NVPTXInst<(outs t.RC:$dst), (ins t.RC:$a, t.RC:$b),
251-
asmstr,
252-
[(set t.Ty:$dst, (OpNode t.Ty:$a, t.Ty:$b))]>;
253-
def t.Ty # ri :
254-
NVPTXInst<(outs t.RC:$dst), (ins t.RC:$a, t.Imm:$b),
255-
asmstr,
256-
[(set t.Ty:$dst, (OpNode t.RC:$a, imm:$b))]>;
257-
if !not(commutative) then
258-
def t.Ty # ir :
259-
NVPTXInst<(outs t.RC:$dst), (ins t.Imm:$a, t.RC:$b),
260-
asmstr,
261-
[(set t.Ty:$dst, (OpNode imm:$a, t.RC:$b))]>;
262-
}
268+
multiclass I3<string op_str, SDPatternOperator op_node, bit commutative> {
269+
foreach t = [I16RT, I32RT, I64RT] in
270+
defm t.Ty# : I3Inst<op_str # t.Size, op_node, t, commutative>;
263271
}
264272

265273
class I16x2<string OpcStr, SDNode OpNode> :
@@ -270,26 +278,11 @@ class I16x2<string OpcStr, SDNode OpNode> :
270278

271279
// Template for instructions which take 3 int args. The instructions are
272280
// named "<op_str>.s32" or "<op_str>.s64" (e.g. "addc.cc.s32").
273-
multiclass ADD_SUB_INT_CARRY<string OpcStr, SDNode OpNode> {
281+
multiclass ADD_SUB_INT_CARRY<string op_str, SDNode op_node, bit commutative> {
274282
let hasSideEffects = 1 in {
275-
def i32rr :
276-
NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
277-
!strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
278-
[(set i32:$dst, (OpNode i32:$a, i32:$b))]>;
279-
def i32ri :
280-
NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
281-
!strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
282-
[(set i32:$dst, (OpNode i32:$a, imm:$b))]>;
283-
def i64rr :
284-
NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
285-
!strconcat(OpcStr, ".s64 \t$dst, $a, $b;"),
286-
[(set i64:$dst, (OpNode i64:$a, i64:$b))]>,
287-
Requires<[hasPTX<43>]>;
288-
def i64ri :
289-
NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
290-
!strconcat(OpcStr, ".s64 \t$dst, $a, $b;"),
291-
[(set i64:$dst, (OpNode i64:$a, imm:$b))]>,
292-
Requires<[hasPTX<43>]>;
283+
defm i32 : I3Inst<op_str # ".s32", op_node, I32RT, commutative>;
284+
defm i64 : I3Inst<op_str # ".s64", op_node, I64RT, commutative,
285+
requires = [hasPTX<43>]>;
293286
}
294287
}
295288

@@ -841,31 +834,31 @@ defm SUB_i1 : ADD_SUB_i1<sub>;
841834

842835
// int16, int32, and int64 signed addition. Since nvptx is 2's complement, we
843836
// also use these for unsigned arithmetic.
844-
defm ADD : I3<"add.s", add, /*commutative=*/ true>;
845-
defm SUB : I3<"sub.s", sub, /*commutative=*/ false>;
837+
defm ADD : I3<"add.s", add, commutative = true>;
838+
defm SUB : I3<"sub.s", sub, commutative = false>;
846839

847840
def ADD16x2 : I16x2<"add.s", add>;
848841

849842
// int32 and int64 addition and subtraction with carry-out.
850-
defm ADDCC : ADD_SUB_INT_CARRY<"add.cc", addc>;
851-
defm SUBCC : ADD_SUB_INT_CARRY<"sub.cc", subc>;
843+
defm ADDCC : ADD_SUB_INT_CARRY<"add.cc", addc, commutative = true>;
844+
defm SUBCC : ADD_SUB_INT_CARRY<"sub.cc", subc, commutative = false>;
852845

853846
// int32 and int64 addition and subtraction with carry-in and carry-out.
854-
defm ADDCCC : ADD_SUB_INT_CARRY<"addc.cc", adde>;
855-
defm SUBCCC : ADD_SUB_INT_CARRY<"subc.cc", sube>;
847+
defm ADDCCC : ADD_SUB_INT_CARRY<"addc.cc", adde, commutative = true>;
848+
defm SUBCCC : ADD_SUB_INT_CARRY<"subc.cc", sube, commutative = false>;
856849

857-
defm MULT : I3<"mul.lo.s", mul, /*commutative=*/ true>;
850+
defm MULT : I3<"mul.lo.s", mul, commutative = true>;
858851

859-
defm MULTHS : I3<"mul.hi.s", mulhs, /*commutative=*/ true>;
860-
defm MULTHU : I3<"mul.hi.u", mulhu, /*commutative=*/ true>;
852+
defm MULTHS : I3<"mul.hi.s", mulhs, commutative = true>;
853+
defm MULTHU : I3<"mul.hi.u", mulhu, commutative = true>;
861854

862-
defm SDIV : I3<"div.s", sdiv, /*commutative=*/ false>;
863-
defm UDIV : I3<"div.u", udiv, /*commutative=*/ false>;
855+
defm SDIV : I3<"div.s", sdiv, commutative = false>;
856+
defm UDIV : I3<"div.u", udiv, commutative = false>;
864857

865858
// The ri versions of rem.s and rem.u won't be selected; DAGCombiner::visitSREM
866859
// will lower it.
867-
defm SREM : I3<"rem.s", srem, /*commutative=*/ false>;
868-
defm UREM : I3<"rem.u", urem, /*commutative=*/ false>;
860+
defm SREM : I3<"rem.s", srem, commutative = false>;
861+
defm UREM : I3<"rem.u", urem, commutative = false>;
869862

870863
// Integer absolute value. NumBits should be one minus the bit width of RC.
871864
// This idiom implements the algorithm at
@@ -880,10 +873,10 @@ defm ABS_32 : ABS<i32, Int32Regs, ".s32">;
880873
defm ABS_64 : ABS<i64, Int64Regs, ".s64">;
881874

882875
// Integer min/max.
883-
defm SMAX : I3<"max.s", smax, /*commutative=*/ true>;
884-
defm UMAX : I3<"max.u", umax, /*commutative=*/ true>;
885-
defm SMIN : I3<"min.s", smin, /*commutative=*/ true>;
886-
defm UMIN : I3<"min.u", umin, /*commutative=*/ true>;
876+
defm SMAX : I3<"max.s", smax, commutative = true>;
877+
defm UMAX : I3<"max.u", umax, commutative = true>;
878+
defm SMIN : I3<"min.s", smin, commutative = true>;
879+
defm UMIN : I3<"min.u", umin, commutative = true>;
887880

888881
def SMAX16x2 : I16x2<"max.s", smax>;
889882
def UMAX16x2 : I16x2<"max.u", umax>;
@@ -1393,38 +1386,10 @@ def COSF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
13931386
// Template for three-arg bitwise operations. Takes three args, Creates .b16,
13941387
// .b32, .b64, and .pred (predicate registers -- i.e., i1) versions of OpcStr.
13951388
multiclass BITWISE<string OpcStr, SDNode OpNode> {
1396-
def b1rr :
1397-
NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
1398-
!strconcat(OpcStr, ".pred \t$dst, $a, $b;"),
1399-
[(set i1:$dst, (OpNode i1:$a, i1:$b))]>;
1400-
def b1ri :
1401-
NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
1402-
!strconcat(OpcStr, ".pred \t$dst, $a, $b;"),
1403-
[(set i1:$dst, (OpNode i1:$a, imm:$b))]>;
1404-
def b16rr :
1405-
NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
1406-
!strconcat(OpcStr, ".b16 \t$dst, $a, $b;"),
1407-
[(set i16:$dst, (OpNode i16:$a, i16:$b))]>;
1408-
def b16ri :
1409-
NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
1410-
!strconcat(OpcStr, ".b16 \t$dst, $a, $b;"),
1411-
[(set i16:$dst, (OpNode i16:$a, imm:$b))]>;
1412-
def b32rr :
1413-
NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
1414-
!strconcat(OpcStr, ".b32 \t$dst, $a, $b;"),
1415-
[(set i32:$dst, (OpNode i32:$a, i32:$b))]>;
1416-
def b32ri :
1417-
NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
1418-
!strconcat(OpcStr, ".b32 \t$dst, $a, $b;"),
1419-
[(set i32:$dst, (OpNode i32:$a, imm:$b))]>;
1420-
def b64rr :
1421-
NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
1422-
!strconcat(OpcStr, ".b64 \t$dst, $a, $b;"),
1423-
[(set i64:$dst, (OpNode i64:$a, i64:$b))]>;
1424-
def b64ri :
1425-
NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
1426-
!strconcat(OpcStr, ".b64 \t$dst, $a, $b;"),
1427-
[(set i64:$dst, (OpNode i64:$a, imm:$b))]>;
1389+
defm b1 : I3Inst<OpcStr # ".pred", OpNode, I1RT, commutative = true>;
1390+
defm b16 : I3Inst<OpcStr # ".b16", OpNode, I16RT, commutative = true>;
1391+
defm b32 : I3Inst<OpcStr # ".b32", OpNode, I32RT, commutative = true>;
1392+
defm b64 : I3Inst<OpcStr # ".b64", OpNode, I64RT, commutative = true>;
14281393
}
14291394

14301395
defm OR : BITWISE<"or", or>;

llvm/lib/Target/NVPTX/NVPTXIntrinsics.td

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1678,6 +1678,21 @@ foreach t = [I32RT, I64RT] in {
16781678
}
16791679
}
16801680

1681+
//
1682+
// szext
1683+
//
1684+
1685+
foreach sign = ["s", "u"] in {
1686+
foreach mode = ["wrap", "clamp"] in {
1687+
defvar ext = !if(!eq(sign, "s"), "sext", "zext");
1688+
defvar intrin = !cast<Intrinsic>("int_nvvm_" # ext # "_" # mode);
1689+
defm SZEXT_ # sign # _ # mode
1690+
: I3Inst<"szext." # mode # "." # sign # "32",
1691+
intrin, I32RT, commutative = false,
1692+
requires = [hasSM<70>, hasPTX<76>]>;
1693+
}
1694+
}
1695+
16811696
//
16821697
// Convert
16831698
//

llvm/test/CodeGen/NVPTX/i128.ll

Lines changed: 46 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -13,37 +13,37 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) {
1313
; CHECK-NEXT: ld.param.v2.u64 {%rd45, %rd46}, [srem_i128_param_0];
1414
; CHECK-NEXT: ld.param.v2.u64 {%rd49, %rd50}, [srem_i128_param_1];
1515
; CHECK-NEXT: shr.s64 %rd2, %rd46, 63;
16-
; CHECK-NEXT: mov.b64 %rd117, 0;
17-
; CHECK-NEXT: sub.cc.s64 %rd52, %rd117, %rd45;
18-
; CHECK-NEXT: subc.cc.s64 %rd53, %rd117, %rd46;
16+
; CHECK-NEXT: sub.cc.s64 %rd51, 0, %rd45;
17+
; CHECK-NEXT: subc.cc.s64 %rd52, 0, %rd46;
1918
; CHECK-NEXT: setp.lt.s64 %p1, %rd46, 0;
20-
; CHECK-NEXT: selp.b64 %rd4, %rd53, %rd46, %p1;
21-
; CHECK-NEXT: selp.b64 %rd3, %rd52, %rd45, %p1;
22-
; CHECK-NEXT: sub.cc.s64 %rd54, %rd117, %rd49;
23-
; CHECK-NEXT: subc.cc.s64 %rd55, %rd117, %rd50;
19+
; CHECK-NEXT: selp.b64 %rd4, %rd52, %rd46, %p1;
20+
; CHECK-NEXT: selp.b64 %rd3, %rd51, %rd45, %p1;
21+
; CHECK-NEXT: sub.cc.s64 %rd53, 0, %rd49;
22+
; CHECK-NEXT: subc.cc.s64 %rd54, 0, %rd50;
2423
; CHECK-NEXT: setp.lt.s64 %p2, %rd50, 0;
25-
; CHECK-NEXT: selp.b64 %rd6, %rd55, %rd50, %p2;
26-
; CHECK-NEXT: selp.b64 %rd5, %rd54, %rd49, %p2;
27-
; CHECK-NEXT: or.b64 %rd56, %rd5, %rd6;
28-
; CHECK-NEXT: setp.eq.s64 %p3, %rd56, 0;
29-
; CHECK-NEXT: or.b64 %rd57, %rd3, %rd4;
30-
; CHECK-NEXT: setp.eq.s64 %p4, %rd57, 0;
24+
; CHECK-NEXT: selp.b64 %rd6, %rd54, %rd50, %p2;
25+
; CHECK-NEXT: selp.b64 %rd5, %rd53, %rd49, %p2;
26+
; CHECK-NEXT: or.b64 %rd55, %rd5, %rd6;
27+
; CHECK-NEXT: setp.eq.s64 %p3, %rd55, 0;
28+
; CHECK-NEXT: or.b64 %rd56, %rd3, %rd4;
29+
; CHECK-NEXT: setp.eq.s64 %p4, %rd56, 0;
3130
; CHECK-NEXT: or.pred %p5, %p3, %p4;
3231
; CHECK-NEXT: setp.ne.s64 %p6, %rd6, 0;
3332
; CHECK-NEXT: clz.b64 %r1, %rd6;
34-
; CHECK-NEXT: cvt.u64.u32 %rd58, %r1;
33+
; CHECK-NEXT: cvt.u64.u32 %rd57, %r1;
3534
; CHECK-NEXT: clz.b64 %r2, %rd5;
36-
; CHECK-NEXT: cvt.u64.u32 %rd59, %r2;
37-
; CHECK-NEXT: add.s64 %rd60, %rd59, 64;
38-
; CHECK-NEXT: selp.b64 %rd61, %rd58, %rd60, %p6;
35+
; CHECK-NEXT: cvt.u64.u32 %rd58, %r2;
36+
; CHECK-NEXT: add.s64 %rd59, %rd58, 64;
37+
; CHECK-NEXT: selp.b64 %rd60, %rd57, %rd59, %p6;
3938
; CHECK-NEXT: setp.ne.s64 %p7, %rd4, 0;
4039
; CHECK-NEXT: clz.b64 %r3, %rd4;
41-
; CHECK-NEXT: cvt.u64.u32 %rd62, %r3;
40+
; CHECK-NEXT: cvt.u64.u32 %rd61, %r3;
4241
; CHECK-NEXT: clz.b64 %r4, %rd3;
43-
; CHECK-NEXT: cvt.u64.u32 %rd63, %r4;
44-
; CHECK-NEXT: add.s64 %rd64, %rd63, 64;
45-
; CHECK-NEXT: selp.b64 %rd65, %rd62, %rd64, %p7;
46-
; CHECK-NEXT: sub.cc.s64 %rd66, %rd61, %rd65;
42+
; CHECK-NEXT: cvt.u64.u32 %rd62, %r4;
43+
; CHECK-NEXT: add.s64 %rd63, %rd62, 64;
44+
; CHECK-NEXT: selp.b64 %rd64, %rd61, %rd63, %p7;
45+
; CHECK-NEXT: mov.b64 %rd117, 0;
46+
; CHECK-NEXT: sub.cc.s64 %rd66, %rd60, %rd64;
4747
; CHECK-NEXT: subc.cc.s64 %rd67, %rd117, 0;
4848
; CHECK-NEXT: setp.gt.u64 %p8, %rd66, 127;
4949
; CHECK-NEXT: setp.eq.s64 %p9, %rd67, 0;
@@ -314,39 +314,39 @@ define i128 @sdiv_i128(i128 %lhs, i128 %rhs) {
314314
; CHECK-NEXT: // %bb.0: // %_udiv-special-cases
315315
; CHECK-NEXT: ld.param.v2.u64 {%rd45, %rd46}, [sdiv_i128_param_0];
316316
; CHECK-NEXT: ld.param.v2.u64 {%rd49, %rd50}, [sdiv_i128_param_1];
317-
; CHECK-NEXT: mov.b64 %rd112, 0;
318-
; CHECK-NEXT: sub.cc.s64 %rd52, %rd112, %rd45;
319-
; CHECK-NEXT: subc.cc.s64 %rd53, %rd112, %rd46;
317+
; CHECK-NEXT: sub.cc.s64 %rd51, 0, %rd45;
318+
; CHECK-NEXT: subc.cc.s64 %rd52, 0, %rd46;
320319
; CHECK-NEXT: setp.lt.s64 %p1, %rd46, 0;
321-
; CHECK-NEXT: selp.b64 %rd2, %rd53, %rd46, %p1;
322-
; CHECK-NEXT: selp.b64 %rd1, %rd52, %rd45, %p1;
323-
; CHECK-NEXT: sub.cc.s64 %rd54, %rd112, %rd49;
324-
; CHECK-NEXT: subc.cc.s64 %rd55, %rd112, %rd50;
320+
; CHECK-NEXT: selp.b64 %rd2, %rd52, %rd46, %p1;
321+
; CHECK-NEXT: selp.b64 %rd1, %rd51, %rd45, %p1;
322+
; CHECK-NEXT: sub.cc.s64 %rd53, 0, %rd49;
323+
; CHECK-NEXT: subc.cc.s64 %rd54, 0, %rd50;
325324
; CHECK-NEXT: setp.lt.s64 %p2, %rd50, 0;
326-
; CHECK-NEXT: selp.b64 %rd4, %rd55, %rd50, %p2;
327-
; CHECK-NEXT: selp.b64 %rd3, %rd54, %rd49, %p2;
328-
; CHECK-NEXT: xor.b64 %rd56, %rd50, %rd46;
329-
; CHECK-NEXT: shr.s64 %rd5, %rd56, 63;
330-
; CHECK-NEXT: or.b64 %rd57, %rd3, %rd4;
331-
; CHECK-NEXT: setp.eq.s64 %p3, %rd57, 0;
332-
; CHECK-NEXT: or.b64 %rd58, %rd1, %rd2;
333-
; CHECK-NEXT: setp.eq.s64 %p4, %rd58, 0;
325+
; CHECK-NEXT: selp.b64 %rd4, %rd54, %rd50, %p2;
326+
; CHECK-NEXT: selp.b64 %rd3, %rd53, %rd49, %p2;
327+
; CHECK-NEXT: xor.b64 %rd55, %rd50, %rd46;
328+
; CHECK-NEXT: shr.s64 %rd5, %rd55, 63;
329+
; CHECK-NEXT: or.b64 %rd56, %rd3, %rd4;
330+
; CHECK-NEXT: setp.eq.s64 %p3, %rd56, 0;
331+
; CHECK-NEXT: or.b64 %rd57, %rd1, %rd2;
332+
; CHECK-NEXT: setp.eq.s64 %p4, %rd57, 0;
334333
; CHECK-NEXT: or.pred %p5, %p3, %p4;
335334
; CHECK-NEXT: setp.ne.s64 %p6, %rd4, 0;
336335
; CHECK-NEXT: clz.b64 %r1, %rd4;
337-
; CHECK-NEXT: cvt.u64.u32 %rd59, %r1;
336+
; CHECK-NEXT: cvt.u64.u32 %rd58, %r1;
338337
; CHECK-NEXT: clz.b64 %r2, %rd3;
339-
; CHECK-NEXT: cvt.u64.u32 %rd60, %r2;
340-
; CHECK-NEXT: add.s64 %rd61, %rd60, 64;
341-
; CHECK-NEXT: selp.b64 %rd62, %rd59, %rd61, %p6;
338+
; CHECK-NEXT: cvt.u64.u32 %rd59, %r2;
339+
; CHECK-NEXT: add.s64 %rd60, %rd59, 64;
340+
; CHECK-NEXT: selp.b64 %rd61, %rd58, %rd60, %p6;
342341
; CHECK-NEXT: setp.ne.s64 %p7, %rd2, 0;
343342
; CHECK-NEXT: clz.b64 %r3, %rd2;
344-
; CHECK-NEXT: cvt.u64.u32 %rd63, %r3;
343+
; CHECK-NEXT: cvt.u64.u32 %rd62, %r3;
345344
; CHECK-NEXT: clz.b64 %r4, %rd1;
346-
; CHECK-NEXT: cvt.u64.u32 %rd64, %r4;
347-
; CHECK-NEXT: add.s64 %rd65, %rd64, 64;
348-
; CHECK-NEXT: selp.b64 %rd66, %rd63, %rd65, %p7;
349-
; CHECK-NEXT: sub.cc.s64 %rd67, %rd62, %rd66;
345+
; CHECK-NEXT: cvt.u64.u32 %rd63, %r4;
346+
; CHECK-NEXT: add.s64 %rd64, %rd63, 64;
347+
; CHECK-NEXT: selp.b64 %rd65, %rd62, %rd64, %p7;
348+
; CHECK-NEXT: mov.b64 %rd112, 0;
349+
; CHECK-NEXT: sub.cc.s64 %rd67, %rd61, %rd65;
350350
; CHECK-NEXT: subc.cc.s64 %rd68, %rd112, 0;
351351
; CHECK-NEXT: setp.gt.u64 %p8, %rd67, 127;
352352
; CHECK-NEXT: setp.eq.s64 %p9, %rd68, 0;

0 commit comments

Comments
 (0)