Skip to content

[NVPTX] Add intrinsics for the szext instruction #139126

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
May 9, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions llvm/docs/NVPTXUsage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -568,6 +568,36 @@ to left-shift the found bit into the most-significant bit position, otherwise
the result is the shift amount needed to right-shift the found bit into the
least-significant bit position. 0xffffffff is returned if no 1 bit is found.

'``llvm.nvvm.{zext,sext}.{wrap,clamp}``' Intrinsics
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Syntax:
"""""""

.. code-block:: llvm

declare i32 @llvm.nvvm.zext.wrap(i32 %a, i32 %b)
declare i32 @llvm.nvvm.zext.clamp(i32 %a, i32 %b)
declare i32 @llvm.nvvm.sext.wrap(i32 %a, i32 %b)
declare i32 @llvm.nvvm.sext.clamp(i32 %a, i32 %b)

Overview:
"""""""""

The '``llvm.nvvm.{zext,sext}.{wrap,clamp}``' family of intrinsics extracts the
low bits of the input value, and zero- or sign-extends them back to the original
width.

Semantics:
""""""""""

The '``llvm.nvvm.{zext,sext}.{wrap,clamp}``' family of intrinsics returns the
extension of the N lowest bits of operand %a. For the '``wrap``' variants, N is
the value of operand %b modulo 32. For the '``clamp``' variants, N is the value
of operand %b clamped to the range [0, 32]. The N lowest bits are then
zero-extended in the case of the '``zext``' variants, or sign-extended in the
case of the '``sext``' variants. If N is 0, the result is 0.

TMA family of Intrinsics
------------------------

Expand Down
11 changes: 11 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsNVVM.td
Original file line number Diff line number Diff line change
Expand Up @@ -1356,6 +1356,17 @@ let TargetPrefix = "nvvm" in {
[llvm_anyint_ty, llvm_i1_ty],
[IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<ArgIndex<1>>]>;


//
// szext
//
foreach ext = ["sext", "zext"] in
foreach mode = ["wrap", "clamp"] in
def int_nvvm_ # ext # _ # mode :
DefaultAttrsIntrinsic<[llvm_i32_ty],
[llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, IntrSpeculatable]>;

//
// Convert
//
Expand Down
137 changes: 51 additions & 86 deletions llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,7 @@ class RegTyInfo<ValueType ty, NVPTXRegClass rc, Operand imm, SDNode imm_node,
int Size = ty.Size;
}

def I1RT : RegTyInfo<i1, Int1Regs, i1imm, imm>;
def I16RT : RegTyInfo<i16, Int16Regs, i16imm, imm>;
def I32RT : RegTyInfo<i32, Int32Regs, i32imm, imm>;
def I64RT : RegTyInfo<i64, Int64Regs, i64imm, imm>;
Expand All @@ -240,26 +241,33 @@ def F16X2RT : RegTyInfo<v2f16, Int32Regs, ?, ?, supports_imm = 0>;
def BF16X2RT : RegTyInfo<v2bf16, Int32Regs, ?, ?, supports_imm = 0>;


multiclass I3Inst<string op_str, SDPatternOperator op_node, RegTyInfo t,
bit commutative, list<Predicate> requires = []> {
defvar asmstr = op_str # " \t$dst, $a, $b;";

def rr :
NVPTXInst<(outs t.RC:$dst), (ins t.RC:$a, t.RC:$b),
asmstr,
[(set t.Ty:$dst, (op_node t.Ty:$a, t.Ty:$b))]>,
Requires<requires>;
def ri :
NVPTXInst<(outs t.RC:$dst), (ins t.RC:$a, t.Imm:$b),
asmstr,
[(set t.Ty:$dst, (op_node t.Ty:$a, (t.Ty imm:$b)))]>,
Requires<requires>;
if !not(commutative) then
def ir :
NVPTXInst<(outs t.RC:$dst), (ins t.Imm:$a, t.RC:$b),
asmstr,
[(set t.Ty:$dst, (op_node (t.Ty imm:$a), t.Ty:$b))]>,
Requires<requires>;
}

// Template for instructions which take three int64, int32, or int16 args.
// The instructions are named "<OpcStr><Width>" (e.g. "add.s64").
multiclass I3<string OpcStr, SDNode OpNode, bit commutative> {
foreach t = [I16RT, I32RT, I64RT] in {
defvar asmstr = OpcStr # t.Size # " \t$dst, $a, $b;";

def t.Ty # rr :
NVPTXInst<(outs t.RC:$dst), (ins t.RC:$a, t.RC:$b),
asmstr,
[(set t.Ty:$dst, (OpNode t.Ty:$a, t.Ty:$b))]>;
def t.Ty # ri :
NVPTXInst<(outs t.RC:$dst), (ins t.RC:$a, t.Imm:$b),
asmstr,
[(set t.Ty:$dst, (OpNode t.RC:$a, imm:$b))]>;
if !not(commutative) then
def t.Ty # ir :
NVPTXInst<(outs t.RC:$dst), (ins t.Imm:$a, t.RC:$b),
asmstr,
[(set t.Ty:$dst, (OpNode imm:$a, t.RC:$b))]>;
}
multiclass I3<string op_str, SDPatternOperator op_node, bit commutative> {
foreach t = [I16RT, I32RT, I64RT] in
defm t.Ty# : I3Inst<op_str # t.Size, op_node, t, commutative>;
}

class I16x2<string OpcStr, SDNode OpNode> :
Expand All @@ -270,26 +278,11 @@ class I16x2<string OpcStr, SDNode OpNode> :

// Template for instructions which take 3 int args. The instructions are
// named "<OpcStr>.s32" (e.g. "addc.cc.s32").
multiclass ADD_SUB_INT_CARRY<string OpcStr, SDNode OpNode> {
multiclass ADD_SUB_INT_CARRY<string op_str, SDNode op_node, bit commutative> {
let hasSideEffects = 1 in {
def i32rr :
NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
!strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
[(set i32:$dst, (OpNode i32:$a, i32:$b))]>;
def i32ri :
NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
!strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
[(set i32:$dst, (OpNode i32:$a, imm:$b))]>;
def i64rr :
NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
!strconcat(OpcStr, ".s64 \t$dst, $a, $b;"),
[(set i64:$dst, (OpNode i64:$a, i64:$b))]>,
Requires<[hasPTX<43>]>;
def i64ri :
NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
!strconcat(OpcStr, ".s64 \t$dst, $a, $b;"),
[(set i64:$dst, (OpNode i64:$a, imm:$b))]>,
Requires<[hasPTX<43>]>;
defm i32 : I3Inst<op_str # ".s32", op_node, I32RT, commutative>;
defm i64 : I3Inst<op_str # ".s64", op_node, I64RT, commutative,
requires = [hasPTX<43>]>;
}
}

Expand Down Expand Up @@ -841,31 +834,31 @@ defm SUB_i1 : ADD_SUB_i1<sub>;

// int16, int32, and int64 signed addition. Since nvptx is 2's complement, we
// also use these for unsigned arithmetic.
defm ADD : I3<"add.s", add, /*commutative=*/ true>;
defm SUB : I3<"sub.s", sub, /*commutative=*/ false>;
defm ADD : I3<"add.s", add, commutative = true>;
defm SUB : I3<"sub.s", sub, commutative = false>;

def ADD16x2 : I16x2<"add.s", add>;

// int32 and int64 addition and subtraction with carry-out.
defm ADDCC : ADD_SUB_INT_CARRY<"add.cc", addc>;
defm SUBCC : ADD_SUB_INT_CARRY<"sub.cc", subc>;
defm ADDCC : ADD_SUB_INT_CARRY<"add.cc", addc, commutative = true>;
defm SUBCC : ADD_SUB_INT_CARRY<"sub.cc", subc, commutative = false>;

// int32 and int64 addition and subtraction with carry-in and carry-out.
defm ADDCCC : ADD_SUB_INT_CARRY<"addc.cc", adde>;
defm SUBCCC : ADD_SUB_INT_CARRY<"subc.cc", sube>;
defm ADDCCC : ADD_SUB_INT_CARRY<"addc.cc", adde, commutative = true>;
defm SUBCCC : ADD_SUB_INT_CARRY<"subc.cc", sube, commutative = false>;

defm MULT : I3<"mul.lo.s", mul, /*commutative=*/ true>;
defm MULT : I3<"mul.lo.s", mul, commutative = true>;

defm MULTHS : I3<"mul.hi.s", mulhs, /*commutative=*/ true>;
defm MULTHU : I3<"mul.hi.u", mulhu, /*commutative=*/ true>;
defm MULTHS : I3<"mul.hi.s", mulhs, commutative = true>;
defm MULTHU : I3<"mul.hi.u", mulhu, commutative = true>;

defm SDIV : I3<"div.s", sdiv, /*commutative=*/ false>;
defm UDIV : I3<"div.u", udiv, /*commutative=*/ false>;
defm SDIV : I3<"div.s", sdiv, commutative = false>;
defm UDIV : I3<"div.u", udiv, commutative = false>;

// The ri versions of rem.s and rem.u won't be selected; DAGCombiner::visitSREM
// will lower it.
defm SREM : I3<"rem.s", srem, /*commutative=*/ false>;
defm UREM : I3<"rem.u", urem, /*commutative=*/ false>;
defm SREM : I3<"rem.s", srem, commutative = false>;
defm UREM : I3<"rem.u", urem, commutative = false>;

// Integer absolute value. NumBits should be the bit width of RC minus one.
// This idiom implements the algorithm at
Expand All @@ -880,10 +873,10 @@ defm ABS_32 : ABS<i32, Int32Regs, ".s32">;
defm ABS_64 : ABS<i64, Int64Regs, ".s64">;

// Integer min/max.
defm SMAX : I3<"max.s", smax, /*commutative=*/ true>;
defm UMAX : I3<"max.u", umax, /*commutative=*/ true>;
defm SMIN : I3<"min.s", smin, /*commutative=*/ true>;
defm UMIN : I3<"min.u", umin, /*commutative=*/ true>;
defm SMAX : I3<"max.s", smax, commutative = true>;
defm UMAX : I3<"max.u", umax, commutative = true>;
defm SMIN : I3<"min.s", smin, commutative = true>;
defm UMIN : I3<"min.u", umin, commutative = true>;

def SMAX16x2 : I16x2<"max.s", smax>;
def UMAX16x2 : I16x2<"max.u", umax>;
Expand Down Expand Up @@ -1393,38 +1386,10 @@ def COSF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
// Template for three-arg bitwise operations. Takes three args and creates
// .b16, .b32, .b64, and .pred (predicate registers -- i.e., i1) versions of OpcStr.
multiclass BITWISE<string OpcStr, SDNode OpNode> {
def b1rr :
NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
!strconcat(OpcStr, ".pred \t$dst, $a, $b;"),
[(set i1:$dst, (OpNode i1:$a, i1:$b))]>;
def b1ri :
NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
!strconcat(OpcStr, ".pred \t$dst, $a, $b;"),
[(set i1:$dst, (OpNode i1:$a, imm:$b))]>;
def b16rr :
NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
!strconcat(OpcStr, ".b16 \t$dst, $a, $b;"),
[(set i16:$dst, (OpNode i16:$a, i16:$b))]>;
def b16ri :
NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
!strconcat(OpcStr, ".b16 \t$dst, $a, $b;"),
[(set i16:$dst, (OpNode i16:$a, imm:$b))]>;
def b32rr :
NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
!strconcat(OpcStr, ".b32 \t$dst, $a, $b;"),
[(set i32:$dst, (OpNode i32:$a, i32:$b))]>;
def b32ri :
NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
!strconcat(OpcStr, ".b32 \t$dst, $a, $b;"),
[(set i32:$dst, (OpNode i32:$a, imm:$b))]>;
def b64rr :
NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
!strconcat(OpcStr, ".b64 \t$dst, $a, $b;"),
[(set i64:$dst, (OpNode i64:$a, i64:$b))]>;
def b64ri :
NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
!strconcat(OpcStr, ".b64 \t$dst, $a, $b;"),
[(set i64:$dst, (OpNode i64:$a, imm:$b))]>;
defm b1 : I3Inst<OpcStr # ".pred", OpNode, I1RT, commutative = true>;
defm b16 : I3Inst<OpcStr # ".b16", OpNode, I16RT, commutative = true>;
defm b32 : I3Inst<OpcStr # ".b32", OpNode, I32RT, commutative = true>;
defm b64 : I3Inst<OpcStr # ".b64", OpNode, I64RT, commutative = true>;
}

defm OR : BITWISE<"or", or>;
Expand Down
15 changes: 15 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
Original file line number Diff line number Diff line change
Expand Up @@ -1678,6 +1678,21 @@ foreach t = [I32RT, I64RT] in {
}
}

//
// szext
//

foreach sign = ["s", "u"] in {
foreach mode = ["wrap", "clamp"] in {
defvar ext = !if(!eq(sign, "s"), "sext", "zext");
defvar intrin = !cast<Intrinsic>("int_nvvm_" # ext # "_" # mode);
defm SZEXT_ # sign # _ # mode
: I3Inst<"szext." # mode # "." # sign # "32",
intrin, I32RT, commutative = false,
requires = [hasSM<70>, hasPTX<76>]>;
}
}

//
// Convert
//
Expand Down
92 changes: 46 additions & 46 deletions llvm/test/CodeGen/NVPTX/i128.ll
Original file line number Diff line number Diff line change
Expand Up @@ -13,37 +13,37 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) {
; CHECK-NEXT: ld.param.v2.u64 {%rd45, %rd46}, [srem_i128_param_0];
; CHECK-NEXT: ld.param.v2.u64 {%rd49, %rd50}, [srem_i128_param_1];
; CHECK-NEXT: shr.s64 %rd2, %rd46, 63;
; CHECK-NEXT: mov.b64 %rd117, 0;
; CHECK-NEXT: sub.cc.s64 %rd52, %rd117, %rd45;
; CHECK-NEXT: subc.cc.s64 %rd53, %rd117, %rd46;
; CHECK-NEXT: sub.cc.s64 %rd51, 0, %rd45;
; CHECK-NEXT: subc.cc.s64 %rd52, 0, %rd46;
; CHECK-NEXT: setp.lt.s64 %p1, %rd46, 0;
; CHECK-NEXT: selp.b64 %rd4, %rd53, %rd46, %p1;
; CHECK-NEXT: selp.b64 %rd3, %rd52, %rd45, %p1;
; CHECK-NEXT: sub.cc.s64 %rd54, %rd117, %rd49;
; CHECK-NEXT: subc.cc.s64 %rd55, %rd117, %rd50;
; CHECK-NEXT: selp.b64 %rd4, %rd52, %rd46, %p1;
; CHECK-NEXT: selp.b64 %rd3, %rd51, %rd45, %p1;
; CHECK-NEXT: sub.cc.s64 %rd53, 0, %rd49;
; CHECK-NEXT: subc.cc.s64 %rd54, 0, %rd50;
; CHECK-NEXT: setp.lt.s64 %p2, %rd50, 0;
; CHECK-NEXT: selp.b64 %rd6, %rd55, %rd50, %p2;
; CHECK-NEXT: selp.b64 %rd5, %rd54, %rd49, %p2;
; CHECK-NEXT: or.b64 %rd56, %rd5, %rd6;
; CHECK-NEXT: setp.eq.s64 %p3, %rd56, 0;
; CHECK-NEXT: or.b64 %rd57, %rd3, %rd4;
; CHECK-NEXT: setp.eq.s64 %p4, %rd57, 0;
; CHECK-NEXT: selp.b64 %rd6, %rd54, %rd50, %p2;
; CHECK-NEXT: selp.b64 %rd5, %rd53, %rd49, %p2;
; CHECK-NEXT: or.b64 %rd55, %rd5, %rd6;
; CHECK-NEXT: setp.eq.s64 %p3, %rd55, 0;
; CHECK-NEXT: or.b64 %rd56, %rd3, %rd4;
; CHECK-NEXT: setp.eq.s64 %p4, %rd56, 0;
; CHECK-NEXT: or.pred %p5, %p3, %p4;
; CHECK-NEXT: setp.ne.s64 %p6, %rd6, 0;
; CHECK-NEXT: clz.b64 %r1, %rd6;
; CHECK-NEXT: cvt.u64.u32 %rd58, %r1;
; CHECK-NEXT: cvt.u64.u32 %rd57, %r1;
; CHECK-NEXT: clz.b64 %r2, %rd5;
; CHECK-NEXT: cvt.u64.u32 %rd59, %r2;
; CHECK-NEXT: add.s64 %rd60, %rd59, 64;
; CHECK-NEXT: selp.b64 %rd61, %rd58, %rd60, %p6;
; CHECK-NEXT: cvt.u64.u32 %rd58, %r2;
; CHECK-NEXT: add.s64 %rd59, %rd58, 64;
; CHECK-NEXT: selp.b64 %rd60, %rd57, %rd59, %p6;
; CHECK-NEXT: setp.ne.s64 %p7, %rd4, 0;
; CHECK-NEXT: clz.b64 %r3, %rd4;
; CHECK-NEXT: cvt.u64.u32 %rd62, %r3;
; CHECK-NEXT: cvt.u64.u32 %rd61, %r3;
; CHECK-NEXT: clz.b64 %r4, %rd3;
; CHECK-NEXT: cvt.u64.u32 %rd63, %r4;
; CHECK-NEXT: add.s64 %rd64, %rd63, 64;
; CHECK-NEXT: selp.b64 %rd65, %rd62, %rd64, %p7;
; CHECK-NEXT: sub.cc.s64 %rd66, %rd61, %rd65;
; CHECK-NEXT: cvt.u64.u32 %rd62, %r4;
; CHECK-NEXT: add.s64 %rd63, %rd62, 64;
; CHECK-NEXT: selp.b64 %rd64, %rd61, %rd63, %p7;
; CHECK-NEXT: mov.b64 %rd117, 0;
; CHECK-NEXT: sub.cc.s64 %rd66, %rd60, %rd64;
; CHECK-NEXT: subc.cc.s64 %rd67, %rd117, 0;
; CHECK-NEXT: setp.gt.u64 %p8, %rd66, 127;
; CHECK-NEXT: setp.eq.s64 %p9, %rd67, 0;
Expand Down Expand Up @@ -314,39 +314,39 @@ define i128 @sdiv_i128(i128 %lhs, i128 %rhs) {
; CHECK-NEXT: // %bb.0: // %_udiv-special-cases
; CHECK-NEXT: ld.param.v2.u64 {%rd45, %rd46}, [sdiv_i128_param_0];
; CHECK-NEXT: ld.param.v2.u64 {%rd49, %rd50}, [sdiv_i128_param_1];
; CHECK-NEXT: mov.b64 %rd112, 0;
; CHECK-NEXT: sub.cc.s64 %rd52, %rd112, %rd45;
; CHECK-NEXT: subc.cc.s64 %rd53, %rd112, %rd46;
; CHECK-NEXT: sub.cc.s64 %rd51, 0, %rd45;
; CHECK-NEXT: subc.cc.s64 %rd52, 0, %rd46;
; CHECK-NEXT: setp.lt.s64 %p1, %rd46, 0;
; CHECK-NEXT: selp.b64 %rd2, %rd53, %rd46, %p1;
; CHECK-NEXT: selp.b64 %rd1, %rd52, %rd45, %p1;
; CHECK-NEXT: sub.cc.s64 %rd54, %rd112, %rd49;
; CHECK-NEXT: subc.cc.s64 %rd55, %rd112, %rd50;
; CHECK-NEXT: selp.b64 %rd2, %rd52, %rd46, %p1;
; CHECK-NEXT: selp.b64 %rd1, %rd51, %rd45, %p1;
; CHECK-NEXT: sub.cc.s64 %rd53, 0, %rd49;
; CHECK-NEXT: subc.cc.s64 %rd54, 0, %rd50;
; CHECK-NEXT: setp.lt.s64 %p2, %rd50, 0;
; CHECK-NEXT: selp.b64 %rd4, %rd55, %rd50, %p2;
; CHECK-NEXT: selp.b64 %rd3, %rd54, %rd49, %p2;
; CHECK-NEXT: xor.b64 %rd56, %rd50, %rd46;
; CHECK-NEXT: shr.s64 %rd5, %rd56, 63;
; CHECK-NEXT: or.b64 %rd57, %rd3, %rd4;
; CHECK-NEXT: setp.eq.s64 %p3, %rd57, 0;
; CHECK-NEXT: or.b64 %rd58, %rd1, %rd2;
; CHECK-NEXT: setp.eq.s64 %p4, %rd58, 0;
; CHECK-NEXT: selp.b64 %rd4, %rd54, %rd50, %p2;
; CHECK-NEXT: selp.b64 %rd3, %rd53, %rd49, %p2;
; CHECK-NEXT: xor.b64 %rd55, %rd50, %rd46;
; CHECK-NEXT: shr.s64 %rd5, %rd55, 63;
; CHECK-NEXT: or.b64 %rd56, %rd3, %rd4;
; CHECK-NEXT: setp.eq.s64 %p3, %rd56, 0;
; CHECK-NEXT: or.b64 %rd57, %rd1, %rd2;
; CHECK-NEXT: setp.eq.s64 %p4, %rd57, 0;
; CHECK-NEXT: or.pred %p5, %p3, %p4;
; CHECK-NEXT: setp.ne.s64 %p6, %rd4, 0;
; CHECK-NEXT: clz.b64 %r1, %rd4;
; CHECK-NEXT: cvt.u64.u32 %rd59, %r1;
; CHECK-NEXT: cvt.u64.u32 %rd58, %r1;
; CHECK-NEXT: clz.b64 %r2, %rd3;
; CHECK-NEXT: cvt.u64.u32 %rd60, %r2;
; CHECK-NEXT: add.s64 %rd61, %rd60, 64;
; CHECK-NEXT: selp.b64 %rd62, %rd59, %rd61, %p6;
; CHECK-NEXT: cvt.u64.u32 %rd59, %r2;
; CHECK-NEXT: add.s64 %rd60, %rd59, 64;
; CHECK-NEXT: selp.b64 %rd61, %rd58, %rd60, %p6;
; CHECK-NEXT: setp.ne.s64 %p7, %rd2, 0;
; CHECK-NEXT: clz.b64 %r3, %rd2;
; CHECK-NEXT: cvt.u64.u32 %rd63, %r3;
; CHECK-NEXT: cvt.u64.u32 %rd62, %r3;
; CHECK-NEXT: clz.b64 %r4, %rd1;
; CHECK-NEXT: cvt.u64.u32 %rd64, %r4;
; CHECK-NEXT: add.s64 %rd65, %rd64, 64;
; CHECK-NEXT: selp.b64 %rd66, %rd63, %rd65, %p7;
; CHECK-NEXT: sub.cc.s64 %rd67, %rd62, %rd66;
; CHECK-NEXT: cvt.u64.u32 %rd63, %r4;
; CHECK-NEXT: add.s64 %rd64, %rd63, 64;
; CHECK-NEXT: selp.b64 %rd65, %rd62, %rd64, %p7;
; CHECK-NEXT: mov.b64 %rd112, 0;
; CHECK-NEXT: sub.cc.s64 %rd67, %rd61, %rd65;
; CHECK-NEXT: subc.cc.s64 %rd68, %rd112, 0;
; CHECK-NEXT: setp.gt.u64 %p8, %rd67, 127;
; CHECK-NEXT: setp.eq.s64 %p9, %rd68, 0;
Expand Down
Loading
Loading