Skip to content

Commit 888e284

Browse files
authored
[NVPTX] Use PTX prmt for llvm.bswap (#85545)
1 parent 353fbeb commit 888e284

File tree

3 files changed

+96
-3
lines changed

3 files changed

+96
-3
lines changed

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -580,9 +580,6 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
580580
setOperationAction(ISD::ROTL, MVT::i8, Expand);
581581
setOperationAction(ISD::ROTR, MVT::i8, Expand);
582582
setOperationAction(ISD::BSWAP, MVT::i16, Expand);
583-
setOperationAction(ISD::BSWAP, MVT::v2i16, Expand);
584-
setOperationAction(ISD::BSWAP, MVT::i32, Expand);
585-
setOperationAction(ISD::BSWAP, MVT::i64, Expand);
586583

587584
// Indirect branch is not supported.
588585
// This also disables Jump Table creation.

llvm/lib/Target/NVPTX/NVPTXInstrInfo.td

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3549,6 +3549,11 @@ let hasSideEffects = false in {
35493549
(ins Int64Regs:$s),
35503550
"{{ .reg .b32 tmp; mov.b64 {tmp, $high}, $s; }}",
35513551
[]>;
3552+
// Unpacks the 64-bit register $s with a vector mov.b64 into {$low, tmp} and
// returns only the first element ($low); the second element lands in the
// block-local scratch register tmp and is discarded.
// The selection pattern list is empty, so this instruction is only emitted
// when a Pat names it explicitly.
def I64toI32L : NVPTXInst<(outs Int32Regs:$low),
3553+
(ins Int64Regs:$s),
3554+
"{{ .reg .b32 tmp; mov.b64 {$low, tmp}, $s; }}",
3555+
[]>;
3556+
35523557
}
35533558

35543559
// Using partial vectorized move produces better SASS code for extraction of
@@ -3838,3 +3843,17 @@ include "NVPTXIntrinsics.td"
38383843
// - for sm_20, use prmt (use vector scalar mov to get the pack and
38393844
// unpack). sm_20 supports native 32-bit register, but not native 16-bit
38403845
// register.
3846+
3847+
// Select a 32-bit bswap as a single byte-permute instruction.
// The second source operand is the constant 0 and the selector is 0x0123
// (printed as decimal 291 in the emitted PTX).
// NOTE(review): per the PTX prmt default mode, selector nibbles 3,2,1,0 should
// pick source bytes in reversed order -- confirm against the PTX ISA.
def : Pat <
3848+
(i32 (bswap i32:$a)),
3849+
(INT_NVVM_PRMT Int32Regs:$a, (i32 0), (i32 0x0123))>;
3850+
3851+
// Select a v2i16 bswap as a single byte-permute on the 32-bit register that
// holds both lanes. The second source operand is the constant 0 and the
// selector is 0x2301 (printed as decimal 8961 in the emitted PTX).
// NOTE(review): the selector presumably swaps the two bytes inside each 16-bit
// lane while leaving the lanes in place -- confirm against the PTX ISA.
def : Pat <
3852+
(v2i16 (bswap v2i16:$a)),
3853+
(INT_NVVM_PRMT Int32Regs:$a, (i32 0), (i32 0x2301))>;
3854+
3855+
// Select a 64-bit bswap by splitting the input into two 32-bit halves,
// applying the same 0x0123 byte-permute used by the i32 pattern to each half,
// and repacking them with V2I32toI64.
// The permuted I64toI32H result is passed as the first operand and the
// permuted I64toI32L result as the second, so the two halves trade places in
// the repacked value.
def : Pat <
3856+
(i64 (bswap i64:$a)),
3857+
(V2I32toI64
3858+
(INT_NVVM_PRMT (I64toI32H Int64Regs:$a), (i32 0), (i32 0x0123)),
3859+
(INT_NVVM_PRMT (I64toI32L Int64Regs:$a), (i32 0), (i32 0x0123)))>;

llvm/test/CodeGen/NVPTX/bswap.ll

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2+
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
3+
; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
4+
5+
target triple = "nvptx64-nvidia-cuda"
6+
7+
; 16-bit byte swap. The expected PTX below contains no prmt instruction: the
; swap is emitted as a shift-right by 8, a shift-left by 8, and an or, followed
; by a widening cvt for the 32-bit return slot.
define i16 @bswap16(i16 %a) {
8+
; CHECK-LABEL: bswap16(
9+
; CHECK: {
10+
; CHECK-NEXT: .reg .b16 %rs<5>;
11+
; CHECK-NEXT: .reg .b32 %r<2>;
12+
; CHECK-EMPTY:
13+
; CHECK-NEXT: // %bb.0:
14+
; CHECK-NEXT: ld.param.u16 %rs1, [bswap16_param_0];
15+
; CHECK-NEXT: shr.u16 %rs2, %rs1, 8;
16+
; CHECK-NEXT: shl.b16 %rs3, %rs1, 8;
17+
; CHECK-NEXT: or.b16 %rs4, %rs3, %rs2;
18+
; CHECK-NEXT: cvt.u32.u16 %r1, %rs4;
19+
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
20+
; CHECK-NEXT: ret;
21+
%b = tail call i16 @llvm.bswap.i16(i16 %a)
22+
ret i16 %b
23+
}
24+
25+
26+
; 32-bit byte swap. The expected PTX is a single prmt.b32 whose second source
; is 0 and whose selector is 291 (0x0123 in hexadecimal).
define i32 @bswap32(i32 %a) {
27+
; CHECK-LABEL: bswap32(
28+
; CHECK: {
29+
; CHECK-NEXT: .reg .b32 %r<3>;
30+
; CHECK-EMPTY:
31+
; CHECK-NEXT: // %bb.0:
32+
; CHECK-NEXT: ld.param.u32 %r1, [bswap32_param_0];
33+
; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 291;
34+
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
35+
; CHECK-NEXT: ret;
36+
%b = tail call i32 @llvm.bswap.i32(i32 %a)
37+
ret i32 %b
38+
}
39+
40+
41+
; Byte swap of each lane of a <2 x i16> vector. The vector is loaded as one
; 32-bit value and the expected PTX is a single prmt.b32 whose second source is
; 0 and whose selector is 8961 (0x2301 in hexadecimal).
; NOTE(review): the function references attribute group #0, but no
; "attributes #0 = { ... }" definition appears in this file -- confirm whether
; the reference is intentional or can be dropped.
define <2 x i16> @bswapv2i16(<2 x i16> %a) #0 {
42+
; CHECK-LABEL: bswapv2i16(
43+
; CHECK: {
44+
; CHECK-NEXT: .reg .b32 %r<3>;
45+
; CHECK-EMPTY:
46+
; CHECK-NEXT: // %bb.0:
47+
; CHECK-NEXT: ld.param.u32 %r1, [bswapv2i16_param_0];
48+
; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 8961;
49+
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
50+
; CHECK-NEXT: ret;
51+
%b = tail call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %a)
52+
ret <2 x i16> %b
53+
}
54+
55+
; 64-bit byte swap. The expected PTX unpacks the input twice with mov.b64
; (once keeping the first element, once keeping the second), applies
; prmt.b32 with selector 291 (0x0123) to each 32-bit piece, and repacks the
; two results in the opposite order ({%r4, %r2}) so the halves are exchanged.
define i64 @bswap64(i64 %a) {
56+
; CHECK-LABEL: bswap64(
57+
; CHECK: {
58+
; CHECK-NEXT: .reg .b32 %r<5>;
59+
; CHECK-NEXT: .reg .b64 %rd<3>;
60+
; CHECK-EMPTY:
61+
; CHECK-NEXT: // %bb.0:
62+
; CHECK-NEXT: ld.param.u64 %rd1, [bswap64_param_0];
63+
; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {%r1, tmp}, %rd1; }
64+
; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 291;
65+
; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r3}, %rd1; }
66+
; CHECK-NEXT: prmt.b32 %r4, %r3, 0, 291;
67+
; CHECK-NEXT: mov.b64 %rd2, {%r4, %r2};
68+
; CHECK-NEXT: st.param.b64 [func_retval0+0], %rd2;
69+
; CHECK-NEXT: ret;
70+
%b = tail call i64 @llvm.bswap.i64(i64 %a)
71+
ret i64 %b
72+
}
73+
74+
declare i16 @llvm.bswap.i16(i16)
75+
declare i32 @llvm.bswap.i32(i32)
76+
declare <2 x i16> @llvm.bswap.v2i16(<2 x i16>)
77+
declare i64 @llvm.bswap.i64(i64)

0 commit comments

Comments
 (0)