Skip to content

Commit 832e09b

Browse files
[NVPTX] Add support for efficient rotate instructions on SM 3.2+
llvm-svn: 211934
1 parent 7be57de commit 832e09b

File tree

4 files changed

+247
-4
lines changed

4 files changed

+247
-4
lines changed

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1948,6 +1948,25 @@ def int_nvvm_sust_p_3d_v4i32_trap
19481948
"llvm.nvvm.sust.p.3d.v4i32.trap">,
19491949
GCCBuiltin<"__nvvm_sust_p_3d_v4i32_trap">;
19501950

1951+
def int_nvvm_rotate_b32
1952+
: Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
1953+
[IntrNoMem], "llvm.nvvm.rotate.b32">,
1954+
GCCBuiltin<"__nvvm_rotate_b32">;
1955+
1956+
def int_nvvm_rotate_b64
1957+
:Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty],
1958+
[IntrNoMem], "llvm.nvvm.rotate.b64">,
1959+
GCCBuiltin<"__nvvm_rotate_b64">;
1960+
1961+
def int_nvvm_rotate_right_b64
1962+
: Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty],
1963+
[IntrNoMem], "llvm.nvvm.rotate.right.b64">,
1964+
GCCBuiltin<"__nvvm_rotate_right_b64">;
1965+
1966+
def int_nvvm_swap_lo_hi_b64
1967+
: Intrinsic<[llvm_i64_ty], [llvm_i64_ty],
1968+
[IntrNoMem], "llvm.nvvm.swap.lo.hi.b64">,
1969+
GCCBuiltin<"__nvvm_swap_lo_hi_b64">;
19511970

19521971

19531972
// Old PTX back-end intrinsics retained here for backwards-compatibility

llvm/lib/Target/NVPTX/NVPTXInstrInfo.td

Lines changed: 46 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,7 @@ def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">;
158158
def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">;
159159

160160
def hasHWROT32 : Predicate<"Subtarget.hasHWROT32()">;
161+
def noHWROT32 : Predicate<"!Subtarget.hasHWROT32()">;
161162

162163
def true : Predicate<"1">;
163164

@@ -1085,6 +1086,43 @@ multiclass RSHIFT_FORMAT<string OpcStr, SDNode OpNode> {
10851086
defm SRA : RSHIFT_FORMAT<"shr.s", sra>;
10861087
defm SRL : RSHIFT_FORMAT<"shr.u", srl>;
10871088

1089+
//
1090+
// Rotate: use the PTX shf (funnel shift) instruction if available.
1091+
//
1092+
1093+
// 32 bit r2 = rotl r1, n
1094+
// =>
1095+
// r2 = shf.l r1, r1, n
1096+
def ROTL32imm_hw : NVPTXInst<(outs Int32Regs:$dst),
1097+
(ins Int32Regs:$src, i32imm:$amt),
1098+
"shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
1099+
[(set Int32Regs:$dst, (rotl Int32Regs:$src, (i32 imm:$amt)))]>,
1100+
Requires<[hasHWROT32]> ;
1101+
1102+
def ROTL32reg_hw : NVPTXInst<(outs Int32Regs:$dst),
1103+
(ins Int32Regs:$src, Int32Regs:$amt),
1104+
"shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
1105+
[(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>,
1106+
Requires<[hasHWROT32]>;
1107+
1108+
// 32 bit r2 = rotr r1, n
1109+
// =>
1110+
// r2 = shf.r r1, r1, n
1111+
def ROTR32imm_hw : NVPTXInst<(outs Int32Regs:$dst),
1112+
(ins Int32Regs:$src, i32imm:$amt),
1113+
"shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
1114+
[(set Int32Regs:$dst, (rotr Int32Regs:$src, (i32 imm:$amt)))]>,
1115+
Requires<[hasHWROT32]>;
1116+
1117+
def ROTR32reg_hw : NVPTXInst<(outs Int32Regs:$dst),
1118+
(ins Int32Regs:$src, Int32Regs:$amt),
1119+
"shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
1120+
[(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>,
1121+
Requires<[hasHWROT32]>;
1122+
1123+
//
1124+
// Rotate: if the PTX shf instruction is not available, fall back to shift+add.
1125+
//
10881126
// 32bit
10891127
def ROT32imm_sw : NVPTXInst<(outs Int32Regs:$dst),
10901128
(ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2),
@@ -1102,9 +1140,11 @@ def SUB_FRM_32 : SDNodeXForm<imm, [{
11021140
}]>;
11031141

11041142
def : Pat<(rotl Int32Regs:$src, (i32 imm:$amt)),
1105-
(ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>;
1143+
(ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>,
1144+
Requires<[noHWROT32]>;
11061145
def : Pat<(rotr Int32Regs:$src, (i32 imm:$amt)),
1107-
(ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>;
1146+
(ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>,
1147+
Requires<[noHWROT32]>;
11081148

11091149
def ROTL32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src,
11101150
Int32Regs:$amt),
@@ -1117,7 +1157,8 @@ def ROTL32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src,
11171157
!strconcat("shr.b32 \t%rhs, $src, %amt2;\n\t",
11181158
!strconcat("add.u32 \t$dst, %lhs, %rhs;\n\t",
11191159
!strconcat("}}", ""))))))))),
1120-
[(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>;
1160+
[(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>,
1161+
Requires<[noHWROT32]>;
11211162

11221163
def ROTR32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src,
11231164
Int32Regs:$amt),
@@ -1130,7 +1171,8 @@ def ROTR32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src,
11301171
!strconcat("shl.b32 \t%rhs, $src, %amt2;\n\t",
11311172
!strconcat("add.u32 \t$dst, %lhs, %rhs;\n\t",
11321173
!strconcat("}}", ""))))))))),
1133-
[(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>;
1174+
[(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>,
1175+
Requires<[noHWROT32]>;
11341176

11351177
// 64bit
11361178
def ROT64imm_sw : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src,

llvm/lib/Target/NVPTX/NVPTXIntrinsics.td

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1864,6 +1864,130 @@ def : Pat<(int_nvvm_read_ptx_sreg_envreg30), (MOV_SPECIAL ENVREG30)>;
18641864
def : Pat<(int_nvvm_read_ptx_sreg_envreg31), (MOV_SPECIAL ENVREG31)>;
18651865

18661866

1867+
// rotate builtin support
1868+
1869+
def ROTATE_B32_HW_IMM
1870+
: NVPTXInst<(outs Int32Regs:$dst),
1871+
(ins Int32Regs:$src, i32imm:$amt),
1872+
"shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
1873+
[(set Int32Regs:$dst,
1874+
(int_nvvm_rotate_b32 Int32Regs:$src, (i32 imm:$amt)))]>,
1875+
Requires<[hasHWROT32]> ;
1876+
1877+
def ROTATE_B32_HW_REG
1878+
: NVPTXInst<(outs Int32Regs:$dst),
1879+
(ins Int32Regs:$src, Int32Regs:$amt),
1880+
"shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
1881+
[(set Int32Regs:$dst,
1882+
(int_nvvm_rotate_b32 Int32Regs:$src, Int32Regs:$amt))]>,
1883+
Requires<[hasHWROT32]> ;
1884+
1885+
def : Pat<(int_nvvm_rotate_b32 Int32Regs:$src, (i32 imm:$amt)),
1886+
(ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>,
1887+
Requires<[noHWROT32]> ;
1888+
1889+
def : Pat<(int_nvvm_rotate_b32 Int32Regs:$src, Int32Regs:$amt),
1890+
(ROTL32reg_sw Int32Regs:$src, Int32Regs:$amt)>,
1891+
Requires<[noHWROT32]> ;
1892+
1893+
def GET_LO_INT64 // Extracts the low 32-bit half of a 64-bit register into $dst.
1894+
: NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src),
1895+
!strconcat("{{\n\t", // open a local scope so %dummy does not leak
1896+
!strconcat(".reg .b32 %dummy;\n\t", // scratch register for the half that is discarded
1897+
!strconcat("mov.b64 \t{$dst,%dummy}, $src;\n\t", // unpack: first element -> $dst (same element order as PACK_TWO_INT32's {$lo, $hi})
1898+
!strconcat("}}", "")))),
1899+
[]> ; // empty pattern list: never selected directly, only named from explicit Pat<> rules
1900+
1901+
def GET_HI_INT64 // Extracts the high 32-bit half of a 64-bit register into $dst.
1902+
: NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src),
1903+
!strconcat("{{\n\t", // open a local scope so %dummy does not leak
1904+
!strconcat(".reg .b32 %dummy;\n\t", // scratch register for the half that is discarded
1905+
!strconcat("mov.b64 \t{%dummy,$dst}, $src;\n\t", // unpack: second element -> $dst (same element order as PACK_TWO_INT32's {$lo, $hi})
1906+
!strconcat("}}", "")))),
1907+
[]> ; // empty pattern list: never selected directly, only named from explicit Pat<> rules
1908+
1909+
def PACK_TWO_INT32 // Builds a 64-bit register from two 32-bit halves; inverse of GET_LO_INT64 / GET_HI_INT64.
1910+
: NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$lo, Int32Regs:$hi), // operand order is (lo, hi)
1911+
"mov.b64 \t$dst, {{$lo, $hi}};", []> ; // empty pattern list: only used from explicit Pat<> rules
1912+
1913+
def : Pat<(int_nvvm_swap_lo_hi_b64 Int64Regs:$src), // swap the two 32-bit halves of a 64-bit value
1914+
(PACK_TWO_INT32 (GET_HI_INT64 Int64Regs:$src), // old high half becomes the new $lo
1915+
(GET_LO_INT64 Int64Regs:$src))> ; // old low half becomes the new $hi; no Requires<>, so not gated on hasHWROT32
1916+
1917+
// funnel shift, requires >= sm_32
1918+
def SHF_L_WRAP_B32_IMM
1919+
: NVPTXInst<(outs Int32Regs:$dst),
1920+
(ins Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt),
1921+
"shf.l.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
1922+
Requires<[hasHWROT32]>;
1923+
1924+
def SHF_L_WRAP_B32_REG
1925+
: NVPTXInst<(outs Int32Regs:$dst),
1926+
(ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
1927+
"shf.l.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
1928+
Requires<[hasHWROT32]>;
1929+
1930+
def SHF_R_WRAP_B32_IMM
1931+
: NVPTXInst<(outs Int32Regs:$dst),
1932+
(ins Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt),
1933+
"shf.r.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
1934+
Requires<[hasHWROT32]>;
1935+
1936+
def SHF_R_WRAP_B32_REG
1937+
: NVPTXInst<(outs Int32Regs:$dst),
1938+
(ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
1939+
"shf.r.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
1940+
Requires<[hasHWROT32]>;
1941+
1942+
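// HW version of rotate 64, built from two 32-bit funnel shifts. NOTE(review): shf.*.wrap appears to use the amount modulo 32, so these patterns look correct only for amt < 32 -- confirm against the PTX ISA.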
1943+
def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, (i32 imm:$amt)),
1944+
(PACK_TWO_INT32
1945+
(SHF_L_WRAP_B32_IMM (GET_HI_INT64 Int64Regs:$src),
1946+
(GET_LO_INT64 Int64Regs:$src), imm:$amt),
1947+
(SHF_L_WRAP_B32_IMM (GET_LO_INT64 Int64Regs:$src),
1948+
(GET_HI_INT64 Int64Regs:$src), imm:$amt))>,
1949+
Requires<[hasHWROT32]>;
1950+
1951+
def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, Int32Regs:$amt),
1952+
(PACK_TWO_INT32
1953+
(SHF_L_WRAP_B32_REG (GET_HI_INT64 Int64Regs:$src),
1954+
(GET_LO_INT64 Int64Regs:$src), Int32Regs:$amt),
1955+
(SHF_L_WRAP_B32_REG (GET_LO_INT64 Int64Regs:$src),
1956+
(GET_HI_INT64 Int64Regs:$src), Int32Regs:$amt))>,
1957+
Requires<[hasHWROT32]>;
1958+
1959+
1960+
def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, (i32 imm:$amt)),
1961+
(PACK_TWO_INT32
1962+
(SHF_R_WRAP_B32_IMM (GET_LO_INT64 Int64Regs:$src),
1963+
(GET_HI_INT64 Int64Regs:$src), imm:$amt),
1964+
(SHF_R_WRAP_B32_IMM (GET_HI_INT64 Int64Regs:$src),
1965+
(GET_LO_INT64 Int64Regs:$src), imm:$amt))>,
1966+
Requires<[hasHWROT32]>;
1967+
1968+
def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, Int32Regs:$amt),
1969+
(PACK_TWO_INT32
1970+
(SHF_R_WRAP_B32_REG (GET_LO_INT64 Int64Regs:$src),
1971+
(GET_HI_INT64 Int64Regs:$src), Int32Regs:$amt),
1972+
(SHF_R_WRAP_B32_REG (GET_HI_INT64 Int64Regs:$src),
1973+
(GET_LO_INT64 Int64Regs:$src), Int32Regs:$amt))>,
1974+
Requires<[hasHWROT32]>;
1975+
1976+
// SW version of rotate 64
1977+
def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, (i32 imm:$amt)), // SW 64-bit rotate-left by an immediate
1978+
(ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>, // shl by amt, shr by (64 - amt): the complement must be taken from 64, not 32, for a 64-bit rotate
1979+
Requires<[noHWROT32]>; // only when the target has no hardware shf instruction
1980+
def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, Int32Regs:$amt),
1981+
(ROTL64reg_sw Int64Regs:$src, Int32Regs:$amt)>,
1982+
Requires<[noHWROT32]>;
1983+
def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, (i32 imm:$amt)),
1984+
(ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>,
1985+
Requires<[noHWROT32]>;
1986+
def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, Int32Regs:$amt),
1987+
(ROTR64reg_sw Int64Regs:$src, Int32Regs:$amt)>,
1988+
Requires<[noHWROT32]>;
1989+
1990+
18671991
//-----------------------------------
18681992
// Texture Intrinsics
18691993
//-----------------------------------

llvm/test/CodeGen/NVPTX/rotate.ll

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck --check-prefix=SM20 %s
2+
; RUN: llc < %s -march=nvptx -mcpu=sm_35 | FileCheck --check-prefix=SM35 %s
3+
4+
5+
declare i32 @llvm.nvvm.rotate.b32(i32, i32)
6+
declare i64 @llvm.nvvm.rotate.b64(i64, i32)
7+
declare i64 @llvm.nvvm.rotate.right.b64(i64, i32)
8+
9+
; SM20: rotate32
10+
; SM35: rotate32
11+
define i32 @rotate32(i32 %a, i32 %b) {
12+
; SM20: shl.b32
13+
; SM20: sub.s32
14+
; SM20: shr.b32
15+
; SM20: add.u32
16+
; SM35: shf.l.wrap.b32
17+
%val = tail call i32 @llvm.nvvm.rotate.b32(i32 %a, i32 %b)
18+
ret i32 %val
19+
}
20+
21+
; SM20: rotate64
22+
; SM35: rotate64
23+
define i64 @rotate64(i64 %a, i32 %b) {
24+
; SM20: shl.b64
25+
; SM20: sub.u32
26+
; SM20: shr.b64
27+
; SM20: add.u64
28+
; SM35: shf.l.wrap.b32
29+
; SM35: shf.l.wrap.b32
30+
%val = tail call i64 @llvm.nvvm.rotate.b64(i64 %a, i32 %b)
31+
ret i64 %val
32+
}
33+
34+
; SM20: rotateright64
35+
; SM35: rotateright64
36+
define i64 @rotateright64(i64 %a, i32 %b) {
37+
; SM20: shr.b64
38+
; SM20: sub.u32
39+
; SM20: shl.b64
40+
; SM20: add.u64
41+
; SM35: shf.r.wrap.b32
42+
; SM35: shf.r.wrap.b32
43+
%val = tail call i64 @llvm.nvvm.rotate.right.b64(i64 %a, i32 %b)
44+
ret i64 %val
45+
}
46+
47+
; SM20: rotl0
48+
; SM35: rotl0
49+
define i32 @rotl0(i32 %x) { ; rotate written as plain shl/lshr/or, without calling an nvvm rotate intrinsic
50+
; SM20: shl.b32
51+
; SM20: shr.b32
52+
; SM20: add.u32
53+
; SM35: shf.l.wrap.b32
54+
%t0 = shl i32 %x, 8 ; low 24 bits moved up by 8
55+
%t1 = lshr i32 %x, 24 ; top 8 bits moved down to the bottom
56+
%t2 = or i32 %t0, %t1 ; 8 + 24 == 32, so this is a rotate-left by 8; presumably exercises the generic rotl patterns -- confirm
57+
ret i32 %t2
58+
}

0 commit comments

Comments
 (0)