Skip to content

Commit 7045966

Browse files
committed
[NVPTX] Lower extraction of upper half of i32/i64 as partial move.
This produces better SASS than right-shift + truncate and is fairly common for CUDA code that operates on __half2 values represented as opaque integer. Differential Revision: https://reviews.llvm.org/D143448
1 parent 62c7475 commit 7045966

File tree

4 files changed

+114
-8
lines changed

4 files changed

+114
-8
lines changed

llvm/lib/Target/NVPTX/NVPTXInstrInfo.td

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3063,8 +3063,27 @@ let hasSideEffects = false in {
30633063
(ins Float64Regs:$s),
30643064
"mov.b64 \t{{$d1, $d2}}, $s;", []>;
30653065

3066+
def I32toI16H : NVPTXInst<(outs Int16Regs:$high),
3067+
(ins Int32Regs:$s),
3068+
"{{ .reg .b16 tmp; mov.b32 {tmp, $high}, $s; }}",
3069+
[]>;
3070+
def I64toI32H : NVPTXInst<(outs Int32Regs:$high),
3071+
(ins Int64Regs:$s),
3072+
"{{ .reg .b32 tmp; mov.b64 {tmp, $high}, $s; }}",
3073+
[]>;
30663074
}
30673075

3076+
// Using partial vectorized move produces better SASS code for extraction of
3077+
// upper/lower parts of an integer.
3078+
def : Pat<(i16 (trunc (srl Int32Regs:$s, (i32 16)))),
3079+
(I32toI16H Int32Regs:$s)>;
3080+
def : Pat<(i16 (trunc (sra Int32Regs:$s, (i32 16)))),
3081+
(I32toI16H Int32Regs:$s)>;
3082+
def : Pat<(i32 (trunc (srl Int64Regs:$s, (i32 32)))),
3083+
(I64toI32H Int64Regs:$s)>;
3084+
def : Pat<(i32 (trunc (sra Int64Regs:$s, (i32 32)))),
3085+
(I64toI32H Int64Regs:$s)>;
3086+
30683087
let hasSideEffects = false in {
30693088
// Extract element of f16x2 register. PTX does not provide any way
30703089
// to access elements of f16x2 vector directly, so we need to

llvm/test/CodeGen/NVPTX/f16-instructions.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1032,8 +1032,7 @@ define half @test_copysign(half %a, half %b) #0 {
10321032
; CHECK-DAG: mov.b32 [[B:%r[0-9]+]], [[BF]];
10331033
; CHECK-DAG: and.b16 [[AX:%rs[0-9]+]], [[A]], 32767;
10341034
; CHECK-DAG: and.b32 [[BX0:%r[0-9]+]], [[B]], -2147483648;
1035-
; CHECK-DAG: shr.u32 [[BX1:%r[0-9]+]], [[BX0]], 16;
1036-
; CHECK-DAG: cvt.u16.u32 [[BX2:%rs[0-9]+]], [[BX1]];
1035+
; CHECK-DAG: mov.b32 {tmp, [[BX2:%rs[0-9]+]]}, [[BX0]];
10371036
; CHECK: or.b16 [[RX:%rs[0-9]+]], [[AX]], [[BX2]];
10381037
; CHECK: mov.b16 [[R:%h[0-9]+]], [[RX]];
10391038
; CHECK: st.param.b16 [func_retval0+0], [[R]];

llvm/test/CodeGen/NVPTX/f16x2-instructions.ll

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -999,8 +999,7 @@ define <2 x double> @test_fpext_2xdouble(<2 x half> %a) #0 {
999999
; CHECK-LABEL: test_bitcast_2xhalf_to_2xi16(
10001000
; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_bitcast_2xhalf_to_2xi16_param_0];
10011001
; CHECK-DAG: cvt.u16.u32 [[R0:%rs[0-9]+]], [[A]]
1002-
; CHECK-DAG: shr.u32 [[AH:%r[0-9]+]], [[A]], 16
1003-
; CHECK-DAG: cvt.u16.u32 [[R1:%rs[0-9]+]], [[AH]]
1002+
; CHECK-DAG: mov.b32 {tmp, [[R1:%rs[0-9]+]]}, [[A]];
10041003
; CHECK: st.param.v2.b16 [func_retval0+0], {[[R0]], [[R1]]}
10051004
; CHECK: ret;
10061005
define <2 x i16> @test_bitcast_2xhalf_to_2xi16(<2 x half> %a) #0 {
@@ -1291,10 +1290,8 @@ define <2 x half> @test_copysign(<2 x half> %a, <2 x half> %b) #0 {
12911290
; CHECK-DAG: and.b16 [[AI1:%rs[0-9]+]], [[AS1]], 32767;
12921291
; CHECK-DAG: and.b32 [[BX0:%r[0-9]+]], [[BI0]], -2147483648;
12931292
; CHECK-DAG: and.b32 [[BX1:%r[0-9]+]], [[BI1]], -2147483648;
1294-
; CHECK-DAG: shr.u32 [[BY0:%r[0-9]+]], [[BX0]], 16;
1295-
; CHECK-DAG: shr.u32 [[BY1:%r[0-9]+]], [[BX1]], 16;
1296-
; CHECK-DAG: cvt.u16.u32 [[BZ0:%rs[0-9]+]], [[BY0]];
1297-
; CHECK-DAG: cvt.u16.u32 [[BZ1:%rs[0-9]+]], [[BY1]];
1293+
; CHECK-DAG: mov.b32 {tmp, [[BZ0:%rs[0-9]+]]}, [[BX0]]; }
1294+
; CHECK-DAG: mov.b32 {tmp, [[BZ1:%rs[0-9]+]]}, [[BX1]]; }
12981295
; CHECK-DAG: or.b16 [[RS0:%rs[0-9]+]], [[AI0]], [[BZ0]];
12991296
; CHECK-DAG: or.b16 [[RS1:%rs[0-9]+]], [[AI1]], [[BZ1]];
13001297
; CHECK-DAG: mov.b16 [[R0:%h[0-9]+]], [[RS0]];

llvm/test/CodeGen/NVPTX/idioms.ll

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@
55
; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}
66
; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
77

8+
%struct.S16 = type { i16, i16 }
9+
%struct.S32 = type { i32, i32 }
10+
811
; CHECK-LABEL: abs_i16(
912
define i16 @abs_i16(i16 %a) {
1013
; CHECK: abs.s16
@@ -31,3 +34,91 @@ define i64 @abs_i64(i64 %a) {
3134
%abs = select i1 %abs.cond, i64 %a, i64 %neg
3235
ret i64 %abs
3336
}
37+
38+
; CHECK-LABEL: i32_to_2xi16(
39+
define %struct.S16 @i32_to_2xi16(i32 noundef %in) {
40+
%low = trunc i32 %in to i16
41+
%high32 = lshr i32 %in, 16
42+
%high = trunc i32 %high32 to i16
43+
; CHECK: ld.param.u32 %[[R32:r[0-9]+]], [i32_to_2xi16_param_0];
44+
; CHECK-DAG: cvt.u16.u32 %rs{{[0-9+]}}, %[[R32]];
45+
; CHECK-DAG mov.b32 {tmp, %rs{{[0-9+]}}}, %[[R32]];
46+
%s1 = insertvalue %struct.S16 poison, i16 %low, 0
47+
%s = insertvalue %struct.S16 %s1, i16 %high, 1
48+
ret %struct.S16 %s
49+
}
50+
51+
; CHECK-LABEL: i32_to_2xi16_lh(
52+
; Same as above, but with rearranged order of low/high parts.
53+
define %struct.S16 @i32_to_2xi16_lh(i32 noundef %in) {
54+
%high32 = lshr i32 %in, 16
55+
%high = trunc i32 %high32 to i16
56+
%low = trunc i32 %in to i16
57+
; CHECK: ld.param.u32 %[[R32:r[0-9]+]], [i32_to_2xi16_lh_param_0];
58+
; CHECK-DAG: cvt.u16.u32 %rs{{[0-9+]}}, %[[R32]];
59+
; CHECK-DAG mov.b32 {tmp, %rs{{[0-9+]}}}, %[[R32]];
60+
%s1 = insertvalue %struct.S16 poison, i16 %low, 0
61+
%s = insertvalue %struct.S16 %s1, i16 %high, 1
62+
ret %struct.S16 %s
63+
}
64+
65+
66+
; CHECK-LABEL: i32_to_2xi16_not(
67+
define %struct.S16 @i32_to_2xi16_not(i32 noundef %in) {
68+
%low = trunc i32 %in to i16
69+
; Shift by any value other than 16 blocks the conversiopn to mov.
70+
%high32 = lshr i32 %in, 15
71+
%high = trunc i32 %high32 to i16
72+
; CHECK: cvt.u16.u32
73+
; CHECK: shr.u32
74+
; CHECK: cvt.u16.u32
75+
%s1 = insertvalue %struct.S16 poison, i16 %low, 0
76+
%s = insertvalue %struct.S16 %s1, i16 %high, 1
77+
ret %struct.S16 %s
78+
}
79+
80+
; CHECK-LABEL: i64_to_2xi32(
81+
define %struct.S32 @i64_to_2xi32(i64 noundef %in) {
82+
%low = trunc i64 %in to i32
83+
%high64 = lshr i64 %in, 32
84+
%high = trunc i64 %high64 to i32
85+
; CHECK: ld.param.u64 %[[R64:rd[0-9]+]], [i64_to_2xi32_param_0];
86+
; CHECK-DAG: cvt.u32.u64 %r{{[0-9+]}}, %[[R64]];
87+
; CHECK-DAG mov.b64 {tmp, %r{{[0-9+]}}}, %[[R64]];
88+
%s1 = insertvalue %struct.S32 poison, i32 %low, 0
89+
%s = insertvalue %struct.S32 %s1, i32 %high, 1
90+
ret %struct.S32 %s
91+
}
92+
93+
; CHECK-LABEL: i64_to_2xi32_not(
94+
define %struct.S32 @i64_to_2xi32_not(i64 noundef %in) {
95+
%low = trunc i64 %in to i32
96+
; Shift by any value other than 32 blocks the conversiopn to mov.
97+
%high64 = lshr i64 %in, 31
98+
%high = trunc i64 %high64 to i32
99+
; CHECK: cvt.u32.u64
100+
; CHECK: shr.u64
101+
; CHECK: cvt.u32.u64
102+
%s1 = insertvalue %struct.S32 poison, i32 %low, 0
103+
%s = insertvalue %struct.S32 %s1, i32 %high, 1
104+
ret %struct.S32 %s
105+
}
106+
107+
; CHECK-LABEL: i32_to_2xi16_shr(
108+
; Make sure we do not get confused when our input itself is [al]shr.
109+
define %struct.S16 @i32_to_2xi16_shr(i32 noundef %i){
110+
call void @escape_int(i32 %i); // Force %i to be loaded completely.
111+
%i1 = ashr i32 %i, 16
112+
%l = trunc i32 %i1 to i16
113+
%h32 = ashr i32 %i1, 16
114+
%h = trunc i32 %h32 to i16
115+
; CHECK: ld.param.u32 %[[R32:r[0-9]+]], [i32_to_2xi16_shr_param_0];
116+
; CHECK: shr.s32 %[[R32H:r[0-9]+]], %[[R32]], 16;
117+
; CHECK-DAG mov.b32 {tmp, %rs{{[0-9+]}}}, %[[R32]];
118+
; CHECK-DAG mov.b32 {tmp, %rs{{[0-9+]}}}, %[[R32H]];
119+
%s0 = insertvalue %struct.S16 poison, i16 %l, 0
120+
%s1 = insertvalue %struct.S16 %s0, i16 %h, 1
121+
ret %struct.S16 %s1
122+
}
123+
declare dso_local void @escape_int(i32 noundef)
124+

0 commit comments

Comments
 (0)