Skip to content

Commit 6030936

Browse files
authored
[NVPTX] Fix bug in sign of bfe folding (llvm#130862)
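This change fixes an edge case in the NVPTX bfe folding where the unsigned variant of bfe (bfe.u32/bfe.u64) was incorrectly selected instead of the signed variant (bfe.s32/bfe.s64) when an arithmetic shift right is applied to the result of an AND whose mask includes the sign bit.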
1 parent fc127ff commit 6030936

File tree

2 files changed

+184
-23
lines changed

2 files changed

+184
-23
lines changed

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2039,6 +2039,13 @@ bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
20392039
Val = AndLHS;
20402040
Start = CurDAG->getTargetConstant(ShiftAmt, DL, MVT::i32);
20412041
Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
2042+
2043+
// If pre-shift AND includes the sign bit in the bitfield, we must use
2044+
// signed BFE to replicate that bit during bitfield extraction. If the
2045+
// sign bit is not part of the mask, unsigned BFE will zero out upper bits
2046+
// of the result.
2047+
if (N->getOpcode() == ISD::SRA)
2048+
IsSigned = (ShiftAmt + NumBits) == Val.getValueSizeInBits();
20422049
} else if (LHS->getOpcode() == ISD::SHL) {
20432050
// Here, we have a pattern like:
20442051
//

llvm/test/CodeGen/NVPTX/bfe.ll

Lines changed: 177 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,67 +1,221 @@
1-
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefixes=CHECK,CHECK-O3
3+
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -O0 | FileCheck %s --check-prefixes=CHECK,CHECK-O0
24
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
5+
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -O0 | %ptxas-verify %}
36

7+
target triple = "nvptx64-nvidia-cuda"
48

5-
; CHECK: bfe0
69
define i32 @bfe0(i32 %a) {
7-
; CHECK: bfe.u32 %r{{[0-9]+}}, %r{{[0-9]+}}, 4, 4
8-
; CHECK-NOT: shr
9-
; CHECK-NOT: and
10+
; CHECK-LABEL: bfe0(
11+
; CHECK: {
12+
; CHECK-NEXT: .reg .b32 %r<3>;
13+
; CHECK-EMPTY:
14+
; CHECK-NEXT: // %bb.0:
15+
; CHECK-NEXT: ld.param.u32 %r1, [bfe0_param_0];
16+
; CHECK-NEXT: bfe.u32 %r2, %r1, 4, 4;
17+
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
18+
; CHECK-NEXT: ret;
1019
%val0 = ashr i32 %a, 4
1120
%val1 = and i32 %val0, 15
1221
ret i32 %val1
1322
}
1423

15-
; CHECK: bfe1
1624
define i32 @bfe1(i32 %a) {
17-
; CHECK: bfe.u32 %r{{[0-9]+}}, %r{{[0-9]+}}, 3, 3
18-
; CHECK-NOT: shr
19-
; CHECK-NOT: and
25+
; CHECK-LABEL: bfe1(
26+
; CHECK: {
27+
; CHECK-NEXT: .reg .b32 %r<3>;
28+
; CHECK-EMPTY:
29+
; CHECK-NEXT: // %bb.0:
30+
; CHECK-NEXT: ld.param.u32 %r1, [bfe1_param_0];
31+
; CHECK-NEXT: bfe.u32 %r2, %r1, 3, 3;
32+
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
33+
; CHECK-NEXT: ret;
2034
%val0 = ashr i32 %a, 3
2135
%val1 = and i32 %val0, 7
2236
ret i32 %val1
2337
}
2438

25-
; CHECK: bfe2
2639
define i32 @bfe2(i32 %a) {
27-
; CHECK: bfe.u32 %r{{[0-9]+}}, %r{{[0-9]+}}, 5, 3
28-
; CHECK-NOT: shr
29-
; CHECK-NOT: and
40+
; CHECK-LABEL: bfe2(
41+
; CHECK: {
42+
; CHECK-NEXT: .reg .b32 %r<3>;
43+
; CHECK-EMPTY:
44+
; CHECK-NEXT: // %bb.0:
45+
; CHECK-NEXT: ld.param.u32 %r1, [bfe2_param_0];
46+
; CHECK-NEXT: bfe.u32 %r2, %r1, 5, 3;
47+
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
48+
; CHECK-NEXT: ret;
3049
%val0 = ashr i32 %a, 5
3150
%val1 = and i32 %val0, 7
3251
ret i32 %val1
3352
}
3453

35-
; CHECK-LABEL: no_bfe_on_32bit_overflow
3654
define i32 @no_bfe_on_32bit_overflow(i32 %a) {
37-
; CHECK-NOT: bfe.u32 %r{{[0-9]+}}, %r{{[0-9]+}}, 31, 4
55+
; CHECK-LABEL: no_bfe_on_32bit_overflow(
56+
; CHECK: {
57+
; CHECK-NEXT: .reg .b32 %r<4>;
58+
; CHECK-EMPTY:
59+
; CHECK-NEXT: // %bb.0:
60+
; CHECK-NEXT: ld.param.u32 %r1, [no_bfe_on_32bit_overflow_param_0];
61+
; CHECK-NEXT: shr.s32 %r2, %r1, 31;
62+
; CHECK-NEXT: and.b32 %r3, %r2, 15;
63+
; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
64+
; CHECK-NEXT: ret;
3865
%val0 = ashr i32 %a, 31
3966
%val1 = and i32 %val0, 15
4067
ret i32 %val1
4168
}
4269

43-
; CHECK-LABEL: no_bfe_on_32bit_overflow_shr_and_pair
4470
define i32 @no_bfe_on_32bit_overflow_shr_and_pair(i32 %a) {
45-
; CHECK: shr.s32 %r{{[0-9]+}}, %r{{[0-9]+}}, 31
46-
; CHECK: and.b32 %r{{[0-9]+}}, %r{{[0-9]+}}, 15
71+
; CHECK-LABEL: no_bfe_on_32bit_overflow_shr_and_pair(
72+
; CHECK: {
73+
; CHECK-NEXT: .reg .b32 %r<4>;
74+
; CHECK-EMPTY:
75+
; CHECK-NEXT: // %bb.0:
76+
; CHECK-NEXT: ld.param.u32 %r1, [no_bfe_on_32bit_overflow_shr_and_pair_param_0];
77+
; CHECK-NEXT: shr.s32 %r2, %r1, 31;
78+
; CHECK-NEXT: and.b32 %r3, %r2, 15;
79+
; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
80+
; CHECK-NEXT: ret;
4781
%val0 = ashr i32 %a, 31
4882
%val1 = and i32 %val0, 15
4983
ret i32 %val1
5084
}
5185

52-
; CHECK-LABEL: no_bfe_on_64bit_overflow
5386
define i64 @no_bfe_on_64bit_overflow(i64 %a) {
54-
; CHECK-NOT: bfe.u64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, 63, 3
87+
; CHECK-LABEL: no_bfe_on_64bit_overflow(
88+
; CHECK: {
89+
; CHECK-NEXT: .reg .b64 %rd<4>;
90+
; CHECK-EMPTY:
91+
; CHECK-NEXT: // %bb.0:
92+
; CHECK-NEXT: ld.param.u64 %rd1, [no_bfe_on_64bit_overflow_param_0];
93+
; CHECK-NEXT: shr.s64 %rd2, %rd1, 63;
94+
; CHECK-NEXT: and.b64 %rd3, %rd2, 7;
95+
; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
96+
; CHECK-NEXT: ret;
5597
%val0 = ashr i64 %a, 63
5698
%val1 = and i64 %val0, 7
5799
ret i64 %val1
58100
}
59101

60-
; CHECK-LABEL: no_bfe_on_64bit_overflow_shr_and_pair
61102
define i64 @no_bfe_on_64bit_overflow_shr_and_pair(i64 %a) {
62-
; CHECK: shr.s64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, 63
63-
; CHECK: and.b64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, 7
103+
; CHECK-LABEL: no_bfe_on_64bit_overflow_shr_and_pair(
104+
; CHECK: {
105+
; CHECK-NEXT: .reg .b64 %rd<4>;
106+
; CHECK-EMPTY:
107+
; CHECK-NEXT: // %bb.0:
108+
; CHECK-NEXT: ld.param.u64 %rd1, [no_bfe_on_64bit_overflow_shr_and_pair_param_0];
109+
; CHECK-NEXT: shr.s64 %rd2, %rd1, 63;
110+
; CHECK-NEXT: and.b64 %rd3, %rd2, 7;
111+
; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
112+
; CHECK-NEXT: ret;
64113
%val0 = ashr i64 %a, 63
65114
%val1 = and i64 %val0, 7
66115
ret i64 %val1
67116
}
117+
118+
define i32 @bfe_ashr_signed_32(i32 %x) {
119+
; CHECK-O3-LABEL: bfe_ashr_signed_32(
120+
; CHECK-O3: {
121+
; CHECK-O3-NEXT: .reg .b32 %r<3>;
122+
; CHECK-O3-EMPTY:
123+
; CHECK-O3-NEXT: // %bb.0:
124+
; CHECK-O3-NEXT: ld.param.u16 %r1, [bfe_ashr_signed_32_param_0+2];
125+
; CHECK-O3-NEXT: bfe.s32 %r2, %r1, 4, 12;
126+
; CHECK-O3-NEXT: st.param.b32 [func_retval0], %r2;
127+
; CHECK-O3-NEXT: ret;
128+
;
129+
; CHECK-O0-LABEL: bfe_ashr_signed_32(
130+
; CHECK-O0: {
131+
; CHECK-O0-NEXT: .reg .b32 %r<3>;
132+
; CHECK-O0-EMPTY:
133+
; CHECK-O0-NEXT: // %bb.0:
134+
; CHECK-O0-NEXT: ld.param.u32 %r1, [bfe_ashr_signed_32_param_0];
135+
; CHECK-O0-NEXT: bfe.s32 %r2, %r1, 20, 12;
136+
; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r2;
137+
; CHECK-O0-NEXT: ret;
138+
%and = and i32 %x, -65536
139+
%shr = ashr exact i32 %and, 20
140+
ret i32 %shr
141+
}
142+
143+
define i32 @bfe_ashr_unsigned_32(i32 %x) {
144+
; CHECK-LABEL: bfe_ashr_unsigned_32(
145+
; CHECK: {
146+
; CHECK-NEXT: .reg .b32 %r<3>;
147+
; CHECK-EMPTY:
148+
; CHECK-NEXT: // %bb.0:
149+
; CHECK-NEXT: ld.param.u32 %r1, [bfe_ashr_unsigned_32_param_0];
150+
; CHECK-NEXT: bfe.u32 %r2, %r1, 5, 6;
151+
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
152+
; CHECK-NEXT: ret;
153+
%and = and i32 %x, 2047
154+
%shr = ashr exact i32 %and, 5
155+
ret i32 %shr
156+
}
157+
158+
define i64 @bfe_ashr_signed_64(i64 %x) {
159+
; CHECK-LABEL: bfe_ashr_signed_64(
160+
; CHECK: {
161+
; CHECK-NEXT: .reg .b64 %rd<3>;
162+
; CHECK-EMPTY:
163+
; CHECK-NEXT: // %bb.0:
164+
; CHECK-NEXT: ld.param.u64 %rd1, [bfe_ashr_signed_64_param_0];
165+
; CHECK-NEXT: bfe.s64 %rd2, %rd1, 16, 48;
166+
; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
167+
; CHECK-NEXT: ret;
168+
%and = and i64 %x, -65536
169+
%shr = ashr exact i64 %and, 16
170+
ret i64 %shr
171+
}
172+
173+
define i64 @bfe_ashr_unsigned_64(i64 %x) {
174+
; CHECK-LABEL: bfe_ashr_unsigned_64(
175+
; CHECK: {
176+
; CHECK-NEXT: .reg .b64 %rd<3>;
177+
; CHECK-EMPTY:
178+
; CHECK-NEXT: // %bb.0:
179+
; CHECK-NEXT: ld.param.u64 %rd1, [bfe_ashr_unsigned_64_param_0];
180+
; CHECK-NEXT: bfe.u64 %rd2, %rd1, 5, 6;
181+
; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
182+
; CHECK-NEXT: ret;
183+
%and = and i64 %x, 2047
184+
%shr = ashr exact i64 %and, 5
185+
ret i64 %shr
186+
}
187+
188+
define i32 @bfe3(i128 %a) {
189+
; CHECK-LABEL: bfe3(
190+
; CHECK: {
191+
; CHECK-NEXT: .reg .b32 %r<3>;
192+
; CHECK-NEXT: .reg .b64 %rd<3>;
193+
; CHECK-EMPTY:
194+
; CHECK-NEXT: // %bb.0:
195+
; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [bfe3_param_0];
196+
; CHECK-NEXT: cvt.u32.u64 %r1, %rd1;
197+
; CHECK-NEXT: bfe.s32 %r2, %r1, 15, 17;
198+
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
199+
; CHECK-NEXT: ret;
200+
%trunc = trunc i128 %a to i32
201+
%and = and i32 %trunc, -32768
202+
%shr = ashr exact i32 %and, 15
203+
ret i32 %shr
204+
}
205+
206+
define i64 @bfe4(i128 %a) {
207+
; CHECK-LABEL: bfe4(
208+
; CHECK: {
209+
; CHECK-NEXT: .reg .b64 %rd<4>;
210+
; CHECK-EMPTY:
211+
; CHECK-NEXT: // %bb.0:
212+
; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [bfe4_param_0];
213+
; CHECK-NEXT: bfe.s64 %rd3, %rd1, 17, 47;
214+
; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
215+
; CHECK-NEXT: ret;
216+
%trunc = trunc i128 %a to i64
217+
%and = and i64 %trunc, -131072
218+
%shr = ashr exact i64 %and, 17
219+
ret i64 %shr
220+
}
221+

0 commit comments

Comments
 (0)