-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[NVPTX] Fix 64 bits rotations with large shift values #89399
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,8 @@ | ||
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck --check-prefix=SM20 %s | ||
; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck --check-prefix=SM35 %s | ||
; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} | ||
; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 | %ptxas-verify %} | ||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 | ||
; RUN: llc < %s --mtriple=nvptx64 -mcpu=sm_20 | FileCheck --check-prefix=SM20 %s | ||
; RUN: llc < %s --mtriple=nvptx64 -mcpu=sm_35 | FileCheck --check-prefix=SM35 %s | ||
; RUN: %if ptxas %{ llc < %s --mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} | ||
; RUN: %if ptxas %{ llc < %s --mtriple=nvptx64 -mcpu=sm_35 | %ptxas-verify %} | ||
|
||
|
||
declare i32 @llvm.nvvm.rotate.b32(i32, i32) | ||
|
@@ -11,50 +12,338 @@ declare i64 @llvm.nvvm.rotate.right.b64(i64, i32) | |
; SM20: rotate32 | ||
; SM35: rotate32 | ||
; Rotate of an i32 by a variable amount (%b) through @llvm.nvvm.rotate.b32. | ||
; Expected sm_20 output: shl.b32, sub.s32 (32 - amount), shr.b32, then add.u32. | ||
; Expected sm_35 output: one shf.l.wrap.b32 that uses %r1 for both data operands. | ||
; NOTE(review): the unanchored one-mnemonic checks right after `define` look like | ||
; the removed side of this diff rendered inline -- confirm before relying on them. | ||
define i32 @rotate32(i32 %a, i32 %b) { | ||
; SM20: shl.b32 | ||
; SM20: sub.s32 | ||
; SM20: shr.b32 | ||
; SM20: add.u32 | ||
; SM35: shf.l.wrap.b32 | ||
; SM20-LABEL: rotate32( | ||
; SM20: { | ||
; SM20-NEXT: .reg .b32 %r<4>; | ||
; SM20-EMPTY: | ||
; SM20-NEXT: // %bb.0: | ||
; SM20-NEXT: ld.param.u32 %r1, [rotate32_param_0]; | ||
; SM20-NEXT: ld.param.u32 %r2, [rotate32_param_1]; | ||
; SM20-NEXT: { | ||
; SM20-NEXT: .reg .b32 %lhs; | ||
; SM20-NEXT: .reg .b32 %rhs; | ||
; SM20-NEXT: .reg .b32 %amt2; | ||
; SM20-NEXT: shl.b32 %lhs, %r1, %r2; | ||
; SM20-NEXT: sub.s32 %amt2, 32, %r2; | ||
; SM20-NEXT: shr.b32 %rhs, %r1, %amt2; | ||
; SM20-NEXT: add.u32 %r3, %lhs, %rhs; | ||
; SM20-NEXT: } | ||
; SM20-NEXT: st.param.b32 [func_retval0+0], %r3; | ||
; SM20-NEXT: ret; | ||
; | ||
; SM35-LABEL: rotate32( | ||
; SM35: { | ||
; SM35-NEXT: .reg .b32 %r<4>; | ||
; SM35-EMPTY: | ||
; SM35-NEXT: // %bb.0: | ||
; SM35-NEXT: ld.param.u32 %r1, [rotate32_param_0]; | ||
; SM35-NEXT: ld.param.u32 %r2, [rotate32_param_1]; | ||
; SM35-NEXT: shf.l.wrap.b32 %r3, %r1, %r1, %r2; | ||
; SM35-NEXT: st.param.b32 [func_retval0+0], %r3; | ||
; SM35-NEXT: ret; | ||
%val = tail call i32 @llvm.nvvm.rotate.b32(i32 %a, i32 %b) | ||
ret i32 %val | ||
} | ||
|
||
; SM20: rotate64 | ||
; SM35: rotate64 | ||
; Rotate of an i64 by an i32 amount (%b) through @llvm.nvvm.rotate.b64. | ||
; Expected sm_20 output masks the amount with 63 (and.b32), then combines | ||
; shl.b64 with shr.b64 by (64 - masked amount) using add.u64. | ||
; Expected sm_35 output unpacks %rd1 into two 32-bit registers, issues two | ||
; shf.l.wrap.b32 with swapped data operands, and repacks them with mov.b64. | ||
; NOTE(review): the unanchored one-mnemonic checks right after `define` look like | ||
; the removed side of this diff rendered inline -- confirm before relying on them. | ||
define i64 @rotate64(i64 %a, i32 %b) { | ||
; SM20: shl.b64 | ||
; SM20: sub.u32 | ||
; SM20: shr.b64 | ||
; SM20: add.u64 | ||
; SM35: shf.l.wrap.b32 | ||
; SM35: shf.l.wrap.b32 | ||
; SM20-LABEL: rotate64( | ||
; SM20: { | ||
; SM20-NEXT: .reg .b32 %r<2>; | ||
; SM20-NEXT: .reg .b64 %rd<3>; | ||
; SM20-EMPTY: | ||
; SM20-NEXT: // %bb.0: | ||
; SM20-NEXT: ld.param.u64 %rd1, [rotate64_param_0]; | ||
; SM20-NEXT: ld.param.u32 %r1, [rotate64_param_1]; | ||
; SM20-NEXT: { | ||
; SM20-NEXT: .reg .b64 %lhs; | ||
; SM20-NEXT: .reg .b64 %rhs; | ||
; SM20-NEXT: .reg .u32 %amt2; | ||
; SM20-NEXT: and.b32 %amt2, %r1, 63; | ||
; SM20-NEXT: shl.b64 %lhs, %rd1, %amt2; | ||
; SM20-NEXT: sub.u32 %amt2, 64, %amt2; | ||
; SM20-NEXT: shr.b64 %rhs, %rd1, %amt2; | ||
; SM20-NEXT: add.u64 %rd2, %lhs, %rhs; | ||
; SM20-NEXT: } | ||
; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2; | ||
; SM20-NEXT: ret; | ||
; | ||
; SM35-LABEL: rotate64( | ||
; SM35: { | ||
; SM35-NEXT: .reg .b32 %r<6>; | ||
; SM35-NEXT: .reg .b64 %rd<3>; | ||
; SM35-EMPTY: | ||
; SM35-NEXT: // %bb.0: | ||
; SM35-NEXT: ld.param.u64 %rd1, [rotate64_param_0]; | ||
; SM35-NEXT: { | ||
; SM35-NEXT: .reg .b32 %dummy; | ||
; SM35-NEXT: mov.b64 {%dummy,%r1}, %rd1; | ||
; SM35-NEXT: } | ||
; SM35-NEXT: { | ||
; SM35-NEXT: .reg .b32 %dummy; | ||
; SM35-NEXT: mov.b64 {%r2,%dummy}, %rd1; | ||
; SM35-NEXT: } | ||
Comment on lines
+78
to
+86
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looks like a minor optimization opportunity for the future -- this could be done as |
||
; SM35-NEXT: ld.param.u32 %r3, [rotate64_param_1]; | ||
; SM35-NEXT: shf.l.wrap.b32 %r4, %r2, %r1, %r3; | ||
; SM35-NEXT: shf.l.wrap.b32 %r5, %r1, %r2, %r3; | ||
; SM35-NEXT: mov.b64 %rd2, {%r5, %r4}; | ||
; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2; | ||
; SM35-NEXT: ret; | ||
%val = tail call i64 @llvm.nvvm.rotate.b64(i64 %a, i32 %b) | ||
ret i64 %val | ||
} | ||
|
||
; SM20: rotateright64 | ||
; SM35: rotateright64 | ||
; Right rotate of an i64 by an i32 amount (%b) through @llvm.nvvm.rotate.right.b64. | ||
; Expected sm_20 output masks the amount with 63 (and.b32), then combines | ||
; shr.b64 with shl.b64 by (64 - masked amount) using add.u64. | ||
; Expected sm_35 output unpacks %rd1 into two 32-bit registers, issues two | ||
; shf.r.wrap.b32 with swapped data operands, and repacks them with mov.b64. | ||
; NOTE(review): the unanchored one-mnemonic checks right after `define` look like | ||
; the removed side of this diff rendered inline -- confirm before relying on them. | ||
define i64 @rotateright64(i64 %a, i32 %b) { | ||
; SM20: shr.b64 | ||
; SM20: sub.u32 | ||
; SM20: shl.b64 | ||
; SM20: add.u64 | ||
; SM35: shf.r.wrap.b32 | ||
; SM35: shf.r.wrap.b32 | ||
; SM20-LABEL: rotateright64( | ||
; SM20: { | ||
; SM20-NEXT: .reg .b32 %r<2>; | ||
; SM20-NEXT: .reg .b64 %rd<3>; | ||
; SM20-EMPTY: | ||
; SM20-NEXT: // %bb.0: | ||
; SM20-NEXT: ld.param.u64 %rd1, [rotateright64_param_0]; | ||
; SM20-NEXT: ld.param.u32 %r1, [rotateright64_param_1]; | ||
; SM20-NEXT: { | ||
; SM20-NEXT: .reg .b64 %lhs; | ||
; SM20-NEXT: .reg .b64 %rhs; | ||
; SM20-NEXT: .reg .u32 %amt2; | ||
; SM20-NEXT: and.b32 %amt2, %r1, 63; | ||
; SM20-NEXT: shr.b64 %lhs, %rd1, %amt2; | ||
; SM20-NEXT: sub.u32 %amt2, 64, %amt2; | ||
; SM20-NEXT: shl.b64 %rhs, %rd1, %amt2; | ||
; SM20-NEXT: add.u64 %rd2, %lhs, %rhs; | ||
; SM20-NEXT: } | ||
; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2; | ||
; SM20-NEXT: ret; | ||
; | ||
; SM35-LABEL: rotateright64( | ||
; SM35: { | ||
; SM35-NEXT: .reg .b32 %r<6>; | ||
; SM35-NEXT: .reg .b64 %rd<3>; | ||
; SM35-EMPTY: | ||
; SM35-NEXT: // %bb.0: | ||
; SM35-NEXT: ld.param.u64 %rd1, [rotateright64_param_0]; | ||
; SM35-NEXT: { | ||
; SM35-NEXT: .reg .b32 %dummy; | ||
; SM35-NEXT: mov.b64 {%r1,%dummy}, %rd1; | ||
; SM35-NEXT: } | ||
; SM35-NEXT: { | ||
; SM35-NEXT: .reg .b32 %dummy; | ||
; SM35-NEXT: mov.b64 {%dummy,%r2}, %rd1; | ||
; SM35-NEXT: } | ||
; SM35-NEXT: ld.param.u32 %r3, [rotateright64_param_1]; | ||
; SM35-NEXT: shf.r.wrap.b32 %r4, %r2, %r1, %r3; | ||
; SM35-NEXT: shf.r.wrap.b32 %r5, %r1, %r2, %r3; | ||
; SM35-NEXT: mov.b64 %rd2, {%r5, %r4}; | ||
; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2; | ||
; SM35-NEXT: ret; | ||
%val = tail call i64 @llvm.nvvm.rotate.right.b64(i64 %a, i32 %b) | ||
ret i64 %val | ||
} | ||
|
||
; SM20: rotl0 | ||
; SM35: rotl0 | ||
; Rotate-left of an i32 by the constant 8, spelled in plain IR (no intrinsic) | ||
; as the `or` of (shl %x, 8) and (lshr %x, 24). | ||
; Expected sm_20 output: shl.b32 by 8, shr.b32 by 24, then add.u32. | ||
; Expected sm_35 output: one shf.l.wrap.b32 with the immediate 8. | ||
; NOTE(review): the unanchored one-mnemonic checks right after `define` look like | ||
; the removed side of this diff rendered inline -- confirm before relying on them. | ||
define i32 @rotl0(i32 %x) { | ||
; SM20: shl.b32 | ||
; SM20: shr.b32 | ||
; SM20: add.u32 | ||
; SM35: shf.l.wrap.b32 | ||
; SM20-LABEL: rotl0( | ||
; SM20: { | ||
; SM20-NEXT: .reg .b32 %r<3>; | ||
; SM20-EMPTY: | ||
; SM20-NEXT: // %bb.0: | ||
; SM20-NEXT: ld.param.u32 %r1, [rotl0_param_0]; | ||
; SM20-NEXT: { | ||
; SM20-NEXT: .reg .b32 %lhs; | ||
; SM20-NEXT: .reg .b32 %rhs; | ||
; SM20-NEXT: shl.b32 %lhs, %r1, 8; | ||
; SM20-NEXT: shr.b32 %rhs, %r1, 24; | ||
; SM20-NEXT: add.u32 %r2, %lhs, %rhs; | ||
; SM20-NEXT: } | ||
; SM20-NEXT: st.param.b32 [func_retval0+0], %r2; | ||
; SM20-NEXT: ret; | ||
; | ||
; SM35-LABEL: rotl0( | ||
; SM35: { | ||
; SM35-NEXT: .reg .b32 %r<3>; | ||
; SM35-EMPTY: | ||
; SM35-NEXT: // %bb.0: | ||
; SM35-NEXT: ld.param.u32 %r1, [rotl0_param_0]; | ||
; SM35-NEXT: shf.l.wrap.b32 %r2, %r1, %r1, 8; | ||
; SM35-NEXT: st.param.b32 [func_retval0+0], %r2; | ||
; SM35-NEXT: ret; | ||
%t0 = shl i32 %x, 8 | ||
%t1 = lshr i32 %x, 24 | ||
%t2 = or i32 %t0, %t1 | ||
ret i32 %t2 | ||
} | ||
|
||
declare i64 @llvm.fshl.i64(i64, i64, i64) | ||
declare i64 @llvm.fshr.i64(i64, i64, i64) | ||
|
||
; SM35: rotl64 | ||
; Rotate-left of an i64 by a variable i64 amount, written as | ||
; @llvm.fshl.i64(%a, %a, %n). | ||
; The sm_20 and sm_35 check blocks expect the same sequence: the amount is read | ||
; with ld.param.u32, masked with 63 (and.b32), and applied as shl.b64 plus | ||
; shr.b64 by (64 - masked amount), combined with add.u64. | ||
define i64 @rotl64(i64 %a, i64 %n) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This test should probably be converted to use llvm/utils/update_llc_test_checks.py There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ^^^ we still want to improve the test. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh right! Updated with the script now, it was confusing for a little bit because it doesn't understand |
||
; SM20-LABEL: rotl64( | ||
; SM20: { | ||
; SM20-NEXT: .reg .b32 %r<2>; | ||
; SM20-NEXT: .reg .b64 %rd<3>; | ||
; SM20-EMPTY: | ||
; SM20-NEXT: // %bb.0: | ||
; SM20-NEXT: ld.param.u64 %rd1, [rotl64_param_0]; | ||
; SM20-NEXT: ld.param.u32 %r1, [rotl64_param_1]; | ||
; SM20-NEXT: { | ||
; SM20-NEXT: .reg .b64 %lhs; | ||
; SM20-NEXT: .reg .b64 %rhs; | ||
; SM20-NEXT: .reg .u32 %amt2; | ||
; SM20-NEXT: and.b32 %amt2, %r1, 63; | ||
; SM20-NEXT: shl.b64 %lhs, %rd1, %amt2; | ||
; SM20-NEXT: sub.u32 %amt2, 64, %amt2; | ||
; SM20-NEXT: shr.b64 %rhs, %rd1, %amt2; | ||
; SM20-NEXT: add.u64 %rd2, %lhs, %rhs; | ||
; SM20-NEXT: } | ||
; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2; | ||
; SM20-NEXT: ret; | ||
; | ||
; SM35-LABEL: rotl64( | ||
; SM35: { | ||
; SM35-NEXT: .reg .b32 %r<2>; | ||
; SM35-NEXT: .reg .b64 %rd<3>; | ||
; SM35-EMPTY: | ||
; SM35-NEXT: // %bb.0: | ||
; SM35-NEXT: ld.param.u64 %rd1, [rotl64_param_0]; | ||
; SM35-NEXT: ld.param.u32 %r1, [rotl64_param_1]; | ||
; SM35-NEXT: { | ||
; SM35-NEXT: .reg .b64 %lhs; | ||
; SM35-NEXT: .reg .b64 %rhs; | ||
; SM35-NEXT: .reg .u32 %amt2; | ||
; SM35-NEXT: and.b32 %amt2, %r1, 63; | ||
; SM35-NEXT: shl.b64 %lhs, %rd1, %amt2; | ||
; SM35-NEXT: sub.u32 %amt2, 64, %amt2; | ||
; SM35-NEXT: shr.b64 %rhs, %rd1, %amt2; | ||
; SM35-NEXT: add.u64 %rd2, %lhs, %rhs; | ||
; SM35-NEXT: } | ||
; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2; | ||
; SM35-NEXT: ret; | ||
%val = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 %n) | ||
ret i64 %val | ||
} | ||
|
||
; SM35: rotl64_imm | ||
; Rotate-left of an i64 by the out-of-range constant 66, written as | ||
; @llvm.fshl.i64(%a, %a, 66). | ||
; Both check blocks expect the amount reduced to 2 (66 mod 64): | ||
; shl.b64 by 2, shr.b64 by 62, combined with add.u64. | ||
define i64 @rotl64_imm(i64 %a) { | ||
; SM20-LABEL: rotl64_imm( | ||
; SM20: { | ||
; SM20-NEXT: .reg .b64 %rd<3>; | ||
; SM20-EMPTY: | ||
; SM20-NEXT: // %bb.0: | ||
; SM20-NEXT: ld.param.u64 %rd1, [rotl64_imm_param_0]; | ||
; SM20-NEXT: { | ||
; SM20-NEXT: .reg .b64 %lhs; | ||
; SM20-NEXT: .reg .b64 %rhs; | ||
; SM20-NEXT: shl.b64 %lhs, %rd1, 2; | ||
; SM20-NEXT: shr.b64 %rhs, %rd1, 62; | ||
; SM20-NEXT: add.u64 %rd2, %lhs, %rhs; | ||
; SM20-NEXT: } | ||
; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2; | ||
; SM20-NEXT: ret; | ||
; | ||
; SM35-LABEL: rotl64_imm( | ||
; SM35: { | ||
; SM35-NEXT: .reg .b64 %rd<3>; | ||
; SM35-EMPTY: | ||
; SM35-NEXT: // %bb.0: | ||
; SM35-NEXT: ld.param.u64 %rd1, [rotl64_imm_param_0]; | ||
; SM35-NEXT: { | ||
; SM35-NEXT: .reg .b64 %lhs; | ||
; SM35-NEXT: .reg .b64 %rhs; | ||
; SM35-NEXT: shl.b64 %lhs, %rd1, 2; | ||
; SM35-NEXT: shr.b64 %rhs, %rd1, 62; | ||
; SM35-NEXT: add.u64 %rd2, %lhs, %rhs; | ||
; SM35-NEXT: } | ||
; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2; | ||
; SM35-NEXT: ret; | ||
%val = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 66) | ||
ret i64 %val | ||
} | ||
|
||
; SM35: rotr64 | ||
; Rotate-right of an i64 by a variable i64 amount, written as | ||
; @llvm.fshr.i64(%a, %a, %n). | ||
; The sm_20 and sm_35 check blocks expect the same sequence: the amount is read | ||
; with ld.param.u32, masked with 63 (and.b32), and applied as shr.b64 plus | ||
; shl.b64 by (64 - masked amount), combined with add.u64. | ||
define i64 @rotr64(i64 %a, i64 %n) { | ||
; SM20-LABEL: rotr64( | ||
; SM20: { | ||
; SM20-NEXT: .reg .b32 %r<2>; | ||
; SM20-NEXT: .reg .b64 %rd<3>; | ||
; SM20-EMPTY: | ||
; SM20-NEXT: // %bb.0: | ||
; SM20-NEXT: ld.param.u64 %rd1, [rotr64_param_0]; | ||
; SM20-NEXT: ld.param.u32 %r1, [rotr64_param_1]; | ||
; SM20-NEXT: { | ||
; SM20-NEXT: .reg .b64 %lhs; | ||
; SM20-NEXT: .reg .b64 %rhs; | ||
; SM20-NEXT: .reg .u32 %amt2; | ||
; SM20-NEXT: and.b32 %amt2, %r1, 63; | ||
; SM20-NEXT: shr.b64 %lhs, %rd1, %amt2; | ||
; SM20-NEXT: sub.u32 %amt2, 64, %amt2; | ||
; SM20-NEXT: shl.b64 %rhs, %rd1, %amt2; | ||
; SM20-NEXT: add.u64 %rd2, %lhs, %rhs; | ||
; SM20-NEXT: } | ||
; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2; | ||
; SM20-NEXT: ret; | ||
; | ||
; SM35-LABEL: rotr64( | ||
; SM35: { | ||
; SM35-NEXT: .reg .b32 %r<2>; | ||
; SM35-NEXT: .reg .b64 %rd<3>; | ||
; SM35-EMPTY: | ||
; SM35-NEXT: // %bb.0: | ||
; SM35-NEXT: ld.param.u64 %rd1, [rotr64_param_0]; | ||
; SM35-NEXT: ld.param.u32 %r1, [rotr64_param_1]; | ||
; SM35-NEXT: { | ||
; SM35-NEXT: .reg .b64 %lhs; | ||
; SM35-NEXT: .reg .b64 %rhs; | ||
; SM35-NEXT: .reg .u32 %amt2; | ||
; SM35-NEXT: and.b32 %amt2, %r1, 63; | ||
; SM35-NEXT: shr.b64 %lhs, %rd1, %amt2; | ||
; SM35-NEXT: sub.u32 %amt2, 64, %amt2; | ||
; SM35-NEXT: shl.b64 %rhs, %rd1, %amt2; | ||
; SM35-NEXT: add.u64 %rd2, %lhs, %rhs; | ||
; SM35-NEXT: } | ||
; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2; | ||
; SM35-NEXT: ret; | ||
%val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 %n) | ||
ret i64 %val | ||
} | ||
|
||
; SM35: rotr64_imm | ||
; Rotate-right of an i64 by the out-of-range constant 66, written as | ||
; @llvm.fshr.i64(%a, %a, 66). | ||
; Both check blocks expect the amount reduced to 2 (66 mod 64): | ||
; shl.b64 by 62, shr.b64 by 2, combined with add.u64. | ||
define i64 @rotr64_imm(i64 %a) { | ||
; SM20-LABEL: rotr64_imm( | ||
; SM20: { | ||
; SM20-NEXT: .reg .b64 %rd<3>; | ||
; SM20-EMPTY: | ||
; SM20-NEXT: // %bb.0: | ||
; SM20-NEXT: ld.param.u64 %rd1, [rotr64_imm_param_0]; | ||
; SM20-NEXT: { | ||
; SM20-NEXT: .reg .b64 %lhs; | ||
; SM20-NEXT: .reg .b64 %rhs; | ||
; SM20-NEXT: shl.b64 %lhs, %rd1, 62; | ||
; SM20-NEXT: shr.b64 %rhs, %rd1, 2; | ||
; SM20-NEXT: add.u64 %rd2, %lhs, %rhs; | ||
; SM20-NEXT: } | ||
; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2; | ||
; SM20-NEXT: ret; | ||
; | ||
; SM35-LABEL: rotr64_imm( | ||
; SM35: { | ||
; SM35-NEXT: .reg .b64 %rd<3>; | ||
; SM35-EMPTY: | ||
; SM35-NEXT: // %bb.0: | ||
; SM35-NEXT: ld.param.u64 %rd1, [rotr64_imm_param_0]; | ||
; SM35-NEXT: { | ||
; SM35-NEXT: .reg .b64 %lhs; | ||
; SM35-NEXT: .reg .b64 %rhs; | ||
; SM35-NEXT: shl.b64 %lhs, %rd1, 62; | ||
; SM35-NEXT: shr.b64 %rhs, %rd1, 2; | ||
; SM35-NEXT: add.u64 %rd2, %lhs, %rhs; | ||
; SM35-NEXT: } | ||
; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2; | ||
; SM35-NEXT: ret; | ||
%val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 66) | ||
ret i64 %val | ||
} |
Uh oh!
There was an error while loading. Please reload this page.