Skip to content

Commit 4e80a03

Browse files
authored
[NVPTX] Use prmt.f4e to lower pointer alignment fshr idiom (llvm#143407)
1 parent 432d06a commit 4e80a03

File tree

2 files changed

+23
-2
lines changed

2 files changed

+23
-2
lines changed

llvm/lib/Target/NVPTX/NVPTXInstrInfo.td

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1621,6 +1621,10 @@ let hasSideEffects = false in {
16211621

16221622
}
16231623

1624+
// PRMT folding patterns
//
// Match a 32-bit funnel shift right whose shift amount is (amt << 3), i.e. a
// shift by a whole number of bytes, and select a single PRMT_B32rrr in
// PrmtF4E mode with $amt passed through unchanged as the selector operand.
// Note the operand order: fshr takes ($hi, $lo) while PRMT_B32rrr is given
// ($lo, $hi).
// NOTE(review): this assumes prmt.f4e consumes only the low 2 bits of $amt as
// a byte offset, which would match fshr's modulo-32 shift amount -- confirm
// against the PTX ISA description of prmt.
1625+
def : Pat<(fshr i32:$hi, i32:$lo, (shl i32:$amt, (i32 3))),
1626+
(PRMT_B32rrr $lo, $hi, $amt, PrmtF4E)>;
1627+
16241628

16251629
// byte extraction + signed/unsigned extension to i32.
16261630
def : Pat<(i32 (sext_inreg (bfe i32:$s, i32:$o, 8), i8)),

llvm/test/CodeGen/NVPTX/prmt.ll

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2-
; RUN: llc < %s -verify-machineinstrs | FileCheck %s
3-
; RUN: %if ptxas %{ llc < %s -verify-machineinstrs | %ptxas-verify %}
2+
; RUN: llc < %s -verify-machineinstrs -mcpu=sm_50 | FileCheck %s
3+
; RUN: %if ptxas %{ llc < %s -verify-machineinstrs -mcpu=sm_50 | %ptxas-verify %}
44

55
target triple = "nvptx64-nvidia-cuda"
66

@@ -111,3 +111,20 @@ define i32 @test_prmt_rc16(i32 %lo, i32 %selector) {
111111
%val = call i32 @llvm.nvvm.prmt.rc16(i32 %lo, i32 %selector)
112112
ret i32 %val
113113
}
114+
115+
define i32 @test_prmt_f4e_folding(i32 %lo, i32 %hi, i32 %ptr) {
116+
; CHECK-LABEL: test_prmt_f4e_folding(
117+
; CHECK: {
118+
; CHECK-NEXT: .reg .b32 %r<5>;
119+
; CHECK-EMPTY:
120+
; CHECK-NEXT: // %bb.0:
121+
; CHECK-NEXT: ld.param.b32 %r1, [test_prmt_f4e_folding_param_0];
122+
; CHECK-NEXT: ld.param.b32 %r2, [test_prmt_f4e_folding_param_1];
123+
; CHECK-NEXT: ld.param.b32 %r3, [test_prmt_f4e_folding_param_2];
124+
; CHECK-NEXT: prmt.b32.f4e %r4, %r1, %r2, %r3;
125+
; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
126+
; CHECK-NEXT: ret;
127+
; The funnel-shift amount is %ptr << 3. The CHECK lines above expect the shl
; and the fshr to be selected together as one prmt.b32.f4e that takes %lo,
; %hi and the unshifted %ptr as its operands, with no separate shl emitted.
%sh_amt = shl i32 %ptr, 3
128+
%val = call i32 @llvm.fshr.i32(i32 %hi, i32 %lo, i32 %sh_amt)
129+
ret i32 %val
130+
}

0 commit comments

Comments
 (0)