[RISCV] Add pattern for PACK/PACKH in common misaligned load case #110644
Conversation
@llvm/pr-subscribers-backend-risc-v

Author: Alex Bradbury (asb)

Changes

PACKH is currently only selected for assembling the first two bytes of a misaligned load. A fairly complex RV32-only pattern is added for producing PACKH+PACKH+PACK to assemble the result of a misaligned 32-bit load.

Another pattern was added that just covers PACKH for shifted offsets 16 and 24, producing a packh and shift to replace two shifts and an 'or'. This slightly improves RV64IZBKB for a 64-bit load, but fails to match for the misaligned 32-bit load because the load of the upper byte is an anyext in the SelectionDAG.

I wrote the patch this way because it was quick and easy and has at least some benefit, but the "right" approach probably merits further discussion. Introducing target-specific SDNodes for PACK* and having custom lowering for unaligned load/stores that introduces those nodes seems like it might be attractive.

I don't have a strong personal interest in zbkb codegen at this time; it's just that all the recent discussions about misaligned load/store made me check whether we're generating the best code we can in this case.

Full diff: https://github.com/llvm/llvm-project/pull/110644.diff

2 Files Affected (the diff follows the sketch below):
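As a rough illustration (not part of the patch), the following C sketch shows the Zbkb semantics the new patterns rely on and how two packh plus one pack can assemble a misaligned i32 load on RV32; the helper names are made up for this example.

```c
#include <stdint.h>

/* Sketch of the relevant Zbkb semantics on RV32 (illustration only). */
static uint32_t packh(uint32_t rs1, uint32_t rs2) {
  return ((rs2 & 0xff) << 8) | (rs1 & 0xff);      /* low bytes -> low halfword */
}
static uint32_t pack(uint32_t rs1, uint32_t rs2) {
  return ((rs2 & 0xffff) << 16) | (rs1 & 0xffff); /* low halfwords -> word */
}

/* What the new RV32 pattern selects for a byte-by-byte (misaligned) i32 load:
   two packh plus one pack instead of a chain of shifts and ors. */
static uint32_t load_i32_unaligned(const uint8_t *p) {
  uint32_t lo = packh(p[0], p[1]); /* bytes 0-1 */
  uint32_t hi = packh(p[2], p[3]); /* bytes 2-3 */
  return pack(lo, hi);
}

/* The other new pattern covers the upper-half bytes on their own:
   (b3 << 24) | (b2 << 16) becomes a packh followed by a 16-bit shift. */
static uint32_t upper_half(uint8_t b2, uint8_t b3) {
  return packh(b2, b3) << 16;
}
```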
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
index 8e6f281027695a..3eb1fc68694032 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
@@ -599,15 +599,26 @@ def : Pat<(or (shl (zexti8 (XLenVT GPR:$rs2)), (XLenVT 8)),
def : Pat<(and (or (shl GPR:$rs2, (XLenVT 8)),
(zexti8 (XLenVT GPR:$rs1))), 0xFFFF),
(PACKH GPR:$rs1, GPR:$rs2)>;
+def : Pat<(or (shl (zexti8 (XLenVT GPR:$rs2)), (XLenVT 24)),
+ (shl (zexti8 (XLenVT GPR:$rs1)), (XLenVT 16))),
+ (SLLI (PACKH GPR:$rs1, GPR:$rs2), (XLenVT 16))>;
def : Pat<(binop_allhusers<or> (shl GPR:$rs2, (XLenVT 8)),
(zexti8 (XLenVT GPR:$rs1))),
(PACKH GPR:$rs1, GPR:$rs2)>;
} // Predicates = [HasStdExtZbkb]
-let Predicates = [HasStdExtZbkb, IsRV32] in
+let Predicates = [HasStdExtZbkb, IsRV32] in {
def : Pat<(i32 (or (zexti16 (i32 GPR:$rs1)), (shl GPR:$rs2, (i32 16)))),
(PACK GPR:$rs1, GPR:$rs2)>;
+def : Pat<(or (or
+ (shl (zexti8 (XLenVT GPR:$op1rs2)), (XLenVT 24)),
+ (shl (zexti8 (XLenVT GPR:$op1rs1)), (XLenVT 16))),
+ (or
+ (shl (zexti8 (XLenVT GPR:$op0rs2)), (XLenVT 8)),
+ (zexti8 (XLenVT GPR:$op0rs1)))),
+ (PACK (PACKH GPR:$op0rs1, GPR:$op0rs2), (PACKH GPR:$op1rs1, GPR:$op1rs2))>;
+}
let Predicates = [HasStdExtZbkb, IsRV64] in {
def : Pat<(i64 (or (zexti32 (i64 GPR:$rs1)), (shl GPR:$rs2, (i64 32)))),
diff --git a/llvm/test/CodeGen/RISCV/unaligned-load-store.ll b/llvm/test/CodeGen/RISCV/unaligned-load-store.ll
index 9af18428adf196..93dc67a3947598 100644
--- a/llvm/test/CodeGen/RISCV/unaligned-load-store.ll
+++ b/llvm/test/CodeGen/RISCV/unaligned-load-store.ll
@@ -82,6 +82,8 @@ define i24 @load_i24(ptr %p) {
ret i24 %res
}
+; FIXME: In the RV64IZBKB case, the second packh isn't selected because the
+; upper byte is an anyext load in the SDag.
define i32 @load_i32(ptr %p) {
; SLOWBASE-LABEL: load_i32:
; SLOWBASE: # %bb.0:
@@ -97,18 +99,29 @@ define i32 @load_i32(ptr %p) {
; SLOWBASE-NEXT: or a0, a0, a1
; SLOWBASE-NEXT: ret
;
-; SLOWZBKB-LABEL: load_i32:
-; SLOWZBKB: # %bb.0:
-; SLOWZBKB-NEXT: lbu a1, 1(a0)
-; SLOWZBKB-NEXT: lbu a2, 0(a0)
-; SLOWZBKB-NEXT: lbu a3, 2(a0)
-; SLOWZBKB-NEXT: lbu a0, 3(a0)
-; SLOWZBKB-NEXT: packh a1, a2, a1
-; SLOWZBKB-NEXT: slli a3, a3, 16
-; SLOWZBKB-NEXT: slli a0, a0, 24
-; SLOWZBKB-NEXT: or a0, a0, a3
-; SLOWZBKB-NEXT: or a0, a0, a1
-; SLOWZBKB-NEXT: ret
+; RV32IZBKB-LABEL: load_i32:
+; RV32IZBKB: # %bb.0:
+; RV32IZBKB-NEXT: lbu a1, 3(a0)
+; RV32IZBKB-NEXT: lbu a2, 2(a0)
+; RV32IZBKB-NEXT: lbu a3, 1(a0)
+; RV32IZBKB-NEXT: lbu a0, 0(a0)
+; RV32IZBKB-NEXT: packh a1, a2, a1
+; RV32IZBKB-NEXT: packh a0, a0, a3
+; RV32IZBKB-NEXT: pack a0, a0, a1
+; RV32IZBKB-NEXT: ret
+;
+; RV64IZBKB-LABEL: load_i32:
+; RV64IZBKB: # %bb.0:
+; RV64IZBKB-NEXT: lbu a1, 1(a0)
+; RV64IZBKB-NEXT: lbu a2, 0(a0)
+; RV64IZBKB-NEXT: lbu a3, 2(a0)
+; RV64IZBKB-NEXT: lbu a0, 3(a0)
+; RV64IZBKB-NEXT: packh a1, a2, a1
+; RV64IZBKB-NEXT: slli a3, a3, 16
+; RV64IZBKB-NEXT: slli a0, a0, 24
+; RV64IZBKB-NEXT: or a0, a0, a3
+; RV64IZBKB-NEXT: or a0, a0, a1
+; RV64IZBKB-NEXT: ret
;
; FAST-LABEL: load_i32:
; FAST: # %bb.0:
@@ -172,45 +185,39 @@ define i64 @load_i64(ptr %p) {
;
; RV32IZBKB-LABEL: load_i64:
; RV32IZBKB: # %bb.0:
-; RV32IZBKB-NEXT: lbu a1, 1(a0)
-; RV32IZBKB-NEXT: lbu a2, 0(a0)
-; RV32IZBKB-NEXT: lbu a3, 2(a0)
-; RV32IZBKB-NEXT: lbu a4, 3(a0)
+; RV32IZBKB-NEXT: lbu a1, 3(a0)
+; RV32IZBKB-NEXT: lbu a2, 2(a0)
; RV32IZBKB-NEXT: packh a1, a2, a1
-; RV32IZBKB-NEXT: slli a3, a3, 16
-; RV32IZBKB-NEXT: slli a4, a4, 24
-; RV32IZBKB-NEXT: or a3, a4, a3
-; RV32IZBKB-NEXT: lbu a2, 5(a0)
-; RV32IZBKB-NEXT: lbu a4, 4(a0)
+; RV32IZBKB-NEXT: lbu a2, 1(a0)
+; RV32IZBKB-NEXT: lbu a3, 0(a0)
+; RV32IZBKB-NEXT: lbu a4, 7(a0)
; RV32IZBKB-NEXT: lbu a5, 6(a0)
-; RV32IZBKB-NEXT: lbu a6, 7(a0)
-; RV32IZBKB-NEXT: or a0, a3, a1
-; RV32IZBKB-NEXT: packh a1, a4, a2
-; RV32IZBKB-NEXT: slli a5, a5, 16
-; RV32IZBKB-NEXT: slli a6, a6, 24
-; RV32IZBKB-NEXT: or a2, a6, a5
-; RV32IZBKB-NEXT: or a1, a2, a1
+; RV32IZBKB-NEXT: lbu a6, 5(a0)
+; RV32IZBKB-NEXT: lbu a7, 4(a0)
+; RV32IZBKB-NEXT: packh a0, a3, a2
+; RV32IZBKB-NEXT: pack a0, a0, a1
+; RV32IZBKB-NEXT: packh a1, a5, a4
+; RV32IZBKB-NEXT: packh a2, a7, a6
+; RV32IZBKB-NEXT: pack a1, a2, a1
; RV32IZBKB-NEXT: ret
;
; RV64IZBKB-LABEL: load_i64:
; RV64IZBKB: # %bb.0:
; RV64IZBKB-NEXT: lbu a1, 5(a0)
; RV64IZBKB-NEXT: lbu a2, 4(a0)
-; RV64IZBKB-NEXT: lbu a3, 6(a0)
-; RV64IZBKB-NEXT: lbu a4, 7(a0)
+; RV64IZBKB-NEXT: lbu a3, 7(a0)
+; RV64IZBKB-NEXT: lbu a4, 6(a0)
; RV64IZBKB-NEXT: packh a1, a2, a1
-; RV64IZBKB-NEXT: slli a3, a3, 16
-; RV64IZBKB-NEXT: slli a4, a4, 24
-; RV64IZBKB-NEXT: or a3, a4, a3
-; RV64IZBKB-NEXT: lbu a2, 1(a0)
+; RV64IZBKB-NEXT: packh a2, a4, a3
+; RV64IZBKB-NEXT: lbu a3, 1(a0)
; RV64IZBKB-NEXT: lbu a4, 0(a0)
-; RV64IZBKB-NEXT: lbu a5, 2(a0)
-; RV64IZBKB-NEXT: lbu a0, 3(a0)
-; RV64IZBKB-NEXT: or a1, a3, a1
-; RV64IZBKB-NEXT: packh a2, a4, a2
-; RV64IZBKB-NEXT: slli a5, a5, 16
-; RV64IZBKB-NEXT: slli a0, a0, 24
-; RV64IZBKB-NEXT: or a0, a0, a5
+; RV64IZBKB-NEXT: lbu a5, 3(a0)
+; RV64IZBKB-NEXT: lbu a0, 2(a0)
+; RV64IZBKB-NEXT: slli a2, a2, 16
+; RV64IZBKB-NEXT: or a1, a2, a1
+; RV64IZBKB-NEXT: packh a2, a4, a3
+; RV64IZBKB-NEXT: packh a0, a0, a5
+; RV64IZBKB-NEXT: slli a0, a0, 16
; RV64IZBKB-NEXT: or a0, a0, a2
; RV64IZBKB-NEXT: pack a0, a0, a1
; RV64IZBKB-NEXT: ret
@@ -574,3 +581,5 @@ define void @store_large_constant(ptr %x) {
store i64 18364758544493064720, ptr %x, align 1
ret void
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; SLOWZBKB: {{.*}}
LGTM
Force-pushed from 22719f3 to c1f7df1.