Skip to content

Commit 6409bf5

Browse files
committed
[GlobalISel] Fix buildCopyFromRegs for split vectors
Fixes #77055
1 parent 839435c commit 6409bf5

File tree

2 files changed

+127
-3
lines changed

2 files changed

+127
-3
lines changed

llvm/lib/CodeGen/GlobalISel/CallLowering.cpp

Lines changed: 31 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -478,9 +478,37 @@ static void buildCopyFromRegs(MachineIRBuilder &B, ArrayRef<Register> OrigRegs,
478478
} else {
479479
// Vector was split, and elements promoted to a wider type.
480480
// FIXME: Should handle floating point promotions.
481-
LLT BVType = LLT::fixed_vector(LLTy.getNumElements(), PartLLT);
482-
auto BV = B.buildBuildVector(BVType, Regs);
483-
B.buildTrunc(OrigRegs[0], BV);
481+
unsigned NumElts = LLTy.getNumElements();
482+
LLT BVType = LLT::fixed_vector(NumElts, PartLLT);
483+
484+
Register BuildVec;
485+
if (NumElts == Regs.size())
486+
BuildVec = B.buildBuildVector(BVType, Regs).getReg(0);
487+
else {
488+
SmallVector<Register, 0> BVRegs;
489+
BVRegs.reserve(NumElts);
490+
491+
// Vector elements are packed in the inputs.
492+
// e.g. we have a <4 x s16> but 2 x s32 in regs.
493+
assert(NumElts > Regs.size());
494+
LLT SrcEltTy = MRI.getType(Regs[0]);
495+
LLT OriginalEltTy = MRI.getType(OrigRegs[0]).getElementType();
496+
497+
// Input registers contain packed elements.
498+
// Determine how many elements per reg.
499+
assert((SrcEltTy.getSizeInBits() % OriginalEltTy.getSizeInBits()) == 0);
500+
unsigned EltPerReg =
501+
(SrcEltTy.getSizeInBits() / OriginalEltTy.getSizeInBits());
502+
503+
for (Register R : Regs) {
504+
auto Unmerge = B.buildUnmerge(OriginalEltTy, R);
505+
for (unsigned K = 0; K < EltPerReg; ++K)
506+
BVRegs.push_back(B.buildAnyExt(PartLLT, Unmerge.getReg(K)).getReg(0));
507+
}
508+
assert(BVRegs.size() == NumElts);
509+
BuildVec = B.buildBuildVector(BVType, BVRegs).getReg(0);
510+
}
511+
B.buildTrunc(OrigRegs[0], BuildVec);
484512
}
485513
}
486514

Lines changed: 96 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,96 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc < %s -global-isel -mtriple=amdgcn | FileCheck %s -check-prefixes=GCN
3+
; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=hawaii | FileCheck %s -check-prefixes=GFX7
4+
; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=tonga | FileCheck %s -check-prefixes=GFX8
5+
; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s -check-prefixes=GFX9
6+
; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefixes=GFX10
7+
; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefix=GFX11
8+
; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefix=GFX11
9+
10+
; TODO: expand testcases - currently only contains cases that were known to crash.
11+
12+
; assert in IRTranslator, #77055
13+
define <4 x bfloat> @v4bf16(<4 x bfloat> %arg0) {
14+
; GCN-LABEL: v4bf16:
15+
; GCN: ; %bb.0:
16+
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17+
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
18+
; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
19+
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v3
20+
; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
21+
; GCN-NEXT: v_or_b32_e32 v3, v1, v0
22+
; GCN-NEXT: v_or_b32_e32 v2, v4, v2
23+
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v2
24+
; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v3
25+
; GCN-NEXT: s_setpc_b64 s[30:31]
26+
;
27+
; GFX7-LABEL: v4bf16:
28+
; GFX7: ; %bb.0:
29+
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30+
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
31+
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
32+
; GFX7-NEXT: v_or_b32_e32 v4, v1, v0
33+
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3
34+
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2
35+
; GFX7-NEXT: v_or_b32_e32 v2, v0, v1
36+
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v2
37+
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v4
38+
; GFX7-NEXT: v_mov_b32_e32 v3, v4
39+
; GFX7-NEXT: s_setpc_b64 s[30:31]
40+
;
41+
; GFX8-LABEL: v4bf16:
42+
; GFX8: ; %bb.0:
43+
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
44+
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v1
45+
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
46+
; GFX8-NEXT: v_mov_b32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
47+
; GFX8-NEXT: v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
48+
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1
49+
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0
50+
; GFX8-NEXT: v_mov_b32_e32 v0, v2
51+
; GFX8-NEXT: s_setpc_b64 s[30:31]
52+
;
53+
; GFX9-LABEL: v4bf16:
54+
; GFX9: ; %bb.0:
55+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
56+
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v1
57+
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
58+
; GFX9-NEXT: v_mov_b32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
59+
; GFX9-NEXT: v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
60+
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1
61+
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
62+
; GFX9-NEXT: v_mov_b32_e32 v0, v2
63+
; GFX9-NEXT: s_setpc_b64 s[30:31]
64+
;
65+
; GFX10-LABEL: v4bf16:
66+
; GFX10: ; %bb.0:
67+
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
68+
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1
69+
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v0
70+
; GFX10-NEXT: v_mov_b32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
71+
; GFX10-NEXT: v_mov_b32_sdwa v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
72+
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1
73+
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
74+
; GFX10-NEXT: v_mov_b32_e32 v0, v2
75+
; GFX10-NEXT: s_setpc_b64 s[30:31]
76+
;
77+
; GFX11-LABEL: v4bf16:
78+
; GFX11: ; %bb.0:
79+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
80+
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
81+
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
82+
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
83+
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
84+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
85+
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
86+
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
87+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
88+
; GFX11-NEXT: v_or_b32_e32 v1, v2, v1
89+
; GFX11-NEXT: v_or_b32_e32 v2, v3, v0
90+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
91+
; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1
92+
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2
93+
; GFX11-NEXT: s_setpc_b64 s[30:31]
94+
%res = shufflevector <4 x bfloat> %arg0, <4 x bfloat> zeroinitializer, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
95+
ret <4 x bfloat> %res
96+
}

0 commit comments

Comments
 (0)