-
Notifications
You must be signed in to change notification settings - Fork 14.3k
Reapply "[AMDGPU][GlobalISel] Properly handle lane op lowering for larger vector types (#132358)" #135758
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Reapply "[AMDGPU][GlobalISel] Properly handle lane op lowering for larger vector types (llvm#132358)"
@llvm/pr-subscribers-backend-amdgpu Author: Vikram Hegde (vikramRH) Changes: reapply #132358, tests updated. Patch is 137.24 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/135758.diff 6 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 275d0193452a5..5fcbf810abcbd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5580,6 +5580,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
return false;
LLT PartialResTy = LLT::scalar(SplitSize);
+ bool NeedsBitcast = false;
if (Ty.isVector()) {
LLT EltTy = Ty.getElementType();
unsigned EltSize = EltTy.getSizeInBits();
@@ -5588,8 +5589,10 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
} else if (EltSize == 16 || EltSize == 32) {
unsigned NElem = SplitSize / EltSize;
PartialResTy = Ty.changeElementCount(ElementCount::getFixed(NElem));
+ } else {
+ // Handle all other cases via S32/S64 pieces
+ NeedsBitcast = true;
}
- // Handle all other cases via S32/S64 pieces;
}
SmallVector<Register, 4> PartialRes;
@@ -5615,7 +5618,12 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
}
- B.buildMergeLikeInstr(DstReg, PartialRes);
+ if (NeedsBitcast)
+ B.buildBitcast(DstReg, B.buildMergeLikeInstr(
+ LLT::scalar(Ty.getSizeInBits()), PartialRes));
+ else
+ B.buildMergeLikeInstr(DstReg, PartialRes);
+
MI.eraseFromParent();
return true;
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
index db557ff23c085..693e0ebd0280c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
@@ -9398,3 +9398,1015 @@ define void @v_permlanex16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %sr
store <8 x i16> %v, ptr addrspace(1) %out
ret void
}
+
+define void @v_permlane16_v2i64(ptr addrspace(1) %out, <2 x i64> %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlane16_v2i64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v6
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v7
+; GFX10-SDAG-NEXT: v_permlane16_b32 v5, v5, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v4, v4, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v3, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5
+; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: v_permlane16_v2i64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v6
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v7
+; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v2, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v3, v3, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v4, v4, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v5, v5, s4, s5
+; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_permlane16_v2i64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v6
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v7
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: v_permlane16_v2i64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v6
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v7
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1
+; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: v_permlane16_v2i64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v6
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v7
+; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
+; GFX12-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: v_permlane16_v2i64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v6
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v7
+; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff
+; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1
+; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %v = call <2 x i64> @llvm.amdgcn.permlane16.v2i64(<2 x i64> %src0, <2 x i64> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store <2 x i64> %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @v_permlane16_v3i64(ptr addrspace(1) %out, <3 x i64> %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlane16_v3i64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v8
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v9
+; GFX10-SDAG-NEXT: v_permlane16_b32 v7, v7, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v6, v6, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v5, v5, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v4, v4, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v3, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5
+; GFX10-SDAG-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16
+; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: v_permlane16_v3i64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v8
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v9
+; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v2, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v3, v3, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v4, v4, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v5, v5, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v6, v6, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v7, v7, s4, s5
+; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; GFX10-GISEL-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_permlane16_v3i64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v8
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v9
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16
+; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: v_permlane16_v3i64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v8
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v9
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v6, v6, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v7, v7, s0, s1
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: v_permlane16_v3i64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v8
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v9
+; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
+; GFX12-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16
+; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: v_permlane16_v3i64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v8
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v9
+; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff
+; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v6, v6, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v7, v7, s0, s1
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %v = call <3 x i64> @llvm.amdgcn.permlane16.v3i64(<3 x i64> %src0, <3 x i64> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store <3 x i64> %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @v_permlane16_v4f64(ptr addrspace(1) %out, <4 x double> %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlane16_v4f64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v10
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v11
+; GFX10-SDAG-NEXT: v_permlane16_b32 v9, v9, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v8, v8, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v7, v7, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v6, v6, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v5, v5, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v4, v4, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v3, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5
+; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16
+; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: v_permlane16_v4f64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v10
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v11
+; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v2, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v3, v3, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v4, v4, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v5, v5, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v6, v6, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v7, v7, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v8, v8, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v9, v9, s4, s5
+; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_permlane16_v4f64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v10
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v11
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v9, v9, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v8, v8, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX11-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
+; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: v_permlane16_v4f64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v10
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v11
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v6, v6, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v7, v7, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v8, v8, s0, s1
+; GFX11-GISEL-NEXT: v_permlane16_b32 v9, v9, s0, s1
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: v_permlane16_v4f64:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v10
+; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v11
+; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
+; GFX12-SDAG-NEXT: v_permlane16_b32 v9, v9, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v8, v8, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
+; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: v_permlane16_v4f64:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v10
+; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v11
+; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff
+; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v6, v6, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v7, v7, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v8, v8, s0, s1
+; GFX12-GISEL-NEXT: v_permlane16_b32 v9, v9, s0, s1
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %v = call <4 x double> @llvm.amdgcn.permlane16.v4f64(<4 x double> %src0, <4 x double> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store <4 x double> %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @v_permlane16_v8f64(ptr addrspace(1) %out, <8 x double> %src0, i32 %src1, i32 %src2) {
+; GFX10-SDAG-LABEL: v_permlane16_v8f64:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s4, v18
+; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v19
+; GFX10-SDAG-NEXT: v_permlane16_b32 v17, v17, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v16, v16, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v15, v15, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v14, v14, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v13, v13, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v12, v12, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v11, v11, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v10, v10, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v9, v9, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v8, v8, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v7, v7, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v6, v6, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v5, v5, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v4, v4, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v3, v3, s4, s5
+; GFX10-SDAG-NEXT: v_permlane16_b32 v2, v2, s4, s5
+; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:48
+; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:32
+; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16
+; GFX10-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: v_permlane16_v8f64:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v18
+; GFX10-GISEL-NEXT: v_readfirstlane_b32 s5, v19
+; GFX10-GISEL-NEXT: v_permlane16_b32 v2, v2, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v3, v3, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v4, v4, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v5, v5, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v6, v6, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v7, v7, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v8, v8, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v9, v9, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v10, v10, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v11, v11, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v12, v12, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v13, v13, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v14, v14, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v15, v15, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v16, v16, s4, s5
+; GFX10-GISEL-NEXT: v_permlane16_b32 v17, v17, s4, s5
+; GFX10-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; GFX10-GI...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM. check-llvm seems to be happy. Thanks for the update!
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/190/builds/18388 Here is the relevant piece of the build log for reference:
|
Reapply "[AMDGPU][GlobalISel] Properly handle lane op lowering for larger vector types (llvm#132358)" (llvm#135758): reapply llvm#132358, tests updated.
reapply #132358, tests updated.