-
Notifications
You must be signed in to change notification settings - Fork 14.3k
PeepholeOpt: Do not add subregister indexes to reg_sequence operands #124111
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
PeepholeOpt: Do not add subregister indexes to reg_sequence operands #124111
Conversation
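@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-globalisel

Author: Matt Arsenault (arsenm)

Changes

Given the rest of the pass just gives up when it needs to compose subregisters, folding a subregister extract directly into a reg_sequence is counterproductive. Later fold attempts in the function will give up on the subregister operand, preventing looking up through the reg_sequence.

It may still be profitable to do these folds if we start handling the composes. There are some test regressions, but this mostly looks better.

Patch is 301.08 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/124111.diff

29 Files Affected: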
diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp
index 48c25d5039bfd4..af4f2dc49b690b 100644
--- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp
@@ -436,6 +436,12 @@ class RegSequenceRewriter : public Rewriter {
if ((CurrentSrcIdx & 1) != 1 || CurrentSrcIdx > CopyLike.getNumOperands())
return false;
+ // Do not introduce new subregister uses in a reg_sequence. Until composing
+ // subregister indices is supported while folding, we're just blocking
+ // folding of subregister copies later in the function.
+ if (NewSubReg)
+ return false;
+
MachineOperand &MO = CopyLike.getOperand(CurrentSrcIdx);
MO.setReg(NewReg);
MO.setSubReg(NewSubReg);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index 756eb2788607bf..b92d9c74342748 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -2074,208 +2074,208 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX7-LABEL: v_mul_i256:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v16, v0
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v16, v14, 0
-; GFX7-NEXT: v_mov_b32_e32 v17, v1
-; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v13, v[18:19]
-; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v16, v12, 0
-; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v9, v[0:1]
-; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5]
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v2, v12, v[18:19]
-; GFX7-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v8, v[0:1]
-; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[18:19]
-; GFX7-NEXT: v_addc_u32_e32 v25, vcc, 0, v24, vcc
-; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1]
-; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v17, v11, v[20:21]
-; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[4:5]
-; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1]
-; GFX7-NEXT: v_mad_u64_u32 v[19:20], vcc, v2, v10, v[19:20]
-; GFX7-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc
-; GFX7-NEXT: v_mad_u64_u32 v[19:20], vcc, v3, v9, v[19:20]
-; GFX7-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc
-; GFX7-NEXT: v_mov_b32_e32 v18, v23
-; GFX7-NEXT: v_mad_u64_u32 v[19:20], vcc, v4, v8, v[19:20]
-; GFX7-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v6, v8, v[0:1]
-; GFX7-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc
-; GFX7-NEXT: v_mov_b32_e32 v0, v20
-; GFX7-NEXT: v_mov_b32_e32 v1, v23
-; GFX7-NEXT: v_mad_u64_u32 v[0:1], vcc, v16, v13, v[0:1]
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v16, v11, v[18:19]
-; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v12, v[0:1]
-; GFX7-NEXT: v_mul_lo_u32 v20, v6, v9
-; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[8:9]
-; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v2, v11, v[0:1]
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v17, v10, v[18:19]
-; GFX7-NEXT: v_mul_lo_u32 v23, v5, v10
-; GFX7-NEXT: v_mul_lo_u32 v26, v4, v11
-; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[10:11], v3, v10, v[0:1]
-; GFX7-NEXT: v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9]
-; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v16, v8, 0
+; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v0, v14, 0
+; GFX7-NEXT: v_mov_b32_e32 v16, v1
+; GFX7-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v10, 0
+; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v16, v13, v[19:20]
+; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v0, v12, 0
+; GFX7-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v16, v9, v[17:18]
+; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v2, v12, v[19:20]
+; GFX7-NEXT: v_mad_u64_u32 v[17:18], vcc, v2, v8, v[17:18]
+; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v3, v11, v[19:20]
+; GFX7-NEXT: v_addc_u32_e32 v24, vcc, 0, v1, vcc
+; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v4, v10, v[19:20]
+; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v16, v11, v[21:22]
+; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[6:7], v5, v9, v[19:20]
+; GFX7-NEXT: v_mad_u64_u32 v[21:22], vcc, v2, v10, v[21:22]
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT: v_mad_u64_u32 v[21:22], vcc, v3, v9, v[21:22]
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v8, v[19:20]
+; GFX7-NEXT: v_mad_u64_u32 v[21:22], vcc, v4, v8, v[21:22]
+; GFX7-NEXT: v_addc_u32_e32 v25, vcc, 0, v1, vcc
+; GFX7-NEXT: v_mov_b32_e32 v23, v19
+; GFX7-NEXT: v_mad_u64_u32 v[22:23], vcc, v0, v13, v[22:23]
+; GFX7-NEXT: v_mov_b32_e32 v19, v21
+; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v0, v11, v[18:19]
+; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v16, v12, v[22:23]
+; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[8:9]
+; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v16, v10, v[18:19]
+; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[6:7], v2, v11, v[21:22]
+; GFX7-NEXT: v_mul_lo_u32 v27, v5, v10
+; GFX7-NEXT: v_mul_lo_u32 v23, v4, v11
+; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v3, v10, v[21:22]
+; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v0, v8, 0
+; GFX7-NEXT: v_mul_lo_u32 v26, v6, v9
+; GFX7-NEXT: v_addc_u32_e64 v6, s[8:9], 0, v1, s[8:9]
; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v2, v9, v[18:19]
+; GFX7-NEXT: v_mul_lo_u32 v28, v3, v12
; GFX7-NEXT: v_mul_lo_u32 v13, v2, v13
-; GFX7-NEXT: v_mov_b32_e32 v2, v22
-; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v4, v9, v[10:11]
-; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[14:15], v16, v9, v[1:2]
+; GFX7-NEXT: v_mov_b32_e32 v2, v17
+; GFX7-NEXT: v_mov_b32_e32 v1, v11
+; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[12:13], v4, v9, v[21:22]
+; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[14:15], v0, v9, v[1:2]
; GFX7-NEXT: v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9]
-; GFX7-NEXT: v_mul_lo_u32 v12, v3, v12
; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[8:9], v3, v8, v[18:19]
; GFX7-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15]
-; GFX7-NEXT: v_addc_u32_e64 v18, s[8:9], 0, v6, s[8:9]
-; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[10:11]
-; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v17, v8, v[1:2]
+; GFX7-NEXT: v_addc_u32_e64 v17, s[8:9], 0, v6, s[8:9]
+; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[11:12]
+; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v16, v8, v[1:2]
; GFX7-NEXT: v_addc_u32_e64 v3, s[8:9], v9, v3, s[8:9]
-; GFX7-NEXT: v_mul_lo_u32 v10, v16, v15
-; GFX7-NEXT: v_mul_lo_u32 v9, v17, v14
-; GFX7-NEXT: v_addc_u32_e64 v4, s[8:9], v25, v4, s[8:9]
-; GFX7-NEXT: v_addc_u32_e64 v5, s[8:9], v18, v5, s[8:9]
-; GFX7-NEXT: v_addc_u32_e64 v6, s[8:9], v21, v6, s[8:9]
-; GFX7-NEXT: v_addc_u32_e64 v10, s[8:9], v24, v10, s[8:9]
-; GFX7-NEXT: v_addc_u32_e64 v9, s[8:9], v10, v9, s[14:15]
-; GFX7-NEXT: v_addc_u32_e64 v9, s[8:9], v9, v13, s[12:13]
-; GFX7-NEXT: v_addc_u32_e64 v9, s[8:9], v9, v12, s[10:11]
-; GFX7-NEXT: v_addc_u32_e64 v9, s[6:7], v9, v26, s[6:7]
-; GFX7-NEXT: v_addc_u32_e64 v9, s[4:5], v9, v23, s[4:5]
-; GFX7-NEXT: v_addc_u32_e32 v9, vcc, v9, v20, vcc
-; GFX7-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10]
+; GFX7-NEXT: v_mul_lo_u32 v0, v0, v15
+; GFX7-NEXT: v_mul_lo_u32 v9, v16, v14
+; GFX7-NEXT: v_addc_u32_e64 v4, s[8:9], v24, v4, s[8:9]
+; GFX7-NEXT: v_addc_u32_e64 v5, s[8:9], v17, v5, s[8:9]
+; GFX7-NEXT: v_addc_u32_e64 v6, s[8:9], v25, v6, s[8:9]
+; GFX7-NEXT: v_addc_u32_e64 v0, s[8:9], v20, v0, s[8:9]
+; GFX7-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v9, s[14:15]
+; GFX7-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v13, s[12:13]
+; GFX7-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v28, s[10:11]
+; GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v23, s[6:7]
+; GFX7-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5]
+; GFX7-NEXT: v_addc_u32_e32 v0, vcc, v0, v26, vcc
+; GFX7-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, v10
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i256:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v16, v0
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v16, v14, 0
-; GFX8-NEXT: v_mov_b32_e32 v17, v1
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v13, v[18:19]
-; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v16, v12, 0
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v9, v[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5]
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v2, v12, v[18:19]
-; GFX8-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v8, v[0:1]
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[18:19]
-; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v24, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1]
-; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v17, v11, v[20:21]
-; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[4:5]
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1]
-; GFX8-NEXT: v_mad_u64_u32 v[19:20], vcc, v2, v10, v[19:20]
-; GFX8-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[19:20], vcc, v3, v9, v[19:20]
-; GFX8-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc
-; GFX8-NEXT: v_mov_b32_e32 v18, v23
-; GFX8-NEXT: v_mad_u64_u32 v[19:20], vcc, v4, v8, v[19:20]
-; GFX8-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v6, v8, v[0:1]
-; GFX8-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc
-; GFX8-NEXT: v_mov_b32_e32 v0, v20
-; GFX8-NEXT: v_mov_b32_e32 v1, v23
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], vcc, v16, v13, v[0:1]
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v16, v11, v[18:19]
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v12, v[0:1]
-; GFX8-NEXT: v_mul_lo_u32 v20, v6, v9
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[8:9]
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v2, v11, v[0:1]
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v17, v10, v[18:19]
-; GFX8-NEXT: v_mul_lo_u32 v23, v5, v10
-; GFX8-NEXT: v_mul_lo_u32 v26, v4, v11
-; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[10:11], v3, v10, v[0:1]
-; GFX8-NEXT: v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9]
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v16, v8, 0
+; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v0, v14, 0
+; GFX8-NEXT: v_mov_b32_e32 v16, v1
+; GFX8-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v10, 0
+; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v16, v13, v[19:20]
+; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v0, v12, 0
+; GFX8-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v16, v9, v[17:18]
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v2, v12, v[19:20]
+; GFX8-NEXT: v_mad_u64_u32 v[17:18], vcc, v2, v8, v[17:18]
+; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v3, v11, v[19:20]
+; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v4, v10, v[19:20]
+; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v16, v11, v[21:22]
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[6:7], v5, v9, v[19:20]
+; GFX8-NEXT: v_mad_u64_u32 v[21:22], vcc, v2, v10, v[21:22]
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[21:22], vcc, v3, v9, v[21:22]
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v8, v[19:20]
+; GFX8-NEXT: v_mad_u64_u32 v[21:22], vcc, v4, v8, v[21:22]
+; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v23, v19
+; GFX8-NEXT: v_mad_u64_u32 v[22:23], vcc, v0, v13, v[22:23]
+; GFX8-NEXT: v_mov_b32_e32 v19, v21
+; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v0, v11, v[18:19]
+; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v16, v12, v[22:23]
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[8:9]
+; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v16, v10, v[18:19]
+; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[6:7], v2, v11, v[21:22]
+; GFX8-NEXT: v_mul_lo_u32 v27, v5, v10
+; GFX8-NEXT: v_mul_lo_u32 v23, v4, v11
+; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v3, v10, v[21:22]
+; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v0, v8, 0
+; GFX8-NEXT: v_mul_lo_u32 v26, v6, v9
+; GFX8-NEXT: v_addc_u32_e64 v6, s[8:9], 0, v1, s[8:9]
; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v2, v9, v[18:19]
+; GFX8-NEXT: v_mul_lo_u32 v28, v3, v12
; GFX8-NEXT: v_mul_lo_u32 v13, v2, v13
-; GFX8-NEXT: v_mov_b32_e32 v2, v22
-; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v4, v9, v[10:11]
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[14:15], v16, v9, v[1:2]
+; GFX8-NEXT: v_mov_b32_e32 v2, v17
+; GFX8-NEXT: v_mov_b32_e32 v1, v11
+; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[12:13], v4, v9, v[21:22]
+; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[14:15], v0, v9, v[1:2]
; GFX8-NEXT: v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9]
-; GFX8-NEXT: v_mul_lo_u32 v12, v3, v12
; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[8:9], v3, v8, v[18:19]
; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15]
-; GFX8-NEXT: v_addc_u32_e64 v18, s[8:9], 0, v6, s[8:9]
-; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[10:11]
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v17, v8, v[1:2]
+; GFX8-NEXT: v_addc_u32_e64 v17, s[8:9], 0, v6, s[8:9]
+; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[11:12]
+; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v16, v8, v[1:2]
; GFX8-NEXT: v_addc_u32_e64 v3, s[8:9], v9, v3, s[8:9]
-; GFX8-NEXT: v_mul_lo_u32 v10, v16, v15
-; GFX8-NEXT: v_mul_lo_u32 v9, v17, v14
-; GFX8-NEXT: v_addc_u32_e64 v4, s[8:9], v25, v4, s[8:9]
-; GFX8-NEXT: v_addc_u32_e64 v5, s[8:9], v18, v5, s[8:9]
-; GFX8-NEXT: v_addc_u32_e64 v6, s[8:9], v21, v6, s[8:9]
-; GFX8-NEXT: v_addc_u32_e64 v10, s[8:9], v24, v10, s[8:9]
-; GFX8-NEXT: v_addc_u32_e64 v9, s[8:9], v10, v9, s[14:15]
-; GFX8-NEXT: v_addc_u32_e64 v9, s[8:9], v9, v13, s[12:13]
-; GFX8-NEXT: v_addc_u32_e64 v9, s[8:9], v9, v12, s[10:11]
-; GFX8-NEXT: v_addc_u32_e64 v9, s[6:7], v9, v26, s[6:7]
-; GFX8-NEXT: v_addc_u32_e64 v9, s[4:5], v9, v23, s[4:5]
-; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v9, v20, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10]
+; GFX8-NEXT: v_mul_lo_u32 v0, v0, v15
+; GFX8-NEXT: v_mul_lo_u32 v9, v16, v14
+; GFX8-NEXT: v_addc_u32_e64 v4, s[8:9], v24, v4, s[8:9]
+; GFX8-NEXT: v_addc_u32_e64 v5, s[8:9], v17, v5, s[8:9]
+; GFX8-NEXT: v_addc_u32_e64 v6, s[8:9], v25, v6, s[8:9]
+; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v20, v0, s[8:9]
+; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v9, s[14:15]
+; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v13, s[12:13]
+; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v28, s[10:11]
+; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v23, s[6:7]
+; GFX8-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5]
+; GFX8-NEXT: v_addc_u32_e32 v0, vcc, v0, v26, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, v10
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i256:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v16, v0
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v16, v14, 0
-; GFX9-NEXT: v_mov_b32_e32 v17, v1
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v13, v[18:19]
-; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v16, v12, 0
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v9, v[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5]
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v2, v12, v[18:19]
-; GFX9-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v8, v[0:1]
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[18:19]
-; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v24, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1]
-; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v17, v11, v[20:21]
-; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[4:5]
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1]
-; GFX9-NEXT: v_mad_u64_u32 v[19:20], vcc, v2, v10, v[19:20]
-; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[19:20], vcc, v3, v9, v[19:20]
-; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc
-; GFX9-NEXT: v_mov_b32_e32 v18, v23
-; GFX9-NEXT: v_mad_u64_u32 v[19:20], vcc, v4, v8, v[19:20]
-; GFX9-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v6, v8, v[0:1]
-; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc
-; GFX9-NEXT: v_mov_b32_e32 v0, v20
-; GFX9-NEXT: v_mov_b32_e32 v1, v23
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], vcc, v16, v13, v[0:1]
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v16, v11, v[18:19]
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v12, v[0:1]
-; GFX9-NEXT: v_mul_lo_u32 v20, v6, v9
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[8:9]
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v2, v11, v[0:1]
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v17, v10, v[18:19]
-; GFX9-NEXT: v_mul_lo_u32 v23, v5, v10
-; GFX9-NEXT: v_mul_lo_u32 v26, v4, v11
-; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[10:11], v3, v10, v[0:1]
-; GFX9-NEXT: v_addc_co_u32_e64 v6, s[8:9], 0, v6, s[8:9]
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v16, v8, 0
+; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v0, v14, 0
+; GFX9-NEXT: v_mov_b32_e32 v16, v1
+; GFX9-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v10, 0
+; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v16, v13, v[19:20]
+; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v0, v12, 0
+; GFX9-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v16, v9, v[17:18]
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v2, v12, v[19:20]
+; GFX9-NEXT: v_mad_u64_u32 v[17:18], vcc, v2, v8, v[17:18]
+; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v3, v11, v[19:20]
+; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, 0, v1, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v4, v10, v[19:20]
+; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v16, v11, v[21:22]
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[6:7], v5, v9, v[19:20]
+; GFX9-NEXT: v_mad_u64_u32 v[21:22], vcc, v2, v10, v[21:22]
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[21:22], vcc, v3, v9, v[21:22]
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v8, v[19:20]
+; GFX9-NEXT: v_mad_u64_u32 v[21:22], vcc, v4, v8, v[21:22]
+; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v23, v19
+; GFX9-NEXT: v_mad_u64_u32 v[22:23], vcc, v0, v13, v[22:23]
+; GFX9-NEXT: v_mov_b32_e32 v19, v21
+; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v0, v11, v[18:19]
+; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v16, v12, v[22:23]
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[8:9]
+; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v16, v10, v[18:19]
+; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[6:7], v2, v11, v[21:22]
+; GFX9-NEXT: v_mul_lo_u32 v27, v5, v10
+; GFX9-NEXT: v_mul_lo_u32 v23, v4, v11
+; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v3, v10, v[21:22]
+; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v0, v8, 0
+; GFX9-NEXT: v_mul_lo_u32 v26, v6, v9
+; GFX9-NEXT: v_addc_co_u32_e64 v6, s[8:9], 0, v1, s[8:9]
; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v2, v9, v[18:19]
+; GFX9-NEXT: v_mul_lo_u32 v28, v3, v12
; GFX9-NEXT: v_mul_lo_u32 v13, v2, v13
-; GFX9-NEXT: v_mov_b32_e32 v2, v22
-; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v4, v9, v[10:11]
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[14:15], v16, v9, v[1:2]
+; GFX9-NEXT: v_mov_b32_e32 v2, v17
+; GFX9-NEXT: v_mov_b32_e32 v1, v11
+; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[12:13], v4, v9, v[21:22]
+; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[14:15], v0, v9, v[1:2]
; GFX9-NEXT: v_addc_co_u32_e64 v6, s[8:9], 0, v6, s[8:9]
-; GFX9-NEXT: v_mul_lo_u32 v12, v3, v12
; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[8:9], v3, v8, v[18:19]
; GFX9-NEXT...
[truncated]
|
ab3d495
to
e39e8c0
Compare
e39e8c0
to
3ff3cb7
Compare
ping |
Given the rest of the pass just gives up when it needs to compose subregisters, folding a subregister extract directly into a reg_sequence is counterproductive. Later fold attempts in the function will give up on the subregister operand, preventing looking up through the reg_sequence. It may still be profitable to do these folds if we start handling the composes. There are some test regressions, but this mostly looks better.
3ff3cb7
to
d1e1edd
Compare
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/60/builds/18400. Here is the relevant piece of the build log for reference:
|
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/56/builds/17542. Here is the relevant piece of the build log for reference:
|
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/140/builds/15934. Here is the relevant piece of the build log for reference:
|
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/175/builds/12415. Here is the relevant piece of the build log for reference:
|
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/137/builds/12563. Here is the relevant piece of the build log for reference:
|
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/185/builds/12389. Here is the relevant piece of the build log for reference:
|
breaks tests: http://45.33.8.238/linux/158957/step_11.txt |
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/174/builds/12286. Here is the relevant piece of the build log for reference:
|
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/16/builds/12986. Here is the relevant piece of the build log for reference:
|
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/190/builds/13794. Here is the relevant piece of the build log for reference:
|
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/108/builds/8751. Here is the relevant piece of the build log for reference:
|
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/33/builds/10567. Here is the relevant piece of the build log for reference:
|
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/85/builds/4957. Here is the relevant piece of the build log for reference:
|
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/153/builds/21437. Here is the relevant piece of the build log for reference:
|
Given the rest of the pass just gives up when it needs to compose
subregisters, folding a subregister extract directly into a reg_sequence
is counterproductive. Later fold attempts in the function will give up
on the subregister operand, preventing looking up through the reg_sequence.
It may still be profitable to do these folds if we start handling
the composes. There are some test regressions, but this mostly
looks better.