
Commit ce72f78

[AMDGPU] Fix mul combine for MUL24 (#79110)
MUL24 can now return an i64 for i32 operands, but the combine was never updated to handle this case. Extend the operand when rewriting the ADD. Fixes SWDEV-436654.
1 parent cfb7026 commit ce72f78
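
For context, the case this fixes is a 24-bit multiply whose operands are i32 but whose result is i64. A minimal reproducer, distilled from the tests added below (the function name @mul24_wide_result is illustrative):

declare i64 @llvm.amdgcn.mul.u24(i32, i32)

define i64 @mul24_wide_result(i32 %z) {
entry:
  %c = and i32 %z, 1                                   ; keep the value within the 24-bit range
  %d = add nuw nsw i32 %c, 1                           ; the add-of-one shape the mul combine reassociates
  %f = call i64 @llvm.amdgcn.mul.u24(i32 %d, i32 %c)   ; i32 operands, i64 result
  ret i64 %f
}

Reassociating such a multiply into an ADD of the multiply and the i32 operand leaves the ADD's operands narrower than the i64 result, which is the mismatch described above.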

2 files changed, +226 -29 lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 2 additions & 5 deletions
@@ -4206,6 +4206,7 @@ static SDValue getAddOneOp(const SDNode *V) {
 
 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
+  assert(N->getOpcode() == ISD::MUL);
   EVT VT = N->getValueType(0);
 
   // Don't generate 24-bit multiplies on values that are in SGPRs, since
@@ -4254,10 +4255,6 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
     return DAG.getNode(ISD::ADD, DL, VT, MulVal, N0);
   }
 
-  // Skip if already mul24.
-  if (N->getOpcode() != ISD::MUL)
-    return SDValue();
-
   // There are i16 integer mul/mad.
   if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
     return SDValue();
@@ -5081,7 +5078,7 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
   case AMDGPUISD::MUL_I24: {
     if (SDValue Simplified = simplifyMul24(N, DCI))
       return Simplified;
-    return performMulCombine(N, DCI);
+    break;
   }
   case AMDGPUISD::MULHI_I24:
   case AMDGPUISD::MULHI_U24:

llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll

Lines changed: 224 additions & 24 deletions
@@ -338,25 +338,29 @@ define i24 @v_mul_add_1_i24_zext(i24 zeroext %x, i24 zeroext %y) {
 ; GFX67-LABEL: v_mul_add_1_i24_zext:
 ; GFX67: ; %bb.0:
 ; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-NEXT: v_mad_u32_u24 v0, v0, v1, v0
+; GFX67-NEXT: v_add_i32_e32 v1, vcc, 1, v1
+; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v1
 ; GFX67-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_1_i24_zext:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 1, v1
+; GFX8-NEXT: v_mul_u32_u24_e32 v0, v0, v1
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_1_i24_zext:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mad_u32_u24 v0, v0, v1, v0
+; GFX9-NEXT: v_add_u32_e32 v1, 1, v1
+; GFX9-NEXT: v_mul_u32_u24_e32 v0, v0, v1
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_i24_zext:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mad_u32_u24 v0, v0, v1, v0
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v1
+; GFX10-NEXT: v_mul_u32_u24_e32 v0, v0, v1
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   %add = add i24 %y, 1
   %mul = mul i24 %x, %add
@@ -429,25 +433,29 @@ define i24 @v_mul_add_1_i24_sext(i24 signext %x, i24 signext %y) {
 ; GFX67-LABEL: v_mul_add_1_i24_sext:
 ; GFX67: ; %bb.0:
 ; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-NEXT: v_mad_u32_u24 v0, v0, v1, v0
+; GFX67-NEXT: v_add_i32_e32 v1, vcc, 1, v1
+; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v1
 ; GFX67-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_1_i24_sext:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 1, v1
+; GFX8-NEXT: v_mul_u32_u24_e32 v0, v0, v1
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_1_i24_sext:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mad_u32_u24 v0, v0, v1, v0
+; GFX9-NEXT: v_add_u32_e32 v1, 1, v1
+; GFX9-NEXT: v_mul_u32_u24_e32 v0, v0, v1
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_i24_sext:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mad_u32_u24 v0, v0, v1, v0
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v1
+; GFX10-NEXT: v_mul_u32_u24_e32 v0, v0, v1
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   %add = add i24 %y, 1
   %mul = mul i24 %x, %add
@@ -2306,29 +2314,37 @@ define <2 x i24> @v_mul_add_1_v2i24(<2 x i24> %x, <2 x i24> %y) {
 ; GFX67-LABEL: v_mul_add_1_v2i24:
 ; GFX67: ; %bb.0:
 ; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-NEXT: v_mad_u32_u24 v0, v0, v2, v0
-; GFX67-NEXT: v_mad_u32_u24 v1, v1, v3, v1
+; GFX67-NEXT: v_add_i32_e32 v3, vcc, 1, v3
+; GFX67-NEXT: v_add_i32_e32 v2, vcc, 1, v2
+; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v2
+; GFX67-NEXT: v_mul_u32_u24_e32 v1, v1, v3
 ; GFX67-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_1_v2i24:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, v2, v0
-; GFX8-NEXT: v_mad_u32_u24 v1, v1, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v2
+; GFX8-NEXT: v_mul_u32_u24_e32 v0, v0, v2
+; GFX8-NEXT: v_mul_u32_u24_e32 v1, v1, v3
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_1_v2i24:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mad_u32_u24 v0, v0, v2, v0
-; GFX9-NEXT: v_mad_u32_u24 v1, v1, v3, v1
+; GFX9-NEXT: v_add_u32_e32 v3, 1, v3
+; GFX9-NEXT: v_add_u32_e32 v2, 1, v2
+; GFX9-NEXT: v_mul_u32_u24_e32 v0, v0, v2
+; GFX9-NEXT: v_mul_u32_u24_e32 v1, v1, v3
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_v2i24:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mad_u32_u24 v0, v0, v2, v0
-; GFX10-NEXT: v_mad_u32_u24 v1, v1, v3, v1
+; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v2
+; GFX10-NEXT: v_add_nc_u32_e32 v3, 1, v3
+; GFX10-NEXT: v_mul_u32_u24_e32 v0, v0, v2
+; GFX10-NEXT: v_mul_u32_u24_e32 v1, v1, v3
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   %add = add <2 x i24> %y, <i24 1, i24 1>
   %mul = mul <2 x i24> %x, %add
@@ -2339,29 +2355,37 @@ define <2 x i24> @v_mul_add_1_v2i24_commute(<2 x i24> %x, <2 x i24> %y) {
 ; GFX67-LABEL: v_mul_add_1_v2i24_commute:
 ; GFX67: ; %bb.0:
 ; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-NEXT: v_mad_u32_u24 v0, v0, v2, v0
-; GFX67-NEXT: v_mad_u32_u24 v1, v1, v3, v1
+; GFX67-NEXT: v_add_i32_e32 v3, vcc, 1, v3
+; GFX67-NEXT: v_add_i32_e32 v2, vcc, 1, v2
+; GFX67-NEXT: v_mul_u32_u24_e32 v0, v2, v0
+; GFX67-NEXT: v_mul_u32_u24_e32 v1, v3, v1
 ; GFX67-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_1_v2i24_commute:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, v2, v0
-; GFX8-NEXT: v_mad_u32_u24 v1, v1, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v2
+; GFX8-NEXT: v_mul_u32_u24_e32 v0, v2, v0
+; GFX8-NEXT: v_mul_u32_u24_e32 v1, v3, v1
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_1_v2i24_commute:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mad_u32_u24 v0, v0, v2, v0
-; GFX9-NEXT: v_mad_u32_u24 v1, v1, v3, v1
+; GFX9-NEXT: v_add_u32_e32 v3, 1, v3
+; GFX9-NEXT: v_add_u32_e32 v2, 1, v2
+; GFX9-NEXT: v_mul_u32_u24_e32 v0, v2, v0
+; GFX9-NEXT: v_mul_u32_u24_e32 v1, v3, v1
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_v2i24_commute:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mad_u32_u24 v0, v0, v2, v0
-; GFX10-NEXT: v_mad_u32_u24 v1, v1, v3, v1
+; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v2
+; GFX10-NEXT: v_add_nc_u32_e32 v3, 1, v3
+; GFX10-NEXT: v_mul_u32_u24_e32 v0, v2, v0
+; GFX10-NEXT: v_mul_u32_u24_e32 v1, v3, v1
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   %add = add <2 x i24> %y, <i24 1, i24 1>
   %mul = mul <2 x i24> %add, %x
@@ -3692,10 +3716,186 @@ define <2 x i8> @v_mul_add_1_v2i8_commute(<2 x i8> %x, <2 x i8> %y) {
   ret <2 x i8> %mul
 }
 
+; test mul_u24 intrinsic with (i32, i32) -> i64
+define i64 @mul_u24_with_uneven_operands(i32 %z) {
+; GFX67-LABEL: mul_u24_with_uneven_operands:
+; GFX67: ; %bb.0: ; %entry
+; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX67-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX67-NEXT: v_add_i32_e32 v1, vcc, 1, v0
+; GFX67-NEXT: v_mul_u32_u24_e32 v0, v1, v0
+; GFX67-NEXT: v_mov_b32_e32 v1, 0
+; GFX67-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: mul_u24_with_uneven_operands:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 1, v0
+; GFX8-NEXT: v_mul_u32_u24_e32 v0, v1, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: mul_u24_with_uneven_operands:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_add_u32_e32 v1, 1, v0
+; GFX9-NEXT: v_mul_u32_u24_e32 v0, v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: mul_u24_with_uneven_operands:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v0
+; GFX10-NEXT: v_mul_u32_u24_e32 v0, v1, v0
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+entry:
+  %c = and i32 %z, 1
+  %d = add nuw nsw i32 %c, 1
+  %f = call i64 @llvm.amdgcn.mul.u24(i32 %d, i32 %c)
+  ret i64 %f
+}
+
+define i64 @mul_u24_with_uneven_operands_swapped(i32 %z) {
+; GFX67-LABEL: mul_u24_with_uneven_operands_swapped:
+; GFX67: ; %bb.0: ; %entry
+; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX67-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX67-NEXT: v_add_i32_e32 v1, vcc, 1, v0
+; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v1
+; GFX67-NEXT: v_mov_b32_e32 v1, 0
+; GFX67-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: mul_u24_with_uneven_operands_swapped:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 1, v0
+; GFX8-NEXT: v_mul_u32_u24_e32 v0, v0, v1
+; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: mul_u24_with_uneven_operands_swapped:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_add_u32_e32 v1, 1, v0
+; GFX9-NEXT: v_mul_u32_u24_e32 v0, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: mul_u24_with_uneven_operands_swapped:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v0
+; GFX10-NEXT: v_mul_u32_u24_e32 v0, v0, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+entry:
+  %c = and i32 %z, 1
+  %d = add nuw nsw i32 %c, 1
+  %f = call i64 @llvm.amdgcn.mul.u24(i32 %c, i32 %d)
+  ret i64 %f
+}
+
+; test mul_i24 intrinsic with (i32, i32) -> i64
+define i64 @mul_i24_with_uneven_operands(i32 %z) {
+; GFX67-LABEL: mul_i24_with_uneven_operands:
+; GFX67: ; %bb.0: ; %entry
+; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX67-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX67-NEXT: v_add_i32_e32 v2, vcc, 1, v0
+; GFX67-NEXT: v_mul_hi_i32_i24_e32 v1, v2, v0
+; GFX67-NEXT: v_mul_i32_i24_e32 v0, v2, v0
+; GFX67-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: mul_i24_with_uneven_operands:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0
+; GFX8-NEXT: v_mul_hi_i32_i24_e32 v1, v2, v0
+; GFX8-NEXT: v_mul_i32_i24_e32 v0, v2, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: mul_i24_with_uneven_operands:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_add_u32_e32 v2, 1, v0
+; GFX9-NEXT: v_mul_hi_i32_i24_e32 v1, v2, v0
+; GFX9-NEXT: v_mul_i32_i24_e32 v0, v2, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: mul_i24_with_uneven_operands:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v1, 1, v0
+; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v1
+; GFX10-NEXT: v_mul_i32_i24_e32 v0, v2, v1
+; GFX10-NEXT: v_mul_hi_i32_i24_e32 v1, v2, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+entry:
+  %c = and i32 %z, 1
+  %d = add nuw nsw i32 %c, 1
+  %f = call i64 @llvm.amdgcn.mul.i24(i32 %d, i32 %c)
+  ret i64 %f
+}
+
+define i64 @mul_i24_with_uneven_operands_swapped(i32 %z) {
+; GFX67-LABEL: mul_i24_with_uneven_operands_swapped:
+; GFX67: ; %bb.0: ; %entry
+; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX67-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX67-NEXT: v_add_i32_e32 v2, vcc, 1, v0
+; GFX67-NEXT: v_mul_hi_i32_i24_e32 v1, v0, v2
+; GFX67-NEXT: v_mul_i32_i24_e32 v0, v0, v2
+; GFX67-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: mul_i24_with_uneven_operands_swapped:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0
+; GFX8-NEXT: v_mul_hi_i32_i24_e32 v1, v0, v2
+; GFX8-NEXT: v_mul_i32_i24_e32 v0, v0, v2
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: mul_i24_with_uneven_operands_swapped:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_add_u32_e32 v2, 1, v0
+; GFX9-NEXT: v_mul_hi_i32_i24_e32 v1, v0, v2
+; GFX9-NEXT: v_mul_i32_i24_e32 v0, v0, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: mul_i24_with_uneven_operands_swapped:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v1, 1, v0
+; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v1
+; GFX10-NEXT: v_mul_i32_i24_e32 v0, v1, v2
+; GFX10-NEXT: v_mul_hi_i32_i24_e32 v1, v1, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+entry:
+  %c = and i32 %z, 1
+  %d = add nuw nsw i32 %c, 1
+  %f = call i64 @llvm.amdgcn.mul.i24(i32 %c, i32 %d)
+  ret i64 %f
+}
+
 declare align 4 ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #2
 declare i32 @llvm.amdgcn.workitem.id.x() #2
 declare align 4 ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #2
 declare i32 @llvm.amdgcn.workgroup.id.x() #2
+declare i64 @llvm.amdgcn.mul.u24(i32, i32)
+declare i64 @llvm.amdgcn.mul.i24(i32, i32)
 
 attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) }
 attributes #1 = { mustprogress nofree nosync nounwind willreturn memory(read, argmem: readwrite, inaccessiblemem: none) }
