Skip to content

Commit 52b43d1

Browse files
committed
[AMDGPU] Fix cvt_f32_ubyte combine with shl
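The Shift node is still needed after its operand has been extended, in order to check whether the shift is a right shift (srl) or a left shift (shl) and to increment or decrement the byte offset accordingly. Do not overwrite the node; keep the extended operand in a separate value instead. Reviewed By: arsenm. Differential Revision: https://reviews.llvm.org/D112733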
1 parent bb16e83 commit 52b43d1

File tree

2 files changed

+124
-92
lines changed

2 files changed

+124
-92
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10863,7 +10863,7 @@ SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
1086310863
// cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
1086410864
// cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
1086510865
if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
10866-
Shift = DAG.getZExtOrTrunc(Shift.getOperand(0),
10866+
SDValue Shifted = DAG.getZExtOrTrunc(Shift.getOperand(0),
1086710867
SDLoc(Shift.getOperand(0)), MVT::i32);
1086810868

1086910869
unsigned ShiftOffset = 8 * Offset;
@@ -10874,7 +10874,7 @@ SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
1087410874

1087510875
if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
1087610876
return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
10877-
MVT::f32, Shift);
10877+
MVT::f32, Shifted);
1087810878
}
1087910879
}
1088010880
}

llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll

Lines changed: 122 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,SI
33
; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,VI
44
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10
5+
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx908 -start-before=amdgpu-isel -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9
56

67
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
78
declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
@@ -823,18 +824,16 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)
823824
; SI-NEXT: v_mov_b32_e32 v1, 0
824825
; SI-NEXT: s_waitcnt lgkmcnt(0)
825826
; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64
826-
; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:1
827-
; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:2
827+
; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:1
828+
; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:2
828829
; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3
829830
; SI-NEXT: s_mov_b32 s6, -1
830831
; SI-NEXT: s_waitcnt vmcnt(2)
831-
; SI-NEXT: v_cvt_f32_ubyte2_e32 v1, v2
832+
; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v5
833+
; SI-NEXT: s_waitcnt vmcnt(1)
834+
; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
832835
; SI-NEXT: s_waitcnt vmcnt(0)
833-
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
834-
; SI-NEXT: v_or_b32_e32 v0, v0, v3
835-
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
836-
; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
837-
; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
836+
; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v0
838837
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
839838
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
840839
; SI-NEXT: s_endpgm
@@ -865,7 +864,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)
865864
; VI-NEXT: s_waitcnt vmcnt(2)
866865
; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v5
867866
; VI-NEXT: s_waitcnt vmcnt(1)
868-
; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v6
867+
; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v6
869868
; VI-NEXT: s_waitcnt vmcnt(0)
870869
; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
871870
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
@@ -888,7 +887,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)
888887
; GFX10-NEXT: s_waitcnt vmcnt(2)
889888
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
890889
; GFX10-NEXT: s_waitcnt vmcnt(1)
891-
; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v1, v4
890+
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v4
892891
; GFX10-NEXT: s_waitcnt vmcnt(0)
893892
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v5
894893
; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1]
@@ -1038,31 +1037,29 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias
10381037
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
10391038
; SI-NEXT: v_mov_b32_e32 v1, 0
10401039
; SI-NEXT: s_waitcnt lgkmcnt(0)
1041-
; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64
1042-
; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1
1043-
; SI-NEXT: buffer_load_ubyte v6, v[0:1], s[0:3], 0 addr64 offset:2
1044-
; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:3
1045-
; SI-NEXT: buffer_load_ubyte v7, v[0:1], s[0:3], 0 addr64 offset:4
1046-
; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:5
1040+
; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64
1041+
; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:1
1042+
; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:2
1043+
; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:3
1044+
; SI-NEXT: buffer_load_ubyte v6, v[0:1], s[0:3], 0 addr64 offset:4
1045+
; SI-NEXT: buffer_load_ubyte v7, v[0:1], s[0:3], 0 addr64 offset:5
10471046
; SI-NEXT: buffer_load_ubyte v8, v[0:1], s[0:3], 0 addr64 offset:6
10481047
; SI-NEXT: s_mov_b32 s6, -1
10491048
; SI-NEXT: s_waitcnt vmcnt(6)
1050-
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2
1049+
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
10511050
; SI-NEXT: s_waitcnt vmcnt(5)
1052-
; SI-NEXT: v_cvt_f32_ubyte2_e32 v1, v3
1051+
; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v5
1052+
; SI-NEXT: s_waitcnt vmcnt(4)
1053+
; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
10531054
; SI-NEXT: s_waitcnt vmcnt(3)
1054-
; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v4
1055-
; SI-NEXT: v_or_b32_e32 v3, v9, v6
1055+
; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v3
1056+
; SI-NEXT: s_waitcnt vmcnt(2)
1057+
; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v6
10561058
; SI-NEXT: s_waitcnt vmcnt(1)
1057-
; SI-NEXT: v_cvt_f32_ubyte2_e32 v5, v5
1059+
; SI-NEXT: v_cvt_f32_ubyte0_e32 v5, v7
10581060
; SI-NEXT: s_waitcnt vmcnt(0)
1059-
; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v8
1060-
; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v7
1061-
; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:24
1062-
; SI-NEXT: s_waitcnt expcnt(0)
1063-
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3
1064-
; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v2
1065-
; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v2
1061+
; SI-NEXT: v_cvt_f32_ubyte0_e32 v6, v8
1062+
; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:24
10661063
; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16
10671064
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
10681065
; SI-NEXT: s_endpgm
@@ -1078,39 +1075,39 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias
10781075
; VI-NEXT: v_mov_b32_e32 v1, s1
10791076
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
10801077
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1081-
; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v0
1078+
; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0
10821079
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
1083-
; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0
1080+
; VI-NEXT: v_add_u32_e32 v4, vcc, 5, v0
10841081
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
1085-
; VI-NEXT: v_add_u32_e32 v6, vcc, 4, v0
1086-
; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
1087-
; VI-NEXT: v_add_u32_e32 v8, vcc, 5, v0
1088-
; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
1089-
; VI-NEXT: flat_load_ubyte v10, v[4:5]
1090-
; VI-NEXT: flat_load_ubyte v11, v[6:7]
1091-
; VI-NEXT: flat_load_ubyte v8, v[8:9]
1082+
; VI-NEXT: flat_load_ubyte v12, v[4:5]
10921083
; VI-NEXT: v_add_u32_e32 v4, vcc, 6, v0
10931084
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
10941085
; VI-NEXT: v_add_u32_e32 v6, vcc, 1, v0
10951086
; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
1087+
; VI-NEXT: v_add_u32_e32 v8, vcc, 2, v0
1088+
; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
1089+
; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v0
1090+
; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc
1091+
; VI-NEXT: flat_load_ubyte v8, v[8:9]
1092+
; VI-NEXT: flat_load_ubyte v9, v[10:11]
10961093
; VI-NEXT: flat_load_ubyte v6, v[6:7]
10971094
; VI-NEXT: flat_load_ubyte v4, v[4:5]
1098-
; VI-NEXT: flat_load_ubyte v2, v[2:3]
1095+
; VI-NEXT: flat_load_ubyte v7, v[2:3]
10991096
; VI-NEXT: flat_load_ubyte v0, v[0:1]
1097+
; VI-NEXT: s_waitcnt vmcnt(6)
1098+
; VI-NEXT: v_cvt_f32_ubyte0_e32 v5, v12
1099+
; VI-NEXT: s_waitcnt vmcnt(5)
1100+
; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v8
11001101
; VI-NEXT: s_waitcnt vmcnt(4)
1101-
; VI-NEXT: v_cvt_f32_ubyte2_e32 v5, v8
1102+
; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v9
11021103
; VI-NEXT: s_waitcnt vmcnt(3)
1103-
; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v6
1104+
; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v6
11041105
; VI-NEXT: s_waitcnt vmcnt(2)
11051106
; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v4
11061107
; VI-NEXT: s_waitcnt vmcnt(1)
1107-
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
1108-
; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v11
1109-
; VI-NEXT: v_or_b32_sdwa v2, v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1108+
; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v7
11101109
; VI-NEXT: s_waitcnt vmcnt(0)
11111110
; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1112-
; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v2
1113-
; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v2
11141111
; VI-NEXT: buffer_store_dwordx3 v[4:6], off, s[4:7], 0 offset:16
11151112
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
11161113
; VI-NEXT: s_endpgm
@@ -1119,32 +1116,31 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias
11191116
; GFX10: ; %bb.0:
11201117
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
11211118
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1122-
; GFX10-NEXT: v_mov_b32_e32 v2, 0
1119+
; GFX10-NEXT: v_mov_b32_e32 v4, 0
11231120
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1124-
; GFX10-NEXT: v_mov_b32_e32 v8, 0
1121+
; GFX10-NEXT: v_mov_b32_e32 v7, 0
11251122
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
11261123
; GFX10-NEXT: s_clause 0x5
1127-
; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:2
1128-
; GFX10-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3
1129-
; GFX10-NEXT: global_load_short_d16 v2, v0, s[2:3] offset:4
1130-
; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:6
1131-
; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] offset:1
1132-
; GFX10-NEXT: global_load_ubyte v7, v0, s[2:3]
1124+
; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] offset:6
1125+
; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3
1126+
; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2
1127+
; GFX10-NEXT: global_load_ubyte v6, v0, s[2:3] offset:1
1128+
; GFX10-NEXT: global_load_short_d16 v4, v0, s[2:3] offset:4
1129+
; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3]
11331130
; GFX10-NEXT: s_waitcnt vmcnt(4)
1134-
; GFX10-NEXT: v_lshl_or_b32 v0, v3, 8, v1
1131+
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, v1
1132+
; GFX10-NEXT: s_waitcnt vmcnt(3)
1133+
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
11351134
; GFX10-NEXT: s_waitcnt vmcnt(2)
1136-
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, v4
1135+
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v6
1136+
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, v5
11371137
; GFX10-NEXT: s_waitcnt vmcnt(1)
1138-
; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v1, v5
1139-
; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v5, v2
1140-
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1141-
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v2
1142-
; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
1143-
; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
1138+
; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v5, v4
1139+
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v4
11441140
; GFX10-NEXT: s_waitcnt vmcnt(0)
1145-
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v7
1146-
; GFX10-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] offset:16
1147-
; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
1141+
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1142+
; GFX10-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] offset:16
1143+
; GFX10-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
11481144
; GFX10-NEXT: s_endpgm
11491145
%tid = call i32 @llvm.amdgcn.workitem.id.x()
11501146
%gep = getelementptr <7 x i8>, <7 x i8> addrspace(1)* %in, i32 %tid
@@ -1417,18 +1413,16 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* no
14171413
; SI-NEXT: v_mov_b32_e32 v1, 0
14181414
; SI-NEXT: s_waitcnt lgkmcnt(0)
14191415
; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64
1420-
; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:1
1421-
; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:2
1416+
; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:1
1417+
; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:2
14221418
; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3
14231419
; SI-NEXT: s_mov_b32 s6, -1
14241420
; SI-NEXT: s_waitcnt vmcnt(2)
1425-
; SI-NEXT: v_cvt_f32_ubyte2_e32 v1, v2
1421+
; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v5
1422+
; SI-NEXT: s_waitcnt vmcnt(1)
1423+
; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
14261424
; SI-NEXT: s_waitcnt vmcnt(0)
1427-
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
1428-
; SI-NEXT: v_or_b32_e32 v0, v0, v3
1429-
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1430-
; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
1431-
; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
1425+
; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v0
14321426
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
14331427
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
14341428
; SI-NEXT: s_endpgm
@@ -1444,24 +1438,24 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* no
14441438
; VI-NEXT: v_mov_b32_e32 v1, s1
14451439
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
14461440
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1447-
; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v0
1441+
; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0
14481442
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
14491443
; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0
14501444
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
1451-
; VI-NEXT: v_add_u32_e32 v6, vcc, 1, v0
1445+
; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v0
14521446
; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
14531447
; VI-NEXT: flat_load_ubyte v4, v[4:5]
14541448
; VI-NEXT: flat_load_ubyte v5, v[6:7]
1455-
; VI-NEXT: flat_load_ubyte v2, v[2:3]
1449+
; VI-NEXT: flat_load_ubyte v6, v[2:3]
14561450
; VI-NEXT: flat_load_ubyte v0, v[0:1]
1451+
; VI-NEXT: s_waitcnt vmcnt(3)
1452+
; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4
1453+
; VI-NEXT: s_waitcnt vmcnt(2)
1454+
; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v5
14571455
; VI-NEXT: s_waitcnt vmcnt(1)
1458-
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
1459-
; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1456+
; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v6
14601457
; VI-NEXT: s_waitcnt vmcnt(0)
14611458
; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1462-
; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v1
1463-
; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v1
1464-
; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v5
14651459
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
14661460
; VI-NEXT: s_endpgm
14671461
;
@@ -1470,23 +1464,22 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* no
14701464
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
14711465
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
14721466
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1473-
; GFX10-NEXT: v_mov_b32_e32 v5, 0
1467+
; GFX10-NEXT: v_mov_b32_e32 v6, 0
14741468
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
14751469
; GFX10-NEXT: s_clause 0x3
14761470
; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3
14771471
; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2
1478-
; GFX10-NEXT: global_load_ubyte v3, v0, s[2:3] offset:1
1479-
; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3]
1472+
; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:1
1473+
; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3]
1474+
; GFX10-NEXT: s_waitcnt vmcnt(3)
1475+
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, v1
14801476
; GFX10-NEXT: s_waitcnt vmcnt(2)
1481-
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 8, v2
1477+
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
14821478
; GFX10-NEXT: s_waitcnt vmcnt(1)
1483-
; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v1, v3
1484-
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1485-
; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
1486-
; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
1479+
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v4
14871480
; GFX10-NEXT: s_waitcnt vmcnt(0)
1488-
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
1489-
; GFX10-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
1481+
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v5
1482+
; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1]
14901483
; GFX10-NEXT: s_endpgm
14911484
%tid = call i32 @llvm.amdgcn.workitem.id.x()
14921485
%gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
@@ -1790,3 +1783,42 @@ bb:
17901783
store float %add, float addrspace(1)* %out
17911784
ret void
17921785
}
1786+
1787+
%Vec = type { [4 x i8] }
1788+
1789+
define amdgpu_kernel void @cvt_f32_ubyte0_vector() local_unnamed_addr {
1790+
; GFX9-LABEL: cvt_f32_ubyte0_vector:
1791+
; GFX9: ; %bb.0: ; %entry
1792+
; GFX9: global_load_ubyte [[REG0:v[0-9]+]], v[0:1], off
1793+
; GFX9: global_load_ubyte [[REG1:v[0-9]+]], v[0:1], off offset:1
1794+
; GFX9: global_load_ubyte [[REG2:v[0-9]+]], v[0:1], off offset:2
1795+
; GFX9: global_load_ubyte [[REG3:v[0-9]+]], v[0:1], off offset:3
1796+
; GFX9: v_cvt_f32_ubyte0_e32 [[CVT:v[0-9]+]], [[REG3]]
1797+
; GFX9: v_fma_f32 v0, v1, [[CVT]], 0.5
1798+
entry:
1799+
br label %for.body.i
1800+
1801+
for.body.i: ; preds = %for.body.i, %entry
1802+
%retval.sroa.0.0.copyload = load %Vec*, %Vec* addrspace(1)* undef, align 8
1803+
%add.ptr = getelementptr inbounds %Vec, %Vec* %retval.sroa.0.0.copyload, i64 undef
1804+
%retval.sroa.0.0..sroa_cast = bitcast %Vec* %add.ptr to i32*
1805+
%retval.sroa.0.0..sroa_cast_adr = addrspacecast i32* %retval.sroa.0.0..sroa_cast to i32 addrspace(1)*
1806+
%retval.sroa.0.0.copyload.i = load i32, i32 addrspace(1)* %retval.sroa.0.0..sroa_cast_adr, align 1
1807+
%p1.sroa.6.0.extract.shift = lshr i32 %retval.sroa.0.0.copyload.i, 24
1808+
%p1.sroa.6.0.extract.trunc = trunc i32 %p1.sroa.6.0.extract.shift to i8
1809+
%conv12 = uitofp i8 %p1.sroa.6.0.extract.trunc to float
1810+
%0 = load float, float addrspace(1)* undef, align 8
1811+
%mul = fmul contract float %0, %conv12
1812+
%add = fadd contract float %mul, 5.000000e-01
1813+
%conv13 = fptoui float %add to i8
1814+
%retval.sroa.4.0.insert.ext = zext i8 %conv13 to i32
1815+
%retval.sroa.4.0.insert.shift = shl nuw i32 %retval.sroa.4.0.insert.ext, 24
1816+
%retval.sroa.3.0.insert.ext = and i32 %retval.sroa.0.0.copyload.i, 16711680
1817+
%retval.sroa.3.0.insert.insert = or i32 %retval.sroa.4.0.insert.shift, %retval.sroa.3.0.insert.ext
1818+
%retval.sroa.2.0.insert.ext = and i32 %retval.sroa.0.0.copyload.i, 65280
1819+
%retval.sroa.2.0.insert.insert = or i32 %retval.sroa.3.0.insert.insert, %retval.sroa.2.0.insert.ext
1820+
%retval.sroa.0.0.insert.ext = and i32 %retval.sroa.0.0.copyload.i, 255
1821+
%retval.sroa.0.0.insert.insert = or i32 %retval.sroa.2.0.insert.insert, %retval.sroa.0.0.insert.ext
1822+
store i32 %retval.sroa.0.0.insert.insert, i32 addrspace(1)* undef, align 1
1823+
br label %for.body.i
1824+
}

0 commit comments

Comments
 (0)