; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,SI
; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,VI
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10
+ ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx908 -start-before=amdgpu-isel -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9

declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
@@ -823,18 +824,16 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64
- ; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:1
- ; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:2
+ ; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:1
+ ; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:2
; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt vmcnt(2)
- ; SI-NEXT: v_cvt_f32_ubyte2_e32 v1, v2
+ ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v5
+ ; SI-NEXT: s_waitcnt vmcnt(1)
+ ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
; SI-NEXT: s_waitcnt vmcnt(0)
- ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
- ; SI-NEXT: v_or_b32_e32 v0, v0, v3
- ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
- ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
- ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
+ ; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v0
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
@@ -865,7 +864,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v5
; VI-NEXT: s_waitcnt vmcnt(1)
- ; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v6
+ ; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v6
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
@@ -888,7 +887,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)
; GFX10-NEXT: s_waitcnt vmcnt(2)
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
; GFX10-NEXT: s_waitcnt vmcnt(1)
- ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v1, v4
+ ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v4
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v5
; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1]
@@ -1038,31 +1037,29 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
- ; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64
- ; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1
- ; SI-NEXT: buffer_load_ubyte v6, v[0:1], s[0:3], 0 addr64 offset:2
- ; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:3
- ; SI-NEXT: buffer_load_ubyte v7, v[0:1], s[0:3], 0 addr64 offset:4
- ; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:5
+ ; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64
+ ; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:1
+ ; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:2
+ ; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:3
+ ; SI-NEXT: buffer_load_ubyte v6, v[0:1], s[0:3], 0 addr64 offset:4
+ ; SI-NEXT: buffer_load_ubyte v7, v[0:1], s[0:3], 0 addr64 offset:5
; SI-NEXT: buffer_load_ubyte v8, v[0:1], s[0:3], 0 addr64 offset:6
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt vmcnt(6)
- ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2
+ ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
; SI-NEXT: s_waitcnt vmcnt(5)
- ; SI-NEXT: v_cvt_f32_ubyte2_e32 v1, v3
+ ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v5
+ ; SI-NEXT: s_waitcnt vmcnt(4)
+ ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
; SI-NEXT: s_waitcnt vmcnt(3)
- ; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v4
- ; SI-NEXT: v_or_b32_e32 v3, v9, v6
+ ; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v3
+ ; SI-NEXT: s_waitcnt vmcnt(2)
+ ; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v6
; SI-NEXT: s_waitcnt vmcnt(1)
- ; SI-NEXT: v_cvt_f32_ubyte2_e32 v5, v5
+ ; SI-NEXT: v_cvt_f32_ubyte0_e32 v5, v7
; SI-NEXT: s_waitcnt vmcnt(0)
- ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v8
- ; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v7
- ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:24
- ; SI-NEXT: s_waitcnt expcnt(0)
- ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3
- ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v2
- ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v2
+ ; SI-NEXT: v_cvt_f32_ubyte0_e32 v6, v8
+ ; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:24
; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
@@ -1078,39 +1075,39 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
- ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v0
+ ; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
- ; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0
+ ; VI-NEXT: v_add_u32_e32 v4, vcc, 5, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
- ; VI-NEXT: v_add_u32_e32 v6, vcc, 4, v0
- ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
- ; VI-NEXT: v_add_u32_e32 v8, vcc, 5, v0
- ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
- ; VI-NEXT: flat_load_ubyte v10, v[4:5]
- ; VI-NEXT: flat_load_ubyte v11, v[6:7]
- ; VI-NEXT: flat_load_ubyte v8, v[8:9]
+ ; VI-NEXT: flat_load_ubyte v12, v[4:5]
; VI-NEXT: v_add_u32_e32 v4, vcc, 6, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v6, vcc, 1, v0
; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
+ ; VI-NEXT: v_add_u32_e32 v8, vcc, 2, v0
+ ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
+ ; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v0
+ ; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc
+ ; VI-NEXT: flat_load_ubyte v8, v[8:9]
+ ; VI-NEXT: flat_load_ubyte v9, v[10:11]
; VI-NEXT: flat_load_ubyte v6, v[6:7]
; VI-NEXT: flat_load_ubyte v4, v[4:5]
- ; VI-NEXT: flat_load_ubyte v2, v[2:3]
+ ; VI-NEXT: flat_load_ubyte v7, v[2:3]
; VI-NEXT: flat_load_ubyte v0, v[0:1]
+ ; VI-NEXT: s_waitcnt vmcnt(6)
+ ; VI-NEXT: v_cvt_f32_ubyte0_e32 v5, v12
+ ; VI-NEXT: s_waitcnt vmcnt(5)
+ ; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v8
; VI-NEXT: s_waitcnt vmcnt(4)
- ; VI-NEXT: v_cvt_f32_ubyte2_e32 v5, v8
+ ; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v9
; VI-NEXT: s_waitcnt vmcnt(3)
- ; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v6
+ ; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v6
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v4
; VI-NEXT: s_waitcnt vmcnt(1)
- ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
- ; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v11
- ; VI-NEXT: v_or_b32_sdwa v2, v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+ ; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v7
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
- ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v2
- ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v2
; VI-NEXT: buffer_store_dwordx3 v[4:6], off, s[4:7], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
@@ -1119,32 +1116,31 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
- ; GFX10-NEXT: v_mov_b32_e32 v2, 0
+ ; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
- ; GFX10-NEXT: v_mov_b32_e32 v8, 0
+ ; GFX10-NEXT: v_mov_b32_e32 v7, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x5
- ; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:2
- ; GFX10-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3
- ; GFX10-NEXT: global_load_short_d16 v2, v0, s[2:3] offset:4
- ; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:6
- ; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] offset:1
- ; GFX10-NEXT: global_load_ubyte v7, v0, s[2:3]
+ ; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] offset:6
+ ; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3
+ ; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2
+ ; GFX10-NEXT: global_load_ubyte v6, v0, s[2:3] offset:1
+ ; GFX10-NEXT: global_load_short_d16 v4, v0, s[2:3] offset:4
+ ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(4)
- ; GFX10-NEXT: v_lshl_or_b32 v0, v3, 8, v1
+ ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, v1
+ ; GFX10-NEXT: s_waitcnt vmcnt(3)
+ ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
; GFX10-NEXT: s_waitcnt vmcnt(2)
- ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, v4
+ ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v6
+ ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, v5
; GFX10-NEXT: s_waitcnt vmcnt(1)
- ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v1, v5
- ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v5, v2
- ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
- ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v2
- ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
- ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
+ ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v5, v4
+ ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v4
; GFX10-NEXT: s_waitcnt vmcnt(0)
- ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v7
- ; GFX10-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] offset:16
- ; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+ ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+ ; GFX10-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] offset:16
+ ; GFX10-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1]
; GFX10-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <7 x i8>, <7 x i8> addrspace(1)* %in, i32 %tid
@@ -1417,18 +1413,16 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* no
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64
- ; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:1
- ; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:2
+ ; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:1
+ ; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:2
; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt vmcnt(2)
- ; SI-NEXT: v_cvt_f32_ubyte2_e32 v1, v2
+ ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v5
+ ; SI-NEXT: s_waitcnt vmcnt(1)
+ ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
; SI-NEXT: s_waitcnt vmcnt(0)
- ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
- ; SI-NEXT: v_or_b32_e32 v0, v0, v3
- ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
- ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
- ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
+ ; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v0
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
@@ -1444,24 +1438,24 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* no
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
- ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v0
+ ; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
- ; VI-NEXT: v_add_u32_e32 v6, vcc, 1, v0
+ ; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v0
; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ubyte v4, v[4:5]
; VI-NEXT: flat_load_ubyte v5, v[6:7]
- ; VI-NEXT: flat_load_ubyte v2, v[2:3]
+ ; VI-NEXT: flat_load_ubyte v6, v[2:3]
; VI-NEXT: flat_load_ubyte v0, v[0:1]
+ ; VI-NEXT: s_waitcnt vmcnt(3)
+ ; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4
+ ; VI-NEXT: s_waitcnt vmcnt(2)
+ ; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v5
; VI-NEXT: s_waitcnt vmcnt(1)
- ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
- ; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+ ; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v6
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
- ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v1
- ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v1
- ; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v5
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
@@ -1470,23 +1464,22 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* no
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
- ; GFX10-NEXT: v_mov_b32_e32 v5, 0
+ ; GFX10-NEXT: v_mov_b32_e32 v6, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x3
; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3
; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2
- ; GFX10-NEXT: global_load_ubyte v3, v0, s[2:3] offset:1
- ; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3]
+ ; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:1
+ ; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3]
+ ; GFX10-NEXT: s_waitcnt vmcnt(3)
+ ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, v1
; GFX10-NEXT: s_waitcnt vmcnt(2)
- ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 8, v2
+ ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
; GFX10-NEXT: s_waitcnt vmcnt(1)
- ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v1, v3
- ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
- ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
- ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
+ ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v4
; GFX10-NEXT: s_waitcnt vmcnt(0)
- ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
- ; GFX10-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+ ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v5
+ ; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1]
; GFX10-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
@@ -1790,3 +1783,42 @@ bb:
store float %add, float addrspace(1)* %out
ret void
}
+
+ %Vec = type { [4 x i8] }
+
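+ ; The high byte of the unaligned i32 load (extracted with lshr by 24 and converted with uitofp) is fetched as an individual ubyte below, so the conversion should use v_cvt_f32_ubyte0 on that loaded byte.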
+ define amdgpu_kernel void @cvt_f32_ubyte0_vector() local_unnamed_addr {
+ ; GFX9-LABEL: cvt_f32_ubyte0_vector:
+ ; GFX9: ; %bb.0: ; %entry
+ ; GFX9: global_load_ubyte [[REG0:v[0-9]+]], v[0:1], off
+ ; GFX9: global_load_ubyte [[REG1:v[0-9]+]], v[0:1], off offset:1
+ ; GFX9: global_load_ubyte [[REG2:v[0-9]+]], v[0:1], off offset:2
+ ; GFX9: global_load_ubyte [[REG3:v[0-9]+]], v[0:1], off offset:3
+ ; GFX9: v_cvt_f32_ubyte0_e32 [[CVT:v[0-9]+]], [[REG3]]
+ ; GFX9: v_fma_f32 v0, v1, [[CVT]], 0.5
+ entry:
+ br label %for.body.i
+
+ for.body.i: ; preds = %for.body.i, %entry
+ %retval.sroa.0.0.copyload = load %Vec*, %Vec* addrspace(1)* undef, align 8
+ %add.ptr = getelementptr inbounds %Vec, %Vec* %retval.sroa.0.0.copyload, i64 undef
+ %retval.sroa.0.0..sroa_cast = bitcast %Vec* %add.ptr to i32*
+ %retval.sroa.0.0..sroa_cast_adr = addrspacecast i32* %retval.sroa.0.0..sroa_cast to i32 addrspace(1)*
+ %retval.sroa.0.0.copyload.i = load i32, i32 addrspace(1)* %retval.sroa.0.0..sroa_cast_adr, align 1
+ %p1.sroa.6.0.extract.shift = lshr i32 %retval.sroa.0.0.copyload.i, 24
+ %p1.sroa.6.0.extract.trunc = trunc i32 %p1.sroa.6.0.extract.shift to i8
+ %conv12 = uitofp i8 %p1.sroa.6.0.extract.trunc to float
+ %0 = load float, float addrspace(1)* undef, align 8
+ %mul = fmul contract float %0, %conv12
+ %add = fadd contract float %mul, 5.000000e-01
+ %conv13 = fptoui float %add to i8
+ %retval.sroa.4.0.insert.ext = zext i8 %conv13 to i32
+ %retval.sroa.4.0.insert.shift = shl nuw i32 %retval.sroa.4.0.insert.ext, 24
+ %retval.sroa.3.0.insert.ext = and i32 %retval.sroa.0.0.copyload.i, 16711680
+ %retval.sroa.3.0.insert.insert = or i32 %retval.sroa.4.0.insert.shift, %retval.sroa.3.0.insert.ext
+ %retval.sroa.2.0.insert.ext = and i32 %retval.sroa.0.0.copyload.i, 65280
+ %retval.sroa.2.0.insert.insert = or i32 %retval.sroa.3.0.insert.insert, %retval.sroa.2.0.insert.ext
+ %retval.sroa.0.0.insert.ext = and i32 %retval.sroa.0.0.copyload.i, 255
+ %retval.sroa.0.0.insert.insert = or i32 %retval.sroa.2.0.insert.insert, %retval.sroa.0.0.insert.ext
+ store i32 %retval.sroa.0.0.insert.insert, i32 addrspace(1)* undef, align 1
+ br label %for.body.i
+ }