@@ -2030,6 +2030,205 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
2030
2030
ret void
2031
2031
}
2032
2032
2033
; Kernel test: calls the scaled MFMA intrinsic with both scale operands given
; as constants, 65 and 1065353216 (0x3F800000), and stores the <4 x float>
; result to %ptr. The checks expect 65 (0x41) to be moved into a register
; first (s6 under SDAG, v16 under GISEL), while 1065353216 is printed directly
; in the MFMA instruction as 1.0.
; NOTE(review): going by the test name, 65 is scale A and 1065353216 is
; scale B -- confirm against the intrinsic's operand order.
+ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__FP_literal (<8 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , ptr addrspace (1 ) %ptr ) #0 {
2034
+ ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__FP_literal:
2035
+ ; SDAG: ; %bb.0:
2036
+ ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
2037
+ ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
2038
+ ; SDAG-NEXT: s_movk_i32 s6, 0x41
2039
+ ; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
2040
+ ; SDAG-NEXT: v_mov_b32_e32 v16, 0
2041
+ ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2042
+ ; SDAG-NEXT: v_mov_b32_e32 v0, s8
2043
+ ; SDAG-NEXT: v_mov_b32_e32 v1, s9
2044
+ ; SDAG-NEXT: v_mov_b32_e32 v2, s10
2045
+ ; SDAG-NEXT: v_mov_b32_e32 v3, s11
2046
+ ; SDAG-NEXT: v_mov_b32_e32 v4, s12
2047
+ ; SDAG-NEXT: v_mov_b32_e32 v5, s13
2048
+ ; SDAG-NEXT: v_mov_b32_e32 v6, s14
2049
+ ; SDAG-NEXT: v_mov_b32_e32 v7, s15
2050
+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
2051
+ ; SDAG-NEXT: v_mov_b32_e32 v8, s16
2052
+ ; SDAG-NEXT: v_mov_b32_e32 v9, s17
2053
+ ; SDAG-NEXT: v_mov_b32_e32 v10, s18
2054
+ ; SDAG-NEXT: v_mov_b32_e32 v11, s19
2055
+ ; SDAG-NEXT: v_mov_b32_e32 v12, s20
2056
+ ; SDAG-NEXT: v_mov_b32_e32 v13, s21
2057
+ ; SDAG-NEXT: v_mov_b32_e32 v14, s22
2058
+ ; SDAG-NEXT: v_mov_b32_e32 v15, s23
2059
+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
2060
+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
2061
+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
2062
+ ; SDAG-NEXT: s_nop 1
2063
+ ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s6, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0]
2064
+ ; SDAG-NEXT: s_nop 7
2065
+ ; SDAG-NEXT: s_nop 3
2066
+ ; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5]
2067
+ ; SDAG-NEXT: s_endpgm
2068
+ ;
2069
+ ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__FP_literal:
2070
+ ; GISEL: ; %bb.0:
2071
+ ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
2072
+ ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
2073
+ ; GISEL-NEXT: v_mov_b32_e32 v16, 0x41
2074
+ ; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
2075
+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2076
+ ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
2077
+ ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
2078
+ ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
2079
+ ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
2080
+ ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
2081
+ ; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
2082
+ ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
2083
+ ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
2084
+ ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
2085
+ ; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
2086
+ ; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
2087
+ ; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
2088
+ ; GISEL-NEXT: s_nop 1
2089
+ ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0]
2090
+ ; GISEL-NEXT: v_mov_b32_e32 v0, 0
2091
+ ; GISEL-NEXT: s_nop 7
2092
+ ; GISEL-NEXT: s_nop 2
2093
+ ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
2094
+ ; GISEL-NEXT: s_endpgm
2095
+ %result = call <4 x float > @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32 (<8 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 0 , i32 0 , i32 3 , i32 65 , i32 1 , i32 1065353216 )
2096
+ store <4 x float > %result , ptr addrspace (1 ) %ptr , align 16
2097
+ ret void
2098
+ }
2099
+
2100
; Kernel test: calls the scaled MFMA intrinsic with both scale operands given
; as constants, 1065353216 (0x3F800000) and -2, and stores the <4 x float>
; result to %ptr. Both the SDAG and GISEL checks expect the two constants to
; appear directly in the MFMA instruction as `1.0, -2`; neither is moved into
; a register beforehand.
; NOTE(review): going by the test name, 1065353216 is scale A and -2 is
; scale B -- confirm against the intrinsic's operand order.
+ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__inline_imm (<8 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , ptr addrspace (1 ) %ptr ) #0 {
2101
+ ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__inline_imm:
2102
+ ; SDAG: ; %bb.0:
2103
+ ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
2104
+ ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
2105
+ ; SDAG-NEXT: v_mov_b32_e32 v16, 0
2106
+ ; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
2107
+ ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2108
+ ; SDAG-NEXT: v_mov_b32_e32 v0, s8
2109
+ ; SDAG-NEXT: v_mov_b32_e32 v1, s9
2110
+ ; SDAG-NEXT: v_mov_b32_e32 v2, s10
2111
+ ; SDAG-NEXT: v_mov_b32_e32 v3, s11
2112
+ ; SDAG-NEXT: v_mov_b32_e32 v4, s12
2113
+ ; SDAG-NEXT: v_mov_b32_e32 v5, s13
2114
+ ; SDAG-NEXT: v_mov_b32_e32 v6, s14
2115
+ ; SDAG-NEXT: v_mov_b32_e32 v7, s15
2116
+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
2117
+ ; SDAG-NEXT: v_mov_b32_e32 v8, s16
2118
+ ; SDAG-NEXT: v_mov_b32_e32 v9, s17
2119
+ ; SDAG-NEXT: v_mov_b32_e32 v10, s18
2120
+ ; SDAG-NEXT: v_mov_b32_e32 v11, s19
2121
+ ; SDAG-NEXT: v_mov_b32_e32 v12, s20
2122
+ ; SDAG-NEXT: v_mov_b32_e32 v13, s21
2123
+ ; SDAG-NEXT: v_mov_b32_e32 v14, s22
2124
+ ; SDAG-NEXT: v_mov_b32_e32 v15, s23
2125
+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
2126
+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
2127
+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
2128
+ ; SDAG-NEXT: s_nop 1
2129
+ ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
2130
+ ; SDAG-NEXT: s_nop 7
2131
+ ; SDAG-NEXT: s_nop 3
2132
+ ; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5]
2133
+ ; SDAG-NEXT: s_endpgm
2134
+ ;
2135
+ ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__inline_imm:
2136
+ ; GISEL: ; %bb.0:
2137
+ ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
2138
+ ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
2139
+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2140
+ ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
2141
+ ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
2142
+ ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
2143
+ ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
2144
+ ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
2145
+ ; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
2146
+ ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
2147
+ ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
2148
+ ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
2149
+ ; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
2150
+ ; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
2151
+ ; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
2152
+ ; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
2153
+ ; GISEL-NEXT: s_nop 0
2154
+ ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
2155
+ ; GISEL-NEXT: v_mov_b32_e32 v0, 0
2156
+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2157
+ ; GISEL-NEXT: s_nop 7
2158
+ ; GISEL-NEXT: s_nop 1
2159
+ ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
2160
+ ; GISEL-NEXT: s_endpgm
2161
+ %result = call <4 x float > @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32 (<8 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 0 , i32 0 , i32 3 , i32 1065353216 , i32 1 , i32 -2 )
2162
+ store <4 x float > %result , ptr addrspace (1 ) %ptr , align 16
2163
+ ret void
2164
+ }
2165
+
2166
; Kernel test: calls the scaled MFMA intrinsic with both scale operands given
; as constants, 1065353216 (0x3F800000) and 1042479491 (0x3E22F983), and
; stores the <4 x float> result to %ptr. Both the SDAG and GISEL checks expect
; the two constants to appear directly in the MFMA instruction, printed as
; `1.0, 0.15915494`; neither is moved into a register beforehand.
; NOTE(review): going by the test name, 1065353216 is scale A and 1042479491
; is scale B -- confirm against the intrinsic's operand order.
+ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__FP_literal (<8 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , ptr addrspace (1 ) %ptr ) #0 {
2167
+ ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__FP_literal:
2168
+ ; SDAG: ; %bb.0:
2169
+ ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
2170
+ ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
2171
+ ; SDAG-NEXT: v_mov_b32_e32 v16, 0
2172
+ ; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
2173
+ ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2174
+ ; SDAG-NEXT: v_mov_b32_e32 v0, s8
2175
+ ; SDAG-NEXT: v_mov_b32_e32 v1, s9
2176
+ ; SDAG-NEXT: v_mov_b32_e32 v2, s10
2177
+ ; SDAG-NEXT: v_mov_b32_e32 v3, s11
2178
+ ; SDAG-NEXT: v_mov_b32_e32 v4, s12
2179
+ ; SDAG-NEXT: v_mov_b32_e32 v5, s13
2180
+ ; SDAG-NEXT: v_mov_b32_e32 v6, s14
2181
+ ; SDAG-NEXT: v_mov_b32_e32 v7, s15
2182
+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
2183
+ ; SDAG-NEXT: v_mov_b32_e32 v8, s16
2184
+ ; SDAG-NEXT: v_mov_b32_e32 v9, s17
2185
+ ; SDAG-NEXT: v_mov_b32_e32 v10, s18
2186
+ ; SDAG-NEXT: v_mov_b32_e32 v11, s19
2187
+ ; SDAG-NEXT: v_mov_b32_e32 v12, s20
2188
+ ; SDAG-NEXT: v_mov_b32_e32 v13, s21
2189
+ ; SDAG-NEXT: v_mov_b32_e32 v14, s22
2190
+ ; SDAG-NEXT: v_mov_b32_e32 v15, s23
2191
+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
2192
+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
2193
+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
2194
+ ; SDAG-NEXT: s_nop 1
2195
+ ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0]
2196
+ ; SDAG-NEXT: s_nop 7
2197
+ ; SDAG-NEXT: s_nop 3
2198
+ ; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5]
2199
+ ; SDAG-NEXT: s_endpgm
2200
+ ;
2201
+ ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__FP_literal:
2202
+ ; GISEL: ; %bb.0:
2203
+ ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
2204
+ ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
2205
+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2206
+ ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
2207
+ ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
2208
+ ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
2209
+ ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
2210
+ ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
2211
+ ; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
2212
+ ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
2213
+ ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
2214
+ ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
2215
+ ; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
2216
+ ; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
2217
+ ; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
2218
+ ; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
2219
+ ; GISEL-NEXT: s_nop 0
2220
+ ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0]
2221
+ ; GISEL-NEXT: v_mov_b32_e32 v0, 0
2222
+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2223
+ ; GISEL-NEXT: s_nop 7
2224
+ ; GISEL-NEXT: s_nop 1
2225
+ ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
2226
+ ; GISEL-NEXT: s_endpgm
2227
+ %result = call <4 x float > @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32 (<8 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 0 , i32 0 , i32 3 , i32 1065353216 , i32 1 , i32 1042479491 )
2228
+ store <4 x float > %result , ptr addrspace (1 ) %ptr , align 16
2229
+ ret void
2230
+ }
2231
+
2033
2232
; This should be optimized to avoid the scale
2034
2233
define <4 x float > @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a (<8 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %scale0 , i32 %scale1 ) {
2035
2234
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a:
0 commit comments