@@ -2024,6 +2024,205 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
2024
2024
ret void
2025
2025
}
2026
2026
2027
; Scaled-MFMA kernel test. Per the test name, scale A is a "kimm" constant
; (i32 65 = 0x41) and scale B is an FP-valued constant (i32 1065353216 =
; 0x3F800000, the IEEE-754 single-precision bit pattern of 1.0).
; Expected output below: the SDAG checks materialize 0x41 in s6 with
; s_movk_i32, the GISEL checks materialize it in v16 with v_mov_b32, and both
; print the second scale operand directly as 1.0 on the v_mfma_scale
; instruction. The result is stored to %ptr with global_store_dwordx4.
; NOTE(review): the check lines look auto-generated (presumably by
; update_llc_test_checks.py) - regenerate rather than hand-edit; confirm
; against the header of the test file.
+ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__FP_literal (<8 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , ptr addrspace (1 ) %ptr ) #0 {
2028
+ ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__FP_literal:
2029
+ ; SDAG: ; %bb.0:
2030
+ ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
2031
+ ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
2032
+ ; SDAG-NEXT: s_movk_i32 s6, 0x41
2033
+ ; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
2034
+ ; SDAG-NEXT: v_mov_b32_e32 v16, 0
2035
+ ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2036
+ ; SDAG-NEXT: v_mov_b32_e32 v0, s8
2037
+ ; SDAG-NEXT: v_mov_b32_e32 v1, s9
2038
+ ; SDAG-NEXT: v_mov_b32_e32 v2, s10
2039
+ ; SDAG-NEXT: v_mov_b32_e32 v3, s11
2040
+ ; SDAG-NEXT: v_mov_b32_e32 v4, s12
2041
+ ; SDAG-NEXT: v_mov_b32_e32 v5, s13
2042
+ ; SDAG-NEXT: v_mov_b32_e32 v6, s14
2043
+ ; SDAG-NEXT: v_mov_b32_e32 v7, s15
2044
+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
2045
+ ; SDAG-NEXT: v_mov_b32_e32 v8, s16
2046
+ ; SDAG-NEXT: v_mov_b32_e32 v9, s17
2047
+ ; SDAG-NEXT: v_mov_b32_e32 v10, s18
2048
+ ; SDAG-NEXT: v_mov_b32_e32 v11, s19
2049
+ ; SDAG-NEXT: v_mov_b32_e32 v12, s20
2050
+ ; SDAG-NEXT: v_mov_b32_e32 v13, s21
2051
+ ; SDAG-NEXT: v_mov_b32_e32 v14, s22
2052
+ ; SDAG-NEXT: v_mov_b32_e32 v15, s23
2053
+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
2054
+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
2055
+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
2056
+ ; SDAG-NEXT: s_nop 1
2057
+ ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s6, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0]
2058
+ ; SDAG-NEXT: s_nop 7
2059
+ ; SDAG-NEXT: s_nop 3
2060
+ ; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5]
2061
+ ; SDAG-NEXT: s_endpgm
2062
+ ;
2063
+ ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__FP_literal:
2064
+ ; GISEL: ; %bb.0:
2065
+ ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
2066
+ ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
2067
+ ; GISEL-NEXT: v_mov_b32_e32 v16, 0x41
2068
+ ; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
2069
+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2070
+ ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
2071
+ ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
2072
+ ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
2073
+ ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
2074
+ ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
2075
+ ; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
2076
+ ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
2077
+ ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
2078
+ ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
2079
+ ; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
2080
+ ; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
2081
+ ; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
2082
+ ; GISEL-NEXT: s_nop 1
2083
+ ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0]
2084
+ ; GISEL-NEXT: v_mov_b32_e32 v0, 0
2085
+ ; GISEL-NEXT: s_nop 7
2086
+ ; GISEL-NEXT: s_nop 2
2087
+ ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
2088
+ ; GISEL-NEXT: s_endpgm
2089
+ %result = call <4 x float > @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32 (<8 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 0 , i32 0 , i32 3 , i32 65 , i32 1 , i32 1065353216 )
2090
+ store <4 x float > %result , ptr addrspace (1 ) %ptr , align 16
2091
+ ret void
2092
+ }
2093
+
2094
; Scaled-MFMA kernel test. Per the test name, scale A is an FP-valued constant
; (i32 1065353216 = 0x3F800000, the IEEE-754 single-precision bit pattern of
; 1.0) and scale B is an inline integer immediate (i32 -2).
; Expected output below: both the SDAG and GISEL checks print the two scale
; operands directly on the v_mfma_scale instruction as "1.0, -2"; no register
; is loaded with either constant. The GISEL sequence differs from the SDAG one
; in scheduling: the s_load_dwordx2 of the store pointer is issued after the
; accvgpr writes, and a second s_waitcnt lgkmcnt(0) follows the MFMA.
; NOTE(review): the check lines look auto-generated (presumably by
; update_llc_test_checks.py) - regenerate rather than hand-edit; confirm
; against the header of the test file.
+ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__inline_imm (<8 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , ptr addrspace (1 ) %ptr ) #0 {
2095
+ ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__inline_imm:
2096
+ ; SDAG: ; %bb.0:
2097
+ ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
2098
+ ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
2099
+ ; SDAG-NEXT: v_mov_b32_e32 v16, 0
2100
+ ; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
2101
+ ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2102
+ ; SDAG-NEXT: v_mov_b32_e32 v0, s8
2103
+ ; SDAG-NEXT: v_mov_b32_e32 v1, s9
2104
+ ; SDAG-NEXT: v_mov_b32_e32 v2, s10
2105
+ ; SDAG-NEXT: v_mov_b32_e32 v3, s11
2106
+ ; SDAG-NEXT: v_mov_b32_e32 v4, s12
2107
+ ; SDAG-NEXT: v_mov_b32_e32 v5, s13
2108
+ ; SDAG-NEXT: v_mov_b32_e32 v6, s14
2109
+ ; SDAG-NEXT: v_mov_b32_e32 v7, s15
2110
+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
2111
+ ; SDAG-NEXT: v_mov_b32_e32 v8, s16
2112
+ ; SDAG-NEXT: v_mov_b32_e32 v9, s17
2113
+ ; SDAG-NEXT: v_mov_b32_e32 v10, s18
2114
+ ; SDAG-NEXT: v_mov_b32_e32 v11, s19
2115
+ ; SDAG-NEXT: v_mov_b32_e32 v12, s20
2116
+ ; SDAG-NEXT: v_mov_b32_e32 v13, s21
2117
+ ; SDAG-NEXT: v_mov_b32_e32 v14, s22
2118
+ ; SDAG-NEXT: v_mov_b32_e32 v15, s23
2119
+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
2120
+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
2121
+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
2122
+ ; SDAG-NEXT: s_nop 1
2123
+ ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
2124
+ ; SDAG-NEXT: s_nop 7
2125
+ ; SDAG-NEXT: s_nop 3
2126
+ ; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5]
2127
+ ; SDAG-NEXT: s_endpgm
2128
+ ;
2129
+ ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__inline_imm:
2130
+ ; GISEL: ; %bb.0:
2131
+ ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
2132
+ ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
2133
+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2134
+ ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
2135
+ ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
2136
+ ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
2137
+ ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
2138
+ ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
2139
+ ; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
2140
+ ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
2141
+ ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
2142
+ ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
2143
+ ; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
2144
+ ; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
2145
+ ; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
2146
+ ; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
2147
+ ; GISEL-NEXT: s_nop 0
2148
+ ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
2149
+ ; GISEL-NEXT: v_mov_b32_e32 v0, 0
2150
+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2151
+ ; GISEL-NEXT: s_nop 7
2152
+ ; GISEL-NEXT: s_nop 1
2153
+ ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
2154
+ ; GISEL-NEXT: s_endpgm
2155
+ %result = call <4 x float > @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32 (<8 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 0 , i32 0 , i32 3 , i32 1065353216 , i32 1 , i32 -2 )
2156
+ store <4 x float > %result , ptr addrspace (1 ) %ptr , align 16
2157
+ ret void
2158
+ }
2159
+
2160
; Scaled-MFMA kernel test. Per the test name, both scale operands are FP-valued
; constants: scale A is i32 1065353216 (0x3F800000, the IEEE-754
; single-precision bit pattern of 1.0) and scale B is i32 1042479491
; (0x3E22F983, which the expected output prints as 0.15915494).
; Expected output below: both the SDAG and GISEL checks print the two scale
; operands directly on the v_mfma_scale instruction as "1.0, 0.15915494"; no
; register is loaded with either constant. The GISEL sequence differs from the
; SDAG one in scheduling: the s_load_dwordx2 of the store pointer is issued
; after the accvgpr writes, and a second s_waitcnt lgkmcnt(0) follows the MFMA.
; NOTE(review): the check lines look auto-generated (presumably by
; update_llc_test_checks.py) - regenerate rather than hand-edit; confirm
; against the header of the test file.
+ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__FP_literal (<8 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , ptr addrspace (1 ) %ptr ) #0 {
2161
+ ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__FP_literal:
2162
+ ; SDAG: ; %bb.0:
2163
+ ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
2164
+ ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
2165
+ ; SDAG-NEXT: v_mov_b32_e32 v16, 0
2166
+ ; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
2167
+ ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2168
+ ; SDAG-NEXT: v_mov_b32_e32 v0, s8
2169
+ ; SDAG-NEXT: v_mov_b32_e32 v1, s9
2170
+ ; SDAG-NEXT: v_mov_b32_e32 v2, s10
2171
+ ; SDAG-NEXT: v_mov_b32_e32 v3, s11
2172
+ ; SDAG-NEXT: v_mov_b32_e32 v4, s12
2173
+ ; SDAG-NEXT: v_mov_b32_e32 v5, s13
2174
+ ; SDAG-NEXT: v_mov_b32_e32 v6, s14
2175
+ ; SDAG-NEXT: v_mov_b32_e32 v7, s15
2176
+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
2177
+ ; SDAG-NEXT: v_mov_b32_e32 v8, s16
2178
+ ; SDAG-NEXT: v_mov_b32_e32 v9, s17
2179
+ ; SDAG-NEXT: v_mov_b32_e32 v10, s18
2180
+ ; SDAG-NEXT: v_mov_b32_e32 v11, s19
2181
+ ; SDAG-NEXT: v_mov_b32_e32 v12, s20
2182
+ ; SDAG-NEXT: v_mov_b32_e32 v13, s21
2183
+ ; SDAG-NEXT: v_mov_b32_e32 v14, s22
2184
+ ; SDAG-NEXT: v_mov_b32_e32 v15, s23
2185
+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
2186
+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
2187
+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
2188
+ ; SDAG-NEXT: s_nop 1
2189
+ ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0]
2190
+ ; SDAG-NEXT: s_nop 7
2191
+ ; SDAG-NEXT: s_nop 3
2192
+ ; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5]
2193
+ ; SDAG-NEXT: s_endpgm
2194
+ ;
2195
+ ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__FP_literal:
2196
+ ; GISEL: ; %bb.0:
2197
+ ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
2198
+ ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
2199
+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2200
+ ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
2201
+ ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
2202
+ ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
2203
+ ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
2204
+ ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
2205
+ ; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
2206
+ ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
2207
+ ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
2208
+ ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
2209
+ ; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
2210
+ ; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
2211
+ ; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
2212
+ ; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
2213
+ ; GISEL-NEXT: s_nop 0
2214
+ ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0]
2215
+ ; GISEL-NEXT: v_mov_b32_e32 v0, 0
2216
+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2217
+ ; GISEL-NEXT: s_nop 7
2218
+ ; GISEL-NEXT: s_nop 1
2219
+ ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
2220
+ ; GISEL-NEXT: s_endpgm
2221
+ %result = call <4 x float > @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32 (<8 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 0 , i32 0 , i32 3 , i32 1065353216 , i32 1 , i32 1042479491 )
2222
+ store <4 x float > %result , ptr addrspace (1 ) %ptr , align 16
2223
+ ret void
2224
+ }
2225
+
2027
2226
; This should be optimized to avoid the scale
2028
2227
define <4 x float > @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a (<8 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %scale0 , i32 %scale1 ) {
2029
2228
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a:
0 commit comments