@@ -246,3 +246,113 @@ gpu.module @test_kernel {
    gpu.return
  }
}
+
+// -----
+#l = #xegpu.layout<inst_data = [16, 16]>
+#r = #xegpu.layout<inst_data = [16]>
+
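+// reduce_dim_0: with inst_data = [16, 16], the 16x64 -> 64 reduction over dim 0 is
+// expected to be blocked into four 16x16 -> 16 vector.multi_reduction ops (see CHECK lines).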
+gpu.module @kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Addresses, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR, SubgroupDispatch, VectorComputeINTEL, VectorAnyINTEL], [SPV_EXT_shader_atomic_float_add, SPV_KHR_expect_assume, SPV_INTEL_vector_compute]>, api=OpenCL, #spirv.resource_limits<>>} {
+  gpu.func @reduce_dim_0(%a: memref<16x512xf32>, %b: memref<512xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+    %acc = arith.constant dense<0.0> : vector<64xf32>
+    %c64 = arith.constant 64 : index
+    %block_id_x = gpu.block_id x
+    %m = arith.muli %block_id_x, %c64 : index
+    %0 = xegpu.create_nd_tdesc %a[0, %m] : memref<16x512xf32> -> !xegpu.tensor_desc<16x64xf32, #l>
+    %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16x64xf32, #l> -> vector<16x64xf32>
+    // CHECK: vector.multi_reduction <add>, {{.*}}, [[ACC:%[0-9A-Za-z]+]] [0] : vector<16x16xf32> to vector<16xf32>
+    // CHECK-COUNT-3: vector.multi_reduction <add>, {{.*}}, [[ACC]] [0] : vector<16x16xf32> to vector<16xf32>
+    %2 = vector.multi_reduction <add>, %1, %acc {layout_result_0 = #r} [0] : vector<16x64xf32> to vector<64xf32>
+    %3 = xegpu.create_nd_tdesc %b[%m] : memref<512xf32> -> !xegpu.tensor_desc<64xf32, #r>
+    xegpu.store_nd %2, %3 : vector<64xf32>, !xegpu.tensor_desc<64xf32, #r>
+    gpu.return
+  }
+}
+
+// -----
+#l = #xegpu.layout<inst_data = [16, 16]>
+#r = #xegpu.layout<inst_data = [16]>
+
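+// reduce_dim_1: with inst_data = [16, 16], the reduction over dim 1 is expected to be
+// rewritten into 16x16 -> 16 vector.multi_reduction ops (see CHECK lines).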
+gpu.module @kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Addresses, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR, SubgroupDispatch, VectorComputeINTEL, VectorAnyINTEL], [SPV_EXT_shader_atomic_float_add, SPV_KHR_expect_assume, SPV_INTEL_vector_compute]>, api=OpenCL, #spirv.resource_limits<>>} {
+  gpu.func @reduce_dim_1(%a: memref<512x32xf32>, %b: memref<512xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+    %c1 = arith.constant 1 : index
+    %c32 = arith.constant 32 : index
+    %acc = arith.constant dense<0.0> : vector<32xf32>
+
+    %block_id_x = gpu.block_id x
+    %block_id_y = gpu.block_id y
+
+    %m = arith.muli %block_id_x, %c32 : index
+    %n = arith.muli %block_id_y, %c32 : index
+    %0 = xegpu.create_nd_tdesc %a[%m, %n] : memref<512x32xf32> -> !xegpu.tensor_desc<32x128xf32, #l>
+    %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<32x128xf32, #l> -> vector<32x128xf32>
+
+    // CHECK: vector.multi_reduction <add>, {{.*}}, [[INIT:%[0-9A-Za-z]+]] [1] : vector<16x16xf32> to vector<16xf32>
+    // CHECK-COUNT-1: vector.multi_reduction <add>, {{.*}}, [[INIT]] [1] : vector<16x16xf32> to vector<16xf32>
+
+    %2 = vector.multi_reduction <add>, %1, %acc {layout_result_0 = #r} [1] : vector<32x128xf32> to vector<32xf32>
+    %3 = xegpu.create_nd_tdesc %b[%n] : memref<512xf32> -> !xegpu.tensor_desc<32xf32, #r>
+    xegpu.store_nd %2, %3 : vector<32xf32>, !xegpu.tensor_desc<32xf32, #r>
+    gpu.return
+  }
+}
+
+// -----
+#r = #xegpu.layout<inst_data = [16]>
+#l = #xegpu.layout<inst_data = [16, 16]>
+
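+// broadcast_dim_0: the 64 -> 16x64 broadcast is expected to be blocked into four
+// 16 -> 16x16 vector.broadcast ops (see CHECK line).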
+gpu.module @kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Addresses, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR, SubgroupDispatch, VectorComputeINTEL, VectorAnyINTEL], [SPV_EXT_shader_atomic_float_add, SPV_KHR_expect_assume, SPV_INTEL_vector_compute]>, api=OpenCL, #spirv.resource_limits<>>} {
+  gpu.func @broadcast_dim_0(%a: memref<512xf32>, %b: memref<16x512xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+
+    %c64 = arith.constant 64 : index
+    %block_id_x = gpu.block_id x
+    %m = arith.muli %block_id_x, %c64 : index
+    %0 = xegpu.create_nd_tdesc %a[%m] : memref<512xf32> -> !xegpu.tensor_desc<64xf32, #r>
+    %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<64xf32, #r> -> vector<64xf32>
+    // CHECK-COUNT-4: vector.broadcast {{.*}} : vector<16xf32> to vector<16x16xf32>
+    %2 = vector.broadcast %1 {layout_result_0 = #l} : vector<64xf32> to vector<16x64xf32>
+    %3 = xegpu.create_nd_tdesc %b[0, %m] : memref<16x512xf32> -> !xegpu.tensor_desc<16x64xf32, #l>
+    xegpu.store_nd %2, %3 : vector<16x64xf32>, !xegpu.tensor_desc<16x64xf32, #l>
+    gpu.return
+  }
+}
+
+// -----
+#r = #xegpu.layout<inst_data = [16]>
+#l = #xegpu.layout<inst_data = [16, 16]>
+
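+// broadcast_dim_1: the 32x1 -> 32x64 broadcast is expected to be blocked into eight
+// 16x1 -> 16x16 vector.broadcast ops (see CHECK line).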
+gpu.module @kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Addresses, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR, SubgroupDispatch, VectorComputeINTEL, VectorAnyINTEL], [SPV_EXT_shader_atomic_float_add, SPV_KHR_expect_assume, SPV_INTEL_vector_compute]>, api=OpenCL, #spirv.resource_limits<>>} {
+  gpu.func @broadcast_dim_1(%a: memref<512xf32>, %b: memref<16x512xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+
+    %c32 = arith.constant 32 : index
+    %block_id_x = gpu.block_id x
+    %m = arith.muli %block_id_x, %c32 : index
+    %0 = xegpu.create_nd_tdesc %a[%m] : memref<512xf32> -> !xegpu.tensor_desc<32xf32, #r>
+    %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<32xf32, #r> -> vector<32xf32>
+    %11 = vector.shape_cast %1 : vector<32xf32> to vector<32x1xf32>
+    // CHECK-COUNT-8: vector.broadcast {{.*}}: vector<16x1xf32> to vector<16x16xf32>
+    %2 = vector.broadcast %11 {layout_result_0 = #l} : vector<32x1xf32> to vector<32x64xf32>
+    %3 = xegpu.create_nd_tdesc %b[0, %m] : memref<16x512xf32> -> !xegpu.tensor_desc<32x64xf32, #l>
+    xegpu.store_nd %2, %3 : vector<32x64xf32>, !xegpu.tensor_desc<32x64xf32, #l>
+    gpu.return
+  }
+}
+
+// -----
+#l = #xegpu.layout<inst_data = [16, 8]>
+#t = #xegpu.layout<inst_data = [8, 16]>
+
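+// transpose: the 32x8 -> 8x32 transpose is expected to be blocked into two
+// 16x8 -> 8x16 vector.transpose ops (see CHECK line).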
+gpu.module @kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Addresses, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR, SubgroupDispatch, VectorComputeINTEL, VectorAnyINTEL], [SPV_EXT_shader_atomic_float_add, SPV_KHR_expect_assume, SPV_INTEL_vector_compute]>, api=OpenCL, #spirv.resource_limits<>>} {
+  gpu.func @transpose(%a: memref<512x8xf32>, %b: memref<8x512xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+
+    %c32 = arith.constant 32 : index
+    %block_id_x = gpu.block_id x
+    %m = arith.muli %block_id_x, %c32 : index
+    %0 = xegpu.create_nd_tdesc %a[%m, 0] : memref<512x8xf32> -> !xegpu.tensor_desc<32x8xf32, #l>
+    %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<32x8xf32, #l> -> vector<32x8xf32>
+    // CHECK-COUNT-2: vector.transpose {{.*}} [1, 0] : vector<16x8xf32> to vector<8x16xf32>
+    %2 = vector.transpose %1, [1, 0] {layout_result_0 = #t} : vector<32x8xf32> to vector<8x32xf32>
+    %3 = xegpu.create_nd_tdesc %b[0, %m] : memref<8x512xf32> -> !xegpu.tensor_desc<8x32xf32, #t>
+    xegpu.store_nd %2, %3 : vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #t>
+    gpu.return
+  }
+}