// RUN: mlir-opt %s \
// RUN:  -gpu-lower-to-nvvm-pipeline="cubin-chip=sm_90 cubin-features=+ptx80 opt-level=3" \
// RUN:  | mlir-cpu-runner \
// RUN:    --shared-libs=%mlir_cuda_runtime \
// RUN:    --shared-libs=%mlir_runner_utils \
// RUN:    --shared-libs=%mlir_c_runner_utils \
// RUN:    --entry-point-result=void \
// RUN:  | FileCheck %s
// CHECK: Correct Results :8192
// CHECK: Incorrect Results :0
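// This test exercises the Hopper (sm_90) TMA path: a 128x128 f16 buffer is
// viewed as a 2x2 grid of 64x64 tiles, loaded into dynamic shared memory with
// a single nvgpu.tma.async.load that completes through an mbarrier, copied
// back to global memory on the device, and finally verified on the host.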
module {
  func.func @main() {
    %c10000000 = arith.constant 10000000 : index
    %false = arith.constant false
    %c32768 = arith.constant 32768 : index
    %c31_i32 = arith.constant 31 : i32
    %c-1_i32 = arith.constant -1 : i32
    %c5_i32 = arith.constant 5 : i32
    %c0_i32 = arith.constant 0 : i32
    %c0 = arith.constant 0 : index
    %c8 = arith.constant 8 : index
    %c64 = arith.constant 64 : index
    %c2 = arith.constant 2 : index
    %c32768_i32 = arith.constant 32768 : i32
    %c128 = arith.constant 128 : index
    %c1 = arith.constant 1 : index
    %0 = llvm.mlir.constant(1 : i64) : i64
    %1 = llvm.mlir.constant(128 : i64) : i64
    %2 = llvm.mlir.constant(0 : i64) : i64
    %f0 = arith.constant 0.0 : f16
    %f123 = arith.constant 1.123 : f16
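    // Allocate the host buffers: the source gets an index-derived f16 pattern
    // and the destination is zero-initialized.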
    %srcMemref_host = memref.alloc() : memref<128x128xf16>
    %dstMemref_host = memref.alloc() : memref<128x128xf16>
    scf.for %arg0 = %c0 to %c128 step %c1 {
      scf.for %arg1 = %c0 to %c64 step %c1 {
        %d1 = arith.index_cast %arg0 : index to i32
        %d2 = arith.index_cast %arg1 : index to i32
        %d3 = arith.sitofp %d1 : i32 to f16
        %d4 = arith.sitofp %d2 : i32 to f16
        %d5 = arith.addf %d3, %f123 : f16
        %d6 = arith.constant 3.12 : f16
        %d7 = arith.mulf %d5, %d6 : f16
        %d8 = arith.addf %d7, %d5 : f16
        %d9 = arith.constant 0.178 : f16
        %d10 = arith.divf %d9, %d8 : f16
        memref.store %d10, %srcMemref_host[%arg0, %arg1] : memref<128x128xf16>
        memref.store %f0, %dstMemref_host[%arg0, %arg1] : memref<128x128xf16>
      }
    }
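    // Chain async tokens: allocate device buffers and copy both host buffers
    // to the GPU.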
    %s1 = gpu.wait async
    %srcMemref, %s2 = gpu.alloc async [%s1] () : memref<128x128xf16>
    %dstMemref, %s3 = gpu.alloc async [%s2] () : memref<128x128xf16>
    %s4 = gpu.memcpy async [%s3] %srcMemref, %srcMemref_host : memref<128x128xf16>, memref<128x128xf16>
    %s5 = gpu.memcpy async [%s4] %dstMemref, %dstMemref_host : memref<128x128xf16>, memref<128x128xf16>
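    // Reinterpret the 128x128 source as a 2x2 grid of 64x64 tiles and create
    // a TMA descriptor over it on the host (box [2, 2, 64, 64], no swizzle).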
    %expand_shape = memref.expand_shape %srcMemref [[0, 1], [2, 3]] : memref<128x128xf16> into memref<2x64x2x64xf16>
    %transpose = memref.transpose %expand_shape (d0, d1, d2, d3) -> (d0, d2, d1, d3) : memref<2x64x2x64xf16> to memref<2x2x64x64xf16, strided<[8192, 64, 128, 1]>>
    %cast = memref.cast %transpose : memref<2x2x64x64xf16, strided<[8192, 64, 128, 1]>> to memref<*xf16>
    %24 = nvgpu.tma.create.descriptor %cast box[%c2, %c2, %c64, %c64] : memref<*xf16> -> !nvgpu.tensormap.descriptor<tensor = memref<2x2x64x64xf16, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>
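    // Launch a single block of 128 threads with 32 KB of dynamic shared
    // memory, enough to stage the 2x2x64x64 f16 tile buffer.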
    gpu.launch
      blocks(%arg2, %arg3, %arg4) in (%arg8 = %c1, %arg9 = %c1, %arg10 = %c1)
      threads(%arg5, %arg6, %arg7) in (%arg11 = %c128, %arg12 = %c1, %arg13 = %c1)
      dynamic_shared_memory_size %c32768_i32
    {
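      // Carve the staging buffer out of dynamic shared memory and create the
      // mbarrier that tracks completion of the TMA transfer.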
      %26 = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
      %view = memref.view %26[%c0][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<2x2x64x64xf16, #gpu.address_space<workgroup>>
      %27 = nvgpu.mbarrier.create -> !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
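      // Elect a single leader thread of warp 0 and let it initialize the
      // mbarrier for one arrival.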
      %thread_id_x = gpu.thread_id x
      %28 = arith.index_cast %thread_id_x : index to i32
      %29 = arith.shrui %28, %c5_i32 : i32
      %30 = nvvm.shfl.sync idx %c-1_i32, %29, %c0_i32, %c31_i32 : i32 -> i32
      %31 = arith.cmpi eq, %30, %c0_i32 : i32
      %32 = nvvm.elect.sync -> i1
      %33 = arith.andi %31, %32 : i1
      scf.if %33 {
        nvgpu.mbarrier.init %27[%c0], %c1 : !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
      }
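      // The leader declares the expected transaction size (2x2x64x64 f16 =
      // 32768 bytes) and issues the asynchronous TMA load into shared memory.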
      %34 = nvvm.shfl.sync idx %c-1_i32, %29, %c0_i32, %c31_i32 : i32 -> i32
      %35 = arith.cmpi eq, %34, %c0_i32 : i32
      %36 = nvvm.elect.sync -> i1
      %37 = arith.andi %35, %36 : i1
      scf.if %37 {
        nvgpu.mbarrier.arrive.expect_tx %27[%c0], %c32768 : !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
        nvgpu.tma.async.load %24[%c0, %c0, %c0, %c0], %27[%c0] to %view : !nvgpu.tensormap.descriptor<tensor = memref<2x2x64x64xf16, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>> -> memref<2x2x64x64xf16, #gpu.address_space<workgroup>>
      }
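      // Every thread waits until the mbarrier has observed all expected bytes.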
      nvgpu.mbarrier.try_wait.parity %27[%c0], %false, %c10000000 : !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
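      // Thread 0 copies each 64x64 tile from shared memory into the matching
      // block of the global destination buffer.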
      scf.for %arg14 = %c0 to %c2 step %c1 {
        scf.for %arg15 = %c0 to %c2 step %c1 {
          %38 = arith.muli %arg14, %c64 : index
          %39 = arith.muli %arg15, %c64 : index
          %subview = memref.subview %view[%arg14, %arg15, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x2x64x64xf16, #gpu.address_space<workgroup>> to memref<64x64xf16, strided<[64, 1], offset: ?>, #gpu.address_space<workgroup>>
          %subview_0 = memref.subview %dstMemref[%38, %39] [64, 64] [1, 1] : memref<128x128xf16> to memref<64x64xf16, strided<[128, 1], offset: ?>>
          %block_dim_x = gpu.block_dim x
          %thread_id_y = gpu.thread_id y
          %40 = arith.muli %thread_id_y, %block_dim_x : index
          %41 = arith.addi %thread_id_x, %40 : index
          %block_dim_y = gpu.block_dim y
          %42 = arith.muli %block_dim_x, %block_dim_y : index
          %thread_id_z = gpu.thread_id z
          %43 = arith.muli %thread_id_z, %42 : index
          %44 = arith.addi %41, %43 : index
          %45 = arith.cmpi eq, %44, %c0 : index
          scf.if %45 {
            scf.for %arg16 = %c0 to %c64 step %c1 {
              scf.for %arg17 = %c0 to %c64 step %c1 {
                %46 = memref.load %subview[%arg16, %arg17] : memref<64x64xf16, strided<[64, 1], offset: ?>, #gpu.address_space<workgroup>>
                memref.store %46, %subview_0[%arg16, %arg17] : memref<64x64xf16, strided<[128, 1], offset: ?>>
              }
            }
          }
          gpu.barrier
        }
      }
      gpu.terminator
    }
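    // Copy the result back to the host and synchronize before checking.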
    %s6 = gpu.memcpy async [%s5] %dstMemref_host, %dstMemref : memref<128x128xf16>, memref<128x128xf16>
    gpu.wait [%s6]
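    // Compare destination against source element-wise, counting matching and
    // mismatching values; FileCheck verifies the totals printed below.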
    %errorCount, %correctCount = scf.for %arg0 = %c0 to %c128 step %c1 iter_args(%ec1 = %c0, %cc1 = %c0) -> (index, index) {
      %ec2, %cc2 = scf.for %arg1 = %c0 to %c64 step %c1 iter_args(%ec2 = %ec1, %cc2 = %cc1) -> (index, index) {
        %v1 = memref.load %dstMemref_host[%arg0, %arg1] : memref<128x128xf16>
        %v2 = memref.load %srcMemref_host[%arg0, %arg1] : memref<128x128xf16>
        %p = arith.cmpf one, %v1, %v2 : f16
        %ec3, %cc3 = scf.if %p -> (index, index) {
          %ec3 = arith.addi %ec2, %c1 : index
          scf.yield %ec3, %cc2 : index, index
        } else {
          %cc3 = arith.addi %cc2, %c1 : index
          scf.yield %ec2, %cc3 : index, index
        }
        scf.yield %ec3, %cc3 : index, index
      }
      scf.yield %ec2, %cc2 : index, index
    }
    vector.print str "Correct Results :"
    vector.print %correctCount : index
    vector.print str "Incorrect Results :"
    vector.print %errorCount : index
    return
  }
}