[MLIR][GPU] Fix gpu.printf (#121940)

grypp · web-flow · commit f50f9698ad01 · 2025-01-08T08:25:57.000+01:00
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1055,7 +1055,7 @@ def GPU_PrintfOp : GPU_Op<"printf", [MemoryEffects<[MemWrite]>]>,
     imposed by one's target platform.
   }];
   let assemblyFormat = [{
-    $format attr-dict ($args^ `:` type($args))?
+    $format attr-dict (`,` $args^ `:` type($args))?
   }];
 }
 
diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
@@ -633,7 +633,7 @@ gpu.module @test_module_29 {
     // CHECK-NEXT: %[[EL1:.*]] = llvm.getelementptr %[[ALLOC]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(i32, f64)>
     // CHECK-NEXT: llvm.store %[[EXT]], %[[EL1]] : f64, !llvm.ptr
     // CHECK-NEXT: llvm.call @vprintf(%[[FORMATSTART]], %[[ALLOC]]) : (!llvm.ptr, !llvm.ptr) -> i32
-    gpu.printf "Hello: %d\n" %arg0, %arg1 : i32, f32
+    gpu.printf "Hello: %d\n", %arg0, %arg1 : i32, f32
     gpu.return
   }
 }
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-hip.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-hip.mlir
@@ -36,7 +36,7 @@ gpu.module @test_module {
     // CHECK-NEXT: %[[NARGS1:.*]] = llvm.mlir.constant(1 : i32) : i32
     // CHECK-NEXT: %[[ARG0_64:.*]] = llvm.zext %[[ARG0]] : i32 to i64
     // CHECK-NEXT: %{{.*}} = llvm.call @__ockl_printf_append_args(%[[DESC1]], %[[NARGS1]], %[[ARG0_64]], %[[CST0]], %[[CST0]], %[[CST0]], %[[CST0]], %[[CST0]], %[[CST0]], %[[ISLAST]]) : (i64, i32, i64, i64, i64, i64, i64, i64, i64, i32) -> i64
-    gpu.printf "Hello: %d\n" %arg0 : i32
+    gpu.printf "Hello: %d\n", %arg0 : i32
     gpu.return
   }
 }
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-opencl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-opencl.mlir
@@ -9,7 +9,7 @@ gpu.module @test_module {
     // CHECK: %[[IMM0:.*]] = llvm.mlir.addressof @[[$PRINT_GLOBAL]] : !llvm.ptr<4>
     // CHECK-NEXT: %[[IMM2:.*]] = llvm.getelementptr %[[IMM0]][0, 0] : (!llvm.ptr<4>) -> !llvm.ptr<4>, !llvm.array<11 x i8>
     // CHECK-NEXT: %{{.*}} = llvm.call @printf(%[[IMM2]], %[[ARG0]]) vararg(!llvm.func<i32 (ptr<4>, ...)>) : (!llvm.ptr<4>, i32) -> i32
-    gpu.printf "Hello: %d\n" %arg0 : i32
+    gpu.printf "Hello: %d\n", %arg0 : i32
     gpu.return
   }
 }
diff --git a/mlir/test/Conversion/GPUToSPIRV/printf.mlir b/mlir/test/Conversion/GPUToSPIRV/printf.mlir
@@ -62,7 +62,7 @@ module attributes {
         // CHECK: [[FMTSTR_ADDR:%.*]] = spirv.mlir.addressof [[PRINTMSG]] : !spirv.ptr<!spirv.array<[[ARRAYSIZE]] x i8>, UniformConstant>
         // CHECK-NEXT: [[FMTSTR_PTR1:%.*]] = spirv.Bitcast [[FMTSTR_ADDR]] : !spirv.ptr<!spirv.array<[[ARRAYSIZE]] x i8>, UniformConstant> to !spirv.ptr<i8, UniformConstant>
         // CHECK-NEXT:  {{%.*}} = spirv.CL.printf [[FMTSTR_PTR1]] {{%.*}}, {{%.*}}, {{%.*}} : !spirv.ptr<i8, UniformConstant>, i32, f32, i32 -> i32
-        gpu.printf "\nHello, world : %d %f \n Thread id: %d\n" %arg0, %arg1, %2: i32, f32, index
+        gpu.printf "\nHello, world : %d %f \n Thread id: %d\n", %arg0, %arg1, %2: i32, f32, index
 
         // CHECK: spirv.Return
         gpu.return
diff --git a/mlir/test/Dialect/GPU/indirect-device-func-call.mlir b/mlir/test/Dialect/GPU/indirect-device-func-call.mlir
@@ -6,7 +6,7 @@ gpu.module @kernels {
     func.func @hello(%arg0 : f32) {
         %tid_x = gpu.thread_id x
         %csti8 = arith.constant 2 : i8
-        gpu.printf "Hello from %lld, %d, %f\n" %tid_x, %csti8, %arg0  : index, i8, f32
+        gpu.printf "Hello from %lld, %d, %f\n", %tid_x, %csti8, %arg0  : index, i8, f32
         return
     }
     // CHECK-LABEL: @hello_indirect
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
@@ -229,9 +229,22 @@ module attributes {gpu.container_module} {
 
     // CHECK-LABEL: gpu.func @printf_test
     // CHECK: (%[[ARG0:.*]]: i32)
-    // CHECK: gpu.printf "Value: %d" %[[ARG0]] : i32
+    // CHECK: gpu.printf "Value: %d", %[[ARG0]] : i32
     gpu.func @printf_test(%arg0 : i32) {
-      gpu.printf "Value: %d" %arg0 : i32
+      gpu.printf "Value: %d", %arg0 : i32
+      gpu.return
+    }
+
+    // CHECK-LABEL: gpu.func @printf_empty
+    // CHECK: gpu.printf  "]"
+    // CHECK: scf.if
+    // CHECK: gpu.printf ", "
+    gpu.func @printf_empty(%arg0 : i32) {
+      gpu.printf "]"
+      %1 = arith.cmpi slt, %arg0, %arg0 : i32
+      scf.if %1 {
+        gpu.printf ", "
+      } 
       gpu.return
     }
 
diff --git a/mlir/test/Dialect/GPU/test-nvvm-pipeline.mlir b/mlir/test/Dialect/GPU/test-nvvm-pipeline.mlir
@@ -23,7 +23,7 @@ func.func @test_math(%arg0 : f32) {
         threads(%6, %7, %8) in (%9 = %c2, %10 = %c1, %11 = %c1) { 
         // CHECK-NVVM: __nv_expf 
         %s1 = math.exp %arg0 : f32
-        gpu.printf "%f" %s1 : f32
+        gpu.printf "%f", %s1 : f32
         gpu.terminator
     }
     return
diff --git a/mlir/test/Integration/GPU/CUDA/assert.mlir b/mlir/test/Integration/GPU/CUDA/assert.mlir
@@ -16,10 +16,10 @@ gpu.module @kernels {
 gpu.func @test_assert(%c0: i1, %c1: i1) kernel {
   %0 = gpu.thread_id x
   cf.assert %c1, "passing assertion"
-  gpu.printf "thread %lld: print after passing assertion\n" %0 : index
+  gpu.printf "thread %lld: print after passing assertion\n", %0 : index
   // Test callsite(callsite(name)) location.
   cf.assert %c0, "failing assertion" loc(callsite(callsite("callee_func_name"("callee_file.cc":7:9) at "caller_file.cc":10:8) at "caller2_file.cc":11:12))
-  gpu.printf "thread %lld: print after failing assertion\n" %0 : index
+  gpu.printf "thread %lld: print after failing assertion\n", %0 : index
   gpu.return
 }
 }
diff --git a/mlir/test/Integration/GPU/CUDA/printf.mlir b/mlir/test/Integration/GPU/CUDA/printf.mlir
@@ -14,7 +14,7 @@ module attributes {gpu.container_module} {
             %0 = gpu.thread_id x
             %csti8 = arith.constant 2 : i8
             %cstf32 = arith.constant 3.0 : f32
-            gpu.printf "Hello from %lld, %d, %f\n" %0, %csti8, %cstf32  : index, i8, f32
+            gpu.printf "Hello from %lld, %d, %f\n", %0, %csti8, %cstf32  : index, i8, f32
             gpu.return
         }
     }
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/cga_cluster.mlir b/mlir/test/Integration/GPU/CUDA/sm90/cga_cluster.mlir
@@ -43,7 +43,7 @@ module attributes {gpu.container_module} {
       %cnd2 =  arith.cmpi eq, %bidY, %c3 : index
       scf.if %cnd1 {
         scf.if %cnd2 {
-          gpu.printf "clusterIdx: (%d, %d, %d) in Cluster Dimension: (%d, %d, %d) blockIdx: (%d, %d, %d) \n" 
+          gpu.printf "clusterIdx: (%d, %d, %d) in Cluster Dimension: (%d, %d, %d) blockIdx: (%d, %d, %d) \n",
             %cidX_i32,
             %cidY_i32,
             %cidZ_i32,
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
@@ -85,7 +85,7 @@ module @mymod {
       
       // Step 7. First thread does TMA load
       scf.if %10 {
-        gpu.printf "[GPU] TMA SIZE %d\0A" %c8192 : index
+        gpu.printf "[GPU] TMA SIZE %d\0A", %c8192 : index
         nvgpu.tma.async.load %3[%c0, %c0], %9[%c0] to %7 : !lhsTensorMap, !barrierType -> !shmemlhs
         nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c8192 : !barrierType
       } else {
@@ -98,16 +98,16 @@ module @mymod {
 
       // Step 9. Print loaded data in 128b swizzled
       scf.if %10 {        
-        gpu.printf "===--- Matrix A ---=== %d \0A" %c-1_i32 : i32
+        gpu.printf "===--- Matrix A ---=== %d \0A", %c-1_i32 : i32
         scf.for %arg12 = %c0 to %c128 step %c1 {
           scf.for %arg13 = %c0 to %c64 step %c1 {
             %15 = memref.load %7[%arg12, %arg13] : !shmemlhs
             %16 = arith.extf %15 : f16 to f32
-            gpu.printf "%.0f,   " %16 : f32
+            gpu.printf "%.0f,   ", %16 : f32
           }
-          gpu.printf "%d\0A" %c-1_i32 : i32
+          gpu.printf "%d\0A", %c-1_i32 : i32
         }
-        gpu.printf "===----------------=== %d \0A" %c-1_i32 : i32
+        gpu.printf "===----------------=== %d \0A", %c-1_i32 : i32
       }
       gpu.terminator
     }
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir
@@ -109,7 +109,7 @@ module @mymod {
       
       // Step 6. First thread does TMA load
       scf.if %10 {
-        gpu.printf "[GPU] TMA SIZE %d\0A" %c32768 : index
+        gpu.printf "[GPU] TMA SIZE %d\0A", %c32768 : index
         nvgpu.tma.async.load %d_lhsTensorMap[%c0, %c0], %9[%c0] to %lhsShmem : !lhsTensorMap, !barrierType -> !shmemlhs
         nvgpu.tma.async.load %d_rhsTensorMap[%c0, %c0], %9[%c0] to %rhsShmem1 : !rhsTensorMap, !barrierType -> memref<64x64xf16, strided<[128, 1]>, 3>
         nvgpu.tma.async.load %d_rhsTensorMap[%c64, %c0], %9[%c0] to %rhsShmem2 : !rhsTensorMap, !barrierType -> memref<64x64xf16, strided<[128, 1], offset: 4096>, 3>
@@ -124,7 +124,7 @@ module @mymod {
 
       // Step 8. Print loaded data in 128b swizzled
       scf.if %10 {        
-        gpu.printf "===--- Matrix B ---=== %d \n" %c-1_i32 : i32
+        gpu.printf "===--- Matrix B ---=== %d \n", %c-1_i32 : i32
         scf.for %ii = %c0 to %c64 step %c1 {
           scf.for %j = %c0 to %c128 step %c1 {
             %lhs0 = memref.load %rhsShmem[%ii, %j] : !shmemrhs
@@ -133,7 +133,7 @@ module @mymod {
           }
           gpu.printf "%d\n" %c-1_i32 : i32
         }
-        gpu.printf "===----------------=== %d \n" %c-1_i32 : i32
+        gpu.printf "===----------------=== %d \n", %c-1_i32 : i32
       }
       gpu.barrier
       gpu.terminator
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
@@ -80,8 +80,8 @@ module @mymod {
         nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c6144 : <memorySpace = #gpu.address_space<workgroup>>
         %11 = memref.load %7[%c0, %c0] : memref<64x8xf32, 3>
         %12 = memref.load %8[%c0, %c0] : memref<8x128xf32, 3>
-        gpu.printf "[GPU] TMA BEFORE lhs[45][7] %f\0A" %11 : f32
-        gpu.printf "[GPU] TMA BEFORE rhs[7][0] %f\0A" %12 : f32
+        gpu.printf "[GPU] TMA BEFORE lhs[45][7] %f\0A", %11 : f32
+        gpu.printf "[GPU] TMA BEFORE rhs[7][0] %f\0A", %12 : f32
         nvgpu.tma.async.load %3[%c0, %c0], %9[%c0] to %7 : <tensor = memref<64x8xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<64x8xf32, 3>
         nvgpu.tma.async.load %4[%c0, %c0], %9[%c0] to %8 : <tensor = memref<8x128xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<8x128xf32, 3>
       } else {
@@ -92,8 +92,8 @@ module @mymod {
       scf.if %10 {
         %11 = memref.load %7[%c45, %c7] : memref<64x8xf32, 3>
         %12 = memref.load %8[%c7, %c0] : memref<8x128xf32, 3>
-        gpu.printf "[GPU] TMA LOADED lhs[45][7] %f\0A" %11 : f32
-        gpu.printf "[GPU] TMA LOADED rhs[7][0] %f\0A" %12 : f32
+        gpu.printf "[GPU] TMA LOADED lhs[45][7] %f\0A", %11 : f32
+        gpu.printf "[GPU] TMA LOADED rhs[7][0] %f\0A", %12 : f32
       }
       gpu.terminator
     }
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir b/mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir
@@ -96,8 +96,8 @@ func.func @main() {
     scf.if %10 {
       %11 = memref.load %out[%c45, %c7] : memref<64x8xf32, 3>
       %12 = memref.load %out_1[%c7, %c0] : memref<8x128xf32, 3>
-      gpu.printf "[GPU] TMA LOADED lhs[45][7] %f\0A" %11 : f32
-      gpu.printf "[GPU] TMA LOADED rhs[7][0] %f\0A" %12 : f32
+      gpu.printf "[GPU] TMA LOADED lhs[45][7] %f\0A", %11 : f32
+      gpu.printf "[GPU] TMA LOADED rhs[7][0] %f\0A", %12 : f32
     }
     gpu.terminator
   }
diff --git a/mlir/test/Integration/GPU/ROCM/printf.mlir b/mlir/test/Integration/GPU/ROCM/printf.mlir
@@ -13,7 +13,7 @@ module attributes {gpu.container_module} {
     gpu.module @kernels {
         gpu.func @hello() kernel {
             %0 = gpu.thread_id x
-            gpu.printf "Hello from %d\n" %0 : index
+            gpu.printf "Hello from %d\n", %0 : index
             gpu.return
         }
     }

Original file line number	Diff line number	Diff line change
`@@ -1055,7 +1055,7 @@ def GPU_PrintfOp : GPU_Op<"printf", [MemoryEffects<[MemWrite]>]>,`
`1055`	`1055`	`imposed by one's target platform.`
`1056`	`1056`	`}];`
`1057`	`1057`	`let assemblyFormat = [{`
`1058`		- $format attr-dict ($args^ `:` type($args))?
	`1058`	+ $format attr-dict (`,` $args^ `:` type($args))?
`1059`	`1059`	`}];`
`1060`	`1060`	`}`
`1061`	`1061`
Original file line number	Diff line number	Diff line change
`@@ -633,7 +633,7 @@ gpu.module @test_module_29 {`
`633`	`633`	`// CHECK-NEXT: %[[EL1:.*]] = llvm.getelementptr %[[ALLOC]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(i32, f64)>`
`634`	`634`	`// CHECK-NEXT: llvm.store %[[EXT]], %[[EL1]] : f64, !llvm.ptr`
`635`	`635`	`// CHECK-NEXT: llvm.call @vprintf(%[[FORMATSTART]], %[[ALLOC]]) : (!llvm.ptr, !llvm.ptr) -> i32`
`636`		`- gpu.printf "Hello: %d\n" %arg0, %arg1 : i32, f32`
	`636`	`+ gpu.printf "Hello: %d\n", %arg0, %arg1 : i32, f32`
`637`	`637`	`gpu.return`
`638`	`638`	`}`
`639`	`639`	`}`
Original file line number	Diff line number	Diff line change
`@@ -36,7 +36,7 @@ gpu.module @test_module {`
`36`	`36`	`// CHECK-NEXT: %[[NARGS1:.*]] = llvm.mlir.constant(1 : i32) : i32`
`37`	`37`	`// CHECK-NEXT: %[[ARG0_64:.*]] = llvm.zext %[[ARG0]] : i32 to i64`
`38`	`38`	`// CHECK-NEXT: %{{.*}} = llvm.call @__ockl_printf_append_args(%[[DESC1]], %[[NARGS1]], %[[ARG0_64]], %[[CST0]], %[[CST0]], %[[CST0]], %[[CST0]], %[[CST0]], %[[CST0]], %[[ISLAST]]) : (i64, i32, i64, i64, i64, i64, i64, i64, i64, i32) -> i64`
`39`		`- gpu.printf "Hello: %d\n" %arg0 : i32`
	`39`	`+ gpu.printf "Hello: %d\n", %arg0 : i32`
`40`	`40`	`gpu.return`
`41`	`41`	`}`
`42`	`42`	`}`
Original file line number	Diff line number	Diff line change
`@@ -9,7 +9,7 @@ gpu.module @test_module {`
`9`	`9`	`// CHECK: %[[IMM0:.*]] = llvm.mlir.addressof @[[$PRINT_GLOBAL]] : !llvm.ptr<4>`
`10`	`10`	`// CHECK-NEXT: %[[IMM2:.*]] = llvm.getelementptr %[[IMM0]][0, 0] : (!llvm.ptr<4>) -> !llvm.ptr<4>, !llvm.array<11 x i8>`
`11`	`11`	`// CHECK-NEXT: %{{.*}} = llvm.call @printf(%[[IMM2]], %[[ARG0]]) vararg(!llvm.func<i32 (ptr<4>, ...)>) : (!llvm.ptr<4>, i32) -> i32`
`12`		`- gpu.printf "Hello: %d\n" %arg0 : i32`
	`12`	`+ gpu.printf "Hello: %d\n", %arg0 : i32`
`13`	`13`	`gpu.return`
`14`	`14`	`}`
`15`	`15`	`}`
Original file line number	Diff line number	Diff line change
`@@ -6,7 +6,7 @@ gpu.module @kernels {`
`6`	`6`	`func.func @hello(%arg0 : f32) {`
`7`	`7`	`%tid_x = gpu.thread_id x`
`8`	`8`	`%csti8 = arith.constant 2 : i8`
`9`		`- gpu.printf "Hello from %lld, %d, %f\n" %tid_x, %csti8, %arg0 : index, i8, f32`
	`9`	`+ gpu.printf "Hello from %lld, %d, %f\n", %tid_x, %csti8, %arg0 : index, i8, f32`
`10`	`10`	`return`
`11`	`11`	`}`
`12`	`12`	`// CHECK-LABEL: @hello_indirect`
Original file line number	Diff line number	Diff line change
`@@ -23,7 +23,7 @@ func.func @test_math(%arg0 : f32) {`
`23`	`23`	`threads(%6, %7, %8) in (%9 = %c2, %10 = %c1, %11 = %c1) {`
`24`	`24`	`// CHECK-NVVM: __nv_expf`
`25`	`25`	`%s1 = math.exp %arg0 : f32`
`26`		`- gpu.printf "%f" %s1 : f32`
	`26`	`+ gpu.printf "%f", %s1 : f32`
`27`	`27`	`gpu.terminator`
`28`	`28`	`}`
`29`	`29`	`return`
Original file line number	Diff line number	Diff line change
`@@ -16,10 +16,10 @@ gpu.module @kernels {`
`16`	`16`	`gpu.func @test_assert(%c0: i1, %c1: i1) kernel {`
`17`	`17`	`%0 = gpu.thread_id x`
`18`	`18`	`cf.assert %c1, "passing assertion"`
`19`		`- gpu.printf "thread %lld: print after passing assertion\n" %0 : index`
	`19`	`+ gpu.printf "thread %lld: print after passing assertion\n", %0 : index`
`20`	`20`	`// Test callsite(callsite(name)) location.`
`21`	`21`	`cf.assert %c0, "failing assertion" loc(callsite(callsite("callee_func_name"("callee_file.cc":7:9) at "caller_file.cc":10:8) at "caller2_file.cc":11:12))`
`22`		`- gpu.printf "thread %lld: print after failing assertion\n" %0 : index`
	`22`	`+ gpu.printf "thread %lld: print after failing assertion\n", %0 : index`
`23`	`23`	`gpu.return`
`24`	`24`	`}`
`25`	`25`	`}`
Original file line number	Diff line number	Diff line change
`@@ -14,7 +14,7 @@ module attributes {gpu.container_module} {`
`14`	`14`	`%0 = gpu.thread_id x`
`15`	`15`	`%csti8 = arith.constant 2 : i8`
`16`	`16`	`%cstf32 = arith.constant 3.0 : f32`
`17`		`- gpu.printf "Hello from %lld, %d, %f\n" %0, %csti8, %cstf32 : index, i8, f32`
	`17`	`+ gpu.printf "Hello from %lld, %d, %f\n", %0, %csti8, %cstf32 : index, i8, f32`
`18`	`18`	`gpu.return`
`19`	`19`	`}`
`20`	`20`	`}`
Original file line number	Diff line number	Diff line change
`@@ -96,8 +96,8 @@ func.func @main() {`
`96`	`96`	`scf.if %10 {`
`97`	`97`	`%11 = memref.load %out[%c45, %c7] : memref<64x8xf32, 3>`
`98`	`98`	`%12 = memref.load %out_1[%c7, %c0] : memref<8x128xf32, 3>`
`99`		`- gpu.printf "[GPU] TMA LOADED lhs[45][7] %f\0A" %11 : f32`
`100`		`- gpu.printf "[GPU] TMA LOADED rhs[7][0] %f\0A" %12 : f32`
	`99`	`+ gpu.printf "[GPU] TMA LOADED lhs[45][7] %f\0A", %11 : f32`
	`100`	`+ gpu.printf "[GPU] TMA LOADED rhs[7][0] %f\0A", %12 : f32`
`101`	`101`	`}`
`102`	`102`	`gpu.terminator`
`103`	`103`	`}`
Original file line number	Diff line number	Diff line change
`@@ -13,7 +13,7 @@ module attributes {gpu.container_module} {`
`13`	`13`	`gpu.module @kernels {`
`14`	`14`	`gpu.func @hello() kernel {`
`15`	`15`	`%0 = gpu.thread_id x`
`16`		`- gpu.printf "Hello from %d\n" %0 : index`
	`16`	`+ gpu.printf "Hello from %d\n", %0 : index`
`17`	`17`	`gpu.return`
`18`	`18`	`}`
`19`	`19`	`}`