@@ -653,15 +653,15 @@ func.func @async_tma_load(%tensorMap1d: !tensorMap1d, %tensorMap2d: !tensorMap2d
653
653
%c0 = arith.constant 0 : index
654
654
%crd0 = arith.constant 0 : index
655
655
%crd1 = arith.constant 0 : index
656
- // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, box[%{{.*}}]
656
+ // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}]
657
657
nvgpu.tma.async.load %tensorMap1d [%crd0 ], %mbarrier [%c0 ] to %buffer1d : !tensorMap1d , !mbarrier -> memref <128 xf32 ,3 >
658
- // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, box[%{{.*}}, %{{.*}}]
658
+ // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}, %{{.*}}]
659
659
nvgpu.tma.async.load %tensorMap2d [%crd0 , %crd1 ], %mbarrier [%c0 ] to %buffer2d : !tensorMap2d , !mbarrier -> memref <32 x32 xf32 ,3 >
660
- // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, box[%{{.*}}, %{{.*}}, %{{.*}}]
660
+ // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}, %{{.*}}, %{{.*}}]
661
661
nvgpu.tma.async.load %tensorMap3d [%crd0 , %crd1 , %crd0 ], %mbarrier [%c0 ] to %buffer3d : !tensorMap3d , !mbarrier -> memref <2 x32 x32 xf32 ,3 >
662
- // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}]
662
+ // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}]
663
663
nvgpu.tma.async.load %tensorMap4d [%crd0 , %crd1 , %crd1 , %crd0 ], %mbarrier [%c0 ] to %buffer4d : !tensorMap4d , !mbarrier -> memref <2 x2 x32 x32 xf32 ,3 >
664
- // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}]
664
+ // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}]
665
665
nvgpu.tma.async.load %tensorMap5d [%crd0 , %crd1 , %crd1 , %crd0 , %crd0 ], %mbarrier [%c0 ] to %buffer5d : !tensorMap5d , !mbarrier -> memref <2 x2 x2 x32 x32 xf32 ,3 >
666
666
func.return
667
667
}
@@ -678,15 +678,15 @@ func.func @async_tma_load_pred(%tensorMap1d: !tensorMap1d, %tensorMap2d: !tensor
678
678
%c0 = arith.constant 0 : index
679
679
%crd0 = arith.constant 0 : index
680
680
%crd1 = arith.constant 0 : index
681
- // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, box[%{{.*}}], predicate = %{{.*}}
681
+ // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}] predicate = %{{.*}}
682
682
nvgpu.tma.async.load %tensorMap1d [%crd0 ], %mbarrier [%c0 ] to %buffer1d , predicate = %p : !tensorMap1d , !mbarrier -> memref <128 xf32 ,3 >
683
- // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, box[%{{.*}}, %{{.*}}], predicate = %{{.*}}
683
+ // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}, %{{.*}}] predicate = %{{.*}}
684
684
nvgpu.tma.async.load %tensorMap2d [%crd0 , %crd1 ], %mbarrier [%c0 ] to %buffer2d , predicate = %p : !tensorMap2d , !mbarrier -> memref <32 x32 xf32 ,3 >
685
- // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, box[%{{.*}}, %{{.*}}, %{{.*}}], predicate = %{{.*}}
685
+ // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}, %{{.*}}, %{{.*}}] predicate = %{{.*}}
686
686
nvgpu.tma.async.load %tensorMap3d [%crd0 , %crd1 , %crd0 ], %mbarrier [%c0 ] to %buffer3d , predicate = %p : !tensorMap3d , !mbarrier -> memref <2 x32 x32 xf32 ,3 >
687
- // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}], predicate = %{{.*}}
687
+ // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}] predicate = %{{.*}}
688
688
nvgpu.tma.async.load %tensorMap4d [%crd0 , %crd1 , %crd1 , %crd0 ], %mbarrier [%c0 ] to %buffer4d , predicate = %p : !tensorMap4d , !mbarrier -> memref <2 x2 x32 x32 xf32 ,3 >
689
- // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}], predicate = %{{.*}}
689
+ // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}] predicate = %{{.*}}
690
690
nvgpu.tma.async.load %tensorMap5d [%crd0 , %crd1 , %crd1 , %crd0 , %crd0 ], %mbarrier [%c0 ] to %buffer5d , predicate = %p : !tensorMap5d , !mbarrier -> memref <2 x2 x2 x32 x32 xf32 ,3 >
691
691
func.return
692
692
}
@@ -737,8 +737,8 @@ module @mymodule {
737
737
nvgpu.tma.async.load %lhsTensorMap [%c0 , %c0 ], %mbarrier [%c0 ] to %lhsShmem : !lhsTensorMap , !barrierType -> !shmemlhs
738
738
// CHECK: %[[desc:.+]] = llvm.extractvalue %{{.*}}[1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
739
739
// CHECK: %[[c8192:.+]] = llvm.mlir.constant(8192 : index) : i64
740
- // CHECK: %[[shmemOfset:.+]] = llvm.getelementptr %[[desc]][%[[c8192]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f16
741
- // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %[[shmemOfset]], %{{.*}}, %{{.*}}, box[%{{.*}}, %{{.*}}] : !llvm.ptr<3>, !llvm.ptr, !llvm.ptr<3>, i32, i32
740
+ // CHECK: %[[shmemOfset:.+]] = llvm.getelementptr %[[desc]][%[[c8192]]] : (!llvm.ptr<3>, i64)
741
+ // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %[[shmemOfset]], %{{.*}}, %{{.*}}, box[%{{.*}}, %{{.*}}]
742
742
nvgpu.tma.async.load %rhsTensorMap [%c0 , %c0 ], %mbarrier [%c0 ] to %rhsShmem : !rhsTensorMap , !barrierType -> !shmemrhs
743
743
return
744
744
}
0 commit comments