@@ -120,6 +120,36 @@ def GpuToGpuOcl : Pass<"gpu-to-gpuocl", "ModuleOp"> {
120
120
"Call finish() after each kernel launch.">
121
121
];
122
122
}
123
+
124
// Declares the `gpu-tiling` pass, anchored on func::FuncOp.
// The options below describe GPU device properties; per the description they
// are the fallback values used when the DLTI attributes are not specified.
// NOTE(review): "path" in the summary may have been meant as "pass"; it is
// left unchanged because it is user-visible help text.
def GpuTilingAndFusion : Pass<"gpu-tiling", "func::FuncOp"> {
  let summary = "GPU tiling and fusion path.";
  let description = [{
    This pass tiles linalg operations and creates two nested scf.forall loops. When converting to gpu.launch,
    the inner loop is mapped to the block sizes and the outer loop to the grid sizes. The tile sizes are
    calculated from the GPU device properties, retrieved from the DLTI attributes. If the DLTI attributes are
    not specified, the pass falls back to the values of the pass options.
  }];
  // NOTE(review): the units of `cache-size` and `vector-width` are not stated
  // in the option descriptions (presumably bytes and bits) — confirm and
  // document them in the option help strings.
  let options = [
    Option<"numEus", "num-eus", "size_t",
           /*default=*/"448",
           "Number of Execution Units.">,
    Option<"numEusPerSlice", "num-eus-per-slice", "size_t",
           /*default=*/"8",
           "Number of Execution Units per slice.">,
    Option<"numThreadsPerEu", "num-threads-per-eu", "size_t",
           /*default=*/"8",
           "Number of threads per Execution Unit.">,
    Option<"cacheSize", "cache-size", "size_t",
           /*default=*/"131072",
           "Execution Unit cache size.">,
    Option<"vectorWidth", "vector-width", "size_t",
           /*default=*/"512",
           "The maximum width of EU's vector registers.">,
    Option<"workGroupSize", "work-group-size", "size_t",
           /*default=*/"64",
           "The maximum workgroup size.">
  ];
}
123
153
#endif // GC_USE_IMEX
124
154
125
155
def IterativeTilingAndFusion : Pass<"iterative-tiling-and-fusion",
0 commit comments