Commit 443ff0e

Apply changes from code review
1 parent 501d434 commit 443ff0e

6 files changed: +180 −159 lines changed


lib/gc/Transforms/GPU/AddContextArg.cpp

Lines changed: 13 additions & 4 deletions
@@ -20,6 +20,10 @@ namespace {
 struct AddContextArg final : gc::impl::AddContextArgBase<AddContextArg> {
   void runOnOperation() override {
     auto func = getOperation();
+    if (func.isExternal()) {
+      return;
+    }
+
     auto funcType = func.getFunctionType();
     auto argTypes = llvm::to_vector<8>(funcType.getInputs());
     auto resultTypes = llvm::to_vector<1>(funcType.getResults());
@@ -28,14 +32,19 @@ struct AddContextArg final : gc::impl::AddContextArgBase<AddContextArg> {
     argTypes.emplace_back(newArgType);
     auto newFuncType = FunctionType::get(ctx, argTypes, resultTypes);
     func.setType(newFuncType);
-
-    if (func.getBody().hasOneBlock()) {
-      func.getBody().front().addArgument(newArgType, func.getLoc());
-    }
+    func.getBody().front().addArgument(newArgType, func.getLoc());
 
     // Find all function calls and append the last argument of the current
     // function to the call.
+    auto module = func->getParentOfType<ModuleOp>();
     func.walk([&](func::CallOp call) {
+      // If the called function is defined in the current module, the context
+      // argument will be added to its signature as well and, thus, we need to
+      // add the context argument to the call too.
+      if (auto callee = module.lookupSymbol<func::FuncOp>(call.getCallee());
+          !callee || callee.isExternal()) {
+        return;
+      }
       auto args = llvm::to_vector<8>(call.getOperands());
       args.emplace_back(func.getArgument(func.getNumArguments() - 1));
       call->setOperands(args);
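
Taken together, the pass now skips external declarations entirely and only rewrites calls whose callee is defined in the current module. A minimal before/after sketch of the intended behavior (the function names and the !llvm.ptr context type are illustrative assumptions; the actual argument type is whatever newArgType is in the pass):

// Hypothetical input: one external declaration and one local function.
module {
  func.func private @extern_fn(memref<64x64xf32>)

  func.func @entry(%arg0: memref<64x64xf32>) {
    // Call to an external function: operands stay as they are.
    func.call @extern_fn(%arg0) : (memref<64x64xf32>) -> ()
    return
  }
}

// Expected result: @entry gains a trailing context argument, e.g.
//   func.func @entry(%arg0: memref<64x64xf32>, %ctx: !llvm.ptr) { ... }
// while @extern_fn and the call to it are left untouched. A call to a function
// defined in this module would instead get %ctx appended to its operands.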

lib/gc/Transforms/GPU/GpuToGpuOcl.cpp

Lines changed: 53 additions & 66 deletions
@@ -69,6 +69,30 @@ struct Helper final {
                      rewriter.getIntegerAttr(idxType, static_cast<int64_t>(value)));
   }
 
+  Value calculateStaticSize(OpBuilder &rewriter, const Location loc,
+                            const MemRefType type) const {
+    if (type.getRank() == 0) {
+      return idxConstant(rewriter, loc, 0);
+    }
+
+    auto elementType = type.getElementType();
+    if (!elementType.isIntOrIndexOrFloat()) {
+      return nullptr;
+    }
+
+    int64_t numElements = 1;
+    for (auto dim : type.getShape()) {
+      if (dim == ShapedType::kDynamic) {
+        return nullptr;
+      }
+      numElements = numElements * dim;
+    }
+    auto elementSize = elementType.isIndex()
+                           ? idxType.getIntOrFloatBitWidth()
+                           : elementType.getIntOrFloatBitWidth();
+    return idxConstant(rewriter, loc, elementSize * numElements / 8);
+  }
+
   void destroyKernels(OpBuilder &rewriter, Location loc,
                       ArrayRef<Value> kernelPtrs) const {
     auto size = idxConstant(rewriter, loc, kernelPtrs.size());
@@ -102,82 +126,44 @@ struct ConvertAlloc final : ConvertOpPattern<gpu::AllocOp> {
                   ConversionPatternRewriter &rewriter) const override {
     auto loc = allocOp.getLoc();
     MemRefType type = allocOp.getType();
-    auto shape = type.getShape();
-    auto dynamics = adaptor.getDynamicSizes();
 
-    if (shape.empty() || dynamics.empty()) {
-      int64_t staticSize;
-      if (shape.empty()) {
-        staticSize = 0;
-      } else {
-        staticSize = type.getElementType().getIntOrFloatBitWidth() / 8;
-        for (auto dim : shape) {
-          assert(dim != ShapedType::kDynamic);
-          staticSize *= dim;
-        }
-      }
-      auto size = helper.idxConstant(rewriter, loc, staticSize);
+    if (auto staticSize = helper.calculateStaticSize(rewriter, loc, type)) {
       auto ptr = funcCall(rewriter, GPU_OCL_MALLOC, helper.ptrType,
                           {helper.ptrType, helper.idxType}, loc,
-                          {getCtxPtr(rewriter), size})
+                          {getCtxPtr(rewriter), staticSize})
                      .getResult();
       Value replacement = MemRefDescriptor::fromStaticShape(
           rewriter, loc, helper.converter, type, ptr, ptr);
       rewriter.replaceOp(allocOp, replacement);
       return success();
     }
 
-    auto ndims = shape.size();
-    SmallVector<Value> newShape;
-    SmallVector<Value> newStrides(ndims);
-    auto staticSize = type.getElementType().getIntOrFloatBitWidth() / 8;
-    auto size = dynamics[0];
-
-    auto idxMul = [&](Value x, Value y) -> Value {
-      if (auto xConst = getConstantIntValue(x)) {
-        if (auto yConst = getConstantIntValue(y)) {
-          return helper.idxConstant(rewriter, loc,
-                                    xConst.value() * yConst.value());
-        }
-      }
-      return rewriter.create<LLVM::MulOp>(loc, x, y);
-    };
-
-    for (size_t i = 0, j = 0; i < ndims; i++) {
-      auto dim = shape[i];
-      if (dim == ShapedType::kDynamic) {
-        auto dynSize = dynamics[j++];
-        newShape.emplace_back(dynSize);
-        if (j != 1) {
-          size = idxMul(size, dynSize);
-        }
-      } else {
-        staticSize *= dim;
-        newShape.emplace_back(helper.idxConstant(rewriter, loc, dim));
-      }
+    auto dstType = helper.converter.convertType(type);
+    if (!dstType) {
+      allocOp.emitError() << "Failed to convert the MemRefType";
+      return failure();
     }
 
-    size = idxMul(size, helper.idxConstant(rewriter, loc, staticSize));
+    SmallVector<Value> shape;
+    SmallVector<Value> strides;
+    Value size;
+    getMemRefDescriptorSizes(loc, type, adaptor.getDynamicSizes(), rewriter,
+                             shape, strides, size);
+    assert(shape.size() == strides.size());
+
     auto ptr = funcCall(rewriter, GPU_OCL_MALLOC, helper.ptrType,
                         {helper.ptrType, helper.idxType}, loc,
                         {getCtxPtr(rewriter), size})
                    .getResult();
 
-    newStrides[ndims - 1] = helper.idxConstant(rewriter, loc, 1);
-    for (int i = static_cast<int>(ndims) - 2; i >= 0; i--) {
-      newStrides[i] = idxMul(newStrides[i + 1], newShape[i]);
-      ;
-    }
-
-    auto dsc = MemRefDescriptor::undef(rewriter, loc,
-                                       helper.converter.convertType(type));
+    auto dsc = MemRefDescriptor::undef(rewriter, loc, dstType);
     dsc.setAllocatedPtr(rewriter, loc, ptr);
     dsc.setAlignedPtr(rewriter, loc, ptr);
     dsc.setOffset(rewriter, loc, helper.idxConstant(rewriter, loc, 0));
 
-    for (unsigned i = 0, n = static_cast<unsigned>(ndims); i < n; i++) {
-      dsc.setSize(rewriter, loc, i, newShape[i]);
-      dsc.setStride(rewriter, loc, i, newStrides[i]);
+    for (unsigned i = 0, n = static_cast<unsigned>(shape.size()); i < n; i++) {
+      dsc.setSize(rewriter, loc, i, shape[i]);
      dsc.setStride(rewriter, loc, i, strides[i]);
     }
 
     rewriter.replaceOp(allocOp, static_cast<Value>(dsc));
@@ -209,23 +195,24 @@ struct ConvertMemcpy final : ConvertOpPattern<gpu::MemcpyOp> {
   matchAndRewrite(gpu::MemcpyOp gpuMemcpy, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     auto loc = gpuMemcpy.getLoc();
+    MemRefDescriptor srcDsc(adaptor.getSrc());
+    MemRefDescriptor dstDsc(adaptor.getDst());
     auto srcType = gpuMemcpy.getSrc().getType();
-    auto elementSize = srcType.getElementType().getIntOrFloatBitWidth() / 8;
-    uint64_t numElements = 0;
-    for (auto dim : srcType.getShape()) {
-      if (dim == ShapedType::kDynamic) {
-        gpuMemcpy.emitOpError()
-            << "dynamic shapes are not currently not supported";
-        return failure();
+    Value size = helper.calculateStaticSize(rewriter, loc, srcType);
+
+    if (!size) {
+      auto numElements = helper.idxConstant(rewriter, loc, 1);
+      for (unsigned i = 0, n = srcType.getRank(); i < n; i++) {
+        numElements = rewriter.create<LLVM::MulOp>(
+            loc, numElements, srcDsc.size(rewriter, loc, i));
       }
-      numElements = numElements ? numElements * dim : dim;
+      size = rewriter.create<mlir::LLVM::MulOp>(
+          loc, numElements,
+          getSizeInBytes(loc, srcType.getElementType(), rewriter));
     }
 
-    MemRefDescriptor srcDsc(adaptor.getSrc());
-    MemRefDescriptor dstDsc(adaptor.getDst());
     auto srcPtr = srcDsc.alignedPtr(rewriter, loc);
     auto dstPtr = dstDsc.alignedPtr(rewriter, loc);
-    auto size = helper.idxConstant(rewriter, loc, elementSize * numElements);
     auto oclMemcpy = funcCall(
         rewriter, GPU_OCL_MEMCPY, helper.voidType,
         {helper.ptrType, helper.ptrType, helper.ptrType, helper.idxType}, loc,
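
For a sense of what the new calculateStaticSize helper produces: a statically shaped memref<64x64xf32> has 64 * 64 = 4096 elements of 32 bits, so the byte size folds to 4096 * 32 / 8 = 16384, which is exactly the 16384 : i64 constant checked in the new lit test below. A minimal sketch of the lowered IR around the malloc call (a fragment; %ctx as the context pointer introduced by AddContextArg is an assumption here):

// memref<64x64xf32>: the byte size folds to a compile-time constant.
%size = llvm.mlir.constant(16384 : i64) : i64
%ptr = llvm.call @gcGpuOclMalloc(%ctx, %size) : (!llvm.ptr, i64) -> !llvm.ptr

When the helper returns null (dynamic dimensions or a non-scalar element type), ConvertAlloc falls back to getMemRefDescriptorSizes and ConvertMemcpy multiplies the descriptor's per-dimension sizes by getSizeInBytes, so the byte count is computed at runtime instead of rejecting dynamic shapes.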

lib/gc/Transforms/GPU/Pipeline.cpp

Lines changed: 15 additions & 25 deletions
@@ -6,45 +6,35 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include <string>
+
+#include "gc/Transforms/Passes.h"
+
+#include "imex/Conversion/Passes.h"
+#include "imex/Transforms/Passes.h"
+
 #include "mlir/Conversion/Passes.h"
-#include "mlir/Dialect/Arith/Transforms/Passes.h"
 #include "mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h"
 #include "mlir/Dialect/Bufferization/Transforms/Passes.h"
-#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/GPU/Transforms/Passes.h"
 #include "mlir/Dialect/LLVMIR/Transforms/Passes.h"
 #include "mlir/Dialect/Linalg/Passes.h"
-#include "mlir/Dialect/Math/Transforms/Passes.h"
-#include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/MemRef/Transforms/Passes.h"
-#include "mlir/Dialect/SCF/IR/SCF.h"
-#include "mlir/Dialect/Tensor/IR/Tensor.h"
-#include "mlir/IR/DialectRegistry.h"
+#include "mlir/Dialect/SPIRV/Transforms/Passes.h"
 #include "mlir/InitAllPasses.h"
 #include "mlir/Pass/PassManager.h"
-#include "mlir/Support/LogicalResult.h"
 #include "mlir/Transforms/Passes.h"
-#include <iostream>
-
-#include "mlir/Dialect/GPU/IR/GPUDialect.h"
-#include "mlir/Dialect/GPU/Transforms/Passes.h"
-#include "mlir/Dialect/SPIRV/Transforms/Passes.h"
-
-#include <imex/Conversion/Passes.h>
-#include <imex/Transforms/Passes.h>
-
-#include <string>
-
-#include "gc/Transforms/Passes.h"
 
 namespace mlir::gc {
 
 struct GPUPipelineOption : PassPipelineOptions<GPUPipelineOption> {
-  PassOptions::Option<bool> isUsmArgs{
+  Option<bool> isUsmArgs{
       *this, "is-usm-args",
-      llvm::cl::desc("Whether to use USM(unified shared memory) func args, in "
-                     "which the host and device could access the same buffer "
-                     "and there is no need to add memcpy explicitly"),
-      llvm::cl::init(true)};
+      desc("Whether to use USM(unified shared memory) func args, in "
+           "which the host and device could access the same buffer "
+           "and there is no need to add memcpy explicitly"),
+      init(true)};
 };
 
 void populateGPUPipeline(OpPassManager &pm,
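
The option block above only drops the PassOptions:: and llvm::cl:: qualifiers; it is functionally unchanged. As a usage sketch, a PassPipelineOptions option like this is set through the textual pipeline spec when the pipeline is invoked; the registration name gc-gpu-pipeline below is an assumption for illustration only, and the option name is the one declared in this commit:

// Hypothetical invocation (pipeline name assumed):
// RUN: gc-opt %s --pass-pipeline='builtin.module(gc-gpu-pipeline{is-usm-args=false})'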

lib/gc/Transforms/IterativeTilingAndFusion.cpp

Lines changed: 1 addition & 1 deletion
@@ -680,7 +680,7 @@ defaultTilingOfType(RewriterBase &rewriter, Operation *op,
   } else {
     defaultTileSize.resize(iteratorTypes.size(), rewriter.getIndexAttr(0));
     // Try tileSize from `32` to `16`.
-    SmallVector<int64_t> tsOrder = {16, 32};
+    SmallVector<int64_t> tsOrder = {32, 16};
     // Record how many dims have been tiled, including fully tiled, i.e.
     // tileSize == dimSize.
     unsigned nonOneTileDims =
Lines changed: 98 additions & 0 deletions
@@ -0,0 +1,98 @@
+// RUN: gc-opt %s --gpu-to-gpuocl | FileCheck %s
+
+module @test attributes {gpu.container_module} {
+  llvm.func @entry(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: !llvm.ptr, %arg8: !llvm.ptr, %arg9: i64) attributes {llvm.emit_c_interface} {
+    %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %5 = llvm.insertvalue %arg4, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %6 = llvm.insertvalue %arg5, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %7 = llvm.insertvalue %arg6, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %8 = builtin.unrealized_conversion_cast %7 : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> to memref<64x64xf32>
+    %gpu_mem = gpu.alloc host_shared () : memref<64x64xf32>
+    gpu.memcpy %gpu_mem, %8 : memref<64x64xf32>, memref<64x64xf32>
+    %9 = llvm.mlir.constant(32 : index) : i64
+    %10 = builtin.unrealized_conversion_cast %9 : i64 to index
+    %11 = llvm.mlir.constant(2 : index) : i64
+    %12 = builtin.unrealized_conversion_cast %11 : i64 to index
+    %13 = llvm.mlir.constant(1 : index) : i64
+    %14 = builtin.unrealized_conversion_cast %13 : i64 to index
+    gpu.launch_func @entry_kernel::@entry_kernel blocks in (%12, %12, %14) threads in (%14, %14, %14) args(%10 : index, %gpu_mem : memref<64x64xf32>)
+    gpu.memcpy %8, %gpu_mem : memref<64x64xf32>, memref<64x64xf32>
+    gpu.dealloc %gpu_mem : memref<64x64xf32>
+    llvm.return
+  }
+
+  gpu.module @entry_kernel attributes {gpu.binary = "Some SPIRV here \00"} {
+    gpu.func @entry_kernel(%arg0: index, %arg1: memref<64x64xf32>) kernel attributes {} {
+      gpu.return
+    }
+  }
+}
+
+// CHECK: llvm.mlir.global internal constant @gcGpuOclKernel_entry_kernel_SPIRV
+// CHECK: llvm.mlir.global internal constant @gcGpuOclKernel_entry_kernel_Name
+// CHECK: llvm.mlir.global internal @gcGpuOclKernel_entry_kernel_Ptr
+
+// CHECK: llvm.func internal @createGcGpuOclKernel_entry_kernel([[CTX:%.+]]: !llvm.ptr) -> !llvm.ptr
+// CHECK: [[NEW_PTR:%.+]] = llvm.call @gcGpuOclKernelCreate([[CTX]]
+// CHECK: [[ZERO:%.+]] = llvm.mlir.zero
+// CHECK: [[PTR_ADDR:%.+]] = llvm.mlir.addressof @gcGpuOclKernel_entry_kernel_Ptr
+// CHECK: [[CMPXCHG:%.+]] = llvm.cmpxchg [[PTR_ADDR]], [[ZERO]], [[NEW_PTR]]
+// CHECK: [[FLAG:%.+]] = llvm.extractvalue [[CMPXCHG]][1]
+// CHECK: llvm.cond_br [[FLAG]], [[BB1:\^.+]], [[BB2:\^.+]]
+// CHECK: [[BB1]]:
+// CHECK: llvm.return [[NEW_PTR]]
+// CHECK: [[BB2]]:
+// CHECK: [[ONE:%.+]] = llvm.mlir.constant(1 : i64) : i64
+// CHECK: [[ARRAY:%.+]] = llvm.alloca [[ONE]]
+// CHECK: [[ADDR:%.+]] = llvm.getelementptr [[ARRAY]]
+// CHECK: llvm.store [[NEW_PTR]], [[ADDR]]
+// CHECK: llvm.call @gcGpuOclKernelDestroy([[ONE]], [[ARRAY]])
+// CHECK: [[OLD_PTR:%.+]] = llvm.extractvalue [[CMPXCHG]][0]
+// CHECK: llvm.return [[OLD_PTR]]
+
+// CHECK: llvm.func internal @getGcGpuOclKernel_entry_kernel([[CTX:%.+]]: !llvm.ptr) -> !llvm.ptr attributes {always_inline}
+// CHECK: [[ZERO:%.+]] = llvm.mlir.zero
+// CHECK: [[PTR_ADDR:%.+]] = llvm.mlir.addressof @gcGpuOclKernel_entry_kernel_Ptr
+// CHECK: [[PTR:%.+]] = llvm.load [[PTR_ADDR]]
+// CHECK: [[ICMP:%.+]] = llvm.icmp "eq" [[PTR]], [[ZERO]]
+// CHECK: llvm.cond_br [[ICMP]], [[BB1:\^.+]], [[BB2:\^.+]]
+// CHECK: [[BB1]]:
+// CHECK: [[NEW_PTR:%.+]] = llvm.call @createGcGpuOclKernel_entry_kernel([[CTX]])
+// CHECK: llvm.return [[NEW_PTR]]
+// CHECK: [[BB2]]:
+// CHECK: llvm.return [[PTR]]
+
+// CHECK: llvm.func @entry(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, [[CTX:%.+]]: !llvm.ptr, %arg8: !llvm.ptr, %arg9: i64)
+// CHECK: [[SIZE:%.+]] = llvm.mlir.constant(16384 : i64) : i64
+// CHECK: llvm.call @gcGpuOclMalloc([[CTX]], [[SIZE]])
+// CHECK: [[SIZE:%.+]] = llvm.mlir.constant(16384 : i64) : i64
+// CHECK: [[SRC:%.+]] = llvm.extractvalue
+// CHECK: [[DST:%.+]] = llvm.extractvalue [[GPU_MEMREF:%.+]][1]
+// CHECK: llvm.call @gcGpuOclMemcpy([[CTX]], [[SRC]], [[DST]], [[SIZE]])
+// CHECK: [[KERNEL:%.+]] = llvm.call @getGcGpuOclKernel_entry_kernel([[CTX:%.+]]) : (!llvm.ptr) -> !llvm.ptr
+// CHECK: llvm.call @gcGpuOclKernelLaunch([[CTX]], [[KERNEL]],
+// CHECK: [[SIZE:%.+]] = llvm.mlir.constant(16384 : i64) : i64
+// CHECK: [[SRC:%.+]] = llvm.extractvalue [[GPU_MEMREF:%.+]][1]
+// CHECK: [[DST:%.+]] = llvm.extractvalue
+// CHECK: llvm.call @gcGpuOclMemcpy([[CTX]], [[SRC]], [[DST]], [[SIZE]])
+// CHECK: [[GPU_PTR:%.+]] = llvm.extractvalue [[GPU_MEMREF:%.+]][0]
+// CHECK: llvm.call @gcGpuOclDealloc([[CTX]], [[GPU_PTR]])
+
+// CHECK: llvm.func @gcGpuOclKernelCreate
+// CHECK: llvm.func @gcGpuOclKernelDestroy
+// CHECK: llvm.func @gcGpuOclKernelLaunch
+
+
+// CHECK: llvm.func @gcGpuOclModuleDestructor()
+// CHECK: llvm.fence acquire
+// CHECK: [[PTR_ADDR:%.+]] = llvm.mlir.addressof @gcGpuOclKernel_entry_kernel_Ptr
+// CHECK: [[PTR:%.+]] = llvm.load [[PTR_ADDR]]
+// CHECK: [[ONE:%.+]] = llvm.mlir.constant(1 : i64) : i64
+// CHECK: [[ARRAY:%.+]] = llvm.alloca [[ONE]]
+// CHECK: [[ADDR:%.+]] = llvm.getelementptr [[ARRAY]]
+// CHECK: llvm.store [[PTR]], [[ADDR]]
+// CHECK: llvm.call @gcGpuOclKernelDestroy([[ONE]], [[ARRAY]])
