-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[MLIR][GPU-LLVM] Add in-pass signature update option for opencl kernels #105664
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[MLIR][GPU-LLVM] Add in-pass signature update option for opencl kernels #105664
Conversation
@victor-eds, @fabianmcg, @ftynse, @kuhar, @sommerlukas, please take a look. This avoids the changes to validation in the GPU dialect and follows a similar approach taken in the SPIRV lowering. |
Co-authored-by: Victor Perez <[email protected]>
@llvm/pr-subscribers-mlir @llvm/pr-subscribers-mlir-gpu Author: Petr Kurapov (kurapov-peter) Changes: This PR introduces a new option `force-opencl-address-spaces` to the `gpu-to-llvm-spv` conversion pass. Note: this is a draft showing an alternative solution to the one proposed in #102925. Currently, it lacks the differentiation between kernels and non-kernel functions (generic address space should be attached to the latter one's arguments). I'll add it if we decide the approach is acceptable. Edit: after some discussions, the conversion to generic address spaces seems to be unnecessary. The assumption is that if there was a function that required specific handling (e.g., should accept pointers from shared space) it is the user's responsibility to put the correct attribute on it. I'm leaving the conversion to only replace None with Global for now until (and if) we find any problems with it. Full diff: https://github.com/llvm/llvm-project/pull/105664.diff 3 Files Affected:
diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index 7bde9e490e4f4e..05f07421b8f526 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -542,6 +542,9 @@ def ConvertGpuOpsToLLVMSPVOps : Pass<"convert-gpu-to-llvm-spv", "gpu::GPUModuleO
Option<"indexBitwidth", "index-bitwidth", "unsigned",
/*default=kDeriveIndexBitwidthFromDataLayout*/"0",
"Bitwidth of the index type, 0 to use size of machine word">,
+ Option<"forceOpenclAddressSpaces", "force-opencl-address-spaces",
+ "bool", /*default=*/"false",
+ "Force kernel argument pointers to have address space global.">,
];
}
diff --git a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp
index ced4236402923a..a9c6e76b506e61 100644
--- a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp
+++ b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp
@@ -34,6 +34,8 @@
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Support/FormatVariadic.h"
+#define DEBUG_TYPE "gpu-to-llvm-spv"
+
using namespace mlir;
namespace mlir {
@@ -306,6 +308,48 @@ struct GPUShuffleConversion final : ConvertOpToLLVMPattern<gpu::ShuffleOp> {
}
};
+class MemorySpaceToOpenCLMemorySpaceConverter final : public TypeConverter {
+public:
+ MemorySpaceToOpenCLMemorySpaceConverter() {
+ addConversion([](Type t) { return t; });
+ addConversion(
+ [this](BaseMemRefType memRefType) -> std::optional<Type> {
+ std::optional<gpu::AddressSpace> addrSpace =
+ memorySpaceMap(memRefType.getMemorySpace());
+ if (!addrSpace) {
+ LLVM_DEBUG(
+ llvm::dbgs()
+ << "cannot convert " << memRefType
+ << " due to being unable to find address space in the map\n");
+ return std::nullopt;
+ }
+ auto addrSpaceAttr =
+ gpu::AddressSpaceAttr::get(memRefType.getContext(), *addrSpace);
+ if (auto rankedType = dyn_cast<MemRefType>(memRefType)) {
+ return MemRefType::get(memRefType.getShape(),
+ memRefType.getElementType(),
+ rankedType.getLayout(), addrSpaceAttr);
+ }
+ return UnrankedMemRefType::get(memRefType.getElementType(),
+ addrSpaceAttr);
+ });
+ addConversion([this](FunctionType type) {
+ auto inputs = llvm::map_to_vector(
+ type.getInputs(), [this](Type ty) { return convertType(ty); });
+ auto results = llvm::map_to_vector(
+ type.getResults(), [this](Type ty) { return convertType(ty); });
+ return FunctionType::get(type.getContext(), inputs, results);
+ });
+ }
+
+private:
+ std::optional<gpu::AddressSpace> memorySpaceMap(Attribute memSpaceAttr) {
+ if (!memSpaceAttr)
+ return gpu::AddressSpace::Global;
+ return std::nullopt;
+ }
+};
+
//===----------------------------------------------------------------------===//
// GPU To LLVM-SPV Pass.
//===----------------------------------------------------------------------===//
@@ -325,6 +369,20 @@ struct GPUToLLVMSPVConversionPass final
LLVMTypeConverter converter(context, options);
LLVMConversionTarget target(*context);
+ if (forceOpenclAddressSpaces) {
+ MemorySpaceToOpenCLMemorySpaceConverter converter;
+ AttrTypeReplacer replacer;
+ replacer.addReplacement([&converter](BaseMemRefType origType)
+ -> std::optional<BaseMemRefType> {
+ return converter.convertType<BaseMemRefType>(origType);
+ });
+
+ replacer.recursivelyReplaceElementsIn(getOperation(),
+ /*replaceAttrs=*/true,
+ /*replaceLocs=*/false,
+ /*replaceTypes=*/true);
+ }
+
target.addIllegalOp<gpu::BarrierOp, gpu::BlockDimOp, gpu::BlockIdOp,
gpu::GPUFuncOp, gpu::GlobalIdOp, gpu::GridDimOp,
gpu::ReturnOp, gpu::ShuffleOp, gpu::ThreadIdOp>();
diff --git a/mlir/test/Conversion/GPUToLLVMSPV/gpu-to-llvm-spv.mlir b/mlir/test/Conversion/GPUToLLVMSPV/gpu-to-llvm-spv.mlir
index ec4f4a304d5073..d100f36ae42521 100644
--- a/mlir/test/Conversion/GPUToLLVMSPV/gpu-to-llvm-spv.mlir
+++ b/mlir/test/Conversion/GPUToLLVMSPV/gpu-to-llvm-spv.mlir
@@ -2,6 +2,8 @@
// RUN: | FileCheck --check-prefixes=CHECK-64,CHECK %s
// RUN: mlir-opt -pass-pipeline="builtin.module(gpu.module(convert-gpu-to-llvm-spv{index-bitwidth=32}))" -split-input-file -verify-diagnostics %s \
// RUN: | FileCheck --check-prefixes=CHECK-32,CHECK %s
+// RUN: mlir-opt -pass-pipeline="builtin.module(gpu.module(convert-gpu-to-llvm-spv{force-opencl-address-spaces}))" -split-input-file -verify-diagnostics %s \
+// RUN: | FileCheck --check-prefixes=OPENCL %s
gpu.module @builtins {
// CHECK-64: llvm.func spir_funccc @_Z14get_num_groupsj(i32) -> i64 attributes {
@@ -515,3 +517,15 @@ gpu.module @kernels {
gpu.return
}
}
+
+// -----
+
+gpu.module @kernels {
+// OPENCL-LABEL: llvm.func spir_funccc @no_address_spaces(
+// OPENCL-SAME: %{{[a-zA-Z_][a-zA-Z0-9_]*}}: !llvm.ptr<1>
+// OPENCL-SAME: %{{[a-zA-Z_][a-zA-Z0-9_]*}}: !llvm.ptr<1>
+// OPENCL-SAME: %{{[a-zA-Z_][a-zA-Z0-9_]*}}: !llvm.ptr<1>
+ gpu.func @no_address_spaces(%arg0: memref<f32>, %arg1: memref<f32, #gpu.address_space<global>>, %arg2: memref<f32>) {
+ gpu.return
+ }
+}
// -----

gpu.module @kernels {
// OPENCL-LABEL: llvm.func spir_funccc @no_address_spaces(
// OPENCL-SAME: %{{[a-zA-Z_][a-zA-Z0-9_]*}}: !llvm.ptr<1>
// OPENCL-SAME: %{{[a-zA-Z_][a-zA-Z0-9_]*}}: !llvm.ptr<1>
// OPENCL-SAME: %{{[a-zA-Z_][a-zA-Z0-9_]*}}: !llvm.ptr<1>
  gpu.func @no_address_spaces(%arg0: memref<f32>, %arg1: memref<f32, #gpu.address_space<global>>, %arg2: memref<f32>) {
    gpu.return
  }
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you add additional and more complex tests?
For example, adding a device function inside the module and calling it from the kernel. Loading one of the arguments, etc...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I added an example. I had to add func to llvm and finalize memref to llvm into the pass so that the signature and loads/stores are lowered correctly.
The function should have a spir_func attribute though. I think it'd be easiest to just go through all of them after func to llvm and attach it to those that don't have it. Kernels will have those set correctly, so we can always assume it's not a kernel.
if (forceOpenclAddressSpaces) {
  MemorySpaceToOpenCLMemorySpaceConverter converter;
  AttrTypeReplacer replacer;
  replacer.addReplacement([&converter](BaseMemRefType origType)
                              -> std::optional<BaseMemRefType> {
    return converter.convertType<BaseMemRefType>(origType);
  });

  replacer.recursivelyReplaceElementsIn(getOperation(),
                                        /*replaceAttrs=*/true,
                                        /*replaceLocs=*/false,
                                        /*replaceTypes=*/true);
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This looks like an overkill, isn't there another way?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I tried a natural way - putting additional conversion to the llvm converter but it didn't work out. First, it expects a legal value as the output, so I can't just add a memref. I tried to run converter.convertType(<newMemRefTypeWithAttribute>)
for this. It does the job for memrefs in the function body but not for its signature. I could not access the signature type by adding a conversion (the type never reaches the hook). And even though the internal signature conversion calls type converter recursively the pattern is never called. I think it should work since I'm adding the conversion after all the default ones, so it should be called first. Yet, that doesn't happen somehow.
So instead, I added this. It doesn't have to think about legality and all since it's not an llvm converter.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I took a look into this once more. It seems like if I were to reuse LLVMTypeConverter
I'd need to rewrite the convertFunctionSignature
to specialize to the case. That drags out some implementation code as well as changes to the converter. This looks worse to me. If you have any better idea please let me know :).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
True, conversion to LLVM assumes memspace None
maps to the default address space in several places. It'd be complex to change that.
I've added the missing calling convention for the non-kernel functions. Any other comments? |
Co-authored-by: Jakub Kuderski <[email protected]>
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Great addition. IMO we can make this the default behavior and drop the flag as the generated code would be illegal otherwise.
In case this is not made default behavior, can we extend tests to check output with flag unset?
@victor-eds, how about this: a1666d6 removes the external patterns and avoids the extra conversion step. Here's how it looks in an example. Input: gpu.module @kernels {
gpu.func @no_address_spaces_complex(%arg0: memref<2x2xf32>, %arg1: memref<4xf32>) kernel {
func.call @no_address_spaces_callee(%arg0, %arg1) : (memref<2x2xf32>, memref<4xf32>) -> ()
gpu.return
}
func.func @no_address_spaces_callee(%arg0: memref<2x2xf32>, %arg1: memref<4xf32>) {
%block_id = gpu.block_id x
%0 = memref.load %arg0[%block_id, %block_id] : memref<2x2xf32>
memref.store %0, %arg1[%block_id] : memref<4xf32>
func.return
}
} After running the pass we get: module {
gpu.module @kernels {
llvm.func spir_funccc @_Z12get_group_idj(i32) -> i64 attributes {memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>, no_unwind, will_return}
llvm.func spir_kernelcc @no_address_spaces_complex(%arg0: !llvm.ptr<1>, %arg1: !llvm.ptr<1>, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: !llvm.ptr<1>, %arg8: !llvm.ptr<1>, %arg9: i64, %arg10: i64, %arg11: i64) attributes {gpu.kernel} {
%0 = llvm.mlir.undef : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)>
%1 = llvm.insertvalue %arg7, %0[0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)>
%2 = llvm.insertvalue %arg8, %1[1] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)>
%3 = llvm.insertvalue %arg9, %2[2] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)>
%4 = llvm.insertvalue %arg10, %3[3, 0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)>
%5 = llvm.insertvalue %arg11, %4[4, 0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)>
%6 = builtin.unrealized_conversion_cast %5 : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)> to memref<4xf32, 1>
%7 = builtin.unrealized_conversion_cast %6 : memref<4xf32, 1> to !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)>
%8 = llvm.mlir.undef : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
%9 = llvm.insertvalue %arg0, %8[0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
%10 = llvm.insertvalue %arg1, %9[1] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
%11 = llvm.insertvalue %arg2, %10[2] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
%12 = llvm.insertvalue %arg3, %11[3, 0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
%13 = llvm.insertvalue %arg5, %12[4, 0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
%14 = llvm.insertvalue %arg4, %13[3, 1] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
%15 = llvm.insertvalue %arg6, %14[4, 1] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
%16 = builtin.unrealized_conversion_cast %15 : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)> to memref<2x2xf32, 1>
%17 = builtin.unrealized_conversion_cast %16 : memref<2x2xf32, 1> to !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
func.call @no_address_spaces_callee(%16, %6) : (memref<2x2xf32, 1>, memref<4xf32, 1>) -> ()
llvm.return
}
func.func @no_address_spaces_callee(%arg0: memref<2x2xf32, 1>, %arg1: memref<4xf32, 1>) {
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.call spir_funccc @_Z12get_group_idj(%0) {memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>, no_unwind, will_return} : (i32) -> i64
%2 = builtin.unrealized_conversion_cast %1 : i64 to index
%3 = memref.load %arg0[%2, %2] : memref<2x2xf32, 1>
memref.store %3, %arg1[%2] : memref<4xf32, 1>
return
}
}
} So, there're no gpu ops left and the address spaces are correct, although described as plain integer attributes. If I'm to lower that further to llvm I get all the unrealized casts resolved ( module {
gpu.module @kernels {
llvm.func spir_funccc @_Z12get_group_idj(i32) -> i64 attributes {memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>, no_unwind, will_return}
llvm.func spir_kernelcc @no_address_spaces_complex(%arg0: !llvm.ptr<1>, %arg1: !llvm.ptr<1>, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: !llvm.ptr<1>, %arg8: !llvm.ptr<1>, %arg9: i64, %arg10: i64, %arg11: i64) attributes {gpu.kernel} {
%0 = llvm.mlir.undef : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)>
%1 = llvm.insertvalue %arg7, %0[0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)>
%2 = llvm.insertvalue %arg8, %1[1] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)>
%3 = llvm.insertvalue %arg9, %2[2] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)>
%4 = llvm.insertvalue %arg10, %3[3, 0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)>
%5 = llvm.insertvalue %arg11, %4[4, 0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)>
%6 = llvm.mlir.undef : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
%7 = llvm.insertvalue %arg0, %6[0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
%8 = llvm.insertvalue %arg1, %7[1] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
%9 = llvm.insertvalue %arg2, %8[2] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
%10 = llvm.insertvalue %arg3, %9[3, 0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
%11 = llvm.insertvalue %arg5, %10[4, 0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
%12 = llvm.insertvalue %arg4, %11[3, 1] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
%13 = llvm.insertvalue %arg6, %12[4, 1] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
%14 = llvm.extractvalue %13[0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
%15 = llvm.extractvalue %13[1] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
%16 = llvm.extractvalue %13[2] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
%17 = llvm.extractvalue %13[3, 0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
%18 = llvm.extractvalue %13[3, 1] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
%19 = llvm.extractvalue %13[4, 0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
%20 = llvm.extractvalue %13[4, 1] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
%21 = llvm.extractvalue %5[0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)>
%22 = llvm.extractvalue %5[1] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)>
%23 = llvm.extractvalue %5[2] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)>
%24 = llvm.extractvalue %5[3, 0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)>
%25 = llvm.extractvalue %5[4, 0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)>
llvm.call @no_address_spaces_callee(%14, %15, %16, %17, %18, %19, %20, %21, %22, %23, %24, %25) : (!llvm.ptr<1>, !llvm.ptr<1>, i64, i64, i64, i64, i64, !llvm.ptr<1>, !llvm.ptr<1>, i64, i64, i64) -> ()
llvm.return
}
llvm.func @no_address_spaces_callee(%arg0: !llvm.ptr<1>, %arg1: !llvm.ptr<1>, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: !llvm.ptr<1>, %arg8: !llvm.ptr<1>, %arg9: i64, %arg10: i64, %arg11: i64) {
%0 = llvm.mlir.undef : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)>
%1 = llvm.insertvalue %arg7, %0[0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)>
%2 = llvm.insertvalue %arg8, %1[1] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)>
%3 = llvm.insertvalue %arg9, %2[2] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)>
%4 = llvm.insertvalue %arg10, %3[3, 0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)>
%5 = llvm.insertvalue %arg11, %4[4, 0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)>
%6 = llvm.mlir.undef : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
%7 = llvm.insertvalue %arg0, %6[0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
%8 = llvm.insertvalue %arg1, %7[1] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
%9 = llvm.insertvalue %arg2, %8[2] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
%10 = llvm.insertvalue %arg3, %9[3, 0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
%11 = llvm.insertvalue %arg5, %10[4, 0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
%12 = llvm.insertvalue %arg4, %11[3, 1] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
%13 = llvm.insertvalue %arg6, %12[4, 1] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
%14 = llvm.mlir.constant(0 : i32) : i32
%15 = llvm.call spir_funccc @_Z12get_group_idj(%14) {memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>, no_unwind, will_return} : (i32) -> i64
%16 = llvm.extractvalue %13[1] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
%17 = llvm.mlir.constant(2 : index) : i64
%18 = llvm.mul %15, %17 : i64
%19 = llvm.add %18, %15 : i64
%20 = llvm.getelementptr %16[%19] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f32
%21 = llvm.load %20 : !llvm.ptr<1> -> f32
%22 = llvm.extractvalue %5[1] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)>
%23 = llvm.getelementptr %22[%15] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f32
llvm.store %21, %23 : f32, !llvm.ptr<1>
llvm.return
}
}
} So now we can use default passes and not require any additional converters. And if I add canonicalization we get a nice simple kernel: module {
gpu.module @kernels {
llvm.func spir_funccc @_Z12get_group_idj(i32) -> i64 attributes {memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>, no_unwind, will_return}
llvm.func spir_kernelcc @no_address_spaces_complex(%arg0: !llvm.ptr<1>, %arg1: !llvm.ptr<1>, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: !llvm.ptr<1>, %arg8: !llvm.ptr<1>, %arg9: i64, %arg10: i64, %arg11: i64) attributes {gpu.kernel} {
llvm.call @no_address_spaces_callee(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11) : (!llvm.ptr<1>, !llvm.ptr<1>, i64, i64, i64, i64, i64, !llvm.ptr<1>, !llvm.ptr<1>, i64, i64, i64) -> ()
llvm.return
}
llvm.func @no_address_spaces_callee(%arg0: !llvm.ptr<1>, %arg1: !llvm.ptr<1>, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: !llvm.ptr<1>, %arg8: !llvm.ptr<1>, %arg9: i64, %arg10: i64, %arg11: i64) {
%0 = llvm.mlir.constant(2 : index) : i64
%1 = llvm.mlir.constant(0 : i32) : i32
%2 = llvm.call spir_funccc @_Z12get_group_idj(%1) {memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>, no_unwind, will_return} : (i32) -> i64
%3 = llvm.mul %2, %0 : i64
%4 = llvm.add %3, %2 : i64
%5 = llvm.getelementptr %arg1[%4] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f32
%6 = llvm.load %5 : !llvm.ptr<1> -> f32
%7 = llvm.getelementptr %arg8[%2] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f32
llvm.store %6, %7 : f32, !llvm.ptr<1>
llvm.return
}
}
} |
Hey! IMO this helps in the particular case you're posting, but doesn't in a more general case, e.g., if we had |
ping |
…m#105664) Default to Global address space for memrefs that do not have an explicit address space set in the IR. --------- Co-authored-by: Victor Perez <[email protected]> Co-authored-by: Jakub Kuderski <[email protected]> Co-authored-by: Victor Perez <[email protected]>
This PR introduces a new option `force-opencl-address-spaces` to the `gpu-to-llvm-spv` conversion pass. It addresses a problem with signature incompatibility between `gpu.launch_func` and the kernel arguments when using unified shared memory with an OpenCL runtime. The discrepancy comes from the fact that the global address space is defined as `1` in the OpenCL spec. Note: this is a draft showing an alternative solution to the one proposed in #102925. Currently, it lacks the differentiation between kernels and non-kernel functions (generic address space should be attached to the latter one's arguments). I'll add it if we decide the approach is acceptable. Edit: after some discussions, the conversion to generic address spaces seems to be unnecessary. The assumption is that if there was a function that required specific handling (e.g., should accept pointers from shared space) it is the user's responsibility to put the correct attribute on it. I'm leaving the conversion to only replace None with Global for now until (and if) we find any problems with it.