
Commit c4b0b45

save a little size on "Reapply #11294 and #11295 (improve GLU test and implement using internal views to avoid copying)"
These were reverted due to internal test failures. Sending this as an exported internal diff so that we can make sure we get internal signal.

Original summary for #11294 (to make the GLU test input asymmetric): This way it will produce different results along each tested dim.

Original summary for #11295: GLU requires slicing the input Tensor into two halves. Currently, we accomplish this by copying; ExecuTorch does not support views in general because it requires Tensors to be contiguous. However, nothing stops us from implementing [the ATen approach that uses views](https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/GatedLinearUnit.cpp#L35) entirely internally to the op. To support this, I added `support_noncontiguous_tensors` as an optional template argument to BroadcastIndexesRange and plumbed it through to the elementwise_util functions as an optional SupportNonContiguousTensors parameter.

Differential Revision: [D76311585](https://our.internmc.facebook.com/intern/diff/D76311585/)

[ghstack-poisoned]
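For illustration, here is a minimal standalone sketch of the decomposition this op performs: glu(x, dim) splits the input into halves `a` and `b` along dim and computes `a * sigmoid(b)`. The sketch assumes a contiguous float buffer split along the outermost dimension, so each half is just a pointer offset into the same storage (no copy). It does not use the ExecuTorch Tensor API, and `glu_outer_dim` is a hypothetical helper name, not part of the commit.

```cpp
#include <cmath>
#include <cstddef>
#include <vector>

// Sketch only: GLU along the outermost dimension of a contiguous buffer.
// The two halves are raw-pointer "views" into x's storage; nothing is copied.
std::vector<float> glu_outer_dim(const std::vector<float>& x) {
  const std::size_t half = x.size() / 2;  // outermost dim assumed even
  const float* a = x.data();              // first half: view at offset 0
  const float* b = x.data() + half;       // second half: view at offset half
  std::vector<float> out(half);
  for (std::size_t i = 0; i < half; ++i) {
    out[i] = a[i] * (1.0f / (1.0f + std::exp(-b[i])));  // a * sigmoid(b)
  }
  return out;
}
```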


kernels/portable/cpu/op_glu.cpp

Lines changed: 46 additions & 26 deletions
```diff
@@ -24,6 +24,46 @@ using ScalarType = executorch::aten::ScalarType;
 
 namespace {
 
+struct SplitGLUInputTensor {
+  explicit SplitGLUInputTensor(const Tensor& self, int64_t dim);
+  using SizesArray =
+      std::array<executorch::aten::SizesType, kTensorDimensionLimit>;
+  SizesArray half_sizes;
+  TensorImpl first_half_impl;
+  TensorImpl second_half_impl;
+  Tensor first_half;
+  Tensor second_half;
+
+ private:
+  static SizesArray get_half_sizes(const Tensor& self, int64_t dim) {
+    SizesArray half_sizes;
+    std::copy(self.sizes().begin(), self.sizes().end(), half_sizes.begin());
+    half_sizes[dim] /= 2;
+    return half_sizes;
+  }
+};
+
+SplitGLUInputTensor::SplitGLUInputTensor(const Tensor& self, int64_t dim)
+    : half_sizes(get_half_sizes(self, dim)),
+      first_half_impl(
+          self.scalar_type(),
+          self.dim(),
+          half_sizes.data(),
+          self.mutable_data_ptr(),
+          const_cast<executorch::aten::DimOrderType*>(self.dim_order().data()),
+          const_cast<executorch::aten::StridesType*>(self.strides().data()),
+          self.shape_dynamism()),
+      second_half_impl(
+          self.scalar_type(),
+          self.dim(),
+          half_sizes.data(),
+          reinterpret_cast<char*>(self.mutable_data_ptr()) +
+              self.strides()[dim] * self.size(dim) / 2 * self.element_size(),
+          const_cast<executorch::aten::DimOrderType*>(self.dim_order().data()),
+          const_cast<executorch::aten::StridesType*>(self.strides().data()),
+          self.shape_dynamism()),
+      first_half(&first_half_impl),
+      second_half(&second_half_impl) {}
+
 /**
  * Applies the gated linear unit function
  *
@@ -39,34 +79,12 @@ Tensor& glu_out_tensor(
     const Tensor& self,
     int64_t dim,
     Tensor& out) {
-  const auto self_size = self.size(dim);
   ET_KERNEL_CHECK(
       ctx,
       self.dim() <= static_cast<ssize_t>(kTensorDimensionLimit),
       InvalidArgument,
       out);
-  std::array<executorch::aten::SizesType, kTensorDimensionLimit> half_sizes;
-  std::copy(self.sizes().begin(), self.sizes().end(), half_sizes.begin());
-  half_sizes[dim] /= 2;
-  TensorImpl first_half_impl(
-      self.scalar_type(),
-      self.dim(),
-      half_sizes.data(),
-      self.mutable_data_ptr(),
-      const_cast<executorch::aten::DimOrderType*>(self.dim_order().data()),
-      const_cast<executorch::aten::StridesType*>(self.strides().data()),
-      self.shape_dynamism());
-  TensorImpl second_half_impl(
-      self.scalar_type(),
-      self.dim(),
-      half_sizes.data(),
-      reinterpret_cast<char*>(self.mutable_data_ptr()) +
-          self.strides()[dim] * self_size / 2 * self.element_size(),
-      const_cast<executorch::aten::DimOrderType*>(self.dim_order().data()),
-      const_cast<executorch::aten::StridesType*>(self.strides().data()),
-      self.shape_dynamism());
-  Tensor first_half(&first_half_impl);
-  Tensor second_half(&second_half_impl);
+  SplitGLUInputTensor split_input(self, dim);
   ScalarType compute_type =
       executorch::runtime::isFloatingType(self.scalar_type())
           ? self.scalar_type()
@@ -79,14 +97,16 @@ Tensor& glu_out_tensor(
       op_name,
       utils::SupportedTensorDtypes::FLOATHBF16>(
       [](const auto val_a, const auto val_b) -> CTYPE_COMPUTE {
-        // TODO: rewrite this to be vectorization-capable.
+        // TODO: rewrite this to be vectorization-capable? the
+        // tensors might not be contiguous; need to have
+        // apply_bitensor_elementwise_fn check that.
         const auto one = static_cast<decltype(val_a)>(1.0);
         return val_a * (one / (one + std::exp(-val_b)));
       },
       ctx,
-      first_half,
+      split_input.first_half,
       utils::SupportedTensorDtypes::FLOATHBF16,
-      second_half,
+      split_input.second_half,
       utils::SupportedTensorDtypes::FLOATHBF16,
       out,
       utils::internal::SupportNoncontiguousTensors());
```
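To make the pointer arithmetic in `second_half_impl` concrete: the byte offset from the input's base pointer to the second half is `strides()[dim] * size(dim) / 2 * element_size()`. Below is a worked example under assumed values (a contiguous row-major 4x6 float tensor split along dim 1); the shape and the standalone program are illustrative, not part of the commit.

```cpp
#include <cassert>
#include <cstdint>

int main() {
  const int64_t sizes[2] = {4, 6};
  const int64_t strides[2] = {6, 1};  // row-major contiguous strides
  const int64_t dim = 1;
  const int64_t element_size = 4;  // sizeof(float)

  // Byte offset from the buffer start to element [0, 3], the first element
  // of the second half: strides[dim] * sizes[dim] / 2 * element_size.
  const int64_t offset = strides[dim] * sizes[dim] / 2 * element_size;
  assert(offset == 12);  // 3 elements * 4 bytes

  // Both halves keep the original strides {6, 1} but have sizes {4, 3},
  // so each half is a non-contiguous view of the original storage.
  return 0;
}
```

Because the halves keep the input's strides while their sizes are halved along `dim`, they are non-contiguous views, which is why the kernel passes `utils::internal::SupportNoncontiguousTensors()` to `apply_bitensor_elementwise_fn`.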
