Skip to content

Commit 04a900f

Browse files
committed
Update base for Update on "[ExecuTorch][Llama] Decouple input sequence length from kv cache context length"
Decouple max sequence length, for shape dynamism in torch.export, from sequence length used for kv cache sizing. Differential Revision: [D68448334](https://our.internmc.facebook.com/intern/diff/D68448334/) cc mergennachin cccclai helunwencser dvorjackz [ghstack-poisoned]
2 parents 67bd5d9 + bdd3d9c commit 04a900f

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

50 files changed

+813
-249
lines changed

backends/arm/operator_support/to_copy_support.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,7 @@ def is_node_supported(self, node: fx.Node, tosa_spec: TosaSpecification) -> bool
125125
# Check dim_order (to_dim_order_copy)
126126
if "dim_order" in node.kwargs:
127127
dim_order = node.kwargs["dim_order"]
128+
# pyre-ignore[6]
128129
if dim_order != list(range(len(dim_order))):
129130
logger.info(
130131
f"Argument {dim_order=} is not supported for "

backends/cadence/aot/compiler.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
ExecutorchProgramManager,
3434
to_edge,
3535
)
36+
from executorch.exir.dialects._ops import ops as exir_ops
3637
from executorch.exir.pass_base import PassResult
3738
from executorch.exir.passes import ToOutVarPass
3839
from executorch.exir.passes.sym_shape_eval_pass import HintBasedSymShapeEvalPass
@@ -186,14 +187,17 @@ def export_to_edge(
186187
edge_prog_manager = to_edge(
187188
expo_program,
188189
compile_config=EdgeCompileConfig(
189-
_skip_dim_order=True,
190190
# Allow specific non-core aten ops in the IR.
191191
_core_aten_ops_exception_list=[
192192
torch.ops.aten._native_batch_norm_legit_functional.default,
193193
torch.ops.aten.linear.default,
194194
torch.ops.aten.linalg_vector_norm.default,
195195
torch.ops.aten.unfold.default,
196196
torch.ops.aten.angle.default,
197+
# cadence replaced to_dim_order_copy with _to_copy for performance
198+
# skip the _to_copy op to get around the dim order check
199+
# We should remove this op once cadence can support dim order
200+
exir_ops.edge.aten._to_copy.default,
197201
],
198202
),
199203
constant_methods=constant_methods,

backends/cadence/aot/replace_ops.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
# pyre-unsafe
1313

14+
import copy
1415
import math
1516
from operator import neg
1617
from typing import cast, Dict, Iterable, Sequence, Set, Tuple
@@ -35,7 +36,12 @@
3536
from executorch.backends.cadence.aot.utils import get_edge_overload_packet
3637
from executorch.exir.dialects._ops import ops as exir_ops
3738
from executorch.exir.dialects.edge._ops import EdgeOpOverload, EdgeOpOverloadPacket
39+
from executorch.exir.dim_order_utils import get_memory_format
3840
from executorch.exir.pass_base import ExportPass, NodeMetadata, PassResult, ProxyValue
41+
from executorch.exir.passes.dim_order_ops_registry import (
42+
DimOrderOpsMap,
43+
MemoryFormatOpsMap,
44+
)
3945
from torch._subclasses import FakeTensor
4046
from torch.fx.node import Argument
4147

@@ -1799,6 +1805,72 @@ def call_operator(
17991805
)
18001806

18011807

1808+
@register_cadence_pass(CadencePassAttribute(opt_level=0))
class ReplaceToDimOrderCopyWithToCopyPass(ExportPass):
    """
    Swap every op found in DimOrderOpsMap for the op that MemoryFormatOpsMap
    maps it to, replacing the `dim_order` kwarg with a `memory_format` kwarg.

    dim_order_ops::to_dim_order_copy is not supported, so this is an opt_level=0 pass.
    If the dim order is sequential, we don't need the extra work with strides and
    can just use to_copy.
    """

    def call_operator(
        self,
        op,
        args: Tuple[Argument, ...],
        kwargs: Dict[str, Argument],
        meta: NodeMetadata,
    ) -> ProxyValue:
        """
        Rewrite a dim-order op call into its memory-format equivalent.

        Args:
            op: The operator being visited. Ops not in DimOrderOpsMap are
                forwarded to the parent class unchanged.
            args: Positional arguments of the call; args[0] is used to
                determine the rank of the input.
            kwargs: Keyword arguments of the call; `dim_order` is removed and
                `memory_format` is added in the forwarded copy.
            meta: Node metadata, forwarded as-is.

        Returns:
            The result of the parent `call_operator`, invoked with the
            replacement op when `op` is in DimOrderOpsMap.

        Raises:
            AssertionError: If args[0] is not a tensor-like value whose rank
                can be determined, or if `dim_order` is neither None, empty,
                nor the contiguous order `[0, 1, ..., ndim - 1]`.
        """
        if op not in DimOrderOpsMap:
            return super().call_operator(op, args, kwargs, meta)

        # new kwargs with memory_format, and no dim_order, for the new op
        nkwargs = dict(copy.deepcopy(kwargs))  # orig kwargs are immutable

        # can always get the shape, assuming rank is specialized

        # pyre-ignore[16]: `None` has no attribute `to_tensor`
        if isinstance(args[0], ProxyValue) and args[0].is_tensor():
            # pyre-ignore[16]: `None` has no attribute `to_tensor`
            ndim = args[0].to_tensor().dim()
        elif isinstance(args[0], torch.Tensor):
            # pyre-ignore[16]: `None` has no attribute `dim`
            ndim = args[0].dim()
        elif isinstance(args[0], torch.fx.immutable_collections.immutable_list):
            # pyre-ignore[6]: Incompatible parameter type
            ndim = len(args[0])
        else:
            # Raise explicitly instead of `assert 0` so the check still runs
            # under `python -O`; the exception type is unchanged.
            raise AssertionError(
                f"Expecting a Tensor or a ProxyValue but got {type(args[0])}"
            )

        # get the "to" memory format for the EdgeOp
        contiguous_dim_order = list(range(ndim))
        dim_order = nkwargs.pop("dim_order", None)

        # Cadence only supports contiguous memory format
        is_supported_dim_order = (
            dim_order is None
            # pyre-ignore[6]: Incompatible parameter type
            or len(dim_order) == 0
            or dim_order == contiguous_dim_order
        )
        if not is_supported_dim_order:
            raise AssertionError(
                "Expected dim order in contiguous or preserve memory format, "
                f"but got {dim_order}"
            )

        # bring back memory format
        # pyre-ignore[6]: Incompatible parameter type
        nkwargs["memory_format"] = get_memory_format(dim_order)

        memory_format_op = MemoryFormatOpsMap[op]

        return super().call_operator(
            memory_format_op,
            args,
            nkwargs,
            meta,
        )
1872+
1873+
18021874
@register_cadence_pass(CadencePassAttribute(opt_level=0))
18031875
class ReplaceFullLikeWithFullPass(ExportPass):
18041876
"""
@@ -2108,4 +2180,5 @@ class CadenceReplaceOpsInGraph:
21082180
ReplaceSingleElementTensorArgumentsFromFullOpWithScalarPass,
21092181
ReplaceAtenAvgPoolWithJarvisAvgPoolPass,
21102182
ReplaceAtenLinalgVectorNormWithCadenceLinalgVectorNormPass,
2183+
ReplaceToDimOrderCopyWithToCopyPass,
21112184
]

backends/cadence/fusion_g3/operators/op_exp.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -49,9 +49,9 @@ Tensor& exp_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
4949
out);
5050
#endif
5151

52-
if (out.scalar_type() == ScalarType::Float) {
53-
float* const out_data = out.mutable_data_ptr<float>();
54-
const float* const in_data = in.const_data_ptr<float>();
52+
if (in.scalar_type() == ScalarType::Float) {
53+
float* __restrict__ out_data = out.mutable_data_ptr<float>();
54+
const float* __restrict__ in_data = in.const_data_ptr<float>();
5555

5656
XT_KERNEL_CHECK(
5757
ctx, out, xa_nn_elm_exp_f32_f32, out_data, in_data, out.numel());
@@ -66,4 +66,4 @@ Tensor& exp_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
6666
} // namespace native
6767
} // namespace G3
6868
} // namespace impl
69-
} // namespace cadence
69+
} // namespace cadence

backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ void main() {
4141
div_by_x % out_limits.y,
4242
div_by_x / out_limits.y);
4343

44-
if (any(greaterThanEqual(pos, out_limits))) {
44+
if (pos.z >= out_limits.z) {
4545
return;
4646
}
4747

backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ void main() {
5959
pos.y *= BATCH_SIZE_Y;
6060

6161
// do not process if top pixel does not fit within the output range
62-
if (any(greaterThanEqual(pos, out_limits))) {
62+
if (pos.z >= out_limits.z) {
6363
return;
6464
}
6565

backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.glsl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ void main() {
4444
div_by_x % out_limits.y,
4545
div_by_x / out_limits.y);
4646

47-
if (any(greaterThanEqual(pos, out_limits))) {
47+
if (pos.z >= out_limits.z) {
4848
return;
4949
}
5050

backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl

Lines changed: 39 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,9 @@
1212

1313
#define VEC4_T ${texel_type(DTYPE)}
1414

15-
#define TILE_SIZE ${TILE_SIZE}
15+
#define TILE_SIZE_X ${TILE_SIZE_X}
16+
#define TILE_SIZE_Y ${TILE_SIZE_Y}
17+
#define LOCAL_WG_SIZE 64
1618

1719
#define op(X, A, B) ${OPERATOR}
1820

@@ -24,27 +26,36 @@ ${layout_declare_tensor(0, "w", "t_out", DTYPE, "texture3d")}
2426
${layout_declare_tensor(1, "r", "t_in", DTYPE, "texture3d")}
2527
${layout_declare_tensor(2, "r", "t_kernel", DTYPE, "texture2d")}
2628
${layout_declare_tensor(3, "r", "t_bias", DTYPE, "texture2d")}
27-
${layout_declare_ubo(4, "ivec3", "out_limits")}
28-
${layout_declare_ubo(5, "ivec4", "in_sizes")}
29-
${layout_declare_ubo(6, "ivec2", "kernel_size", "ivec2", "stride", "ivec2", "padding", "ivec2", "dilation")}
30-
${layout_declare_ubo(7, "ivec2", "overlay_region", "int", "in_group_size")}
31-
${layout_declare_ubo(8, "float", "out_min", "float", "out_max")}
29+
30+
layout(push_constant) uniform restrict Block {
31+
ivec4 out_limits;
32+
ivec4 in_sizes;
33+
ivec2 kernel_size;
34+
ivec2 stride;
35+
ivec2 padding;
36+
ivec2 dilation;
37+
ivec2 overlay_region;
38+
int in_group_size;
39+
int dummy_padding;
40+
float out_min;
41+
float out_max;
42+
};
3243

3344
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
3445

35-
// shared memory to hold calculated positions, this would reduce register usage thus improving performance.
36-
// 64 is the number of threads in the local wg
37-
$num_shared = 64 * TILE_SIZE * TILE_SIZE
38-
shared ivec2 pos_shared[${num_shared}];
46+
// For performance improvement, reduce register usage by caching positions in shared memory.
47+
// Offset index by 1 every 16 points to avoid bank access conflicts.
48+
#define offset_pos_index(index) (index + ((index) >> 4))
49+
shared ivec3 pos_shared[offset_pos_index(LOCAL_WG_SIZE * TILE_SIZE_X * TILE_SIZE_Y)];
3950

4051
/*
4152
* Computes a 2D pointwise convolution of an NxN output tile. Calculating an
4253
* output tile for pointwise convolution is more efficient because the kernel
4354
* size is only 1x1, making it easier to re-use loaded texels from t_kernel.
4455
*/
4556
void main() {
46-
const ivec2 out_limits_scaled = (out_limits.xy + TILE_SIZE - 1) / TILE_SIZE;
47-
const uint shared_mem_stride = gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z;
57+
const ivec2 out_limits_scaled = (out_limits.xy + ivec2(TILE_SIZE_X - 1, TILE_SIZE_Y - 1)) / ivec2(TILE_SIZE_X, TILE_SIZE_Y);
58+
const uint shared_mem_stride = LOCAL_WG_SIZE;
4859

4960
const uint div_by_x = gl_GlobalInvocationID.x / out_limits_scaled.x;
5061
const ivec3 gpos = ivec3(
@@ -58,33 +69,32 @@ void main() {
5869
// +--------+--------+
5970
// | pos[2] | pos[3] |
6071
// +--------+--------+
61-
ivec2 pos[TILE_SIZE * TILE_SIZE];
62-
for (int y = 0, i = 0; y < TILE_SIZE; ++y) {
63-
for (int x = 0; x < TILE_SIZE; ++x) {
64-
pos[i] = ivec2(
65-
gpos.x * TILE_SIZE + x, gpos.y * TILE_SIZE + y);
66-
pos_shared[(shared_mem_stride * i) + gl_LocalInvocationIndex] = pos[i];
72+
ivec2 pos[TILE_SIZE_X * TILE_SIZE_Y];
73+
for (int y = 0, i = 0; y < TILE_SIZE_Y; ++y) {
74+
for (int x = 0; x < TILE_SIZE_X; ++x) {
75+
pos[i] = ivec2(gpos.x * TILE_SIZE_X + x, gpos.y * TILE_SIZE_Y + y);
76+
pos_shared[offset_pos_index((shared_mem_stride * i) + gl_LocalInvocationIndex)] = ivec3(pos[i], gpos.z);
6777
i++;
6878
}
6979
}
7080

7181
// If the top left position is out of bounds, then this invocation will have
7282
// no work to do.
73-
if (any(greaterThanEqual(ivec3(pos[0], gpos.z), out_limits))) {
83+
if (gpos.z >= out_limits.z) {
7484
return;
7585
}
7686

7787
// Compute the index of the input texture that needs to be loaded for each
7888
// output position. Note that negative indices can be produced indicating that
7989
// the top-left element is in a region added by padding.
80-
ivec2 ipos[TILE_SIZE * TILE_SIZE];
81-
for (int i = 0; i < TILE_SIZE * TILE_SIZE; ++i) {
90+
ivec2 ipos[TILE_SIZE_X * TILE_SIZE_Y];
91+
for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) {
8292
ipos[i] = pos[i] * stride - padding;
8393
}
8494

85-
vec4 sum[TILE_SIZE * TILE_SIZE];
95+
vec4 sum[TILE_SIZE_X * TILE_SIZE_Y];
8696
sum[0] = texelFetch(t_bias, ivec2(gpos.z, 0), 0);
87-
for (int i = 1; i < TILE_SIZE * TILE_SIZE; ++i) {
97+
for (int i = 1; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) {
8898
sum[i] = sum[0];
8999
}
90100

@@ -100,7 +110,7 @@ void main() {
100110
const vec4 ktex_3 = texelFetchOffset(t_kernel, ivec2(z, gpos.z), 0, ivec2(3, 0));
101111

102112
#pragma unroll
103-
for (int i = 0; i < TILE_SIZE * TILE_SIZE; ++i) {
113+
for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) {
104114
const vec4 in_tex = texelFetch(t_in, ivec3(ipos[i], z4), 0);
105115
// For 2x2 tile size algorithm works as follows.
106116
// To explain the calculations below, the contents of one in_tex and the
@@ -142,10 +152,11 @@ void main() {
142152
}
143153
}
144154

145-
for (int i = 0; i < TILE_SIZE * TILE_SIZE; ++i) {
146-
const ivec2 pos = pos_shared[(shared_mem_stride * i) + gl_LocalInvocationIndex];
147-
if (all(lessThan(ivec3(pos, gpos.z), out_limits))) {
148-
imageStore(t_out, ivec3(pos, gpos.z), op(sum[i], out_min, out_max));
155+
for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) {
156+
const uint index = (shared_mem_stride * i) + gl_LocalInvocationIndex;
157+
const ivec3 pos = pos_shared[offset_pos_index(index)];
158+
if (all(lessThan(pos, out_limits.xyz))) {
159+
imageStore(t_out, pos, op(sum[i], out_min, out_max));
149160
}
150161
}
151162
}

backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@ conv2d_pw:
99
OPERATOR: X
1010
NDIM: 3
1111
DTYPE: float
12-
TILE_SIZE: 2
12+
TILE_SIZE_X: 2
13+
TILE_SIZE_Y: 2
1314
generate_variant_forall:
1415
DTYPE:
1516
- VALUE: half

0 commit comments

Comments
 (0)