Update base for Update on "[ET-VK][Ops] aten.index_select"

jorgep31415 · jorgep31415 · commit 10ec068f9e25 · 2024-05-28T18:59:30.000-07:00
## The Operator

`nn.Module` invocations of [`torch.index_select`](https://pytorch.org/docs/stable/generated/torch.index_select.html) get compiled to `aten.index_select.default` in the Edge Dialect, which carries the following signature.

```
- func: index_select(Tensor self, int dim, Tensor index) -> Tensor
```

## Implementation

This is a C-packing-only implementation. It is very similar to `aten.slice`: #3171

```
- func: slice.Tensor(Tensor(a) self, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor(a)
```

It features a similar split between a shader for N,H,W and a shader for C, because copying from the C-dimension is more difficult due to C-packing.

Both `index_select` and `slice` copy specific indices along one dimension. The difference is in the way these indices are specified.

- `slice` uses `start=1`/`end=5`/`step=2` as three scalars for indices `1,3`.
- `index_select` lists the exact indices inside a tensor, e.g. `index=torch.tensor([1,3])`.

Hence, `slice` uses an `offset=1` and `step=2` to compute the input position. In `index_select`, we read the index tensor to compute the input position.

Differential Revision: [D57745489](https://our.internmc.facebook.com/intern/diff/D57745489/)

[ghstack-poisoned]
diff --git a/.ci/scripts/setup-macos.sh b/.ci/scripts/setup-macos.sh
@@ -77,16 +77,18 @@ install_sccache() {
 
   export PATH="${SCCACHE_PATH}:${PATH}"
 
-  # Create temp directory for sccache shims
-  TMP_DIR=$(mktemp -d)
-  trap 'rm -rfv ${TMP_DIR}' EXIT
+  # Create temp directory for sccache shims unless the caller already provided TMP_DIR (unset or empty)
+  if [ -z "${TMP_DIR:-}" ]; then
+    TMP_DIR=$(mktemp -d)
+    trap 'rm -rfv ${TMP_DIR}' EXIT
+    export PATH="${TMP_DIR}:$PATH"
+  fi
 
   write_sccache_stub "${TMP_DIR}/c++"
   write_sccache_stub "${TMP_DIR}/cc"
   write_sccache_stub "${TMP_DIR}/clang++"
   write_sccache_stub "${TMP_DIR}/clang"
 
-  export PATH="${TMP_DIR}:$PATH"
   sccache --zero-stats || true
 }
 
diff --git a/.ci/scripts/utils.sh b/.ci/scripts/utils.sh
@@ -1,6 +1,7 @@
 #!/bin/bash
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
+# Copyright 2024 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -89,6 +90,17 @@ install_flatc_from_source() {
   popd || return
 }
 
+install_arm() {
+  # NB: This function could be used to install Arm dependencies
+  # Setup arm example environment (including TOSA tools)
+  git config --global user.email "github_executorch@arm.com"
+  git config --global user.name "Github Executorch"
+  bash examples/arm/setup.sh --i-agree-to-the-contained-eula
+
+  # Source setup_path.sh so the Arm tools set up above are available in the current shell
+  source examples/arm/ethos-u-scratch/setup_path.sh
+}
+
 build_executorch_runner_buck2() {
   # Build executorch runtime with retry as this step is flaky on macos CI
   retry buck2 build //examples/portable/executor_runner:executor_runner
diff --git a/.github/workflows/_unittest.yml b/.github/workflows/_unittest.yml
@@ -41,10 +41,6 @@ jobs:
 
   macos:
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
-    strategy:
-      matrix:
-        include:
-          - build-tool: buck2
     with:
       runner: macos-m1-stable
       python-version: '3.11'
@@ -53,18 +49,21 @@ jobs:
       script: |
         set -eux
 
-        BUILD_TOOL=${{ matrix.build-tool }}
-
         bash .ci/scripts/setup-conda.sh
 
+        # Create temp directory for sccache shims
+        export TMP_DIR=$(mktemp -d)
+        export PATH="${TMP_DIR}:$PATH"
+        trap 'rm -rfv ${TMP_DIR}' EXIT
+
         # Setup MacOS dependencies as there is no Docker support on MacOS atm
         PYTHON_EXECUTABLE=python \
         EXECUTORCH_BUILD_PYBIND=ON \
         CMAKE_ARGS="-DEXECUTORCH_BUILD_COREML=ON -DEXECUTORCH_BUILD_MPS=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" \
         ${CONDA_RUN} --no-capture-output \
-        .ci/scripts/setup-macos.sh "${BUILD_TOOL}"
+        .ci/scripts/setup-macos.sh cmake
 
         # Run pytest with coverage
         ${CONDA_RUN} pytest -n auto --cov=./ --cov-report=xml
         # Run gtest
-        ${CONDA_RUN} buck2 test runtime/core/... runtime/platform/...
+        ${CONDA_RUN} test/run_oss_cpp_tests.sh
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
@@ -307,3 +307,37 @@ jobs:
     uses: ./.github/workflows/_unittest.yml
     with:
       docker-image: executorch-ubuntu-22.04-clang12
+
+  unittest-arm:
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    strategy:
+      matrix:
+        include:
+          - build-tool: buck2
+    with:
+      runner: linux.2xlarge
+      docker-image: executorch-ubuntu-22.04-arm-sdk
+      submodules: 'true'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 90
+      script: |
+        set -eux
+
+        # The generic Linux job chooses to use the base env, not the one set up by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        BUILD_TOOL=${{ matrix.build-tool }}
+
+        # Set up Linux dependencies for the Arm unit tests
+        PYTHON_EXECUTABLE=python \
+        EXECUTORCH_BUILD_PYBIND=ON \
+        EXECUTORCH_BUILD_ARM_BAREMETAL=ON \
+        .ci/scripts/setup-linux.sh "${BUILD_TOOL}"
+
+        source .ci/scripts/utils.sh
+        # Install Arm dependencies
+        install_arm
+
+        # Run pytest with coverage
+        pytest -c /dev/null -v -n auto --cov=./ --cov-report=xml backends/arm/test
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
@@ -142,17 +142,13 @@ jobs:
         install_flatc_from_source
         install_executorch
 
-        # Setup arm example environment (including TOSA tools)
-        git config --global user.email "github_executorch@arm.com"
-        git config --global user.name "Github Executorch"
-        bash examples/arm/setup.sh --i-agree-to-the-contained-eula
+        install_arm
 
         # Increase number of files user can monitor to bypass buck failures.
         # Hopefully this is high enough for this setup.
         sudo sysctl fs.inotify.max_user_watches=1048576 # 1024 * 1024
 
         # Test ethos-u delegate examples with run.sh
-        source examples/arm/ethos-u-scratch/setup_path.sh
         PYTHON_EXECUTABLE=python bash examples/arm/run.sh examples/arm/ethos-u-scratch/ buck2
 
   test-arm-reference-delegation:
@@ -172,20 +168,11 @@ jobs:
         install_flatc_from_source
         install_executorch
 
-        # Setup arm example environment (including TOSA tools)
-        git config --global user.email "github_executorch@arm.com"
-        git config --global user.name "Github Executorch"
-        bash examples/arm/setup.sh --i-agree-to-the-contained-eula
+        install_arm
 
         # Test tosa_reference flow
-        source examples/arm/ethos-u-scratch/setup_path.sh
         PYTHON_EXECUTABLE=python bash backends/arm/test/run_tosa_reference.sh
 
-        # Run Arm specific unit-tests
-        # Run pytest only on specified folders. These test should migrate into
-        # the _unittest.yml once that test env is fixed
-        pytest -c /dev/null -v -n auto --cov=./ --cov-report=xml backends/arm/test/ops/ backends/arm/test/models
-
   test-coreml-delegate:
     name: test-coreml-delegate
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
diff --git a/.lintrunner.toml b/.lintrunner.toml
@@ -156,3 +156,33 @@ command = [
     '--',
     '@{{PATHSFILE}}',
 ]
+
+[[linter]]
+code = 'NEWLINE'
+include_patterns = ['**']
+exclude_patterns = [
+    'third-party/**',
+    '**/third-party/**',
+    '**/*.png',
+    '**/*.webp',
+    '**/*.jpeg',
+    '**/*.mp4',
+    '**/*.pte',
+    '**/*.pth',
+    '**/*.bin',
+    '**/*.patch',
+    '**/*.svg',
+    '**/*.bat',
+    '**/*.jpg',
+    '**/*.jar',
+]
+command = [
+    'python',
+    '-m',
+    'lintrunner_adapters',
+    'run',
+    'newlines_linter',
+    '--',
+    '@{{PATHSFILE}}',
+]
+is_formatter = true
diff --git a/backends/apple/coreml/.clang-format b/backends/apple/coreml/.clang-format
@@ -25,4 +25,3 @@ BraceWrapping:
 
 # Options for aligning backslashes in escaped newlines.
 AlignEscapedNewlines: Left
-
diff --git a/backends/apple/coreml/runtime/delegate/backend_delegate.mm b/backends/apple/coreml/runtime/delegate/backend_delegate.mm
@@ -200,4 +200,3 @@ bool purge_models_cache() const noexcept override {
     return std::make_shared<BackendDelegateImpl>(config);
 }
 } //namespace executorchcoreml
-
diff --git a/backends/apple/coreml/runtime/delegate/coreml_backend_delegate.mm b/backends/apple/coreml/runtime/delegate/coreml_backend_delegate.mm
@@ -235,4 +235,3 @@ ModelLoggingOptions get_logging_options(BackendExecutionContext& context) {
 
 } // namespace executor
 } // namespace torch
-
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -70,26 +70,20 @@ Topics in this section will help you get started with ExecuTorch.
    :hidden:
 
    intro-overview
-   concepts
    intro-how-it-works
+   getting-started-architecture
+   concepts
 
 .. toctree::
    :glob:
    :maxdepth: 1
    :caption: Getting Started
    :hidden:
 
-   getting-started-architecture
    getting-started-setup
+   export-overview
    runtime-build-and-cross-compilation
 
-.. toctree::
-   :glob:
-   :maxdepth: 2
-   :caption: Working with LLMs
-   :hidden:
-
-   llm/getting-started
 
 .. toctree::
    :glob:
@@ -116,11 +110,11 @@ Topics in this section will help you get started with ExecuTorch.
 
 .. toctree::
    :glob:
-   :maxdepth: 1
-   :caption: Exporting to ExecuTorch
+   :maxdepth: 2
+   :caption: Working with LLMs
    :hidden:
 
-   export-overview
+   llm/getting-started
 
 .. toctree::
    :glob:
diff --git a/examples/models/llama2/eval_llama_lib.py b/examples/models/llama2/eval_llama_lib.py
@@ -88,12 +88,13 @@ def tok_decode(self, tokens):
 
     def _model_call(self, inps):
         if self._use_kv_cache:
-            result_logits = []
-            for pos in range(self._max_seq_length):
-                pos_tensor = torch.tensor([pos], dtype=torch.int64)
-                logits = self._model(inps[:, pos : pos + 1], pos_tensor)
-                result_logits.append(logits)
-            return torch.cat(result_logits, dim=1)
+            pos_tensor = torch.arange(
+                self._max_seq_length, dtype=torch.int64, device=self.device
+            )
+
+            # Batch process the whole sequence.
+            logits = self._model(inps[:, : self._max_seq_length], pos_tensor)
+            return logits
         else:
             return self._model(inps)
 
diff --git a/examples/models/llama2/lib/quant_lib.py b/examples/models/llama2/lib/quant_lib.py
@@ -158,6 +158,8 @@ def get_qnn_quantizer(args):
         backend == "qnn"
     ), f"The quantization config is for backend {backend} instead of qnn."
     qnn_quantizer = QnnQuantizer()
+    qnn_quantizer.set_per_channel_conv_quant(enable=True)
+    qnn_quantizer.set_per_channel_linear_quant(enable=True)
     # more custom quantization are supported including 16a4w etc. default to 8bit quantized
     custom_annotations = ()
     if quant_config == "8a8w":
diff --git a/exir/program/test/test_program.py b/exir/program/test/test_program.py
@@ -219,7 +219,8 @@ def test_constraint_present_after_dce(self):
         class M(torch.nn.Module):
             def forward(self, x, y):
                 z = y.item()
-                torch._constrain_as_value(z, 0, 4)
+                torch._check(z > 0)
+                torch._check(z < 4)
                 return x[z : z + y.shape[0]]
 
         ep = torch.export.export(M(), (torch.randn(10), torch.tensor([3])))
diff --git a/exir/verification/test/test_verifier.py b/exir/verification/test/test_verifier.py
@@ -37,7 +37,8 @@ def test_edge_verifier_enablement(self) -> None:
         class M(torch.nn.Module):
             def forward(self, x, y):
                 z = y.item()
-                torch._constrain_as_value(z, 0, 4)
+                torch._check(z > 0)
+                torch._check(z < 4)
                 return x[z : z + y.shape[0]]
 
         ep = torch.export.export(M(), (torch.randn(10), torch.tensor([3])))
diff --git a/extension/training/optimizer/sgd.cpp b/extension/training/optimizer/sgd.cpp
diff --git a/extension/training/optimizer/sgd.h b/extension/training/optimizer/sgd.h
diff --git a/extension/training/optimizer/targets.bzl b/extension/training/optimizer/targets.bzl
diff --git a/extension/training/optimizer/test/sgd_test.cpp b/extension/training/optimizer/test/sgd_test.cpp
diff --git a/extension/training/optimizer/test/targets.bzl b/extension/training/optimizer/test/targets.bzl
diff --git a/test/run_oss_cpp_tests.sh b/test/run_oss_cpp_tests.sh

Original file line number	Diff line number	Diff line change
`@@ -25,4 +25,3 @@ BraceWrapping:`
`25`	`25`
`26`	`26`	`# Options for aligning backslashes in escaped newlines.`
`27`	`27`	`AlignEscapedNewlines: Left`
`28`		`-`
Original file line number	Diff line number	Diff line change
`@@ -200,4 +200,3 @@ bool purge_models_cache() const noexcept override {`
`200`	`200`	`return std::make_shared<BackendDelegateImpl>(config);`
`201`	`201`	`}`
`202`	`202`	`} //namespace executorchcoreml`
`203`		`-`
Original file line number	Diff line number	Diff line change
`@@ -235,4 +235,3 @@ ModelLoggingOptions get_logging_options(BackendExecutionContext& context) {`
`235`	`235`
`236`	`236`	`} // namespace executor`
`237`	`237`	`} // namespace torch`
`238`		`-`