Commit 85698df

Update base for Update on "move rope related logic together"
Right now, rope-related code is scattered across a few different places in `llama_transformer`, which makes rope-related changes hard to make. This PR moves all rope-related logic into its own module. Differential Revision: [D65173598](https://our.internmc.facebook.com/intern/diff/D65173598/) [ghstack-poisoned]
2 parents fdba723 + b8fbc48 commit 85698df
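Note: the hunks shown on this page are CI and backend changes that rode along with the rebase; the rope refactor itself is not visible here. As a rough illustration only, a consolidated rope module for a Llama-style transformer usually groups the frequency precomputation and the rotary application in one place. A minimal sketch, assuming the conventional helpers (names, signatures, and shapes below are illustrative, not taken from this PR):

import torch


def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0) -> torch.Tensor:
    # Complex rotation factors, shape (end, dim // 2), for rotary position embeddings.
    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: dim // 2].float() / dim))
    t = torch.arange(end, device=freqs.device)
    freqs = torch.outer(t, freqs).float()
    return torch.polar(torch.ones_like(freqs), freqs)


def apply_rotary_emb(xq: torch.Tensor, xk: torch.Tensor, freqs_cis: torch.Tensor):
    # xq, xk: (batch, seq_len, n_heads, head_dim); freqs_cis: (seq_len, head_dim // 2).
    xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
    xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
    freqs_cis = freqs_cis.view(1, xq_.shape[1], 1, xq_.shape[-1])  # broadcast over batch and heads
    xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)
    xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)
    return xq_out.type_as(xq), xk_out.type_as(xk)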

File tree

137 files changed: +5559 −1134 lines


.ci/scripts/gather_test_models.py

Lines changed: 11 additions & 9 deletions
@@ -20,16 +20,16 @@
 CUSTOM_RUNNERS = {
     "linux": {
         # This one runs OOM on smaller runner, the root cause is unclear (T163016365)
-        "w2l": "linux.12xlarge",
-        "ic4": "linux.12xlarge",
-        "resnet50": "linux.12xlarge",
-        "llava": "linux.12xlarge",
-        "llama3_2_vision_encoder": "linux.12xlarge",
-        # "llama3_2_text_decoder": "linux.12xlarge", # TODO: re-enable test when Huy's change is in / model gets smaller.
+        "w2l": "linux.4xlarge.memory",
+        "ic4": "linux.4xlarge.memory",
+        "resnet50": "linux.4xlarge.memory",
+        "llava": "linux.4xlarge.memory",
+        "llama3_2_vision_encoder": "linux.4xlarge.memory",
+        "llama3_2_text_decoder": "linux.4xlarge.memory",
         # This one causes timeout on smaller runner, the root cause is unclear (T161064121)
-        "dl3": "linux.12xlarge",
-        "emformer_join": "linux.12xlarge",
-        "emformer_predict": "linux.12xlarge",
+        "dl3": "linux.4xlarge.memory",
+        "emformer_join": "linux.4xlarge.memory",
+        "emformer_predict": "linux.4xlarge.memory",
     }
 }

@@ -39,10 +39,12 @@
     "linux": {
         "mobilebert": 90,
         "emformer_predict": 360,
+        "llama3_2_text_decoder": 360,
     },
     "macos": {
         "mobilebert": 90,
         "emformer_predict": 360,
+        "llama3_2_text_decoder": 360,
     },
 }
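These mappings only take effect where the gather script looks them up per model and OS. A hypothetical lookup helper, just to show the intended shape of the data (the function, its defaults, and the timeout map's name are assumptions; only CUSTOM_RUNNERS appears in the hunk above):

def runner_and_timeout(custom_runners, custom_timeouts, model, os_name):
    # Fall back to an assumed default runner/timeout when a model has no override.
    runner = custom_runners.get(os_name, {}).get(model, "linux.2xlarge")
    timeout = custom_timeouts.get(os_name, {}).get(model, 90)
    return runner, timeout


# e.g. runner_and_timeout(CUSTOM_RUNNERS, CUSTOM_TIMEOUTS, "llama3_2_text_decoder", "linux")
# would yield ("linux.4xlarge.memory", 360) with the values added in this commit.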

.ci/scripts/setup-macos.sh

Lines changed: 3 additions & 0 deletions
@@ -49,6 +49,9 @@ install_buck() {

   rm "${BUCK2}"
   popd
+
+  # Kill all running buck2 daemon for a fresh start
+  buck2 killall || true
 }

 function write_sccache_stub() {

.ci/scripts/test_llama.sh

Lines changed: 11 additions & 0 deletions
@@ -27,6 +27,10 @@ while [[ $# -gt 0 ]]; do
       MODE="$2" # portable or xnnpack+custom or xnnpack+custom+qe
       shift 2
       ;;
+    -pt2e_quantize)
+      PT2E_QUANTIZE="$2"
+      shift 2
+      ;;
     -upload)
       UPLOAD_DIR="$2"
       shift 2
@@ -44,6 +48,9 @@ MODE=${MODE:-"xnnpack+custom"}
 # Default UPLOAD_DIR to empty string if not set
 UPLOAD_DIR="${UPLOAD_DIR:-}"

+# Default PT2E_QUANTIZE to empty string if not set
+PT2E_QUANTIZE="${PT2E_QUANTIZE:-}"
+
 if [[ $# -lt 4 ]]; then # Assuming 4 mandatory args
   echo "Expecting atleast 4 positional arguments"
   echo "Usage: [...]"
@@ -234,6 +241,10 @@ if [[ "${COREML}" == "ON" ]]; then
 fi
 if [[ "${QNN}" == "ON" ]]; then
   EXPORT_ARGS="${EXPORT_ARGS} -kv -v --qnn --disable_dynamic_shape"
+  echo "PT2E_QUANTIZE is ${PT2E_QUANTIZE}"
+  if [[ "${PT2E_QUANTIZE}" == "qnn_16a16w" ]]; then
+    EXPORT_ARGS+=" --tokenizer_path tokenizer.model --pt2e_quantize qnn_16a16w --calibration_tasks wikitext --calibration_limit 1 --calibration_seq_length 128 --calibration_data Once "
+  fi
 fi
 # Add dynamically linked library location
 $PYTHON_EXECUTABLE -m examples.models.llama.export_llama ${EXPORT_ARGS}
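For reference, the same QNN export that this script builds up through EXPORT_ARGS could be assembled in Python; a minimal sketch, assuming a checkout where examples.models.llama.export_llama is runnable as a module (the wrapper function itself is hypothetical; the flags are the ones appended above):

import subprocess
import sys


def export_llama_qnn(pt2e_quantize: str = "") -> None:
    # Mirrors the QNN branch of test_llama.sh above.
    args = [
        sys.executable, "-m", "examples.models.llama.export_llama",
        "-kv", "-v", "--qnn", "--disable_dynamic_shape",
    ]
    if pt2e_quantize == "qnn_16a16w":
        # Same calibration flags the script appends to EXPORT_ARGS.
        args += [
            "--tokenizer_path", "tokenizer.model",
            "--pt2e_quantize", "qnn_16a16w",
            "--calibration_tasks", "wikitext",
            "--calibration_limit", "1",
            "--calibration_seq_length", "128",
            "--calibration_data", "Once",
        ]
    subprocess.run(args, check=True)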

.github/workflows/apple.yml

Lines changed: 6 additions & 0 deletions
@@ -42,6 +42,8 @@ jobs:

   build-demo-ios:
     name: build-demo-ios
+    # NB: Don't run this on fork PRs because they won't have access to the secret and would fail anyway
+    if: ${{ !github.event.pull_request.head.repo.fork }}
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
     secrets: inherit
     with:
@@ -190,6 +192,8 @@ jobs:
       ) done

   upload-frameworks-ios:
+    # NB: Don't run this on fork PRs because they won't have access to the secret and would fail anyway
+    if: ${{ !github.event.pull_request.head.repo.fork }}
     runs-on: ubuntu-22.04
     needs: [build-frameworks-ios, set-version]
     timeout-minutes: 30
@@ -278,6 +282,8 @@ jobs:

   build-benchmark-app:
     name: build-benchmark-app
+    # NB: Don't run this on fork PRs because they won't have access to the secret and would fail anyway
+    if: ${{ !github.event.pull_request.head.repo.fork }}
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
     secrets: inherit
     with:

.github/workflows/build-wheels-linux.yml

Lines changed: 1 addition & 0 deletions
@@ -27,6 +27,7 @@ jobs:
       test-infra-ref: main
       with-cuda: disabled
       with-rocm: disabled
+      python-versions: '["3.10", "3.11", "3.12"]'

   build:
     needs: generate-matrix

.github/workflows/build-wheels-m1.yml

Lines changed: 1 addition & 0 deletions
@@ -27,6 +27,7 @@ jobs:
       test-infra-ref: main
       with-cuda: disabled
       with-rocm: disabled
+      python-versions: '["3.10", "3.11", "3.12"]'

   build:
     needs: generate-matrix

.github/workflows/pull.yml

Lines changed: 4 additions & 2 deletions
@@ -332,7 +332,7 @@ jobs:
       docker-image: executorch-ubuntu-22.04-clang12

   unittest-arm:
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       runner: linux.2xlarge
       docker-image: executorch-ubuntu-22.04-arm-sdk
@@ -368,6 +368,7 @@ jobs:
     strategy:
       matrix:
         dtype: [fp32]
+        pt2e_quantize: [qnn_16a16w, qnn_8a8w]
         mode: [qnn]
       fail-fast: false
     with:
@@ -384,6 +385,7 @@ jobs:
       DTYPE=${{ matrix.dtype }}
       BUILD_TOOL="cmake"
       MODE=${{ matrix.mode }}
+      PT2E_QUANTIZE=${{ matrix.pt2e_quantize }}

       PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
       PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
@@ -393,7 +395,7 @@ jobs:
       # Install requirements for export_llama
       PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
       # Test llama2
-      PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -dtype "${DTYPE}" -mode "${MODE}"
+      PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -mode "${MODE}" -dtype "${DTYPE}" -pt2e_quantize "${PT2E_QUANTIZE}"

   test-phi-3-mini-runner-linux:
     name: test-phi-3-mini-runner-linux

.github/workflows/trunk.yml

Lines changed: 40 additions & 2 deletions
@@ -131,7 +131,7 @@ jobs:

   test-arm-backend-delegation:
     name: test-arm-backend-delegation
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       runner: linux.2xlarge
       docker-image: executorch-ubuntu-22.04-arm-sdk
@@ -157,7 +157,7 @@ jobs:

   test-arm-reference-delegation:
     name: test-arm-reference-delegation
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       runner: linux.2xlarge
       docker-image: executorch-ubuntu-22.04-arm-sdk
@@ -351,6 +351,8 @@ jobs:
       done

   test-huggingface-transformers:
+    # NB: Don't run this on fork PRs because they won't have access to the secret and would fail anyway
+    if: ${{ !github.event.pull_request.head.repo.fork }}
     name: test-huggingface-transformers
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
     secrets: inherit
@@ -441,3 +443,39 @@ jobs:

       cmake-out/examples/models/llama/llama_main --model_path=${ET_MODEL_NAME}.pte --tokenizer_path=${TOKENIZER_BIN_FILE} --prompt="My name is"
       echo "::endgroup::"
+
+
+  test-llama-runner-qnn-linux:
+    name: test-llama-runner-qnn-linux
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    strategy:
+      matrix:
+        dtype: [fp32]
+        pt2e_quantize: [qnn_16a16w, qnn_8a8w]
+        mode: [qnn]
+      fail-fast: false
+    with:
+      runner: linux.2xlarge
+      docker-image: executorch-ubuntu-22.04-qnn-sdk
+      submodules: 'true'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 900
+      script: |
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        BUILD_TOOL="cmake"
+        DTYPE=${{ matrix.dtype }}
+        MODE=${{ matrix.mode }}
+        PT2E_QUANTIZE=${{ matrix.pt2e_quantize }}
+
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
+        PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
+
+        # Setup executorch
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}"
+        # Install requirements for export_llama
+        PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
+        # Test llama2
+        PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -mode "${MODE}" -dtype "${DTYPE}" -pt2e_quantize "${PT2E_QUANTIZE}"

.gitmodules

Lines changed: 3 additions & 0 deletions
@@ -64,6 +64,9 @@
 [submodule "third-party/pybind11"]
     path = third-party/pybind11
     url = https://github.com/pybind/pybind11.git
+[submodule "backends/cadence/fusion_g3/third-party/nnlib/nnlib-FusionG3"]
+    path = backends/cadence/fusion_g3/third-party/nnlib/nnlib-FusionG3
+    url = https://github.com/foss-xtensa/nnlib-FusionG3/
 [submodule "third-party/ao"]
     path = third-party/ao
     url = https://github.com/pytorch/ao.git

CONTRIBUTING.md

Lines changed: 9 additions & 4 deletions
@@ -283,10 +283,15 @@ for basics.
    - If the reviewers have requests or questions, follow up with them.
    - The goal of the reviewer is to ensure that the code in the `main` branch of
      the repo is consistent, maintainable, and of high quality.
-1. Once approved, your reviewer will import the PR into Meta's internal system
-   and merge it from there.
-   - If the PR is approved and not merged within a few business days, please
-     comment on the PR to ask about its status.
+1. Once the PR has been approved,
+   - If you have the "write permission" in this repo, you can merge it yourself
+     by clicking the "Squash and merge" button once it is green and all CI
+     signals are passing.
+   - If you don't have "write permission" in this repo, the reviewer will take
+     care of the PR. The reviewer may import the PR into Meta's internal system
+     to validate it against internal CI.
+   - If the PR is approved but not merged within 5 business days, please comment
+     on the PR to ask about its status.
 - Note that if the `main` [CI](#continuous-integration) jobs are broken, we
   will only merge PRs that fix the broken jobs until all critical jobs are
   fixed.

backends/apple/coreml/runtime/delegate/ETCoreMLModelCompiler.mm

Lines changed: 29 additions & 16 deletions
@@ -26,25 +26,38 @@ + (nullable NSURL *)compileModelAtURL:(NSURL *)modelURL
 #else
     __block NSError *localError = nil;
     __block NSURL *result = nil;
-
-    dispatch_semaphore_t sema = dispatch_semaphore_create(0);
-    [MLModel compileModelAtURL:modelURL completionHandler:^(NSURL * _Nullable tempURL, NSError * _Nullable compilationError) {
-        result = [tempURL copy];
-        localError = compilationError;
-        dispatch_semaphore_signal(sema);
-    }];
-
-    long status = dispatch_semaphore_wait(sema, dispatch_time(DISPATCH_TIME_NOW, (int64_t)(maxWaitTimeInSeconds * NSEC_PER_SEC)));
-    if (status != 0) {
+
+    if (@available(iOS 16, macOS 13, watchOS 9, tvOS 16, *)) {
+        dispatch_semaphore_t sema = dispatch_semaphore_create(0);
+        [MLModel compileModelAtURL:modelURL completionHandler:^(NSURL * _Nullable tempURL, NSError * _Nullable compilationError) {
+            result = [tempURL copy];
+            localError = compilationError;
+            dispatch_semaphore_signal(sema);
+        }];
+
+        long status = dispatch_semaphore_wait(sema, dispatch_time(DISPATCH_TIME_NOW, (int64_t)(maxWaitTimeInSeconds * NSEC_PER_SEC)));
+        if (status != 0) {
+            ETCoreMLLogErrorAndSetNSError(error,
+                                          ETCoreMLErrorCompilationFailed,
+                                          "%@: Failed to compile model in %f seconds.",
+                                          NSStringFromClass(ETCoreMLModelCompiler.class),
+                                          maxWaitTimeInSeconds);
+            return nil;
+        }
+    } else {
+        result = [MLModel compileModelAtURL:modelURL error:&localError];
+    }
+
+    if (localError) {
         ETCoreMLLogErrorAndSetNSError(error,
-                                      ETCoreMLErrorCompilationFailed,
-                                      "%@: Failed to compile model in %f seconds.",
-                                      NSStringFromClass(ETCoreMLModelCompiler.class),
-                                      maxWaitTimeInSeconds);
+                                      ETCoreMLErrorCompilationFailed,
+                                      "%@: Failed to compile model, error: %@",
+                                      NSStringFromClass(ETCoreMLModelCompiler.class),
+                                      localError);
         return nil;
+    } else {
+        return result;
     }
-
-    return result;
 #endif
 }

backends/apple/coreml/runtime/test/ETCoreMLModelDebuggerTests.mm

Lines changed: 0 additions & 1 deletion
@@ -151,7 +151,6 @@ - (void)testMV3ProgramDebugging {
     XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten__native_batch_norm_legit_no_training_default_13_cast_fp16")]);
     XCTAssertNotNil(debuggingResults[make_path_with_output_name("_inversed_aten_div_tensor_24_cast_fp16")]);
     XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten_mean_dim_7_cast_fp16")]);
-    XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten_clamp_default_54_cast_fp16")]);
     XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten__native_batch_norm_legit_no_training_default_22_cast_fp16")]);
     XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten_mul_tensor_27_cast_fp16")]);
 }

backends/apple/coreml/runtime/test/ETCoreMLModelProfilerTests.mm

Lines changed: 0 additions & 1 deletion
@@ -146,7 +146,6 @@ - (void)testMV3ProgramProfiling {
     XCTAssertNotNil(profilingResult[make_path_with_output_name("aten__native_batch_norm_legit_no_training_default_13_cast_fp16")]);
     XCTAssertNotNil(profilingResult[make_path_with_output_name("_inversed_aten_div_tensor_24_cast_fp16")]);
     XCTAssertNotNil(profilingResult[make_path_with_output_name("aten_mean_dim_7_cast_fp16")]);
-    XCTAssertNotNil(profilingResult[make_path_with_output_name("aten_clamp_default_54_cast_fp16")]);
     XCTAssertNotNil(profilingResult[make_path_with_output_name("aten__native_batch_norm_legit_no_training_default_22_cast_fp16")]);
     XCTAssertNotNil(profilingResult[make_path_with_output_name("aten_mul_tensor_27_cast_fp16")]);
 };

backends/apple/coreml/scripts/install_requirements.sh

Lines changed: 1 addition & 1 deletion
@@ -24,7 +24,7 @@ rm -rf "$COREML_DIR_PATH/third-party"
 mkdir "$COREML_DIR_PATH/third-party"

 echo "${green}ExecuTorch: Cloning coremltools."
-git clone --depth 1 --branch 8.0 "https://github.com/apple/coremltools.git" $COREMLTOOLS_DIR_PATH
+git clone --depth 1 --branch 8.1 "https://github.com/apple/coremltools.git" $COREMLTOOLS_DIR_PATH
 cd $COREMLTOOLS_DIR_PATH

 STATUS=$?

backends/apple/coreml/test/test_coreml_partitioner.py

Lines changed: 5 additions & 13 deletions
@@ -71,23 +71,15 @@ def test_vit_skip_conv(self):
             )
         )

-        conv_block = ["aten.convolution.default", "executorch_call_delegate"]
-        safe_softmax_block = [
-            "getitem",
-            "getitem",
-            "getitem",
-            "getitem",
-            "aten.any.dim",
-            "executorch_call_delegate",
-        ]
-        final_block = ["getitem"]
-        total = conv_block + 12 * safe_softmax_block + final_block
-
         assert [
             node.target.__name__
             for node in delegated_program_manager.exported_program().graph.nodes
             if node.op == "call_function"
-        ] == total
+        ] == [
+            "aten.convolution.default",
+            "executorch_call_delegate",
+            "getitem",
+        ]

     def test_buffer(self):
         embedding_dim = 3
