
Commit 2f48d5a

Merge branch 'main' into android-thinking

2 parents 40f37dc + 3997ae9

File tree

9 files changed (+80 / -14 lines)

backends/arm/_passes/arm_pass_manager.py

Lines changed: 4 additions & 0 deletions

```diff
@@ -59,6 +59,9 @@
 )

 from executorch.backends.arm.tosa_specification import Tosa_0_80, TosaSpecification
+from executorch.backends.transforms.decompose_sdpa import (
+    DecomposeScaledDotProductAttention,
+)
 from executorch.backends.transforms.fuse_view_copy import FuseViewCopyTransform
 from executorch.backends.xnnpack._passes.remove_getitem_op import RemoveGetItemPass
 from executorch.exir import ExportedProgram
@@ -194,6 +197,7 @@ def transform_to_backend_pipeline(self, exported_program: ExportedProgram):
         )

     def transform_for_annotation_pipeline(self, graph_module: GraphModule):
+        self.add_pass(DecomposeScaledDotProductAttention())
         self.add_pass(ReplaceScalarWithTensorArgPassTOSABI())
         self.add_pass(ScalarsToAttributePass())
         self.add_pass(DecomposeLayerNormPass())
```
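For context, the new pass splits the fused `aten.scaled_dot_product_attention` op into matmul/softmax primitives before quantization annotation, so each piece can be annotated individually for the TOSA backend. A minimal numerical sketch of that decomposition (not the ExecuTorch pass itself; shapes and tolerances are illustrative):

```python
# Sketch only: what "decomposing SDPA" means numerically, assuming no mask,
# no dropout, and the default 1/sqrt(d_k) scaling.
import math

import torch
import torch.nn.functional as F


def sdpa_decomposed(query, key, value):
    # softmax(Q @ K^T / sqrt(d_k)) @ V as separate matmul/softmax ops
    scale = 1.0 / math.sqrt(query.size(-1))
    scores = torch.matmul(query, key.transpose(-2, -1)) * scale
    return torch.matmul(torch.softmax(scores, dim=-1), value)


q, k, v = (torch.randn(1, 3, 197, 64) for _ in range(3))
torch.testing.assert_close(
    sdpa_decomposed(q, k, v),
    F.scaled_dot_product_attention(q, k, v),
    rtol=1e-3,
    atol=1e-5,
)
```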

backends/arm/_passes/decompose_softmax_pass.py

Lines changed: 5 additions & 1 deletion

```diff
@@ -8,7 +8,11 @@
 from executorch.exir.pass_base import ExportPass

 # For BI case
-torch_softmax = (torch.ops.aten.softmax.int, torch.ops.aten.log_softmax.int)
+torch_softmax = (
+    torch.ops.aten.softmax.int,
+    torch.ops.aten._safe_softmax.default,
+    torch.ops.aten.log_softmax.int,
+)
 # For MI case
 edge_softmax = (
     exir_ops.edge.aten._softmax.default,
```
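`aten._safe_softmax` is the softmax variant that newer PyTorch releases emit when decomposing `scaled_dot_product_attention`; listing it here lets the BI decomposition catch it as well. A small hedged illustration of its one behavioral difference (requires a PyTorch build that ships this op):

```python
# _safe_softmax behaves like regular softmax except on rows that are
# entirely -inf (fully masked), where it returns zeros instead of NaN.
import torch

row = torch.tensor([[-float("inf"), -float("inf"), -float("inf")]])
print(torch.softmax(row, dim=-1))             # tensor([[nan, nan, nan]])
print(torch.ops.aten._safe_softmax(row, -1))  # tensor([[0., 0., 0.]])
```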

backends/arm/test/models/test_conformer.py

Lines changed: 13 additions & 7 deletions

```diff
@@ -83,7 +83,6 @@ def test_conformer_tosa_BI(self):
             )
         )

-    @unittest.expectedFailure  # TODO(MLETORCH-635)
     def test_conformer_u55_BI(self):
         tester = (
             ArmTester(
@@ -97,13 +96,20 @@ def test_conformer_u55_BI(self):
             .to_executorch()
             .serialize()
         )
+
         if conftest.is_option_enabled("corstone_fvp"):
-            tester.run_method_and_compare_outputs(
-                qtol=1.0,
-                rtol=1.0,
-                atol=5.0,
-                inputs=get_test_inputs(self.dim, self.lengths, self.num_examples),
-            )
+            try:
+                tester.run_method_and_compare_outputs(
+                    qtol=1.0,
+                    rtol=1.0,
+                    atol=5.0,
+                    inputs=get_test_inputs(self.dim, self.lengths, self.num_examples),
+                )
+                self.fail(
+                    "TODO(MLETORCH-635): Expected failure under FVP option, but test passed."
+                )
+            except Exception:
+                pass

     @unittest.expectedFailure  # TODO(MLETORCH-635)
     def test_conformer_u85_BI(self):
```
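This rewrite replaces the blanket `@unittest.expectedFailure` on `test_conformer_u55_BI` with a failure expectation scoped to the FVP path, presumably because the decorator would report an unexpected pass whenever `corstone_fvp` is disabled and the comparison never runs. A self-contained sketch of the pattern, with stand-in names rather than the real harness:

```python
import unittest

OPTION_ENABLED = True  # stand-in for conftest.is_option_enabled("corstone_fvp")


def known_bad_check():
    # Stand-in for run_method_and_compare_outputs(); raises until fixed.
    raise RuntimeError("known issue")


class Example(unittest.TestCase):
    def test_known_issue(self):
        if not OPTION_ENABLED:
            return  # gated path never runs, so the test simply passes
        try:
            known_bad_check()
        except Exception:
            pass  # tolerated: failure is expected until the issue is fixed
        else:
            self.fail("Expected failure, but the check passed.")


if __name__ == "__main__":
    unittest.main()
```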

backends/arm/test/ops/test_sdpa.py

Lines changed: 45 additions & 0 deletions

```diff
@@ -0,0 +1,45 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from typing import Tuple
+
+import torch
+
+from executorch.backends.arm.test.tester.test_pipeline import (
+    TosaPipelineBI,
+    TosaPipelineMI,
+)
+
+
+class SDPA(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, query, key, value):
+        return torch.nn.functional.scaled_dot_product_attention(
+            query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False
+        )
+
+
+input_t = Tuple[torch.Tensor, torch.Tensor, torch.Tensor]
+
+
+def test_sdpa_MI():
+    test_input = tuple(torch.randn(1, 3, 197, 64) for x in range(3))
+    pipeline = TosaPipelineMI[input_t](SDPA(), test_input, [], [])
+    pipeline.pop_stage("check_count.exir")
+    pipeline.run()
+
+
+def test_sdpa_BI():
+    test_input = tuple(torch.randn(1, 3, 197, 64) for x in range(3))
+    pipeline = TosaPipelineBI[input_t](SDPA(), test_input, [], [])
+    pipeline.pop_stage("check.quant_nodes")
+    pipeline.pop_stage("check_count.exir")
+    pipeline.pop_stage(
+        "run_method_and_compare_outputs"
+    )  # TODO: reference is not quantized
+    pipeline.run()
```
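For reference, TOSA MI and BI are the main-inference (float) and base-inference (integer-only) profiles, and the `(1, 3, 197, 64)` input reads like ViT-style attention (196 patch tokens plus a class token across 64-dim heads), though that is only inferred from the shape. A quick eager smoke run of the op under test:

```python
# Eager-mode check of scaled_dot_product_attention with the test's shape:
# the output shape matches the query shape.
import torch
import torch.nn.functional as F

q, k, v = (torch.randn(1, 3, 197, 64) for _ in range(3))
out = F.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0.0)
print(out.shape)  # torch.Size([1, 3, 197, 64])
```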
2 binary files changed (-5.39 MB and -245 KB); contents not shown.

examples/demo-apps/apple_ios/LLaMA/LLaMA/Application/Constants.swift

Lines changed: 2 additions & 0 deletions

```diff
@@ -25,4 +25,6 @@ You are a helpful assistant.
 """

   public static let llama3PromptTemplate = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>%@<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
+
+  public static let phi4PromptTemplate = "<|user|>%@<|end|><|assistant|>"
 }
```
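To keep all sketches here in one language, here is a Python stand-in for what the Swift `String(format:)` call produces with the new template; the `%@` placeholder receives the user's text (the helper name is hypothetical, not app code):

```python
# Hypothetical helper mirroring String(format: Constants.phi4PromptTemplate, text).
PHI4_PROMPT_TEMPLATE = "<|user|>%@<|end|><|assistant|>"


def format_phi4_prompt(text: str) -> str:
    # Swift's %@ slot takes the whole user message.
    return PHI4_PROMPT_TEMPLATE.replace("%@", text)


print(format_phi4_prompt("What is ExecuTorch?"))
# <|user|>What is ExecuTorch?<|end|><|assistant|>
```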

examples/demo-apps/apple_ios/LLaMA/LLaMA/Application/ContentView.swift

Lines changed: 9 additions & 4 deletions

```diff
@@ -86,6 +86,7 @@ struct ContentView: View {
     case llama
     case llava
     case qwen3
+    case phi4

     static func fromPath(_ path: String) -> ModelType {
       let filename = (path as NSString).lastPathComponent.lowercased()
@@ -95,8 +96,10 @@ struct ContentView: View {
         return .llava
       } else if filename.hasPrefix("qwen3") {
         return .qwen3
+      } else if filename.hasPrefix("phi4") {
+        return .phi4
       }
-      print("Unknown model type in path: \(path). Model filename should start with one of: llama, llava, or qwen3")
+      print("Unknown model type in path: \(path). Model filename should start with one of: llama, llava, qwen3, or phi4")
       exit(1)
     }
   }
@@ -343,15 +346,15 @@ struct ContentView: View {
     }

     switch modelType {
-    case .llama, .qwen3:
+    case .llama, .qwen3, .phi4:
       runnerHolder.runner = runnerHolder.runner ?? Runner(modelPath: modelPath, tokenizerPath: tokenizerPath)
     case .llava:
       runnerHolder.llavaRunner = runnerHolder.llavaRunner ?? LLaVARunner(modelPath: modelPath, tokenizerPath: tokenizerPath)
     }

     guard !shouldStopGenerating else { return }
     switch modelType {
-    case .llama, .qwen3:
+    case .llama, .qwen3, .phi4:
       if let runner = runnerHolder.runner, !runner.isLoaded() {
         var error: Error?
         let startLoadTime = Date()
@@ -474,12 +477,14 @@ struct ContentView: View {
       prompt = String(format: Constants.llama3PromptTemplate, text)
     case .llava:
       prompt = String(format: Constants.llama3PromptTemplate, text)
+    case .phi4:
+      prompt = String(format: Constants.phi4PromptTemplate, text)
     }

     try runnerHolder.runner?.generate(prompt, sequenceLength: seq_len) { token in

       if token != prompt {
-        if token == "<|eot_id|>" {
+          if token == "<|eot_id|>" {
         // hack to fix the issue that extension/llm/runner/text_token_generator.h
         // keeps generating after <|eot_id|>
         shouldStopShowingToken = true
```
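The routing change boils down to a filename-prefix lookup. A sketch of the same logic `ModelType.fromPath` implements (`model_type_from_path` is a hypothetical stand-in, not app code):

```python
import os


def model_type_from_path(path: str) -> str:
    # Mirrors fromPath: lowercase the last path component, match a known prefix.
    filename = os.path.basename(path).lower()
    for prefix in ("llama", "llava", "qwen3", "phi4"):
        if filename.startswith(prefix):
            return prefix
    raise ValueError(
        f"Unknown model type in path: {path}. "
        "Model filename should start with one of: llama, llava, qwen3, or phi4"
    )


assert model_type_from_path("/models/phi4-mini.pte") == "phi4"
```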

examples/demo-apps/apple_ios/LLaMA/README.md

Lines changed: 2 additions & 2 deletions

````diff
@@ -32,12 +32,12 @@ Download already exported LLaMA/LLaVA models along with tokenizers from [Hugging
 ```bash
 open examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj
 ```
-
+
 3. Click the Play button to launch the app in the Simulator.

 4. To run on a device, ensure you have it set up for development and a provisioning profile with the `increased-memory-limit` entitlement. Update the app's bundle identifier to match your provisioning profile with the required capability.

-5. After successfully launching the app, copy the exported ExecuTorch model (`.pte`) and tokenizer (`.model`) files to the iLLaMA folder. Three models are currently supported at the moment - Llama, Qwen3, and Llava multimodal. Please ensure that your model `.pte` file starts with `llama`, `qwen3`, or `llava` so that the app selects the correct model type.
+5. After successfully launching the app, copy the exported ExecuTorch model (`.pte`) and tokenizer (`.model`) files to the iLLaMA folder. Four models are currently supported at the moment - Llama, Qwen3, Phi4-mini, and Llava multimodal. Please ensure that your model `.pte` file starts with `llama`, `qwen3`, `phi4` or `llava` so that the app selects the correct model type.

 - **For the Simulator:** Drag and drop both files onto the Simulator window and save them in the `On My iPhone > iLLaMA` folder.
 - **For a Device:** Open a separate Finder window, navigate to the Files tab, drag and drop both files into the iLLaMA folder, and wait for the copying to finish.
````
