Commit 3b475e3
[llama-mm] Fix AOTI test for attention (#6915)
Summary: Disable `reorder_for_peak_memory` because it moves `_local_scalar_dense` codegen to after subgraphs.

Test Plan: As titled.

```
RUN_SKIPPED=1 python -m unittest extension.llm.modules.test.test_attention -k test_attention_aoti
```

Need to address the flaky test later.
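For context, here is a minimal, self-contained sketch of the two pieces this change touches: the env-var-gated skip and the AOTI compile options. The `ToyAOTITest` class, the `torch.nn.Linear` stand-in module, and the use of `torch._export.aot_compile` as the compile entry point are illustrative assumptions, not the actual attention test from the diff below.

```python
# Hypothetical sketch (not the real test): an env-var-gated unittest plus the
# AOTI compile options this commit passes. Runs as a no-op skip unless
# RUN_SKIPPED=1 is set, mirroring the Test Plan above.
import os
import unittest

import torch


class ToyAOTITest(unittest.TestCase):
    @unittest.skipIf(
        int(os.getenv("RUN_SKIPPED", 0)) < 1,
        reason="flaky; opt in with RUN_SKIPPED=1",
    )
    def test_toy_aoti(self):
        model = torch.nn.Linear(4, 4).eval()  # stand-in for the attention module
        example_args = (torch.randn(2, 4),)

        # Assumed compile entry point; the diff passes the same `options` keys.
        # Disabling reorder_for_peak_memory keeps the scheduler from moving
        # _local_scalar_dense codegen to after subgraphs.
        so = torch._export.aot_compile(
            model,
            args=example_args,
            options={
                "aot_inductor.package": True,
                "reorder_for_peak_memory": False,
            },
        )
        self.assertIsNotNone(so)


if __name__ == "__main__":
    unittest.main()
```

Without `RUN_SKIPPED=1` the test is skipped outright, so the sketch runs (and skips) under plain `python -m unittest`; exporting `RUN_SKIPPED=1` opts in, as in the Test Plan.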
1 parent: 4f9ae32

File tree: 1 file changed, +7 −2 lines changed


extension/llm/modules/test/test_attention.py

Lines changed: 7 additions & 2 deletions
```diff
@@ -156,7 +156,9 @@ def test_attention_export(self):
 
         assert_close(et_res, tt_res)
 
-    @unittest.skip(reason="TODO(T207740932): test is flaky")
+    @unittest.skipIf(
+        int(os.getenv("RUN_SKIPPED", 0)) < 1, reason="TODO(T207740932): test is flaky"
+    )
     def test_attention_aoti(self):
         # Self attention.
 
@@ -168,7 +170,10 @@ def test_attention_aoti(self):
                 self.et_mha,
                 args=(self.x, self.x),
                 kwargs={"input_pos": self.input_pos},
-                options={"aot_inductor.package": True},
+                options={
+                    "aot_inductor.package": True,
+                    "reorder_for_peak_memory": False,
+                },
                 dynamic_shapes=self.dynamic_shapes,
             )
         with tempfile.TemporaryDirectory() as tempdir:
```
