
Commit e8998d6

pianpwk authored and pytorchmergebot committed
[export] add non-strict training IR (pytorch#130062)
Summary: Adds a non-strict implementation of training IR export. Any expected non-strict training IR failures are also either existing strict training IR or non-strict failures (no new failures added). Four strict training IR failures are also resolved. Refraining from unifying export/export_for_training, per @ydwu4's feedback :)

Test Plan: added test_export_training_ir_to_run_decomp_non_strict.py for non-strict training IR

Differential Revision: D59349454

Pull Request resolved: pytorch#130062

Approved by: https://github.com/ydwu4, https://github.com/zhxchen17
1 parent d2f44ea commit e8998d6
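The core flow this commit adds coverage for, as a minimal sketch: trace a module into training IR without TorchDynamo (non-strict), then lower it with decompositions. The import path for _export_for_training is an assumption here (the import is not visible in this diff):

import torch
from torch.export._trace import _export_for_training  # assumed import path

class M(torch.nn.Module):
    def forward(self, x):
        return torch.nn.functional.relu(x)

# Non-strict: trace with fake tensors, no TorchDynamo, keeping training ops.
ep = _export_for_training(M(), (torch.randn(3),), strict=False)
# Lower to inference IR; an empty table applies no extra decompositions.
ep = ep.run_decompositions({})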

File tree

4 files changed: +160 -69 lines changed


test/export/test_export.py

Lines changed: 32 additions & 15 deletions

@@ -152,7 +152,8 @@ class Inp:
 RETRACEABILITY_SUFFIX = "_retraceability"
 SERDES_SUFFIX = "_serdes"
 PREDISPATCH_SUFFIX = "_pre_dispatch"
-TRAINING_IR_DECOMP_SUFFIX = "_training_ir_to_decomp"
+TRAINING_IR_DECOMP_STRICT_SUFFIX = "_training_ir_to_decomp"
+TRAINING_IR_DECOMP_NON_STRICT_SUFFIX = "_training_ir_to_decomp_non_strict"


 def is_non_strict_test(test_name):
@@ -167,6 +168,12 @@ def is_serdes_test(test_name):
     return test_name.endswith(SERDES_SUFFIX)


+def is_training_ir_test(test_name):
+    return test_name.endswith(TRAINING_IR_DECOMP_STRICT_SUFFIX) or test_name.endswith(
+        TRAINING_IR_DECOMP_NON_STRICT_SUFFIX
+    )
+
+
 @unittest.skipIf(not torchdynamo.is_dynamo_supported(), "dynamo isn't support")
 class TestDynamismExpression(TestCase):
     def test_export_inline_constraints(self):
@@ -309,6 +316,7 @@ def forward(self, x, y):
         )

     # Errors because fake mode is not detected from non-tensor inputs
+    @testing.expectedFailureTrainingIRToRunDecompNonStrict
     @testing.expectedFailureTrainingIRToRunDecomp
     def test_no_tensor_computation_3(self):
         class Module(torch.nn.Module):
@@ -346,8 +354,6 @@ def forward(self, x, y):
     return (x_0,)""",
         )

-    # Errors because non-strict is not supported in training IR (T193692164)
-    @testing.expectedFailureTrainingIRToRunDecomp
     def test_external_call_non_strict_real_tensor(self):
         class ExternalMethod:
             def add(self, x):
@@ -418,8 +424,6 @@ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
         args = (torch.randn(15, 3, 256, 256), torch.ones(15, 32, 256, 256))
         self.assertEqual(gm(*args), m(*args))

-    # Errors because non-strict is not supported in training IR (T193692164)
-    @testing.expectedFailureTrainingIRToRunDecomp
     def test_basic_non_strict_real_tensor(self):
         class Basic(torch.nn.Module):
             def __init__(self):
@@ -434,8 +438,6 @@ def forward(self, x, y):
         ep = export(f, args, strict=False)
         self.assertEqual(ep.module()(*args), f(*args))

-    # Errors because non-strict is not supported in training IR (T193692164)
-    @testing.expectedFailureTrainingIRToRunDecomp
     def test_basic_non_strict_fake_tensor(self):
         class Basic(torch.nn.Module):
             def __init__(self):
@@ -690,8 +692,6 @@ def forward(self, x):
             torch.allclose(ep.module()(torch.zeros(2, 3)), torch.ones(2, 3) * 21)
         )

-    # Predispatch has different expected results
-    @testing.expectedFailureTrainingIRToRunDecomp  # T193700910
     def test_torch_fn(self):
         class M1(torch.nn.Module):
             def __init__(self):
@@ -823,6 +823,7 @@ def forward(self, p_linear_weight, p_linear_bias, x):
     @testing.expectedFailurePreDispatchRunDecomp
     @testing.expectedFailureRetraceability
     @testing.expectedFailureTrainingIRToRunDecomp  # T193700910
+    @testing.expectedFailureTrainingIRToRunDecompNonStrict
     def test_export_cond_preserve_torch_fn_for_subgraphs(self):
         class MySubModule(torch.nn.Module):
             def foo(self, x):
@@ -2178,6 +2179,7 @@ def forward(self, arg1, arg2, *args, kw1, kw2, **kwargs):
     @testing.expectedFailureSerDer  # we don't save placeholder metadata
     @testing.expectedFailureNonStrict
     @testing.expectedFailureTrainingIRToRunDecomp  # T193692674
+    @testing.expectedFailureTrainingIRToRunDecompNonStrict
     def test_linear_conv(self):
         class MyLinear(torch.nn.Module):
             def __init__(self):
@@ -2853,7 +2855,6 @@ def test_buffer_util(self):
         self.assertEqual(buffer[1].shape, torch.Size([100]))  # running_var
         self.assertEqual(buffer[2].shape, torch.Size([]))  # num_batches_tracked

-    @testing.expectedFailureTrainingIRToRunDecomp  # T193701564
     def test_export_dynamo_config(self):
         class MyModule(torch.nn.Module):
             def __init__(self):
@@ -2889,6 +2890,7 @@ def _patch_config(kwargs):
         _ = export(mod, inp, strict=True)

     @testing.expectedFailureTrainingIRToRunDecomp  # T193700396
+    @testing.expectedFailureTrainingIRToRunDecompNonStrict
     def test_device_to_static(self):
         class Module(torch.nn.Module):
             def forward(self, x):
@@ -2904,6 +2906,7 @@ def forward(self, x):
         self.assertIn(op, (torch.ops.aten._to_copy.default,))

     @testing.expectedFailureTrainingIRToRunDecomp  # T193700396
+    @testing.expectedFailureTrainingIRToRunDecompNonStrict
     def test_device_to_dynamic(self):
         class Module(torch.nn.Module):
             def forward(self, x):
@@ -2923,6 +2926,7 @@ def forward(self, x):
         self.assertIn(op, (torch.ops.aten._to_copy.default,))

     @testing.expectedFailureTrainingIRToRunDecomp  # T193700396
+    @testing.expectedFailureTrainingIRToRunDecompNonStrict
     def test_device_to_mutation(self):
         class Module(torch.nn.Module):
             def forward(self, x):
@@ -2936,6 +2940,7 @@ def forward(self, x):
         export(Module(), (torch.tensor(1, device="cpu"),))

     @testing.expectedFailureTrainingIRToRunDecomp  # T193700396
+    @testing.expectedFailureTrainingIRToRunDecompNonStrict
     def test_float_conversion(self):
         class Module(torch.nn.Module):
             def forward(self, x):
@@ -2951,6 +2956,7 @@ def forward(self, x):
         self.assertIn(op, (torch.ops.aten._to_copy.default,))

     @testing.expectedFailureTrainingIRToRunDecomp  # T193700396
+    @testing.expectedFailureTrainingIRToRunDecompNonStrict
     def test_device_to_mutation_float(self):
         class Module(torch.nn.Module):
             def forward(self, x):
@@ -2964,6 +2970,7 @@ def forward(self, x):
         export(Module(), (torch.tensor(1, dtype=torch.float),))

     @testing.expectedFailureTrainingIRToRunDecomp  # T193692674
+    @testing.expectedFailureTrainingIRToRunDecompNonStrict
     def test_module(self):
         class MyLinear(torch.nn.Module):
             def __init__(self):
@@ -3010,6 +3017,7 @@ def forward(self, x):
         )

     @testing.expectedFailureTrainingIRToRunDecomp  # T193701564
+    @testing.expectedFailureTrainingIRToRunDecompNonStrict
     def test_module_with_dict_container_inp_out(self):
         class MyLinear(torch.nn.Module):
             def __init__(self):
@@ -3773,6 +3781,7 @@ def forward(self, xs, y):

     @testing.expectedFailureSerDer  # We don't preserve metadata on graph module
     @testing.expectedFailureNonStrict
+    @testing.expectedFailureTrainingIRToRunDecompNonStrict
     def test_retrace_graph_level_meta_preservation(self):
         class Foo(torch.nn.Module):
             def __init__(self):
@@ -3854,6 +3863,7 @@ def forward(self, x):

     # TODO Retracing a module with constant attrs don't work.(T193692674)
     @testing.expectedFailureTrainingIRToRunDecomp
+    @testing.expectedFailureTrainingIRToRunDecompNonStrict
     @testing.expectedFailureRetraceability  # T183144788
     def test_lifted_constants(self) -> None:
         class Module(torch.nn.Module):
@@ -3890,6 +3900,7 @@ def forward(self, x):

     @testing.expectedFailureRetraceability  # T183144788
     @testing.expectedFailureTrainingIRToRunDecomp  # T193701164
+    @testing.expectedFailureTrainingIRToRunDecompNonStrict
     def test_tensor_attribute_zero_args(self):
         class Foo(torch.nn.Module):
             def __init__(self, value):
@@ -4237,6 +4248,7 @@ def forward(self, x):

     @testing.expectedFailureRetraceability  # Retracing tensor constants results in buffers
     @testing.expectedFailureTrainingIRToRunDecomp  # T193692674
+    @testing.expectedFailureTrainingIRToRunDecompNonStrict
     def test_nested_module_with_constant_buffer(self):
         class M1(torch.nn.Module):
             def __init__(self):
@@ -4386,6 +4398,8 @@ def forward(self, x, y):
         self.assertTrue(torch.allclose(ep.module()(*inp), M()(*inp)))

     # TODO Retracing a module with constant attrs don't work.(T193692674)
+    @testing.expectedFailureTrainingIRToRunDecomp
+    @testing.expectedFailureTrainingIRToRunDecompNonStrict
     @unittest.skip("Test is only supposed to work with non-strict mode")
     def test_issue_113041(self):
         class TestModule(torch.nn.Module):
@@ -5252,6 +5266,7 @@ def forward(self, x):
         self.assertEqual(ep.state_dict, m.state_dict())

     @testing.expectedFailureTrainingIRToRunDecomp  # T193692674
+    @testing.expectedFailureTrainingIRToRunDecompNonStrict
     def test_non_persistent_buffer(self):
         class MyModule(torch.nn.Module):
             def __init__(self):
@@ -5319,6 +5334,7 @@ def forward(self, x):

     # TODO Retracing a module with constant attrs don't work.(T193692674)
     @testing.expectedFailureTrainingIRToRunDecomp
+    @testing.expectedFailureTrainingIRToRunDecompNonStrict
     def test_fake_weights(self):
         class MyModule(torch.nn.Module):
             def __init__(self):
@@ -5377,8 +5393,6 @@ def forward(self, x):
         # under a new FakeTensorMode.
         ep = torch.export.export(m, (inp,))

-    # Errors because non-strict is not supported in training IR (T193692164)
-    @testing.expectedFailureTrainingIRToRunDecomp
     def test_compiling_state(self):
         class TestModule1(torch.nn.Module):
             def forward(self, x):
@@ -5428,7 +5442,6 @@ def forward(self, x):
         self.assertEqual(mod.foo, ep.module().foo)
         self.assertEqual(mod(torch.ones(4, 4)), ep.module()(torch.ones(4, 4)))

-    @testing.expectedFailureTrainingIRToRunDecomp  # T193702033
     def test_symint_tensor_return(self):
         class Module(torch.nn.Module):
             def forward(self, x):
@@ -5534,6 +5547,7 @@ def forward(self, x):
     # TODO Retracing a module with constant attrs don't work.(T193692674)
     @testing.expectedFailureRetraceability
     @testing.expectedFailureTrainingIRToRunDecomp
+    @testing.expectedFailureTrainingIRToRunDecompNonStrict
     def test_placeholder_naming_collisions(self):
         # test collisions between nested user inputs
         class Foo(torch.nn.Module):
@@ -6150,7 +6164,6 @@ def forward(self, x):
         for param in ["alpha", "beta", "gamma"]:
             self.assertTrue(param in unep.state_dict())

-    @testing.expectedFailureTrainingIRToRunDecomp  # nn_module_stack replacement when we do sympy_interp()
     def test_intermediate_shape_comp(self):
         class Foo(torch.nn.Module):
             def forward(self, x, y):
@@ -6182,14 +6195,18 @@ def forward(self, x, y):
             all(node.args[0].op == "placeholder" for node in sym_size_nodes)
         )
         # dynamo will DCE the repeat node, AOTAutograd will leave it
+        # training IR will also DCE due to retracing
         repeat_nodes = [
             node
             for node in ep.graph.nodes
             if node.target == torch.ops.aten.repeat.default
         ]
         self.assertEqual(
             len(repeat_nodes),
-            1 if is_non_strict_test(self._testMethodName) else 0,
+            1
+            if is_non_strict_test(self._testMethodName)
+            and not is_training_ir_test(self._testMethodName)
+            else 0,
         )

     def test_checks_to_constrain_range(self):
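A subtlety behind the last hunk: generated non-strict training IR test names end with "_training_ir_to_decomp_non_strict", which itself ends with "_non_strict", so is_non_strict_test matches them too. The new is_training_ir_test guard peels those cases back out, since the added comment notes that training IR retracing DCEs the repeat node even in non-strict mode. A quick illustration (assuming NON_STRICT_SUFFIX = "_non_strict" from the unmodified part of this file):

name = "test_intermediate_shape_comp_training_ir_to_decomp_non_strict"
assert name.endswith("_non_strict")                        # is_non_strict_test(name) -> True
assert name.endswith("_training_ir_to_decomp_non_strict")  # is_training_ir_test(name) -> True
# Both match, so the expected repeat-node count falls through to 0.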

test/export/test_export_training_ir_to_run_decomp.py

Lines changed: 29 additions & 11 deletions

@@ -10,24 +10,41 @@
 test_classes = {}


-def mocked_training_ir_to_run_decomp_export(*args, **kwargs):
+def mocked_training_ir_to_run_decomp_export_strict(*args, **kwargs):
     ep = _export_for_training(*args, **kwargs)
     return ep.run_decompositions(
         {}, _preserve_ops=testing._COMPOSITE_OPS_THAT_CAN_BE_PRESERVED_TESTING_ONLY
     )


-def make_dynamic_cls(cls):
-    cls_prefix = "TrainingIRToRunDecompExport"
-
-    test_class = testing.make_test_cls_with_mocked_export(
-        cls,
-        cls_prefix,
-        test_export.TRAINING_IR_DECOMP_SUFFIX,
-        mocked_training_ir_to_run_decomp_export,
-        xfail_prop="_expected_failure_training_ir_to_run_decomp",
+def mocked_training_ir_to_run_decomp_export_non_strict(*args, **kwargs):
+    if "strict" in kwargs:
+        ep = _export_for_training(*args, **kwargs)
+    else:
+        ep = _export_for_training(*args, **kwargs, strict=False)
+    return ep.run_decompositions(
+        {}, _preserve_ops=testing._COMPOSITE_OPS_THAT_CAN_BE_PRESERVED_TESTING_ONLY
     )

+
+def make_dynamic_cls(cls, strict):
+    if strict:
+        test_class = testing.make_test_cls_with_mocked_export(
+            cls,
+            "TrainingIRToRunDecompExport",
+            test_export.TRAINING_IR_DECOMP_STRICT_SUFFIX,
+            mocked_training_ir_to_run_decomp_export_strict,
+            xfail_prop="_expected_failure_training_ir_to_run_decomp",
+        )
+    else:
+        test_class = testing.make_test_cls_with_mocked_export(
+            cls,
+            "TrainingIRToRunDecompExportNonStrict",
+            test_export.TRAINING_IR_DECOMP_NON_STRICT_SUFFIX,
+            mocked_training_ir_to_run_decomp_export_non_strict,
+            xfail_prop="_expected_failure_training_ir_to_run_decomp_non_strict",
+        )
+
     test_classes[test_class.__name__] = test_class
     # REMOVING THIS LINE WILL STOP TESTS FROM RUNNING
     globals()[test_class.__name__] = test_class
@@ -40,7 +57,8 @@ def make_dynamic_cls(cls):
     test_export.TestExport,
 ]
 for test in tests:
-    make_dynamic_cls(test)
+    make_dynamic_cls(test, True)
+    make_dynamic_cls(test, False)
 del test

 if __name__ == "__main__":
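Note the non-strict mock only injects strict=False when the test did not pass strict explicitly. A usage sketch, with a hypothetical module M and assuming the functions above are in scope:

import torch

class M(torch.nn.Module):
    def forward(self, x):
        return x + 1

m, args = M(), (torch.randn(2),)
# Test passed no strict kwarg -> the wrapper forces strict=False:
ep1 = mocked_training_ir_to_run_decomp_export_non_strict(m, args)
# An explicit strict kwarg from the test is respected as-is (here a strict trace):
ep2 = mocked_training_ir_to_run_decomp_export_non_strict(m, args, strict=True)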

test/export/testing.py

Lines changed: 6 additions & 0 deletions

@@ -239,6 +239,12 @@ def expectedFailureTrainingIRToRunDecomp(fn):
     return fn


+# Controls tests generated in test/export/test_export_training_ir_to_run_decomp.py
+def expectedFailureTrainingIRToRunDecompNonStrict(fn):
+    fn._expected_failure_training_ir_to_run_decomp_non_strict = True
+    return fn
+
+
 # Controls tests generated in test/export/test_export_nonstrict.py
 def expectedFailureNonStrict(fn):
     fn._expected_failure_non_strict = True
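The decorator is just an attribute tag: make_test_cls_with_mocked_export reads the attribute back through the xfail_prop name when generating the suffixed test class (presumably marking matches as expected failures). A minimal sketch of the tagging side, with a hypothetical test method:

@expectedFailureTrainingIRToRunDecompNonStrict
def test_example(self):  # hypothetical test method
    ...

assert test_example._expected_failure_training_ir_to_run_decomp_non_strict is True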
