ref(backup): Do deepcopy before validation (#53346)

azaslavsky · web-flow · commit 4c71260e7351 · 2023-07-21T17:10:06.000Z
Previously, all validation mutated the inputs in-place. This was performant (no need to make deep copies of two potentially large JSON blobs), but potentially bug prone if not held carefully, since the validation itself mutated the inputs via JSONScrubbingComparators. This means that the post-comparison input objects MUST NOT be re-used, since they are meaningfully different from the inputs that were declared valid. Instead, we now do deepcopies under the hood, leaving the originally passed-in inputs untouched. Issue: getsentry/team-ospo#156
diff --git a/src/sentry/runner/commands/backup.py b/src/sentry/runner/commands/backup.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 from abc import ABC, abstractmethod
+from copy import deepcopy
 from datetime import datetime, timedelta, timezone
 from difflib import unified_diff
 from io import StringIO
@@ -159,10 +160,13 @@ def compare(self, on: InstanceID, left: JSONData, right: JSONData) -> Comparator
 
 
 def validate(
-    expect: JSONData, actual: JSONData, comparators: ComparatorMap = DEFAULT_COMPARATORS
+    expect: JSONData,
+    actual: JSONData,
+    comparators: ComparatorMap = DEFAULT_COMPARATORS,
 ) -> ComparatorFindings:
     """Ensures that originally imported data correctly matches actual outputted data, and produces a
-    list of reasons why not when it doesn't"""
+    list of reasons why not when it doesn't.
+    """
 
     def json_lines(obj: JSONData) -> list[str]:
         """Take a JSONData object and pretty-print it as JSON."""
@@ -176,6 +180,11 @@ def json_lines(obj: JSONData) -> list[str]:
         id = InstanceID(model["model"], model["pk"])
         exp_models[id] = model
 
+    # Because we may be scrubbing data from the objects as we compare them, we may (optionally) make
+    # deep copies to start to avoid potentially mangling the input data.
+    expect = deepcopy(expect)
+    actual = deepcopy(actual)
+
     # Ensure that the actual JSON contains no duplicates - we assume that the expected JSON did not.
     for model in actual:
         id = InstanceID(model["model"], model["pk"])
diff --git a/tests/sentry/backup/test_correctness.py b/tests/sentry/backup/test_correctness.py
@@ -20,7 +20,9 @@
 
 
 def import_export_then_validate(
-    tmp_path: Path, fixture_file_name: str, map: ComparatorMap = EMPTY_COMPARATORS_FOR_TESTING
+    tmp_path: Path,
+    fixture_file_name: str,
+    map: ComparatorMap = EMPTY_COMPARATORS_FOR_TESTING,
 ) -> None:
     """Test helper that validates that data imported from a fixture `.json` file correctly matches
     the actual outputted export data."""
@@ -34,11 +36,7 @@ def import_export_then_validate(
         rv = CliRunner().invoke(import_, [str(fixture_file_path)])
         assert rv.exit_code == 0, rv.output
 
-    res = validate(
-        expect,
-        tmp_export_to_file(tmp_path.joinpath("tmp_test_file.json")),
-        map,
-    )
+    res = validate(expect, tmp_export_to_file(tmp_path.joinpath("tmp_test_file.json")), map)
     if res.findings:
         raise ValidationError(res)