Merge pull request #126 from stac-utils/add-pydantic

jonhealy1 · web-flow · commit 9daa5a6823ee · 2025-05-31T23:45:29.000+08:00
- Added --pydantic option for validating STAC objects using stac-pydantic models, providing enhanced type checking and validation
- Improved bbox validation output to show detailed information about mismatches between bbox and geometry bounds, including which specific coordinates differ and by how much
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -11,6 +11,11 @@ The format is (loosely) based on [Keep a Changelog](http://keepachangelog.com/)
 - Added sponsors and supporters section with logos ([#122](https://github.com/stac-utils/stac-check/pull/122))
 - Added check to verify that bbox matches item's polygon geometry ([#123](https://github.com/stac-utils/stac-check/pull/123))
 - Added configuration documentation to README ([#124](https://github.com/stac-utils/stac-check/pull/124))
+- Added `--pydantic` option for validating STAC objects using stac-pydantic models, providing enhanced type checking and validation ([#126](https://github.com/stac-utils/stac-check/pull/126))
+
+### Enhanced
+
+- Improved bbox validation output to show detailed information about mismatches between bbox and geometry bounds, including which specific coordinates differ and by how much ([#126](https://github.com/stac-utils/stac-check/pull/126))
 
 ### Updated
 
diff --git a/README.md b/README.md
@@ -86,6 +86,7 @@ Options:
                            (enabled by default).
   --header KEY VALUE       HTTP header to include in the requests. Can be used
                            multiple times.
+  --pydantic               Use stac-pydantic for enhanced validation with Pydantic models.
   --help                   Show this message and exit.
 ```
 
diff --git a/sample_files/1.0.0/bad-item.json b/sample_files/1.0.0/bad-item.json
@@ -8,7 +8,7 @@
     -122.59750209,
     37.48803556,
     -122.2880486,
-    37.613537207
+    37.613531207
   ],
   "geometry": {
     "type": "Polygon",
diff --git a/setup.py b/setup.py
@@ -3,7 +3,7 @@
 
 from setuptools import find_packages, setup
 
-__version__ = "1.6.0"
+__version__ = "1.7.0"
 
 with open("README.md", "r") as fh:
     long_description = fh.read()
@@ -20,7 +20,7 @@
         "requests>=2.32.3",
         "jsonschema>=4.23.0",
         "click>=8.1.8",
-        "stac-validator>=3.6.0",
+        "stac-validator[pydantic]>=3.7.0",
         "PyYAML",
         "python-dotenv",
     ],
diff --git a/stac_check/cli.py b/stac_check/cli.py
@@ -91,6 +91,13 @@ def intro_message(linter: Linter) -> None:
         f"Validator: stac-validator {linter.validator_version}", bg="blue", fg="white"
     )
 
+    # Always show validation method
+    validation_method = (
+        "Pydantic" if hasattr(linter, "pydantic") and linter.pydantic else "JSONSchema"
+    )
+    click.secho()
+    click.secho(f"Validation method: {validation_method}", bg="yellow", fg="black")
+
     click.secho()
 
 
@@ -111,7 +118,17 @@ def cli_message(linter: Linter) -> None:
 
     """ schemas validated for core object """
     click.secho()
-    if len(linter.schema) > 0:
+
+    # Determine if we're using Pydantic validation
+    using_pydantic = hasattr(linter, "pydantic") and linter.pydantic
+
+    # For Pydantic validation, always show the appropriate schema model
+    if using_pydantic:
+        click.secho("Schemas validated: ", fg="blue")
+        asset_type = linter.asset_type.capitalize() if linter.asset_type else "Item"
+        click.secho(f"    stac-pydantic {asset_type} model")
+    # For JSONSchema validation or when schemas are available
+    elif len(linter.schema) > 0:
         click.secho("Schemas validated: ", fg="blue")
         for schema in linter.schema:
             click.secho(f"    {schema}")
@@ -194,10 +211,15 @@ def cli_message(linter: Linter) -> None:
     multiple=True,
     help="HTTP header to include in the requests. Can be used multiple times.",
 )
+@click.option(
+    "--pydantic",
+    is_flag=True,
+    help="Use stac-pydantic for enhanced validation with Pydantic models.",
+)
 @click.command()
 @click.argument("file")
 @click.version_option(version=importlib.metadata.distribution("stac-check").version)
-def main(file, recursive, max_depth, assets, links, no_assets_urls, header):
+def main(file, recursive, max_depth, assets, links, no_assets_urls, header, pydantic):
     linter = Linter(
         file,
         assets=assets,
@@ -206,6 +228,7 @@ def main(file, recursive, max_depth, assets, links, no_assets_urls, header):
         max_depth=max_depth,
         assets_open_urls=not no_assets_urls,
         headers=dict(header),
+        pydantic=pydantic,
     )
     intro_message(linter)
     if recursive > 0:
diff --git a/stac_check/lint.py b/stac_check/lint.py
@@ -3,7 +3,7 @@
 import json
 import os
 from dataclasses import dataclass, field
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 import requests
 import yaml
@@ -27,6 +27,7 @@ class Linter:
         max_depth (Optional[int], optional): An optional integer indicating the maximum depth to validate recursively. Defaults to None.
         assets_open_urls (bool): Whether to open assets URLs when validating assets. Defaults to True.
         headers (dict): HTTP headers to include in the requests.
+        pydantic (bool, optional): A boolean value indicating whether to use pydantic validation. Defaults to False.
 
     Attributes:
         data (dict): A dictionary representing the STAC JSON file.
@@ -122,14 +123,15 @@ def check_summaries(self) -> bool:
             Creates a message with best practices recommendations for the STAC JSON file.
     """
 
-    item: Union[str, dict]  # url, file name, or dictionary
+    item: Union[str, Dict]
     config_file: Optional[str] = None
     assets: bool = False
     links: bool = False
     recursive: bool = False
     max_depth: Optional[int] = None
     assets_open_urls: bool = True
-    headers: dict = field(default_factory=dict)
+    headers: Dict = field(default_factory=dict)
+    pydantic: bool = False
 
     def __post_init__(self):
         self.data = self.load_data(self.item)
@@ -270,16 +272,21 @@ def validate_file(self, file: Union[str, dict]) -> Dict[str, Any]:
                 assets=self.assets,
                 assets_open_urls=self.assets_open_urls,
                 headers=self.headers,
+                pydantic=self.pydantic,
             )
             stac.run()
         elif isinstance(file, dict):
             stac = StacValidate(
-                assets_open_urls=self.assets_open_urls, headers=self.headers
+                assets_open_urls=self.assets_open_urls,
+                headers=self.headers,
+                pydantic=self.pydantic,
             )
             stac.validate_dict(file)
         else:
             raise ValueError("Input must be a file path or STAC dictionary.")
-        return stac.message[0]
+
+        message = stac.message[0]
+        return message
 
     def recursive_validation(self, file: Union[str, Dict[str, Any]]) -> str:
         """Recursively validate a STAC item or catalog file and its child items.
@@ -302,6 +309,7 @@ def recursive_validation(self, file: Union[str, Dict[str, Any]]) -> str:
                     max_depth=self.max_depth,
                     assets_open_urls=self.assets_open_urls,
                     headers=self.headers,
+                    pydantic=self.pydantic,
                 )
                 stac.run()
             else:
@@ -310,6 +318,7 @@ def recursive_validation(self, file: Union[str, Dict[str, Any]]) -> str:
                     max_depth=self.max_depth,
                     assets_open_urls=self.assets_open_urls,
                     headers=self.headers,
+                    pydantic=self.pydantic,
                 )
                 stac.validate_dict(file)
             return stac.message
@@ -454,16 +463,20 @@ def check_geometry_null(self) -> bool:
         else:
             return False
 
-    def check_bbox_matches_geometry(self) -> bool:
+    def check_bbox_matches_geometry(
+        self,
+    ) -> Union[bool, Tuple[bool, List[float], List[float], List[float]]]:
         """Checks if the bbox of a STAC item matches its geometry.
 
         This function verifies that the bounding box (bbox) accurately represents
         the minimum bounding rectangle of the item's geometry. It only applies to
         items with non-null geometry of type Polygon or MultiPolygon.
 
         Returns:
-            bool: True if the bbox matches the geometry or if the check is not applicable
-                 (e.g., null geometry or non-polygon type). False if there's a mismatch.
+            Union[bool, Tuple[bool, List[float], List[float], List[float]]]:
+                - True if the bbox matches the geometry or if the check is not applicable
+                  (e.g., null geometry or non-polygon type).
+                - When there's a mismatch: a tuple containing (False, calculated_bbox, actual_bbox, differences)
         """
         # Skip check if geometry is null or bbox is not present
         if (
@@ -504,11 +517,14 @@ def check_bbox_matches_geometry(self) -> bool:
 
         calc_bbox = [min(lons), min(lats), max(lons), max(lats)]
 
-        # Allow for small floating point differences (epsilon)
-        epsilon = 1e-8
-        for i in range(4):
-            if abs(bbox[i] - calc_bbox[i]) > epsilon:
-                return False
+        # Allow for differences that would be invisible when rounded to 6 decimal places
+        # 1e-6 would be exactly at the 6th decimal place, so use 5e-7 to be just under that threshold
+        epsilon = 5e-7
+        differences = [abs(bbox[i] - calc_bbox[i]) for i in range(4)]
+
+        if any(diff > epsilon for diff in differences):
+            # Return False along with the calculated bbox, actual bbox, and the differences
+            return (False, calc_bbox, bbox, differences)
 
         return True
 
@@ -675,12 +691,60 @@ def create_best_practices_dict(self) -> Dict:
             best_practices_dict["null_geometry"] = [msg_1]
 
         # best practices - check if bbox matches geometry
-        if (
-            not self.check_bbox_matches_geometry()
-            and config.get("check_bbox_geometry_match", True) == True
-        ):
-            msg_1 = "The bbox field does not match the bounds of the geometry. The bbox should be the minimum bounding rectangle of the geometry."
-            best_practices_dict["bbox_geometry_mismatch"] = [msg_1]
+        bbox_check_result = self.check_bbox_matches_geometry()
+        bbox_mismatch = False
+
+        if isinstance(bbox_check_result, tuple):
+            bbox_mismatch = not bbox_check_result[0]
+        else:
+            bbox_mismatch = not bbox_check_result
+
+        if bbox_mismatch and config.get("check_bbox_geometry_match", True) == True:
+            if isinstance(bbox_check_result, tuple):
+                # Unpack the result
+                _, calc_bbox, actual_bbox, differences = bbox_check_result
+
+                # Format the bbox values for display
+                calc_bbox_str = ", ".join([f"{v:.6f}" for v in calc_bbox])
+                actual_bbox_str = ", ".join([f"{v:.6f}" for v in actual_bbox])
+
+                # Create a more detailed message about which coordinates differ
+                coordinate_labels = [
+                    "min longitude",
+                    "min latitude",
+                    "max longitude",
+                    "max latitude",
+                ]
+                mismatch_details = []
+
+                # Use the same epsilon threshold as in check_bbox_matches_geometry
+                epsilon = 5e-7
+
+                for i, (diff, label) in enumerate(zip(differences, coordinate_labels)):
+                    if diff > epsilon:
+                        mismatch_details.append(
+                            f"{label}: calculated={calc_bbox[i]:.6f}, actual={actual_bbox[i]:.6f}, diff={diff:.7f}"
+                        )
+
+                msg_1 = "The bbox field does not match the bounds of the geometry. The bbox should be the minimum bounding rectangle of the geometry."
+                msg_2 = f"Calculated bbox from geometry: [{calc_bbox_str}]"
+                msg_3 = f"Actual bbox in metadata: [{actual_bbox_str}]"
+
+                messages = [msg_1, msg_2, msg_3]
+                if mismatch_details:
+                    messages.append("Mismatched coordinates:")
+                    messages.extend(mismatch_details)
+                else:
+                    # If we got here but there are no visible differences at 6 decimal places,
+                    # add a note explaining that the differences are too small to matter
+                    messages.append(
+                        "Note: The differences are too small to be visible at 6 decimal places and can be ignored."
+                    )
+
+                best_practices_dict["bbox_geometry_mismatch"] = messages
+            else:
+                msg_1 = "The bbox field does not match the bounds of the geometry. The bbox should be the minimum bounding rectangle of the geometry."
+                best_practices_dict["bbox_geometry_mismatch"] = [msg_1]
 
         # check to see if there are too many links
         if (
diff --git a/tests/test_lint.py b/tests/test_lint.py
@@ -282,7 +282,7 @@ def test_bbox_matches_geometry():
     # Test with matching bbox and geometry
     file = "sample_files/1.0.0/core-item.json"
     linter = Linter(file)
-    assert linter.check_bbox_matches_geometry() == True
+    assert linter.check_bbox_matches_geometry() is True
 
     # Test with mismatched bbox and geometry
     mismatched_item = {
@@ -306,7 +306,30 @@ def test_bbox_matches_geometry():
         "properties": {"datetime": "2020-12-11T22:38:32.125Z"},
     }
     linter = Linter(mismatched_item)
-    assert linter.check_bbox_matches_geometry() == False
+    result = linter.check_bbox_matches_geometry()
+
+    # Check that the result is a tuple and the first element is False
+    assert isinstance(result, tuple)
+    assert result[0] is False
+
+    # Check that the tuple contains the expected elements (calculated bbox, actual bbox, differences)
+    assert len(result) == 4
+    calc_bbox, actual_bbox, differences = result[1], result[2], result[3]
+
+    # Verify the calculated bbox matches the geometry coordinates
+    assert calc_bbox == [
+        172.91173669923782,
+        1.3438851951615003,
+        172.95469614953714,
+        1.3690476620161975,
+    ]
+
+    # Verify the actual bbox is what we provided
+    assert actual_bbox == [100.0, 0.0, 105.0, 1.0]
+
+    # Verify the differences are calculated correctly
+    expected_differences = [abs(actual_bbox[i] - calc_bbox[i]) for i in range(4)]
+    assert differences == expected_differences
 
     # Test with null geometry (should return True as check is not applicable)
     null_geom_item = {
@@ -318,7 +341,7 @@ def test_bbox_matches_geometry():
         "properties": {"datetime": "2020-12-11T22:38:32.125Z"},
     }
     linter = Linter(null_geom_item)
-    assert linter.check_bbox_matches_geometry() == True
+    assert linter.check_bbox_matches_geometry() is True
 
     # Test with missing bbox (should return True as check is not applicable)
     no_bbox_item = {
@@ -340,7 +363,7 @@ def test_bbox_matches_geometry():
         "properties": {"datetime": "2020-12-11T22:38:32.125Z"},
     }
     linter = Linter(no_bbox_item)
-    assert linter.check_bbox_matches_geometry() == True
+    assert linter.check_bbox_matches_geometry() is True
 
 
 def test_bloated_item():
@@ -633,3 +656,36 @@ def test_lint_assets_no_links():
             "request_invalid": [],
         },
     }
+
+
+def test_lint_pydantic_validation_valid():
+    """Test pydantic validation with a valid STAC item."""
+    file = "sample_files/1.0.0/core-item.json"
+    linter = Linter(file, pydantic=True)
+
+    assert linter.valid_stac == True
+    assert linter.asset_type == "ITEM"
+    assert "stac-pydantic Item model" in linter.message["schema"]
+    assert linter.message["validation_method"] == "pydantic"
+
+
+def test_lint_pydantic_validation_invalid():
+    """Test pydantic validation with an invalid STAC item (missing required fields)."""
+    file = "sample_files/1.0.0/bad-item.json"
+    linter = Linter(file, pydantic=True)
+
+    assert linter.valid_stac == False
+    assert "PydanticValidationError" in linter.message["error_type"]
+    assert "id: Field required" in linter.message["error_message"]
+    assert linter.message["validation_method"] == "pydantic"
+
+
+def test_lint_pydantic_validation_recursive():
+    """Test pydantic validation with recursive option."""
+    file = "sample_files/1.0.0/collection.json"
+    linter = Linter(file, recursive=True, max_depth=1, pydantic=True)
+
+    assert linter.valid_stac == True
+    assert linter.asset_type == "COLLECTION"
+    assert "stac-pydantic Collection model" in linter.message["schema"]
+    assert linter.message["validation_method"] == "pydantic"