Patch release 2.2.5 (#19893)

awaelchli · tchaton · lantiga · web-flow · commit ac3f1ee0d32b · 2024-05-22T13:19:39.000-04:00
Co-authored-by: thomas chaton &lt;thomas@grid.ai&gt;
Co-authored-by: Luca Antiga &lt;luca.antiga@gmail.com&gt;
diff --git a/docs/source-app/conf.py b/docs/source-app/conf.py
@@ -449,3 +449,6 @@ def find_source():
 
 # ignore all links in any CHANGELOG file
 linkcheck_exclude_documents = [r"^(.*\/)*CHANGELOG.*$"]
+
+# ignore the following relative links (false positive errors during linkcheck)
+linkcheck_ignore = ["https://openai.com/"]
diff --git a/docs/source-pytorch/conf.py b/docs/source-pytorch/conf.py
@@ -343,8 +343,6 @@ def _load_py_module(name: str, location: str) -> ModuleType:
     "graphcore": ("https://docs.graphcore.ai/en/latest/", None),
     "lightning_habana": ("https://lightning-ai.github.io/lightning-Habana/", None),
     "tensorboardX": ("https://tensorboardx.readthedocs.io/en/stable/", None),
-    # needed for referencing App from lightning scope
-    "lightning.app": ("https://lightning.ai/docs/app/stable/", None),
     # needed for referencing Fabric from lightning scope
     "lightning.fabric": ("https://lightning.ai/docs/fabric/stable/", None),
     # TODO: these are missing objects.inv
@@ -626,4 +624,5 @@ def package_list_from_file(file):
     "https://stackoverflow.com/questions/66640705/how-can-i-install-grpcio-on-an-apple-m1-silicon-laptop",
     "https://github.com/Lightning-AI/lightning/blob/master/examples/pytorch/ipu/mnist_sample.py",
     "https://ngc.nvidia.com/catalog/containers/nvidia:nemo",  # in ecosystem/asr_nlp_tts.rst
+    "https://openai.com/",
 ]
diff --git a/requirements/app/app.txt b/requirements/app/app.txt
@@ -1,4 +1,4 @@
-lightning-cloud == 0.5.68  # Must be pinned to ensure compatibility
+lightning-cloud == 0.5.69  # Must be pinned to ensure compatibility
 packaging
 typing-extensions >=4.4.0, <4.10.0
 deepdiff >=5.7.0, <6.6.0
diff --git a/src/lightning/app/core/app.py b/src/lightning/app/core/app.py
@@ -30,6 +30,7 @@
 from lightning.app.api.request_types import _APIRequest, _CommandRequest, _DeltaRequest
 from lightning.app.core.constants import (
     BATCH_DELTA_COUNT,
+    CHECK_ERROR_QUEUE_INTERVAL,
     DEBUG_ENABLED,
     FLOW_DURATION_SAMPLES,
     FLOW_DURATION_THRESHOLD,
@@ -165,6 +166,7 @@ def __init__(
 
         self._last_run_time: float = 0.0
         self._run_times: list = []
+        self._last_check_error_queue: float = 0.0
 
         # Path attributes can't get properly attached during the initialization, because the full name
         # is only available after all Flows and Works have been instantiated.
@@ -318,10 +320,12 @@ def batch_get_state_changed_from_queue(q: BaseQueue, timeout: Optional[float] =
             return []
 
     def check_error_queue(self) -> None:
-        exception: Exception = self.get_state_changed_from_queue(self.error_queue)  # type: ignore[assignment,arg-type]
-        if isinstance(exception, Exception):
-            self.exception = exception
-            self.stage = AppStage.FAILED
+        if (time() - self._last_check_error_queue) > CHECK_ERROR_QUEUE_INTERVAL:
+            exception: Exception = self.get_state_changed_from_queue(self.error_queue)  # type: ignore[assignment,arg-type]
+            if isinstance(exception, Exception):
+                self.exception = exception
+                self.stage = AppStage.FAILED
+            self._last_check_error_queue = time()
 
     @property
     def flows(self) -> List[Union[LightningWork, "LightningFlow"]]:
diff --git a/src/lightning/app/core/constants.py b/src/lightning/app/core/constants.py
@@ -70,6 +70,7 @@ def get_lightning_cloud_url() -> str:
 LIGHTNING_COMPONENT_PUBLIC_REGISTRY = "https://lightning.ai/v1/components"
 LIGHTNING_APPS_PUBLIC_REGISTRY = "https://lightning.ai/v1/apps"
 LIGHTNING_MODELS_PUBLIC_REGISTRY = "https://lightning.ai/v1/models"
+ENABLE_ORCHESTRATOR = bool(int(os.getenv("ENABLE_ORCHESTRATOR", "1")))
 
 LIGHTNING_CLOUDSPACE_HOST = os.getenv("LIGHTNING_CLOUDSPACE_HOST")
 LIGHTNING_CLOUDSPACE_EXPOSED_PORT_COUNT = int(os.getenv("LIGHTNING_CLOUDSPACE_EXPOSED_PORT_COUNT", "0"))
@@ -99,6 +100,7 @@ def get_lightning_cloud_url() -> str:
 SYS_CUSTOMIZATIONS_SYNC_PATH = ".sys-customizations-sync"
 
 BATCH_DELTA_COUNT = int(os.getenv("BATCH_DELTA_COUNT", "128"))
+CHECK_ERROR_QUEUE_INTERVAL = float(os.getenv("CHECK_ERROR_QUEUE_INTERVAL", "30"))
 
 
 def enable_multiple_works_in_default_container() -> bool:
diff --git a/src/lightning/app/runners/cloud.py b/src/lightning/app/runners/cloud.py
@@ -34,7 +34,7 @@
     CloudspaceIdRunsBody,
     Externalv1LightningappInstance,
     Gridv1ImageSpec,
-    IdGetBody1,
+    IdGetBody,
     ProjectIdCloudspacesBody,
     V1BuildSpec,
     V1CloudSpace,
@@ -1027,7 +1027,7 @@ def _api_create_run_instance(
             project_id=project_id,
             cloudspace_id=cloudspace_id,
             id=run_id,
-            body=IdGetBody1(
+            body=IdGetBody(
                 cluster_id=cluster_id,
                 name=run_name,
                 desired_state=desired_state,
diff --git a/src/lightning/app/runners/multiprocess.py b/src/lightning/app/runners/multiprocess.py
@@ -81,16 +81,17 @@ def dispatch(self, *args: Any, open_ui: bool = True, **kwargs: Any):
 
             _set_flow_context()
 
-            storage_orchestrator = StorageOrchestrator(
-                self.app,
-                self.app.request_queues,
-                self.app.response_queues,
-                self.app.copy_request_queues,
-                self.app.copy_response_queues,
-            )
-            self.threads.append(storage_orchestrator)
-            storage_orchestrator.setDaemon(True)
-            storage_orchestrator.start()
+            if constants.ENABLE_ORCHESTRATOR:
+                storage_orchestrator = StorageOrchestrator(
+                    self.app,
+                    self.app.request_queues,
+                    self.app.response_queues,
+                    self.app.copy_request_queues,
+                    self.app.copy_response_queues,
+                )
+                self.threads.append(storage_orchestrator)
+                storage_orchestrator.setDaemon(True)
+                storage_orchestrator.start()
 
             if self.start_server:
                 self.app.should_publish_changes_to_api = True
diff --git a/src/lightning/app/utilities/network.py b/src/lightning/app/utilities/network.py
@@ -96,10 +96,14 @@ def create_retry_strategy():
         # are going to be alive for a very long time (~ 4 days) but retries every 120 seconds
         total=_CONNECTION_RETRY_TOTAL,
         backoff_factor=_CONNECTION_RETRY_BACKOFF_FACTOR,
+        # Any 4xx and 5xx statuses except
+        # 400 Bad Request
+        # 401 Unauthorized
+        # 403 Forbidden
+        # 404 Not Found
         status_forcelist={
-            408,  # Request Timeout
-            429,  # Too Many Requests
-            *range(500, 600),  # Any 5xx Server Error status
+            402,
+            *range(405, 600),
         },
         allowed_methods={
             "POST",  # Default methods are idempotent, add POST here
diff --git a/src/lightning/fabric/CHANGELOG.md b/src/lightning/fabric/CHANGELOG.md
@@ -5,6 +5,13 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 
+## [2.2.5] - 2024-05-23
+
+### Fixed
+
+- Fixed a matrix shape mismatch issue when running a model loaded from a quantized checkpoint (bitsandbytes) ([#19886](https://github.com/Lightning-AI/lightning/pull/19886))
+
+
 ## [2.2.2] - 2024-04-11
 
 ### Fixed
diff --git a/src/lightning/fabric/plugins/precision/bitsandbytes.py b/src/lightning/fabric/plugins/precision/bitsandbytes.py
@@ -234,9 +234,9 @@ def quantize_(self, weight: Optional[torch.Tensor] = None, device: Optional[torc
             """Inplace quantize."""
             if weight is None:
                 weight = self.weight.data
-                if weight.data.type == torch.int8:
-                    # already quantized
-                    return
+            if weight.data.dtype == torch.int8:
+                # already quantized
+                return
             assert isinstance(self.weight, bnb.nn.Int8Params)
             self.weight = self.quantize(self.weight, weight, device)
 
@@ -318,9 +318,9 @@ def quantize_(self, weight: Optional[torch.Tensor] = None, device: Optional[torc
             """Inplace quantize."""
             if weight is None:
                 weight = self.weight.data
-                if weight.data.type == torch.uint8:
-                    # already quantized
-                    return
+            if weight.data.dtype == torch.uint8:
+                # already quantized
+                return
             assert isinstance(self.weight, bnb.nn.Params4bit)
             self.weight = self.quantize(self.weight, weight, device)
 
diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md
@@ -5,6 +5,13 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 
+## [2.2.5] - 2024-05-23
+
+### Fixed
+
+- Fixed a matrix shape mismatch issue when running a model loaded from a quantized checkpoint (bitsandbytes) ([#19886](https://github.com/Lightning-AI/lightning/pull/19886))
+
+
 ## [2.2.3] - 2024-04-23
 
 ### Fixed
diff --git a/src/version.info b/src/version.info
@@ -1 +1 @@
-2.2.4
+2.2.5
diff --git a/tests/tests_app/core/test_lightning_app.py b/tests/tests_app/core/test_lightning_app.py
@@ -1187,3 +1187,27 @@ def run(self):
 def test_lightning_work_stopped():
     app = LightningApp(SimpleWork2())
     MultiProcessRuntime(app, start_server=False).dispatch()
+
+
+class FailedWork(LightningWork):
+    def run(self):
+        raise Exception
+
+
+class CheckErrorQueueLightningApp(LightningApp):
+    def check_error_queue(self):
+        super().check_error_queue()
+
+
+def test_error_queue_check(monkeypatch):
+    import sys
+
+    from lightning.app.core import app as app_module
+
+    sys_mock = mock.MagicMock()
+    monkeypatch.setattr(app_module, "CHECK_ERROR_QUEUE_INTERVAL", 0)
+    monkeypatch.setattr(sys, "exit", sys_mock)
+    app = LightningApp(FailedWork())
+    MultiProcessRuntime(app, start_server=False).dispatch()
+    assert app.stage == AppStage.FAILED
+    assert app._last_check_error_queue != 0.0
diff --git a/tests/tests_app/runners/test_cloud.py b/tests/tests_app/runners/test_cloud.py
@@ -24,7 +24,7 @@
     Externalv1Cluster,
     Externalv1LightningappInstance,
     Gridv1ImageSpec,
-    IdGetBody1,
+    IdGetBody,
     ProjectIdProjectclustersbindingsBody,
     V1BuildSpec,
     V1CloudSpace,
@@ -508,7 +508,7 @@ def test_basic_auth_enabled(self, tmpdir, monkeypatch):
             project_id="test-project-id",
             cloudspace_id=mock.ANY,
             id=mock.ANY,
-            body=IdGetBody1(
+            body=IdGetBody(
                 desired_state=mock.ANY,
                 name=mock.ANY,
                 env=mock.ANY,
@@ -712,7 +712,7 @@ def test_call_with_queue_server_type_specified(self, tmpdir, lightningapps, monk
         cloud_runtime.dispatch()
 
         # calling with no env variable set
-        body = IdGetBody1(
+        body = IdGetBody(
             desired_state=V1LightningappInstanceState.STOPPED,
             env=[],
             name=mock.ANY,
@@ -727,7 +727,7 @@ def test_call_with_queue_server_type_specified(self, tmpdir, lightningapps, monk
         monkeypatch.setitem(os.environ, "LIGHTNING_CLOUD_QUEUE_TYPE", "http")
         cloud_runtime.backend.client.reset_mock()
         cloud_runtime.dispatch()
-        body = IdGetBody1(
+        body = IdGetBody(
             desired_state=V1LightningappInstanceState.STOPPED,
             env=mock.ANY,
             name=mock.ANY,
@@ -998,7 +998,7 @@ def test_call_with_work_app_and_app_comment_command_execution_set(self, lightnin
                 project_id="test-project-id",
                 cloudspace_id=mock.ANY,
                 id=mock.ANY,
-                body=IdGetBody1(
+                body=IdGetBody(
                     desired_state=V1LightningappInstanceState.STOPPED,
                     name=mock.ANY,
                     env=[V1EnvVar(name="ENABLE_APP_COMMENT_COMMAND_EXECUTION", value="1")],
diff --git a/tests/tests_app/utilities/test_network.py b/tests/tests_app/utilities/test_network.py
@@ -49,7 +49,8 @@ def test_find_free_network_port_cloudspace(_, patch_constants):
 def test_http_client_retry_post(getconn_mock):
     getconn_mock.return_value.getresponse.side_effect = [
         mock.Mock(status=500, msg=HTTPMessage()),
-        mock.Mock(status=429, msg=HTTPMessage()),
+        mock.Mock(status=599, msg=HTTPMessage()),
+        mock.Mock(status=405, msg=HTTPMessage()),
         mock.Mock(status=200, msg=HTTPMessage()),
     ]
 
@@ -61,14 +62,16 @@ def test_http_client_retry_post(getconn_mock):
         mock.call("POST", "/test", body=None, headers=mock.ANY),
         mock.call("POST", "/test", body=None, headers=mock.ANY),
         mock.call("POST", "/test", body=None, headers=mock.ANY),
+        mock.call("POST", "/test", body=None, headers=mock.ANY),
     ]
 
 
 @mock.patch("urllib3.connectionpool.HTTPConnectionPool._get_conn")
 def test_http_client_retry_get(getconn_mock):
     getconn_mock.return_value.getresponse.side_effect = [
         mock.Mock(status=500, msg=HTTPMessage()),
-        mock.Mock(status=429, msg=HTTPMessage()),
+        mock.Mock(status=599, msg=HTTPMessage()),
+        mock.Mock(status=405, msg=HTTPMessage()),
         mock.Mock(status=200, msg=HTTPMessage()),
     ]
 
@@ -80,4 +83,5 @@ def test_http_client_retry_get(getconn_mock):
         mock.call("GET", "/test", body=None, headers=mock.ANY),
         mock.call("GET", "/test", body=None, headers=mock.ANY),
         mock.call("GET", "/test", body=None, headers=mock.ANY),
+        mock.call("GET", "/test", body=None, headers=mock.ANY),
     ]
diff --git a/tests/tests_fabric/plugins/precision/test_bitsandbytes.py b/tests/tests_fabric/plugins/precision/test_bitsandbytes.py
@@ -230,3 +230,37 @@ def __init__(self):
     assert not keys.missing_keys
     assert model.l.weight.device.type == "cuda"
     assert model.l.weight.dtype == expected
+
+
+@RunIf(min_cuda_gpus=1, min_torch="2.1")
+@pytest.mark.skipif(not _BITSANDBYTES_AVAILABLE, reason="bitsandbytes unavailable")
+def test_load_quantized_checkpoint(tmp_path):
+    """Test that a checkpoint saved from a quantized model can be loaded back into a quantized model."""
+
+    class Model(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.linear = torch.nn.Linear(16, 16, bias=False)
+
+        def forward(self, x):
+            return self.linear(x)
+
+    fabric = Fabric(accelerator="cuda", devices=1, plugins=BitsandbytesPrecision("nf4-dq"))
+    model = Model()
+    model = fabric.setup(model)
+    model(torch.randn(2, 16, device=fabric.device))
+    state_dict = model.state_dict()
+    # The checkpoint contains quantized weights
+    assert state_dict["linear.weight"].dtype == torch.uint8
+    assert state_dict["linear.weight"].shape == (128, 1)
+    torch.save(state_dict, tmp_path / "checkpoint.pt")
+
+    fabric = Fabric(accelerator="cuda", devices=1, plugins=BitsandbytesPrecision("nf4-dq"))
+    model = Model()
+    model = fabric.setup(model)
+    state_dict = torch.load(tmp_path / "checkpoint.pt")
+    model.load_state_dict(state_dict)
+    assert model.linear.weight.dtype == torch.uint8
+    assert model.linear.weight.shape == (128, 1)
+    # Shapes match during forward (weight is being dequantized during forward)
+    model(torch.randn(2, 16, device=fabric.device))

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-lightning-cloud == 0.5.68 # Must be pinned to ensure compatibility`
	`1`	`+lightning-cloud == 0.5.69 # Must be pinned to ensure compatibility`
`2`	`2`	`packaging`
`3`	`3`	`typing-extensions >=4.4.0, <4.10.0`
`4`	`4`	`deepdiff >=5.7.0, <6.6.0`