Skip to content

Commit ac3f1ee

Browse files
awaelchli, tchaton, lantiga
authored
Patch release 2.2.5 (#19893)
Co-authored-by: thomas chaton <[email protected]>
Co-authored-by: Luca Antiga <[email protected]>
1 parent 2a46b0c commit ac3f1ee

File tree

16 files changed

+125
-36
lines changed

16 files changed

+125
-36
lines changed

docs/source-app/conf.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -449,3 +449,6 @@ def find_source():
449449

450450
# ignore all links in any CHANGELOG file
451451
linkcheck_exclude_documents = [r"^(.*\/)*CHANGELOG.*$"]
452+
453+
# ignore the following relative links (false positive errors during linkcheck)
454+
linkcheck_ignore = ["https://openai.com/"]

docs/source-pytorch/conf.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -343,8 +343,6 @@ def _load_py_module(name: str, location: str) -> ModuleType:
343343
"graphcore": ("https://docs.graphcore.ai/en/latest/", None),
344344
"lightning_habana": ("https://lightning-ai.github.io/lightning-Habana/", None),
345345
"tensorboardX": ("https://tensorboardx.readthedocs.io/en/stable/", None),
346-
# needed for referencing App from lightning scope
347-
"lightning.app": ("https://lightning.ai/docs/app/stable/", None),
348346
# needed for referencing Fabric from lightning scope
349347
"lightning.fabric": ("https://lightning.ai/docs/fabric/stable/", None),
350348
# TODO: these are missing objects.inv
@@ -626,4 +624,5 @@ def package_list_from_file(file):
626624
"https://stackoverflow.com/questions/66640705/how-can-i-install-grpcio-on-an-apple-m1-silicon-laptop",
627625
"https://github.com/Lightning-AI/lightning/blob/master/examples/pytorch/ipu/mnist_sample.py",
628626
"https://ngc.nvidia.com/catalog/containers/nvidia:nemo", # in ecosystem/asr_nlp_tts.rst
627+
"https://openai.com/",
629628
]

requirements/app/app.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
lightning-cloud == 0.5.68 # Must be pinned to ensure compatibility
1+
lightning-cloud == 0.5.69 # Must be pinned to ensure compatibility
22
packaging
33
typing-extensions >=4.4.0, <4.10.0
44
deepdiff >=5.7.0, <6.6.0

src/lightning/app/core/app.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
from lightning.app.api.request_types import _APIRequest, _CommandRequest, _DeltaRequest
3131
from lightning.app.core.constants import (
3232
BATCH_DELTA_COUNT,
33+
CHECK_ERROR_QUEUE_INTERVAL,
3334
DEBUG_ENABLED,
3435
FLOW_DURATION_SAMPLES,
3536
FLOW_DURATION_THRESHOLD,
@@ -165,6 +166,7 @@ def __init__(
165166

166167
self._last_run_time: float = 0.0
167168
self._run_times: list = []
169+
self._last_check_error_queue: float = 0.0
168170

169171
# Path attributes can't get properly attached during the initialization, because the full name
170172
# is only available after all Flows and Works have been instantiated.
@@ -318,10 +320,12 @@ def batch_get_state_changed_from_queue(q: BaseQueue, timeout: Optional[float] =
318320
return []
319321

320322
def check_error_queue(self) -> None:
321-
exception: Exception = self.get_state_changed_from_queue(self.error_queue) # type: ignore[assignment,arg-type]
322-
if isinstance(exception, Exception):
323-
self.exception = exception
324-
self.stage = AppStage.FAILED
323+
if (time() - self._last_check_error_queue) > CHECK_ERROR_QUEUE_INTERVAL:
324+
exception: Exception = self.get_state_changed_from_queue(self.error_queue) # type: ignore[assignment,arg-type]
325+
if isinstance(exception, Exception):
326+
self.exception = exception
327+
self.stage = AppStage.FAILED
328+
self._last_check_error_queue = time()
325329

326330
@property
327331
def flows(self) -> List[Union[LightningWork, "LightningFlow"]]:

src/lightning/app/core/constants.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ def get_lightning_cloud_url() -> str:
7070
LIGHTNING_COMPONENT_PUBLIC_REGISTRY = "https://lightning.ai/v1/components"
7171
LIGHTNING_APPS_PUBLIC_REGISTRY = "https://lightning.ai/v1/apps"
7272
LIGHTNING_MODELS_PUBLIC_REGISTRY = "https://lightning.ai/v1/models"
73+
ENABLE_ORCHESTRATOR = bool(int(os.getenv("ENABLE_ORCHESTRATOR", "1")))
7374

7475
LIGHTNING_CLOUDSPACE_HOST = os.getenv("LIGHTNING_CLOUDSPACE_HOST")
7576
LIGHTNING_CLOUDSPACE_EXPOSED_PORT_COUNT = int(os.getenv("LIGHTNING_CLOUDSPACE_EXPOSED_PORT_COUNT", "0"))
@@ -99,6 +100,7 @@ def get_lightning_cloud_url() -> str:
99100
SYS_CUSTOMIZATIONS_SYNC_PATH = ".sys-customizations-sync"
100101

101102
BATCH_DELTA_COUNT = int(os.getenv("BATCH_DELTA_COUNT", "128"))
103+
CHECK_ERROR_QUEUE_INTERVAL = float(os.getenv("CHECK_ERROR_QUEUE_INTERVAL", "30"))
102104

103105

104106
def enable_multiple_works_in_default_container() -> bool:

src/lightning/app/runners/cloud.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@
3434
CloudspaceIdRunsBody,
3535
Externalv1LightningappInstance,
3636
Gridv1ImageSpec,
37-
IdGetBody1,
37+
IdGetBody,
3838
ProjectIdCloudspacesBody,
3939
V1BuildSpec,
4040
V1CloudSpace,
@@ -1027,7 +1027,7 @@ def _api_create_run_instance(
10271027
project_id=project_id,
10281028
cloudspace_id=cloudspace_id,
10291029
id=run_id,
1030-
body=IdGetBody1(
1030+
body=IdGetBody(
10311031
cluster_id=cluster_id,
10321032
name=run_name,
10331033
desired_state=desired_state,

src/lightning/app/runners/multiprocess.py

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -81,16 +81,17 @@ def dispatch(self, *args: Any, open_ui: bool = True, **kwargs: Any):
8181

8282
_set_flow_context()
8383

84-
storage_orchestrator = StorageOrchestrator(
85-
self.app,
86-
self.app.request_queues,
87-
self.app.response_queues,
88-
self.app.copy_request_queues,
89-
self.app.copy_response_queues,
90-
)
91-
self.threads.append(storage_orchestrator)
92-
storage_orchestrator.setDaemon(True)
93-
storage_orchestrator.start()
84+
if constants.ENABLE_ORCHESTRATOR:
85+
storage_orchestrator = StorageOrchestrator(
86+
self.app,
87+
self.app.request_queues,
88+
self.app.response_queues,
89+
self.app.copy_request_queues,
90+
self.app.copy_response_queues,
91+
)
92+
self.threads.append(storage_orchestrator)
93+
storage_orchestrator.setDaemon(True)
94+
storage_orchestrator.start()
9495

9596
if self.start_server:
9697
self.app.should_publish_changes_to_api = True

src/lightning/app/utilities/network.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -96,10 +96,14 @@ def create_retry_strategy():
9696
# are going to be alive for a very long time (~ 4 days) but retries every 120 seconds
9797
total=_CONNECTION_RETRY_TOTAL,
9898
backoff_factor=_CONNECTION_RETRY_BACKOFF_FACTOR,
99+
# Any 4xx and 5xx statuses except
100+
# 400 Bad Request
101+
# 401 Unauthorized
102+
# 403 Forbidden
103+
# 404 Not Found
99104
status_forcelist={
100-
408, # Request Timeout
101-
429, # Too Many Requests
102-
*range(500, 600), # Any 5xx Server Error status
105+
402,
106+
*range(405, 600),
103107
},
104108
allowed_methods={
105109
"POST", # Default methods are idempotent, add POST here

src/lightning/fabric/CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,13 @@ All notable changes to this project will be documented in this file.
55
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
66

77

8+
## [2.2.5] - 2024-05-23
9+
10+
### Fixed
11+
12+
- Fixed a matrix shape mismatch issue when running a model loaded from a quantized checkpoint (bitsandbytes) ([#19886](https://github.com/Lightning-AI/lightning/pull/19886))
13+
14+
815
## [2.2.2] - 2024-04-11
916

1017
### Fixed

src/lightning/fabric/plugins/precision/bitsandbytes.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -234,9 +234,9 @@ def quantize_(self, weight: Optional[torch.Tensor] = None, device: Optional[torc
234234
"""Inplace quantize."""
235235
if weight is None:
236236
weight = self.weight.data
237-
if weight.data.type == torch.int8:
238-
# already quantized
239-
return
237+
if weight.data.dtype == torch.int8:
238+
# already quantized
239+
return
240240
assert isinstance(self.weight, bnb.nn.Int8Params)
241241
self.weight = self.quantize(self.weight, weight, device)
242242

@@ -318,9 +318,9 @@ def quantize_(self, weight: Optional[torch.Tensor] = None, device: Optional[torc
318318
"""Inplace quantize."""
319319
if weight is None:
320320
weight = self.weight.data
321-
if weight.data.type == torch.uint8:
322-
# already quantized
323-
return
321+
if weight.data.dtype == torch.uint8:
322+
# already quantized
323+
return
324324
assert isinstance(self.weight, bnb.nn.Params4bit)
325325
self.weight = self.quantize(self.weight, weight, device)
326326

src/lightning/pytorch/CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,13 @@ All notable changes to this project will be documented in this file.
55
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
66

77

8+
## [2.2.5] - 2024-05-23
9+
10+
### Fixed
11+
12+
- Fixed a matrix shape mismatch issue when running a model loaded from a quantized checkpoint (bitsandbytes) ([#19886](https://github.com/Lightning-AI/lightning/pull/19886))
13+
14+
815
## [2.2.3] - 2024-04-23
916

1017
### Fixed

src/version.info

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
2.2.4
1+
2.2.5

tests/tests_app/core/test_lightning_app.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1187,3 +1187,27 @@ def run(self):
11871187
def test_lightning_work_stopped():
11881188
app = LightningApp(SimpleWork2())
11891189
MultiProcessRuntime(app, start_server=False).dispatch()
1190+
1191+
1192+
class FailedWork(LightningWork):
1193+
def run(self):
1194+
raise Exception
1195+
1196+
1197+
class CheckErrorQueueLightningApp(LightningApp):
1198+
def check_error_queue(self):
1199+
super().check_error_queue()
1200+
1201+
1202+
def test_error_queue_check(monkeypatch):
1203+
import sys
1204+
1205+
from lightning.app.core import app as app_module
1206+
1207+
sys_mock = mock.MagicMock()
1208+
monkeypatch.setattr(app_module, "CHECK_ERROR_QUEUE_INTERVAL", 0)
1209+
monkeypatch.setattr(sys, "exit", sys_mock)
1210+
app = LightningApp(FailedWork())
1211+
MultiProcessRuntime(app, start_server=False).dispatch()
1212+
assert app.stage == AppStage.FAILED
1213+
assert app._last_check_error_queue != 0.0

tests/tests_app/runners/test_cloud.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
Externalv1Cluster,
2525
Externalv1LightningappInstance,
2626
Gridv1ImageSpec,
27-
IdGetBody1,
27+
IdGetBody,
2828
ProjectIdProjectclustersbindingsBody,
2929
V1BuildSpec,
3030
V1CloudSpace,
@@ -508,7 +508,7 @@ def test_basic_auth_enabled(self, tmpdir, monkeypatch):
508508
project_id="test-project-id",
509509
cloudspace_id=mock.ANY,
510510
id=mock.ANY,
511-
body=IdGetBody1(
511+
body=IdGetBody(
512512
desired_state=mock.ANY,
513513
name=mock.ANY,
514514
env=mock.ANY,
@@ -712,7 +712,7 @@ def test_call_with_queue_server_type_specified(self, tmpdir, lightningapps, monk
712712
cloud_runtime.dispatch()
713713

714714
# calling with no env variable set
715-
body = IdGetBody1(
715+
body = IdGetBody(
716716
desired_state=V1LightningappInstanceState.STOPPED,
717717
env=[],
718718
name=mock.ANY,
@@ -727,7 +727,7 @@ def test_call_with_queue_server_type_specified(self, tmpdir, lightningapps, monk
727727
monkeypatch.setitem(os.environ, "LIGHTNING_CLOUD_QUEUE_TYPE", "http")
728728
cloud_runtime.backend.client.reset_mock()
729729
cloud_runtime.dispatch()
730-
body = IdGetBody1(
730+
body = IdGetBody(
731731
desired_state=V1LightningappInstanceState.STOPPED,
732732
env=mock.ANY,
733733
name=mock.ANY,
@@ -998,7 +998,7 @@ def test_call_with_work_app_and_app_comment_command_execution_set(self, lightnin
998998
project_id="test-project-id",
999999
cloudspace_id=mock.ANY,
10001000
id=mock.ANY,
1001-
body=IdGetBody1(
1001+
body=IdGetBody(
10021002
desired_state=V1LightningappInstanceState.STOPPED,
10031003
name=mock.ANY,
10041004
env=[V1EnvVar(name="ENABLE_APP_COMMENT_COMMAND_EXECUTION", value="1")],

tests/tests_app/utilities/test_network.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,8 @@ def test_find_free_network_port_cloudspace(_, patch_constants):
4949
def test_http_client_retry_post(getconn_mock):
5050
getconn_mock.return_value.getresponse.side_effect = [
5151
mock.Mock(status=500, msg=HTTPMessage()),
52-
mock.Mock(status=429, msg=HTTPMessage()),
52+
mock.Mock(status=599, msg=HTTPMessage()),
53+
mock.Mock(status=405, msg=HTTPMessage()),
5354
mock.Mock(status=200, msg=HTTPMessage()),
5455
]
5556

@@ -61,14 +62,16 @@ def test_http_client_retry_post(getconn_mock):
6162
mock.call("POST", "/test", body=None, headers=mock.ANY),
6263
mock.call("POST", "/test", body=None, headers=mock.ANY),
6364
mock.call("POST", "/test", body=None, headers=mock.ANY),
65+
mock.call("POST", "/test", body=None, headers=mock.ANY),
6466
]
6567

6668

6769
@mock.patch("urllib3.connectionpool.HTTPConnectionPool._get_conn")
6870
def test_http_client_retry_get(getconn_mock):
6971
getconn_mock.return_value.getresponse.side_effect = [
7072
mock.Mock(status=500, msg=HTTPMessage()),
71-
mock.Mock(status=429, msg=HTTPMessage()),
73+
mock.Mock(status=599, msg=HTTPMessage()),
74+
mock.Mock(status=405, msg=HTTPMessage()),
7275
mock.Mock(status=200, msg=HTTPMessage()),
7376
]
7477

@@ -80,4 +83,5 @@ def test_http_client_retry_get(getconn_mock):
8083
mock.call("GET", "/test", body=None, headers=mock.ANY),
8184
mock.call("GET", "/test", body=None, headers=mock.ANY),
8285
mock.call("GET", "/test", body=None, headers=mock.ANY),
86+
mock.call("GET", "/test", body=None, headers=mock.ANY),
8387
]

tests/tests_fabric/plugins/precision/test_bitsandbytes.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,3 +230,37 @@ def __init__(self):
230230
assert not keys.missing_keys
231231
assert model.l.weight.device.type == "cuda"
232232
assert model.l.weight.dtype == expected
233+
234+
235+
@RunIf(min_cuda_gpus=1, min_torch="2.1")
236+
@pytest.mark.skipif(not _BITSANDBYTES_AVAILABLE, reason="bitsandbytes unavailable")
237+
def test_load_quantized_checkpoint(tmp_path):
238+
"""Test that a checkpoint saved from a quantized model can be loaded back into a quantized model."""
239+
240+
class Model(torch.nn.Module):
241+
def __init__(self):
242+
super().__init__()
243+
self.linear = torch.nn.Linear(16, 16, bias=False)
244+
245+
def forward(self, x):
246+
return self.linear(x)
247+
248+
fabric = Fabric(accelerator="cuda", devices=1, plugins=BitsandbytesPrecision("nf4-dq"))
249+
model = Model()
250+
model = fabric.setup(model)
251+
model(torch.randn(2, 16, device=fabric.device))
252+
state_dict = model.state_dict()
253+
# The checkpoint contains quantized weights
254+
assert state_dict["linear.weight"].dtype == torch.uint8
255+
assert state_dict["linear.weight"].shape == (128, 1)
256+
torch.save(state_dict, tmp_path / "checkpoint.pt")
257+
258+
fabric = Fabric(accelerator="cuda", devices=1, plugins=BitsandbytesPrecision("nf4-dq"))
259+
model = Model()
260+
model = fabric.setup(model)
261+
state_dict = torch.load(tmp_path / "checkpoint.pt")
262+
model.load_state_dict(state_dict)
263+
assert model.linear.weight.dtype == torch.uint8
264+
assert model.linear.weight.shape == (128, 1)
265+
# Shapes match during forward (weight is being dequantized during forward)
266+
model(torch.randn(2, 16, device=fabric.device))

0 commit comments

Comments
 (0)