Release v1.2.2 [full merge, no squash] #6304

Merged 11 commits on Mar 5, 2021

13 changes: 6 additions & 7 deletions .github/workflows/ci_test-conda.yml
@@ -30,13 +30,6 @@ jobs:
pip install --requirement requirements/devel.txt --upgrade-strategy only-if-needed
pip list

- name: Cache datasets
# todo this probably does not work with docker images, rather cache dockers
uses: actions/cache@v2
with:
path: Datasets
key: pl-dataset

- name: Pull checkpoints from S3
# todo: consider adding some caching, but ATM all models are less than 100KB
run: |
@@ -46,6 +39,12 @@ jobs:
unzip -o checkpoints.zip
ls -l checkpoints/

# todo: requires a proper fix in the docker image
- name: Hotfix dependency
run: |
pip install torchtext==0.6.0 -U
shell: bash

- name: Tests
run: |
# NOTE: running coverage on tests does not propagate failure status for Win, https://github.com/nedbat/coveragepy/issues/1003
13 changes: 12 additions & 1 deletion .github/workflows/ci_test-full.yml
@@ -112,6 +112,12 @@ jobs:
pip list
shell: bash

# todo: requires a proper fix in the docker image
- name: Hotfix dependency
run: |
pip install torchtext==0.6.0 -U
shell: bash

- name: Reinstall Horovod if necessary
if: runner.os != 'windows'
env:
@@ -135,7 +141,12 @@
- name: Tests
run: |
# NOTE: do not include coverage report here, see: https://github.com/nedbat/coveragepy/issues/1003
coverage run --source pytorch_lightning -m pytest pytorch_lightning tests pl_examples -v --durations=50 --junitxml=junit/test-results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}.xml
coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --durations=50 --junitxml=junit/test-results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}.xml

# todo: put this back once TorchVision can download datasets
#- name: Examples
# run: |
# python -m pytest pl_examples -v --durations=10

- name: Upload pytest test results
uses: actions/upload-artifact@v2
21 changes: 21 additions & 0 deletions CHANGELOG.md
@@ -5,6 +5,27 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).


## [1.2.2] - 2021-03-02

### Added

- Added `checkpoint` parameter to callback's `on_save_checkpoint` hook ([#6072](https://github.com/PyTorchLightning/pytorch-lightning/pull/6072))

### Changed

- Changed the order of `backward`, `step`, `zero_grad` to `zero_grad`, `backward`, `step` ([#6147](https://github.com/PyTorchLightning/pytorch-lightning/pull/6147))
- Changed default for DeepSpeed CPU Offload to False, due to prohibitively slow speeds at smaller scale ([#6262](https://github.com/PyTorchLightning/pytorch-lightning/pull/6262))

### Fixed

- Fixed epoch level schedulers not being called when `val_check_interval < 1.0` ([#6075](https://github.com/PyTorchLightning/pytorch-lightning/pull/6075))
- Fixed multiple early stopping callbacks ([#6197](https://github.com/PyTorchLightning/pytorch-lightning/pull/6197))
- Fixed incorrect usage of `detach()`, `cpu()`, `to()` ([#6216](https://github.com/PyTorchLightning/pytorch-lightning/pull/6216))
- Fixed LBFGS optimizer support which didn't converge in automatic optimization ([#6147](https://github.com/PyTorchLightning/pytorch-lightning/pull/6147))
- Prevent `WandbLogger` from dropping values ([#5931](https://github.com/PyTorchLightning/pytorch-lightning/pull/5931))
- Fixed error thrown when using valid distributed mode in multi node ([#6297](https://github.com/PyTorchLightning/pytorch-lightning/pull/6297))

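For context, the reordering from [#6147](https://github.com/PyTorchLightning/pytorch-lightning/pull/6147) means one iteration of automatic optimization now follows the conventional PyTorch pattern; an illustrative sketch, not library source:

    optimizer.zero_grad()                         # clear stale gradients first
    loss = model.training_step(batch, batch_idx)  # forward pass
    loss.backward()                               # accumulate fresh gradients
    optimizer.step()                              # update parameters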

## [1.2.1] - 2021-02-23

### Fixed
34 changes: 21 additions & 13 deletions azure-pipelines.yml
@@ -23,7 +23,7 @@ jobs:
# how much time to give 'run always even if cancelled tasks' before stopping them
cancelTimeoutInMinutes: 2

pool: dsvm-spot-pool
pool: gridai-spot-pool

#strategy:
# matrix:
@@ -58,25 +58,31 @@ jobs:
export GIT_TERMINAL_PROMPT=1
#sudo apt-get install -y cmake
# python -m pip install "pip==20.1"
pip install --requirement requirements.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html
pip install --requirement requirements.txt
python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if 'fairscale' not in line] ; open(fname, 'w').writelines(lines)"
python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)"
pip install --requirement ./requirements/devel.txt --upgrade-strategy only-if-needed
pip install git+https://$(AUTH_TOKEN)@github.com/PyTorchLightning/[email protected] --no-cache-dir
pip list
displayName: 'Install dependencies'

- script: |
- bash: |
python tests/collect_env_details.py
python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu >= 2, f'GPU: {mgpu}'"
displayName: 'Env details'

# todo: requires a proper fix in the docker image
- bash: |
pip install torchtext==0.7 -U
displayName: 'HotFix'

- bash: |
wget https://pl-public-data.s3.amazonaws.com/legacy/checkpoints.zip -P legacy/
unzip -o legacy/checkpoints.zip -d legacy/
ls -l legacy/checkpoints/
displayName: 'Get legacy checkpoints'

- script: |
- bash: |
python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --durations=50
displayName: 'Testing: standard'

@@ -90,12 +96,14 @@ jobs:
codecov --token=$(CODECOV_TOKEN) --flags=gpu,pytest --name="GPU-coverage" --env=linux,azure
displayName: 'Statistics'

- script: |
python -m pytest benchmarks pl_examples -v --maxfail=2 --durations=0
displayName: 'Testing: extended'

- script: |
python setup.py install --user --quiet
bash pl_examples/run_ddp-example.sh
pip uninstall -y pytorch-lightning
displayName: 'Examples'
- bash: |
python -m pytest benchmarks -v --maxfail=2 --durations=0
displayName: 'Testing: benchmarks'

# todo: put this back once TorchVision can download datasets
#- bash: |
# python -m pytest pl_examples -v --maxfail=2 --durations=0
# python setup.py install --user --quiet
# bash pl_examples/run_ddp-example.sh
# pip uninstall -y pytorch-lightning
# displayName: 'Examples'
8 changes: 4 additions & 4 deletions docs/source/common/lightning_module.rst
@@ -946,7 +946,7 @@ When set to ``False``, Lightning does not automate the optimization process.
opt = self.optimizers(use_pl_optimizer=True)

loss = ...
self.manual_backward(loss, opt)
self.manual_backward(loss)
opt.step()
opt.zero_grad()

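To set it to ``False``, override the ``automatic_optimization`` property, as the optimizers guide describes; a minimal sketch, with the class name illustrative:

.. code-block:: python

    class MyModule(LightningModule):
        @property
        def automatic_optimization(self) -> bool:
            # hand zero_grad/backward/step control over to training_step
            return False
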
@@ -961,16 +961,16 @@ In the multi-optimizer case, ignore the ``optimizer_idx`` argument

def training_step(self, batch, batch_idx, optimizer_idx):
# access your optimizers with use_pl_optimizer=False. Default is True
(opt_a, opt_b) = self.optimizers(use_pl_optimizer=True)
opt_a, opt_b = self.optimizers(use_pl_optimizer=True)

gen_loss = ...
opt_a.zero_grad()
self.manual_backward(gen_loss, opt_a)
self.manual_backward(gen_loss)
opt_a.step()

disc_loss = ...
opt_b.zero_grad()
self.manual_backward(disc_loss, opt_b)
self.manual_backward(disc_loss)
opt_b.step()

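Note that ``self.optimizers(use_pl_optimizer=True)`` returns ``LightningOptimizer`` wrappers. A small sketch of reaching the raw optimizer underneath, at the cost of Lightning handling precision and accelerators for you:

.. code-block:: python

    opt_a, opt_b = self.optimizers(use_pl_optimizer=True)
    raw_opt_a = opt_a.optimizer  # the underlying torch.optim optimizer
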
--------------
69 changes: 39 additions & 30 deletions docs/source/common/optimizers.rst
@@ -23,32 +23,31 @@ to manually manage the optimization process. To do so, do the following:

* Override your LightningModule ``automatic_optimization`` property to return ``False``
* Drop or ignore the optimizer_idx argument
* Use `self.manual_backward(loss)` instead of `loss.backward()`.
* Use ``self.manual_backward(loss)`` instead of ``loss.backward()``.

.. note:: This is only recommended for experts who need ultimate flexibility. Lightning will handle only precision and accelerators logic. The users are left with zero_grad, accumulated_grad_batches, model toggling, etc..
.. note:: This is only recommended for experts who need ultimate flexibility. Lightning will handle only precision and accelerator logic. The users are left with ``optimizer.zero_grad()``, gradient accumulation, model toggling, etc.

.. warning:: Before 1.2, ``optimzer.step`` was calling ``zero_grad`` internally. From 1.2, it is left to the users expertize.
.. warning:: Before 1.2, ``optimizer.step()`` was calling ``optimizer.zero_grad()`` internally. From 1.2, it is left to the user's expertise.

.. tip:: To perform ``accumulate_grad_batches`` with one optimizer, you can do the following.

.. tip:: ``self.optimizers()`` will return ``LightningOptimizer`` objects. You can access your own optimizer with ``optimizer.optimizer``. However, if you use your own optimizer to perform a step, Lightning won't be able to support accelerators and precision for you.


.. code-block:: python

def training_step(self, batch, batch_idx, optimizer_idx):
opt = self.optimizers()

loss = self.compute_loss(batch)
self.manual_backward(loss)
opt.step()

# accumulate gradient batches
if batch_idx % 2 == 0:
opt.step()
opt.zero_grad()


.. tip:: It is a good practice to provide the optimizer with a ``closure`` function that performs a ``forward`` and ``backward`` pass of your model. It is optional for most optimizers, but makes your code compatible if you switch to an optimizer which requires a closure.
.. tip:: It is a good practice to provide the optimizer with a ``closure`` function that performs a ``forward`` and ``backward`` pass of your model. It is optional for most optimizers, but makes your code compatible if you switch to an optimizer which requires a closure. See also `the PyTorch docs <https://pytorch.org/docs/stable/optim.html#optimizer-step-closure>`_.

Here is the same example as above using a ``closure``.

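A minimal sketch of that pattern, assuming the same ``compute_loss`` helper as above:

.. code-block:: python

    def training_step(self, batch, batch_idx, optimizer_idx):
        opt = self.optimizers()

        def closure():
            # run forward and backward inside the closure
            opt.zero_grad()
            loss = self.compute_loss(batch)
            self.manual_backward(loss)
            return loss

        # optimizers such as LBFGS may re-evaluate the closure within step()
        opt.step(closure=closure)
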
@@ -71,7 +70,6 @@
.. code-block:: python

# Scenario for a GAN.

def training_step(...):
opt_gen, opt_dis = self.optimizers()

@@ -137,8 +135,12 @@ Here is an example on how to use it:

Automatic optimization
======================
With Lightning most users don't have to think about when to call .backward(), .step(), .zero_grad(), since
Lightning automates that for you.
With Lightning most users don't have to think about when to call ``.zero_grad()``, ``.backward()`` and ``.step()``
since Lightning automates that for you.

.. warning::
Before 1.2.2, ``.zero_grad()`` was called after ``.backward()`` and ``.step()`` internally.
From 1.2.2, Lightning calls ``.zero_grad()`` before ``.backward()``.

Under the hood Lightning does the following:

@@ -147,33 +149,33 @@
for epoch in epochs:
for batch in data:
loss = model.training_step(batch, batch_idx, ...)
optimizer.zero_grad()
loss.backward()
optimizer.step()
optimizer.zero_grad()

for scheduler in schedulers:
scheduler.step()
for lr_scheduler in lr_schedulers:
lr_scheduler.step()

In the case of multiple optimizers, Lightning does the following:

.. code-block:: python

for epoch in epochs:
for batch in data:
for opt in optimizers:
disable_grads_for_other_optimizers()
train_step(opt)
opt.step()
for batch in data:
for opt in optimizers:
loss = model.training_step(batch, batch_idx, optimizer_idx)
opt.zero_grad()
loss.backward()
opt.step()

for scheduler in schedulers:
scheduler.step()
for lr_scheduler in lr_schedulers:
lr_scheduler.step()


Learning rate scheduling
------------------------
Every optimizer you use can be paired with any `LearningRateScheduler <https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate>`_.
In the basic use-case, the scheduler (or multiple schedulers) should be returned as the second output from the ``.configure_optimizers``
method:
Every optimizer you use can be paired with any `Learning Rate Scheduler <https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate>`_.
In the basic use-case, the scheduler (or multiple schedulers) should be returned as the second output from the ``.configure_optimizers`` method:

.. testcode::

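    # Illustrative sketch (assumes ``torch`` is imported): return the
    # scheduler(s) as the second output of ``configure_optimizers``
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=30)
        return [optimizer], [lr_scheduler]
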
Expand Down Expand Up @@ -262,7 +264,7 @@ returned as a dict which can contain the following keywords:

Use multiple optimizers (like GANs)
-----------------------------------
To use multiple optimizers return > 1 optimizers from :meth:`pytorch_lightning.core.LightningModule.configure_optimizers`
To use multiple optimizers return two or more optimizers from :meth:`pytorch_lightning.core.LightningModule.configure_optimizers`

.. testcode::
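
    # Illustrative sketch (assumes ``torch`` is imported and the module
    # defines ``self.generator`` and ``self.discriminator``)
    def configure_optimizers(self):
        opt_gen = torch.optim.Adam(self.generator.parameters(), lr=2e-4)
        opt_dis = torch.optim.Adam(self.discriminator.parameters(), lr=2e-4)
        return opt_gen, opt_dis
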

Expand All @@ -283,13 +285,15 @@ Lightning will call each optimizer sequentially:
.. code-block:: python

for epoch in epochs:
for batch in data:
for opt in optimizers:
train_step(opt)
opt.step()
for batch in data:
for opt in optimizers:
loss = train_step(batch, batch_idx, optimizer_idx)
opt.zero_grad()
loss.backward()
opt.step()

for scheduler in schedulers:
scheduler.step()
for lr_scheduler in lr_schedulers:
lr_scheduler.step()

----------

Expand Down Expand Up @@ -334,7 +338,7 @@ Here we add a learning-rate warm up
# update params
optimizer.step(closure=closure)

.. note:: The default ``optimizer_step`` is relying on the internal ``LightningOptimizer`` to properly perform a step. It handles TPUs, AMP, accumulate_grad_batches, zero_grad, and much more ...
.. note:: The default ``optimizer_step`` relies on the internal ``LightningOptimizer`` to properly perform a step. It handles TPUs, AMP, ``accumulate_grad_batches`` and much more.

.. testcode::

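    # Illustrative warm-up sketch patterned on the example above;
    # ``self.hparams.learning_rate`` is an assumed hyperparameter
    def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx,
                       optimizer_closure, on_tpu, using_native_amp, using_lbfgs):
        # warm up the learning rate over the first 500 steps
        if self.trainer.global_step < 500:
            lr_scale = min(1.0, float(self.trainer.global_step + 1) / 500.0)
            for pg in optimizer.param_groups:
                pg["lr"] = lr_scale * self.hparams.learning_rate

        # update params; Lightning passes the closure that runs forward/backward
        optimizer.step(closure=optimizer_closure)
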
@@ -364,6 +368,11 @@ Using the closure functions for optimization

When using optimization schemes such as LBFGS, the ``second_order_closure`` needs to be enabled. By default, this function is defined by wrapping the ``training_step`` and the backward steps as follows:

.. warning::
Before 1.2.2, ``.zero_grad()`` was called outside the closure internally.
From 1.2.2, the closure calls ``.zero_grad()`` internally, so there is no need to define your own closure
when using optimizers similar to :class:`torch.optim.LBFGS`, which require re-evaluating the loss with the closure in ``optimizer.step()``.

.. testcode::

def second_order_closure(pl_module, split_batch, batch_idx, opt_idx, optimizer, hidden):
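    # Conceptually: run ``training_step`` on the split batch, call backward
    # on the returned loss, and return that loss so that
    # ``optimizer.step(closure=...)`` can re-evaluate it when needed
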
4 changes: 2 additions & 2 deletions docs/source/starter/introduction_guide.rst
@@ -361,9 +361,9 @@ The training step is what happens inside the training loop.
# TRAINING STEP
# ....
# TRAINING STEP
optimizer.zero_grad()
loss.backward()
optimizer.step()
optimizer.zero_grad()

In the case of MNIST, we do the following

Expand All @@ -377,9 +377,9 @@ In the case of MNIST, we do the following
loss = F.nll_loss(logits, y)
# ------ TRAINING STEP END ------

optimizer.zero_grad()
loss.backward()
optimizer.step()
optimizer.zero_grad()

In Lightning, everything that is in the training step gets organized under the
:func:`~pytorch_lightning.core.LightningModule.training_step` function in the LightningModule.
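
A minimal sketch of the same MNIST step organized this way (class name and imports are illustrative):

.. code-block:: python

    import torch.nn.functional as F
    from pytorch_lightning import LightningModule


    class LitMNIST(LightningModule):
        def training_step(self, batch, batch_idx):
            x, y = batch
            logits = self(x)  # forward pass defined by the module
            loss = F.nll_loss(logits, y)
            # Lightning runs zero_grad/backward/step for you
            return loss
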
21 changes: 10 additions & 11 deletions docs/source/starter/new-project.rst
@@ -248,7 +248,7 @@ as long as you return a loss with an attached graph from the `training_step`
.. code-block:: python

def training_step(self, batch, batch_idx):
loss = self.encoder(batch[0])
loss = self.encoder(batch)
return loss

.. _manual_opt:
Expand All @@ -267,19 +267,18 @@ Turn off automatic optimization and you control the train loop!

def training_step(self, batch, batch_idx, optimizer_idx):
# access your optimizers with use_pl_optimizer=False. Default is True
(opt_a, opt_b, opt_c) = self.optimizers(use_pl_optimizer=True)
opt_a, opt_b = self.optimizers(use_pl_optimizer=True)

loss_a = self.generator(batch[0])

# use this instead of loss.backward so we can automate half precision, etc...
self.manual_backward(loss_a, opt_a, retain_graph=True)
self.manual_backward(loss_a, opt_a)
opt_a.step()
loss_a = self.generator(batch)
opt_a.zero_grad()
# use `manual_backward()` instead of `loss.backward()` to automate half precision, etc.
self.manual_backward(loss_a)
opt_a.step()

loss_b = self.discriminator(batch[0])
self.manual_backward(loss_b, opt_b)
...
loss_b = self.discriminator(batch)
opt_b.zero_grad()
self.manual_backward(loss_b)
opt_b.step()


Predict or Deploy