Commit 1c851b8

Borda and rohitgr7 authored
fixing miss-leading tested acc values (#5876)
* fixing tested values
* .
* tests
* yapf
* softmax
* hvd
* rename
* lr
* duplicate
* drop
* classif
* rm EvalModel
* Revert "rm EvalModel"

  This reverts commit 6c3fb39.

* update tests
* fix
* azure
* azure
* self
* cpu
* Apply suggestions from code review

Co-authored-by: rohitgr7 <[email protected]>
1 parent ebabe56 commit 1c851b8

15 files changed: +207 -167 lines changed

tests/accelerators/ddp_model.py

21 additions & 16 deletions

@@ -20,7 +20,8 @@
 import torch
 
 from pytorch_lightning import seed_everything, Trainer
-from tests.base import EvalModelTemplate
+from tests.helpers.datamodules import ClassifDataModule
+from tests.helpers.simple_models import ClassificationModel
 
 
 def main():
@@ -35,24 +36,28 @@ def main():
     parser.set_defaults(accelerator="ddp")
     args = parser.parse_args()
 
-    model = EvalModelTemplate()
+    dm = ClassifDataModule()
+    model = ClassificationModel()
     trainer = Trainer.from_argparse_args(args)
 
-    result = {}
     if args.trainer_method == 'fit':
-        trainer.fit(model)
-        result = {'status': 'complete', 'method': args.trainer_method, 'result': None}
-    if args.trainer_method == 'test':
-        result = trainer.test(model)
-        result = {'status': 'complete', 'method': args.trainer_method, 'result': result}
-    if args.trainer_method == 'fit_test':
-        trainer.fit(model)
-        result = trainer.test(model)
-        result = {'status': 'complete', 'method': args.trainer_method, 'result': result}
-
-    if len(result) > 0:
-        file_path = os.path.join(args.tmpdir, 'ddp.result')
-        torch.save(result, file_path)
+        trainer.fit(model, datamodule=dm)
+        result = None
+    elif args.trainer_method == 'test':
+        result = trainer.test(model, datamodule=dm)
+    elif args.trainer_method == 'fit_test':
+        trainer.fit(model, datamodule=dm)
+        result = trainer.test(model, datamodule=dm)
+    else:
+        raise ValueError(f'Unsupported: {args.trainer_method}')
+
+    result_ext = {
+        'status': 'complete',
+        'method': args.trainer_method,
+        'result': result,
+    }
+    file_path = os.path.join(args.tmpdir, 'ddp.result')
+    torch.save(result_ext, file_path)
 
 
 if __name__ == '__main__':
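
For context, the script above writes a small dict to ddp.result with torch.save; the DDP tests below read it back in the parent process. A minimal sketch of that consumer side (mirroring tests/accelerators/test_ddp.py; load_ddp_result is a hypothetical helper name, not from the repo):

import os

import torch


def load_ddp_result(tmpdir):
    # the child script saved {'status', 'method', 'result'} at this path
    result_path = os.path.join(tmpdir, 'ddp.result')
    result = torch.load(result_path)

    # every successful run records its status and the Trainer method used
    assert result['status'] == 'complete'

    # 'result' is None for a plain fit run, otherwise the list of metric
    # dicts returned by trainer.test(...)
    return result['result']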

tests/accelerators/test_ddp.py

9 additions & 16 deletions

@@ -23,14 +23,13 @@
 from tests.helpers.boring_model import BoringModel
 from tests.utilities.distributed import call_training_script
 
+CLI_ARGS = '--max_epochs 1 --gpus 2 --accelerator ddp'
+
 
-@pytest.mark.parametrize('cli_args', [
-    pytest.param('--max_epochs 1 --gpus 2 --accelerator ddp'),
-])
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
-def test_multi_gpu_model_ddp_fit_only(tmpdir, cli_args):
+def test_multi_gpu_model_ddp_fit_only(tmpdir):
     # call the script
-    std, err = call_training_script(ddp_model, cli_args, 'fit', tmpdir, timeout=120)
+    call_training_script(ddp_model, CLI_ARGS, 'fit', tmpdir, timeout=120)
 
     # load the results of the script
     result_path = os.path.join(tmpdir, 'ddp.result')
@@ -40,13 +39,10 @@ def test_multi_gpu_model_ddp_fit_only(tmpdir, cli_args):
     assert result['status'] == 'complete'
 
 
-@pytest.mark.parametrize('cli_args', [
-    pytest.param('--max_epochs 1 --gpus 2 --accelerator ddp'),
-])
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
-def test_multi_gpu_model_ddp_test_only(tmpdir, cli_args):
+def test_multi_gpu_model_ddp_test_only(tmpdir):
     # call the script
-    call_training_script(ddp_model, cli_args, 'test', tmpdir)
+    call_training_script(ddp_model, CLI_ARGS, 'test', tmpdir)
 
     # load the results of the script
     result_path = os.path.join(tmpdir, 'ddp.result')
@@ -56,13 +52,10 @@ def test_multi_gpu_model_ddp_test_only(tmpdir, cli_args):
     assert result['status'] == 'complete'
 
 
-@pytest.mark.parametrize('cli_args', [
-    pytest.param('--max_epochs 1 --gpus 2 --accelerator ddp'),
-])
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
-def test_multi_gpu_model_ddp_fit_test(tmpdir, cli_args):
+def test_multi_gpu_model_ddp_fit_test(tmpdir):
     # call the script
-    call_training_script(ddp_model, cli_args, 'fit_test', tmpdir, timeout=20)
+    call_training_script(ddp_model, CLI_ARGS, 'fit_test', tmpdir, timeout=20)
 
     # load the results of the script
     result_path = os.path.join(tmpdir, 'ddp.result')
@@ -73,7 +66,7 @@ def test_multi_gpu_model_ddp_fit_test(tmpdir, cli_args):
 
     model_outs = result['result']
     for out in model_outs:
-        assert out['test_acc'] > 0.90
+        assert out['test_acc'] > 0.7
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
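
The single-value @pytest.mark.parametrize added no extra coverage, so the CLI string becomes the module-level CLI_ARGS constant, and the fit_test assertion is loosened from 0.90 to 0.7 for the new ClassificationModel. For reference, trainer.test(...) returns one dict of logged metrics per test dataloader, which is what the assertion iterates; a toy illustration with made-up numbers:

# hypothetical return value of trainer.test(...): one dict per test dataloader
model_outs = [{'test_acc': 0.74, 'test_loss': 0.61}]

for out in model_outs:
    assert out['test_acc'] > 0.7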

tests/accelerators/test_ddp_spawn.py

9 additions & 6 deletions

@@ -20,7 +20,9 @@
 from pytorch_lightning.core import memory
 from pytorch_lightning.trainer import Trainer
 from pytorch_lightning.trainer.states import TrainerState
-from tests.base import EvalModelTemplate
+from tests.helpers import BoringModel
+from tests.helpers.datamodules import ClassifDataModule
+from tests.helpers.simple_models import ClassificationModel
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
@@ -29,16 +31,17 @@ def test_multi_gpu_early_stop_ddp_spawn(tmpdir):
 
     trainer_options = dict(
         default_root_dir=tmpdir,
-        callbacks=[EarlyStopping()],
+        callbacks=[EarlyStopping(monitor='train_acc')],
         max_epochs=50,
         limit_train_batches=10,
         limit_val_batches=10,
         gpus=[0, 1],
         accelerator='ddp_spawn',
     )
 
-    model = EvalModelTemplate()
-    tpipes.run_model_test(trainer_options, model)
+    dm = ClassifDataModule()
+    model = ClassificationModel()
+    tpipes.run_model_test(trainer_options, model, dm)
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
@@ -55,7 +58,7 @@ def test_multi_gpu_model_ddp_spawn(tmpdir):
         progress_bar_refresh_rate=0,
     )
 
-    model = EvalModelTemplate()
+    model = BoringModel()
 
     tpipes.run_model_test(trainer_options, model)
 
@@ -68,7 +71,7 @@ def test_ddp_all_dataloaders_passed_to_fit(tmpdir):
     """Make sure DDP works with dataloaders passed to fit()"""
     tutils.set_random_master_port()
 
-    model = EvalModelTemplate()
+    model = BoringModel()
     fit_options = dict(train_dataloader=model.train_dataloader(), val_dataloaders=model.val_dataloader())
 
     trainer = Trainer(
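
EarlyStopping() without arguments relied on the old default monitor; with ClassificationModel the callback now explicitly watches 'train_acc', which the model is expected to log. A minimal sketch of that contract, using a hypothetical TinyClassifier (not from the repo):

import torch
import torch.nn.functional as F
from pytorch_lightning import LightningModule
from pytorch_lightning.callbacks import EarlyStopping


class TinyClassifier(LightningModule):

    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 3)

    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self.layer(x)
        loss = F.cross_entropy(logits, y)
        acc = (logits.argmax(dim=1) == y).float().mean()
        self.log('train_acc', acc)  # the key that EarlyStopping monitors
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-3)


# accuracy should be maximized, hence mode='max' in this sketch
early_stop = EarlyStopping(monitor='train_acc', mode='max', patience=3)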

tests/accelerators/test_dp.py

46 additions & 14 deletions

@@ -11,36 +11,69 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import os
+from unittest import mock
+
 import pytest
 import torch
+import torch.nn.functional as F
 
 import pytorch_lightning as pl
 import tests.helpers.pipelines as tpipes
 import tests.helpers.utils as tutils
 from pytorch_lightning.callbacks import EarlyStopping
 from pytorch_lightning.core import memory
-from tests.base import EvalModelTemplate
+from tests.helpers import BoringModel
+from tests.helpers.datamodules import ClassifDataModule
+from tests.helpers.simple_models import ClassificationModel
 
 PRETEND_N_OF_GPUS = 16
 
 
+class CustomClassificationModelDP(ClassificationModel):
+
+    def _step(self, batch, batch_idx):
+        x, y = batch
+        logits = self(x)
+        return {'logits': logits, 'y': y}
+
+    def training_step(self, batch, batch_idx):
+        out = self._step(batch, batch_idx)
+        loss = F.cross_entropy(out['logits'], out['y'])
+        return loss
+
+    def validation_step(self, batch, batch_idx):
+        return self._step(batch, batch_idx)
+
+    def test_step(self, batch, batch_idx):
+        return self._step(batch, batch_idx)
+
+    def validation_step_end(self, outputs):
+        self.log('val_acc', self.valid_acc(outputs['logits'], outputs['y']))
+
+    def test_step_end(self, outputs):
+        self.log('test_acc', self.test_acc(outputs['logits'], outputs['y']))
+
+
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
 def test_multi_gpu_early_stop_dp(tmpdir):
     """Make sure DDP works. with early stopping"""
     tutils.set_random_master_port()
 
+    dm = ClassifDataModule()
+    model = CustomClassificationModelDP()
+
     trainer_options = dict(
         default_root_dir=tmpdir,
-        callbacks=[EarlyStopping()],
+        callbacks=[EarlyStopping(monitor='val_acc')],
         max_epochs=50,
         limit_train_batches=10,
         limit_val_batches=10,
         gpus=[0, 1],
         accelerator='dp',
     )
 
-    model = EvalModelTemplate()
-    tpipes.run_model_test(trainer_options, model)
+    tpipes.run_model_test(trainer_options, model, dm)
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
@@ -57,22 +90,21 @@ def test_multi_gpu_model_dp(tmpdir):
         progress_bar_refresh_rate=0,
     )
 
-    model = EvalModelTemplate()
+    model = BoringModel()
 
     tpipes.run_model_test(trainer_options, model)
 
     # test memory helper functions
     memory.get_memory_profile('min_max')
 
 
+@mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"})
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
 def test_dp_test(tmpdir):
     tutils.set_random_master_port()
 
-    import os
-    os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
-
-    model = EvalModelTemplate()
+    dm = ClassifDataModule()
+    model = CustomClassificationModelDP()
     trainer = pl.Trainer(
         default_root_dir=tmpdir,
         max_epochs=2,
@@ -81,17 +113,17 @@ def test_dp_test(tmpdir):
         gpus=[0, 1],
         accelerator='dp',
     )
-    trainer.fit(model)
+    trainer.fit(model, datamodule=dm)
     assert 'ckpt' in trainer.checkpoint_callback.best_model_path
-    results = trainer.test()
+    results = trainer.test(datamodule=dm)
     assert 'test_acc' in results[0]
 
-    old_weights = model.c_d1.weight.clone().detach().cpu()
+    old_weights = model.layer_0.weight.clone().detach().cpu()
 
-    results = trainer.test(model)
+    results = trainer.test(model, datamodule=dm)
     assert 'test_acc' in results[0]
 
     # make sure weights didn't change
-    new_weights = model.c_d1.weight.clone().detach().cpu()
+    new_weights = model.layer_0.weight.clone().detach().cpu()
 
     assert torch.all(torch.eq(old_weights, new_weights))
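
The CustomClassificationModelDP above follows the DataParallel pattern: with accelerator='dp', each *_step sees only its GPU's shard of the batch, and Lightning gathers the returned dicts before calling *_step_end, so batch-level metrics are computed there. A stripped-down sketch of the same pattern (hypothetical DPPatternModel, plain tensor accuracy instead of the metric objects used above):

import torch
from pytorch_lightning import LightningModule


class DPPatternModel(LightningModule):

    def __init__(self, in_features=32, num_classes=3):
        super().__init__()
        self.layer = torch.nn.Linear(in_features, num_classes)

    def forward(self, x):
        return self.layer(x)

    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)  # runs on one GPU's shard of the batch
        return {'logits': logits, 'y': y}

    def validation_step_end(self, outputs):
        # 'outputs' holds the shards gathered from all GPUs
        preds = outputs['logits'].argmax(dim=1)
        acc = (preds == outputs['y']).float().mean()
        self.log('val_acc', acc)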

tests/base/model_template.py

1 addition & 1 deletion

@@ -111,7 +111,7 @@ def forward(self, x):
         x = self.c_d1_drop(x)
 
         x = self.c_d2(x)
-        logits = F.log_softmax(x, dim=1)
+        logits = F.softmax(x, dim=1)
 
         return logits
 
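
On the one-line change above: F.softmax returns probabilities while F.log_softmax returns their logarithms. Argmax predictions are identical because log is monotonic, but any code that treats the template's output as probabilities in [0, 1] sees very different numbers. A small self-contained check (not from the repo):

import torch
import torch.nn.functional as F

x = torch.tensor([[1.0, 2.0, 0.5]])

probs = F.softmax(x, dim=1)          # values in [0, 1], each row sums to 1
log_probs = F.log_softmax(x, dim=1)  # log of the above, values <= 0

# the predicted class is identical because log() is monotonic ...
assert torch.equal(probs.argmax(dim=1), log_probs.argmax(dim=1))
# ... but only probs behaves like a probability distribution
assert torch.allclose(probs.sum(dim=1), torch.tensor([1.0]))
assert torch.allclose(log_probs, probs.log())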

tests/core/test_datamodules.py

1 addition & 2 deletions

@@ -385,9 +385,8 @@ def _step(self, batch, batch_idx):
         return {'logits': logits, 'y': y}
 
     def training_step(self, batch, batch_idx):
-        _, y = batch
         out = self._step(batch, batch_idx)
-        loss = F.cross_entropy(out['logits'], y)
+        loss = F.cross_entropy(out['logits'], out['y'])
         return loss
 
     def validation_step(self, batch, batch_idx):
