Skip to content

Fix Checkpoint in Hyperparameter Tuning #2782

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 15 commits into from
Apr 29, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 50 additions & 34 deletions beginner_source/hyperparameter_tuning_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@
"""
from functools import partial
import os
import tempfile
from pathlib import Path
import torch
import torch.nn as nn
import torch.nn.functional as F
Expand All @@ -57,14 +59,13 @@
sys.stdout.fileno = lambda: 0
# sphinx_gallery_end_ignore
from ray import tune
from ray.air import Checkpoint, session
from ray import train
from ray.train import Checkpoint, get_checkpoint
from ray.tune.schedulers import ASHAScheduler

# TODO: Migrate to ray.train.Checkpoint and remove following line
os.environ["RAY_AIR_NEW_PERSISTENCE_MODE"]="0"
import ray.cloudpickle as pickle

######################################################################
# Most of the imports are needed for building the PyTorch model. Only the last three
# Most of the imports are needed for building the PyTorch model. Only the last
# imports are for Ray Tune.
#
# Data loaders
Expand Down Expand Up @@ -135,13 +136,15 @@ def forward(self, x):
#
# net = Net(config["l1"], config["l2"])
#
# checkpoint = session.get_checkpoint()
#
# checkpoint = get_checkpoint()
# if checkpoint:
# checkpoint_state = checkpoint.to_dict()
# start_epoch = checkpoint_state["epoch"]
# net.load_state_dict(checkpoint_state["net_state_dict"])
# optimizer.load_state_dict(checkpoint_state["optimizer_state_dict"])
# with checkpoint.as_directory() as checkpoint_dir:
# data_path = Path(checkpoint_dir) / "data.pkl"
# with open(data_path, "rb") as fp:
# checkpoint_state = pickle.load(fp)
# start_epoch = checkpoint_state["epoch"]
# net.load_state_dict(checkpoint_state["net_state_dict"])
# optimizer.load_state_dict(checkpoint_state["optimizer_state_dict"])
# else:
# start_epoch = 0
#
Expand Down Expand Up @@ -197,12 +200,16 @@ def forward(self, x):
# "net_state_dict": net.state_dict(),
# "optimizer_state_dict": optimizer.state_dict(),
# }
# checkpoint = Checkpoint.from_dict(checkpoint_data)
# with tempfile.TemporaryDirectory() as checkpoint_dir:
# data_path = Path(checkpoint_dir) / "data.pkl"
# with open(data_path, "wb") as fp:
# pickle.dump(checkpoint_data, fp)
#
# session.report(
# {"loss": val_loss / val_steps, "accuracy": correct / total},
# checkpoint=checkpoint,
# )
# checkpoint = Checkpoint.from_directory(checkpoint_dir)
# train.report(
# {"loss": val_loss / val_steps, "accuracy": correct / total},
# checkpoint=checkpoint,
# )
#
# Here we first save a checkpoint and then report some metrics back to Ray Tune. Specifically,
# we send the validation loss and accuracy back to Ray Tune. Ray Tune can then use these metrics
Expand Down Expand Up @@ -236,13 +243,15 @@ def train_cifar(config, data_dir=None):
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9)

checkpoint = session.get_checkpoint()

checkpoint = get_checkpoint()
if checkpoint:
checkpoint_state = checkpoint.to_dict()
start_epoch = checkpoint_state["epoch"]
net.load_state_dict(checkpoint_state["net_state_dict"])
optimizer.load_state_dict(checkpoint_state["optimizer_state_dict"])
with checkpoint.as_directory() as checkpoint_dir:
data_path = Path(checkpoint_dir) / "data.pkl"
with open(data_path, "rb") as fp:
checkpoint_state = pickle.load(fp)
start_epoch = checkpoint_state["epoch"]
net.load_state_dict(checkpoint_state["net_state_dict"])
optimizer.load_state_dict(checkpoint_state["optimizer_state_dict"])
else:
start_epoch = 0

Expand Down Expand Up @@ -311,12 +320,17 @@ def train_cifar(config, data_dir=None):
"net_state_dict": net.state_dict(),
"optimizer_state_dict": optimizer.state_dict(),
}
checkpoint = Checkpoint.from_dict(checkpoint_data)

session.report(
{"loss": val_loss / val_steps, "accuracy": correct / total},
checkpoint=checkpoint,
)
with tempfile.TemporaryDirectory() as checkpoint_dir:
data_path = Path(checkpoint_dir) / "data.pkl"
with open(data_path, "wb") as fp:
pickle.dump(checkpoint_data, fp)

checkpoint = Checkpoint.from_directory(checkpoint_dir)
train.report(
{"loss": val_loss / val_steps, "accuracy": correct / total},
checkpoint=checkpoint,
)

print("Finished Training")


Expand Down Expand Up @@ -449,13 +463,15 @@ def main(num_samples=10, max_num_epochs=10, gpus_per_trial=2):
best_trained_model = nn.DataParallel(best_trained_model)
best_trained_model.to(device)

best_checkpoint = best_trial.checkpoint.to_air_checkpoint()
best_checkpoint_data = best_checkpoint.to_dict()

best_trained_model.load_state_dict(best_checkpoint_data["net_state_dict"])
best_checkpoint = result.get_best_checkpoint(trial=best_trial, metric="accuracy", mode="max")
with best_checkpoint.as_directory() as checkpoint_dir:
data_path = Path(checkpoint_dir) / "data.pkl"
with open(data_path, "rb") as fp:
best_checkpoint_data = pickle.load(fp)

test_acc = test_accuracy(best_trained_model, device)
print("Best trial test set accuracy: {}".format(test_acc))
best_trained_model.load_state_dict(best_checkpoint_data["net_state_dict"])
test_acc = test_accuracy(best_trained_model, device)
print("Best trial test set accuracy: {}".format(test_acc))


if __name__ == "__main__":
Expand Down