Commit 6b8b138

Integ tests for local mode model trainer (#1623)
1 parent 0875efc commit 6b8b138

14 files changed (+548 lines, −15 lines)


src/sagemaker/modules/constants.py

Lines changed: 2 additions & 2 deletions
@@ -16,8 +16,8 @@
 
 DEFAULT_INSTANCE_TYPE = "ml.m5.xlarge"
 
-SM_CODE = "sm_code"
-SM_CODE_CONTAINER_PATH = "/opt/ml/input/data/sm_code"
+SM_CODE = "code"
+SM_CODE_CONTAINER_PATH = "/opt/ml/input/data/code"
 
 SM_DRIVERS = "sm_drivers"
 SM_DRIVERS_CONTAINER_PATH = "/opt/ml/input/data/sm_drivers"
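
The rename works because of a general SageMaker convention: each training input channel is mounted in the container at /opt/ml/input/data/<channel_name>, so renaming the code channel from "sm_code" to "code" moves the mount accordingly. A minimal sketch of that relationship (not part of the commit; the f-string form is illustrative only):

SM_CODE = "code"
SM_CODE_CONTAINER_PATH = f"/opt/ml/input/data/{SM_CODE}"  # resolves to "/opt/ml/input/data/code"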

src/sagemaker/modules/local_core/local_container.py

Lines changed: 6 additions & 3 deletions
@@ -32,9 +32,8 @@
 )
 from sagemaker.local.utils import check_for_studio, recursive_copy
 from sagemaker.model import DIR_PARAM_NAME
-from sagemaker.modules import logger
+from sagemaker.modules import logger, Session
 from sagemaker.modules.configs import Channel
-from sagemaker.session import Session
 from sagemaker.utils import ECR_URI_PATTERN, create_tar_file, _module_import_error, download_folder
 from sagemaker_core.main.utils import Unassigned
 from sagemaker_core.shapes import DataSource
@@ -105,13 +104,17 @@ class _LocalContainer(BaseModel):
     input_data_config: Optional[List[Channel]]
     environment: Optional[Dict[str, str]]
     hyper_parameters: Optional[Dict[str, str]]
-    sagemaker_session: Optional[Session]
+    sagemaker_session: Optional[Session] = None
     container_entrypoint: Optional[List[str]]
     container_arguments: Optional[List[str]]
 
     def model_post_init(self, __context: Any):
         """Post init method to perform custom validation and set default values."""
         self.hosts = [f"algo-{i}" for i in range(1, self.instance_count + 1)]
+        if self.environment is None:
+            self.environment = {}
+        if self.hyper_parameters is None:
+            self.hyper_parameters = {}
 
         for channel in self.input_data_config:
             if channel.data_source and channel.data_source.s3_data_source != Unassigned():
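
The None-to-empty-dict defaulting added above uses the standard pydantic v2 model_post_init hook. A standalone sketch of the same pattern, with an illustrative class name rather than the actual _LocalContainer:

from typing import Any, Dict, Optional

from pydantic import BaseModel


class ContainerConfig(BaseModel):
    # Optional mappings default to None so callers can omit them entirely.
    environment: Optional[Dict[str, str]] = None
    hyper_parameters: Optional[Dict[str, str]] = None

    def model_post_init(self, __context: Any):
        # Normalize None to empty dicts so later code can read and update them safely.
        if self.environment is None:
            self.environment = {}
        if self.hyper_parameters is None:
            self.hyper_parameters = {}


config = ContainerConfig()
assert config.environment == {} and config.hyper_parameters == {}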

src/sagemaker/modules/train/container_drivers/utils.py

Lines changed: 1 addition & 1 deletion
@@ -36,7 +36,7 @@
 TrainingJob - {training_job_name}
 """
 
-USER_CODE_PATH = "/opt/ml/input/data/sm_code"
+USER_CODE_PATH = "/opt/ml/input/data/code"
 SOURCE_CODE_JSON = "/opt/ml/input/data/sm_drivers/sourcecode.json"
 DISTRIBUTED_JSON = "/opt/ml/input/data/sm_drivers/distributed.json"
 
src/sagemaker/modules/train/model_trainer.py

Lines changed: 1 addition & 1 deletion
@@ -535,7 +535,7 @@ def train(
             shutil.copytree(SM_DRIVERS_LOCAL_PATH, drivers_dir.name, dirs_exist_ok=True)
 
             # If source code is provided, create a channel for the source code
-            # The source code will be mounted at /opt/ml/input/data/sm_code in the container
+            # The source code will be mounted at /opt/ml/input/data/code in the container
             if self.source_code.source_dir:
                 source_code_channel = self.create_input_data_channel(
                     channel_name=SM_CODE,
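
For reference, the source-code channel that train() builds here is conceptually a Channel named "code" backed by an S3 data source. A rough illustration using the Channel and DataSource shapes this code base already imports, plus an assumed S3DataSource shape and a hypothetical S3 URI (the real object comes from create_input_data_channel, not from this snippet):

from sagemaker.modules.configs import Channel
from sagemaker_core.shapes import DataSource, S3DataSource

code_channel = Channel(
    channel_name="code",  # SM_CODE -> mounted at /opt/ml/input/data/code
    data_source=DataSource(
        s3_data_source=S3DataSource(
            s3_data_type="S3Prefix",
            s3_uri="s3://example-bucket/example-job/code/",  # hypothetical URI
            s3_data_distribution_type="FullyReplicated",
        )
    ),
)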
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Lines changed: 147 additions & 0 deletions
@@ -0,0 +1,147 @@
# flake8: noqa
import argparse
import numpy as np
import os
import sys
import logging
import json
import shutil
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from pytorch_model_def import get_model


logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler(sys.stdout))
current_dir = os.path.dirname(os.path.abspath(__file__))
data_dir = "/opt/ml/input/data"


def get_train_data(train_dir):
    """
    Get the training data and convert to tensors
    """

    x_train = np.load(os.path.join(train_dir, "x_train.npy"))
    y_train = np.load(os.path.join(train_dir, "y_train.npy"))
    logger.info(f"x train: {x_train.shape}, y train: {y_train.shape}")

    return torch.from_numpy(x_train), torch.from_numpy(y_train)


def get_test_data(test_dir):
    """
    Get the testing data and convert to tensors
    """

    x_test = np.load(os.path.join(test_dir, "x_test.npy"))
    y_test = np.load(os.path.join(test_dir, "y_test.npy"))
    logger.info(f"x test: {x_test.shape}, y test: {y_test.shape}")

    return torch.from_numpy(x_test), torch.from_numpy(y_test)


def model_fn(model_dir):
    """
    Load the model for inference
    """

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = get_model()
    model.load_state_dict(torch.load(model_dir + "/model.pth"))
    model.eval()
    return model.to(device)


def input_fn(request_body, request_content_type):
    """
    Deserialize and prepare the prediction input
    """

    if request_content_type == "application/json":
        request = json.loads(request_body)
        train_inputs = torch.tensor(request)
        return train_inputs


def predict_fn(input_data, model):
    """
    Apply model to the incoming request
    """

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    with torch.no_grad():
        return model(input_data.float()).numpy()[0]


def train():
    """
    Train the PyTorch model
    """
    # Directories: train, test and model
    train_dir = os.path.join(data_dir, "train")
    test_dir = os.path.join(data_dir, "test")
    model_dir = os.environ.get("SM_MODEL_DIR", os.path.join(current_dir, "data/model"))

    # Load the training and testing data
    x_train, y_train = get_train_data(train_dir)
    x_test, y_test = get_test_data(test_dir)
    train_ds = TensorDataset(x_train, y_train)

    # Training parameters - used to configure the training loop
    batch_size = 64
    epochs = 1
    learning_rate = 0.1
    logger.info(
        "batch_size = {}, epochs = {}, learning rate = {}".format(batch_size, epochs, learning_rate)
    )

    train_dl = DataLoader(train_ds, batch_size, shuffle=True)

    # Define the model, loss function and optimizer
    model = get_model()
    model = model.to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    # Train the model
    for epoch in range(epochs):
        for x_train_batch, y_train_batch in train_dl:
            y = model(x_train_batch.float())
            loss = criterion(y.flatten(), y_train_batch.float())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        epoch += 1
        logger.info(f"epoch: {epoch} -> loss: {loss}")

    # Test the model
    with torch.no_grad():
        y = model(x_test.float()).flatten()
        mse = ((y - y_test) ** 2).sum() / y_test.shape[0]
        print("\nTest MSE:", mse.numpy())

    # Save the model
    os.makedirs(model_dir, exist_ok=True)
    torch.save(model.state_dict(), model_dir + "/model.pth")
    inference_code_path = model_dir + "/code/"

    if not os.path.exists(inference_code_path):
        os.mkdir(inference_code_path)
        logger.info("Created a folder at {}!".format(inference_code_path))

    shutil.copy("local_training_script.py", inference_code_path)
    shutil.copy("pytorch_model_def.py", inference_code_path)
    logger.info("Saving models files to {}".format(inference_code_path))


if __name__ == "__main__":
    print("Running the training job ...\n")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train()
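
This new script is the training entry point the local-mode integ test exercises. A hypothetical sketch of how it might be launched with the local-mode ModelTrainer follows; the class and parameter names reflect my understanding of the sagemaker.modules ModelTrainer API and should be treated as assumptions, and the paths and image URI are placeholders, not the actual test values:

from sagemaker.modules.train import ModelTrainer
from sagemaker.modules.train.model_trainer import Mode
from sagemaker.modules.configs import SourceCode, InputData

# Hypothetical location for the script above and its .npy fixtures.
script_dir = "tests/data/modules/local_script_mode"

source_code = SourceCode(
    source_dir=script_dir,
    entry_script="local_training_script.py",
)

trainer = ModelTrainer(
    training_image="763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:2.3.0-cpu-py311",  # hypothetical DLC image
    source_code=source_code,
    training_mode=Mode.LOCAL_CONTAINER,  # run in a local Docker container instead of a SageMaker job
)

trainer.train(
    input_data_config=[
        InputData(channel_name="train", data_source=f"{script_dir}/data/train"),
        InputData(channel_name="test", data_source=f"{script_dir}/data/test"),
    ],
    wait=True,
)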
Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
# flake8: noqa
import torch
import torch.nn as nn


class NeuralNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(8, 8)
        self.fc2 = nn.Linear(8, 6)
        self.fc3 = nn.Linear(6, 1)

    def forward(self, x):
        x = torch.tanh(self.fc1(x))
        x = torch.sigmoid(self.fc2(x))
        x = self.fc3(x)
        return x


def get_model():

    model = NeuralNet()
    return model
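
The train and test channels that local_training_script.py reads must contain x_*.npy/y_*.npy arrays with 8 features per row to match NeuralNet's first layer. A hypothetical helper for generating such fixtures; the function name, sample counts, and synthetic target are illustrative only and not part of the commit:

import os
import numpy as np


def generate_fixtures(out_dir: str, n_train: int = 256, n_test: int = 64) -> None:
    rng = np.random.default_rng(0)
    for split, n in (("train", n_train), ("test", n_test)):
        split_dir = os.path.join(out_dir, split)
        os.makedirs(split_dir, exist_ok=True)
        x = rng.normal(size=(n, 8)).astype(np.float32)  # 8 features, matching nn.Linear(8, 8)
        y = x.sum(axis=1).astype(np.float32)  # simple synthetic regression target
        np.save(os.path.join(split_dir, f"x_{split}.npy"), x)
        np.save(os.path.join(split_dir, f"y_{split}.npy"), y)


generate_fixtures("data")  # writes data/train/x_train.npy, data/train/y_train.npy, etc.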

tests/data/modules/script_mode/custom_script.py

Lines changed: 2 additions & 3 deletions
@@ -132,9 +132,8 @@ def train():
         os.mkdir(inference_code_path)
         logger.info("Created a folder at {}!".format(inference_code_path))
 
-    code_dir = os.environ.get("SM_CHANNEL_CODE", current_dir)
-    shutil.copy(os.path.join(code_dir, "custom_script.py"), inference_code_path)
-    shutil.copy(os.path.join(code_dir, "pytorch_model_def.py"), inference_code_path)
+    shutil.copy("custom_script.py", inference_code_path)
+    shutil.copy("pytorch_model_def.py", inference_code_path)
     logger.info("Saving models files to {}".format(inference_code_path))
 
 

0 commit comments
