Skip to content

Commit 4c51262

Browse files
author
Andre Moeller
committed
add chainer notebooks
1 parent 7e1c06a commit 4c51262

30 files changed

+3591
-0
lines changed

sagemaker-python-sdk/README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@ These examples focus on the Amazon SageMaker Python SDK which allows you to writ
77
- [CIFAR-10 with MXNet Gluon](mxnet_gluon_cifar10)
88
- [MNIST with MXNet Gluon](mxnet_gluon_mnist)
99
- [MNIST with MXNet](mxnet_mnist)
10+
- [CIFAR-10 with Chainer and ChainerMN](chainer_cifar10)
11+
- [Sentiment Analysis with Chainer](chainer_sentiment_analysis)
12+
- [MNIST with Chainer](chainer_mnist)
1013
- [Sentiment Analysis with MXNet Gluon](mxnet_gluon_sentiment)
1114
- [TensorFlow Neural Networks with Layers](tensorflow_abalone_age_predictor_using_layers)
1215
- [TensorFlow Networks with Keras](tensorflow_abalone_age_predictor_using_keras)

sagemaker-python-sdk/chainer_cifar10/chainer_single_machine_cifar10.ipynb

Lines changed: 457 additions & 0 deletions
Large diffs are not rendered by default.

sagemaker-python-sdk/chainer_cifar10/chainermn_distributed_cifar10.ipynb

Lines changed: 465 additions & 0 deletions
Large diffs are not rendered by default.
Loading
Loading
Loading
Loading
Loading
Loading
Loading
Loading
Loading
Loading
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License"). You
4+
# may not use this file except in compliance with the License. A copy of
5+
# the License is located at
6+
#
7+
# http://aws.amazon.com/apache2.0/
8+
#
9+
# or in the "license" file accompanying this file. This file is
10+
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
11+
# ANY KIND, either express or implied. See the License for the specific
12+
# language governing permissions and limitations under the License.
13+
14+
import boto3
15+
import tarfile
16+
from urllib.parse import urlparse
17+
import os
18+
19+
def retrieve_output_from_s3(s3_url, output_dir):
    """
    Downloads output artifacts from s3 and extracts them into the given directory.

    The archive is saved as ``output.tar.gz`` inside ``output_dir`` and then
    unpacked next to it; the downloaded archive itself is not removed.

    Args:
        s3_url: S3 URL to the output artifacts. The URL's network location is
            used as the bucket name and its path (without the leading slash)
            as the object key.
        output_dir: directory to write artifacts to. It is created if it does
            not already exist.
    """
    parsed_url = urlparse(s3_url)
    s3 = boto3.resource('s3')
    archive_path = os.path.join(output_dir, 'output.tar.gz')

    # exist_ok=True replaces the former try/except FileExistsError dance.
    os.makedirs(output_dir, exist_ok=True)

    s3.Bucket(parsed_url.netloc).download_file(parsed_url.path.lstrip('/'), archive_path)

    # The context manager closes the archive even when extraction fails.
    # NOTE(review): extractall() trusts the member paths stored in the archive;
    # only use this helper on artifacts produced by your own training jobs.
    with tarfile.open(archive_path) as archive:
        archive.extractall(output_dir)

sagemaker-python-sdk/chainer_cifar10/src/__init__.py

Whitespace-only changes.
Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License"). You
4+
# may not use this file except in compliance with the License. A copy of
5+
# the License is located at
6+
#
7+
# http://aws.amazon.com/apache2.0/
8+
#
9+
# or in the "license" file accompanying this file. This file is
10+
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
11+
# ANY KIND, either express or implied. See the License for the specific
12+
# language governing permissions and limitations under the License.
13+
14+
from __future__ import print_function, absolute_import
15+
16+
import argparse
17+
import os
18+
19+
import numpy as np
20+
21+
import chainer
22+
import chainer.functions as F
23+
import chainer.links as L
24+
import chainermn
25+
from chainer import initializers
26+
from chainer import serializers
27+
from chainer import training
28+
from chainer.training import extensions
29+
30+
import net
31+
32+
33+
if __name__ == '__main__':
    # Entry point of the distributed (ChainerMN) CIFAR-10 training script.

    num_gpus = int(os.environ['SM_NUM_GPUS'])

    parser = argparse.ArgumentParser()

    # retrieve the hyperparameters we set from the client (with some defaults)
    parser.add_argument('--epochs', type=int, default=30)
    parser.add_argument('--batch-size', type=int, default=256)
    parser.add_argument('--learning-rate', type=float, default=0.05)
    parser.add_argument('--communicator', type=str, default='pure_nccl' if num_gpus > 0 else 'naive')

    # Data, model, and output directories. The defaults are read from the
    # SM_* environment variables, which must be set (a missing one raises KeyError).
    parser.add_argument('--output-data-dir', type=str, default=os.environ['SM_OUTPUT_DATA_DIR'])
    parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
    parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAIN'])
    parser.add_argument('--test', type=str, default=os.environ['SM_CHANNEL_TEST'])

    # Unknown arguments are ignored rather than treated as errors.
    args, _ = parser.parse_known_args()

    # Open each .npz archive once and close it as soon as both arrays are read.
    with np.load(os.path.join(args.train, 'train.npz')) as train_npz:
        train_data = train_npz['data']
        train_labels = train_npz['labels']

    with np.load(os.path.join(args.test, 'test.npz')) as test_npz:
        test_data = test_npz['data']
        test_labels = test_npz['labels']

    train = chainer.datasets.TupleDataset(train_data, train_labels)
    test = chainer.datasets.TupleDataset(test_data, test_labels)

    # Set up a neural network to train.
    # Classifier reports softmax cross entropy loss and accuracy at every
    # iteration, which will be used by the PrintReport extension below.
    model = L.Classifier(net.VGG(10))

    comm = chainermn.create_communicator(args.communicator)

    # comm.inter_rank gives the rank of the node. This should only print on one node.
    if comm.inter_rank == 0:
        print('# Minibatch-size: {}'.format(args.batch_size))
        print('# epoch: {}'.format(args.epochs))
        print('# communicator: {}'.format(args.communicator))

    # comm.intra_rank gives the rank of the process on a given node.
    # -1 selects the CPU when no GPU is available.
    device = comm.intra_rank if num_gpus > 0 else -1
    if device >= 0:
        chainer.cuda.get_device_from_id(device).use()

    optimizer = chainermn.create_multi_node_optimizer(chainer.optimizers.MomentumSGD(args.learning_rate), comm)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(5e-4))

    num_loaders = 2
    train_iter = chainer.iterators.MultiprocessIterator(train, args.batch_size, n_processes=num_loaders)
    test_iter = chainer.iterators.MultiprocessIterator(test, args.batch_size, repeat=False, n_processes=num_loaders)

    # Set up a trainer
    updater = training.StandardUpdater(train_iter, optimizer, device=device)
    trainer = training.Trainer(updater, (args.epochs, 'epoch'), out=args.output_data_dir)

    # Evaluate the model with the test dataset for each epoch
    evaluator = extensions.Evaluator(test_iter, model, device=device)
    evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
    trainer.extend(evaluator)

    # Reduce the learning rate by half every 25 epochs.
    trainer.extend(extensions.ExponentialShift('lr', 0.5), trigger=(25, 'epoch'))

    # Dump a computational graph from 'loss' variable at the first iteration
    # The "main" refers to the target link of the "main" optimizer.
    # (Registered once; a second identical registration was redundant.)
    trainer.extend(extensions.dump_graph('main/loss'))

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport())

    # NOTE(review): the original indentation of this section was lost; the
    # plotting and printing extensions are assumed to be limited to rank 0 so
    # that only one process reports progress -- confirm against the repository.
    if comm.rank == 0:
        if extensions.PlotReport.available():
            trainer.extend(
                extensions.PlotReport(['main/loss', 'validation/main/loss'],
                                      'epoch', file_name='loss.png'))
            trainer.extend(
                extensions.PlotReport(
                    ['main/accuracy', 'validation/main/accuracy'],
                    'epoch', file_name='accuracy.png'))

        trainer.extend(extensions.PrintReport(
            ['epoch', 'main/loss', 'validation/main/loss',
             'main/accuracy', 'validation/main/accuracy', 'elapsed_time']))

    # Run the training
    trainer.run()

    # Save the model (only on one host).
    if comm.rank == 0:
        serializers.save_npz(os.path.join(args.model_dir, 'model.npz'), model)
133+
134+
135+
136+
def model_fn(model_dir):
    """
    Load the trained network from `model_dir` for hosting.

    This function is called by the Chainer container during hosting when running on SageMaker with
    values populated by the hosting environment. It reads the `model.npz` file that the training
    code above writes into the model directory.

    Calling it also sets `chainer.config.train` to False.

    Args:
        model_dir (str): path to the directory containing the saved model artifacts

    Returns:
        the `predictor` of the loaded classifier, i.e. the VGG network with the trained weights

    For more on `model_fn`, please visit the sagemaker-python-sdk repository:
    https://github.com/aws/sagemaker-python-sdk

    For more on the Chainer container, please visit the sagemaker-chainer-containers repository:
    https://github.com/aws/sagemaker-chainer-containers
    """
    chainer.config.train = False

    # The weights were serialized from an L.Classifier wrapper, so the same
    # wrapper is rebuilt before loading and only its predictor is handed back.
    classifier = L.Classifier(net.VGG(10))
    weights_path = os.path.join(model_dir, 'model.npz')
    serializers.load_npz(weights_path, classifier)

    return classifier.predictor
Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License"). You
4+
# may not use this file except in compliance with the License. A copy of
5+
# the License is located at
6+
#
7+
# http://aws.amazon.com/apache2.0/
8+
#
9+
# or in the "license" file accompanying this file. This file is
10+
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
11+
# ANY KIND, either express or implied. See the License for the specific
12+
# language governing permissions and limitations under the License.
13+
14+
from __future__ import print_function, absolute_import
15+
16+
import argparse
17+
import os
18+
19+
import numpy as np
20+
21+
import chainer
22+
import chainer.functions as F
23+
import chainer.links as L
24+
from chainer import training
25+
from chainer import serializers
26+
from chainer.training import extensions
27+
28+
import net
29+
30+
if __name__ == '__main__':
    # Entry point of the single-machine CIFAR-10 training script.

    parser = argparse.ArgumentParser()

    # retrieve the hyperparameters we set from the client (with some defaults)
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--batch-size', type=int, default=64)
    parser.add_argument('--learning-rate', type=float, default=0.05)

    # Data, model, and output directories. The defaults are read from the
    # SM_* environment variables, which must be set (a missing one raises KeyError).
    parser.add_argument('--output-data-dir', type=str, default=os.environ['SM_OUTPUT_DATA_DIR'])
    parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
    parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAIN'])
    parser.add_argument('--test', type=str, default=os.environ['SM_CHANNEL_TEST'])

    # Unknown arguments are ignored rather than treated as errors.
    args, _ = parser.parse_known_args()

    num_gpus = int(os.environ['SM_NUM_GPUS'])

    # Open each .npz archive once and close it as soon as both arrays are read.
    with np.load(os.path.join(args.train, 'train.npz')) as train_npz:
        train_data = train_npz['data']
        train_labels = train_npz['labels']

    with np.load(os.path.join(args.test, 'test.npz')) as test_npz:
        test_data = test_npz['data']
        test_labels = test_npz['labels']

    train = chainer.datasets.TupleDataset(train_data, train_labels)
    test = chainer.datasets.TupleDataset(test_data, test_labels)

    print('# Minibatch-size: {}'.format(args.batch_size))
    print('# epoch: {}'.format(args.epochs))
    print('# learning rate: {}'.format(args.learning_rate))

    # Set up a neural network to train.
    # Classifier reports softmax cross entropy loss and accuracy at every
    # iteration, which will be used by the PrintReport extension below.
    model = L.Classifier(net.VGG(10))

    optimizer = chainer.optimizers.MomentumSGD(args.learning_rate)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(5e-4))

    # Set up a trainer
    device = 0 if num_gpus > 0 else -1  # -1 indicates CPU, 0 indicates first GPU device.
    if num_gpus > 1:
        # One training iterator per GPU, each over a random split of the data.
        devices = range(num_gpus)
        train_iters = [
            chainer.iterators.MultiprocessIterator(i, args.batch_size, n_processes=4)
            for i in chainer.datasets.split_dataset_n_random(train, len(devices))
        ]
        test_iter = chainer.iterators.MultiprocessIterator(test, args.batch_size, repeat=False, n_processes=num_gpus)
        updater = training.updaters.MultiprocessParallelUpdater(train_iters, optimizer, devices=devices)
    else:
        train_iter = chainer.iterators.MultiprocessIterator(train, args.batch_size)
        test_iter = chainer.iterators.MultiprocessIterator(test, args.batch_size, repeat=False)
        updater = training.updater.StandardUpdater(train_iter, optimizer, device=device)

    stop_trigger = (args.epochs, 'epoch')
    trainer = training.Trainer(updater, stop_trigger, out=args.output_data_dir)
    # Evaluate the model with the test dataset for each epoch
    trainer.extend(extensions.Evaluator(test_iter, model, device=device))

    # Reduce the learning rate by half every 25 epochs.
    trainer.extend(extensions.ExponentialShift('lr', 0.5), trigger=(25, 'epoch'))

    # Dump a computational graph from 'loss' variable at the first iteration
    # The "main" refers to the target link of the "main" optimizer.
    trainer.extend(extensions.dump_graph('main/loss'))

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport())

    if extensions.PlotReport.available():
        trainer.extend(
            extensions.PlotReport(['main/loss', 'validation/main/loss'],
                                  'epoch', file_name='loss.png'))
        trainer.extend(
            extensions.PlotReport(
                ['main/accuracy', 'validation/main/accuracy'],
                'epoch', file_name='accuracy.png'))

    # Print selected entries of the log to stdout
    # Here "main" refers to the target link of the "main" optimizer again, and
    # "validation" refers to the default name of the Evaluator extension.
    # Entries other than 'epoch' are reported by the Classifier link, called by
    # either the updater or the evaluator.
    trainer.extend(extensions.PrintReport(
        ['epoch', 'main/loss', 'validation/main/loss',
         'main/accuracy', 'validation/main/accuracy', 'elapsed_time']))

    # Run the training
    trainer.run()

    # Save the model to model_dir. It's loaded below in `model_fn`.
    serializers.save_npz(os.path.join(args.model_dir, 'model.npz'), model)
122+
123+
124+
def model_fn(model_dir):
    """
    Load the trained network from `model_dir` for hosting.

    This function is called by the Chainer container during hosting when running on SageMaker with
    values populated by the hosting environment. It reads the `model.npz` file that the training
    code above writes into the model directory.

    Calling it also sets `chainer.config.train` to False.

    Args:
        model_dir (str): path to the directory containing the saved model artifacts

    Returns:
        the `predictor` of the loaded classifier, i.e. the VGG network with the trained weights

    For more on `model_fn`, please visit the sagemaker-python-sdk repository:
    https://github.com/aws/sagemaker-python-sdk

    For more on the Chainer container, please visit the sagemaker-chainer-containers repository:
    https://github.com/aws/sagemaker-chainer-containers
    """
    chainer.config.train = False

    # The weights were serialized from an L.Classifier wrapper, so the same
    # wrapper is rebuilt before loading and only its predictor is handed back.
    classifier = L.Classifier(net.VGG(10))
    weights_path = os.path.join(model_dir, 'model.npz')
    serializers.load_npz(weights_path, classifier)

    return classifier.predictor

0 commit comments

Comments
 (0)