Skip to content

Commit fdf817c

Browse files
authored
ZCC Integration Tests (aws#342)
* WIP * WIP * Integration tests for Estimator, LinearClassifier, Session, Keras. 1/4 passes. * Added --script-mode for debugging * Add test for keras * Fix writer.close() * Fixup PT integration test * Remove internal class import
1 parent ffd2211 commit fdf817c

File tree

13 files changed

+576
-4
lines changed

13 files changed

+576
-4
lines changed

tests/zero_code_change/pt_utils.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
from typing import Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
7+
8+
9+
def get_dataloaders() -> Tuple[torch.utils.data.DataLoader, torch.utils.data.DataLoader]:
    """Build the CIFAR-10 training and test data loaders.

    The dataset is stored under ``./data`` and is requested with
    ``download=True``. Every image is converted to a tensor and normalized
    with mean 0.5 and standard deviation 0.5 on each of its three channels.

    Returns:
        A ``(trainloader, testloader)`` pair. Both loaders use a batch size
        of 4 and two worker processes; only the training loader shuffles.
    """
    transform = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
    )

    trainset = torchvision.datasets.CIFAR10(
        root="./data", train=True, download=True, transform=transform
    )
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=4, shuffle=True, num_workers=2)

    testset = torchvision.datasets.CIFAR10(
        root="./data", train=False, download=True, transform=transform
    )
    testloader = torch.utils.data.DataLoader(testset, batch_size=4, shuffle=False, num_workers=2)

    return trainloader, testloader
26+
27+
28+
class Net(nn.Module):
    """Small convolutional classifier with two conv/pool stages and three linear layers.

    The network takes 3-channel images and produces 10 output values per
    sample. The flatten step assumes the second pooling stage yields
    16 x 5 x 5 features (presumably 32x32 inputs -- confirm against callers).
    """

    def __init__(self):
        """Create the convolution, pooling and fully connected layers."""
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
        self.fc1 = nn.Linear(in_features=16 * 5 * 5, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.fc3 = nn.Linear(in_features=84, out_features=10)

    def forward(self, x):
        """Run a batch through the network and return the raw output of ``fc3``."""
        features = x
        # Each convolution is followed by ReLU and the shared 2x2 max-pool.
        for conv in (self.conv1, self.conv2):
            features = self.pool(F.relu(conv(features)))

        # Collapse everything except the batch dimension before the linear layers.
        flat = features.view(-1, 16 * 5 * 5)
        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
"""
2+
WARNING: This must be run manually, with the custom PyTorch fork installed.
3+
Not used in CI/CD. May be useful for DLC testing.
4+
5+
We'll import a forked version of PyTorch, then run the CIFAR-10 tutorial at
6+
https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html.
7+
This should work without changing anything from the tutorial.
8+
Afterwards, we read from the directory and ensure that all the values are there.
9+
"""
10+
import argparse
11+
import torch
12+
import torch.nn as nn
13+
import torch.nn.functional as F
14+
import torch.optim as optim
15+
16+
import tornasole.pytorch as ts
17+
from tornasole.core.utils import SagemakerSimulator
18+
from pt_utils import get_dataloaders, Net
19+
20+
21+
def test_pytorch(script_mode: bool):
    """Train the tutorial network briefly and check that Tornasole saved tensors.

    Args:
        script_mode: When True, a ``TornasoleHook`` is created and registered
            on the model and loss by hand. When False, no hook is created
            here and the active hook is looked up after training.

    Raises:
        AssertionError: If no hook is available, if the "weights",
            "gradients" or "losses" collection is empty, or if a loss tensor
            is missing from the trial.
    """
    with SagemakerSimulator() as sim:
        trainloader, _ = get_dataloaders()
        net = Net()
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

        # Bind the name up front so the checks below never hit a NameError
        # when the hook is not created manually.
        hook = None
        if script_mode:
            hook = ts.TornasoleHook(out_dir=sim.out_dir)
            hook.register_hook(net)
            hook.register_loss(criterion)

        for epoch in range(1):  # loop over the dataset multiple times
            running_loss = 0.0
            for i, data in enumerate(trainloader, 0):
                # get the inputs; data is a list of [inputs, labels]
                inputs, labels = data

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward + backward + optimize
                outputs = net(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                # print statistics
                running_loss += loss.item()
                if i % 2000 == 1999:  # print every 2000 mini-batches
                    print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1, running_loss / 2000))
                    running_loss = 0.0
                    # A short run is enough for this test; stop here.
                    break

        print("Finished Training")

        from tornasole.trials import create_trial

        trial = create_trial(path=sim.out_dir)
        saved_tensors = trial.tensors()
        print(f"trial.available_steps() = {trial.available_steps()}")
        print(f"trial.tensors() = {saved_tensors}")

        if hook is None:
            # NOTE(review): assumes tornasole.pytorch exposes get_hook() the
            # same way tornasole.tensorflow does -- confirm.
            hook = ts.get_hook()
        assert hook is not None, "Hook was not created."

        print(f"collection_manager = {hook.collection_manager}")

        weights_tensors = hook.collection_manager.get("weights").tensor_names
        print(f"'weights' collection tensors = {weights_tensors}")
        assert len(weights_tensors) > 0

        gradients_tensors = hook.collection_manager.get("gradients").tensor_names
        print(f"'gradients' collection tensors = {gradients_tensors}")
        assert len(gradients_tensors) > 0

        losses_tensors = hook.collection_manager.get("losses").tensor_names
        print(f"'losses' collection tensors = {losses_tensors}")
        assert len(losses_tensors) > 0

        # Every tensor in the "losses" collection must have been written out.
        assert all(name in saved_tensors for name in losses_tensors)
84+
85+
86+
if __name__ == "__main__":
    # Command-line entry point: run the PyTorch test once, optionally with a
    # manually created hook.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--script-mode",
        action="store_true",
        help="Manually create hooks instead of relying on ZCC",
    )
    test_pytorch(script_mode=cli.parse_args().script_mode)
Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
"""
2+
WARNING: This must be run manually, with the custom TensorFlow fork installed.
3+
Not used in CI/CD. May be useful for DLC testing.
4+
5+
Be sure to run with Python2 (/usr/bin/python) and Python3.
6+
Run with and without the flag --script-mode.
7+
8+
Test with DNNClassifier and raw Estimator.
9+
Test with Session.
10+
Test with Keras.
11+
12+
Test with AdamOptimizer and SGD.
13+
14+
We check that certain tensors are saved.
15+
"""
16+
17+
import argparse
18+
import numpy as np
19+
import random
20+
import tensorflow as tf
21+
import tornasole.tensorflow as ts
22+
from tornasole.core.utils import SagemakerSimulator
23+
from tf_utils import (
24+
get_estimator,
25+
get_input_fns,
26+
get_train_op_and_placeholders,
27+
get_data,
28+
get_keras_data,
29+
get_keras_model_v1,
30+
)
31+
32+
33+
def test_estimator(script_mode: bool):
    """Train and evaluate the estimator, then check that tensors were saved.

    Known issue recorded by the authors: throws errors about tensors not
    saving to collection. Investigate after merging PR #304.

    Args:
        script_mode: When True, a ``TornasoleEstimatorHook`` is created here
            and passed to both ``train`` and ``evaluate``. When False, no
            hooks are passed.

    Raises:
        AssertionError: If no hook exists, no step was saved, or no tensor
            was saved.
    """
    with SagemakerSimulator() as sim:
        # Setup
        mnist_classifier = get_estimator()
        train_input_fn, eval_input_fn = get_input_fns()

        # Train and evaluate. The manually created hook has to be handed to
        # the estimator, otherwise it never runs; ``hooks=None`` is the
        # estimator's default and leaves the non-script path unchanged.
        hooks = [ts.TornasoleEstimatorHook(out_dir=sim.out_dir)] if script_mode else None
        mnist_classifier.train(input_fn=train_input_fn, steps=50, hooks=hooks)
        mnist_classifier.evaluate(input_fn=eval_input_fn, steps=10, hooks=hooks)

        # Check that hook created and tensors saved
        trial = ts.create_trial(path=sim.out_dir)
        assert ts.get_hook() is not None, "Hook was not created."
        assert len(trial.available_steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensors()) > 0, "Tensors were not saved."
55+
56+
57+
def test_linear_classifier(script_mode: bool):
    """Train a ``LinearClassifier`` and check that tensors were saved.

    Known issue recorded by the authors: throws errors about tensors not
    saving to collection. Investigate after merging PR #304.

    Args:
        script_mode: When True, a ``TornasoleEstimatorHook`` is created here
            and passed to ``train``. When False, no hooks are passed.

    Raises:
        AssertionError: If no hook exists, no step was saved, or no tensor
            was saved.
    """
    with SagemakerSimulator() as sim:
        # Setup
        train_input_fn, eval_input_fn = get_input_fns()
        x_feature = tf.feature_column.numeric_column("x", shape=(28, 28))
        estimator = tf.compat.v1.estimator.LinearClassifier(
            feature_columns=[x_feature], model_dir="/tmp/mnist_linear_classifier", n_classes=10
        )

        # Train. Only build the hook list when a hook actually exists, so the
        # non-script path no longer references an unbound ``hook``.
        hooks = [ts.TornasoleEstimatorHook(out_dir=sim.out_dir)] if script_mode else None
        estimator.train(input_fn=train_input_fn, steps=100, hooks=hooks)

        # Check that hook created and tensors saved
        trial = ts.create_trial(path=sim.out_dir)
        assert ts.get_hook() is not None, "Hook was not created."
        assert len(trial.available_steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensors()) > 0, "Tensors were not saved."
80+
81+
82+
def test_monitored_session(script_mode: bool):
    """Run 100 training steps in a ``MonitoredSession`` and check saved tensors.

    Reported by the authors as working as intended.

    Args:
        script_mode: When True, a hook is created here and passed to the
            session. When False, the session is created without hooks.

    Raises:
        AssertionError: If no hook exists, no step was saved, or no tensor
            was saved.
    """
    with SagemakerSimulator() as sim:
        train_op, X, Y = get_train_op_and_placeholders()
        init = tf.compat.v1.global_variables_initializer()
        mnist = get_data()

        # NOTE(review): a Keras hook is handed to a MonitoredSession here;
        # confirm this is the intended hook class for session-based training.
        session_hooks = [ts.TornasoleKerasHook(out_dir=sim.out_dir)] if script_mode else None

        with tf.train.MonitoredSession(hooks=session_hooks) as sess:
            sess.run(init)
            for _ in range(100):
                images, labels = mnist.train.next_batch(32)
                sess.run(train_op, feed_dict={X: images, Y: labels})

        # Check that hook created and tensors saved
        trial = ts.create_trial(path=sim.out_dir)
        hook_exists = ts.get_hook() is not None
        assert hook_exists, "Hook was not created."
        assert len(trial.available_steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensors()) > 0, "Tensors were not saved."
106+
107+
108+
def test_keras_v1(script_mode: bool):
    """Fit and evaluate a v1 Keras model, then check that tensors were saved.

    Reported by the authors as failing because TornasoleKerasHook from
    PR #304 is needed.

    Taken from https://www.tensorflow.org/guide/keras/functional.

    Args:
        script_mode: When True, a ``TornasoleKerasHook`` is created here and
            passed as a callback to ``fit`` and ``evaluate``. When False, no
            callbacks are passed.

    Raises:
        AssertionError: If no hook exists, no step was saved, or no tensor
            was saved.
    """
    with SagemakerSimulator() as sim:
        import tensorflow.compat.v1.keras as keras

        model = get_keras_model_v1()
        train_split, test_split = get_keras_data()
        x_train, y_train = train_split
        x_test, y_test = test_split

        model.compile(
            loss="sparse_categorical_crossentropy",
            optimizer=keras.optimizers.RMSprop(),
            metrics=["accuracy"],
        )

        # ``callbacks=None`` is what Keras uses when the argument is omitted,
        # so one call site serves both modes.
        callbacks = [ts.TornasoleKerasHook(out_dir=sim.out_dir)] if script_mode else None
        model.fit(
            x_train,
            y_train,
            batch_size=64,
            epochs=5,
            validation_split=0.2,
            callbacks=callbacks,
        )
        model.evaluate(x_test, y_test, verbose=2, callbacks=callbacks)

        # Check that hook created and tensors saved
        trial = ts.create_trial(path=sim.out_dir)
        hook_exists = ts.get_hook() is not None
        assert hook_exists, "Hook was not created."
        assert len(trial.available_steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensors()) > 0, "Tensors were not saved."
139+
140+
141+
if __name__ == "__main__":
    # Command-line entry point: run every TensorFlow test in a fixed order
    # with the same script-mode setting.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--script-mode",
        action="store_true",
        help="Manually create hooks instead of relying on ZCC",
    )
    script_mode = cli.parse_args().script_mode

    for run_test in (
        test_estimator,
        test_monitored_session,
        test_linear_classifier,
        test_keras_v1,
    ):
        run_test(script_mode=script_mode)

0 commit comments

Comments
 (0)