Skip to content

Add document embedding support to Object2Vec algorithm #772

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Apr 30, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions src/sagemaker/amazon/object2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,19 @@
from sagemaker.vpc_utils import VPC_CONFIG_DEFAULT


def _list_check_subset(valid_super_list):
valid_superset = set(valid_super_list)

def validate(value):
if not isinstance(value, str):
return False

val_list = [s.strip() for s in value.split(',')]
return set(val_list).issubset(valid_superset)

return validate


class Object2Vec(AmazonAlgorithmEstimatorBase):

repo_name = 'object2vec'
Expand Down Expand Up @@ -57,6 +70,14 @@ class Object2Vec(AmazonAlgorithmEstimatorBase):
'One of "adagrad", "adam", "rmsprop", "sgd", "adadelta"', str)
learning_rate = hp('learning_rate', (ge(1e-06), le(1.0)),
'A float in [1e-06, 1.0]', float)

negative_sampling_rate = hp('negative_sampling_rate', (ge(0), le(100)), 'An integer in [0, 100]', int)
comparator_list = hp('comparator_list', _list_check_subset(["hadamard", "concat", "abs_diff"]),
'Comma-separated of hadamard, concat, abs_diff. E.g. "hadamard,abs_diff"', str)
tied_token_embedding_weight = hp('tied_token_embedding_weight', (), 'Either True or False', bool)
token_embedding_storage_type = hp('token_embedding_storage_type', isin("dense", "row_sparse"),
'One of "dense", "row_sparse"', str)

enc0_network = hp('enc0_network', isin("hcnn", "bilstm", "pooled_embedding"),
'One of "hcnn", "bilstm", "pooled_embedding"', str)
enc1_network = hp('enc1_network', isin("hcnn", "bilstm", "pooled_embedding", "enc0"),
Expand Down Expand Up @@ -104,6 +125,10 @@ def __init__(self, role, train_instance_count, train_instance_type,
output_layer=None,
optimizer=None,
learning_rate=None,
negative_sampling_rate=None,
comparator_list=None,
tied_token_embedding_weight=None,
token_embedding_storage_type=None,
enc0_network=None,
enc1_network=None,
enc0_cnn_filter_width=None,
Expand Down Expand Up @@ -164,6 +189,10 @@ def __init__(self, role, train_instance_count, train_instance_type,
output_layer(str): Optional. Type of output layer
optimizer(str): Optional. Type of optimizer for training
learning_rate(float): Optional. Learning rate for SGD training
negative_sampling_rate(int): Optional. Negative sampling rate
comparator_list(str): Optional. Customization of comparator operator
tied_token_embedding_weight(bool): Optional. Tying of token embedding layer weight
token_embedding_storage_type(str): Optional. Type of token embedding storage
enc0_network(str): Optional. Network model of encoder "enc0"
enc1_network(str): Optional. Network model of encoder "enc1"
enc0_cnn_filter_width(int): Optional. CNN filter width
Expand Down Expand Up @@ -197,6 +226,12 @@ def __init__(self, role, train_instance_count, train_instance_type,
self.output_layer = output_layer
self.optimizer = optimizer
self.learning_rate = learning_rate

self.negative_sampling_rate = negative_sampling_rate
self.comparator_list = comparator_list
self.tied_token_embedding_weight = tied_token_embedding_weight
self.token_embedding_storage_type = token_embedding_storage_type

self.enc0_network = enc0_network
self.enc1_network = enc1_network
self.enc0_cnn_filter_width = enc0_cnn_filter_width
Expand Down
4 changes: 4 additions & 0 deletions tests/integ/test_object2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@ def test_object2vec(sagemaker_session):
enc0_vocab_size=45000,
enc_dim=16,
num_classes=3,
negative_sampling_rate=0,
comparator_list='hadamard,concat,abs_diff',
tied_token_embedding_weight=False,
token_embedding_storage_type='dense',
sagemaker_session=sagemaker_session)

record_set = prepare_record_set_from_local_files(data_path, object2vec.data_location,
Expand Down
15 changes: 13 additions & 2 deletions tests/unit/test_object2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,10 @@ def test_all_hyperparameters(sagemaker_session):
output_layer='softmax',
optimizer='adam',
learning_rate=0.0001,
negative_sampling_rate=1,
comparator_list='hadamard, abs_diff',
tied_token_embedding_weight=True,
token_embedding_storage_type='row_sparse',
enc0_network='bilstm',
enc1_network='hcnn',
enc0_cnn_filter_width=3,
Expand Down Expand Up @@ -161,7 +165,11 @@ def test_required_hyper_parameters_value(sagemaker_session, required_hyper_param
('optimizer', 0),
('enc0_cnn_filter_width', 'string'),
('weight_decay', 'string'),
('learning_rate', 'string')
('learning_rate', 'string'),
('negative_sampling_rate', 'some_string'),
('comparator_list', 0),
('comparator_list', ['foobar']),
('token_embedding_storage_type', 123),
])
def test_optional_hyper_parameters_type(sagemaker_session, optional_hyper_parameters, value):
with pytest.raises(ValueError):
Expand All @@ -182,7 +190,10 @@ def test_optional_hyper_parameters_type(sagemaker_session, optional_hyper_parame
('weight_decay', 200000),
('enc0_cnn_filter_width', 2000),
('learning_rate', 0),
('learning_rate', 2)
('learning_rate', 2),
('negative_sampling_rate', -1),
('comparator_list', 'hadamard,foobar'),
('token_embedding_storage_type', 'foobar'),
])
def test_optional_hyper_parameters_value(sagemaker_session, optional_hyper_parameters, value):
with pytest.raises(ValueError):
Expand Down