
Commit 64371d3

guoqiao1992, metrizable, ajaykarpur, and icywang86rui authored
feature: add dataset definition support for processing jobs (#2031)
Co-authored-by: Eric Johnson <[email protected]>
Co-authored-by: Ajay Karpur <[email protected]>
Co-authored-by: icywang86rui <[email protected]>
1 parent e8deeb3 commit 64371d3

File tree

10 files changed: +677 −149 lines changed

src/sagemaker/apiutils/_base_types.py

Lines changed: 3 additions & 0 deletions
@@ -49,6 +49,9 @@ def from_boto(cls, boto_dict, **kwargs):
             boto_dict (dict): A dictionary of a boto response.
             **kwargs: Arbitrary keyword arguments
         """
+        if boto_dict is None:
+            return None
+
         boto_dict = {k: v for k, v in boto_dict.items() if k not in cls._boto_ignore()}
         custom_boto_names_to_member_names = {a: b for b, a in cls._custom_boto_names.items()}
         cls_kwargs = _boto_functions.from_boto(
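A minimal sketch of what the new guard buys (hypothetical Thing subclass; assumes ApiObject's usual camel-to-snake boto key mapping): from_boto now returns None for an absent optional response field instead of raising on boto_dict.items().

from sagemaker.apiutils._base_types import ApiObject

class Thing(ApiObject):
    name = None

# New behavior: a missing optional boto field deserializes to None.
assert Thing.from_boto(None) is None
# Unchanged happy path: boto keys map onto snake_case members.
assert Thing.from_boto({"Name": "x"}).name == "x"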

src/sagemaker/apiutils/_boto_functions.py

Lines changed: 1 addition & 3 deletions
@@ -68,9 +68,7 @@ def from_boto(boto_dict, boto_name_to_member_name, member_name_to_type):
            api_type, is_collection = member_name_to_type[member_name]
            if is_collection:
                if isinstance(boto_value, dict):
-                    member_value = {
-                        key: api_type.from_boto(value) for key, value in boto_value.items()
-                    }
+                    member_value = api_type.from_boto(boto_value)
                else:
                    member_value = [api_type.from_boto(item) for item in boto_value]
            else:
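Before this change, a dict-valued collection was deserialized per key as a map of ApiObjects; now the whole dict is handed to the custom type's own from_boto, so a singular nested structure round-trips as one object. A minimal sketch with hypothetical Inner/Outer classes, mirroring how DatasetDefinition registers its nested types further below:

from sagemaker.apiutils._base_types import ApiObject

class Inner(ApiObject):
    value = None

class Outer(ApiObject):
    # (type, True) now routes the nested dict through Inner.from_boto
    _custom_boto_types = {"inner": (Inner, True)}
    inner = None

outer = Outer.from_boto({"Inner": {"Value": 1}})
assert outer.inner.value == 1  # old code would have built {"Value": Inner.from_boto(1)}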
src/sagemaker/dataset_definition/__init__.py

Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
+# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+"""Classes for using DatasetDefinition in Processing job with Amazon SageMaker."""
+from __future__ import absolute_import
+
+from sagemaker.dataset_definition.inputs import (  # noqa: F401
+    DatasetDefinition,
+    S3Input,
+    RedshiftDatasetDefinition,
+    AthenaDatasetDefinition,
+)
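The re-exports (with noqa: F401 to silence the unused-import lint) make the new classes importable from the package root:

from sagemaker.dataset_definition import DatasetDefinition, S3Input
# equivalent to the canonical path:
# from sagemaker.dataset_definition.inputs import DatasetDefinition, S3Input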
src/sagemaker/dataset_definition/inputs.py

Lines changed: 144 additions & 0 deletions
@@ -0,0 +1,144 @@
+# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+"""The input configs for DatasetDefinition.
+
+DatasetDefinition supports data sources like S3 that can be queried via Athena
+and Redshift. It gives customers a mechanism to generate datasets
+from Athena/Redshift queries and to retrieve the data with Processing jobs,
+making it available for other downstream processes.
+"""
+from __future__ import absolute_import
+
+from sagemaker.apiutils._base_types import ApiObject
+
+
+class RedshiftDatasetDefinition(ApiObject):
+    """DatasetDefinition for Redshift.
+
+    With this input, SQL queries will be executed using Redshift to generate datasets to S3.
+
+    Attributes:
+        cluster_id (str): The Redshift cluster identifier.
+        database (str): The name of the Redshift database used in Redshift query execution.
+        db_user (str): The database user name used in Redshift query execution.
+        query_string (str): The SQL query statements to be executed.
+        cluster_role_arn (str): The IAM role attached to your Redshift cluster that
+            Amazon SageMaker uses to generate datasets.
+        output_s3_uri (str): The location in Amazon S3 where the Redshift query
+            results are stored.
+        kms_key_id (str): The AWS Key Management Service (AWS KMS) key that Amazon
+            SageMaker uses to encrypt data from a Redshift execution.
+        output_format (str): The data storage format for Redshift query results.
+            Valid options are "PARQUET" and "CSV".
+        output_compression (str): The compression used for Redshift query results.
+            Valid options are "None", "GZIP", "SNAPPY", "ZSTD", and "BZIP2".
+    """
+
+    cluster_id = None
+    database = None
+    db_user = None
+    query_string = None
+    cluster_role_arn = None
+    output_s3_uri = None
+    kms_key_id = None
+    output_format = None
+    output_compression = None
+
+
+class AthenaDatasetDefinition(ApiObject):
+    """DatasetDefinition for Athena.
+
+    With this input, SQL queries will be executed using Athena to generate datasets to S3.
+
+    Attributes:
+        catalog (str): The name of the data catalog used in Athena query execution.
+        database (str): The name of the database used in the Athena query execution.
+        query_string (str): The SQL query statements to be executed.
+        output_s3_uri (str): The location in Amazon S3 where Athena query results are stored.
+        work_group (str): The name of the workgroup in which the Athena query is being started.
+        kms_key_id (str): The AWS Key Management Service (AWS KMS) key that Amazon
+            SageMaker uses to encrypt data generated from an Athena query execution.
+        output_format (str): The data storage format for Athena query results.
+            Valid options are "PARQUET", "ORC", "AVRO", "JSON", and "TEXTFILE".
+        output_compression (str): The compression used for Athena query results.
+            Valid options are "GZIP", "SNAPPY", and "ZLIB".
+    """
+
+    catalog = None
+    database = None
+    query_string = None
+    output_s3_uri = None
+    work_group = None
+    kms_key_id = None
+    output_format = None
+    output_compression = None
+
+
+class DatasetDefinition(ApiObject):
+    """DatasetDefinition input.
+
+    Attributes:
+        data_distribution_type (str): Whether the generated dataset is FullyReplicated or
+            ShardedByS3Key (default).
+        input_mode (str): Whether to use File or Pipe input mode. In File (default) mode, Amazon
+            SageMaker copies the data from the input source onto the local Amazon Elastic Block
+            Store (Amazon EBS) volumes before starting your training algorithm. This is the most
+            commonly used input mode. In Pipe mode, Amazon SageMaker streams input data from the
+            source directly to your algorithm without using the EBS volume.
+        local_path (str): The local path where you want Amazon SageMaker to download the Dataset
+            Definition inputs to run a processing job. LocalPath is an absolute path to the input
+            data. This is a required parameter when `AppManaged` is False (default).
+        redshift_dataset_definition
+            (:class:`~sagemaker.dataset_definition.RedshiftDatasetDefinition`): Configuration
+            for the Redshift dataset definition input.
+        athena_dataset_definition (:class:`~sagemaker.dataset_definition.AthenaDatasetDefinition`):
+            Configuration for the Athena dataset definition input.
+    """
+
+    _custom_boto_types = {
+        "redshift_dataset_definition": (RedshiftDatasetDefinition, True),
+        "athena_dataset_definition": (AthenaDatasetDefinition, True),
+    }
+
+    data_distribution_type = "ShardedByS3Key"
+    input_mode = "File"
+    local_path = None
+    redshift_dataset_definition = None
+    athena_dataset_definition = None
+
+
+class S3Input(ApiObject):
+    """Metadata of data objects stored in S3.
+
+    Two options are provided: specifying an S3 prefix, or explicitly listing the files
+    in a manifest file and referencing the manifest file's S3 path.
+    Note: Strong consistency is not guaranteed if S3Prefix is provided here.
+    S3 list operations are not strongly consistent.
+    Use ManifestFile if strong consistency is required.
+
+    Attributes:
+        s3_uri (str): The path to a specific S3 object or an S3 prefix.
+        local_path (str): The path to a local directory. If not provided, the SageMaker
+            platform skips the data download.
+        s3_data_type (str): Valid options are "ManifestFile" or "S3Prefix".
+        s3_input_mode (str): Valid options are "Pipe" or "File".
+        s3_data_distribution_type (str): Valid options are "FullyReplicated" or "ShardedByS3Key".
+        s3_compression_type (str): Valid options are "None" or "Gzip".
+    """
+
+    s3_uri = None
+    local_path = None
+    s3_data_type = "S3Prefix"
+    s3_input_mode = "File"
+    s3_data_distribution_type = "FullyReplicated"
+    s3_compression_type = None
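Taken together, a processing job can now consume an Athena (or Redshift) query result as an input. A hedged sketch of the intended wiring, assuming ProcessingInput gained dataset_definition and app_managed parameters in one of the six changed files not shown above, and that ApiObject subclasses accept attribute values as keyword arguments; the catalog, database, bucket, and path names are placeholders:

from sagemaker.dataset_definition.inputs import AthenaDatasetDefinition, DatasetDefinition
from sagemaker.processing import ProcessingInput

# Athena query whose results land in S3 as Parquet.
athena = AthenaDatasetDefinition(
    catalog="AwsDataCatalog",
    database="my_db",
    query_string='SELECT * FROM "my_db"."my_table"',
    output_s3_uri="s3://my-bucket/athena-results/",
    work_group="primary",
    output_format="PARQUET",
)

# Wrap it in a DatasetDefinition and tell SageMaker where the job sees the data.
dataset = DatasetDefinition(
    local_path="/opt/ml/processing/input/athena",
    data_distribution_type="FullyReplicated",
    athena_dataset_definition=athena,
)

# Passed to a processor's run(inputs=[...]) like any other input.
processing_input = ProcessingInput(
    input_name="athena_dataset",
    dataset_definition=dataset,
    app_managed=False,
)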
