
Commit 9c60705

add datasetdefinition
1 parent cbf9b58 commit 9c60705

File tree

8 files changed: +696 -29 lines changed
Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
#     http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
"""Placeholder docstring"""
from __future__ import absolute_import

from sagemaker.dataset_definition.inputs import (  # noqa: F401
    DatasetDefinition,
    S3Input,
    RedshiftDatasetDefinition,
    AthenaDatasetDefinition,
)
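
Since this module simply re-exports the input classes, downstream code can import them from the package root rather than the inputs submodule — a minimal sketch, assuming the package layout shown in this commit:

# Equivalent to importing from sagemaker.dataset_definition.inputs;
# the names resolve to the same classes thanks to the re-export above.
from sagemaker.dataset_definition import DatasetDefinition, S3Input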
Lines changed: 256 additions & 0 deletions
@@ -0,0 +1,256 @@
# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
#     http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
"""The input configs for DatasetDefinition.

DatasetDefinition supports data sources such as S3 data queried via Athena
and Redshift. It provides a mechanism for customers to generate datasets
from Athena/Redshift queries and to retrieve the data via Processing jobs,
making it available to downstream processes.
"""
from __future__ import absolute_import

from typing import Dict, Any

import attr

_service_attribute_name = "service_attribute_name"

@attr.s
class _BaseConfig:
    """Base config object for DatasetDefinition.

    This class implements common to_dict() and from_dict() methods to
    serialize/deserialize the class when constructing a service request.
    """

    def to_dict(self) -> Dict[str, Any]:
        """Construct the dictionary from the class.

        Returns:
            a dict representing the class.
        """

        dictionary = {}

        for attribute in self.__class__.get_attributes():
            if self.__dict__[attribute.name] is not None:
                attribute_value = self.__dict__[attribute.name]
                if isinstance(self.__dict__[attribute.name], _BaseConfig):
                    dictionary[
                        attribute.metadata[_service_attribute_name]
                    ] = attribute_value.to_dict()
                else:
                    dictionary[attribute.metadata[_service_attribute_name]] = attribute_value

        return dictionary

    @classmethod
    def from_dict(cls, service_response) -> "_BaseConfig":
        """Construct the _BaseConfig object from the dictionary.

        Args:
            service_response: the JSON response returned from the service.

        Returns:
            a _BaseConfig object.
        """

        if service_response is None:
            return None

        dd_dict = {}

        for attribute in attr.fields(cls):
            service_attr_name = attribute.metadata[_service_attribute_name]
            if service_attr_name in service_response:
                if isinstance(service_response[service_attr_name], dict):
                    dd_dict[attribute.name] = attribute.type.from_dict(
                        service_response[service_attr_name]
                    )
                else:
                    dd_dict[attribute.name] = service_response[
                        attribute.metadata[_service_attribute_name]
                    ]

        return cls(**dd_dict)

    @classmethod
    def get_attributes(cls):
        """Get all class attributes.

        Returns:
            a tuple of the class's attr attributes.
        """
        return attr.fields(cls)

@attr.s
class RedshiftDatasetDefinition(_BaseConfig):
    """DatasetDefinition for Redshift.

    With this input, SQL queries will be executed using Redshift to generate datasets to S3.

    Attributes:
        cluster_id (str): The Redshift cluster identifier.
        database (str): The Redshift database created for your cluster.
        db_user (str): The user name of a user account that has permission to connect
            to the database.
        query_string (str): The SQL query statements to be executed.
        cluster_role_arn (str): The Redshift cluster role ARN.
        output_s3_uri (str): The path to a specific S3 object or an S3 prefix for output.
        kms_key_id (str): The KMS key id.
        output_format (str): The data storage format for Redshift query results.
            Valid options are "PARQUET" and "CSV".
        output_compression (str): The compression used for Redshift query results.
            Valid options are "None", "GZIP", "SNAPPY", "ZSTD", and "BZIP2".
    """

    cluster_id: str = attr.ib(
        validator=attr.validators.instance_of(str), metadata={_service_attribute_name: "ClusterId"}
    )
    database: str = attr.ib(
        validator=attr.validators.instance_of(str), metadata={_service_attribute_name: "Database"}
    )
    db_user: str = attr.ib(
        validator=attr.validators.instance_of(str), metadata={_service_attribute_name: "DbUser"}
    )
    query_string: str = attr.ib(
        validator=attr.validators.instance_of(str),
        metadata={_service_attribute_name: "QueryString"},
    )
    cluster_role_arn: str = attr.ib(
        validator=attr.validators.instance_of(str),
        metadata={_service_attribute_name: "ClusterRoleArn"},
    )
    output_s3_uri: str = attr.ib(
        validator=attr.validators.instance_of(str),
        metadata={_service_attribute_name: "OutputS3Uri"},
    )

    kms_key_id: str = attr.ib(default=None, metadata={_service_attribute_name: "KmsKeyId"})
    output_format: str = attr.ib(
        default="PARQUET", metadata={_service_attribute_name: "OutputFormat"}
    )
    output_compression: str = attr.ib(
        default="GZIP", metadata={_service_attribute_name: "OutputCompression"}
    )

@attr.s
class AthenaDatasetDefinition(_BaseConfig):
    """DatasetDefinition for Athena.

    With this input, SQL queries will be executed using Athena to generate datasets to S3.

    Attributes:
        catalog (str): The name of the data catalog used in the query execution.
        database (str): The name of the database used in the query execution.
        query_string (str): The SQL query statements to be executed.
        output_s3_uri (str): The path to a specific S3 object or an S3 prefix for output.
        work_group (str): The name of the workgroup in which the query is being started.
        kms_key_id (str): The KMS key id.
        output_format (str): The data storage format for Athena query results.
            Valid options are "PARQUET", "ORC", "AVRO", "JSON", and "TEXTFILE".
        output_compression (str): The compression used for Athena query results.
            Valid options are "GZIP", "SNAPPY", and "ZLIB".
    """

    catalog: str = attr.ib(
        validator=attr.validators.instance_of(str), metadata={_service_attribute_name: "Catalog"}
    )
    database: str = attr.ib(
        validator=attr.validators.instance_of(str), metadata={_service_attribute_name: "Database"}
    )
    query_string: str = attr.ib(
        validator=attr.validators.instance_of(str),
        metadata={_service_attribute_name: "QueryString"},
    )
    output_s3_uri: str = attr.ib(
        validator=attr.validators.instance_of(str),
        metadata={_service_attribute_name: "OutputS3Uri"},
    )
    work_group: str = attr.ib(default=None, metadata={_service_attribute_name: "WorkGroup"})
    kms_key_id: str = attr.ib(default=None, metadata={_service_attribute_name: "KmsKeyId"})
    output_format: str = attr.ib(
        default="PARQUET", metadata={_service_attribute_name: "OutputFormat"}
    )
    output_compression: str = attr.ib(
        default="GZIP", metadata={_service_attribute_name: "OutputCompression"}
    )

@attr.s
class DatasetDefinition(_BaseConfig):
    """DatasetDefinition input.

    Attributes:
        data_distribution_type (str): Valid options are "FullyReplicated" or "ShardedByS3Key".
        input_mode (str): Valid options are "Pipe" or "File".
        local_path (str): The path to a local directory. If not provided, the SageMaker
            platform skips downloading the data.
        redshift_dataset_definition
            (:class:`~sagemaker.dataset_definition.RedshiftDatasetDefinition`): Redshift
            dataset definition.
        athena_dataset_definition (:class:`~sagemaker.dataset_definition.AthenaDatasetDefinition`):
            Athena dataset definition.
    """

    data_distribution_type: str = attr.ib(
        default="ShardedByS3Key", metadata={_service_attribute_name: "DataDistributionType"}
    )
    input_mode: str = attr.ib(default="Pipe", metadata={_service_attribute_name: "InputMode"})
    local_path: str = attr.ib(default=None, metadata={_service_attribute_name: "LocalPath"})
    redshift_dataset_definition: RedshiftDatasetDefinition = attr.ib(
        default=None, metadata={_service_attribute_name: "RedshiftDatasetDefinition"}
    )
    athena_dataset_definition: AthenaDatasetDefinition = attr.ib(
        default=None, metadata={_service_attribute_name: "AthenaDatasetDefinition"}
    )

@attr.s
class S3Input(_BaseConfig):
    """Metadata of data objects stored in S3.

    Two options are provided: specifying an S3 prefix, or explicitly listing the files
    in a manifest file and referencing the manifest file's S3 path.
    Note: Strong consistency is not guaranteed if S3Prefix is provided here,
    because S3 list operations are not strongly consistent.
    Use ManifestFile if strong consistency is required.

    Attributes:
        s3_uri (str): The path to a specific S3 object or an S3 prefix.
        local_path (str): The path to a local directory. If not provided, the SageMaker
            platform skips downloading the data.
        s3_data_type (str): Valid options are "ManifestFile" or "S3Prefix".
        s3_input_mode (str): Valid options are "Pipe" or "File".
        s3_data_distribution_type (str): Valid options are "FullyReplicated" or "ShardedByS3Key".
        s3_compression_type (str): Valid options are "None" or "Gzip".
    """

    s3_uri: str = attr.ib(
        validator=attr.validators.instance_of(str), metadata={_service_attribute_name: "S3Uri"}
    )
    local_path: str = attr.ib(
        validator=attr.validators.instance_of(str), metadata={_service_attribute_name: "LocalPath"}
    )
    s3_data_type: str = attr.ib(
        default="S3Prefix", metadata={_service_attribute_name: "S3DataType"}
    )
    s3_input_mode: str = attr.ib(default="File", metadata={_service_attribute_name: "S3InputMode"})
    s3_data_distribution_type: str = attr.ib(
        default="FullyReplicated", metadata={_service_attribute_name: "S3DataDistributionType"}
    )
    s3_compression_type: str = attr.ib(
        default=None, metadata={_service_attribute_name: "S3CompressionType"}
    )
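
A minimal usage sketch of the classes added in this commit, assuming only the code shown here — the catalog, database, query, S3 URI, and local path below are hypothetical placeholders:

from sagemaker.dataset_definition.inputs import (
    AthenaDatasetDefinition,
    DatasetDefinition,
)

# Describe an Athena query whose results will form the dataset.
athena = AthenaDatasetDefinition(
    catalog="AwsDataCatalog",
    database="my_database",
    query_string="SELECT * FROM my_table",
    output_s3_uri="s3://my-bucket/athena-results/",
)

# Wrap the query in a DatasetDefinition, e.g. for use as a Processing input.
dataset_def = DatasetDefinition(
    local_path="/opt/ml/processing/input/data",
    athena_dataset_definition=athena,
)

# to_dict() recursively builds the service request payload, mapping each
# attribute to its service name (e.g. output_s3_uri -> "OutputS3Uri").
request = dataset_def.to_dict()

# from_dict() reverses the mapping, rebuilding nested config objects, so a
# round trip reproduces an equal object (attrs generates __eq__).
assert DatasetDefinition.from_dict(request) == dataset_def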
