Skip to content

feature: Add data ingestion only data-wrangler flow recipe generation helper function #2336

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
May 20, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
190 changes: 190 additions & 0 deletions src/sagemaker/wrangler/ingestion.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
"""Data Wrangler helpers for data ingestion."""
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit Data Wrangler

from __future__ import absolute_import

from typing import Dict
from uuid import uuid4
from sagemaker.dataset_definition.inputs import (
RedshiftDatasetDefinition,
AthenaDatasetDefinition,
)


def generate_data_ingestion_flow_from_s3_input(
    input_name: str,
    s3_uri: str,
    s3_content_type: str = "csv",
    s3_has_header: bool = False,
    operator_version: str = "0.1",
    schema: Dict = None,
):
    """Build a Data Wrangler flow that only ingests data from an S3 input.

    The flow contains exactly two nodes: an S3 source node and the output
    node produced by ``_get_output_node``, which consumes the source node's
    ``default`` output.

    Args:
        input_name (str): name given to the dataset of the source node.
        s3_uri (str): URI of the S3 input read by the source node.
        s3_content_type (str): content type of the S3 input.
        s3_has_header (bool): whether the input has a header.
        operator_version (str): version suffix appended to the operator names.
        schema (typing.Dict): schema of the data to be ingested; forwarded
            unchanged to the output node.

    Returns:
        tuple: ``(flow, output_name)`` where ``flow`` (typing.Dict) is the
        ingestion-only flow and ``output_name`` (str) is
        ``"<output node id>.default"``, the name used to configure
        `sagemaker.processing.FeatureStoreOutput`.
    """
    source_node_id = str(uuid4())

    # Assemble the source node from the inside out; key order is kept stable
    # because the flow is a plain dict that callers may serialize.
    s3_execution_context = {
        "s3Uri": s3_uri,
        "s3ContentType": s3_content_type,
        "s3HasHeader": s3_has_header,
    }
    dataset_definition = {
        "datasetSourceType": "S3",
        "name": input_name,
        "s3ExecutionContext": s3_execution_context,
    }
    source_node = {
        "node_id": source_node_id,
        "type": "SOURCE",
        "inputs": [],
        "outputs": [{"name": "default"}],
        "operator": f"sagemaker.s3_source_{operator_version}",
        "parameters": {"dataset_definition": dataset_definition},
    }

    output_node = _get_output_node(source_node_id, operator_version, schema)
    output_name = f'{output_node["node_id"]}.default'

    flow = {
        "metadata": {"version": 1, "disable_limits": False},
        "nodes": [source_node, output_node],
    }
    return flow, output_name


def generate_data_ingestion_flow_from_athena_dataset_definition(
    input_name: str,
    athena_dataset_definition: AthenaDatasetDefinition,
    operator_version: str = "0.1",
    schema: Dict = None,
):
    """Build a Data Wrangler flow that only ingests data from an Athena input.

    The flow contains exactly two nodes: an Athena source node and the output
    node produced by ``_get_output_node``, which consumes the source node's
    ``default`` output.

    Args:
        input_name (str): name given to the dataset of the source node.
        athena_dataset_definition (AthenaDatasetDefinition): Athena input for
            the source node; its ``catalog``, ``database``, ``query_string``,
            ``output_s3_uri`` and ``output_format`` attributes are read.
        operator_version (str): version suffix appended to the operator names.
        schema (typing.Dict): schema of the data to be ingested; forwarded
            unchanged to the output node.

    Returns:
        tuple: ``(flow, output_name)`` where ``flow`` (typing.Dict) is the
        ingestion-only flow and ``output_name`` (str) is
        ``"<output node id>.default"``, the name used to configure
        `sagemaker.processing.FeatureStoreOutput`.
    """
    source_node_id = str(uuid4())

    # Map the dataset definition's attributes onto the keys of the flow's
    # source node.
    dataset_definition = {
        "datasetSourceType": "Athena",
        "name": input_name,
        "catalogName": athena_dataset_definition.catalog,
        "databaseName": athena_dataset_definition.database,
        "queryString": athena_dataset_definition.query_string,
        "s3OutputLocation": athena_dataset_definition.output_s3_uri,
        "outputFormat": athena_dataset_definition.output_format,
    }
    source_node = {
        "node_id": source_node_id,
        "type": "SOURCE",
        "inputs": [],
        "outputs": [{"name": "default"}],
        "operator": f"sagemaker.athena_source_{operator_version}",
        "parameters": {"dataset_definition": dataset_definition},
    }

    output_node = _get_output_node(source_node_id, operator_version, schema)
    output_name = f'{output_node["node_id"]}.default'

    flow = {
        "metadata": {"version": 1, "disable_limits": False},
        "nodes": [source_node, output_node],
    }
    return flow, output_name


def generate_data_ingestion_flow_from_redshift_dataset_definition(
    input_name: str,
    redshift_dataset_definition: RedshiftDatasetDefinition,
    operator_version: str = "0.1",
    schema: Dict = None,
):
    """Build a Data Wrangler flow that only ingests data from a Redshift input.

    The flow contains exactly two nodes: a Redshift source node and the output
    node produced by ``_get_output_node``, which consumes the source node's
    ``default`` output.

    Args:
        input_name (str): name given to the dataset of the source node.
        redshift_dataset_definition (RedshiftDatasetDefinition): Redshift input
            for the source node; its ``cluster_id``, ``database``, ``db_user``,
            ``query_string``, ``cluster_role_arn``, ``output_s3_uri`` and
            ``output_format`` attributes are read.
        operator_version (str): version suffix appended to the operator names.
        schema (typing.Dict): schema of the data to be ingested; forwarded
            unchanged to the output node.

    Returns:
        tuple: ``(flow, output_name)`` where ``flow`` (typing.Dict) is the
        ingestion-only flow and ``output_name`` (str) is
        ``"<output node id>.default"``, the name used to configure
        `sagemaker.processing.FeatureStoreOutput`.
    """
    source_node_id = str(uuid4())

    # Map the dataset definition's attributes onto the keys of the flow's
    # source node.
    dataset_definition = {
        "datasetSourceType": "Redshift",
        "name": input_name,
        "clusterIdentifier": redshift_dataset_definition.cluster_id,
        "database": redshift_dataset_definition.database,
        "dbUser": redshift_dataset_definition.db_user,
        "queryString": redshift_dataset_definition.query_string,
        "unloadIamRole": redshift_dataset_definition.cluster_role_arn,
        "s3OutputLocation": redshift_dataset_definition.output_s3_uri,
        "outputFormat": redshift_dataset_definition.output_format,
    }
    source_node = {
        "node_id": source_node_id,
        "type": "SOURCE",
        "inputs": [],
        "outputs": [{"name": "default"}],
        "operator": f"sagemaker.redshift_source_{operator_version}",
        "parameters": {"dataset_definition": dataset_definition},
    }

    output_node = _get_output_node(source_node_id, operator_version, schema)
    output_name = f'{output_node["node_id"]}.default'

    flow = {
        "metadata": {"version": 1, "disable_limits": False},
        "nodes": [source_node, output_node],
    }
    return flow, output_name


def _get_output_node(source_node_id: str, operator_version: str, schema: Dict):
"""A helper function to generate output node, for internal use only

Args:
source_node_id (str): source node id
operator_version: (str): the version of the operator
schema: (typing.Dict): the schema for the data to be ingested
Returns:
dict (typing.Dict): output node
"""
return {
"node_id": str(uuid4()),
"type": "TRANSFORM",
"operator": f"sagemaker.spark.infer_and_cast_type_{operator_version}",
"trained_parameters": {} if schema is None else schema,
"parameters": {},
"inputs": [{"name": "default", "node_id": source_node_id, "output_name": "default"}],
"outputs": [{"name": "default"}],
}
100 changes: 100 additions & 0 deletions tests/data/workflow/features.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11
M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7,1620426219.5977669,86691581-c6e4-4223-a586-92f42d2487da
F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9,1620426219.5977669,0d178b8f-539b-43fb-996f-0ce39655fa35
M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10,1620426219.5977669,ac1f9e65-991c-4b1b-880b-ca8a06b407bd
I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7,1620426219.5977669,78c2523b-227a-4b80-a41f-5af0faa82858
I,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12,8,1620426219.5977669,2010f07c-163c-48bb-a150-7644a13a00b7
F,0.53,0.415,0.15,0.7775,0.237,0.1415,0.33,20,1620426219.5977669,ba1f8ad3-20ab-4174-a55c-e1a591fed775
F,0.545,0.425,0.125,0.768,0.294,0.1495,0.26,16,1620426219.5977669,e12332f1-22b2-4d6b-ab21-8466cc715f4f
M,0.475,0.37,0.125,0.5095,0.2165,0.1125,0.165,9,1620426219.5977669,1d41ffc3-d8e8-4c18-a241-0d850be6430d
F,0.55,0.44,0.15,0.8945,0.3145,0.151,0.32,19,1620426219.5977669,076497c3-e04e-4ede-bfe6-c3920bec4920
F,0.525,0.38,0.14,0.6065,0.19399999999999998,0.1475,0.21,14,1620426219.5977669,8eed62f9-8cc8-484f-873f-181d4bb25376
M,0.43,0.35,0.11,0.406,0.1675,0.081,0.135,10,1620426219.5977669,4bfadbd1-3cab-4ea9-b79c-601f9993914b
M,0.49,0.38,0.135,0.5415,0.2175,0.095,0.19,11,1620426219.5977669,21ca31b1-19c5-477b-94d7-5395860ecc36
F,0.535,0.405,0.145,0.6845,0.2725,0.171,0.205,10,1620426219.5977669,f19c4f32-4f1d-4ffd-a254-99811cd3c4e2
F,0.47,0.355,0.1,0.4755,0.1675,0.0805,0.185,10,1620426219.5977669,65d27cb3-d88e-4cf0-8309-bcb3e6f43374
M,0.5,0.4,0.13,0.6645,0.258,0.133,0.24,12,1620426219.5977669,b906920b-67c2-4b9a-b8cd-bacd5fd96a57
I,0.355,0.28,0.085,0.2905,0.095,0.0395,0.115,7,1620426219.5977669,5ca156c0-d233-4c04-8787-0ac48ff0936a
F,0.44,0.34,0.1,0.451,0.188,0.087,0.13,10,1620426219.5977669,bc88e01e-401d-417a-8d89-fabd12ea142c
M,0.365,0.295,0.08,0.2555,0.09699999999999999,0.043,0.1,7,1620426219.5977669,68f3bf84-2b80-4f54-b944-7b31ef829235
M,0.45,0.32,0.1,0.381,0.1705,0.075,0.115,9,1620426219.5977669,81763d96-9686-4072-a82e-fc661868d8e5
M,0.355,0.28,0.095,0.2455,0.0955,0.062,0.075,11,1620426219.5977669,5cb7f6d0-9945-420d-8981-1c5f40765b00
I,0.38,0.275,0.1,0.2255,0.08,0.049,0.085,10,1620426219.5977669,81dd45ef-dcdf-42ee-a6a7-95256e0be416
F,0.565,0.44,0.155,0.9395,0.4275,0.214,0.27,12,1620426219.5977669,b326f6f4-dbf4-4eba-9cdf-76afd67ff867
F,0.55,0.415,0.135,0.7635,0.318,0.21,0.2,9,1620426219.5977669,b6098540-05c3-4bb1-8425-9e74e96f4831
F,0.615,0.48,0.165,1.1615,0.513,0.301,0.305,10,1620426219.5977669,f401c947-c270-491b-9e7b-7124a550bbb5
F,0.56,0.44,0.14,0.9285,0.3825,0.188,0.3,11,1620426219.5977669,dfcfdaab-d0b7-406c-b65f-82d3000ad37b
F,0.58,0.45,0.185,0.9955,0.3945,0.272,0.285,11,1620426219.5977669,1f5e1b26-3774-4736-94d5-47c29e989aed
M,0.59,0.445,0.14,0.9309999999999999,0.35600000000000004,0.23399999999999999,0.28,12,1620426219.5977669,ab61df96-da45-4c20-993e-75ed25199560
M,0.605,0.475,0.18,0.9365,0.39399999999999996,0.21899999999999997,0.295,15,1620426219.5977669,4e94807a-4ce4-47d0-ada9-76a6f79e0d02
M,0.575,0.425,0.14,0.8635,0.39299999999999996,0.22699999999999998,0.2,11,1620426219.5977669,e53c6a81-3da1-4274-a932-3f6487afc390
M,0.58,0.47,0.165,0.9975,0.3935,0.242,0.33,10,1620426219.5977669,ffef730b-7672-4302-9244-74c69c5071a1
F,0.68,0.56,0.165,1.639,0.6055,0.2805,0.46,15,1620426219.5977669,ab91508b-f12a-4c00-b8f2-2d6d7467780f
M,0.665,0.525,0.165,1.338,0.5515,0.3575,0.35,18,1620426219.5977669,99fdad9f-350e-426d-81c0-c0d120d275ee
F,0.68,0.55,0.175,1.798,0.815,0.3925,0.455,19,1620426219.5977669,02b7c246-3ef9-4322-9bee-be471bcb6204
F,0.705,0.55,0.2,1.7095,0.633,0.4115,0.49,13,1620426219.5977669,0c35c6f9-cc26-4a32-b3a1-9d51134f6c36
M,0.465,0.355,0.105,0.4795,0.22699999999999998,0.124,0.125,8,1620426219.5977669,f257e5cf-746c-4327-b8de-5424eccbc90d
F,0.54,0.475,0.155,1.217,0.5305,0.3075,0.34,16,1620426219.5977669,4b6b93a3-fb47-414d-ab80-29bbbf0c9114
F,0.45,0.355,0.105,0.5225,0.237,0.1165,0.145,8,1620426219.5977669,5a4c8bbe-95f4-459d-90ca-533a8d487597
F,0.575,0.445,0.135,0.883,0.381,0.2035,0.26,11,1620426219.5977669,1eb83598-db8d-4713-9418-e20478758197
M,0.355,0.29,0.09,0.3275,0.134,0.086,0.09,9,1620426219.5977669,9a8d00e1-bb06-411b-9fb4-deb07fee3a29
F,0.45,0.335,0.105,0.425,0.1865,0.091,0.115,9,1620426219.5977669,326905b0-2bda-4527-a0f1-224e2d336dc1
F,0.55,0.425,0.135,0.8515,0.36200000000000004,0.196,0.27,14,1620426219.5977669,430219f9-cfff-4859-a54a-1531f3104435
I,0.24,0.175,0.045,0.07,0.0315,0.0235,0.02,5,1620426219.5977669,cd295f71-11b4-43a8-b8e1-c88989d9f07b
I,0.205,0.15,0.055,0.042,0.0255,0.015,0.012,5,1620426219.5977669,c8bea5aa-15cf-4774-8609-9b80985eb4b5
I,0.21,0.15,0.05,0.042,0.0175,0.0125,0.015,4,1620426219.5977669,5ea93636-b136-4ca8-8d6c-b2aa05cc800f
I,0.39,0.295,0.095,0.203,0.0875,0.045,0.075,7,1620426219.5977669,33e7b212-142d-4b2b-bf6a-38913f56f748
M,0.47,0.37,0.12,0.5795,0.293,0.22699999999999998,0.14,9,1620426219.5977669,ba122940-6cde-419c-9680-7869474c6737
F,0.46,0.375,0.12,0.4605,0.1775,0.11,0.15,7,1620426219.5977669,908e92cc-ca40-477c-94ad-e636a1add28b
I,0.325,0.245,0.07,0.161,0.0755,0.0255,0.045,6,1620426219.5977669,2303d903-a98b-41a8-ac57-a7d424d47309
F,0.525,0.425,0.16,0.8355,0.3545,0.2135,0.245,9,1620426219.5977669,d23da530-8cea-4dbe-a61c-2e6614a49529
I,0.52,0.41,0.12,0.595,0.2385,0.111,0.19,8,1620426219.5977669,d7508fdb-84f6-4b2f-9585-2e6c2853a8bb
M,0.4,0.32,0.095,0.303,0.1335,0.06,0.1,7,1620426219.5977669,0ef3dc27-f678-4bd6-a838-d0ba946020ff
M,0.485,0.36,0.13,0.5415,0.2595,0.096,0.16,10,1620426219.5977669,82a6e555-a6db-4d02-915c-20be5e121955
F,0.47,0.36,0.12,0.4775,0.2105,0.1055,0.15,10,1620426219.5977669,c8a8bc1c-7b27-429d-8905-7faceb7ec757
M,0.405,0.31,0.1,0.385,0.17300000000000001,0.0915,0.11,7,1620426219.5977669,62046348-1247-47d5-87c2-33f9af52c712
F,0.5,0.4,0.14,0.6615,0.2565,0.1755,0.22,8,1620426219.5977669,ca6765e7-7f30-4d76-9493-0aad96defa52
M,0.445,0.35,0.12,0.4425,0.192,0.0955,0.135,8,1620426219.5977669,e41d141d-acfd-405d-a80a-9780f4bdf16f
M,0.47,0.385,0.135,0.5895,0.2765,0.12,0.17,8,1620426219.5977669,923bae07-1669-4186-bf71-a282882395a8
I,0.245,0.19,0.06,0.086,0.042,0.013999999999999999,0.025,4,1620426219.5977669,1eaa1226-f142-4252-ac9e-0e9baa53e22f
F,0.505,0.4,0.125,0.583,0.24600000000000002,0.13,0.175,7,1620426219.5977669,01bc0fd6-a6b2-4468-bff8-7108e144e43f
M,0.45,0.345,0.105,0.4115,0.18,0.1125,0.135,7,1620426219.5977669,091022d7-9929-40ff-bc5f-c83bdfeaccb3
M,0.505,0.405,0.11,0.625,0.305,0.16,0.175,9,1620426219.5977669,c3585601-6661-40fa-a4e4-810585017013
F,0.53,0.41,0.13,0.6965,0.302,0.1935,0.2,10,1620426219.5977669,741c9d3b-99e4-4450-9af6-31017e2ea22b
M,0.425,0.325,0.095,0.3785,0.1705,0.08,0.1,7,1620426219.5977669,51b2dc31-26fb-425c-a1f9-661fe0e10075
M,0.52,0.4,0.12,0.58,0.23399999999999999,0.1315,0.185,8,1620426219.5977669,c3a13766-8f9a-44db-9c73-173b08f17909
M,0.475,0.355,0.12,0.48,0.23399999999999999,0.1015,0.135,8,1620426219.5977669,7ed6e906-7d14-4fd9-91a6-3872cd4d9a39
F,0.565,0.44,0.16,0.915,0.354,0.1935,0.32,12,1620426219.5977669,58b4087a-d840-4d70-a31b-36356790d5a7
F,0.595,0.495,0.185,1.285,0.41600000000000004,0.22399999999999998,0.485,13,1620426219.5977669,c1d6684d-638f-4cd7-81e8-8058147b044b
F,0.475,0.39,0.12,0.5305,0.2135,0.1155,0.17,10,1620426219.5977669,23034b8f-1b76-4ea1-9a44-b6069abe4e4f
I,0.31,0.235,0.07,0.151,0.063,0.0405,0.045,6,1620426219.5977669,f701efad-fab7-4ca1-ae92-832038e574ce
M,0.555,0.425,0.13,0.7665,0.264,0.168,0.275,13,1620426219.5977669,b30b290c-7b2f-48cd-9ffe-be6a116e8530
F,0.4,0.32,0.11,0.353,0.1405,0.0985,0.1,8,1620426219.5977669,e647cabb-86ff-496e-aba4-f3ea33efa663
F,0.595,0.475,0.17,1.247,0.48,0.225,0.425,20,1620426219.5977669,92c99768-e2d6-41ae-97f5-2519e45ca4c5
M,0.57,0.48,0.175,1.185,0.474,0.261,0.38,11,1620426219.5977669,c72c1654-e1ae-44e9-a540-a27a9b622495
F,0.605,0.45,0.195,1.0979999999999999,0.48100000000000004,0.2895,0.315,13,1620426219.5977669,5b633807-e5ee-41a3-aeba-530f2e58d0b0
F,0.6,0.475,0.15,1.0075,0.4425,0.221,0.28,15,1620426219.5977669,cbc95185-77a7-4baf-aaf0-5487fadf6e96
M,0.595,0.475,0.14,0.9440000000000001,0.3625,0.18899999999999997,0.315,9,1620426219.5977669,ff87dd10-c5f2-4e37-849c-f42c447b1bde
F,0.6,0.47,0.15,0.922,0.363,0.19399999999999998,0.305,10,1620426219.5977669,21bd37cd-0a1e-4ade-827b-3090f8bde946
F,0.555,0.425,0.14,0.7879999999999999,0.282,0.1595,0.285,11,1620426219.5977669,685b0b02-b675-4172-b707-0d34e9581265
F,0.615,0.475,0.17,1.1025,0.4695,0.2355,0.345,14,1620426219.5977669,bad81517-a67d-4863-a852-1f13309d4d4e
F,0.575,0.445,0.14,0.941,0.3845,0.252,0.285,9,1620426219.5977669,62ea15d8-b7cc-4131-9efc-290f7cb2e4f3
M,0.62,0.51,0.175,1.615,0.5105,0.192,0.675,12,1620426219.5977669,17455749-8733-4586-b6bc-5cb8eb0728ef
F,0.52,0.425,0.165,0.9885,0.396,0.225,0.32,16,1620426219.5977669,6628482c-9b86-42cf-8eb4-4341aaf12406
M,0.595,0.475,0.16,1.3175,0.408,0.23399999999999999,0.58,21,1620426219.5977669,01509b91-0e5d-4b8e-b593-105504c8d72e
M,0.58,0.45,0.14,1.013,0.38,0.21600000000000005,0.36,14,1620426219.5977669,21de87b8-a3e1-40de-8104-047a4e4bd8ee
F,0.57,0.465,0.18,1.295,0.33899999999999997,0.2225,0.44,12,1620426219.5977669,a568ec6f-3539-438a-9546-ddcecdfcc300
M,0.625,0.465,0.14,1.195,0.4825,0.205,0.4,13,1620426219.5977669,917e00db-4f54-492f-8e77-d7515f9d8b13
M,0.56,0.44,0.16,0.8645,0.3305,0.2075,0.26,10,1620426219.5977669,6ced2c2d-2e32-4f93-84c7-bd63a9f960d0
F,0.46,0.355,0.13,0.517,0.2205,0.114,0.165,9,1620426219.5977669,0aecf615-6812-45e1-847d-6fa0bdd3f361
F,0.575,0.45,0.16,0.9775,0.3135,0.231,0.33,12,1620426219.5977669,33ed462d-339c-4bab-93d6-8e4fafc9dcc8
M,0.565,0.425,0.135,0.8115,0.341,0.1675,0.255,15,1620426219.5977669,d07eab87-eeb7-4832-9fed-6b4a3fe60968
M,0.555,0.44,0.15,0.755,0.307,0.1525,0.26,12,1620426219.5977669,c69de048-e914-4236-b040-898a1fd7026c
M,0.595,0.465,0.175,1.115,0.4015,0.254,0.39,13,1620426219.5977669,119ef6d1-e371-40d5-b2e1-751fdf6a56ff
F,0.625,0.495,0.165,1.262,0.507,0.318,0.39,10,1620426219.5977669,42e078ac-2216-4109-a41f-2abdb33ac25b
M,0.695,0.56,0.19,1.494,0.588,0.3425,0.485,15,1620426219.5977669,89348ada-ebaa-4ff4-b7dd-21fbe6d63e1b
M,0.665,0.535,0.195,1.6059999999999999,0.5755,0.38799999999999996,0.48,14,1620426219.5977669,891dd59e-cd3a-4e67-8b23-b4f4dfcffcc2
M,0.535,0.435,0.15,0.725,0.26899999999999996,0.1385,0.25,9,1620426219.5977669,e66bedd0-1d39-4973-b5b2-85598f442dcc
M,0.47,0.375,0.13,0.523,0.214,0.132,0.145,8,1620426219.5977669,78068386-0f6a-431e-8fa2-8c306889f793
M,0.47,0.37,0.13,0.5225,0.201,0.133,0.165,7,1620426219.5977669,e18cd87a-664f-41b9-817f-575899b37834
F,0.475,0.375,0.125,0.5785,0.2775,0.085,0.155,10,1620426219.5977669,8082fa9d-1f21-4f1d-a29a-774461bc77f2
Loading