Skip to content

Commit db562fa

Browse files
author
Balaji Veeramani
committed
Add CSV serializer
1 parent 4ffa222 commit db562fa

File tree

12 files changed

+218
-194
lines changed

12 files changed

+218
-194
lines changed

doc/frameworks/tensorflow/deploying_tensorflow_serving.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,7 @@ your input data to CSV format:
236236
237237
# create a Predictor with CSV serialization
238238
239-
predictor = Predictor('endpoint-name', serializer=sagemaker.predictor.csv_serializer)
239+
predictor = Predictor('endpoint-name', serializer=sagemaker.serializers.CSVSerializer())
240240
241241
# CSV-formatted string input
242242
input = '1.0,2.0,5.0\n1.0,2.0,5.0\n1.0,2.0,5.0'
@@ -252,7 +252,7 @@ your input data to CSV format:
252252
]
253253
}
254254
255-
You can also use python arrays or numpy arrays as input and let the `csv_serializer` object
255+
You can also use python arrays or numpy arrays as input and let the `CSVSerializer` object
256256
convert them to CSV, but the client-side CSV conversion is more sophisticated than the
257257
CSV parsing on the Endpoint, so if you encounter conversion problems, try using one of the
258258
JSON options instead.

doc/frameworks/tensorflow/using_tf.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -685,7 +685,7 @@ your input data to CSV format:
685685
686686
# create a Predictor with CSV serialization
687687
688-
predictor = Predictor('endpoint-name', serializer=sagemaker.predictor.csv_serializer)
688+
predictor = Predictor('endpoint-name', serializer=sagemaker.serializers.CSVSerializer())
689689
690690
# CSV-formatted string input
691691
input = '1.0,2.0,5.0\n1.0,2.0,5.0\n1.0,2.0,5.0'
@@ -701,7 +701,7 @@ your input data to CSV format:
701701
]
702702
}
703703
704-
You can also use python arrays or numpy arrays as input and let the `csv_serializer` object
704+
You can also use python arrays or numpy arrays as input and let the `CSVSerializer` object
705705
convert them to CSV, but the client-side CSV conversion is more sophisticated than the
706706
CSV parsing on the Endpoint, so if you encounter conversion problems, try using one of the
707707
JSON options instead.

src/sagemaker/amazon/ipinsights.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,9 @@
1616
from sagemaker.amazon.amazon_estimator import AmazonAlgorithmEstimatorBase, registry
1717
from sagemaker.amazon.hyperparameter import Hyperparameter as hp # noqa
1818
from sagemaker.amazon.validation import ge, le
19-
from sagemaker.predictor import Predictor, csv_serializer, json_deserializer
19+
from sagemaker.predictor import Predictor, json_deserializer
2020
from sagemaker.model import Model
21+
from sagemaker.serializers import CSVSerializer
2122
from sagemaker.session import Session
2223
from sagemaker.vpc_utils import VPC_CONFIG_DEFAULT
2324

@@ -197,7 +198,7 @@ def __init__(self, endpoint_name, sagemaker_session=None):
197198
super(IPInsightsPredictor, self).__init__(
198199
endpoint_name,
199200
sagemaker_session,
200-
serializer=csv_serializer,
201+
serializer=CSVSerializer(),
201202
deserializer=json_deserializer,
202203
)
203204

src/sagemaker/predictor.py

Lines changed: 1 addition & 103 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
import csv
1818
import json
1919
import six
20-
from six import StringIO, BytesIO
20+
from six import BytesIO
2121
import numpy as np
2222

2323
from sagemaker.content_types import CONTENT_TYPE_JSON, CONTENT_TYPE_CSV, CONTENT_TYPE_NPY
@@ -495,108 +495,6 @@ def ACCEPT(self):
495495
return self.accept
496496

497497

498-
class _CsvSerializer(object):
499-
"""Placeholder docstring"""
500-
501-
def __init__(self):
502-
"""Placeholder docstring"""
503-
self.content_type = CONTENT_TYPE_CSV
504-
505-
def __call__(self, data):
506-
"""Take data of various data formats and serialize them into CSV.
507-
508-
Args:
509-
data (object): Data to be serialized.
510-
511-
Returns:
512-
object: Sequence of bytes to be used for the request body.
513-
"""
514-
# For inputs which represent multiple "rows", the result should be newline-separated CSV
515-
# rows
516-
if _is_mutable_sequence_like(data) and len(data) > 0 and _is_sequence_like(data[0]):
517-
return "\n".join([_CsvSerializer._serialize_row(row) for row in data])
518-
return _CsvSerializer._serialize_row(data)
519-
520-
@staticmethod
521-
def _serialize_row(data):
522-
# Don't attempt to re-serialize a string
523-
"""
524-
Args:
525-
data:
526-
"""
527-
if isinstance(data, str):
528-
return data
529-
if isinstance(data, np.ndarray):
530-
data = np.ndarray.flatten(data)
531-
if hasattr(data, "__len__"):
532-
if len(data) == 0:
533-
raise ValueError("Cannot serialize empty array")
534-
return _csv_serialize_python_array(data)
535-
536-
# files and buffers
537-
if hasattr(data, "read"):
538-
return _csv_serialize_from_buffer(data)
539-
540-
raise ValueError("Unable to handle input format: ", type(data))
541-
542-
543-
def _csv_serialize_python_array(data):
544-
"""
545-
Args:
546-
data:
547-
"""
548-
return _csv_serialize_object(data)
549-
550-
551-
def _csv_serialize_from_buffer(buff):
552-
"""
553-
Args:
554-
buff:
555-
"""
556-
return buff.read()
557-
558-
559-
def _csv_serialize_object(data):
560-
"""
561-
Args:
562-
data:
563-
"""
564-
csv_buffer = StringIO()
565-
566-
csv_writer = csv.writer(csv_buffer, delimiter=",")
567-
csv_writer.writerow(data)
568-
return csv_buffer.getvalue().rstrip("\r\n")
569-
570-
571-
csv_serializer = _CsvSerializer()
572-
573-
574-
def _is_mutable_sequence_like(obj):
575-
"""
576-
Args:
577-
obj:
578-
"""
579-
return _is_sequence_like(obj) and hasattr(obj, "__setitem__")
580-
581-
582-
def _is_sequence_like(obj):
583-
"""
584-
Args:
585-
obj:
586-
"""
587-
return hasattr(obj, "__iter__") and hasattr(obj, "__getitem__")
588-
589-
590-
def _row_to_csv(obj):
591-
"""
592-
Args:
593-
obj:
594-
"""
595-
if isinstance(obj, str):
596-
return obj
597-
return ",".join(obj)
598-
599-
600498
class _CsvDeserializer(object):
601499
"""Placeholder docstring"""
602500

src/sagemaker/serializers.py

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,10 @@
1414
from __future__ import absolute_import
1515

1616
import abc
17+
import csv
18+
import io
19+
20+
import numpy as np
1721

1822

1923
class BaseSerializer(abc.ABC):
@@ -38,3 +42,100 @@ def serialize(self, data):
3842
@abc.abstractmethod
3943
def CONTENT_TYPE(self):
4044
"""The MIME type of the data sent to the inference endpoint."""
45+
46+
47+
class CSVSerializer(BaseSerializer):
48+
"""Placeholder docstring"""
49+
50+
CONTENT_TYPE = "text/csv"
51+
52+
def serialize(self, data):
53+
"""Take data of various data formats and serialize them into CSV.
54+
55+
Args:
56+
data (object): Data to be serialized.
57+
58+
Returns:
59+
object: Sequence of bytes to be used for the request body.
60+
"""
61+
# For inputs which represent multiple "rows", the result should be newline-separated CSV
62+
# rows
63+
if _is_mutable_sequence_like(data) and len(data) > 0 and _is_sequence_like(data[0]):
64+
return "\n".join([CSVSerializer._serialize_row(row) for row in data])
65+
return CSVSerializer._serialize_row(data)
66+
67+
@staticmethod
68+
def _serialize_row(data):
69+
# Don't attempt to re-serialize a string
70+
"""
71+
Args:
72+
data:
73+
"""
74+
if isinstance(data, str):
75+
return data
76+
if isinstance(data, np.ndarray):
77+
data = np.ndarray.flatten(data)
78+
if hasattr(data, "__len__"):
79+
if len(data) == 0:
80+
raise ValueError("Cannot serialize empty array")
81+
return _csv_serialize_python_array(data)
82+
83+
# files and buffers
84+
if hasattr(data, "read"):
85+
return _csv_serialize_from_buffer(data)
86+
87+
raise ValueError("Unable to handle input format: ", type(data))
88+
89+
90+
def _csv_serialize_python_array(data):
91+
"""
92+
Args:
93+
data:
94+
"""
95+
return _csv_serialize_object(data)
96+
97+
98+
def _csv_serialize_from_buffer(buff):
99+
"""
100+
Args:
101+
buff:
102+
"""
103+
return buff.read()
104+
105+
106+
def _csv_serialize_object(data):
107+
"""
108+
Args:
109+
data:
110+
"""
111+
csv_buffer = io.StringIO()
112+
113+
csv_writer = csv.writer(csv_buffer, delimiter=",")
114+
csv_writer.writerow(data)
115+
return csv_buffer.getvalue().rstrip("\r\n")
116+
117+
118+
def _is_mutable_sequence_like(obj):
119+
"""
120+
Args:
121+
obj:
122+
"""
123+
return _is_sequence_like(obj) and hasattr(obj, "__setitem__")
124+
125+
126+
def _is_sequence_like(obj):
127+
"""
128+
Args:
129+
obj:
130+
"""
131+
return hasattr(obj, "__iter__") and hasattr(obj, "__getitem__")
132+
133+
134+
def _row_to_csv(obj):
135+
"""
136+
Args:
137+
obj:
138+
"""
139+
if isinstance(obj, str):
140+
return obj
141+
return ",".join(obj)

src/sagemaker/sparkml/model.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
from sagemaker import Model, Predictor, Session
1717
from sagemaker.content_types import CONTENT_TYPE_CSV
1818
from sagemaker.fw_registry import registry
19-
from sagemaker.predictor import csv_serializer
19+
from sagemaker.serializers import CSVSerializer
2020

2121
framework_name = "sparkml-serving"
2222
repo_name = "sagemaker-sparkml-serving"
@@ -51,7 +51,7 @@ def __init__(self, endpoint_name, sagemaker_session=None):
5151
super(SparkMLPredictor, self).__init__(
5252
endpoint_name=endpoint_name,
5353
sagemaker_session=sagemaker_session,
54-
serializer=csv_serializer,
54+
serializer=CSVSerializer(),
5555
content_type=CONTENT_TYPE_CSV,
5656
)
5757

tests/integ/test_marketplace.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import sagemaker
2323
import tests.integ
2424
from sagemaker import AlgorithmEstimator, ModelPackage
25+
from sagemaker.serializers import CSVSerializer
2526
from sagemaker.tuner import IntegerParameter, HyperparameterTuner
2627
from sagemaker.utils import sagemaker_timestamp
2728
from sagemaker.utils import _aws_partition
@@ -136,10 +137,7 @@ def test_marketplace_attach(sagemaker_session, cpu_instance_type):
136137
training_job_name=training_job_name, sagemaker_session=sagemaker_session
137138
)
138139
predictor = estimator.deploy(
139-
1,
140-
cpu_instance_type,
141-
endpoint_name=endpoint_name,
142-
serializer=sagemaker.predictor.csv_serializer,
140+
1, cpu_instance_type, endpoint_name=endpoint_name, serializer=CSVSerializer()
143141
)
144142
shape = pandas.read_csv(os.path.join(data_path, "iris.csv"), header=None)
145143
a = [50 * i for i in range(3)]
@@ -165,7 +163,7 @@ def test_marketplace_model(sagemaker_session, cpu_instance_type):
165163
)
166164

167165
def predict_wrapper(endpoint, session):
168-
return sagemaker.Predictor(endpoint, session, serializer=sagemaker.predictor.csv_serializer)
166+
return sagemaker.Predictor(endpoint, session, serializer=CSVSerializer())
169167

170168
model = ModelPackage(
171169
role="SageMakerRole",

tests/integ/test_multi_variant_endpoint.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,8 @@
2525
from sagemaker.content_types import CONTENT_TYPE_CSV
2626
from sagemaker.utils import unique_name_from_base
2727
from sagemaker.amazon.amazon_estimator import get_image_uri
28-
from sagemaker.predictor import csv_serializer, Predictor
28+
from sagemaker.predictor import Predictor
29+
from sagemaker.serializers import CSVSerializer
2930

3031

3132
import tests.integ
@@ -169,7 +170,7 @@ def test_predict_invocation_with_target_variant(sagemaker_session, multi_variant
169170
predictor = Predictor(
170171
endpoint_name=multi_variant_endpoint.endpoint_name,
171172
sagemaker_session=sagemaker_session,
172-
serializer=csv_serializer,
173+
serializer=CSVSerializer(),
173174
content_type=CONTENT_TYPE_CSV,
174175
accept=CONTENT_TYPE_CSV,
175176
)
@@ -297,7 +298,7 @@ def test_predict_invocation_with_target_variant_local_mode(
297298
predictor = Predictor(
298299
endpoint_name=multi_variant_endpoint.endpoint_name,
299300
sagemaker_session=sagemaker_session,
300-
serializer=csv_serializer,
301+
serializer=CSVSerializer(),
301302
content_type=CONTENT_TYPE_CSV,
302303
accept=CONTENT_TYPE_CSV,
303304
)

tests/integ/test_tfs.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import tests.integ
2525
import tests.integ.timeout
2626
from sagemaker.tensorflow.model import TensorFlowModel, TensorFlowPredictor
27+
from sagemaker.serializers import CSVSerializer
2728

2829

2930
@pytest.fixture(scope="module")
@@ -236,9 +237,7 @@ def test_predict_csv(tfs_predictor):
236237
expected_result = {"predictions": [[3.5, 4.0, 5.5], [3.5, 4.0, 5.5]]}
237238

238239
predictor = TensorFlowPredictor(
239-
tfs_predictor.endpoint_name,
240-
tfs_predictor.sagemaker_session,
241-
serializer=sagemaker.predictor.csv_serializer,
240+
tfs_predictor.endpoint_name, tfs_predictor.sagemaker_session, serializer=CSVSerializer(),
242241
)
243242

244243
result = predictor.predict(input_data)

0 commit comments

Comments
 (0)