change: Support protobuf4 (#3985)

claytonparnell · web-flow · commit 53fe9b7456b1 · 2023-07-12T16:30:39.000-07:00
* Support protobuf4

* formatting...

* Ignore pylint

* Add .proto file for ref

* Add pyi file

* more formatting..

* Raise lower bound of protobuf

* Support protobuf 3 AND 4

* Update lower bound to ensure compatibility. Loosen PyYAML
diff --git a/setup.py b/setup.py
@@ -52,14 +52,14 @@ def read_requirements(filename):
     "cloudpickle==2.2.1",
     "google-pasta",
     "numpy>=1.9.0,<2.0",
-    "protobuf>=3.1,<4.0",
+    "protobuf>=3.12,<5.0",
     "smdebug_rulesconfig==1.0.1",
-    "importlib-metadata>=1.4.0,<5.0",
+    "importlib-metadata>=1.4.0,<7.0",
     "packaging>=20.0",
     "pandas",
     "pathos",
     "schema",
-    "PyYAML==6.0",
+    "PyYAML~=6.0",
     "jsonschema",
     "platformdirs",
     "tblib==1.7.0",
diff --git a/src/sagemaker/amazon/record.proto b/src/sagemaker/amazon/record.proto
@@ -0,0 +1,123 @@
+syntax = "proto2";
+
+package aialgs.data;
+
+option java_package = "com.amazonaws.aialgorithms.proto";
+option java_outer_classname = "RecordProtos";
+
+// A sparse or dense rank-R tensor that stores data as doubles (float64).
+message Float32Tensor   {
+    // Each value in the vector. If keys is empty this is treated as a
+    // dense vector.
+    repeated float values = 1 [packed = true];
+
+    // If not empty then the vector is treated as sparse with
+    // each key specifying the location of the value in the sparse vector.
+    repeated uint64 keys = 2 [packed = true];
+
+    // Optional shape which will allow the vector to represent a matrix.
+    // e.g. if shape = [ 10, 20 ] then floor(keys[i] / 10) will give the row
+    // and keys[i] % 20 will give the column.
+    // This also supports n-dimensonal tensors.
+    // NB. this must be specified if the tensor is sparse.
+    repeated uint64 shape = 3 [packed = true];
+}
+
+// A sparse or dense rank-R tensor that stores data as doubles (float64).
+message Float64Tensor {
+    // Each value in the vector. If keys is empty this is treated as a
+    // dense vector.
+    repeated double values = 1 [packed = true];
+
+    // If not empty then the vector is treated as sparse with
+    // each key specifying the location of the value in the sparse vector.
+    repeated uint64 keys = 2 [packed = true];
+
+    // Optional shape which will allow the vector to represent a matrix.
+    // e.g. if shape = [ 10, 20 ] then floor(keys[i] / 10) will give the row
+    // and keys[i] % 20 will give the column.
+    // This also supports n-dimensonal tensors.
+    // NB. this must be specified if the tensor is sparse.
+    repeated uint64 shape = 3 [packed = true];
+}
+
+// A sparse or dense rank-R tensor that stores data as 32-bit ints (int32).
+message Int32Tensor {
+    // Each value in the vector. If keys is empty this is treated as a
+    // dense vector.
+    repeated int32 values = 1 [packed = true];
+
+    // If not empty then the vector is treated as sparse with
+    // each key specifying the location of the value in the sparse vector.
+    repeated uint64 keys = 2 [packed = true];
+
+    // Optional shape which will allow the vector to represent a matrix.
+    // e.g. if shape = [ 10, 20 ] then floor(keys[i] / 10) will give the row
+    // and keys[i] % 20 will give the column.
+    // This also supports n-dimensonal tensors.
+    // NB. this must be specified if the tensor is sparse.
+    repeated uint64 shape = 3 [packed = true];
+}
+
+// Support for storing binary data for parsing in other ways (such as JPEG/etc).
+// This is an example of another type of value and may not immediately be supported.
+message Bytes {
+    repeated bytes value = 1;
+
+    // Stores the content type of the data if known.
+    // This will allow the possibility of using decoders for common formats
+    // in the future.
+    optional string content_type = 2;
+}
+
+message Value {
+    oneof value {
+        // The numbering assumes the possible use of:
+        // - float16, float128
+        // - int8, int16, int32
+        Float32Tensor float32_tensor = 2;
+        Float64Tensor float64_tensor = 3;
+        Int32Tensor int32_tensor = 7;
+        Bytes bytes = 9;
+    }
+}
+
+message Record {
+    // Map from the name of the feature to the value.
+    //
+    // For vectors and libsvm-like datasets,
+    // a single feature with the name `values`
+    // should be specified.
+    map<string, Value> features = 1;
+
+    // Optional set of labels for this record.
+    // Similar to features field above, the key used for
+    // generic scalar / vector labels should ve 'values'
+    map<string, Value> label = 2;
+
+    // Unique identifier for this record in the dataset.
+    //
+    // Whilst not necessary, this allows better
+    // debugging where there are data issues.
+    //
+    // This is not used by the algorithm directly.
+    optional string uid = 3;
+
+    // Textual metadata describing the record.
+    //
+    // This may include JSON-serialized information
+    // about the source of the record.
+    //
+    // This is not used by the algorithm directly.
+    optional string metadata = 4;
+
+    // Optional serialized JSON object that allows per-record
+    // hyper-parameters/configuration/other information to be set.
+    //
+    // The meaning/interpretation of this field is defined by
+    // the algorithm author and may not be supported.
+    //
+    // This is used to pass additional inference configuration
+    // when batch inference is used (e.g. types of scores to return).
+    optional string configuration = 5;
+}
diff --git a/src/sagemaker/amazon/record_pb2.py b/src/sagemaker/amazon/record_pb2.py