Skip to content

Add optional Arrow deserialization support #181

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions elastic_transport/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,13 @@
except ImportError:
pass

try:
from elastic_transport._serializer import PyArrowSerializer # noqa: F401

__all__.append("PyArrowSerializer")
except ImportError:
pass

_logger = logging.getLogger("elastic_transport")
_logger.addHandler(logging.NullHandler())
del _logger
Expand Down
31 changes: 31 additions & 0 deletions elastic_transport/_serializer.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@
except ModuleNotFoundError:
orjson = None # type: ignore[assignment]

try:
import pyarrow as pa
except ModuleNotFoundError:
pa = None # type: ignore[assignment]


class Serializer:
"""Serializer interface."""
Expand Down Expand Up @@ -192,12 +197,38 @@ def dumps(self, data: Any) -> bytes:
return bytes(buffer)


if pa is not None:

class PyArrowSerializer(Serializer):
"""PyArrow serializer for deserializing Arrow Stream data."""

mimetype: ClassVar[str] = "application/vnd.apache.arrow.stream"

def loads(self, data: bytes) -> pa.Table:
try:
with pa.ipc.open_stream(data) as reader:
return reader.read_all()
except pa.ArrowException as e:
raise SerializationError(
message=f"Unable to deserialize as Arrow stream: {data!r}",
errors=(e,),
)

def dumps(self, data: Any) -> bytes:
raise SerializationError(
message="Elasticsearch does not accept Arrow input data"
)


DEFAULT_SERIALIZERS = {
JsonSerializer.mimetype: JsonSerializer(),
TextSerializer.mimetype: TextSerializer(),
NdjsonSerializer.mimetype: NdjsonSerializer(),
}

if pa is not None:
DEFAULT_SERIALIZERS[PyArrowSerializer.mimetype] = PyArrowSerializer()


class SerializerCollection:
"""Collection of serializers that can be fetched by mimetype. Used by
Expand Down
1 change: 1 addition & 0 deletions noxfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ def lint(session):
"mypy==1.7.1",
"types-requests",
"types-certifi",
"pyarrow-stubs",
)
# https://github.com/python/typeshed/issues/10786
session.run(
Expand Down
1 change: 1 addition & 0 deletions requirements-min.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@ requests==2.26.0
urllib3==1.26.2
aiohttp==3.8.0
httpx==0.27.0
pyarrow==1.0.0
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@
"opentelemetry-api",
"opentelemetry-sdk",
"orjson",
"pyarrow",
# Override Read the Docs default (sphinx<2)
"sphinx>2",
"furo",
Expand Down
3 changes: 3 additions & 0 deletions tests/test_package.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ def test__all__sorted(module):
# Optional dependencies are added at the end
if "OrjsonSerializer" in module_all:
module_all.remove("OrjsonSerializer")
if "PyArrowSerializer" in module_all:
module_all.remove("PyArrowSerializer")

assert module_all == sorted(module_all)


Expand Down
21 changes: 21 additions & 0 deletions tests/test_serializer.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,14 @@
from datetime import date
from decimal import Decimal

import pyarrow as pa
import pytest

from elastic_transport import (
JsonSerializer,
NdjsonSerializer,
OrjsonSerializer,
PyArrowSerializer,
SerializationError,
SerializerCollection,
TextSerializer,
Expand Down Expand Up @@ -191,3 +193,22 @@ def test_ndjson_dumps():
b'{"key:"value"}\n'
b'{"bytes":"too"}\n'
)


def test_pyarrow_loads():
data = [
pa.array([1, 2, 3, 4]),
pa.array(["foo", "bar", "baz", None]),
pa.array([True, None, False, True]),
]
batch = pa.record_batch(data, names=["f0", "f1", "f2"])
sink = pa.BufferOutputStream()
with pa.ipc.new_stream(sink, batch.schema) as writer:
writer.write_batch(batch)

serializer = PyArrowSerializer()
assert serializer.loads(sink.getvalue()).to_pydict() == {
"f0": [1, 2, 3, 4],
"f1": ["foo", "bar", "baz", None],
"f2": [True, None, False, True],
}
Loading