Skip to content

Add pcodec #501

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Feb 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci-linux.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ jobs:
run: |
conda activate env
export DISABLE_NUMCODECS_AVX2=""
python -m pip install -v -e .[test,test_extras,msgpack,zfpy]
python -m pip install -v -e .[test,test_extras,msgpack,zfpy,pcodec]

- name: List installed packages
shell: "bash -l {0}"
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/ci-osx.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ jobs:
run: |
conda activate env
export DISABLE_NUMCODECS_AVX2=""
python -m pip install -v -e .[test,test_extras,msgpack,zfpy]
python -m pip install -v -e .[test,test_extras,msgpack,zfpy,pcodec]

- name: List installed packages
shell: "bash -l {0}"
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/ci-windows.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ jobs:
shell: "bash -l {0}"
run: |
conda activate env
python -m pip install -v -e .[test,test_extras,msgpack,zfpy]
python -m pip install -v -e .[test,test_extras,msgpack,zfpy,pcodec]

- name: List installed packages
shell: "bash -l {0}"
Expand Down
2 changes: 1 addition & 1 deletion c-blosc
2 changes: 1 addition & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def __getattr__(cls, name):
return Mock()


MOCK_MODULES = ['msgpack']
MOCK_MODULES = ['msgpack', 'pcodec']
sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES)


Expand Down
1 change: 1 addition & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ Contents
abc
registry
blosc
pcodec
lz4
zfpy
zstd
Expand Down
10 changes: 10 additions & 0 deletions docs/pcodec.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
PCodec
======

.. automodule:: numcodecs.pcodec

.. autoclass:: PCodec

.. autoattribute:: codec_id
.. automethod:: encode
.. automethod:: decode
2 changes: 2 additions & 0 deletions docs/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ Unreleased
Enhancements
~~~~~~~~~~~~

* Add PCodec
By :user:`Ryan Abernathey <rabernat>`, :issue:`501`.
* Use PyData theme for docs
By :user:`John Kirkham <jakirkham>`, :issue:`485`.

Expand Down
Binary file added fixture/pcodec/array.00.npy
Binary file not shown.
Binary file added fixture/pcodec/array.01.npy
Binary file not shown.
Binary file added fixture/pcodec/array.02.npy
Binary file not shown.
Binary file added fixture/pcodec/array.03.npy
Binary file not shown.
Binary file added fixture/pcodec/array.04.npy
Binary file not shown.
Binary file added fixture/pcodec/array.05.npy
Binary file not shown.
Binary file added fixture/pcodec/array.06.npy
Binary file not shown.
Binary file added fixture/pcodec/array.07.npy
Binary file not shown.
Binary file added fixture/pcodec/array.08.npy
Binary file not shown.
Binary file added fixture/pcodec/array.09.npy
Binary file not shown.
8 changes: 8 additions & 0 deletions fixture/pcodec/codec.00/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"delta_encoding_order": null,
"equal_pages_up_to": 262144,
"float_mult_spec": "enabled",
"id": "pcodec",
"int_mult_spec": "enabled",
"level": 8
}
Binary file added fixture/pcodec/codec.00/encoded.00.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.00/encoded.01.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.00/encoded.02.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.00/encoded.03.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.00/encoded.04.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.00/encoded.05.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.00/encoded.06.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.00/encoded.07.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.00/encoded.08.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.00/encoded.09.dat
Binary file not shown.
8 changes: 8 additions & 0 deletions fixture/pcodec/codec.01/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"delta_encoding_order": null,
"equal_pages_up_to": 262144,
"float_mult_spec": "enabled",
"id": "pcodec",
"int_mult_spec": "enabled",
"level": 1
}
Binary file added fixture/pcodec/codec.01/encoded.00.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.01/encoded.01.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.01/encoded.02.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.01/encoded.03.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.01/encoded.04.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.01/encoded.05.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.01/encoded.06.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.01/encoded.07.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.01/encoded.08.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.01/encoded.09.dat
Binary file not shown.
8 changes: 8 additions & 0 deletions fixture/pcodec/codec.02/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"delta_encoding_order": null,
"equal_pages_up_to": 262144,
"float_mult_spec": "enabled",
"id": "pcodec",
"int_mult_spec": "enabled",
"level": 5
}
Binary file added fixture/pcodec/codec.02/encoded.00.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.02/encoded.01.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.02/encoded.02.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.02/encoded.03.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.02/encoded.04.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.02/encoded.05.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.02/encoded.06.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.02/encoded.07.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.02/encoded.08.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.02/encoded.09.dat
Binary file not shown.
8 changes: 8 additions & 0 deletions fixture/pcodec/codec.03/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"delta_encoding_order": null,
"equal_pages_up_to": 262144,
"float_mult_spec": "enabled",
"id": "pcodec",
"int_mult_spec": "enabled",
"level": 9
}
Binary file added fixture/pcodec/codec.03/encoded.00.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.03/encoded.01.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.03/encoded.02.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.03/encoded.03.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.03/encoded.04.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.03/encoded.05.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.03/encoded.06.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.03/encoded.07.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.03/encoded.08.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.03/encoded.09.dat
Binary file not shown.
8 changes: 8 additions & 0 deletions fixture/pcodec/codec.04/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"delta_encoding_order": null,
"equal_pages_up_to": 262144,
"float_mult_spec": "disabled",
"id": "pcodec",
"int_mult_spec": "disabled",
"level": 8
}
Binary file added fixture/pcodec/codec.04/encoded.00.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.04/encoded.01.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.04/encoded.02.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.04/encoded.03.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.04/encoded.04.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.04/encoded.05.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.04/encoded.06.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.04/encoded.07.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.04/encoded.08.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.04/encoded.09.dat
Binary file not shown.
8 changes: 8 additions & 0 deletions fixture/pcodec/codec.05/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"delta_encoding_order": null,
"equal_pages_up_to": 300,
"float_mult_spec": "enabled",
"id": "pcodec",
"int_mult_spec": "enabled",
"level": 8
}
Binary file added fixture/pcodec/codec.05/encoded.00.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.05/encoded.01.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.05/encoded.02.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.05/encoded.03.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.05/encoded.04.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.05/encoded.05.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.05/encoded.06.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.05/encoded.07.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.05/encoded.08.dat
Binary file not shown.
Binary file added fixture/pcodec/codec.05/encoded.09.dat
Binary file not shown.
3 changes: 3 additions & 0 deletions numcodecs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,3 +115,6 @@

from numcodecs.fletcher32 import Fletcher32
register_codec(Fletcher32)

from numcodecs.pcodec import PCodec
register_codec(PCodec)
89 changes: 89 additions & 0 deletions numcodecs/pcodec.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
from typing import Optional, Literal

import numcodecs
import numcodecs.abc
from numcodecs.compat import ensure_contiguous_ndarray

try:
from pcodec import standalone, ChunkConfig, PagingSpec
except ImportError: # pragma: no cover
standalone = None


DEFAULT_MAX_PAGE_N = 262144


class PCodec(numcodecs.abc.Codec):
"""
PCodec (or pco, pronounced "pico") losslessly compresses and decompresses
numerical sequences with high compression ratio and fast speed.

See `PCodec Repo <https://github.com/mwlon/pcodec>`_ for more information.

PCodec supports only the following numerical dtypes: uint32, uint64, int32,
int64, float32, and float64.

Parameters
----------
level : int
A compression level from 0-12, where 12 takes the longest and compresses
the most.
delta_encoding_order : int or None
Either a delta encoding level from 0-7 or None. If set to None, pcodec
will try to infer the optimal delta encoding order.
int_mult_spec : {'enabled', 'disabled'}
If enabled, pcodec will consider using int mult mode, which can
substantially improve compression ratio but decrease speed in some cases
for integer types.
float_mult_spec : {'enabled', 'disabled'}
If enabled, pcodec will consider using float mult mode, which can
substantially improve compression ratio but decrease speed in some cases
for float types.
equal_pages_up_to : int
Divide the chunk into equal pages of up to this many numbers.
"""

codec_id = "pcodec"

def __init__(
    self,
    level: int = 8,
    delta_encoding_order: Optional[int] = None,
    int_mult_spec: Literal["enabled", "disabled"] = "enabled",
    float_mult_spec: Literal["enabled", "disabled"] = "enabled",
    equal_pages_up_to: int = DEFAULT_MAX_PAGE_N,
):
    """Record the compression settings on the instance.

    The arguments are stored unchanged as attributes of the same name;
    no validation is performed here.

    Raises
    ------
    ImportError
        If the optional ``pcodec`` package could not be imported.
    """
    # `standalone` is left as None when the module-level import of
    # pcodec fails, so fail here rather than later inside encode/decode.
    if standalone is None:  # pragma: no cover
        raise ImportError(
            "pcodec must be installed to use the PCodec codec."
        )

    # note that we use `level` instead of `compression_level` to
    # match other codecs
    self.level = level
    self.delta_encoding_order = delta_encoding_order
    self.int_mult_spec = int_mult_spec
    self.float_mult_spec = float_mult_spec
    self.equal_pages_up_to = equal_pages_up_to

def encode(self, buf):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
def encode(self, buf):
def encode(self, buf) -> bytes:

?
(This may be a good way to label array-bytes codecs; maybe also type for buf should be ndarray)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not convinced that there is much value in adding these sorts of type hints if we are not actually running type checking on the library.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it not for other users of the library, and their IDEs?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But incorrect type hints are worse than none at all! For example, is ndarray really the correct type for buf? Maybe, but who knows? I could add it, but without running mypy, we'll never know for sure.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

True that correctness is important, of course; but this is like the "light" version of array->array V array->bytes V bytes->bytes. Still useful. You can always get around mypy too, if you want.

# Normalize the input through ensure_contiguous_ndarray before it is
# handed to pcodec.
buf = ensure_contiguous_ndarray(buf)

# Paging is driven by the `equal_pages_up_to` setting stored on the codec.
paging_spec = PagingSpec.equal_pages_up_to(self.equal_pages_up_to)

# Map this codec's attributes onto pcodec's ChunkConfig; note that our
# `level` is passed as pcodec's `compression_level`.
config = ChunkConfig(
compression_level=self.level,
delta_encoding_order=self.delta_encoding_order,
int_mult_spec=self.int_mult_spec,
float_mult_spec=self.float_mult_spec,
paging_spec=paging_spec,
)
# The result of standalone.simple_compress is returned as-is.
return standalone.simple_compress(buf, config)

def decode(self, buf, out=None):
    """Decompress `buf`, optionally writing the result into `out`.

    When `out` is None the value of ``standalone.simple_decompress(buf)``
    is returned. Otherwise `out` is passed through
    ``ensure_contiguous_ndarray``, filled by
    ``standalone.simple_decompress_into`` and that array is returned.
    """
    if out is None:
        return standalone.simple_decompress(buf)

    out = ensure_contiguous_ndarray(out)
    standalone.simple_decompress_into(buf, out)
    return out
11 changes: 11 additions & 0 deletions numcodecs/tests/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,17 @@ def check_encode_decode_array(arr, codec):
assert_array_items_equal(arr, dec)


def check_encode_decode_array_to_bytes(arr, codec):
    """Round-trip `arr` through `codec`, with and without an `out` buffer."""
    encoded = codec.encode(arr)

    # decode letting the codec produce the result itself
    assert_array_items_equal(arr, codec.decode(encoded))

    # decode into a caller-supplied array shaped like the input
    target = np.empty_like(arr)
    codec.decode(encoded, out=target)
    assert_array_items_equal(arr, target)


def check_config(codec):
config = codec.get_config()
# round-trip through JSON to check serialization
Expand Down
78 changes: 78 additions & 0 deletions numcodecs/tests/test_pcodec.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import pytest
import numpy as np

from numcodecs.pcodec import PCodec

try:
# initializing codec triggers ImportError
PCodec()
except ImportError: # pragma: no cover
pytest.skip(
"pcodec not available", allow_module_level=True
)

from numcodecs.tests.common import (
check_encode_decode_array_to_bytes,
check_config,
check_repr,
check_backwards_compatibility,
check_err_decode_object_buffer,
check_err_encode_object_buffer,
)


# Codec configurations exercised by the tests below: the defaults, three
# other compression levels, both mult specs disabled, and a small page size.
# NOTE(review): the order appears to line up with the
# fixture/pcodec/codec.NN directories -- confirm before reordering.
codecs = [
PCodec(),
PCodec(level=1),
PCodec(level=5),
PCodec(level=9),
PCodec(float_mult_spec="disabled", int_mult_spec="disabled"),
PCodec(equal_pages_up_to=300),
]


# mix of dtypes: integer, float
# mix of shapes: 1D, 2D
# mix of orders: C, F
# The np.random.* entries are generated without a fixed seed, so their
# values differ from run to run.
arrays = [
np.arange(1000, dtype="u4"),
np.arange(1000, dtype="u8"),
np.arange(1000, dtype="i4"),
np.arange(1000, dtype="i8"),
np.linspace(1000, 1001, 1000, dtype="f4"),
np.linspace(1000, 1001, 1000, dtype="f8"),
np.random.normal(loc=1000, scale=1, size=(100, 10)),
np.asfortranarray(np.random.normal(loc=1000, scale=1, size=(100, 10))),
np.random.randint(0, 2**60, size=1000, dtype="u8"),
np.random.randint(-(2**63), -(2**63) + 20, size=1000, dtype="i8"),
]


@pytest.mark.parametrize("arr", arrays)
@pytest.mark.parametrize("codec", codecs)
def test_encode_decode(arr, codec):
    """Round-trip every array through every codec configuration."""
    check_encode_decode_array_to_bytes(arr=arr, codec=codec)


def test_config():
    """Run the shared config check on a codec with a non-default level."""
    check_config(PCodec(level=3))


def test_repr():
    """Run the shared repr check on the expected PCodec repr string."""
    expected = (
        "PCodec(delta_encoding_order=None, equal_pages_up_to=262144, float_mult_spec='enabled', "
        "int_mult_spec='enabled', level=3)"
    )
    check_repr(expected)


def test_backwards_compatibility():
    """Run the shared backwards-compatibility check for the pcodec codec id."""
    codec_id = PCodec.codec_id
    check_backwards_compatibility(codec_id, arrays, codecs)


def test_err_decode_object_buffer():
    """Run the shared decode-object-buffer error check on a default codec."""
    codec = PCodec()
    check_err_decode_object_buffer(codec)


def test_err_encode_object_buffer():
    """Run the shared encode-object-buffer error check on a default codec."""
    codec = PCodec()
    check_err_encode_object_buffer(codec)
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,9 @@ msgpack = [
zfpy = [
"zfpy>=1.0.0",
]
pcodec = [
"pcodec>=0.1.0",
]

[tool.setuptools]
license-files = ["LICENSE.txt"]
Expand Down