Skip to content

Faster isin comparisons #7

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Feb 23, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 74 additions & 14 deletions cyberpandas/ip_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
delegated_method)
from ._utils import combine, pack, unpack
from .common import _U8_MAX, _IPv4_MAX
from .parser import _to_ipaddress_pyint
from .parser import _to_ipaddress_pyint, _as_ip_object

# -----------------------------------------------------------------------------
# Extension Type
Expand Down Expand Up @@ -286,34 +286,94 @@ def packed(self):
return self.data.tobytes()

def isin(self, other):
"""
"""Check whether elements of 'self' are in 'other'.

Comparison is done elementwise.

Parameters
----------
other : str or sequences
For ``str`` 'other', the argument is attempted to
be converted to an :class:`ipaddress.IPv4Network` or
a :class:`ipaddress.IPv6Network` or an :class:`IPArray`.
If all those conversions fail, a TypeError is raised.

For a sequence of strings, the same conversion is attempted.
You should not mix networks with addresses.

Finally, other may be an ``IPArray`` of addresses to compare to.

Returns
-------
contained : ndarray
A 1-D boolean ndarray with the same length as self.

Examples
--------
Comparison to a single network

>>> s = IPArray(['192.168.1.1', '255.255.255.255'])
>>> s.isin('192.168.1.0/24')
array([ True, False])

Comparison to many networks
>>> s.isin(['192.168.1.0/24', '192.168.2.0/24'])
array([ True, False])

Comparison to many IP Addresses

>>> s.isin(['192.168.1.1', '192.168.1.2', '255.255.255.1']])
array([ True, False])
"""
if isinstance(other, str) or not isinstance(other,
collections.Sequence):
box = (isinstance(other, str) or
not isinstance(other, (IPArray, collections.Sequence)))
if box:
other = [other]

networks = []
for net in other:
try:
networks.append(ipaddress.IPv4Network(net))
except ValueError:
networks.append(ipaddress.IPv6Network(net))
addresses = []

if not isinstance(other, IPArray):
for net in other:
net = _as_ip_object(net)
if isinstance(net, (ipaddress.IPv4Network,
ipaddress.IPv6Network)):
networks.append(net)
if isinstance(net, (ipaddress.IPv4Address,
ipaddress.IPv6Address)):
addresses.append(ipaddress.IPv6Network(net))
else:
addresses = other

# Flatten all the addresses
addresses = IPArray(addresses) # TODO: think about copy=False

# TODO: perf
pyips = self.to_pyipaddress()
mask = np.zeros(len(self), dtype='bool')
for network in networks:
for i, ip in enumerate(pyips):
if ip in network:
mask[i] = True
mask |= self._isin_network(network)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there no way to short-circuit here? Or is that something for a future cythonized method...

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why loop to create networks and then loop again?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unless I'm missing something, I don't think there's anything to short-circut. We may not know whether an element of self is in any of the other networks until the last network.

Some context, I expected self to be large and networks to be small. So I'm OK looping over networks.


# no... we should flatten this.
mask |= self._isin_addresses(addresses)
return mask

def _isin_network(self, other):
# type: (Union[ipaddress.IPv4Network,ipaddress.IPv6Network]) -> ndarray
"""Check whether an array of addresses is contained in a network."""
# A network is bounded below by 'network_address' and
# above by 'broadcast_address'.
# IPArray handles comparisons between arrays of addresses, and NumPy
# handles broadcasting.
net_lo = type(self)([other.network_address])

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will this fail on attempt to compare IP4 and IP6?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Once we're in an IPArray, we don't care about IPv4 vs. IPv6. Everything is 128 bits.

net_hi = type(self)([other.broadcast_address])

return (net_lo <= self) & (self <= net_hi)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this using ipaddress's internal comparisons, or is this implemented by us elsewhere? I like that it ended up as a simple one-liner.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Implemented by us in

def __lt__(self, other):
.

Amazing what a night of sleep can do to change your perspective on a problem :)


def _isin_addresses(self, other):
"""Check whether elements of self are present in other."""
from pandas.core.algorithms import isin
# TODO(factorize): replace this
return isin(self, other)

def setitem(self, indexer, value):
"""Set the 'value' inplace.
"""
Expand Down
20 changes: 20 additions & 0 deletions cyberpandas/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,3 +81,23 @@ def _to_ipaddress_pyint(values):

values2 = [unpack(pack(x)) for x in values]
return np.atleast_1d(np.asarray(values2, dtype=IPType.mybase))


def _as_ip_object(val):
"""Attempt to parse 'val' as any IP object.

Attempts to parse as these in order:

- IP Address (v4 or v6)
- IP Network (v4 or v6)
"""
try:
return ipaddress.ip_address(val)
except ValueError:
pass

try:
return ipaddress.ip_network(val)
except ValueError:
raise ValueError("Could not parse {} is an address or "
"network".format(val))
39 changes: 38 additions & 1 deletion cyberpandas/test_ip.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,13 +194,50 @@ def test_attributes(prop):
tm.assert_numpy_array_equal(result, expected)


def test_isin():
def test_isin_all4():
s = ip.IPArray([u'192.168.1.1', u'255.255.255.255'])
result = s.isin([u'192.168.1.0/24'])
expected = np.array([True, False])
tm.assert_numpy_array_equal(result, expected)


def test_isin_all6():
s = ip.IPArray([u'2001:db8::1', u'2001:db9::1'])
result = s.isin([u'2001:db8::0/96'])
expected = np.array([True, False])
tm.assert_numpy_array_equal(result, expected)


def test_isin_mix():
s = ip.IPArray([u'192.168.1.1', u'255.255.255.255',
u'2001:db8::1', u'2001:db9::1'])

result = s.isin([u'192.168.1.0/24'])
expected = np.array([True, False, False, False])
tm.assert_numpy_array_equal(result, expected)

result = s.isin([u'2001:db8::0/96'])
expected = np.array([False, False, True, False])
tm.assert_numpy_array_equal(result, expected)

result = s.isin([u'192.168.1.0/24', u'2001:db8::0/96'])
expected = np.array([True, False, True, False])
tm.assert_numpy_array_equal(result, expected)

s = ip.IPArray([u'192.168.1.1', u'192.168.1.2',
u'255.255.255.255'])
result = s.isin([u'192.168.1.0/24'])
expected = np.array([True, True, False])
tm.assert_numpy_array_equal(result, expected)


def test_isin_iparray():
s = ip.IPArray([10, 20, 20, 30])
result = s.isin(ip.IPArray([30, 20]))
expected = np.array([False, True, True, True])
tm.assert_numpy_array_equal(result, expected)


def test_getitem_scalar():
ser = ip.IPArray([0, 1, 2])
result = ser[1]
Expand Down
25 changes: 25 additions & 0 deletions cyberpandas/test_parser.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import ipaddress

import pytest

from cyberpandas import parser, IPArray
Expand Down Expand Up @@ -31,3 +33,26 @@ def test_to_ipaddress_scalar():
expected = parser.to_ipaddress([1])
assert len(result) == 1
assert all(result == expected)


@pytest.mark.parametrize('val, expected', [
(u'192.168.1.1', ipaddress.IPv4Address(u'192.168.1.1')),
(100, ipaddress.IPv4Address(100)),
(ipaddress.IPv4Address(100), ipaddress.IPv4Address(100)),
(2**64, ipaddress.IPv6Address(2**64)),
(u'192.168.0.0/28', ipaddress.IPv4Network(u'192.168.0.0/28')),
(ipaddress.IPv4Network(u'192.168.0.0/28'),
ipaddress.IPv4Network(u'192.168.0.0/28')),
(u'2001:db00::0/24', ipaddress.IPv6Network(u'2001:db00::0/24')),
])
def test_as_ip_object(val, expected):
result = parser._as_ip_object(val)
assert result == expected


@pytest.mark.parametrize("val", [
u"129", -1
])
def test_as_ip_object_raises(val):
with pytest.raises(ValueError):
parser._as_ip_object(val)
2 changes: 0 additions & 2 deletions docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,5 @@
]




# Example configuration for intersphinx: refer to the Python standard library.
intersphinx_mapping = {'https://docs.python.org/': None}