Skip to content

Commit 0745fbc

Browse files
anmyachev authored and vnlitvinov committed
ready cython version, combined concat benchmarks
1 parent 9551aca commit 0745fbc

File tree

2 files changed: 56 additions (+56) and 132 deletions (-132)

asv_bench/benchmarks/io/parsers.py

Lines changed: 8 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -21,32 +21,16 @@ def time_check_datetimes(self, value):
2121

2222
class ConcatDateCols(object):
2323

24-
params = ([1234567890, 'AAAA'], [1, 2])
25-
param_names = ['value', 'dim']
24+
params = ([1234567890, 'AAAA'], [1, 2], [np.array, list])
25+
param_names = ['value', 'dim', 'container']
2626

27-
def setup(self, value, dim):
28-
count_elem = 1000000
27+
def setup(self, value, dim, container):
28+
count_elem = 10000
2929
if dim == 1:
30-
self.object = (np.array([value] * count_elem),)
30+
self.object = (container([value] * count_elem),)
3131
if dim == 2:
32-
self.object = (np.array([value] * count_elem),
33-
np.array([value] * count_elem))
32+
self.object = (container([value] * count_elem),
33+
container([value] * count_elem))
3434

35-
def time_check_concat(self, value, dim):
35+
def time_check_concat(self, value, dim, container):
3636
_concat_date_cols(self.object)
37-
38-
class ConcatDateColsList(object):
39-
40-
params = ([1234567890, 'AAAA'], [1, 2])
41-
param_names = ['value', 'dim']
42-
43-
def setup(self, value, dim):
44-
count_elem = 1000000
45-
if dim == 1:
46-
self.object = ([value] * count_elem,)
47-
if dim == 2:
48-
self.object = ([value] * count_elem,
49-
[value] * count_elem)
50-
51-
def time_check_concat(self, value, dim):
52-
_concat_date_cols(self.object)

pandas/_libs/lib.pyx

Lines changed: 48 additions & 108 deletions
Original file line number | Diff line number | Diff line change
@@ -9,10 +9,8 @@ import warnings
99
import cython
1010
from cython import Py_ssize_t
1111

12-
from cpython cimport (PyErr_SetString, Py_INCREF, PyTuple_SET_ITEM,
13-
PyTuple_New, PyObject_Str, PyList_SetItem,
14-
Py_EQ,
15-
PyObject_RichCompareBool,
12+
from cpython cimport (Py_INCREF, PyTuple_SET_ITEM, PyTuple_New, PyObject_Str,
13+
Py_EQ, Py_SIZE, PyObject_RichCompareBool,
1614
PyUnicode_Join, PyList_New)
1715

1816
from cpython.datetime cimport (PyDateTime_Check, PyDate_Check,
@@ -22,13 +20,11 @@ PyDateTime_IMPORT
2220

2321
import numpy as np
2422
cimport numpy as cnp
25-
from numpy cimport (ndarray, PyArray_GETITEM, PyArray_CheckExact,
23+
from numpy cimport (ndarray, PyArray_GETITEM,
2624
PyArray_ITER_DATA, PyArray_ITER_NEXT, PyArray_IterNew,
27-
flatiter, NPY_OBJECT, PyArray_SETITEM,
28-
int64_t, PyArray_GETPTR1,
29-
float32_t, float64_t, npy_intp, PyArray_NDIM,
30-
uint8_t, uint64_t, PyArray_ZEROS,
31-
complex128_t)
25+
flatiter, NPY_OBJECT,
26+
int64_t, float32_t, float64_t,
27+
uint8_t, uint64_t, complex128_t)
3228
cnp.import_array()
3329

3430
cdef extern from "numpy/arrayobject.h":
@@ -48,9 +44,6 @@ cdef extern from "numpy/arrayobject.h":
4844
object fields
4945
tuple names
5046

51-
cdef extern from "Python.h":
52-
object PyUnicode_FromFormat(const char *format, ...)
53-
5447

5548
cdef extern from "src/parse_helper.h":
5649
int floatify(object, float64_t *result, int *maybe_int) except -1
@@ -2365,126 +2358,73 @@ def fast_multiget(dict mapping, ndarray keys, default=np.nan):
23652358
return maybe_convert_objects(output)
23662359

23672360

2368-
cdef inline int convert_and_set_item(object item, Py_ssize_t index,
2369-
object result,
2370-
int keep_trivial_numbers):
2361+
cdef inline void convert_and_set_item(object item, Py_ssize_t index,
2362+
object[:] result,
2363+
bint keep_trivial_numbers):
23712364
cdef:
2372-
int do_convert = 1
2373-
object str_item
2374-
int int_item
2375-
double double_item
2365+
bint do_convert = 1
23762366

23772367
if keep_trivial_numbers:
2378-
if isinstance(item, int):
2379-
int_item = item
2380-
if int_item == 0:
2368+
if isinstance(item, int) and Py_SIZE(item) < 2:
2369+
if <int>item == 0:
23812370
do_convert = 0
23822371
elif isinstance(item, float):
2383-
double_item = item
2384-
if double_item == 0.0:
2372+
if <double>item == 0.0:
23852373
do_convert = 0
23862374

2387-
if do_convert:
2388-
if not isinstance(item, (str, bytes)):
2389-
str_item = PyObject_Str(item)
2390-
item = str_item
2391-
2392-
if PyArray_SETITEM(result, PyArray_GETPTR1(result, index), item):
2393-
PyErr_SetString(RuntimeError, "Cannot set resulting item")
2394-
return 0
2375+
if do_convert and not isinstance(item, (str, bytes)):
2376+
item = PyObject_Str(item)
23952377

2396-
return 1
2378+
result[index] = item
23972379

23982380

2399-
cpdef int put_object_as_unicode(object list, Py_ssize_t idx, object item):
2381+
cdef inline void put_object_as_unicode(object[:] lst, Py_ssize_t idx,
2382+
object item):
24002383
if not isinstance(item, str):
24012384
item = PyObject_Str(item)
2402-
Py_INCREF(item)
2403-
return 1 if PyList_SetItem(list, idx, item) == 0 else 0
2385+
lst[idx] = item
2386+
24042387

24052388
cpdef object _concat_date_cols(object date_cols,
24062389
object keep_trivial_numbers=False):
24072390
cdef:
2408-
object sequence
2409-
int keep_numbers, all_numpy = 1
2410-
Py_ssize_t sequence_size
2411-
Py_ssize_t array_size, min_array_size = 0
2412-
Py_ssize_t i, j
2413-
object result, arrays
2414-
object array, fast_array, item
2415-
npy_intp dims[1]
2416-
object separator
2391+
bint keep_numbers
2392+
Py_ssize_t sequence_size, i, j
2393+
Py_ssize_t array_size, min_size
2394+
object result
2395+
object separator = " "
24172396
object list_to_join, result_string
2397+
object[:] list_view
2398+
object[:] result_view
2399+
object[:] iterator
2400+
object[::] arrays
24182401

2419-
sequence = date_cols
24202402
keep_numbers = keep_trivial_numbers
24212403
sequence_size = len(date_cols)
24222404

2423-
if sequence_size == -1:
2424-
return None
2425-
elif sequence_size == 0:
2426-
return np.zeros(0, dtype=object)
2405+
if sequence_size == 0:
2406+
result = np.zeros(0, dtype=object)
24272407
elif sequence_size == 1:
2428-
array = sequence[0]
2429-
array_size = len(array)
2430-
dims[0] = array_size
2431-
result = PyArray_ZEROS(1, dims, NPY_OBJECT, 0)
2432-
if PyArray_CheckExact(array):
2433-
for i in range(array_size):
2434-
item = PyArray_GETITEM(array,
2435-
PyArray_GETPTR1(array, i))
2436-
if not convert_and_set_item(item, i, result, keep_numbers):
2437-
raise RuntimeError
2438-
else:
2439-
if not isinstance(array, (tuple, list)):
2440-
fast_array = tuple(array)
2441-
else:
2442-
fast_array = array
2443-
for i in range(array_size):
2444-
item = fast_array[i]
2445-
if not convert_and_set_item(item, i, result, keep_numbers):
2446-
raise RuntimeError
2447-
2448-
return result
2408+
iterator = date_cols[0]
2409+
array_size = len(iterator)
2410+
result = np.zeros(array_size, dtype=object)
2411+
result_view = result
2412+
for i in range(array_size):
2413+
convert_and_set_item(iterator[i], i, result_view, keep_numbers)
24492414
else:
2450-
arrays = list(sequence)
2451-
for i in range(sequence_size):
2452-
array = arrays[i]
2453-
if PyArray_CheckExact(array):
2454-
if PyArray_NDIM(array) != 1:
2455-
raise RuntimeError("ndarrays must be 1-dimentional")
2456-
elif not isinstance(array, (tuple, list)):
2457-
all_numpy = 0
2458-
fast_array = tuple(array)
2459-
array = fast_array
2460-
else:
2461-
all_numpy = 0
2462-
if len(array) < min_array_size or min_array_size == 0:
2463-
min_array_size = len(array)
2464-
dims[0] = min_array_size
2465-
result = PyArray_ZEROS(1, dims, NPY_OBJECT, 0)
2415+
arrays = date_cols
24662416

2467-
separator = PyUnicode_FromFormat(" ")
2468-
list_to_join = PyList_New(sequence_size)
2417+
min_size = min([len(arr) for arr in date_cols])
2418+
result = np.zeros(min_size, dtype=object)
2419+
result_view = result
24692420

2470-
for i in range(min_array_size):
2471-
if all_numpy:
2472-
for j in range(sequence_size):
2473-
array = arrays[j]
2474-
item = PyArray_GETITEM(array, PyArray_GETPTR1(array, i))
2475-
if not put_object_as_unicode(list_to_join, j, item):
2476-
raise RuntimeError
2477-
else:
2478-
for j in range(sequence_size):
2479-
array = arrays[j]
2480-
item = array[i]
2481-
if not put_object_as_unicode(list_to_join, j, item):
2482-
raise RuntimeError
2421+
list_to_join = PyList_New(sequence_size)
2422+
list_view = list_to_join
24832423

2424+
for i in range(min_size):
2425+
for j in range(sequence_size):
2426+
put_object_as_unicode(list_view, j, arrays[j][i])
24842427
result_string = PyUnicode_Join(separator, list_to_join)
2428+
result_view[i] = result_string
24852429

2486-
if (PyArray_SETITEM(result, PyArray_GETPTR1(result, i),
2487-
result_string) != 0):
2488-
raise RuntimeError("Cannot set resulting item")
2489-
2490-
return result
2430+
return result

0 commit comments

Comments
 (0)