Skip to content

Removes pyquickhelper as a dependency #33

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Jul 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/check-urls.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,5 +43,5 @@ jobs:
timeout: 2
retry_count# : 2
# exclude_urls: https://hal.archives-ouvertes.fr/hal-00990252/document
# exclude_patterns: https://www.data.gouv.fr/fr/datasets/r/e3d83ab3-dc52-4c99-abaf-8a38050cc68c,https://dev.azure.com/
exclude_patterns: https://circleci.com/gh/sdpython/pandas_streaming/
# force_pass : true
2 changes: 1 addition & 1 deletion .local.jenkins.lin.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ virtualenv:

install:
- $PYINT -m pip install --upgrade pip
- $PYINT -m pip install --upgrade --no-cache-dir --no-deps --index http://localhost:8067/simple/ jyquickhelper pyquickhelper pandas_streaming --extra-index-url=https://pypi.python.org/simple/
- $PYINT -m pip install --upgrade --no-cache-dir --no-deps --index http://localhost:8067/simple/ jyquickhelper pandas_streaming --extra-index-url=https://pypi.python.org/simple/
- $PYINT -m pip install -r requirements.txt
- $PYINT -m pip install -r requirements-dev.txt
- $PYINT --version
Expand Down
4 changes: 2 additions & 2 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ pandas-streaming: streaming API over pandas
:target: https://ci.appveyor.com/project/sdpython/pandas-streaming
:alt: Build Status Windows

.. image:: https://circleci.com/gh/sdpython/pandas_streaming/tree/main.svg?style=svg
:target: https://circleci.com/gh/sdpython/pandas_streaming/tree/main
.. image:: https://dl.circleci.com/status-badge/img/gh/sdpython/pandas-streaming/tree/main.svg?style=svg
:target: https://dl.circleci.com/status-badge/redirect/gh/sdpython/pandas-streaming/tree/main

.. image:: https://dev.azure.com/xavierdupre3/pandas_streaming/_apis/build/status/sdpython.pandas_streaming
:target: https://dev.azure.com/xavierdupre3/pandas_streaming/
Expand Down
2 changes: 1 addition & 1 deletion _doc/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@

# The following is used by sphinx.ext.linkcode to provide links to github
linkcode_resolve = make_linkcode_resolve(
"pandas_streaming",
"pandas-streaming",
(
"https://github.com/sdpython/pandas-streaming/"
"blob/{revision}/{package}/"
Expand Down
4 changes: 2 additions & 2 deletions _doc/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ pandas-streaming: streaming API over pandas
:target: https://ci.appveyor.com/project/sdpython/pandas-streaming
:alt: Build Status Windows

.. image:: https://circleci.com/gh/sdpython/pandas_streaming/tree/main.svg?style=svg
:target: https://circleci.com/gh/sdpython/pandas_streaming/tree/main
.. image:: https://dl.circleci.com/status-badge/img/gh/sdpython/pandas-streaming/tree/main.svg?style=svg
:target: https://dl.circleci.com/status-badge/redirect/gh/sdpython/pandas-streaming/tree/main

.. image:: https://dev.azure.com/xavierdupre3/pandas_streaming/_apis/build/status/sdpython.pandas_streaming
:target: https://dev.azure.com/xavierdupre3/pandas_streaming/
Expand Down
2 changes: 1 addition & 1 deletion _unittests/ut_df/test_connex_split.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import unittest
import pandas
from pyquickhelper.pycode import ExtTestCase
from pandas_streaming.ext_test_case import ExtTestCase
from pandas_streaming.df import (
dataframe_shuffle,
train_test_split_weights,
Expand Down
2 changes: 1 addition & 1 deletion _unittests/ut_df/test_connex_split_big.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import unittest
from collections import Counter
import pandas
from pyquickhelper.pycode import ExtTestCase
from pandas_streaming.ext_test_case import ExtTestCase
from pandas_streaming.df import train_test_connex_split


Expand Down
2 changes: 1 addition & 1 deletion _unittests/ut_df/test_connex_split_cat.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import unittest
from collections import Counter
import pandas
from pyquickhelper.pycode import ExtTestCase
from pandas_streaming.ext_test_case import ExtTestCase
from pandas_streaming.df import train_test_apart_stratify


Expand Down
2 changes: 1 addition & 1 deletion _unittests/ut_df/test_dataframe_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import unittest
import numpy
import pandas
from pyquickhelper.pycode import ExtTestCase
from pandas_streaming.ext_test_case import ExtTestCase
from pandas_streaming.df import dataframe_hash_columns


Expand Down
2 changes: 1 addition & 1 deletion _unittests/ut_df/test_dataframe_helpers_simple.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import unittest
import pandas
import numpy
from pyquickhelper.pycode import ExtTestCase
from pandas_streaming.ext_test_case import ExtTestCase
from pandas_streaming.df import dataframe_unfold
from pandas_streaming.df.dataframe_helpers import hash_int, hash_str, hash_float

Expand Down
63 changes: 32 additions & 31 deletions _unittests/ut_df/test_dataframe_io.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import os
import tempfile
import unittest
import io
import zipfile
import numpy
import pandas
from pyquickhelper.pycode import ExtTestCase, get_temp_folder
from pandas_streaming.ext_test_case import ExtTestCase
from pandas_streaming.df import to_zip, read_zip


Expand All @@ -20,43 +21,43 @@ def test_zip_dataframe(self):
]
)

temp = get_temp_folder(__file__, "temp_zip")
name = os.path.join(temp, "df.zip")
to_zip(df, name, encoding="utf-8", index=False)
df2 = read_zip(name, encoding="utf-8")
self.assertEqualDataFrame(df, df2)
with tempfile.TemporaryDirectory() as temp:
name = os.path.join(temp, "df.zip")
to_zip(df, name, encoding="utf-8", index=False)
df2 = read_zip(name, encoding="utf-8")
self.assertEqualDataFrame(df, df2)

st = io.BytesIO()
zp = zipfile.ZipFile(st, "w")
to_zip(df, zp, encoding="utf-8", index=False)
zp.close()
st = io.BytesIO()
zp = zipfile.ZipFile(st, "w")
to_zip(df, zp, encoding="utf-8", index=False)
zp.close()

st = io.BytesIO(st.getvalue())
zp = zipfile.ZipFile(st, "r")
df3 = read_zip(zp, encoding="utf-8")
zp.close()
self.assertEqualDataFrame(df, df3)
st = io.BytesIO(st.getvalue())
zp = zipfile.ZipFile(st, "r")
df3 = read_zip(zp, encoding="utf-8")
zp.close()
self.assertEqualDataFrame(df, df3)

def test_zip_numpy(self):
df = numpy.zeros((3, 4))
df[2, 3] = 1

temp = get_temp_folder(__file__, "temp_zip")
name = os.path.join(temp, "df.zip")
to_zip(df, name, "arr.npy")
df2 = read_zip(name, "arr.npy")
self.assertEqualArray(df, df2)

st = io.BytesIO()
zp = zipfile.ZipFile(st, "w")
to_zip(df, zp, "arr.npy")
zp.close()

st = io.BytesIO(st.getvalue())
zp = zipfile.ZipFile(st, "r")
df3 = read_zip(zp, "arr.npy")
zp.close()
self.assertEqualArray(df, df3)
with tempfile.TemporaryDirectory() as temp:
name = os.path.join(temp, "df.zip")
to_zip(df, name, "arr.npy")
df2 = read_zip(name, "arr.npy")
self.assertEqualArray(df, df2)

st = io.BytesIO()
zp = zipfile.ZipFile(st, "w")
to_zip(df, zp, "arr.npy")
zp.close()

st = io.BytesIO(st.getvalue())
zp = zipfile.ZipFile(st, "r")
df3 = read_zip(zp, "arr.npy")
zp.close()
self.assertEqualArray(df, df3)


if __name__ == "__main__":
Expand Down
2 changes: 1 addition & 1 deletion _unittests/ut_df/test_dataframe_io_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from io import StringIO, BytesIO
from json import loads
import pandas
from pyquickhelper.pycode import ExtTestCase
from pandas_streaming.ext_test_case import ExtTestCase
from pandas_streaming.df.dataframe_io_helpers import (
enumerate_json_items,
JsonPerRowsStream,
Expand Down
171 changes: 86 additions & 85 deletions _unittests/ut_df/test_dataframe_sort.py
Original file line number Diff line number Diff line change
@@ -1,104 +1,105 @@
import os
import tempfile
import unittest
import pandas
from pyquickhelper.pycode import ExtTestCase, get_temp_folder
from pandas_streaming.ext_test_case import ExtTestCase
from pandas_streaming.df import StreamingDataFrame


class TestDataFrameSort(ExtTestCase):
def test_sort_values(self):
temp = get_temp_folder(__file__, "temp_sort_values")
name = os.path.join(temp, "_data_")
df = pandas.DataFrame(
[
dict(a=1, b="eé", c=5.6, ind="a1", ai=1),
dict(a=5, b="f", c=5.7, ind="a2", ai=2),
dict(a=4, b="g", ind="a3", ai=3),
dict(a=8, b="h", c=5.9, ai=4),
dict(a=16, b="i", c=6.2, ind="a5", ai=5),
]
)
sdf = StreamingDataFrame.read_df(df, chunksize=2)
sorted_df = df.sort_values(by="a")
res = sdf.sort_values(by="a", temp_file=name)
res_df = res.to_df()
self.assertEqualDataFrame(sorted_df, res_df)
with tempfile.TemporaryDirectory() as temp:
name = os.path.join(temp, "_data_")
df = pandas.DataFrame(
[
dict(a=1, b="eé", c=5.6, ind="a1", ai=1),
dict(a=5, b="f", c=5.7, ind="a2", ai=2),
dict(a=4, b="g", ind="a3", ai=3),
dict(a=8, b="h", c=5.9, ai=4),
dict(a=16, b="i", c=6.2, ind="a5", ai=5),
]
)
sdf = StreamingDataFrame.read_df(df, chunksize=2)
sorted_df = df.sort_values(by="a")
res = sdf.sort_values(by="a", temp_file=name)
res_df = res.to_df()
self.assertEqualDataFrame(sorted_df, res_df)

def test_sort_values_twice(self):
temp = get_temp_folder(__file__, "temp_sort_values_twice")
name = os.path.join(temp, "_data_")
df = pandas.DataFrame(
[
dict(a=1, b="eé", c=5.6, ind="a1", ai=1),
dict(a=5, b="f", c=5.7, ind="a2", ai=2),
dict(a=4, b="g", ind="a3", ai=3),
dict(a=8, b="h", c=5.9, ai=4),
dict(a=16, b="i", c=6.2, ind="a5", ai=5),
]
)
sdf = StreamingDataFrame.read_df(df, chunksize=2)
sorted_df = df.sort_values(by="a")
res = sdf.sort_values(by="a", temp_file=name)
res_df = res.to_df()
self.assertEqualDataFrame(sorted_df, res_df)
res_df = res.to_df()
self.assertEqualDataFrame(sorted_df, res_df)
with tempfile.TemporaryDirectory() as temp:
name = os.path.join(temp, "_data_")
df = pandas.DataFrame(
[
dict(a=1, b="eé", c=5.6, ind="a1", ai=1),
dict(a=5, b="f", c=5.7, ind="a2", ai=2),
dict(a=4, b="g", ind="a3", ai=3),
dict(a=8, b="h", c=5.9, ai=4),
dict(a=16, b="i", c=6.2, ind="a5", ai=5),
]
)
sdf = StreamingDataFrame.read_df(df, chunksize=2)
sorted_df = df.sort_values(by="a")
res = sdf.sort_values(by="a", temp_file=name)
res_df = res.to_df()
self.assertEqualDataFrame(sorted_df, res_df)
res_df = res.to_df()
self.assertEqualDataFrame(sorted_df, res_df)

def test_sort_values_reverse(self):
temp = get_temp_folder(__file__, "temp_sort_values_reverse")
name = os.path.join(temp, "_data_")
df = pandas.DataFrame(
[
dict(a=1, b="eé", c=5.6, ind="a1", ai=1),
dict(a=5, b="f", c=5.7, ind="a2", ai=2),
dict(a=4, b="g", ind="a3", ai=3),
dict(a=8, b="h", c=5.9, ai=4),
dict(a=16, b="i", c=6.2, ind="a5", ai=5),
]
)
sdf = StreamingDataFrame.read_df(df, chunksize=2)
sorted_df = df.sort_values(by="a", ascending=False)
res = sdf.sort_values(by="a", temp_file=name, ascending=False)
res_df = res.to_df()
self.assertEqualDataFrame(sorted_df, res_df)
with tempfile.TemporaryDirectory() as temp:
name = os.path.join(temp, "_data_")
df = pandas.DataFrame(
[
dict(a=1, b="eé", c=5.6, ind="a1", ai=1),
dict(a=5, b="f", c=5.7, ind="a2", ai=2),
dict(a=4, b="g", ind="a3", ai=3),
dict(a=8, b="h", c=5.9, ai=4),
dict(a=16, b="i", c=6.2, ind="a5", ai=5),
]
)
sdf = StreamingDataFrame.read_df(df, chunksize=2)
sorted_df = df.sort_values(by="a", ascending=False)
res = sdf.sort_values(by="a", temp_file=name, ascending=False)
res_df = res.to_df()
self.assertEqualDataFrame(sorted_df, res_df)

def test_sort_values_nan_last(self):
temp = get_temp_folder(__file__, "temp_sort_values_nan_last")
name = os.path.join(temp, "_data_")
df = pandas.DataFrame(
[
dict(a=1, b="eé", c=5.6, ind="a1", ai=1),
dict(b="f", c=5.7, ind="a2", ai=2),
dict(b="f", c=5.8, ind="a2", ai=2),
dict(a=4, b="g", ind="a3", ai=3),
dict(a=8, b="h", c=5.9, ai=4),
dict(a=16, b="i", c=6.2, ind="a5", ai=5),
]
)
sdf = StreamingDataFrame.read_df(df, chunksize=2)
sorted_df = df.sort_values(by="a", na_position="last")
res = sdf.sort_values(by="a", temp_file=name, na_position="last")
res_df = res.to_df()
self.assertEqualDataFrame(sorted_df, res_df)
with tempfile.TemporaryDirectory() as temp:
name = os.path.join(temp, "_data_")
df = pandas.DataFrame(
[
dict(a=1, b="eé", c=5.6, ind="a1", ai=1),
dict(b="f", c=5.7, ind="a2", ai=2),
dict(b="f", c=5.8, ind="a2", ai=2),
dict(a=4, b="g", ind="a3", ai=3),
dict(a=8, b="h", c=5.9, ai=4),
dict(a=16, b="i", c=6.2, ind="a5", ai=5),
]
)
sdf = StreamingDataFrame.read_df(df, chunksize=2)
sorted_df = df.sort_values(by="a", na_position="last")
res = sdf.sort_values(by="a", temp_file=name, na_position="last")
res_df = res.to_df()
self.assertEqualDataFrame(sorted_df, res_df)

def test_sort_values_nan_first(self):
temp = get_temp_folder(__file__, "temp_sort_values_nan_first")
name = os.path.join(temp, "_data_")
df = pandas.DataFrame(
[
dict(a=1, b="eé", c=5.6, ind="a1", ai=1),
dict(b="f", c=5.7, ind="a2", ai=2),
dict(b="f", c=5.8, ind="a2", ai=2),
dict(a=4, b="g", ind="a3", ai=3),
dict(a=8, b="h", c=5.9, ai=4),
dict(a=16, b="i", c=6.2, ind="a5", ai=5),
]
)
sdf = StreamingDataFrame.read_df(df, chunksize=2)
sorted_df = df.sort_values(by="a", na_position="first")
res = sdf.sort_values(by="a", temp_file=name, na_position="first")
res_df = res.to_df()
self.assertEqualDataFrame(sorted_df, res_df)
with tempfile.TemporaryDirectory() as temp:
name = os.path.join(temp, "_data_")
df = pandas.DataFrame(
[
dict(a=1, b="eé", c=5.6, ind="a1", ai=1),
dict(b="f", c=5.7, ind="a2", ai=2),
dict(b="f", c=5.8, ind="a2", ai=2),
dict(a=4, b="g", ind="a3", ai=3),
dict(a=8, b="h", c=5.9, ai=4),
dict(a=16, b="i", c=6.2, ind="a5", ai=5),
]
)
sdf = StreamingDataFrame.read_df(df, chunksize=2)
sorted_df = df.sort_values(by="a", na_position="first")
res = sdf.sort_values(by="a", temp_file=name, na_position="first")
res_df = res.to_df()
self.assertEqualDataFrame(sorted_df, res_df)


if __name__ == "__main__":
Expand Down
2 changes: 1 addition & 1 deletion _unittests/ut_df/test_pandas_groupbynan.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import pandas
import numpy
from scipy.sparse.linalg import lsqr as sparse_lsqr
from pyquickhelper.pycode import ExtTestCase, ignore_warnings
from pandas_streaming.ext_test_case import ExtTestCase, ignore_warnings
from pandas_streaming.df import pandas_groupby_nan, numpy_types


Expand Down
Loading