Skip to content

Commit c74c9a6

Browse files
authored
INTPYTHON-493 Fix handling of nulls in arrays (#263)
1 parent 9a3b213 commit c74c9a6

File tree

5 files changed

+252
-19
lines changed

5 files changed

+252
-19
lines changed

bindings/python/pymongoarrow/lib.pyx

Lines changed: 37 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,8 @@ cdef const bson_t* bson_reader_read_safe(bson_reader_t* stream_reader) except? N
5959
cdef class BuilderManager:
6060
cdef:
6161
dict builder_map
62+
dict parent_types
63+
dict parent_names
6264
uint64_t count
6365
bint has_schema
6466
object tzinfo
@@ -69,6 +71,8 @@ cdef class BuilderManager:
6971
self.tzinfo = tzinfo
7072
self.count = 0
7173
self.builder_map = {}
74+
self.parent_names = {}
75+
self.parent_types = {}
7276
self.pool = default_memory_pool()
7377
# Unpack the schema map.
7478
for fname, (ftype, arrow_type) in schema_map.items():
@@ -146,6 +150,7 @@ cdef class BuilderManager:
146150
cdef bson_iter_t child_iter
147151
cdef uint64_t count = self.count
148152
cdef _ArrayBuilderBase builder = None
153+
cdef _ArrayBuilderBase parent_builder = None
149154

150155
while bson_iter_next(doc_iter):
151156
# Get the key and and value.
@@ -156,12 +161,15 @@ cdef class BuilderManager:
156161
if parent_type == BSON_TYPE_ARRAY:
157162
full_key = base_key
158163
full_key.append(b"[]")
164+
self.parent_types[full_key] = BSON_TYPE_ARRAY
159165

160166
elif parent_type == BSON_TYPE_DOCUMENT:
161167
full_key = base_key
162168
full_key.append(b".")
163169
full_key.append(key)
164170
(<DocumentBuilder>self.builder_map[base_key]).add_field(key)
171+
self.parent_types[full_key] = BSON_TYPE_DOCUMENT
172+
self.parent_names[full_key] = base_key
165173

166174
else:
167175
full_key = key
@@ -174,8 +182,13 @@ cdef class BuilderManager:
174182
continue
175183

176184
# Append nulls to catch up.
177-
# For lists, the nulls are stored in the parent.
185+
# For list children, the nulls are stored in the parent.
178186
if parent_type != BSON_TYPE_ARRAY:
187+
# For document children, catch up with the parent doc.
188+
# Root fields will use the base document count
189+
if parent_type == BSON_TYPE_DOCUMENT:
190+
parent_builder = <_ArrayBuilderBase>self.builder_map.get(base_key, None)
191+
count = parent_builder.length() - 1
179192
if count > builder.length():
180193
status = builder.append_nulls_raw(count - builder.length())
181194
if not status.ok():
@@ -222,27 +235,36 @@ cdef class BuilderManager:
222235
cdef dict return_map = {}
223236
cdef bytes key
224237
cdef str field
238+
cdef uint64_t count
225239
cdef CStatus status
226-
cdef _ArrayBuilderBase value
240+
cdef _ArrayBuilderBase builder
241+
cdef _ArrayBuilderBase parent_builder
227242

228243
# Move the builders to a new dict with string keys.
229-
for key, value in self.builder_map.items():
230-
return_map[key.decode('utf-8')] = value
244+
for key, builder in self.builder_map.items():
245+
return_map[key.decode('utf-8')] = builder
231246

232247
# Insert null fields.
233248
for field in list(return_map):
234249
if return_map[field] is None:
235250
return_map[field] = NullBuilder(memory_pool=self.pool)
236251

237252
# Pad fields as needed.
238-
for field, value in return_map.items():
239-
# If it isn't a list item, append nulls as needed.
240-
# For lists, the nulls are stored in the parent.
241-
if not field.endswith('[]'):
242-
if value.length() < self.count:
243-
status = value.append_nulls_raw(self.count - value.length())
244-
if not status.ok():
245-
raise ValueError("Failed to append nulls to", field)
253+
for field, builder in return_map.items():
254+
# For list children, the nulls are stored in the parent.
255+
key = field.encode('utf-8')
256+
parent_type = self.parent_types.get(key, None)
257+
if parent_type == BSON_TYPE_ARRAY:
258+
continue
259+
if parent_type == BSON_TYPE_DOCUMENT:
260+
parent_builder = self.builder_map[self.parent_names[key]]
261+
count = parent_builder.length()
262+
else:
263+
count = self.count
264+
if builder.length() < count:
265+
status = builder.append_nulls_raw(count - builder.length())
266+
if not status.ok():
267+
raise ValueError("Failed to append nulls to", field)
246268

247269
return return_map
248270

@@ -688,13 +710,15 @@ cdef class ListBuilder(_ArrayBuilderBase):
688710
self.type_marker = BSON_TYPE_ARRAY
689711

690712
cdef CStatus append_raw(self, bson_iter_t * doc_iter, bson_type_t value_t):
713+
if value_t == BSON_TYPE_NULL:
714+
return self.append_null_raw()
691715
return self.builder.get().Append(self.count)
692716

693717
cpdef void append_count(self):
694718
self.count += 1
695719

696720
cdef CStatus append_null_raw(self):
697-
return self.builder.get().Append(self.count)
721+
return self.builder.get().AppendNull()
698722

699723
cdef shared_ptr[CArrayBuilder] get_builder(self):
700724
return <shared_ptr[CArrayBuilder]>self.builder
Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
[{
2+
"object1": {
3+
"object11": {
4+
"object111": {}
5+
}
6+
}
7+
},
8+
{
9+
"object1": {
10+
"object11": {
11+
"object111": {
12+
"list1111": [
13+
{"field11111": 13.2},
14+
{"field11111": 41.6}
15+
]
16+
}
17+
}
18+
}
19+
},
20+
{
21+
"object1": {
22+
"object11": {
23+
"object111": {
24+
"list1111": [
25+
{"field11111": 3.9},
26+
{"field11111": 69.5}
27+
]
28+
}
29+
}
30+
}
31+
},
32+
{
33+
"object1": {
34+
"object11": {
35+
"object111": {
36+
"list1111": [
37+
{"field11111": 147.2}
38+
],
39+
"list1112": [
40+
{"field11121": "Barrier"},
41+
{"field11121": "Barrier"},
42+
{"field11121": "Barrier"},
43+
{"field11121": "Barrier"}
44+
]
45+
}
46+
}
47+
}
48+
},
49+
{
50+
"object1": {
51+
"object11": {
52+
"object111": {}
53+
}
54+
}
55+
},
56+
{
57+
"object1": {
58+
"object11": {
59+
"object111": {
60+
"list1111": [
61+
{"field11111": 90.4}
62+
]
63+
}
64+
}
65+
}
66+
},
67+
{
68+
"object1": {
69+
"object11": {
70+
"object111": {
71+
"list1111": [
72+
{"field11111": 1.7},
73+
{"field11111": 53.9}
74+
]
75+
}
76+
}
77+
}
78+
},
79+
{
80+
"object1": {
81+
"object11": {
82+
"object111": {
83+
"list1111": [
84+
{"field11111": 15.6}
85+
]
86+
}
87+
}
88+
}
89+
},
90+
{
91+
"object1": {
92+
"object11": {
93+
"object111": {}
94+
}
95+
}
96+
},
97+
{
98+
"object1": {
99+
"object11": {
100+
"object111": {
101+
"list1111": [
102+
{"field11111": 6.7},
103+
{"field11111": 12.3}
104+
]
105+
}
106+
}
107+
}
108+
},
109+
{
110+
"object1": {
111+
"object11": {
112+
"object111": {
113+
"list1111": [
114+
{"field11111": 57.1}
115+
]
116+
}
117+
}
118+
}
119+
},
120+
{
121+
"object1": {
122+
"object11": {
123+
"object111": {
124+
"list1111": [
125+
{"field11111": 60.5}
126+
]
127+
}
128+
}
129+
}
130+
},
131+
{
132+
"object1": {
133+
"object11": {
134+
"object111": {
135+
"list1111": [
136+
{"field11111": 1.2}
137+
]
138+
}
139+
}
140+
}
141+
},
142+
{
143+
"object1": {
144+
"object11": {
145+
"object111": {
146+
"list1111": [
147+
{"field11111": 14.9}
148+
]
149+
}
150+
}
151+
}
152+
},
153+
{
154+
"object1": {
155+
"object11": {
156+
"object111": {}
157+
}
158+
}
159+
},
160+
{
161+
"object1": {
162+
"object11": {
163+
"object111": {}
164+
}
165+
}
166+
},
167+
{
168+
"object1": {
169+
"object11": {
170+
"object111": {}
171+
}
172+
}
173+
}]
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
[{"col": null}, {"col": [{"field11111": 13.2}, {"field11111": 41.6}]}, {"col": [{"field11111": 3.9}, {"field11111": 69.5}]}, {"col": [{"field11111": 147.2}]}, {"col": null}, {"col": [{"field11111": 90.4}]}, {"col": [{"field11111": 1.7}, {"field11111": 53.9}]}, {"col": [{"field11111": 15.6}]}, {"col": null}, {"col": [{"field11111": 6.7}, {"field11111": 12.3}]}, {"col": [{"field11111": 57.1}]}, {"col": [{"field11111": 60.5}]}, {"col": [{"field11111": 1.2}]}, {"col": [{"field11111": 14.9}]}, {"col": null}, {"col": null}, {"col": null}]

bindings/python/test/test_arrow.py

Lines changed: 38 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,12 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
import json
1516
import tempfile
1617
import unittest
1718
import unittest.mock as mock
1819
from datetime import date, datetime
20+
from pathlib import Path
1921
from test import client_context
2022
from test.utils import AllowListEventListener, NullsTestMixin
2123

@@ -56,6 +58,8 @@
5658
ObjectIdType,
5759
)
5860

61+
HERE = Path(__file__).absolute().parent
62+
5963

6064
class ArrowApiTestMixin:
6165
@classmethod
@@ -493,11 +497,42 @@ def test_schema_missing_field(self):
493497
self.assertEqual(out["list_field"].to_pylist(), expected)
494498

495499
def test_schema_incorrect_data_type(self):
500+
# From https://github.com/mongodb-labs/mongo-arrow/issues/260.
496501
self.coll.delete_many({})
497502
self.coll.insert_one({"x": {"y": 1}})
498503
out = find_arrow_all(self.coll, {}, schema=Schema({"x": str}))
499504
assert out.to_pylist() == [{"x": None}]
500505

506+
def test_schema_arrays_of_documents(self):
507+
# From https://github.com/mongodb-labs/mongo-arrow/issues/258.
508+
coll = self.coll
509+
coll.delete_many({})
510+
coll.insert_one({"list1": [{"field1": 13.2, "field2": 13.2}, {"field1": 41.6}]})
511+
coll.insert_one({"list1": [{"field1": 1.6}]})
512+
schema = Schema(
513+
{"col": pa.list_(pa.struct({"field1": pa.float64(), "field2": pa.float64()}))}
514+
)
515+
df = aggregate_arrow_all(coll, [{"$project": {"col": "$list1"}}], schema=schema)
516+
assert df.to_pylist() == [
517+
{"col": [{"field1": 13.2, "field2": 13.2}, {"field1": 41.6, "field2": None}]},
518+
{"col": [{"field1": 1.6, "field2": None}]},
519+
]
520+
521+
def test_schema_arrays_of_documents_with_nulls(self):
522+
# From https://github.com/mongodb-labs/mongo-arrow/issues/257.
523+
coll = self.coll
524+
coll.delete_many({})
525+
with (HERE / "nested_data_in.json").open() as fid:
526+
coll.insert_many(json.load(fid))
527+
df = aggregate_arrow_all(
528+
coll,
529+
[{"$project": {"col": "$object1.object11.object111.list1111"}}],
530+
schema=Schema({"col": pa.list_(pa.struct({"field11111": pa.float64()}))}),
531+
)
532+
with (HERE / "nested_data_out.json").open() as fid:
533+
expected = json.load(fid)
534+
assert df.to_pylist() == expected
535+
501536
def test_auto_schema_nested(self):
502537
# Create table with random data of various types.
503538
_, data = self._create_nested_data()
@@ -558,7 +593,7 @@ def test_auto_schema_first_list_null(self):
558593
]
559594
expected = pa.Table.from_pylist(
560595
[
561-
{"a": []},
596+
{"a": None},
562597
{"a": ["str"]},
563598
{"a": []},
564599
]
@@ -588,7 +623,7 @@ def test_auto_schema_first_list_element_null(self):
588623
]
589624
expected = pa.Table.from_pylist(
590625
[
591-
{"a": []},
626+
{"a": None},
592627
{"a": ["str"]}, # Inferred schema should use the first non-null element.
593628
{"a": []},
594629
]
@@ -603,7 +638,7 @@ def test_auto_schema_first_embedded_list_null(self):
603638
]
604639
expected = pa.Table.from_pylist(
605640
[
606-
{"a": {"b": []}},
641+
{"a": {"b": None}},
607642
{"a": {"b": ["str"]}},
608643
{"a": {"b": []}},
609644
]

0 commit comments

Comments
 (0)