Skip to content

Commit 5a54ce6

Browse files
committed
Bug#25418534: JSON_EXTRACT USING WILDCARDS TAKES FOREVER
Patch #2: find_child_doms() checks for duplicates each time it adds a result to the result vector. As explained in the header comment for Json_wrapper::seek(), the duplicate elimination is needed for paths which contain multiple ellipses, so in most cases it is unnecessary work. This patch makes find_child_doms() only check for duplicates in the case where the path contains multiple ellipses, and only when inspecting an ellipsis path leg which is not the first one. Additionally: Remove checks for empty vector after a successful call to push_back(). That is, replace checks for is_seek_done(result, only_need_one) with a simple check for only_need_one when we know the result vector cannot be empty. Call is_seek_done() from the loop in Json_dom::seek() instead of at the top of find_child_doms(), so that we can break out of the loop earlier if we find a match and only need one. Microbenchmarks (64-bit, Intel Core i7-4770 3.4 GHz, GCC 6.3): BM_JsonDomSearchEllipsis 25693 ns/iter [+210.9%] BM_JsonDomSearchEllipsis_OnlyOne 17881 ns/iter [+324.3%] BM_JsonDomSearchKey 128 ns/iter [ +0.8%] BM_JsonBinarySearchEllipsis 231319 ns/iter [ +38.7%] BM_JsonBinarySearchEllipsis_OnlyOne 222726 ns/iter [ +41.6%] BM_JsonBinarySearchKey 86 ns/iter [ 0.0%] Change-Id: I0ee624830680247ec5aed302c0408db00240d441
1 parent b08d97b commit 5a54ce6

File tree

1 file changed

+56
-20
lines changed

1 file changed

+56
-20
lines changed

sql/json_dom.cc

Lines changed: 56 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -210,16 +210,33 @@ static bool seen_already(Json_dom_vector *result, Json_dom *cand)
210210
/**
211211
Add a value to a vector if it isn't already there.
212212
213+
This is used for removing duplicate matches for daisy-chained
214+
ellipsis tokens in #find_child_doms(). The problem with
215+
daisy-chained ellipses is that the candidate set may contain the
216+
same Json_dom object multiple times at different nesting levels
217+
after matching the first ellipsis. That is, the candidate set may
218+
contain a Json_dom and its parent, grandparent and so on. When
219+
matching the next ellipsis in the path, each value in the candidate
220+
set and all its children will be inspected, so the nested Json_dom
221+
will be seen multiple times, as its grandparent, parent and finally
222+
itself are inspected. We want it to appear only once in the result.
223+
213224
@param[in] candidate value to add
214-
@param[in,out] duplicates set of values added
225+
@param[in,out] duplicates set of values added, or `nullptr` if the ellipsis
226+
token is not daisy-chained
215227
@param[in,out] result vector
216228
@return false on success, true on error
217229
*/
218230
static bool add_if_missing(Json_dom *candidate,
219231
Json_dom_vector *duplicates,
220232
Json_dom_vector *result)
221233
{
222-
if (duplicates->insert_unique(candidate).second)
234+
/*
235+
If we are not checking duplicates, or if the candidate is not
236+
already in the duplicate set, add the candidate to the result
237+
vector.
238+
*/
239+
if (duplicates == nullptr || duplicates->insert_unique(candidate).second)
223240
{
224241
return result->push_back(candidate);
225242
}
@@ -252,8 +269,10 @@ static inline bool is_seek_done(const Result_vector *hits, bool only_need_one)
252269
@param[in] path_leg identifies the child
253270
@param[in] auto_wrap if true, match final scalar with [0] is need be
254271
@param[in] only_need_one true if we can stop after finding one match
255-
@param[in,out] duplicates helps to identify duplicate arrays and objects
256-
introduced by daisy-chained ** tokens
272+
@param[in,out] duplicates set of values collected, which helps to identify
273+
duplicate arrays and objects introduced by daisy-chained
274+
** tokens, or `nullptr` if the path leg is not a
275+
daisy-chained ** token
257276
@param[in,out] result the vector of qualifying children
258277
@return false on success, true on error
259278
*/
@@ -267,15 +286,11 @@ static bool find_child_doms(Json_dom *dom,
267286
enum_json_type dom_type= dom->json_type();
268287
enum_json_path_leg_type leg_type= path_leg->get_type();
269288

270-
if (is_seek_done(result, only_need_one))
271-
return false;
272-
273289
// Handle auto-wrapping of non-arrays.
274290
if (auto_wrap && dom_type != enum_json_type::J_ARRAY &&
275291
path_leg->is_autowrap())
276292
{
277-
return !seen_already(result, dom) &&
278-
add_if_missing(dom, duplicates, result);
293+
return !seen_already(result, dom) && result->push_back(dom);
279294
}
280295

281296
switch (leg_type)
@@ -285,8 +300,7 @@ static bool find_child_doms(Json_dom *dom,
285300
{
286301
const auto array= down_cast<const Json_array *>(dom);
287302
const Json_array_index idx= path_leg->first_array_index(array->size());
288-
return idx.within_bounds() &&
289-
add_if_missing((*array)[idx.position()], duplicates, result);
303+
return idx.within_bounds() && result->push_back((*array)[idx.position()]);
290304
}
291305
return false;
292306
case jpl_array_range:
@@ -297,9 +311,9 @@ static bool find_child_doms(Json_dom *dom,
297311
const auto range= path_leg->get_array_range(array->size());
298312
for (size_t i= range.m_begin; i < range.m_end; ++i)
299313
{
300-
if (add_if_missing((*array)[i], duplicates, result))
314+
if (result->push_back((*array)[i]))
301315
return true; /* purecov: inspected */
302-
if (is_seek_done(result, only_need_one))
316+
if (only_need_one)
303317
return false;
304318
}
305319
}
@@ -350,9 +364,7 @@ static bool find_child_doms(Json_dom *dom,
350364
{
351365
const auto object= down_cast<const Json_object *>(dom);
352366
Json_dom *child= object->get(path_leg->get_member_name());
353-
354-
if (child != NULL && add_if_missing(child, duplicates, result))
355-
return true; /* purecov: inspected */
367+
return child != nullptr && result->push_back(child);
356368
}
357369

358370
return false;
@@ -363,9 +375,9 @@ static bool find_child_doms(Json_dom *dom,
363375
{
364376
for (const auto &member : *down_cast<const Json_object *>(dom))
365377
{
366-
if (add_if_missing(member.second, duplicates, result))
378+
if (result->push_back(member.second))
367379
return true; /* purecov: inspected */
368-
if (is_seek_done(result, only_need_one))
380+
if (only_need_one)
369381
return false;
370382
}
371383
}
@@ -2271,13 +2283,34 @@ bool Json_dom::seek(const Json_seekable_path &path,
22712283
if (hits->push_back(this))
22722284
return true; /* purecov: inspected */
22732285

2286+
bool seen_ellipsis= false;
2287+
22742288
size_t path_leg_count= path.leg_count();
22752289
for (size_t path_idx= 0; path_idx < path_leg_count; path_idx++)
22762290
{
22772291
const Json_path_leg *path_leg= path.get_leg_at(path_idx);
2278-
duplicates.clear();
22792292
candidates.clear();
22802293

2294+
/*
2295+
When we have multiple ellipses in the path, we need to eliminate
2296+
duplicates from the result. It's not needed for the first ellipsis.
2297+
See explanation in add_if_missing() and Json_wrapper::seek().
2298+
*/
2299+
Json_dom_vector *dup_vector= nullptr;
2300+
if (path_leg->get_type() == jpl_ellipsis)
2301+
{
2302+
if (seen_ellipsis)
2303+
{
2304+
/*
2305+
This ellipsis is not the first one, so we need to eliminate
2306+
duplicates in find_child_doms().
2307+
*/
2308+
dup_vector= &duplicates;
2309+
dup_vector->clear();
2310+
}
2311+
seen_ellipsis= true;
2312+
}
2313+
22812314
/*
22822315
On the last path leg, we can stop after the first match if only
22832316
one match is requested by the caller.
@@ -2288,8 +2321,11 @@ bool Json_dom::seek(const Json_seekable_path &path,
22882321
for (Json_dom *hit : *hits)
22892322
{
22902323
if (find_child_doms(hit, path_leg, auto_wrap, stop_after_first_match,
2291-
&duplicates, &candidates))
2324+
dup_vector, &candidates))
22922325
return true; /* purecov: inspected */
2326+
2327+
if (is_seek_done(&candidates, stop_after_first_match))
2328+
break;
22932329
}
22942330

22952331
// swap the two lists so that they can be re-used

0 commit comments

Comments
 (0)