Skip to content

Commit 4e5c590

Browse files
author
Xing Zhang
committed
Bug #25426632: STRNXFRM RUNS SLOWER WHEN TAILORING RULE CONTAINS CONTRACTION
Split the contraction list by string length. Benchmark BM_Hungarian_AS_CS: 4163 -> 4016 ns/iter [+ 3.7%]. Change-Id: Id7a46275367009f61094f43e651a8055922832f7
1 parent c4bc530 commit 4e5c590

File tree

3 files changed

+89
-54
lines changed

3 files changed

+89
-54
lines changed

include/m_ctype.h

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include <stdarg.h>
2525
#include <stddef.h>
2626
#include <sys/types.h>
27+
#include <stdbool.h>
2728

2829
#include "my_byteorder.h"
2930
#include "my_compiler.h"
@@ -117,8 +118,17 @@ typedef struct my_contraction_t
117118

118119
typedef struct my_contraction_list_t
119120
{
120-
size_t nitems; /* Number of items in the list */
121-
MY_CONTRACTION *item; /* List of contractions */
121+
bool has_contractions;
122+
/*
123+
Contractions are split by their length. The first two elements in nitems
124+
and item are meaningless because a contraction must consist of at least two
125+
code points.
126+
*/
127+
128+
/* Number of contractions of same length. */
129+
size_t nitems[MY_UCA_MAX_CONTRACTION + 1];
130+
/* Lists of contractions of same length. */
131+
MY_CONTRACTION* item[MY_UCA_MAX_CONTRACTION + 1];
122132

123133
/*
124134
Character flags for each character, e.g. "is contraction head"

strings/ctype-uca.cc

Lines changed: 72 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -60,10 +60,11 @@ MY_UCA_INFO my_uca_v400=
6060
0xFFFF, /* maxchar */
6161
uca_length,
6262
uca_weight,
63-
{ /* Contractions: */
64-
0, /* nitems */
65-
NULL, /* item */
66-
NULL /* flags */
63+
{ /* Contractions: */
64+
false, /* has_contractions */
65+
{0}, /* nitems */
66+
{nullptr}, /* item */
67+
nullptr /* flags */
6768
},
6869

6970
/* Logical positions */
@@ -96,10 +97,11 @@ MY_UCA_INFO my_uca_v520=
9697
0x10FFFF, /* maxchar */
9798
uca520_length,
9899
uca520_weight,
99-
{ /* Contractions: */
100-
0, /* nitems */
101-
NULL, /* item */
102-
NULL /* flags */
100+
{ /* Contractions: */
101+
false, /* has_contractions */
102+
{0}, /* nitems */
103+
{nullptr}, /* item */
104+
nullptr /* flags */
103105
},
104106

105107
0x0009, /* first_non_ignorable p != ignore */
@@ -849,15 +851,15 @@ static MY_CONTRACTION *
849851
my_uca_add_contraction(MY_CONTRACTIONS *list, my_wc_t *wc, size_t len,
850852
my_bool with_context)
851853
{
852-
MY_CONTRACTION *next= &list->item[list->nitems];
853-
size_t i;
854854
/*
855855
Contraction is always at least two code points.
856856
Contraction is never longer than MY_UCA_MAX_CONTRACTION,
857857
which is guaranteed by using my_coll_rule_expand() with proper limit.
858858
*/
859859
DBUG_ASSERT(len > 1 && len <= MY_UCA_MAX_CONTRACTION);
860-
for (i= 0; i < len; i++)
860+
861+
MY_CONTRACTION *next= &list->item[len][list->nitems[len]];
862+
for (size_t i= 0; i < len; i++)
861863
{
862864
/*
863865
We don't support contractions with U+0000.
@@ -866,10 +868,10 @@ my_uca_add_contraction(MY_CONTRACTIONS *list, my_wc_t *wc, size_t len,
866868
DBUG_ASSERT(wc[i] != 0);
867869
next->ch[i]= wc[i];
868870
}
869-
if (i < MY_UCA_MAX_CONTRACTION)
870-
next->ch[i]= 0; /* Add end-of-line marker */
871+
if (len < MY_UCA_MAX_CONTRACTION)
872+
next->ch[len]= 0; /* Add end-of-line marker */
871873
next->with_context= with_context;
872-
list->nitems++;
874+
list->nitems[len]++;
873875
return next;
874876
}
875877

@@ -879,24 +881,35 @@ my_uca_add_contraction(MY_CONTRACTIONS *list, my_wc_t *wc, size_t len,
879881
880882
@param contractions Pointer to UCA data
881883
@param loader Pointer to charset loader
882-
@param n Number of contractions
884+
@param ncontractions Array of contraction counts, indexed by contraction length
883885
884886
@return Error code
885887
@retval 0 - memory allocated successfully
886888
@retval 1 - not enough memory
887889
*/
888890

889-
static my_bool
891+
static bool
890892
my_uca_alloc_contractions(MY_CONTRACTIONS *contractions,
891-
MY_CHARSET_LOADER *loader, size_t n)
893+
MY_CHARSET_LOADER *loader, size_t *ncontractions)
892894
{
893-
size_t size= n * sizeof(MY_CONTRACTION);
894-
if (!(contractions->item= static_cast<MY_CONTRACTION*>((loader->once_alloc)(size))) ||
895-
!(contractions->flags= (char *) (loader->once_alloc)(MY_UCA_CNT_FLAG_SIZE)))
896-
return 1;
897-
memset(contractions->item, 0, size);
895+
for (size_t contraction_len= 2; contraction_len <= MY_UCA_MAX_CONTRACTION;
896+
contraction_len++)
897+
{
898+
if (ncontractions[contraction_len])
899+
{
900+
size_t size= ncontractions[contraction_len] * sizeof(MY_CONTRACTION);
901+
contractions->item[contraction_len]=
902+
static_cast<MY_CONTRACTION*>((loader->once_alloc)(size));
903+
if (!contractions->item[contraction_len])
904+
return true;
905+
memset(contractions->item[contraction_len], 0, size);
906+
}
907+
}
908+
if (!(contractions->flags=
909+
(char *)(loader->once_alloc)(MY_UCA_CNT_FLAG_SIZE)))
910+
return true;
898911
memset(contractions->flags, 0, MY_UCA_CNT_FLAG_SIZE);
899-
return 0;
912+
return false;
900913
}
901914

902915

@@ -911,7 +924,7 @@ my_uca_alloc_contractions(MY_CONTRACTIONS *contractions,
911924
const MY_CONTRACTIONS *
912925
my_charset_get_contractions(const CHARSET_INFO *cs)
913926
{
914-
return (cs->uca != NULL) && (cs->uca->contractions.nitems > 0) ?
927+
return (cs->uca != NULL) && (cs->uca->contractions.has_contractions) ?
915928
&cs->uca->contractions : NULL;
916929
}
917930

@@ -929,7 +942,7 @@ my_charset_get_contractions(const CHARSET_INFO *cs)
929942
static inline my_bool
930943
my_uca_have_contractions(const MY_UCA_INFO *uca)
931944
{
932-
return (uca->contractions.nitems > 0);
945+
return uca->contractions.has_contractions;
933946
}
934947

935948

@@ -1002,9 +1015,9 @@ uint16 *
10021015
my_uca_contraction2_weight(const MY_CONTRACTIONS *list, my_wc_t wc1, my_wc_t wc2)
10031016
{
10041017
MY_CONTRACTION *c, *last;
1005-
for (c= list->item, last= c + list->nitems; c < last; c++)
1018+
for (c= list->item[2], last= c + list->nitems[2]; c < last; c++)
10061019
{
1007-
if (c->ch[0] == wc1 && c->ch[1] == wc2 && c->ch[2] == 0)
1020+
if (c->ch[0] == wc1 && c->ch[1] == wc2)
10081021
{
10091022
return c->weight;
10101023
}
@@ -1085,7 +1098,7 @@ static inline const uint16 *
10851098
my_uca_contraction_weight(const MY_CONTRACTIONS *list, const my_wc_t *wc, size_t len)
10861099
{
10871100
MY_CONTRACTION *c, *last;
1088-
for (c= list->item, last= c + list->nitems; c < last; c++)
1101+
for (c= list->item[len], last= c + list->nitems[len]; c < last; c++)
10891102
{
10901103
if ((len == MY_UCA_MAX_CONTRACTION || c->ch[len] == 0) &&
10911104
!c->with_context &&
@@ -1148,10 +1161,7 @@ my_uca_scanner::contraction_find(my_wc_t wc0, size_t *chars_skipped)
11481161
{
11491162
size_t clen= 1;
11501163
int flag;
1151-
uchar *s, *beg= nullptr;
1152-
const MY_CONTRACTION *contraction_begin= cs->uca->contractions.item;
1153-
const MY_CONTRACTION *contraction_end=
1154-
contraction_begin + cs->uca->contractions.nitems;
1164+
uchar *s, *beg;
11551165
MY_CONTRACTION tofind;
11561166
memset(&tofind, 0, sizeof(tofind));
11571167
tofind.ch[0]= wc0;
@@ -1192,21 +1202,24 @@ my_uca_scanner::contraction_find(my_wc_t wc0, size_t *chars_skipped)
11921202
to looking for new character sequence which adds one more character,
11931203
which is obviously greater than the current one.
11941204
*/
1205+
const MY_CONTRACTION *contraction_begin=
1206+
cs->uca->contractions.item[clen + 1];
1207+
const MY_CONTRACTION *contraction_end=
1208+
contraction_begin + cs->uca->contractions.nitems[clen + 1];
11951209
auto candidate= std::lower_bound(contraction_begin,
11961210
contraction_end,
11971211
tofind,
11981212
contraction_chars_cmp);
1199-
if (candidate == contraction_end)
1200-
break;
1201-
if (!contraction_chars_cmp(tofind, *candidate))
1213+
if (candidate != contraction_end &&
1214+
!contraction_chars_cmp(tofind, *candidate))
12021215
{
12031216
/*
12041217
std::lower_bound() ensures *candidate is greater than or equal to
12051218
tofind. And contraction_chars_cmp() returns false which means
12061219
tofind is greater than or equal to *candidate. So tofind has to
12071220
be equal to *candidate.
12081221
*/
1209-
contraction_begin= longest_contraction= candidate;
1222+
longest_contraction= candidate;
12101223
beg= s;
12111224
*chars_skipped= clen;
12121225
}
@@ -1263,11 +1276,12 @@ my_uca_scanner::previous_context_find(my_wc_t wc0, my_wc_t wc1)
12631276
memset(&tofind, 0, sizeof(tofind));
12641277
tofind.ch[0]= wc0;
12651278
tofind.ch[1]= wc1;
1266-
MY_CONTRACTION *contraction_end= contractions->item + contractions->nitems;
1267-
MY_CONTRACTION *c= std::lower_bound(contractions->item,
1279+
MY_CONTRACTION *contraction_end=
1280+
contractions->item[2] + contractions->nitems[2];
1281+
MY_CONTRACTION *c= std::lower_bound(contractions->item[2],
12681282
contraction_end,
12691283
tofind, contraction_chars_cmp);
1270-
if (c == contraction_end || c->ch[0] != wc0 || c->ch[1] != wc1 || c->ch[2])
1284+
if (c == contraction_end || c->ch[0] != wc0 || c->ch[1] != wc1)
12711285
return NULL;
12721286
if (c->with_context)
12731287
{
@@ -4358,11 +4372,12 @@ apply_one_rule(CHARSET_INFO *cs, MY_CHARSET_LOADER *loader,
43584372
r->with_context)->weight;
43594373
to_stride= 1;
43604374
to_num_ce= &to[MY_UCA_MAX_WEIGHT_SIZE - 1];
4375+
/* Temporarily hide - it's incomplete */
4376+
dst->contractions.nitems[nshift]--;
43614377
/* Store weights of the "reset to" character */
4362-
dst->contractions.nitems--; /* Temporarily hide - it's incomplete */
43634378
nweights= my_char_weight_put(dst, to, to_stride, MY_UCA_MAX_WEIGHT_SIZE - 1,
43644379
to_num_ce, r, nreset, rules->uca->version);
4365-
dst->contractions.nitems++; /* Activate, now it's complete */
4380+
dst->contractions.nitems[nshift]++; /* Activate, now it's complete */
43664381
}
43674382
else
43684383
{
@@ -4450,8 +4465,8 @@ init_weight_level(CHARSET_INFO *cs, MY_CHARSET_LOADER *loader,
44504465
bool lengths_are_temporary)
44514466
{
44524467
MY_COLL_RULE *r, *rlast;
4453-
int ncontractions= 0;
44544468
size_t i, npages= (src->maxchar + 1) / 256;
4469+
size_t ncontractions[MY_UCA_MAX_CONTRACTION + 1]{0};
44554470

44564471
dst->maxchar= src->maxchar;
44574472

@@ -4516,7 +4531,10 @@ init_weight_level(CHARSET_INFO *cs, MY_CHARSET_LOADER *loader,
45164531
dst->weights[pagec]= NULL; /* Mark that we'll overwrite this page */
45174532
}
45184533
else
4519-
ncontractions++;
4534+
{
4535+
ncontractions[my_wstrnlen(r->curr, MY_UCA_MAX_CONTRACTION)]++;
4536+
dst->contractions.has_contractions= true;
4537+
}
45204538
}
45214539

45224540
/* Allocate pages that we'll overwrite and copy default weights */
@@ -4532,7 +4550,7 @@ init_weight_level(CHARSET_INFO *cs, MY_CHARSET_LOADER *loader,
45324550
return rc;
45334551
}
45344552

4535-
if (ncontractions)
4553+
if (dst->contractions.has_contractions)
45364554
{
45374555
if (my_uca_alloc_contractions(&dst->contractions, loader, ncontractions))
45384556
return TRUE;
@@ -4552,10 +4570,16 @@ init_weight_level(CHARSET_INFO *cs, MY_CHARSET_LOADER *loader,
45524570
return TRUE;
45534571
}
45544572
// Sort contractions by the code points.
4555-
if (ncontractions)
4556-
std::sort(dst->contractions.item,
4557-
dst->contractions.item + dst->contractions.nitems,
4558-
contraction_chars_cmp);
4573+
if (dst->contractions.has_contractions)
4574+
{
4575+
for (size_t i= 2; i <= MY_UCA_MAX_CONTRACTION; i++)
4576+
{
4577+
if (dst->contractions.nitems[i])
4578+
std::sort(dst->contractions.item[i],
4579+
dst->contractions.item[i] + dst->contractions.nitems[i],
4580+
contraction_chars_cmp);
4581+
}
4582+
}
45594583
return FALSE;
45604584
}
45614585

strings/uca900_data.h

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
/* Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
1+
/* Copyright (c) 2016, 2017 Oracle and/or its affiliates. All rights reserved.
22

33
This program is free software; you can redistribute it and/or modify
44
it under the terms of the GNU General Public License as published by
@@ -270726,9 +270726,10 @@ MY_UCA_INFO my_uca_v900=
270726270726
nullptr, /* length - not used */
270727270727
uca900_weight,
270728270728
{ /* Contractions: */
270729-
0, /* nitems */
270730-
NULL, /* item */
270731-
NULL /* flags */
270729+
false, /* has_contractions */
270730+
{0}, /* nitems */
270731+
{nullptr}, /* item */
270732+
nullptr /* flags */
270732270733
},
270733270734

270734270735
0x0009, /* first_non_ignorable p != ignore */

0 commit comments

Comments
 (0)