Skip to content

Commit 3633441

Browse files
committed
---
yaml --- r: 232495 b: refs/heads/try c: 7ebae85 h: refs/heads/master i: 232493: 1cc2786 232491: be061b0 232487: abfb8e2 232479: 6143f76 v: v3
1 parent ff38a48 commit 3633441

File tree

2 files changed

+159
-50
lines changed

2 files changed

+159
-50
lines changed

[refs]

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
---
22
refs/heads/master: edeb4f1c86cbf6af8ef9874d4b3af50f721ea1b8
33
refs/heads/snap-stage3: 1af31d4974e33027a68126fa5a5a3c2c6491824f
4-
refs/heads/try: c5a1d8c3db171a4351712c04e6ba6a4e4636a332
4+
refs/heads/try: 7ebae85bb8eac495bbc4a463319b23404fdc63a6
55
refs/tags/release-0.1: 1f5c5126e96c79d22cb7862f75304136e204f105
66
refs/tags/release-0.2: c870d2dffb391e14efb05aa27898f1f6333a9596
77
refs/tags/release-0.3: b5f0d0f648d9a6153664837026ba1be43d3e2503

branches/try/src/libcore/str/pattern.rs

Lines changed: 158 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -676,8 +676,10 @@ unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> {
676676
if searcher.end == 0 {
677677
return SearchStep::Done;
678678
}
679+
let is_long = searcher.memory == usize::MAX;
679680
match searcher.next_back::<RejectAndMatch>(self.haystack.as_bytes(),
680-
self.needle.as_bytes())
681+
self.needle.as_bytes(),
682+
is_long)
681683
{
682684
SearchStep::Reject(mut a, b) => {
683685
// skip to next char boundary
@@ -706,8 +708,16 @@ unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> {
706708
}
707709
}
708710
StrSearcherImpl::TwoWay(ref mut searcher) => {
709-
searcher.next_back::<MatchOnly>(self.haystack.as_bytes(),
710-
self.needle.as_bytes())
711+
let is_long = searcher.memory == usize::MAX;
712+
if is_long {
713+
searcher.next_back::<MatchOnly>(self.haystack.as_bytes(),
714+
self.needle.as_bytes(),
715+
true)
716+
} else {
717+
searcher.next_back::<MatchOnly>(self.haystack.as_bytes(),
718+
self.needle.as_bytes(),
719+
false)
720+
}
711721
}
712722
}
713723
}
@@ -718,14 +728,21 @@ unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> {
718728
#[derive(Clone, Debug)]
719729
struct TwoWaySearcher {
720730
// constants
731+
/// critical factorization index
721732
crit_pos: usize,
733+
/// critical factorization index for reversed needle
734+
crit_pos_back: usize,
722735
period: usize,
736+
/// `byteset` is an extension (not part of the two way algorithm);
737+
/// it's a 64-bit "fingerprint" where each set bit `j` corresponds
738+
/// to a (byte & 63) == j present in the needle.
723739
byteset: u64,
724740

725741
// variables
726742
position: usize,
727743
end: usize,
728-
memory: usize
744+
memory: usize,
745+
memory_back: usize,
729746
}
730747

731748
/*
@@ -797,6 +814,9 @@ struct TwoWaySearcher {
797814
798815
The purpose of maximal_suffix is to find such a critical factorization.
799816
817+
If the period is short, compute another factorization x = u' v' to use
818+
for reverse search, chosen instead so that |v'| < period(x).
819+
800820
*/
801821
impl TwoWaySearcher {
802822
fn new(needle: &[u8], end: usize) -> TwoWaySearcher {
@@ -810,10 +830,6 @@ impl TwoWaySearcher {
810830
(crit_pos_true, period_true)
811831
};
812832

813-
// This isn't in the original algorithm, as far as I'm aware.
814-
let byteset = needle.iter()
815-
.fold(0, |a, &b| (1 << ((b & 0x3f) as usize)) | a);
816-
817833
// A particularly readable explanation of what's going on here can be found
818834
// in Crochemore and Rytter's book "Text Algorithms", ch 13. Specifically
819835
// see the code for "Algorithm CP" on p. 323.
@@ -824,27 +840,51 @@ impl TwoWaySearcher {
824840
// "Algorithm CP2", which is optimized for when the period of the needle
825841
// is large.
826842
if &needle[..crit_pos] == &needle[period.. period + crit_pos] {
827-
// short period case
843+
// short period case -- the period is exact
844+
let byteset = needle[..period].iter()
845+
.fold(0, |a, &b| (1 << (b & 0x3f)) | a);
846+
847+
// compute a separate critical factorization for the reversed needle
848+
// x = u' v' where |v'| < period(x).
849+
//
850+
// This is sped up by the period being known already.
851+
// Note that a case like x = "acba" may be factored exactly forwards
852+
// (crit_pos = 1, period = 3) while being factored with approximate
853+
// period in reverse (crit_pos = 2, period = 2). We use the given
854+
// reverse factorization but keep the exact period.
855+
let crit_pos_back = needle.len() - cmp::max(
856+
TwoWaySearcher::reverse_maximal_suffix(needle, period, false),
857+
TwoWaySearcher::reverse_maximal_suffix(needle, period, true));
858+
828859
TwoWaySearcher {
829860
crit_pos: crit_pos,
861+
crit_pos_back: crit_pos_back,
830862
period: period,
831863
byteset: byteset,
832864

833865
position: 0,
834866
end: end,
835-
memory: 0
867+
memory: 0,
868+
// memory_back: index into the needle after which we have already matched
869+
memory_back: needle.len(),
836870
}
837871
} else {
838-
// long period case
839-
// we have an approximation to the actual period, and don't use memory.
872+
// long period case -- we have an approximation to the actual period,
873+
// and don't use memorization.
874+
875+
let byteset = needle.iter()
876+
.fold(0, |a, &b| (1 << (b & 0x3f)) | a);
877+
840878
TwoWaySearcher {
841879
crit_pos: crit_pos,
880+
crit_pos_back: crit_pos,
842881
period: cmp::max(crit_pos, needle.len() - crit_pos) + 1,
843882
byteset: byteset,
844883

845884
position: 0,
846885
end: end,
847-
memory: usize::MAX // Dummy value to signify that the period is long
886+
memory: usize::MAX, // Dummy value to signify that the period is long
887+
memory_back: usize::MAX,
848888
}
849889
}
850890
}
@@ -926,19 +966,18 @@ impl TwoWaySearcher {
926966

927967
// Follows the ideas in `next()`.
928968
//
929-
// All the definitions are completely symmetrical, with period(x) = period(reverse(x))
969+
// The definitions are symmetrical, with period(x) = period(reverse(x))
930970
// and local_period(u, v) = local_period(reverse(v), reverse(u)), so if (u, v)
931-
// is a critical factorization, so is (reverse(v), reverse(u)). Similarly,
932-
// the "period" stored in self.period is the real period if long_period is
933-
// false, and so is still valid for a reversed needle, and if long_period is
934-
// true, all the algorithm requires is that self.period is less than or
935-
// equal to the real period, which must be true for the forward case anyway.
971+
// is a critical factorization, so is (reverse(v), reverse(u)).
972+
//
973+
// For the short period case, using memorization, we rely on |u| < period(x).
974+
// For this case we have computed a critical factorization x = u' v'
975+
// where |v'| < period(x) instead (field `crit_pos_back`).
936976
//
937977
// To search in reverse through the haystack, we search forward through
938-
// a reversed haystack with a reversed needle, and the above paragraph shows
939-
// that the precomputed parameters can be left alone.
978+
// a reversed haystack with a reversed needle, matching first u' and then v'.
940979
#[inline]
941-
fn next_back<S>(&mut self, haystack: &[u8], needle: &[u8])
980+
fn next_back<S>(&mut self, haystack: &[u8], needle: &[u8], long_period: bool)
942981
-> S::Output
943982
where S: TwoWayStrategy
944983
{
@@ -959,21 +998,34 @@ impl TwoWaySearcher {
959998
// Quickly skip by large portions unrelated to our substring
960999
if !self.byteset_contains(haystack[self.end - needle.len()]) {
9611000
self.end -= needle.len();
1001+
if !long_period {
1002+
self.memory_back = needle.len();
1003+
}
9621004
continue 'search;
9631005
}
9641006

9651007
// See if the left part of the needle matches
966-
for i in (0..self.crit_pos).rev() {
1008+
let crit = if long_period { self.crit_pos_back }
1009+
else { cmp::min(self.crit_pos_back, self.memory_back) };
1010+
for i in (0..crit).rev() {
9671011
if needle[i] != haystack[self.end - needle.len() + i] {
968-
self.end -= self.crit_pos - i;
1012+
self.end -= self.crit_pos_back - i;
1013+
if !long_period {
1014+
self.memory_back = needle.len();
1015+
}
9691016
continue 'search;
9701017
}
9711018
}
9721019

9731020
// See if the right part of the needle matches
974-
for i in self.crit_pos..needle.len() {
1021+
let needle_end = if long_period { needle.len() }
1022+
else { self.memory_back };
1023+
for i in self.crit_pos_back..needle_end {
9751024
if needle[i] != haystack[self.end - needle.len() + i] {
9761025
self.end -= self.period;
1026+
if !long_period {
1027+
self.memory_back = self.period;
1028+
}
9771029
continue 'search;
9781030
}
9791031
}
@@ -982,53 +1034,110 @@ impl TwoWaySearcher {
9821034
let match_pos = self.end - needle.len();
9831035
// Note: sub self.period instead of needle.len() to have overlapping matches
9841036
self.end -= needle.len();
1037+
if !long_period {
1038+
self.memory_back = needle.len();
1039+
}
9851040

9861041
return S::matching(match_pos, match_pos + needle.len());
9871042
}
9881043
}
9891044

990-
// Computes a critical factorization (u, v) of `arr`.
991-
// Specifically, returns (i, p), where i is the starting index of v in some
992-
// critical factorization (u, v) and p = period(v)
1045+
// Compute the maximal suffix of `arr`.
1046+
//
1047+
// The maximal suffix is a possible critical factorization (u, v) of `arr`.
1048+
//
1049+
// Returns (`i`, `p`) where `i` is the starting index of v and `p` is the
1050+
// period of v.
1051+
//
1052+
// `order_greater` determines if lexical order is `<` or `>`. Both
1053+
// orders must be computed -- the ordering with the largest `i` gives
1054+
// a critical factorization.
1055+
//
1056+
// For long period cases, the resulting period is not exact (it is too short).
9931057
#[inline]
994-
fn maximal_suffix(arr: &[u8], reversed: bool) -> (usize, usize) {
995-
let mut left: usize = !0; // Corresponds to i in the paper
996-
let mut right = 0; // Corresponds to j in the paper
997-
let mut offset = 1; // Corresponds to k in the paper
1058+
fn maximal_suffix(arr: &[u8], order_greater: bool) -> (usize, usize) {
1059+
let mut left = 0; // Corresponds to i in the paper
1060+
let mut right = 1; // Corresponds to j in the paper
1061+
let mut offset = 0; // Corresponds to k in the paper
9981062
let mut period = 1; // Corresponds to p in the paper
9991063

1000-
while right + offset < arr.len() {
1001-
let a;
1002-
let b;
1003-
if reversed {
1004-
a = arr[left.wrapping_add(offset)];
1005-
b = arr[right + offset];
1064+
while let Some(&a) = arr.get(right + offset) {
1065+
// `left` will be inbounds when `right` is.
1066+
let b = arr[left + offset];
1067+
if (a < b && !order_greater) || (a > b && order_greater) {
1068+
// Suffix is smaller, period is entire prefix so far.
1069+
right += offset + 1;
1070+
offset = 0;
1071+
period = right - left;
1072+
} else if a == b {
1073+
// Advance through repetition of the current period.
1074+
if offset + 1 == period {
1075+
right += offset + 1;
1076+
offset = 0;
1077+
} else {
1078+
offset += 1;
1079+
}
10061080
} else {
1007-
a = arr[right + offset];
1008-
b = arr[left.wrapping_add(offset)];
1081+
// Suffix is larger, start over from current location.
1082+
left = right;
1083+
right += 1;
1084+
offset = 0;
1085+
period = 1;
10091086
}
1010-
if a < b {
1087+
}
1088+
(left, period)
1089+
}
1090+
1091+
// Compute the maximal suffix of the reverse of `arr`.
1092+
//
1093+
// The maximal suffix is a possible critical factorization (u', v') of `arr`.
1094+
//
1095+
// Returns `i` where `i` is the starting index of v', from the back;
1096+
// returns immediately when a period of `known_period` is reached.
1097+
//
1098+
// `order_greater` determines if lexical order is `<` or `>`. Both
1099+
// orders must be computed -- the ordering with the largest `i` gives
1100+
// a critical factorization.
1101+
//
1102+
// For long period cases, the resulting period is not exact (it is too short).
1103+
fn reverse_maximal_suffix(arr: &[u8], known_period: usize,
1104+
order_greater: bool) -> usize
1105+
{
1106+
let mut left = 0; // Corresponds to i in the paper
1107+
let mut right = 1; // Corresponds to j in the paper
1108+
let mut offset = 0; // Corresponds to k in the paper
1109+
let mut period = 1; // Corresponds to p in the paper
1110+
let n = arr.len();
1111+
1112+
while right + offset < n {
1113+
let a = arr[n - (1 + right + offset)];
1114+
let b = arr[n - (1 + left + offset)];
1115+
if (a < b && !order_greater) || (a > b && order_greater) {
10111116
// Suffix is smaller, period is entire prefix so far.
1012-
right += offset;
1013-
offset = 1;
1014-
period = right.wrapping_sub(left);
1117+
right += offset + 1;
1118+
offset = 0;
1119+
period = right - left;
10151120
} else if a == b {
10161121
// Advance through repetition of the current period.
1017-
if offset == period {
1018-
right += offset;
1019-
offset = 1;
1122+
if offset + 1 == period {
1123+
right += offset + 1;
1124+
offset = 0;
10201125
} else {
10211126
offset += 1;
10221127
}
10231128
} else {
10241129
// Suffix is larger, start over from current location.
10251130
left = right;
10261131
right += 1;
1027-
offset = 1;
1132+
offset = 0;
10281133
period = 1;
10291134
}
1135+
if period == known_period {
1136+
break;
1137+
}
10301138
}
1031-
(left.wrapping_add(1), period)
1139+
debug_assert!(period <= known_period);
1140+
left
10321141
}
10331142
}
10341143

0 commit comments

Comments
 (0)