@@ -419,76 +419,8 @@ struct TwoWaySearcher {
419
419
memory : uint
420
420
}
421
421
422
- /*
423
- This is the Two-Way search algorithm, which was introduced in the paper:
424
- Crochemore, M., Perrin, D., 1991, Two-way string-matching, Journal of the ACM 38(3):651-675.
425
-
426
- Here's some background information.
427
-
428
- A *word* is a string of symbols. The *length* of a word should be a familiar
429
- notion, and here we denote it for any word x by |x|.
430
- (We also allow for the possibility of the *empty word*, a word of length zero).
431
-
432
- If x is any non-empty word, then an integer p with 0 < p <= |x| is said to be a
433
- *period* for x iff for all i with 0 <= i <= |x| - p - 1, we have x[i] == x[i+p].
434
- For example, both 1 and 2 are periods for the string "aa". As another example,
435
- the only period of the string "abcd" is 4.
436
-
437
- We denote by period(x) the *smallest* period of x (provided that x is non-empty).
438
- This is always well-defined since every non-empty word x has at least one period,
439
- |x|. We sometimes call this *the period* of x.
440
-
441
- If u, v and x are words such that x = uv, where uv is the concatenation of u and
442
- v, then we say that (u, v) is a *factorization* of x.
443
-
444
- Let (u, v) be a factorization for a word x. Then if w is a non-empty word such
445
- that both of the following hold
446
-
447
- - either w is a suffix of u or u is a suffix of w
448
- - either w is a prefix of v or v is a prefix of w
449
-
450
- then w is said to be a *repetition* for the factorization (u, v).
451
-
452
- Just to unpack this, there are four possibilities here. Let w = "abc". Then we
453
- might have:
454
-
455
- - w is a suffix of u and w is a prefix of v. ex: ("lolabc", "abcde")
456
- - w is a suffix of u and v is a prefix of w. ex: ("lolabc", "ab")
457
- - u is a suffix of w and w is a prefix of v. ex: ("bc", "abchi")
458
- - u is a suffix of w and v is a prefix of w. ex: ("bc", "a")
459
-
460
- Note that the word vu is a repetition for any factorization (u,v) of x = uv,
461
- so every factorization has at least one repetition.
462
-
463
- If x is a string and (u, v) is a factorization for x, then a *local period* for
464
- (u, v) is an integer r such that there is some word w such that |w| = r and w is
465
- a repetition for (u, v).
466
-
467
- We denote by local_period(u, v) the smallest local period of (u, v). We sometimes
468
- call this *the local period* of (u, v). Provided that x = uv is non-empty, this
469
- is well-defined (because each factorization has at least one repetition, as
470
- noted above).
471
-
472
- It can be proven that the following is an equivalent definition of a local period
473
- for a factorization (u, v): any positive integer r such that x[i] == x[i+r] for
474
- all i such that |u| - r <= i <= |u| - 1 and such that both x[i] and x[i+r] are
475
- defined. (i.e. i >= 0 and i + r < |x|).
476
-
477
- Using the above reformulation, it is easy to prove that
478
-
479
- 1 <= local_period(u, v) <= period(uv)
480
-
481
- A factorization (u, v) of x such that local_period(u,v) = period(x) is called a
482
- *critical factorization*.
483
-
484
- The algorithm hinges on the following theorem, which is stated without proof:
485
-
486
- **Critical Factorization Theorem** Any word x has at least one critical
487
- factorization (u, v) such that |u| < period(x).
488
-
489
- The purpose of maximal_suffix is to find such a critical factorization.
490
-
491
- */
422
+ // This is the Two-Way search algorithm, which was introduced in the paper:
423
+ // Crochemore, M., Perrin, D., 1991, Two-way string-matching, Journal of the ACM 38(3):651-675.
492
424
impl TwoWaySearcher {
493
425
fn new ( needle : & [ u8 ] ) -> TwoWaySearcher {
494
426
let ( crit_pos1, period1) = TwoWaySearcher :: maximal_suffix ( needle, false ) ;
@@ -504,19 +436,15 @@ impl TwoWaySearcher {
504
436
period = period2;
505
437
}
506
438
507
- // This isn't in the original algorithm, as far as I'm aware.
508
439
let byteset = needle. iter ( )
509
440
. fold ( 0 , |a, & b| ( 1 << ( ( b & 0x3f ) as uint ) ) | a) ;
510
441
511
- // A particularly readable explanation of what's going on here can be found
512
- // in Crochemore and Rytter's book "Text Algorithms", ch 13. Specifically
513
- // see the code for "Algorithm CP" on p. 323.
442
+ // The logic here (calculating crit_pos and period, the final if statement to see which
443
+ // period to use for the TwoWaySearcher) is essentially an implementation of the
444
+ // "small-period" function from the paper (p. 670).
514
445
//
515
- // What's going on is we have some critical factorization (u, v) of the
516
- // needle, and we want to determine whether u is a suffix of
517
- // v.slice_to(period). If it is, we use "Algorithm CP1". Otherwise we use
518
- // "Algorithm CP2", which is optimized for when the period of the needle
519
- // is large.
446
+ // In the paper they check whether `needle.slice_to(crit_pos)` is a suffix of
447
+ // `needle.slice(crit_pos, crit_pos + period)`, which is precisely what this does
520
448
if needle. slice_to ( crit_pos) == needle. slice ( period, period + crit_pos) {
521
449
TwoWaySearcher {
522
450
crit_pos : crit_pos,
@@ -538,11 +466,6 @@ impl TwoWaySearcher {
538
466
}
539
467
}
540
468
541
- // One of the main ideas of Two-Way is that we factorize the needle into
542
- // two halves, (u, v), and begin trying to find v in the haystack by scanning
543
- // left to right. If v matches, we try to match u by scanning right to left.
544
- // How far we can jump when we encounter a mismatch is all based on the fact
545
- // that (u, v) is a critical factorization for the needle.
546
469
#[ inline]
547
470
fn next ( & mut self , haystack : & [ u8 ] , needle : & [ u8 ] , long_period : bool ) -> Option < ( uint , uint ) > {
548
471
' search: loop {
@@ -556,9 +479,6 @@ impl TwoWaySearcher {
556
479
( ( haystack[ self . position + needle. len ( ) - 1 ] & 0x3f )
557
480
as uint ) ) & 1 == 0 {
558
481
self . position += needle. len ( ) ;
559
- if !long_period {
560
- self . memory = 0 ;
561
- }
562
482
continue ' search;
563
483
}
564
484
@@ -597,9 +517,9 @@ impl TwoWaySearcher {
597
517
}
598
518
}
599
519
600
- // Computes a critical factorization (u, v) of `arr`.
601
- // Specifically, returns (i, p), where i is the starting index of v in some
602
- // critical factorization (u, v) and p = period(v)
520
+ // returns (i, p) where i is the "critical position", the starting index of
521
+ // the maximal suffix, and p is the period of the suffix
522
+ // see p. 668 of the paper
603
523
#[ inline]
604
524
fn maximal_suffix ( arr : & [ u8 ] , reversed : bool ) -> ( uint , uint ) {
605
525
let mut left = -1 ; // Corresponds to i in the paper
0 commit comments