Skip to content

Commit 20c92d7

Browse files
sozelfist and vil02 authored
Add another implementation of Levenshtein distance (#702)
* chore: add `edit_distance.rs` to DIRECTORY.md * feat: implement Edit Distance algorithm * chore(tests): add few more checks * chore: rename files - rename `src/string/levenshtein_distance.rs` to `src/string/levenshtein_distance/optimized_dp.rs` - move and rename `src/dynamic_programming/edit_distance.rs` to `src/string/levenshtein_distance/naive_dp.rs` * chore: rename `levenshtein_distance` function * chore: update DIRECTORY.md * chore: update `mod.rs` files * chore: format code with `fmt` * chore: update DIRECTORY.md * feat: implement levenshtein distance in both naive and optimized version using DP * chore: update DIRECTORY.md * chore(tests): update tests * ref: Refactor tests for Levenshtein distance calculation - Consolidated test cases into a constant array for improved readability and maintainability - Simplified test structure by removing macro-based test generation, enhancing code clarity - Introduced a `run_test_case` function to encapsulate test logic, enhancing test function conciseness. - Organized test suite into separate modules for naive and optimized implementations, promoting code organization. --------- Co-authored-by: Piotr Idzik <[email protected]>
1 parent c032677 commit 20c92d7

File tree

2 files changed

+133
-118
lines changed

2 files changed

+133
-118
lines changed

src/string/levenshtein_distance.rs

Lines changed: 132 additions & 117 deletions
Original file line numberDiff line numberDiff line change
@@ -1,52 +1,122 @@
1+
//! Provides functions to calculate the Levenshtein distance between two strings.
2+
//!
3+
//! The Levenshtein distance measures the difference between two strings as the minimum number of single-character
4+
//! edits (insertions, deletions, or substitutions) required to change one string into the other.
5+
16
use std::cmp::min;
27

3-
/// The Levenshtein distance (or edit distance) between 2 strings.\
4-
/// This edit distance is defined as being 1 point per insertion, substitution, or deletion which must be made to make the strings equal.
5-
/// This function iterates over the bytes in the string, so it may not behave entirely as expected for non-ASCII strings.
8+
/// Calculates the Levenshtein distance between two strings using a naive dynamic programming approach.
///
/// The Levenshtein distance is the minimum number of single-character edits
/// (insertions, deletions, or substitutions) required to change one string into the other.
///
/// Characters are compared as Unicode scalar values (`char`), so a multi-byte character such as `é`
/// counts as exactly one edit. Grapheme clusters that consist of several scalar values are not merged.
///
/// # Arguments
///
/// * `string1` - A reference to the first string.
/// * `string2` - A reference to the second string.
///
/// # Returns
///
/// The Levenshtein distance between the two input strings.
///
/// The full dynamic programming matrix is built top-to-bottom, left-to-right: cell `[i][j]` holds the
/// distance between the first `i` characters of `string1` and the first `j` characters of `string2`.
///
/// # Complexity
///
/// - Time complexity: O(nm),
/// - Space complexity: O(nm),
///
/// where n and m are the numbers of characters in `string1` and `string2`.
///
/// Note that this implementation keeps the whole matrix in memory without any space optimization,
/// so it consumes more memory on large inputs than a rolling-row implementation would.
pub fn naive_levenshtein_distance(string1: &str, string2: &str) -> usize {
    // Decode once so that indices refer to characters rather than UTF-8 bytes.
    let chars1: Vec<char> = string1.chars().collect();
    let chars2: Vec<char> = string2.chars().collect();

    let mut distance_matrix: Vec<Vec<usize>> = Vec::with_capacity(chars1.len() + 1);
    // Row 0: turning the empty prefix of `string1` into a prefix of `string2` takes `j` insertions.
    distance_matrix.push((0..=chars2.len()).collect());

    for (i, &c1) in chars1.iter().enumerate() {
        let prev_row = &distance_matrix[i];
        let mut row = Vec::with_capacity(chars2.len() + 1);
        // Column 0: turning the first `i + 1` characters of `string1` into "" takes `i + 1` deletions.
        row.push(i + 1);

        for (j, &c2) in chars2.iter().enumerate() {
            // Diagonal neighbour: free when the characters match, one substitution otherwise.
            let substitution = prev_row[j] + usize::from(c1 != c2);
            // Left neighbour on the row currently being built.
            let insertion = row[j] + 1;
            // Neighbour directly above, on the previous row.
            let deletion = prev_row[j + 1] + 1;
            row.push(substitution.min(insertion).min(deletion));
        }

        distance_matrix.push(row);
    }

    distance_matrix[chars1.len()][chars2.len()]
}
67+
68+
/// Calculates the Levenshtein distance between two strings using an optimized dynamic programming approach.
669
///
7-
/// For a detailed explanation, check the example on Uncyclopedia: <https://en.wikipedia.org/wiki/Levenshtein_distance>\
8-
/// (see the examples with the matrices, for instance between KITTEN and SITTING)
70+
/// This edit distance is defined as 1 point per insertion, substitution, or deletion required to make the strings equal.
971
///
10-
/// Note that although we compute a matrix, left-to-right, top-to-bottom, at each step all we need to compute `cell[i][j]` is:
11-
/// - `cell[i][j-1]`
12-
/// - `cell[i-j][j]`
13-
/// - `cell[i-i][j-1]`
72+
/// # Arguments
1473
///
15-
/// This can be achieved by only using one "rolling" row and one additional variable, when computed `cell[i][j]` (or `row[i]`):
16-
/// - `cell[i][j-1]` is the value to the left, on the same row (the one we just computed, `row[i-1]`)
17-
/// - `cell[i-1][j]` is the value at `row[i]`, the one we're changing
18-
/// - `cell[i-1][j-1]` was the value at `row[i-1]` before we changed it, for that we'll use a variable
74+
/// * `string1` - The first string.
75+
/// * `string2` - The second string.
76+
///
77+
/// # Returns
78+
///
79+
/// The Levenshtein distance between the two input strings.
80+
/// For a detailed explanation, check the example on [Wikipedia](https://en.wikipedia.org/wiki/Levenshtein_distance).
81+
/// This function iterates over the bytes in the string, so it may not behave entirely as expected for non-ASCII strings.
1982
///
20-
/// Doing this reduces space complexity from O(nm) to O(n)
83+
/// Note that this implementation utilizes an optimized dynamic programming approach, significantly reducing the space complexity from O(nm) to O(n), where n and m are the lengths of `string1` and `string2`.
2184
///
22-
/// Second note: if we want to minimize space, since we're now O(n) make sure you use the shortest string horizontally, and the longest vertically
85+
/// The rolling row is sized by `string1`, so to minimize space usage pass the shorter string as `string1` (laid out horizontally) and the longer one as `string2` (laid out vertically).
2386
///
2487
/// # Complexity
25-
/// - time complexity: O(nm),
26-
/// - space complexity: O(n),
2788
///
28-
/// where n and m are lengths of `str_a` and `str_b`
29-
pub fn levenshtein_distance(string1: &str, string2: &str) -> usize {
89+
/// - Time complexity: O(nm),
90+
/// - Space complexity: O(n),
91+
///
92+
/// where n and m are lengths of `string1` and `string2`.
93+
pub fn optimized_levenshtein_distance(string1: &str, string2: &str) -> usize {
3094
if string1.is_empty() {
3195
return string2.len();
3296
}
3397
let l1 = string1.len();
3498
let mut prev_dist: Vec<usize> = (0..=l1).collect();
3599

36100
for (row, c2) in string2.chars().enumerate() {
37-
let mut prev_substitution_cost = prev_dist[0]; // we'll keep a reference to matrix[i-1][j-1] (top-left cell)
38-
prev_dist[0] = row + 1; // diff with empty string, since `row` starts at 0, it's `row + 1`
101+
// we'll keep a reference to matrix[i-1][j-1] (top-left cell)
102+
let mut prev_substitution_cost = prev_dist[0];
103+
// diff with empty string, since `row` starts at 0, it's `row + 1`
104+
prev_dist[0] = row + 1;
39105

40106
for (col, c1) in string1.chars().enumerate() {
41-
let deletion_cost = prev_dist[col] + 1; // "on the left" in the matrix (i.e. the value we just computed)
42-
let insertion_cost = prev_dist[col + 1] + 1; // "on the top" in the matrix (means previous)
107+
// "on the left" in the matrix (i.e. the value we just computed)
108+
let deletion_cost = prev_dist[col] + 1;
109+
// "on the top" in the matrix (means previous)
110+
let insertion_cost = prev_dist[col + 1] + 1;
43111
let substitution_cost = if c1 == c2 {
44-
prev_substitution_cost // last char is the same on both ends, so the min_distance is left unchanged from matrix[i-1][i+1]
112+
// last char is the same on both ends, so the min_distance is left unchanged from matrix[i-1][j-1]
113+
prev_substitution_cost
45114
} else {
46-
prev_substitution_cost + 1 // substitute the last character
115+
// substitute the last character
116+
prev_substitution_cost + 1
47117
};
48-
49-
prev_substitution_cost = prev_dist[col + 1]; // save the old value at (i-1, j-1)
118+
// save the old value at (i-1, j-1)
119+
prev_substitution_cost = prev_dist[col + 1];
50120
prev_dist[col + 1] = _min3(deletion_cost, insertion_cost, substitution_cost);
51121
}
52122
}
@@ -60,94 +130,39 @@ fn _min3<T: Ord>(a: T, b: T, c: T) -> T {
60130

61131
#[cfg(test)]
62132
mod tests {
63-
use super::_min3;
64-
use super::levenshtein_distance;
65-
66-
#[test]
67-
fn test_doc_example() {
68-
assert_eq!(2, levenshtein_distance("FROG", "DOG"));
69-
}
70-
71-
#[test]
72-
fn return_0_with_empty_strings() {
73-
assert_eq!(0, levenshtein_distance("", ""));
74-
}
75-
76-
#[test]
77-
fn return_1_with_empty_and_a() {
78-
assert_eq!(1, levenshtein_distance("", "a"));
79-
}
80-
81-
#[test]
82-
fn return_1_with_a_and_empty() {
83-
assert_eq!(1, levenshtein_distance("a", ""));
84-
}
85-
86-
#[test]
87-
fn return_1_with_ab_and_a() {
88-
assert_eq!(1, levenshtein_distance("ab", "a"));
89-
}
90-
91-
#[test]
92-
fn return_0_with_foobar_and_foobar() {
93-
assert_eq!(0, levenshtein_distance("foobar", "foobar"));
94-
}
95-
96-
#[test]
97-
fn return_6_with_foobar_and_barfoo() {
98-
assert_eq!(6, levenshtein_distance("foobar", "barfoo"));
99-
}
100-
101-
#[test]
102-
fn return_1_with_kind_and_bind() {
103-
assert_eq!(1, levenshtein_distance("kind", "bind"));
104-
}
105-
106-
#[test]
107-
fn return_3_with_winner_and_win() {
108-
assert_eq!(3, levenshtein_distance("winner", "win"));
109-
}
110-
111-
#[test]
112-
fn equal_strings() {
113-
assert_eq!(0, levenshtein_distance("Hello, world!", "Hello, world!"));
114-
assert_eq!(0, levenshtein_distance("Hello, world!", "Hello, world!"));
115-
assert_eq!(0, levenshtein_distance("Test_Case_#1", "Test_Case_#1"));
116-
assert_eq!(0, levenshtein_distance("Test_Case_#1", "Test_Case_#1"));
117-
}
118-
119-
#[test]
120-
fn one_edit_difference() {
121-
assert_eq!(1, levenshtein_distance("Hello, world!", "Hell, world!"));
122-
assert_eq!(1, levenshtein_distance("Test_Case_#1", "Test_Case_#2"));
123-
assert_eq!(1, levenshtein_distance("Test_Case_#1", "Test_Case_#10"));
124-
assert_eq!(1, levenshtein_distance("Hello, world!", "Hell, world!"));
125-
assert_eq!(1, levenshtein_distance("Test_Case_#1", "Test_Case_#2"));
126-
assert_eq!(1, levenshtein_distance("Test_Case_#1", "Test_Case_#10"));
127-
}
128-
129-
#[test]
130-
fn several_differences() {
131-
assert_eq!(2, levenshtein_distance("My Cat", "My Case"));
132-
assert_eq!(7, levenshtein_distance("Hello, world!", "Goodbye, world!"));
133-
assert_eq!(6, levenshtein_distance("Test_Case_#3", "Case #3"));
134-
assert_eq!(2, levenshtein_distance("My Cat", "My Case"));
135-
assert_eq!(7, levenshtein_distance("Hello, world!", "Goodbye, world!"));
136-
assert_eq!(6, levenshtein_distance("Test_Case_#3", "Case #3"));
137-
}
138-
139-
#[test]
140-
fn return_1_with_1_2_3() {
141-
assert_eq!(1, _min3(1, 2, 3));
142-
}
143-
144-
#[test]
145-
fn return_1_with_3_2_1() {
146-
assert_eq!(1, _min3(3, 2, 1));
147-
}
148-
149-
#[test]
150-
fn return_1_with_2_3_1() {
151-
assert_eq!(1, _min3(2, 3, 1));
152-
}
133+
const LEVENSHTEIN_DISTANCE_TEST_CASES: &[(&str, &str, usize)] = &[
134+
("", "", 0),
135+
("Hello, World!", "Hello, World!", 0),
136+
("", "Rust", 4),
137+
("horse", "ros", 3),
138+
("tan", "elephant", 6),
139+
("execute", "intention", 8),
140+
];
141+
142+
macro_rules! levenshtein_distance_tests {
143+
($function:ident) => {
144+
mod $function {
145+
use super::*;
146+
147+
fn run_test_case(string1: &str, string2: &str, expected_distance: usize) {
148+
assert_eq!(super::super::$function(string1, string2), expected_distance);
149+
assert_eq!(super::super::$function(string2, string1), expected_distance);
150+
assert_eq!(super::super::$function(string1, string1), 0);
151+
assert_eq!(super::super::$function(string2, string2), 0);
152+
}
153+
154+
#[test]
155+
fn test_levenshtein_distance() {
156+
for &(string1, string2, expected_distance) in
157+
LEVENSHTEIN_DISTANCE_TEST_CASES.iter()
158+
{
159+
run_test_case(string1, string2, expected_distance);
160+
}
161+
}
162+
}
163+
};
164+
}
165+
166+
levenshtein_distance_tests!(naive_levenshtein_distance);
167+
levenshtein_distance_tests!(optimized_levenshtein_distance);
153168
}

src/string/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ pub use self::duval_algorithm::duval_algorithm;
3232
pub use self::hamming_distance::hamming_distance;
3333
pub use self::jaro_winkler_distance::jaro_winkler_distance;
3434
pub use self::knuth_morris_pratt::knuth_morris_pratt;
35-
pub use self::levenshtein_distance::levenshtein_distance;
35+
pub use self::levenshtein_distance::{naive_levenshtein_distance, optimized_levenshtein_distance};
3636
pub use self::lipogram::is_lipogram;
3737
pub use self::manacher::manacher;
3838
pub use self::palindrome::is_palindrome;

0 commit comments

Comments
 (0)