Skip to content

Commit 83263ae

Browse files
committed
Add support for C++ boost::regex to benchmarks
This is very similar to the `std::regex` benchmark implementation since Boost.Regex and `std::regex` have very similar APIs and regex grammar support. As such, it uses the `stdcpp` Rust and C FFIs to reduce code duplication.

* bench/Cargo.toml: add `re-boost` feature
* bench/build.rs: add `cboost` library to bench build. This uses a compiler preprocessor definition to indicate whether or not to use Boost when compiling the `stdcpp` FFI.
* bench/compile: add `re-boost` feature to bench compile script
* bench/run: add `re-boost` feature to bench run script
* bench/src/bench.rs: use `ffi::stdcpp::Regex`, define its `text!` macro, and `Text` type for feature `re-boost`
* bench/src/ffi/mod.rs: declare `stdcpp` module for `re-boost` feature
* bench/src/ffi/stdcpp.cpp: implement C API using C++ `boost::regex`. The Boost.Regex API is very similar to the `std::regex` API and therefore only uses a different namespace.
* bench/src/main.rs: add boost to bench main
* bench/src/misc.rs:
  - do not run `match_class_unicode` benchmark for `re-boost` feature because `boost::regex` ECMAScript grammar does not support unicode character classes
* bench/src/sherlock.rs:
  - do not run `name_sherlock_nocase`, `name_holmes_nocase`, `name_sherlock_holmes_nocase`, `name_alt3_nocase`, `name_alt4_nocase`, `name_alt5_nocase`, `the_nocase`, `everything_greedy_nl`, and `line_boundary_sherlock_holmes` benchmarks for `re-boost` feature because `boost::regex` ECMAScript grammar does not support inline modifier syntax
  - do not run `letters`, `letters_upper`, and `letters_lower` benchmarks for `re-boost` feature because `boost::regex` ECMAScript grammar does not support unicode character classes
  - use a different regex for `everything_greedy` benchmark because `boost::regex` '.' does not match '\r'
  - `words` benchmark for `boost::regex` matches RE2 test result, so use that test for `re-boost` feature as well
  - do not run `holmes_coword_watson` benchmark for `re-boost` feature because Boost.Regex implementation currently seems to have exponential behavior here
1 parent abc30a8 commit 83263ae

File tree

10 files changed

+84
-24
lines changed

10 files changed

+84
-24
lines changed

bench/Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,9 @@ re-pcre1 = ["libpcre-sys"]
4747
re-pcre2 = []
4848
re-onig = ["onig"]
4949
re-stdcpp = []
50-
re-re2 = []
5150
libcxx = []
51+
re-boost = []
52+
re-re2 = []
5253
re-dphobos = []
5354
re-dphobos-dmd = ["re-dphobos"]
5455
re-dphobos-dmd-ct = ["re-dphobos-dmd"]

bench/build.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,15 @@ fn main() {
3535
.compile("libcstdcpp.a");
3636
}
3737
}
38+
if env::var("CARGO_FEATURE_RE_BOOST").is_ok() {
39+
// stdcpp is a C++ library, so we need to compile our shim layer.
40+
cc::Build::new()
41+
.cpp(true)
42+
.file("src/ffi/stdcpp.cpp")
43+
.define("USE_BOOST", None)
44+
.compile("libcboost.a");
45+
println!("cargo:rustc-link-lib=boost_regex");
46+
}
3847
if env::var("CARGO_FEATURE_RE_RE2").is_ok() {
3948
// RE2 is a C++ library, so we need to compile our shim layer.
4049
cc::Build::new()

bench/compile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,5 @@
22

33
exec cargo build \
44
--release \
5-
--features 're-stdcpp re-re2 re-onig re-pcre1 re-pcre2 re-rust re-rust-bytes re-tcl re-dphobos-dmd re-dphobos-ldc' \
5+
--features 're-stdcpp re-boost re-re2 re-onig re-pcre1 re-pcre2 re-rust re-rust-bytes re-tcl re-dphobos-dmd re-dphobos-ldc' \
66
"$@"

bench/run

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#!/bin/bash
22

33
usage() {
4-
echo "Usage: $(basename $0) [dphobos-dmd | dphobos-ldc | dphobos-dmd-ct | dphobos-ldc-ct | rust | rust-bytes | pcre1 | pcre2 | stdcpp | stdcpp-libcxx | re2 | onig | tcl ]" >&2
4+
echo "Usage: $(basename $0) [dphobos-dmd | dphobos-ldc | dphobos-dmd-ct | dphobos-ldc-ct | rust | rust-bytes | pcre1 | pcre2 | stdcpp | stdcpp-libcxx | boost | re2 | onig | tcl ]" >&2
55
exit 1
66
}
77

@@ -36,6 +36,9 @@ case $which in
3636
stdcpp-libcxx)
3737
exec cargo bench --bench bench --features 're-stdcpp libcxx' "$@"
3838
;;
39+
boost)
40+
exec cargo bench --bench bench --features re-boost "$@"
41+
;;
3942
re2)
4043
exec cargo bench --bench bench --features re-re2 "$@"
4144
;;

bench/src/bench.rs

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,14 +28,16 @@ extern crate regex;
2828
extern crate regex_syntax;
2929
extern crate test;
3030

31-
3231
#[cfg(feature = "re-onig")]
3332
pub use ffi::onig::Regex;
3433
#[cfg(feature = "re-pcre1")]
3534
pub use ffi::pcre1::Regex;
3635
#[cfg(feature = "re-pcre2")]
3736
pub use ffi::pcre2::Regex;
38-
#[cfg(feature = "re-stdcpp")]
37+
#[cfg(any(
38+
feature = "re-stdcpp",
39+
feature = "re-boost",
40+
))]
3941
pub use ffi::stdcpp::Regex;
4042
#[cfg(feature = "re-re2")]
4143
pub use ffi::re2::Regex;
@@ -93,6 +95,7 @@ macro_rules! text {
9395
feature = "re-pcre1",
9496
feature = "re-pcre2",
9597
feature = "re-stdcpp",
98+
feature = "re-boost",
9699
feature = "re-re2",
97100
feature = "re-dphobos",
98101
feature = "re-rust",
@@ -111,6 +114,7 @@ type Text = Vec<u8>;
111114
feature = "re-pcre1",
112115
feature = "re-pcre2",
113116
feature = "re-stdcpp",
117+
feature = "re-boost",
114118
feature = "re-re2",
115119
feature = "re-dphobos",
116120
feature = "re-rust",

bench/src/ffi/mod.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,10 @@ pub mod onig;
2020
pub mod pcre1;
2121
#[cfg(feature = "re-pcre2")]
2222
pub mod pcre2;
23-
#[cfg(feature = "re-stdcpp")]
23+
#[cfg(any(
24+
feature = "re-stdcpp",
25+
feature = "re-boost",
26+
))]
2427
pub mod stdcpp;
2528
#[cfg(feature = "re-re2")]
2629
pub mod re2;

bench/src/ffi/stdcpp.cpp

Lines changed: 26 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,17 @@
1+
#ifdef USE_BOOST
2+
#include <boost/regex.hpp>
3+
#else
14
#include <regex>
5+
#endif
26

37
extern "C" {
8+
9+
#ifdef USE_BOOST
10+
namespace regex_ns = boost;
11+
#else
12+
namespace regex_ns = std;
13+
#endif
14+
415
typedef void stdcpp_regexp;
516

617
typedef struct stdcpp_string {
@@ -9,34 +20,34 @@ extern "C" {
920
} stdcpp_string;
1021

1122
stdcpp_regexp* stdcpp_regexp_new(stdcpp_string pat) {
12-
return reinterpret_cast<stdcpp_regexp*>(new std::regex(pat.text,
13-
pat.len,
14-
std::regex::optimize));
23+
return reinterpret_cast<stdcpp_regexp*>(new regex_ns::regex(pat.text,
24+
pat.len,
25+
regex_ns::regex::optimize));
1526
}
1627

1728
void stdcpp_regexp_free(stdcpp_regexp *re) {
18-
delete reinterpret_cast<std::regex*>(re);
29+
delete reinterpret_cast<regex_ns::regex*>(re);
1930
}
2031

2132
bool stdcpp_regexp_match(stdcpp_regexp *re, stdcpp_string text,
2233
int startpos, int endpos) {
23-
std::regex cpp_re(*reinterpret_cast<std::regex*>(re));
24-
return std::regex_search(text.text + startpos, text.text + endpos,
25-
cpp_re);
34+
regex_ns::regex cpp_re(*reinterpret_cast<regex_ns::regex*>(re));
35+
return regex_ns::regex_search(text.text + startpos, text.text + endpos,
36+
cpp_re);
2637
}
2738

2839
bool stdcpp_regexp_find(stdcpp_regexp *re, stdcpp_string text,
2940
int startpos, int endpos,
3041
int *match_start, int *match_end) {
31-
std::regex cpp_re(*reinterpret_cast<std::regex*>(re));
32-
std::cmatch result;
33-
bool matched;
34-
matched = std::regex_search(text.text + startpos, text.text + endpos,
35-
result, cpp_re);
36-
if (matched) {
42+
regex_ns::regex cpp_re(*reinterpret_cast<regex_ns::regex*>(re));
43+
regex_ns::cmatch result;
44+
bool matched;
45+
matched = regex_ns::regex_search(text.text + startpos, text.text + endpos,
46+
result, cpp_re);
47+
if (matched) {
3748
*match_start = result[0].first - text.text;
3849
*match_end = *match_start + result.length(0);
39-
}
40-
return matched;
50+
}
51+
return matched;
4152
}
4253
}

bench/src/main.rs

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,8 +135,15 @@ fn count_pcre2(pat: &str, haystack: &str) -> usize {
135135
Regex::new(pat).unwrap().find_iter(haystack).count()
136136
}
137137

138+
#[cfg(not(any(
139+
feature = "re-stdcpp",
140+
feature = "re-boost",
141+
)))]
138142
nada!("re-stdcpp", count_stdcpp);
139-
#[cfg(feature = "re-stdcpp")]
143+
#[cfg(any(
144+
feature = "re-stdcpp",
145+
feature = "re-boost",
146+
))]
140147
fn count_stdcpp(pat: &str, haystack: &str) -> usize {
141148
use ffi::stdcpp::Regex;
142149
Regex::new(pat).unwrap().find_iter(haystack).count()

bench/src/misc.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,9 @@ bench_match!(match_class_in_range, "[ac]", {
4646
});
4747

4848
#[cfg(not(feature = "re-rust-bytes"))]
49+
// std C++ does not support unicode character classes
4950
#[cfg(not(feature = "re-stdcpp"))]
51+
#[cfg(not(feature = "re-boost"))]
5052
#[cfg(not(feature = "re-tcl"))]
5153
bench_match!(match_class_unicode, r"\p{L}", {
5254
format!("{}a", repeat("☃5☃5").take(20).collect::<String>())

bench/src/sherlock.rs

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,12 +37,15 @@ sherlock!(name_sherlock_holmes, r"Sherlock Holmes", 91);
3737
// limit. All of these should be able to use either memchr2 or memchr3.
3838
// std C++ does not support inline modifier syntax
3939
#[cfg(not(feature = "re-stdcpp"))]
40+
#[cfg(not(feature = "re-boost"))]
4041
sherlock!(name_sherlock_nocase, r"(?i)Sherlock", 102);
4142
// std C++ does not support inline modifier syntax
4243
#[cfg(not(feature = "re-stdcpp"))]
44+
#[cfg(not(feature = "re-boost"))]
4345
sherlock!(name_holmes_nocase, r"(?i)Holmes", 467);
4446
// std C++ does not support inline modifier syntax
4547
#[cfg(not(feature = "re-stdcpp"))]
48+
#[cfg(not(feature = "re-boost"))]
4649
sherlock!(name_sherlock_holmes_nocase, r"(?i)Sherlock Holmes", 96);
4750

4851
// Will quickly find instances of 'Sherlock', but then needs to fall back to
@@ -63,6 +66,7 @@ sherlock!(name_alt3, r"Sherlock|Holmes|Watson|Irene|Adler|John|Baker", 740);
6366
// Still using Aho-Corasick, but needs the lazy DFA.
6467
// std C++ does not support inline modifier syntax
6568
#[cfg(not(feature = "re-stdcpp"))]
69+
#[cfg(not(feature = "re-boost"))]
6670
sherlock!(
6771
name_alt3_nocase,
6872
r"(?i)Sherlock|Holmes|Watson|Irene|Adler|John|Baker",
@@ -72,11 +76,13 @@ sherlock!(
7276
sherlock!(name_alt4, r"Sher[a-z]+|Hol[a-z]+", 582);
7377
// std C++ does not support inline modifier syntax
7478
#[cfg(not(feature = "re-stdcpp"))]
79+
#[cfg(not(feature = "re-boost"))]
7580
sherlock!(name_alt4_nocase, r"(?i)Sher[a-z]+|Hol[a-z]+", 697);
7681
// Uses Aho-Corasick, but can use memchr3 (unlike name_alt3).
7782
sherlock!(name_alt5, r"Sherlock|Holmes|Watson", 639);
7883
// std C++ does not support inline modifier syntax
7984
#[cfg(not(feature = "re-stdcpp"))]
85+
#[cfg(not(feature = "re-boost"))]
8086
sherlock!(name_alt5_nocase, r"(?i)Sherlock|Holmes|Watson", 650);
8187

8288
// How long does it take to discover that there's no match? In the first two
@@ -94,6 +100,7 @@ sherlock!(the_lower, r"the", 7218);
94100
sherlock!(the_upper, r"The", 741);
95101
// std C++ does not support inline modifier syntax
96102
#[cfg(not(feature = "re-stdcpp"))]
103+
#[cfg(not(feature = "re-boost"))]
97104
sherlock!(the_nocase, r"(?i)the", 7987);
98105

99106
// Process whitespace after a very common word.
@@ -106,40 +113,51 @@ sherlock!(the_whitespace, r"the\s+\w+", 5410);
106113
#[cfg(not(feature = "re-pcre1"))]
107114
#[cfg(not(feature = "re-pcre2"))]
108115
#[cfg(not(feature = "re-stdcpp"))]
116+
#[cfg(not(feature = "re-boost"))]
109117
#[cfg(not(feature = "re-tcl"))]
110118
sherlock!(everything_greedy, r".*", 13053);
111119
// std::regex . does not match \r
112120
#[cfg(feature = "re-stdcpp")]
121+
#[cfg(feature = "re-boost")]
113122
sherlock!(everything_greedy, r"[^\n]*", 13053);
114123
#[cfg(not(feature = "re-dphobos"))]
115124
#[cfg(not(feature = "re-onig"))]
116125
#[cfg(not(feature = "re-pcre1"))]
117126
#[cfg(not(feature = "re-pcre2"))]
118127
// std C++ does not support inline modifier syntax
119128
#[cfg(not(feature = "re-stdcpp"))]
129+
#[cfg(not(feature = "re-boost"))]
120130
#[cfg(not(feature = "re-tcl"))]
121131
sherlock!(everything_greedy_nl, r"(?s).*", 1);
122132

123133
// How fast can we match every letter? This also defeats any clever prefix
124134
// tricks.
135+
// std C++ does not support unicode character classes
125136
#[cfg(not(feature = "re-stdcpp"))]
137+
#[cfg(not(feature = "re-boost"))]
126138
#[cfg(not(feature = "re-tcl"))]
127139
sherlock!(letters, r"\p{L}", 447160);
128140

141+
// std C++ does not support unicode character classes
129142
#[cfg(not(feature = "re-stdcpp"))]
143+
#[cfg(not(feature = "re-boost"))]
130144
#[cfg(not(feature = "re-tcl"))]
131145
sherlock!(letters_upper, r"\p{Lu}", 14180);
132146

147+
// std C++ does not support unicode character classes
133148
#[cfg(not(feature = "re-stdcpp"))]
149+
#[cfg(not(feature = "re-boost"))]
134150
#[cfg(not(feature = "re-tcl"))]
135151
sherlock!(letters_lower, r"\p{Ll}", 432980);
136152

137153
// Similarly, for words.
138-
#[cfg(not(feature = "re-re2"))]
139154
#[cfg(not(feature = "re-stdcpp"))]
155+
#[cfg(not(feature = "re-boost"))]
156+
#[cfg(not(feature = "re-re2"))]
140157
sherlock!(words, r"\w+", 109214);
141-
#[cfg(feature = "re-re2")]
142158
#[cfg(feature = "re-stdcpp")]
159+
#[cfg(feature = "re-boost")]
160+
#[cfg(feature = "re-re2")]
143161
sherlock!(words, r"\w+", 109222); // hmm, why does RE2 diverge here?
144162

145163
// Find complete words before Holmes. The `\w` defeats any prefix
@@ -162,6 +180,7 @@ sherlock!(holmes_cochar_watson, r"Holmes.{0,25}Watson|Watson.{0,25}Holmes", 7);
162180
#[cfg(not(feature = "re-pcre1"))]
163181
#[cfg(not(feature = "re-pcre2"))]
164182
#[cfg(not(feature = "re-stdcpp"))]
183+
#[cfg(not(feature = "re-boost"))]
165184
#[cfg(not(feature = "re-tcl"))]
166185
sherlock!(
167186
holmes_coword_watson,
@@ -178,6 +197,7 @@ sherlock!(quotes, r#"["'][^"']{0,30}[?!.]["']"#, 767);
178197
// lazy DFA the entire way.
179198
// std C++ does not support multiline until C++17 nor the inline modifier syntax
180199
#[cfg(not(feature = "re-stdcpp"))]
200+
#[cfg(not(feature = "re-boost"))]
181201
#[cfg(not(feature = "re-dphobos"))]
182202
sherlock!(
183203
line_boundary_sherlock_holmes,

0 commit comments

Comments
 (0)