Skip to content

Add RegexSet functionality to C API #288

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Nov 8, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions regex-capi/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,5 @@ There are a few things missing from the C API that are present in the Rust API.
There's no particular (known) reason why they don't, they just haven't been
implemented yet.

* RegexSet, which permits matching multiple regular expressions simultaneously
in a single linear time search.
* Splitting a string by a regex.
* Replacing regex matches in a string with some other text.
188 changes: 188 additions & 0 deletions regex-capi/ctest/test.c
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,191 @@ bool test_compile_error_size_limit() {
return passed;
}

/*
 * Compiles a set of six patterns and exercises rure_set_len,
 * rure_set_is_match and rure_set_matches against the haystack "foobar".
 *
 * Returns true when every check passes and false otherwise. The set and the
 * error object are released on every path.
 */
bool test_regex_set_matches() {

#define PAT_COUNT 6

    const char *patterns[] = {
        "foo", "barfoo", "\\w+", "\\d+", "foobar", "bar"
    };
    /* Byte length of each entry of patterns, in the same order. */
    const size_t patterns_lengths[] = {
        3, 6, 3, 3, 6, 3
    };
    /* Expected per-pattern result when searching "foobar". */
    const bool expected[] = {
        true, false, true, false, true, true
    };
    const uint8_t *haystack = (const uint8_t *) "foobar";
    bool ok = false;

    rure_error *err = rure_error_new();
    rure_set *set = rure_compile_set((const uint8_t **) patterns,
                                     patterns_lengths,
                                     PAT_COUNT,
                                     0,
                                     NULL,
                                     err);

    if (set != NULL) {
        bool found[PAT_COUNT];
        int idx;

        /*
         * The checks short-circuit in the listed order, so a failing check
         * prevents the later calls from running.
         */
        ok = rure_set_len(set) == PAT_COUNT
            && rure_set_is_match(set, haystack, 6, 0)
            && !rure_set_is_match(set, (const uint8_t *) "", 0, 0)
            && rure_set_matches(set, haystack, 6, 0, found);

        /* Stop at the first pattern whose result differs from expected. */
        for (idx = 0; ok && idx < PAT_COUNT; ++idx) {
            ok = found[idx] == expected[idx];
        }

        rure_set_free(set);
    }

    rure_error_free(err);
    return ok;

#undef PAT_COUNT
}

/*
 * Checks that rure_set_is_match and rure_set_matches take the start offset
 * into account, using a set compiled from "foo", "bar" and "fooo".
 *
 * Returns true when every check passes and false otherwise. The set and the
 * error object are released on every path.
 */
bool test_regex_set_match_start() {

#define PAT_COUNT 3

    bool passed = true;
    const char *patterns[] = {
        "foo", "bar", "fooo"
    };
    /* Byte length of each entry of patterns, in the same order. */
    const size_t patterns_lengths[] = {
        3, 3, 4
    };

    rure_error *err = rure_error_new();
    rure_set *re = rure_compile_set((const uint8_t **) patterns,
                                    patterns_lengths,
                                    PAT_COUNT,
                                    0,
                                    NULL,
                                    err);
    if (re == NULL) {
        passed = false;
        goto done2;
    }

    if (rure_set_len(re) != PAT_COUNT) {
        passed = false;
        goto done1;
    }

    /*
     * Searching from offset 2 must not report a match.
     *
     * NOTE(review): the length 7 covers only "foobias" of the 9-byte
     * literal; confirm whether the truncation is intentional.
     */
    if (rure_set_is_match(re, (const uint8_t *) "foobiasdr", 7, 2)) {
        passed = false;
        goto done1;
    }

    /* From offset 0 every pattern is expected to match "fooobar". */
    {
        bool matches[PAT_COUNT];
        /*
         * "fooobar" is 7 bytes long. The length used to be 8, which made
         * the string literal's terminating NUL part of the haystack.
         */
        if (!rure_set_matches(re, (const uint8_t *) "fooobar", 7, 0, matches)) {
            passed = false;
            goto done1;
        }

        const bool match_target[] = {
            true, true, true
        };

        int i;
        for (i = 0; i < PAT_COUNT; ++i) {
            if (matches[i] != match_target[i]) {
                passed = false;
                goto done1;
            }
        }
    }

    /* From offset 1 only "bar" is expected to match "fooobar". */
    {
        bool matches[PAT_COUNT];
        if (!rure_set_matches(re, (const uint8_t *) "fooobar", 7, 1, matches)) {
            passed = false;
            goto done1;
        }

        const bool match_target[] = {
            false, true, false
        };

        int i;
        for (i = 0; i < PAT_COUNT; ++i) {
            if (matches[i] != match_target[i]) {
                passed = false;
                goto done1;
            }
        }
    }

done1:
    rure_set_free(re);
done2:
    rure_error_free(err);
    return passed;

#undef PAT_COUNT
}

/*
 * Checks that rure_compile_set honours rure_options: with the size limit set
 * to 0, compiling the single pattern \w{100} is expected to fail with a NULL
 * result and an error message containing "exceeds size".
 *
 * Returns true when both expectations hold and false otherwise.
 */
bool test_regex_set_options() {

    bool passed = true;
    rure_options *opts = rure_options_new();
    rure_options_size_limit(opts, 0);
    rure_error *err = rure_error_new();

    const char *patterns[] = { "\\w{100}" };
    /*
     * The pattern \w{100} is 7 bytes long. The length used to be 8, which
     * made the string literal's terminating NUL part of the pattern.
     */
    const size_t patterns_lengths[] = { 7 };

    rure_set *re = rure_compile_set((const uint8_t **) patterns,
                                    patterns_lengths,
                                    1, 0, opts, err);
    if (re != NULL) {
        if (DEBUG) {
            fprintf(stderr,
                    "[test_regex_set_options] "
                    "expected NULL regex pointer, but got non-NULL pointer\n");
        }
        passed = false;
        /* Compilation unexpectedly succeeded, so the set must be released. */
        rure_set_free(re);
    }
    const char *msg = rure_error_message(err);
    if (NULL == strstr(msg, "exceeds size")) {
        if (DEBUG) {
            fprintf(stderr,
                    "[test_regex_set_options] "
                    "expected an 'exceeds size' error message, but "
                    "got this instead: '%s'\n", msg);
        }
        passed = false;
    }
    rure_options_free(opts);
    rure_error_free(err);
    return passed;
}

void run_test(bool (test)(), const char *name, bool *passed) {
if (!test()) {
*passed = false;
Expand All @@ -353,6 +538,9 @@ int main() {
run_test(test_compile_error, "test_compile_error", &passed);
run_test(test_compile_error_size_limit, "test_compile_error_size_limit",
&passed);
run_test(test_regex_set_matches, "test_regex_set_match", &passed);
run_test(test_regex_set_options, "test_regex_set_options", &passed);
run_test(test_regex_set_match_start, "test_regex_set_match_start", &passed);

if (!passed) {
exit(1);
Expand Down
93 changes: 92 additions & 1 deletion regex-capi/include/rure.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,13 @@ extern "C" {
*/
typedef struct rure rure;

/*
 * rure_set is the type of a set of compiled regular expressions.
 *
 * A rure_set can be safely used from multiple threads simultaneously.
 */
typedef struct rure_set rure_set;

/*
* rure_options is the set of non-flag configuration options for compiling
* a regular expression. Currently, only two options are available: setting
Expand Down Expand Up @@ -165,7 +172,7 @@ rure *rure_compile(const uint8_t *pattern, size_t length,
/*
* rure_free frees the given compiled regular expression.
*
* This must be called at most once.
* This must be called at most once for any rure.
*/
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is slightly more clear I think. rure_set_free should match this.

void rure_free(rure *re);

Expand Down Expand Up @@ -446,6 +453,90 @@ void rure_options_size_limit(rure_options *options, size_t limit);
*/
void rure_options_dfa_size_limit(rure_options *options, size_t limit);

/*
 * rure_compile_set compiles the given list of patterns into a single regular
 * expression set which can be matched in a linear-scan. Each pattern in
 * patterns must be valid UTF-8, and patterns_lengths[i] is the length in
 * bytes of patterns[i].
 *
 * The number of patterns to compile is specified by patterns_count. patterns
 * and patterns_lengths must each contain at least this many entries.
 *
 * flags is a bitfield. Valid values are constants declared with prefix
 * RURE_FLAG_.
 *
 * options contains non-flag configuration settings. If it's NULL, default
 * settings are used. options may be freed immediately after a call to
 * rure_compile_set.
 *
 * error is set if there was a problem compiling any of the patterns. The C
 * tests treat a NULL return value as a compilation failure.
 *
 * The compiled expression set returned may be used from multiple threads.
 */
rure_set *rure_compile_set(const uint8_t **patterns,
const size_t *patterns_lengths,
size_t patterns_count,
uint32_t flags,
rure_options *options,
rure_error *error);

/*
 * rure_set_free frees the given compiled regular expression set, as returned
 * by rure_compile_set.
 *
 * This must be called at most once for any rure_set.
 */
void rure_set_free(rure_set *re);

/*
 * rure_set_is_match returns true if and only if any regexes within the set
 * match anywhere in the haystack. Once a match has been located, the
 * matching engine will quit immediately.
 *
 * haystack may contain arbitrary bytes, but ASCII compatible text is more
 * useful. UTF-8 is even more useful. Other text encodings aren't supported.
 * length should be the number of bytes in haystack.
 *
 * start is the position at which to start searching. Note that setting the
 * start position is distinct from incrementing the pointer, since the regex
 * engine may look at bytes before the start position to determine match
 * information. For example, if the start position is greater than 0, then the
 * \A ("begin text") anchor can never match.
 */
bool rure_set_is_match(rure_set *re, const uint8_t *haystack, size_t length,
size_t start);

/*
 * rure_set_matches compares each regex in the set against the haystack and
 * modifies matches with the match result of each pattern. Match results are
 * ordered in the same way as the rure_set was compiled. For example,
 * index 0 of matches corresponds to the first pattern passed to
 * `rure_compile_set`.
 *
 * haystack may contain arbitrary bytes, but ASCII compatible text is more
 * useful. UTF-8 is even more useful. Other text encodings aren't supported.
 * length should be the number of bytes in haystack.
 *
 * start is the position at which to start searching. Note that setting the
 * start position is distinct from incrementing the pointer, since the regex
 * engine may look at bytes before the start position to determine match
 * information. For example, if the start position is greater than 0, then the
 * \A ("begin text") anchor can never match.
 *
 * matches must point to an array with at least as many elements as the
 * number of patterns the rure_set was compiled with (see rure_set_len).
 *
 * NOTE(review): the return value is not described here. The C tests expect
 * true for a haystack in which at least one pattern matches; presumably it
 * reports whether any regex matched. Confirm against the implementation.
 *
 * Only use this function if you specifically need to know which regexes
 * matched within the set. To determine if any of the regexes matched without
 * caring which, use rure_set_is_match.
 */
bool rure_set_matches(rure_set *re, const uint8_t *haystack, size_t length,
size_t start, bool *matches);

/*
 * rure_set_len returns the number of patterns the given rure_set was
 * compiled with.
 */
size_t rure_set_len(rure_set *re);

/*
* rure_error_new allocates space for an error.
*
Expand Down
Loading