Skip to content

Commit d8074a8

Browse files
committed
Update
1 parent 9468fe3 commit d8074a8

File tree

4 files changed

+38
-28
lines changed

4 files changed

+38
-28
lines changed

include/pytorch/tokenizers/pcre2_regex.h

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,11 @@
55

66
// Define PCRE2 code unit width before including pcre2.h
77
#define PCRE2_CODE_UNIT_WIDTH 8
8-
9-
// Third Party
108
#include <pcre2.h>
119

12-
// Local
13-
#include "regex.h"
10+
#include <pytorch/tokenizers/regex.h>
11+
12+
namespace tokenizers {
1413

1514
/**
1615
* @brief PCRE2-based implementation of IRegex.
@@ -50,5 +49,8 @@ class Pcre2Regex : public IRegex {
5049
pcre2_match_data* match_data_;
5150
bool is_valid_;
5251

53-
friend std::unique_ptr<IRegex> createRegex(const std::string& pattern);
54-
};
52+
friend Result<std::unique_ptr<IRegex>> createRegex(
53+
const std::string& pattern);
54+
};
55+
56+
} // namespace tokenizers

src/pcre2_regex.cpp

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
1-
#include "pytorch/tokenizers/pcre2_regex.h"
2-
31
#include <iostream>
42
#include <vector>
53

4+
#include <pytorch/tokenizers/pcre2_regex.h>
5+
6+
namespace tokenizers {
7+
68
Pcre2Regex::Pcre2Regex(const std::string& pattern)
79
: regex_(nullptr), match_data_(nullptr), is_valid_(false) {
810
int error_code;
@@ -108,4 +110,6 @@ bool Pcre2Regex::ok() const {
108110

109111
const pcre2_code* Pcre2Regex::rawRegex() const {
110112
return regex_;
111-
}
113+
}
114+
115+
} // namespace tokenizers

src/regex.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
1-
#include <pytorch/tokenizers/regex.h>
1+
#include <pytorch/tokenizers/pcre2_regex.h>
22
#include <pytorch/tokenizers/re2_regex.h>
3+
#include <pytorch/tokenizers/regex.h>
34
#include <pytorch/tokenizers/std_regex.h>
45

56
#include <re2/re2.h>

test/test_regex.cpp

Lines changed: 21 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,14 @@
11
#include <gtest/gtest.h>
22

3-
#include "pytorch/tokenizers/regex.h"
4-
#include "pytorch/tokenizers/re2_regex.h"
53
#include "pytorch/tokenizers/pcre2_regex.h"
4+
#include "pytorch/tokenizers/re2_regex.h"
5+
#include "pytorch/tokenizers/regex.h"
6+
7+
using namespace tokenizers;
68

79
// Test basic functionality
810
TEST(RegexTest, BasicMatching) {
9-
auto regex = createRegex("\\w+");
11+
auto regex = TK_UNWRAP_THROW(createRegex("\\w+"));
1012
ASSERT_TRUE(regex->ok());
1113

1214
std::string text = "Hello world";
@@ -24,9 +26,9 @@ TEST(RegexTest, Pcre2Specific) {
2426
const std::string pattern = "(?<=@)\\w+";
2527
Re2Regex re2_regex(pattern);
2628
ASSERT_FALSE(re2_regex.ok());
27-
29+
2830
// Now verify that the factory function falls back on a PCRE2 regex
29-
auto regex = createRegex(pattern);
31+
auto regex = TK_UNWRAP_THROW(createRegex(pattern));
3032
ASSERT_TRUE(regex->ok());
3133

3234
std::string text = "[email protected]";
@@ -40,20 +42,21 @@ TEST(RegexTest, Pcre2Specific) {
4042
// This specific pattern is from the Qwen2.5 1.5B pretokenizer.
4143
// https://huggingface.co/Qwen/Qwen2.5-1.5B/raw/main/tokenizer.json
4244
TEST(RegexTest, ComplexPatternWithNegativeLookahead) {
43-
const std::string complex_pattern = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+";
44-
45+
const std::string complex_pattern =
46+
"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+";
47+
4548
// First verify that RE2 cannot handle this pattern
4649
Re2Regex re2_regex(complex_pattern);
4750
ASSERT_FALSE(re2_regex.ok());
48-
51+
4952
// Now verify that the factory function falls back on a PCRE2 regex
50-
auto regex = createRegex(complex_pattern);
53+
auto regex = TK_UNWRAP_THROW(createRegex(complex_pattern));
5154
ASSERT_TRUE(regex->ok());
52-
55+
5356
// Test the pattern with some sample text
5457
std::string text = "Hello's world\n test";
5558
auto matches = regex->findAll(text);
56-
59+
5760
// We expect to match:
5861
// 1. "Hello" (word)
5962
// 2. "'s" (contraction)
@@ -62,22 +65,22 @@ TEST(RegexTest, ComplexPatternWithNegativeLookahead) {
6265
// 5. " " (whitespace)
6366
// 6. " test" (word with leading space)
6467
ASSERT_EQ(matches.size(), 6);
65-
68+
6669
EXPECT_EQ(matches[0].text, "Hello");
6770
EXPECT_EQ(matches[0].position, 0);
68-
71+
6972
EXPECT_EQ(matches[1].text, "'s");
7073
EXPECT_EQ(matches[1].position, 5);
71-
74+
7275
EXPECT_EQ(matches[2].text, " world");
7376
EXPECT_EQ(matches[2].position, 7);
74-
77+
7578
EXPECT_EQ(matches[3].text, "\n");
7679
EXPECT_EQ(matches[3].position, 13);
77-
80+
7881
EXPECT_EQ(matches[4].text, " ");
7982
EXPECT_EQ(matches[4].position, 14);
80-
83+
8184
EXPECT_EQ(matches[5].text, " test");
8285
EXPECT_EQ(matches[5].position, 15);
83-
}
86+
}

0 commit comments

Comments
 (0)