1
1
#include < gtest/gtest.h>
2
2
3
- #include " pytorch/tokenizers/regex.h"
4
- #include " pytorch/tokenizers/re2_regex.h"
5
3
#include " pytorch/tokenizers/pcre2_regex.h"
4
+ #include " pytorch/tokenizers/re2_regex.h"
5
+ #include " pytorch/tokenizers/regex.h"
6
+
7
+ using namespace tokenizers ;
6
8
7
9
// Test basic functionality
8
10
TEST (RegexTest, BasicMatching) {
9
- auto regex = createRegex (" \\ w+" );
11
+ auto regex = TK_UNWRAP_THROW ( createRegex (" \\ w+" ) );
10
12
ASSERT_TRUE (regex->ok ());
11
13
12
14
std::string text = " Hello world" ;
@@ -24,9 +26,9 @@ TEST(RegexTest, Pcre2Specific) {
24
26
const std::string pattern = " (?<=@)\\ w+" ;
25
27
Re2Regex re2_regex (pattern);
26
28
ASSERT_FALSE (re2_regex.ok ());
27
-
29
+
28
30
// Now verify that the factory function falls back to a PCRE2 regex
29
- auto regex = createRegex (pattern);
31
+ auto regex = TK_UNWRAP_THROW ( createRegex (pattern) );
30
32
ASSERT_TRUE (regex->ok ());
31
33
32
34
std::string text =
" [email protected] " ;
@@ -40,20 +42,21 @@ TEST(RegexTest, Pcre2Specific) {
40
42
// This specific pattern is from the Qwen2.5 1.5B pretokenizer.
41
43
// https://huggingface.co/Qwen/Qwen2.5-1.5B/raw/main/tokenizer.json
42
44
TEST (RegexTest, ComplexPatternWithNegativeLookahead) {
43
- const std::string complex_pattern = " (?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\ r\\ n\\ p{L}\\ p{N}]?\\ p{L}+|\\ p{N}| ?[^\\ s\\ p{L}\\ p{N}]+[\\ r\\ n]*|\\ s*[\\ r\\ n]+|\\ s+(?!\\ S)|\\ s+" ;
44
-
45
+ const std::string complex_pattern =
46
+ " (?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\ r\\ n\\ p{L}\\ p{N}]?\\ p{L}+|\\ p{N}| ?[^\\ s\\ p{L}\\ p{N}]+[\\ r\\ n]*|\\ s*[\\ r\\ n]+|\\ s+(?!\\ S)|\\ s+" ;
47
+
45
48
// First verify that RE2 cannot handle this pattern
46
49
Re2Regex re2_regex (complex_pattern);
47
50
ASSERT_FALSE (re2_regex.ok ());
48
-
51
+
49
52
// Now verify that the factory function falls back to a PCRE2 regex
50
- auto regex = createRegex (complex_pattern);
53
+ auto regex = TK_UNWRAP_THROW ( createRegex (complex_pattern) );
51
54
ASSERT_TRUE (regex->ok ());
52
-
55
+
53
56
// Test the pattern with some sample text
54
57
std::string text = " Hello's world\n test" ;
55
58
auto matches = regex->findAll (text);
56
-
59
+
57
60
// We expect to match:
58
61
// 1. "Hello" (word)
59
62
// 2. "'s" (contraction)
@@ -62,22 +65,22 @@ TEST(RegexTest, ComplexPatternWithNegativeLookahead) {
62
65
// 5. " " (whitespace)
63
66
// 6. " test" (word with leading space)
64
67
ASSERT_EQ (matches.size (), 6 );
65
-
68
+
66
69
EXPECT_EQ (matches[0 ].text , " Hello" );
67
70
EXPECT_EQ (matches[0 ].position , 0 );
68
-
71
+
69
72
EXPECT_EQ (matches[1 ].text , " 's" );
70
73
EXPECT_EQ (matches[1 ].position , 5 );
71
-
74
+
72
75
EXPECT_EQ (matches[2 ].text , " world" );
73
76
EXPECT_EQ (matches[2 ].position , 7 );
74
-
77
+
75
78
EXPECT_EQ (matches[3 ].text , " \n " );
76
79
EXPECT_EQ (matches[3 ].position , 13 );
77
-
80
+
78
81
EXPECT_EQ (matches[4 ].text , " " );
79
82
EXPECT_EQ (matches[4 ].position , 14 );
80
-
83
+
81
84
EXPECT_EQ (matches[5 ].text , " test" );
82
85
EXPECT_EQ (matches[5 ].position , 15 );
83
- }
86
+ }
0 commit comments