Add regex interface with re2 and std::regex implementations #48


Merged · 5 commits · Apr 18, 2025
44 changes: 44 additions & 0 deletions include/pytorch/tokenizers/re2_regex.h
@@ -0,0 +1,44 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#pragma once

#include <memory>
#include <string>

#include <re2/re2.h>

#include <pytorch/tokenizers/regex.h>

namespace tokenizers {

/**
* @brief RE2-based implementation of IRegex.
*/
class Re2Regex : public IRegex {
public:
/**
* @brief Construct a RE2 regex with the given pattern.
*
* @param pattern The regex pattern to compile.
*/
explicit Re2Regex(const std::string& pattern);

/**
* @brief Return all non-overlapping matches found in the input string.
*/
virtual std::vector<Match> find_all(const std::string& text) const override;

private:
std::unique_ptr<re2::RE2> regex_;

friend Result<std::unique_ptr<IRegex>> create_regex(
const std::string& pattern);
};

} // namespace tokenizers
48 changes: 48 additions & 0 deletions include/pytorch/tokenizers/regex.h
@@ -0,0 +1,48 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#pragma once

#include <memory>
#include <string>
#include <vector>

#include <pytorch/tokenizers/result.h>

namespace tokenizers {

struct Match {
size_t start; // starting index of the match
size_t end; // ending index of the match (exclusive)
};

/**
 * @brief Abstract interface for regex wrappers.
 */

Review comment:

Maybe something like this:

#pragma once

#include <string>
#include <tuple>
#include <utility>
#include <vector>

class Regex {
 public:
  virtual ~Regex() = default;

  // The only method subclasses have to implement.
  virtual std::pair<size_t, size_t> match(const std::string& text, size_t start) const = 0;

  // Convenience overload to match from the beginning.
  std::pair<size_t, size_t> match(const std::string& text) const {
    return match(text, 0);
  }

  // General implementation to match all.
  std::vector<std::pair<size_t, size_t>> match_all(const std::string& text, size_t start = 0) const {
    std::vector<std::pair<size_t, size_t>> matches;
    for (size_t length = 0;; start += length) {
      std::tie(start, length) = match(text, start);
      if (length == 0) {
        break;
      }
      matches.emplace_back(start, length);
    }
    return matches;
  }
};

Contributor Author:

I feel like we should just leave this API as is. We can get into a more granular API design later if necessary, but the main point of all of this was simply to provide a pcre2 fallback if re2 didn't work. I don't really expect people to add different regex implementations, so I don't want to over-engineer. Another reason is that I'd rather not touch the current re2 code, which uses FindAndConsume; that call is stateful and would not fit the proposed match API.

class IRegex {
public:
virtual ~IRegex() = default;

/**
* @brief Find all non-overlapping matches in the input string.
*
* @param text The input string to search.
* @return A vector of strings containing all matched substrings.
*/
virtual std::vector<Match> find_all(const std::string& text) const = 0;
};

/**
* @brief Creates a regex instance. Tries RE2 first, falls back to std::regex.
*
* @param pattern The regex pattern to compile.
* @return A unique pointer to an IRegex-compatible object.
*/
Result<std::unique_ptr<IRegex>> create_regex(const std::string& pattern);

} // namespace tokenizers
40 changes: 40 additions & 0 deletions include/pytorch/tokenizers/std_regex.h
@@ -0,0 +1,40 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#pragma once

#include <memory>
#include <regex>
#include <string>
#include "regex.h"

namespace tokenizers {

/**
* @brief std::regex-based implementation of IRegex.
*/
class StdRegex : public IRegex {
public:
/**
* @brief Construct a std::regex wrapper with the given pattern.
*
* @param pattern The regex pattern to compile.
* @throws std::regex_error if the pattern is invalid.
*/
explicit StdRegex(const std::string& pattern);

/**
* @brief Find all non-overlapping matches in the input string.
*/
virtual std::vector<Match> find_all(const std::string& text) const override;

private:
std::regex regex_;
};

} // namespace tokenizers
36 changes: 36 additions & 0 deletions src/re2_regex.cpp
@@ -0,0 +1,36 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <pytorch/tokenizers/re2_regex.h>

namespace tokenizers {

Re2Regex::Re2Regex(const std::string& pattern) {
regex_ = std::make_unique<re2::RE2>(pattern);
// Warm up RE2, as it is slow on the first run. Void the return value, as it
// is not needed. Refer to
// https://github.com/google/re2/blob/6dcd83d60f7944926bfd308cc13979fc53dd69ca/re2/fuzzing/re2_fuzzer.cc#L136-L141
(void)regex_->ReverseProgramSize();
}

std::vector<Match> Re2Regex::find_all(const std::string& text) const {
std::vector<Match> result;
re2::StringPiece input(text);
re2::StringPiece piece;

const char* base = input.data();

while (RE2::FindAndConsume(&input, *regex_, &piece)) {
size_t start = piece.data() - base;
result.push_back({start, start + piece.size()});
}

return result;
}

} // namespace tokenizers
50 changes: 50 additions & 0 deletions src/regex.cpp
@@ -0,0 +1,50 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <pytorch/tokenizers/re2_regex.h>
#include <pytorch/tokenizers/regex.h>
#include <pytorch/tokenizers/std_regex.h>

#include <re2/re2.h>
#include <iostream>
#include <memory>

namespace tokenizers {

/**
* @brief Factory function that creates a regex object using RE2 if possible.
* Falls back to std::regex if RE2 rejects the pattern with
* ErrorBadPerlOp.
*/
Result<std::unique_ptr<IRegex>> create_regex(const std::string& pattern) {
// Try RE2 first
auto re2 = std::make_unique<Re2Regex>("(" + pattern + ")");

if (re2->regex_->ok()) {
return static_cast<std::unique_ptr<IRegex>>(std::move(re2));
}

if (re2->regex_->error_code() == re2::RE2::ErrorBadPerlOp) {
try {
std::cout
<< "RE2 is unable to support things such as negative lookaheads in "
<< pattern << ", defaulting to std::regex." << std::endl;
auto std_regex = std::make_unique<StdRegex>("(" + pattern + ")");
return static_cast<std::unique_ptr<IRegex>>(std::move(std_regex));
} catch (const std::regex_error& e) {
std::cerr << "std::regex failed: " << e.what() << std::endl;
return tokenizers::Error::LoadFailure;
}
} else {
std::cerr << "RE2 failed to compile pattern: " << pattern << "\n";
std::cerr << "Error: " << (re2->regex_->error()) << std::endl;
return tokenizers::Error::LoadFailure;
}
}

} // namespace tokenizers
30 changes: 30 additions & 0 deletions src/std_regex.cpp
@@ -0,0 +1,30 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <pytorch/tokenizers/std_regex.h>
#include <regex>

namespace tokenizers {

StdRegex::StdRegex(const std::string& pattern) : regex_(pattern) {}

std::vector<Match> StdRegex::find_all(const std::string& text) const {
std::vector<Match> result;
std::sregex_iterator iter(text.begin(), text.end(), regex_);
std::sregex_iterator end;

for (; iter != end; ++iter) {
const auto& match = *iter;
// Use group 1: create_regex wraps the pattern in "(...)", so group 1
// spans the entire match.
size_t start = match.position(1);
result.push_back({start, start + match[1].length()});
}

return result;
}

} // namespace tokenizers