[LLDB] Add Lexer (with tests) for DIL (Data Inspection Language). (llvm#123521)

cmtice · web-flow · commit d9a7498aa24a · 2025-02-05T10:47:11.000-08:00
This adds the basic lexer, with unit tests, for the Data Inspection Language (DIL); see the RFC at https://discourse.llvm.org/t/rfc-data-inspection-language/69893 for background. This version of the lexer only handles local variables and namespaces, and is designed to work with llvm#120971.
diff --git a/lldb/include/lldb/ValueObject/DILLexer.h b/lldb/include/lldb/ValueObject/DILLexer.h
@@ -0,0 +1,123 @@
+//===-- DILLexer.h ----------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_VALUEOBJECT_DILLEXER_H
+#define LLDB_VALUEOBJECT_DILLEXER_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Error.h"
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace lldb_private::dil {
+
+/// A single token of a DIL expression: its kind, its spelling, and the offset
+/// at which it starts. Tokens are generated by the DIL lexer and used by the
+/// DIL parser.
+class Token {
+public:
+  /// The kinds of token the lexer can produce.
+  enum Kind {
+    coloncolon,
+    eof,
+    identifier,
+    l_paren,
+    r_paren,
+  };
+
+  /// Builds a token of kind \p kind with text \p spelling, whose first
+  /// character is at offset \p start within the entire expression string.
+  Token(Kind kind, std::string spelling, uint32_t start)
+      : m_kind(kind), m_spelling(std::move(spelling)), m_start_pos(start) {}
+
+  /// Returns the kind of this token.
+  Kind GetKind() const { return m_kind; }
+
+  /// Returns a copy of this token's text.
+  std::string GetSpelling() const { return m_spelling; }
+
+  /// Returns true if this token is of kind \p kind.
+  bool Is(Kind kind) const { return GetKind() == kind; }
+
+  /// Returns true if this token is not of kind \p kind.
+  bool IsNot(Kind kind) const { return !Is(kind); }
+
+  /// Returns true if this token's kind is \p kind1 or \p kind2.
+  bool IsOneOf(Kind kind1, Kind kind2) const {
+    if (Is(kind1))
+      return true;
+    return Is(kind2);
+  }
+
+  /// Returns true if this token's kind is any one of the listed kinds.
+  template <typename... Ts> bool IsOneOf(Kind kind, Ts... Ks) const {
+    if (Is(kind))
+      return true;
+    return IsOneOf(Ks...);
+  }
+
+  /// Returns the offset of this token's first character within the entire
+  /// expression string.
+  uint32_t GetLocation() const { return m_start_pos; }
+
+  /// Returns a printable name for \p kind.
+  static llvm::StringRef GetTokenName(Kind kind);
+
+private:
+  Kind m_kind;
+  std::string m_spelling;
+  uint32_t m_start_pos; // within entire expression string
+};
+
+/// Class for doing the simple lexing required by DIL.
+///
+/// Create() lexes the whole expression up front; the resulting object then
+/// acts as a cursor over the lexed tokens, with a 'current' token index that
+/// the parser can advance, look ahead from, save and restore.
+class DILLexer {
+public:
+  /// Lexes all the tokens in expr and calls the private constructor
+  /// with the lexed tokens. Returns an error if expr cannot be lexed.
+  static llvm::Expected<DILLexer> Create(llvm::StringRef expr);
+
+  /// Return the current token to be handled by the DIL parser.
+  const Token &GetCurrentToken() const {
+    return m_lexed_tokens[m_tokens_idx];
+  }
+
+  /// Advance the current token position by N. If that would move past the
+  /// last lexed token, the position is clamped to the last token instead.
+  void Advance(uint32_t N = 1) {
+    // Do the addition in 64 bits: m_tokens_idx + N evaluated in 32 bits can
+    // wrap around for large N and leave the index somewhere arbitrary.
+    const uint64_t target = static_cast<uint64_t>(m_tokens_idx) + N;
+    if (target >= m_lexed_tokens.size())
+      // N is too large; advance to the end of the lexed tokens.
+      m_tokens_idx = static_cast<uint32_t>(m_lexed_tokens.size() - 1);
+    else
+      m_tokens_idx = static_cast<uint32_t>(target);
+  }
+
+  /// Return the lexed token N positions ahead of the 'current' token
+  /// being handled by the DIL parser. If there is no such token, the last
+  /// lexed token is returned.
+  const Token &LookAhead(uint32_t N) const {
+    // 64-bit addition for the same reason as in Advance(): avoid wrap-around.
+    const uint64_t target = static_cast<uint64_t>(m_tokens_idx) + N;
+    if (target < m_lexed_tokens.size())
+      return m_lexed_tokens[target];
+
+    // Last token should be an 'eof' token.
+    return m_lexed_tokens.back();
+  }
+
+  /// Return the index for the 'current' token being handled by the DIL parser.
+  uint32_t GetCurrentTokenIdx() const { return m_tokens_idx; }
+
+  /// Set the index for the 'current' token (to be handled by the parser)
+  /// to a particular position. Used for either committing 'look ahead' parsing
+  /// or rolling back tentative parsing. new_value must be a valid index into
+  /// the lexed tokens.
+  void ResetTokenIdx(uint32_t new_value) {
+    assert(new_value < m_lexed_tokens.size());
+    m_tokens_idx = new_value;
+  }
+
+  /// Return the total number of lexed tokens.
+  uint32_t NumLexedTokens() const {
+    return static_cast<uint32_t>(m_lexed_tokens.size());
+  }
+
+private:
+  DILLexer(llvm::StringRef dil_expr, std::vector<Token> lexed_tokens)
+      : m_expr(dil_expr), m_lexed_tokens(std::move(lexed_tokens)),
+        m_tokens_idx(0) {}
+
+  /// Lexes one token from the front of remainder, consuming it from
+  /// remainder. expr is the entire expression string; token locations are
+  /// computed relative to its start.
+  static llvm::Expected<Token> Lex(llvm::StringRef expr,
+                                   llvm::StringRef &remainder);
+
+  // The input string we are lexing & parsing.
+  llvm::StringRef m_expr;
+
+  // Holds all of the tokens lexed so far.
+  std::vector<Token> m_lexed_tokens;
+
+  // Index into m_lexed_tokens; indicates which token the DIL parser is
+  // currently trying to parse/handle.
+  uint32_t m_tokens_idx;
+};
+
+} // namespace lldb_private::dil
+
+#endif // LLDB_VALUEOBJECT_DILLEXER_H
diff --git a/lldb/source/ValueObject/CMakeLists.txt b/lldb/source/ValueObject/CMakeLists.txt
@@ -1,4 +1,5 @@
 add_lldb_library(lldbValueObject
+  DILLexer.cpp
   ValueObject.cpp
   ValueObjectCast.cpp
   ValueObjectChild.cpp
diff --git a/lldb/source/ValueObject/DILLexer.cpp b/lldb/source/ValueObject/DILLexer.cpp
@@ -0,0 +1,97 @@
+//===-- DILLexer.cpp ------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// This implements the lexer for the Data Inspection Language (DIL), and its
+// helper functions. The lexer produces the tokens consumed by the DIL parser,
+// which will eventually underlie the 'frame variable' command. The language
+// that the lexer tokenizes is described in lldb/docs/dil-expr-lang.ebnf
+//
+//===----------------------------------------------------------------------===//
+
+#include "lldb/ValueObject/DILLexer.h"
+#include "lldb/Utility/Status.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <optional>
+#include <utility>
+
+namespace lldb_private::dil {
+
+// Returns a printable name for the given token kind.
+llvm::StringRef Token::GetTokenName(Kind kind) {
+  // No default label: every enumerator is listed, so the compiler can warn
+  // when a new Kind is added without a matching case.
+  switch (kind) {
+  case Kind::coloncolon:
+    return "coloncolon";
+  case Kind::eof:
+    return "eof";
+  case Kind::identifier:
+    return "identifier";
+  case Kind::l_paren:
+    return "l_paren";
+  case Kind::r_paren:
+    return "r_paren";
+  }
+  // Only reachable if kind holds a value that is not a declared enumerator;
+  // without this, control would fall off the end of a non-void function.
+  llvm_unreachable("Unknown token name");
+}
+
+// Returns true if c lies in 'a'..'z' or 'A'..'Z'.
+static bool IsLetter(char c) {
+  const bool is_lower = c >= 'a' && c <= 'z';
+  const bool is_upper = c >= 'A' && c <= 'Z';
+  return is_lower || is_upper;
+}
+
+// Returns true if c lies in '0'..'9'.
+static bool IsDigit(char c) { return !(c < '0' || c > '9'); }
+
+// A word is a non-empty run of letters ('a'..'z','A'..'Z'), digits
+// ('0'..'9'), underscores and dollar signs that does not start with a digit.
+//
+// If remainder begins with a word, the word is removed from the front of
+// remainder and returned. Otherwise remainder is left unchanged and
+// std::nullopt is returned. The expr argument is currently not used.
+static std::optional<llvm::StringRef>
+IsWord([[maybe_unused]] llvm::StringRef expr, llvm::StringRef &remainder) {
+  // Find the longest prefix consisting of letters, digits, underscores and
+  // '$'. If it doesn't start with a digit, then it's a word.
+  llvm::StringRef candidate = remainder.take_while(
+      [](char c) { return IsDigit(c) || IsLetter(c) || c == '_' || c == '$'; });
+  if (candidate.empty() || IsDigit(candidate[0]))
+    return std::nullopt;
+  remainder = remainder.drop_front(candidate.size());
+  return candidate;
+}
+
+// Lexes expr one token at a time until the eof token has been produced, then
+// wraps the tokens in a DILLexer. The first lexing error, if any, is returned
+// instead.
+llvm::Expected<DILLexer> DILLexer::Create(llvm::StringRef expr) {
+  std::vector<Token> tokens;
+  llvm::StringRef remainder = expr;
+  for (;;) {
+    llvm::Expected<Token> maybe_token = Lex(expr, remainder);
+    if (!maybe_token)
+      return maybe_token.takeError();
+
+    // Record whether this is the final token before moving it away.
+    const bool reached_eof = maybe_token->GetKind() == Token::eof;
+    tokens.push_back(std::move(*maybe_token));
+    if (reached_eof)
+      break;
+  }
+  return DILLexer(expr, std::move(tokens));
+}
+
+// Lexes a single token from the front of remainder and consumes it. expr is
+// the entire expression; it is used to compute the token's location. Returns
+// an eof token once remainder is exhausted, and an error if the front of
+// remainder is neither a word nor a known operator.
+llvm::Expected<Token> DILLexer::Lex(llvm::StringRef expr,
+                                    llvm::StringRef &remainder) {
+  // Drop leading whitespace before looking for the next token.
+  remainder = remainder.ltrim();
+
+  // Nothing left: produce the eof token, located at the end of expr.
+  if (remainder.empty())
+    return Token(Token::eof, "", static_cast<uint32_t>(expr.size()));
+
+  // Offset of the upcoming token, measured from the start of expr.
+  const uint32_t token_start = remainder.begin() - expr.begin();
+
+  // Words are tried first; every word becomes an identifier token.
+  if (std::optional<llvm::StringRef> word = IsWord(expr, remainder))
+    return Token(Token::identifier, word->str(), token_start);
+
+  // Otherwise try each operator spelling in turn.
+  constexpr std::pair<Token::Kind, const char *> operators[] = {
+      {Token::l_paren, "("},
+      {Token::r_paren, ")"},
+      {Token::coloncolon, "::"},
+  };
+  for (const auto &op : operators) {
+    if (!remainder.consume_front(op.second))
+      continue;
+    return Token(op.first, op.second, token_start);
+  }
+
+  // Unrecognized character(s) in string; unable to lex it.
+  return llvm::createStringError("Unable to lex input string");
+}
+
+} // namespace lldb_private::dil
diff --git a/lldb/unittests/ValueObject/CMakeLists.txt b/lldb/unittests/ValueObject/CMakeLists.txt
@@ -1,10 +1,12 @@
 add_lldb_unittest(LLDBValueObjectTests
   DumpValueObjectOptionsTests.cpp
+  DILLexerTests.cpp
 
   LINK_LIBS
     lldbValueObject
     lldbPluginPlatformLinux
     lldbPluginScriptInterpreterNone
+    LLVMTestingSupport
 
   LINK_COMPONENTS
     Support
diff --git a/lldb/unittests/ValueObject/DILLexerTests.cpp b/lldb/unittests/ValueObject/DILLexerTests.cpp
@@ -0,0 +1,156 @@
+//===-- DILLexerTests.cpp --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "lldb/ValueObject/DILLexer.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Testing/Support/Error.h"
+#include "gtest/gtest.h"
+#include <string>
+
+using llvm::StringRef;
+
+using namespace lldb_private::dil;
+
+// Lexes input_expr and returns the (kind, spelling) pair of every token up to,
+// but not including, the eof token. Returns the lexer's error if lexing fails.
+llvm::Expected<std::vector<std::pair<Token::Kind, std::string>>>
+ExtractTokenData(llvm::StringRef input_expr) {
+  llvm::Expected<DILLexer> maybe_lexer = DILLexer::Create(input_expr);
+  if (!maybe_lexer)
+    return maybe_lexer.takeError();
+  DILLexer lexer(*maybe_lexer);
+
+  std::vector<std::pair<Token::Kind, std::string>> data;
+  while (true) {
+    Token tok = lexer.GetCurrentToken();
+    // Stop at eof; the eof token itself is not part of the result.
+    if (tok.GetKind() == Token::eof)
+      break;
+    data.emplace_back(tok.GetKind(), tok.GetSpelling());
+    lexer.Advance();
+  }
+  return data;
+}
+
+// A lone identifier lexes to one identifier token followed by eof.
+TEST(DILLexerTests, SimpleTest) {
+  StringRef input_expr("simple_var");
+  llvm::Expected<DILLexer> maybe_lexer = DILLexer::Create(input_expr);
+  ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded());
+  DILLexer lexer(*maybe_lexer);
+
+  // First token: the identifier itself.
+  Token first = lexer.GetCurrentToken();
+  EXPECT_EQ(first.GetKind(), Token::identifier);
+  EXPECT_EQ(first.GetSpelling(), "simple_var");
+
+  // Second token: end of input.
+  lexer.Advance();
+  Token second = lexer.GetCurrentToken();
+  EXPECT_EQ(second.GetKind(), Token::eof);
+}
+
+// Exercises Token::Is and both the two-argument and variadic forms of
+// Token::IsOneOf.
+TEST(DILLexerTests, TokenKindTest) {
+  Token token(Token::identifier, "ident", 0);
+
+  EXPECT_TRUE(token.Is(Token::identifier));
+  EXPECT_FALSE(token.Is(Token::l_paren));
+  EXPECT_TRUE(token.IsOneOf(Token::eof, Token::identifier));
+  EXPECT_FALSE(token.IsOneOf(Token::l_paren, Token::r_paren, Token::coloncolon,
+                             Token::eof));
+}
+
+// Checks that LookAhead inspects upcoming tokens without moving the current
+// position, and that Advance then moves it by the requested amount.
+TEST(DILLexerTests, LookAheadTest) {
+  StringRef input_expr("(anonymous namespace)::some_var");
+  llvm::Expected<DILLexer> maybe_lexer = DILLexer::Create(input_expr);
+  ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded());
+  DILLexer lexer(*maybe_lexer);
+
+  // The current token is '('.
+  Token tok = lexer.GetCurrentToken();
+  EXPECT_EQ(tok.GetKind(), Token::l_paren);
+
+  // The next 4 tokens should be the identifier 'anonymous', the identifier
+  // 'namespace', ')' and '::', in that order.
+  EXPECT_EQ(lexer.LookAhead(1).GetKind(), Token::identifier);
+  EXPECT_EQ(lexer.LookAhead(1).GetSpelling(), "anonymous");
+  EXPECT_EQ(lexer.LookAhead(2).GetKind(), Token::identifier);
+  EXPECT_EQ(lexer.LookAhead(2).GetSpelling(), "namespace");
+  EXPECT_EQ(lexer.LookAhead(3).GetKind(), Token::r_paren);
+  EXPECT_EQ(lexer.LookAhead(4).GetKind(), Token::coloncolon);
+
+  // Looking ahead must not move the current position; we are still
+  // officially on the '(' at index 0.
+  EXPECT_EQ(lexer.GetCurrentTokenIdx(), 0u);
+
+  // Accept the 'lookahead': step forward 4 tokens so that the current token
+  // is '::', which sits at index 4 of the (zero-based) token vector.
+  lexer.Advance(4);
+  tok = lexer.GetCurrentToken();
+  EXPECT_EQ(tok.GetKind(), Token::coloncolon);
+  EXPECT_EQ(lexer.GetCurrentTokenIdx(), 4u);
+
+  // One more step lands on 'some_var', which starts right after the text
+  // "(anonymous namespace)::".
+  lexer.Advance();
+  tok = lexer.GetCurrentToken();
+  EXPECT_EQ(tok.GetKind(), Token::identifier);
+  EXPECT_EQ(tok.GetSpelling(), "some_var");
+  EXPECT_EQ(lexer.GetCurrentTokenIdx(), 5u);
+  EXPECT_EQ(tok.GetLocation(), strlen("(anonymous namespace)::"));
+
+  // The final token is eof.
+  lexer.Advance();
+  tok = lexer.GetCurrentToken();
+  EXPECT_EQ(tok.GetKind(), Token::eof);
+}
+
+// Lexes a string mixing identifiers, parentheses and '::' with irregular
+// spacing, and checks the full sequence of (kind, spelling) pairs.
+TEST(DILLexerTests, MultiTokenLexTest) {
+  auto token_data =
+      ExtractTokenData("This string has (several ) ::identifiers");
+  EXPECT_THAT_EXPECTED(
+      token_data,
+      llvm::HasValue(testing::ElementsAre(
+          testing::Pair(Token::identifier, "This"),
+          testing::Pair(Token::identifier, "string"),
+          testing::Pair(Token::identifier, "has"),
+          testing::Pair(Token::l_paren, "("),
+          testing::Pair(Token::identifier, "several"),
+          testing::Pair(Token::r_paren, ")"),
+          testing::Pair(Token::coloncolon, "::"),
+          testing::Pair(Token::identifier, "identifiers"))));
+}
+
+// Checks which strings lex as a single identifier, which lex successfully
+// but as something other than an identifier, and which fail to lex at all.
+TEST(DILLexerTests, IdentifiersTest) {
+  // These strings should lex into identifier tokens.
+  std::vector<std::string> valid_identifiers = {
+      "$My_name1", "$pc",  "abcd", "_", "_a",     "_a_",      "$",
+      "a_b",       "this", "self", "a", "MyName", "namespace"};
+
+  // The lexer can lex these strings, but they should not be identifiers.
+  std::vector<std::string> invalid_identifiers = {"", "::", "(", ")"};
+
+  // The lexer is expected to fail attempting to lex these strings (it cannot
+  // create valid tokens out of them).
+  std::vector<std::string> invalid_tok_strings = {"234", "2a", "2", "1MyName"};
+
+  // Verify that all of the valid identifiers come out as identifier tokens.
+  for (auto &str : valid_identifiers) {
+    SCOPED_TRACE(str);
+    EXPECT_THAT_EXPECTED(ExtractTokenData(str),
+                         llvm::HasValue(testing::ElementsAre(
+                             testing::Pair(Token::identifier, str))));
+  }
+
+  // Verify that the lexer fails on invalid token strings.
+  for (auto &str : invalid_tok_strings) {
+    SCOPED_TRACE(str);
+    auto maybe_lexer = DILLexer::Create(str);
+    EXPECT_THAT_EXPECTED(maybe_lexer, llvm::Failed());
+  }
+
+  // Verify that none of the invalid identifiers come out as identifier tokens.
+  for (auto &str : invalid_identifiers) {
+    SCOPED_TRACE(str);
+    llvm::Expected<DILLexer> maybe_lexer = DILLexer::Create(str);
+    // This must be ASSERT, not EXPECT: maybe_lexer is dereferenced just
+    // below, which is only valid when lexing succeeded.
+    ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded());
+    DILLexer lexer(*maybe_lexer);
+    Token token = lexer.GetCurrentToken();
+    EXPECT_TRUE(token.IsNot(Token::identifier));
+    EXPECT_TRUE(token.IsOneOf(Token::eof, Token::coloncolon, Token::l_paren,
+                              Token::r_paren));
+  }
+}

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,5 @@`
`1`	`1`	`add_lldb_library(lldbValueObject`
	`2`	`+ DILLexer.cpp`
`2`	`3`	`ValueObject.cpp`
`3`	`4`	`ValueObjectCast.cpp`
`4`	`5`	`ValueObjectChild.cpp`