Skip to content

[LLDB] Add Lexer (with tests) for DIL (Data Inspection Language). #123521

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Feb 5, 2025
Merged
123 changes: 123 additions & 0 deletions lldb/include/lldb/ValueObject/DILLexer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
//===-- DILLexer.h ----------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLDB_VALUEOBJECT_DILLEXER_H
#define LLDB_VALUEOBJECT_DILLEXER_H

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Error.h"
#include <cstdint>
#include <memory>
#include <string>
#include <vector>

namespace lldb_private::dil {

/// A single lexical token produced by the DIL lexer and consumed by the DIL
/// parser. A token records what kind of token it is, the text it was lexed
/// from, and the offset at which it starts.
class Token {
public:
  /// The categories of token the lexer can produce.
  enum Kind {
    coloncolon, ///< The "::" operator.
    eof,        ///< End of input; always the last token lexed.
    identifier, ///< A word (see IsWord in DILLexer.cpp).
    l_paren,    ///< The "(" character.
    r_paren,    ///< The ")" character.
  };

  /// Create a token.
  ///
  /// \param kind     The category of the token.
  /// \param spelling The text of the token; taken by value and moved into
  ///                 the token.
  /// \param start    Offset of the token's first character within the whole
  ///                 expression string.
  Token(Kind kind, std::string spelling, uint32_t start)
      : m_kind(kind), m_spelling(std::move(spelling)), m_start_pos(start) {}

  /// Return the category of this token.
  Kind GetKind() const { return m_kind; }

  /// Return a copy of the text this token was lexed from.
  std::string GetSpelling() const { return m_spelling; }

  /// Return true if this token is of the given kind.
  bool Is(Kind kind) const { return GetKind() == kind; }

  /// Return true if this token is not of the given kind.
  bool IsNot(Kind kind) const { return !Is(kind); }

  /// Return true if this token's kind matches either of the two kinds.
  bool IsOneOf(Kind kind1, Kind kind2) const {
    if (Is(kind1))
      return true;
    return Is(kind2);
  }

  /// Return true if this token's kind matches any of the listed kinds.
  /// Peels off the first kind and recurses on the rest until the two-kind
  /// overload above terminates the recursion.
  template <typename... Ts> bool IsOneOf(Kind kind, Ts... Ks) const {
    if (Is(kind))
      return true;
    return IsOneOf(Ks...);
  }

  /// Return the offset of this token's first character within the whole
  /// expression string.
  uint32_t GetLocation() const { return m_start_pos; }

  /// Return a printable name for the given token kind.
  static llvm::StringRef GetTokenName(Kind kind);

private:
  Kind m_kind;
  std::string m_spelling;
  uint32_t m_start_pos; // within entire expression string
};

/// Class for doing the simple lexing required by DIL.
class DILLexer {
public:
/// Lexes all the tokens in expr and calls the private constructor
/// with the lexed tokens.
static llvm::Expected<DILLexer> Create(llvm::StringRef expr);

/// Return the current token to be handled by the DIL parser.
const Token &GetCurrentToken() { return m_lexed_tokens[m_tokens_idx]; }

/// Advance the current token position by N.
void Advance(uint32_t N = 1) {
if (m_tokens_idx + N >= m_lexed_tokens.size())
// N is too large; advance to the end of the lexed tokens.
m_tokens_idx = m_lexed_tokens.size() - 1;
else
m_tokens_idx += N;
}

/// Return the lexed token N positions ahead of the 'current' token
/// being handled by the DIL parser.
const Token &LookAhead(uint32_t N) {
if (m_tokens_idx + N < m_lexed_tokens.size())
return m_lexed_tokens[m_tokens_idx + N];

// Last token should be an 'eof' token.
return m_lexed_tokens.back();
}

/// Return the index for the 'current' token being handled by the DIL parser.
uint32_t GetCurrentTokenIdx() { return m_tokens_idx; }

/// Set the index for the 'current' token (to be handled by the parser)
/// to a particular position. Used for either committing 'look ahead' parsing
/// or rolling back tentative parsing.
void ResetTokenIdx(uint32_t new_value) {
assert(new_value < m_lexed_tokens.size());
m_tokens_idx = new_value;
}

uint32_t NumLexedTokens() { return m_lexed_tokens.size(); }

private:
DILLexer(llvm::StringRef dil_expr, std::vector<Token> lexed_tokens)
: m_expr(dil_expr), m_lexed_tokens(std::move(lexed_tokens)),
m_tokens_idx(0) {}

static llvm::Expected<Token> Lex(llvm::StringRef expr,
llvm::StringRef &remainder);

// The input string we are lexing & parsing.
llvm::StringRef m_expr;

// Holds all of the tokens lexed so far.
std::vector<Token> m_lexed_tokens;

// Index into m_lexed_tokens; indicates which token the DIL parser is
// currently trying to parse/handle.
uint32_t m_tokens_idx;
};

} // namespace lldb_private::dil

#endif // LLDB_VALUEOBJECT_DILLEXER_H
1 change: 1 addition & 0 deletions lldb/source/ValueObject/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
add_lldb_library(lldbValueObject
DILLexer.cpp
ValueObject.cpp
ValueObjectCast.cpp
ValueObjectChild.cpp
Expand Down
97 changes: 97 additions & 0 deletions lldb/source/ValueObject/DILLexer.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
//===-- DILLexer.cpp ------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// This implements the lexer for the Data Inspection Language (DIL), and its
// helper functions, which will eventually underlie the 'frame variable'
// command. The language whose tokens this lexer recognizes is described in
// lldb/docs/dil-expr-lang.ebnf
//
//===----------------------------------------------------------------------===//

#include "lldb/ValueObject/DILLexer.h"
#include "lldb/Utility/Status.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/ErrorHandling.h"

namespace lldb_private::dil {

/// Return a printable name for \p kind; the name is the enumerator's own
/// spelling (e.g. "identifier", "l_paren").
llvm::StringRef Token::GetTokenName(Kind kind) {
  // Intentionally no 'default' label: the switch lists every enumerator, so a
  // newly added Kind that is not handled here can be diagnosed by the
  // compiler's switch-coverage warning.
  switch (kind) {
  case Kind::coloncolon:
    return "coloncolon";
  case Kind::eof:
    return "eof";
  case Kind::identifier:
    return "identifier";
  case Kind::l_paren:
    return "l_paren";
  case Kind::r_paren:
    return "r_paren";
  }
  // Only reachable if 'kind' holds a value that is not a declared enumerator.
  // Without this, control would fall off the end of a non-void function.
  llvm_unreachable("Unknown token kind");
}

/// Return true if \p c is an ASCII letter ('a'..'z' or 'A'..'Z').
static bool IsLetter(char c) {
  const bool is_lower = c >= 'a' && c <= 'z';
  const bool is_upper = c >= 'A' && c <= 'Z';
  return is_lower || is_upper;
}

/// Return true if \p c is an ASCII decimal digit ('0'..'9').
static bool IsDigit(char c) {
  if (c < '0')
    return false;
  return c <= '9';
}

// Try to consume a word from the front of 'remainder'.
//
// A word is a non-empty run of letters ('a'..'z','A'..'Z'), digits
// ('0'..'9'), underscores and dollar signs whose first character is not a
// digit.
//
// On success the word is returned and removed from the front of 'remainder'.
// Otherwise std::nullopt is returned and 'remainder' is left untouched. The
// 'expr' parameter is not used by this function.
static std::optional<llvm::StringRef> IsWord(llvm::StringRef expr,
                                             llvm::StringRef &remainder) {
  auto is_word_char = [](char c) {
    return c == '_' || c == '$' || IsLetter(c) || IsDigit(c);
  };

  // Longest prefix made up of word characters only.
  const llvm::StringRef word = remainder.take_while(is_word_char);

  // Nothing matched, or the run begins with a digit: not a word.
  const bool is_word = !word.empty() && !IsDigit(word[0]);
  if (!is_word)
    return std::nullopt;

  remainder = remainder.drop_front(word.size());
  return word;
}

/// Lex the whole of \p expr and build a lexer positioned at the first token.
///
/// Tokens are lexed one at a time until the 'eof' token has been produced, so
/// the resulting token list always ends with 'eof'. If any token cannot be
/// lexed, that error is returned and no lexer is created.
llvm::Expected<DILLexer> DILLexer::Create(llvm::StringRef expr) {
  std::vector<Token> lexed;
  llvm::StringRef unlexed = expr;

  while (true) {
    llvm::Expected<Token> next = Lex(expr, unlexed);
    if (!next)
      return next.takeError();

    // Check the kind before the token is moved into the vector.
    const bool reached_end = next->Is(Token::eof);
    lexed.push_back(std::move(*next));
    if (reached_end)
      break;
  }

  return DILLexer(expr, std::move(lexed));
}

/// Lex one token from the front of \p remainder.
///
/// \param expr      The complete expression; only used to compute the offset
///                  of the token within it.
/// \param remainder The not-yet-lexed tail of \p expr. Leading whitespace and
///                  the text of the returned token are removed from it.
/// \return The next token, an 'eof' token (located at expr.size()) when only
///         whitespace is left, or an error if the input does not start with
///         a recognized token.
llvm::Expected<Token> DILLexer::Lex(llvm::StringRef expr,
                                    llvm::StringRef &remainder) {
  // Drop leading whitespace; nothing left afterwards means end of input.
  remainder = remainder.ltrim();
  if (remainder.empty())
    return Token(Token::eof, "", static_cast<uint32_t>(expr.size()));

  // Offset of the upcoming token within the whole expression.
  const uint32_t token_start = remainder.begin() - expr.begin();

  // Identifiers.
  if (std::optional<llvm::StringRef> word = IsWord(expr, remainder))
    return Token(Token::identifier, word->str(), token_start);

  // Punctuation / operators, matched by their exact spelling.
  constexpr std::pair<Token::Kind, const char *> punctuators[] = {
      {Token::l_paren, "("},
      {Token::r_paren, ")"},
      {Token::coloncolon, "::"},
  };
  for (const auto &op : punctuators) {
    if (remainder.consume_front(op.second))
      return Token(op.first, op.second, token_start);
  }

  // Unrecognized character(s) in string; unable to lex it.
  return llvm::createStringError("Unable to lex input string");
}

} // namespace lldb_private::dil
2 changes: 2 additions & 0 deletions lldb/unittests/ValueObject/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
add_lldb_unittest(LLDBValueObjectTests
DumpValueObjectOptionsTests.cpp
DILLexerTests.cpp

LINK_LIBS
lldbValueObject
lldbPluginPlatformLinux
lldbPluginScriptInterpreterNone
LLVMTestingSupport

LINK_COMPONENTS
Support
Expand Down
156 changes: 156 additions & 0 deletions lldb/unittests/ValueObject/DILLexerTests.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
//===-- DILLexerTests.cpp --------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "lldb/ValueObject/DILLexer.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Testing/Support/Error.h"
#include "gtest/gtest.h"
#include <string>

using llvm::StringRef;

using namespace lldb_private::dil;

/// Test helper: lex \p input_expr and return the (kind, spelling) pair of
/// every token except the trailing 'eof', or the lexing error on failure.
llvm::Expected<std::vector<std::pair<Token::Kind, std::string>>>
ExtractTokenData(llvm::StringRef input_expr) {
  llvm::Expected<DILLexer> maybe_lexer = DILLexer::Create(input_expr);
  if (!maybe_lexer)
    return maybe_lexer.takeError();
  DILLexer lexer(*maybe_lexer);

  std::vector<std::pair<Token::Kind, std::string>> data;
  // Collect tokens up to, but not including, the eof token.
  while (lexer.GetCurrentToken().IsNot(Token::eof)) {
    const Token &tok = lexer.GetCurrentToken();
    data.emplace_back(tok.GetKind(), tok.GetSpelling());
    lexer.Advance();
  }
  return data;
}

// A lone identifier lexes to exactly one identifier token followed by eof.
TEST(DILLexerTests, SimpleTest) {
  StringRef input_expr("simple_var");
  llvm::Expected<DILLexer> maybe_lexer = DILLexer::Create(input_expr);
  ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded());
  DILLexer lexer(*maybe_lexer);

  // First token: the identifier itself.
  Token first = lexer.GetCurrentToken();
  EXPECT_EQ(first.GetKind(), Token::identifier);
  EXPECT_EQ(first.GetSpelling(), "simple_var");

  // Second (and last) token: end of input.
  lexer.Advance();
  Token second = lexer.GetCurrentToken();
  EXPECT_EQ(second.GetKind(), Token::eof);
}

// Exercises the kind predicates (Is / IsOneOf) on a hand-built token.
TEST(DILLexerTests, TokenKindTest) {
  Token ident_tok(Token::identifier, "ident", 0);

  // Exact-kind checks.
  EXPECT_TRUE(ident_tok.Is(Token::identifier));
  EXPECT_FALSE(ident_tok.Is(Token::l_paren));

  // Two-kind overload: matches because 'identifier' is listed.
  EXPECT_TRUE(ident_tok.IsOneOf(Token::eof, Token::identifier));

  // Variadic overload: 'identifier' is not among the listed kinds.
  EXPECT_FALSE(ident_tok.IsOneOf(Token::l_paren, Token::r_paren,
                                 Token::coloncolon, Token::eof));
}

// Checks that LookAhead() peeks without moving the cursor, and that Advance()
// then moves the cursor to the expected tokens.
TEST(DILLexerTests, LookAheadTest) {
  StringRef input_expr("(anonymous namespace)::some_var");
  llvm::Expected<DILLexer> maybe_lexer = DILLexer::Create(input_expr);
  ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded());
  DILLexer lexer(*maybe_lexer);
  Token token = lexer.GetCurrentToken();

  // Current token is '('; check the next 4 tokens, to make
  // sure they are the identifier 'anonymous', the identifier 'namespace'
  // ')' and '::', in that order.
  EXPECT_EQ(token.GetKind(), Token::l_paren);
  EXPECT_EQ(lexer.LookAhead(1).GetKind(), Token::identifier);
  EXPECT_EQ(lexer.LookAhead(1).GetSpelling(), "anonymous");
  EXPECT_EQ(lexer.LookAhead(2).GetKind(), Token::identifier);
  EXPECT_EQ(lexer.LookAhead(2).GetSpelling(), "namespace");
  EXPECT_EQ(lexer.LookAhead(3).GetKind(), Token::r_paren);
  EXPECT_EQ(lexer.LookAhead(4).GetKind(), Token::coloncolon);

  // Our current index should still be 0, as we only looked ahead; we are still
  // officially on the '('.
  EXPECT_EQ(lexer.GetCurrentTokenIdx(), 0u);

  // Accept the 'lookahead', so our current token is '::', which has the index
  // 4 in our vector of tokens (which starts at zero).
  lexer.Advance(4);
  token = lexer.GetCurrentToken();
  EXPECT_EQ(token.GetKind(), Token::coloncolon);
  EXPECT_EQ(lexer.GetCurrentTokenIdx(), 4u);

  lexer.Advance();
  token = lexer.GetCurrentToken();
  EXPECT_EQ(token.GetKind(), Token::identifier);
  EXPECT_EQ(token.GetSpelling(), "some_var");
  EXPECT_EQ(lexer.GetCurrentTokenIdx(), 5u);
  // 'some_var' starts right after the "(anonymous namespace)::" prefix. The
  // prefix length is taken from a StringRef rather than strlen(), so this
  // file does not depend on <cstring> being included transitively.
  EXPECT_EQ(token.GetLocation(), StringRef("(anonymous namespace)::").size());

  lexer.Advance();
  token = lexer.GetCurrentToken();
  EXPECT_EQ(token.GetKind(), Token::eof);
}

// Lexes an input mixing identifiers, parentheses and '::' with irregular
// spacing, and checks the complete (kind, spelling) sequence.
TEST(DILLexerTests, MultiTokenLexTest) {
  // Expected tokens, in order; the trailing eof is dropped by
  // ExtractTokenData.
  const auto expected_tokens =
      testing::ElementsAre(testing::Pair(Token::identifier, "This"),
                           testing::Pair(Token::identifier, "string"),
                           testing::Pair(Token::identifier, "has"),
                           testing::Pair(Token::l_paren, "("),
                           testing::Pair(Token::identifier, "several"),
                           testing::Pair(Token::r_paren, ")"),
                           testing::Pair(Token::coloncolon, "::"),
                           testing::Pair(Token::identifier, "identifiers"));

  EXPECT_THAT_EXPECTED(
      ExtractTokenData("This string has (several ) ::identifiers"),
      llvm::HasValue(expected_tokens));
}

// Checks which strings lex as identifiers, which lex as some other token, and
// which cannot be lexed at all.
TEST(DILLexerTests, IdentifiersTest) {
  // These strings should lex into identifier tokens.
  std::vector<std::string> valid_identifiers = {
      "$My_name1", "$pc",  "abcd", "_", "_a",     "_a_",      "$",
      "a_b",       "this", "self", "a", "MyName", "namespace"};

  // The lexer can lex these strings, but they should not be identifiers.
  std::vector<std::string> invalid_identifiers = {"", "::", "(", ")"};

  // The lexer is expected to fail attempting to lex these strings (it cannot
  // create valid tokens out of them).
  std::vector<std::string> invalid_tok_strings = {"234", "2a", "2", "1MyName"};

  // Verify that all of the valid identifiers come out as identifier tokens.
  for (auto &str : valid_identifiers) {
    SCOPED_TRACE(str);
    EXPECT_THAT_EXPECTED(ExtractTokenData(str),
                         llvm::HasValue(testing::ElementsAre(
                             testing::Pair(Token::identifier, str))));
  }

  // Verify that the lexer fails on invalid token strings.
  for (auto &str : invalid_tok_strings) {
    SCOPED_TRACE(str);
    auto maybe_lexer = DILLexer::Create(str);
    EXPECT_THAT_EXPECTED(maybe_lexer, llvm::Failed());
  }

  // Verify that none of the invalid identifiers come out as identifier tokens.
  for (auto &str : invalid_identifiers) {
    SCOPED_TRACE(str);
    llvm::Expected<DILLexer> maybe_lexer = DILLexer::Create(str);
    // Must be a fatal assertion: the Expected is dereferenced right below,
    // which is only valid when lexing succeeded.
    ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded());
    DILLexer lexer(*maybe_lexer);
    Token token = lexer.GetCurrentToken();
    EXPECT_TRUE(token.IsNot(Token::identifier));
    EXPECT_TRUE(token.IsOneOf(Token::eof, Token::coloncolon, Token::l_paren,
                              Token::r_paren));
  }
}