-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[LLDB] Add Lexer (with tests) for DIL (Data Inspection Language). #123521
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
11 commits
Select commit
Hold shift + click to select a range
468f73f
[LLDB] Add Lexer (with tests) for DIL (Data Inspection Language).
cmtice 61a2607
[LLDB] Add Lexer (with tests) for DIL (Data Inspection Language)
cmtice 5e2ee55
Many changes, to address all the review comments:
cmtice ccf5203
Address latest review comments:
cmtice 29e9f26
Address more review comments:
cmtice 0b33ab7
Address remaining review comments:
cmtice 4103144
Remove trailing '_' from LLDB_VALUEOBJECT_DILLEXER_H
cmtice 29ad86c
Clearing the git cache.
cmtice 41416de
Restore .h files after clearing the cache.
cmtice f56c019
Remove unneeded include file & unused m_eof_token.
cmtice 5dbf7d2
Remove unnecessary code from DILLexerTests.cpp, as requested.
cmtice File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,123 @@ | ||
//===-- DILLexer.h ----------------------------------------------*- C++ -*-===// | ||
// | ||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// | ||
//===----------------------------------------------------------------------===// | ||
|
||
#ifndef LLDB_VALUEOBJECT_DILLEXER_H | ||
#define LLDB_VALUEOBJECT_DILLEXER_H | ||
|
||
#include "llvm/ADT/StringRef.h" | ||
#include "llvm/Support/Error.h" | ||
#include <cstdint> | ||
#include <memory> | ||
#include <string> | ||
#include <vector> | ||
|
||
namespace lldb_private::dil { | ||
|
||
/// Class defining the tokens generated by the DIL lexer and used by the | ||
/// DIL parser. | ||
class Token { | ||
public: | ||
enum Kind { | ||
coloncolon, | ||
eof, | ||
identifier, | ||
l_paren, | ||
r_paren, | ||
}; | ||
|
||
Token(Kind kind, std::string spelling, uint32_t start) | ||
: m_kind(kind), m_spelling(std::move(spelling)), m_start_pos(start) {} | ||
|
||
Kind GetKind() const { return m_kind; } | ||
|
||
std::string GetSpelling() const { return m_spelling; } | ||
|
||
bool Is(Kind kind) const { return m_kind == kind; } | ||
|
||
bool IsNot(Kind kind) const { return m_kind != kind; } | ||
|
||
bool IsOneOf(Kind kind1, Kind kind2) const { return Is(kind1) || Is(kind2); } | ||
|
||
template <typename... Ts> bool IsOneOf(Kind kind, Ts... Ks) const { | ||
return Is(kind) || IsOneOf(Ks...); | ||
} | ||
|
||
uint32_t GetLocation() const { return m_start_pos; } | ||
|
||
static llvm::StringRef GetTokenName(Kind kind); | ||
|
||
private: | ||
Kind m_kind; | ||
std::string m_spelling; | ||
uint32_t m_start_pos; // within entire expression string | ||
}; | ||
|
||
/// Class for doing the simple lexing required by DIL. | ||
class DILLexer { | ||
public: | ||
/// Lexes all the tokens in expr and calls the private constructor | ||
/// with the lexed tokens. | ||
static llvm::Expected<DILLexer> Create(llvm::StringRef expr); | ||
|
||
/// Return the current token to be handled by the DIL parser. | ||
const Token &GetCurrentToken() { return m_lexed_tokens[m_tokens_idx]; } | ||
|
||
/// Advance the current token position by N. | ||
void Advance(uint32_t N = 1) { | ||
if (m_tokens_idx + N >= m_lexed_tokens.size()) | ||
// N is too large; advance to the end of the lexed tokens. | ||
m_tokens_idx = m_lexed_tokens.size() - 1; | ||
else | ||
m_tokens_idx += N; | ||
} | ||
|
||
/// Return the lexed token N positions ahead of the 'current' token | ||
/// being handled by the DIL parser. | ||
const Token &LookAhead(uint32_t N) { | ||
if (m_tokens_idx + N < m_lexed_tokens.size()) | ||
return m_lexed_tokens[m_tokens_idx + N]; | ||
|
||
// Last token should be an 'eof' token. | ||
return m_lexed_tokens.back(); | ||
} | ||
|
||
/// Return the index for the 'current' token being handled by the DIL parser. | ||
uint32_t GetCurrentTokenIdx() { return m_tokens_idx; } | ||
|
||
/// Set the index for the 'current' token (to be handled by the parser) | ||
/// to a particular position. Used for either committing 'look ahead' parsing | ||
/// or rolling back tentative parsing. | ||
void ResetTokenIdx(uint32_t new_value) { | ||
assert(new_value < m_lexed_tokens.size()); | ||
m_tokens_idx = new_value; | ||
} | ||
|
||
uint32_t NumLexedTokens() { return m_lexed_tokens.size(); } | ||
|
||
private: | ||
DILLexer(llvm::StringRef dil_expr, std::vector<Token> lexed_tokens) | ||
: m_expr(dil_expr), m_lexed_tokens(std::move(lexed_tokens)), | ||
m_tokens_idx(0) {} | ||
|
||
static llvm::Expected<Token> Lex(llvm::StringRef expr, | ||
llvm::StringRef &remainder); | ||
|
||
// The input string we are lexing & parsing. | ||
llvm::StringRef m_expr; | ||
|
||
// Holds all of the tokens lexed so far. | ||
std::vector<Token> m_lexed_tokens; | ||
|
||
// Index into m_lexed_tokens; indicates which token the DIL parser is | ||
// currently trying to parse/handle. | ||
uint32_t m_tokens_idx; | ||
}; | ||
|
||
} // namespace lldb_private::dil | ||
|
||
#endif // LLDB_VALUEOBJECT_DILLEXER_H |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
add_lldb_library(lldbValueObject | ||
DILLexer.cpp | ||
ValueObject.cpp | ||
ValueObjectCast.cpp | ||
ValueObjectChild.cpp | ||
|
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
//===-- DILLexer.cpp ------------------------------------------------------===// | ||
// | ||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// | ||
// This implements the lexer for the Data Inspection Language (DIL), and its | ||
// helper functions. DIL will eventually underlie the 'frame variable' | ||
// command. The language that this lexer tokenizes is described in | ||
// lldb/docs/dil-expr-lang.ebnf | ||
// | ||
//===----------------------------------------------------------------------===// | ||
|
||
#include "lldb/ValueObject/DILLexer.h"
#include "lldb/Utility/Status.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/ErrorHandling.h"
|
||
namespace lldb_private::dil { | ||
|
||
/// Return a printable name for \p kind. The name is the enumerator's own
/// identifier, e.g. "l_paren" for Token::l_paren.
llvm::StringRef Token::GetTokenName(Kind kind) {
  // No default label on purpose: the switch covers every enumerator, so the
  // compiler can warn if a new Kind is added without a name.
  switch (kind) {
  case Kind::coloncolon:
    return "coloncolon";
  case Kind::eof:
    return "eof";
  case Kind::identifier:
    return "identifier";
  case Kind::l_paren:
    return "l_paren";
  case Kind::r_paren:
    return "r_paren";
  }
  // Only reachable if 'kind' holds a value that is not a declared enumerator.
  // Without this, control would fall off the end of a non-void function.
  llvm_unreachable("Unknown token kind");
}
|
||
/// Return true if \p c is one of 'a'..'z' or 'A'..'Z'. Written as explicit
/// range checks, so the result does not depend on the current locale.
static bool IsLetter(char c) {
  const bool is_lower = c >= 'a' && c <= 'z';
  const bool is_upper = c >= 'A' && c <= 'Z';
  return is_lower || is_upper;
}
|
||
/// Return true if \p c is one of the decimal digits '0'..'9'.
static bool IsDigit(char c) {
  if (c < '0')
    return false;
  return c <= '9';
}
|
||
// A word is the longest non-empty prefix of 'remainder' made up of letters
// ('a'..'z','A'..'Z'), digits ('0'..'9'), underscores and dollar signs, whose
// first character is not a digit.
//
// On success the word is returned and 'remainder' is advanced past it. If
// 'remainder' does not start with a word, std::nullopt is returned and
// 'remainder' is left untouched. 'expr' is not used here.
static std::optional<llvm::StringRef> IsWord(llvm::StringRef expr,
                                             llvm::StringRef &remainder) {
  auto is_word_char = [](char c) {
    return c == '_' || c == '$' || IsLetter(c) || IsDigit(c);
  };

  const llvm::StringRef word = remainder.take_while(is_word_char);

  // Something like "2a" is a run of word characters, but it is not a word.
  const bool starts_like_word = !word.empty() && !IsDigit(word.front());
  if (!starts_like_word)
    return std::nullopt;

  remainder = remainder.drop_front(word.size());
  return word;
}
|
||
/// Lex all of \p expr and return a lexer positioned on the first token.
///
/// Tokens are lexed one at a time until an 'eof' token is produced, so the
/// resulting token list always ends with 'eof'. The first lexing error stops
/// the process and is returned to the caller.
llvm::Expected<DILLexer> DILLexer::Create(llvm::StringRef expr) {
  std::vector<Token> tokens;
  llvm::StringRef remainder = expr;

  while (true) {
    llvm::Expected<Token> maybe_token = Lex(expr, remainder);
    if (!maybe_token)
      return maybe_token.takeError();

    tokens.push_back(std::move(*maybe_token));
    if (tokens.back().Is(Token::eof))
      break;
  }

  return DILLexer(expr, std::move(tokens));
}
|
||
/// Lex one token from the front of \p remainder.
///
/// \p expr is the complete expression and is only used to compute token
/// offsets; \p remainder is the not-yet-lexed tail of it and is advanced past
/// any leading whitespace and past the token that is returned.
///
/// \return the next token (an 'eof' token located at expr.size() once the
/// input is exhausted), or an error if the input does not start with a word
/// or one of the known operators.
llvm::Expected<Token> DILLexer::Lex(llvm::StringRef expr,
                                    llvm::StringRef &remainder) {
  // Leading whitespace never belongs to a token.
  remainder = remainder.ltrim();

  // Nothing left: report end of input.
  if (remainder.empty())
    return Token(Token::eof, "", static_cast<uint32_t>(expr.size()));

  // Offset of the token within the whole expression. It has to be captured
  // before 'remainder' is advanced by the code below.
  const uint32_t position = remainder.begin() - expr.begin();

  if (std::optional<llvm::StringRef> word = IsWord(expr, remainder))
    return Token(Token::identifier, word->str(), position);

  constexpr std::pair<Token::Kind, const char *> operators[] = {
      {Token::l_paren, "("},
      {Token::r_paren, ")"},
      {Token::coloncolon, "::"},
  };
  for (const auto &[kind, spelling] : operators)
    if (remainder.consume_front(spelling))
      return Token(kind, spelling, position);

  // Unrecognized character(s) in string; unable to lex it.
  return llvm::createStringError("Unable to lex input string");
}
|
||
} // namespace lldb_private::dil |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,156 @@ | ||
//===-- DILLexerTests.cpp --------------------------------------------===// | ||
// | ||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// | ||
//===----------------------------------------------------------------------===// | ||
|
||
#include "lldb/ValueObject/DILLexer.h" | ||
#include "llvm/ADT/StringRef.h" | ||
#include "llvm/Testing/Support/Error.h" | ||
#include "gtest/gtest.h" | ||
#include <string> | ||
|
||
using llvm::StringRef; | ||
|
||
using namespace lldb_private::dil; | ||
|
||
/// Lex \p input_expr and return the (kind, spelling) pair of every token in
/// order, excluding the trailing 'eof' token. Lexing errors are passed on to
/// the caller.
llvm::Expected<std::vector<std::pair<Token::Kind, std::string>>>
ExtractTokenData(llvm::StringRef input_expr) {
  llvm::Expected<DILLexer> maybe_lexer = DILLexer::Create(input_expr);
  if (!maybe_lexer)
    return maybe_lexer.takeError();
  DILLexer lexer(*maybe_lexer);

  std::vector<std::pair<Token::Kind, std::string>> data;
  // Stop as soon as the eof token is reached; it is not part of the result.
  while (lexer.GetCurrentToken().IsNot(Token::eof)) {
    const Token &tok = lexer.GetCurrentToken();
    data.emplace_back(tok.GetKind(), tok.GetSpelling());
    lexer.Advance();
  }
  return data;
}
|
||
// A single identifier lexes into exactly one identifier token followed by eof.
TEST(DILLexerTests, SimpleTest) {
  StringRef input_expr("simple_var");
  llvm::Expected<DILLexer> maybe_lexer = DILLexer::Create(input_expr);
  ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded());
  DILLexer lexer(*maybe_lexer);

  // First token: the identifier itself.
  Token first = lexer.GetCurrentToken();
  EXPECT_EQ(first.GetKind(), Token::identifier);
  EXPECT_EQ(first.GetSpelling(), "simple_var");

  // Second (and last) token: end of input.
  lexer.Advance();
  Token second = lexer.GetCurrentToken();
  EXPECT_EQ(second.GetKind(), Token::eof);
}
|
||
// Exercises the kind predicates of Token: Is() and both IsOneOf() overloads.
TEST(DILLexerTests, TokenKindTest) {
  Token ident_tok(Token::identifier, "ident", 0);

  // Single-kind checks.
  EXPECT_TRUE(ident_tok.Is(Token::identifier));
  EXPECT_FALSE(ident_tok.Is(Token::l_paren));

  // Two-kind overload.
  EXPECT_TRUE(ident_tok.IsOneOf(Token::eof, Token::identifier));

  // Variadic overload, with a list that does not contain 'identifier'.
  EXPECT_FALSE(ident_tok.IsOneOf(Token::l_paren, Token::r_paren,
                                 Token::coloncolon, Token::eof));
}
|
||
// Checks that LookAhead() peeks at upcoming tokens without moving the current
// position, and that Advance() then moves the position as requested.
TEST(DILLexerTests, LookAheadTest) {
  StringRef input_expr("(anonymous namespace)::some_var");
  llvm::Expected<DILLexer> maybe_lexer = DILLexer::Create(input_expr);
  ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded());
  DILLexer lexer(*maybe_lexer);
  Token token = lexer.GetCurrentToken();

  // Current token is '('; check the next 4 tokens, to make
  // sure they are the identifier 'anonymous', the identifier 'namespace'
  // ')' and '::', in that order.
  EXPECT_EQ(token.GetKind(), Token::l_paren);
  EXPECT_EQ(lexer.LookAhead(1).GetKind(), Token::identifier);
  EXPECT_EQ(lexer.LookAhead(1).GetSpelling(), "anonymous");
  EXPECT_EQ(lexer.LookAhead(2).GetKind(), Token::identifier);
  EXPECT_EQ(lexer.LookAhead(2).GetSpelling(), "namespace");
  EXPECT_EQ(lexer.LookAhead(3).GetKind(), Token::r_paren);
  EXPECT_EQ(lexer.LookAhead(4).GetKind(), Token::coloncolon);

  // Our current index should still be 0, as we only looked ahead; we are still
  // officially on the '('.
  EXPECT_EQ(lexer.GetCurrentTokenIdx(), 0u);

  // Accept the 'lookahead', so our current token is '::', which has the index
  // 4 in our vector of tokens (which starts at zero).
  lexer.Advance(4);
  token = lexer.GetCurrentToken();
  EXPECT_EQ(token.GetKind(), Token::coloncolon);
  EXPECT_EQ(lexer.GetCurrentTokenIdx(), 4u);

  // 'some_var' starts right after the "(anonymous namespace)::" prefix. The
  // prefix length is taken from a StringRef so that this file does not depend
  // on strlen() being declared by some transitive include.
  lexer.Advance();
  token = lexer.GetCurrentToken();
  EXPECT_EQ(token.GetKind(), Token::identifier);
  EXPECT_EQ(token.GetSpelling(), "some_var");
  EXPECT_EQ(lexer.GetCurrentTokenIdx(), 5u);
  EXPECT_EQ(token.GetLocation(), StringRef("(anonymous namespace)::").size());

  lexer.Advance();
  token = lexer.GetCurrentToken();
  EXPECT_EQ(token.GetKind(), Token::eof);
}
|
||
// Lexes a string mixing identifiers, parentheses and '::' with irregular
// spacing, and checks the full sequence of (kind, spelling) pairs.
TEST(DILLexerTests, MultiTokenLexTest) {
  using testing::ElementsAre;
  using testing::Pair;

  EXPECT_THAT_EXPECTED(
      ExtractTokenData("This string has (several ) ::identifiers"),
      llvm::HasValue(ElementsAre(Pair(Token::identifier, "This"),
                                 Pair(Token::identifier, "string"),
                                 Pair(Token::identifier, "has"),
                                 Pair(Token::l_paren, "("),
                                 Pair(Token::identifier, "several"),
                                 Pair(Token::r_paren, ")"),
                                 Pair(Token::coloncolon, "::"),
                                 Pair(Token::identifier, "identifiers"))));
}
|
||
// Checks which strings lex as identifiers, which lex as other tokens, and
// which cannot be lexed at all.
TEST(DILLexerTests, IdentifiersTest) {
  // These strings should lex into identifier tokens.
  std::vector<std::string> valid_identifiers = {
      "$My_name1", "$pc",  "abcd", "_", "_a",     "_a_",      "$",
      "a_b",       "this", "self", "a", "MyName", "namespace"};

  // The lexer can lex these strings, but they should not be identifiers.
  std::vector<std::string> invalid_identifiers = {"", "::", "(", ")"};

  // The lexer is expected to fail attempting to lex these strings (it cannot
  // create valid tokens out of them).
  std::vector<std::string> invalid_tok_strings = {"234", "2a", "2", "1MyName"};

  // Verify that all of the valid identifiers come out as identifier tokens.
  for (const auto &str : valid_identifiers) {
    SCOPED_TRACE(str);
    EXPECT_THAT_EXPECTED(ExtractTokenData(str),
                         llvm::HasValue(testing::ElementsAre(
                             testing::Pair(Token::identifier, str))));
  }

  // Verify that the lexer fails on invalid token strings.
  for (const auto &str : invalid_tok_strings) {
    SCOPED_TRACE(str);
    auto maybe_lexer = DILLexer::Create(str);
    EXPECT_THAT_EXPECTED(maybe_lexer, llvm::Failed());
  }

  // Verify that none of the invalid identifiers come out as identifier tokens.
  for (const auto &str : invalid_identifiers) {
    SCOPED_TRACE(str);
    llvm::Expected<DILLexer> maybe_lexer = DILLexer::Create(str);
    // This must be ASSERT, not EXPECT: the lexer is dereferenced right below,
    // which is only valid when Create() succeeded.
    ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded());
    DILLexer lexer(*maybe_lexer);
    Token token = lexer.GetCurrentToken();
    EXPECT_TRUE(token.IsNot(Token::identifier));
    EXPECT_TRUE(token.IsOneOf(Token::eof, Token::coloncolon, Token::l_paren,
                              Token::r_paren));
  }
}
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.