Skip to content

Commit d9a7498

Browse files
authored
[LLDB] Add Lexer (with tests) for DIL (Data Inspection Language). (#123521)
This adds the basic lexer, with unittests, for the Data Inspection Language (DIL) -- see https://discourse.llvm.org/t/rfc-data-inspection-language/69893 This version of the lexer only handles local variables and namespaces, and is designed to work with #120971.
1 parent 2d8106c commit d9a7498

File tree

5 files changed

+379
-0
lines changed

5 files changed

+379
-0
lines changed
Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
//===-- DILLexer.h ----------------------------------------------*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef LLDB_VALUEOBJECT_DILLEXER_H
10+
#define LLDB_VALUEOBJECT_DILLEXER_H
11+
12+
#include "llvm/ADT/StringRef.h"
13+
#include "llvm/Support/Error.h"
14+
#include <cstdint>
15+
#include <memory>
16+
#include <string>
17+
#include <vector>
18+
19+
namespace lldb_private::dil {
20+
21+
/// Class defining the tokens generated by the DIL lexer and used by the
22+
/// DIL parser.
23+
class Token {
24+
public:
25+
enum Kind {
26+
coloncolon,
27+
eof,
28+
identifier,
29+
l_paren,
30+
r_paren,
31+
};
32+
33+
Token(Kind kind, std::string spelling, uint32_t start)
34+
: m_kind(kind), m_spelling(std::move(spelling)), m_start_pos(start) {}
35+
36+
Kind GetKind() const { return m_kind; }
37+
38+
std::string GetSpelling() const { return m_spelling; }
39+
40+
bool Is(Kind kind) const { return m_kind == kind; }
41+
42+
bool IsNot(Kind kind) const { return m_kind != kind; }
43+
44+
bool IsOneOf(Kind kind1, Kind kind2) const { return Is(kind1) || Is(kind2); }
45+
46+
template <typename... Ts> bool IsOneOf(Kind kind, Ts... Ks) const {
47+
return Is(kind) || IsOneOf(Ks...);
48+
}
49+
50+
uint32_t GetLocation() const { return m_start_pos; }
51+
52+
static llvm::StringRef GetTokenName(Kind kind);
53+
54+
private:
55+
Kind m_kind;
56+
std::string m_spelling;
57+
uint32_t m_start_pos; // within entire expression string
58+
};
59+
60+
/// Class for doing the simple lexing required by DIL.
61+
class DILLexer {
62+
public:
63+
/// Lexes all the tokens in expr and calls the private constructor
64+
/// with the lexed tokens.
65+
static llvm::Expected<DILLexer> Create(llvm::StringRef expr);
66+
67+
/// Return the current token to be handled by the DIL parser.
68+
const Token &GetCurrentToken() { return m_lexed_tokens[m_tokens_idx]; }
69+
70+
/// Advance the current token position by N.
71+
void Advance(uint32_t N = 1) {
72+
if (m_tokens_idx + N >= m_lexed_tokens.size())
73+
// N is too large; advance to the end of the lexed tokens.
74+
m_tokens_idx = m_lexed_tokens.size() - 1;
75+
else
76+
m_tokens_idx += N;
77+
}
78+
79+
/// Return the lexed token N positions ahead of the 'current' token
80+
/// being handled by the DIL parser.
81+
const Token &LookAhead(uint32_t N) {
82+
if (m_tokens_idx + N < m_lexed_tokens.size())
83+
return m_lexed_tokens[m_tokens_idx + N];
84+
85+
// Last token should be an 'eof' token.
86+
return m_lexed_tokens.back();
87+
}
88+
89+
/// Return the index for the 'current' token being handled by the DIL parser.
90+
uint32_t GetCurrentTokenIdx() { return m_tokens_idx; }
91+
92+
/// Set the index for the 'current' token (to be handled by the parser)
93+
/// to a particular position. Used for either committing 'look ahead' parsing
94+
/// or rolling back tentative parsing.
95+
void ResetTokenIdx(uint32_t new_value) {
96+
assert(new_value < m_lexed_tokens.size());
97+
m_tokens_idx = new_value;
98+
}
99+
100+
uint32_t NumLexedTokens() { return m_lexed_tokens.size(); }
101+
102+
private:
103+
DILLexer(llvm::StringRef dil_expr, std::vector<Token> lexed_tokens)
104+
: m_expr(dil_expr), m_lexed_tokens(std::move(lexed_tokens)),
105+
m_tokens_idx(0) {}
106+
107+
static llvm::Expected<Token> Lex(llvm::StringRef expr,
108+
llvm::StringRef &remainder);
109+
110+
// The input string we are lexing & parsing.
111+
llvm::StringRef m_expr;
112+
113+
// Holds all of the tokens lexed so far.
114+
std::vector<Token> m_lexed_tokens;
115+
116+
// Index into m_lexed_tokens; indicates which token the DIL parser is
117+
// currently trying to parse/handle.
118+
uint32_t m_tokens_idx;
119+
};
120+
121+
} // namespace lldb_private::dil
122+
123+
#endif // LLDB_VALUEOBJECT_DILLEXER_H

lldb/source/ValueObject/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
add_lldb_library(lldbValueObject
2+
DILLexer.cpp
23
ValueObject.cpp
34
ValueObjectCast.cpp
45
ValueObjectChild.cpp

lldb/source/ValueObject/DILLexer.cpp

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
//===-- DILLexer.cpp ------------------------------------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
// This implements the lexer (tokenizer) for the Data Inspection
8+
// Language (DIL), and its helper functions, which will eventually underlie the
9+
// 'frame variable' command. The language that this lexer tokenizes is
10+
// described in lldb/docs/dil-expr-lang.ebnf
11+
//
12+
//===----------------------------------------------------------------------===//
13+
14+
#include "lldb/ValueObject/DILLexer.h"
#include "lldb/Utility/Status.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/ErrorHandling.h"
17+
18+
namespace lldb_private::dil {
19+
20+
/// Returns the name of the given token kind, spelled exactly like its
/// enumerator in Token::Kind.
llvm::StringRef Token::GetTokenName(Kind kind) {
  // Deliberately no 'default:' label, so the compiler can warn when a new
  // enumerator is added to Token::Kind without a case here.
  switch (kind) {
  case Kind::coloncolon:
    return "coloncolon";
  case Kind::eof:
    return "eof";
  case Kind::identifier:
    return "identifier";
  case Kind::l_paren:
    return "l_paren";
  case Kind::r_paren:
    return "r_paren";
  }
  // Every enumerator returns above. Without this, a value outside the
  // enumeration would fall off the end of a non-void function (undefined
  // behavior, and a -Wreturn-type warning on some compilers).
  llvm_unreachable("Unknown token name");
}
34+
35+
/// Returns true if c is an ASCII letter ('a'..'z' or 'A'..'Z').
static bool IsLetter(char c) {
  const bool is_lower = c >= 'a' && c <= 'z';
  const bool is_upper = c >= 'A' && c <= 'Z';
  return is_lower || is_upper;
}
38+
39+
/// Returns true if c is an ASCII decimal digit ('0'..'9').
static bool IsDigit(char c) {
  if (c < '0')
    return false;
  return c <= '9';
}
40+
41+
// A word starts with a letter, underscore, or dollar sign, followed by
42+
// letters ('a'..'z','A'..'Z'), digits ('0'..'9'), and/or underscores.
43+
static std::optional<llvm::StringRef> IsWord(llvm::StringRef expr,
44+
llvm::StringRef &remainder) {
45+
// Find the longest prefix consisting of letters, digits, underscors and
46+
// '$'. If it doesn't start with a digit, then it's a word.
47+
llvm::StringRef candidate = remainder.take_while(
48+
[](char c) { return IsDigit(c) || IsLetter(c) || c == '_' || c == '$'; });
49+
if (candidate.empty() || IsDigit(candidate[0]))
50+
return std::nullopt;
51+
remainder = remainder.drop_front(candidate.size());
52+
return candidate;
53+
}
54+
55+
/// Lexes the whole of `expr` into tokens and wraps them in a DILLexer.
///
/// Tokens are produced one at a time until the 'eof' token has been lexed, so
/// the resulting token list always ends with 'eof'. The first lexing error
/// aborts the process and is returned to the caller.
llvm::Expected<DILLexer> DILLexer::Create(llvm::StringRef expr) {
  std::vector<Token> tokens;
  // The not-yet-lexed tail of expr; Lex() shrinks it as tokens are consumed.
  llvm::StringRef remainder = expr;

  for (;;) {
    llvm::Expected<Token> maybe_token = Lex(expr, remainder);
    if (!maybe_token)
      return maybe_token.takeError();

    // Record this before the token is moved into the vector.
    const bool reached_eof = maybe_token->Is(Token::eof);
    tokens.push_back(std::move(*maybe_token));
    if (reached_eof)
      break;
  }

  return DILLexer(expr, std::move(tokens));
}
67+
68+
/// Lexes one token from the front of `remainder`.
///
/// \param expr       the complete expression; used only to compute offsets.
/// \param remainder  the not-yet-lexed part of `expr`; on success it is
///                   advanced past the token (and any leading whitespace).
/// \return the token, an 'eof' token when nothing but whitespace is left, or
///         an error when the input does not start with any known token.
llvm::Expected<Token> DILLexer::Lex(llvm::StringRef expr,
                                    llvm::StringRef &remainder) {
  // Drop leading whitespace before looking for the next token.
  remainder = remainder.ltrim();

  // Nothing left: emit 'eof', located one past the end of the expression.
  if (remainder.empty())
    return Token(Token::eof, "", static_cast<uint32_t>(expr.size()));

  // Offset of the upcoming token within the full expression.
  const uint32_t position = remainder.begin() - expr.begin();

  // Identifiers are tried first.
  if (std::optional<llvm::StringRef> word = IsWord(expr, remainder))
    return Token(Token::identifier, word->str(), position);

  // Then the tokens that have a fixed spelling.
  constexpr std::pair<Token::Kind, const char *> operators[] = {
      {Token::l_paren, "("},
      {Token::r_paren, ")"},
      {Token::coloncolon, "::"},
  };
  for (const auto &entry : operators) {
    if (remainder.consume_front(entry.second))
      return Token(entry.first, entry.second, position);
  }

  // Unrecognized character(s) in string; unable to lex it.
  return llvm::createStringError("Unable to lex input string");
}
96+
97+
} // namespace lldb_private::dil

lldb/unittests/ValueObject/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
add_lldb_unittest(LLDBValueObjectTests
22
DumpValueObjectOptionsTests.cpp
3+
DILLexerTests.cpp
34

45
LINK_LIBS
56
lldbValueObject
67
lldbPluginPlatformLinux
78
lldbPluginScriptInterpreterNone
9+
LLVMTestingSupport
810

911
LINK_COMPONENTS
1012
Support
Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
//===-- DILLexerTests.cpp --------------------------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#include "lldb/ValueObject/DILLexer.h"
10+
#include "llvm/ADT/StringRef.h"
11+
#include "llvm/Testing/Support/Error.h"
12+
#include "gtest/gtest.h"
13+
#include <string>
14+
15+
using llvm::StringRef;
16+
17+
using namespace lldb_private::dil;
18+
19+
/// Lexes `input_expr` and returns the (kind, spelling) pair of every token
/// before the trailing 'eof', in order. Propagates the lexer's error if the
/// input cannot be lexed.
llvm::Expected<std::vector<std::pair<Token::Kind, std::string>>>
ExtractTokenData(llvm::StringRef input_expr) {
  llvm::Expected<DILLexer> maybe_lexer = DILLexer::Create(input_expr);
  if (!maybe_lexer)
    return maybe_lexer.takeError();
  DILLexer lexer(*maybe_lexer);

  std::vector<std::pair<Token::Kind, std::string>> data;
  // Walk the tokens up to, but not including, the eof token.
  for (Token tok = lexer.GetCurrentToken(); tok.IsNot(Token::eof);
       tok = lexer.GetCurrentToken()) {
    data.emplace_back(tok.GetKind(), tok.GetSpelling());
    lexer.Advance();
  }
  return data;
}
37+
38+
// A lone identifier lexes to exactly one identifier token followed by eof.
TEST(DILLexerTests, SimpleTest) {
  StringRef input_expr("simple_var");
  llvm::Expected<DILLexer> maybe_lexer = DILLexer::Create(input_expr);
  ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded());
  DILLexer lexer(*maybe_lexer);

  // First token: the identifier itself.
  const Token first = lexer.GetCurrentToken();
  EXPECT_EQ(first.GetKind(), Token::identifier);
  EXPECT_EQ(first.GetSpelling(), "simple_var");

  // Second (and last) token: end of input.
  lexer.Advance();
  const Token second = lexer.GetCurrentToken();
  EXPECT_EQ(second.GetKind(), Token::eof);
}
51+
52+
// Exercises the kind predicates (Is / IsOneOf) on a hand-built token.
TEST(DILLexerTests, TokenKindTest) {
  const Token ident_token(Token::identifier, "ident", 0);

  // Exact-kind checks.
  EXPECT_TRUE(ident_token.Is(Token::identifier));
  EXPECT_FALSE(ident_token.Is(Token::l_paren));

  // Two-kind overload: a match in the second position is enough.
  EXPECT_TRUE(ident_token.IsOneOf(Token::eof, Token::identifier));

  // Variadic overload: none of the listed kinds match.
  EXPECT_FALSE(ident_token.IsOneOf(Token::l_paren, Token::r_paren,
                                   Token::coloncolon, Token::eof));
}
61+
62+
// Checks that LookAhead() peeks without moving the cursor, and that Advance()
// then moves the cursor as expected.
TEST(DILLexerTests, LookAheadTest) {
  StringRef input_expr("(anonymous namespace)::some_var");
  llvm::Expected<DILLexer> maybe_lexer = DILLexer::Create(input_expr);
  ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded());
  DILLexer lexer(*maybe_lexer);

  // The cursor starts on '('.
  Token token = lexer.GetCurrentToken();
  EXPECT_EQ(token.GetKind(), Token::l_paren);

  // Peek at the next four tokens: 'anonymous', 'namespace', ')' and '::'.
  EXPECT_EQ(lexer.LookAhead(1).GetKind(), Token::identifier);
  EXPECT_EQ(lexer.LookAhead(1).GetSpelling(), "anonymous");
  EXPECT_EQ(lexer.LookAhead(2).GetKind(), Token::identifier);
  EXPECT_EQ(lexer.LookAhead(2).GetSpelling(), "namespace");
  EXPECT_EQ(lexer.LookAhead(3).GetKind(), Token::r_paren);
  EXPECT_EQ(lexer.LookAhead(4).GetKind(), Token::coloncolon);

  // Peeking must not have moved the cursor off the '(' at index 0.
  EXPECT_EQ(lexer.GetCurrentTokenIdx(), 0u);

  // Commit the look-ahead: the cursor lands on '::', which is token index 4
  // (indices start at zero).
  lexer.Advance(4);
  token = lexer.GetCurrentToken();
  EXPECT_EQ(token.GetKind(), Token::coloncolon);
  EXPECT_EQ(lexer.GetCurrentTokenIdx(), 4u);

  // One more step reaches 'some_var', which starts right after the "::".
  lexer.Advance();
  token = lexer.GetCurrentToken();
  EXPECT_EQ(token.GetKind(), Token::identifier);
  EXPECT_EQ(token.GetSpelling(), "some_var");
  EXPECT_EQ(lexer.GetCurrentTokenIdx(), 5u);
  EXPECT_EQ(token.GetLocation(), strlen("(anonymous namespace)::"));

  // And the final step reaches the end of input.
  lexer.Advance();
  token = lexer.GetCurrentToken();
  EXPECT_EQ(token.GetKind(), Token::eof);
}
102+
103+
// Lexes an input mixing identifiers, parentheses, '::' and irregular spacing,
// and checks the complete (kind, spelling) sequence.
TEST(DILLexerTests, MultiTokenLexTest) {
  auto expected_tokens =
      testing::ElementsAre(testing::Pair(Token::identifier, "This"),
                           testing::Pair(Token::identifier, "string"),
                           testing::Pair(Token::identifier, "has"),
                           testing::Pair(Token::l_paren, "("),
                           testing::Pair(Token::identifier, "several"),
                           testing::Pair(Token::r_paren, ")"),
                           testing::Pair(Token::coloncolon, "::"),
                           testing::Pair(Token::identifier, "identifiers"));

  EXPECT_THAT_EXPECTED(
      ExtractTokenData("This string has (several ) ::identifiers"),
      llvm::HasValue(expected_tokens));
}
116+
117+
// Checks which strings lex as a single identifier, which lex to some other
// token, and which cannot be lexed at all.
TEST(DILLexerTests, IdentifiersTest) {
  // These strings should lex into identifier tokens.
  std::vector<std::string> valid_identifiers = {
      "$My_name1", "$pc",  "abcd", "_", "_a",     "_a_",      "$",
      "a_b",       "this", "self", "a", "MyName", "namespace"};

  // The lexer can lex these strings, but they should not be identifiers.
  std::vector<std::string> invalid_identifiers = {"", "::", "(", ")"};

  // The lexer is expected to fail attempting to lex these strings (it cannot
  // create valid tokens out of them).
  std::vector<std::string> invalid_tok_strings = {"234", "2a", "2", "1MyName"};

  // Verify that all of the valid identifiers come out as identifier tokens.
  for (auto &str : valid_identifiers) {
    SCOPED_TRACE(str);
    EXPECT_THAT_EXPECTED(ExtractTokenData(str),
                         llvm::HasValue(testing::ElementsAre(
                             testing::Pair(Token::identifier, str))));
  }

  // Verify that the lexer fails on invalid token strings.
  for (auto &str : invalid_tok_strings) {
    SCOPED_TRACE(str);
    auto maybe_lexer = DILLexer::Create(str);
    EXPECT_THAT_EXPECTED(maybe_lexer, llvm::Failed());
  }

  // Verify that none of the invalid identifiers come out as identifier tokens.
  for (auto &str : invalid_identifiers) {
    SCOPED_TRACE(str);
    llvm::Expected<DILLexer> maybe_lexer = DILLexer::Create(str);
    // Must be ASSERT, not EXPECT: the Expected is dereferenced on the next
    // line, which is only valid when lexing succeeded.
    ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded());
    DILLexer lexer(*maybe_lexer);
    Token token = lexer.GetCurrentToken();
    EXPECT_TRUE(token.IsNot(Token::identifier));
    EXPECT_TRUE(token.IsOneOf(Token::eof, Token::coloncolon, Token::l_paren,
                              Token::r_paren));
  }
}

0 commit comments

Comments
 (0)