[LLDB] Add Lexer (with tests) for DIL (Data Inspection Language). #123521
Conversation
This adds the basic lexer, with unittests, for the Data Inspection Language (DIL) -- see https://discourse.llvm.org/t/rfc-data-inspection-language/69893 This version of the lexer only handles local variables and namespaces, and is designed to work with llvm#120971.
@llvm/pr-subscribers-lldb Author: None (cmtice). Full diff: https://github.com/llvm/llvm-project/pull/123521.diff — 4 Files Affected:
diff --git a/lldb/include/lldb/ValueObject/DILLexer.h b/lldb/include/lldb/ValueObject/DILLexer.h
new file mode 100644
index 00000000000000..45c506b2f4106d
--- /dev/null
+++ b/lldb/include/lldb/ValueObject/DILLexer.h
@@ -0,0 +1,156 @@
+//===-- DILLexer.h ----------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_VALUEOBJECT_DILLEXER_H_
+#define LLDB_VALUEOBJECT_DILLEXER_H_
+
+#include "llvm/ADT/StringRef.h"
+#include <cstdint>
+#include <limits.h>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace lldb_private {
+
+namespace dil {
+
+enum class TokenKind {
+ coloncolon,
+ eof,
+ identifier,
+ invalid,
+ kw_namespace,
+ l_paren,
+ none,
+ r_paren,
+ unknown,
+};
+
+/// Class defining the tokens generated by the DIL lexer and used by the
+/// DIL parser.
+class DILToken {
+public:
+ DILToken(dil::TokenKind kind, std::string spelling, uint32_t start)
+ : m_kind(kind), m_spelling(spelling), m_start_pos(start) {}
+
+ DILToken() : m_kind(dil::TokenKind::none), m_spelling(""), m_start_pos(0) {}
+
+ void setKind(dil::TokenKind kind) { m_kind = kind; }
+ dil::TokenKind getKind() const { return m_kind; }
+
+ std::string getSpelling() const { return m_spelling; }
+
+ uint32_t getLength() const { return m_spelling.size(); }
+
+ bool is(dil::TokenKind kind) const { return m_kind == kind; }
+
+ bool isNot(dil::TokenKind kind) const { return m_kind != kind; }
+
+ bool isOneOf(dil::TokenKind kind1, dil::TokenKind kind2) const {
+ return is(kind1) || is(kind2);
+ }
+
+ template <typename... Ts> bool isOneOf(dil::TokenKind kind, Ts... Ks) const {
+ return is(kind) || isOneOf(Ks...);
+ }
+
+ uint32_t getLocation() const { return m_start_pos; }
+
+ void setValues(dil::TokenKind kind, std::string spelling, uint32_t start) {
+ m_kind = kind;
+ m_spelling = spelling;
+ m_start_pos = start;
+ }
+
+ static const std::string getTokenName(dil::TokenKind kind);
+
+private:
+ dil::TokenKind m_kind;
+ std::string m_spelling;
+ uint32_t m_start_pos; // within entire expression string
+};
+
+/// Class for doing the simple lexing required by DIL.
+class DILLexer {
+public:
+ DILLexer(llvm::StringRef dil_expr) : m_expr(dil_expr.str()) {
+ m_cur_pos = m_expr.begin();
+ // Use UINT_MAX to indicate invalid/uninitialized value.
+ m_tokens_idx = UINT_MAX;
+ }
+
+ bool Lex(DILToken &result, bool look_ahead = false);
+
+ bool Is_Word(std::string::iterator start, uint32_t &length);
+
+ uint32_t GetLocation() { return m_cur_pos - m_expr.begin(); }
+
+ /// Update 'result' with the other parameter values, create a
+ /// duplicate token, and push the duplicate token onto the vector of
+ /// lexed tokens.
+ void UpdateLexedTokens(DILToken &result, dil::TokenKind tok_kind,
+ std::string tok_str, uint32_t tok_pos);
+
+ /// Return the lexed token N+1 positions ahead of the 'current' token
+ /// being handled by the DIL parser.
+ const DILToken &LookAhead(uint32_t N);
+
+ const DILToken &AcceptLookAhead(uint32_t N);
+
+ /// Return the index for the 'current' token being handled by the DIL parser.
+ uint32_t GetCurrentTokenIdx() { return m_tokens_idx; }
+
+ /// Return the current token to be handled by the DIL parser.
+ DILToken &GetCurrentToken() { return m_lexed_tokens[m_tokens_idx]; }
+
+ /// Update the index for the 'current' token, to point to the next lexed
+ /// token.
+ bool IncrementTokenIdx() {
+ if (m_tokens_idx >= m_lexed_tokens.size() - 1)
+ return false;
+
+ m_tokens_idx++;
+ return true;
+ }
+
+ /// Set the index for the 'current' token (to be handled by the parser)
+ /// to a particular position. Used for either committing 'look ahead' parsing
+ /// or rolling back tentative parsing.
+ bool ResetTokenIdx(uint32_t new_value) {
+ if (new_value > m_lexed_tokens.size() - 1)
+ return false;
+
+ m_tokens_idx = new_value;
+ return true;
+ }
+
+private:
+ // The input string we are lexing & parsing.
+ std::string m_expr;
+
+ // The current position of the lexer within m_expr (the character position,
+ // within the string, of the next item to be lexed).
+ std::string::iterator m_cur_pos;
+
+ // Holds all of the tokens lexed so far.
+ std::vector<DILToken> m_lexed_tokens;
+
+ // Index into m_lexed_tokens; indicates which token the DIL parser is
+ // currently trying to parse/handle.
+ uint32_t m_tokens_idx;
+
+ // "invalid" token; to be returned by lexer when 'look ahead' fails.
+ DILToken m_invalid_token;
+};
+
+} // namespace dil
+
+} // namespace lldb_private
+
+#endif // LLDB_VALUEOBJECT_DILLEXER_H_
diff --git a/lldb/source/ValueObject/DILLexer.cpp b/lldb/source/ValueObject/DILLexer.cpp
new file mode 100644
index 00000000000000..4c2b0b1813bb96
--- /dev/null
+++ b/lldb/source/ValueObject/DILLexer.cpp
@@ -0,0 +1,205 @@
+//===-- DILLexer.cpp ------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// This implements the recursive descent parser for the Data Inspection
+// Language (DIL), and its helper functions, which will eventually underlie the
+// 'frame variable' command. The language that this parser recognizes is
+// described in lldb/docs/dil-expr-lang.ebnf
+//
+//===----------------------------------------------------------------------===//
+
+#include "lldb/ValueObject/DILLexer.h"
+#include "llvm/ADT/StringMap.h"
+
+namespace lldb_private {
+
+namespace dil {
+
+// For fast keyword lookup. More keywords will be added later.
+const llvm::StringMap<dil::TokenKind> Keywords = {
+ {"namespace", dil::TokenKind::kw_namespace},
+};
+
+const std::string DILToken::getTokenName(dil::TokenKind kind) {
+ switch (kind) {
+ case dil::TokenKind::coloncolon:
+ return "coloncolon";
+ case dil::TokenKind::eof:
+ return "eof";
+ case dil::TokenKind::identifier:
+ return "identifier";
+ case dil::TokenKind::kw_namespace:
+ return "namespace";
+ case dil::TokenKind::l_paren:
+ return "l_paren";
+ case dil::TokenKind::r_paren:
+ return "r_paren";
+ case dil::TokenKind::unknown:
+ return "unknown";
+ default:
+ return "token_name";
+ }
+}
+
+static bool Is_Letter(char c) {
+ if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'))
+ return true;
+ return false;
+}
+
+static bool Is_Digit(char c) { return ('0' <= c && c <= '9'); }
+
+// A word starts with a letter, underscore, or dollar sign, followed by
+// letters ('a'..'z','A'..'Z'), digits ('0'..'9'), and/or underscores.
+bool DILLexer::Is_Word(std::string::iterator start, uint32_t &length) {
+ bool done = false;
+ bool dollar_start = false;
+
+ // Must not start with a digit.
+ if (m_cur_pos == m_expr.end() || Is_Digit(*m_cur_pos))
+ return false;
+
+ // First character *may* be a '$', for a register name or convenience
+ // variable.
+ if (*m_cur_pos == '$') {
+ dollar_start = true;
+ ++m_cur_pos;
+ length++;
+ }
+
+ // Contains only letters, digits or underscores
+ for (; m_cur_pos != m_expr.end() && !done; ++m_cur_pos) {
+ char c = *m_cur_pos;
+ if (!Is_Letter(c) && !Is_Digit(c) && c != '_') {
+ done = true;
+ break;
+ } else
+ length++;
+ }
+
+ if (dollar_start && length > 1) // Must have something besides just '$'
+ return true;
+
+ if (!dollar_start && length > 0)
+ return true;
+
+ // Not a valid word, so re-set the lexing position.
+ m_cur_pos = start;
+ return false;
+}
+
+void DILLexer::UpdateLexedTokens(DILToken &result, dil::TokenKind tok_kind,
+ std::string tok_str, uint32_t tok_pos) {
+ DILToken new_token;
+ result.setValues(tok_kind, tok_str, tok_pos);
+ new_token = result;
+ m_lexed_tokens.push_back(std::move(new_token));
+}
+
+bool DILLexer::Lex(DILToken &result, bool look_ahead) {
+ bool retval = true;
+
+ if (!look_ahead) {
+ // We're being asked for the 'next' token, and not a part of a LookAhead.
+ // Check to see if we've already lexed it and pushed it onto our tokens
+ // vector; if so, return the next token from the vector, rather than doing
+ // more lexing.
+ if ((m_tokens_idx != UINT_MAX) &&
+ (m_tokens_idx < m_lexed_tokens.size() - 1)) {
+ result = m_lexed_tokens[m_tokens_idx + 1];
+ return retval;
+ }
+ }
+
+ // Skip over whitespace (spaces).
+ while (m_cur_pos != m_expr.end() && *m_cur_pos == ' ')
+ m_cur_pos++;
+
+ // Check to see if we've reached the end of our input string.
+ if (m_cur_pos == m_expr.end()) {
+ UpdateLexedTokens(result, dil::TokenKind::eof, "", m_expr.length());
+ return retval;
+ }
+
+ uint32_t position = m_cur_pos - m_expr.begin();
+ ;
+ std::string::iterator start = m_cur_pos;
+ uint32_t length = 0;
+ if (Is_Word(start, length)) {
+ dil::TokenKind kind;
+ std::string word = m_expr.substr(position, length);
+ auto iter = Keywords.find(word);
+ if (iter != Keywords.end())
+ kind = iter->second;
+ else
+ kind = dil::TokenKind::identifier;
+
+ UpdateLexedTokens(result, kind, word, position);
+ return true;
+ }
+
+ switch (*m_cur_pos) {
+ case '(':
+ m_cur_pos++;
+ UpdateLexedTokens(result, dil::TokenKind::l_paren, "(", position);
+ return true;
+ case ')':
+ m_cur_pos++;
+ UpdateLexedTokens(result, dil::TokenKind::r_paren, ")", position);
+ return true;
+ case ':':
+ if (position + 1 < m_expr.size() && m_expr[position + 1] == ':') {
+ m_cur_pos += 2;
+ UpdateLexedTokens(result, dil::TokenKind::coloncolon, "::", position);
+ return true;
+ }
+ break;
+ default:
+ break;
+ }
+ // Empty Token
+ result.setValues(dil::TokenKind::none, "", m_expr.length());
+ return false;
+}
+
+const DILToken &DILLexer::LookAhead(uint32_t N) {
+ uint32_t extra_lexed_tokens = m_lexed_tokens.size() - m_tokens_idx - 1;
+
+ if (N + 1 < extra_lexed_tokens)
+ return m_lexed_tokens[m_tokens_idx + N + 1];
+
+ uint32_t remaining_tokens =
+ (m_tokens_idx + N + 1) - m_lexed_tokens.size() + 1;
+
+ bool done = false;
+ bool look_ahead = true;
+ while (!done && remaining_tokens > 0) {
+ DILToken tok;
+ Lex(tok, look_ahead);
+ if (tok.getKind() == dil::TokenKind::eof)
+ done = true;
+ remaining_tokens--;
+ };
+
+ if (remaining_tokens > 0) {
+ m_invalid_token.setValues(dil::TokenKind::invalid, "", 0);
+ return m_invalid_token;
+ }
+
+ return m_lexed_tokens[m_tokens_idx + N + 1];
+}
+
+const DILToken &DILLexer::AcceptLookAhead(uint32_t N) {
+ if (m_tokens_idx + N + 1 > m_lexed_tokens.size())
+ return m_invalid_token;
+
+ m_tokens_idx += N + 1;
+ return m_lexed_tokens[m_tokens_idx];
+}
+
+} // namespace dil
+
+} // namespace lldb_private
diff --git a/lldb/unittests/ValueObject/CMakeLists.txt b/lldb/unittests/ValueObject/CMakeLists.txt
index 8fcc8d62a79979..952f5411a98057 100644
--- a/lldb/unittests/ValueObject/CMakeLists.txt
+++ b/lldb/unittests/ValueObject/CMakeLists.txt
@@ -1,5 +1,6 @@
add_lldb_unittest(LLDBValueObjectTests
DumpValueObjectOptionsTests.cpp
+ DILLexerTests.cpp
LINK_LIBS
lldbValueObject
diff --git a/lldb/unittests/ValueObject/DILLexerTests.cpp b/lldb/unittests/ValueObject/DILLexerTests.cpp
new file mode 100644
index 00000000000000..ec6ff86b64d36b
--- /dev/null
+++ b/lldb/unittests/ValueObject/DILLexerTests.cpp
@@ -0,0 +1,193 @@
+//===-- DILLexerTests.cpp --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "lldb/ValueObject/DILLexer.h"
+#include "llvm/ADT/StringRef.h"
+#include "gtest/gtest.h"
+#include <string>
+
+using llvm::StringRef;
+
+TEST(DILLexerTests, SimpleTest) {
+ StringRef dil_input_expr("simple_var");
+ uint32_t tok_len = 10;
+ lldb_private::dil::DILLexer dil_lexer(dil_input_expr);
+ lldb_private::dil::DILToken dil_token;
+ dil_token.setKind(lldb_private::dil::TokenKind::unknown);
+ EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::unknown);
+ dil_lexer.Lex(dil_token);
+ EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::identifier);
+ EXPECT_EQ(dil_token.getSpelling(), "simple_var");
+ EXPECT_EQ(dil_token.getLength(), tok_len);
+ dil_lexer.Lex(dil_token);
+ EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::eof);
+}
+
+TEST(DILLexerTests, TokenKindTest) {
+ StringRef dil_input_expr("namespace");
+ lldb_private::dil::DILLexer dil_lexer(dil_input_expr);
+ lldb_private::dil::DILToken dil_token;
+ dil_token.setKind(lldb_private::dil::TokenKind::unknown);
+
+ dil_lexer.Lex(dil_token);
+ EXPECT_EQ(dil_lexer.GetCurrentTokenIdx(), UINT_MAX);
+ dil_lexer.ResetTokenIdx(0);
+
+ EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::kw_namespace);
+ EXPECT_TRUE(dil_token.isNot(lldb_private::dil::TokenKind::identifier));
+ EXPECT_FALSE(dil_token.is(lldb_private::dil::TokenKind::l_paren));
+ EXPECT_TRUE(dil_token.isOneOf(lldb_private::dil::TokenKind::eof,
+ lldb_private::dil::TokenKind::kw_namespace));
+ EXPECT_FALSE(dil_token.isOneOf(lldb_private::dil::TokenKind::l_paren,
+ lldb_private::dil::TokenKind::r_paren,
+ lldb_private::dil::TokenKind::coloncolon,
+ lldb_private::dil::TokenKind::eof));
+
+ dil_token.setKind(lldb_private::dil::TokenKind::identifier);
+ EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::identifier);
+}
+
+TEST(DILLexerTests, LookAheadTest) {
+ StringRef dil_input_expr("(anonymous namespace)::some_var");
+ lldb_private::dil::DILLexer dil_lexer(dil_input_expr);
+ lldb_private::dil::DILToken dil_token;
+ dil_token.setKind(lldb_private::dil::TokenKind::unknown);
+ uint32_t expect_loc = 23;
+
+ dil_lexer.Lex(dil_token);
+ EXPECT_EQ(dil_lexer.GetCurrentTokenIdx(), UINT_MAX);
+ dil_lexer.ResetTokenIdx(0);
+
+ // Current token is '('; check the next 4 tokens, to make
+ // sure they are the identifier 'anonymous', the namespace keyword,
+ // ')' and '::', in that order.
+ EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::l_paren);
+ EXPECT_EQ(dil_lexer.LookAhead(0).getKind(),
+ lldb_private::dil::TokenKind::identifier);
+ EXPECT_EQ(dil_lexer.LookAhead(0).getSpelling(), "anonymous");
+ EXPECT_EQ(dil_lexer.LookAhead(1).getKind(),
+ lldb_private::dil::TokenKind::kw_namespace);
+ EXPECT_EQ(dil_lexer.LookAhead(2).getKind(),
+ lldb_private::dil::TokenKind::r_paren);
+ EXPECT_EQ(dil_lexer.LookAhead(3).getKind(),
+ lldb_private::dil::TokenKind::coloncolon);
+ // Verify we've advanced our position counter (lexing location) in the
+ // input 23 characters (the length of '(anonymous namespace)::'.
+ EXPECT_EQ(dil_lexer.GetLocation(), expect_loc);
+
+ // Our current index should still be 0, as we only looked ahead; we are still
+ // officially on the '('.
+ EXPECT_EQ(dil_lexer.GetCurrentTokenIdx(), 0);
+
+ // Accept the 'lookahead', so our current token is '::', which has the index
+ // 4 in our vector of tokens (which starts at zero).
+ dil_token = dil_lexer.AcceptLookAhead(3);
+ EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::coloncolon);
+ EXPECT_EQ(dil_lexer.GetCurrentTokenIdx(), 4);
+
+ // Lex the final variable name in the input string
+ dil_lexer.Lex(dil_token);
+ dil_lexer.IncrementTokenIdx();
+ EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::identifier);
+ EXPECT_EQ(dil_token.getSpelling(), "some_var");
+ EXPECT_EQ(dil_lexer.GetCurrentTokenIdx(), 5);
+
+ dil_lexer.Lex(dil_token);
+ EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::eof);
+}
+
+TEST(DILLexerTests, MultiTokenLexTest) {
+ StringRef dil_input_expr("This string has several identifiers");
+ lldb_private::dil::DILLexer dil_lexer(dil_input_expr);
+ lldb_private::dil::DILToken dil_token;
+ dil_token.setKind(lldb_private::dil::TokenKind::unknown);
+
+ dil_lexer.Lex(dil_token);
+ EXPECT_EQ(dil_lexer.GetCurrentTokenIdx(), UINT_MAX);
+ dil_lexer.ResetTokenIdx(0);
+
+ EXPECT_EQ(dil_token.getSpelling(), "This");
+ EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::identifier);
+ dil_lexer.Lex(dil_token);
+ dil_lexer.IncrementTokenIdx();
+
+ EXPECT_EQ(dil_token.getSpelling(), "string");
+ EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::identifier);
+ dil_lexer.Lex(dil_token);
+ dil_lexer.IncrementTokenIdx();
+
+ EXPECT_EQ(dil_token.getSpelling(), "has");
+ EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::identifier);
+ dil_lexer.Lex(dil_token);
+ dil_lexer.IncrementTokenIdx();
+
+ EXPECT_EQ(dil_token.getSpelling(), "several");
+ EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::identifier);
+ dil_lexer.Lex(dil_token);
+ dil_lexer.IncrementTokenIdx();
+
+ EXPECT_EQ(dil_token.getSpelling(), "identifiers");
+ EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::identifier);
+ dil_lexer.Lex(dil_token);
+ dil_lexer.IncrementTokenIdx();
+
+ EXPECT_EQ(dil_token.getSpelling(), "");
+ EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::eof);
+}
+
+TEST(DILLexerTests, IdentifiersTest) {
+ std::vector<std::string> valid_identifiers = {
+ "$My_name1",
+ "$pc",
+ "abcd",
+ "ab cd",
+ "_",
+ "_a",
+ "_a_",
+ "a_b",
+ "this",
+ "self",
+ "a",
+ "MyName"
+ };
+ std::vector<std::string> invalid_identifiers = {
+ "234",
+ "2a",
+ "2",
+ "$",
+ "1MyName",
+ "",
+ "namespace"
+ };
+
+ // Verify that all of the valid identifiers come out as identifier tokens.
+ for (auto str : valid_identifiers) {
+ StringRef dil_input_expr(str);
+ lldb_private::dil::DILLexer dil_lexer(dil_input_expr);
+ lldb_private::dil::DILToken dil_token;
+ dil_token.setKind(lldb_private::dil::TokenKind::unknown);
+
+ dil_lexer.Lex(dil_token);
+ EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::identifier);
+ }
+
+ // Verify that none of the invalid identifiers come out as identifier tokens.
+ for (auto str : invalid_identifiers) {
+ StringRef dil_input_expr(str);
+ lldb_private::dil::DILLexer dil_lexer(dil_input_expr);
+ lldb_private::dil::DILToken dil_token;
+ dil_token.setKind(lldb_private::dil::TokenKind::unknown);
+
+ dil_lexer.Lex(dil_token);
+ EXPECT_TRUE(dil_token.isNot(lldb_private::dil::TokenKind::identifier));
+ EXPECT_TRUE(dil_token.isOneOf(lldb_private::dil::TokenKind::unknown,
+ lldb_private::dil::TokenKind::none,
+ lldb_private::dil::TokenKind::eof,
+ lldb_private::dil::TokenKind::kw_namespace));
+ }
+}
Update CMakeLists.txt to build DILLexer.cpp.
✅ With the latest revision this PR passed the C/C++ code formatter.
/// Class defining the tokens generated by the DIL lexer and used by the
/// DIL parser.
class DILToken {
Since we already have a dil:: namespace, maybe we can drop the DIL prefix?
/// Class for doing the simple lexing required by DIL.
class DILLexer {
public:
  DILLexer(llvm::StringRef dil_expr) : m_expr(dil_expr.str()) {
I think if we accept StringRef as input, we shouldn't copy the data but should work with the provided string view (and assume we don't outlive it). If you need the lexer to own the text (and you probably don't), then accept std::string and move from it.
  /// Update the index for the 'current' token, to point to the next lexed
  /// token.
  bool IncrementTokenIdx() {
Shouldn't Lex() do this automatically?
Not really. Lex() is called from LookAhead, when we definitely do not want to automatically increment the token index.
  /// being handled by the DIL parser.
  const DILToken &LookAhead(uint32_t N);

  const DILToken &AcceptLookAhead(uint32_t N);
I think this API might be simpler. The lexer doesn't actually need to re-lex; the results will always be the same. We only need to roll back occasionally, but we'll always be processing the same sequence of tokens the second time.
So the lexer can always add the tokens to the m_lexed_tokens vector, and we only need GetCurrentTokenIdx() and ResetTokenIdx() to do the rollback.
That would definitely be nice, and if it's true, I'd consider taking this even further: given that we're going to be processing about a line of text at most, we might be able to just eagerly lex the whole string and avoid the whole on-demand parsing business. If so, maybe we don't even need the lexer class, and the lexing could consist of a single function like std::vector<Token> Lex(string)?
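For illustration only, a rough sketch of what such a lex-everything-up-front free function could look like, covering just the handful of tokens this patch knows about (the Token/TokenKind types here are stand-ins, not the patch's own types):

#include "llvm/ADT/StringRef.h"
#include <cstdint>
#include <string>
#include <vector>

enum class TokenKind { identifier, l_paren, r_paren, coloncolon, colon, eof };

struct Token {
  TokenKind kind;
  std::string spelling;
  uint32_t position;
};

// Lex the entire (short) expression up front; the parser then just walks the
// returned vector and never has to call back into the lexer.
std::vector<Token> LexAll(llvm::StringRef expr) {
  std::vector<Token> tokens;
  llvm::StringRef rest = expr;
  while (true) {
    rest = rest.ltrim(' ');
    uint32_t pos = static_cast<uint32_t>(expr.size() - rest.size());
    if (rest.empty()) {
      tokens.push_back({TokenKind::eof, "", pos});
      return tokens;
    }
    if (rest.consume_front("::"))
      tokens.push_back({TokenKind::coloncolon, "::", pos});
    else if (rest.consume_front(":"))
      tokens.push_back({TokenKind::colon, ":", pos});
    else if (rest.consume_front("("))
      tokens.push_back({TokenKind::l_paren, "(", pos});
    else if (rest.consume_front(")"))
      tokens.push_back({TokenKind::r_paren, ")", pos});
    else {
      // Anything else up to the next delimiter is treated as an identifier.
      llvm::StringRef word = rest.take_until(
          [](char c) { return c == ' ' || c == '(' || c == ')' || c == ':'; });
      rest = rest.drop_front(word.size());
      tokens.push_back({TokenKind::identifier, word.str(), pos});
    }
  }
}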
Re werat's comment: This is already what the lexer does. It never re-lexes anything. It stores the lexed tokens in a vector as it lexes them, and keeps track, via index into the vector, which token is the 'current' one (as far as the parser is concerned). Accepting the lookahead basically involves just updating the index.
That's good to hear, but it still makes the implementation complicated. Separating lexing from the traversal through the lexed tokens would make things much easier to follow. I understand traditional lexers can't do that, but they are parsing megabytes of text. We're going to be parsing approximately one line of code here.
Yesterday I also thought of another advantage of early/eager parsing -- we can report errors (things like invalid tokens and such) early. Then the other functions ("get next token" and such) shouldn't have any failure modes except running off the end of the token stream (which I think we could handle by pretending the stream ends with an infinite amount of EOF tokens).
lldb/source/ValueObject/DILLexer.cpp (Outdated)
  }

  uint32_t position = m_cur_pos - m_expr.begin();
  ;
Suggested change: delete the stray ;
lldb/source/ValueObject/DILLexer.cpp (Outdated)
  bool look_ahead = true;
  while (!done && remaining_tokens > 0) {
    DILToken tok;
    Lex(tok, look_ahead);
Should we check the return value here?
TEST(DILLexerTests, SimpleTest) {
  StringRef dil_input_expr("simple_var");
  uint32_t tok_len = 10;
  lldb_private::dil::DILLexer dil_lexer(dil_input_expr);
IMO we can drop the dil_ prefix here. All these tests are in the context of DIL, so it's clear what it's all about. Suggested change:
- lldb_private::dil::DILLexer dil_lexer(dil_input_expr);
+ lldb_private::dil::DILLexer lexer(input_expr);
namespace dil {

enum class TokenKind {
Optional idea: Make this a non-class enum inside the (DIL)Token class so that it can be referred to as Token::coloncolon
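For illustration, a minimal sketch of that layout; judging by the revised tests later in this thread (which spell it Token::identifier), this is roughly the shape the patch ended up with:

#include <cstdint>
#include <string>
#include <utility>

class Token {
public:
  // Plain (non-class) enum nested in the class, so callers write Token::eof.
  enum Kind { coloncolon, eof, identifier, l_paren, r_paren, unknown };

  Token(Kind kind, std::string spelling, uint32_t start)
      : m_kind(kind), m_spelling(std::move(spelling)), m_start_pos(start) {}

  Kind GetKind() const { return m_kind; }
  bool Is(Kind kind) const { return m_kind == kind; }

private:
  Kind m_kind;
  std::string m_spelling;
  uint32_t m_start_pos;
};

// Usage: token.Is(Token::coloncolon) instead of token.is(dil::TokenKind::coloncolon).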
  void setValues(dil::TokenKind kind, std::string spelling, uint32_t start) {
    m_kind = kind;
    m_spelling = spelling;
    m_start_pos = start;
  }
Could we use the assignment operator instead (token = Token(kind, spelling, start))?
    m_tokens_idx = UINT_MAX;
  }

  bool Lex(DILToken &result, bool look_ahead = false);
Suggested change:
- bool Lex(DILToken &result, bool look_ahead = false);
+ std::optional<DILToken> Lex(bool look_ahead = false);
  /// Update 'result' with the other parameter values, create a
  /// duplicate token, and push the duplicate token onto the vector of
  /// lexed tokens.
  void UpdateLexedTokens(DILToken &result, dil::TokenKind tok_kind,
Should all of these APIs really be public?
A few of them can be private (I'll take care of that). Many of them could be protected (mostly just called from the parser, which I can designate as a friend), except that if I make them protected then my unittest, which needs to access them, can't access them. I've been trying and trying to find a way to allow my unit tests access these methods when they're protected, but I can't seem to make it work. Do you know the magic secret for this?
There are workarounds, but no magic secrets. If something is called by the parser (which is the only user of the class anyway), then it should be public (btw, friends can see even private members, not just the public ones). And unit tests should generally test using the public APIs (as that's what the users will use). There are exceptions to that, but they usually involve situations where the public API requires arguments that would be difficult/impossible to mock in a test, which shouldn't be the case here.
Separating the public API from the private implementation details would really help this class, as I really don't know which one of these functions is meant to be called from the outside.
lldb/source/ValueObject/DILLexer.cpp (Outdated)
  // Empty Token
  result.setValues(dil::TokenKind::none, "", m_expr.length());
  return false;
What is the meaning of an empty token?
Roughly the same as unknown. I will change the TokenKind above to unknown.
  };

  // Verify that all of the valid identifiers come out as identifier tokens.
  for (auto str : valid_identifiers) {
Suggested change:
- StringRef dil_input_expr(str);
- lldb_private::dil::DILLexer dil_lexer(dil_input_expr);
+ SCOPED_TRACE(str);
+ lldb_private::dil::DILLexer dil_lexer(str);
the explicit string ref construction is unnecessary. SCOPED_TRACE ensures that in case of a test failure, the error message will contain the string which caused it to fail.
}

TEST(DILLexerTests, MultiTokenLexTest) {
  StringRef dil_input_expr("This string has several identifiers");
I think this would read better if there were a helper function that would lex the string and then return the lexing result in a nicely-comparable form. E.g., with a function like std::vector<std::pair<TokenKind, std::string>> Lex(StringRef input), this whole test would boil down to:
EXPECT_THAT(Lex("This string has several identifiers"),
testing::ElementsAre(testing::Pair(TokenKind::identifier, "This"),
testing::Pair(TokenKind::identifier, "string"),
testing::Pair(TokenKind::identifier, "has"),
testing::Pair(TokenKind::identifier, "several"),
testing::Pair(TokenKind::identifier, "identifiers"),
testing::Pair(TokenKind::eof, "")));
Making it easy to write tests like this is important, as I expect most of the lexer tests to be of this form, and I want to encourage people to write more tests like this for various corner cases, of which there are many in a lexer implementation (for example, a test that a colon at the end of a string does not access beyond the end of the string when it checks whether it's part of ::).
lldb/source/ValueObject/DILLexer.cpp (Outdated)
    return true;
  }

  switch (*m_cur_pos) {
An iterator is sort of a natural representation of a position in a string, but it also makes it hard to use many of the nice (safe) string APIs. For example, if we had a StringRef m_rest to represent the unparsed portion of the string, then this block could be written as:
constexpr std::pair<TokenKind, const char *> operators[] = {
{TokenKind::l_paren, "("},
{TokenKind::r_paren, ")"},
{TokenKind::coloncolon, "::"},
{TokenKind::colon, ":"},
};
size_t position = m_rest.data() - m_input.data();
for (auto [kind, str]: operators) {
if (m_rest.consume_front(str)) {
UpdateLexedTokens(result, kind, str, position);
return true;
}
}
(Notice how I do not have to worry about running beyond the end of the string, how the "current position" is updated automatically, and how the tokens which are prefixes of one another are handled by putting the longer string first.)
There are also other places where this could be useful. E.g. skipping over whitespace could be implemented as m_rest = m_rest.ltrim() and similar.
- Remove "DIL" prefix from DILTokenKind and DILToken. - Change the token kind from an enum class to an enum inside the Token class. - Use CamelCase for all the method names. - Replace Token::SetValues method with assignments. - Use a StringRef, not std::string, to hold the input string in the lexer. - Update the lexer to lex all the tokens at one time. Added two new methods for this: LexAll and GetNextToken. - Made some of the Lexer methods private. - Replaces StringMap with StringSwitch for fast keyword lookups. - Updated GetTokenName to directly return StringRefs; removed default case from switch statement. - Cleaned up code format in IsLetter & IsDigit. - Updated IsWord too return an iterator range containing the word (if any). - Updated Lex function (now called by LexAll) to return an llvm::Expected token; removed look_ahead checks; changed the operator lexing to use a vector of operators (as suggested). - Cleaned up LookAhead method, now that we know all tokens have already been lexed. - Added helper function to unittests, to help check a sequence of tokens. - Generally cleaned up the tests to deal with all the code changes.
I think I have addressed all the comments & requests. Please review this again. Thanks!
There are some simplifications (and one rewrite :P) that I'd like to talk about, but I think we're not far.
The main thing that's bothering me is this identifier vs. keyword issue (and for this part, I'd like to loop in @jimingham (at least)).
Your implementation takes a very strict view of what's a possible identifier (it must consist of a very specific set of characters, appearing in a specific order, and it must not be a keyword). In contrast, the current "frame variable" implementation basically treats anything it doesn't recognise (including whitespace) as a variable name (and it has no keywords):
(lldb) v a*b
error: no variable named 'a*b' found in this frame
(lldb) v "a b"
error: no variable named 'a b' found in this frame
(lldb) v 123
error: no variable named '123' found in this frame
(lldb) v namespace
error: no variable named 'namespace' found in this frame
Now, obviously, in order to expand the language, we need to restrict the set of variable names, but I don't think we need to do it so aggressively. I don't think anyone will complain if we make it harder for them to access a variable called a*b, but, for example, namespace and 💩 are perfectly valid variable names in many languages (one of them is C).
For this reason, I think it'd be better to have a very liberal definition of what constitutes a possible variable name (identifier). Instead of prescribing a character sequence, I think it would be better to say that anything that doesn't contain one of the characters we recognize (basically: operators) is an identifier. IOW, to do something like frame variable does right now.
As for keywords, I think it'd be best to have as few of them as possible, and for the rest, to treat their keyword-ness as context-dependent wherever possible. What I mean by that is to recognise them as keywords only within contexts where such usage would be legal. The namespace keyword is a prime example of that. I think the only place where this can appear as a keyword in the context of the DIL is within the (anonymous namespace) group. If that's the case, then I think we should be able to detect that and disambiguate the usage, so that e.g. v namespace prints the variable called namespace, and v (anonymous namespace)::namespace prints the variable called namespace in an anonymous namespace (in an evil language which has anonymous namespaces but allows you to use variables called namespace).
What do you all think?
namespace lldb_private {

namespace dil {
Suggested change:
- namespace lldb_private {
- namespace dil {
+ namespace lldb_private::dil {
  Token() : m_kind(Kind::none), m_spelling(""), m_start_pos(0) {}

  void SetKind(Kind kind) { m_kind = kind; }
Could we get rid of these (and of the none token kind)? Ideally, I'd like to treat the token as a value type (so you can assign and copy it, but not mess with it), and one that is always valid (no uninitialized (none) state).
I can get rid of the constructor on line 44, and I can get rid of the none token, but I need to keep SetKind, in order to separate ">>" into two ">"s, in the case where we have determined that ">>" is actually two single angle brackets closing a template, such as std::vector<std::pair<int,int>>. In that case, I'll also need to insert a token into the middle of the tokens vector -- not great, but may not happen that often, and the vector shouldn't be that long.
That doesn't remove your ability to modify a token, just how you do it (you update the token as a whole, instead of doing it piecewise). So, instead of something like token.SetKind(right_angle); token.SetSpelling(">"); you'd do token = Token(right_angle, ">", token.GetPosition()).
That said, modification of the token stream and multi-pass (tentative) parsing doesn't sound like an ideal combination. How do you ensure that the modification does not leak into subsequent actions if the tentative parse is aborted?
Even if you can, the fact that you have to consider this makes me uneasy. Would it be possible to do this without modifying the token stream? So like, when the parser (I'm assuming this happens in the parser, as you need more context to disambiguate these) encounters a ">>" token and the current context allows you to treat this as a double closing template bracket, you act as if you encountered two ">" tokens, but without actually updating the tokens in the lexer.
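One way to picture that idea, as a rough sketch with stand-in types (not a claim about what the patch does): the parser keeps a small "half-consumed" flag for a ">>" token instead of splitting the token in the lexer's vector, so an aborted tentative parse leaves the stream untouched.

#include <cstddef>
#include <string>
#include <vector>

struct Token {
  enum Kind { greater, greatergreater, identifier, eof } kind;
  std::string spelling;
};

// Parser-side view of the token stream: a ">>" token can be consumed one '>'
// at a time by tracking a flag, so the lexer's vector is never rewritten.
class AngleBracketCursor {
public:
  explicit AngleBracketCursor(const std::vector<Token> &toks) : m_toks(toks) {}

  // Try to consume a single closing '>' at the current position.
  bool ConsumeCloseAngle() {
    if (m_idx >= m_toks.size())
      return false;
    const Token &tok = m_toks[m_idx];
    if (tok.kind == Token::greater) {
      ++m_idx;
      return true;
    }
    if (tok.kind == Token::greatergreater) {
      if (!m_half_consumed) {
        m_half_consumed = true; // first '>' of ">>": token stream untouched
        return true;
      }
      m_half_consumed = false; // second '>': now step past the ">>" token
      ++m_idx;
      return true;
    }
    return false;
  }

private:
  const std::vector<Token> &m_toks;
  size_t m_idx = 0;
  bool m_half_consumed = false;
};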
  llvm::Expected<bool> LexAll();

  /// Return the lexed token N+1 positions ahead of the 'current' token
I feel like there are too many ways to navigate the token stream here. You can either call GetCurrentToken+IncrementTokenIdx, or GetNextToken(which I guess increments the index automatically), or LookAhead+AcceptLookAhead.
I think it would be better to start with something simple (we can add more or revamp the existing API if it turns out to be clunky). What would you say to something like:
const Token &LookAhead(uint32_t N /* add `=1` if you want*/);
const Token &GetCurrentToken() { return LookAhead(0); } // just a fancy name for a look ahead of zero
void Advance(uint32_t N = 1); // advance the token stream
The parser really needs a way to save & restore/reset the token index, because there are places in the parser where it does tentative parsing & then decides to rollback. It does so by saving the current token index, then doing the tentative parsing (which can advance the index some number of tokens), and then (for rolling back) setting the current token index back to the saved value.
So I don't think the simple API you've outlined above would be sufficient.
I'm fine with tentative parse and roll back APIs. I'm commenting on the other APIs which advance through the token stream linearly (but in a very baroque fashion). IOW, my proposal was to replace GetCurrentToken, IncrementTokenIdx, GetNextToken, LookAhead and AcceptLookAhead with the three functions above (exact names TBD), and keep GetCurrentTokenIdx and ResetTokenIdx as they are.
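For reference, the tentative-parse pattern being discussed, sketched here with toy types (the real DIL lexer/parser APIs may differ): save the index, try a parse, and restore the index if it does not pan out.

#include <algorithm>
#include <cstdint>
#include <string>
#include <utility>
#include <vector>

struct Token {
  std::string spelling;
};

// Toy stand-in for the lexer's token navigation; assumes the token vector is
// non-empty and ends with an eof-like sentinel.
class TokenStream {
public:
  explicit TokenStream(std::vector<Token> toks) : m_toks(std::move(toks)) {}

  const Token &GetCurrentToken() const { return m_toks[m_idx]; }
  void Advance(uint32_t n = 1) {
    m_idx = std::min<uint32_t>(m_idx + n,
                               static_cast<uint32_t>(m_toks.size() - 1));
  }
  uint32_t GetCurrentTokenIdx() const { return m_idx; }
  void ResetTokenIdx(uint32_t idx) { m_idx = idx; }

private:
  std::vector<Token> m_toks;
  uint32_t m_idx = 0;
};

// Tentative parsing: remember where we were, try something, and roll back via
// ResetTokenIdx if it does not work out.
bool TryConsumeColonColon(TokenStream &stream) {
  uint32_t saved_idx = stream.GetCurrentTokenIdx();
  if (stream.GetCurrentToken().spelling == "::") {
    stream.Advance();
    return true;
  }
  stream.ResetTokenIdx(saved_idx); // nothing consumed; position restored
  return false;
}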
  bool ResetTokenIdx(uint32_t new_value) {
    if (new_value > m_lexed_tokens.size() - 1)
      return false;

    m_tokens_idx = new_value;
    return true;
  }
Suggested change:
- bool ResetTokenIdx(uint32_t new_value) {
-   if (new_value > m_lexed_tokens.size() - 1)
-     return false;
-   m_tokens_idx = new_value;
-   return true;
- }
+ void ResetTokenIdx(uint32_t new_value) {
+   assert(new_value < m_lexed_tokens.size());
+   m_tokens_idx = new_value;
+ }
(AIUI, the only usage of this function will be to restore a previous (and valid) position, so any error here is definitely a bug)
    return true;
  }

  uint32_t GetLocation() { return m_cur_pos - m_expr.begin(); }
should this be private?
lldb/source/ValueObject/DILLexer.cpp (Outdated)
}

const Token &DILLexer::LookAhead(uint32_t N) {
  if (m_tokens_idx + N + 1 < m_lexed_tokens.size())
You didn't say anything about what you make of my suggestion to use an "infinite" stream of eof tokens at the end. The current implementation uses an infinite stream of invalid tokens, but I doubt that anything cares whether it's working with an invalid or eof token (if you're looking ahead, chances are you know what you're expecting to find, and you just want to know whether your token matches that expectation).
That would allow us to get rid of another pseudo-token and the m_invalid_token member.
I think the "infinite" stream of eof tokens at the end is ok; I will give it a try. :-)
using llvm::StringRef;

bool VerifyExpectedTokens(
There's a reason why I used EXPECT_THAT(..., ElementsAre(...)) in my example -- it gives much better error messages in case of failure: it will print the expected vector, the actual vector, and also the element which differs. With an implementation like this, all you get is error: false != true. Which one would you rather debug?
  auto success = lexer.LexAll();

  if (!success) {
    EXPECT_TRUE(false);
  }
Suggested change:
- auto success = lexer.LexAll();
- if (!success) {
-   EXPECT_TRUE(false);
- }
+ ASSERT_THAT_EXPECTED(lexer.LexAll(), llvm::HasValue(true));
  EXPECT_EQ(token.GetSpelling(), "simple_var");
  EXPECT_EQ(token.GetLength(), tok_len);
  token = lexer.GetNextToken();
  ;
delete
  token = lexer.GetNextToken();
  EXPECT_EQ(token.GetKind(), lldb_private::dil::Token::identifier);
  EXPECT_EQ(token.GetSpelling(), "simple_var");
  EXPECT_EQ(token.GetLength(), tok_len);
I'm wondering if the GetLength function is really necessary. How often is it going to be called? Because if it's not very often, we might as well use GetSpelling().size().
The only way that the DIL lexer can handle keywords correctly would be if it knew in advance what language it was parsing for. Otherwise, we're going to be over-broad for some language we support. Another consideration is that the DIL is also expected to parse the names given to children by synthetic child providers. At present we don't put any restrictions on those, so if you wanted you could produce a synthetic child with more or less any name. So I agree with Pavel: as much as possible the lexer should be permissive, really only handling the tokens we use to delimit words.
For instance, currently the name the synthetic child provider gives to the nth element of a std::vector is "[n]", so the lexer can't know about [] as a separate entity. It has to just be a name.
I agree with everything except for the last part. The current parser already treats FWIW, I believe that the main source of keywords in Caroline's implementation is types (for casts). I think they could be handled generically (just ask the target whether the identifier names a type), were it not for the nasty C(++) habit of creating multiword types ( (*) I was recently made aware of an unfortunate difference in behavior of
I know that these are different languages, but this still seems like it could confuse some people. I don't have any specific ideas on how to fix/improve this though...
If DIL is not allowed to recognize any operators except period, arrow, and parentheses, it seems to me that it won't be any more powerful than the current 'frame variable' implementation, in which case what is the point of having it at all? (As a matter of fact, it would be less powerful, since 'frame variable' also recognizes square brackets and star as valid operators and treats them accordingly). At the same time, I understand the concern about wanting to be able to parse synthetic variable names, which can contain arbitrary operator-like symbols. So I have a tentative proposal: How about insisting that synthetic variable names need to be surrounded by special delimiters, such as backquote or slash? Or having some other way of marking them as special?
These are unfortunately visible to the user, and they will have to type them in
I don't want to make them ugly, but I do think we need to ask the user to distinguish them in some way. Otherwise I don't see how we can make DIL any more powerful than the current 'frame variable'.
We would have to change how synthetic children are made currently for array-like things for this to work. If I have a std::vector called foo and the DIL parses (lldb) v foo[0], it will either ask foo for a child named "[0]" or
We still have the problem of all the synthetic child providers in the wild that might not do this.
That comes back to how soon we expect to know the language as we are parsing.
Yes, collection classes that use arbitrary types for the keys are problematic. How would we show the map if the keys are structures with some non-trivial compare function in a way that wouldn't be awful to use? I don't think we can get around people having to know that
I don't understand why we couldn't emulate exactly what happens now. Current This roundaboutness has a very nice property in that you can specify the child number in any way you want, and it will get normalized to the right form:
And it definitely gives the impression that lldb "sees" into the array index, so I wouldn't at all be surprised if someone tried That's not to say that having the formatters provide more information about the structure of the children wouldn't be nice. For example, in the DAP protocol, we're expected to divide the children of an object into "named" and "indexed" categories. We can't do that right now, as we don't get that information. However, I don't think this needs to be tied to this work in any way.
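For readers unfamiliar with the mechanism being referenced here: looking up a synthetic child by its bracketed name can be illustrated with the public SB API (a hypothetical helper; frame variable itself goes through the internal ValueObject equivalents):

#include "lldb/API/SBValue.h"
#include <cstdint>

// Resolve e.g. "foo[0]": ask the (possibly synthetic) value for the index of
// the child named "[0]", then fetch the child at that index.
lldb::SBValue GetBracketedChild(lldb::SBValue value, const char *bracketed_name) {
  uint32_t idx = value.GetIndexOfChildWithName(bracketed_name); // e.g. "[0]"
  return value.GetChildAtIndex(idx); // invalid SBValue if no such child exists
}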
I think there could be some reasonable middle ground here. Quoting/escaping every synthetic child is not going to fly. However, I wouldn't have a problem with asking users to quote "weird" child names. So, like, if there's a variable called
Also, if we have this feature, I don't think it needs to be limited to synthetic children. There's no reason why not to use the same mechanism for regular variables as well. Even though most (but not all) languages do not support weird variable names like
Complex key types are hard for sure, and I think the current setup is perfectly reasonable for them. It's just that the result for integral keys (which are pretty common) is not ideal. It would be nice if users had a way to answer questions like "in my map of million keys, which value does the number 47 map to?" without an expression evaluator (which will often not compile). Any solution here is most certainly going to require the data formatters to provide more info...
- Remove 'namespace' as a keyword (make it a normal identifier).
- Remove 'invalid' and 'none' token types.
- Remove unnecessary SetKind and GetLength methods from Tokens.
- Re-arrange Lexer: give it a static Create method, which pre-lexes all the tokens; make the Lex method static; pull the IsWord method out of the Lexer class; make the Lexer constructor private.
- Remove LexAll, GetLocation, UpdateLexedTokens, AcceptLookAhead, GetNextToken, and IncrementTokenIdx methods from Lexer class.
- Add new 'Advance' method (to help replace some of the removed methods).
- Update indexing in LookAhead (LookAhead(0) now means the 'current' token).
- Remove m_cur_pos data member from Lexer class.
- Replace m_invalid_token with m_eof_token.
- Use 'remainder' StringRef to help with lexing.
- Update the unit tests to handle all the code changes in the Lexer.
- Update the unit tests to use ASSERT_THAT_EXPECTED to check llvm::Expected return values.
- Update the unit tests to use "testing::ElementsAre(testing::Pair ..." to verify all the lexed tokens; also added helper function ExtractTokenData, and deleted function VerifyExpectedTokens.
I think I have addressed/fixed all the latest review comments. Please take another look now.
Apart from the (mainly stylistic) inline comments, the biggest problem I see is that the definition of an identifier is still too narrow. The restriction on the dollar sign is completely unnecessary, as C will let you put that anywhere. And it doesn't allow any non-ASCII characters.
I really think this should be based on a deny- rather than an allow-list. Any character we don't claim for ourselves should be fair game for an identifier. If someone manages to enter the backspace character (\x7f) into the expression, then so be it.
The question of "identifiers" starting with digits is interesting. Personally, I think it'd be fine to reject those (and require the currenly-vapourware quoting syntax), because I suspect you want to accept number suffixes, and I think it'd be confusing to explain why 123x
is a valid identifier but 123u
is not, but I suspect some might have a different opinion.
We could continue discussing that here, or we could accept everything here, and postpone this discussion for the patch which starts parsing numbers. Up to you..
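A deny-list flavoured sketch of identifier lexing, for illustration (the exact set of reserved characters below is an assumption, not the patch's):

#include "llvm/ADT/StringRef.h"
#include <optional>

// Anything that is not whitespace or a character the lexer claims for an
// operator counts as part of an identifier.
static std::optional<llvm::StringRef> TakeIdentifier(llvm::StringRef &remainder) {
  llvm::StringRef word = remainder.take_until([](char c) {
    return llvm::StringRef(" \t\n()[]<>.:&*-").find(c) != llvm::StringRef::npos;
  });
  if (word.empty())
    return std::nullopt;
  remainder = remainder.drop_front(word.size());
  return word;
}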
  token = lexer.GetCurrentToken();
  EXPECT_EQ(token.GetKind(), lldb_private::dil::Token::identifier);
  EXPECT_EQ(token.GetSpelling(), "simple_var");
  EXPECT_EQ(token.GetSpelling().size(), tok_len);
Now the size check is unnecessary, as this is just testing the StringRef implementation
StringRef input_expr("namespace"); | ||
llvm::Expected<lldb_private::dil::DILLexer> maybe_lexer = | ||
lldb_private::dil::DILLexer::Create(input_expr); | ||
ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded()); | ||
lldb_private::dil::DILLexer lexer(*maybe_lexer); | ||
lldb_private::dil::Token token = | ||
lldb_private::dil::Token(lldb_private::dil::Token::unknown, "", 0); | ||
lexer.Advance(); | ||
token = lexer.GetCurrentToken(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
StringRef input_expr("namespace"); | |
llvm::Expected<lldb_private::dil::DILLexer> maybe_lexer = | |
lldb_private::dil::DILLexer::Create(input_expr); | |
ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded()); | |
lldb_private::dil::DILLexer lexer(*maybe_lexer); | |
lldb_private::dil::Token token = | |
lldb_private::dil::Token(lldb_private::dil::Token::unknown, "", 0); | |
lexer.Advance(); | |
token = lexer.GetCurrentToken(); | |
lldb_private::dil::Token token = | |
lldb_private::dil::Token(lldb_private::dil::Token::identifier, "ident", 0); |
(The reason is that this looks like a test for the token kind testing functions. You don't actually need the token to come out of the lexer for that. And we already have tests that check that producing a sequence of characters produces an "identifier" token.)
  // Our current index should still be 0, as we only looked ahead; we are still
  // officially on the '('.
  EXPECT_EQ(lexer.GetCurrentTokenIdx(), (uint32_t)0);
Suggested change:
- EXPECT_EQ(lexer.GetCurrentTokenIdx(), (uint32_t)0);
+ EXPECT_EQ(lexer.GetCurrentTokenIdx(), 0u);
(and elsewhere)
  EXPECT_EQ(lexer.GetCurrentTokenIdx(), (uint32_t)5);
  // Verify we've advanced our position counter (lexing location) in the
  // input 23 characters (the length of '(anonymous namespace)::'.
  EXPECT_EQ(token.GetLocation(), expect_loc);
you might just say strlen("(anonymous namespace)::") and skip the constant and the comment (this is a test, so it doesn't have to be super-efficient)
StringRef input_expr("This string has (several ) ::identifiers"); | ||
llvm::Expected<lldb_private::dil::DILLexer> maybe_lexer = | ||
lldb_private::dil::DILLexer::Create(input_expr); | ||
ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded()); | ||
lldb_private::dil::DILLexer lexer(*maybe_lexer); | ||
lldb_private::dil::Token token = | ||
lldb_private::dil::Token(lldb_private::dil::Token::unknown, "", 0); | ||
|
||
std::vector<std::pair<lldb_private::dil::Token::Kind, std::string>> | ||
lexer_tokens_data = ExtractTokenData(lexer); | ||
|
||
EXPECT_THAT( | ||
lexer_tokens_data, | ||
testing::ElementsAre( | ||
testing::Pair(lldb_private::dil::Token::identifier, "This"), | ||
testing::Pair(lldb_private::dil::Token::identifier, "string"), | ||
testing::Pair(lldb_private::dil::Token::identifier, "has"), | ||
testing::Pair(lldb_private::dil::Token::l_paren, "("), | ||
testing::Pair(lldb_private::dil::Token::identifier, "several"), | ||
testing::Pair(lldb_private::dil::Token::r_paren, ")"), | ||
testing::Pair(lldb_private::dil::Token::coloncolon, "::"), | ||
testing::Pair(lldb_private::dil::Token::identifier, "identifiers"), | ||
testing::Pair(lldb_private::dil::Token::eof, ""))); | ||
lexer.Advance(); | ||
token = lexer.GetCurrentToken(); | ||
EXPECT_EQ(token.GetSpelling(), ""); | ||
EXPECT_EQ(token.GetKind(), lldb_private::dil::Token::eof); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
StringRef input_expr("This string has (several ) ::identifiers"); | |
llvm::Expected<lldb_private::dil::DILLexer> maybe_lexer = | |
lldb_private::dil::DILLexer::Create(input_expr); | |
ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded()); | |
lldb_private::dil::DILLexer lexer(*maybe_lexer); | |
lldb_private::dil::Token token = | |
lldb_private::dil::Token(lldb_private::dil::Token::unknown, "", 0); | |
std::vector<std::pair<lldb_private::dil::Token::Kind, std::string>> | |
lexer_tokens_data = ExtractTokenData(lexer); | |
EXPECT_THAT( | |
lexer_tokens_data, | |
testing::ElementsAre( | |
testing::Pair(lldb_private::dil::Token::identifier, "This"), | |
testing::Pair(lldb_private::dil::Token::identifier, "string"), | |
testing::Pair(lldb_private::dil::Token::identifier, "has"), | |
testing::Pair(lldb_private::dil::Token::l_paren, "("), | |
testing::Pair(lldb_private::dil::Token::identifier, "several"), | |
testing::Pair(lldb_private::dil::Token::r_paren, ")"), | |
testing::Pair(lldb_private::dil::Token::coloncolon, "::"), | |
testing::Pair(lldb_private::dil::Token::identifier, "identifiers"), | |
testing::Pair(lldb_private::dil::Token::eof, ""))); | |
lexer.Advance(); | |
token = lexer.GetCurrentToken(); | |
EXPECT_EQ(token.GetSpelling(), ""); | |
EXPECT_EQ(token.GetKind(), lldb_private::dil::Token::eof); | |
EXPECT_THAT_EXPECTED( | |
ExtractTokenData("This string has (several ) ::identifiers"), | |
llvm::HasValue(testing::ElementsAre( | |
testing::Pair(lldb_private::dil::Token::identifier, "This"), | |
testing::Pair(lldb_private::dil::Token::identifier, "string"), | |
testing::Pair(lldb_private::dil::Token::identifier, "has"), | |
testing::Pair(lldb_private::dil::Token::l_paren, "("), | |
testing::Pair(lldb_private::dil::Token::identifier, "several"), | |
testing::Pair(lldb_private::dil::Token::r_paren, ")"), | |
testing::Pair(lldb_private::dil::Token::coloncolon, "::"), | |
testing::Pair(lldb_private::dil::Token::identifier, "identifiers"), | |
testing::Pair(lldb_private::dil::Token::eof, "")))); |
(and have ExtractTokenData construct the lexer internally and return llvm::Expected<std::vector<...>>)
I think this will reduce the boilerplate in all subsequent tests.
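For readers following along, here is a rough sketch of what such a helper could look like; the exact names and signature are an assumption here, not code from the patch, and it assumes the revised lexer starts positioned on the first token and always ends the stream with an eof token.

#include "lldb/ValueObject/DILLexer.h"
#include "llvm/Support/Error.h"

#include <string>
#include <utility>
#include <vector>

using namespace lldb_private::dil;

// Hypothetical test helper: create the lexer, walk every token, and return
// (kind, spelling) pairs, or propagate the lexer-creation error.
static llvm::Expected<std::vector<std::pair<Token::Kind, std::string>>>
ExtractTokenData(llvm::StringRef input_expr) {
  llvm::Expected<DILLexer> maybe_lexer = DILLexer::Create(input_expr);
  if (!maybe_lexer)
    return maybe_lexer.takeError();

  std::vector<std::pair<Token::Kind, std::string>> data;
  while (true) {
    const Token &token = maybe_lexer->GetCurrentToken();
    data.emplace_back(token.GetKind(), token.GetSpelling());
    if (token.GetKind() == Token::eof)
      break;
    maybe_lexer->Advance();
  }
  return data;
}

A test body then shrinks to a single EXPECT_THAT_EXPECTED(ExtractTokenData(...), llvm::HasValue(...)) call, as in the suggestion above.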
lldb/source/ValueObject/DILLexer.cpp (Outdated)
  llvm::StringRef::iterator cur_pos = expr.end() - remainder.size();
  llvm::StringRef::iterator start = cur_pos;
  bool dollar_start = false;

  // Must not start with a digit.
  if (cur_pos == expr.end() || IsDigit(*cur_pos))
    return std::nullopt;

  // First character *may* be a '$', for a register name or convenience
  // variable.
  if (*cur_pos == '$') {
    dollar_start = true;
    ++cur_pos;
  }

  // Contains only letters, digits or underscores
  for (; cur_pos != expr.end(); ++cur_pos) {
    char c = *cur_pos;
    if (!IsLetter(c) && !IsDigit(c) && c != '_')
      break;
  }

  // If first char is '$', make sure there's at least one mare char, or it's
  // invalid.
  if (dollar_start && (cur_pos - start <= 1)) {
    cur_pos = start;
    return std::nullopt;
  }

  if (cur_pos == start)
    return std::nullopt;

  llvm::StringRef word = expr.substr(start - expr.begin(), cur_pos - start);
  if (remainder.consume_front(word))
    return word;

  return std::nullopt;
This should be equivalent to your code, but I'm only suggesting this to show how code like this can be written in a shorter and more stringref-y fashion. I think the actual algorithm will have to change, as it has a very narrow definition of identifiers.
⬇️ Suggested change
-  llvm::StringRef::iterator cur_pos = expr.end() - remainder.size();
-  llvm::StringRef::iterator start = cur_pos;
-  bool dollar_start = false;
-  // Must not start with a digit.
-  if (cur_pos == expr.end() || IsDigit(*cur_pos))
-    return std::nullopt;
-  // First character *may* be a '$', for a register name or convenience
-  // variable.
-  if (*cur_pos == '$') {
-    dollar_start = true;
-    ++cur_pos;
-  }
-  // Contains only letters, digits or underscores
-  for (; cur_pos != expr.end(); ++cur_pos) {
-    char c = *cur_pos;
-    if (!IsLetter(c) && !IsDigit(c) && c != '_')
-      break;
-  }
-  // If first char is '$', make sure there's at least one mare char, or it's
-  // invalid.
-  if (dollar_start && (cur_pos - start <= 1)) {
-    cur_pos = start;
-    return std::nullopt;
-  }
-  if (cur_pos == start)
-    return std::nullopt;
-  llvm::StringRef word = expr.substr(start - expr.begin(), cur_pos - start);
-  if (remainder.consume_front(word))
-    return word;
-  return std::nullopt;
+  const char *start = remainder.data();
+  remainder.consume_front("$"); // initial '$' is valid
+  remainder = remainder.drop_while([](char c){ return IsDigit(c) || IsLetter(c) || c=='_'; });
+  llvm::StringRef candidate(start, remainder.data()-start);
+  if (candidate.empty() || candidate == "$" || IsDigit(candidate[0]))
+    return std::nullopt;
+  return candidate;
(Not very surprisingly) I find it easier to understand my code than your code, so I would prefer to keep my code. But if you really want me to use something like your code instead I will ...
One question: When constructing 'candidate', you've already removed all the characters that should comprise the candidate word from remainder. Since those characters are no longer there, doesn't that make 'start' invalid? (It's pointing to something that's been removed from 'remainder'?). How can 'remainder.data() - start' possibly work at this point?
The key thing to realize here is that a StringRef is just a fancy name for a pair<const char *, size_t>. On its own, it doesn't make anything valid. It's only valid as long as the string it points to is valid. And the same goes for the validity of pointers you get from the StringRef. This is just a pointer to the parsed string, which will remain valid for as long as that string is around -- regardless of what happens to the StringRef.
That said, I realized a different problem with this code. It will end up modifying the remainder even in the failure case. So here's a slightly different version which avoids that. This one isn't completely equivalent, as I've dropped the requirement on the positioning of the dollar sign, but that's something I very strongly believe we should do anyway.
StringRef candidate = remainder.take_while([](char c) { return IsDigit(c) || IsLetter(c) || c=='_' || c=='$'; });
if (candidate.empty() || IsDigit(candidate[0]))
return std::nullopt;
remainder.drop_front(candidate.size());
return candidate;
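For reference, a self-contained sketch of that take_while-based scan, for anyone who wants to try it outside the patch: IsLetter/IsDigit below stand in for the lexer's own helpers and are assumptions here, and note that drop_front returns a new StringRef, so the result has to be assigned back to the remainder.

#include "llvm/ADT/StringRef.h"
#include <optional>

static bool IsLetter(char c) {
  return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z');
}
static bool IsDigit(char c) { return '0' <= c && c <= '9'; }

// Scan an identifier off the front of 'remainder', leaving 'remainder'
// pointing just past it on success and untouched on failure.
static std::optional<llvm::StringRef> TakeWord(llvm::StringRef &remainder) {
  llvm::StringRef candidate = remainder.take_while(
      [](char c) { return IsDigit(c) || IsLetter(c) || c == '_' || c == '$'; });
  if (candidate.empty() || IsDigit(candidate[0]))
    return std::nullopt;
  remainder = remainder.drop_front(candidate.size());
  return candidate;
}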
lldb/source/ValueObject/DILLexer.cpp (Outdated)
                                    llvm::StringRef &remainder) {
  // Skip over whitespace (spaces).
  remainder = remainder.ltrim();
  llvm::StringRef::iterator cur_pos = expr.end() - remainder.size();
⬇️ Suggested change
-  llvm::StringRef::iterator cur_pos = expr.end() - remainder.size();
+  llvm::StringRef::iterator cur_pos = remainder.begin();
Your suggestion is not correct. "cur_pos" is supposed to refer to the position of the currently-being-lexed word/token/etc in the original input string, not its position in remainder.
But that's the same thing. Both stringrefs are just pointers to the same underlying string. StringRef::iterator is just a fancy name for const char *. Would it look better to you if this was written as const char *cur_pos = remainder.data(); ?
Or even better, since this is only used for constructing the position value below (I'm ignoring the check below), you can just drop this and construct position directly as remainder.data() - expr.data().
"Both stringrefs are pointers to the same underlying string" -- I did not know that. SO...there's a string (const char *) in memory, that both StringRefs point to, and updating the StringRefs never updates the underlying memory contents? Hm...it would be nice if there was documentation somewhere that made this clear (the doxygen stuff does NOT). Thanks for the explanation & clarification.
lldb/source/ValueObject/DILLexer.cpp (Outdated)
  llvm::StringRef::iterator cur_pos = expr.end() - remainder.size();

  // Check to see if we've reached the end of our input string.
  if (remainder.empty() || cur_pos == expr.end())
⬇️ Suggested change
-  if (remainder.empty() || cur_pos == expr.end())
+  if (remainder.empty())
If the two conditions aren't equivalent, I'd like to know why.
Because they're checking on two different StringRefs? If this function has been called properly (with the correct arguments), then the two probably should be equivalent. But I don't like to make assumptions like that without checking them...
When I see code like this I start to wonder whether there's an edge case I'm missing. Defensive checks may make sense on some external API which is called from a bunch of places, but here we're talking about a private function that's only called from a single place. At best that deserves an assertion (you could e.g. say assert(expr.end() == remainder.end()) at the beginning of the function).
lldb/source/ValueObject/DILLexer.cpp (Outdated)
  if (maybe_word) {
    llvm::StringRef word = *maybe_word;
    return Token(Token::identifier, word.str(), position);
  }
⬇️ Suggested change
-  if (maybe_word) {
-    llvm::StringRef word = *maybe_word;
-    return Token(Token::identifier, word.str(), position);
-  }
+  if (maybe_word)
+    return Token(Token::identifier, maybe_word->str(), position);
(it's shorter, and I don't think the extra local variable helps anything)
lldb/source/ValueObject/DILLexer.cpp (Outdated)
  if (remainder.consume_front(str)) {
    cur_pos += strlen(str);
    return Token(kind, str, position);
  }
⬇️ Suggested change
-  if (remainder.consume_front(str)) {
-    cur_pos += strlen(str);
-    return Token(kind, str, position);
-  }
+  if (remainder.consume_front(str))
+    return Token(kind, str, position);
(cur_pos isn't used after this)
On Jan 31, 2025, at 2:21 AM, Pavel Labath ***@***.***> wrote:
@labath commented on this pull request.
Apart from the (mainly stylistic) inline comments, the biggest problem I see is that the definition of an identifier is still too narrow. The restriction on the dollar sign is completely unnecessary as C will let you put that anywhere <https://godbolt.org/z/o7qbfeWve>. And it doesn't allow any non-ascii characters.
I really think this should be based on a deny- rather than an allow-list. Any character we don't claim for ourselves should be fair game for an identifier. If someone manages to enter the backspace character (\x7f) into the expression, then so be it.
The question of "identifiers" starting with digits is interesting. Personally, I think it'd be fine to reject those (and require the currenly-vapourware quoting syntax), because I suspect you want to accept number suffixes, and I think it'd be confusing to explain why 123x is a valid identifier but 123u is not, but I suspect some might have a different opinion.
Are there any languages we know of that allow digits as the initial identifier character? Seems like that just makes parsing the language hard for no very clear benefit, so I'd be surprised if there are many that allow that. If there aren't any that we're likely to have to support, I wouldn't object to not treating tokens beginning with a number as an identifier.
We could continue discussing that here, or we could accept everything here, and postpone this discussion for the patch which starts parsing numbers. Up to you.
In lldb/unittests/ValueObject/DILLexerTests.cpp <#123521 (comment)>:
> +TEST(DILLexerTests, SimpleTest) {
+ StringRef input_expr("simple_var");
+ uint32_t tok_len = 10;
+ llvm::Expected<lldb_private::dil::DILLexer> maybe_lexer =
+ lldb_private::dil::DILLexer::Create(input_expr);
+ ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded());
+ lldb_private::dil::DILLexer lexer(*maybe_lexer);
+ lldb_private::dil::Token token =
+ lldb_private::dil::Token(lldb_private::dil::Token::unknown, "", 0);
+ EXPECT_EQ(token.GetKind(), lldb_private::dil::Token::unknown);
+
+ lexer.Advance();
+ token = lexer.GetCurrentToken();
+ EXPECT_EQ(token.GetKind(), lldb_private::dil::Token::identifier);
+ EXPECT_EQ(token.GetSpelling(), "simple_var");
+ EXPECT_EQ(token.GetSpelling().size(), tok_len);
Now the size check is unnecessary, as this is just testing the StringRef implementation
In lldb/unittests/ValueObject/DILLexerTests.cpp <#123521 (comment)>:
> + StringRef input_expr("namespace");
+ llvm::Expected<lldb_private::dil::DILLexer> maybe_lexer =
+ lldb_private::dil::DILLexer::Create(input_expr);
+ ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded());
+ lldb_private::dil::DILLexer lexer(*maybe_lexer);
+ lldb_private::dil::Token token =
+ lldb_private::dil::Token(lldb_private::dil::Token::unknown, "", 0);
+ lexer.Advance();
+ token = lexer.GetCurrentToken();
⬇️ Suggested change
- StringRef input_expr("namespace");
- llvm::Expected<lldb_private::dil::DILLexer> maybe_lexer =
- lldb_private::dil::DILLexer::Create(input_expr);
- ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded());
- lldb_private::dil::DILLexer lexer(*maybe_lexer);
- lldb_private::dil::Token token =
- lldb_private::dil::Token(lldb_private::dil::Token::unknown, "", 0);
- lexer.Advance();
- token = lexer.GetCurrentToken();
+ lldb_private::dil::Token token =
+ lldb_private::dil::Token(lldb_private::dil::Token::identifier, "ident", 0);
(The reason is that this looks like a test for the token kind testing functions. You don't actually need the token to come out of the lexer for that. And we already have tests that check that producing a sequence of characters produces an "identifier" token.)
In lldb/unittests/ValueObject/DILLexerTests.cpp <#123521 (comment)>:
> + token = lexer.GetCurrentToken();
+
+ // Current token is '('; check the next 4 tokens, to make
+ // sure they are the identifier 'anonymous', the identifier 'namespace'
+ // ')' and '::', in that order.
+ EXPECT_EQ(token.GetKind(), lldb_private::dil::Token::l_paren);
+ EXPECT_EQ(lexer.LookAhead(1).GetKind(), lldb_private::dil::Token::identifier);
+ EXPECT_EQ(lexer.LookAhead(1).GetSpelling(), "anonymous");
+ EXPECT_EQ(lexer.LookAhead(2).GetKind(), lldb_private::dil::Token::identifier);
+ EXPECT_EQ(lexer.LookAhead(2).GetSpelling(), "namespace");
+ EXPECT_EQ(lexer.LookAhead(3).GetKind(), lldb_private::dil::Token::r_paren);
+ EXPECT_EQ(lexer.LookAhead(4).GetKind(), lldb_private::dil::Token::coloncolon);
+
+ // Our current index should still be 0, as we only looked ahead; we are still
+ // officially on the '('.
+ EXPECT_EQ(lexer.GetCurrentTokenIdx(), (uint32_t)0);
⬇️ Suggested change
- EXPECT_EQ(lexer.GetCurrentTokenIdx(), (uint32_t)0);
+ EXPECT_EQ(lexer.GetCurrentTokenIdx(), 0u);
(and elsewhere)
In lldb/unittests/ValueObject/DILLexerTests.cpp <#123521 (comment)>:
> +
+ // Accept the 'lookahead', so our current token is '::', which has the index
+ // 4 in our vector of tokens (which starts at zero).
+ lexer.Advance(4);
+ token = lexer.GetCurrentToken();
+ EXPECT_EQ(token.GetKind(), lldb_private::dil::Token::coloncolon);
+ EXPECT_EQ(lexer.GetCurrentTokenIdx(), (uint32_t)4);
+
+ lexer.Advance();
+ token = lexer.GetCurrentToken();
+ EXPECT_EQ(token.GetKind(), lldb_private::dil::Token::identifier);
+ EXPECT_EQ(token.GetSpelling(), "some_var");
+ EXPECT_EQ(lexer.GetCurrentTokenIdx(), (uint32_t)5);
+ // Verify we've advanced our position counter (lexing location) in the
+ // input 23 characters (the length of '(anonymous namespace)::'.
+ EXPECT_EQ(token.GetLocation(), expect_loc);
you might just say strlen("(anonymous namespace)::") and skip the constant and the comment (this is a test, so it doesn't have to be super-efficient)
In lldb/unittests/ValueObject/DILLexerTests.cpp <#123521 (comment)>:
> + StringRef input_expr("This string has (several ) ::identifiers");
+ llvm::Expected<lldb_private::dil::DILLexer> maybe_lexer =
+ lldb_private::dil::DILLexer::Create(input_expr);
+ ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded());
+ lldb_private::dil::DILLexer lexer(*maybe_lexer);
+ lldb_private::dil::Token token =
+ lldb_private::dil::Token(lldb_private::dil::Token::unknown, "", 0);
+
+ std::vector<std::pair<lldb_private::dil::Token::Kind, std::string>>
+ lexer_tokens_data = ExtractTokenData(lexer);
+
+ EXPECT_THAT(
+ lexer_tokens_data,
+ testing::ElementsAre(
+ testing::Pair(lldb_private::dil::Token::identifier, "This"),
+ testing::Pair(lldb_private::dil::Token::identifier, "string"),
+ testing::Pair(lldb_private::dil::Token::identifier, "has"),
+ testing::Pair(lldb_private::dil::Token::l_paren, "("),
+ testing::Pair(lldb_private::dil::Token::identifier, "several"),
+ testing::Pair(lldb_private::dil::Token::r_paren, ")"),
+ testing::Pair(lldb_private::dil::Token::coloncolon, "::"),
+ testing::Pair(lldb_private::dil::Token::identifier, "identifiers"),
+ testing::Pair(lldb_private::dil::Token::eof, "")));
+ lexer.Advance();
+ token = lexer.GetCurrentToken();
+ EXPECT_EQ(token.GetSpelling(), "");
+ EXPECT_EQ(token.GetKind(), lldb_private::dil::Token::eof);
⬇️ Suggested change
- StringRef input_expr("This string has (several ) ::identifiers");
- llvm::Expected<lldb_private::dil::DILLexer> maybe_lexer =
- lldb_private::dil::DILLexer::Create(input_expr);
- ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded());
- lldb_private::dil::DILLexer lexer(*maybe_lexer);
- lldb_private::dil::Token token =
- lldb_private::dil::Token(lldb_private::dil::Token::unknown, "", 0);
-
- std::vector<std::pair<lldb_private::dil::Token::Kind, std::string>>
- lexer_tokens_data = ExtractTokenData(lexer);
-
- EXPECT_THAT(
- lexer_tokens_data,
- testing::ElementsAre(
- testing::Pair(lldb_private::dil::Token::identifier, "This"),
- testing::Pair(lldb_private::dil::Token::identifier, "string"),
- testing::Pair(lldb_private::dil::Token::identifier, "has"),
- testing::Pair(lldb_private::dil::Token::l_paren, "("),
- testing::Pair(lldb_private::dil::Token::identifier, "several"),
- testing::Pair(lldb_private::dil::Token::r_paren, ")"),
- testing::Pair(lldb_private::dil::Token::coloncolon, "::"),
- testing::Pair(lldb_private::dil::Token::identifier, "identifiers"),
- testing::Pair(lldb_private::dil::Token::eof, "")));
- lexer.Advance();
- token = lexer.GetCurrentToken();
- EXPECT_EQ(token.GetSpelling(), "");
- EXPECT_EQ(token.GetKind(), lldb_private::dil::Token::eof);
+ EXPECT_THAT_EXPECTED(
+ ExtractTokenData("This string has (several ) ::identifiers"),
+ llvm::HasValue(testing::ElementsAre(
+ testing::Pair(lldb_private::dil::Token::identifier, "This"),
+ testing::Pair(lldb_private::dil::Token::identifier, "string"),
+ testing::Pair(lldb_private::dil::Token::identifier, "has"),
+ testing::Pair(lldb_private::dil::Token::l_paren, "("),
+ testing::Pair(lldb_private::dil::Token::identifier, "several"),
+ testing::Pair(lldb_private::dil::Token::r_paren, ")"),
+ testing::Pair(lldb_private::dil::Token::coloncolon, "::"),
+ testing::Pair(lldb_private::dil::Token::identifier, "identifiers"),
+ testing::Pair(lldb_private::dil::Token::eof, ""))));
(and have ExtractTokenData construct the lexer internally and return llvm::Expected<std::vector<...>>)
I think this will reduce the boiler plate in all subsequent tests.
In lldb/unittests/ValueObject/DILLexerTests.cpp <#123521 (comment)>:
> + llvm::Expected<lldb_private::dil::DILLexer> maybe_lexer =
+ lldb_private::dil::DILLexer::Create(str);
+ ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded());
+ lldb_private::dil::DILLexer lexer(*maybe_lexer);
+ lldb_private::dil::Token token =
+ lldb_private::dil::Token(lldb_private::dil::Token::unknown, "", 0);
+ lexer.Advance();
+ token = lexer.GetCurrentToken();
+ EXPECT_EQ(token.GetKind(), lldb_private::dil::Token::identifier);
For example, this could boil down to this
⬇️ Suggested change
- llvm::Expected<lldb_private::dil::DILLexer> maybe_lexer =
- lldb_private::dil::DILLexer::Create(str);
- ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded());
- lldb_private::dil::DILLexer lexer(*maybe_lexer);
- lldb_private::dil::Token token =
- lldb_private::dil::Token(lldb_private::dil::Token::unknown, "", 0);
- lexer.Advance();
- token = lexer.GetCurrentToken();
- EXPECT_EQ(token.GetKind(), lldb_private::dil::Token::identifier);
+ EXPECT_THAT_EXPECTED(ExtractTokenData(str), llvm::HasValue(testing::ElementsAre(testing::Pair(identifier, str))));
In lldb/unittests/ValueObject/DILLexerTests.cpp <#123521 (comment)>:
> + lldb_private::dil::DILLexer::Create(str);
+ ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded());
+ lldb_private::dil::DILLexer lexer(*maybe_lexer);
+ lldb_private::dil::Token token =
+ lldb_private::dil::Token(lldb_private::dil::Token::unknown, "", 0);
+ lexer.Advance();
+ token = lexer.GetCurrentToken();
+ EXPECT_EQ(token.GetKind(), lldb_private::dil::Token::identifier);
+ }
+
+ // Verify that none of the invalid identifiers come out as identifier tokens.
+ for (auto &str : invalid_identifiers) {
+ SCOPED_TRACE(str);
+ llvm::Expected<lldb_private::dil::DILLexer> maybe_lexer =
+ lldb_private::dil::DILLexer::Create(str);
+ if (!maybe_lexer) {
Control flow in a test is usually a bad sign (among other reasons, because we don't write tests for tests). In gtest matcher language, I think what you're trying to express is EXPECT_THAT_EXPECTED(ExtractTokenData(str), testing::Not(llvm::HasValue(testing::Contains(testing::Pair(Token::identifier, testing::_)))) (i.e., parsing does not succeed and return a token of the identifier kind), though it may be better to be even more specific: instead of saying what you do not want to happen, say what you actually expect to be the result of parsing of e.g. "123". (It's true that this will result will change once you start parsing numbers, but I think churn like that is fine for a unit test and serves as evidence of the functional changes you are making)
In lldb/unittests/ValueObject/DILLexerTests.cpp <#123521 (comment)>:
> +//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "lldb/ValueObject/DILLexer.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Testing/Support/Error.h"
+#include "gtest/gtest.h"
+#include <string>
+
+using llvm::StringRef;
+
+std::vector<std::pair<lldb_private::dil::Token::Kind, std::string>>
I'd suggest adding using namespace lldb_private::dil to avoid all of this repetition.
In lldb/unittests/ValueObject/DILLexerTests.cpp <#123521 (comment)>:
> + testing::Pair(lldb_private::dil::Token::r_paren, ")"),
+ testing::Pair(lldb_private::dil::Token::coloncolon, "::"),
+ testing::Pair(lldb_private::dil::Token::identifier, "identifiers"),
+ testing::Pair(lldb_private::dil::Token::eof, "")));
+ lexer.Advance();
+ token = lexer.GetCurrentToken();
+ EXPECT_EQ(token.GetSpelling(), "");
+ EXPECT_EQ(token.GetKind(), lldb_private::dil::Token::eof);
+}
+
+TEST(DILLexerTests, IdentifiersTest) {
+ std::vector<std::string> valid_identifiers = {
+ "$My_name1", "$pc", "abcd", "ab cd", "_", "_a", "_a_",
+ "a_b", "this", "self", "a", "MyName", "namespace"};
+ std::vector<std::string> invalid_identifiers = {"234", "2a", "2",
+ "$", "1MyName", ""};
I think a lone dollar sign should be a valid identifier, at least until we have a reason for it not to be. The cases with numbers are more interesting, and I'm going to add them to our top-level discussion.
In lldb/include/lldb/ValueObject/DILLexer.h <#123521 (comment)>:
> +#include <string>
+#include <vector>
+
+namespace lldb_private::dil {
+
+/// Class defining the tokens generated by the DIL lexer and used by the
+/// DIL parser.
+class Token {
+public:
+ enum Kind {
+ coloncolon,
+ eof,
+ identifier,
+ l_paren,
+ r_paren,
+ unknown,
It looks like the only remaining use of the "unknown" token is to construct "empty" tokens in the tests. I think all of those cases could be avoided by just declaring the Token variable later in the code.
In lldb/include/lldb/ValueObject/DILLexer.h <#123521 (comment)>:
> +
+/// Class defining the tokens generated by the DIL lexer and used by the
+/// DIL parser.
+class Token {
+public:
+ enum Kind {
+ coloncolon,
+ eof,
+ identifier,
+ l_paren,
+ r_paren,
+ unknown,
+ };
+
+ Token(Kind kind, std::string spelling, uint32_t start)
+ : m_kind(kind), m_spelling(spelling), m_start_pos(start) {}
⬇️ Suggested change
- : m_kind(kind), m_spelling(spelling), m_start_pos(start) {}
+ : m_kind(kind), m_spelling(std::move(spelling)), m_start_pos(start) {}
In lldb/include/lldb/ValueObject/DILLexer.h <#123521 (comment)>:
> +};
+
+/// Class for doing the simple lexing required by DIL.
+class DILLexer {
+public:
+ /// Lexes all the tokens in expr and calls the private constructor
+ /// with the lexed tokens.
+ static llvm::Expected<DILLexer> Create(llvm::StringRef expr);
+
+ /// Return the current token to be handled by the DIL parser.
+ const Token &GetCurrentToken() { return m_lexed_tokens[m_tokens_idx]; }
+
+ /// Advance the current token position by N.
+ void Advance(uint32_t N = 1) {
+ // UINT_MAX means uninitialized, no "current" position, so move to start.
+ if (m_tokens_idx == UINT_MAX)
Could we just have the lexer start at token zero straight away?
In lldb/include/lldb/ValueObject/DILLexer.h <#123521 (comment)>:
> + if (m_tokens_idx == UINT_MAX)
+ m_tokens_idx = 0;
+ else if (m_tokens_idx + N >= m_lexed_tokens.size())
+ // N is too large; advance to the end of the lexed tokens.
+ m_tokens_idx = m_lexed_tokens.size() - 1;
+ else
+ m_tokens_idx += N;
+ }
+
+ /// Return the lexed token N positions ahead of the 'current' token
+ /// being handled by the DIL parser.
+ const Token &LookAhead(uint32_t N) {
+ if (m_tokens_idx + N < m_lexed_tokens.size())
+ return m_lexed_tokens[m_tokens_idx + N];
+
+ return m_eof_token;
A valid token stream should always end in an EOF token, right? If so, you can just return the last element.
In lldb/include/lldb/ValueObject/DILLexer.h <#123521 (comment)>:
> + /// Return the index for the 'current' token being handled by the DIL parser.
+ uint32_t GetCurrentTokenIdx() { return m_tokens_idx; }
+
+ /// Set the index for the 'current' token (to be handled by the parser)
+ /// to a particular position. Used for either committing 'look ahead' parsing
+ /// or rolling back tentative parsing.
+ void ResetTokenIdx(uint32_t new_value) {
+ assert(new_value == UINT_MAX || new_value < m_lexed_tokens.size());
+ m_tokens_idx = new_value;
+ }
+
+ uint32_t NumLexedTokens() { return m_lexed_tokens.size(); }
+
+private:
+ DILLexer(llvm::StringRef dil_expr, std::vector<Token> lexed_tokens)
+ : m_expr(dil_expr), m_lexed_tokens(lexed_tokens), m_tokens_idx(UINT_MAX),
⬇️ Suggested change
- : m_expr(dil_expr), m_lexed_tokens(lexed_tokens), m_tokens_idx(UINT_MAX),
+ : m_expr(dil_expr), m_lexed_tokens(std::move(lexed_tokens)), m_tokens_idx(UINT_MAX),
To the best of my knowledge, all the languages that we want to support have roughly the same definition of what a valid identifier is: a letter or underscore, followed by a sequence of letters, digits and underscores, where 'letters' are defined as 'a..z' and 'A..Z'. The ones I've been able to check do not allow arbitrary characters in their identifiers. So that's what I implemented (acknowledging that I currently only recognize ascii at the moment, and fully plan to add utf8 support in the future). I added the ability to recognize the '$' at the front specifically to allow DIL users to ask about registers and LLDB convenience variables, which (to the best of my knowledge) allow '$' only in the first position, and not all by itself. I am not sure I see the benefits of expanding what DIL recognizes as a valid identifier beyond what the languages LLDB supports recognize? Am I missing something? Or (this is quite possible) have I misunderstood the definition of what's a valid identifier for some language we want to support? Since we definitely want to support lexing/parsing of numbers, I do not think it's a good idea for DIL to also allow identifiers to start with numbers.
- Use std::move on std::string & std::vector in constructor initializers.
- Remove some unnecessary code.
- Update ExtractTokenData (helper function in unit tests) to set up the lexer and do the lexing inside the function; return an llvm::Expected value.
- Add 'using namespace lldb_private::dil;' to unit tests; clean up tests accordingly.
- Minor code cleanups in the unit tests.
I went through the thread to understand the current consensus and I see various nice ideas flying around. Let me try to summarize the requirements and the current status quo.
Possible behaviour for DIL:
For cases where the This way all typical cases work as expected by users and more or less match the current behaviour of frame variable. It's still possible to access synthetic children with any names, but it requires knowledge about the escaping. Does this sound like a reasonable approach?
I don't know how you were checking that, but I'm certain that's not the case. I already gave you an example of C code which contradicts all of these (note that there can be difference between what's considered a valid identifier by the specification of a language, and what an actual compiler for that language will accept). And I'm not even mentioning all of the names that can be constructed by synthetic child providers. You say you want to add utf-8 support. How do you intend to do that? Do you want to enumerate all of the supported characters in each language? Check which language supports variable names in Klingon? Some of the rules can be really obscure. For example, Java accepts £ ( OTOH, if you just accept all of the high-bit ascii values as valid characters, then you can support utf8 with a single line of code. And you're not regressing anything because that's exactly what the current implementation does. I don't think this list has to be set in stone. For example,
For me the main benefits are:
That said, I would like to hear what you think are the benefits of not recognizing wider set of identifier values. And I'm not talking about names like |
Thanks for the summary Andy.
SGTM
I already spoke at length about identifier names. Quoting of fancy names SGTM. I don't think its relevant for this patch (lexing), but since you're also mentioning the wider syntax of the language, I want to mention that there's also another kind of disambiguation to consider. Forcing a string to be treated as an identifier is one thing. Another question is forcing an identifier to be treated as a specific kind of entity. For example, if there's a variable and a type with the same name, can we say which one we mean? Or can we say we want to see the value of a global variable We don't exactly support that right now, but e.g.
I'd go with just the second option because I'd like to avoid ambiguities and fallbacks. I think that's more or less the status quo, at least for the types with child providers. We'd need some special handling for C pointers and arrays, but that's also what we already have
I think this is an interesting idea whose usefulness mainly depends on what kind of string operations we have in the language. If there's no way to construct strings, then I don't think it's very useful. You could write things like Overall I'd leave this out for the time being because it doesn't impact parsing or lexing, just how some (currently invalid) syntax trees can be interpreted.
I'm not sure how this is going to help. I assume you're referring to the |
On Feb 2, 2025, at 9:49 PM, cmtice ***@***.***> wrote:
Since we definitely want to support lexing/parsing of numbers, I do not think it's a good idea for DIL to also allow identifiers to start with numbers.
I agree here. We definitely will need to support UTF-8 characters; all the hip new languages use that character set. But allowing initial digits makes parsing sufficiently hard that I don't think it likely there will be languages we need to support that do that. Can somebody even think of a language that allows this?
Jim
There's one small area that our desire to parse the widest range of identifiers comes into conflict with, namely the "persistent variable" namespace. For the C family of languages, we reserve the initial |
- Remove 'unknown' token type.
- Remove UINT_MAX as a valid token index; always start at 0.
- Update IsWord to use Pavel's more efficient StringRef implementation.
- Allow '$' anywhere in an identifier string.
- Adjust unit tests to handle changes mentioned above.
- Clean up test of invalid identifiers (remove if-then-else; split invalid identifiers from unrecognized strings).
I've addressed all the remaining review comments (I think). Hopefully this is ready for approval now? :-)
Yeah, access to persistent variables is one of the first things that I'd add to the "frame var" language. It's true that this can create some conflicts, but I think we don't need to handle those at the lexer level. And variables with dollars in their names are currently accepted in frame var:
One way to handle this ambiguity would be to use the (imaginary) identifier disambiguation/scoping mechanism I alluded to in one of my previous comments. We could e.g. say that identifiers that start with a dollar are reserved by lldb, and if you want to access a variable with that name, you do something like The fact that swift uses identifiers like |
Thinking about this further, I can think of one reason to allow this even without sophisticated string operations -- because this allows you to access identifiers with weird child names. This way we could have users write I'd still try to avoid things like guessing whether the user forgot to add |
#ifndef LLDB_VALUEOBJECT_DILLEXER_H_
#define LLDB_VALUEOBJECT_DILLEXER_H_
I believe our headers don't have the ending underscore.
⬇️ Suggested change
-#ifndef LLDB_VALUEOBJECT_DILLEXER_H_
-#define LLDB_VALUEOBJECT_DILLEXER_H_
+#ifndef LLDB_VALUEOBJECT_DILLEXER_H
+#define LLDB_VALUEOBJECT_DILLEXER_H
#include "llvm/ADT/iterator_range.h" | ||
#include "llvm/Support/Error.h" | ||
#include <cstdint> | ||
#include <limits.h> |
It fits in nicer with the other headers (though maybe you don't need it if you remove the UINT_MAX thing below)
⬇️ Suggested change
-#include <limits.h>
+#include <climits>
    identifier,
    l_paren,
    r_paren,
    unknown,
The unknown token is still here. Are you sure you uploaded the right version of the patch?
  /// Advance the current token position by N.
  void Advance(uint32_t N = 1) {
    // UINT_MAX means uninitialized, no "current" position, so move to start.
    if (m_tokens_idx == UINT_MAX)
And I'd hope that all of the UINT_MAX business can go away if we really start at token zero.
  // "eof" token; to be returned by lexer when 'look ahead' fails.
  Token m_eof_token;
};
// "eof" token; to be returned by lexer when 'look ahead' fails. | |
Token m_eof_token; | |
}; | |
}; |
This looks good now, thanks for your patience.
  if (lexer.NumLexedTokens() == 0)
    return llvm::createStringError("No lexed tokens");

  lexer.ResetTokenIdx(0);
I think this is unnecessary now.
This adds the basic lexer, with unittests, for the Data Inspection Language (DIL) -- see
https://discourse.llvm.org/t/rfc-data-inspection-language/69893
This version of the lexer only handles local variables and namespaces, and is designed to work with
#120971.