Skip to content

Commit d6e2b58

Browse files
committed
Preserve whitespace and comments during lexing as Trivia
Store leading and trailing "trivia" around a token, such as whitespace, comments, doc comments, and escaping backticks. These are syntactically important for preserving formatting when printing ASTs but don't semantically affect the program. Tokens take all trailing trivia up to, but not including, the next newline. This is important to maintain checks that statements without semicolon separators start on a new line, among other things. Trivia are now data attached to the ends of tokens, not tokens themselves. Create a new Syntax sublibrary for upcoming immutable, persistent, thread-safe ASTs, which will contain only the syntactic information about source structure, as well as for generating new source code, and structural editing. Proactively move swift::Token into there. Since this patch is getting a bit large, a token fuzzer which checks for round-trip equivalence with the workflow: fuzzer => token stream => file1 => Lexer => token stream => file2 => diff(file1, file2) will arrive in a subsequent commit. This patch does not change the grammar.
1 parent 6a9298c commit d6e2b58

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

56 files changed

+1817
-1049
lines changed

include/swift/AST/ASTContext.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -783,8 +783,6 @@ class ASTContext {
783783

784784
private:
785785
friend class Decl;
786-
Optional<RawComment> getRawComment(const Decl *D);
787-
void setRawComment(const Decl *D, RawComment RC);
788786

789787
Optional<StringRef> getBriefComment(const Decl *D);
790788
void setBriefComment(const Decl *D, StringRef Comment);

include/swift/AST/Attr.h

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
#include "swift/AST/KnownProtocols.h"
2929
#include "swift/AST/Ownership.h"
3030
#include "swift/AST/PlatformKind.h"
31+
#include "swift/AST/RawComment.h"
3132
#include "llvm/ADT/SmallVector.h"
3233
#include "llvm/ADT/StringRef.h"
3334
#include "llvm/Support/ErrorHandling.h"
@@ -958,15 +959,15 @@ class OwnershipAttr : public DeclAttribute {
958959
class RawDocCommentAttr : public DeclAttribute {
959960
/// Source range of the attached comment. This comment is located before
960961
/// the declaration.
961-
CharSourceRange CommentRange;
962+
const RawComment Comment;
962963

963964
public:
964-
RawDocCommentAttr(CharSourceRange CommentRange)
965+
RawDocCommentAttr(RawComment Comment)
965966
: DeclAttribute(DAK_RawDocComment, SourceLoc(), SourceRange(),
966967
/*Implicit=*/false),
967-
CommentRange(CommentRange) {}
968+
Comment(Comment) {}
968969

969-
CharSourceRange getCommentRange() const { return CommentRange; }
970+
const RawComment &getComment() const { return Comment; }
970971

971972
static bool classof(const DeclAttribute *DA) {
972973
return DA->getKind() == DAK_RawDocComment;

include/swift/Basic/String.h

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
//===--- String.h - String storage ------------------------------*- C++ -*-===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2014 - 2016 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See http://swift.org/LICENSE.txt for license information
9+
// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
10+
//
11+
//===----------------------------------------------------------------------===//
12+
//
13+
// This file defines the 'String' storage wrapper, which can hold its own
14+
// unique copy of a string, or merely hold a reference to some point in a
15+
// source buffer, which is assumed to live at least as long as a value of
16+
// this type.
17+
//
18+
//===----------------------------------------------------------------------===//
19+
#ifndef SWIFT_BASIC_STRING_H
20+
#define SWIFT_BASIC_STRING_H
21+
22+
#include "llvm/ADT/IntrusiveRefCntPtr.h"
23+
#include "llvm/ADT/StringRef.h"
24+
25+
using llvm::StringRef;
26+
27+
namespace swift {
28+
29+
class String {
30+
const char *Data;
31+
size_t Length;
32+
bool Managed;
33+
34+
35+
static const char *copyBuffer(const String &Other) {
36+
auto Buffer = (char *)malloc(Other.str().size());
37+
memcpy(Buffer, Other.str().data(), Other.str().size());
38+
return Buffer;
39+
}
40+
41+
public:
42+
String() : Data(nullptr), Length(0), Managed(false) {}
43+
44+
String(const char *Data, size_t Length, bool Managed)
45+
: Data(Data), Length(Length), Managed(Managed) {}
46+
47+
String(StringRef Str, bool IsManaged = false)
48+
: String(Str.data(), Str.size(), IsManaged) {}
49+
50+
String(const String &Other)
51+
: Data(Other.Managed ? copyBuffer(Other) : Other.Data), Length(Other.Length),
52+
Managed(Other.Managed) {}
53+
54+
static String createManaged(const char *Str, size_t Length) {
55+
auto Buffer = malloc(Length);
56+
memcpy(Buffer, Str, Length);
57+
return String { reinterpret_cast<const char *>(Buffer), Length,
58+
/* Managed */ true };
59+
}
60+
61+
static String createManaged(StringRef Str) {
62+
return createManaged(Str.data(), Str.size());
63+
}
64+
65+
static String createUnmanaged(StringRef Str) {
66+
return String { Str, /* Managed */ false };
67+
}
68+
69+
size_t size() const {
70+
return Length;
71+
}
72+
73+
bool empty() const {
74+
return Length == 0;
75+
}
76+
77+
StringRef str() const {
78+
return StringRef { Data, Length };
79+
}
80+
81+
bool operator==(const String &Right) const {
82+
return str() == Right.str();
83+
}
84+
85+
~String() {
86+
if (Managed)
87+
free(reinterpret_cast<void *>(const_cast<char *>(Data)));
88+
}
89+
};
90+
91+
} // end namespace swift
92+
93+
#endif // SWIFT_BASIC_STRING_H
94+

include/swift/IDE/SyntaxModel.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,10 @@ namespace swift {
2424
class ModuleDecl;
2525
class SourceFile;
2626

27+
namespace syntax {
28+
class Trivia;
29+
}
30+
2731
namespace ide {
2832

2933
enum class SyntaxNodeKind : uint8_t {
@@ -182,6 +186,8 @@ class SyntaxModelContext {
182186
struct Implementation;
183187
Implementation &Impl;
184188

189+
void addTrivia(const syntax::Trivia &T, std::vector<SyntaxNode> &Nodes);
190+
185191
public:
186192
explicit SyntaxModelContext(SourceFile &SrcFile);
187193
~SyntaxModelContext();

include/swift/Parse/Lexer.h

Lines changed: 30 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,13 @@
1919

2020
#include "swift/Basic/SourceLoc.h"
2121
#include "swift/Basic/SourceManager.h"
22-
#include "swift/Parse/Token.h"
22+
#include "swift/Syntax/Token.h"
23+
#include "swift/Syntax/Syntax.h"
2324
#include "swift/AST/DiagnosticEngine.h"
2425
#include "llvm/ADT/SmallVector.h"
2526

27+
#include <deque>
28+
2629
namespace swift {
2730
class DiagnosticEngine;
2831
class InFlightDiagnostic;
@@ -88,7 +91,7 @@ class Lexer {
8891

8992
/// @}
9093

91-
Token NextToken;
94+
syntax::Token NextToken;
9295

9396
/// \brief This is true if we're lexing a .sil file instead of a .swift
9497
/// file. This enables the 'sil' keyword.
@@ -99,6 +102,13 @@ class Lexer {
99102
/// InSILBody - This is true when we're lexing the body of a SIL declaration
100103
/// in a SIL file. This enables some context-sensitive lexing.
101104
bool InSILBody = false;
105+
106+
/// The source trivia leading up to the current token.
107+
std::deque<syntax::Trivia> LeadingTrivia;
108+
109+
/// The source trivia after the current token, up to and including the first
110+
/// newline after the token.
111+
std::deque<syntax::Trivia> TrailingTrivia;
102112

103113
public:
104114
/// \brief Lexer state can be saved/restored to/from objects of this class.
@@ -192,10 +202,11 @@ class Lexer {
192202
return CodeCompletionPtr != nullptr;
193203
}
194204

195-
void lex(Token &Result) {
196-
Result = NextToken;
205+
syntax::Token lex() {
206+
auto Result = NextToken;
197207
if (Result.isNot(tok::eof))
198208
lexImpl();
209+
return Result;
199210
}
200211

201212
bool isKeepingComments() const {
@@ -206,7 +217,7 @@ class Lexer {
206217

207218
/// peekNextToken - Return the next token to be returned by Lex without
208219
/// actually lexing it.
209-
const Token &peekNextToken() const { return NextToken; }
220+
const syntax::Token &peekNextToken() const { return NextToken; }
210221

211222
/// \brief Returns the lexer state for the beginning of the given token
212223
/// location. After restoring the state, lexer will return this token and
@@ -216,11 +227,11 @@ class Lexer {
216227
/// \brief Returns the lexer state for the beginning of the given token.
217228
/// After restoring the state, lexer will return this token and continue from
218229
/// there.
219-
State getStateForBeginningOfToken(const Token &Tok) const {
230+
State getStateForBeginningOfToken(syntax::Token Tok) const {
220231
// If the token has a comment attached to it, rewind to before the comment,
221232
// not just the start of the token. This ensures that we will re-lex and
222233
// reattach the comment to the token if rewound to this state.
223-
SourceLoc TokStart = Tok.getCommentStart();
234+
auto TokStart = Tok.getAbsoluteTriviaStart();
224235
if (TokStart.isInvalid())
225236
TokStart = Tok.getLoc();
226237
return getStateForBeginningOfTokenLoc(TokStart);
@@ -256,8 +267,8 @@ class Lexer {
256267
/// resides.
257268
///
258269
/// \param Loc The source location of the beginning of a token.
259-
static Token getTokenAtLocation(const SourceManager &SM, SourceLoc Loc);
260-
270+
static Optional<syntax::Token>
271+
getTokenAtLocation(const SourceManager &SM, SourceLoc Loc);
261272

262273
/// \brief Retrieve the source location that points just past the
263274
/// end of the token referred to by \c Loc.
@@ -368,11 +379,11 @@ class Lexer {
368379
/// \brief Given a string literal token, separate it into string/expr segments
369380
/// of a potentially interpolated string.
370381
static void getStringLiteralSegments(
371-
const Token &Str,
382+
const syntax::Token &Str,
372383
SmallVectorImpl<StringSegment> &Segments,
373384
DiagnosticEngine *Diags);
374385

375-
void getStringLiteralSegments(const Token &Str,
386+
void getStringLiteralSegments(const syntax::Token &Str,
376387
SmallVectorImpl<StringSegment> &Segments) {
377388
return getStringLiteralSegments(Str, Segments, Diags);
378389
}
@@ -382,7 +393,7 @@ class Lexer {
382393
}
383394

384395
/// Get the token that starts at the given location.
385-
Token getTokenAt(SourceLoc Loc);
396+
syntax::Token getTokenAt(SourceLoc Loc);
386397

387398
/// SILBodyRAII - This helper class is used when parsing a SIL body to inform
388399
/// the lexer that SIL-specific lexing should be enabled.
@@ -426,6 +437,7 @@ class Lexer {
426437

427438
void formToken(tok Kind, const char *TokStart);
428439

440+
void skipUpToEndOfLine();
429441
void skipToEndOfLine();
430442

431443
/// Skip to the end of the line of a // comment.
@@ -441,6 +453,12 @@ class Lexer {
441453
void lexOperatorIdentifier();
442454
void lexHexNumber();
443455
void lexNumber();
456+
void lexTrivia(std::deque<syntax::Trivia> &T, bool StopAtFirstNewline = false);
457+
Optional<syntax::Trivia> lexWhitespace(bool StopAtFirstNewline);
458+
Optional<syntax::Trivia> lexComment();
459+
Optional<syntax::Trivia> lexSingleLineComment(syntax::TriviaKind Kind);
460+
Optional<syntax::Trivia> lexBlockComment(syntax::TriviaKind Kind);
461+
Optional<syntax::Trivia> lexDocComment();
444462
static unsigned lexUnicodeEscape(const char *&CurPtr, Lexer *Diags);
445463

446464
unsigned lexCharacter(const char *&CurPtr,

0 commit comments

Comments
 (0)