Skip to content

Commit 5637c25

Browse files
committed
[libSyntax] Always copy leading and trailing trivia strings into a SyntaxArena buffer
Referencing a string in arbitrary memory is not safe since the source buffer to which it points may have been freed. Instead copy all strings into the SyntaxArena. Since RawSyntax nodes retain their arena, they can be sure that the string won't disappear if it lives in their arena. To avoid lots of small copies, we copy the entire source buffer once into the syntax arena and make StringRefs point into that buffer.
1 parent db3a520 commit 5637c25

File tree

4 files changed

+51
-5
lines changed

4 files changed

+51
-5
lines changed

include/swift/Syntax/SyntaxArena.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,10 @@ class SyntaxArena : public llvm::ThreadSafeRefCountedBase<SyntaxArena> {
4040
void *Allocate(size_t size, size_t alignment) {
4141
return Allocator.Allocate(size, alignment);
4242
}
43+
44+
bool containsPointer(const void *Ptr) {
45+
return getAllocator().identifyObject(Ptr) != llvm::None;
46+
}
4347
};
4448

4549
} // namespace syntax

include/swift/SyntaxParse/SyntaxTreeCreator.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
#include "swift/Parse/SyntaxParseActions.h"
1717
#include "swift/Syntax/References.h"
18+
#include "llvm/ADT/StringRef.h"
1819

1920
namespace swift {
2021
class RawSyntaxTokenCache;
@@ -38,6 +39,12 @@ class SyntaxTreeCreator: public SyntaxParseActions {
3839
unsigned BufferID;
3940
RC<syntax::SyntaxArena> Arena;
4041

42+
/// A string allocated in \c Arena that contains an exact copy of the source
43+
/// file for which this \c SyntaxTreeCreator creates a syntax tree. \c
44+
/// RawSyntax nodes can safely reference text inside this buffer since they
45+
/// retain the \c SyntaxArena which holds the buffer.
46+
StringRef ArenaSourceBuffer;
47+
4148
/// A cache of nodes that can be reused when creating the current syntax
4249
/// tree.
4350
SyntaxParsingCache *SyntaxCache;

lib/Syntax/RawSyntax.cpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,23 @@ Trivia lexTrivia(StringRef TriviaStr) {
8787
return SyntaxTrivia;
8888
}
8989

90+
/// If the \p Str is not allocated in \p Arena, copy it to \p Arena and adjust
91+
/// \p Str to point to the string's copy in \p Arena.
92+
void copyToArenaIfNecessary(StringRef &Str, const RC<SyntaxArena> Arena) {
93+
if (Str.empty()) {
94+
// Empty strings can live wherever they want. Nothing to do.
95+
return;
96+
}
97+
if (Arena->containsPointer(Str.data())) {
98+
// String already in arena. Nothing to do.
99+
return;
100+
}
101+
// Copy string to arena
102+
char *Data = (char *)Arena->Allocate(Str.size(), alignof(char *));
103+
std::uninitialized_copy(Str.begin(), Str.end(), Data);
104+
Str = StringRef(Data, Str.size());
105+
}
106+
90107
// FIXME: If we want thread-safety for tree creation, this needs to be atomic.
91108
unsigned RawSyntax::NextFreeNodeId = 1;
92109

@@ -129,6 +146,8 @@ RawSyntax::RawSyntax(tok TokKind, OwnedString Text, size_t TextLength,
129146
: Arena(Arena), Bits({{unsigned(TextLength), unsigned(Presence), true}}),
130147
RefCount(0) {
131148
assert(Arena && "RawSyntax nodes must always be allocated in an arena");
149+
copyToArenaIfNecessary(LeadingTrivia, Arena);
150+
copyToArenaIfNecessary(TrailingTrivia, Arena);
132151

133152
if (NodeId.hasValue()) {
134153
this->NodeId = NodeId.getValue();

lib/SyntaxParse/SyntaxTreeCreator.cpp

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,11 @@ SyntaxTreeCreator::SyntaxTreeCreator(SourceManager &SM, unsigned bufferID,
4242
Arena(std::move(arena)),
4343
SyntaxCache(syntaxCache),
4444
TokenCache(new RawSyntaxTokenCache()) {
45+
StringRef BufferContent = SM.getEntireTextForBuffer(BufferID);
46+
char *Data = (char *)Arena->Allocate(BufferContent.size(), alignof(char *));
47+
std::uninitialized_copy(BufferContent.begin(), BufferContent.end(), Data);
48+
ArenaSourceBuffer = StringRef(Data, BufferContent.size());
49+
assert(ArenaSourceBuffer == BufferContent);
4550
}
4651

4752
SyntaxTreeCreator::~SyntaxTreeCreator() = default;
@@ -113,14 +118,25 @@ OpaqueSyntaxNode SyntaxTreeCreator::recordToken(tok tokenKind,
113118
StringRef leadingTrivia,
114119
StringRef trailingTrivia,
115120
CharSourceRange range) {
116-
SourceLoc tokLoc = range.getStart().getAdvancedLoc(leadingTrivia.size());
117121
unsigned tokLength =
118122
range.getByteLength() - leadingTrivia.size() - trailingTrivia.size();
119-
CharSourceRange tokRange = CharSourceRange{tokLoc, tokLength};
120-
StringRef tokenText = SM.extractText(tokRange, BufferID);
123+
auto leadingTriviaStartOffset =
124+
SM.getLocOffsetInBuffer(range.getStart(), BufferID);
125+
auto tokStartOffset = leadingTriviaStartOffset + leadingTrivia.size();
126+
auto trailingTriviaStartOffset = tokStartOffset + tokLength;
127+
128+
// Get StringRefs of the token's texts that point into the syntax arena's
129+
// buffer.
130+
StringRef leadingTriviaText =
131+
ArenaSourceBuffer.substr(leadingTriviaStartOffset, leadingTrivia.size());
132+
StringRef tokenText = ArenaSourceBuffer.substr(tokStartOffset, tokLength);
133+
StringRef trailingTriviaText = ArenaSourceBuffer.substr(
134+
trailingTriviaStartOffset, trailingTrivia.size());
135+
121136
auto ownedText = OwnedString::makeRefCounted(tokenText);
122-
auto raw = TokenCache->getToken(Arena, tokenKind, range.getByteLength(),
123-
ownedText, leadingTrivia, trailingTrivia);
137+
auto raw =
138+
TokenCache->getToken(Arena, tokenKind, range.getByteLength(), ownedText,
139+
leadingTriviaText, trailingTriviaText);
124140
OpaqueSyntaxNode opaqueN = raw.get();
125141
raw.resetWithoutRelease();
126142
return opaqueN;

0 commit comments

Comments
 (0)