Skip to content

[Syntax] support nul character as garbage text trivia in libSyntax #14962

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Mar 5, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions include/swift/Parse/Lexer.h
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,16 @@ class Lexer {
};

private:
/// Describes what a nul ('\0') character encountered by the lexer stands for.
enum class NulCharacterKind {
/// The nul character located at the end of the buffer, i.e. its terminator.
BufferEnd,
/// A nul character found at any other position inside the buffer.
Embedded,
/// The nul character located at the code completion position.
CodeCompletion
};

/// For a source location in the current buffer, returns the corresponding
/// pointer.
const char *getBufferPtrForSourceLoc(SourceLoc Loc) const {
Expand Down Expand Up @@ -520,6 +530,8 @@ class Lexer {
/// Try to lex conflict markers by checking for the presence of the start and
/// end of the marker in diff3 or Perforce style respectively.
bool tryLexConflictMarker(bool EatNewline);

/// Classifies the nul character that \p Ptr points at as the code completion
/// marker, the buffer terminator, or a nul embedded in the buffer.
/// \p Ptr must be non-null and must point at a nul character.
NulCharacterKind getNulCharacterKind(const char *Ptr) const;
};

/// Given an ordered token \param Array , get the iterator pointing to the first
Expand Down
104 changes: 67 additions & 37 deletions lib/Parse/Lexer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -351,16 +351,19 @@ void Lexer::skipToEndOfLine(bool EatNewline) {
}
break; // Otherwise, eat other characters.
case 0:
// If this is a random nul character in the middle of a buffer, skip it as
// whitespace.
if (CurPtr-1 != BufferEnd) {
switch (getNulCharacterKind(CurPtr - 1)) {
case NulCharacterKind::Embedded:
// If this is a random nul character in the middle of a buffer, skip it
// as whitespace.
diagnoseEmbeddedNul(Diags, CurPtr-1);
break;
LLVM_FALLTHROUGH;
case NulCharacterKind::CodeCompletion:
continue;
case NulCharacterKind::BufferEnd:
// Otherwise, the last line of the file does not have a newline.
--CurPtr;
return;
}

// Otherwise, the last line of the file does not have a newline.
--CurPtr;
return;
}
}
}
Expand Down Expand Up @@ -422,26 +425,30 @@ void Lexer::skipSlashStarComment() {

break; // Otherwise, eat other characters.
case 0:
// If this is a random nul character in the middle of a buffer, skip it as
// whitespace.
if (CurPtr-1 != BufferEnd) {
diagnoseEmbeddedNul(Diags, CurPtr-1);
break;
}

// Otherwise, we have an unterminated /* comment.
--CurPtr;
switch (getNulCharacterKind(CurPtr - 1)) {
case NulCharacterKind::Embedded:
// If this is a random nul character in the middle of a buffer, skip it
// as whitespace.
diagnoseEmbeddedNul(Diags, CurPtr - 1);
LLVM_FALLTHROUGH;
case NulCharacterKind::CodeCompletion:
continue;
case NulCharacterKind::BufferEnd: {
// Otherwise, we have an unterminated /* comment.
--CurPtr;

// Count how many levels deep we are.
llvm::SmallString<8> Terminator("*/");
while (--Depth != 0)
Terminator += "*/";
// Count how many levels deep we are.
llvm::SmallString<8> Terminator("*/");
while (--Depth != 0)
Terminator += "*/";

const char *EOL = (CurPtr[-1] == '\n') ? (CurPtr - 1) : CurPtr;
diagnose(EOL, diag::lex_unterminated_block_comment)
.fixItInsert(getSourceLoc(EOL), Terminator);
diagnose(StartPtr, diag::lex_comment_start);
return;
const char *EOL = (CurPtr[-1] == '\n') ? (CurPtr - 1) : CurPtr;
diagnose(EOL, diag::lex_unterminated_block_comment)
.fixItInsert(getSourceLoc(EOL), Terminator);
diagnose(StartPtr, diag::lex_comment_start);
return;
}
}
}
}
}
Expand Down Expand Up @@ -1857,6 +1864,16 @@ bool Lexer::tryLexConflictMarker(bool EatNewline) {
return false;
}

/// Determine what the nul character at \p Ptr represents.
///
/// \p Ptr must be non-null and must point at a nul character. The code
/// completion position is tested first, so a pointer that matches both
/// CodeCompletionPtr and BufferEnd is reported as CodeCompletion. Any nul
/// that is at neither position is classified as Embedded.
Lexer::NulCharacterKind Lexer::getNulCharacterKind(const char *Ptr) const {
  assert(Ptr != nullptr && *Ptr == 0);

  if (Ptr == CodeCompletionPtr)
    return NulCharacterKind::CodeCompletion;

  return Ptr == BufferEnd ? NulCharacterKind::BufferEnd
                          : NulCharacterKind::Embedded;
}

void Lexer::tryLexEditorPlaceholder() {
assert(CurPtr[-1] == '<' && CurPtr[0] == '#');
Expand Down Expand Up @@ -2164,22 +2181,23 @@ void Lexer::lexImpl() {
return formToken(tok::unknown, TokStart);

case 0:
if (CurPtr-1 == CodeCompletionPtr)
switch (getNulCharacterKind(CurPtr - 1)) {
case NulCharacterKind::CodeCompletion:
return formToken(tok::code_complete, TokStart);

// If this is a random nul character in the middle of a buffer, skip it as
// whitespace.
if (CurPtr-1 != BufferEnd) {
case NulCharacterKind::Embedded:
// If this is a random nul character in the middle of a buffer, skip it as
// whitespace.
diagnoseEmbeddedNul(Diags, CurPtr-1);
goto Restart;
case NulCharacterKind::BufferEnd:
// Otherwise, this is the real end of the buffer. Put CurPtr back into
// buffer bounds.
--CurPtr;
// Return EOF.
return formToken(tok::eof, TokStart);
}

// Otherwise, this is the real end of the buffer. Put CurPtr back into
// buffer bounds.
--CurPtr;
// Return EOF.
return formToken(tok::eof, TokStart);

case '@': return formToken(tok::at_sign, TokStart);
case '{': return formToken(tok::l_brace, TokStart);
case '[': {
Expand Down Expand Up @@ -2323,7 +2341,6 @@ void Lexer::lexTrivia(syntax::Trivia &Pieces, bool IsForTrailingTrivia) {
Restart:
const char *TriviaStart = CurPtr;

// TODO: Handle random nul('\0') character in the middle of a buffer.
// TODO: Handle invalid UTF8 sequence which is skipped in lexImpl().
switch (*CurPtr++) {
case '\n':
Expand Down Expand Up @@ -2403,6 +2420,19 @@ void Lexer::lexTrivia(syntax::Trivia &Pieces, bool IsForTrailingTrivia) {
goto Restart;
}
break;
case 0:
switch (getNulCharacterKind(CurPtr - 1)) {
case NulCharacterKind::Embedded: {
diagnoseEmbeddedNul(Diags, CurPtr - 1);
size_t Length = CurPtr - TriviaStart;
Pieces.push_back(TriviaPiece::garbageText({TriviaStart, Length}));
goto Restart;
}
case NulCharacterKind::CodeCompletion:
case NulCharacterKind::BufferEnd:
break;
}
break;
default:
break;
}
Expand Down
5 changes: 0 additions & 5 deletions test/Syntax/lexer_invalid_nul.swift

This file was deleted.

5 changes: 5 additions & 0 deletions test/Syntax/round_trip_nul.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
// RUN: cat %s | tr '\132' '\0' > %t.tr
// RUN: cp -f %t.tr %t
// RUN: %round-trip-syntax-test --swift-syntax-test %swift-syntax-test --file %t
let a = Z3Z // nul(Z)
func b() {}
28 changes: 28 additions & 0 deletions test/Syntax/tokens_nul.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
// RUN: cat %s | tr '\132' '\0' > %t.tmp
// RUN: cp -f %t.tmp %t
// RUN: %swift-syntax-test -input-source-filename %t -dump-full-tokens 2>&1 | %FileCheck %t
let a = Z3Z // nul(Z)
func b() {}

// CHECK: 4:9: warning: nul character embedded in middle of file
// CHECK: 4:11: warning: nul character embedded in middle of file
// CHECK: 4:20: warning: nul character embedded in middle of file

// CHECK-LABEL: 4:7
// CHECK-NEXT:(Token equal
// CHECK-NEXT: (text="=")
// CHECK-NEXT: (trivia space 1)
// CHECK-NEXT: (trivia garbage_text \000))

// CHECK-LABEL: 4:10
// CHECK-NEXT:(Token integer_literal
// CHECK-NEXT: (text="3")
// CHECK-NEXT: (trivia garbage_text \000)
// CHECK-NEXT: (trivia space 1))

// CHECK-LABEL: 5:1
// CHECK-NEXT:(Token kw_func
// CHECK-NEXT: (trivia line_comment // nul(\000))
// CHECK-NEXT: (trivia newline 1)
// CHECK-NEXT: (text="func")
// CHECK-NEXT: (trivia space 1))