
Commit 60c78af

Merge pull request #40065 from ahoppen/pr/lexing-cutoff
[Parser] Don't modify the current token kind when cutting off parsing
2 parents 5b6dbd2 + b888dc0

10 files changed: 863 additions, 76 deletions

include/swift/Parse/Lexer.h

Lines changed: 27 additions & 0 deletions

@@ -133,6 +133,11 @@ class Lexer {
   /// the next token doesn't have a comment.
   const char *CommentStart;

+  /// If this is not \c nullptr, all tokens after this point are treated as eof.
+  /// Used to cut off lexing early when we detect that the nesting level is too
+  /// deep.
+  const char *LexerCutOffPoint = nullptr;
+
   Lexer(const Lexer&) = delete;
   void operator=(const Lexer&) = delete;

@@ -222,6 +227,28 @@ class Lexer {
     lexImpl();
   }

+  /// Cut off lexing at the current position. The next token to be lexed will
+  /// be an EOF token, even if there is still source code to be lexed.
+  /// The current and next token (returned by \c peekNextToken) are not
+  /// modified. The token after \c NextToken will be the EOF token.
+  void cutOffLexing() {
+    // If we already have a cut off point, don't push it further towards the
+    // back.
+    if (LexerCutOffPoint == nullptr || LexerCutOffPoint >= CurPtr) {
+      LexerCutOffPoint = CurPtr;
+    }
+  }
+
+  /// If a lexer cut off point has been set, returns the offset in the buffer at
+  /// which lexing is being cut off.
+  Optional<size_t> lexingCutOffOffset() const {
+    if (LexerCutOffPoint) {
+      return LexerCutOffPoint - BufferStart;
+    } else {
+      return None;
+    }
+  }
+
   bool isKeepingComments() const {
     return RetainComments == CommentRetentionMode::ReturnAsTokens;
   }
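
As a usage sketch (the helper below is hypothetical and not part of this commit), the two new entry points are meant to compose like this:

// Illustration only: `bailOutIfTooDeep` does not exist in the tree.
// Once a caller decides the input nests too deeply, it freezes the lexer:
// every token lexed past this point is synthesized as tok::eof, while the
// already-lexed current and peeked tokens keep their original kinds.
static void bailOutIfTooDeep(swift::Lexer &L, unsigned Depth, unsigned MaxDepth) {
  if (Depth > MaxDepth)
    L.cutOffLexing();

  // Clients that re-lex a sub-range of the buffer (such as the TokenRecorder
  // changed in lib/Parse/Parser.cpp below) can clamp their end offset so they
  // never read past the cut-off point.
  if (auto CutOffOffset = L.lexingCutOffOffset()) {
    (void)CutOffOffset; // e.g. EndOffset = std::min(EndOffset, *CutOffOffset);
  }
}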

include/swift/Parse/Parser.h

Lines changed: 5 additions & 25 deletions

@@ -221,12 +221,6 @@ class Parser {
   /// The location of the previous token.
   SourceLoc PreviousLoc;

-  /// Stop parsing immediately.
-  void cutOffParsing() {
-    // Cut off parsing by acting as if we reached the end-of-file.
-    Tok.setKind(tok::eof);
-  }
-
   /// Use this to assert that the parser has advanced the lexing location, e.g.
   /// before a specific parser function has returned.
   class AssertParserMadeProgressBeforeLeavingScopeRAII {

@@ -329,35 +323,21 @@ class Parser {

   /// An RAII object that notes when we have seen a structure marker.
   class StructureMarkerRAII {
-    Parser *const P;
+    Parser &P;

     /// Max nesting level
     // TODO: customizable.
     enum { MaxDepth = 256 };

-    StructureMarkerRAII(Parser *parser) : P(parser) {}
-
-    /// Have the parser start the new Structure or fail if already too deep.
-    bool pushStructureMarker(Parser &parser, SourceLoc loc,
-                             StructureMarkerKind kind);
+    StructureMarkerRAII(Parser &parser) : P(parser) {}

   public:
-    StructureMarkerRAII(Parser &parser, SourceLoc loc, StructureMarkerKind kind)
-        : StructureMarkerRAII(
-              pushStructureMarker(parser, loc, kind) ? &parser : nullptr) {}
+    StructureMarkerRAII(Parser &parser, SourceLoc loc,
+                        StructureMarkerKind kind);

     StructureMarkerRAII(Parser &parser, const Token &tok);

-    /// Did we fail to push the new structure?
-    bool isFailed() {
-      return P == nullptr;
-    }
-
-    ~StructureMarkerRAII() {
-      if (P != nullptr) {
-        P->StructureMarkers.pop_back();
-      }
-    }
+    ~StructureMarkerRAII() { P.StructureMarkers.pop_back(); }
   };
   friend class StructureMarkerRAII;
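
Since the RAII type no longer carries a failure state, its call sites shrink to a single declaration. A simplified before/after sketch of the pattern (hypothetical Parser members, not code from the tree; the real call sites follow below):

// Before: each caller had to check whether pushing the marker failed.
ParserStatus Parser::parseSomethingOld() {
  StructureMarkerRAII Marker(*this, Tok);
  if (Marker.isFailed())
    return makeParserError();
  // ... parse the construct ...
  return makeParserSuccess();
}

// After: constructing the marker is enough. On overflow the constructor emits
// the diagnostic and cuts off lexing, so the caller keeps parsing and unwinds
// naturally at the synthesized eof.
ParserStatus Parser::parseSomethingNew() {
  StructureMarkerRAII Marker(*this, Tok);
  // ... parse the construct ...
  return makeParserSuccess();
}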

lib/Parse/Lexer.cpp

Lines changed: 5 additions & 1 deletion

@@ -2349,7 +2349,11 @@ void Lexer::lexImpl() {

   // Remember the start of the token so we can form the text range.
   const char *TokStart = CurPtr;
-
+
+  if (LexerCutOffPoint && CurPtr >= LexerCutOffPoint) {
+    return formToken(tok::eof, TokStart);
+  }
+
   switch (*CurPtr++) {
   default: {
     char const *Tmp = CurPtr-1;
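
The effect of the new guard, sketched for a hypothetical swift::Lexer L whose cutOffLexing() has already been called: every lex() call at or past the cut-off point forms an eof token, so ordinary consume-until-eof loops still terminate even though source text remains in the buffer.

// Illustration only; assumes a Lexer `L` that was cut off somewhere before
// the end of its buffer.
swift::Token Tok;
do {
  L.lex(Tok); // at or past the cut-off point this always yields tok::eof
} while (Tok.isNot(tok::eof));
// The loop exits without ever materializing a real token past the cut-off.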

lib/Parse/ParseExpr.cpp

Lines changed: 0 additions & 4 deletions

@@ -3121,10 +3121,6 @@ ParserStatus Parser::parseExprList(tok leftTok, tok rightTok,
                                    SourceLoc &rightLoc, SyntaxKind Kind) {
   StructureMarkerRAII ParsingExprList(*this, Tok);

-  if (ParsingExprList.isFailed()) {
-    return makeParserError();
-  }
-
   leftLoc = consumeToken(leftTok);
   return parseList(rightTok, leftLoc, rightLoc, /*AllowSepAfterLast=*/false,
                    rightTok == tok::r_paren ? diag::expected_rparen_expr_list

lib/Parse/ParseGeneric.cpp

Lines changed: 0 additions & 4 deletions

@@ -57,10 +57,6 @@ Parser::parseGenericParametersBeforeWhere(SourceLoc LAngleLoc,
   // Note that we're parsing a declaration.
   StructureMarkerRAII ParsingDecl(*this, Tok.getLoc(),
                                   StructureMarkerKind::Declaration);
-
-  if (ParsingDecl.isFailed()) {
-    return makeParserError();
-  }

   // Parse attributes.
   DeclAttributes attributes;

lib/Parse/ParsePattern.cpp

Lines changed: 0 additions & 3 deletions

@@ -1216,9 +1216,6 @@ ParserResult<Pattern> Parser::parsePatternTuple() {
   SyntaxParsingContext TuplePatternCtxt(SyntaxContext,
                                         SyntaxKind::TuplePattern);
   StructureMarkerRAII ParsingPatternTuple(*this, Tok);
-  if (ParsingPatternTuple.isFailed()) {
-    return makeParserError();
-  }
   SourceLoc LPLoc = consumeToken(tok::l_paren);
   SourceLoc RPLoc;


lib/Parse/ParseStmt.cpp

Lines changed: 0 additions & 4 deletions

@@ -1280,10 +1280,6 @@ ParserResult<PoundAvailableInfo> Parser::parseStmtConditionPoundAvailable() {

   StructureMarkerRAII ParsingAvailabilitySpecList(*this, Tok);

-  if (ParsingAvailabilitySpecList.isFailed()) {
-    return makeParserError();
-  }
-
   SourceLoc LParenLoc = consumeToken(tok::l_paren);

   SmallVector<AvailabilitySpec *, 5> Specs;

lib/Parse/ParseType.cpp

Lines changed: 0 additions & 4 deletions

@@ -1017,10 +1017,6 @@ ParserResult<TypeRepr> Parser::parseTypeTupleBody() {
   TypeContext.setCreateSyntax(SyntaxKind::TupleType);
   Parser::StructureMarkerRAII ParsingTypeTuple(*this, Tok);

-  if (ParsingTypeTuple.isFailed()) {
-    return makeParserError();
-  }
-
   SourceLoc RPLoc, LPLoc = consumeToken(tok::l_paren);
   SourceLoc EllipsisLoc;
   unsigned EllipsisIdx;

lib/Parse/Parser.cpp

Lines changed: 34 additions & 31 deletions

@@ -405,6 +405,9 @@ namespace {
 /// underlying corrected token stream.
 class TokenRecorder: public ConsumeTokenReceiver {
   ASTContext &Ctx;
+  /// The lexer that is being used to lex the source file. Used to query whether
+  /// lexing has been cut off.
+  Lexer &BaseLexer;
   unsigned BufferID;

   // Token list ordered by their appearance in the source file.

@@ -425,11 +428,19 @@ class TokenRecorder: public ConsumeTokenReceiver {
   void relexComment(CharSourceRange CommentRange,
                     llvm::SmallVectorImpl<Token> &Scratch) {
     auto &SM = Ctx.SourceMgr;
+    auto EndOffset = SM.getLocOffsetInBuffer(CommentRange.getEnd(), BufferID);
+    if (auto LexerCutOffOffset = BaseLexer.lexingCutOffOffset()) {
+      if (*LexerCutOffOffset < EndOffset) {
+        // If lexing was cut off due to a too deep nesting level, adjust the end
+        // offset to not point past the cut off point.
+        EndOffset = *LexerCutOffOffset;
+      }
+    }
     Lexer L(Ctx.LangOpts, SM, BufferID, nullptr, LexerMode::Swift,
             HashbangMode::Disallowed, CommentRetentionMode::ReturnAsTokens,
             TriviaRetentionMode::WithoutTrivia,
             SM.getLocOffsetInBuffer(CommentRange.getStart(), BufferID),
-            SM.getLocOffsetInBuffer(CommentRange.getEnd(), BufferID));
+            EndOffset);
     while(true) {
       Token Result;
       L.lex(Result);

@@ -441,8 +452,8 @@ class TokenRecorder: public ConsumeTokenReceiver {
   }

 public:
-  TokenRecorder(ASTContext &ctx, unsigned BufferID)
-      : Ctx(ctx), BufferID(BufferID) {}
+  TokenRecorder(ASTContext &ctx, Lexer &BaseLexer)
+      : Ctx(ctx), BaseLexer(BaseLexer), BufferID(BaseLexer.getBufferID()) {}

   Optional<std::vector<Token>> finalize() override {
     auto &SM = Ctx.SourceMgr;

@@ -516,19 +527,14 @@ class TokenRecorder: public ConsumeTokenReceiver {
 Parser::Parser(std::unique_ptr<Lexer> Lex, SourceFile &SF,
                SILParserStateBase *SIL, PersistentParserState *PersistentState,
                std::shared_ptr<SyntaxParseActions> SPActions)
-    : SourceMgr(SF.getASTContext().SourceMgr),
-      Diags(SF.getASTContext().Diags),
-      SF(SF),
-      L(Lex.release()),
-      SIL(SIL),
-      CurDeclContext(&SF),
-      Context(SF.getASTContext()),
-      TokReceiver(SF.shouldCollectTokens() ?
-                  new TokenRecorder(SF.getASTContext(), L->getBufferID()) :
-                  new ConsumeTokenReceiver()),
-      SyntaxContext(new SyntaxParsingContext(SyntaxContext, SF,
-                                             L->getBufferID(),
-                                             std::move(SPActions))) {
+    : SourceMgr(SF.getASTContext().SourceMgr), Diags(SF.getASTContext().Diags),
+      SF(SF), L(Lex.release()), SIL(SIL), CurDeclContext(&SF),
+      Context(SF.getASTContext()),
+      TokReceiver(SF.shouldCollectTokens()
+                      ? new TokenRecorder(SF.getASTContext(), *L)
+                      : new ConsumeTokenReceiver()),
+      SyntaxContext(new SyntaxParsingContext(
+          SyntaxContext, SF, L->getBufferID(), std::move(SPActions))) {
   State = PersistentState;
   if (!State) {
     OwnedState.reset(new PersistentParserState());

@@ -880,28 +886,25 @@ getStructureMarkerKindForToken(const Token &tok) {
   }
 }

-Parser::StructureMarkerRAII::StructureMarkerRAII(Parser &parser,
-                                                 const Token &tok)
-    : StructureMarkerRAII(parser, tok.getLoc(),
-                          getStructureMarkerKindForToken(tok)) {}
-
-bool Parser::StructureMarkerRAII::pushStructureMarker(
-    Parser &parser, SourceLoc loc,
-    StructureMarkerKind kind) {
-
-  if (parser.StructureMarkers.size() < MaxDepth) {
-    parser.StructureMarkers.push_back({loc, kind, None});
-    return true;
-  } else {
+Parser::StructureMarkerRAII::StructureMarkerRAII(Parser &parser, SourceLoc loc,
+                                                 StructureMarkerKind kind)
+    : StructureMarkerRAII(parser) {
+  parser.StructureMarkers.push_back({loc, kind, None});
+  if (parser.StructureMarkers.size() > MaxDepth) {
     parser.diagnose(loc, diag::structure_overflow, MaxDepth);
     // We need to cut off parsing or we will stack-overflow.
     // But `cutOffParsing` changes the current token to eof, and we may be in
     // a place where `consumeToken()` will be expecting e.g. '[',
     // since we need that to get to the callsite, so this can cause an assert.
-    parser.cutOffParsing();
-    return false;
+    parser.L->cutOffLexing();
   }
 }
+
+Parser::StructureMarkerRAII::StructureMarkerRAII(Parser &parser,
+                                                 const Token &tok)
+    : StructureMarkerRAII(parser, tok.getLoc(),
+                          getStructureMarkerKindForToken(tok)) {}
+
 //===----------------------------------------------------------------------===//
 // Primitive Parsing
 //===----------------------------------------------------------------------===//
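
The comment retained in the new StructureMarkerRAII constructor is the heart of the change; here is a hedged sketch of the failure mode it describes (a hypothetical Parser member, not code from the tree):

// Why rewriting the current token was unsafe, in miniature.
ParserResult<Pattern> Parser::parsePatternTupleSketch() {
  // We only reached this call site because Tok is currently '('.
  StructureMarkerRAII ParsingPatternTuple(*this, Tok);
  // Old behavior: on overflow the constructor set Tok's kind to eof, so the
  // consumeToken(tok::l_paren) below tripped an assertion on the kind mismatch
  // unless the call site remembered to check isFailed() first.
  // New behavior: Tok is still '(' -- only tokens lexed later come back as
  // eof -- so the consume succeeds and the parse unwinds at the synthesized eof.
  SourceLoc LPLoc = consumeToken(tok::l_paren);
  (void)LPLoc;
  return makeParserError(); // sketch ends here; a real parser would keep going
}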
