Skip to content

[🍒][YAMLParser] Add multi-line literal folding support #4099

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion llvm/include/llvm/Support/YAMLParser.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
// See http://www.yaml.org/spec/1.2/spec.html for the full standard.
//
// This currently does not implement the following:
// * Multi-line literal folding.
// * Tag resolution.
// * UTF-16.
// * BOMs anywhere other than the first Unicode scalar value in the file.
Expand Down
67 changes: 62 additions & 5 deletions llvm/lib/Support/YAMLParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,9 @@ class Scanner {
/// Pos is whitespace or a new line
bool isBlankOrBreak(StringRef::iterator Position);

/// Return true if every character of the line is a blank or a line break
/// (an empty line also counts), false otherwise.
bool isLineEmpty(StringRef Line);

/// Consume a single b-break[28] if it's present at the current position.
///
/// Return false if the code unit at the current position isn't a line break.
Expand Down Expand Up @@ -471,6 +474,18 @@ class Scanner {
/// Scan a block scalar starting with | or >.
bool scanBlockScalar(bool IsLiteral);

/// Scan a block scalar style indicator and header.
///
/// Note: This is distinct from scanBlockScalarHeader to mirror the fact that
/// YAML does not consider the style indicator to be a part of the header.
///
/// Return false if an error occurred.
bool scanBlockScalarIndicators(char &StyleIndicator, char &ChompingIndicator,
unsigned &IndentIndicator, bool &IsDone);

/// Scan a style indicator in a block scalar header.
char scanBlockStyleIndicator();

/// Scan a chomping indicator in a block scalar header.
char scanBlockChompingIndicator();

Expand Down Expand Up @@ -1035,6 +1050,13 @@ bool Scanner::isBlankOrBreak(StringRef::iterator Position) {
*Position == '\n';
}

bool Scanner::isLineEmpty(StringRef Line) {
  // A line counts as empty when no character in it fails isBlankOrBreak; a
  // zero-length line therefore counts as empty too.
  const auto *Cursor = Line.begin();
  const auto *const Stop = Line.end();
  while (Cursor != Stop) {
    if (!isBlankOrBreak(Cursor))
      return false;
    ++Cursor;
  }
  return true;
}

bool Scanner::consumeLineBreakIfPresent() {
auto Next = skip_b_break(Current);
if (Next == Current)
Expand Down Expand Up @@ -1517,6 +1539,25 @@ bool Scanner::scanAliasOrAnchor(bool IsAlias) {
return true;
}

bool Scanner::scanBlockScalarIndicators(char &StyleIndicator,
                                        char &ChompingIndicator,
                                        unsigned &IndentIndicator,
                                        bool &IsDone) {
  // The style indicator is scanned first; the chomping/indent indicators are
  // then delegated to scanBlockScalarHeader, whose success or failure is
  // forwarded to the caller unchanged.
  StyleIndicator = scanBlockStyleIndicator();
  return scanBlockScalarHeader(ChompingIndicator, IndentIndicator, IsDone);
}

char Scanner::scanBlockStyleIndicator() {
  // A single space is returned when no style indicator is present, either
  // because the input is exhausted or because the current character is
  // neither '>' nor '|'. Nothing is consumed in that case.
  if (Current == End)
    return ' ';

  const char Candidate = *Current;
  if (Candidate != '>' && Candidate != '|')
    return ' ';

  // Consume the indicator character and hand it back to the caller.
  skip(1);
  return Candidate;
}

char Scanner::scanBlockChompingIndicator() {
char Indicator = ' ';
if (Current != End && (*Current == '+' || *Current == '-')) {
Expand Down Expand Up @@ -1655,19 +1696,19 @@ bool Scanner::scanBlockScalarIndent(unsigned BlockIndent,
}

bool Scanner::scanBlockScalar(bool IsLiteral) {
// Eat '|' or '>'
assert(*Current == '|' || *Current == '>');
skip(1);

char StyleIndicator;
char ChompingIndicator;
unsigned BlockIndent;
bool IsDone = false;
if (!scanBlockScalarHeader(ChompingIndicator, BlockIndent, IsDone))
if (!scanBlockScalarIndicators(StyleIndicator, ChompingIndicator, BlockIndent,
IsDone))
return false;
if (IsDone)
return true;
bool IsFolded = StyleIndicator == '>';

auto Start = Current;
const auto *Start = Current;
unsigned BlockExitIndent = Indent < 0 ? 0 : (unsigned)Indent;
unsigned LineBreaks = 0;
if (BlockIndent == 0) {
Expand All @@ -1688,6 +1729,22 @@ bool Scanner::scanBlockScalar(bool IsLiteral) {
auto LineStart = Current;
advanceWhile(&Scanner::skip_nb_char);
if (LineStart != Current) {
if (LineBreaks && IsFolded && !Scanner::isLineEmpty(Str)) {
// The folded style "folds" any single line break between content into a
// single space, except when that content is "empty" (only contains
// whitespace) in which case the line break is left as-is.
if (LineBreaks == 1) {
Str.append(LineBreaks,
isLineEmpty(StringRef(LineStart, Current - LineStart))
? '\n'
: ' ');
}
// If we saw a single line break, we are completely replacing it and so
// want `LineBreaks == 0`. Otherwise this decrement accounts for the
// fact that the first line break is "trimmed", only being used to
// signal a sequence of line breaks which should not be folded.
LineBreaks--;
}
Str.append(LineBreaks, '\n');
Str.append(StringRef(LineStart, Current - LineStart));
LineBreaks = 0;
Expand Down
108 changes: 99 additions & 9 deletions llvm/test/YAMLParser/spec-09-24.test
Original file line number Diff line number Diff line change
@@ -1,13 +1,103 @@
# RUN: yaml-bench -canonical %s | FileCheck %s
# CHECK: ? !!str "strip"
# CHECK: : !!str ""
# CHECK: ? !!str "clip"
# CHECK: : !!str ""
# CHECK: ? !!str "keep"
# CHECK: : !!str "\n"
# CHECK: ? !!str "literal_strip"
# CHECK: : !!str "Hello\n\n\nworld\non\nmultiple \n\n\nlines\n\nfoo bar"
# CHECK: ? !!str "literal_clip"
# CHECK: : !!str "Hello\n\n\nworld\non\nmultiple \n\n\nlines\n\nfoo bar\n"
# CHECK: ? !!str "literal_keep"
# CHECK: : !!str "Hello\n\n\nworld\non\nmultiple \n\n\nlines\n\nfoo bar\n\n\n\n"
# CHECK: ? !!str "folded_strip"
# CHECK: : !!str "Hello\n\nworld on multiple \n\nlines\nfoo bar"
# CHECK: ? !!str "folded_clip"
# CHECK: : !!str "Hello\n\nworld on multiple \n\nlines\nfoo bar\n"
# CHECK: ? !!str "folded_keep"
# CHECK: : !!str "Hello\n\nworld on multiple \n\nlines\nfoo bar\n\n\n"

strip: >-
literal_strip: |-
Hello

clip: >

keep: |+
world
on
multiple


lines

foo bar



literal_clip: |
Hello


world
on
multiple


lines

foo bar



literal_keep: |+
Hello


world
on
multiple


lines

foo bar



folded_strip: >-
Hello


world
on
multiple


lines

foo bar



folded_clip: >
Hello


world
on
multiple


lines

foo bar



folded_keep: >+
Hello


world
on
multiple


lines

foo bar