[vm/libs] Improve JsonUtf8Decoder performance.

mraleph · Commit Queue · commit 1e6c9b026f03 · 2024-05-15T09:56:20.000Z
This CL focuses on improving parsing of white space (between JSON tokens) and simple strings which don't contain escape sequences. Improvements are achieved by changing the code to a table-driven implementation instead of an if-cascade: we have a 256-element table which stores attributes for characters (e.g. whether a character is white space or a terminal token for a simple string) and use this table to decide whether to advance through characters or stop the loop and do something else. We also suppress bounds checks and interrupt checks in tight loops - in tight loops like these, bounds checks can cost 30% in overhead. This CL brings a 28% geomean improvement on benchmarks from the linked issue. (All measurements are done in X64 Product AOT.) Individual measurements are: | Input JSON | ms/iter | vs HEAD | vs V8 | | ---------- | ------- | ------- | ----- | | apache_builds.json | 0.44 | 61.06% | 136.86% | | canada.json | 31.04 | 96.54% | 187.15% | | citm_catalog.json | 6.43 | 64.44% | 93.94% | | github_events.json | 0.23 | 59.02% | 128.86% | | google_maps_api_compact_response.json | 0.10 | 82.12% | 133.83% | | google_maps_api_response.json | 0.12 | 68.79% | 140.07% | | gsoc-2018.json | 9.25 | 44.89% | 147.43% | | instruments.json | 1.08 | 70.18% | 167.38% | | marine_ik.json | 21.07 | 88.25% | 142.58% | | mesh.json | 4.51 | 94.56% | 136.57% | | mesh.pretty.json | 9.97 | 83.76% | 193.79% | | numbers.json | 0.57 | 91.88% | 83.37% | | random.json | 3.79 | 78.32% | 107.18% | | repeat.json | 0.06 | 71.51% | 118.47% | | semanticscholar-corpus.json | 37.65 | 54.82% | 57.81% | | tree-pretty.json | 0.17 | 68.68% | 162.33% | | twitter_api_compact_response.json | 0.06 | 75.23% | 126.11% | | twitter_api_response.json | 0.08 | 70.64% | 123.60% | | twitterescaped.json | 3.88 | 84.66% | 177.94% | | twitter.json | 3.54 | 73.01% | 105.33% | | twitter_timeline.json | 0.37 | 81.52% | 271.37% | | update-center.json | 2.85 | 66.75% | 89.01% | vs HEAD (geomean): 72.94% vs V8 (geomean): 
130.99% HEAD vs V8 (geomean): 179.59% Issue #55522 TEST=covered by co19 Change-Id: Id673118c19250ab7781cc98c7656b972debc60ff Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/365803 Reviewed-by: Alexander Markov <alexmarkov@google.com> Commit-Queue: Slava Egorov <vegorov@google.com>
diff --git a/sdk/lib/_internal/vm/lib/convert_patch.dart b/sdk/lib/_internal/vm/lib/convert_patch.dart
@@ -17,7 +17,8 @@ import "dart:_internal"
         POWERS_OF_TEN,
         unsafeCast,
         writeIntoOneByteString,
-        writeIntoTwoByteString;
+        writeIntoTwoByteString,
+        createOneByteStringFromCharacters;
 
 import "dart:typed_data" show Uint8List, Uint16List;
 
@@ -114,7 +115,7 @@ class _JsonListener {
   void popContainer() {
     value = currentContainer;
     currentContainer = stack.removeLast();
-    if (currentContainer is Map) key = stack.removeLast() as String;
+    if (currentContainer is Map) key = unsafeCast<String>(stack.removeLast());
   }
 
   void handleString(String value) {
@@ -139,12 +140,12 @@ class _JsonListener {
   }
 
   void propertyName() {
-    key = value as String;
+    key = unsafeCast<String>(value);
     value = null;
   }
 
   void propertyValue() {
-    var map = currentContainer as Map;
+    var map = unsafeCast<Map>(currentContainer);
     var reviver = this.reviver;
     if (reviver != null) {
       value = reviver(key, value);
@@ -164,7 +165,7 @@ class _JsonListener {
   }
 
   void arrayElement() {
-    var list = currentContainer as List;
+    var list = unsafeCast<List>(currentContainer);
     var reviver = this.reviver;
     if (reviver != null) {
       value = reviver(list.length, value);
@@ -535,6 +536,13 @@ mixin _ChunkedJsonParser<T> on _JsonParserWithListener {
    */
   int getChar(int index);
 
+  /**
+   * Returns `true` if [getChar] returns UTF-16 code units.
+   *
+   * Otherwise [getChar] is expected to return UTF-8 bytes.
+   */
+  bool get isUtf16Input;
+
   /**
    * Copy ASCII characters from start to end of chunk into a list.
    *
@@ -813,22 +821,35 @@ mixin _ChunkedJsonParser<T> on _JsonParserWithListener {
    * Starts parsing at [position] and continues until [chunkEnd].
    * Continues parsing where the previous chunk (if any) ended.
    */
+  @pragma('vm:unsafe:no-interrupts')
+  @pragma('vm:unsafe:no-bounds-checks')
   void parse(int position) {
     int length = chunkEnd;
     if (partialState != NO_PARTIAL) {
       position = parsePartial(position);
       if (position == length) return;
     }
+    final charAttributes = _characterAttributes;
+
     int state = this.state;
+    outer:
     while (position < length) {
-      int char = getChar(position);
-      switch (char) {
-        case SPACE:
-        case CARRIAGE_RETURN:
-        case NEWLINE:
-        case TAB:
-          position++;
+      int char = 0;
+      do {
+        char = getChar(position);
+        if (isUtf16Input && char > 0xFF) {
           break;
+        }
+        if ((charAttributes.codeUnitAt(char) & CHAR_WHITESPACE) == 0) {
+          break;
+        }
+        position++;
+        if (position >= length) {
+          break outer;
+        }
+      } while (true);
+
+      switch (char) {
         case QUOTE:
           if ((state & ALLOW_STRING_MASK) != 0) fail(position);
           state |= VALUE_READ_BITS;
@@ -988,35 +1009,80 @@ mixin _ChunkedJsonParser<T> on _JsonParserWithListener {
     return length;
   }
 
+  static const int CHAR_SIMPLE_STRING_END = 1;
+  static const int CHAR_WHITESPACE = 2;
+
+  /**
+   * [_characterAttributes] string was generated using the following code:
+   *
+   * ```
+   * int $(String ch) => ch.codeUnitAt(0);
+   * final list = Uint8List(256);
+   * for (var i = 0; i < $(' '); i++) {
+   *   list[i] |= CHAR_SIMPLE_STRING_END;
+   * }
+   * list[$('"')] |= CHAR_SIMPLE_STRING_END;
+   * list[$('\\')] |= CHAR_SIMPLE_STRING_END;
+   * list[$(' ')] |= CHAR_WHITESPACE;
+   * list[$('\r')] |= CHAR_WHITESPACE;
+   * list[$('\n')] |= CHAR_WHITESPACE;
+   * list[$('\t')] |= CHAR_WHITESPACE;
+   * for (var i = 0; i < 256; i += 64) {
+   *   print("'${String.fromCharCodes([
+   *         for (var v in list.skip(i).take(64)) v + $(' '),
+   *       ])}'");
+   * }
+   * ```
+   */
+  static const String _characterAttributes =
+      '!!!!!!!!!##!!#!!!!!!!!!!!!!!!!!!" !                             '
+      '                            !                                   '
+      '                                                                '
+      '                                                                ';
+
   /**
    * Parses a string value.
    *
    * Initial [position] is right after the initial quote.
    * Returned position right after the final quote.
    */
+  @pragma('vm:unsafe:no-interrupts')
+  @pragma('vm:unsafe:no-bounds-checks')
   int parseString(int position) {
+    final charAttributes = _characterAttributes;
+
     // Format: '"'([^\x00-\x1f\\\"]|'\\'[bfnrt/\\"])*'"'
     // Initial position is right after first '"'.
     int start = position;
     int end = chunkEnd;
     int bits = 0;
-    while (position < end) {
-      int char = getChar(position++);
-      bits |= char; // Includes final '"', but that never matters.
-      // BACKSLASH is larger than QUOTE and SPACE.
-      if (char > BACKSLASH) {
-        continue;
+    int char = 0;
+    if (position < end) {
+      do {
+        // Caveat: do not combine the following two lines. Keeping them
+        // separate helps the compiler generate better code (it currently
+        // can't reorder operations to reduce register pressure).
+        char = getChar(position);
+        position++;
+        bits |= char; // Includes final '"', but that never matters.
+        if (isUtf16Input && char > 0xFF) {
+          continue;
+        }
+        if ((charAttributes.codeUnitAt(char) & CHAR_SIMPLE_STRING_END) != 0) {
+          break;
+        }
+      } while (position < end);
+      if (char == QUOTE) {
+        int sliceEnd = position - 1;
+        listener.handleString(getString(start, sliceEnd, bits));
+        return sliceEnd + 1;
       }
       if (char == BACKSLASH) {
-        beginString();
         int sliceEnd = position - 1;
+        beginString();
         if (start < sliceEnd) addSliceToString(start, sliceEnd);
         return parseStringToBuffer(sliceEnd);
       }
-      if (char == QUOTE) {
-        listener.handleString(getString(start, position - 1, bits));
-        return position;
-      }
       if (char < SPACE) {
         fail(position - 1, "Control character in string");
       }
@@ -1065,7 +1131,11 @@ mixin _ChunkedJsonParser<T> on _JsonParserWithListener {
    * This function scans through the string literal for escapes, and copies
    * slices of non-escape characters using [addSliceToString].
    */
+  @pragma('vm:unsafe:no-interrupts')
+  @pragma('vm:unsafe:no-bounds-checks')
   int parseStringToBuffer(int position) {
+    final charAttributes = _characterAttributes;
+
     int end = chunkEnd;
     int start = position;
     while (true) {
@@ -1075,11 +1145,23 @@ mixin _ChunkedJsonParser<T> on _JsonParserWithListener {
         }
         return chunkString(STR_PLAIN);
       }
-      int char = getChar(position++);
-      if (char > BACKSLASH) continue;
+
+      int char = 0;
+      do {
+        char = getChar(position);
+        position++;
+        if (isUtf16Input && char > 0xFF) {
+          continue;
+        }
+        if ((charAttributes.codeUnitAt(char) & CHAR_SIMPLE_STRING_END) != 0) {
+          break;
+        }
+      } while (position < end);
+
       if (char < SPACE) {
         fail(position - 1); // Control character in string.
       }
+
       if (char == QUOTE) {
         int quotePosition = position - 1;
         if (quotePosition > start) {
@@ -1088,13 +1170,16 @@ mixin _ChunkedJsonParser<T> on _JsonParserWithListener {
         listener.handleString(endString());
         return position;
       }
+
       if (char != BACKSLASH) {
         continue;
       }
+
       // Handle escape.
       if (position - 1 > start) {
         addSliceToString(start, position - 1);
       }
+
       if (position == end) return chunkString(STR_ESCAPE);
       position = parseStringEscape(position);
       if (position == end) return position;
@@ -1391,6 +1476,10 @@ class _JsonStringParser extends _JsonParserWithListener
 
   _JsonStringParser(_JsonListener listener) : super(listener);
 
+  @pragma('vm:prefer-inline')
+  bool get isUtf16Input => true;
+
+  @pragma('vm:prefer-inline')
   int getChar(int position) => chunk.codeUnitAt(position);
 
   String getString(int start, int end, int bits) {
@@ -1512,13 +1601,16 @@ class _JsonUtf8Parser extends _JsonParserWithListener
     parse(start);
   }
 
+  @pragma('vm:prefer-inline')
+  bool get isUtf16Input => false;
+
   @pragma('vm:prefer-inline')
   int getChar(int position) => chunk[position];
 
   String getString(int start, int end, int bits) {
     const int maxAsciiChar = 0x7f;
     if (bits <= maxAsciiChar) {
-      return new String.fromCharCodes(chunk, start, end);
+      return createOneByteStringFromCharacters(chunk, start, end);
     }
     beginString();
     if (start < end) addSliceToString(start, end);
diff --git a/sdk/lib/_internal/vm/lib/internal_patch.dart b/sdk/lib/_internal/vm/lib/internal_patch.dart
@@ -62,6 +62,14 @@ void copyRangeFromUint8ListToOneByteString(
   }
 }
 
+@pragma("vm:prefer-inline")
+String createOneByteStringFromCharacters(Uint8List bytes, int start, int end) {
+  final len = end - start;
+  final s = allocateOneByteString(len);
+  copyRangeFromUint8ListToOneByteString(bytes, s, start, 0, len);
+  return s;
+}
+
 /// The returned string is a [_TwoByteString] with uninitialized content.
 @pragma("vm:recognized", "asm-intrinsic")
 @pragma("vm:external-name", "Internal_allocateTwoByteString")

Original file line number	Diff line number	Diff line change
`@@ -62,6 +62,14 @@ void copyRangeFromUint8ListToOneByteString(`
`62`	`62`	`}`
`63`	`63`	`}`
`64`	`64`
	`65`	`+@pragma("vm:prefer-inline")`
	`66`	`+String createOneByteStringFromCharacters(Uint8List bytes, int start, int end) {`
	`67`	`+ final len = end - start;`
	`68`	`+ final s = allocateOneByteString(len);`
	`69`	`+ copyRangeFromUint8ListToOneByteString(bytes, s, start, 0, len);`
	`70`	`+ return s;`
	`71`	`+}`
	`72`	`+`
`65`	`73`	`/// The returned string is a [_TwoByteString] with uninitialized content.`
`66`	`74`	`@pragma("vm:recognized", "asm-intrinsic")`
`67`	`75`	`@pragma("vm:external-name", "Internal_allocateTwoByteString")`