swiftlang · ymanton · Mar 8, 2017
diff --git a/stdlib/public/SwiftShims/UnicodeShims.h b/stdlib/public/SwiftShims/UnicodeShims.h
@@ -83,6 +83,10 @@ _swift_stdlib_unicode_compare_utf8_utf8(const unsigned char *Left,
                                         const unsigned char *Right,
                                         __swift_int32_t RightLength);
 
+SWIFT_RUNTIME_STDLIB_INTERFACE
+__attribute__((__pure__)) __swift_int32_t
+_swift_stdlib_unicode_find_longest_contraction(void);
+
 SWIFT_RUNTIME_STDLIB_INTERFACE
 void *_swift_stdlib_unicodeCollationIterator_create(
     const __swift_uint16_t *Str,

diff --git a/stdlib/public/core/CMakeLists.txt b/stdlib/public/core/CMakeLists.txt
@@ -119,6 +119,7 @@ set(SWIFTLIB_ESSENTIAL
   StringComparable.swift
   StringCore.swift
   StringHashable.swift
+  StringHelpers.cpp
   StringInterpolation.swift
   StringLegacy.swift
   StringRangeReplaceableCollection.swift.gyb

diff --git a/stdlib/public/core/StringComparable.swift b/stdlib/public/core/StringComparable.swift
@@ -38,6 +38,13 @@ public func _stdlib_compareNSStringDeterministicUnicodeCollationPointer(
 ) -> Int32
 #endif
 
+/// Returns the byte offset of the first difference between the `n`-byte
+/// buffers `s1` and `s2`, or `n` if they match (see StringHelpers.cpp).
+@_silgen_name("_swift_string_memcmp")
+func _swift_string_memcmp(
+  _ s1: UnsafeMutableRawPointer, _ s2: UnsafeMutableRawPointer, _ n: Int
+) -> Int
+
 extension String {
 #if _runtime(_ObjC)
   /// This is consistent with Foundation, but incorrect as defined by Unicode.
@@ -65,11 +72,67 @@ extension String {
   }
 #endif
 
-  /// Compares two strings with the Unicode Collation Algorithm.
+  /// Compares two contiguous ASCII strings, skipping their common byte
+  /// prefix before handing the remainder to `_compareString`.
+  private func _compareCodeUnitsASCII(_ rhs: String) -> Int {
+    let sharedCount = min(_core.count, rhs._core.count)
+    let mismatch = _swift_string_memcmp(
+      UnsafeMutableRawPointer(_core.startASCII),
+      UnsafeMutableRawPointer(rhs._core.startASCII), sharedCount)
+    let identical = _core.count == rhs._core.count && mismatch == sharedCount
+    guard !identical else { return 0 }
+    return _compareString(rhs, offset: mismatch)
+  }
+
   @inline(never)
-  @_semantics("stdlib_binary_only") // Hide the CF/ICU dependency
+  @_semantics("stdlib_binary_only") // Hide the ICU dependency
+  private
+  func _compareCodeUnitsUTF16(_ rhs: String) -> Int {
+    // `_swift_string_memcmp` works in bytes, so scale code units accordingly.
+    let byteCount = min(_core.count, rhs._core.count) << _core.elementShift
+    let mismatchByte = _swift_string_memcmp(
+      UnsafeMutableRawPointer(_core.startUTF16),
+      UnsafeMutableRawPointer(rhs._core.startUTF16), byteCount)
+    if _core.count == rhs._core.count && mismatchByte == byteCount {
+      return 0
+    }
+    // The code units differ (or one string is a prefix of the other), so the
+    // UCA must decide the ordering. To order contractions and surrogate
+    // pairs properly, the UCA cannot be given strings that start in the
+    // middle of one. Working out exactly whether the mismatch falls inside
+    // such a sequence is expensive, so instead back up a fixed number of
+    // code units: the longer of the longest possible contraction and a
+    // surrogate pair (2), minus 1, without going past the start of the
+    // strings. Re-comparing a few code units already known to be equal is
+    // likely much cheaper than computing a more precise resume point.
+    let surrogateLength = 2
+    let longestContraction =
+      Int(_swift_stdlib_unicode_find_longest_contraction())
+    let stepBack = max(longestContraction, surrogateLength) - 1
+    let mismatch = mismatchByte >> _core.elementShift
+    let resume = mismatch >= stepBack ? mismatch - stepBack : 0
+    return _compareString(rhs, offset: resume)
+  }
+
+  public  // @testable
+  func _compareCodeUnits(_ rhs: String) -> Int {
+    // Raw code-unit scanning needs a shared encoding and contiguous buffers.
+    guard _core.isASCII == rhs._core.isASCII, _core.hasContiguousStorage,
+      rhs._core.hasContiguousStorage else { return _compareString(rhs) }
+    if _core.isASCII { return _compareCodeUnitsASCII(rhs) }
+    return _compareCodeUnitsUTF16(rhs)
+  }
+
+  /// Compares two strings with the Unicode Collation Algorithm.
   public  // @testable
   func _compareDeterministicUnicodeCollation(_ rhs: String) -> Int {
+    return self._compareDeterministicUnicodeCollation(rhs, offset: 0)
+  }
+
+  @inline(never)
+  @_semantics("stdlib_binary_only") // Hide the CF/ICU dependency
+  public
+  func _compareDeterministicUnicodeCollation(_ rhs: String, offset: Int = 0) -> Int {
     // Note: this operation should be consistent with equality comparison of
     // Character.
 #if _runtime(_ObjC)
@@ -95,18 +158,18 @@ extension String {
       return -rhs._compareDeterministicUnicodeCollation(self)
     case (false, false):
       return Int(_swift_stdlib_unicode_compare_utf16_utf16(
-        _core.startUTF16, Int32(_core.count),
-        rhs._core.startUTF16, Int32(rhs._core.count)))
+        _core.startUTF16 + offset, Int32(_core.count - offset),
+        rhs._core.startUTF16 + offset, Int32(rhs._core.count - offset)))
     case (true, true):
       return Int(_swift_stdlib_unicode_compare_utf8_utf8(
-        _core.startASCII, Int32(_core.count),
-        rhs._core.startASCII, Int32(rhs._core.count)))
+        _core.startASCII + offset, Int32(_core.count - offset),
+        rhs._core.startASCII + offset, Int32(rhs._core.count - offset)))
     }
 #endif
   }
 
   public  // @testable
-  func _compareString(_ rhs: String) -> Int {
+  func _compareString(_ rhs: String, offset: Int = 0) -> Int {
 #if _runtime(_ObjC)
     // We only want to perform this optimization on objc runtimes. Elsewhere,
     // we will make it follow the unicode collation algorithm even for ASCII.
@@ -115,7 +178,7 @@ extension String {
       return _compareASCII(rhs)
     }
 #endif
-    return _compareDeterministicUnicodeCollation(rhs)
+    return _compareDeterministicUnicodeCollation(rhs, offset: offset)
   }
 }
 
@@ -133,14 +196,16 @@ extension String : Equatable {
         lhs._core.startASCII, rhs._core.startASCII,
         rhs._core.count) == 0
     }
-#endif
     return lhs._compareString(rhs) == 0
+#else
+    return lhs._compareCodeUnits(rhs) == 0
+#endif
   }
 }
 
 extension String : Comparable {
   public static func < (lhs: String, rhs: String) -> Bool {
-    return lhs._compareString(rhs) < 0
+    return lhs._compareCodeUnits(rhs) < 0
   }
 }
 
diff --git a/stdlib/public/core/StringHelpers.cpp b/stdlib/public/core/StringHelpers.cpp
@@ -0,0 +1,120 @@
+//===-- StringHelpers.cpp - Optimized String helper routines --------------===//
+//
+// This source file is part of the Swift.org open source project
+//
+// Copyright (c) 2014 - 2016 Apple Inc. and the Swift project authors
+// Licensed under Apache License v2.0 with Runtime Library Exception
+//
+// See http://swift.org/LICENSE.txt for license information
+// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains optimized helper routines for various String operations.
+///
+//===----------------------------------------------------------------------===//
+
+#include <cassert>
+#include "../SwiftShims/SwiftStddef.h"
+#include "../SwiftShims/SwiftStdint.h"
+
+using wide_t = __swift_uintptr_t;
+using narrow_t = __swift_uint8_t;
+
+union iterator_t {  // One buffer position, viewable through several types.
+  iterator_t(const void *v) : v(v) {}
+  const wide_t   *w;  // cursor advanced one wide_t at a time
+  const narrow_t *b;  // cursor advanced one byte at a time
+  const void     *v;  // untyped pointer the cursor is constructed from
+  // Integer view of the cursor; the code masks it to test wide_t alignment.
+  const __swift_uintptr_t i;
+};
+
+constexpr __swift_size_t wide_size = sizeof(wide_t);
+constexpr __swift_size_t min_wide_len = wide_size * 2 - 1;
+constexpr __swift_size_t wide_align_mask = wide_size - 1;
+
+static_assert(sizeof(narrow_t) == 1, "Narrow type expected to be of size 1");
+
+// Byte-wise comparison used when the word-wise path does not apply.
+// Returns how many leading bytes of the two buffers are equal, which is
+// the offset of the first mismatch, or n if all n bytes match.
+static
+__swift_size_t _swift_string_memcmp_narrow(iterator_t it1,
+                                           iterator_t it2,
+                                           __swift_size_t n) {
+  __swift_size_t matched = 0;
+  for (; matched < n; ++matched) {
+    if (it1.b[matched] != it2.b[matched])
+      break;
+  }
+  return matched;
+}
+
+static
+__swift_size_t _swift_string_memcmp_wide(iterator_t it1,
+                                         iterator_t it2,
+                                         __swift_size_t n) {
+  // Compares a word at a time; returns the offset of the first mismatch or n.
+  assert(n >= (wide_size - 1 + wide_size) && "Too few bytes to compare");
+  __swift_size_t bytes_left = n;
+  // Alignment loop: byte compares until it1 reaches a wide_t boundary.
+  // Doesn't check bytes_left, assumes caller supplied >= (wide_size - 1) bytes.
+  while ((it1.i & wide_align_mask) != 0) {
+    if (*it1.b != *it2.b)
+      goto matchfail;
+    ++it1.b;
+    ++it2.b;
+    --bytes_left;
+  }
+  // Wide compare loop: compares one wide_t per iteration.
+  // Does at least one iteration, assumes that we have >= wide_size bytes
+  // remaining after the alignment loop.
+  assert((it1.i & wide_align_mask) == 0 && "Expecting first buffer to be aligned");
+  assert((it2.i & wide_align_mask) == 0 && "Expecting second buffer to be aligned");
+  do {
+    if (*it1.w != *it2.w)
+      goto matchfail;
+    ++it1.w;
+    ++it2.w;
+    bytes_left -= wide_size;
+  } while (bytes_left >= wide_size);
+  // Residue loop: finish the remaining (< wide_size) bytes one at a time.
+  while (bytes_left > 0) {
+    if (*it1.b != *it2.b)
+      break;
+    ++it1.b;
+    ++it2.b;
+    --bytes_left;
+  }
+  return n - bytes_left;
+
+matchfail:
+  // Reached once a difference is known to lie at or after the cursor (in the
+  // current byte or word); walk bytes to locate it. Because a mismatch is
+  // guaranteed, the loop does not need to check bytes_left.
+  while (*it1.b == *it2.b) {
+    ++it1.b;
+    ++it2.b;
+    --bytes_left;
+    assert(bytes_left > 0 && "Expecting a mismatch prior to the end of the buffer");
+  }
+  return n - bytes_left;
+}
+
+// Returns the byte offset of the first position at which the n-byte buffers
+// s1 and s2 differ, or n when the two buffers are identical.
+extern "C"
+__swift_size_t _swift_string_memcmp(const void *s1,
+                                    const void *s2,
+                                    __swift_size_t n) {
+  iterator_t lhs(s1), rhs(s2);
+  // The word-wise path requires enough bytes and two cursors that share the
+  // same misalignment, so byte steps bring both to a word boundary together.
+  const bool same_skew = (lhs.i & wide_align_mask) == (rhs.i & wide_align_mask);
+  if (n >= min_wide_len && same_skew)
+    return _swift_string_memcmp_wide(lhs, rhs, n);
+  // Otherwise fall back to comparing one byte at a time.
+  return _swift_string_memcmp_narrow(lhs, rhs, n);
+}
diff --git a/stdlib/public/stubs/UnicodeNormalization.cpp b/stdlib/public/stubs/UnicodeNormalization.cpp
@@ -196,6 +196,50 @@ swift::_swift_stdlib_unicode_compare_utf8_utf8(const unsigned char *LeftString,
   return Diff;
 }
 
+/// Used by _swift_stdlib_unicode_find_longest_contraction below.
+static int32_t CachedLongestContraction = -1;
+
+/// Finds and returns the longest contraction defined by the root collator.
+/// The result is the length of the longest contraction (in UChars, i.e. UTF16
+/// code units). It is cached in the global static CachedLongestContraction.
+int32_t
+swift::_swift_stdlib_unicode_find_longest_contraction(void) {
+  // Threads may race to fill the cache; the computation is idempotent, so
+  // relaxed atomic accesses (which avoid a data race) are all that is needed.
+  int32_t Longest =
+      __atomic_load_n(&CachedLongestContraction, __ATOMIC_RELAXED);
+  if (Longest >= 0)
+    return Longest;
+  // Start at 0 so an empty contraction set still yields a cacheable result.
+  Longest = 0;
+  USet *Contractions = uset_openEmpty();
+  UErrorCode ErrorCode = U_ZERO_ERROR;
+  if (!Contractions) {
+    swift::crash("uset_openEmpty: Unable to create a new set.");
+  }
+  std::unique_ptr<USet, decltype(&uset_close)> ContractionsPtr(Contractions, uset_close);
+  ucol_getContractionsAndExpansions(GetRootCollator(), Contractions, nullptr, FALSE, &ErrorCode);
+  if (U_FAILURE(ErrorCode)) {
+    swift::crash("ucol_getContractionsAndExpansions: Unable to get root collator's contractions.");
+  }
+  int32_t NumContractions = uset_getItemCount(Contractions);
+  UChar32 Start, End;
+  for (int32_t i = 0; i < NumContractions; ++i) {
+    int32_t ItemLength = uset_getItem(Contractions, i, &Start, &End, nullptr, 0,
+                                      &ErrorCode);
+    if (ErrorCode == U_BUFFER_OVERFLOW_ERROR)
+      ErrorCode = U_ZERO_ERROR;
+    if (U_FAILURE(ErrorCode)) {
+      swift::crash("uset_getItem: Unable to get item from set.");
+    }
+    assert(ItemLength > 0 && "Expecting the set of contractions to only contain strings, not ranges");
+    if (ItemLength > Longest)
+      Longest = ItemLength;
+  }
+  __atomic_store_n(&CachedLongestContraction, Longest, __ATOMIC_RELAXED);
+  return Longest;
+}
+
 void *swift::_swift_stdlib_unicodeCollationIterator_create(
     const __swift_uint16_t *Str, __swift_uint32_t Length) {
   UErrorCode ErrorCode = U_ZERO_ERROR;