Commit 7dbb412

[gyb] Force Unicode strings in Python 2
In Python 3, all strings are sequences of Unicode characters, whereas Python 2's native strings are sequences of bytes (although Python 2 also has a separate unicode string type). This patch changes the file readers to use the codecs module so that, on Python 2 as well, the data files are decoded into unicode strings as they are read. From that point on the strings behave equivalently on Python 2 and 3. The rest of the patch updates the surrounding code to work natively with Unicode strings.

To test the class `GraphemeClusterBreakPropertyTable`:

$ python2 utils/gyb --test \
    -DunicodeGraphemeBreakPropertyFile=./utils/UnicodeData/GraphemeBreakProperty.txt \
    -DunicodeGraphemeBreakTestFile=./utils/UnicodeData/GraphemeBreakTest.txt \
    -DCMAKE_SIZEOF_VOID_P=8 \
    -o /tmp/UnicodeExtendedGraphemeClusters.cpp.2.7.tmp \
    ./stdlib/public/stubs/UnicodeExtendedGraphemeClusters.cpp.gyb
$ python3 utils/gyb --test \
    -DunicodeGraphemeBreakPropertyFile=./utils/UnicodeData/GraphemeBreakProperty.txt \
    -DunicodeGraphemeBreakTestFile=./utils/UnicodeData/GraphemeBreakTest.txt \
    -DCMAKE_SIZEOF_VOID_P=8 \
    -o /tmp/UnicodeExtendedGraphemeClusters.cpp.3.5.tmp \
    ./stdlib/public/stubs/UnicodeExtendedGraphemeClusters.cpp.gyb
$ diff -u /tmp/UnicodeExtendedGraphemeClusters.cpp.2.7.tmp \
    /tmp/UnicodeExtendedGraphemeClusters.cpp.3.5.tmp

To test the method `get_grapheme_cluster_break_tests_as_UTF8`:

$ python2 utils/gyb --test \
    -DunicodeGraphemeBreakPropertyFile=./utils/UnicodeData/GraphemeBreakProperty.txt \
    -DunicodeGraphemeBreakTestFile=./utils/UnicodeData/GraphemeBreakTest.txt \
    -DCMAKE_SIZEOF_VOID_P=8 \
    -o /tmp/UnicodeGraphemeBreakTest.cpp.2.7.tmp \
    ./unittests/Basic/UnicodeGraphemeBreakTest.cpp.gyb
$ python3 utils/gyb --test \
    -DunicodeGraphemeBreakPropertyFile=./utils/UnicodeData/GraphemeBreakProperty.txt \
    -DunicodeGraphemeBreakTestFile=./utils/UnicodeData/GraphemeBreakTest.txt \
    -DCMAKE_SIZEOF_VOID_P=8 \
    -o /tmp/UnicodeGraphemeBreakTest.cpp.3.5.tmp \
    ./unittests/Basic/UnicodeGraphemeBreakTest.cpp.gyb
$ diff -u /tmp/UnicodeGraphemeBreakTest.cpp.2.7.tmp \
    /tmp/UnicodeGraphemeBreakTest.cpp.3.5.tmp
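A minimal sketch of the reading strategy the commit adopts (the helper name and usage path below are illustrative, not part of the commit): codecs.open decodes the file while reading, so each line arrives as a unicode string on Python 2 and a str on Python 3, and the downstream template logic can stay identical.

import sys
import codecs

def read_unicode_lines(path):
    # Decode while reading: every yielded line is already a Unicode string
    # under both Python 2 and Python 3.
    with codecs.open(path, encoding=sys.getfilesystemencoding(),
                     errors='strict') as f:
        for line in f:
            yield line.rstrip(u'\n')

# Hypothetical usage with one of the data files passed to gyb above:
# for line in read_unicode_lines('./utils/UnicodeData/GraphemeBreakProperty.txt'):
#     handle(line)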
1 parent c677844 commit 7dbb412

2 files changed: +14 lines, -10 lines


lib/ClangImporter/SortedCFDatabase.def.gyb

Lines changed: 3 additions & 1 deletion
@@ -17,6 +17,8 @@
 %{
 
 import re
+import sys
+import codecs
 
 prologueLines = ""
 epilogueLines = ""
@@ -26,7 +28,7 @@ epilogueLines = ""
 lineForName = {}
 
 # Load the data file.
-with open(CFDatabaseFile, 'rb') as f:
+with codecs.open(CFDatabaseFile, encoding=sys.getfilesystemencoding(), errors='strict') as f:
   for line in f:
     # Pass through preprocessor directives literally.
     # Assume that they all fall into either a strict prologue or epilogue.
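The hunk above swaps a binary-mode open for codecs.open. A short, assumed illustration of the failure mode this avoids (standalone snippet, not code from this commit): lines read in 'rb' mode stay byte strings, so comparisons against Unicode literals quietly fail.

line = b'\xc3\xb7'                        # UTF-8 bytes of U+00F7 DIVISION SIGN
print(line == u'\u00f7')                  # False on Python 3; UnicodeWarning and False on Python 2
print(line.decode('utf-8') == u'\u00f7')  # True once the bytes are decoded to text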

utils/GYBUnicodeDataUtils.py

Lines changed: 11 additions & 9 deletions
@@ -11,6 +11,8 @@
 ##===----------------------------------------------------------------------===##
 
 import re
+import sys
+import codecs
 
 class UnicodeProperty(object):
     """Abstract base class for Unicode properties."""
@@ -68,7 +70,7 @@ def __init__(self, grapheme_break_property_file_name):
             self.symbolic_values[v] = k
 
         # Load the data file.
-        with open(grapheme_break_property_file_name, 'rb') as f:
+        with codecs.open(grapheme_break_property_file_name, encoding=sys.getfilesystemencoding(), errors='strict') as f:
             for line in f:
                 # Strip comments.
                 line = re.sub('#.*', '', line)
@@ -514,9 +516,9 @@ def _convert_line(line):
 
         # Match a list of code points.
         for token in line.split(" "):
-            if token == "÷":
+            if token == u"÷":
                 boundaries += [ curr_bytes ]
-            elif token == "×":
+            elif token == u"×":
                 pass
             else:
                 code_point = int(token, 16)
@@ -529,21 +531,21 @@ def _convert_line(line):
                 # and test separately that we handle ill-formed UTF-8 sequences.
                 if code_point >= 0xd800 and code_point <= 0xdfff:
                     code_point = 0x200b
-                code_point = ('\U%(cp)08x' % { 'cp': code_point }).decode('unicode_escape')
-                as_UTF8_bytes = code_point.encode('utf8')
-                as_UTF8_escaped = ''.join(['\\x%(byte)02x' % { 'byte': ord(byte) } for byte in as_UTF8_bytes])
+                code_point = (b'\U%(cp)08x' % { b'cp': code_point }).decode('unicode_escape', 'strict')
+                as_UTF8_bytes = bytearray(code_point.encode('utf8', 'strict'))
+                as_UTF8_escaped = ''.join(['\\x%(byte)02x' % { 'byte': byte } for byte in as_UTF8_bytes])
                 test += as_UTF8_escaped
                 curr_bytes += len(as_UTF8_bytes)
 
         return (test, boundaries)
 
     # Self-test.
-    assert(_convert_line('÷ 0903 × 0308 ÷ AC01 ÷ # abc') == ('\\xe0\\xa4\\x83\\xcc\\x88\\xea\\xb0\\x81', [ 0, 5, 8 ]))
-    assert(_convert_line('÷ D800 ÷ # abc') == ('\\xe2\\x80\\x8b', [ 0, 3 ]))
+    assert(_convert_line(u'÷ 0903 × 0308 ÷ AC01 ÷ # abc') == ('\\xe0\\xa4\\x83\\xcc\\x88\\xea\\xb0\\x81', [ 0, 5, 8 ]))
+    assert(_convert_line(u'÷ D800 ÷ # abc') == ('\\xe2\\x80\\x8b', [ 0, 3 ]))
 
     result = []
 
-    with open(grapheme_break_test_file_name, 'rb') as f:
+    with codecs.open(grapheme_break_test_file_name, encoding=sys.getfilesystemencoding(), errors='strict') as f:
        for line in f:
            test = _convert_line(line)
            if test:
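As a standalone sketch of the conversion technique in the final hunk (illustrative code, not the commit itself): build a one-character Unicode string from a code point, encode it to UTF-8, and render the bytes as C-style \xNN escapes. Wrapping the encoded result in bytearray makes iteration yield integers on both Python 2 and Python 3, which is why the ord() call could be dropped.

code_point = 0xAC01

# Only bytes objects have .decode() on Python 3, hence the b'' prefix; the
# doubled backslash keeps '\U' literal until unicode_escape interprets it.
# Note that %-formatting on bytes needs Python 3.5 or newer.
char = (b'\\U%(cp)08x' % {b'cp': code_point}).decode('unicode_escape', 'strict')

utf8_bytes = bytearray(char.encode('utf8', 'strict'))
escaped = ''.join(['\\x%02x' % byte for byte in utf8_bytes])

print(escaped)  # \xea\xb0\x81 -- the same escape sequence as in the self-test above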
