Commit 7dbb412

[gyb] Force Unicode strings in Python 2
In Python 3, all strings are sequences of Unicode characters, whereas Python 2's native strings are sequences of bytes (although Python 2 also has a separate unicode string type). This patch changes the file readers to use the codecs module so that, on Python 2 as well, the data files are decoded into unicode strings as they are read. From that point on the strings behave equivalently on Python 2 and 3. The rest of the patch updates the surrounding code to work natively with Unicode strings.

To test the class `GraphemeClusterBreakPropertyTable`:

$ python2 utils/gyb --test \
    -DunicodeGraphemeBreakPropertyFile=./utils/UnicodeData/GraphemeBreakProperty.txt \
    -DunicodeGraphemeBreakTestFile=./utils/UnicodeData/GraphemeBreakTest.txt \
    -DCMAKE_SIZEOF_VOID_P=8 \
    -o /tmp/UnicodeExtendedGraphemeClusters.cpp.2.7.tmp \
    ./stdlib/public/stubs/UnicodeExtendedGraphemeClusters.cpp.gyb
$ python3 utils/gyb --test \
    -DunicodeGraphemeBreakPropertyFile=./utils/UnicodeData/GraphemeBreakProperty.txt \
    -DunicodeGraphemeBreakTestFile=./utils/UnicodeData/GraphemeBreakTest.txt \
    -DCMAKE_SIZEOF_VOID_P=8 \
    -o /tmp/UnicodeExtendedGraphemeClusters.cpp.3.5.tmp \
    ./stdlib/public/stubs/UnicodeExtendedGraphemeClusters.cpp.gyb
$ diff -u /tmp/UnicodeExtendedGraphemeClusters.cpp.2.7.tmp \
    /tmp/UnicodeExtendedGraphemeClusters.cpp.3.5.tmp

To test the method `get_grapheme_cluster_break_tests_as_UTF8`:

$ python2 utils/gyb --test \
    -DunicodeGraphemeBreakPropertyFile=./utils/UnicodeData/GraphemeBreakProperty.txt \
    -DunicodeGraphemeBreakTestFile=./utils/UnicodeData/GraphemeBreakTest.txt \
    -DCMAKE_SIZEOF_VOID_P=8 \
    -o /tmp/UnicodeGraphemeBreakTest.cpp.2.7.tmp \
    ./unittests/Basic/UnicodeGraphemeBreakTest.cpp.gyb
$ python3 utils/gyb --test \
    -DunicodeGraphemeBreakPropertyFile=./utils/UnicodeData/GraphemeBreakProperty.txt \
    -DunicodeGraphemeBreakTestFile=./utils/UnicodeData/GraphemeBreakTest.txt \
    -DCMAKE_SIZEOF_VOID_P=8 \
    -o /tmp/UnicodeGraphemeBreakTest.cpp.3.5.tmp \
    ./unittests/Basic/UnicodeGraphemeBreakTest.cpp.gyb
$ diff -u /tmp/UnicodeGraphemeBreakTest.cpp.2.7.tmp \
    /tmp/UnicodeGraphemeBreakTest.cpp.3.5.tmp
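A minimal sketch of the reading strategy the commit adopts (the helper name and usage path below are illustrative, not part of the commit): codecs.open decodes the file while reading, so each line arrives as a unicode string on Python 2 and a str on Python 3, and the downstream template logic can stay identical.

import sys
import codecs

def read_unicode_lines(path):
    # Decode while reading: every yielded line is already a Unicode string
    # under both Python 2 and Python 3.
    with codecs.open(path, encoding=sys.getfilesystemencoding(),
                     errors='strict') as f:
        for line in f:
            yield line.rstrip(u'\n')

# Hypothetical usage with one of the data files passed to gyb above:
# for line in read_unicode_lines('./utils/UnicodeData/GraphemeBreakProperty.txt'):
#     handle(line)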
1 parent c677844 commit 7dbb412

2 files changed: +14 lines, -10 lines


lib/ClangImporter/SortedCFDatabase.def.gyb

Lines changed: 3 additions & 1 deletion
@@ -17,6 +17,8 @@
 %{
 
 import re
+import sys
+import codecs
 
 prologueLines = ""
 epilogueLines = ""
@@ -26,7 +28,7 @@ epilogueLines = ""
 lineForName = {}
 
 # Load the data file.
-with open(CFDatabaseFile, 'rb') as f:
+with codecs.open(CFDatabaseFile, encoding=sys.getfilesystemencoding(), errors='strict') as f:
   for line in f:
     # Pass through preprocessor directives literally.
     # Assume that they all fall into either a strict prologue or epilogue.
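The hunk above swaps a binary-mode open for codecs.open. A short, assumed illustration of the failure mode this avoids (standalone snippet, not code from this commit): lines read in 'rb' mode stay byte strings, so comparisons against Unicode literals quietly fail.

line = b'\xc3\xb7'                        # UTF-8 bytes of U+00F7 DIVISION SIGN
print(line == u'\u00f7')                  # False on Python 3; UnicodeWarning and False on Python 2
print(line.decode('utf-8') == u'\u00f7')  # True once the bytes are decoded to text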

utils/GYBUnicodeDataUtils.py

Lines changed: 11 additions & 9 deletions
@@ -11,6 +11,8 @@
 ##===----------------------------------------------------------------------===##
 
 import re
+import sys
+import codecs
 
 class UnicodeProperty(object):
     """Abstract base class for Unicode properties."""
@@ -68,7 +70,7 @@ def __init__(self, grapheme_break_property_file_name):
             self.symbolic_values[v] = k
 
         # Load the data file.
-        with open(grapheme_break_property_file_name, 'rb') as f:
+        with codecs.open(grapheme_break_property_file_name, encoding=sys.getfilesystemencoding(), errors='strict') as f:
             for line in f:
                 # Strip comments.
                 line = re.sub('#.*', '', line)
@@ -514,9 +516,9 @@ def _convert_line(line):
 
         # Match a list of code points.
         for token in line.split(" "):
-            if token == "÷":
+            if token == u"÷":
                 boundaries += [ curr_bytes ]
-            elif token == "×":
+            elif token == u"×":
                 pass
             else:
                 code_point = int(token, 16)
@@ -529,21 +531,21 @@ def _convert_line(line):
                 # and test separately that we handle ill-formed UTF-8 sequences.
                 if code_point >= 0xd800 and code_point <= 0xdfff:
                     code_point = 0x200b
-                code_point = ('\U%(cp)08x' % { 'cp': code_point }).decode('unicode_escape')
-                as_UTF8_bytes = code_point.encode('utf8')
-                as_UTF8_escaped = ''.join(['\\x%(byte)02x' % { 'byte': ord(byte) } for byte in as_UTF8_bytes])
+                code_point = (b'\U%(cp)08x' % { b'cp': code_point }).decode('unicode_escape', 'strict')
+                as_UTF8_bytes = bytearray(code_point.encode('utf8', 'strict'))
+                as_UTF8_escaped = ''.join(['\\x%(byte)02x' % { 'byte': byte } for byte in as_UTF8_bytes])
                 test += as_UTF8_escaped
                 curr_bytes += len(as_UTF8_bytes)
 
         return (test, boundaries)
 
     # Self-test.
-    assert(_convert_line('÷ 0903 × 0308 ÷ AC01 ÷ # abc') == ('\\xe0\\xa4\\x83\\xcc\\x88\\xea\\xb0\\x81', [ 0, 5, 8 ]))
-    assert(_convert_line('÷ D800 ÷ # abc') == ('\\xe2\\x80\\x8b', [ 0, 3 ]))
+    assert(_convert_line(u'÷ 0903 × 0308 ÷ AC01 ÷ # abc') == ('\\xe0\\xa4\\x83\\xcc\\x88\\xea\\xb0\\x81', [ 0, 5, 8 ]))
+    assert(_convert_line(u'÷ D800 ÷ # abc') == ('\\xe2\\x80\\x8b', [ 0, 3 ]))
 
     result = []
 
-    with open(grapheme_break_test_file_name, 'rb') as f:
+    with codecs.open(grapheme_break_test_file_name, encoding=sys.getfilesystemencoding(), errors='strict') as f:
        for line in f:
            test = _convert_line(line)
            if test:
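As a standalone sketch of the conversion technique in the final hunk (illustrative code, not the commit itself): build a one-character Unicode string from a code point, encode it to UTF-8, and render the bytes as C-style \xNN escapes. Wrapping the encoded result in bytearray makes iteration yield integers on both Python 2 and Python 3, which is why the ord() call could be dropped.

code_point = 0xAC01

# Only bytes objects have .decode() on Python 3, hence the b'' prefix; the
# doubled backslash keeps '\U' literal until unicode_escape interprets it.
# Note that %-formatting on bytes needs Python 3.5 or newer.
char = (b'\\U%(cp)08x' % {b'cp': code_point}).decode('unicode_escape', 'strict')

utf8_bytes = bytearray(char.encode('utf8', 'strict'))
escaped = ''.join(['\\x%02x' % byte for byte in utf8_bytes])

print(escaped)  # \xea\xb0\x81 -- the same escape sequence as in the self-test above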
