unicode-rs · Manishearth · May 15, 2019 · May 15, 2019 · May 15, 2019 · May 15, 2019
diff --git a/scripts/unicode.py b/scripts/unicode.py
@@ -20,7 +20,7 @@
 # Since this should not require frequent updates, we just store this
 # out-of-line and check the unicode.rs file into git.
 
-import fileinput, re, os, sys, operator
+import fileinput, re, os, sys
 
 preamble = '''// Copyright 2012-2018 The Rust Project Developers. See the COPYRIGHT
 // file at the top-level directory of this distribution and at
@@ -59,7 +59,7 @@ def is_surrogate(n):
 
 def fetch(f):
     if not os.path.exists(os.path.basename(f)):
-        os.system("curl -O http://www.unicode.org/Public/UNIDATA/%s"
+        os.system("curl -O http://www.unicode.org/Public/9.0.0/ucd/%s"
                   % f)
 
     if not os.path.exists(os.path.basename(f)):
@@ -80,7 +80,7 @@ def load_gencats(f):
         if is_surrogate(cp):
             continue
         if range_start >= 0:
-            for i in xrange(range_start, cp):
+            for i in range(range_start, cp):
                 udict[i] = data;
             range_start = -1;
         if data[1].endswith(", First>"):
@@ -150,8 +150,8 @@ def format_table_content(f, content, indent):
 def load_properties(f, interestingprops):
     fetch(f)
     props = {}
-    re1 = re.compile("^ *([0-9A-F]+) *; *(\w+)")
-    re2 = re.compile("^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")
+    re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
+    re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")
 
     for line in fileinput.input(os.path.basename(f)):
         prop = None
@@ -309,7 +309,7 @@ def emit_break_module(f, break_table, break_cats, name):
         # download and parse all the data
         fetch("ReadMe.txt")
         with open("ReadMe.txt") as readme:
-            pattern = "for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
+            pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
             unicode_version = re.search(pattern, readme.read()).groups()
         rf.write("""
 /// The version of [Unicode](http://www.unicode.org/)
@@ -342,19 +342,19 @@ def emit_break_module(f, break_table, break_cats, name):
         for cat in grapheme_cats:
             grapheme_table.extend([(x, y, cat) for (x, y) in grapheme_cats[cat]])
         grapheme_table.sort(key=lambda w: w[0])
-        emit_break_module(rf, grapheme_table, grapheme_cats.keys(), "grapheme")
+        emit_break_module(rf, grapheme_table, list(grapheme_cats.keys()), "grapheme")
         rf.write("\n")
 
         word_cats = load_properties("auxiliary/WordBreakProperty.txt", [])
         word_table = []
         for cat in word_cats:
             word_table.extend([(x, y, cat) for (x, y) in word_cats[cat]])
         word_table.sort(key=lambda w: w[0])
-        emit_break_module(rf, word_table, word_cats.keys(), "word")
+        emit_break_module(rf, word_table, list(word_cats.keys()), "word")
 
         sentence_cats = load_properties("auxiliary/SentenceBreakProperty.txt", [])
         sentence_table = []
         for cat in sentence_cats:
             sentence_table.extend([(x, y, cat) for (x, y) in sentence_cats[cat]])
         sentence_table.sort(key=lambda w: w[0])
-        emit_break_module(rf, sentence_table, sentence_cats.keys(), "sentence")
+        emit_break_module(rf, sentence_table, list(sentence_cats.keys()), "sentence")
diff --git a/scripts/unicode_gen_breaktests.py b/scripts/unicode_gen_breaktests.py
@@ -17,23 +17,23 @@
 #
 # Since this should not require frequent updates, we just store this
 # out-of-line and check the unicode.rs file into git.
+from __future__ import print_function
 
 import unicode, re, os, fileinput
 
 def load_test_data(f, optsplit=[]):
-    outls = []
-    testRe1 = re.compile("^÷\s+([^\s].*[^\s])\s+÷\s+#\s+÷\s+\[0.2\].*?([÷×].*)\s+÷\s+\[0.3\]\s*$")
+    testRe1 = re.compile(r"^÷\s+([^\s].*[^\s])\s+÷\s+#\s+÷\s+\[0.2\].*?([÷×].*)\s+÷\s+\[0.3\]\s*$")
 
     unicode.fetch(f)
     data = []
     for line in fileinput.input(os.path.basename(f)):
         # lines that include a test start with the ÷ character
-        if len(line) < 2 or line[0:2] != '÷':
+        if len(line) < 2 or not line.startswith('÷'):
             continue
 
         m = testRe1.match(line)
         if not m:
-            print "error: no match on line where test was expected: %s" % line
+            print("error: no match on line where test was expected: %s" % line)
             continue
 
         # process the characters in this test case
@@ -48,9 +48,9 @@ def load_test_data(f, optsplit=[]):
         # make sure that we have break info for each break!
         assert len(chars) - 1 == len(info)
 
-        outls.append((chars, info))
+        data.append((chars, info))
 
-    return outls
+    return data
 
 def process_split_info(s, c, o):
     outcs = []
@@ -59,7 +59,7 @@ def process_split_info(s, c, o):
 
     # are we on a × or a ÷?
     isX = False
-    if s[0:2] == '×':
+    if s.startswith('×'):
         isX = True
 
     # find each instance of '(÷|×) [x.y] '
@@ -81,10 +81,10 @@ def process_split_info(s, c, o):
 
         idx = 1
         while idx < len(s):
-            if s[idx:idx+2] == '×':
+            if s[idx:].startswith('×'):
                 isX = True
                 break
-            if s[idx:idx+2] == '÷':
+            if s[idx:].startswith('÷'):
                 isX = False
                 break
             idx += 1
@@ -172,7 +172,7 @@ def create_grapheme_data(f):
     stype = "&'static [(&'static str, &'static [&'static str])]"
     dtype = "&'static [(&'static str, &'static [&'static str], &'static [&'static str])]"
     f.write("    // official Unicode test data\n")
-    f.write("    // http://www.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakTest.txt\n")
+    f.write("    // http://www.unicode.org/Public/9.0.0/ucd/auxiliary/GraphemeBreakTest.txt\n")
     unicode.emit_table(f, "TEST_SAME", test_same, stype, True, showfun, True)
     unicode.emit_table(f, "TEST_DIFF", test_diff, dtype, True, showfun, True)
 
@@ -187,7 +187,7 @@ def create_words_data(f):
 
     wtype = "&'static [(&'static str, &'static [&'static str])]"
     f.write("    // official Unicode test data\n")
-    f.write("    // http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt\n")
+    f.write("    // http://www.unicode.org/Public/9.0.0/ucd/auxiliary/WordBreakTest.txt\n")
     unicode.emit_table(f, "TEST_WORD", test, wtype, True, showfun, True)
 
 def create_sentence_data(f):

diff --git a/src/testdata.rs b/src/testdata.rs
@@ -1,4 +1,4 @@
-// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
+// Copyright 2012-2018 The Rust Project Developers. See the COPYRIGHT
 // file at the top-level directory of this distribution and at
 // http://rust-lang.org/COPYRIGHT.
 //
@@ -12,7 +12,7 @@
 
 #![allow(missing_docs, non_upper_case_globals, non_snake_case)]
     // official Unicode test data
-    // http://www.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakTest.txt
+    // http://www.unicode.org/Public/9.0.0/ucd/auxiliary/GraphemeBreakTest.txt
     pub const TEST_SAME: &'static [(&'static str, &'static [&'static str])] = &[
         ("\u{20}\u{20}", &["\u{20}", "\u{20}"]), ("\u{20}\u{308}\u{20}", &["\u{20}\u{308}",
         "\u{20}"]), ("\u{20}\u{d}", &["\u{20}", "\u{d}"]), ("\u{20}\u{308}\u{d}", &["\u{20}\u{308}",
@@ -516,7 +516,7 @@
     ];
 
     // official Unicode test data
-    // http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt
+    // http://www.unicode.org/Public/9.0.0/ucd/auxiliary/WordBreakTest.txt
     pub const TEST_WORD: &'static [(&'static str, &'static [&'static str])] = &[
         ("\u{1}\u{1}", &["\u{1}", "\u{1}"]), ("\u{1}\u{308}\u{1}", &["\u{1}\u{308}", "\u{1}"]),
         ("\u{1}\u{d}", &["\u{1}", "\u{d}"]), ("\u{1}\u{308}\u{d}", &["\u{1}\u{308}", "\u{d}"]),