Skip to content

MAINT Fixes for Python scripts #54

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits on May 15, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 9 additions & 9 deletions scripts/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
# Since this should not require frequent updates, we just store this
# out-of-line and check the unicode.rs file into git.

import fileinput, re, os, sys, operator
import fileinput, re, os, sys

preamble = '''// Copyright 2012-2018 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
Expand Down Expand Up @@ -59,7 +59,7 @@ def is_surrogate(n):

def fetch(f):
if not os.path.exists(os.path.basename(f)):
os.system("curl -O http://www.unicode.org/Public/UNIDATA/%s"
os.system("curl -O http://www.unicode.org/Public/9.0.0/ucd/%s"
% f)

if not os.path.exists(os.path.basename(f)):
Expand All @@ -80,7 +80,7 @@ def load_gencats(f):
if is_surrogate(cp):
continue
if range_start >= 0:
for i in xrange(range_start, cp):
for i in range(range_start, cp):
udict[i] = data;
range_start = -1;
if data[1].endswith(", First>"):
Expand Down Expand Up @@ -150,8 +150,8 @@ def format_table_content(f, content, indent):
def load_properties(f, interestingprops):
fetch(f)
props = {}
re1 = re.compile("^ *([0-9A-F]+) *; *(\w+)")
re2 = re.compile("^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")
re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")

for line in fileinput.input(os.path.basename(f)):
prop = None
Expand Down Expand Up @@ -309,7 +309,7 @@ def emit_break_module(f, break_table, break_cats, name):
# download and parse all the data
fetch("ReadMe.txt")
with open("ReadMe.txt") as readme:
pattern = "for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
unicode_version = re.search(pattern, readme.read()).groups()
rf.write("""
/// The version of [Unicode](http://www.unicode.org/)
Expand Down Expand Up @@ -342,19 +342,19 @@ def emit_break_module(f, break_table, break_cats, name):
for cat in grapheme_cats:
grapheme_table.extend([(x, y, cat) for (x, y) in grapheme_cats[cat]])
grapheme_table.sort(key=lambda w: w[0])
emit_break_module(rf, grapheme_table, grapheme_cats.keys(), "grapheme")
emit_break_module(rf, grapheme_table, list(grapheme_cats.keys()), "grapheme")
rf.write("\n")

word_cats = load_properties("auxiliary/WordBreakProperty.txt", [])
word_table = []
for cat in word_cats:
word_table.extend([(x, y, cat) for (x, y) in word_cats[cat]])
word_table.sort(key=lambda w: w[0])
emit_break_module(rf, word_table, word_cats.keys(), "word")
emit_break_module(rf, word_table, list(word_cats.keys()), "word")

sentence_cats = load_properties("auxiliary/SentenceBreakProperty.txt", [])
sentence_table = []
for cat in sentence_cats:
sentence_table.extend([(x, y, cat) for (x, y) in sentence_cats[cat]])
sentence_table.sort(key=lambda w: w[0])
emit_break_module(rf, sentence_table, sentence_cats.keys(), "sentence")
emit_break_module(rf, sentence_table, list(sentence_cats.keys()), "sentence")
22 changes: 11 additions & 11 deletions scripts/unicode_gen_breaktests.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,23 +17,23 @@
#
# Since this should not require frequent updates, we just store this
# out-of-line and check the unicode.rs file into git.
from __future__ import print_function

import unicode, re, os, fileinput

def load_test_data(f, optsplit=[]):
outls = []
testRe1 = re.compile("^÷\s+([^\s].*[^\s])\s+÷\s+#\s+÷\s+\[0.2\].*?([÷×].*)\s+÷\s+\[0.3\]\s*$")
testRe1 = re.compile(r"^÷\s+([^\s].*[^\s])\s+÷\s+#\s+÷\s+\[0.2\].*?([÷×].*)\s+÷\s+\[0.3\]\s*$")

unicode.fetch(f)
data = []
for line in fileinput.input(os.path.basename(f)):
# lines that include a test start with the ÷ character
if len(line) < 2 or line[0:2] != '÷':
if len(line) < 2 or not line.startswith('÷'):
continue

m = testRe1.match(line)
if not m:
print "error: no match on line where test was expected: %s" % line
print("error: no match on line where test was expected: %s" % line)
continue

# process the characters in this test case
Expand All @@ -48,9 +48,9 @@ def load_test_data(f, optsplit=[]):
# make sure that we have break info for each break!
assert len(chars) - 1 == len(info)

outls.append((chars, info))
data.append((chars, info))

return outls
return data

def process_split_info(s, c, o):
outcs = []
Expand All @@ -59,7 +59,7 @@ def process_split_info(s, c, o):

# are we on a × or a ÷?
isX = False
if s[0:2] == '×':
if s.startswith('×'):
isX = True

# find each instance of '(÷|×) [x.y] '
Expand All @@ -81,10 +81,10 @@ def process_split_info(s, c, o):

idx = 1
while idx < len(s):
if s[idx:idx+2] == '×':
if s[idx:].startswith('×'):
isX = True
break
if s[idx:idx+2] == '÷':
if s[idx:].startswith('÷'):
isX = False
break
idx += 1
Expand Down Expand Up @@ -172,7 +172,7 @@ def create_grapheme_data(f):
stype = "&'static [(&'static str, &'static [&'static str])]"
dtype = "&'static [(&'static str, &'static [&'static str], &'static [&'static str])]"
f.write(" // official Unicode test data\n")
f.write(" // http://www.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakTest.txt\n")
f.write(" // http://www.unicode.org/Public/9.0.0/ucd/auxiliary/GraphemeBreakTest.txt\n")
unicode.emit_table(f, "TEST_SAME", test_same, stype, True, showfun, True)
unicode.emit_table(f, "TEST_DIFF", test_diff, dtype, True, showfun, True)

Expand All @@ -187,7 +187,7 @@ def create_words_data(f):

wtype = "&'static [(&'static str, &'static [&'static str])]"
f.write(" // official Unicode test data\n")
f.write(" // http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt\n")
f.write(" // http://www.unicode.org/Public/9.0.0/ucd/auxiliary/WordBreakTest.txt\n")
unicode.emit_table(f, "TEST_WORD", test, wtype, True, showfun, True)

def create_sentence_data(f):
Expand Down
6 changes: 3 additions & 3 deletions src/testdata.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// Copyright 2012-2018 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
Expand All @@ -12,7 +12,7 @@

#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
// official Unicode test data
// http://www.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakTest.txt
// http://www.unicode.org/Public/9.0.0/ucd/auxiliary/GraphemeBreakTest.txt
pub const TEST_SAME: &'static [(&'static str, &'static [&'static str])] = &[
("\u{20}\u{20}", &["\u{20}", "\u{20}"]), ("\u{20}\u{308}\u{20}", &["\u{20}\u{308}",
"\u{20}"]), ("\u{20}\u{d}", &["\u{20}", "\u{d}"]), ("\u{20}\u{308}\u{d}", &["\u{20}\u{308}",
Expand Down Expand Up @@ -516,7 +516,7 @@
];

// official Unicode test data
// http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt
// http://www.unicode.org/Public/9.0.0/ucd/auxiliary/WordBreakTest.txt
pub const TEST_WORD: &'static [(&'static str, &'static [&'static str])] = &[
("\u{1}\u{1}", &["\u{1}", "\u{1}"]), ("\u{1}\u{308}\u{1}", &["\u{1}\u{308}", "\u{1}"]),
("\u{1}\u{d}", &["\u{1}", "\u{d}"]), ("\u{1}\u{308}\u{d}", &["\u{1}\u{308}", "\u{d}"]),
Expand Down