Skip to content

Implement upper, lower case conversion for char #12551

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 67 additions & 25 deletions src/etc/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ def fetch(f):
def load_unicode_data(f):
fetch(f)
gencats = {}
case_conversions = {}
combines = []
canon_decomp = {}
compat_decomp = {}
Expand All @@ -44,6 +45,7 @@ def load_unicode_data(f):
c_hi = 0
com_lo = 0
com_hi = 0

for line in fileinput.input(f):
fields = line.split(";")
if len(fields) != 15:
Expand All @@ -52,8 +54,13 @@ def load_unicode_data(f):
decomp, deci, digit, num, mirror,
old, iso, upcase, lowcase, titlecase ] = fields

code_org = code
code = int(code, 16)

# generate char to char direct translations
if gencat == "Lu" and lowcase != "":
case_conversions[code] = int(lowcase, 16)

if decomp != "":
if decomp.startswith('<'):
seq = []
Expand Down Expand Up @@ -96,7 +103,7 @@ def load_unicode_data(f):
com_lo = code
com_hi = code

return (canon_decomp, compat_decomp, gencats, combines)
return (canon_decomp, compat_decomp, gencats, combines, case_conversions)

def load_properties(f, interestingprops):
fetch(f)
Expand Down Expand Up @@ -164,8 +171,9 @@ def emit_property_module(f, mod, tbl):
keys = tbl.keys()
keys.sort()
emit_bsearch_range_table(f);

for cat in keys:
if cat == "Cs": continue
if cat not in ["Nd", "Nl", "No", "Cc", "XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase", "White_Space"]: continue
f.write(" static %s_table : &'static [(char,char)] = &[\n" % cat)
ix = 0
for pair in tbl[cat]:
Expand All @@ -180,29 +188,62 @@ def emit_property_module(f, mod, tbl):
f.write("}\n")


def emit_property_module_old(f, mod, tbl):
f.write("mod %s {\n" % mod)
# Writes the generated Rust `conversions` module to `f`: the opening
# `pub mod conversions {` line, the `use` lines plus the `to_lower` / `to_upper`
# functions (both return the input char unchanged when the search yields None),
# then the binary-search helpers and the LuLl table, and the closing brace.
def emit_conversions_module(f, case_conversions):
f.write("pub mod conversions {\n")
f.write("""
use cmp::{Equal, Less, Greater};
use vec::ImmutableVector;
use tuple::Tuple2;
use option::{ Option, Some, None };

pub fn to_lower(c: char) -> char {
match bsearch_lu(c) {
None => c,
Some(index) => LuLl_table[index].val1()
}
}

pub fn to_upper(c: char) -> char {
match bsearch_ll(c) {
None => c,
Some(index) => LuLl_table[index].val0()
}
}
""");

emit_bsearch_lu_ll(f)
emit_caseconversions(f, case_conversions)
f.write("}\n")

def emit_caseconversions(f, tbl):
f.write(" static LuLl_table : &'static [(char, char)] = &[\n")
keys = tbl.keys()
keys.sort()
for cat in keys:
f.write(" fn %s(c: char) -> bool {\n" % cat)
f.write(" ret alt c {\n")
prefix = ' '
for pair in tbl[cat]:
if pair[0] == pair[1]:
f.write(" %c %s\n" %
(prefix, escape_char(pair[0])))
else:
f.write(" %c %s to %s\n" %
(prefix,
escape_char(pair[0]),
escape_char(pair[1])))
prefix = '|'
f.write(" { true }\n")
f.write(" _ { false }\n")
f.write(" };\n")
f.write(" }\n\n")
f.write("}\n")
ix = 0
for key in keys:
f.write(ch_prefix(ix))
f.write("(%s, %s)" % (escape_char(key), escape_char(tbl[key])))
ix += 1
f.write("\n ];\n\n")

# Writes the two binary-search helpers used by the generated `conversions`
# module: `bsearch_lu` compares against the first element of each
# `LuLl_table` tuple, `bsearch_ll` against the second element.
# NOTE(review): emit_caseconversions writes the table in sorted-key order, i.e.
# ordered by the first tuple element only. A binary search over the second
# element is only valid if that column happens to be ordered too -- TODO
# confirm against UnicodeData.txt (several Lu code points may share or
# reorder their lowercase mapping).
def emit_bsearch_lu_ll(f):
f.write("""
fn bsearch_lu(c: char) -> Option<uint> {
LuLl_table.bsearch(|&(x, _)| {
if x == c { Equal }
else if x < c { Less }
else { Greater }
})
}

fn bsearch_ll(c: char) -> Option<uint> {
LuLl_table.bsearch(|&(_, x)| {
if x == c { Equal }
else if x < c { Less }
else { Greater }
})
}\n
""");

def format_table_content(f, content, indent):
line = " "*indent
Expand Down Expand Up @@ -359,7 +400,7 @@ def emit_decomp_module(f, canon, compat, combine):
os.remove(i);
rf = open(r, "w")

(canon_decomp, compat_decomp, gencats, combines) = load_unicode_data("UnicodeData.txt")
(canon_decomp, compat_decomp, gencats, combines, case_conversions) = load_unicode_data("UnicodeData.txt")

# Preamble
rf.write('''// Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT
Expand All @@ -376,7 +417,6 @@ def emit_decomp_module(f, canon, compat, combine):

#[allow(missing_doc)];
#[allow(non_uppercase_statics)];

''')

emit_property_module(rf, "general_category", gencats)
Expand All @@ -385,7 +425,9 @@ def emit_decomp_module(f, canon, compat, combine):

derived = load_properties("DerivedCoreProperties.txt",
["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase"])

emit_property_module(rf, "derived_property", derived)

props = load_properties("PropList.txt", ["White_Space"])
emit_property_module(rf, "property", props)
emit_conversions_module(rf, case_conversions)
48 changes: 47 additions & 1 deletion src/libstd/char.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ use cast::transmute;
use option::{None, Option, Some};
use iter::{Iterator, range_step};
use str::StrSlice;
use unicode::{derived_property, property, general_category, decompose};
use unicode::{derived_property, property, general_category, decompose, conversions};

#[cfg(test)] use str::OwnedStr;

Expand Down Expand Up @@ -195,6 +195,28 @@ pub fn to_digit(c: char, radix: uint) -> Option<uint> {
else { None }
}

/// Convert a char to its uppercase equivalent
///
/// Multi char foldings are not supported at the moment
///
/// # Return value
///
/// Returns the char itself if no conversion is possible
#[inline]
pub fn to_uppercase(c: char) -> char {
conversions::to_upper(c)
}

/// Convert a char to its lowercase equivalent
///
/// Multi char foldings are not supported at the moment
///
/// # Return value
///
/// Returns the char itself if no conversion is possible
#[inline]
pub fn to_lowercase(c: char) -> char {
conversions::to_lower(c)
}

///
/// Converts a number to the character representing it.
///
Expand Down Expand Up @@ -355,6 +377,8 @@ pub trait Char {
fn is_digit(&self) -> bool;
fn is_digit_radix(&self, radix: uint) -> bool;
fn to_digit(&self, radix: uint) -> Option<uint>;
fn to_lowercase(&self) -> char;
fn to_uppercase(&self) -> char;
fn from_digit(num: uint, radix: uint) -> Option<char>;
fn escape_unicode(&self, f: |char|);
fn escape_default(&self, f: |char|);
Expand Down Expand Up @@ -390,6 +414,10 @@ impl Char for char {

fn to_digit(&self, radix: uint) -> Option<uint> { to_digit(*self, radix) }

fn to_lowercase(&self) -> char { to_lowercase(*self) }

fn to_uppercase(&self) -> char { to_uppercase(*self) }

fn from_digit(num: uint, radix: uint) -> Option<char> { from_digit(num, radix) }

fn escape_unicode(&self, f: |char|) { escape_unicode(*self, f) }
Expand Down Expand Up @@ -485,6 +513,24 @@ fn test_to_digit() {
assert_eq!('$'.to_digit(36u), None);
}

// Exercises `to_lowercase` through the `Char` trait method on ASCII,
// non-ASCII Latin and non-letter inputs.
#[test]
fn test_to_lowercase() {
assert_eq!('A'.to_lowercase(), 'a');
assert_eq!('Ö'.to_lowercase(), 'ö');
assert_eq!('ß'.to_lowercase(), 'ß'); // expected to come back unchanged
assert_eq!('Ü'.to_lowercase(), 'ü');
assert_eq!('💩'.to_lowercase(), '💩'); // expected to come back unchanged
}

// Exercises `to_uppercase` through the `Char` trait method on ASCII,
// non-ASCII Latin and non-letter inputs.
#[test]
fn test_to_uppercase() {
assert_eq!('a'.to_uppercase(), 'A');
assert_eq!('ö'.to_uppercase(), 'Ö');
assert_eq!('ß'.to_uppercase(), 'ß'); // expected to come back unchanged
assert_eq!('ü'.to_uppercase(), 'Ü');
assert_eq!('💩'.to_uppercase(), '💩'); // expected to come back unchanged
}

#[test]
fn test_is_control() {
assert!('\u0000'.is_control());
Expand Down
Loading