|
| 1 | +import regex |
| 2 | +import struct |
| 3 | + |
def cpt_to_utf8_str(cpt):
    """Pack an integer code point into four bytes, least-significant byte first.

    Despite the name, the result is not UTF-8: it is the fixed-width
    4-byte little-endian layout that the caller decodes as UTF-32.

    Raises ValueError (from ``bytes``) when ``cpt`` is negative or does not
    fit in 32 bits.
    """
    low_three = [(cpt >> shift) & 0xFF for shift in (0, 8, 16)]
    # The top byte is deliberately left unmasked so that out-of-range values
    # are rejected by bytes() instead of being silently truncated.
    return bytes(low_three + [cpt >> 24])
| 13 | + |
def is_match(codepoint, regex_expr):
    """Return True if the character for ``codepoint`` matches ``regex_expr``.

    The code point is turned into a one-character string and the pattern is
    applied at the start of that string with ``regex.match``.

    Code points that cannot be decoded into a string (surrogates, values
    outside the Unicode range) are reported as non-matching. Errors raised by
    the pattern itself are not swallowed, so an invalid expression fails
    loudly instead of silently yielding an empty table.
    """
    try:
        # Decode with an explicit byte order. The generic 'utf-32' codec
        # treats the bytes FF FE 00 00 (i.e. U+FEFF) as a byte-order mark and
        # decodes them to an empty string, and it falls back to the platform's
        # native byte order when no mark is present.
        char = cpt_to_utf8_str(codepoint).decode('utf-32-le')
    except ValueError:
        # UnicodeDecodeError (a ValueError subclass) for undecodable code
        # points; plain ValueError when the value does not fit in four bytes.
        return False
    return regex.match(regex_expr, char) is not None
| 20 | + |
def get_matches(regex_expr):
    """Collect the inclusive code point ranges whose characters match ``regex_expr``.

    Every code point from 0 through 0x10FFFF is tested in ascending order
    with ``is_match``; consecutive matches are merged into a single
    ``(first, last)`` tuple. The returned list is ordered by code point.
    """
    ranges = []
    run_start = None  # first code point of the run currently open, if any

    for cp in range(0x110000):
        matched = is_match(cp, regex_expr)
        if matched and run_start is None:
            run_start = cp
        elif not matched and run_start is not None:
            # The run ended on the previous code point.
            ranges.append((run_start, cp - 1))
            run_start = None

    # A run still open after the loop extends to the last code point tested.
    if run_start is not None:
        ranges.append((run_start, 0x10FFFF))

    return ranges
| 39 | + |
def print_cat(cat, ranges):
    """Print ``ranges`` to stdout as a C++ vector of ``uint32_t`` pairs.

    The table is named ``unicode_ranges_<cat>``. Each ``(start, end)`` pair is
    written as two zero-padded 8-digit hexadecimal values, four pairs per
    line, and the definition is followed by one blank line.
    """
    print("const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_{} = {{".format(cat))

    entries = ["{{0x{:08X}, 0x{:08X}}},".format(start, end) for start, end in ranges]
    per_row = 4
    for row_begin in range(0, len(entries), per_row):
        # Entries on one row are separated by a single space; rows are not indented.
        print(" ".join(entries[row_begin:row_begin + per_row]))

    print("};")
    print("")
| 55 | + |
# Emit one C++ range table per Unicode property pattern, in a fixed order.
for table_name, category_pattern in (
    ("number", r'\p{N}'),
    ("letter", r'\p{L}'),
    ("whitespace", r'\p{Z}'),
    ("accent_mark", r'\p{M}'),
    ("punctuation", r'\p{P}'),
    ("symbol", r'\p{S}'),
    ("control", r'\p{C}'),
):
    print_cat(table_name, get_matches(category_pattern))
| 63 | + |
0 commit comments