Skip to content

Commit 5eecc40

Browse files
authored
[mypyc] Add LoadLiteral and use tables to construct and store literals, try 2 (#10147)
This is my second attempt to land this. See the first attempt (#10040) for the details. Previously this broke Windows wheel builds. The change from the first attempt is that instead of generating long static C strings, we generate arrays of shorter strings.
1 parent 5161d93 commit 5eecc40

33 files changed

+909
-407
lines changed

mypyc/analysis/dataflow.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from mypyc.ir.ops import (
88
Value, ControlOp,
99
BasicBlock, OpVisitor, Assign, Integer, LoadErrorValue, RegisterOp, Goto, Branch, Return, Call,
10-
Box, Unbox, Cast, Op, Unreachable, TupleGet, TupleSet, GetAttr, SetAttr,
10+
Box, Unbox, Cast, Op, Unreachable, TupleGet, TupleSet, GetAttr, SetAttr, LoadLiteral,
1111
LoadStatic, InitStatic, MethodCall, RaiseStandardError, CallC, LoadGlobal,
1212
Truncate, IntOp, LoadMem, GetElementPtr, LoadAddress, ComparisonOp, SetMem
1313
)
@@ -165,6 +165,9 @@ def visit_method_call(self, op: MethodCall) -> GenAndKill:
165165
def visit_load_error_value(self, op: LoadErrorValue) -> GenAndKill:
166166
return self.visit_register_op(op)
167167

168+
def visit_load_literal(self, op: LoadLiteral) -> GenAndKill:
169+
# Handled exactly like the neighbouring visit_load_error_value and
# visit_get_attr: the op is passed straight to visit_register_op.
return self.visit_register_op(op)
170+
168171
def visit_get_attr(self, op: GetAttr) -> GenAndKill:
169172
return self.visit_register_op(op)
170173

mypyc/codegen/cstring.py

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,11 @@
1919
"""
2020

2121
import string
22-
from typing import Tuple
2322

24-
CHAR_MAP = ['\\{:03o}'.format(i) for i in range(256)]
23+
from typing_extensions import Final
24+
25+
26+
CHAR_MAP = ['\\{:03o}'.format(i) for i in range(256)] # type: Final
2527

2628
# It is safe to use string.printable as it always uses the C locale.
2729
for c in string.printable:
@@ -38,12 +40,15 @@
3840
CHAR_MAP[ord('?')] = r'\?'
3941

4042

41-
def encode_as_c_string(s: str) -> Tuple[str, int]:
42-
"""Produce a quoted C string literal and its size, for a UTF-8 string."""
43-
return encode_bytes_as_c_string(s.encode('utf-8'))
43+
def encode_bytes_as_c_string(b: bytes) -> str:
    """Return the body of a C string literal that spells out the bytes in b.

    Each byte is translated through the CHAR_MAP lookup table and the
    pieces are concatenated. The surrounding double quotes are not part
    of the result.
    """
    return ''.join(map(CHAR_MAP.__getitem__, b))
4447

4548

46-
def encode_bytes_as_c_string(b: bytes) -> Tuple[str, int]:
47-
"""Produce a quoted C string literal and its size, for a byte string."""
48-
escaped = ''.join([CHAR_MAP[i] for i in b])
49-
return '"{}"'.format(escaped), len(b)
49+
def c_string_initializer(value: bytes) -> str:
    """Build the initializer text for a C char[] / char * variable.

    The bytes are escaped with encode_bytes_as_c_string and then wrapped
    in double quotes.

    For example, if value is b'foo', the result would be '"foo"'.
    """
    return '"{}"'.format(encode_bytes_as_c_string(value))

mypyc/codegen/emit.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from mypyc.ir.class_ir import ClassIR, all_concrete_classes
2121
from mypyc.namegen import NameGenerator, exported_name
2222
from mypyc.sametype import is_same_type
23+
from mypyc.codegen.literals import Literals
2324

2425

2526
class HeaderDeclaration:
@@ -84,6 +85,8 @@ def __init__(self,
8485
# The declaration contains the body of the struct.
8586
self.declarations = OrderedDict() # type: Dict[str, HeaderDeclaration]
8687

88+
self.literals = Literals()
89+
8790

8891
class Emitter:
8992
"""Helper for C code generation."""

mypyc/codegen/emitfunc.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,11 @@
1212
LoadStatic, InitStatic, TupleGet, TupleSet, Call, IncRef, DecRef, Box, Cast, Unbox,
1313
BasicBlock, Value, MethodCall, Unreachable, NAMESPACE_STATIC, NAMESPACE_TYPE, NAMESPACE_MODULE,
1414
RaiseStandardError, CallC, LoadGlobal, Truncate, IntOp, LoadMem, GetElementPtr,
15-
LoadAddress, ComparisonOp, SetMem, Register
15+
LoadAddress, ComparisonOp, SetMem, Register, LoadLiteral
1616
)
1717
from mypyc.ir.rtypes import (
1818
RType, RTuple, is_tagged, is_int32_rprimitive, is_int64_rprimitive, RStruct,
19-
is_pointer_rprimitive
19+
is_pointer_rprimitive, is_int_rprimitive
2020
)
2121
from mypyc.ir.func_ir import FuncIR, FuncDecl, FUNC_STATICMETHOD, FUNC_CLASSMETHOD, all_values
2222
from mypyc.ir.class_ir import ClassIR
@@ -96,6 +96,7 @@ def __init__(self,
9696
self.declarations = declarations
9797
self.source_path = source_path
9898
self.module_name = module_name
99+
self.literals = emitter.context.literals
99100

100101
def temp_name(self) -> str:
101102
return self.emitter.temp_name()
@@ -173,6 +174,19 @@ def visit_load_error_value(self, op: LoadErrorValue) -> None:
173174
self.emit_line('%s = %s;' % (self.reg(op),
174175
self.c_error_value(op.type)))
175176

177+
def visit_load_literal(self, op: LoadLiteral) -> None:
    """Emit a C assignment that loads a literal out of the CPyStatics table.

    The table slot is the index that self.literals reports for op.value.
    When op.type is an int primitive, the slot is cast to CPyTagged and
    OR-ed with 1; otherwise the slot is assigned as is. The repr of the
    value is appended as a trailing C comment whenever that is safe.
    """
    index = self.literals.literal_index(op.value)
    text = repr(op.value)
    # Leave out the annotation if the repr could open or close a C comment,
    # or contains a NUL character.
    unsafe = '/*' in text or '*/' in text or '\0' in text
    comment = '' if unsafe else ' /* %s */' % text
    if is_int_rprimitive(op.type):
        template = '%s = (CPyTagged)CPyStatics[%d] | 1;%s'
    else:
        template = '%s = CPyStatics[%d];%s'
    self.emit_line(template % (self.reg(op), index, comment))
189+
176190
def get_attr_expr(self, obj: str, op: Union[GetAttr, SetAttr], decl_cl: ClassIR) -> str:
177191
"""Generate attribute accessor for normal (non-property) access.
178192

mypyc/codegen/emitmodule.py

Lines changed: 101 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -23,18 +23,19 @@
2323
from mypyc.irbuild.prepare import load_type_map
2424
from mypyc.irbuild.mapper import Mapper
2525
from mypyc.common import (
26-
PREFIX, TOP_LEVEL_NAME, INT_PREFIX, MODULE_PREFIX, RUNTIME_C_FILES, USE_FASTCALL,
26+
PREFIX, TOP_LEVEL_NAME, MODULE_PREFIX, RUNTIME_C_FILES, USE_FASTCALL,
2727
USE_VECTORCALL, shared_lib_name,
2828
)
29-
from mypyc.codegen.cstring import encode_as_c_string, encode_bytes_as_c_string
29+
from mypyc.codegen.cstring import c_string_initializer
30+
from mypyc.codegen.literals import Literals
3031
from mypyc.codegen.emit import EmitterContext, Emitter, HeaderDeclaration
3132
from mypyc.codegen.emitfunc import generate_native_function, native_function_header
3233
from mypyc.codegen.emitclass import generate_class_type_decl, generate_class
3334
from mypyc.codegen.emitwrapper import (
3435
generate_wrapper_function, wrapper_function_header,
3536
generate_legacy_wrapper_function, legacy_wrapper_function_header,
3637
)
37-
from mypyc.ir.ops import LiteralsMap, DeserMaps
38+
from mypyc.ir.ops import DeserMaps, LoadLiteral
3839
from mypyc.ir.rtypes import RType, RTuple
3940
from mypyc.ir.func_ir import FuncIR
4041
from mypyc.ir.class_ir import ClassIR
@@ -286,9 +287,8 @@ def compile_ir_to_c(
286287
if not group_modules:
287288
ctext[group_name] = []
288289
continue
289-
literals = mapper.literals[group_name]
290290
generator = GroupGenerator(
291-
literals, group_modules, source_paths,
291+
group_modules, source_paths,
292292
group_name, mapper.group_map, names,
293293
compiler_options
294294
)
@@ -447,7 +447,6 @@ def group_dir(group_name: str) -> str:
447447

448448
class GroupGenerator:
449449
def __init__(self,
450-
literals: LiteralsMap,
451450
modules: List[Tuple[str, ModuleIR]],
452451
source_paths: Dict[str, str],
453452
group_name: Optional[str],
@@ -461,7 +460,6 @@ def __init__(self,
461460
one .c file per module if in multi_file mode.)
462461
463462
Arguments:
464-
literals: The literals declared in this group
465463
modules: (name, ir) pairs for each module in the group
466464
source_paths: Map from module names to source file paths
467465
group_name: The name of the group (or None if this is single-module compilation)
@@ -470,7 +468,6 @@ def __init__(self,
470468
multi_file: Whether to put each module in its own source file regardless
471469
of group structure.
472470
"""
473-
self.literals = literals
474471
self.modules = modules
475472
self.source_paths = source_paths
476473
self.context = EmitterContext(names, group_name, group_map)
@@ -495,6 +492,11 @@ def generate_c_for_modules(self) -> List[Tuple[str, str]]:
495492
file_contents = []
496493
multi_file = self.use_shared_lib and self.multi_file
497494

495+
# Collect all literal refs in IR.
496+
for _, module in self.modules:
497+
for fn in module.functions:
498+
collect_literals(fn, self.context.literals)
499+
498500
base_emitter = Emitter(self.context)
499501
# Optionally just include the runtime library c files to
500502
# reduce the number of compiler invocations needed
@@ -505,12 +507,7 @@ def generate_c_for_modules(self) -> List[Tuple[str, str]]:
505507
base_emitter.emit_line('#include "__native_internal{}.h"'.format(self.short_group_suffix))
506508
emitter = base_emitter
507509

508-
for (_, literal), identifier in self.literals.items():
509-
if isinstance(literal, int):
510-
symbol = emitter.static_name(identifier, None)
511-
self.declare_global('CPyTagged ', symbol)
512-
else:
513-
self.declare_static_pyobject(identifier, emitter)
510+
self.generate_literal_tables()
514511

515512
for module_name, module in self.modules:
516513
if multi_file:
@@ -621,6 +618,32 @@ def generate_c_for_modules(self) -> List[Tuple[str, str]]:
621618
''.join(ext_declarations.fragments)),
622619
]
623620

621+
def generate_literal_tables(self) -> None:
    """Declare the C globals used to construct and hold Python literals.

    A single array, CPyStatics, is declared with one slot per recorded
    literal regardless of its type, so that any literal can be referred
    to by its index. It is followed by one constant description table
    per literal kind: str, bytes, int, float and complex.
    """
    literals = self.context.literals
    # The objects constructed during module initialization are stored here
    self.declare_global('PyObject *[%d]' % literals.num_literals(), 'CPyStatics')

    # One row per description table, in declaration order:
    # (C type, table name, initializer builder, getter for the encoded values)
    tables = [
        ('const char * const []', 'CPyLit_Str',
         c_string_array_initializer, literals.encoded_str_values),
        ('const char * const []', 'CPyLit_Bytes',
         c_string_array_initializer, literals.encoded_bytes_values),
        ('const char * const []', 'CPyLit_Int',
         c_string_array_initializer, literals.encoded_int_values),
        ('const double []', 'CPyLit_Float',
         c_array_initializer, literals.encoded_float_values),
        ('const double []', 'CPyLit_Complex',
         c_array_initializer, literals.encoded_complex_values),
    ]
    for c_type, table_name, build_initializer, encoded_values in tables:
        # Encode each table right before declaring it.
        init = build_initializer(encoded_values())
        self.declare_global(c_type, table_name, initializer=init)
646+
624647
def generate_export_table(self, decl_emitter: Emitter, code_emitter: Emitter) -> None:
625648
"""Generate the declaration and definition of the group's export struct.
626649
@@ -793,46 +816,10 @@ def generate_globals_init(self, emitter: Emitter) -> None:
793816
for symbol, fixup in self.simple_inits:
794817
emitter.emit_line('{} = {};'.format(symbol, fixup))
795818

796-
for (_, literal), identifier in self.literals.items():
797-
symbol = emitter.static_name(identifier, None)
798-
if isinstance(literal, int):
799-
actual_symbol = symbol
800-
symbol = INT_PREFIX + symbol
801-
emitter.emit_line(
802-
'PyObject * {} = PyLong_FromString(\"{}\", NULL, 10);'.format(
803-
symbol, str(literal))
804-
)
805-
elif isinstance(literal, float):
806-
emitter.emit_line(
807-
'{} = PyFloat_FromDouble({});'.format(symbol, str(literal))
808-
)
809-
elif isinstance(literal, complex):
810-
emitter.emit_line(
811-
'{} = PyComplex_FromDoubles({}, {});'.format(
812-
symbol, str(literal.real), str(literal.imag))
813-
)
814-
elif isinstance(literal, str):
815-
emitter.emit_line(
816-
'{} = PyUnicode_FromStringAndSize({}, {});'.format(
817-
symbol, *encode_as_c_string(literal))
818-
)
819-
elif isinstance(literal, bytes):
820-
emitter.emit_line(
821-
'{} = PyBytes_FromStringAndSize({}, {});'.format(
822-
symbol, *encode_bytes_as_c_string(literal))
823-
)
824-
else:
825-
assert False, ('Literals must be integers, floating point numbers, or strings,',
826-
'but the provided literal is of type {}'.format(type(literal)))
827-
emitter.emit_lines('if (unlikely({} == NULL))'.format(symbol),
828-
' return -1;')
829-
# Ints have an unboxed representation.
830-
if isinstance(literal, int):
831-
emitter.emit_line(
832-
'{} = CPyTagged_FromObject({});'.format(actual_symbol, symbol)
833-
)
834-
elif isinstance(literal, str):
835-
emitter.emit_line('PyUnicode_InternInPlace(&{});'.format(symbol))
819+
values = 'CPyLit_Str, CPyLit_Bytes, CPyLit_Int, CPyLit_Float, CPyLit_Complex'
820+
emitter.emit_lines('if (CPyStatics_Initialize(CPyStatics, {}) < 0) {{'.format(values),
821+
'return -1;',
822+
'}')
836823

837824
emitter.emit_lines(
838825
'is_initialized = 1;',
@@ -974,13 +961,19 @@ def _toposort_visit(name: str) -> None:
974961
def declare_global(self, type_spaced: str, name: str,
                   *,
                   initializer: Optional[str] = None) -> None:
    """Record a header declaration (and optional definition) for a C global.

    type_spaced is the C type text that goes directly in front of the
    name. If it contains '[', the name is inserted before the first '['
    so that array types are spelled 'type name[...]'. A non-empty
    initializer also produces a one-line definition of the form
    '<declaration> = <initializer>;'. If name already has an entry in
    self.context.declarations, nothing is changed.
    """
    # For a non-array type both bracket and rest are empty strings, which
    # leaves plain type + name.
    prefix, bracket, rest = type_spaced.partition('[')
    base = prefix + name + bracket + rest

    defn = ['{} = {};'.format(base, initializer)] if initializer else None

    if name in self.context.declarations:
        return
    self.context.declarations[name] = HeaderDeclaration(
        '{};'.format(base),
        defn=defn,
    )
986979

@@ -1080,3 +1073,55 @@ def is_fastcall_supported(fn: FuncIR) -> bool:
10801073
# TODO: Support fastcall for __init__.
10811074
return USE_FASTCALL and fn.name != '__init__'
10821075
return USE_FASTCALL
1076+
1077+
1078+
def collect_literals(fn: FuncIR, literals: Literals) -> None:
    """Record the value of every LoadLiteral op found in fn.

    This must only run once the IR is final, so that literals whose ops
    have been optimized away are not included.
    """
    all_ops = (op for block in fn.blocks for op in block.ops)
    for op in all_ops:
        if not isinstance(op, LoadLiteral):
            continue
        literals.record_literal(op.value)
1088+
1089+
1090+
def c_array_initializer(components: List[str]) -> str:
    """Construct an initializer for a C array variable.

    Components are C expressions valid in an initializer.

    For example, if components are ["1", "2"], the result
    would be "{1, 2}", which can be used like this:

        int a[] = {1, 2};

    If the result is long, split it into multiple lines. A component is
    added to the current line only while the joined line stays shorter
    than 70 characters; a line always holds at least one component, even
    if that component alone is longer than the limit.

    Args:
        components: C expressions, in the order they should appear.

    Returns:
        The brace-enclosed initializer text. An empty list gives "{}".
    """
    lines = []  # type: List[str]
    current = []  # type: List[str]
    # Exact length of ', '.join(current). Tracking it the same way on every
    # line makes the first line wrap at the same width as later lines.
    cur_len = 0
    for c in components:
        if not current:
            current.append(c)
            cur_len = len(c)
        elif cur_len + 2 + len(c) < 70:
            current.append(c)
            cur_len += 2 + len(c)
        else:
            lines.append(', '.join(current))
            current = [c]
            cur_len = len(c)
    if not lines:
        # Result fits on a single line
        return '{%s}' % ', '.join(current)
    # Multi-line result
    lines.append(', '.join(current))
    return '{\n ' + ',\n '.join(lines) + '\n}'
1119+
1120+
1121+
def c_string_array_initializer(components: List[bytes]) -> str:
    """Construct a multi-line initializer for a C array of strings.

    Every item of components is put on a line of its own as a C string
    initializer followed by a comma, and the lines are enclosed in braces.
    """
    rows = [' ' + c_string_initializer(item) + ',\n' for item in components]
    return '{\n' + ''.join(rows) + '}'

0 commit comments

Comments
 (0)