Skip to content

[mypyc] Add LoadLiteral and use tables to construct and store literals #10040

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 30 commits into from
Feb 6, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
b938483
[WIP] Add LoadLiteral op
JukkaL Jan 16, 2021
3d8210a
[WIP] Use LoadLiteral in generated IR
JukkaL Jan 16, 2021
b4ae005
[WIP] Collect literal values in final IR
JukkaL Jan 16, 2021
f75ae0e
[WIP] Generate C for string literal refs
JukkaL Jan 16, 2021
57b9073
Start work on str literal tables
JukkaL Jan 16, 2021
a325121
Add basic support for string literal tables
JukkaL Jan 18, 2021
edc6515
Declare statics initialization function
JukkaL Jan 18, 2021
fc9800f
Split long lines
JukkaL Jan 18, 2021
256fdaa
Fix integer encoding
JukkaL Jan 18, 2021
6216ef5
Fix comments for static refs
JukkaL Jan 18, 2021
7ea2bb8
Intern strings
JukkaL Jan 30, 2021
7dad782
Implement table-based bytes literals
JukkaL Jan 30, 2021
bacfecc
Implement table-based float literals
JukkaL Jan 30, 2021
9be8ab4
Implement table-based complex literals
JukkaL Jan 30, 2021
c0f57c5
Implement table-based int literals
JukkaL Jan 30, 2021
8baab58
Remove old literals code
JukkaL Jan 30, 2021
e3387ca
Update comment
JukkaL Jan 30, 2021
6e8a208
Fix unit tests
JukkaL Jan 30, 2021
8325636
Split long arrays into multiple lines
JukkaL Jan 30, 2021
6b39ce3
Clean up
JukkaL Jan 30, 2021
f357221
Refactor
JukkaL Jan 30, 2021
7d06969
Minor cleanup
JukkaL Jan 30, 2021
1b8a94a
Refactor
JukkaL Jan 30, 2021
2d6e611
Add comments
JukkaL Jan 30, 2021
df32576
Update tests using --update-data
JukkaL Jan 30, 2021
577cc96
Fix 32-bit tests
JukkaL Jan 30, 2021
21e3020
Add tests and fix float infinities
JukkaL Feb 6, 2021
b0c7d39
Rename statics in generated code
JukkaL Feb 6, 2021
e97d4d1
Minor tweak to test case
JukkaL Feb 6, 2021
f222f1f
Refactor LowLevelIRBuilder
JukkaL Feb 6, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion mypyc/analysis/dataflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from mypyc.ir.ops import (
Value, ControlOp,
BasicBlock, OpVisitor, Assign, Integer, LoadErrorValue, RegisterOp, Goto, Branch, Return, Call,
Box, Unbox, Cast, Op, Unreachable, TupleGet, TupleSet, GetAttr, SetAttr,
Box, Unbox, Cast, Op, Unreachable, TupleGet, TupleSet, GetAttr, SetAttr, LoadLiteral,
LoadStatic, InitStatic, MethodCall, RaiseStandardError, CallC, LoadGlobal,
Truncate, IntOp, LoadMem, GetElementPtr, LoadAddress, ComparisonOp, SetMem
)
Expand Down Expand Up @@ -165,6 +165,9 @@ def visit_method_call(self, op: MethodCall) -> GenAndKill:
def visit_load_error_value(self, op: LoadErrorValue) -> GenAndKill:
# No op-specific handling: defer to the generic register-op visitor.
return self.visit_register_op(op)

def visit_load_literal(self, op: LoadLiteral) -> GenAndKill:
# No literal-specific handling: defer to the generic register-op visitor.
return self.visit_register_op(op)

def visit_get_attr(self, op: GetAttr) -> GenAndKill:
# No op-specific handling: defer to the generic register-op visitor.
return self.visit_register_op(op)

Expand Down
42 changes: 33 additions & 9 deletions mypyc/codegen/cstring.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,13 @@
octal digits.
"""

from typing import List
import string
from typing import Tuple

CHAR_MAP = ['\\{:03o}'.format(i) for i in range(256)]
from typing_extensions import Final


CHAR_MAP = ['\\{:03o}'.format(i) for i in range(256)] # type: Final

# It is safe to use string.printable as it always uses the C locale.
for c in string.printable:
Expand All @@ -38,12 +41,33 @@
CHAR_MAP[ord('?')] = r'\?'


def encode_as_c_string(s: str) -> Tuple[str, int]:
"""Produce a quoted C string literal and its size, for a UTF-8 string."""
return encode_bytes_as_c_string(s.encode('utf-8'))
def encode_bytes_as_c_string(b: bytes) -> str:
    """Return the body of a C string literal for the bytes in b.

    Each byte is replaced by its CHAR_MAP entry. The surrounding double
    quotes are not included; the caller adds them.
    """
    return ''.join(CHAR_MAP[byte] for byte in b)


def encode_bytes_as_c_string(b: bytes) -> Tuple[str, int]:
"""Produce a quoted C string literal and its size, for a byte string."""
escaped = ''.join([CHAR_MAP[i] for i in b])
return '"{}"'.format(escaped), len(b)
def c_string_initializer(components: List[bytes]) -> str:
    """Create initializer for a C char[] variable from a list of fragments.

    For example, if components is [b'foo', b'bar'], the result would be
    '"foobar"', which could then be used like this to initialize 's':

        const char s[] = "foobar";

    If the result is long, split it into multiple adjacent string literals,
    one per line (the C compiler concatenates adjacent literals). Breaks are
    only placed between components, never inside one.

    If there is nothing to encode (no components, or only empty ones), the
    result is '""' so that it is still a valid C initializer.
    """
    res = []  # type: List[str]
    current = ''
    for c in components:
        enc = encode_bytes_as_c_string(c)
        # Keep filling the current line while it stays under 70 characters.
        # A component is always added whole, so a single long component may
        # still produce a longer line.
        if not current or len(current) + len(enc) < 70:
            current += enc
        else:
            res.append('"%s"' % current)
            current = enc
    if current:
        res.append('"%s"' % current)
    if not res:
        # An empty string here would not be usable as an initializer at all.
        return '""'
    if len(res) > 1:
        # The leading empty entry makes a multi-line result start on a new line.
        res.insert(0, '')
    return '\n '.join(res)
3 changes: 3 additions & 0 deletions mypyc/codegen/emit.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from mypyc.ir.class_ir import ClassIR, all_concrete_classes
from mypyc.namegen import NameGenerator, exported_name
from mypyc.sametype import is_same_type
from mypyc.codegen.literals import Literals


class HeaderDeclaration:
Expand Down Expand Up @@ -84,6 +85,8 @@ def __init__(self,
# The declaration contains the body of the struct.
self.declarations = OrderedDict() # type: Dict[str, HeaderDeclaration]

self.literals = Literals()


class Emitter:
"""Helper for C code generation."""
Expand Down
18 changes: 16 additions & 2 deletions mypyc/codegen/emitfunc.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@
LoadStatic, InitStatic, TupleGet, TupleSet, Call, IncRef, DecRef, Box, Cast, Unbox,
BasicBlock, Value, MethodCall, Unreachable, NAMESPACE_STATIC, NAMESPACE_TYPE, NAMESPACE_MODULE,
RaiseStandardError, CallC, LoadGlobal, Truncate, IntOp, LoadMem, GetElementPtr,
LoadAddress, ComparisonOp, SetMem, Register
LoadAddress, ComparisonOp, SetMem, Register, LoadLiteral
)
from mypyc.ir.rtypes import (
RType, RTuple, is_tagged, is_int32_rprimitive, is_int64_rprimitive, RStruct,
is_pointer_rprimitive
is_pointer_rprimitive, is_int_rprimitive
)
from mypyc.ir.func_ir import FuncIR, FuncDecl, FUNC_STATICMETHOD, FUNC_CLASSMETHOD, all_values
from mypyc.ir.class_ir import ClassIR
Expand Down Expand Up @@ -96,6 +96,7 @@ def __init__(self,
self.declarations = declarations
self.source_path = source_path
self.module_name = module_name
self.literals = emitter.context.literals

def temp_name(self) -> str:
return self.emitter.temp_name()
Expand Down Expand Up @@ -173,6 +174,19 @@ def visit_load_error_value(self, op: LoadErrorValue) -> None:
self.emit_line('%s = %s;' % (self.reg(op),
self.c_error_value(op.type)))

def visit_load_literal(self, op: LoadLiteral) -> None:
    """Emit C that loads a literal from the CPyStatics table into op's register."""
    index = self.literals.literal_index(op.value)
    text = repr(op.value)
    # The repr is appended as a C comment for readability, but only when it
    # contains nothing that could open/close a comment or embed a NUL.
    unsafe = any(marker in text for marker in ('/*', '*/', '\0'))
    ann = '' if unsafe else ' /* %s */' % text
    if is_int_rprimitive(op.type):
        # Int-typed literals are cast to CPyTagged and OR-ed with 1.
        self.emit_line('%s = (CPyTagged)CPyStatics[%d] | 1;%s' % (
            self.reg(op), index, ann))
    else:
        self.emit_line('%s = CPyStatics[%d];%s' % (self.reg(op), index, ann))

def get_attr_expr(self, obj: str, op: Union[GetAttr, SetAttr], decl_cl: ClassIR) -> str:
"""Generate attribute accessor for normal (non-property) access.

Expand Down
148 changes: 92 additions & 56 deletions mypyc/codegen/emitmodule.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,18 +23,19 @@
from mypyc.irbuild.prepare import load_type_map
from mypyc.irbuild.mapper import Mapper
from mypyc.common import (
PREFIX, TOP_LEVEL_NAME, INT_PREFIX, MODULE_PREFIX, RUNTIME_C_FILES, USE_FASTCALL,
PREFIX, TOP_LEVEL_NAME, MODULE_PREFIX, RUNTIME_C_FILES, USE_FASTCALL,
USE_VECTORCALL, shared_lib_name,
)
from mypyc.codegen.cstring import encode_as_c_string, encode_bytes_as_c_string
from mypyc.codegen.cstring import c_string_initializer
from mypyc.codegen.literals import Literals
from mypyc.codegen.emit import EmitterContext, Emitter, HeaderDeclaration
from mypyc.codegen.emitfunc import generate_native_function, native_function_header
from mypyc.codegen.emitclass import generate_class_type_decl, generate_class
from mypyc.codegen.emitwrapper import (
generate_wrapper_function, wrapper_function_header,
generate_legacy_wrapper_function, legacy_wrapper_function_header,
)
from mypyc.ir.ops import LiteralsMap, DeserMaps
from mypyc.ir.ops import DeserMaps, LoadLiteral
from mypyc.ir.rtypes import RType, RTuple
from mypyc.ir.func_ir import FuncIR
from mypyc.ir.class_ir import ClassIR
Expand Down Expand Up @@ -286,9 +287,8 @@ def compile_ir_to_c(
if not group_modules:
ctext[group_name] = []
continue
literals = mapper.literals[group_name]
generator = GroupGenerator(
literals, group_modules, source_paths,
group_modules, source_paths,
group_name, mapper.group_map, names,
compiler_options
)
Expand Down Expand Up @@ -447,7 +447,6 @@ def group_dir(group_name: str) -> str:

class GroupGenerator:
def __init__(self,
literals: LiteralsMap,
modules: List[Tuple[str, ModuleIR]],
source_paths: Dict[str, str],
group_name: Optional[str],
Expand All @@ -461,7 +460,6 @@ def __init__(self,
one .c file per module if in multi_file mode.)

Arguments:
literals: The literals declared in this group
modules: (name, ir) pairs for each module in the group
source_paths: Map from module names to source file paths
group_name: The name of the group (or None if this is single-module compilation)
Expand All @@ -470,7 +468,6 @@ def __init__(self,
multi_file: Whether to put each module in its own source file regardless
of group structure.
"""
self.literals = literals
self.modules = modules
self.source_paths = source_paths
self.context = EmitterContext(names, group_name, group_map)
Expand All @@ -495,6 +492,11 @@ def generate_c_for_modules(self) -> List[Tuple[str, str]]:
file_contents = []
multi_file = self.use_shared_lib and self.multi_file

# Collect all literal refs in IR.
for _, module in self.modules:
for fn in module.functions:
collect_literals(fn, self.context.literals)

base_emitter = Emitter(self.context)
# Optionally just include the runtime library c files to
# reduce the number of compiler invocations needed
Expand All @@ -505,12 +507,7 @@ def generate_c_for_modules(self) -> List[Tuple[str, str]]:
base_emitter.emit_line('#include "__native_internal{}.h"'.format(self.short_group_suffix))
emitter = base_emitter

for (_, literal), identifier in self.literals.items():
if isinstance(literal, int):
symbol = emitter.static_name(identifier, None)
self.declare_global('CPyTagged ', symbol)
else:
self.declare_static_pyobject(identifier, emitter)
self.generate_literal_tables()

for module_name, module in self.modules:
if multi_file:
Expand Down Expand Up @@ -621,6 +618,32 @@ def generate_c_for_modules(self) -> List[Tuple[str, str]]:
''.join(ext_declarations.fragments)),
]

def generate_literal_tables(self) -> None:
    """Declare the C globals that describe and hold Python literals.

    All constructed literals, whatever their type, live in one array
    (CPyStatics), so any literal can be referenced by its index alone. The
    remaining tables describe the values to construct, one table per
    literal type.
    """
    literals = self.context.literals
    # Array that receives the constructed objects during module initialization
    self.declare_global('PyObject *[%d]' % literals.num_literals(), 'CPyStatics')
    # (C type, variable name, initializer builder, source of encoded values),
    # in the order str, bytes, int, float, complex.
    tables = (
        ('const char []', 'CPyLit_Str',
         c_string_initializer, literals.encoded_str_values),
        ('const char []', 'CPyLit_Bytes',
         c_string_initializer, literals.encoded_bytes_values),
        ('const char []', 'CPyLit_Int',
         c_string_initializer, literals.encoded_int_values),
        ('const double []', 'CPyLit_Float',
         c_array_initializer, literals.encoded_float_values),
        ('const double []', 'CPyLit_Complex',
         c_array_initializer, literals.encoded_complex_values),
    )
    for c_type, name, build_initializer, encoded_values in tables:
        self.declare_global(c_type, name, initializer=build_initializer(encoded_values()))

def generate_export_table(self, decl_emitter: Emitter, code_emitter: Emitter) -> None:
"""Generate the declaration and definition of the group's export struct.

Expand Down Expand Up @@ -793,46 +816,10 @@ def generate_globals_init(self, emitter: Emitter) -> None:
for symbol, fixup in self.simple_inits:
emitter.emit_line('{} = {};'.format(symbol, fixup))

for (_, literal), identifier in self.literals.items():
symbol = emitter.static_name(identifier, None)
if isinstance(literal, int):
actual_symbol = symbol
symbol = INT_PREFIX + symbol
emitter.emit_line(
'PyObject * {} = PyLong_FromString(\"{}\", NULL, 10);'.format(
symbol, str(literal))
)
elif isinstance(literal, float):
emitter.emit_line(
'{} = PyFloat_FromDouble({});'.format(symbol, str(literal))
)
elif isinstance(literal, complex):
emitter.emit_line(
'{} = PyComplex_FromDoubles({}, {});'.format(
symbol, str(literal.real), str(literal.imag))
)
elif isinstance(literal, str):
emitter.emit_line(
'{} = PyUnicode_FromStringAndSize({}, {});'.format(
symbol, *encode_as_c_string(literal))
)
elif isinstance(literal, bytes):
emitter.emit_line(
'{} = PyBytes_FromStringAndSize({}, {});'.format(
symbol, *encode_bytes_as_c_string(literal))
)
else:
assert False, ('Literals must be integers, floating point numbers, or strings,',
'but the provided literal is of type {}'.format(type(literal)))
emitter.emit_lines('if (unlikely({} == NULL))'.format(symbol),
' return -1;')
# Ints have an unboxed representation.
if isinstance(literal, int):
emitter.emit_line(
'{} = CPyTagged_FromObject({});'.format(actual_symbol, symbol)
)
elif isinstance(literal, str):
emitter.emit_line('PyUnicode_InternInPlace(&{});'.format(symbol))
values = 'CPyLit_Str, CPyLit_Bytes, CPyLit_Int, CPyLit_Float, CPyLit_Complex'
emitter.emit_lines('if (CPyStatics_Initialize(CPyStatics, {}) < 0) {{'.format(values),
'return -1;',
'}')

emitter.emit_lines(
'is_initialized = 1;',
Expand Down Expand Up @@ -974,13 +961,19 @@ def _toposort_visit(name: str) -> None:
def declare_global(self, type_spaced: str, name: str,
*,
initializer: Optional[str] = None) -> None:
# Register a header declaration for a C global called 'name', together with
# its definition when an initializer is given. Nothing is registered if the
# name is already present in self.context.declarations.
#
# In C an array declarator follows the variable name ('T name[N]'), so for a
# type containing '[' the name is spliced in just before the first bracket.
if '[' not in type_spaced:
base = '{}{}'.format(type_spaced, name)
else:
a, b = type_spaced.split('[', 1)
base = '{}{}[{}'.format(a, name, b)

# A missing or empty initializer means declaration only (no definition).
if not initializer:
defn = None
else:
defn = ['{}{} = {};'.format(type_spaced, name, initializer)]
defn = ['{} = {};'.format(base, initializer)]
if name not in self.context.declarations:
self.context.declarations[name] = HeaderDeclaration(
'{}{};'.format(type_spaced, name),
'{};'.format(base),
defn=defn,
)

Expand Down Expand Up @@ -1080,3 +1073,46 @@ def is_fastcall_supported(fn: FuncIR) -> bool:
# TODO: Support fastcall for __init__.
return USE_FASTCALL and fn.name != '__init__'
return USE_FASTCALL


def collect_literals(fn: FuncIR, literals: Literals) -> None:
    """Record the value of every LoadLiteral op in fn into literals.

    This must only be run on the final IR, so that literals which have
    been optimized away are not included.
    """
    all_ops = (op for block in fn.blocks for op in block.ops)
    for op in all_ops:
        if not isinstance(op, LoadLiteral):
            continue
        literals.record_literal(op.value)


def c_array_initializer(components: List[str]) -> str:
    """Build a brace-enclosed initializer for a C array variable.

    Each component must be a C expression that is valid in an initializer.

    For example, the components ["1", "2"] give "{1, 2}", which can be
    used like this:

        int a[] = {1, 2};

    A long result is spread over several lines.
    """
    rows = []  # type: List[str]
    pending = []  # type: List[str]
    width = 0
    for item in components:
        fits = width + 2 + len(item) < 70
        if pending and not fits:
            # Current row is full: flush it and start a new one with this item.
            rows.append(', '.join(pending))
            pending = [item]
            width = len(item)
        else:
            pending.append(item)
            width += len(item) + 2
    last_row = ', '.join(pending)
    if rows:
        # Multi-line result
        rows.append(last_row)
        return '{\n ' + ',\n '.join(rows) + '\n}'
    # Everything fits on a single line
    return '{%s}' % last_row
Loading