Skip to content

Commit db77c76

Browse files
authored
Add emsymbolizer (#16095)
Emsymbolizer is a tool for symbolizing a binary, i.e. showing the file/line or symbol info for a code address. As described in #16094 there are several ways to do this with emscripten. The first PR is for item 1, using llvm-symbolizer with DWARF.
1 parent 96af70d commit db77c76

File tree

4 files changed

+109
-4
lines changed

4 files changed

+109
-4
lines changed

emsymbolizer.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
#!/usr/bin/env python3
2+
3+
# This is a utility for looking up the symbol names and/or file+line numbers
4+
# of code addresses. There are several possible sources of this information,
5+
# with varying granularity (listed here in approximate preference order).
6+
7+
# If the wasm has DWARF info, llvm-symbolizer can show the symbol, file, and
8+
# line/column number, potentially including inlining.
9+
# If there is a source map, we can parse it to get file and line number.
10+
# If there is an emscripten symbol map, we can parse that to get the symbol name.
11+
# If there is a name section or symbol table, llvm-nm can show the symbol name.
12+
13+
import os
14+
import sys
15+
from tools import shared
16+
from tools import webassembly
17+
from tools.shared import check_call
18+
19+
# Path of the llvm-symbolizer executable invoked for DWARF lookups, with any
# leading `~` expanded.
# NOTE(review): presumably exe_suffix() adds a platform-specific extension and
# build_llvm_tool_path() resolves the name inside the configured LLVM
# directory -- confirm in tools/shared.py.
LLVM_SYMBOLIZER = os.path.expanduser(shared.build_llvm_tool_path(
    shared.exe_suffix('llvm-symbolizer'),
))
21+
22+
23+
class Error(Exception):
  """Failure reported by emsymbolizer itself (bad input, nothing to symbolize).

  Derives from Exception rather than BaseException: BaseException is meant
  for interpreter-level signals such as KeyboardInterrupt and SystemExit, and
  ordinary `except Exception` handlers would otherwise let this error escape.
  """
25+
26+
27+
def get_codesec_offset(module):
  """Return the `offset` of the first CODE section in `module`.

  Raises:
    Error: if none of the module's sections has type SecType.CODE.
  """
  code_sections = (section for section in module.sections()
                   if section.type == webassembly.SecType.CODE)
  first_code_section = next(code_sections, None)
  if first_code_section is None:
    raise Error(f'No code section found in {module.filename}')
  return first_code_section.offset
32+
33+
34+
def has_debug_line_section(module):
  """Return True if any section of `module` is named `.debug_line`."""
  return any(section.name == ".debug_line" for section in module.sections())
39+
40+
41+
def symbolize_address_dwarf(module, address):
  """Run llvm-symbolizer on `address` for the wasm file behind `module`.

  The offset of the module's code section is passed as --adjust-vma, and the
  address is passed in decimal form.
  NOTE(review): the tool's output is not captured here; presumably it goes
  straight to this process's stdout -- confirm against shared.check_call.
  """
  code_offset = get_codesec_offset(module)
  symbolizer_cmd = [
    LLVM_SYMBOLIZER,
    '-e', module.filename,
    f'--adjust-vma={code_offset}',
    str(address),
  ]
  check_call(symbolizer_cmd)
46+
47+
48+
def main(argv):
  """Symbolize a single code address of a wasm file.

  Args:
    argv: Command-line vector: program name, path of the wasm file, and the
      code address written in hexadecimal (a `0x` prefix is accepted).

  Returns:
    0 on success.

  Raises:
    Error: if an argument is missing, the address is not valid hexadecimal,
      or the file has no .debug_line section.
  """
  # Validate the command line up front so that user mistakes surface as a
  # short Error message (handled by the entry point) instead of an
  # IndexError/ValueError traceback.
  if len(argv) < 3:
    raise Error('expected arguments: <wasm file> <address (hex)>')
  wasm_file = argv[1]
  try:
    address = int(argv[2], 16)
  except ValueError:
    raise Error(f'invalid address {argv[2]!r}: expected a hexadecimal '
                'number') from None

  print('Warning: the command-line and output format of this file are not '
        'finalized yet', file=sys.stderr)
  module = webassembly.Module(wasm_file)

  # DWARF line info is the only supported source of symbol data so far.
  if not has_debug_line_section(module):
    raise Error(f"No .debug_line section found in {module.filename}."
                " I don't know how to symbolize this file yet")

  symbolize_address_dwarf(module, address)
  return 0
60+
61+
62+
if __name__ == '__main__':
  # Expected failures are reported as a one-line message on stderr and a
  # non-zero exit status rather than a traceback.
  exit_status = 1
  try:
    exit_status = main(sys.argv)
  except (Error, webassembly.InvalidWasmError, OSError) as err:
    print(f'{sys.argv[0]}: {err}', file=sys.stderr)
  sys.exit(exit_status)

tests/core/test_dwarf.c

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,20 @@
22

33
EM_JS(int, out_to_js, (int x), {})
44

5-
void foo() {
5+
void __attribute__((noinline)) foo() { // noinline: test_emsymbolizer looks up an address in foo() expecting a non-inlined location
66
out_to_js(0); // line 5
77
out_to_js(1); // line 6
88
out_to_js(2); // line 7
99
// A silly possible recursion to avoid binaryen doing any inlining.
1010
if (out_to_js(3)) foo();
1111
}
1212

13+
void __attribute__((always_inline)) bar() { // always_inline: test_emsymbolizer expects bar to show up as an inlined frame within main
14+
out_to_js(3);
15+
__builtin_trap(); // test_emsymbolizer expects test_dwarf.c:15:3 here; do not add or remove lines in this file
16+
}
17+
1318
int main() {
1419
foo();
20+
bar(); // call site of the inlined bar(); test_emsymbolizer expects it reported as test_dwarf.c:20:3
1521
}

tests/test_other.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8219,6 +8219,30 @@ def test(infile, source_map_added_dir=''):
82198219
ensure_dir('inner')
82208220
test('inner/a.cpp', 'inner')
82218221

8222+
def test_emsymbolizer(self):
  """Check that emsymbolizer.py prints DWARF file:line:column locations."""
  # Test DWARF output
  self.run_process([EMCC, test_file('core/test_dwarf.c'),
                    '-g', '-O1', '-o', 'test_dwarf.js'])

  # Use hard-coded addresses. This is potentially brittle, but LLVM's
  # O1 output is pretty minimal so hopefully it won't break too much?
  # Another option would be to disassemble the binary to look for certain
  # instructions or code sequences.

  def symbolize(address):
    """Return emsymbolizer.py's stdout for `address` in test_dwarf.wasm."""
    cmd = [PYTHON, path_from_root('emsymbolizer.py'), 'test_dwarf.wasm',
           address]
    return self.run_process(cmd, stdout=PIPE).stdout

  # Check a location in foo(), not inlined.
  self.assertIn('test_dwarf.c:6:3', symbolize('0x101'))
  # Check that both bar (the inlined function) and main (the caller it was
  # inlined into) are in the output, as described by the DWARF.
  # TODO: consider also checking the function names once the output format
  # stabilizes more
  # The dots in the file name are escaped so that they only match a literal
  # '.' rather than any character.
  self.assertRegex(symbolize('0x124').replace('\n', ''),
                   r'test_dwarf\.c:15:3.*test_dwarf\.c:20:3')
8245+
82228246
def test_separate_dwarf(self):
82238247
self.run_process([EMCC, test_file('hello_world.c'), '-g'])
82248248
self.assertExists('a.out.wasm')

tools/webassembly.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,10 @@ class DylinkType(IntEnum):
108108
IMPORT_INFO = 4
109109

110110

111+
class InvalidWasmError(Exception):
  """Raised when a file does not start with the expected wasm magic/version.

  Derives from Exception rather than BaseException: BaseException is meant
  for interpreter-level signals such as KeyboardInterrupt and SystemExit, and
  ordinary `except Exception` handlers would otherwise let this error escape.
  """
113+
114+
111115
Section = namedtuple('Section', ['type', 'size', 'offset', 'name'])
112116
Limits = namedtuple('Limits', ['flags', 'initial', 'maximum'])
113117
Import = namedtuple('Import', ['kind', 'module', 'field'])
@@ -123,15 +127,18 @@ class Module:
123127
"""Extremely minimal wasm module reader. Currently only used
124128
for parsing the dylink section."""
125129
def __init__(self, filename):
130+
self.buf = None # Set this before FS calls below in case they throw.
131+
self.filename = filename
126132
self.size = os.path.getsize(filename)
127133
self.buf = open(filename, 'rb')
128134
magic = self.buf.read(4)
129135
version = self.buf.read(4)
130-
assert magic == MAGIC
131-
assert version == VERSION
136+
if magic != MAGIC or version != VERSION:
137+
raise InvalidWasmError(f'{filename} is not a valid wasm file')
132138

133139
def __del__(self):
134-
self.buf.close()
140+
if self.buf:
141+
self.buf.close()
135142

136143
def readAt(self, offset, count):
137144
self.buf.seek(offset)

0 commit comments

Comments
 (0)