|
12 | 12 | import logging
|
13 | 13 | import operator
|
14 | 14 | import os
|
| 15 | +import struct |
| 16 | +import subprocess |
15 | 17 | import tempfile
|
16 | 18 | from typing import final, List
|
17 | 19 |
|
@@ -136,13 +138,89 @@ def dbg_tosa_dump(tosa_fb, path):
|
136 | 138 | fb = tosa_fb.serialize()
|
137 | 139 | js = tosa_fb.writeJson(filename)
|
138 | 140 |
|
139 |
| - f = open(path + filename, "wb") |
140 |
| - f.write(fb) |
141 |
| - f.close() |
| 141 | + with open(path + filename, "wb") as f: |
| 142 | + f.write(fb) |
142 | 143 |
|
143 |
| - f = open(path + "desc.json", "w") |
144 |
| - f.write(js) |
145 |
| - f.close() |
| 144 | + with open(path + "desc.json", "w") as f: |
| 145 | + f.write(js) |
| 146 | + |
| 147 | + |
# Output to Vela with current file-based compilation
# WARNING: if this changes, the runtime reader also needs to change
def vela_compile(tosa_fb):
    """Compile a serialized TOSA graph to a ``vela_bin_stream`` binary.

    Writes the TOSA flatbuffer to a temporary directory, runs the ``vela``
    compiler on it, then repackages the regions of the resulting NPZ file
    into the stream format the Arm runtime reader consumes:

        16-byte header string | one or more blocks | 16-byte footer string

    where each block is:
        - 16-byte null-padded block-name string
        - 16-byte little-endian block length (low 4 bytes hold the int32
          length, the high 12 bytes are zero)
        - block data, zero-padded to a 16-byte boundary

    Args:
        tosa_fb: TOSA serializer object exposing ``serialize()``.

    Returns:
        bytes: the complete vela_bin_stream binary.

    Raises:
        RuntimeError: if the compiled network declares more than one output.
        subprocess.CalledProcessError: if the vela invocation fails.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        tosaname = "out.tosa"
        flatbuffer = tosa_fb.serialize()
        with open(os.path.join(tmpdir, tosaname), "wb") as f:
            f.write(flatbuffer)

        # Invoke vela with an argument list and cwd= instead of a
        # "cd {tmpdir}; vela ..." string under shell=True: no shell
        # involvement (so no injection via the path), and check=True
        # still raises on a non-zero exit code.
        subprocess.run(
            ["vela", "--accelerator-config", "ethos-u55-128", tosaname],
            cwd=tmpdir,
            check=True,
        )

        np_path = os.path.join(tmpdir, "output", "out_sg0_vela.npz")
        blocks = b""
        with np.load(np_path, allow_pickle=False) as data:
            # Emit the NPZ regions as:
            # - 16 byte block name null terminated string (padded to 16 if name shorter)
            # - 4 bytes of int32 block length and 12 bytes of 0's
            # - block data (padded to 16 byte alignment at end)
            # Repeat for all blocks
            for key in data.keys():
                block_name = bytes(key, "utf8")[:15]
                block_name = block_name + b"\x00" * (16 - len(block_name))

                block_data = b""
                if key in ("input_shape", "output_shape"):
                    inputs = data[key]
                    # Encode a struct of int len; and one or more int x,y,z,w shape;
                    input_struct = struct.pack("<i", len(inputs))
                    for inp in inputs:
                        assert len(inp) <= 4
                        inp_pad = inp.tolist() + [0] * (4 - len(inp))
                        input_struct = input_struct + struct.pack("<iiii", *inp_pad)
                    block_data = input_struct
                elif key in ("input_offset", "output_offset"):
                    inputs = data[key]
                    if key == "output_offset" and len(inputs) > 1:
                        raise RuntimeError(
                            "Currently only support one output in Vela ArmBackend"
                        )
                    offset_struct = struct.pack("<i", len(inputs))
                    for inp in inputs:
                        offset_struct = offset_struct + struct.pack("<i", inp)
                    block_data = offset_struct
                else:
                    block_data = data[key].tobytes()
                # We need the actual unpadded block lengths for hw setup
                block_length = len(block_data).to_bytes(16, "little")
                # pad block data to multiple of 16 bytes
                block_data = block_data + b"\x00" * (15 - (len(block_data) - 1) % 16)

                block = block_name + block_length + block_data
                blocks = blocks + block

            # Add a block for scratch, inputs and outputs
            # scratch shape is a 1 element array giving us size in bytes
            block_name = bytes("scratch_data", "utf8")[:15]
            block_name = block_name + b"\x00" * (16 - len(block_name))
            block_length = data["scratch_shape"][0].item()
            block_length = block_length + (15 - (block_length - 1) % 16)
            # NOTE(review): unlike the NPZ blocks above, the recorded scratch
            # length is the *padded* size — presumably what the runtime reader
            # expects; confirm before changing.
            block_data = b"\x00" * block_length
            block_length = block_length.to_bytes(16, "little")
            block = block_name + block_length + block_data
            blocks = blocks + block
            # TODO are these already in scratch shape? look to be
            # input_shape * input_elem_size
            # output_shape * output_elem_size
            # input_offset and output_offset specify the location these arrays are written from base of scratch

        # return 16 byte VELA bin header + blocks + footer
        header = bytes("vela_bin_stream", "utf-8") + b"\x00"
        footer = bytes("vela_end_stream", "utf-8") + b"\x00"
        return header + blocks + footer
146 | 224 |
|
147 | 225 |
|
148 | 226 | def dbg_fail(node, tosa_fb, path):
|
@@ -237,14 +315,13 @@ def preprocess( # noqa: C901
|
237 | 315 | # if a debug/test build capture output files from TOSA stage
|
238 | 316 | path = None
|
239 | 317 | debug_output = False
|
| 318 | + output_format = "vela" |
240 | 319 | for spec in compile_spec:
|
241 | 320 | if spec.key == "debug_tosa_path":
|
242 | 321 | path = spec.value.decode()
|
243 | 322 | debug_output = True
|
244 |
| - |
245 |
| - # in non debug builds we still pass files to vela |
246 |
| - if path is None: |
247 |
| - path = tempfile.mkdtemp(prefix="arm_tosa_") |
| 323 | + if spec.key == "output_format": |
| 324 | + output_format = spec.value.decode() |
248 | 325 |
|
249 | 326 | # Converted output for this subgraph, serializer needs path early as it emits
|
250 | 327 | # const data directly. Path created and data written only in debug builds.
|
@@ -890,6 +967,16 @@ def preprocess( # noqa: C901
|
890 | 967 | if debug_output is True:
|
891 | 968 | dbg_tosa_dump(tosa_fb, path)
|
892 | 969 |
|
893 |
| - # Serialize and return the tosa flatbuffer |
894 |
| - fb = tosa_fb.serialize() |
895 |
| - return PreprocessResult(processed_bytes=bytes(fb)) |
| 970 | + # Serialize and return the program. While we have always produced TOSA |
| 971 | + # output as an intermediate, some flows compile to device binaries in |
| 972 | + # preprocess and some consume TOSA fb directly. |
| 973 | + if output_format == "vela": |
| 974 | + # Emit vela_bin_stream format |
| 975 | + binary = vela_compile(tosa_fb) |
| 976 | + elif output_format == "tosa": |
| 977 | + # Emit TOSA flatbuffer |
| 978 | + binary = bytes(tosa_fb.serialize()) |
| 979 | + else: |
| 980 | + raise RuntimeError(f"Unknown format {output_format}") |
| 981 | + |
| 982 | + return PreprocessResult(processed_bytes=binary) |
0 commit comments