Introduce XNNPACKHeader to manage flatbuffer data and constant data

mcr229 · facebook-github-bot · commit ac62d043bc44 · 2024-01-02T17:54:04.000-08:00
Summary:
Introducing the XNNPACKHeader to manage the flatbuffer data and constant data.

Previously, we serialized constant data along with the flatbuffer. However, with large weights and large tensors in general, converting our dataclass --> json --> flatbuffer takes a large amount of time and memory. This has become a blocker on some larger models.

To fix this, we circumvent serializing constant tensors via flatbuffer by appending the constant data after the flatbuffer payload. In order to do this, we need an XNNPACKHeader which will give us the flatbuffer offset, flatbuffer size, constant data offset, and constant data size.

It will look something like this:
	
```
             ┌───────────────────────────────────┐
             │XNNPACK Header                     │
             ├───────────────────────────────────┤
             │Padding for 16 byte alignment      │
             ├───────────────────────────────────┤
             │Flatbuffer-serialized payload data │
             │                                   │
             │                                   │
             ├───────────────────────────────────┤
             │Padding for 16 byte alignment      │
             ├───────────────────────────────────┤
             │Constant Data                      │
             │                                   │
             │                                   │
             └───────────────────────────────────┘
```

Within the XNNPACK Header, we hold the following:
- 4 bytes of zero padding to offset the header magic
- 4 bytes for the header magic
- 4 bytes for the header length
- 8 bytes for the flatbuffer offset
- 8 bytes for the flatbuffer size
- 8 bytes for constant data offset
- 8 bytes for constant data size

Differential Revision: D52497977
diff --git a/backends/xnnpack/serialization/xnnpack_graph_serialize.py b/backends/xnnpack/serialization/xnnpack_graph_serialize.py
@@ -8,14 +8,20 @@
 import os
 import tempfile
 
-from dataclasses import fields, is_dataclass
+from dataclasses import dataclass, fields, is_dataclass
+from typing import ClassVar, Literal
 
 import pkg_resources
 from executorch.backends.xnnpack.serialization.xnnpack_graph_schema import XNNGraph
 from executorch.exir._serialize._dataclass import _DataclassEncoder
 
 from executorch.exir._serialize._flatbuffer import _flatc_compile
 
+# Byte order of numbers written to program headers. Always little-endian
+# regardless of the host system, since all commonly-used modern CPUs are little
+# endian.
+_HEADER_BYTEORDER: Literal["little"] = "little"
+
 
 def sanity_check_xnngraph_dataclass(table, name: str = ""):
     """
@@ -68,6 +74,76 @@ def check_for_sym(obj, name):
             check_for_sym(o, _name_field)
 
 
+@dataclass
+class XNNPACKHeader:
+    # Class Constants
+
+    # Magic bytes identifying the header; to_bytes() writes them at byte offset 4, after 4 zero bytes.
+    EXPECTED_MAGIC: ClassVar[bytes] = b"XH00"
+    # The length of the header in bytes (4 + 4 + 4 + 8 * 4 = 44).
+    EXPECTED_LENGTH: ClassVar[int] = (
+        # Zero padding that precedes the header magic
+        # We offset the magic by 4 bytes so that it is in the same location
+        # as the flatbuffer payload's magic. This way we can dynamically
+        # choose between the XNNPACK Header and Flatbuffer Header
+        4
+        # Header magic
+        + 4
+        # Header Length
+        + 4
+        # Flatbuffer offset
+        + 8
+        # Flatbuffer size
+        + 8
+        # Constant Data offset
+        + 8
+        # Constant Data size
+        + 8
+    )
+
+    # Instance attributes. @dataclass will turn these into ctor args.
+
+    # offset to the flatbuffer data
+    flatbuffer_offset: int
+
+    # flatbuffer size
+    flatbuffer_size: int
+
+    # offset to the constant data
+    constant_data_offset: int
+
+    # constant data size
+    constant_data_size: int
+
+    def to_bytes(self) -> bytes:
+        """
+        Returns the binary representation of the XNNPACK Header (EXPECTED_LENGTH bytes, little-endian integers).
+        """
+
+        data: bytes = (
+            # Padding for magic bytes. This is so that header magic is in the same position
+            # as the flatbuffer magic, and allows consumer to detect whether the header is
+            # being used or not
+            b"\x00\x00\x00\x00"
+            # XNNPACK Header's magic. This allows consumer to detect whether or not the header
+            # is being used or the flatbuffer header is being used
+            + self.EXPECTED_MAGIC
+            # uint32_t: Size of this header. This makes it easier to add new fields to the header
+            # in the future.
+            + self.EXPECTED_LENGTH.to_bytes(4, byteorder=_HEADER_BYTEORDER)
+            # uint64_t: Offset to the start of the flatbuffer data
+            + self.flatbuffer_offset.to_bytes(8, byteorder=_HEADER_BYTEORDER)
+            # uint64_t: Size of the flatbuffer data payload
+            + self.flatbuffer_size.to_bytes(8, byteorder=_HEADER_BYTEORDER)
+            # uint64_t: Offset to the start of the constant data
+            + self.constant_data_offset.to_bytes(8, byteorder=_HEADER_BYTEORDER)
+            # uint64_t: Size of the constant data
+            + self.constant_data_size.to_bytes(8, byteorder=_HEADER_BYTEORDER)
+        )
+
+        return data
+
+
 def convert_to_flatbuffer(xnnpack_graph: XNNGraph) -> bytes:
     sanity_check_xnngraph_dataclass(xnnpack_graph)
     xnnpack_graph_json = json.dumps(xnnpack_graph, cls=_DataclassEncoder)