Skip to content

Gguf dump start data offset via --data-offset and some extra refactor #8054

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jun 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 25 additions & 4 deletions gguf-py/gguf/gguf_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ class GGUFReader:
# I - same as host, S - swapped
byte_order: Literal['I'] | Literal['S'] = 'I'
alignment: int = GGUF_DEFAULT_ALIGNMENT
data_offset: int

# Note: Internal helper, API may change.
gguf_scalar_to_np: dict[GGUFValueType, type[np.generic]] = {
Expand All @@ -88,9 +89,13 @@ class GGUFReader:
def __init__(self, path: os.PathLike[str] | str, mode: Literal['r'] | Literal['r+'] | Literal['c'] = 'r'):
self.data = np.memmap(path, mode = mode)
offs = 0

# Check for GGUF magic
if self._get(offs, np.uint32, override_order = '<')[0] != GGUF_MAGIC:
raise ValueError('GGUF magic invalid')
offs += 4

# Check GGUF version
temp_version = self._get(offs, np.uint32)
if temp_version[0] & 65535 == 0:
# If we get 0 here that means it's (probably) a GGUF file created for
Expand All @@ -103,12 +108,16 @@ def __init__(self, path: os.PathLike[str] | str, mode: Literal['r'] | Literal['r
self.fields: OrderedDict[str, ReaderField] = OrderedDict()
self.tensors: list[ReaderTensor] = []
offs += self._push_field(ReaderField(offs, 'GGUF.version', [temp_version], [0], [GGUFValueType.UINT32]))

# Check tensor count and kv count
temp_counts = self._get(offs, np.uint64, 2)
offs += self._push_field(ReaderField(offs, 'GGUF.tensor_count', [temp_counts[:1]], [0], [GGUFValueType.UINT64]))
offs += self._push_field(ReaderField(offs, 'GGUF.kv_count', [temp_counts[1:]], [0], [GGUFValueType.UINT64]))
tensor_count, kv_count = temp_counts
offs = self._build_fields(offs, kv_count)
offs, tensors_fields = self._build_tensors_fields(offs, tensor_count)

# Build Tensor Info Fields
offs, tensors_fields = self._build_tensor_info(offs, tensor_count)
new_align = self.fields.get('general.alignment')
if new_align is not None:
if new_align.types != [GGUFValueType.UINT32]:
Expand All @@ -117,6 +126,7 @@ def __init__(self, path: os.PathLike[str] | str, mode: Literal['r'] | Literal['r
padding = offs % self.alignment
if padding != 0:
offs += self.alignment - padding
self.data_offset = offs
self._build_tensors(offs, tensors_fields)

_DT = TypeVar('_DT', bound = npt.DTypeLike)
Expand Down Expand Up @@ -193,18 +203,29 @@ def _get_field_parts(
# We can't deal with this one.
raise ValueError('Unknown/unhandled field type {gtype}')

def _get_tensor(self, orig_offs: int) -> ReaderField:
def _get_tensor_info_field(self, orig_offs: int) -> ReaderField:
offs = orig_offs

# Get Tensor Name
name_len, name_data = self._get_str(offs)
offs += int(name_len.nbytes + name_data.nbytes)

# Get Tensor Dimensions Count
n_dims = self._get(offs, np.uint32)
offs += int(n_dims.nbytes)

# Get Tensor Dimension Array
dims = self._get(offs, np.uint64, n_dims[0])
offs += int(dims.nbytes)

# Get Tensor Encoding Scheme Type
raw_dtype = self._get(offs, np.uint32)
offs += int(raw_dtype.nbytes)

# Get Tensor Offset
offset_tensor = self._get(offs, np.uint64)
offs += int(offset_tensor.nbytes)

return ReaderField(
orig_offs,
str(bytes(name_data), encoding = 'utf-8'),
Expand Down Expand Up @@ -233,10 +254,10 @@ def _build_fields(self, offs: int, count: int) -> int:
offs += field_size
return offs

def _build_tensors_fields(self, offs: int, count: int) -> tuple[int, list[ReaderField]]:
def _build_tensor_info(self, offs: int, count: int) -> tuple[int, list[ReaderField]]:
tensor_fields = []
for _ in range(count):
field = self._get_tensor(offs)
field = self._get_tensor_info_field(offs)
offs += sum(int(part.nbytes) for part in field.parts)
tensor_fields.append(field)
return offs, tensor_fields
Expand Down
29 changes: 28 additions & 1 deletion gguf-py/scripts/gguf-dump.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,27 @@ def dump_markdown_metadata(reader: GGUFReader, args: argparse.Namespace) -> None

markdown_content += "\n"

markdown_content += "### Tensor Data Offset\n"
markdown_content += '\n'
markdown_content += 'This table contains the offset and data segment relative to start of file\n'
markdown_content += '\n'

tensor_mapping_table: list[dict[str, str | int]] = []
for key, tensor in enumerate(reader.tensors):
data_offset_pretty = '{0:#16x}'.format(tensor.data_offset)
data_size_pretty = '{0:#16x}'.format(tensor.n_bytes)
tensor_mapping_table.append({"t_id":key, "layer_name":tensor.name, "data_offset":data_offset_pretty, "data_size":data_size_pretty})

tensors_mapping_table_header_map = [
{'key_name':'t_id', 'header_name':'T_ID', 'align':'right'},
{'key_name':'layer_name', 'header_name':'Tensor Layer Name', 'align':'left'},
{'key_name':'data_offset', 'header_name':'Data Offset (B)', 'align':'right'},
{'key_name':'data_size', 'header_name':'Data Size (B)', 'align':'right'},
]

markdown_content += markdown_table_with_alignment_support(tensors_mapping_table_header_map, tensor_mapping_table)
markdown_content += "\n"

for group in tensor_prefix_order:
tensors = tensor_groups[group]
group_elements = sum(tensor.n_elements for tensor in tensors)
Expand Down Expand Up @@ -364,14 +385,16 @@ def main() -> None:
parser.add_argument("--no-tensors", action="store_true", help="Don't dump tensor metadata")
parser.add_argument("--json", action="store_true", help="Produce JSON output")
parser.add_argument("--json-array", action="store_true", help="Include full array values in JSON output (long)")
parser.add_argument("--data-offset", action="store_true", help="Start of data offset")
parser.add_argument("--data-alignment", action="store_true", help="Data alignment applied globally to data field")
parser.add_argument("--markdown", action="store_true", help="Produce markdown output")
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")

args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"])

logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)

if not args.json and not args.markdown:
if not args.json and not args.markdown and not args.data_offset and not args.data_alignment:
logger.info(f'* Loading: {args.model}')

reader = GGUFReader(args.model, 'r')
Expand All @@ -380,6 +403,10 @@ def main() -> None:
dump_metadata_json(reader, args)
elif args.markdown:
dump_markdown_metadata(reader, args)
elif args.data_offset:
print(reader.data_offset) # noqa: NP100
elif args.data_alignment:
print(reader.alignment) # noqa: NP100
else:
dump_metadata(reader, args)

Expand Down
Loading