Skip to content

Commit 9f4fab6

Browse files
mofosyne authored and NeoZhangJianyu committed
Gguf dump start data offset via --data-offset and some extra refactor (ggml-org#8054)
* gguf-dump: add --data-offset
* gguf-dump: add tensor data offset table
* gguf-dump: refactor GGUFReader for clarity
* gguf-dump: add --data-alignment
* gguf-dump.py: Rename variables and adjust comments
  start_data_offset --> data_offset
  _build_tensors_info_fields --> _build_tensor_info
1 parent a932974 commit 9f4fab6

File tree

2 files changed

+53
-5
lines changed

2 files changed

+53
-5
lines changed

gguf-py/gguf/gguf_reader.py

Lines changed: 25 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -69,6 +69,7 @@ class GGUFReader:
6969
# I - same as host, S - swapped
7070
byte_order: Literal['I'] | Literal['S'] = 'I'
7171
alignment: int = GGUF_DEFAULT_ALIGNMENT
72+
data_offset: int
7273

7374
# Note: Internal helper, API may change.
7475
gguf_scalar_to_np: dict[GGUFValueType, type[np.generic]] = {
@@ -88,9 +89,13 @@ class GGUFReader:
8889
def __init__(self, path: os.PathLike[str] | str, mode: Literal['r'] | Literal['r+'] | Literal['c'] = 'r'):
8990
self.data = np.memmap(path, mode = mode)
9091
offs = 0
92+
93+
# Check for GGUF magic
9194
if self._get(offs, np.uint32, override_order = '<')[0] != GGUF_MAGIC:
9295
raise ValueError('GGUF magic invalid')
9396
offs += 4
97+
98+
# Check GGUF version
9499
temp_version = self._get(offs, np.uint32)
95100
if temp_version[0] & 65535 == 0:
96101
# If we get 0 here that means it's (probably) a GGUF file created for
@@ -103,12 +108,16 @@ def __init__(self, path: os.PathLike[str] | str, mode: Literal['r'] | Literal['r
103108
self.fields: OrderedDict[str, ReaderField] = OrderedDict()
104109
self.tensors: list[ReaderTensor] = []
105110
offs += self._push_field(ReaderField(offs, 'GGUF.version', [temp_version], [0], [GGUFValueType.UINT32]))
111+
112+
# Check tensor count and kv count
106113
temp_counts = self._get(offs, np.uint64, 2)
107114
offs += self._push_field(ReaderField(offs, 'GGUF.tensor_count', [temp_counts[:1]], [0], [GGUFValueType.UINT64]))
108115
offs += self._push_field(ReaderField(offs, 'GGUF.kv_count', [temp_counts[1:]], [0], [GGUFValueType.UINT64]))
109116
tensor_count, kv_count = temp_counts
110117
offs = self._build_fields(offs, kv_count)
111-
offs, tensors_fields = self._build_tensors_fields(offs, tensor_count)
118+
119+
# Build Tensor Info Fields
120+
offs, tensors_fields = self._build_tensor_info(offs, tensor_count)
112121
new_align = self.fields.get('general.alignment')
113122
if new_align is not None:
114123
if new_align.types != [GGUFValueType.UINT32]:
@@ -117,6 +126,7 @@ def __init__(self, path: os.PathLike[str] | str, mode: Literal['r'] | Literal['r
117126
padding = offs % self.alignment
118127
if padding != 0:
119128
offs += self.alignment - padding
129+
self.data_offset = offs
120130
self._build_tensors(offs, tensors_fields)
121131

122132
_DT = TypeVar('_DT', bound = npt.DTypeLike)
@@ -193,18 +203,29 @@ def _get_field_parts(
193203
# We can't deal with this one.
194204
raise ValueError('Unknown/unhandled field type {gtype}')
195205

196-
def _get_tensor(self, orig_offs: int) -> ReaderField:
206+
def _get_tensor_info_field(self, orig_offs: int) -> ReaderField:
197207
offs = orig_offs
208+
209+
# Get Tensor Name
198210
name_len, name_data = self._get_str(offs)
199211
offs += int(name_len.nbytes + name_data.nbytes)
212+
213+
# Get Tensor Dimensions Count
200214
n_dims = self._get(offs, np.uint32)
201215
offs += int(n_dims.nbytes)
216+
217+
# Get Tensor Dimension Array
202218
dims = self._get(offs, np.uint64, n_dims[0])
203219
offs += int(dims.nbytes)
220+
221+
# Get Tensor Encoding Scheme Type
204222
raw_dtype = self._get(offs, np.uint32)
205223
offs += int(raw_dtype.nbytes)
224+
225+
# Get Tensor Offset
206226
offset_tensor = self._get(offs, np.uint64)
207227
offs += int(offset_tensor.nbytes)
228+
208229
return ReaderField(
209230
orig_offs,
210231
str(bytes(name_data), encoding = 'utf-8'),
@@ -233,10 +254,10 @@ def _build_fields(self, offs: int, count: int) -> int:
233254
offs += field_size
234255
return offs
235256

236-
def _build_tensors_fields(self, offs: int, count: int) -> tuple[int, list[ReaderField]]:
257+
def _build_tensor_info(self, offs: int, count: int) -> tuple[int, list[ReaderField]]:
237258
tensor_fields = []
238259
for _ in range(count):
239-
field = self._get_tensor(offs)
260+
field = self._get_tensor_info_field(offs)
240261
offs += sum(int(part.nbytes) for part in field.parts)
241262
tensor_fields.append(field)
242263
return offs, tensor_fields

gguf-py/scripts/gguf-dump.py

Lines changed: 28 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -319,6 +319,27 @@ def dump_markdown_metadata(reader: GGUFReader, args: argparse.Namespace) -> None
319319

320320
markdown_content += "\n"
321321

322+
markdown_content += "### Tensor Data Offset\n"
323+
markdown_content += '\n'
324+
markdown_content += 'This table contains the offset and data segment relative to start of file\n'
325+
markdown_content += '\n'
326+
327+
tensor_mapping_table: list[dict[str, str | int]] = []
328+
for key, tensor in enumerate(reader.tensors):
329+
data_offset_pretty = '{0:#16x}'.format(tensor.data_offset)
330+
data_size_pretty = '{0:#16x}'.format(tensor.n_bytes)
331+
tensor_mapping_table.append({"t_id":key, "layer_name":tensor.name, "data_offset":data_offset_pretty, "data_size":data_size_pretty})
332+
333+
tensors_mapping_table_header_map = [
334+
{'key_name':'t_id', 'header_name':'T_ID', 'align':'right'},
335+
{'key_name':'layer_name', 'header_name':'Tensor Layer Name', 'align':'left'},
336+
{'key_name':'data_offset', 'header_name':'Data Offset (B)', 'align':'right'},
337+
{'key_name':'data_size', 'header_name':'Data Size (B)', 'align':'right'},
338+
]
339+
340+
markdown_content += markdown_table_with_alignment_support(tensors_mapping_table_header_map, tensor_mapping_table)
341+
markdown_content += "\n"
342+
322343
for group in tensor_prefix_order:
323344
tensors = tensor_groups[group]
324345
group_elements = sum(tensor.n_elements for tensor in tensors)
@@ -370,14 +391,16 @@ def main() -> None:
370391
parser.add_argument("--no-tensors", action="store_true", help="Don't dump tensor metadata")
371392
parser.add_argument("--json", action="store_true", help="Produce JSON output")
372393
parser.add_argument("--json-array", action="store_true", help="Include full array values in JSON output (long)")
394+
parser.add_argument("--data-offset", action="store_true", help="Start of data offset")
395+
parser.add_argument("--data-alignment", action="store_true", help="Data alignment applied globally to data field")
373396
parser.add_argument("--markdown", action="store_true", help="Produce markdown output")
374397
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
375398

376399
args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"])
377400

378401
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
379402

380-
if not args.json and not args.markdown:
403+
if not args.json and not args.markdown and not args.data_offset and not args.data_alignment:
381404
logger.info(f'* Loading: {args.model}')
382405

383406
reader = GGUFReader(args.model, 'r')
@@ -386,6 +409,10 @@ def main() -> None:
386409
dump_metadata_json(reader, args)
387410
elif args.markdown:
388411
dump_markdown_metadata(reader, args)
412+
elif args.data_offset:
413+
print(reader.data_offset) # noqa: NP100
414+
elif args.data_alignment:
415+
print(reader.alignment) # noqa: NP100
389416
else:
390417
dump_metadata(reader, args)
391418

0 commit comments

Comments
 (0)