Skip to content

Commit 73eefdf

Browse files
logicchainscebtenzzre
authored andcommitted
gguf: add script for converting falcon 180B
1 parent 019ba1d commit 73eefdf

File tree

1 file changed

+268
-0
lines changed

1 file changed

+268
-0
lines changed

convert-falcon180-hf-to-gguf.py

Lines changed: 268 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,268 @@
1+
#!/usr/bin/env python3
2+
# HF falcon180B --> gguf conversion
3+
4+
from __future__ import annotations
5+
6+
import argparse
7+
import json
8+
import os
9+
import struct
10+
import sys
11+
from pathlib import Path
12+
from typing import Any
13+
14+
import numpy as np
15+
import torch
16+
from transformers import AutoTokenizer # type: ignore[import]
17+
from safetensors import safe_open
18+
19+
if 'NO_LOCAL_GGUF' not in os.environ:
20+
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
21+
import gguf
22+
23+
24+
def bytes_to_unicode() -> dict[int, str]:
    """
    Return a dict mapping each of the 256 byte values to a distinct unicode character.

    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.

    Bytes in the ranges "!".."~", "¡".."¬" and "®".."ÿ" map to the character
    with the same code point; every other byte maps, in increasing byte order,
    to chr(256), chr(257), ...

    ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
    """
    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
    cs = bs[:]
    # Snapshot of the bytes that keep their own code point. Bytes appended below
    # are never looked up again, so a fixed set gives the same result as
    # scanning the growing list.
    self_mapped = set(bs)
    n = 0
    for b in range(2**8):
        if b not in self_mapped:
            bs.append(b)
            cs.append(2**8+n)
            n += 1
    return dict(zip(bs, map(chr, cs)))
44+
45+
46+
def count_model_parts(dir_model: Path) -> int:
    """Count the sharded safetensors checkpoint files in *dir_model*.

    A file is counted when its name starts with "model-00" and ends with
    ".safetensors". Requiring the suffix keeps stray files that merely share
    the prefix (partial downloads, backups, ...) out of the count; the count
    is used elsewhere in this script to build the expected shard file names,
    so an inflated value would point the loader at files that do not exist.

    Prints the number of parts when at least one is found.

    Returns:
        The number of matching files (0 when there are none).
    """
    num_parts = sum(
        1
        for filename in os.listdir(dir_model)
        if filename.startswith("model-00") and filename.endswith(".safetensors")
    )

    if num_parts > 0:
        print("gguf: found " + str(num_parts) + " model parts")
    return num_parts
55+
56+
57+
def parse_args() -> argparse.Namespace:
    """Parse the converter's command line (read from sys.argv).

    Arguments:
        --vocab-only  flag; extract only the vocab
        --outfile     optional output path
        model         positional path to the model
        ftype         optional positional output format: 0 = float32, 1 = float16
                      (defaults to 1 when omitted)

    Returns:
        The populated argparse.Namespace.
    """
    parser = argparse.ArgumentParser(description="Convert a Falcon model to a GGML compatible file")
    parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
    parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
    parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
    # nargs="?" is what makes the declared default take effect: without it a
    # positional argument is always required and `default` is ignored.
    parser.add_argument(
        "ftype", type=int, choices=[0, 1], default=1, nargs="?",
        help="output format - use 0 for float32, 1 for float16",
    )
    return parser.parse_args()
64+
65+
# Command-line settings; the names below are module-level values read by the
# rest of the script.
args = parse_args()
dir_model = args.model
ftype = args.ftype

# The script reads its inputs from inside the model directory, so anything
# that is not a directory is rejected up front.
if not dir_model.is_dir():
    print(f'Error: {args.model} is not a directory', file = sys.stderr)
    sys.exit(1)

# ftype selects the tensor data type of the output:
#   0 -> float32
#   1 -> float16
# ftype_str maps that index to the tag used in the default output file name.
ftype_str = ["f32", "f16"]

# An explicit --outfile wins; otherwise the output is placed in the same
# directory as the model.
fname_out = (
    args.outfile
    if args.outfile is not None
    else dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
)
85+
86+
print("gguf: loading model "+dir_model.name)

# config.json supplies the hyperparameters (hparams) that the rest of the
# script reads when writing metadata and reshaping tensors.
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)

# Only FalconForCausalLM checkpoints are accepted. A missing or empty
# "architectures" entry is reported as unsupported instead of raising a raw
# KeyError/IndexError, and the message goes to stderr like the script's other
# fatal errors.
architectures = hparams.get("architectures") or ["<missing>"]
if architectures[0] != "FalconForCausalLM":
    print("Model architecture not supported: " + str(architectures[0]), file = sys.stderr)
    sys.exit(1)
95+
96+
# get number of model parts
num_parts = count_model_parts(dir_model)

# Writer for the output file, tagged with the Falcon architecture name.
ARCH = gguf.MODEL_ARCH.FALCON
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])

print("gguf: get model metadata")

block_count = hparams["num_hidden_layers"]

# Model-level key/value metadata. The calls are kept in their original order.
gguf_writer.add_name("Falcon")
gguf_writer.add_context_length(2048) # not in config.json
gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
hidden_size = hparams["hidden_size"]
gguf_writer.add_embedding_length(hidden_size)
gguf_writer.add_feed_forward_length(4 * hidden_size)
gguf_writer.add_block_count(block_count)
gguf_writer.add_head_count(hparams["num_attention_heads"])
# A config without "num_kv_heads" is written with a single kv head.
gguf_writer.add_head_count_kv(hparams.get("num_kv_heads", 1))
gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
gguf_writer.add_file_type(ftype)
119+
120+
# TOKENIZATION

print("gguf: get tokenizer metadata")

# Parallel lists, one entry per token id, in id order.
tokens: list[bytearray] = []
scores: list[float] = []
toktypes: list[int] = []

tokenizer_json_file = dir_model / 'tokenizer.json'
if not tokenizer_json_file.is_file():
    print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
    sys.exit(1)

# gpt2 tokenizer
gguf_writer.add_tokenizer_model("gpt2")

with open(tokenizer_json_file, "r", encoding="utf-8") as f:
    tokenizer_json = json.load(f)

print("gguf: get gpt2 tokenizer vocab")

vocab_size = len(tokenizer_json["model"]["vocab"])

# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model)

# id -> encoded token string
reverse_vocab = {tok_id: encoded_tok for encoded_tok, tok_id in tokenizer.vocab.items()}
byte_encoder = bytes_to_unicode()
# unicode character -> byte value (the keys are one-character strings)
byte_decoder = {v: k for k, v in byte_encoder.items()}

for i in range(vocab_size):
    if i in reverse_vocab:
        # Turn the token back into raw bytes. Characters produced by
        # bytes_to_unicode() map back to their single byte; any other character
        # (e.g. in special tokens) is stored as its UTF-8 encoding.
        # byte_decoder is keyed by character, so it must be looked up with the
        # character itself - indexing it with ord(c) can never succeed.
        text = bytearray()
        for c in reverse_vocab[i]:
            if c in byte_decoder:
                text.append(byte_decoder[c])
            else:
                text.extend(c.encode('utf-8'))
    else:
        print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
        pad_token = f"[PAD{i}]".encode("utf8")
        text = bytearray(pad_token)

    tokens.append(text)
    scores.append(0.0) # dummy
    toktypes.append(gguf.TokenType.NORMAL) # dummy

gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)

special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
special_vocab.add_to_gguf(gguf_writer)
176+
177+
# TENSORS

# Lookup from checkpoint tensor names to output tensor names.
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)

# params for qkv transform
n_head = hparams["num_attention_heads"]
# Configs without "num_kv_heads" are treated as having one kv head.
n_head_kv = hparams.get("num_kv_heads", 1)
head_dim = hparams["hidden_size"] // n_head
186+
187+
# tensor info
print("gguf: get tensor metadata")

# Choose the checkpoint files to read. Everything below goes through
# safetensors' safe_open(), which cannot read a pickled "pytorch_model.bin",
# so an unsharded checkpoint is looked up under the standard single-file
# safetensors name and a clear error is reported when none exists.
if args.vocab_only:
    # Vocab-only export: no tensors are read at all.
    part_names = ()
elif num_parts == 0:
    single_part = "model.safetensors"
    if not (dir_model / single_part).is_file():
        print(f'Error: no safetensors checkpoint found in {dir_model}', file = sys.stderr)
        sys.exit(1)
    part_names = (single_part,)
else:
    part_names = tuple(
        f"model-{n:05}-of-{num_parts:05}.safetensors" for n in range(1, num_parts + 1)
    )

for part_name in part_names:
    print("gguf: loading model part '" + part_name + "'")
    with safe_open(dir_model / part_name, framework="pt", device="cpu") as model_part:

        for name in model_part.keys():
            data = model_part.get_tensor(name)

            # Remembered only for the progress line printed below.
            old_dtype = data.dtype

            # convert any unsupported data types to float32
            if data.dtype != torch.float16 and data.dtype != torch.float32:
                data = data.to(torch.float32)

            # QKV tensor transform
            # The original query_key_value tensor contains n_head_kv "kv groups",
            # each consisting of n_head/n_head_kv query weights followed by one key
            # and one value weight (shared by all query heads in the kv group).
            # This layout makes it a big pain to work with in GGML.
            # So we rearrange them here, so that we have n_head query weights
            # followed by n_head_kv key weights followed by n_head_kv value weights,
            # in contiguous fashion.
            # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py

            if "query_key_value" in name:
                qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
                q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head)
                k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
                v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
                data = torch.cat((q,k,v)).reshape_as(data)

            data = data.squeeze().numpy()

            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
            if new_name is None:
                # An unmapped tensor is a failure: report it on stderr and exit
                # with a non-zero status so callers do not mistake it for success.
                print("Can not map tensor '" + name + "'", file = sys.stderr)
                sys.exit(1)

            n_dims = len(data.shape)
            # dtype before any of the conversions below; all three checks test
            # this original value, so at most one of them changes the data.
            data_dtype = data.dtype

            # if f32 desired, convert any float16 to float32
            if ftype == 0 and data_dtype == np.float16:
                data = data.astype(np.float32)

            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
            if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
            if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                data = data.astype(np.float16)

            print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))

            gguf_writer.add_tensor(new_name, data)
255+
256+
257+
# Output stages, run in order; each entry pairs the progress message with the
# writer method that performs the stage.
write_steps = [
    ("gguf: write header", gguf_writer.write_header_to_file),
    ("gguf: write metadata", gguf_writer.write_kv_data_to_file),
]
# Tensor data is skipped for a vocab-only export.
if not args.vocab_only:
    write_steps.append(("gguf: write tensors", gguf_writer.write_tensors_to_file))

for step_message, run_step in write_steps:
    print(step_message)
    run_step()

gguf_writer.close()

print(f"gguf: model successfully exported to '{fname_out}'")
print("")

0 commit comments

Comments
 (0)