 HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
 
+
 def parse_args():
-    parser = argparse.ArgumentParser(description='Upgrade a GPT4All model to the current format')
-    parser.add_argument('gpt4all_model', help='path to gpt4all-lora-quantized.bin')
-    parser.add_argument('tokenizer_model', help='path to LLaMA tokenizer.model file')
+    parser = argparse.ArgumentParser(
+        description="Upgrade a GPT4All model to the current format"
+    )
+    parser.add_argument("gpt4all_model", help="path to gpt4all-lora-quantized.bin")
+    parser.add_argument("tokenizer_model", help="path to LLaMA tokenizer.model file")
     return parser.parse_args()
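
A quick usage note: the reformat leaves the CLI unchanged, so the script still takes the model path and the LLaMA tokenizer path as its two positional arguments. Assuming the file is named convert-gpt4all-to-ggml.py (the name is not shown in this diff), an invocation would look like:

    python convert-gpt4all-to-ggml.py ./gpt4all-lora-quantized.bin ./tokenizer.model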
 
+
 def read_header(f_in):
     struct_fmt = "i" * (3 + len(HPARAMS))
     struct_size = struct.calcsize(struct_fmt)
     buf = f_in.read(struct_size)
     return struct.unpack(struct_fmt, buf)
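
A note on the format string: with five HPARAMS keys, "i" * (3 + len(HPARAMS)) is "iiiiiiii", i.e. eight native int32s, matching the eight fields that write_header unpacks below (the magic, the five hyperparameters, rot, and ftype). A minimal standalone sketch of the same read, assuming a model file at the path named in the script's help text:

    import struct

    HPARAMS = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]

    with open("gpt4all-lora-quantized.bin", "rb") as f:
        fmt = "i" * (3 + len(HPARAMS))  # 8 x int32: magic + 5 hparams + rot + ftype
        fields = struct.unpack(fmt, f.read(struct.calcsize(fmt)))
        magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype = fields
        print(hex(magic), vocab_size, dim)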
 
+
 def write_header(f_out, header):
     (magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype) = header
 
-    if magic != 0x67676d6c:
-        raise Exception('Invalid file magic. Must be an old style ggml file.')
+    if magic != 0x67676D6C:
+        raise Exception("Invalid file magic. Must be an old style ggml file.")
 
     values = [
-        0x67676d66,  # magic: ggmf in hex
-        1, # file version
+        0x67676D66,  # magic: ggmf in hex
+        1,  # file version
         vocab_size,
         dim,
         multiple_of,
         n_heads,
         n_layers,
         rot,
-        ftype
+        ftype,
     ]
     f_out.write(struct.pack("i" * len(values), *values))
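
The two magic constants are ASCII tags: 0x67676D6C spells "ggml" (the old, unversioned container this script accepts) and 0x67676D66 spells "ggmf" (the versioned container it writes, which is why a file-version int of 1 follows the magic). A quick check in a REPL:

    >>> bytes.fromhex("67676D6C").decode()
    'ggml'
    >>> bytes.fromhex("67676D66").decode()
    'ggmf'

(On disk the magic goes through struct.pack("i", ...), so the byte order follows the machine's native endianness.)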
 
+
 def write_tokens(fout, tokenizer):
     for i in range(tokenizer.vocab_size()):
         if tokenizer.is_unknown(i):
@@ -71,22 +77,25 @@ def write_tokens(fout, tokenizer):
         fout.write(text)
         fout.write(struct.pack("f", 0.0))
 
+
 def read_tokens(f_in, tokenizer):
     for i in range(tokenizer.vocab_size()):
         len_b = f_in.read(4)
         (length,) = struct.unpack("i", len_b)
         f_in.read(length)
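
Taken together, read_tokens and write_tokens show what the conversion does to the vocabulary section: the old file stores each token as an int32 length followed by that many bytes, and the new file adds a float32 score after the bytes (written as 0.0 here, since the old format carries no scores). A sketch of reading one token record in the new format, assuming the part of write_tokens elided by the hunk above still writes the length prefix first:

    import struct

    def read_new_token(f):
        # int32 length, the token bytes, then the float32 score added by this conversion
        (length,) = struct.unpack("i", f.read(4))
        text = f.read(length)
        (score,) = struct.unpack("f", f.read(4))
        return text, score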
 
+
 def copy_all_data(f_out, f_in):
     while True:
         buf = f_in.read(1024 * 1024)
         if not buf:
             break
         f_out.write(buf)
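
copy_all_data streams the remaining tensor data through a 1 MiB buffer so the whole model never has to sit in memory; the standard library offers the same behavior in one call:

    import shutil

    # equivalent to copy_all_data(f_out, f_in), 1 MiB at a time
    shutil.copyfileobj(f_in, f_out, length=1024 * 1024)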
 
+
 def convert_one_file(path_in, tokenizer):
     path_tmp = f"{path_in}.tmp"
-    path_orig= f"{path_in}.orig"
+    path_orig = f"{path_in}.orig"
     print(f"converting {path_in}")
     with open(path_in, "rb") as f_in, open(path_tmp, "wb") as f_out:
         write_header(f_out, read_header(f_in))
@@ -96,12 +105,14 @@ def convert_one_file(path_in, tokenizer):
     os.rename(path_in, path_orig)
     os.rename(path_tmp, path_in)
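
Note the safe-replace pattern here: the converted model is written to <path>.tmp, the input is preserved as <path>.orig, and only then is the temp file renamed over the original path. If a conversion needs to be undone, the backup can simply be moved back (illustrative paths):

    import os

    os.rename("gpt4all-lora-quantized.bin.orig", "gpt4all-lora-quantized.bin")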
 
+
 def main():
     args = parse_args()
 
     tokenizer = SentencePieceProcessor(args.tokenizer_model)
 
     convert_one_file(args.gpt4all_model, tokenizer)
 
+
 if __name__ == "__main__":
     main()