
Commit 50781ac

mikekgfb authored and malfet committed
Import AO quantizer api at top level (#327)
* refactor quantizer entry point quantize_model to be table driven, and scalable
* add tokenizer arg consistently
* code beautification
* refactor and import ao api wholesale
* code beautification
* tab->spc
1 parent bd0fcfe commit 50781ac

File tree

15 files changed: +250 -197 lines
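
The headline change is making the quantize_model entry point table driven, with the tokenizer argument passed consistently to every quantizer. As a rough sketch of that shape (not the repo's actual implementation: the handler class, table name, and keyword arguments below are placeholders for illustration), a table-driven dispatcher might look like this:

# Illustrative sketch of a table-driven quantize_model entry point.
# The handler class and table entries are placeholders, not the repo's real ones.
import json
from typing import Any, Dict


class NoOpQuantHandler:
    # A real handler would rewrite the model's weights for its scheme.
    def __init__(self, model, device: str = "cpu", tokenizer=None, **kwargs):
        self.model = model

    def quantized_model(self):
        return self.model


# One entry per quantization mode; adding a scheme means adding a row here
# instead of growing an if/elif chain.
quantizer_table: Dict[str, Any] = {
    "embedding": NoOpQuantHandler,
    "linear:int8": NoOpQuantHandler,
    "precision": NoOpQuantHandler,
}


def quantize_model(model, device: str, quantize_options, tokenizer=None):
    # quantize_options is the parsed --quantize JSON, e.g. {"linear:int8": {...}}.
    if isinstance(quantize_options, str):
        quantize_options = json.loads(quantize_options)
    for mode, kwargs in quantize_options.items():
        if mode not in quantizer_table:
            raise RuntimeError(f"unknown quantizer mode: {mode}")
        handler = quantizer_table[mode](
            model, device=device, tokenizer=tokenizer, **kwargs
        )
        model = handler.quantized_model()
    return model

Threading the tokenizer through every handler means calibration-based modes (GPTQ-style) can use it without special-casing the call site.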

.github/workflows/et-gguf.yml

Lines changed: 2 additions & 2 deletions
@@ -67,11 +67,11 @@ jobs:
           mkdir gguf_files
           wget -O gguf_files/llama-2-7b.Q4_0.gguf "https://huggingface.co/TheBloke/Llama-2-7B-GGUF/resolve/main/llama-2-7b.Q4_0.gguf?download=true"
           ./llama.cpp/quantize --allow-requantize gguf_files/llama-2-7b.Q4_0.gguf gguf_files/llama-2-7b.Q4_0.requant_F32.gguf F32
-          wget -O gguf_files/tokenizer.model https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
+          wget -O gguf_files/tokenizer.model https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
       - name: Run inference
         run: |
           export GGUF_PATH=${PWD}/gguf_files/llama-2-7b.Q4_0.gguf
-          export TOKENIZER_PATH=${PWD}/gguf_files/tokenizer.model
+          export TOKENIZER_PATH=${PWD}/gguf_files/tokenizer.model
           export MODEL_NAME=llama-2-7b_Q4_0_gguf

           python generate.py --tokenizer-path ${TOKENIZER_PATH} --gguf-path ${GGUF_PATH} --temperature 0 > ${PWD}/output_eager

GPTQ.py

Lines changed: 20 additions & 10 deletions
@@ -14,8 +14,8 @@
 aten = torch.ops.aten

 from eval import (
+    GPTFastEvalWrapper,
     setup_cache_padded_seq_input_pos_max_seq_length_for_prefill,
-    GPTFastEvalWrapper
 )


@@ -64,7 +64,6 @@ def __init__(
         )
         self.pad_calibration_inputs = False

-
     def add_input(self, args):
         if self.inputs is None:
             self.inputs = [MultiInput([arg]) for arg in args]
@@ -114,7 +113,6 @@ def _model_call(self, inps):
         )


-
 class MultiInput:
     def __init__(self, inputs):
         self.values = list(inputs)
@@ -127,7 +125,9 @@ def __getitem__(self, slice):
         return MultiInput(self.values[slice])

     def cuda(self):
-        self.values = [val.cuda() if isinstance(val, torch.Tensor) else val for val in self.values]
+        self.values = [
+            val.cuda() if isinstance(val, torch.Tensor) else val for val in self.values
+        ]


 class GenericGPTQRunner(fx.Interpreter):
@@ -236,7 +236,14 @@ def tensors_to_cuda(args):
             )
             transposed_args = list(
                 zip(
-                    *[x.values if isinstance(x, MultiInput) else [x] * multi_input_count for x in flat_args]
+                    *[
+                        (
+                            x.values
+                            if isinstance(x, MultiInput)
+                            else [x] * multi_input_count
+                        )
+                        for x in flat_args
+                    ]
                 )
             )
         else:
@@ -245,8 +252,8 @@ def tensors_to_cuda(args):

         # check whether we apply GPTQ to this module
         quantize_linear = (
-            (target == aten.linear.default) # if its a linear
-            and id(args[1]) in self.id_to_name # and if we know the layer name
+            (target == aten.linear.default)  # if its a linear
+            and id(args[1]) in self.id_to_name  # and if we know the layer name
             and not skip_quant  # and if we weren't told to skip quantization
             # and if the skip_layer_func doesn't say we should skip
             and not (self.skip_layer_func is not None and self.skip_layer_func(args[1]))
@@ -334,11 +341,14 @@ def SQNR(x, y):
                 target, (args[0][:2], DQ2, *args[2:]), kwargs, skip_quant=True
             )

-            print("SQNR for output without GPTQ (should be less than above)",
-                torch.cat([
+            print(
+                "SQNR for output without GPTQ (should be less than above)",
+                torch.cat(
+                    [
                         SQNR(old.cpu(), old_q.cpu()).unsqueeze(0)
                         for (old, old_q) in zip(old_out.values, old_q_out.values)
-                ]).mean(),
+                    ]
+                ).mean(),
             )
             return new_out

build/builder.py

Lines changed: 10 additions & 3 deletions
@@ -147,7 +147,9 @@ def from_args(cls, args): # -> TokenizerArgs:
             tokenizer_path = args.tokenizer_path
         elif args.model:  # Using a named, well-known model
             model_config = resolve_model_config(args.model)
-            tokenizer_path = Path(args.model_directory) / model_config.name / "tokenizer.model"
+            tokenizer_path = (
+                Path(args.model_directory) / model_config.name / "tokenizer.model"
+            )
         elif args.checkpoint_path:
             tokenizer_path = args.checkpoint_path.parent / "tokenizer.model"
         elif hasattr(args, "checkpoint_dir") and args.checkpoint_dir:
@@ -297,7 +299,7 @@ def _load_model(builder_args):
 def _initialize_model(
     builder_args,
     quantize,
-    tokenizer = None,
+    tokenizer=None,
 ):
     print("Loading model ...")
     t0 = time.time()
@@ -364,17 +366,22 @@ def _initialize_model(

     return model

+
 def tokenizer_setting_to_name(tiktoken: bool = False) -> str:
     return "TikToken" if tiktoken else "SentencePiece"

+
 def validate_args(model: Transformer, tokenizer_args: TokenizerArgs):
     use_tiktoken = model.config.use_tiktoken
     is_tiktoken = tokenizer_args.is_tiktoken

     if use_tiktoken is None:
         model.config.use_tiktoken = is_tiktoken
     elif use_tiktoken != is_tiktoken:
-        raise RuntimeError(f"model-specified tokenizer ({tokenizer_setting_to_name(use_tiktoken)} does not match provided tokenizer ({tokenizer_setting_to_name(is_tiktoken)}")
+        raise RuntimeError(
+            f"model-specified tokenizer ({tokenizer_setting_to_name(use_tiktoken)} does not match provided tokenizer ({tokenizer_setting_to_name(is_tiktoken)}"
+        )
+

 def resolve_model_name(model: str) -> str:
     # If the provided model name is an alias, retrieve the full path.

build/model.py

Lines changed: 3 additions & 11 deletions
@@ -58,8 +58,7 @@ def __post_init__(self):
         self.hidden_dim = find_multiple(hidden_dim, multiple_of)
         self.head_dim = self.dim // self.n_heads
         if isinstance(self.use_tiktoken, str):
-            self.use_tiktoken = (self.use_tiktoken == "True")
-
+            self.use_tiktoken = self.use_tiktoken == "True"

     @classmethod
     def from_params(cls, params_path):
@@ -118,7 +117,6 @@ def from_name(cls, name: str):

 class KVCache(nn.Module):
     def __init__(self, max_batch_size, max_seq_length, n_heads, head_dim, dtype=None):
-        # torch.float): # bfloat16 ):
         super().__init__()
         if not dtype:
             dtype = get_precision()
@@ -180,11 +178,6 @@ def setup_caches(self, max_batch_size, max_seq_length):
         self.register_buffer("causal_mask", causal_mask, persistent=True)

     def forward(self, idx: Tensor, input_pos: Optional[Tensor] = None) -> Tensor:
-        # print ("*")
-        # print (f"* shape idx: {idx.shape}")
-        # print (f"* shape pos: {input_pos.shape}")
-        # print("@")
-
         assert self.freqs_cis is not None, "Caches must be initialized first"
         mask = self.causal_mask[None, None, input_pos]
         freqs_cis = self.freqs_cis[input_pos]
@@ -194,7 +187,7 @@ def forward(self, idx: Tensor, input_pos: Optional[Tensor] = None) -> Tensor:
             x = layer(x, input_pos, freqs_cis, mask)
         x = self.norm(x)
         logits = self.output(x)
-        # print(f"******** logits shape: {logits.shape}")
+        # print(f"logits shape: {logits.shape}")
         return logits

     @classmethod
@@ -360,7 +353,6 @@ def forward(self, x: Tensor) -> Tensor:
         return output * self.weight


-# transpsoed first two arguments to align with model in ET
 def precompute_freqs_cis(
     n_elem: int, seq_len: int, base: int = 10000, dtype=None
 ) -> Tensor:
@@ -373,7 +365,7 @@ def precompute_freqs_cis(
     freqs = torch.outer(t, freqs)
     freqs_cis = torch.polar(torch.ones_like(freqs), freqs)
     cache = torch.stack([freqs_cis.real, freqs_cis.imag], dim=-1)
-    return cache.to(dtype=dtype) # bfloat16)
+    return cache.to(dtype=dtype)


 def apply_rotary_emb(x: Tensor, freqs_cis: Tensor) -> Tensor:

chat_in_browser.py

Lines changed: 24 additions & 17 deletions
@@ -1,60 +1,67 @@
 # -*- coding: UTF-8 -*-
-from flask import Flask, render_template, request
-from cli import add_arguments_for_generate, arg_init, check_args
-from generate import main as generate_main
 import subprocess
 import sys

+from cli import add_arguments_for_generate, arg_init, check_args
+from flask import Flask, render_template, request
+from generate import main as generate_main
+

 convo = ""
 disable_input = False

+
 def create_app(*args):
     app = Flask(__name__)

     import subprocess
+
     # create a new process and set up pipes for communication
-    proc = subprocess.Popen(["python", "generate.py", *args],
-                            stdin=subprocess.PIPE,
-                            stdout=subprocess.PIPE)
+    proc = subprocess.Popen(
+        ["python", "generate.py", *args], stdin=subprocess.PIPE, stdout=subprocess.PIPE
+    )

-    @app.route('/')
+    @app.route("/")
     def main():
         output = ""
         global disable_input

         while True:
             line = proc.stdout.readline()
-            if line.decode('utf-8').startswith("What is your prompt?"):
+            if line.decode("utf-8").startswith("What is your prompt?"):
                 break
-            output += line.decode('utf-8').strip() + "\n"
-        return render_template('chat.html', convo="Hello! What is your prompt?", disable_input=disable_input)
+            output += line.decode("utf-8").strip() + "\n"
+        return render_template(
+            "chat.html",
+            convo="Hello! What is your prompt?",
+            disable_input=disable_input,
+        )

-    @app.route('/chat', methods=['GET', 'POST'])
+    @app.route("/chat", methods=["GET", "POST"])
     def chat():
         # Retrieve the HTTP POST request parameter value from 'request.form' dictionary
-        _prompt = request.form.get('prompt', '')
-        proc.stdin.write((_prompt + "\n").encode('utf-8'))
+        _prompt = request.form.get("prompt", "")
+        proc.stdin.write((_prompt + "\n").encode("utf-8"))
         proc.stdin.flush()

         output = ""
         global disable_input

         while True:
             line = proc.stdout.readline()
-            if line.decode('utf-8').startswith("What is your prompt?"):
+            if line.decode("utf-8").startswith("What is your prompt?"):
                 break
-            if line.decode('utf-8').startswith("=========="):
+            if line.decode("utf-8").startswith("=========="):
                 disable_input = True
                 break
-            output += line.decode('utf-8').strip() + "\n"
+            output += line.decode("utf-8").strip() + "\n"

         global convo

         if _prompt:
             convo += "Your prompt:\n" + _prompt + "\n\n"
             convo += "My response:\n" + output + "\n\n"

-        return render_template('chat.html', convo=convo, disable_input=disable_input)
+        return render_template("chat.html", convo=convo, disable_input=disable_input)

     return app

cli.py

Lines changed: 15 additions & 14 deletions
@@ -12,9 +12,11 @@
 # CPU is always available and also exportable to ExecuTorch
 default_device = "cpu"  # 'cuda' if torch.cuda.is_available() else 'cpu'

+
 def check_args(args, name: str) -> None:
     pass

+
 def add_arguments_for_chat(parser):
     # Only chat specific options should be here
     _add_arguments_common(parser)
@@ -24,10 +26,7 @@ def add_arguments_for_browser(parser):
     # Only browser specific options should be here
     _add_arguments_common(parser)
     parser.add_argument(
-        "--port",
-        type=int,
-        default=5000,
-        help="Port for the web server in browser mode"
+        "--port", type=int, default=5000, help="Port for the web server in browser mode"
     )
     _add_arguments_common(parser)

@@ -122,10 +121,7 @@ def add_arguments(parser):
         help="Top-k for sampling",
     )
     parser.add_argument(
-        "--temperature",
-        type=float,
-        default=0.8,
-        help="Temperature for sampling"
+        "--temperature", type=float, default=0.8, help="Temperature for sampling"
     )
     parser.add_argument(
         "--compile",
@@ -204,20 +200,25 @@ def add_arguments(parser):
         help="Use the specified ExecuTorch .pte model file",
     )
     parser.add_argument(
-        "-d", "--dtype",
+        "-d",
+        "--dtype",
         default="float32",
         help="Override the dtype of the model (default is the checkpoint dtype). Options: bf16, fp16, fp32",
     )
     parser.add_argument(
-        "-v", "--verbose",
+        "-v",
+        "--verbose",
         action="store_true",
         help="Verbose output",
     )
     parser.add_argument(
-        "--quantize", type=str, default="{ }", help=(
-            'Quantization options. pass in as {"<mode>" : {"<argname1>" : <argval1>, "<argname2>" : <argval2>,...},} '+
-            'modes are: embedding, linear:int8, linear:int4, linear:int4-gptq, linear:int4-hqq, linear:a8w4dq, precision.'
-        )
+        "--quantize",
+        type=str,
+        default="{ }",
+        help=(
+            'Quantization options. pass in as {"<mode>" : {"<argname1>" : <argval1>, "<argname2>" : <argval2>,...},} '
+            + "modes are: embedding, linear:int8, linear:int4, linear:int4-gptq, linear:int4-hqq, linear:a8w4dq, precision."
+        ),
     )
     parser.add_argument(
         "--params-table",

download.py

Lines changed: 1 addition & 4 deletions
@@ -9,10 +9,7 @@
 from typing import Optional, Sequence

 from build.convert_hf_checkpoint import convert_hf_checkpoint
-from config.model_config import (
-    ModelDistributionChannel,
-    resolve_model_config,
-)
+from config.model_config import ModelDistributionChannel, resolve_model_config

 from requests.exceptions import HTTPError