Commit baa1efe

Allow layer repeats

1 parent 24c686b commit baa1efe
2 files changed: +77 -31 lines
exllamav2/model.py

Lines changed: 63 additions & 29 deletions
@@ -130,14 +130,14 @@ def __init__(self, config: ExLlamaV2Config, lazy_load = False):
         self.modules.append(ExLlamaV2Embedding(self, "model.embed_tokens"))
         self.modules_dict[self.modules[-1].key] = self.modules[-1]

-        for layer_idx in range(self.config.num_hidden_layers):
+        for layer_list in range(self.config.num_hidden_layers):

-            self.modules.append(ExLlamaV2Attention(self, f"model.layers.{layer_idx}", layer_idx))
+            self.modules.append(ExLlamaV2Attention(self, f"model.layers.{layer_list}", layer_list))
             for m in self.modules[-1].submodules: self.modules_dict[m.key] = m
             if self.config.architecture == "Mixtral":
-                self.modules.append(ExLlamaV2MoEMLP(self, f"model.layers.{layer_idx}", layer_idx))
+                self.modules.append(ExLlamaV2MoEMLP(self, f"model.layers.{layer_list}", layer_list))
             else:
-                self.modules.append(ExLlamaV2MLP(self, f"model.layers.{layer_idx}", layer_idx))
+                self.modules.append(ExLlamaV2MLP(self, f"model.layers.{layer_list}", layer_list))
             for m in self.modules[-1].submodules: self.modules_dict[m.key] = m

@@ -150,15 +150,40 @@ def __init__(self, config: ExLlamaV2Config, lazy_load = False):

         # Find last layer that affects k/v cache

-        layer_idx = len(self.modules)
+        layer_list = len(self.modules)
         while True:
-            layer_idx -= 1
-            if isinstance(self.modules[layer_idx], ExLlamaV2Attention):
+            layer_list -= 1
+            if isinstance(self.modules[layer_list], ExLlamaV2Attention):
                 break

-        self.last_kv_layer_idx = layer_idx
+        self.last_kv_layer_idx = layer_list


+        if hasattr(config, 'repeats'):
+            self.layers = []
+
+            def listLeftIndex(alist, value):
+                if value == 0:
+                    return 0
+                return alist.index(str(value))
+
+            def listRightIndex(alist, value):
+                if value > len(alist):
+                    return -1
+                return len(alist) - alist[-1::-1].index(str(value)) - 1
+
+            layer_list = [layer.key.split(".")[-1] for layer in self.modules]
+
+            for interval in config.repeats:
+                start_idx = listLeftIndex(layer_list, interval[0])
+                end_idx = listRightIndex(layer_list, interval[1])
+                self.layers.extend(list(range(start_idx, end_idx + 1)))
+            self.layers.extend(list(range(listRightIndex(layer_list, config.repeats[-1][1]), len(layer_list))))
+
+            # If we have created a Frankenmerge, let's print it to verify!
+            for layer in self.layers:
+                print(layer, self.modules[layer].key)
+
     def set_device_map(self, allocation, embed_cpu = True):

         self.cache_map = {}
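The interval expansion above is easiest to see on a toy module list. Below is a minimal sketch, not part of the commit, assuming 12 decoder layers with two modules (attention, MLP) each and a hypothetical repeats spec of [(0, 7), (4, 11)]:

# Toy reconstruction of the expansion above (illustrative only):
# module keys reduced to their suffixes, as layer_list does in the commit.
keys = ["embed_tokens"] + [str(i) for i in range(12) for _ in (0, 1)] + ["norm", "lm_head"]

def list_left_index(alist, value):
    return 0 if value == 0 else alist.index(str(value))

def list_right_index(alist, value):
    return len(alist) - alist[::-1].index(str(value)) - 1

repeats = [(0, 7), (4, 11)]          # hypothetical --repeats value
layers = []
for start, end in repeats:
    layers.extend(range(list_left_index(keys, start), list_right_index(keys, end) + 1))
layers.extend(range(list_right_index(keys, repeats[-1][1]), len(keys)))

# Visits layers 0-7, then 4-11 again, then norm and lm_head. Note the tail
# range starts at the last interval's right index rather than one past it,
# so that final module appears twice; the print loop the commit adds makes
# such overlaps visible.
print([keys[i] for i in layers])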
@@ -582,6 +607,23 @@ def _forward(self,
                  return_last_state = False,
                  position_offsets = None):

+        def process_module(module, x, last_state):
+            device = _torch_device(module.device_idx)
+
+            if idx == self.head_layer_idx:
+                if last_id_only and return_last_state:
+                    x = x.narrow(-2, -1, 1)
+                    last_state = x
+                elif last_id_only:
+                    x = x.narrow(-2, -1, 1)
+                elif return_last_state:
+                    last_state = x.narrow(-2, -1, 1)
+
+            x = safe_move_tensor(x, device)
+            x = module.forward(x, cache = cache, attn_params = attn_params, past_len = past_len, loras = loras)
+
+            return x, last_state
+
         batch_size, seq_len = input_ids.shape
         past_len = 0
         if cache is not None:
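One subtlety worth flagging: process_module takes module, x, and last_state as parameters, but reads idx, last_id_only, return_last_state, cache, attn_params, past_len, and loras from the enclosing _forward scope, so it sees the dispatch loop's current idx at call time. A minimal sketch of that late-binding behavior:

# Minimal sketch: an inner function resolves an enclosing-scope name
# when it is called, not when it is defined.
def outer():
    def report():
        return idx              # looked up in outer()'s scope at call time
    for idx in range(3):
        print(report())         # prints 0, then 1, then 2

outer()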
@@ -596,27 +638,19 @@ def _forward(self,
         attn_params = ExLlamaV2Attention.Params(batch_size, seq_len, past_len, input_mask, position_offsets)
         last_state = None

-        for idx, module in enumerate(self.modules):
-
-            device = _torch_device(module.device_idx)
-
-            # Onward
-
-            if idx == self.head_layer_idx:
-                if last_id_only and return_last_state:
-                    x = x.narrow(-2, -1, 1)
-                    last_state = x
-                elif last_id_only:
-                    x = x.narrow(-2, -1, 1)
-                elif return_last_state:
-                    last_state = x.narrow(-2, -1, 1)
-
-            x = safe_move_tensor(x, device)
-            x = module.forward(x, cache = cache, attn_params = attn_params, past_len = past_len, loras = loras)
-
-            if preprocess_only and idx == self.last_kv_layer_idx:
-                x = None
-                break
+        if hasattr(self, 'layers'):
+            for i, idx in enumerate(self.layers):
+                module = self.modules[idx]
+                x, last_state = process_module(module, x, last_state)
+                if preprocess_only and idx == self.last_kv_layer_idx:
+                    x = None
+                    break
+        else:
+            for idx, module in enumerate(self.modules):
+                x, last_state = process_module(module, x, last_state)
+                if preprocess_only and idx == self.last_kv_layer_idx:
+                    x = None
+                    break

         # Advance cache

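The forward change itself reduces to replaying modules through an index sequence when self.layers exists. A stripped-down sketch of that control flow, with placeholder callables standing in for the real modules:

# Stripped-down sketch of the repeat-aware dispatch (placeholder modules).
modules = [lambda x, i=i: x + i for i in range(4)]   # stand-ins for real layers
layers = [0, 1, 2, 1, 2, 3]                          # index sequence with a repeat

x = 0
for idx in layers:
    x = modules[idx](x)
print(x)   # 0 + 0 + 1 + 2 + 1 + 2 + 3 = 9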
exllamav2/model_init.py

Lines changed: 14 additions & 2 deletions
@@ -1,5 +1,5 @@

-import argparse, sys, os, glob
+import argparse, sys, os, glob, ast

 from exllamav2 import(
     ExLlamaV2,
@@ -17,6 +17,7 @@ def add_args(parser):
     parser.add_argument("-nfa", "--no_flash_attn", action = "store_true", help = "Disable Flash Attention")
     parser.add_argument("-lm", "--low_mem", action = "store_true", help = "Enable VRAM optimizations, potentially trading off speed")
     parser.add_argument("-ept", "--experts_per_token", type = int, help = "Override MoE model's default number of experts per token")
+    parser.add_argument("--repeats", type = parse_tuple_list, help = "List of (start, end) tuples of layers to repeat")


 def print_options(args):
@@ -60,6 +61,16 @@ def check_args(args):
             print(f" ## Error: Cannot find {filename} in {args.model_dir}")
             sys.exit()

+def parse_tuple_list(string):
+    try:
+        # Safely evaluate the string as a Python literal (list of tuples)
+        tuple_list = ast.literal_eval(string)
+        if not all(isinstance(item, tuple) for item in tuple_list):
+            raise ValueError
+        return tuple_list
+    except (ValueError, SyntaxError, TypeError):
+        raise argparse.ArgumentTypeError("Input must be a list of tuples")
+

 def init(args, quiet = False, allow_auto_split = False, skip_load = False):

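For reference, parse_tuple_list accepts any string that ast.literal_eval can turn into a list of tuples; anything else is rejected at argument-parsing time. The sample inputs below are hypothetical:

# Hypothetical inputs, shown to illustrate the accepted format.
parse_tuple_list("[(0, 20), (10, 30)]")   # -> [(0, 20), (10, 30)]
parse_tuple_list("[(0, 20)]")             # -> [(0, 20)]
parse_tuple_list("0-20, 10-30")           # raises argparse.ArgumentTypeError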
@@ -76,7 +87,8 @@ def init(args, quiet = False, allow_auto_split = False, skip_load = False):
     if args.rope_alpha: config.scale_alpha_value = args.rope_alpha
     config.no_flash_attn = args.no_flash_attn
     if args.experts_per_token: config.num_experts_per_token = args.experts_per_token
-
+    if args.repeats: config.repeats = args.repeats
+
     # Set low-mem options

     if args.low_mem: config.set_low_mem()

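Putting the pieces together, the new flag flows from the command line through add_args and parse_tuple_list into config.repeats. A minimal sketch, assuming model_init is importable from the exllamav2 package and its existing -m/--model_dir argument; the model path and repeats value are placeholders:

# Minimal sketch of the argument flow (paths and values are placeholders).
import argparse
from exllamav2 import model_init

parser = argparse.ArgumentParser()
model_init.add_args(parser)
args = parser.parse_args(["-m", "/path/to/model", "--repeats", "[(0, 20), (10, 30)]"])
print(args.repeats)   # [(0, 20), (10, 30)], later copied to config.repeats by init()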