
Commit 1ea703c

[Compression] Implement a new strategy for finding frequently used substrings.
The code-book compression (compression using a dictionary) shortens words by encoding references into a string table. This commit changes the code that constructs the string table.

Previously, we scanned all of the substrings in the input up to a certain length and sorted them by frequency. The disadvantage of that approach was that we encoded parts of substrings multiple times: for example, the word "Collection" and the substring "ollec" had the same frequency. Attempts to prune the list were too compute-intensive and not very effective (we checked whether "ollec" is a substring of "Collection" and whether the two had a similar frequency).

This commit implements a completely different approach: we now partition long words into tokens. For example, the string "ArrayType10Collection" is split into "Array" + "Type" + "10Collection". This method is very effective, and with the updated tables we can now reduce the size of the string table by 57%! This change also reduces the size of the string table by 1/3. With this change (and the auto-generated header files, which are not included in this commit), the size of the Swift dylib on x86 is reduced from 4.4MB to 3.6MB.
1 parent 2a27acd commit 1ea703c
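As an illustration of the tokenization described above (not part of the commit), the sketch below assumes the getTokens generator added in the diff further down is in scope. Note that the generator also applies length thresholds, so a short token such as "Type" delimits the partition but is not emitted.

# Illustrative sketch only: assumes the getTokens() generator from the diff
# below. It partitions a long identifier into coarse tokens instead of
# enumerating every overlapping substring.
word = "ArrayType10Collection"
print(list(getTokens(word)))
# -> ['Array', '10Collection']  ("Type" falls below the length thresholds
#    inside getTokens, so it is dropped rather than emitted)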

File tree

1 file changed: +62 -11 lines changed


utils/name-compression/CBCGen.py

Lines changed: 62 additions & 11 deletions
@@ -15,30 +15,80 @@ def collect_top_entries(val):
   Collect the most frequent substrings and organize them in a table.
   """
   # sort items by hit rate.
-  lst = sorted(hist.items(), key=lambda x: x[1] * len(x[0]) , reverse=True)[0:val]
+  lst = sorted(hist.items(), key=lambda x: x[1] , reverse=True)[0:val]

   # Strip out entries with a small number of hits.
   # These entries are not likely to help the compressor and can extend the compile
   # time of the mangler unnecessarily.
-  lst = filter(lambda p: p[1] > 500, lst)
+  lst = filter(lambda p: p[1] > 15 and len(p[0]) > 3, lst)
   return lst

+def getTokens(line):
+  """
+  Split the incoming line into independent parts. The tokenizer has rules for
+  extracting identifiers (strings that start with digits followed by letters),
+  rules for detecting words (strings that start with upper case letters and
+  continue with lower case letters) and rules to glue swift mangling tokens
+  into subsequent words.
+  """
+  # String builder.
+  sb = ""
+  # The last character.
+  Last = ""
+  for ch in line:
+    if Last.isupper():
+      # Uppercase letter to digits -> starts a new token.
+      if ch.isdigit():
+        if len(sb) > 3:
+          yield sb
+        sb = ""
+        sb += ch
+        Last = ch
+        continue
+      # Uppercase letter to lowercase or uppercase -> continue.
+      Last = ch
+      sb += ch
+      continue
+
+    # Digit -> continue.
+    if Last.isdigit():
+      Last = ch
+      sb += ch
+      continue
+
+    # Lowercase letter to digit or uppercase letter -> stop.
+    if Last.islower():
+      if ch.isdigit() or ch.isupper():
+        if len(sb) > 4:
+          yield sb
+        sb = ""
+        sb += ch
+        Last = ch
+        continue
+      Last = ch
+      sb += ch
+      continue
+
+    # Just append unclassified characters to the token.
+    if len(sb) > 3:
+      yield sb
+    sb = ""
+    sb += ch
+    Last = ch
+  yield sb
+
 def addLine(line):
   """
   Extract all of the possible substrings from \p line and insert them into
   the substring dictionary. This method knows to ignore the _T swift prefix.
   """
   if not line.startswith("__T"): return

-  # strip the "__T" for the prefix calculations
+  # Strip the "__T" for the prefix calculations.
   line = line[3:]

-  max_string_length = 9
-  string_len = len(line)
-  for seg_len in xrange(3, max_string_length):
-    for start_idx in xrange(string_len - seg_len):
-      substr = line[start_idx:start_idx+seg_len]
-      hist[substr] += 1
-
+  # Add all of the tokens in the word to the histogram.
+  for tok in getTokens(line):
+    hist[tok] += 1

 # Read all of the input files and add the substrings into the table.
 for f in filenames:
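The new loop above feeds every token into a frequency histogram. A minimal usage sketch (not part of the commit), assuming hist is a collections.defaultdict(int), as the hist[tok] += 1 updates imply, and using made-up mangled names:

# Sketch only: populating the token histogram the same way addLine does,
# assuming the getTokens generator from the hunk above is in scope.
from collections import defaultdict

hist = defaultdict(int)
for name in ["__TArrayType10Collection", "__TArrayType9Generator"]:  # made-up names
  if not name.startswith("__T"):
    continue
  for tok in getTokens(name[3:]):  # strip the "__T" prefix, then tokenize
    hist[tok] += 1

# Most frequent tokens first, mirroring collect_top_entries.
print(sorted(hist.items(), key=lambda x: x[1], reverse=True))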
@@ -54,7 +104,8 @@ def addLine(line):
 encoders = [c for c in charset] # alphabet without the escape chars.
 enc_len = len(encoders)

-# Take the most frequent entries from the table.
+# Take the most frequent entries from the table that fit into the range of
+# our indices (assuming two characters for indices).
 table = collect_top_entries(enc_len * enc_len)

 # Calculate the reverse mapping between the char to its index.
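The updated comment caps the table at enc_len * enc_len entries because a reference into the string table is spelled with two characters drawn from the charset. A minimal sketch of that two-character indexing, using a placeholder charset rather than the alphabet the script actually computes:

# Sketch only: two index characters address at most enc_len * enc_len entries,
# which is why collect_top_entries(enc_len * enc_len) is the cap. The charset
# here is a placeholder, not the script's real alphabet.
charset = "abcdefghijklmnopqrstuvwxyz"
enc_len = len(charset)

def encode_index(idx):
  # Map a table index to a two-character reference.
  assert 0 <= idx < enc_len * enc_len
  return charset[idx // enc_len] + charset[idx % enc_len]

def decode_index(pair):
  # Recover the table index from its two-character reference.
  return charset.index(pair[0]) * enc_len + charset.index(pair[1])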

0 commit comments
