Skip to content

Commit 20726a1

Browse files
committed
Make IsVendor quicker
Although iterating across the regexps is quicker than naively concatenating them, it is still quite slow. This PR proposes a slightly cleverer solution. First, instead of just concatenating with groups, this PR uses non-capturing groups. This speeds up the regexp processing. Secondly, we group the regexps into 3 groups: those that have to be at the start, those that are segments or at the start, and the rest. This makes a considerable speed improvement. Thirdly, the regexps are sorted within those groups, which also speeds things up. All in all, for a non-vendored file this makes IsVendor around twice as fast. Signed-off-by: Andrew Thornton <[email protected]>
1 parent d2d4c32 commit 20726a1

File tree

1 file changed

+114
-1
lines changed

1 file changed

+114
-1
lines changed

utils.go

Lines changed: 114 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@ package enry
33
import (
44
"bytes"
55
"path/filepath"
6+
"regexp"
7+
"sort"
68
"strings"
79

810
"github.com/go-enry/go-enry/v2/data"
@@ -61,9 +63,11 @@ func IsDotFile(path string) bool {
6163
return strings.HasPrefix(base, ".") && base != "."
6264
}
6365

66+
// isVendorRegExp is the single regexp IsVendor matches paths against; it is assigned in init.
var isVendorRegExp *regexp.Regexp
67+
6468
// IsVendor reports whether path is a vendor path, i.e. whether isVendorRegExp matches it.
6569
func IsVendor(path string) bool {
66-
return matchRegexSlice(data.VendorMatchers, path)
70+
return isVendorRegExp.MatchString(path)
6771
}
6872

6973
// IsTest returns whether or not path is a test path.
@@ -131,3 +135,112 @@ func IsGenerated(path string, content []byte) bool {
131135

132136
return false
133137
}
138+
139+
func init() {
140+
// We now collate the individual regexps that make up the VendorMatchers to
141+
// produce a single large regexp which is around twice as fast to test than
142+
// simply iterating through all the regexps or naïvely collating the
143+
// regexps.
144+
//
145+
// ---
146+
//
147+
// data.VendorMatchers here is a slice containing individual regexps that
148+
// match a vendor file therefore if we want to test if a filename is a
149+
// Vendor we need to test whether that filename matches one or more of
150+
// those regexps.
151+
//
152+
// Now we could test each matcher in turn using a shortcircuiting test i.e.
153+
//
154+
// func IsVendor(filename string) bool {
155+
// for _, matcher := range data.VendorMatchers {
156+
// if matcher.Match(filename) {
157+
// return true
158+
// }
159+
// }
160+
// return false
161+
// }
162+
//
163+
// Or concatentate all these regexps using groups i.e.
164+
//
165+
// `(regexp1)|(regexp2)|(regexp3)|...`
166+
//
167+
// However both of these are relatively slow and they don't take advantage
168+
// of the inherent structure within our regexps...
169+
//
170+
// If we look at our regexps there are essentially three types of regexp:
171+
//
172+
// 1. Those that start with `^`
173+
// 2. Those that start with `(^|/)`
174+
// 3. Others
175+
//
176+
// If we collate our regexps into these groups that will significantly
177+
// reduce the likelihood of backtracking within the regexp trie matcher.
178+
//
179+
// A further improvement is to use non-capturing groups as otherwise the
180+
// regexp parser, whilst matching, will have to allocate slices for
181+
// matching positions. (A future improvement here could be in the use of
182+
// enforcing non-capturing groups within the sub-regexps too.)
183+
//
184+
// Finally if we sort the segments we can help the matcher build a more
185+
// efficient matcher and trie.
186+
187+
// alias the VendorMatchers to simplify things
188+
matchers := data.VendorMatchers
189+
190+
// Create three temporary string slices for our three groups above - prefixes removed
191+
caretStrings := make([]string, 0, 10)
192+
caretSegmentStrings := make([]string, 0, 10)
193+
matcherStrings := make([]string, 0, len(matchers))
194+
195+
// Walk the matchers and check their string representation for each group prefix, remove it and add to the respective group slices
196+
for _, matcher := range matchers {
197+
str := matcher.String()
198+
if str[0] == '^' {
199+
caretStrings = append(caretStrings, str[1:])
200+
} else if str[0:5] == "(^|/)" {
201+
caretSegmentStrings = append(caretSegmentStrings, str[5:])
202+
} else {
203+
matcherStrings = append(matcherStrings, str)
204+
}
205+
}
206+
207+
// Sort the strings within each group - a potential further improvement could be in simplifying within these groups
208+
sort.Strings(caretSegmentStrings)
209+
sort.Strings(caretStrings)
210+
sort.Strings(matcherStrings)
211+
212+
// Now build the collated regexp
213+
sb := &strings.Builder{}
214+
215+
// Start with group 1 - those that started with `^`
216+
sb.WriteString("(?:^(?:")
217+
sb.WriteString(caretStrings[0])
218+
for _, matcher := range caretStrings[1:] {
219+
sb.WriteString(")|(?:")
220+
sb.WriteString(matcher)
221+
}
222+
sb.WriteString("))")
223+
sb.WriteString("|")
224+
225+
// Now add group 2 - those that started with `(^|/)`
226+
sb.WriteString("(?:(?:^|/)(?:")
227+
sb.WriteString(caretSegmentStrings[0])
228+
for _, matcher := range caretSegmentStrings[1:] {
229+
sb.WriteString(")|(?:")
230+
sb.WriteString(matcher)
231+
}
232+
sb.WriteString("))")
233+
sb.WriteString("|")
234+
235+
// Finally add the rest
236+
sb.WriteString("(?:")
237+
sb.WriteString(matcherStrings[0])
238+
for _, matcher := range matcherStrings[1:] {
239+
sb.WriteString(")|(?:")
240+
sb.WriteString(matcher)
241+
}
242+
sb.WriteString(")")
243+
244+
// Compile the whole thing as the isVendorRegExp
245+
isVendorRegExp = regexp.MustCompile(sb.String())
246+
}

0 commit comments

Comments
 (0)