@@ -3,6 +3,8 @@ package enry
3
3
import (
4
4
"bytes"
5
5
"path/filepath"
6
+ "regexp"
7
+ "sort"
6
8
"strings"
7
9
8
10
"github.com/go-enry/go-enry/v2/data"
@@ -61,9 +63,11 @@ func IsDotFile(path string) bool {
61
63
return strings .HasPrefix (base , "." ) && base != "."
62
64
}
63
65
66
// isVendorRegExp is the compiled expression that IsVendor tests paths
// against. It is nil until this package's init function assigns it.
var isVendorRegExp *regexp.Regexp

// IsVendor reports whether path is a vendor path, i.e. whether it is matched
// by isVendorRegExp.
func IsVendor(path string) bool {
	isMatch := isVendorRegExp.MatchString(path)
	return isMatch
}
68
72
69
73
// IsTest returns whether or not path is a test path.
@@ -131,3 +135,112 @@ func IsGenerated(path string, content []byte) bool {
131
135
132
136
return false
133
137
}
138
+
139
+ func init () {
140
+ // We now collate the individual regexps that make up the VendorMatchers to
141
+ // produce a single large regexp which is around twice as fast to test than
142
+ // simply iterating through all the regexps or naïvely collating the
143
+ // regexps.
144
+ //
145
+ // ---
146
+ //
147
+ // data.VendorMatchers here is a slice containing individual regexps that
148
+ // match a vendor file therefore if we want to test if a filename is a
149
+ // Vendor we need to test whether that filename matches one or more of
150
+ // those regexps.
151
+ //
152
+ // Now we could test each matcher in turn using a shortcircuiting test i.e.
153
+ //
154
+ // func IsVendor(filename string) bool {
155
+ // for _, matcher := range data.VendorMatchers {
156
+ // if matcher.Match(filename) {
157
+ // return true
158
+ // }
159
+ // }
160
+ // return false
161
+ // }
162
+ //
163
+ // Or concatentate all these regexps using groups i.e.
164
+ //
165
+ // `(regexp1)|(regexp2)|(regexp3)|...`
166
+ //
167
+ // However both of these are relatively slow and they don't take advantage
168
+ // of the inherent structure within our regexps...
169
+ //
170
+ // If we look at our regexps there are essentially three types of regexp:
171
+ //
172
+ // 1. Those that start with `^`
173
+ // 2. Those that start with `(^|/)`
174
+ // 3. Others
175
+ //
176
+ // If we collate our regexps into these groups that will significantly
177
+ // reduce the likelihood of backtracking within the regexp trie matcher.
178
+ //
179
+ // A further improvement is to use non-capturing groups as otherwise the
180
+ // regexp parser, whilst matching, will have to allocate slices for
181
+ // matching positions. (A future improvement here could be in the use of
182
+ // enforcing non-capturing groups within the sub-regexps too.)
183
+ //
184
+ // Finally if we sort the segments we can help the matcher build a more
185
+ // efficient matcher and trie.
186
+
187
+ // alias the VendorMatchers to simplify things
188
+ matchers := data .VendorMatchers
189
+
190
+ // Create three temporary string slices for our three groups above - prefixes removed
191
+ caretStrings := make ([]string , 0 , 10 )
192
+ caretSegmentStrings := make ([]string , 0 , 10 )
193
+ matcherStrings := make ([]string , 0 , len (matchers ))
194
+
195
+ // Walk the matchers and check their string representation for each group prefix, remove it and add to the respective group slices
196
+ for _ , matcher := range matchers {
197
+ str := matcher .String ()
198
+ if str [0 ] == '^' {
199
+ caretStrings = append (caretStrings , str [1 :])
200
+ } else if str [0 :5 ] == "(^|/)" {
201
+ caretSegmentStrings = append (caretSegmentStrings , str [5 :])
202
+ } else {
203
+ matcherStrings = append (matcherStrings , str )
204
+ }
205
+ }
206
+
207
+ // Sort the strings within each group - a potential further improvement could be in simplifying within these groups
208
+ sort .Strings (caretSegmentStrings )
209
+ sort .Strings (caretStrings )
210
+ sort .Strings (matcherStrings )
211
+
212
+ // Now build the collated regexp
213
+ sb := & strings.Builder {}
214
+
215
+ // Start with group 1 - those that started with `^`
216
+ sb .WriteString ("(?:^(?:" )
217
+ sb .WriteString (caretStrings [0 ])
218
+ for _ , matcher := range caretStrings [1 :] {
219
+ sb .WriteString (")|(?:" )
220
+ sb .WriteString (matcher )
221
+ }
222
+ sb .WriteString ("))" )
223
+ sb .WriteString ("|" )
224
+
225
+ // Now add group 2 - those that started with `(^|/)`
226
+ sb .WriteString ("(?:(?:^|/)(?:" )
227
+ sb .WriteString (caretSegmentStrings [0 ])
228
+ for _ , matcher := range caretSegmentStrings [1 :] {
229
+ sb .WriteString (")|(?:" )
230
+ sb .WriteString (matcher )
231
+ }
232
+ sb .WriteString ("))" )
233
+ sb .WriteString ("|" )
234
+
235
+ // Finally add the rest
236
+ sb .WriteString ("(?:" )
237
+ sb .WriteString (matcherStrings [0 ])
238
+ for _ , matcher := range matcherStrings [1 :] {
239
+ sb .WriteString (")|(?:" )
240
+ sb .WriteString (matcher )
241
+ }
242
+ sb .WriteString (")" )
243
+
244
+ // Compile the whole thing as the isVendorRegExp
245
+ isVendorRegExp = regexp .MustCompile (sb .String ())
246
+ }
0 commit comments