@@ -7,16 +7,18 @@ package csv
7
7
import (
8
8
"bytes"
9
9
stdcsv "encoding/csv"
10
- "errors"
11
10
"io"
11
+ "path/filepath"
12
12
"regexp"
13
13
"strings"
14
14
15
+ "code.gitea.io/gitea/modules/markup"
15
16
"code.gitea.io/gitea/modules/translation"
16
17
"code.gitea.io/gitea/modules/util"
17
18
)
18
19
19
- var quoteRegexp = regexp .MustCompile (`["'][\s\S]+?["']` )
20
const (
	// maxLines is the maximum number of lines of the sample that guessDelimiter parses.
	maxLines = 10
	// guessSampleSize is the number of bytes (10k) read from the input to detect the delimiter.
	guessSampleSize = 1e4
)
20
22
21
23
// CreateReader creates a csv.Reader with the given delimiter.
22
24
func CreateReader (input io.Reader , delimiter rune ) * stdcsv.Reader {
@@ -30,70 +32,95 @@ func CreateReader(input io.Reader, delimiter rune) *stdcsv.Reader {
30
32
return rd
31
33
}
32
34
33
- // CreateReaderAndGuessDelimiter tries to guess the field delimiter from the content and creates a csv.Reader.
34
- // Reads at most 10k bytes.
35
- func CreateReaderAndGuessDelimiter ( rd io.Reader ) (* stdcsv.Reader , error ) {
36
- var data = make ([]byte , 1e4 )
35
+ // CreateReaderAndDetermineDelimiter tries to guess the field delimiter from the content and creates a csv.Reader.
36
+ // Reads at most guessSampleSize bytes.
37
+ func CreateReaderAndDetermineDelimiter ( ctx * markup. RenderContext , rd io.Reader ) (* stdcsv.Reader , error ) {
38
+ var data = make ([]byte , guessSampleSize )
37
39
size , err := util .ReadAtMost (rd , data )
38
40
if err != nil {
39
41
return nil , err
40
42
}
41
43
42
44
return CreateReader (
43
45
io .MultiReader (bytes .NewReader (data [:size ]), rd ),
44
- guessDelimiter ( data [:size ]),
46
+ determineDelimiter ( ctx , data [:size ]),
45
47
), nil
46
48
}
47
49
48
- // guessDelimiter scores the input CSV data against delimiters, and returns the best match.
49
- func guessDelimiter ( data [] byte ) rune {
50
- maxLines := 10
51
- text := quoteRegexp . ReplaceAllLiteralString ( string ( data ), "" )
52
- lines := strings . SplitN ( text , " \n " , maxLines + 1 )
53
- lines = lines [: util . Min ( maxLines , len ( lines ))]
54
-
55
- delimiters := [] rune { ',' , ';' , '\t' , '|' , '@' }
56
- bestDelim := delimiters [ 0 ]
57
- bestScore := 0.0
58
- for _ , delim := range delimiters {
59
- score := scoreDelimiter ( lines , delim )
60
- if score > bestScore {
61
- bestScore = score
62
- bestDelim = delim
63
- }
50
+ // determineDelimiter takes a RenderContext and if it isn't nil and the Filename has an extension that specifies the delimiter,
51
+ // it is used as the delimiter. Otherwise we call guessDelimiter with the data passed
52
+ func determineDelimiter ( ctx * markup. RenderContext , data [] byte ) rune {
53
+ extension := ".csv"
54
+ if ctx != nil {
55
+ extension = strings . ToLower ( filepath . Ext ( ctx . Filename ))
56
+ }
57
+
58
+ var delimiter rune
59
+ switch extension {
60
+ case ".tsv" :
61
+ delimiter = '\t'
62
+ case ".psv" :
63
+ delimiter = '|'
64
+ default :
65
+ delimiter = guessDelimiter ( data )
64
66
}
65
67
66
- return bestDelim
68
+ return delimiter
67
69
}
68
70
69
- // scoreDelimiter uses a count & regularity metric to evaluate a delimiter against lines of CSV.
70
- func scoreDelimiter (lines []string , delim rune ) float64 {
71
- countTotal := 0
72
- countLineMax := 0
73
- linesNotEqual := 0
71
// quoteRegexp matches a double-quoted segment: an opening double quote, any run of
// characters other than a double quote (newlines included), and the closing double quote.
// Under RFC 4180 (https://www.ietf.org/rfc/rfc4180.txt) a double quote inside a quoted
// field is escaped by doubling it, so a field such as "a ""b"" c" is matched as a
// series of adjacent segments and is therefore still covered in full.
var quoteRegexp = regexp.MustCompile(`"[^"]*"`)

// removeQuotedString removes every segment matched by quoteRegexp from text, so that
// line breaks inside quoted fields no longer spread a row over several lines.
// Text without a complete pair of double quotes is returned unchanged.
func removeQuotedString(text string) string {
	return quoteRegexp.ReplaceAllLiteralString(text, "")
}
79
81
80
- countLine := strings .Count (line , string (delim ))
81
- countTotal += countLine
82
- if countLine != countLineMax {
83
- if countLineMax != 0 {
84
- linesNotEqual ++
85
- }
86
- countLineMax = util .Max (countLine , countLineMax )
87
- }
82
+ // guessDelimiter takes up to maxLines of the CSV text, iterates through the possible delimiters, and sees if the CSV Reader reads it without throwing any errors.
83
+ // If more than one delimiter passes, the delimiter that results in the most columns is returned.
84
+ func guessDelimiter (data []byte ) rune {
85
+ delimiter := guessFromBeforeAfterQuotes (data )
86
+ if delimiter != 0 {
87
+ return delimiter
88
88
}
89
89
90
- return float64 (countTotal ) * (1 - float64 (linesNotEqual )/ float64 (len (lines )))
90
+ // Removes quoted values so we don't have columns with new lines in them
91
+ text := removeQuotedString (string (data ))
92
+
93
+ // Make the text just be maxLines or less, ignoring truncated lines
94
+ lines := strings .SplitN (text , "\n " , maxLines + 1 ) // Will contain at least one line, and if there are more than MaxLines, the last item holds the rest of the lines
95
+ if len (lines ) > maxLines {
96
+ // If the length of lines is > maxLines we know we have the max number of lines, trim it to maxLines
97
+ lines = lines [:maxLines ]
98
+ } else if len (lines ) > 1 && len (data ) >= guessSampleSize {
99
+ // Even with data >= guessSampleSize, we don't have maxLines + 1 (no extra lines, must have really long lines)
100
+ // thus the last line is probably have a truncated line. Drop the last line if len(lines) > 1
101
+ lines = lines [:len (lines )- 1 ]
102
+ }
103
+
104
+ // Put lines back together as a string
105
+ text = strings .Join (lines , "\n " )
106
+
107
+ delimiters := []rune {',' , '\t' , ';' , '|' , '@' }
108
+ validDelim := delimiters [0 ]
109
+ validDelimColCount := 0
110
+ for _ , delim := range delimiters {
111
+ csvReader := stdcsv .NewReader (strings .NewReader (text ))
112
+ csvReader .Comma = delim
113
+ if rows , err := csvReader .ReadAll (); err == nil && len (rows ) > 0 && len (rows [0 ]) > validDelimColCount {
114
+ validDelim = delim
115
+ validDelimColCount = len (rows [0 ])
116
+ }
117
+ }
118
+ return validDelim
91
119
}
92
120
93
121
// FormatError converts csv errors into readable messages.
94
122
func FormatError (err error , locale translation.Locale ) (string , error ) {
95
- var perr * stdcsv.ParseError
96
- if errors .As (err , & perr ) {
123
+ if perr , ok := err .(* stdcsv.ParseError ); ok {
97
124
if perr .Err == stdcsv .ErrFieldCount {
98
125
return locale .Tr ("repo.error.csv.invalid_field_count" , perr .Line ), nil
99
126
}
@@ -102,3 +129,20 @@ func FormatError(err error, locale translation.Locale) (string, error) {
102
129
103
130
return "" , err
104
131
}
132
+
133
// beforeAfterQuotes matches one or more adjacent double-quoted strings. It captures an
// optional candidate delimiter (one of , @ tab ; |) before them, where spaces may sit
// between that delimiter and the opening quote, and an optional candidate delimiter
// immediately after the closing quote.
var beforeAfterQuotes = regexp.MustCompile(`([,@\t;|]{0,1}) *(?:"[^"]*")+([,@\t;|]{0,1})`)

// guessFromBeforeAfterQuotes guesses the delimiter from the first quoted string in data.
// It returns the candidate delimiter found before that quoted string if there is one,
// otherwise the candidate delimiter found right after it. It returns 0 when data has no
// quoted string or when the first quoted string has no candidate delimiter on either side.
func guessFromBeforeAfterQuotes(data []byte) rune {
	rs := beforeAfterQuotes.FindSubmatch(data) // first match only, or nil if none
	switch {
	case rs == nil:
		return 0 // no quoted string found
	case len(rs[1]) > 0:
		return rune(rs[1][0]) // delimiter found left of the quoted string
	case len(rs[2]) > 0:
		return rune(rs[2][0]) // delimiter found right of the quoted string
	}
	return 0 // quoted string found, but no delimiter next to it
}
0 commit comments