Skip to content

Commit a1ad188

Browse files
Fix chardet test and add ordering option (#11621)
* Fix chardet test and add ordering option Signed-off-by: Andrew Thornton <[email protected]> * minor fixes Signed-off-by: Andrew Thornton <[email protected]> * remove log Signed-off-by: Andrew Thornton <[email protected]> * remove log2 Signed-off-by: Andrew Thornton <[email protected]> * only iterate through top results Signed-off-by: Andrew Thornton <[email protected]> * Update docs/content/doc/advanced/config-cheat-sheet.en-us.md * slight restructure of for loop Signed-off-by: Andrew Thornton <[email protected]> Co-authored-by: techknowlogick <[email protected]>
1 parent fe2cacf commit a1ad188

File tree

5 files changed

+117
-6
lines changed

5 files changed

+117
-6
lines changed

custom/conf/app.ini.sample

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,12 @@ RUN_MODE = dev
1414
[repository]
1515
ROOT =
1616
SCRIPT_TYPE = bash
17-
; Default ANSI charset
17+
; DETECTED_CHARSETS_ORDER tie-break order for detected charsets.
18+
; If the charsets have equal confidence, tie-breaking will be done by order in this list
19+
; with charsets earlier in the list chosen in preference to those later.
20+
; Adding "defaults" will place the unused charsets at that position.
21+
DETECTED_CHARSETS_ORDER=UTF-8, UTF-16BE, UTF-16LE, UTF-32BE, UTF-32LE, ISO-8859-1, windows-1252, ISO-8859-2, windows-1250, ISO-8859-5, ISO-8859-6, ISO-8859-7, windows-1253, ISO-8859-8-I, windows-1255, ISO-8859-8, windows-1251, windows-1256, KOI8-R, ISO-8859-9, windows-1254, Shift_JIS, GB18030, EUC-JP, EUC-KR, Big5, ISO-2022-JP, ISO-2022-KR, ISO-2022-CN, IBM424_rtl, IBM424_ltr, IBM420_rtl, IBM420_ltr
22+
; Default ANSI charset to override non-UTF-8 charsets to
1823
ANSI_CHARSET =
1924
; Force every new repository to be private
2025
FORCE_PRIVATE = false

docs/content/doc/advanced/config-cheat-sheet.en-us.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,8 @@ Values containing `#` or `;` must be quoted using `` ` `` or `"""`.
4646
an absolute path.
4747
- `SCRIPT_TYPE`: **bash**: The script type this server supports. Usually this is `bash`,
4848
but some users report that only `sh` is available.
49-
- `ANSI_CHARSET`: **\<empty\>**: The default charset for an unrecognized charset.
49+
- `DETECTED_CHARSETS_ORDER`: **UTF-8, UTF-16BE, UTF-16LE, UTF-32BE, UTF-32LE, ISO-8859-1, windows-1252, ISO-8859-2, windows-1250, ISO-8859-5, ISO-8859-6, ISO-8859-7, windows-1253, ISO-8859-8-I, windows-1255, ISO-8859-8, windows-1251, windows-1256, KOI8-R, ISO-8859-9, windows-1254, Shift_JIS, GB18030, EUC-JP, EUC-KR, Big5, ISO-2022-JP, ISO-2022-KR, ISO-2022-CN, IBM424_rtl, IBM424_ltr, IBM420_rtl, IBM420_ltr**: Tie-break order of detected charsets - if the detected charsets have equal confidence, charsets earlier in the list will be chosen in preference to those later. Adding `defaults` will place the unnamed charsets at that point.
50+
- `ANSI_CHARSET`: **\<empty\>**: Default ANSI charset to override non-UTF-8 charsets to.
5051
- `FORCE_PRIVATE`: **false**: Force every new repository to be private.
5152
- `DEFAULT_PRIVATE`: **last**: Default private when creating a new repository.
5253
\[last, private, public\]

modules/charset/charset.go

Lines changed: 31 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ package charset
77
import (
88
"bytes"
99
"fmt"
10+
"strings"
1011
"unicode/utf8"
1112

1213
"code.gitea.io/gitea/modules/log"
@@ -137,16 +138,42 @@ func DetectEncoding(content []byte) (string, error) {
137138
} else {
138139
detectContent = content
139140
}
140-
result, err := textDetector.DetectBest(detectContent)
141+
142+
// Now we can't use DetectBest or just results[0] because the result isn't stable - so we need a tie break
143+
results, err := textDetector.DetectAll(detectContent)
141144
if err != nil {
145+
if err == chardet.NotDetectedError && len(setting.Repository.AnsiCharset) > 0 {
146+
log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset)
147+
return setting.Repository.AnsiCharset, nil
148+
}
142149
return "", err
143150
}
151+
152+
topConfidence := results[0].Confidence
153+
topResult := results[0]
154+
priority, has := setting.Repository.DetectedCharsetScore[strings.ToLower(strings.TrimSpace(topResult.Charset))]
155+
for _, result := range results {
156+
// As results are sorted in confidence order - if we have a different confidence
157+
// we know it's less than the current confidence and can break out of the loop early
158+
if result.Confidence != topConfidence {
159+
break
160+
}
161+
162+
// Otherwise check if this result is earlier in the DetectedCharsetsOrder than our current top guess
163+
resultPriority, resultHas := setting.Repository.DetectedCharsetScore[strings.ToLower(strings.TrimSpace(result.Charset))]
164+
if resultHas && (!has || resultPriority < priority) {
165+
topResult = result
166+
priority = resultPriority
167+
has = true
168+
}
169+
}
170+
144171
// FIXME: to properly decouple this function the fallback ANSI charset should be passed as an argument
145-
if result.Charset != "UTF-8" && len(setting.Repository.AnsiCharset) > 0 {
172+
if topResult.Charset != "UTF-8" && len(setting.Repository.AnsiCharset) > 0 {
146173
log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset)
147174
return setting.Repository.AnsiCharset, err
148175
}
149176

150-
log.Debug("Detected encoding: %s", result.Charset)
151-
return result.Charset, err
177+
log.Debug("Detected encoding: %s", topResult.Charset)
178+
return topResult.Charset, err
152179
}

modules/charset/charset_test.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,7 +230,11 @@ func TestDetectEncoding(t *testing.T) {
230230
// we accept either.
231231
assert.Contains(t, encoding, "ISO-8859")
232232

233+
old := setting.Repository.AnsiCharset
233234
setting.Repository.AnsiCharset = "placeholder"
235+
defer func() {
236+
setting.Repository.AnsiCharset = old
237+
}()
234238
testSuccess(b, "placeholder")
235239

236240
// invalid bytes

modules/setting/repository.go

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ const (
2424
// Repository settings
2525
var (
2626
Repository = struct {
27+
DetectedCharsetsOrder []string
28+
DetectedCharsetScore map[string]int `ini:"-"`
2729
AnsiCharset string
2830
ForcePrivate bool
2931
DefaultPrivate string
@@ -88,6 +90,42 @@ var (
8890
Uncyclo []string
8991
} `ini:"repository.signing"`
9092
}{
93+
DetectedCharsetsOrder: []string{
94+
"UTF-8",
95+
"UTF-16BE",
96+
"UTF-16LE",
97+
"UTF-32BE",
98+
"UTF-32LE",
99+
"ISO-8859-1",
100+
"windows-1252",
101+
"ISO-8859-2",
102+
"windows-1250",
103+
"ISO-8859-5",
104+
"ISO-8859-6",
105+
"ISO-8859-7",
106+
"windows-1253",
107+
"ISO-8859-8-I",
108+
"windows-1255",
109+
"ISO-8859-8",
110+
"windows-1251",
111+
"windows-1256",
112+
"KOI8-R",
113+
"ISO-8859-9",
114+
"windows-1254",
115+
"Shift_JIS",
116+
"GB18030",
117+
"EUC-JP",
118+
"EUC-KR",
119+
"Big5",
120+
"ISO-2022-JP",
121+
"ISO-2022-KR",
122+
"ISO-2022-CN",
123+
"IBM424_rtl",
124+
"IBM424_ltr",
125+
"IBM420_rtl",
126+
"IBM420_ltr",
127+
},
128+
DetectedCharsetScore: map[string]int{},
91129
AnsiCharset: "",
92130
ForcePrivate: false,
93131
DefaultPrivate: RepoCreatingLastUserVisibility,
@@ -208,6 +246,10 @@ func newRepository() {
208246
} else {
209247
RepoRootPath = filepath.Clean(RepoRootPath)
210248
}
249+
defaultDetectedCharsetsOrder := make([]string, 0, len(Repository.DetectedCharsetsOrder))
250+
for _, charset := range Repository.DetectedCharsetsOrder {
251+
defaultDetectedCharsetsOrder = append(defaultDetectedCharsetsOrder, strings.ToLower(strings.TrimSpace(charset)))
252+
}
211253
ScriptType = sec.Key("SCRIPT_TYPE").MustString("bash")
212254

213255
if err = Cfg.Section("repository").MapTo(&Repository); err != nil {
@@ -222,6 +264,38 @@ func newRepository() {
222264
log.Fatal("Failed to map Repository.PullRequest settings: %v", err)
223265
}
224266

267+
preferred := make([]string, 0, len(Repository.DetectedCharsetsOrder))
268+
for _, charset := range Repository.DetectedCharsetsOrder {
269+
canonicalCharset := strings.ToLower(strings.TrimSpace(charset))
270+
preferred = append(preferred, canonicalCharset)
271+
// remove it from the defaults
272+
for i, charset := range defaultDetectedCharsetsOrder {
273+
if charset == canonicalCharset {
274+
defaultDetectedCharsetsOrder = append(defaultDetectedCharsetsOrder[:i], defaultDetectedCharsetsOrder[i+1:]...)
275+
break
276+
}
277+
}
278+
}
279+
280+
i := 0
281+
for _, charset := range preferred {
282+
// Add the defaults
283+
if charset == "defaults" {
284+
for _, charset := range defaultDetectedCharsetsOrder {
285+
canonicalCharset := strings.ToLower(strings.TrimSpace(charset))
286+
if _, has := Repository.DetectedCharsetScore[canonicalCharset]; !has {
287+
Repository.DetectedCharsetScore[canonicalCharset] = i
288+
i++
289+
}
290+
}
291+
continue
292+
}
293+
if _, has := Repository.DetectedCharsetScore[charset]; !has {
294+
Repository.DetectedCharsetScore[charset] = i
295+
i++
296+
}
297+
}
298+
225299
if !filepath.IsAbs(Repository.Upload.TempPath) {
226300
Repository.Upload.TempPath = path.Join(AppWorkPath, Repository.Upload.TempPath)
227301
}

0 commit comments

Comments
 (0)