Skip to content

Commit ecfa89a

Browse files
fix: Try non-English suffix endings on word breaks (#6066)
Co-authored-by: street-side-software-automation[bot] <74785433+street-side-software-automation[bot]@users.noreply.github.com>
1 parent ec89e83 commit ecfa89a

File tree

14 files changed

+181
-66
lines changed

14 files changed

+181
-66
lines changed

packages/cspell-lib/api/api.d.ts

Lines changed: 33 additions & 36 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

packages/cspell-lib/src/lib/__snapshots__/index.test.ts.snap

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -127,10 +127,6 @@ exports[`Validate the cspell API > Verify API exports 1`] = `
127127
"SuggestionError": [Function],
128128
"SuggestionResult": undefined,
129129
"Text": {
130-
"__testing__": {
131-
"regExWords": /\\\\p\\{L\\}\\\\p\\{M\\}\\?\\(\\?:\\(\\?:\\\\\\\\\\?\\['\\]\\)\\?\\\\p\\{L\\}\\\\p\\{M\\}\\?\\)\\*/gu,
132-
"regExWordsAndDigits": /\\[\\\\p\\{L\\}\\\\w'\`\\.\\+-\\]\\(\\?:\\(\\?:\\\\\\\\\\(\\?=\\['\\]\\)\\)\\?\\[\\\\p\\{L\\}\\\\p\\{M\\}\\\\w'\`\\.\\+-\\]\\)\\*/gu,
133-
},
134130
"calculateTextDocumentOffsets": [Function],
135131
"camelToSnake": [Function],
136132
"cleanText": [Function],

packages/cspell-lib/src/lib/index.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ export type { TraceOptions, TraceResult, TraceWordResult } from './trace.js';
8383
export { traceWords, traceWordsAsync } from './trace.js';
8484
export { getLogger, Logger, setLogger } from './util/logger.js';
8585
export { resolveFile } from './util/resolveFile.js';
86-
export * as Text from './util/text.js';
86+
export * as Text from './util/textApi.js';
8787
export {
8888
checkText,
8989
checkTextDocument,

packages/cspell-lib/src/lib/textValidation/lineValidatorFactory.ts

Lines changed: 45 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,10 @@ import * as RxPat from '../Settings/RegExpPatterns.js';
1010
import {
1111
extractPossibleWordsFromTextOffset,
1212
extractText,
13-
extractWordsFromCodeTextOffset,
1413
extractWordsFromTextOffset,
14+
splitWordWithOffset,
1515
} from '../util/text.js';
16+
import { regExpCamelCaseWordBreaksWithEnglishSuffix } from '../util/textRegex.js';
1617
import { split } from '../util/wordSplitter.js';
1718
import { defaultMinWordLength } from './defaultConstants.js';
1819
import { isWordValidWithEscapeRetry } from './isWordValid.js';
@@ -199,9 +200,51 @@ export function lineValidatorFactory(sDict: SpellingDictionary, options: Validat
199200
// English exceptions :-(
200201
if (isAllCapsWithTrailingCommonEnglishSuffixOk(vr)) return [];
201202

203+
if (isWordIgnored(vr.text) || checkWord(vr).isFound) {
204+
rememberFilter((_) => false)(vr);
205+
return [];
206+
}
207+
if (vr.isFlagged) return [vr];
208+
209+
const codeWordResults: ValidationIssueRO[] = checkCamelCaseWord(vr);
210+
211+
if (!codeWordResults.length) {
212+
rememberFilter((_) => false)(vr);
213+
return [];
214+
}
215+
216+
return codeWordResults;
217+
}
218+
219+
/**
220+
* Break a camel case word into its parts and check each part.
221+
*
222+
* There are two word break patterns:
223+
* - `regExpCamelCaseWordBreaks`
224+
* - `regExpCamelCaseWordBreaksWithEnglishSuffix` - the default pattern; it keeps English suffixes attached to ALL CAPS words.
225+
*
226+
* Note: See [#6066](https://github.com/streetsidesoftware/cspell/pull/6066)
227+
* Using just `regExpCamelCaseWordBreaks` misses unknown 4-letter words.
228+
*
229+
* The code below was tried, but it missed words.
230+
* - `LSTM` was caught. // cspell:disable-line
231+
* - `LSTMs` was missed because it becomes `LST` and `Ms`. // cspell:disable-line
232+
*
233+
* ```ts
234+
* const results = _checkCamelCaseWord(vr, regExpCamelCaseWordBreaks);
235+
* if (!results.length) return results;
236+
* const resultsEnglishBreaks = _checkCamelCaseWord(vr, regExpCamelCaseWordBreaksWithEnglishSuffix);
237+
* return results.length < resultsEnglishBreaks.length ? results : resultsEnglishBreaks;
238+
* ```
239+
*/
240+
function checkCamelCaseWord(vr: ValidationIssueRO): ValidationIssueRO[] {
241+
return _checkCamelCaseWord(vr, regExpCamelCaseWordBreaksWithEnglishSuffix);
242+
}
243+
244+
function _checkCamelCaseWord(vr: ValidationIssueRO, regExpWordBreaks: RegExp): ValidationIssueRO[] {
202245
const codeWordResults: ValidationIssueRO[] = [];
203246

204-
for (const wo of extractWordsFromCodeTextOffset(vr)) {
247+
for (const wo of splitWordWithOffset(vr, regExpWordBreaks)) {
205248
if (setOfKnownSuccessfulWords.has(wo.text)) continue;
206249
const issue = wo as ValidationIssue;
207250
issue.line = vr.line;
@@ -215,11 +258,6 @@ export function lineValidatorFactory(sDict: SpellingDictionary, options: Validat
215258
codeWordResults.push(issue);
216259
}
217260

218-
if (!codeWordResults.length || isWordIgnored(vr.text) || checkWord(vr).isFound) {
219-
rememberFilter((_) => false)(vr);
220-
return [];
221-
}
222-
223261
return codeWordResults;
224262
}
225263

packages/cspell-lib/src/lib/textValidation/textValidator.test.ts

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,11 +43,21 @@ describe('Validate textValidator functions', () => {
4343
});
4444

4545
test('tests trailing s, ed, ing, etc. are attached to the words', async () => {
46-
const dictEmpty = await createSpellingDictionary([], 'empty', 'test', opts());
47-
const text = 'We have PUBLISHed multiple FIXesToThePROBLEMs';
46+
const dictEmpty = createSpellingDictionary([], 'empty', 'test', opts());
47+
const text = 'We have PUBLISHed multiple FixesToThePROBLEMs';
48+
const result = [...validateText(text, dictEmpty, sToV({}))];
49+
const errors = result.map((wo) => wo.text);
50+
expect(errors).toEqual(['have', 'PUBLISHed', 'multiple', 'Fixes', 'PROBLEMs']);
51+
});
52+
53+
// cspell:ignore UI
54+
55+
test('words breaks', async () => {
56+
const dictEmpty = createSpellingDictionary(['mark', 'as', 'ready'], 'sample', 'test', opts());
57+
const text = 'markUIAsReady() ';
4858
const result = [...validateText(text, dictEmpty, sToV({}))];
4959
const errors = result.map((wo) => wo.text);
50-
expect(errors).toEqual(['have', 'PUBLISHed', 'multiple', 'FIXes', 'PROBLEMs']);
60+
expect(errors).toEqual(['UIAs']);
5161
});
5262

5363
test('tests case in ignore words', async () => {

packages/cspell-lib/src/lib/util/__snapshots__/wordSplitter.test.ts.snap

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,27 @@ exports[`Validate wordSplitter > Extract all possible word breaks to 'hello' 1`]
9191
]
9292
`;
9393

94+
exports[`Validate wordSplitter > Extract all possible word breaks to 'markUIAsReady' 1`] = `
95+
[
96+
"mark|UI|As|Ready",
97+
"mark|UI|A|Ready",
98+
"mark|UI|AsReady",
99+
"mark|UIA|s|Ready",
100+
"mark|UIA|Ready",
101+
"mark|UIA|sReady",
102+
"mark|UIAs|Ready",
103+
"mark|UIAsReady",
104+
"markUI|As|Ready",
105+
"markUI|A|Ready",
106+
"markUI|AsReady",
107+
"markUIA|s|Ready",
108+
"markUIA|Ready",
109+
"markUIA|sReady",
110+
"markUIAs|Ready",
111+
"markUIAsReady",
112+
]
113+
`;
114+
94115
exports[`Validate wordSplitter > Extract all possible word breaks to 'well-educated' 1`] = `
95116
[
96117
"well|educated",

packages/cspell-lib/src/lib/util/text.test.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ describe('Util Text', () => {
3838
${'ASCIIToUTF16'} | ${['ASCII', 'To', 'UTF16']}
3939
${'URLsAndDBAs'} | ${['URLs', 'And', 'DBAs']}
4040
${'WALKingRUNning'} | ${['WALKing', 'RUNning']}
41+
${'c0de'} | ${['c0de']}
4142
`('splitCamelCaseWord $word', ({ word, expected }) => {
4243
expect(splitCamelCaseWord(word)).toEqual(expected);
4344
});

packages/cspell-lib/src/lib/util/text.ts

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ import {
88
regExAllUpper,
99
regExFirstUpper,
1010
regExIgnoreCharacters,
11-
regExpSplitWordBreaks,
11+
regExpCamelCaseWordBreaksWithEnglishSuffix,
1212
regExWords,
1313
regExWordsAndDigits,
1414
} from './textRegex.js';
@@ -20,7 +20,7 @@ export { stringToRegExp } from './textRegex.js';
2020

2121
// CSpell:ignore ings ning gimuy tsmerge
2222

23-
export function splitCamelCaseWordWithOffset(wo: TextOffset): Array<TextOffset> {
23+
export function splitCamelCaseWordWithOffset(wo: TextOffset): TextOffset[] {
2424
return splitCamelCaseWord(wo.text).map(
2525
scanMap<string, TextOffset>((last, text) => ({ text, offset: last.offset + last.text.length }), {
2626
text: '',
@@ -33,7 +33,23 @@ export function splitCamelCaseWordWithOffset(wo: TextOffset): Array<TextOffset>
3333
* Split camelCase words into an array of strings.
3434
*/
3535
export function splitCamelCaseWord(word: string): string[] {
36-
return word.split(regExpSplitWordBreaks);
36+
return splitWord(word, regExpCamelCaseWordBreaksWithEnglishSuffix);
37+
}
38+
39+
export function splitWordWithOffset(wo: TextOffset, regExpWordBreaks: RegExp): TextOffset[] {
40+
return splitWord(wo.text, regExpWordBreaks).map(
41+
scanMap<string, TextOffset>((last, text) => ({ text, offset: last.offset + last.text.length }), {
42+
text: '',
43+
offset: wo.offset,
44+
}),
45+
);
46+
}
47+
48+
/**
49+
* Split a word into an array of strings at the breaks matched by `regExpWordBreaks`.
50+
*/
51+
export function splitWord(word: string, regExpWordBreaks: RegExp): string[] {
52+
return word.split(regExpWordBreaks);
3753
}
3854

3955
/**

0 commit comments

Comments
 (0)