Skip to content

Commit 7ea39e3

Browse files
committed
fix(parsing): improve regex patterns for DevInParser and DevInLexer to support more complex identifiers and whitespace handling #101
This commit addresses issues with the DevInParser and DevInLexer in the exts/devin-lang module. It introduces more flexible regex patterns: in addition to identifiers, a new TEXT_SEGMENT token lets free text that does not start with '$', '@', or '/' be tokenized. Additionally, the commit refactors the lexer to handle whitespace more efficiently, returning the dedicated WHITE_SPACE token type for runs of whitespace. The parser definition is updated to create PSI elements through the generated DevInTypes factory, and the build script is modified to reflect the new output directory of the generated lexer class. Finally, a test file is updated to demonstrate the new parsing capabilities.
1 parent b2bfd1b commit 7ea39e3

File tree

6 files changed

+46
-23
lines changed

6 files changed

+46
-23
lines changed

build.gradle.kts

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -600,15 +600,12 @@ project(":exts:devin-lang") {
600600
tasks {
601601
generateLexer {
602602
sourceFile.set(file("src/grammar/DevInLexer.flex"))
603-
// targetDir.set("src/gen/com/feakin/intellij/lexer")
604-
targetOutputDir.set(file("src/gen/cc/unitmesh/language/lexer"))
605-
// targetClass.set("_FeakinLexer")
603+
targetOutputDir.set(file("src/gen/cc/unitmesh/language"))
606604
purgeOldFiles.set(true)
607605
}
608606

609607
generateParser {
610608
sourceFile.set(file("src/grammar/DevInParser.bnf"))
611-
// targetRoot.set("src/gen")
612609
targetRootOutputDir.set(file("src/gen"))
613610
pathToParser.set("cc/unitmesh/language/parser/DevInParser.java")
614611
pathToPsiRoot.set("cc/unitmesh/language/psi")
Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,8 @@
1-
// Copyright 2000-2022 JetBrains s.r.o. and other contributors. Use of this source code is governed by the Apache 2.0 license that can be found in the LICENSE file.
21
package cc.unitmesh.language;
32

43
import com.intellij.lexer.FlexLexer;
54
import com.intellij.psi.tree.IElementType;
6-
import cc.unitmesh.language.psi.DevInTypes;
5+
import static cc.unitmesh.language.psi.DevInTypes.*;
76
import com.intellij.psi.TokenType;
87

98
%%
@@ -23,20 +22,27 @@ import com.intellij.psi.TokenType;
2322
%eof{ return;
2423
%eof}
2524

26-
CRLF=\R
27-
WHITE_SPACE=[\ \n\t\f]
28-
// $ variable
29-
STRING=\"([^\\\"\r\n]|\\[^\r\n])*\"?
30-
IDENTIFIER=[_a-zA-Z][_a-zA-Z0-9]*
25+
EOL=\R
26+
WHITE_SPACE=\s+
3127

32-
%state WAITING_VALUE
28+
IDENTIFIER=[_a-zA-Z][_a-zA-Z0-9]*
29+
TEXT_SEGMENT=[^[@\\$]_a-zA-Z0-9]+
30+
WS=[ \t\n\x0B\f\r]
31+
NEWLINE=\n|\r\n
3332

3433
%%
3534
<YYINITIAL> {
36-
{STRING} { return DevInTypes.STRING; }
37-
{IDENTIFIER} { return DevInTypes.IDENTIFIER; }
38-
}
35+
{WHITE_SPACE} { return TokenType.WHITE_SPACE; }
3936

40-
({CRLF}|{WHITE_SPACE})+ { yybegin(YYINITIAL); return TokenType.WHITE_SPACE; }
37+
"$" { return DOLLAR; }
38+
"@" { return AT; }
39+
"/" { return SLASH; }
40+
41+
{IDENTIFIER} { return IDENTIFIER; }
42+
{TEXT_SEGMENT} { return TEXT_SEGMENT; }
43+
{WS} { return WS; }
44+
{NEWLINE} { return NEWLINE; }
45+
46+
}
4147

4248
[^] { return TokenType.BAD_CHARACTER; }

exts/devin-lang/src/grammar/DevInParser.bnf

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,18 +18,25 @@
1818
DOLLAR = '$'
1919
AT = '@'
2020
SLASH = '/'
21+
// A text segment is any run of characters that does not start with '$', '@', or '/'.
22+
IDENTIFIER = 'regexp:[_a-zA-Z][_a-zA-Z0-9]*'
23+
TEXT_SEGMENT = 'regexp:[^[@\\$]_a-zA-Z0-9]+'
24+
25+
WS = 'regexp:\s'
26+
27+
NEWLINE = 'regexp:\n|\r\n'
2128
]
2229
}
2330

24-
DevInFile ::= item_*
31+
DevInFile ::= item*
2532

26-
item_ ::= (useVariable|useAgent|useCommand|STRING)
33+
private item ::= (useVariable|useAgent|useCommand|TEXT_SEGMENT|NEWLINE)
2734

28-
useVariable ::= '$' IDENTIFIER
2935
// $use-variable
36+
useVariable ::= DOLLAR IDENTIFIER WS*
3037

3138
// @use-agent
32-
useAgent ::= '@' IDENTIFIER
39+
useAgent ::= AT IDENTIFIER WS*
3340

3441
// /use-command
35-
useCommand ::= '/' IDENTIFIER
42+
useCommand ::= SLASH IDENTIFIER WS*

exts/devin-lang/src/main/kotlin/cc/unitmesh/language/DevInParserDefinition.kt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package cc.unitmesh.language
22

33
import cc.unitmesh.language.parser.DevInParser
44
import cc.unitmesh.language.psi.DevInFile
5+
import cc.unitmesh.language.psi.DevInTypes
56
import com.intellij.lang.ASTNode
67
import com.intellij.lang.ParserDefinition
78
import com.intellij.lang.PsiParser
@@ -44,7 +45,7 @@ internal class DevInParserDefinition : ParserDefinition {
4445

4546
@NotNull
4647
override fun createElement(node: ASTNode?): PsiElement {
47-
TODO()
48+
return DevInTypes.Factory.createElement(node)
4849
}
4950

5051
companion object {
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
Gen hello, world @gen
1+
@gen what's this?
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
FILE
2+
DevInUseAgentImpl(USE_AGENT)
3+
PsiElement(DevInTokenType.@)('@')
4+
PsiElement(DevInTokenType.IDENTIFIER)('gen')
5+
PsiWhiteSpace(' ')
6+
PsiErrorElement:DevInTokenType.$, DevInTokenType./, DevInTokenType.@, DevInTokenType.NEWLINE, DevInTokenType.TEXT_SEGMENT or DevInTokenType.WS expected, got 'what'
7+
PsiElement(DevInTokenType.IDENTIFIER)('what')
8+
PsiElement(DevInTokenType.TEXT_SEGMENT)(''')
9+
PsiElement(DevInTokenType.IDENTIFIER)('s')
10+
PsiWhiteSpace(' ')
11+
PsiElement(DevInTokenType.IDENTIFIER)('this')
12+
PsiElement(DevInTokenType.TEXT_SEGMENT)('?')

0 commit comments

Comments
 (0)