Skip to content

Commit 0de435b

Browse files
committed
Add MWTs to English trees if given a flag in UniversalDependenciesConverter
1 parent 4d91ec8 commit 0de435b

File tree

4 files changed

+143
-1
lines changed

4 files changed

+143
-1
lines changed

src/edu/stanford/nlp/trees/ud/CoNLLUDocumentWriter.java

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,8 @@ public String printSemanticGraph(SemanticGraph basicSg, SemanticGraph enhancedSg
4848
/* Check for multiword tokens. */
4949
if (token.containsKey(CoreAnnotations.CoNLLUTokenSpanAnnotation.class)) {
5050
printSpan(sb, token);
51+
} else if (token.containsKey(CoreAnnotations.IsFirstWordOfMWTAnnotation.class) && token.get(CoreAnnotations.IsFirstWordOfMWTAnnotation.class)) {
52+
printMWT(sb, tokenSg, token);
5153
}
5254

5355
/* Try to find main governor and additional dependencies. */
@@ -149,6 +151,30 @@ public static void printSpan(StringBuilder sb, AbstractCoreLabel token) {
149151
}
150152
}
151153

154+
public static void printMWT(StringBuilder sb, SemanticGraph graph, IndexedWord token) {
155+
int startIndex = token.index();
156+
int endIndex = startIndex;
157+
// advance endIndex until we reach the end of the sentence, the start of the next MWT,
158+
// or a word which isn't part of any MWT
159+
IndexedWord nextVertex;
160+
while ((nextVertex = graph.getNodeByIndex(endIndex+1)) != null) {
161+
if (nextVertex.containsKey(CoreAnnotations.IsFirstWordOfMWTAnnotation.class) &&
162+
nextVertex.get(CoreAnnotations.IsFirstWordOfMWTAnnotation.class)) {
163+
break;
164+
}
165+
if (!nextVertex.containsKey(CoreAnnotations.IsMultiWordTokenAnnotation.class) ||
166+
!nextVertex.get(CoreAnnotations.IsMultiWordTokenAnnotation.class)) {
167+
break;
168+
}
169+
++endIndex;
170+
}
171+
if (startIndex == endIndex) {
172+
return;
173+
}
174+
String range = String.format("%d-%d", startIndex, endIndex);
175+
sb.append(String.format("%s\t%s\t_\t_\t_\t_\t_\t_\t_\t_%n", range, token.get(CoreAnnotations.MWTTokenTextAnnotation.class)));
176+
}
177+
152178
/**
153179
* Outputs a partial CONLL-U file with token information (form, lemma, POS)
154180
* but without any dependency information.
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
package edu.stanford.nlp.trees.ud;
2+
3+
import java.util.List;
4+
5+
import edu.stanford.nlp.semgraph.SemanticGraph;
6+
import edu.stanford.nlp.semgraph.semgrex.ssurgeon.Ssurgeon;
7+
import edu.stanford.nlp.semgraph.semgrex.ssurgeon.SsurgeonPattern;
8+
import edu.stanford.nlp.util.XMLUtils;
9+
10+
public class EnglishMWTCombiner {
11+
static final String newline = System.getProperty("line.separator");
12+
13+
public SemanticGraph combineMWTs(SemanticGraph sg) {
14+
Ssurgeon inst = Ssurgeon.inst();
15+
16+
// combine using the CombineMWT operation, using the default concatenation for the MWT text
17+
String mwt = String.join(newline,
18+
"<ssurgeon-pattern-list>",
19+
" <ssurgeon-pattern>",
20+
" <uid>1</uid>",
21+
" <notes>Edit a node's MWT for common contractions</notes>",
22+
" <semgrex>" + XMLUtils.escapeXML("{}=first . {word:/(?i)'s|n't|'ll|'ve|'re|'d|s'|'m/}=second") + "</semgrex>",
23+
" <edit-list>CombineMWT -node first -node second</edit-list>",
24+
" </ssurgeon-pattern>",
25+
" <ssurgeon-pattern>",
26+
" <uid>2</uid>",
27+
" <notes>Edit a node's MWT for cannot</notes>",
28+
" <semgrex>" + XMLUtils.escapeXML("{word:/(?i)can/;after://}=first . {word:/(?i)not/}=second") + "</semgrex>",
29+
" <edit-list>CombineMWT -node first -node second</edit-list>",
30+
" </ssurgeon-pattern>",
31+
" <ssurgeon-pattern>",
32+
" <uid>3</uid>",
33+
" <notes>Edit a node's MWT for cannot</notes>",
34+
" <semgrex>" + XMLUtils.escapeXML("{word:/(?i)wan|gon/;after://}=first . {word:/(?i)na/}=second") + "</semgrex>",
35+
" <edit-list>CombineMWT -node first -node second</edit-list>",
36+
" </ssurgeon-pattern>",
37+
" <ssurgeon-pattern>",
38+
" <uid>4</uid>",
39+
" <notes>Edit a node's MWT for POS</notes>",
40+
" <semgrex>" + XMLUtils.escapeXML("{}=first . {word:/'/;cpos:PART}=second") + "</semgrex>",
41+
" <edit-list>CombineMWT -node first -node second</edit-list>",
42+
" </ssurgeon-pattern>",
43+
"</ssurgeon-pattern-list>");
44+
List<SsurgeonPattern> patterns = inst.readFromString(mwt);
45+
for (SsurgeonPattern editSsurgeon : patterns) {
46+
sg = editSsurgeon.iterate(sg).first;
47+
}
48+
return sg;
49+
}
50+
}

src/edu/stanford/nlp/trees/ud/UniversalDependenciesConverter.java

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -227,7 +227,8 @@ private static void addSpaceAfter(SemanticGraph sg, String text, int graphIdx) {
227227
* {@code -treeFile}: File with PTB-formatted constituency trees<br>
228228
* {@code -conlluFile}: File with basic dependency trees in CoNLL-U format<br>
229229
* {@code -textFile}: A file with text to be used as a guide for SpaceAfter (optional)<br>
230-
* {@code -outputRepresentation}: "basic" (default), "enhanced", or "enhanced++"
230+
* {@code -outputRepresentation}: "basic" (default), "enhanced", or "enhanced++"<br>
231+
* {@code -combineMWTs}: "False" (default) or "True"; when "True", contractions such as "it's" are marked as multi-word tokens (MWTs)
231232
*/
232233
public static void main(String[] args) {
233234
Properties props = StringUtils.argsToProperties(args);
@@ -236,6 +237,7 @@ public static void main(String[] args) {
236237
String conlluFileName = props.getProperty("conlluFile");
237238
String outputRepresentation = props.getProperty("outputRepresentation", "basic");
238239
boolean addFeatures = PropertiesUtils.getBool(props, "addFeatures", false);
240+
boolean combineMWTs = PropertiesUtils.getBool(props, "combineMWTs", false);
239241
boolean replaceLemmata = PropertiesUtils.getBool(props, "replaceLemmata", false);
240242

241243
Iterator<Pair<SemanticGraph, SemanticGraph>> sgIterator; // = null;
@@ -268,6 +270,7 @@ public static void main(String[] args) {
268270
}
269271

270272
UniversalDependenciesFeatureAnnotator featureAnnotator = (addFeatures) ? new UniversalDependenciesFeatureAnnotator() : null;
273+
EnglishMWTCombiner mwtCombiner = (combineMWTs) ? new EnglishMWTCombiner() : null;
271274

272275
CoNLLUDocumentWriter writer = new CoNLLUDocumentWriter();
273276

@@ -290,6 +293,10 @@ public static void main(String[] args) {
290293
if (featureAnnotator != null) {
291294
featureAnnotator.addFeatures(sg, tree, false, false);
292295
}
296+
297+
if (mwtCombiner != null) {
298+
sg = mwtCombiner.combineMWTs(sg);
299+
}
293300
} else {
294301
if (replaceLemmata) {
295302
replaceAllLemmata(sg);
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
package edu.stanford.nlp.trees.ud;
2+
3+
import static org.junit.Assert.*;
4+
import org.junit.Test;
5+
6+
import edu.stanford.nlp.semgraph.SemanticGraph;
7+
8+
public class EnglishMWTCombinerTest {
9+
10+
static final String newline = System.getProperty("line.separator");
11+
public static final String expectedITS = String.join(newline,
12+
"1-2 it's _ _ _ _ _ _ _ _",
13+
"1 it _ _ _ _ 4 nsubj _ SpaceAfter=No",
14+
"2 's _ _ _ _ 4 cop _ _",
15+
"3 yours _ _ _ _ 4 advmod _ _",
16+
"4 yours _ _ _ _ 0 root _ _",
17+
"5 ! _ _ _ _ 4 punct _ _");
18+
19+
public static final String expectedCANNOT = String.join(newline,
20+
"1 I _ _ _ _ 4 nsubj _ _",
21+
"2-3 CANNOT _ _ _ _ _ _ _ _",
22+
"2 CAN _ _ _ _ 4 aux _ SpaceAfter=No",
23+
"3 NOT _ _ _ _ 4 advmod _ _",
24+
"4 believe _ _ _ _ 0 root _ _",
25+
"5 it _ _ _ _ 4 obj _ _",
26+
"6 ! _ _ _ _ 4 punct _ _");
27+
28+
public static final String expectedWANNA = String.join(newline,
29+
"1 I _ _ _ _ 2 nsubj _ _",
30+
"2-3 wanna _ _ _ _ _ _ _ _",
31+
"2 wan _ _ _ _ 0 root _ SpaceAfter=No",
32+
"3 na _ _ _ _ 4 mark _ _",
33+
"4 fix _ _ _ _ 2 xcomp _ _",
34+
"5 this _ _ _ _ 4 obj _ _");
35+
36+
@Test
37+
public void testMWT() {
38+
CoNLLUDocumentWriter writer = new CoNLLUDocumentWriter();
39+
EnglishMWTCombiner combiner = new EnglishMWTCombiner();
40+
41+
SemanticGraph sg = SemanticGraph.valueOf("[yours-4 nsubj> it-1 cop> 's-2 advmod> yours-3 punct> !-5]");
42+
sg.getNodeByIndexSafe(1).setAfter("");
43+
sg = combiner.combineMWTs(sg);
44+
String result = writer.printSemanticGraph(sg);
45+
assertEquals(expectedITS, result.trim());
46+
47+
sg = SemanticGraph.valueOf("[believe-4 nsubj> I-1 aux> CAN-2 advmod> NOT-3 obj> it-5 punct> !-6]");
48+
sg.getNodeByIndexSafe(2).setAfter("");
49+
sg = combiner.combineMWTs(sg);
50+
result = writer.printSemanticGraph(sg);
51+
assertEquals(expectedCANNOT, result.trim());
52+
53+
sg = SemanticGraph.valueOf("[wan-2 nsubj> I-1 xcomp> [fix-4 mark> na-3 obj> this-5]]");
54+
sg.getNodeByIndexSafe(2).setAfter("");
55+
sg = combiner.combineMWTs(sg);
56+
result = writer.printSemanticGraph(sg);
57+
assertEquals(expectedWANNA, result.trim());
58+
}
59+
}

0 commit comments

Comments
 (0)