Skip to content

Commit 13ede5a

Browse files
committed
Add an Ssurgeon feature which splits a word into pieces based on regex matches. One of the pieces can be specified as the head of the new pieces, along with the relation used to attach the others. Words later in the sentence are pushed down so that the indices line up.
1 parent bf8ee06 commit 13ede5a

File tree

3 files changed

+237
-0
lines changed

3 files changed

+237
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
package edu.stanford.nlp.semgraph.semgrex.ssurgeon;
2+
3+
import java.util.*;
4+
import java.util.regex.Matcher;
5+
import java.util.regex.Pattern;
6+
import java.io.*;
7+
8+
import edu.stanford.nlp.ling.IndexedWord;
9+
import edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher;
10+
import edu.stanford.nlp.semgraph.SemanticGraph;
11+
import edu.stanford.nlp.trees.GrammaticalRelation;
12+
13+
/**
14+
* Split a word into pieces based on the regex expressions provided by the -regex arguments
15+
* <br>
16+
* As an example of where this is useful, a tokenization dataset had "
17+
* stuck to each of the words. We can separate that out by using two
18+
* regex, one which matches the " in a group, one which matches the
19+
* rest of the word without the "
20+
*
21+
* @author John Bauer
22+
*/
23+
public class SplitWord extends SsurgeonEdit {
24+
public static final String LABEL = "splitWord";
25+
26+
final String node;
27+
final List<Pattern> nodeRegex;
28+
final int headIndex;
29+
final GrammaticalRelation relation;
30+
31+
public SplitWord(String node, List<String> nodeRegex, Integer headIndex, GrammaticalRelation relation) {
32+
if (node == null) {
33+
throw new SsurgeonParseException("SplitWord expected -node with the name of the matched node to split");
34+
}
35+
this.node = node;
36+
37+
if (nodeRegex == null || nodeRegex.size() == 0) {
38+
throw new SsurgeonParseException("SplitWord expected -regex with regex to determine which pieces to split the word into");
39+
}
40+
if (nodeRegex.size() == 1) {
41+
throw new SsurgeonParseException("SplitWord expected at least two -regex");
42+
}
43+
this.nodeRegex = new ArrayList<>();
44+
for (int i = 0; i < nodeRegex.size(); ++i) {
45+
this.nodeRegex.add(Pattern.compile(nodeRegex.get(i)));
46+
}
47+
48+
if (headIndex == null) {
49+
throw new SsurgeonParseException("SplitWord expected a -headIndex, 0-indexed for the word piece to use when chopping up the word");
50+
}
51+
this.headIndex = headIndex;
52+
53+
if (relation == null) {
54+
throw new SsurgeonParseException("SplitWord expected a -reln to represent the dependency to use for the new words");
55+
}
56+
this.relation = relation;
57+
}
58+
59+
@Override
60+
public String toEditString() {
61+
StringWriter buf = new StringWriter();
62+
buf.write(LABEL);
63+
buf.write("\t");
64+
buf.write("-node " + node + "\t");
65+
for (Pattern regex : nodeRegex) {
66+
buf.write("-regex " + regex + "\t");
67+
}
68+
buf.write("-reln " + relation.toString() + "\t");
69+
buf.write("-headIndex " + headIndex);
70+
return buf.toString();
71+
}
72+
73+
@Override
74+
public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) {
75+
IndexedWord matchedNode = sm.getNode(node);
76+
String origWord = matchedNode.word();
77+
78+
// first, iterate over the regex patterns we had at creation time
79+
//
80+
// each new word created will be the concatenation of all of the
81+
// matching groups from this pattern
82+
List<String> words = new ArrayList<>();
83+
for (int i = 0; i < nodeRegex.size(); ++i) {
84+
Matcher regexMatcher = nodeRegex.get(i).matcher(origWord);
85+
if (!regexMatcher.matches()) {
86+
return false;
87+
}
88+
89+
StringBuilder newWordBuilder = new StringBuilder();
90+
for (int j = 0; j < regexMatcher.groupCount(); ++j) {
91+
newWordBuilder.append(regexMatcher.group(j+1));
92+
}
93+
String newWord = newWordBuilder.toString();
94+
if (newWord.length() == 0) {
95+
return false;
96+
}
97+
words.add(newWord);
98+
}
99+
100+
int matchedIndex = matchedNode.index();
101+
102+
// at this point, we can make new words out of each of the patterns
103+
104+
// move all words down by nodeRegex.size() - 1
105+
// then move the original word down by headIndex
106+
AddDep.moveNodes(sg, sm, x -> (x > matchedIndex), x -> x+nodeRegex.size() - 1, true);
107+
// the head node has its word replaced, and its index & links need
108+
// to be rearranged, but none of the links are added or removed
109+
if (headIndex > 0) {
110+
AddDep.moveNode(sg, sm, matchedNode, matchedIndex + headIndex);
111+
}
112+
matchedNode = sm.getNode(node);
113+
matchedNode.setWord(words.get(headIndex));
114+
matchedNode.setValue(words.get(headIndex));
115+
116+
for (int i = 0; i < nodeRegex.size(); ++i) {
117+
if (i == headIndex)
118+
continue;
119+
120+
// otherwise, add a word with the appropriate index,
121+
// then connect it to matchedNode
122+
// TODO: add the ability to set more values, such as POS?
123+
IndexedWord newNode = new IndexedWord();
124+
newNode.setDocID(matchedNode.docID());
125+
newNode.setIndex(matchedIndex + i);
126+
newNode.setSentIndex(matchedNode.sentIndex());
127+
newNode.setWord(words.get(i));
128+
newNode.setValue(words.get(i));
129+
130+
sg.addVertex(newNode);
131+
sg.addEdge(matchedNode, newNode, relation, 0.0, false);
132+
}
133+
return true;
134+
}
135+
}

src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Ssurgeon.java

+24
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@
8585
* <li> {@code editNode -node node ...attributes...}
8686
* <li> {@code lemmatize -node node}
8787
* <li> {@code combineMWT -node node -word word}
88+
* <li> {@code splitWord -node node -headIndex idx -reln depType -regex w1 -regex w2 ...}
8889
* <li> {@code setRoots n1 (n2 n3 ...)}
8990
* <li> {@code mergeNodes n1 n2}
9091
* <li> {@code killAllIncomingEdges -node node}
@@ -146,6 +147,12 @@
146147
* {@code -node} (repeated) is the nodes to edit.
147148
* {@code -word} is the optional text to use for the new MWT. If not set, the words will be concatenated.
148149
*</p><p>
150+
* {@code splitWord} will split a single word into multiple pieces from the text of the current word
151+
* {@code -node} is the node to split.
152+
* {@code -headIndex} is the index (counting from 0) of the word piece to make the head.
153+
* {@code -reln} is the name of the dependency type to use. Pieces other than the head are attached to the head using this relation.
154+
* {@code -regex} (repeated) is a regex which must match the whole word of the matched node. All of its matching groups are concatenated to form one new word. At least two {@code -regex} are needed to split a word.
155+
*</p><p>
149156
* {@code setRoots} sets the roots of the sentence to a new root.
150157
* {@code n1, n2, ...} are the names of the nodes from the Semgrex to use as the root(s).
151158
* This is best done in conjunction with other operations which actually manipulate the structure
@@ -397,9 +404,12 @@ public Collection<SsurgeonWordlist> getResources() {
397404
public static final String DEP_NODENAME_ARG = "-dep";
398405
public static final String EDGE_NAME_ARG = "-edge";
399406
public static final String NODENAME_ARG = "-node";
407+
public static final String REGEX_ARG = "-regex";
400408
public static final String RELN_ARG = "-reln";
401409
public static final String NODE_PROTO_ARG = "-nodearg";
402410
public static final String WEIGHT_ARG = "-weight";
411+
public static final String HEAD_INDEX_ARG = "-headIndex";
412+
public static final String HEAD_INDEX_LOWER_ARG = "-headindex";
403413
public static final String NAME_ARG = "-name";
404414
public static final String POSITION_ARG = "-position";
405415
public static final String UPDATE_MORPHO_FEATURES = "-updateMorphoFeatures";
@@ -420,6 +430,8 @@ protected static class SsurgeonArgs {
420430

421431
public List<String> nodes = new ArrayList<>();
422432

433+
public List<String> regex = new ArrayList<>();
434+
423435
// below are string representations of the intended values
424436
public String nodeString = null;
425437

@@ -431,6 +443,8 @@ protected static class SsurgeonArgs {
431443

432444
public String updateMorphoFeatures = null;
433445

446+
public Integer headIndex = null;
447+
434448
public Map<String, String> annotations = new TreeMap<>();
435449
}
436450

@@ -489,12 +503,19 @@ private static SsurgeonArgs parseArgsBox(String args, Map<String, String> additi
489503
case NODENAME_ARG:
490504
argsBox.nodes.add(argsValue);
491505
break;
506+
case REGEX_ARG:
507+
argsBox.regex.add(argsValue);
508+
break;
492509
case NODE_PROTO_ARG:
493510
argsBox.nodeString = argsValue;
494511
break;
495512
case WEIGHT_ARG:
496513
argsBox.weight = Double.valueOf(argsValue);
497514
break;
515+
case HEAD_INDEX_ARG:
516+
case HEAD_INDEX_LOWER_ARG:
517+
argsBox.headIndex = Integer.valueOf(argsValue);
518+
break;
498519
case NAME_ARG:
499520
argsBox.name = argsValue;
500521
break;
@@ -602,6 +623,9 @@ public static SsurgeonEdit parseEditLine(String editLine, Map<String, String> at
602623
return new KillAllIncomingEdges(argsBox.nodes.get(0));
603624
} else if (command.equalsIgnoreCase(CombineMWT.LABEL)) {
604625
return new CombineMWT(argsBox.nodes, argsBox.annotations.get("word"));
626+
} else if (command.equalsIgnoreCase(SplitWord.LABEL)) {
627+
GrammaticalRelation reln = GrammaticalRelation.valueOf(language, argsBox.reln);
628+
return new SplitWord(argsBox.nodes.get(0), argsBox.regex, argsBox.headIndex, reln);
605629
}
606630
throw new SsurgeonParseException("Error in SsurgeonEdit.parseEditLine: command '"+command+"' is not supported");
607631
} catch (SsurgeonParseException e) {

test/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SsurgeonTest.java

+78
Original file line numberDiff line numberDiff line change
@@ -1954,6 +1954,84 @@ public void readXMLDeleteLeaf() {
19541954
assertEquals(newSg, expected);
19551955
}
19561956

1957+
/**
1958+
* Test splitWord, which should split a word into pieces based on regex matches, with the head at position 0
1959+
*/
1960+
@Test
1961+
public void readXMLSplitTwoWords() {
1962+
String doc = String.join(newline,
1963+
"<ssurgeon-pattern-list>",
1964+
" <ssurgeon-pattern>",
1965+
" <uid>38</uid>",
1966+
" <notes>Test splitting a word into two pieces with the head at the start</notes>",
1967+
" <language>UniversalEnglish</language>",
1968+
" <semgrex>" + XMLUtils.escapeXML("{word:/foobar/}=split") + "</semgrex>",
1969+
" <edit-list>splitWord -node split -regex ^(foo)bar$ -regex ^foo(bar)$ -reln dep -headIndex 0</edit-list>",
1970+
" </ssurgeon-pattern>",
1971+
"</ssurgeon-pattern-list>");
1972+
Ssurgeon inst = Ssurgeon.inst();
1973+
List<SsurgeonPattern> patterns = inst.readFromString(doc);
1974+
assertEquals(patterns.size(), 1);
1975+
SsurgeonPattern pattern = patterns.get(0);
1976+
1977+
SemanticGraph sg = SemanticGraph.valueOf("[example-3 det> the-1 amod> foobar-2]");
1978+
SemanticGraph newSg = pattern.iterate(sg).first;
1979+
SemanticGraph expected = SemanticGraph.valueOf("[example-4 det> the-1 amod> [foo-2 dep> bar-3]]");
1980+
assertEquals(newSg, expected);
1981+
}
1982+
1983+
/**
1984+
* Test splitWord, which should split a word into pieces based on regex matches, with the head at position 1
1985+
*/
1986+
@Test
1987+
public void readXMLSplitTwoWordsAfter() {
1988+
String doc = String.join(newline,
1989+
"<ssurgeon-pattern-list>",
1990+
" <ssurgeon-pattern>",
1991+
" <uid>38</uid>",
1992+
" <notes>Test splitting a word into two pieces with the head at the start</notes>",
1993+
" <language>UniversalEnglish</language>",
1994+
" <semgrex>" + XMLUtils.escapeXML("{word:/foobar/}=split") + "</semgrex>",
1995+
" <edit-list>splitWord -node split -regex ^(foo)bar$ -regex ^foo(bar)$ -reln dep -headIndex 1</edit-list>",
1996+
" </ssurgeon-pattern>",
1997+
"</ssurgeon-pattern-list>");
1998+
Ssurgeon inst = Ssurgeon.inst();
1999+
List<SsurgeonPattern> patterns = inst.readFromString(doc);
2000+
assertEquals(patterns.size(), 1);
2001+
SsurgeonPattern pattern = patterns.get(0);
2002+
2003+
SemanticGraph sg = SemanticGraph.valueOf("[example-3 det> the-1 amod> foobar-2]");
2004+
SemanticGraph newSg = pattern.iterate(sg).first;
2005+
SemanticGraph expected = SemanticGraph.valueOf("[example-4 det> the-1 amod> [bar-3 dep> foo-2]]");
2006+
assertEquals(newSg, expected);
2007+
}
2008+
2009+
/**
2010+
* Test splitWord, which should split a word into pieces based on regex matches, with three pieces
2011+
*/
2012+
@Test
2013+
public void readXMLSplitThreeWords() {
2014+
String doc = String.join(newline,
2015+
"<ssurgeon-pattern-list>",
2016+
" <ssurgeon-pattern>",
2017+
" <uid>38</uid>",
2018+
" <notes>Test splitting a word into two pieces with the head at the start</notes>",
2019+
" <language>UniversalEnglish</language>",
2020+
" <semgrex>" + XMLUtils.escapeXML("{word:/foobarbaz/}=split") + "</semgrex>",
2021+
" <edit-list>splitWord -node split -regex ^(foo)barbaz$ -regex ^foo(bar)baz$ -regex ^foobar(baz)$ -reln dep -headIndex 1</edit-list>",
2022+
" </ssurgeon-pattern>",
2023+
"</ssurgeon-pattern-list>");
2024+
Ssurgeon inst = Ssurgeon.inst();
2025+
List<SsurgeonPattern> patterns = inst.readFromString(doc);
2026+
assertEquals(patterns.size(), 1);
2027+
SsurgeonPattern pattern = patterns.get(0);
2028+
2029+
SemanticGraph sg = SemanticGraph.valueOf("[example-3 det> the-1 amod> foobarbaz-2]");
2030+
SemanticGraph newSg = pattern.iterate(sg).first;
2031+
SemanticGraph expected = SemanticGraph.valueOf("[example-5 det> the-1 amod> [bar-3 dep> foo-2 dep>baz-4]]");
2032+
assertEquals(newSg, expected);
2033+
}
2034+
19572035
/**
19582036
* Simple test of an Ssurgeon edit script. This instances a simple semantic graph,
19592037
* a semgrex pattern, and then the resulting actions over the named nodes in the

0 commit comments

Comments
 (0)