Add an Ssurgeon feature which splits a word into pieces based on regex matches. One of the pieces is specified as the head of the new pieces, along with the relation used to attach the others. Later words are pushed down the sentence to make the indices line up.

AngledLuffa · AngledLuffa · commit 13ede5a26569 · 2024-07-02T01:00:09.000-07:00
diff --git a/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SplitWord.java b/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SplitWord.java
@@ -0,0 +1,168 @@
+package edu.stanford.nlp.semgraph.semgrex.ssurgeon;
+
+import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.io.*;
+
+import edu.stanford.nlp.ling.IndexedWord;
+import edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher;
+import edu.stanford.nlp.semgraph.SemanticGraph;
+import edu.stanford.nlp.trees.GrammaticalRelation;
+
+/**
+ * Split a word into pieces based on the regex expressions provided by the -regex arguments.
+ * <br>
+ * Each regex must match the whole word, and the capture groups of a regex are
+ * concatenated to form one piece.  The piece at -headIndex reuses the matched
+ * node, and the other pieces are new nodes attached to it with the -reln relation.
+ * <br>
+ * As an example of where this is useful, a tokenization dataset had "
+ * stuck to each of the words.  We can separate that out by using two
+ * regex, one which matches the " in a group, one which matches the
+ * rest of the word without the "
+ *
+ * @author John Bauer
+ */
+public class SplitWord extends SsurgeonEdit {
+  public static final String LABEL = "splitWord";
+
+  final String node;
+  final List<Pattern> nodeRegex;
+  final int headIndex;
+  final GrammaticalRelation relation;
+
+  /**
+   * @param node name of the matched node to split
+   * @param nodeRegex at least two regex, one for each piece of the word
+   * @param headIndex index (counting from 0) of the piece to use as the head
+   * @param relation dependency used to attach the other pieces to the head
+   * @throws SsurgeonParseException if an argument is missing, there are fewer
+   *   than two regex, or headIndex does not refer to one of the pieces
+   */
+  public SplitWord(String node, List<String> nodeRegex, Integer headIndex, GrammaticalRelation relation) {
+    if (node == null) {
+      throw new SsurgeonParseException("SplitWord expected -node with the name of the matched node to split");
+    }
+    this.node = node;
+
+    if (nodeRegex == null || nodeRegex.isEmpty()) {
+      throw new SsurgeonParseException("SplitWord expected -regex with regex to determine which pieces to split the word into");
+    }
+    if (nodeRegex.size() == 1) {
+      throw new SsurgeonParseException("SplitWord expected at least two -regex");
+    }
+    this.nodeRegex = new ArrayList<>();
+    for (String regex : nodeRegex) {
+      this.nodeRegex.add(Pattern.compile(regex));
+    }
+
+    if (headIndex == null) {
+      throw new SsurgeonParseException("SplitWord expected a -headIndex, 0-indexed for the word piece to use when chopping up the word");
+    }
+    // an out of range head would otherwise only be noticed in evaluate,
+    // after the graph had already been partly rearranged
+    if (headIndex < 0 || headIndex >= nodeRegex.size()) {
+      throw new SsurgeonParseException("SplitWord expected a -headIndex from 0 to " + (nodeRegex.size() - 1) + ", but got " + headIndex);
+    }
+    this.headIndex = headIndex;
+
+    if (relation == null) {
+      throw new SsurgeonParseException("SplitWord expected a -reln to represent the dependency to use for the new words");
+    }
+    this.relation = relation;
+  }
+
+  /**
+   * Writes this edit as a tab separated command line: the label, then -node, each -regex, -reln and -headIndex.
+   */
+  @Override
+  public String toEditString() {
+    StringWriter buf = new StringWriter();
+    buf.write(LABEL);
+    buf.write("\t");
+    buf.write("-node " + node + "\t");
+    for (Pattern regex : nodeRegex) {
+      buf.write("-regex " + regex + "\t");
+    }
+    buf.write("-reln " + relation.toString() + "\t");
+    buf.write("-headIndex " + headIndex);
+    return buf.toString();
+  }
+
+  /**
+   * Splits the node named by -node into one word per regex.
+   * <br>
+   * Nothing is changed unless every regex matches the word and produces a non-empty piece.
+   *
+   * @return true if the word was split, false if the graph was left alone
+   */
+  @Override
+  public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) {
+    IndexedWord matchedNode = sm.getNode(node);
+    String origWord = matchedNode.word();
+
+    // first, iterate over the regex patterns we had at creation time
+    //
+    // each new word created will be the concatenation of all of the
+    // matching groups from this pattern
+    List<String> words = new ArrayList<>();
+    for (Pattern regex : nodeRegex) {
+      Matcher regexMatcher = regex.matcher(origWord);
+      if (!regexMatcher.matches()) {
+        return false;
+      }
+
+      StringBuilder newWordBuilder = new StringBuilder();
+      for (int j = 1; j <= regexMatcher.groupCount(); ++j) {
+        // a group which took no part in the match, such as an unused
+        // optional group, is null and would be appended as the text "null"
+        String piece = regexMatcher.group(j);
+        if (piece != null) {
+          newWordBuilder.append(piece);
+        }
+      }
+      if (newWordBuilder.length() == 0) {
+        return false;
+      }
+      words.add(newWordBuilder.toString());
+    }
+
+    int matchedIndex = matchedNode.index();
+
+    // at this point, we can make new words out of each of the patterns
+
+    // move all words down by nodeRegex.size() - 1
+    // then move the original word down by headIndex
+    AddDep.moveNodes(sg, sm, x -> (x > matchedIndex), x -> x+nodeRegex.size() - 1, true);
+    // the head node has its word replaced, and its index & links need
+    // to be rearranged, but none of the links are added or removed
+    if (headIndex > 0) {
+      AddDep.moveNode(sg, sm, matchedNode, matchedIndex + headIndex);
+    }
+    // look the node up again after the moves instead of reusing the earlier reference
+    matchedNode = sm.getNode(node);
+    matchedNode.setWord(words.get(headIndex));
+    matchedNode.setValue(words.get(headIndex));
+
+    for (int i = 0; i < nodeRegex.size(); ++i) {
+      if (i == headIndex) {
+        continue;
+      }
+
+      // otherwise, add a word with the appropriate index,
+      // then connect it to matchedNode
+      // TODO: add the ability to set more values, such as POS?
+      IndexedWord newNode = new IndexedWord();
+      newNode.setDocID(matchedNode.docID());
+      newNode.setIndex(matchedIndex + i);
+      newNode.setSentIndex(matchedNode.sentIndex());
+      newNode.setWord(words.get(i));
+      newNode.setValue(words.get(i));
+
+      sg.addVertex(newNode);
+      sg.addEdge(matchedNode, newNode, relation, 0.0, false);
+    }
+    return true;
+  }
+}
diff --git a/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Ssurgeon.java b/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Ssurgeon.java
@@ -85,6 +85,7 @@
  * <li> {@code editNode -node node ...attributes...}
  * <li> {@code lemmatize -node node}
  * <li> {@code combineMWT -node node -word word}
+ * <li> {@code splitWord -node node -headIndex idx -reln depType -regex w1 -regex w2 ...}
  * <li> {@code setRoots n1 (n2 n3 ...)}
  * <li> {@code mergeNodes n1 n2}
  * <li> {@code killAllIncomingEdges -node node}
@@ -146,6 +147,12 @@
  * {@code -node} (repeated) is the nodes to edit.
  * {@code -word} is the optional text to use for the new MWT.  If not set, the words will be concatenated.
  *</p><p>
+ * {@code splitWord} will split a single word into multiple pieces taken from the text of the current word.
+ * {@code -node} is the node to split.
+ * {@code -headIndex} is the index (counting from 0) of the word piece to make the head.
+ * {@code -reln} is the name of the dependency type to use.  Pieces other than the head will attach to the head using this relation.
+ * {@code -regex} (repeated) is a regex which must match the entire word of the node.  The capture groups of each regex are concatenated to form one new word, so at least 2 regex are needed to split a word.
+ *</p><p>
  * {@code setRoots} sets the roots of the sentence to a new root.
  * {@code n1, n2, ...} are the names of the nodes from the Semgrex to use as the root(s).
  * This is best done in conjunction with other operations which actually manipulate the structure
@@ -397,9 +404,12 @@ public Collection<SsurgeonWordlist> getResources() {
   public static final String DEP_NODENAME_ARG = "-dep";
   public static final String EDGE_NAME_ARG = "-edge";
   public static final String NODENAME_ARG = "-node";
+  public static final String REGEX_ARG = "-regex";
   public static final String RELN_ARG = "-reln";
   public static final String NODE_PROTO_ARG = "-nodearg";
   public static final String WEIGHT_ARG = "-weight";
+  public static final String HEAD_INDEX_ARG = "-headIndex";
+  public static final String HEAD_INDEX_LOWER_ARG = "-headindex";
   public static final String NAME_ARG = "-name";
   public static final String POSITION_ARG = "-position";
   public static final String UPDATE_MORPHO_FEATURES = "-updateMorphoFeatures";
@@ -420,6 +430,8 @@ protected static class SsurgeonArgs {
 
     public List<String> nodes = new ArrayList<>();
 
+    public List<String> regex = new ArrayList<>();
+
     // below are string representations of the intended values
     public String nodeString = null;
 
@@ -431,6 +443,8 @@ protected static class SsurgeonArgs {
 
     public String updateMorphoFeatures = null;
 
+    public Integer headIndex = null;
+
     public Map<String, String> annotations = new TreeMap<>();
   }
 
@@ -489,12 +503,19 @@ private static SsurgeonArgs parseArgsBox(String args, Map<String, String> additi
         case NODENAME_ARG:
           argsBox.nodes.add(argsValue);
           break;
+        case REGEX_ARG:
+          argsBox.regex.add(argsValue);
+          break;
         case NODE_PROTO_ARG:
           argsBox.nodeString = argsValue;
           break;
         case WEIGHT_ARG:
           argsBox.weight = Double.valueOf(argsValue);
           break;
+        case HEAD_INDEX_ARG:
+        case HEAD_INDEX_LOWER_ARG:
+          argsBox.headIndex = Integer.valueOf(argsValue);
+          break;
         case NAME_ARG:
           argsBox.name = argsValue;
           break;
@@ -602,6 +623,9 @@ public static SsurgeonEdit parseEditLine(String editLine, Map<String, String> at
         return new KillAllIncomingEdges(argsBox.nodes.get(0));
       } else if (command.equalsIgnoreCase(CombineMWT.LABEL)) {
         return new CombineMWT(argsBox.nodes, argsBox.annotations.get("word"));
+      } else if (command.equalsIgnoreCase(SplitWord.LABEL)) {
+        GrammaticalRelation reln = GrammaticalRelation.valueOf(language, argsBox.reln);
+        return new SplitWord(argsBox.nodes.get(0), argsBox.regex, argsBox.headIndex, reln);
       }
       throw new SsurgeonParseException("Error in SsurgeonEdit.parseEditLine: command '"+command+"' is not supported");
     } catch (SsurgeonParseException e) {
diff --git a/test/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SsurgeonTest.java b/test/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SsurgeonTest.java
@@ -1954,6 +1954,84 @@ public void readXMLDeleteLeaf() {
     assertEquals(newSg, expected);
   }
 
+  /**
+   * Test splitWord with -headIndex 0: "foobar" is split into "foo" and "bar", with "bar" attached to "foo"
+   */
+  @Test
+  public void readXMLSplitTwoWords() {
+    String semgrex = XMLUtils.escapeXML("{word:/foobar/}=split");
+    String xml = String.join(newline,
+                             "<ssurgeon-pattern-list>",
+                             "  <ssurgeon-pattern>",
+                             "    <uid>38</uid>",
+                             "    <notes>Test splitting a word into two pieces with the head at the start</notes>",
+                             "    <language>UniversalEnglish</language>",
+                             "    <semgrex>" + semgrex + "</semgrex>",
+                             "    <edit-list>splitWord -node split -regex ^(foo)bar$ -regex ^foo(bar)$ -reln dep -headIndex 0</edit-list>",
+                             "  </ssurgeon-pattern>",
+                             "</ssurgeon-pattern-list>");
+    List<SsurgeonPattern> parsed = Ssurgeon.inst().readFromString(xml);
+    assertEquals(parsed.size(), 1);
+
+    // the word after the split word is expected one index later, making room for the new piece
+    SemanticGraph original = SemanticGraph.valueOf("[example-3 det> the-1 amod> foobar-2]");
+    SemanticGraph expected = SemanticGraph.valueOf("[example-4 det> the-1 amod> [foo-2 dep> bar-3]]");
+    SemanticGraph actual = parsed.get(0).iterate(original).first;
+    assertEquals(actual, expected);
+  }
+
+  /**
+   * Test splitWord with the head at position 1: the second piece becomes the head and the first piece attaches to it
+   */
+  @Test
+  public void readXMLSplitTwoWordsAfter() {
+    String doc = String.join(newline,
+                             "<ssurgeon-pattern-list>",
+                             "  <ssurgeon-pattern>",
+                             "    <uid>38</uid>",
+                             "    <notes>Test splitting a word into two pieces with the head at the end</notes>",
+                             "    <language>UniversalEnglish</language>",
+                             "    <semgrex>" + XMLUtils.escapeXML("{word:/foobar/}=split") + "</semgrex>",
+                             "    <edit-list>splitWord -node split -regex ^(foo)bar$ -regex ^foo(bar)$ -reln dep -headIndex 1</edit-list>",
+                             "  </ssurgeon-pattern>",
+                             "</ssurgeon-pattern-list>");
+    Ssurgeon inst = Ssurgeon.inst();
+    List<SsurgeonPattern> patterns = inst.readFromString(doc);
+    assertEquals(patterns.size(), 1);
+    SsurgeonPattern pattern = patterns.get(0);
+
+    SemanticGraph sg = SemanticGraph.valueOf("[example-3 det> the-1 amod> foobar-2]");
+    SemanticGraph newSg = pattern.iterate(sg).first;
+    SemanticGraph expected = SemanticGraph.valueOf("[example-4 det> the-1 amod> [bar-3 dep> foo-2]]");
+    assertEquals(newSg, expected);
+  }
+
+  /**
+   * Test splitWord with three pieces and the head at position 1: the first and last pieces both attach to the middle piece
+   */
+  @Test
+  public void readXMLSplitThreeWords() {
+    String doc = String.join(newline,
+                             "<ssurgeon-pattern-list>",
+                             "  <ssurgeon-pattern>",
+                             "    <uid>38</uid>",
+                             "    <notes>Test splitting a word into three pieces with the head in the middle</notes>",
+                             "    <language>UniversalEnglish</language>",
+                             "    <semgrex>" + XMLUtils.escapeXML("{word:/foobarbaz/}=split") + "</semgrex>",
+                             "    <edit-list>splitWord -node split -regex ^(foo)barbaz$ -regex ^foo(bar)baz$ -regex ^foobar(baz)$ -reln dep -headIndex 1</edit-list>",
+                             "  </ssurgeon-pattern>",
+                             "</ssurgeon-pattern-list>");
+    Ssurgeon inst = Ssurgeon.inst();
+    List<SsurgeonPattern> patterns = inst.readFromString(doc);
+    assertEquals(patterns.size(), 1);
+    SsurgeonPattern pattern = patterns.get(0);
+
+    SemanticGraph sg = SemanticGraph.valueOf("[example-3 det> the-1 amod> foobarbaz-2]");
+    SemanticGraph newSg = pattern.iterate(sg).first;
+    SemanticGraph expected = SemanticGraph.valueOf("[example-5 det> the-1 amod> [bar-3 dep> foo-2 dep>baz-4]]");
+    assertEquals(newSg, expected);
+  }
+
   /**
    * Simple test of an Ssurgeon edit script.  This instances a simple semantic graph,
    * a semgrex pattern, and then the resulting actions over the named nodes in the