Add the ability to reuse indices in SemanticGraph.valueOf. This possibly changes the meaning of existing expressions, since it was previously possible to assign multiple words to the same index, but that was a bad feature anyway.

AngledLuffa · AngledLuffa · commit cf97e3647582 · 2023-01-19T22:27:50.000-08:00
diff --git a/src/edu/stanford/nlp/semgraph/SemanticGraph.java b/src/edu/stanford/nlp/semgraph/SemanticGraph.java
@@ -1688,10 +1688,13 @@ public SemanticGraphEdge addEdge(SemanticGraphEdge edge) {
    * dumb, could be made more sophisticated.
    * <br>
    *
-   * Example: "[ate subj>Bill dobj>[muffins compound>blueberry]]"
+   * Example: {@code [ate subj>Bill dobj>[muffins compound>blueberry]]}
    * <br>
    *
    * This is the same format generated by toCompactString().
+   * <br>
+   * Indices are represented by a dash-separated number after the word:
+   * {@code [ate-1 subj>Bill-2 ...]}
    */
   public static SemanticGraph valueOf(String s, Language language, Integer sentIndex) {
     return (new SemanticGraphParsingTask(s, language, sentIndex)).parse();
@@ -1841,7 +1844,7 @@ public SemanticGraph makeSoftCopy() {
 
   // ============================================================================
 
-  private static final Pattern WORD_AND_INDEX_PATTERN = Pattern.compile("([^-]+)-([0-9]+)");
+  private static final Pattern WORD_AND_INDEX_PATTERN = Pattern.compile("([^-]*)-([0-9]+)");
 
   /**
    * This nested class is a helper for valueOf(). It represents the task of
@@ -1850,7 +1853,7 @@ public SemanticGraph makeSoftCopy() {
   private static class SemanticGraphParsingTask extends StringParsingTask<SemanticGraph> {
 
     private SemanticGraph sg;
-    private Set<Integer> indexesUsed = Generics.newHashSet();
+    private Map<Integer, IndexedWord> indexesUsed = Generics.newHashMap();
     private final Language language;
     private final Integer sentIndex;
 
@@ -1922,21 +1925,19 @@ private IndexedWord makeVertex(String word) {
       } else {
         index = getNextFreeIndex();
       }
-      indexesUsed.add(index);
-      // Note that, despite the use of indexesUsed and getNextFreeIndex(),
-      // nothing is actually enforcing that no indexes are used twice. This
-      // could occur if some words in the string representation being parsed
-      // come with index markers and some do not.
+      if (indexesUsed.containsKey(index)) {
+        return indexesUsed.get(index);
+      }
       IndexedWord ifl = new IndexedWord(null, sentIndex != null ? sentIndex : 0, index);
       // log.info("SemanticGraphParsingTask>>> word = " + word);
       // log.info("SemanticGraphParsingTask>>> index = " + index);
-      // log.info("SemanticGraphParsingTask>>> indexesUsed = " +
-      // indexesUsed);
+      // log.info("SemanticGraphParsingTask>>> indexesUsed = " + indexesUsed);
       String[] wordAndTag = word.split("/");
       ifl.set(CoreAnnotations.TextAnnotation.class, wordAndTag[0]);
       ifl.set(CoreAnnotations.ValueAnnotation.class, wordAndTag[0]);
       if (wordAndTag.length > 1)
         ifl.set(CoreAnnotations.PartOfSpeechAnnotation.class, wordAndTag[1]);
+      indexesUsed.put(index, ifl);
       return ifl;
     }
 
@@ -1953,7 +1954,7 @@ private static Pair<String, Integer> readWordAndIndex(String word) {
 
     private Integer getNextFreeIndex() {
       int i = 0;
-      while (indexesUsed.contains(i))
+      while (indexesUsed.containsKey(i))
         i++;
       return i;
     }
diff --git a/test/src/edu/stanford/nlp/semgraph/SemanticGraphTest.java b/test/src/edu/stanford/nlp/semgraph/SemanticGraphTest.java
@@ -347,4 +347,52 @@ public void testValueOfIndices() {
     assertEquals(sg.getParentsWithReln(E, "obj").size(), 1);
     assertEquals(sg.getParentsWithReln(E, "dep").size(), 0);
   }
+
+  /**
+   * Test the vertices and edges if we reuse some indices in valueOf
+   */
+   public void testValueOfReuseIndices() {
+    SemanticGraph sg = SemanticGraph.valueOf("[A/foo-0 obj> B/bar-1 obj> C/foo-2 obj> -2 dep> B/bar-1 nsubj> [D/bar-3 obj> E/baz-4]]");
+
+    List<IndexedWord> words = sg.vertexListSorted();
+    assertEquals(words.size(), 5);
+
+    for (int i = 0; i < 5; ++i) {
+      assertEquals(words.get(i).index(), i);
+    }
+    IndexedWord A = words.get(0);
+    IndexedWord B = words.get(1);
+    IndexedWord C = words.get(2);
+    IndexedWord D = words.get(3);
+    IndexedWord E = words.get(4);
+
+    assertEquals(A.word(), "A");
+    assertEquals(A.tag(),  "foo");
+    assertEquals(B.word(), "B");
+    assertEquals(B.tag(),  "bar");
+    assertEquals(C.word(), "C");
+    assertEquals(C.tag(),  "foo");
+    assertEquals(D.word(), "D");
+    assertEquals(D.tag(),  "bar");
+    assertEquals(E.word(), "E");
+    assertEquals(E.tag(),  "baz");
+
+    assertEquals(sg.getAllEdges(A, B).size(), 2);
+    assertEquals(sg.getParentsWithReln(B, "obj").size(), 1);
+    assertEquals(sg.getParentsWithReln(B, "dep").size(), 1);
+
+    assertEquals(sg.getAllEdges(A, C).size(), 2);
+    assertEquals(sg.getParentsWithReln(C, "obj").size(), 1);
+
+    assertEquals(sg.getAllEdges(A, D).size(), 1);
+    assertEquals(sg.getParentsWithReln(D, "nsubj").size(), 1);
+    assertEquals(sg.getParentsWithReln(D, "obj").size(), 0);
+    assertEquals(sg.getParentsWithReln(D, "dep").size(), 0);
+
+    assertEquals(sg.getAllEdges(A, E).size(), 0);
+    assertEquals(sg.getAllEdges(D, E).size(), 1);
+    assertEquals(sg.getParentsWithReln(E, "obj").size(), 1);
+    assertEquals(sg.getParentsWithReln(E, "dep").size(), 0);
+  }
+
 }