Do not use defaultdict for Query unknowns

pombredanne · mrombout · commit 2428723959a2 · 2021-11-30T17:34:58.000+01:00
Do not use defaultdict for Query.unknowns_by_pos and Query.stopwords_by_pos. Otherwise there are pernicious side effects to add new entries in these dctionaries when querying them after their creation. Reported-by: Mike Rombout @mrombout Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com>
diff --git a/src/licensedcode/match.py b/src/licensedcode/match.py
@@ -386,7 +386,7 @@ def qcontains_stopwords(self):
             stopwords_pos = qspan & query.stopwords_span
             stopwords_pos = (pos for pos in stopwords_pos if pos != qspe)
             qry_stopxpos = query.stopwords_by_pos
-            return any(qry_stopxpos[pos] for pos in stopwords_pos)
+            return any(qry_stopxpos.get(pos, 0) for pos in stopwords_pos)
 
     def qrange(self):
         """
@@ -1217,8 +1217,12 @@ def filter_low_score(matches, min_score=100):
     return kept, discarded
 
 
-def filter_spurious_single_token(matches, query=None, unknown_count=5,
-                                 trace=TRACE_FILTER_SPURIOUS_SINGLE_TOKEN):
+def filter_spurious_single_token(
+    matches,
+    query=None,
+    unknown_count=5,
+    trace=TRACE_FILTER_SPURIOUS_SINGLE_TOKEN,
+):
     """
     Return a filtered list of kept LicenseMatch matches and a list of
     discardable matches given a `matches` list of LicenseMatch by removing
@@ -1250,9 +1254,10 @@ def filter_spurious_single_token(matches, query=None, unknown_count=5,
         qend = match.qend
 
         # compute the number of unknown tokens before and after this single
-        # matched position note: unknowns_by_pos is a defaultdict(int),
-        # shorts_and_digits is a set of integers
-        before = unknowns_by_pos[qstart - 1]
+        # matched position note:
+        # - unknowns_by_pos is a dict,
+        # - shorts_and_digits is a set of ints
+        before = unknowns_by_pos.get(qstart - 1, 0)
         for p in range(qstart - 1 - unknown_count, qstart):
             if p in shorts_and_digits:
                 before += 1
@@ -1263,7 +1268,7 @@ def filter_spurious_single_token(matches, query=None, unknown_count=5,
             kept.append(match)
             continue
 
-        after = unknowns_by_pos[qstart]
+        after = unknowns_by_pos.get(qstart, 0)
         for p in range(qend, qend + 1 + unknown_count):
             if p in shorts_and_digits:
                 after += 1
diff --git a/src/licensedcode/query.py b/src/licensedcode/query.py
@@ -228,15 +228,15 @@ def __init__(
         # index of "known positions" (yes really!) to a number of unknown tokens
         # after that known position. For unknowns at the start, the position is
         # using the magic -1 key
-        self.unknowns_by_pos = defaultdict(int)
+        self.unknowns_by_pos = {}
 
         # Span of "known positions" (yes really!) followed by unknown token(s)
         self.unknowns_span = None
 
         # index of "known positions" (yes really!) to a number of stopword
         # tokens after that known position. For stopwords at the start, the
         # position is using the magic -1 key
-        self.stopwords_by_pos = defaultdict(int)
+        self.stopwords_by_pos = {}
 
         # Span of "known positions" (yes really!) followed by stopwords
         self.stopwords_span = None
@@ -355,12 +355,12 @@ def tokens_with_unknowns(self):
         """
         unknowns = self.unknowns_by_pos
         # yield anything at the start
-        for _ in range(unknowns[-1]):
+        for _ in range(unknowns.get(-1, 0)):
             yield None
 
         for pos, token in enumerate(self.tokens):
             yield token
-            for _ in range(unknowns[pos]):
+            for _ in range(unknowns.get(pos, 0)):
                 yield None
 
     def tokens_by_line(
@@ -386,11 +386,13 @@ def tokens_by_line(
         # bind frequently called functions to local scope
         line_by_pos_append = self.line_by_pos.append
 
-        self_unknowns_by_pos = self.unknowns_by_pos
+        # we use a defaultdict as a convenience at construction time
+        unknowns_by_pos = defaultdict(int)
         unknowns_pos = set()
         unknowns_pos_add = unknowns_pos.add
 
-        self_stopwords_by_pos = self.stopwords_by_pos
+        # we use a defaultdict as a convenience at construction time
+        stopwords_by_pos = defaultdict(int)
         stopwords_pos = set()
         stopwords_pos_add = stopwords_pos.add
 
@@ -443,11 +445,11 @@ def tokens_by_line(
                             # If we have not yet started globally, then all tokens
                             # seen so far are stopwords and we keep a count of them
                             # in the magic "-1" position.
-                            self_stopwords_by_pos[-1] += 1
+                            stopwords_by_pos[-1] += 1
                         else:
                             # here we have a new unknwon token positioned right after
                             # the current known_pos
-                            self_stopwords_by_pos[known_pos] += 1
+                            stopwords_by_pos[known_pos] += 1
                             stopwords_pos_add(known_pos)
                         # we do not track stopwords, only their position
                         continue
@@ -456,11 +458,11 @@ def tokens_by_line(
                             # If we have not yet started globally, then all tokens
                             # seen so far are unknowns and we keep a count of them
                             # in the magic "-1" position.
-                            self_unknowns_by_pos[-1] += 1
+                            unknowns_by_pos[-1] += 1
                         else:
                             # here we have a new unknwon token positioned right after
                             # the current known_pos
-                            self_unknowns_by_pos[known_pos] += 1
+                            unknowns_by_pos[known_pos] += 1
                             unknowns_pos_add(known_pos)
 
                 line_tokens_append(tid)
@@ -492,11 +494,14 @@ def tokens_by_line(
 
             yield line_tokens
 
-        # finally create a Span of positions followed by unkwnons and another
-        # for positions followed by stopwords used for intersection with the
-        # query span to do the scoring matches correctly
+        # finally update the attributes and create a Span of positions followed
+        # by unkwnons and another for positions followed by stopwords used for
+        # intersection with the query span to do the scoring matches correctly
         self.unknowns_span = Span(unknowns_pos)
         self.stopwords_span = Span(stopwords_pos)
+        # also convert the defaultdicts back to plain discts
+        self.unknowns_by_pos = dict(unknowns_by_pos)
+        self.stopwords_by_pos = dict(stopwords_by_pos)
 
     def tokenize_and_build_runs(self, tokens_by_line, line_threshold=4):
         """
@@ -760,14 +765,14 @@ def tokens_with_unknowns(self):
         unknowns = self.query.unknowns_by_pos
         # yield anything at the start only if this is the first query run
         if self.start == 0:
-            for _ in range(unknowns[-1]):
+            for _ in range(unknowns.get(-1, 0)):
                 yield None
 
         for pos, token in self.tokens_with_pos():
             yield token
             if pos == self.end:
                 break
-            for _ in range(unknowns[pos]):
+            for _ in range(unknowns.get(pos, 0)):
                 yield None
 
     def tokens_with_pos(self):
diff --git a/tests/licensedcode/data/query/unknown_positions/lz4.license.txt b/tests/licensedcode/data/query/unknown_positions/lz4.license.txt
@@ -0,0 +1,11 @@
+This repository uses 2 different licenses :
+- all files in the `lib` directory use a BSD 2-Clause license
+- all other files use a GPLv2 license, unless explicitly stated otherwise
+
+Relevant license is reminded at the top of each source file,
+and with presence of COPYING or LICENSE file in associated directories.
+
+This model is selected to emphasize that
+files in the `lib` directory are designed to be included into 3rd party applications,
+while all other files, in `programs`, `tests` or `examples`,
+receive more limited attention and support for such scenario.
diff --git a/tests/licensedcode/test_query.py b/tests/licensedcode/test_query.py
@@ -9,6 +9,7 @@
 
 import json
 import os
+from collections import defaultdict
 
 from commoncode.testcase import FileBasedTesting
 from licensedcode import cache
@@ -395,6 +396,31 @@ def test_query_run_unknowns(self):
         expected = {-1: 2, 0: 4, 1: 3}
         assert dict(q.unknowns_by_pos) == expected
 
+    def test_query_unknowns_by_pos_and_stopwords_are_not_defaultdic_and_not_changed_on_query(self):
+        idx = index.LicenseIndex(
+            [Rule(stored_text='a is the binary')],
+            _legalese=set(['binary']),
+            _spdx_tokens=set()
+        )
+        q = Query(query_string='binary that was a binary', idx=idx)
+        list(q.tokens_by_line())
+        assert q.unknowns_by_pos == {0: 2}
+        assert q.stopwords_by_pos == {0: 1}
+        
+        assert not isinstance(q.unknowns_by_pos, defaultdict)
+        assert not isinstance(q.stopwords_by_pos, defaultdict)
+
+        try:
+            q.unknowns_by_pos[1]
+            assert q.unknowns_by_pos == {0: 2}
+        except KeyError:
+            pass
+        try:
+            q.stopwords_by_pos[1]
+            assert q.stopwords_by_pos == {0: 1}
+        except KeyError:
+            pass
+
 
 class TestQueryWithMultipleRuns(IndexTesting):
 
@@ -768,3 +794,52 @@ def test_query_run_for_text_with_long_lines(self):
         idx = cache.get_index()
         assert len(Query(location1, idx=idx).query_runs) == 17
         assert len(Query(location2, idx=idx).query_runs) == 15
+
+    def test_match_does_not_change_query_unknown_positions(self):
+        from licensedcode.match import LicenseMatch
+        from licensedcode.spans import Span
+
+        location = self.get_test_loc('query/unknown_positions/lz4.license.txt')
+        idx = cache.get_index()
+        # build a query first
+        qry1 = Query(location, idx=idx)
+        # this has the side effect to populate the unknown
+        txt = u' '.join(f'{i}-{idx.tokens_by_tid[t]}' for i, t in enumerate(qry1.tokens))
+        assert txt == (
+            '0-this 1-repository 2-uses 3-2 4-different 5-licenses '
+            '6-all 7-files 8-in 9-the 10-lib 11-directory 12-use 13-bsd 14-2 15-clause 16-license '
+            '17-all 18-other 19-files 20-use 21-gplv2 22-license 23-unless 24-explicitly 25-stated 26-otherwise '
+            '27-relevant 28-license 29-is 30-reminded 31-at 32-the 33-top 34-of 35-each 36-source 37-file '
+            '38-and 39-with 40-presence 41-of 42-copying 43-or 44-license 45-file 46-in 47-associated 48-directories '
+            '49-this 50-model 51-is 52-selected 53-to 54-emphasize 55-that '
+            '56-files 57-in 58-the 59-lib 60-directory 61-are 62-designed 63-to 64-be 65-included 66-into 67-3rd 68-party 69-applications '
+            '70-while 71-all 72-other 73-files 74-in 75-programs 76-tests 77-or 78-examples '
+            '79-receive 80-more 81-limited 82-attention 83-and 84-support 85-for 86-such 87-scenario'
+        )
+        list(qry1.tokens_by_line())
+        assert qry1.unknowns_by_pos == {}
+
+        # run matching
+        matches = idx.match(location=location)
+        match = matches[0]
+
+        rule = [
+            r for r in idx.rules_by_rid
+            if r.identifier == 'bsd-simplified_and_gpl-2.0_1.RULE'
+        ][0]
+
+        expected = LicenseMatch(
+            matcher='2-aho',
+            rule=rule,
+            qspan=Span(0, 48),
+            ispan=Span(0, 48),
+        )
+
+        assert match == expected
+
+        # check that query unknown by pos is the same and empty
+        qry2 = match.query
+
+        # this was incorrectly returned as {15: 0, 20: 0, 21: 0, 41: 0, 43: 0}
+        # after querying done during matching
+        assert qry2.unknowns_by_pos == {}