Skip to content

Commit 2428723

Browse files
pombredannemrombout
authored andcommitted
Do not use defaultdict for Query unknowns
Do not use defaultdict for Query.unknowns_by_pos and Query.stopwords_by_pos. Otherwise there are pernicious side effects to add new entries in these dctionaries when querying them after their creation. Reported-by: Mike Rombout @mrombout Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com>
1 parent 1b1d03d commit 2428723

File tree

4 files changed

+118
-22
lines changed

4 files changed

+118
-22
lines changed

src/licensedcode/match.py

+12-7
Original file line numberDiff line numberDiff line change
@@ -386,7 +386,7 @@ def qcontains_stopwords(self):
386386
stopwords_pos = qspan & query.stopwords_span
387387
stopwords_pos = (pos for pos in stopwords_pos if pos != qspe)
388388
qry_stopxpos = query.stopwords_by_pos
389-
return any(qry_stopxpos[pos] for pos in stopwords_pos)
389+
return any(qry_stopxpos.get(pos, 0) for pos in stopwords_pos)
390390

391391
def qrange(self):
392392
"""
@@ -1217,8 +1217,12 @@ def filter_low_score(matches, min_score=100):
12171217
return kept, discarded
12181218

12191219

1220-
def filter_spurious_single_token(matches, query=None, unknown_count=5,
1221-
trace=TRACE_FILTER_SPURIOUS_SINGLE_TOKEN):
1220+
def filter_spurious_single_token(
1221+
matches,
1222+
query=None,
1223+
unknown_count=5,
1224+
trace=TRACE_FILTER_SPURIOUS_SINGLE_TOKEN,
1225+
):
12221226
"""
12231227
Return a filtered list of kept LicenseMatch matches and a list of
12241228
discardable matches given a `matches` list of LicenseMatch by removing
@@ -1250,9 +1254,10 @@ def filter_spurious_single_token(matches, query=None, unknown_count=5,
12501254
qend = match.qend
12511255

12521256
# compute the number of unknown tokens before and after this single
1253-
# matched position note: unknowns_by_pos is a defaultdict(int),
1254-
# shorts_and_digits is a set of integers
1255-
before = unknowns_by_pos[qstart - 1]
1257+
# matched position note:
1258+
# - unknowns_by_pos is a dict,
1259+
# - shorts_and_digits is a set of ints
1260+
before = unknowns_by_pos.get(qstart - 1, 0)
12561261
for p in range(qstart - 1 - unknown_count, qstart):
12571262
if p in shorts_and_digits:
12581263
before += 1
@@ -1263,7 +1268,7 @@ def filter_spurious_single_token(matches, query=None, unknown_count=5,
12631268
kept.append(match)
12641269
continue
12651270

1266-
after = unknowns_by_pos[qstart]
1271+
after = unknowns_by_pos.get(qstart, 0)
12671272
for p in range(qend, qend + 1 + unknown_count):
12681273
if p in shorts_and_digits:
12691274
after += 1

src/licensedcode/query.py

+20-15
Original file line numberDiff line numberDiff line change
@@ -228,15 +228,15 @@ def __init__(
228228
# index of "known positions" (yes really!) to a number of unknown tokens
229229
# after that known position. For unknowns at the start, the position is
230230
# using the magic -1 key
231-
self.unknowns_by_pos = defaultdict(int)
231+
self.unknowns_by_pos = {}
232232

233233
# Span of "known positions" (yes really!) followed by unknown token(s)
234234
self.unknowns_span = None
235235

236236
# index of "known positions" (yes really!) to a number of stopword
237237
# tokens after that known position. For stopwords at the start, the
238238
# position is using the magic -1 key
239-
self.stopwords_by_pos = defaultdict(int)
239+
self.stopwords_by_pos = {}
240240

241241
# Span of "known positions" (yes really!) followed by stopwords
242242
self.stopwords_span = None
@@ -355,12 +355,12 @@ def tokens_with_unknowns(self):
355355
"""
356356
unknowns = self.unknowns_by_pos
357357
# yield anything at the start
358-
for _ in range(unknowns[-1]):
358+
for _ in range(unknowns.get(-1, 0)):
359359
yield None
360360

361361
for pos, token in enumerate(self.tokens):
362362
yield token
363-
for _ in range(unknowns[pos]):
363+
for _ in range(unknowns.get(pos, 0)):
364364
yield None
365365

366366
def tokens_by_line(
@@ -386,11 +386,13 @@ def tokens_by_line(
386386
# bind frequently called functions to local scope
387387
line_by_pos_append = self.line_by_pos.append
388388

389-
self_unknowns_by_pos = self.unknowns_by_pos
389+
# we use a defaultdict as a convenience at construction time
390+
unknowns_by_pos = defaultdict(int)
390391
unknowns_pos = set()
391392
unknowns_pos_add = unknowns_pos.add
392393

393-
self_stopwords_by_pos = self.stopwords_by_pos
394+
# we use a defaultdict as a convenience at construction time
395+
stopwords_by_pos = defaultdict(int)
394396
stopwords_pos = set()
395397
stopwords_pos_add = stopwords_pos.add
396398

@@ -443,11 +445,11 @@ def tokens_by_line(
443445
# If we have not yet started globally, then all tokens
444446
# seen so far are stopwords and we keep a count of them
445447
# in the magic "-1" position.
446-
self_stopwords_by_pos[-1] += 1
448+
stopwords_by_pos[-1] += 1
447449
else:
448450
# here we have a new unknwon token positioned right after
449451
# the current known_pos
450-
self_stopwords_by_pos[known_pos] += 1
452+
stopwords_by_pos[known_pos] += 1
451453
stopwords_pos_add(known_pos)
452454
# we do not track stopwords, only their position
453455
continue
@@ -456,11 +458,11 @@ def tokens_by_line(
456458
# If we have not yet started globally, then all tokens
457459
# seen so far are unknowns and we keep a count of them
458460
# in the magic "-1" position.
459-
self_unknowns_by_pos[-1] += 1
461+
unknowns_by_pos[-1] += 1
460462
else:
461463
# here we have a new unknwon token positioned right after
462464
# the current known_pos
463-
self_unknowns_by_pos[known_pos] += 1
465+
unknowns_by_pos[known_pos] += 1
464466
unknowns_pos_add(known_pos)
465467

466468
line_tokens_append(tid)
@@ -492,11 +494,14 @@ def tokens_by_line(
492494

493495
yield line_tokens
494496

495-
# finally create a Span of positions followed by unkwnons and another
496-
# for positions followed by stopwords used for intersection with the
497-
# query span to do the scoring matches correctly
497+
# finally update the attributes and create a Span of positions followed
498+
# by unkwnons and another for positions followed by stopwords used for
499+
# intersection with the query span to do the scoring matches correctly
498500
self.unknowns_span = Span(unknowns_pos)
499501
self.stopwords_span = Span(stopwords_pos)
502+
# also convert the defaultdicts back to plain discts
503+
self.unknowns_by_pos = dict(unknowns_by_pos)
504+
self.stopwords_by_pos = dict(stopwords_by_pos)
500505

501506
def tokenize_and_build_runs(self, tokens_by_line, line_threshold=4):
502507
"""
@@ -760,14 +765,14 @@ def tokens_with_unknowns(self):
760765
unknowns = self.query.unknowns_by_pos
761766
# yield anything at the start only if this is the first query run
762767
if self.start == 0:
763-
for _ in range(unknowns[-1]):
768+
for _ in range(unknowns.get(-1, 0)):
764769
yield None
765770

766771
for pos, token in self.tokens_with_pos():
767772
yield token
768773
if pos == self.end:
769774
break
770-
for _ in range(unknowns[pos]):
775+
for _ in range(unknowns.get(pos, 0)):
771776
yield None
772777

773778
def tokens_with_pos(self):
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
This repository uses 2 different licenses :
2+
- all files in the `lib` directory use a BSD 2-Clause license
3+
- all other files use a GPLv2 license, unless explicitly stated otherwise
4+
5+
Relevant license is reminded at the top of each source file,
6+
and with presence of COPYING or LICENSE file in associated directories.
7+
8+
This model is selected to emphasize that
9+
files in the `lib` directory are designed to be included into 3rd party applications,
10+
while all other files, in `programs`, `tests` or `examples`,
11+
receive more limited attention and support for such scenario.

tests/licensedcode/test_query.py

+75
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
import json
1111
import os
12+
from collections import defaultdict
1213

1314
from commoncode.testcase import FileBasedTesting
1415
from licensedcode import cache
@@ -395,6 +396,31 @@ def test_query_run_unknowns(self):
395396
expected = {-1: 2, 0: 4, 1: 3}
396397
assert dict(q.unknowns_by_pos) == expected
397398

399+
def test_query_unknowns_by_pos_and_stopwords_are_not_defaultdic_and_not_changed_on_query(self):
400+
idx = index.LicenseIndex(
401+
[Rule(stored_text='a is the binary')],
402+
_legalese=set(['binary']),
403+
_spdx_tokens=set()
404+
)
405+
q = Query(query_string='binary that was a binary', idx=idx)
406+
list(q.tokens_by_line())
407+
assert q.unknowns_by_pos == {0: 2}
408+
assert q.stopwords_by_pos == {0: 1}
409+
410+
assert not isinstance(q.unknowns_by_pos, defaultdict)
411+
assert not isinstance(q.stopwords_by_pos, defaultdict)
412+
413+
try:
414+
q.unknowns_by_pos[1]
415+
assert q.unknowns_by_pos == {0: 2}
416+
except KeyError:
417+
pass
418+
try:
419+
q.stopwords_by_pos[1]
420+
assert q.stopwords_by_pos == {0: 1}
421+
except KeyError:
422+
pass
423+
398424

399425
class TestQueryWithMultipleRuns(IndexTesting):
400426

@@ -768,3 +794,52 @@ def test_query_run_for_text_with_long_lines(self):
768794
idx = cache.get_index()
769795
assert len(Query(location1, idx=idx).query_runs) == 17
770796
assert len(Query(location2, idx=idx).query_runs) == 15
797+
798+
def test_match_does_not_change_query_unknown_positions(self):
799+
from licensedcode.match import LicenseMatch
800+
from licensedcode.spans import Span
801+
802+
location = self.get_test_loc('query/unknown_positions/lz4.license.txt')
803+
idx = cache.get_index()
804+
# build a query first
805+
qry1 = Query(location, idx=idx)
806+
# this has the side effect to populate the unknown
807+
txt = u' '.join(f'{i}-{idx.tokens_by_tid[t]}' for i, t in enumerate(qry1.tokens))
808+
assert txt == (
809+
'0-this 1-repository 2-uses 3-2 4-different 5-licenses '
810+
'6-all 7-files 8-in 9-the 10-lib 11-directory 12-use 13-bsd 14-2 15-clause 16-license '
811+
'17-all 18-other 19-files 20-use 21-gplv2 22-license 23-unless 24-explicitly 25-stated 26-otherwise '
812+
'27-relevant 28-license 29-is 30-reminded 31-at 32-the 33-top 34-of 35-each 36-source 37-file '
813+
'38-and 39-with 40-presence 41-of 42-copying 43-or 44-license 45-file 46-in 47-associated 48-directories '
814+
'49-this 50-model 51-is 52-selected 53-to 54-emphasize 55-that '
815+
'56-files 57-in 58-the 59-lib 60-directory 61-are 62-designed 63-to 64-be 65-included 66-into 67-3rd 68-party 69-applications '
816+
'70-while 71-all 72-other 73-files 74-in 75-programs 76-tests 77-or 78-examples '
817+
'79-receive 80-more 81-limited 82-attention 83-and 84-support 85-for 86-such 87-scenario'
818+
)
819+
list(qry1.tokens_by_line())
820+
assert qry1.unknowns_by_pos == {}
821+
822+
# run matching
823+
matches = idx.match(location=location)
824+
match = matches[0]
825+
826+
rule = [
827+
r for r in idx.rules_by_rid
828+
if r.identifier == 'bsd-simplified_and_gpl-2.0_1.RULE'
829+
][0]
830+
831+
expected = LicenseMatch(
832+
matcher='2-aho',
833+
rule=rule,
834+
qspan=Span(0, 48),
835+
ispan=Span(0, 48),
836+
)
837+
838+
assert match == expected
839+
840+
# check that query unknown by pos is the same and empty
841+
qry2 = match.query
842+
843+
# this was incorrectly returned as {15: 0, 20: 0, 21: 0, 41: 0, 43: 0}
844+
# after querying done during matching
845+
assert qry2.unknowns_by_pos == {}

0 commit comments

Comments
 (0)