@@ -228,15 +228,15 @@ def __init__(
228
228
# index of "known positions" (yes really!) to a number of unknown tokens
229
229
# after that known position. For unknowns at the start, the position is
230
230
# using the magic -1 key
231
- self .unknowns_by_pos = defaultdict ( int )
231
+ self .unknowns_by_pos = {}
232
232
233
233
# Span of "known positions" (yes really!) followed by unknown token(s)
234
234
self .unknowns_span = None
235
235
236
236
# index of "known positions" (yes really!) to a number of stopword
237
237
# tokens after that known position. For stopwords at the start, the
238
238
# position is using the magic -1 key
239
- self .stopwords_by_pos = defaultdict ( int )
239
+ self .stopwords_by_pos = {}
240
240
241
241
# Span of "known positions" (yes really!) followed by stopwords
242
242
self .stopwords_span = None
@@ -355,12 +355,12 @@ def tokens_with_unknowns(self):
355
355
"""
356
356
unknowns = self .unknowns_by_pos
357
357
# yield anything at the start
358
- for _ in range (unknowns [ - 1 ] ):
358
+ for _ in range (unknowns . get ( - 1 , 0 ) ):
359
359
yield None
360
360
361
361
for pos , token in enumerate (self .tokens ):
362
362
yield token
363
- for _ in range (unknowns [ pos ] ):
363
+ for _ in range (unknowns . get ( pos , 0 ) ):
364
364
yield None
365
365
366
366
def tokens_by_line (
@@ -386,11 +386,13 @@ def tokens_by_line(
386
386
# bind frequently called functions to local scope
387
387
line_by_pos_append = self .line_by_pos .append
388
388
389
- self_unknowns_by_pos = self .unknowns_by_pos
389
+ # we use a defaultdict as a convenience at construction time
390
+ unknowns_by_pos = defaultdict (int )
390
391
unknowns_pos = set ()
391
392
unknowns_pos_add = unknowns_pos .add
392
393
393
- self_stopwords_by_pos = self .stopwords_by_pos
394
+ # we use a defaultdict as a convenience at construction time
395
+ stopwords_by_pos = defaultdict (int )
394
396
stopwords_pos = set ()
395
397
stopwords_pos_add = stopwords_pos .add
396
398
@@ -443,11 +445,11 @@ def tokens_by_line(
443
445
# If we have not yet started globally, then all tokens
444
446
# seen so far are stopwords and we keep a count of them
445
447
# in the magic "-1" position.
446
- self_stopwords_by_pos [- 1 ] += 1
448
+ stopwords_by_pos [- 1 ] += 1
447
449
else :
448
450
# here we have a new stopword token positioned right after
449
451
# the current known_pos
450
- self_stopwords_by_pos [known_pos ] += 1
452
+ stopwords_by_pos [known_pos ] += 1
451
453
stopwords_pos_add (known_pos )
452
454
# we do not track stopwords, only their position
453
455
continue
@@ -456,11 +458,11 @@ def tokens_by_line(
456
458
# If we have not yet started globally, then all tokens
457
459
# seen so far are unknowns and we keep a count of them
458
460
# in the magic "-1" position.
459
- self_unknowns_by_pos [- 1 ] += 1
461
+ unknowns_by_pos [- 1 ] += 1
460
462
else :
461
463
# here we have a new unknown token positioned right after
462
464
# the current known_pos
463
- self_unknowns_by_pos [known_pos ] += 1
465
+ unknowns_by_pos [known_pos ] += 1
464
466
unknowns_pos_add (known_pos )
465
467
466
468
line_tokens_append (tid )
@@ -492,11 +494,14 @@ def tokens_by_line(
492
494
493
495
yield line_tokens
494
496
495
- # finally create a Span of positions followed by unkwnons and another
496
- # for positions followed by stopwords used for intersection with the
497
- # query span to do the scoring matches correctly
497
+ # finally update the attributes and create a Span of positions followed
498
+ # by unknowns and another for positions followed by stopwords used for
499
+ # intersection with the query span to do the scoring matches correctly
498
500
self .unknowns_span = Span (unknowns_pos )
499
501
self .stopwords_span = Span (stopwords_pos )
502
+ # also convert the defaultdicts back to plain dicts
503
+ self .unknowns_by_pos = dict (unknowns_by_pos )
504
+ self .stopwords_by_pos = dict (stopwords_by_pos )
500
505
501
506
def tokenize_and_build_runs (self , tokens_by_line , line_threshold = 4 ):
502
507
"""
@@ -760,14 +765,14 @@ def tokens_with_unknowns(self):
760
765
unknowns = self .query .unknowns_by_pos
761
766
# yield anything at the start only if this is the first query run
762
767
if self .start == 0 :
763
- for _ in range (unknowns [ - 1 ] ):
768
+ for _ in range (unknowns . get ( - 1 , 0 ) ):
764
769
yield None
765
770
766
771
for pos , token in self .tokens_with_pos ():
767
772
yield token
768
773
if pos == self .end :
769
774
break
770
- for _ in range (unknowns [ pos ] ):
775
+ for _ in range (unknowns . get ( pos , 0 ) ):
771
776
yield None
772
777
773
778
def tokens_with_pos (self ):
0 commit comments