makeqstrdata: comment my understanding of @ciscorn's code

This commit is contained in:
Jeff Epler 2020-09-16 07:58:55 -05:00
parent d9e336d39f
commit a8e98cda83

View File

@ -156,6 +156,14 @@ def compute_huffman_coding(translations, compression_filename):
sum_len = 0
while True:
# Until the dictionary is filled to capacity, use a heuristic to find
# the best "word" (2- to 9-gram) to add to it.
#
# The TextSplitter allows us to avoid considering parts of the text
# that are already covered by a previously chosen word. For example,
# if "the" is in words, then not only will "the" not be considered
# again, neither will "there" or "wither", since they have "the"
# as substrings.
extractor = TextSplitter(words)
counter = collections.Counter()
for t in texts:
@ -164,6 +172,8 @@ def compute_huffman_coding(translations, compression_filename):
for substr in iter_substrings(word, minlen=2, maxlen=9):
counter[substr] += 1
# Score the candidates we found. This is an empirical formula only,
# chosen for its effectiveness.
scores = sorted(
(
(s, (len(s) - 1) ** log(max(occ - 2, 1)), occ)
@ -173,6 +183,8 @@ def compute_huffman_coding(translations, compression_filename):
reverse=True,
)
# Do we have a "word" that occurred at least 5 times and got a score of
# at least 5? Hooray. Pick the one with the highest score.
word = None
for (s, score, occ) in scores:
if occ < 5:
@ -182,6 +194,8 @@ def compute_huffman_coding(translations, compression_filename):
word = s
break
# If we can successfully add it to the dictionary, do so. Otherwise,
# we've filled the dictionary to capacity and are done.
if not word:
break
if sum_len + len(word) - 2 > max_words_len: