diff --git a/py/makeqstrdata.py b/py/makeqstrdata.py index 0060217917..39d4a6840f 100644 --- a/py/makeqstrdata.py +++ b/py/makeqstrdata.py @@ -156,6 +156,14 @@ def compute_huffman_coding(translations, compression_filename): sum_len = 0 while True: + # Until the dictionary is filled to capacity, use a heuristic to find + # the best "word" (2- to 9-gram) to add to it. + # + # The TextSplitter allows us to avoid considering parts of the text + # that are already covered by a previously chosen word, for example + # if "the" is in words then not only will "the" not be considered + # again, neither will "there" or "wither", since they have "the" + # as substrings. extractor = TextSplitter(words) counter = collections.Counter() for t in texts: @@ -164,6 +172,8 @@ def compute_huffman_coding(translations, compression_filename): for substr in iter_substrings(word, minlen=2, maxlen=9): counter[substr] += 1 + # Score the candidates we found. This is an empirical formula only, + # chosen for its effectiveness. scores = sorted( ( (s, (len(s) - 1) ** log(max(occ - 2, 1)), occ) @@ -173,6 +183,8 @@ def compute_huffman_coding(translations, compression_filename): reverse=True, ) + # Do we have a "word" that occurred at least 5 times and got a score of at least + # 5? Hooray. Pick the one with the highest score. word = None for (s, score, occ) in scores: if occ < 5: @@ -182,6 +194,8 @@ def compute_huffman_coding(translations, compression_filename): word = s break + # If we can successfully add it to the dictionary, do so. Otherwise, + # we've filled the dictionary to capacity and are done. if not word: break if sum_len + len(word) - 2 > max_words_len: