Merge pull request #4564 from tyomitch/patch-1

[build] simplify makeqstrdata heuristic
This commit is contained in:
Scott Shawcroft 2021-04-19 14:50:42 -07:00 committed by GitHub
commit e54e5e3575
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -12,7 +12,6 @@ from __future__ import print_function
import re import re
import sys import sys
from math import log
import collections import collections
import gettext import gettext
import os.path import os.path
@@ -167,7 +166,7 @@ def compute_huffman_coding(translations, compression_filename):
sum_len = 0 sum_len = 0
while True: while True:
# Until the dictionary is filled to capacity, use a heuristic to find # Until the dictionary is filled to capacity, use a heuristic to find
# the best "word" (2- to 9-gram) to add to it. # the best "word" (3- to 9-gram) to add to it.
# #
# The TextSplitter allows us to avoid considering parts of the text # The TextSplitter allows us to avoid considering parts of the text
# that are already covered by a previously chosen word, for example # that are already covered by a previously chosen word, for example
@@ -179,32 +178,25 @@ def compute_huffman_coding(translations, compression_filename):
for t in texts: for t in texts:
for (found, word) in extractor.iter_words(t): for (found, word) in extractor.iter_words(t):
if not found: if not found:
for substr in iter_substrings(word, minlen=2, maxlen=9): for substr in iter_substrings(word, minlen=3, maxlen=9):
counter[substr] += 1 counter[substr] += 1
# Score the candidates we found. This is an empirical formula only, # Score the candidates we found. This is an empirical formula only,
# chosen for its effectiveness. # chosen for its effectiveness.
scores = sorted( scores = sorted(
((s, (len(s) - 1) ** log(max(occ - 2, 1)), occ) for (s, occ) in counter.items()), ((s, (len(s) - 1) ** (occ + 4)) for (s, occ) in counter.items() if occ > 4),
key=lambda x: x[1], key=lambda x: x[1],
reverse=True, reverse=True,
) )
# Do we have a "word" that occurred 5 times and got a score of at least # Pick the one with the highest score.
# 5? Horray. Pick the one with the highest score. if not scores:
word = None
for (s, score, occ) in scores:
if occ < 5:
continue
if score < 5:
break
word = s
break break
word = scores[0][0]
# If we can successfully add it to the dictionary, do so. Otherwise, # If we can successfully add it to the dictionary, do so. Otherwise,
# we've filled the dictionary to capacity and are done. # we've filled the dictionary to capacity and are done.
if not word:
break
if sum_len + len(word) - 2 > max_words_len: if sum_len + len(word) - 2 > max_words_len:
break break
if len(words) == max_words: if len(words) == max_words: