From 2e996587fe9feed0f81488df7f38c0a9c97cf5e6 Mon Sep 17 00:00:00 2001 From: Bob Abeles Date: Mon, 23 Oct 2023 20:08:15 -0700 Subject: [PATCH 1/4] Improve make translation data performance. --- py/maketranslationdata.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/py/maketranslationdata.py b/py/maketranslationdata.py index ac3bcc53ee..00ed24ce4e 100644 --- a/py/maketranslationdata.py +++ b/py/maketranslationdata.py @@ -28,6 +28,8 @@ sys.path.append(str(pathlib.Path(__file__).parent.parent / "tools/huffman")) import huffman from html.entities import codepoint2name +import math + codepoint2name[ord("-")] = "hyphen" @@ -220,6 +222,15 @@ def compute_huffman_coding(qstrs, translation_name, translations, f, compression f"Translation {translation_name} expected to fit in 8 bits but required 16 bits" ) + # Prune the qstrs to only those longer than 3 characters that appear in the texts + qstr_counters = collections.Counter() + qstr_extractor = TextSplitter(qstr_strs) + for t in texts: + for qstr in qstr_extractor.iter(t): + if qstr in qstr_strs: + qstr_counters[qstr] += 1 + qstr_strs = list(qstr_counters.keys()) + while len(words) < max_words: # Until the dictionary is filled to capacity, use a heuristic to find # the best "word" (2- to 11-gram) to add to it. @@ -287,9 +298,13 @@ def compute_huffman_coding(qstrs, translation_name, translations, f, compression # to the codeword length the dictionary entry would get, times # the number of occurrences, less the ovehead of the entries in the # words[] array. + # + # The set of candidates is pruned by estimating their relative value and + # picking the top 100 scores. 
+ counter = sorted(counter.items(), key=lambda x: math.log(x[1]) * len(x[0]), reverse=True)[:100] scores = sorted( - ((s, -est_net_savings(s, occ)) for (s, occ) in counter.items() if occ > 1), + ((s, -est_net_savings(s, occ)) for (s, occ) in counter if occ > 1), key=lambda x: x[1], ) From 892d89e8b7003011956e276b04f95666b68297b5 Mon Sep 17 00:00:00 2001 From: Bob Abeles Date: Mon, 23 Oct 2023 20:35:31 -0700 Subject: [PATCH 2/4] Fix CI detected line-length issue --- py/maketranslationdata.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/py/maketranslationdata.py b/py/maketranslationdata.py index 00ed24ce4e..111ba6aa78 100644 --- a/py/maketranslationdata.py +++ b/py/maketranslationdata.py @@ -222,7 +222,7 @@ def compute_huffman_coding(qstrs, translation_name, translations, f, compression f"Translation {translation_name} expected to fit in 8 bits but required 16 bits" ) - # Prune the qstrs to only those longer than 3 characters that appear in the texts + # Prune the qstrs to only those that appear in the texts qstr_counters = collections.Counter() qstr_extractor = TextSplitter(qstr_strs) for t in texts: @@ -302,7 +302,9 @@ def compute_huffman_coding(qstrs, translation_name, translations, f, compression # The set of candidates is pruned by estimating their relative value and # picking the top 100 scores. 
- counter = sorted(counter.items(), key=lambda x: math.log(x[1]) * len(x[0]), reverse=True)[:100] + counter = sorted( + counter.items(), key=lambda x: math.log(x[1]) * len(x[0]), reverse=True + )[:100] scores = sorted( ((s, -est_net_savings(s, occ)) for (s, occ) in counter if occ > 1), key=lambda x: x[1], From 3f4332be8062a8cbbf1c65971e5cb2685ccea211 Mon Sep 17 00:00:00 2001 From: Bob Abeles Date: Mon, 23 Oct 2023 20:40:51 -0700 Subject: [PATCH 3/4] Another CI formatting issue --- py/maketranslationdata.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/py/maketranslationdata.py b/py/maketranslationdata.py index 111ba6aa78..0c41505725 100644 --- a/py/maketranslationdata.py +++ b/py/maketranslationdata.py @@ -302,9 +302,9 @@ def compute_huffman_coding(qstrs, translation_name, translations, f, compression # The set of candidates is pruned by estimating their relative value and # picking the top 100 scores. - counter = sorted( - counter.items(), key=lambda x: math.log(x[1]) * len(x[0]), reverse=True - )[:100] + counter = sorted(counter.items(), key=lambda x: math.log(x[1]) * len(x[0]), reverse=True)[ + :100 + ] scores = sorted( ((s, -est_net_savings(s, occ)) for (s, occ) in counter if occ > 1), key=lambda x: x[1], From 6725be4259b5ab6e3fe02a221932dd84ad64fe31 Mon Sep 17 00:00:00 2001 From: Bob Abeles Date: Mon, 23 Oct 2023 20:43:57 -0700 Subject: [PATCH 4/4] And formatting, again --- py/maketranslationdata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/py/maketranslationdata.py b/py/maketranslationdata.py index 0c41505725..0be618763c 100644 --- a/py/maketranslationdata.py +++ b/py/maketranslationdata.py @@ -303,7 +303,7 @@ def compute_huffman_coding(qstrs, translation_name, translations, f, compression # picking the top 100 scores. counter = sorted(counter.items(), key=lambda x: math.log(x[1]) * len(x[0]), reverse=True)[ - :100 + :100 ] scores = sorted( ((s, -est_net_savings(s, occ)) for (s, occ) in counter if occ > 1),