From f190b0395da5297691623614272ca36d7b5ea931 Mon Sep 17 00:00:00 2001
From: Jelle Spijker
Date: Mon, 3 Apr 2023 10:54:45 +0200
Subject: [PATCH 1/9] Added script to fix a tmx file with existing po files

Contribute to CURA-10376
---
 scripts/fix_translation_memory.py | 64 +++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)
 create mode 100644 scripts/fix_translation_memory.py

diff --git a/scripts/fix_translation_memory.py b/scripts/fix_translation_memory.py
new file mode 100644
index 0000000000..622d51cfd3
--- /dev/null
+++ b/scripts/fix_translation_memory.py
@@ -0,0 +1,64 @@
+##% Script to fix a corrupted Translation memory from existing po files
+
+import os
+import re
+import argparse
+from pathlib import Path
+from fuzzywuzzy import fuzz
+from fuzzywuzzy import process
+import xml.etree.ElementTree as ET
+from xml.sax.saxutils import unescape
+
+
+def load_existing_xmtm(path: Path) -> ET.Element:
+    """Load existing xmtm file and return the root element"""
+    tree = ET.parse(path)
+    return tree.getroot()
+
+def load_existing_po(path: Path) -> dict:
+    """Load existing po file and return a dictionary of msgid and msgstr"""
+    content = path.read_text(encoding="utf-8")
+    content = "".join(content.splitlines()[16:])
+    return dict(re.findall(r'[^#]msgid.?\"+\s?([\s|\S]+?)\"*?msgstr.?\"([\s|\S]+?)\"?#', content))
+
+
+def main(tmx_source_path: Path, tmx_target_path: Path, i18n_path: Path):
+
+    po_content = {}
+    for file in i18n_path.rglob("cura.po"):
+        print(os.path.join(i18n_path, file))
+        po_content[file.relative_to(i18n_path).parts[0].replace("_", "-")] = load_existing_po(Path(os.path.join(i18n_path, file)))
+
+    root = load_existing_xmtm(tmx_source_path)
+    root_old = ET.ElementTree(root)
+    root_old.write("old.tmx", encoding="utf-8", xml_declaration=True)
+    for tu in root.iter("tu"):
+        if "cura.pot" not in [t.text for t in tu.findall("prop") if t.attrib["type"] == "x-smartling-file"]:
+            continue
+        tuvs = tu.findall("tuv")
+        key_source = tuvs[0].find("seg").text
+        key_lang = tuvs[1].attrib["{http://www.w3.org/XML/1998/namespace}lang"]
+        if key_lang in po_content and key_source in po_content[key_lang]:
+            tuvs[1].find("seg").text = po_content[key_lang][key_source]
+        else:
+            fuzz_match_ratio = [fuzz.ratio(k, key_source) for k in po_content[key_lang].keys()]
+            fuzz_max_ratio = max(fuzz_match_ratio)
+            fuzz_match_key = list(po_content[key_lang].keys())[fuzz_match_ratio.index(fuzz_max_ratio)]
+            if fuzz_max_ratio > 90:
+                fuzz_match_po_value = po_content[key_lang][fuzz_match_key]
+                tuvs[1].find("seg").text = fuzz_match_po_value
+                # print(f"[{key_lang}] Fuzz match: {key_source} -> {fuzz_match_key} -> {fuzz_match_po_value} with a ratio of {fuzz_max_ratio}")
+            else:
+                # print(f"[{key_lang}] No match for: {key_source} -> {tuvs[1].find('seg').text} -> highest ratio: {fuzz_max_ratio}")
+                print(f"[{key_lang}] {key_source} == {fuzz_match_key} [{fuzz_max_ratio}]")
+    fixed_root = ET.ElementTree(root)
+    fixed_root.write(tmx_target_path, encoding="utf-8", xml_declaration=True)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Fix a corrupted Translation memory from existing po files")
+    parser.add_argument("tmx_source_path", type=Path, help="Path to the source TMX file")
+    parser.add_argument("tmx_target_path", type=Path, help="Path to the target TMX file")
+    parser.add_argument("i18n_path", type=Path, help="Path to the i18n folder")
+    args = parser.parse_args()
+    main(args.tmx_source_path, args.tmx_target_path, args.i18n_path)
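
The core idea of the script above: walk every <tu> element of the TMX memory, take the English <seg> of the first <tuv> as the key, and overwrite the second <tuv>'s <seg> with the msgstr of the matching po entry, falling back to a fuzzywuzzy ratio above 90. The snippet below reproduces that matching step in isolation; the sample <tu> element and the po entry are invented for illustration and are not part of the patch series.

# Minimal sketch of the matching step, runnable on its own.
# Assumes the same third-party dependency as the script (fuzzywuzzy).
import xml.etree.ElementTree as ET

from fuzzywuzzy import fuzz

SAMPLE_TU = """
<tu>
  <prop type="x-smartling-file">cura.pot</prop>
  <tuv xml:lang="en-US"><seg>Slice the model</seg></tuv>
  <tuv xml:lang="de-DE"><seg>outdated translation</seg></tuv>
</tu>
"""

# Invented msgid -> msgstr pairs, in the shape load_existing_po() returns.
po_entries = {"Slice the model.": "Das Modell slicen."}

tu = ET.fromstring(SAMPLE_TU)
tuvs = tu.findall("tuv")
key_source = tuvs[0].find("seg").text
key_lang = tuvs[1].attrib["{http://www.w3.org/XML/1998/namespace}lang"]

if key_source in po_entries:
    # exact msgid hit: copy the msgstr into the target segment
    tuvs[1].find("seg").text = po_entries[key_source]
else:
    # fuzzy fallback, same 90-ratio threshold as the script
    ratios = {k: fuzz.ratio(k, key_source) for k in po_entries}
    best_key = max(ratios, key=ratios.get)
    if ratios[best_key] > 90:
        tuvs[1].find("seg").text = po_entries[best_key]
    else:
        print(f"[{key_lang}] no match for: {key_source}")

print(ET.tostring(tu, encoding="unicode"))
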
From 95962b2914503f2c0ae602f24df1f1174343b79e Mon Sep 17 00:00:00 2001
From: Jelle Spijker
Date: Mon, 3 Apr 2023 12:42:04 +0200
Subject: [PATCH 2/9] Set fuzz_match_key

Contribute to CURA-10376
---
 scripts/fix_translation_memory.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scripts/fix_translation_memory.py b/scripts/fix_translation_memory.py
index 622d51cfd3..1458087887 100644
--- a/scripts/fix_translation_memory.py
+++ b/scripts/fix_translation_memory.py
@@ -46,6 +46,7 @@ def main(tmx_source_path: Path, tmx_target_path: Path, i18n_path: Path):
             fuzz_match_key = list(po_content[key_lang].keys())[fuzz_match_ratio.index(fuzz_max_ratio)]
             if fuzz_max_ratio > 90:
                 fuzz_match_po_value = po_content[key_lang][fuzz_match_key]
+                tuvs[0].find("seg").text = fuzz_match_key
                 tuvs[1].find("seg").text = fuzz_match_po_value
                 # print(f"[{key_lang}] Fuzz match: {key_source} -> {fuzz_match_key} -> {fuzz_match_po_value} with a ratio of {fuzz_max_ratio}")
             else:

From 1792f80ac89d69121ca79654c9938309389a8a76 Mon Sep 17 00:00:00 2001
From: Jelle Spijker
Date: Mon, 3 Apr 2023 13:12:23 +0200
Subject: [PATCH 3/9] Sanitize the string before write

Contribute to CURA-10376
---
 scripts/fix_translation_memory.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/scripts/fix_translation_memory.py b/scripts/fix_translation_memory.py
index 1458087887..2c65192bae 100644
--- a/scripts/fix_translation_memory.py
+++ b/scripts/fix_translation_memory.py
@@ -7,7 +7,7 @@ from pathlib import Path
 from fuzzywuzzy import fuzz
 from fuzzywuzzy import process
 import xml.etree.ElementTree as ET
-from xml.sax.saxutils import unescape
+from xml.sax.saxutils import unescape, escape, quoteattr
 
 
 def load_existing_xmtm(path: Path) -> ET.Element:
@@ -21,6 +21,9 @@ def load_existing_po(path: Path) -> dict:
     content = path.read_text(encoding="utf-8")
     content = "".join(content.splitlines()[16:])
     return dict(re.findall(r'[^#]msgid.?\"+\s?([\s|\S]+?)\"*?msgstr.?\"([\s|\S]+?)\"?#', content))
 
+def sanitize(text: str) -> str:
+    """Sanitize the text"""
+    return unescape(text.replace("\"\"", "").replace("\"#~", ""))
 
 def main(tmx_source_path: Path, tmx_target_path: Path, i18n_path: Path):
 
@@ -31,6 +34,7 @@ def main(tmx_source_path: Path, tmx_target_path: Path, i18n_path: Path):
 
     root = load_existing_xmtm(tmx_source_path)
     root_old = ET.ElementTree(root)
+    ET.indent(root_old, ' ')
     root_old.write("old.tmx", encoding="utf-8", xml_declaration=True)
     for tu in root.iter("tu"):
         if "cura.pot" not in [t.text for t in tu.findall("prop") if t.attrib["type"] == "x-smartling-file"]:
             continue
         tuvs = tu.findall("tuv")
         key_source = tuvs[0].find("seg").text
         key_lang = tuvs[1].attrib["{http://www.w3.org/XML/1998/namespace}lang"]
         if key_lang in po_content and key_source in po_content[key_lang]:
-            tuvs[1].find("seg").text = po_content[key_lang][key_source]
+            replaced_translation = po_content[key_lang][key_source]
         else:
             fuzz_match_ratio = [fuzz.ratio(k, key_source) for k in po_content[key_lang].keys()]
             fuzz_max_ratio = max(fuzz_match_ratio)
             fuzz_match_key = list(po_content[key_lang].keys())[fuzz_match_ratio.index(fuzz_max_ratio)]
             if fuzz_max_ratio > 90:
-                fuzz_match_po_value = po_content[key_lang][fuzz_match_key]
-                tuvs[0].find("seg").text = fuzz_match_key
-                tuvs[1].find("seg").text = fuzz_match_po_value
-                # print(f"[{key_lang}] Fuzz match: {key_source} -> {fuzz_match_key} -> {fuzz_match_po_value} with a ratio of {fuzz_max_ratio}")
+                replaced_translation = po_content[key_lang][fuzz_match_key]
+                tuvs[0].find("seg").text = sanitize(fuzz_match_key)
             else:
-                # print(f"[{key_lang}] No match for: {key_source} -> {tuvs[1].find('seg').text} -> highest ratio: {fuzz_max_ratio}")
                 print(f"[{key_lang}] {key_source} == {fuzz_match_key} [{fuzz_max_ratio}]")
+                continue
+        tuvs[1].find("seg").text = sanitize(replaced_translation)
     fixed_root = ET.ElementTree(root)
+    ET.indent(fixed_root, ' ')
     fixed_root.write(tmx_target_path, encoding="utf-8", xml_declaration=True)
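
For reference, this is what the sanitize() helper introduced in patch 3 does to the raw strings captured by the regex in load_existing_po(): leftover "" and "#~ fragments of the po syntax are stripped and XML entities are unescaped. The helper is copied from the patch; the input strings are invented examples.

from xml.sax.saxutils import unescape


def sanitize(text: str) -> str:
    """Copy of the patch 3 helper (before the UltiMaker handling added later)."""
    return unescape(text.replace("\"\"", "").replace("\"#~", ""))


print(sanitize('Drucken &amp; Vorschau"#~'))   # -> Drucken & Vorschau
print(sanitize('""Modell laden'))              # -> Modell laden
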
print(f"[{key_lang}] No match for: {key_source} -> {tuvs[1].find('seg').text} -> highest ratio: {fuzz_max_ratio}") print(f"[{key_lang}] {key_source} == {fuzz_match_key} [{fuzz_max_ratio}]") + continue + tuvs[1].find("seg").text = sanitize(replaced_translation) fixed_root = ET.ElementTree(root) + ET.indent(fixed_root, ' ') fixed_root.write(tmx_target_path, encoding="utf-8", xml_declaration=True) From 0feda7fe08ccac297235b37822e63c1ba82ddb92 Mon Sep 17 00:00:00 2001 From: Jelle Spijker Date: Mon, 3 Apr 2023 13:29:05 +0200 Subject: [PATCH 4/9] Sanitize the fuzzy key string before match Contribute to CURA-10376 --- scripts/fix_translation_memory.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/fix_translation_memory.py b/scripts/fix_translation_memory.py index 2c65192bae..ee41b59e40 100644 --- a/scripts/fix_translation_memory.py +++ b/scripts/fix_translation_memory.py @@ -23,12 +23,12 @@ def load_existing_po(path: Path) -> dict: def sanitize(text: str) -> str: """Sanitize the text""" - return unescape(text.replace("\"\"", "").replace("\"#~", "")) + return unescape(text.replace("Ultimaker", "UltiMaker").replace("\"\"", "").replace("\"#~", "")) def main(tmx_source_path: Path, tmx_target_path: Path, i18n_path: Path): po_content = {} - for file in i18n_path.rglob("cura.po"): + for file in i18n_path.rglob("*.po"): print(os.path.join(i18n_path, file)) po_content[file.relative_to(i18n_path).parts[0].replace("_", "-")] = load_existing_po(Path(os.path.join(i18n_path, file))) @@ -45,7 +45,7 @@ def main(tmx_source_path: Path, tmx_target_path: Path, i18n_path: Path): if key_lang in po_content and key_source in po_content[key_lang]: replaced_translation = po_content[key_lang][key_source] else: - fuzz_match_ratio = [fuzz.ratio(k, key_source) for k in po_content[key_lang].keys()] + fuzz_match_ratio = [fuzz.ratio(sanitize(k), key_source) for k in po_content[key_lang].keys()] fuzz_max_ratio = max(fuzz_match_ratio) fuzz_match_key = list(po_content[key_lang].keys())[fuzz_match_ratio.index(fuzz_max_ratio)] if fuzz_max_ratio > 90: From 19b3c93a0cbd8c63d123f762b89295fe7678d345 Mon Sep 17 00:00:00 2001 From: Jelle Spijker Date: Mon, 3 Apr 2023 13:51:03 +0200 Subject: [PATCH 5/9] Hand-over Contribute to CURA-10376 --- scripts/fix_translation_memory.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/scripts/fix_translation_memory.py b/scripts/fix_translation_memory.py index ee41b59e40..83f40a4e89 100644 --- a/scripts/fix_translation_memory.py +++ b/scripts/fix_translation_memory.py @@ -19,10 +19,12 @@ def load_existing_po(path: Path) -> dict: """Load existing po file and return a dictionary of msgid and msgstr""" content = path.read_text(encoding="utf-8") content = "".join(content.splitlines()[16:]) + # TODO: check languages with plural forms return dict(re.findall(r'[^#]msgid.?\"+\s?([\s|\S]+?)\"*?msgstr.?\"([\s|\S]+?)\"?#', content)) def sanitize(text: str) -> str: """Sanitize the text""" + # TODO: check if Digitial Factory Ultimaker etc handled correctly return unescape(text.replace("Ultimaker", "UltiMaker").replace("\"\"", "").replace("\"#~", "")) def main(tmx_source_path: Path, tmx_target_path: Path, i18n_path: Path): @@ -36,7 +38,9 @@ def main(tmx_source_path: Path, tmx_target_path: Path, i18n_path: Path): root_old = ET.ElementTree(root) ET.indent(root_old, ' ') root_old.write("old.tmx", encoding="utf-8", xml_declaration=True) + to_be_removed = [] for tu in root.iter("tu"): + # TODO: also add logic for other pot files if "cura.pot" not in [t.text for t in 
tu.findall("prop") if t.attrib["type"] == "x-smartling-file"]: continue tuvs = tu.findall("tuv") @@ -55,6 +59,12 @@ def main(tmx_source_path: Path, tmx_target_path: Path, i18n_path: Path): print(f"[{key_lang}] {key_source} == {fuzz_match_key} [{fuzz_max_ratio}]") continue tuvs[1].find("seg").text = sanitize(replaced_translation) + # if the tvus[1].find("seg").text is a single ", remove the tu element as whole (since this is an untranslated string) + if tuvs[1].find("seg").text == "\"": + to_be_removed.append(tu) + + for tu in to_be_removed: + root.remove(tu) fixed_root = ET.ElementTree(root) ET.indent(fixed_root, ' ') fixed_root.write(tmx_target_path, encoding="utf-8", xml_declaration=True) From ef09fe640becd1a96458c434f08b625f53600e65 Mon Sep 17 00:00:00 2001 From: "c.lamboo" Date: Mon, 3 Apr 2023 21:46:24 +0200 Subject: [PATCH 6/9] Fix removing `tu` elements CURA-10376 --- scripts/fix_translation_memory.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/fix_translation_memory.py b/scripts/fix_translation_memory.py index 83f40a4e89..146a1caf26 100644 --- a/scripts/fix_translation_memory.py +++ b/scripts/fix_translation_memory.py @@ -36,7 +36,7 @@ def main(tmx_source_path: Path, tmx_target_path: Path, i18n_path: Path): root = load_existing_xmtm(tmx_source_path) root_old = ET.ElementTree(root) - ET.indent(root_old, ' ') + # ET.indent(root_old, ' ') root_old.write("old.tmx", encoding="utf-8", xml_declaration=True) to_be_removed = [] for tu in root.iter("tu"): @@ -63,10 +63,10 @@ def main(tmx_source_path: Path, tmx_target_path: Path, i18n_path: Path): if tuvs[1].find("seg").text == "\"": to_be_removed.append(tu) + body = root.find("body") for tu in to_be_removed: - root.remove(tu) + body.remove(tu) fixed_root = ET.ElementTree(root) - ET.indent(fixed_root, ' ') fixed_root.write(tmx_target_path, encoding="utf-8", xml_declaration=True) From af4efad1cfa3a4708b87adc10d9b00f708d779bb Mon Sep 17 00:00:00 2001 From: "c.lamboo" Date: Mon, 3 Apr 2023 21:46:51 +0200 Subject: [PATCH 7/9] Don't capitalize "UltiMaker" for brand names CURA-10376 --- scripts/fix_translation_memory.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/scripts/fix_translation_memory.py b/scripts/fix_translation_memory.py index 146a1caf26..e53384dc21 100644 --- a/scripts/fix_translation_memory.py +++ b/scripts/fix_translation_memory.py @@ -25,7 +25,12 @@ def load_existing_po(path: Path) -> dict: def sanitize(text: str) -> str: """Sanitize the text""" # TODO: check if Digitial Factory Ultimaker etc handled correctly - return unescape(text.replace("Ultimaker", "UltiMaker").replace("\"\"", "").replace("\"#~", "")) + text = text.replace("\"\"", "").replace("\"#~", "") + text = text.replace("Ultimaker", "UltiMaker") + text = text.replace("UltiMaker Digital Library", "Ultimaker Digital Library") + text = text.replace("UltiMaker Digital Factory", "Ultimaker Digital Factory") + text = text.replace("UltiMaker Marketplace", "Ultimaker Marketplace") + return unescape(text) def main(tmx_source_path: Path, tmx_target_path: Path, i18n_path: Path): From 1763b9bf9d00e3758c64c83207f06f1d82e28f91 Mon Sep 17 00:00:00 2001 From: Jelle Spijker Date: Tue, 4 Apr 2023 15:47:56 +0200 Subject: [PATCH 8/9] filter on relevant cura files Contribute to CURA-10376 --- scripts/fix_translation_memory.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/fix_translation_memory.py b/scripts/fix_translation_memory.py index e53384dc21..a8eb4678d7 100644 --- 
From 1763b9bf9d00e3758c64c83207f06f1d82e28f91 Mon Sep 17 00:00:00 2001
From: Jelle Spijker
Date: Tue, 4 Apr 2023 15:47:56 +0200
Subject: [PATCH 8/9] filter on relevant cura files

Contribute to CURA-10376
---
 scripts/fix_translation_memory.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/scripts/fix_translation_memory.py b/scripts/fix_translation_memory.py
index e53384dc21..a8eb4678d7 100644
--- a/scripts/fix_translation_memory.py
+++ b/scripts/fix_translation_memory.py
@@ -46,7 +46,7 @@ def main(tmx_source_path: Path, tmx_target_path: Path, i18n_path: Path):
     to_be_removed = []
     for tu in root.iter("tu"):
         # TODO: also add logic for other pot files
-        if "cura.pot" not in [t.text for t in tu.findall("prop") if t.attrib["type"] == "x-smartling-file"]:
+        if [t.text for t in tu.findall("prop") if t.attrib["type"] == "x-smartling-file"][0] not in ("cura.pot", "fdmprinter.def.json.pot", "fdmextruder.def.json.pot"):
             continue
         tuvs = tu.findall("tuv")
         key_source = tuvs[0].find("seg").text
@@ -68,6 +68,7 @@ def main(tmx_source_path: Path, tmx_target_path: Path, i18n_path: Path):
         if tuvs[1].find("seg").text == "\"":
             to_be_removed.append(tu)
 
+    print(f"Removed {len(to_be_removed)} elements")
     body = root.find("body")
     for tu in to_be_removed:
         body.remove(tu)

From 01d22d1ad462029939c2b4b5b8ed114d20e5b247 Mon Sep 17 00:00:00 2001
From: Jelle Spijker
Date: Wed, 5 Apr 2023 12:35:23 +0200
Subject: [PATCH 9/9] Also check on Uranium.pot

Contribute to CURA-10376
---
 scripts/fix_translation_memory.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/fix_translation_memory.py b/scripts/fix_translation_memory.py
index a8eb4678d7..610fb93ac6 100644
--- a/scripts/fix_translation_memory.py
+++ b/scripts/fix_translation_memory.py
@@ -46,7 +46,7 @@ def main(tmx_source_path: Path, tmx_target_path: Path, i18n_path: Path):
     to_be_removed = []
     for tu in root.iter("tu"):
         # TODO: also add logic for other pot files
-        if [t.text for t in tu.findall("prop") if t.attrib["type"] == "x-smartling-file"][0] not in ("cura.pot", "fdmprinter.def.json.pot", "fdmextruder.def.json.pot"):
+        if [t.text for t in tu.findall("prop") if t.attrib["type"] == "x-smartling-file"][0] not in ("cura.pot", "fdmprinter.def.json.pot", "fdmextruder.def.json.pot", "uranium.pot"):
             continue
         tuvs = tu.findall("tuv")
         key_source = tuvs[0].find("seg").text
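
Patches 8 and 9 replace the original cura.pot-only check with a whitelist read from the tu element's x-smartling-file prop. A standalone sketch of that filter follows; the sample element is invented, and the guard against a missing prop is an extra safety measure not present in the patches, which index [0] directly.

import xml.etree.ElementTree as ET

RELEVANT_POTS = ("cura.pot", "fdmprinter.def.json.pot", "fdmextruder.def.json.pot", "uranium.pot")


def is_relevant(tu: ET.Element) -> bool:
    # Same prop lookup as the patched condition.
    files = [t.text for t in tu.findall("prop") if t.attrib["type"] == "x-smartling-file"]
    return bool(files) and files[0] in RELEVANT_POTS


tu = ET.fromstring('<tu><prop type="x-smartling-file">uranium.pot</prop><tuv><seg>Layer</seg></tuv></tu>')
print(is_relevant(tu))  # -> True
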