lang: generated missing translations for all languages using translategemma:4b model

2026-04-14 00:04:15 +09:00 · 2026-03-07 18:50:27 +01:00
parent 28970e841e
commit cb0aea7118
40 changed files with 3475 additions and 3210 deletions
--- a/Kit/scripts/i18n.py
+++ b/Kit/scripts/i18n.py
@@ -1,5 +1,14 @@
 import os
 import sys
+import json
+import urllib.request
+import subprocess
+import unicodedata
+
+try:
+    import langcodes
+except Exception:
+    langcodes = None


 def dictionary(lines):
@@ -25,7 +34,8 @@ class i18n:
        self.languages = list(filter(lambda x: x.endswith(".lproj"), os.listdir(self.path)))

    def en_file(self):
-        en_file = open(f"{self.path}/en.lproj/Localizable.strings", "r").readlines()
-        if en_file is None:
-            sys.exit("English language not found.")
-        return en_file
+        """Return the lines of the English ``Localizable.strings`` file.
+
+        Exits with an error message when the file does not exist.
+        """
+        try:
+            with open(f"{self.path}/en.lproj/Localizable.strings", "r") as f:
+                return f.readlines()
+        except FileNotFoundError:
+            # readlines() never returns None, so a missing file has to be
+            # detected through the exception raised by open().
+            sys.exit("English language not found.")
@@ -35,7 +45,8 @@ class i18n:
        en_dict = dictionary(en_file)

        for lang in self.languages:
-            file = open(f"{self.path}/{lang}/Localizable.strings", "r").readlines()
+            with open(f"{self.path}/{lang}/Localizable.strings", "r") as f:
+                file = f.readlines()
            name = lang.replace(".lproj", "")
            lang_dict = dictionary(file)

@@ -43,9 +54,9 @@ class i18n:
                en_key = en_dict[v].get("key")
                if v not in lang_dict:
                    sys.exit(f"missing key `{en_key}` in `{name}` on line `{v}`")
-                lang_ley = lang_dict[v].get("key")
-                if lang_ley != en_key:
-                    sys.exit(f"missing or wrong key `{lang_ley}` in `{name}` on line `{v}`, must be `{en_key}`")
+                lang_key = lang_dict[v].get("key")
+                if lang_key != en_key:
+                    sys.exit(f"missing or wrong key `{lang_key}` in `{name}` on line `{v}`, must be `{en_key}`")

        print(f"All fine, found {len(en_file)} lines in {len(self.languages)} languages.")

@@ -59,24 +70,278 @@ class i18n:

            for lang in self.languages:
                lang_path = f"{self.path}/{lang}/Localizable.strings"
-                file = open(lang_path, "r").readlines()
+                with open(lang_path, "r") as f:
+                    file = f.readlines()
                lang_dict = dictionary(file)

                if v not in lang_dict or en_key != lang_dict[v].get("key"):
                    file.insert(v, f"\"{en_key}\" = \"{en_value}\";\n")
                    with open(lang_path, "w") as f:
-                        file = "".join(file)
-                        f.write(file)
-                        f.close()
+                        f.write("".join(file))

        self.check()

+    def _normalize_lang_code(self, code):
+        """Return *code* trimmed, without a trailing ``.lproj``, with ``-`` turned into ``_``.
+
+        ``None`` and other falsy input are treated as the empty string.
+        """
+        suffix = ".lproj"
+        cleaned = code.strip() if code else ""
+        if cleaned.endswith(suffix):
+            cleaned = cleaned[:-len(suffix)]
+        return "_".join(cleaned.split("-"))
+
+    def _extract_translation(self, raw, fallback):
+        """Pull the translated text out of a model reply, or return *fallback*.
+
+        Tried in order:
+          1. *raw* parsed as JSON (an object or a list of objects) carrying a
+             ``text`` entry marked as assistant/translation output;
+          2. a single-line reply of at most 200 characters;
+          3. the first non-empty line of a multi-line reply.
+        Candidates that start with ``{`` or ``[`` are rejected in steps 2 and 3.
+        Surrounding whitespace, ``*`` and quote characters are stripped.
+        """
+        accepted_roles = ("assistant", "translation")
+        structured_prefixes = ("{", "[")
+
+        def _tidy(value):
+            # Peel whitespace, then asterisks, then each quote style, outermost first.
+            value = (value or "").strip()
+            for chars in ("*", '"', "'"):
+                value = value.strip(chars)
+            return value.strip()
+
+        def _lowered(value, default=""):
+            return (value or default).strip().lower()
+
+        def _usable(candidate):
+            return bool(candidate) and not candidate.startswith(structured_prefixes)
+
+        def _pick(node):
+            if not isinstance(node, dict):
+                return None
+
+            node_role = _lowered(node.get("role"))
+            node_type = _lowered(node.get("type"))
+
+            top_text = node.get("text")
+            if isinstance(top_text, str) and top_text.strip():
+                if node_role in accepted_roles or node_type == "translation":
+                    return _tidy(top_text)
+
+            parts = node.get("content")
+            if not isinstance(parts, list):
+                return None
+            for part in parts:
+                if not isinstance(part, dict):
+                    continue
+                # A part without its own role inherits the enclosing object's role.
+                part_role = _lowered(part.get("role"), node_role)
+                part_type = _lowered(part.get("type"))
+                part_text = part.get("text")
+                if not (isinstance(part_text, str) and part_text.strip()):
+                    continue
+                if part_role in accepted_roles or part_type in ("translation", "text"):
+                    return _tidy(part_text)
+            return None
+
+        text = (raw or "").strip()
+        if not text:
+            return fallback
+
+        try:
+            payload = json.loads(text)
+        except json.JSONDecodeError:
+            payload = None
+
+        if isinstance(payload, dict):
+            nodes = [payload]
+        elif isinstance(payload, list):
+            nodes = payload
+        else:
+            nodes = []
+        for node in nodes:
+            found = _pick(node)
+            if found:
+                return found
+
+        if "\n" not in text and len(text) <= 200:
+            single = _tidy(text)
+            if _usable(single):
+                return single
+
+        for piece in map(_tidy, text.splitlines()):
+            if _usable(piece):
+                return piece
+
+        return fallback
+
+    def _lang_name_from_code(self, code):
+        """Return an English display name for *code*.
+
+        Returns ``"Unknown"`` for an empty code. When the optional ``langcodes``
+        module is unavailable, or the lookup fails or yields nothing, the
+        hyphenated code itself is returned.
+        """
+        tag = self._normalize_lang_code(code).replace("_", "-").strip()
+        if not tag:
+            return "Unknown"
+        if not langcodes:
+            return tag
+
+        try:
+            english_name = langcodes.get(tag).display_name("en")
+        except Exception:
+            # Best effort: any lookup failure falls back to the raw tag.
+            return tag
+        return english_name or tag
+
+    def _script_hint(self, lang_code):
+        """Return a writing-system instruction for *lang_code*, or ``""`` when none is defined.
+
+        The code is normalised and lower-cased before the lookup.
+        """
+        cyrillic = "Cyrillic script only except numbers/punctuation/brand names."
+        simplified = "Simplified Chinese characters."
+        traditional = "Traditional Chinese characters."
+
+        table = dict.fromkeys(("ru", "uk", "bg"), cyrillic)
+        table.update(dict.fromkeys(("zh_cn", "zh_hans"), simplified))
+        table.update(dict.fromkeys(("zh_tw", "zh_hant"), traditional))
+        table.update({
+            "el": "Greek script only (Α-Ω, α-ω) except numbers/punctuation/brand names.",
+            "ja": "Japanese writing system (Hiragana/Katakana/Kanji), no romaji unless required.",
+            "ko": "Korean Hangul preferred.",
+            "et": "Use Estonian only. Do not use Russian.",
+        })
+
+        wanted = self._normalize_lang_code(lang_code).lower()
+        if wanted in table:
+            return table[wanted]
+        return ""
+
+    def _ollama_translate(self, text, target_lang, model="translategemma:4b", retries=2,
+                          url="http://ai:11434/api/generate", timeout=240):
+        """Translate English *text* into *target_lang* through an Ollama generate endpoint.
+
+        Args:
+            text: English source string.
+            target_lang: Language code or ``.lproj`` directory name of the target.
+            model: Model name placed in the request payload.
+            retries: Extra attempts made after a failed request or unparsable reply.
+            url: Endpoint that receives the JSON POST request.
+            timeout: Per-request timeout in seconds.
+
+        Returns:
+            The text that ``_extract_translation`` pulls out of the reply's
+            ``response`` field; *text* itself when nothing usable is found.
+
+        Raises:
+            OSError: The final attempt still failed at the network level
+                (``urllib.error.URLError`` and socket timeouts are subclasses).
+            ValueError: The final reply body was not valid UTF-8 JSON.
+        """
+        tgt = self._normalize_lang_code(target_lang)
+        lang = self._lang_name_from_code(tgt)
+        script_hint = self._script_hint(tgt)
+        # Only add the hint (and its separating space) when there is one.
+        hint = f"{script_hint} " if script_hint else ""
+
+        prompt = (
+            f"You are a professional English (en) to {lang} ({tgt}) translator. "
+            "Your goal is to accurately convey the meaning and nuances of the original English text "
+            f"while adhering to {lang} grammar, vocabulary, and cultural sensitivities. "
+            f"Produce only the {lang} translation, without any additional explanations or commentary. "
+            "Output only the final translated text. "
+            "Do not add explanations, notes, JSON, markdown, or quotes. "
+            # Deliberately NOT an f-string: `{0}` must reach the model literally.
+            "Preserve placeholders/tokens exactly (e.g. `%@`, `%d`, `{0}`, `MB/s`). "
+            "Preserve punctuation, casing intent, and technical abbreviations. "
+            f"{hint}"
+            # A real blank line separates the instructions from the text.
+            f"Please translate the following English text into {lang}:\n\n"
+            f"{text}"
+        )
+
+        body = json.dumps({
+            "model": model,
+            "prompt": prompt,
+            "stream": False,
+        }).encode("utf-8")
+
+        last_error = None
+        for _ in range(max(retries, 0) + 1):
+            req = urllib.request.Request(
+                url,
+                data=body,
+                headers={"Content-Type": "application/json"},
+                method="POST",
+            )
+            try:
+                with urllib.request.urlopen(req, timeout=timeout) as resp:
+                    data = json.loads(resp.read().decode("utf-8"))
+            except (OSError, ValueError) as err:
+                # Network failure, timeout or malformed body: try again.
+                last_error = err
+                continue
+
+            # Tolerate a missing or null "response" field instead of crashing.
+            raw = data.get("response") if isinstance(data, dict) else None
+            return self._extract_translation((raw or "").strip(), fallback=text)
+
+        raise last_error
+
+    def _line_authors(self, file_path):
+        """Return the author names found in ``git blame --line-porcelain`` output for *file_path*.
+
+        Collects, in output order, the text following each ``author `` header
+        line. Runs git in the current working directory with stderr discarded;
+        a failing git call raises from ``subprocess.check_output``.
+        """
+        marker = "author "
+        blame = subprocess.check_output(
+            ["git", "blame", "--line-porcelain", file_path],
+            text=True,
+            cwd=os.getcwd(),
+            stderr=subprocess.DEVNULL,
+        )
+        return [
+            row[len(marker):].strip()
+            for row in blame.splitlines()
+            if row.startswith(marker)
+        ]
+
+    def _my_git_author(self):
+        """Return the configured ``git config user.name``, or ``""`` when it cannot be read."""
+        query = ["git", "config", "user.name"]
+        try:
+            configured = subprocess.check_output(query, text=True, cwd=os.getcwd())
+        except Exception:
+            # Best effort: no git, no repository or no configured name all yield "".
+            return ""
+        return configured.strip()
+
+    def _strings_escape(self, value):
+        """Escape *value* for use inside a double-quoted ``.strings`` value.
+
+        ``None`` becomes ``""``; any other value is converted with ``str()``.
+        Backslashes, double quotes, line feeds and carriage returns are escaped.
+        """
+        s = "" if value is None else str(value)
+        replacements = (
+            ("\\", "\\\\"),  # must run first so later escapes are not doubled
+            ('"', '\\"'),
+            ("\n", "\\n"),
+            # A raw carriage return is read back as a line break by
+            # readlines() and would shift every following line index.
+            ("\r", "\\r"),
+        )
+        for raw, escaped in replacements:
+            s = s.replace(raw, escaped)
+        return s
+
+    def translate(self, model="translategemma:4b", accept=False):
+        """Machine-translate untranslated entries of every non-English language file.
+
+        For each language, English entries that are missing (or whose key does
+        not match at the same line index) are inserted first. An entry is then
+        sent to the model when its value is missing or still equal to the
+        English value, unless its key is in the omit list, or ``git blame``
+        attributes the line to another author and its value differs from
+        English. Rewritten lines get a trailing ``// <model>`` comment.
+
+        Args:
+            model: Model name passed to ``_ollama_translate`` and used in the tag.
+            accept: Write changed files without asking for confirmation.
+        """
+        en_lines = self.en_file()
+        en_dict = dictionary(en_lines)
+        my_author = self._my_git_author()
+        omit_keys = ["Swap"]
+        ai_tag = f"// {model}"
+
+        target_languages = [
+            name for name in self.languages
+            if not self._normalize_lang_code(name).lower().startswith("en")
+        ]
+        total_langs = len(target_languages)
+
+        for lang_idx, lang in enumerate(target_languages, start=1):
+            lang_code = lang.replace(".lproj", "")
+            lang_name = self._lang_name_from_code(lang_code)
+            lang_path = f"{self.path}/{lang}/Localizable.strings"
+
+            with open(lang_path, "r") as f:
+                old_lines = f.readlines()
+
+            new_lines = old_lines[:]
+            lang_dict = dictionary(old_lines)
+            changed = False
+
+            try:
+                authors = self._line_authors(lang_path)
+            except Exception:
+                # Blame unavailable: fall back to blank author names.
+                authors = [""] * len(old_lines)
+
+            candidates = []
+            for i, en_item in en_dict.items():
+                en_key = en_item.get("key")
+                en_value = en_item.get("value")
+
+                translate_item = lang_dict.get(i)
+                translate_value = translate_item.get("value") if translate_item else None
+
+                if translate_item is None or translate_item.get("key") != en_key:
+                    # insert() appends when i is past the end of the list.
+                    new_lines.insert(i, f"\"{en_key}\" = \"{en_value}\";\n")
+                    if i <= len(authors):
+                        authors.insert(i, my_author)
+                    changed = True
+                    # The inserted English line is itself a translation candidate.
+                    translate_value = en_value
+                    # Re-index so later lookups see the shifted lines; otherwise a
+                    # single missing line makes every following line look
+                    # mismatched and get a duplicate English insert.
+                    # NOTE(review): assumes dictionary() keys entries by line
+                    # index, as the insert positions here rely on - confirm.
+                    lang_dict = dictionary(new_lines)
+
+                if en_key in omit_keys:
+                    continue
+                if i < len(authors) and my_author and authors[i] != my_author and en_value != translate_value:
+                    continue
+
+                if translate_value is None or translate_value == en_value:
+                    candidates.append((i, en_key, en_value))
+
+            print(f"Candidates for translation in {lang_name} ({lang_code}): {len(candidates)}")
+
+            for idx, (i, en_key, en_value) in enumerate(candidates, start=1):
+                translated = self._ollama_translate(en_value, lang_code, model=model)
+                safe_translated = self._strings_escape(translated)
+                print(f"[{lang_name} {lang_idx}/{total_langs}] {idx}/{len(candidates)} {en_key} -> {safe_translated}")
+
+                translated_line = f"\"{en_key}\" = \"{safe_translated}\";\n"
+                update_line = f"\"{en_key}\" = \"{safe_translated}\"; {ai_tag}\n"
+                if i < len(new_lines):
+                    # Leave a line alone when it already holds this exact translation.
+                    if new_lines[i] != translated_line:
+                        new_lines[i] = update_line
+                        changed = True
+                else:
+                    new_lines.append(update_line)
+                    changed = True
+
+            if not changed:
+                print(f"No changes for {lang_name} ({lang_code}).")
+                continue
+
+            if accept:
+                save = True
+            else:
+                answer = input(f"Save changes to {lang_path}? [Y/n]: ").strip().lower()
+                save = answer in ("", "y", "yes")
+
+            if not save:
+                print(f"Skipped: {lang_path}")
+                continue
+
+            with open(lang_path, "w") as f:
+                f.write("".join(new_lines))
+            print(f"Saved: {lang_path}")
+
+        print("Translation completed.")
+

 if __name__ == "__main__":
    i18n = i18n()
-    if len(sys.argv) >= 2 and sys.argv[1] == "fix":
+    # `--accept` may appear anywhere on the command line; the first remaining
+    # argument selects the command, and anything else runs the check.
+    args = sys.argv[1:]
+    accept = "--accept" in args
+    args = [a for a in args if a != "--accept"]
+    command = args[0] if args else None
+
+    if command == "fix":
        print("running fix command...")
        i18n.fix()
+    elif command == "translate":
+        print("running translate command...")
+        i18n.translate(accept=accept)
    else:
        print("running check command...")
        i18n.check()