lang: generated missing translations for all languages using translategemma:4b model

This commit is contained in:
Serhiy Mytrovtsiy
2026-03-07 18:50:27 +01:00
parent 28970e841e
commit cb0aea7118
40 changed files with 3475 additions and 3210 deletions

View File

@@ -1,5 +1,14 @@
import os
import sys
import json
import urllib.request
import subprocess
import unicodedata
try:
import langcodes
except Exception:
langcodes = None
def dictionary(lines):
@@ -25,7 +34,8 @@ class i18n:
self.languages = list(filter(lambda x: x.endswith(".lproj"), os.listdir(self.path)))
def en_file(self):
en_file = open(f"{self.path}/en.lproj/Localizable.strings", "r").readlines()
with open(f"{self.path}/en.lproj/Localizable.strings", "r") as f:
en_file = f.readlines()
if en_file is None:
sys.exit("English language not found.")
return en_file
@@ -35,7 +45,8 @@ class i18n:
en_dict = dictionary(en_file)
for lang in self.languages:
file = open(f"{self.path}/{lang}/Localizable.strings", "r").readlines()
with open(f"{self.path}/{lang}/Localizable.strings", "r") as f:
file = f.readlines()
name = lang.replace(".lproj", "")
lang_dict = dictionary(file)
@@ -43,9 +54,9 @@ class i18n:
en_key = en_dict[v].get("key")
if v not in lang_dict:
sys.exit(f"missing key `{en_key}` in `{name}` on line `{v}`")
lang_ley = lang_dict[v].get("key")
if lang_ley != en_key:
sys.exit(f"missing or wrong key `{lang_ley}` in `{name}` on line `{v}`, must be `{en_key}`")
lang_key = lang_dict[v].get("key")
if lang_key != en_key:
sys.exit(f"missing or wrong key `{lang_key}` in `{name}` on line `{v}`, must be `{en_key}`")
print(f"All fine, found {len(en_file)} lines in {len(self.languages)} languages.")
@@ -59,24 +70,278 @@ class i18n:
for lang in self.languages:
lang_path = f"{self.path}/{lang}/Localizable.strings"
file = open(lang_path, "r").readlines()
with open(lang_path, "r") as f:
file = f.readlines()
lang_dict = dictionary(file)
if v not in lang_dict or en_key != lang_dict[v].get("key"):
file.insert(v, f"\"{en_key}\" = \"{en_value}\";\n")
with open(lang_path, "w") as f:
file = "".join(file)
f.write(file)
f.close()
f.write("".join(file))
self.check()
def _normalize_lang_code(self, code):
code = (code or "").strip()
if code.endswith(".lproj"):
code = code[:-6]
return code.replace("-", "_")
def _extract_translation(self, raw, fallback):
raw = (raw or "").strip()
if not raw:
return fallback
def _clean(s):
return (s or "").strip().strip("*").strip('"').strip("'").strip()
def _from_dict(obj):
if not isinstance(obj, dict):
return None
role = (obj.get("role") or "").strip().lower()
obj_type = (obj.get("type") or "").strip().lower()
text = obj.get("text")
if isinstance(text, str) and text.strip():
if role in ("assistant", "translation") or obj_type == "translation":
return _clean(text)
content = obj.get("content")
if isinstance(content, list):
for item in content:
if not isinstance(item, dict):
continue
item_role = (item.get("role") or role).strip().lower()
item_type = (item.get("type") or "").strip().lower()
t = item.get("text")
if isinstance(t, str) and t.strip():
if item_role in ("assistant", "translation") or item_type in ("translation", "text"):
return _clean(t)
return None
try:
parsed = json.loads(raw)
if isinstance(parsed, dict):
hit = _from_dict(parsed)
if hit:
return hit
elif isinstance(parsed, list):
for item in parsed:
hit = _from_dict(item)
if hit:
return hit
except json.JSONDecodeError:
pass
if "\n" not in raw and len(raw) <= 200:
candidate = _clean(raw)
if candidate and not candidate.startswith("{") and not candidate.startswith("["):
return candidate
for line in raw.splitlines():
line = _clean(line)
if line and not line.startswith("{") and not line.startswith("["):
return line
return fallback
def _lang_name_from_code(self, code):
c = self._normalize_lang_code(code).replace("_", "-").strip()
if not c:
return "Unknown"
if langcodes:
try:
name = langcodes.get(c).display_name("en")
if name:
return name
except Exception:
pass
return c
def _script_hint(self, lang_code):
lang = self._normalize_lang_code(lang_code).lower()
hints = {
"el": "Greek script only (Α-Ω, α-ω) except numbers/punctuation/brand names.",
"ru": "Cyrillic script only except numbers/punctuation/brand names.",
"uk": "Cyrillic script only except numbers/punctuation/brand names.",
"bg": "Cyrillic script only except numbers/punctuation/brand names.",
"ja": "Japanese writing system (Hiragana/Katakana/Kanji), no romaji unless required.",
"zh_cn": "Simplified Chinese characters.",
"zh_hans": "Simplified Chinese characters.",
"zh_tw": "Traditional Chinese characters.",
"zh_hant": "Traditional Chinese characters.",
"ko": "Korean Hangul preferred.",
"et": "Use Estonian only. Do not use Russian.",
}
return hints.get(lang, "")
def _ollama_translate(self, text, target_lang, model="translategemma:4b", retries=2):
url = "http://ai:11434/api/generate"
tgt = self._normalize_lang_code(target_lang)
lang = self._lang_name_from_code(tgt)
script_hint = self._script_hint(tgt)
prompt = (
f"You are a professional English (en) to {lang} ({tgt}) translator. Your goal is to accurately convey the meaning and nuances of the original English text while adhering to {lang} grammar, vocabulary, and cultural sensitivities. Produce only the {lang} translation, without any additional explanations or commentary. Output only the final translated text. Do not add explanations, notes, JSON, markdown, or quotes. Preserve placeholders/tokens exactly \\(e\\.g\\. `%@`, `%d`, `{0}`, `MB/s`\\). Preserve punctuation, casing intent, and technical abbreviations. {script_hint} Please translate the following English text into {lang}:\\n\\n"
f"{text}"
)
payload = {
"model": model,
"prompt": prompt,
"stream": False,
}
req = urllib.request.Request(
url,
data=json.dumps(payload).encode("utf-8"),
headers={"Content-Type": "application/json"},
method="POST"
)
with urllib.request.urlopen(req, timeout=240) as resp:
data = json.loads(resp.read().decode("utf-8"))
raw = data.get("response", "").strip()
return self._extract_translation(raw, fallback=text)
def _line_authors(self, file_path):
cmd = ["git", "blame", "--line-porcelain", file_path]
out = subprocess.check_output(cmd, text=True, cwd=os.getcwd(), stderr=subprocess.DEVNULL)
authors = []
for line in out.splitlines():
if line.startswith("author "):
authors.append(line[len("author "):].strip())
return authors
def _my_git_author(self):
try:
return subprocess.check_output(
["git", "config", "user.name"],
text=True,
cwd=os.getcwd()
).strip()
except Exception:
return ""
def _strings_escape(self, value):
s = "" if value is None else str(value)
s = s.replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")
return s
def translate(self, model="translategemma:4b", accept=False):
en_lines = self.en_file()
en_dict = dictionary(en_lines)
my_author = self._my_git_author()
omit_keys = ["Swap"]
ai_tag = f"// {model}"
target_languages = [
l for l in self.languages
if not self._normalize_lang_code(l).lower().startswith("en")
# if self._normalize_lang_code(l).lower() in ("sk")
]
total_langs = len(target_languages)
for lang_idx, lang in enumerate(target_languages, start=1):
lang_code = lang.replace(".lproj", "")
lang_name = self._lang_name_from_code(lang_code)
lang_path = f"{self.path}/{lang}/Localizable.strings"
with open(lang_path, "r") as f:
old_lines = f.readlines()
new_lines = old_lines[:]
lang_dict = dictionary(old_lines)
changed = False
try:
authors = self._line_authors(lang_path)
except Exception:
authors = [""] * len(old_lines)
candidates = []
for i, en_item in en_dict.items():
en_key = en_item.get("key")
en_value = en_item.get("value")
translate_item = lang_dict.get(i)
translate_key = translate_item.get("key") if translate_item else None
translate_value = translate_item.get("value") if translate_item else None
if translate_item is None or translate_key != en_key:
line = f"\"{en_key}\" = \"{en_value}\";\n"
if i < len(new_lines):
new_lines.insert(i, line)
else:
new_lines.append(line)
if i <= len(authors):
authors.insert(i, my_author)
changed = True
translate_value = en_value
if translate_key != en_key:
continue
if en_key in omit_keys:
continue
if i < len(authors) and my_author and authors[i] != my_author and en_value != translate_value:
continue
if translate_value is None or translate_value == en_value:
candidates.append((i, en_key, en_value))
print("Candidates for translation in {} ({}): {}".format(lang_name, lang_code, len(candidates)))
for idx, (i, en_key, en_value) in enumerate(candidates, start=1):
translated = self._ollama_translate(en_value, lang_code, model=model)
safe_translated = self._strings_escape(translated)
print(f"[{lang_name} {lang_idx}/{total_langs}] {idx}/{len(candidates)} {en_key} -> {safe_translated}")
translated_line = f"\"{en_key}\" = \"{safe_translated}\";\n"
update_line = f"\"{en_key}\" = \"{safe_translated}\"; {ai_tag}\n"
if i < len(new_lines):
if new_lines[i] != translated_line:
new_lines[i] = update_line
changed = True
else:
new_lines.append(update_line)
changed = True
if not changed:
print(f"No changes for {lang_code} ({lang_code}).")
continue
if accept:
with open(lang_path, "w") as f:
f.write("".join(new_lines))
print(f"Saved: {lang_path}")
else:
answer = input(f"Save changes to {lang_path}? [Y/n]: ").strip().lower()
if answer in ("", "y", "yes"):
with open(lang_path, "w") as f:
f.write("".join(new_lines))
print(f"Saved: {lang_path}")
else:
print(f"Skipped: {lang_path}")
print("Translation completed.")
if __name__ == "__main__":
i18n = i18n()
args = sys.argv[1:]
accept = "--accept" in args
args = [a for a in args if a != "--accept"]
if len(sys.argv) >= 2 and sys.argv[1] == "fix":
print("running fix command...")
i18n.fix()
elif len(sys.argv) >= 2 and sys.argv[1] == "translate":
print("running translate command...")
i18n.translate(accept=accept)
else:
print("running check command...")
i18n.check()