meshcore-open/tools/translate.py
2026-02-01 16:57:17 -07:00

542 lines
20 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
translate.py
Translates ARB/JSON localization files using TranslateGemma via Ollama.
Preserves placeholders like {deviceName} and ICU plural/select formats.
Usage:
# Translate all strings:
python translate.py --in lib/l10n/app_en.arb --out lib/l10n/app_es.arb --to-locale es
# Translate only missing strings:
python translate.py --in lib/l10n/app_en.arb --out lib/l10n/app_es.arb --to-locale es --missing-only
# Translate all locales (missing strings only):
python translate.py --in lib/l10n/app_en.arb --l10n-dir lib/l10n --missing-only
"""
import argparse
import json
import os
import re
import sys
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass
from typing import Any, Dict, List, Tuple, Optional
from urllib import request
# Placeholder patterns
# SIMPLE_PLACEHOLDER_RE matches a bare `{name}` token and captures the name.
SIMPLE_PLACEHOLDER_RE = re.compile(r"\{(\w+)\}")
# ICU_VAR_RE matches the opening of an ICU block such as `{count, plural,` and
# captures the variable name; the keyword is matched case-insensitively.
ICU_VAR_RE = re.compile(r"\{(\w+)\s*,\s*(?:plural|select|selectordinal)\s*,", re.IGNORECASE)
@dataclass
class OllamaConfig:
    """Settings for calling one model on an Ollama server."""

    # Base URL of the server; ollama_generate() appends "/api/generate".
    host: str
    # Model name sent in the request payload.
    model: str
    # Timeout, in seconds, applied to the HTTP request.
    timeout_s: float
    # Sampling temperature sent in the request "options".
    temperature: float
# Language mapping (locale_code -> (language_name, translategemma_code))
# The language name and the model code are both interpolated into the prompt.
# Locales that are not listed fall back to using the locale code for both
# values (see the LOCALE_MAP.get(...) calls in main()).
LOCALE_MAP = {
    "es": ("Spanish", "es"),
    "fr": ("French", "fr"),
    "de": ("German", "de"),
    "it": ("Italian", "it"),
    "pt": ("Portuguese", "pt"),
    "pt-BR": ("Brazilian Portuguese", "pt"),
    "ja": ("Japanese", "ja"),
    "ko": ("Korean", "ko"),
    "zh": ("Chinese", "zh-Hans"),
    "zh-Hant": ("Chinese", "zh-Hant"),
    "ru": ("Russian", "ru"),
    "uk": ("Ukrainian", "uk"),
    "ar": ("Arabic", "ar"),
    "hi": ("Hindi", "hi"),
    "tr": ("Turkish", "tr"),
    "nl": ("Dutch", "nl"),
    "sv": ("Swedish", "sv"),
    "no": ("Norwegian", "no"),
    "da": ("Danish", "da"),
    "fi": ("Finnish", "fi"),
    "pl": ("Polish", "pl"),
    "cs": ("Czech", "cs"),
    "sk": ("Slovak", "sk"),
    "sl": ("Slovenian", "sl"),
    "bg": ("Bulgarian", "bg"),
    "el": ("Greek", "el"),
    "he": ("Hebrew", "he"),
    "th": ("Thai", "th"),
    "vi": ("Vietnamese", "vi"),
    "id": ("Indonesian", "id"),
}
# Keys to skip translation
# is_translatable_entry() returns False for every key listed here, so these
# values are never sent to the model.
SKIP_KEYS = {"appTitle"}
# Manual translations for complex strings.
# Keyed by ARB message key, then by target locale. translate_locale() writes
# these values verbatim instead of sending the string to the model.
MANUAL_TRANSLATIONS: Dict[str, Dict[str, str]] = {
    "repeater_daysHoursMinsSecs": {
        "es": "{days} días {hours}h {minutes}m {seconds}s",
        "fr": "{days} jours {hours}h {minutes}m {seconds}s",
        "de": "{days} Tage {hours}h {minutes}m {seconds}s",
        "it": "{days} giorni {hours}h {minutes}m {seconds}s",
        "pt": "{days} dias {hours}h {minutes}m {seconds}s",
        "pl": "{days} dni {hours}h {minutes}m {seconds}s",
        "sk": "{days} dní {hours}h {minutes}m {seconds}s",
        "sl": "{days} dni {hours}h {minutes}m {seconds}s",
        "cs": "{days} dní {hours}h {minutes}m {seconds}s",
        # Every placeholder needs its own unit suffix; without one the
        # rendered numbers run together (e.g. "3" directly followed by "5").
        "ja": "{days}日 {hours}時間 {minutes}分 {seconds}秒",
        "ko": "{days}일 {hours}시간 {minutes}분 {seconds}초",
        "zh": "{days}天 {hours}小时 {minutes}分 {seconds}秒",
        "ru": "{days} дней {hours}ч {minutes}м {seconds}с",
        "bg": "{days} дни {hours}ч {minutes}м {seconds}с",
        "nl": "{days} dagen {hours}u {minutes}m {seconds}s",
        "sv": "{days} dagar {hours}t {minutes}m {seconds}s",
    },
}
def http_post_json(url: str, payload: Dict[str, Any], timeout_s: float) -> Dict[str, Any]:
    """POST *payload* as JSON to *url* and return the decoded JSON reply.

    The body is UTF-8 encoded JSON sent with a ``Content-Type:
    application/json`` header. Errors raised by ``urllib`` or ``json``
    propagate to the caller.
    """
    body = json.dumps(payload).encode("utf-8")
    post = request.Request(
        url,
        data=body,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with request.urlopen(post, timeout=timeout_s) as reply:
        raw = reply.read()
    return json.loads(raw.decode("utf-8"))
def ollama_generate(cfg: OllamaConfig, prompt: str) -> str:
    """Send *prompt* to the Ollama ``/api/generate`` endpoint and return its text.

    The request is non-streaming and carries the model name and temperature
    from *cfg*. The ``response`` field of the reply is returned with
    surrounding whitespace removed; a reply without that field yields ``""``.
    """
    endpoint = f"{cfg.host.rstrip('/')}/api/generate"
    body = {
        "model": cfg.model,
        "prompt": prompt,
        "stream": False,
        "options": {"temperature": cfg.temperature},
    }
    reply = http_post_json(endpoint, body, cfg.timeout_s)
    generated = reply.get("response", "")
    return generated.strip()
def extract_placeholder_names(s: str) -> List[str]:
    """Return the sorted, de-duplicated placeholder variable names used in *s*.

    Two kinds of names are reported:

    * the variable of every ICU block, e.g. ``count`` in ``{count, plural, ...}``;
    * every simple ``{name}`` placeholder, including those nested inside the
      text of an ICU form (``other{{count} items}``).

    A braced word that sits directly inside an ICU block (``one{item}``,
    ``male{he}``) is the *text of a form*, not a placeholder, and is skipped.
    This is decided from brace nesting rather than from the word that precedes
    the brace, so ordinary sentences such as ``"Select one {name}"`` keep
    their placeholder and arbitrary ``select`` keys are handled correctly.
    """
    names = set()
    # One entry per currently open brace: True if that brace opened an ICU block.
    open_braces: List[bool] = []
    for pos, ch in enumerate(s):
        if ch == "{":
            icu = ICU_VAR_RE.match(s, pos)
            if icu:
                names.add(icu.group(1))
                open_braces.append(True)
                continue
            # A brace whose direct parent is an ICU block opens a form body.
            opens_form_body = bool(open_braces) and open_braces[-1]
            if not opens_form_body:
                simple = SIMPLE_PLACEHOLDER_RE.match(s, pos)
                if simple:
                    names.add(simple.group(1))
            open_braces.append(False)
        elif ch == "}" and open_braces:
            # Stray closing braces in malformed input are ignored.
            open_braces.pop()
    return sorted(names)
def has_icu_block(s: str) -> bool:
    """Report whether *s* contains the opening of an ICU plural/select block."""
    return ICU_VAR_RE.search(s) is not None
def build_prompt(text: str, target_lang: str, target_code: str, placeholder_names: List[str], has_icu: bool) -> str:
    """Build a TranslateGemma-style prompt with placeholder-preservation rules.

    Args:
        text: English source string to translate.
        target_lang: Human-readable target language name (e.g. ``"Spanish"``).
        target_code: Language code shown next to the name (e.g. ``"es"``).
        placeholder_names: Placeholder names that must survive unchanged;
            each is listed in the prompt as ``{name}``.
        has_icu: When true, an instruction to keep the ICU structure is added.

    Returns:
        The prompt. Any instruction lines follow the "Produce only ..." line,
        and the source text is separated from the final instruction line by
        two blank lines.
    """
    # Build instructions for placeholder preservation
    instructions = []
    if placeholder_names:
        placeholders = ", ".join(f"{{{name}}}" for name in placeholder_names)
        instructions.append(f"CRITICAL: Keep these placeholders EXACTLY as they appear: {placeholders}")
    if has_icu:
        instructions.append("CRITICAL: Preserve ICU message format structure (plural, select, =0, =1, other, etc.). Only translate the text inside the forms.")

    # Instructions go into the preamble, never into the text being translated.
    preamble = [
        f"You are a professional English (en) to {target_lang} ({target_code}) translator. "
        f"Your goal is to accurately convey the meaning and nuances of the original English text "
        f"while adhering to {target_lang} grammar, vocabulary, and cultural sensitivities.",
        f"Produce only the {target_lang} translation, without any additional explanations or commentary.",
    ]
    preamble.extend(instructions)
    preamble.append(f"Please translate the following English text into {target_lang}:")

    # TranslateGemma expects two blank lines before the text. They are spelled
    # out as explicit newlines so that reformatting cannot drop them.
    return "\n".join(preamble) + "\n\n\n" + text
def validate_preserved_tokens(src: str, out: str) -> Tuple[bool, Optional[str]]:
    """Check that *out* still carries every placeholder found in *src*.

    Returns ``(True, None)`` when all placeholder names from *src* appear in
    *out* (as ``{name}`` or as ``{name,``) and an ICU block present in *src*
    is also present in *out*; otherwise ``(False, reason)``.
    """
    for placeholder in extract_placeholder_names(src):
        token_re = r"\{" + re.escape(placeholder) + r"(?:\}|\s*,)"
        if re.search(token_re, out) is None:
            return False, f"Missing placeholder: {{{placeholder}}}"

    icu_lost = has_icu_block(src) and not has_icu_block(out)
    if icu_lost:
        return False, "ICU plural/select block missing"
    return True, None
def translate_one(
    key: str,
    text: str,
    target_lang: str,
    target_code: str,
    cfg: OllamaConfig,
    retries: int,
    backoff_s: float,
    fallback_cfg: Optional[OllamaConfig] = None,
) -> Tuple[str, str, Optional[str], bool]:
    """Translate a single string, retrying and optionally falling back.

    The primary model is tried ``retries + 1`` times (at least once), sleeping
    ``backoff_s * attempt_number`` between attempts. An attempt fails when the
    request raises, when the model returns an empty string, or when
    placeholder validation fails. If every attempt fails and *fallback_cfg* is
    given, the fallback model gets one attempt with the same prompt.

    Returns:
        ``(key, translated_text, error_or_none, used_fallback)``. On failure
        ``translated_text`` is the original *text* and ``error_or_none``
        describes the last primary error plus the fallback error, if any.
        This function never raises for a failed translation, so it is safe to
        run in a worker thread.
    """
    placeholder_names = extract_placeholder_names(text)
    text_has_icu = has_icu_block(text)
    prompt = build_prompt(text, target_lang, target_code, placeholder_names, text_has_icu)

    def attempt_with(model_cfg: OllamaConfig) -> Tuple[Optional[str], Optional[str]]:
        """Run one generation; return (output, None) or (None, reason)."""
        try:
            out = ollama_generate(model_cfg, prompt)
        except Exception as e:
            # Boundary catch: any transport/decoding error is reported for
            # this string instead of aborting the whole run.
            return None, str(e)
        if not out:
            # An empty reply would otherwise pass validation for strings
            # without placeholders and be stored as the translation.
            return None, "Validation failed: empty response from model"
        # Validate placeholders
        ok, why = validate_preserved_tokens(text, out)
        if not ok:
            return None, f"Validation failed: {why}"
        return out, None

    last_err: Optional[str] = None
    max_retries = max(0, retries)
    for attempt in range(max_retries + 1):
        out, last_err = attempt_with(cfg)
        if out is not None:
            return key, out, None, False
        if attempt < max_retries:
            time.sleep(backoff_s * (attempt + 1))

    # Try fallback model if available
    if fallback_cfg:
        fallback_out, fallback_err = attempt_with(fallback_cfg)
        if fallback_out is not None:
            return key, fallback_out, None, True
        last_err = f"{last_err}; fallback ({fallback_cfg.model}): {fallback_err}"

    # Fallback to original
    return key, text, last_err, False
def is_translatable_entry(key: str, value: Any) -> bool:
    """Decide whether the ``key``/``value`` pair should be sent for translation.

    Keys starting with ``@`` (which includes ``@@locale``) and keys listed in
    ``SKIP_KEYS`` are rejected, as are values that are not strings or are
    blank after stripping whitespace.
    """
    if key.startswith("@") or key in SKIP_KEYS:
        return False
    if not isinstance(value, str):
        return False
    return bool(value.strip())
def find_missing_keys(source_data: Dict[str, Any], target_data: Dict[str, Any]) -> List[str]:
    """List source keys that the target lacks or holds only as a blank string.

    Keys starting with ``@`` (metadata and ``@@locale``) are ignored. The
    result keeps the iteration order of *source_data*.
    """

    def absent(key: str) -> bool:
        if key not in target_data:
            return True
        current = target_data.get(key)
        return isinstance(current, str) and current.strip() == ""

    return [key for key in source_data if not key.startswith("@") and absent(key)]
def get_all_locale_files(l10n_dir: str, template_file: str) -> List[Tuple[str, str]]:
    """Collect the ``app_<locale>.arb`` files in *l10n_dir*, minus the template.

    Returns a sorted list of ``(locale_code, file_path)`` pairs, where the
    locale code is the file name without its ``app_`` prefix and ``.arb``
    suffix. The template is excluded by comparing base names.
    """
    skip_name = os.path.basename(template_file)
    found = [
        (entry[4:-4], os.path.join(l10n_dir, entry))  # app_es.arb -> es
        for entry in os.listdir(l10n_dir)
        if entry.endswith(".arb") and entry != skip_name and entry.startswith("app_")
    ]
    found.sort()
    return found
def fmt_duration(seconds: float) -> str:
    """Format a duration in seconds as a short human-readable string.

    * under a minute: one decimal place, e.g. ``"5.0s"``
    * under an hour: ``"<m>m <s>s"``
    * otherwise: ``"<h>h <m>m"``

    The value is rounded *before* it is split into units, so a carry moves
    into the next unit instead of producing ``"1m 60s"`` or ``"60.0s"``.
    """
    # Compare the value as it would be displayed, not the raw float.
    if round(seconds, 1) < 60:
        return f"{seconds:.1f}s"
    total = int(round(seconds))
    minutes, secs = divmod(total, 60)
    if minutes < 60:
        return f"{minutes}m {secs}s"
    hours, minutes = divmod(minutes, 60)
    return f"{hours}h {minutes}m"
def translate_locale(
    source_data: Dict[str, Any],
    target_data: Dict[str, Any],
    target_locale: str,
    target_lang: str,
    target_code: str,
    out_path: str,
    args,
    missing_keys: Optional[List[str]] = None,
) -> int:
    """Translate one locale and write the resulting ARB file.

    Args:
        source_data: Parsed template ARB (English).
        target_data: Parsed existing target ARB. When empty, the output
            starts as a copy of *source_data*.
        target_locale: Value written to ``@@locale``; also used to look up
            manual translations.
        target_lang: Language name used in the prompt.
        target_code: Language code used in the prompt.
        out_path: File to write (skipped when ``args.dry_run`` is set).
        args: Parsed CLI namespace; reads host, model, fallback_model,
            timeout, temperature, concurrency, retries, backoff,
            progress_every and dry_run.
        missing_keys: When given, only these keys are processed and their
            ``@key`` metadata is copied from the source. Keys that are not
            translatable (skipped keys, non-string values) are copied
            verbatim so they stop being reported as missing.

    Returns:
        The number of strings handled successfully (manual + model), ``0``
        when there was nothing to do, or ``-1`` if the output file could not
        be written. Strings the model failed on keep their English text.
    """
    cfg = OllamaConfig(
        host=args.host,
        model=args.model,
        timeout_s=args.timeout,
        temperature=args.temperature,
    )
    fallback_cfg = None
    if args.fallback_model:
        fallback_cfg = OllamaConfig(
            host=args.host,
            model=args.fallback_model,
            timeout_s=args.timeout,
            temperature=args.temperature,
        )

    # Start with target data or source data
    out_data: Dict[str, Any] = dict(target_data) if target_data else dict(source_data)
    out_data["@@locale"] = target_locale

    # Build list of items to translate
    copied_count = 0
    items: List[Tuple[str, str]] = []
    if missing_keys is not None:
        for key in missing_keys:
            value = source_data.get(key)
            if is_translatable_entry(key, value):
                items.append((key, value))
            elif key in source_data and (key not in out_data or out_data[key] != value):
                # Not sent to the model, but still has to reach the target
                # file or it would be reported as missing on every run.
                out_data[key] = value
                copied_count += 1
            # Copy metadata for missing items
            meta_key = f"@{key}"
            if meta_key in source_data:
                out_data[meta_key] = source_data[meta_key]
    else:
        items = [(k, v) for k, v in source_data.items() if is_translatable_entry(k, v)]

    # Apply manual translations
    manual_count = 0
    items_to_translate: List[Tuple[str, str]] = []
    for k, v in items:
        manual = MANUAL_TRANSLATIONS.get(k, {})
        if target_locale in manual:
            out_data[k] = manual[target_locale]
            manual_count += 1
        else:
            items_to_translate.append((k, v))
    if manual_count > 0:
        print(f"Applied {manual_count} manual translation(s)")
    if copied_count > 0:
        print(f"Copied {copied_count} non-translatable value(s) from source")

    total = len(items_to_translate)
    if total == 0 and manual_count == 0 and copied_count == 0:
        # Nothing changed, so there is nothing to write.
        return 0

    translated_ok = manual_count
    if total == 0:
        # No model work, but the manual/copied values above must still be
        # written below.
        if manual_count > 0:
            print("All strings handled by manual translations.")
    else:
        fallback_info = f" (fallback: {args.fallback_model})" if args.fallback_model else ""
        print(f"Translating {total} strings -> {target_lang} using {cfg.model}{fallback_info} (concurrency={args.concurrency})")
        start = time.time()
        failures: List[Tuple[str, str]] = []
        fallback_used = 0
        completed = 0
        # Guard against --progress-every 0 (modulo by zero).
        progress_every = max(1, args.progress_every)
        with ThreadPoolExecutor(max_workers=max(1, args.concurrency)) as ex:
            future_to_key = {
                ex.submit(
                    translate_one,
                    key=k,
                    text=v,
                    target_lang=target_lang,
                    target_code=target_code,
                    cfg=cfg,
                    retries=args.retries,
                    backoff_s=args.backoff,
                    fallback_cfg=fallback_cfg,
                ): k
                for (k, v) in items_to_translate
            }
            for fut in as_completed(future_to_key):
                k, translated, err, used_fallback = fut.result()
                out_data[k] = translated
                completed += 1
                if err:
                    failures.append((k, err))
                    status = "FAIL"
                else:
                    translated_ok += 1
                    if used_fallback:
                        fallback_used += 1
                        status = "OK*"
                    else:
                        status = "OK"
                if completed % progress_every == 0 or completed == total:
                    elapsed = time.time() - start
                    rate = completed / elapsed if elapsed > 0 else 0.0
                    remaining = (total - completed) / rate if rate > 0 else 0.0
                    print(f"[{completed:>4}/{total}] {status:<4} {k} | elapsed {fmt_duration(elapsed)} | ETA {fmt_duration(remaining)}")

        elapsed = time.time() - start
        fallback_msg = f", fallback_used={fallback_used}" if fallback_used > 0 else ""
        print(f"Done in {fmt_duration(elapsed)}. OK={translated_ok}{fallback_msg}, errors={len(failures)}")
        if failures:
            print(f"{len(failures)} translation(s) kept original English:")
            for k, err in failures[:20]:
                print(f" - {k}: {err}")
            if len(failures) > 20:
                print(f" ... and {len(failures) - 20} more")

    if args.dry_run:
        print("Dry run: not writing output file.")
        return translated_ok

    try:
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(out_data, f, ensure_ascii=False, indent=2)
            f.write("\n")
    except (OSError, TypeError, ValueError) as e:
        print(f"Failed to write output: {e}", file=sys.stderr)
        return -1
    print(f"Wrote: {out_path}")
    return translated_ok
def main() -> int:
    """Command-line entry point.

    Runs in one of two modes:

    * ``--l10n-dir``: process every ``app_<locale>.arb`` in the directory
      (except the template), writing each file in place.
    * ``--out`` + ``--to-locale``: process a single target file.

    Returns:
        ``0`` on success; ``1`` for usage errors, when no locale files are
        found, or when any locale could not be read or written; ``2`` when
        the input file (or the single-mode target file) cannot be read or is
        not a JSON object.
    """
    ap = argparse.ArgumentParser(description="Translate ARB files using TranslateGemma")
    ap.add_argument("--in", dest="in_path", required=True, help="Input .arb file (source/template)")
    ap.add_argument("--out", dest="out_path", help="Output .arb file (required unless using --l10n-dir)")
    ap.add_argument("--to-locale", help="Target locale code (es, fr, de, etc.)")
    ap.add_argument("--l10n-dir", help="Directory with locale files (translates all locales)")
    ap.add_argument("--missing-only", action="store_true", help="Only translate missing keys")
    ap.add_argument("--model", default="translategemma:latest", help="Ollama model (translategemma:latest or specific versions)")
    ap.add_argument("--fallback-model", help="Fallback model for failed translations (e.g., translategemma:27b)")
    ap.add_argument("--host", default="http://localhost:11434", help="Ollama host")
    ap.add_argument("--timeout", type=float, default=120.0, help="HTTP timeout seconds")
    ap.add_argument("--temperature", type=float, default=0.0, help="Model temperature (0.0 for deterministic)")
    ap.add_argument("--concurrency", type=int, default=4, help="Parallel requests")
    ap.add_argument("--retries", type=int, default=2, help="Retries per string")
    ap.add_argument("--backoff", type=float, default=0.6, help="Backoff seconds base")
    ap.add_argument("--dry-run", action="store_true", help="Don't write output")
    ap.add_argument("--progress-every", type=int, default=1, help="Print progress every N strings")
    args = ap.parse_args()

    def language_for(locale: str) -> Tuple[str, str]:
        """Resolve a locale code to ``(language_name, model_code)``.

        File names spell regional locales with underscores (``pt_BR``) while
        LOCALE_MAP uses hyphens (``pt-BR``), so both spellings are tried
        before falling back to the raw code.
        """
        hyphenated = locale.replace("_", "-")
        return LOCALE_MAP.get(locale) or LOCALE_MAP.get(hyphenated) or (locale, locale)

    def load_target(path: str) -> Dict[str, Any]:
        """Load a target ARB file and insist on a top-level JSON object."""
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        if not isinstance(data, dict):
            raise ValueError("JSON must be an object at top-level")
        return data

    # Read source file
    try:
        with open(args.in_path, "r", encoding="utf-8") as f:
            source_data = json.load(f)
    except (OSError, ValueError) as e:
        print(f"Failed to read input: {e}", file=sys.stderr)
        return 2
    if not isinstance(source_data, dict):
        print("Input JSON must be an object at top-level.", file=sys.stderr)
        return 2

    # Process all locales if --l10n-dir is provided
    if args.l10n_dir:
        locales = get_all_locale_files(args.l10n_dir, args.in_path)
        if not locales:
            print(f"No locale files found in {args.l10n_dir}", file=sys.stderr)
            return 1
        print(f"Found {len(locales)} locale file(s) to process")
        total_translated = 0
        failed_locales: List[str] = []
        for locale_code, locale_path in locales:
            lang_name, lang_code = language_for(locale_code)
            try:
                target_data = load_target(locale_path)
            except (OSError, ValueError) as e:
                print(f" [{locale_code}] Failed to read {locale_path}: {e}")
                failed_locales.append(locale_code)
                continue
            if args.missing_only:
                missing_keys = find_missing_keys(source_data, target_data)
                if not missing_keys:
                    print(f" [{locale_code}] No missing keys")
                    continue
                print(f" [{locale_code}] {len(missing_keys)} missing key(s)")
            else:
                missing_keys = None
            result = translate_locale(
                source_data=source_data,
                target_data=target_data,
                target_locale=locale_code,
                target_lang=lang_name,
                target_code=lang_code,
                out_path=locale_path,
                args=args,
                missing_keys=missing_keys,
            )
            if result < 0:
                # Negative means the file could not be written; it is a
                # status, not a count, so keep it out of the total.
                failed_locales.append(locale_code)
            else:
                total_translated += result
        print(f"\nTotal: {total_translated} string(s) translated across {len(locales)} locale(s)")
        if failed_locales:
            print(f"Failed locale(s): {', '.join(failed_locales)}", file=sys.stderr)
            return 1
        return 0

    # Single locale mode
    if not args.out_path or not args.to_locale:
        print("--out and --to-locale are required when not using --l10n-dir", file=sys.stderr)
        return 1
    lang_name, lang_code = language_for(args.to_locale)

    # Read existing target file if --missing-only
    target_data: Dict[str, Any] = {}
    missing_keys: Optional[List[str]] = None
    if args.missing_only and os.path.exists(args.out_path):
        try:
            target_data = load_target(args.out_path)
        except (OSError, ValueError) as e:
            print(f"Failed to read target file: {e}", file=sys.stderr)
            return 2
        missing_keys = find_missing_keys(source_data, target_data)
        if not missing_keys:
            print(f"No missing keys in {args.out_path}")
            return 0
        print(f"Found {len(missing_keys)} missing key(s) to translate")

    result = translate_locale(
        source_data=source_data,
        target_data=target_data,
        target_locale=args.to_locale,
        target_lang=lang_name,
        target_code=lang_code,
        out_path=args.out_path,
        args=args,
        missing_keys=missing_keys,
    )
    return 0 if result >= 0 else 1
# Script entry point: the integer returned by main() becomes the exit status.
if __name__ == "__main__":
    sys.exit(main())