#!/usr/bin/env python3
"""
translate_arb_with_translategemma.py

Translates ARB/JSON localization files using TranslateGemma via Ollama.
Preserves placeholders like {deviceName} and ICU plural/select formats.

Usage:
    # Translate all strings:
    python translate.py --in lib/l10n/app_en.arb --out lib/l10n/app_es.arb --to-locale es

    # Translate only missing strings:
    python translate.py --in lib/l10n/app_en.arb --out lib/l10n/app_es.arb --to-locale es --missing-only

    # Translate all locales (missing strings only):
    python translate.py --in lib/l10n/app_en.arb --l10n-dir lib/l10n --missing-only
"""

import argparse
import json
import os
import re
import sys
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass
from typing import Any, Dict, List, Set, Tuple, Optional
from urllib import request

# Placeholder patterns
SIMPLE_PLACEHOLDER_RE = re.compile(r"\{(\w+)\}")
ICU_VAR_RE = re.compile(r"\{(\w+)\s*,\s*(?:plural|select|selectordinal)\s*,", re.IGNORECASE)


@dataclass
class OllamaConfig:
    """Connection and sampling settings for one Ollama model."""

    host: str  # base URL of the Ollama server, e.g. http://localhost:11434
    model: str  # model tag passed as "model" in the request payload
    timeout_s: float  # HTTP timeout in seconds for one request
    temperature: float  # sampling temperature sent in "options"


# Language mapping (locale_code -> (language_name, translategemma_code))
LOCALE_MAP = {
    "es": ("Spanish", "es"),
    "fr": ("French", "fr"),
    "de": ("German", "de"),
    "it": ("Italian", "it"),
    "pt": ("Portuguese", "pt"),
    "pt-BR": ("Brazilian Portuguese", "pt"),
    "ja": ("Japanese", "ja"),
    "ko": ("Korean", "ko"),
    "zh": ("Chinese", "zh-Hans"),
    "zh-Hant": ("Chinese", "zh-Hant"),
    "ru": ("Russian", "ru"),
    "uk": ("Ukrainian", "uk"),
    "ar": ("Arabic", "ar"),
    "hi": ("Hindi", "hi"),
    "tr": ("Turkish", "tr"),
    "nl": ("Dutch", "nl"),
    "sv": ("Swedish", "sv"),
    "no": ("Norwegian", "no"),
    "da": ("Danish", "da"),
    "fi": ("Finnish", "fi"),
    "pl": ("Polish", "pl"),
    "cs": ("Czech", "cs"),
    "sk": ("Slovak", "sk"),
    "sl": ("Slovenian", "sl"),
    "bg": ("Bulgarian", "bg"),
    "el": ("Greek", "el"),
    "he": ("Hebrew", "he"),
    "th": ("Thai", "th"),
    "vi": ("Vietnamese", "vi"),
    "id": ("Indonesian", "id"),
}

# Keys to skip translation
SKIP_KEYS = {"appTitle"}

# Manual translations for complex strings
MANUAL_TRANSLATIONS: Dict[str, Dict[str, str]] = {
    "repeater_daysHoursMinsSecs": {
        "es": "{days} días {hours}h {minutes}m {seconds}s",
        "fr": "{days} jours {hours}h {minutes}m {seconds}s",
        "de": "{days} Tage {hours}h {minutes}m {seconds}s",
        "it": "{days} giorni {hours}h {minutes}m {seconds}s",
        "pt": "{days} dias {hours}h {minutes}m {seconds}s",
        "pl": "{days} dni {hours}h {minutes}m {seconds}s",
        "sk": "{days} dní {hours}h {minutes}m {seconds}s",
        "sl": "{days} dni {hours}h {minutes}m {seconds}s",
        "cs": "{days} dní {hours}h {minutes}m {seconds}s",
        "ja": "{days}日 {hours}時間 {minutes}分 {seconds}秒",
        "ko": "{days}일 {hours}시간 {minutes}분 {seconds}초",
        "zh": "{days}天 {hours}小时 {minutes}分 {seconds}秒",
        "ru": "{days} дней {hours}ч {minutes}м {seconds}с",
        "bg": "{days} дни {hours}ч {minutes}м {seconds}с",
        "nl": "{days} dagen {hours}u {minutes}m {seconds}s",
        "sv": "{days} dagar {hours}t {minutes}m {seconds}s",
    },
}


def http_post_json(url: str, payload: Dict[str, Any], timeout_s: float) -> Dict[str, Any]:
    """POST *payload* as JSON to *url* and return the decoded JSON response.

    Raises whatever ``urllib.request.urlopen`` or ``json.loads`` raise
    (connection errors, HTTP errors, timeouts, malformed JSON).
    """
    data = json.dumps(payload).encode("utf-8")
    req = request.Request(url, data=data, headers={"Content-Type": "application/json"}, method="POST")
    with request.urlopen(req, timeout=timeout_s) as resp:
        return json.loads(resp.read().decode("utf-8"))


def ollama_generate(cfg: OllamaConfig, prompt: str) -> str:
    """Run a non-streaming generation on ``<host>/api/generate``.

    Returns the stripped ``response`` text.

    Raises:
        RuntimeError: if the body is not a JSON object or carries an
            ``error`` field, so callers see the real cause instead of an
            empty translation.
    """
    url = cfg.host.rstrip("/") + "/api/generate"
    payload = {
        "model": cfg.model,
        "prompt": prompt,
        "stream": False,
        "options": {"temperature": cfg.temperature},
    }
    resp = http_post_json(url, payload, cfg.timeout_s)
    if not isinstance(resp, dict):
        raise RuntimeError("Unexpected Ollama response (expected a JSON object)")
    # NOTE(review): assumes the server reports failures in an "error" field
    # of the JSON body -- confirm against the Ollama API docs.
    if resp.get("error"):
        raise RuntimeError(f"Ollama error: {resp['error']}")
    # "response" may be absent or null; normalise both to an empty string.
    return str(resp.get("response") or "").strip()


def _find_closing_brace(s: str, open_idx: int) -> int:
    """Return the index of the '}' matching the '{' at *open_idx*, or -1."""
    depth = 0
    for idx in range(open_idx, len(s)):
        ch = s[idx]
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return idx
    return -1


def _collect_icu_forms(forms: str, names: Set[str]) -> None:
    """Scan the ``selector{message}`` list of an ICU block for placeholders.

    Each top-level brace group here is a sub-message, not an argument, so
    its *content* is scanned and the selector keyword in front is ignored.
    """
    pos = 0
    while True:
        open_idx = forms.find("{", pos)
        if open_idx == -1:
            return
        close_idx = _find_closing_brace(forms, open_idx)
        if close_idx == -1:
            pos = open_idx + 1
            continue
        _collect_placeholders(forms[open_idx + 1:close_idx], names)
        pos = close_idx + 1


def _collect_placeholders(message: str, names: Set[str]) -> None:
    """Add every placeholder/ICU variable name found in *message* to *names*."""
    pos = 0
    while True:
        open_idx = message.find("{", pos)
        if open_idx == -1:
            return
        close_idx = _find_closing_brace(message, open_idx)
        if close_idx == -1:
            # Unbalanced brace: treat it as literal text and keep scanning.
            pos = open_idx + 1
            continue
        icu = ICU_VAR_RE.match(message, open_idx)
        if icu:
            names.add(icu.group(1))
            _collect_icu_forms(message[icu.end():close_idx], names)
            pos = close_idx + 1
            continue
        simple = SIMPLE_PLACEHOLDER_RE.match(message, open_idx)
        if simple:
            names.add(simple.group(1))
            pos = simple.end()
        else:
            # Unrecognised group: look inside it for nested placeholders.
            pos = open_idx + 1


def extract_placeholder_names(s: str) -> List[str]:
    """Extract placeholder variable names from string.

    Recognises simple ``{name}`` placeholders and the variable of ICU
    ``plural``/``select``/``selectordinal`` blocks. The text inside ICU
    forms (``male{He}``, ``other{items}``) is message text, not a
    placeholder, whatever the selector keyword is; placeholders nested
    inside a form are still reported. Returns a sorted, de-duplicated list.
    """
    names: Set[str] = set()
    _collect_placeholders(s, names)
    return sorted(names)


def has_icu_block(s: str) -> bool:
    """Check if string contains ICU plural/select block."""
    return bool(ICU_VAR_RE.search(s))


def build_prompt(text: str, target_lang: str, target_code: str, placeholder_names: List[str], has_icu: bool) -> str:
    """Build TranslateGemma-compatible prompt with placeholder preservation instructions.

    Args:
        text: English source string.
        target_lang: Human-readable target language name.
        target_code: Target language code placed in parentheses.
        placeholder_names: Placeholder names that must survive unchanged.
        has_icu: Whether *text* contains an ICU plural/select block.
    """
    # Build instructions for placeholder preservation
    instructions = []
    if placeholder_names:
        placeholders = ', '.join(f'{{{t}}}' for t in placeholder_names)
        instructions.append(f"CRITICAL: Keep these placeholders EXACTLY as they appear: {placeholders}")
    if has_icu:
        instructions.append("CRITICAL: Preserve ICU message format structure (plural, select, =0, =1, other, etc.). Only translate the text inside the forms.")

    # Add instructions to the system prompt, not to the text itself
    instruction_text = "\n".join(instructions) if instructions else ""
    separator = "\n" if instruction_text else ""

    # TranslateGemma expects this exact format (note the two blank lines before text).
    # NOTE(review): the line breaks other than the two blank lines before the
    # text were reconstructed -- confirm against the model card.
    return (
        f"You are a professional English (en) to {target_lang} ({target_code}) translator. "
        f"Your goal is to accurately convey the meaning and nuances of the original English text "
        f"while adhering to {target_lang} grammar, vocabulary, and cultural sensitivities.\n"
        f"Produce only the {target_lang} translation, without any additional explanations or commentary."
        f"{separator}{instruction_text}\n"
        f"Please translate the following English text into {target_lang}:\n"
        f"\n"
        f"\n"
        f"{text}"
    )


def validate_preserved_tokens(src: str, out: str) -> Tuple[bool, Optional[str]]:
    """Validate that placeholder names are preserved.

    Returns ``(True, None)`` when *out* is non-empty, contains every
    placeholder of *src* and keeps an ICU block if *src* had one; otherwise
    ``(False, reason)``.
    """
    # An empty answer would otherwise pass for strings without placeholders.
    if not out or not out.strip():
        return False, "Empty translation"
    src_names = extract_placeholder_names(src)
    for name in src_names:
        pattern = r"\{" + re.escape(name) + r"(?:\}|\s*,)"
        if not re.search(pattern, out):
            return False, f"Missing placeholder: {{{name}}}"
    if has_icu_block(src) and not has_icu_block(out):
        return False, "ICU plural/select block missing"
    return True, None


def translate_one(
    key: str,
    text: str,
    target_lang: str,
    target_code: str,
    cfg: OllamaConfig,
    retries: int,
    backoff_s: float,
    fallback_cfg: Optional[OllamaConfig] = None,
) -> Tuple[str, str, Optional[str], bool]:
    """Translate a single string.

    Returns (key, translated_text, error_or_none, used_fallback). On failure
    the original *text* is returned together with a non-empty error message.
    Never raises for model/network errors; they are reported in the tuple.
    """
    placeholder_names = extract_placeholder_names(text)
    text_has_icu = has_icu_block(text)
    prompt = build_prompt(text, target_lang, target_code, placeholder_names, text_has_icu)

    # A negative retry count must still yield one attempt.
    attempts = max(0, retries) + 1
    last_err: Optional[str] = None
    for attempt in range(attempts):
        try:
            out = ollama_generate(cfg, prompt)
            # Validate placeholders
            ok, why = validate_preserved_tokens(text, out)
            if ok:
                return key, out, None, False
            last_err = f"Validation failed: {why}"
        except Exception as e:  # best-effort per string: report, don't abort the run
            # Some exceptions stringify to "", which callers would read as success.
            last_err = str(e) or type(e).__name__
        if attempt < attempts - 1:
            time.sleep(backoff_s * (attempt + 1))

    # Try fallback model if available (same prompt, single attempt)
    if fallback_cfg:
        try:
            fallback_out = ollama_generate(fallback_cfg, prompt)
            fallback_ok, fallback_why = validate_preserved_tokens(text, fallback_out)
            if fallback_ok:
                return key, fallback_out, None, True
            last_err = f"{last_err}; fallback validation failed: {fallback_why}"
        except Exception as e:
            last_err = f"{last_err}; fallback failed: {str(e) or type(e).__name__}"

    # Fallback to original
    return key, text, last_err or "Translation failed", False


def is_translatable_entry(key: str, value: Any) -> bool:
    """Check if an entry should be translated.

    Metadata keys (anything starting with ``@``, including ``@@locale``),
    keys in ``SKIP_KEYS`` and non-string or blank values are not translatable.
    """
    if key.startswith("@") or key in SKIP_KEYS:
        return False
    return isinstance(value, str) and value.strip() != ""


def find_missing_keys(source_data: Dict[str, Any], target_data: Dict[str, Any]) -> List[str]:
    """Find keys that are missing or empty in target.

    Metadata keys (starting with ``@``) are ignored. Source order is kept.
    """
    missing = []
    for key in source_data:
        if key.startswith("@"):
            continue
        if key not in target_data:
            missing.append(key)
            continue
        existing = target_data[key]
        if isinstance(existing, str) and existing.strip() == "":
            missing.append(key)
    return missing


def get_all_locale_files(l10n_dir: str, template_file: str) -> List[Tuple[str, str]]:
    """Find all locale .arb files excluding template. Returns [(locale_code, file_path)].

    Only files named ``app_<locale>.arb`` are considered; the result is sorted.
    """
    locales = []
    template_basename = os.path.basename(template_file)
    for filename in os.listdir(l10n_dir):
        if filename == template_basename:
            continue
        if filename.startswith('app_') and filename.endswith('.arb'):
            locale = filename[4:-4]  # app_es.arb -> es
            locales.append((locale, os.path.join(l10n_dir, filename)))
    return sorted(locales)


def fmt_duration(seconds: float) -> str:
    """Format duration as human-readable string.

    Under a minute: ``"12.3s"``; under an hour: ``"4m 5s"``; otherwise
    ``"1h 2m"``. Seconds are rounded before being split into units so the
    result never shows ``60s``.
    """
    # 59.95 is where one-decimal formatting would start printing "60.0s".
    if seconds < 59.95:
        return f"{seconds:.1f}s"
    total_seconds = int(round(seconds))
    minutes, secs = divmod(total_seconds, 60)
    if minutes < 60:
        return f"{minutes}m {secs}s"
    hours, mins = divmod(minutes, 60)
    return f"{hours}h {mins}m"


def _resolve_language(locale: str) -> Tuple[str, str]:
    """Map a locale code to (language_name, translategemma_code).

    File names yield underscore codes (``pt_BR``) while ``LOCALE_MAP`` uses
    hyphens (``pt-BR``), so both spellings are tried. Unknown locales fall
    back to ``(locale, locale)``.
    """
    for candidate in (locale, locale.replace("_", "-")):
        if candidate in LOCALE_MAP:
            return LOCALE_MAP[candidate]
    return locale, locale


def translate_locale(
    source_data: Dict[str, Any],
    target_data: Dict[str, Any],
    target_locale: str,
    target_lang: str,
    target_code: str,
    out_path: str,
    args,
    missing_keys: Optional[List[str]] = None,
) -> int:
    """Translate a single locale.

    Args:
        source_data: Parsed template ARB.
        target_data: Parsed existing target ARB (may be empty).
        target_locale: Locale code written to ``@@locale``.
        target_lang: Language name used in the prompt.
        target_code: Language code used in the prompt.
        out_path: File to write the result to.
        args: Parsed CLI namespace (host, model, fallback_model, timeout,
            temperature, concurrency, retries, backoff, progress_every,
            dry_run).
        missing_keys: When given, only these keys are processed.

    Returns number of strings translated (manual translations included),
    or -1 if the output file could not be written.
    """
    cfg = OllamaConfig(
        host=args.host,
        model=args.model,
        timeout_s=args.timeout,
        temperature=args.temperature,
    )
    fallback_cfg = None
    if args.fallback_model:
        fallback_cfg = OllamaConfig(
            host=args.host,
            model=args.fallback_model,
            timeout_s=args.timeout,
            temperature=args.temperature,
        )

    # Start with target data or source data
    out_data: Dict[str, Any] = dict(target_data) if target_data else dict(source_data)
    out_data["@@locale"] = target_locale

    # Build list of items to translate
    copied_count = 0
    items: List[Tuple[str, str]] = []
    if missing_keys is not None:
        for key in missing_keys:
            value = source_data.get(key)
            if is_translatable_entry(key, value):
                items.append((key, value))
            elif key in source_data and (key not in out_data or out_data[key] != value):
                # Skip-listed or non-string entries are copied verbatim,
                # otherwise they would be reported as missing on every run.
                out_data[key] = value
                copied_count += 1
            # Copy metadata for missing items
            meta_key = f"@{key}"
            if meta_key in source_data:
                out_data[meta_key] = source_data[meta_key]
    else:
        items = [(k, v) for k, v in source_data.items() if is_translatable_entry(k, v)]

    # Apply manual translations
    manual_count = 0
    items_to_translate: List[Tuple[str, str]] = []
    for k, v in items:
        manual = MANUAL_TRANSLATIONS.get(k, {})
        if target_locale in manual:
            out_data[k] = manual[target_locale]
            manual_count += 1
        else:
            items_to_translate.append((k, v))

    if manual_count > 0:
        print(f"Applied {manual_count} manual translation(s)")

    total = len(items_to_translate)
    translated_ok = manual_count

    if total == 0:
        if manual_count > 0:
            print("All strings handled by manual translations.")
        elif copied_count == 0:
            # Nothing changed: leave the target file untouched.
            return 0
        # Otherwise fall through so manual/copied entries are written.
    else:
        fallback_info = f" (fallback: {args.fallback_model})" if args.fallback_model else ""
        print(f"Translating {total} strings -> {target_lang} using {cfg.model}{fallback_info} (concurrency={args.concurrency})")

        start = time.time()
        failures: List[Tuple[str, str]] = []
        fallback_used = 0
        completed = 0
        # Guard against --progress-every 0 (modulo by zero).
        progress_every = max(1, args.progress_every)

        with ThreadPoolExecutor(max_workers=max(1, args.concurrency)) as ex:
            future_to_key = {
                ex.submit(
                    translate_one,
                    key=k,
                    text=v,
                    target_lang=target_lang,
                    target_code=target_code,
                    cfg=cfg,
                    retries=args.retries,
                    backoff_s=args.backoff,
                    fallback_cfg=fallback_cfg,
                ): k
                for (k, v) in items_to_translate
            }

            for fut in as_completed(future_to_key):
                k, translated, err, used_fallback = fut.result()
                out_data[k] = translated
                completed += 1

                if err:
                    failures.append((k, err))
                    status = "FAIL"
                else:
                    translated_ok += 1
                    if used_fallback:
                        fallback_used += 1
                        status = "OK*"
                    else:
                        status = "OK"

                if completed % progress_every == 0 or completed == total:
                    elapsed = time.time() - start
                    rate = completed / elapsed if elapsed > 0 else 0.0
                    remaining = (total - completed) / rate if rate > 0 else 0.0
                    print(f"[{completed:>4}/{total}] {status:<4} {k} | elapsed {fmt_duration(elapsed)} | ETA {fmt_duration(remaining)}")

        elapsed = time.time() - start
        fallback_msg = f", fallback_used={fallback_used}" if fallback_used > 0 else ""
        print(f"Done in {fmt_duration(elapsed)}. OK={translated_ok}{fallback_msg}, errors={len(failures)}")

        if failures:
            print(f"{len(failures)} translation(s) kept original English:")
            for k, err in failures[:20]:
                print(f"  - {k}: {err}")
            if len(failures) > 20:
                print(f"  ... and {len(failures) - 20} more")

    if args.dry_run:
        print("Dry run: not writing output file.")
        return translated_ok

    try:
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(out_data, f, ensure_ascii=False, indent=2)
            f.write("\n")
    except Exception as e:
        print(f"Failed to write output: {e}", file=sys.stderr)
        return -1

    print(f"Wrote: {out_path}")
    return translated_ok


def main() -> int:
    """CLI entry point.

    Exit codes: 0 on success, 1 on usage errors or when a locale could not
    be processed/written, 2 when an input file or directory could not be read.
    """
    ap = argparse.ArgumentParser(description="Translate ARB files using TranslateGemma")
    ap.add_argument("--in", dest="in_path", required=True, help="Input .arb file (source/template)")
    ap.add_argument("--out", dest="out_path", help="Output .arb file (required unless using --l10n-dir)")
    ap.add_argument("--to-locale", help="Target locale code (es, fr, de, etc.)")
    ap.add_argument("--l10n-dir", help="Directory with locale files (translates all locales)")
    ap.add_argument("--missing-only", action="store_true", help="Only translate missing keys")
    ap.add_argument("--model", default="translategemma:latest", help="Ollama model (translategemma:latest or specific versions)")
    ap.add_argument("--fallback-model", help="Fallback model for failed translations (e.g., translategemma:27b)")
    ap.add_argument("--host", default="http://localhost:11434", help="Ollama host")
    ap.add_argument("--timeout", type=float, default=120.0, help="HTTP timeout seconds")
    ap.add_argument("--temperature", type=float, default=0.0, help="Model temperature (0.0 for deterministic)")
    ap.add_argument("--concurrency", type=int, default=4, help="Parallel requests")
    ap.add_argument("--retries", type=int, default=2, help="Retries per string")
    ap.add_argument("--backoff", type=float, default=0.6, help="Backoff seconds base")
    ap.add_argument("--dry-run", action="store_true", help="Don't write output")
    ap.add_argument("--progress-every", type=int, default=1, help="Print progress every N strings")
    args = ap.parse_args()

    # Read source file
    try:
        with open(args.in_path, "r", encoding="utf-8") as f:
            source_data = json.load(f)
    except Exception as e:
        print(f"Failed to read input: {e}", file=sys.stderr)
        return 2

    if not isinstance(source_data, dict):
        print("Input JSON must be an object at top-level.", file=sys.stderr)
        return 2

    # Process all locales if --l10n-dir is provided
    if args.l10n_dir:
        try:
            locales = get_all_locale_files(args.l10n_dir, args.in_path)
        except OSError as e:
            print(f"Failed to list {args.l10n_dir}: {e}", file=sys.stderr)
            return 2
        if not locales:
            print(f"No locale files found in {args.l10n_dir}", file=sys.stderr)
            return 1

        print(f"Found {len(locales)} locale file(s) to process")
        total_translated = 0
        had_error = False

        for locale_code, locale_path in locales:
            lang_name, lang_code = _resolve_language(locale_code)

            try:
                with open(locale_path, "r", encoding="utf-8") as f:
                    target_data = json.load(f)
            except Exception as e:
                print(f"  [{locale_code}] Failed to read {locale_path}: {e}", file=sys.stderr)
                had_error = True
                continue
            if not isinstance(target_data, dict):
                print(f"  [{locale_code}] {locale_path} must contain a JSON object at top-level", file=sys.stderr)
                had_error = True
                continue

            if args.missing_only:
                missing_keys = find_missing_keys(source_data, target_data)
                if not missing_keys:
                    print(f"  [{locale_code}] No missing keys")
                    continue
                print(f"  [{locale_code}] {len(missing_keys)} missing key(s)")
            else:
                missing_keys = None

            result = translate_locale(
                source_data=source_data,
                target_data=target_data,
                target_locale=locale_code,
                target_lang=lang_name,
                target_code=lang_code,
                out_path=locale_path,
                args=args,
                missing_keys=missing_keys,
            )
            # -1 signals a write failure; it must not be added to the total.
            if result < 0:
                had_error = True
            else:
                total_translated += result

        print(f"\nTotal: {total_translated} string(s) translated across {len(locales)} locale(s)")
        return 1 if had_error else 0

    # Single locale mode
    if not args.out_path or not args.to_locale:
        print("--out and --to-locale are required when not using --l10n-dir", file=sys.stderr)
        return 1

    lang_name, lang_code = _resolve_language(args.to_locale)

    # Read existing target file if --missing-only
    target_data: Dict[str, Any] = {}
    missing_keys: Optional[List[str]] = None

    if args.missing_only and os.path.exists(args.out_path):
        try:
            with open(args.out_path, "r", encoding="utf-8") as f:
                target_data = json.load(f)
        except Exception as e:
            print(f"Failed to read target file: {e}", file=sys.stderr)
            return 2
        if not isinstance(target_data, dict):
            print("Target JSON must be an object at top-level.", file=sys.stderr)
            return 2
        missing_keys = find_missing_keys(source_data, target_data)
        if not missing_keys:
            print(f"No missing keys in {args.out_path}")
            return 0
        print(f"Found {len(missing_keys)} missing key(s) to translate")

    result = translate_locale(
        source_data=source_data,
        target_data=target_data,
        target_locale=args.to_locale,
        target_lang=lang_name,
        target_code=lang_code,
        out_path=args.out_path,
        args=args,
        missing_keys=missing_keys,
    )
    return 0 if result >= 0 else 1


if __name__ == "__main__":
    raise SystemExit(main())