mirror of
https://github.com/zjs81/meshcore-open.git
synced 2026-04-20 22:13:48 +00:00
542 lines
20 KiB
Python
542 lines
20 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
translate_arb_with_translategemma.py
|
||
|
||
Translates ARB/JSON localization files using TranslateGemma via Ollama.
|
||
Preserves placeholders like {deviceName} and ICU plural/select formats.
|
||
|
||
Usage:
|
||
# Translate all strings:
|
||
python translate.py --in lib/l10n/app_en.arb --out lib/l10n/app_es.arb --to-locale es
|
||
|
||
# Translate only missing strings:
|
||
python translate.py --in lib/l10n/app_en.arb --out lib/l10n/app_es.arb --to-locale es --missing-only
|
||
|
||
# Translate all locales (missing strings only):
|
||
python translate.py --in lib/l10n/app_en.arb --l10n-dir lib/l10n --missing-only
|
||
"""
|
||
|
||
import argparse
|
||
import json
|
||
import os
|
||
import re
|
||
import sys
|
||
import time
|
||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||
from dataclasses import dataclass
|
||
from typing import Any, Dict, List, Tuple, Optional
|
||
from urllib import request
|
||
|
||
|
||
# Placeholder patterns
# A simple placeholder such as {deviceName}; group 1 is the placeholder name.
SIMPLE_PLACEHOLDER_RE = re.compile(r"\{(\w+)\}")
# The opening of an ICU argument such as "{count, plural,"; group 1 is the
# variable name. Keyword matching is case-insensitive.
ICU_VAR_RE = re.compile(r"\{(\w+)\s*,\s*(?:plural|select|selectordinal)\s*,", re.IGNORECASE)
|
||
|
||
|
||
@dataclass
class OllamaConfig:
    """Connection and sampling settings for one Ollama model."""

    # Base URL of the Ollama server, e.g. "http://localhost:11434".
    host: str
    # Model name sent in the request payload.
    model: str
    # HTTP timeout for a single request, in seconds.
    timeout_s: float
    # Sampling temperature sent in the request options.
    temperature: float
|
||
|
||
|
||
# Language mapping (locale_code -> (language_name, translategemma_code))
# Locale codes absent from this table are used verbatim as both the language
# name and the model language code by the callers in this file.
LOCALE_MAP: Dict[str, Tuple[str, str]] = {
    "es": ("Spanish", "es"),
    "fr": ("French", "fr"),
    "de": ("German", "de"),
    "it": ("Italian", "it"),
    "pt": ("Portuguese", "pt"),
    "pt-BR": ("Brazilian Portuguese", "pt"),
    "ja": ("Japanese", "ja"),
    "ko": ("Korean", "ko"),
    "zh": ("Chinese", "zh-Hans"),
    "zh-Hant": ("Chinese", "zh-Hant"),
    "ru": ("Russian", "ru"),
    "uk": ("Ukrainian", "uk"),
    "ar": ("Arabic", "ar"),
    "hi": ("Hindi", "hi"),
    "tr": ("Turkish", "tr"),
    "nl": ("Dutch", "nl"),
    "sv": ("Swedish", "sv"),
    "no": ("Norwegian", "no"),
    "da": ("Danish", "da"),
    "fi": ("Finnish", "fi"),
    "pl": ("Polish", "pl"),
    "cs": ("Czech", "cs"),
    "sk": ("Slovak", "sk"),
    "sl": ("Slovenian", "sl"),
    "bg": ("Bulgarian", "bg"),
    "el": ("Greek", "el"),
    "he": ("Hebrew", "he"),
    "th": ("Thai", "th"),
    "vi": ("Vietnamese", "vi"),
    "id": ("Indonesian", "id"),
}
|
||
|
||
# Keys to skip translation (their source value is kept untranslated)
SKIP_KEYS = {"appTitle"}
|
||
|
||
# Manual translations for complex strings
# Shape: message key -> {locale code -> translated text}. When a key/locale
# pair is present here the text is used as-is instead of calling the model.
MANUAL_TRANSLATIONS: Dict[str, Dict[str, str]] = {
    "repeater_daysHoursMinsSecs": {
        "es": "{days} días {hours}h {minutes}m {seconds}s",
        "fr": "{days} jours {hours}h {minutes}m {seconds}s",
        "de": "{days} Tage {hours}h {minutes}m {seconds}s",
        "it": "{days} giorni {hours}h {minutes}m {seconds}s",
        "pt": "{days} dias {hours}h {minutes}m {seconds}s",
        "pl": "{days} dni {hours}h {minutes}m {seconds}s",
        "sk": "{days} dní {hours}h {minutes}m {seconds}s",
        "sl": "{days} dni {hours}h {minutes}m {seconds}s",
        "cs": "{days} dní {hours}h {minutes}m {seconds}s",
        "ja": "{days}日 {hours}時間 {minutes}分 {seconds}秒",
        "ko": "{days}일 {hours}시간 {minutes}분 {seconds}초",
        "zh": "{days}天 {hours}小时 {minutes}分 {seconds}秒",
        "ru": "{days} дней {hours}ч {minutes}м {seconds}с",
        "bg": "{days} дни {hours}ч {minutes}м {seconds}с",
        "nl": "{days} dagen {hours}u {minutes}m {seconds}s",
        "sv": "{days} dagar {hours}t {minutes}m {seconds}s",
    },
}
|
||
|
||
|
||
def http_post_json(url: str, payload: Dict[str, Any], timeout_s: float) -> Dict[str, Any]:
    """POST *payload* as JSON to *url* and return the decoded JSON response.

    Args:
        url: Full endpoint URL.
        payload: JSON-serialisable request body.
        timeout_s: Timeout in seconds passed to ``urlopen``.

    Raises:
        Whatever ``urllib.request.urlopen`` raises for connection/HTTP
        failures, and ``ValueError`` subclasses if the response body is not
        valid UTF-8 JSON.
    """
    data = json.dumps(payload).encode("utf-8")
    req = request.Request(
        url,
        data=data,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with request.urlopen(req, timeout=timeout_s) as resp:
        return json.loads(resp.read().decode("utf-8"))
|
||
|
||
|
||
def ollama_generate(cfg: OllamaConfig, prompt: str) -> str:
    """Send *prompt* to the model described by *cfg* and return its text.

    Posts a non-streaming request to ``<cfg.host>/api/generate`` and returns
    the ``response`` field of the reply with surrounding whitespace removed.
    A missing or null ``response`` field yields an empty string.
    """
    url = cfg.host.rstrip("/") + "/api/generate"
    payload = {
        "model": cfg.model,
        "prompt": prompt,
        "stream": False,
        "options": {"temperature": cfg.temperature},
    }
    resp = http_post_json(url, payload, cfg.timeout_s)
    # `or ""` also covers an explicit JSON null, which would otherwise crash
    # on .strip().
    return str(resp.get("response") or "").strip()
|
||
|
||
|
||
def extract_placeholder_names(s: str) -> List[str]:
    """Extract placeholder variable names from string.

    Returns the sorted, de-duplicated names of:
      * ICU arguments (``{count, plural, ...}`` -> ``count``), and
      * simple placeholders (``{deviceName}`` -> ``deviceName``).

    A single-word ICU form body such as the ``{item}`` in ``=1{item}`` or the
    ``{he}`` in ``male{he}`` looks like a simple placeholder but is
    translatable text, so it is excluded. A brace pair counts as a form body
    when the brace that directly encloses it opens an ICU
    plural/select/selectordinal argument.

    ICU apostrophe quoting (``'{'``) is not interpreted.
    """
    names = {m.group(1) for m in ICU_VAR_RE.finditer(s)}

    # For every "{" remember the offset of the "{" that directly encloses it
    # (None at top level).
    enclosing: Dict[int, Optional[int]] = {}
    open_braces: List[int] = []
    for offset, ch in enumerate(s):
        if ch == "{":
            enclosing[offset] = open_braces[-1] if open_braces else None
            open_braces.append(offset)
        elif ch == "}" and open_braces:
            open_braces.pop()

    for m in SIMPLE_PLACEHOLDER_RE.finditer(s):
        parent = enclosing.get(m.start())
        # Direct children of an ICU argument are message forms, not variables.
        if parent is not None and ICU_VAR_RE.match(s, parent):
            continue
        names.add(m.group(1))

    return sorted(names)
|
||
|
||
|
||
def has_icu_block(s: str) -> bool:
    """Check if string contains ICU plural/select block."""
    return ICU_VAR_RE.search(s) is not None
|
||
|
||
|
||
def build_prompt(text: str, target_lang: str, target_code: str, placeholder_names: List[str], has_icu: bool) -> str:
    """Build TranslateGemma-compatible prompt with placeholder preservation instructions.

    Args:
        text: English source string to translate.
        target_lang: Human-readable target language name (e.g. "Spanish").
        target_code: Target language code shown in parentheses in the prompt.
        placeholder_names: Placeholder names that must survive translation.
        has_icu: Whether *text* contains an ICU plural/select block.

    Returns:
        The full prompt. The source text is placed last, separated from the
        instructions by two blank lines.
    """
    # Build instructions for placeholder preservation
    instructions = []
    if placeholder_names:
        placeholders = ", ".join(f"{{{name}}}" for name in placeholder_names)
        instructions.append(f"CRITICAL: Keep these placeholders EXACTLY as they appear: {placeholders}")
    if has_icu:
        instructions.append("CRITICAL: Preserve ICU message format structure (plural, select, =0, =1, other, etc.). Only translate the text inside the forms.")

    # Instructions go into the preamble, never into the text to translate.
    instruction_text = "\n".join(instructions)
    separator = "\n" if instruction_text else ""

    # TranslateGemma expects this exact format (note the two blank lines
    # before the text); newlines are spelled out so they cannot be lost.
    return (
        f"You are a professional English (en) to {target_lang} ({target_code}) translator. "
        f"Your goal is to accurately convey the meaning and nuances of the original English text "
        f"while adhering to {target_lang} grammar, vocabulary, and cultural sensitivities.\n"
        f"Produce only the {target_lang} translation, without any additional explanations or commentary."
        f"{separator}{instruction_text}\n"
        f"Please translate the following English text into {target_lang}:\n"
        "\n"
        "\n"
        f"{text}"
    )
|
||
|
||
|
||
def validate_preserved_tokens(src: str, out: str) -> Tuple[bool, Optional[str]]:
    """Validate that placeholder names are preserved.

    Args:
        src: Original (source-language) string.
        out: Candidate translation.

    Returns:
        ``(True, None)`` when every placeholder name found in *src* appears
        in *out* (as ``{name}`` or as the start of ``{name, ...``) and an ICU
        block in *src* is matched by one in *out*; otherwise ``(False, reason)``.
    """
    for name in extract_placeholder_names(src):
        # Accept both "{name}" and the ICU opener "{name ,".
        pattern = r"\{" + re.escape(name) + r"(?:\}|\s*,)"
        if not re.search(pattern, out):
            return False, f"Missing placeholder: {{{name}}}"

    if has_icu_block(src) and not has_icu_block(out):
        return False, "ICU plural/select block missing"

    return True, None
|
||
|
||
|
||
def _attempt_translation(cfg: OllamaConfig, prompt: str, text: str) -> Tuple[Optional[str], Optional[str]]:
    """Run one model call and validate it. Returns (translation, None) or (None, error)."""
    try:
        out = ollama_generate(cfg, prompt)
    except Exception as e:  # per-string best effort: never let one call abort the batch
        # Some exceptions stringify to "", which callers would read as "no error".
        return None, str(e) or type(e).__name__

    if not out:
        # An empty reply would otherwise pass validation for placeholder-free text.
        return None, "Validation failed: empty response from model"

    ok, why = validate_preserved_tokens(text, out)
    if not ok:
        return None, f"Validation failed: {why}"
    return out, None


def translate_one(
    key: str,
    text: str,
    target_lang: str,
    target_code: str,
    cfg: OllamaConfig,
    retries: int,
    backoff_s: float,
    fallback_cfg: Optional[OllamaConfig] = None,
) -> Tuple[str, str, Optional[str], bool]:
    """Translate a single string. Returns (key, translated_text, error_or_none, used_fallback).

    The primary model is tried ``retries + 1`` times (at least once), sleeping
    ``backoff_s * attempt_number`` between attempts. If all attempts fail and
    *fallback_cfg* is given, the fallback model is tried once with the same
    prompt. If everything fails, the original *text* is returned together
    with a non-empty error message; this function does not raise for model or
    validation failures.
    """
    placeholder_names = extract_placeholder_names(text)
    prompt = build_prompt(text, target_lang, target_code, placeholder_names, has_icu_block(text))

    attempts = max(0, retries) + 1
    last_err: Optional[str] = None
    for attempt in range(attempts):
        out, last_err = _attempt_translation(cfg, prompt, text)
        if out is not None:
            return key, out, None, False
        if attempt < attempts - 1:
            time.sleep(backoff_s * (attempt + 1))

    # Try fallback model if available
    if fallback_cfg:
        out, fallback_err = _attempt_translation(fallback_cfg, prompt, text)
        if out is not None:
            return key, out, None, True
        # Keep both reasons so the failure report shows why the fallback failed too.
        last_err = f"{last_err}; fallback {fallback_cfg.model}: {fallback_err}"

    # Fallback to original
    return key, text, last_err, False
|
||
|
||
|
||
def is_translatable_entry(key: str, value: Any) -> bool:
    """Check if an entry should be translated.

    Metadata keys (anything starting with "@", which includes "@@locale"),
    keys listed in SKIP_KEYS, non-string values and blank strings are not
    translatable.
    """
    if key.startswith("@") or key in SKIP_KEYS:
        return False
    return isinstance(value, str) and value.strip() != ""
|
||
|
||
|
||
def find_missing_keys(source_data: Dict[str, Any], target_data: Dict[str, Any]) -> List[str]:
    """Find keys that are missing or empty in target.

    Keys starting with "@" (metadata, including "@@locale") are ignored.
    A key counts as missing when it is absent from *target_data* or its
    target value is a blank string. Source order is preserved.
    """
    missing = []
    for key in source_data:
        if key.startswith("@"):
            continue
        if key not in target_data:
            missing.append(key)
            continue
        value = target_data[key]
        if isinstance(value, str) and value.strip() == "":
            missing.append(key)
    return missing
|
||
|
||
|
||
def get_all_locale_files(l10n_dir: str, template_file: str) -> List[Tuple[str, str]]:
    """Find all locale .arb files excluding template. Returns [(locale_code, file_path)].

    Only files named ``app_<locale>.arb`` directly inside *l10n_dir* are
    considered; the file whose basename equals that of *template_file* is
    skipped. The result is sorted.

    Raises:
        OSError: if *l10n_dir* cannot be listed.
    """
    prefix, suffix = "app_", ".arb"
    template_basename = os.path.basename(template_file)

    locales = [
        # app_es.arb -> es
        (filename[len(prefix):-len(suffix)], os.path.join(l10n_dir, filename))
        for filename in os.listdir(l10n_dir)
        if filename != template_basename
        and filename.startswith(prefix)
        and filename.endswith(suffix)
        # "app_.arb" would produce an empty locale code
        and len(filename) > len(prefix) + len(suffix)
    ]
    return sorted(locales)
|
||
|
||
|
||
def fmt_duration(seconds: float) -> str:
    """Format duration as human-readable string.

    Under a minute: ``"12.3s"``; under an hour: ``"1m 15s"``; otherwise
    ``"1h 2m"``. Values are rounded before being split into units so a
    component never displays as 60 (e.g. 119.6 s is "2m 0s", not "1m 60s").
    """
    # Compare the value as it will be displayed, so 59.96 is not shown as "60.0s".
    if round(seconds, 1) < 60:
        return f"{seconds:.1f}s"

    minutes, secs = divmod(int(round(seconds)), 60)
    if minutes < 60:
        return f"{minutes}m {secs}s"

    hours, minutes = divmod(minutes, 60)
    return f"{hours}h {minutes}m"
|
||
|
||
|
||
def translate_locale(
    source_data: Dict[str, Any],
    target_data: Dict[str, Any],
    target_locale: str,
    target_lang: str,
    target_code: str,
    out_path: str,
    args,
    missing_keys: Optional[List[str]] = None,
) -> int:
    """Translate a single locale. Returns number of strings translated.

    Args:
        source_data: Parsed template ARB (English).
        target_data: Parsed existing target ARB, or an empty dict.
        target_locale: Locale code written to "@@locale" and used to look up
            manual translations.
        target_lang: Language name used in the prompt.
        target_code: Language code used in the prompt.
        out_path: File the result is written to.
        args: Parsed command-line namespace; reads host, model,
            fallback_model, timeout, temperature, concurrency, retries,
            backoff, progress_every and dry_run.
        missing_keys: When given, only these keys are processed; otherwise
            every translatable source entry is.

    Returns:
        Count of strings filled in (manual + model translations), or -1 if
        the output file could not be written.
    """
    cfg = OllamaConfig(
        host=args.host,
        model=args.model,
        timeout_s=args.timeout,
        temperature=args.temperature,
    )

    fallback_cfg = None
    if args.fallback_model:
        fallback_cfg = OllamaConfig(
            host=args.host,
            model=args.fallback_model,
            timeout_s=args.timeout,
            temperature=args.temperature,
        )

    # Start with target data or source data
    out_data: Dict[str, Any] = dict(target_data) if target_data else dict(source_data)
    out_data["@@locale"] = target_locale

    # Build list of items to translate
    items: List[Tuple[str, str]] = []
    if missing_keys is not None:
        for key in missing_keys:
            if key in source_data:
                value = source_data[key]
                if is_translatable_entry(key, value):
                    items.append((key, value))
                else:
                    # Skipped/non-string entries are copied verbatim; otherwise
                    # they would be reported as missing on every run.
                    out_data[key] = value
            # Copy metadata for missing items
            meta_key = f"@{key}"
            if meta_key in source_data:
                out_data[meta_key] = source_data[meta_key]
    else:
        items = [(k, v) for k, v in source_data.items() if is_translatable_entry(k, v)]

    # Apply manual translations
    manual_count = 0
    items_to_translate: List[Tuple[str, str]] = []
    for k, v in items:
        manual = MANUAL_TRANSLATIONS.get(k, {})
        if target_locale in manual:
            out_data[k] = manual[target_locale]
            manual_count += 1
        else:
            items_to_translate.append((k, v))

    if manual_count > 0:
        print(f"Applied {manual_count} manual translation(s)")

    total = len(items_to_translate)
    translated_ok = manual_count

    if total == 0:
        if manual_count > 0:
            print("All strings handled by manual translations.")
        if out_data == target_data:
            # Nothing was added or changed, so there is nothing to write.
            return translated_ok
        # Otherwise fall through: manual/copied entries still have to be saved.
    else:
        fallback_info = f" (fallback: {args.fallback_model})" if args.fallback_model else ""
        print(f"Translating {total} strings -> {target_lang} using {cfg.model}{fallback_info} (concurrency={args.concurrency})")

        start = time.time()
        failures: List[Tuple[str, str]] = []
        fallback_used = 0
        completed = 0
        # Guard against --progress-every 0 (modulo by zero).
        progress_every = max(1, args.progress_every)

        with ThreadPoolExecutor(max_workers=max(1, args.concurrency)) as ex:
            futures = [
                ex.submit(
                    translate_one,
                    key=k,
                    text=v,
                    target_lang=target_lang,
                    target_code=target_code,
                    cfg=cfg,
                    retries=args.retries,
                    backoff_s=args.backoff,
                    fallback_cfg=fallback_cfg,
                )
                for (k, v) in items_to_translate
            ]

            for fut in as_completed(futures):
                k, translated, err, used_fallback = fut.result()
                # On failure `translated` is the original English text.
                out_data[k] = translated

                completed += 1
                if err:
                    failures.append((k, err))
                    status = "FAIL"
                else:
                    translated_ok += 1
                    if used_fallback:
                        fallback_used += 1
                        status = "OK*"
                    else:
                        status = "OK"

                if completed % progress_every == 0 or completed == total:
                    elapsed = time.time() - start
                    rate = completed / elapsed if elapsed > 0 else 0.0
                    remaining = (total - completed) / rate if rate > 0 else 0.0
                    print(f"[{completed:>4}/{total}] {status:<4} {k} | elapsed {fmt_duration(elapsed)} | ETA {fmt_duration(remaining)}")

        elapsed = time.time() - start
        fallback_msg = f", fallback_used={fallback_used}" if fallback_used > 0 else ""
        print(f"Done in {fmt_duration(elapsed)}. OK={translated_ok}{fallback_msg}, errors={len(failures)}")

        if failures:
            print(f"{len(failures)} translation(s) kept original English:")
            for k, err in failures[:20]:
                print(f" - {k}: {err}")
            if len(failures) > 20:
                print(f" ... and {len(failures) - 20} more")

    if args.dry_run:
        print("Dry run: not writing output file.")
        return translated_ok

    try:
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(out_data, f, ensure_ascii=False, indent=2)
            f.write("\n")
    except Exception as e:
        print(f"Failed to write output: {e}", file=sys.stderr)
        return -1

    print(f"Wrote: {out_path}")
    return translated_ok
|
||
|
||
|
||
def _language_for_locale(locale_code: str) -> Tuple[str, str]:
    """Return (language_name, model_code) for a locale, accepting "_" or "-" separators."""
    # Locale codes taken from file names use underscores (app_pt_BR.arb),
    # while LOCALE_MAP keys use hyphens (pt-BR).
    for candidate in (locale_code, locale_code.replace("_", "-")):
        if candidate in LOCALE_MAP:
            return LOCALE_MAP[candidate]
    return locale_code, locale_code


def _read_json_object(path: str) -> Dict[str, Any]:
    """Load *path* as JSON and require a top-level object."""
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    if not isinstance(data, dict):
        raise ValueError("JSON must be an object at top-level")
    return data


def main() -> int:
    """Command-line entry point.

    Returns the process exit status: 0 on success, 1 for usage errors, no
    locale files, or any locale that could not be read or written, and 2 when
    the input file, the l10n directory or the single-mode target file cannot
    be read.
    """
    ap = argparse.ArgumentParser(description="Translate ARB files using TranslateGemma")
    ap.add_argument("--in", dest="in_path", required=True, help="Input .arb file (source/template)")
    ap.add_argument("--out", dest="out_path", help="Output .arb file (required unless using --l10n-dir)")
    ap.add_argument("--to-locale", help="Target locale code (es, fr, de, etc.)")
    ap.add_argument("--l10n-dir", help="Directory with locale files (translates all locales)")
    ap.add_argument("--missing-only", action="store_true", help="Only translate missing keys")
    ap.add_argument("--model", default="translategemma:latest", help="Ollama model (translategemma:latest or specific versions)")
    ap.add_argument("--fallback-model", help="Fallback model for failed translations (e.g., translategemma:27b)")
    ap.add_argument("--host", default="http://localhost:11434", help="Ollama host")
    ap.add_argument("--timeout", type=float, default=120.0, help="HTTP timeout seconds")
    ap.add_argument("--temperature", type=float, default=0.0, help="Model temperature (0.0 for deterministic)")
    ap.add_argument("--concurrency", type=int, default=4, help="Parallel requests")
    ap.add_argument("--retries", type=int, default=2, help="Retries per string")
    ap.add_argument("--backoff", type=float, default=0.6, help="Backoff seconds base")
    ap.add_argument("--dry-run", action="store_true", help="Don't write output")
    ap.add_argument("--progress-every", type=int, default=1, help="Print progress every N strings")
    args = ap.parse_args()

    # Read source file
    try:
        with open(args.in_path, "r", encoding="utf-8") as f:
            source_data = json.load(f)
    except Exception as e:
        print(f"Failed to read input: {e}", file=sys.stderr)
        return 2

    if not isinstance(source_data, dict):
        print("Input JSON must be an object at top-level.", file=sys.stderr)
        return 2

    # Process all locales if --l10n-dir is provided
    if args.l10n_dir:
        try:
            locales = get_all_locale_files(args.l10n_dir, args.in_path)
        except OSError as e:
            print(f"Failed to list {args.l10n_dir}: {e}", file=sys.stderr)
            return 2
        if not locales:
            print(f"No locale files found in {args.l10n_dir}", file=sys.stderr)
            return 1

        print(f"Found {len(locales)} locale file(s) to process")

        total_translated = 0
        failed_locales: List[str] = []
        for locale_code, locale_path in locales:
            lang_name, lang_code = _language_for_locale(locale_code)

            try:
                target_data = _read_json_object(locale_path)
            except (OSError, ValueError) as e:
                print(f" [{locale_code}] Failed to read {locale_path}: {e}")
                failed_locales.append(locale_code)
                continue

            if args.missing_only:
                missing_keys = find_missing_keys(source_data, target_data)
                if not missing_keys:
                    print(f" [{locale_code}] No missing keys")
                    continue
                print(f" [{locale_code}] {len(missing_keys)} missing key(s)")
            else:
                missing_keys = None

            result = translate_locale(
                source_data=source_data,
                target_data=target_data,
                target_locale=locale_code,
                target_lang=lang_name,
                target_code=lang_code,
                out_path=locale_path,
                args=args,
                missing_keys=missing_keys,
            )
            if result < 0:
                # -1 signals a write failure; it must not be added to the total.
                failed_locales.append(locale_code)
            else:
                total_translated += result

        print(f"\nTotal: {total_translated} string(s) translated across {len(locales)} locale(s)")
        if failed_locales:
            print(f"Failed locale(s): {', '.join(failed_locales)}", file=sys.stderr)
            return 1
        return 0

    # Single locale mode
    if not args.out_path or not args.to_locale:
        print("--out and --to-locale are required when not using --l10n-dir", file=sys.stderr)
        return 1

    lang_name, lang_code = _language_for_locale(args.to_locale)

    # Read existing target file if --missing-only
    target_data: Dict[str, Any] = {}
    missing_keys: Optional[List[str]] = None
    if args.missing_only and os.path.exists(args.out_path):
        try:
            target_data = _read_json_object(args.out_path)
        except (OSError, ValueError) as e:
            print(f"Failed to read target file: {e}", file=sys.stderr)
            return 2

        missing_keys = find_missing_keys(source_data, target_data)
        if not missing_keys:
            print(f"No missing keys in {args.out_path}")
            return 0
        print(f"Found {len(missing_keys)} missing key(s) to translate")

    result = translate_locale(
        source_data=source_data,
        target_data=target_data,
        target_locale=args.to_locale,
        target_lang=lang_name,
        target_code=lang_code,
        out_path=args.out_path,
        args=args,
        missing_keys=missing_keys,
    )
    return 0 if result >= 0 else 1
|
||
|
||
|
||
# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|