Mirror of https://github.com/zjs81/meshcore-open.git (synced 2026-04-20 22:13:48 +00:00)
- Introduced a new extension for localization in Flutter with `LocalizationExtension` in `l10n.dart`.
- Added a Python script `translate.py` for translating ARB/JSON localization files using a local Ollama model, preserving keys and placeholders, and handling ICU format rules.
723 lines · 28 KiB · Python
#!/usr/bin/env python3
"""
translate_arb_with_ollama.py

Translates ARB/JSON localization values using a local Ollama model, while:
- preserving keys
- skipping "@@locale" and all "@key" metadata blocks
- preserving placeholders like {deviceName}, {count, plural, ...}
- writing a new file with updated @@locale
- printing progress as it runs

Usage:
    python translate_arb_with_ollama.py \
        --in /home/zjs81/Desktop/meshcore-open/lib/l10n/app_en.arb \
        --out /home/zjs81/Desktop/meshcore-open/lib/l10n/app_es.arb \
        --to-locale es \
        --model ministral-3:latest \
        --temperature 0 \
        --concurrency 4
"""

from __future__ import annotations

import argparse
import json
import re
import sys
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass
from typing import Any, Dict, List, Tuple, Optional
from urllib import request


# Simple placeholder like {name}, {count}, {deviceName}
SIMPLE_PLACEHOLDER_RE = re.compile(r"\{(\w+)\}")
# ICU plural/select variable name extraction: {count, plural, ...} or {gender, select, ...}
ICU_VAR_RE = re.compile(r"\{(\w+)\s*,\s*(?:plural|select|selectordinal)\s*,", re.IGNORECASE)
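# Illustrative matches: in "Hello {name}", SIMPLE_PLACEHOLDER_RE captures "name";
# in "{count, plural, =1{hop} other{hops}}", ICU_VAR_RE captures the variable
# name "count" (the plural text forms themselves are handled separately below).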


@dataclass
class OllamaConfig:
    host: str
    model: str
    timeout_s: float
    temperature: float
    num_ctx: int
    num_predict: int
    top_p: float


def http_post_json(url: str, payload: Dict[str, Any], timeout_s: float) -> Dict[str, Any]:
    data = json.dumps(payload).encode("utf-8")
    req = request.Request(
        url,
        data=data,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with request.urlopen(req, timeout=timeout_s) as resp:
        body = resp.read().decode("utf-8")
    return json.loads(body)
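
# Note: this assumes Ollama's non-streaming /api/generate returns a single JSON
# object whose "response" field holds the generated text; that is the only field
# ollama_generate() below relies on.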


def strip_markdown(s: str) -> str:
    """Remove common markdown formatting from output."""
    # Remove bold/italic markers
    s = re.sub(r'\*\*(.+?)\*\*', r'\1', s)
    s = re.sub(r'\*(.+?)\*', r'\1', s)
    s = re.sub(r'__(.+?)__', r'\1', s)
    s = re.sub(r'_(.+?)_', r'\1', s)
    # Remove stray asterisks
    s = re.sub(r'\*+', '', s)
    return s.strip()
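
# Illustrative example: strip_markdown("**Guardar** archivo") -> "Guardar archivo".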


def ollama_generate(cfg: OllamaConfig, prompt: str) -> str:
    url = cfg.host.rstrip("/") + "/api/generate"
    payload = {
        "model": cfg.model,
        "prompt": prompt,
        "stream": False,
        "options": {
            "temperature": cfg.temperature,
            "num_ctx": cfg.num_ctx,
            "num_predict": cfg.num_predict,
            "top_p": cfg.top_p,
        },
    }
    resp = http_post_json(url, payload, cfg.timeout_s)
    out = resp.get("response", "")
    # Clean up common LLM artifacts
    out = strip_markdown(out)
    return out.strip()


def extract_placeholder_names(s: str) -> List[str]:
    """Extract placeholder variable names (not the full braced expression).

    For '{name}' returns ['name']
    For '{count} {count, plural, =1{hop} other{hops}}' returns ['count']
    """
    names = set()
    # Get ICU variable names first
    for m in ICU_VAR_RE.finditer(s):
        names.add(m.group(1))
    # Get simple placeholders, but skip matches that are really ICU text forms
    # (e.g. the "{hop}" in "=1{hop}"); these cases are detected from the text
    # immediately before and after the match below.
    for m in SIMPLE_PLACEHOLDER_RE.finditer(s):
        name = m.group(1)
        # Check if this appears as a simple {name} placeholder (not inside ICU)
        # by looking at what comes after it
        full_match = m.group(0)
        pos = m.start()
        # Look for pattern like {name, plural/select - if found, skip (handled by ICU_VAR_RE)
        rest = s[pos:]
        if re.match(r"\{\w+\s*,\s*(?:plural|select|selectordinal)", rest, re.IGNORECASE):
            continue
        # Check if this is likely a text form inside ICU (preceded by =X{ or other{)
        before = s[:pos]
        if re.search(r"(?:=\d+|zero|one|two|few|many|other)\s*$", before, re.IGNORECASE):
            continue  # This is a text form like "=1{hop}", skip it
        names.add(name)
    return sorted(names)
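
# Illustrative example:
#   extract_placeholder_names("{n} of {count, plural, =1{hop} other{hops}}")
#   -> ["count", "n"]  ("hop"/"hops" are ICU text forms, not placeholders)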


def build_prompt(text: str, target_lang: str, placeholder_names: List[str], has_icu: bool, ask_confidence: bool = False) -> str:
    preserve_list = "\n".join(f"- {{{t}}}" for t in placeholder_names) if placeholder_names else "- (none)"

    icu_note = ""
    if has_icu:
        icu_note = (
            "ICU FORMAT RULES:\n"
            f"- This text uses ICU plural/select format: {{var, plural, =1{{singular}} other{{plural}}}}\n"
            "- Keep structure keywords EXACTLY: plural, select, =0, =1, =2, zero, one, two, few, many, other\n"
            f"- TRANSLATE the words inside each form to {target_lang}\n"
            "- Example: =1{item} other{items} -> translate 'item'/'items' but keep =1{{ }} other{{ }} structure\n\n"
        )

    if ask_confidence:
        return (
            f"Translate this UI string to {target_lang}.\n\n"
            "RULES:\n"
            "- Placeholders like {name}, {count} must appear EXACTLY unchanged.\n"
            "- Use infinitive verb forms for buttons (Save, Delete, etc.).\n"
            f"- Use natural {target_lang} word order.\n"
            "- Keep brand names and technical terms unchanged.\n\n"
            f"{icu_note}"
            f"Placeholders: {', '.join(f'{{{t}}}' for t in placeholder_names) if placeholder_names else 'none'}\n\n"
            f"English: {text}\n\n"
            "Respond with EXACTLY two lines:\n"
            "1. The translation (no quotes)\n"
            "2. Confidence score 1-5 (5=certain, 1=unsure)\n\n"
            "Example response:\n"
            "Guardar archivo\n"
            "5"
        )
    else:
        return (
            f"Translate this UI string to {target_lang}. Return ONLY the translation.\n\n"
            "RULES:\n"
            "- Output the translated text ONLY. No markdown, no quotes, no explanations.\n"
            "- Placeholders like {name}, {count} must appear EXACTLY unchanged.\n"
            "- Use infinitive verb forms for buttons (Save, Delete, etc.).\n"
            f"- Use natural {target_lang} word order.\n"
            "- Keep brand names and technical terms unchanged.\n"
            "- Translation length should be similar to the original.\n\n"
            f"{icu_note}"
            f"Placeholders: {', '.join(f'{{{t}}}' for t in placeholder_names) if placeholder_names else 'none'}\n\n"
            f"English: {text}\n"
            f"{target_lang}:"
        )


def parse_confidence_response(response: str) -> Tuple[str, int]:
    """Parse response with translation and confidence score.

    Returns (translation, confidence) where confidence is 1-5, or 0 if unparseable.
    """
    lines = response.strip().split('\n')
    if len(lines) >= 2:
        translation = '\n'.join(lines[:-1]).strip()  # All but last line
        try:
            # Try to extract number from last line
            last_line = lines[-1].strip()
            # Handle formats like "5", "5/5", "Confidence: 5"
            match = re.search(r'\b([1-5])\b', last_line)
            if match:
                confidence = int(match.group(1))
                return translation, confidence
        except ValueError:
            pass
    # Fallback: treat whole response as translation with unknown confidence
    return strip_markdown(response), 0
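
# Illustrative examples: parse_confidence_response("Guardar archivo\n5") returns
# ("Guardar archivo", 5); a reply without a trailing score, e.g. "Guardar archivo",
# falls back to ("Guardar archivo", 0).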


def looks_like_translation_failed(src: str, out: str) -> bool:
    if not out:
        return True
    if src.strip() == out.strip() and len(src.strip()) > 8:
        return True
    # Detect hallucination: output much longer than input (3x+ for short strings, 2x for longer)
    src_len = len(src.strip())
    out_len = len(out.strip())
    if src_len < 50 and out_len > src_len * 3:
        return True
    if src_len >= 50 and out_len > src_len * 2:
        return True
    return False
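
# Illustrative examples: an empty reply, a reply identical to a source longer than
# 8 characters, or a 40-character reply to a 10-character source (over the 3x limit)
# are all treated as failures.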


def has_icu_block(s: str) -> bool:
    """Check if string contains ICU plural/select block."""
    return bool(ICU_VAR_RE.search(s))


def validate_preserved_tokens(src: str, out: str) -> Tuple[bool, Optional[str]]:
    """Validate that placeholder names are preserved in translation."""
    src_names = extract_placeholder_names(src)

    # Check each placeholder name appears in output
    for name in src_names:
        # Look for {name} or {name, plural/select...}
        pattern = r"\{" + re.escape(name) + r"(?:\}|\s*,)"
        if not re.search(pattern, out):
            return False, f"Missing placeholder: {{{name}}}"

    # If source has ICU block, output should too
    if has_icu_block(src) and not has_icu_block(out):
        return False, "ICU plural/select block missing in output"

    return True, None
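
# Illustrative example: validate_preserved_tokens("{count} hops", "saltos")
# -> (False, "Missing placeholder: {count}") because the output dropped {count}.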


def compute_confidence(src: str, out: str) -> Tuple[float, List[str]]:
    """
    Compute confidence score (0.0 to 1.0) for a translation.
    Returns (score, list of issues).
    """
    issues = []
    score = 1.0

    src_len = len(src.strip())
    out_len = len(out.strip())

    # Length ratio check
    if src_len > 0:
        ratio = out_len / src_len
        if ratio < 0.3:  # Way too short
            score -= 0.4
            issues.append("too_short")
        elif ratio < 0.5:
            score -= 0.2
            issues.append("short")
        elif ratio > 2.5:  # Way too long
            score -= 0.4
            issues.append("too_long")
        elif ratio > 1.8:
            score -= 0.2
            issues.append("long")

    # Contains question mark when source doesn't (model asking questions)
    if '?' in out and '?' not in src:
        score -= 0.3
        issues.append("added_question")

    # Contains common LLM artifacts
    artifacts = ['```', '**', 'translation:', 'here is', 'certainly', 'i can', 'i\'ll']
    out_lower = out.lower()
    for artifact in artifacts:
        if artifact in out_lower:
            score -= 0.3
            issues.append(f"artifact:{artifact}")
            break

    # Output looks like it's in English still (common words)
    english_indicators = ['the ', ' is ', ' are ', ' was ', ' were ', ' have ', ' has ', 'you ', ' your ']
    english_count = sum(1 for ind in english_indicators if ind in out_lower)
    if english_count >= 3 and src_len > 20:
        score -= 0.3
        issues.append("likely_english")

    # Contains newlines when source doesn't
    if '\n' in out and '\n' not in src:
        score -= 0.2
        issues.append("added_newlines")

    # ICU/placeholder validation
    ok, _ = validate_preserved_tokens(src, out)
    if not ok:
        score -= 0.3
        issues.append("placeholder_error")

    return max(0.0, score), issues
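
# Illustrative scoring: starting from 1.0, a reply flagged "too_long" (-0.4) that
# also contains a code-fence artifact (-0.3) drops to 0.3, below the default 0.7
# threshold that translate_one() uses to trigger the fallback model.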


# Keys to skip translation (brand names)
SKIP_KEYS = {
    "appTitle",
}

# Manual translations for problematic strings (key -> {locale: translation})
MANUAL_TRANSLATIONS: Dict[str, Dict[str, str]] = {
    "repeater_daysHoursMinsSecs": {
        "es": "{days} días {hours}h {minutes}m {seconds}s",
        "fr": "{days} jours {hours}h {minutes}m {seconds}s",
        "de": "{days} Tage {hours}h {minutes}m {seconds}s",
        "it": "{days} giorni {hours}h {minutes}m {seconds}s",
        "pt": "{days} dias {hours}h {minutes}m {seconds}s",
        "pl": "{days} dni {hours}h {minutes}m {seconds}s",
        "sk": "{days} dní {hours}h {minutes}m {seconds}s",
        "sl": "{days} dni {hours}h {minutes}m {seconds}s",
        "cs": "{days} dní {hours}h {minutes}m {seconds}s",
        "ja": "{days}日 {hours}時間 {minutes}分 {seconds}秒",
        "ko": "{days}일 {hours}시간 {minutes}분 {seconds}초",
        "zh": "{days}天 {hours}小时 {minutes}分 {seconds}秒",
        "ru": "{days} дней {hours}ч {minutes}м {seconds}с",
        "bg": "{days} дни {hours}ч {minutes}м {seconds}с",
        "nl": "{days} dagen {hours}u {minutes}m {seconds}s",
        "sv": "{days} dagar {hours}t {minutes}m {seconds}s",
    },
}
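
# These entries bypass the model entirely: main() copies them into the output
# before any Ollama requests are made. To pin another string, add an entry such
# as "someKey": {"es": "..."} (hypothetical key, shown only as an example).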


def is_translatable_entry(key: str, value: Any) -> bool:
    if key == "@@locale":
        return False
    if key in SKIP_KEYS:
        return False
    if key.startswith("@"):
        return False
    if not isinstance(value, str):
        return False
    if value.strip() == "":
        return False
    return True
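
# Illustrative examples: "@homeTitle" (metadata) and "appTitle" (in SKIP_KEYS) are
# rejected, while a non-empty string entry like ("homeTitle", "Home") is accepted
# (key names here are hypothetical).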


def translate_one(
    key: str,
    text: str,
    target_lang: str,
    cfg: OllamaConfig,
    retries: int,
    backoff_s: float,
    fallback_cfg: Optional[OllamaConfig] = None,
    confidence_threshold: float = 0.7,
    model_confidence_threshold: int = 4,
    ask_model_confidence: bool = True,
) -> Tuple[str, str, Optional[str], bool]:
    """
    Translate a single string.
    Returns (key, translated_text, error_or_none, used_fallback_model).
    """
    placeholder_names = extract_placeholder_names(text)
    text_has_icu = has_icu_block(text)

    # Ask for confidence if we have a fallback model
    should_ask_confidence = ask_model_confidence and fallback_cfg and fallback_cfg.model != cfg.model
    prompt = build_prompt(text, target_lang, placeholder_names, text_has_icu, ask_confidence=should_ask_confidence)
    used_fallback = False

    last_err: Optional[str] = None
    for attempt in range(retries + 1):
        try:
            raw_out = ollama_generate(cfg, prompt)

            # Parse confidence if we asked for it
            if should_ask_confidence:
                out, model_confidence = parse_confidence_response(raw_out)
            else:
                out = raw_out
                model_confidence = 5  # Assume high confidence if not asked

            ok, why = validate_preserved_tokens(text, out)
            if not ok:
                last_err = f"Validation failed: {why}"
                # Retry without confidence format for simpler response
                prompt = build_prompt(text, target_lang, placeholder_names, text_has_icu, ask_confidence=False)
                prompt = (
                    prompt
                    + "\n\nIMPORTANT: You MUST keep every {...} segment exactly unchanged. "
                    "If you cannot, return the original text unchanged."
                )
                raise ValueError(last_err)

            if looks_like_translation_failed(text, out) and attempt < retries:
                last_err = "Output identical/suspicious; retrying"
                time.sleep(backoff_s * (attempt + 1))
                continue

            # Check if model reported low confidence - use fallback
            if model_confidence > 0 and model_confidence < model_confidence_threshold and fallback_cfg:
                fallback_prompt = build_prompt(text, target_lang, placeholder_names, text_has_icu, ask_confidence=False)
                fallback_out = ollama_generate(fallback_cfg, fallback_prompt)
                fallback_ok, _ = validate_preserved_tokens(text, fallback_out)
                if fallback_ok and not looks_like_translation_failed(text, fallback_out):
                    return key, fallback_out, None, True

            # Also check computed confidence and use fallback model if needed
            confidence, issues = compute_confidence(text, out)
            if confidence < confidence_threshold and fallback_cfg and fallback_cfg.model != cfg.model:
                # Low confidence - try with bigger model
                fallback_prompt = build_prompt(text, target_lang, placeholder_names, text_has_icu)
                fallback_out = ollama_generate(fallback_cfg, fallback_prompt)
                fallback_ok, _ = validate_preserved_tokens(text, fallback_out)
                fallback_conf, _ = compute_confidence(text, fallback_out)

                if fallback_ok and fallback_conf > confidence:
                    # Fallback is better
                    return key, fallback_out, None, True
                elif fallback_ok and not ok:
                    # Original failed validation but fallback passed
                    return key, fallback_out, None, True

            return key, out, None, used_fallback

        except Exception as e:
            last_err = str(e)
            if attempt < retries:
                time.sleep(backoff_s * (attempt + 1))
                continue

            # Last resort: try fallback model
            if fallback_cfg and fallback_cfg.model != cfg.model:
                try:
                    fallback_prompt = build_prompt(text, target_lang, placeholder_names, text_has_icu)
                    fallback_out = ollama_generate(fallback_cfg, fallback_prompt)
                    fallback_ok, _ = validate_preserved_tokens(text, fallback_out)
                    if fallback_ok and not looks_like_translation_failed(text, fallback_out):
                        return key, fallback_out, None, True
                except Exception:
                    pass

    return key, text, last_err, False  # fallback to original on failure
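
# Summary of the escalation order implemented above: primary model first, then the
# fallback model when the model's self-reported confidence or the computed
# confidence is low, then the fallback model once more after repeated exceptions,
# and finally the untouched English source text if everything fails.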


def fmt_duration(seconds: float) -> str:
    if seconds < 60:
        return f"{seconds:.1f}s"
    m = int(seconds // 60)
    s = seconds - 60 * m
    if m < 60:
        return f"{m}m {s:.0f}s"
    h = m // 60
    m2 = m % 60
    return f"{h}h {m2}m"


def main() -> int:
    ap = argparse.ArgumentParser()
    ap.add_argument("--in", dest="in_path", required=True, help="Input .arb/.json file path")
    ap.add_argument("--out", dest="out_path", required=True, help="Output .arb/.json file path")
    ap.add_argument("--to-locale", required=True, help="Target locale code, e.g. es, fr, de")
    ap.add_argument("--target-lang", default=None, help="Target language name for the model, e.g. Spanish (defaults from locale)")
    ap.add_argument("--model", default="gemma3:4b", help="Ollama model name")
    ap.add_argument("--fallback-model", default=None, help="Larger model to use for low-confidence translations")
    ap.add_argument("--confidence-threshold", type=float, default=0.7, help="Computed confidence threshold to trigger fallback (0.0-1.0)")
    ap.add_argument("--model-confidence-threshold", type=int, default=4, help="Model self-reported confidence threshold (1-5, use fallback if below)")
    ap.add_argument("--retry-model", default="ministral-3:latest", help="Model to use for end-of-run retries")
    ap.add_argument("--host", default="http://localhost:11434", help="Ollama host")
    ap.add_argument("--timeout", type=float, default=120.0, help="HTTP timeout seconds")
    ap.add_argument("--temperature", type=float, default=0.2, help="Model temperature")
    ap.add_argument("--num-ctx", type=int, default=4096, help="Context size")
    ap.add_argument("--num-predict", type=int, default=256, help="Max tokens to generate")
    ap.add_argument("--top-p", type=float, default=0.9, help="Top-p")
    ap.add_argument("--concurrency", type=int, default=4, help="Parallel requests")
    ap.add_argument("--retries", type=int, default=2, help="Retries per string")
    ap.add_argument("--backoff", type=float, default=0.6, help="Backoff seconds base")
    ap.add_argument("--dry-run", action="store_true", help="Do not write file; just print summary")
    ap.add_argument("--progress-every", type=int, default=1, help="Print progress every N completed strings (default: 1)")
    args = ap.parse_args()

    locale_map = {
        "es": "Spanish",
        "fr": "French",
        "de": "German",
        "it": "Italian",
        "pt": "Portuguese",
        "pt-BR": "Brazilian Portuguese",
        "ja": "Japanese",
        "ko": "Korean",
        "zh": "Chinese (Simplified)",
        "zh-Hant": "Chinese (Traditional)",
        "ru": "Russian",
        "uk": "Ukrainian",
        "ar": "Arabic",
        "hi": "Hindi",
        "tr": "Turkish",
        "nl": "Dutch",
        "sv": "Swedish",
        "no": "Norwegian",
        "da": "Danish",
        "fi": "Finnish",
        "pl": "Polish",
        "cs": "Czech",
        "sk": "Slovak",
        "sl": "Slovenian",
        "bg": "Bulgarian",
        "el": "Greek",
        "he": "Hebrew",
        "th": "Thai",
        "vi": "Vietnamese",
        "id": "Indonesian",
    }
    target_lang = args.target_lang or locale_map.get(args.to_locale, args.to_locale)
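    # Locales missing from the map above fall back to passing the raw code to the
    # model, e.g. --to-locale eu would use "eu" as the target language name
    # (illustrative; "eu" is not in locale_map).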

    try:
        with open(args.in_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except Exception as e:
        print(f"Failed to read input: {e}", file=sys.stderr)
        return 2

    if not isinstance(data, dict):
        print("Input JSON must be an object at top-level.", file=sys.stderr)
        return 2

    cfg = OllamaConfig(
        host=args.host,
        model=args.model,
        timeout_s=args.timeout,
        temperature=args.temperature,
        num_ctx=args.num_ctx,
        num_predict=args.num_predict,
        top_p=args.top_p,
    )

    # Fallback model for low-confidence translations
    fallback_cfg = None
    if args.fallback_model:
        fallback_cfg = OllamaConfig(
            host=args.host,
            model=args.fallback_model,
            timeout_s=args.timeout,
            temperature=args.temperature,
            num_ctx=args.num_ctx,
            num_predict=args.num_predict,
            top_p=args.top_p,
        )

    out_data: Dict[str, Any] = dict(data)
    out_data["@@locale"] = args.to_locale

    items: List[Tuple[str, str]] = [(k, v) for k, v in data.items() if is_translatable_entry(k, v)]

    # Apply manual translations first
    manual_count = 0
    items_to_translate: List[Tuple[str, str]] = []
    for k, v in items:
        if k in MANUAL_TRANSLATIONS and args.to_locale in MANUAL_TRANSLATIONS[k]:
            out_data[k] = MANUAL_TRANSLATIONS[k][args.to_locale]
            manual_count += 1
        else:
            items_to_translate.append((k, v))

    if manual_count > 0:
        print(f"Applied {manual_count} manual translation(s)")

    total = len(items_to_translate)
    if total == 0 and manual_count == 0:
        print("No translatable string entries found (excluding @@locale and @metadata).", file=sys.stderr)
        return 1

    if total == 0:
        print("All strings handled by manual translations.")
    else:
        fallback_info = f" (fallback: {args.fallback_model})" if args.fallback_model else ""
        print(f"Translating {total} strings -> {target_lang} using {cfg.model}{fallback_info} (concurrency={args.concurrency})")

    start = time.time()

    failures: List[Tuple[str, str]] = []
    translated_ok = manual_count  # Count manual translations as OK
    fallback_used = 0
    completed = 0

    # Build a lookup for original text by key
    items_dict: Dict[str, str] = dict(items_to_translate)

    # Submit all tasks up front
    if total > 0:
        with ThreadPoolExecutor(max_workers=max(1, args.concurrency)) as ex:
            future_to_key = {
                ex.submit(
                    translate_one,
                    key=k,
                    text=v,
                    target_lang=target_lang,
                    cfg=cfg,
                    retries=args.retries,
                    backoff_s=args.backoff,
                    fallback_cfg=fallback_cfg,
                    confidence_threshold=args.confidence_threshold,
                    model_confidence_threshold=args.model_confidence_threshold,
                    ask_model_confidence=bool(args.fallback_model),
                ): k
                for (k, v) in items_to_translate
            }

            for fut in as_completed(future_to_key):
                k, translated, err, used_fallback = fut.result()
                out_data[k] = translated

                completed += 1
                if err:
                    failures.append((k, err))
                    status = "FAIL"
                else:
                    translated_ok += 1
                    if used_fallback:
                        fallback_used += 1
                        status = "OK*"  # asterisk indicates fallback model was used
                    else:
                        status = "OK"

                if args.progress_every > 0 and (completed % args.progress_every == 0 or completed == total):
                    elapsed = time.time() - start
                    rate = completed / elapsed if elapsed > 0 else 0.0
                    remaining = (total - completed) / rate if rate > 0 else 0.0
                    # Keep it single-line friendly but readable.
                    print(
                        f"[{completed:>4}/{total}] {status:<4} {k} | "
                        f"elapsed {fmt_duration(elapsed)} | ETA {fmt_duration(remaining)}"
                    )
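                    # The resulting line looks roughly like (values illustrative):
                    #   [  12/345] OK*  someContactKey | elapsed 1m 3s | ETA 28m 10s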

    elapsed = time.time() - start
    fallback_msg = f", used_fallback_model={fallback_used}" if fallback_used > 0 else ""
    print(f"Done in {fmt_duration(elapsed)}. OK={translated_ok}{fallback_msg}, errors={len(failures)}")

    # Retry failed translations at the end with increasing temperature
    retry_round = 1
    max_end_retries = 3
    retry_model = args.retry_model
    while failures and retry_round <= max_end_retries:
        # Increase temperature for each retry round
        retry_temp = min(cfg.temperature + (0.2 * retry_round), 1.0)
        print(f"\n--- Retry round {retry_round}/{max_end_retries} for {len(failures)} failed key(s) (model={retry_model}, temp={retry_temp:.1f}) ---")
        retry_items = [(k, items_dict[k]) for k, _ in failures]
        failures = []
        retry_completed = 0
        retry_total = len(retry_items)
        retry_start = time.time()

        # Create config with higher temperature (and optionally different model) for retries
        retry_cfg = OllamaConfig(
            host=cfg.host,
            model=retry_model,
            timeout_s=cfg.timeout_s,
            temperature=retry_temp,
            num_ctx=cfg.num_ctx,
            num_predict=cfg.num_predict,
            top_p=cfg.top_p,
        )

        with ThreadPoolExecutor(max_workers=max(1, args.concurrency)) as ex:
            future_to_key = {
                ex.submit(
                    translate_one,
                    key=k,
                    text=v,
                    target_lang=target_lang,
                    cfg=retry_cfg,
                    retries=args.retries,
                    backoff_s=args.backoff,
                ): k
                for (k, v) in retry_items
            }

            for fut in as_completed(future_to_key):
                k, translated, err, used_fb = fut.result()
                out_data[k] = translated

                retry_completed += 1
                if err:
                    failures.append((k, err))
                    status = "FAIL"
                else:
                    translated_ok += 1
                    status = "OK"

                if args.progress_every > 0 and (retry_completed % args.progress_every == 0 or retry_completed == retry_total):
                    elapsed = time.time() - retry_start
                    rate = retry_completed / elapsed if elapsed > 0 else 0.0
                    remaining = (retry_total - retry_completed) / rate if rate > 0 else 0.0
                    print(
                        f"[{retry_completed:>4}/{retry_total}] {status:<4} {k} | "
                        f"elapsed {fmt_duration(elapsed)} | ETA {fmt_duration(remaining)}"
                    )

        retry_elapsed = time.time() - retry_start
        print(f"Retry round {retry_round} done in {fmt_duration(retry_elapsed)}. Remaining failures: {len(failures)}")
        retry_round += 1

    total_elapsed = time.time() - start
    print(f"\nTotal time: {fmt_duration(total_elapsed)}. OK={translated_ok}, final fallback={len(failures)}")

    if failures:
        print("Fallback keys (kept original English due to errors):")
        for k, err in failures[:60]:
            print(f"  - {k}: {err}")
        if len(failures) > 60:
            print(f"  ... and {len(failures) - 60} more")

    if args.dry_run:
        print("Dry run: not writing output file.")
        return 0

    try:
        with open(args.out_path, "w", encoding="utf-8") as f:
            json.dump(out_data, f, ensure_ascii=False, indent=2)
            f.write("\n")
    except Exception as e:
        print(f"Failed to write output: {e}", file=sys.stderr)
        return 2

    print(f"Wrote: {args.out_path}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())