Training: Optimize _peek_json_keys to avoid loading entire file into memory

This commit is contained in:
oobabooga 2026-03-06 15:39:08 -08:00
parent 2beaa4b971
commit d48b53422f

View file

@@ -298,11 +298,38 @@ def get_text_datasets(path: str):
def _peek_json_keys(filepath):
    """Return the keys of the first object in a JSON array file.

    Streams the file in 8 KB chunks (capped at 1 MB) instead of loading
    the whole file with ``json.load``, so very large datasets can be
    inspected cheaply. Best effort: any I/O or parse problem yields an
    empty set rather than raising.

    Parameters:
        filepath: path to a UTF-8 JSON file expected to contain an array
            whose first element is an object.

    Returns:
        set: keys of the first object, or an empty set if the file does
        not start with an array whose first element is an object (or
        cannot be read at all).
    """
    import json
    decoder = json.JSONDecoder()
    WS = ' \t\n\r'
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            buf = ''
            obj_start = None
            while len(buf) < 1 << 20:  # Read up to 1MB
                chunk = f.read(8192)
                if not chunk:
                    break
                buf += chunk
                if obj_start is None:
                    # Skip leading whitespace to find the opening '['.
                    idx = 0
                    while idx < len(buf) and buf[idx] in WS:
                        idx += 1
                    if idx >= len(buf):
                        continue  # Only whitespace so far; need more data.
                    if buf[idx] != '[':
                        return set()  # Not a JSON array file.
                    idx += 1
                    # Skip whitespace between '[' and the first element.
                    while idx < len(buf) and buf[idx] in WS:
                        idx += 1
                    if idx >= len(buf):
                        continue  # '[' seen but first element not buffered yet.
                    obj_start = idx
                try:
                    # Decode just the first array element in place; raises
                    # JSONDecodeError while the element is still incomplete,
                    # in which case we read another chunk and retry.
                    obj, _ = decoder.raw_decode(buf, obj_start)
                    if isinstance(obj, dict):
                        return set(obj.keys())
                    return set()
                except json.JSONDecodeError:
                    continue
    except Exception:
        # Best effort: unreadable or invalid files just report no keys.
        pass
    return set()