mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
feat: replace index.json with index.jsonl flat JSONL format
Switch from hierarchical index.json to flat index.jsonl format for snapshot metadata storage. Each line is a self-contained JSON record with a 'type' field (Snapshot, ArchiveResult, Binary, Process). Changes: - Add JSONL_INDEX_FILENAME constant to constants.py - Add TYPE_PROCESS and TYPE_MACHINE to jsonl.py type constants - Add binary_to_jsonl(), process_to_jsonl(), machine_to_jsonl() converters - Add Snapshot.write_index_jsonl() to write new format - Add Snapshot.read_index_jsonl() to read new format - Add Snapshot.convert_index_json_to_jsonl() for migration - Update Snapshot.reconcile_with_index() to handle both formats - Update fs_migrate to convert during filesystem migration - Update load_from_directory/create_from_directory for both formats - Update legacy.py parse_json_links_details for JSONL support The new format is easier to parse, extend, and mix record types.
This commit is contained in:
@@ -28,8 +28,10 @@ TYPE_ARCHIVERESULT = 'ArchiveResult'
|
||||
# Record-type discriminators for the 'type' field of each JSONL record.
TYPE_TAG = 'Tag'
TYPE_CRAWL = 'Crawl'
TYPE_BINARY = 'Binary'
TYPE_PROCESS = 'Process'
TYPE_MACHINE = 'Machine'

# All record types accepted when parsing a JSONL line; anything else is rejected.
# NOTE: the diff residue contained both the old and new definitions of this set;
# only the updated one (including Process/Machine) is kept.
VALID_TYPES = {TYPE_SNAPSHOT, TYPE_ARCHIVERESULT, TYPE_TAG, TYPE_CRAWL, TYPE_BINARY, TYPE_PROCESS, TYPE_MACHINE}
def parse_line(line: str) -> Optional[Dict[str, Any]]:
|
||||
@@ -227,6 +229,64 @@ def crawl_to_jsonl(crawl) -> Dict[str, Any]:
|
||||
}
|
||||
|
||||
|
||||
def binary_to_jsonl(binary) -> Dict[str, Any]:
    """
    Serialize a Binary model instance into a flat JSONL record dict.

    The record carries a 'type' discriminator so that mixed-type .jsonl
    index files can be parsed line-by-line.
    """
    record: Dict[str, Any] = {
        'type': TYPE_BINARY,
        'id': str(binary.id),
        'machine_id': str(binary.machine_id),
    }
    # Remaining fields are copied verbatim from the model, in a fixed order.
    for field_name in ('name', 'binprovider', 'abspath', 'version', 'sha256', 'status'):
        record[field_name] = getattr(binary, field_name)
    return record
|
||||
|
||||
|
||||
def process_to_jsonl(process) -> Dict[str, Any]:
    """
    Convert a Process model instance to a JSONL record.

    Core fields are always emitted; optional fields (binary_id, pid,
    timeout) are included only when they are set on the model.
    Timestamps are serialized as ISO-8601 strings (or None when unset).
    """
    record = {
        'type': TYPE_PROCESS,
        'id': str(process.id),
        'machine_id': str(process.machine_id),
        'cmd': process.cmd,
        'pwd': process.pwd,
        'status': process.status,
        'exit_code': process.exit_code,
        'started_at': process.started_at.isoformat() if process.started_at else None,
        'ended_at': process.ended_at.isoformat() if process.ended_at else None,
    }
    # Include optional fields only when set.
    # Use `is not None` (not truthiness) so legitimate falsy values survive:
    # pid=0 or timeout=0 would otherwise be silently dropped from the record.
    if process.binary_id is not None:
        record['binary_id'] = str(process.binary_id)
    if process.pid is not None:
        record['pid'] = process.pid
    if process.timeout is not None:
        record['timeout'] = process.timeout
    return record
|
||||
|
||||
|
||||
def machine_to_jsonl(machine) -> Dict[str, Any]:
    """
    Serialize a Machine model instance into a flat JSONL record dict.
    """
    record: Dict[str, Any] = {
        'type': TYPE_MACHINE,
        'id': str(machine.id),
        'guid': machine.guid,
        'hostname': machine.hostname,
    }
    # OS identification fields, copied verbatim in a fixed order.
    for field_name in ('os_arch', 'os_family', 'os_platform', 'os_release'):
        record[field_name] = getattr(machine, field_name)
    return record
|
||||
|
||||
|
||||
def process_records(
|
||||
records: Iterator[Dict[str, Any]],
|
||||
handlers: Dict[str, Callable[[Dict[str, Any]], Optional[Dict[str, Any]]]]
|
||||
|
||||
@@ -58,9 +58,10 @@ def parse_json_main_index(out_dir: Path) -> Iterator[SnapshotDict]:
|
||||
|
||||
def parse_json_links_details(out_dir: Path) -> Iterator[SnapshotDict]:
|
||||
"""
|
||||
Parse links from individual snapshot index.json files in archive directories.
|
||||
Parse links from individual snapshot index.jsonl/index.json files in archive directories.
|
||||
|
||||
Walks through archive/*/index.json files to discover orphaned snapshots.
|
||||
Walks through archive/*/index.jsonl and archive/*/index.json files to discover orphaned snapshots.
|
||||
Prefers index.jsonl (new format) over index.json (legacy format).
|
||||
"""
|
||||
from archivebox.config import CONSTANTS
|
||||
|
||||
@@ -72,19 +73,36 @@ def parse_json_links_details(out_dir: Path) -> Iterator[SnapshotDict]:
|
||||
if not entry.is_dir():
|
||||
continue
|
||||
|
||||
index_file = Path(entry.path) / 'index.json'
|
||||
if not index_file.exists():
|
||||
continue
|
||||
# Try index.jsonl first (new format)
|
||||
jsonl_file = Path(entry.path) / CONSTANTS.JSONL_INDEX_FILENAME
|
||||
json_file = Path(entry.path) / CONSTANTS.JSON_INDEX_FILENAME
|
||||
|
||||
try:
|
||||
with open(index_file, 'r', encoding='utf-8') as f:
|
||||
link = json.load(f)
|
||||
link = None
|
||||
|
||||
if jsonl_file.exists():
|
||||
try:
|
||||
with open(jsonl_file, 'r', encoding='utf-8') as f:
|
||||
for line in f:
|
||||
line = line.strip()
|
||||
if line.startswith('{'):
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'Snapshot':
|
||||
link = record
|
||||
break
|
||||
except (json.JSONDecodeError, KeyError, TypeError):
|
||||
pass
|
||||
|
||||
elif json_file.exists():
|
||||
try:
|
||||
with open(json_file, 'r', encoding='utf-8') as f:
|
||||
link = json.load(f)
|
||||
except (json.JSONDecodeError, KeyError, TypeError):
|
||||
pass
|
||||
|
||||
if link:
|
||||
yield {
|
||||
'url': link.get('url', ''),
|
||||
'timestamp': link.get('timestamp', entry.name),
|
||||
'title': link.get('title'),
|
||||
'tags': link.get('tags', ''),
|
||||
}
|
||||
except (json.JSONDecodeError, KeyError, TypeError):
|
||||
continue
|
||||
|
||||
Reference in New Issue
Block a user