feat: replace index.json with index.jsonl flat JSONL format

Switch from hierarchical index.json to flat index.jsonl format for
snapshot metadata storage. Each line is a self-contained JSON record
with a 'type' field (Snapshot, ArchiveResult, Binary, Process).

Changes:
- Add JSONL_INDEX_FILENAME constant to constants.py
- Add TYPE_PROCESS and TYPE_MACHINE to jsonl.py type constants
- Add binary_to_jsonl(), process_to_jsonl(), machine_to_jsonl() converters
- Add Snapshot.write_index_jsonl() to write new format
- Add Snapshot.read_index_jsonl() to read new format
- Add Snapshot.convert_index_json_to_jsonl() for migration
- Update Snapshot.reconcile_with_index() to handle both formats
- Update fs_migrate to convert during filesystem migration
- Update load_from_directory/create_from_directory for both formats
- Update legacy.py parse_json_links_details for JSONL support

The new format is easier to parse and extend, and allows mixing multiple record types in a single file.
This commit is contained in:
Claude
2025-12-30 18:21:06 +00:00
parent 96ee1bf686
commit d36079829b
4 changed files with 359 additions and 44 deletions

View File

@@ -28,8 +28,10 @@ TYPE_ARCHIVERESULT = 'ArchiveResult'
TYPE_TAG = 'Tag'
TYPE_CRAWL = 'Crawl'
TYPE_BINARY = 'Binary'
TYPE_PROCESS = 'Process'
TYPE_MACHINE = 'Machine'
VALID_TYPES = {TYPE_SNAPSHOT, TYPE_ARCHIVERESULT, TYPE_TAG, TYPE_CRAWL, TYPE_BINARY}
VALID_TYPES = {TYPE_SNAPSHOT, TYPE_ARCHIVERESULT, TYPE_TAG, TYPE_CRAWL, TYPE_BINARY, TYPE_PROCESS, TYPE_MACHINE}
def parse_line(line: str) -> Optional[Dict[str, Any]]:
@@ -227,6 +229,64 @@ def crawl_to_jsonl(crawl) -> Dict[str, Any]:
}
def binary_to_jsonl(binary) -> Dict[str, Any]:
    """
    Serialize a Binary model instance into a flat JSONL record.

    The record carries a 'type' discriminator so it can be mixed with
    other record kinds in the same index.jsonl file.
    """
    record: Dict[str, Any] = {
        'type': TYPE_BINARY,
        'id': str(binary.id),
        'machine_id': str(binary.machine_id),
    }
    # remaining fields are copied verbatim from the model, in stable order
    for attr in ('name', 'binprovider', 'abspath', 'version', 'sha256', 'status'):
        record[attr] = getattr(binary, attr)
    return record
def process_to_jsonl(process) -> Dict[str, Any]:
    """
    Convert a Process model instance to a JSONL record.

    Always-present fields are emitted first; optional fields (binary_id,
    pid, timeout) are appended only when set, keeping records compact.

    Fix: optional fields are tested with ``is not None`` instead of
    truthiness, so legitimate zero values (e.g. ``timeout=0``) are no
    longer silently dropped from the record.
    """
    record = {
        'type': TYPE_PROCESS,
        'id': str(process.id),
        'machine_id': str(process.machine_id),
        'cmd': process.cmd,
        'pwd': process.pwd,
        'status': process.status,
        'exit_code': process.exit_code,
        # ISO-8601 strings; None when the process has not started/ended yet
        'started_at': process.started_at.isoformat() if process.started_at else None,
        'ended_at': process.ended_at.isoformat() if process.ended_at else None,
    }
    # Include optional fields only when set (None means "absent", but
    # falsy non-None values like 0 are still meaningful and kept).
    if process.binary_id is not None:
        record['binary_id'] = str(process.binary_id)
    if process.pid is not None:
        record['pid'] = process.pid
    if process.timeout is not None:
        record['timeout'] = process.timeout
    return record
def machine_to_jsonl(machine) -> Dict[str, Any]:
    """
    Serialize a Machine model instance into a flat JSONL record.

    Only the id needs string coercion; the OS/host fields are copied
    verbatim from the model.
    """
    record: Dict[str, Any] = {
        'type': TYPE_MACHINE,
        'id': str(machine.id),
    }
    for attr in ('guid', 'hostname', 'os_arch', 'os_family', 'os_platform', 'os_release'):
        record[attr] = getattr(machine, attr)
    return record
def process_records(
records: Iterator[Dict[str, Any]],
handlers: Dict[str, Callable[[Dict[str, Any]], Optional[Dict[str, Any]]]]

View File

@@ -58,9 +58,10 @@ def parse_json_main_index(out_dir: Path) -> Iterator[SnapshotDict]:
def parse_json_links_details(out_dir: Path) -> Iterator[SnapshotDict]:
"""
Parse links from individual snapshot index.json files in archive directories.
Parse links from individual snapshot index.jsonl/index.json files in archive directories.
Walks through archive/*/index.json files to discover orphaned snapshots.
Walks through archive/*/index.jsonl and archive/*/index.json files to discover orphaned snapshots.
Prefers index.jsonl (new format) over index.json (legacy format).
"""
from archivebox.config import CONSTANTS
@@ -72,19 +73,36 @@ def parse_json_links_details(out_dir: Path) -> Iterator[SnapshotDict]:
if not entry.is_dir():
continue
index_file = Path(entry.path) / 'index.json'
if not index_file.exists():
continue
# Try index.jsonl first (new format)
jsonl_file = Path(entry.path) / CONSTANTS.JSONL_INDEX_FILENAME
json_file = Path(entry.path) / CONSTANTS.JSON_INDEX_FILENAME
try:
with open(index_file, 'r', encoding='utf-8') as f:
link = json.load(f)
link = None
if jsonl_file.exists():
try:
with open(jsonl_file, 'r', encoding='utf-8') as f:
for line in f:
line = line.strip()
if line.startswith('{'):
record = json.loads(line)
if record.get('type') == 'Snapshot':
link = record
break
except (json.JSONDecodeError, KeyError, TypeError):
pass
elif json_file.exists():
try:
with open(json_file, 'r', encoding='utf-8') as f:
link = json.load(f)
except (json.JSONDecodeError, KeyError, TypeError):
pass
if link:
yield {
'url': link.get('url', ''),
'timestamp': link.get('timestamp', entry.name),
'title': link.get('title'),
'tags': link.get('tags', ''),
}
except (json.JSONDecodeError, KeyError, TypeError):
continue