mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
feat: replace index.json with index.jsonl flat JSONL format
Switch from hierarchical index.json to flat index.jsonl format for snapshot metadata storage. Each line is a self-contained JSON record with a 'type' field (Snapshot, ArchiveResult, Binary, Process). Changes: - Add JSONL_INDEX_FILENAME constant to constants.py - Add TYPE_PROCESS and TYPE_MACHINE to jsonl.py type constants - Add binary_to_jsonl(), process_to_jsonl(), machine_to_jsonl() converters - Add Snapshot.write_index_jsonl() to write new format - Add Snapshot.read_index_jsonl() to read new format - Add Snapshot.convert_index_json_to_jsonl() for migration - Update Snapshot.reconcile_with_index() to handle both formats - Update fs_migrate to convert during filesystem migration - Update load_from_directory/create_from_directory for both formats - Update legacy.py parse_json_links_details for JSONL support The new format is easier to parse, extend, and mix record types.
This commit is contained in:
@@ -28,8 +28,10 @@ TYPE_ARCHIVERESULT = 'ArchiveResult'
|
||||
# Record-type discriminators for the 'type' field of each JSONL record.
TYPE_TAG = 'Tag'
TYPE_CRAWL = 'Crawl'
TYPE_BINARY = 'Binary'
TYPE_PROCESS = 'Process'
TYPE_MACHINE = 'Machine'

# All record types accepted when parsing a JSONL line; anything else is rejected.
# NOTE: the diff residue contained both the old and new definitions of this set;
# only the updated one (including Process/Machine) is kept.
VALID_TYPES = {TYPE_SNAPSHOT, TYPE_ARCHIVERESULT, TYPE_TAG, TYPE_CRAWL, TYPE_BINARY, TYPE_PROCESS, TYPE_MACHINE}
def parse_line(line: str) -> Optional[Dict[str, Any]]:
|
||||
@@ -227,6 +229,64 @@ def crawl_to_jsonl(crawl) -> Dict[str, Any]:
|
||||
}
|
||||
|
||||
|
||||
def binary_to_jsonl(binary) -> Dict[str, Any]:
    """
    Serialize a Binary model instance into a flat JSONL record dict.

    The record carries a 'type' discriminator so that mixed-type .jsonl
    index files can be parsed line-by-line.
    """
    record: Dict[str, Any] = {
        'type': TYPE_BINARY,
        'id': str(binary.id),
        'machine_id': str(binary.machine_id),
    }
    # Remaining fields are copied verbatim from the model, in a fixed order.
    for field_name in ('name', 'binprovider', 'abspath', 'version', 'sha256', 'status'):
        record[field_name] = getattr(binary, field_name)
    return record
|
||||
|
||||
|
||||
def process_to_jsonl(process) -> Dict[str, Any]:
    """
    Convert a Process model instance to a JSONL record.

    Core fields are always emitted; optional fields (binary_id, pid,
    timeout) are included only when they are set on the model.
    Timestamps are serialized as ISO-8601 strings (or None when unset).
    """
    record = {
        'type': TYPE_PROCESS,
        'id': str(process.id),
        'machine_id': str(process.machine_id),
        'cmd': process.cmd,
        'pwd': process.pwd,
        'status': process.status,
        'exit_code': process.exit_code,
        'started_at': process.started_at.isoformat() if process.started_at else None,
        'ended_at': process.ended_at.isoformat() if process.ended_at else None,
    }
    # Include optional fields only when set.
    # Use `is not None` (not truthiness) so legitimate falsy values survive:
    # pid=0 or timeout=0 would otherwise be silently dropped from the record.
    if process.binary_id is not None:
        record['binary_id'] = str(process.binary_id)
    if process.pid is not None:
        record['pid'] = process.pid
    if process.timeout is not None:
        record['timeout'] = process.timeout
    return record
|
||||
|
||||
|
||||
def machine_to_jsonl(machine) -> Dict[str, Any]:
    """
    Serialize a Machine model instance into a flat JSONL record dict.
    """
    record: Dict[str, Any] = {
        'type': TYPE_MACHINE,
        'id': str(machine.id),
        'guid': machine.guid,
        'hostname': machine.hostname,
    }
    # OS identification fields, copied verbatim in a fixed order.
    for field_name in ('os_arch', 'os_family', 'os_platform', 'os_release'):
        record[field_name] = getattr(machine, field_name)
    return record
|
||||
|
||||
|
||||
def process_records(
|
||||
records: Iterator[Dict[str, Any]],
|
||||
handlers: Dict[str, Callable[[Dict[str, Any]], Optional[Dict[str, Any]]]]
|
||||
|
||||
@@ -58,9 +58,10 @@ def parse_json_main_index(out_dir: Path) -> Iterator[SnapshotDict]:
|
||||
|
||||
def parse_json_links_details(out_dir: Path) -> Iterator[SnapshotDict]:
|
||||
"""
|
||||
Parse links from individual snapshot index.json files in archive directories.
|
||||
Parse links from individual snapshot index.jsonl/index.json files in archive directories.
|
||||
|
||||
Walks through archive/*/index.json files to discover orphaned snapshots.
|
||||
Walks through archive/*/index.jsonl and archive/*/index.json files to discover orphaned snapshots.
|
||||
Prefers index.jsonl (new format) over index.json (legacy format).
|
||||
"""
|
||||
from archivebox.config import CONSTANTS
|
||||
|
||||
@@ -72,19 +73,36 @@ def parse_json_links_details(out_dir: Path) -> Iterator[SnapshotDict]:
|
||||
if not entry.is_dir():
|
||||
continue
|
||||
|
||||
index_file = Path(entry.path) / 'index.json'
|
||||
if not index_file.exists():
|
||||
continue
|
||||
# Try index.jsonl first (new format)
|
||||
jsonl_file = Path(entry.path) / CONSTANTS.JSONL_INDEX_FILENAME
|
||||
json_file = Path(entry.path) / CONSTANTS.JSON_INDEX_FILENAME
|
||||
|
||||
try:
|
||||
with open(index_file, 'r', encoding='utf-8') as f:
|
||||
link = json.load(f)
|
||||
link = None
|
||||
|
||||
if jsonl_file.exists():
|
||||
try:
|
||||
with open(jsonl_file, 'r', encoding='utf-8') as f:
|
||||
for line in f:
|
||||
line = line.strip()
|
||||
if line.startswith('{'):
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'Snapshot':
|
||||
link = record
|
||||
break
|
||||
except (json.JSONDecodeError, KeyError, TypeError):
|
||||
pass
|
||||
|
||||
elif json_file.exists():
|
||||
try:
|
||||
with open(json_file, 'r', encoding='utf-8') as f:
|
||||
link = json.load(f)
|
||||
except (json.JSONDecodeError, KeyError, TypeError):
|
||||
pass
|
||||
|
||||
if link:
|
||||
yield {
|
||||
'url': link.get('url', ''),
|
||||
'timestamp': link.get('timestamp', entry.name),
|
||||
'title': link.get('title'),
|
||||
'tags': link.get('tags', ''),
|
||||
}
|
||||
except (json.JSONDecodeError, KeyError, TypeError):
|
||||
continue
|
||||
|
||||
Reference in New Issue
Block a user