mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
169 lines
4.9 KiB
Python
169 lines
4.9 KiB
Python
"""
|
|
JSONL (JSON Lines) utilities for ArchiveBox.
|
|
|
|
Provides functions for reading, writing, and processing typed JSONL records.
|
|
All CLI commands that accept stdin can read both plain URLs and typed JSONL.
|
|
|
|
CLI Pipeline:
|
|
archivebox crawl URL -> {"type": "Crawl", "id": "...", "urls": "...", ...}
|
|
archivebox snapshot -> {"type": "Snapshot", "id": "...", "url": "...", ...}
|
|
archivebox extract -> {"type": "ArchiveResult", "id": "...", "snapshot_id": "...", ...}
|
|
|
|
Typed JSONL Format:
|
|
{"type": "Crawl", "id": "...", "urls": "...", "max_depth": 0, ...}
|
|
{"type": "Snapshot", "id": "...", "url": "https://example.com", "title": "...", ...}
|
|
{"type": "ArchiveResult", "id": "...", "snapshot_id": "...", "plugin": "...", ...}
|
|
{"type": "Tag", "name": "..."}
|
|
|
|
Plain URLs (also supported):
|
|
https://example.com
|
|
https://foo.com
|
|
"""
|
|
|
|
__package__ = "archivebox.misc"
|
|
|
|
import sys
|
|
import json
|
|
import select
|
|
from typing import Any, TextIO
|
|
from collections.abc import Iterable, Iterator
|
|
from pathlib import Path
|
|
|
|
|
|
# Type constants for JSONL records
|
|
TYPE_SNAPSHOT = "Snapshot"
|
|
TYPE_ARCHIVERESULT = "ArchiveResult"
|
|
TYPE_TAG = "Tag"
|
|
TYPE_CRAWL = "Crawl"
|
|
TYPE_BINARY = "Binary"
|
|
TYPE_PROCESS = "Process"
|
|
TYPE_MACHINE = "Machine"
|
|
|
|
VALID_TYPES = {TYPE_SNAPSHOT, TYPE_ARCHIVERESULT, TYPE_TAG, TYPE_CRAWL, TYPE_BINARY, TYPE_PROCESS, TYPE_MACHINE}
|
|
|
|
|
|
def parse_line(line: str) -> dict[str, Any] | None:
|
|
"""
|
|
Parse a single line of input as either JSONL or plain URL.
|
|
|
|
Returns a dict with at minimum {'type': '...', 'url': '...'} or None if invalid.
|
|
"""
|
|
line = line.strip()
|
|
if not line or line.startswith("#"):
|
|
return None
|
|
|
|
# Try to parse as JSON first
|
|
if line.startswith("{"):
|
|
try:
|
|
record = json.loads(line)
|
|
# If it has a type, validate it
|
|
if "type" in record and record["type"] not in VALID_TYPES:
|
|
# Unknown type, treat as raw data
|
|
pass
|
|
# If it has url but no type, assume Snapshot
|
|
if "url" in record and "type" not in record:
|
|
record["type"] = TYPE_SNAPSHOT
|
|
return record
|
|
except json.JSONDecodeError:
|
|
pass
|
|
|
|
# Treat as plain URL if it looks like one
|
|
if line.startswith("http://") or line.startswith("https://") or line.startswith("file://"):
|
|
return {"type": TYPE_SNAPSHOT, "url": line}
|
|
|
|
# Could be a snapshot ID (UUID with dashes or compact 32-char hex)
|
|
if len(line) == 36 and line.count("-") == 4:
|
|
return {"type": TYPE_SNAPSHOT, "id": line}
|
|
if len(line) == 32:
|
|
try:
|
|
int(line, 16)
|
|
except ValueError:
|
|
pass
|
|
else:
|
|
return {"type": TYPE_SNAPSHOT, "id": line}
|
|
|
|
# Unknown format, skip
|
|
return None
|
|
|
|
|
|
def read_stdin(stream: TextIO | None = None) -> Iterator[dict[str, Any]]:
    """
    Yield parsed records from stdin (or *stream*), one per input line.

    Accepts both typed JSONL and plain URLs; lines that parse_line rejects
    are silently skipped. Returns immediately when the stream is an
    interactive tty, or when a zero-timeout select() reports no pending
    input, so callers never block waiting for data that isn't coming.
    """
    source: TextIO = stream if stream is not None else sys.stdin

    # An interactive terminal with nothing piped in: bail out instead of blocking.
    if source.isatty():
        return

    # Zero-timeout readiness poll. Streams without a usable file descriptor
    # (e.g. StringIO) raise ValueError/OSError here — treat those as ready.
    try:
        readable, _, _ = select.select([source], [], [], 0)
    except (OSError, ValueError):
        readable = [source]

    if not readable:
        return

    for raw_line in source:
        parsed = parse_line(raw_line)
        if parsed:
            yield parsed
|
|
|
|
|
|
def read_file(path: Path) -> Iterator[dict[str, Any]]:
    """
    Yield parsed records (typed JSONL or plain URLs) from the file at *path*.

    Lines that parse_line cannot interpret are skipped.
    """
    with open(path) as handle:
        for raw_line in handle:
            parsed = parse_line(raw_line)
            if parsed:
                yield parsed
|
|
|
|
|
|
def read_args_or_stdin(args: Iterable[str], stream: TextIO | None = None) -> Iterator[dict[str, Any]]:
    """
    Yield records from CLI arguments when given, otherwise from stdin.

    Each argument may be a path to a file of records, a typed JSONL object,
    or a plain URL/ID; unparseable arguments are skipped. With no args at
    all, input is read from stdin (see read_stdin).
    """
    if not args:
        yield from read_stdin(stream)
        return

    for token in args:
        # An argument naming an existing file is expanded to its records.
        candidate = Path(token)
        if candidate.exists() and candidate.is_file():
            yield from read_file(candidate)
            continue
        parsed = parse_line(token)
        if parsed:
            yield parsed
|
|
|
|
|
|
def write_record(record: dict[str, Any], stream: TextIO | None = None) -> None:
|
|
"""
|
|
Write a single JSONL record to stdout (or provided stream).
|
|
"""
|
|
active_stream: TextIO = sys.stdout if stream is None else stream
|
|
active_stream.write(json.dumps(record) + "\n")
|
|
active_stream.flush()
|
|
|
|
|
|
def write_records(records: Iterator[dict[str, Any]], stream: TextIO | None = None) -> int:
    """
    Write every record from *records* as JSONL to *stream* (default: stdout).

    Returns the number of records written.
    """
    written = 0
    for entry in records:
        write_record(entry, stream)
        written += 1
    return written
|