wip major changes

This commit is contained in:
Nick Sweeting
2025-12-24 20:09:51 -08:00
parent c1335fed37
commit 1915333b81
450 changed files with 35814 additions and 19015 deletions

View File

@@ -54,7 +54,7 @@ def check_data_folder() -> None:
def check_migrations():
from archivebox import DATA_DIR
from ..index.sql import list_migrations
from archivebox.misc.db import list_migrations
pending_migrations = [name for status, name in list_migrations() if not status]
is_migrating = any(arg in sys.argv for arg in ['makemigrations', 'migrate', 'init'])
@@ -210,7 +210,7 @@ def check_lib_dir(lib_dir: Path | None = None, throw=False, quiet=False, must_ex
lib_dir = lib_dir or STORAGE_CONFIG.LIB_DIR
assert lib_dir == archivebox.pm.hook.get_LIB_DIR(), "lib_dir is not the same as the one in the flat config"
# assert lib_dir == STORAGE_CONFIG.LIB_DIR, "lib_dir is not the same as the one in the flat config"
if not must_exist and not os.path.isdir(lib_dir):
return True

57
archivebox/misc/db.py Normal file
View File

@@ -0,0 +1,57 @@
"""
Database utility functions for ArchiveBox.
"""
__package__ = 'archivebox.misc'
from io import StringIO
from pathlib import Path
from typing import List, Tuple
from archivebox.config import DATA_DIR
from archivebox.misc.util import enforce_types
@enforce_types
def list_migrations(out_dir: Path = DATA_DIR) -> List[Tuple[bool, str]]:
    """Return every Django migration as an (is_applied, name) tuple.

    Runs ``manage.py showmigrations --list`` and parses its text output,
    where applied migrations render as ``[X] name`` and pending as ``[ ] name``.
    """
    from django.core.management import call_command

    buffer = StringIO()
    call_command("showmigrations", list=True, stdout=buffer)

    results: List[Tuple[bool, str]] = []
    for raw in buffer.getvalue().splitlines():
        raw = raw.strip()
        # only lines containing a ']' are migration entries; skip app headers
        if not raw or ']' not in raw:
            continue
        status, _, name = raw.partition(']')
        results.append(('X' in status, name.strip()))
    return results
@enforce_types
def apply_migrations(out_dir: Path = DATA_DIR) -> List[str]:
    """Apply pending Django migrations on both databases.

    Migrates the 'default' DB fully and the huey_monitor app on the 'queue'
    DB, returning all non-empty output lines from both runs.
    """
    from django.core.management import call_command

    default_out, queue_out = StringIO(), StringIO()
    call_command("migrate", interactive=False, database='default', stdout=default_out)
    call_command("migrate", "huey_monitor", interactive=False, database='queue', stdout=queue_out)

    combined = default_out.getvalue().splitlines() + queue_out.getvalue().splitlines()
    return [line.strip() for line in combined if line.strip()]
@enforce_types
def get_admins(out_dir: Path = DATA_DIR) -> List:
    """Get list of superuser accounts"""
    # NOTE(review): actually returns a lazy QuerySet, not a list, despite the
    # annotation.  The reserved internal 'system' account is excluded.
    from django.contrib.auth.models import User
    return User.objects.filter(is_superuser=True).exclude(username='system')

215
archivebox/misc/folders.py Normal file
View File

@@ -0,0 +1,215 @@
"""
Folder status and integrity checking utilities for ArchiveBox.
"""
__package__ = 'archivebox.misc'
import os
import json
import shutil
from pathlib import Path
from itertools import chain
from typing import Dict, Optional, List, Tuple, TYPE_CHECKING
from django.db.models import QuerySet
from archivebox.config import DATA_DIR, CONSTANTS
from archivebox.misc.util import enforce_types
if TYPE_CHECKING:
from core.models import Snapshot
def _is_valid_snapshot(snapshot: 'Snapshot') -> bool:
    """A snapshot dir is valid iff it exists, contains an index.json, and
    that index's recorded url matches the snapshot's url."""
    output_dir = Path(snapshot.output_dir)
    index_file = output_dir / "index.json"
    if output_dir.exists() and index_file.exists():
        try:
            with open(index_file, 'r') as f:
                index_data = json.load(f)
            return snapshot.url == index_data.get('url')
        except Exception:
            # unreadable/corrupt index.json counts as invalid
            pass
    return False
def _is_corrupt_snapshot(snapshot: 'Snapshot') -> bool:
    """A snapshot dir is corrupt when it is present on disk but fails the
    validity checks (missing/broken index.json or mismatched url)."""
    return Path(snapshot.output_dir).exists() and not _is_valid_snapshot(snapshot)
def get_indexed_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, 'Snapshot']:
    """indexed snapshots without checking archive status or data directory validity"""
    folders: Dict[str, 'Snapshot'] = {}
    # chunked iteration keeps memory bounded for large archives
    for snapshot in snapshots.iterator(chunk_size=500):
        folders[snapshot.output_dir] = snapshot
    return folders
def get_archived_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, 'Snapshot']:
    """indexed snapshots that are archived with a valid data directory"""
    folders: Dict[str, 'Snapshot'] = {}
    for snapshot in snapshots.iterator(chunk_size=500):
        if snapshot.is_archived:
            folders[snapshot.output_dir] = snapshot
    return folders
def get_unarchived_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, 'Snapshot']:
    """indexed snapshots that are unarchived with no data directory or an empty data directory"""
    folders: Dict[str, 'Snapshot'] = {}
    for snapshot in snapshots.iterator(chunk_size=500):
        if not snapshot.is_archived:
            folders[snapshot.output_dir] = snapshot
    return folders
def get_present_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, Optional['Snapshot']]:
    """dirs that actually exist in the archive/ folder

    Maps each on-disk dir name to its matching Snapshot (looked up by
    timestamp), or None when no DB row matches.
    """
    from core.models import Snapshot

    present: Dict[str, Optional['Snapshot']] = {}
    for entry in (out_dir / CONSTANTS.ARCHIVE_DIR_NAME).iterdir():
        if not entry.is_dir():
            continue
        try:
            matched = Snapshot.objects.get(timestamp=entry.name)
        except Snapshot.DoesNotExist:
            matched = None
        present[entry.name] = matched
    return present
def get_valid_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, 'Snapshot']:
    """dirs with a valid index matched to the main index and archived content"""
    valid: Dict[str, 'Snapshot'] = {}
    for snapshot in snapshots.iterator(chunk_size=500):
        if _is_valid_snapshot(snapshot):
            valid[snapshot.output_dir] = snapshot
    return valid
def get_invalid_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, Optional['Snapshot']]:
    """dirs that are invalid for any reason: corrupted/duplicate/orphaned/unrecognized"""
    invalid: Dict[str, Optional['Snapshot']] = {}
    # later categories overwrite earlier ones on key collisions, matching the
    # original merge order: duplicate < orphaned < corrupted < unrecognized
    for finder in (get_duplicate_folders, get_orphaned_folders,
                   get_corrupted_folders, get_unrecognized_folders):
        invalid.update(finder(snapshots, out_dir=out_dir))
    return invalid
def get_duplicate_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, Optional['Snapshot']]:
    """dirs that conflict with other directories that have the same URL or timestamp

    Scans both the indexed snapshots and any on-disk archive/ dirs with no
    matching DB row, flagging every path whose timestamp or URL has already
    been seen once before in the scan.
    """
    from core.models import Snapshot as SnapshotModel
    # occurrence counters keyed by url / timestamp
    by_url: Dict[str, int] = {}
    by_timestamp: Dict[str, int] = {}
    duplicate_folders: Dict[str, Optional['Snapshot']] = {}
    # on-disk dirs that are NOT already covered by an indexed snapshot
    data_folders = (
        str(entry)
        for entry in CONSTANTS.ARCHIVE_DIR.iterdir()
        if entry.is_dir() and not snapshots.filter(timestamp=entry.name).exists()
    )
    # iterate indexed snapshots first, then the unindexed on-disk dirs
    for item in chain(snapshots.iterator(chunk_size=500), data_folders):
        snapshot = None
        if isinstance(item, str):
            # unindexed dir: try to resolve it back to a Snapshot by timestamp
            path = item
            timestamp = Path(path).name
            try:
                snapshot = SnapshotModel.objects.get(timestamp=timestamp)
            except SnapshotModel.DoesNotExist:
                pass
        else:
            snapshot = item
            path = snapshot.output_dir
        if snapshot:
            # the 2nd+ sighting of a timestamp or URL marks this path a duplicate;
            # the first sighting is never flagged
            by_timestamp[snapshot.timestamp] = by_timestamp.get(snapshot.timestamp, 0) + 1
            if by_timestamp[snapshot.timestamp] > 1:
                duplicate_folders[path] = snapshot
            by_url[snapshot.url] = by_url.get(snapshot.url, 0) + 1
            if by_url[snapshot.url] > 1:
                duplicate_folders[path] = snapshot
    return duplicate_folders
def get_orphaned_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, Optional['Snapshot']]:
    """dirs that contain a valid index but aren't listed in the main index"""
    orphaned: Dict[str, Optional['Snapshot']] = {}
    for entry in CONSTANTS.ARCHIVE_DIR.iterdir():
        if not entry.is_dir():
            continue
        has_index = (entry / "index.json").exists()
        if has_index and not snapshots.filter(timestamp=entry.name).exists():
            orphaned[str(entry)] = None
    return orphaned
def get_corrupted_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, 'Snapshot']:
    """dirs that exist but have corrupted/invalid index files"""
    return {
        snapshot.output_dir: snapshot
        for snapshot in snapshots.iterator(chunk_size=500)
        if _is_corrupt_snapshot(snapshot)
    }
def get_unrecognized_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, None]:
    """dirs that don't contain recognizable archive data and aren't listed in the main index"""
    unrecognized: Dict[str, None] = {}
    for entry in (Path(out_dir) / CONSTANTS.ARCHIVE_DIR_NAME).iterdir():
        if not entry.is_dir():
            continue
        index_file = entry / "index.json"
        if index_file.exists():
            # has an index.json, but it fails to parse => unrecognized
            try:
                with open(index_file, 'r') as f:
                    json.load(f)
            except Exception:
                unrecognized[str(entry)] = None
        elif not snapshots.filter(timestamp=entry.name).exists():
            # no index.json and no matching DB row => unrecognized
            unrecognized[str(entry)] = None
    return unrecognized
@enforce_types
def fix_invalid_folder_locations(out_dir: Path = DATA_DIR) -> Tuple[List[str], List[str]]:
    """Move folders to their correct timestamp-named locations based on index.json

    Reads each archive/<dir>/index.json, and when the dir's name disagrees
    with the recorded timestamp, moves it to archive/<timestamp>.

    Returns:
        (fixed, cant_fix): destination paths successfully moved, and source
        paths whose correct destination already exists.
    """
    fixed: List[str] = []
    cant_fix: List[str] = []
    for entry in os.scandir(out_dir / CONSTANTS.ARCHIVE_DIR_NAME):
        if not entry.is_dir(follow_symlinks=True):
            continue
        index_path = Path(entry.path) / 'index.json'
        if not index_path.exists():
            continue
        try:
            with open(index_path, 'r') as f:
                timestamp = json.load(f).get('timestamp')
        except Exception:
            # unreadable/corrupt index.json: nothing we can safely do
            continue
        if not timestamp:
            continue
        # coerce to str: a numeric timestamp in the JSON would otherwise crash
        # the Path join below
        timestamp = str(timestamp)
        # compare the final path component directly instead of the old
        # endswith(f'/{timestamp}') check, which never matched on Windows
        # (backslash separators)
        if Path(entry.path).name != timestamp:
            dest = out_dir / CONSTANTS.ARCHIVE_DIR_NAME / timestamp
            if dest.exists():
                cant_fix.append(entry.path)
            else:
                shutil.move(entry.path, str(dest))
                fixed.append(str(dest))
    return fixed, cant_fix

View File

@@ -4,71 +4,65 @@ from functools import lru_cache
from pathlib import Path
from typing import Callable
from datetime import datetime
import blake3 # pip install blake3
@lru_cache(maxsize=1024)
def _cached_file_hashes(filepath: str, size: int, mtime: float) -> tuple[str, str]:
"""Internal function to calculate file hashes with cache key based on path, size and mtime."""
def _cached_file_hash(filepath: str, size: int, mtime: float) -> str:
"""Internal function to calculate file hash with cache key based on path, size and mtime."""
sha256_hash = hashlib.sha256()
blake3_hash = blake3.blake3()
with open(filepath, 'rb') as f:
# Read file once and update both hashes simultaneously
for chunk in iter(lambda: f.read(4096), b''):
sha256_hash.update(chunk)
blake3_hash.update(chunk)
return sha256_hash.hexdigest(), blake3_hash.hexdigest()
return sha256_hash.hexdigest()
@lru_cache(maxsize=10)
def hash_file(file_path: Path, pwd: Path | None = None) -> tuple[str, str]:
"""Calculate SHA256 and BLAKE3 hashes of a file with caching based on path, size and mtime."""
def hash_file(file_path: Path, pwd: Path | None = None) -> str:
"""Calculate SHA256 hash of a file with caching based on path, size and mtime."""
pwd = Path(pwd) if pwd else None
file_path = Path(file_path)
if not file_path.is_absolute():
file_path = pwd / file_path if pwd else file_path.absolute()
abs_path = file_path.resolve()
stat_info = abs_path.stat()
return _cached_file_hashes(
return _cached_file_hash(
str(abs_path),
stat_info.st_size,
stat_info.st_mtime
)
@lru_cache(maxsize=10)
def get_dir_hashes(dir_path: Path, pwd: Path | None = None, filter_func: Callable | None = None, max_depth: int = -1) -> dict[str, tuple[str, str]]:
"""Calculate SHA256 and BLAKE3 hashes for all files and directories recursively."""
def get_dir_hashes(dir_path: Path, pwd: Path | None = None, filter_func: Callable | None = None, max_depth: int = -1) -> dict[str, str]:
"""Calculate SHA256 hashes for all files and directories recursively."""
pwd = Path(pwd) if pwd else None
dir_path = Path(dir_path)
if not dir_path.is_absolute():
dir_path = pwd / dir_path if pwd else dir_path.absolute()
if not dir_path.is_dir():
raise ValueError(f"Not a directory: {dir_path}")
if max_depth < -1:
raise ValueError(f"max_depth must be >= -1, got {max_depth}")
# Get all files recursively
all_files = get_dir_entries(
dir_path, pwd=pwd, recursive=True,
include_files=True, include_dirs=False,
filter_func=filter_func
)
hashes: dict[str, tuple[str, str]] = {}
hashable_summary_sha256 = []
hashable_summary_blake3 = []
hashes: dict[str, str] = {}
hashable_summary = []
# Calculate hashes for all files
for subfile in all_files:
subfile_path = dir_path / subfile
sha256_hash, blake3_hash = hash_file(subfile_path)
hashes[subfile] = (sha256_hash, blake3_hash)
hashable_summary_sha256.append(f"{sha256_hash} ./{subfile}")
hashable_summary_blake3.append(f"{blake3_hash} ./{subfile}")
sha256_hash = hash_file(subfile_path)
hashes[subfile] = sha256_hash
hashable_summary.append(f"{sha256_hash} ./{subfile}")
# Calculate hashes for all directories
subdirs = get_dir_entries(
dir_path, pwd=pwd, recursive=True,
@@ -76,7 +70,7 @@ def get_dir_hashes(dir_path: Path, pwd: Path | None = None, filter_func: Callabl
include_hidden=False, filter_func=filter_func,
max_depth=max_depth
)
for subdir in subdirs:
subdir_path = dir_path / subdir
subdir_hashes = get_dir_hashes(
@@ -84,36 +78,34 @@ def get_dir_hashes(dir_path: Path, pwd: Path | None = None, filter_func: Callabl
max_depth=0
)
hashes[subdir] = subdir_hashes['.']
# Filter results by max_depth
if max_depth >= 0:
hashes = {
path: value for path, value in hashes.items()
if len(Path(path).parts) <= max_depth + 1
}
# Calculate root directory hashes
hashable_summary_sha256.sort()
hashable_summary_blake3.sort()
root_sha256 = hashlib.sha256('\n'.join(hashable_summary_sha256).encode()).hexdigest()
root_blake3 = blake3.blake3('\n'.join(hashable_summary_blake3).encode()).hexdigest()
hashes['.'] = (root_sha256, root_blake3)
# Calculate root directory hash
hashable_summary.sort()
root_sha256 = hashlib.sha256('\n'.join(hashable_summary).encode()).hexdigest()
hashes['.'] = root_sha256
return hashes
@lru_cache(maxsize=128)
def get_dir_entries(dir_path: Path, pwd: Path | None = None, recursive: bool = True,
include_files: bool = True, include_dirs: bool = True, include_hidden: bool = False,
def get_dir_entries(dir_path: Path, pwd: Path | None = None, recursive: bool = True,
include_files: bool = True, include_dirs: bool = True, include_hidden: bool = False,
filter_func: Callable | None = None, max_depth: int = -1) -> tuple[str, ...]:
"""Get filtered list of directory entries."""
pwd = Path(pwd) if pwd else None
dir_path = Path(dir_path)
if not dir_path.is_absolute():
dir_path = pwd / dir_path if pwd else dir_path.absolute()
results = []
def process_path(path: Path, depth: int):
if not include_hidden and path.name.startswith('.'):
return False
@@ -127,18 +119,18 @@ def get_dir_entries(dir_path: Path, pwd: Path | None = None, recursive: bool = T
if not filter_func(info):
return False
return True
for path in dir_path.rglob('*') if recursive else dir_path.glob('*'):
current_depth = len(path.relative_to(dir_path).parts)
if path.is_file() and include_files and process_path(path, current_depth):
results.append(str(path.relative_to(dir_path)))
elif path.is_dir() and include_dirs and process_path(path, current_depth):
results.append(str(path.relative_to(dir_path)))
if not recursive:
break
return tuple(sorted(results)) # Make immutable for caching
@lru_cache(maxsize=1024)
@@ -147,7 +139,7 @@ def get_dir_sizes(dir_path: Path, pwd: Path | None = None, **kwargs) -> dict[str
sizes: dict[str, int] = {}
hashes = get_dir_hashes(dir_path, pwd=pwd, **kwargs)
dir_path = Path(dir_path)
for path_key in hashes:
full_path = dir_path / path_key
if full_path.is_file():
@@ -158,25 +150,25 @@ def get_dir_sizes(dir_path: Path, pwd: Path | None = None, **kwargs) -> dict[str
if file_path.is_file() and not file_path.name.startswith('.'):
total += file_path.stat().st_size
sizes[path_key + '/'] = total
return sizes
@lru_cache(maxsize=10)
def get_dir_info(dir_path: Path, pwd: Path | None = None, filter_func: Callable | None = None, max_depth: int = -1) -> dict:
"""Get detailed information about directory contents including both hash types and sizes."""
"""Get detailed information about directory contents including hashes and sizes."""
pwd = Path(pwd) if pwd else None
dir_path = Path(dir_path)
if not dir_path.is_absolute():
dir_path = pwd / dir_path if pwd else dir_path.absolute()
hashes = get_dir_hashes(dir_path, pwd=pwd, filter_func=filter_func, max_depth=max_depth)
sizes = get_dir_sizes(str(dir_path), pwd=pwd, filter_func=filter_func, max_depth=max_depth)
num_total_subpaths = sum(1 for name in hashes if name != '.')
details = {}
for filename, (sha256_hash, blake3_hash) in sorted(hashes.items()):
for filename, sha256_hash in sorted(hashes.items()):
abs_path = (dir_path / filename).resolve()
stat_info = abs_path.stat()
num_subpaths = sum(1 for p in hashes if p.startswith(filename + '/'))
@@ -197,7 +189,7 @@ def get_dir_info(dir_path: Path, pwd: Path | None = None, filter_func: Callable
extension = abs_path.suffix
basename = abs_path.name.rsplit(extension, 1)[0]
num_bytes = sizes[filename]
details[filename] = {
'basename': basename,
'mime_type': mime_type,
@@ -205,14 +197,13 @@ def get_dir_info(dir_path: Path, pwd: Path | None = None, filter_func: Callable
'num_subpaths': num_subpaths,
'num_bytes': num_bytes,
'hash_sha256': sha256_hash,
'hash_blake3': blake3_hash,
'created_at': datetime.fromtimestamp(stat_info.st_ctime).isoformat(),
'modified_at': datetime.fromtimestamp(stat_info.st_mtime).isoformat(),
}
if filter_func and not filter_func(details[filename]):
del details[filename]
return details
@@ -221,7 +212,7 @@ if __name__ == '__main__':
dir_info = get_dir_info(Path('.'), max_depth=6)
with open('.hashes.json', 'w') as f:
json.dump(dir_info, f, indent=4)
print('Wrote .hashes.json')
print('Wrote .hashes.json')
# Example output:
# {
@@ -232,7 +223,6 @@ if __name__ == '__main__':
# "num_subpaths": 25,
# "num_bytes": 214677,
# "hash_sha256": "addfacf88b2ff6b564846415fb7b21dcb7e63ee4e911bc0aec255ee354958530",
# "hash_blake3": "3403a1f876453c7749f17ee3502769eff05cff20b5d6c2f2cf458e6353a380db",
# "created_at": "2024-12-04T00:08:38.537449",
# "modified_at": "2024-12-04T00:08:38.537449"
# },
@@ -243,31 +233,8 @@ if __name__ == '__main__':
# "num_subpaths": null,
# "num_bytes": 32,
# "hash_sha256": "b0e5e7ff17db3b60535cf664282787767c336e3e203a43e21b6326c6fe457551",
# "hash_blake3": "4a801eb2a4cdde8d3422be1e2074b78574a5890afb3027cbe6f3b3cf4d113fd1",
# "created_at": "2024-10-08T00:51:41.001359",
# "modified_at": "2024-10-08T00:51:41.001359"
# },
# "__pycache__/": {
# "basename": "__pycache__",
# "mime_type": "inode/directory",
# "extension": "",
# "num_subpaths": 8,
# "num_bytes": 107593,
# "hash_sha256": "9e917a438be774ffc7ea9125de71008c29a7d9003b6f5e09e2085aa1ef3157b3",
# "hash_blake3": "e87184485bd67bd9b723a9ee4d472e8c1d24a4388d373046a27e5a1e10467a06",
# "created_at": "2024-12-04T00:00:16.149390",
# "modified_at": "2024-12-04T00:00:16.149390"
# },
# "__pycache__/__init__.cpython-313.pyc": {
# "basename": "__init__.cpython-313",
# "mime_type": "application/x-python-code",
# "extension": ".pyc",
# "num_subpaths": null,
# "num_bytes": 223,
# "hash_sha256": "d29e3ee5e6b9b564422d9ef2c7325d28cf759b9fb868f59551ba43cd991d51be",
# "hash_blake3": "279a6dc4c8161d6ddb18fa72c882f375324ed152dc6c7c7eac9ef5fdd066f2fd",
# "created_at": "2024-12-03T03:13:43.257430",
# "modified_at": "2024-12-03T03:13:43.257308"
# },
# ...
# }

343
archivebox/misc/jsonl.py Normal file
View File

@@ -0,0 +1,343 @@
"""
JSONL (JSON Lines) utilities for ArchiveBox.
Provides functions for reading, writing, and processing typed JSONL records.
All CLI commands that accept stdin can read both plain URLs and typed JSONL.
Typed JSONL Format:
{"type": "Snapshot", "url": "https://example.com", "title": "...", "tags": "..."}
{"type": "ArchiveResult", "snapshot_id": "...", "extractor": "wget", ...}
{"type": "Tag", "name": "..."}
Plain URLs (also supported):
https://example.com
https://foo.com
"""
__package__ = 'archivebox.misc'
import sys
import json
from typing import Iterator, Dict, Any, Optional, TextIO, Callable, Union, List
from pathlib import Path
# Type constants for JSONL records
TYPE_SNAPSHOT = 'Snapshot'
TYPE_ARCHIVERESULT = 'ArchiveResult'
TYPE_TAG = 'Tag'
TYPE_CRAWL = 'Crawl'
TYPE_SEED = 'Seed'
TYPE_INSTALLEDBINARY = 'InstalledBinary'
VALID_TYPES = {TYPE_SNAPSHOT, TYPE_ARCHIVERESULT, TYPE_TAG, TYPE_CRAWL, TYPE_SEED, TYPE_INSTALLEDBINARY}


def parse_line(line: str) -> Optional[Dict[str, Any]]:
    """
    Parse a single line of input as either JSONL or plain URL.
    Returns a dict with at minimum {'type': '...', 'url': '...'} or None if invalid.

    Accepted formats:
      - a JSON object (typed JSONL record; records with a 'url' but no
        'type' are assumed to be Snapshots)
      - a plain http://, https://, or file:// URL
      - a bare snapshot UUID
    Blank lines and '#' comments yield None.
    """
    import uuid

    line = line.strip()
    if not line or line.startswith('#'):
        return None

    # Try to parse as JSON first
    if line.startswith('{'):
        try:
            record = json.loads(line)
            # If it has url but no type, assume Snapshot
            if 'url' in record and 'type' not in record:
                record['type'] = TYPE_SNAPSHOT
            return record
        except json.JSONDecodeError:
            pass

    # Treat as plain URL if it looks like one
    if line.startswith(('http://', 'https://', 'file://')):
        return {'type': TYPE_SNAPSHOT, 'url': line}

    # Could be a snapshot ID: must be a real hyphenated UUID, not merely any
    # 36-char/4-dash string (the old heuristic accepted arbitrary garbage)
    if len(line) == 36 and line.count('-') == 4:
        try:
            uuid.UUID(line)
        except ValueError:
            return None
        return {'type': TYPE_SNAPSHOT, 'id': line}

    # Unknown format, skip
    return None
def read_stdin(stream: Optional[TextIO] = None) -> Iterator[Dict[str, Any]]:
    """
    Yield parsed records from stdin (or the given stream).

    Accepts both JSONL and plain one-URL-per-line input. Yields nothing when
    the stream is an interactive tty, to avoid blocking on absent input.
    """
    source = stream or sys.stdin
    if source.isatty():
        return
    for raw_line in source:
        parsed = parse_line(raw_line)
        if parsed:
            yield parsed
def read_file(path: Path) -> Iterator[Dict[str, Any]]:
    """Yield parsed records (JSONL or plain URLs) from a text file, one per line."""
    with open(path, 'r') as handle:
        for raw_line in handle:
            parsed = parse_line(raw_line)
            if parsed:
                yield parsed
def read_args_or_stdin(args: tuple, stream: Optional[TextIO] = None) -> Iterator[Dict[str, Any]]:
    """
    Yield records from CLI args when given, otherwise fall back to stdin.

    Each arg may be a path to a file of records, or a single URL/JSONL record.
    """
    if not args:
        yield from read_stdin(stream)
        return
    for arg in args:
        candidate = Path(arg)
        # Path.is_file() already implies existence
        if candidate.is_file():
            yield from read_file(candidate)
        else:
            parsed = parse_line(arg)
            if parsed:
                yield parsed
def write_record(record: Dict[str, Any], stream: Optional[TextIO] = None) -> None:
    """Serialize one record as a JSON line to stdout (or the given stream) and flush."""
    out = stream if stream else sys.stdout
    serialized = json.dumps(record)
    out.write(f'{serialized}\n')
    out.flush()
def write_records(records: Iterator[Dict[str, Any]], stream: Optional[TextIO] = None) -> int:
    """Write every record as a JSON line; returns how many were written."""
    written = 0
    for written, rec in enumerate(records, start=1):
        write_record(rec, stream)
    return written
def filter_by_type(records: Iterator[Dict[str, Any]], record_type: str) -> Iterator[Dict[str, Any]]:
    """Yield only the records whose 'type' field equals record_type."""
    return (record for record in records if record.get('type') == record_type)
def snapshot_to_jsonl(snapshot) -> Dict[str, Any]:
    """Serialize a Snapshot model instance into a typed JSONL record dict.

    Datetime fields become ISO-8601 strings (or None); optional model
    attributes (tags_str, depth, status) degrade gracefully when absent.
    """
    record: Dict[str, Any] = {'type': TYPE_SNAPSHOT}
    record['id'] = str(snapshot.id)
    record['url'] = snapshot.url
    record['title'] = snapshot.title
    record['tags'] = snapshot.tags_str() if hasattr(snapshot, 'tags_str') else ''
    record['bookmarked_at'] = snapshot.bookmarked_at.isoformat() if snapshot.bookmarked_at else None
    record['created_at'] = snapshot.created_at.isoformat() if snapshot.created_at else None
    record['timestamp'] = snapshot.timestamp
    record['depth'] = getattr(snapshot, 'depth', 0)
    record['status'] = snapshot.status if hasattr(snapshot, 'status') else None
    return record
def archiveresult_to_jsonl(result) -> Dict[str, Any]:
    """Serialize an ArchiveResult model instance into a typed JSONL record dict.

    Timestamps become ISO-8601 strings (or None when unset).
    """
    record: Dict[str, Any] = {'type': TYPE_ARCHIVERESULT}
    record['id'] = str(result.id)
    record['snapshot_id'] = str(result.snapshot_id)
    record['extractor'] = result.extractor
    record['status'] = result.status
    record['output'] = result.output
    record['start_ts'] = result.start_ts.isoformat() if result.start_ts else None
    record['end_ts'] = result.end_ts.isoformat() if result.end_ts else None
    return record
def tag_to_jsonl(tag) -> Dict[str, Any]:
    """Serialize a Tag model instance into a typed JSONL record dict."""
    record: Dict[str, Any] = {'type': TYPE_TAG}
    record['id'] = str(tag.id)
    record['name'] = tag.name
    record['slug'] = tag.slug
    return record
def crawl_to_jsonl(crawl) -> Dict[str, Any]:
    """Serialize a Crawl model instance into a typed JSONL record dict."""
    record: Dict[str, Any] = {'type': TYPE_CRAWL}
    record['id'] = str(crawl.id)
    record['seed_id'] = str(crawl.seed_id)
    record['status'] = crawl.status
    record['max_depth'] = crawl.max_depth
    record['created_at'] = crawl.created_at.isoformat() if crawl.created_at else None
    return record
def process_records(
    records: Iterator[Dict[str, Any]],
    handlers: Dict[str, Callable[[Dict[str, Any]], Optional[Dict[str, Any]]]]
) -> Iterator[Dict[str, Any]]:
    """
    Route each record to the handler registered for its 'type'.

    Records with no registered handler are dropped, and handlers may return
    None (or any falsy value) to drop a record. Yields the handler outputs.
    """
    for rec in records:
        handle = handlers.get(rec.get('type'))
        if handle is None:
            continue
        output = handle(rec)
        if output:
            yield output
def get_or_create_snapshot(record: Dict[str, Any], created_by_id: Optional[int] = None):
    """
    Get or create a Snapshot from a JSONL record.
    Returns the Snapshot instance.

    Required record field: 'url' (raises ValueError if missing/empty).
    Optional fields: 'title', 'tags', 'bookmarked_at' (ISO string or datetime),
    'depth', 'crawl_id'.
    """
    from core.models import Snapshot
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.misc.util import parse_date
    # Fall back to the internal 'system' user when no creator is given
    created_by_id = created_by_id or get_or_create_system_user_pk()
    # Extract fields from record
    url = record.get('url')
    if not url:
        raise ValueError("Record missing required 'url' field")
    title = record.get('title')
    tags_str = record.get('tags', '')
    bookmarked_at = record.get('bookmarked_at')
    depth = record.get('depth', 0)
    crawl_id = record.get('crawl_id')
    # Parse bookmarked_at if string (datetime values pass through unchanged)
    if bookmarked_at and isinstance(bookmarked_at, str):
        bookmarked_at = parse_date(bookmarked_at)
    # Use the manager's create_or_update_from_dict method for the core fields
    snapshot = Snapshot.objects.create_or_update_from_dict(
        {'url': url, 'title': title, 'tags': tags_str},
        created_by_id=created_by_id
    )
    # Update the remaining fields only when a truthy value was provided AND it
    # actually differs from what's stored, to avoid needless writes
    update_fields = []
    if depth and snapshot.depth != depth:
        snapshot.depth = depth
        update_fields.append('depth')
    if bookmarked_at and snapshot.bookmarked_at != bookmarked_at:
        snapshot.bookmarked_at = bookmarked_at
        update_fields.append('bookmarked_at')
    if crawl_id and str(snapshot.crawl_id) != str(crawl_id):
        # str() comparison because crawl_id may arrive as a UUID or a string
        snapshot.crawl_id = crawl_id
        update_fields.append('crawl_id')
    if update_fields:
        # 'modified_at' is saved too — presumably to bump the change timestamp;
        # TODO(review): confirm the model auto-populates it on save
        snapshot.save(update_fields=update_fields + ['modified_at'])
    return snapshot
def get_or_create_tag(record: Dict[str, Any]):
    """
    Look up (or create) a Tag by the record's 'name' field.
    Returns the Tag instance; raises ValueError when 'name' is missing/empty.
    """
    from core.models import Tag

    tag_name = record.get('name')
    if not tag_name:
        raise ValueError("Record missing required 'name' field")
    tag, _created = Tag.objects.get_or_create(name=tag_name)
    return tag
def process_jsonl_records(records: Iterator[Dict[str, Any]], created_by_id: Optional[int] = None) -> Dict[str, List]:
    """
    Create Tags and Snapshots from a stream of JSONL records.

    Tag records create/fetch Tags; Snapshot records (or any record carrying a
    'url') create/fetch Snapshots. Records missing required fields are
    silently skipped. Untyped records default to Snapshot.

    Returns {'tags': [...], 'snapshots': [...]} of created/fetched objects.
    """
    from archivebox.base_models.models import get_or_create_system_user_pk

    created_by_id = created_by_id or get_or_create_system_user_pk()
    results: Dict[str, List] = {'tags': [], 'snapshots': []}
    for rec in records:
        rec_type = rec.get('type', TYPE_SNAPSHOT)
        if rec_type == TYPE_TAG:
            try:
                results['tags'].append(get_or_create_tag(rec))
            except ValueError:
                continue
        elif rec_type == TYPE_SNAPSHOT or 'url' in rec:
            try:
                results['snapshots'].append(get_or_create_snapshot(rec, created_by_id=created_by_id))
            except ValueError:
                continue
    return results

90
archivebox/misc/legacy.py Normal file
View File

@@ -0,0 +1,90 @@
"""
Legacy archive import utilities.
These functions are used to import data from old ArchiveBox archive formats
(JSON indexes, archive directory structures) into the new database.
This is separate from the hooks-based parser system which handles importing
new URLs from bookmark files, RSS feeds, etc.
"""
__package__ = 'archivebox.misc'
import os
import json
from pathlib import Path
from datetime import datetime, timezone
from typing import Iterator, TypedDict, List
class SnapshotDict(TypedDict, total=False):
    """
    Dictionary type representing a snapshot/link, compatible with Snapshot model fields.

    All keys are optional at the type level (total=False); 'url' is the only
    field callers are expected to always provide.
    """
    url: str  # Required: the URL to archive
    timestamp: str  # Optional: unix timestamp string
    title: str  # Optional: page title
    tags: str  # Optional: comma-separated tags string
    sources: List[str]  # Optional: list of source file paths
def parse_json_main_index(out_dir: Path) -> Iterator[SnapshotDict]:
    """
    Parse links from the main JSON index file (archive/index.json).
    This is used to recover links from old archive formats.
    Yields one SnapshotDict per link; a missing or corrupt index yields nothing.
    """
    from archivebox.config import CONSTANTS

    index_path = out_dir / CONSTANTS.JSON_INDEX_FILENAME
    if not index_path.exists():
        return
    try:
        with open(index_path, 'r', encoding='utf-8') as f:
            index = json.load(f)
        for link in index.get('links', []):
            yield {
                'url': link.get('url', ''),
                # fall back to "now" when the legacy entry has no timestamp
                'timestamp': link.get('timestamp', str(datetime.now(timezone.utc).timestamp())),
                'title': link.get('title'),
                'tags': link.get('tags', ''),
            }
    except (json.JSONDecodeError, KeyError, TypeError):
        return
def parse_json_links_details(out_dir: Path) -> Iterator[SnapshotDict]:
    """
    Parse links from individual snapshot index.json files in archive directories.
    Walks archive/*/index.json to discover orphaned snapshots; unreadable
    index files are skipped.
    """
    from archivebox.config import CONSTANTS

    archive_dir = out_dir / CONSTANTS.ARCHIVE_DIR_NAME
    if not archive_dir.exists():
        return
    for entry in os.scandir(archive_dir):
        if not entry.is_dir():
            continue
        index_file = Path(entry.path) / 'index.json'
        if not index_file.exists():
            continue
        try:
            with open(index_file, 'r', encoding='utf-8') as f:
                link = json.load(f)
        except (json.JSONDecodeError, KeyError, TypeError):
            continue
        yield {
            'url': link.get('url', ''),
            # the dir name is the timestamp when the index lacks one
            'timestamp': link.get('timestamp', entry.name),
            'title': link.get('title'),
            'tags': link.get('tags', ''),
        }

View File

@@ -1,7 +1,7 @@
__package__ = 'archivebox.misc'
# TODO: merge/dedupe this file with archivebox/logging_util.py
# Low-level logging primitives (Rich console, ANSI colors, stdout/stderr helpers)
# Higher-level logging functions are in logging_util.py
import sys
from typing import Optional, Union, Tuple, List

View File

@@ -1,9 +1,11 @@
__package__ = 'archivebox'
# High-level logging functions for CLI output and progress tracking
# Low-level primitives (Rich console, ANSI colors) are in logging.py
import re
import os
import sys
import stat
import time
from math import log
@@ -15,7 +17,7 @@ from dataclasses import dataclass
from typing import Any, Optional, List, Dict, Union, Iterable, IO, TYPE_CHECKING
if TYPE_CHECKING:
from ..index.schema import Link, ArchiveResult
from core.models import Snapshot
from rich import print
from rich.panel import Panel
@@ -48,77 +50,6 @@ class RuntimeStats:
_LAST_RUN_STATS = RuntimeStats()
def debug_dict_summary(obj: Dict[Any, Any]) -> None:
stderr(' '.join(f'{key}={str(val).ljust(6)}' for key, val in obj.items()))
def get_fd_info(fd) -> Dict[str, Any]:
NAME = fd.name[1:-1]
FILENO = fd.fileno()
MODE = os.fstat(FILENO).st_mode
IS_TTY = hasattr(fd, 'isatty') and fd.isatty()
IS_PIPE = stat.S_ISFIFO(MODE)
IS_FILE = stat.S_ISREG(MODE)
IS_TERMINAL = not (IS_PIPE or IS_FILE)
IS_LINE_BUFFERED = fd.line_buffering
IS_READABLE = fd.readable()
return {
'NAME': NAME, 'FILENO': FILENO, 'MODE': MODE,
'IS_TTY': IS_TTY, 'IS_PIPE': IS_PIPE, 'IS_FILE': IS_FILE,
'IS_TERMINAL': IS_TERMINAL, 'IS_LINE_BUFFERED': IS_LINE_BUFFERED,
'IS_READABLE': IS_READABLE,
}
# # Log debug information about stdin, stdout, and stderr
# sys.stdout.write('[>&1] this is python stdout\n')
# sys.stderr.write('[>&2] this is python stderr\n')
# debug_dict_summary(get_fd_info(sys.stdin))
# debug_dict_summary(get_fd_info(sys.stdout))
# debug_dict_summary(get_fd_info(sys.stderr))
def reject_stdin(caller: str, stdin: Optional[IO]=sys.stdin) -> None:
    """Warn the user when they pipe stdin into a command that ignores it."""
    if not stdin:
        return None

    # When TTY is disabled in docker we can't tell if stdin is piped or not;
    # reading un-piped stdin would hang forever, so skip the check entirely.
    if os.environ.get('IN_DOCKER') in ('1', 'true', 'True', 'TRUE', 'yes'):
        return None

    if stdin.isatty():
        # interactive terminal, nothing was piped in
        return None

    # stderr('READING STDIN TO REJECT...')
    piped_text = stdin.read()
    if piped_text.strip():
        # stderr('GOT STDIN!', len(stdin_str))
        stderr(f'[!] The "{caller}" command does not accept stdin (ignoring).', color='red')
        stderr(f'    Run archivebox "{caller} --help" to see usage and examples.')
        stderr()
        # raise SystemExit(1)
    return None
def accept_stdin(stdin: Optional[IO]=sys.stdin) -> Optional[str]:
    """Read and return any piped standard input, or None if nothing was piped."""
    if stdin and not stdin.isatty():
        # stderr('READING STDIN TO ACCEPT...')
        piped_text = stdin.read()
        if piped_text:
            # stderr('GOT STDIN...', len(stdin_str))
            return piped_text
    return None
class TimedProgress:
"""Show a progress bar and measure elapsed time until .end() is called"""
@@ -353,7 +284,7 @@ def log_archiving_finished(num_links: int):
print(' archivebox server 0.0.0.0:8000')
def log_link_archiving_started(link: "Link", link_dir: str, is_new: bool):
def log_snapshot_archiving_started(snapshot: "Snapshot", out_dir: str, is_new: bool):
# [*] [2019-03-22 13:46:45] "Log Structured Merge Trees - ben stopford"
# http://www.benstopford.com/2015/02/14/log-structured-merge-trees/
@@ -363,15 +294,15 @@ def log_link_archiving_started(link: "Link", link_dir: str, is_new: bool):
symbol_color='green' if is_new else 'bright_black',
symbol='+' if is_new else '',
now=datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S'),
title=link.title or link.base_url,
title=snapshot.title or snapshot.base_url,
))
print(f' [sky_blue1]{link.url}[/]')
print(f' [sky_blue1]{snapshot.url}[/]')
print(' {} {}'.format(
'>' if is_new else '',
pretty_path(link_dir),
pretty_path(out_dir),
))
def log_link_archiving_finished(link: "Link", link_dir: str, is_new: bool, stats: dict, start_ts: datetime):
def log_snapshot_archiving_finished(snapshot: "Snapshot", out_dir: str, is_new: bool, stats: dict, start_ts: datetime):
total = sum(stats.values())
if stats['failed'] > 0 :
@@ -382,7 +313,7 @@ def log_link_archiving_finished(link: "Link", link_dir: str, is_new: bool, stats
_LAST_RUN_STATS.succeeded += 1
try:
size = get_dir_size(link_dir)
size = get_dir_size(out_dir)
except FileNotFoundError:
size = (0, None, '0')
@@ -391,38 +322,38 @@ def log_link_archiving_finished(link: "Link", link_dir: str, is_new: bool, stats
print(' [bright_black]{} files ({}) in {}s [/]'.format(size[2], printable_filesize(size[0]), duration))
def log_archive_method_started(method: str):
    """Announce which archive-method extractor is about to run."""
    print(f'    > {method}')
def log_archive_method_finished(result: "ArchiveResult"):
def log_archive_method_finished(result: dict):
"""
quote the argument with whitespace in a command so the user can
quote the argument with whitespace in a command so the user can
copy-paste the outputted string directly to run the cmd
"""
# Prettify CMD string and make it safe to copy-paste by quoting arguments
quoted_cmd = ' '.join(
'"{}"'.format(arg) if (' ' in arg) or (':' in arg) else arg
for arg in result.cmd
for arg in result['cmd']
)
if result.status == 'failed':
if result.output.__class__.__name__ == 'TimeoutExpired':
duration = (result.end_ts - result.start_ts).seconds
if result['status'] == 'failed':
output = result.get('output')
if output and output.__class__.__name__ == 'TimeoutExpired':
duration = (result['end_ts'] - result['start_ts']).seconds
hint_header = [
f'[yellow3]Extractor timed out after {duration}s.[/]',
]
else:
error_name = result.output.__class__.__name__.replace('ArchiveError', '')
error_name = output.__class__.__name__.replace('ArchiveError', '') if output else 'Error'
hint_header = [
'[yellow3]Extractor failed:[/]',
f' {error_name} [red1]{result.output}[/]',
f' {error_name} [red1]{output}[/]',
]
# import pudb; pudb.set_trace()
# Prettify error output hints string and limit to five lines
hints = getattr(result.output, 'hints', None) or ()
hints = getattr(output, 'hints', None) or () if output else ()
if hints:
if isinstance(hints, (list, tuple, type(_ for _ in ()))):
hints = [hint.decode() if isinstance(hint, bytes) else str(hint) for hint in hints]
@@ -448,7 +379,7 @@ def log_archive_method_finished(result: "ArchiveResult"):
*hints,
'[violet]Run to see full output:[/]',
*docker_hints,
*([' cd {};'.format(result.pwd)] if result.pwd else []),
*([' cd {};'.format(result.get('pwd'))] if result.get('pwd') else []),
' {}'.format(quoted_cmd),
]
print('\n'.join(
@@ -463,21 +394,22 @@ def log_list_started(filter_patterns: Optional[List[str]], filter_type: str):
print(f'[green][*] Finding links in the archive index matching these {filter_type} patterns:[/]')
print(' {}'.format(' '.join(filter_patterns or ())))
def log_list_finished(links):
from archivebox.index.csv import links_to_csv
def log_list_finished(snapshots):
from core.models import Snapshot
print()
print('---------------------------------------------------------------------------------------------------')
print(links_to_csv(links, cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | '))
print(Snapshot.objects.filter(pk__in=[s.pk for s in snapshots]).to_csv(cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | '))
print('---------------------------------------------------------------------------------------------------')
print()
def log_removal_started(links: List["Link"], yes: bool, delete: bool):
print(f'[yellow3][i] Found {len(links)} matching URLs to remove.[/]')
def log_removal_started(snapshots, yes: bool, delete: bool):
count = snapshots.count() if hasattr(snapshots, 'count') else len(snapshots)
print(f'[yellow3][i] Found {count} matching URLs to remove.[/]')
if delete:
file_counts = [link.num_outputs for link in links if os.access(link.link_dir, os.R_OK)]
file_counts = [s.num_outputs for s in snapshots if os.access(s.output_dir, os.R_OK)]
print(
f' {len(links)} Links will be de-listed from the main index, and their archived content folders will be deleted from disk.\n'
f' {count} Links will be de-listed from the main index, and their archived content folders will be deleted from disk.\n'
f' ({len(file_counts)} data folders with {sum(file_counts)} archived files will be deleted!)'
)
else:
@@ -488,7 +420,7 @@ def log_removal_started(links: List["Link"], yes: bool, delete: bool):
if not yes:
print()
print(f'[yellow3][?] Do you want to proceed with removing these {len(links)} links?[/]')
print(f'[yellow3][?] Do you want to proceed with removing these {count} links?[/]')
try:
assert input(' y/[n]: ').lower() == 'y'
except (KeyboardInterrupt, EOFError, AssertionError):
@@ -504,6 +436,13 @@ def log_removal_finished(all_links: int, to_remove: int):
print(f' Index now contains {all_links - to_remove} links.')
### Search Indexing Stage
def log_index_started(url: str):
print('[green][*] Indexing url: {} in the search index[/]'.format(url))
print()
### Helpers
@enforce_types
@@ -542,10 +481,10 @@ def printable_filesize(num_bytes: Union[int, float]) -> str:
@enforce_types
def printable_folders(folders: Dict[str, Optional["Link"]], with_headers: bool=False) -> str:
def printable_folders(folders: Dict[str, Optional["Snapshot"]], with_headers: bool=False) -> str:
return '\n'.join(
f'{folder} {link and link.url} "{link and link.title}"'
for folder, link in folders.items()
f'{folder} {snapshot and snapshot.url} "{snapshot and snapshot.title}"'
for folder, snapshot in folders.items()
)

View File

@@ -13,7 +13,6 @@ import pydantic # noqa
import requests # noqa
import subprocess # noqa
import archivebox # noqa
import abx # noqa
from benedict import benedict # noqa
from django.utils import timezone # noqa
from datetime import datetime, timedelta # noqa
@@ -21,8 +20,9 @@ from django.conf import settings # noqa
from archivebox import CONSTANTS # noqa
from archivebox.cli import * # noqa
from archivebox.config.configset import get_config
CONFIG = archivebox.pm.hook.get_FLAT_CONFIG()
CONFIG = get_config()
if __name__ == '__main__':
# load the rich extension for ipython for pretty printing
@@ -35,7 +35,7 @@ if __name__ == '__main__':
# print the welcome message
prnt('[green]import re, os, sys, psutil, subprocess, reqiests, json, pydantic, benedict, django, abx[/]')
prnt('[green]import re, os, sys, psutil, subprocess, requests, json, pydantic, benedict, django[/]')
prnt('[yellow4]# ArchiveBox Imports[/]')
prnt('[yellow4]import archivebox[/]')
prnt('[yellow4]from archivebox.cli import *[/]')

View File

@@ -345,19 +345,41 @@ class ExtendedEncoder(pyjson.JSONEncoder):
elif isinstance(obj, Exception):
return '{}: {}'.format(obj.__class__.__name__, obj)
elif isinstance(obj, Path):
return str(obj)
elif cls_name in ('dict_items', 'dict_keys', 'dict_values'):
return tuple(obj)
return list(obj)
elif isinstance(obj, Callable):
return str(obj)
# Try dict/list conversion as fallback
try:
return dict(obj)
except Exception:
pass
try:
return list(obj)
except Exception:
pass
try:
return str(obj)
except Exception:
pass
return pyjson.JSONEncoder.default(self, obj)
@enforce_types
def to_json(obj: Any, indent: Optional[int]=4, sort_keys: bool=True) -> str:
"""Serialize object to JSON string with extended type support"""
return pyjson.dumps(obj, indent=indent, sort_keys=sort_keys, cls=ExtendedEncoder)
### URL PARSING TESTS / ASSERTIONS
# Check that plain text regex URL parsing works as expected
@@ -452,3 +474,78 @@ _test_url_strs = {
for url_str, num_urls in _test_url_strs.items():
assert len(list(find_all_urls(url_str))) == num_urls, (
f'{url_str} does not contain {num_urls} urls')
### Chrome Helpers
def chrome_args(**options) -> List[str]:
    """Build the argv list for launching Chrome/Chromium with ArchiveBox's settings.

    Keyword options override the corresponding values loaded from
    archivebox.config (CHROME_BINARY, CHROME_HEADLESS, CHROME_SANDBOX,
    CHECK_SSL_VALIDITY, CHROME_USER_AGENT, RESOLUTION, CHROME_TIMEOUT,
    CHROME_USER_DATA_DIR).

    Returns:
        List[str]: full command line, starting with the chrome binary path.

    Raises:
        Exception: if no CHROME_BINARY is configured/installed.
    """
    # NOTE: removed unused `import shutil` that was dead code in this function
    from archivebox.config import CHECK_SSL_VALIDITY, RESOLUTION, USER_AGENT, CHROME_BINARY

    chrome_binary = options.get('CHROME_BINARY', CHROME_BINARY)
    chrome_headless = options.get('CHROME_HEADLESS', True)
    chrome_sandbox = options.get('CHROME_SANDBOX', True)
    check_ssl = options.get('CHECK_SSL_VALIDITY', CHECK_SSL_VALIDITY)
    user_agent = options.get('CHROME_USER_AGENT', USER_AGENT)
    resolution = options.get('RESOLUTION', RESOLUTION)
    timeout = options.get('CHROME_TIMEOUT', 0)    # seconds (0 disables the flag)
    user_data_dir = options.get('CHROME_USER_DATA_DIR', None)

    if not chrome_binary:
        raise Exception('Could not find any CHROME_BINARY installed on your system')

    cmd_args = [chrome_binary]

    if chrome_headless:
        cmd_args += ("--headless=new",)

    if not chrome_sandbox:
        # running in docker or other sandboxed environment
        cmd_args += (
            "--no-sandbox",
            "--no-zygote",
            "--disable-dev-shm-usage",
            "--disable-software-rasterizer",
            "--run-all-compositor-stages-before-draw",
            "--hide-scrollbars",
            "--autoplay-policy=no-user-gesture-required",
            "--no-first-run",
            "--use-fake-ui-for-media-stream",
            "--use-fake-device-for-media-stream",
            "--disable-sync",
        )

    if not check_ssl:
        cmd_args += ('--disable-web-security', '--ignore-certificate-errors')

    if user_agent:
        cmd_args += (f'--user-agent={user_agent}',)

    if resolution:
        cmd_args += (f'--window-size={resolution}',)

    if timeout:
        # chrome expects milliseconds, config stores seconds
        cmd_args += (f'--timeout={timeout * 1000}',)

    if user_data_dir:
        cmd_args += (f'--user-data-dir={user_data_dir}',)

    return cmd_args
def chrome_cleanup():
    """
    Remove stale runtime files that Chrome leaves behind when it is killed
    by a timeout or other error.
    """
    import os
    from archivebox.config.permissions import IN_DOCKER

    if not IN_DOCKER:
        return

    singleton_lock = "/home/archivebox/.config/chromium/SingletonLock"
    # lexists: the lock is usually a dangling symlink, which os.path.exists misses
    if os.path.lexists(singleton_lock):
        try:
            os.remove(singleton_lock)
        except OSError:
            # best-effort cleanup; ignore races / permission issues
            pass