wip major changes

This commit is contained in:
Nick Sweeting
2025-12-24 20:09:51 -08:00
parent c1335fed37
commit 1915333b81
450 changed files with 35814 additions and 19015 deletions

View File

@@ -37,7 +37,13 @@ class ArchiveBoxGroup(click.Group):
'server': 'archivebox.cli.archivebox_server.main',
'shell': 'archivebox.cli.archivebox_shell.main',
'manage': 'archivebox.cli.archivebox_manage.main',
# Worker/orchestrator commands
'orchestrator': 'archivebox.cli.archivebox_orchestrator.main',
'worker': 'archivebox.cli.archivebox_worker.main',
# Task commands (called by workers as subprocesses)
'crawl': 'archivebox.cli.archivebox_crawl.main',
'snapshot': 'archivebox.cli.archivebox_snapshot.main',
'extract': 'archivebox.cli.archivebox_extract.main',
}
all_subcommands = {
**meta_commands,
@@ -118,11 +124,14 @@ def cli(ctx, help=False):
raise
def main(args=None, prog_name=None):
def main(args=None, prog_name=None, stdin=None):
# show `docker run archivebox xyz` in help messages if running in docker
IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes')
IS_TTY = sys.stdin.isatty()
prog_name = prog_name or (f'docker compose run{"" if IS_TTY else " -T"} archivebox' if IN_DOCKER else 'archivebox')
# stdin param allows passing input data from caller (used by __main__.py)
# currently not used by click-based CLI, but kept for backwards compatibility
try:
cli(args=args, prog_name=prog_name)

View File

@@ -16,214 +16,135 @@ from archivebox.misc.util import enforce_types, docstring
from archivebox import CONSTANTS
from archivebox.config.common import ARCHIVING_CONFIG
from archivebox.config.permissions import USER, HOSTNAME
from archivebox.parsers import PARSERS
if TYPE_CHECKING:
from core.models import Snapshot
ORCHESTRATOR = None
@enforce_types
def add(urls: str | list[str],
depth: int | str=0,
tag: str='',
parser: str="auto",
extract: str="",
plugins: str="",
persona: str='Default',
overwrite: bool=False,
update: bool=not ARCHIVING_CONFIG.ONLY_NEW,
index_only: bool=False,
bg: bool=False,
created_by_id: int | None=None) -> QuerySet['Snapshot']:
"""Add a new URL or list of URLs to your archive"""
"""Add a new URL or list of URLs to your archive.
global ORCHESTRATOR
The new flow is:
1. Save URLs to sources file
2. Create Seed pointing to the file
3. Create Crawl with max_depth
4. Create root Snapshot pointing to file:// URL (depth=0)
5. Orchestrator runs parser extractors on root snapshot
6. Parser extractors output to urls.jsonl
7. URLs are added to Crawl.urls and child Snapshots are created
8. Repeat until max_depth is reached
"""
from rich import print
depth = int(depth)
assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'
# import models once django is set up
from crawls.models import Seed, Crawl
from workers.orchestrator import Orchestrator
from archivebox.base_models.models import get_or_create_system_user_pk
assert depth in (0, 1, 2, 3, 4), 'Depth must be 0-4'
# import models once django is set up
from core.models import Snapshot
from crawls.models import Seed, Crawl
from archivebox.base_models.models import get_or_create_system_user_pk
from workers.orchestrator import Orchestrator
created_by_id = created_by_id or get_or_create_system_user_pk()
# 1. save the provided urls to sources/2024-11-05__23-59-59__cli_add.txt
# 1. Save the provided URLs to sources/2024-11-05__23-59-59__cli_add.txt
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__cli_add.txt'
sources_file.parent.mkdir(parents=True, exist_ok=True)
sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
# 2. create a new Seed pointing to the sources/2024-11-05__23-59-59__cli_add.txt
# 2. Create a new Seed pointing to the sources file
cli_args = [*sys.argv]
if cli_args[0].lower().endswith('archivebox'):
cli_args[0] = 'archivebox' # full path to archivebox bin to just archivebox e.g. /Volumes/NVME/Users/squash/archivebox/.venv/bin/archivebox -> archivebox
cli_args[0] = 'archivebox'
cmd_str = ' '.join(cli_args)
seed = Seed.from_file(sources_file, label=f'{USER}@{HOSTNAME} $ {cmd_str}', parser=parser, tag=tag, created_by=created_by_id, config={
'ONLY_NEW': not update,
'INDEX_ONLY': index_only,
'OVERWRITE': overwrite,
'EXTRACTORS': extract,
'DEFAULT_PERSONA': persona or 'Default',
})
# 3. create a new Crawl pointing to the Seed
crawl = Crawl.from_seed(seed, max_depth=depth)
# 4. start the Orchestrator & wait until it completes
# ... orchestrator will create the root Snapshot, which creates pending ArchiveResults, which gets run by the ArchiveResultActors ...
# from crawls.actors import CrawlActor
# from core.actors import SnapshotActor, ArchiveResultActor
if not bg:
orchestrator = Orchestrator(exit_on_idle=True, max_concurrent_actors=4)
orchestrator.start()
# 5. return the list of new Snapshots created
seed = Seed.from_file(
sources_file,
label=f'{USER}@{HOSTNAME} $ {cmd_str}',
parser=parser,
tag=tag,
created_by=created_by_id,
config={
'ONLY_NEW': not update,
'INDEX_ONLY': index_only,
'OVERWRITE': overwrite,
'EXTRACTORS': plugins,
'DEFAULT_PERSONA': persona or 'Default',
}
)
# 3. Create a new Crawl pointing to the Seed (status=queued)
crawl = Crawl.from_seed(seed, max_depth=depth)
print(f'[green]\\[+] Created Crawl {crawl.id} with max_depth={depth}[/green]')
print(f' [dim]Seed: {seed.uri}[/dim]')
# 4. The CrawlMachine will create the root Snapshot when started
# Root snapshot URL = file:///path/to/sources/...txt
# Parser extractors will run on it and discover URLs
# Those URLs become child Snapshots (depth=1)
if index_only:
# Just create the crawl but don't start processing
print('[yellow]\\[*] Index-only mode - crawl created but not started[/yellow]')
# Create root snapshot manually
crawl.create_root_snapshot()
return crawl.snapshot_set.all()
# 5. Start the orchestrator to process the queue
# The orchestrator will:
# - Process Crawl -> create root Snapshot
# - Process root Snapshot -> run parser extractors -> discover URLs
# - Create child Snapshots from discovered URLs
# - Process child Snapshots -> run extractors
# - Repeat until max_depth reached
if bg:
# Background mode: start orchestrator and return immediately
print('[yellow]\\[*] Running in background mode - starting orchestrator...[/yellow]')
orchestrator = Orchestrator(exit_on_idle=True)
orchestrator.start() # Fork to background
else:
# Foreground mode: run orchestrator until all work is done
print(f'[green]\\[*] Starting orchestrator to process crawl...[/green]')
orchestrator = Orchestrator(exit_on_idle=True)
orchestrator.runloop() # Block until complete
# 6. Return the list of Snapshots in this crawl
return crawl.snapshot_set.all()
@click.command()
@click.option('--depth', '-d', type=click.Choice(('0', '1')), default='0', help='Recursively archive linked pages up to N hops away')
@click.option('--depth', '-d', type=click.Choice([str(i) for i in range(5)]), default='0', help='Recursively archive linked pages up to N hops away')
@click.option('--tag', '-t', default='', help='Comma-separated list of tags to add to each snapshot e.g. tag1,tag2,tag3')
@click.option('--parser', type=click.Choice(['auto', *PARSERS.keys()]), default='auto', help='Parser for reading input URLs')
@click.option('--extract', '-e', default='', help='Comma-separated list of extractors to use e.g. title,favicon,screenshot,singlefile,...')
@click.option('--parser', default='auto', help='Parser for reading input URLs (auto, txt, html, rss, json, jsonl, netscape, ...)')
@click.option('--plugins', '-p', default='', help='Comma-separated list of plugins to run e.g. title,favicon,screenshot,singlefile,...')
@click.option('--persona', default='Default', help='Authentication profile to use when archiving')
@click.option('--overwrite', '-F', is_flag=True, help='Overwrite existing data if URLs have been archived previously')
@click.option('--update', is_flag=True, default=ARCHIVING_CONFIG.ONLY_NEW, help='Retry any previously skipped/failed URLs when re-adding them')
@click.option('--index-only', is_flag=True, help='Just add the URLs to the index without archiving them now')
# @click.option('--update-all', is_flag=True, help='Update ALL links in index when finished adding new ones')
@click.option('--bg', is_flag=True, help='Run crawl in background worker instead of immediately')
@click.option('--bg', is_flag=True, help='Run archiving in background (start orchestrator and return immediately)')
@click.argument('urls', nargs=-1, type=click.Path())
@docstring(add.__doc__)
def main(**kwargs):
"""Add a new URL or list of URLs to your archive"""
add(**kwargs)
if __name__ == '__main__':
main()
# OLD VERSION:
# def add(urls: Union[str, List[str]],
# tag: str='',
# depth: int=0,
# update: bool=not ARCHIVING_CONFIG.ONLY_NEW,
# update_all: bool=False,
# index_only: bool=False,
# overwrite: bool=False,
# # duplicate: bool=False, # TODO: reuse the logic from admin.py resnapshot to allow adding multiple snapshots by appending timestamp automatically
# init: bool=False,
# extractors: str="",
# parser: str="auto",
# created_by_id: int | None=None,
# out_dir: Path=DATA_DIR) -> List[Link]:
# """Add a new URL or list of URLs to your archive"""
# from core.models import Snapshot, Tag
# # from workers.supervisord_util import start_cli_workers, tail_worker_logs
# # from workers.tasks import bg_archive_link
# assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'
# extractors = extractors.split(",") if extractors else []
# if init:
# run_subcommand('init', stdin=None, pwd=out_dir)
# # Load list of links from the existing index
# check_data_folder()
# # worker = start_cli_workers()
# new_links: List[Link] = []
# all_links = load_main_index(out_dir=out_dir)
# log_importing_started(urls=urls, depth=depth, index_only=index_only)
# if isinstance(urls, str):
# # save verbatim stdin to sources
# write_ahead_log = save_text_as_source(urls, filename='{ts}-import.txt', out_dir=out_dir)
# elif isinstance(urls, list):
# # save verbatim args to sources
# write_ahead_log = save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir)
# new_links += parse_links_from_source(write_ahead_log, root_url=None, parser=parser)
# # If we're going one level deeper, download each link and look for more links
# new_links_depth = []
# if new_links and depth == 1:
# log_crawl_started(new_links)
# for new_link in new_links:
# try:
# downloaded_file = save_file_as_source(new_link.url, filename=f'{new_link.timestamp}-crawl-{new_link.domain}.txt', out_dir=out_dir)
# new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)
# except Exception as err:
# stderr('[!] Failed to get contents of URL {new_link.url}', err, color='red')
# imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values())
# new_links = dedupe_links(all_links, imported_links)
# write_main_index(links=new_links, out_dir=out_dir, created_by_id=created_by_id)
# all_links = load_main_index(out_dir=out_dir)
# tags = [
# Tag.objects.get_or_create(name=name.strip(), defaults={'created_by_id': created_by_id})[0]
# for name in tag.split(',')
# if name.strip()
# ]
# if tags:
# for link in imported_links:
# snapshot = Snapshot.objects.get(url=link.url)
# snapshot.tags.add(*tags)
# snapshot.tags_str(nocache=True)
# snapshot.save()
# # print(f' √ Tagged {len(imported_links)} Snapshots with {len(tags)} tags {tags_str}')
# if index_only:
# # mock archive all the links using the fake index_only extractor method in order to update their state
# if overwrite:
# archive_links(imported_links, overwrite=overwrite, methods=['index_only'], out_dir=out_dir, created_by_id=created_by_id)
# else:
# archive_links(new_links, overwrite=False, methods=['index_only'], out_dir=out_dir, created_by_id=created_by_id)
# else:
# # fully run the archive extractor methods for each link
# archive_kwargs = {
# "out_dir": out_dir,
# "created_by_id": created_by_id,
# }
# if extractors:
# archive_kwargs["methods"] = extractors
# stderr()
# ts = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
# if update:
# stderr(f'[*] [{ts}] Archiving + updating {len(imported_links)}/{len(all_links)}', len(imported_links), 'URLs from added set...', color='green')
# archive_links(imported_links, overwrite=overwrite, **archive_kwargs)
# elif update_all:
# stderr(f'[*] [{ts}] Archiving + updating {len(all_links)}/{len(all_links)}', len(all_links), 'URLs from entire library...', color='green')
# archive_links(all_links, overwrite=overwrite, **archive_kwargs)
# elif overwrite:
# stderr(f'[*] [{ts}] Archiving + overwriting {len(imported_links)}/{len(all_links)}', len(imported_links), 'URLs from added set...', color='green')
# archive_links(imported_links, overwrite=True, **archive_kwargs)
# elif new_links:
# stderr(f'[*] [{ts}] Archiving {len(new_links)}/{len(all_links)} URLs from added set...', color='green')
# archive_links(new_links, overwrite=False, **archive_kwargs)
# # tail_worker_logs(worker['stdout_logfile'])
# # if CAN_UPGRADE:
# # hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")
# return new_links

View File

@@ -20,15 +20,15 @@ def config(*keys,
**kwargs) -> None:
"""Get and set your ArchiveBox project configuration values"""
import archivebox
from archivebox.misc.checks import check_data_folder
from archivebox.misc.logging_util import printable_config
from archivebox.config.collection import load_all_config, write_config_file, get_real_name
from archivebox.config.configset import get_flat_config, get_all_configs
check_data_folder()
FLAT_CONFIG = archivebox.pm.hook.get_FLAT_CONFIG()
CONFIGS = archivebox.pm.hook.get_CONFIGS()
FLAT_CONFIG = get_flat_config()
CONFIGS = get_all_configs()
config_options: list[str] = list(kwargs.pop('key=value', []) or keys or [f'{key}={val}' for key, val in kwargs.items()])
no_args = not (get or set or reset or config_options)
@@ -105,7 +105,7 @@ def config(*keys,
if new_config:
before = FLAT_CONFIG
matching_config = write_config_file(new_config)
after = {**load_all_config(), **archivebox.pm.hook.get_FLAT_CONFIG()}
after = {**load_all_config(), **get_flat_config()}
print(printable_config(matching_config))
side_effect_changes = {}

View File

@@ -0,0 +1,302 @@
#!/usr/bin/env python3
"""
archivebox crawl [urls_or_snapshot_ids...] [--depth=N] [--plugin=NAME]
Discover outgoing links from URLs or existing Snapshots.
If a URL is passed, creates a Snapshot for it first, then runs parser plugins.
If a snapshot_id is passed, runs parser plugins on the existing Snapshot.
Outputs discovered outlink URLs as JSONL.
Pipe the output to `archivebox snapshot` to archive the discovered URLs.
Input formats:
- Plain URLs (one per line)
- Snapshot UUIDs (one per line)
- JSONL: {"type": "Snapshot", "url": "...", ...}
- JSONL: {"type": "Snapshot", "id": "...", ...}
Output (JSONL):
{"type": "Snapshot", "url": "https://discovered-url.com", "via_extractor": "...", ...}
Examples:
# Discover links from a page (creates snapshot first)
archivebox crawl https://example.com
# Discover links from an existing snapshot
archivebox crawl 01234567-89ab-cdef-0123-456789abcdef
# Full recursive crawl pipeline
archivebox crawl https://example.com | archivebox snapshot | archivebox extract
# Use only specific parser plugin
archivebox crawl --plugin=parse_html_urls https://example.com
# Chain: create snapshot, then crawl its outlinks
archivebox snapshot https://example.com | archivebox crawl | archivebox snapshot | archivebox extract
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox crawl'
import sys
import json
from pathlib import Path
from typing import Optional
import rich_click as click
from archivebox.misc.util import docstring
def discover_outlinks(
    args: tuple,
    depth: int = 1,
    plugin: str = '',
    wait: bool = True,
) -> int:
    """
    Discover outgoing links from URLs or existing Snapshots.

    Accepts URLs or snapshot_ids. For URLs, creates Snapshots first.
    Runs parser plugins, outputs discovered URLs as JSONL.
    The output can be piped to `archivebox snapshot` to archive the discovered links.

    Args:
        args: positional inputs (URLs, Snapshot UUIDs, or JSONL lines); if empty,
              input is read from stdin via read_args_or_stdin.
              NOTE(review): this re-reads stdin — if the caller already consumed
              stdin, records will be empty here. Confirm against main().
        depth: max_depth recorded on the Crawl created for any new URLs.
        plugin: if non-empty, only this parser plugin gets an ArchiveResult;
                otherwise all enabled plugins are queued.
        wait: when True, block on an Orchestrator runloop until plugins finish.

    Exit codes:
        0: Success
        1: Failure (no input, or no snapshots could be created/found)
    """
    from rich import print as rprint
    from django.utils import timezone
    from archivebox.misc.jsonl import (
        read_args_or_stdin, write_record,
        TYPE_SNAPSHOT, get_or_create_snapshot
    )
    from archivebox.base_models.models import get_or_create_system_user_pk
    from core.models import Snapshot, ArchiveResult
    from crawls.models import Seed, Crawl
    from archivebox.config import CONSTANTS
    from workers.orchestrator import Orchestrator

    created_by_id = get_or_create_system_user_pk()
    # TTY -> human-readable output on stderr; piped -> machine JSONL on stdout
    is_tty = sys.stdout.isatty()

    # Collect all input records
    records = list(read_args_or_stdin(args))
    if not records:
        rprint('[yellow]No URLs or snapshot IDs provided. Pass as arguments or via stdin.[/yellow]', file=sys.stderr)
        return 1

    # Separate records into existing snapshots vs new URLs
    existing_snapshot_ids = []
    new_url_records = []
    for record in records:
        # Check if it's an existing snapshot (has id but no url, or looks like a UUID)
        if record.get('id') and not record.get('url'):
            existing_snapshot_ids.append(record['id'])
        elif record.get('id'):
            # Has both id and url - check if snapshot exists
            try:
                Snapshot.objects.get(id=record['id'])
                existing_snapshot_ids.append(record['id'])
            except Snapshot.DoesNotExist:
                # id is stale/unknown: treat it as a new URL to snapshot
                new_url_records.append(record)
        elif record.get('url'):
            new_url_records.append(record)

    # For new URLs, create a Crawl and Snapshots
    snapshot_ids = list(existing_snapshot_ids)
    if new_url_records:
        # Create a Crawl to manage this operation; the Seed points at a
        # sources file written with one URL per line.
        sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__crawl.txt'
        sources_file.parent.mkdir(parents=True, exist_ok=True)
        sources_file.write_text('\n'.join(r.get('url', '') for r in new_url_records if r.get('url')))
        seed = Seed.from_file(
            sources_file,
            label=f'crawl --depth={depth}',
            created_by=created_by_id,
        )
        crawl = Crawl.from_seed(seed, max_depth=depth)
        # Create snapshots for new URLs, tagging each record with the crawl id
        for record in new_url_records:
            try:
                record['crawl_id'] = str(crawl.id)
                record['depth'] = record.get('depth', 0)
                snapshot = get_or_create_snapshot(record, created_by_id=created_by_id)
                snapshot_ids.append(str(snapshot.id))
            except Exception as e:
                # best-effort: one bad record should not abort the whole batch
                rprint(f'[red]Error creating snapshot: {e}[/red]', file=sys.stderr)
                continue

    if not snapshot_ids:
        rprint('[red]No snapshots to process[/red]', file=sys.stderr)
        return 1

    if existing_snapshot_ids:
        rprint(f'[blue]Using {len(existing_snapshot_ids)} existing snapshots[/blue]', file=sys.stderr)
    if new_url_records:
        rprint(f'[blue]Created {len(snapshot_ids) - len(existing_snapshot_ids)} new snapshots[/blue]', file=sys.stderr)
    rprint(f'[blue]Running parser plugins on {len(snapshot_ids)} snapshots...[/blue]', file=sys.stderr)

    # Create ArchiveResults for plugins
    # If --plugin is specified, only run that one. Otherwise, run all available plugins.
    # The orchestrator will handle dependency ordering (plugins declare deps in config.json)
    for snapshot_id in snapshot_ids:
        try:
            snapshot = Snapshot.objects.get(id=snapshot_id)
            if plugin:
                # User specified a single plugin to run
                ArchiveResult.objects.get_or_create(
                    snapshot=snapshot,
                    extractor=plugin,
                    defaults={
                        'status': ArchiveResult.StatusChoices.QUEUED,
                        'retry_at': timezone.now(),
                        'created_by_id': snapshot.created_by_id,
                    }
                )
            else:
                # Create pending ArchiveResults for all enabled plugins
                # This uses hook discovery to find available plugins dynamically
                snapshot.create_pending_archiveresults()
            # Mark snapshot as started so the orchestrator picks it up
            snapshot.status = Snapshot.StatusChoices.STARTED
            snapshot.retry_at = timezone.now()
            snapshot.save()
        except Snapshot.DoesNotExist:
            continue

    # Run plugins (blocking) unless --no-wait was given
    if wait:
        rprint('[blue]Running outlink plugins...[/blue]', file=sys.stderr)
        orchestrator = Orchestrator(exit_on_idle=True)
        orchestrator.runloop()

    # Collect discovered URLs from urls.jsonl files
    # Uses dynamic discovery - any plugin that outputs urls.jsonl is considered a parser
    from archivebox.hooks import collect_urls_from_extractors
    discovered_urls = {}  # url -> JSONL entry; dict keeps first occurrence, dedupes
    for snapshot_id in snapshot_ids:
        try:
            snapshot = Snapshot.objects.get(id=snapshot_id)
            snapshot_dir = Path(snapshot.output_dir)
            # Dynamically collect urls.jsonl from ANY plugin subdirectory
            for entry in collect_urls_from_extractors(snapshot_dir):
                url = entry.get('url')
                if url and url not in discovered_urls:
                    # Add metadata for crawl tracking
                    entry['type'] = TYPE_SNAPSHOT
                    entry['depth'] = snapshot.depth + 1
                    entry['via_snapshot'] = str(snapshot.id)
                    discovered_urls[url] = entry
        except Snapshot.DoesNotExist:
            continue

    rprint(f'[green]Discovered {len(discovered_urls)} URLs[/green]', file=sys.stderr)

    # Output discovered URLs as JSONL (when piped) or human-readable (when TTY)
    for url, entry in discovered_urls.items():
        if is_tty:
            via = entry.get('via_extractor', 'unknown')
            rprint(f' [dim]{via}[/dim] {url[:80]}', file=sys.stderr)
        else:
            write_record(entry)
    return 0
def process_crawl_by_id(crawl_id: str) -> int:
    """
    Process a single Crawl by ID (used by workers).

    Triggers the Crawl's state machine tick() which will:
    - Transition from queued -> started (creates root snapshot)
    - Transition from started -> sealed (when all snapshots done)

    Returns 0 on success, 1 if the Crawl is missing or tick() raised.
    """
    from rich import print as rprint
    from crawls.models import Crawl

    # Look up the crawl; a missing row is a hard failure for the worker.
    crawl = Crawl.objects.filter(id=crawl_id).first()
    if crawl is None:
        rprint(f'[red]Crawl {crawl_id} not found[/red]', file=sys.stderr)
        return 1

    rprint(f'[blue]Processing Crawl {crawl.id} (status={crawl.status})[/blue]', file=sys.stderr)
    try:
        # Advance the state machine, then re-read the row to report the
        # status it landed on.
        crawl.sm.tick()
        crawl.refresh_from_db()
        rprint(f'[green]Crawl complete (status={crawl.status})[/green]', file=sys.stderr)
    except Exception as err:
        rprint(f'[red]Crawl error: {type(err).__name__}: {err}[/red]', file=sys.stderr)
        return 1
    return 0
def is_crawl_id(value: str) -> bool:
    """Check if value looks like a Crawl UUID."""
    import re
    # Cheap syntactic check first: bail out before touching the DB.
    if not re.match(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', value, re.I):
        return False
    # Verify it's actually a Crawl (not a Snapshot or other object)
    from crawls.models import Crawl
    return Crawl.objects.filter(id=value).exists()
@click.command()
@click.option('--depth', '-d', type=int, default=1, help='Max depth for recursive crawling (default: 1)')
@click.option('--plugin', '-p', default='', help='Use only this parser plugin (e.g., parse_html_urls, parse_dom_outlinks)')
@click.option('--wait/--no-wait', default=True, help='Wait for plugins to complete (default: wait)')
@click.argument('args', nargs=-1)
def main(depth: int, plugin: str, wait: bool, args: tuple):
    """Discover outgoing links from URLs or existing Snapshots, or process Crawl by ID"""
    import json
    from rich import print as rprint
    from archivebox.misc.jsonl import read_args_or_stdin

    # Read ALL input up-front. When args is empty this consumes stdin, which
    # cannot be re-read afterwards - see the BUGFIX note below.
    records = list(read_args_or_stdin(args))
    if not records:
        rprint('[yellow]No URLs, Snapshot IDs, or Crawl IDs provided. Pass as arguments or via stdin.[/yellow]', file=sys.stderr)
        sys.exit(1)

    # Check if input looks like existing Crawl IDs to process
    # If ALL inputs are Crawl UUIDs, process them
    all_are_crawl_ids = all(
        is_crawl_id(r.get('id') or r.get('url', ''))
        for r in records
    )
    if all_are_crawl_ids:
        # Process existing Crawls by ID; report failure if ANY crawl failed
        exit_code = 0
        for record in records:
            crawl_id = record.get('id') or record.get('url')
            result = process_crawl_by_id(crawl_id)
            if result != 0:
                exit_code = result
        sys.exit(exit_code)
    else:
        # Default behavior: discover outlinks from input (URLs or Snapshot IDs).
        # BUGFIX: discover_outlinks() calls read_args_or_stdin() itself, so
        # passing the original `args` tuple would re-read stdin - which was
        # already consumed above, silently dropping all piped input. Instead,
        # re-serialize the already-parsed records as JSONL argument lines
        # (read_args_or_stdin accepts JSONL in args as well as stdin).
        jsonl_args = tuple(json.dumps(r) for r in records)
        sys.exit(discover_outlinks(jsonl_args, depth=depth, plugin=plugin, wait=wait))


if __name__ == '__main__':
    main()

View File

@@ -1,49 +1,262 @@
#!/usr/bin/env python3
"""
archivebox extract [snapshot_ids...] [--plugin=NAME]
Run plugins on Snapshots. Accepts snapshot IDs as arguments, from stdin, or via JSONL.
Input formats:
- Snapshot UUIDs (one per line)
- JSONL: {"type": "Snapshot", "id": "...", "url": "..."}
- JSONL: {"type": "ArchiveResult", "snapshot_id": "...", "plugin": "..."}
Output (JSONL):
{"type": "ArchiveResult", "id": "...", "snapshot_id": "...", "plugin": "...", "status": "..."}
Examples:
# Extract specific snapshot
archivebox extract 01234567-89ab-cdef-0123-456789abcdef
# Pipe from snapshot command
archivebox snapshot https://example.com | archivebox extract
# Run specific plugin only
archivebox extract --plugin=screenshot 01234567-89ab-cdef-0123-456789abcdef
# Chain commands
archivebox crawl https://example.com | archivebox snapshot | archivebox extract
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox extract'
import sys
from typing import TYPE_CHECKING, Generator
from typing import Optional, List
import rich_click as click
from django.db.models import Q
from archivebox.misc.util import enforce_types, docstring
def process_archiveresult_by_id(archiveresult_id: str) -> int:
    """
    Run extraction for a single ArchiveResult by ID (used by workers).

    Triggers the ArchiveResult's state machine tick() to run the extractor.

    Returns 0 on success or still-pending status, 1 on failure.
    """
    from rich import print as rprint
    from core.models import ArchiveResult

    try:
        archiveresult = ArchiveResult.objects.get(id=archiveresult_id)
    except ArchiveResult.DoesNotExist:
        rprint(f'[red]ArchiveResult {archiveresult_id} not found[/red]', file=sys.stderr)
        return 1

    rprint(f'[blue]Extracting {archiveresult.extractor} for {archiveresult.snapshot.url}[/blue]', file=sys.stderr)

    try:
        # Trigger state machine tick - this runs the actual extraction
        archiveresult.sm.tick()
        archiveresult.refresh_from_db()
        # BUGFIX: these status messages used plain print() with rich markup,
        # which would emit literal "[green]...[/green]" tags; use rprint so
        # the markup is actually rendered like everywhere else in this file.
        if archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED:
            rprint(f'[green]Extraction succeeded: {archiveresult.output}[/green]')
            return 0
        elif archiveresult.status == ArchiveResult.StatusChoices.FAILED:
            rprint(f'[red]Extraction failed: {archiveresult.output}[/red]', file=sys.stderr)
            return 1
        else:
            # Still in progress or backoff - not a failure
            rprint(f'[yellow]Extraction status: {archiveresult.status}[/yellow]')
            return 0
    except Exception as e:
        rprint(f'[red]Extraction error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
        return 1
def run_plugins(
    args: tuple,
    plugin: str = '',
    wait: bool = True,
) -> int:
    """
    Run plugins on Snapshots from input.

    Reads Snapshot IDs or JSONL from args/stdin, runs plugins, outputs JSONL.

    Args:
        args: positional inputs (Snapshot UUIDs or JSONL lines); if empty,
              input is read from stdin via read_args_or_stdin.
              NOTE(review): this re-reads stdin — if the caller already consumed
              stdin, records will be empty here. Confirm against main().
        plugin: if non-empty, only this extractor gets queued/reported;
                otherwise all pending plugins are created for each snapshot.
        wait: when True, block on an Orchestrator runloop until plugins finish.

    Exit codes:
        0: Success
        1: Failure (no input, or no valid snapshots found)
    """
    from rich import print as rprint
    from django.utils import timezone
    from archivebox.misc.jsonl import (
        read_args_or_stdin, write_record, archiveresult_to_jsonl,
        TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
    )
    from core.models import Snapshot, ArchiveResult
    from workers.orchestrator import Orchestrator

    # TTY -> human-readable output on stderr; piped -> machine JSONL on stdout
    is_tty = sys.stdout.isatty()

    # Collect all input records
    records = list(read_args_or_stdin(args))
    if not records:
        rprint('[yellow]No snapshots provided. Pass snapshot IDs as arguments or via stdin.[/yellow]', file=sys.stderr)
        return 1

    # Gather snapshot IDs to process (a set, so duplicate inputs collapse)
    snapshot_ids = set()
    for record in records:
        record_type = record.get('type')
        if record_type == TYPE_SNAPSHOT:
            snapshot_id = record.get('id')
            if snapshot_id:
                snapshot_ids.add(snapshot_id)
            elif record.get('url'):
                # Look up by URL
                try:
                    snap = Snapshot.objects.get(url=record['url'])
                    snapshot_ids.add(str(snap.id))
                except Snapshot.DoesNotExist:
                    rprint(f'[yellow]Snapshot not found for URL: {record["url"]}[/yellow]', file=sys.stderr)
        elif record_type == TYPE_ARCHIVERESULT:
            # ArchiveResult records point back at their parent snapshot
            snapshot_id = record.get('snapshot_id')
            if snapshot_id:
                snapshot_ids.add(snapshot_id)
        elif 'id' in record:
            # Untyped record: assume it's a snapshot ID
            snapshot_ids.add(record['id'])

    if not snapshot_ids:
        rprint('[red]No valid snapshot IDs found in input[/red]', file=sys.stderr)
        return 1

    # Get snapshots and ensure they have pending ArchiveResults
    processed_count = 0
    for snapshot_id in snapshot_ids:
        try:
            snapshot = Snapshot.objects.get(id=snapshot_id)
        except Snapshot.DoesNotExist:
            rprint(f'[yellow]Snapshot {snapshot_id} not found[/yellow]', file=sys.stderr)
            continue
        # Create pending ArchiveResults if needed
        if plugin:
            # Only create for specific plugin
            result, created = ArchiveResult.objects.get_or_create(
                snapshot=snapshot,
                extractor=plugin,
                defaults={
                    'status': ArchiveResult.StatusChoices.QUEUED,
                    'retry_at': timezone.now(),
                    'created_by_id': snapshot.created_by_id,
                }
            )
            if not created and result.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]:
                # Reset for retry
                result.status = ArchiveResult.StatusChoices.QUEUED
                result.retry_at = timezone.now()
                result.save()
        else:
            # Create all pending plugins
            snapshot.create_pending_archiveresults()
        # Reset snapshot status to allow processing
        # NOTE(review): indentation reconstructed from a whitespace-stripped
        # source — confirm retry_at/save apply to every snapshot, not only
        # to SEALED ones.
        if snapshot.status == Snapshot.StatusChoices.SEALED:
            snapshot.status = Snapshot.StatusChoices.STARTED
        snapshot.retry_at = timezone.now()
        snapshot.save()
        processed_count += 1

    if processed_count == 0:
        rprint('[red]No snapshots to process[/red]', file=sys.stderr)
        return 1

    rprint(f'[blue]Queued {processed_count} snapshots for extraction[/blue]', file=sys.stderr)

    # Run orchestrator if --wait (default)
    if wait:
        rprint('[blue]Running plugins...[/blue]', file=sys.stderr)
        orchestrator = Orchestrator(exit_on_idle=True)
        orchestrator.runloop()

    # Output results as JSONL (when piped) or human-readable (when TTY)
    for snapshot_id in snapshot_ids:
        try:
            snapshot = Snapshot.objects.get(id=snapshot_id)
            results = snapshot.archiveresult_set.all()
            if plugin:
                results = results.filter(extractor=plugin)
            for result in results:
                if is_tty:
                    status_color = {
                        'succeeded': 'green',
                        'failed': 'red',
                        'skipped': 'yellow',
                    }.get(result.status, 'dim')
                    rprint(f' [{status_color}]{result.status}[/{status_color}] {result.extractor}{result.output or ""}', file=sys.stderr)
                else:
                    write_record(archiveresult_to_jsonl(result))
        except Snapshot.DoesNotExist:
            continue
    return 0
def is_archiveresult_id(value: str) -> bool:
    """Check if value looks like an ArchiveResult UUID."""
    import re
    # Cheap syntactic check first: bail out before touching the DB.
    if not re.match(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', value, re.I):
        return False
    # Verify it's actually an ArchiveResult (not a Snapshot or other object)
    from core.models import ArchiveResult
    return ArchiveResult.objects.filter(id=value).exists()
# <user>@<machine_id>#<datetime>/absolute/path/to/binary
# 2014.24.01
@click.command()
@click.option('--plugin', '-p', default='', help='Run only this plugin (e.g., screenshot, singlefile)')
@click.option('--wait/--no-wait', default=True, help='Wait for plugins to complete (default: wait)')
@click.argument('args', nargs=-1)
def main(plugin: str, wait: bool, args: tuple):
    """Run plugins on Snapshots, or process existing ArchiveResults by ID"""
    import json
    from rich import print as rprint
    from archivebox.misc.jsonl import read_args_or_stdin

    # Read ALL input up-front. When args is empty this consumes stdin, which
    # cannot be re-read afterwards - see the BUGFIX note below.
    records = list(read_args_or_stdin(args))
    if not records:
        rprint('[yellow]No Snapshot IDs or ArchiveResult IDs provided. Pass as arguments or via stdin.[/yellow]', file=sys.stderr)
        sys.exit(1)

    # Check if input looks like existing ArchiveResult IDs to process
    all_are_archiveresult_ids = all(
        is_archiveresult_id(r.get('id') or r.get('url', ''))
        for r in records
    )
    if all_are_archiveresult_ids:
        # Process existing ArchiveResults by ID; report failure if ANY failed
        exit_code = 0
        for record in records:
            archiveresult_id = record.get('id') or record.get('url')
            result = process_archiveresult_by_id(archiveresult_id)
            if result != 0:
                exit_code = result
        sys.exit(exit_code)
    else:
        # Default behavior: run plugins on Snapshots from input.
        # BUGFIX: run_plugins() calls read_args_or_stdin() itself, so passing
        # the original `args` tuple would re-read stdin - which was already
        # consumed above, silently dropping all piped input. Instead,
        # re-serialize the already-parsed records as JSONL argument lines
        # (read_args_or_stdin accepts JSONL in args as well as stdin).
        jsonl_args = tuple(json.dumps(r) for r in records)
        sys.exit(run_plugins(jsonl_args, plugin=plugin, wait=wait))


if __name__ == '__main__':
    main()

View File

@@ -21,10 +21,9 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
from archivebox.config import CONSTANTS, VERSION, DATA_DIR
from archivebox.config.common import SERVER_CONFIG
from archivebox.config.collection import write_config_file
from archivebox.index import load_main_index, write_main_index, fix_invalid_folder_locations, get_invalid_folders
from archivebox.index.schema import Link
from archivebox.index.json import parse_json_main_index, parse_json_links_details
from archivebox.index.sql import apply_migrations
from archivebox.misc.folders import fix_invalid_folder_locations, get_invalid_folders
from archivebox.misc.legacy import parse_json_main_index, parse_json_links_details, SnapshotDict
from archivebox.misc.db import apply_migrations
# if os.access(out_dir / CONSTANTS.JSON_INDEX_FILENAME, os.F_OK):
# print("[red]:warning: This folder contains a JSON index. It is deprecated, and will no longer be kept up to date automatically.[/red]", file=sys.stderr)
@@ -100,10 +99,10 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
from core.models import Snapshot
all_links = Snapshot.objects.none()
pending_links: dict[str, Link] = {}
pending_links: dict[str, SnapshotDict] = {}
if existing_index:
all_links = load_main_index(DATA_DIR, warn=False)
all_links = Snapshot.objects.all()
print(f' √ Loaded {all_links.count()} links from existing main index.')
if quick:
@@ -119,9 +118,9 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
# Links in JSON index but not in main index
orphaned_json_links = {
link.url: link
for link in parse_json_main_index(DATA_DIR)
if not all_links.filter(url=link.url).exists()
link_dict['url']: link_dict
for link_dict in parse_json_main_index(DATA_DIR)
if not all_links.filter(url=link_dict['url']).exists()
}
if orphaned_json_links:
pending_links.update(orphaned_json_links)
@@ -129,9 +128,9 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
# Links in data dir indexes but not in main index
orphaned_data_dir_links = {
link.url: link
for link in parse_json_links_details(DATA_DIR)
if not all_links.filter(url=link.url).exists()
link_dict['url']: link_dict
for link_dict in parse_json_links_details(DATA_DIR)
if not all_links.filter(url=link_dict['url']).exists()
}
if orphaned_data_dir_links:
pending_links.update(orphaned_data_dir_links)
@@ -159,7 +158,8 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
print(' archivebox init --quick', file=sys.stderr)
raise SystemExit(1)
write_main_index(list(pending_links.values()), DATA_DIR)
if pending_links:
Snapshot.objects.create_from_dicts(list(pending_links.values()))
print('\n[green]----------------------------------------------------------------------[/green]')

View File

@@ -4,7 +4,7 @@ __package__ = 'archivebox.cli'
import os
import sys
from typing import Optional, List
import shutil
import rich_click as click
from rich import print
@@ -13,149 +13,86 @@ from archivebox.misc.util import docstring, enforce_types
@enforce_types
def install(binproviders: Optional[List[str]]=None, binaries: Optional[List[str]]=None, dry_run: bool=False) -> None:
"""Automatically install all ArchiveBox dependencies and extras"""
# if running as root:
# - run init to create index + lib dir
# - chown -R 911 DATA_DIR
# - install all binaries as root
# - chown -R 911 LIB_DIR
# else:
# - run init to create index + lib dir as current user
# - install all binaries as current user
# - recommend user re-run with sudo if any deps need to be installed as root
def install(dry_run: bool=False) -> None:
"""Detect and install ArchiveBox dependencies by running a dependency-check crawl"""
import abx
import archivebox
from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, SudoPermission
from archivebox.config.paths import DATA_DIR, ARCHIVE_DIR, get_or_create_working_lib_dir
from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP
from archivebox.config.paths import ARCHIVE_DIR
from archivebox.misc.logging import stderr
from archivebox.cli.archivebox_init import init
from archivebox.misc.system import run as run_shell
if not (os.access(ARCHIVE_DIR, os.R_OK) and ARCHIVE_DIR.is_dir()):
init() # must init full index because we need a db to store InstalledBinary entries in
print('\n[green][+] Installing ArchiveBox dependencies automatically...[/green]')
# we never want the data dir to be owned by root, detect owner of existing owner of DATA_DIR to try and guess desired non-root UID
print('\n[green][+] Detecting ArchiveBox dependencies...[/green]')
if IS_ROOT:
EUID = os.geteuid()
# if we have sudo/root permissions, take advantage of them just while installing dependencies
print()
print(f'[yellow]:warning: Running as UID=[blue]{EUID}[/blue] with [red]sudo[/red] only for dependencies that need it.[/yellow]')
print(f' DATA_DIR, LIB_DIR, and TMP_DIR will be owned by [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue].')
print(f'[yellow]:warning: Running as UID=[blue]{EUID}[/blue].[/yellow]')
print(f' DATA_DIR will be owned by [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue].')
print()
LIB_DIR = get_or_create_working_lib_dir()
package_manager_names = ', '.join(
f'[yellow]{binprovider.name}[/yellow]'
for binprovider in reversed(list(abx.as_dict(abx.pm.hook.get_BINPROVIDERS()).values()))
if not binproviders or (binproviders and binprovider.name in binproviders)
)
print(f'[+] Setting up package managers {package_manager_names}...')
for binprovider in reversed(list(abx.as_dict(abx.pm.hook.get_BINPROVIDERS()).values())):
if binproviders and binprovider.name not in binproviders:
continue
try:
binprovider.setup()
except Exception:
# it's ok, installing binaries below will automatically set up package managers as needed
# e.g. if user does not have npm available we cannot set it up here yet, but once npm Binary is installed
# the next package that depends on npm will automatically call binprovider.setup() during its own install
pass
print()
for binary in reversed(list(abx.as_dict(abx.pm.hook.get_BINARIES()).values())):
if binary.name in ('archivebox', 'django', 'sqlite', 'python'):
# obviously must already be installed if we are running
continue
if binaries and binary.name not in binaries:
continue
providers = ' [grey53]or[/grey53] '.join(
provider.name for provider in binary.binproviders_supported
if not binproviders or (binproviders and provider.name in binproviders)
)
if not providers:
continue
print(f'[+] Detecting / Installing [yellow]{binary.name.ljust(22)}[/yellow] using [red]{providers}[/red]...')
try:
with SudoPermission(uid=0, fallback=True):
# print(binary.load_or_install(fresh=True).model_dump(exclude={'overrides', 'bin_dir', 'hook_type'}))
if binproviders:
providers_supported_by_binary = [provider.name for provider in binary.binproviders_supported]
for binprovider_name in binproviders:
if binprovider_name not in providers_supported_by_binary:
continue
try:
if dry_run:
# always show install commands when doing a dry run
sys.stderr.write("\033[2;49;90m") # grey53
result = binary.install(binproviders=[binprovider_name], dry_run=dry_run).model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
sys.stderr.write("\033[00m\n") # reset
else:
loaded_binary = archivebox.pm.hook.binary_load_or_install(binary=binary, binproviders=[binprovider_name], fresh=True, dry_run=dry_run, quiet=False)
result = loaded_binary.model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
if result and result['loaded_version']:
break
except Exception as e:
print(f'[red]:cross_mark: Failed to install {binary.name} as using {binprovider_name} as user {ARCHIVEBOX_USER}: {e}[/red]')
else:
if dry_run:
sys.stderr.write("\033[2;49;90m") # grey53
binary.install(dry_run=dry_run).model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
sys.stderr.write("\033[00m\n") # reset
else:
loaded_binary = archivebox.pm.hook.binary_load_or_install(binary=binary, fresh=True, dry_run=dry_run)
result = loaded_binary.model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
if IS_ROOT and LIB_DIR:
with SudoPermission(uid=0):
if ARCHIVEBOX_USER == 0:
os.system(f'chmod -R 777 "{LIB_DIR.resolve()}"')
else:
os.system(f'chown -R {ARCHIVEBOX_USER} "{LIB_DIR.resolve()}"')
except Exception as e:
print(f'[red]:cross_mark: Failed to install {binary.name} as user {ARCHIVEBOX_USER}: {e}[/red]')
if binaries and len(binaries) == 1:
# if we are only installing a single binary, raise the exception so the user can see what went wrong
raise
if dry_run:
print('[dim]Dry run - would create a crawl to detect dependencies[/dim]')
return
# Set up Django
from archivebox.config.django import setup_django
setup_django()
from django.utils import timezone
from crawls.models import Seed, Crawl
from archivebox.base_models.models import get_or_create_system_user_pk
# Create a seed and crawl for dependency detection
# Using a minimal crawl that will trigger on_Crawl hooks
created_by_id = get_or_create_system_user_pk()
seed = Seed.objects.create(
uri='archivebox://install',
label='Dependency detection',
created_by_id=created_by_id,
)
crawl = Crawl.objects.create(
seed=seed,
max_depth=0,
created_by_id=created_by_id,
status='queued',
)
print(f'[+] Created dependency detection crawl: {crawl.id}')
print('[+] Running crawl to detect binaries via on_Crawl hooks...')
print()
# Run the crawl synchronously (this triggers on_Crawl hooks)
from workers.orchestrator import Orchestrator
orchestrator = Orchestrator(exit_on_idle=True)
orchestrator.runloop()
print()
# Check for superuser
from django.contrib.auth import get_user_model
User = get_user_model()
if not User.objects.filter(is_superuser=True).exclude(username='system').exists():
stderr('\n[+] Don\'t forget to create a new admin user for the Web UI...', color='green')
stderr(' archivebox manage createsuperuser')
# run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir)
print('\n[green][√] Set up ArchiveBox and its dependencies successfully.[/green]\n', file=sys.stderr)
from abx_plugin_pip.binaries import ARCHIVEBOX_BINARY
extra_args = []
if binproviders:
extra_args.append(f'--binproviders={",".join(binproviders)}')
if binaries:
extra_args.append(f'--binaries={",".join(binaries)}')
proc = run_shell([ARCHIVEBOX_BINARY.load().abspath, 'version', *extra_args], capture_output=False, cwd=DATA_DIR)
raise SystemExit(proc.returncode)
print()
# Run version to show full status
archivebox_path = shutil.which('archivebox') or sys.executable
if 'python' in archivebox_path:
os.system(f'{sys.executable} -m archivebox version')
else:
os.system(f'{archivebox_path} version')
@click.command()
@click.option('--binproviders', '-p', type=str, help='Select binproviders to use DEFAULT=env,apt,brew,sys_pip,venv_pip,lib_pip,pipx,sys_npm,lib_npm,puppeteer,playwright (all)', default=None)
@click.option('--binaries', '-b', type=str, help='Select binaries to install DEFAULT=curl,wget,git,yt-dlp,chrome,single-file,readability-extractor,postlight-parser,... (all)', default=None)
@click.option('--dry-run', '-d', is_flag=True, help='Show what would be installed without actually installing anything', default=False)
@click.option('--dry-run', '-d', is_flag=True, help='Show what would happen without actually running', default=False)
@docstring(install.__doc__)
def main(**kwargs) -> None:
install(**kwargs)

View File

@@ -0,0 +1,67 @@
#!/usr/bin/env python3
"""
archivebox orchestrator [--daemon]
Start the orchestrator process that manages workers.
The orchestrator polls queues for each model type (Crawl, Snapshot, ArchiveResult)
and lazily spawns worker processes when there is work to be done.
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox orchestrator'
import sys
import rich_click as click
from archivebox.misc.util import docstring
def orchestrator(daemon: bool = False, watch: bool = False) -> int:
    """
    Start the orchestrator process.
    The orchestrator:
    1. Polls each model queue (Crawl, Snapshot, ArchiveResult)
    2. Spawns worker processes when there is work to do
    3. Monitors worker health and restarts failed workers
    4. Exits when all queues are empty (unless --daemon)
    Args:
        daemon: Run forever (don't exit when idle)
        watch: Just watch the queues without spawning workers (for debugging)
    Exit codes:
        0: All work completed successfully
        1: Error occurred
    """
    # Use rich's print so the [yellow]/[red] markup below actually renders;
    # this module otherwise only has the builtin print in scope, which would
    # emit the markup tags literally.
    from rich import print as rprint
    from workers.orchestrator import Orchestrator

    if Orchestrator.is_running():
        rprint('[yellow]Orchestrator is already running[/yellow]')
        return 0

    if watch:
        # --watch is advertised by the CLI but not implemented yet; warn loudly
        # instead of silently ignoring the flag and spawning workers anyway.
        rprint('[yellow]--watch is not implemented yet; running normally[/yellow]', file=sys.stderr)

    try:
        orchestrator_instance = Orchestrator(exit_on_idle=not daemon)
        orchestrator_instance.runloop()
        return 0
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop a long-running orchestrator, not an error.
        return 0
    except Exception as e:
        rprint(f'[red]Orchestrator error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
        return 1
@click.command()
@click.option('--daemon', '-d', is_flag=True, help="Run forever (don't exit on idle)")
@click.option('--watch', '-w', is_flag=True, help="Watch queues without spawning workers")
@docstring(orchestrator.__doc__)
def main(daemon: bool, watch: bool):
    """Start the ArchiveBox orchestrator process"""
    # Delegate to orchestrator() and surface its return value as the exit code.
    exit_code = orchestrator(daemon=daemon, watch=watch)
    sys.exit(exit_code)
if __name__ == '__main__':
main()

View File

@@ -12,10 +12,7 @@ import rich_click as click
from django.db.models import QuerySet
from archivebox.config import DATA_DIR
from archivebox.index.schema import Link
from archivebox.config.django import setup_django
from archivebox.index import load_main_index
from archivebox.index.sql import remove_from_sql_main_index
from archivebox.misc.util import enforce_types, docstring
from archivebox.misc.checks import check_data_folder
from archivebox.misc.logging_util import (
@@ -35,7 +32,7 @@ def remove(filter_patterns: Iterable[str]=(),
before: float | None=None,
yes: bool=False,
delete: bool=False,
out_dir: Path=DATA_DIR) -> Iterable[Link]:
out_dir: Path=DATA_DIR) -> QuerySet:
"""Remove the specified URLs from the archive"""
setup_django()
@@ -63,27 +60,27 @@ def remove(filter_patterns: Iterable[str]=(),
log_removal_finished(0, 0)
raise SystemExit(1)
log_links = [link.as_link() for link in snapshots]
log_list_finished(log_links)
log_removal_started(log_links, yes=yes, delete=delete)
log_list_finished(snapshots)
log_removal_started(snapshots, yes=yes, delete=delete)
timer = TimedProgress(360, prefix=' ')
try:
for snapshot in snapshots:
if delete:
shutil.rmtree(snapshot.as_link().link_dir, ignore_errors=True)
shutil.rmtree(snapshot.output_dir, ignore_errors=True)
finally:
timer.end()
to_remove = snapshots.count()
from archivebox.search import flush_search_index
from core.models import Snapshot
flush_search_index(snapshots=snapshots)
remove_from_sql_main_index(snapshots=snapshots, out_dir=out_dir)
all_snapshots = load_main_index(out_dir=out_dir)
snapshots.delete()
all_snapshots = Snapshot.objects.all()
log_removal_finished(all_snapshots.count(), to_remove)
return all_snapshots

View File

@@ -35,9 +35,12 @@ def schedule(add: bool=False,
depth = int(depth)
import shutil
from crontab import CronTab, CronSlices
from archivebox.misc.system import dedupe_cron_jobs
from abx_plugin_pip.binaries import ARCHIVEBOX_BINARY
# Find the archivebox binary path
ARCHIVEBOX_ABSPATH = shutil.which('archivebox') or sys.executable.replace('python', 'archivebox')
Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)
@@ -58,7 +61,7 @@ def schedule(add: bool=False,
'cd',
quoted(out_dir),
'&&',
quoted(ARCHIVEBOX_BINARY.load().abspath),
quoted(ARCHIVEBOX_ABSPATH),
*([
'add',
*(['--overwrite'] if overwrite else []),

View File

@@ -4,7 +4,7 @@ __package__ = 'archivebox.cli'
__command__ = 'archivebox search'
from pathlib import Path
from typing import Optional, List, Iterable
from typing import Optional, List, Any
import rich_click as click
from rich import print
@@ -12,11 +12,19 @@ from rich import print
from django.db.models import QuerySet
from archivebox.config import DATA_DIR
from archivebox.index import LINK_FILTERS
from archivebox.index.schema import Link
from archivebox.misc.logging import stderr
from archivebox.misc.util import enforce_types, docstring
# Filter types for URL matching
LINK_FILTERS = {
'exact': lambda pattern: {'url': pattern},
'substring': lambda pattern: {'url__icontains': pattern},
'regex': lambda pattern: {'url__iregex': pattern},
'domain': lambda pattern: {'url__istartswith': f'http://{pattern}'},
'tag': lambda pattern: {'tags__name': pattern},
'timestamp': lambda pattern: {'timestamp': pattern},
}
STATUS_CHOICES = [
'indexed', 'archived', 'unarchived', 'present', 'valid', 'invalid',
'duplicate', 'orphaned', 'corrupted', 'unrecognized'
@@ -24,38 +32,37 @@ STATUS_CHOICES = [
def list_links(snapshots: Optional[QuerySet]=None,
filter_patterns: Optional[List[str]]=None,
filter_type: str='substring',
after: Optional[float]=None,
before: Optional[float]=None,
out_dir: Path=DATA_DIR) -> Iterable[Link]:
from archivebox.index import load_main_index
from archivebox.index import snapshot_filter
def get_snapshots(snapshots: Optional[QuerySet]=None,
filter_patterns: Optional[List[str]]=None,
filter_type: str='substring',
after: Optional[float]=None,
before: Optional[float]=None,
out_dir: Path=DATA_DIR) -> QuerySet:
"""Filter and return Snapshots matching the given criteria."""
from core.models import Snapshot
if snapshots:
all_snapshots = snapshots
result = snapshots
else:
all_snapshots = load_main_index(out_dir=out_dir)
result = Snapshot.objects.all()
if after is not None:
all_snapshots = all_snapshots.filter(timestamp__gte=after)
result = result.filter(timestamp__gte=after)
if before is not None:
all_snapshots = all_snapshots.filter(timestamp__lt=before)
result = result.filter(timestamp__lt=before)
if filter_patterns:
all_snapshots = snapshot_filter(all_snapshots, filter_patterns, filter_type)
result = Snapshot.objects.filter_by_patterns(filter_patterns, filter_type)
if not all_snapshots:
if not result:
stderr('[!] No Snapshots matched your filters:', filter_patterns, f'({filter_type})', color='lightyellow')
return all_snapshots
return result
def list_folders(links: list[Link], status: str, out_dir: Path=DATA_DIR) -> dict[str, Link | None]:
def list_folders(snapshots: QuerySet, status: str, out_dir: Path=DATA_DIR) -> dict[str, Any]:
from archivebox.misc.checks import check_data_folder
from archivebox.index import (
from archivebox.misc.folders import (
get_indexed_folders,
get_archived_folders,
get_unarchived_folders,
@@ -67,7 +74,7 @@ def list_folders(links: list[Link], status: str, out_dir: Path=DATA_DIR) -> dict
get_corrupted_folders,
get_unrecognized_folders,
)
check_data_folder()
STATUS_FUNCTIONS = {
@@ -84,7 +91,7 @@ def list_folders(links: list[Link], status: str, out_dir: Path=DATA_DIR) -> dict
}
try:
return STATUS_FUNCTIONS[status](links, out_dir=out_dir)
return STATUS_FUNCTIONS[status](snapshots, out_dir=out_dir)
except KeyError:
raise ValueError('Status not recognized.')
@@ -109,7 +116,7 @@ def search(filter_patterns: list[str] | None=None,
stderr('[X] --with-headers requires --json, --html or --csv\n', color='red')
raise SystemExit(2)
snapshots = list_links(
snapshots = get_snapshots(
filter_patterns=list(filter_patterns) if filter_patterns else None,
filter_type=filter_type,
before=before,
@@ -120,20 +127,24 @@ def search(filter_patterns: list[str] | None=None,
snapshots = snapshots.order_by(sort)
folders = list_folders(
links=snapshots,
snapshots=snapshots,
status=status,
out_dir=DATA_DIR,
)
if json:
from archivebox.index.json import generate_json_index_from_links
output = generate_json_index_from_links(folders.values(), with_headers)
from core.models import Snapshot
# Filter for non-None snapshots
valid_snapshots = [s for s in folders.values() if s is not None]
output = Snapshot.objects.filter(pk__in=[s.pk for s in valid_snapshots]).to_json(with_headers=with_headers)
elif html:
from archivebox.index.html import generate_index_from_links
output = generate_index_from_links(folders.values(), with_headers)
from core.models import Snapshot
valid_snapshots = [s for s in folders.values() if s is not None]
output = Snapshot.objects.filter(pk__in=[s.pk for s in valid_snapshots]).to_html(with_headers=with_headers)
elif csv:
from archivebox.index.csv import links_to_csv
output = links_to_csv(folders.values(), csv.split(','), with_headers)
from core.models import Snapshot
valid_snapshots = [s for s in folders.values() if s is not None]
output = Snapshot.objects.filter(pk__in=[s.pk for s in valid_snapshots]).to_csv(cols=csv.split(','), header=with_headers)
else:
from archivebox.misc.logging_util import printable_folders
output = printable_folders(folders, with_headers)

View File

@@ -0,0 +1,218 @@
#!/usr/bin/env python3
"""
archivebox snapshot [urls...] [--depth=N] [--tag=TAG] [--plugins=...]
Create Snapshots from URLs. Accepts URLs as arguments, from stdin, or via JSONL.
Input formats:
- Plain URLs (one per line)
- JSONL: {"type": "Snapshot", "url": "...", "title": "...", "tags": "..."}
Output (JSONL):
{"type": "Snapshot", "id": "...", "url": "...", "status": "queued", ...}
Examples:
# Create snapshots from URLs
archivebox snapshot https://example.com https://foo.com
# Pipe from stdin
echo 'https://example.com' | archivebox snapshot
# Chain with extract
archivebox snapshot https://example.com | archivebox extract
# With crawl depth
archivebox snapshot --depth=1 https://example.com
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox snapshot'
import sys
from typing import Optional
import rich_click as click
from archivebox.misc.util import docstring
def process_snapshot_by_id(snapshot_id: str) -> int:
    """
    Process a single Snapshot by ID (used by workers).
    Triggers the Snapshot's state machine tick() which will:
    - Transition from queued -> started (creates pending ArchiveResults)
    - Transition from started -> sealed (when all ArchiveResults done)
    Returns 0 on success, 1 if the Snapshot is missing or tick() raises.
    """
    from rich import print as rprint
    from core.models import Snapshot

    # Look up the snapshot; report a missing ID as a failure exit code.
    snapshot = Snapshot.objects.filter(id=snapshot_id).first()
    if snapshot is None:
        rprint(f'[red]Snapshot {snapshot_id} not found[/red]', file=sys.stderr)
        return 1

    rprint(f'[blue]Processing Snapshot {snapshot.id} {snapshot.url[:50]} (status={snapshot.status})[/blue]', file=sys.stderr)

    try:
        # Advance the state machine, then reload to report the resulting status.
        snapshot.sm.tick()
        snapshot.refresh_from_db()
        rprint(f'[green]Snapshot complete (status={snapshot.status})[/green]', file=sys.stderr)
    except Exception as err:
        rprint(f'[red]Snapshot error: {type(err).__name__}: {err}[/red]', file=sys.stderr)
        return 1
    return 0
def create_snapshots(
    urls: tuple,
    depth: int = 0,
    tag: str = '',
    plugins: str = '',
    created_by_id: Optional[int] = None,
) -> int:
    """
    Create Snapshots from URLs or JSONL records.
    Reads from args or stdin, creates Snapshot objects, outputs JSONL.
    If --plugins is passed, also runs specified plugins (blocking).

    Args:
        urls: raw CLI args; parsed into records by read_args_or_stdin
              (stdin is consulted when no args are given)
        depth: when > 0, a Crawl is created to manage recursive discovery
        tag: comma-separated tags applied to records that have none
        plugins: when non-empty, run the orchestrator after creating snapshots
        created_by_id: owner user pk; defaults to the system user

    Exit codes:
        0: Success
        1: Failure
    """
    from rich import print as rprint
    from django.utils import timezone
    # NOTE: TYPE_TAG is imported here but never used in this function.
    from archivebox.misc.jsonl import (
        read_args_or_stdin, write_record, snapshot_to_jsonl,
        TYPE_SNAPSHOT, TYPE_TAG, get_or_create_snapshot
    )
    from archivebox.base_models.models import get_or_create_system_user_pk
    from core.models import Snapshot
    from crawls.models import Seed, Crawl
    from archivebox.config import CONSTANTS
    created_by_id = created_by_id or get_or_create_system_user_pk()
    # JSONL output is suppressed when stdout is a terminal (human-readable mode).
    is_tty = sys.stdout.isatty()
    # Collect all input records
    records = list(read_args_or_stdin(urls))
    if not records:
        rprint('[yellow]No URLs provided. Pass URLs as arguments or via stdin.[/yellow]', file=sys.stderr)
        return 1
    # If depth > 0, we need a Crawl to manage recursive discovery
    crawl = None
    if depth > 0:
        # Create a seed for this batch: persist the input URLs to a timestamped
        # sources file, then build the Seed/Crawl pair from it.
        sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__snapshot.txt'
        sources_file.parent.mkdir(parents=True, exist_ok=True)
        sources_file.write_text('\n'.join(r.get('url', '') for r in records if r.get('url')))
        seed = Seed.from_file(
            sources_file,
            label=f'snapshot --depth={depth}',
            created_by=created_by_id,
        )
        crawl = Crawl.from_seed(seed, max_depth=depth)
    # Process each record
    created_snapshots = []
    for record in records:
        # Skip records that are neither typed as Snapshots nor carry a URL.
        if record.get('type') != TYPE_SNAPSHOT and 'url' not in record:
            continue
        try:
            # Add crawl info if we have one
            if crawl:
                record['crawl_id'] = str(crawl.id)
                record['depth'] = record.get('depth', 0)
            # Add tags if provided via CLI (record-level tags take precedence)
            if tag and not record.get('tags'):
                record['tags'] = tag
            # Get or create the snapshot
            snapshot = get_or_create_snapshot(record, created_by_id=created_by_id)
            created_snapshots.append(snapshot)
            # Output JSONL record (only when piped)
            if not is_tty:
                write_record(snapshot_to_jsonl(snapshot))
        except Exception as e:
            # Best-effort batch: one bad record doesn't abort the rest.
            rprint(f'[red]Error creating snapshot: {e}[/red]', file=sys.stderr)
            continue
    if not created_snapshots:
        rprint('[red]No snapshots created[/red]', file=sys.stderr)
        return 1
    rprint(f'[green]Created {len(created_snapshots)} snapshots[/green]', file=sys.stderr)
    # If TTY, show human-readable output
    if is_tty:
        for snapshot in created_snapshots:
            rprint(f' [dim]{snapshot.id}[/dim] {snapshot.url[:60]}', file=sys.stderr)
    # If --plugins is passed, run the orchestrator for those plugins
    if plugins:
        from workers.orchestrator import Orchestrator
        # NOTE: the 'or "all"' fallback below is dead code — this branch only
        # runs when plugins is truthy.
        # NOTE(review): the plugins list is not passed to the Orchestrator;
        # presumably the runloop processes ALL queued work — confirm whether
        # per-plugin filtering is intended here.
        rprint(f'[blue]Running plugins: {plugins or "all"}...[/blue]', file=sys.stderr)
        orchestrator = Orchestrator(exit_on_idle=True)
        orchestrator.runloop()
    return 0
def is_snapshot_id(value: str) -> bool:
    """Check if value looks like a Snapshot UUID (hyphenated 8-4-4-4-12 hex)."""
    import re
    # Build the canonical hyphenated-UUID pattern from its group widths.
    widths = (8, 4, 4, 4, 12)
    pattern = '^' + '-'.join('[0-9a-f]{%d}' % w for w in widths) + '$'
    return re.match(pattern, value, re.IGNORECASE) is not None
@click.command()
@click.option('--depth', '-d', type=int, default=0, help='Recursively crawl linked pages up to N levels deep')
@click.option('--tag', '-t', default='', help='Comma-separated tags to add to each snapshot')
@click.option('--plugins', '-p', default='', help='Comma-separated list of plugins to run after creating snapshots (e.g. title,screenshot)')
@click.argument('args', nargs=-1)
def main(depth: int, tag: str, plugins: str, args: tuple):
    """Create Snapshots from URLs, or process existing Snapshots by ID"""
    from archivebox.misc.jsonl import read_args_or_stdin
    # Read all input
    records = list(read_args_or_stdin(args))
    if not records:
        from rich import print as rprint
        rprint('[yellow]No URLs or Snapshot IDs provided. Pass as arguments or via stdin.[/yellow]', file=sys.stderr)
        sys.exit(1)
    # Check if input looks like existing Snapshot IDs to process
    # If ALL inputs are UUIDs with no URL, assume we're processing existing Snapshots
    all_are_ids = all(
        (r.get('id') and not r.get('url')) or is_snapshot_id(r.get('url', ''))
        for r in records
    )
    if all_are_ids:
        # Process existing Snapshots by ID; return the last non-zero exit code.
        exit_code = 0
        for record in records:
            snapshot_id = record.get('id') or record.get('url')
            result = process_snapshot_by_id(snapshot_id)
            if result != 0:
                exit_code = result
        sys.exit(exit_code)
    else:
        # Create new Snapshots from URLs
        # NOTE(review): create_snapshots() calls read_args_or_stdin(args) again;
        # when input arrived via stdin (args empty), stdin was already consumed
        # above and the second read may see nothing — confirm read_args_or_stdin
        # buffers stdin, or pass the parsed records through instead.
        sys.exit(create_snapshots(args, depth=depth, tag=tag, plugins=plugins))
if __name__ == '__main__':
main()

View File

@@ -10,9 +10,8 @@ from rich import print
from archivebox.misc.util import enforce_types, docstring
from archivebox.config import DATA_DIR, CONSTANTS, ARCHIVE_DIR
from archivebox.config.common import SHELL_CONFIG
from archivebox.index.json import parse_json_links_details
from archivebox.index import (
load_main_index,
from archivebox.misc.legacy import parse_json_links_details
from archivebox.misc.folders import (
get_indexed_folders,
get_archived_folders,
get_invalid_folders,
@@ -33,7 +32,7 @@ def status(out_dir: Path=DATA_DIR) -> None:
"""Print out some info and statistics about the archive collection"""
from django.contrib.auth import get_user_model
from archivebox.index.sql import get_admins
from archivebox.misc.db import get_admins
from core.models import Snapshot
User = get_user_model()
@@ -44,7 +43,7 @@ def status(out_dir: Path=DATA_DIR) -> None:
print(f' Index size: {size} across {num_files} files')
print()
links = load_main_index(out_dir=out_dir)
links = Snapshot.objects.all()
num_sql_links = links.count()
num_link_details = sum(1 for link in parse_json_links_details(out_dir=out_dir))
print(f' > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {CONSTANTS.SQL_INDEX_FILENAME})')

View File

@@ -8,8 +8,7 @@ import rich_click as click
from typing import Iterable
from archivebox.misc.util import enforce_types, docstring
from archivebox.index import (
LINK_FILTERS,
from archivebox.misc.folders import (
get_indexed_folders,
get_archived_folders,
get_unarchived_folders,
@@ -22,6 +21,16 @@ from archivebox.index import (
get_unrecognized_folders,
)
# Filter types for URL matching
LINK_FILTERS = {
'exact': lambda pattern: {'url': pattern},
'substring': lambda pattern: {'url__icontains': pattern},
'regex': lambda pattern: {'url__iregex': pattern},
'domain': lambda pattern: {'url__istartswith': f'http://{pattern}'},
'tag': lambda pattern: {'tags__name': pattern},
'timestamp': lambda pattern: {'timestamp': pattern},
}
@enforce_types
def update(filter_patterns: Iterable[str]=(),
@@ -33,15 +42,66 @@ def update(filter_patterns: Iterable[str]=(),
after: float | None=None,
status: str='indexed',
filter_type: str='exact',
extract: str="") -> None:
plugins: str="",
max_workers: int=4) -> None:
"""Import any new links from subscriptions and retry any previously failed/skipped links"""
from rich import print
from archivebox.config.django import setup_django
setup_django()
from django.utils import timezone
from core.models import Snapshot
from workers.orchestrator import parallel_archive
from workers.orchestrator import Orchestrator
orchestrator = Orchestrator(exit_on_idle=False)
orchestrator.start()
# Get snapshots to update based on filters
snapshots = Snapshot.objects.all()
if filter_patterns:
snapshots = Snapshot.objects.filter_by_patterns(list(filter_patterns), filter_type)
if status == 'unarchived':
snapshots = snapshots.filter(downloaded_at__isnull=True)
elif status == 'archived':
snapshots = snapshots.filter(downloaded_at__isnull=False)
if before:
from datetime import datetime
snapshots = snapshots.filter(bookmarked_at__lt=datetime.fromtimestamp(before))
if after:
from datetime import datetime
snapshots = snapshots.filter(bookmarked_at__gt=datetime.fromtimestamp(after))
if resume:
snapshots = snapshots.filter(timestamp__gte=str(resume))
snapshot_ids = list(snapshots.values_list('pk', flat=True))
if not snapshot_ids:
print('[yellow]No snapshots found matching the given filters[/yellow]')
return
print(f'[green]\\[*] Found {len(snapshot_ids)} snapshots to update[/green]')
if index_only:
print('[yellow]Index-only mode - skipping archiving[/yellow]')
return
methods = plugins.split(',') if plugins else None
# Queue snapshots for archiving via the state machine system
# Workers will pick them up and run the plugins
if len(snapshot_ids) > 1 and max_workers > 1:
parallel_archive(snapshot_ids, max_workers=max_workers, overwrite=overwrite, methods=methods)
else:
# Queue snapshots by setting status to queued
for snapshot in snapshots:
Snapshot.objects.filter(id=snapshot.id).update(
status=Snapshot.StatusChoices.QUEUED,
retry_at=timezone.now(),
)
print(f'[green]Queued {len(snapshot_ids)} snapshots for archiving[/green]')
@click.command()
@@ -71,7 +131,8 @@ Update only links or data directories that have the given status:
unrecognized {get_unrecognized_folders.__doc__}
''')
@click.option('--filter-type', '-t', type=click.Choice([*LINK_FILTERS.keys(), 'search']), default='exact', help='Type of pattern matching to use when filtering URLs')
@click.option('--extract', '-e', default='', help='Comma-separated list of extractors to use e.g. title,favicon,screenshot,singlefile,...')
@click.option('--plugins', '-p', default='', help='Comma-separated list of plugins to use e.g. title,favicon,screenshot,singlefile,...')
@click.option('--max-workers', '-j', type=int, default=4, help='Number of parallel worker processes for archiving')
@click.argument('filter_patterns', nargs=-1)
@docstring(update.__doc__)
def main(**kwargs):

View File

@@ -3,7 +3,10 @@
__package__ = 'archivebox.cli'
import sys
from typing import Iterable
import os
import platform
from pathlib import Path
from typing import Iterable, Optional
import rich_click as click
@@ -12,7 +15,6 @@ from archivebox.misc.util import docstring, enforce_types
@enforce_types
def version(quiet: bool=False,
binproviders: Iterable[str]=(),
binaries: Iterable[str]=()) -> list[str]:
"""Print the ArchiveBox version, debug metadata, and installed dependency versions"""
@@ -22,37 +24,24 @@ def version(quiet: bool=False,
if quiet or '--version' in sys.argv:
return []
# Only do slower imports when getting full version info
import os
import platform
from pathlib import Path
from rich.panel import Panel
from rich.console import Console
from abx_pkg import Binary
import abx
import archivebox
from archivebox.config import CONSTANTS, DATA_DIR
from archivebox.config.version import get_COMMIT_HASH, get_BUILD_TIME
from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, RUNNING_AS_UID, RUNNING_AS_GID, IN_DOCKER
from archivebox.config.paths import get_data_locations, get_code_locations
from archivebox.config.common import SHELL_CONFIG, STORAGE_CONFIG, SEARCH_BACKEND_CONFIG
from archivebox.misc.logging_util import printable_folder_status
from abx_plugin_default_binproviders import apt, brew, env
from archivebox.config.configset import get_config
console = Console()
prnt = console.print
LDAP_ENABLED = archivebox.pm.hook.get_SCOPE_CONFIG().LDAP_ENABLED
# Check if LDAP is enabled (simple config lookup)
config = get_config()
LDAP_ENABLED = config.get('LDAP_ENABLED', False)
# 0.7.1
# ArchiveBox v0.7.1+editable COMMIT_HASH=951bba5 BUILD_TIME=2023-12-17 16:46:05 1702860365
# IN_DOCKER=False IN_QEMU=False ARCH=arm64 OS=Darwin PLATFORM=macOS-14.2-arm64-arm-64bit PYTHON=Cpython
# FS_ATOMIC=True FS_REMOTE=False FS_USER=501:20 FS_PERMS=644
# DEBUG=False IS_TTY=True TZ=UTC SEARCH_BACKEND=ripgrep LDAP=False
p = platform.uname()
COMMIT_HASH = get_COMMIT_HASH()
prnt(
@@ -68,15 +57,26 @@ def version(quiet: bool=False,
f'PLATFORM={platform.platform()}',
f'PYTHON={sys.implementation.name.title()}' + (' (venv)' if CONSTANTS.IS_INSIDE_VENV else ''),
)
OUTPUT_IS_REMOTE_FS = get_data_locations().DATA_DIR.is_mount or get_data_locations().ARCHIVE_DIR.is_mount
DATA_DIR_STAT = CONSTANTS.DATA_DIR.stat()
prnt(
f'EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}',
f'FS_UID={DATA_DIR_STAT.st_uid}:{DATA_DIR_STAT.st_gid}',
f'FS_PERMS={STORAGE_CONFIG.OUTPUT_PERMISSIONS}',
f'FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}',
f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
)
try:
OUTPUT_IS_REMOTE_FS = get_data_locations().DATA_DIR.is_mount or get_data_locations().ARCHIVE_DIR.is_mount
except Exception:
OUTPUT_IS_REMOTE_FS = False
try:
DATA_DIR_STAT = CONSTANTS.DATA_DIR.stat()
prnt(
f'EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}',
f'FS_UID={DATA_DIR_STAT.st_uid}:{DATA_DIR_STAT.st_gid}',
f'FS_PERMS={STORAGE_CONFIG.OUTPUT_PERMISSIONS}',
f'FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}',
f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
)
except Exception:
prnt(
f'EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}',
)
prnt(
f'DEBUG={SHELL_CONFIG.DEBUG}',
f'IS_TTY={SHELL_CONFIG.IS_TTY}',
@@ -84,14 +84,11 @@ def version(quiet: bool=False,
f'ID={CONSTANTS.MACHINE_ID}:{CONSTANTS.COLLECTION_ID}',
f'SEARCH_BACKEND={SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}',
f'LDAP={LDAP_ENABLED}',
#f'DB=django.db.backends.sqlite3 (({CONFIG["SQLITE_JOURNAL_MODE"]})', # add this if we have more useful info to show eventually
)
prnt()
if not (os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) and os.access(CONSTANTS.CONFIG_FILE, os.R_OK)):
PANEL_TEXT = '\n'.join((
# '',
# f'[yellow]CURRENT DIR =[/yellow] [red]{os.getcwd()}[/red]',
'',
'[violet]Hint:[/violet] [green]cd[/green] into a collection [blue]DATA_DIR[/blue] and run [green]archivebox version[/green] again...',
' [grey53]OR[/grey53] run [green]archivebox init[/green] to create a new collection in the current dir.',
@@ -105,77 +102,94 @@ def version(quiet: bool=False,
prnt('[pale_green1][i] Binary Dependencies:[/pale_green1]')
failures = []
BINARIES = abx.as_dict(archivebox.pm.hook.get_BINARIES())
for name, binary in list(BINARIES.items()):
if binary.name == 'archivebox':
continue
# skip if the binary is not in the requested list of binaries
if binaries and binary.name not in binaries:
continue
# skip if the binary is not supported by any of the requested binproviders
if binproviders and binary.binproviders_supported and not any(provider.name in binproviders for provider in binary.binproviders_supported):
continue
err = None
try:
loaded_bin = binary.load()
except Exception as e:
err = e
loaded_bin = binary
provider_summary = f'[dark_sea_green3]{loaded_bin.binprovider.name.ljust(10)}[/dark_sea_green3]' if loaded_bin.binprovider else '[grey23]not found[/grey23] '
if loaded_bin.abspath:
abspath = str(loaded_bin.abspath).replace(str(DATA_DIR), '[light_slate_blue].[/light_slate_blue]').replace(str(Path('~').expanduser()), '~')
if ' ' in abspath:
abspath = abspath.replace(' ', r'\ ')
else:
abspath = f'[red]{err}[/red]'
prnt('', '[green]√[/green]' if loaded_bin.is_valid else '[red]X[/red]', '', loaded_bin.name.ljust(21), str(loaded_bin.version).ljust(12), provider_summary, abspath, overflow='ignore', crop=False)
if not loaded_bin.is_valid:
failures.append(loaded_bin.name)
prnt()
prnt('[gold3][i] Package Managers:[/gold3]')
BINPROVIDERS = abx.as_dict(archivebox.pm.hook.get_BINPROVIDERS())
for name, binprovider in list(BINPROVIDERS.items()):
err = None
if binproviders and binprovider.name not in binproviders:
continue
# TODO: implement a BinProvider.BINARY() method that gets the loaded binary for a binprovider's INSTALLER_BIN
loaded_bin = binprovider.INSTALLER_BINARY or Binary(name=binprovider.INSTALLER_BIN, binproviders=[env, apt, brew])
abspath = str(loaded_bin.abspath).replace(str(DATA_DIR), '[light_slate_blue].[/light_slate_blue]').replace(str(Path('~').expanduser()), '~')
abspath = None
if loaded_bin.abspath:
abspath = str(loaded_bin.abspath).replace(str(DATA_DIR), '.').replace(str(Path('~').expanduser()), '~')
if ' ' in abspath:
abspath = abspath.replace(' ', r'\ ')
PATH = str(binprovider.PATH).replace(str(DATA_DIR), '[light_slate_blue].[/light_slate_blue]').replace(str(Path('~').expanduser()), '~')
ownership_summary = f'UID=[blue]{str(binprovider.EUID).ljust(4)}[/blue]'
provider_summary = f'[dark_sea_green3]{str(abspath).ljust(52)}[/dark_sea_green3]' if abspath else f'[grey23]{"not available".ljust(52)}[/grey23]'
prnt('', '[green]√[/green]' if binprovider.is_valid else '[grey53]-[/grey53]', '', binprovider.name.ljust(11), provider_summary, ownership_summary, f'PATH={PATH}', overflow='ellipsis', soft_wrap=True)
if not (binaries or binproviders):
# dont show source code / data dir info if we just want to get version info for a binary or binprovider
# Setup Django before importing models
from archivebox.config.django import setup_django
setup_django()
from machine.models import Machine, InstalledBinary
machine = Machine.current()
# Get all *_BINARY config values
binary_config_keys = [key for key in config.keys() if key.endswith('_BINARY')]
if not binary_config_keys:
prnt('', '[grey53]No binary dependencies defined in config.[/grey53]')
else:
for key in sorted(set(binary_config_keys)):
# Get the actual binary name/path from config value
bin_value = config.get(key, '').strip()
if not bin_value:
continue
# Check if it's a path (has slashes) or just a name
is_path = '/' in bin_value
if is_path:
# It's a full path - match against abspath
bin_name = Path(bin_value).name
# Skip if user specified specific binaries and this isn't one
if binaries and bin_name not in binaries:
continue
# Find InstalledBinary where abspath ends with this path
installed = InstalledBinary.objects.filter(
machine=machine,
abspath__endswith=bin_value,
).exclude(abspath='').exclude(abspath__isnull=True).order_by('-modified_at').first()
else:
# It's just a binary name - match against name
bin_name = bin_value
# Skip if user specified specific binaries and this isn't one
if binaries and bin_name not in binaries:
continue
# Find InstalledBinary by name
installed = InstalledBinary.objects.filter(
machine=machine,
name__iexact=bin_name,
).exclude(abspath='').exclude(abspath__isnull=True).order_by('-modified_at').first()
if installed and installed.is_valid:
display_path = installed.abspath.replace(str(DATA_DIR), '.').replace(str(Path('~').expanduser()), '~')
version_str = (installed.version or 'unknown')[:15]
provider = (installed.binprovider or 'env')[:8]
prnt('', '[green]√[/green]', '', bin_name.ljust(18), version_str.ljust(16), provider.ljust(8), display_path, overflow='ignore', crop=False)
else:
prnt('', '[red]X[/red]', '', bin_name.ljust(18), '[grey53]not installed[/grey53]', overflow='ignore', crop=False)
failures.append(bin_name)
# Show hint if no binaries are installed yet
has_any_installed = InstalledBinary.objects.filter(machine=machine).exclude(abspath='').exists()
if not has_any_installed:
prnt()
prnt('', '[grey53]Run [green]archivebox install[/green] to detect and install dependencies.[/grey53]')
if not binaries:
# Show code and data locations
prnt()
prnt('[deep_sky_blue3][i] Code locations:[/deep_sky_blue3]')
for name, path in get_code_locations().items():
prnt(printable_folder_status(name, path), overflow='ignore', crop=False)
try:
for name, path in get_code_locations().items():
if isinstance(path, dict):
prnt(printable_folder_status(name, path), overflow='ignore', crop=False)
except Exception as e:
prnt(f' [red]Error getting code locations: {e}[/red]')
prnt()
if os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) or os.access(CONSTANTS.CONFIG_FILE, os.R_OK):
prnt('[bright_yellow][i] Data locations:[/bright_yellow]')
for name, path in get_data_locations().items():
prnt(printable_folder_status(name, path), overflow='ignore', crop=False)
from archivebox.misc.checks import check_data_dir_permissions
try:
for name, path in get_data_locations().items():
if isinstance(path, dict):
prnt(printable_folder_status(name, path), overflow='ignore', crop=False)
except Exception as e:
prnt(f' [red]Error getting data locations: {e}[/red]')
check_data_dir_permissions()
try:
from archivebox.misc.checks import check_data_dir_permissions
check_data_dir_permissions()
except Exception:
pass
else:
prnt()
prnt('[red][i] Data locations:[/red] (not in a data directory)')
@@ -194,7 +208,6 @@ def version(quiet: bool=False,
@click.command()
@click.option('--quiet', '-q', is_flag=True, help='Only print ArchiveBox version number and nothing else. (equivalent to archivebox --version)')
@click.option('--binproviders', '-p', help='Select binproviders to detect DEFAULT=env,apt,brew,sys_pip,venv_pip,lib_pip,pipx,sys_npm,lib_npm,puppeteer,playwright (all)')
@click.option('--binaries', '-b', help='Select binaries to detect DEFAULT=curl,wget,git,yt-dlp,chrome,single-file,readability-extractor,postlight-parser,... (all)')
@docstring(version.__doc__)
def main(**kwargs):

View File

@@ -4,29 +4,46 @@ __package__ = 'archivebox.cli'
__command__ = 'archivebox worker'
import sys
import json
import rich_click as click
from archivebox.misc.util import docstring
def worker(worker_type: str, daemon: bool = False, plugin: str | None = None):
    """Run a queue-processing worker of the given type.

    Supported worker types:
      - crawl:         handles Crawl objects (parse seeds, create snapshots)
      - snapshot:      handles Snapshot objects (create archive results)
      - archiveresult: handles ArchiveResult objects (run plugins)

    Each worker polls the database for queued items, claims them atomically,
    and spawns subprocess tasks to process each one.
    """
    from workers.worker import get_worker_class

    worker_cls = get_worker_class(worker_type)

    # Assemble constructor arguments for the worker class.
    init_kwargs = {'daemon': daemon}
    if plugin and worker_type == 'archiveresult':
        # the model field predates the "plugin" terminology, hence 'extractor'
        init_kwargs['extractor'] = plugin

    worker_cls(**init_kwargs).runloop()
@click.command()
@click.argument('worker_type')
@click.option('--wait-for-first-event', is_flag=True)
@click.option('--exit-on-idle', is_flag=True)
def main(worker_type: str, wait_for_first_event: bool, exit_on_idle: bool):
"""Start an ArchiveBox worker process of the given type"""
from workers.worker import get_worker_type
# allow piping in events to process from stdin
# if not sys.stdin.isatty():
# for line in sys.stdin.readlines():
# Event.dispatch(event=json.loads(line), parent=None)
# run the actor
Worker = get_worker_type(worker_type)
for event in Worker.run(wait_for_first_event=wait_for_first_event, exit_on_idle=exit_on_idle):
print(event)
@click.argument('worker_type', type=click.Choice(['crawl', 'snapshot', 'archiveresult']))
@click.option('--daemon', '-d', is_flag=True, help="Run forever (don't exit on idle)")
@click.option('--plugin', '-p', default=None, help='Filter by plugin (archiveresult only)')
@docstring(worker.__doc__)
def main(worker_type: str, daemon: bool, plugin: str | None):
"""Start an ArchiveBox worker process"""
worker(worker_type, daemon=daemon, plugin=plugin)
if __name__ == '__main__':

View File

@@ -31,7 +31,6 @@ DATA_DIR = 'data.tests'
os.environ.update(TEST_CONFIG)
from ..main import init
from ..index import load_main_index
from archivebox.config.constants import (
SQL_INDEX_FILENAME,
JSON_INDEX_FILENAME,

View File

@@ -0,0 +1,966 @@
#!/usr/bin/env python3
"""
Tests for CLI piping workflow: crawl | snapshot | extract
This module tests the JSONL-based piping between CLI commands as described in:
https://github.com/ArchiveBox/ArchiveBox/issues/1363
Workflows tested:
archivebox snapshot URL | archivebox extract
archivebox crawl URL | archivebox snapshot | archivebox extract
archivebox crawl --plugin=PARSER URL | archivebox snapshot | archivebox extract
Each command should:
- Accept URLs, snapshot_ids, or JSONL as input (args or stdin)
- Output JSONL to stdout when piped (not TTY)
- Output human-readable to stderr when TTY
"""
__package__ = 'archivebox.cli'
import os
import sys
import json
import shutil
import tempfile
import unittest
from io import StringIO
from pathlib import Path
from unittest.mock import patch, MagicMock
# Test configuration - disable slow extractors
TEST_CONFIG = {
'USE_COLOR': 'False',
'SHOW_PROGRESS': 'False',
'SAVE_ARCHIVE_DOT_ORG': 'False',
'SAVE_TITLE': 'True', # Fast extractor
'SAVE_FAVICON': 'False',
'SAVE_WGET': 'False',
'SAVE_WARC': 'False',
'SAVE_PDF': 'False',
'SAVE_SCREENSHOT': 'False',
'SAVE_DOM': 'False',
'SAVE_SINGLEFILE': 'False',
'SAVE_READABILITY': 'False',
'SAVE_MERCURY': 'False',
'SAVE_GIT': 'False',
'SAVE_MEDIA': 'False',
'SAVE_HEADERS': 'False',
'USE_CURL': 'False',
'USE_WGET': 'False',
'USE_GIT': 'False',
'USE_CHROME': 'False',
'USE_YOUTUBEDL': 'False',
'USE_NODE': 'False',
}
os.environ.update(TEST_CONFIG)
# =============================================================================
# JSONL Utility Tests
# =============================================================================
class TestJSONLParsing(unittest.TestCase):
    """Unit tests for the JSONL input-line parsing helpers."""

    def test_parse_plain_url(self):
        """A bare URL line is turned into a Snapshot record."""
        from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT
        record = parse_line('https://example.com')
        self.assertIsNotNone(record)
        self.assertEqual(record['type'], TYPE_SNAPSHOT)
        self.assertEqual(record['url'], 'https://example.com')

    def test_parse_jsonl_snapshot(self):
        """A JSONL Snapshot line keeps every field it carries."""
        from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT
        record = parse_line('{"type": "Snapshot", "url": "https://example.com", "tags": "test,demo"}')
        self.assertIsNotNone(record)
        self.assertEqual(record['type'], TYPE_SNAPSHOT)
        self.assertEqual(record['url'], 'https://example.com')
        self.assertEqual(record['tags'], 'test,demo')

    def test_parse_jsonl_with_id(self):
        """An 'id' field in a JSONL line is carried through."""
        from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT
        record = parse_line('{"type": "Snapshot", "id": "abc123", "url": "https://example.com"}')
        self.assertIsNotNone(record)
        self.assertEqual(record['id'], 'abc123')
        self.assertEqual(record['url'], 'https://example.com')

    def test_parse_uuid_as_snapshot_id(self):
        """A bare UUID line is treated as an existing snapshot id."""
        from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT
        snapshot_uuid = '01234567-89ab-cdef-0123-456789abcdef'
        record = parse_line(snapshot_uuid)
        self.assertIsNotNone(record)
        self.assertEqual(record['type'], TYPE_SNAPSHOT)
        self.assertEqual(record['id'], snapshot_uuid)

    def test_parse_empty_line(self):
        """Blank or whitespace-only lines parse to None."""
        from archivebox.misc.jsonl import parse_line
        for blank in ('', ' ', '\n'):
            self.assertIsNone(parse_line(blank))

    def test_parse_comment_line(self):
        """Lines starting with '#' (optionally indented) parse to None."""
        from archivebox.misc.jsonl import parse_line
        for comment in ('# This is a comment', ' # Indented comment'):
            self.assertIsNone(parse_line(comment))

    def test_parse_invalid_url(self):
        """Non-URLs and unsupported schemes parse to None."""
        from archivebox.misc.jsonl import parse_line
        self.assertIsNone(parse_line('not-a-url'))
        # only http/https/file schemes are accepted
        self.assertIsNone(parse_line('ftp://example.com'))

    def test_parse_file_url(self):
        """file:// URLs are accepted as Snapshot records."""
        from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT
        record = parse_line('file:///path/to/file.txt')
        self.assertIsNotNone(record)
        self.assertEqual(record['type'], TYPE_SNAPSHOT)
        self.assertEqual(record['url'], 'file:///path/to/file.txt')
class TestJSONLOutput(unittest.TestCase):
    """Test JSONL output formatting.

    Uses MagicMock stand-ins for the Django models so the serializers can be
    exercised without a database.  The attribute assignments below presumably
    mirror the fields each *_to_jsonl() helper reads — confirm against
    archivebox/misc/jsonl.py if the serializers change.
    """
    def test_snapshot_to_jsonl(self):
        """Snapshot model should serialize to JSONL correctly."""
        from archivebox.misc.jsonl import snapshot_to_jsonl, TYPE_SNAPSHOT
        # Create a mock snapshot
        mock_snapshot = MagicMock()
        mock_snapshot.id = 'test-uuid-1234'
        mock_snapshot.url = 'https://example.com'
        mock_snapshot.title = 'Example Title'
        # tags_str is a *method* on the model, hence .return_value
        mock_snapshot.tags_str.return_value = 'tag1,tag2'
        mock_snapshot.bookmarked_at = None
        mock_snapshot.created_at = None
        mock_snapshot.timestamp = '1234567890'
        mock_snapshot.depth = 0
        mock_snapshot.status = 'queued'
        result = snapshot_to_jsonl(mock_snapshot)
        # The serialized record must carry the type tag plus identifying fields.
        self.assertEqual(result['type'], TYPE_SNAPSHOT)
        self.assertEqual(result['id'], 'test-uuid-1234')
        self.assertEqual(result['url'], 'https://example.com')
        self.assertEqual(result['title'], 'Example Title')

    def test_archiveresult_to_jsonl(self):
        """ArchiveResult model should serialize to JSONL correctly."""
        from archivebox.misc.jsonl import archiveresult_to_jsonl, TYPE_ARCHIVERESULT
        mock_result = MagicMock()
        mock_result.id = 'result-uuid-5678'
        # snapshot_id links the result back to its parent Snapshot
        mock_result.snapshot_id = 'snapshot-uuid-1234'
        mock_result.extractor = 'title'
        mock_result.status = 'succeeded'
        mock_result.output = 'Example Title'
        mock_result.start_ts = None
        mock_result.end_ts = None
        result = archiveresult_to_jsonl(mock_result)
        self.assertEqual(result['type'], TYPE_ARCHIVERESULT)
        self.assertEqual(result['id'], 'result-uuid-5678')
        self.assertEqual(result['snapshot_id'], 'snapshot-uuid-1234')
        self.assertEqual(result['extractor'], 'title')
        self.assertEqual(result['status'], 'succeeded')
class TestReadArgsOrStdin(unittest.TestCase):
    """Tests for reading input records from CLI args or a (piped) stdin."""

    @staticmethod
    def _piped(text):
        """Return a StringIO that reports itself as non-TTY (i.e. piped input)."""
        stream = StringIO(text)
        stream.isatty = lambda: False
        return stream

    def test_read_from_args(self):
        """URLs passed as positional args become one record each."""
        from archivebox.misc.jsonl import read_args_or_stdin
        records = list(read_args_or_stdin(('https://example1.com', 'https://example2.com')))
        self.assertEqual(len(records), 2)
        self.assertEqual(records[0]['url'], 'https://example1.com')
        self.assertEqual(records[1]['url'], 'https://example2.com')

    def test_read_from_stdin(self):
        """With no args, URLs are read line-by-line from the piped stream."""
        from archivebox.misc.jsonl import read_args_or_stdin
        stream = self._piped('https://example1.com\nhttps://example2.com\n')
        records = list(read_args_or_stdin((), stream=stream))
        self.assertEqual(len(records), 2)
        self.assertEqual(records[0]['url'], 'https://example1.com')
        self.assertEqual(records[1]['url'], 'https://example2.com')

    def test_read_jsonl_from_stdin(self):
        """JSONL lines on stdin are parsed into full records."""
        from archivebox.misc.jsonl import read_args_or_stdin
        stream = self._piped('{"type": "Snapshot", "url": "https://example.com", "tags": "test"}\n')
        records = list(read_args_or_stdin((), stream=stream))
        self.assertEqual(len(records), 1)
        self.assertEqual(records[0]['url'], 'https://example.com')
        self.assertEqual(records[0]['tags'], 'test')

    def test_skip_tty_stdin(self):
        """An interactive (TTY) stdin is ignored so the command never blocks."""
        from archivebox.misc.jsonl import read_args_or_stdin
        stream = StringIO('https://example.com')
        stream.isatty = lambda: True  # simulate an interactive terminal
        records = list(read_args_or_stdin((), stream=stream))
        self.assertEqual(len(records), 0)
# =============================================================================
# Unit Tests for Individual Commands
# =============================================================================
class TestCrawlCommand(unittest.TestCase):
"""Unit tests for archivebox crawl command."""
def setUp(self):
"""Set up test environment."""
self.test_dir = tempfile.mkdtemp()
os.environ['DATA_DIR'] = self.test_dir
def tearDown(self):
"""Clean up test environment."""
shutil.rmtree(self.test_dir, ignore_errors=True)
def test_crawl_accepts_url(self):
"""crawl should accept URLs as input."""
from archivebox.misc.jsonl import read_args_or_stdin
args = ('https://example.com',)
records = list(read_args_or_stdin(args))
self.assertEqual(len(records), 1)
self.assertEqual(records[0]['url'], 'https://example.com')
def test_crawl_accepts_snapshot_id(self):
"""crawl should accept snapshot IDs as input."""
from archivebox.misc.jsonl import read_args_or_stdin
uuid = '01234567-89ab-cdef-0123-456789abcdef'
args = (uuid,)
records = list(read_args_or_stdin(args))
self.assertEqual(len(records), 1)
self.assertEqual(records[0]['id'], uuid)
def test_crawl_accepts_jsonl(self):
"""crawl should accept JSONL with snapshot info."""
from archivebox.misc.jsonl import read_args_or_stdin
stdin = StringIO('{"type": "Snapshot", "id": "abc123", "url": "https://example.com"}\n')
stdin.isatty = lambda: False
records = list(read_args_or_stdin((), stream=stdin))
self.assertEqual(len(records), 1)
self.assertEqual(records[0]['id'], 'abc123')
self.assertEqual(records[0]['url'], 'https://example.com')
def test_crawl_separates_existing_vs_new(self):
"""crawl should identify existing snapshots vs new URLs."""
# This tests the logic in discover_outlinks() that separates
# records with 'id' (existing) from records with just 'url' (new)
records = [
{'type': 'Snapshot', 'id': 'existing-id-1'}, # Existing (id only)
{'type': 'Snapshot', 'url': 'https://new-url.com'}, # New (url only)
{'type': 'Snapshot', 'id': 'existing-id-2', 'url': 'https://existing.com'}, # Existing (has id)
]
existing = []
new = []
for record in records:
if record.get('id') and not record.get('url'):
existing.append(record['id'])
elif record.get('id'):
existing.append(record['id']) # Has both id and url - treat as existing
elif record.get('url'):
new.append(record)
self.assertEqual(len(existing), 2)
self.assertEqual(len(new), 1)
self.assertEqual(new[0]['url'], 'https://new-url.com')
class TestSnapshotCommand(unittest.TestCase):
"""Unit tests for archivebox snapshot command."""
def setUp(self):
"""Set up test environment."""
self.test_dir = tempfile.mkdtemp()
os.environ['DATA_DIR'] = self.test_dir
def tearDown(self):
"""Clean up test environment."""
shutil.rmtree(self.test_dir, ignore_errors=True)
def test_snapshot_accepts_url(self):
"""snapshot should accept URLs as input."""
from archivebox.misc.jsonl import read_args_or_stdin
args = ('https://example.com',)
records = list(read_args_or_stdin(args))
self.assertEqual(len(records), 1)
self.assertEqual(records[0]['url'], 'https://example.com')
def test_snapshot_accepts_jsonl_with_metadata(self):
"""snapshot should accept JSONL with tags and other metadata."""
from archivebox.misc.jsonl import read_args_or_stdin
stdin = StringIO('{"type": "Snapshot", "url": "https://example.com", "tags": "tag1,tag2", "title": "Test"}\n')
stdin.isatty = lambda: False
records = list(read_args_or_stdin((), stream=stdin))
self.assertEqual(len(records), 1)
self.assertEqual(records[0]['url'], 'https://example.com')
self.assertEqual(records[0]['tags'], 'tag1,tag2')
self.assertEqual(records[0]['title'], 'Test')
def test_snapshot_output_format(self):
"""snapshot output should include id and url."""
from archivebox.misc.jsonl import snapshot_to_jsonl
mock_snapshot = MagicMock()
mock_snapshot.id = 'test-id'
mock_snapshot.url = 'https://example.com'
mock_snapshot.title = 'Test'
mock_snapshot.tags_str.return_value = ''
mock_snapshot.bookmarked_at = None
mock_snapshot.created_at = None
mock_snapshot.timestamp = '123'
mock_snapshot.depth = 0
mock_snapshot.status = 'queued'
output = snapshot_to_jsonl(mock_snapshot)
self.assertIn('id', output)
self.assertIn('url', output)
self.assertEqual(output['type'], 'Snapshot')
class TestExtractCommand(unittest.TestCase):
"""Unit tests for archivebox extract command."""
def setUp(self):
"""Set up test environment."""
self.test_dir = tempfile.mkdtemp()
os.environ['DATA_DIR'] = self.test_dir
def tearDown(self):
"""Clean up test environment."""
shutil.rmtree(self.test_dir, ignore_errors=True)
def test_extract_accepts_snapshot_id(self):
"""extract should accept snapshot IDs as input."""
from archivebox.misc.jsonl import read_args_or_stdin
uuid = '01234567-89ab-cdef-0123-456789abcdef'
args = (uuid,)
records = list(read_args_or_stdin(args))
self.assertEqual(len(records), 1)
self.assertEqual(records[0]['id'], uuid)
def test_extract_accepts_jsonl_snapshot(self):
"""extract should accept JSONL Snapshot records."""
from archivebox.misc.jsonl import read_args_or_stdin, TYPE_SNAPSHOT
stdin = StringIO('{"type": "Snapshot", "id": "abc123", "url": "https://example.com"}\n')
stdin.isatty = lambda: False
records = list(read_args_or_stdin((), stream=stdin))
self.assertEqual(len(records), 1)
self.assertEqual(records[0]['type'], TYPE_SNAPSHOT)
self.assertEqual(records[0]['id'], 'abc123')
def test_extract_gathers_snapshot_ids(self):
"""extract should gather snapshot IDs from various input formats."""
from archivebox.misc.jsonl import TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
records = [
{'type': TYPE_SNAPSHOT, 'id': 'snap-1'},
{'type': TYPE_SNAPSHOT, 'id': 'snap-2', 'url': 'https://example.com'},
{'type': TYPE_ARCHIVERESULT, 'snapshot_id': 'snap-3'},
{'id': 'snap-4'}, # Bare id
]
snapshot_ids = set()
for record in records:
record_type = record.get('type')
if record_type == TYPE_SNAPSHOT:
snapshot_id = record.get('id')
if snapshot_id:
snapshot_ids.add(snapshot_id)
elif record_type == TYPE_ARCHIVERESULT:
snapshot_id = record.get('snapshot_id')
if snapshot_id:
snapshot_ids.add(snapshot_id)
elif 'id' in record:
snapshot_ids.add(record['id'])
self.assertEqual(len(snapshot_ids), 4)
self.assertIn('snap-1', snapshot_ids)
self.assertIn('snap-2', snapshot_ids)
self.assertIn('snap-3', snapshot_ids)
self.assertIn('snap-4', snapshot_ids)
# =============================================================================
# URL Collection Tests
# =============================================================================
class TestURLCollection(unittest.TestCase):
    """Tests for gathering urls.jsonl entries produced by extractors."""

    def setUp(self):
        """Build a fake snapshot output dir: two parser outputs plus one non-parser dir."""
        self.test_dir = Path(tempfile.mkdtemp())

        wget_dir = self.test_dir / 'wget'
        wget_dir.mkdir()
        (wget_dir / 'urls.jsonl').write_text(
            '{"url": "https://wget-link-1.com"}\n'
            '{"url": "https://wget-link-2.com"}\n'
        )

        html_dir = self.test_dir / 'parse_html_urls'
        html_dir.mkdir()
        (html_dir / 'urls.jsonl').write_text(
            '{"url": "https://html-link-1.com"}\n'
            '{"url": "https://html-link-2.com", "title": "HTML Link 2"}\n'
        )

        # screenshot is not a parser: present, but emits no urls.jsonl
        (self.test_dir / 'screenshot').mkdir()

    def tearDown(self):
        """Remove the fixture tree."""
        shutil.rmtree(self.test_dir, ignore_errors=True)

    def test_collect_urls_from_extractors(self):
        """Every urls.jsonl under an extractor subdir should be picked up."""
        from archivebox.hooks import collect_urls_from_extractors
        entries = collect_urls_from_extractors(self.test_dir)
        self.assertEqual(len(entries), 4)
        # each entry is tagged with the extractor dir it came from
        sources = {entry['via_extractor'] for entry in entries}
        self.assertIn('wget', sources)
        self.assertIn('parse_html_urls', sources)
        self.assertNotIn('screenshot', sources)  # had no urls.jsonl

    def test_collect_urls_preserves_metadata(self):
        """Extra fields in urls.jsonl entries should survive collection."""
        from archivebox.hooks import collect_urls_from_extractors
        entries = collect_urls_from_extractors(self.test_dir)
        with_title = [entry for entry in entries if entry.get('title') == 'HTML Link 2']
        self.assertEqual(len(with_title), 1)
        self.assertEqual(with_title[0]['url'], 'https://html-link-2.com')

    def test_collect_urls_empty_dir(self):
        """A missing directory should yield no entries."""
        from archivebox.hooks import collect_urls_from_extractors
        missing = self.test_dir / 'nonexistent'
        self.assertEqual(len(collect_urls_from_extractors(missing)), 0)
# =============================================================================
# Integration Tests
# =============================================================================
class TestPipingWorkflowIntegration(unittest.TestCase):
"""
Integration tests for the complete piping workflow.
These tests require Django to be set up and use the actual database.
"""
@classmethod
def setUpClass(cls):
"""Set up Django and test database."""
cls.test_dir = tempfile.mkdtemp()
os.environ['DATA_DIR'] = cls.test_dir
# Initialize Django
from archivebox.config.django import setup_django
setup_django()
# Initialize the archive
from archivebox.cli.archivebox_init import init
init()
@classmethod
def tearDownClass(cls):
"""Clean up test database."""
shutil.rmtree(cls.test_dir, ignore_errors=True)
def test_snapshot_creates_and_outputs_jsonl(self):
"""
Test: archivebox snapshot URL
Should create a Snapshot and output JSONL when piped.
"""
from core.models import Snapshot
from archivebox.misc.jsonl import (
read_args_or_stdin, write_record, snapshot_to_jsonl,
TYPE_SNAPSHOT, get_or_create_snapshot
)
from archivebox.base_models.models import get_or_create_system_user_pk
created_by_id = get_or_create_system_user_pk()
# Simulate input
url = 'https://test-snapshot-1.example.com'
records = list(read_args_or_stdin((url,)))
self.assertEqual(len(records), 1)
self.assertEqual(records[0]['url'], url)
# Create snapshot
snapshot = get_or_create_snapshot(records[0], created_by_id=created_by_id)
self.assertIsNotNone(snapshot.id)
self.assertEqual(snapshot.url, url)
# Verify output format
output = snapshot_to_jsonl(snapshot)
self.assertEqual(output['type'], TYPE_SNAPSHOT)
self.assertIn('id', output)
self.assertEqual(output['url'], url)
def test_extract_accepts_snapshot_from_previous_command(self):
"""
Test: archivebox snapshot URL | archivebox extract
Extract should accept JSONL output from snapshot command.
"""
from core.models import Snapshot, ArchiveResult
from archivebox.misc.jsonl import (
snapshot_to_jsonl, read_args_or_stdin, get_or_create_snapshot,
TYPE_SNAPSHOT
)
from archivebox.base_models.models import get_or_create_system_user_pk
created_by_id = get_or_create_system_user_pk()
# Step 1: Create snapshot (simulating 'archivebox snapshot')
url = 'https://test-extract-1.example.com'
snapshot = get_or_create_snapshot({'url': url}, created_by_id=created_by_id)
snapshot_output = snapshot_to_jsonl(snapshot)
# Step 2: Parse snapshot output as extract input
stdin = StringIO(json.dumps(snapshot_output) + '\n')
stdin.isatty = lambda: False
records = list(read_args_or_stdin((), stream=stdin))
self.assertEqual(len(records), 1)
self.assertEqual(records[0]['type'], TYPE_SNAPSHOT)
self.assertEqual(records[0]['id'], str(snapshot.id))
# Step 3: Gather snapshot IDs (as extract does)
snapshot_ids = set()
for record in records:
if record.get('type') == TYPE_SNAPSHOT and record.get('id'):
snapshot_ids.add(record['id'])
self.assertIn(str(snapshot.id), snapshot_ids)
def test_crawl_outputs_discovered_urls(self):
    """
    Test: archivebox crawl URL
    Should create snapshot, run plugins, output discovered URLs.

    Fakes one extractor's urls.jsonl output under a snapshot dir, then
    collects and annotates the discovered URLs the way 'archivebox crawl'
    does before printing them.
    """
    from archivebox.hooks import collect_urls_from_extractors
    from archivebox.misc.jsonl import TYPE_SNAPSHOT

    # Build a fake snapshot directory holding a single extractor's output
    fake_snapshot_dir = Path(self.test_dir) / 'archive' / 'test-crawl-snapshot'
    fake_snapshot_dir.mkdir(parents=True, exist_ok=True)
    extractor_dir = fake_snapshot_dir / 'parse_html_urls'
    extractor_dir.mkdir()
    (extractor_dir / 'urls.jsonl').write_text(
        '{"url": "https://discovered-1.com"}\n'
        '{"url": "https://discovered-2.com", "title": "Discovered 2"}\n'
    )

    # Harvest the discovered URLs exactly as the crawl command would
    found = collect_urls_from_extractors(fake_snapshot_dir)
    self.assertEqual(len(found), 2)

    # Attach the crawl metadata that the crawl command adds per record
    for record in found:
        record['type'] = TYPE_SNAPSHOT
        record['depth'] = 1
        record['via_snapshot'] = 'test-crawl-snapshot'

    # First record should be a depth-1 Snapshot pointing at the first outlink
    self.assertEqual(found[0]['type'], TYPE_SNAPSHOT)
    self.assertEqual(found[0]['depth'], 1)
    self.assertEqual(found[0]['url'], 'https://discovered-1.com')
def test_full_pipeline_snapshot_extract(self):
    """
    Test: archivebox snapshot URL | archivebox extract
    This is equivalent to: archivebox add URL

    Pipes the JSONL emitted by 'snapshot' into 'extract' and verifies the
    snapshot ID is delivered intact and the Snapshot row exists in the DB.
    """
    from core.models import Snapshot
    # NOTE: the original also imported TYPE_SNAPSHOT here but never used it — removed.
    from archivebox.misc.jsonl import (
        get_or_create_snapshot, snapshot_to_jsonl, read_args_or_stdin,
    )
    from archivebox.base_models.models import get_or_create_system_user_pk
    created_by_id = get_or_create_system_user_pk()

    # === archivebox snapshot https://example.com ===
    url = 'https://test-pipeline-1.example.com'
    snapshot = get_or_create_snapshot({'url': url}, created_by_id=created_by_id)
    snapshot_jsonl = json.dumps(snapshot_to_jsonl(snapshot))

    # === | archivebox extract ===
    stdin = StringIO(snapshot_jsonl + '\n')
    stdin.isatty = lambda: False  # simulate piped (non-TTY) stdin
    records = list(read_args_or_stdin((), stream=stdin))

    # Extract should receive the snapshot ID
    self.assertEqual(len(records), 1)
    self.assertEqual(records[0]['id'], str(snapshot.id))

    # Verify snapshot exists in DB
    db_snapshot = Snapshot.objects.get(id=snapshot.id)
    self.assertEqual(db_snapshot.url, url)
def test_full_pipeline_crawl_snapshot_extract(self):
    """
    Test: archivebox crawl URL | archivebox snapshot | archivebox extract
    This is equivalent to: archivebox add --depth=1 URL

    Simulates the three-stage pipeline end to end: crawl discovers two
    outlinks, snapshot persists them, extract receives their IDs.
    """
    from core.models import Snapshot
    from archivebox.misc.jsonl import (
        get_or_create_snapshot, snapshot_to_jsonl, read_args_or_stdin,
        TYPE_SNAPSHOT
    )
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.hooks import collect_urls_from_extractors

    system_user_pk = get_or_create_system_user_pk()

    # === archivebox crawl https://example.com ===
    # Create the snapshot for the root URL being crawled
    root_url = 'https://test-crawl-pipeline.example.com'
    root_snapshot = get_or_create_snapshot({'url': root_url}, created_by_id=system_user_pk)

    # Fake extractor output listing two outlinks discovered on the root page
    root_dir = Path(self.test_dir) / 'archive' / str(root_snapshot.timestamp)
    root_dir.mkdir(parents=True, exist_ok=True)
    (root_dir / 'parse_html_urls').mkdir(exist_ok=True)
    (root_dir / 'parse_html_urls' / 'urls.jsonl').write_text(
        '{"url": "https://outlink-1.example.com"}\n'
        '{"url": "https://outlink-2.example.com"}\n'
    )

    # Crawl's stdout: one depth-1 Snapshot JSONL record per discovered URL
    outlinks = collect_urls_from_extractors(root_dir)
    crawl_lines = [
        json.dumps({**outlink, 'type': TYPE_SNAPSHOT, 'depth': 1})
        for outlink in outlinks
    ]

    # === | archivebox snapshot ===
    pipe = StringIO('\n'.join(crawl_lines) + '\n')
    pipe.isatty = lambda: False
    snapshot_records = list(read_args_or_stdin((), stream=pipe))
    self.assertEqual(len(snapshot_records), 2)

    # Persist a Snapshot row for each discovered URL
    persisted = [
        get_or_create_snapshot(record, created_by_id=system_user_pk)
        for record in snapshot_records
    ]
    self.assertEqual(len(persisted), 2)

    # === | archivebox extract ===
    extract_input = '\n'.join(json.dumps(snapshot_to_jsonl(s)) for s in persisted)
    pipe = StringIO(extract_input + '\n')
    pipe.isatty = lambda: False
    extract_records = list(read_args_or_stdin((), stream=pipe))
    self.assertEqual(len(extract_records), 2)

    # Every piped record must correspond to a persisted outlink snapshot
    for record in extract_records:
        db_snapshot = Snapshot.objects.get(id=record['id'])
        self.assertIn(db_snapshot.url, [
            'https://outlink-1.example.com',
            'https://outlink-2.example.com'
        ])
class TestDepthWorkflows(unittest.TestCase):
    """Test various depth crawl workflows."""

    @classmethod
    def setUpClass(cls):
        """Set up Django and test database."""
        cls.test_dir = tempfile.mkdtemp()
        os.environ['DATA_DIR'] = cls.test_dir
        from archivebox.config.django import setup_django
        setup_django()
        from archivebox.cli.archivebox_init import init
        init()

    @classmethod
    def tearDownClass(cls):
        """Clean up test database."""
        shutil.rmtree(cls.test_dir, ignore_errors=True)

    def test_depth_0_workflow(self):
        """
        Test: archivebox snapshot URL | archivebox extract
        Depth 0: Only archive the specified URL, no crawling.
        """
        from core.models import Snapshot
        from archivebox.misc.jsonl import get_or_create_snapshot
        from archivebox.base_models.models import get_or_create_system_user_pk

        system_user_pk = get_or_create_system_user_pk()

        # Archive a single URL with no crawling
        target_url = 'https://depth0-test.example.com'
        created = get_or_create_snapshot({'url': target_url}, created_by_id=system_user_pk)

        # Exactly one snapshot should exist for that URL, and nothing else
        self.assertEqual(Snapshot.objects.filter(url=target_url).count(), 1)
        self.assertEqual(created.url, target_url)

    def test_depth_1_workflow(self):
        """
        Test: archivebox crawl URL | archivebox snapshot | archivebox extract
        Depth 1: Archive URL + all outlinks from that URL.
        """
        # This is tested in test_full_pipeline_crawl_snapshot_extract
        pass

    def test_depth_metadata_propagation(self):
        """Test that depth metadata propagates through the pipeline."""
        from archivebox.misc.jsonl import TYPE_SNAPSHOT

        # Simulated crawl output, two hops deep, carrying depth metadata
        simulated_output = [
            {'type': TYPE_SNAPSHOT, 'url': 'https://hop1.com', 'depth': 1, 'via_snapshot': 'root'},
            {'type': TYPE_SNAPSHOT, 'url': 'https://hop2.com', 'depth': 2, 'via_snapshot': 'hop1'},
        ]

        # Every record must retain both depth and provenance fields
        for record in simulated_output:
            self.assertIn('depth', record)
            self.assertIn('via_snapshot', record)
class TestParserPluginWorkflows(unittest.TestCase):
    """Test workflows with specific parser plugins."""

    @classmethod
    def setUpClass(cls):
        """Set up Django and test database."""
        cls.test_dir = tempfile.mkdtemp()
        os.environ['DATA_DIR'] = cls.test_dir
        from archivebox.config.django import setup_django
        setup_django()
        from archivebox.cli.archivebox_init import init
        init()

    @classmethod
    def tearDownClass(cls):
        """Clean up test database."""
        shutil.rmtree(cls.test_dir, ignore_errors=True)

    def test_html_parser_workflow(self):
        """
        Test: archivebox crawl --plugin=parse_html_urls URL | archivebox snapshot | archivebox extract
        """
        from archivebox.hooks import collect_urls_from_extractors
        from archivebox.misc.jsonl import TYPE_SNAPSHOT

        # Fake the HTML parser's output under a snapshot directory
        fake_dir = Path(self.test_dir) / 'archive' / 'html-parser-test'
        fake_dir.mkdir(parents=True, exist_ok=True)
        (fake_dir / 'parse_html_urls').mkdir(exist_ok=True)
        (fake_dir / 'parse_html_urls' / 'urls.jsonl').write_text(
            '{"url": "https://html-discovered.com", "title": "HTML Link"}\n'
        )

        # One URL should be collected, attributed to the HTML parser
        found = collect_urls_from_extractors(fake_dir)
        self.assertEqual(len(found), 1)
        self.assertEqual(found[0]['url'], 'https://html-discovered.com')
        self.assertEqual(found[0]['via_extractor'], 'parse_html_urls')

    def test_rss_parser_workflow(self):
        """
        Test: archivebox crawl --plugin=parse_rss_urls URL | archivebox snapshot | archivebox extract
        """
        from archivebox.hooks import collect_urls_from_extractors

        # Fake the RSS parser's output with two feed items
        fake_dir = Path(self.test_dir) / 'archive' / 'rss-parser-test'
        fake_dir.mkdir(parents=True, exist_ok=True)
        (fake_dir / 'parse_rss_urls').mkdir(exist_ok=True)
        (fake_dir / 'parse_rss_urls' / 'urls.jsonl').write_text(
            '{"url": "https://rss-item-1.com", "title": "RSS Item 1"}\n'
            '{"url": "https://rss-item-2.com", "title": "RSS Item 2"}\n'
        )

        # Both feed items collected, each attributed to the RSS parser
        found = collect_urls_from_extractors(fake_dir)
        self.assertEqual(len(found), 2)
        self.assertTrue(all(record['via_extractor'] == 'parse_rss_urls' for record in found))

    def test_multiple_parsers_dedupe(self):
        """
        Multiple parsers may discover the same URL - should be deduplicated.
        """
        from archivebox.hooks import collect_urls_from_extractors

        # Two different extractors report the same URL
        fake_dir = Path(self.test_dir) / 'archive' / 'dedupe-test'
        fake_dir.mkdir(parents=True, exist_ok=True)
        (fake_dir / 'parse_html_urls').mkdir(exist_ok=True)
        (fake_dir / 'parse_html_urls' / 'urls.jsonl').write_text(
            '{"url": "https://same-url.com"}\n'
        )
        (fake_dir / 'wget').mkdir(exist_ok=True)
        (fake_dir / 'wget' / 'urls.jsonl').write_text(
            '{"url": "https://same-url.com"}\n'  # Same URL, different extractor
        )

        # Both entries are returned (deduplication happens at the crawl command level)
        everything = collect_urls_from_extractors(fake_dir)
        self.assertEqual(len(everything), 2)

        # Verify both extractors found the same URL
        distinct_urls = {record['url'] for record in everything}
        self.assertEqual(distinct_urls, {'https://same-url.com'})
class TestEdgeCases(unittest.TestCase):
    """Test edge cases and error handling."""

    def test_empty_input(self):
        """Commands should handle empty input gracefully."""
        from archivebox.misc.jsonl import read_args_or_stdin

        # No args and an (empty) TTY stdin — reader must yield nothing, not block
        fake_tty = StringIO('')
        fake_tty.isatty = lambda: True
        parsed = list(read_args_or_stdin((), stream=fake_tty))
        self.assertEqual(len(parsed), 0)

    def test_malformed_jsonl(self):
        """Should skip malformed JSONL lines."""
        from archivebox.misc.jsonl import read_args_or_stdin

        # One garbage line sandwiched between two valid records
        piped = StringIO(
            '{"url": "https://good.com"}\n'
            'not valid json\n'
            '{"url": "https://also-good.com"}\n'
        )
        piped.isatty = lambda: False
        parsed = list(read_args_or_stdin((), stream=piped))

        # Only the two valid records survive; the bad line is dropped
        self.assertEqual(len(parsed), 2)
        parsed_urls = {record['url'] for record in parsed}
        self.assertEqual(parsed_urls, {'https://good.com', 'https://also-good.com'})

    def test_mixed_input_formats(self):
        """Should handle mixed URLs and JSONL."""
        from archivebox.misc.jsonl import read_args_or_stdin

        # A plain URL, a full JSONL record, and a bare UUID, all on one stream
        piped = StringIO(
            'https://plain-url.com\n'
            '{"type": "Snapshot", "url": "https://jsonl-url.com", "tags": "test"}\n'
            '01234567-89ab-cdef-0123-456789abcdef\n'  # UUID
        )
        piped.isatty = lambda: False
        parsed = list(read_args_or_stdin((), stream=piped))
        self.assertEqual(len(parsed), 3)

        # Plain URL
        self.assertEqual(parsed[0]['url'], 'https://plain-url.com')
        # JSONL with metadata
        self.assertEqual(parsed[1]['url'], 'https://jsonl-url.com')
        self.assertEqual(parsed[1]['tags'], 'test')
        # UUID
        self.assertEqual(parsed[2]['id'], '01234567-89ab-cdef-0123-456789abcdef')
# Allow running this test module directly (outside a test runner invocation).
if __name__ == '__main__':
    unittest.main()