mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-04 14:57:56 +10:00
wip major changes
This commit is contained in:
@@ -37,7 +37,13 @@ class ArchiveBoxGroup(click.Group):
|
||||
'server': 'archivebox.cli.archivebox_server.main',
|
||||
'shell': 'archivebox.cli.archivebox_shell.main',
|
||||
'manage': 'archivebox.cli.archivebox_manage.main',
|
||||
# Worker/orchestrator commands
|
||||
'orchestrator': 'archivebox.cli.archivebox_orchestrator.main',
|
||||
'worker': 'archivebox.cli.archivebox_worker.main',
|
||||
# Task commands (called by workers as subprocesses)
|
||||
'crawl': 'archivebox.cli.archivebox_crawl.main',
|
||||
'snapshot': 'archivebox.cli.archivebox_snapshot.main',
|
||||
'extract': 'archivebox.cli.archivebox_extract.main',
|
||||
}
|
||||
all_subcommands = {
|
||||
**meta_commands,
|
||||
@@ -118,11 +124,14 @@ def cli(ctx, help=False):
|
||||
raise
|
||||
|
||||
|
||||
def main(args=None, prog_name=None):
|
||||
def main(args=None, prog_name=None, stdin=None):
|
||||
# show `docker run archivebox xyz` in help messages if running in docker
|
||||
IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes')
|
||||
IS_TTY = sys.stdin.isatty()
|
||||
prog_name = prog_name or (f'docker compose run{"" if IS_TTY else " -T"} archivebox' if IN_DOCKER else 'archivebox')
|
||||
|
||||
# stdin param allows passing input data from caller (used by __main__.py)
|
||||
# currently not used by click-based CLI, but kept for backwards compatibility
|
||||
|
||||
try:
|
||||
cli(args=args, prog_name=prog_name)
|
||||
|
||||
@@ -16,214 +16,135 @@ from archivebox.misc.util import enforce_types, docstring
|
||||
from archivebox import CONSTANTS
|
||||
from archivebox.config.common import ARCHIVING_CONFIG
|
||||
from archivebox.config.permissions import USER, HOSTNAME
|
||||
from archivebox.parsers import PARSERS
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from core.models import Snapshot
|
||||
|
||||
|
||||
ORCHESTRATOR = None
|
||||
|
||||
@enforce_types
|
||||
def add(urls: str | list[str],
|
||||
depth: int | str=0,
|
||||
tag: str='',
|
||||
parser: str="auto",
|
||||
extract: str="",
|
||||
plugins: str="",
|
||||
persona: str='Default',
|
||||
overwrite: bool=False,
|
||||
update: bool=not ARCHIVING_CONFIG.ONLY_NEW,
|
||||
index_only: bool=False,
|
||||
bg: bool=False,
|
||||
created_by_id: int | None=None) -> QuerySet['Snapshot']:
|
||||
"""Add a new URL or list of URLs to your archive"""
|
||||
"""Add a new URL or list of URLs to your archive.
|
||||
|
||||
global ORCHESTRATOR
|
||||
The new flow is:
|
||||
1. Save URLs to sources file
|
||||
2. Create Seed pointing to the file
|
||||
3. Create Crawl with max_depth
|
||||
4. Create root Snapshot pointing to file:// URL (depth=0)
|
||||
5. Orchestrator runs parser extractors on root snapshot
|
||||
6. Parser extractors output to urls.jsonl
|
||||
7. URLs are added to Crawl.urls and child Snapshots are created
|
||||
8. Repeat until max_depth is reached
|
||||
"""
|
||||
|
||||
from rich import print
|
||||
|
||||
depth = int(depth)
|
||||
|
||||
assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'
|
||||
|
||||
# import models once django is set up
|
||||
from crawls.models import Seed, Crawl
|
||||
from workers.orchestrator import Orchestrator
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
assert depth in (0, 1, 2, 3, 4), 'Depth must be 0-4'
|
||||
|
||||
# import models once django is set up
|
||||
from core.models import Snapshot
|
||||
from crawls.models import Seed, Crawl
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
from workers.orchestrator import Orchestrator
|
||||
|
||||
created_by_id = created_by_id or get_or_create_system_user_pk()
|
||||
|
||||
# 1. save the provided urls to sources/2024-11-05__23-59-59__cli_add.txt
|
||||
|
||||
# 1. Save the provided URLs to sources/2024-11-05__23-59-59__cli_add.txt
|
||||
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__cli_add.txt'
|
||||
sources_file.parent.mkdir(parents=True, exist_ok=True)
|
||||
sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
|
||||
|
||||
# 2. create a new Seed pointing to the sources/2024-11-05__23-59-59__cli_add.txt
|
||||
|
||||
# 2. Create a new Seed pointing to the sources file
|
||||
cli_args = [*sys.argv]
|
||||
if cli_args[0].lower().endswith('archivebox'):
|
||||
cli_args[0] = 'archivebox' # full path to archivebox bin to just archivebox e.g. /Volumes/NVME/Users/squash/archivebox/.venv/bin/archivebox -> archivebox
|
||||
cli_args[0] = 'archivebox'
|
||||
cmd_str = ' '.join(cli_args)
|
||||
seed = Seed.from_file(sources_file, label=f'{USER}@{HOSTNAME} $ {cmd_str}', parser=parser, tag=tag, created_by=created_by_id, config={
|
||||
'ONLY_NEW': not update,
|
||||
'INDEX_ONLY': index_only,
|
||||
'OVERWRITE': overwrite,
|
||||
'EXTRACTORS': extract,
|
||||
'DEFAULT_PERSONA': persona or 'Default',
|
||||
})
|
||||
# 3. create a new Crawl pointing to the Seed
|
||||
crawl = Crawl.from_seed(seed, max_depth=depth)
|
||||
|
||||
# 4. start the Orchestrator & wait until it completes
|
||||
# ... orchestrator will create the root Snapshot, which creates pending ArchiveResults, which gets run by the ArchiveResultActors ...
|
||||
# from crawls.actors import CrawlActor
|
||||
# from core.actors import SnapshotActor, ArchiveResultActor
|
||||
|
||||
if not bg:
|
||||
orchestrator = Orchestrator(exit_on_idle=True, max_concurrent_actors=4)
|
||||
orchestrator.start()
|
||||
|
||||
# 5. return the list of new Snapshots created
|
||||
seed = Seed.from_file(
|
||||
sources_file,
|
||||
label=f'{USER}@{HOSTNAME} $ {cmd_str}',
|
||||
parser=parser,
|
||||
tag=tag,
|
||||
created_by=created_by_id,
|
||||
config={
|
||||
'ONLY_NEW': not update,
|
||||
'INDEX_ONLY': index_only,
|
||||
'OVERWRITE': overwrite,
|
||||
'EXTRACTORS': plugins,
|
||||
'DEFAULT_PERSONA': persona or 'Default',
|
||||
}
|
||||
)
|
||||
|
||||
# 3. Create a new Crawl pointing to the Seed (status=queued)
|
||||
crawl = Crawl.from_seed(seed, max_depth=depth)
|
||||
|
||||
print(f'[green]\\[+] Created Crawl {crawl.id} with max_depth={depth}[/green]')
|
||||
print(f' [dim]Seed: {seed.uri}[/dim]')
|
||||
|
||||
# 4. The CrawlMachine will create the root Snapshot when started
|
||||
# Root snapshot URL = file:///path/to/sources/...txt
|
||||
# Parser extractors will run on it and discover URLs
|
||||
# Those URLs become child Snapshots (depth=1)
|
||||
|
||||
if index_only:
|
||||
# Just create the crawl but don't start processing
|
||||
print('[yellow]\\[*] Index-only mode - crawl created but not started[/yellow]')
|
||||
# Create root snapshot manually
|
||||
crawl.create_root_snapshot()
|
||||
return crawl.snapshot_set.all()
|
||||
|
||||
# 5. Start the orchestrator to process the queue
|
||||
# The orchestrator will:
|
||||
# - Process Crawl -> create root Snapshot
|
||||
# - Process root Snapshot -> run parser extractors -> discover URLs
|
||||
# - Create child Snapshots from discovered URLs
|
||||
# - Process child Snapshots -> run extractors
|
||||
# - Repeat until max_depth reached
|
||||
|
||||
if bg:
|
||||
# Background mode: start orchestrator and return immediately
|
||||
print('[yellow]\\[*] Running in background mode - starting orchestrator...[/yellow]')
|
||||
orchestrator = Orchestrator(exit_on_idle=True)
|
||||
orchestrator.start() # Fork to background
|
||||
else:
|
||||
# Foreground mode: run orchestrator until all work is done
|
||||
print(f'[green]\\[*] Starting orchestrator to process crawl...[/green]')
|
||||
orchestrator = Orchestrator(exit_on_idle=True)
|
||||
orchestrator.runloop() # Block until complete
|
||||
|
||||
# 6. Return the list of Snapshots in this crawl
|
||||
return crawl.snapshot_set.all()
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--depth', '-d', type=click.Choice(('0', '1')), default='0', help='Recursively archive linked pages up to N hops away')
|
||||
@click.option('--depth', '-d', type=click.Choice([str(i) for i in range(5)]), default='0', help='Recursively archive linked pages up to N hops away')
|
||||
@click.option('--tag', '-t', default='', help='Comma-separated list of tags to add to each snapshot e.g. tag1,tag2,tag3')
|
||||
@click.option('--parser', type=click.Choice(['auto', *PARSERS.keys()]), default='auto', help='Parser for reading input URLs')
|
||||
@click.option('--extract', '-e', default='', help='Comma-separated list of extractors to use e.g. title,favicon,screenshot,singlefile,...')
|
||||
@click.option('--parser', default='auto', help='Parser for reading input URLs (auto, txt, html, rss, json, jsonl, netscape, ...)')
|
||||
@click.option('--plugins', '-p', default='', help='Comma-separated list of plugins to run e.g. title,favicon,screenshot,singlefile,...')
|
||||
@click.option('--persona', default='Default', help='Authentication profile to use when archiving')
|
||||
@click.option('--overwrite', '-F', is_flag=True, help='Overwrite existing data if URLs have been archived previously')
|
||||
@click.option('--update', is_flag=True, default=ARCHIVING_CONFIG.ONLY_NEW, help='Retry any previously skipped/failed URLs when re-adding them')
|
||||
@click.option('--index-only', is_flag=True, help='Just add the URLs to the index without archiving them now')
|
||||
# @click.option('--update-all', is_flag=True, help='Update ALL links in index when finished adding new ones')
|
||||
@click.option('--bg', is_flag=True, help='Run crawl in background worker instead of immediately')
|
||||
@click.option('--bg', is_flag=True, help='Run archiving in background (start orchestrator and return immediately)')
|
||||
@click.argument('urls', nargs=-1, type=click.Path())
|
||||
@docstring(add.__doc__)
|
||||
def main(**kwargs):
|
||||
"""Add a new URL or list of URLs to your archive"""
|
||||
|
||||
|
||||
add(**kwargs)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
||||
|
||||
|
||||
|
||||
# OLD VERSION:
|
||||
# def add(urls: Union[str, List[str]],
|
||||
# tag: str='',
|
||||
# depth: int=0,
|
||||
# update: bool=not ARCHIVING_CONFIG.ONLY_NEW,
|
||||
# update_all: bool=False,
|
||||
# index_only: bool=False,
|
||||
# overwrite: bool=False,
|
||||
# # duplicate: bool=False, # TODO: reuse the logic from admin.py resnapshot to allow adding multiple snapshots by appending timestamp automatically
|
||||
# init: bool=False,
|
||||
# extractors: str="",
|
||||
# parser: str="auto",
|
||||
# created_by_id: int | None=None,
|
||||
# out_dir: Path=DATA_DIR) -> List[Link]:
|
||||
# """Add a new URL or list of URLs to your archive"""
|
||||
|
||||
# from core.models import Snapshot, Tag
|
||||
# # from workers.supervisord_util import start_cli_workers, tail_worker_logs
|
||||
# # from workers.tasks import bg_archive_link
|
||||
|
||||
|
||||
# assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'
|
||||
|
||||
# extractors = extractors.split(",") if extractors else []
|
||||
|
||||
# if init:
|
||||
# run_subcommand('init', stdin=None, pwd=out_dir)
|
||||
|
||||
# # Load list of links from the existing index
|
||||
# check_data_folder()
|
||||
|
||||
# # worker = start_cli_workers()
|
||||
|
||||
# new_links: List[Link] = []
|
||||
# all_links = load_main_index(out_dir=out_dir)
|
||||
|
||||
# log_importing_started(urls=urls, depth=depth, index_only=index_only)
|
||||
# if isinstance(urls, str):
|
||||
# # save verbatim stdin to sources
|
||||
# write_ahead_log = save_text_as_source(urls, filename='{ts}-import.txt', out_dir=out_dir)
|
||||
# elif isinstance(urls, list):
|
||||
# # save verbatim args to sources
|
||||
# write_ahead_log = save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir)
|
||||
|
||||
|
||||
# new_links += parse_links_from_source(write_ahead_log, root_url=None, parser=parser)
|
||||
|
||||
# # If we're going one level deeper, download each link and look for more links
|
||||
# new_links_depth = []
|
||||
# if new_links and depth == 1:
|
||||
# log_crawl_started(new_links)
|
||||
# for new_link in new_links:
|
||||
# try:
|
||||
# downloaded_file = save_file_as_source(new_link.url, filename=f'{new_link.timestamp}-crawl-{new_link.domain}.txt', out_dir=out_dir)
|
||||
# new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)
|
||||
# except Exception as err:
|
||||
# stderr('[!] Failed to get contents of URL {new_link.url}', err, color='red')
|
||||
|
||||
# imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values())
|
||||
|
||||
# new_links = dedupe_links(all_links, imported_links)
|
||||
|
||||
# write_main_index(links=new_links, out_dir=out_dir, created_by_id=created_by_id)
|
||||
# all_links = load_main_index(out_dir=out_dir)
|
||||
|
||||
# tags = [
|
||||
# Tag.objects.get_or_create(name=name.strip(), defaults={'created_by_id': created_by_id})[0]
|
||||
# for name in tag.split(',')
|
||||
# if name.strip()
|
||||
# ]
|
||||
# if tags:
|
||||
# for link in imported_links:
|
||||
# snapshot = Snapshot.objects.get(url=link.url)
|
||||
# snapshot.tags.add(*tags)
|
||||
# snapshot.tags_str(nocache=True)
|
||||
# snapshot.save()
|
||||
# # print(f' √ Tagged {len(imported_links)} Snapshots with {len(tags)} tags {tags_str}')
|
||||
|
||||
# if index_only:
|
||||
# # mock archive all the links using the fake index_only extractor method in order to update their state
|
||||
# if overwrite:
|
||||
# archive_links(imported_links, overwrite=overwrite, methods=['index_only'], out_dir=out_dir, created_by_id=created_by_id)
|
||||
# else:
|
||||
# archive_links(new_links, overwrite=False, methods=['index_only'], out_dir=out_dir, created_by_id=created_by_id)
|
||||
# else:
|
||||
# # fully run the archive extractor methods for each link
|
||||
# archive_kwargs = {
|
||||
# "out_dir": out_dir,
|
||||
# "created_by_id": created_by_id,
|
||||
# }
|
||||
# if extractors:
|
||||
# archive_kwargs["methods"] = extractors
|
||||
|
||||
# stderr()
|
||||
|
||||
# ts = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
|
||||
|
||||
# if update:
|
||||
# stderr(f'[*] [{ts}] Archiving + updating {len(imported_links)}/{len(all_links)}', len(imported_links), 'URLs from added set...', color='green')
|
||||
# archive_links(imported_links, overwrite=overwrite, **archive_kwargs)
|
||||
# elif update_all:
|
||||
# stderr(f'[*] [{ts}] Archiving + updating {len(all_links)}/{len(all_links)}', len(all_links), 'URLs from entire library...', color='green')
|
||||
# archive_links(all_links, overwrite=overwrite, **archive_kwargs)
|
||||
# elif overwrite:
|
||||
# stderr(f'[*] [{ts}] Archiving + overwriting {len(imported_links)}/{len(all_links)}', len(imported_links), 'URLs from added set...', color='green')
|
||||
# archive_links(imported_links, overwrite=True, **archive_kwargs)
|
||||
# elif new_links:
|
||||
# stderr(f'[*] [{ts}] Archiving {len(new_links)}/{len(all_links)} URLs from added set...', color='green')
|
||||
# archive_links(new_links, overwrite=False, **archive_kwargs)
|
||||
|
||||
# # tail_worker_logs(worker['stdout_logfile'])
|
||||
|
||||
# # if CAN_UPGRADE:
|
||||
# # hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")
|
||||
|
||||
# return new_links
|
||||
|
||||
|
||||
@@ -20,15 +20,15 @@ def config(*keys,
|
||||
**kwargs) -> None:
|
||||
"""Get and set your ArchiveBox project configuration values"""
|
||||
|
||||
import archivebox
|
||||
from archivebox.misc.checks import check_data_folder
|
||||
from archivebox.misc.logging_util import printable_config
|
||||
from archivebox.config.collection import load_all_config, write_config_file, get_real_name
|
||||
from archivebox.config.configset import get_flat_config, get_all_configs
|
||||
|
||||
check_data_folder()
|
||||
|
||||
FLAT_CONFIG = archivebox.pm.hook.get_FLAT_CONFIG()
|
||||
CONFIGS = archivebox.pm.hook.get_CONFIGS()
|
||||
FLAT_CONFIG = get_flat_config()
|
||||
CONFIGS = get_all_configs()
|
||||
|
||||
config_options: list[str] = list(kwargs.pop('key=value', []) or keys or [f'{key}={val}' for key, val in kwargs.items()])
|
||||
no_args = not (get or set or reset or config_options)
|
||||
@@ -105,7 +105,7 @@ def config(*keys,
|
||||
if new_config:
|
||||
before = FLAT_CONFIG
|
||||
matching_config = write_config_file(new_config)
|
||||
after = {**load_all_config(), **archivebox.pm.hook.get_FLAT_CONFIG()}
|
||||
after = {**load_all_config(), **get_flat_config()}
|
||||
print(printable_config(matching_config))
|
||||
|
||||
side_effect_changes = {}
|
||||
|
||||
302
archivebox/cli/archivebox_crawl.py
Normal file
302
archivebox/cli/archivebox_crawl.py
Normal file
@@ -0,0 +1,302 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
"""
|
||||
archivebox crawl [urls_or_snapshot_ids...] [--depth=N] [--plugin=NAME]
|
||||
|
||||
Discover outgoing links from URLs or existing Snapshots.
|
||||
|
||||
If a URL is passed, creates a Snapshot for it first, then runs parser plugins.
|
||||
If a snapshot_id is passed, runs parser plugins on the existing Snapshot.
|
||||
Outputs discovered outlink URLs as JSONL.
|
||||
|
||||
Pipe the output to `archivebox snapshot` to archive the discovered URLs.
|
||||
|
||||
Input formats:
|
||||
- Plain URLs (one per line)
|
||||
- Snapshot UUIDs (one per line)
|
||||
- JSONL: {"type": "Snapshot", "url": "...", ...}
|
||||
- JSONL: {"type": "Snapshot", "id": "...", ...}
|
||||
|
||||
Output (JSONL):
|
||||
{"type": "Snapshot", "url": "https://discovered-url.com", "via_extractor": "...", ...}
|
||||
|
||||
Examples:
|
||||
# Discover links from a page (creates snapshot first)
|
||||
archivebox crawl https://example.com
|
||||
|
||||
# Discover links from an existing snapshot
|
||||
archivebox crawl 01234567-89ab-cdef-0123-456789abcdef
|
||||
|
||||
# Full recursive crawl pipeline
|
||||
archivebox crawl https://example.com | archivebox snapshot | archivebox extract
|
||||
|
||||
# Use only specific parser plugin
|
||||
archivebox crawl --plugin=parse_html_urls https://example.com
|
||||
|
||||
# Chain: create snapshot, then crawl its outlinks
|
||||
archivebox snapshot https://example.com | archivebox crawl | archivebox snapshot | archivebox extract
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox crawl'
|
||||
|
||||
import sys
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import rich_click as click
|
||||
|
||||
from archivebox.misc.util import docstring
|
||||
|
||||
|
||||
def discover_outlinks(
|
||||
args: tuple,
|
||||
depth: int = 1,
|
||||
plugin: str = '',
|
||||
wait: bool = True,
|
||||
) -> int:
|
||||
"""
|
||||
Discover outgoing links from URLs or existing Snapshots.
|
||||
|
||||
Accepts URLs or snapshot_ids. For URLs, creates Snapshots first.
|
||||
Runs parser plugins, outputs discovered URLs as JSONL.
|
||||
The output can be piped to `archivebox snapshot` to archive the discovered links.
|
||||
|
||||
Exit codes:
|
||||
0: Success
|
||||
1: Failure
|
||||
"""
|
||||
from rich import print as rprint
|
||||
from django.utils import timezone
|
||||
|
||||
from archivebox.misc.jsonl import (
|
||||
read_args_or_stdin, write_record,
|
||||
TYPE_SNAPSHOT, get_or_create_snapshot
|
||||
)
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
from core.models import Snapshot, ArchiveResult
|
||||
from crawls.models import Seed, Crawl
|
||||
from archivebox.config import CONSTANTS
|
||||
from workers.orchestrator import Orchestrator
|
||||
|
||||
created_by_id = get_or_create_system_user_pk()
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
# Collect all input records
|
||||
records = list(read_args_or_stdin(args))
|
||||
|
||||
if not records:
|
||||
rprint('[yellow]No URLs or snapshot IDs provided. Pass as arguments or via stdin.[/yellow]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Separate records into existing snapshots vs new URLs
|
||||
existing_snapshot_ids = []
|
||||
new_url_records = []
|
||||
|
||||
for record in records:
|
||||
# Check if it's an existing snapshot (has id but no url, or looks like a UUID)
|
||||
if record.get('id') and not record.get('url'):
|
||||
existing_snapshot_ids.append(record['id'])
|
||||
elif record.get('id'):
|
||||
# Has both id and url - check if snapshot exists
|
||||
try:
|
||||
Snapshot.objects.get(id=record['id'])
|
||||
existing_snapshot_ids.append(record['id'])
|
||||
except Snapshot.DoesNotExist:
|
||||
new_url_records.append(record)
|
||||
elif record.get('url'):
|
||||
new_url_records.append(record)
|
||||
|
||||
# For new URLs, create a Crawl and Snapshots
|
||||
snapshot_ids = list(existing_snapshot_ids)
|
||||
|
||||
if new_url_records:
|
||||
# Create a Crawl to manage this operation
|
||||
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__crawl.txt'
|
||||
sources_file.parent.mkdir(parents=True, exist_ok=True)
|
||||
sources_file.write_text('\n'.join(r.get('url', '') for r in new_url_records if r.get('url')))
|
||||
|
||||
seed = Seed.from_file(
|
||||
sources_file,
|
||||
label=f'crawl --depth={depth}',
|
||||
created_by=created_by_id,
|
||||
)
|
||||
crawl = Crawl.from_seed(seed, max_depth=depth)
|
||||
|
||||
# Create snapshots for new URLs
|
||||
for record in new_url_records:
|
||||
try:
|
||||
record['crawl_id'] = str(crawl.id)
|
||||
record['depth'] = record.get('depth', 0)
|
||||
|
||||
snapshot = get_or_create_snapshot(record, created_by_id=created_by_id)
|
||||
snapshot_ids.append(str(snapshot.id))
|
||||
|
||||
except Exception as e:
|
||||
rprint(f'[red]Error creating snapshot: {e}[/red]', file=sys.stderr)
|
||||
continue
|
||||
|
||||
if not snapshot_ids:
|
||||
rprint('[red]No snapshots to process[/red]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
if existing_snapshot_ids:
|
||||
rprint(f'[blue]Using {len(existing_snapshot_ids)} existing snapshots[/blue]', file=sys.stderr)
|
||||
if new_url_records:
|
||||
rprint(f'[blue]Created {len(snapshot_ids) - len(existing_snapshot_ids)} new snapshots[/blue]', file=sys.stderr)
|
||||
rprint(f'[blue]Running parser plugins on {len(snapshot_ids)} snapshots...[/blue]', file=sys.stderr)
|
||||
|
||||
# Create ArchiveResults for plugins
|
||||
# If --plugin is specified, only run that one. Otherwise, run all available plugins.
|
||||
# The orchestrator will handle dependency ordering (plugins declare deps in config.json)
|
||||
for snapshot_id in snapshot_ids:
|
||||
try:
|
||||
snapshot = Snapshot.objects.get(id=snapshot_id)
|
||||
|
||||
if plugin:
|
||||
# User specified a single plugin to run
|
||||
ArchiveResult.objects.get_or_create(
|
||||
snapshot=snapshot,
|
||||
extractor=plugin,
|
||||
defaults={
|
||||
'status': ArchiveResult.StatusChoices.QUEUED,
|
||||
'retry_at': timezone.now(),
|
||||
'created_by_id': snapshot.created_by_id,
|
||||
}
|
||||
)
|
||||
else:
|
||||
# Create pending ArchiveResults for all enabled plugins
|
||||
# This uses hook discovery to find available plugins dynamically
|
||||
snapshot.create_pending_archiveresults()
|
||||
|
||||
# Mark snapshot as started
|
||||
snapshot.status = Snapshot.StatusChoices.STARTED
|
||||
snapshot.retry_at = timezone.now()
|
||||
snapshot.save()
|
||||
|
||||
except Snapshot.DoesNotExist:
|
||||
continue
|
||||
|
||||
# Run plugins
|
||||
if wait:
|
||||
rprint('[blue]Running outlink plugins...[/blue]', file=sys.stderr)
|
||||
orchestrator = Orchestrator(exit_on_idle=True)
|
||||
orchestrator.runloop()
|
||||
|
||||
# Collect discovered URLs from urls.jsonl files
|
||||
# Uses dynamic discovery - any plugin that outputs urls.jsonl is considered a parser
|
||||
from archivebox.hooks import collect_urls_from_extractors
|
||||
|
||||
discovered_urls = {}
|
||||
for snapshot_id in snapshot_ids:
|
||||
try:
|
||||
snapshot = Snapshot.objects.get(id=snapshot_id)
|
||||
snapshot_dir = Path(snapshot.output_dir)
|
||||
|
||||
# Dynamically collect urls.jsonl from ANY plugin subdirectory
|
||||
for entry in collect_urls_from_extractors(snapshot_dir):
|
||||
url = entry.get('url')
|
||||
if url and url not in discovered_urls:
|
||||
# Add metadata for crawl tracking
|
||||
entry['type'] = TYPE_SNAPSHOT
|
||||
entry['depth'] = snapshot.depth + 1
|
||||
entry['via_snapshot'] = str(snapshot.id)
|
||||
discovered_urls[url] = entry
|
||||
|
||||
except Snapshot.DoesNotExist:
|
||||
continue
|
||||
|
||||
rprint(f'[green]Discovered {len(discovered_urls)} URLs[/green]', file=sys.stderr)
|
||||
|
||||
# Output discovered URLs as JSONL (when piped) or human-readable (when TTY)
|
||||
for url, entry in discovered_urls.items():
|
||||
if is_tty:
|
||||
via = entry.get('via_extractor', 'unknown')
|
||||
rprint(f' [dim]{via}[/dim] {url[:80]}', file=sys.stderr)
|
||||
else:
|
||||
write_record(entry)
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
def process_crawl_by_id(crawl_id: str) -> int:
|
||||
"""
|
||||
Process a single Crawl by ID (used by workers).
|
||||
|
||||
Triggers the Crawl's state machine tick() which will:
|
||||
- Transition from queued -> started (creates root snapshot)
|
||||
- Transition from started -> sealed (when all snapshots done)
|
||||
"""
|
||||
from rich import print as rprint
|
||||
from crawls.models import Crawl
|
||||
|
||||
try:
|
||||
crawl = Crawl.objects.get(id=crawl_id)
|
||||
except Crawl.DoesNotExist:
|
||||
rprint(f'[red]Crawl {crawl_id} not found[/red]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
rprint(f'[blue]Processing Crawl {crawl.id} (status={crawl.status})[/blue]', file=sys.stderr)
|
||||
|
||||
try:
|
||||
crawl.sm.tick()
|
||||
crawl.refresh_from_db()
|
||||
rprint(f'[green]Crawl complete (status={crawl.status})[/green]', file=sys.stderr)
|
||||
return 0
|
||||
except Exception as e:
|
||||
rprint(f'[red]Crawl error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
|
||||
def is_crawl_id(value: str) -> bool:
|
||||
"""Check if value looks like a Crawl UUID."""
|
||||
import re
|
||||
uuid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.I)
|
||||
if not uuid_pattern.match(value):
|
||||
return False
|
||||
# Verify it's actually a Crawl (not a Snapshot or other object)
|
||||
from crawls.models import Crawl
|
||||
return Crawl.objects.filter(id=value).exists()
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--depth', '-d', type=int, default=1, help='Max depth for recursive crawling (default: 1)')
|
||||
@click.option('--plugin', '-p', default='', help='Use only this parser plugin (e.g., parse_html_urls, parse_dom_outlinks)')
|
||||
@click.option('--wait/--no-wait', default=True, help='Wait for plugins to complete (default: wait)')
|
||||
@click.argument('args', nargs=-1)
|
||||
def main(depth: int, plugin: str, wait: bool, args: tuple):
|
||||
"""Discover outgoing links from URLs or existing Snapshots, or process Crawl by ID"""
|
||||
from archivebox.misc.jsonl import read_args_or_stdin
|
||||
|
||||
# Read all input
|
||||
records = list(read_args_or_stdin(args))
|
||||
|
||||
if not records:
|
||||
from rich import print as rprint
|
||||
rprint('[yellow]No URLs, Snapshot IDs, or Crawl IDs provided. Pass as arguments or via stdin.[/yellow]', file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# Check if input looks like existing Crawl IDs to process
|
||||
# If ALL inputs are Crawl UUIDs, process them
|
||||
all_are_crawl_ids = all(
|
||||
is_crawl_id(r.get('id') or r.get('url', ''))
|
||||
for r in records
|
||||
)
|
||||
|
||||
if all_are_crawl_ids:
|
||||
# Process existing Crawls by ID
|
||||
exit_code = 0
|
||||
for record in records:
|
||||
crawl_id = record.get('id') or record.get('url')
|
||||
result = process_crawl_by_id(crawl_id)
|
||||
if result != 0:
|
||||
exit_code = result
|
||||
sys.exit(exit_code)
|
||||
else:
|
||||
# Default behavior: discover outlinks from input (URLs or Snapshot IDs)
|
||||
sys.exit(discover_outlinks(args, depth=depth, plugin=plugin, wait=wait))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -1,49 +1,262 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
"""
|
||||
archivebox extract [snapshot_ids...] [--plugin=NAME]
|
||||
|
||||
Run plugins on Snapshots. Accepts snapshot IDs as arguments, from stdin, or via JSONL.
|
||||
|
||||
Input formats:
|
||||
- Snapshot UUIDs (one per line)
|
||||
- JSONL: {"type": "Snapshot", "id": "...", "url": "..."}
|
||||
- JSONL: {"type": "ArchiveResult", "snapshot_id": "...", "plugin": "..."}
|
||||
|
||||
Output (JSONL):
|
||||
{"type": "ArchiveResult", "id": "...", "snapshot_id": "...", "plugin": "...", "status": "..."}
|
||||
|
||||
Examples:
|
||||
# Extract specific snapshot
|
||||
archivebox extract 01234567-89ab-cdef-0123-456789abcdef
|
||||
|
||||
# Pipe from snapshot command
|
||||
archivebox snapshot https://example.com | archivebox extract
|
||||
|
||||
# Run specific plugin only
|
||||
archivebox extract --plugin=screenshot 01234567-89ab-cdef-0123-456789abcdef
|
||||
|
||||
# Chain commands
|
||||
archivebox crawl https://example.com | archivebox snapshot | archivebox extract
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox extract'
|
||||
|
||||
|
||||
import sys
|
||||
from typing import TYPE_CHECKING, Generator
|
||||
from typing import Optional, List
|
||||
|
||||
import rich_click as click
|
||||
|
||||
from django.db.models import Q
|
||||
|
||||
from archivebox.misc.util import enforce_types, docstring
|
||||
def process_archiveresult_by_id(archiveresult_id: str) -> int:
|
||||
"""
|
||||
Run extraction for a single ArchiveResult by ID (used by workers).
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
Triggers the ArchiveResult's state machine tick() to run the extractor.
|
||||
"""
|
||||
from rich import print as rprint
|
||||
from core.models import ArchiveResult
|
||||
|
||||
try:
|
||||
archiveresult = ArchiveResult.objects.get(id=archiveresult_id)
|
||||
except ArchiveResult.DoesNotExist:
|
||||
rprint(f'[red]ArchiveResult {archiveresult_id} not found[/red]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
ORCHESTRATOR = None
|
||||
rprint(f'[blue]Extracting {archiveresult.extractor} for {archiveresult.snapshot.url}[/blue]', file=sys.stderr)
|
||||
|
||||
@enforce_types
|
||||
def extract(archiveresult_id: str) -> Generator['ArchiveResult', None, None]:
|
||||
archiveresult = ArchiveResult.objects.get(id=archiveresult_id)
|
||||
if not archiveresult:
|
||||
raise Exception(f'ArchiveResult {archiveresult_id} not found')
|
||||
|
||||
return archiveresult.EXTRACTOR.extract()
|
||||
try:
|
||||
# Trigger state machine tick - this runs the actual extraction
|
||||
archiveresult.sm.tick()
|
||||
archiveresult.refresh_from_db()
|
||||
|
||||
if archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED:
|
||||
print(f'[green]Extraction succeeded: {archiveresult.output}[/green]')
|
||||
return 0
|
||||
elif archiveresult.status == ArchiveResult.StatusChoices.FAILED:
|
||||
print(f'[red]Extraction failed: {archiveresult.output}[/red]', file=sys.stderr)
|
||||
return 1
|
||||
else:
|
||||
# Still in progress or backoff - not a failure
|
||||
print(f'[yellow]Extraction status: {archiveresult.status}[/yellow]')
|
||||
return 0
|
||||
|
||||
except Exception as e:
|
||||
print(f'[red]Extraction error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
|
||||
def run_plugins(
|
||||
args: tuple,
|
||||
plugin: str = '',
|
||||
wait: bool = True,
|
||||
) -> int:
|
||||
"""
|
||||
Run plugins on Snapshots from input.
|
||||
|
||||
Reads Snapshot IDs or JSONL from args/stdin, runs plugins, outputs JSONL.
|
||||
|
||||
Exit codes:
|
||||
0: Success
|
||||
1: Failure
|
||||
"""
|
||||
from rich import print as rprint
|
||||
from django.utils import timezone
|
||||
|
||||
from archivebox.misc.jsonl import (
|
||||
read_args_or_stdin, write_record, archiveresult_to_jsonl,
|
||||
TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
|
||||
)
|
||||
from core.models import Snapshot, ArchiveResult
|
||||
from workers.orchestrator import Orchestrator
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
# Collect all input records
|
||||
records = list(read_args_or_stdin(args))
|
||||
|
||||
if not records:
|
||||
rprint('[yellow]No snapshots provided. Pass snapshot IDs as arguments or via stdin.[/yellow]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Gather snapshot IDs to process
|
||||
snapshot_ids = set()
|
||||
for record in records:
|
||||
record_type = record.get('type')
|
||||
|
||||
if record_type == TYPE_SNAPSHOT:
|
||||
snapshot_id = record.get('id')
|
||||
if snapshot_id:
|
||||
snapshot_ids.add(snapshot_id)
|
||||
elif record.get('url'):
|
||||
# Look up by URL
|
||||
try:
|
||||
snap = Snapshot.objects.get(url=record['url'])
|
||||
snapshot_ids.add(str(snap.id))
|
||||
except Snapshot.DoesNotExist:
|
||||
rprint(f'[yellow]Snapshot not found for URL: {record["url"]}[/yellow]', file=sys.stderr)
|
||||
|
||||
elif record_type == TYPE_ARCHIVERESULT:
|
||||
snapshot_id = record.get('snapshot_id')
|
||||
if snapshot_id:
|
||||
snapshot_ids.add(snapshot_id)
|
||||
|
||||
elif 'id' in record:
|
||||
# Assume it's a snapshot ID
|
||||
snapshot_ids.add(record['id'])
|
||||
|
||||
if not snapshot_ids:
|
||||
rprint('[red]No valid snapshot IDs found in input[/red]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Get snapshots and ensure they have pending ArchiveResults
|
||||
processed_count = 0
|
||||
for snapshot_id in snapshot_ids:
|
||||
try:
|
||||
snapshot = Snapshot.objects.get(id=snapshot_id)
|
||||
except Snapshot.DoesNotExist:
|
||||
rprint(f'[yellow]Snapshot {snapshot_id} not found[/yellow]', file=sys.stderr)
|
||||
continue
|
||||
|
||||
# Create pending ArchiveResults if needed
|
||||
if plugin:
|
||||
# Only create for specific plugin
|
||||
result, created = ArchiveResult.objects.get_or_create(
|
||||
snapshot=snapshot,
|
||||
extractor=plugin,
|
||||
defaults={
|
||||
'status': ArchiveResult.StatusChoices.QUEUED,
|
||||
'retry_at': timezone.now(),
|
||||
'created_by_id': snapshot.created_by_id,
|
||||
}
|
||||
)
|
||||
if not created and result.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]:
|
||||
# Reset for retry
|
||||
result.status = ArchiveResult.StatusChoices.QUEUED
|
||||
result.retry_at = timezone.now()
|
||||
result.save()
|
||||
else:
|
||||
# Create all pending plugins
|
||||
snapshot.create_pending_archiveresults()
|
||||
|
||||
# Reset snapshot status to allow processing
|
||||
if snapshot.status == Snapshot.StatusChoices.SEALED:
|
||||
snapshot.status = Snapshot.StatusChoices.STARTED
|
||||
snapshot.retry_at = timezone.now()
|
||||
snapshot.save()
|
||||
|
||||
processed_count += 1
|
||||
|
||||
if processed_count == 0:
|
||||
rprint('[red]No snapshots to process[/red]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
rprint(f'[blue]Queued {processed_count} snapshots for extraction[/blue]', file=sys.stderr)
|
||||
|
||||
# Run orchestrator if --wait (default)
|
||||
if wait:
|
||||
rprint('[blue]Running plugins...[/blue]', file=sys.stderr)
|
||||
orchestrator = Orchestrator(exit_on_idle=True)
|
||||
orchestrator.runloop()
|
||||
|
||||
# Output results as JSONL (when piped) or human-readable (when TTY)
|
||||
for snapshot_id in snapshot_ids:
|
||||
try:
|
||||
snapshot = Snapshot.objects.get(id=snapshot_id)
|
||||
results = snapshot.archiveresult_set.all()
|
||||
if plugin:
|
||||
results = results.filter(extractor=plugin)
|
||||
|
||||
for result in results:
|
||||
if is_tty:
|
||||
status_color = {
|
||||
'succeeded': 'green',
|
||||
'failed': 'red',
|
||||
'skipped': 'yellow',
|
||||
}.get(result.status, 'dim')
|
||||
rprint(f' [{status_color}]{result.status}[/{status_color}] {result.extractor} → {result.output or ""}', file=sys.stderr)
|
||||
else:
|
||||
write_record(archiveresult_to_jsonl(result))
|
||||
except Snapshot.DoesNotExist:
|
||||
continue
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
def is_archiveresult_id(value: str) -> bool:
|
||||
"""Check if value looks like an ArchiveResult UUID."""
|
||||
import re
|
||||
uuid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.I)
|
||||
if not uuid_pattern.match(value):
|
||||
return False
|
||||
# Verify it's actually an ArchiveResult (not a Snapshot or other object)
|
||||
from core.models import ArchiveResult
|
||||
return ArchiveResult.objects.filter(id=value).exists()
|
||||
|
||||
# <user>@<machine_id>#<datetime>/absolute/path/to/binary
|
||||
# 2014.24.01
|
||||
|
||||
@click.command()
|
||||
@click.option('--plugin', '-p', default='', help='Run only this plugin (e.g., screenshot, singlefile)')
|
||||
@click.option('--wait/--no-wait', default=True, help='Wait for plugins to complete (default: wait)')
|
||||
@click.argument('args', nargs=-1)
|
||||
def main(plugin: str, wait: bool, args: tuple):
|
||||
"""Run plugins on Snapshots, or process existing ArchiveResults by ID"""
|
||||
from archivebox.misc.jsonl import read_args_or_stdin
|
||||
|
||||
@click.argument('archiveresult_ids', nargs=-1, type=str)
|
||||
@docstring(extract.__doc__)
|
||||
def main(archiveresult_ids: list[str]):
|
||||
"""Add a new URL or list of URLs to your archive"""
|
||||
|
||||
for archiveresult_id in (archiveresult_ids or sys.stdin):
|
||||
print(f'Extracting {archiveresult_id}...')
|
||||
archiveresult = extract(str(archiveresult_id))
|
||||
print(archiveresult.as_json())
|
||||
# Read all input
|
||||
records = list(read_args_or_stdin(args))
|
||||
|
||||
if not records:
|
||||
from rich import print as rprint
|
||||
rprint('[yellow]No Snapshot IDs or ArchiveResult IDs provided. Pass as arguments or via stdin.[/yellow]', file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# Check if input looks like existing ArchiveResult IDs to process
|
||||
all_are_archiveresult_ids = all(
|
||||
is_archiveresult_id(r.get('id') or r.get('url', ''))
|
||||
for r in records
|
||||
)
|
||||
|
||||
if all_are_archiveresult_ids:
|
||||
# Process existing ArchiveResults by ID
|
||||
exit_code = 0
|
||||
for record in records:
|
||||
archiveresult_id = record.get('id') or record.get('url')
|
||||
result = process_archiveresult_by_id(archiveresult_id)
|
||||
if result != 0:
|
||||
exit_code = result
|
||||
sys.exit(exit_code)
|
||||
else:
|
||||
# Default behavior: run plugins on Snapshots from input
|
||||
sys.exit(run_plugins(args, plugin=plugin, wait=wait))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
||||
|
||||
@@ -21,10 +21,9 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
|
||||
from archivebox.config import CONSTANTS, VERSION, DATA_DIR
|
||||
from archivebox.config.common import SERVER_CONFIG
|
||||
from archivebox.config.collection import write_config_file
|
||||
from archivebox.index import load_main_index, write_main_index, fix_invalid_folder_locations, get_invalid_folders
|
||||
from archivebox.index.schema import Link
|
||||
from archivebox.index.json import parse_json_main_index, parse_json_links_details
|
||||
from archivebox.index.sql import apply_migrations
|
||||
from archivebox.misc.folders import fix_invalid_folder_locations, get_invalid_folders
|
||||
from archivebox.misc.legacy import parse_json_main_index, parse_json_links_details, SnapshotDict
|
||||
from archivebox.misc.db import apply_migrations
|
||||
|
||||
# if os.access(out_dir / CONSTANTS.JSON_INDEX_FILENAME, os.F_OK):
|
||||
# print("[red]:warning: This folder contains a JSON index. It is deprecated, and will no longer be kept up to date automatically.[/red]", file=sys.stderr)
|
||||
@@ -100,10 +99,10 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
|
||||
from core.models import Snapshot
|
||||
|
||||
all_links = Snapshot.objects.none()
|
||||
pending_links: dict[str, Link] = {}
|
||||
pending_links: dict[str, SnapshotDict] = {}
|
||||
|
||||
if existing_index:
|
||||
all_links = load_main_index(DATA_DIR, warn=False)
|
||||
all_links = Snapshot.objects.all()
|
||||
print(f' √ Loaded {all_links.count()} links from existing main index.')
|
||||
|
||||
if quick:
|
||||
@@ -119,9 +118,9 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
|
||||
|
||||
# Links in JSON index but not in main index
|
||||
orphaned_json_links = {
|
||||
link.url: link
|
||||
for link in parse_json_main_index(DATA_DIR)
|
||||
if not all_links.filter(url=link.url).exists()
|
||||
link_dict['url']: link_dict
|
||||
for link_dict in parse_json_main_index(DATA_DIR)
|
||||
if not all_links.filter(url=link_dict['url']).exists()
|
||||
}
|
||||
if orphaned_json_links:
|
||||
pending_links.update(orphaned_json_links)
|
||||
@@ -129,9 +128,9 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
|
||||
|
||||
# Links in data dir indexes but not in main index
|
||||
orphaned_data_dir_links = {
|
||||
link.url: link
|
||||
for link in parse_json_links_details(DATA_DIR)
|
||||
if not all_links.filter(url=link.url).exists()
|
||||
link_dict['url']: link_dict
|
||||
for link_dict in parse_json_links_details(DATA_DIR)
|
||||
if not all_links.filter(url=link_dict['url']).exists()
|
||||
}
|
||||
if orphaned_data_dir_links:
|
||||
pending_links.update(orphaned_data_dir_links)
|
||||
@@ -159,7 +158,8 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
|
||||
print(' archivebox init --quick', file=sys.stderr)
|
||||
raise SystemExit(1)
|
||||
|
||||
write_main_index(list(pending_links.values()), DATA_DIR)
|
||||
if pending_links:
|
||||
Snapshot.objects.create_from_dicts(list(pending_links.values()))
|
||||
|
||||
print('\n[green]----------------------------------------------------------------------[/green]')
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@ __package__ = 'archivebox.cli'
|
||||
|
||||
import os
|
||||
import sys
|
||||
from typing import Optional, List
|
||||
import shutil
|
||||
|
||||
import rich_click as click
|
||||
from rich import print
|
||||
@@ -13,149 +13,86 @@ from archivebox.misc.util import docstring, enforce_types
|
||||
|
||||
|
||||
@enforce_types
|
||||
def install(binproviders: Optional[List[str]]=None, binaries: Optional[List[str]]=None, dry_run: bool=False) -> None:
|
||||
"""Automatically install all ArchiveBox dependencies and extras"""
|
||||
|
||||
# if running as root:
|
||||
# - run init to create index + lib dir
|
||||
# - chown -R 911 DATA_DIR
|
||||
# - install all binaries as root
|
||||
# - chown -R 911 LIB_DIR
|
||||
# else:
|
||||
# - run init to create index + lib dir as current user
|
||||
# - install all binaries as current user
|
||||
# - recommend user re-run with sudo if any deps need to be installed as root
|
||||
def install(dry_run: bool=False) -> None:
|
||||
"""Detect and install ArchiveBox dependencies by running a dependency-check crawl"""
|
||||
|
||||
import abx
|
||||
import archivebox
|
||||
from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, SudoPermission
|
||||
from archivebox.config.paths import DATA_DIR, ARCHIVE_DIR, get_or_create_working_lib_dir
|
||||
from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP
|
||||
from archivebox.config.paths import ARCHIVE_DIR
|
||||
from archivebox.misc.logging import stderr
|
||||
from archivebox.cli.archivebox_init import init
|
||||
from archivebox.misc.system import run as run_shell
|
||||
|
||||
|
||||
if not (os.access(ARCHIVE_DIR, os.R_OK) and ARCHIVE_DIR.is_dir()):
|
||||
init() # must init full index because we need a db to store InstalledBinary entries in
|
||||
|
||||
print('\n[green][+] Installing ArchiveBox dependencies automatically...[/green]')
|
||||
|
||||
# we never want the data dir to be owned by root, detect owner of existing owner of DATA_DIR to try and guess desired non-root UID
|
||||
print('\n[green][+] Detecting ArchiveBox dependencies...[/green]')
|
||||
|
||||
if IS_ROOT:
|
||||
EUID = os.geteuid()
|
||||
|
||||
# if we have sudo/root permissions, take advantage of them just while installing dependencies
|
||||
print()
|
||||
print(f'[yellow]:warning: Running as UID=[blue]{EUID}[/blue] with [red]sudo[/red] only for dependencies that need it.[/yellow]')
|
||||
print(f' DATA_DIR, LIB_DIR, and TMP_DIR will be owned by [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue].')
|
||||
print(f'[yellow]:warning: Running as UID=[blue]{EUID}[/blue].[/yellow]')
|
||||
print(f' DATA_DIR will be owned by [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue].')
|
||||
print()
|
||||
|
||||
LIB_DIR = get_or_create_working_lib_dir()
|
||||
|
||||
package_manager_names = ', '.join(
|
||||
f'[yellow]{binprovider.name}[/yellow]'
|
||||
for binprovider in reversed(list(abx.as_dict(abx.pm.hook.get_BINPROVIDERS()).values()))
|
||||
if not binproviders or (binproviders and binprovider.name in binproviders)
|
||||
)
|
||||
print(f'[+] Setting up package managers {package_manager_names}...')
|
||||
for binprovider in reversed(list(abx.as_dict(abx.pm.hook.get_BINPROVIDERS()).values())):
|
||||
if binproviders and binprovider.name not in binproviders:
|
||||
continue
|
||||
try:
|
||||
binprovider.setup()
|
||||
except Exception:
|
||||
# it's ok, installing binaries below will automatically set up package managers as needed
|
||||
# e.g. if user does not have npm available we cannot set it up here yet, but once npm Binary is installed
|
||||
# the next package that depends on npm will automatically call binprovider.setup() during its own install
|
||||
pass
|
||||
|
||||
print()
|
||||
|
||||
for binary in reversed(list(abx.as_dict(abx.pm.hook.get_BINARIES()).values())):
|
||||
if binary.name in ('archivebox', 'django', 'sqlite', 'python'):
|
||||
# obviously must already be installed if we are running
|
||||
continue
|
||||
|
||||
if binaries and binary.name not in binaries:
|
||||
continue
|
||||
|
||||
providers = ' [grey53]or[/grey53] '.join(
|
||||
provider.name for provider in binary.binproviders_supported
|
||||
if not binproviders or (binproviders and provider.name in binproviders)
|
||||
)
|
||||
if not providers:
|
||||
continue
|
||||
print(f'[+] Detecting / Installing [yellow]{binary.name.ljust(22)}[/yellow] using [red]{providers}[/red]...')
|
||||
try:
|
||||
with SudoPermission(uid=0, fallback=True):
|
||||
# print(binary.load_or_install(fresh=True).model_dump(exclude={'overrides', 'bin_dir', 'hook_type'}))
|
||||
if binproviders:
|
||||
providers_supported_by_binary = [provider.name for provider in binary.binproviders_supported]
|
||||
for binprovider_name in binproviders:
|
||||
if binprovider_name not in providers_supported_by_binary:
|
||||
continue
|
||||
try:
|
||||
if dry_run:
|
||||
# always show install commands when doing a dry run
|
||||
sys.stderr.write("\033[2;49;90m") # grey53
|
||||
result = binary.install(binproviders=[binprovider_name], dry_run=dry_run).model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
|
||||
sys.stderr.write("\033[00m\n") # reset
|
||||
else:
|
||||
loaded_binary = archivebox.pm.hook.binary_load_or_install(binary=binary, binproviders=[binprovider_name], fresh=True, dry_run=dry_run, quiet=False)
|
||||
result = loaded_binary.model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
|
||||
if result and result['loaded_version']:
|
||||
break
|
||||
except Exception as e:
|
||||
print(f'[red]:cross_mark: Failed to install {binary.name} as using {binprovider_name} as user {ARCHIVEBOX_USER}: {e}[/red]')
|
||||
else:
|
||||
if dry_run:
|
||||
sys.stderr.write("\033[2;49;90m") # grey53
|
||||
binary.install(dry_run=dry_run).model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
|
||||
sys.stderr.write("\033[00m\n") # reset
|
||||
else:
|
||||
loaded_binary = archivebox.pm.hook.binary_load_or_install(binary=binary, fresh=True, dry_run=dry_run)
|
||||
result = loaded_binary.model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
|
||||
if IS_ROOT and LIB_DIR:
|
||||
with SudoPermission(uid=0):
|
||||
if ARCHIVEBOX_USER == 0:
|
||||
os.system(f'chmod -R 777 "{LIB_DIR.resolve()}"')
|
||||
else:
|
||||
os.system(f'chown -R {ARCHIVEBOX_USER} "{LIB_DIR.resolve()}"')
|
||||
except Exception as e:
|
||||
print(f'[red]:cross_mark: Failed to install {binary.name} as user {ARCHIVEBOX_USER}: {e}[/red]')
|
||||
if binaries and len(binaries) == 1:
|
||||
# if we are only installing a single binary, raise the exception so the user can see what went wrong
|
||||
raise
|
||||
|
||||
|
||||
if dry_run:
|
||||
print('[dim]Dry run - would create a crawl to detect dependencies[/dim]')
|
||||
return
|
||||
|
||||
# Set up Django
|
||||
from archivebox.config.django import setup_django
|
||||
setup_django()
|
||||
|
||||
|
||||
from django.utils import timezone
|
||||
from crawls.models import Seed, Crawl
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
|
||||
# Create a seed and crawl for dependency detection
|
||||
# Using a minimal crawl that will trigger on_Crawl hooks
|
||||
created_by_id = get_or_create_system_user_pk()
|
||||
|
||||
seed = Seed.objects.create(
|
||||
uri='archivebox://install',
|
||||
label='Dependency detection',
|
||||
created_by_id=created_by_id,
|
||||
)
|
||||
|
||||
crawl = Crawl.objects.create(
|
||||
seed=seed,
|
||||
max_depth=0,
|
||||
created_by_id=created_by_id,
|
||||
status='queued',
|
||||
)
|
||||
|
||||
print(f'[+] Created dependency detection crawl: {crawl.id}')
|
||||
print('[+] Running crawl to detect binaries via on_Crawl hooks...')
|
||||
print()
|
||||
|
||||
# Run the crawl synchronously (this triggers on_Crawl hooks)
|
||||
from workers.orchestrator import Orchestrator
|
||||
orchestrator = Orchestrator(exit_on_idle=True)
|
||||
orchestrator.runloop()
|
||||
|
||||
print()
|
||||
|
||||
# Check for superuser
|
||||
from django.contrib.auth import get_user_model
|
||||
User = get_user_model()
|
||||
|
||||
if not User.objects.filter(is_superuser=True).exclude(username='system').exists():
|
||||
stderr('\n[+] Don\'t forget to create a new admin user for the Web UI...', color='green')
|
||||
stderr(' archivebox manage createsuperuser')
|
||||
# run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir)
|
||||
|
||||
print('\n[green][√] Set up ArchiveBox and its dependencies successfully.[/green]\n', file=sys.stderr)
|
||||
|
||||
from abx_plugin_pip.binaries import ARCHIVEBOX_BINARY
|
||||
|
||||
extra_args = []
|
||||
if binproviders:
|
||||
extra_args.append(f'--binproviders={",".join(binproviders)}')
|
||||
if binaries:
|
||||
extra_args.append(f'--binaries={",".join(binaries)}')
|
||||
|
||||
proc = run_shell([ARCHIVEBOX_BINARY.load().abspath, 'version', *extra_args], capture_output=False, cwd=DATA_DIR)
|
||||
raise SystemExit(proc.returncode)
|
||||
|
||||
print()
|
||||
|
||||
# Run version to show full status
|
||||
archivebox_path = shutil.which('archivebox') or sys.executable
|
||||
if 'python' in archivebox_path:
|
||||
os.system(f'{sys.executable} -m archivebox version')
|
||||
else:
|
||||
os.system(f'{archivebox_path} version')
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--binproviders', '-p', type=str, help='Select binproviders to use DEFAULT=env,apt,brew,sys_pip,venv_pip,lib_pip,pipx,sys_npm,lib_npm,puppeteer,playwright (all)', default=None)
|
||||
@click.option('--binaries', '-b', type=str, help='Select binaries to install DEFAULT=curl,wget,git,yt-dlp,chrome,single-file,readability-extractor,postlight-parser,... (all)', default=None)
|
||||
@click.option('--dry-run', '-d', is_flag=True, help='Show what would be installed without actually installing anything', default=False)
|
||||
@click.option('--dry-run', '-d', is_flag=True, help='Show what would happen without actually running', default=False)
|
||||
@docstring(install.__doc__)
|
||||
def main(**kwargs) -> None:
|
||||
install(**kwargs)
|
||||
|
||||
67
archivebox/cli/archivebox_orchestrator.py
Normal file
67
archivebox/cli/archivebox_orchestrator.py
Normal file
@@ -0,0 +1,67 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
"""
|
||||
archivebox orchestrator [--daemon]
|
||||
|
||||
Start the orchestrator process that manages workers.
|
||||
|
||||
The orchestrator polls queues for each model type (Crawl, Snapshot, ArchiveResult)
|
||||
and lazily spawns worker processes when there is work to be done.
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox orchestrator'
|
||||
|
||||
import sys
|
||||
|
||||
import rich_click as click
|
||||
|
||||
from archivebox.misc.util import docstring
|
||||
|
||||
|
||||
def orchestrator(daemon: bool = False, watch: bool = False) -> int:
|
||||
"""
|
||||
Start the orchestrator process.
|
||||
|
||||
The orchestrator:
|
||||
1. Polls each model queue (Crawl, Snapshot, ArchiveResult)
|
||||
2. Spawns worker processes when there is work to do
|
||||
3. Monitors worker health and restarts failed workers
|
||||
4. Exits when all queues are empty (unless --daemon)
|
||||
|
||||
Args:
|
||||
daemon: Run forever (don't exit when idle)
|
||||
watch: Just watch the queues without spawning workers (for debugging)
|
||||
|
||||
Exit codes:
|
||||
0: All work completed successfully
|
||||
1: Error occurred
|
||||
"""
|
||||
from workers.orchestrator import Orchestrator
|
||||
|
||||
if Orchestrator.is_running():
|
||||
print('[yellow]Orchestrator is already running[/yellow]')
|
||||
return 0
|
||||
|
||||
try:
|
||||
orchestrator_instance = Orchestrator(exit_on_idle=not daemon)
|
||||
orchestrator_instance.runloop()
|
||||
return 0
|
||||
except KeyboardInterrupt:
|
||||
return 0
|
||||
except Exception as e:
|
||||
print(f'[red]Orchestrator error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--daemon', '-d', is_flag=True, help="Run forever (don't exit on idle)")
|
||||
@click.option('--watch', '-w', is_flag=True, help="Watch queues without spawning workers")
|
||||
@docstring(orchestrator.__doc__)
|
||||
def main(daemon: bool, watch: bool):
|
||||
"""Start the ArchiveBox orchestrator process"""
|
||||
sys.exit(orchestrator(daemon=daemon, watch=watch))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -12,10 +12,7 @@ import rich_click as click
|
||||
from django.db.models import QuerySet
|
||||
|
||||
from archivebox.config import DATA_DIR
|
||||
from archivebox.index.schema import Link
|
||||
from archivebox.config.django import setup_django
|
||||
from archivebox.index import load_main_index
|
||||
from archivebox.index.sql import remove_from_sql_main_index
|
||||
from archivebox.misc.util import enforce_types, docstring
|
||||
from archivebox.misc.checks import check_data_folder
|
||||
from archivebox.misc.logging_util import (
|
||||
@@ -35,7 +32,7 @@ def remove(filter_patterns: Iterable[str]=(),
|
||||
before: float | None=None,
|
||||
yes: bool=False,
|
||||
delete: bool=False,
|
||||
out_dir: Path=DATA_DIR) -> Iterable[Link]:
|
||||
out_dir: Path=DATA_DIR) -> QuerySet:
|
||||
"""Remove the specified URLs from the archive"""
|
||||
|
||||
setup_django()
|
||||
@@ -63,27 +60,27 @@ def remove(filter_patterns: Iterable[str]=(),
|
||||
log_removal_finished(0, 0)
|
||||
raise SystemExit(1)
|
||||
|
||||
log_links = [link.as_link() for link in snapshots]
|
||||
log_list_finished(log_links)
|
||||
log_removal_started(log_links, yes=yes, delete=delete)
|
||||
log_list_finished(snapshots)
|
||||
log_removal_started(snapshots, yes=yes, delete=delete)
|
||||
|
||||
timer = TimedProgress(360, prefix=' ')
|
||||
try:
|
||||
for snapshot in snapshots:
|
||||
if delete:
|
||||
shutil.rmtree(snapshot.as_link().link_dir, ignore_errors=True)
|
||||
shutil.rmtree(snapshot.output_dir, ignore_errors=True)
|
||||
finally:
|
||||
timer.end()
|
||||
|
||||
to_remove = snapshots.count()
|
||||
|
||||
from archivebox.search import flush_search_index
|
||||
from core.models import Snapshot
|
||||
|
||||
flush_search_index(snapshots=snapshots)
|
||||
remove_from_sql_main_index(snapshots=snapshots, out_dir=out_dir)
|
||||
all_snapshots = load_main_index(out_dir=out_dir)
|
||||
snapshots.delete()
|
||||
all_snapshots = Snapshot.objects.all()
|
||||
log_removal_finished(all_snapshots.count(), to_remove)
|
||||
|
||||
|
||||
return all_snapshots
|
||||
|
||||
|
||||
|
||||
@@ -35,9 +35,12 @@ def schedule(add: bool=False,
|
||||
|
||||
depth = int(depth)
|
||||
|
||||
import shutil
|
||||
from crontab import CronTab, CronSlices
|
||||
from archivebox.misc.system import dedupe_cron_jobs
|
||||
from abx_plugin_pip.binaries import ARCHIVEBOX_BINARY
|
||||
|
||||
# Find the archivebox binary path
|
||||
ARCHIVEBOX_ABSPATH = shutil.which('archivebox') or sys.executable.replace('python', 'archivebox')
|
||||
|
||||
Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)
|
||||
|
||||
@@ -58,7 +61,7 @@ def schedule(add: bool=False,
|
||||
'cd',
|
||||
quoted(out_dir),
|
||||
'&&',
|
||||
quoted(ARCHIVEBOX_BINARY.load().abspath),
|
||||
quoted(ARCHIVEBOX_ABSPATH),
|
||||
*([
|
||||
'add',
|
||||
*(['--overwrite'] if overwrite else []),
|
||||
|
||||
@@ -4,7 +4,7 @@ __package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox search'
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Optional, List, Iterable
|
||||
from typing import Optional, List, Any
|
||||
|
||||
import rich_click as click
|
||||
from rich import print
|
||||
@@ -12,11 +12,19 @@ from rich import print
|
||||
from django.db.models import QuerySet
|
||||
|
||||
from archivebox.config import DATA_DIR
|
||||
from archivebox.index import LINK_FILTERS
|
||||
from archivebox.index.schema import Link
|
||||
from archivebox.misc.logging import stderr
|
||||
from archivebox.misc.util import enforce_types, docstring
|
||||
|
||||
# Filter types for URL matching
|
||||
LINK_FILTERS = {
|
||||
'exact': lambda pattern: {'url': pattern},
|
||||
'substring': lambda pattern: {'url__icontains': pattern},
|
||||
'regex': lambda pattern: {'url__iregex': pattern},
|
||||
'domain': lambda pattern: {'url__istartswith': f'http://{pattern}'},
|
||||
'tag': lambda pattern: {'tags__name': pattern},
|
||||
'timestamp': lambda pattern: {'timestamp': pattern},
|
||||
}
|
||||
|
||||
STATUS_CHOICES = [
|
||||
'indexed', 'archived', 'unarchived', 'present', 'valid', 'invalid',
|
||||
'duplicate', 'orphaned', 'corrupted', 'unrecognized'
|
||||
@@ -24,38 +32,37 @@ STATUS_CHOICES = [
|
||||
|
||||
|
||||
|
||||
def list_links(snapshots: Optional[QuerySet]=None,
|
||||
filter_patterns: Optional[List[str]]=None,
|
||||
filter_type: str='substring',
|
||||
after: Optional[float]=None,
|
||||
before: Optional[float]=None,
|
||||
out_dir: Path=DATA_DIR) -> Iterable[Link]:
|
||||
|
||||
from archivebox.index import load_main_index
|
||||
from archivebox.index import snapshot_filter
|
||||
def get_snapshots(snapshots: Optional[QuerySet]=None,
|
||||
filter_patterns: Optional[List[str]]=None,
|
||||
filter_type: str='substring',
|
||||
after: Optional[float]=None,
|
||||
before: Optional[float]=None,
|
||||
out_dir: Path=DATA_DIR) -> QuerySet:
|
||||
"""Filter and return Snapshots matching the given criteria."""
|
||||
from core.models import Snapshot
|
||||
|
||||
if snapshots:
|
||||
all_snapshots = snapshots
|
||||
result = snapshots
|
||||
else:
|
||||
all_snapshots = load_main_index(out_dir=out_dir)
|
||||
result = Snapshot.objects.all()
|
||||
|
||||
if after is not None:
|
||||
all_snapshots = all_snapshots.filter(timestamp__gte=after)
|
||||
result = result.filter(timestamp__gte=after)
|
||||
if before is not None:
|
||||
all_snapshots = all_snapshots.filter(timestamp__lt=before)
|
||||
result = result.filter(timestamp__lt=before)
|
||||
if filter_patterns:
|
||||
all_snapshots = snapshot_filter(all_snapshots, filter_patterns, filter_type)
|
||||
result = Snapshot.objects.filter_by_patterns(filter_patterns, filter_type)
|
||||
|
||||
if not all_snapshots:
|
||||
if not result:
|
||||
stderr('[!] No Snapshots matched your filters:', filter_patterns, f'({filter_type})', color='lightyellow')
|
||||
|
||||
return all_snapshots
|
||||
return result
|
||||
|
||||
|
||||
def list_folders(links: list[Link], status: str, out_dir: Path=DATA_DIR) -> dict[str, Link | None]:
|
||||
|
||||
def list_folders(snapshots: QuerySet, status: str, out_dir: Path=DATA_DIR) -> dict[str, Any]:
|
||||
|
||||
from archivebox.misc.checks import check_data_folder
|
||||
from archivebox.index import (
|
||||
from archivebox.misc.folders import (
|
||||
get_indexed_folders,
|
||||
get_archived_folders,
|
||||
get_unarchived_folders,
|
||||
@@ -67,7 +74,7 @@ def list_folders(links: list[Link], status: str, out_dir: Path=DATA_DIR) -> dict
|
||||
get_corrupted_folders,
|
||||
get_unrecognized_folders,
|
||||
)
|
||||
|
||||
|
||||
check_data_folder()
|
||||
|
||||
STATUS_FUNCTIONS = {
|
||||
@@ -84,7 +91,7 @@ def list_folders(links: list[Link], status: str, out_dir: Path=DATA_DIR) -> dict
|
||||
}
|
||||
|
||||
try:
|
||||
return STATUS_FUNCTIONS[status](links, out_dir=out_dir)
|
||||
return STATUS_FUNCTIONS[status](snapshots, out_dir=out_dir)
|
||||
except KeyError:
|
||||
raise ValueError('Status not recognized.')
|
||||
|
||||
@@ -109,7 +116,7 @@ def search(filter_patterns: list[str] | None=None,
|
||||
stderr('[X] --with-headers requires --json, --html or --csv\n', color='red')
|
||||
raise SystemExit(2)
|
||||
|
||||
snapshots = list_links(
|
||||
snapshots = get_snapshots(
|
||||
filter_patterns=list(filter_patterns) if filter_patterns else None,
|
||||
filter_type=filter_type,
|
||||
before=before,
|
||||
@@ -120,20 +127,24 @@ def search(filter_patterns: list[str] | None=None,
|
||||
snapshots = snapshots.order_by(sort)
|
||||
|
||||
folders = list_folders(
|
||||
links=snapshots,
|
||||
snapshots=snapshots,
|
||||
status=status,
|
||||
out_dir=DATA_DIR,
|
||||
)
|
||||
|
||||
if json:
|
||||
from archivebox.index.json import generate_json_index_from_links
|
||||
output = generate_json_index_from_links(folders.values(), with_headers)
|
||||
from core.models import Snapshot
|
||||
# Filter for non-None snapshots
|
||||
valid_snapshots = [s for s in folders.values() if s is not None]
|
||||
output = Snapshot.objects.filter(pk__in=[s.pk for s in valid_snapshots]).to_json(with_headers=with_headers)
|
||||
elif html:
|
||||
from archivebox.index.html import generate_index_from_links
|
||||
output = generate_index_from_links(folders.values(), with_headers)
|
||||
from core.models import Snapshot
|
||||
valid_snapshots = [s for s in folders.values() if s is not None]
|
||||
output = Snapshot.objects.filter(pk__in=[s.pk for s in valid_snapshots]).to_html(with_headers=with_headers)
|
||||
elif csv:
|
||||
from archivebox.index.csv import links_to_csv
|
||||
output = links_to_csv(folders.values(), csv.split(','), with_headers)
|
||||
from core.models import Snapshot
|
||||
valid_snapshots = [s for s in folders.values() if s is not None]
|
||||
output = Snapshot.objects.filter(pk__in=[s.pk for s in valid_snapshots]).to_csv(cols=csv.split(','), header=with_headers)
|
||||
else:
|
||||
from archivebox.misc.logging_util import printable_folders
|
||||
output = printable_folders(folders, with_headers)
|
||||
|
||||
218
archivebox/cli/archivebox_snapshot.py
Normal file
218
archivebox/cli/archivebox_snapshot.py
Normal file
@@ -0,0 +1,218 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
"""
|
||||
archivebox snapshot [urls...] [--depth=N] [--tag=TAG] [--plugins=...]
|
||||
|
||||
Create Snapshots from URLs. Accepts URLs as arguments, from stdin, or via JSONL.
|
||||
|
||||
Input formats:
|
||||
- Plain URLs (one per line)
|
||||
- JSONL: {"type": "Snapshot", "url": "...", "title": "...", "tags": "..."}
|
||||
|
||||
Output (JSONL):
|
||||
{"type": "Snapshot", "id": "...", "url": "...", "status": "queued", ...}
|
||||
|
||||
Examples:
|
||||
# Create snapshots from URLs
|
||||
archivebox snapshot https://example.com https://foo.com
|
||||
|
||||
# Pipe from stdin
|
||||
echo 'https://example.com' | archivebox snapshot
|
||||
|
||||
# Chain with extract
|
||||
archivebox snapshot https://example.com | archivebox extract
|
||||
|
||||
# With crawl depth
|
||||
archivebox snapshot --depth=1 https://example.com
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox snapshot'
|
||||
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
import rich_click as click
|
||||
|
||||
from archivebox.misc.util import docstring
|
||||
|
||||
|
||||
def process_snapshot_by_id(snapshot_id: str) -> int:
|
||||
"""
|
||||
Process a single Snapshot by ID (used by workers).
|
||||
|
||||
Triggers the Snapshot's state machine tick() which will:
|
||||
- Transition from queued -> started (creates pending ArchiveResults)
|
||||
- Transition from started -> sealed (when all ArchiveResults done)
|
||||
"""
|
||||
from rich import print as rprint
|
||||
from core.models import Snapshot
|
||||
|
||||
try:
|
||||
snapshot = Snapshot.objects.get(id=snapshot_id)
|
||||
except Snapshot.DoesNotExist:
|
||||
rprint(f'[red]Snapshot {snapshot_id} not found[/red]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
rprint(f'[blue]Processing Snapshot {snapshot.id} {snapshot.url[:50]} (status={snapshot.status})[/blue]', file=sys.stderr)
|
||||
|
||||
try:
|
||||
snapshot.sm.tick()
|
||||
snapshot.refresh_from_db()
|
||||
rprint(f'[green]Snapshot complete (status={snapshot.status})[/green]', file=sys.stderr)
|
||||
return 0
|
||||
except Exception as e:
|
||||
rprint(f'[red]Snapshot error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
|
||||
def create_snapshots(
|
||||
urls: tuple,
|
||||
depth: int = 0,
|
||||
tag: str = '',
|
||||
plugins: str = '',
|
||||
created_by_id: Optional[int] = None,
|
||||
) -> int:
|
||||
"""
|
||||
Create Snapshots from URLs or JSONL records.
|
||||
|
||||
Reads from args or stdin, creates Snapshot objects, outputs JSONL.
|
||||
If --plugins is passed, also runs specified plugins (blocking).
|
||||
|
||||
Exit codes:
|
||||
0: Success
|
||||
1: Failure
|
||||
"""
|
||||
from rich import print as rprint
|
||||
from django.utils import timezone
|
||||
|
||||
from archivebox.misc.jsonl import (
|
||||
read_args_or_stdin, write_record, snapshot_to_jsonl,
|
||||
TYPE_SNAPSHOT, TYPE_TAG, get_or_create_snapshot
|
||||
)
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
from core.models import Snapshot
|
||||
from crawls.models import Seed, Crawl
|
||||
from archivebox.config import CONSTANTS
|
||||
|
||||
created_by_id = created_by_id or get_or_create_system_user_pk()
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
# Collect all input records
|
||||
records = list(read_args_or_stdin(urls))
|
||||
|
||||
if not records:
|
||||
rprint('[yellow]No URLs provided. Pass URLs as arguments or via stdin.[/yellow]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# If depth > 0, we need a Crawl to manage recursive discovery
|
||||
crawl = None
|
||||
if depth > 0:
|
||||
# Create a seed for this batch
|
||||
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__snapshot.txt'
|
||||
sources_file.parent.mkdir(parents=True, exist_ok=True)
|
||||
sources_file.write_text('\n'.join(r.get('url', '') for r in records if r.get('url')))
|
||||
|
||||
seed = Seed.from_file(
|
||||
sources_file,
|
||||
label=f'snapshot --depth={depth}',
|
||||
created_by=created_by_id,
|
||||
)
|
||||
crawl = Crawl.from_seed(seed, max_depth=depth)
|
||||
|
||||
# Process each record
|
||||
created_snapshots = []
|
||||
for record in records:
|
||||
if record.get('type') != TYPE_SNAPSHOT and 'url' not in record:
|
||||
continue
|
||||
|
||||
try:
|
||||
# Add crawl info if we have one
|
||||
if crawl:
|
||||
record['crawl_id'] = str(crawl.id)
|
||||
record['depth'] = record.get('depth', 0)
|
||||
|
||||
# Add tags if provided via CLI
|
||||
if tag and not record.get('tags'):
|
||||
record['tags'] = tag
|
||||
|
||||
# Get or create the snapshot
|
||||
snapshot = get_or_create_snapshot(record, created_by_id=created_by_id)
|
||||
created_snapshots.append(snapshot)
|
||||
|
||||
# Output JSONL record (only when piped)
|
||||
if not is_tty:
|
||||
write_record(snapshot_to_jsonl(snapshot))
|
||||
|
||||
except Exception as e:
|
||||
rprint(f'[red]Error creating snapshot: {e}[/red]', file=sys.stderr)
|
||||
continue
|
||||
|
||||
if not created_snapshots:
|
||||
rprint('[red]No snapshots created[/red]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
rprint(f'[green]Created {len(created_snapshots)} snapshots[/green]', file=sys.stderr)
|
||||
|
||||
# If TTY, show human-readable output
|
||||
if is_tty:
|
||||
for snapshot in created_snapshots:
|
||||
rprint(f' [dim]{snapshot.id}[/dim] {snapshot.url[:60]}', file=sys.stderr)
|
||||
|
||||
# If --plugins is passed, run the orchestrator for those plugins
|
||||
if plugins:
|
||||
from workers.orchestrator import Orchestrator
|
||||
rprint(f'[blue]Running plugins: {plugins or "all"}...[/blue]', file=sys.stderr)
|
||||
orchestrator = Orchestrator(exit_on_idle=True)
|
||||
orchestrator.runloop()
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
def is_snapshot_id(value: str) -> bool:
|
||||
"""Check if value looks like a Snapshot UUID."""
|
||||
import re
|
||||
uuid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.I)
|
||||
return bool(uuid_pattern.match(value))
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--depth', '-d', type=int, default=0, help='Recursively crawl linked pages up to N levels deep')
|
||||
@click.option('--tag', '-t', default='', help='Comma-separated tags to add to each snapshot')
|
||||
@click.option('--plugins', '-p', default='', help='Comma-separated list of plugins to run after creating snapshots (e.g. title,screenshot)')
|
||||
@click.argument('args', nargs=-1)
|
||||
def main(depth: int, tag: str, plugins: str, args: tuple):
|
||||
"""Create Snapshots from URLs, or process existing Snapshots by ID"""
|
||||
from archivebox.misc.jsonl import read_args_or_stdin
|
||||
|
||||
# Read all input
|
||||
records = list(read_args_or_stdin(args))
|
||||
|
||||
if not records:
|
||||
from rich import print as rprint
|
||||
rprint('[yellow]No URLs or Snapshot IDs provided. Pass as arguments or via stdin.[/yellow]', file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# Check if input looks like existing Snapshot IDs to process
|
||||
# If ALL inputs are UUIDs with no URL, assume we're processing existing Snapshots
|
||||
all_are_ids = all(
|
||||
(r.get('id') and not r.get('url')) or is_snapshot_id(r.get('url', ''))
|
||||
for r in records
|
||||
)
|
||||
|
||||
if all_are_ids:
|
||||
# Process existing Snapshots by ID
|
||||
exit_code = 0
|
||||
for record in records:
|
||||
snapshot_id = record.get('id') or record.get('url')
|
||||
result = process_snapshot_by_id(snapshot_id)
|
||||
if result != 0:
|
||||
exit_code = result
|
||||
sys.exit(exit_code)
|
||||
else:
|
||||
# Create new Snapshots from URLs
|
||||
sys.exit(create_snapshots(args, depth=depth, tag=tag, plugins=plugins))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -10,9 +10,8 @@ from rich import print
|
||||
from archivebox.misc.util import enforce_types, docstring
|
||||
from archivebox.config import DATA_DIR, CONSTANTS, ARCHIVE_DIR
|
||||
from archivebox.config.common import SHELL_CONFIG
|
||||
from archivebox.index.json import parse_json_links_details
|
||||
from archivebox.index import (
|
||||
load_main_index,
|
||||
from archivebox.misc.legacy import parse_json_links_details
|
||||
from archivebox.misc.folders import (
|
||||
get_indexed_folders,
|
||||
get_archived_folders,
|
||||
get_invalid_folders,
|
||||
@@ -33,7 +32,7 @@ def status(out_dir: Path=DATA_DIR) -> None:
|
||||
"""Print out some info and statistics about the archive collection"""
|
||||
|
||||
from django.contrib.auth import get_user_model
|
||||
from archivebox.index.sql import get_admins
|
||||
from archivebox.misc.db import get_admins
|
||||
from core.models import Snapshot
|
||||
User = get_user_model()
|
||||
|
||||
@@ -44,7 +43,7 @@ def status(out_dir: Path=DATA_DIR) -> None:
|
||||
print(f' Index size: {size} across {num_files} files')
|
||||
print()
|
||||
|
||||
links = load_main_index(out_dir=out_dir)
|
||||
links = Snapshot.objects.all()
|
||||
num_sql_links = links.count()
|
||||
num_link_details = sum(1 for link in parse_json_links_details(out_dir=out_dir))
|
||||
print(f' > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {CONSTANTS.SQL_INDEX_FILENAME})')
|
||||
|
||||
@@ -8,8 +8,7 @@ import rich_click as click
|
||||
from typing import Iterable
|
||||
|
||||
from archivebox.misc.util import enforce_types, docstring
|
||||
from archivebox.index import (
|
||||
LINK_FILTERS,
|
||||
from archivebox.misc.folders import (
|
||||
get_indexed_folders,
|
||||
get_archived_folders,
|
||||
get_unarchived_folders,
|
||||
@@ -22,6 +21,16 @@ from archivebox.index import (
|
||||
get_unrecognized_folders,
|
||||
)
|
||||
|
||||
# Filter types for URL matching
|
||||
LINK_FILTERS = {
|
||||
'exact': lambda pattern: {'url': pattern},
|
||||
'substring': lambda pattern: {'url__icontains': pattern},
|
||||
'regex': lambda pattern: {'url__iregex': pattern},
|
||||
'domain': lambda pattern: {'url__istartswith': f'http://{pattern}'},
|
||||
'tag': lambda pattern: {'tags__name': pattern},
|
||||
'timestamp': lambda pattern: {'timestamp': pattern},
|
||||
}
|
||||
|
||||
|
||||
@enforce_types
|
||||
def update(filter_patterns: Iterable[str]=(),
|
||||
@@ -33,15 +42,66 @@ def update(filter_patterns: Iterable[str]=(),
|
||||
after: float | None=None,
|
||||
status: str='indexed',
|
||||
filter_type: str='exact',
|
||||
extract: str="") -> None:
|
||||
plugins: str="",
|
||||
max_workers: int=4) -> None:
|
||||
"""Import any new links from subscriptions and retry any previously failed/skipped links"""
|
||||
|
||||
from rich import print
|
||||
|
||||
from archivebox.config.django import setup_django
|
||||
setup_django()
|
||||
|
||||
from django.utils import timezone
|
||||
from core.models import Snapshot
|
||||
from workers.orchestrator import parallel_archive
|
||||
|
||||
from workers.orchestrator import Orchestrator
|
||||
orchestrator = Orchestrator(exit_on_idle=False)
|
||||
orchestrator.start()
|
||||
# Get snapshots to update based on filters
|
||||
snapshots = Snapshot.objects.all()
|
||||
|
||||
if filter_patterns:
|
||||
snapshots = Snapshot.objects.filter_by_patterns(list(filter_patterns), filter_type)
|
||||
|
||||
if status == 'unarchived':
|
||||
snapshots = snapshots.filter(downloaded_at__isnull=True)
|
||||
elif status == 'archived':
|
||||
snapshots = snapshots.filter(downloaded_at__isnull=False)
|
||||
|
||||
if before:
|
||||
from datetime import datetime
|
||||
snapshots = snapshots.filter(bookmarked_at__lt=datetime.fromtimestamp(before))
|
||||
if after:
|
||||
from datetime import datetime
|
||||
snapshots = snapshots.filter(bookmarked_at__gt=datetime.fromtimestamp(after))
|
||||
|
||||
if resume:
|
||||
snapshots = snapshots.filter(timestamp__gte=str(resume))
|
||||
|
||||
snapshot_ids = list(snapshots.values_list('pk', flat=True))
|
||||
|
||||
if not snapshot_ids:
|
||||
print('[yellow]No snapshots found matching the given filters[/yellow]')
|
||||
return
|
||||
|
||||
print(f'[green]\\[*] Found {len(snapshot_ids)} snapshots to update[/green]')
|
||||
|
||||
if index_only:
|
||||
print('[yellow]Index-only mode - skipping archiving[/yellow]')
|
||||
return
|
||||
|
||||
methods = plugins.split(',') if plugins else None
|
||||
|
||||
# Queue snapshots for archiving via the state machine system
|
||||
# Workers will pick them up and run the plugins
|
||||
if len(snapshot_ids) > 1 and max_workers > 1:
|
||||
parallel_archive(snapshot_ids, max_workers=max_workers, overwrite=overwrite, methods=methods)
|
||||
else:
|
||||
# Queue snapshots by setting status to queued
|
||||
for snapshot in snapshots:
|
||||
Snapshot.objects.filter(id=snapshot.id).update(
|
||||
status=Snapshot.StatusChoices.QUEUED,
|
||||
retry_at=timezone.now(),
|
||||
)
|
||||
print(f'[green]Queued {len(snapshot_ids)} snapshots for archiving[/green]')
|
||||
|
||||
|
||||
@click.command()
|
||||
@@ -71,7 +131,8 @@ Update only links or data directories that have the given status:
|
||||
unrecognized {get_unrecognized_folders.__doc__}
|
||||
''')
|
||||
@click.option('--filter-type', '-t', type=click.Choice([*LINK_FILTERS.keys(), 'search']), default='exact', help='Type of pattern matching to use when filtering URLs')
|
||||
@click.option('--extract', '-e', default='', help='Comma-separated list of extractors to use e.g. title,favicon,screenshot,singlefile,...')
|
||||
@click.option('--plugins', '-p', default='', help='Comma-separated list of plugins to use e.g. title,favicon,screenshot,singlefile,...')
|
||||
@click.option('--max-workers', '-j', type=int, default=4, help='Number of parallel worker processes for archiving')
|
||||
@click.argument('filter_patterns', nargs=-1)
|
||||
@docstring(update.__doc__)
|
||||
def main(**kwargs):
|
||||
|
||||
@@ -3,7 +3,10 @@
|
||||
__package__ = 'archivebox.cli'
|
||||
|
||||
import sys
|
||||
from typing import Iterable
|
||||
import os
|
||||
import platform
|
||||
from pathlib import Path
|
||||
from typing import Iterable, Optional
|
||||
|
||||
import rich_click as click
|
||||
|
||||
@@ -12,7 +15,6 @@ from archivebox.misc.util import docstring, enforce_types
|
||||
|
||||
@enforce_types
|
||||
def version(quiet: bool=False,
|
||||
binproviders: Iterable[str]=(),
|
||||
binaries: Iterable[str]=()) -> list[str]:
|
||||
"""Print the ArchiveBox version, debug metadata, and installed dependency versions"""
|
||||
|
||||
@@ -22,37 +24,24 @@ def version(quiet: bool=False,
|
||||
if quiet or '--version' in sys.argv:
|
||||
return []
|
||||
|
||||
# Only do slower imports when getting full version info
|
||||
import os
|
||||
import platform
|
||||
from pathlib import Path
|
||||
|
||||
from rich.panel import Panel
|
||||
from rich.console import Console
|
||||
from abx_pkg import Binary
|
||||
|
||||
import abx
|
||||
import archivebox
|
||||
from archivebox.config import CONSTANTS, DATA_DIR
|
||||
from archivebox.config.version import get_COMMIT_HASH, get_BUILD_TIME
|
||||
from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, RUNNING_AS_UID, RUNNING_AS_GID, IN_DOCKER
|
||||
from archivebox.config.paths import get_data_locations, get_code_locations
|
||||
from archivebox.config.common import SHELL_CONFIG, STORAGE_CONFIG, SEARCH_BACKEND_CONFIG
|
||||
from archivebox.misc.logging_util import printable_folder_status
|
||||
|
||||
from abx_plugin_default_binproviders import apt, brew, env
|
||||
from archivebox.config.configset import get_config
|
||||
|
||||
console = Console()
|
||||
prnt = console.print
|
||||
|
||||
LDAP_ENABLED = archivebox.pm.hook.get_SCOPE_CONFIG().LDAP_ENABLED
|
||||
# Check if LDAP is enabled (simple config lookup)
|
||||
config = get_config()
|
||||
LDAP_ENABLED = config.get('LDAP_ENABLED', False)
|
||||
|
||||
# 0.7.1
|
||||
# ArchiveBox v0.7.1+editable COMMIT_HASH=951bba5 BUILD_TIME=2023-12-17 16:46:05 1702860365
|
||||
# IN_DOCKER=False IN_QEMU=False ARCH=arm64 OS=Darwin PLATFORM=macOS-14.2-arm64-arm-64bit PYTHON=Cpython
|
||||
# FS_ATOMIC=True FS_REMOTE=False FS_USER=501:20 FS_PERMS=644
|
||||
# DEBUG=False IS_TTY=True TZ=UTC SEARCH_BACKEND=ripgrep LDAP=False
|
||||
|
||||
p = platform.uname()
|
||||
COMMIT_HASH = get_COMMIT_HASH()
|
||||
prnt(
|
||||
@@ -68,15 +57,26 @@ def version(quiet: bool=False,
|
||||
f'PLATFORM={platform.platform()}',
|
||||
f'PYTHON={sys.implementation.name.title()}' + (' (venv)' if CONSTANTS.IS_INSIDE_VENV else ''),
|
||||
)
|
||||
OUTPUT_IS_REMOTE_FS = get_data_locations().DATA_DIR.is_mount or get_data_locations().ARCHIVE_DIR.is_mount
|
||||
DATA_DIR_STAT = CONSTANTS.DATA_DIR.stat()
|
||||
prnt(
|
||||
f'EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}',
|
||||
f'FS_UID={DATA_DIR_STAT.st_uid}:{DATA_DIR_STAT.st_gid}',
|
||||
f'FS_PERMS={STORAGE_CONFIG.OUTPUT_PERMISSIONS}',
|
||||
f'FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}',
|
||||
f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
|
||||
)
|
||||
|
||||
try:
|
||||
OUTPUT_IS_REMOTE_FS = get_data_locations().DATA_DIR.is_mount or get_data_locations().ARCHIVE_DIR.is_mount
|
||||
except Exception:
|
||||
OUTPUT_IS_REMOTE_FS = False
|
||||
|
||||
try:
|
||||
DATA_DIR_STAT = CONSTANTS.DATA_DIR.stat()
|
||||
prnt(
|
||||
f'EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}',
|
||||
f'FS_UID={DATA_DIR_STAT.st_uid}:{DATA_DIR_STAT.st_gid}',
|
||||
f'FS_PERMS={STORAGE_CONFIG.OUTPUT_PERMISSIONS}',
|
||||
f'FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}',
|
||||
f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
|
||||
)
|
||||
except Exception:
|
||||
prnt(
|
||||
f'EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}',
|
||||
)
|
||||
|
||||
prnt(
|
||||
f'DEBUG={SHELL_CONFIG.DEBUG}',
|
||||
f'IS_TTY={SHELL_CONFIG.IS_TTY}',
|
||||
@@ -84,14 +84,11 @@ def version(quiet: bool=False,
|
||||
f'ID={CONSTANTS.MACHINE_ID}:{CONSTANTS.COLLECTION_ID}',
|
||||
f'SEARCH_BACKEND={SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}',
|
||||
f'LDAP={LDAP_ENABLED}',
|
||||
#f'DB=django.db.backends.sqlite3 (({CONFIG["SQLITE_JOURNAL_MODE"]})', # add this if we have more useful info to show eventually
|
||||
)
|
||||
prnt()
|
||||
|
||||
if not (os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) and os.access(CONSTANTS.CONFIG_FILE, os.R_OK)):
|
||||
PANEL_TEXT = '\n'.join((
|
||||
# '',
|
||||
# f'[yellow]CURRENT DIR =[/yellow] [red]{os.getcwd()}[/red]',
|
||||
'',
|
||||
'[violet]Hint:[/violet] [green]cd[/green] into a collection [blue]DATA_DIR[/blue] and run [green]archivebox version[/green] again...',
|
||||
' [grey53]OR[/grey53] run [green]archivebox init[/green] to create a new collection in the current dir.',
|
||||
@@ -105,77 +102,94 @@ def version(quiet: bool=False,
|
||||
|
||||
prnt('[pale_green1][i] Binary Dependencies:[/pale_green1]')
|
||||
failures = []
|
||||
BINARIES = abx.as_dict(archivebox.pm.hook.get_BINARIES())
|
||||
for name, binary in list(BINARIES.items()):
|
||||
if binary.name == 'archivebox':
|
||||
continue
|
||||
|
||||
# skip if the binary is not in the requested list of binaries
|
||||
if binaries and binary.name not in binaries:
|
||||
continue
|
||||
|
||||
# skip if the binary is not supported by any of the requested binproviders
|
||||
if binproviders and binary.binproviders_supported and not any(provider.name in binproviders for provider in binary.binproviders_supported):
|
||||
continue
|
||||
|
||||
err = None
|
||||
try:
|
||||
loaded_bin = binary.load()
|
||||
except Exception as e:
|
||||
err = e
|
||||
loaded_bin = binary
|
||||
provider_summary = f'[dark_sea_green3]{loaded_bin.binprovider.name.ljust(10)}[/dark_sea_green3]' if loaded_bin.binprovider else '[grey23]not found[/grey23] '
|
||||
if loaded_bin.abspath:
|
||||
abspath = str(loaded_bin.abspath).replace(str(DATA_DIR), '[light_slate_blue].[/light_slate_blue]').replace(str(Path('~').expanduser()), '~')
|
||||
if ' ' in abspath:
|
||||
abspath = abspath.replace(' ', r'\ ')
|
||||
else:
|
||||
abspath = f'[red]{err}[/red]'
|
||||
prnt('', '[green]√[/green]' if loaded_bin.is_valid else '[red]X[/red]', '', loaded_bin.name.ljust(21), str(loaded_bin.version).ljust(12), provider_summary, abspath, overflow='ignore', crop=False)
|
||||
if not loaded_bin.is_valid:
|
||||
failures.append(loaded_bin.name)
|
||||
|
||||
prnt()
|
||||
prnt('[gold3][i] Package Managers:[/gold3]')
|
||||
BINPROVIDERS = abx.as_dict(archivebox.pm.hook.get_BINPROVIDERS())
|
||||
for name, binprovider in list(BINPROVIDERS.items()):
|
||||
err = None
|
||||
|
||||
if binproviders and binprovider.name not in binproviders:
|
||||
continue
|
||||
|
||||
# TODO: implement a BinProvider.BINARY() method that gets the loaded binary for a binprovider's INSTALLER_BIN
|
||||
loaded_bin = binprovider.INSTALLER_BINARY or Binary(name=binprovider.INSTALLER_BIN, binproviders=[env, apt, brew])
|
||||
|
||||
abspath = str(loaded_bin.abspath).replace(str(DATA_DIR), '[light_slate_blue].[/light_slate_blue]').replace(str(Path('~').expanduser()), '~')
|
||||
abspath = None
|
||||
if loaded_bin.abspath:
|
||||
abspath = str(loaded_bin.abspath).replace(str(DATA_DIR), '.').replace(str(Path('~').expanduser()), '~')
|
||||
if ' ' in abspath:
|
||||
abspath = abspath.replace(' ', r'\ ')
|
||||
|
||||
PATH = str(binprovider.PATH).replace(str(DATA_DIR), '[light_slate_blue].[/light_slate_blue]').replace(str(Path('~').expanduser()), '~')
|
||||
ownership_summary = f'UID=[blue]{str(binprovider.EUID).ljust(4)}[/blue]'
|
||||
provider_summary = f'[dark_sea_green3]{str(abspath).ljust(52)}[/dark_sea_green3]' if abspath else f'[grey23]{"not available".ljust(52)}[/grey23]'
|
||||
prnt('', '[green]√[/green]' if binprovider.is_valid else '[grey53]-[/grey53]', '', binprovider.name.ljust(11), provider_summary, ownership_summary, f'PATH={PATH}', overflow='ellipsis', soft_wrap=True)
|
||||
|
||||
if not (binaries or binproviders):
|
||||
# dont show source code / data dir info if we just want to get version info for a binary or binprovider
|
||||
|
||||
# Setup Django before importing models
|
||||
from archivebox.config.django import setup_django
|
||||
setup_django()
|
||||
|
||||
from machine.models import Machine, InstalledBinary
|
||||
|
||||
machine = Machine.current()
|
||||
|
||||
# Get all *_BINARY config values
|
||||
binary_config_keys = [key for key in config.keys() if key.endswith('_BINARY')]
|
||||
|
||||
if not binary_config_keys:
|
||||
prnt('', '[grey53]No binary dependencies defined in config.[/grey53]')
|
||||
else:
|
||||
for key in sorted(set(binary_config_keys)):
|
||||
# Get the actual binary name/path from config value
|
||||
bin_value = config.get(key, '').strip()
|
||||
if not bin_value:
|
||||
continue
|
||||
|
||||
# Check if it's a path (has slashes) or just a name
|
||||
is_path = '/' in bin_value
|
||||
|
||||
if is_path:
|
||||
# It's a full path - match against abspath
|
||||
bin_name = Path(bin_value).name
|
||||
# Skip if user specified specific binaries and this isn't one
|
||||
if binaries and bin_name not in binaries:
|
||||
continue
|
||||
# Find InstalledBinary where abspath ends with this path
|
||||
installed = InstalledBinary.objects.filter(
|
||||
machine=machine,
|
||||
abspath__endswith=bin_value,
|
||||
).exclude(abspath='').exclude(abspath__isnull=True).order_by('-modified_at').first()
|
||||
else:
|
||||
# It's just a binary name - match against name
|
||||
bin_name = bin_value
|
||||
# Skip if user specified specific binaries and this isn't one
|
||||
if binaries and bin_name not in binaries:
|
||||
continue
|
||||
# Find InstalledBinary by name
|
||||
installed = InstalledBinary.objects.filter(
|
||||
machine=machine,
|
||||
name__iexact=bin_name,
|
||||
).exclude(abspath='').exclude(abspath__isnull=True).order_by('-modified_at').first()
|
||||
|
||||
if installed and installed.is_valid:
|
||||
display_path = installed.abspath.replace(str(DATA_DIR), '.').replace(str(Path('~').expanduser()), '~')
|
||||
version_str = (installed.version or 'unknown')[:15]
|
||||
provider = (installed.binprovider or 'env')[:8]
|
||||
prnt('', '[green]√[/green]', '', bin_name.ljust(18), version_str.ljust(16), provider.ljust(8), display_path, overflow='ignore', crop=False)
|
||||
else:
|
||||
prnt('', '[red]X[/red]', '', bin_name.ljust(18), '[grey53]not installed[/grey53]', overflow='ignore', crop=False)
|
||||
failures.append(bin_name)
|
||||
|
||||
# Show hint if no binaries are installed yet
|
||||
has_any_installed = InstalledBinary.objects.filter(machine=machine).exclude(abspath='').exists()
|
||||
if not has_any_installed:
|
||||
prnt()
|
||||
prnt('', '[grey53]Run [green]archivebox install[/green] to detect and install dependencies.[/grey53]')
|
||||
|
||||
if not binaries:
|
||||
# Show code and data locations
|
||||
prnt()
|
||||
prnt('[deep_sky_blue3][i] Code locations:[/deep_sky_blue3]')
|
||||
for name, path in get_code_locations().items():
|
||||
prnt(printable_folder_status(name, path), overflow='ignore', crop=False)
|
||||
try:
|
||||
for name, path in get_code_locations().items():
|
||||
if isinstance(path, dict):
|
||||
prnt(printable_folder_status(name, path), overflow='ignore', crop=False)
|
||||
except Exception as e:
|
||||
prnt(f' [red]Error getting code locations: {e}[/red]')
|
||||
|
||||
prnt()
|
||||
if os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) or os.access(CONSTANTS.CONFIG_FILE, os.R_OK):
|
||||
prnt('[bright_yellow][i] Data locations:[/bright_yellow]')
|
||||
for name, path in get_data_locations().items():
|
||||
prnt(printable_folder_status(name, path), overflow='ignore', crop=False)
|
||||
|
||||
from archivebox.misc.checks import check_data_dir_permissions
|
||||
try:
|
||||
for name, path in get_data_locations().items():
|
||||
if isinstance(path, dict):
|
||||
prnt(printable_folder_status(name, path), overflow='ignore', crop=False)
|
||||
except Exception as e:
|
||||
prnt(f' [red]Error getting data locations: {e}[/red]')
|
||||
|
||||
check_data_dir_permissions()
|
||||
try:
|
||||
from archivebox.misc.checks import check_data_dir_permissions
|
||||
check_data_dir_permissions()
|
||||
except Exception:
|
||||
pass
|
||||
else:
|
||||
prnt()
|
||||
prnt('[red][i] Data locations:[/red] (not in a data directory)')
|
||||
@@ -194,7 +208,6 @@ def version(quiet: bool=False,
|
||||
|
||||
@click.command()
|
||||
@click.option('--quiet', '-q', is_flag=True, help='Only print ArchiveBox version number and nothing else. (equivalent to archivebox --version)')
|
||||
@click.option('--binproviders', '-p', help='Select binproviders to detect DEFAULT=env,apt,brew,sys_pip,venv_pip,lib_pip,pipx,sys_npm,lib_npm,puppeteer,playwright (all)')
|
||||
@click.option('--binaries', '-b', help='Select binaries to detect DEFAULT=curl,wget,git,yt-dlp,chrome,single-file,readability-extractor,postlight-parser,... (all)')
|
||||
@docstring(version.__doc__)
|
||||
def main(**kwargs):
|
||||
|
||||
@@ -4,29 +4,46 @@ __package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox worker'
|
||||
|
||||
import sys
|
||||
import json
|
||||
|
||||
import rich_click as click
|
||||
|
||||
from archivebox.misc.util import docstring
|
||||
|
||||
|
||||
def worker(worker_type: str, daemon: bool = False, plugin: str | None = None):
|
||||
"""
|
||||
Start a worker process to process items from the queue.
|
||||
|
||||
Worker types:
|
||||
- crawl: Process Crawl objects (parse seeds, create snapshots)
|
||||
- snapshot: Process Snapshot objects (create archive results)
|
||||
- archiveresult: Process ArchiveResult objects (run plugins)
|
||||
|
||||
Workers poll the database for queued items, claim them atomically,
|
||||
and spawn subprocess tasks to handle each item.
|
||||
"""
|
||||
from workers.worker import get_worker_class
|
||||
|
||||
WorkerClass = get_worker_class(worker_type)
|
||||
|
||||
# Build kwargs
|
||||
kwargs = {'daemon': daemon}
|
||||
if plugin and worker_type == 'archiveresult':
|
||||
kwargs['extractor'] = plugin # internal field still called extractor
|
||||
|
||||
# Create and run worker
|
||||
worker_instance = WorkerClass(**kwargs)
|
||||
worker_instance.runloop()
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.argument('worker_type')
|
||||
@click.option('--wait-for-first-event', is_flag=True)
|
||||
@click.option('--exit-on-idle', is_flag=True)
|
||||
def main(worker_type: str, wait_for_first_event: bool, exit_on_idle: bool):
|
||||
"""Start an ArchiveBox worker process of the given type"""
|
||||
|
||||
from workers.worker import get_worker_type
|
||||
|
||||
# allow piping in events to process from stdin
|
||||
# if not sys.stdin.isatty():
|
||||
# for line in sys.stdin.readlines():
|
||||
# Event.dispatch(event=json.loads(line), parent=None)
|
||||
|
||||
# run the actor
|
||||
Worker = get_worker_type(worker_type)
|
||||
for event in Worker.run(wait_for_first_event=wait_for_first_event, exit_on_idle=exit_on_idle):
|
||||
print(event)
|
||||
@click.argument('worker_type', type=click.Choice(['crawl', 'snapshot', 'archiveresult']))
|
||||
@click.option('--daemon', '-d', is_flag=True, help="Run forever (don't exit on idle)")
|
||||
@click.option('--plugin', '-p', default=None, help='Filter by plugin (archiveresult only)')
|
||||
@docstring(worker.__doc__)
|
||||
def main(worker_type: str, daemon: bool, plugin: str | None):
|
||||
"""Start an ArchiveBox worker process"""
|
||||
worker(worker_type, daemon=daemon, plugin=plugin)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
@@ -31,7 +31,6 @@ DATA_DIR = 'data.tests'
|
||||
os.environ.update(TEST_CONFIG)
|
||||
|
||||
from ..main import init
|
||||
from ..index import load_main_index
|
||||
from archivebox.config.constants import (
|
||||
SQL_INDEX_FILENAME,
|
||||
JSON_INDEX_FILENAME,
|
||||
|
||||
966
archivebox/cli/tests_piping.py
Normal file
966
archivebox/cli/tests_piping.py
Normal file
@@ -0,0 +1,966 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Tests for CLI piping workflow: crawl | snapshot | extract
|
||||
|
||||
This module tests the JSONL-based piping between CLI commands as described in:
|
||||
https://github.com/ArchiveBox/ArchiveBox/issues/1363
|
||||
|
||||
Workflows tested:
|
||||
archivebox snapshot URL | archivebox extract
|
||||
archivebox crawl URL | archivebox snapshot | archivebox extract
|
||||
archivebox crawl --plugin=PARSER URL | archivebox snapshot | archivebox extract
|
||||
|
||||
Each command should:
|
||||
- Accept URLs, snapshot_ids, or JSONL as input (args or stdin)
|
||||
- Output JSONL to stdout when piped (not TTY)
|
||||
- Output human-readable to stderr when TTY
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import shutil
|
||||
import tempfile
|
||||
import unittest
|
||||
from io import StringIO
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
# Test configuration - disable slow extractors
|
||||
TEST_CONFIG = {
|
||||
'USE_COLOR': 'False',
|
||||
'SHOW_PROGRESS': 'False',
|
||||
'SAVE_ARCHIVE_DOT_ORG': 'False',
|
||||
'SAVE_TITLE': 'True', # Fast extractor
|
||||
'SAVE_FAVICON': 'False',
|
||||
'SAVE_WGET': 'False',
|
||||
'SAVE_WARC': 'False',
|
||||
'SAVE_PDF': 'False',
|
||||
'SAVE_SCREENSHOT': 'False',
|
||||
'SAVE_DOM': 'False',
|
||||
'SAVE_SINGLEFILE': 'False',
|
||||
'SAVE_READABILITY': 'False',
|
||||
'SAVE_MERCURY': 'False',
|
||||
'SAVE_GIT': 'False',
|
||||
'SAVE_MEDIA': 'False',
|
||||
'SAVE_HEADERS': 'False',
|
||||
'USE_CURL': 'False',
|
||||
'USE_WGET': 'False',
|
||||
'USE_GIT': 'False',
|
||||
'USE_CHROME': 'False',
|
||||
'USE_YOUTUBEDL': 'False',
|
||||
'USE_NODE': 'False',
|
||||
}
|
||||
|
||||
os.environ.update(TEST_CONFIG)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# JSONL Utility Tests
|
||||
# =============================================================================
|
||||
|
||||
class TestJSONLParsing(unittest.TestCase):
    """Exercise the JSONL input-parsing helper parse_line()."""

    def test_parse_plain_url(self):
        """A bare URL line is turned into a Snapshot-typed record."""
        from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT

        record = parse_line('https://example.com')

        self.assertIsNotNone(record)
        self.assertEqual(record['type'], TYPE_SNAPSHOT)
        self.assertEqual(record['url'], 'https://example.com')

    def test_parse_jsonl_snapshot(self):
        """A JSONL Snapshot line keeps every field it carried."""
        from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT

        record = parse_line('{"type": "Snapshot", "url": "https://example.com", "tags": "test,demo"}')

        self.assertIsNotNone(record)
        self.assertEqual(record['type'], TYPE_SNAPSHOT)
        self.assertEqual(record['url'], 'https://example.com')
        self.assertEqual(record['tags'], 'test,demo')

    def test_parse_jsonl_with_id(self):
        """A JSONL line carrying an id field keeps that id."""
        from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT

        record = parse_line('{"type": "Snapshot", "id": "abc123", "url": "https://example.com"}')

        self.assertIsNotNone(record)
        self.assertEqual(record['id'], 'abc123')
        self.assertEqual(record['url'], 'https://example.com')

    def test_parse_uuid_as_snapshot_id(self):
        """A bare UUID is interpreted as an existing snapshot's id."""
        from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT

        uuid = '01234567-89ab-cdef-0123-456789abcdef'
        record = parse_line(uuid)

        self.assertIsNotNone(record)
        self.assertEqual(record['type'], TYPE_SNAPSHOT)
        self.assertEqual(record['id'], uuid)

    def test_parse_empty_line(self):
        """Blank or whitespace-only lines yield None."""
        from archivebox.misc.jsonl import parse_line

        self.assertIsNone(parse_line(''))
        self.assertIsNone(parse_line(' '))
        self.assertIsNone(parse_line('\n'))

    def test_parse_comment_line(self):
        """Comment lines (even indented ones) yield None."""
        from archivebox.misc.jsonl import parse_line

        self.assertIsNone(parse_line('# This is a comment'))
        self.assertIsNone(parse_line(' # Indented comment'))

    def test_parse_invalid_url(self):
        """Non-URL strings and unsupported schemes yield None."""
        from archivebox.misc.jsonl import parse_line

        self.assertIsNone(parse_line('not-a-url'))
        self.assertIsNone(parse_line('ftp://example.com'))  # Only http/https/file

    def test_parse_file_url(self):
        """file:// URLs are accepted as Snapshot records."""
        from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT

        record = parse_line('file:///path/to/file.txt')

        self.assertIsNotNone(record)
        self.assertEqual(record['type'], TYPE_SNAPSHOT)
        self.assertEqual(record['url'], 'file:///path/to/file.txt')
|
||||
|
||||
|
||||
class TestJSONLOutput(unittest.TestCase):
    """Exercise JSONL serialization of Snapshot / ArchiveResult models."""

    def test_snapshot_to_jsonl(self):
        """A Snapshot serializes with type, id, url, and title intact."""
        from archivebox.misc.jsonl import snapshot_to_jsonl, TYPE_SNAPSHOT

        # Configure all attributes the serializer reads via Mock kwargs,
        # including the tags_str() method's return value.
        fake_snapshot = MagicMock(
            id='test-uuid-1234',
            url='https://example.com',
            title='Example Title',
            bookmarked_at=None,
            created_at=None,
            timestamp='1234567890',
            depth=0,
            status='queued',
            **{'tags_str.return_value': 'tag1,tag2'},
        )

        serialized = snapshot_to_jsonl(fake_snapshot)

        self.assertEqual(serialized['type'], TYPE_SNAPSHOT)
        self.assertEqual(serialized['id'], 'test-uuid-1234')
        self.assertEqual(serialized['url'], 'https://example.com')
        self.assertEqual(serialized['title'], 'Example Title')

    def test_archiveresult_to_jsonl(self):
        """An ArchiveResult serializes with its ids, extractor, and status."""
        from archivebox.misc.jsonl import archiveresult_to_jsonl, TYPE_ARCHIVERESULT

        fake_result = MagicMock(
            id='result-uuid-5678',
            snapshot_id='snapshot-uuid-1234',
            extractor='title',
            status='succeeded',
            output='Example Title',
            start_ts=None,
            end_ts=None,
        )

        serialized = archiveresult_to_jsonl(fake_result)

        self.assertEqual(serialized['type'], TYPE_ARCHIVERESULT)
        self.assertEqual(serialized['id'], 'result-uuid-5678')
        self.assertEqual(serialized['snapshot_id'], 'snapshot-uuid-1234')
        self.assertEqual(serialized['extractor'], 'title')
        self.assertEqual(serialized['status'], 'succeeded')
|
||||
|
||||
|
||||
class TestReadArgsOrStdin(unittest.TestCase):
    """Reading input records from CLI args or from piped stdin."""

    @staticmethod
    def _fake_pipe(content, tty=False):
        """Build a StringIO whose isatty() reports the given TTY state."""
        stream = StringIO(content)
        stream.isatty = lambda: tty
        return stream

    def test_read_from_args(self):
        """URLs passed as CLI args are parsed in order."""
        from archivebox.misc.jsonl import read_args_or_stdin

        parsed = list(read_args_or_stdin(('https://example1.com', 'https://example2.com')))

        self.assertEqual(len(parsed), 2)
        self.assertEqual(parsed[0]['url'], 'https://example1.com')
        self.assertEqual(parsed[1]['url'], 'https://example2.com')

    def test_read_from_stdin(self):
        """With no args, URLs are read line-by-line from piped (non-TTY) stdin."""
        from archivebox.misc.jsonl import read_args_or_stdin

        pipe = self._fake_pipe('https://example1.com\nhttps://example2.com\n')
        parsed = list(read_args_or_stdin((), stream=pipe))

        self.assertEqual(len(parsed), 2)
        self.assertEqual(parsed[0]['url'], 'https://example1.com')
        self.assertEqual(parsed[1]['url'], 'https://example2.com')

    def test_read_jsonl_from_stdin(self):
        """JSONL records on stdin keep their extra fields."""
        from archivebox.misc.jsonl import read_args_or_stdin

        pipe = self._fake_pipe('{"type": "Snapshot", "url": "https://example.com", "tags": "test"}\n')
        parsed = list(read_args_or_stdin((), stream=pipe))

        self.assertEqual(len(parsed), 1)
        self.assertEqual(parsed[0]['url'], 'https://example.com')
        self.assertEqual(parsed[0]['tags'], 'test')

    def test_skip_tty_stdin(self):
        """An interactive (TTY) stdin is never read - reading it would block."""
        from archivebox.misc.jsonl import read_args_or_stdin

        pipe = self._fake_pipe('https://example.com', tty=True)
        parsed = list(read_args_or_stdin((), stream=pipe))

        self.assertEqual(len(parsed), 0)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Unit Tests for Individual Commands
|
||||
# =============================================================================
|
||||
|
||||
class TestCrawlCommand(unittest.TestCase):
|
||||
"""Unit tests for archivebox crawl command."""
|
||||
|
||||
def setUp(self):
|
||||
"""Set up test environment."""
|
||||
self.test_dir = tempfile.mkdtemp()
|
||||
os.environ['DATA_DIR'] = self.test_dir
|
||||
|
||||
def tearDown(self):
|
||||
"""Clean up test environment."""
|
||||
shutil.rmtree(self.test_dir, ignore_errors=True)
|
||||
|
||||
def test_crawl_accepts_url(self):
|
||||
"""crawl should accept URLs as input."""
|
||||
from archivebox.misc.jsonl import read_args_or_stdin
|
||||
|
||||
args = ('https://example.com',)
|
||||
records = list(read_args_or_stdin(args))
|
||||
|
||||
self.assertEqual(len(records), 1)
|
||||
self.assertEqual(records[0]['url'], 'https://example.com')
|
||||
|
||||
def test_crawl_accepts_snapshot_id(self):
|
||||
"""crawl should accept snapshot IDs as input."""
|
||||
from archivebox.misc.jsonl import read_args_or_stdin
|
||||
|
||||
uuid = '01234567-89ab-cdef-0123-456789abcdef'
|
||||
args = (uuid,)
|
||||
records = list(read_args_or_stdin(args))
|
||||
|
||||
self.assertEqual(len(records), 1)
|
||||
self.assertEqual(records[0]['id'], uuid)
|
||||
|
||||
def test_crawl_accepts_jsonl(self):
|
||||
"""crawl should accept JSONL with snapshot info."""
|
||||
from archivebox.misc.jsonl import read_args_or_stdin
|
||||
|
||||
stdin = StringIO('{"type": "Snapshot", "id": "abc123", "url": "https://example.com"}\n')
|
||||
stdin.isatty = lambda: False
|
||||
|
||||
records = list(read_args_or_stdin((), stream=stdin))
|
||||
|
||||
self.assertEqual(len(records), 1)
|
||||
self.assertEqual(records[0]['id'], 'abc123')
|
||||
self.assertEqual(records[0]['url'], 'https://example.com')
|
||||
|
||||
def test_crawl_separates_existing_vs_new(self):
|
||||
"""crawl should identify existing snapshots vs new URLs."""
|
||||
# This tests the logic in discover_outlinks() that separates
|
||||
# records with 'id' (existing) from records with just 'url' (new)
|
||||
|
||||
records = [
|
||||
{'type': 'Snapshot', 'id': 'existing-id-1'}, # Existing (id only)
|
||||
{'type': 'Snapshot', 'url': 'https://new-url.com'}, # New (url only)
|
||||
{'type': 'Snapshot', 'id': 'existing-id-2', 'url': 'https://existing.com'}, # Existing (has id)
|
||||
]
|
||||
|
||||
existing = []
|
||||
new = []
|
||||
|
||||
for record in records:
|
||||
if record.get('id') and not record.get('url'):
|
||||
existing.append(record['id'])
|
||||
elif record.get('id'):
|
||||
existing.append(record['id']) # Has both id and url - treat as existing
|
||||
elif record.get('url'):
|
||||
new.append(record)
|
||||
|
||||
self.assertEqual(len(existing), 2)
|
||||
self.assertEqual(len(new), 1)
|
||||
self.assertEqual(new[0]['url'], 'https://new-url.com')
|
||||
|
||||
|
||||
class TestSnapshotCommand(unittest.TestCase):
|
||||
"""Unit tests for archivebox snapshot command."""
|
||||
|
||||
def setUp(self):
|
||||
"""Set up test environment."""
|
||||
self.test_dir = tempfile.mkdtemp()
|
||||
os.environ['DATA_DIR'] = self.test_dir
|
||||
|
||||
def tearDown(self):
|
||||
"""Clean up test environment."""
|
||||
shutil.rmtree(self.test_dir, ignore_errors=True)
|
||||
|
||||
def test_snapshot_accepts_url(self):
|
||||
"""snapshot should accept URLs as input."""
|
||||
from archivebox.misc.jsonl import read_args_or_stdin
|
||||
|
||||
args = ('https://example.com',)
|
||||
records = list(read_args_or_stdin(args))
|
||||
|
||||
self.assertEqual(len(records), 1)
|
||||
self.assertEqual(records[0]['url'], 'https://example.com')
|
||||
|
||||
def test_snapshot_accepts_jsonl_with_metadata(self):
|
||||
"""snapshot should accept JSONL with tags and other metadata."""
|
||||
from archivebox.misc.jsonl import read_args_or_stdin
|
||||
|
||||
stdin = StringIO('{"type": "Snapshot", "url": "https://example.com", "tags": "tag1,tag2", "title": "Test"}\n')
|
||||
stdin.isatty = lambda: False
|
||||
|
||||
records = list(read_args_or_stdin((), stream=stdin))
|
||||
|
||||
self.assertEqual(len(records), 1)
|
||||
self.assertEqual(records[0]['url'], 'https://example.com')
|
||||
self.assertEqual(records[0]['tags'], 'tag1,tag2')
|
||||
self.assertEqual(records[0]['title'], 'Test')
|
||||
|
||||
def test_snapshot_output_format(self):
|
||||
"""snapshot output should include id and url."""
|
||||
from archivebox.misc.jsonl import snapshot_to_jsonl
|
||||
|
||||
mock_snapshot = MagicMock()
|
||||
mock_snapshot.id = 'test-id'
|
||||
mock_snapshot.url = 'https://example.com'
|
||||
mock_snapshot.title = 'Test'
|
||||
mock_snapshot.tags_str.return_value = ''
|
||||
mock_snapshot.bookmarked_at = None
|
||||
mock_snapshot.created_at = None
|
||||
mock_snapshot.timestamp = '123'
|
||||
mock_snapshot.depth = 0
|
||||
mock_snapshot.status = 'queued'
|
||||
|
||||
output = snapshot_to_jsonl(mock_snapshot)
|
||||
|
||||
self.assertIn('id', output)
|
||||
self.assertIn('url', output)
|
||||
self.assertEqual(output['type'], 'Snapshot')
|
||||
|
||||
|
||||
class TestExtractCommand(unittest.TestCase):
|
||||
"""Unit tests for archivebox extract command."""
|
||||
|
||||
def setUp(self):
|
||||
"""Set up test environment."""
|
||||
self.test_dir = tempfile.mkdtemp()
|
||||
os.environ['DATA_DIR'] = self.test_dir
|
||||
|
||||
def tearDown(self):
|
||||
"""Clean up test environment."""
|
||||
shutil.rmtree(self.test_dir, ignore_errors=True)
|
||||
|
||||
def test_extract_accepts_snapshot_id(self):
|
||||
"""extract should accept snapshot IDs as input."""
|
||||
from archivebox.misc.jsonl import read_args_or_stdin
|
||||
|
||||
uuid = '01234567-89ab-cdef-0123-456789abcdef'
|
||||
args = (uuid,)
|
||||
records = list(read_args_or_stdin(args))
|
||||
|
||||
self.assertEqual(len(records), 1)
|
||||
self.assertEqual(records[0]['id'], uuid)
|
||||
|
||||
def test_extract_accepts_jsonl_snapshot(self):
|
||||
"""extract should accept JSONL Snapshot records."""
|
||||
from archivebox.misc.jsonl import read_args_or_stdin, TYPE_SNAPSHOT
|
||||
|
||||
stdin = StringIO('{"type": "Snapshot", "id": "abc123", "url": "https://example.com"}\n')
|
||||
stdin.isatty = lambda: False
|
||||
|
||||
records = list(read_args_or_stdin((), stream=stdin))
|
||||
|
||||
self.assertEqual(len(records), 1)
|
||||
self.assertEqual(records[0]['type'], TYPE_SNAPSHOT)
|
||||
self.assertEqual(records[0]['id'], 'abc123')
|
||||
|
||||
def test_extract_gathers_snapshot_ids(self):
|
||||
"""extract should gather snapshot IDs from various input formats."""
|
||||
from archivebox.misc.jsonl import TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
|
||||
|
||||
records = [
|
||||
{'type': TYPE_SNAPSHOT, 'id': 'snap-1'},
|
||||
{'type': TYPE_SNAPSHOT, 'id': 'snap-2', 'url': 'https://example.com'},
|
||||
{'type': TYPE_ARCHIVERESULT, 'snapshot_id': 'snap-3'},
|
||||
{'id': 'snap-4'}, # Bare id
|
||||
]
|
||||
|
||||
snapshot_ids = set()
|
||||
for record in records:
|
||||
record_type = record.get('type')
|
||||
|
||||
if record_type == TYPE_SNAPSHOT:
|
||||
snapshot_id = record.get('id')
|
||||
if snapshot_id:
|
||||
snapshot_ids.add(snapshot_id)
|
||||
elif record_type == TYPE_ARCHIVERESULT:
|
||||
snapshot_id = record.get('snapshot_id')
|
||||
if snapshot_id:
|
||||
snapshot_ids.add(snapshot_id)
|
||||
elif 'id' in record:
|
||||
snapshot_ids.add(record['id'])
|
||||
|
||||
self.assertEqual(len(snapshot_ids), 4)
|
||||
self.assertIn('snap-1', snapshot_ids)
|
||||
self.assertIn('snap-2', snapshot_ids)
|
||||
self.assertIn('snap-3', snapshot_ids)
|
||||
self.assertIn('snap-4', snapshot_ids)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# URL Collection Tests
|
||||
# =============================================================================
|
||||
|
||||
class TestURLCollection(unittest.TestCase):
    """Collecting urls.jsonl files written by extractor subdirectories."""

    def setUp(self):
        """Build a fake snapshot dir: two extractors with urls.jsonl, one without."""
        self.test_dir = Path(tempfile.mkdtemp())

        wget_dir = self.test_dir / 'wget'
        wget_dir.mkdir()
        (wget_dir / 'urls.jsonl').write_text(
            '{"url": "https://wget-link-1.com"}\n'
            '{"url": "https://wget-link-2.com"}\n'
        )

        html_dir = self.test_dir / 'parse_html_urls'
        html_dir.mkdir()
        (html_dir / 'urls.jsonl').write_text(
            '{"url": "https://html-link-1.com"}\n'
            '{"url": "https://html-link-2.com", "title": "HTML Link 2"}\n'
        )

        # screenshot/ exists but writes no urls.jsonl - it is not a parser
        (self.test_dir / 'screenshot').mkdir()

    def tearDown(self):
        """Remove the scratch directory."""
        shutil.rmtree(self.test_dir, ignore_errors=True)

    def test_collect_urls_from_extractors(self):
        """Every urls.jsonl entry is collected and tagged with its extractor."""
        from archivebox.hooks import collect_urls_from_extractors

        entries = collect_urls_from_extractors(self.test_dir)

        self.assertEqual(len(entries), 4)

        sources = {entry['via_extractor'] for entry in entries}
        self.assertIn('wget', sources)
        self.assertIn('parse_html_urls', sources)
        self.assertNotIn('screenshot', sources)  # No urls.jsonl

    def test_collect_urls_preserves_metadata(self):
        """Extra fields (e.g. title) in urls.jsonl survive collection."""
        from archivebox.hooks import collect_urls_from_extractors

        entries = collect_urls_from_extractors(self.test_dir)

        with_title = [entry for entry in entries if entry.get('title') == 'HTML Link 2']
        self.assertEqual(len(with_title), 1)
        self.assertEqual(with_title[0]['url'], 'https://html-link-2.com')

    def test_collect_urls_empty_dir(self):
        """A missing directory yields an empty collection."""
        from archivebox.hooks import collect_urls_from_extractors

        missing_dir = self.test_dir / 'nonexistent'
        entries = collect_urls_from_extractors(missing_dir)

        self.assertEqual(len(entries), 0)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Integration Tests
|
||||
# =============================================================================
|
||||
|
||||
class TestPipingWorkflowIntegration(unittest.TestCase):
    """
    Integration tests for the complete piping workflow.

    These tests require Django to be set up and use the actual database.
    """

    @classmethod
    def setUpClass(cls):
        """Set up Django and an initialized archive inside a temp DATA_DIR."""
        cls.test_dir = tempfile.mkdtemp()
        # Fix: remember the previous DATA_DIR so tearDownClass can restore it,
        # instead of leaking the temp path into later test classes.
        cls._prev_data_dir = os.environ.get('DATA_DIR')
        os.environ['DATA_DIR'] = cls.test_dir

        # Initialize Django
        from archivebox.config.django import setup_django
        setup_django()

        # Initialize the archive
        from archivebox.cli.archivebox_init import init
        init()

    @classmethod
    def tearDownClass(cls):
        """Restore DATA_DIR and delete the test archive."""
        if cls._prev_data_dir is None:
            os.environ.pop('DATA_DIR', None)
        else:
            os.environ['DATA_DIR'] = cls._prev_data_dir
        shutil.rmtree(cls.test_dir, ignore_errors=True)

    def test_snapshot_creates_and_outputs_jsonl(self):
        """
        Test: archivebox snapshot URL
        Should create a Snapshot and output JSONL when piped.
        """
        # Fix: dropped unused imports (core.models.Snapshot, write_record).
        from archivebox.misc.jsonl import (
            read_args_or_stdin, snapshot_to_jsonl,
            TYPE_SNAPSHOT, get_or_create_snapshot,
        )
        from archivebox.base_models.models import get_or_create_system_user_pk

        created_by_id = get_or_create_system_user_pk()

        # Simulate input
        url = 'https://test-snapshot-1.example.com'
        records = list(read_args_or_stdin((url,)))

        self.assertEqual(len(records), 1)
        self.assertEqual(records[0]['url'], url)

        # Create snapshot
        snapshot = get_or_create_snapshot(records[0], created_by_id=created_by_id)

        self.assertIsNotNone(snapshot.id)
        self.assertEqual(snapshot.url, url)

        # Verify output format
        output = snapshot_to_jsonl(snapshot)
        self.assertEqual(output['type'], TYPE_SNAPSHOT)
        self.assertIn('id', output)
        self.assertEqual(output['url'], url)

    def test_extract_accepts_snapshot_from_previous_command(self):
        """
        Test: archivebox snapshot URL | archivebox extract
        Extract should accept JSONL output from snapshot command.
        """
        # Fix: dropped unused imports (core.models.Snapshot, ArchiveResult).
        from archivebox.misc.jsonl import (
            snapshot_to_jsonl, read_args_or_stdin, get_or_create_snapshot,
            TYPE_SNAPSHOT,
        )
        from archivebox.base_models.models import get_or_create_system_user_pk

        created_by_id = get_or_create_system_user_pk()

        # Step 1: Create snapshot (simulating 'archivebox snapshot')
        url = 'https://test-extract-1.example.com'
        snapshot = get_or_create_snapshot({'url': url}, created_by_id=created_by_id)
        snapshot_output = snapshot_to_jsonl(snapshot)

        # Step 2: Parse snapshot output as extract input
        stdin = StringIO(json.dumps(snapshot_output) + '\n')
        stdin.isatty = lambda: False

        records = list(read_args_or_stdin((), stream=stdin))

        self.assertEqual(len(records), 1)
        self.assertEqual(records[0]['type'], TYPE_SNAPSHOT)
        self.assertEqual(records[0]['id'], str(snapshot.id))

        # Step 3: Gather snapshot IDs (as extract does)
        snapshot_ids = set()
        for record in records:
            if record.get('type') == TYPE_SNAPSHOT and record.get('id'):
                snapshot_ids.add(record['id'])

        self.assertIn(str(snapshot.id), snapshot_ids)

    def test_crawl_outputs_discovered_urls(self):
        """
        Test: archivebox crawl URL
        Should create snapshot, run plugins, output discovered URLs.
        """
        from archivebox.hooks import collect_urls_from_extractors
        from archivebox.misc.jsonl import TYPE_SNAPSHOT

        # Create a mock snapshot directory with urls.jsonl
        test_snapshot_dir = Path(self.test_dir) / 'archive' / 'test-crawl-snapshot'
        test_snapshot_dir.mkdir(parents=True, exist_ok=True)

        # Create mock extractor output
        (test_snapshot_dir / 'parse_html_urls').mkdir()
        (test_snapshot_dir / 'parse_html_urls' / 'urls.jsonl').write_text(
            '{"url": "https://discovered-1.com"}\n'
            '{"url": "https://discovered-2.com", "title": "Discovered 2"}\n'
        )

        # Collect URLs (as crawl does)
        discovered = collect_urls_from_extractors(test_snapshot_dir)

        self.assertEqual(len(discovered), 2)

        # Add crawl metadata (as crawl does)
        for entry in discovered:
            entry['type'] = TYPE_SNAPSHOT
            entry['depth'] = 1
            entry['via_snapshot'] = 'test-crawl-snapshot'

        # Verify output format
        self.assertEqual(discovered[0]['type'], TYPE_SNAPSHOT)
        self.assertEqual(discovered[0]['depth'], 1)
        self.assertEqual(discovered[0]['url'], 'https://discovered-1.com')

    def test_full_pipeline_snapshot_extract(self):
        """
        Test: archivebox snapshot URL | archivebox extract

        This is equivalent to: archivebox add URL
        """
        from core.models import Snapshot
        from archivebox.misc.jsonl import (
            get_or_create_snapshot, snapshot_to_jsonl, read_args_or_stdin,
        )
        from archivebox.base_models.models import get_or_create_system_user_pk

        created_by_id = get_or_create_system_user_pk()

        # === archivebox snapshot https://example.com ===
        url = 'https://test-pipeline-1.example.com'
        snapshot = get_or_create_snapshot({'url': url}, created_by_id=created_by_id)
        snapshot_jsonl = json.dumps(snapshot_to_jsonl(snapshot))

        # === | archivebox extract ===
        stdin = StringIO(snapshot_jsonl + '\n')
        stdin.isatty = lambda: False

        records = list(read_args_or_stdin((), stream=stdin))

        # Extract should receive the snapshot ID
        self.assertEqual(len(records), 1)
        self.assertEqual(records[0]['id'], str(snapshot.id))

        # Verify snapshot exists in DB
        db_snapshot = Snapshot.objects.get(id=snapshot.id)
        self.assertEqual(db_snapshot.url, url)

    def test_full_pipeline_crawl_snapshot_extract(self):
        """
        Test: archivebox crawl URL | archivebox snapshot | archivebox extract

        This is equivalent to: archivebox add --depth=1 URL
        """
        from core.models import Snapshot
        from archivebox.misc.jsonl import (
            get_or_create_snapshot, snapshot_to_jsonl, read_args_or_stdin,
            TYPE_SNAPSHOT,
        )
        from archivebox.base_models.models import get_or_create_system_user_pk
        from archivebox.hooks import collect_urls_from_extractors

        created_by_id = get_or_create_system_user_pk()

        # === archivebox crawl https://example.com ===
        # Step 1: Create snapshot for starting URL
        start_url = 'https://test-crawl-pipeline.example.com'
        start_snapshot = get_or_create_snapshot({'url': start_url}, created_by_id=created_by_id)

        # Step 2: Simulate extractor output with discovered URLs
        snapshot_dir = Path(self.test_dir) / 'archive' / str(start_snapshot.timestamp)
        snapshot_dir.mkdir(parents=True, exist_ok=True)
        (snapshot_dir / 'parse_html_urls').mkdir(exist_ok=True)
        (snapshot_dir / 'parse_html_urls' / 'urls.jsonl').write_text(
            '{"url": "https://outlink-1.example.com"}\n'
            '{"url": "https://outlink-2.example.com"}\n'
        )

        # Step 3: Collect discovered URLs (crawl output)
        discovered = collect_urls_from_extractors(snapshot_dir)
        crawl_output = []
        for entry in discovered:
            entry['type'] = TYPE_SNAPSHOT
            entry['depth'] = 1
            crawl_output.append(json.dumps(entry))

        # === | archivebox snapshot ===
        stdin = StringIO('\n'.join(crawl_output) + '\n')
        stdin.isatty = lambda: False

        records = list(read_args_or_stdin((), stream=stdin))
        self.assertEqual(len(records), 2)

        # Create snapshots for discovered URLs
        created_snapshots = []
        for record in records:
            snap = get_or_create_snapshot(record, created_by_id=created_by_id)
            created_snapshots.append(snap)

        self.assertEqual(len(created_snapshots), 2)

        # === | archivebox extract ===
        snapshot_jsonl_lines = [json.dumps(snapshot_to_jsonl(s)) for s in created_snapshots]
        stdin = StringIO('\n'.join(snapshot_jsonl_lines) + '\n')
        stdin.isatty = lambda: False

        records = list(read_args_or_stdin((), stream=stdin))
        self.assertEqual(len(records), 2)

        # Verify all snapshots exist in DB
        for record in records:
            db_snapshot = Snapshot.objects.get(id=record['id'])
            self.assertIn(db_snapshot.url, [
                'https://outlink-1.example.com',
                'https://outlink-2.example.com'
            ])
|
||||
|
||||
|
||||
class TestDepthWorkflows(unittest.TestCase):
    """Test various depth crawl workflows."""

    @classmethod
    def setUpClass(cls):
        """Set up Django and an initialized archive inside a temp DATA_DIR."""
        cls.test_dir = tempfile.mkdtemp()
        # Fix: remember the previous DATA_DIR so tearDownClass can restore it,
        # instead of leaking the temp path into later test classes.
        cls._prev_data_dir = os.environ.get('DATA_DIR')
        os.environ['DATA_DIR'] = cls.test_dir

        from archivebox.config.django import setup_django
        setup_django()

        from archivebox.cli.archivebox_init import init
        init()

    @classmethod
    def tearDownClass(cls):
        """Restore DATA_DIR and delete the test archive."""
        if cls._prev_data_dir is None:
            os.environ.pop('DATA_DIR', None)
        else:
            os.environ['DATA_DIR'] = cls._prev_data_dir
        shutil.rmtree(cls.test_dir, ignore_errors=True)

    def test_depth_0_workflow(self):
        """
        Test: archivebox snapshot URL | archivebox extract

        Depth 0: Only archive the specified URL, no crawling.
        """
        from core.models import Snapshot
        from archivebox.misc.jsonl import get_or_create_snapshot
        from archivebox.base_models.models import get_or_create_system_user_pk

        created_by_id = get_or_create_system_user_pk()

        # Create snapshot
        url = 'https://depth0-test.example.com'
        snapshot = get_or_create_snapshot({'url': url}, created_by_id=created_by_id)

        # Verify only one snapshot created
        self.assertEqual(Snapshot.objects.filter(url=url).count(), 1)
        self.assertEqual(snapshot.url, url)

    def test_depth_1_workflow(self):
        """
        Test: archivebox crawl URL | archivebox snapshot | archivebox extract

        Depth 1: Archive URL + all outlinks from that URL.
        """
        # This is tested in test_full_pipeline_crawl_snapshot_extract
        pass

    def test_depth_metadata_propagation(self):
        """Test that depth metadata propagates through the pipeline."""
        from archivebox.misc.jsonl import TYPE_SNAPSHOT

        # Simulate crawl output with depth metadata
        crawl_output = [
            {'type': TYPE_SNAPSHOT, 'url': 'https://hop1.com', 'depth': 1, 'via_snapshot': 'root'},
            {'type': TYPE_SNAPSHOT, 'url': 'https://hop2.com', 'depth': 2, 'via_snapshot': 'hop1'},
        ]

        # Verify depth is preserved
        for entry in crawl_output:
            self.assertIn('depth', entry)
            self.assertIn('via_snapshot', entry)
|
||||
|
||||
|
||||
class TestParserPluginWorkflows(unittest.TestCase):
    """Test workflows with specific parser plugins."""

    @classmethod
    def setUpClass(cls):
        """Set up Django and an initialized archive inside a temp DATA_DIR."""
        cls.test_dir = tempfile.mkdtemp()
        # Fix: remember the previous DATA_DIR so tearDownClass can restore it,
        # instead of leaking the temp path into later test classes.
        cls._prev_data_dir = os.environ.get('DATA_DIR')
        os.environ['DATA_DIR'] = cls.test_dir

        from archivebox.config.django import setup_django
        setup_django()

        from archivebox.cli.archivebox_init import init
        init()

    @classmethod
    def tearDownClass(cls):
        """Restore DATA_DIR and delete the test archive."""
        if cls._prev_data_dir is None:
            os.environ.pop('DATA_DIR', None)
        else:
            os.environ['DATA_DIR'] = cls._prev_data_dir
        shutil.rmtree(cls.test_dir, ignore_errors=True)

    def test_html_parser_workflow(self):
        """
        Test: archivebox crawl --plugin=parse_html_urls URL | archivebox snapshot | archivebox extract
        """
        from archivebox.hooks import collect_urls_from_extractors

        # Create mock output directory
        snapshot_dir = Path(self.test_dir) / 'archive' / 'html-parser-test'
        snapshot_dir.mkdir(parents=True, exist_ok=True)
        (snapshot_dir / 'parse_html_urls').mkdir(exist_ok=True)
        (snapshot_dir / 'parse_html_urls' / 'urls.jsonl').write_text(
            '{"url": "https://html-discovered.com", "title": "HTML Link"}\n'
        )

        # Collect URLs
        discovered = collect_urls_from_extractors(snapshot_dir)

        self.assertEqual(len(discovered), 1)
        self.assertEqual(discovered[0]['url'], 'https://html-discovered.com')
        self.assertEqual(discovered[0]['via_extractor'], 'parse_html_urls')

    def test_rss_parser_workflow(self):
        """
        Test: archivebox crawl --plugin=parse_rss_urls URL | archivebox snapshot | archivebox extract
        """
        from archivebox.hooks import collect_urls_from_extractors

        # Create mock output directory
        snapshot_dir = Path(self.test_dir) / 'archive' / 'rss-parser-test'
        snapshot_dir.mkdir(parents=True, exist_ok=True)
        (snapshot_dir / 'parse_rss_urls').mkdir(exist_ok=True)
        (snapshot_dir / 'parse_rss_urls' / 'urls.jsonl').write_text(
            '{"url": "https://rss-item-1.com", "title": "RSS Item 1"}\n'
            '{"url": "https://rss-item-2.com", "title": "RSS Item 2"}\n'
        )

        # Collect URLs
        discovered = collect_urls_from_extractors(snapshot_dir)

        self.assertEqual(len(discovered), 2)
        self.assertTrue(all(d['via_extractor'] == 'parse_rss_urls' for d in discovered))

    def test_multiple_parsers_dedupe(self):
        """
        Multiple parsers may discover the same URL - should be deduplicated.
        """
        from archivebox.hooks import collect_urls_from_extractors

        # Create mock output with duplicate URLs from different parsers
        snapshot_dir = Path(self.test_dir) / 'archive' / 'dedupe-test'
        snapshot_dir.mkdir(parents=True, exist_ok=True)

        (snapshot_dir / 'parse_html_urls').mkdir(exist_ok=True)
        (snapshot_dir / 'parse_html_urls' / 'urls.jsonl').write_text(
            '{"url": "https://same-url.com"}\n'
        )

        (snapshot_dir / 'wget').mkdir(exist_ok=True)
        (snapshot_dir / 'wget' / 'urls.jsonl').write_text(
            '{"url": "https://same-url.com"}\n'  # Same URL, different extractor
        )

        # Collect URLs
        all_discovered = collect_urls_from_extractors(snapshot_dir)

        # Both entries are returned (deduplication happens at the crawl command level)
        self.assertEqual(len(all_discovered), 2)

        # Verify both extractors found the same URL
        urls = {d['url'] for d in all_discovered}
        self.assertEqual(urls, {'https://same-url.com'})
|
||||
|
||||
|
||||
class TestEdgeCases(unittest.TestCase):
    """Test edge cases and error handling."""

    @staticmethod
    def _fake_stdin(text, tty):
        """Return an in-memory stdin whose isatty() reports the given value."""
        stream = StringIO(text)
        # StringIO instances accept attribute assignment, so we can stub isatty
        stream.isatty = lambda: tty
        return stream

    def test_empty_input(self):
        """Commands should handle empty input gracefully."""
        from archivebox.misc.jsonl import read_args_or_stdin

        # Empty args + a TTY stdin: the reader must not block waiting for a pipe
        records = list(read_args_or_stdin((), stream=self._fake_stdin('', tty=True)))
        self.assertEqual(len(records), 0)

    def test_malformed_jsonl(self):
        """Should skip malformed JSONL lines."""
        from archivebox.misc.jsonl import read_args_or_stdin

        piped = self._fake_stdin(
            '{"url": "https://good.com"}\n'
            'not valid json\n'
            '{"url": "https://also-good.com"}\n',
            tty=False,
        )
        records = list(read_args_or_stdin((), stream=piped))

        # The invalid middle line is dropped; both valid records survive
        self.assertEqual(len(records), 2)
        self.assertEqual(
            {record['url'] for record in records},
            {'https://good.com', 'https://also-good.com'},
        )

    def test_mixed_input_formats(self):
        """Should handle mixed URLs and JSONL."""
        from archivebox.misc.jsonl import read_args_or_stdin

        piped = self._fake_stdin(
            'https://plain-url.com\n'
            '{"type": "Snapshot", "url": "https://jsonl-url.com", "tags": "test"}\n'
            '01234567-89ab-cdef-0123-456789abcdef\n',  # UUID
            tty=False,
        )
        records = list(read_args_or_stdin((), stream=piped))

        self.assertEqual(len(records), 3)
        plain_rec, jsonl_rec, uuid_rec = records

        # Plain URL line becomes a record keyed by 'url'
        self.assertEqual(plain_rec['url'], 'https://plain-url.com')

        # JSONL line carries its extra metadata through unchanged
        self.assertEqual(jsonl_rec['url'], 'https://jsonl-url.com')
        self.assertEqual(jsonl_rec['tags'], 'test')

        # Bare UUID line becomes a record keyed by 'id'
        self.assertEqual(uuid_rec['id'], '01234567-89ab-cdef-0123-456789abcdef')
# Allow running this test module directly: `python -m` / `python thisfile.py`
if __name__ == '__main__':
    unittest.main()
|
||||
Reference in New Issue
Block a user