new jsonl cli interface

Nick Sweeting
2025-12-30 16:12:53 -08:00
parent ba8c28a866
commit dd2302ad92
37 changed files with 2919 additions and 1602 deletions

archivebox/cli/__init__.py

@@ -27,36 +27,43 @@ class ArchiveBoxGroup(click.Group):
'init': 'archivebox.cli.archivebox_init.main',
'install': 'archivebox.cli.archivebox_install.main',
}
# Model commands (CRUD operations via subcommands)
model_commands = {
'crawl': 'archivebox.cli.archivebox_crawl.main',
'snapshot': 'archivebox.cli.archivebox_snapshot.main',
'archiveresult': 'archivebox.cli.archivebox_archiveresult.main',
'tag': 'archivebox.cli.archivebox_tag.main',
'binary': 'archivebox.cli.archivebox_binary.main',
'process': 'archivebox.cli.archivebox_process.main',
'machine': 'archivebox.cli.archivebox_machine.main',
}
archive_commands = {
# High-level commands
'add': 'archivebox.cli.archivebox_add.main',
'remove': 'archivebox.cli.archivebox_remove.main',
'run': 'archivebox.cli.archivebox_run.main',
'update': 'archivebox.cli.archivebox_update.main',
'search': 'archivebox.cli.archivebox_search.main',
'status': 'archivebox.cli.archivebox_status.main',
'config': 'archivebox.cli.archivebox_config.main',
'schedule': 'archivebox.cli.archivebox_schedule.main',
'server': 'archivebox.cli.archivebox_server.main',
'shell': 'archivebox.cli.archivebox_shell.main',
'manage': 'archivebox.cli.archivebox_manage.main',
# Worker/orchestrator commands
'orchestrator': 'archivebox.cli.archivebox_orchestrator.main',
# Worker command
'worker': 'archivebox.cli.archivebox_worker.main',
# Task commands (called by workers as subprocesses)
'crawl': 'archivebox.cli.archivebox_crawl.main',
'snapshot': 'archivebox.cli.archivebox_snapshot.main',
'extract': 'archivebox.cli.archivebox_extract.main',
}
all_subcommands = {
**meta_commands,
**setup_commands,
**model_commands,
**archive_commands,
}
renamed_commands = {
'setup': 'install',
'list': 'search',
'import': 'add',
'archive': 'add',
'export': 'search',
# Old commands replaced by new model commands
'orchestrator': 'run',
'extract': 'archiveresult',
}
@classmethod
@@ -110,9 +117,9 @@ def cli(ctx, help=False):
if help or ctx.invoked_subcommand is None:
ctx.invoke(ctx.command.get_command(ctx, 'help'))
# if the subcommand is in the archive_commands dict and is not 'manage',
# if the subcommand is in archive_commands or model_commands,
# then we need to set up the django environment and check that we're in a valid data folder
if subcommand in ArchiveBoxGroup.archive_commands:
if subcommand in ArchiveBoxGroup.archive_commands or subcommand in ArchiveBoxGroup.model_commands:
# print('SETUP DJANGO AND CHECK DATA FOLDER')
try:
from archivebox.config.django import setup_django

archivebox/cli/archivebox_archiveresult.py

@@ -0,0 +1,365 @@
#!/usr/bin/env python3
"""
archivebox archiveresult <action> [args...] [--filters]
Manage ArchiveResult records (plugin extraction results).
Actions:
create - Create ArchiveResults for Snapshots (queue extractions)
list - List ArchiveResults as JSONL (with optional filters)
update - Update ArchiveResults from stdin JSONL
delete - Delete ArchiveResults from stdin JSONL
Examples:
# Create ArchiveResults for snapshots (queue for extraction)
archivebox snapshot list --status=queued | archivebox archiveresult create
archivebox archiveresult create --plugin=screenshot --snapshot-id=<uuid>
# List with filters
archivebox archiveresult list --status=failed
archivebox archiveresult list --plugin=screenshot --status=succeeded
# Update (reset failed extractions to queued)
archivebox archiveresult list --status=failed | archivebox archiveresult update --status=queued
# Delete
archivebox archiveresult list --plugin=singlefile | archivebox archiveresult delete --yes
# Re-run failed extractions
archivebox archiveresult list --status=failed | archivebox run
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox archiveresult'
import sys
from typing import Optional
import rich_click as click
from rich import print as rprint
def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
"""Apply Django-style filters from CLI kwargs to a QuerySet."""
filters = {}
for key, value in filter_kwargs.items():
if value is not None and key not in ('limit', 'offset'):
filters[key] = value
if filters:
queryset = queryset.filter(**filters)
if limit:
queryset = queryset[:limit]
return queryset
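# Illustrative example (not part of this module): with filter_kwargs={'status': 'failed', 'plugin': None}
# and limit=10, apply_filters() drops the None entry and the reserved 'limit'/'offset' keys,
# applies .filter(status='failed'), then slices the queryset to the first 10 rows.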
# =============================================================================
# CREATE
# =============================================================================
def create_archiveresults(
snapshot_id: Optional[str] = None,
plugin: Optional[str] = None,
status: str = 'queued',
) -> int:
"""
Create ArchiveResults for Snapshots.
Reads Snapshot records from stdin and creates ArchiveResult entries.
If --plugin is specified, only creates results for that plugin.
Otherwise, creates results for all pending plugins.
Exit codes:
0: Success
1: Failure
"""
from django.utils import timezone
from archivebox.misc.jsonl import read_stdin, write_record, TYPE_SNAPSHOT
from archivebox.core.models import Snapshot, ArchiveResult
is_tty = sys.stdout.isatty()
# If snapshot_id provided directly, use that
if snapshot_id:
try:
snapshots = [Snapshot.objects.get(id=snapshot_id)]
except Snapshot.DoesNotExist:
rprint(f'[red]Snapshot not found: {snapshot_id}[/red]', file=sys.stderr)
return 1
else:
# Read from stdin
records = list(read_stdin())
if not records:
rprint('[yellow]No Snapshot records provided via stdin[/yellow]', file=sys.stderr)
return 1
# Filter to only Snapshot records
snapshot_ids = []
for record in records:
if record.get('type') == TYPE_SNAPSHOT:
if record.get('id'):
snapshot_ids.append(record['id'])
elif record.get('id'):
# Assume it's a snapshot ID if no type specified
snapshot_ids.append(record['id'])
if not snapshot_ids:
rprint('[yellow]No valid Snapshot IDs in input[/yellow]', file=sys.stderr)
return 1
snapshots = list(Snapshot.objects.filter(id__in=snapshot_ids))
if not snapshots:
rprint('[yellow]No matching snapshots found[/yellow]', file=sys.stderr)
return 1
created_count = 0
for snapshot in snapshots:
if plugin:
# Create for specific plugin only
result, created = ArchiveResult.objects.get_or_create(
snapshot=snapshot,
plugin=plugin,
defaults={
'status': status,
'retry_at': timezone.now(),
}
)
if not created and result.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]:
# Reset for retry
result.status = status
result.retry_at = timezone.now()
result.save()
if not is_tty:
write_record(result.to_json())
created_count += 1
else:
# Create all pending plugins
snapshot.create_pending_archiveresults()
for result in snapshot.archiveresult_set.filter(status=ArchiveResult.StatusChoices.QUEUED):
if not is_tty:
write_record(result.to_json())
created_count += 1
rprint(f'[green]Created/queued {created_count} archive results[/green]', file=sys.stderr)
return 0
# =============================================================================
# LIST
# =============================================================================
def list_archiveresults(
status: Optional[str] = None,
plugin: Optional[str] = None,
snapshot_id: Optional[str] = None,
limit: Optional[int] = None,
) -> int:
"""
List ArchiveResults as JSONL with optional filters.
Exit codes:
0: Success (even if no results)
"""
from archivebox.misc.jsonl import write_record
from archivebox.core.models import ArchiveResult
is_tty = sys.stdout.isatty()
queryset = ArchiveResult.objects.all().order_by('-start_ts')
# Apply filters
filter_kwargs = {
'status': status,
'plugin': plugin,
'snapshot_id': snapshot_id,
}
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
count = 0
for result in queryset:
if is_tty:
status_color = {
'queued': 'yellow',
'started': 'blue',
'succeeded': 'green',
'failed': 'red',
'skipped': 'dim',
'backoff': 'magenta',
}.get(result.status, 'dim')
rprint(f'[{status_color}]{result.status:10}[/{status_color}] {result.plugin:15} [dim]{result.id}[/dim] {result.snapshot.url[:40]}')
else:
write_record(result.to_json())
count += 1
rprint(f'[dim]Listed {count} archive results[/dim]', file=sys.stderr)
return 0
# =============================================================================
# UPDATE
# =============================================================================
def update_archiveresults(
status: Optional[str] = None,
) -> int:
"""
Update ArchiveResults from stdin JSONL.
Reads ArchiveResult records from stdin and applies updates.
Uses PATCH semantics - only specified fields are updated.
Exit codes:
0: Success
1: No input or error
"""
from django.utils import timezone
from archivebox.misc.jsonl import read_stdin, write_record
from archivebox.core.models import ArchiveResult
is_tty = sys.stdout.isatty()
records = list(read_stdin())
if not records:
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
return 1
updated_count = 0
for record in records:
result_id = record.get('id')
if not result_id:
continue
try:
result = ArchiveResult.objects.get(id=result_id)
# Apply updates from CLI flags
if status:
result.status = status
result.retry_at = timezone.now()
result.save()
updated_count += 1
if not is_tty:
write_record(result.to_json())
except ArchiveResult.DoesNotExist:
rprint(f'[yellow]ArchiveResult not found: {result_id}[/yellow]', file=sys.stderr)
continue
rprint(f'[green]Updated {updated_count} archive results[/green]', file=sys.stderr)
return 0
# =============================================================================
# DELETE
# =============================================================================
def delete_archiveresults(yes: bool = False, dry_run: bool = False) -> int:
"""
Delete ArchiveResults from stdin JSONL.
Requires --yes flag to confirm deletion.
Exit codes:
0: Success
1: No input or missing --yes flag
"""
from archivebox.misc.jsonl import read_stdin
from archivebox.core.models import ArchiveResult
records = list(read_stdin())
if not records:
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
return 1
result_ids = [r.get('id') for r in records if r.get('id')]
if not result_ids:
rprint('[yellow]No valid archive result IDs in input[/yellow]', file=sys.stderr)
return 1
results = ArchiveResult.objects.filter(id__in=result_ids)
count = results.count()
if count == 0:
rprint('[yellow]No matching archive results found[/yellow]', file=sys.stderr)
return 0
if dry_run:
rprint(f'[yellow]Would delete {count} archive results (dry run)[/yellow]', file=sys.stderr)
for result in results[:10]:
rprint(f' [dim]{result.id}[/dim] {result.plugin} {result.snapshot.url[:40]}', file=sys.stderr)
if count > 10:
rprint(f' ... and {count - 10} more', file=sys.stderr)
return 0
if not yes:
rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
return 1
# Perform deletion
deleted_count, _ = results.delete()
rprint(f'[green]Deleted {deleted_count} archive results[/green]', file=sys.stderr)
return 0
# =============================================================================
# CLI Commands
# =============================================================================
@click.group()
def main():
"""Manage ArchiveResult records (plugin extraction results)."""
pass
@main.command('create')
@click.option('--snapshot-id', help='Snapshot ID to create results for')
@click.option('--plugin', '-p', help='Plugin name (e.g., screenshot, singlefile)')
@click.option('--status', '-s', default='queued', help='Initial status (default: queued)')
def create_cmd(snapshot_id: Optional[str], plugin: Optional[str], status: str):
"""Create ArchiveResults for Snapshots from stdin JSONL."""
sys.exit(create_archiveresults(snapshot_id=snapshot_id, plugin=plugin, status=status))
@main.command('list')
@click.option('--status', '-s', help='Filter by status (queued, started, succeeded, failed, skipped)')
@click.option('--plugin', '-p', help='Filter by plugin name')
@click.option('--snapshot-id', help='Filter by snapshot ID')
@click.option('--limit', '-n', type=int, help='Limit number of results')
def list_cmd(status: Optional[str], plugin: Optional[str],
snapshot_id: Optional[str], limit: Optional[int]):
"""List ArchiveResults as JSONL."""
sys.exit(list_archiveresults(
status=status,
plugin=plugin,
snapshot_id=snapshot_id,
limit=limit,
))
@main.command('update')
@click.option('--status', '-s', help='Set status')
def update_cmd(status: Optional[str]):
"""Update ArchiveResults from stdin JSONL."""
sys.exit(update_archiveresults(status=status))
@main.command('delete')
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
def delete_cmd(yes: bool, dry_run: bool):
"""Delete ArchiveResults from stdin JSONL."""
sys.exit(delete_archiveresults(yes=yes, dry_run=dry_run))
if __name__ == '__main__':
main()
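
The new model commands exchange records through archivebox.misc.jsonl (read_stdin, read_args_or_stdin, write_record, and the TYPE_* constants), which is not shown in this diff. A minimal sketch of the piping contract these commands appear to rely on: one JSON object per line, keyed by type/id; bare non-JSON input lines are assumed to be wrapped into minimal records (a sketch, not the actual implementation):

    import json, sys
    from typing import Iterable, Iterator

    def write_record(record: dict) -> None:
        # one JSON object per line, so downstream commands can stream records
        sys.stdout.write(json.dumps(record, default=str) + '\n')

    def read_args_or_stdin(args: Iterable[str]) -> Iterator[dict]:
        # CLI args take precedence; otherwise read piped JSONL (or bare URLs/IDs) from stdin
        lines = args if args else (line for line in sys.stdin)
        for line in lines:
            line = line.strip()
            if not line:
                continue
            if line.startswith('{'):
                yield json.loads(line)    # already a JSONL record
            else:
                yield {'url': line}       # bare value, wrapped into a minimal record (assumed)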

archivebox/cli/archivebox_binary.py

@@ -0,0 +1,304 @@
#!/usr/bin/env python3
"""
archivebox binary <action> [args...] [--filters]
Manage Binary records (detected executables like chrome, wget, etc.).
Actions:
create - Create/register a Binary
list - List Binaries as JSONL (with optional filters)
update - Update Binaries from stdin JSONL
delete - Delete Binaries from stdin JSONL
Examples:
# List all binaries
archivebox binary list
# List specific binary
archivebox binary list --name=chrome
# List binaries with specific version
archivebox binary list --version__icontains=120
# Delete old binary entries
archivebox binary list --name=chrome | archivebox binary delete --yes
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox binary'
import sys
from typing import Optional
import rich_click as click
from rich import print as rprint
def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
"""Apply Django-style filters from CLI kwargs to a QuerySet."""
filters = {}
for key, value in filter_kwargs.items():
if value is not None and key not in ('limit', 'offset'):
filters[key] = value
if filters:
queryset = queryset.filter(**filters)
if limit:
queryset = queryset[:limit]
return queryset
# =============================================================================
# CREATE
# =============================================================================
def create_binary(
name: str,
abspath: str,
version: str = '',
) -> int:
"""
Create/register a Binary.
Exit codes:
0: Success
1: Failure
"""
from archivebox.misc.jsonl import write_record
from archivebox.machine.models import Binary
is_tty = sys.stdout.isatty()
if not name or not abspath:
rprint('[red]Both --name and --abspath are required[/red]', file=sys.stderr)
return 1
try:
binary, created = Binary.objects.get_or_create(
name=name,
abspath=abspath,
defaults={'version': version}
)
if not is_tty:
write_record(binary.to_json())
if created:
rprint(f'[green]Created binary: {name} at {abspath}[/green]', file=sys.stderr)
else:
rprint(f'[dim]Binary already exists: {name} at {abspath}[/dim]', file=sys.stderr)
return 0
except Exception as e:
rprint(f'[red]Error creating binary: {e}[/red]', file=sys.stderr)
return 1
# =============================================================================
# LIST
# =============================================================================
def list_binaries(
name: Optional[str] = None,
abspath__icontains: Optional[str] = None,
version__icontains: Optional[str] = None,
limit: Optional[int] = None,
) -> int:
"""
List Binaries as JSONL with optional filters.
Exit codes:
0: Success (even if no results)
"""
from archivebox.misc.jsonl import write_record
from archivebox.machine.models import Binary
is_tty = sys.stdout.isatty()
queryset = Binary.objects.all().order_by('name', '-loaded_at')
# Apply filters
filter_kwargs = {
'name': name,
'abspath__icontains': abspath__icontains,
'version__icontains': version__icontains,
}
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
count = 0
for binary in queryset:
if is_tty:
rprint(f'[cyan]{binary.name:20}[/cyan] [dim]{binary.version:15}[/dim] {binary.abspath}')
else:
write_record(binary.to_json())
count += 1
rprint(f'[dim]Listed {count} binaries[/dim]', file=sys.stderr)
return 0
# =============================================================================
# UPDATE
# =============================================================================
def update_binaries(
version: Optional[str] = None,
abspath: Optional[str] = None,
) -> int:
"""
Update Binaries from stdin JSONL.
Reads Binary records from stdin and applies updates.
Uses PATCH semantics - only specified fields are updated.
Exit codes:
0: Success
1: No input or error
"""
from archivebox.misc.jsonl import read_stdin, write_record
from archivebox.machine.models import Binary
is_tty = sys.stdout.isatty()
records = list(read_stdin())
if not records:
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
return 1
updated_count = 0
for record in records:
binary_id = record.get('id')
if not binary_id:
continue
try:
binary = Binary.objects.get(id=binary_id)
# Apply updates from CLI flags
if version:
binary.version = version
if abspath:
binary.abspath = abspath
binary.save()
updated_count += 1
if not is_tty:
write_record(binary.to_json())
except Binary.DoesNotExist:
rprint(f'[yellow]Binary not found: {binary_id}[/yellow]', file=sys.stderr)
continue
rprint(f'[green]Updated {updated_count} binaries[/green]', file=sys.stderr)
return 0
# =============================================================================
# DELETE
# =============================================================================
def delete_binaries(yes: bool = False, dry_run: bool = False) -> int:
"""
Delete Binaries from stdin JSONL.
Requires --yes flag to confirm deletion.
Exit codes:
0: Success
1: No input or missing --yes flag
"""
from archivebox.misc.jsonl import read_stdin
from archivebox.machine.models import Binary
records = list(read_stdin())
if not records:
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
return 1
binary_ids = [r.get('id') for r in records if r.get('id')]
if not binary_ids:
rprint('[yellow]No valid binary IDs in input[/yellow]', file=sys.stderr)
return 1
binaries = Binary.objects.filter(id__in=binary_ids)
count = binaries.count()
if count == 0:
rprint('[yellow]No matching binaries found[/yellow]', file=sys.stderr)
return 0
if dry_run:
rprint(f'[yellow]Would delete {count} binaries (dry run)[/yellow]', file=sys.stderr)
for binary in binaries:
rprint(f' {binary.name} {binary.abspath}', file=sys.stderr)
return 0
if not yes:
rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
return 1
# Perform deletion
deleted_count, _ = binaries.delete()
rprint(f'[green]Deleted {deleted_count} binaries[/green]', file=sys.stderr)
return 0
# =============================================================================
# CLI Commands
# =============================================================================
@click.group()
def main():
"""Manage Binary records (detected executables)."""
pass
@main.command('create')
@click.option('--name', '-n', required=True, help='Binary name (e.g., chrome, wget)')
@click.option('--abspath', '-p', required=True, help='Absolute path to binary')
@click.option('--version', '-v', default='', help='Binary version')
def create_cmd(name: str, abspath: str, version: str):
"""Create/register a Binary."""
sys.exit(create_binary(name=name, abspath=abspath, version=version))
@main.command('list')
@click.option('--name', '-n', help='Filter by name')
@click.option('--abspath__icontains', help='Filter by path contains')
@click.option('--version__icontains', help='Filter by version contains')
@click.option('--limit', type=int, help='Limit number of results')
def list_cmd(name: Optional[str], abspath__icontains: Optional[str],
version__icontains: Optional[str], limit: Optional[int]):
"""List Binaries as JSONL."""
sys.exit(list_binaries(
name=name,
abspath__icontains=abspath__icontains,
version__icontains=version__icontains,
limit=limit,
))
@main.command('update')
@click.option('--version', '-v', help='Set version')
@click.option('--abspath', '-p', help='Set path')
def update_cmd(version: Optional[str], abspath: Optional[str]):
"""Update Binaries from stdin JSONL."""
sys.exit(update_binaries(version=version, abspath=abspath))
@main.command('delete')
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
def delete_cmd(yes: bool, dry_run: bool):
"""Delete Binaries from stdin JSONL."""
sys.exit(delete_binaries(yes=yes, dry_run=dry_run))
if __name__ == '__main__':
main()
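
For completeness, a create invocation to pair with the list/delete examples in the module docstring above (flags as defined in this file, values illustrative):

    archivebox binary create --name=wget --abspath=/usr/bin/wget --version=1.21.4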

archivebox/cli/archivebox_crawl.py

@@ -1,108 +1,134 @@
#!/usr/bin/env python3
"""
archivebox crawl [urls...] [--depth=N] [--tag=TAG]
archivebox crawl <action> [args...] [--filters]
Create Crawl jobs from URLs. Accepts URLs as arguments, from stdin, or via JSONL.
Does NOT immediately start the crawl - pipe to `archivebox snapshot` to process.
Manage Crawl records.
Input formats:
- Plain URLs (one per line)
- JSONL: {"url": "...", "depth": 1, "tags": "..."}
Output (JSONL):
{"type": "Crawl", "id": "...", "urls": "...", "status": "queued", ...}
Actions:
create - Create Crawl jobs from URLs
list - List Crawls as JSONL (with optional filters)
update - Update Crawls from stdin JSONL
delete - Delete Crawls from stdin JSONL
Examples:
# Create a crawl job
archivebox crawl https://example.com
# Create
archivebox crawl create https://example.com https://foo.com --depth=1
archivebox crawl create --tag=news https://example.com
# Create crawl with depth
archivebox crawl --depth=1 https://example.com
# List with filters
archivebox crawl list --status=queued
archivebox crawl list --urls__icontains=example.com
# Full pipeline: create crawl, create snapshots, run extractors
archivebox crawl https://example.com | archivebox snapshot | archivebox extract
# Update
archivebox crawl list --status=started | archivebox crawl update --status=queued
# Process existing Crawl by ID (runs the crawl state machine)
archivebox crawl 01234567-89ab-cdef-0123-456789abcdef
# Delete
archivebox crawl list --urls__icontains=spam.com | archivebox crawl delete --yes
# Full pipeline
archivebox crawl create https://example.com | archivebox snapshot create | archivebox run
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox crawl'
import sys
from typing import Optional
from typing import Optional, Iterable
import rich_click as click
from rich import print as rprint
def create_crawls(
records: list,
def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
"""Apply Django-style filters from CLI kwargs to a QuerySet."""
filters = {}
for key, value in filter_kwargs.items():
if value is not None and key not in ('limit', 'offset'):
filters[key] = value
if filters:
queryset = queryset.filter(**filters)
if limit:
queryset = queryset[:limit]
return queryset
# =============================================================================
# CREATE
# =============================================================================
def create_crawl(
urls: Iterable[str],
depth: int = 0,
tag: str = '',
status: str = 'queued',
created_by_id: Optional[int] = None,
) -> int:
"""
Create a single Crawl job from all input URLs.
Create a Crawl job from URLs.
Takes pre-read records, creates one Crawl with all URLs, outputs JSONL.
Does NOT start the crawl - just creates the job in QUEUED state.
Takes URLs as args or stdin, creates one Crawl with all URLs, outputs JSONL.
Exit codes:
0: Success
1: Failure
"""
from rich import print as rprint
from archivebox.misc.jsonl import write_record
from archivebox.misc.jsonl import read_args_or_stdin, write_record
from archivebox.base_models.models import get_or_create_system_user_pk
from archivebox.crawls.models import Crawl
created_by_id = created_by_id or get_or_create_system_user_pk()
is_tty = sys.stdout.isatty()
# Collect all input records
records = list(read_args_or_stdin(urls))
if not records:
rprint('[yellow]No URLs provided. Pass URLs as arguments or via stdin.[/yellow]', file=sys.stderr)
return 1
# Collect all URLs into a single newline-separated string
urls = []
url_list = []
for record in records:
url = record.get('url')
if url:
urls.append(url)
url_list.append(url)
if not urls:
if not url_list:
rprint('[red]No valid URLs found[/red]', file=sys.stderr)
return 1
try:
# Build crawl record with all URLs as newline-separated string
crawl_record = {
'urls': '\n'.join(urls),
'urls': '\n'.join(url_list),
'max_depth': depth,
'tags_str': tag,
'status': status,
'label': '',
}
crawl = Crawl.from_jsonl(crawl_record, overrides={'created_by_id': created_by_id})
crawl = Crawl.from_json(crawl_record, overrides={'created_by_id': created_by_id})
if not crawl:
rprint('[red]Failed to create crawl[/red]', file=sys.stderr)
return 1
# Output JSONL record (only when piped)
if not is_tty:
write_record(crawl.to_jsonl())
write_record(crawl.to_json())
rprint(f'[green]Created crawl with {len(urls)} URLs[/green]', file=sys.stderr)
rprint(f'[green]Created crawl with {len(url_list)} URLs[/green]', file=sys.stderr)
# If TTY, show human-readable output
if is_tty:
rprint(f' [dim]{crawl.id}[/dim]', file=sys.stderr)
for url in urls[:5]: # Show first 5 URLs
for url in url_list[:5]: # Show first 5 URLs
rprint(f' {url[:70]}', file=sys.stderr)
if len(urls) > 5:
rprint(f' ... and {len(urls) - 5} more', file=sys.stderr)
if len(url_list) > 5:
rprint(f' ... and {len(url_list) - 5} more', file=sys.stderr)
return 0
@@ -111,81 +137,217 @@ def create_crawls(
return 1
def process_crawl_by_id(crawl_id: str) -> int:
"""
Process a single Crawl by ID (used by workers).
# =============================================================================
# LIST
# =============================================================================
Triggers the Crawl's state machine tick() which will:
- Transition from queued -> started (creates root snapshot)
- Transition from started -> sealed (when all snapshots done)
def list_crawls(
status: Optional[str] = None,
urls__icontains: Optional[str] = None,
max_depth: Optional[int] = None,
limit: Optional[int] = None,
) -> int:
"""
from rich import print as rprint
List Crawls as JSONL with optional filters.
Exit codes:
0: Success (even if no results)
"""
from archivebox.misc.jsonl import write_record
from archivebox.crawls.models import Crawl
try:
crawl = Crawl.objects.get(id=crawl_id)
except Crawl.DoesNotExist:
rprint(f'[red]Crawl {crawl_id} not found[/red]', file=sys.stderr)
return 1
is_tty = sys.stdout.isatty()
rprint(f'[blue]Processing Crawl {crawl.id} (status={crawl.status})[/blue]', file=sys.stderr)
queryset = Crawl.objects.all().order_by('-created_at')
try:
crawl.sm.tick()
crawl.refresh_from_db()
rprint(f'[green]Crawl complete (status={crawl.status})[/green]', file=sys.stderr)
return 0
except Exception as e:
rprint(f'[red]Crawl error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
return 1
# Apply filters
filter_kwargs = {
'status': status,
'urls__icontains': urls__icontains,
'max_depth': max_depth,
}
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
count = 0
for crawl in queryset:
if is_tty:
status_color = {
'queued': 'yellow',
'started': 'blue',
'sealed': 'green',
}.get(crawl.status, 'dim')
url_preview = crawl.urls[:50].replace('\n', ' ')
rprint(f'[{status_color}]{crawl.status:8}[/{status_color}] [dim]{crawl.id}[/dim] {url_preview}...')
else:
write_record(crawl.to_json())
count += 1
rprint(f'[dim]Listed {count} crawls[/dim]', file=sys.stderr)
return 0
def is_crawl_id(value: str) -> bool:
"""Check if value looks like a Crawl UUID."""
import re
uuid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.I)
if not uuid_pattern.match(value):
return False
# Verify it's actually a Crawl (not a Snapshot or other object)
# =============================================================================
# UPDATE
# =============================================================================
def update_crawls(
status: Optional[str] = None,
max_depth: Optional[int] = None,
) -> int:
"""
Update Crawls from stdin JSONL.
Reads Crawl records from stdin and applies updates.
Uses PATCH semantics - only specified fields are updated.
Exit codes:
0: Success
1: No input or error
"""
from django.utils import timezone
from archivebox.misc.jsonl import read_stdin, write_record
from archivebox.crawls.models import Crawl
return Crawl.objects.filter(id=value).exists()
is_tty = sys.stdout.isatty()
@click.command()
@click.option('--depth', '-d', type=int, default=0, help='Max depth for recursive crawling (default: 0, no recursion)')
@click.option('--tag', '-t', default='', help='Comma-separated tags to add to snapshots')
@click.argument('args', nargs=-1)
def main(depth: int, tag: str, args: tuple):
"""Create Crawl jobs from URLs, or process existing Crawls by ID"""
from archivebox.misc.jsonl import read_args_or_stdin
# Read all input
records = list(read_args_or_stdin(args))
records = list(read_stdin())
if not records:
from rich import print as rprint
rprint('[yellow]No URLs or Crawl IDs provided. Pass as arguments or via stdin.[/yellow]', file=sys.stderr)
sys.exit(1)
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
return 1
# Check if input looks like existing Crawl IDs to process
# If ALL inputs are Crawl UUIDs, process them
all_are_crawl_ids = all(
is_crawl_id(r.get('id') or r.get('url', ''))
for r in records
)
updated_count = 0
for record in records:
crawl_id = record.get('id')
if not crawl_id:
continue
if all_are_crawl_ids:
# Process existing Crawls by ID
exit_code = 0
for record in records:
crawl_id = record.get('id') or record.get('url')
result = process_crawl_by_id(crawl_id)
if result != 0:
exit_code = result
sys.exit(exit_code)
else:
# Default behavior: create Crawl jobs from URLs
sys.exit(create_crawls(records, depth=depth, tag=tag))
try:
crawl = Crawl.objects.get(id=crawl_id)
# Apply updates from CLI flags
if status:
crawl.status = status
crawl.retry_at = timezone.now()
if max_depth is not None:
crawl.max_depth = max_depth
crawl.save()
updated_count += 1
if not is_tty:
write_record(crawl.to_json())
except Crawl.DoesNotExist:
rprint(f'[yellow]Crawl not found: {crawl_id}[/yellow]', file=sys.stderr)
continue
rprint(f'[green]Updated {updated_count} crawls[/green]', file=sys.stderr)
return 0
# =============================================================================
# DELETE
# =============================================================================
def delete_crawls(yes: bool = False, dry_run: bool = False) -> int:
"""
Delete Crawls from stdin JSONL.
Requires --yes flag to confirm deletion.
Exit codes:
0: Success
1: No input or missing --yes flag
"""
from archivebox.misc.jsonl import read_stdin
from archivebox.crawls.models import Crawl
records = list(read_stdin())
if not records:
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
return 1
crawl_ids = [r.get('id') for r in records if r.get('id')]
if not crawl_ids:
rprint('[yellow]No valid crawl IDs in input[/yellow]', file=sys.stderr)
return 1
crawls = Crawl.objects.filter(id__in=crawl_ids)
count = crawls.count()
if count == 0:
rprint('[yellow]No matching crawls found[/yellow]', file=sys.stderr)
return 0
if dry_run:
rprint(f'[yellow]Would delete {count} crawls (dry run)[/yellow]', file=sys.stderr)
for crawl in crawls:
url_preview = crawl.urls[:50].replace('\n', ' ')
rprint(f' [dim]{crawl.id}[/dim] {url_preview}...', file=sys.stderr)
return 0
if not yes:
rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
return 1
# Perform deletion
deleted_count, _ = crawls.delete()
rprint(f'[green]Deleted {deleted_count} crawls[/green]', file=sys.stderr)
return 0
# =============================================================================
# CLI Commands
# =============================================================================
@click.group()
def main():
"""Manage Crawl records."""
pass
@main.command('create')
@click.argument('urls', nargs=-1)
@click.option('--depth', '-d', type=int, default=0, help='Max crawl depth (default: 0)')
@click.option('--tag', '-t', default='', help='Comma-separated tags to add')
@click.option('--status', '-s', default='queued', help='Initial status (default: queued)')
def create_cmd(urls: tuple, depth: int, tag: str, status: str):
"""Create a Crawl job from URLs or stdin."""
sys.exit(create_crawl(urls, depth=depth, tag=tag, status=status))
@main.command('list')
@click.option('--status', '-s', help='Filter by status (queued, started, sealed)')
@click.option('--urls__icontains', help='Filter by URLs contains')
@click.option('--max-depth', type=int, help='Filter by max depth')
@click.option('--limit', '-n', type=int, help='Limit number of results')
def list_cmd(status: Optional[str], urls__icontains: Optional[str],
max_depth: Optional[int], limit: Optional[int]):
"""List Crawls as JSONL."""
sys.exit(list_crawls(
status=status,
urls__icontains=urls__icontains,
max_depth=max_depth,
limit=limit,
))
@main.command('update')
@click.option('--status', '-s', help='Set status')
@click.option('--max-depth', type=int, help='Set max depth')
def update_cmd(status: Optional[str], max_depth: Optional[int]):
"""Update Crawls from stdin JSONL."""
sys.exit(update_crawls(status=status, max_depth=max_depth))
@main.command('delete')
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
def delete_cmd(yes: bool, dry_run: bool):
"""Delete Crawls from stdin JSONL."""
sys.exit(delete_crawls(yes=yes, dry_run=dry_run))
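# Typical end-to-end pipeline built from the commands above (illustrative):
#
#   archivebox crawl create --depth=1 https://example.com \
#       | archivebox snapshot create \
#       | archivebox run
#
# When piped, `crawl create` emits one JSONL record per Crawl, roughly:
#   {"type": "Crawl", "id": "...", "urls": "https://example.com", "max_depth": 1, "status": "queued"}
# (field set inferred from crawl_record above and the module docstring; the real to_json() output may differ)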
if __name__ == '__main__':

archivebox/cli/archivebox_extract.py

@@ -1,265 +0,0 @@
#!/usr/bin/env python3
"""
archivebox extract [snapshot_ids...] [--plugins=NAMES]
Run plugins on Snapshots. Accepts snapshot IDs as arguments, from stdin, or via JSONL.
Input formats:
- Snapshot UUIDs (one per line)
- JSONL: {"type": "Snapshot", "id": "...", "url": "..."}
- JSONL: {"type": "ArchiveResult", "snapshot_id": "...", "plugin": "..."}
Output (JSONL):
{"type": "ArchiveResult", "id": "...", "snapshot_id": "...", "plugin": "...", "status": "..."}
Examples:
# Extract specific snapshot
archivebox extract 01234567-89ab-cdef-0123-456789abcdef
# Pipe from snapshot command
archivebox snapshot https://example.com | archivebox extract
# Run specific plugins only
archivebox extract --plugins=screenshot,singlefile 01234567-89ab-cdef-0123-456789abcdef
# Chain commands
archivebox crawl https://example.com | archivebox snapshot | archivebox extract
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox extract'
import sys
from typing import Optional, List
import rich_click as click
def process_archiveresult_by_id(archiveresult_id: str) -> int:
"""
Run extraction for a single ArchiveResult by ID (used by workers).
Triggers the ArchiveResult's state machine tick() to run the extractor plugin.
"""
from rich import print as rprint
from archivebox.core.models import ArchiveResult
try:
archiveresult = ArchiveResult.objects.get(id=archiveresult_id)
except ArchiveResult.DoesNotExist:
rprint(f'[red]ArchiveResult {archiveresult_id} not found[/red]', file=sys.stderr)
return 1
rprint(f'[blue]Extracting {archiveresult.plugin} for {archiveresult.snapshot.url}[/blue]', file=sys.stderr)
try:
# Trigger state machine tick - this runs the actual extraction
archiveresult.sm.tick()
archiveresult.refresh_from_db()
if archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED:
print(f'[green]Extraction succeeded: {archiveresult.output_str}[/green]')
return 0
elif archiveresult.status == ArchiveResult.StatusChoices.FAILED:
print(f'[red]Extraction failed: {archiveresult.output_str}[/red]', file=sys.stderr)
return 1
else:
# Still in progress or backoff - not a failure
print(f'[yellow]Extraction status: {archiveresult.status}[/yellow]')
return 0
except Exception as e:
print(f'[red]Extraction error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
return 1
def run_plugins(
args: tuple,
plugins: str = '',
wait: bool = True,
) -> int:
"""
Run plugins on Snapshots from input.
Reads Snapshot IDs or JSONL from args/stdin, runs plugins, outputs JSONL.
Exit codes:
0: Success
1: Failure
"""
from rich import print as rprint
from django.utils import timezone
from archivebox.misc.jsonl import (
read_args_or_stdin, write_record,
TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
)
from archivebox.core.models import Snapshot, ArchiveResult
from archivebox.workers.orchestrator import Orchestrator
is_tty = sys.stdout.isatty()
# Parse comma-separated plugins list once (reused in creation and filtering)
plugins_list = [p.strip() for p in plugins.split(',') if p.strip()] if plugins else []
# Collect all input records
records = list(read_args_or_stdin(args))
if not records:
rprint('[yellow]No snapshots provided. Pass snapshot IDs as arguments or via stdin.[/yellow]', file=sys.stderr)
return 1
# Gather snapshot IDs to process
snapshot_ids = set()
for record in records:
record_type = record.get('type')
if record_type == TYPE_SNAPSHOT:
snapshot_id = record.get('id')
if snapshot_id:
snapshot_ids.add(snapshot_id)
elif record.get('url'):
# Look up by URL (get most recent if multiple exist)
snap = Snapshot.objects.filter(url=record['url']).order_by('-created_at').first()
if snap:
snapshot_ids.add(str(snap.id))
else:
rprint(f'[yellow]Snapshot not found for URL: {record["url"]}[/yellow]', file=sys.stderr)
elif record_type == TYPE_ARCHIVERESULT:
snapshot_id = record.get('snapshot_id')
if snapshot_id:
snapshot_ids.add(snapshot_id)
elif 'id' in record:
# Assume it's a snapshot ID
snapshot_ids.add(record['id'])
if not snapshot_ids:
rprint('[red]No valid snapshot IDs found in input[/red]', file=sys.stderr)
return 1
# Get snapshots and ensure they have pending ArchiveResults
processed_count = 0
for snapshot_id in snapshot_ids:
try:
snapshot = Snapshot.objects.get(id=snapshot_id)
except Snapshot.DoesNotExist:
rprint(f'[yellow]Snapshot {snapshot_id} not found[/yellow]', file=sys.stderr)
continue
# Create pending ArchiveResults if needed
if plugins_list:
# Only create for specific plugins
for plugin_name in plugins_list:
result, created = ArchiveResult.objects.get_or_create(
snapshot=snapshot,
plugin=plugin_name,
defaults={
'status': ArchiveResult.StatusChoices.QUEUED,
'retry_at': timezone.now(),
}
)
if not created and result.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]:
# Reset for retry
result.status = ArchiveResult.StatusChoices.QUEUED
result.retry_at = timezone.now()
result.save()
else:
# Create all pending plugins
snapshot.create_pending_archiveresults()
# Reset snapshot status to allow processing
if snapshot.status == Snapshot.StatusChoices.SEALED:
snapshot.status = Snapshot.StatusChoices.STARTED
snapshot.retry_at = timezone.now()
snapshot.save()
processed_count += 1
if processed_count == 0:
rprint('[red]No snapshots to process[/red]', file=sys.stderr)
return 1
rprint(f'[blue]Queued {processed_count} snapshots for extraction[/blue]', file=sys.stderr)
# Run orchestrator if --wait (default)
if wait:
rprint('[blue]Running plugins...[/blue]', file=sys.stderr)
orchestrator = Orchestrator(exit_on_idle=True)
orchestrator.runloop()
# Output results as JSONL (when piped) or human-readable (when TTY)
for snapshot_id in snapshot_ids:
try:
snapshot = Snapshot.objects.get(id=snapshot_id)
results = snapshot.archiveresult_set.all()
if plugins_list:
results = results.filter(plugin__in=plugins_list)
for result in results:
if is_tty:
status_color = {
'succeeded': 'green',
'failed': 'red',
'skipped': 'yellow',
}.get(result.status, 'dim')
rprint(f' [{status_color}]{result.status}[/{status_color}] {result.plugin}{result.output_str or ""}', file=sys.stderr)
else:
write_record(result.to_jsonl())
except Snapshot.DoesNotExist:
continue
return 0
def is_archiveresult_id(value: str) -> bool:
"""Check if value looks like an ArchiveResult UUID."""
import re
uuid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.I)
if not uuid_pattern.match(value):
return False
# Verify it's actually an ArchiveResult (not a Snapshot or other object)
from archivebox.core.models import ArchiveResult
return ArchiveResult.objects.filter(id=value).exists()
@click.command()
@click.option('--plugins', '-p', default='', help='Comma-separated list of plugins to run (e.g., screenshot,singlefile)')
@click.option('--wait/--no-wait', default=True, help='Wait for plugins to complete (default: wait)')
@click.argument('args', nargs=-1)
def main(plugins: str, wait: bool, args: tuple):
"""Run plugins on Snapshots, or process existing ArchiveResults by ID"""
from archivebox.misc.jsonl import read_args_or_stdin
# Read all input
records = list(read_args_or_stdin(args))
if not records:
from rich import print as rprint
rprint('[yellow]No Snapshot IDs or ArchiveResult IDs provided. Pass as arguments or via stdin.[/yellow]', file=sys.stderr)
sys.exit(1)
# Check if input looks like existing ArchiveResult IDs to process
all_are_archiveresult_ids = all(
is_archiveresult_id(r.get('id') or r.get('url', ''))
for r in records
)
if all_are_archiveresult_ids:
# Process existing ArchiveResults by ID
exit_code = 0
for record in records:
archiveresult_id = record.get('id') or record.get('url')
result = process_archiveresult_by_id(archiveresult_id)
if result != 0:
exit_code = result
sys.exit(exit_code)
else:
# Default behavior: run plugins on Snapshots from input
sys.exit(run_plugins(args, plugins=plugins, wait=wait))
if __name__ == '__main__':
main()
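
This extractor entry point is removed: `extract` is aliased to `archiveresult` via renamed_commands, and extraction itself is now driven by `archivebox run`. Roughly equivalent invocations, based on the options defined in the new files (illustrative):

    # old (this file)                                # new (as of this commit)
    archivebox extract <uuid>                        archivebox archiveresult create --snapshot-id=<uuid> | archivebox run
    archivebox extract --plugins=screenshot <uuid>   archivebox archiveresult create --plugin=screenshot --snapshot-id=<uuid> | archivebox run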

archivebox/cli/archivebox_init.py

@@ -127,7 +127,7 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
if pending_links:
for link_dict in pending_links.values():
Snapshot.from_jsonl(link_dict)
Snapshot.from_json(link_dict)
# Hint for orphaned snapshot directories
print()

archivebox/cli/archivebox_machine.py

@@ -0,0 +1,113 @@
#!/usr/bin/env python3
"""
archivebox machine <action> [--filters]
Manage Machine records (system-managed, mostly read-only).
Machine records track the host machines where ArchiveBox runs.
They are created automatically by the system and are primarily for debugging.
Actions:
list - List Machines as JSONL (with optional filters)
Examples:
# List all machines
archivebox machine list
# List machines by hostname
archivebox machine list --hostname__icontains=myserver
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox machine'
import sys
from typing import Optional
import rich_click as click
from rich import print as rprint
def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
"""Apply Django-style filters from CLI kwargs to a QuerySet."""
filters = {}
for key, value in filter_kwargs.items():
if value is not None and key not in ('limit', 'offset'):
filters[key] = value
if filters:
queryset = queryset.filter(**filters)
if limit:
queryset = queryset[:limit]
return queryset
# =============================================================================
# LIST
# =============================================================================
def list_machines(
hostname__icontains: Optional[str] = None,
os_platform: Optional[str] = None,
limit: Optional[int] = None,
) -> int:
"""
List Machines as JSONL with optional filters.
Exit codes:
0: Success (even if no results)
"""
from archivebox.misc.jsonl import write_record
from archivebox.machine.models import Machine
is_tty = sys.stdout.isatty()
queryset = Machine.objects.all().order_by('-created_at')
# Apply filters
filter_kwargs = {
'hostname__icontains': hostname__icontains,
'os_platform': os_platform,
}
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
count = 0
for machine in queryset:
if is_tty:
rprint(f'[cyan]{machine.hostname:30}[/cyan] [dim]{machine.os_platform:10}[/dim] {machine.id}')
else:
write_record(machine.to_json())
count += 1
rprint(f'[dim]Listed {count} machines[/dim]', file=sys.stderr)
return 0
# =============================================================================
# CLI Commands
# =============================================================================
@click.group()
def main():
"""Manage Machine records (read-only, system-managed)."""
pass
@main.command('list')
@click.option('--hostname__icontains', help='Filter by hostname contains')
@click.option('--os-platform', help='Filter by OS platform')
@click.option('--limit', '-n', type=int, help='Limit number of results')
def list_cmd(hostname__icontains: Optional[str], os_platform: Optional[str], limit: Optional[int]):
"""List Machines as JSONL."""
sys.exit(list_machines(
hostname__icontains=hostname__icontains,
os_platform=os_platform,
limit=limit,
))
if __name__ == '__main__':
main()

archivebox/cli/archivebox_orchestrator.py

@@ -1,67 +0,0 @@
#!/usr/bin/env python3
"""
archivebox orchestrator [--daemon]
Start the orchestrator process that manages workers.
The orchestrator polls queues for each model type (Crawl, Snapshot, ArchiveResult)
and lazily spawns worker processes when there is work to be done.
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox orchestrator'
import sys
import rich_click as click
from archivebox.misc.util import docstring
def orchestrator(daemon: bool = False, watch: bool = False) -> int:
"""
Start the orchestrator process.
The orchestrator:
1. Polls each model queue (Crawl, Snapshot, ArchiveResult)
2. Spawns worker processes when there is work to do
3. Monitors worker health and restarts failed workers
4. Exits when all queues are empty (unless --daemon)
Args:
daemon: Run forever (don't exit when idle)
watch: Just watch the queues without spawning workers (for debugging)
Exit codes:
0: All work completed successfully
1: Error occurred
"""
from archivebox.workers.orchestrator import Orchestrator
if Orchestrator.is_running():
print('[yellow]Orchestrator is already running[/yellow]')
return 0
try:
orchestrator_instance = Orchestrator(exit_on_idle=not daemon)
orchestrator_instance.runloop()
return 0
except KeyboardInterrupt:
return 0
except Exception as e:
print(f'[red]Orchestrator error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
return 1
@click.command()
@click.option('--daemon', '-d', is_flag=True, help="Run forever (don't exit on idle)")
@click.option('--watch', '-w', is_flag=True, help="Watch queues without spawning workers")
@docstring(orchestrator.__doc__)
def main(daemon: bool, watch: bool):
"""Start the ArchiveBox orchestrator process"""
sys.exit(orchestrator(daemon=daemon, watch=watch))
if __name__ == '__main__':
main()

archivebox/cli/archivebox_process.py

@@ -0,0 +1,121 @@
#!/usr/bin/env python3
"""
archivebox process <action> [--filters]
Manage Process records (system-managed, mostly read-only).
Process records track executions of binaries during extraction.
They are created automatically by the system and are primarily for debugging.
Actions:
list - List Processes as JSONL (with optional filters)
Examples:
# List all processes
archivebox process list
# List processes by binary
archivebox process list --binary-name=chrome
# List recent processes
archivebox process list --limit=10
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox process'
import sys
from typing import Optional
import rich_click as click
from rich import print as rprint
def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
"""Apply Django-style filters from CLI kwargs to a QuerySet."""
filters = {}
for key, value in filter_kwargs.items():
if value is not None and key not in ('limit', 'offset'):
filters[key] = value
if filters:
queryset = queryset.filter(**filters)
if limit:
queryset = queryset[:limit]
return queryset
# =============================================================================
# LIST
# =============================================================================
def list_processes(
binary_name: Optional[str] = None,
machine_id: Optional[str] = None,
limit: Optional[int] = None,
) -> int:
"""
List Processes as JSONL with optional filters.
Exit codes:
0: Success (even if no results)
"""
from archivebox.misc.jsonl import write_record
from archivebox.machine.models import Process
is_tty = sys.stdout.isatty()
queryset = Process.objects.all().select_related('binary', 'machine').order_by('-start_ts')
# Apply filters
filter_kwargs = {}
if binary_name:
filter_kwargs['binary__name'] = binary_name
if machine_id:
filter_kwargs['machine_id'] = machine_id
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
count = 0
for process in queryset:
if is_tty:
binary_name_str = process.binary.name if process.binary else 'unknown'
exit_code = process.returncode if process.returncode is not None else '?'
status_color = 'green' if process.returncode == 0 else 'red' if process.returncode else 'yellow'
rprint(f'[{status_color}]exit={exit_code:3}[/{status_color}] [cyan]{binary_name_str:15}[/cyan] [dim]{process.id}[/dim]')
else:
write_record(process.to_json())
count += 1
rprint(f'[dim]Listed {count} processes[/dim]', file=sys.stderr)
return 0
# =============================================================================
# CLI Commands
# =============================================================================
@click.group()
def main():
"""Manage Process records (read-only, system-managed)."""
pass
@main.command('list')
@click.option('--binary-name', '-b', help='Filter by binary name')
@click.option('--machine-id', '-m', help='Filter by machine ID')
@click.option('--limit', '-n', type=int, help='Limit number of results')
def list_cmd(binary_name: Optional[str], machine_id: Optional[str], limit: Optional[int]):
"""List Processes as JSONL."""
sys.exit(list_processes(
binary_name=binary_name,
machine_id=machine_id,
limit=limit,
))
if __name__ == '__main__':
main()

archivebox/cli/archivebox_remove.py

@@ -1,98 +0,0 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox remove'
import shutil
from pathlib import Path
from typing import Iterable
import rich_click as click
from django.db.models import QuerySet
from archivebox.config import DATA_DIR
from archivebox.config.django import setup_django
from archivebox.misc.util import enforce_types, docstring
from archivebox.misc.checks import check_data_folder
from archivebox.misc.logging_util import (
log_list_started,
log_list_finished,
log_removal_started,
log_removal_finished,
TimedProgress,
)
@enforce_types
def remove(filter_patterns: Iterable[str]=(),
filter_type: str='exact',
snapshots: QuerySet | None=None,
after: float | None=None,
before: float | None=None,
yes: bool=False,
delete: bool=False,
out_dir: Path=DATA_DIR) -> QuerySet:
"""Remove the specified URLs from the archive"""
setup_django()
check_data_folder()
from archivebox.cli.archivebox_search import get_snapshots
log_list_started(filter_patterns, filter_type)
timer = TimedProgress(360, prefix=' ')
try:
snapshots = get_snapshots(
snapshots=snapshots,
filter_patterns=list(filter_patterns) if filter_patterns else None,
filter_type=filter_type,
after=after,
before=before,
)
finally:
timer.end()
if not snapshots.exists():
log_removal_finished(0, 0)
raise SystemExit(1)
log_list_finished(snapshots)
log_removal_started(snapshots, yes=yes, delete=delete)
timer = TimedProgress(360, prefix=' ')
try:
for snapshot in snapshots:
if delete:
shutil.rmtree(snapshot.output_dir, ignore_errors=True)
finally:
timer.end()
to_remove = snapshots.count()
from archivebox.search import flush_search_index
from archivebox.core.models import Snapshot
flush_search_index(snapshots=snapshots)
snapshots.delete()
all_snapshots = Snapshot.objects.all()
log_removal_finished(all_snapshots.count(), to_remove)
return all_snapshots
@click.command()
@click.option('--yes', is_flag=True, help='Remove links instantly without prompting to confirm')
@click.option('--delete', is_flag=True, help='Delete the archived content and metadata folder in addition to removing from index')
@click.option('--before', type=float, help='Remove only URLs bookmarked before timestamp')
@click.option('--after', type=float, help='Remove only URLs bookmarked after timestamp')
@click.option('--filter-type', '-f', type=click.Choice(('exact', 'substring', 'domain', 'regex', 'tag')), default='exact', help='Type of pattern matching to use when filtering URLs')
@click.argument('filter_patterns', nargs=-1)
@docstring(remove.__doc__)
def main(**kwargs):
"""Remove the specified URLs from the archive"""
remove(**kwargs)
if __name__ == '__main__':
main()

archivebox/cli/archivebox_run.py

@@ -0,0 +1,155 @@
#!/usr/bin/env python3
"""
archivebox run [--daemon]
Unified command for processing queued work.
Modes:
- With stdin JSONL: Process piped records, exit when complete
- Without stdin (TTY): Run orchestrator in foreground until killed
Examples:
# Run orchestrator in foreground (replaces `archivebox orchestrator`)
archivebox run
# Run as daemon (don't exit on idle)
archivebox run --daemon
# Process specific records (pipe any JSONL type, exits when done)
archivebox snapshot list --status=queued | archivebox run
archivebox archiveresult list --status=failed | archivebox run
archivebox crawl list --status=queued | archivebox run
# Mixed types work too
cat mixed_records.jsonl | archivebox run
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox run'
import sys
import rich_click as click
from rich import print as rprint
def process_stdin_records() -> int:
"""
Process JSONL records from stdin.
Reads records, queues them for processing, then runs the orchestrator until all queued work completes.
Handles any record type: Crawl, Snapshot, ArchiveResult, etc.
Returns exit code (0 = success, 1 = error).
"""
from django.utils import timezone
from archivebox.misc.jsonl import read_stdin, TYPE_CRAWL, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
from archivebox.core.models import Snapshot, ArchiveResult
from archivebox.crawls.models import Crawl
from archivebox.workers.orchestrator import Orchestrator
records = list(read_stdin())
if not records:
return 0 # Nothing to process
queued_count = 0
for record in records:
record_type = record.get('type')
record_id = record.get('id')
if not record_id:
continue
try:
if record_type == TYPE_CRAWL:
crawl = Crawl.objects.get(id=record_id)
if crawl.status in [Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED]:
crawl.retry_at = timezone.now()
crawl.save()
queued_count += 1
elif record_type == TYPE_SNAPSHOT:
snapshot = Snapshot.objects.get(id=record_id)
if snapshot.status in [Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]:
snapshot.retry_at = timezone.now()
snapshot.save()
queued_count += 1
elif record_type == TYPE_ARCHIVERESULT:
archiveresult = ArchiveResult.objects.get(id=record_id)
if archiveresult.status in [ArchiveResult.StatusChoices.QUEUED, ArchiveResult.StatusChoices.STARTED, ArchiveResult.StatusChoices.BACKOFF]:
archiveresult.retry_at = timezone.now()
archiveresult.save()
queued_count += 1
except (Crawl.DoesNotExist, Snapshot.DoesNotExist, ArchiveResult.DoesNotExist):
rprint(f'[yellow]Record not found: {record_type} {record_id}[/yellow]', file=sys.stderr)
continue
if queued_count == 0:
rprint('[yellow]No records to process[/yellow]', file=sys.stderr)
return 0
rprint(f'[blue]Processing {queued_count} records...[/blue]', file=sys.stderr)
# Run orchestrator until all queued work is done
orchestrator = Orchestrator(exit_on_idle=True)
orchestrator.runloop()
return 0
def run_orchestrator(daemon: bool = False) -> int:
"""
Run the orchestrator process.
The orchestrator:
1. Polls each model queue (Crawl, Snapshot, ArchiveResult)
2. Spawns worker processes when there is work to do
3. Monitors worker health and restarts failed workers
4. Exits when all queues are empty (unless --daemon)
Args:
daemon: Run forever (don't exit when idle)
Returns exit code (0 = success, 1 = error).
"""
from archivebox.workers.orchestrator import Orchestrator
if Orchestrator.is_running():
rprint('[yellow]Orchestrator is already running[/yellow]', file=sys.stderr)
return 0
try:
orchestrator = Orchestrator(exit_on_idle=not daemon)
orchestrator.runloop()
return 0
except KeyboardInterrupt:
return 0
except Exception as e:
rprint(f'[red]Orchestrator error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
return 1
@click.command()
@click.option('--daemon', '-d', is_flag=True, help="Run forever (don't exit on idle)")
def main(daemon: bool):
"""
Process queued work.
When stdin is piped: Process those specific records and exit.
When run standalone: Run orchestrator in foreground.
"""
# Check if stdin has data (non-TTY means piped input)
if not sys.stdin.isatty():
sys.exit(process_stdin_records())
else:
sys.exit(run_orchestrator(daemon=daemon))
if __name__ == '__main__':
main()
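
`archivebox run` absorbs the removed `archivebox orchestrator` command (see renamed_commands in the CLI group): piped JSONL simply marks the referenced rows as due (retry_at=now) before draining the queues with an exit-on-idle orchestrator. Roughly equivalent invocations (illustrative):

    # old                               # new (as of this commit)
    archivebox orchestrator             archivebox run
    archivebox orchestrator --daemon    archivebox run --daemon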

archivebox/cli/archivebox_search.py

@@ -1,131 +0,0 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox search'
from pathlib import Path
from typing import Optional, List, Any
import rich_click as click
from rich import print
from django.db.models import QuerySet
from archivebox.config import DATA_DIR
from archivebox.misc.logging import stderr
from archivebox.misc.util import enforce_types, docstring
# Filter types for URL matching
LINK_FILTERS = {
'exact': lambda pattern: {'url': pattern},
'substring': lambda pattern: {'url__icontains': pattern},
'regex': lambda pattern: {'url__iregex': pattern},
'domain': lambda pattern: {'url__istartswith': f'http://{pattern}'},
'tag': lambda pattern: {'tags__name': pattern},
'timestamp': lambda pattern: {'timestamp': pattern},
}
STATUS_CHOICES = ['indexed', 'archived', 'unarchived']
def get_snapshots(snapshots: Optional[QuerySet]=None,
filter_patterns: Optional[List[str]]=None,
filter_type: str='substring',
after: Optional[float]=None,
before: Optional[float]=None,
out_dir: Path=DATA_DIR) -> QuerySet:
"""Filter and return Snapshots matching the given criteria."""
from archivebox.core.models import Snapshot
if snapshots:
result = snapshots
else:
result = Snapshot.objects.all()
if after is not None:
result = result.filter(timestamp__gte=after)
if before is not None:
result = result.filter(timestamp__lt=before)
if filter_patterns:
result = Snapshot.objects.filter_by_patterns(filter_patterns, filter_type)
if not result:
stderr('[!] No Snapshots matched your filters:', filter_patterns, f'({filter_type})', color='lightyellow')
return result
@enforce_types
def search(filter_patterns: list[str] | None=None,
filter_type: str='substring',
status: str='indexed',
before: float | None=None,
after: float | None=None,
sort: str | None=None,
json: bool=False,
html: bool=False,
csv: str | None=None,
with_headers: bool=False):
"""List, filter, and export information about archive entries"""
from archivebox.core.models import Snapshot
if with_headers and not (json or html or csv):
stderr('[X] --with-headers requires --json, --html or --csv\n', color='red')
raise SystemExit(2)
# Query DB directly - no filesystem scanning
snapshots = get_snapshots(
filter_patterns=list(filter_patterns) if filter_patterns else None,
filter_type=filter_type,
before=before,
after=after,
)
# Apply status filter
if status == 'archived':
snapshots = snapshots.filter(downloaded_at__isnull=False)
elif status == 'unarchived':
snapshots = snapshots.filter(downloaded_at__isnull=True)
# 'indexed' = all snapshots (no filter)
if sort:
snapshots = snapshots.order_by(sort)
# Export to requested format
if json:
output = snapshots.to_json(with_headers=with_headers)
elif html:
output = snapshots.to_html(with_headers=with_headers)
elif csv:
output = snapshots.to_csv(cols=csv.split(','), header=with_headers)
else:
from archivebox.misc.logging_util import printable_folders
# Convert to dict for printable_folders
folders = {s.output_dir: s for s in snapshots}
output = printable_folders(folders, with_headers)
print(output)
return output
@click.command()
@click.option('--filter-type', '-f', type=click.Choice(['search', *LINK_FILTERS.keys()]), default='substring', help='Pattern matching type for filtering URLs')
@click.option('--status', '-s', type=click.Choice(STATUS_CHOICES), default='indexed', help='List snapshots with the given status')
@click.option('--before', '-b', type=float, help='List snapshots bookmarked before the given UNIX timestamp')
@click.option('--after', '-a', type=float, help='List snapshots bookmarked after the given UNIX timestamp')
@click.option('--sort', '-o', type=str, help='Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at')
@click.option('--json', '-J', is_flag=True, help='Print output in JSON format')
@click.option('--html', '-M', is_flag=True, help='Print output in HTML format (suitable for viewing statically without a server)')
@click.option('--csv', '-C', type=str, help='Print output as CSV with the provided fields, e.g.: created_at,url,title')
@click.option('--with-headers', '-H', is_flag=True, help='Include extra CSV/HTML headers in the output')
@click.help_option('--help', '-h')
@click.argument('filter_patterns', nargs=-1)
@docstring(search.__doc__)
def main(**kwargs):
return search(**kwargs)
if __name__ == '__main__':
main()

View File

@@ -1,93 +1,76 @@
#!/usr/bin/env python3
"""
archivebox snapshot [urls_or_crawl_ids...] [--tag=TAG] [--plugins=NAMES]
archivebox snapshot <action> [args...] [--filters]
Create Snapshots from URLs or Crawl jobs. Accepts URLs, Crawl JSONL, or Crawl IDs.
Manage Snapshot records.
Input formats:
- Plain URLs (one per line)
- JSONL: {"type": "Crawl", "id": "...", "urls": "..."}
- JSONL: {"type": "Snapshot", "url": "...", "title": "...", "tags": "..."}
- Crawl UUIDs (one per line)
Output (JSONL):
{"type": "Snapshot", "id": "...", "url": "...", "status": "queued", ...}
Actions:
create - Create Snapshots from URLs or Crawl JSONL
list - List Snapshots as JSONL (with optional filters)
update - Update Snapshots from stdin JSONL
delete - Delete Snapshots from stdin JSONL
Examples:
# Create snapshots from URLs directly
archivebox snapshot https://example.com https://foo.com
# Create
archivebox snapshot create https://example.com --tag=news
archivebox crawl create https://example.com | archivebox snapshot create
# Pipe from crawl command
archivebox crawl https://example.com | archivebox snapshot
# List with filters
archivebox snapshot list --status=queued
archivebox snapshot list --url__icontains=example.com
# Chain with extract
archivebox crawl https://example.com | archivebox snapshot | archivebox extract
# Update
archivebox snapshot list --tag=old | archivebox snapshot update --tag=new
# Run specific plugins after creating snapshots
archivebox snapshot --plugins=screenshot,singlefile https://example.com
# Process existing Snapshot by ID
archivebox snapshot 01234567-89ab-cdef-0123-456789abcdef
# Delete
archivebox snapshot list --url__icontains=spam.com | archivebox snapshot delete --yes
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox snapshot'
import sys
from typing import Optional
from typing import Optional, Iterable
import rich_click as click
from archivebox.misc.util import docstring
from rich import print as rprint
def process_snapshot_by_id(snapshot_id: str) -> int:
"""
Process a single Snapshot by ID (used by workers).
def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
"""Apply Django-style filters from CLI kwargs to a QuerySet."""
filters = {}
for key, value in filter_kwargs.items():
if value is not None and key not in ('limit', 'offset'):
filters[key] = value
Triggers the Snapshot's state machine tick() which will:
- Transition from queued -> started (creates pending ArchiveResults)
- Transition from started -> sealed (when all ArchiveResults done)
"""
from rich import print as rprint
from archivebox.core.models import Snapshot
if filters:
queryset = queryset.filter(**filters)
try:
snapshot = Snapshot.objects.get(id=snapshot_id)
except Snapshot.DoesNotExist:
rprint(f'[red]Snapshot {snapshot_id} not found[/red]', file=sys.stderr)
return 1
if limit:
queryset = queryset[:limit]
rprint(f'[blue]Processing Snapshot {snapshot.id} {snapshot.url[:50]} (status={snapshot.status})[/blue]', file=sys.stderr)
return queryset
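# Example (illustrative): filter_kwargs={'status': 'queued', 'url__icontains': 'example.com'}
# becomes queryset.filter(status='queued', url__icontains='example.com'), i.e. CLI flags
# map 1:1 onto Django field lookups.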
try:
snapshot.sm.tick()
snapshot.refresh_from_db()
rprint(f'[green]Snapshot complete (status={snapshot.status})[/green]', file=sys.stderr)
return 0
except Exception as e:
rprint(f'[red]Snapshot error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
return 1
# =============================================================================
# CREATE
# =============================================================================
def create_snapshots(
args: tuple,
urls: Iterable[str],
tag: str = '',
plugins: str = '',
status: str = 'queued',
depth: int = 0,
created_by_id: Optional[int] = None,
) -> int:
"""
Create Snapshots from URLs, Crawl JSONL, or Crawl IDs.
Reads from args or stdin, creates Snapshot objects, outputs JSONL.
If --plugins is passed, also runs specified plugins (blocking).
Create Snapshots from URLs or stdin JSONL (Crawl or Snapshot records).
Exit codes:
0: Success
1: Failure
"""
from rich import print as rprint
from django.utils import timezone
from archivebox.misc.jsonl import (
@@ -102,7 +85,7 @@ def create_snapshots(
is_tty = sys.stdout.isatty()
# Collect all input records
records = list(read_args_or_stdin(args))
records = list(read_args_or_stdin(urls))
if not records:
rprint('[yellow]No URLs or Crawls provided. Pass URLs as arguments or via stdin.[/yellow]', file=sys.stderr)
@@ -122,47 +105,44 @@ def create_snapshots(
try:
crawl = Crawl.objects.get(id=crawl_id)
except Crawl.DoesNotExist:
# Crawl doesn't exist, create it
crawl = Crawl.from_jsonl(record, overrides={'created_by_id': created_by_id})
crawl = Crawl.from_json(record, overrides={'created_by_id': created_by_id})
else:
# No ID, create new crawl
crawl = Crawl.from_jsonl(record, overrides={'created_by_id': created_by_id})
crawl = Crawl.from_json(record, overrides={'created_by_id': created_by_id})
if not crawl:
continue
# Create snapshots for each URL in the crawl
for url in crawl.get_urls_list():
# Merge CLI tags with crawl tags
merged_tags = crawl.tags_str
if tag:
if merged_tags:
merged_tags = f"{merged_tags},{tag}"
else:
merged_tags = tag
merged_tags = f"{merged_tags},{tag}" if merged_tags else tag
snapshot_record = {
'url': url,
'tags': merged_tags,
'crawl_id': str(crawl.id),
'depth': 0,
'depth': depth,
'status': status,
}
snapshot = Snapshot.from_jsonl(snapshot_record, overrides={'created_by_id': created_by_id})
snapshot = Snapshot.from_json(snapshot_record, overrides={'created_by_id': created_by_id})
if snapshot:
created_snapshots.append(snapshot)
if not is_tty:
write_record(snapshot.to_jsonl())
write_record(snapshot.to_json())
elif record_type == TYPE_SNAPSHOT or record.get('url'):
# Input is a Snapshot or plain URL
# Add tags if provided via CLI
if tag and not record.get('tags'):
record['tags'] = tag
if status:
record['status'] = status
record['depth'] = record.get('depth', depth)
snapshot = Snapshot.from_jsonl(record, overrides={'created_by_id': created_by_id})
snapshot = Snapshot.from_json(record, overrides={'created_by_id': created_by_id})
if snapshot:
created_snapshots.append(snapshot)
if not is_tty:
write_record(snapshot.to_jsonl())
write_record(snapshot.to_json())
except Exception as e:
rprint(f'[red]Error creating snapshot: {e}[/red]', file=sys.stderr)
@@ -174,93 +154,237 @@ def create_snapshots(
rprint(f'[green]Created {len(created_snapshots)} snapshots[/green]', file=sys.stderr)
# If TTY, show human-readable output
if is_tty:
for snapshot in created_snapshots:
rprint(f' [dim]{snapshot.id}[/dim] {snapshot.url[:60]}', file=sys.stderr)
# If --plugins is passed, create ArchiveResults and run the orchestrator
if plugins:
from archivebox.core.models import ArchiveResult
from archivebox.workers.orchestrator import Orchestrator
# Parse comma-separated plugins list
plugins_list = [p.strip() for p in plugins.split(',') if p.strip()]
# Create ArchiveResults for the specific plugins on each snapshot
for snapshot in created_snapshots:
for plugin_name in plugins_list:
result, created = ArchiveResult.objects.get_or_create(
snapshot=snapshot,
plugin=plugin_name,
defaults={
'status': ArchiveResult.StatusChoices.QUEUED,
'retry_at': timezone.now(),
}
)
if not created and result.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]:
# Reset for retry
result.status = ArchiveResult.StatusChoices.QUEUED
result.retry_at = timezone.now()
result.save()
rprint(f'[blue]Running plugins: {plugins}...[/blue]', file=sys.stderr)
orchestrator = Orchestrator(exit_on_idle=True)
orchestrator.runloop()
return 0
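# Usage sketch for the flags above (URL is hypothetical):
#   archivebox snapshot create https://example.com/blog --tag=blog --depth=1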
def is_snapshot_id(value: str) -> bool:
"""Check if value looks like a Snapshot UUID."""
import re
uuid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.I)
if not uuid_pattern.match(value):
return False
# Verify it's actually a Snapshot (not a Crawl or other object)
# =============================================================================
# LIST
# =============================================================================
def list_snapshots(
status: Optional[str] = None,
url__icontains: Optional[str] = None,
url__istartswith: Optional[str] = None,
tag: Optional[str] = None,
crawl_id: Optional[str] = None,
limit: Optional[int] = None,
) -> int:
"""
List Snapshots as JSONL with optional filters.
Exit codes:
0: Success (even if no results)
"""
from archivebox.misc.jsonl import write_record
from archivebox.core.models import Snapshot
return Snapshot.objects.filter(id=value).exists()
is_tty = sys.stdout.isatty()
queryset = Snapshot.objects.all().order_by('-created_at')
# Apply filters
filter_kwargs = {
'status': status,
'url__icontains': url__icontains,
'url__istartswith': url__istartswith,
'crawl_id': crawl_id,
}
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
# Tag filter requires special handling (M2M)
if tag:
queryset = queryset.filter(tags__name__iexact=tag)
count = 0
for snapshot in queryset:
if is_tty:
status_color = {
'queued': 'yellow',
'started': 'blue',
'sealed': 'green',
}.get(snapshot.status, 'dim')
rprint(f'[{status_color}]{snapshot.status:8}[/{status_color}] [dim]{snapshot.id}[/dim] {snapshot.url[:60]}')
else:
write_record(snapshot.to_json())
count += 1
rprint(f'[dim]Listed {count} snapshots[/dim]', file=sys.stderr)
return 0
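# When stdout is piped, each matching Snapshot is emitted as one JSON object per line,
# roughly (field values illustrative):
#   {"type": "Snapshot", "id": "...", "url": "https://example.com", "status": "queued", ...}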
@click.command()
@click.option('--tag', '-t', default='', help='Comma-separated tags to add to each snapshot')
@click.option('--plugins', '-p', default='', help='Comma-separated list of plugins to run after creating snapshots (e.g., screenshot,singlefile)')
@click.argument('args', nargs=-1)
def main(tag: str, plugins: str, args: tuple):
"""Create Snapshots from URLs/Crawls, or process existing Snapshots by ID"""
from archivebox.misc.jsonl import read_args_or_stdin
# =============================================================================
# UPDATE
# =============================================================================
# Read all input
records = list(read_args_or_stdin(args))
def update_snapshots(
status: Optional[str] = None,
tag: Optional[str] = None,
) -> int:
"""
Update Snapshots from stdin JSONL.
Reads Snapshot records from stdin and applies updates.
Uses PATCH semantics - only specified fields are updated.
Exit codes:
0: Success
1: No input or error
"""
from django.utils import timezone
from archivebox.misc.jsonl import read_stdin, write_record
from archivebox.core.models import Snapshot
is_tty = sys.stdout.isatty()
records = list(read_stdin())
if not records:
from rich import print as rprint
rprint('[yellow]No URLs, Crawl IDs, or Snapshot IDs provided. Pass as arguments or via stdin.[/yellow]', file=sys.stderr)
sys.exit(1)
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
return 1
# Check if input looks like existing Snapshot IDs to process
# If ALL inputs are UUIDs with no URL and exist as Snapshots, process them
all_are_snapshot_ids = all(
is_snapshot_id(r.get('id') or r.get('url', ''))
for r in records
if r.get('type') != 'Crawl' # Don't check Crawl records as Snapshot IDs
)
updated_count = 0
for record in records:
snapshot_id = record.get('id')
if not snapshot_id:
continue
# But also check that we're not receiving Crawl JSONL
has_crawl_records = any(r.get('type') == 'Crawl' for r in records)
try:
snapshot = Snapshot.objects.get(id=snapshot_id)
if all_are_snapshot_ids and not has_crawl_records:
# Process existing Snapshots by ID
exit_code = 0
for record in records:
snapshot_id = record.get('id') or record.get('url')
result = process_snapshot_by_id(snapshot_id)
if result != 0:
exit_code = result
sys.exit(exit_code)
else:
# Create new Snapshots from URLs or Crawls
sys.exit(create_snapshots(args, tag=tag, plugins=plugins))
# Apply updates from CLI flags (override stdin values)
if status:
snapshot.status = status
snapshot.retry_at = timezone.now()
if tag:
# Add tag to existing tags
snapshot.save() # Ensure saved before M2M
from archivebox.core.models import Tag
tag_obj, _ = Tag.objects.get_or_create(name=tag)
snapshot.tags.add(tag_obj)
snapshot.save()
updated_count += 1
if not is_tty:
write_record(snapshot.to_json())
except Snapshot.DoesNotExist:
rprint(f'[yellow]Snapshot not found: {snapshot_id}[/yellow]', file=sys.stderr)
continue
rprint(f'[green]Updated {updated_count} snapshots[/green]', file=sys.stderr)
return 0
# =============================================================================
# DELETE
# =============================================================================
def delete_snapshots(yes: bool = False, dry_run: bool = False) -> int:
"""
Delete Snapshots from stdin JSONL.
Requires --yes flag to confirm deletion.
Exit codes:
0: Success
1: No input or missing --yes flag
"""
from archivebox.misc.jsonl import read_stdin
from archivebox.core.models import Snapshot
records = list(read_stdin())
if not records:
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
return 1
snapshot_ids = [r.get('id') for r in records if r.get('id')]
if not snapshot_ids:
rprint('[yellow]No valid snapshot IDs in input[/yellow]', file=sys.stderr)
return 1
snapshots = Snapshot.objects.filter(id__in=snapshot_ids)
count = snapshots.count()
if count == 0:
rprint('[yellow]No matching snapshots found[/yellow]', file=sys.stderr)
return 0
if dry_run:
rprint(f'[yellow]Would delete {count} snapshots (dry run)[/yellow]', file=sys.stderr)
for snapshot in snapshots:
rprint(f' [dim]{snapshot.id}[/dim] {snapshot.url[:60]}', file=sys.stderr)
return 0
if not yes:
rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
return 1
# Perform deletion
deleted_count, _ = snapshots.delete()
rprint(f'[green]Deleted {deleted_count} snapshots[/green]', file=sys.stderr)
return 0
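# Safety sketch: preview a deletion with --dry-run before re-running it with --yes, e.g.:
#   archivebox snapshot list --url__icontains=spam.com | archivebox snapshot delete --dry-run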
# =============================================================================
# CLI Commands
# =============================================================================
@click.group()
def main():
"""Manage Snapshot records."""
pass
@main.command('create')
@click.argument('urls', nargs=-1)
@click.option('--tag', '-t', default='', help='Comma-separated tags to add')
@click.option('--status', '-s', default='queued', help='Initial status (default: queued)')
@click.option('--depth', '-d', type=int, default=0, help='Crawl depth (default: 0)')
def create_cmd(urls: tuple, tag: str, status: str, depth: int):
"""Create Snapshots from URLs or stdin JSONL."""
sys.exit(create_snapshots(urls, tag=tag, status=status, depth=depth))
@main.command('list')
@click.option('--status', '-s', help='Filter by status (queued, started, sealed)')
@click.option('--url__icontains', help='Filter by URL contains')
@click.option('--url__istartswith', help='Filter by URL starts with')
@click.option('--tag', '-t', help='Filter by tag name')
@click.option('--crawl-id', help='Filter by crawl ID')
@click.option('--limit', '-n', type=int, help='Limit number of results')
def list_cmd(status: Optional[str], url__icontains: Optional[str], url__istartswith: Optional[str],
tag: Optional[str], crawl_id: Optional[str], limit: Optional[int]):
"""List Snapshots as JSONL."""
sys.exit(list_snapshots(
status=status,
url__icontains=url__icontains,
url__istartswith=url__istartswith,
tag=tag,
crawl_id=crawl_id,
limit=limit,
))
@main.command('update')
@click.option('--status', '-s', help='Set status')
@click.option('--tag', '-t', help='Add tag')
def update_cmd(status: Optional[str], tag: Optional[str]):
"""Update Snapshots from stdin JSONL."""
sys.exit(update_snapshots(status=status, tag=tag))
@main.command('delete')
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
def delete_cmd(yes: bool, dry_run: bool):
"""Delete Snapshots from stdin JSONL."""
sys.exit(delete_snapshots(yes=yes, dry_run=dry_run))
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,307 @@
#!/usr/bin/env python3
"""
archivebox tag <action> [args...] [--filters]
Manage Tag records.
Actions:
create - Create Tags
list - List Tags as JSONL (with optional filters)
update - Update Tags from stdin JSONL
delete - Delete Tags from stdin JSONL
Examples:
# Create
archivebox tag create news tech science
archivebox tag create "important stuff"
# List
archivebox tag list
archivebox tag list --name__icontains=news
# Update (rename tags)
archivebox tag list --name=oldname | archivebox tag update --name=newname
# Delete
archivebox tag list --name=unused | archivebox tag delete --yes
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox tag'
import sys
from typing import Optional, Iterable
import rich_click as click
from rich import print as rprint
def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
"""Apply Django-style filters from CLI kwargs to a QuerySet."""
filters = {}
for key, value in filter_kwargs.items():
if value is not None and key not in ('limit', 'offset'):
filters[key] = value
if filters:
queryset = queryset.filter(**filters)
if limit:
queryset = queryset[:limit]
return queryset
# =============================================================================
# CREATE
# =============================================================================
def create_tags(names: Iterable[str]) -> int:
"""
Create Tags from names.
Exit codes:
0: Success
1: Failure
"""
from archivebox.misc.jsonl import write_record
from archivebox.core.models import Tag
is_tty = sys.stdout.isatty()
# Convert to list if needed
name_list = list(names) if names else []
if not name_list:
rprint('[yellow]No tag names provided. Pass names as arguments.[/yellow]', file=sys.stderr)
return 1
created_count = 0
for name in name_list:
name = name.strip()
if not name:
continue
tag, created = Tag.objects.get_or_create(name=name)
if not is_tty:
write_record(tag.to_json())
if created:
created_count += 1
rprint(f'[green]Created tag: {name}[/green]', file=sys.stderr)
else:
rprint(f'[dim]Tag already exists: {name}[/dim]', file=sys.stderr)
rprint(f'[green]Created {created_count} new tags[/green]', file=sys.stderr)
return 0
# =============================================================================
# LIST
# =============================================================================
def list_tags(
name: Optional[str] = None,
name__icontains: Optional[str] = None,
limit: Optional[int] = None,
) -> int:
"""
List Tags as JSONL with optional filters.
Exit codes:
0: Success (even if no results)
"""
from archivebox.misc.jsonl import write_record
from archivebox.core.models import Tag
is_tty = sys.stdout.isatty()
queryset = Tag.objects.all().order_by('name')
# Apply filters
filter_kwargs = {
'name': name,
'name__icontains': name__icontains,
}
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
count = 0
for tag in queryset:
snapshot_count = tag.snapshot_set.count()
if is_tty:
rprint(f'[cyan]{tag.name:30}[/cyan] [dim]({snapshot_count} snapshots)[/dim]')
else:
write_record(tag.to_json())
count += 1
rprint(f'[dim]Listed {count} tags[/dim]', file=sys.stderr)
return 0
# =============================================================================
# UPDATE
# =============================================================================
def update_tags(name: Optional[str] = None) -> int:
"""
Update Tags from stdin JSONL.
Reads Tag records from stdin and applies updates.
Uses PATCH semantics - only specified fields are updated.
Exit codes:
0: Success
1: No input or error
"""
from archivebox.misc.jsonl import read_stdin, write_record
from archivebox.core.models import Tag
is_tty = sys.stdout.isatty()
records = list(read_stdin())
if not records:
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
return 1
updated_count = 0
for record in records:
tag_id = record.get('id')
old_name = record.get('name')
if not tag_id and not old_name:
continue
try:
if tag_id:
tag = Tag.objects.get(id=tag_id)
else:
tag = Tag.objects.get(name=old_name)
# Apply updates from CLI flags
if name:
tag.name = name
tag.save()
updated_count += 1
if not is_tty:
write_record(tag.to_json())
except Tag.DoesNotExist:
rprint(f'[yellow]Tag not found: {tag_id or old_name}[/yellow]', file=sys.stderr)
continue
rprint(f'[green]Updated {updated_count} tags[/green]', file=sys.stderr)
return 0
# =============================================================================
# DELETE
# =============================================================================
def delete_tags(yes: bool = False, dry_run: bool = False) -> int:
"""
Delete Tags from stdin JSONL.
Requires --yes flag to confirm deletion.
Exit codes:
0: Success
1: No input or missing --yes flag
"""
from archivebox.misc.jsonl import read_stdin
from archivebox.core.models import Tag
records = list(read_stdin())
if not records:
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
return 1
# Collect tag IDs or names
tag_ids = []
tag_names = []
for r in records:
if r.get('id'):
tag_ids.append(r['id'])
elif r.get('name'):
tag_names.append(r['name'])
if not tag_ids and not tag_names:
rprint('[yellow]No valid tag IDs or names in input[/yellow]', file=sys.stderr)
return 1
from django.db.models import Q
query = Q()
if tag_ids:
query |= Q(id__in=tag_ids)
if tag_names:
query |= Q(name__in=tag_names)
tags = Tag.objects.filter(query)
count = tags.count()
if count == 0:
rprint('[yellow]No matching tags found[/yellow]', file=sys.stderr)
return 0
if dry_run:
rprint(f'[yellow]Would delete {count} tags (dry run)[/yellow]', file=sys.stderr)
for tag in tags:
rprint(f' {tag.name}', file=sys.stderr)
return 0
if not yes:
rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
return 1
# Perform deletion
deleted_count, _ = tags.delete()
rprint(f'[green]Deleted {deleted_count} tags[/green]', file=sys.stderr)
return 0
# =============================================================================
# CLI Commands
# =============================================================================
@click.group()
def main():
"""Manage Tag records."""
pass
@main.command('create')
@click.argument('names', nargs=-1)
def create_cmd(names: tuple):
"""Create Tags from names."""
sys.exit(create_tags(names))
@main.command('list')
@click.option('--name', help='Filter by exact name')
@click.option('--name__icontains', help='Filter by name contains')
@click.option('--limit', '-n', type=int, help='Limit number of results')
def list_cmd(name: Optional[str], name__icontains: Optional[str], limit: Optional[int]):
"""List Tags as JSONL."""
sys.exit(list_tags(name=name, name__icontains=name__icontains, limit=limit))
@main.command('update')
@click.option('--name', '-n', help='Set new name')
def update_cmd(name: Optional[str]):
"""Update Tags from stdin JSONL."""
sys.exit(update_tags(name=name))
@main.command('delete')
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
def delete_cmd(yes: bool, dry_run: bool):
"""Delete Tags from stdin JSONL."""
sys.exit(delete_tags(yes=yes, dry_run=dry_run))
if __name__ == '__main__':
main()

View File

@@ -1,17 +1,18 @@
#!/usr/bin/env python3
"""
Tests for CLI piping workflow: crawl | snapshot | extract
Tests for CLI piping workflow: crawl | snapshot | archiveresult | run
This module tests the JSONL-based piping between CLI commands as described in:
https://github.com/ArchiveBox/ArchiveBox/issues/1363
Workflows tested:
archivebox crawl URL -> Crawl JSONL
archivebox snapshot -> Snapshot JSONL (accepts Crawl or URL input)
archivebox extract -> ArchiveResult JSONL (accepts Snapshot input)
archivebox crawl create URL -> Crawl JSONL
archivebox snapshot create -> Snapshot JSONL (accepts Crawl or URL input)
archivebox archiveresult create -> ArchiveResult JSONL (accepts Snapshot input)
archivebox run -> Process queued records (accepts any JSONL)
Pipeline:
archivebox crawl URL | archivebox snapshot | archivebox extract
archivebox crawl create URL | archivebox snapshot create | archivebox archiveresult create | archivebox run
Each command should:
- Accept URLs, IDs, or JSONL as input (args or stdin)
@@ -154,13 +155,13 @@ class TestJSONLParsing(unittest.TestCase):
class TestJSONLOutput(unittest.TestCase):
"""Test JSONL output formatting."""
def test_crawl_to_jsonl(self):
"""Crawl model should serialize to JSONL correctly."""
def test_crawl_to_json(self):
"""Crawl model should serialize to JSON correctly."""
from archivebox.misc.jsonl import TYPE_CRAWL
# Create a mock crawl with to_jsonl method configured
# Create a mock crawl with to_json method configured
mock_crawl = MagicMock()
mock_crawl.to_jsonl.return_value = {
mock_crawl.to_json.return_value = {
'type': TYPE_CRAWL,
'schema_version': '0.9.0',
'id': 'test-crawl-uuid',
@@ -172,7 +173,7 @@ class TestJSONLOutput(unittest.TestCase):
'created_at': None,
}
result = mock_crawl.to_jsonl()
result = mock_crawl.to_json()
self.assertEqual(result['type'], TYPE_CRAWL)
self.assertEqual(result['id'], 'test-crawl-uuid')
self.assertEqual(result['urls'], 'https://example.com')
@@ -351,8 +352,8 @@ class TestSnapshotCommand(unittest.TestCase):
# using real Snapshot instances.
class TestExtractCommand(unittest.TestCase):
"""Unit tests for archivebox extract command."""
class TestArchiveResultCommand(unittest.TestCase):
"""Unit tests for archivebox archiveresult command."""
def setUp(self):
"""Set up test environment."""
@@ -363,8 +364,8 @@ class TestExtractCommand(unittest.TestCase):
"""Clean up test environment."""
shutil.rmtree(self.test_dir, ignore_errors=True)
def test_extract_accepts_snapshot_id(self):
"""extract should accept snapshot IDs as input."""
def test_archiveresult_accepts_snapshot_id(self):
"""archiveresult should accept snapshot IDs as input."""
from archivebox.misc.jsonl import read_args_or_stdin
uuid = '01234567-89ab-cdef-0123-456789abcdef'
@@ -374,8 +375,8 @@ class TestExtractCommand(unittest.TestCase):
self.assertEqual(len(records), 1)
self.assertEqual(records[0]['id'], uuid)
def test_extract_accepts_jsonl_snapshot(self):
"""extract should accept JSONL Snapshot records."""
def test_archiveresult_accepts_jsonl_snapshot(self):
"""archiveresult should accept JSONL Snapshot records."""
from archivebox.misc.jsonl import read_args_or_stdin, TYPE_SNAPSHOT
stdin = StringIO('{"type": "Snapshot", "id": "abc123", "url": "https://example.com"}\n')
@@ -387,8 +388,8 @@ class TestExtractCommand(unittest.TestCase):
self.assertEqual(records[0]['type'], TYPE_SNAPSHOT)
self.assertEqual(records[0]['id'], 'abc123')
def test_extract_gathers_snapshot_ids(self):
"""extract should gather snapshot IDs from various input formats."""
def test_archiveresult_gathers_snapshot_ids(self):
"""archiveresult should gather snapshot IDs from various input formats."""
from archivebox.misc.jsonl import TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
records = [
@@ -529,7 +530,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
# Create crawl with multiple URLs (as newline-separated string)
urls = 'https://test-crawl-1.example.com\nhttps://test-crawl-2.example.com'
crawl = Crawl.from_jsonl({'urls': urls}, overrides={'created_by_id': created_by_id})
crawl = Crawl.from_json({'urls': urls}, overrides={'created_by_id': created_by_id})
self.assertIsNotNone(crawl)
self.assertIsNotNone(crawl.id)
@@ -543,7 +544,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
self.assertIn('https://test-crawl-2.example.com', urls_list)
# Verify output format
output = crawl.to_jsonl()
output = crawl.to_json()
self.assertEqual(output['type'], TYPE_CRAWL)
self.assertIn('id', output)
self.assertEqual(output['urls'], urls)
@@ -566,8 +567,8 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
# Step 1: Create crawl (simulating 'archivebox crawl')
urls = 'https://crawl-to-snap-1.example.com\nhttps://crawl-to-snap-2.example.com'
crawl = Crawl.from_jsonl({'urls': urls}, overrides={'created_by_id': created_by_id})
crawl_output = crawl.to_jsonl()
crawl = Crawl.from_json({'urls': urls}, overrides={'created_by_id': created_by_id})
crawl_output = crawl.to_json()
# Step 2: Parse crawl output as snapshot input
stdin = StringIO(json.dumps(crawl_output) + '\n')
@@ -581,7 +582,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
# Step 3: Create snapshots from crawl URLs
created_snapshots = []
for url in crawl.get_urls_list():
snapshot = Snapshot.from_jsonl({'url': url}, overrides={'created_by_id': created_by_id})
snapshot = Snapshot.from_json({'url': url}, overrides={'created_by_id': created_by_id})
if snapshot:
created_snapshots.append(snapshot)
@@ -589,7 +590,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
# Verify snapshot output
for snapshot in created_snapshots:
output = snapshot.to_jsonl()
output = snapshot.to_json()
self.assertEqual(output['type'], TYPE_SNAPSHOT)
self.assertIn(output['url'], [
'https://crawl-to-snap-1.example.com',
@@ -619,13 +620,13 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
# Create snapshot
overrides = {'created_by_id': created_by_id}
snapshot = Snapshot.from_jsonl(records[0], overrides=overrides)
snapshot = Snapshot.from_json(records[0], overrides=overrides)
self.assertIsNotNone(snapshot.id)
self.assertEqual(snapshot.url, url)
# Verify output format
output = snapshot.to_jsonl()
output = snapshot.to_json()
self.assertEqual(output['type'], TYPE_SNAPSHOT)
self.assertIn('id', output)
self.assertEqual(output['url'], url)
@@ -647,8 +648,8 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
# Step 1: Create snapshot (simulating 'archivebox snapshot')
url = 'https://test-extract-1.example.com'
overrides = {'created_by_id': created_by_id}
snapshot = Snapshot.from_jsonl({'url': url}, overrides=overrides)
snapshot_output = snapshot.to_jsonl()
snapshot = Snapshot.from_json({'url': url}, overrides=overrides)
snapshot_output = snapshot.to_json()
# Step 2: Parse snapshot output as extract input
stdin = StringIO(json.dumps(snapshot_output) + '\n')
@@ -686,8 +687,8 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
# === archivebox crawl https://example.com ===
url = 'https://test-pipeline-full.example.com'
crawl = Crawl.from_jsonl({'url': url}, overrides={'created_by_id': created_by_id})
crawl_jsonl = json.dumps(crawl.to_jsonl())
crawl = Crawl.from_json({'url': url}, overrides={'created_by_id': created_by_id})
crawl_jsonl = json.dumps(crawl.to_json())
# === | archivebox snapshot ===
stdin = StringIO(crawl_jsonl + '\n')
@@ -705,7 +706,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
if crawl_id:
db_crawl = Crawl.objects.get(id=crawl_id)
for crawl_url in db_crawl.get_urls_list():
snapshot = Snapshot.from_jsonl({'url': crawl_url}, overrides={'created_by_id': created_by_id})
snapshot = Snapshot.from_json({'url': crawl_url}, overrides={'created_by_id': created_by_id})
if snapshot:
created_snapshots.append(snapshot)
@@ -713,7 +714,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
self.assertEqual(created_snapshots[0].url, url)
# === | archivebox extract ===
snapshot_jsonl_lines = [json.dumps(s.to_jsonl()) for s in created_snapshots]
snapshot_jsonl_lines = [json.dumps(s.to_json()) for s in created_snapshots]
stdin = StringIO('\n'.join(snapshot_jsonl_lines) + '\n')
stdin.isatty = lambda: False
@@ -757,12 +758,12 @@ class TestDepthWorkflows(unittest.TestCase):
# Create crawl with depth 0
url = 'https://depth0-test.example.com'
crawl = Crawl.from_jsonl({'url': url, 'max_depth': 0}, overrides={'created_by_id': created_by_id})
crawl = Crawl.from_json({'url': url, 'max_depth': 0}, overrides={'created_by_id': created_by_id})
self.assertEqual(crawl.max_depth, 0)
# Create snapshot
snapshot = Snapshot.from_jsonl({'url': url}, overrides={'created_by_id': created_by_id})
snapshot = Snapshot.from_json({'url': url}, overrides={'created_by_id': created_by_id})
self.assertEqual(snapshot.url, url)
def test_depth_metadata_in_crawl(self):
@@ -773,7 +774,7 @@ class TestDepthWorkflows(unittest.TestCase):
created_by_id = get_or_create_system_user_pk()
# Create crawl with depth
crawl = Crawl.from_jsonl(
crawl = Crawl.from_json(
{'url': 'https://depth-meta-test.example.com', 'max_depth': 2},
overrides={'created_by_id': created_by_id}
)
@@ -781,7 +782,7 @@ class TestDepthWorkflows(unittest.TestCase):
self.assertEqual(crawl.max_depth, 2)
# Verify in JSONL output
output = crawl.to_jsonl()
output = crawl.to_json()
self.assertEqual(output['max_depth'], 2)

View File

@@ -158,7 +158,7 @@ class AddLinkForm(forms.Form):
'search_backend_ripgrep', 'search_backend_sonic', 'search_backend_sqlite'
}
binary = {'apt', 'brew', 'custom', 'env', 'npm', 'pip'}
extensions = {'captcha2', 'istilldontcareaboutcookies', 'ublock'}
extensions = {'twocaptcha', 'istilldontcareaboutcookies', 'ublock'}
# Populate plugin field choices
self.fields['chrome_plugins'].choices = [

View File

@@ -1,6 +1,6 @@
__package__ = 'archivebox.core'
from typing import Optional, Dict, Iterable, Any, List, TYPE_CHECKING
from typing import Optional, Dict, Iterable, Any, List, TYPE_CHECKING, Iterator, Set
from archivebox.uuid_compat import uuid7
from datetime import datetime, timedelta
from django_stubs_ext.db.models import TypedModelMeta
@@ -41,6 +41,8 @@ from archivebox.machine.models import NetworkInterface, Binary
class Tag(ModelWithSerializers):
JSONL_TYPE = 'Tag'
# Keep AutoField for compatibility with main branch migrations
# Don't use UUIDField here - requires complex FK transformation
id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
@@ -91,26 +93,66 @@ class Tag(ModelWithSerializers):
def api_url(self) -> str:
return reverse_lazy('api-1:get_tag', args=[self.id])
def to_jsonl(self) -> dict:
def to_json(self) -> dict:
"""
Convert Tag model instance to a JSONL record.
Convert Tag model instance to a JSON-serializable dict.
"""
from archivebox.config import VERSION
return {
'type': 'Tag',
'type': self.JSONL_TYPE,
'schema_version': VERSION,
'id': str(self.id),
'name': self.name,
'slug': self.slug,
}
@staticmethod
def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None):
def to_jsonl(self, seen: Set[tuple] = None, **kwargs) -> Iterator[dict]:
"""
Create/update Tag from JSONL record.
Yield this Tag as a JSON record.
Args:
record: JSONL record with 'name' field
seen: Set of (type, id) tuples already emitted (for deduplication)
**kwargs: Passed to children (none for Tag, leaf node)
Yields:
dict: JSON-serializable record for this tag
"""
if seen is not None:
key = (self.JSONL_TYPE, str(self.id))
if key in seen:
return
seen.add(key)
yield self.to_json()
@classmethod
def from_jsonl(cls, records, overrides: Dict[str, Any] = None) -> list['Tag']:
"""
Create/update Tags from an iterable of JSONL records.
Filters to only records with type='Tag'.
Args:
records: Iterable of dicts (JSONL records)
overrides: Optional dict with 'snapshot' to auto-attach tags
Returns:
List of Tag instances (skips None results)
"""
results = []
for record in records:
record_type = record.get('type', cls.JSONL_TYPE)
if record_type == cls.JSONL_TYPE:
instance = cls.from_json(record, overrides=overrides)
if instance:
results.append(instance)
return results
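# Illustrative round-trip (assumes Django is already set up): non-Tag records are ignored,
# so a mixed JSONL stream can be passed straight through:
#   tags = Tag.from_jsonl([{'type': 'Tag', 'name': 'news'}, {'type': 'Snapshot', 'url': '...'}])
#   # -> a list containing only the 'news' Tag instance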
@staticmethod
def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None) -> 'Tag | None':
"""
Create/update a single Tag from a JSON record dict.
Args:
record: Dict with 'name' field
overrides: Optional dict with 'snapshot' to auto-attach tag
Returns:
@@ -289,6 +331,8 @@ class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)):
class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
JSONL_TYPE = 'Snapshot'
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
created_at = models.DateTimeField(default=timezone.now, db_index=True)
modified_at = models.DateTimeField(auto_now=True)
@@ -968,38 +1012,18 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
Each line is a JSON record with a 'type' field:
- Snapshot: snapshot metadata (crawl_id, url, tags, etc.)
- ArchiveResult: extractor results (plugin, status, output, etc.)
- Binary: binary info used for the extraction
- Process: process execution details (cmd, exit_code, timing, etc.)
- ArchiveResult: extractor results (plugin, status, output, etc.)
"""
import json
index_path = Path(self.output_dir) / CONSTANTS.JSONL_INDEX_FILENAME
index_path.parent.mkdir(parents=True, exist_ok=True)
# Track unique binaries and processes to avoid duplicates
binaries_seen = set()
processes_seen = set()
with open(index_path, 'w') as f:
# Write Snapshot record first (to_jsonl includes crawl_id, fs_version)
f.write(json.dumps(self.to_jsonl()) + '\n')
# Write ArchiveResult records with their associated Binary and Process
# Use select_related to optimize queries
for ar in self.archiveresult_set.select_related('process__binary').order_by('start_ts'):
# Write Binary record if not already written
if ar.process and ar.process.binary and ar.process.binary_id not in binaries_seen:
binaries_seen.add(ar.process.binary_id)
f.write(json.dumps(ar.process.binary.to_jsonl()) + '\n')
# Write Process record if not already written
if ar.process and ar.process_id not in processes_seen:
processes_seen.add(ar.process_id)
f.write(json.dumps(ar.process.to_jsonl()) + '\n')
# Write ArchiveResult record
f.write(json.dumps(ar.to_jsonl()) + '\n')
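# to_jsonl() yields the Snapshot record first, then its ArchiveResults and their related
# Process/Binary records, deduplicated via the shared `seen` set.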
for record in self.to_jsonl():
f.write(json.dumps(record) + '\n')
def read_index_jsonl(self) -> dict:
"""
@@ -1420,14 +1444,14 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
return False
def to_jsonl(self) -> dict:
def to_json(self) -> dict:
"""
Convert Snapshot model instance to a JSONL record.
Convert Snapshot model instance to a JSON-serializable dict.
Includes all fields needed to fully reconstruct/identify this snapshot.
"""
from archivebox.config import VERSION
return {
'type': 'Snapshot',
'type': self.JSONL_TYPE,
'schema_version': VERSION,
'id': str(self.id),
'crawl_id': str(self.crawl_id),
@@ -1442,12 +1466,68 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
'fs_version': self.fs_version,
}
@staticmethod
def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None, queue_for_extraction: bool = True):
def to_jsonl(self, seen: Set[tuple] = None, archiveresult: bool = True, process: bool = True, binary: bool = True, machine: bool = False, iface: bool = False, **kwargs) -> Iterator[dict]:
"""
Create/update Snapshot from JSONL record or dict.
Yield this Snapshot and optionally related objects as JSON records.
Unified method that handles:
Uses select_related for efficient querying. Deduplicates automatically.
Args:
seen: Set of (type, id) tuples already emitted (for deduplication)
archiveresult: Include related ArchiveResults (default: True)
process: Include Process for each ArchiveResult (default: True)
binary: Include Binary for each Process (default: True)
machine: Include Machine for each Process (default: False)
iface: Include NetworkInterface for each Process (default: False)
**kwargs: Additional options passed to children
Yields:
dict: JSON-serializable records
"""
if seen is None:
seen = set()
key = (self.JSONL_TYPE, str(self.id))
if key in seen:
return
seen.add(key)
yield self.to_json()
if archiveresult:
# Use select_related to optimize queries
for ar in self.archiveresult_set.select_related('process__binary').order_by('start_ts'):
yield from ar.to_jsonl(seen=seen, process=process, binary=binary, machine=machine, iface=iface, **kwargs)
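# Minimal usage sketch (assumes `snap` is a saved Snapshot instance):
#   import json
#   for record in snap.to_jsonl():
#       print(json.dumps(record))  # one JSON object per line, Snapshot record first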
@classmethod
def from_jsonl(cls, records, overrides: Dict[str, Any] = None, queue_for_extraction: bool = True) -> list['Snapshot']:
"""
Create/update Snapshots from an iterable of JSONL records.
Filters to only records with type='Snapshot' (or no type).
Args:
records: Iterable of dicts (JSONL records)
overrides: Dict with 'crawl', 'snapshot' (parent), 'created_by_id'
queue_for_extraction: If True, sets status=QUEUED and retry_at (default: True)
Returns:
List of Snapshot instances (skips None results)
"""
results = []
for record in records:
record_type = record.get('type', cls.JSONL_TYPE)
if record_type == cls.JSONL_TYPE:
instance = cls.from_json(record, overrides=overrides, queue_for_extraction=queue_for_extraction)
if instance:
results.append(instance)
return results
@staticmethod
def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None, queue_for_extraction: bool = True) -> 'Snapshot | None':
"""
Create/update a single Snapshot from a JSON record dict.
Handles:
- ID-based patching: {"id": "...", "title": "new title"}
- URL-based create/update: {"url": "...", "title": "...", "tags": "..."}
- Auto-creates Crawl if not provided
@@ -2054,8 +2134,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
result['canonical'] = self.canonical_outputs()
return result
def to_json(self, indent: int = 4) -> str:
"""Convert to JSON string"""
def to_json_str(self, indent: int = 4) -> str:
"""Convert to JSON string for file output."""
return to_json(self.to_dict(extended=True), indent=indent)
def to_csv(self, cols: Optional[List[str]] = None, separator: str = ',', ljust: int = 0) -> str:
@@ -2203,6 +2283,8 @@ class SnapshotMachine(BaseStateMachine, strict_states=True):
class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
JSONL_TYPE = 'ArchiveResult'
class StatusChoices(models.TextChoices):
QUEUED = 'queued', 'Queued'
STARTED = 'started', 'Started'
@@ -2274,13 +2356,13 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
"""Convenience property to access the user who created this archive result via its snapshot's crawl."""
return self.snapshot.crawl.created_by
def to_jsonl(self) -> dict:
def to_json(self) -> dict:
"""
Convert ArchiveResult model instance to a JSONL record.
Convert ArchiveResult model instance to a JSON-serializable dict.
"""
from archivebox.config import VERSION
record = {
'type': 'ArchiveResult',
'type': self.JSONL_TYPE,
'schema_version': VERSION,
'id': str(self.id),
'snapshot_id': str(self.snapshot_id),
@@ -2308,6 +2390,31 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
record['process_id'] = str(self.process_id)
return record
def to_jsonl(self, seen: Set[tuple] = None, process: bool = True, **kwargs) -> Iterator[dict]:
"""
Yield this ArchiveResult and optionally related objects as JSON records.
Args:
seen: Set of (type, id) tuples already emitted (for deduplication)
process: Include related Process and its children (default: True)
**kwargs: Passed to Process.to_jsonl() (e.g., binary=True, machine=False)
Yields:
dict: JSON-serializable records
"""
if seen is None:
seen = set()
key = (self.JSONL_TYPE, str(self.id))
if key in seen:
return
seen.add(key)
yield self.to_json()
if process and self.process:
yield from self.process.to_jsonl(seen=seen, **kwargs)
def save(self, *args, **kwargs):
is_new = self._state.adding

View File

@@ -1,6 +1,6 @@
__package__ = 'archivebox.crawls'
from typing import TYPE_CHECKING, Iterable
from typing import TYPE_CHECKING, Iterable, Iterator, Set
from datetime import timedelta
from archivebox.uuid_compat import uuid7
from pathlib import Path
@@ -59,6 +59,8 @@ class CrawlSchedule(ModelWithSerializers, ModelWithNotes, ModelWithHealthStats):
class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWithStateMachine):
JSONL_TYPE = 'Crawl'
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
created_at = models.DateTimeField(default=timezone.now, db_index=True)
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False)
@@ -134,13 +136,13 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
def api_url(self) -> str:
return reverse_lazy('api-1:get_crawl', args=[self.id])
def to_jsonl(self) -> dict:
def to_json(self) -> dict:
"""
Convert Crawl model instance to a JSONL record.
Convert Crawl model instance to a JSON-serializable dict.
"""
from archivebox.config import VERSION
return {
'type': 'Crawl',
'type': self.JSONL_TYPE,
'schema_version': VERSION,
'id': str(self.id),
'urls': self.urls,
@@ -151,10 +153,63 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
'created_at': self.created_at.isoformat() if self.created_at else None,
}
@staticmethod
def from_jsonl(record: dict, overrides: dict = None):
def to_jsonl(self, seen: Set[tuple] = None, snapshot: bool = True, archiveresult: bool = True, process: bool = True, binary: bool = True, machine: bool = False, iface: bool = False, **kwargs) -> Iterator[dict]:
"""
Create or get a Crawl from a JSONL record.
Yield this Crawl and optionally related objects as JSON records.
Args:
seen: Set of (type, id) tuples already emitted (for deduplication)
snapshot: Include related Snapshots (default: True)
archiveresult: Include ArchiveResults for each Snapshot (default: True)
process: Include Process for each ArchiveResult (default: True)
binary: Include Binary for each Process (default: True)
machine: Include Machine for each Process (default: False)
iface: Include NetworkInterface for each Process (default: False)
**kwargs: Additional options passed to children
Yields:
dict: JSON-serializable records
"""
if seen is None:
seen = set()
key = (self.JSONL_TYPE, str(self.id))
if key in seen:
return
seen.add(key)
yield self.to_json()
if snapshot:
for snap in self.snapshot_set.all():
yield from snap.to_jsonl(seen=seen, archiveresult=archiveresult, process=process, binary=binary, machine=machine, iface=iface, **kwargs)
@classmethod
def from_jsonl(cls, records, overrides: dict = None) -> list['Crawl']:
"""
Create/update Crawls from an iterable of JSONL records.
Filters to only records with type='Crawl' (or no type).
Args:
records: Iterable of dicts (JSONL records)
overrides: Dict of field overrides (e.g., created_by_id)
Returns:
List of Crawl instances (skips None results)
"""
results = []
for record in records:
record_type = record.get('type', cls.JSONL_TYPE)
if record_type == cls.JSONL_TYPE:
instance = cls.from_json(record, overrides=overrides)
if instance:
results.append(instance)
return results
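# Illustrative: rebuild Crawls from a piped JSONL stream (read_stdin comes from
# archivebox.misc.jsonl; `user_id` is a placeholder; other record types are skipped
# by the type filter above):
#   crawls = Crawl.from_jsonl(read_stdin(), overrides={'created_by_id': user_id})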
@staticmethod
def from_json(record: dict, overrides: dict = None) -> 'Crawl | None':
"""
Create or get a single Crawl from a JSON record dict.
Args:
record: Dict with 'urls' (required), optional 'max_depth', 'tags_str', 'label'

View File

@@ -1176,7 +1176,9 @@ def create_model_record(record: Dict[str, Any]) -> Any:
def process_hook_records(records: List[Dict[str, Any]], overrides: Dict[str, Any] = None) -> Dict[str, int]:
"""
Process JSONL records from hook output.
Dispatches to Model.from_jsonl() for each record type.
Uses Model.from_jsonl() which automatically filters by JSONL_TYPE.
Each model only processes records matching its type.
Args:
records: List of JSONL record dicts from result['records']
@@ -1185,54 +1187,26 @@ def process_hook_records(records: List[Dict[str, Any]], overrides: Dict[str, Any
Returns:
Dict with counts by record type
"""
stats = {}
from archivebox.core.models import Snapshot, Tag
from archivebox.machine.models import Binary, Machine
overrides = overrides or {}
for record in records:
record_type = record.get('type')
if not record_type:
continue
# Filter out ArchiveResult records (they update the calling AR, not create new ones)
filtered_records = [r for r in records if r.get('type') != 'ArchiveResult']
# Skip ArchiveResult records (they update the calling ArchiveResult, not create new ones)
if record_type == 'ArchiveResult':
continue
# Each model's from_jsonl() filters to only its own type
snapshots = Snapshot.from_jsonl(filtered_records, overrides)
tags = Tag.from_jsonl(filtered_records, overrides)
binaries = Binary.from_jsonl(filtered_records, overrides)
machines = Machine.from_jsonl(filtered_records, overrides)
try:
# Dispatch to appropriate model's from_jsonl() method
if record_type == 'Snapshot':
from archivebox.core.models import Snapshot
obj = Snapshot.from_jsonl(record.copy(), overrides)
if obj:
stats['Snapshot'] = stats.get('Snapshot', 0) + 1
elif record_type == 'Tag':
from archivebox.core.models import Tag
obj = Tag.from_jsonl(record.copy(), overrides)
if obj:
stats['Tag'] = stats.get('Tag', 0) + 1
elif record_type == 'Binary':
from archivebox.machine.models import Binary
obj = Binary.from_jsonl(record.copy(), overrides)
if obj:
stats['Binary'] = stats.get('Binary', 0) + 1
elif record_type == 'Machine':
from archivebox.machine.models import Machine
obj = Machine.from_jsonl(record.copy(), overrides)
if obj:
stats['Machine'] = stats.get('Machine', 0) + 1
else:
import sys
print(f"Warning: Unknown record type '{record_type}' from hook output", file=sys.stderr)
except Exception as e:
import sys
print(f"Warning: Failed to create {record_type}: {e}", file=sys.stderr)
continue
return stats
return {
'Snapshot': len(snapshots),
'Tag': len(tags),
'Binary': len(binaries),
'Machine': len(machines),
}
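# Illustrative return value (counts depend entirely on the hook output):
#   process_hook_records(result['records'], overrides={'snapshot': snapshot})
#   # -> {'Snapshot': 2, 'Tag': 3, 'Binary': 1, 'Machine': 0}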
def process_is_alive(pid_file: Path) -> bool:

View File

@@ -1,6 +1,7 @@
__package__ = 'archivebox.machine'
import socket
from typing import Iterator, Set
from archivebox.uuid_compat import uuid7
from datetime import timedelta
@@ -29,6 +30,8 @@ class MachineManager(models.Manager):
class Machine(ModelWithHealthStats):
JSONL_TYPE = 'Machine'
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
created_at = models.DateTimeField(default=timezone.now, db_index=True)
modified_at = models.DateTimeField(auto_now=True)
@@ -69,13 +72,35 @@ class Machine(ModelWithHealthStats):
)
return _CURRENT_MACHINE
@staticmethod
def from_jsonl(record: dict, overrides: dict = None):
@classmethod
def from_jsonl(cls, records, overrides: dict = None) -> list['Machine']:
"""
Update Machine config from JSONL record.
Update Machine configs from an iterable of JSONL records.
Filters to only records with type='Machine'.
Args:
record: JSONL record with '_method': 'update', 'key': '...', 'value': '...'
records: Iterable of dicts (JSONL records)
overrides: Not used
Returns:
List of Machine instances (skips None results)
"""
results = []
for record in records:
record_type = record.get('type', cls.JSONL_TYPE)
if record_type == cls.JSONL_TYPE:
instance = cls.from_json(record, overrides=overrides)
if instance:
results.append(instance)
return results
@staticmethod
def from_json(record: dict, overrides: dict = None) -> 'Machine | None':
"""
Update a single Machine config from a JSON record dict.
Args:
record: Dict with '_method': 'update', 'key': '...', 'value': '...'
overrides: Not used
Returns:
@@ -94,6 +119,44 @@ class Machine(ModelWithHealthStats):
return machine
return None
def to_json(self) -> dict:
"""
Convert Machine model instance to a JSON-serializable dict.
"""
from archivebox.config import VERSION
return {
'type': self.JSONL_TYPE,
'schema_version': VERSION,
'id': str(self.id),
'guid': self.guid,
'hostname': self.hostname,
'hw_in_docker': self.hw_in_docker,
'hw_in_vm': self.hw_in_vm,
'os_arch': self.os_arch,
'os_family': self.os_family,
'os_platform': self.os_platform,
'os_release': self.os_release,
'created_at': self.created_at.isoformat() if self.created_at else None,
}
def to_jsonl(self, seen: Set[tuple] = None, **kwargs) -> Iterator[dict]:
"""
Yield this Machine as a JSON record.
Args:
seen: Set of (type, id) tuples already emitted (for deduplication)
**kwargs: Passed to children (none for Machine, leaf node)
Yields:
dict: JSON-serializable record for this machine
"""
if seen is not None:
key = (self.JSONL_TYPE, str(self.id))
if key in seen:
return
seen.add(key)
yield self.to_json()
class NetworkInterfaceManager(models.Manager):
def current(self) -> 'NetworkInterface':
@@ -101,6 +164,8 @@ class NetworkInterfaceManager(models.Manager):
class NetworkInterface(ModelWithHealthStats):
JSONL_TYPE = 'NetworkInterface'
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
created_at = models.DateTimeField(default=timezone.now, db_index=True)
modified_at = models.DateTimeField(auto_now=True)
@@ -139,6 +204,46 @@ class NetworkInterface(ModelWithHealthStats):
)
return _CURRENT_INTERFACE
def to_json(self) -> dict:
"""
Convert NetworkInterface model instance to a JSON-serializable dict.
"""
from archivebox.config import VERSION
return {
'type': self.JSONL_TYPE,
'schema_version': VERSION,
'id': str(self.id),
'machine_id': str(self.machine_id),
'hostname': self.hostname,
'iface': self.iface,
'ip_public': self.ip_public,
'ip_local': self.ip_local,
'mac_address': self.mac_address,
'dns_server': self.dns_server,
'isp': self.isp,
'city': self.city,
'region': self.region,
'country': self.country,
'created_at': self.created_at.isoformat() if self.created_at else None,
}
def to_jsonl(self, seen: Set[tuple] = None, **kwargs) -> Iterator[dict]:
"""
Yield this NetworkInterface as a JSON record.
Args:
seen: Set of (type, id) tuples already emitted (for deduplication)
**kwargs: Passed to children (none for NetworkInterface, leaf node)
Yields:
dict: JSON-serializable record for this network interface
"""
if seen is not None:
key = (self.JSONL_TYPE, str(self.id))
if key in seen:
return
seen.add(key)
yield self.to_json()
class BinaryManager(models.Manager):
@@ -165,7 +270,7 @@ class BinaryManager(models.Manager):
class Binary(ModelWithHealthStats):
"""
Tracks an binary on a specific machine.
Tracks a binary on a specific machine.
Follows the unified state machine pattern:
- queued: Binary needs to be installed
@@ -176,6 +281,7 @@ class Binary(ModelWithHealthStats):
State machine calls run() which executes on_Binary__install_* hooks
to install the binary using the specified providers.
"""
JSONL_TYPE = 'Binary'
class StatusChoices(models.TextChoices):
QUEUED = 'queued', 'Queued'
@@ -242,13 +348,13 @@ class Binary(ModelWithHealthStats):
'is_valid': self.is_valid,
}
def to_jsonl(self) -> dict:
def to_json(self) -> dict:
"""
Convert Binary model instance to a JSONL record.
Convert Binary model instance to a JSON-serializable dict.
"""
from archivebox.config import VERSION
return {
'type': 'Binary',
'type': self.JSONL_TYPE,
'schema_version': VERSION,
'id': str(self.id),
'machine_id': str(self.machine_id),
@@ -260,17 +366,57 @@ class Binary(ModelWithHealthStats):
'status': self.status,
}
@staticmethod
def from_jsonl(record: dict, overrides: dict = None):
def to_jsonl(self, seen: Set[tuple] = None, **kwargs) -> Iterator[dict]:
"""
Create/update Binary from JSONL record.
Yield this Binary as a JSON record.
Args:
seen: Set of (type, id) tuples already emitted (for deduplication)
**kwargs: Passed to children (none for Binary, leaf node)
Yields:
dict: JSON-serializable record for this binary
"""
if seen is not None:
key = (self.JSONL_TYPE, str(self.id))
if key in seen:
return
seen.add(key)
yield self.to_json()
@classmethod
def from_jsonl(cls, records, overrides: dict = None) -> list['Binary']:
"""
Create/update Binaries from an iterable of JSONL records.
Only records with type='Binary' are applied; all other records are skipped.
Args:
records: Iterable of dicts (JSONL records)
overrides: Not used
Returns:
List of Binary instances (skips None results)
"""
results = []
for record in records:
record_type = record.get('type', cls.JSONL_TYPE)
if record_type == cls.JSONL_TYPE:
instance = cls.from_json(record, overrides=overrides)
if instance:
results.append(instance)
return results
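# Usage sketch (illustration only, not part of this commit): from_jsonl() accepts a mixed
# stream of records (e.g. parsed hook output or JSONL piped in from another command) and
# only applies those whose 'type' is 'Binary', so callers don't need to pre-filter.
def _example_apply_binary_records():
    import json, sys
    records = (json.loads(line) for line in sys.stdin if line.strip())
    for binary in Binary.from_jsonl(records):
        print(binary.id, binary.status)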
@staticmethod
def from_json(record: dict, overrides: dict = None) -> 'Binary | None':
"""
Create/update a single Binary from a JSON record dict.
Handles two cases:
1. From binaries.jsonl: creates queued binary with name, binproviders, overrides
2. From hook output: updates binary with abspath, version, sha256, binprovider
Args:
record: JSONL record with 'name' and either:
record: Dict with 'name' and either:
- 'binproviders', 'overrides' (from binaries.jsonl)
- 'abspath', 'version', 'sha256', 'binprovider' (from hook output)
overrides: Not used
@@ -494,6 +640,7 @@ class Process(ModelWithHealthStats):
State machine calls launch() to spawn the process and monitors its lifecycle.
"""
JSONL_TYPE = 'Process'
class StatusChoices(models.TextChoices):
QUEUED = 'queued', 'Queued'
@@ -624,13 +771,13 @@ class Process(ModelWithHealthStats):
return self.archiveresult.hook_name
return ''
def to_jsonl(self) -> dict:
def to_json(self) -> dict:
"""
Convert Process model instance to a JSONL record.
Convert Process model instance to a JSON-serializable dict.
"""
from archivebox.config import VERSION
record = {
'type': 'Process',
'type': self.JSONL_TYPE,
'schema_version': VERSION,
'id': str(self.id),
'machine_id': str(self.machine_id),
@@ -650,6 +797,37 @@ class Process(ModelWithHealthStats):
record['timeout'] = self.timeout
return record
def to_jsonl(self, seen: Set[tuple] = None, binary: bool = True, machine: bool = False, iface: bool = False, **kwargs) -> Iterator[dict]:
"""
Yield this Process and optionally related objects as JSON records.
Args:
seen: Set of (type, id) tuples already emitted (for deduplication)
binary: Include related Binary (default: True)
machine: Include related Machine (default: False)
iface: Include related NetworkInterface (default: False)
**kwargs: Passed to children
Yields:
dict: JSON-serializable records
"""
if seen is None:
seen = set()
key = (self.JSONL_TYPE, str(self.id))
if key in seen:
return
seen.add(key)
yield self.to_json()
if binary and self.binary:
yield from self.binary.to_jsonl(seen=seen, **kwargs)
if machine and self.machine:
yield from self.machine.to_jsonl(seen=seen, **kwargs)
if iface and self.iface:
yield from self.iface.to_jsonl(seen=seen, **kwargs)
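# Usage sketch (illustration only, not part of this commit; assumes an ArchiveBox/Django
# shell): related records are opted into per call, and the shared `seen` set keeps the
# same Binary/Machine from being emitted twice across multiple processes.
def _example_dump_processes_jsonl():
    import json
    seen = set()
    for process in Process.objects.all():
        # each Process yields its own record first, then any requested related records
        for record in process.to_jsonl(seen=seen, binary=True, machine=True):
            print(json.dumps(record))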
def update_and_requeue(self, **kwargs):
"""
Update process fields and requeue for worker state machine.

View File

@@ -24,7 +24,7 @@ __package__ = 'archivebox.misc'
import sys
import json
from typing import Iterator, Dict, Any, Optional, TextIO, Callable
from typing import Iterator, Dict, Any, Optional, TextIO
from pathlib import Path
@@ -150,36 +150,3 @@ def write_records(records: Iterator[Dict[str, Any]], stream: Optional[TextIO] =
count += 1
return count
def filter_by_type(records: Iterator[Dict[str, Any]], record_type: str) -> Iterator[Dict[str, Any]]:
"""
Filter records by type.
"""
for record in records:
if record.get('type') == record_type:
yield record
def process_records(
records: Iterator[Dict[str, Any]],
handlers: Dict[str, Callable[[Dict[str, Any]], Optional[Dict[str, Any]]]]
) -> Iterator[Dict[str, Any]]:
"""
Process records through type-specific handlers.
Args:
records: Input record iterator
handlers: Dict mapping type names to handler functions
Handlers return output records or None to skip
Yields output records from handlers.
"""
for record in records:
record_type = record.get('type')
handler = handlers.get(record_type)
if handler:
result = handler(record)
if result:
yield result

View File

@@ -3,7 +3,12 @@
Install hook for Chrome/Chromium and puppeteer-core.
Runs at crawl start to install/find Chromium and puppeteer-core.
Outputs JSONL for Binary and Machine config updates.
Also validates config and computes derived values.
Outputs:
- JSONL for Binary and Machine config updates
- COMPUTED:KEY=VALUE lines that hooks.py parses and adds to env
Respects CHROME_BINARY env var for custom binary paths.
Uses `npx @puppeteer/browsers install chromium@latest` and parses output.
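# Illustrative sketch (not part of this hook): the stdout line protocol described above
# mixes JSON records with COMPUTED:KEY=VALUE lines; the real parser lives in hooks.py and
# may differ, but a minimal consumer could separate the two like this:
def _example_parse_hook_stdout(lines):
    import json
    records, computed = [], {}
    for line in lines:
        line = line.strip()
        if line.startswith('COMPUTED:'):
            key, _, value = line[len('COMPUTED:'):].partition('=')
            computed[key] = value
        elif line.startswith('{'):
            records.append(json.loads(line))   # e.g. Binary or Machine config updates
    return records, computed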
@@ -19,6 +24,28 @@ import subprocess
from pathlib import Path
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def get_env_bool(name: str, default: bool = False) -> bool:
val = get_env(name, '').lower()
if val in ('true', '1', 'yes', 'on'):
return True
if val in ('false', '0', 'no', 'off'):
return False
return default
def detect_docker() -> bool:
"""Detect if running inside Docker container."""
return (
os.path.exists('/.dockerenv') or
os.environ.get('IN_DOCKER', '').lower() in ('true', '1', 'yes') or
os.path.exists('/run/.containerenv')
)
def get_chrome_version(binary_path: str) -> str | None:
"""Get Chrome/Chromium version string."""
try:
@@ -131,13 +158,41 @@ def install_chromium() -> dict | None:
def main():
warnings = []
errors = []
computed = {}
# Install puppeteer-core if NODE_MODULES_DIR is set
install_puppeteer_core()
# Check if Chrome is enabled
chrome_enabled = get_env_bool('CHROME_ENABLED', True)
# Detect Docker and adjust sandbox
in_docker = detect_docker()
computed['IN_DOCKER'] = str(in_docker).lower()
chrome_sandbox = get_env_bool('CHROME_SANDBOX', True)
if in_docker and chrome_sandbox:
warnings.append(
"Running in Docker with CHROME_SANDBOX=true. "
"Chrome may fail to start. Consider setting CHROME_SANDBOX=false."
)
# Auto-disable sandbox in Docker unless explicitly set
if not get_env('CHROME_SANDBOX'):
computed['CHROME_SANDBOX'] = 'false'
# Check Node.js availability
node_binary = get_env('NODE_BINARY', 'node')
computed['NODE_BINARY'] = node_binary
# Check if CHROME_BINARY is already set and valid
configured_binary = os.environ.get('CHROME_BINARY', '').strip()
configured_binary = get_env('CHROME_BINARY', '')
if configured_binary and os.path.isfile(configured_binary) and os.access(configured_binary, os.X_OK):
version = get_chrome_version(configured_binary)
computed['CHROME_BINARY'] = configured_binary
computed['CHROME_VERSION'] = version or 'unknown'
print(json.dumps({
'type': 'Binary',
'name': 'chromium',
@@ -145,12 +200,22 @@ def main():
'version': version,
'binprovider': 'env',
}))
# Output computed values
for key, value in computed.items():
print(f"COMPUTED:{key}={value}")
for warning in warnings:
print(f"WARNING:{warning}", file=sys.stderr)
sys.exit(0)
# Install/find Chromium via puppeteer
result = install_chromium()
if result and result.get('abspath'):
computed['CHROME_BINARY'] = result['abspath']
computed['CHROME_VERSION'] = result['version'] or 'unknown'
print(json.dumps({
'type': 'Binary',
'name': result['name'],
@@ -174,9 +239,25 @@ def main():
'value': result['version'],
}))
# Output computed values
for key, value in computed.items():
print(f"COMPUTED:{key}={value}")
for warning in warnings:
print(f"WARNING:{warning}", file=sys.stderr)
sys.exit(0)
else:
print("Chromium binary not found", file=sys.stderr)
errors.append("Chromium binary not found")
computed['CHROME_BINARY'] = ''
# Output computed values and errors
for key, value in computed.items():
print(f"COMPUTED:{key}={value}")
for warning in warnings:
print(f"WARNING:{warning}", file=sys.stderr)
for error in errors:
print(f"ERROR:{error}", file=sys.stderr)
sys.exit(1)

View File

@@ -1,172 +0,0 @@
#!/usr/bin/env python3
"""
Validate and compute derived Chrome config values.
This hook runs early in the Crawl lifecycle to:
1. Auto-detect Chrome binary location
2. Compute sandbox settings based on Docker detection
3. Validate binary availability and version
4. Set computed env vars for subsequent hooks
Output:
- COMPUTED:KEY=VALUE lines that hooks.py parses and adds to env
- Binary JSONL records to stdout when binaries are found
"""
import json
import os
import sys
from abx_pkg import Binary, EnvProvider
# Chrome binary search order
CHROME_BINARY_NAMES = [
'chromium',
'chromium-browser',
'google-chrome',
'google-chrome-stable',
'chrome',
]
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def get_env_bool(name: str, default: bool = False) -> bool:
val = get_env(name, '').lower()
if val in ('true', '1', 'yes', 'on'):
return True
if val in ('false', '0', 'no', 'off'):
return False
return default
def detect_docker() -> bool:
"""Detect if running inside Docker container."""
return (
os.path.exists('/.dockerenv') or
os.environ.get('IN_DOCKER', '').lower() in ('true', '1', 'yes') or
os.path.exists('/run/.containerenv')
)
def find_chrome_binary(configured: str, provider: EnvProvider) -> Binary | None:
"""Find Chrome binary using abx-pkg, checking configured path first."""
# Try configured binary first
if configured:
try:
binary = Binary(name=configured, binproviders=[provider]).load()
if binary.abspath:
return binary
except Exception:
pass
# Search common names
for name in CHROME_BINARY_NAMES:
try:
binary = Binary(name=name, binproviders=[provider]).load()
if binary.abspath:
return binary
except Exception:
continue
return None
def output_binary(binary: Binary, name: str):
"""Output Binary JSONL record to stdout."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'abspath': str(binary.abspath),
'version': str(binary.version) if binary.version else '',
'sha256': binary.sha256 or '',
'binprovider': 'env',
'machine_id': machine_id,
}
print(json.dumps(record))
def main():
warnings = []
errors = []
computed = {}
# Get config values
chrome_binary = get_env('CHROME_BINARY', 'chromium')
chrome_sandbox = get_env_bool('CHROME_SANDBOX', True)
screenshot_enabled = get_env_bool('SCREENSHOT_ENABLED', True)
pdf_enabled = get_env_bool('PDF_ENABLED', True)
dom_enabled = get_env_bool('DOM_ENABLED', True)
# Compute USE_CHROME (derived from extractor enabled flags)
use_chrome = screenshot_enabled or pdf_enabled or dom_enabled
computed['USE_CHROME'] = str(use_chrome).lower()
# Detect Docker and adjust sandbox
in_docker = detect_docker()
computed['IN_DOCKER'] = str(in_docker).lower()
if in_docker and chrome_sandbox:
warnings.append(
"Running in Docker with CHROME_SANDBOX=true. "
"Chrome may fail to start. Consider setting CHROME_SANDBOX=false."
)
# Auto-disable sandbox in Docker unless explicitly set
if not get_env('CHROME_SANDBOX'):
computed['CHROME_SANDBOX'] = 'false'
# Find Chrome binary using abx-pkg
provider = EnvProvider()
if use_chrome:
chrome = find_chrome_binary(chrome_binary, provider)
if not chrome or not chrome.abspath:
errors.append(
f"Chrome binary not found (tried: {chrome_binary}). "
"Install Chrome/Chromium or set CHROME_BINARY path."
)
computed['CHROME_BINARY'] = ''
else:
computed['CHROME_BINARY'] = str(chrome.abspath)
computed['CHROME_VERSION'] = str(chrome.version) if chrome.version else 'unknown'
# Output Binary JSONL record for Chrome
output_binary(chrome, name='chrome')
# Check Node.js for Puppeteer
node_binary_name = get_env('NODE_BINARY', 'node')
try:
node = Binary(name=node_binary_name, binproviders=[provider]).load()
node_path = str(node.abspath) if node.abspath else ''
except Exception:
node = None
node_path = ''
if use_chrome and not node_path:
errors.append(
f"Node.js not found (tried: {node_binary_name}). "
"Install Node.js or set NODE_BINARY path for Puppeteer."
)
else:
computed['NODE_BINARY'] = node_path
if node and node.abspath:
# Output Binary JSONL record for Node
output_binary(node, name='node')
# Output computed values
for key, value in computed.items():
print(f"COMPUTED:{key}={value}")
for warning in warnings:
print(f"WARNING:{warning}", file=sys.stderr)
for error in errors:
print(f"ERROR:{error}", file=sys.stderr)
sys.exit(1 if errors else 0)
if __name__ == '__main__':
main()

View File

@@ -9,7 +9,7 @@
* --load-extension and --disable-extensions-except flags.
*
* Usage: on_Crawl__20_chrome_launch.bg.js --crawl-id=<uuid> --source-url=<url>
* Output: Creates chrome/ directory under crawl output dir with:
* Output: Writes to current directory (executor creates chrome/ dir):
* - cdp_url.txt: WebSocket URL for CDP connection
* - chrome.pid: Chromium process ID (for cleanup)
* - port.txt: Debug port number
@@ -42,7 +42,7 @@ const {
// Extractor metadata
const PLUGIN_NAME = 'chrome_launch';
const OUTPUT_DIR = 'chrome';
const OUTPUT_DIR = '.';
// Global state for cleanup
let chromePid = null;

View File

@@ -1,268 +0,0 @@
#!/usr/bin/env node
/**
* SingleFile Extension Plugin
*
* Installs and uses the SingleFile Chrome extension for archiving complete web pages.
* Falls back to single-file-cli if the extension is not available.
*
* Extension: https://chromewebstore.google.com/detail/mpiodijhokgodhhofbcjdecpffjipkle
*
* Priority: 04 (early) - Must install before Chrome session starts at Crawl level
* Hook: on_Crawl (runs once per crawl, not per snapshot)
*
* This extension automatically:
* - Saves complete web pages as single HTML files
* - Inlines all resources (CSS, JS, images, fonts)
* - Preserves page fidelity better than wget/curl
* - Works with SPAs and dynamically loaded content
*/
const path = require('path');
const fs = require('fs');
const { promisify } = require('util');
const { exec } = require('child_process');
const execAsync = promisify(exec);
// Import extension utilities
const extensionUtils = require('../chrome/chrome_utils.js');
// Extension metadata
const EXTENSION = {
webstore_id: 'mpiodijhokgodhhofbcjdecpffjipkle',
name: 'singlefile',
};
// Get extensions directory from environment or use default
const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
const CHROME_DOWNLOADS_DIR = process.env.CHROME_DOWNLOADS_DIR ||
path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_downloads');
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'singlefile.html';
/**
* Install the SingleFile extension
*/
async function installSinglefileExtension() {
console.log('[*] Installing SingleFile extension...');
// Install the extension
const extension = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);
if (!extension) {
console.error('[❌] Failed to install SingleFile extension');
return null;
}
console.log('[+] SingleFile extension installed');
console.log('[+] Web pages will be saved as single HTML files');
return extension;
}
/**
* Wait for a specified amount of time
*/
function wait(ms) {
return new Promise(resolve => setTimeout(resolve, ms));
}
/**
* Save a page using the SingleFile extension
*
* @param {Object} page - Puppeteer page object
* @param {Object} extension - Extension metadata with dispatchAction method
* @param {Object} options - Additional options
* @returns {Promise<string|null>} - Path to saved file or null on failure
*/
async function saveSinglefileWithExtension(page, extension, options = {}) {
if (!extension || !extension.version) {
throw new Error('SingleFile extension not found or not loaded');
}
const url = await page.url();
// Check for unsupported URL schemes
const URL_SCHEMES_IGNORED = ['about', 'chrome', 'chrome-extension', 'data', 'javascript', 'blob'];
const scheme = url.split(':')[0];
if (URL_SCHEMES_IGNORED.includes(scheme)) {
console.log(`[⚠️] Skipping SingleFile for URL scheme: ${scheme}`);
return null;
}
// Ensure downloads directory exists
await fs.promises.mkdir(CHROME_DOWNLOADS_DIR, { recursive: true });
// Get list of existing files to ignore
const files_before = new Set(
(await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
.filter(fn => fn.endsWith('.html'))
);
// Output directory is current directory (hook already runs in output dir)
const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);
console.log(`[🛠️] Saving SingleFile HTML using extension (${extension.id})...`);
// Bring page to front (extension action button acts on foreground tab)
await page.bringToFront();
// Trigger the extension's action (toolbar button click)
await extension.dispatchAction();
// Wait for file to appear in downloads directory
const check_delay = 3000; // 3 seconds
const max_tries = 10;
let files_new = [];
for (let attempt = 0; attempt < max_tries; attempt++) {
await wait(check_delay);
const files_after = (await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
.filter(fn => fn.endsWith('.html'));
files_new = files_after.filter(file => !files_before.has(file));
if (files_new.length === 0) {
continue;
}
// Find the matching file by checking if it contains the URL in the HTML header
for (const file of files_new) {
const dl_path = path.join(CHROME_DOWNLOADS_DIR, file);
const dl_text = await fs.promises.readFile(dl_path, 'utf-8');
const dl_header = dl_text.split('meta charset')[0];
if (dl_header.includes(`url: ${url}`)) {
console.log(`[✍️] Moving SingleFile download from ${file} to ${out_path}`);
await fs.promises.rename(dl_path, out_path);
return out_path;
}
}
}
console.warn(`[❌] Couldn't find matching SingleFile HTML in ${CHROME_DOWNLOADS_DIR} after waiting ${(check_delay * max_tries) / 1000}s`);
console.warn(`[⚠️] New files found: ${files_new.join(', ')}`);
return null;
}
/**
* Save a page using single-file-cli (fallback method)
*
* @param {string} url - URL to archive
* @param {Object} options - Additional options
* @returns {Promise<string|null>} - Path to saved file or null on failure
*/
async function saveSinglefileWithCLI(url, options = {}) {
console.log('[*] Falling back to single-file-cli...');
// Find single-file binary
let binary = null;
try {
const { stdout } = await execAsync('which single-file');
binary = stdout.trim();
} catch (err) {
console.error('[❌] single-file-cli not found. Install with: npm install -g single-file-cli');
return null;
}
// Output directory is current directory (hook already runs in output dir)
const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);
// Build command
const cmd = [
binary,
'--browser-headless',
url,
out_path,
];
// Add optional args
if (options.userAgent) {
cmd.splice(2, 0, '--browser-user-agent', options.userAgent);
}
if (options.cookiesFile && fs.existsSync(options.cookiesFile)) {
cmd.splice(2, 0, '--browser-cookies-file', options.cookiesFile);
}
if (options.ignoreSSL) {
cmd.splice(2, 0, '--browser-ignore-insecure-certs');
}
// Execute
try {
const timeout = options.timeout || 120000;
await execAsync(cmd.join(' '), { timeout });
if (fs.existsSync(out_path) && fs.statSync(out_path).size > 0) {
console.log(`[+] SingleFile saved via CLI: ${out_path}`);
return out_path;
}
console.error('[❌] SingleFile CLI completed but no output file found');
return null;
} catch (err) {
console.error(`[❌] SingleFile CLI error: ${err.message}`);
return null;
}
}
/**
* Main entry point - install extension before archiving
*/
async function main() {
// Check if extension is already cached
const cacheFile = path.join(EXTENSIONS_DIR, 'singlefile.extension.json');
if (fs.existsSync(cacheFile)) {
try {
const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
const manifestPath = path.join(cached.unpacked_path, 'manifest.json');
if (fs.existsSync(manifestPath)) {
console.log('[*] SingleFile extension already installed (using cache)');
return cached;
}
} catch (e) {
// Cache file corrupted, re-install
console.warn('[⚠️] Extension cache corrupted, re-installing...');
}
}
// Install extension
const extension = await installSinglefileExtension();
// Export extension metadata for chrome plugin to load
if (extension) {
// Write extension info to a cache file that chrome plugin can read
await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
await fs.promises.writeFile(
cacheFile,
JSON.stringify(extension, null, 2)
);
console.log(`[+] Extension metadata written to ${cacheFile}`);
}
return extension;
}
// Export functions for use by other plugins
module.exports = {
EXTENSION,
installSinglefileExtension,
saveSinglefileWithExtension,
saveSinglefileWithCLI,
};
// Run if executed directly
if (require.main === module) {
main().then(() => {
console.log('[✓] SingleFile extension setup complete');
process.exit(0);
}).catch(err => {
console.error('[❌] SingleFile extension setup failed:', err);
process.exit(1);
});
}

View File

@@ -0,0 +1,281 @@
#!/usr/bin/env node
/**
* SingleFile Extension Plugin
*
* DISABLED: Extension functionality commented out - using single-file-cli only
*
* Archives complete web pages as single HTML files using single-file-cli.
* The Chrome-extension based path is retained below (commented out) for possible future use.
*
* Extension: https://chromewebstore.google.com/detail/mpiodijhokgodhhofbcjdecpffjipkle
*
* Priority: 04 (early) - Must install before Chrome session starts at Crawl level
* Hook: on_Crawl (runs once per crawl, not per snapshot)
*
* This extension automatically:
* - Saves complete web pages as single HTML files
* - Inlines all resources (CSS, JS, images, fonts)
* - Preserves page fidelity better than wget/curl
* - Works with SPAs and dynamically loaded content
*/
const path = require('path');
const fs = require('fs');
const { promisify } = require('util');
const { exec } = require('child_process');
const execAsync = promisify(exec);
// DISABLED: Extension functionality - using single-file-cli only
// // Import extension utilities
// const extensionUtils = require('../chrome/chrome_utils.js');
// // Extension metadata
// const EXTENSION = {
// webstore_id: 'mpiodijhokgodhhofbcjdecpffjipkle',
// name: 'singlefile',
// };
// // Get extensions directory from environment or use default
// const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
// path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
// const CHROME_DOWNLOADS_DIR = process.env.CHROME_DOWNLOADS_DIR ||
// path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_downloads');
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'singlefile.html';
// DISABLED: Extension functionality - using single-file-cli only
// /**
// * Install the SingleFile extension
// */
// async function installSinglefileExtension() {
// console.log('[*] Installing SingleFile extension...');
// // Install the extension
// const extension = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);
// if (!extension) {
// console.error('[❌] Failed to install SingleFile extension');
// return null;
// }
// console.log('[+] SingleFile extension installed');
// console.log('[+] Web pages will be saved as single HTML files');
// return extension;
// }
// /**
// * Wait for a specified amount of time
// */
// function wait(ms) {
// return new Promise(resolve => setTimeout(resolve, ms));
// }
// /**
// * Save a page using the SingleFile extension
// *
// * @param {Object} page - Puppeteer page object
// * @param {Object} extension - Extension metadata with dispatchAction method
// * @param {Object} options - Additional options
// * @returns {Promise<string|null>} - Path to saved file or null on failure
// */
// async function saveSinglefileWithExtension(page, extension, options = {}) {
// if (!extension || !extension.version) {
// throw new Error('SingleFile extension not found or not loaded');
// }
// const url = await page.url();
// // Check for unsupported URL schemes
// const URL_SCHEMES_IGNORED = ['about', 'chrome', 'chrome-extension', 'data', 'javascript', 'blob'];
// const scheme = url.split(':')[0];
// if (URL_SCHEMES_IGNORED.includes(scheme)) {
// console.log(`[⚠️] Skipping SingleFile for URL scheme: ${scheme}`);
// return null;
// }
// // Ensure downloads directory exists
// await fs.promises.mkdir(CHROME_DOWNLOADS_DIR, { recursive: true });
// // Get list of existing files to ignore
// const files_before = new Set(
// (await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
// .filter(fn => fn.endsWith('.html'))
// );
// // Output directory is current directory (hook already runs in output dir)
// const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);
// console.log(`[🛠️] Saving SingleFile HTML using extension (${extension.id})...`);
// // Bring page to front (extension action button acts on foreground tab)
// await page.bringToFront();
// // Trigger the extension's action (toolbar button click)
// await extension.dispatchAction();
// // Wait for file to appear in downloads directory
// const check_delay = 3000; // 3 seconds
// const max_tries = 10;
// let files_new = [];
// for (let attempt = 0; attempt < max_tries; attempt++) {
// await wait(check_delay);
// const files_after = (await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
// .filter(fn => fn.endsWith('.html'));
// files_new = files_after.filter(file => !files_before.has(file));
// if (files_new.length === 0) {
// continue;
// }
// // Find the matching file by checking if it contains the URL in the HTML header
// for (const file of files_new) {
// const dl_path = path.join(CHROME_DOWNLOADS_DIR, file);
// const dl_text = await fs.promises.readFile(dl_path, 'utf-8');
// const dl_header = dl_text.split('meta charset')[0];
// if (dl_header.includes(`url: ${url}`)) {
// console.log(`[✍️] Moving SingleFile download from ${file} to ${out_path}`);
// await fs.promises.rename(dl_path, out_path);
// return out_path;
// }
// }
// }
// console.warn(`[❌] Couldn't find matching SingleFile HTML in ${CHROME_DOWNLOADS_DIR} after waiting ${(check_delay * max_tries) / 1000}s`);
// console.warn(`[⚠️] New files found: ${files_new.join(', ')}`);
// return null;
// }
/**
* Save a page using single-file-cli (primary method; the extension-based path is disabled)
*
* @param {string} url - URL to archive
* @param {Object} options - Additional options
* @returns {Promise<string|null>} - Path to saved file or null on failure
*/
async function saveSinglefileWithCLI(url, options = {}) {
console.log('[*] Falling back to single-file-cli...');
// Find single-file binary
let binary = null;
try {
const { stdout } = await execAsync('which single-file');
binary = stdout.trim();
} catch (err) {
console.error('[❌] single-file-cli not found. Install with: npm install -g single-file-cli');
return null;
}
// Output directory is current directory (hook already runs in output dir)
const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);
// Build command
const cmd = [
binary,
'--browser-headless',
url,
out_path,
];
// Add optional args
if (options.userAgent) {
cmd.splice(2, 0, '--browser-user-agent', options.userAgent);
}
if (options.cookiesFile && fs.existsSync(options.cookiesFile)) {
cmd.splice(2, 0, '--browser-cookies-file', options.cookiesFile);
}
if (options.ignoreSSL) {
cmd.splice(2, 0, '--browser-ignore-insecure-certs');
}
// Execute
try {
const timeout = options.timeout || 120000;
await execAsync(cmd.join(' '), { timeout });
if (fs.existsSync(out_path) && fs.statSync(out_path).size > 0) {
console.log(`[+] SingleFile saved via CLI: ${out_path}`);
return out_path;
}
console.error('[❌] SingleFile CLI completed but no output file found');
return null;
} catch (err) {
console.error(`[❌] SingleFile CLI error: ${err.message}`);
return null;
}
}
// DISABLED: Extension functionality - using single-file-cli only
// /**
// * Main entry point - install extension before archiving
// */
// async function main() {
// // Check if extension is already cached
// const cacheFile = path.join(EXTENSIONS_DIR, 'singlefile.extension.json');
// if (fs.existsSync(cacheFile)) {
// try {
// const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
// const manifestPath = path.join(cached.unpacked_path, 'manifest.json');
// if (fs.existsSync(manifestPath)) {
// console.log('[*] SingleFile extension already installed (using cache)');
// return cached;
// }
// } catch (e) {
// // Cache file corrupted, re-install
// console.warn('[⚠️] Extension cache corrupted, re-installing...');
// }
// }
// // Install extension
// const extension = await installSinglefileExtension();
// // Export extension metadata for chrome plugin to load
// if (extension) {
// // Write extension info to a cache file that chrome plugin can read
// await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
// await fs.promises.writeFile(
// cacheFile,
// JSON.stringify(extension, null, 2)
// );
// console.log(`[+] Extension metadata written to ${cacheFile}`);
// }
// return extension;
// }
// Export functions for use by other plugins
module.exports = {
// DISABLED: Extension functionality - using single-file-cli only
// EXTENSION,
// installSinglefileExtension,
// saveSinglefileWithExtension,
saveSinglefileWithCLI,
};
// DISABLED: Extension functionality - using single-file-cli only
// // Run if executed directly
// if (require.main === module) {
// main().then(() => {
// console.log('[✓] SingleFile extension setup complete');
// process.exit(0);
// }).catch(err => {
// console.error('[❌] SingleFile extension setup failed:', err);
// process.exit(1);
// });
// }
// No-op when run directly (extension install disabled)
if (require.main === module) {
console.log('[*] SingleFile extension install disabled - using single-file-cli only');
process.exit(0);
}

View File

@@ -2,16 +2,15 @@
Integration tests for singlefile plugin
Tests verify:
1. Hook script exists and has correct metadata
2. Extension installation and caching works
3. Chrome/node dependencies available
4. Hook can be executed successfully
1. Hook scripts exist with correct naming
2. CLI-based singlefile extraction works
3. Dependencies available via abx-pkg
4. Output contains valid HTML
"""
import json
import os
import subprocess
import sys
import tempfile
from pathlib import Path
@@ -20,177 +19,63 @@ import pytest
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_singlefile.*'), None)
NPM_PROVIDER_HOOK = PLUGINS_ROOT / 'npm' / 'on_Binary__install_using_npm_provider.py'
SNAPSHOT_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_singlefile.py'), None)
TEST_URL = "https://example.com"
def test_install_script_exists():
"""Verify install script exists"""
assert INSTALL_SCRIPT.exists(), f"Install script not found: {INSTALL_SCRIPT}"
def test_snapshot_hook_exists():
"""Verify snapshot extraction hook exists"""
assert SNAPSHOT_HOOK is not None and SNAPSHOT_HOOK.exists(), f"Snapshot hook not found in {PLUGIN_DIR}"
def test_extension_metadata():
"""Test that SingleFile extension has correct metadata"""
with tempfile.TemporaryDirectory() as tmpdir:
env = os.environ.copy()
env["CHROME_EXTENSIONS_DIR"] = str(Path(tmpdir) / "chrome_extensions")
result = subprocess.run(
["node", "-e", f"const ext = require('{INSTALL_SCRIPT}'); console.log(JSON.stringify(ext.EXTENSION))"],
capture_output=True,
text=True,
env=env
)
assert result.returncode == 0, f"Failed to load extension metadata: {result.stderr}"
metadata = json.loads(result.stdout)
assert metadata["webstore_id"] == "mpiodijhokgodhhofbcjdecpffjipkle"
assert metadata["name"] == "singlefile"
def test_install_creates_cache():
"""Test that install creates extension cache"""
with tempfile.TemporaryDirectory() as tmpdir:
ext_dir = Path(tmpdir) / "chrome_extensions"
ext_dir.mkdir(parents=True)
env = os.environ.copy()
env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
result = subprocess.run(
["node", str(INSTALL_SCRIPT)],
capture_output=True,
text=True,
env=env,
timeout=60
)
# Check output mentions installation
assert "SingleFile" in result.stdout or "singlefile" in result.stdout
# Check cache file was created
cache_file = ext_dir / "singlefile.extension.json"
assert cache_file.exists(), "Cache file should be created"
# Verify cache content
cache_data = json.loads(cache_file.read_text())
assert cache_data["webstore_id"] == "mpiodijhokgodhhofbcjdecpffjipkle"
assert cache_data["name"] == "singlefile"
def test_install_twice_uses_cache():
"""Test that running install twice uses existing cache on second run"""
with tempfile.TemporaryDirectory() as tmpdir:
ext_dir = Path(tmpdir) / "chrome_extensions"
ext_dir.mkdir(parents=True)
env = os.environ.copy()
env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
# First install - downloads the extension
result1 = subprocess.run(
["node", str(INSTALL_SCRIPT)],
capture_output=True,
text=True,
env=env,
timeout=60
)
assert result1.returncode == 0, f"First install failed: {result1.stderr}"
# Verify cache was created
cache_file = ext_dir / "singlefile.extension.json"
assert cache_file.exists(), "Cache file should exist after first install"
# Second install - should use cache
result2 = subprocess.run(
["node", str(INSTALL_SCRIPT)],
capture_output=True,
text=True,
env=env,
timeout=30
)
assert result2.returncode == 0, f"Second install failed: {result2.stderr}"
# Second run should be faster (uses cache) and mention cache
assert "already installed" in result2.stdout or "cache" in result2.stdout.lower() or result2.returncode == 0
def test_no_configuration_required():
"""Test that SingleFile works without configuration"""
with tempfile.TemporaryDirectory() as tmpdir:
ext_dir = Path(tmpdir) / "chrome_extensions"
ext_dir.mkdir(parents=True)
env = os.environ.copy()
env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
# No API keys needed
result = subprocess.run(
["node", str(INSTALL_SCRIPT)],
capture_output=True,
text=True,
env=env,
timeout=60
)
# Should work without API keys
assert result.returncode == 0
def test_priority_order():
"""Test that singlefile has correct priority (04)"""
# Extract priority from filename
filename = INSTALL_SCRIPT.name
assert "04" in filename, "SingleFile should have priority 04"
assert filename.startswith("on_Crawl__04_"), "Should follow priority naming convention for Crawl hooks"
def test_output_directory_structure():
"""Test that plugin defines correct output structure"""
# Verify the script mentions singlefile output directory
script_content = INSTALL_SCRIPT.read_text()
# Should mention singlefile output directory
assert "singlefile" in script_content.lower()
# Should mention HTML output
assert ".html" in script_content or "html" in script_content.lower()
def test_snapshot_hook_priority():
"""Test that snapshot hook has correct priority (50)"""
filename = SNAPSHOT_HOOK.name
assert "50" in filename, "SingleFile snapshot hook should have priority 50"
assert filename.startswith("on_Snapshot__50_"), "Should follow priority naming convention"
def test_verify_deps_with_abx_pkg():
"""Verify dependencies are available via abx-pkg after hook installation."""
from abx_pkg import Binary, EnvProvider, BinProviderOverrides
"""Verify dependencies are available via abx-pkg."""
from abx_pkg import Binary, EnvProvider
EnvProvider.model_rebuild()
# Verify node is available (singlefile uses Chrome extension, needs Node)
# Verify node is available
node_binary = Binary(name='node', binproviders=[EnvProvider()])
node_loaded = node_binary.load()
assert node_loaded and node_loaded.abspath, "Node.js required for singlefile plugin"
def test_singlefile_hook_runs():
"""Verify singlefile hook can be executed and completes."""
# Prerequisites checked by earlier test
def test_singlefile_cli_archives_example_com():
"""Test that singlefile CLI archives example.com and produces valid HTML."""
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
# Run singlefile extraction hook
env = os.environ.copy()
env['SINGLEFILE_ENABLED'] = 'true'
# Run singlefile snapshot hook
result = subprocess.run(
['node', str(INSTALL_SCRIPT), f'--url={TEST_URL}', '--snapshot-id=test789'],
['python', str(SNAPSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'],
cwd=tmpdir,
capture_output=True,
text=True,
env=env,
timeout=120
)
# Hook should complete successfully (even if it just installs extension)
assert result.returncode == 0, f"Hook execution failed: {result.stderr}"
# Verify extension installation happens
assert 'SingleFile extension' in result.stdout or result.returncode == 0, "Should install extension or complete"
# Verify output file exists
output_file = tmpdir / 'singlefile.html'
assert output_file.exists(), f"singlefile.html not created. stdout: {result.stdout}, stderr: {result.stderr}"
# Verify it contains real HTML
html_content = output_file.read_text()
assert len(html_content) > 500, "Output file too small to be valid HTML"
assert '<!DOCTYPE html>' in html_content or '<html' in html_content, "Output should contain HTML doctype or html tag"
assert 'Example Domain' in html_content, "Output should contain example.com content"
if __name__ == '__main__':

View File

@@ -25,7 +25,7 @@ const extensionUtils = require('../chrome/chrome_utils.js');
// Extension metadata
const EXTENSION = {
webstore_id: 'ifibfemgeogfhoebkmokieepdoobkbpo',
name: 'captcha2',
name: 'twocaptcha',
};
// Get extensions directory from environment or use default
@@ -69,7 +69,7 @@ async function installCaptchaExtension() {
*/
async function main() {
// Check if extension is already cached
const cacheFile = path.join(EXTENSIONS_DIR, 'captcha2.extension.json');
const cacheFile = path.join(EXTENSIONS_DIR, 'twocaptcha.extension.json');
if (fs.existsSync(cacheFile)) {
try {

View File

@@ -29,7 +29,7 @@ function getCrawlChromeSessionDir() {
}
const CHROME_SESSION_DIR = getCrawlChromeSessionDir() || '../chrome';
const CONFIG_MARKER = path.join(CHROME_SESSION_DIR, '.captcha2_configured');
const CONFIG_MARKER = path.join(CHROME_SESSION_DIR, '.twocaptcha_configured');
// Get environment variable with default
function getEnv(name, defaultValue = '') {
@@ -70,7 +70,7 @@ async function configure2Captcha() {
}
const extensions = JSON.parse(fs.readFileSync(extensionsFile, 'utf-8'));
const captchaExt = extensions.find(ext => ext.name === 'captcha2');
const captchaExt = extensions.find(ext => ext.name === 'twocaptcha');
if (!captchaExt) {
console.error('[*] 2captcha extension not installed, skipping configuration');
@@ -236,7 +236,7 @@ async function main() {
const snapshotId = args.snapshot_id;
if (!url || !snapshotId) {
console.error('Usage: on_Snapshot__21_captcha2_config.js --url=<url> --snapshot-id=<uuid>');
console.error('Usage: on_Snapshot__21_twocaptcha_config.js --url=<url> --snapshot-id=<uuid>');
process.exit(1);
}

View File

@@ -1,5 +1,5 @@
"""
Unit tests for captcha2 plugin
Unit tests for twocaptcha plugin
Tests invoke the plugin hooks as external processes and verify outputs/side effects.
"""
@@ -14,8 +14,8 @@ import pytest
PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_captcha2.*'), None)
CONFIG_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_captcha2_config.*'), None)
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_install_twocaptcha_extension.*'), None)
CONFIG_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_configure_twocaptcha_extension_options.*'), None)
def test_install_script_exists():
@@ -29,7 +29,7 @@ def test_config_script_exists():
def test_extension_metadata():
"""Test that captcha2 extension has correct metadata"""
"""Test that twocaptcha extension has correct metadata"""
with tempfile.TemporaryDirectory() as tmpdir:
env = os.environ.copy()
env["CHROME_EXTENSIONS_DIR"] = str(Path(tmpdir) / "chrome_extensions")
@@ -46,7 +46,7 @@ def test_extension_metadata():
metadata = json.loads(result.stdout)
assert metadata["webstore_id"] == "ifibfemgeogfhoebkmokieepdoobkbpo"
assert metadata["name"] == "captcha2"
assert metadata["name"] == "twocaptcha"
def test_install_creates_cache():
@@ -72,13 +72,13 @@ def test_install_creates_cache():
assert "[*] Installing 2captcha extension" in result.stdout or "[*] 2captcha extension already installed" in result.stdout
# Check cache file was created
cache_file = ext_dir / "captcha2.extension.json"
cache_file = ext_dir / "twocaptcha.extension.json"
assert cache_file.exists(), "Cache file should be created"
# Verify cache content
cache_data = json.loads(cache_file.read_text())
assert cache_data["webstore_id"] == "ifibfemgeogfhoebkmokieepdoobkbpo"
assert cache_data["name"] == "captcha2"
assert cache_data["name"] == "twocaptcha"
assert "unpacked_path" in cache_data
assert "version" in cache_data
@@ -104,7 +104,7 @@ def test_install_twice_uses_cache():
assert result1.returncode == 0, f"First install failed: {result1.stderr}"
# Verify cache was created
cache_file = ext_dir / "captcha2.extension.json"
cache_file = ext_dir / "twocaptcha.extension.json"
assert cache_file.exists(), "Cache file should exist after first install"
# Second install - should use cache
@@ -175,7 +175,7 @@ def test_config_script_structure():
script_content = CONFIG_SCRIPT.read_text()
# Should mention configuration marker file
assert "CONFIG_MARKER" in script_content or "captcha2_configured" in script_content
assert "CONFIG_MARKER" in script_content or "twocaptcha_configured" in script_content
# Should mention API key
assert "API_KEY_2CAPTCHA" in script_content