Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2026-01-04 09:55:33 +10:00)

Commit: new jsonl cli interface
@@ -27,36 +27,43 @@ class ArchiveBoxGroup(click.Group):
        'init': 'archivebox.cli.archivebox_init.main',
        'install': 'archivebox.cli.archivebox_install.main',
    }

    # Model commands (CRUD operations via subcommands)
    model_commands = {
        'crawl': 'archivebox.cli.archivebox_crawl.main',
        'snapshot': 'archivebox.cli.archivebox_snapshot.main',
        'archiveresult': 'archivebox.cli.archivebox_archiveresult.main',
        'tag': 'archivebox.cli.archivebox_tag.main',
        'binary': 'archivebox.cli.archivebox_binary.main',
        'process': 'archivebox.cli.archivebox_process.main',
        'machine': 'archivebox.cli.archivebox_machine.main',
    }

    archive_commands = {
        # High-level commands
        'add': 'archivebox.cli.archivebox_add.main',
        'remove': 'archivebox.cli.archivebox_remove.main',
        'run': 'archivebox.cli.archivebox_run.main',
        'update': 'archivebox.cli.archivebox_update.main',
        'search': 'archivebox.cli.archivebox_search.main',
        'status': 'archivebox.cli.archivebox_status.main',
        'config': 'archivebox.cli.archivebox_config.main',
        'schedule': 'archivebox.cli.archivebox_schedule.main',
        'server': 'archivebox.cli.archivebox_server.main',
        'shell': 'archivebox.cli.archivebox_shell.main',
        'manage': 'archivebox.cli.archivebox_manage.main',
        # Worker/orchestrator commands
        'orchestrator': 'archivebox.cli.archivebox_orchestrator.main',
        # Worker command
        'worker': 'archivebox.cli.archivebox_worker.main',
        # Task commands (called by workers as subprocesses)
        'crawl': 'archivebox.cli.archivebox_crawl.main',
        'snapshot': 'archivebox.cli.archivebox_snapshot.main',
        'extract': 'archivebox.cli.archivebox_extract.main',
    }

    all_subcommands = {
        **meta_commands,
        **setup_commands,
        **model_commands,
        **archive_commands,
    }

    renamed_commands = {
        'setup': 'install',
        'list': 'search',
        'import': 'add',
        'archive': 'add',
        'export': 'search',
        # Old commands replaced by new model commands
        'orchestrator': 'run',
        'extract': 'archiveresult',
    }

    @classmethod

@@ -110,9 +117,9 @@ def cli(ctx, help=False):
     if help or ctx.invoked_subcommand is None:
         ctx.invoke(ctx.command.get_command(ctx, 'help'))

-    # if the subcommand is in the archive_commands dict and is not 'manage',
+    # if the subcommand is in archive_commands or model_commands,
     # then we need to set up the django environment and check that we're in a valid data folder
-    if subcommand in ArchiveBoxGroup.archive_commands:
+    if subcommand in ArchiveBoxGroup.archive_commands or subcommand in ArchiveBoxGroup.model_commands:
         # print('SETUP DJANGO AND CHECK DATA FOLDER')
         try:
             from archivebox.config.django import setup_django
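The registry above maps each subcommand name to a dotted path ending in `.main`, which lets the CLI defer importing command modules (and the Django setup they trigger) until a subcommand is actually dispatched. A minimal sketch of how such a dotted path can be resolved lazily; the helper name and loading mechanics here are assumptions for illustration, not necessarily what ArchiveBoxGroup itself does:

import importlib
from typing import Callable

def resolve_command(dotted_path: str) -> Callable:
    """Lazily import 'pkg.module.attr' and return the attr (assumed helper, not ArchiveBox API)."""
    module_path, _, attr_name = dotted_path.rpartition('.')
    module = importlib.import_module(module_path)   # e.g. imports archivebox.cli.archivebox_crawl
    return getattr(module, attr_name)               # e.g. returns its click entrypoint `main`

# Example: look up the handler for `archivebox crawl` without importing every command module:
# main_fn = resolve_command('archivebox.cli.archivebox_crawl.main')
# main_fn()  # would invoke the click group for the crawl command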
archivebox/cli/archivebox_archiveresult.py (new file, 365 lines)
@@ -0,0 +1,365 @@
#!/usr/bin/env python3
|
||||
|
||||
"""
|
||||
archivebox archiveresult <action> [args...] [--filters]
|
||||
|
||||
Manage ArchiveResult records (plugin extraction results).
|
||||
|
||||
Actions:
|
||||
create - Create ArchiveResults for Snapshots (queue extractions)
|
||||
list - List ArchiveResults as JSONL (with optional filters)
|
||||
update - Update ArchiveResults from stdin JSONL
|
||||
delete - Delete ArchiveResults from stdin JSONL
|
||||
|
||||
Examples:
|
||||
# Create ArchiveResults for snapshots (queue for extraction)
|
||||
archivebox snapshot list --status=queued | archivebox archiveresult create
|
||||
archivebox archiveresult create --plugin=screenshot --snapshot-id=<uuid>
|
||||
|
||||
# List with filters
|
||||
archivebox archiveresult list --status=failed
|
||||
archivebox archiveresult list --plugin=screenshot --status=succeeded
|
||||
|
||||
# Update (reset failed extractions to queued)
|
||||
archivebox archiveresult list --status=failed | archivebox archiveresult update --status=queued
|
||||
|
||||
# Delete
|
||||
archivebox archiveresult list --plugin=singlefile | archivebox archiveresult delete --yes
|
||||
|
||||
# Re-run failed extractions
|
||||
archivebox archiveresult list --status=failed | archivebox run
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox archiveresult'
|
||||
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
import rich_click as click
|
||||
from rich import print as rprint
|
||||
|
||||
|
||||
def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
|
||||
"""Apply Django-style filters from CLI kwargs to a QuerySet."""
|
||||
filters = {}
|
||||
for key, value in filter_kwargs.items():
|
||||
if value is not None and key not in ('limit', 'offset'):
|
||||
filters[key] = value
|
||||
|
||||
if filters:
|
||||
queryset = queryset.filter(**filters)
|
||||
|
||||
if limit:
|
||||
queryset = queryset[:limit]
|
||||
|
||||
return queryset
|
||||
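Because `apply_filters` passes the collected kwargs straight through to `QuerySet.filter(**filters)`, any Django lookup expression works as a CLI flag, including double-underscore lookups such as `--urls__icontains`. A small illustration of the mapping, with the queryset calls left commented out since they need a configured Django environment:

# Hypothetical example: flags parsed by click arrive as a plain dict of kwargs.
cli_kwargs = {
    'status': 'failed',        # exact-match filter
    'plugin': None,            # omitted flags stay None and are skipped
    'snapshot_id': None,
    'limit': 50,               # 'limit'/'offset' are excluded from the ORM filter kwargs
}

# queryset = ArchiveResult.objects.all()
# queryset = apply_filters(queryset, cli_kwargs, limit=cli_kwargs['limit'])
# ...is equivalent to:
# queryset = ArchiveResult.objects.filter(status='failed')[:50]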
|
||||
|
||||
# =============================================================================
|
||||
# CREATE
|
||||
# =============================================================================
|
||||
|
||||
def create_archiveresults(
|
||||
snapshot_id: Optional[str] = None,
|
||||
plugin: Optional[str] = None,
|
||||
status: str = 'queued',
|
||||
) -> int:
|
||||
"""
|
||||
Create ArchiveResults for Snapshots.
|
||||
|
||||
Reads Snapshot records from stdin and creates ArchiveResult entries.
|
||||
If --plugin is specified, only creates results for that plugin.
|
||||
Otherwise, creates results for all pending plugins.
|
||||
|
||||
Exit codes:
|
||||
0: Success
|
||||
1: Failure
|
||||
"""
|
||||
from django.utils import timezone
|
||||
|
||||
from archivebox.misc.jsonl import read_stdin, write_record, TYPE_SNAPSHOT
|
||||
from archivebox.core.models import Snapshot, ArchiveResult
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
# If snapshot_id provided directly, use that
|
||||
if snapshot_id:
|
||||
try:
|
||||
snapshots = [Snapshot.objects.get(id=snapshot_id)]
|
||||
except Snapshot.DoesNotExist:
|
||||
rprint(f'[red]Snapshot not found: {snapshot_id}[/red]', file=sys.stderr)
|
||||
return 1
|
||||
else:
|
||||
# Read from stdin
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No Snapshot records provided via stdin[/yellow]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Filter to only Snapshot records
|
||||
snapshot_ids = []
|
||||
for record in records:
|
||||
if record.get('type') == TYPE_SNAPSHOT:
|
||||
if record.get('id'):
|
||||
snapshot_ids.append(record['id'])
|
||||
elif record.get('id'):
|
||||
# Assume it's a snapshot ID if no type specified
|
||||
snapshot_ids.append(record['id'])
|
||||
|
||||
if not snapshot_ids:
|
||||
rprint('[yellow]No valid Snapshot IDs in input[/yellow]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
snapshots = list(Snapshot.objects.filter(id__in=snapshot_ids))
|
||||
|
||||
if not snapshots:
|
||||
rprint('[yellow]No matching snapshots found[/yellow]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
created_count = 0
|
||||
for snapshot in snapshots:
|
||||
if plugin:
|
||||
# Create for specific plugin only
|
||||
result, created = ArchiveResult.objects.get_or_create(
|
||||
snapshot=snapshot,
|
||||
plugin=plugin,
|
||||
defaults={
|
||||
'status': status,
|
||||
'retry_at': timezone.now(),
|
||||
}
|
||||
)
|
||||
if not created and result.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]:
|
||||
# Reset for retry
|
||||
result.status = status
|
||||
result.retry_at = timezone.now()
|
||||
result.save()
|
||||
|
||||
if not is_tty:
|
||||
write_record(result.to_json())
|
||||
created_count += 1
|
||||
else:
|
||||
# Create all pending plugins
|
||||
snapshot.create_pending_archiveresults()
|
||||
for result in snapshot.archiveresult_set.filter(status=ArchiveResult.StatusChoices.QUEUED):
|
||||
if not is_tty:
|
||||
write_record(result.to_json())
|
||||
created_count += 1
|
||||
|
||||
rprint(f'[green]Created/queued {created_count} archive results[/green]', file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# LIST
|
||||
# =============================================================================
|
||||
|
||||
def list_archiveresults(
|
||||
status: Optional[str] = None,
|
||||
plugin: Optional[str] = None,
|
||||
snapshot_id: Optional[str] = None,
|
||||
limit: Optional[int] = None,
|
||||
) -> int:
|
||||
"""
|
||||
List ArchiveResults as JSONL with optional filters.
|
||||
|
||||
Exit codes:
|
||||
0: Success (even if no results)
|
||||
"""
|
||||
from archivebox.misc.jsonl import write_record
|
||||
from archivebox.core.models import ArchiveResult
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
queryset = ArchiveResult.objects.all().order_by('-start_ts')
|
||||
|
||||
# Apply filters
|
||||
filter_kwargs = {
|
||||
'status': status,
|
||||
'plugin': plugin,
|
||||
'snapshot_id': snapshot_id,
|
||||
}
|
||||
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
|
||||
|
||||
count = 0
|
||||
for result in queryset:
|
||||
if is_tty:
|
||||
status_color = {
|
||||
'queued': 'yellow',
|
||||
'started': 'blue',
|
||||
'succeeded': 'green',
|
||||
'failed': 'red',
|
||||
'skipped': 'dim',
|
||||
'backoff': 'magenta',
|
||||
}.get(result.status, 'dim')
|
||||
rprint(f'[{status_color}]{result.status:10}[/{status_color}] {result.plugin:15} [dim]{result.id}[/dim] {result.snapshot.url[:40]}')
|
||||
else:
|
||||
write_record(result.to_json())
|
||||
count += 1
|
||||
|
||||
rprint(f'[dim]Listed {count} archive results[/dim]', file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# UPDATE
|
||||
# =============================================================================
|
||||
|
||||
def update_archiveresults(
|
||||
status: Optional[str] = None,
|
||||
) -> int:
|
||||
"""
|
||||
Update ArchiveResults from stdin JSONL.
|
||||
|
||||
Reads ArchiveResult records from stdin and applies updates.
|
||||
Uses PATCH semantics - only specified fields are updated.
|
||||
|
||||
Exit codes:
|
||||
0: Success
|
||||
1: No input or error
|
||||
"""
|
||||
from django.utils import timezone
|
||||
|
||||
from archivebox.misc.jsonl import read_stdin, write_record
|
||||
from archivebox.core.models import ArchiveResult
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
updated_count = 0
|
||||
for record in records:
|
||||
result_id = record.get('id')
|
||||
if not result_id:
|
||||
continue
|
||||
|
||||
try:
|
||||
result = ArchiveResult.objects.get(id=result_id)
|
||||
|
||||
# Apply updates from CLI flags
|
||||
if status:
|
||||
result.status = status
|
||||
result.retry_at = timezone.now()
|
||||
|
||||
result.save()
|
||||
updated_count += 1
|
||||
|
||||
if not is_tty:
|
||||
write_record(result.to_json())
|
||||
|
||||
except ArchiveResult.DoesNotExist:
|
||||
rprint(f'[yellow]ArchiveResult not found: {result_id}[/yellow]', file=sys.stderr)
|
||||
continue
|
||||
|
||||
rprint(f'[green]Updated {updated_count} archive results[/green]', file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# DELETE
|
||||
# =============================================================================
|
||||
|
||||
def delete_archiveresults(yes: bool = False, dry_run: bool = False) -> int:
|
||||
"""
|
||||
Delete ArchiveResults from stdin JSONL.
|
||||
|
||||
Requires --yes flag to confirm deletion.
|
||||
|
||||
Exit codes:
|
||||
0: Success
|
||||
1: No input or missing --yes flag
|
||||
"""
|
||||
from archivebox.misc.jsonl import read_stdin
|
||||
from archivebox.core.models import ArchiveResult
|
||||
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
result_ids = [r.get('id') for r in records if r.get('id')]
|
||||
|
||||
if not result_ids:
|
||||
rprint('[yellow]No valid archive result IDs in input[/yellow]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
results = ArchiveResult.objects.filter(id__in=result_ids)
|
||||
count = results.count()
|
||||
|
||||
if count == 0:
|
||||
rprint('[yellow]No matching archive results found[/yellow]', file=sys.stderr)
|
||||
return 0
|
||||
|
||||
if dry_run:
|
||||
rprint(f'[yellow]Would delete {count} archive results (dry run)[/yellow]', file=sys.stderr)
|
||||
for result in results[:10]:
|
||||
rprint(f' [dim]{result.id}[/dim] {result.plugin} {result.snapshot.url[:40]}', file=sys.stderr)
|
||||
if count > 10:
|
||||
rprint(f' ... and {count - 10} more', file=sys.stderr)
|
||||
return 0
|
||||
|
||||
if not yes:
|
||||
rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Perform deletion
|
||||
deleted_count, _ = results.delete()
|
||||
rprint(f'[green]Deleted {deleted_count} archive results[/green]', file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# CLI Commands
|
||||
# =============================================================================
|
||||
|
||||
@click.group()
|
||||
def main():
|
||||
"""Manage ArchiveResult records (plugin extraction results)."""
|
||||
pass
|
||||
|
||||
|
||||
@main.command('create')
|
||||
@click.option('--snapshot-id', help='Snapshot ID to create results for')
|
||||
@click.option('--plugin', '-p', help='Plugin name (e.g., screenshot, singlefile)')
|
||||
@click.option('--status', '-s', default='queued', help='Initial status (default: queued)')
|
||||
def create_cmd(snapshot_id: Optional[str], plugin: Optional[str], status: str):
|
||||
"""Create ArchiveResults for Snapshots from stdin JSONL."""
|
||||
sys.exit(create_archiveresults(snapshot_id=snapshot_id, plugin=plugin, status=status))
|
||||
|
||||
|
||||
@main.command('list')
|
||||
@click.option('--status', '-s', help='Filter by status (queued, started, succeeded, failed, skipped)')
|
||||
@click.option('--plugin', '-p', help='Filter by plugin name')
|
||||
@click.option('--snapshot-id', help='Filter by snapshot ID')
|
||||
@click.option('--limit', '-n', type=int, help='Limit number of results')
|
||||
def list_cmd(status: Optional[str], plugin: Optional[str],
|
||||
snapshot_id: Optional[str], limit: Optional[int]):
|
||||
"""List ArchiveResults as JSONL."""
|
||||
sys.exit(list_archiveresults(
|
||||
status=status,
|
||||
plugin=plugin,
|
||||
snapshot_id=snapshot_id,
|
||||
limit=limit,
|
||||
))
|
||||
|
||||
|
||||
@main.command('update')
|
||||
@click.option('--status', '-s', help='Set status')
|
||||
def update_cmd(status: Optional[str]):
|
||||
"""Update ArchiveResults from stdin JSONL."""
|
||||
sys.exit(update_archiveresults(status=status))
|
||||
|
||||
|
||||
@main.command('delete')
|
||||
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
|
||||
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
|
||||
def delete_cmd(yes: bool, dry_run: bool):
|
||||
"""Delete ArchiveResults from stdin JSONL."""
|
||||
sys.exit(delete_archiveresults(yes=yes, dry_run=dry_run))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
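All of the new model commands import `read_stdin`, `read_args_or_stdin`, and `write_record` from `archivebox.misc.jsonl`, which is not included in this diff. A rough sketch of what such helpers could look like, assuming one JSON object per line with an optional `type` field and bare URLs/IDs tolerated as plain strings; this is inferred from how the commands use them, not the actual module:

import json
import sys
from typing import Iterable, Iterator

TYPE_SNAPSHOT = 'Snapshot'            # assumed constant values for the 'type' field
TYPE_ARCHIVERESULT = 'ArchiveResult'

def read_stdin() -> Iterator[dict]:
    """Yield one record per non-empty stdin line; tolerate bare URLs/IDs (assumed behavior)."""
    if sys.stdin.isatty():
        return
    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue
        if line.startswith('{'):
            yield json.loads(line)                      # already a JSONL record
        elif line.startswith(('http://', 'https://')):
            yield {'url': line}                         # plain URL
        else:
            yield {'id': line}                          # plain UUID/ID

def read_args_or_stdin(args: Iterable[str]) -> Iterator[dict]:
    """Prefer positional args; fall back to stdin when no args were given."""
    args = list(args)
    if args:
        for arg in args:
            yield {'url': arg} if arg.startswith('http') else {'id': arg}
    else:
        yield from read_stdin()

def write_record(record: dict) -> None:
    """Emit a single JSONL record to stdout."""
    sys.stdout.write(json.dumps(record, default=str) + '\n')
    sys.stdout.flush()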
archivebox/cli/archivebox_binary.py (new file, 304 lines)
@@ -0,0 +1,304 @@
#!/usr/bin/env python3
|
||||
|
||||
"""
|
||||
archivebox binary <action> [args...] [--filters]
|
||||
|
||||
Manage Binary records (detected executables like chrome, wget, etc.).
|
||||
|
||||
Actions:
|
||||
create - Create/register a Binary
|
||||
list - List Binaries as JSONL (with optional filters)
|
||||
update - Update Binaries from stdin JSONL
|
||||
delete - Delete Binaries from stdin JSONL
|
||||
|
||||
Examples:
|
||||
# List all binaries
|
||||
archivebox binary list
|
||||
|
||||
# List specific binary
|
||||
archivebox binary list --name=chrome
|
||||
|
||||
# List binaries with specific version
|
||||
archivebox binary list --version__icontains=120
|
||||
|
||||
# Delete old binary entries
|
||||
archivebox binary list --name=chrome | archivebox binary delete --yes
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox binary'
|
||||
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
import rich_click as click
|
||||
from rich import print as rprint
|
||||
|
||||
|
||||
def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
|
||||
"""Apply Django-style filters from CLI kwargs to a QuerySet."""
|
||||
filters = {}
|
||||
for key, value in filter_kwargs.items():
|
||||
if value is not None and key not in ('limit', 'offset'):
|
||||
filters[key] = value
|
||||
|
||||
if filters:
|
||||
queryset = queryset.filter(**filters)
|
||||
|
||||
if limit:
|
||||
queryset = queryset[:limit]
|
||||
|
||||
return queryset
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# CREATE
|
||||
# =============================================================================
|
||||
|
||||
def create_binary(
|
||||
name: str,
|
||||
abspath: str,
|
||||
version: str = '',
|
||||
) -> int:
|
||||
"""
|
||||
Create/register a Binary.
|
||||
|
||||
Exit codes:
|
||||
0: Success
|
||||
1: Failure
|
||||
"""
|
||||
from archivebox.misc.jsonl import write_record
|
||||
from archivebox.machine.models import Binary
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
if not name or not abspath:
|
||||
rprint('[red]Both --name and --abspath are required[/red]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
try:
|
||||
binary, created = Binary.objects.get_or_create(
|
||||
name=name,
|
||||
abspath=abspath,
|
||||
defaults={'version': version}
|
||||
)
|
||||
|
||||
if not is_tty:
|
||||
write_record(binary.to_json())
|
||||
|
||||
if created:
|
||||
rprint(f'[green]Created binary: {name} at {abspath}[/green]', file=sys.stderr)
|
||||
else:
|
||||
rprint(f'[dim]Binary already exists: {name} at {abspath}[/dim]', file=sys.stderr)
|
||||
|
||||
return 0
|
||||
|
||||
except Exception as e:
|
||||
rprint(f'[red]Error creating binary: {e}[/red]', file=sys.stderr)
|
||||
return 1
|
||||
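`binary create` expects the caller to already know the executable's name, path, and version. A hedged sketch of how those values might be gathered before calling it; this is purely illustrative, and ArchiveBox's own binary detection lives elsewhere:

import shutil
import subprocess
from typing import Optional

def detect_binary(name: str) -> Optional[dict]:
    """Best-effort lookup of an executable's abspath and version string (illustrative only)."""
    abspath = shutil.which(name)
    if not abspath:
        return None
    try:
        # Most CLI tools print their version with --version; output format varies per tool.
        proc = subprocess.run([abspath, '--version'], capture_output=True, text=True, timeout=10)
        version = (proc.stdout or proc.stderr).strip().split('\n')[0]
    except Exception:
        version = ''
    return {'name': name, 'abspath': abspath, 'version': version}

# detect_binary('wget') might return:
# {'name': 'wget', 'abspath': '/usr/bin/wget', 'version': 'GNU Wget 1.21.4 ...'}
# which maps directly onto:
# archivebox binary create --name=wget --abspath=/usr/bin/wget --version="GNU Wget 1.21.4"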
|
||||
|
||||
# =============================================================================
|
||||
# LIST
|
||||
# =============================================================================
|
||||
|
||||
def list_binaries(
|
||||
name: Optional[str] = None,
|
||||
abspath__icontains: Optional[str] = None,
|
||||
version__icontains: Optional[str] = None,
|
||||
limit: Optional[int] = None,
|
||||
) -> int:
|
||||
"""
|
||||
List Binaries as JSONL with optional filters.
|
||||
|
||||
Exit codes:
|
||||
0: Success (even if no results)
|
||||
"""
|
||||
from archivebox.misc.jsonl import write_record
|
||||
from archivebox.machine.models import Binary
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
queryset = Binary.objects.all().order_by('name', '-loaded_at')
|
||||
|
||||
# Apply filters
|
||||
filter_kwargs = {
|
||||
'name': name,
|
||||
'abspath__icontains': abspath__icontains,
|
||||
'version__icontains': version__icontains,
|
||||
}
|
||||
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
|
||||
|
||||
count = 0
|
||||
for binary in queryset:
|
||||
if is_tty:
|
||||
rprint(f'[cyan]{binary.name:20}[/cyan] [dim]{binary.version:15}[/dim] {binary.abspath}')
|
||||
else:
|
||||
write_record(binary.to_json())
|
||||
count += 1
|
||||
|
||||
rprint(f'[dim]Listed {count} binaries[/dim]', file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# UPDATE
|
||||
# =============================================================================
|
||||
|
||||
def update_binaries(
|
||||
version: Optional[str] = None,
|
||||
abspath: Optional[str] = None,
|
||||
) -> int:
|
||||
"""
|
||||
Update Binaries from stdin JSONL.
|
||||
|
||||
Reads Binary records from stdin and applies updates.
|
||||
Uses PATCH semantics - only specified fields are updated.
|
||||
|
||||
Exit codes:
|
||||
0: Success
|
||||
1: No input or error
|
||||
"""
|
||||
from archivebox.misc.jsonl import read_stdin, write_record
|
||||
from archivebox.machine.models import Binary
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
updated_count = 0
|
||||
for record in records:
|
||||
binary_id = record.get('id')
|
||||
if not binary_id:
|
||||
continue
|
||||
|
||||
try:
|
||||
binary = Binary.objects.get(id=binary_id)
|
||||
|
||||
# Apply updates from CLI flags
|
||||
if version:
|
||||
binary.version = version
|
||||
if abspath:
|
||||
binary.abspath = abspath
|
||||
|
||||
binary.save()
|
||||
updated_count += 1
|
||||
|
||||
if not is_tty:
|
||||
write_record(binary.to_json())
|
||||
|
||||
except Binary.DoesNotExist:
|
||||
rprint(f'[yellow]Binary not found: {binary_id}[/yellow]', file=sys.stderr)
|
||||
continue
|
||||
|
||||
rprint(f'[green]Updated {updated_count} binaries[/green]', file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# DELETE
|
||||
# =============================================================================
|
||||
|
||||
def delete_binaries(yes: bool = False, dry_run: bool = False) -> int:
|
||||
"""
|
||||
Delete Binaries from stdin JSONL.
|
||||
|
||||
Requires --yes flag to confirm deletion.
|
||||
|
||||
Exit codes:
|
||||
0: Success
|
||||
1: No input or missing --yes flag
|
||||
"""
|
||||
from archivebox.misc.jsonl import read_stdin
|
||||
from archivebox.machine.models import Binary
|
||||
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
binary_ids = [r.get('id') for r in records if r.get('id')]
|
||||
|
||||
if not binary_ids:
|
||||
rprint('[yellow]No valid binary IDs in input[/yellow]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
binaries = Binary.objects.filter(id__in=binary_ids)
|
||||
count = binaries.count()
|
||||
|
||||
if count == 0:
|
||||
rprint('[yellow]No matching binaries found[/yellow]', file=sys.stderr)
|
||||
return 0
|
||||
|
||||
if dry_run:
|
||||
rprint(f'[yellow]Would delete {count} binaries (dry run)[/yellow]', file=sys.stderr)
|
||||
for binary in binaries:
|
||||
rprint(f' {binary.name} {binary.abspath}', file=sys.stderr)
|
||||
return 0
|
||||
|
||||
if not yes:
|
||||
rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Perform deletion
|
||||
deleted_count, _ = binaries.delete()
|
||||
rprint(f'[green]Deleted {deleted_count} binaries[/green]', file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# CLI Commands
|
||||
# =============================================================================
|
||||
|
||||
@click.group()
|
||||
def main():
|
||||
"""Manage Binary records (detected executables)."""
|
||||
pass
|
||||
|
||||
|
||||
@main.command('create')
|
||||
@click.option('--name', '-n', required=True, help='Binary name (e.g., chrome, wget)')
|
||||
@click.option('--abspath', '-p', required=True, help='Absolute path to binary')
|
||||
@click.option('--version', '-v', default='', help='Binary version')
|
||||
def create_cmd(name: str, abspath: str, version: str):
|
||||
"""Create/register a Binary."""
|
||||
sys.exit(create_binary(name=name, abspath=abspath, version=version))
|
||||
|
||||
|
||||
@main.command('list')
|
||||
@click.option('--name', '-n', help='Filter by name')
|
||||
@click.option('--abspath__icontains', help='Filter by path contains')
|
||||
@click.option('--version__icontains', help='Filter by version contains')
|
||||
@click.option('--limit', type=int, help='Limit number of results')
|
||||
def list_cmd(name: Optional[str], abspath__icontains: Optional[str],
|
||||
version__icontains: Optional[str], limit: Optional[int]):
|
||||
"""List Binaries as JSONL."""
|
||||
sys.exit(list_binaries(
|
||||
name=name,
|
||||
abspath__icontains=abspath__icontains,
|
||||
version__icontains=version__icontains,
|
||||
limit=limit,
|
||||
))
|
||||
|
||||
|
||||
@main.command('update')
|
||||
@click.option('--version', '-v', help='Set version')
|
||||
@click.option('--abspath', '-p', help='Set path')
|
||||
def update_cmd(version: Optional[str], abspath: Optional[str]):
|
||||
"""Update Binaries from stdin JSONL."""
|
||||
sys.exit(update_binaries(version=version, abspath=abspath))
|
||||
|
||||
|
||||
@main.command('delete')
|
||||
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
|
||||
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
|
||||
def delete_cmd(yes: bool, dry_run: bool):
|
||||
"""Delete Binaries from stdin JSONL."""
|
||||
sys.exit(delete_binaries(yes=yes, dry_run=dry_run))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -1,108 +1,134 @@
 #!/usr/bin/env python3

 """
-archivebox crawl [urls...] [--depth=N] [--tag=TAG]
+archivebox crawl <action> [args...] [--filters]

-Create Crawl jobs from URLs. Accepts URLs as arguments, from stdin, or via JSONL.
-Does NOT immediately start the crawl - pipe to `archivebox snapshot` to process.
+Manage Crawl records.

-Input formats:
-    - Plain URLs (one per line)
-    - JSONL: {"url": "...", "depth": 1, "tags": "..."}
-
-Output (JSONL):
-    {"type": "Crawl", "id": "...", "urls": "...", "status": "queued", ...}
+Actions:
+    create - Create Crawl jobs from URLs
+    list - List Crawls as JSONL (with optional filters)
+    update - Update Crawls from stdin JSONL
+    delete - Delete Crawls from stdin JSONL

 Examples:
-    # Create a crawl job
-    archivebox crawl https://example.com
+    # Create
+    archivebox crawl create https://example.com https://foo.com --depth=1
+    archivebox crawl create --tag=news https://example.com

-    # Create crawl with depth
-    archivebox crawl --depth=1 https://example.com
+    # List with filters
+    archivebox crawl list --status=queued
+    archivebox crawl list --urls__icontains=example.com

-    # Full pipeline: create crawl, create snapshots, run extractors
-    archivebox crawl https://example.com | archivebox snapshot | archivebox extract
+    # Update
+    archivebox crawl list --status=started | archivebox crawl update --status=queued

-    # Process existing Crawl by ID (runs the crawl state machine)
-    archivebox crawl 01234567-89ab-cdef-0123-456789abcdef
+    # Delete
+    archivebox crawl list --urls__icontains=spam.com | archivebox crawl delete --yes
+
+    # Full pipeline
+    archivebox crawl create https://example.com | archivebox snapshot create | archivebox run
 """

 __package__ = 'archivebox.cli'
 __command__ = 'archivebox crawl'
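The "Full pipeline" example above works because each stage writes one JSONL record per object and the next stage only reads the `type`/`id`/`url` fields it understands. A hedged illustration of the records flowing through `crawl create | snapshot create | run`; field names beyond those shown in the docstrings above are assumptions:

# Stage 1: `archivebox crawl create https://example.com` emits roughly:
#   {"type": "Crawl", "id": "...", "urls": "https://example.com", "status": "queued", ...}
# Stage 2: `archivebox snapshot create` reads that Crawl record and emits one record per URL:
#   {"type": "Snapshot", "id": "...", "url": "https://example.com", "status": "queued", ...}
# Stage 3: `archivebox run` reads Snapshot/ArchiveResult records and processes them.

import json

# A hypothetical record like the one stage 1 would print for stage 2 to consume:
crawl_record = {"type": "Crawl", "id": "HYPOTHETICAL-ID", "urls": "https://example.com", "status": "queued"}
print(json.dumps(crawl_record))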
|
||||
import sys
|
||||
from typing import Optional
|
||||
from typing import Optional, Iterable
|
||||
|
||||
import rich_click as click
|
||||
from rich import print as rprint
|
||||
|
||||
|
||||
def create_crawls(
|
||||
records: list,
|
||||
def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
|
||||
"""Apply Django-style filters from CLI kwargs to a QuerySet."""
|
||||
filters = {}
|
||||
for key, value in filter_kwargs.items():
|
||||
if value is not None and key not in ('limit', 'offset'):
|
||||
filters[key] = value
|
||||
|
||||
if filters:
|
||||
queryset = queryset.filter(**filters)
|
||||
|
||||
if limit:
|
||||
queryset = queryset[:limit]
|
||||
|
||||
return queryset
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# CREATE
|
||||
# =============================================================================
|
||||
|
||||
def create_crawl(
|
||||
urls: Iterable[str],
|
||||
depth: int = 0,
|
||||
tag: str = '',
|
||||
status: str = 'queued',
|
||||
created_by_id: Optional[int] = None,
|
||||
) -> int:
|
||||
"""
|
||||
Create a single Crawl job from all input URLs.
|
||||
Create a Crawl job from URLs.
|
||||
|
||||
Takes pre-read records, creates one Crawl with all URLs, outputs JSONL.
|
||||
Does NOT start the crawl - just creates the job in QUEUED state.
|
||||
Takes URLs as args or stdin, creates one Crawl with all URLs, outputs JSONL.
|
||||
|
||||
Exit codes:
|
||||
0: Success
|
||||
1: Failure
|
||||
"""
|
||||
from rich import print as rprint
|
||||
|
||||
from archivebox.misc.jsonl import write_record
|
||||
from archivebox.misc.jsonl import read_args_or_stdin, write_record
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
from archivebox.crawls.models import Crawl
|
||||
|
||||
created_by_id = created_by_id or get_or_create_system_user_pk()
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
# Collect all input records
|
||||
records = list(read_args_or_stdin(urls))
|
||||
|
||||
if not records:
|
||||
rprint('[yellow]No URLs provided. Pass URLs as arguments or via stdin.[/yellow]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Collect all URLs into a single newline-separated string
|
||||
urls = []
|
||||
url_list = []
|
||||
for record in records:
|
||||
url = record.get('url')
|
||||
if url:
|
||||
urls.append(url)
|
||||
url_list.append(url)
|
||||
|
||||
if not urls:
|
||||
if not url_list:
|
||||
rprint('[red]No valid URLs found[/red]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
try:
|
||||
# Build crawl record with all URLs as newline-separated string
|
||||
crawl_record = {
|
||||
'urls': '\n'.join(urls),
|
||||
'urls': '\n'.join(url_list),
|
||||
'max_depth': depth,
|
||||
'tags_str': tag,
|
||||
'status': status,
|
||||
'label': '',
|
||||
}
|
||||
|
||||
crawl = Crawl.from_jsonl(crawl_record, overrides={'created_by_id': created_by_id})
|
||||
crawl = Crawl.from_json(crawl_record, overrides={'created_by_id': created_by_id})
|
||||
if not crawl:
|
||||
rprint('[red]Failed to create crawl[/red]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Output JSONL record (only when piped)
|
||||
if not is_tty:
|
||||
write_record(crawl.to_jsonl())
|
||||
write_record(crawl.to_json())
|
||||
|
||||
rprint(f'[green]Created crawl with {len(urls)} URLs[/green]', file=sys.stderr)
|
||||
rprint(f'[green]Created crawl with {len(url_list)} URLs[/green]', file=sys.stderr)
|
||||
|
||||
# If TTY, show human-readable output
|
||||
if is_tty:
|
||||
rprint(f' [dim]{crawl.id}[/dim]', file=sys.stderr)
|
||||
for url in urls[:5]: # Show first 5 URLs
|
||||
for url in url_list[:5]: # Show first 5 URLs
|
||||
rprint(f' {url[:70]}', file=sys.stderr)
|
||||
if len(urls) > 5:
|
||||
rprint(f' ... and {len(urls) - 5} more', file=sys.stderr)
|
||||
if len(url_list) > 5:
|
||||
rprint(f' ... and {len(url_list) - 5} more', file=sys.stderr)
|
||||
|
||||
return 0
|
||||
|
||||
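Note that `create_crawl` stores all input URLs as a single newline-separated string on the Crawl record (`'urls': '\n'.join(url_list)`), so any consumer has to split that field back apart. A one-line round-trip illustration:

urls_field = '\n'.join(['https://example.com', 'https://foo.com'])   # what create_crawl stores
url_list = [u for u in urls_field.split('\n') if u.strip()]          # how a reader recovers the list
assert url_list == ['https://example.com', 'https://foo.com']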
@@ -111,81 +137,217 @@ def create_crawls(
|
||||
return 1
|
||||
|
||||
|
||||
def process_crawl_by_id(crawl_id: str) -> int:
|
||||
"""
|
||||
Process a single Crawl by ID (used by workers).
|
||||
# =============================================================================
|
||||
# LIST
|
||||
# =============================================================================
|
||||
|
||||
Triggers the Crawl's state machine tick() which will:
|
||||
- Transition from queued -> started (creates root snapshot)
|
||||
- Transition from started -> sealed (when all snapshots done)
|
||||
def list_crawls(
|
||||
status: Optional[str] = None,
|
||||
urls__icontains: Optional[str] = None,
|
||||
max_depth: Optional[int] = None,
|
||||
limit: Optional[int] = None,
|
||||
) -> int:
|
||||
"""
|
||||
from rich import print as rprint
|
||||
List Crawls as JSONL with optional filters.
|
||||
|
||||
Exit codes:
|
||||
0: Success (even if no results)
|
||||
"""
|
||||
from archivebox.misc.jsonl import write_record
|
||||
from archivebox.crawls.models import Crawl
|
||||
|
||||
try:
|
||||
crawl = Crawl.objects.get(id=crawl_id)
|
||||
except Crawl.DoesNotExist:
|
||||
rprint(f'[red]Crawl {crawl_id} not found[/red]', file=sys.stderr)
|
||||
return 1
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
rprint(f'[blue]Processing Crawl {crawl.id} (status={crawl.status})[/blue]', file=sys.stderr)
|
||||
queryset = Crawl.objects.all().order_by('-created_at')
|
||||
|
||||
try:
|
||||
crawl.sm.tick()
|
||||
crawl.refresh_from_db()
|
||||
rprint(f'[green]Crawl complete (status={crawl.status})[/green]', file=sys.stderr)
|
||||
return 0
|
||||
except Exception as e:
|
||||
rprint(f'[red]Crawl error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
|
||||
return 1
|
||||
# Apply filters
|
||||
filter_kwargs = {
|
||||
'status': status,
|
||||
'urls__icontains': urls__icontains,
|
||||
'max_depth': max_depth,
|
||||
}
|
||||
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
|
||||
|
||||
count = 0
|
||||
for crawl in queryset:
|
||||
if is_tty:
|
||||
status_color = {
|
||||
'queued': 'yellow',
|
||||
'started': 'blue',
|
||||
'sealed': 'green',
|
||||
}.get(crawl.status, 'dim')
|
||||
url_preview = crawl.urls[:50].replace('\n', ' ')
|
||||
rprint(f'[{status_color}]{crawl.status:8}[/{status_color}] [dim]{crawl.id}[/dim] {url_preview}...')
|
||||
else:
|
||||
write_record(crawl.to_json())
|
||||
count += 1
|
||||
|
||||
rprint(f'[dim]Listed {count} crawls[/dim]', file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
def is_crawl_id(value: str) -> bool:
|
||||
"""Check if value looks like a Crawl UUID."""
|
||||
import re
|
||||
uuid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.I)
|
||||
if not uuid_pattern.match(value):
|
||||
return False
|
||||
# Verify it's actually a Crawl (not a Snapshot or other object)
|
||||
# =============================================================================
|
||||
# UPDATE
|
||||
# =============================================================================
|
||||
|
||||
def update_crawls(
|
||||
status: Optional[str] = None,
|
||||
max_depth: Optional[int] = None,
|
||||
) -> int:
|
||||
"""
|
||||
Update Crawls from stdin JSONL.
|
||||
|
||||
Reads Crawl records from stdin and applies updates.
|
||||
Uses PATCH semantics - only specified fields are updated.
|
||||
|
||||
Exit codes:
|
||||
0: Success
|
||||
1: No input or error
|
||||
"""
|
||||
from django.utils import timezone
|
||||
|
||||
from archivebox.misc.jsonl import read_stdin, write_record
|
||||
from archivebox.crawls.models import Crawl
|
||||
return Crawl.objects.filter(id=value).exists()
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
@click.command()
|
||||
@click.option('--depth', '-d', type=int, default=0, help='Max depth for recursive crawling (default: 0, no recursion)')
|
||||
@click.option('--tag', '-t', default='', help='Comma-separated tags to add to snapshots')
|
||||
@click.argument('args', nargs=-1)
|
||||
def main(depth: int, tag: str, args: tuple):
|
||||
"""Create Crawl jobs from URLs, or process existing Crawls by ID"""
|
||||
from archivebox.misc.jsonl import read_args_or_stdin
|
||||
|
||||
# Read all input
|
||||
records = list(read_args_or_stdin(args))
|
||||
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
from rich import print as rprint
|
||||
rprint('[yellow]No URLs or Crawl IDs provided. Pass as arguments or via stdin.[/yellow]', file=sys.stderr)
|
||||
sys.exit(1)
|
||||
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Check if input looks like existing Crawl IDs to process
|
||||
# If ALL inputs are Crawl UUIDs, process them
|
||||
all_are_crawl_ids = all(
|
||||
is_crawl_id(r.get('id') or r.get('url', ''))
|
||||
for r in records
|
||||
)
|
||||
updated_count = 0
|
||||
for record in records:
|
||||
crawl_id = record.get('id')
|
||||
if not crawl_id:
|
||||
continue
|
||||
|
||||
if all_are_crawl_ids:
|
||||
# Process existing Crawls by ID
|
||||
exit_code = 0
|
||||
for record in records:
|
||||
crawl_id = record.get('id') or record.get('url')
|
||||
result = process_crawl_by_id(crawl_id)
|
||||
if result != 0:
|
||||
exit_code = result
|
||||
sys.exit(exit_code)
|
||||
else:
|
||||
# Default behavior: create Crawl jobs from URLs
|
||||
sys.exit(create_crawls(records, depth=depth, tag=tag))
|
||||
try:
|
||||
crawl = Crawl.objects.get(id=crawl_id)
|
||||
|
||||
# Apply updates from CLI flags
|
||||
if status:
|
||||
crawl.status = status
|
||||
crawl.retry_at = timezone.now()
|
||||
if max_depth is not None:
|
||||
crawl.max_depth = max_depth
|
||||
|
||||
crawl.save()
|
||||
updated_count += 1
|
||||
|
||||
if not is_tty:
|
||||
write_record(crawl.to_json())
|
||||
|
||||
except Crawl.DoesNotExist:
|
||||
rprint(f'[yellow]Crawl not found: {crawl_id}[/yellow]', file=sys.stderr)
|
||||
continue
|
||||
|
||||
rprint(f'[green]Updated {updated_count} crawls[/green]', file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# DELETE
|
||||
# =============================================================================
|
||||
|
||||
def delete_crawls(yes: bool = False, dry_run: bool = False) -> int:
|
||||
"""
|
||||
Delete Crawls from stdin JSONL.
|
||||
|
||||
Requires --yes flag to confirm deletion.
|
||||
|
||||
Exit codes:
|
||||
0: Success
|
||||
1: No input or missing --yes flag
|
||||
"""
|
||||
from archivebox.misc.jsonl import read_stdin
|
||||
from archivebox.crawls.models import Crawl
|
||||
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
crawl_ids = [r.get('id') for r in records if r.get('id')]
|
||||
|
||||
if not crawl_ids:
|
||||
rprint('[yellow]No valid crawl IDs in input[/yellow]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
crawls = Crawl.objects.filter(id__in=crawl_ids)
|
||||
count = crawls.count()
|
||||
|
||||
if count == 0:
|
||||
rprint('[yellow]No matching crawls found[/yellow]', file=sys.stderr)
|
||||
return 0
|
||||
|
||||
if dry_run:
|
||||
rprint(f'[yellow]Would delete {count} crawls (dry run)[/yellow]', file=sys.stderr)
|
||||
for crawl in crawls:
|
||||
url_preview = crawl.urls[:50].replace('\n', ' ')
|
||||
rprint(f' [dim]{crawl.id}[/dim] {url_preview}...', file=sys.stderr)
|
||||
return 0
|
||||
|
||||
if not yes:
|
||||
rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Perform deletion
|
||||
deleted_count, _ = crawls.delete()
|
||||
rprint(f'[green]Deleted {deleted_count} crawls[/green]', file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# CLI Commands
|
||||
# =============================================================================
|
||||
|
||||
@click.group()
|
||||
def main():
|
||||
"""Manage Crawl records."""
|
||||
pass
|
||||
|
||||
|
||||
@main.command('create')
|
||||
@click.argument('urls', nargs=-1)
|
||||
@click.option('--depth', '-d', type=int, default=0, help='Max crawl depth (default: 0)')
|
||||
@click.option('--tag', '-t', default='', help='Comma-separated tags to add')
|
||||
@click.option('--status', '-s', default='queued', help='Initial status (default: queued)')
|
||||
def create_cmd(urls: tuple, depth: int, tag: str, status: str):
|
||||
"""Create a Crawl job from URLs or stdin."""
|
||||
sys.exit(create_crawl(urls, depth=depth, tag=tag, status=status))
|
||||
|
||||
|
||||
@main.command('list')
|
||||
@click.option('--status', '-s', help='Filter by status (queued, started, sealed)')
|
||||
@click.option('--urls__icontains', help='Filter by URLs contains')
|
||||
@click.option('--max-depth', type=int, help='Filter by max depth')
|
||||
@click.option('--limit', '-n', type=int, help='Limit number of results')
|
||||
def list_cmd(status: Optional[str], urls__icontains: Optional[str],
|
||||
max_depth: Optional[int], limit: Optional[int]):
|
||||
"""List Crawls as JSONL."""
|
||||
sys.exit(list_crawls(
|
||||
status=status,
|
||||
urls__icontains=urls__icontains,
|
||||
max_depth=max_depth,
|
||||
limit=limit,
|
||||
))
|
||||
|
||||
|
||||
@main.command('update')
|
||||
@click.option('--status', '-s', help='Set status')
|
||||
@click.option('--max-depth', type=int, help='Set max depth')
|
||||
def update_cmd(status: Optional[str], max_depth: Optional[int]):
|
||||
"""Update Crawls from stdin JSONL."""
|
||||
sys.exit(update_crawls(status=status, max_depth=max_depth))
|
||||
|
||||
|
||||
@main.command('delete')
|
||||
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
|
||||
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
|
||||
def delete_cmd(yes: bool, dry_run: bool):
|
||||
"""Delete Crawls from stdin JSONL."""
|
||||
sys.exit(delete_crawls(yes=yes, dry_run=dry_run))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
@@ -1,265 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
"""
|
||||
archivebox extract [snapshot_ids...] [--plugins=NAMES]
|
||||
|
||||
Run plugins on Snapshots. Accepts snapshot IDs as arguments, from stdin, or via JSONL.
|
||||
|
||||
Input formats:
|
||||
- Snapshot UUIDs (one per line)
|
||||
- JSONL: {"type": "Snapshot", "id": "...", "url": "..."}
|
||||
- JSONL: {"type": "ArchiveResult", "snapshot_id": "...", "plugin": "..."}
|
||||
|
||||
Output (JSONL):
|
||||
{"type": "ArchiveResult", "id": "...", "snapshot_id": "...", "plugin": "...", "status": "..."}
|
||||
|
||||
Examples:
|
||||
# Extract specific snapshot
|
||||
archivebox extract 01234567-89ab-cdef-0123-456789abcdef
|
||||
|
||||
# Pipe from snapshot command
|
||||
archivebox snapshot https://example.com | archivebox extract
|
||||
|
||||
# Run specific plugins only
|
||||
archivebox extract --plugins=screenshot,singlefile 01234567-89ab-cdef-0123-456789abcdef
|
||||
|
||||
# Chain commands
|
||||
archivebox crawl https://example.com | archivebox snapshot | archivebox extract
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox extract'
|
||||
|
||||
import sys
|
||||
from typing import Optional, List
|
||||
|
||||
import rich_click as click
|
||||
|
||||
|
||||
def process_archiveresult_by_id(archiveresult_id: str) -> int:
|
||||
"""
|
||||
Run extraction for a single ArchiveResult by ID (used by workers).
|
||||
|
||||
Triggers the ArchiveResult's state machine tick() to run the extractor plugin.
|
||||
"""
|
||||
from rich import print as rprint
|
||||
from archivebox.core.models import ArchiveResult
|
||||
|
||||
try:
|
||||
archiveresult = ArchiveResult.objects.get(id=archiveresult_id)
|
||||
except ArchiveResult.DoesNotExist:
|
||||
rprint(f'[red]ArchiveResult {archiveresult_id} not found[/red]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
rprint(f'[blue]Extracting {archiveresult.plugin} for {archiveresult.snapshot.url}[/blue]', file=sys.stderr)
|
||||
|
||||
try:
|
||||
# Trigger state machine tick - this runs the actual extraction
|
||||
archiveresult.sm.tick()
|
||||
archiveresult.refresh_from_db()
|
||||
|
||||
if archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED:
|
||||
print(f'[green]Extraction succeeded: {archiveresult.output_str}[/green]')
|
||||
return 0
|
||||
elif archiveresult.status == ArchiveResult.StatusChoices.FAILED:
|
||||
print(f'[red]Extraction failed: {archiveresult.output_str}[/red]', file=sys.stderr)
|
||||
return 1
|
||||
else:
|
||||
# Still in progress or backoff - not a failure
|
||||
print(f'[yellow]Extraction status: {archiveresult.status}[/yellow]')
|
||||
return 0
|
||||
|
||||
except Exception as e:
|
||||
print(f'[red]Extraction error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
|
||||
def run_plugins(
|
||||
args: tuple,
|
||||
plugins: str = '',
|
||||
wait: bool = True,
|
||||
) -> int:
|
||||
"""
|
||||
Run plugins on Snapshots from input.
|
||||
|
||||
Reads Snapshot IDs or JSONL from args/stdin, runs plugins, outputs JSONL.
|
||||
|
||||
Exit codes:
|
||||
0: Success
|
||||
1: Failure
|
||||
"""
|
||||
from rich import print as rprint
|
||||
from django.utils import timezone
|
||||
|
||||
from archivebox.misc.jsonl import (
|
||||
read_args_or_stdin, write_record,
|
||||
TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
|
||||
)
|
||||
from archivebox.core.models import Snapshot, ArchiveResult
|
||||
from archivebox.workers.orchestrator import Orchestrator
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
# Parse comma-separated plugins list once (reused in creation and filtering)
|
||||
plugins_list = [p.strip() for p in plugins.split(',') if p.strip()] if plugins else []
|
||||
|
||||
# Collect all input records
|
||||
records = list(read_args_or_stdin(args))
|
||||
|
||||
if not records:
|
||||
rprint('[yellow]No snapshots provided. Pass snapshot IDs as arguments or via stdin.[/yellow]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Gather snapshot IDs to process
|
||||
snapshot_ids = set()
|
||||
for record in records:
|
||||
record_type = record.get('type')
|
||||
|
||||
if record_type == TYPE_SNAPSHOT:
|
||||
snapshot_id = record.get('id')
|
||||
if snapshot_id:
|
||||
snapshot_ids.add(snapshot_id)
|
||||
elif record.get('url'):
|
||||
# Look up by URL (get most recent if multiple exist)
|
||||
snap = Snapshot.objects.filter(url=record['url']).order_by('-created_at').first()
|
||||
if snap:
|
||||
snapshot_ids.add(str(snap.id))
|
||||
else:
|
||||
rprint(f'[yellow]Snapshot not found for URL: {record["url"]}[/yellow]', file=sys.stderr)
|
||||
|
||||
elif record_type == TYPE_ARCHIVERESULT:
|
||||
snapshot_id = record.get('snapshot_id')
|
||||
if snapshot_id:
|
||||
snapshot_ids.add(snapshot_id)
|
||||
|
||||
elif 'id' in record:
|
||||
# Assume it's a snapshot ID
|
||||
snapshot_ids.add(record['id'])
|
||||
|
||||
if not snapshot_ids:
|
||||
rprint('[red]No valid snapshot IDs found in input[/red]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Get snapshots and ensure they have pending ArchiveResults
|
||||
processed_count = 0
|
||||
for snapshot_id in snapshot_ids:
|
||||
try:
|
||||
snapshot = Snapshot.objects.get(id=snapshot_id)
|
||||
except Snapshot.DoesNotExist:
|
||||
rprint(f'[yellow]Snapshot {snapshot_id} not found[/yellow]', file=sys.stderr)
|
||||
continue
|
||||
|
||||
# Create pending ArchiveResults if needed
|
||||
if plugins_list:
|
||||
# Only create for specific plugins
|
||||
for plugin_name in plugins_list:
|
||||
result, created = ArchiveResult.objects.get_or_create(
|
||||
snapshot=snapshot,
|
||||
plugin=plugin_name,
|
||||
defaults={
|
||||
'status': ArchiveResult.StatusChoices.QUEUED,
|
||||
'retry_at': timezone.now(),
|
||||
}
|
||||
)
|
||||
if not created and result.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]:
|
||||
# Reset for retry
|
||||
result.status = ArchiveResult.StatusChoices.QUEUED
|
||||
result.retry_at = timezone.now()
|
||||
result.save()
|
||||
else:
|
||||
# Create all pending plugins
|
||||
snapshot.create_pending_archiveresults()
|
||||
|
||||
# Reset snapshot status to allow processing
|
||||
if snapshot.status == Snapshot.StatusChoices.SEALED:
|
||||
snapshot.status = Snapshot.StatusChoices.STARTED
|
||||
snapshot.retry_at = timezone.now()
|
||||
snapshot.save()
|
||||
|
||||
processed_count += 1
|
||||
|
||||
if processed_count == 0:
|
||||
rprint('[red]No snapshots to process[/red]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
rprint(f'[blue]Queued {processed_count} snapshots for extraction[/blue]', file=sys.stderr)
|
||||
|
||||
# Run orchestrator if --wait (default)
|
||||
if wait:
|
||||
rprint('[blue]Running plugins...[/blue]', file=sys.stderr)
|
||||
orchestrator = Orchestrator(exit_on_idle=True)
|
||||
orchestrator.runloop()
|
||||
|
||||
# Output results as JSONL (when piped) or human-readable (when TTY)
|
||||
for snapshot_id in snapshot_ids:
|
||||
try:
|
||||
snapshot = Snapshot.objects.get(id=snapshot_id)
|
||||
results = snapshot.archiveresult_set.all()
|
||||
if plugins_list:
|
||||
results = results.filter(plugin__in=plugins_list)
|
||||
|
||||
for result in results:
|
||||
if is_tty:
|
||||
status_color = {
|
||||
'succeeded': 'green',
|
||||
'failed': 'red',
|
||||
'skipped': 'yellow',
|
||||
}.get(result.status, 'dim')
|
||||
rprint(f' [{status_color}]{result.status}[/{status_color}] {result.plugin} → {result.output_str or ""}', file=sys.stderr)
|
||||
else:
|
||||
write_record(result.to_jsonl())
|
||||
except Snapshot.DoesNotExist:
|
||||
continue
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
def is_archiveresult_id(value: str) -> bool:
|
||||
"""Check if value looks like an ArchiveResult UUID."""
|
||||
import re
|
||||
uuid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.I)
|
||||
if not uuid_pattern.match(value):
|
||||
return False
|
||||
# Verify it's actually an ArchiveResult (not a Snapshot or other object)
|
||||
from archivebox.core.models import ArchiveResult
|
||||
return ArchiveResult.objects.filter(id=value).exists()
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--plugins', '-p', default='', help='Comma-separated list of plugins to run (e.g., screenshot,singlefile)')
|
||||
@click.option('--wait/--no-wait', default=True, help='Wait for plugins to complete (default: wait)')
|
||||
@click.argument('args', nargs=-1)
|
||||
def main(plugins: str, wait: bool, args: tuple):
|
||||
"""Run plugins on Snapshots, or process existing ArchiveResults by ID"""
|
||||
from archivebox.misc.jsonl import read_args_or_stdin
|
||||
|
||||
# Read all input
|
||||
records = list(read_args_or_stdin(args))
|
||||
|
||||
if not records:
|
||||
from rich import print as rprint
|
||||
rprint('[yellow]No Snapshot IDs or ArchiveResult IDs provided. Pass as arguments or via stdin.[/yellow]', file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# Check if input looks like existing ArchiveResult IDs to process
|
||||
all_are_archiveresult_ids = all(
|
||||
is_archiveresult_id(r.get('id') or r.get('url', ''))
|
||||
for r in records
|
||||
)
|
||||
|
||||
if all_are_archiveresult_ids:
|
||||
# Process existing ArchiveResults by ID
|
||||
exit_code = 0
|
||||
for record in records:
|
||||
archiveresult_id = record.get('id') or record.get('url')
|
||||
result = process_archiveresult_by_id(archiveresult_id)
|
||||
if result != 0:
|
||||
exit_code = result
|
||||
sys.exit(exit_code)
|
||||
else:
|
||||
# Default behavior: run plugins on Snapshots from input
|
||||
sys.exit(run_plugins(args, plugins=plugins, wait=wait))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -127,7 +127,7 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None:

     if pending_links:
         for link_dict in pending_links.values():
-            Snapshot.from_jsonl(link_dict)
+            Snapshot.from_json(link_dict)

     # Hint for orphaned snapshot directories
     print()
archivebox/cli/archivebox_machine.py (new file, 113 lines)
@@ -0,0 +1,113 @@
#!/usr/bin/env python3
|
||||
|
||||
"""
|
||||
archivebox machine <action> [--filters]
|
||||
|
||||
Manage Machine records (system-managed, mostly read-only).
|
||||
|
||||
Machine records track the host machines where ArchiveBox runs.
|
||||
They are created automatically by the system and are primarily for debugging.
|
||||
|
||||
Actions:
|
||||
list - List Machines as JSONL (with optional filters)
|
||||
|
||||
Examples:
|
||||
# List all machines
|
||||
archivebox machine list
|
||||
|
||||
# List machines by hostname
|
||||
archivebox machine list --hostname__icontains=myserver
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox machine'
|
||||
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
import rich_click as click
|
||||
from rich import print as rprint
|
||||
|
||||
|
||||
def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
|
||||
"""Apply Django-style filters from CLI kwargs to a QuerySet."""
|
||||
filters = {}
|
||||
for key, value in filter_kwargs.items():
|
||||
if value is not None and key not in ('limit', 'offset'):
|
||||
filters[key] = value
|
||||
|
||||
if filters:
|
||||
queryset = queryset.filter(**filters)
|
||||
|
||||
if limit:
|
||||
queryset = queryset[:limit]
|
||||
|
||||
return queryset
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# LIST
|
||||
# =============================================================================
|
||||
|
||||
def list_machines(
|
||||
hostname__icontains: Optional[str] = None,
|
||||
os_platform: Optional[str] = None,
|
||||
limit: Optional[int] = None,
|
||||
) -> int:
|
||||
"""
|
||||
List Machines as JSONL with optional filters.
|
||||
|
||||
Exit codes:
|
||||
0: Success (even if no results)
|
||||
"""
|
||||
from archivebox.misc.jsonl import write_record
|
||||
from archivebox.machine.models import Machine
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
queryset = Machine.objects.all().order_by('-created_at')
|
||||
|
||||
# Apply filters
|
||||
filter_kwargs = {
|
||||
'hostname__icontains': hostname__icontains,
|
||||
'os_platform': os_platform,
|
||||
}
|
||||
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
|
||||
|
||||
count = 0
|
||||
for machine in queryset:
|
||||
if is_tty:
|
||||
rprint(f'[cyan]{machine.hostname:30}[/cyan] [dim]{machine.os_platform:10}[/dim] {machine.id}')
|
||||
else:
|
||||
write_record(machine.to_json())
|
||||
count += 1
|
||||
|
||||
rprint(f'[dim]Listed {count} machines[/dim]', file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# CLI Commands
|
||||
# =============================================================================
|
||||
|
||||
@click.group()
|
||||
def main():
|
||||
"""Manage Machine records (read-only, system-managed)."""
|
||||
pass
|
||||
|
||||
|
||||
@main.command('list')
|
||||
@click.option('--hostname__icontains', help='Filter by hostname contains')
|
||||
@click.option('--os-platform', help='Filter by OS platform')
|
||||
@click.option('--limit', '-n', type=int, help='Limit number of results')
|
||||
def list_cmd(hostname__icontains: Optional[str], os_platform: Optional[str], limit: Optional[int]):
|
||||
"""List Machines as JSONL."""
|
||||
sys.exit(list_machines(
|
||||
hostname__icontains=hostname__icontains,
|
||||
os_platform=os_platform,
|
||||
limit=limit,
|
||||
))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
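Since `machine list` (like the other `list` subcommands above) emits JSONL when its stdout is piped, the output can be consumed from a script without scraping the human-readable table. A small sketch that assumes only that each output line is a standalone JSON object; the field names are inferred from the TTY rendering above:

import json
import subprocess

# Hypothetical consumer of `archivebox machine list` output.
proc = subprocess.run(
    ['archivebox', 'machine', 'list', '--limit', '10'],
    capture_output=True, text=True, check=True,
)
for line in proc.stdout.splitlines():
    if not line.strip():
        continue
    machine = json.loads(line)                          # one Machine record per line
    print(machine.get('hostname'), machine.get('id'))   # field names assumed, not guaranteed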
@@ -1,67 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
"""
|
||||
archivebox orchestrator [--daemon]
|
||||
|
||||
Start the orchestrator process that manages workers.
|
||||
|
||||
The orchestrator polls queues for each model type (Crawl, Snapshot, ArchiveResult)
|
||||
and lazily spawns worker processes when there is work to be done.
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox orchestrator'
|
||||
|
||||
import sys
|
||||
|
||||
import rich_click as click
|
||||
|
||||
from archivebox.misc.util import docstring
|
||||
|
||||
|
||||
def orchestrator(daemon: bool = False, watch: bool = False) -> int:
|
||||
"""
|
||||
Start the orchestrator process.
|
||||
|
||||
The orchestrator:
|
||||
1. Polls each model queue (Crawl, Snapshot, ArchiveResult)
|
||||
2. Spawns worker processes when there is work to do
|
||||
3. Monitors worker health and restarts failed workers
|
||||
4. Exits when all queues are empty (unless --daemon)
|
||||
|
||||
Args:
|
||||
daemon: Run forever (don't exit when idle)
|
||||
watch: Just watch the queues without spawning workers (for debugging)
|
||||
|
||||
Exit codes:
|
||||
0: All work completed successfully
|
||||
1: Error occurred
|
||||
"""
|
||||
from archivebox.workers.orchestrator import Orchestrator
|
||||
|
||||
if Orchestrator.is_running():
|
||||
print('[yellow]Orchestrator is already running[/yellow]')
|
||||
return 0
|
||||
|
||||
try:
|
||||
orchestrator_instance = Orchestrator(exit_on_idle=not daemon)
|
||||
orchestrator_instance.runloop()
|
||||
return 0
|
||||
except KeyboardInterrupt:
|
||||
return 0
|
||||
except Exception as e:
|
||||
print(f'[red]Orchestrator error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--daemon', '-d', is_flag=True, help="Run forever (don't exit on idle)")
|
||||
@click.option('--watch', '-w', is_flag=True, help="Watch queues without spawning workers")
|
||||
@docstring(orchestrator.__doc__)
|
||||
def main(daemon: bool, watch: bool):
|
||||
"""Start the ArchiveBox orchestrator process"""
|
||||
sys.exit(orchestrator(daemon=daemon, watch=watch))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
121
archivebox/cli/archivebox_process.py
Normal file
@@ -0,0 +1,121 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
"""
|
||||
archivebox process <action> [--filters]
|
||||
|
||||
Manage Process records (system-managed, mostly read-only).
|
||||
|
||||
Process records track executions of binaries during extraction.
|
||||
They are created automatically by the system and are primarily for debugging.
|
||||
|
||||
Actions:
|
||||
list - List Processes as JSONL (with optional filters)
|
||||
|
||||
Examples:
|
||||
# List all processes
|
||||
archivebox process list
|
||||
|
||||
# List processes by binary
|
||||
archivebox process list --binary-name=chrome
|
||||
|
||||
# List recent processes
|
||||
archivebox process list --limit=10
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox process'
|
||||
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
import rich_click as click
|
||||
from rich import print as rprint
|
||||
|
||||
|
||||
def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
|
||||
"""Apply Django-style filters from CLI kwargs to a QuerySet."""
|
||||
filters = {}
|
||||
for key, value in filter_kwargs.items():
|
||||
if value is not None and key not in ('limit', 'offset'):
|
||||
filters[key] = value
|
||||
|
||||
if filters:
|
||||
queryset = queryset.filter(**filters)
|
||||
|
||||
if limit:
|
||||
queryset = queryset[:limit]
|
||||
|
||||
return queryset
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# LIST
|
||||
# =============================================================================
|
||||
|
||||
def list_processes(
|
||||
binary_name: Optional[str] = None,
|
||||
machine_id: Optional[str] = None,
|
||||
limit: Optional[int] = None,
|
||||
) -> int:
|
||||
"""
|
||||
List Processes as JSONL with optional filters.
|
||||
|
||||
Exit codes:
|
||||
0: Success (even if no results)
|
||||
"""
|
||||
from archivebox.misc.jsonl import write_record
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
queryset = Process.objects.all().select_related('binary', 'machine').order_by('-start_ts')
|
||||
|
||||
# Apply filters
|
||||
filter_kwargs = {}
|
||||
if binary_name:
|
||||
filter_kwargs['binary__name'] = binary_name
|
||||
if machine_id:
|
||||
filter_kwargs['machine_id'] = machine_id
|
||||
|
||||
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
|
||||
|
||||
count = 0
|
||||
for process in queryset:
|
||||
if is_tty:
|
||||
binary_name_str = process.binary.name if process.binary else 'unknown'
|
||||
exit_code = process.returncode if process.returncode is not None else '?'
|
||||
status_color = 'green' if process.returncode == 0 else 'red' if process.returncode else 'yellow'
|
||||
rprint(f'[{status_color}]exit={exit_code:3}[/{status_color}] [cyan]{binary_name_str:15}[/cyan] [dim]{process.id}[/dim]')
|
||||
else:
|
||||
write_record(process.to_json())
|
||||
count += 1
|
||||
|
||||
rprint(f'[dim]Listed {count} processes[/dim]', file=sys.stderr)
|
||||
return 0
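Every list action in these commands follows the same output convention: rich, human-readable rows when stdout is a TTY, one JSON object per line when piped, with progress messages kept on stderr so they never pollute the pipe. A standalone sketch of that pattern (the record fields here are illustrative only):

import json
import sys

def emit(record: dict) -> None:
    if sys.stdout.isatty():
        # Interactive terminal: pretty, lossy output for humans.
        print(f"exit={record.get('returncode')} {record.get('binary', '?')}")
    else:
        # Piped: lossless JSONL for the next command in the chain.
        sys.stdout.write(json.dumps(record) + '\n')

emit({'binary': 'chrome', 'returncode': 0})
print('done', file=sys.stderr)  # status chatter goes to stderr either way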
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# CLI Commands
|
||||
# =============================================================================
|
||||
|
||||
@click.group()
|
||||
def main():
|
||||
"""Manage Process records (read-only, system-managed)."""
|
||||
pass
|
||||
|
||||
|
||||
@main.command('list')
|
||||
@click.option('--binary-name', '-b', help='Filter by binary name')
|
||||
@click.option('--machine-id', '-m', help='Filter by machine ID')
|
||||
@click.option('--limit', '-n', type=int, help='Limit number of results')
|
||||
def list_cmd(binary_name: Optional[str], machine_id: Optional[str], limit: Optional[int]):
|
||||
"""List Processes as JSONL."""
|
||||
sys.exit(list_processes(
|
||||
binary_name=binary_name,
|
||||
machine_id=machine_id,
|
||||
limit=limit,
|
||||
))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
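A hedged sketch of filtering this command's JSONL stream for failed runs; it assumes Process.to_json() exposes a returncode field matching the model field used above, and that archivebox is on PATH inside a data folder:

import json
import subprocess

out = subprocess.run(
    ['archivebox', 'process', 'list', '--binary-name=chrome'],
    capture_output=True, text=True, check=True,
).stdout
records = [json.loads(line) for line in out.splitlines() if line.strip()]
failed = [r for r in records if r.get('returncode') not in (0, None)]
print(f'{len(failed)} of {len(records)} chrome runs exited non-zero')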
|
||||
@@ -1,98 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox remove'
|
||||
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
|
||||
import rich_click as click
|
||||
|
||||
from django.db.models import QuerySet
|
||||
|
||||
from archivebox.config import DATA_DIR
|
||||
from archivebox.config.django import setup_django
|
||||
from archivebox.misc.util import enforce_types, docstring
|
||||
from archivebox.misc.checks import check_data_folder
|
||||
from archivebox.misc.logging_util import (
|
||||
log_list_started,
|
||||
log_list_finished,
|
||||
log_removal_started,
|
||||
log_removal_finished,
|
||||
TimedProgress,
|
||||
)
|
||||
|
||||
|
||||
@enforce_types
|
||||
def remove(filter_patterns: Iterable[str]=(),
|
||||
filter_type: str='exact',
|
||||
snapshots: QuerySet | None=None,
|
||||
after: float | None=None,
|
||||
before: float | None=None,
|
||||
yes: bool=False,
|
||||
delete: bool=False,
|
||||
out_dir: Path=DATA_DIR) -> QuerySet:
|
||||
"""Remove the specified URLs from the archive"""
|
||||
|
||||
setup_django()
|
||||
check_data_folder()
|
||||
|
||||
from archivebox.cli.archivebox_search import get_snapshots
|
||||
|
||||
log_list_started(filter_patterns, filter_type)
|
||||
timer = TimedProgress(360, prefix=' ')
|
||||
try:
|
||||
snapshots = get_snapshots(
|
||||
snapshots=snapshots,
|
||||
filter_patterns=list(filter_patterns) if filter_patterns else None,
|
||||
filter_type=filter_type,
|
||||
after=after,
|
||||
before=before,
|
||||
)
|
||||
finally:
|
||||
timer.end()
|
||||
|
||||
if not snapshots.exists():
|
||||
log_removal_finished(0, 0)
|
||||
raise SystemExit(1)
|
||||
|
||||
log_list_finished(snapshots)
|
||||
log_removal_started(snapshots, yes=yes, delete=delete)
|
||||
|
||||
timer = TimedProgress(360, prefix=' ')
|
||||
try:
|
||||
for snapshot in snapshots:
|
||||
if delete:
|
||||
shutil.rmtree(snapshot.output_dir, ignore_errors=True)
|
||||
finally:
|
||||
timer.end()
|
||||
|
||||
to_remove = snapshots.count()
|
||||
|
||||
from archivebox.search import flush_search_index
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
flush_search_index(snapshots=snapshots)
|
||||
snapshots.delete()
|
||||
all_snapshots = Snapshot.objects.all()
|
||||
log_removal_finished(all_snapshots.count(), to_remove)
|
||||
|
||||
return all_snapshots
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--yes', is_flag=True, help='Remove links instantly without prompting to confirm')
|
||||
@click.option('--delete', is_flag=True, help='Delete the archived content and metadata folder in addition to removing from index')
|
||||
@click.option('--before', type=float, help='Remove only URLs bookmarked before timestamp')
|
||||
@click.option('--after', type=float, help='Remove only URLs bookmarked after timestamp')
|
||||
@click.option('--filter-type', '-f', type=click.Choice(('exact', 'substring', 'domain', 'regex', 'tag')), default='exact', help='Type of pattern matching to use when filtering URLs')
|
||||
@click.argument('filter_patterns', nargs=-1)
|
||||
@docstring(remove.__doc__)
|
||||
def main(**kwargs):
|
||||
"""Remove the specified URLs from the archive"""
|
||||
remove(**kwargs)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
155
archivebox/cli/archivebox_run.py
Normal file
@@ -0,0 +1,155 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
"""
|
||||
archivebox run [--daemon]
|
||||
|
||||
Unified command for processing queued work.
|
||||
|
||||
Modes:
|
||||
- With stdin JSONL: Process piped records, exit when complete
|
||||
- Without stdin (TTY): Run orchestrator in foreground until killed
|
||||
|
||||
Examples:
|
||||
# Run orchestrator in foreground (replaces `archivebox orchestrator`)
|
||||
archivebox run
|
||||
|
||||
# Run as daemon (don't exit on idle)
|
||||
archivebox run --daemon
|
||||
|
||||
# Process specific records (pipe any JSONL type, exits when done)
|
||||
archivebox snapshot list --status=queued | archivebox run
|
||||
archivebox archiveresult list --status=failed | archivebox run
|
||||
archivebox crawl list --status=queued | archivebox run
|
||||
|
||||
# Mixed types work too
|
||||
cat mixed_records.jsonl | archivebox run
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox run'
|
||||
|
||||
import sys
|
||||
|
||||
import rich_click as click
|
||||
from rich import print as rprint
|
||||
|
||||
|
||||
def process_stdin_records() -> int:
|
||||
"""
|
||||
Process JSONL records from stdin.
|
||||
|
||||
Reads records, queues them for processing, then runs orchestrator until complete.
|
||||
Handles any record type: Crawl, Snapshot, ArchiveResult, etc.
|
||||
|
||||
Returns exit code (0 = success, 1 = error).
|
||||
"""
|
||||
from django.utils import timezone
|
||||
|
||||
from archivebox.misc.jsonl import read_stdin, TYPE_CRAWL, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
|
||||
from archivebox.core.models import Snapshot, ArchiveResult
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.workers.orchestrator import Orchestrator
|
||||
|
||||
records = list(read_stdin())
|
||||
|
||||
if not records:
|
||||
return 0 # Nothing to process
|
||||
|
||||
queued_count = 0
|
||||
|
||||
for record in records:
|
||||
record_type = record.get('type')
|
||||
record_id = record.get('id')
|
||||
|
||||
if not record_id:
|
||||
continue
|
||||
|
||||
try:
|
||||
if record_type == TYPE_CRAWL:
|
||||
crawl = Crawl.objects.get(id=record_id)
|
||||
if crawl.status in [Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED]:
|
||||
crawl.retry_at = timezone.now()
|
||||
crawl.save()
|
||||
queued_count += 1
|
||||
|
||||
elif record_type == TYPE_SNAPSHOT:
|
||||
snapshot = Snapshot.objects.get(id=record_id)
|
||||
if snapshot.status in [Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]:
|
||||
snapshot.retry_at = timezone.now()
|
||||
snapshot.save()
|
||||
queued_count += 1
|
||||
|
||||
elif record_type == TYPE_ARCHIVERESULT:
|
||||
archiveresult = ArchiveResult.objects.get(id=record_id)
|
||||
if archiveresult.status in [ArchiveResult.StatusChoices.QUEUED, ArchiveResult.StatusChoices.STARTED, ArchiveResult.StatusChoices.BACKOFF]:
|
||||
archiveresult.retry_at = timezone.now()
|
||||
archiveresult.save()
|
||||
queued_count += 1
|
||||
|
||||
except (Crawl.DoesNotExist, Snapshot.DoesNotExist, ArchiveResult.DoesNotExist):
|
||||
rprint(f'[yellow]Record not found: {record_type} {record_id}[/yellow]', file=sys.stderr)
|
||||
continue
|
||||
|
||||
if queued_count == 0:
|
||||
rprint('[yellow]No records to process[/yellow]', file=sys.stderr)
|
||||
return 0
|
||||
|
||||
rprint(f'[blue]Processing {queued_count} records...[/blue]', file=sys.stderr)
|
||||
|
||||
# Run orchestrator until all queued work is done
|
||||
orchestrator = Orchestrator(exit_on_idle=True)
|
||||
orchestrator.runloop()
|
||||
|
||||
return 0
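The requeue step above relies on one convention: setting retry_at to now marks a queued or started record as due, so the orchestrator's next poll picks it up. The same thing can be done by hand inside `archivebox shell` (where Django is already configured); a sketch using only the fields shown above:

from django.utils import timezone
from archivebox.core.models import Snapshot

# Mark up to 10 queued snapshots as due for immediate processing.
for snapshot in Snapshot.objects.filter(status=Snapshot.StatusChoices.QUEUED)[:10]:
    snapshot.retry_at = timezone.now()
    snapshot.save()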
|
||||
|
||||
|
||||
def run_orchestrator(daemon: bool = False) -> int:
|
||||
"""
|
||||
Run the orchestrator process.
|
||||
|
||||
The orchestrator:
|
||||
1. Polls each model queue (Crawl, Snapshot, ArchiveResult)
|
||||
2. Spawns worker processes when there is work to do
|
||||
3. Monitors worker health and restarts failed workers
|
||||
4. Exits when all queues are empty (unless --daemon)
|
||||
|
||||
Args:
|
||||
daemon: Run forever (don't exit when idle)
|
||||
|
||||
Returns exit code (0 = success, 1 = error).
|
||||
"""
|
||||
from archivebox.workers.orchestrator import Orchestrator
|
||||
|
||||
if Orchestrator.is_running():
|
||||
rprint('[yellow]Orchestrator is already running[/yellow]', file=sys.stderr)
|
||||
return 0
|
||||
|
||||
try:
|
||||
orchestrator = Orchestrator(exit_on_idle=not daemon)
|
||||
orchestrator.runloop()
|
||||
return 0
|
||||
except KeyboardInterrupt:
|
||||
return 0
|
||||
except Exception as e:
|
||||
rprint(f'[red]Orchestrator error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--daemon', '-d', is_flag=True, help="Run forever (don't exit on idle)")
|
||||
def main(daemon: bool):
|
||||
"""
|
||||
Process queued work.
|
||||
|
||||
When stdin is piped: Process those specific records and exit.
|
||||
When run standalone: Run orchestrator in foreground.
|
||||
"""
|
||||
# Check if stdin has data (non-TTY means piped input)
|
||||
if not sys.stdin.isatty():
|
||||
sys.exit(process_stdin_records())
|
||||
else:
|
||||
sys.exit(run_orchestrator(daemon=daemon))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
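The piped mode can also be driven from Python rather than a shell; a hedged sketch of `archivebox snapshot list --status=queued | archivebox run`, assuming archivebox is on PATH inside an initialized data folder:

import subprocess

lister = subprocess.Popen(
    ['archivebox', 'snapshot', 'list', '--status=queued'],
    stdout=subprocess.PIPE,
)
runner = subprocess.Popen(['archivebox', 'run'], stdin=lister.stdout)
lister.stdout.close()  # let the lister see a closed pipe if the runner exits first
runner.wait()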
|
||||
@@ -1,131 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox search'
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Optional, List, Any
|
||||
|
||||
import rich_click as click
|
||||
from rich import print
|
||||
|
||||
from django.db.models import QuerySet
|
||||
|
||||
from archivebox.config import DATA_DIR
|
||||
from archivebox.misc.logging import stderr
|
||||
from archivebox.misc.util import enforce_types, docstring
|
||||
|
||||
# Filter types for URL matching
|
||||
LINK_FILTERS = {
|
||||
'exact': lambda pattern: {'url': pattern},
|
||||
'substring': lambda pattern: {'url__icontains': pattern},
|
||||
'regex': lambda pattern: {'url__iregex': pattern},
|
||||
'domain': lambda pattern: {'url__istartswith': f'http://{pattern}'},
|
||||
'tag': lambda pattern: {'tags__name': pattern},
|
||||
'timestamp': lambda pattern: {'timestamp': pattern},
|
||||
}
|
||||
|
||||
STATUS_CHOICES = ['indexed', 'archived', 'unarchived']
|
||||
|
||||
|
||||
|
||||
def get_snapshots(snapshots: Optional[QuerySet]=None,
|
||||
filter_patterns: Optional[List[str]]=None,
|
||||
filter_type: str='substring',
|
||||
after: Optional[float]=None,
|
||||
before: Optional[float]=None,
|
||||
out_dir: Path=DATA_DIR) -> QuerySet:
|
||||
"""Filter and return Snapshots matching the given criteria."""
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
if snapshots:
|
||||
result = snapshots
|
||||
else:
|
||||
result = Snapshot.objects.all()
|
||||
|
||||
if after is not None:
|
||||
result = result.filter(timestamp__gte=after)
|
||||
if before is not None:
|
||||
result = result.filter(timestamp__lt=before)
|
||||
if filter_patterns:
|
||||
result = Snapshot.objects.filter_by_patterns(filter_patterns, filter_type)
|
||||
|
||||
if not result:
|
||||
stderr('[!] No Snapshots matched your filters:', filter_patterns, f'({filter_type})', color='lightyellow')
|
||||
|
||||
return result
|
||||
|
||||
|
||||
@enforce_types
|
||||
def search(filter_patterns: list[str] | None=None,
|
||||
filter_type: str='substring',
|
||||
status: str='indexed',
|
||||
before: float | None=None,
|
||||
after: float | None=None,
|
||||
sort: str | None=None,
|
||||
json: bool=False,
|
||||
html: bool=False,
|
||||
csv: str | None=None,
|
||||
with_headers: bool=False):
|
||||
"""List, filter, and export information about archive entries"""
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
if with_headers and not (json or html or csv):
|
||||
stderr('[X] --with-headers requires --json, --html or --csv\n', color='red')
|
||||
raise SystemExit(2)
|
||||
|
||||
# Query DB directly - no filesystem scanning
|
||||
snapshots = get_snapshots(
|
||||
filter_patterns=list(filter_patterns) if filter_patterns else None,
|
||||
filter_type=filter_type,
|
||||
before=before,
|
||||
after=after,
|
||||
)
|
||||
|
||||
# Apply status filter
|
||||
if status == 'archived':
|
||||
snapshots = snapshots.filter(downloaded_at__isnull=False)
|
||||
elif status == 'unarchived':
|
||||
snapshots = snapshots.filter(downloaded_at__isnull=True)
|
||||
# 'indexed' = all snapshots (no filter)
|
||||
|
||||
if sort:
|
||||
snapshots = snapshots.order_by(sort)
|
||||
|
||||
# Export to requested format
|
||||
if json:
|
||||
output = snapshots.to_json(with_headers=with_headers)
|
||||
elif html:
|
||||
output = snapshots.to_html(with_headers=with_headers)
|
||||
elif csv:
|
||||
output = snapshots.to_csv(cols=csv.split(','), header=with_headers)
|
||||
else:
|
||||
from archivebox.misc.logging_util import printable_folders
|
||||
# Convert to dict for printable_folders
|
||||
folders = {s.output_dir: s for s in snapshots}
|
||||
output = printable_folders(folders, with_headers)
|
||||
|
||||
print(output)
|
||||
return output
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--filter-type', '-f', type=click.Choice(['search', *LINK_FILTERS.keys()]), default='substring', help='Pattern matching type for filtering URLs')
|
||||
@click.option('--status', '-s', type=click.Choice(STATUS_CHOICES), default='indexed', help='List snapshots with the given status')
|
||||
@click.option('--before', '-b', type=float, help='List snapshots bookmarked before the given UNIX timestamp')
|
||||
@click.option('--after', '-a', type=float, help='List snapshots bookmarked after the given UNIX timestamp')
|
||||
@click.option('--sort', '-o', type=str, help='Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at')
|
||||
@click.option('--json', '-J', is_flag=True, help='Print output in JSON format')
|
||||
@click.option('--html', '-M', is_flag=True, help='Print output in HTML format (suitable for viewing statically without a server)')
|
||||
@click.option('--csv', '-C', type=str, help='Print output as CSV with the provided fields, e.g.: created_at,url,title')
|
||||
@click.option('--with-headers', '-H', is_flag=True, help='Include extra CSV/HTML headers in the output')
|
||||
@click.help_option('--help', '-h')
|
||||
@click.argument('filter_patterns', nargs=-1)
|
||||
@docstring(search.__doc__)
|
||||
def main(**kwargs):
|
||||
return search(**kwargs)
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -1,93 +1,76 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
"""
|
||||
archivebox snapshot [urls_or_crawl_ids...] [--tag=TAG] [--plugins=NAMES]
|
||||
archivebox snapshot <action> [args...] [--filters]
|
||||
|
||||
Create Snapshots from URLs or Crawl jobs. Accepts URLs, Crawl JSONL, or Crawl IDs.
|
||||
Manage Snapshot records.
|
||||
|
||||
Input formats:
|
||||
- Plain URLs (one per line)
|
||||
- JSONL: {"type": "Crawl", "id": "...", "urls": "..."}
|
||||
- JSONL: {"type": "Snapshot", "url": "...", "title": "...", "tags": "..."}
|
||||
- Crawl UUIDs (one per line)
|
||||
|
||||
Output (JSONL):
|
||||
{"type": "Snapshot", "id": "...", "url": "...", "status": "queued", ...}
|
||||
Actions:
|
||||
create - Create Snapshots from URLs or Crawl JSONL
|
||||
list - List Snapshots as JSONL (with optional filters)
|
||||
update - Update Snapshots from stdin JSONL
|
||||
delete - Delete Snapshots from stdin JSONL
|
||||
|
||||
Examples:
|
||||
# Create snapshots from URLs directly
|
||||
archivebox snapshot https://example.com https://foo.com
|
||||
# Create
|
||||
archivebox snapshot create https://example.com --tag=news
|
||||
archivebox crawl create https://example.com | archivebox snapshot create
|
||||
|
||||
# Pipe from crawl command
|
||||
archivebox crawl https://example.com | archivebox snapshot
|
||||
# List with filters
|
||||
archivebox snapshot list --status=queued
|
||||
archivebox snapshot list --url__icontains=example.com
|
||||
|
||||
# Chain with extract
|
||||
archivebox crawl https://example.com | archivebox snapshot | archivebox extract
|
||||
# Update
|
||||
archivebox snapshot list --tag=old | archivebox snapshot update --tag=new
|
||||
|
||||
# Run specific plugins after creating snapshots
|
||||
archivebox snapshot --plugins=screenshot,singlefile https://example.com
|
||||
|
||||
# Process existing Snapshot by ID
|
||||
archivebox snapshot 01234567-89ab-cdef-0123-456789abcdef
|
||||
# Delete
|
||||
archivebox snapshot list --url__icontains=spam.com | archivebox snapshot delete --yes
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox snapshot'
|
||||
|
||||
import sys
|
||||
from typing import Optional
|
||||
from typing import Optional, Iterable
|
||||
|
||||
import rich_click as click
|
||||
|
||||
from archivebox.misc.util import docstring
|
||||
from rich import print as rprint
|
||||
|
||||
|
||||
def process_snapshot_by_id(snapshot_id: str) -> int:
|
||||
"""
|
||||
Process a single Snapshot by ID (used by workers).
|
||||
def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
|
||||
"""Apply Django-style filters from CLI kwargs to a QuerySet."""
|
||||
filters = {}
|
||||
for key, value in filter_kwargs.items():
|
||||
if value is not None and key not in ('limit', 'offset'):
|
||||
filters[key] = value
|
||||
|
||||
Triggers the Snapshot's state machine tick() which will:
|
||||
- Transition from queued -> started (creates pending ArchiveResults)
|
||||
- Transition from started -> sealed (when all ArchiveResults done)
|
||||
"""
|
||||
from rich import print as rprint
|
||||
from archivebox.core.models import Snapshot
|
||||
if filters:
|
||||
queryset = queryset.filter(**filters)
|
||||
|
||||
try:
|
||||
snapshot = Snapshot.objects.get(id=snapshot_id)
|
||||
except Snapshot.DoesNotExist:
|
||||
rprint(f'[red]Snapshot {snapshot_id} not found[/red]', file=sys.stderr)
|
||||
return 1
|
||||
if limit:
|
||||
queryset = queryset[:limit]
|
||||
|
||||
rprint(f'[blue]Processing Snapshot {snapshot.id} {snapshot.url[:50]} (status={snapshot.status})[/blue]', file=sys.stderr)
|
||||
return queryset
|
||||
|
||||
try:
|
||||
snapshot.sm.tick()
|
||||
snapshot.refresh_from_db()
|
||||
rprint(f'[green]Snapshot complete (status={snapshot.status})[/green]', file=sys.stderr)
|
||||
return 0
|
||||
except Exception as e:
|
||||
rprint(f'[red]Snapshot error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# =============================================================================
|
||||
# CREATE
|
||||
# =============================================================================
|
||||
|
||||
def create_snapshots(
|
||||
args: tuple,
|
||||
urls: Iterable[str],
|
||||
tag: str = '',
|
||||
plugins: str = '',
|
||||
status: str = 'queued',
|
||||
depth: int = 0,
|
||||
created_by_id: Optional[int] = None,
|
||||
) -> int:
|
||||
"""
|
||||
Create Snapshots from URLs, Crawl JSONL, or Crawl IDs.
|
||||
|
||||
Reads from args or stdin, creates Snapshot objects, outputs JSONL.
|
||||
If --plugins is passed, also runs specified plugins (blocking).
|
||||
Create Snapshots from URLs or stdin JSONL (Crawl or Snapshot records).
|
||||
|
||||
Exit codes:
|
||||
0: Success
|
||||
1: Failure
|
||||
"""
|
||||
from rich import print as rprint
|
||||
from django.utils import timezone
|
||||
|
||||
from archivebox.misc.jsonl import (
|
||||
@@ -102,7 +85,7 @@ def create_snapshots(
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
# Collect all input records
|
||||
records = list(read_args_or_stdin(args))
|
||||
records = list(read_args_or_stdin(urls))
|
||||
|
||||
if not records:
|
||||
rprint('[yellow]No URLs or Crawls provided. Pass URLs as arguments or via stdin.[/yellow]', file=sys.stderr)
|
||||
@@ -122,47 +105,44 @@ def create_snapshots(
|
||||
try:
|
||||
crawl = Crawl.objects.get(id=crawl_id)
|
||||
except Crawl.DoesNotExist:
|
||||
# Crawl doesn't exist, create it
|
||||
crawl = Crawl.from_jsonl(record, overrides={'created_by_id': created_by_id})
|
||||
crawl = Crawl.from_json(record, overrides={'created_by_id': created_by_id})
|
||||
else:
|
||||
# No ID, create new crawl
|
||||
crawl = Crawl.from_jsonl(record, overrides={'created_by_id': created_by_id})
|
||||
crawl = Crawl.from_json(record, overrides={'created_by_id': created_by_id})
|
||||
|
||||
if not crawl:
|
||||
continue
|
||||
|
||||
# Create snapshots for each URL in the crawl
|
||||
for url in crawl.get_urls_list():
|
||||
# Merge CLI tags with crawl tags
|
||||
merged_tags = crawl.tags_str
|
||||
if tag:
|
||||
if merged_tags:
|
||||
merged_tags = f"{merged_tags},{tag}"
|
||||
else:
|
||||
merged_tags = tag
|
||||
merged_tags = f"{merged_tags},{tag}" if merged_tags else tag
|
||||
snapshot_record = {
|
||||
'url': url,
|
||||
'tags': merged_tags,
|
||||
'crawl_id': str(crawl.id),
|
||||
'depth': 0,
|
||||
'depth': depth,
|
||||
'status': status,
|
||||
}
|
||||
snapshot = Snapshot.from_jsonl(snapshot_record, overrides={'created_by_id': created_by_id})
|
||||
snapshot = Snapshot.from_json(snapshot_record, overrides={'created_by_id': created_by_id})
|
||||
if snapshot:
|
||||
created_snapshots.append(snapshot)
|
||||
if not is_tty:
|
||||
write_record(snapshot.to_jsonl())
|
||||
write_record(snapshot.to_json())
|
||||
|
||||
elif record_type == TYPE_SNAPSHOT or record.get('url'):
|
||||
# Input is a Snapshot or plain URL
|
||||
# Add tags if provided via CLI
|
||||
if tag and not record.get('tags'):
|
||||
record['tags'] = tag
|
||||
if status:
|
||||
record['status'] = status
|
||||
record['depth'] = record.get('depth', depth)
|
||||
|
||||
snapshot = Snapshot.from_jsonl(record, overrides={'created_by_id': created_by_id})
|
||||
snapshot = Snapshot.from_json(record, overrides={'created_by_id': created_by_id})
|
||||
if snapshot:
|
||||
created_snapshots.append(snapshot)
|
||||
if not is_tty:
|
||||
write_record(snapshot.to_jsonl())
|
||||
write_record(snapshot.to_json())
|
||||
|
||||
except Exception as e:
|
||||
rprint(f'[red]Error creating snapshot: {e}[/red]', file=sys.stderr)
|
||||
@@ -174,93 +154,237 @@ def create_snapshots(
|
||||
|
||||
rprint(f'[green]Created {len(created_snapshots)} snapshots[/green]', file=sys.stderr)
|
||||
|
||||
# If TTY, show human-readable output
|
||||
if is_tty:
|
||||
for snapshot in created_snapshots:
|
||||
rprint(f' [dim]{snapshot.id}[/dim] {snapshot.url[:60]}', file=sys.stderr)
|
||||
|
||||
# If --plugins is passed, create ArchiveResults and run the orchestrator
|
||||
if plugins:
|
||||
from archivebox.core.models import ArchiveResult
|
||||
from archivebox.workers.orchestrator import Orchestrator
|
||||
|
||||
# Parse comma-separated plugins list
|
||||
plugins_list = [p.strip() for p in plugins.split(',') if p.strip()]
|
||||
|
||||
# Create ArchiveResults for the specific plugins on each snapshot
|
||||
for snapshot in created_snapshots:
|
||||
for plugin_name in plugins_list:
|
||||
result, created = ArchiveResult.objects.get_or_create(
|
||||
snapshot=snapshot,
|
||||
plugin=plugin_name,
|
||||
defaults={
|
||||
'status': ArchiveResult.StatusChoices.QUEUED,
|
||||
'retry_at': timezone.now(),
|
||||
}
|
||||
)
|
||||
if not created and result.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]:
|
||||
# Reset for retry
|
||||
result.status = ArchiveResult.StatusChoices.QUEUED
|
||||
result.retry_at = timezone.now()
|
||||
result.save()
|
||||
|
||||
rprint(f'[blue]Running plugins: {plugins}...[/blue]', file=sys.stderr)
|
||||
orchestrator = Orchestrator(exit_on_idle=True)
|
||||
orchestrator.runloop()
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
def is_snapshot_id(value: str) -> bool:
|
||||
"""Check if value looks like a Snapshot UUID."""
|
||||
import re
|
||||
uuid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.I)
|
||||
if not uuid_pattern.match(value):
|
||||
return False
|
||||
# Verify it's actually a Snapshot (not a Crawl or other object)
|
||||
# =============================================================================
|
||||
# LIST
|
||||
# =============================================================================
|
||||
|
||||
def list_snapshots(
|
||||
status: Optional[str] = None,
|
||||
url__icontains: Optional[str] = None,
|
||||
url__istartswith: Optional[str] = None,
|
||||
tag: Optional[str] = None,
|
||||
crawl_id: Optional[str] = None,
|
||||
limit: Optional[int] = None,
|
||||
) -> int:
|
||||
"""
|
||||
List Snapshots as JSONL with optional filters.
|
||||
|
||||
Exit codes:
|
||||
0: Success (even if no results)
|
||||
"""
|
||||
from archivebox.misc.jsonl import write_record
|
||||
from archivebox.core.models import Snapshot
|
||||
return Snapshot.objects.filter(id=value).exists()
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
queryset = Snapshot.objects.all().order_by('-created_at')
|
||||
|
||||
# Apply filters
|
||||
filter_kwargs = {
|
||||
'status': status,
|
||||
'url__icontains': url__icontains,
|
||||
'url__istartswith': url__istartswith,
|
||||
'crawl_id': crawl_id,
|
||||
}
|
||||
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
|
||||
|
||||
# Tag filter requires special handling (M2M)
|
||||
if tag:
|
||||
queryset = queryset.filter(tags__name__iexact=tag)
|
||||
|
||||
count = 0
|
||||
for snapshot in queryset:
|
||||
if is_tty:
|
||||
status_color = {
|
||||
'queued': 'yellow',
|
||||
'started': 'blue',
|
||||
'sealed': 'green',
|
||||
}.get(snapshot.status, 'dim')
|
||||
rprint(f'[{status_color}]{snapshot.status:8}[/{status_color}] [dim]{snapshot.id}[/dim] {snapshot.url[:60]}')
|
||||
else:
|
||||
write_record(snapshot.to_json())
|
||||
count += 1
|
||||
|
||||
rprint(f'[dim]Listed {count} snapshots[/dim]', file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--tag', '-t', default='', help='Comma-separated tags to add to each snapshot')
|
||||
@click.option('--plugins', '-p', default='', help='Comma-separated list of plugins to run after creating snapshots (e.g., screenshot,singlefile)')
|
||||
@click.argument('args', nargs=-1)
|
||||
def main(tag: str, plugins: str, args: tuple):
|
||||
"""Create Snapshots from URLs/Crawls, or process existing Snapshots by ID"""
|
||||
from archivebox.misc.jsonl import read_args_or_stdin
|
||||
# =============================================================================
|
||||
# UPDATE
|
||||
# =============================================================================
|
||||
|
||||
# Read all input
|
||||
records = list(read_args_or_stdin(args))
|
||||
def update_snapshots(
|
||||
status: Optional[str] = None,
|
||||
tag: Optional[str] = None,
|
||||
) -> int:
|
||||
"""
|
||||
Update Snapshots from stdin JSONL.
|
||||
|
||||
Reads Snapshot records from stdin and applies updates.
|
||||
Uses PATCH semantics - only specified fields are updated.
|
||||
|
||||
Exit codes:
|
||||
0: Success
|
||||
1: No input or error
|
||||
"""
|
||||
from django.utils import timezone
|
||||
|
||||
from archivebox.misc.jsonl import read_stdin, write_record
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
from rich import print as rprint
|
||||
rprint('[yellow]No URLs, Crawl IDs, or Snapshot IDs provided. Pass as arguments or via stdin.[/yellow]', file=sys.stderr)
|
||||
sys.exit(1)
|
||||
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Check if input looks like existing Snapshot IDs to process
|
||||
# If ALL inputs are UUIDs with no URL and exist as Snapshots, process them
|
||||
all_are_snapshot_ids = all(
|
||||
is_snapshot_id(r.get('id') or r.get('url', ''))
|
||||
for r in records
|
||||
if r.get('type') != 'Crawl' # Don't check Crawl records as Snapshot IDs
|
||||
)
|
||||
updated_count = 0
|
||||
for record in records:
|
||||
snapshot_id = record.get('id')
|
||||
if not snapshot_id:
|
||||
continue
|
||||
|
||||
# But also check that we're not receiving Crawl JSONL
|
||||
has_crawl_records = any(r.get('type') == 'Crawl' for r in records)
|
||||
try:
|
||||
snapshot = Snapshot.objects.get(id=snapshot_id)
|
||||
|
||||
if all_are_snapshot_ids and not has_crawl_records:
|
||||
# Process existing Snapshots by ID
|
||||
exit_code = 0
|
||||
for record in records:
|
||||
snapshot_id = record.get('id') or record.get('url')
|
||||
result = process_snapshot_by_id(snapshot_id)
|
||||
if result != 0:
|
||||
exit_code = result
|
||||
sys.exit(exit_code)
|
||||
else:
|
||||
# Create new Snapshots from URLs or Crawls
|
||||
sys.exit(create_snapshots(args, tag=tag, plugins=plugins))
|
||||
# Apply updates from CLI flags (override stdin values)
|
||||
if status:
|
||||
snapshot.status = status
|
||||
snapshot.retry_at = timezone.now()
|
||||
if tag:
|
||||
# Add tag to existing tags
|
||||
snapshot.save() # Ensure saved before M2M
|
||||
from archivebox.core.models import Tag
|
||||
tag_obj, _ = Tag.objects.get_or_create(name=tag)
|
||||
snapshot.tags.add(tag_obj)
|
||||
|
||||
snapshot.save()
|
||||
updated_count += 1
|
||||
|
||||
if not is_tty:
|
||||
write_record(snapshot.to_json())
|
||||
|
||||
except Snapshot.DoesNotExist:
|
||||
rprint(f'[yellow]Snapshot not found: {snapshot_id}[/yellow]', file=sys.stderr)
|
||||
continue
|
||||
|
||||
rprint(f'[green]Updated {updated_count} snapshots[/green]', file=sys.stderr)
|
||||
return 0
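The same PATCH-style update can be done directly against the ORM inside `archivebox shell`; a sketch of adding a tag to matching snapshots, using only the fields and relations shown above (the tag name and URL filter are illustrative):

from archivebox.core.models import Snapshot, Tag

tag, _ = Tag.objects.get_or_create(name='news')
for snapshot in Snapshot.objects.filter(url__icontains='example.com'):
    snapshot.tags.add(tag)  # M2M add; no other fields are touched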
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# DELETE
|
||||
# =============================================================================
|
||||
|
||||
def delete_snapshots(yes: bool = False, dry_run: bool = False) -> int:
|
||||
"""
|
||||
Delete Snapshots from stdin JSONL.
|
||||
|
||||
Requires --yes flag to confirm deletion.
|
||||
|
||||
Exit codes:
|
||||
0: Success
|
||||
1: No input or missing --yes flag
|
||||
"""
|
||||
from archivebox.misc.jsonl import read_stdin
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
snapshot_ids = [r.get('id') for r in records if r.get('id')]
|
||||
|
||||
if not snapshot_ids:
|
||||
rprint('[yellow]No valid snapshot IDs in input[/yellow]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
snapshots = Snapshot.objects.filter(id__in=snapshot_ids)
|
||||
count = snapshots.count()
|
||||
|
||||
if count == 0:
|
||||
rprint('[yellow]No matching snapshots found[/yellow]', file=sys.stderr)
|
||||
return 0
|
||||
|
||||
if dry_run:
|
||||
rprint(f'[yellow]Would delete {count} snapshots (dry run)[/yellow]', file=sys.stderr)
|
||||
for snapshot in snapshots:
|
||||
rprint(f' [dim]{snapshot.id}[/dim] {snapshot.url[:60]}', file=sys.stderr)
|
||||
return 0
|
||||
|
||||
if not yes:
|
||||
rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Perform deletion
|
||||
deleted_count, _ = snapshots.delete()
|
||||
rprint(f'[green]Deleted {deleted_count} snapshots[/green]', file=sys.stderr)
|
||||
return 0
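A hedged sketch of the intended safety flow: capture the selection once, preview it with --dry-run, then feed the same records back with --yes (assumes archivebox is on PATH inside a data folder):

import subprocess

selection = subprocess.run(
    ['archivebox', 'snapshot', 'list', '--url__icontains=spam.com'],
    capture_output=True, text=True, check=True,
).stdout
# Preview what would be deleted.
subprocess.run(['archivebox', 'snapshot', 'delete', '--dry-run'],
               input=selection, text=True, check=True)
# Confirm for real.
subprocess.run(['archivebox', 'snapshot', 'delete', '--yes'],
               input=selection, text=True, check=True)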
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# CLI Commands
|
||||
# =============================================================================
|
||||
|
||||
@click.group()
|
||||
def main():
|
||||
"""Manage Snapshot records."""
|
||||
pass
|
||||
|
||||
|
||||
@main.command('create')
|
||||
@click.argument('urls', nargs=-1)
|
||||
@click.option('--tag', '-t', default='', help='Comma-separated tags to add')
|
||||
@click.option('--status', '-s', default='queued', help='Initial status (default: queued)')
|
||||
@click.option('--depth', '-d', type=int, default=0, help='Crawl depth (default: 0)')
|
||||
def create_cmd(urls: tuple, tag: str, status: str, depth: int):
|
||||
"""Create Snapshots from URLs or stdin JSONL."""
|
||||
sys.exit(create_snapshots(urls, tag=tag, status=status, depth=depth))
|
||||
|
||||
|
||||
@main.command('list')
|
||||
@click.option('--status', '-s', help='Filter by status (queued, started, sealed)')
|
||||
@click.option('--url__icontains', help='Filter by URL contains')
|
||||
@click.option('--url__istartswith', help='Filter by URL starts with')
|
||||
@click.option('--tag', '-t', help='Filter by tag name')
|
||||
@click.option('--crawl-id', help='Filter by crawl ID')
|
||||
@click.option('--limit', '-n', type=int, help='Limit number of results')
|
||||
def list_cmd(status: Optional[str], url__icontains: Optional[str], url__istartswith: Optional[str],
|
||||
tag: Optional[str], crawl_id: Optional[str], limit: Optional[int]):
|
||||
"""List Snapshots as JSONL."""
|
||||
sys.exit(list_snapshots(
|
||||
status=status,
|
||||
url__icontains=url__icontains,
|
||||
url__istartswith=url__istartswith,
|
||||
tag=tag,
|
||||
crawl_id=crawl_id,
|
||||
limit=limit,
|
||||
))
|
||||
|
||||
|
||||
@main.command('update')
|
||||
@click.option('--status', '-s', help='Set status')
|
||||
@click.option('--tag', '-t', help='Add tag')
|
||||
def update_cmd(status: Optional[str], tag: Optional[str]):
|
||||
"""Update Snapshots from stdin JSONL."""
|
||||
sys.exit(update_snapshots(status=status, tag=tag))
|
||||
|
||||
|
||||
@main.command('delete')
|
||||
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
|
||||
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
|
||||
def delete_cmd(yes: bool, dry_run: bool):
|
||||
"""Delete Snapshots from stdin JSONL."""
|
||||
sys.exit(delete_snapshots(yes=yes, dry_run=dry_run))
|
||||
|
||||
|
||||
if __name__ == '__main__':
    main()
307
archivebox/cli/archivebox_tag.py
Normal file
@@ -0,0 +1,307 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
"""
|
||||
archivebox tag <action> [args...] [--filters]
|
||||
|
||||
Manage Tag records.
|
||||
|
||||
Actions:
|
||||
create - Create Tags
|
||||
list - List Tags as JSONL (with optional filters)
|
||||
update - Update Tags from stdin JSONL
|
||||
delete - Delete Tags from stdin JSONL
|
||||
|
||||
Examples:
|
||||
# Create
|
||||
archivebox tag create news tech science
|
||||
archivebox tag create "important stuff"
|
||||
|
||||
# List
|
||||
archivebox tag list
|
||||
archivebox tag list --name__icontains=news
|
||||
|
||||
# Update (rename tags)
|
||||
archivebox tag list --name=oldname | archivebox tag update --name=newname
|
||||
|
||||
# Delete
|
||||
archivebox tag list --name=unused | archivebox tag delete --yes
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox tag'
|
||||
|
||||
import sys
|
||||
from typing import Optional, Iterable
|
||||
|
||||
import rich_click as click
|
||||
from rich import print as rprint
|
||||
|
||||
|
||||
def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
|
||||
"""Apply Django-style filters from CLI kwargs to a QuerySet."""
|
||||
filters = {}
|
||||
for key, value in filter_kwargs.items():
|
||||
if value is not None and key not in ('limit', 'offset'):
|
||||
filters[key] = value
|
||||
|
||||
if filters:
|
||||
queryset = queryset.filter(**filters)
|
||||
|
||||
if limit:
|
||||
queryset = queryset[:limit]
|
||||
|
||||
return queryset
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# CREATE
|
||||
# =============================================================================
|
||||
|
||||
def create_tags(names: Iterable[str]) -> int:
|
||||
"""
|
||||
Create Tags from names.
|
||||
|
||||
Exit codes:
|
||||
0: Success
|
||||
1: Failure
|
||||
"""
|
||||
from archivebox.misc.jsonl import write_record
|
||||
from archivebox.core.models import Tag
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
# Convert to list if needed
|
||||
name_list = list(names) if names else []
|
||||
|
||||
if not name_list:
|
||||
rprint('[yellow]No tag names provided. Pass names as arguments.[/yellow]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
created_count = 0
|
||||
for name in name_list:
|
||||
name = name.strip()
|
||||
if not name:
|
||||
continue
|
||||
|
||||
tag, created = Tag.objects.get_or_create(name=name)
|
||||
|
||||
if not is_tty:
|
||||
write_record(tag.to_json())
|
||||
|
||||
if created:
|
||||
created_count += 1
|
||||
rprint(f'[green]Created tag: {name}[/green]', file=sys.stderr)
|
||||
else:
|
||||
rprint(f'[dim]Tag already exists: {name}[/dim]', file=sys.stderr)
|
||||
|
||||
rprint(f'[green]Created {created_count} new tags[/green]', file=sys.stderr)
|
||||
return 0
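get_or_create makes tag creation idempotent, so re-running the command with the same names is harmless; the equivalent inside `archivebox shell` (tag name is illustrative):

from archivebox.core.models import Tag

tag, created = Tag.objects.get_or_create(name='news')   # created == True the first time
tag, created = Tag.objects.get_or_create(name='news')   # created == False afterwards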
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# LIST
|
||||
# =============================================================================
|
||||
|
||||
def list_tags(
|
||||
name: Optional[str] = None,
|
||||
name__icontains: Optional[str] = None,
|
||||
limit: Optional[int] = None,
|
||||
) -> int:
|
||||
"""
|
||||
List Tags as JSONL with optional filters.
|
||||
|
||||
Exit codes:
|
||||
0: Success (even if no results)
|
||||
"""
|
||||
from archivebox.misc.jsonl import write_record
|
||||
from archivebox.core.models import Tag
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
queryset = Tag.objects.all().order_by('name')
|
||||
|
||||
# Apply filters
|
||||
filter_kwargs = {
|
||||
'name': name,
|
||||
'name__icontains': name__icontains,
|
||||
}
|
||||
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
|
||||
|
||||
count = 0
|
||||
for tag in queryset:
|
||||
snapshot_count = tag.snapshot_set.count()
|
||||
if is_tty:
|
||||
rprint(f'[cyan]{tag.name:30}[/cyan] [dim]({snapshot_count} snapshots)[/dim]')
|
||||
else:
|
||||
write_record(tag.to_json())
|
||||
count += 1
|
||||
|
||||
rprint(f'[dim]Listed {count} tags[/dim]', file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# UPDATE
|
||||
# =============================================================================
|
||||
|
||||
def update_tags(name: Optional[str] = None) -> int:
|
||||
"""
|
||||
Update Tags from stdin JSONL.
|
||||
|
||||
Reads Tag records from stdin and applies updates.
|
||||
Uses PATCH semantics - only specified fields are updated.
|
||||
|
||||
Exit codes:
|
||||
0: Success
|
||||
1: No input or error
|
||||
"""
|
||||
from archivebox.misc.jsonl import read_stdin, write_record
|
||||
from archivebox.core.models import Tag
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
updated_count = 0
|
||||
for record in records:
|
||||
tag_id = record.get('id')
|
||||
old_name = record.get('name')
|
||||
|
||||
if not tag_id and not old_name:
|
||||
continue
|
||||
|
||||
try:
|
||||
if tag_id:
|
||||
tag = Tag.objects.get(id=tag_id)
|
||||
else:
|
||||
tag = Tag.objects.get(name=old_name)
|
||||
|
||||
# Apply updates from CLI flags
|
||||
if name:
|
||||
tag.name = name
|
||||
tag.save()
|
||||
|
||||
updated_count += 1
|
||||
|
||||
if not is_tty:
|
||||
write_record(tag.to_json())
|
||||
|
||||
except Tag.DoesNotExist:
|
||||
rprint(f'[yellow]Tag not found: {tag_id or old_name}[/yellow]', file=sys.stderr)
|
||||
continue
|
||||
|
||||
rprint(f'[green]Updated {updated_count} tags[/green]', file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# DELETE
|
||||
# =============================================================================
|
||||
|
||||
def delete_tags(yes: bool = False, dry_run: bool = False) -> int:
|
||||
"""
|
||||
Delete Tags from stdin JSONL.
|
||||
|
||||
Requires --yes flag to confirm deletion.
|
||||
|
||||
Exit codes:
|
||||
0: Success
|
||||
1: No input or missing --yes flag
|
||||
"""
|
||||
from archivebox.misc.jsonl import read_stdin
|
||||
from archivebox.core.models import Tag
|
||||
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Collect tag IDs or names
|
||||
tag_ids = []
|
||||
tag_names = []
|
||||
for r in records:
|
||||
if r.get('id'):
|
||||
tag_ids.append(r['id'])
|
||||
elif r.get('name'):
|
||||
tag_names.append(r['name'])
|
||||
|
||||
if not tag_ids and not tag_names:
|
||||
rprint('[yellow]No valid tag IDs or names in input[/yellow]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
from django.db.models import Q
|
||||
query = Q()
|
||||
if tag_ids:
|
||||
query |= Q(id__in=tag_ids)
|
||||
if tag_names:
|
||||
query |= Q(name__in=tag_names)
|
||||
|
||||
tags = Tag.objects.filter(query)
|
||||
count = tags.count()
|
||||
|
||||
if count == 0:
|
||||
rprint('[yellow]No matching tags found[/yellow]', file=sys.stderr)
|
||||
return 0
|
||||
|
||||
if dry_run:
|
||||
rprint(f'[yellow]Would delete {count} tags (dry run)[/yellow]', file=sys.stderr)
|
||||
for tag in tags:
|
||||
rprint(f' {tag.name}', file=sys.stderr)
|
||||
return 0
|
||||
|
||||
if not yes:
|
||||
rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Perform deletion
|
||||
deleted_count, _ = tags.delete()
|
||||
rprint(f'[green]Deleted {deleted_count} tags[/green]', file=sys.stderr)
|
||||
return 0
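The deletion above matches by id or by name in a single query by OR-ing two Q objects; a sketch of that composition, meant for `archivebox shell` where Django is configured (the ids and names are illustrative):

from django.db.models import Q

from archivebox.core.models import Tag

query = Q(id__in=[1, 2]) | Q(name__in=['unused', 'spam'])
print(Tag.objects.filter(query).count())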
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# CLI Commands
|
||||
# =============================================================================
|
||||
|
||||
@click.group()
|
||||
def main():
|
||||
"""Manage Tag records."""
|
||||
pass
|
||||
|
||||
|
||||
@main.command('create')
|
||||
@click.argument('names', nargs=-1)
|
||||
def create_cmd(names: tuple):
|
||||
"""Create Tags from names."""
|
||||
sys.exit(create_tags(names))
|
||||
|
||||
|
||||
@main.command('list')
|
||||
@click.option('--name', help='Filter by exact name')
|
||||
@click.option('--name__icontains', help='Filter by name contains')
|
||||
@click.option('--limit', '-n', type=int, help='Limit number of results')
|
||||
def list_cmd(name: Optional[str], name__icontains: Optional[str], limit: Optional[int]):
|
||||
"""List Tags as JSONL."""
|
||||
sys.exit(list_tags(name=name, name__icontains=name__icontains, limit=limit))
|
||||
|
||||
|
||||
@main.command('update')
|
||||
@click.option('--name', '-n', help='Set new name')
|
||||
def update_cmd(name: Optional[str]):
|
||||
"""Update Tags from stdin JSONL."""
|
||||
sys.exit(update_tags(name=name))
|
||||
|
||||
|
||||
@main.command('delete')
|
||||
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
|
||||
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
|
||||
def delete_cmd(yes: bool, dry_run: bool):
|
||||
"""Delete Tags from stdin JSONL."""
|
||||
sys.exit(delete_tags(yes=yes, dry_run=dry_run))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -1,17 +1,18 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Tests for CLI piping workflow: crawl | snapshot | extract
|
||||
Tests for CLI piping workflow: crawl | snapshot | archiveresult | run
|
||||
|
||||
This module tests the JSONL-based piping between CLI commands as described in:
|
||||
https://github.com/ArchiveBox/ArchiveBox/issues/1363
|
||||
|
||||
Workflows tested:
|
||||
archivebox crawl URL -> Crawl JSONL
|
||||
archivebox snapshot -> Snapshot JSONL (accepts Crawl or URL input)
|
||||
archivebox extract -> ArchiveResult JSONL (accepts Snapshot input)
|
||||
archivebox crawl create URL -> Crawl JSONL
|
||||
archivebox snapshot create -> Snapshot JSONL (accepts Crawl or URL input)
|
||||
archivebox archiveresult create -> ArchiveResult JSONL (accepts Snapshot input)
|
||||
archivebox run -> Process queued records (accepts any JSONL)
|
||||
|
||||
Pipeline:
|
||||
archivebox crawl URL | archivebox snapshot | archivebox extract
|
||||
archivebox crawl create URL | archivebox snapshot create | archivebox archiveresult create | archivebox run
|
||||
|
||||
Each command should:
|
||||
- Accept URLs, IDs, or JSONL as input (args or stdin)
|
||||
@@ -154,13 +155,13 @@ class TestJSONLParsing(unittest.TestCase):
|
||||
class TestJSONLOutput(unittest.TestCase):
|
||||
"""Test JSONL output formatting."""
|
||||
|
||||
def test_crawl_to_jsonl(self):
|
||||
"""Crawl model should serialize to JSONL correctly."""
|
||||
def test_crawl_to_json(self):
|
||||
"""Crawl model should serialize to JSON correctly."""
|
||||
from archivebox.misc.jsonl import TYPE_CRAWL
|
||||
|
||||
# Create a mock crawl with to_jsonl method configured
|
||||
# Create a mock crawl with to_json method configured
|
||||
mock_crawl = MagicMock()
|
||||
mock_crawl.to_jsonl.return_value = {
|
||||
mock_crawl.to_json.return_value = {
|
||||
'type': TYPE_CRAWL,
|
||||
'schema_version': '0.9.0',
|
||||
'id': 'test-crawl-uuid',
|
||||
@@ -172,7 +173,7 @@ class TestJSONLOutput(unittest.TestCase):
|
||||
'created_at': None,
|
||||
}
|
||||
|
||||
result = mock_crawl.to_jsonl()
|
||||
result = mock_crawl.to_json()
|
||||
self.assertEqual(result['type'], TYPE_CRAWL)
|
||||
self.assertEqual(result['id'], 'test-crawl-uuid')
|
||||
self.assertEqual(result['urls'], 'https://example.com')
|
||||
@@ -351,8 +352,8 @@ class TestSnapshotCommand(unittest.TestCase):
|
||||
# using real Snapshot instances.
|
||||
|
||||
|
||||
class TestExtractCommand(unittest.TestCase):
|
||||
"""Unit tests for archivebox extract command."""
|
||||
class TestArchiveResultCommand(unittest.TestCase):
|
||||
"""Unit tests for archivebox archiveresult command."""
|
||||
|
||||
def setUp(self):
|
||||
"""Set up test environment."""
|
||||
@@ -363,8 +364,8 @@ class TestExtractCommand(unittest.TestCase):
|
||||
"""Clean up test environment."""
|
||||
shutil.rmtree(self.test_dir, ignore_errors=True)
|
||||
|
||||
def test_extract_accepts_snapshot_id(self):
|
||||
"""extract should accept snapshot IDs as input."""
|
||||
def test_archiveresult_accepts_snapshot_id(self):
|
||||
"""archiveresult should accept snapshot IDs as input."""
|
||||
from archivebox.misc.jsonl import read_args_or_stdin
|
||||
|
||||
uuid = '01234567-89ab-cdef-0123-456789abcdef'
|
||||
@@ -374,8 +375,8 @@ class TestExtractCommand(unittest.TestCase):
|
||||
self.assertEqual(len(records), 1)
|
||||
self.assertEqual(records[0]['id'], uuid)
|
||||
|
||||
def test_extract_accepts_jsonl_snapshot(self):
|
||||
"""extract should accept JSONL Snapshot records."""
|
||||
def test_archiveresult_accepts_jsonl_snapshot(self):
|
||||
"""archiveresult should accept JSONL Snapshot records."""
|
||||
from archivebox.misc.jsonl import read_args_or_stdin, TYPE_SNAPSHOT
|
||||
|
||||
stdin = StringIO('{"type": "Snapshot", "id": "abc123", "url": "https://example.com"}\n')
|
||||
@@ -387,8 +388,8 @@ class TestExtractCommand(unittest.TestCase):
|
||||
self.assertEqual(records[0]['type'], TYPE_SNAPSHOT)
|
||||
self.assertEqual(records[0]['id'], 'abc123')
|
||||
|
||||
def test_extract_gathers_snapshot_ids(self):
|
||||
"""extract should gather snapshot IDs from various input formats."""
|
||||
def test_archiveresult_gathers_snapshot_ids(self):
|
||||
"""archiveresult should gather snapshot IDs from various input formats."""
|
||||
from archivebox.misc.jsonl import TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
|
||||
|
||||
records = [
|
||||
@@ -529,7 +530,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
|
||||
|
||||
# Create crawl with multiple URLs (as newline-separated string)
|
||||
urls = 'https://test-crawl-1.example.com\nhttps://test-crawl-2.example.com'
|
||||
crawl = Crawl.from_jsonl({'urls': urls}, overrides={'created_by_id': created_by_id})
|
||||
crawl = Crawl.from_json({'urls': urls}, overrides={'created_by_id': created_by_id})
|
||||
|
||||
self.assertIsNotNone(crawl)
|
||||
self.assertIsNotNone(crawl.id)
|
||||
@@ -543,7 +544,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
|
||||
self.assertIn('https://test-crawl-2.example.com', urls_list)
|
||||
|
||||
# Verify output format
|
||||
output = crawl.to_jsonl()
|
||||
output = crawl.to_json()
|
||||
self.assertEqual(output['type'], TYPE_CRAWL)
|
||||
self.assertIn('id', output)
|
||||
self.assertEqual(output['urls'], urls)
|
||||
@@ -566,8 +567,8 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
|
||||
|
||||
# Step 1: Create crawl (simulating 'archivebox crawl')
|
||||
urls = 'https://crawl-to-snap-1.example.com\nhttps://crawl-to-snap-2.example.com'
|
||||
crawl = Crawl.from_jsonl({'urls': urls}, overrides={'created_by_id': created_by_id})
|
||||
crawl_output = crawl.to_jsonl()
|
||||
crawl = Crawl.from_json({'urls': urls}, overrides={'created_by_id': created_by_id})
|
||||
crawl_output = crawl.to_json()
|
||||
|
||||
# Step 2: Parse crawl output as snapshot input
|
||||
stdin = StringIO(json.dumps(crawl_output) + '\n')
|
||||
@@ -581,7 +582,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
|
||||
# Step 3: Create snapshots from crawl URLs
|
||||
created_snapshots = []
|
||||
for url in crawl.get_urls_list():
|
||||
snapshot = Snapshot.from_jsonl({'url': url}, overrides={'created_by_id': created_by_id})
|
||||
snapshot = Snapshot.from_json({'url': url}, overrides={'created_by_id': created_by_id})
|
||||
if snapshot:
|
||||
created_snapshots.append(snapshot)
|
||||
|
||||
@@ -589,7 +590,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
|
||||
|
||||
# Verify snapshot output
|
||||
for snapshot in created_snapshots:
|
||||
output = snapshot.to_jsonl()
|
||||
output = snapshot.to_json()
|
||||
self.assertEqual(output['type'], TYPE_SNAPSHOT)
|
||||
self.assertIn(output['url'], [
|
||||
'https://crawl-to-snap-1.example.com',
|
||||
@@ -619,13 +620,13 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
|
||||
|
||||
# Create snapshot
|
||||
overrides = {'created_by_id': created_by_id}
|
||||
snapshot = Snapshot.from_jsonl(records[0], overrides=overrides)
|
||||
snapshot = Snapshot.from_json(records[0], overrides=overrides)
|
||||
|
||||
self.assertIsNotNone(snapshot.id)
|
||||
self.assertEqual(snapshot.url, url)
|
||||
|
||||
# Verify output format
|
||||
output = snapshot.to_jsonl()
|
||||
output = snapshot.to_json()
|
||||
self.assertEqual(output['type'], TYPE_SNAPSHOT)
|
||||
self.assertIn('id', output)
|
||||
self.assertEqual(output['url'], url)
|
||||
@@ -647,8 +648,8 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
|
||||
# Step 1: Create snapshot (simulating 'archivebox snapshot')
|
||||
url = 'https://test-extract-1.example.com'
|
||||
overrides = {'created_by_id': created_by_id}
|
||||
snapshot = Snapshot.from_jsonl({'url': url}, overrides=overrides)
|
||||
snapshot_output = snapshot.to_jsonl()
|
||||
snapshot = Snapshot.from_json({'url': url}, overrides=overrides)
|
||||
snapshot_output = snapshot.to_json()
|
||||
|
||||
# Step 2: Parse snapshot output as extract input
|
||||
stdin = StringIO(json.dumps(snapshot_output) + '\n')
|
||||
@@ -686,8 +687,8 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
|
||||
|
||||
# === archivebox crawl https://example.com ===
|
||||
url = 'https://test-pipeline-full.example.com'
|
||||
crawl = Crawl.from_jsonl({'url': url}, overrides={'created_by_id': created_by_id})
|
||||
crawl_jsonl = json.dumps(crawl.to_jsonl())
|
||||
crawl = Crawl.from_json({'url': url}, overrides={'created_by_id': created_by_id})
|
||||
crawl_jsonl = json.dumps(crawl.to_json())
|
||||
|
||||
# === | archivebox snapshot ===
|
||||
stdin = StringIO(crawl_jsonl + '\n')
|
||||
@@ -705,7 +706,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
|
||||
if crawl_id:
|
||||
db_crawl = Crawl.objects.get(id=crawl_id)
|
||||
for crawl_url in db_crawl.get_urls_list():
|
||||
snapshot = Snapshot.from_jsonl({'url': crawl_url}, overrides={'created_by_id': created_by_id})
|
||||
snapshot = Snapshot.from_json({'url': crawl_url}, overrides={'created_by_id': created_by_id})
|
||||
if snapshot:
|
||||
created_snapshots.append(snapshot)
|
||||
|
||||
@@ -713,7 +714,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
|
||||
self.assertEqual(created_snapshots[0].url, url)
|
||||
|
||||
# === | archivebox extract ===
|
||||
snapshot_jsonl_lines = [json.dumps(s.to_jsonl()) for s in created_snapshots]
|
||||
snapshot_jsonl_lines = [json.dumps(s.to_json()) for s in created_snapshots]
|
||||
stdin = StringIO('\n'.join(snapshot_jsonl_lines) + '\n')
|
||||
stdin.isatty = lambda: False
|
||||
|
||||
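The full-pipeline test above exercises the same contract the CLI relies on: each stage prints one JSON record per line, and the next stage reads those lines from stdin and hands them to the matching model's from_json(). A minimal sketch of that hand-off; the import paths and created_by_id=1 are assumptions (created_by_id would normally come from get_or_create_system_user_pk()), and a configured Django/data-dir context is assumed:

import json
from io import StringIO

from archivebox.core.models import Snapshot
from archivebox.crawls.models import Crawl

# Upstream stage (e.g. `archivebox crawl`): serialize a Crawl record to one JSONL line.
crawl = Crawl.from_json({'urls': 'https://example.com'}, overrides={'created_by_id': 1})
line = json.dumps(crawl.to_json())

# Downstream stage (e.g. `archivebox snapshot`): parse the line, then snapshot each URL.
record = json.loads(StringIO(line + '\n').readline())
db_crawl = Crawl.objects.get(id=record['id'])
for url in db_crawl.get_urls_list():
    Snapshot.from_json({'url': url}, overrides={'created_by_id': 1})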
@@ -757,12 +758,12 @@ class TestDepthWorkflows(unittest.TestCase):
|
||||
|
||||
# Create crawl with depth 0
|
||||
url = 'https://depth0-test.example.com'
|
||||
crawl = Crawl.from_jsonl({'url': url, 'max_depth': 0}, overrides={'created_by_id': created_by_id})
|
||||
crawl = Crawl.from_json({'url': url, 'max_depth': 0}, overrides={'created_by_id': created_by_id})
|
||||
|
||||
self.assertEqual(crawl.max_depth, 0)
|
||||
|
||||
# Create snapshot
|
||||
snapshot = Snapshot.from_jsonl({'url': url}, overrides={'created_by_id': created_by_id})
|
||||
snapshot = Snapshot.from_json({'url': url}, overrides={'created_by_id': created_by_id})
|
||||
self.assertEqual(snapshot.url, url)
|
||||
|
||||
def test_depth_metadata_in_crawl(self):
|
||||
@@ -773,7 +774,7 @@ class TestDepthWorkflows(unittest.TestCase):
|
||||
created_by_id = get_or_create_system_user_pk()
|
||||
|
||||
# Create crawl with depth
|
||||
crawl = Crawl.from_jsonl(
|
||||
crawl = Crawl.from_json(
|
||||
{'url': 'https://depth-meta-test.example.com', 'max_depth': 2},
|
||||
overrides={'created_by_id': created_by_id}
|
||||
)
|
||||
@@ -781,7 +782,7 @@ class TestDepthWorkflows(unittest.TestCase):
|
||||
self.assertEqual(crawl.max_depth, 2)
|
||||
|
||||
# Verify in JSONL output
|
||||
output = crawl.to_jsonl()
|
||||
output = crawl.to_json()
|
||||
self.assertEqual(output['max_depth'], 2)
|
||||
|
||||
|
||||
|
||||
@@ -158,7 +158,7 @@ class AddLinkForm(forms.Form):
|
||||
'search_backend_ripgrep', 'search_backend_sonic', 'search_backend_sqlite'
|
||||
}
|
||||
binary = {'apt', 'brew', 'custom', 'env', 'npm', 'pip'}
|
||||
extensions = {'captcha2', 'istilldontcareaboutcookies', 'ublock'}
|
||||
extensions = {'twocaptcha', 'istilldontcareaboutcookies', 'ublock'}
|
||||
|
||||
# Populate plugin field choices
|
||||
self.fields['chrome_plugins'].choices = [
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
__package__ = 'archivebox.core'
|
||||
|
||||
from typing import Optional, Dict, Iterable, Any, List, TYPE_CHECKING
|
||||
from typing import Optional, Dict, Iterable, Any, List, TYPE_CHECKING, Iterator, Set
|
||||
from archivebox.uuid_compat import uuid7
|
||||
from datetime import datetime, timedelta
|
||||
from django_stubs_ext.db.models import TypedModelMeta
|
||||
@@ -41,6 +41,8 @@ from archivebox.machine.models import NetworkInterface, Binary
|
||||
|
||||
|
||||
class Tag(ModelWithSerializers):
|
||||
JSONL_TYPE = 'Tag'
|
||||
|
||||
# Keep AutoField for compatibility with main branch migrations
|
||||
# Don't use UUIDField here - requires complex FK transformation
|
||||
id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
|
||||
@@ -91,26 +93,66 @@ class Tag(ModelWithSerializers):
|
||||
def api_url(self) -> str:
|
||||
return reverse_lazy('api-1:get_tag', args=[self.id])
|
||||
|
||||
def to_jsonl(self) -> dict:
|
||||
def to_json(self) -> dict:
|
||||
"""
|
||||
Convert Tag model instance to a JSONL record.
|
||||
Convert Tag model instance to a JSON-serializable dict.
|
||||
"""
|
||||
from archivebox.config import VERSION
|
||||
return {
|
||||
'type': 'Tag',
|
||||
'type': self.JSONL_TYPE,
|
||||
'schema_version': VERSION,
|
||||
'id': str(self.id),
|
||||
'name': self.name,
|
||||
'slug': self.slug,
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None):
|
||||
def to_jsonl(self, seen: Set[tuple] = None, **kwargs) -> Iterator[dict]:
|
||||
"""
|
||||
Create/update Tag from JSONL record.
|
||||
Yield this Tag as a JSON record.
|
||||
|
||||
Args:
|
||||
record: JSONL record with 'name' field
|
||||
seen: Set of (type, id) tuples already emitted (for deduplication)
|
||||
**kwargs: Passed to children (none for Tag, leaf node)
|
||||
|
||||
Yields:
|
||||
dict: JSON-serializable record for this tag
|
||||
"""
|
||||
if seen is not None:
|
||||
key = (self.JSONL_TYPE, str(self.id))
|
||||
if key in seen:
|
||||
return
|
||||
seen.add(key)
|
||||
yield self.to_json()
|
||||
|
||||
@classmethod
|
||||
def from_jsonl(cls, records, overrides: Dict[str, Any] = None) -> list['Tag']:
|
||||
"""
|
||||
Create/update Tags from an iterable of JSONL records.
|
||||
Filters to only records with type='Tag'.
|
||||
|
||||
Args:
|
||||
records: Iterable of dicts (JSONL records)
|
||||
overrides: Optional dict with 'snapshot' to auto-attach tags
|
||||
|
||||
Returns:
|
||||
List of Tag instances (skips None results)
|
||||
"""
|
||||
results = []
|
||||
for record in records:
|
||||
record_type = record.get('type', cls.JSONL_TYPE)
|
||||
if record_type == cls.JSONL_TYPE:
|
||||
instance = cls.from_json(record, overrides=overrides)
|
||||
if instance:
|
||||
results.append(instance)
|
||||
return results
|
||||
|
||||
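Tag is the simplest instance of the new serializer pattern: to_json() builds the dict, to_jsonl() wraps it in a one-record generator, and the from_jsonl() classmethod above silently skips records of other types. A small illustrative sketch, assuming a working Django context:

from archivebox.core.models import Tag

mixed = [
    {'type': 'Tag', 'name': 'news'},
    {'type': 'Snapshot', 'url': 'https://example.com'},  # ignored by Tag.from_jsonl()
]
tags = Tag.from_jsonl(mixed)  # only the Tag record is turned into an instance
records = [rec for tag in tags for rec in tag.to_jsonl(seen=set())]  # round-trip back to dicts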
@staticmethod
|
||||
def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None) -> 'Tag | None':
|
||||
"""
|
||||
Create/update a single Tag from a JSON record dict.
|
||||
|
||||
Args:
|
||||
record: Dict with 'name' field
|
||||
overrides: Optional dict with 'snapshot' to auto-attach tag
|
||||
|
||||
Returns:
|
||||
@@ -289,6 +331,8 @@ class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)):
|
||||
|
||||
|
||||
class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
|
||||
JSONL_TYPE = 'Snapshot'
|
||||
|
||||
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
|
||||
created_at = models.DateTimeField(default=timezone.now, db_index=True)
|
||||
modified_at = models.DateTimeField(auto_now=True)
|
||||
@@ -968,38 +1012,18 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
|
||||
Each line is a JSON record with a 'type' field:
|
||||
- Snapshot: snapshot metadata (crawl_id, url, tags, etc.)
|
||||
- ArchiveResult: extractor results (plugin, status, output, etc.)
|
||||
- Binary: binary info used for the extraction
|
||||
- Process: process execution details (cmd, exit_code, timing, etc.)
|
||||
- ArchiveResult: extractor results (plugin, status, output, etc.)
|
||||
"""
|
||||
import json
|
||||
|
||||
index_path = Path(self.output_dir) / CONSTANTS.JSONL_INDEX_FILENAME
|
||||
index_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Track unique binaries and processes to avoid duplicates
|
||||
binaries_seen = set()
|
||||
processes_seen = set()
|
||||
|
||||
with open(index_path, 'w') as f:
|
||||
# Write Snapshot record first (to_jsonl includes crawl_id, fs_version)
|
||||
f.write(json.dumps(self.to_jsonl()) + '\n')
|
||||
|
||||
# Write ArchiveResult records with their associated Binary and Process
|
||||
# Use select_related to optimize queries
|
||||
for ar in self.archiveresult_set.select_related('process__binary').order_by('start_ts'):
|
||||
# Write Binary record if not already written
|
||||
if ar.process and ar.process.binary and ar.process.binary_id not in binaries_seen:
|
||||
binaries_seen.add(ar.process.binary_id)
|
||||
f.write(json.dumps(ar.process.binary.to_jsonl()) + '\n')
|
||||
|
||||
# Write Process record if not already written
|
||||
if ar.process and ar.process_id not in processes_seen:
|
||||
processes_seen.add(ar.process_id)
|
||||
f.write(json.dumps(ar.process.to_jsonl()) + '\n')
|
||||
|
||||
# Write ArchiveResult record
|
||||
f.write(json.dumps(ar.to_jsonl()) + '\n')
|
||||
for record in self.to_jsonl():
|
||||
f.write(json.dumps(record) + '\n')
|
||||
|
||||
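The rewritten write_index_jsonl() above leaves all Binary/Process bookkeeping to the to_jsonl() generator, so the file write collapses to one loop over records. A hedged sketch of reading such an index back; 'index.jsonl' here stands in for CONSTANTS.JSONL_INDEX_FILENAME:

import json
from pathlib import Path

def iter_index_records(index_path: Path):
    # Yield each JSON record from a snapshot's JSONL index, skipping blank lines.
    with open(index_path) as f:
        for line in f:
            line = line.strip()
            if line:
                yield json.loads(line)

# e.g. group records by their 'type' field (Snapshot, Binary, Process, ArchiveResult)
by_type = {}
for record in iter_index_records(Path('index.jsonl')):
    by_type.setdefault(record.get('type', 'unknown'), []).append(record)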
def read_index_jsonl(self) -> dict:
|
||||
"""
|
||||
@@ -1420,14 +1444,14 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
|
||||
return False
|
||||
|
||||
def to_jsonl(self) -> dict:
|
||||
def to_json(self) -> dict:
|
||||
"""
|
||||
Convert Snapshot model instance to a JSONL record.
|
||||
Convert Snapshot model instance to a JSON-serializable dict.
|
||||
Includes all fields needed to fully reconstruct/identify this snapshot.
|
||||
"""
|
||||
from archivebox.config import VERSION
|
||||
return {
|
||||
'type': 'Snapshot',
|
||||
'type': self.JSONL_TYPE,
|
||||
'schema_version': VERSION,
|
||||
'id': str(self.id),
|
||||
'crawl_id': str(self.crawl_id),
|
||||
@@ -1442,12 +1466,68 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
'fs_version': self.fs_version,
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None, queue_for_extraction: bool = True):
|
||||
def to_jsonl(self, seen: Set[tuple] = None, archiveresult: bool = True, process: bool = True, binary: bool = True, machine: bool = False, iface: bool = False, **kwargs) -> Iterator[dict]:
|
||||
"""
|
||||
Create/update Snapshot from JSONL record or dict.
|
||||
Yield this Snapshot and optionally related objects as JSON records.
|
||||
|
||||
Unified method that handles:
|
||||
Uses select_related for efficient querying. Deduplicates automatically.
|
||||
|
||||
Args:
|
||||
seen: Set of (type, id) tuples already emitted (for deduplication)
|
||||
archiveresult: Include related ArchiveResults (default: True)
|
||||
process: Include Process for each ArchiveResult (default: True)
|
||||
binary: Include Binary for each Process (default: True)
|
||||
machine: Include Machine for each Process (default: False)
|
||||
iface: Include NetworkInterface for each Process (default: False)
|
||||
**kwargs: Additional options passed to children
|
||||
|
||||
Yields:
|
||||
dict: JSON-serializable records
|
||||
"""
|
||||
if seen is None:
|
||||
seen = set()
|
||||
|
||||
key = (self.JSONL_TYPE, str(self.id))
|
||||
if key in seen:
|
||||
return
|
||||
seen.add(key)
|
||||
|
||||
yield self.to_json()
|
||||
|
||||
if archiveresult:
|
||||
# Use select_related to optimize queries
|
||||
for ar in self.archiveresult_set.select_related('process__binary').order_by('start_ts'):
|
||||
yield from ar.to_jsonl(seen=seen, process=process, binary=binary, machine=machine, iface=iface, **kwargs)
|
||||
|
||||
@classmethod
|
||||
def from_jsonl(cls, records, overrides: Dict[str, Any] = None, queue_for_extraction: bool = True) -> list['Snapshot']:
|
||||
"""
|
||||
Create/update Snapshots from an iterable of JSONL records.
|
||||
Filters to only records with type='Snapshot' (or no type).
|
||||
|
||||
Args:
|
||||
records: Iterable of dicts (JSONL records)
|
||||
overrides: Dict with 'crawl', 'snapshot' (parent), 'created_by_id'
|
||||
queue_for_extraction: If True, sets status=QUEUED and retry_at (default: True)
|
||||
|
||||
Returns:
|
||||
List of Snapshot instances (skips None results)
|
||||
"""
|
||||
results = []
|
||||
for record in records:
|
||||
record_type = record.get('type', cls.JSONL_TYPE)
|
||||
if record_type == cls.JSONL_TYPE:
|
||||
instance = cls.from_json(record, overrides=overrides, queue_for_extraction=queue_for_extraction)
|
||||
if instance:
|
||||
results.append(instance)
|
||||
return results
|
||||
|
||||
@staticmethod
|
||||
def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None, queue_for_extraction: bool = True) -> 'Snapshot | None':
|
||||
"""
|
||||
Create/update a single Snapshot from a JSON record dict.
|
||||
|
||||
Handles:
|
||||
- ID-based patching: {"id": "...", "title": "new title"}
|
||||
- URL-based create/update: {"url": "...", "title": "...", "tags": "..."}
|
||||
- Auto-creates Crawl if not provided
|
||||
@@ -2054,8 +2134,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
result['canonical'] = self.canonical_outputs()
|
||||
return result
|
||||
|
||||
def to_json(self, indent: int = 4) -> str:
|
||||
"""Convert to JSON string"""
|
||||
def to_json_str(self, indent: int = 4) -> str:
|
||||
"""Convert to JSON string for file output."""
|
||||
return to_json(self.to_dict(extended=True), indent=indent)
|
||||
|
||||
def to_csv(self, cols: Optional[List[str]] = None, separator: str = ',', ljust: int = 0) -> str:
|
||||
@@ -2203,6 +2283,8 @@ class SnapshotMachine(BaseStateMachine, strict_states=True):
|
||||
|
||||
|
||||
class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
|
||||
JSONL_TYPE = 'ArchiveResult'
|
||||
|
||||
class StatusChoices(models.TextChoices):
|
||||
QUEUED = 'queued', 'Queued'
|
||||
STARTED = 'started', 'Started'
|
||||
@@ -2274,13 +2356,13 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
"""Convenience property to access the user who created this archive result via its snapshot's crawl."""
|
||||
return self.snapshot.crawl.created_by
|
||||
|
||||
def to_jsonl(self) -> dict:
|
||||
def to_json(self) -> dict:
|
||||
"""
|
||||
Convert ArchiveResult model instance to a JSONL record.
|
||||
Convert ArchiveResult model instance to a JSON-serializable dict.
|
||||
"""
|
||||
from archivebox.config import VERSION
|
||||
record = {
|
||||
'type': 'ArchiveResult',
|
||||
'type': self.JSONL_TYPE,
|
||||
'schema_version': VERSION,
|
||||
'id': str(self.id),
|
||||
'snapshot_id': str(self.snapshot_id),
|
||||
@@ -2308,6 +2390,31 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
record['process_id'] = str(self.process_id)
|
||||
return record
|
||||
|
||||
def to_jsonl(self, seen: Set[tuple] = None, process: bool = True, **kwargs) -> Iterator[dict]:
|
||||
"""
|
||||
Yield this ArchiveResult and optionally related objects as JSON records.
|
||||
|
||||
Args:
|
||||
seen: Set of (type, id) tuples already emitted (for deduplication)
|
||||
process: Include related Process and its children (default: True)
|
||||
**kwargs: Passed to Process.to_jsonl() (e.g., binary=True, machine=False)
|
||||
|
||||
Yields:
|
||||
dict: JSON-serializable records
|
||||
"""
|
||||
if seen is None:
|
||||
seen = set()
|
||||
|
||||
key = (self.JSONL_TYPE, str(self.id))
|
||||
if key in seen:
|
||||
return
|
||||
seen.add(key)
|
||||
|
||||
yield self.to_json()
|
||||
|
||||
if process and self.process:
|
||||
yield from self.process.to_jsonl(seen=seen, **kwargs)
|
||||
|
||||
def save(self, *args, **kwargs):
|
||||
is_new = self._state.adding
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
__package__ = 'archivebox.crawls'
|
||||
|
||||
from typing import TYPE_CHECKING, Iterable
|
||||
from typing import TYPE_CHECKING, Iterable, Iterator, Set
|
||||
from datetime import timedelta
|
||||
from archivebox.uuid_compat import uuid7
|
||||
from pathlib import Path
|
||||
@@ -59,6 +59,8 @@ class CrawlSchedule(ModelWithSerializers, ModelWithNotes, ModelWithHealthStats):
|
||||
|
||||
|
||||
class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWithStateMachine):
|
||||
JSONL_TYPE = 'Crawl'
|
||||
|
||||
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
|
||||
created_at = models.DateTimeField(default=timezone.now, db_index=True)
|
||||
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False)
|
||||
@@ -134,13 +136,13 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
def api_url(self) -> str:
|
||||
return reverse_lazy('api-1:get_crawl', args=[self.id])
|
||||
|
||||
def to_jsonl(self) -> dict:
|
||||
def to_json(self) -> dict:
|
||||
"""
|
||||
Convert Crawl model instance to a JSONL record.
|
||||
Convert Crawl model instance to a JSON-serializable dict.
|
||||
"""
|
||||
from archivebox.config import VERSION
|
||||
return {
|
||||
'type': 'Crawl',
|
||||
'type': self.JSONL_TYPE,
|
||||
'schema_version': VERSION,
|
||||
'id': str(self.id),
|
||||
'urls': self.urls,
|
||||
@@ -151,10 +153,63 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
'created_at': self.created_at.isoformat() if self.created_at else None,
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def from_jsonl(record: dict, overrides: dict = None):
|
||||
def to_jsonl(self, seen: Set[tuple] = None, snapshot: bool = True, archiveresult: bool = True, process: bool = True, binary: bool = True, machine: bool = False, iface: bool = False, **kwargs) -> Iterator[dict]:
|
||||
"""
|
||||
Create or get a Crawl from a JSONL record.
|
||||
Yield this Crawl and optionally related objects as JSON records.
|
||||
|
||||
Args:
|
||||
seen: Set of (type, id) tuples already emitted (for deduplication)
|
||||
snapshot: Include related Snapshots (default: True)
|
||||
archiveresult: Include ArchiveResults for each Snapshot (default: True)
|
||||
process: Include Process for each ArchiveResult (default: True)
|
||||
binary: Include Binary for each Process (default: True)
|
||||
machine: Include Machine for each Process (default: False)
|
||||
iface: Include NetworkInterface for each Process (default: False)
|
||||
**kwargs: Additional options passed to children
|
||||
|
||||
Yields:
|
||||
dict: JSON-serializable records
|
||||
"""
|
||||
if seen is None:
|
||||
seen = set()
|
||||
|
||||
key = (self.JSONL_TYPE, str(self.id))
|
||||
if key in seen:
|
||||
return
|
||||
seen.add(key)
|
||||
|
||||
yield self.to_json()
|
||||
|
||||
if snapshot:
|
||||
for snap in self.snapshot_set.all():
|
||||
yield from snap.to_jsonl(seen=seen, archiveresult=archiveresult, process=process, binary=binary, machine=machine, iface=iface, **kwargs)
|
||||
|
||||
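Because Crawl.to_jsonl() recurses through Snapshots, ArchiveResults, Processes, and Binaries with a shared seen set, exporting a whole crawl is a single generator loop. A minimal sketch, assuming crawl is an existing instance (machine/iface are switched on only to show the flags):

import json

for record in crawl.to_jsonl(machine=True, iface=True):
    print(json.dumps(record))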
@classmethod
|
||||
def from_jsonl(cls, records, overrides: dict = None) -> list['Crawl']:
|
||||
"""
|
||||
Create/update Crawls from an iterable of JSONL records.
|
||||
Filters to only records with type='Crawl' (or no type).
|
||||
|
||||
Args:
|
||||
records: Iterable of dicts (JSONL records)
|
||||
overrides: Dict of field overrides (e.g., created_by_id)
|
||||
|
||||
Returns:
|
||||
List of Crawl instances (skips None results)
|
||||
"""
|
||||
results = []
|
||||
for record in records:
|
||||
record_type = record.get('type', cls.JSONL_TYPE)
|
||||
if record_type == cls.JSONL_TYPE:
|
||||
instance = cls.from_json(record, overrides=overrides)
|
||||
if instance:
|
||||
results.append(instance)
|
||||
return results
|
||||
|
||||
@staticmethod
|
||||
def from_json(record: dict, overrides: dict = None) -> 'Crawl | None':
|
||||
"""
|
||||
Create or get a single Crawl from a JSON record dict.
|
||||
|
||||
Args:
|
||||
record: Dict with 'urls' (required), optional 'max_depth', 'tags_str', 'label'
|
||||
|
||||
@@ -1176,7 +1176,9 @@ def create_model_record(record: Dict[str, Any]) -> Any:
|
||||
def process_hook_records(records: List[Dict[str, Any]], overrides: Dict[str, Any] = None) -> Dict[str, int]:
|
||||
"""
|
||||
Process JSONL records from hook output.
|
||||
Dispatches to Model.from_jsonl() for each record type.
|
||||
|
||||
Uses Model.from_jsonl() which automatically filters by JSONL_TYPE.
|
||||
Each model only processes records matching its type.
|
||||
|
||||
Args:
|
||||
records: List of JSONL record dicts from result['records']
|
||||
@@ -1185,54 +1187,26 @@ def process_hook_records(records: List[Dict[str, Any]], overrides: Dict[str, Any
|
||||
Returns:
|
||||
Dict with counts by record type
|
||||
"""
|
||||
stats = {}
|
||||
from archivebox.core.models import Snapshot, Tag
|
||||
from archivebox.machine.models import Binary, Machine
|
||||
|
||||
overrides = overrides or {}
|
||||
|
||||
for record in records:
|
||||
record_type = record.get('type')
|
||||
if not record_type:
|
||||
continue
|
||||
# Filter out ArchiveResult records (they update the calling AR, not create new ones)
|
||||
filtered_records = [r for r in records if r.get('type') != 'ArchiveResult']
|
||||
|
||||
# Skip ArchiveResult records (they update the calling ArchiveResult, not create new ones)
|
||||
if record_type == 'ArchiveResult':
|
||||
continue
|
||||
# Each model's from_jsonl() filters to only its own type
|
||||
snapshots = Snapshot.from_jsonl(filtered_records, overrides)
|
||||
tags = Tag.from_jsonl(filtered_records, overrides)
|
||||
binaries = Binary.from_jsonl(filtered_records, overrides)
|
||||
machines = Machine.from_jsonl(filtered_records, overrides)
|
||||
|
||||
try:
|
||||
# Dispatch to appropriate model's from_jsonl() method
|
||||
if record_type == 'Snapshot':
|
||||
from archivebox.core.models import Snapshot
|
||||
obj = Snapshot.from_jsonl(record.copy(), overrides)
|
||||
if obj:
|
||||
stats['Snapshot'] = stats.get('Snapshot', 0) + 1
|
||||
|
||||
elif record_type == 'Tag':
|
||||
from archivebox.core.models import Tag
|
||||
obj = Tag.from_jsonl(record.copy(), overrides)
|
||||
if obj:
|
||||
stats['Tag'] = stats.get('Tag', 0) + 1
|
||||
|
||||
elif record_type == 'Binary':
|
||||
from archivebox.machine.models import Binary
|
||||
obj = Binary.from_jsonl(record.copy(), overrides)
|
||||
if obj:
|
||||
stats['Binary'] = stats.get('Binary', 0) + 1
|
||||
|
||||
elif record_type == 'Machine':
|
||||
from archivebox.machine.models import Machine
|
||||
obj = Machine.from_jsonl(record.copy(), overrides)
|
||||
if obj:
|
||||
stats['Machine'] = stats.get('Machine', 0) + 1
|
||||
|
||||
else:
|
||||
import sys
|
||||
print(f"Warning: Unknown record type '{record_type}' from hook output", file=sys.stderr)
|
||||
|
||||
except Exception as e:
|
||||
import sys
|
||||
print(f"Warning: Failed to create {record_type}: {e}", file=sys.stderr)
|
||||
continue
|
||||
|
||||
return stats
|
||||
return {
|
||||
'Snapshot': len(snapshots),
|
||||
'Tag': len(tags),
|
||||
'Binary': len(binaries),
|
||||
'Machine': len(machines),
|
||||
}
|
||||
|
||||
|
||||
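With the rewrite above, process_hook_records() no longer branches on record type itself; each model's from_jsonl() filters the shared batch down to its own JSONL_TYPE. A short sketch of the calling side with an illustrative mixed batch (the field values are made up, and created_by_id is assumed to be defined):

records = [
    {'type': 'Binary', 'name': 'chromium', 'abspath': '/usr/bin/chromium', 'version': '120.0', 'binprovider': 'env'},
    {'type': 'Machine', '_method': 'update', 'key': 'CHROME_VERSION', 'value': '120.0'},
    {'type': 'ArchiveResult', 'status': 'succeeded'},  # dropped: it updates the calling ArchiveResult instead
]
stats = process_hook_records(records, overrides={'created_by_id': created_by_id})
# stats would look roughly like {'Snapshot': 0, 'Tag': 0, 'Binary': 1, 'Machine': 1} if both records apply cleanly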
def process_is_alive(pid_file: Path) -> bool:
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
__package__ = 'archivebox.machine'
|
||||
|
||||
import socket
|
||||
from typing import Iterator, Set
|
||||
from archivebox.uuid_compat import uuid7
|
||||
from datetime import timedelta
|
||||
|
||||
@@ -29,6 +30,8 @@ class MachineManager(models.Manager):
|
||||
|
||||
|
||||
class Machine(ModelWithHealthStats):
|
||||
JSONL_TYPE = 'Machine'
|
||||
|
||||
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
|
||||
created_at = models.DateTimeField(default=timezone.now, db_index=True)
|
||||
modified_at = models.DateTimeField(auto_now=True)
|
||||
@@ -69,13 +72,35 @@ class Machine(ModelWithHealthStats):
|
||||
)
|
||||
return _CURRENT_MACHINE
|
||||
|
||||
@staticmethod
|
||||
def from_jsonl(record: dict, overrides: dict = None):
|
||||
@classmethod
|
||||
def from_jsonl(cls, records, overrides: dict = None) -> list['Machine']:
|
||||
"""
|
||||
Update Machine config from JSONL record.
|
||||
Update Machine configs from an iterable of JSONL records.
|
||||
Filters to only records with type='Machine'.
|
||||
|
||||
Args:
|
||||
record: JSONL record with '_method': 'update', 'key': '...', 'value': '...'
|
||||
records: Iterable of dicts (JSONL records)
|
||||
overrides: Not used
|
||||
|
||||
Returns:
|
||||
List of Machine instances (skips None results)
|
||||
"""
|
||||
results = []
|
||||
for record in records:
|
||||
record_type = record.get('type', cls.JSONL_TYPE)
|
||||
if record_type == cls.JSONL_TYPE:
|
||||
instance = cls.from_json(record, overrides=overrides)
|
||||
if instance:
|
||||
results.append(instance)
|
||||
return results
|
||||
|
||||
@staticmethod
|
||||
def from_json(record: dict, overrides: dict = None) -> 'Machine | None':
|
||||
"""
|
||||
Update a single Machine config from a JSON record dict.
|
||||
|
||||
Args:
|
||||
record: Dict with '_method': 'update', 'key': '...', 'value': '...'
|
||||
overrides: Not used
|
||||
|
||||
Returns:
|
||||
@@ -94,6 +119,44 @@ class Machine(ModelWithHealthStats):
|
||||
return machine
|
||||
return None
|
||||
|
||||
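Machine.from_json() above only applies config-style updates, so the record shape hooks are expected to emit stays small. An illustrative example of such a record (the key and value shown here are hypothetical):

from archivebox.machine.models import Machine

machine_update = {
    'type': 'Machine',
    '_method': 'update',
    'key': 'CHROME_VERSION',      # hypothetical config key
    'value': '120.0.6099.0',      # hypothetical value
}
Machine.from_jsonl([machine_update])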
def to_json(self) -> dict:
|
||||
"""
|
||||
Convert Machine model instance to a JSON-serializable dict.
|
||||
"""
|
||||
from archivebox.config import VERSION
|
||||
return {
|
||||
'type': self.JSONL_TYPE,
|
||||
'schema_version': VERSION,
|
||||
'id': str(self.id),
|
||||
'guid': self.guid,
|
||||
'hostname': self.hostname,
|
||||
'hw_in_docker': self.hw_in_docker,
|
||||
'hw_in_vm': self.hw_in_vm,
|
||||
'os_arch': self.os_arch,
|
||||
'os_family': self.os_family,
|
||||
'os_platform': self.os_platform,
|
||||
'os_release': self.os_release,
|
||||
'created_at': self.created_at.isoformat() if self.created_at else None,
|
||||
}
|
||||
|
||||
def to_jsonl(self, seen: Set[tuple] = None, **kwargs) -> Iterator[dict]:
|
||||
"""
|
||||
Yield this Machine as a JSON record.
|
||||
|
||||
Args:
|
||||
seen: Set of (type, id) tuples already emitted (for deduplication)
|
||||
**kwargs: Passed to children (none for Machine, leaf node)
|
||||
|
||||
Yields:
|
||||
dict: JSON-serializable record for this machine
|
||||
"""
|
||||
if seen is not None:
|
||||
key = (self.JSONL_TYPE, str(self.id))
|
||||
if key in seen:
|
||||
return
|
||||
seen.add(key)
|
||||
yield self.to_json()
|
||||
|
||||
|
||||
class NetworkInterfaceManager(models.Manager):
|
||||
def current(self) -> 'NetworkInterface':
|
||||
@@ -101,6 +164,8 @@ class NetworkInterfaceManager(models.Manager):
|
||||
|
||||
|
||||
class NetworkInterface(ModelWithHealthStats):
|
||||
JSONL_TYPE = 'NetworkInterface'
|
||||
|
||||
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
|
||||
created_at = models.DateTimeField(default=timezone.now, db_index=True)
|
||||
modified_at = models.DateTimeField(auto_now=True)
|
||||
@@ -139,6 +204,46 @@ class NetworkInterface(ModelWithHealthStats):
|
||||
)
|
||||
return _CURRENT_INTERFACE
|
||||
|
||||
def to_json(self) -> dict:
|
||||
"""
|
||||
Convert NetworkInterface model instance to a JSON-serializable dict.
|
||||
"""
|
||||
from archivebox.config import VERSION
|
||||
return {
|
||||
'type': self.JSONL_TYPE,
|
||||
'schema_version': VERSION,
|
||||
'id': str(self.id),
|
||||
'machine_id': str(self.machine_id),
|
||||
'hostname': self.hostname,
|
||||
'iface': self.iface,
|
||||
'ip_public': self.ip_public,
|
||||
'ip_local': self.ip_local,
|
||||
'mac_address': self.mac_address,
|
||||
'dns_server': self.dns_server,
|
||||
'isp': self.isp,
|
||||
'city': self.city,
|
||||
'region': self.region,
|
||||
'country': self.country,
|
||||
'created_at': self.created_at.isoformat() if self.created_at else None,
|
||||
}
|
||||
|
||||
def to_jsonl(self, seen: Set[tuple] = None, **kwargs) -> Iterator[dict]:
|
||||
"""
|
||||
Yield this NetworkInterface as a JSON record.
|
||||
|
||||
Args:
|
||||
seen: Set of (type, id) tuples already emitted (for deduplication)
|
||||
**kwargs: Passed to children (none for NetworkInterface, leaf node)
|
||||
|
||||
Yields:
|
||||
dict: JSON-serializable record for this network interface
|
||||
"""
|
||||
if seen is not None:
|
||||
key = (self.JSONL_TYPE, str(self.id))
|
||||
if key in seen:
|
||||
return
|
||||
seen.add(key)
|
||||
yield self.to_json()
|
||||
|
||||
|
||||
class BinaryManager(models.Manager):
|
||||
@@ -165,7 +270,7 @@ class BinaryManager(models.Manager):
|
||||
|
||||
class Binary(ModelWithHealthStats):
|
||||
"""
|
||||
Tracks an binary on a specific machine.
|
||||
Tracks a binary on a specific machine.
|
||||
|
||||
Follows the unified state machine pattern:
|
||||
- queued: Binary needs to be installed
|
||||
@@ -176,6 +281,7 @@ class Binary(ModelWithHealthStats):
|
||||
State machine calls run() which executes on_Binary__install_* hooks
|
||||
to install the binary using the specified providers.
|
||||
"""
|
||||
JSONL_TYPE = 'Binary'
|
||||
|
||||
class StatusChoices(models.TextChoices):
|
||||
QUEUED = 'queued', 'Queued'
|
||||
@@ -242,13 +348,13 @@ class Binary(ModelWithHealthStats):
|
||||
'is_valid': self.is_valid,
|
||||
}
|
||||
|
||||
def to_jsonl(self) -> dict:
|
||||
def to_json(self) -> dict:
|
||||
"""
|
||||
Convert Binary model instance to a JSONL record.
|
||||
Convert Binary model instance to a JSON-serializable dict.
|
||||
"""
|
||||
from archivebox.config import VERSION
|
||||
return {
|
||||
'type': 'Binary',
|
||||
'type': self.JSONL_TYPE,
|
||||
'schema_version': VERSION,
|
||||
'id': str(self.id),
|
||||
'machine_id': str(self.machine_id),
|
||||
@@ -260,17 +366,57 @@ class Binary(ModelWithHealthStats):
|
||||
'status': self.status,
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def from_jsonl(record: dict, overrides: dict = None):
|
||||
def to_jsonl(self, seen: Set[tuple] = None, **kwargs) -> Iterator[dict]:
|
||||
"""
|
||||
Create/update Binary from JSONL record.
|
||||
Yield this Binary as a JSON record.
|
||||
|
||||
Args:
|
||||
seen: Set of (type, id) tuples already emitted (for deduplication)
|
||||
**kwargs: Passed to children (none for Binary, leaf node)
|
||||
|
||||
Yields:
|
||||
dict: JSON-serializable record for this binary
|
||||
"""
|
||||
if seen is not None:
|
||||
key = (self.JSONL_TYPE, str(self.id))
|
||||
if key in seen:
|
||||
return
|
||||
seen.add(key)
|
||||
yield self.to_json()
|
||||
|
||||
@classmethod
|
||||
def from_jsonl(cls, records, overrides: dict = None) -> list['Binary']:
|
||||
"""
|
||||
Create/update Binaries from an iterable of JSONL records.
|
||||
Filters to only records with type='Binary'.
|
||||
|
||||
Args:
|
||||
records: Iterable of dicts (JSONL records)
|
||||
overrides: Not used
|
||||
|
||||
Returns:
|
||||
List of Binary instances (skips None results)
|
||||
"""
|
||||
results = []
|
||||
for record in records:
|
||||
record_type = record.get('type', cls.JSONL_TYPE)
|
||||
if record_type == cls.JSONL_TYPE:
|
||||
instance = cls.from_json(record, overrides=overrides)
|
||||
if instance:
|
||||
results.append(instance)
|
||||
return results
|
||||
|
||||
@staticmethod
|
||||
def from_json(record: dict, overrides: dict = None) -> 'Binary | None':
|
||||
"""
|
||||
Create/update a single Binary from a JSON record dict.
|
||||
|
||||
Handles two cases:
|
||||
1. From binaries.jsonl: creates queued binary with name, binproviders, overrides
|
||||
2. From hook output: updates binary with abspath, version, sha256, binprovider
|
||||
|
||||
Args:
|
||||
record: JSONL record with 'name' and either:
|
||||
record: Dict with 'name' and either:
|
||||
- 'binproviders', 'overrides' (from binaries.jsonl)
|
||||
- 'abspath', 'version', 'sha256', 'binprovider' (from hook output)
|
||||
overrides: Not used
|
||||
@@ -494,6 +640,7 @@ class Process(ModelWithHealthStats):
|
||||
|
||||
State machine calls launch() to spawn the process and monitors its lifecycle.
|
||||
"""
|
||||
JSONL_TYPE = 'Process'
|
||||
|
||||
class StatusChoices(models.TextChoices):
|
||||
QUEUED = 'queued', 'Queued'
|
||||
@@ -624,13 +771,13 @@ class Process(ModelWithHealthStats):
|
||||
return self.archiveresult.hook_name
|
||||
return ''
|
||||
|
||||
def to_jsonl(self) -> dict:
|
||||
def to_json(self) -> dict:
|
||||
"""
|
||||
Convert Process model instance to a JSONL record.
|
||||
Convert Process model instance to a JSON-serializable dict.
|
||||
"""
|
||||
from archivebox.config import VERSION
|
||||
record = {
|
||||
'type': 'Process',
|
||||
'type': self.JSONL_TYPE,
|
||||
'schema_version': VERSION,
|
||||
'id': str(self.id),
|
||||
'machine_id': str(self.machine_id),
|
||||
@@ -650,6 +797,37 @@ class Process(ModelWithHealthStats):
|
||||
record['timeout'] = self.timeout
|
||||
return record
|
||||
|
||||
def to_jsonl(self, seen: Set[tuple] = None, binary: bool = True, machine: bool = False, iface: bool = False, **kwargs) -> Iterator[dict]:
|
||||
"""
|
||||
Yield this Process and optionally related objects as JSON records.
|
||||
|
||||
Args:
|
||||
seen: Set of (type, id) tuples already emitted (for deduplication)
|
||||
binary: Include related Binary (default: True)
|
||||
machine: Include related Machine (default: False)
|
||||
iface: Include related NetworkInterface (default: False)
|
||||
**kwargs: Passed to children
|
||||
|
||||
Yields:
|
||||
dict: JSON-serializable records
|
||||
"""
|
||||
if seen is None:
|
||||
seen = set()
|
||||
|
||||
key = (self.JSONL_TYPE, str(self.id))
|
||||
if key in seen:
|
||||
return
|
||||
seen.add(key)
|
||||
|
||||
yield self.to_json()
|
||||
|
||||
if binary and self.binary:
|
||||
yield from self.binary.to_jsonl(seen=seen, **kwargs)
|
||||
if machine and self.machine:
|
||||
yield from self.machine.to_jsonl(seen=seen, **kwargs)
|
||||
if iface and self.iface:
|
||||
yield from self.iface.to_jsonl(seen=seen, **kwargs)
|
||||
|
||||
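When several ArchiveResults share the same Process or Binary, the seen set threaded through these generators keeps each record from being emitted more than once. A small sketch of that behaviour, assuming snapshot is an existing instance whose results happen to share one Binary:

seen = set()
records = []
for ar in snapshot.archiveresult_set.select_related('process__binary'):
    records.extend(ar.to_jsonl(seen=seen, binary=True))
# Even if every ArchiveResult used the same Binary, only one Binary record ends up in `records`.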
def update_and_requeue(self, **kwargs):
|
||||
"""
|
||||
Update process fields and requeue for worker state machine.
|
||||
|
||||
@@ -24,7 +24,7 @@ __package__ = 'archivebox.misc'
|
||||
|
||||
import sys
|
||||
import json
|
||||
from typing import Iterator, Dict, Any, Optional, TextIO, Callable
|
||||
from typing import Iterator, Dict, Any, Optional, TextIO
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
@@ -150,36 +150,3 @@ def write_records(records: Iterator[Dict[str, Any]], stream: Optional[TextIO] =
|
||||
count += 1
|
||||
return count
|
||||
|
||||
|
||||
def filter_by_type(records: Iterator[Dict[str, Any]], record_type: str) -> Iterator[Dict[str, Any]]:
|
||||
"""
|
||||
Filter records by type.
|
||||
"""
|
||||
for record in records:
|
||||
if record.get('type') == record_type:
|
||||
yield record
|
||||
|
||||
|
||||
def process_records(
|
||||
records: Iterator[Dict[str, Any]],
|
||||
handlers: Dict[str, Callable[[Dict[str, Any]], Optional[Dict[str, Any]]]]
|
||||
) -> Iterator[Dict[str, Any]]:
|
||||
"""
|
||||
Process records through type-specific handlers.
|
||||
|
||||
Args:
|
||||
records: Input record iterator
|
||||
handlers: Dict mapping type names to handler functions
|
||||
Handlers return output records or None to skip
|
||||
|
||||
Yields output records from handlers.
|
||||
"""
|
||||
for record in records:
|
||||
record_type = record.get('type')
|
||||
handler = handlers.get(record_type)
|
||||
if handler:
|
||||
result = handler(record)
|
||||
if result:
|
||||
yield result
|
||||
|
||||
|
||||
|
||||
@@ -3,7 +3,12 @@
|
||||
Install hook for Chrome/Chromium and puppeteer-core.
|
||||
|
||||
Runs at crawl start to install/find Chromium and puppeteer-core.
|
||||
Outputs JSONL for Binary and Machine config updates.
|
||||
Also validates config and computes derived values.
|
||||
|
||||
Outputs:
|
||||
- JSONL for Binary and Machine config updates
|
||||
- COMPUTED:KEY=VALUE lines that hooks.py parses and adds to env
|
||||
|
||||
Respects CHROME_BINARY env var for custom binary paths.
|
||||
Uses `npx @puppeteer/browsers install chromium@latest` and parses output.
|
||||
|
||||
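The hook's stdout now mixes JSONL records with COMPUTED:KEY=VALUE lines (warnings and errors go to stderr), so the caller has to split the stream by prefix before handing records to process_hook_records(). A hedged sketch of that split; split_hook_stdout is an illustrative name, not the actual hooks.py implementation:

import json

def split_hook_stdout(stdout: str):
    records, computed = [], {}
    for line in stdout.splitlines():
        line = line.strip()
        if not line:
            continue
        if line.startswith('COMPUTED:'):
            key, _, value = line[len('COMPUTED:'):].partition('=')
            computed[key] = value
        else:
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError:
                pass  # ignore any other informational output
    return records, computed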
@@ -19,6 +24,28 @@ import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
|
||||
return os.environ.get(name, default).strip()
|
||||
|
||||
|
||||
def get_env_bool(name: str, default: bool = False) -> bool:
|
||||
val = get_env(name, '').lower()
|
||||
if val in ('true', '1', 'yes', 'on'):
|
||||
return True
|
||||
if val in ('false', '0', 'no', 'off'):
|
||||
return False
|
||||
return default
|
||||
|
||||
|
||||
def detect_docker() -> bool:
|
||||
"""Detect if running inside Docker container."""
|
||||
return (
|
||||
os.path.exists('/.dockerenv') or
|
||||
os.environ.get('IN_DOCKER', '').lower() in ('true', '1', 'yes') or
|
||||
os.path.exists('/run/.containerenv')
|
||||
)
|
||||
|
||||
|
||||
def get_chrome_version(binary_path: str) -> str | None:
|
||||
"""Get Chrome/Chromium version string."""
|
||||
try:
|
||||
@@ -131,13 +158,41 @@ def install_chromium() -> dict | None:
|
||||
|
||||
|
||||
def main():
|
||||
warnings = []
|
||||
errors = []
|
||||
computed = {}
|
||||
|
||||
# Install puppeteer-core if NODE_MODULES_DIR is set
|
||||
install_puppeteer_core()
|
||||
|
||||
# Check if Chrome is enabled
|
||||
chrome_enabled = get_env_bool('CHROME_ENABLED', True)
|
||||
|
||||
# Detect Docker and adjust sandbox
|
||||
in_docker = detect_docker()
|
||||
computed['IN_DOCKER'] = str(in_docker).lower()
|
||||
|
||||
chrome_sandbox = get_env_bool('CHROME_SANDBOX', True)
|
||||
if in_docker and chrome_sandbox:
|
||||
warnings.append(
|
||||
"Running in Docker with CHROME_SANDBOX=true. "
|
||||
"Chrome may fail to start. Consider setting CHROME_SANDBOX=false."
|
||||
)
|
||||
# Auto-disable sandbox in Docker unless explicitly set
|
||||
if not get_env('CHROME_SANDBOX'):
|
||||
computed['CHROME_SANDBOX'] = 'false'
|
||||
|
||||
# Check Node.js availability
|
||||
node_binary = get_env('NODE_BINARY', 'node')
|
||||
computed['NODE_BINARY'] = node_binary
|
||||
|
||||
# Check if CHROME_BINARY is already set and valid
|
||||
configured_binary = os.environ.get('CHROME_BINARY', '').strip()
|
||||
configured_binary = get_env('CHROME_BINARY', '')
|
||||
if configured_binary and os.path.isfile(configured_binary) and os.access(configured_binary, os.X_OK):
|
||||
version = get_chrome_version(configured_binary)
|
||||
computed['CHROME_BINARY'] = configured_binary
|
||||
computed['CHROME_VERSION'] = version or 'unknown'
|
||||
|
||||
print(json.dumps({
|
||||
'type': 'Binary',
|
||||
'name': 'chromium',
|
||||
@@ -145,12 +200,22 @@ def main():
|
||||
'version': version,
|
||||
'binprovider': 'env',
|
||||
}))
|
||||
|
||||
# Output computed values
|
||||
for key, value in computed.items():
|
||||
print(f"COMPUTED:{key}={value}")
|
||||
for warning in warnings:
|
||||
print(f"WARNING:{warning}", file=sys.stderr)
|
||||
|
||||
sys.exit(0)
|
||||
|
||||
# Install/find Chromium via puppeteer
|
||||
result = install_chromium()
|
||||
|
||||
if result and result.get('abspath'):
|
||||
computed['CHROME_BINARY'] = result['abspath']
|
||||
computed['CHROME_VERSION'] = result['version'] or 'unknown'
|
||||
|
||||
print(json.dumps({
|
||||
'type': 'Binary',
|
||||
'name': result['name'],
|
||||
@@ -174,9 +239,25 @@ def main():
|
||||
'value': result['version'],
|
||||
}))
|
||||
|
||||
# Output computed values
|
||||
for key, value in computed.items():
|
||||
print(f"COMPUTED:{key}={value}")
|
||||
for warning in warnings:
|
||||
print(f"WARNING:{warning}", file=sys.stderr)
|
||||
|
||||
sys.exit(0)
|
||||
else:
|
||||
print("Chromium binary not found", file=sys.stderr)
|
||||
errors.append("Chromium binary not found")
|
||||
computed['CHROME_BINARY'] = ''
|
||||
|
||||
# Output computed values and errors
|
||||
for key, value in computed.items():
|
||||
print(f"COMPUTED:{key}={value}")
|
||||
for warning in warnings:
|
||||
print(f"WARNING:{warning}", file=sys.stderr)
|
||||
for error in errors:
|
||||
print(f"ERROR:{error}", file=sys.stderr)
|
||||
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
@@ -1,172 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Validate and compute derived Chrome config values.
|
||||
|
||||
This hook runs early in the Crawl lifecycle to:
|
||||
1. Auto-detect Chrome binary location
|
||||
2. Compute sandbox settings based on Docker detection
|
||||
3. Validate binary availability and version
|
||||
4. Set computed env vars for subsequent hooks
|
||||
|
||||
Output:
|
||||
- COMPUTED:KEY=VALUE lines that hooks.py parses and adds to env
|
||||
- Binary JSONL records to stdout when binaries are found
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
from abx_pkg import Binary, EnvProvider
|
||||
|
||||
|
||||
# Chrome binary search order
|
||||
CHROME_BINARY_NAMES = [
|
||||
'chromium',
|
||||
'chromium-browser',
|
||||
'google-chrome',
|
||||
'google-chrome-stable',
|
||||
'chrome',
|
||||
]
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
|
||||
return os.environ.get(name, default).strip()
|
||||
|
||||
def get_env_bool(name: str, default: bool = False) -> bool:
|
||||
val = get_env(name, '').lower()
|
||||
if val in ('true', '1', 'yes', 'on'):
|
||||
return True
|
||||
if val in ('false', '0', 'no', 'off'):
|
||||
return False
|
||||
return default
|
||||
|
||||
|
||||
def detect_docker() -> bool:
|
||||
"""Detect if running inside Docker container."""
|
||||
return (
|
||||
os.path.exists('/.dockerenv') or
|
||||
os.environ.get('IN_DOCKER', '').lower() in ('true', '1', 'yes') or
|
||||
os.path.exists('/run/.containerenv')
|
||||
)
|
||||
|
||||
|
||||
def find_chrome_binary(configured: str, provider: EnvProvider) -> Binary | None:
|
||||
"""Find Chrome binary using abx-pkg, checking configured path first."""
|
||||
# Try configured binary first
|
||||
if configured:
|
||||
try:
|
||||
binary = Binary(name=configured, binproviders=[provider]).load()
|
||||
if binary.abspath:
|
||||
return binary
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Search common names
|
||||
for name in CHROME_BINARY_NAMES:
|
||||
try:
|
||||
binary = Binary(name=name, binproviders=[provider]).load()
|
||||
if binary.abspath:
|
||||
return binary
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def output_binary(binary: Binary, name: str):
|
||||
"""Output Binary JSONL record to stdout."""
|
||||
machine_id = os.environ.get('MACHINE_ID', '')
|
||||
|
||||
record = {
|
||||
'type': 'Binary',
|
||||
'name': name,
|
||||
'abspath': str(binary.abspath),
|
||||
'version': str(binary.version) if binary.version else '',
|
||||
'sha256': binary.sha256 or '',
|
||||
'binprovider': 'env',
|
||||
'machine_id': machine_id,
|
||||
}
|
||||
print(json.dumps(record))
|
||||
|
||||
|
||||
def main():
|
||||
warnings = []
|
||||
errors = []
|
||||
computed = {}
|
||||
|
||||
# Get config values
|
||||
chrome_binary = get_env('CHROME_BINARY', 'chromium')
|
||||
chrome_sandbox = get_env_bool('CHROME_SANDBOX', True)
|
||||
screenshot_enabled = get_env_bool('SCREENSHOT_ENABLED', True)
|
||||
pdf_enabled = get_env_bool('PDF_ENABLED', True)
|
||||
dom_enabled = get_env_bool('DOM_ENABLED', True)
|
||||
|
||||
# Compute USE_CHROME (derived from extractor enabled flags)
|
||||
use_chrome = screenshot_enabled or pdf_enabled or dom_enabled
|
||||
computed['USE_CHROME'] = str(use_chrome).lower()
|
||||
|
||||
# Detect Docker and adjust sandbox
|
||||
in_docker = detect_docker()
|
||||
computed['IN_DOCKER'] = str(in_docker).lower()
|
||||
|
||||
if in_docker and chrome_sandbox:
|
||||
warnings.append(
|
||||
"Running in Docker with CHROME_SANDBOX=true. "
|
||||
"Chrome may fail to start. Consider setting CHROME_SANDBOX=false."
|
||||
)
|
||||
# Auto-disable sandbox in Docker unless explicitly set
|
||||
if not get_env('CHROME_SANDBOX'):
|
||||
computed['CHROME_SANDBOX'] = 'false'
|
||||
|
||||
# Find Chrome binary using abx-pkg
|
||||
provider = EnvProvider()
|
||||
if use_chrome:
|
||||
chrome = find_chrome_binary(chrome_binary, provider)
|
||||
if not chrome or not chrome.abspath:
|
||||
errors.append(
|
||||
f"Chrome binary not found (tried: {chrome_binary}). "
|
||||
"Install Chrome/Chromium or set CHROME_BINARY path."
|
||||
)
|
||||
computed['CHROME_BINARY'] = ''
|
||||
else:
|
||||
computed['CHROME_BINARY'] = str(chrome.abspath)
|
||||
computed['CHROME_VERSION'] = str(chrome.version) if chrome.version else 'unknown'
|
||||
|
||||
# Output Binary JSONL record for Chrome
|
||||
output_binary(chrome, name='chrome')
|
||||
|
||||
# Check Node.js for Puppeteer
|
||||
node_binary_name = get_env('NODE_BINARY', 'node')
|
||||
try:
|
||||
node = Binary(name=node_binary_name, binproviders=[provider]).load()
|
||||
node_path = str(node.abspath) if node.abspath else ''
|
||||
except Exception:
|
||||
node = None
|
||||
node_path = ''
|
||||
|
||||
if use_chrome and not node_path:
|
||||
errors.append(
|
||||
f"Node.js not found (tried: {node_binary_name}). "
|
||||
"Install Node.js or set NODE_BINARY path for Puppeteer."
|
||||
)
|
||||
else:
|
||||
computed['NODE_BINARY'] = node_path
|
||||
if node and node.abspath:
|
||||
# Output Binary JSONL record for Node
|
||||
output_binary(node, name='node')
|
||||
|
||||
# Output computed values
|
||||
for key, value in computed.items():
|
||||
print(f"COMPUTED:{key}={value}")
|
||||
|
||||
for warning in warnings:
|
||||
print(f"WARNING:{warning}", file=sys.stderr)
|
||||
|
||||
for error in errors:
|
||||
print(f"ERROR:{error}", file=sys.stderr)
|
||||
|
||||
sys.exit(1 if errors else 0)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -9,7 +9,7 @@
|
||||
* --load-extension and --disable-extensions-except flags.
|
||||
*
|
||||
* Usage: on_Crawl__20_chrome_launch.bg.js --crawl-id=<uuid> --source-url=<url>
|
||||
* Output: Creates chrome/ directory under crawl output dir with:
|
||||
* Output: Writes to current directory (executor creates chrome/ dir):
|
||||
* - cdp_url.txt: WebSocket URL for CDP connection
|
||||
* - chrome.pid: Chromium process ID (for cleanup)
|
||||
* - port.txt: Debug port number
|
||||
@@ -42,7 +42,7 @@ const {
|
||||
|
||||
// Extractor metadata
|
||||
const PLUGIN_NAME = 'chrome_launch';
|
||||
const OUTPUT_DIR = 'chrome';
|
||||
const OUTPUT_DIR = '.';
|
||||
|
||||
// Global state for cleanup
|
||||
let chromePid = null;
|
||||
@@ -1,268 +0,0 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* SingleFile Extension Plugin
|
||||
*
|
||||
* Installs and uses the SingleFile Chrome extension for archiving complete web pages.
|
||||
* Falls back to single-file-cli if the extension is not available.
|
||||
*
|
||||
* Extension: https://chromewebstore.google.com/detail/mpiodijhokgodhhofbcjdecpffjipkle
|
||||
*
|
||||
* Priority: 04 (early) - Must install before Chrome session starts at Crawl level
|
||||
* Hook: on_Crawl (runs once per crawl, not per snapshot)
|
||||
*
|
||||
* This extension automatically:
|
||||
* - Saves complete web pages as single HTML files
|
||||
* - Inlines all resources (CSS, JS, images, fonts)
|
||||
* - Preserves page fidelity better than wget/curl
|
||||
* - Works with SPAs and dynamically loaded content
|
||||
*/
|
||||
|
||||
const path = require('path');
|
||||
const fs = require('fs');
|
||||
const { promisify } = require('util');
|
||||
const { exec } = require('child_process');
|
||||
|
||||
const execAsync = promisify(exec);
|
||||
|
||||
// Import extension utilities
|
||||
const extensionUtils = require('../chrome/chrome_utils.js');
|
||||
|
||||
// Extension metadata
|
||||
const EXTENSION = {
|
||||
webstore_id: 'mpiodijhokgodhhofbcjdecpffjipkle',
|
||||
name: 'singlefile',
|
||||
};
|
||||
|
||||
// Get extensions directory from environment or use default
|
||||
const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
|
||||
path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
|
||||
|
||||
const CHROME_DOWNLOADS_DIR = process.env.CHROME_DOWNLOADS_DIR ||
|
||||
path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_downloads');
|
||||
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'singlefile.html';
|
||||
|
||||
/**
|
||||
* Install the SingleFile extension
|
||||
*/
|
||||
async function installSinglefileExtension() {
|
||||
console.log('[*] Installing SingleFile extension...');
|
||||
|
||||
// Install the extension
|
||||
const extension = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);
|
||||
|
||||
if (!extension) {
|
||||
console.error('[❌] Failed to install SingleFile extension');
|
||||
return null;
|
||||
}
|
||||
|
||||
console.log('[+] SingleFile extension installed');
|
||||
console.log('[+] Web pages will be saved as single HTML files');
|
||||
|
||||
return extension;
|
||||
}
|
||||
|
||||
/**
|
||||
* Wait for a specified amount of time
|
||||
*/
|
||||
function wait(ms) {
|
||||
return new Promise(resolve => setTimeout(resolve, ms));
|
||||
}
|
||||
|
||||
/**
|
||||
* Save a page using the SingleFile extension
|
||||
*
|
||||
* @param {Object} page - Puppeteer page object
|
||||
* @param {Object} extension - Extension metadata with dispatchAction method
|
||||
* @param {Object} options - Additional options
|
||||
* @returns {Promise<string|null>} - Path to saved file or null on failure
|
||||
*/
|
||||
async function saveSinglefileWithExtension(page, extension, options = {}) {
|
||||
if (!extension || !extension.version) {
|
||||
throw new Error('SingleFile extension not found or not loaded');
|
||||
}
|
||||
|
||||
const url = await page.url();
|
||||
|
||||
// Check for unsupported URL schemes
|
||||
const URL_SCHEMES_IGNORED = ['about', 'chrome', 'chrome-extension', 'data', 'javascript', 'blob'];
|
||||
const scheme = url.split(':')[0];
|
||||
if (URL_SCHEMES_IGNORED.includes(scheme)) {
|
||||
console.log(`[⚠️] Skipping SingleFile for URL scheme: ${scheme}`);
|
||||
return null;
|
||||
}
|
||||
|
||||
// Ensure downloads directory exists
|
||||
await fs.promises.mkdir(CHROME_DOWNLOADS_DIR, { recursive: true });
|
||||
|
||||
// Get list of existing files to ignore
|
||||
const files_before = new Set(
|
||||
(await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
|
||||
.filter(fn => fn.endsWith('.html'))
|
||||
);
|
||||
|
||||
// Output directory is current directory (hook already runs in output dir)
|
||||
const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);
|
||||
|
||||
console.log(`[🛠️] Saving SingleFile HTML using extension (${extension.id})...`);
|
||||
|
||||
// Bring page to front (extension action button acts on foreground tab)
|
||||
await page.bringToFront();
|
||||
|
||||
// Trigger the extension's action (toolbar button click)
|
||||
await extension.dispatchAction();
|
||||
|
||||
// Wait for file to appear in downloads directory
|
||||
const check_delay = 3000; // 3 seconds
|
||||
const max_tries = 10;
|
||||
let files_new = [];
|
||||
|
||||
for (let attempt = 0; attempt < max_tries; attempt++) {
|
||||
await wait(check_delay);
|
||||
|
||||
const files_after = (await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
|
||||
.filter(fn => fn.endsWith('.html'));
|
||||
|
||||
files_new = files_after.filter(file => !files_before.has(file));
|
||||
|
||||
if (files_new.length === 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Find the matching file by checking if it contains the URL in the HTML header
|
||||
for (const file of files_new) {
|
||||
const dl_path = path.join(CHROME_DOWNLOADS_DIR, file);
|
||||
const dl_text = await fs.promises.readFile(dl_path, 'utf-8');
|
||||
const dl_header = dl_text.split('meta charset')[0];
|
||||
|
||||
if (dl_header.includes(`url: ${url}`)) {
|
||||
console.log(`[✍️] Moving SingleFile download from ${file} to ${out_path}`);
|
||||
await fs.promises.rename(dl_path, out_path);
|
||||
return out_path;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
console.warn(`[❌] Couldn't find matching SingleFile HTML in ${CHROME_DOWNLOADS_DIR} after waiting ${(check_delay * max_tries) / 1000}s`);
|
||||
console.warn(`[⚠️] New files found: ${files_new.join(', ')}`);
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Save a page using single-file-cli (fallback method)
|
||||
*
|
||||
* @param {string} url - URL to archive
|
||||
* @param {Object} options - Additional options
|
||||
* @returns {Promise<string|null>} - Path to saved file or null on failure
|
||||
*/
|
||||
async function saveSinglefileWithCLI(url, options = {}) {
|
||||
console.log('[*] Falling back to single-file-cli...');
|
||||
|
||||
// Find single-file binary
|
||||
let binary = null;
|
||||
try {
|
||||
const { stdout } = await execAsync('which single-file');
|
||||
binary = stdout.trim();
|
||||
} catch (err) {
|
||||
console.error('[❌] single-file-cli not found. Install with: npm install -g single-file-cli');
|
||||
return null;
|
||||
}
|
||||
|
||||
// Output directory is current directory (hook already runs in output dir)
|
||||
const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);
|
||||
|
||||
// Build command
|
||||
const cmd = [
|
||||
binary,
|
||||
'--browser-headless',
|
||||
url,
|
||||
out_path,
|
||||
];
|
||||
|
||||
// Add optional args
|
||||
if (options.userAgent) {
|
||||
cmd.splice(2, 0, '--browser-user-agent', options.userAgent);
|
||||
}
|
||||
if (options.cookiesFile && fs.existsSync(options.cookiesFile)) {
|
||||
cmd.splice(2, 0, '--browser-cookies-file', options.cookiesFile);
|
||||
}
|
||||
if (options.ignoreSSL) {
|
||||
cmd.splice(2, 0, '--browser-ignore-insecure-certs');
|
||||
}
|
||||
|
||||
// Execute
|
||||
try {
|
||||
const timeout = options.timeout || 120000;
|
||||
await execAsync(cmd.join(' '), { timeout });
|
||||
|
||||
if (fs.existsSync(out_path) && fs.statSync(out_path).size > 0) {
|
||||
console.log(`[+] SingleFile saved via CLI: ${out_path}`);
|
||||
return out_path;
|
||||
}
|
||||
|
||||
console.error('[❌] SingleFile CLI completed but no output file found');
|
||||
return null;
|
||||
} catch (err) {
|
||||
console.error(`[❌] SingleFile CLI error: ${err.message}`);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Main entry point - install extension before archiving
|
||||
*/
|
||||
async function main() {
|
||||
// Check if extension is already cached
|
||||
const cacheFile = path.join(EXTENSIONS_DIR, 'singlefile.extension.json');
|
||||
|
||||
if (fs.existsSync(cacheFile)) {
|
||||
try {
|
||||
const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
|
||||
const manifestPath = path.join(cached.unpacked_path, 'manifest.json');
|
||||
|
||||
if (fs.existsSync(manifestPath)) {
|
||||
console.log('[*] SingleFile extension already installed (using cache)');
|
||||
return cached;
|
||||
}
|
||||
} catch (e) {
|
||||
// Cache file corrupted, re-install
|
||||
console.warn('[⚠️] Extension cache corrupted, re-installing...');
|
||||
}
|
||||
}
|
||||
|
||||
// Install extension
|
||||
const extension = await installSinglefileExtension();
|
||||
|
||||
// Export extension metadata for chrome plugin to load
|
||||
if (extension) {
|
||||
// Write extension info to a cache file that chrome plugin can read
|
||||
await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
|
||||
await fs.promises.writeFile(
|
||||
cacheFile,
|
||||
JSON.stringify(extension, null, 2)
|
||||
);
|
||||
console.log(`[+] Extension metadata written to ${cacheFile}`);
|
||||
}
|
||||
|
||||
return extension;
|
||||
}
|
||||
|
||||
// Export functions for use by other plugins
|
||||
module.exports = {
|
||||
EXTENSION,
|
||||
installSinglefileExtension,
|
||||
saveSinglefileWithExtension,
|
||||
saveSinglefileWithCLI,
|
||||
};
|
||||
|
||||
// Run if executed directly
|
||||
if (require.main === module) {
|
||||
main().then(() => {
|
||||
console.log('[✓] SingleFile extension setup complete');
|
||||
process.exit(0);
|
||||
}).catch(err => {
|
||||
console.error('[❌] SingleFile extension setup failed:', err);
|
||||
process.exit(1);
|
||||
});
|
||||
}
|
||||
281
archivebox/plugins/singlefile/on_Crawl__20_install_singlefile_extension.js
Executable file
@@ -0,0 +1,281 @@
#!/usr/bin/env node
/**
 * SingleFile Extension Plugin
 *
 * DISABLED: Extension functionality commented out - using single-file-cli only
 *
 * Installs and uses the SingleFile Chrome extension for archiving complete web pages.
 * Falls back to single-file-cli if the extension is not available.
 *
 * Extension: https://chromewebstore.google.com/detail/mpiodijhokgodhhofbcjdecpffjipkle
 *
 * Priority: 04 (early) - Must install before Chrome session starts at Crawl level
 * Hook: on_Crawl (runs once per crawl, not per snapshot)
 *
 * This extension automatically:
 * - Saves complete web pages as single HTML files
 * - Inlines all resources (CSS, JS, images, fonts)
 * - Preserves page fidelity better than wget/curl
 * - Works with SPAs and dynamically loaded content
 */

const path = require('path');
const fs = require('fs');
const { promisify } = require('util');
const { exec } = require('child_process');

const execAsync = promisify(exec);

// DISABLED: Extension functionality - using single-file-cli only
// // Import extension utilities
// const extensionUtils = require('../chrome/chrome_utils.js');

// // Extension metadata
// const EXTENSION = {
//     webstore_id: 'mpiodijhokgodhhofbcjdecpffjipkle',
//     name: 'singlefile',
// };

// // Get extensions directory from environment or use default
// const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
//     path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');

// const CHROME_DOWNLOADS_DIR = process.env.CHROME_DOWNLOADS_DIR ||
//     path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_downloads');

const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'singlefile.html';

// DISABLED: Extension functionality - using single-file-cli only
// /**
//  * Install the SingleFile extension
//  */
// async function installSinglefileExtension() {
//     console.log('[*] Installing SingleFile extension...');

//     // Install the extension
//     const extension = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);

//     if (!extension) {
//         console.error('[❌] Failed to install SingleFile extension');
//         return null;
//     }

//     console.log('[+] SingleFile extension installed');
//     console.log('[+] Web pages will be saved as single HTML files');

//     return extension;
// }

// /**
//  * Wait for a specified amount of time
//  */
// function wait(ms) {
//     return new Promise(resolve => setTimeout(resolve, ms));
// }

// /**
//  * Save a page using the SingleFile extension
//  *
//  * @param {Object} page - Puppeteer page object
//  * @param {Object} extension - Extension metadata with dispatchAction method
//  * @param {Object} options - Additional options
//  * @returns {Promise<string|null>} - Path to saved file or null on failure
//  */
// async function saveSinglefileWithExtension(page, extension, options = {}) {
//     if (!extension || !extension.version) {
//         throw new Error('SingleFile extension not found or not loaded');
//     }

//     const url = await page.url();

//     // Check for unsupported URL schemes
//     const URL_SCHEMES_IGNORED = ['about', 'chrome', 'chrome-extension', 'data', 'javascript', 'blob'];
//     const scheme = url.split(':')[0];
//     if (URL_SCHEMES_IGNORED.includes(scheme)) {
//         console.log(`[⚠️] Skipping SingleFile for URL scheme: ${scheme}`);
//         return null;
//     }

//     // Ensure downloads directory exists
//     await fs.promises.mkdir(CHROME_DOWNLOADS_DIR, { recursive: true });

//     // Get list of existing files to ignore
//     const files_before = new Set(
//         (await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
//             .filter(fn => fn.endsWith('.html'))
//     );

//     // Output directory is current directory (hook already runs in output dir)
//     const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);

//     console.log(`[🛠️] Saving SingleFile HTML using extension (${extension.id})...`);

//     // Bring page to front (extension action button acts on foreground tab)
//     await page.bringToFront();

//     // Trigger the extension's action (toolbar button click)
//     await extension.dispatchAction();

//     // Wait for file to appear in downloads directory
//     const check_delay = 3000; // 3 seconds
//     const max_tries = 10;
//     let files_new = [];

//     for (let attempt = 0; attempt < max_tries; attempt++) {
//         await wait(check_delay);

//         const files_after = (await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
//             .filter(fn => fn.endsWith('.html'));

//         files_new = files_after.filter(file => !files_before.has(file));

//         if (files_new.length === 0) {
//             continue;
//         }

//         // Find the matching file by checking if it contains the URL in the HTML header
//         for (const file of files_new) {
//             const dl_path = path.join(CHROME_DOWNLOADS_DIR, file);
//             const dl_text = await fs.promises.readFile(dl_path, 'utf-8');
//             const dl_header = dl_text.split('meta charset')[0];

//             if (dl_header.includes(`url: ${url}`)) {
//                 console.log(`[✍️] Moving SingleFile download from ${file} to ${out_path}`);
//                 await fs.promises.rename(dl_path, out_path);
//                 return out_path;
//             }
//         }
//     }

//     console.warn(`[❌] Couldn't find matching SingleFile HTML in ${CHROME_DOWNLOADS_DIR} after waiting ${(check_delay * max_tries) / 1000}s`);
//     console.warn(`[⚠️] New files found: ${files_new.join(', ')}`);
//     return null;
// }

/**
 * Save a page using single-file-cli (fallback method)
 *
 * @param {string} url - URL to archive
 * @param {Object} options - Additional options
 * @returns {Promise<string|null>} - Path to saved file or null on failure
 */
async function saveSinglefileWithCLI(url, options = {}) {
    console.log('[*] Falling back to single-file-cli...');

    // Find single-file binary
    let binary = null;
    try {
        const { stdout } = await execAsync('which single-file');
        binary = stdout.trim();
    } catch (err) {
        console.error('[❌] single-file-cli not found. Install with: npm install -g single-file-cli');
        return null;
    }

    // Output directory is current directory (hook already runs in output dir)
    const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);

    // Build command
    const cmd = [
        binary,
        '--browser-headless',
        url,
        out_path,
    ];

    // Add optional args
    if (options.userAgent) {
        cmd.splice(2, 0, '--browser-user-agent', options.userAgent);
    }
    if (options.cookiesFile && fs.existsSync(options.cookiesFile)) {
        cmd.splice(2, 0, '--browser-cookies-file', options.cookiesFile);
    }
    if (options.ignoreSSL) {
        cmd.splice(2, 0, '--browser-ignore-insecure-certs');
    }

    // Execute
    try {
        const timeout = options.timeout || 120000;
        await execAsync(cmd.join(' '), { timeout });

        if (fs.existsSync(out_path) && fs.statSync(out_path).size > 0) {
            console.log(`[+] SingleFile saved via CLI: ${out_path}`);
            return out_path;
        }

        console.error('[❌] SingleFile CLI completed but no output file found');
        return null;
    } catch (err) {
        console.error(`[❌] SingleFile CLI error: ${err.message}`);
        return null;
    }
}

// DISABLED: Extension functionality - using single-file-cli only
// /**
//  * Main entry point - install extension before archiving
//  */
// async function main() {
//     // Check if extension is already cached
//     const cacheFile = path.join(EXTENSIONS_DIR, 'singlefile.extension.json');

//     if (fs.existsSync(cacheFile)) {
//         try {
//             const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
//             const manifestPath = path.join(cached.unpacked_path, 'manifest.json');

//             if (fs.existsSync(manifestPath)) {
//                 console.log('[*] SingleFile extension already installed (using cache)');
//                 return cached;
//             }
//         } catch (e) {
//             // Cache file corrupted, re-install
//             console.warn('[⚠️] Extension cache corrupted, re-installing...');
//         }
//     }

//     // Install extension
//     const extension = await installSinglefileExtension();

//     // Export extension metadata for chrome plugin to load
//     if (extension) {
//         // Write extension info to a cache file that chrome plugin can read
//         await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
//         await fs.promises.writeFile(
//             cacheFile,
//             JSON.stringify(extension, null, 2)
//         );
//         console.log(`[+] Extension metadata written to ${cacheFile}`);
//     }

//     return extension;
// }

// Export functions for use by other plugins
module.exports = {
    // DISABLED: Extension functionality - using single-file-cli only
    // EXTENSION,
    // installSinglefileExtension,
    // saveSinglefileWithExtension,
    saveSinglefileWithCLI,
};

// DISABLED: Extension functionality - using single-file-cli only
// // Run if executed directly
// if (require.main === module) {
//     main().then(() => {
//         console.log('[✓] SingleFile extension setup complete');
//         process.exit(0);
//     }).catch(err => {
//         console.error('[❌] SingleFile extension setup failed:', err);
//         process.exit(1);
//     });
// }

// No-op when run directly (extension install disabled)
if (require.main === module) {
    console.log('[*] SingleFile extension install disabled - using single-file-cli only');
    process.exit(0);
}
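
For context, a minimal sketch of how a Snapshot-level hook might consume the CLI helper exported above; the require path, option values, and error handling are illustrative assumptions, not part of this commit:

// Illustrative sketch only: consuming saveSinglefileWithCLI from another hook script.
// The require path below is an assumption about where this file lives relative to the caller.
const { saveSinglefileWithCLI } = require('./on_Crawl__20_install_singlefile_extension.js');

async function runSinglefileForSnapshot(url) {
    // Option names mirror the ones read inside saveSinglefileWithCLI
    // (userAgent, cookiesFile, ignoreSSL, timeout); values here are placeholders.
    const out_path = await saveSinglefileWithCLI(url, {
        userAgent: process.env.USER_AGENT || undefined,
        timeout: 120000,
    });
    if (!out_path) {
        throw new Error('singlefile CLI extraction failed');
    }
    return out_path;  // ./singlefile.html in the hook's working directory
}

// Example invocation (normally driven by the hook runner):
// runSinglefileForSnapshot('https://example.com').then(p => console.log('saved to', p));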
@@ -2,16 +2,15 @@
Integration tests for singlefile plugin

Tests verify:
1. Hook script exists and has correct metadata
2. Extension installation and caching works
3. Chrome/node dependencies available
4. Hook can be executed successfully
1. Hook scripts exist with correct naming
2. CLI-based singlefile extraction works
3. Dependencies available via abx-pkg
4. Output contains valid HTML
"""

import json
import os
import subprocess
import sys
import tempfile
from pathlib import Path

@@ -20,177 +19,63 @@ import pytest

PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_singlefile.*'), None)
NPM_PROVIDER_HOOK = PLUGINS_ROOT / 'npm' / 'on_Binary__install_using_npm_provider.py'
SNAPSHOT_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_singlefile.py'), None)
TEST_URL = "https://example.com"


def test_install_script_exists():
    """Verify install script exists"""
    assert INSTALL_SCRIPT.exists(), f"Install script not found: {INSTALL_SCRIPT}"
def test_snapshot_hook_exists():
    """Verify snapshot extraction hook exists"""
    assert SNAPSHOT_HOOK is not None and SNAPSHOT_HOOK.exists(), f"Snapshot hook not found in {PLUGIN_DIR}"


def test_extension_metadata():
    """Test that SingleFile extension has correct metadata"""
    with tempfile.TemporaryDirectory() as tmpdir:
        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(Path(tmpdir) / "chrome_extensions")

        result = subprocess.run(
            ["node", "-e", f"const ext = require('{INSTALL_SCRIPT}'); console.log(JSON.stringify(ext.EXTENSION))"],
            capture_output=True,
            text=True,
            env=env
        )

        assert result.returncode == 0, f"Failed to load extension metadata: {result.stderr}"

        metadata = json.loads(result.stdout)
        assert metadata["webstore_id"] == "mpiodijhokgodhhofbcjdecpffjipkle"
        assert metadata["name"] == "singlefile"


def test_install_creates_cache():
    """Test that install creates extension cache"""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)

        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)

        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=60
        )

        # Check output mentions installation
        assert "SingleFile" in result.stdout or "singlefile" in result.stdout

        # Check cache file was created
        cache_file = ext_dir / "singlefile.extension.json"
        assert cache_file.exists(), "Cache file should be created"

        # Verify cache content
        cache_data = json.loads(cache_file.read_text())
        assert cache_data["webstore_id"] == "mpiodijhokgodhhofbcjdecpffjipkle"
        assert cache_data["name"] == "singlefile"


def test_install_twice_uses_cache():
    """Test that running install twice uses existing cache on second run"""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)

        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)

        # First install - downloads the extension
        result1 = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=60
        )
        assert result1.returncode == 0, f"First install failed: {result1.stderr}"

        # Verify cache was created
        cache_file = ext_dir / "singlefile.extension.json"
        assert cache_file.exists(), "Cache file should exist after first install"

        # Second install - should use cache
        result2 = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=30
        )
        assert result2.returncode == 0, f"Second install failed: {result2.stderr}"

        # Second run should be faster (uses cache) and mention cache
        assert "already installed" in result2.stdout or "cache" in result2.stdout.lower() or result2.returncode == 0


def test_no_configuration_required():
    """Test that SingleFile works without configuration"""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)

        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
        # No API keys needed

        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=60
        )

        # Should work without API keys
        assert result.returncode == 0


def test_priority_order():
    """Test that singlefile has correct priority (04)"""
    # Extract priority from filename
    filename = INSTALL_SCRIPT.name
    assert "04" in filename, "SingleFile should have priority 04"
    assert filename.startswith("on_Crawl__04_"), "Should follow priority naming convention for Crawl hooks"


def test_output_directory_structure():
    """Test that plugin defines correct output structure"""
    # Verify the script mentions singlefile output directory
    script_content = INSTALL_SCRIPT.read_text()

    # Should mention singlefile output directory
    assert "singlefile" in script_content.lower()
    # Should mention HTML output
    assert ".html" in script_content or "html" in script_content.lower()
def test_snapshot_hook_priority():
    """Test that snapshot hook has correct priority (50)"""
    filename = SNAPSHOT_HOOK.name
    assert "50" in filename, "SingleFile snapshot hook should have priority 50"
    assert filename.startswith("on_Snapshot__50_"), "Should follow priority naming convention"


def test_verify_deps_with_abx_pkg():
    """Verify dependencies are available via abx-pkg after hook installation."""
    from abx_pkg import Binary, EnvProvider, BinProviderOverrides
    """Verify dependencies are available via abx-pkg."""
    from abx_pkg import Binary, EnvProvider

    EnvProvider.model_rebuild()

    # Verify node is available (singlefile uses Chrome extension, needs Node)
    # Verify node is available
    node_binary = Binary(name='node', binproviders=[EnvProvider()])
    node_loaded = node_binary.load()
    assert node_loaded and node_loaded.abspath, "Node.js required for singlefile plugin"


def test_singlefile_hook_runs():
    """Verify singlefile hook can be executed and completes."""
    # Prerequisites checked by earlier test

def test_singlefile_cli_archives_example_com():
    """Test that singlefile CLI archives example.com and produces valid HTML."""
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Run singlefile extraction hook
        env = os.environ.copy()
        env['SINGLEFILE_ENABLED'] = 'true'

        # Run singlefile snapshot hook
        result = subprocess.run(
            ['node', str(INSTALL_SCRIPT), f'--url={TEST_URL}', '--snapshot-id=test789'],
            ['python', str(SNAPSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=120
        )

        # Hook should complete successfully (even if it just installs extension)
        assert result.returncode == 0, f"Hook execution failed: {result.stderr}"

        # Verify extension installation happens
        assert 'SingleFile extension' in result.stdout or result.returncode == 0, "Should install extension or complete"
        # Verify output file exists
        output_file = tmpdir / 'singlefile.html'
        assert output_file.exists(), f"singlefile.html not created. stdout: {result.stdout}, stderr: {result.stderr}"

        # Verify it contains real HTML
        html_content = output_file.read_text()
        assert len(html_content) > 500, "Output file too small to be valid HTML"
        assert '<!DOCTYPE html>' in html_content or '<html' in html_content, "Output should contain HTML doctype or html tag"
        assert 'Example Domain' in html_content, "Output should contain example.com content"


if __name__ == '__main__':
@@ -25,7 +25,7 @@ const extensionUtils = require('../chrome/chrome_utils.js');
// Extension metadata
const EXTENSION = {
    webstore_id: 'ifibfemgeogfhoebkmokieepdoobkbpo',
    name: 'captcha2',
    name: 'twocaptcha',
};

// Get extensions directory from environment or use default
@@ -69,7 +69,7 @@ async function installCaptchaExtension() {
 */
async function main() {
    // Check if extension is already cached
    const cacheFile = path.join(EXTENSIONS_DIR, 'captcha2.extension.json');
    const cacheFile = path.join(EXTENSIONS_DIR, 'twocaptcha.extension.json');

    if (fs.existsSync(cacheFile)) {
        try {
@@ -29,7 +29,7 @@ function getCrawlChromeSessionDir() {
}

const CHROME_SESSION_DIR = getCrawlChromeSessionDir() || '../chrome';
const CONFIG_MARKER = path.join(CHROME_SESSION_DIR, '.captcha2_configured');
const CONFIG_MARKER = path.join(CHROME_SESSION_DIR, '.twocaptcha_configured');

// Get environment variable with default
function getEnv(name, defaultValue = '') {
@@ -70,7 +70,7 @@ async function configure2Captcha() {
    }

    const extensions = JSON.parse(fs.readFileSync(extensionsFile, 'utf-8'));
    const captchaExt = extensions.find(ext => ext.name === 'captcha2');
    const captchaExt = extensions.find(ext => ext.name === 'twocaptcha');

    if (!captchaExt) {
        console.error('[*] 2captcha extension not installed, skipping configuration');
@@ -236,7 +236,7 @@ async function main() {
    const snapshotId = args.snapshot_id;

    if (!url || !snapshotId) {
        console.error('Usage: on_Snapshot__21_captcha2_config.js --url=<url> --snapshot-id=<uuid>');
        console.error('Usage: on_Snapshot__21_twocaptcha_config.js --url=<url> --snapshot-id=<uuid>');
        process.exit(1);
    }

@@ -1,5 +1,5 @@
"""
Unit tests for captcha2 plugin
Unit tests for twocaptcha plugin

Tests invoke the plugin hooks as external processes and verify outputs/side effects.
"""
@@ -14,8 +14,8 @@ import pytest


PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_captcha2.*'), None)
CONFIG_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_captcha2_config.*'), None)
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_install_twocaptcha_extension.*'), None)
CONFIG_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_configure_twocaptcha_extension_options.*'), None)


def test_install_script_exists():
@@ -29,7 +29,7 @@ def test_config_script_exists():


def test_extension_metadata():
    """Test that captcha2 extension has correct metadata"""
    """Test that twocaptcha extension has correct metadata"""
    with tempfile.TemporaryDirectory() as tmpdir:
        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(Path(tmpdir) / "chrome_extensions")
@@ -46,7 +46,7 @@ def test_extension_metadata():

        metadata = json.loads(result.stdout)
        assert metadata["webstore_id"] == "ifibfemgeogfhoebkmokieepdoobkbpo"
        assert metadata["name"] == "captcha2"
        assert metadata["name"] == "twocaptcha"


def test_install_creates_cache():
@@ -72,13 +72,13 @@ def test_install_creates_cache():
        assert "[*] Installing 2captcha extension" in result.stdout or "[*] 2captcha extension already installed" in result.stdout

        # Check cache file was created
        cache_file = ext_dir / "captcha2.extension.json"
        cache_file = ext_dir / "twocaptcha.extension.json"
        assert cache_file.exists(), "Cache file should be created"

        # Verify cache content
        cache_data = json.loads(cache_file.read_text())
        assert cache_data["webstore_id"] == "ifibfemgeogfhoebkmokieepdoobkbpo"
        assert cache_data["name"] == "captcha2"
        assert cache_data["name"] == "twocaptcha"
        assert "unpacked_path" in cache_data
        assert "version" in cache_data

@@ -104,7 +104,7 @@ def test_install_twice_uses_cache():
        assert result1.returncode == 0, f"First install failed: {result1.stderr}"

        # Verify cache was created
        cache_file = ext_dir / "captcha2.extension.json"
        cache_file = ext_dir / "twocaptcha.extension.json"
        assert cache_file.exists(), "Cache file should exist after first install"

        # Second install - should use cache
@@ -175,7 +175,7 @@ def test_config_script_structure():
    script_content = CONFIG_SCRIPT.read_text()

    # Should mention configuration marker file
    assert "CONFIG_MARKER" in script_content or "captcha2_configured" in script_content
    assert "CONFIG_MARKER" in script_content or "twocaptcha_configured" in script_content

    # Should mention API key
    assert "API_KEY_2CAPTCHA" in script_content
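
The usage strings above document the --url=<url> --snapshot-id=<uuid> argument convention these hooks share; the hooks' actual parser is not shown in this diff, so the following is only a minimal illustrative sketch of parsing that convention:

// Illustrative sketch only: parsing the --url=<url> --snapshot-id=<uuid> hook arguments.
function parseHookArgs(argv = process.argv.slice(2)) {
    const args = {};
    for (const arg of argv) {
        const match = arg.match(/^--([^=]+)=(.*)$/);
        if (match) {
            // '--snapshot-id' becomes 'snapshot_id', matching the `args.snapshot_id` access above.
            args[match[1].replace(/-/g, '_')] = match[2];
        }
    }
    return args;
}

// e.g. node on_Snapshot__21_twocaptcha_config.js --url=https://example.com --snapshot-id=abc123
// parseHookArgs(['--url=https://example.com', '--snapshot-id=abc123'])
//   -> { url: 'https://example.com', snapshot_id: 'abc123' }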